| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 1560, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.7291141748428345, |
| "epoch": 0.0032102728731942215, |
| "grad_norm": 16.57156753540039, |
| "learning_rate": 0.0, |
| "loss": 1.4779, |
| "mean_token_accuracy": 0.6312866508960724, |
| "num_tokens": 7755.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.6665578484535217, |
| "epoch": 0.006420545746388443, |
| "grad_norm": 18.545442581176758, |
| "learning_rate": 1.282051282051282e-07, |
| "loss": 1.4834, |
| "mean_token_accuracy": 0.6333149969577789, |
| "num_tokens": 16049.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.6309752464294434, |
| "epoch": 0.009630818619582664, |
| "grad_norm": 20.336366653442383, |
| "learning_rate": 2.564102564102564e-07, |
| "loss": 1.5418, |
| "mean_token_accuracy": 0.6078976392745972, |
| "num_tokens": 23512.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 1.7980514764785767, |
| "epoch": 0.012841091492776886, |
| "grad_norm": 17.363161087036133, |
| "learning_rate": 3.846153846153847e-07, |
| "loss": 1.4917, |
| "mean_token_accuracy": 0.6385786831378937, |
| "num_tokens": 32196.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.6440390348434448, |
| "epoch": 0.016051364365971106, |
| "grad_norm": 21.76154899597168, |
| "learning_rate": 5.128205128205128e-07, |
| "loss": 1.4381, |
| "mean_token_accuracy": 0.6364026069641113, |
| "num_tokens": 41612.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.811388909816742, |
| "epoch": 0.019261637239165328, |
| "grad_norm": 16.966796875, |
| "learning_rate": 6.41025641025641e-07, |
| "loss": 1.4449, |
| "mean_token_accuracy": 0.636858195066452, |
| "num_tokens": 49568.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.7483888268470764, |
| "epoch": 0.02247191011235955, |
| "grad_norm": 16.943206787109375, |
| "learning_rate": 7.692307692307694e-07, |
| "loss": 1.478, |
| "mean_token_accuracy": 0.6292121112346649, |
| "num_tokens": 58061.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.723523497581482, |
| "epoch": 0.025682182985553772, |
| "grad_norm": 16.687475204467773, |
| "learning_rate": 8.974358974358975e-07, |
| "loss": 1.4675, |
| "mean_token_accuracy": 0.6215986013412476, |
| "num_tokens": 66009.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.713658332824707, |
| "epoch": 0.028892455858747994, |
| "grad_norm": 15.334748268127441, |
| "learning_rate": 1.0256410256410257e-06, |
| "loss": 1.5391, |
| "mean_token_accuracy": 0.6208073198795319, |
| "num_tokens": 74373.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.8282572031021118, |
| "epoch": 0.03210272873194221, |
| "grad_norm": 14.283801078796387, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 1.3535, |
| "mean_token_accuracy": 0.6430239975452423, |
| "num_tokens": 82933.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.7159126996994019, |
| "epoch": 0.03531300160513644, |
| "grad_norm": 13.662557601928711, |
| "learning_rate": 1.282051282051282e-06, |
| "loss": 1.3138, |
| "mean_token_accuracy": 0.6514350175857544, |
| "num_tokens": 91580.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.8093950748443604, |
| "epoch": 0.038523274478330656, |
| "grad_norm": 13.19852066040039, |
| "learning_rate": 1.4102564102564104e-06, |
| "loss": 1.373, |
| "mean_token_accuracy": 0.6353383362293243, |
| "num_tokens": 99139.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 1.7789645195007324, |
| "epoch": 0.04173354735152488, |
| "grad_norm": 16.865962982177734, |
| "learning_rate": 1.5384615384615387e-06, |
| "loss": 1.2458, |
| "mean_token_accuracy": 0.6800651550292969, |
| "num_tokens": 106948.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 1.7515007853507996, |
| "epoch": 0.0449438202247191, |
| "grad_norm": 10.389936447143555, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 1.2552, |
| "mean_token_accuracy": 0.6667510867118835, |
| "num_tokens": 114925.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 1.7482985258102417, |
| "epoch": 0.048154093097913325, |
| "grad_norm": 8.63404655456543, |
| "learning_rate": 1.794871794871795e-06, |
| "loss": 1.1545, |
| "mean_token_accuracy": 0.6908352673053741, |
| "num_tokens": 123854.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.7924981117248535, |
| "epoch": 0.051364365971107544, |
| "grad_norm": 10.083547592163086, |
| "learning_rate": 1.9230769230769234e-06, |
| "loss": 1.2271, |
| "mean_token_accuracy": 0.6758890450000763, |
| "num_tokens": 133377.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 1.8247886896133423, |
| "epoch": 0.05457463884430177, |
| "grad_norm": 8.738141059875488, |
| "learning_rate": 2.0512820512820513e-06, |
| "loss": 1.2349, |
| "mean_token_accuracy": 0.6745803952217102, |
| "num_tokens": 142243.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 1.6763617396354675, |
| "epoch": 0.05778491171749599, |
| "grad_norm": 42.90169906616211, |
| "learning_rate": 2.1794871794871797e-06, |
| "loss": 0.9605, |
| "mean_token_accuracy": 0.7231403291225433, |
| "num_tokens": 150915.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 1.668739914894104, |
| "epoch": 0.060995184590690206, |
| "grad_norm": 7.330332279205322, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 1.0346, |
| "mean_token_accuracy": 0.7043041586875916, |
| "num_tokens": 160421.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 1.61678808927536, |
| "epoch": 0.06420545746388442, |
| "grad_norm": 7.4708147048950195, |
| "learning_rate": 2.435897435897436e-06, |
| "loss": 0.9297, |
| "mean_token_accuracy": 0.7384328246116638, |
| "num_tokens": 168561.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.7209243774414062, |
| "epoch": 0.06741573033707865, |
| "grad_norm": 6.788050651550293, |
| "learning_rate": 2.564102564102564e-06, |
| "loss": 0.9503, |
| "mean_token_accuracy": 0.7228703796863556, |
| "num_tokens": 177282.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 1.6552881002426147, |
| "epoch": 0.07062600321027288, |
| "grad_norm": 6.641953468322754, |
| "learning_rate": 2.6923076923076923e-06, |
| "loss": 0.918, |
| "mean_token_accuracy": 0.720936506986618, |
| "num_tokens": 185240.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 1.5451985001564026, |
| "epoch": 0.0738362760834671, |
| "grad_norm": 7.2707953453063965, |
| "learning_rate": 2.8205128205128207e-06, |
| "loss": 0.9192, |
| "mean_token_accuracy": 0.7248755097389221, |
| "num_tokens": 193481.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 1.5412019491195679, |
| "epoch": 0.07704654895666131, |
| "grad_norm": 6.560652256011963, |
| "learning_rate": 2.948717948717949e-06, |
| "loss": 0.8359, |
| "mean_token_accuracy": 0.739835798740387, |
| "num_tokens": 201616.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 1.7039520740509033, |
| "epoch": 0.08025682182985554, |
| "grad_norm": 7.809452056884766, |
| "learning_rate": 3.0769230769230774e-06, |
| "loss": 0.8146, |
| "mean_token_accuracy": 0.7461437880992889, |
| "num_tokens": 211646.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.7537184357643127, |
| "epoch": 0.08346709470304976, |
| "grad_norm": 5.6596903800964355, |
| "learning_rate": 3.205128205128206e-06, |
| "loss": 0.7816, |
| "mean_token_accuracy": 0.7539893090724945, |
| "num_tokens": 221121.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 1.692587435245514, |
| "epoch": 0.08667736757624397, |
| "grad_norm": 6.648515224456787, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.7075, |
| "mean_token_accuracy": 0.7867643237113953, |
| "num_tokens": 228393.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 1.5505646467208862, |
| "epoch": 0.0898876404494382, |
| "grad_norm": 4.989018440246582, |
| "learning_rate": 3.4615384615384617e-06, |
| "loss": 0.7552, |
| "mean_token_accuracy": 0.7660565078258514, |
| "num_tokens": 237025.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 1.48868727684021, |
| "epoch": 0.09309791332263243, |
| "grad_norm": 5.676994800567627, |
| "learning_rate": 3.58974358974359e-06, |
| "loss": 0.6922, |
| "mean_token_accuracy": 0.774169385433197, |
| "num_tokens": 245283.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 1.5744507908821106, |
| "epoch": 0.09630818619582665, |
| "grad_norm": 12.6621732711792, |
| "learning_rate": 3.7179487179487184e-06, |
| "loss": 0.7593, |
| "mean_token_accuracy": 0.7522169053554535, |
| "num_tokens": 254358.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.441491186618805, |
| "epoch": 0.09951845906902086, |
| "grad_norm": 4.436478614807129, |
| "learning_rate": 3.846153846153847e-06, |
| "loss": 0.76, |
| "mean_token_accuracy": 0.7571093142032623, |
| "num_tokens": 263539.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 1.6964601874351501, |
| "epoch": 0.10272873194221509, |
| "grad_norm": 7.042190074920654, |
| "learning_rate": 3.974358974358974e-06, |
| "loss": 0.7626, |
| "mean_token_accuracy": 0.7548583149909973, |
| "num_tokens": 273100.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 1.482248067855835, |
| "epoch": 0.10593900481540931, |
| "grad_norm": 5.204789638519287, |
| "learning_rate": 4.102564102564103e-06, |
| "loss": 0.6846, |
| "mean_token_accuracy": 0.7835729420185089, |
| "num_tokens": 283183.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 1.5532912611961365, |
| "epoch": 0.10914927768860354, |
| "grad_norm": 5.190078258514404, |
| "learning_rate": 4.230769230769231e-06, |
| "loss": 0.7163, |
| "mean_token_accuracy": 0.7673235833644867, |
| "num_tokens": 292075.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 1.4678760766983032, |
| "epoch": 0.11235955056179775, |
| "grad_norm": 5.29253625869751, |
| "learning_rate": 4.358974358974359e-06, |
| "loss": 0.7016, |
| "mean_token_accuracy": 0.7700887024402618, |
| "num_tokens": 300604.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 1.4619144797325134, |
| "epoch": 0.11556982343499198, |
| "grad_norm": 5.18648624420166, |
| "learning_rate": 4.487179487179488e-06, |
| "loss": 0.6847, |
| "mean_token_accuracy": 0.7695477604866028, |
| "num_tokens": 308831.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 1.5652631521224976, |
| "epoch": 0.1187800963081862, |
| "grad_norm": 5.542480945587158, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 0.6814, |
| "mean_token_accuracy": 0.7790135145187378, |
| "num_tokens": 317048.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 1.533966839313507, |
| "epoch": 0.12199036918138041, |
| "grad_norm": 6.46714973449707, |
| "learning_rate": 4.743589743589744e-06, |
| "loss": 0.6381, |
| "mean_token_accuracy": 0.789818674325943, |
| "num_tokens": 325284.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 1.4124146699905396, |
| "epoch": 0.12520064205457465, |
| "grad_norm": 4.356935977935791, |
| "learning_rate": 4.871794871794872e-06, |
| "loss": 0.6803, |
| "mean_token_accuracy": 0.7765854597091675, |
| "num_tokens": 334504.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 1.3948511481285095, |
| "epoch": 0.12841091492776885, |
| "grad_norm": 5.148672580718994, |
| "learning_rate": 5e-06, |
| "loss": 0.6072, |
| "mean_token_accuracy": 0.7939413189888, |
| "num_tokens": 343396.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.7629846334457397, |
| "epoch": 0.13162118780096307, |
| "grad_norm": 8.669179916381836, |
| "learning_rate": 5.128205128205128e-06, |
| "loss": 0.6534, |
| "mean_token_accuracy": 0.7880546152591705, |
| "num_tokens": 353580.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 1.5166628956794739, |
| "epoch": 0.1348314606741573, |
| "grad_norm": 6.4972310066223145, |
| "learning_rate": 5.256410256410257e-06, |
| "loss": 0.6503, |
| "mean_token_accuracy": 0.7786408066749573, |
| "num_tokens": 362812.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 1.5299481749534607, |
| "epoch": 0.13804173354735153, |
| "grad_norm": 11.583465576171875, |
| "learning_rate": 5.384615384615385e-06, |
| "loss": 0.6344, |
| "mean_token_accuracy": 0.7911360859870911, |
| "num_tokens": 371115.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 1.347219169139862, |
| "epoch": 0.14125200642054575, |
| "grad_norm": 5.8644185066223145, |
| "learning_rate": 5.512820512820514e-06, |
| "loss": 0.6108, |
| "mean_token_accuracy": 0.791721373796463, |
| "num_tokens": 379905.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 1.4316428899765015, |
| "epoch": 0.14446227929373998, |
| "grad_norm": 5.175519943237305, |
| "learning_rate": 5.641025641025641e-06, |
| "loss": 0.6052, |
| "mean_token_accuracy": 0.7954953908920288, |
| "num_tokens": 388232.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 1.6255661249160767, |
| "epoch": 0.1476725521669342, |
| "grad_norm": 11.087921142578125, |
| "learning_rate": 5.769230769230769e-06, |
| "loss": 0.6345, |
| "mean_token_accuracy": 0.7858785092830658, |
| "num_tokens": 396302.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 1.498759388923645, |
| "epoch": 0.1508828250401284, |
| "grad_norm": 10.765654563903809, |
| "learning_rate": 5.897435897435898e-06, |
| "loss": 0.5593, |
| "mean_token_accuracy": 0.7999837696552277, |
| "num_tokens": 405773.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 1.510821521282196, |
| "epoch": 0.15409309791332262, |
| "grad_norm": 6.4933695793151855, |
| "learning_rate": 6.025641025641026e-06, |
| "loss": 0.5956, |
| "mean_token_accuracy": 0.7734400928020477, |
| "num_tokens": 415320.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 1.58489990234375, |
| "epoch": 0.15730337078651685, |
| "grad_norm": 8.327532768249512, |
| "learning_rate": 6.153846153846155e-06, |
| "loss": 0.5779, |
| "mean_token_accuracy": 0.811727911233902, |
| "num_tokens": 423824.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 1.6798959970474243, |
| "epoch": 0.16051364365971107, |
| "grad_norm": 9.243504524230957, |
| "learning_rate": 6.282051282051282e-06, |
| "loss": 0.6383, |
| "mean_token_accuracy": 0.7815350294113159, |
| "num_tokens": 433980.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.5442488193511963, |
| "epoch": 0.1637239165329053, |
| "grad_norm": 10.32125473022461, |
| "learning_rate": 6.410256410256412e-06, |
| "loss": 0.6572, |
| "mean_token_accuracy": 0.7940186858177185, |
| "num_tokens": 443555.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 1.5454481840133667, |
| "epoch": 0.16693418940609953, |
| "grad_norm": 7.995494842529297, |
| "learning_rate": 6.538461538461539e-06, |
| "loss": 0.6512, |
| "mean_token_accuracy": 0.7784733474254608, |
| "num_tokens": 451491.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 1.4755758047103882, |
| "epoch": 0.17014446227929375, |
| "grad_norm": 9.288561820983887, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.6035, |
| "mean_token_accuracy": 0.7999023199081421, |
| "num_tokens": 461177.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 1.6358034014701843, |
| "epoch": 0.17335473515248795, |
| "grad_norm": 6.63067102432251, |
| "learning_rate": 6.794871794871796e-06, |
| "loss": 0.6189, |
| "mean_token_accuracy": 0.7967036664485931, |
| "num_tokens": 469468.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 1.605944573879242, |
| "epoch": 0.17656500802568217, |
| "grad_norm": 6.3592987060546875, |
| "learning_rate": 6.923076923076923e-06, |
| "loss": 0.6694, |
| "mean_token_accuracy": 0.7812408804893494, |
| "num_tokens": 477498.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 1.6061294674873352, |
| "epoch": 0.1797752808988764, |
| "grad_norm": 5.8274712562561035, |
| "learning_rate": 7.051282051282053e-06, |
| "loss": 0.6096, |
| "mean_token_accuracy": 0.78871089220047, |
| "num_tokens": 486587.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 1.5025911927223206, |
| "epoch": 0.18298555377207062, |
| "grad_norm": 4.877136707305908, |
| "learning_rate": 7.17948717948718e-06, |
| "loss": 0.5809, |
| "mean_token_accuracy": 0.7969581782817841, |
| "num_tokens": 494600.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 1.5793496370315552, |
| "epoch": 0.18619582664526485, |
| "grad_norm": 5.73530912399292, |
| "learning_rate": 7.307692307692308e-06, |
| "loss": 0.5828, |
| "mean_token_accuracy": 0.8010603785514832, |
| "num_tokens": 505041.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 1.4806573987007141, |
| "epoch": 0.18940609951845908, |
| "grad_norm": 4.669845104217529, |
| "learning_rate": 7.435897435897437e-06, |
| "loss": 0.591, |
| "mean_token_accuracy": 0.8051888644695282, |
| "num_tokens": 514202.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 1.501672387123108, |
| "epoch": 0.1926163723916533, |
| "grad_norm": 5.279482364654541, |
| "learning_rate": 7.564102564102564e-06, |
| "loss": 0.5285, |
| "mean_token_accuracy": 0.8189839124679565, |
| "num_tokens": 522981.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.4994003772735596, |
| "epoch": 0.1958266452648475, |
| "grad_norm": 4.435177803039551, |
| "learning_rate": 7.692307692307694e-06, |
| "loss": 0.6004, |
| "mean_token_accuracy": 0.7918008863925934, |
| "num_tokens": 532071.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 1.4552969932556152, |
| "epoch": 0.19903691813804172, |
| "grad_norm": 5.8819403648376465, |
| "learning_rate": 7.820512820512822e-06, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.7975403964519501, |
| "num_tokens": 542292.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 1.617630124092102, |
| "epoch": 0.20224719101123595, |
| "grad_norm": 5.218127727508545, |
| "learning_rate": 7.948717948717949e-06, |
| "loss": 0.5843, |
| "mean_token_accuracy": 0.8035672008991241, |
| "num_tokens": 551404.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 1.5008496046066284, |
| "epoch": 0.20545746388443017, |
| "grad_norm": 11.88429069519043, |
| "learning_rate": 8.076923076923077e-06, |
| "loss": 0.5689, |
| "mean_token_accuracy": 0.807811826467514, |
| "num_tokens": 559745.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 1.607909917831421, |
| "epoch": 0.2086677367576244, |
| "grad_norm": 4.793018341064453, |
| "learning_rate": 8.205128205128205e-06, |
| "loss": 0.5537, |
| "mean_token_accuracy": 0.8123330175876617, |
| "num_tokens": 569437.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 1.5648667216300964, |
| "epoch": 0.21187800963081863, |
| "grad_norm": 4.589395999908447, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 0.5838, |
| "mean_token_accuracy": 0.8071021139621735, |
| "num_tokens": 577731.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 1.438844621181488, |
| "epoch": 0.21508828250401285, |
| "grad_norm": 4.69675350189209, |
| "learning_rate": 8.461538461538462e-06, |
| "loss": 0.5286, |
| "mean_token_accuracy": 0.8204745054244995, |
| "num_tokens": 585650.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 1.507163166999817, |
| "epoch": 0.21829855537720708, |
| "grad_norm": 6.146286487579346, |
| "learning_rate": 8.58974358974359e-06, |
| "loss": 0.5781, |
| "mean_token_accuracy": 0.7892117500305176, |
| "num_tokens": 595188.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 1.4560467600822449, |
| "epoch": 0.22150882825040127, |
| "grad_norm": 5.049193382263184, |
| "learning_rate": 8.717948717948719e-06, |
| "loss": 0.6418, |
| "mean_token_accuracy": 0.7874622046947479, |
| "num_tokens": 604332.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 1.4594195485115051, |
| "epoch": 0.2247191011235955, |
| "grad_norm": 6.110840320587158, |
| "learning_rate": 8.846153846153847e-06, |
| "loss": 0.5814, |
| "mean_token_accuracy": 0.8057654201984406, |
| "num_tokens": 612359.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.540019154548645, |
| "epoch": 0.22792937399678972, |
| "grad_norm": 6.3223395347595215, |
| "learning_rate": 8.974358974358976e-06, |
| "loss": 0.5083, |
| "mean_token_accuracy": 0.8341903388500214, |
| "num_tokens": 619746.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 1.4772478938102722, |
| "epoch": 0.23113964686998395, |
| "grad_norm": 9.083407402038574, |
| "learning_rate": 9.102564102564104e-06, |
| "loss": 0.5976, |
| "mean_token_accuracy": 0.7991544604301453, |
| "num_tokens": 628789.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 1.5935535430908203, |
| "epoch": 0.23434991974317818, |
| "grad_norm": 5.82806396484375, |
| "learning_rate": 9.230769230769232e-06, |
| "loss": 0.5975, |
| "mean_token_accuracy": 0.8066189289093018, |
| "num_tokens": 637168.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 1.6355071067810059, |
| "epoch": 0.2375601926163724, |
| "grad_norm": 5.713013172149658, |
| "learning_rate": 9.358974358974359e-06, |
| "loss": 0.6243, |
| "mean_token_accuracy": 0.7890200316905975, |
| "num_tokens": 645489.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 1.5683773159980774, |
| "epoch": 0.24077046548956663, |
| "grad_norm": 4.871849060058594, |
| "learning_rate": 9.487179487179487e-06, |
| "loss": 0.5996, |
| "mean_token_accuracy": 0.7987681031227112, |
| "num_tokens": 654474.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.554315447807312, |
| "epoch": 0.24398073836276082, |
| "grad_norm": 4.6450653076171875, |
| "learning_rate": 9.615384615384616e-06, |
| "loss": 0.607, |
| "mean_token_accuracy": 0.7968572080135345, |
| "num_tokens": 664329.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 1.5780853033065796, |
| "epoch": 0.24719101123595505, |
| "grad_norm": 5.811628818511963, |
| "learning_rate": 9.743589743589744e-06, |
| "loss": 0.632, |
| "mean_token_accuracy": 0.7882947325706482, |
| "num_tokens": 673150.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 1.7099721431732178, |
| "epoch": 0.2504012841091493, |
| "grad_norm": 13.076167106628418, |
| "learning_rate": 9.871794871794872e-06, |
| "loss": 0.5605, |
| "mean_token_accuracy": 0.8057580888271332, |
| "num_tokens": 682512.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 1.4776220321655273, |
| "epoch": 0.2536115569823435, |
| "grad_norm": 7.290124416351318, |
| "learning_rate": 1e-05, |
| "loss": 0.6197, |
| "mean_token_accuracy": 0.7889206409454346, |
| "num_tokens": 691640.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 1.739963173866272, |
| "epoch": 0.2568218298555377, |
| "grad_norm": 6.372115135192871, |
| "learning_rate": 9.999988765773283e-06, |
| "loss": 0.6253, |
| "mean_token_accuracy": 0.7942008674144745, |
| "num_tokens": 700384.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.6132588982582092, |
| "epoch": 0.26003210272873195, |
| "grad_norm": 9.141928672790527, |
| "learning_rate": 9.99995506314361e-06, |
| "loss": 0.586, |
| "mean_token_accuracy": 0.8013035655021667, |
| "num_tokens": 709604.0, |
| "step": 81 |
| }, |
| { |
| "entropy": 1.7951747179031372, |
| "epoch": 0.26324237560192615, |
| "grad_norm": 15.085265159606934, |
| "learning_rate": 9.999898892262433e-06, |
| "loss": 0.5251, |
| "mean_token_accuracy": 0.8083482682704926, |
| "num_tokens": 717638.0, |
| "step": 82 |
| }, |
| { |
| "entropy": 1.7024835348129272, |
| "epoch": 0.2664526484751204, |
| "grad_norm": 5.644409656524658, |
| "learning_rate": 9.99982025338217e-06, |
| "loss": 0.602, |
| "mean_token_accuracy": 0.8106231689453125, |
| "num_tokens": 726166.0, |
| "step": 83 |
| }, |
| { |
| "entropy": 1.5966495275497437, |
| "epoch": 0.2696629213483146, |
| "grad_norm": 5.191967487335205, |
| "learning_rate": 9.999719146856191e-06, |
| "loss": 0.5468, |
| "mean_token_accuracy": 0.814395397901535, |
| "num_tokens": 734713.0, |
| "step": 84 |
| }, |
| { |
| "entropy": 1.5932486057281494, |
| "epoch": 0.27287319422150885, |
| "grad_norm": 8.375090599060059, |
| "learning_rate": 9.999595573138845e-06, |
| "loss": 0.5339, |
| "mean_token_accuracy": 0.81120365858078, |
| "num_tokens": 742232.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 1.7226688861846924, |
| "epoch": 0.27608346709470305, |
| "grad_norm": 4.805099964141846, |
| "learning_rate": 9.99944953278543e-06, |
| "loss": 0.5941, |
| "mean_token_accuracy": 0.8079180121421814, |
| "num_tokens": 750192.0, |
| "step": 86 |
| }, |
| { |
| "entropy": 1.6014992594718933, |
| "epoch": 0.27929373996789725, |
| "grad_norm": 16.921457290649414, |
| "learning_rate": 9.99928102645221e-06, |
| "loss": 0.5497, |
| "mean_token_accuracy": 0.8063566982746124, |
| "num_tokens": 758813.0, |
| "step": 87 |
| }, |
| { |
| "entropy": 1.560517430305481, |
| "epoch": 0.2825040128410915, |
| "grad_norm": 9.258691787719727, |
| "learning_rate": 9.999090054896397e-06, |
| "loss": 0.6238, |
| "mean_token_accuracy": 0.7950149774551392, |
| "num_tokens": 768181.0, |
| "step": 88 |
| }, |
| { |
| "entropy": 1.7196683287620544, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 5.792410850524902, |
| "learning_rate": 9.99887661897616e-06, |
| "loss": 0.5559, |
| "mean_token_accuracy": 0.808037519454956, |
| "num_tokens": 776667.0, |
| "step": 89 |
| }, |
| { |
| "entropy": 1.6795648336410522, |
| "epoch": 0.28892455858747995, |
| "grad_norm": 6.827014446258545, |
| "learning_rate": 9.998640719650609e-06, |
| "loss": 0.5508, |
| "mean_token_accuracy": 0.8196805119514465, |
| "num_tokens": 784702.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.5682292580604553, |
| "epoch": 0.29213483146067415, |
| "grad_norm": 4.962204456329346, |
| "learning_rate": 9.99838235797981e-06, |
| "loss": 0.5451, |
| "mean_token_accuracy": 0.8104510009288788, |
| "num_tokens": 793038.0, |
| "step": 91 |
| }, |
| { |
| "entropy": 1.6802659630775452, |
| "epoch": 0.2953451043338684, |
| "grad_norm": 5.178422451019287, |
| "learning_rate": 9.998101535124758e-06, |
| "loss": 0.5749, |
| "mean_token_accuracy": 0.8083841502666473, |
| "num_tokens": 801811.0, |
| "step": 92 |
| }, |
| { |
| "entropy": 1.5281678438186646, |
| "epoch": 0.2985553772070626, |
| "grad_norm": 5.047668933868408, |
| "learning_rate": 9.997798252347382e-06, |
| "loss": 0.5217, |
| "mean_token_accuracy": 0.8163295686244965, |
| "num_tokens": 810749.0, |
| "step": 93 |
| }, |
| { |
| "entropy": 1.4543544054031372, |
| "epoch": 0.3017656500802568, |
| "grad_norm": 7.823535919189453, |
| "learning_rate": 9.997472511010543e-06, |
| "loss": 0.625, |
| "mean_token_accuracy": 0.7929400205612183, |
| "num_tokens": 819856.0, |
| "step": 94 |
| }, |
| { |
| "entropy": 1.565036654472351, |
| "epoch": 0.30497592295345105, |
| "grad_norm": 6.509371280670166, |
| "learning_rate": 9.99712431257802e-06, |
| "loss": 0.5937, |
| "mean_token_accuracy": 0.7926050424575806, |
| "num_tokens": 828198.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 1.639006495475769, |
| "epoch": 0.30818619582664525, |
| "grad_norm": 7.456268310546875, |
| "learning_rate": 9.99675365861451e-06, |
| "loss": 0.5543, |
| "mean_token_accuracy": 0.8075222671031952, |
| "num_tokens": 837031.0, |
| "step": 96 |
| }, |
| { |
| "entropy": 1.4620029926300049, |
| "epoch": 0.3113964686998395, |
| "grad_norm": 5.094261646270752, |
| "learning_rate": 9.996360550785619e-06, |
| "loss": 0.5775, |
| "mean_token_accuracy": 0.8004140257835388, |
| "num_tokens": 846009.0, |
| "step": 97 |
| }, |
| { |
| "entropy": 1.5976275205612183, |
| "epoch": 0.3146067415730337, |
| "grad_norm": 8.545952796936035, |
| "learning_rate": 9.995944990857848e-06, |
| "loss": 0.5532, |
| "mean_token_accuracy": 0.821792334318161, |
| "num_tokens": 855269.0, |
| "step": 98 |
| }, |
| { |
| "entropy": 1.5488384366035461, |
| "epoch": 0.31781701444622795, |
| "grad_norm": 10.916169166564941, |
| "learning_rate": 9.9955069806986e-06, |
| "loss": 0.5638, |
| "mean_token_accuracy": 0.8070700764656067, |
| "num_tokens": 862934.0, |
| "step": 99 |
| }, |
| { |
| "entropy": 1.5096496939659119, |
| "epoch": 0.32102728731942215, |
| "grad_norm": 11.81431770324707, |
| "learning_rate": 9.995046522276152e-06, |
| "loss": 0.5911, |
| "mean_token_accuracy": 0.8029804527759552, |
| "num_tokens": 871122.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.5772724151611328, |
| "epoch": 0.32423756019261635, |
| "grad_norm": 4.687896251678467, |
| "learning_rate": 9.994563617659665e-06, |
| "loss": 0.6396, |
| "mean_token_accuracy": 0.7922047674655914, |
| "num_tokens": 880220.0, |
| "step": 101 |
| }, |
| { |
| "entropy": 1.5804290175437927, |
| "epoch": 0.3274478330658106, |
| "grad_norm": 5.048791408538818, |
| "learning_rate": 9.994058269019163e-06, |
| "loss": 0.6029, |
| "mean_token_accuracy": 0.7916356921195984, |
| "num_tokens": 889551.0, |
| "step": 102 |
| }, |
| { |
| "entropy": 1.591521441936493, |
| "epoch": 0.3306581059390048, |
| "grad_norm": 11.681397438049316, |
| "learning_rate": 9.993530478625524e-06, |
| "loss": 0.5049, |
| "mean_token_accuracy": 0.8214498460292816, |
| "num_tokens": 897040.0, |
| "step": 103 |
| }, |
| { |
| "entropy": 1.5347830057144165, |
| "epoch": 0.33386837881219905, |
| "grad_norm": 10.396520614624023, |
| "learning_rate": 9.992980248850476e-06, |
| "loss": 0.5725, |
| "mean_token_accuracy": 0.8061753809452057, |
| "num_tokens": 905563.0, |
| "step": 104 |
| }, |
| { |
| "entropy": 1.6515621542930603, |
| "epoch": 0.33707865168539325, |
| "grad_norm": 4.542778491973877, |
| "learning_rate": 9.992407582166582e-06, |
| "loss": 0.6193, |
| "mean_token_accuracy": 0.7810203433036804, |
| "num_tokens": 914347.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 1.626991093158722, |
| "epoch": 0.3402889245585875, |
| "grad_norm": 5.993000507354736, |
| "learning_rate": 9.99181248114723e-06, |
| "loss": 0.6076, |
| "mean_token_accuracy": 0.7984636723995209, |
| "num_tokens": 923142.0, |
| "step": 106 |
| }, |
| { |
| "entropy": 1.5122568607330322, |
| "epoch": 0.3434991974317817, |
| "grad_norm": 6.345304489135742, |
| "learning_rate": 9.991194948466615e-06, |
| "loss": 0.6133, |
| "mean_token_accuracy": 0.7982348203659058, |
| "num_tokens": 931756.0, |
| "step": 107 |
| }, |
| { |
| "entropy": 1.4367440938949585, |
| "epoch": 0.3467094703049759, |
| "grad_norm": 7.016558647155762, |
| "learning_rate": 9.990554986899745e-06, |
| "loss": 0.5683, |
| "mean_token_accuracy": 0.808490514755249, |
| "num_tokens": 940192.0, |
| "step": 108 |
| }, |
| { |
| "entropy": 1.6145474910736084, |
| "epoch": 0.34991974317817015, |
| "grad_norm": 7.367400169372559, |
| "learning_rate": 9.989892599322404e-06, |
| "loss": 0.5275, |
| "mean_token_accuracy": 0.8245007693767548, |
| "num_tokens": 948340.0, |
| "step": 109 |
| }, |
| { |
| "entropy": 1.6267709136009216, |
| "epoch": 0.35313001605136435, |
| "grad_norm": 8.42909049987793, |
| "learning_rate": 9.98920778871116e-06, |
| "loss": 0.5518, |
| "mean_token_accuracy": 0.8199409544467926, |
| "num_tokens": 957371.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.5337051153182983, |
| "epoch": 0.3563402889245586, |
| "grad_norm": 5.027129650115967, |
| "learning_rate": 9.988500558143337e-06, |
| "loss": 0.6374, |
| "mean_token_accuracy": 0.7932157814502716, |
| "num_tokens": 968899.0, |
| "step": 111 |
| }, |
| { |
| "entropy": 1.5816736221313477, |
| "epoch": 0.3595505617977528, |
| "grad_norm": 5.9406304359436035, |
| "learning_rate": 9.987770910797014e-06, |
| "loss": 0.5699, |
| "mean_token_accuracy": 0.8138624727725983, |
| "num_tokens": 977002.0, |
| "step": 112 |
| }, |
| { |
| "entropy": 1.5768181681632996, |
| "epoch": 0.36276083467094705, |
| "grad_norm": 5.563234806060791, |
| "learning_rate": 9.987018849950996e-06, |
| "loss": 0.5396, |
| "mean_token_accuracy": 0.8184849619865417, |
| "num_tokens": 985349.0, |
| "step": 113 |
| }, |
| { |
| "entropy": 1.6216952800750732, |
| "epoch": 0.36597110754414125, |
| "grad_norm": 4.94663667678833, |
| "learning_rate": 9.986244378984817e-06, |
| "loss": 0.5606, |
| "mean_token_accuracy": 0.8066204190254211, |
| "num_tokens": 994148.0, |
| "step": 114 |
| }, |
| { |
| "entropy": 1.587065875530243, |
| "epoch": 0.36918138041733545, |
| "grad_norm": 6.290283203125, |
| "learning_rate": 9.985447501378706e-06, |
| "loss": 0.5305, |
| "mean_token_accuracy": 0.8203730583190918, |
| "num_tokens": 1003995.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 1.6694093346595764, |
| "epoch": 0.3723916532905297, |
| "grad_norm": 7.1124138832092285, |
| "learning_rate": 9.984628220713587e-06, |
| "loss": 0.5579, |
| "mean_token_accuracy": 0.7883734703063965, |
| "num_tokens": 1012908.0, |
| "step": 116 |
| }, |
| { |
| "entropy": 1.710681140422821, |
| "epoch": 0.3756019261637239, |
| "grad_norm": 4.218459606170654, |
| "learning_rate": 9.983786540671052e-06, |
| "loss": 0.6475, |
| "mean_token_accuracy": 0.7862134873867035, |
| "num_tokens": 1021508.0, |
| "step": 117 |
| }, |
| { |
| "entropy": 1.6554288864135742, |
| "epoch": 0.37881219903691815, |
| "grad_norm": 22.958293914794922, |
| "learning_rate": 9.98292246503335e-06, |
| "loss": 0.5268, |
| "mean_token_accuracy": 0.8189591467380524, |
| "num_tokens": 1029277.0, |
| "step": 118 |
| }, |
| { |
| "entropy": 1.614859402179718, |
| "epoch": 0.38202247191011235, |
| "grad_norm": 7.260986328125, |
| "learning_rate": 9.982035997683372e-06, |
| "loss": 0.5515, |
| "mean_token_accuracy": 0.7965718805789948, |
| "num_tokens": 1037552.0, |
| "step": 119 |
| }, |
| { |
| "entropy": 1.623701572418213, |
| "epoch": 0.3852327447833066, |
| "grad_norm": 6.900394439697266, |
| "learning_rate": 9.981127142604628e-06, |
| "loss": 0.5543, |
| "mean_token_accuracy": 0.8073444068431854, |
| "num_tokens": 1046341.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.6763284802436829, |
| "epoch": 0.3884430176565008, |
| "grad_norm": 5.463217735290527, |
| "learning_rate": 9.980195903881231e-06, |
| "loss": 0.5907, |
| "mean_token_accuracy": 0.8022545874118805, |
| "num_tokens": 1054765.0, |
| "step": 121 |
| }, |
| { |
| "entropy": 1.7533277869224548, |
| "epoch": 0.391653290529695, |
| "grad_norm": 4.653329372406006, |
| "learning_rate": 9.979242285697878e-06, |
| "loss": 0.5433, |
| "mean_token_accuracy": 0.8024509847164154, |
| "num_tokens": 1063762.0, |
| "step": 122 |
| }, |
| { |
| "entropy": 1.6675923466682434, |
| "epoch": 0.39486356340288925, |
| "grad_norm": 4.841404914855957, |
| "learning_rate": 9.978266292339838e-06, |
| "loss": 0.5855, |
| "mean_token_accuracy": 0.8054037988185883, |
| "num_tokens": 1073721.0, |
| "step": 123 |
| }, |
| { |
| "entropy": 1.549901008605957, |
| "epoch": 0.39807383627608345, |
| "grad_norm": 4.415830612182617, |
| "learning_rate": 9.97726792819292e-06, |
| "loss": 0.6026, |
| "mean_token_accuracy": 0.8016513884067535, |
| "num_tokens": 1081712.0, |
| "step": 124 |
| }, |
| { |
| "entropy": 1.6741828322410583, |
| "epoch": 0.4012841091492777, |
| "grad_norm": 3.7602241039276123, |
| "learning_rate": 9.976247197743465e-06, |
| "loss": 0.5501, |
| "mean_token_accuracy": 0.8158861398696899, |
| "num_tokens": 1090661.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.747463345527649, |
| "epoch": 0.4044943820224719, |
| "grad_norm": 5.772115707397461, |
| "learning_rate": 9.975204105578318e-06, |
| "loss": 0.6089, |
| "mean_token_accuracy": 0.7918793559074402, |
| "num_tokens": 1099806.0, |
| "step": 126 |
| }, |
| { |
| "entropy": 1.503204584121704, |
| "epoch": 0.40770465489566615, |
| "grad_norm": 5.531157970428467, |
| "learning_rate": 9.974138656384815e-06, |
| "loss": 0.5032, |
| "mean_token_accuracy": 0.8174611032009125, |
| "num_tokens": 1107804.0, |
| "step": 127 |
| }, |
| { |
| "entropy": 1.6939732432365417, |
| "epoch": 0.41091492776886035, |
| "grad_norm": 6.103886127471924, |
| "learning_rate": 9.973050854950756e-06, |
| "loss": 0.5437, |
| "mean_token_accuracy": 0.8124994933605194, |
| "num_tokens": 1116670.0, |
| "step": 128 |
| }, |
| { |
| "entropy": 1.6444594860076904, |
| "epoch": 0.41412520064205455, |
| "grad_norm": 4.691072940826416, |
| "learning_rate": 9.97194070616438e-06, |
| "loss": 0.5612, |
| "mean_token_accuracy": 0.8085650205612183, |
| "num_tokens": 1125515.0, |
| "step": 129 |
| }, |
| { |
| "entropy": 1.577980101108551, |
| "epoch": 0.4173354735152488, |
| "grad_norm": 5.344744682312012, |
| "learning_rate": 9.970808215014357e-06, |
| "loss": 0.5397, |
| "mean_token_accuracy": 0.8058919608592987, |
| "num_tokens": 1133769.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.7274922728538513, |
| "epoch": 0.420545746388443, |
| "grad_norm": 12.869426727294922, |
| "learning_rate": 9.969653386589749e-06, |
| "loss": 0.5811, |
| "mean_token_accuracy": 0.8053319454193115, |
| "num_tokens": 1143863.0, |
| "step": 131 |
| }, |
| { |
| "entropy": 1.695865511894226, |
| "epoch": 0.42375601926163725, |
| "grad_norm": 7.400782585144043, |
| "learning_rate": 9.968476226079997e-06, |
| "loss": 0.5592, |
| "mean_token_accuracy": 0.8116855323314667, |
| "num_tokens": 1152046.0, |
| "step": 132 |
| }, |
| { |
| "entropy": 1.7647086381912231, |
| "epoch": 0.42696629213483145, |
| "grad_norm": 13.192926406860352, |
| "learning_rate": 9.967276738774897e-06, |
| "loss": 0.5565, |
| "mean_token_accuracy": 0.8083753287792206, |
| "num_tokens": 1160864.0, |
| "step": 133 |
| }, |
| { |
| "entropy": 1.5161982774734497, |
| "epoch": 0.4301765650080257, |
| "grad_norm": 4.342813014984131, |
| "learning_rate": 9.966054930064577e-06, |
| "loss": 0.5696, |
| "mean_token_accuracy": 0.8116317987442017, |
| "num_tokens": 1169164.0, |
| "step": 134 |
| }, |
| { |
| "entropy": 1.7910877466201782, |
| "epoch": 0.4333868378812199, |
| "grad_norm": 6.658879280090332, |
| "learning_rate": 9.964810805439464e-06, |
| "loss": 0.5709, |
| "mean_token_accuracy": 0.8017919361591339, |
| "num_tokens": 1179921.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 1.600732445716858, |
| "epoch": 0.43659711075441415, |
| "grad_norm": 14.430280685424805, |
| "learning_rate": 9.96354437049027e-06, |
| "loss": 0.625, |
| "mean_token_accuracy": 0.7819797992706299, |
| "num_tokens": 1189616.0, |
| "step": 136 |
| }, |
| { |
| "entropy": 1.7231240272521973, |
| "epoch": 0.43980738362760835, |
| "grad_norm": 9.525609970092773, |
| "learning_rate": 9.962255630907964e-06, |
| "loss": 0.5661, |
| "mean_token_accuracy": 0.8119661808013916, |
| "num_tokens": 1197261.0, |
| "step": 137 |
| }, |
| { |
| "entropy": 1.6479978561401367, |
| "epoch": 0.44301765650080255, |
| "grad_norm": 22.548439025878906, |
| "learning_rate": 9.96094459248374e-06, |
| "loss": 0.5784, |
| "mean_token_accuracy": 0.8013301193714142, |
| "num_tokens": 1204728.0, |
| "step": 138 |
| }, |
| { |
| "entropy": 1.7986060976982117, |
| "epoch": 0.4462279293739968, |
| "grad_norm": 5.41772985458374, |
| "learning_rate": 9.959611261108999e-06, |
| "loss": 0.546, |
| "mean_token_accuracy": 0.8117686808109283, |
| "num_tokens": 1214191.0, |
| "step": 139 |
| }, |
| { |
| "entropy": 1.5903725624084473, |
| "epoch": 0.449438202247191, |
| "grad_norm": 5.444282054901123, |
| "learning_rate": 9.95825564277532e-06, |
| "loss": 0.5919, |
| "mean_token_accuracy": 0.7964106798171997, |
| "num_tokens": 1223443.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.4649049639701843, |
| "epoch": 0.45264847512038525, |
| "grad_norm": 10.411364555358887, |
| "learning_rate": 9.956877743574437e-06, |
| "loss": 0.5833, |
| "mean_token_accuracy": 0.8069176971912384, |
| "num_tokens": 1231905.0, |
| "step": 141 |
| }, |
| { |
| "entropy": 1.6031562089920044, |
| "epoch": 0.45585874799357945, |
| "grad_norm": 4.023768424987793, |
| "learning_rate": 9.955477569698197e-06, |
| "loss": 0.5195, |
| "mean_token_accuracy": 0.8230155110359192, |
| "num_tokens": 1241887.0, |
| "step": 142 |
| }, |
| { |
| "entropy": 1.6915020942687988, |
| "epoch": 0.4590690208667737, |
| "grad_norm": 4.516135215759277, |
| "learning_rate": 9.954055127438554e-06, |
| "loss": 0.548, |
| "mean_token_accuracy": 0.8158730268478394, |
| "num_tokens": 1250696.0, |
| "step": 143 |
| }, |
| { |
| "entropy": 1.5421813130378723, |
| "epoch": 0.4622792937399679, |
| "grad_norm": 4.3148722648620605, |
| "learning_rate": 9.952610423187516e-06, |
| "loss": 0.6044, |
| "mean_token_accuracy": 0.794927716255188, |
| "num_tokens": 1259186.0, |
| "step": 144 |
| }, |
| { |
| "entropy": 1.6087906956672668, |
| "epoch": 0.4654895666131621, |
| "grad_norm": 10.47445011138916, |
| "learning_rate": 9.951143463437145e-06, |
| "loss": 0.6004, |
| "mean_token_accuracy": 0.7896897196769714, |
| "num_tokens": 1269293.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 1.522541105747223, |
| "epoch": 0.46869983948635635, |
| "grad_norm": 5.343850612640381, |
| "learning_rate": 9.949654254779499e-06, |
| "loss": 0.5312, |
| "mean_token_accuracy": 0.8037137985229492, |
| "num_tokens": 1278821.0, |
| "step": 146 |
| }, |
| { |
| "entropy": 1.67844557762146, |
| "epoch": 0.47191011235955055, |
| "grad_norm": 6.206827640533447, |
| "learning_rate": 9.948142803906623e-06, |
| "loss": 0.6064, |
| "mean_token_accuracy": 0.7928981781005859, |
| "num_tokens": 1286812.0, |
| "step": 147 |
| }, |
| { |
| "entropy": 1.5753509998321533, |
| "epoch": 0.4751203852327448, |
| "grad_norm": 5.666691303253174, |
| "learning_rate": 9.946609117610508e-06, |
| "loss": 0.5374, |
| "mean_token_accuracy": 0.8405792713165283, |
| "num_tokens": 1297550.0, |
| "step": 148 |
| }, |
| { |
| "entropy": 1.699306309223175, |
| "epoch": 0.478330658105939, |
| "grad_norm": 5.685131072998047, |
| "learning_rate": 9.94505320278307e-06, |
| "loss": 0.6641, |
| "mean_token_accuracy": 0.7865519523620605, |
| "num_tokens": 1307649.0, |
| "step": 149 |
| }, |
| { |
| "entropy": 1.8110342025756836, |
| "epoch": 0.48154093097913325, |
| "grad_norm": 6.575296401977539, |
| "learning_rate": 9.943475066416105e-06, |
| "loss": 0.5588, |
| "mean_token_accuracy": 0.8110237419605255, |
| "num_tokens": 1317315.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.5596372485160828, |
| "epoch": 0.48475120385232745, |
| "grad_norm": 14.576911926269531, |
| "learning_rate": 9.94187471560127e-06, |
| "loss": 0.6137, |
| "mean_token_accuracy": 0.8028269708156586, |
| "num_tokens": 1326698.0, |
| "step": 151 |
| }, |
| { |
| "entropy": 1.6433868408203125, |
| "epoch": 0.48796147672552165, |
| "grad_norm": 4.3727216720581055, |
| "learning_rate": 9.940252157530048e-06, |
| "loss": 0.5117, |
| "mean_token_accuracy": 0.8254751861095428, |
| "num_tokens": 1334747.0, |
| "step": 152 |
| }, |
| { |
| "entropy": 1.7227093577384949, |
| "epoch": 0.4911717495987159, |
| "grad_norm": 10.242402076721191, |
| "learning_rate": 9.938607399493714e-06, |
| "loss": 0.5957, |
| "mean_token_accuracy": 0.8011538982391357, |
| "num_tokens": 1343974.0, |
| "step": 153 |
| }, |
| { |
| "entropy": 1.6710272431373596, |
| "epoch": 0.4943820224719101, |
| "grad_norm": 9.269742012023926, |
| "learning_rate": 9.936940448883299e-06, |
| "loss": 0.564, |
| "mean_token_accuracy": 0.810530036687851, |
| "num_tokens": 1352278.0, |
| "step": 154 |
| }, |
| { |
| "entropy": 1.6066045761108398, |
| "epoch": 0.49759229534510435, |
| "grad_norm": 4.906155109405518, |
| "learning_rate": 9.935251313189564e-06, |
| "loss": 0.5821, |
| "mean_token_accuracy": 0.8002186417579651, |
| "num_tokens": 1360110.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 1.6245338916778564, |
| "epoch": 0.5008025682182986, |
| "grad_norm": 4.081302642822266, |
| "learning_rate": 9.933540000002966e-06, |
| "loss": 0.5425, |
| "mean_token_accuracy": 0.8081716001033783, |
| "num_tokens": 1369064.0, |
| "step": 156 |
| }, |
| { |
| "entropy": 1.814435362815857, |
| "epoch": 0.5040128410914928, |
| "grad_norm": 4.305764198303223, |
| "learning_rate": 9.931806517013612e-06, |
| "loss": 0.5317, |
| "mean_token_accuracy": 0.8192135095596313, |
| "num_tokens": 1377809.0, |
| "step": 157 |
| }, |
| { |
| "entropy": 1.5969063639640808, |
| "epoch": 0.507223113964687, |
| "grad_norm": 7.35665225982666, |
| "learning_rate": 9.930050872011242e-06, |
| "loss": 0.583, |
| "mean_token_accuracy": 0.7939075827598572, |
| "num_tokens": 1386467.0, |
| "step": 158 |
| }, |
| { |
| "entropy": 1.554268717765808, |
| "epoch": 0.5104333868378812, |
| "grad_norm": 4.8409271240234375, |
| "learning_rate": 9.92827307288518e-06, |
| "loss": 0.5961, |
| "mean_token_accuracy": 0.801440566778183, |
| "num_tokens": 1395387.0, |
| "step": 159 |
| }, |
| { |
| "entropy": 1.658279836177826, |
| "epoch": 0.5136436597110754, |
| "grad_norm": 4.918241024017334, |
| "learning_rate": 9.926473127624306e-06, |
| "loss": 0.5237, |
| "mean_token_accuracy": 0.8125604391098022, |
| "num_tokens": 1402911.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.6950554847717285, |
| "epoch": 0.5168539325842697, |
| "grad_norm": 4.100020885467529, |
| "learning_rate": 9.924651044317017e-06, |
| "loss": 0.5679, |
| "mean_token_accuracy": 0.8086461424827576, |
| "num_tokens": 1410709.0, |
| "step": 161 |
| }, |
| { |
| "entropy": 1.6739388704299927, |
| "epoch": 0.5200642054574639, |
| "grad_norm": 4.0072126388549805, |
| "learning_rate": 9.922806831151192e-06, |
| "loss": 0.483, |
| "mean_token_accuracy": 0.8350411057472229, |
| "num_tokens": 1419682.0, |
| "step": 162 |
| }, |
| { |
| "entropy": 1.7393546104431152, |
| "epoch": 0.5232744783306581, |
| "grad_norm": 4.075192451477051, |
| "learning_rate": 9.920940496414153e-06, |
| "loss": 0.4849, |
| "mean_token_accuracy": 0.8328077495098114, |
| "num_tokens": 1428573.0, |
| "step": 163 |
| }, |
| { |
| "entropy": 1.685001015663147, |
| "epoch": 0.5264847512038523, |
| "grad_norm": 4.823047637939453, |
| "learning_rate": 9.919052048492633e-06, |
| "loss": 0.6149, |
| "mean_token_accuracy": 0.8053491711616516, |
| "num_tokens": 1438592.0, |
| "step": 164 |
| }, |
| { |
| "entropy": 1.5803438425064087, |
| "epoch": 0.5296950240770465, |
| "grad_norm": 6.810695648193359, |
| "learning_rate": 9.917141495872733e-06, |
| "loss": 0.5079, |
| "mean_token_accuracy": 0.8178175091743469, |
| "num_tokens": 1446506.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 1.6139253377914429, |
| "epoch": 0.5329052969502408, |
| "grad_norm": 8.459833145141602, |
| "learning_rate": 9.915208847139883e-06, |
| "loss": 0.5564, |
| "mean_token_accuracy": 0.8102662563323975, |
| "num_tokens": 1454804.0, |
| "step": 166 |
| }, |
| { |
| "entropy": 1.501044511795044, |
| "epoch": 0.536115569823435, |
| "grad_norm": 56.71458435058594, |
| "learning_rate": 9.913254110978812e-06, |
| "loss": 0.5484, |
| "mean_token_accuracy": 0.8092103004455566, |
| "num_tokens": 1463547.0, |
| "step": 167 |
| }, |
| { |
| "entropy": 1.6031732559204102, |
| "epoch": 0.5393258426966292, |
| "grad_norm": 47.60392761230469, |
| "learning_rate": 9.911277296173498e-06, |
| "loss": 0.5669, |
| "mean_token_accuracy": 0.8047437965869904, |
| "num_tokens": 1472335.0, |
| "step": 168 |
| }, |
| { |
| "entropy": 1.471640169620514, |
| "epoch": 0.5425361155698234, |
| "grad_norm": 4.785444259643555, |
| "learning_rate": 9.909278411607134e-06, |
| "loss": 0.5332, |
| "mean_token_accuracy": 0.816964328289032, |
| "num_tokens": 1481165.0, |
| "step": 169 |
| }, |
| { |
| "entropy": 1.6395891308784485, |
| "epoch": 0.5457463884430177, |
| "grad_norm": 5.150639533996582, |
| "learning_rate": 9.90725746626209e-06, |
| "loss": 0.5777, |
| "mean_token_accuracy": 0.802484005689621, |
| "num_tokens": 1490534.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.7222612500190735, |
| "epoch": 0.5489566613162119, |
| "grad_norm": 10.601120948791504, |
| "learning_rate": 9.90521446921987e-06, |
| "loss": 0.5492, |
| "mean_token_accuracy": 0.812993735074997, |
| "num_tokens": 1497850.0, |
| "step": 171 |
| }, |
| { |
| "entropy": 1.4922645092010498, |
| "epoch": 0.5521669341894061, |
| "grad_norm": 6.482858657836914, |
| "learning_rate": 9.903149429661072e-06, |
| "loss": 0.6384, |
| "mean_token_accuracy": 0.7931837439537048, |
| "num_tokens": 1506706.0, |
| "step": 172 |
| }, |
| { |
| "entropy": 1.649798572063446, |
| "epoch": 0.5553772070626003, |
| "grad_norm": 4.392049789428711, |
| "learning_rate": 9.90106235686534e-06, |
| "loss": 0.4938, |
| "mean_token_accuracy": 0.8184403777122498, |
| "num_tokens": 1514792.0, |
| "step": 173 |
| }, |
| { |
| "entropy": 1.639334261417389, |
| "epoch": 0.5585874799357945, |
| "grad_norm": 5.779803276062012, |
| "learning_rate": 9.89895326021134e-06, |
| "loss": 0.5342, |
| "mean_token_accuracy": 0.8119508326053619, |
| "num_tokens": 1522723.0, |
| "step": 174 |
| }, |
| { |
| "entropy": 1.6969090700149536, |
| "epoch": 0.5617977528089888, |
| "grad_norm": 4.983688831329346, |
| "learning_rate": 9.896822149176695e-06, |
| "loss": 0.5159, |
| "mean_token_accuracy": 0.8205990195274353, |
| "num_tokens": 1531592.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 1.609685242176056, |
| "epoch": 0.565008025682183, |
| "grad_norm": 5.268664836883545, |
| "learning_rate": 9.894669033337962e-06, |
| "loss": 0.5402, |
| "mean_token_accuracy": 0.8162535429000854, |
| "num_tokens": 1541237.0, |
| "step": 176 |
| }, |
| { |
| "entropy": 1.7183340787887573, |
| "epoch": 0.5682182985553772, |
| "grad_norm": 6.601222038269043, |
| "learning_rate": 9.892493922370575e-06, |
| "loss": 0.5283, |
| "mean_token_accuracy": 0.8257523775100708, |
| "num_tokens": 1548767.0, |
| "step": 177 |
| }, |
| { |
| "entropy": 1.793528974056244, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 7.592155933380127, |
| "learning_rate": 9.89029682604881e-06, |
| "loss": 0.51, |
| "mean_token_accuracy": 0.8226527869701385, |
| "num_tokens": 1556348.0, |
| "step": 178 |
| }, |
| { |
| "entropy": 1.7014802694320679, |
| "epoch": 0.5746388443017657, |
| "grad_norm": 5.251842021942139, |
| "learning_rate": 9.888077754245741e-06, |
| "loss": 0.5704, |
| "mean_token_accuracy": 0.8048693239688873, |
| "num_tokens": 1565716.0, |
| "step": 179 |
| }, |
| { |
| "entropy": 1.56605464220047, |
| "epoch": 0.5778491171749599, |
| "grad_norm": 7.1781086921691895, |
| "learning_rate": 9.88583671693319e-06, |
| "loss": 0.5432, |
| "mean_token_accuracy": 0.8167294263839722, |
| "num_tokens": 1574130.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.8687368035316467, |
| "epoch": 0.5810593900481541, |
| "grad_norm": 6.982422828674316, |
| "learning_rate": 9.883573724181683e-06, |
| "loss": 0.5324, |
| "mean_token_accuracy": 0.8053440749645233, |
| "num_tokens": 1583316.0, |
| "step": 181 |
| }, |
| { |
| "entropy": 1.5925783514976501, |
| "epoch": 0.5842696629213483, |
| "grad_norm": 4.656749725341797, |
| "learning_rate": 9.881288786160413e-06, |
| "loss": 0.5597, |
| "mean_token_accuracy": 0.8063121140003204, |
| "num_tokens": 1591444.0, |
| "step": 182 |
| }, |
| { |
| "entropy": 1.5454033613204956, |
| "epoch": 0.5874799357945425, |
| "grad_norm": 4.605663299560547, |
| "learning_rate": 9.878981913137178e-06, |
| "loss": 0.4796, |
| "mean_token_accuracy": 0.8154693841934204, |
| "num_tokens": 1601849.0, |
| "step": 183 |
| }, |
| { |
| "entropy": 1.6912750005722046, |
| "epoch": 0.5906902086677368, |
| "grad_norm": 6.622162342071533, |
| "learning_rate": 9.87665311547836e-06, |
| "loss": 0.5147, |
| "mean_token_accuracy": 0.8236511945724487, |
| "num_tokens": 1609557.0, |
| "step": 184 |
| }, |
| { |
| "entropy": 1.6364272832870483, |
| "epoch": 0.593900481540931, |
| "grad_norm": 5.712879657745361, |
| "learning_rate": 9.87430240364885e-06, |
| "loss": 0.5518, |
| "mean_token_accuracy": 0.8128435611724854, |
| "num_tokens": 1619608.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 1.7632077932357788, |
| "epoch": 0.5971107544141252, |
| "grad_norm": 4.955244541168213, |
| "learning_rate": 9.871929788212022e-06, |
| "loss": 0.5948, |
| "mean_token_accuracy": 0.7959562540054321, |
| "num_tokens": 1629727.0, |
| "step": 186 |
| }, |
| { |
| "entropy": 1.6786929965019226, |
| "epoch": 0.6003210272873194, |
| "grad_norm": 6.2427473068237305, |
| "learning_rate": 9.869535279829674e-06, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.7966740429401398, |
| "num_tokens": 1639226.0, |
| "step": 187 |
| }, |
| { |
| "entropy": 1.6981608867645264, |
| "epoch": 0.6035313001605136, |
| "grad_norm": 4.792971134185791, |
| "learning_rate": 9.867118889261988e-06, |
| "loss": 0.5328, |
| "mean_token_accuracy": 0.81971076130867, |
| "num_tokens": 1647305.0, |
| "step": 188 |
| }, |
| { |
| "entropy": 1.669969916343689, |
| "epoch": 0.6067415730337079, |
| "grad_norm": 3.715688943862915, |
| "learning_rate": 9.864680627367476e-06, |
| "loss": 0.6066, |
| "mean_token_accuracy": 0.7924070656299591, |
| "num_tokens": 1656645.0, |
| "step": 189 |
| }, |
| { |
| "entropy": 1.75922429561615, |
| "epoch": 0.6099518459069021, |
| "grad_norm": 5.012786388397217, |
| "learning_rate": 9.862220505102933e-06, |
| "loss": 0.5591, |
| "mean_token_accuracy": 0.8037042319774628, |
| "num_tokens": 1665007.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.7086448073387146, |
| "epoch": 0.6131621187800963, |
| "grad_norm": 6.47388219833374, |
| "learning_rate": 9.859738533523384e-06, |
| "loss": 0.5558, |
| "mean_token_accuracy": 0.8141147196292877, |
| "num_tokens": 1673184.0, |
| "step": 191 |
| }, |
| { |
| "entropy": 1.6764840483665466, |
| "epoch": 0.6163723916532905, |
| "grad_norm": 5.120999336242676, |
| "learning_rate": 9.857234723782044e-06, |
| "loss": 0.5278, |
| "mean_token_accuracy": 0.8128564059734344, |
| "num_tokens": 1682622.0, |
| "step": 192 |
| }, |
| { |
| "entropy": 1.7752752900123596, |
| "epoch": 0.6195826645264848, |
| "grad_norm": 5.461716175079346, |
| "learning_rate": 9.854709087130261e-06, |
| "loss": 0.5447, |
| "mean_token_accuracy": 0.8150747120380402, |
| "num_tokens": 1690983.0, |
| "step": 193 |
| }, |
| { |
| "entropy": 1.576039433479309, |
| "epoch": 0.622792937399679, |
| "grad_norm": 16.93771743774414, |
| "learning_rate": 9.852161634917463e-06, |
| "loss": 0.5159, |
| "mean_token_accuracy": 0.8044776320457458, |
| "num_tokens": 1699951.0, |
| "step": 194 |
| }, |
| { |
| "entropy": 1.6042520999908447, |
| "epoch": 0.6260032102728732, |
| "grad_norm": 4.1131062507629395, |
| "learning_rate": 9.849592378591113e-06, |
| "loss": 0.5323, |
| "mean_token_accuracy": 0.8186607956886292, |
| "num_tokens": 1708057.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 1.8272185921669006, |
| "epoch": 0.6292134831460674, |
| "grad_norm": 5.804073810577393, |
| "learning_rate": 9.847001329696653e-06, |
| "loss": 0.5514, |
| "mean_token_accuracy": 0.8083562552928925, |
| "num_tokens": 1717292.0, |
| "step": 196 |
| }, |
| { |
| "entropy": 1.5746408700942993, |
| "epoch": 0.6324237560192616, |
| "grad_norm": 13.559549331665039, |
| "learning_rate": 9.844388499877457e-06, |
| "loss": 0.6, |
| "mean_token_accuracy": 0.8068454265594482, |
| "num_tokens": 1725347.0, |
| "step": 197 |
| }, |
| { |
| "entropy": 1.698961853981018, |
| "epoch": 0.6356340288924559, |
| "grad_norm": 4.889476299285889, |
| "learning_rate": 9.841753900874774e-06, |
| "loss": 0.5872, |
| "mean_token_accuracy": 0.799273282289505, |
| "num_tokens": 1732840.0, |
| "step": 198 |
| }, |
| { |
| "entropy": 1.7588367462158203, |
| "epoch": 0.6388443017656501, |
| "grad_norm": 5.369776725769043, |
| "learning_rate": 9.839097544527674e-06, |
| "loss": 0.5042, |
| "mean_token_accuracy": 0.8182494044303894, |
| "num_tokens": 1740984.0, |
| "step": 199 |
| }, |
| { |
| "entropy": 1.730661690235138, |
| "epoch": 0.6420545746388443, |
| "grad_norm": 17.513286590576172, |
| "learning_rate": 9.836419442773004e-06, |
| "loss": 0.5309, |
| "mean_token_accuracy": 0.8073310256004333, |
| "num_tokens": 1749331.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.7495179772377014, |
| "epoch": 0.6452648475120385, |
| "grad_norm": 5.817629814147949, |
| "learning_rate": 9.833719607645325e-06, |
| "loss": 0.5107, |
| "mean_token_accuracy": 0.8234334290027618, |
| "num_tokens": 1757183.0, |
| "step": 201 |
| }, |
| { |
| "entropy": 1.6555100679397583, |
| "epoch": 0.6484751203852327, |
| "grad_norm": 5.809082508087158, |
| "learning_rate": 9.830998051276858e-06, |
| "loss": 0.5884, |
| "mean_token_accuracy": 0.795190691947937, |
| "num_tokens": 1764907.0, |
| "step": 202 |
| }, |
| { |
| "entropy": 1.8146103024482727, |
| "epoch": 0.651685393258427, |
| "grad_norm": 4.854206085205078, |
| "learning_rate": 9.82825478589744e-06, |
| "loss": 0.5527, |
| "mean_token_accuracy": 0.8077179789543152, |
| "num_tokens": 1773880.0, |
| "step": 203 |
| }, |
| { |
| "entropy": 1.6006608605384827, |
| "epoch": 0.6548956661316212, |
| "grad_norm": 4.588094711303711, |
| "learning_rate": 9.825489823834454e-06, |
| "loss": 0.5905, |
| "mean_token_accuracy": 0.7999958395957947, |
| "num_tokens": 1782825.0, |
| "step": 204 |
| }, |
| { |
| "entropy": 1.6996418237686157, |
| "epoch": 0.6581059390048154, |
| "grad_norm": 6.713345050811768, |
| "learning_rate": 9.822703177512783e-06, |
| "loss": 0.5563, |
| "mean_token_accuracy": 0.8049568831920624, |
| "num_tokens": 1790524.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 1.5989940166473389, |
| "epoch": 0.6613162118780096, |
| "grad_norm": 4.155484199523926, |
| "learning_rate": 9.819894859454756e-06, |
| "loss": 0.5546, |
| "mean_token_accuracy": 0.8159286975860596, |
| "num_tokens": 1799241.0, |
| "step": 206 |
| }, |
| { |
| "entropy": 1.6221102476119995, |
| "epoch": 0.6645264847512039, |
| "grad_norm": 5.686789035797119, |
| "learning_rate": 9.817064882280085e-06, |
| "loss": 0.5798, |
| "mean_token_accuracy": 0.8095100820064545, |
| "num_tokens": 1807291.0, |
| "step": 207 |
| }, |
| { |
| "entropy": 1.556957483291626, |
| "epoch": 0.6677367576243981, |
| "grad_norm": 3.906980514526367, |
| "learning_rate": 9.814213258705813e-06, |
| "loss": 0.6096, |
| "mean_token_accuracy": 0.8013098835945129, |
| "num_tokens": 1815356.0, |
| "step": 208 |
| }, |
| { |
| "entropy": 1.6848229765892029, |
| "epoch": 0.6709470304975923, |
| "grad_norm": 4.213841438293457, |
| "learning_rate": 9.811340001546252e-06, |
| "loss": 0.5178, |
| "mean_token_accuracy": 0.8091440200805664, |
| "num_tokens": 1823404.0, |
| "step": 209 |
| }, |
| { |
| "entropy": 1.767141580581665, |
| "epoch": 0.6741573033707865, |
| "grad_norm": 4.5109782218933105, |
| "learning_rate": 9.808445123712934e-06, |
| "loss": 0.5404, |
| "mean_token_accuracy": 0.8120012879371643, |
| "num_tokens": 1832487.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.936172902584076, |
| "epoch": 0.6773675762439807, |
| "grad_norm": 6.350546360015869, |
| "learning_rate": 9.805528638214543e-06, |
| "loss": 0.5833, |
| "mean_token_accuracy": 0.8069708943367004, |
| "num_tokens": 1841466.0, |
| "step": 211 |
| }, |
| { |
| "entropy": 1.730377733707428, |
| "epoch": 0.680577849117175, |
| "grad_norm": 3.720072031021118, |
| "learning_rate": 9.802590558156863e-06, |
| "loss": 0.5529, |
| "mean_token_accuracy": 0.8099434673786163, |
| "num_tokens": 1851255.0, |
| "step": 212 |
| }, |
| { |
| "entropy": 1.8273388147354126, |
| "epoch": 0.6837881219903692, |
| "grad_norm": 4.521886348724365, |
| "learning_rate": 9.799630896742716e-06, |
| "loss": 0.5624, |
| "mean_token_accuracy": 0.8054306507110596, |
| "num_tokens": 1859161.0, |
| "step": 213 |
| }, |
| { |
| "entropy": 1.6505126357078552, |
| "epoch": 0.6869983948635634, |
| "grad_norm": 3.8699588775634766, |
| "learning_rate": 9.796649667271905e-06, |
| "loss": 0.5313, |
| "mean_token_accuracy": 0.8173911273479462, |
| "num_tokens": 1869881.0, |
| "step": 214 |
| }, |
| { |
| "entropy": 1.7609619498252869, |
| "epoch": 0.6902086677367576, |
| "grad_norm": 4.7254228591918945, |
| "learning_rate": 9.793646883141155e-06, |
| "loss": 0.5471, |
| "mean_token_accuracy": 0.8083101809024811, |
| "num_tokens": 1878217.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 1.8547272682189941, |
| "epoch": 0.6934189406099518, |
| "grad_norm": 5.278886795043945, |
| "learning_rate": 9.790622557844047e-06, |
| "loss": 0.5315, |
| "mean_token_accuracy": 0.811885803937912, |
| "num_tokens": 1886890.0, |
| "step": 216 |
| }, |
| { |
| "entropy": 1.6623858213424683, |
| "epoch": 0.6966292134831461, |
| "grad_norm": 4.21257209777832, |
| "learning_rate": 9.787576704970965e-06, |
| "loss": 0.5739, |
| "mean_token_accuracy": 0.8085341453552246, |
| "num_tokens": 1895443.0, |
| "step": 217 |
| }, |
| { |
| "entropy": 1.8965311646461487, |
| "epoch": 0.6998394863563403, |
| "grad_norm": 6.084798336029053, |
| "learning_rate": 9.784509338209026e-06, |
| "loss": 0.5523, |
| "mean_token_accuracy": 0.819883793592453, |
| "num_tokens": 1903524.0, |
| "step": 218 |
| }, |
| { |
| "entropy": 1.78938889503479, |
| "epoch": 0.7030497592295345, |
| "grad_norm": 4.32880973815918, |
| "learning_rate": 9.781420471342035e-06, |
| "loss": 0.5657, |
| "mean_token_accuracy": 0.8084932565689087, |
| "num_tokens": 1912941.0, |
| "step": 219 |
| }, |
| { |
| "entropy": 1.7305169105529785, |
| "epoch": 0.7062600321027287, |
| "grad_norm": 28.916038513183594, |
| "learning_rate": 9.778310118250397e-06, |
| "loss": 0.4682, |
| "mean_token_accuracy": 0.8382920622825623, |
| "num_tokens": 1921692.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.7564342617988586, |
| "epoch": 0.709470304975923, |
| "grad_norm": 3.821200370788574, |
| "learning_rate": 9.77517829291108e-06, |
| "loss": 0.4967, |
| "mean_token_accuracy": 0.828892856836319, |
| "num_tokens": 1930053.0, |
| "step": 221 |
| }, |
| { |
| "entropy": 1.6086868047714233, |
| "epoch": 0.7126805778491172, |
| "grad_norm": 4.011463642120361, |
| "learning_rate": 9.772025009397538e-06, |
| "loss": 0.5361, |
| "mean_token_accuracy": 0.8106788098812103, |
| "num_tokens": 1938556.0, |
| "step": 222 |
| }, |
| { |
| "entropy": 1.7607861757278442, |
| "epoch": 0.7158908507223114, |
| "grad_norm": 3.9651222229003906, |
| "learning_rate": 9.768850281879651e-06, |
| "loss": 0.5446, |
| "mean_token_accuracy": 0.8144137263298035, |
| "num_tokens": 1946709.0, |
| "step": 223 |
| }, |
| { |
| "entropy": 1.8207188248634338, |
| "epoch": 0.7191011235955056, |
| "grad_norm": 17.640336990356445, |
| "learning_rate": 9.765654124623664e-06, |
| "loss": 0.5415, |
| "mean_token_accuracy": 0.8262408673763275, |
| "num_tokens": 1954532.0, |
| "step": 224 |
| }, |
| { |
| "entropy": 1.9907708168029785, |
| "epoch": 0.7223113964686998, |
| "grad_norm": 16.587417602539062, |
| "learning_rate": 9.762436551992117e-06, |
| "loss": 0.5316, |
| "mean_token_accuracy": 0.8141103982925415, |
| "num_tokens": 1962607.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 1.771414875984192, |
| "epoch": 0.7255216693418941, |
| "grad_norm": 5.63399076461792, |
| "learning_rate": 9.759197578443787e-06, |
| "loss": 0.5473, |
| "mean_token_accuracy": 0.8127427995204926, |
| "num_tokens": 1971317.0, |
| "step": 226 |
| }, |
| { |
| "entropy": 1.8039385080337524, |
| "epoch": 0.7287319422150883, |
| "grad_norm": 9.91141128540039, |
| "learning_rate": 9.755937218533622e-06, |
| "loss": 0.5521, |
| "mean_token_accuracy": 0.8093193471431732, |
| "num_tokens": 1979337.0, |
| "step": 227 |
| }, |
| { |
| "entropy": 1.8303923606872559, |
| "epoch": 0.7319422150882825, |
| "grad_norm": 3.973386526107788, |
| "learning_rate": 9.752655486912666e-06, |
| "loss": 0.5292, |
| "mean_token_accuracy": 0.8189589083194733, |
| "num_tokens": 1989078.0, |
| "step": 228 |
| }, |
| { |
| "entropy": 1.8203710913658142, |
| "epoch": 0.7351524879614767, |
| "grad_norm": 4.977687358856201, |
| "learning_rate": 9.74935239832801e-06, |
| "loss": 0.5419, |
| "mean_token_accuracy": 0.8108758628368378, |
| "num_tokens": 1997064.0, |
| "step": 229 |
| }, |
| { |
| "entropy": 1.7490638494491577, |
| "epoch": 0.7383627608346709, |
| "grad_norm": 3.8043088912963867, |
| "learning_rate": 9.746027967622709e-06, |
| "loss": 0.5665, |
| "mean_token_accuracy": 0.807654470205307, |
| "num_tokens": 2005949.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 2.048327326774597, |
| "epoch": 0.7415730337078652, |
| "grad_norm": 4.386172771453857, |
| "learning_rate": 9.742682209735727e-06, |
| "loss": 0.507, |
| "mean_token_accuracy": 0.8336832225322723, |
| "num_tokens": 2014059.0, |
| "step": 231 |
| }, |
| { |
| "entropy": 1.7495536804199219, |
| "epoch": 0.7447833065810594, |
| "grad_norm": 4.291626453399658, |
| "learning_rate": 9.739315139701868e-06, |
| "loss": 0.5388, |
| "mean_token_accuracy": 0.8060157299041748, |
| "num_tokens": 2023566.0, |
| "step": 232 |
| }, |
| { |
| "entropy": 1.949560523033142, |
| "epoch": 0.7479935794542536, |
| "grad_norm": 3.818927764892578, |
| "learning_rate": 9.735926772651703e-06, |
| "loss": 0.5368, |
| "mean_token_accuracy": 0.8259284198284149, |
| "num_tokens": 2033999.0, |
| "step": 233 |
| }, |
| { |
| "entropy": 1.6942541599273682, |
| "epoch": 0.7512038523274478, |
| "grad_norm": 4.930175304412842, |
| "learning_rate": 9.732517123811502e-06, |
| "loss": 0.4826, |
| "mean_token_accuracy": 0.8296216726303101, |
| "num_tokens": 2043181.0, |
| "step": 234 |
| }, |
| { |
| "entropy": 1.7138220071792603, |
| "epoch": 0.7544141252006421, |
| "grad_norm": 12.674991607666016, |
| "learning_rate": 9.729086208503174e-06, |
| "loss": 0.6105, |
| "mean_token_accuracy": 0.8042653799057007, |
| "num_tokens": 2051883.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 1.7605062127113342, |
| "epoch": 0.7576243980738363, |
| "grad_norm": 4.540236473083496, |
| "learning_rate": 9.725634042144192e-06, |
| "loss": 0.5518, |
| "mean_token_accuracy": 0.8065124750137329, |
| "num_tokens": 2060315.0, |
| "step": 236 |
| }, |
| { |
| "entropy": 1.7666863799095154, |
| "epoch": 0.7608346709470305, |
| "grad_norm": 8.36759090423584, |
| "learning_rate": 9.722160640247523e-06, |
| "loss": 0.6014, |
| "mean_token_accuracy": 0.8097147047519684, |
| "num_tokens": 2070260.0, |
| "step": 237 |
| }, |
| { |
| "entropy": 1.674265742301941, |
| "epoch": 0.7640449438202247, |
| "grad_norm": 3.945760726928711, |
| "learning_rate": 9.71866601842156e-06, |
| "loss": 0.503, |
| "mean_token_accuracy": 0.8159506320953369, |
| "num_tokens": 2079997.0, |
| "step": 238 |
| }, |
| { |
| "entropy": 1.7028595209121704, |
| "epoch": 0.7672552166934189, |
| "grad_norm": 4.027834892272949, |
| "learning_rate": 9.715150192370054e-06, |
| "loss": 0.5632, |
| "mean_token_accuracy": 0.8053656220436096, |
| "num_tokens": 2088508.0, |
| "step": 239 |
| }, |
| { |
| "entropy": 1.7460883855819702, |
| "epoch": 0.7704654895666132, |
| "grad_norm": 5.3727312088012695, |
| "learning_rate": 9.71161317789204e-06, |
| "loss": 0.536, |
| "mean_token_accuracy": 0.812755823135376, |
| "num_tokens": 2097683.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.7760279774665833, |
| "epoch": 0.7736757624398074, |
| "grad_norm": 11.409591674804688, |
| "learning_rate": 9.708054990881763e-06, |
| "loss": 0.5327, |
| "mean_token_accuracy": 0.8147374391555786, |
| "num_tokens": 2106963.0, |
| "step": 241 |
| }, |
| { |
| "entropy": 1.7303342819213867, |
| "epoch": 0.7768860353130016, |
| "grad_norm": 3.939631462097168, |
| "learning_rate": 9.70447564732862e-06, |
| "loss": 0.5212, |
| "mean_token_accuracy": 0.8153030872344971, |
| "num_tokens": 2116407.0, |
| "step": 242 |
| }, |
| { |
| "entropy": 1.629269540309906, |
| "epoch": 0.7800963081861958, |
| "grad_norm": 6.595477104187012, |
| "learning_rate": 9.700875163317072e-06, |
| "loss": 0.5316, |
| "mean_token_accuracy": 0.8108282685279846, |
| "num_tokens": 2124321.0, |
| "step": 243 |
| }, |
| { |
| "entropy": 1.7000606060028076, |
| "epoch": 0.78330658105939, |
| "grad_norm": 4.974279403686523, |
| "learning_rate": 9.69725355502658e-06, |
| "loss": 0.532, |
| "mean_token_accuracy": 0.8205364644527435, |
| "num_tokens": 2132747.0, |
| "step": 244 |
| }, |
| { |
| "entropy": 1.7894864082336426, |
| "epoch": 0.7865168539325843, |
| "grad_norm": 4.206717491149902, |
| "learning_rate": 9.693610838731532e-06, |
| "loss": 0.5707, |
| "mean_token_accuracy": 0.8029159605503082, |
| "num_tokens": 2142048.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 1.8813952803611755, |
| "epoch": 0.7897271268057785, |
| "grad_norm": 3.395116090774536, |
| "learning_rate": 9.689947030801168e-06, |
| "loss": 0.5403, |
| "mean_token_accuracy": 0.8176522552967072, |
| "num_tokens": 2151683.0, |
| "step": 246 |
| }, |
| { |
| "entropy": 1.795137882232666, |
| "epoch": 0.7929373996789727, |
| "grad_norm": 4.789092540740967, |
| "learning_rate": 9.686262147699507e-06, |
| "loss": 0.5635, |
| "mean_token_accuracy": 0.8097488880157471, |
| "num_tokens": 2160162.0, |
| "step": 247 |
| }, |
| { |
| "entropy": 1.7523850202560425, |
| "epoch": 0.7961476725521669, |
| "grad_norm": 3.9210240840911865, |
| "learning_rate": 9.682556205985274e-06, |
| "loss": 0.5662, |
| "mean_token_accuracy": 0.802306056022644, |
| "num_tokens": 2168445.0, |
| "step": 248 |
| }, |
| { |
| "entropy": 2.022180676460266, |
| "epoch": 0.7993579454253612, |
| "grad_norm": 4.333629608154297, |
| "learning_rate": 9.678829222311827e-06, |
| "loss": 0.5617, |
| "mean_token_accuracy": 0.8150058388710022, |
| "num_tokens": 2177350.0, |
| "step": 249 |
| }, |
| { |
| "entropy": 1.7913747429847717, |
| "epoch": 0.8025682182985554, |
| "grad_norm": 6.312867641448975, |
| "learning_rate": 9.675081213427076e-06, |
| "loss": 0.5039, |
| "mean_token_accuracy": 0.8336711227893829, |
| "num_tokens": 2185344.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.955717146396637, |
| "epoch": 0.8057784911717496, |
| "grad_norm": 4.374723434448242, |
| "learning_rate": 9.671312196173413e-06, |
| "loss": 0.5752, |
| "mean_token_accuracy": 0.8053406774997711, |
| "num_tokens": 2195099.0, |
| "step": 251 |
| }, |
| { |
| "entropy": 1.8786735534667969, |
| "epoch": 0.8089887640449438, |
| "grad_norm": 6.004689693450928, |
| "learning_rate": 9.667522187487635e-06, |
| "loss": 0.5429, |
| "mean_token_accuracy": 0.8088372349739075, |
| "num_tokens": 2203830.0, |
| "step": 252 |
| }, |
| { |
| "entropy": 1.984777808189392, |
| "epoch": 0.812199036918138, |
| "grad_norm": 5.8004374504089355, |
| "learning_rate": 9.663711204400872e-06, |
| "loss": 0.5701, |
| "mean_token_accuracy": 0.8084724247455597, |
| "num_tokens": 2212939.0, |
| "step": 253 |
| }, |
| { |
| "entropy": 1.973739504814148, |
| "epoch": 0.8154093097913323, |
| "grad_norm": 12.577515602111816, |
| "learning_rate": 9.659879264038499e-06, |
| "loss": 0.4932, |
| "mean_token_accuracy": 0.8239920437335968, |
| "num_tokens": 2221298.0, |
| "step": 254 |
| }, |
| { |
| "entropy": 1.9998502731323242, |
| "epoch": 0.8186195826645265, |
| "grad_norm": 5.38623046875, |
| "learning_rate": 9.656026383620076e-06, |
| "loss": 0.5411, |
| "mean_token_accuracy": 0.7981366217136383, |
| "num_tokens": 2230917.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 1.9804831147193909, |
| "epoch": 0.8218298555377207, |
| "grad_norm": 11.439884185791016, |
| "learning_rate": 9.65215258045925e-06, |
| "loss": 0.5161, |
| "mean_token_accuracy": 0.8303695321083069, |
| "num_tokens": 2238718.0, |
| "step": 256 |
| }, |
| { |
| "entropy": 1.8764265179634094, |
| "epoch": 0.8250401284109149, |
| "grad_norm": 6.7170538902282715, |
| "learning_rate": 9.6482578719637e-06, |
| "loss": 0.5284, |
| "mean_token_accuracy": 0.8177531659603119, |
| "num_tokens": 2247613.0, |
| "step": 257 |
| }, |
| { |
| "entropy": 1.9280529618263245, |
| "epoch": 0.8282504012841091, |
| "grad_norm": 4.502701759338379, |
| "learning_rate": 9.644342275635036e-06, |
| "loss": 0.514, |
| "mean_token_accuracy": 0.8217505216598511, |
| "num_tokens": 2255285.0, |
| "step": 258 |
| }, |
| { |
| "entropy": 1.780007779598236, |
| "epoch": 0.8314606741573034, |
| "grad_norm": 12.708043098449707, |
| "learning_rate": 9.640405809068743e-06, |
| "loss": 0.5769, |
| "mean_token_accuracy": 0.8037888705730438, |
| "num_tokens": 2263131.0, |
| "step": 259 |
| }, |
| { |
| "entropy": 1.878428339958191, |
| "epoch": 0.8346709470304976, |
| "grad_norm": 5.675548553466797, |
| "learning_rate": 9.636448489954077e-06, |
| "loss": 0.5525, |
| "mean_token_accuracy": 0.8077818155288696, |
| "num_tokens": 2271596.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.684307873249054, |
| "epoch": 0.8378812199036918, |
| "grad_norm": 7.081056118011475, |
| "learning_rate": 9.632470336074009e-06, |
| "loss": 0.5188, |
| "mean_token_accuracy": 0.8152169585227966, |
| "num_tokens": 2280005.0, |
| "step": 261 |
| }, |
| { |
| "entropy": 1.9224953651428223, |
| "epoch": 0.841091492776886, |
| "grad_norm": 4.184887886047363, |
| "learning_rate": 9.628471365305134e-06, |
| "loss": 0.6043, |
| "mean_token_accuracy": 0.7943918704986572, |
| "num_tokens": 2289825.0, |
| "step": 262 |
| }, |
| { |
| "entropy": 1.864977478981018, |
| "epoch": 0.8443017656500803, |
| "grad_norm": 36.45917510986328, |
| "learning_rate": 9.624451595617588e-06, |
| "loss": 0.5088, |
| "mean_token_accuracy": 0.8221506774425507, |
| "num_tokens": 2298079.0, |
| "step": 263 |
| }, |
| { |
| "entropy": 1.9027855396270752, |
| "epoch": 0.8475120385232745, |
| "grad_norm": 5.5275187492370605, |
| "learning_rate": 9.620411045074972e-06, |
| "loss": 0.5644, |
| "mean_token_accuracy": 0.8087839186191559, |
| "num_tokens": 2308579.0, |
| "step": 264 |
| }, |
| { |
| "entropy": 1.8372774124145508, |
| "epoch": 0.8507223113964687, |
| "grad_norm": 6.006095886230469, |
| "learning_rate": 9.616349731834271e-06, |
| "loss": 0.5356, |
| "mean_token_accuracy": 0.816933810710907, |
| "num_tokens": 2318463.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 1.573245882987976, |
| "epoch": 0.8539325842696629, |
| "grad_norm": 4.500341415405273, |
| "learning_rate": 9.612267674145772e-06, |
| "loss": 0.5938, |
| "mean_token_accuracy": 0.8043918609619141, |
| "num_tokens": 2327089.0, |
| "step": 266 |
| }, |
| { |
| "entropy": 1.8638948798179626, |
| "epoch": 0.8571428571428571, |
| "grad_norm": 5.24827241897583, |
| "learning_rate": 9.608164890352977e-06, |
| "loss": 0.5005, |
| "mean_token_accuracy": 0.8218279480934143, |
| "num_tokens": 2335135.0, |
| "step": 267 |
| }, |
| { |
| "entropy": 1.8430355787277222, |
| "epoch": 0.8603531300160514, |
| "grad_norm": 4.886566638946533, |
| "learning_rate": 9.604041398892528e-06, |
| "loss": 0.5614, |
| "mean_token_accuracy": 0.8217492401599884, |
| "num_tokens": 2343916.0, |
| "step": 268 |
| }, |
| { |
| "entropy": 1.8909154534339905, |
| "epoch": 0.8635634028892456, |
| "grad_norm": 7.561775207519531, |
| "learning_rate": 9.599897218294122e-06, |
| "loss": 0.5394, |
| "mean_token_accuracy": 0.8251607716083527, |
| "num_tokens": 2351407.0, |
| "step": 269 |
| }, |
| { |
| "entropy": 1.7426018118858337, |
| "epoch": 0.8667736757624398, |
| "grad_norm": 4.487576484680176, |
| "learning_rate": 9.595732367180422e-06, |
| "loss": 0.5097, |
| "mean_token_accuracy": 0.8088293373584747, |
| "num_tokens": 2359413.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.844031572341919, |
| "epoch": 0.869983948635634, |
| "grad_norm": 4.225221157073975, |
| "learning_rate": 9.591546864266983e-06, |
| "loss": 0.5153, |
| "mean_token_accuracy": 0.8313940167427063, |
| "num_tokens": 2367432.0, |
| "step": 271 |
| }, |
| { |
| "entropy": 1.8397764563560486, |
| "epoch": 0.8731942215088283, |
| "grad_norm": 4.163444519042969, |
| "learning_rate": 9.58734072836216e-06, |
| "loss": 0.5367, |
| "mean_token_accuracy": 0.8088929355144501, |
| "num_tokens": 2376009.0, |
| "step": 272 |
| }, |
| { |
| "entropy": 1.9551055431365967, |
| "epoch": 0.8764044943820225, |
| "grad_norm": 188.6858367919922, |
| "learning_rate": 9.583113978367026e-06, |
| "loss": 0.4763, |
| "mean_token_accuracy": 0.8416739702224731, |
| "num_tokens": 2383876.0, |
| "step": 273 |
| }, |
| { |
| "entropy": 1.733047068119049, |
| "epoch": 0.8796147672552167, |
| "grad_norm": 7.61765718460083, |
| "learning_rate": 9.578866633275289e-06, |
| "loss": 0.5334, |
| "mean_token_accuracy": 0.8125023543834686, |
| "num_tokens": 2392551.0, |
| "step": 274 |
| }, |
| { |
| "entropy": 1.7080771327018738, |
| "epoch": 0.8828250401284109, |
| "grad_norm": 4.636716365814209, |
| "learning_rate": 9.574598712173202e-06, |
| "loss": 0.558, |
| "mean_token_accuracy": 0.8121279180049896, |
| "num_tokens": 2401060.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 1.6740916967391968, |
| "epoch": 0.8860353130016051, |
| "grad_norm": 4.136287689208984, |
| "learning_rate": 9.570310234239483e-06, |
| "loss": 0.5276, |
| "mean_token_accuracy": 0.8157520890235901, |
| "num_tokens": 2409417.0, |
| "step": 276 |
| }, |
| { |
| "entropy": 1.7176891565322876, |
| "epoch": 0.8892455858747994, |
| "grad_norm": 7.969487190246582, |
| "learning_rate": 9.56600121874523e-06, |
| "loss": 0.5402, |
| "mean_token_accuracy": 0.8149141073226929, |
| "num_tokens": 2418325.0, |
| "step": 277 |
| }, |
| { |
| "entropy": 1.7098489999771118, |
| "epoch": 0.8924558587479936, |
| "grad_norm": 5.37632942199707, |
| "learning_rate": 9.561671685053818e-06, |
| "loss": 0.5738, |
| "mean_token_accuracy": 0.8035610914230347, |
| "num_tokens": 2427522.0, |
| "step": 278 |
| }, |
| { |
| "entropy": 1.633002758026123, |
| "epoch": 0.8956661316211878, |
| "grad_norm": 4.541212558746338, |
| "learning_rate": 9.557321652620839e-06, |
| "loss": 0.5547, |
| "mean_token_accuracy": 0.8119913339614868, |
| "num_tokens": 2437685.0, |
| "step": 279 |
| }, |
| { |
| "entropy": 1.6630198955535889, |
| "epoch": 0.898876404494382, |
| "grad_norm": 4.517637729644775, |
| "learning_rate": 9.55295114099399e-06, |
| "loss": 0.572, |
| "mean_token_accuracy": 0.8125506043434143, |
| "num_tokens": 2446482.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.938102126121521, |
| "epoch": 0.9020866773675762, |
| "grad_norm": 5.673907279968262, |
| "learning_rate": 9.548560169812997e-06, |
| "loss": 0.5425, |
| "mean_token_accuracy": 0.8131826519966125, |
| "num_tokens": 2455118.0, |
| "step": 281 |
| }, |
| { |
| "entropy": 1.6700218319892883, |
| "epoch": 0.9052969502407705, |
| "grad_norm": 9.876907348632812, |
| "learning_rate": 9.544148758809528e-06, |
| "loss": 0.5354, |
| "mean_token_accuracy": 0.8099010288715363, |
| "num_tokens": 2464518.0, |
| "step": 282 |
| }, |
| { |
| "entropy": 1.8363152146339417, |
| "epoch": 0.9085072231139647, |
| "grad_norm": 13.005200386047363, |
| "learning_rate": 9.539716927807102e-06, |
| "loss": 0.4792, |
| "mean_token_accuracy": 0.824327141046524, |
| "num_tokens": 2473454.0, |
| "step": 283 |
| }, |
| { |
| "entropy": 1.775430977344513, |
| "epoch": 0.9117174959871589, |
| "grad_norm": 5.766530513763428, |
| "learning_rate": 9.535264696720993e-06, |
| "loss": 0.55, |
| "mean_token_accuracy": 0.8180139362812042, |
| "num_tokens": 2481651.0, |
| "step": 284 |
| }, |
| { |
| "entropy": 1.669650673866272, |
| "epoch": 0.9149277688603531, |
| "grad_norm": 6.167349815368652, |
| "learning_rate": 9.530792085558151e-06, |
| "loss": 0.4869, |
| "mean_token_accuracy": 0.8234744071960449, |
| "num_tokens": 2489837.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 1.6679094433784485, |
| "epoch": 0.9181380417335474, |
| "grad_norm": 5.257747650146484, |
| "learning_rate": 9.526299114417108e-06, |
| "loss": 0.5862, |
| "mean_token_accuracy": 0.8026742041110992, |
| "num_tokens": 2498698.0, |
| "step": 286 |
| }, |
| { |
| "entropy": 1.8049131035804749, |
| "epoch": 0.9213483146067416, |
| "grad_norm": 6.02241325378418, |
| "learning_rate": 9.521785803487888e-06, |
| "loss": 0.49, |
| "mean_token_accuracy": 0.8336274325847626, |
| "num_tokens": 2506100.0, |
| "step": 287 |
| }, |
| { |
| "entropy": 1.7622082233428955, |
| "epoch": 0.9245585874799358, |
| "grad_norm": 5.4801764488220215, |
| "learning_rate": 9.517252173051912e-06, |
| "loss": 0.5652, |
| "mean_token_accuracy": 0.8085108995437622, |
| "num_tokens": 2513985.0, |
| "step": 288 |
| }, |
| { |
| "entropy": 1.7733423709869385, |
| "epoch": 0.92776886035313, |
| "grad_norm": 4.067267894744873, |
| "learning_rate": 9.512698243481914e-06, |
| "loss": 0.5772, |
| "mean_token_accuracy": 0.8086209297180176, |
| "num_tokens": 2522092.0, |
| "step": 289 |
| }, |
| { |
| "entropy": 1.9992202520370483, |
| "epoch": 0.9309791332263242, |
| "grad_norm": 3.6951725482940674, |
| "learning_rate": 9.508124035241843e-06, |
| "loss": 0.5263, |
| "mean_token_accuracy": 0.8235167562961578, |
| "num_tokens": 2531345.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.7889231443405151, |
| "epoch": 0.9341894060995185, |
| "grad_norm": 3.931403636932373, |
| "learning_rate": 9.50352956888678e-06, |
| "loss": 0.5272, |
| "mean_token_accuracy": 0.8177125155925751, |
| "num_tokens": 2539270.0, |
| "step": 291 |
| }, |
| { |
| "entropy": 1.6634029746055603, |
| "epoch": 0.9373996789727127, |
| "grad_norm": 8.0676851272583, |
| "learning_rate": 9.498914865062831e-06, |
| "loss": 0.5565, |
| "mean_token_accuracy": 0.8150179386138916, |
| "num_tokens": 2547863.0, |
| "step": 292 |
| }, |
| { |
| "entropy": 1.5966331362724304, |
| "epoch": 0.9406099518459069, |
| "grad_norm": 3.3051044940948486, |
| "learning_rate": 9.49427994450705e-06, |
| "loss": 0.5098, |
| "mean_token_accuracy": 0.819977194070816, |
| "num_tokens": 2556039.0, |
| "step": 293 |
| }, |
| { |
| "entropy": 1.873258113861084, |
| "epoch": 0.9438202247191011, |
| "grad_norm": 7.652100563049316, |
| "learning_rate": 9.489624828047336e-06, |
| "loss": 0.5398, |
| "mean_token_accuracy": 0.8114376664161682, |
| "num_tokens": 2564744.0, |
| "step": 294 |
| }, |
| { |
| "entropy": 1.9302705526351929, |
| "epoch": 0.9470304975922953, |
| "grad_norm": 4.018932342529297, |
| "learning_rate": 9.484949536602343e-06, |
| "loss": 0.5363, |
| "mean_token_accuracy": 0.8164487481117249, |
| "num_tokens": 2573899.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 1.5373682379722595, |
| "epoch": 0.9502407704654896, |
| "grad_norm": 4.182193756103516, |
| "learning_rate": 9.480254091181385e-06, |
| "loss": 0.585, |
| "mean_token_accuracy": 0.7996087670326233, |
| "num_tokens": 2582879.0, |
| "step": 296 |
| }, |
| { |
| "entropy": 1.8067876696586609, |
| "epoch": 0.9534510433386838, |
| "grad_norm": 4.5391950607299805, |
| "learning_rate": 9.47553851288434e-06, |
| "loss": 0.4947, |
| "mean_token_accuracy": 0.8318784534931183, |
| "num_tokens": 2591802.0, |
| "step": 297 |
| }, |
| { |
| "entropy": 1.8316718339920044, |
| "epoch": 0.956661316211878, |
| "grad_norm": 5.184760570526123, |
| "learning_rate": 9.470802822901558e-06, |
| "loss": 0.5586, |
| "mean_token_accuracy": 0.8131641149520874, |
| "num_tokens": 2600207.0, |
| "step": 298 |
| }, |
| { |
| "entropy": 1.859476923942566, |
| "epoch": 0.9598715890850722, |
| "grad_norm": 5.1902852058410645, |
| "learning_rate": 9.466047042513767e-06, |
| "loss": 0.5501, |
| "mean_token_accuracy": 0.8048664629459381, |
| "num_tokens": 2608420.0, |
| "step": 299 |
| }, |
| { |
| "entropy": 1.8378886580467224, |
| "epoch": 0.9630818619582665, |
| "grad_norm": 4.642928123474121, |
| "learning_rate": 9.461271193091971e-06, |
| "loss": 0.6043, |
| "mean_token_accuracy": 0.7992973029613495, |
| "num_tokens": 2617800.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.8681280612945557, |
| "epoch": 0.9662921348314607, |
| "grad_norm": 4.008616924285889, |
| "learning_rate": 9.45647529609736e-06, |
| "loss": 0.5605, |
| "mean_token_accuracy": 0.8001963198184967, |
| "num_tokens": 2627976.0, |
| "step": 301 |
| }, |
| { |
| "entropy": 1.6626688241958618, |
| "epoch": 0.9695024077046549, |
| "grad_norm": 13.212479591369629, |
| "learning_rate": 9.451659373081214e-06, |
| "loss": 0.5672, |
| "mean_token_accuracy": 0.8115538358688354, |
| "num_tokens": 2636906.0, |
| "step": 302 |
| }, |
| { |
| "entropy": 1.8471931219100952, |
| "epoch": 0.9727126805778491, |
| "grad_norm": 4.148143291473389, |
| "learning_rate": 9.4468234456848e-06, |
| "loss": 0.5647, |
| "mean_token_accuracy": 0.8091489970684052, |
| "num_tokens": 2645774.0, |
| "step": 303 |
| }, |
| { |
| "entropy": 1.7227018475532532, |
| "epoch": 0.9759229534510433, |
| "grad_norm": 3.8793492317199707, |
| "learning_rate": 9.44196753563928e-06, |
| "loss": 0.5244, |
| "mean_token_accuracy": 0.8176108598709106, |
| "num_tokens": 2654629.0, |
| "step": 304 |
| }, |
| { |
| "entropy": 1.9042375683784485, |
| "epoch": 0.9791332263242376, |
| "grad_norm": 8.690786361694336, |
| "learning_rate": 9.437091664765611e-06, |
| "loss": 0.548, |
| "mean_token_accuracy": 0.8241380155086517, |
| "num_tokens": 2663716.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 1.7460474371910095, |
| "epoch": 0.9823434991974318, |
| "grad_norm": 9.691555976867676, |
| "learning_rate": 9.43219585497445e-06, |
| "loss": 0.5014, |
| "mean_token_accuracy": 0.8227463662624359, |
| "num_tokens": 2672778.0, |
| "step": 306 |
| }, |
| { |
| "entropy": 1.878986418247223, |
| "epoch": 0.985553772070626, |
| "grad_norm": 4.636747360229492, |
| "learning_rate": 9.427280128266049e-06, |
| "loss": 0.5629, |
| "mean_token_accuracy": 0.8111841678619385, |
| "num_tokens": 2681537.0, |
| "step": 307 |
| }, |
| { |
| "entropy": 1.9950389862060547, |
| "epoch": 0.9887640449438202, |
| "grad_norm": 12.368703842163086, |
| "learning_rate": 9.422344506730168e-06, |
| "loss": 0.5101, |
| "mean_token_accuracy": 0.822578638792038, |
| "num_tokens": 2689579.0, |
| "step": 308 |
| }, |
| { |
| "entropy": 1.7799192070960999, |
| "epoch": 0.9919743178170144, |
| "grad_norm": 7.675198554992676, |
| "learning_rate": 9.41738901254596e-06, |
| "loss": 0.5046, |
| "mean_token_accuracy": 0.8357812464237213, |
| "num_tokens": 2697211.0, |
| "step": 309 |
| }, |
| { |
| "entropy": 1.8566884994506836, |
| "epoch": 0.9951845906902087, |
| "grad_norm": 31.68392562866211, |
| "learning_rate": 9.412413667981884e-06, |
| "loss": 0.5595, |
| "mean_token_accuracy": 0.8127318024635315, |
| "num_tokens": 2707794.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.7565560340881348, |
| "epoch": 0.9983948635634029, |
| "grad_norm": 5.741061210632324, |
| "learning_rate": 9.4074184953956e-06, |
| "loss": 0.6057, |
| "mean_token_accuracy": 0.8059540390968323, |
| "num_tokens": 2716378.0, |
| "step": 311 |
| }, |
| { |
| "entropy": 1.9112659692764282, |
| "epoch": 1.0, |
| "grad_norm": 8.97977066040039, |
| "learning_rate": 9.402403517233867e-06, |
| "loss": 0.5477, |
| "mean_token_accuracy": 0.8098132610321045, |
| "num_tokens": 2721142.0, |
| "step": 312 |
| }, |
| { |
| "entropy": 1.8866798877716064, |
| "epoch": 1.0032102728731942, |
| "grad_norm": 3.0067737102508545, |
| "learning_rate": 9.397368756032445e-06, |
| "loss": 0.2756, |
| "mean_token_accuracy": 0.8953090310096741, |
| "num_tokens": 2729237.0, |
| "step": 313 |
| }, |
| { |
| "entropy": 1.5604918003082275, |
| "epoch": 1.0064205457463884, |
| "grad_norm": 2.753265380859375, |
| "learning_rate": 9.392314234415999e-06, |
| "loss": 0.3299, |
| "mean_token_accuracy": 0.8884185254573822, |
| "num_tokens": 2738049.0, |
| "step": 314 |
| }, |
| { |
| "entropy": 1.6893478035926819, |
| "epoch": 1.0096308186195826, |
| "grad_norm": 5.572351932525635, |
| "learning_rate": 9.38723997509798e-06, |
| "loss": 0.3426, |
| "mean_token_accuracy": 0.8786461353302002, |
| "num_tokens": 2747474.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 1.7195146679878235, |
| "epoch": 1.0128410914927768, |
| "grad_norm": 6.990971088409424, |
| "learning_rate": 9.38214600088054e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.8795572221279144, |
| "num_tokens": 2755445.0, |
| "step": 316 |
| }, |
| { |
| "entropy": 1.5811264514923096, |
| "epoch": 1.0160513643659712, |
| "grad_norm": 3.330709218978882, |
| "learning_rate": 9.37703233465443e-06, |
| "loss": 0.3017, |
| "mean_token_accuracy": 0.8671407401561737, |
| "num_tokens": 2764469.0, |
| "step": 317 |
| }, |
| { |
| "entropy": 1.5029963254928589, |
| "epoch": 1.0192616372391654, |
| "grad_norm": 9.694075584411621, |
| "learning_rate": 9.371898999398876e-06, |
| "loss": 0.3368, |
| "mean_token_accuracy": 0.884416937828064, |
| "num_tokens": 2772747.0, |
| "step": 318 |
| }, |
| { |
| "entropy": 1.77669358253479, |
| "epoch": 1.0224719101123596, |
| "grad_norm": 5.008193492889404, |
| "learning_rate": 9.366746018181503e-06, |
| "loss": 0.3311, |
| "mean_token_accuracy": 0.8841279745101929, |
| "num_tokens": 2782046.0, |
| "step": 319 |
| }, |
| { |
| "entropy": 1.5491546988487244, |
| "epoch": 1.0256821829855538, |
| "grad_norm": 3.3377037048339844, |
| "learning_rate": 9.361573414158215e-06, |
| "loss": 0.2557, |
| "mean_token_accuracy": 0.9022665619850159, |
| "num_tokens": 2790262.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.423735797405243, |
| "epoch": 1.028892455858748, |
| "grad_norm": 4.702206611633301, |
| "learning_rate": 9.356381210573092e-06, |
| "loss": 0.3956, |
| "mean_token_accuracy": 0.8604268729686737, |
| "num_tokens": 2799962.0, |
| "step": 321 |
| }, |
| { |
| "entropy": 1.642482876777649, |
| "epoch": 1.0321027287319422, |
| "grad_norm": 4.782188892364502, |
| "learning_rate": 9.351169430758293e-06, |
| "loss": 0.226, |
| "mean_token_accuracy": 0.9207814335823059, |
| "num_tokens": 2808389.0, |
| "step": 322 |
| }, |
| { |
| "entropy": 1.5962989330291748, |
| "epoch": 1.0353130016051364, |
| "grad_norm": 4.449636936187744, |
| "learning_rate": 9.345938098133946e-06, |
| "loss": 0.316, |
| "mean_token_accuracy": 0.8767516911029816, |
| "num_tokens": 2817170.0, |
| "step": 323 |
| }, |
| { |
| "entropy": 1.6813729405403137, |
| "epoch": 1.0385232744783306, |
| "grad_norm": 3.268564462661743, |
| "learning_rate": 9.340687236208037e-06, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.8798855543136597, |
| "num_tokens": 2826388.0, |
| "step": 324 |
| }, |
| { |
| "entropy": 1.4963070154190063, |
| "epoch": 1.0417335473515248, |
| "grad_norm": 3.9683163166046143, |
| "learning_rate": 9.33541686857632e-06, |
| "loss": 0.3418, |
| "mean_token_accuracy": 0.8698484897613525, |
| "num_tokens": 2836296.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 1.4952961206436157, |
| "epoch": 1.0449438202247192, |
| "grad_norm": 4.533268928527832, |
| "learning_rate": 9.330127018922195e-06, |
| "loss": 0.3369, |
| "mean_token_accuracy": 0.8843473196029663, |
| "num_tokens": 2845829.0, |
| "step": 326 |
| }, |
| { |
| "entropy": 1.4619093537330627, |
| "epoch": 1.0481540930979134, |
| "grad_norm": 6.075706481933594, |
| "learning_rate": 9.324817711016609e-06, |
| "loss": 0.3602, |
| "mean_token_accuracy": 0.872114509344101, |
| "num_tokens": 2855278.0, |
| "step": 327 |
| }, |
| { |
| "entropy": 1.3780204057693481, |
| "epoch": 1.0513643659711076, |
| "grad_norm": 3.4777228832244873, |
| "learning_rate": 9.31948896871795e-06, |
| "loss": 0.3687, |
| "mean_token_accuracy": 0.874431699514389, |
| "num_tokens": 2863462.0, |
| "step": 328 |
| }, |
| { |
| "entropy": 1.508378028869629, |
| "epoch": 1.0545746388443018, |
| "grad_norm": 3.8630146980285645, |
| "learning_rate": 9.31414081597194e-06, |
| "loss": 0.2862, |
| "mean_token_accuracy": 0.8832896053791046, |
| "num_tokens": 2873221.0, |
| "step": 329 |
| }, |
| { |
| "entropy": 1.4881436228752136, |
| "epoch": 1.057784911717496, |
| "grad_norm": 3.041048526763916, |
| "learning_rate": 9.30877327681152e-06, |
| "loss": 0.3019, |
| "mean_token_accuracy": 0.880987137556076, |
| "num_tokens": 2882010.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.5098777413368225, |
| "epoch": 1.0609951845906902, |
| "grad_norm": 10.562122344970703, |
| "learning_rate": 9.303386375356752e-06, |
| "loss": 0.2991, |
| "mean_token_accuracy": 0.8782722651958466, |
| "num_tokens": 2891471.0, |
| "step": 331 |
| }, |
| { |
| "entropy": 1.3975720405578613, |
| "epoch": 1.0642054574638844, |
| "grad_norm": 5.349188804626465, |
| "learning_rate": 9.297980135814706e-06, |
| "loss": 0.3329, |
| "mean_token_accuracy": 0.8701120913028717, |
| "num_tokens": 2900424.0, |
| "step": 332 |
| }, |
| { |
| "entropy": 1.4648704528808594, |
| "epoch": 1.0674157303370786, |
| "grad_norm": 4.09747838973999, |
| "learning_rate": 9.292554582479349e-06, |
| "loss": 0.3001, |
| "mean_token_accuracy": 0.8921301364898682, |
| "num_tokens": 2908440.0, |
| "step": 333 |
| }, |
| { |
| "entropy": 1.433031976222992, |
| "epoch": 1.0706260032102728, |
| "grad_norm": 5.114440441131592, |
| "learning_rate": 9.28710973973144e-06, |
| "loss": 0.3182, |
| "mean_token_accuracy": 0.886705756187439, |
| "num_tokens": 2917238.0, |
| "step": 334 |
| }, |
| { |
| "entropy": 1.5841457843780518, |
| "epoch": 1.0738362760834672, |
| "grad_norm": 15.462648391723633, |
| "learning_rate": 9.281645632038417e-06, |
| "loss": 0.2744, |
| "mean_token_accuracy": 0.8975834846496582, |
| "num_tokens": 2925815.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 1.5319228768348694, |
| "epoch": 1.0770465489566614, |
| "grad_norm": 4.613631725311279, |
| "learning_rate": 9.276162283954293e-06, |
| "loss": 0.3216, |
| "mean_token_accuracy": 0.8822270631790161, |
| "num_tokens": 2933894.0, |
| "step": 336 |
| }, |
| { |
| "entropy": 1.6462609767913818, |
| "epoch": 1.0802568218298556, |
| "grad_norm": 3.268683910369873, |
| "learning_rate": 9.270659720119533e-06, |
| "loss": 0.3096, |
| "mean_token_accuracy": 0.892312079668045, |
| "num_tokens": 2941880.0, |
| "step": 337 |
| }, |
| { |
| "entropy": 1.439853310585022, |
| "epoch": 1.0834670947030498, |
| "grad_norm": 4.163455009460449, |
| "learning_rate": 9.265137965260962e-06, |
| "loss": 0.3484, |
| "mean_token_accuracy": 0.8765892088413239, |
| "num_tokens": 2950411.0, |
| "step": 338 |
| }, |
| { |
| "entropy": 1.4200059175491333, |
| "epoch": 1.086677367576244, |
| "grad_norm": 3.743356466293335, |
| "learning_rate": 9.259597044191635e-06, |
| "loss": 0.3198, |
| "mean_token_accuracy": 0.8831844925880432, |
| "num_tokens": 2959444.0, |
| "step": 339 |
| }, |
| { |
| "entropy": 1.6156633496284485, |
| "epoch": 1.0898876404494382, |
| "grad_norm": 4.8488945960998535, |
| "learning_rate": 9.254036981810741e-06, |
| "loss": 0.2395, |
| "mean_token_accuracy": 0.8998830020427704, |
| "num_tokens": 2967850.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.6063914895057678, |
| "epoch": 1.0930979133226324, |
| "grad_norm": 3.8594541549682617, |
| "learning_rate": 9.248457803103476e-06, |
| "loss": 0.2982, |
| "mean_token_accuracy": 0.8650166988372803, |
| "num_tokens": 2977216.0, |
| "step": 341 |
| }, |
| { |
| "entropy": 1.4595788717269897, |
| "epoch": 1.0963081861958266, |
| "grad_norm": 3.777676582336426, |
| "learning_rate": 9.242859533140947e-06, |
| "loss": 0.3471, |
| "mean_token_accuracy": 0.875117301940918, |
| "num_tokens": 2985963.0, |
| "step": 342 |
| }, |
| { |
| "entropy": 1.3544176816940308, |
| "epoch": 1.0995184590690208, |
| "grad_norm": 7.397167205810547, |
| "learning_rate": 9.237242197080045e-06, |
| "loss": 0.3612, |
| "mean_token_accuracy": 0.8471320867538452, |
| "num_tokens": 2995982.0, |
| "step": 343 |
| }, |
| { |
| "entropy": 1.4412473440170288, |
| "epoch": 1.102728731942215, |
| "grad_norm": 6.994678020477295, |
| "learning_rate": 9.231605820163343e-06, |
| "loss": 0.2973, |
| "mean_token_accuracy": 0.8883349299430847, |
| "num_tokens": 3004311.0, |
| "step": 344 |
| }, |
| { |
| "entropy": 1.2334765791893005, |
| "epoch": 1.1059390048154094, |
| "grad_norm": 11.067462921142578, |
| "learning_rate": 9.225950427718974e-06, |
| "loss": 0.3466, |
| "mean_token_accuracy": 0.8650515079498291, |
| "num_tokens": 3013691.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 1.3161783814430237, |
| "epoch": 1.1091492776886036, |
| "grad_norm": 4.6484832763671875, |
| "learning_rate": 9.220276045160524e-06, |
| "loss": 0.4264, |
| "mean_token_accuracy": 0.8466585278511047, |
| "num_tokens": 3024373.0, |
| "step": 346 |
| }, |
| { |
| "entropy": 1.307463526725769, |
| "epoch": 1.1123595505617978, |
| "grad_norm": 6.318870544433594, |
| "learning_rate": 9.21458269798691e-06, |
| "loss": 0.298, |
| "mean_token_accuracy": 0.8816176652908325, |
| "num_tokens": 3033068.0, |
| "step": 347 |
| }, |
| { |
| "entropy": 1.4655287265777588, |
| "epoch": 1.115569823434992, |
| "grad_norm": 3.1357920169830322, |
| "learning_rate": 9.208870411782276e-06, |
| "loss": 0.2534, |
| "mean_token_accuracy": 0.8845996260643005, |
| "num_tokens": 3043131.0, |
| "step": 348 |
| }, |
| { |
| "entropy": 1.343300223350525, |
| "epoch": 1.1187800963081862, |
| "grad_norm": 6.207579612731934, |
| "learning_rate": 9.203139212215868e-06, |
| "loss": 0.3468, |
| "mean_token_accuracy": 0.8744291663169861, |
| "num_tokens": 3051014.0, |
| "step": 349 |
| }, |
| { |
| "entropy": 1.7229687571525574, |
| "epoch": 1.1219903691813804, |
| "grad_norm": 5.9254536628723145, |
| "learning_rate": 9.197389125041925e-06, |
| "loss": 0.3166, |
| "mean_token_accuracy": 0.8954213857650757, |
| "num_tokens": 3059323.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.3070868849754333, |
| "epoch": 1.1252006420545746, |
| "grad_norm": 4.462583065032959, |
| "learning_rate": 9.191620176099559e-06, |
| "loss": 0.3637, |
| "mean_token_accuracy": 0.8643763959407806, |
| "num_tokens": 3068280.0, |
| "step": 351 |
| }, |
| { |
| "entropy": 1.20599365234375, |
| "epoch": 1.1284109149277688, |
| "grad_norm": 3.932370185852051, |
| "learning_rate": 9.185832391312644e-06, |
| "loss": 0.3351, |
| "mean_token_accuracy": 0.8627801537513733, |
| "num_tokens": 3079052.0, |
| "step": 352 |
| }, |
| { |
| "entropy": 1.3846279382705688, |
| "epoch": 1.131621187800963, |
| "grad_norm": 2.907315969467163, |
| "learning_rate": 9.180025796689692e-06, |
| "loss": 0.3181, |
| "mean_token_accuracy": 0.8801276385784149, |
| "num_tokens": 3088219.0, |
| "step": 353 |
| }, |
| { |
| "entropy": 1.3442675471305847, |
| "epoch": 1.1348314606741572, |
| "grad_norm": 2.9003660678863525, |
| "learning_rate": 9.174200418323746e-06, |
| "loss": 0.2736, |
| "mean_token_accuracy": 0.887022852897644, |
| "num_tokens": 3096082.0, |
| "step": 354 |
| }, |
| { |
| "entropy": 1.311316728591919, |
| "epoch": 1.1380417335473516, |
| "grad_norm": 4.425498962402344, |
| "learning_rate": 9.168356282392253e-06, |
| "loss": 0.3228, |
| "mean_token_accuracy": 0.8825305104255676, |
| "num_tokens": 3104528.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 1.2196236848831177, |
| "epoch": 1.1412520064205458, |
| "grad_norm": 16.457611083984375, |
| "learning_rate": 9.16249341515695e-06, |
| "loss": 0.311, |
| "mean_token_accuracy": 0.885350912809372, |
| "num_tokens": 3112791.0, |
| "step": 356 |
| }, |
| { |
| "entropy": 1.3958068490028381, |
| "epoch": 1.14446227929374, |
| "grad_norm": 5.1613545417785645, |
| "learning_rate": 9.156611842963753e-06, |
| "loss": 0.3306, |
| "mean_token_accuracy": 0.8757579624652863, |
| "num_tokens": 3121180.0, |
| "step": 357 |
| }, |
| { |
| "entropy": 1.383374571800232, |
| "epoch": 1.1476725521669342, |
| "grad_norm": 3.217242956161499, |
| "learning_rate": 9.150711592242627e-06, |
| "loss": 0.2985, |
| "mean_token_accuracy": 0.8951647877693176, |
| "num_tokens": 3129243.0, |
| "step": 358 |
| }, |
| { |
| "entropy": 1.5093455910682678, |
| "epoch": 1.1508828250401284, |
| "grad_norm": 7.666992664337158, |
| "learning_rate": 9.144792689507471e-06, |
| "loss": 0.2927, |
| "mean_token_accuracy": 0.901302307844162, |
| "num_tokens": 3136965.0, |
| "step": 359 |
| }, |
| { |
| "entropy": 1.4201687574386597, |
| "epoch": 1.1540930979133226, |
| "grad_norm": 2.976116895675659, |
| "learning_rate": 9.138855161356006e-06, |
| "loss": 0.2453, |
| "mean_token_accuracy": 0.8940177857875824, |
| "num_tokens": 3146272.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.2107245326042175, |
| "epoch": 1.1573033707865168, |
| "grad_norm": 7.402556896209717, |
| "learning_rate": 9.132899034469648e-06, |
| "loss": 0.3276, |
| "mean_token_accuracy": 0.8673433661460876, |
| "num_tokens": 3156121.0, |
| "step": 361 |
| }, |
| { |
| "entropy": 1.3413746356964111, |
| "epoch": 1.160513643659711, |
| "grad_norm": 4.934394359588623, |
| "learning_rate": 9.126924335613385e-06, |
| "loss": 0.3168, |
| "mean_token_accuracy": 0.8895758986473083, |
| "num_tokens": 3164320.0, |
| "step": 362 |
| }, |
| { |
| "entropy": 1.3330776691436768, |
| "epoch": 1.1637239165329052, |
| "grad_norm": 3.920137643814087, |
| "learning_rate": 9.120931091635669e-06, |
| "loss": 0.2967, |
| "mean_token_accuracy": 0.899406909942627, |
| "num_tokens": 3172443.0, |
| "step": 363 |
| }, |
| { |
| "entropy": 1.3503963947296143, |
| "epoch": 1.1669341894060996, |
| "grad_norm": 7.547220706939697, |
| "learning_rate": 9.114919329468283e-06, |
| "loss": 0.2409, |
| "mean_token_accuracy": 0.911847323179245, |
| "num_tokens": 3180659.0, |
| "step": 364 |
| }, |
| { |
| "entropy": 1.4835132360458374, |
| "epoch": 1.1701444622792938, |
| "grad_norm": 4.738755226135254, |
| "learning_rate": 9.108889076126226e-06, |
| "loss": 0.2998, |
| "mean_token_accuracy": 0.879132866859436, |
| "num_tokens": 3188574.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 1.345651626586914, |
| "epoch": 1.173354735152488, |
| "grad_norm": 7.365373611450195, |
| "learning_rate": 9.102840358707594e-06, |
| "loss": 0.2489, |
| "mean_token_accuracy": 0.9016467928886414, |
| "num_tokens": 3195985.0, |
| "step": 366 |
| }, |
| { |
| "entropy": 1.262892246246338, |
| "epoch": 1.1765650080256822, |
| "grad_norm": 3.022606134414673, |
| "learning_rate": 9.09677320439345e-06, |
| "loss": 0.3176, |
| "mean_token_accuracy": 0.8682043254375458, |
| "num_tokens": 3206735.0, |
| "step": 367 |
| }, |
| { |
| "entropy": 1.4119667410850525, |
| "epoch": 1.1797752808988764, |
| "grad_norm": 4.2746453285217285, |
| "learning_rate": 9.090687640447709e-06, |
| "loss": 0.2945, |
| "mean_token_accuracy": 0.8927364349365234, |
| "num_tokens": 3215251.0, |
| "step": 368 |
| }, |
| { |
| "entropy": 1.6216139197349548, |
| "epoch": 1.1829855537720706, |
| "grad_norm": 22.629234313964844, |
| "learning_rate": 9.084583694217012e-06, |
| "loss": 0.2952, |
| "mean_token_accuracy": 0.8840005397796631, |
| "num_tokens": 3223744.0, |
| "step": 369 |
| }, |
| { |
| "entropy": 1.5899872779846191, |
| "epoch": 1.1861958266452648, |
| "grad_norm": 3.4773221015930176, |
| "learning_rate": 9.07846139313061e-06, |
| "loss": 0.2761, |
| "mean_token_accuracy": 0.8875099122524261, |
| "num_tokens": 3232776.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.3123191595077515, |
| "epoch": 1.189406099518459, |
| "grad_norm": 4.043982982635498, |
| "learning_rate": 9.072320764700223e-06, |
| "loss": 0.3154, |
| "mean_token_accuracy": 0.8860943615436554, |
| "num_tokens": 3240962.0, |
| "step": 371 |
| }, |
| { |
| "entropy": 1.23344486951828, |
| "epoch": 1.1926163723916532, |
| "grad_norm": 3.159976005554199, |
| "learning_rate": 9.066161836519942e-06, |
| "loss": 0.3036, |
| "mean_token_accuracy": 0.8750773966312408, |
| "num_tokens": 3250253.0, |
| "step": 372 |
| }, |
| { |
| "entropy": 1.3419618606567383, |
| "epoch": 1.1958266452648476, |
| "grad_norm": 3.6338860988616943, |
| "learning_rate": 9.059984636266082e-06, |
| "loss": 0.2862, |
| "mean_token_accuracy": 0.8736852407455444, |
| "num_tokens": 3259204.0, |
| "step": 373 |
| }, |
| { |
| "entropy": 1.4191534519195557, |
| "epoch": 1.1990369181380418, |
| "grad_norm": 28.642192840576172, |
| "learning_rate": 9.053789191697072e-06, |
| "loss": 0.3138, |
| "mean_token_accuracy": 0.8769473731517792, |
| "num_tokens": 3267199.0, |
| "step": 374 |
| }, |
| { |
| "entropy": 1.361560583114624, |
| "epoch": 1.202247191011236, |
| "grad_norm": 3.961038589477539, |
| "learning_rate": 9.047575530653324e-06, |
| "loss": 0.2677, |
| "mean_token_accuracy": 0.9008506536483765, |
| "num_tokens": 3275974.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 1.390976905822754, |
| "epoch": 1.2054574638844302, |
| "grad_norm": 3.8710885047912598, |
| "learning_rate": 9.041343681057106e-06, |
| "loss": 0.3181, |
| "mean_token_accuracy": 0.8792448043823242, |
| "num_tokens": 3284764.0, |
| "step": 376 |
| }, |
| { |
| "entropy": 1.4287749528884888, |
| "epoch": 1.2086677367576244, |
| "grad_norm": 3.154195785522461, |
| "learning_rate": 9.035093670912424e-06, |
| "loss": 0.2961, |
| "mean_token_accuracy": 0.8887947499752045, |
| "num_tokens": 3292669.0, |
| "step": 377 |
| }, |
| { |
| "entropy": 1.4230648279190063, |
| "epoch": 1.2118780096308186, |
| "grad_norm": 28.748876571655273, |
| "learning_rate": 9.028825528304892e-06, |
| "loss": 0.2871, |
| "mean_token_accuracy": 0.8998285830020905, |
| "num_tokens": 3300237.0, |
| "step": 378 |
| }, |
| { |
| "entropy": 1.361795425415039, |
| "epoch": 1.2150882825040128, |
| "grad_norm": 4.176957607269287, |
| "learning_rate": 9.022539281401601e-06, |
| "loss": 0.3278, |
| "mean_token_accuracy": 0.8757419288158417, |
| "num_tokens": 3309485.0, |
| "step": 379 |
| }, |
| { |
| "entropy": 1.3511288166046143, |
| "epoch": 1.218298555377207, |
| "grad_norm": 3.7439827919006348, |
| "learning_rate": 9.016234958451002e-06, |
| "loss": 0.3049, |
| "mean_token_accuracy": 0.8841515481472015, |
| "num_tokens": 3316976.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.3589674234390259, |
| "epoch": 1.2215088282504012, |
| "grad_norm": 8.258329391479492, |
| "learning_rate": 9.009912587782772e-06, |
| "loss": 0.2759, |
| "mean_token_accuracy": 0.8956619203090668, |
| "num_tokens": 3325227.0, |
| "step": 381 |
| }, |
| { |
| "entropy": 1.3772531151771545, |
| "epoch": 1.2247191011235956, |
| "grad_norm": 4.925998687744141, |
| "learning_rate": 9.00357219780769e-06, |
| "loss": 0.3135, |
| "mean_token_accuracy": 0.8864164352416992, |
| "num_tokens": 3333617.0, |
| "step": 382 |
| }, |
| { |
| "entropy": 1.4004551768302917, |
| "epoch": 1.2279293739967898, |
| "grad_norm": 8.397912979125977, |
| "learning_rate": 8.997213817017508e-06, |
| "loss": 0.3237, |
| "mean_token_accuracy": 0.8806163370609283, |
| "num_tokens": 3342339.0, |
| "step": 383 |
| }, |
| { |
| "entropy": 1.2548794150352478, |
| "epoch": 1.231139646869984, |
| "grad_norm": 3.6379387378692627, |
| "learning_rate": 8.990837473984818e-06, |
| "loss": 0.3339, |
| "mean_token_accuracy": 0.8684331476688385, |
| "num_tokens": 3351825.0, |
| "step": 384 |
| }, |
| { |
| "entropy": 1.3366854786872864, |
| "epoch": 1.2343499197431782, |
| "grad_norm": 3.443938732147217, |
| "learning_rate": 8.984443197362938e-06, |
| "loss": 0.3126, |
| "mean_token_accuracy": 0.882027268409729, |
| "num_tokens": 3360884.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 1.536583423614502, |
| "epoch": 1.2375601926163724, |
| "grad_norm": 3.1750330924987793, |
| "learning_rate": 8.978031015885767e-06, |
| "loss": 0.2539, |
| "mean_token_accuracy": 0.8960652649402618, |
| "num_tokens": 3369121.0, |
| "step": 386 |
| }, |
| { |
| "entropy": 1.4677174091339111, |
| "epoch": 1.2407704654895666, |
| "grad_norm": 3.4102578163146973, |
| "learning_rate": 8.971600958367668e-06, |
| "loss": 0.305, |
| "mean_token_accuracy": 0.8863416612148285, |
| "num_tokens": 3378236.0, |
| "step": 387 |
| }, |
| { |
| "entropy": 1.4675362706184387, |
| "epoch": 1.2439807383627608, |
| "grad_norm": 3.6679139137268066, |
| "learning_rate": 8.965153053703325e-06, |
| "loss": 0.2975, |
| "mean_token_accuracy": 0.883073091506958, |
| "num_tokens": 3387737.0, |
| "step": 388 |
| }, |
| { |
| "entropy": 1.4074093103408813, |
| "epoch": 1.247191011235955, |
| "grad_norm": 3.084782123565674, |
| "learning_rate": 8.958687330867634e-06, |
| "loss": 0.3106, |
| "mean_token_accuracy": 0.8841233551502228, |
| "num_tokens": 3396914.0, |
| "step": 389 |
| }, |
| { |
| "entropy": 1.3245580792427063, |
| "epoch": 1.2504012841091492, |
| "grad_norm": 2.8408830165863037, |
| "learning_rate": 8.952203818915548e-06, |
| "loss": 0.3442, |
| "mean_token_accuracy": 0.8646701872348785, |
| "num_tokens": 3409549.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.347778558731079, |
| "epoch": 1.2536115569823436, |
| "grad_norm": 5.054625988006592, |
| "learning_rate": 8.94570254698197e-06, |
| "loss": 0.3257, |
| "mean_token_accuracy": 0.872046560049057, |
| "num_tokens": 3419435.0, |
| "step": 391 |
| }, |
| { |
| "entropy": 1.2921775579452515, |
| "epoch": 1.2568218298555376, |
| "grad_norm": 3.920675277709961, |
| "learning_rate": 8.939183544281597e-06, |
| "loss": 0.2803, |
| "mean_token_accuracy": 0.8850438892841339, |
| "num_tokens": 3428152.0, |
| "step": 392 |
| }, |
| { |
| "entropy": 1.4277359247207642, |
| "epoch": 1.260032102728732, |
| "grad_norm": 9.427045822143555, |
| "learning_rate": 8.932646840108818e-06, |
| "loss": 0.3121, |
| "mean_token_accuracy": 0.8868101239204407, |
| "num_tokens": 3436649.0, |
| "step": 393 |
| }, |
| { |
| "entropy": 1.3226105570793152, |
| "epoch": 1.2632423756019262, |
| "grad_norm": 3.1199164390563965, |
| "learning_rate": 8.926092463837557e-06, |
| "loss": 0.3151, |
| "mean_token_accuracy": 0.8866923451423645, |
| "num_tokens": 3446140.0, |
| "step": 394 |
| }, |
| { |
| "entropy": 1.4028651714324951, |
| "epoch": 1.2664526484751204, |
| "grad_norm": 3.997776746749878, |
| "learning_rate": 8.919520444921153e-06, |
| "loss": 0.2777, |
| "mean_token_accuracy": 0.8968737721443176, |
| "num_tokens": 3454108.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 1.2859973907470703, |
| "epoch": 1.2696629213483146, |
| "grad_norm": 10.685837745666504, |
| "learning_rate": 8.912930812892228e-06, |
| "loss": 0.3432, |
| "mean_token_accuracy": 0.8622964024543762, |
| "num_tokens": 3462566.0, |
| "step": 396 |
| }, |
| { |
| "entropy": 1.4156718850135803, |
| "epoch": 1.2728731942215088, |
| "grad_norm": 8.489455223083496, |
| "learning_rate": 8.906323597362547e-06, |
| "loss": 0.272, |
| "mean_token_accuracy": 0.9009366929531097, |
| "num_tokens": 3470245.0, |
| "step": 397 |
| }, |
| { |
| "entropy": 1.522966742515564, |
| "epoch": 1.276083467094703, |
| "grad_norm": 3.9344754219055176, |
| "learning_rate": 8.899698828022895e-06, |
| "loss": 0.3411, |
| "mean_token_accuracy": 0.8857994973659515, |
| "num_tokens": 3479251.0, |
| "step": 398 |
| }, |
| { |
| "entropy": 1.2589990496635437, |
| "epoch": 1.2792937399678972, |
| "grad_norm": 2.7946629524230957, |
| "learning_rate": 8.893056534642938e-06, |
| "loss": 0.2591, |
| "mean_token_accuracy": 0.902241587638855, |
| "num_tokens": 3487630.0, |
| "step": 399 |
| }, |
| { |
| "entropy": 1.5324031114578247, |
| "epoch": 1.2825040128410916, |
| "grad_norm": 2.9131085872650146, |
| "learning_rate": 8.886396747071085e-06, |
| "loss": 0.2662, |
| "mean_token_accuracy": 0.898281991481781, |
| "num_tokens": 3496881.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.3316246271133423, |
| "epoch": 1.2857142857142856, |
| "grad_norm": 4.0149664878845215, |
| "learning_rate": 8.879719495234363e-06, |
| "loss": 0.282, |
| "mean_token_accuracy": 0.8956633508205414, |
| "num_tokens": 3505613.0, |
| "step": 401 |
| }, |
| { |
| "entropy": 1.3576619029045105, |
| "epoch": 1.28892455858748, |
| "grad_norm": 6.47138786315918, |
| "learning_rate": 8.873024809138272e-06, |
| "loss": 0.3248, |
| "mean_token_accuracy": 0.8703168630599976, |
| "num_tokens": 3515196.0, |
| "step": 402 |
| }, |
| { |
| "entropy": 1.5192933678627014, |
| "epoch": 1.2921348314606742, |
| "grad_norm": 4.7108635902404785, |
| "learning_rate": 8.866312718866669e-06, |
| "loss": 0.3272, |
| "mean_token_accuracy": 0.8735582530498505, |
| "num_tokens": 3525258.0, |
| "step": 403 |
| }, |
| { |
| "entropy": 1.5145000219345093, |
| "epoch": 1.2953451043338684, |
| "grad_norm": 7.440964698791504, |
| "learning_rate": 8.859583254581604e-06, |
| "loss": 0.3202, |
| "mean_token_accuracy": 0.8838435411453247, |
| "num_tokens": 3533799.0, |
| "step": 404 |
| }, |
| { |
| "entropy": 1.6401035785675049, |
| "epoch": 1.2985553772070626, |
| "grad_norm": 4.604669570922852, |
| "learning_rate": 8.852836446523213e-06, |
| "loss": 0.3255, |
| "mean_token_accuracy": 0.8817890584468842, |
| "num_tokens": 3541962.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 1.5389510989189148, |
| "epoch": 1.3017656500802568, |
| "grad_norm": 4.766162872314453, |
| "learning_rate": 8.846072325009562e-06, |
| "loss": 0.3256, |
| "mean_token_accuracy": 0.8761164546012878, |
| "num_tokens": 3551542.0, |
| "step": 406 |
| }, |
| { |
| "entropy": 1.457167625427246, |
| "epoch": 1.304975922953451, |
| "grad_norm": 4.539270401000977, |
| "learning_rate": 8.83929092043652e-06, |
| "loss": 0.3338, |
| "mean_token_accuracy": 0.872197687625885, |
| "num_tokens": 3559941.0, |
| "step": 407 |
| }, |
| { |
| "entropy": 1.4390366077423096, |
| "epoch": 1.3081861958266452, |
| "grad_norm": 4.6166582107543945, |
| "learning_rate": 8.832492263277624e-06, |
| "loss": 0.2873, |
| "mean_token_accuracy": 0.8764486014842987, |
| "num_tokens": 3569502.0, |
| "step": 408 |
| }, |
| { |
| "entropy": 1.5826404094696045, |
| "epoch": 1.3113964686998396, |
| "grad_norm": 35.64375686645508, |
| "learning_rate": 8.825676384083936e-06, |
| "loss": 0.327, |
| "mean_token_accuracy": 0.8866596817970276, |
| "num_tokens": 3578601.0, |
| "step": 409 |
| }, |
| { |
| "entropy": 1.3942549228668213, |
| "epoch": 1.3146067415730336, |
| "grad_norm": 3.5190839767456055, |
| "learning_rate": 8.818843313483907e-06, |
| "loss": 0.2994, |
| "mean_token_accuracy": 0.8889473378658295, |
| "num_tokens": 3587207.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.5776238441467285, |
| "epoch": 1.317817014446228, |
| "grad_norm": 9.147720336914062, |
| "learning_rate": 8.811993082183243e-06, |
| "loss": 0.2829, |
| "mean_token_accuracy": 0.8859248757362366, |
| "num_tokens": 3595617.0, |
| "step": 411 |
| }, |
| { |
| "entropy": 1.596324384212494, |
| "epoch": 1.3210272873194222, |
| "grad_norm": 5.241089344024658, |
| "learning_rate": 8.805125720964766e-06, |
| "loss": 0.2953, |
| "mean_token_accuracy": 0.8866595029830933, |
| "num_tokens": 3604639.0, |
| "step": 412 |
| }, |
| { |
| "entropy": 1.3364008069038391, |
| "epoch": 1.3242375601926164, |
| "grad_norm": 2.9017364978790283, |
| "learning_rate": 8.798241260688273e-06, |
| "loss": 0.3193, |
| "mean_token_accuracy": 0.8743776082992554, |
| "num_tokens": 3613839.0, |
| "step": 413 |
| }, |
| { |
| "entropy": 1.352913737297058, |
| "epoch": 1.3274478330658106, |
| "grad_norm": 3.1927762031555176, |
| "learning_rate": 8.791339732290398e-06, |
| "loss": 0.2869, |
| "mean_token_accuracy": 0.884104460477829, |
| "num_tokens": 3623670.0, |
| "step": 414 |
| }, |
| { |
| "entropy": 1.4007618427276611, |
| "epoch": 1.3306581059390048, |
| "grad_norm": 3.5387911796569824, |
| "learning_rate": 8.784421166784476e-06, |
| "loss": 0.2834, |
| "mean_token_accuracy": 0.8924818634986877, |
| "num_tokens": 3631397.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 1.5313379764556885, |
| "epoch": 1.333868378812199, |
| "grad_norm": 3.7980873584747314, |
| "learning_rate": 8.7774855952604e-06, |
| "loss": 0.284, |
| "mean_token_accuracy": 0.8888964354991913, |
| "num_tokens": 3640022.0, |
| "step": 416 |
| }, |
| { |
| "entropy": 1.3799718618392944, |
| "epoch": 1.3370786516853932, |
| "grad_norm": 3.859992027282715, |
| "learning_rate": 8.770533048884483e-06, |
| "loss": 0.2425, |
| "mean_token_accuracy": 0.9088575541973114, |
| "num_tokens": 3648177.0, |
| "step": 417 |
| }, |
| { |
| "entropy": 1.412042498588562, |
| "epoch": 1.3402889245585876, |
| "grad_norm": 4.135337829589844, |
| "learning_rate": 8.763563558899317e-06, |
| "loss": 0.2928, |
| "mean_token_accuracy": 0.8850153088569641, |
| "num_tokens": 3657888.0, |
| "step": 418 |
| }, |
| { |
| "entropy": 1.5607710480690002, |
| "epoch": 1.3434991974317816, |
| "grad_norm": 3.7058753967285156, |
| "learning_rate": 8.756577156623636e-06, |
| "loss": 0.2907, |
| "mean_token_accuracy": 0.8865703344345093, |
| "num_tokens": 3667083.0, |
| "step": 419 |
| }, |
| { |
| "entropy": 1.3873555660247803, |
| "epoch": 1.346709470304976, |
| "grad_norm": 3.0918662548065186, |
| "learning_rate": 8.749573873452169e-06, |
| "loss": 0.2682, |
| "mean_token_accuracy": 0.8970977663993835, |
| "num_tokens": 3675648.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.4942026734352112, |
| "epoch": 1.3499197431781702, |
| "grad_norm": 4.505756855010986, |
| "learning_rate": 8.742553740855507e-06, |
| "loss": 0.3386, |
| "mean_token_accuracy": 0.8737636208534241, |
| "num_tokens": 3684547.0, |
| "step": 421 |
| }, |
| { |
| "entropy": 1.3530024290084839, |
| "epoch": 1.3531300160513644, |
| "grad_norm": 4.251113414764404, |
| "learning_rate": 8.735516790379952e-06, |
| "loss": 0.3532, |
| "mean_token_accuracy": 0.871391236782074, |
| "num_tokens": 3693082.0, |
| "step": 422 |
| }, |
| { |
| "entropy": 1.3358929753303528, |
| "epoch": 1.3563402889245586, |
| "grad_norm": 3.3764379024505615, |
| "learning_rate": 8.728463053647382e-06, |
| "loss": 0.2892, |
| "mean_token_accuracy": 0.8893671631813049, |
| "num_tokens": 3701174.0, |
| "step": 423 |
| }, |
| { |
| "entropy": 1.5129033923149109, |
| "epoch": 1.3595505617977528, |
| "grad_norm": 3.298848867416382, |
| "learning_rate": 8.721392562355113e-06, |
| "loss": 0.2135, |
| "mean_token_accuracy": 0.9176328778266907, |
| "num_tokens": 3710272.0, |
| "step": 424 |
| }, |
| { |
| "entropy": 1.3921077251434326, |
| "epoch": 1.362760834670947, |
| "grad_norm": 4.316572189331055, |
| "learning_rate": 8.71430534827574e-06, |
| "loss": 0.3236, |
| "mean_token_accuracy": 0.8857169449329376, |
| "num_tokens": 3720135.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 1.37892746925354, |
| "epoch": 1.3659711075441412, |
| "grad_norm": 3.766646146774292, |
| "learning_rate": 8.707201443257015e-06, |
| "loss": 0.2828, |
| "mean_token_accuracy": 0.8919144570827484, |
| "num_tokens": 3728595.0, |
| "step": 426 |
| }, |
| { |
| "entropy": 1.2769124507904053, |
| "epoch": 1.3691813804173354, |
| "grad_norm": 3.253934144973755, |
| "learning_rate": 8.700080879221689e-06, |
| "loss": 0.2761, |
| "mean_token_accuracy": 0.8954965174198151, |
| "num_tokens": 3736117.0, |
| "step": 427 |
| }, |
| { |
| "entropy": 1.2963144183158875, |
| "epoch": 1.3723916532905296, |
| "grad_norm": 6.249416351318359, |
| "learning_rate": 8.692943688167371e-06, |
| "loss": 0.2693, |
| "mean_token_accuracy": 0.8879525661468506, |
| "num_tokens": 3744390.0, |
| "step": 428 |
| }, |
| { |
| "entropy": 1.3017955422401428, |
| "epoch": 1.375601926163724, |
| "grad_norm": 2.980830669403076, |
| "learning_rate": 8.685789902166395e-06, |
| "loss": 0.3061, |
| "mean_token_accuracy": 0.8886123299598694, |
| "num_tokens": 3753837.0, |
| "step": 429 |
| }, |
| { |
| "entropy": 1.349017083644867, |
| "epoch": 1.3788121990369182, |
| "grad_norm": 5.0209245681762695, |
| "learning_rate": 8.67861955336566e-06, |
| "loss": 0.2885, |
| "mean_token_accuracy": 0.8791326582431793, |
| "num_tokens": 3763493.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.4477825164794922, |
| "epoch": 1.3820224719101124, |
| "grad_norm": 11.758344650268555, |
| "learning_rate": 8.671432673986493e-06, |
| "loss": 0.3331, |
| "mean_token_accuracy": 0.8793164193630219, |
| "num_tokens": 3772358.0, |
| "step": 431 |
| }, |
| { |
| "entropy": 1.3367245197296143, |
| "epoch": 1.3852327447833066, |
| "grad_norm": 4.308809280395508, |
| "learning_rate": 8.664229296324514e-06, |
| "loss": 0.3042, |
| "mean_token_accuracy": 0.8680384755134583, |
| "num_tokens": 3780490.0, |
| "step": 432 |
| }, |
| { |
| "entropy": 1.449657917022705, |
| "epoch": 1.3884430176565008, |
| "grad_norm": 4.295592784881592, |
| "learning_rate": 8.657009452749466e-06, |
| "loss": 0.2871, |
| "mean_token_accuracy": 0.8916987478733063, |
| "num_tokens": 3788463.0, |
| "step": 433 |
| }, |
| { |
| "entropy": 1.49513840675354, |
| "epoch": 1.391653290529695, |
| "grad_norm": 4.6402201652526855, |
| "learning_rate": 8.649773175705099e-06, |
| "loss": 0.3005, |
| "mean_token_accuracy": 0.8849748373031616, |
| "num_tokens": 3797066.0, |
| "step": 434 |
| }, |
| { |
| "entropy": 1.2568953037261963, |
| "epoch": 1.3948635634028892, |
| "grad_norm": 3.7639451026916504, |
| "learning_rate": 8.642520497709001e-06, |
| "loss": 0.3126, |
| "mean_token_accuracy": 0.8799223005771637, |
| "num_tokens": 3805553.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 1.4464783668518066, |
| "epoch": 1.3980738362760834, |
| "grad_norm": 3.1986289024353027, |
| "learning_rate": 8.635251451352463e-06, |
| "loss": 0.3125, |
| "mean_token_accuracy": 0.8844713270664215, |
| "num_tokens": 3814202.0, |
| "step": 436 |
| }, |
| { |
| "entropy": 1.4772561192512512, |
| "epoch": 1.4012841091492776, |
| "grad_norm": 2.8951005935668945, |
| "learning_rate": 8.627966069300332e-06, |
| "loss": 0.2926, |
| "mean_token_accuracy": 0.8794938027858734, |
| "num_tokens": 3823416.0, |
| "step": 437 |
| }, |
| { |
| "entropy": 1.4769402742385864, |
| "epoch": 1.404494382022472, |
| "grad_norm": 4.300238609313965, |
| "learning_rate": 8.620664384290863e-06, |
| "loss": 0.3378, |
| "mean_token_accuracy": 0.8839116096496582, |
| "num_tokens": 3832081.0, |
| "step": 438 |
| }, |
| { |
| "entropy": 1.3235585689544678, |
| "epoch": 1.4077046548956662, |
| "grad_norm": 3.8008766174316406, |
| "learning_rate": 8.613346429135567e-06, |
| "loss": 0.3076, |
| "mean_token_accuracy": 0.8767895400524139, |
| "num_tokens": 3841534.0, |
| "step": 439 |
| }, |
| { |
| "entropy": 1.423071026802063, |
| "epoch": 1.4109149277688604, |
| "grad_norm": 5.64111852645874, |
| "learning_rate": 8.606012236719073e-06, |
| "loss": 0.2855, |
| "mean_token_accuracy": 0.8974299728870392, |
| "num_tokens": 3849597.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.5393714904785156, |
| "epoch": 1.4141252006420546, |
| "grad_norm": 3.2510595321655273, |
| "learning_rate": 8.598661839998972e-06, |
| "loss": 0.2969, |
| "mean_token_accuracy": 0.89105424284935, |
| "num_tokens": 3858231.0, |
| "step": 441 |
| }, |
| { |
| "entropy": 1.4409209489822388, |
| "epoch": 1.4173354735152488, |
| "grad_norm": 3.6261956691741943, |
| "learning_rate": 8.591295272005674e-06, |
| "loss": 0.3012, |
| "mean_token_accuracy": 0.8888025879859924, |
| "num_tokens": 3867227.0, |
| "step": 442 |
| }, |
| { |
| "entropy": 1.5317130088806152, |
| "epoch": 1.420545746388443, |
| "grad_norm": 2.9155311584472656, |
| "learning_rate": 8.583912565842258e-06, |
| "loss": 0.2643, |
| "mean_token_accuracy": 0.9002439677715302, |
| "num_tokens": 3875137.0, |
| "step": 443 |
| }, |
| { |
| "entropy": 1.5092316269874573, |
| "epoch": 1.4237560192616372, |
| "grad_norm": 8.434717178344727, |
| "learning_rate": 8.576513754684318e-06, |
| "loss": 0.2908, |
| "mean_token_accuracy": 0.8917935788631439, |
| "num_tokens": 3883306.0, |
| "step": 444 |
| }, |
| { |
| "entropy": 1.4440070390701294, |
| "epoch": 1.4269662921348314, |
| "grad_norm": 3.2048768997192383, |
| "learning_rate": 8.569098871779828e-06, |
| "loss": 0.2837, |
| "mean_token_accuracy": 0.8814037442207336, |
| "num_tokens": 3892331.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 1.4215741753578186, |
| "epoch": 1.4301765650080256, |
| "grad_norm": 4.36214017868042, |
| "learning_rate": 8.561667950448973e-06, |
| "loss": 0.3272, |
| "mean_token_accuracy": 0.8756458759307861, |
| "num_tokens": 3901483.0, |
| "step": 446 |
| }, |
| { |
| "entropy": 1.5180083513259888, |
| "epoch": 1.43338683788122, |
| "grad_norm": 3.3972907066345215, |
| "learning_rate": 8.554221024084019e-06, |
| "loss": 0.2934, |
| "mean_token_accuracy": 0.8790063261985779, |
| "num_tokens": 3910713.0, |
| "step": 447 |
| }, |
| { |
| "entropy": 1.4909613132476807, |
| "epoch": 1.4365971107544142, |
| "grad_norm": 3.484736919403076, |
| "learning_rate": 8.546758126149148e-06, |
| "loss": 0.3411, |
| "mean_token_accuracy": 0.8712872564792633, |
| "num_tokens": 3918429.0, |
| "step": 448 |
| }, |
| { |
| "entropy": 1.4828435182571411, |
| "epoch": 1.4398073836276084, |
| "grad_norm": 3.011584997177124, |
| "learning_rate": 8.539279290180315e-06, |
| "loss": 0.2702, |
| "mean_token_accuracy": 0.8983392119407654, |
| "num_tokens": 3927059.0, |
| "step": 449 |
| }, |
| { |
| "entropy": 1.3964380025863647, |
| "epoch": 1.4430176565008026, |
| "grad_norm": 3.2408764362335205, |
| "learning_rate": 8.531784549785098e-06, |
| "loss": 0.3098, |
| "mean_token_accuracy": 0.8849463164806366, |
| "num_tokens": 3936625.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.4703381657600403, |
| "epoch": 1.4462279293739968, |
| "grad_norm": 2.8378424644470215, |
| "learning_rate": 8.524273938642539e-06, |
| "loss": 0.2708, |
| "mean_token_accuracy": 0.9015527069568634, |
| "num_tokens": 3944552.0, |
| "step": 451 |
| }, |
| { |
| "entropy": 1.693705976009369, |
| "epoch": 1.449438202247191, |
| "grad_norm": 4.766742706298828, |
| "learning_rate": 8.516747490503001e-06, |
| "loss": 0.2855, |
| "mean_token_accuracy": 0.8845842480659485, |
| "num_tokens": 3953203.0, |
| "step": 452 |
| }, |
| { |
| "entropy": 1.4736073017120361, |
| "epoch": 1.4526484751203852, |
| "grad_norm": 3.187501907348633, |
| "learning_rate": 8.509205239188017e-06, |
| "loss": 0.3253, |
| "mean_token_accuracy": 0.8820372521877289, |
| "num_tokens": 3961297.0, |
| "step": 453 |
| }, |
| { |
| "entropy": 1.4047070145606995, |
| "epoch": 1.4558587479935794, |
| "grad_norm": 2.950268507003784, |
| "learning_rate": 8.501647218590127e-06, |
| "loss": 0.3094, |
| "mean_token_accuracy": 0.8798324465751648, |
| "num_tokens": 3970101.0, |
| "step": 454 |
| }, |
| { |
| "entropy": 1.5628494620323181, |
| "epoch": 1.4590690208667736, |
| "grad_norm": 5.165965557098389, |
| "learning_rate": 8.494073462672743e-06, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.8838167488574982, |
| "num_tokens": 3978005.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 1.3936602473258972, |
| "epoch": 1.462279293739968, |
| "grad_norm": 3.246717929840088, |
| "learning_rate": 8.486484005469977e-06, |
| "loss": 0.3419, |
| "mean_token_accuracy": 0.8834190964698792, |
| "num_tokens": 3985914.0, |
| "step": 456 |
| }, |
| { |
| "entropy": 1.401951789855957, |
| "epoch": 1.465489566613162, |
| "grad_norm": 4.547334671020508, |
| "learning_rate": 8.478878881086505e-06, |
| "loss": 0.3182, |
| "mean_token_accuracy": 0.8777275681495667, |
| "num_tokens": 3994565.0, |
| "step": 457 |
| }, |
| { |
| "entropy": 1.3956224918365479, |
| "epoch": 1.4686998394863564, |
| "grad_norm": 3.96466064453125, |
| "learning_rate": 8.471258123697403e-06, |
| "loss": 0.4326, |
| "mean_token_accuracy": 0.8498894572257996, |
| "num_tokens": 4005280.0, |
| "step": 458 |
| }, |
| { |
| "entropy": 1.5551961660385132, |
| "epoch": 1.4719101123595506, |
| "grad_norm": 3.1760103702545166, |
| "learning_rate": 8.463621767547998e-06, |
| "loss": 0.2698, |
| "mean_token_accuracy": 0.8973476886749268, |
| "num_tokens": 4014231.0, |
| "step": 459 |
| }, |
| { |
| "entropy": 1.4384177923202515, |
| "epoch": 1.4751203852327448, |
| "grad_norm": 2.9949951171875, |
| "learning_rate": 8.455969846953711e-06, |
| "loss": 0.2774, |
| "mean_token_accuracy": 0.8971374332904816, |
| "num_tokens": 4022904.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.6596906185150146, |
| "epoch": 1.478330658105939, |
| "grad_norm": 3.8711040019989014, |
| "learning_rate": 8.448302396299906e-06, |
| "loss": 0.3023, |
| "mean_token_accuracy": 0.8945289552211761, |
| "num_tokens": 4031368.0, |
| "step": 461 |
| }, |
| { |
| "entropy": 1.5498453974723816, |
| "epoch": 1.4815409309791332, |
| "grad_norm": 3.1765880584716797, |
| "learning_rate": 8.440619450041736e-06, |
| "loss": 0.2813, |
| "mean_token_accuracy": 0.8945924937725067, |
| "num_tokens": 4039373.0, |
| "step": 462 |
| }, |
| { |
| "entropy": 1.5488550662994385, |
| "epoch": 1.4847512038523274, |
| "grad_norm": 4.509853363037109, |
| "learning_rate": 8.432921042703985e-06, |
| "loss": 0.2502, |
| "mean_token_accuracy": 0.9076395630836487, |
| "num_tokens": 4047151.0, |
| "step": 463 |
| }, |
| { |
| "entropy": 1.3990018367767334, |
| "epoch": 1.4879614767255216, |
| "grad_norm": 4.891584396362305, |
| "learning_rate": 8.425207208880914e-06, |
| "loss": 0.3393, |
| "mean_token_accuracy": 0.8612608909606934, |
| "num_tokens": 4055428.0, |
| "step": 464 |
| }, |
| { |
| "entropy": 1.5296116471290588, |
| "epoch": 1.491171749598716, |
| "grad_norm": 3.9132232666015625, |
| "learning_rate": 8.417477983236107e-06, |
| "loss": 0.2889, |
| "mean_token_accuracy": 0.9005896151065826, |
| "num_tokens": 4063567.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 1.4369441270828247, |
| "epoch": 1.49438202247191, |
| "grad_norm": 4.291438102722168, |
| "learning_rate": 8.409733400502311e-06, |
| "loss": 0.343, |
| "mean_token_accuracy": 0.8742758929729462, |
| "num_tokens": 4072146.0, |
| "step": 466 |
| }, |
| { |
| "entropy": 1.4824082255363464, |
| "epoch": 1.4975922953451044, |
| "grad_norm": 3.5444793701171875, |
| "learning_rate": 8.401973495481289e-06, |
| "loss": 0.3, |
| "mean_token_accuracy": 0.8864571452140808, |
| "num_tokens": 4080309.0, |
| "step": 467 |
| }, |
| { |
| "entropy": 1.3830168843269348, |
| "epoch": 1.5008025682182986, |
| "grad_norm": 4.718812465667725, |
| "learning_rate": 8.39419830304365e-06, |
| "loss": 0.2767, |
| "mean_token_accuracy": 0.8947529196739197, |
| "num_tokens": 4088027.0, |
| "step": 468 |
| }, |
| { |
| "entropy": 1.501311182975769, |
| "epoch": 1.5040128410914928, |
| "grad_norm": 4.948775768280029, |
| "learning_rate": 8.386407858128707e-06, |
| "loss": 0.2856, |
| "mean_token_accuracy": 0.8906499743461609, |
| "num_tokens": 4097004.0, |
| "step": 469 |
| }, |
| { |
| "entropy": 1.4303480386734009, |
| "epoch": 1.507223113964687, |
| "grad_norm": 3.6689600944519043, |
| "learning_rate": 8.378602195744308e-06, |
| "loss": 0.2981, |
| "mean_token_accuracy": 0.8818539083003998, |
| "num_tokens": 4105389.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.4083414673805237, |
| "epoch": 1.5104333868378812, |
| "grad_norm": 3.417105197906494, |
| "learning_rate": 8.370781350966683e-06, |
| "loss": 0.3039, |
| "mean_token_accuracy": 0.8782542049884796, |
| "num_tokens": 4114039.0, |
| "step": 471 |
| }, |
| { |
| "entropy": 1.3129056096076965, |
| "epoch": 1.5136436597110754, |
| "grad_norm": 3.7305545806884766, |
| "learning_rate": 8.362945358940295e-06, |
| "loss": 0.3669, |
| "mean_token_accuracy": 0.8697949945926666, |
| "num_tokens": 4123365.0, |
| "step": 472 |
| }, |
| { |
| "entropy": 1.3081690669059753, |
| "epoch": 1.5168539325842696, |
| "grad_norm": 3.302035093307495, |
| "learning_rate": 8.355094254877665e-06, |
| "loss": 0.3144, |
| "mean_token_accuracy": 0.8910820186138153, |
| "num_tokens": 4132153.0, |
| "step": 473 |
| }, |
| { |
| "entropy": 1.47750985622406, |
| "epoch": 1.520064205457464, |
| "grad_norm": 3.3554630279541016, |
| "learning_rate": 8.347228074059227e-06, |
| "loss": 0.2683, |
| "mean_token_accuracy": 0.9022042751312256, |
| "num_tokens": 4140393.0, |
| "step": 474 |
| }, |
| { |
| "entropy": 1.348096251487732, |
| "epoch": 1.523274478330658, |
| "grad_norm": 2.9571056365966797, |
| "learning_rate": 8.339346851833163e-06, |
| "loss": 0.3485, |
| "mean_token_accuracy": 0.8722924292087555, |
| "num_tokens": 4149933.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 1.4152057766914368, |
| "epoch": 1.5264847512038524, |
| "grad_norm": 3.6118574142456055, |
| "learning_rate": 8.33145062361525e-06, |
| "loss": 0.2283, |
| "mean_token_accuracy": 0.9181205034255981, |
| "num_tokens": 4157852.0, |
| "step": 476 |
| }, |
| { |
| "entropy": 1.393871009349823, |
| "epoch": 1.5296950240770464, |
| "grad_norm": 3.289285898208618, |
| "learning_rate": 8.323539424888695e-06, |
| "loss": 0.3666, |
| "mean_token_accuracy": 0.8745063245296478, |
| "num_tokens": 4167313.0, |
| "step": 477 |
| }, |
| { |
| "entropy": 1.4740851521492004, |
| "epoch": 1.5329052969502408, |
| "grad_norm": 2.9904513359069824, |
| "learning_rate": 8.315613291203977e-06, |
| "loss": 0.2871, |
| "mean_token_accuracy": 0.8881636559963226, |
| "num_tokens": 4175515.0, |
| "step": 478 |
| }, |
| { |
| "entropy": 1.4914516806602478, |
| "epoch": 1.536115569823435, |
| "grad_norm": 5.311322212219238, |
| "learning_rate": 8.30767225817869e-06, |
| "loss": 0.3231, |
| "mean_token_accuracy": 0.878555953502655, |
| "num_tokens": 4183781.0, |
| "step": 479 |
| }, |
| { |
| "entropy": 1.4010317921638489, |
| "epoch": 1.5393258426966292, |
| "grad_norm": 3.215667963027954, |
| "learning_rate": 8.299716361497377e-06, |
| "loss": 0.3094, |
| "mean_token_accuracy": 0.8759619891643524, |
| "num_tokens": 4191987.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.506856381893158, |
| "epoch": 1.5425361155698234, |
| "grad_norm": 3.147756576538086, |
| "learning_rate": 8.291745636911382e-06, |
| "loss": 0.2931, |
| "mean_token_accuracy": 0.8908743560314178, |
| "num_tokens": 4200711.0, |
| "step": 481 |
| }, |
| { |
| "entropy": 1.324585199356079, |
| "epoch": 1.5457463884430176, |
| "grad_norm": 3.0587069988250732, |
| "learning_rate": 8.283760120238672e-06, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.8850060105323792, |
| "num_tokens": 4209179.0, |
| "step": 482 |
| }, |
| { |
| "entropy": 1.4364397525787354, |
| "epoch": 1.548956661316212, |
| "grad_norm": 10.770256042480469, |
| "learning_rate": 8.27575984736369e-06, |
| "loss": 0.2789, |
| "mean_token_accuracy": 0.8973394930362701, |
| "num_tokens": 4217103.0, |
| "step": 483 |
| }, |
| { |
| "entropy": 1.5607190132141113, |
| "epoch": 1.552166934189406, |
| "grad_norm": 3.770235538482666, |
| "learning_rate": 8.26774485423719e-06, |
| "loss": 0.3243, |
| "mean_token_accuracy": 0.8835341334342957, |
| "num_tokens": 4226475.0, |
| "step": 484 |
| }, |
| { |
| "entropy": 1.5396313071250916, |
| "epoch": 1.5553772070626004, |
| "grad_norm": 3.7008605003356934, |
| "learning_rate": 8.259715176876069e-06, |
| "loss": 0.3293, |
| "mean_token_accuracy": 0.8814294040203094, |
| "num_tokens": 4234718.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 1.359902560710907, |
| "epoch": 1.5585874799357944, |
| "grad_norm": 4.006896495819092, |
| "learning_rate": 8.251670851363214e-06, |
| "loss": 0.2896, |
| "mean_token_accuracy": 0.8906521201133728, |
| "num_tokens": 4243360.0, |
| "step": 486 |
| }, |
| { |
| "entropy": 1.4276302456855774, |
| "epoch": 1.5617977528089888, |
| "grad_norm": 7.1651716232299805, |
| "learning_rate": 8.243611913847337e-06, |
| "loss": 0.2822, |
| "mean_token_accuracy": 0.8945540487766266, |
| "num_tokens": 4252105.0, |
| "step": 487 |
| }, |
| { |
| "entropy": 1.4439811706542969, |
| "epoch": 1.565008025682183, |
| "grad_norm": 6.675753116607666, |
| "learning_rate": 8.235538400542809e-06, |
| "loss": 0.2913, |
| "mean_token_accuracy": 0.8855163156986237, |
| "num_tokens": 4261011.0, |
| "step": 488 |
| }, |
| { |
| "entropy": 1.5044021606445312, |
| "epoch": 1.5682182985553772, |
| "grad_norm": 3.4322025775909424, |
| "learning_rate": 8.2274503477295e-06, |
| "loss": 0.264, |
| "mean_token_accuracy": 0.9048478901386261, |
| "num_tokens": 4268926.0, |
| "step": 489 |
| }, |
| { |
| "entropy": 1.449196219444275, |
| "epoch": 1.5714285714285714, |
| "grad_norm": 4.121733665466309, |
| "learning_rate": 8.21934779175262e-06, |
| "loss": 0.3145, |
| "mean_token_accuracy": 0.8815996646881104, |
| "num_tokens": 4277478.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.583847999572754, |
| "epoch": 1.5746388443017656, |
| "grad_norm": 3.647516965866089, |
| "learning_rate": 8.211230769022552e-06, |
| "loss": 0.2795, |
| "mean_token_accuracy": 0.8901920318603516, |
| "num_tokens": 4287141.0, |
| "step": 491 |
| }, |
| { |
| "entropy": 1.7253791093826294, |
| "epoch": 1.57784911717496, |
| "grad_norm": 5.149407386779785, |
| "learning_rate": 8.203099316014679e-06, |
| "loss": 0.2618, |
| "mean_token_accuracy": 0.9063104391098022, |
| "num_tokens": 4295839.0, |
| "step": 492 |
| }, |
| { |
| "entropy": 1.8327444195747375, |
| "epoch": 1.581059390048154, |
| "grad_norm": 4.351746559143066, |
| "learning_rate": 8.19495346926924e-06, |
| "loss": 0.359, |
| "mean_token_accuracy": 0.8663514256477356, |
| "num_tokens": 4305643.0, |
| "step": 493 |
| }, |
| { |
| "entropy": 1.4635063409805298, |
| "epoch": 1.5842696629213484, |
| "grad_norm": 3.7534878253936768, |
| "learning_rate": 8.18679326539115e-06, |
| "loss": 0.287, |
| "mean_token_accuracy": 0.8927922546863556, |
| "num_tokens": 4314046.0, |
| "step": 494 |
| }, |
| { |
| "entropy": 1.347511351108551, |
| "epoch": 1.5874799357945424, |
| "grad_norm": 3.258915424346924, |
| "learning_rate": 8.178618741049841e-06, |
| "loss": 0.3147, |
| "mean_token_accuracy": 0.8884759247303009, |
| "num_tokens": 4322655.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 1.5007055401802063, |
| "epoch": 1.5906902086677368, |
| "grad_norm": 3.146191120147705, |
| "learning_rate": 8.170429932979097e-06, |
| "loss": 0.2464, |
| "mean_token_accuracy": 0.9067124128341675, |
| "num_tokens": 4330697.0, |
| "step": 496 |
| }, |
| { |
| "entropy": 1.4456924796104431, |
| "epoch": 1.593900481540931, |
| "grad_norm": 4.14158821105957, |
| "learning_rate": 8.162226877976886e-06, |
| "loss": 0.2741, |
| "mean_token_accuracy": 0.890965074300766, |
| "num_tokens": 4338442.0, |
| "step": 497 |
| }, |
| { |
| "entropy": 1.3182119727134705, |
| "epoch": 1.5971107544141252, |
| "grad_norm": 4.268415451049805, |
| "learning_rate": 8.154009612905205e-06, |
| "loss": 0.3057, |
| "mean_token_accuracy": 0.8871277570724487, |
| "num_tokens": 4346989.0, |
| "step": 498 |
| }, |
| { |
| "entropy": 1.3188686966896057, |
| "epoch": 1.6003210272873194, |
| "grad_norm": 3.7496144771575928, |
| "learning_rate": 8.145778174689897e-06, |
| "loss": 0.327, |
| "mean_token_accuracy": 0.8702134490013123, |
| "num_tokens": 4355824.0, |
| "step": 499 |
| }, |
| { |
| "entropy": 1.5009222030639648, |
| "epoch": 1.6035313001605136, |
| "grad_norm": 3.6059823036193848, |
| "learning_rate": 8.137532600320502e-06, |
| "loss": 0.3169, |
| "mean_token_accuracy": 0.8813262283802032, |
| "num_tokens": 4364156.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.4714254140853882, |
| "epoch": 1.606741573033708, |
| "grad_norm": 3.3746225833892822, |
| "learning_rate": 8.129272926850079e-06, |
| "loss": 0.3275, |
| "mean_token_accuracy": 0.8713617920875549, |
| "num_tokens": 4373790.0, |
| "step": 501 |
| }, |
| { |
| "entropy": 1.2796449065208435, |
| "epoch": 1.609951845906902, |
| "grad_norm": 8.295031547546387, |
| "learning_rate": 8.120999191395048e-06, |
| "loss": 0.3182, |
| "mean_token_accuracy": 0.8713172674179077, |
| "num_tokens": 4381975.0, |
| "step": 502 |
| }, |
| { |
| "entropy": 1.2624800205230713, |
| "epoch": 1.6131621187800964, |
| "grad_norm": 3.607821226119995, |
| "learning_rate": 8.112711431135014e-06, |
| "loss": 0.3789, |
| "mean_token_accuracy": 0.8556053936481476, |
| "num_tokens": 4393419.0, |
| "step": 503 |
| }, |
| { |
| "entropy": 1.4322052001953125, |
| "epoch": 1.6163723916532904, |
| "grad_norm": 22.992494583129883, |
| "learning_rate": 8.10440968331261e-06, |
| "loss": 0.2864, |
| "mean_token_accuracy": 0.8980510830879211, |
| "num_tokens": 4402672.0, |
| "step": 504 |
| }, |
| { |
| "entropy": 1.5261085033416748, |
| "epoch": 1.6195826645264848, |
| "grad_norm": 3.8417842388153076, |
| "learning_rate": 8.096093985233323e-06, |
| "loss": 0.3399, |
| "mean_token_accuracy": 0.8749682307243347, |
| "num_tokens": 4413027.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 1.5344858169555664, |
| "epoch": 1.622792937399679, |
| "grad_norm": 29.822771072387695, |
| "learning_rate": 8.087764374265325e-06, |
| "loss": 0.3371, |
| "mean_token_accuracy": 0.8792240917682648, |
| "num_tokens": 4422236.0, |
| "step": 506 |
| }, |
| { |
| "entropy": 1.4433594942092896, |
| "epoch": 1.6260032102728732, |
| "grad_norm": 3.875814914703369, |
| "learning_rate": 8.079420887839316e-06, |
| "loss": 0.3143, |
| "mean_token_accuracy": 0.8798151612281799, |
| "num_tokens": 4431073.0, |
| "step": 507 |
| }, |
| { |
| "entropy": 1.580399751663208, |
| "epoch": 1.6292134831460674, |
| "grad_norm": 4.023304462432861, |
| "learning_rate": 8.071063563448341e-06, |
| "loss": 0.306, |
| "mean_token_accuracy": 0.8894245326519012, |
| "num_tokens": 4439845.0, |
| "step": 508 |
| }, |
| { |
| "entropy": 1.366957426071167, |
| "epoch": 1.6324237560192616, |
| "grad_norm": 4.008258819580078, |
| "learning_rate": 8.062692438647628e-06, |
| "loss": 0.3259, |
| "mean_token_accuracy": 0.8769682347774506, |
| "num_tokens": 4447925.0, |
| "step": 509 |
| }, |
| { |
| "entropy": 1.3271659016609192, |
| "epoch": 1.635634028892456, |
| "grad_norm": 15.104333877563477, |
| "learning_rate": 8.054307551054427e-06, |
| "loss": 0.3196, |
| "mean_token_accuracy": 0.8766265213489532, |
| "num_tokens": 4459279.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.3582776188850403, |
| "epoch": 1.63884430176565, |
| "grad_norm": 3.408693790435791, |
| "learning_rate": 8.045908938347828e-06, |
| "loss": 0.3227, |
| "mean_token_accuracy": 0.8774316906929016, |
| "num_tokens": 4467698.0, |
| "step": 511 |
| }, |
| { |
| "entropy": 1.440934956073761, |
| "epoch": 1.6420545746388444, |
| "grad_norm": 4.110688209533691, |
| "learning_rate": 8.037496638268599e-06, |
| "loss": 0.3005, |
| "mean_token_accuracy": 0.8867291510105133, |
| "num_tokens": 4477102.0, |
| "step": 512 |
| }, |
| { |
| "entropy": 1.7379968762397766, |
| "epoch": 1.6452648475120384, |
| "grad_norm": 12.632403373718262, |
| "learning_rate": 8.029070688619013e-06, |
| "loss": 0.2961, |
| "mean_token_accuracy": 0.8926231265068054, |
| "num_tokens": 4487287.0, |
| "step": 513 |
| }, |
| { |
| "entropy": 1.3730071783065796, |
| "epoch": 1.6484751203852328, |
| "grad_norm": 2.8596882820129395, |
| "learning_rate": 8.020631127262681e-06, |
| "loss": 0.2615, |
| "mean_token_accuracy": 0.90137779712677, |
| "num_tokens": 4496229.0, |
| "step": 514 |
| }, |
| { |
| "entropy": 1.4635502099990845, |
| "epoch": 1.651685393258427, |
| "grad_norm": 3.51971173286438, |
| "learning_rate": 8.012177992124385e-06, |
| "loss": 0.3054, |
| "mean_token_accuracy": 0.8946611881256104, |
| "num_tokens": 4504147.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 1.335837960243225, |
| "epoch": 1.6548956661316212, |
| "grad_norm": 2.9270989894866943, |
| "learning_rate": 8.003711321189895e-06, |
| "loss": 0.3361, |
| "mean_token_accuracy": 0.872332900762558, |
| "num_tokens": 4513383.0, |
| "step": 516 |
| }, |
| { |
| "entropy": 1.4679010510444641, |
| "epoch": 1.6581059390048154, |
| "grad_norm": 2.6027233600616455, |
| "learning_rate": 7.995231152505815e-06, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.9143733978271484, |
| "num_tokens": 4521270.0, |
| "step": 517 |
| }, |
| { |
| "entropy": 1.3284828662872314, |
| "epoch": 1.6613162118780096, |
| "grad_norm": 5.920923709869385, |
| "learning_rate": 7.986737524179398e-06, |
| "loss": 0.3287, |
| "mean_token_accuracy": 0.8773960471153259, |
| "num_tokens": 4530869.0, |
| "step": 518 |
| }, |
| { |
| "entropy": 1.5258394479751587, |
| "epoch": 1.664526484751204, |
| "grad_norm": 10.5205078125, |
| "learning_rate": 7.978230474378383e-06, |
| "loss": 0.2435, |
| "mean_token_accuracy": 0.9132257103919983, |
| "num_tokens": 4539566.0, |
| "step": 519 |
| }, |
| { |
| "entropy": 1.3652837872505188, |
| "epoch": 1.667736757624398, |
| "grad_norm": 2.759265661239624, |
| "learning_rate": 7.96971004133082e-06, |
| "loss": 0.3594, |
| "mean_token_accuracy": 0.8666320443153381, |
| "num_tokens": 4549459.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.4656896591186523, |
| "epoch": 1.6709470304975924, |
| "grad_norm": 4.031230449676514, |
| "learning_rate": 7.961176263324902e-06, |
| "loss": 0.3404, |
| "mean_token_accuracy": 0.8758351504802704, |
| "num_tokens": 4558053.0, |
| "step": 521 |
| }, |
| { |
| "entropy": 1.383994698524475, |
| "epoch": 1.6741573033707864, |
| "grad_norm": 12.914740562438965, |
| "learning_rate": 7.952629178708783e-06, |
| "loss": 0.3391, |
| "mean_token_accuracy": 0.8680421113967896, |
| "num_tokens": 4566800.0, |
| "step": 522 |
| }, |
| { |
| "entropy": 1.3812520503997803, |
| "epoch": 1.6773675762439808, |
| "grad_norm": 3.0095322132110596, |
| "learning_rate": 7.944068825890424e-06, |
| "loss": 0.3193, |
| "mean_token_accuracy": 0.877071350812912, |
| "num_tokens": 4575949.0, |
| "step": 523 |
| }, |
| { |
| "entropy": 1.4497495293617249, |
| "epoch": 1.680577849117175, |
| "grad_norm": 3.192546844482422, |
| "learning_rate": 7.935495243337397e-06, |
| "loss": 0.2906, |
| "mean_token_accuracy": 0.8865289986133575, |
| "num_tokens": 4586128.0, |
| "step": 524 |
| }, |
| { |
| "entropy": 1.4661012291908264, |
| "epoch": 1.6837881219903692, |
| "grad_norm": 4.490471363067627, |
| "learning_rate": 7.92690846957673e-06, |
| "loss": 0.3304, |
| "mean_token_accuracy": 0.8626764714717865, |
| "num_tokens": 4595541.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 1.4555456638336182, |
| "epoch": 1.6869983948635634, |
| "grad_norm": 3.7944064140319824, |
| "learning_rate": 7.918308543194735e-06, |
| "loss": 0.3044, |
| "mean_token_accuracy": 0.8946430087089539, |
| "num_tokens": 4604904.0, |
| "step": 526 |
| }, |
| { |
| "entropy": 1.4736084938049316, |
| "epoch": 1.6902086677367576, |
| "grad_norm": 3.330254077911377, |
| "learning_rate": 7.909695502836814e-06, |
| "loss": 0.3478, |
| "mean_token_accuracy": 0.8704387247562408, |
| "num_tokens": 4613636.0, |
| "step": 527 |
| }, |
| { |
| "entropy": 1.3352216482162476, |
| "epoch": 1.6934189406099518, |
| "grad_norm": 3.57326078414917, |
| "learning_rate": 7.90106938720731e-06, |
| "loss": 0.2742, |
| "mean_token_accuracy": 0.9011513888835907, |
| "num_tokens": 4622451.0, |
| "step": 528 |
| }, |
| { |
| "entropy": 1.4958056211471558, |
| "epoch": 1.696629213483146, |
| "grad_norm": 3.4741249084472656, |
| "learning_rate": 7.892430235069317e-06, |
| "loss": 0.2964, |
| "mean_token_accuracy": 0.8987282514572144, |
| "num_tokens": 4630703.0, |
| "step": 529 |
| }, |
| { |
| "entropy": 1.4583451747894287, |
| "epoch": 1.6998394863563404, |
| "grad_norm": 50.94432067871094, |
| "learning_rate": 7.883778085244514e-06, |
| "loss": 0.3036, |
| "mean_token_accuracy": 0.8844209909439087, |
| "num_tokens": 4638687.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.5013746619224548, |
| "epoch": 1.7030497592295344, |
| "grad_norm": 2.947014808654785, |
| "learning_rate": 7.875112976612984e-06, |
| "loss": 0.3169, |
| "mean_token_accuracy": 0.8807470500469208, |
| "num_tokens": 4648423.0, |
| "step": 531 |
| }, |
| { |
| "entropy": 1.4312713742256165, |
| "epoch": 1.7062600321027288, |
| "grad_norm": 3.6040570735931396, |
| "learning_rate": 7.866434948113046e-06, |
| "loss": 0.2446, |
| "mean_token_accuracy": 0.9076212048530579, |
| "num_tokens": 4656129.0, |
| "step": 532 |
| }, |
| { |
| "entropy": 1.4022682309150696, |
| "epoch": 1.709470304975923, |
| "grad_norm": 2.8968658447265625, |
| "learning_rate": 7.857744038741076e-06, |
| "loss": 0.2686, |
| "mean_token_accuracy": 0.8959876894950867, |
| "num_tokens": 4664283.0, |
| "step": 533 |
| }, |
| { |
| "entropy": 1.4385973811149597, |
| "epoch": 1.7126805778491172, |
| "grad_norm": 5.81376838684082, |
| "learning_rate": 7.849040287551331e-06, |
| "loss": 0.2839, |
| "mean_token_accuracy": 0.899337649345398, |
| "num_tokens": 4672631.0, |
| "step": 534 |
| }, |
| { |
| "entropy": 1.347984254360199, |
| "epoch": 1.7158908507223114, |
| "grad_norm": 3.789095401763916, |
| "learning_rate": 7.84032373365578e-06, |
| "loss": 0.3542, |
| "mean_token_accuracy": 0.851859450340271, |
| "num_tokens": 4681402.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 1.2535653114318848, |
| "epoch": 1.7191011235955056, |
| "grad_norm": 3.5626060962677, |
| "learning_rate": 7.831594416223916e-06, |
| "loss": 0.2727, |
| "mean_token_accuracy": 0.8892323970794678, |
| "num_tokens": 4689845.0, |
| "step": 536 |
| }, |
| { |
| "entropy": 1.5344293117523193, |
| "epoch": 1.7223113964686998, |
| "grad_norm": 3.13451886177063, |
| "learning_rate": 7.822852374482597e-06, |
| "loss": 0.254, |
| "mean_token_accuracy": 0.912846565246582, |
| "num_tokens": 4698327.0, |
| "step": 537 |
| }, |
| { |
| "entropy": 1.2384685277938843, |
| "epoch": 1.725521669341894, |
| "grad_norm": 2.712188482284546, |
| "learning_rate": 7.814097647715848e-06, |
| "loss": 0.2943, |
| "mean_token_accuracy": 0.892603188753128, |
| "num_tokens": 4708113.0, |
| "step": 538 |
| }, |
| { |
| "entropy": 1.2682060599327087, |
| "epoch": 1.7287319422150884, |
| "grad_norm": 6.144076824188232, |
| "learning_rate": 7.805330275264707e-06, |
| "loss": 0.3334, |
| "mean_token_accuracy": 0.8756844997406006, |
| "num_tokens": 4717320.0, |
| "step": 539 |
| }, |
| { |
| "entropy": 1.5746580362319946, |
| "epoch": 1.7319422150882824, |
| "grad_norm": 4.383955478668213, |
| "learning_rate": 7.796550296527032e-06, |
| "loss": 0.2648, |
| "mean_token_accuracy": 0.8971992433071136, |
| "num_tokens": 4725706.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.4106557965278625, |
| "epoch": 1.7351524879614768, |
| "grad_norm": 3.649662733078003, |
| "learning_rate": 7.787757750957335e-06, |
| "loss": 0.2688, |
| "mean_token_accuracy": 0.900052547454834, |
| "num_tokens": 4733543.0, |
| "step": 541 |
| }, |
| { |
| "entropy": 1.468329668045044, |
| "epoch": 1.7383627608346708, |
| "grad_norm": 3.449719190597534, |
| "learning_rate": 7.778952678066591e-06, |
| "loss": 0.3198, |
| "mean_token_accuracy": 0.874014675617218, |
| "num_tokens": 4742070.0, |
| "step": 542 |
| }, |
| { |
| "entropy": 1.3497502207756042, |
| "epoch": 1.7415730337078652, |
| "grad_norm": 6.58897066116333, |
| "learning_rate": 7.77013511742208e-06, |
| "loss": 0.3388, |
| "mean_token_accuracy": 0.8744527399539948, |
| "num_tokens": 4751985.0, |
| "step": 543 |
| }, |
| { |
| "entropy": 1.2957723736763, |
| "epoch": 1.7447833065810594, |
| "grad_norm": 4.113377094268799, |
| "learning_rate": 7.761305108647188e-06, |
| "loss": 0.2744, |
| "mean_token_accuracy": 0.8848893344402313, |
| "num_tokens": 4760494.0, |
| "step": 544 |
| }, |
| { |
| "entropy": 1.4583874940872192, |
| "epoch": 1.7479935794542536, |
| "grad_norm": 4.075041770935059, |
| "learning_rate": 7.752462691421245e-06, |
| "loss": 0.2886, |
| "mean_token_accuracy": 0.8956989943981171, |
| "num_tokens": 4769447.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 1.2832042574882507, |
| "epoch": 1.7512038523274478, |
| "grad_norm": 2.9702274799346924, |
| "learning_rate": 7.743607905479338e-06, |
| "loss": 0.2636, |
| "mean_token_accuracy": 0.9015891551971436, |
| "num_tokens": 4777454.0, |
| "step": 546 |
| }, |
| { |
| "entropy": 1.4110126495361328, |
| "epoch": 1.754414125200642, |
| "grad_norm": 3.432813882827759, |
| "learning_rate": 7.734740790612137e-06, |
| "loss": 0.2805, |
| "mean_token_accuracy": 0.8956256210803986, |
| "num_tokens": 4785393.0, |
| "step": 547 |
| }, |
| { |
| "entropy": 1.3326915502548218, |
| "epoch": 1.7576243980738364, |
| "grad_norm": 3.350756883621216, |
| "learning_rate": 7.72586138666571e-06, |
| "loss": 0.3388, |
| "mean_token_accuracy": 0.8730327486991882, |
| "num_tokens": 4795285.0, |
| "step": 548 |
| }, |
| { |
| "entropy": 1.433797538280487, |
| "epoch": 1.7608346709470304, |
| "grad_norm": 3.5722227096557617, |
| "learning_rate": 7.716969733541357e-06, |
| "loss": 0.2556, |
| "mean_token_accuracy": 0.9026345610618591, |
| "num_tokens": 4802574.0, |
| "step": 549 |
| }, |
| { |
| "entropy": 1.4505755305290222, |
| "epoch": 1.7640449438202248, |
| "grad_norm": 3.7785494327545166, |
| "learning_rate": 7.708065871195413e-06, |
| "loss": 0.2808, |
| "mean_token_accuracy": 0.8939539194107056, |
| "num_tokens": 4811257.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.4082772731781006, |
| "epoch": 1.7672552166934188, |
| "grad_norm": 3.876687526702881, |
| "learning_rate": 7.699149839639086e-06, |
| "loss": 0.3146, |
| "mean_token_accuracy": 0.8838759064674377, |
| "num_tokens": 4818898.0, |
| "step": 551 |
| }, |
| { |
| "entropy": 1.3682604432106018, |
| "epoch": 1.7704654895666132, |
| "grad_norm": 16.92738914489746, |
| "learning_rate": 7.690221678938258e-06, |
| "loss": 0.2918, |
| "mean_token_accuracy": 0.8928067088127136, |
| "num_tokens": 4826651.0, |
| "step": 552 |
| }, |
| { |
| "entropy": 1.310297429561615, |
| "epoch": 1.7736757624398074, |
| "grad_norm": 2.7511773109436035, |
| "learning_rate": 7.681281429213328e-06, |
| "loss": 0.284, |
| "mean_token_accuracy": 0.8867529034614563, |
| "num_tokens": 4835780.0, |
| "step": 553 |
| }, |
| { |
| "entropy": 1.3972845673561096, |
| "epoch": 1.7768860353130016, |
| "grad_norm": 4.055933475494385, |
| "learning_rate": 7.672329130639007e-06, |
| "loss": 0.2989, |
| "mean_token_accuracy": 0.8902477920055389, |
| "num_tokens": 4844260.0, |
| "step": 554 |
| }, |
| { |
| "entropy": 1.6165253520011902, |
| "epoch": 1.7800963081861958, |
| "grad_norm": 3.326050043106079, |
| "learning_rate": 7.663364823444157e-06, |
| "loss": 0.2543, |
| "mean_token_accuracy": 0.9065029919147491, |
| "num_tokens": 4852457.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 1.5481394529342651, |
| "epoch": 1.78330658105939, |
| "grad_norm": 3.191687822341919, |
| "learning_rate": 7.654388547911605e-06, |
| "loss": 0.3421, |
| "mean_token_accuracy": 0.8783987462520599, |
| "num_tokens": 4861111.0, |
| "step": 556 |
| }, |
| { |
| "entropy": 1.4075528979301453, |
| "epoch": 1.7865168539325844, |
| "grad_norm": 4.169843673706055, |
| "learning_rate": 7.645400344377953e-06, |
| "loss": 0.3012, |
| "mean_token_accuracy": 0.8839040398597717, |
| "num_tokens": 4870169.0, |
| "step": 557 |
| }, |
| { |
| "entropy": 1.6146376132965088, |
| "epoch": 1.7897271268057784, |
| "grad_norm": 4.2372331619262695, |
| "learning_rate": 7.63640025323341e-06, |
| "loss": 0.2876, |
| "mean_token_accuracy": 0.8897527158260345, |
| "num_tokens": 4878024.0, |
| "step": 558 |
| }, |
| { |
| "entropy": 1.3466166257858276, |
| "epoch": 1.7929373996789728, |
| "grad_norm": 2.635805130004883, |
| "learning_rate": 7.627388314921602e-06, |
| "loss": 0.2716, |
| "mean_token_accuracy": 0.8980992436408997, |
| "num_tokens": 4886585.0, |
| "step": 559 |
| }, |
| { |
| "entropy": 1.569422721862793, |
| "epoch": 1.7961476725521668, |
| "grad_norm": 4.736917018890381, |
| "learning_rate": 7.61836456993939e-06, |
| "loss": 0.3077, |
| "mean_token_accuracy": 0.8818827569484711, |
| "num_tokens": 4894236.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.2981528639793396, |
| "epoch": 1.7993579454253612, |
| "grad_norm": 2.962735891342163, |
| "learning_rate": 7.609329058836694e-06, |
| "loss": 0.2825, |
| "mean_token_accuracy": 0.8849244713783264, |
| "num_tokens": 4904278.0, |
| "step": 561 |
| }, |
| { |
| "entropy": 1.3867509961128235, |
| "epoch": 1.8025682182985554, |
| "grad_norm": 11.534844398498535, |
| "learning_rate": 7.600281822216307e-06, |
| "loss": 0.2866, |
| "mean_token_accuracy": 0.8904646039009094, |
| "num_tokens": 4912560.0, |
| "step": 562 |
| }, |
| { |
| "entropy": 1.3264847993850708, |
| "epoch": 1.8057784911717496, |
| "grad_norm": 3.852452516555786, |
| "learning_rate": 7.59122290073371e-06, |
| "loss": 0.3326, |
| "mean_token_accuracy": 0.8820092082023621, |
| "num_tokens": 4922110.0, |
| "step": 563 |
| }, |
| { |
| "entropy": 1.4996045231819153, |
| "epoch": 1.8089887640449438, |
| "grad_norm": 3.772239923477173, |
| "learning_rate": 7.582152335096896e-06, |
| "loss": 0.294, |
| "mean_token_accuracy": 0.8812500834465027, |
| "num_tokens": 4929180.0, |
| "step": 564 |
| }, |
| { |
| "entropy": 1.4106106758117676, |
| "epoch": 1.812199036918138, |
| "grad_norm": 3.0418591499328613, |
| "learning_rate": 7.5730701660661795e-06, |
| "loss": 0.3007, |
| "mean_token_accuracy": 0.8839779198169708, |
| "num_tokens": 4937696.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 1.4185760021209717, |
| "epoch": 1.8154093097913324, |
| "grad_norm": 4.139153480529785, |
| "learning_rate": 7.563976434454021e-06, |
| "loss": 0.313, |
| "mean_token_accuracy": 0.8874213993549347, |
| "num_tokens": 4945516.0, |
| "step": 566 |
| }, |
| { |
| "entropy": 1.4388535618782043, |
| "epoch": 1.8186195826645264, |
| "grad_norm": 3.0319912433624268, |
| "learning_rate": 7.554871181124836e-06, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.9032376706600189, |
| "num_tokens": 4953130.0, |
| "step": 567 |
| }, |
| { |
| "entropy": 1.437139868736267, |
| "epoch": 1.8218298555377208, |
| "grad_norm": 2.6992785930633545, |
| "learning_rate": 7.5457544469948164e-06, |
| "loss": 0.281, |
| "mean_token_accuracy": 0.8920381367206573, |
| "num_tokens": 4961942.0, |
| "step": 568 |
| }, |
| { |
| "entropy": 1.335317313671112, |
| "epoch": 1.8250401284109148, |
| "grad_norm": 3.9841814041137695, |
| "learning_rate": 7.536626273031747e-06, |
| "loss": 0.2964, |
| "mean_token_accuracy": 0.8849454522132874, |
| "num_tokens": 4969921.0, |
| "step": 569 |
| }, |
| { |
| "entropy": 1.6164771914482117, |
| "epoch": 1.8282504012841092, |
| "grad_norm": 4.4012770652771, |
| "learning_rate": 7.5274867002548154e-06, |
| "loss": 0.276, |
| "mean_token_accuracy": 0.9028495252132416, |
| "num_tokens": 4978461.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.535290777683258, |
| "epoch": 1.8314606741573034, |
| "grad_norm": 25.161006927490234, |
| "learning_rate": 7.5183357697344395e-06, |
| "loss": 0.3082, |
| "mean_token_accuracy": 0.8934054970741272, |
| "num_tokens": 4986774.0, |
| "step": 571 |
| }, |
| { |
| "entropy": 1.2994403839111328, |
| "epoch": 1.8346709470304976, |
| "grad_norm": 2.891382932662964, |
| "learning_rate": 7.509173522592066e-06, |
| "loss": 0.334, |
| "mean_token_accuracy": 0.8758228123188019, |
| "num_tokens": 4995401.0, |
| "step": 572 |
| }, |
| { |
| "entropy": 1.3535541892051697, |
| "epoch": 1.8378812199036918, |
| "grad_norm": 3.3566458225250244, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.3106, |
| "mean_token_accuracy": 0.8938397765159607, |
| "num_tokens": 5003723.0, |
| "step": 573 |
| }, |
| { |
| "entropy": 1.4879749417304993, |
| "epoch": 1.841091492776886, |
| "grad_norm": 14.512699127197266, |
| "learning_rate": 7.4908152431812175e-06, |
| "loss": 0.303, |
| "mean_token_accuracy": 0.8893711566925049, |
| "num_tokens": 5012065.0, |
| "step": 574 |
| }, |
| { |
| "entropy": 1.439945101737976, |
| "epoch": 1.8443017656500804, |
| "grad_norm": 3.0775973796844482, |
| "learning_rate": 7.481619293409173e-06, |
| "loss": 0.3103, |
| "mean_token_accuracy": 0.8844351768493652, |
| "num_tokens": 5020574.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 1.339579463005066, |
| "epoch": 1.8475120385232744, |
| "grad_norm": 3.9711005687713623, |
| "learning_rate": 7.472412192007619e-06, |
| "loss": 0.3433, |
| "mean_token_accuracy": 0.8750526309013367, |
| "num_tokens": 5028805.0, |
| "step": 576 |
| }, |
| { |
| "entropy": 1.5097241401672363, |
| "epoch": 1.8507223113964688, |
| "grad_norm": 3.872302770614624, |
| "learning_rate": 7.4631939803504215e-06, |
| "loss": 0.3172, |
| "mean_token_accuracy": 0.8816109299659729, |
| "num_tokens": 5037794.0, |
| "step": 577 |
| }, |
| { |
| "entropy": 1.3860506415367126, |
| "epoch": 1.8539325842696628, |
| "grad_norm": 3.852682113647461, |
| "learning_rate": 7.453964699861376e-06, |
| "loss": 0.3045, |
| "mean_token_accuracy": 0.8891089558601379, |
| "num_tokens": 5045600.0, |
| "step": 578 |
| }, |
| { |
| "entropy": 1.2616366147994995, |
| "epoch": 1.8571428571428572, |
| "grad_norm": 3.328491449356079, |
| "learning_rate": 7.44472439201401e-06, |
| "loss": 0.379, |
| "mean_token_accuracy": 0.8576820492744446, |
| "num_tokens": 5055136.0, |
| "step": 579 |
| }, |
| { |
| "entropy": 1.4181751608848572, |
| "epoch": 1.8603531300160514, |
| "grad_norm": 9.961250305175781, |
| "learning_rate": 7.435473098331411e-06, |
| "loss": 0.3049, |
| "mean_token_accuracy": 0.88387331366539, |
| "num_tokens": 5064465.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.3726850152015686, |
| "epoch": 1.8635634028892456, |
| "grad_norm": 3.580294609069824, |
| "learning_rate": 7.426210860386032e-06, |
| "loss": 0.3343, |
| "mean_token_accuracy": 0.8814990222454071, |
| "num_tokens": 5073152.0, |
| "step": 581 |
| }, |
| { |
| "entropy": 1.542908489704132, |
| "epoch": 1.8667736757624398, |
| "grad_norm": 2.881855010986328, |
| "learning_rate": 7.416937719799502e-06, |
| "loss": 0.3088, |
| "mean_token_accuracy": 0.887274444103241, |
| "num_tokens": 5082323.0, |
| "step": 582 |
| }, |
| { |
| "entropy": 1.4042843580245972, |
| "epoch": 1.869983948635634, |
| "grad_norm": 6.571777820587158, |
| "learning_rate": 7.407653718242449e-06, |
| "loss": 0.3387, |
| "mean_token_accuracy": 0.8746259808540344, |
| "num_tokens": 5090828.0, |
| "step": 583 |
| }, |
| { |
| "entropy": 1.3728899359703064, |
| "epoch": 1.8731942215088284, |
| "grad_norm": 3.124652624130249, |
| "learning_rate": 7.398358897434303e-06, |
| "loss": 0.3169, |
| "mean_token_accuracy": 0.8831090331077576, |
| "num_tokens": 5099452.0, |
| "step": 584 |
| }, |
| { |
| "entropy": 1.6130582094192505, |
| "epoch": 1.8764044943820224, |
| "grad_norm": 4.153020858764648, |
| "learning_rate": 7.3890532991431174e-06, |
| "loss": 0.2957, |
| "mean_token_accuracy": 0.8826304972171783, |
| "num_tokens": 5107713.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 1.5210551023483276, |
| "epoch": 1.8796147672552168, |
| "grad_norm": 2.822763204574585, |
| "learning_rate": 7.379736965185369e-06, |
| "loss": 0.2601, |
| "mean_token_accuracy": 0.903123527765274, |
| "num_tokens": 5117399.0, |
| "step": 586 |
| }, |
| { |
| "entropy": 1.421782910823822, |
| "epoch": 1.8828250401284108, |
| "grad_norm": 4.258233070373535, |
| "learning_rate": 7.370409937425781e-06, |
| "loss": 0.3441, |
| "mean_token_accuracy": 0.8721145987510681, |
| "num_tokens": 5125660.0, |
| "step": 587 |
| }, |
| { |
| "entropy": 1.4874483942985535, |
| "epoch": 1.8860353130016052, |
| "grad_norm": 4.4238972663879395, |
| "learning_rate": 7.361072257777132e-06, |
| "loss": 0.2908, |
| "mean_token_accuracy": 0.8859201669692993, |
| "num_tokens": 5134941.0, |
| "step": 588 |
| }, |
| { |
| "entropy": 1.47506844997406, |
| "epoch": 1.8892455858747994, |
| "grad_norm": 2.905021905899048, |
| "learning_rate": 7.3517239682000675e-06, |
| "loss": 0.3107, |
| "mean_token_accuracy": 0.8757966160774231, |
| "num_tokens": 5142323.0, |
| "step": 589 |
| }, |
| { |
| "entropy": 1.3628226518630981, |
| "epoch": 1.8924558587479936, |
| "grad_norm": 2.5774905681610107, |
| "learning_rate": 7.342365110702907e-06, |
| "loss": 0.3283, |
| "mean_token_accuracy": 0.8776254951953888, |
| "num_tokens": 5151640.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.57500422000885, |
| "epoch": 1.8956661316211878, |
| "grad_norm": 3.3131790161132812, |
| "learning_rate": 7.332995727341462e-06, |
| "loss": 0.3328, |
| "mean_token_accuracy": 0.8693976402282715, |
| "num_tokens": 5162302.0, |
| "step": 591 |
| }, |
| { |
| "entropy": 1.5631265044212341, |
| "epoch": 1.898876404494382, |
| "grad_norm": 3.2233312129974365, |
| "learning_rate": 7.323615860218844e-06, |
| "loss": 0.3031, |
| "mean_token_accuracy": 0.8829487860202789, |
| "num_tokens": 5171343.0, |
| "step": 592 |
| }, |
| { |
| "entropy": 1.474395990371704, |
| "epoch": 1.9020866773675762, |
| "grad_norm": 3.0282936096191406, |
| "learning_rate": 7.314225551485273e-06, |
| "loss": 0.2859, |
| "mean_token_accuracy": 0.8863288462162018, |
| "num_tokens": 5179319.0, |
| "step": 593 |
| }, |
| { |
| "entropy": 1.5331798791885376, |
| "epoch": 1.9052969502407704, |
| "grad_norm": 3.055107593536377, |
| "learning_rate": 7.304824843337893e-06, |
| "loss": 0.2923, |
| "mean_token_accuracy": 0.8974760174751282, |
| "num_tokens": 5188212.0, |
| "step": 594 |
| }, |
| { |
| "entropy": 1.4196932315826416, |
| "epoch": 1.9085072231139648, |
| "grad_norm": 3.611940622329712, |
| "learning_rate": 7.295413778020579e-06, |
| "loss": 0.3169, |
| "mean_token_accuracy": 0.875687837600708, |
| "num_tokens": 5196437.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 1.7519562244415283, |
| "epoch": 1.9117174959871588, |
| "grad_norm": 4.035586357116699, |
| "learning_rate": 7.285992397823747e-06, |
| "loss": 0.3085, |
| "mean_token_accuracy": 0.8901273906230927, |
| "num_tokens": 5205394.0, |
| "step": 596 |
| }, |
| { |
| "entropy": 1.423706591129303, |
| "epoch": 1.9149277688603532, |
| "grad_norm": 3.4519264698028564, |
| "learning_rate": 7.276560745084167e-06, |
| "loss": 0.3381, |
| "mean_token_accuracy": 0.8672617375850677, |
| "num_tokens": 5213855.0, |
| "step": 597 |
| }, |
| { |
| "entropy": 1.3611206412315369, |
| "epoch": 1.9181380417335474, |
| "grad_norm": 3.74892258644104, |
| "learning_rate": 7.267118862184767e-06, |
| "loss": 0.3482, |
| "mean_token_accuracy": 0.8646276295185089, |
| "num_tokens": 5223352.0, |
| "step": 598 |
| }, |
| { |
| "entropy": 1.2371296286582947, |
| "epoch": 1.9213483146067416, |
| "grad_norm": 5.596102237701416, |
| "learning_rate": 7.257666791554448e-06, |
| "loss": 0.3715, |
| "mean_token_accuracy": 0.8619127869606018, |
| "num_tokens": 5233034.0, |
| "step": 599 |
| }, |
| { |
| "entropy": 1.4176723957061768, |
| "epoch": 1.9245585874799358, |
| "grad_norm": 3.404355525970459, |
| "learning_rate": 7.248204575667893e-06, |
| "loss": 0.2603, |
| "mean_token_accuracy": 0.9018069803714752, |
| "num_tokens": 5240859.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.5233874917030334, |
| "epoch": 1.92776886035313, |
| "grad_norm": 4.02052116394043, |
| "learning_rate": 7.2387322570453724e-06, |
| "loss": 0.2732, |
| "mean_token_accuracy": 0.8832527995109558, |
| "num_tokens": 5250145.0, |
| "step": 601 |
| }, |
| { |
| "entropy": 1.3011326789855957, |
| "epoch": 1.9309791332263242, |
| "grad_norm": 2.9296908378601074, |
| "learning_rate": 7.229249878252558e-06, |
| "loss": 0.3317, |
| "mean_token_accuracy": 0.8786691129207611, |
| "num_tokens": 5258015.0, |
| "step": 602 |
| }, |
| { |
| "entropy": 1.3884427547454834, |
| "epoch": 1.9341894060995184, |
| "grad_norm": 2.848806381225586, |
| "learning_rate": 7.219757481900325e-06, |
| "loss": 0.2806, |
| "mean_token_accuracy": 0.893255889415741, |
| "num_tokens": 5267533.0, |
| "step": 603 |
| }, |
| { |
| "entropy": 1.5049036145210266, |
| "epoch": 1.9373996789727128, |
| "grad_norm": 3.0091021060943604, |
| "learning_rate": 7.210255110644569e-06, |
| "loss": 0.2552, |
| "mean_token_accuracy": 0.9070955812931061, |
| "num_tokens": 5275537.0, |
| "step": 604 |
| }, |
| { |
| "entropy": 1.367910087108612, |
| "epoch": 1.9406099518459068, |
| "grad_norm": 4.554563522338867, |
| "learning_rate": 7.2007428071860045e-06, |
| "loss": 0.2996, |
| "mean_token_accuracy": 0.8829465806484222, |
| "num_tokens": 5284322.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 1.5234755277633667, |
| "epoch": 1.9438202247191012, |
| "grad_norm": 7.072104454040527, |
| "learning_rate": 7.191220614269981e-06, |
| "loss": 0.2866, |
| "mean_token_accuracy": 0.8958406448364258, |
| "num_tokens": 5293226.0, |
| "step": 606 |
| }, |
| { |
| "entropy": 1.4429296255111694, |
| "epoch": 1.9470304975922952, |
| "grad_norm": 2.972801685333252, |
| "learning_rate": 7.181688574686292e-06, |
| "loss": 0.2546, |
| "mean_token_accuracy": 0.9038203060626984, |
| "num_tokens": 5301639.0, |
| "step": 607 |
| }, |
| { |
| "entropy": 1.3830759525299072, |
| "epoch": 1.9502407704654896, |
| "grad_norm": 3.0528810024261475, |
| "learning_rate": 7.17214673126897e-06, |
| "loss": 0.3471, |
| "mean_token_accuracy": 0.8751060366630554, |
| "num_tokens": 5310539.0, |
| "step": 608 |
| }, |
| { |
| "entropy": 1.6103679537773132, |
| "epoch": 1.9534510433386838, |
| "grad_norm": 3.2677433490753174, |
| "learning_rate": 7.162595126896111e-06, |
| "loss": 0.3372, |
| "mean_token_accuracy": 0.8642582893371582, |
| "num_tokens": 5320301.0, |
| "step": 609 |
| }, |
| { |
| "entropy": 1.3761922121047974, |
| "epoch": 1.956661316211878, |
| "grad_norm": 3.2383453845977783, |
| "learning_rate": 7.15303380448967e-06, |
| "loss": 0.3415, |
| "mean_token_accuracy": 0.8723717033863068, |
| "num_tokens": 5328446.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.4116157293319702, |
| "epoch": 1.9598715890850722, |
| "grad_norm": 5.559144973754883, |
| "learning_rate": 7.143462807015271e-06, |
| "loss": 0.2742, |
| "mean_token_accuracy": 0.8913781046867371, |
| "num_tokens": 5337095.0, |
| "step": 611 |
| }, |
| { |
| "entropy": 1.2883580923080444, |
| "epoch": 1.9630818619582664, |
| "grad_norm": 5.594122886657715, |
| "learning_rate": 7.133882177482019e-06, |
| "loss": 0.2767, |
| "mean_token_accuracy": 0.8971899151802063, |
| "num_tokens": 5345189.0, |
| "step": 612 |
| }, |
| { |
| "entropy": 1.3326961398124695, |
| "epoch": 1.9662921348314608, |
| "grad_norm": 3.9941868782043457, |
| "learning_rate": 7.1242919589422974e-06, |
| "loss": 0.3223, |
| "mean_token_accuracy": 0.8792887628078461, |
| "num_tokens": 5353477.0, |
| "step": 613 |
| }, |
| { |
| "entropy": 1.463496744632721, |
| "epoch": 1.9695024077046548, |
| "grad_norm": 3.5513710975646973, |
| "learning_rate": 7.114692194491583e-06, |
| "loss": 0.3035, |
| "mean_token_accuracy": 0.883477658033371, |
| "num_tokens": 5361375.0, |
| "step": 614 |
| }, |
| { |
| "entropy": 1.2219607830047607, |
| "epoch": 1.9727126805778492, |
| "grad_norm": 3.10086989402771, |
| "learning_rate": 7.105082927268247e-06, |
| "loss": 0.3237, |
| "mean_token_accuracy": 0.8641158044338226, |
| "num_tokens": 5371062.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 1.388469636440277, |
| "epoch": 1.9759229534510432, |
| "grad_norm": 4.3963398933410645, |
| "learning_rate": 7.095464200453366e-06, |
| "loss": 0.3199, |
| "mean_token_accuracy": 0.8787851929664612, |
| "num_tokens": 5380088.0, |
| "step": 616 |
| }, |
| { |
| "entropy": 1.4874065518379211, |
| "epoch": 1.9791332263242376, |
| "grad_norm": 4.8666205406188965, |
| "learning_rate": 7.085836057270521e-06, |
| "loss": 0.2764, |
| "mean_token_accuracy": 0.8986081182956696, |
| "num_tokens": 5388417.0, |
| "step": 617 |
| }, |
| { |
| "entropy": 1.3346271514892578, |
| "epoch": 1.9823434991974318, |
| "grad_norm": 3.119516611099243, |
| "learning_rate": 7.07619854098561e-06, |
| "loss": 0.2786, |
| "mean_token_accuracy": 0.8969616293907166, |
| "num_tokens": 5396308.0, |
| "step": 618 |
| }, |
| { |
| "entropy": 1.4097462892532349, |
| "epoch": 1.985553772070626, |
| "grad_norm": 4.254458427429199, |
| "learning_rate": 7.066551694906651e-06, |
| "loss": 0.2261, |
| "mean_token_accuracy": 0.90898796916008, |
| "num_tokens": 5403926.0, |
| "step": 619 |
| }, |
| { |
| "entropy": 1.433124840259552, |
| "epoch": 1.9887640449438202, |
| "grad_norm": 5.4737868309021, |
| "learning_rate": 7.056895562383585e-06, |
| "loss": 0.315, |
| "mean_token_accuracy": 0.8711326122283936, |
| "num_tokens": 5412353.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.420740008354187, |
| "epoch": 1.9919743178170144, |
| "grad_norm": 7.06497049331665, |
| "learning_rate": 7.047230186808085e-06, |
| "loss": 0.3284, |
| "mean_token_accuracy": 0.8794163167476654, |
| "num_tokens": 5420634.0, |
| "step": 621 |
| }, |
| { |
| "entropy": 1.5400715470314026, |
| "epoch": 1.9951845906902088, |
| "grad_norm": 2.705223560333252, |
| "learning_rate": 7.0375556116133605e-06, |
| "loss": 0.3048, |
| "mean_token_accuracy": 0.888154536485672, |
| "num_tokens": 5430245.0, |
| "step": 622 |
| }, |
| { |
| "entropy": 1.3293360471725464, |
| "epoch": 1.9983948635634028, |
| "grad_norm": 3.812591314315796, |
| "learning_rate": 7.027871880273959e-06, |
| "loss": 0.3132, |
| "mean_token_accuracy": 0.881993293762207, |
| "num_tokens": 5437894.0, |
| "step": 623 |
| }, |
| { |
| "entropy": 1.294339656829834, |
| "epoch": 2.0, |
| "grad_norm": 4.192768573760986, |
| "learning_rate": 7.018179036305574e-06, |
| "loss": 0.252, |
| "mean_token_accuracy": 0.9110794067382812, |
| "num_tokens": 5442284.0, |
| "step": 624 |
| }, |
| { |
| "entropy": 1.4277611374855042, |
| "epoch": 2.0032102728731944, |
| "grad_norm": 3.0245370864868164, |
| "learning_rate": 7.008477123264849e-06, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9453595578670502, |
| "num_tokens": 5451260.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 1.4434685707092285, |
| "epoch": 2.0064205457463884, |
| "grad_norm": 3.368790626525879, |
| "learning_rate": 6.9987661847491786e-06, |
| "loss": 0.1528, |
| "mean_token_accuracy": 0.9484553039073944, |
| "num_tokens": 5459386.0, |
| "step": 626 |
| }, |
| { |
| "entropy": 1.2321021556854248, |
| "epoch": 2.009630818619583, |
| "grad_norm": 2.4026436805725098, |
| "learning_rate": 6.989046264396516e-06, |
| "loss": 0.131, |
| "mean_token_accuracy": 0.9575425088405609, |
| "num_tokens": 5467001.0, |
| "step": 627 |
| }, |
| { |
| "entropy": 1.2222203612327576, |
| "epoch": 2.012841091492777, |
| "grad_norm": 30.97420310974121, |
| "learning_rate": 6.9793174058851805e-06, |
| "loss": 0.148, |
| "mean_token_accuracy": 0.9470765292644501, |
| "num_tokens": 5475451.0, |
| "step": 628 |
| }, |
| { |
| "entropy": 1.422147512435913, |
| "epoch": 2.016051364365971, |
| "grad_norm": 7.181051731109619, |
| "learning_rate": 6.96957965293365e-06, |
| "loss": 0.1134, |
| "mean_token_accuracy": 0.9622469842433929, |
| "num_tokens": 5482704.0, |
| "step": 629 |
| }, |
| { |
| "entropy": 1.1400924921035767, |
| "epoch": 2.019261637239165, |
| "grad_norm": 2.26750111579895, |
| "learning_rate": 6.959833049300376e-06, |
| "loss": 0.146, |
| "mean_token_accuracy": 0.9331265091896057, |
| "num_tokens": 5492344.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.236695945262909, |
| "epoch": 2.0224719101123596, |
| "grad_norm": 2.5727972984313965, |
| "learning_rate": 6.9500776387835785e-06, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9469403326511383, |
| "num_tokens": 5501664.0, |
| "step": 631 |
| }, |
| { |
| "entropy": 1.193276584148407, |
| "epoch": 2.0256821829855536, |
| "grad_norm": 2.4318366050720215, |
| "learning_rate": 6.940313465221057e-06, |
| "loss": 0.1117, |
| "mean_token_accuracy": 0.9551983177661896, |
| "num_tokens": 5510226.0, |
| "step": 632 |
| }, |
| { |
| "entropy": 1.175793468952179, |
| "epoch": 2.028892455858748, |
| "grad_norm": 3.1776866912841797, |
| "learning_rate": 6.9305405724899876e-06, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9337913990020752, |
| "num_tokens": 5519131.0, |
| "step": 633 |
| }, |
| { |
| "entropy": 1.1542350053787231, |
| "epoch": 2.0321027287319424, |
| "grad_norm": 3.319153308868408, |
| "learning_rate": 6.920759004506723e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.9449957609176636, |
| "num_tokens": 5527804.0, |
| "step": 634 |
| }, |
| { |
| "entropy": 1.1090035438537598, |
| "epoch": 2.0353130016051364, |
| "grad_norm": 4.43798303604126, |
| "learning_rate": 6.91096880522661e-06, |
| "loss": 0.1159, |
| "mean_token_accuracy": 0.9575115442276001, |
| "num_tokens": 5535406.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 1.0131879448890686, |
| "epoch": 2.038523274478331, |
| "grad_norm": 3.432020664215088, |
| "learning_rate": 6.90117001864377e-06, |
| "loss": 0.1494, |
| "mean_token_accuracy": 0.9273790717124939, |
| "num_tokens": 5545569.0, |
| "step": 636 |
| }, |
| { |
| "entropy": 1.1925968527793884, |
| "epoch": 2.041733547351525, |
| "grad_norm": 2.716158390045166, |
| "learning_rate": 6.891362688790925e-06, |
| "loss": 0.1238, |
| "mean_token_accuracy": 0.9510330855846405, |
| "num_tokens": 5553431.0, |
| "step": 637 |
| }, |
| { |
| "entropy": 1.1236230731010437, |
| "epoch": 2.044943820224719, |
| "grad_norm": 4.862785339355469, |
| "learning_rate": 6.8815468597391785e-06, |
| "loss": 0.204, |
| "mean_token_accuracy": 0.915119081735611, |
| "num_tokens": 5563682.0, |
| "step": 638 |
| }, |
| { |
| "entropy": 1.1460051536560059, |
| "epoch": 2.048154093097913, |
| "grad_norm": 5.143580436706543, |
| "learning_rate": 6.871722575597829e-06, |
| "loss": 0.118, |
| "mean_token_accuracy": 0.9572257995605469, |
| "num_tokens": 5571730.0, |
| "step": 639 |
| }, |
| { |
| "entropy": 1.2631003260612488, |
| "epoch": 2.0513643659711076, |
| "grad_norm": 2.6166398525238037, |
| "learning_rate": 6.8618898805141744e-06, |
| "loss": 0.1175, |
| "mean_token_accuracy": 0.9546981453895569, |
| "num_tokens": 5579619.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.212298333644867, |
| "epoch": 2.0545746388443016, |
| "grad_norm": 4.810606002807617, |
| "learning_rate": 6.8520488186733e-06, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9437867701053619, |
| "num_tokens": 5587467.0, |
| "step": 641 |
| }, |
| { |
| "entropy": 1.1090901494026184, |
| "epoch": 2.057784911717496, |
| "grad_norm": 2.9815127849578857, |
| "learning_rate": 6.8421994342979e-06, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9465640485286713, |
| "num_tokens": 5595910.0, |
| "step": 642 |
| }, |
| { |
| "entropy": 1.1284350156784058, |
| "epoch": 2.0609951845906904, |
| "grad_norm": 2.8620505332946777, |
| "learning_rate": 6.832341771648057e-06, |
| "loss": 0.1785, |
| "mean_token_accuracy": 0.9314178228378296, |
| "num_tokens": 5605815.0, |
| "step": 643 |
| }, |
| { |
| "entropy": 1.2043840885162354, |
| "epoch": 2.0642054574638844, |
| "grad_norm": 5.458688735961914, |
| "learning_rate": 6.822475875021057e-06, |
| "loss": 0.1485, |
| "mean_token_accuracy": 0.9459114074707031, |
| "num_tokens": 5615275.0, |
| "step": 644 |
| }, |
| { |
| "entropy": 1.3176180124282837, |
| "epoch": 2.067415730337079, |
| "grad_norm": 2.8183062076568604, |
| "learning_rate": 6.812601788751192e-06, |
| "loss": 0.1116, |
| "mean_token_accuracy": 0.9547081887722015, |
| "num_tokens": 5623783.0, |
| "step": 645 |
| }, |
| { |
| "entropy": 1.2037148475646973, |
| "epoch": 2.070626003210273, |
| "grad_norm": 2.6667561531066895, |
| "learning_rate": 6.802719557209547e-06, |
| "loss": 0.1381, |
| "mean_token_accuracy": 0.954013854265213, |
| "num_tokens": 5631939.0, |
| "step": 646 |
| }, |
| { |
| "entropy": 1.1883854269981384, |
| "epoch": 2.073836276083467, |
| "grad_norm": 2.371598243713379, |
| "learning_rate": 6.792829224803816e-06, |
| "loss": 0.1445, |
| "mean_token_accuracy": 0.9299385249614716, |
| "num_tokens": 5641553.0, |
| "step": 647 |
| }, |
| { |
| "entropy": 1.263631522655487, |
| "epoch": 2.077046548956661, |
| "grad_norm": 2.858833074569702, |
| "learning_rate": 6.782930835978094e-06, |
| "loss": 0.117, |
| "mean_token_accuracy": 0.9559842646121979, |
| "num_tokens": 5650631.0, |
| "step": 648 |
| }, |
| { |
| "entropy": 1.2522808909416199, |
| "epoch": 2.0802568218298556, |
| "grad_norm": 5.837004661560059, |
| "learning_rate": 6.773024435212678e-06, |
| "loss": 0.1141, |
| "mean_token_accuracy": 0.9567996859550476, |
| "num_tokens": 5659252.0, |
| "step": 649 |
| }, |
| { |
| "entropy": 1.1148146390914917, |
| "epoch": 2.0834670947030496, |
| "grad_norm": 3.9768710136413574, |
| "learning_rate": 6.76311006702387e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9364666640758514, |
| "num_tokens": 5668324.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.3358003497123718, |
| "epoch": 2.086677367576244, |
| "grad_norm": 2.3943331241607666, |
| "learning_rate": 6.753187775963773e-06, |
| "loss": 0.1011, |
| "mean_token_accuracy": 0.9628923833370209, |
| "num_tokens": 5676989.0, |
| "step": 651 |
| }, |
| { |
| "entropy": 1.0956073999404907, |
| "epoch": 2.0898876404494384, |
| "grad_norm": 2.8093678951263428, |
| "learning_rate": 6.743257606620094e-06, |
| "loss": 0.12, |
| "mean_token_accuracy": 0.953659862279892, |
| "num_tokens": 5685189.0, |
| "step": 652 |
| }, |
| { |
| "entropy": 1.2954540252685547, |
| "epoch": 2.0930979133226324, |
| "grad_norm": 2.636671304702759, |
| "learning_rate": 6.733319603615941e-06, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9414326548576355, |
| "num_tokens": 5694647.0, |
| "step": 653 |
| }, |
| { |
| "entropy": 1.1856536865234375, |
| "epoch": 2.096308186195827, |
| "grad_norm": 3.4469292163848877, |
| "learning_rate": 6.723373811609628e-06, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9572050869464874, |
| "num_tokens": 5701968.0, |
| "step": 654 |
| }, |
| { |
| "entropy": 1.061371386051178, |
| "epoch": 2.099518459069021, |
| "grad_norm": 3.5963852405548096, |
| "learning_rate": 6.713420275294467e-06, |
| "loss": 0.1472, |
| "mean_token_accuracy": 0.9384236931800842, |
| "num_tokens": 5710326.0, |
| "step": 655 |
| }, |
| { |
| "entropy": 1.1693629026412964, |
| "epoch": 2.102728731942215, |
| "grad_norm": 3.4809770584106445, |
| "learning_rate": 6.703459039398571e-06, |
| "loss": 0.1198, |
| "mean_token_accuracy": 0.951190173625946, |
| "num_tokens": 5718963.0, |
| "step": 656 |
| }, |
| { |
| "entropy": 1.1097606420516968, |
| "epoch": 2.105939004815409, |
| "grad_norm": 5.711026191711426, |
| "learning_rate": 6.693490148684654e-06, |
| "loss": 0.1431, |
| "mean_token_accuracy": 0.9451210498809814, |
| "num_tokens": 5727156.0, |
| "step": 657 |
| }, |
| { |
| "entropy": 1.307206630706787, |
| "epoch": 2.1091492776886036, |
| "grad_norm": 3.316901206970215, |
| "learning_rate": 6.683513647949826e-06, |
| "loss": 0.1198, |
| "mean_token_accuracy": 0.9504996240139008, |
| "num_tokens": 5735593.0, |
| "step": 658 |
| }, |
| { |
| "entropy": 1.1543167233467102, |
| "epoch": 2.1123595505617976, |
| "grad_norm": 2.2867281436920166, |
| "learning_rate": 6.673529582025398e-06, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.9489535987377167, |
| "num_tokens": 5743790.0, |
| "step": 659 |
| }, |
| { |
| "entropy": 1.0919539332389832, |
| "epoch": 2.115569823434992, |
| "grad_norm": 2.8135688304901123, |
| "learning_rate": 6.66353799577667e-06, |
| "loss": 0.1146, |
| "mean_token_accuracy": 0.9532299339771271, |
| "num_tokens": 5751660.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.1081830263137817, |
| "epoch": 2.1187800963081864, |
| "grad_norm": 2.3218095302581787, |
| "learning_rate": 6.653538934102743e-06, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9097401201725006, |
| "num_tokens": 5761720.0, |
| "step": 661 |
| }, |
| { |
| "entropy": 1.2457672357559204, |
| "epoch": 2.1219903691813804, |
| "grad_norm": 13.804795265197754, |
| "learning_rate": 6.643532441936307e-06, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9488136768341064, |
| "num_tokens": 5769431.0, |
| "step": 662 |
| }, |
| { |
| "entropy": 1.1809271574020386, |
| "epoch": 2.125200642054575, |
| "grad_norm": 6.929125785827637, |
| "learning_rate": 6.633518564243442e-06, |
| "loss": 0.0981, |
| "mean_token_accuracy": 0.9624985456466675, |
| "num_tokens": 5776379.0, |
| "step": 663 |
| }, |
| { |
| "entropy": 1.1943358182907104, |
| "epoch": 2.128410914927769, |
| "grad_norm": 3.932882070541382, |
| "learning_rate": 6.6234973460234184e-06, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9579845666885376, |
| "num_tokens": 5785230.0, |
| "step": 664 |
| }, |
| { |
| "entropy": 1.1744264364242554, |
| "epoch": 2.131621187800963, |
| "grad_norm": 2.690080165863037, |
| "learning_rate": 6.6134688323084884e-06, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.9294100701808929, |
| "num_tokens": 5793482.0, |
| "step": 665 |
| }, |
| { |
| "entropy": 0.9881645143032074, |
| "epoch": 2.134831460674157, |
| "grad_norm": 4.003161430358887, |
| "learning_rate": 6.603433068163694e-06, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9402081966400146, |
| "num_tokens": 5803065.0, |
| "step": 666 |
| }, |
| { |
| "entropy": 1.1991289258003235, |
| "epoch": 2.1380417335473516, |
| "grad_norm": 3.018016815185547, |
| "learning_rate": 6.593390098686653e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9330925345420837, |
| "num_tokens": 5813306.0, |
| "step": 667 |
| }, |
| { |
| "entropy": 1.0930429697036743, |
| "epoch": 2.1412520064205456, |
| "grad_norm": 5.356147289276123, |
| "learning_rate": 6.583339969007364e-06, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9488291144371033, |
| "num_tokens": 5821225.0, |
| "step": 668 |
| }, |
| { |
| "entropy": 1.146267056465149, |
| "epoch": 2.14446227929374, |
| "grad_norm": 4.305771827697754, |
| "learning_rate": 6.573282724288001e-06, |
| "loss": 0.12, |
| "mean_token_accuracy": 0.9525870680809021, |
| "num_tokens": 5829679.0, |
| "step": 669 |
| }, |
| { |
| "entropy": 1.1834629774093628, |
| "epoch": 2.1476725521669344, |
| "grad_norm": 3.5672402381896973, |
| "learning_rate": 6.563218409722712e-06, |
| "loss": 0.1158, |
| "mean_token_accuracy": 0.9621096253395081, |
| "num_tokens": 5837110.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.25631844997406, |
| "epoch": 2.1508828250401284, |
| "grad_norm": 8.232504844665527, |
| "learning_rate": 6.553147070537413e-06, |
| "loss": 0.1041, |
| "mean_token_accuracy": 0.9604884684085846, |
| "num_tokens": 5845214.0, |
| "step": 671 |
| }, |
| { |
| "entropy": 1.1267945170402527, |
| "epoch": 2.154093097913323, |
| "grad_norm": 10.115373611450195, |
| "learning_rate": 6.543068751989585e-06, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9541674256324768, |
| "num_tokens": 5854190.0, |
| "step": 672 |
| }, |
| { |
| "entropy": 1.2294913530349731, |
| "epoch": 2.157303370786517, |
| "grad_norm": 2.8828227519989014, |
| "learning_rate": 6.532983499368078e-06, |
| "loss": 0.1436, |
| "mean_token_accuracy": 0.9478682279586792, |
| "num_tokens": 5862906.0, |
| "step": 673 |
| }, |
| { |
| "entropy": 1.118057906627655, |
| "epoch": 2.160513643659711, |
| "grad_norm": 3.830436944961548, |
| "learning_rate": 6.522891357992895e-06, |
| "loss": 0.1177, |
| "mean_token_accuracy": 0.9477755129337311, |
| "num_tokens": 5871654.0, |
| "step": 674 |
| }, |
| { |
| "entropy": 1.1084845662117004, |
| "epoch": 2.163723916532905, |
| "grad_norm": 2.5382556915283203, |
| "learning_rate": 6.512792373215e-06, |
| "loss": 0.1703, |
| "mean_token_accuracy": 0.9324210584163666, |
| "num_tokens": 5880988.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 1.1747573018074036, |
| "epoch": 2.1669341894060996, |
| "grad_norm": 4.374128818511963, |
| "learning_rate": 6.502686590416105e-06, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9460614025592804, |
| "num_tokens": 5890735.0, |
| "step": 676 |
| }, |
| { |
| "entropy": 1.1294305920600891, |
| "epoch": 2.1701444622792936, |
| "grad_norm": 2.974681854248047, |
| "learning_rate": 6.492574055008474e-06, |
| "loss": 0.1403, |
| "mean_token_accuracy": 0.9466178119182587, |
| "num_tokens": 5899074.0, |
| "step": 677 |
| }, |
| { |
| "entropy": 1.0897773504257202, |
| "epoch": 2.173354735152488, |
| "grad_norm": 2.3436830043792725, |
| "learning_rate": 6.482454812434711e-06, |
| "loss": 0.1215, |
| "mean_token_accuracy": 0.9511640667915344, |
| "num_tokens": 5907654.0, |
| "step": 678 |
| }, |
| { |
| "entropy": 1.1333916187286377, |
| "epoch": 2.176565008025682, |
| "grad_norm": 2.5949723720550537, |
| "learning_rate": 6.472328908167562e-06, |
| "loss": 0.1084, |
| "mean_token_accuracy": 0.9622556865215302, |
| "num_tokens": 5915090.0, |
| "step": 679 |
| }, |
| { |
| "entropy": 1.1536172032356262, |
| "epoch": 2.1797752808988764, |
| "grad_norm": 2.2703359127044678, |
| "learning_rate": 6.4621963877097105e-06, |
| "loss": 0.1148, |
| "mean_token_accuracy": 0.9524502754211426, |
| "num_tokens": 5922195.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.0362255573272705, |
| "epoch": 2.182985553772071, |
| "grad_norm": 2.933612823486328, |
| "learning_rate": 6.452057296593568e-06, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.9349975883960724, |
| "num_tokens": 5931493.0, |
| "step": 681 |
| }, |
| { |
| "entropy": 1.0833754539489746, |
| "epoch": 2.186195826645265, |
| "grad_norm": 3.006075382232666, |
| "learning_rate": 6.441911680381074e-06, |
| "loss": 0.1322, |
| "mean_token_accuracy": 0.9482509791851044, |
| "num_tokens": 5939442.0, |
| "step": 682 |
| }, |
| { |
| "entropy": 0.9261104166507721, |
| "epoch": 2.189406099518459, |
| "grad_norm": 4.065014362335205, |
| "learning_rate": 6.431759584663492e-06, |
| "loss": 0.176, |
| "mean_token_accuracy": 0.9163274765014648, |
| "num_tokens": 5950169.0, |
| "step": 683 |
| }, |
| { |
| "entropy": 1.121094822883606, |
| "epoch": 2.192616372391653, |
| "grad_norm": 2.5804195404052734, |
| "learning_rate": 6.421601055061195e-06, |
| "loss": 0.1381, |
| "mean_token_accuracy": 0.92803093791008, |
| "num_tokens": 5960170.0, |
| "step": 684 |
| }, |
| { |
| "entropy": 1.1522566080093384, |
| "epoch": 2.1958266452648476, |
| "grad_norm": 2.320777416229248, |
| "learning_rate": 6.411436137223479e-06, |
| "loss": 0.107, |
| "mean_token_accuracy": 0.9578154981136322, |
| "num_tokens": 5968542.0, |
| "step": 685 |
| }, |
| { |
| "entropy": 1.353051781654358, |
| "epoch": 2.1990369181380416, |
| "grad_norm": 4.244365215301514, |
| "learning_rate": 6.401264876828335e-06, |
| "loss": 0.0945, |
| "mean_token_accuracy": 0.9666432440280914, |
| "num_tokens": 5976718.0, |
| "step": 686 |
| }, |
| { |
| "entropy": 1.0453286170959473, |
| "epoch": 2.202247191011236, |
| "grad_norm": 6.80914306640625, |
| "learning_rate": 6.391087319582264e-06, |
| "loss": 0.1441, |
| "mean_token_accuracy": 0.9426902532577515, |
| "num_tokens": 5986164.0, |
| "step": 687 |
| }, |
| { |
| "entropy": 1.2862181663513184, |
| "epoch": 2.20545746388443, |
| "grad_norm": 2.674420118331909, |
| "learning_rate": 6.38090351122006e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.9401120543479919, |
| "num_tokens": 5995535.0, |
| "step": 688 |
| }, |
| { |
| "entropy": 1.223171889781952, |
| "epoch": 2.2086677367576244, |
| "grad_norm": 2.823160171508789, |
| "learning_rate": 6.370713497504607e-06, |
| "loss": 0.1466, |
| "mean_token_accuracy": 0.9436621367931366, |
| "num_tokens": 6003683.0, |
| "step": 689 |
| }, |
| { |
| "entropy": 1.1387187242507935, |
| "epoch": 2.211878009630819, |
| "grad_norm": 3.315049171447754, |
| "learning_rate": 6.360517324226676e-06, |
| "loss": 0.117, |
| "mean_token_accuracy": 0.9554562270641327, |
| "num_tokens": 6012357.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.0211151242256165, |
| "epoch": 2.215088282504013, |
| "grad_norm": 2.7127597332000732, |
| "learning_rate": 6.350315037204714e-06, |
| "loss": 0.1254, |
| "mean_token_accuracy": 0.9496433734893799, |
| "num_tokens": 6020588.0, |
| "step": 691 |
| }, |
| { |
| "entropy": 1.149724304676056, |
| "epoch": 2.218298555377207, |
| "grad_norm": 3.5706706047058105, |
| "learning_rate": 6.340106682284645e-06, |
| "loss": 0.1244, |
| "mean_token_accuracy": 0.9411612749099731, |
| "num_tokens": 6028693.0, |
| "step": 692 |
| }, |
| { |
| "entropy": 1.1703895926475525, |
| "epoch": 2.221508828250401, |
| "grad_norm": 3.06144380569458, |
| "learning_rate": 6.329892305339659e-06, |
| "loss": 0.143, |
| "mean_token_accuracy": 0.9452816843986511, |
| "num_tokens": 6037889.0, |
| "step": 693 |
| }, |
| { |
| "entropy": 1.172494113445282, |
| "epoch": 2.2247191011235956, |
| "grad_norm": 7.45186185836792, |
| "learning_rate": 6.319671952270004e-06, |
| "loss": 0.135, |
| "mean_token_accuracy": 0.9451717436313629, |
| "num_tokens": 6045716.0, |
| "step": 694 |
| }, |
| { |
| "entropy": 1.128127098083496, |
| "epoch": 2.2279293739967896, |
| "grad_norm": 2.54144549369812, |
| "learning_rate": 6.309445669002787e-06, |
| "loss": 0.1349, |
| "mean_token_accuracy": 0.9428853690624237, |
| "num_tokens": 6054100.0, |
| "step": 695 |
| }, |
| { |
| "entropy": 1.1079555749893188, |
| "epoch": 2.231139646869984, |
| "grad_norm": 2.946728467941284, |
| "learning_rate": 6.299213501491761e-06, |
| "loss": 0.1523, |
| "mean_token_accuracy": 0.9281862378120422, |
| "num_tokens": 6063316.0, |
| "step": 696 |
| }, |
| { |
| "entropy": 1.2575078010559082, |
| "epoch": 2.234349919743178, |
| "grad_norm": 3.1350910663604736, |
| "learning_rate": 6.288975495717124e-06, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9531635940074921, |
| "num_tokens": 6071694.0, |
| "step": 697 |
| }, |
| { |
| "entropy": 1.1943337321281433, |
| "epoch": 2.2375601926163724, |
| "grad_norm": 3.640120267868042, |
| "learning_rate": 6.2787316976853045e-06, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9265032112598419, |
| "num_tokens": 6081857.0, |
| "step": 698 |
| }, |
| { |
| "entropy": 1.1847606897354126, |
| "epoch": 2.240770465489567, |
| "grad_norm": 2.607599973678589, |
| "learning_rate": 6.268482153428763e-06, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.923068642616272, |
| "num_tokens": 6090548.0, |
| "step": 699 |
| }, |
| { |
| "entropy": 1.184391736984253, |
| "epoch": 2.243980738362761, |
| "grad_norm": 2.14422869682312, |
| "learning_rate": 6.258226909005783e-06, |
| "loss": 0.0921, |
| "mean_token_accuracy": 0.9700891375541687, |
| "num_tokens": 6098175.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.1512706875801086, |
| "epoch": 2.247191011235955, |
| "grad_norm": 2.4187541007995605, |
| "learning_rate": 6.247966010500258e-06, |
| "loss": 0.1196, |
| "mean_token_accuracy": 0.9588777124881744, |
| "num_tokens": 6106064.0, |
| "step": 701 |
| }, |
| { |
| "entropy": 0.9829612672328949, |
| "epoch": 2.250401284109149, |
| "grad_norm": 3.0330357551574707, |
| "learning_rate": 6.237699504021495e-06, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9402399659156799, |
| "num_tokens": 6114865.0, |
| "step": 702 |
| }, |
| { |
| "entropy": 1.2972161173820496, |
| "epoch": 2.2536115569823436, |
| "grad_norm": 2.9143383502960205, |
| "learning_rate": 6.227427435703997e-06, |
| "loss": 0.1475, |
| "mean_token_accuracy": 0.9466615319252014, |
| "num_tokens": 6123153.0, |
| "step": 703 |
| }, |
| { |
| "entropy": 1.2215816974639893, |
| "epoch": 2.2568218298555376, |
| "grad_norm": 4.461921691894531, |
| "learning_rate": 6.217149851707261e-06, |
| "loss": 0.1094, |
| "mean_token_accuracy": 0.9588871896266937, |
| "num_tokens": 6130391.0, |
| "step": 704 |
| }, |
| { |
| "entropy": 1.1384071707725525, |
| "epoch": 2.260032102728732, |
| "grad_norm": 2.7823870182037354, |
| "learning_rate": 6.206866798215571e-06, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9387631118297577, |
| "num_tokens": 6139453.0, |
| "step": 705 |
| }, |
| { |
| "entropy": 1.029437243938446, |
| "epoch": 2.263242375601926, |
| "grad_norm": 2.3868653774261475, |
| "learning_rate": 6.1965783214377895e-06, |
| "loss": 0.1529, |
| "mean_token_accuracy": 0.9455865919589996, |
| "num_tokens": 6148848.0, |
| "step": 706 |
| }, |
| { |
| "entropy": 1.1833890676498413, |
| "epoch": 2.2664526484751204, |
| "grad_norm": 6.3576178550720215, |
| "learning_rate": 6.186284467607149e-06, |
| "loss": 0.1385, |
| "mean_token_accuracy": 0.9505321979522705, |
| "num_tokens": 6156345.0, |
| "step": 707 |
| }, |
| { |
| "entropy": 1.0868958830833435, |
| "epoch": 2.2696629213483144, |
| "grad_norm": 3.012101650238037, |
| "learning_rate": 6.175985282981042e-06, |
| "loss": 0.1352, |
| "mean_token_accuracy": 0.9491030275821686, |
| "num_tokens": 6165226.0, |
| "step": 708 |
| }, |
| { |
| "entropy": 1.2288443446159363, |
| "epoch": 2.272873194221509, |
| "grad_norm": 3.23870587348938, |
| "learning_rate": 6.165680813840822e-06, |
| "loss": 0.1438, |
| "mean_token_accuracy": 0.9390627443790436, |
| "num_tokens": 6174349.0, |
| "step": 709 |
| }, |
| { |
| "entropy": 1.2095741033554077, |
| "epoch": 2.276083467094703, |
| "grad_norm": 2.6538543701171875, |
| "learning_rate": 6.155371106491584e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.9434215128421783, |
| "num_tokens": 6181879.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.1225184798240662, |
| "epoch": 2.279293739967897, |
| "grad_norm": 4.149666786193848, |
| "learning_rate": 6.1450562072619635e-06, |
| "loss": 0.1123, |
| "mean_token_accuracy": 0.9551192224025726, |
| "num_tokens": 6189233.0, |
| "step": 711 |
| }, |
| { |
| "entropy": 0.990815132856369, |
| "epoch": 2.2825040128410916, |
| "grad_norm": 2.544497489929199, |
| "learning_rate": 6.134736162503929e-06, |
| "loss": 0.156, |
| "mean_token_accuracy": 0.9360251128673553, |
| "num_tokens": 6197991.0, |
| "step": 712 |
| }, |
| { |
| "entropy": 1.1359922289848328, |
| "epoch": 2.2857142857142856, |
| "grad_norm": 2.076765537261963, |
| "learning_rate": 6.124411018592568e-06, |
| "loss": 0.1079, |
| "mean_token_accuracy": 0.9633454084396362, |
| "num_tokens": 6206731.0, |
| "step": 713 |
| }, |
| { |
| "entropy": 1.1928575038909912, |
| "epoch": 2.28892455858748, |
| "grad_norm": 3.9280059337615967, |
| "learning_rate": 6.114080821925885e-06, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9514023959636688, |
| "num_tokens": 6215379.0, |
| "step": 714 |
| }, |
| { |
| "entropy": 0.9379362761974335, |
| "epoch": 2.292134831460674, |
| "grad_norm": 3.893974542617798, |
| "learning_rate": 6.103745618924587e-06, |
| "loss": 0.1763, |
| "mean_token_accuracy": 0.9305950999259949, |
| "num_tokens": 6225119.0, |
| "step": 715 |
| }, |
| { |
| "entropy": 1.1294885873794556, |
| "epoch": 2.2953451043338684, |
| "grad_norm": 3.6253561973571777, |
| "learning_rate": 6.09340545603188e-06, |
| "loss": 0.1135, |
| "mean_token_accuracy": 0.954678863286972, |
| "num_tokens": 6233378.0, |
| "step": 716 |
| }, |
| { |
| "entropy": 1.2344108819961548, |
| "epoch": 2.2985553772070624, |
| "grad_norm": 2.6362674236297607, |
| "learning_rate": 6.0830603797132574e-06, |
| "loss": 0.1134, |
| "mean_token_accuracy": 0.9517610669136047, |
| "num_tokens": 6242391.0, |
| "step": 717 |
| }, |
| { |
| "entropy": 1.053533911705017, |
| "epoch": 2.301765650080257, |
| "grad_norm": 2.504680633544922, |
| "learning_rate": 6.072710436456293e-06, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9305368363857269, |
| "num_tokens": 6252383.0, |
| "step": 718 |
| }, |
| { |
| "entropy": 1.176681101322174, |
| "epoch": 2.304975922953451, |
| "grad_norm": 2.3614320755004883, |
| "learning_rate": 6.0623556727704306e-06, |
| "loss": 0.1222, |
| "mean_token_accuracy": 0.9493369460105896, |
| "num_tokens": 6260216.0, |
| "step": 719 |
| }, |
| { |
| "entropy": 1.2141610383987427, |
| "epoch": 2.308186195826645, |
| "grad_norm": 2.8210721015930176, |
| "learning_rate": 6.051996135186774e-06, |
| "loss": 0.1233, |
| "mean_token_accuracy": 0.9536011517047882, |
| "num_tokens": 6270180.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.0050411820411682, |
| "epoch": 2.3113964686998396, |
| "grad_norm": 2.533144235610962, |
| "learning_rate": 6.041631870257882e-06, |
| "loss": 0.137, |
| "mean_token_accuracy": 0.9331673085689545, |
| "num_tokens": 6280110.0, |
| "step": 721 |
| }, |
| { |
| "entropy": 0.9854940176010132, |
| "epoch": 2.3146067415730336, |
| "grad_norm": 7.011678218841553, |
| "learning_rate": 6.0312629245575534e-06, |
| "loss": 0.1519, |
| "mean_token_accuracy": 0.9333108365535736, |
| "num_tokens": 6289456.0, |
| "step": 722 |
| }, |
| { |
| "entropy": 1.0914210677146912, |
| "epoch": 2.317817014446228, |
| "grad_norm": 6.578116416931152, |
| "learning_rate": 6.020889344680627e-06, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9352452456951141, |
| "num_tokens": 6297538.0, |
| "step": 723 |
| }, |
| { |
| "entropy": 1.0971428155899048, |
| "epoch": 2.321027287319422, |
| "grad_norm": 17.007915496826172, |
| "learning_rate": 6.010511177242757e-06, |
| "loss": 0.1499, |
| "mean_token_accuracy": 0.9454688131809235, |
| "num_tokens": 6305880.0, |
| "step": 724 |
| }, |
| { |
| "entropy": 1.052825391292572, |
| "epoch": 2.3242375601926164, |
| "grad_norm": 3.563309907913208, |
| "learning_rate": 6.000128468880223e-06, |
| "loss": 0.146, |
| "mean_token_accuracy": 0.9364206492900848, |
| "num_tokens": 6313624.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 1.0826497673988342, |
| "epoch": 2.3274478330658104, |
| "grad_norm": 4.190746307373047, |
| "learning_rate": 5.989741266249701e-06, |
| "loss": 0.1502, |
| "mean_token_accuracy": 0.9419477880001068, |
| "num_tokens": 6322725.0, |
| "step": 726 |
| }, |
| { |
| "entropy": 1.2108085751533508, |
| "epoch": 2.330658105939005, |
| "grad_norm": 2.0272305011749268, |
| "learning_rate": 5.979349616028067e-06, |
| "loss": 0.1171, |
| "mean_token_accuracy": 0.9540502727031708, |
| "num_tokens": 6331227.0, |
| "step": 727 |
| }, |
| { |
| "entropy": 1.0932866334915161, |
| "epoch": 2.333868378812199, |
| "grad_norm": 3.3109891414642334, |
| "learning_rate": 5.9689535649121855e-06, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.915838360786438, |
| "num_tokens": 6342205.0, |
| "step": 728 |
| }, |
| { |
| "entropy": 1.1593711376190186, |
| "epoch": 2.337078651685393, |
| "grad_norm": 2.3071560859680176, |
| "learning_rate": 5.958553159618693e-06, |
| "loss": 0.1143, |
| "mean_token_accuracy": 0.9542573690414429, |
| "num_tokens": 6349942.0, |
| "step": 729 |
| }, |
| { |
| "entropy": 1.1865712404251099, |
| "epoch": 2.3402889245585876, |
| "grad_norm": 4.078185558319092, |
| "learning_rate": 5.948148446883794e-06, |
| "loss": 0.1875, |
| "mean_token_accuracy": 0.9212767481803894, |
| "num_tokens": 6359147.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.2947837710380554, |
| "epoch": 2.3434991974317816, |
| "grad_norm": 15.557731628417969, |
| "learning_rate": 5.937739473463047e-06, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.951077938079834, |
| "num_tokens": 6367884.0, |
| "step": 731 |
| }, |
| { |
| "entropy": 0.9925798773765564, |
| "epoch": 2.346709470304976, |
| "grad_norm": 2.6986594200134277, |
| "learning_rate": 5.927326286131162e-06, |
| "loss": 0.1312, |
| "mean_token_accuracy": 0.9450666606426239, |
| "num_tokens": 6377022.0, |
| "step": 732 |
| }, |
| { |
| "entropy": 1.0658472776412964, |
| "epoch": 2.34991974317817, |
| "grad_norm": 10.180908203125, |
| "learning_rate": 5.916908931681781e-06, |
| "loss": 0.1768, |
| "mean_token_accuracy": 0.9244714379310608, |
| "num_tokens": 6385574.0, |
| "step": 733 |
| }, |
| { |
| "entropy": 1.1259647607803345, |
| "epoch": 2.3531300160513644, |
| "grad_norm": 4.821587562561035, |
| "learning_rate": 5.906487456927273e-06, |
| "loss": 0.1847, |
| "mean_token_accuracy": 0.9284574687480927, |
| "num_tokens": 6395078.0, |
| "step": 734 |
| }, |
| { |
| "entropy": 1.082639455795288, |
| "epoch": 2.3563402889245584, |
| "grad_norm": 2.4788339138031006, |
| "learning_rate": 5.896061908698521e-06, |
| "loss": 0.1344, |
| "mean_token_accuracy": 0.9499082267284393, |
| "num_tokens": 6404040.0, |
| "step": 735 |
| }, |
| { |
| "entropy": 1.0406213402748108, |
| "epoch": 2.359550561797753, |
| "grad_norm": 3.1999518871307373, |
| "learning_rate": 5.885632333844714e-06, |
| "loss": 0.1442, |
| "mean_token_accuracy": 0.94551220536232, |
| "num_tokens": 6412489.0, |
| "step": 736 |
| }, |
| { |
| "entropy": 1.1878241896629333, |
| "epoch": 2.362760834670947, |
| "grad_norm": 2.712620496749878, |
| "learning_rate": 5.8751987792331365e-06, |
| "loss": 0.1398, |
| "mean_token_accuracy": 0.9454472362995148, |
| "num_tokens": 6421742.0, |
| "step": 737 |
| }, |
| { |
| "entropy": 1.1380284428596497, |
| "epoch": 2.365971107544141, |
| "grad_norm": 2.5213255882263184, |
| "learning_rate": 5.864761291748956e-06, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.928494393825531, |
| "num_tokens": 6430727.0, |
| "step": 738 |
| }, |
| { |
| "entropy": 1.2049461007118225, |
| "epoch": 2.3691813804173356, |
| "grad_norm": 1.8564283847808838, |
| "learning_rate": 5.854319918295012e-06, |
| "loss": 0.0949, |
| "mean_token_accuracy": 0.9627068936824799, |
| "num_tokens": 6438650.0, |
| "step": 739 |
| }, |
| { |
| "entropy": 1.165435016155243, |
| "epoch": 2.3723916532905296, |
| "grad_norm": 2.9071779251098633, |
| "learning_rate": 5.843874705791607e-06, |
| "loss": 0.138, |
| "mean_token_accuracy": 0.95395627617836, |
| "num_tokens": 6447438.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.1078984141349792, |
| "epoch": 2.375601926163724, |
| "grad_norm": 5.674651145935059, |
| "learning_rate": 5.833425701176294e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9678383767604828, |
| "num_tokens": 6455540.0, |
| "step": 741 |
| }, |
| { |
| "entropy": 1.1248586177825928, |
| "epoch": 2.378812199036918, |
| "grad_norm": 2.420736312866211, |
| "learning_rate": 5.82297295140367e-06, |
| "loss": 0.1336, |
| "mean_token_accuracy": 0.9477813839912415, |
| "num_tokens": 6464823.0, |
| "step": 742 |
| }, |
| { |
| "entropy": 1.1130772829055786, |
| "epoch": 2.3820224719101124, |
| "grad_norm": 3.5600759983062744, |
| "learning_rate": 5.812516503445158e-06, |
| "loss": 0.1509, |
| "mean_token_accuracy": 0.927628219127655, |
| "num_tokens": 6474728.0, |
| "step": 743 |
| }, |
| { |
| "entropy": 1.0952720642089844, |
| "epoch": 2.3852327447833064, |
| "grad_norm": 2.5865023136138916, |
| "learning_rate": 5.8020564042888015e-06, |
| "loss": 0.1442, |
| "mean_token_accuracy": 0.9443020522594452, |
| "num_tokens": 6484029.0, |
| "step": 744 |
| }, |
| { |
| "entropy": 1.0994407534599304, |
| "epoch": 2.388443017656501, |
| "grad_norm": 3.299069881439209, |
| "learning_rate": 5.79159270093905e-06, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9384024739265442, |
| "num_tokens": 6492190.0, |
| "step": 745 |
| }, |
| { |
| "entropy": 1.0726031064987183, |
| "epoch": 2.391653290529695, |
| "grad_norm": 2.606271266937256, |
| "learning_rate": 5.781125440416552e-06, |
| "loss": 0.1508, |
| "mean_token_accuracy": 0.9255422055721283, |
| "num_tokens": 6502485.0, |
| "step": 746 |
| }, |
| { |
| "entropy": 1.0770372152328491, |
| "epoch": 2.394863563402889, |
| "grad_norm": 4.949024200439453, |
| "learning_rate": 5.770654669757935e-06, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9429983794689178, |
| "num_tokens": 6511201.0, |
| "step": 747 |
| }, |
| { |
| "entropy": 1.1048730611801147, |
| "epoch": 2.3980738362760836, |
| "grad_norm": 2.6892385482788086, |
| "learning_rate": 5.760180436015604e-06, |
| "loss": 0.1366, |
| "mean_token_accuracy": 0.9469018876552582, |
| "num_tokens": 6519487.0, |
| "step": 748 |
| }, |
| { |
| "entropy": 1.2111053466796875, |
| "epoch": 2.4012841091492776, |
| "grad_norm": 3.027834892272949, |
| "learning_rate": 5.749702786257529e-06, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.943552553653717, |
| "num_tokens": 6527831.0, |
| "step": 749 |
| }, |
| { |
| "entropy": 1.0329571962356567, |
| "epoch": 2.404494382022472, |
| "grad_norm": 2.4994142055511475, |
| "learning_rate": 5.739221767567025e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9160129129886627, |
| "num_tokens": 6536664.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.1039209365844727, |
| "epoch": 2.407704654895666, |
| "grad_norm": 5.443882465362549, |
| "learning_rate": 5.7287374270425475e-06, |
| "loss": 0.1182, |
| "mean_token_accuracy": 0.951865941286087, |
| "num_tokens": 6544959.0, |
| "step": 751 |
| }, |
| { |
| "entropy": 1.20868319272995, |
| "epoch": 2.4109149277688604, |
| "grad_norm": 3.0676169395446777, |
| "learning_rate": 5.718249811797482e-06, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9474264085292816, |
| "num_tokens": 6554395.0, |
| "step": 752 |
| }, |
| { |
| "entropy": 1.1987990736961365, |
| "epoch": 2.4141252006420544, |
| "grad_norm": 18.106931686401367, |
| "learning_rate": 5.707758968959923e-06, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.937873363494873, |
| "num_tokens": 6562668.0, |
| "step": 753 |
| }, |
| { |
| "entropy": 1.1869001388549805, |
| "epoch": 2.417335473515249, |
| "grad_norm": 3.868323564529419, |
| "learning_rate": 5.69726494567248e-06, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9423214197158813, |
| "num_tokens": 6572126.0, |
| "step": 754 |
| }, |
| { |
| "entropy": 1.066800206899643, |
| "epoch": 2.420545746388443, |
| "grad_norm": 2.391563653945923, |
| "learning_rate": 5.686767789092041e-06, |
| "loss": 0.1477, |
| "mean_token_accuracy": 0.9441542625427246, |
| "num_tokens": 6580655.0, |
| "step": 755 |
| }, |
| { |
| "entropy": 1.1427485942840576, |
| "epoch": 2.423756019261637, |
| "grad_norm": 3.986257553100586, |
| "learning_rate": 5.676267546389587e-06, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9257921278476715, |
| "num_tokens": 6589834.0, |
| "step": 756 |
| }, |
| { |
| "entropy": 1.1412203907966614, |
| "epoch": 2.4269662921348316, |
| "grad_norm": 7.656455993652344, |
| "learning_rate": 5.6657642647499545e-06, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9529256224632263, |
| "num_tokens": 6598073.0, |
| "step": 757 |
| }, |
| { |
| "entropy": 1.0917719006538391, |
| "epoch": 2.4301765650080256, |
| "grad_norm": 2.823206663131714, |
| "learning_rate": 5.655257991371646e-06, |
| "loss": 0.1521, |
| "mean_token_accuracy": 0.9446594417095184, |
| "num_tokens": 6606618.0, |
| "step": 758 |
| }, |
| { |
| "entropy": 1.1908010244369507, |
| "epoch": 2.43338683788122, |
| "grad_norm": 5.725603103637695, |
| "learning_rate": 5.644748773466606e-06, |
| "loss": 0.1133, |
| "mean_token_accuracy": 0.9505617022514343, |
| "num_tokens": 6614664.0, |
| "step": 759 |
| }, |
| { |
| "entropy": 1.1573768258094788, |
| "epoch": 2.436597110754414, |
| "grad_norm": 2.2337048053741455, |
| "learning_rate": 5.6342366582600035e-06, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9296829104423523, |
| "num_tokens": 6625043.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.1481091380119324, |
| "epoch": 2.4398073836276084, |
| "grad_norm": 3.4945619106292725, |
| "learning_rate": 5.62372169299004e-06, |
| "loss": 0.121, |
| "mean_token_accuracy": 0.9498609006404877, |
| "num_tokens": 6633006.0, |
| "step": 761 |
| }, |
| { |
| "entropy": 1.1791866421699524, |
| "epoch": 2.4430176565008024, |
| "grad_norm": 2.9412434101104736, |
| "learning_rate": 5.613203924907711e-06, |
| "loss": 0.1251, |
| "mean_token_accuracy": 0.9411061108112335, |
| "num_tokens": 6642848.0, |
| "step": 762 |
| }, |
| { |
| "entropy": 1.0868923664093018, |
| "epoch": 2.446227929373997, |
| "grad_norm": 3.4383668899536133, |
| "learning_rate": 5.6026834012766155e-06, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.9447084367275238, |
| "num_tokens": 6651619.0, |
| "step": 763 |
| }, |
| { |
| "entropy": 1.0268099308013916, |
| "epoch": 2.449438202247191, |
| "grad_norm": 2.650527238845825, |
| "learning_rate": 5.592160169372734e-06, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9500701725482941, |
| "num_tokens": 6659996.0, |
| "step": 764 |
| }, |
| { |
| "entropy": 1.166718602180481, |
| "epoch": 2.452648475120385, |
| "grad_norm": 2.866605281829834, |
| "learning_rate": 5.581634276484211e-06, |
| "loss": 0.1177, |
| "mean_token_accuracy": 0.95614093542099, |
| "num_tokens": 6668090.0, |
| "step": 765 |
| }, |
| { |
| "entropy": 1.2826551795005798, |
| "epoch": 2.4558587479935796, |
| "grad_norm": 2.394258975982666, |
| "learning_rate": 5.571105769911159e-06, |
| "loss": 0.1224, |
| "mean_token_accuracy": 0.9535133540630341, |
| "num_tokens": 6676019.0, |
| "step": 766 |
| }, |
| { |
| "entropy": 1.192893922328949, |
| "epoch": 2.4590690208667736, |
| "grad_norm": 2.396747589111328, |
| "learning_rate": 5.560574696965425e-06, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9526415169239044, |
| "num_tokens": 6683999.0, |
| "step": 767 |
| }, |
| { |
| "entropy": 1.288866102695465, |
| "epoch": 2.462279293739968, |
| "grad_norm": 5.264256000518799, |
| "learning_rate": 5.550041104970398e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9669314324855804, |
| "num_tokens": 6692511.0, |
| "step": 768 |
| }, |
| { |
| "entropy": 1.1260980367660522, |
| "epoch": 2.465489566613162, |
| "grad_norm": 2.4346187114715576, |
| "learning_rate": 5.539505041260779e-06, |
| "loss": 0.1496, |
| "mean_token_accuracy": 0.9367939531803131, |
| "num_tokens": 6702286.0, |
| "step": 769 |
| }, |
| { |
| "entropy": 1.0402022004127502, |
| "epoch": 2.4686998394863564, |
| "grad_norm": 5.237890720367432, |
| "learning_rate": 5.528966553182379e-06, |
| "loss": 0.126, |
| "mean_token_accuracy": 0.947739988565445, |
| "num_tokens": 6710359.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.0591676533222198, |
| "epoch": 2.4719101123595504, |
| "grad_norm": 3.0694470405578613, |
| "learning_rate": 5.518425688091906e-06, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9322700500488281, |
| "num_tokens": 6719975.0, |
| "step": 771 |
| }, |
| { |
| "entropy": 1.0547669529914856, |
| "epoch": 2.475120385232745, |
| "grad_norm": 5.70684289932251, |
| "learning_rate": 5.507882493356745e-06, |
| "loss": 0.1328, |
| "mean_token_accuracy": 0.9526722431182861, |
| "num_tokens": 6728954.0, |
| "step": 772 |
| }, |
| { |
| "entropy": 1.1029804944992065, |
| "epoch": 2.478330658105939, |
| "grad_norm": 2.295919418334961, |
| "learning_rate": 5.497337016354757e-06, |
| "loss": 0.1179, |
| "mean_token_accuracy": 0.9506842195987701, |
| "num_tokens": 6737394.0, |
| "step": 773 |
| }, |
| { |
| "entropy": 1.0925171375274658, |
| "epoch": 2.481540930979133, |
| "grad_norm": 6.271547794342041, |
| "learning_rate": 5.486789304474047e-06, |
| "loss": 0.1239, |
| "mean_token_accuracy": 0.9553323984146118, |
| "num_tokens": 6745760.0, |
| "step": 774 |
| }, |
| { |
| "entropy": 1.1927781105041504, |
| "epoch": 2.4847512038523276, |
| "grad_norm": 3.240992546081543, |
| "learning_rate": 5.476239405112775e-06, |
| "loss": 0.0908, |
| "mean_token_accuracy": 0.9667895436286926, |
| "num_tokens": 6753422.0, |
| "step": 775 |
| }, |
| { |
| "entropy": 1.1005544662475586, |
| "epoch": 2.4879614767255216, |
| "grad_norm": 2.522223949432373, |
| "learning_rate": 5.465687365678921e-06, |
| "loss": 0.1211, |
| "mean_token_accuracy": 0.9548394978046417, |
| "num_tokens": 6762310.0, |
| "step": 776 |
| }, |
| { |
| "entropy": 1.1497004628181458, |
| "epoch": 2.491171749598716, |
| "grad_norm": 6.606053829193115, |
| "learning_rate": 5.45513323359009e-06, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9237475097179413, |
| "num_tokens": 6771461.0, |
| "step": 777 |
| }, |
| { |
| "entropy": 1.1692470908164978, |
| "epoch": 2.49438202247191, |
| "grad_norm": 2.561846971511841, |
| "learning_rate": 5.444577056273284e-06, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.9331184327602386, |
| "num_tokens": 6780794.0, |
| "step": 778 |
| }, |
| { |
| "entropy": 1.1475553512573242, |
| "epoch": 2.4975922953451044, |
| "grad_norm": 4.43130350112915, |
| "learning_rate": 5.434018881164702e-06, |
| "loss": 0.1414, |
| "mean_token_accuracy": 0.9424753189086914, |
| "num_tokens": 6789767.0, |
| "step": 779 |
| }, |
| { |
| "entropy": 1.1029905378818512, |
| "epoch": 2.5008025682182984, |
| "grad_norm": 2.932034969329834, |
| "learning_rate": 5.423458755709516e-06, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9238942563533783, |
| "num_tokens": 6798435.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.0897286534309387, |
| "epoch": 2.504012841091493, |
| "grad_norm": 4.083599090576172, |
| "learning_rate": 5.412896727361663e-06, |
| "loss": 0.1381, |
| "mean_token_accuracy": 0.9539946615695953, |
| "num_tokens": 6806305.0, |
| "step": 781 |
| }, |
| { |
| "entropy": 1.283981204032898, |
| "epoch": 2.5072231139646872, |
| "grad_norm": 2.844965696334839, |
| "learning_rate": 5.402332843583631e-06, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9530335962772369, |
| "num_tokens": 6814666.0, |
| "step": 782 |
| }, |
| { |
| "entropy": 1.0701724290847778, |
| "epoch": 2.510433386837881, |
| "grad_norm": 3.165010690689087, |
| "learning_rate": 5.391767151846247e-06, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.9041315615177155, |
| "num_tokens": 6825399.0, |
| "step": 783 |
| }, |
| { |
| "entropy": 1.116648256778717, |
| "epoch": 2.513643659711075, |
| "grad_norm": 2.8302817344665527, |
| "learning_rate": 5.381199699628459e-06, |
| "loss": 0.1319, |
| "mean_token_accuracy": 0.9495140314102173, |
| "num_tokens": 6834076.0, |
| "step": 784 |
| }, |
| { |
| "entropy": 1.0305944681167603, |
| "epoch": 2.5168539325842696, |
| "grad_norm": 2.3221435546875, |
| "learning_rate": 5.370630534417133e-06, |
| "loss": 0.1947, |
| "mean_token_accuracy": 0.9132009148597717, |
| "num_tokens": 6844788.0, |
| "step": 785 |
| }, |
| { |
| "entropy": 1.040054589509964, |
| "epoch": 2.520064205457464, |
| "grad_norm": 2.6023216247558594, |
| "learning_rate": 5.360059703706823e-06, |
| "loss": 0.1773, |
| "mean_token_accuracy": 0.9199038743972778, |
| "num_tokens": 6854025.0, |
| "step": 786 |
| }, |
| { |
| "entropy": 1.228873372077942, |
| "epoch": 2.523274478330658, |
| "grad_norm": 5.279784679412842, |
| "learning_rate": 5.349487254999579e-06, |
| "loss": 0.0966, |
| "mean_token_accuracy": 0.964431494474411, |
| "num_tokens": 6862209.0, |
| "step": 787 |
| }, |
| { |
| "entropy": 1.2652587294578552, |
| "epoch": 2.5264847512038524, |
| "grad_norm": 3.7914271354675293, |
| "learning_rate": 5.3389132358047115e-06, |
| "loss": 0.1027, |
| "mean_token_accuracy": 0.9610486626625061, |
| "num_tokens": 6870917.0, |
| "step": 788 |
| }, |
| { |
| "entropy": 1.2009222507476807, |
| "epoch": 2.5296950240770464, |
| "grad_norm": 2.931785821914673, |
| "learning_rate": 5.328337693638591e-06, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9512718021869659, |
| "num_tokens": 6879476.0, |
| "step": 789 |
| }, |
| { |
| "entropy": 1.254488468170166, |
| "epoch": 2.532905296950241, |
| "grad_norm": 27.21143341064453, |
| "learning_rate": 5.317760676024436e-06, |
| "loss": 0.1399, |
| "mean_token_accuracy": 0.9459502100944519, |
| "num_tokens": 6888458.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.2336873412132263, |
| "epoch": 2.5361155698234352, |
| "grad_norm": 2.5371665954589844, |
| "learning_rate": 5.307182230492089e-06, |
| "loss": 0.1327, |
| "mean_token_accuracy": 0.9516365826129913, |
| "num_tokens": 6897670.0, |
| "step": 791 |
| }, |
| { |
| "entropy": 0.9518597722053528, |
| "epoch": 2.539325842696629, |
| "grad_norm": 2.7813923358917236, |
| "learning_rate": 5.296602404577814e-06, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9245217740535736, |
| "num_tokens": 6906804.0, |
| "step": 792 |
| }, |
| { |
| "entropy": 1.167235255241394, |
| "epoch": 2.542536115569823, |
| "grad_norm": 2.641925573348999, |
| "learning_rate": 5.286021245824075e-06, |
| "loss": 0.1123, |
| "mean_token_accuracy": 0.9582434296607971, |
| "num_tokens": 6914869.0, |
| "step": 793 |
| }, |
| { |
| "entropy": 1.3288288116455078, |
| "epoch": 2.5457463884430176, |
| "grad_norm": 2.3714306354522705, |
| "learning_rate": 5.275438801779328e-06, |
| "loss": 0.1135, |
| "mean_token_accuracy": 0.9562829434871674, |
| "num_tokens": 6924038.0, |
| "step": 794 |
| }, |
| { |
| "entropy": 1.1833221912384033, |
| "epoch": 2.548956661316212, |
| "grad_norm": 2.5956239700317383, |
| "learning_rate": 5.264855119997803e-06, |
| "loss": 0.1405, |
| "mean_token_accuracy": 0.9424801170825958, |
| "num_tokens": 6932622.0, |
| "step": 795 |
| }, |
| { |
| "entropy": 1.1478038430213928, |
| "epoch": 2.552166934189406, |
| "grad_norm": 4.386684417724609, |
| "learning_rate": 5.254270248039291e-06, |
| "loss": 0.1419, |
| "mean_token_accuracy": 0.9366876184940338, |
| "num_tokens": 6941209.0, |
| "step": 796 |
| }, |
| { |
| "entropy": 1.1999590992927551, |
| "epoch": 2.5553772070626004, |
| "grad_norm": 2.239611864089966, |
| "learning_rate": 5.243684233468933e-06, |
| "loss": 0.1416, |
| "mean_token_accuracy": 0.9222292900085449, |
| "num_tokens": 6951363.0, |
| "step": 797 |
| }, |
| { |
| "entropy": 1.188470184803009, |
| "epoch": 2.5585874799357944, |
| "grad_norm": 2.155332565307617, |
| "learning_rate": 5.233097123857004e-06, |
| "loss": 0.1192, |
| "mean_token_accuracy": 0.9411430060863495, |
| "num_tokens": 6960774.0, |
| "step": 798 |
| }, |
| { |
| "entropy": 1.0205163359642029, |
| "epoch": 2.561797752808989, |
| "grad_norm": 2.36620831489563, |
| "learning_rate": 5.222508966778702e-06, |
| "loss": 0.1414, |
| "mean_token_accuracy": 0.9473484754562378, |
| "num_tokens": 6969967.0, |
| "step": 799 |
| }, |
| { |
| "entropy": 1.1117953062057495, |
| "epoch": 2.5650080256821832, |
| "grad_norm": 83.64684295654297, |
| "learning_rate": 5.211919809813927e-06, |
| "loss": 0.1064, |
| "mean_token_accuracy": 0.9633921980857849, |
| "num_tokens": 6977751.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.1597265005111694, |
| "epoch": 2.568218298555377, |
| "grad_norm": 7.2872419357299805, |
| "learning_rate": 5.201329700547077e-06, |
| "loss": 0.1348, |
| "mean_token_accuracy": 0.9420501291751862, |
| "num_tokens": 6986967.0, |
| "step": 801 |
| }, |
| { |
| "entropy": 1.062886893749237, |
| "epoch": 2.571428571428571, |
| "grad_norm": 3.075824737548828, |
| "learning_rate": 5.190738686566826e-06, |
| "loss": 0.1747, |
| "mean_token_accuracy": 0.9246014654636383, |
| "num_tokens": 6995637.0, |
| "step": 802 |
| }, |
| { |
| "entropy": 1.1086429953575134, |
| "epoch": 2.5746388443017656, |
| "grad_norm": 2.755178928375244, |
| "learning_rate": 5.180146815465915e-06, |
| "loss": 0.1475, |
| "mean_token_accuracy": 0.9486549496650696, |
| "num_tokens": 7005492.0, |
| "step": 803 |
| }, |
| { |
| "entropy": 1.2635900974273682, |
| "epoch": 2.57784911717496, |
| "grad_norm": 3.3014962673187256, |
| "learning_rate": 5.169554134840937e-06, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9536003470420837, |
| "num_tokens": 7013405.0, |
| "step": 804 |
| }, |
| { |
| "entropy": 1.2595646977424622, |
| "epoch": 2.581059390048154, |
| "grad_norm": 9.928709030151367, |
| "learning_rate": 5.158960692292122e-06, |
| "loss": 0.1435, |
| "mean_token_accuracy": 0.930577278137207, |
| "num_tokens": 7022575.0, |
| "step": 805 |
| }, |
| { |
| "entropy": 1.2178662419319153, |
| "epoch": 2.5842696629213484, |
| "grad_norm": 2.059295177459717, |
| "learning_rate": 5.148366535423126e-06, |
| "loss": 0.104, |
| "mean_token_accuracy": 0.9549370110034943, |
| "num_tokens": 7031068.0, |
| "step": 806 |
| }, |
| { |
| "entropy": 1.182856559753418, |
| "epoch": 2.5874799357945424, |
| "grad_norm": 4.150783538818359, |
| "learning_rate": 5.137771711840811e-06, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.9229248464107513, |
| "num_tokens": 7040170.0, |
| "step": 807 |
| }, |
| { |
| "entropy": 1.1488960981369019, |
| "epoch": 2.590690208667737, |
| "grad_norm": 2.7888660430908203, |
| "learning_rate": 5.1271762691550375e-06, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9194961786270142, |
| "num_tokens": 7049813.0, |
| "step": 808 |
| }, |
| { |
| "entropy": 1.1741920709609985, |
| "epoch": 2.5939004815409312, |
| "grad_norm": 4.083916664123535, |
| "learning_rate": 5.116580254978447e-06, |
| "loss": 0.123, |
| "mean_token_accuracy": 0.951829195022583, |
| "num_tokens": 7057807.0, |
| "step": 809 |
| }, |
| { |
| "entropy": 1.0202910602092743, |
| "epoch": 2.597110754414125, |
| "grad_norm": 2.8720052242279053, |
| "learning_rate": 5.1059837169262506e-06, |
| "loss": 0.1796, |
| "mean_token_accuracy": 0.9331225156784058, |
| "num_tokens": 7066075.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.1626269817352295, |
| "epoch": 2.600321027287319, |
| "grad_norm": 3.792236089706421, |
| "learning_rate": 5.095386702616012e-06, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9492098689079285, |
| "num_tokens": 7074889.0, |
| "step": 811 |
| }, |
| { |
| "entropy": 1.1408878564834595, |
| "epoch": 2.6035313001605136, |
| "grad_norm": 3.076556921005249, |
| "learning_rate": 5.084789259667437e-06, |
| "loss": 0.1227, |
| "mean_token_accuracy": 0.9498015642166138, |
| "num_tokens": 7083337.0, |
| "step": 812 |
| }, |
| { |
| "entropy": 1.1532814502716064, |
| "epoch": 2.606741573033708, |
| "grad_norm": 2.449476718902588, |
| "learning_rate": 5.074191435702155e-06, |
| "loss": 0.1387, |
| "mean_token_accuracy": 0.9504629671573639, |
| "num_tokens": 7092174.0, |
| "step": 813 |
| }, |
| { |
| "entropy": 1.2164504528045654, |
| "epoch": 2.609951845906902, |
| "grad_norm": 2.563089609146118, |
| "learning_rate": 5.06359327834351e-06, |
| "loss": 0.144, |
| "mean_token_accuracy": 0.9414226114749908, |
| "num_tokens": 7100611.0, |
| "step": 814 |
| }, |
| { |
| "entropy": 1.1398358345031738, |
| "epoch": 2.6131621187800964, |
| "grad_norm": 2.054319381713867, |
| "learning_rate": 5.05299483521634e-06, |
| "loss": 0.1458, |
| "mean_token_accuracy": 0.9323011636734009, |
| "num_tokens": 7108693.0, |
| "step": 815 |
| }, |
| { |
| "entropy": 1.2047650218009949, |
| "epoch": 2.6163723916532904, |
| "grad_norm": 3.8257853984832764, |
| "learning_rate": 5.0423961539467754e-06, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9467697441577911, |
| "num_tokens": 7117228.0, |
| "step": 816 |
| }, |
| { |
| "entropy": 1.2512578964233398, |
| "epoch": 2.619582664526485, |
| "grad_norm": 2.4949073791503906, |
| "learning_rate": 5.031797282162007e-06, |
| "loss": 0.1164, |
| "mean_token_accuracy": 0.9542975723743439, |
| "num_tokens": 7125961.0, |
| "step": 817 |
| }, |
| { |
| "entropy": 1.202860414981842, |
| "epoch": 2.6227929373996792, |
| "grad_norm": 3.2338783740997314, |
| "learning_rate": 5.021198267490088e-06, |
| "loss": 0.1173, |
| "mean_token_accuracy": 0.9500894248485565, |
| "num_tokens": 7133976.0, |
| "step": 818 |
| }, |
| { |
| "entropy": 1.1789140701293945, |
| "epoch": 2.626003210272873, |
| "grad_norm": 2.3607523441314697, |
| "learning_rate": 5.010599157559713e-06, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9499906599521637, |
| "num_tokens": 7144087.0, |
| "step": 819 |
| }, |
| { |
| "entropy": 1.3041833639144897, |
| "epoch": 2.629213483146067, |
| "grad_norm": 4.608010768890381, |
| "learning_rate": 5e-06, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9459123015403748, |
| "num_tokens": 7153618.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.1604467630386353, |
| "epoch": 2.6324237560192616, |
| "grad_norm": 2.8961966037750244, |
| "learning_rate": 4.98940084244029e-06, |
| "loss": 0.1219, |
| "mean_token_accuracy": 0.9523343443870544, |
| "num_tokens": 7162203.0, |
| "step": 821 |
| }, |
| { |
| "entropy": 1.329535961151123, |
| "epoch": 2.635634028892456, |
| "grad_norm": 2.5095512866973877, |
| "learning_rate": 4.9788017325099134e-06, |
| "loss": 0.1495, |
| "mean_token_accuracy": 0.9324021637439728, |
| "num_tokens": 7171282.0, |
| "step": 822 |
| }, |
| { |
| "entropy": 1.0252764225006104, |
| "epoch": 2.63884430176565, |
| "grad_norm": 12.967425346374512, |
| "learning_rate": 4.968202717837996e-06, |
| "loss": 0.0946, |
| "mean_token_accuracy": 0.9648852646350861, |
| "num_tokens": 7179569.0, |
| "step": 823 |
| }, |
| { |
| "entropy": 1.1990549564361572, |
| "epoch": 2.6420545746388444, |
| "grad_norm": 1.95919930934906, |
| "learning_rate": 4.957603846053225e-06, |
| "loss": 0.1448, |
| "mean_token_accuracy": 0.9189603328704834, |
| "num_tokens": 7190125.0, |
| "step": 824 |
| }, |
| { |
| "entropy": 1.2720578908920288, |
| "epoch": 2.6452648475120384, |
| "grad_norm": 3.3589868545532227, |
| "learning_rate": 4.947005164783661e-06, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.941864937543869, |
| "num_tokens": 7200048.0, |
| "step": 825 |
| }, |
| { |
| "entropy": 1.2720287442207336, |
| "epoch": 2.648475120385233, |
| "grad_norm": 4.5492024421691895, |
| "learning_rate": 4.936406721656492e-06, |
| "loss": 0.1051, |
| "mean_token_accuracy": 0.9559166431427002, |
| "num_tokens": 7208839.0, |
| "step": 826 |
| }, |
| { |
| "entropy": 1.2180215120315552, |
| "epoch": 2.6516853932584272, |
| "grad_norm": 2.815068483352661, |
| "learning_rate": 4.925808564297847e-06, |
| "loss": 0.1149, |
| "mean_token_accuracy": 0.9604234099388123, |
| "num_tokens": 7217154.0, |
| "step": 827 |
| }, |
| { |
| "entropy": 0.9922587871551514, |
| "epoch": 2.654895666131621, |
| "grad_norm": 59.59284210205078, |
| "learning_rate": 4.915210740332564e-06, |
| "loss": 0.1447, |
| "mean_token_accuracy": 0.9238322377204895, |
| "num_tokens": 7226381.0, |
| "step": 828 |
| }, |
| { |
| "entropy": 1.1355258226394653, |
| "epoch": 2.658105939004815, |
| "grad_norm": 2.589543342590332, |
| "learning_rate": 4.9046132973839895e-06, |
| "loss": 0.1164, |
| "mean_token_accuracy": 0.956183135509491, |
| "num_tokens": 7234286.0, |
| "step": 829 |
| }, |
| { |
| "entropy": 0.9799058735370636, |
| "epoch": 2.6613162118780096, |
| "grad_norm": 3.102374315261841, |
| "learning_rate": 4.894016283073753e-06, |
| "loss": 0.1386, |
| "mean_token_accuracy": 0.9398746490478516, |
| "num_tokens": 7243474.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.2898936867713928, |
| "epoch": 2.664526484751204, |
| "grad_norm": 3.5710971355438232, |
| "learning_rate": 4.883419745021554e-06, |
| "loss": 0.1233, |
| "mean_token_accuracy": 0.9488505721092224, |
| "num_tokens": 7251936.0, |
| "step": 831 |
| }, |
| { |
| "entropy": 1.2016060948371887, |
| "epoch": 2.667736757624398, |
| "grad_norm": 2.7869205474853516, |
| "learning_rate": 4.872823730844966e-06, |
| "loss": 0.1072, |
| "mean_token_accuracy": 0.9632176756858826, |
| "num_tokens": 7260147.0, |
| "step": 832 |
| }, |
| { |
| "entropy": 1.1167156100273132, |
| "epoch": 2.6709470304975924, |
| "grad_norm": 2.521682024002075, |
| "learning_rate": 4.862228288159191e-06, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9312631487846375, |
| "num_tokens": 7269675.0, |
| "step": 833 |
| }, |
| { |
| "entropy": 1.2334627509117126, |
| "epoch": 2.6741573033707864, |
| "grad_norm": 5.0381011962890625, |
| "learning_rate": 4.851633464576876e-06, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.9190080761909485, |
| "num_tokens": 7278754.0, |
| "step": 834 |
| }, |
| { |
| "entropy": 1.0725289583206177, |
| "epoch": 2.677367576243981, |
| "grad_norm": 2.2365589141845703, |
| "learning_rate": 4.841039307707878e-06, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9546129703521729, |
| "num_tokens": 7286779.0, |
| "step": 835 |
| }, |
| { |
| "entropy": 1.3398520350456238, |
| "epoch": 2.6805778491171752, |
| "grad_norm": 3.4143424034118652, |
| "learning_rate": 4.8304458651590645e-06, |
| "loss": 0.1481, |
| "mean_token_accuracy": 0.939169704914093, |
| "num_tokens": 7296679.0, |
| "step": 836 |
| }, |
| { |
| "entropy": 1.1583664417266846, |
| "epoch": 2.683788121990369, |
| "grad_norm": 3.6284713745117188, |
| "learning_rate": 4.819853184534085e-06, |
| "loss": 0.1225, |
| "mean_token_accuracy": 0.9425530731678009, |
| "num_tokens": 7304789.0, |
| "step": 837 |
| }, |
| { |
| "entropy": 1.1957999467849731, |
| "epoch": 2.686998394863563, |
| "grad_norm": 3.876544237136841, |
| "learning_rate": 4.809261313433176e-06, |
| "loss": 0.138, |
| "mean_token_accuracy": 0.9428149461746216, |
| "num_tokens": 7313959.0, |
| "step": 838 |
| }, |
| { |
| "entropy": 1.2033058404922485, |
| "epoch": 2.6902086677367576, |
| "grad_norm": 2.247842788696289, |
| "learning_rate": 4.798670299452926e-06, |
| "loss": 0.1128, |
| "mean_token_accuracy": 0.9541947841644287, |
| "num_tokens": 7322037.0, |
| "step": 839 |
| }, |
| { |
| "entropy": 1.0784823894500732, |
| "epoch": 2.693418940609952, |
| "grad_norm": 3.17934250831604, |
| "learning_rate": 4.788080190186075e-06, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.9339147508144379, |
| "num_tokens": 7331689.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.94371497631073, |
| "epoch": 2.696629213483146, |
| "grad_norm": 2.5439469814300537, |
| "learning_rate": 4.7774910332213005e-06, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9517930448055267, |
| "num_tokens": 7339675.0, |
| "step": 841 |
| }, |
| { |
| "entropy": 1.293647587299347, |
| "epoch": 2.6998394863563404, |
| "grad_norm": 3.3014588356018066, |
| "learning_rate": 4.766902876142996e-06, |
| "loss": 0.1151, |
| "mean_token_accuracy": 0.9499547779560089, |
| "num_tokens": 7348038.0, |
| "step": 842 |
| }, |
| { |
| "entropy": 1.1160376071929932, |
| "epoch": 2.7030497592295344, |
| "grad_norm": 2.3735413551330566, |
| "learning_rate": 4.756315766531069e-06, |
| "loss": 0.1221, |
| "mean_token_accuracy": 0.949918121099472, |
| "num_tokens": 7356513.0, |
| "step": 843 |
| }, |
| { |
| "entropy": 1.091322898864746, |
| "epoch": 2.706260032102729, |
| "grad_norm": 2.5370192527770996, |
| "learning_rate": 4.74572975196071e-06, |
| "loss": 0.1349, |
| "mean_token_accuracy": 0.9399248659610748, |
| "num_tokens": 7365541.0, |
| "step": 844 |
| }, |
| { |
| "entropy": 1.1114393472671509, |
| "epoch": 2.7094703049759232, |
| "grad_norm": 2.9894139766693115, |
| "learning_rate": 4.735144880002199e-06, |
| "loss": 0.1457, |
| "mean_token_accuracy": 0.932213693857193, |
| "num_tokens": 7374308.0, |
| "step": 845 |
| }, |
| { |
| "entropy": 1.2670826315879822, |
| "epoch": 2.712680577849117, |
| "grad_norm": 3.916121244430542, |
| "learning_rate": 4.724561198220672e-06, |
| "loss": 0.116, |
| "mean_token_accuracy": 0.9497752785682678, |
| "num_tokens": 7382886.0, |
| "step": 846 |
| }, |
| { |
| "entropy": 1.2188202738761902, |
| "epoch": 2.715890850722311, |
| "grad_norm": 5.7917094230651855, |
| "learning_rate": 4.713978754175926e-06, |
| "loss": 0.1174, |
| "mean_token_accuracy": 0.9567406475543976, |
| "num_tokens": 7391546.0, |
| "step": 847 |
| }, |
| { |
| "entropy": 1.2436976432800293, |
| "epoch": 2.7191011235955056, |
| "grad_norm": 2.361128091812134, |
| "learning_rate": 4.703397595422188e-06, |
| "loss": 0.1303, |
| "mean_token_accuracy": 0.942131757736206, |
| "num_tokens": 7400026.0, |
| "step": 848 |
| }, |
| { |
| "entropy": 1.019951045513153, |
| "epoch": 2.7223113964687, |
| "grad_norm": 2.2465875148773193, |
| "learning_rate": 4.692817769507912e-06, |
| "loss": 0.1243, |
| "mean_token_accuracy": 0.9461689591407776, |
| "num_tokens": 7409147.0, |
| "step": 849 |
| }, |
| { |
| "entropy": 1.087328314781189, |
| "epoch": 2.725521669341894, |
| "grad_norm": 7.778383255004883, |
| "learning_rate": 4.682239323975566e-06, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9359224736690521, |
| "num_tokens": 7417525.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.2314087748527527, |
| "epoch": 2.7287319422150884, |
| "grad_norm": 2.4920654296875, |
| "learning_rate": 4.671662306361409e-06, |
| "loss": 0.12, |
| "mean_token_accuracy": 0.9538951516151428, |
| "num_tokens": 7427404.0, |
| "step": 851 |
| }, |
| { |
| "entropy": 1.3098554015159607, |
| "epoch": 2.7319422150882824, |
| "grad_norm": 2.8234992027282715, |
| "learning_rate": 4.66108676419529e-06, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9484634101390839, |
| "num_tokens": 7435312.0, |
| "step": 852 |
| }, |
| { |
| "entropy": 1.2485319375991821, |
| "epoch": 2.735152487961477, |
| "grad_norm": 2.5828335285186768, |
| "learning_rate": 4.6505127450004216e-06, |
| "loss": 0.1205, |
| "mean_token_accuracy": 0.957422286272049, |
| "num_tokens": 7443603.0, |
| "step": 853 |
| }, |
| { |
| "entropy": 1.3845638036727905, |
| "epoch": 2.738362760834671, |
| "grad_norm": 2.860136032104492, |
| "learning_rate": 4.6399402962931775e-06, |
| "loss": 0.1355, |
| "mean_token_accuracy": 0.9504896700382233, |
| "num_tokens": 7453049.0, |
| "step": 854 |
| }, |
| { |
| "entropy": 1.0702533721923828, |
| "epoch": 2.741573033707865, |
| "grad_norm": 2.2271461486816406, |
| "learning_rate": 4.62936946558287e-06, |
| "loss": 0.1051, |
| "mean_token_accuracy": 0.9618172347545624, |
| "num_tokens": 7461658.0, |
| "step": 855 |
| }, |
| { |
| "entropy": 1.1994733810424805, |
| "epoch": 2.744783306581059, |
| "grad_norm": 2.68015456199646, |
| "learning_rate": 4.618800300371543e-06, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9526884853839874, |
| "num_tokens": 7470063.0, |
| "step": 856 |
| }, |
| { |
| "entropy": 1.1785258650779724, |
| "epoch": 2.7479935794542536, |
| "grad_norm": 2.1752262115478516, |
| "learning_rate": 4.608232848153757e-06, |
| "loss": 0.1362, |
| "mean_token_accuracy": 0.9400155544281006, |
| "num_tokens": 7479325.0, |
| "step": 857 |
| }, |
| { |
| "entropy": 1.1845125555992126, |
| "epoch": 2.751203852327448, |
| "grad_norm": 3.401946544647217, |
| "learning_rate": 4.597667156416371e-06, |
| "loss": 0.1159, |
| "mean_token_accuracy": 0.955258846282959, |
| "num_tokens": 7487362.0, |
| "step": 858 |
| }, |
| { |
| "entropy": 1.193034589290619, |
| "epoch": 2.754414125200642, |
| "grad_norm": 5.033675670623779, |
| "learning_rate": 4.587103272638339e-06, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9349705278873444, |
| "num_tokens": 7496720.0, |
| "step": 859 |
| }, |
| { |
| "entropy": 0.996446818113327, |
| "epoch": 2.7576243980738364, |
| "grad_norm": 2.372493028640747, |
| "learning_rate": 4.576541244290484e-06, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9507549703121185, |
| "num_tokens": 7505134.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.076714038848877, |
| "epoch": 2.7608346709470304, |
| "grad_norm": 2.4927151203155518, |
| "learning_rate": 4.565981118835299e-06, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9250488579273224, |
| "num_tokens": 7515661.0, |
| "step": 861 |
| }, |
| { |
| "entropy": 1.0132020115852356, |
| "epoch": 2.764044943820225, |
| "grad_norm": 2.702894926071167, |
| "learning_rate": 4.555422943726715e-06, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.9141054153442383, |
| "num_tokens": 7524731.0, |
| "step": 862 |
| }, |
| { |
| "entropy": 1.2586361765861511, |
| "epoch": 2.767255216693419, |
| "grad_norm": 2.5868072509765625, |
| "learning_rate": 4.5448667664099125e-06, |
| "loss": 0.1188, |
| "mean_token_accuracy": 0.9554040431976318, |
| "num_tokens": 7532750.0, |
| "step": 863 |
| }, |
| { |
| "entropy": 1.2303617000579834, |
| "epoch": 2.770465489566613, |
| "grad_norm": 2.8830461502075195, |
| "learning_rate": 4.534312634321081e-06, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9583016335964203, |
| "num_tokens": 7542198.0, |
| "step": 864 |
| }, |
| { |
| "entropy": 1.2619620561599731, |
| "epoch": 2.773675762439807, |
| "grad_norm": 2.6447360515594482, |
| "learning_rate": 4.523760594887228e-06, |
| "loss": 0.1126, |
| "mean_token_accuracy": 0.9589046835899353, |
| "num_tokens": 7549862.0, |
| "step": 865 |
| }, |
| { |
| "entropy": 1.136208415031433, |
| "epoch": 2.7768860353130016, |
| "grad_norm": 2.4705092906951904, |
| "learning_rate": 4.513210695525954e-06, |
| "loss": 0.1076, |
| "mean_token_accuracy": 0.956132709980011, |
| "num_tokens": 7557050.0, |
| "step": 866 |
| }, |
| { |
| "entropy": 1.1666526198387146, |
| "epoch": 2.780096308186196, |
| "grad_norm": 2.446833610534668, |
| "learning_rate": 4.5026629836452445e-06, |
| "loss": 0.1372, |
| "mean_token_accuracy": 0.9423530399799347, |
| "num_tokens": 7566074.0, |
| "step": 867 |
| }, |
| { |
| "entropy": 1.1064327955245972, |
| "epoch": 2.78330658105939, |
| "grad_norm": 3.509469747543335, |
| "learning_rate": 4.492117506643256e-06, |
| "loss": 0.0984, |
| "mean_token_accuracy": 0.9642610847949982, |
| "num_tokens": 7573235.0, |
| "step": 868 |
| }, |
| { |
| "entropy": 1.143051952123642, |
| "epoch": 2.7865168539325844, |
| "grad_norm": 5.667890548706055, |
| "learning_rate": 4.481574311908096e-06, |
| "loss": 0.125, |
| "mean_token_accuracy": 0.9446974098682404, |
| "num_tokens": 7581498.0, |
| "step": 869 |
| }, |
| { |
| "entropy": 1.3021160960197449, |
| "epoch": 2.7897271268057784, |
| "grad_norm": 4.6017069816589355, |
| "learning_rate": 4.471033446817623e-06, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9526508450508118, |
| "num_tokens": 7590660.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.9912780523300171, |
| "epoch": 2.792937399678973, |
| "grad_norm": 2.9224236011505127, |
| "learning_rate": 4.460494958739223e-06, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.9244946837425232, |
| "num_tokens": 7600008.0, |
| "step": 871 |
| }, |
| { |
| "entropy": 1.1159600019454956, |
| "epoch": 2.796147672552167, |
| "grad_norm": 3.117302417755127, |
| "learning_rate": 4.449958895029604e-06, |
| "loss": 0.15, |
| "mean_token_accuracy": 0.934796154499054, |
| "num_tokens": 7609102.0, |
| "step": 872 |
| }, |
| { |
| "entropy": 1.2988770604133606, |
| "epoch": 2.799357945425361, |
| "grad_norm": 2.7682743072509766, |
| "learning_rate": 4.439425303034576e-06, |
| "loss": 0.1074, |
| "mean_token_accuracy": 0.961311399936676, |
| "num_tokens": 7616956.0, |
| "step": 873 |
| }, |
| { |
| "entropy": 1.3484828472137451, |
| "epoch": 2.802568218298555, |
| "grad_norm": 2.468888282775879, |
| "learning_rate": 4.428894230088842e-06, |
| "loss": 0.1245, |
| "mean_token_accuracy": 0.9545502364635468, |
| "num_tokens": 7625302.0, |
| "step": 874 |
| }, |
| { |
| "entropy": 1.2678768038749695, |
| "epoch": 2.8057784911717496, |
| "grad_norm": 2.3519935607910156, |
| "learning_rate": 4.418365723515791e-06, |
| "loss": 0.1015, |
| "mean_token_accuracy": 0.9572258293628693, |
| "num_tokens": 7634369.0, |
| "step": 875 |
| }, |
| { |
| "entropy": 1.0742478966712952, |
| "epoch": 2.808988764044944, |
| "grad_norm": 5.775500774383545, |
| "learning_rate": 4.407839830627269e-06, |
| "loss": 0.1192, |
| "mean_token_accuracy": 0.9556642770767212, |
| "num_tokens": 7643376.0, |
| "step": 876 |
| }, |
| { |
| "entropy": 1.1620059609413147, |
| "epoch": 2.812199036918138, |
| "grad_norm": 2.9240689277648926, |
| "learning_rate": 4.397316598723385e-06, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9445300102233887, |
| "num_tokens": 7651065.0, |
| "step": 877 |
| }, |
| { |
| "entropy": 0.9733690023422241, |
| "epoch": 2.8154093097913324, |
| "grad_norm": 2.123840570449829, |
| "learning_rate": 4.38679607509229e-06, |
| "loss": 0.1415, |
| "mean_token_accuracy": 0.9152352511882782, |
| "num_tokens": 7662378.0, |
| "step": 878 |
| }, |
| { |
| "entropy": 1.0626774430274963, |
| "epoch": 2.8186195826645264, |
| "grad_norm": 2.542992353439331, |
| "learning_rate": 4.376278307009962e-06, |
| "loss": 0.1357, |
| "mean_token_accuracy": 0.939263254404068, |
| "num_tokens": 7670141.0, |
| "step": 879 |
| }, |
| { |
| "entropy": 1.1844760179519653, |
| "epoch": 2.821829855537721, |
| "grad_norm": 2.248908281326294, |
| "learning_rate": 4.365763341739996e-06, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9115343689918518, |
| "num_tokens": 7679575.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 1.0241534113883972, |
| "epoch": 2.825040128410915, |
| "grad_norm": 2.588132381439209, |
| "learning_rate": 4.355251226533396e-06, |
| "loss": 0.1375, |
| "mean_token_accuracy": 0.9446264207363129, |
| "num_tokens": 7688269.0, |
| "step": 881 |
| }, |
| { |
| "entropy": 1.2254069447517395, |
| "epoch": 2.828250401284109, |
| "grad_norm": 2.150693655014038, |
| "learning_rate": 4.344742008628356e-06, |
| "loss": 0.1072, |
| "mean_token_accuracy": 0.957080066204071, |
| "num_tokens": 7696149.0, |
| "step": 882 |
| }, |
| { |
| "entropy": 1.1962010264396667, |
| "epoch": 2.831460674157303, |
| "grad_norm": 2.6374399662017822, |
| "learning_rate": 4.334235735250047e-06, |
| "loss": 0.1048, |
| "mean_token_accuracy": 0.954596996307373, |
| "num_tokens": 7703819.0, |
| "step": 883 |
| }, |
| { |
| "entropy": 1.0402805805206299, |
| "epoch": 2.8346709470304976, |
| "grad_norm": 2.519651174545288, |
| "learning_rate": 4.3237324536104165e-06, |
| "loss": 0.1361, |
| "mean_token_accuracy": 0.9493061900138855, |
| "num_tokens": 7712289.0, |
| "step": 884 |
| }, |
| { |
| "entropy": 1.2400332689285278, |
| "epoch": 2.837881219903692, |
| "grad_norm": 3.449007034301758, |
| "learning_rate": 4.313232210907959e-06, |
| "loss": 0.0991, |
| "mean_token_accuracy": 0.9627736210823059, |
| "num_tokens": 7720220.0, |
| "step": 885 |
| }, |
| { |
| "entropy": 1.1178945302963257, |
| "epoch": 2.841091492776886, |
| "grad_norm": 2.6759750843048096, |
| "learning_rate": 4.302735054327523e-06, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9373737573623657, |
| "num_tokens": 7729471.0, |
| "step": 886 |
| }, |
| { |
| "entropy": 1.1199336647987366, |
| "epoch": 2.8443017656500804, |
| "grad_norm": 3.439345359802246, |
| "learning_rate": 4.292241031040077e-06, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9443832635879517, |
| "num_tokens": 7737350.0, |
| "step": 887 |
| }, |
| { |
| "entropy": 0.9779994189739227, |
| "epoch": 2.8475120385232744, |
| "grad_norm": 4.169414520263672, |
| "learning_rate": 4.28175018820252e-06, |
| "loss": 0.1363, |
| "mean_token_accuracy": 0.9478906691074371, |
| "num_tokens": 7746581.0, |
| "step": 888 |
| }, |
| { |
| "entropy": 1.1126951575279236, |
| "epoch": 2.850722311396469, |
| "grad_norm": 2.338712692260742, |
| "learning_rate": 4.271262572957453e-06, |
| "loss": 0.113, |
| "mean_token_accuracy": 0.9514721035957336, |
| "num_tokens": 7754335.0, |
| "step": 889 |
| }, |
| { |
| "entropy": 1.04286390542984, |
| "epoch": 2.853932584269663, |
| "grad_norm": 2.685058832168579, |
| "learning_rate": 4.2607782324329776e-06, |
| "loss": 0.144, |
| "mean_token_accuracy": 0.9451346397399902, |
| "num_tokens": 7762713.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.0703285932540894, |
| "epoch": 2.857142857142857, |
| "grad_norm": 2.5074543952941895, |
| "learning_rate": 4.250297213742473e-06, |
| "loss": 0.1358, |
| "mean_token_accuracy": 0.9469152390956879, |
| "num_tokens": 7771455.0, |
| "step": 891 |
| }, |
| { |
| "entropy": 1.0886583030223846, |
| "epoch": 2.860353130016051, |
| "grad_norm": 2.3599722385406494, |
| "learning_rate": 4.239819563984397e-06, |
| "loss": 0.1238, |
| "mean_token_accuracy": 0.9473360478878021, |
| "num_tokens": 7780173.0, |
| "step": 892 |
| }, |
| { |
| "entropy": 1.0926342606544495, |
| "epoch": 2.8635634028892456, |
| "grad_norm": 3.8707892894744873, |
| "learning_rate": 4.229345330242067e-06, |
| "loss": 0.114, |
| "mean_token_accuracy": 0.9592483639717102, |
| "num_tokens": 7789100.0, |
| "step": 893 |
| }, |
| { |
| "entropy": 1.1691067814826965, |
| "epoch": 2.86677367576244, |
| "grad_norm": 2.060270309448242, |
| "learning_rate": 4.21887455958345e-06, |
| "loss": 0.1028, |
| "mean_token_accuracy": 0.9459799826145172, |
| "num_tokens": 7797562.0, |
| "step": 894 |
| }, |
| { |
| "entropy": 1.0735175609588623, |
| "epoch": 2.869983948635634, |
| "grad_norm": 2.2516980171203613, |
| "learning_rate": 4.2084072990609505e-06, |
| "loss": 0.1051, |
| "mean_token_accuracy": 0.9549958407878876, |
| "num_tokens": 7805770.0, |
| "step": 895 |
| }, |
| { |
| "entropy": 1.1715295910835266, |
| "epoch": 2.8731942215088284, |
| "grad_norm": 2.7888271808624268, |
| "learning_rate": 4.1979435957111984e-06, |
| "loss": 0.1514, |
| "mean_token_accuracy": 0.9374975264072418, |
| "num_tokens": 7814614.0, |
| "step": 896 |
| }, |
| { |
| "entropy": 1.0718038082122803, |
| "epoch": 2.8764044943820224, |
| "grad_norm": 2.4147868156433105, |
| "learning_rate": 4.187483496554844e-06, |
| "loss": 0.1459, |
| "mean_token_accuracy": 0.9425850808620453, |
| "num_tokens": 7823306.0, |
| "step": 897 |
| }, |
| { |
| "entropy": 0.9856555461883545, |
| "epoch": 2.879614767255217, |
| "grad_norm": 2.4258735179901123, |
| "learning_rate": 4.17702704859633e-06, |
| "loss": 0.1085, |
| "mean_token_accuracy": 0.9541434049606323, |
| "num_tokens": 7831505.0, |
| "step": 898 |
| }, |
| { |
| "entropy": 1.1180533170700073, |
| "epoch": 2.882825040128411, |
| "grad_norm": 64.09413146972656, |
| "learning_rate": 4.166574298823707e-06, |
| "loss": 0.1211, |
| "mean_token_accuracy": 0.9537391066551208, |
| "num_tokens": 7839948.0, |
| "step": 899 |
| }, |
| { |
| "entropy": 1.0941823720932007, |
| "epoch": 2.886035313001605, |
| "grad_norm": 4.044398307800293, |
| "learning_rate": 4.156125294208396e-06, |
| "loss": 0.1209, |
| "mean_token_accuracy": 0.9536958932876587, |
| "num_tokens": 7847550.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.1155158281326294, |
| "epoch": 2.889245585874799, |
| "grad_norm": 5.201903820037842, |
| "learning_rate": 4.145680081704989e-06, |
| "loss": 0.1461, |
| "mean_token_accuracy": 0.9484029412269592, |
| "num_tokens": 7856829.0, |
| "step": 901 |
| }, |
| { |
| "entropy": 1.0542896389961243, |
| "epoch": 2.8924558587479936, |
| "grad_norm": 2.08670973777771, |
| "learning_rate": 4.135238708251045e-06, |
| "loss": 0.138, |
| "mean_token_accuracy": 0.9373140633106232, |
| "num_tokens": 7865501.0, |
| "step": 902 |
| }, |
| { |
| "entropy": 1.058946669101715, |
| "epoch": 2.895666131621188, |
| "grad_norm": 2.177431344985962, |
| "learning_rate": 4.1248012207668635e-06, |
| "loss": 0.0885, |
| "mean_token_accuracy": 0.9713162779808044, |
| "num_tokens": 7873454.0, |
| "step": 903 |
| }, |
| { |
| "entropy": 1.1428315043449402, |
| "epoch": 2.898876404494382, |
| "grad_norm": 2.48675799369812, |
| "learning_rate": 4.1143676661552876e-06, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9380343556404114, |
| "num_tokens": 7882945.0, |
| "step": 904 |
| }, |
| { |
| "entropy": 1.1195711493492126, |
| "epoch": 2.902086677367576, |
| "grad_norm": 2.479515552520752, |
| "learning_rate": 4.103938091301479e-06, |
| "loss": 0.1243, |
| "mean_token_accuracy": 0.9570600390434265, |
| "num_tokens": 7890724.0, |
| "step": 905 |
| }, |
| { |
| "entropy": 1.0864347219467163, |
| "epoch": 2.9052969502407704, |
| "grad_norm": 2.0829453468322754, |
| "learning_rate": 4.093512543072729e-06, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9525614678859711, |
| "num_tokens": 7899892.0, |
| "step": 906 |
| }, |
| { |
| "entropy": 1.057499647140503, |
| "epoch": 2.908507223113965, |
| "grad_norm": 2.903841972351074, |
| "learning_rate": 4.08309106831822e-06, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9514180123806, |
| "num_tokens": 7908834.0, |
| "step": 907 |
| }, |
| { |
| "entropy": 1.0348553955554962, |
| "epoch": 2.911717495987159, |
| "grad_norm": 2.450044631958008, |
| "learning_rate": 4.07267371386884e-06, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9236886203289032, |
| "num_tokens": 7919643.0, |
| "step": 908 |
| }, |
| { |
| "entropy": 1.1447957754135132, |
| "epoch": 2.914927768860353, |
| "grad_norm": 2.1943976879119873, |
| "learning_rate": 4.062260526536955e-06, |
| "loss": 0.1434, |
| "mean_token_accuracy": 0.9311514496803284, |
| "num_tokens": 7928989.0, |
| "step": 909 |
| }, |
| { |
| "entropy": 1.2298431992530823, |
| "epoch": 2.918138041733547, |
| "grad_norm": 3.0917184352874756, |
| "learning_rate": 4.051851553116208e-06, |
| "loss": 0.1502, |
| "mean_token_accuracy": 0.9259648025035858, |
| "num_tokens": 7939150.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.2159399390220642, |
| "epoch": 2.9213483146067416, |
| "grad_norm": 2.3988490104675293, |
| "learning_rate": 4.041446840381309e-06, |
| "loss": 0.1005, |
| "mean_token_accuracy": 0.9625458717346191, |
| "num_tokens": 7947256.0, |
| "step": 911 |
| }, |
| { |
| "entropy": 0.9480354189872742, |
| "epoch": 2.924558587479936, |
| "grad_norm": 2.3145833015441895, |
| "learning_rate": 4.0310464350878145e-06, |
| "loss": 0.1322, |
| "mean_token_accuracy": 0.9379940032958984, |
| "num_tokens": 7955862.0, |
| "step": 912 |
| }, |
| { |
| "entropy": 1.056964099407196, |
| "epoch": 2.92776886035313, |
| "grad_norm": 2.7758469581604004, |
| "learning_rate": 4.0206503839719335e-06, |
| "loss": 0.1445, |
| "mean_token_accuracy": 0.9247631430625916, |
| "num_tokens": 7966052.0, |
| "step": 913 |
| }, |
| { |
| "entropy": 1.1305363178253174, |
| "epoch": 2.930979133226324, |
| "grad_norm": 2.4620068073272705, |
| "learning_rate": 4.0102587337503e-06, |
| "loss": 0.1141, |
| "mean_token_accuracy": 0.9462770223617554, |
| "num_tokens": 7974103.0, |
| "step": 914 |
| }, |
| { |
| "entropy": 1.148315191268921, |
| "epoch": 2.9341894060995184, |
| "grad_norm": 8.483851432800293, |
| "learning_rate": 3.999871531119779e-06, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.936420351266861, |
| "num_tokens": 7984482.0, |
| "step": 915 |
| }, |
| { |
| "entropy": 0.9917832612991333, |
| "epoch": 2.937399678972713, |
| "grad_norm": 2.34243106842041, |
| "learning_rate": 3.989488822757244e-06, |
| "loss": 0.1418, |
| "mean_token_accuracy": 0.9488804042339325, |
| "num_tokens": 7993276.0, |
| "step": 916 |
| }, |
| { |
| "entropy": 1.3016346096992493, |
| "epoch": 2.940609951845907, |
| "grad_norm": 5.7551703453063965, |
| "learning_rate": 3.9791106553193746e-06, |
| "loss": 0.1081, |
| "mean_token_accuracy": 0.9585071802139282, |
| "num_tokens": 8001595.0, |
| "step": 917 |
| }, |
| { |
| "entropy": 0.9895669221878052, |
| "epoch": 2.943820224719101, |
| "grad_norm": 2.7677927017211914, |
| "learning_rate": 3.968737075442449e-06, |
| "loss": 0.1007, |
| "mean_token_accuracy": 0.9597472846508026, |
| "num_tokens": 8009133.0, |
| "step": 918 |
| }, |
| { |
| "entropy": 1.2291934490203857, |
| "epoch": 2.947030497592295, |
| "grad_norm": 2.158738851547241, |
| "learning_rate": 3.9583681297421194e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9689956903457642, |
| "num_tokens": 8017442.0, |
| "step": 919 |
| }, |
| { |
| "entropy": 1.0079643726348877, |
| "epoch": 2.9502407704654896, |
| "grad_norm": 2.449575901031494, |
| "learning_rate": 3.9480038648132285e-06, |
| "loss": 0.1454, |
| "mean_token_accuracy": 0.9376430809497833, |
| "num_tokens": 8025695.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.2521326541900635, |
| "epoch": 2.953451043338684, |
| "grad_norm": 2.2050304412841797, |
| "learning_rate": 3.937644327229572e-06, |
| "loss": 0.0997, |
| "mean_token_accuracy": 0.9615428447723389, |
| "num_tokens": 8034102.0, |
| "step": 921 |
| }, |
| { |
| "entropy": 1.1765710711479187, |
| "epoch": 2.956661316211878, |
| "grad_norm": 2.4940969944000244, |
| "learning_rate": 3.927289563543709e-06, |
| "loss": 0.1323, |
| "mean_token_accuracy": 0.9401477873325348, |
| "num_tokens": 8043454.0, |
| "step": 922 |
| }, |
| { |
| "entropy": 1.1329762935638428, |
| "epoch": 2.959871589085072, |
| "grad_norm": 2.112072229385376, |
| "learning_rate": 3.916939620286743e-06, |
| "loss": 0.1073, |
| "mean_token_accuracy": 0.9537553191184998, |
| "num_tokens": 8051936.0, |
| "step": 923 |
| }, |
| { |
| "entropy": 1.1569647192955017, |
| "epoch": 2.9630818619582664, |
| "grad_norm": 2.632157325744629, |
| "learning_rate": 3.906594543968122e-06, |
| "loss": 0.1351, |
| "mean_token_accuracy": 0.9533334970474243, |
| "num_tokens": 8060925.0, |
| "step": 924 |
| }, |
| { |
| "entropy": 1.1497327089309692, |
| "epoch": 2.966292134831461, |
| "grad_norm": 2.5286219120025635, |
| "learning_rate": 3.896254381075416e-06, |
| "loss": 0.143, |
| "mean_token_accuracy": 0.9448714256286621, |
| "num_tokens": 8070286.0, |
| "step": 925 |
| }, |
| { |
| "entropy": 1.2087596654891968, |
| "epoch": 2.969502407704655, |
| "grad_norm": 2.437488317489624, |
| "learning_rate": 3.885919178074116e-06, |
| "loss": 0.1167, |
| "mean_token_accuracy": 0.9381493926048279, |
| "num_tokens": 8080606.0, |
| "step": 926 |
| }, |
| { |
| "entropy": 1.0841256976127625, |
| "epoch": 2.972712680577849, |
| "grad_norm": 2.3981645107269287, |
| "learning_rate": 3.875588981407433e-06, |
| "loss": 0.1118, |
| "mean_token_accuracy": 0.9561174213886261, |
| "num_tokens": 8088771.0, |
| "step": 927 |
| }, |
| { |
| "entropy": 1.31938898563385, |
| "epoch": 2.975922953451043, |
| "grad_norm": 2.942803382873535, |
| "learning_rate": 3.865263837496072e-06, |
| "loss": 0.105, |
| "mean_token_accuracy": 0.9646160304546356, |
| "num_tokens": 8097762.0, |
| "step": 928 |
| }, |
| { |
| "entropy": 1.1591737866401672, |
| "epoch": 2.9791332263242376, |
| "grad_norm": 2.5216867923736572, |
| "learning_rate": 3.854943792738037e-06, |
| "loss": 0.1174, |
| "mean_token_accuracy": 0.9540515542030334, |
| "num_tokens": 8106011.0, |
| "step": 929 |
| }, |
| { |
| "entropy": 1.048449695110321, |
| "epoch": 2.982343499197432, |
| "grad_norm": 4.191019058227539, |
| "learning_rate": 3.844628893508417e-06, |
| "loss": 0.1492, |
| "mean_token_accuracy": 0.9386603832244873, |
| "num_tokens": 8115559.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 1.1279476284980774, |
| "epoch": 2.985553772070626, |
| "grad_norm": 3.519676685333252, |
| "learning_rate": 3.834319186159179e-06, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9528370797634125, |
| "num_tokens": 8125054.0, |
| "step": 931 |
| }, |
| { |
| "entropy": 1.1605662107467651, |
| "epoch": 2.98876404494382, |
| "grad_norm": 1.9513523578643799, |
| "learning_rate": 3.8240147170189575e-06, |
| "loss": 0.1064, |
| "mean_token_accuracy": 0.9599840044975281, |
| "num_tokens": 8134348.0, |
| "step": 932 |
| }, |
| { |
| "entropy": 1.1763730645179749, |
| "epoch": 2.9919743178170144, |
| "grad_norm": 2.3354673385620117, |
| "learning_rate": 3.8137155323928526e-06, |
| "loss": 0.1072, |
| "mean_token_accuracy": 0.9594251811504364, |
| "num_tokens": 8142889.0, |
| "step": 933 |
| }, |
| { |
| "entropy": 1.0491589307785034, |
| "epoch": 2.995184590690209, |
| "grad_norm": 4.819876194000244, |
| "learning_rate": 3.803421678562213e-06, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9479463994503021, |
| "num_tokens": 8151077.0, |
| "step": 934 |
| }, |
| { |
| "entropy": 1.0207865834236145, |
| "epoch": 2.998394863563403, |
| "grad_norm": 2.169591188430786, |
| "learning_rate": 3.7931332017844302e-06, |
| "loss": 0.1391, |
| "mean_token_accuracy": 0.9430664777755737, |
| "num_tokens": 8159670.0, |
| "step": 935 |
| }, |
| { |
| "entropy": 1.1213672161102295, |
| "epoch": 3.0, |
| "grad_norm": 3.4354679584503174, |
| "learning_rate": 3.7828501482927416e-06, |
| "loss": 0.1082, |
| "mean_token_accuracy": 0.9591605067253113, |
| "num_tokens": 8163426.0, |
| "step": 936 |
| }, |
| { |
| "entropy": 1.1103613376617432, |
| "epoch": 3.0032102728731944, |
| "grad_norm": 1.608437180519104, |
| "learning_rate": 3.7725725642960047e-06, |
| "loss": 0.064, |
| "mean_token_accuracy": 0.9727907776832581, |
| "num_tokens": 8172145.0, |
| "step": 937 |
| }, |
| { |
| "entropy": 0.972640872001648, |
| "epoch": 3.0064205457463884, |
| "grad_norm": 1.5159800052642822, |
| "learning_rate": 3.7623004959785066e-06, |
| "loss": 0.0545, |
| "mean_token_accuracy": 0.9817889928817749, |
| "num_tokens": 8181034.0, |
| "step": 938 |
| }, |
| { |
| "entropy": 1.1834158897399902, |
| "epoch": 3.009630818619583, |
| "grad_norm": 1.317238688468933, |
| "learning_rate": 3.752033989499742e-06, |
| "loss": 0.0394, |
| "mean_token_accuracy": 0.9858468174934387, |
| "num_tokens": 8188630.0, |
| "step": 939 |
| }, |
| { |
| "entropy": 1.2065880298614502, |
| "epoch": 3.012841091492777, |
| "grad_norm": 1.3101822137832642, |
| "learning_rate": 3.7417730909942184e-06, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9882438480854034, |
| "num_tokens": 8196618.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.1227167248725891, |
| "epoch": 3.016051364365971, |
| "grad_norm": 2.0894644260406494, |
| "learning_rate": 3.7315178465712364e-06, |
| "loss": 0.0477, |
| "mean_token_accuracy": 0.9829187393188477, |
| "num_tokens": 8205147.0, |
| "step": 941 |
| }, |
| { |
| "entropy": 1.0455093383789062, |
| "epoch": 3.019261637239165, |
| "grad_norm": 1.322717308998108, |
| "learning_rate": 3.721268302314698e-06, |
| "loss": 0.0604, |
| "mean_token_accuracy": 0.9677430689334869, |
| "num_tokens": 8214515.0, |
| "step": 942 |
| }, |
| { |
| "entropy": 1.129750370979309, |
| "epoch": 3.0224719101123596, |
| "grad_norm": 2.6341843605041504, |
| "learning_rate": 3.7110245042828786e-06, |
| "loss": 0.0454, |
| "mean_token_accuracy": 0.9830158650875092, |
| "num_tokens": 8222371.0, |
| "step": 943 |
| }, |
| { |
| "entropy": 0.9655829071998596, |
| "epoch": 3.0256821829855536, |
| "grad_norm": 1.5849435329437256, |
| "learning_rate": 3.70078649850824e-06, |
| "loss": 0.0614, |
| "mean_token_accuracy": 0.9761482775211334, |
| "num_tokens": 8231542.0, |
| "step": 944 |
| }, |
| { |
| "entropy": 1.1203289031982422, |
| "epoch": 3.028892455858748, |
| "grad_norm": 3.3625926971435547, |
| "learning_rate": 3.690554330997215e-06, |
| "loss": 0.0556, |
| "mean_token_accuracy": 0.9775514900684357, |
| "num_tokens": 8240850.0, |
| "step": 945 |
| }, |
| { |
| "entropy": 0.9774525761604309, |
| "epoch": 3.0321027287319424, |
| "grad_norm": 2.0355944633483887, |
| "learning_rate": 3.6803280477299975e-06, |
| "loss": 0.0559, |
| "mean_token_accuracy": 0.9804919064044952, |
| "num_tokens": 8250103.0, |
| "step": 946 |
| }, |
| { |
| "entropy": 0.8719805181026459, |
| "epoch": 3.0353130016051364, |
| "grad_norm": 5.178523540496826, |
| "learning_rate": 3.670107694660343e-06, |
| "loss": 0.0716, |
| "mean_token_accuracy": 0.966615617275238, |
| "num_tokens": 8259351.0, |
| "step": 947 |
| }, |
| { |
| "entropy": 1.1192417740821838, |
| "epoch": 3.038523274478331, |
| "grad_norm": 1.9812731742858887, |
| "learning_rate": 3.659893317715355e-06, |
| "loss": 0.0596, |
| "mean_token_accuracy": 0.9587984681129456, |
| "num_tokens": 8269251.0, |
| "step": 948 |
| }, |
| { |
| "entropy": 1.0000159740447998, |
| "epoch": 3.041733547351525, |
| "grad_norm": 3.3250041007995605, |
| "learning_rate": 3.6496849627952875e-06, |
| "loss": 0.0722, |
| "mean_token_accuracy": 0.972270131111145, |
| "num_tokens": 8278240.0, |
| "step": 949 |
| }, |
| { |
| "entropy": 1.0413598418235779, |
| "epoch": 3.044943820224719, |
| "grad_norm": 1.7375184297561646, |
| "learning_rate": 3.639482675773324e-06, |
| "loss": 0.0618, |
| "mean_token_accuracy": 0.9655102491378784, |
| "num_tokens": 8287483.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.9738638401031494, |
| "epoch": 3.048154093097913, |
| "grad_norm": 1.5906513929367065, |
| "learning_rate": 3.6292865024953945e-06, |
| "loss": 0.0573, |
| "mean_token_accuracy": 0.9786509871482849, |
| "num_tokens": 8295377.0, |
| "step": 951 |
| }, |
| { |
| "entropy": 1.0087870359420776, |
| "epoch": 3.0513643659711076, |
| "grad_norm": 2.6421918869018555, |
| "learning_rate": 3.6190964887799418e-06, |
| "loss": 0.0447, |
| "mean_token_accuracy": 0.9821014106273651, |
| "num_tokens": 8303064.0, |
| "step": 952 |
| }, |
| { |
| "entropy": 0.9565515518188477, |
| "epoch": 3.0545746388443016, |
| "grad_norm": 14.785826683044434, |
| "learning_rate": 3.6089126804177373e-06, |
| "loss": 0.0517, |
| "mean_token_accuracy": 0.9803934693336487, |
| "num_tokens": 8311134.0, |
| "step": 953 |
| }, |
| { |
| "entropy": 1.0219348073005676, |
| "epoch": 3.057784911717496, |
| "grad_norm": 3.0249783992767334, |
| "learning_rate": 3.5987351231716665e-06, |
| "loss": 0.0515, |
| "mean_token_accuracy": 0.9799693524837494, |
| "num_tokens": 8319515.0, |
| "step": 954 |
| }, |
| { |
| "entropy": 1.0492010116577148, |
| "epoch": 3.0609951845906904, |
| "grad_norm": 2.180885076522827, |
| "learning_rate": 3.5885638627765228e-06, |
| "loss": 0.0732, |
| "mean_token_accuracy": 0.9596208930015564, |
| "num_tokens": 8328850.0, |
| "step": 955 |
| }, |
| { |
| "entropy": 1.0292112231254578, |
| "epoch": 3.0642054574638844, |
| "grad_norm": 2.0570931434631348, |
| "learning_rate": 3.5783989449388063e-06, |
| "loss": 0.0559, |
| "mean_token_accuracy": 0.9788411557674408, |
| "num_tokens": 8337971.0, |
| "step": 956 |
| }, |
| { |
| "entropy": 0.9308125078678131, |
| "epoch": 3.067415730337079, |
| "grad_norm": 2.3526527881622314, |
| "learning_rate": 3.568240415336509e-06, |
| "loss": 0.062, |
| "mean_token_accuracy": 0.9766317903995514, |
| "num_tokens": 8347188.0, |
| "step": 957 |
| }, |
| { |
| "entropy": 0.9573177099227905, |
| "epoch": 3.070626003210273, |
| "grad_norm": 2.2333858013153076, |
| "learning_rate": 3.5580883196189265e-06, |
| "loss": 0.0529, |
| "mean_token_accuracy": 0.9760076105594635, |
| "num_tokens": 8355560.0, |
| "step": 958 |
| }, |
| { |
| "entropy": 0.910061776638031, |
| "epoch": 3.073836276083467, |
| "grad_norm": 2.2188963890075684, |
| "learning_rate": 3.547942703406433e-06, |
| "loss": 0.0579, |
| "mean_token_accuracy": 0.9800683856010437, |
| "num_tokens": 8364189.0, |
| "step": 959 |
| }, |
| { |
| "entropy": 0.8721618950366974, |
| "epoch": 3.077046548956661, |
| "grad_norm": 1.6384341716766357, |
| "learning_rate": 3.5378036122902907e-06, |
| "loss": 0.0547, |
| "mean_token_accuracy": 0.9822494089603424, |
| "num_tokens": 8373325.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 1.1164506673812866, |
| "epoch": 3.0802568218298556, |
| "grad_norm": 1.643190860748291, |
| "learning_rate": 3.52767109183244e-06, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9842567443847656, |
| "num_tokens": 8382070.0, |
| "step": 961 |
| }, |
| { |
| "entropy": 0.9429452419281006, |
| "epoch": 3.0834670947030496, |
| "grad_norm": 2.5088934898376465, |
| "learning_rate": 3.5175451875652906e-06, |
| "loss": 0.0445, |
| "mean_token_accuracy": 0.9792550504207611, |
| "num_tokens": 8390101.0, |
| "step": 962 |
| }, |
| { |
| "entropy": 0.895278811454773, |
| "epoch": 3.086677367576244, |
| "grad_norm": 1.734321117401123, |
| "learning_rate": 3.507425944991529e-06, |
| "loss": 0.0407, |
| "mean_token_accuracy": 0.9858044385910034, |
| "num_tokens": 8398284.0, |
| "step": 963 |
| }, |
| { |
| "entropy": 0.9392586350440979, |
| "epoch": 3.0898876404494384, |
| "grad_norm": 2.23935604095459, |
| "learning_rate": 3.4973134095838943e-06, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9840765595436096, |
| "num_tokens": 8406504.0, |
| "step": 964 |
| }, |
| { |
| "entropy": 0.9843576550483704, |
| "epoch": 3.0930979133226324, |
| "grad_norm": 2.003302574157715, |
| "learning_rate": 3.4872076267850015e-06, |
| "loss": 0.0574, |
| "mean_token_accuracy": 0.9778844714164734, |
| "num_tokens": 8415045.0, |
| "step": 965 |
| }, |
| { |
| "entropy": 1.0201025605201721, |
| "epoch": 3.096308186195827, |
| "grad_norm": 3.679013967514038, |
| "learning_rate": 3.4771086420071053e-06, |
| "loss": 0.0511, |
| "mean_token_accuracy": 0.980453222990036, |
| "num_tokens": 8422888.0, |
| "step": 966 |
| }, |
| { |
| "entropy": 1.300868034362793, |
| "epoch": 3.099518459069021, |
| "grad_norm": 2.1512715816497803, |
| "learning_rate": 3.4670165006319236e-06, |
| "loss": 0.0397, |
| "mean_token_accuracy": 0.9860817492008209, |
| "num_tokens": 8431965.0, |
| "step": 967 |
| }, |
| { |
| "entropy": 1.0145321488380432, |
| "epoch": 3.102728731942215, |
| "grad_norm": 5.593048095703125, |
| "learning_rate": 3.4569312480104157e-06, |
| "loss": 0.0457, |
| "mean_token_accuracy": 0.9817441999912262, |
| "num_tokens": 8440345.0, |
| "step": 968 |
| }, |
| { |
| "entropy": 0.9421059787273407, |
| "epoch": 3.105939004815409, |
| "grad_norm": 1.5621843338012695, |
| "learning_rate": 3.4468529294625895e-06, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.977195680141449, |
| "num_tokens": 8448849.0, |
| "step": 969 |
| }, |
| { |
| "entropy": 0.9190913140773773, |
| "epoch": 3.1091492776886036, |
| "grad_norm": 1.9628736972808838, |
| "learning_rate": 3.4367815902772917e-06, |
| "loss": 0.0582, |
| "mean_token_accuracy": 0.9760018289089203, |
| "num_tokens": 8457341.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.0129391252994537, |
| "epoch": 3.1123595505617976, |
| "grad_norm": 1.6691995859146118, |
| "learning_rate": 3.4267172757120005e-06, |
| "loss": 0.0511, |
| "mean_token_accuracy": 0.9774636328220367, |
| "num_tokens": 8465557.0, |
| "step": 971 |
| }, |
| { |
| "entropy": 1.0064606368541718, |
| "epoch": 3.115569823434992, |
| "grad_norm": 6.057915687561035, |
| "learning_rate": 3.416660030992639e-06, |
| "loss": 0.0569, |
| "mean_token_accuracy": 0.9772391021251678, |
| "num_tokens": 8474480.0, |
| "step": 972 |
| }, |
| { |
| "entropy": 1.0960939228534698, |
| "epoch": 3.1187800963081864, |
| "grad_norm": 1.9116007089614868, |
| "learning_rate": 3.406609901313349e-06, |
| "loss": 0.0567, |
| "mean_token_accuracy": 0.9655565321445465, |
| "num_tokens": 8484235.0, |
| "step": 973 |
| }, |
| { |
| "entropy": 0.9288766980171204, |
| "epoch": 3.1219903691813804, |
| "grad_norm": 11.40886116027832, |
| "learning_rate": 3.396566931836308e-06, |
| "loss": 0.0723, |
| "mean_token_accuracy": 0.9614908397197723, |
| "num_tokens": 8494366.0, |
| "step": 974 |
| }, |
| { |
| "entropy": 0.9967909157276154, |
| "epoch": 3.125200642054575, |
| "grad_norm": 1.3702481985092163, |
| "learning_rate": 3.386531167691512e-06, |
| "loss": 0.053, |
| "mean_token_accuracy": 0.978669673204422, |
| "num_tokens": 8504740.0, |
| "step": 975 |
| }, |
| { |
| "entropy": 1.1818091869354248, |
| "epoch": 3.128410914927769, |
| "grad_norm": 1.1393635272979736, |
| "learning_rate": 3.3765026539765832e-06, |
| "loss": 0.0232, |
| "mean_token_accuracy": 0.9925740659236908, |
| "num_tokens": 8513110.0, |
| "step": 976 |
| }, |
| { |
| "entropy": 0.9709301590919495, |
| "epoch": 3.131621187800963, |
| "grad_norm": 2.256098747253418, |
| "learning_rate": 3.36648143575656e-06, |
| "loss": 0.0641, |
| "mean_token_accuracy": 0.9657347202301025, |
| "num_tokens": 8523348.0, |
| "step": 977 |
| }, |
| { |
| "entropy": 0.8825556635856628, |
| "epoch": 3.134831460674157, |
| "grad_norm": 7.103281497955322, |
| "learning_rate": 3.3564675580636946e-06, |
| "loss": 0.0543, |
| "mean_token_accuracy": 0.9809089303016663, |
| "num_tokens": 8531559.0, |
| "step": 978 |
| }, |
| { |
| "entropy": 0.8873893618583679, |
| "epoch": 3.1380417335473516, |
| "grad_norm": 1.9619849920272827, |
| "learning_rate": 3.3464610658972584e-06, |
| "loss": 0.0503, |
| "mean_token_accuracy": 0.9831579029560089, |
| "num_tokens": 8539982.0, |
| "step": 979 |
| }, |
| { |
| "entropy": 1.0100898146629333, |
| "epoch": 3.1412520064205456, |
| "grad_norm": 1.5911046266555786, |
| "learning_rate": 3.3364620042233316e-06, |
| "loss": 0.0325, |
| "mean_token_accuracy": 0.9892257153987885, |
| "num_tokens": 8548033.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.9736209511756897, |
| "epoch": 3.14446227929374, |
| "grad_norm": 6.580364227294922, |
| "learning_rate": 3.326470417974604e-06, |
| "loss": 0.0827, |
| "mean_token_accuracy": 0.9663437604904175, |
| "num_tokens": 8556845.0, |
| "step": 981 |
| }, |
| { |
| "entropy": 0.9981383681297302, |
| "epoch": 3.1476725521669344, |
| "grad_norm": 2.606773853302002, |
| "learning_rate": 3.3164863520501744e-06, |
| "loss": 0.0511, |
| "mean_token_accuracy": 0.982105016708374, |
| "num_tokens": 8565025.0, |
| "step": 982 |
| }, |
| { |
| "entropy": 0.8959035575389862, |
| "epoch": 3.1508828250401284, |
| "grad_norm": 1.872805118560791, |
| "learning_rate": 3.3065098513153473e-06, |
| "loss": 0.0516, |
| "mean_token_accuracy": 0.9818290174007416, |
| "num_tokens": 8573640.0, |
| "step": 983 |
| }, |
| { |
| "entropy": 1.0557291507720947, |
| "epoch": 3.154093097913323, |
| "grad_norm": 1.5255221128463745, |
| "learning_rate": 3.29654096060143e-06, |
| "loss": 0.0645, |
| "mean_token_accuracy": 0.9671049416065216, |
| "num_tokens": 8581760.0, |
| "step": 984 |
| }, |
| { |
| "entropy": 0.899553507566452, |
| "epoch": 3.157303370786517, |
| "grad_norm": 1.7938666343688965, |
| "learning_rate": 3.2865797247055354e-06, |
| "loss": 0.044, |
| "mean_token_accuracy": 0.9840608239173889, |
| "num_tokens": 8589385.0, |
| "step": 985 |
| }, |
| { |
| "entropy": 0.9979848265647888, |
| "epoch": 3.160513643659711, |
| "grad_norm": 2.103757381439209, |
| "learning_rate": 3.2766261883903744e-06, |
| "loss": 0.0547, |
| "mean_token_accuracy": 0.9734188914299011, |
| "num_tokens": 8597447.0, |
| "step": 986 |
| }, |
| { |
| "entropy": 0.9901096820831299, |
| "epoch": 3.163723916532905, |
| "grad_norm": 2.168027877807617, |
| "learning_rate": 3.266680396384061e-06, |
| "loss": 0.0601, |
| "mean_token_accuracy": 0.9723277390003204, |
| "num_tokens": 8606238.0, |
| "step": 987 |
| }, |
| { |
| "entropy": 0.9194740653038025, |
| "epoch": 3.1669341894060996, |
| "grad_norm": 2.060809373855591, |
| "learning_rate": 3.256742393379909e-06, |
| "loss": 0.0671, |
| "mean_token_accuracy": 0.9673266708850861, |
| "num_tokens": 8614880.0, |
| "step": 988 |
| }, |
| { |
| "entropy": 1.0402765274047852, |
| "epoch": 3.1701444622792936, |
| "grad_norm": 2.8911445140838623, |
| "learning_rate": 3.2468122240362287e-06, |
| "loss": 0.0595, |
| "mean_token_accuracy": 0.9752613008022308, |
| "num_tokens": 8623325.0, |
| "step": 989 |
| }, |
| { |
| "entropy": 1.0076416432857513, |
| "epoch": 3.173354735152488, |
| "grad_norm": 2.1394102573394775, |
| "learning_rate": 3.2368899329761316e-06, |
| "loss": 0.074, |
| "mean_token_accuracy": 0.9638822078704834, |
| "num_tokens": 8633417.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.98023921251297, |
| "epoch": 3.176565008025682, |
| "grad_norm": 1.8175908327102661, |
| "learning_rate": 3.226975564787322e-06, |
| "loss": 0.069, |
| "mean_token_accuracy": 0.9612008631229401, |
| "num_tokens": 8643761.0, |
| "step": 991 |
| }, |
| { |
| "entropy": 0.9907765090465546, |
| "epoch": 3.1797752808988764, |
| "grad_norm": 1.614745020866394, |
| "learning_rate": 3.2170691640219077e-06, |
| "loss": 0.0516, |
| "mean_token_accuracy": 0.9799723327159882, |
| "num_tokens": 8652126.0, |
| "step": 992 |
| }, |
| { |
| "entropy": 1.0575563311576843, |
| "epoch": 3.182985553772071, |
| "grad_norm": 2.4234402179718018, |
| "learning_rate": 3.2071707751961838e-06, |
| "loss": 0.0795, |
| "mean_token_accuracy": 0.9665969610214233, |
| "num_tokens": 8660420.0, |
| "step": 993 |
| }, |
| { |
| "entropy": 1.0722668170928955, |
| "epoch": 3.186195826645265, |
| "grad_norm": 2.0058796405792236, |
| "learning_rate": 3.197280442790455e-06, |
| "loss": 0.0515, |
| "mean_token_accuracy": 0.9787575006484985, |
| "num_tokens": 8669423.0, |
| "step": 994 |
| }, |
| { |
| "entropy": 0.9013467729091644, |
| "epoch": 3.189406099518459, |
| "grad_norm": 1.7118332386016846, |
| "learning_rate": 3.187398211248811e-06, |
| "loss": 0.0772, |
| "mean_token_accuracy": 0.9585808515548706, |
| "num_tokens": 8678899.0, |
| "step": 995 |
| }, |
| { |
| "entropy": 0.9303914904594421, |
| "epoch": 3.192616372391653, |
| "grad_norm": 1.357763409614563, |
| "learning_rate": 3.1775241249789434e-06, |
| "loss": 0.0419, |
| "mean_token_accuracy": 0.9790163636207581, |
| "num_tokens": 8687793.0, |
| "step": 996 |
| }, |
| { |
| "entropy": 0.9078644514083862, |
| "epoch": 3.1958266452648476, |
| "grad_norm": 1.659339427947998, |
| "learning_rate": 3.1676582283519454e-06, |
| "loss": 0.0566, |
| "mean_token_accuracy": 0.9744837284088135, |
| "num_tokens": 8696488.0, |
| "step": 997 |
| }, |
| { |
| "entropy": 1.036493569612503, |
| "epoch": 3.1990369181380416, |
| "grad_norm": 2.6456494331359863, |
| "learning_rate": 3.1578005657021004e-06, |
| "loss": 0.0325, |
| "mean_token_accuracy": 0.9887253046035767, |
| "num_tokens": 8704137.0, |
| "step": 998 |
| }, |
| { |
| "entropy": 0.9375521242618561, |
| "epoch": 3.202247191011236, |
| "grad_norm": 2.551023483276367, |
| "learning_rate": 3.1479511813267006e-06, |
| "loss": 0.0474, |
| "mean_token_accuracy": 0.9822751581668854, |
| "num_tokens": 8712176.0, |
| "step": 999 |
| }, |
| { |
| "entropy": 0.9312825500965118, |
| "epoch": 3.20545746388443, |
| "grad_norm": 1.506113052368164, |
| "learning_rate": 3.1381101194858264e-06, |
| "loss": 0.0443, |
| "mean_token_accuracy": 0.9840706288814545, |
| "num_tokens": 8720821.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.1557820439338684, |
| "epoch": 3.2086677367576244, |
| "grad_norm": 1.624644160270691, |
| "learning_rate": 3.1282774244021717e-06, |
| "loss": 0.0499, |
| "mean_token_accuracy": 0.9834720492362976, |
| "num_tokens": 8730703.0, |
| "step": 1001 |
| }, |
| { |
| "entropy": 1.0071000158786774, |
| "epoch": 3.211878009630819, |
| "grad_norm": 2.3831636905670166, |
| "learning_rate": 3.118453140260823e-06, |
| "loss": 0.0435, |
| "mean_token_accuracy": 0.9820217788219452, |
| "num_tokens": 8738677.0, |
| "step": 1002 |
| }, |
| { |
| "entropy": 1.228837251663208, |
| "epoch": 3.215088282504013, |
| "grad_norm": 15.644500732421875, |
| "learning_rate": 3.1086373112090762e-06, |
| "loss": 0.0451, |
| "mean_token_accuracy": 0.9836629033088684, |
| "num_tokens": 8747024.0, |
| "step": 1003 |
| }, |
| { |
| "entropy": 0.9827360510826111, |
| "epoch": 3.218298555377207, |
| "grad_norm": 2.2126834392547607, |
| "learning_rate": 3.0988299813562304e-06, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.9848807752132416, |
| "num_tokens": 8754758.0, |
| "step": 1004 |
| }, |
| { |
| "entropy": 1.0021247863769531, |
| "epoch": 3.221508828250401, |
| "grad_norm": 2.114436388015747, |
| "learning_rate": 3.089031194773392e-06, |
| "loss": 0.0595, |
| "mean_token_accuracy": 0.9792338311672211, |
| "num_tokens": 8763540.0, |
| "step": 1005 |
| }, |
| { |
| "entropy": 1.1125357747077942, |
| "epoch": 3.2247191011235956, |
| "grad_norm": 3.2401301860809326, |
| "learning_rate": 3.079240995493279e-06, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.9815241992473602, |
| "num_tokens": 8772746.0, |
| "step": 1006 |
| }, |
| { |
| "entropy": 0.9442520439624786, |
| "epoch": 3.2279293739967896, |
| "grad_norm": 1.4487026929855347, |
| "learning_rate": 3.069459427510014e-06, |
| "loss": 0.0349, |
| "mean_token_accuracy": 0.9873338937759399, |
| "num_tokens": 8781167.0, |
| "step": 1007 |
| }, |
| { |
| "entropy": 0.9544734358787537, |
| "epoch": 3.231139646869984, |
| "grad_norm": 1.6530356407165527, |
| "learning_rate": 3.0596865347789444e-06, |
| "loss": 0.0764, |
| "mean_token_accuracy": 0.9499466717243195, |
| "num_tokens": 8791131.0, |
| "step": 1008 |
| }, |
| { |
| "entropy": 0.9192224740982056, |
| "epoch": 3.234349919743178, |
| "grad_norm": 2.113713264465332, |
| "learning_rate": 3.049922361216422e-06, |
| "loss": 0.0597, |
| "mean_token_accuracy": 0.9740520119667053, |
| "num_tokens": 8799470.0, |
| "step": 1009 |
| }, |
| { |
| "entropy": 0.9941827058792114, |
| "epoch": 3.2375601926163724, |
| "grad_norm": 2.598651885986328, |
| "learning_rate": 3.040166950699626e-06, |
| "loss": 0.0663, |
| "mean_token_accuracy": 0.979779839515686, |
| "num_tokens": 8809349.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.0034226775169373, |
| "epoch": 3.240770465489567, |
| "grad_norm": 1.9838776588439941, |
| "learning_rate": 3.0304203470663507e-06, |
| "loss": 0.0445, |
| "mean_token_accuracy": 0.9826087951660156, |
| "num_tokens": 8816640.0, |
| "step": 1011 |
| }, |
| { |
| "entropy": 1.0576593279838562, |
| "epoch": 3.243980738362761, |
| "grad_norm": 2.287182569503784, |
| "learning_rate": 3.0206825941148203e-06, |
| "loss": 0.0445, |
| "mean_token_accuracy": 0.9830202162265778, |
| "num_tokens": 8824940.0, |
| "step": 1012 |
| }, |
| { |
| "entropy": 0.8799232840538025, |
| "epoch": 3.247191011235955, |
| "grad_norm": 1.7231913805007935, |
| "learning_rate": 3.0109537356034856e-06, |
| "loss": 0.0692, |
| "mean_token_accuracy": 0.964619368314743, |
| "num_tokens": 8835331.0, |
| "step": 1013 |
| }, |
| { |
| "entropy": 0.915042519569397, |
| "epoch": 3.250401284109149, |
| "grad_norm": 2.0848681926727295, |
| "learning_rate": 3.001233815250823e-06, |
| "loss": 0.0769, |
| "mean_token_accuracy": 0.9604291617870331, |
| "num_tokens": 8845517.0, |
| "step": 1014 |
| }, |
| { |
| "entropy": 0.9062038064002991, |
| "epoch": 3.2536115569823436, |
| "grad_norm": 2.325322151184082, |
| "learning_rate": 2.991522876735154e-06, |
| "loss": 0.088, |
| "mean_token_accuracy": 0.9585862755775452, |
| "num_tokens": 8855134.0, |
| "step": 1015 |
| }, |
| { |
| "entropy": 0.9652998447418213, |
| "epoch": 3.2568218298555376, |
| "grad_norm": 2.039144277572632, |
| "learning_rate": 2.981820963694427e-06, |
| "loss": 0.0369, |
| "mean_token_accuracy": 0.986418753862381, |
| "num_tokens": 8863208.0, |
| "step": 1016 |
| }, |
| { |
| "entropy": 0.9741517305374146, |
| "epoch": 3.260032102728732, |
| "grad_norm": 1.9456210136413574, |
| "learning_rate": 2.9721281197260427e-06, |
| "loss": 0.0621, |
| "mean_token_accuracy": 0.9771439135074615, |
| "num_tokens": 8871148.0, |
| "step": 1017 |
| }, |
| { |
| "entropy": 1.0131028890609741, |
| "epoch": 3.263242375601926, |
| "grad_norm": 2.170149803161621, |
| "learning_rate": 2.9624443883866403e-06, |
| "loss": 0.043, |
| "mean_token_accuracy": 0.9852020740509033, |
| "num_tokens": 8880453.0, |
| "step": 1018 |
| }, |
| { |
| "entropy": 1.0579794645309448, |
| "epoch": 3.2664526484751204, |
| "grad_norm": 1.5485424995422363, |
| "learning_rate": 2.9527698131919156e-06, |
| "loss": 0.038, |
| "mean_token_accuracy": 0.9849701225757599, |
| "num_tokens": 8888958.0, |
| "step": 1019 |
| }, |
| { |
| "entropy": 1.0165627002716064, |
| "epoch": 3.2696629213483144, |
| "grad_norm": 2.5804836750030518, |
| "learning_rate": 2.9431044376164165e-06, |
| "loss": 0.0444, |
| "mean_token_accuracy": 0.9853474199771881, |
| "num_tokens": 8896866.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.9434081315994263, |
| "epoch": 3.272873194221509, |
| "grad_norm": 1.672654151916504, |
| "learning_rate": 2.9334483050933506e-06, |
| "loss": 0.0778, |
| "mean_token_accuracy": 0.9503156840801239, |
| "num_tokens": 8906557.0, |
| "step": 1021 |
| }, |
| { |
| "entropy": 0.9355335235595703, |
| "epoch": 3.276083467094703, |
| "grad_norm": 2.510097026824951, |
| "learning_rate": 2.9238014590143925e-06, |
| "loss": 0.0485, |
| "mean_token_accuracy": 0.9850959777832031, |
| "num_tokens": 8914286.0, |
| "step": 1022 |
| }, |
| { |
| "entropy": 0.8669168055057526, |
| "epoch": 3.279293739967897, |
| "grad_norm": 9.337780952453613, |
| "learning_rate": 2.91416394272948e-06, |
| "loss": 0.06, |
| "mean_token_accuracy": 0.9728347659111023, |
| "num_tokens": 8922690.0, |
| "step": 1023 |
| }, |
| { |
| "entropy": 0.9273563027381897, |
| "epoch": 3.2825040128410916, |
| "grad_norm": 2.416398525238037, |
| "learning_rate": 2.904535799546636e-06, |
| "loss": 0.054, |
| "mean_token_accuracy": 0.9759286046028137, |
| "num_tokens": 8931721.0, |
| "step": 1024 |
| }, |
| { |
| "entropy": 0.914330780506134, |
| "epoch": 3.2857142857142856, |
| "grad_norm": 1.6186331510543823, |
| "learning_rate": 2.894917072731753e-06, |
| "loss": 0.0362, |
| "mean_token_accuracy": 0.9858781099319458, |
| "num_tokens": 8939293.0, |
| "step": 1025 |
| }, |
| { |
| "entropy": 0.9978557825088501, |
| "epoch": 3.28892455858748, |
| "grad_norm": 1.6200190782546997, |
| "learning_rate": 2.8853078055084192e-06, |
| "loss": 0.0465, |
| "mean_token_accuracy": 0.9847012162208557, |
| "num_tokens": 8947140.0, |
| "step": 1026 |
| }, |
| { |
| "entropy": 1.1273800134658813, |
| "epoch": 3.292134831460674, |
| "grad_norm": 1.6572569608688354, |
| "learning_rate": 2.8757080410577042e-06, |
| "loss": 0.0402, |
| "mean_token_accuracy": 0.9817869663238525, |
| "num_tokens": 8955096.0, |
| "step": 1027 |
| }, |
| { |
| "entropy": 0.9369406700134277, |
| "epoch": 3.2953451043338684, |
| "grad_norm": 1.2704312801361084, |
| "learning_rate": 2.866117822517982e-06, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9776290953159332, |
| "num_tokens": 8963424.0, |
| "step": 1028 |
| }, |
| { |
| "entropy": 1.066481113433838, |
| "epoch": 3.2985553772070624, |
| "grad_norm": 2.7775206565856934, |
| "learning_rate": 2.8565371929847286e-06, |
| "loss": 0.0698, |
| "mean_token_accuracy": 0.9550573229789734, |
| "num_tokens": 8973499.0, |
| "step": 1029 |
| }, |
| { |
| "entropy": 1.1146639585494995, |
| "epoch": 3.301765650080257, |
| "grad_norm": 2.5436532497406006, |
| "learning_rate": 2.846966195510332e-06, |
| "loss": 0.0671, |
| "mean_token_accuracy": 0.9698319137096405, |
| "num_tokens": 8982236.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.9178177118301392, |
| "epoch": 3.304975922953451, |
| "grad_norm": 1.7456978559494019, |
| "learning_rate": 2.83740487310389e-06, |
| "loss": 0.0553, |
| "mean_token_accuracy": 0.9707920253276825, |
| "num_tokens": 8990852.0, |
| "step": 1031 |
| }, |
| { |
| "entropy": 1.0382152795791626, |
| "epoch": 3.308186195826645, |
| "grad_norm": 1.6169335842132568, |
| "learning_rate": 2.82785326873103e-06, |
| "loss": 0.0378, |
| "mean_token_accuracy": 0.9834270775318146, |
| "num_tokens": 8998939.0, |
| "step": 1032 |
| }, |
| { |
| "entropy": 0.993056982755661, |
| "epoch": 3.3113964686998396, |
| "grad_norm": 1.7227951288223267, |
| "learning_rate": 2.81831142531371e-06, |
| "loss": 0.0483, |
| "mean_token_accuracy": 0.9841017127037048, |
| "num_tokens": 9008345.0, |
| "step": 1033 |
| }, |
| { |
| "entropy": 1.0055307149887085, |
| "epoch": 3.3146067415730336, |
| "grad_norm": 1.711963176727295, |
| "learning_rate": 2.8087793857300193e-06, |
| "loss": 0.0533, |
| "mean_token_accuracy": 0.9790107905864716, |
| "num_tokens": 9017254.0, |
| "step": 1034 |
| }, |
| { |
| "entropy": 1.0133417248725891, |
| "epoch": 3.317817014446228, |
| "grad_norm": 2.1641650199890137, |
| "learning_rate": 2.7992571928139984e-06, |
| "loss": 0.0595, |
| "mean_token_accuracy": 0.9752263724803925, |
| "num_tokens": 9026056.0, |
| "step": 1035 |
| }, |
| { |
| "entropy": 0.9465528428554535, |
| "epoch": 3.321027287319422, |
| "grad_norm": 10.7071533203125, |
| "learning_rate": 2.7897448893554335e-06, |
| "loss": 0.0466, |
| "mean_token_accuracy": 0.9831990897655487, |
| "num_tokens": 9034411.0, |
| "step": 1036 |
| }, |
| { |
| "entropy": 0.9765456318855286, |
| "epoch": 3.3242375601926164, |
| "grad_norm": 2.850576400756836, |
| "learning_rate": 2.780242518099675e-06, |
| "loss": 0.0477, |
| "mean_token_accuracy": 0.9827427566051483, |
| "num_tokens": 9043094.0, |
| "step": 1037 |
| }, |
| { |
| "entropy": 0.9502105414867401, |
| "epoch": 3.3274478330658104, |
| "grad_norm": 3.9407103061676025, |
| "learning_rate": 2.7707501217474443e-06, |
| "loss": 0.0483, |
| "mean_token_accuracy": 0.9828430116176605, |
| "num_tokens": 9050481.0, |
| "step": 1038 |
| }, |
| { |
| "entropy": 0.962793231010437, |
| "epoch": 3.330658105939005, |
| "grad_norm": 1.6887401342391968, |
| "learning_rate": 2.761267742954629e-06, |
| "loss": 0.0414, |
| "mean_token_accuracy": 0.9845134019851685, |
| "num_tokens": 9058366.0, |
| "step": 1039 |
| }, |
| { |
| "entropy": 0.9595210254192352, |
| "epoch": 3.333868378812199, |
| "grad_norm": 1.9709408283233643, |
| "learning_rate": 2.7517954243321097e-06, |
| "loss": 0.0461, |
| "mean_token_accuracy": 0.9848853349685669, |
| "num_tokens": 9066272.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 1.081430196762085, |
| "epoch": 3.337078651685393, |
| "grad_norm": 1.9100728034973145, |
| "learning_rate": 2.7423332084455543e-06, |
| "loss": 0.0562, |
| "mean_token_accuracy": 0.9827399849891663, |
| "num_tokens": 9074883.0, |
| "step": 1041 |
| }, |
| { |
| "entropy": 1.0304247736930847, |
| "epoch": 3.3402889245585876, |
| "grad_norm": 2.094480276107788, |
| "learning_rate": 2.7328811378152355e-06, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.9760044515132904, |
| "num_tokens": 9083403.0, |
| "step": 1042 |
| }, |
| { |
| "entropy": 0.9497547447681427, |
| "epoch": 3.3434991974317816, |
| "grad_norm": 2.770940065383911, |
| "learning_rate": 2.723439254915834e-06, |
| "loss": 0.0674, |
| "mean_token_accuracy": 0.9691638946533203, |
| "num_tokens": 9093305.0, |
| "step": 1043 |
| }, |
| { |
| "entropy": 0.9552264213562012, |
| "epoch": 3.346709470304976, |
| "grad_norm": 1.8198769092559814, |
| "learning_rate": 2.714007602176254e-06, |
| "loss": 0.0561, |
| "mean_token_accuracy": 0.9818349182605743, |
| "num_tokens": 9101874.0, |
| "step": 1044 |
| }, |
| { |
| "entropy": 1.0001209378242493, |
| "epoch": 3.34991974317817, |
| "grad_norm": 2.130765199661255, |
| "learning_rate": 2.704586221979422e-06, |
| "loss": 0.0443, |
| "mean_token_accuracy": 0.9807817041873932, |
| "num_tokens": 9111703.0, |
| "step": 1045 |
| }, |
| { |
| "entropy": 1.0091004967689514, |
| "epoch": 3.3531300160513644, |
| "grad_norm": 1.6668496131896973, |
| "learning_rate": 2.695175156662107e-06, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9821631014347076, |
| "num_tokens": 9119776.0, |
| "step": 1046 |
| }, |
| { |
| "entropy": 0.9886317849159241, |
| "epoch": 3.3563402889245584, |
| "grad_norm": 2.097139596939087, |
| "learning_rate": 2.6857744485147286e-06, |
| "loss": 0.0687, |
| "mean_token_accuracy": 0.972685307264328, |
| "num_tokens": 9129487.0, |
| "step": 1047 |
| }, |
| { |
| "entropy": 0.8918091654777527, |
| "epoch": 3.359550561797753, |
| "grad_norm": 3.4406795501708984, |
| "learning_rate": 2.6763841397811576e-06, |
| "loss": 0.0656, |
| "mean_token_accuracy": 0.9593222439289093, |
| "num_tokens": 9140250.0, |
| "step": 1048 |
| }, |
| { |
| "entropy": 1.0260462164878845, |
| "epoch": 3.362760834670947, |
| "grad_norm": 1.657132863998413, |
| "learning_rate": 2.667004272658541e-06, |
| "loss": 0.052, |
| "mean_token_accuracy": 0.9808726608753204, |
| "num_tokens": 9149142.0, |
| "step": 1049 |
| }, |
| { |
| "entropy": 0.9491671919822693, |
| "epoch": 3.365971107544141, |
| "grad_norm": 2.0722362995147705, |
| "learning_rate": 2.6576348892970947e-06, |
| "loss": 0.0456, |
| "mean_token_accuracy": 0.9820691347122192, |
| "num_tokens": 9156975.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 1.0107131600379944, |
| "epoch": 3.3691813804173356, |
| "grad_norm": 1.412963628768921, |
| "learning_rate": 2.6482760317999338e-06, |
| "loss": 0.0466, |
| "mean_token_accuracy": 0.9728775322437286, |
| "num_tokens": 9165612.0, |
| "step": 1051 |
| }, |
| { |
| "entropy": 0.9788424372673035, |
| "epoch": 3.3723916532905296, |
| "grad_norm": 1.7426668405532837, |
| "learning_rate": 2.638927742222868e-06, |
| "loss": 0.0434, |
| "mean_token_accuracy": 0.9835657775402069, |
| "num_tokens": 9173886.0, |
| "step": 1052 |
| }, |
| { |
| "entropy": 0.9652360081672668, |
| "epoch": 3.375601926163724, |
| "grad_norm": 1.7111523151397705, |
| "learning_rate": 2.629590062574221e-06, |
| "loss": 0.0488, |
| "mean_token_accuracy": 0.9817163050174713, |
| "num_tokens": 9182802.0, |
| "step": 1053 |
| }, |
| { |
| "entropy": 0.9846494793891907, |
| "epoch": 3.378812199036918, |
| "grad_norm": 2.427800178527832, |
| "learning_rate": 2.6202630348146323e-06, |
| "loss": 0.0692, |
| "mean_token_accuracy": 0.970174103975296, |
| "num_tokens": 9191462.0, |
| "step": 1054 |
| }, |
| { |
| "entropy": 1.0558529496192932, |
| "epoch": 3.3820224719101124, |
| "grad_norm": 1.4618818759918213, |
| "learning_rate": 2.610946700856885e-06, |
| "loss": 0.0409, |
| "mean_token_accuracy": 0.9835179150104523, |
| "num_tokens": 9200009.0, |
| "step": 1055 |
| }, |
| { |
| "entropy": 0.8598636388778687, |
| "epoch": 3.3852327447833064, |
| "grad_norm": 1.6279900074005127, |
| "learning_rate": 2.6016411025656973e-06, |
| "loss": 0.0749, |
| "mean_token_accuracy": 0.9546991288661957, |
| "num_tokens": 9209206.0, |
| "step": 1056 |
| }, |
| { |
| "entropy": 0.9998753070831299, |
| "epoch": 3.388443017656501, |
| "grad_norm": 2.4744317531585693, |
| "learning_rate": 2.592346281757552e-06, |
| "loss": 0.0532, |
| "mean_token_accuracy": 0.9732007086277008, |
| "num_tokens": 9218881.0, |
| "step": 1057 |
| }, |
| { |
| "entropy": 0.7904504835605621, |
| "epoch": 3.391653290529695, |
| "grad_norm": 1.8489923477172852, |
| "learning_rate": 2.583062280200501e-06, |
| "loss": 0.0701, |
| "mean_token_accuracy": 0.9678979218006134, |
| "num_tokens": 9227741.0, |
| "step": 1058 |
| }, |
| { |
| "entropy": 0.8975243866443634, |
| "epoch": 3.394863563402889, |
| "grad_norm": 10.23280143737793, |
| "learning_rate": 2.5737891396139713e-06, |
| "loss": 0.0623, |
| "mean_token_accuracy": 0.9779176414012909, |
| "num_tokens": 9235749.0, |
| "step": 1059 |
| }, |
| { |
| "entropy": 0.9414438307285309, |
| "epoch": 3.3980738362760836, |
| "grad_norm": 2.947763681411743, |
| "learning_rate": 2.5645269016685905e-06, |
| "loss": 0.0729, |
| "mean_token_accuracy": 0.9428462088108063, |
| "num_tokens": 9247768.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.9279659390449524, |
| "epoch": 3.4012841091492776, |
| "grad_norm": 2.126018762588501, |
| "learning_rate": 2.5552756079859904e-06, |
| "loss": 0.0415, |
| "mean_token_accuracy": 0.9858635365962982, |
| "num_tokens": 9255066.0, |
| "step": 1061 |
| }, |
| { |
| "entropy": 1.1867004036903381, |
| "epoch": 3.404494382022472, |
| "grad_norm": 1.8234483003616333, |
| "learning_rate": 2.5460353001386263e-06, |
| "loss": 0.0393, |
| "mean_token_accuracy": 0.9828614294528961, |
| "num_tokens": 9263725.0, |
| "step": 1062 |
| }, |
| { |
| "entropy": 0.9649862051010132, |
| "epoch": 3.407704654895666, |
| "grad_norm": 1.4239985942840576, |
| "learning_rate": 2.5368060196495785e-06, |
| "loss": 0.0272, |
| "mean_token_accuracy": 0.9885632693767548, |
| "num_tokens": 9272284.0, |
| "step": 1063 |
| }, |
| { |
| "entropy": 1.181826651096344, |
| "epoch": 3.4109149277688604, |
| "grad_norm": 1.85142982006073, |
| "learning_rate": 2.527587807992383e-06, |
| "loss": 0.0486, |
| "mean_token_accuracy": 0.9821223318576813, |
| "num_tokens": 9281554.0, |
| "step": 1064 |
| }, |
| { |
| "entropy": 0.816430002450943, |
| "epoch": 3.4141252006420544, |
| "grad_norm": 1.6388859748840332, |
| "learning_rate": 2.5183807065908296e-06, |
| "loss": 0.0936, |
| "mean_token_accuracy": 0.9386367201805115, |
| "num_tokens": 9292509.0, |
| "step": 1065 |
| }, |
| { |
| "entropy": 0.7913811504840851, |
| "epoch": 3.417335473515249, |
| "grad_norm": 1.6456538438796997, |
| "learning_rate": 2.5091847568187834e-06, |
| "loss": 0.0583, |
| "mean_token_accuracy": 0.9755530953407288, |
| "num_tokens": 9301615.0, |
| "step": 1066 |
| }, |
| { |
| "entropy": 1.036412626504898, |
| "epoch": 3.420545746388443, |
| "grad_norm": 1.9353599548339844, |
| "learning_rate": 2.5000000000000015e-06, |
| "loss": 0.0503, |
| "mean_token_accuracy": 0.9810832142829895, |
| "num_tokens": 9310589.0, |
| "step": 1067 |
| }, |
| { |
| "entropy": 0.7897546291351318, |
| "epoch": 3.423756019261637, |
| "grad_norm": 2.1252787113189697, |
| "learning_rate": 2.4908264774079355e-06, |
| "loss": 0.0568, |
| "mean_token_accuracy": 0.9774205684661865, |
| "num_tokens": 9319064.0, |
| "step": 1068 |
| }, |
| { |
| "entropy": 1.031320035457611, |
| "epoch": 3.4269662921348316, |
| "grad_norm": 1.4396086931228638, |
| "learning_rate": 2.4816642302655634e-06, |
| "loss": 0.0328, |
| "mean_token_accuracy": 0.9882897734642029, |
| "num_tokens": 9327453.0, |
| "step": 1069 |
| }, |
| { |
| "entropy": 0.9724080562591553, |
| "epoch": 3.4301765650080256, |
| "grad_norm": 2.4586398601531982, |
| "learning_rate": 2.4725132997451833e-06, |
| "loss": 0.0551, |
| "mean_token_accuracy": 0.9825824201107025, |
| "num_tokens": 9336542.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.9164369702339172, |
| "epoch": 3.43338683788122, |
| "grad_norm": 2.929311513900757, |
| "learning_rate": 2.4633737269682546e-06, |
| "loss": 0.0558, |
| "mean_token_accuracy": 0.9753805994987488, |
| "num_tokens": 9345800.0, |
| "step": 1071 |
| }, |
| { |
| "entropy": 0.8264903128147125, |
| "epoch": 3.436597110754414, |
| "grad_norm": 1.931374192237854, |
| "learning_rate": 2.454245553005184e-06, |
| "loss": 0.0537, |
| "mean_token_accuracy": 0.978405624628067, |
| "num_tokens": 9354054.0, |
| "step": 1072 |
| }, |
| { |
| "entropy": 0.8463916182518005, |
| "epoch": 3.4398073836276084, |
| "grad_norm": 1.8540114164352417, |
| "learning_rate": 2.445128818875166e-06, |
| "loss": 0.0593, |
| "mean_token_accuracy": 0.9726223349571228, |
| "num_tokens": 9363638.0, |
| "step": 1073 |
| }, |
| { |
| "entropy": 0.8569443821907043, |
| "epoch": 3.4430176565008024, |
| "grad_norm": 1.8228055238723755, |
| "learning_rate": 2.4360235655459804e-06, |
| "loss": 0.0325, |
| "mean_token_accuracy": 0.9868163168430328, |
| "num_tokens": 9371525.0, |
| "step": 1074 |
| }, |
| { |
| "entropy": 0.9562407732009888, |
| "epoch": 3.446227929373997, |
| "grad_norm": 1.992489218711853, |
| "learning_rate": 2.4269298339338205e-06, |
| "loss": 0.0544, |
| "mean_token_accuracy": 0.9766569435596466, |
| "num_tokens": 9381012.0, |
| "step": 1075 |
| }, |
| { |
| "entropy": 0.8250246644020081, |
| "epoch": 3.449438202247191, |
| "grad_norm": 2.0697720050811768, |
| "learning_rate": 2.4178476649031057e-06, |
| "loss": 0.0686, |
| "mean_token_accuracy": 0.9746454358100891, |
| "num_tokens": 9389158.0, |
| "step": 1076 |
| }, |
| { |
| "entropy": 0.8266815543174744, |
| "epoch": 3.452648475120385, |
| "grad_norm": 2.3080739974975586, |
| "learning_rate": 2.408777099266291e-06, |
| "loss": 0.0319, |
| "mean_token_accuracy": 0.9853142201900482, |
| "num_tokens": 9396753.0, |
| "step": 1077 |
| }, |
| { |
| "entropy": 0.9661762714385986, |
| "epoch": 3.4558587479935796, |
| "grad_norm": 1.6249902248382568, |
| "learning_rate": 2.3997181777836955e-06, |
| "loss": 0.052, |
| "mean_token_accuracy": 0.9755984842777252, |
| "num_tokens": 9405294.0, |
| "step": 1078 |
| }, |
| { |
| "entropy": 0.904697835445404, |
| "epoch": 3.4590690208667736, |
| "grad_norm": 2.562695026397705, |
| "learning_rate": 2.3906709411633073e-06, |
| "loss": 0.0473, |
| "mean_token_accuracy": 0.9813754260540009, |
| "num_tokens": 9413357.0, |
| "step": 1079 |
| }, |
| { |
| "entropy": 1.0254833102226257, |
| "epoch": 3.462279293739968, |
| "grad_norm": 1.5701454877853394, |
| "learning_rate": 2.381635430060611e-06, |
| "loss": 0.056, |
| "mean_token_accuracy": 0.9748643338680267, |
| "num_tokens": 9423096.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.9985256493091583, |
| "epoch": 3.465489566613162, |
| "grad_norm": 1.7520354986190796, |
| "learning_rate": 2.3726116850783987e-06, |
| "loss": 0.0455, |
| "mean_token_accuracy": 0.9827691316604614, |
| "num_tokens": 9431857.0, |
| "step": 1081 |
| }, |
| { |
| "entropy": 0.9760493338108063, |
| "epoch": 3.4686998394863564, |
| "grad_norm": 1.6460782289505005, |
| "learning_rate": 2.3635997467665905e-06, |
| "loss": 0.0426, |
| "mean_token_accuracy": 0.9851132333278656, |
| "num_tokens": 9439408.0, |
| "step": 1082 |
| }, |
| { |
| "entropy": 1.1165828108787537, |
| "epoch": 3.4719101123595504, |
| "grad_norm": 1.316416621208191, |
| "learning_rate": 2.354599655622049e-06, |
| "loss": 0.0366, |
| "mean_token_accuracy": 0.9826886057853699, |
| "num_tokens": 9449020.0, |
| "step": 1083 |
| }, |
| { |
| "entropy": 0.8983205258846283, |
| "epoch": 3.475120385232745, |
| "grad_norm": 1.3936208486557007, |
| "learning_rate": 2.3456114520883956e-06, |
| "loss": 0.0486, |
| "mean_token_accuracy": 0.9764930009841919, |
| "num_tokens": 9458218.0, |
| "step": 1084 |
| }, |
| { |
| "entropy": 0.9232110977172852, |
| "epoch": 3.478330658105939, |
| "grad_norm": 4.920319557189941, |
| "learning_rate": 2.3366351765558437e-06, |
| "loss": 0.0515, |
| "mean_token_accuracy": 0.9800127148628235, |
| "num_tokens": 9466307.0, |
| "step": 1085 |
| }, |
| { |
| "entropy": 0.9811087250709534, |
| "epoch": 3.481540930979133, |
| "grad_norm": 1.8882064819335938, |
| "learning_rate": 2.3276708693609947e-06, |
| "loss": 0.0456, |
| "mean_token_accuracy": 0.9796569645404816, |
| "num_tokens": 9474803.0, |
| "step": 1086 |
| }, |
| { |
| "entropy": 0.96051424741745, |
| "epoch": 3.4847512038523276, |
| "grad_norm": 2.430860996246338, |
| "learning_rate": 2.318718570786675e-06, |
| "loss": 0.056, |
| "mean_token_accuracy": 0.9816667437553406, |
| "num_tokens": 9482461.0, |
| "step": 1087 |
| }, |
| { |
| "entropy": 0.997778058052063, |
| "epoch": 3.4879614767255216, |
| "grad_norm": 3.68143892288208, |
| "learning_rate": 2.309778321061742e-06, |
| "loss": 0.0429, |
| "mean_token_accuracy": 0.9797113835811615, |
| "num_tokens": 9491235.0, |
| "step": 1088 |
| }, |
| { |
| "entropy": 0.8883638083934784, |
| "epoch": 3.491171749598716, |
| "grad_norm": 1.9456281661987305, |
| "learning_rate": 2.3008501603609147e-06, |
| "loss": 0.0474, |
| "mean_token_accuracy": 0.9790343940258026, |
| "num_tokens": 9500344.0, |
| "step": 1089 |
| }, |
| { |
| "entropy": 0.9200843572616577, |
| "epoch": 3.49438202247191, |
| "grad_norm": 1.5608545541763306, |
| "learning_rate": 2.2919341288045853e-06, |
| "loss": 0.0422, |
| "mean_token_accuracy": 0.9835698008537292, |
| "num_tokens": 9508039.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.8582479655742645, |
| "epoch": 3.4975922953451044, |
| "grad_norm": 1.6942861080169678, |
| "learning_rate": 2.283030266458644e-06, |
| "loss": 0.0572, |
| "mean_token_accuracy": 0.9660318493843079, |
| "num_tokens": 9517616.0, |
| "step": 1091 |
| }, |
| { |
| "entropy": 0.9380939602851868, |
| "epoch": 3.5008025682182984, |
| "grad_norm": 1.8551942110061646, |
| "learning_rate": 2.2741386133342923e-06, |
| "loss": 0.0646, |
| "mean_token_accuracy": 0.9725143611431122, |
| "num_tokens": 9527651.0, |
| "step": 1092 |
| }, |
| { |
| "entropy": 0.8994499444961548, |
| "epoch": 3.504012841091493, |
| "grad_norm": 3.501117706298828, |
| "learning_rate": 2.265259209387867e-06, |
| "loss": 0.0894, |
| "mean_token_accuracy": 0.9523926079273224, |
| "num_tokens": 9537469.0, |
| "step": 1093 |
| }, |
| { |
| "entropy": 0.8829051852226257, |
| "epoch": 3.5072231139646872, |
| "grad_norm": 1.714121699333191, |
| "learning_rate": 2.256392094520664e-06, |
| "loss": 0.0526, |
| "mean_token_accuracy": 0.9835188090801239, |
| "num_tokens": 9545776.0, |
| "step": 1094 |
| }, |
| { |
| "entropy": 0.9695333242416382, |
| "epoch": 3.510433386837881, |
| "grad_norm": 1.8620688915252686, |
| "learning_rate": 2.2475373085787568e-06, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9754804074764252, |
| "num_tokens": 9555075.0, |
| "step": 1095 |
| }, |
| { |
| "entropy": 0.9979302287101746, |
| "epoch": 3.513643659711075, |
| "grad_norm": 2.942721366882324, |
| "learning_rate": 2.238694891352814e-06, |
| "loss": 0.0654, |
| "mean_token_accuracy": 0.9712414145469666, |
| "num_tokens": 9564309.0, |
| "step": 1096 |
| }, |
| { |
| "entropy": 0.8716126382350922, |
| "epoch": 3.5168539325842696, |
| "grad_norm": 1.671080470085144, |
| "learning_rate": 2.229864882577921e-06, |
| "loss": 0.034, |
| "mean_token_accuracy": 0.9897077977657318, |
| "num_tokens": 9572682.0, |
| "step": 1097 |
| }, |
| { |
| "entropy": 0.8658833503723145, |
| "epoch": 3.520064205457464, |
| "grad_norm": 5.941058158874512, |
| "learning_rate": 2.2210473219334083e-06, |
| "loss": 0.0424, |
| "mean_token_accuracy": 0.986923485994339, |
| "num_tokens": 9581560.0, |
| "step": 1098 |
| }, |
| { |
| "entropy": 0.9082388877868652, |
| "epoch": 3.523274478330658, |
| "grad_norm": 2.375026226043701, |
| "learning_rate": 2.2122422490426676e-06, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9843015968799591, |
| "num_tokens": 9589017.0, |
| "step": 1099 |
| }, |
| { |
| "entropy": 1.0297791361808777, |
| "epoch": 3.5264847512038524, |
| "grad_norm": 1.8525582551956177, |
| "learning_rate": 2.203449703472969e-06, |
| "loss": 0.0335, |
| "mean_token_accuracy": 0.9889396727085114, |
| "num_tokens": 9597697.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.1267684698104858, |
| "epoch": 3.5296950240770464, |
| "grad_norm": 1.5526533126831055, |
| "learning_rate": 2.194669724735296e-06, |
| "loss": 0.0373, |
| "mean_token_accuracy": 0.9850233793258667, |
| "num_tokens": 9606293.0, |
| "step": 1101 |
| }, |
| { |
| "entropy": 0.9412164390087128, |
| "epoch": 3.532905296950241, |
| "grad_norm": 5.0308966636657715, |
| "learning_rate": 2.1859023522841543e-06, |
| "loss": 0.0576, |
| "mean_token_accuracy": 0.9791690409183502, |
| "num_tokens": 9615202.0, |
| "step": 1102 |
| }, |
| { |
| "entropy": 0.9152273833751678, |
| "epoch": 3.5361155698234352, |
| "grad_norm": 1.82210373878479, |
| "learning_rate": 2.1771476255174056e-06, |
| "loss": 0.0522, |
| "mean_token_accuracy": 0.9795254766941071, |
| "num_tokens": 9623925.0, |
| "step": 1103 |
| }, |
| { |
| "entropy": 0.9812245965003967, |
| "epoch": 3.539325842696629, |
| "grad_norm": 1.7946172952651978, |
| "learning_rate": 2.1684055837760837e-06, |
| "loss": 0.0691, |
| "mean_token_accuracy": 0.9617434144020081, |
| "num_tokens": 9632749.0, |
| "step": 1104 |
| }, |
| { |
| "entropy": 0.8433893024921417, |
| "epoch": 3.542536115569823, |
| "grad_norm": 2.0056488513946533, |
| "learning_rate": 2.159676266344222e-06, |
| "loss": 0.0499, |
| "mean_token_accuracy": 0.9789745509624481, |
| "num_tokens": 9641328.0, |
| "step": 1105 |
| }, |
| { |
| "entropy": 0.9740126729011536, |
| "epoch": 3.5457463884430176, |
| "grad_norm": 1.5101797580718994, |
| "learning_rate": 2.1509597124486693e-06, |
| "loss": 0.0432, |
| "mean_token_accuracy": 0.9827054440975189, |
| "num_tokens": 9650409.0, |
| "step": 1106 |
| }, |
| { |
| "entropy": 0.9411405026912689, |
| "epoch": 3.548956661316212, |
| "grad_norm": 1.9099135398864746, |
| "learning_rate": 2.1422559612589266e-06, |
| "loss": 0.0548, |
| "mean_token_accuracy": 0.9779680073261261, |
| "num_tokens": 9658267.0, |
| "step": 1107 |
| }, |
| { |
| "entropy": 0.9189652502536774, |
| "epoch": 3.552166934189406, |
| "grad_norm": 2.395860195159912, |
| "learning_rate": 2.1335650518869555e-06, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.9741061329841614, |
| "num_tokens": 9666958.0, |
| "step": 1108 |
| }, |
| { |
| "entropy": 0.9332525432109833, |
| "epoch": 3.5553772070626004, |
| "grad_norm": 1.5610212087631226, |
| "learning_rate": 2.124887023387017e-06, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9716036319732666, |
| "num_tokens": 9675816.0, |
| "step": 1109 |
| }, |
| { |
| "entropy": 0.7910544574260712, |
| "epoch": 3.5585874799357944, |
| "grad_norm": 1.3642332553863525, |
| "learning_rate": 2.1162219147554884e-06, |
| "loss": 0.0409, |
| "mean_token_accuracy": 0.9837254583835602, |
| "num_tokens": 9684953.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.9269280433654785, |
| "epoch": 3.561797752808989, |
| "grad_norm": 1.9942110776901245, |
| "learning_rate": 2.1075697649306838e-06, |
| "loss": 0.0772, |
| "mean_token_accuracy": 0.9430525600910187, |
| "num_tokens": 9695435.0, |
| "step": 1111 |
| }, |
| { |
| "entropy": 1.0160551369190216, |
| "epoch": 3.5650080256821832, |
| "grad_norm": 1.7112821340560913, |
| "learning_rate": 2.09893061279269e-06, |
| "loss": 0.0436, |
| "mean_token_accuracy": 0.9804078638553619, |
| "num_tokens": 9703957.0, |
| "step": 1112 |
| }, |
| { |
| "entropy": 1.1850579977035522, |
| "epoch": 3.568218298555377, |
| "grad_norm": 1.7301826477050781, |
| "learning_rate": 2.0903044971631854e-06, |
| "loss": 0.0283, |
| "mean_token_accuracy": 0.9907903373241425, |
| "num_tokens": 9711468.0, |
| "step": 1113 |
| }, |
| { |
| "entropy": 0.8523805141448975, |
| "epoch": 3.571428571428571, |
| "grad_norm": 2.1167054176330566, |
| "learning_rate": 2.0816914568052664e-06, |
| "loss": 0.0625, |
| "mean_token_accuracy": 0.966397762298584, |
| "num_tokens": 9720236.0, |
| "step": 1114 |
| }, |
| { |
| "entropy": 1.0446801781654358, |
| "epoch": 3.5746388443017656, |
| "grad_norm": 3.2320852279663086, |
| "learning_rate": 2.0730915304232692e-06, |
| "loss": 0.0543, |
| "mean_token_accuracy": 0.9817685484886169, |
| "num_tokens": 9729457.0, |
| "step": 1115 |
| }, |
| { |
| "entropy": 0.9242656528949738, |
| "epoch": 3.57784911717496, |
| "grad_norm": 2.0443546772003174, |
| "learning_rate": 2.0645047566626057e-06, |
| "loss": 0.0361, |
| "mean_token_accuracy": 0.9866881966590881, |
| "num_tokens": 9737693.0, |
| "step": 1116 |
| }, |
| { |
| "entropy": 0.9498637318611145, |
| "epoch": 3.581059390048154, |
| "grad_norm": 4.602542400360107, |
| "learning_rate": 2.055931174109579e-06, |
| "loss": 0.0541, |
| "mean_token_accuracy": 0.970384955406189, |
| "num_tokens": 9747416.0, |
| "step": 1117 |
| }, |
| { |
| "entropy": 0.8563813269138336, |
| "epoch": 3.5842696629213484, |
| "grad_norm": 1.4554708003997803, |
| "learning_rate": 2.0473708212912167e-06, |
| "loss": 0.0423, |
| "mean_token_accuracy": 0.9849738478660583, |
| "num_tokens": 9756080.0, |
| "step": 1118 |
| }, |
| { |
| "entropy": 0.985105961561203, |
| "epoch": 3.5874799357945424, |
| "grad_norm": 1.49336838722229, |
| "learning_rate": 2.0388237366751005e-06, |
| "loss": 0.0417, |
| "mean_token_accuracy": 0.9799233675003052, |
| "num_tokens": 9765320.0, |
| "step": 1119 |
| }, |
| { |
| "entropy": 0.9542718529701233, |
| "epoch": 3.590690208667737, |
| "grad_norm": 1.775028944015503, |
| "learning_rate": 2.030289958669181e-06, |
| "loss": 0.0383, |
| "mean_token_accuracy": 0.9866673350334167, |
| "num_tokens": 9773379.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.903919130563736, |
| "epoch": 3.5939004815409312, |
| "grad_norm": 2.254425048828125, |
| "learning_rate": 2.02176952562162e-06, |
| "loss": 0.0619, |
| "mean_token_accuracy": 0.9764844477176666, |
| "num_tokens": 9781226.0, |
| "step": 1121 |
| }, |
| { |
| "entropy": 1.0569791793823242, |
| "epoch": 3.597110754414125, |
| "grad_norm": 1.8237416744232178, |
| "learning_rate": 2.013262475820602e-06, |
| "loss": 0.0362, |
| "mean_token_accuracy": 0.9858551025390625, |
| "num_tokens": 9789760.0, |
| "step": 1122 |
| }, |
| { |
| "entropy": 1.1246366500854492, |
| "epoch": 3.600321027287319, |
| "grad_norm": 3.342473268508911, |
| "learning_rate": 2.004768847494186e-06, |
| "loss": 0.0345, |
| "mean_token_accuracy": 0.9862525463104248, |
| "num_tokens": 9798992.0, |
| "step": 1123 |
| }, |
| { |
| "entropy": 0.840887725353241, |
| "epoch": 3.6035313001605136, |
| "grad_norm": 1.3073285818099976, |
| "learning_rate": 1.996288678810105e-06, |
| "loss": 0.0462, |
| "mean_token_accuracy": 0.9791007936000824, |
| "num_tokens": 9807916.0, |
| "step": 1124 |
| }, |
| { |
| "entropy": 0.9333318173885345, |
| "epoch": 3.606741573033708, |
| "grad_norm": 1.5826224088668823, |
| "learning_rate": 1.987822007875617e-06, |
| "loss": 0.0535, |
| "mean_token_accuracy": 0.9793388843536377, |
| "num_tokens": 9817036.0, |
| "step": 1125 |
| }, |
| { |
| "entropy": 0.9564113020896912, |
| "epoch": 3.609951845906902, |
| "grad_norm": 1.7803699970245361, |
| "learning_rate": 1.979368872737319e-06, |
| "loss": 0.0324, |
| "mean_token_accuracy": 0.9891190826892853, |
| "num_tokens": 9825054.0, |
| "step": 1126 |
| }, |
| { |
| "entropy": 0.8959764540195465, |
| "epoch": 3.6131621187800964, |
| "grad_norm": 1.7652521133422852, |
| "learning_rate": 1.9709293113809876e-06, |
| "loss": 0.0596, |
| "mean_token_accuracy": 0.9736920297145844, |
| "num_tokens": 9834378.0, |
| "step": 1127 |
| }, |
| { |
| "entropy": 1.0392656326293945, |
| "epoch": 3.6163723916532904, |
| "grad_norm": 1.287834644317627, |
| "learning_rate": 1.962503361731403e-06, |
| "loss": 0.0332, |
| "mean_token_accuracy": 0.9876022934913635, |
| "num_tokens": 9842473.0, |
| "step": 1128 |
| }, |
| { |
| "entropy": 0.8851215839385986, |
| "epoch": 3.619582664526485, |
| "grad_norm": 2.2874505519866943, |
| "learning_rate": 1.954091061652172e-06, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9856323301792145, |
| "num_tokens": 9850848.0, |
| "step": 1129 |
| }, |
| { |
| "entropy": 0.911045104265213, |
| "epoch": 3.6227929373996792, |
| "grad_norm": 2.200509548187256, |
| "learning_rate": 1.945692448945574e-06, |
| "loss": 0.0427, |
| "mean_token_accuracy": 0.9832578599452972, |
| "num_tokens": 9859913.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.8001708984375, |
| "epoch": 3.626003210272873, |
| "grad_norm": 26.58523178100586, |
| "learning_rate": 1.9373075613523728e-06, |
| "loss": 0.0567, |
| "mean_token_accuracy": 0.9789122343063354, |
| "num_tokens": 9868320.0, |
| "step": 1131 |
| }, |
| { |
| "entropy": 0.8729480504989624, |
| "epoch": 3.629213483146067, |
| "grad_norm": 1.7441883087158203, |
| "learning_rate": 1.928936436551661e-06, |
| "loss": 0.0425, |
| "mean_token_accuracy": 0.9858026504516602, |
| "num_tokens": 9877124.0, |
| "step": 1132 |
| }, |
| { |
| "entropy": 0.8823387026786804, |
| "epoch": 3.6324237560192616, |
| "grad_norm": 1.6312586069107056, |
| "learning_rate": 1.920579112160685e-06, |
| "loss": 0.047, |
| "mean_token_accuracy": 0.9812817573547363, |
| "num_tokens": 9885575.0, |
| "step": 1133 |
| }, |
| { |
| "entropy": 0.839708000421524, |
| "epoch": 3.635634028892456, |
| "grad_norm": 3.5764527320861816, |
| "learning_rate": 1.912235625734676e-06, |
| "loss": 0.0508, |
| "mean_token_accuracy": 0.9844213128089905, |
| "num_tokens": 9894771.0, |
| "step": 1134 |
| }, |
| { |
| "entropy": 0.9408073723316193, |
| "epoch": 3.63884430176565, |
| "grad_norm": 6.8944010734558105, |
| "learning_rate": 1.903906014766681e-06, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9850984215736389, |
| "num_tokens": 9903863.0, |
| "step": 1135 |
| }, |
| { |
| "entropy": 0.9451856017112732, |
| "epoch": 3.6420545746388444, |
| "grad_norm": 3.8949129581451416, |
| "learning_rate": 1.8955903166873924e-06, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9816523194313049, |
| "num_tokens": 9911733.0, |
| "step": 1136 |
| }, |
| { |
| "entropy": 0.9529353082180023, |
| "epoch": 3.6452648475120384, |
| "grad_norm": 1.9870771169662476, |
| "learning_rate": 1.8872885688649879e-06, |
| "loss": 0.0546, |
| "mean_token_accuracy": 0.961370050907135, |
| "num_tokens": 9921287.0, |
| "step": 1137 |
| }, |
| { |
| "entropy": 1.0110719799995422, |
| "epoch": 3.648475120385233, |
| "grad_norm": 5.824323654174805, |
| "learning_rate": 1.8790008086049534e-06, |
| "loss": 0.0623, |
| "mean_token_accuracy": 0.9748804271221161, |
| "num_tokens": 9930558.0, |
| "step": 1138 |
| }, |
| { |
| "entropy": 0.9448727667331696, |
| "epoch": 3.6516853932584272, |
| "grad_norm": 1.6180834770202637, |
| "learning_rate": 1.8707270731499223e-06, |
| "loss": 0.0345, |
| "mean_token_accuracy": 0.9872872233390808, |
| "num_tokens": 9938191.0, |
| "step": 1139 |
| }, |
| { |
| "entropy": 0.9800795316696167, |
| "epoch": 3.654895666131621, |
| "grad_norm": 2.881392240524292, |
| "learning_rate": 1.862467399679499e-06, |
| "loss": 0.0468, |
| "mean_token_accuracy": 0.9823011159896851, |
| "num_tokens": 9947471.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.9238375723361969, |
| "epoch": 3.658105939004815, |
| "grad_norm": 2.864609479904175, |
| "learning_rate": 1.854221825310103e-06, |
| "loss": 0.0682, |
| "mean_token_accuracy": 0.9768114387989044, |
| "num_tokens": 9956161.0, |
| "step": 1141 |
| }, |
| { |
| "entropy": 0.8451533913612366, |
| "epoch": 3.6613162118780096, |
| "grad_norm": 2.6949856281280518, |
| "learning_rate": 1.8459903870947954e-06, |
| "loss": 0.0493, |
| "mean_token_accuracy": 0.9820267260074615, |
| "num_tokens": 9964537.0, |
| "step": 1142 |
| }, |
| { |
| "entropy": 0.9593810737133026, |
| "epoch": 3.664526484751204, |
| "grad_norm": 5.500277042388916, |
| "learning_rate": 1.8377731220231144e-06, |
| "loss": 0.049, |
| "mean_token_accuracy": 0.9815813601016998, |
| "num_tokens": 9973257.0, |
| "step": 1143 |
| }, |
| { |
| "entropy": 0.9055139720439911, |
| "epoch": 3.667736757624398, |
| "grad_norm": 2.1441760063171387, |
| "learning_rate": 1.829570067020906e-06, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9799624383449554, |
| "num_tokens": 9981629.0, |
| "step": 1144 |
| }, |
| { |
| "entropy": 1.0653272867202759, |
| "epoch": 3.6709470304975924, |
| "grad_norm": 1.7149978876113892, |
| "learning_rate": 1.8213812589501611e-06, |
| "loss": 0.0316, |
| "mean_token_accuracy": 0.9889988601207733, |
| "num_tokens": 9990584.0, |
| "step": 1145 |
| }, |
| { |
| "entropy": 0.8833724558353424, |
| "epoch": 3.6741573033707864, |
| "grad_norm": 2.4739246368408203, |
| "learning_rate": 1.813206734608851e-06, |
| "loss": 0.0589, |
| "mean_token_accuracy": 0.9782996773719788, |
| "num_tokens": 9998933.0, |
| "step": 1146 |
| }, |
| { |
| "entropy": 1.0559395551681519, |
| "epoch": 3.677367576243981, |
| "grad_norm": 1.88206946849823, |
| "learning_rate": 1.8050465307307602e-06, |
| "loss": 0.0349, |
| "mean_token_accuracy": 0.9845419228076935, |
| "num_tokens": 10007298.0, |
| "step": 1147 |
| }, |
| { |
| "entropy": 0.95055291056633, |
| "epoch": 3.6805778491171752, |
| "grad_norm": 2.4245777130126953, |
| "learning_rate": 1.7969006839853227e-06, |
| "loss": 0.0617, |
| "mean_token_accuracy": 0.9638779163360596, |
| "num_tokens": 10017955.0, |
| "step": 1148 |
| }, |
| { |
| "entropy": 0.8645216226577759, |
| "epoch": 3.683788121990369, |
| "grad_norm": 2.250040054321289, |
| "learning_rate": 1.78876923097745e-06, |
| "loss": 0.0566, |
| "mean_token_accuracy": 0.97188401222229, |
| "num_tokens": 10027187.0, |
| "step": 1149 |
| }, |
| { |
| "entropy": 0.8478440046310425, |
| "epoch": 3.686998394863563, |
| "grad_norm": 1.9543880224227905, |
| "learning_rate": 1.7806522082473809e-06, |
| "loss": 0.0385, |
| "mean_token_accuracy": 0.986393392086029, |
| "num_tokens": 10035355.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.8509060144424438, |
| "epoch": 3.6902086677367576, |
| "grad_norm": 2.297797679901123, |
| "learning_rate": 1.7725496522704998e-06, |
| "loss": 0.0507, |
| "mean_token_accuracy": 0.977655678987503, |
| "num_tokens": 10043721.0, |
| "step": 1151 |
| }, |
| { |
| "entropy": 0.8556525111198425, |
| "epoch": 3.693418940609952, |
| "grad_norm": 1.3254326581954956, |
| "learning_rate": 1.7644615994571934e-06, |
| "loss": 0.0468, |
| "mean_token_accuracy": 0.9736096262931824, |
| "num_tokens": 10053117.0, |
| "step": 1152 |
| }, |
| { |
| "entropy": 1.0885184109210968, |
| "epoch": 3.696629213483146, |
| "grad_norm": 1.5111006498336792, |
| "learning_rate": 1.7563880861526656e-06, |
| "loss": 0.0522, |
| "mean_token_accuracy": 0.9707938432693481, |
| "num_tokens": 10064650.0, |
| "step": 1153 |
| }, |
| { |
| "entropy": 0.8594561219215393, |
| "epoch": 3.6998394863563404, |
| "grad_norm": 2.320162534713745, |
| "learning_rate": 1.748329148636787e-06, |
| "loss": 0.0455, |
| "mean_token_accuracy": 0.9815157949924469, |
| "num_tokens": 10073861.0, |
| "step": 1154 |
| }, |
| { |
| "entropy": 1.0225560665130615, |
| "epoch": 3.7030497592295344, |
| "grad_norm": 2.3292529582977295, |
| "learning_rate": 1.7402848231239317e-06, |
| "loss": 0.033, |
| "mean_token_accuracy": 0.9850149750709534, |
| "num_tokens": 10081779.0, |
| "step": 1155 |
| }, |
| { |
| "entropy": 0.916724443435669, |
| "epoch": 3.706260032102729, |
| "grad_norm": 1.684735655784607, |
| "learning_rate": 1.73225514576281e-06, |
| "loss": 0.0687, |
| "mean_token_accuracy": 0.9606063067913055, |
| "num_tokens": 10092506.0, |
| "step": 1156 |
| }, |
| { |
| "entropy": 0.8862948417663574, |
| "epoch": 3.7094703049759232, |
| "grad_norm": 1.5901275873184204, |
| "learning_rate": 1.7242401526363095e-06, |
| "loss": 0.0479, |
| "mean_token_accuracy": 0.9779063761234283, |
| "num_tokens": 10101108.0, |
| "step": 1157 |
| }, |
| { |
| "entropy": 0.9733172357082367, |
| "epoch": 3.712680577849117, |
| "grad_norm": 2.0854811668395996, |
| "learning_rate": 1.7162398797613284e-06, |
| "loss": 0.0524, |
| "mean_token_accuracy": 0.9822612106800079, |
| "num_tokens": 10110150.0, |
| "step": 1158 |
| }, |
| { |
| "entropy": 0.9979034662246704, |
| "epoch": 3.715890850722311, |
| "grad_norm": 2.5162012577056885, |
| "learning_rate": 1.70825436308862e-06, |
| "loss": 0.0348, |
| "mean_token_accuracy": 0.9886457622051239, |
| "num_tokens": 10118322.0, |
| "step": 1159 |
| }, |
| { |
| "entropy": 0.9389398097991943, |
| "epoch": 3.7191011235955056, |
| "grad_norm": 2.1497745513916016, |
| "learning_rate": 1.7002836385026234e-06, |
| "loss": 0.0479, |
| "mean_token_accuracy": 0.982035368680954, |
| "num_tokens": 10127091.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.908422589302063, |
| "epoch": 3.7223113964687, |
| "grad_norm": 2.7780277729034424, |
| "learning_rate": 1.692327741821312e-06, |
| "loss": 0.0343, |
| "mean_token_accuracy": 0.9829728901386261, |
| "num_tokens": 10134938.0, |
| "step": 1161 |
| }, |
| { |
| "entropy": 0.9549289643764496, |
| "epoch": 3.725521669341894, |
| "grad_norm": 1.9559484720230103, |
| "learning_rate": 1.6843867087960252e-06, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9825837016105652, |
| "num_tokens": 10143758.0, |
| "step": 1162 |
| }, |
| { |
| "entropy": 0.9937103390693665, |
| "epoch": 3.7287319422150884, |
| "grad_norm": 1.8624933958053589, |
| "learning_rate": 1.676460575111306e-06, |
| "loss": 0.0408, |
| "mean_token_accuracy": 0.9817036986351013, |
| "num_tokens": 10152815.0, |
| "step": 1163 |
| }, |
| { |
| "entropy": 0.9328649938106537, |
| "epoch": 3.7319422150882824, |
| "grad_norm": 1.420202374458313, |
| "learning_rate": 1.6685493763847515e-06, |
| "loss": 0.0371, |
| "mean_token_accuracy": 0.9874887466430664, |
| "num_tokens": 10161410.0, |
| "step": 1164 |
| }, |
| { |
| "entropy": 0.8172413110733032, |
| "epoch": 3.735152487961477, |
| "grad_norm": 6.408421516418457, |
| "learning_rate": 1.6606531481668364e-06, |
| "loss": 0.059, |
| "mean_token_accuracy": 0.9759405851364136, |
| "num_tokens": 10170748.0, |
| "step": 1165 |
| }, |
| { |
| "entropy": 1.009339064359665, |
| "epoch": 3.738362760834671, |
| "grad_norm": 1.9974569082260132, |
| "learning_rate": 1.6527719259407743e-06, |
| "loss": 0.044, |
| "mean_token_accuracy": 0.9845010340213776, |
| "num_tokens": 10179326.0, |
| "step": 1166 |
| }, |
| { |
| "entropy": 0.9294591248035431, |
| "epoch": 3.741573033707865, |
| "grad_norm": 3.3291337490081787, |
| "learning_rate": 1.6449057451223354e-06, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9824508726596832, |
| "num_tokens": 10188628.0, |
| "step": 1167 |
| }, |
| { |
| "entropy": 0.9091964364051819, |
| "epoch": 3.744783306581059, |
| "grad_norm": 2.6491594314575195, |
| "learning_rate": 1.6370546410597066e-06, |
| "loss": 0.0698, |
| "mean_token_accuracy": 0.9683757126331329, |
| "num_tokens": 10198174.0, |
| "step": 1168 |
| }, |
| { |
| "entropy": 0.8906883001327515, |
| "epoch": 3.7479935794542536, |
| "grad_norm": 1.4277008771896362, |
| "learning_rate": 1.6292186490333172e-06, |
| "loss": 0.0436, |
| "mean_token_accuracy": 0.9807183742523193, |
| "num_tokens": 10206596.0, |
| "step": 1169 |
| }, |
| { |
| "entropy": 0.8736357986927032, |
| "epoch": 3.751203852327448, |
| "grad_norm": 2.0264739990234375, |
| "learning_rate": 1.6213978042556938e-06, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9833411276340485, |
| "num_tokens": 10214049.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.9112011790275574, |
| "epoch": 3.754414125200642, |
| "grad_norm": 2.4318811893463135, |
| "learning_rate": 1.6135921418712959e-06, |
| "loss": 0.0397, |
| "mean_token_accuracy": 0.9881952702999115, |
| "num_tokens": 10222212.0, |
| "step": 1171 |
| }, |
| { |
| "entropy": 0.9542613327503204, |
| "epoch": 3.7576243980738364, |
| "grad_norm": 3.0339975357055664, |
| "learning_rate": 1.6058016969563512e-06, |
| "loss": 0.0602, |
| "mean_token_accuracy": 0.9773413836956024, |
| "num_tokens": 10231984.0, |
| "step": 1172 |
| }, |
| { |
| "entropy": 0.8619020581245422, |
| "epoch": 3.7608346709470304, |
| "grad_norm": 1.3047006130218506, |
| "learning_rate": 1.5980265045187139e-06, |
| "loss": 0.042, |
| "mean_token_accuracy": 0.9838049113750458, |
| "num_tokens": 10240121.0, |
| "step": 1173 |
| }, |
| { |
| "entropy": 0.8658272624015808, |
| "epoch": 3.764044943820225, |
| "grad_norm": 9.934652328491211, |
| "learning_rate": 1.5902665994976896e-06, |
| "loss": 0.0563, |
| "mean_token_accuracy": 0.9773064851760864, |
| "num_tokens": 10248747.0, |
| "step": 1174 |
| }, |
| { |
| "entropy": 1.0443784594535828, |
| "epoch": 3.767255216693419, |
| "grad_norm": 1.5346416234970093, |
| "learning_rate": 1.5825220167638945e-06, |
| "loss": 0.0301, |
| "mean_token_accuracy": 0.9880676567554474, |
| "num_tokens": 10257534.0, |
| "step": 1175 |
| }, |
| { |
| "entropy": 0.9445019364356995, |
| "epoch": 3.770465489566613, |
| "grad_norm": 1.3560583591461182, |
| "learning_rate": 1.5747927911190858e-06, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.989595890045166, |
| "num_tokens": 10265790.0, |
| "step": 1176 |
| }, |
| { |
| "entropy": 0.8617814779281616, |
| "epoch": 3.773675762439807, |
| "grad_norm": 2.822770118713379, |
| "learning_rate": 1.567078957296016e-06, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.971150130033493, |
| "num_tokens": 10274497.0, |
| "step": 1177 |
| }, |
| { |
| "entropy": 0.9549345076084137, |
| "epoch": 3.7768860353130016, |
| "grad_norm": 1.5040322542190552, |
| "learning_rate": 1.5593805499582659e-06, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.9895735383033752, |
| "num_tokens": 10282792.0, |
| "step": 1178 |
| }, |
| { |
| "entropy": 0.9082767367362976, |
| "epoch": 3.780096308186196, |
| "grad_norm": 1.8122907876968384, |
| "learning_rate": 1.5516976037000941e-06, |
| "loss": 0.0413, |
| "mean_token_accuracy": 0.9823250472545624, |
| "num_tokens": 10290439.0, |
| "step": 1179 |
| }, |
| { |
| "entropy": 0.9210187792778015, |
| "epoch": 3.78330658105939, |
| "grad_norm": 2.638749599456787, |
| "learning_rate": 1.544030153046291e-06, |
| "loss": 0.0424, |
| "mean_token_accuracy": 0.9761735498905182, |
| "num_tokens": 10299177.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.9334617257118225, |
| "epoch": 3.7865168539325844, |
| "grad_norm": 2.229041576385498, |
| "learning_rate": 1.5363782324520033e-06, |
| "loss": 0.0538, |
| "mean_token_accuracy": 0.9787396788597107, |
| "num_tokens": 10308598.0, |
| "step": 1181 |
| }, |
| { |
| "entropy": 0.8418469130992889, |
| "epoch": 3.7897271268057784, |
| "grad_norm": 2.188577175140381, |
| "learning_rate": 1.528741876302598e-06, |
| "loss": 0.0601, |
| "mean_token_accuracy": 0.965255469083786, |
| "num_tokens": 10318063.0, |
| "step": 1182 |
| }, |
| { |
| "entropy": 0.9776929020881653, |
| "epoch": 3.792937399678973, |
| "grad_norm": 1.7480143308639526, |
| "learning_rate": 1.5211211189134955e-06, |
| "loss": 0.0492, |
| "mean_token_accuracy": 0.9740462601184845, |
| "num_tokens": 10327620.0, |
| "step": 1183 |
| }, |
| { |
| "entropy": 0.8690580427646637, |
| "epoch": 3.796147672552167, |
| "grad_norm": 3.2047064304351807, |
| "learning_rate": 1.5135159945300232e-06, |
| "loss": 0.0341, |
| "mean_token_accuracy": 0.9891680479049683, |
| "num_tokens": 10335916.0, |
| "step": 1184 |
| }, |
| { |
| "entropy": 1.0153252184391022, |
| "epoch": 3.799357945425361, |
| "grad_norm": 1.8621257543563843, |
| "learning_rate": 1.5059265373272574e-06, |
| "loss": 0.0293, |
| "mean_token_accuracy": 0.989485502243042, |
| "num_tokens": 10344614.0, |
| "step": 1185 |
| }, |
| { |
| "entropy": 1.0363346338272095, |
| "epoch": 3.802568218298555, |
| "grad_norm": 1.3696978092193604, |
| "learning_rate": 1.4983527814098736e-06, |
| "loss": 0.0344, |
| "mean_token_accuracy": 0.9842495024204254, |
| "num_tokens": 10353085.0, |
| "step": 1186 |
| }, |
| { |
| "entropy": 0.9636200070381165, |
| "epoch": 3.8057784911717496, |
| "grad_norm": 3.6484992504119873, |
| "learning_rate": 1.4907947608119866e-06, |
| "loss": 0.0569, |
| "mean_token_accuracy": 0.9779880046844482, |
| "num_tokens": 10361502.0, |
| "step": 1187 |
| }, |
| { |
| "entropy": 1.1602963209152222, |
| "epoch": 3.808988764044944, |
| "grad_norm": 1.8796751499176025, |
| "learning_rate": 1.4832525094970007e-06, |
| "loss": 0.051, |
| "mean_token_accuracy": 0.9757754504680634, |
| "num_tokens": 10371614.0, |
| "step": 1188 |
| }, |
| { |
| "entropy": 1.043602168560028, |
| "epoch": 3.812199036918138, |
| "grad_norm": 1.5110524892807007, |
| "learning_rate": 1.475726061357463e-06, |
| "loss": 0.0361, |
| "mean_token_accuracy": 0.9858299195766449, |
| "num_tokens": 10380855.0, |
| "step": 1189 |
| }, |
| { |
| "entropy": 0.9013993740081787, |
| "epoch": 3.8154093097913324, |
| "grad_norm": 1.8058748245239258, |
| "learning_rate": 1.4682154502149025e-06, |
| "loss": 0.0485, |
| "mean_token_accuracy": 0.9798838496208191, |
| "num_tokens": 10390505.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.8585264086723328, |
| "epoch": 3.8186195826645264, |
| "grad_norm": 1.8328495025634766, |
| "learning_rate": 1.4607207098196851e-06, |
| "loss": 0.0622, |
| "mean_token_accuracy": 0.9619594812393188, |
| "num_tokens": 10399325.0, |
| "step": 1191 |
| }, |
| { |
| "entropy": 0.8735026121139526, |
| "epoch": 3.821829855537721, |
| "grad_norm": 1.9175422191619873, |
| "learning_rate": 1.4532418738508525e-06, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9814468622207642, |
| "num_tokens": 10406972.0, |
| "step": 1192 |
| }, |
| { |
| "entropy": 0.8725822865962982, |
| "epoch": 3.825040128410915, |
| "grad_norm": 2.073087453842163, |
| "learning_rate": 1.4457789759159813e-06, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9837340116500854, |
| "num_tokens": 10416155.0, |
| "step": 1193 |
| }, |
| { |
| "entropy": 0.7589974999427795, |
| "epoch": 3.828250401284109, |
| "grad_norm": 2.09479022026062, |
| "learning_rate": 1.4383320495510267e-06, |
| "loss": 0.0518, |
| "mean_token_accuracy": 0.9773995578289032, |
| "num_tokens": 10424983.0, |
| "step": 1194 |
| }, |
| { |
| "entropy": 0.7707741558551788, |
| "epoch": 3.831460674157303, |
| "grad_norm": 1.806342601776123, |
| "learning_rate": 1.430901128220174e-06, |
| "loss": 0.0442, |
| "mean_token_accuracy": 0.983624279499054, |
| "num_tokens": 10432712.0, |
| "step": 1195 |
| }, |
| { |
| "entropy": 0.9553577899932861, |
| "epoch": 3.8346709470304976, |
| "grad_norm": 2.0812230110168457, |
| "learning_rate": 1.4234862453156839e-06, |
| "loss": 0.0388, |
| "mean_token_accuracy": 0.9831459522247314, |
| "num_tokens": 10440865.0, |
| "step": 1196 |
| }, |
| { |
| "entropy": 0.987030029296875, |
| "epoch": 3.837881219903692, |
| "grad_norm": 1.751489520072937, |
| "learning_rate": 1.4160874341577447e-06, |
| "loss": 0.0409, |
| "mean_token_accuracy": 0.985140860080719, |
| "num_tokens": 10448801.0, |
| "step": 1197 |
| }, |
| { |
| "entropy": 0.785410076379776, |
| "epoch": 3.841091492776886, |
| "grad_norm": 2.0246682167053223, |
| "learning_rate": 1.4087047279943267e-06, |
| "loss": 0.051, |
| "mean_token_accuracy": 0.9750083088874817, |
| "num_tokens": 10457486.0, |
| "step": 1198 |
| }, |
| { |
| "entropy": 0.7790045142173767, |
| "epoch": 3.8443017656500804, |
| "grad_norm": 1.6499295234680176, |
| "learning_rate": 1.4013381600010278e-06, |
| "loss": 0.0457, |
| "mean_token_accuracy": 0.9819171726703644, |
| "num_tokens": 10466429.0, |
| "step": 1199 |
| }, |
| { |
| "entropy": 1.0195372998714447, |
| "epoch": 3.8475120385232744, |
| "grad_norm": 1.9666370153427124, |
| "learning_rate": 1.3939877632809279e-06, |
| "loss": 0.0449, |
| "mean_token_accuracy": 0.9832908809185028, |
| "num_tokens": 10474853.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.8610271215438843, |
| "epoch": 3.850722311396469, |
| "grad_norm": 1.5382120609283447, |
| "learning_rate": 1.3866535708644335e-06, |
| "loss": 0.0483, |
| "mean_token_accuracy": 0.983114629983902, |
| "num_tokens": 10484253.0, |
| "step": 1201 |
| }, |
| { |
| "entropy": 0.8548697829246521, |
| "epoch": 3.853932584269663, |
| "grad_norm": 2.9462549686431885, |
| "learning_rate": 1.3793356157091387e-06, |
| "loss": 0.0563, |
| "mean_token_accuracy": 0.9759114682674408, |
| "num_tokens": 10493150.0, |
| "step": 1202 |
| }, |
| { |
| "entropy": 0.981879711151123, |
| "epoch": 3.857142857142857, |
| "grad_norm": 2.5988550186157227, |
| "learning_rate": 1.3720339306996666e-06, |
| "loss": 0.0318, |
| "mean_token_accuracy": 0.9906176030635834, |
| "num_tokens": 10501519.0, |
| "step": 1203 |
| }, |
| { |
| "entropy": 0.8637133836746216, |
| "epoch": 3.860353130016051, |
| "grad_norm": 1.6211717128753662, |
| "learning_rate": 1.3647485486475376e-06, |
| "loss": 0.0371, |
| "mean_token_accuracy": 0.98866006731987, |
| "num_tokens": 10509870.0, |
| "step": 1204 |
| }, |
| { |
| "entropy": 1.0288158059120178, |
| "epoch": 3.8635634028892456, |
| "grad_norm": 1.9696396589279175, |
| "learning_rate": 1.3574795022910014e-06, |
| "loss": 0.0465, |
| "mean_token_accuracy": 0.9840021729469299, |
| "num_tokens": 10518929.0, |
| "step": 1205 |
| }, |
| { |
| "entropy": 0.8618482649326324, |
| "epoch": 3.86677367576244, |
| "grad_norm": 2.089820384979248, |
| "learning_rate": 1.3502268242949025e-06, |
| "loss": 0.0506, |
| "mean_token_accuracy": 0.9794828593730927, |
| "num_tokens": 10527460.0, |
| "step": 1206 |
| }, |
| { |
| "entropy": 0.8384652733802795, |
| "epoch": 3.869983948635634, |
| "grad_norm": 1.6938061714172363, |
| "learning_rate": 1.3429905472505344e-06, |
| "loss": 0.0441, |
| "mean_token_accuracy": 0.9818322360515594, |
| "num_tokens": 10535140.0, |
| "step": 1207 |
| }, |
| { |
| "entropy": 0.9508244693279266, |
| "epoch": 3.8731942215088284, |
| "grad_norm": 2.3421173095703125, |
| "learning_rate": 1.3357707036754875e-06, |
| "loss": 0.0552, |
| "mean_token_accuracy": 0.9806767404079437, |
| "num_tokens": 10543385.0, |
| "step": 1208 |
| }, |
| { |
| "entropy": 0.9171972870826721, |
| "epoch": 3.8764044943820224, |
| "grad_norm": 2.3140108585357666, |
| "learning_rate": 1.3285673260135073e-06, |
| "loss": 0.0369, |
| "mean_token_accuracy": 0.9835132956504822, |
| "num_tokens": 10551332.0, |
| "step": 1209 |
| }, |
| { |
| "entropy": 0.9664874970912933, |
| "epoch": 3.879614767255217, |
| "grad_norm": 1.4102840423583984, |
| "learning_rate": 1.321380446634342e-06, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.9883507788181305, |
| "num_tokens": 10558804.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.8216537237167358, |
| "epoch": 3.882825040128411, |
| "grad_norm": 6.906599998474121, |
| "learning_rate": 1.314210097833607e-06, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9810819029808044, |
| "num_tokens": 10568053.0, |
| "step": 1211 |
| }, |
| { |
| "entropy": 0.8050373494625092, |
| "epoch": 3.886035313001605, |
| "grad_norm": 5.290853023529053, |
| "learning_rate": 1.3070563118326295e-06, |
| "loss": 0.0648, |
| "mean_token_accuracy": 0.9657208025455475, |
| "num_tokens": 10576876.0, |
| "step": 1212 |
| }, |
| { |
| "entropy": 0.8461142778396606, |
| "epoch": 3.889245585874799, |
| "grad_norm": 2.1909284591674805, |
| "learning_rate": 1.2999191207783129e-06, |
| "loss": 0.0387, |
| "mean_token_accuracy": 0.9846838414669037, |
| "num_tokens": 10585747.0, |
| "step": 1213 |
| }, |
| { |
| "entropy": 0.862621545791626, |
| "epoch": 3.8924558587479936, |
| "grad_norm": 7.942407608032227, |
| "learning_rate": 1.2927985567429868e-06, |
| "loss": 0.0469, |
| "mean_token_accuracy": 0.9817521870136261, |
| "num_tokens": 10593612.0, |
| "step": 1214 |
| }, |
| { |
| "entropy": 1.056757628917694, |
| "epoch": 3.895666131621188, |
| "grad_norm": 1.4919307231903076, |
| "learning_rate": 1.2856946517242608e-06, |
| "loss": 0.0428, |
| "mean_token_accuracy": 0.9853585362434387, |
| "num_tokens": 10602304.0, |
| "step": 1215 |
| }, |
| { |
| "entropy": 0.9187009334564209, |
| "epoch": 3.898876404494382, |
| "grad_norm": 1.4443905353546143, |
| "learning_rate": 1.27860743764489e-06, |
| "loss": 0.0582, |
| "mean_token_accuracy": 0.9673136472702026, |
| "num_tokens": 10611835.0, |
| "step": 1216 |
| }, |
| { |
| "entropy": 0.8659194707870483, |
| "epoch": 3.902086677367576, |
| "grad_norm": 2.3289132118225098, |
| "learning_rate": 1.2715369463526173e-06, |
| "loss": 0.0545, |
| "mean_token_accuracy": 0.9800289571285248, |
| "num_tokens": 10620313.0, |
| "step": 1217 |
| }, |
| { |
| "entropy": 0.8493823409080505, |
| "epoch": 3.9052969502407704, |
| "grad_norm": 2.543363571166992, |
| "learning_rate": 1.2644832096200498e-06, |
| "loss": 0.0461, |
| "mean_token_accuracy": 0.9816104173660278, |
| "num_tokens": 10628548.0, |
| "step": 1218 |
| }, |
| { |
| "entropy": 0.8991810083389282, |
| "epoch": 3.908507223113965, |
| "grad_norm": 6.281195163726807, |
| "learning_rate": 1.257446259144494e-06, |
| "loss": 0.053, |
| "mean_token_accuracy": 0.9824222326278687, |
| "num_tokens": 10636626.0, |
| "step": 1219 |
| }, |
| { |
| "entropy": 0.9110390543937683, |
| "epoch": 3.911717495987159, |
| "grad_norm": 3.237565040588379, |
| "learning_rate": 1.2504261265478324e-06, |
| "loss": 0.0467, |
| "mean_token_accuracy": 0.9759093225002289, |
| "num_tokens": 10645242.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.9176389575004578, |
| "epoch": 3.914927768860353, |
| "grad_norm": 1.461086392402649, |
| "learning_rate": 1.2434228433763657e-06, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.9800988137722015, |
| "num_tokens": 10654835.0, |
| "step": 1221 |
| }, |
| { |
| "entropy": 0.938913106918335, |
| "epoch": 3.918138041733547, |
| "grad_norm": 1.900651454925537, |
| "learning_rate": 1.2364364411006841e-06, |
| "loss": 0.05, |
| "mean_token_accuracy": 0.9824710190296173, |
| "num_tokens": 10663724.0, |
| "step": 1222 |
| }, |
| { |
| "entropy": 0.9196376204490662, |
| "epoch": 3.9213483146067416, |
| "grad_norm": 1.5418288707733154, |
| "learning_rate": 1.2294669511155193e-06, |
| "loss": 0.0393, |
| "mean_token_accuracy": 0.9803338348865509, |
| "num_tokens": 10673446.0, |
| "step": 1223 |
| }, |
| { |
| "entropy": 0.8525184988975525, |
| "epoch": 3.924558587479936, |
| "grad_norm": 2.4837660789489746, |
| "learning_rate": 1.2225144047396015e-06, |
| "loss": 0.0702, |
| "mean_token_accuracy": 0.9695371985435486, |
| "num_tokens": 10681905.0, |
| "step": 1224 |
| }, |
| { |
| "entropy": 0.9096053838729858, |
| "epoch": 3.92776886035313, |
| "grad_norm": 27.353504180908203, |
| "learning_rate": 1.215578833215526e-06, |
| "loss": 0.0619, |
| "mean_token_accuracy": 0.9736766219139099, |
| "num_tokens": 10690825.0, |
| "step": 1225 |
| }, |
| { |
| "entropy": 0.8512005805969238, |
| "epoch": 3.930979133226324, |
| "grad_norm": 1.7140419483184814, |
| "learning_rate": 1.2086602677096033e-06, |
| "loss": 0.0635, |
| "mean_token_accuracy": 0.9623365104198456, |
| "num_tokens": 10699282.0, |
| "step": 1226 |
| }, |
| { |
| "entropy": 0.9099750518798828, |
| "epoch": 3.9341894060995184, |
| "grad_norm": 1.4028735160827637, |
| "learning_rate": 1.201758739311728e-06, |
| "loss": 0.0346, |
| "mean_token_accuracy": 0.9866108298301697, |
| "num_tokens": 10707893.0, |
| "step": 1227 |
| }, |
| { |
| "entropy": 0.8565336465835571, |
| "epoch": 3.937399678972713, |
| "grad_norm": 1.4634716510772705, |
| "learning_rate": 1.1948742790352342e-06, |
| "loss": 0.044, |
| "mean_token_accuracy": 0.9821693003177643, |
| "num_tokens": 10717249.0, |
| "step": 1228 |
| }, |
| { |
| "entropy": 0.9397711753845215, |
| "epoch": 3.940609951845907, |
| "grad_norm": 1.8528227806091309, |
| "learning_rate": 1.1880069178167586e-06, |
| "loss": 0.0764, |
| "mean_token_accuracy": 0.9538247883319855, |
| "num_tokens": 10727017.0, |
| "step": 1229 |
| }, |
| { |
| "entropy": 0.8851629793643951, |
| "epoch": 3.943820224719101, |
| "grad_norm": 1.2845957279205322, |
| "learning_rate": 1.1811566865160961e-06, |
| "loss": 0.034, |
| "mean_token_accuracy": 0.982793927192688, |
| "num_tokens": 10734428.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.8674412071704865, |
| "epoch": 3.947030497592295, |
| "grad_norm": 4.364323139190674, |
| "learning_rate": 1.1743236159160654e-06, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9849294424057007, |
| "num_tokens": 10742510.0, |
| "step": 1231 |
| }, |
| { |
| "entropy": 0.9346510767936707, |
| "epoch": 3.9502407704654896, |
| "grad_norm": 1.6875059604644775, |
| "learning_rate": 1.167507736722377e-06, |
| "loss": 0.0374, |
| "mean_token_accuracy": 0.9862508773803711, |
| "num_tokens": 10750667.0, |
| "step": 1232 |
| }, |
| { |
| "entropy": 0.9485654532909393, |
| "epoch": 3.953451043338684, |
| "grad_norm": 1.916317105293274, |
| "learning_rate": 1.1607090795634802e-06, |
| "loss": 0.0415, |
| "mean_token_accuracy": 0.9837748110294342, |
| "num_tokens": 10759026.0, |
| "step": 1233 |
| }, |
| { |
| "entropy": 0.8760998845100403, |
| "epoch": 3.956661316211878, |
| "grad_norm": 1.6034512519836426, |
| "learning_rate": 1.15392767499044e-06, |
| "loss": 0.0598, |
| "mean_token_accuracy": 0.9702793061733246, |
| "num_tokens": 10769283.0, |
| "step": 1234 |
| }, |
| { |
| "entropy": 0.9222274422645569, |
| "epoch": 3.959871589085072, |
| "grad_norm": 1.5884339809417725, |
| "learning_rate": 1.1471635534767877e-06, |
| "loss": 0.0343, |
| "mean_token_accuracy": 0.9872113466262817, |
| "num_tokens": 10777409.0, |
| "step": 1235 |
| }, |
| { |
| "entropy": 1.0063312649726868, |
| "epoch": 3.9630818619582664, |
| "grad_norm": 1.3955051898956299, |
| "learning_rate": 1.1404167454183957e-06, |
| "loss": 0.0289, |
| "mean_token_accuracy": 0.9890806972980499, |
| "num_tokens": 10784631.0, |
| "step": 1236 |
| }, |
| { |
| "entropy": 1.01164972782135, |
| "epoch": 3.966292134831461, |
| "grad_norm": 1.528063416481018, |
| "learning_rate": 1.133687281133331e-06, |
| "loss": 0.0481, |
| "mean_token_accuracy": 0.9762919843196869, |
| "num_tokens": 10793409.0, |
| "step": 1237 |
| }, |
| { |
| "entropy": 1.0118040144443512, |
| "epoch": 3.969502407704655, |
| "grad_norm": 1.9172968864440918, |
| "learning_rate": 1.1269751908617277e-06, |
| "loss": 0.057, |
| "mean_token_accuracy": 0.9794862866401672, |
| "num_tokens": 10801911.0, |
| "step": 1238 |
| }, |
| { |
| "entropy": 0.959458976984024, |
| "epoch": 3.972712680577849, |
| "grad_norm": 1.7244352102279663, |
| "learning_rate": 1.1202805047656406e-06, |
| "loss": 0.0461, |
| "mean_token_accuracy": 0.9820380210876465, |
| "num_tokens": 10810254.0, |
| "step": 1239 |
| }, |
| { |
| "entropy": 1.0892210602760315, |
| "epoch": 3.975922953451043, |
| "grad_norm": 1.4209359884262085, |
| "learning_rate": 1.113603252928917e-06, |
| "loss": 0.0415, |
| "mean_token_accuracy": 0.9800035953521729, |
| "num_tokens": 10819473.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.8789343237876892, |
| "epoch": 3.9791332263242376, |
| "grad_norm": 3.5406527519226074, |
| "learning_rate": 1.1069434653570633e-06, |
| "loss": 0.0439, |
| "mean_token_accuracy": 0.9809198379516602, |
| "num_tokens": 10827936.0, |
| "step": 1241 |
| }, |
| { |
| "entropy": 0.9018587470054626, |
| "epoch": 3.982343499197432, |
| "grad_norm": 3.4543299674987793, |
| "learning_rate": 1.1003011719771046e-06, |
| "loss": 0.0404, |
| "mean_token_accuracy": 0.987114280462265, |
| "num_tokens": 10835439.0, |
| "step": 1242 |
| }, |
| { |
| "entropy": 0.952752411365509, |
| "epoch": 3.985553772070626, |
| "grad_norm": 1.4983313083648682, |
| "learning_rate": 1.0936764026374547e-06, |
| "loss": 0.0359, |
| "mean_token_accuracy": 0.9853008389472961, |
| "num_tokens": 10843254.0, |
| "step": 1243 |
| }, |
| { |
| "entropy": 0.901978075504303, |
| "epoch": 3.98876404494382, |
| "grad_norm": 1.8080071210861206, |
| "learning_rate": 1.0870691871077738e-06, |
| "loss": 0.0571, |
| "mean_token_accuracy": 0.9717438220977783, |
| "num_tokens": 10852491.0, |
| "step": 1244 |
| }, |
| { |
| "entropy": 0.9695694446563721, |
| "epoch": 3.9919743178170144, |
| "grad_norm": 1.5428680181503296, |
| "learning_rate": 1.0804795550788473e-06, |
| "loss": 0.059, |
| "mean_token_accuracy": 0.9658881723880768, |
| "num_tokens": 10861299.0, |
| "step": 1245 |
| }, |
| { |
| "entropy": 0.8626547455787659, |
| "epoch": 3.995184590690209, |
| "grad_norm": 1.5923748016357422, |
| "learning_rate": 1.073907536162443e-06, |
| "loss": 0.0464, |
| "mean_token_accuracy": 0.9818092882633209, |
| "num_tokens": 10870030.0, |
| "step": 1246 |
| }, |
| { |
| "entropy": 0.9375744163990021, |
| "epoch": 3.998394863563403, |
| "grad_norm": 2.5676188468933105, |
| "learning_rate": 1.0673531598911824e-06, |
| "loss": 0.058, |
| "mean_token_accuracy": 0.966584324836731, |
| "num_tokens": 10880729.0, |
| "step": 1247 |
| }, |
| { |
| "entropy": 0.7778120040893555, |
| "epoch": 4.0, |
| "grad_norm": 2.0153770446777344, |
| "learning_rate": 1.0608164557184042e-06, |
| "loss": 0.0344, |
| "mean_token_accuracy": 0.9873684048652649, |
| "num_tokens": 10884568.0, |
| "step": 1248 |
| }, |
| { |
| "entropy": 0.8640480041503906, |
| "epoch": 4.003210272873194, |
| "grad_norm": 0.6675413250923157, |
| "learning_rate": 1.0542974530180327e-06, |
| "loss": 0.0262, |
| "mean_token_accuracy": 0.9823751747608185, |
| "num_tokens": 10892829.0, |
| "step": 1249 |
| }, |
| { |
| "entropy": 0.9694719612598419, |
| "epoch": 4.006420545746389, |
| "grad_norm": 0.7210123538970947, |
| "learning_rate": 1.0477961810844517e-06, |
| "loss": 0.0157, |
| "mean_token_accuracy": 0.9941366910934448, |
| "num_tokens": 10900762.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.9497312605381012, |
| "epoch": 4.009630818619582, |
| "grad_norm": 1.7998807430267334, |
| "learning_rate": 1.0413126691323667e-06, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9944363832473755, |
| "num_tokens": 10908632.0, |
| "step": 1251 |
| }, |
| { |
| "entropy": 1.0852590799331665, |
| "epoch": 4.012841091492777, |
| "grad_norm": 0.7353365421295166, |
| "learning_rate": 1.0348469462966753e-06, |
| "loss": 0.0162, |
| "mean_token_accuracy": 0.9962048232555389, |
| "num_tokens": 10918033.0, |
| "step": 1252 |
| }, |
| { |
| "entropy": 1.0034122467041016, |
| "epoch": 4.016051364365971, |
| "grad_norm": 0.7921286821365356, |
| "learning_rate": 1.0283990416323336e-06, |
| "loss": 0.0163, |
| "mean_token_accuracy": 0.995498538017273, |
| "num_tokens": 10926551.0, |
| "step": 1253 |
| }, |
| { |
| "entropy": 0.8701649308204651, |
| "epoch": 4.019261637239166, |
| "grad_norm": 0.7349697947502136, |
| "learning_rate": 1.0219689841142343e-06, |
| "loss": 0.0274, |
| "mean_token_accuracy": 0.9848228096961975, |
| "num_tokens": 10936080.0, |
| "step": 1254 |
| }, |
| { |
| "entropy": 0.7810315489768982, |
| "epoch": 4.022471910112359, |
| "grad_norm": 0.7971243262290955, |
| "learning_rate": 1.0155568026370637e-06, |
| "loss": 0.0309, |
| "mean_token_accuracy": 0.9813618063926697, |
| "num_tokens": 10946666.0, |
| "step": 1255 |
| }, |
| { |
| "entropy": 0.8642408847808838, |
| "epoch": 4.025682182985554, |
| "grad_norm": 1.208862543106079, |
| "learning_rate": 1.0091625260151827e-06, |
| "loss": 0.0209, |
| "mean_token_accuracy": 0.9927391707897186, |
| "num_tokens": 10954165.0, |
| "step": 1256 |
| }, |
| { |
| "entropy": 0.9416361153125763, |
| "epoch": 4.028892455858748, |
| "grad_norm": 0.9059441089630127, |
| "learning_rate": 1.0027861829824953e-06, |
| "loss": 0.026, |
| "mean_token_accuracy": 0.9885562062263489, |
| "num_tokens": 10963217.0, |
| "step": 1257 |
| }, |
| { |
| "entropy": 0.832787424325943, |
| "epoch": 4.032102728731942, |
| "grad_norm": 1.0136638879776, |
| "learning_rate": 9.964278021923107e-07, |
| "loss": 0.0269, |
| "mean_token_accuracy": 0.9904702007770538, |
| "num_tokens": 10971882.0, |
| "step": 1258 |
| }, |
| { |
| "entropy": 0.8856756091117859, |
| "epoch": 4.035313001605137, |
| "grad_norm": 0.8064892292022705, |
| "learning_rate": 9.900874122172294e-07, |
| "loss": 0.019, |
| "mean_token_accuracy": 0.9927627444267273, |
| "num_tokens": 10980909.0, |
| "step": 1259 |
| }, |
| { |
| "entropy": 0.827369898557663, |
| "epoch": 4.03852327447833, |
| "grad_norm": 0.593895673751831, |
| "learning_rate": 9.83765041548998e-07, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9960170388221741, |
| "num_tokens": 10989655.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.9583877325057983, |
| "epoch": 4.041733547351525, |
| "grad_norm": 1.0141297578811646, |
| "learning_rate": 9.774607185984004e-07, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9941740334033966, |
| "num_tokens": 10998018.0, |
| "step": 1261 |
| }, |
| { |
| "entropy": 0.8924273252487183, |
| "epoch": 4.044943820224719, |
| "grad_norm": 0.8163235187530518, |
| "learning_rate": 9.711744716951093e-07, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9947156310081482, |
| "num_tokens": 11006756.0, |
| "step": 1262 |
| }, |
| { |
| "entropy": 0.8327958583831787, |
| "epoch": 4.048154093097914, |
| "grad_norm": 0.8127865791320801, |
| "learning_rate": 9.649063290875771e-07, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.994948148727417, |
| "num_tokens": 11014455.0, |
| "step": 1263 |
| }, |
| { |
| "entropy": 0.8657267093658447, |
| "epoch": 4.051364365971107, |
| "grad_norm": 0.9289994835853577, |
| "learning_rate": 9.586563189428954e-07, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9955416023731232, |
| "num_tokens": 11022936.0, |
| "step": 1264 |
| }, |
| { |
| "entropy": 0.8278777003288269, |
| "epoch": 4.054574638844302, |
| "grad_norm": 1.0349270105361938, |
| "learning_rate": 9.524244693466773e-07, |
| "loss": 0.0174, |
| "mean_token_accuracy": 0.9945272207260132, |
| "num_tokens": 11030932.0, |
| "step": 1265 |
| }, |
| { |
| "entropy": 0.7735994756221771, |
| "epoch": 4.057784911717496, |
| "grad_norm": 0.7948999404907227, |
| "learning_rate": 9.462108083029287e-07, |
| "loss": 0.0147, |
| "mean_token_accuracy": 0.9946495592594147, |
| "num_tokens": 11038898.0, |
| "step": 1266 |
| }, |
| { |
| "entropy": 0.9241797029972076, |
| "epoch": 4.06099518459069, |
| "grad_norm": 1.0356444120407104, |
| "learning_rate": 9.400153637339182e-07, |
| "loss": 0.0149, |
| "mean_token_accuracy": 0.9944348335266113, |
| "num_tokens": 11046059.0, |
| "step": 1267 |
| }, |
| { |
| "entropy": 0.9038277566432953, |
| "epoch": 4.064205457463885, |
| "grad_norm": 0.8357171416282654, |
| "learning_rate": 9.338381634800597e-07, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9948267638683319, |
| "num_tokens": 11053771.0, |
| "step": 1268 |
| }, |
| { |
| "entropy": 0.7923440039157867, |
| "epoch": 4.067415730337078, |
| "grad_norm": 1.267638087272644, |
| "learning_rate": 9.276792352997782e-07, |
| "loss": 0.0257, |
| "mean_token_accuracy": 0.9828329980373383, |
| "num_tokens": 11062458.0, |
| "step": 1269 |
| }, |
| { |
| "entropy": 0.845217764377594, |
| "epoch": 4.070626003210273, |
| "grad_norm": 1.1078312397003174, |
| "learning_rate": 9.215386068693927e-07, |
| "loss": 0.0195, |
| "mean_token_accuracy": 0.9937855005264282, |
| "num_tokens": 11070919.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.9916780292987823, |
| "epoch": 4.073836276083467, |
| "grad_norm": 0.9299560785293579, |
| "learning_rate": 9.154163057829879e-07, |
| "loss": 0.0171, |
| "mean_token_accuracy": 0.993675172328949, |
| "num_tokens": 11079381.0, |
| "step": 1271 |
| }, |
| { |
| "entropy": 0.8888976275920868, |
| "epoch": 4.077046548956662, |
| "grad_norm": 0.7998571991920471, |
| "learning_rate": 9.093123595522929e-07, |
| "loss": 0.0176, |
| "mean_token_accuracy": 0.993833065032959, |
| "num_tokens": 11087689.0, |
| "step": 1272 |
| }, |
| { |
| "entropy": 0.9080740213394165, |
| "epoch": 4.080256821829855, |
| "grad_norm": 1.1759231090545654, |
| "learning_rate": 9.032267956065516e-07, |
| "loss": 0.0266, |
| "mean_token_accuracy": 0.9817738234996796, |
| "num_tokens": 11097553.0, |
| "step": 1273 |
| }, |
| { |
| "entropy": 0.8172707259654999, |
| "epoch": 4.08346709470305, |
| "grad_norm": 7.202850341796875, |
| "learning_rate": 8.971596412924067e-07, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9940112233161926, |
| "num_tokens": 11105794.0, |
| "step": 1274 |
| }, |
| { |
| "entropy": 0.8537603318691254, |
| "epoch": 4.086677367576244, |
| "grad_norm": 0.9416528940200806, |
| "learning_rate": 8.911109238737748e-07, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9931537806987762, |
| "num_tokens": 11113728.0, |
| "step": 1275 |
| }, |
| { |
| "entropy": 0.8437024652957916, |
| "epoch": 4.089887640449438, |
| "grad_norm": 8.162856101989746, |
| "learning_rate": 8.850806705317183e-07, |
| "loss": 0.0119, |
| "mean_token_accuracy": 0.9965576529502869, |
| "num_tokens": 11121815.0, |
| "step": 1276 |
| }, |
| { |
| "entropy": 0.769570529460907, |
| "epoch": 4.093097913322633, |
| "grad_norm": 1.77950119972229, |
| "learning_rate": 8.790689083643328e-07, |
| "loss": 0.0259, |
| "mean_token_accuracy": 0.9933834969997406, |
| "num_tokens": 11130079.0, |
| "step": 1277 |
| }, |
| { |
| "entropy": 0.8534726202487946, |
| "epoch": 4.096308186195826, |
| "grad_norm": 3.9178075790405273, |
| "learning_rate": 8.730756643866157e-07, |
| "loss": 0.0123, |
| "mean_token_accuracy": 0.9955320656299591, |
| "num_tokens": 11138390.0, |
| "step": 1278 |
| }, |
| { |
| "entropy": 0.78662109375, |
| "epoch": 4.099518459069021, |
| "grad_norm": 0.9845862984657288, |
| "learning_rate": 8.671009655303531e-07, |
| "loss": 0.0234, |
| "mean_token_accuracy": 0.9906353950500488, |
| "num_tokens": 11147129.0, |
| "step": 1279 |
| }, |
| { |
| "entropy": 0.789972335100174, |
| "epoch": 4.102728731942215, |
| "grad_norm": 0.6823396682739258, |
| "learning_rate": 8.611448386439936e-07, |
| "loss": 0.0123, |
| "mean_token_accuracy": 0.995839536190033, |
| "num_tokens": 11154604.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.7805255651473999, |
| "epoch": 4.10593900481541, |
| "grad_norm": 1.8053884506225586, |
| "learning_rate": 8.552073104925296e-07, |
| "loss": 0.0394, |
| "mean_token_accuracy": 0.9804310500621796, |
| "num_tokens": 11165116.0, |
| "step": 1281 |
| }, |
| { |
| "entropy": 0.8384647369384766, |
| "epoch": 4.109149277688603, |
| "grad_norm": 1.5999270677566528, |
| "learning_rate": 8.492884077573749e-07, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.9951248466968536, |
| "num_tokens": 11173466.0, |
| "step": 1282 |
| }, |
| { |
| "entropy": 0.841315507888794, |
| "epoch": 4.112359550561798, |
| "grad_norm": 1.1852508783340454, |
| "learning_rate": 8.433881570362484e-07, |
| "loss": 0.0235, |
| "mean_token_accuracy": 0.9929181337356567, |
| "num_tokens": 11182017.0, |
| "step": 1283 |
| }, |
| { |
| "entropy": 0.8237481415271759, |
| "epoch": 4.115569823434992, |
| "grad_norm": 0.8450478911399841, |
| "learning_rate": 8.375065848430508e-07, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9960514605045319, |
| "num_tokens": 11190254.0, |
| "step": 1284 |
| }, |
| { |
| "entropy": 0.729523628950119, |
| "epoch": 4.118780096308186, |
| "grad_norm": 1.372982382774353, |
| "learning_rate": 8.316437176077491e-07, |
| "loss": 0.0199, |
| "mean_token_accuracy": 0.9932019710540771, |
| "num_tokens": 11198588.0, |
| "step": 1285 |
| }, |
| { |
| "entropy": 0.8260809779167175, |
| "epoch": 4.121990369181381, |
| "grad_norm": 0.9410303235054016, |
| "learning_rate": 8.257995816762559e-07, |
| "loss": 0.015, |
| "mean_token_accuracy": 0.9945105910301208, |
| "num_tokens": 11207145.0, |
| "step": 1286 |
| }, |
| { |
| "entropy": 0.7663401365280151, |
| "epoch": 4.125200642054574, |
| "grad_norm": 1.1343247890472412, |
| "learning_rate": 8.199742033103091e-07, |
| "loss": 0.0196, |
| "mean_token_accuracy": 0.9935542345046997, |
| "num_tokens": 11216391.0, |
| "step": 1287 |
| }, |
| { |
| "entropy": 0.7641996741294861, |
| "epoch": 4.128410914927769, |
| "grad_norm": 0.8208123445510864, |
| "learning_rate": 8.141676086873574e-07, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9957410991191864, |
| "num_tokens": 11224925.0, |
| "step": 1288 |
| }, |
| { |
| "entropy": 0.7193006873130798, |
| "epoch": 4.131621187800963, |
| "grad_norm": 1.0635079145431519, |
| "learning_rate": 8.083798239004408e-07, |
| "loss": 0.0428, |
| "mean_token_accuracy": 0.9678496420383453, |
| "num_tokens": 11235004.0, |
| "step": 1289 |
| }, |
| { |
| "entropy": 0.8392094969749451, |
| "epoch": 4.134831460674158, |
| "grad_norm": 1.0508886575698853, |
| "learning_rate": 8.026108749580758e-07, |
| "loss": 0.0204, |
| "mean_token_accuracy": 0.9942511916160583, |
| "num_tokens": 11245098.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.8576850295066833, |
| "epoch": 4.138041733547351, |
| "grad_norm": 1.0217255353927612, |
| "learning_rate": 7.968607877841333e-07, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.9947031438350677, |
| "num_tokens": 11254398.0, |
| "step": 1291 |
| }, |
| { |
| "entropy": 0.8804000616073608, |
| "epoch": 4.141252006420546, |
| "grad_norm": 2.1171441078186035, |
| "learning_rate": 7.911295882177256e-07, |
| "loss": 0.0128, |
| "mean_token_accuracy": 0.9961491823196411, |
| "num_tokens": 11262182.0, |
| "step": 1292 |
| }, |
| { |
| "entropy": 0.857866108417511, |
| "epoch": 4.14446227929374, |
| "grad_norm": 0.8988444209098816, |
| "learning_rate": 7.854173020130906e-07, |
| "loss": 0.0273, |
| "mean_token_accuracy": 0.9844126403331757, |
| "num_tokens": 11271620.0, |
| "step": 1293 |
| }, |
| { |
| "entropy": 0.9568844437599182, |
| "epoch": 4.147672552166934, |
| "grad_norm": 0.6712120771408081, |
| "learning_rate": 7.79723954839477e-07, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9966877102851868, |
| "num_tokens": 11279381.0, |
| "step": 1294 |
| }, |
| { |
| "entropy": 0.7858869731426239, |
| "epoch": 4.150882825040128, |
| "grad_norm": 1.349931001663208, |
| "learning_rate": 7.740495722810271e-07, |
| "loss": 0.027, |
| "mean_token_accuracy": 0.9832697808742523, |
| "num_tokens": 11289271.0, |
| "step": 1295 |
| }, |
| { |
| "entropy": 0.8988691568374634, |
| "epoch": 4.154093097913322, |
| "grad_norm": 0.7759506106376648, |
| "learning_rate": 7.683941798366578e-07, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.9971850514411926, |
| "num_tokens": 11297331.0, |
| "step": 1296 |
| }, |
| { |
| "entropy": 0.8500702083110809, |
| "epoch": 4.157303370786517, |
| "grad_norm": 1.1883063316345215, |
| "learning_rate": 7.627578029199562e-07, |
| "loss": 0.0257, |
| "mean_token_accuracy": 0.9848445653915405, |
| "num_tokens": 11307060.0, |
| "step": 1297 |
| }, |
| { |
| "entropy": 0.8567503988742828, |
| "epoch": 4.160513643659711, |
| "grad_norm": 1.2109795808792114, |
| "learning_rate": 7.571404668590532e-07, |
| "loss": 0.0332, |
| "mean_token_accuracy": 0.9718556702136993, |
| "num_tokens": 11316494.0, |
| "step": 1298 |
| }, |
| { |
| "entropy": 0.853458046913147, |
| "epoch": 4.163723916532906, |
| "grad_norm": 0.9728054404258728, |
| "learning_rate": 7.515421968965242e-07, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.991171270608902, |
| "num_tokens": 11324981.0, |
| "step": 1299 |
| }, |
| { |
| "entropy": 0.9854492545127869, |
| "epoch": 4.166934189406099, |
| "grad_norm": 0.9858706593513489, |
| "learning_rate": 7.459630181892608e-07, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9902990758419037, |
| "num_tokens": 11334714.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.8187530934810638, |
| "epoch": 4.170144462279294, |
| "grad_norm": 1.1075387001037598, |
| "learning_rate": 7.404029558083653e-07, |
| "loss": 0.0249, |
| "mean_token_accuracy": 0.9834834933280945, |
| "num_tokens": 11343590.0, |
| "step": 1301 |
| }, |
| { |
| "entropy": 0.7375591695308685, |
| "epoch": 4.173354735152488, |
| "grad_norm": 1.350464940071106, |
| "learning_rate": 7.348620347390384e-07, |
| "loss": 0.0206, |
| "mean_token_accuracy": 0.9900037944316864, |
| "num_tokens": 11353244.0, |
| "step": 1302 |
| }, |
| { |
| "entropy": 0.9315820038318634, |
| "epoch": 4.176565008025682, |
| "grad_norm": 1.5498111248016357, |
| "learning_rate": 7.293402798804667e-07, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9929503202438354, |
| "num_tokens": 11363019.0, |
| "step": 1303 |
| }, |
| { |
| "entropy": 0.8681217432022095, |
| "epoch": 4.179775280898877, |
| "grad_norm": 0.8983978033065796, |
| "learning_rate": 7.238377160457094e-07, |
| "loss": 0.0155, |
| "mean_token_accuracy": 0.996653288602829, |
| "num_tokens": 11371486.0, |
| "step": 1304 |
| }, |
| { |
| "entropy": 0.8901689052581787, |
| "epoch": 4.18298555377207, |
| "grad_norm": 0.8819789290428162, |
| "learning_rate": 7.183543679615834e-07, |
| "loss": 0.0149, |
| "mean_token_accuracy": 0.9940185546875, |
| "num_tokens": 11379965.0, |
| "step": 1305 |
| }, |
| { |
| "entropy": 0.8389869928359985, |
| "epoch": 4.186195826645265, |
| "grad_norm": 1.1202179193496704, |
| "learning_rate": 7.128902602685617e-07, |
| "loss": 0.0298, |
| "mean_token_accuracy": 0.9756748974323273, |
| "num_tokens": 11390299.0, |
| "step": 1306 |
| }, |
| { |
| "entropy": 0.849632740020752, |
| "epoch": 4.189406099518459, |
| "grad_norm": 1.1636488437652588, |
| "learning_rate": 7.074454175206524e-07, |
| "loss": 0.0207, |
| "mean_token_accuracy": 0.9929619431495667, |
| "num_tokens": 11398813.0, |
| "step": 1307 |
| }, |
| { |
| "entropy": 0.8657627403736115, |
| "epoch": 4.192616372391654, |
| "grad_norm": 0.7907083034515381, |
| "learning_rate": 7.020198641852949e-07, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9957354366779327, |
| "num_tokens": 11407073.0, |
| "step": 1308 |
| }, |
| { |
| "entropy": 0.890144944190979, |
| "epoch": 4.195826645264847, |
| "grad_norm": 0.7237719893455505, |
| "learning_rate": 6.966136246432492e-07, |
| "loss": 0.0101, |
| "mean_token_accuracy": 0.9964252412319183, |
| "num_tokens": 11416069.0, |
| "step": 1309 |
| }, |
| { |
| "entropy": 0.912724643945694, |
| "epoch": 4.199036918138042, |
| "grad_norm": 0.8627637028694153, |
| "learning_rate": 6.912267231884817e-07, |
| "loss": 0.0187, |
| "mean_token_accuracy": 0.9939960241317749, |
| "num_tokens": 11425409.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.9045071303844452, |
| "epoch": 4.202247191011236, |
| "grad_norm": 0.6950350999832153, |
| "learning_rate": 6.858591840280627e-07, |
| "loss": 0.0126, |
| "mean_token_accuracy": 0.9963579177856445, |
| "num_tokens": 11433984.0, |
| "step": 1311 |
| }, |
| { |
| "entropy": 0.8476213812828064, |
| "epoch": 4.20545746388443, |
| "grad_norm": 1.5948781967163086, |
| "learning_rate": 6.805110312820501e-07, |
| "loss": 0.0163, |
| "mean_token_accuracy": 0.9928786754608154, |
| "num_tokens": 11443158.0, |
| "step": 1312 |
| }, |
| { |
| "entropy": 0.9233494997024536, |
| "epoch": 4.208667736757624, |
| "grad_norm": 1.3917864561080933, |
| "learning_rate": 6.751822889833926e-07, |
| "loss": 0.0191, |
| "mean_token_accuracy": 0.9936199188232422, |
| "num_tokens": 11452244.0, |
| "step": 1313 |
| }, |
| { |
| "entropy": 0.8223008215427399, |
| "epoch": 4.211878009630818, |
| "grad_norm": 0.9445686936378479, |
| "learning_rate": 6.698729810778065e-07, |
| "loss": 0.0154, |
| "mean_token_accuracy": 0.9957300126552582, |
| "num_tokens": 11459714.0, |
| "step": 1314 |
| }, |
| { |
| "entropy": 0.9119621515274048, |
| "epoch": 4.215088282504013, |
| "grad_norm": 1.3611754179000854, |
| "learning_rate": 6.645831314236817e-07, |
| "loss": 0.0251, |
| "mean_token_accuracy": 0.9857873618602753, |
| "num_tokens": 11467671.0, |
| "step": 1315 |
| }, |
| { |
| "entropy": 0.8818807601928711, |
| "epoch": 4.218298555377207, |
| "grad_norm": 1.3261618614196777, |
| "learning_rate": 6.593127637919633e-07, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.9928447604179382, |
| "num_tokens": 11476584.0, |
| "step": 1316 |
| }, |
| { |
| "entropy": 0.9259830713272095, |
| "epoch": 4.221508828250402, |
| "grad_norm": 1.2929267883300781, |
| "learning_rate": 6.540619018660555e-07, |
| "loss": 0.0215, |
| "mean_token_accuracy": 0.9909241497516632, |
| "num_tokens": 11485902.0, |
| "step": 1317 |
| }, |
| { |
| "entropy": 0.8626327812671661, |
| "epoch": 4.224719101123595, |
| "grad_norm": 0.8614633083343506, |
| "learning_rate": 6.488305692417074e-07, |
| "loss": 0.015, |
| "mean_token_accuracy": 0.9946702420711517, |
| "num_tokens": 11493744.0, |
| "step": 1318 |
| }, |
| { |
| "entropy": 0.8261954486370087, |
| "epoch": 4.22792937399679, |
| "grad_norm": 0.8554882407188416, |
| "learning_rate": 6.436187894269086e-07, |
| "loss": 0.0271, |
| "mean_token_accuracy": 0.9795649349689484, |
| "num_tokens": 11502359.0, |
| "step": 1319 |
| }, |
| { |
| "entropy": 0.905153900384903, |
| "epoch": 4.231139646869984, |
| "grad_norm": 0.7063010931015015, |
| "learning_rate": 6.384265858417877e-07, |
| "loss": 0.0263, |
| "mean_token_accuracy": 0.9765981733798981, |
| "num_tokens": 11512068.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.7651464641094208, |
| "epoch": 4.234349919743178, |
| "grad_norm": 2.0079517364501953, |
| "learning_rate": 6.332539818184985e-07, |
| "loss": 0.0218, |
| "mean_token_accuracy": 0.9929872751235962, |
| "num_tokens": 11520923.0, |
| "step": 1321 |
| }, |
| { |
| "entropy": 0.789582222700119, |
| "epoch": 4.237560192616373, |
| "grad_norm": 1.0748300552368164, |
| "learning_rate": 6.281010006011256e-07, |
| "loss": 0.0247, |
| "mean_token_accuracy": 0.9863014817237854, |
| "num_tokens": 11529886.0, |
| "step": 1322 |
| }, |
| { |
| "entropy": 0.780224084854126, |
| "epoch": 4.240770465489566, |
| "grad_norm": 1.1340675354003906, |
| "learning_rate": 6.229676653455719e-07, |
| "loss": 0.0184, |
| "mean_token_accuracy": 0.9943748712539673, |
| "num_tokens": 11537909.0, |
| "step": 1323 |
| }, |
| { |
| "entropy": 0.8488284349441528, |
| "epoch": 4.243980738362761, |
| "grad_norm": 1.945456862449646, |
| "learning_rate": 6.178539991194599e-07, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9947131276130676, |
| "num_tokens": 11546221.0, |
| "step": 1324 |
| }, |
| { |
| "entropy": 0.7531148791313171, |
| "epoch": 4.247191011235955, |
| "grad_norm": 0.9983758330345154, |
| "learning_rate": 6.127600249020216e-07, |
| "loss": 0.0153, |
| "mean_token_accuracy": 0.9957643747329712, |
| "num_tokens": 11554442.0, |
| "step": 1325 |
| }, |
| { |
| "entropy": 0.858670711517334, |
| "epoch": 4.25040128410915, |
| "grad_norm": 2.524183988571167, |
| "learning_rate": 6.076857655840024e-07, |
| "loss": 0.0215, |
| "mean_token_accuracy": 0.9911713302135468, |
| "num_tokens": 11563316.0, |
| "step": 1326 |
| }, |
| { |
| "entropy": 0.9219168722629547, |
| "epoch": 4.253611556982343, |
| "grad_norm": 0.8876209855079651, |
| "learning_rate": 6.026312439675553e-07, |
| "loss": 0.0143, |
| "mean_token_accuracy": 0.9944645464420319, |
| "num_tokens": 11571721.0, |
| "step": 1327 |
| }, |
| { |
| "entropy": 0.8005822896957397, |
| "epoch": 4.256821829855538, |
| "grad_norm": 0.860865592956543, |
| "learning_rate": 5.975964827661346e-07, |
| "loss": 0.0297, |
| "mean_token_accuracy": 0.9751374423503876, |
| "num_tokens": 11581449.0, |
| "step": 1328 |
| }, |
| { |
| "entropy": 0.7818480730056763, |
| "epoch": 4.260032102728732, |
| "grad_norm": 4.995467662811279, |
| "learning_rate": 5.925815046044026e-07, |
| "loss": 0.0315, |
| "mean_token_accuracy": 0.9817995131015778, |
| "num_tokens": 11590914.0, |
| "step": 1329 |
| }, |
| { |
| "entropy": 0.781961977481842, |
| "epoch": 4.263242375601926, |
| "grad_norm": 1.655155897140503, |
| "learning_rate": 5.875863320181175e-07, |
| "loss": 0.0306, |
| "mean_token_accuracy": 0.9803996980190277, |
| "num_tokens": 11601404.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.9626095294952393, |
| "epoch": 4.26645264847512, |
| "grad_norm": 1.3983674049377441, |
| "learning_rate": 5.826109874540409e-07, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9947277307510376, |
| "num_tokens": 11610077.0, |
| "step": 1331 |
| }, |
| { |
| "entropy": 0.9014355540275574, |
| "epoch": 4.269662921348314, |
| "grad_norm": 2.09114146232605, |
| "learning_rate": 5.776554932698325e-07, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.9936963021755219, |
| "num_tokens": 11619265.0, |
| "step": 1332 |
| }, |
| { |
| "entropy": 0.8946935832500458, |
| "epoch": 4.272873194221509, |
| "grad_norm": 0.841145932674408, |
| "learning_rate": 5.727198717339511e-07, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9885965883731842, |
| "num_tokens": 11628528.0, |
| "step": 1333 |
| }, |
| { |
| "entropy": 0.8457264304161072, |
| "epoch": 4.276083467094703, |
| "grad_norm": 1.5083551406860352, |
| "learning_rate": 5.678041450255512e-07, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9952947199344635, |
| "num_tokens": 11636719.0, |
| "step": 1334 |
| }, |
| { |
| "entropy": 0.7869194149971008, |
| "epoch": 4.279293739967898, |
| "grad_norm": 1.0477701425552368, |
| "learning_rate": 5.6290833523439e-07, |
| "loss": 0.0181, |
| "mean_token_accuracy": 0.9945695102214813, |
| "num_tokens": 11645004.0, |
| "step": 1335 |
| }, |
| { |
| "entropy": 0.9997333586215973, |
| "epoch": 4.282504012841091, |
| "grad_norm": 0.8144447803497314, |
| "learning_rate": 5.58032464360721e-07, |
| "loss": 0.0134, |
| "mean_token_accuracy": 0.9962140023708344, |
| "num_tokens": 11653557.0, |
| "step": 1336 |
| }, |
| { |
| "entropy": 0.8154990077018738, |
| "epoch": 4.285714285714286, |
| "grad_norm": 1.150378704071045, |
| "learning_rate": 5.531765543152002e-07, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9930236041545868, |
| "num_tokens": 11662284.0, |
| "step": 1337 |
| }, |
| { |
| "entropy": 0.799591451883316, |
| "epoch": 4.28892455858748, |
| "grad_norm": 0.8984584808349609, |
| "learning_rate": 5.483406269187869e-07, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.9946328103542328, |
| "num_tokens": 11670400.0, |
| "step": 1338 |
| }, |
| { |
| "entropy": 0.7851231098175049, |
| "epoch": 4.292134831460674, |
| "grad_norm": 1.5604168176651, |
| "learning_rate": 5.435247039026398e-07, |
| "loss": 0.0275, |
| "mean_token_accuracy": 0.9917657375335693, |
| "num_tokens": 11679997.0, |
| "step": 1339 |
| }, |
| { |
| "entropy": 0.941244900226593, |
| "epoch": 4.295345104333869, |
| "grad_norm": 2.561316728591919, |
| "learning_rate": 5.387288069080298e-07, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9957253336906433, |
| "num_tokens": 11689727.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.7649567723274231, |
| "epoch": 4.298555377207062, |
| "grad_norm": 0.8507292866706848, |
| "learning_rate": 5.33952957486234e-07, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9926185607910156, |
| "num_tokens": 11698033.0, |
| "step": 1341 |
| }, |
| { |
| "entropy": 0.7722039520740509, |
| "epoch": 4.301765650080257, |
| "grad_norm": 0.7529566287994385, |
| "learning_rate": 5.291971770984428e-07, |
| "loss": 0.018, |
| "mean_token_accuracy": 0.9899595379829407, |
| "num_tokens": 11707024.0, |
| "step": 1342 |
| }, |
| { |
| "entropy": 0.8715614974498749, |
| "epoch": 4.304975922953451, |
| "grad_norm": 0.7769994139671326, |
| "learning_rate": 5.244614871156612e-07, |
| "loss": 0.0128, |
| "mean_token_accuracy": 0.9952245056629181, |
| "num_tokens": 11714142.0, |
| "step": 1343 |
| }, |
| { |
| "entropy": 0.7915047109127045, |
| "epoch": 4.308186195826646, |
| "grad_norm": 1.3179060220718384, |
| "learning_rate": 5.197459088186163e-07, |
| "loss": 0.015, |
| "mean_token_accuracy": 0.9947215914726257, |
| "num_tokens": 11722750.0, |
| "step": 1344 |
| }, |
| { |
| "entropy": 0.8518697619438171, |
| "epoch": 4.311396468699839, |
| "grad_norm": 1.0496397018432617, |
| "learning_rate": 5.150504633976572e-07, |
| "loss": 0.0274, |
| "mean_token_accuracy": 0.9796628355979919, |
| "num_tokens": 11732462.0, |
| "step": 1345 |
| }, |
| { |
| "entropy": 0.9217169284820557, |
| "epoch": 4.314606741573034, |
| "grad_norm": 1.6036604642868042, |
| "learning_rate": 5.103751719526639e-07, |
| "loss": 0.0143, |
| "mean_token_accuracy": 0.994863748550415, |
| "num_tokens": 11740413.0, |
| "step": 1346 |
| }, |
| { |
| "entropy": 0.8194193542003632, |
| "epoch": 4.317817014446228, |
| "grad_norm": 2.075221300125122, |
| "learning_rate": 5.057200554929509e-07, |
| "loss": 0.0243, |
| "mean_token_accuracy": 0.9908171594142914, |
| "num_tokens": 11749708.0, |
| "step": 1347 |
| }, |
| { |
| "entropy": 0.8294661045074463, |
| "epoch": 4.321027287319422, |
| "grad_norm": 0.9790740013122559, |
| "learning_rate": 5.010851349371704e-07, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9910164773464203, |
| "num_tokens": 11758025.0, |
| "step": 1348 |
| }, |
| { |
| "entropy": 0.7852829992771149, |
| "epoch": 4.324237560192616, |
| "grad_norm": 1.6573703289031982, |
| "learning_rate": 4.964704311132224e-07, |
| "loss": 0.0163, |
| "mean_token_accuracy": 0.9949574768543243, |
| "num_tokens": 11765881.0, |
| "step": 1349 |
| }, |
| { |
| "entropy": 0.8169362843036652, |
| "epoch": 4.32744783306581, |
| "grad_norm": 0.9623476266860962, |
| "learning_rate": 4.918759647581578e-07, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.9888501465320587, |
| "num_tokens": 11774323.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.8484928011894226, |
| "epoch": 4.330658105939005, |
| "grad_norm": 2.397317886352539, |
| "learning_rate": 4.873017565180871e-07, |
| "loss": 0.0295, |
| "mean_token_accuracy": 0.9821926355361938, |
| "num_tokens": 11783381.0, |
| "step": 1351 |
| }, |
| { |
| "entropy": 0.8673693537712097, |
| "epoch": 4.333868378812199, |
| "grad_norm": 1.7956311702728271, |
| "learning_rate": 4.827478269480895e-07, |
| "loss": 0.0148, |
| "mean_token_accuracy": 0.994268536567688, |
| "num_tokens": 11791626.0, |
| "step": 1352 |
| }, |
| { |
| "entropy": 0.8526739478111267, |
| "epoch": 4.337078651685394, |
| "grad_norm": 1.6738061904907227, |
| "learning_rate": 4.782141965121129e-07, |
| "loss": 0.0162, |
| "mean_token_accuracy": 0.9941445887088776, |
| "num_tokens": 11800040.0, |
| "step": 1353 |
| }, |
| { |
| "entropy": 0.8506563901901245, |
| "epoch": 4.340288924558587, |
| "grad_norm": 0.7996606230735779, |
| "learning_rate": 4.7370088558289175e-07, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9958418607711792, |
| "num_tokens": 11809269.0, |
| "step": 1354 |
| }, |
| { |
| "entropy": 0.8788428902626038, |
| "epoch": 4.343499197431782, |
| "grad_norm": 0.7069100141525269, |
| "learning_rate": 4.6920791444184934e-07, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.9968018531799316, |
| "num_tokens": 11817225.0, |
| "step": 1355 |
| }, |
| { |
| "entropy": 0.8030937612056732, |
| "epoch": 4.346709470304976, |
| "grad_norm": 2.9055588245391846, |
| "learning_rate": 4.647353032790086e-07, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9941924810409546, |
| "num_tokens": 11825674.0, |
| "step": 1356 |
| }, |
| { |
| "entropy": 0.9827845692634583, |
| "epoch": 4.34991974317817, |
| "grad_norm": 1.107529878616333, |
| "learning_rate": 4.602830721928997e-07, |
| "loss": 0.035, |
| "mean_token_accuracy": 0.9746537506580353, |
| "num_tokens": 11834822.0, |
| "step": 1357 |
| }, |
| { |
| "entropy": 0.9760691821575165, |
| "epoch": 4.353130016051364, |
| "grad_norm": 0.5545116662979126, |
| "learning_rate": 4.558512411904731e-07, |
| "loss": 0.0106, |
| "mean_token_accuracy": 0.9977552592754364, |
| "num_tokens": 11843119.0, |
| "step": 1358 |
| }, |
| { |
| "entropy": 0.8231452703475952, |
| "epoch": 4.356340288924558, |
| "grad_norm": 1.0874269008636475, |
| "learning_rate": 4.5143983018700485e-07, |
| "loss": 0.0256, |
| "mean_token_accuracy": 0.9907995164394379, |
| "num_tokens": 11852225.0, |
| "step": 1359 |
| }, |
| { |
| "entropy": 0.7386958003044128, |
| "epoch": 4.359550561797753, |
| "grad_norm": 1.1277332305908203, |
| "learning_rate": 4.4704885900601236e-07, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9933885335922241, |
| "num_tokens": 11860832.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.8382771015167236, |
| "epoch": 4.362760834670947, |
| "grad_norm": 0.8998692035675049, |
| "learning_rate": 4.4267834737916295e-07, |
| "loss": 0.0125, |
| "mean_token_accuracy": 0.996128261089325, |
| "num_tokens": 11869460.0, |
| "step": 1361 |
| }, |
| { |
| "entropy": 0.9397884905338287, |
| "epoch": 4.365971107544142, |
| "grad_norm": 0.9518370628356934, |
| "learning_rate": 4.3832831494618255e-07, |
| "loss": 0.0369, |
| "mean_token_accuracy": 0.9735174477100372, |
| "num_tokens": 11880031.0, |
| "step": 1362 |
| }, |
| { |
| "entropy": 0.7591385245323181, |
| "epoch": 4.369181380417335, |
| "grad_norm": 1.1569485664367676, |
| "learning_rate": 4.33998781254773e-07, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9944524168968201, |
| "num_tokens": 11888471.0, |
| "step": 1363 |
| }, |
| { |
| "entropy": 0.8310154378414154, |
| "epoch": 4.37239165329053, |
| "grad_norm": 1.0223422050476074, |
| "learning_rate": 4.2968976576051703e-07, |
| "loss": 0.0136, |
| "mean_token_accuracy": 0.9942927956581116, |
| "num_tokens": 11896131.0, |
| "step": 1364 |
| }, |
| { |
| "entropy": 0.7848845720291138, |
| "epoch": 4.375601926163724, |
| "grad_norm": 0.8517405986785889, |
| "learning_rate": 4.2540128782679934e-07, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9930301904678345, |
| "num_tokens": 11904730.0, |
| "step": 1365 |
| }, |
| { |
| "entropy": 0.8888550400733948, |
| "epoch": 4.378812199036918, |
| "grad_norm": 1.2937294244766235, |
| "learning_rate": 4.211333667247125e-07, |
| "loss": 0.0172, |
| "mean_token_accuracy": 0.9935269057750702, |
| "num_tokens": 11913169.0, |
| "step": 1366 |
| }, |
| { |
| "entropy": 0.7717286348342896, |
| "epoch": 4.382022471910112, |
| "grad_norm": 0.9954974055290222, |
| "learning_rate": 4.1688602163297564e-07, |
| "loss": 0.0119, |
| "mean_token_accuracy": 0.9969731569290161, |
| "num_tokens": 11922132.0, |
| "step": 1367 |
| }, |
| { |
| "entropy": 0.8456141352653503, |
| "epoch": 4.385232744783306, |
| "grad_norm": 1.324472427368164, |
| "learning_rate": 4.126592716378408e-07, |
| "loss": 0.0102, |
| "mean_token_accuracy": 0.9973188638687134, |
| "num_tokens": 11929738.0, |
| "step": 1368 |
| }, |
| { |
| "entropy": 0.9674667716026306, |
| "epoch": 4.388443017656501, |
| "grad_norm": 0.7544914484024048, |
| "learning_rate": 4.0845313573301736e-07, |
| "loss": 0.0134, |
| "mean_token_accuracy": 0.996004194021225, |
| "num_tokens": 11938396.0, |
| "step": 1369 |
| }, |
| { |
| "entropy": 0.8736143708229065, |
| "epoch": 4.391653290529695, |
| "grad_norm": 0.7559876441955566, |
| "learning_rate": 4.042676328195788e-07, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9960503578186035, |
| "num_tokens": 11947418.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.8970188498497009, |
| "epoch": 4.39486356340289, |
| "grad_norm": 0.8440226316452026, |
| "learning_rate": 4.001027817058789e-07, |
| "loss": 0.0153, |
| "mean_token_accuracy": 0.9943675100803375, |
| "num_tokens": 11956316.0, |
| "step": 1371 |
| }, |
| { |
| "entropy": 0.7634763419628143, |
| "epoch": 4.398073836276083, |
| "grad_norm": 1.064709186553955, |
| "learning_rate": 3.959586011074729e-07, |
| "loss": 0.0313, |
| "mean_token_accuracy": 0.9725034534931183, |
| "num_tokens": 11965704.0, |
| "step": 1372 |
| }, |
| { |
| "entropy": 0.8148213326931, |
| "epoch": 4.401284109149278, |
| "grad_norm": 1.9659761190414429, |
| "learning_rate": 3.9183510964702463e-07, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.9943975806236267, |
| "num_tokens": 11974096.0, |
| "step": 1373 |
| }, |
| { |
| "entropy": 0.7761686444282532, |
| "epoch": 4.404494382022472, |
| "grad_norm": 1.3696943521499634, |
| "learning_rate": 3.8773232585422924e-07, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.9964625537395477, |
| "num_tokens": 11982355.0, |
| "step": 1374 |
| }, |
| { |
| "entropy": 0.7501409351825714, |
| "epoch": 4.407704654895666, |
| "grad_norm": 0.7303560376167297, |
| "learning_rate": 3.836502681657289e-07, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.990977019071579, |
| "num_tokens": 11991634.0, |
| "step": 1375 |
| }, |
| { |
| "entropy": 0.8639850616455078, |
| "epoch": 4.41091492776886, |
| "grad_norm": 1.1209553480148315, |
| "learning_rate": 3.795889549250292e-07, |
| "loss": 0.0138, |
| "mean_token_accuracy": 0.9946679472923279, |
| "num_tokens": 12000344.0, |
| "step": 1376 |
| }, |
| { |
| "entropy": 0.8161064386367798, |
| "epoch": 4.414125200642054, |
| "grad_norm": 2.2693448066711426, |
| "learning_rate": 3.755484043824131e-07, |
| "loss": 0.0138, |
| "mean_token_accuracy": 0.9931889176368713, |
| "num_tokens": 12009056.0, |
| "step": 1377 |
| }, |
| { |
| "entropy": 0.7727282047271729, |
| "epoch": 4.417335473515249, |
| "grad_norm": 0.9428284764289856, |
| "learning_rate": 3.715286346948671e-07, |
| "loss": 0.0215, |
| "mean_token_accuracy": 0.9886394441127777, |
| "num_tokens": 12018012.0, |
| "step": 1378 |
| }, |
| { |
| "entropy": 0.6903052926063538, |
| "epoch": 4.420545746388443, |
| "grad_norm": 2.0464818477630615, |
| "learning_rate": 3.675296639259912e-07, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.992774486541748, |
| "num_tokens": 12025935.0, |
| "step": 1379 |
| }, |
| { |
| "entropy": 0.9016786813735962, |
| "epoch": 4.423756019261638, |
| "grad_norm": 2.821998357772827, |
| "learning_rate": 3.6355151004592414e-07, |
| "loss": 0.0207, |
| "mean_token_accuracy": 0.9895893335342407, |
| "num_tokens": 12034016.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.7792826294898987, |
| "epoch": 4.426966292134831, |
| "grad_norm": 0.8225787878036499, |
| "learning_rate": 3.595941909312595e-07, |
| "loss": 0.0282, |
| "mean_token_accuracy": 0.9868153035640717, |
| "num_tokens": 12042647.0, |
| "step": 1381 |
| }, |
| { |
| "entropy": 0.7803854644298553, |
| "epoch": 4.430176565008026, |
| "grad_norm": 1.0302590131759644, |
| "learning_rate": 3.5565772436496336e-07, |
| "loss": 0.0205, |
| "mean_token_accuracy": 0.9852829873561859, |
| "num_tokens": 12051756.0, |
| "step": 1382 |
| }, |
| { |
| "entropy": 1.015624314546585, |
| "epoch": 4.43338683788122, |
| "grad_norm": 2.2021806240081787, |
| "learning_rate": 3.517421280363004e-07, |
| "loss": 0.0162, |
| "mean_token_accuracy": 0.9948179125785828, |
| "num_tokens": 12061734.0, |
| "step": 1383 |
| }, |
| { |
| "entropy": 0.7805822789669037, |
| "epoch": 4.436597110754414, |
| "grad_norm": 0.9023168087005615, |
| "learning_rate": 3.4784741954074884e-07, |
| "loss": 0.0197, |
| "mean_token_accuracy": 0.9920674264431, |
| "num_tokens": 12071018.0, |
| "step": 1384 |
| }, |
| { |
| "entropy": 0.8862617015838623, |
| "epoch": 4.439807383627608, |
| "grad_norm": 0.7753137946128845, |
| "learning_rate": 3.439736163799251e-07, |
| "loss": 0.0128, |
| "mean_token_accuracy": 0.9962200224399567, |
| "num_tokens": 12078519.0, |
| "step": 1385 |
| }, |
| { |
| "entropy": 0.7836082875728607, |
| "epoch": 4.443017656500802, |
| "grad_norm": 1.1008198261260986, |
| "learning_rate": 3.4012073596150106e-07, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.9951090216636658, |
| "num_tokens": 12086876.0, |
| "step": 1386 |
| }, |
| { |
| "entropy": 0.7967692613601685, |
| "epoch": 4.446227929373997, |
| "grad_norm": 0.8891757130622864, |
| "learning_rate": 3.362887955991301e-07, |
| "loss": 0.0219, |
| "mean_token_accuracy": 0.9891441464424133, |
| "num_tokens": 12095596.0, |
| "step": 1387 |
| }, |
| { |
| "entropy": 0.829216718673706, |
| "epoch": 4.449438202247191, |
| "grad_norm": 0.743823230266571, |
| "learning_rate": 3.3247781251236623e-07, |
| "loss": 0.0176, |
| "mean_token_accuracy": 0.9918361604213715, |
| "num_tokens": 12104120.0, |
| "step": 1388 |
| }, |
| { |
| "entropy": 0.8801736235618591, |
| "epoch": 4.452648475120386, |
| "grad_norm": 1.7145832777023315, |
| "learning_rate": 3.2868780382658895e-07, |
| "loss": 0.0167, |
| "mean_token_accuracy": 0.9942232072353363, |
| "num_tokens": 12113979.0, |
| "step": 1389 |
| }, |
| { |
| "entropy": 0.7938494682312012, |
| "epoch": 4.455858747993579, |
| "grad_norm": 0.9124286770820618, |
| "learning_rate": 3.2491878657292643e-07, |
| "loss": 0.0156, |
| "mean_token_accuracy": 0.994210958480835, |
| "num_tokens": 12122733.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.8369238376617432, |
| "epoch": 4.459069020866774, |
| "grad_norm": 1.171193242073059, |
| "learning_rate": 3.2117077768817395e-07, |
| "loss": 0.0305, |
| "mean_token_accuracy": 0.9782870709896088, |
| "num_tokens": 12131758.0, |
| "step": 1391 |
| }, |
| { |
| "entropy": 0.8763419985771179, |
| "epoch": 4.462279293739968, |
| "grad_norm": 0.8820005059242249, |
| "learning_rate": 3.174437940147268e-07, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9950818717479706, |
| "num_tokens": 12141100.0, |
| "step": 1392 |
| }, |
| { |
| "entropy": 0.8697878122329712, |
| "epoch": 4.465489566613162, |
| "grad_norm": 0.9343982934951782, |
| "learning_rate": 3.1373785230049356e-07, |
| "loss": 0.0122, |
| "mean_token_accuracy": 0.9963211715221405, |
| "num_tokens": 12149229.0, |
| "step": 1393 |
| }, |
| { |
| "entropy": 0.8618627786636353, |
| "epoch": 4.468699839486356, |
| "grad_norm": 1.007222294807434, |
| "learning_rate": 3.1005296919883354e-07, |
| "loss": 0.0212, |
| "mean_token_accuracy": 0.9941324591636658, |
| "num_tokens": 12157044.0, |
| "step": 1394 |
| }, |
| { |
| "entropy": 0.7762088775634766, |
| "epoch": 4.47191011235955, |
| "grad_norm": 1.0554577112197876, |
| "learning_rate": 3.0638916126846885e-07, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9922034442424774, |
| "num_tokens": 12166146.0, |
| "step": 1395 |
| }, |
| { |
| "entropy": 0.901738703250885, |
| "epoch": 4.475120385232745, |
| "grad_norm": 1.7087578773498535, |
| "learning_rate": 3.0274644497342133e-07, |
| "loss": 0.0146, |
| "mean_token_accuracy": 0.9947327375411987, |
| "num_tokens": 12174767.0, |
| "step": 1396 |
| }, |
| { |
| "entropy": 0.9199126064777374, |
| "epoch": 4.478330658105939, |
| "grad_norm": 1.3522827625274658, |
| "learning_rate": 2.991248366829291e-07, |
| "loss": 0.0128, |
| "mean_token_accuracy": 0.9958087801933289, |
| "num_tokens": 12182552.0, |
| "step": 1397 |
| }, |
| { |
| "entropy": 0.7853618264198303, |
| "epoch": 4.481540930979134, |
| "grad_norm": 0.7523943781852722, |
| "learning_rate": 2.955243526713808e-07, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9948084652423859, |
| "num_tokens": 12190296.0, |
| "step": 1398 |
| }, |
| { |
| "entropy": 0.8466727435588837, |
| "epoch": 4.484751203852327, |
| "grad_norm": 1.0168616771697998, |
| "learning_rate": 2.91945009118238e-07, |
| "loss": 0.0165, |
| "mean_token_accuracy": 0.9945223033428192, |
| "num_tokens": 12198478.0, |
| "step": 1399 |
| }, |
| { |
| "entropy": 0.8088905811309814, |
| "epoch": 4.487961476725522, |
| "grad_norm": 1.3884814977645874, |
| "learning_rate": 2.883868221079628e-07, |
| "loss": 0.0218, |
| "mean_token_accuracy": 0.991487979888916, |
| "num_tokens": 12207216.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.7696555852890015, |
| "epoch": 4.491171749598716, |
| "grad_norm": 1.1995670795440674, |
| "learning_rate": 2.848498076299483e-07, |
| "loss": 0.0202, |
| "mean_token_accuracy": 0.9934026896953583, |
| "num_tokens": 12215618.0, |
| "step": 1401 |
| }, |
| { |
| "entropy": 0.8784035742282867, |
| "epoch": 4.49438202247191, |
| "grad_norm": 1.1318795680999756, |
| "learning_rate": 2.813339815784416e-07, |
| "loss": 0.014, |
| "mean_token_accuracy": 0.996482789516449, |
| "num_tokens": 12223941.0, |
| "step": 1402 |
| }, |
| { |
| "entropy": 0.9082882702350616, |
| "epoch": 4.497592295345104, |
| "grad_norm": 0.5473995804786682, |
| "learning_rate": 2.7783935975247867e-07, |
| "loss": 0.0101, |
| "mean_token_accuracy": 0.9972693026065826, |
| "num_tokens": 12232545.0, |
| "step": 1403 |
| }, |
| { |
| "entropy": 0.8398851752281189, |
| "epoch": 4.500802568218298, |
| "grad_norm": 1.0231767892837524, |
| "learning_rate": 2.743659578558089e-07, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.9750750958919525, |
| "num_tokens": 12241817.0, |
| "step": 1404 |
| }, |
| { |
| "entropy": 0.6918390095233917, |
| "epoch": 4.504012841091493, |
| "grad_norm": 0.8074386119842529, |
| "learning_rate": 2.7091379149682683e-07, |
| "loss": 0.0224, |
| "mean_token_accuracy": 0.9840397238731384, |
| "num_tokens": 12250369.0, |
| "step": 1405 |
| }, |
| { |
| "entropy": 0.8113892078399658, |
| "epoch": 4.507223113964687, |
| "grad_norm": 2.0139825344085693, |
| "learning_rate": 2.6748287618849957e-07, |
| "loss": 0.018, |
| "mean_token_accuracy": 0.993849903345108, |
| "num_tokens": 12258742.0, |
| "step": 1406 |
| }, |
| { |
| "entropy": 0.8949583470821381, |
| "epoch": 4.510433386837882, |
| "grad_norm": 0.6584023237228394, |
| "learning_rate": 2.6407322734829763e-07, |
| "loss": 0.0095, |
| "mean_token_accuracy": 0.9971945285797119, |
| "num_tokens": 12266628.0, |
| "step": 1407 |
| }, |
| { |
| "entropy": 0.8380038738250732, |
| "epoch": 4.513643659711075, |
| "grad_norm": 0.801179826259613, |
| "learning_rate": 2.6068486029813154e-07, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9962698221206665, |
| "num_tokens": 12274399.0, |
| "step": 1408 |
| }, |
| { |
| "entropy": 0.7071676850318909, |
| "epoch": 4.51685393258427, |
| "grad_norm": 0.8389870524406433, |
| "learning_rate": 2.573177902642726e-07, |
| "loss": 0.0233, |
| "mean_token_accuracy": 0.9872068166732788, |
| "num_tokens": 12285419.0, |
| "step": 1409 |
| }, |
| { |
| "entropy": 0.7678396701812744, |
| "epoch": 4.520064205457464, |
| "grad_norm": 0.7854102849960327, |
| "learning_rate": 2.539720323772926e-07, |
| "loss": 0.0199, |
| "mean_token_accuracy": 0.9910922050476074, |
| "num_tokens": 12293828.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.7876458466053009, |
| "epoch": 4.523274478330658, |
| "grad_norm": 0.9480336308479309, |
| "learning_rate": 2.506476016719922e-07, |
| "loss": 0.0154, |
| "mean_token_accuracy": 0.9949440360069275, |
| "num_tokens": 12301568.0, |
| "step": 1411 |
| }, |
| { |
| "entropy": 0.8209743797779083, |
| "epoch": 4.526484751203852, |
| "grad_norm": 0.8667705059051514, |
| "learning_rate": 2.473445130873353e-07, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9932181537151337, |
| "num_tokens": 12310382.0, |
| "step": 1412 |
| }, |
| { |
| "entropy": 0.866468995809555, |
| "epoch": 4.529695024077046, |
| "grad_norm": 0.9462944865226746, |
| "learning_rate": 2.440627814663804e-07, |
| "loss": 0.0193, |
| "mean_token_accuracy": 0.9907811284065247, |
| "num_tokens": 12318521.0, |
| "step": 1413 |
| }, |
| { |
| "entropy": 0.7582902610301971, |
| "epoch": 4.532905296950241, |
| "grad_norm": 1.1882989406585693, |
| "learning_rate": 2.4080242155621327e-07, |
| "loss": 0.018, |
| "mean_token_accuracy": 0.9941226541996002, |
| "num_tokens": 12327369.0, |
| "step": 1414 |
| }, |
| { |
| "entropy": 0.7556898593902588, |
| "epoch": 4.536115569823435, |
| "grad_norm": 2.0978384017944336, |
| "learning_rate": 2.3756344800788421e-07, |
| "loss": 0.0217, |
| "mean_token_accuracy": 0.9932650327682495, |
| "num_tokens": 12335719.0, |
| "step": 1415 |
| }, |
| { |
| "entropy": 0.858602374792099, |
| "epoch": 4.539325842696629, |
| "grad_norm": 1.414963960647583, |
| "learning_rate": 2.343458753763378e-07, |
| "loss": 0.0195, |
| "mean_token_accuracy": 0.9922243356704712, |
| "num_tokens": 12344686.0, |
| "step": 1416 |
| }, |
| { |
| "entropy": 0.8799131810665131, |
| "epoch": 4.542536115569823, |
| "grad_norm": 0.9761103391647339, |
| "learning_rate": 2.3114971812034981e-07, |
| "loss": 0.0148, |
| "mean_token_accuracy": 0.9959944486618042, |
| "num_tokens": 12352531.0, |
| "step": 1417 |
| }, |
| { |
| "entropy": 0.8054837286472321, |
| "epoch": 4.545746388443018, |
| "grad_norm": 0.7446231842041016, |
| "learning_rate": 2.2797499060246253e-07, |
| "loss": 0.0148, |
| "mean_token_accuracy": 0.9948087632656097, |
| "num_tokens": 12361539.0, |
| "step": 1418 |
| }, |
| { |
| "entropy": 0.8667932152748108, |
| "epoch": 4.548956661316212, |
| "grad_norm": 2.7349588871002197, |
| "learning_rate": 2.2482170708892083e-07, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9932528138160706, |
| "num_tokens": 12371893.0, |
| "step": 1419 |
| }, |
| { |
| "entropy": 0.8341934084892273, |
| "epoch": 4.552166934189406, |
| "grad_norm": 0.983849823474884, |
| "learning_rate": 2.2168988174960382e-07, |
| "loss": 0.0176, |
| "mean_token_accuracy": 0.9940782487392426, |
| "num_tokens": 12381677.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.8310158252716064, |
| "epoch": 4.5553772070626, |
| "grad_norm": 0.8708466291427612, |
| "learning_rate": 2.1857952865796616e-07, |
| "loss": 0.0147, |
| "mean_token_accuracy": 0.9952942728996277, |
| "num_tokens": 12390585.0, |
| "step": 1421 |
| }, |
| { |
| "entropy": 0.7231708765029907, |
| "epoch": 4.558587479935794, |
| "grad_norm": 1.949539065361023, |
| "learning_rate": 2.1549066179097355e-07, |
| "loss": 0.0166, |
| "mean_token_accuracy": 0.9928002655506134, |
| "num_tokens": 12399388.0, |
| "step": 1422 |
| }, |
| { |
| "entropy": 0.8198031783103943, |
| "epoch": 4.561797752808989, |
| "grad_norm": 0.8939534425735474, |
| "learning_rate": 2.124232950290367e-07, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9933803975582123, |
| "num_tokens": 12407701.0, |
| "step": 1423 |
| }, |
| { |
| "entropy": 0.7757489085197449, |
| "epoch": 4.565008025682183, |
| "grad_norm": 0.7762877345085144, |
| "learning_rate": 2.0937744215595467e-07, |
| "loss": 0.0239, |
| "mean_token_accuracy": 0.981669157743454, |
| "num_tokens": 12417604.0, |
| "step": 1424 |
| }, |
| { |
| "entropy": 0.7322670519351959, |
| "epoch": 4.568218298555378, |
| "grad_norm": 0.7595078349113464, |
| "learning_rate": 2.0635311685884675e-07, |
| "loss": 0.0226, |
| "mean_token_accuracy": 0.9879952669143677, |
| "num_tokens": 12427171.0, |
| "step": 1425 |
| }, |
| { |
| "entropy": 0.9202703237533569, |
| "epoch": 4.571428571428571, |
| "grad_norm": 1.0135276317596436, |
| "learning_rate": 2.0335033272809612e-07, |
| "loss": 0.0141, |
| "mean_token_accuracy": 0.995720237493515, |
| "num_tokens": 12436454.0, |
| "step": 1426 |
| }, |
| { |
| "entropy": 0.6941528022289276, |
| "epoch": 4.574638844301766, |
| "grad_norm": 1.0334597826004028, |
| "learning_rate": 2.0036910325728521e-07, |
| "loss": 0.0129, |
| "mean_token_accuracy": 0.9945822060108185, |
| "num_tokens": 12444854.0, |
| "step": 1427 |
| }, |
| { |
| "entropy": 0.7655043005943298, |
| "epoch": 4.57784911717496, |
| "grad_norm": 1.1090309619903564, |
| "learning_rate": 1.9740944184313882e-07, |
| "loss": 0.0147, |
| "mean_token_accuracy": 0.993369847536087, |
| "num_tokens": 12453038.0, |
| "step": 1428 |
| }, |
| { |
| "entropy": 0.8777413368225098, |
| "epoch": 4.581059390048154, |
| "grad_norm": 1.6091341972351074, |
| "learning_rate": 1.9447136178545766e-07, |
| "loss": 0.0162, |
| "mean_token_accuracy": 0.995211273431778, |
| "num_tokens": 12460768.0, |
| "step": 1429 |
| }, |
| { |
| "entropy": 0.834505558013916, |
| "epoch": 4.584269662921348, |
| "grad_norm": 0.9753161668777466, |
| "learning_rate": 1.9155487628706672e-07, |
| "loss": 0.019, |
| "mean_token_accuracy": 0.9906420409679413, |
| "num_tokens": 12470811.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.8439209461212158, |
| "epoch": 4.587479935794542, |
| "grad_norm": 0.7517779469490051, |
| "learning_rate": 1.8865999845374794e-07, |
| "loss": 0.0156, |
| "mean_token_accuracy": 0.9933716654777527, |
| "num_tokens": 12479429.0, |
| "step": 1431 |
| }, |
| { |
| "entropy": 0.7574312388896942, |
| "epoch": 4.590690208667737, |
| "grad_norm": 1.003862738609314, |
| "learning_rate": 1.857867412941883e-07, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9921233355998993, |
| "num_tokens": 12487836.0, |
| "step": 1432 |
| }, |
| { |
| "entropy": 0.7914498746395111, |
| "epoch": 4.593900481540931, |
| "grad_norm": 0.7261200547218323, |
| "learning_rate": 1.8293511771991624e-07, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.9958767890930176, |
| "num_tokens": 12496088.0, |
| "step": 1433 |
| }, |
| { |
| "entropy": 0.8081851303577423, |
| "epoch": 4.597110754414125, |
| "grad_norm": 1.6329476833343506, |
| "learning_rate": 1.8010514054524531e-07, |
| "loss": 0.0278, |
| "mean_token_accuracy": 0.9796946048736572, |
| "num_tokens": 12505783.0, |
| "step": 1434 |
| }, |
| { |
| "entropy": 0.8041447103023529, |
| "epoch": 4.600321027287319, |
| "grad_norm": 0.9152159094810486, |
| "learning_rate": 1.7729682248721848e-07, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.9919094741344452, |
| "num_tokens": 12515067.0, |
| "step": 1435 |
| }, |
| { |
| "entropy": 0.8927958905696869, |
| "epoch": 4.603531300160514, |
| "grad_norm": 1.4165750741958618, |
| "learning_rate": 1.7451017616554822e-07, |
| "loss": 0.0255, |
| "mean_token_accuracy": 0.9866718351840973, |
| "num_tokens": 12523827.0, |
| "step": 1436 |
| }, |
| { |
| "entropy": 0.8060368299484253, |
| "epoch": 4.606741573033708, |
| "grad_norm": 0.9053508639335632, |
| "learning_rate": 1.7174521410256162e-07, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.9934398829936981, |
| "num_tokens": 12532347.0, |
| "step": 1437 |
| }, |
| { |
| "entropy": 0.8163271546363831, |
| "epoch": 4.609951845906902, |
| "grad_norm": 1.3091577291488647, |
| "learning_rate": 1.69001948723142e-07, |
| "loss": 0.02, |
| "mean_token_accuracy": 0.9915553331375122, |
| "num_tokens": 12541394.0, |
| "step": 1438 |
| }, |
| { |
| "entropy": 0.7544900178909302, |
| "epoch": 4.613162118780096, |
| "grad_norm": 1.1534007787704468, |
| "learning_rate": 1.6628039235467686e-07, |
| "loss": 0.0214, |
| "mean_token_accuracy": 0.9864902794361115, |
| "num_tokens": 12551329.0, |
| "step": 1439 |
| }, |
| { |
| "entropy": 0.853958249092102, |
| "epoch": 4.61637239165329, |
| "grad_norm": 0.8594070672988892, |
| "learning_rate": 1.6358055722699662e-07, |
| "loss": 0.0111, |
| "mean_token_accuracy": 0.9959721565246582, |
| "num_tokens": 12558908.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.9293086230754852, |
| "epoch": 4.619582664526485, |
| "grad_norm": 0.8603348731994629, |
| "learning_rate": 1.6090245547232707e-07, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.9953678846359253, |
| "num_tokens": 12566798.0, |
| "step": 1441 |
| }, |
| { |
| "entropy": 0.9653850197792053, |
| "epoch": 4.622792937399679, |
| "grad_norm": 0.7823249101638794, |
| "learning_rate": 1.5824609912522825e-07, |
| "loss": 0.0189, |
| "mean_token_accuracy": 0.993864506483078, |
| "num_tokens": 12575250.0, |
| "step": 1442 |
| }, |
| { |
| "entropy": 0.8890847563743591, |
| "epoch": 4.626003210272874, |
| "grad_norm": 0.6177572011947632, |
| "learning_rate": 1.5561150012254446e-07, |
| "loss": 0.0113, |
| "mean_token_accuracy": 0.9965679347515106, |
| "num_tokens": 12583333.0, |
| "step": 1443 |
| }, |
| { |
| "entropy": 0.8079483807086945, |
| "epoch": 4.629213483146067, |
| "grad_norm": 1.4150201082229614, |
| "learning_rate": 1.5299867030334815e-07, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.9950767457485199, |
| "num_tokens": 12592056.0, |
| "step": 1444 |
| }, |
| { |
| "entropy": 0.8464027941226959, |
| "epoch": 4.632423756019262, |
| "grad_norm": 4.529459476470947, |
| "learning_rate": 1.5040762140888843e-07, |
| "loss": 0.0148, |
| "mean_token_accuracy": 0.9947774410247803, |
| "num_tokens": 12600705.0, |
| "step": 1445 |
| }, |
| { |
| "entropy": 0.848853588104248, |
| "epoch": 4.635634028892456, |
| "grad_norm": 2.467555284500122, |
| "learning_rate": 1.4783836508253823e-07, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9937762022018433, |
| "num_tokens": 12609927.0, |
| "step": 1446 |
| }, |
| { |
| "entropy": 0.7718561589717865, |
| "epoch": 4.63884430176565, |
| "grad_norm": 3.8097293376922607, |
| "learning_rate": 1.4529091286973994e-07, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.9952149987220764, |
| "num_tokens": 12618947.0, |
| "step": 1447 |
| }, |
| { |
| "entropy": 0.9486541748046875, |
| "epoch": 4.642054574638844, |
| "grad_norm": 0.7113092541694641, |
| "learning_rate": 1.4276527621795655e-07, |
| "loss": 0.0116, |
| "mean_token_accuracy": 0.9962652921676636, |
| "num_tokens": 12627476.0, |
| "step": 1448 |
| }, |
| { |
| "entropy": 0.8682657778263092, |
| "epoch": 4.645264847512038, |
| "grad_norm": 0.6125422716140747, |
| "learning_rate": 1.402614664766172e-07, |
| "loss": 0.0118, |
| "mean_token_accuracy": 0.9960006773471832, |
| "num_tokens": 12635027.0, |
| "step": 1449 |
| }, |
| { |
| "entropy": 0.9030327200889587, |
| "epoch": 4.648475120385233, |
| "grad_norm": 1.8995373249053955, |
| "learning_rate": 1.3777949489706898e-07, |
| "loss": 0.0214, |
| "mean_token_accuracy": 0.991911381483078, |
| "num_tokens": 12644120.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.9397462904453278, |
| "epoch": 4.651685393258427, |
| "grad_norm": 0.7002797722816467, |
| "learning_rate": 1.353193726325247e-07, |
| "loss": 0.0123, |
| "mean_token_accuracy": 0.9955561757087708, |
| "num_tokens": 12653499.0, |
| "step": 1451 |
| }, |
| { |
| "entropy": 0.8843958079814911, |
| "epoch": 4.654895666131621, |
| "grad_norm": 1.1001935005187988, |
| "learning_rate": 1.3288111073801235e-07, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9950455129146576, |
| "num_tokens": 12662275.0, |
| "step": 1452 |
| }, |
| { |
| "entropy": 0.8470652103424072, |
| "epoch": 4.658105939004815, |
| "grad_norm": 1.5061019659042358, |
| "learning_rate": 1.3046472017032685e-07, |
| "loss": 0.018, |
| "mean_token_accuracy": 0.994785338640213, |
| "num_tokens": 12670692.0, |
| "step": 1453 |
| }, |
| { |
| "entropy": 0.8604851365089417, |
| "epoch": 4.66131621187801, |
| "grad_norm": 0.8102709650993347, |
| "learning_rate": 1.280702117879795e-07, |
| "loss": 0.0205, |
| "mean_token_accuracy": 0.99430912733078, |
| "num_tokens": 12679153.0, |
| "step": 1454 |
| }, |
| { |
| "entropy": 0.8193712532520294, |
| "epoch": 4.664526484751204, |
| "grad_norm": 1.2029215097427368, |
| "learning_rate": 1.2569759635115086e-07, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.9853865206241608, |
| "num_tokens": 12688175.0, |
| "step": 1455 |
| }, |
| { |
| "entropy": 0.792403519153595, |
| "epoch": 4.667736757624398, |
| "grad_norm": 2.498199462890625, |
| "learning_rate": 1.2334688452164122e-07, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9941717386245728, |
| "num_tokens": 12696973.0, |
| "step": 1456 |
| }, |
| { |
| "entropy": 0.6442500352859497, |
| "epoch": 4.670947030497592, |
| "grad_norm": 1.1447046995162964, |
| "learning_rate": 1.210180868628219e-07, |
| "loss": 0.0318, |
| "mean_token_accuracy": 0.9858497381210327, |
| "num_tokens": 12707211.0, |
| "step": 1457 |
| }, |
| { |
| "entropy": 0.8001232743263245, |
| "epoch": 4.674157303370786, |
| "grad_norm": 0.8121841549873352, |
| "learning_rate": 1.1871121383958961e-07, |
| "loss": 0.0358, |
| "mean_token_accuracy": 0.9611911177635193, |
| "num_tokens": 12717263.0, |
| "step": 1458 |
| }, |
| { |
| "entropy": 1.0088177621364594, |
| "epoch": 4.677367576243981, |
| "grad_norm": 0.7857615351676941, |
| "learning_rate": 1.1642627581831767e-07, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.9962861239910126, |
| "num_tokens": 12727899.0, |
| "step": 1459 |
| }, |
| { |
| "entropy": 0.7727092504501343, |
| "epoch": 4.680577849117175, |
| "grad_norm": 1.3959094285964966, |
| "learning_rate": 1.1416328306681046e-07, |
| "loss": 0.026, |
| "mean_token_accuracy": 0.9916978478431702, |
| "num_tokens": 12736448.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.9188545346260071, |
| "epoch": 4.68378812199037, |
| "grad_norm": 1.0266892910003662, |
| "learning_rate": 1.1192224575425848e-07, |
| "loss": 0.0156, |
| "mean_token_accuracy": 0.9939980804920197, |
| "num_tokens": 12744974.0, |
| "step": 1461 |
| }, |
| { |
| "entropy": 0.7683831751346588, |
| "epoch": 4.686998394863563, |
| "grad_norm": 1.3337358236312866, |
| "learning_rate": 1.0970317395119001e-07, |
| "loss": 0.0275, |
| "mean_token_accuracy": 0.9839833378791809, |
| "num_tokens": 12755044.0, |
| "step": 1462 |
| }, |
| { |
| "entropy": 1.0480545163154602, |
| "epoch": 4.690208667736758, |
| "grad_norm": 1.289138674736023, |
| "learning_rate": 1.0750607762942622e-07, |
| "loss": 0.0135, |
| "mean_token_accuracy": 0.9952871203422546, |
| "num_tokens": 12764198.0, |
| "step": 1463 |
| }, |
| { |
| "entropy": 0.8789570927619934, |
| "epoch": 4.693418940609952, |
| "grad_norm": 3.7718007564544678, |
| "learning_rate": 1.0533096666203946e-07, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9947271049022675, |
| "num_tokens": 12772123.0, |
| "step": 1464 |
| }, |
| { |
| "entropy": 0.8715825378894806, |
| "epoch": 4.696629213483146, |
| "grad_norm": 0.849036693572998, |
| "learning_rate": 1.0317785082330555e-07, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9959846436977386, |
| "num_tokens": 12781152.0, |
| "step": 1465 |
| }, |
| { |
| "entropy": 0.8796096742153168, |
| "epoch": 4.69983948635634, |
| "grad_norm": 1.3320071697235107, |
| "learning_rate": 1.0104673978866164e-07, |
| "loss": 0.0209, |
| "mean_token_accuracy": 0.9909534156322479, |
| "num_tokens": 12790728.0, |
| "step": 1466 |
| }, |
| { |
| "entropy": 0.8747296929359436, |
| "epoch": 4.703049759229534, |
| "grad_norm": 1.0539300441741943, |
| "learning_rate": 9.89376431346606e-08, |
| "loss": 0.0173, |
| "mean_token_accuracy": 0.9945295751094818, |
| "num_tokens": 12799861.0, |
| "step": 1467 |
| }, |
| { |
| "entropy": 0.7705346345901489, |
| "epoch": 4.706260032102729, |
| "grad_norm": 1.595797061920166, |
| "learning_rate": 9.685057033892998e-08, |
| "loss": 0.0153, |
| "mean_token_accuracy": 0.9951062500476837, |
| "num_tokens": 12807796.0, |
| "step": 1468 |
| }, |
| { |
| "entropy": 0.8849585354328156, |
| "epoch": 4.709470304975923, |
| "grad_norm": 0.6699755787849426, |
| "learning_rate": 9.478553078013042e-08, |
| "loss": 0.014, |
| "mean_token_accuracy": 0.9962992668151855, |
| "num_tokens": 12816289.0, |
| "step": 1469 |
| }, |
| { |
| "entropy": 0.803180068731308, |
| "epoch": 4.712680577849117, |
| "grad_norm": 1.6749389171600342, |
| "learning_rate": 9.274253373791064e-08, |
| "loss": 0.0215, |
| "mean_token_accuracy": 0.9928462505340576, |
| "num_tokens": 12825105.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.8213406801223755, |
| "epoch": 4.715890850722311, |
| "grad_norm": 1.0112464427947998, |
| "learning_rate": 9.072158839286748e-08, |
| "loss": 0.0248, |
| "mean_token_accuracy": 0.9840686023235321, |
| "num_tokens": 12834541.0, |
| "step": 1471 |
| }, |
| { |
| "entropy": 0.7910043299198151, |
| "epoch": 4.719101123595506, |
| "grad_norm": 0.8969994187355042, |
| "learning_rate": 8.872270382650372e-08, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9944667518138885, |
| "num_tokens": 12843003.0, |
| "step": 1472 |
| }, |
| { |
| "entropy": 0.8581178486347198, |
| "epoch": 4.7223113964687, |
| "grad_norm": 0.8233668208122253, |
| "learning_rate": 8.674588902118919e-08, |
| "loss": 0.0206, |
| "mean_token_accuracy": 0.9897873401641846, |
| "num_tokens": 12852903.0, |
| "step": 1473 |
| }, |
| { |
| "entropy": 0.8728442490100861, |
| "epoch": 4.725521669341894, |
| "grad_norm": 0.8017443418502808, |
| "learning_rate": 8.479115286011752e-08, |
| "loss": 0.012, |
| "mean_token_accuracy": 0.995659202337265, |
| "num_tokens": 12862161.0, |
| "step": 1474 |
| }, |
| { |
| "entropy": 0.8244474232196808, |
| "epoch": 4.728731942215088, |
| "grad_norm": 1.4547849893569946, |
| "learning_rate": 8.285850412726837e-08, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.9953339695930481, |
| "num_tokens": 12870310.0, |
| "step": 1475 |
| }, |
| { |
| "entropy": 0.8828493654727936, |
| "epoch": 4.731942215088282, |
| "grad_norm": 1.2691254615783691, |
| "learning_rate": 8.094795150736745e-08, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.9957073032855988, |
| "num_tokens": 12878220.0, |
| "step": 1476 |
| }, |
| { |
| "entropy": 0.8865284621715546, |
| "epoch": 4.735152487961477, |
| "grad_norm": 0.8567522764205933, |
| "learning_rate": 7.905950358584768e-08, |
| "loss": 0.0198, |
| "mean_token_accuracy": 0.9906137585639954, |
| "num_tokens": 12886608.0, |
| "step": 1477 |
| }, |
| { |
| "entropy": 0.8262231051921844, |
| "epoch": 4.738362760834671, |
| "grad_norm": 1.0661060810089111, |
| "learning_rate": 7.719316884880922e-08, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9928100407123566, |
| "num_tokens": 12895485.0, |
| "step": 1478 |
| }, |
| { |
| "entropy": 0.7830840349197388, |
| "epoch": 4.741573033707866, |
| "grad_norm": 1.305324912071228, |
| "learning_rate": 7.534895568298395e-08, |
| "loss": 0.0164, |
| "mean_token_accuracy": 0.9932570457458496, |
| "num_tokens": 12903598.0, |
| "step": 1479 |
| }, |
| { |
| "entropy": 0.8163612484931946, |
| "epoch": 4.744783306581059, |
| "grad_norm": 0.6662154197692871, |
| "learning_rate": 7.352687237569489e-08, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.9972147941589355, |
| "num_tokens": 12911756.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.9148772060871124, |
| "epoch": 4.747993579454254, |
| "grad_norm": 0.9864494800567627, |
| "learning_rate": 7.172692711482022e-08, |
| "loss": 0.0235, |
| "mean_token_accuracy": 0.9813115894794464, |
| "num_tokens": 12920570.0, |
| "step": 1481 |
| }, |
| { |
| "entropy": 0.9331631064414978, |
| "epoch": 4.751203852327448, |
| "grad_norm": 0.6732865571975708, |
| "learning_rate": 6.994912798875875e-08, |
| "loss": 0.0111, |
| "mean_token_accuracy": 0.9955635666847229, |
| "num_tokens": 12928073.0, |
| "step": 1482 |
| }, |
| { |
| "entropy": 0.9015617370605469, |
| "epoch": 4.754414125200642, |
| "grad_norm": 1.075960636138916, |
| "learning_rate": 6.819348298638839e-08, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9937729239463806, |
| "num_tokens": 12936594.0, |
| "step": 1483 |
| }, |
| { |
| "entropy": 1.0504088997840881, |
| "epoch": 4.757624398073836, |
| "grad_norm": 0.8205307722091675, |
| "learning_rate": 6.6459999997035e-08, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.9749292135238647, |
| "num_tokens": 12948928.0, |
| "step": 1484 |
| }, |
| { |
| "entropy": 0.8226450979709625, |
| "epoch": 4.76083467094703, |
| "grad_norm": 0.8952516317367554, |
| "learning_rate": 6.474868681043578e-08, |
| "loss": 0.0205, |
| "mean_token_accuracy": 0.9924160242080688, |
| "num_tokens": 12957574.0, |
| "step": 1485 |
| }, |
| { |
| "entropy": 0.8268488645553589, |
| "epoch": 4.764044943820225, |
| "grad_norm": 2.353219747543335, |
| "learning_rate": 6.305955111670204e-08, |
| "loss": 0.0216, |
| "mean_token_accuracy": 0.9924924671649933, |
| "num_tokens": 12967174.0, |
| "step": 1486 |
| }, |
| { |
| "entropy": 1.0455307960510254, |
| "epoch": 4.767255216693419, |
| "grad_norm": 1.843535304069519, |
| "learning_rate": 6.13926005062876e-08, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.9977114200592041, |
| "num_tokens": 12976414.0, |
| "step": 1487 |
| }, |
| { |
| "entropy": 0.9094734787940979, |
| "epoch": 4.770465489566613, |
| "grad_norm": 0.8578094244003296, |
| "learning_rate": 5.974784246995214e-08, |
| "loss": 0.0146, |
| "mean_token_accuracy": 0.9947217106819153, |
| "num_tokens": 12984704.0, |
| "step": 1488 |
| }, |
| { |
| "entropy": 0.8561606407165527, |
| "epoch": 4.773675762439807, |
| "grad_norm": 1.4484977722167969, |
| "learning_rate": 5.8125284398730666e-08, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.990572988986969, |
| "num_tokens": 12993088.0, |
| "step": 1489 |
| }, |
| { |
| "entropy": 0.8771012425422668, |
| "epoch": 4.776886035313002, |
| "grad_norm": 0.8078578114509583, |
| "learning_rate": 5.6524933583896326e-08, |
| "loss": 0.0267, |
| "mean_token_accuracy": 0.9761388897895813, |
| "num_tokens": 13003109.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 1.0275262296199799, |
| "epoch": 4.780096308186196, |
| "grad_norm": 2.095111846923828, |
| "learning_rate": 5.4946797216931524e-08, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9961507022380829, |
| "num_tokens": 13011066.0, |
| "step": 1491 |
| }, |
| { |
| "entropy": 0.8347211182117462, |
| "epoch": 4.78330658105939, |
| "grad_norm": 1.8958369493484497, |
| "learning_rate": 5.339088238949186e-08, |
| "loss": 0.0245, |
| "mean_token_accuracy": 0.9858895838260651, |
| "num_tokens": 13019563.0, |
| "step": 1492 |
| }, |
| { |
| "entropy": 0.6933950185775757, |
| "epoch": 4.786516853932584, |
| "grad_norm": 1.2030583620071411, |
| "learning_rate": 5.185719609337836e-08, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9926663935184479, |
| "num_tokens": 13028661.0, |
| "step": 1493 |
| }, |
| { |
| "entropy": 0.8557841181755066, |
| "epoch": 4.789727126805778, |
| "grad_norm": 1.0296180248260498, |
| "learning_rate": 5.034574522050251e-08, |
| "loss": 0.0129, |
| "mean_token_accuracy": 0.9967096447944641, |
| "num_tokens": 13036535.0, |
| "step": 1494 |
| }, |
| { |
| "entropy": 0.740024745464325, |
| "epoch": 4.792937399678973, |
| "grad_norm": 0.9318440556526184, |
| "learning_rate": 4.885653656285627e-08, |
| "loss": 0.0195, |
| "mean_token_accuracy": 0.9905839264392853, |
| "num_tokens": 13045182.0, |
| "step": 1495 |
| }, |
| { |
| "entropy": 0.7861917316913605, |
| "epoch": 4.796147672552167, |
| "grad_norm": 0.8870951533317566, |
| "learning_rate": 4.73895768124838e-08, |
| "loss": 0.0134, |
| "mean_token_accuracy": 0.9954321086406708, |
| "num_tokens": 13054371.0, |
| "step": 1496 |
| }, |
| { |
| "entropy": 0.8475262522697449, |
| "epoch": 4.799357945425362, |
| "grad_norm": 3.0993971824645996, |
| "learning_rate": 4.5944872561448084e-08, |
| "loss": 0.0143, |
| "mean_token_accuracy": 0.9950073659420013, |
| "num_tokens": 13062352.0, |
| "step": 1497 |
| }, |
| { |
| "entropy": 0.9056753218173981, |
| "epoch": 4.802568218298555, |
| "grad_norm": 0.47460415959358215, |
| "learning_rate": 4.45224303018027e-08, |
| "loss": 0.0102, |
| "mean_token_accuracy": 0.9964375793933868, |
| "num_tokens": 13070029.0, |
| "step": 1498 |
| }, |
| { |
| "entropy": 0.8055447340011597, |
| "epoch": 4.80577849117175, |
| "grad_norm": 1.6484469175338745, |
| "learning_rate": 4.3122256425563444e-08, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9926461279392242, |
| "num_tokens": 13078374.0, |
| "step": 1499 |
| }, |
| { |
| "entropy": 0.8468793332576752, |
| "epoch": 4.808988764044944, |
| "grad_norm": 1.0291019678115845, |
| "learning_rate": 4.174435722467951e-08, |
| "loss": 0.0125, |
| "mean_token_accuracy": 0.99583500623703, |
| "num_tokens": 13086183.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.9175726771354675, |
| "epoch": 4.8121990369181376, |
| "grad_norm": 0.8731938004493713, |
| "learning_rate": 4.038873889100237e-08, |
| "loss": 0.0103, |
| "mean_token_accuracy": 0.9962366223335266, |
| "num_tokens": 13095228.0, |
| "step": 1501 |
| }, |
| { |
| "entropy": 0.8306655585765839, |
| "epoch": 4.815409309791332, |
| "grad_norm": 0.9269821643829346, |
| "learning_rate": 3.905540751626191e-08, |
| "loss": 0.0154, |
| "mean_token_accuracy": 0.9948339760303497, |
| "num_tokens": 13104282.0, |
| "step": 1502 |
| }, |
| { |
| "entropy": 0.8430630564689636, |
| "epoch": 4.818619582664526, |
| "grad_norm": 1.019399642944336, |
| "learning_rate": 3.77443690920376e-08, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.9845989346504211, |
| "num_tokens": 13113668.0, |
| "step": 1503 |
| }, |
| { |
| "entropy": 0.8044372200965881, |
| "epoch": 4.821829855537721, |
| "grad_norm": 0.831784188747406, |
| "learning_rate": 3.645562950973014e-08, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.9945938289165497, |
| "num_tokens": 13122339.0, |
| "step": 1504 |
| }, |
| { |
| "entropy": 0.813548743724823, |
| "epoch": 4.825040128410915, |
| "grad_norm": 0.8253054022789001, |
| "learning_rate": 3.518919456053649e-08, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.9954196214675903, |
| "num_tokens": 13130760.0, |
| "step": 1505 |
| }, |
| { |
| "entropy": 0.8437825441360474, |
| "epoch": 4.828250401284109, |
| "grad_norm": 1.0685738325119019, |
| "learning_rate": 3.3945069935423234e-08, |
| "loss": 0.0126, |
| "mean_token_accuracy": 0.9963714182376862, |
| "num_tokens": 13139593.0, |
| "step": 1506 |
| }, |
| { |
| "entropy": 0.8084733188152313, |
| "epoch": 4.831460674157303, |
| "grad_norm": 0.9814333915710449, |
| "learning_rate": 3.2723261225102164e-08, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9951196610927582, |
| "num_tokens": 13147734.0, |
| "step": 1507 |
| }, |
| { |
| "entropy": 0.9518248438835144, |
| "epoch": 4.834670947030498, |
| "grad_norm": 0.9631071090698242, |
| "learning_rate": 3.152377392000361e-08, |
| "loss": 0.0138, |
| "mean_token_accuracy": 0.9940395951271057, |
| "num_tokens": 13156608.0, |
| "step": 1508 |
| }, |
| { |
| "entropy": 0.8305748105049133, |
| "epoch": 4.837881219903692, |
| "grad_norm": 0.647260308265686, |
| "learning_rate": 3.034661341025258e-08, |
| "loss": 0.0106, |
| "mean_token_accuracy": 0.9974878430366516, |
| "num_tokens": 13164500.0, |
| "step": 1509 |
| }, |
| { |
| "entropy": 0.93598473072052, |
| "epoch": 4.841091492776886, |
| "grad_norm": 0.5800625681877136, |
| "learning_rate": 2.9191784985644345e-08, |
| "loss": 0.0105, |
| "mean_token_accuracy": 0.995660126209259, |
| "num_tokens": 13173445.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.9177517294883728, |
| "epoch": 4.84430176565008, |
| "grad_norm": 0.9533823132514954, |
| "learning_rate": 2.8059293835620006e-08, |
| "loss": 0.0126, |
| "mean_token_accuracy": 0.9962038695812225, |
| "num_tokens": 13181508.0, |
| "step": 1511 |
| }, |
| { |
| "entropy": 0.8395648300647736, |
| "epoch": 4.847512038523274, |
| "grad_norm": 0.8162403106689453, |
| "learning_rate": 2.6949145049245396e-08, |
| "loss": 0.0135, |
| "mean_token_accuracy": 0.9970743358135223, |
| "num_tokens": 13189174.0, |
| "step": 1512 |
| }, |
| { |
| "entropy": 0.8557933866977692, |
| "epoch": 4.850722311396469, |
| "grad_norm": 1.0253946781158447, |
| "learning_rate": 2.5861343615184997e-08, |
| "loss": 0.0323, |
| "mean_token_accuracy": 0.9815960228443146, |
| "num_tokens": 13198056.0, |
| "step": 1513 |
| }, |
| { |
| "entropy": 0.8704625368118286, |
| "epoch": 4.853932584269663, |
| "grad_norm": 0.6910727620124817, |
| "learning_rate": 2.479589442168251e-08, |
| "loss": 0.0122, |
| "mean_token_accuracy": 0.9952109158039093, |
| "num_tokens": 13206774.0, |
| "step": 1514 |
| }, |
| { |
| "entropy": 1.0808364748954773, |
| "epoch": 4.857142857142857, |
| "grad_norm": 0.9426855444908142, |
| "learning_rate": 2.3752802256536423e-08, |
| "loss": 0.0146, |
| "mean_token_accuracy": 0.9937877953052521, |
| "num_tokens": 13215645.0, |
| "step": 1515 |
| }, |
| { |
| "entropy": 0.7950730621814728, |
| "epoch": 4.860353130016051, |
| "grad_norm": 0.948599100112915, |
| "learning_rate": 2.2732071807081147e-08, |
| "loss": 0.0361, |
| "mean_token_accuracy": 0.9701245427131653, |
| "num_tokens": 13225286.0, |
| "step": 1516 |
| }, |
| { |
| "entropy": 0.7923941910266876, |
| "epoch": 4.863563402889246, |
| "grad_norm": 0.886232316493988, |
| "learning_rate": 2.173370766016314e-08, |
| "loss": 0.0162, |
| "mean_token_accuracy": 0.9943108260631561, |
| "num_tokens": 13234636.0, |
| "step": 1517 |
| }, |
| { |
| "entropy": 0.8961697816848755, |
| "epoch": 4.86677367576244, |
| "grad_norm": 0.49067121744155884, |
| "learning_rate": 2.0757714302122035e-08, |
| "loss": 0.0092, |
| "mean_token_accuracy": 0.9976347088813782, |
| "num_tokens": 13243074.0, |
| "step": 1518 |
| }, |
| { |
| "entropy": 0.8559750616550446, |
| "epoch": 4.8699839486356336, |
| "grad_norm": 0.9737809896469116, |
| "learning_rate": 1.98040961187701e-08, |
| "loss": 0.0275, |
| "mean_token_accuracy": 0.9791916608810425, |
| "num_tokens": 13252750.0, |
| "step": 1519 |
| }, |
| { |
| "entropy": 0.8368252515792847, |
| "epoch": 4.873194221508828, |
| "grad_norm": 1.5021547079086304, |
| "learning_rate": 1.8872857395372812e-08, |
| "loss": 0.0247, |
| "mean_token_accuracy": 0.9902662932872772, |
| "num_tokens": 13262034.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.8385051488876343, |
| "epoch": 4.876404494382022, |
| "grad_norm": 1.3577289581298828, |
| "learning_rate": 1.7964002316628316e-08, |
| "loss": 0.0259, |
| "mean_token_accuracy": 0.99179607629776, |
| "num_tokens": 13271283.0, |
| "step": 1521 |
| }, |
| { |
| "entropy": 0.8158581852912903, |
| "epoch": 4.879614767255217, |
| "grad_norm": 0.9924322366714478, |
| "learning_rate": 1.7077534966650767e-08, |
| "loss": 0.0154, |
| "mean_token_accuracy": 0.9940443634986877, |
| "num_tokens": 13280203.0, |
| "step": 1522 |
| }, |
| { |
| "entropy": 0.9179269969463348, |
| "epoch": 4.882825040128411, |
| "grad_norm": 0.9713982939720154, |
| "learning_rate": 1.6213459328950355e-08, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.9961579740047455, |
| "num_tokens": 13288465.0, |
| "step": 1523 |
| }, |
| { |
| "entropy": 0.7583066523075104, |
| "epoch": 4.886035313001605, |
| "grad_norm": 0.917400062084198, |
| "learning_rate": 1.537177928641498e-08, |
| "loss": 0.0187, |
| "mean_token_accuracy": 0.9913864731788635, |
| "num_tokens": 13297225.0, |
| "step": 1524 |
| }, |
| { |
| "entropy": 0.8535176515579224, |
| "epoch": 4.889245585874799, |
| "grad_norm": 0.8154090046882629, |
| "learning_rate": 1.4552498621295264e-08, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9920974969863892, |
| "num_tokens": 13305361.0, |
| "step": 1525 |
| }, |
| { |
| "entropy": 0.7557698488235474, |
| "epoch": 4.892455858747994, |
| "grad_norm": 0.8725979924201965, |
| "learning_rate": 1.3755621015184018e-08, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.9954307377338409, |
| "num_tokens": 13314453.0, |
| "step": 1526 |
| }, |
| { |
| "entropy": 0.8124125897884369, |
| "epoch": 4.895666131621188, |
| "grad_norm": 1.709907054901123, |
| "learning_rate": 1.2981150049004021e-08, |
| "loss": 0.0233, |
| "mean_token_accuracy": 0.9874438941478729, |
| "num_tokens": 13323863.0, |
| "step": 1527 |
| }, |
| { |
| "entropy": 0.8402214646339417, |
| "epoch": 4.898876404494382, |
| "grad_norm": 1.5540814399719238, |
| "learning_rate": 1.2229089202987487e-08, |
| "loss": 0.0232, |
| "mean_token_accuracy": 0.9862401783466339, |
| "num_tokens": 13332876.0, |
| "step": 1528 |
| }, |
| { |
| "entropy": 0.6956556737422943, |
| "epoch": 4.902086677367576, |
| "grad_norm": 1.3305988311767578, |
| "learning_rate": 1.1499441856663296e-08, |
| "loss": 0.0278, |
| "mean_token_accuracy": 0.9894447326660156, |
| "num_tokens": 13342874.0, |
| "step": 1529 |
| }, |
| { |
| "entropy": 0.7873148918151855, |
| "epoch": 4.90529695024077, |
| "grad_norm": 1.0279408693313599, |
| "learning_rate": 1.0792211288841447e-08, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.9951443374156952, |
| "num_tokens": 13351380.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.8480234742164612, |
| "epoch": 4.908507223113965, |
| "grad_norm": 4.671872138977051, |
| "learning_rate": 1.0107400677596413e-08, |
| "loss": 0.0135, |
| "mean_token_accuracy": 0.9976670742034912, |
| "num_tokens": 13359235.0, |
| "step": 1531 |
| }, |
| { |
| "entropy": 0.7645809650421143, |
| "epoch": 4.911717495987159, |
| "grad_norm": 0.9382192492485046, |
| "learning_rate": 9.44501310025603e-09, |
| "loss": 0.0293, |
| "mean_token_accuracy": 0.9814907908439636, |
| "num_tokens": 13368521.0, |
| "step": 1532 |
| }, |
| { |
| "entropy": 0.9574826657772064, |
| "epoch": 4.914927768860353, |
| "grad_norm": 2.1182193756103516, |
| "learning_rate": 8.805051533384846e-09, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.994848906993866, |
| "num_tokens": 13378012.0, |
| "step": 1533 |
| }, |
| { |
| "entropy": 0.8288282752037048, |
| "epoch": 4.918138041733547, |
| "grad_norm": 1.078923225402832, |
| "learning_rate": 8.187518852771914e-09, |
| "loss": 0.0236, |
| "mean_token_accuracy": 0.9885841608047485, |
| "num_tokens": 13387681.0, |
| "step": 1534 |
| }, |
| { |
| "entropy": 0.7388648986816406, |
| "epoch": 4.921348314606742, |
| "grad_norm": 1.584767460823059, |
| "learning_rate": 7.59241783341913e-09, |
| "loss": 0.0339, |
| "mean_token_accuracy": 0.9835653305053711, |
| "num_tokens": 13397378.0, |
| "step": 1535 |
| }, |
| { |
| "entropy": 0.8454699814319611, |
| "epoch": 4.924558587479936, |
| "grad_norm": 0.950717031955719, |
| "learning_rate": 7.019751149525133e-09, |
| "loss": 0.0138, |
| "mean_token_accuracy": 0.9948793649673462, |
| "num_tokens": 13405427.0, |
| "step": 1536 |
| }, |
| { |
| "entropy": 0.8609819412231445, |
| "epoch": 4.9277688603531296, |
| "grad_norm": 1.4333152770996094, |
| "learning_rate": 6.469521374477539e-09, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.9950776100158691, |
| "num_tokens": 13412745.0, |
| "step": 1537 |
| }, |
| { |
| "entropy": 0.9362488389015198, |
| "epoch": 4.930979133226324, |
| "grad_norm": 0.8439843654632568, |
| "learning_rate": 5.941730980839056e-09, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.9941385388374329, |
| "num_tokens": 13421064.0, |
| "step": 1538 |
| }, |
| { |
| "entropy": 0.9060971140861511, |
| "epoch": 4.934189406099518, |
| "grad_norm": 0.8039864301681519, |
| "learning_rate": 5.436382340335833e-09, |
| "loss": 0.0122, |
| "mean_token_accuracy": 0.9962558746337891, |
| "num_tokens": 13430385.0, |
| "step": 1539 |
| }, |
| { |
| "entropy": 0.8013357520103455, |
| "epoch": 4.937399678972713, |
| "grad_norm": 1.3938992023468018, |
| "learning_rate": 4.9534777238485764e-09, |
| "loss": 0.0081, |
| "mean_token_accuracy": 0.9980973601341248, |
| "num_tokens": 13438726.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.8251314759254456, |
| "epoch": 4.940609951845907, |
| "grad_norm": 0.7287578582763672, |
| "learning_rate": 4.493019301401447e-09, |
| "loss": 0.0228, |
| "mean_token_accuracy": 0.9862898588180542, |
| "num_tokens": 13446926.0, |
| "step": 1541 |
| }, |
| { |
| "entropy": 0.7694223821163177, |
| "epoch": 4.943820224719101, |
| "grad_norm": 1.3944227695465088, |
| "learning_rate": 4.055009142152066e-09, |
| "loss": 0.0306, |
| "mean_token_accuracy": 0.9875506162643433, |
| "num_tokens": 13455631.0, |
| "step": 1542 |
| }, |
| { |
| "entropy": 0.7734574675559998, |
| "epoch": 4.947030497592295, |
| "grad_norm": 1.3640856742858887, |
| "learning_rate": 3.6394492143820847e-09, |
| "loss": 0.0164, |
| "mean_token_accuracy": 0.992920994758606, |
| "num_tokens": 13463784.0, |
| "step": 1543 |
| }, |
| { |
| "entropy": 0.8477518856525421, |
| "epoch": 4.95024077046549, |
| "grad_norm": 1.2641561031341553, |
| "learning_rate": 3.2463413854899594e-09, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9660729467868805, |
| "num_tokens": 13473673.0, |
| "step": 1544 |
| }, |
| { |
| "entropy": 0.7746582925319672, |
| "epoch": 4.953451043338684, |
| "grad_norm": 1.149497389793396, |
| "learning_rate": 2.875687421980966e-09, |
| "loss": 0.0198, |
| "mean_token_accuracy": 0.9907875061035156, |
| "num_tokens": 13482323.0, |
| "step": 1545 |
| }, |
| { |
| "entropy": 0.8061198890209198, |
| "epoch": 4.956661316211878, |
| "grad_norm": 1.7826182842254639, |
| "learning_rate": 2.5274889894583156e-09, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9891678988933563, |
| "num_tokens": 13491215.0, |
| "step": 1546 |
| }, |
| { |
| "entropy": 0.7129835784435272, |
| "epoch": 4.959871589085072, |
| "grad_norm": 0.6761153936386108, |
| "learning_rate": 2.201747652618713e-09, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9959794282913208, |
| "num_tokens": 13500246.0, |
| "step": 1547 |
| }, |
| { |
| "entropy": 0.7953590452671051, |
| "epoch": 4.963081861958266, |
| "grad_norm": 0.9348256587982178, |
| "learning_rate": 1.8984648752429222e-09, |
| "loss": 0.0181, |
| "mean_token_accuracy": 0.9927394986152649, |
| "num_tokens": 13509290.0, |
| "step": 1548 |
| }, |
| { |
| "entropy": 0.7777528762817383, |
| "epoch": 4.966292134831461, |
| "grad_norm": 1.0118370056152344, |
| "learning_rate": 1.6176420201902132e-09, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9956157803535461, |
| "num_tokens": 13517919.0, |
| "step": 1549 |
| }, |
| { |
| "entropy": 0.7566848695278168, |
| "epoch": 4.969502407704655, |
| "grad_norm": 0.8841029405593872, |
| "learning_rate": 1.3592803493905904e-09, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9926736652851105, |
| "num_tokens": 13525933.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.7857078313827515, |
| "epoch": 4.972712680577849, |
| "grad_norm": 0.8450862169265747, |
| "learning_rate": 1.1233810238425735e-09, |
| "loss": 0.0135, |
| "mean_token_accuracy": 0.9971432387828827, |
| "num_tokens": 13533747.0, |
| "step": 1551 |
| }, |
| { |
| "entropy": 0.8908677697181702, |
| "epoch": 4.975922953451043, |
| "grad_norm": 0.5804739594459534, |
| "learning_rate": 9.099451036048701e-10, |
| "loss": 0.0088, |
| "mean_token_accuracy": 0.9979284405708313, |
| "num_tokens": 13541753.0, |
| "step": 1552 |
| }, |
| { |
| "entropy": 0.8221500515937805, |
| "epoch": 4.979133226324238, |
| "grad_norm": 1.1342151165008545, |
| "learning_rate": 7.189735477913795e-10, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9946431219577789, |
| "num_tokens": 13549463.0, |
| "step": 1553 |
| }, |
| { |
| "entropy": 0.8285010457038879, |
| "epoch": 4.982343499197432, |
| "grad_norm": 0.6920357942581177, |
| "learning_rate": 5.504672145700829e-10, |
| "loss": 0.0111, |
| "mean_token_accuracy": 0.9967454969882965, |
| "num_tokens": 13557770.0, |
| "step": 1554 |
| }, |
| { |
| "entropy": 0.9648899137973785, |
| "epoch": 4.9855537720706256, |
| "grad_norm": 1.078553318977356, |
| "learning_rate": 4.0442686115582665e-10, |
| "loss": 0.0209, |
| "mean_token_accuracy": 0.9916457831859589, |
| "num_tokens": 13567441.0, |
| "step": 1555 |
| }, |
| { |
| "entropy": 0.8371059596538544, |
| "epoch": 4.98876404494382, |
| "grad_norm": 0.7630732655525208, |
| "learning_rate": 2.8085314380976725e-10, |
| "loss": 0.0127, |
| "mean_token_accuracy": 0.9956499338150024, |
| "num_tokens": 13575810.0, |
| "step": 1556 |
| }, |
| { |
| "entropy": 0.7578130662441254, |
| "epoch": 4.991974317817014, |
| "grad_norm": 0.8679744005203247, |
| "learning_rate": 1.797466178327101e-10, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9955244362354279, |
| "num_tokens": 13583801.0, |
| "step": 1557 |
| }, |
| { |
| "entropy": 0.8360298871994019, |
| "epoch": 4.995184590690209, |
| "grad_norm": 0.9718247652053833, |
| "learning_rate": 1.011077375662195e-10, |
| "loss": 0.0151, |
| "mean_token_accuracy": 0.9935263097286224, |
| "num_tokens": 13592100.0, |
| "step": 1558 |
| }, |
| { |
| "entropy": 0.870440274477005, |
| "epoch": 4.998394863563403, |
| "grad_norm": 1.1290241479873657, |
| "learning_rate": 4.4936856390398465e-11, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9939267933368683, |
| "num_tokens": 13600502.0, |
| "step": 1559 |
| }, |
| { |
| "entropy": 0.7823728322982788, |
| "epoch": 5.0, |
| "grad_norm": 1.0314245223999023, |
| "learning_rate": 1.1234226718337405e-11, |
| "loss": 0.0135, |
| "mean_token_accuracy": 0.9929947257041931, |
| "num_tokens": 13605710.0, |
| "step": 1560 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 1560, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.906663239306445e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|