Safetensors
English
qwen2_5_vl
MedVLThinker-7B-SFT_5K / trainer_state.json
NingsenWang's picture
Upload MedVLThinker-7B-SFT_5K model (2025-10-22)
a2209f2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.7291141748428345,
"epoch": 0.0032102728731942215,
"grad_norm": 16.57156753540039,
"learning_rate": 0.0,
"loss": 1.4779,
"mean_token_accuracy": 0.6312866508960724,
"num_tokens": 7755.0,
"step": 1
},
{
"entropy": 1.6665578484535217,
"epoch": 0.006420545746388443,
"grad_norm": 18.545442581176758,
"learning_rate": 1.282051282051282e-07,
"loss": 1.4834,
"mean_token_accuracy": 0.6333149969577789,
"num_tokens": 16049.0,
"step": 2
},
{
"entropy": 1.6309752464294434,
"epoch": 0.009630818619582664,
"grad_norm": 20.336366653442383,
"learning_rate": 2.564102564102564e-07,
"loss": 1.5418,
"mean_token_accuracy": 0.6078976392745972,
"num_tokens": 23512.0,
"step": 3
},
{
"entropy": 1.7980514764785767,
"epoch": 0.012841091492776886,
"grad_norm": 17.363161087036133,
"learning_rate": 3.846153846153847e-07,
"loss": 1.4917,
"mean_token_accuracy": 0.6385786831378937,
"num_tokens": 32196.0,
"step": 4
},
{
"entropy": 1.6440390348434448,
"epoch": 0.016051364365971106,
"grad_norm": 21.76154899597168,
"learning_rate": 5.128205128205128e-07,
"loss": 1.4381,
"mean_token_accuracy": 0.6364026069641113,
"num_tokens": 41612.0,
"step": 5
},
{
"entropy": 1.811388909816742,
"epoch": 0.019261637239165328,
"grad_norm": 16.966796875,
"learning_rate": 6.41025641025641e-07,
"loss": 1.4449,
"mean_token_accuracy": 0.636858195066452,
"num_tokens": 49568.0,
"step": 6
},
{
"entropy": 1.7483888268470764,
"epoch": 0.02247191011235955,
"grad_norm": 16.943206787109375,
"learning_rate": 7.692307692307694e-07,
"loss": 1.478,
"mean_token_accuracy": 0.6292121112346649,
"num_tokens": 58061.0,
"step": 7
},
{
"entropy": 1.723523497581482,
"epoch": 0.025682182985553772,
"grad_norm": 16.687475204467773,
"learning_rate": 8.974358974358975e-07,
"loss": 1.4675,
"mean_token_accuracy": 0.6215986013412476,
"num_tokens": 66009.0,
"step": 8
},
{
"entropy": 1.713658332824707,
"epoch": 0.028892455858747994,
"grad_norm": 15.334748268127441,
"learning_rate": 1.0256410256410257e-06,
"loss": 1.5391,
"mean_token_accuracy": 0.6208073198795319,
"num_tokens": 74373.0,
"step": 9
},
{
"entropy": 1.8282572031021118,
"epoch": 0.03210272873194221,
"grad_norm": 14.283801078796387,
"learning_rate": 1.153846153846154e-06,
"loss": 1.3535,
"mean_token_accuracy": 0.6430239975452423,
"num_tokens": 82933.0,
"step": 10
},
{
"entropy": 1.7159126996994019,
"epoch": 0.03531300160513644,
"grad_norm": 13.662557601928711,
"learning_rate": 1.282051282051282e-06,
"loss": 1.3138,
"mean_token_accuracy": 0.6514350175857544,
"num_tokens": 91580.0,
"step": 11
},
{
"entropy": 1.8093950748443604,
"epoch": 0.038523274478330656,
"grad_norm": 13.19852066040039,
"learning_rate": 1.4102564102564104e-06,
"loss": 1.373,
"mean_token_accuracy": 0.6353383362293243,
"num_tokens": 99139.0,
"step": 12
},
{
"entropy": 1.7789645195007324,
"epoch": 0.04173354735152488,
"grad_norm": 16.865962982177734,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.2458,
"mean_token_accuracy": 0.6800651550292969,
"num_tokens": 106948.0,
"step": 13
},
{
"entropy": 1.7515007853507996,
"epoch": 0.0449438202247191,
"grad_norm": 10.389936447143555,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.2552,
"mean_token_accuracy": 0.6667510867118835,
"num_tokens": 114925.0,
"step": 14
},
{
"entropy": 1.7482985258102417,
"epoch": 0.048154093097913325,
"grad_norm": 8.63404655456543,
"learning_rate": 1.794871794871795e-06,
"loss": 1.1545,
"mean_token_accuracy": 0.6908352673053741,
"num_tokens": 123854.0,
"step": 15
},
{
"entropy": 1.7924981117248535,
"epoch": 0.051364365971107544,
"grad_norm": 10.083547592163086,
"learning_rate": 1.9230769230769234e-06,
"loss": 1.2271,
"mean_token_accuracy": 0.6758890450000763,
"num_tokens": 133377.0,
"step": 16
},
{
"entropy": 1.8247886896133423,
"epoch": 0.05457463884430177,
"grad_norm": 8.738141059875488,
"learning_rate": 2.0512820512820513e-06,
"loss": 1.2349,
"mean_token_accuracy": 0.6745803952217102,
"num_tokens": 142243.0,
"step": 17
},
{
"entropy": 1.6763617396354675,
"epoch": 0.05778491171749599,
"grad_norm": 42.90169906616211,
"learning_rate": 2.1794871794871797e-06,
"loss": 0.9605,
"mean_token_accuracy": 0.7231403291225433,
"num_tokens": 150915.0,
"step": 18
},
{
"entropy": 1.668739914894104,
"epoch": 0.060995184590690206,
"grad_norm": 7.330332279205322,
"learning_rate": 2.307692307692308e-06,
"loss": 1.0346,
"mean_token_accuracy": 0.7043041586875916,
"num_tokens": 160421.0,
"step": 19
},
{
"entropy": 1.61678808927536,
"epoch": 0.06420545746388442,
"grad_norm": 7.4708147048950195,
"learning_rate": 2.435897435897436e-06,
"loss": 0.9297,
"mean_token_accuracy": 0.7384328246116638,
"num_tokens": 168561.0,
"step": 20
},
{
"entropy": 1.7209243774414062,
"epoch": 0.06741573033707865,
"grad_norm": 6.788050651550293,
"learning_rate": 2.564102564102564e-06,
"loss": 0.9503,
"mean_token_accuracy": 0.7228703796863556,
"num_tokens": 177282.0,
"step": 21
},
{
"entropy": 1.6552881002426147,
"epoch": 0.07062600321027288,
"grad_norm": 6.641953468322754,
"learning_rate": 2.6923076923076923e-06,
"loss": 0.918,
"mean_token_accuracy": 0.720936506986618,
"num_tokens": 185240.0,
"step": 22
},
{
"entropy": 1.5451985001564026,
"epoch": 0.0738362760834671,
"grad_norm": 7.2707953453063965,
"learning_rate": 2.8205128205128207e-06,
"loss": 0.9192,
"mean_token_accuracy": 0.7248755097389221,
"num_tokens": 193481.0,
"step": 23
},
{
"entropy": 1.5412019491195679,
"epoch": 0.07704654895666131,
"grad_norm": 6.560652256011963,
"learning_rate": 2.948717948717949e-06,
"loss": 0.8359,
"mean_token_accuracy": 0.739835798740387,
"num_tokens": 201616.0,
"step": 24
},
{
"entropy": 1.7039520740509033,
"epoch": 0.08025682182985554,
"grad_norm": 7.809452056884766,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.8146,
"mean_token_accuracy": 0.7461437880992889,
"num_tokens": 211646.0,
"step": 25
},
{
"entropy": 1.7537184357643127,
"epoch": 0.08346709470304976,
"grad_norm": 5.6596903800964355,
"learning_rate": 3.205128205128206e-06,
"loss": 0.7816,
"mean_token_accuracy": 0.7539893090724945,
"num_tokens": 221121.0,
"step": 26
},
{
"entropy": 1.692587435245514,
"epoch": 0.08667736757624397,
"grad_norm": 6.648515224456787,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7075,
"mean_token_accuracy": 0.7867643237113953,
"num_tokens": 228393.0,
"step": 27
},
{
"entropy": 1.5505646467208862,
"epoch": 0.0898876404494382,
"grad_norm": 4.989018440246582,
"learning_rate": 3.4615384615384617e-06,
"loss": 0.7552,
"mean_token_accuracy": 0.7660565078258514,
"num_tokens": 237025.0,
"step": 28
},
{
"entropy": 1.48868727684021,
"epoch": 0.09309791332263243,
"grad_norm": 5.676994800567627,
"learning_rate": 3.58974358974359e-06,
"loss": 0.6922,
"mean_token_accuracy": 0.774169385433197,
"num_tokens": 245283.0,
"step": 29
},
{
"entropy": 1.5744507908821106,
"epoch": 0.09630818619582665,
"grad_norm": 12.6621732711792,
"learning_rate": 3.7179487179487184e-06,
"loss": 0.7593,
"mean_token_accuracy": 0.7522169053554535,
"num_tokens": 254358.0,
"step": 30
},
{
"entropy": 1.441491186618805,
"epoch": 0.09951845906902086,
"grad_norm": 4.436478614807129,
"learning_rate": 3.846153846153847e-06,
"loss": 0.76,
"mean_token_accuracy": 0.7571093142032623,
"num_tokens": 263539.0,
"step": 31
},
{
"entropy": 1.6964601874351501,
"epoch": 0.10272873194221509,
"grad_norm": 7.042190074920654,
"learning_rate": 3.974358974358974e-06,
"loss": 0.7626,
"mean_token_accuracy": 0.7548583149909973,
"num_tokens": 273100.0,
"step": 32
},
{
"entropy": 1.482248067855835,
"epoch": 0.10593900481540931,
"grad_norm": 5.204789638519287,
"learning_rate": 4.102564102564103e-06,
"loss": 0.6846,
"mean_token_accuracy": 0.7835729420185089,
"num_tokens": 283183.0,
"step": 33
},
{
"entropy": 1.5532912611961365,
"epoch": 0.10914927768860354,
"grad_norm": 5.190078258514404,
"learning_rate": 4.230769230769231e-06,
"loss": 0.7163,
"mean_token_accuracy": 0.7673235833644867,
"num_tokens": 292075.0,
"step": 34
},
{
"entropy": 1.4678760766983032,
"epoch": 0.11235955056179775,
"grad_norm": 5.29253625869751,
"learning_rate": 4.358974358974359e-06,
"loss": 0.7016,
"mean_token_accuracy": 0.7700887024402618,
"num_tokens": 300604.0,
"step": 35
},
{
"entropy": 1.4619144797325134,
"epoch": 0.11556982343499198,
"grad_norm": 5.18648624420166,
"learning_rate": 4.487179487179488e-06,
"loss": 0.6847,
"mean_token_accuracy": 0.7695477604866028,
"num_tokens": 308831.0,
"step": 36
},
{
"entropy": 1.5652631521224976,
"epoch": 0.1187800963081862,
"grad_norm": 5.542480945587158,
"learning_rate": 4.615384615384616e-06,
"loss": 0.6814,
"mean_token_accuracy": 0.7790135145187378,
"num_tokens": 317048.0,
"step": 37
},
{
"entropy": 1.533966839313507,
"epoch": 0.12199036918138041,
"grad_norm": 6.46714973449707,
"learning_rate": 4.743589743589744e-06,
"loss": 0.6381,
"mean_token_accuracy": 0.789818674325943,
"num_tokens": 325284.0,
"step": 38
},
{
"entropy": 1.4124146699905396,
"epoch": 0.12520064205457465,
"grad_norm": 4.356935977935791,
"learning_rate": 4.871794871794872e-06,
"loss": 0.6803,
"mean_token_accuracy": 0.7765854597091675,
"num_tokens": 334504.0,
"step": 39
},
{
"entropy": 1.3948511481285095,
"epoch": 0.12841091492776885,
"grad_norm": 5.148672580718994,
"learning_rate": 5e-06,
"loss": 0.6072,
"mean_token_accuracy": 0.7939413189888,
"num_tokens": 343396.0,
"step": 40
},
{
"entropy": 1.7629846334457397,
"epoch": 0.13162118780096307,
"grad_norm": 8.669179916381836,
"learning_rate": 5.128205128205128e-06,
"loss": 0.6534,
"mean_token_accuracy": 0.7880546152591705,
"num_tokens": 353580.0,
"step": 41
},
{
"entropy": 1.5166628956794739,
"epoch": 0.1348314606741573,
"grad_norm": 6.4972310066223145,
"learning_rate": 5.256410256410257e-06,
"loss": 0.6503,
"mean_token_accuracy": 0.7786408066749573,
"num_tokens": 362812.0,
"step": 42
},
{
"entropy": 1.5299481749534607,
"epoch": 0.13804173354735153,
"grad_norm": 11.583465576171875,
"learning_rate": 5.384615384615385e-06,
"loss": 0.6344,
"mean_token_accuracy": 0.7911360859870911,
"num_tokens": 371115.0,
"step": 43
},
{
"entropy": 1.347219169139862,
"epoch": 0.14125200642054575,
"grad_norm": 5.8644185066223145,
"learning_rate": 5.512820512820514e-06,
"loss": 0.6108,
"mean_token_accuracy": 0.791721373796463,
"num_tokens": 379905.0,
"step": 44
},
{
"entropy": 1.4316428899765015,
"epoch": 0.14446227929373998,
"grad_norm": 5.175519943237305,
"learning_rate": 5.641025641025641e-06,
"loss": 0.6052,
"mean_token_accuracy": 0.7954953908920288,
"num_tokens": 388232.0,
"step": 45
},
{
"entropy": 1.6255661249160767,
"epoch": 0.1476725521669342,
"grad_norm": 11.087921142578125,
"learning_rate": 5.769230769230769e-06,
"loss": 0.6345,
"mean_token_accuracy": 0.7858785092830658,
"num_tokens": 396302.0,
"step": 46
},
{
"entropy": 1.498759388923645,
"epoch": 0.1508828250401284,
"grad_norm": 10.765654563903809,
"learning_rate": 5.897435897435898e-06,
"loss": 0.5593,
"mean_token_accuracy": 0.7999837696552277,
"num_tokens": 405773.0,
"step": 47
},
{
"entropy": 1.510821521282196,
"epoch": 0.15409309791332262,
"grad_norm": 6.4933695793151855,
"learning_rate": 6.025641025641026e-06,
"loss": 0.5956,
"mean_token_accuracy": 0.7734400928020477,
"num_tokens": 415320.0,
"step": 48
},
{
"entropy": 1.58489990234375,
"epoch": 0.15730337078651685,
"grad_norm": 8.327532768249512,
"learning_rate": 6.153846153846155e-06,
"loss": 0.5779,
"mean_token_accuracy": 0.811727911233902,
"num_tokens": 423824.0,
"step": 49
},
{
"entropy": 1.6798959970474243,
"epoch": 0.16051364365971107,
"grad_norm": 9.243504524230957,
"learning_rate": 6.282051282051282e-06,
"loss": 0.6383,
"mean_token_accuracy": 0.7815350294113159,
"num_tokens": 433980.0,
"step": 50
},
{
"entropy": 1.5442488193511963,
"epoch": 0.1637239165329053,
"grad_norm": 10.32125473022461,
"learning_rate": 6.410256410256412e-06,
"loss": 0.6572,
"mean_token_accuracy": 0.7940186858177185,
"num_tokens": 443555.0,
"step": 51
},
{
"entropy": 1.5454481840133667,
"epoch": 0.16693418940609953,
"grad_norm": 7.995494842529297,
"learning_rate": 6.538461538461539e-06,
"loss": 0.6512,
"mean_token_accuracy": 0.7784733474254608,
"num_tokens": 451491.0,
"step": 52
},
{
"entropy": 1.4755758047103882,
"epoch": 0.17014446227929375,
"grad_norm": 9.288561820983887,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6035,
"mean_token_accuracy": 0.7999023199081421,
"num_tokens": 461177.0,
"step": 53
},
{
"entropy": 1.6358034014701843,
"epoch": 0.17335473515248795,
"grad_norm": 6.63067102432251,
"learning_rate": 6.794871794871796e-06,
"loss": 0.6189,
"mean_token_accuracy": 0.7967036664485931,
"num_tokens": 469468.0,
"step": 54
},
{
"entropy": 1.605944573879242,
"epoch": 0.17656500802568217,
"grad_norm": 6.3592987060546875,
"learning_rate": 6.923076923076923e-06,
"loss": 0.6694,
"mean_token_accuracy": 0.7812408804893494,
"num_tokens": 477498.0,
"step": 55
},
{
"entropy": 1.6061294674873352,
"epoch": 0.1797752808988764,
"grad_norm": 5.8274712562561035,
"learning_rate": 7.051282051282053e-06,
"loss": 0.6096,
"mean_token_accuracy": 0.78871089220047,
"num_tokens": 486587.0,
"step": 56
},
{
"entropy": 1.5025911927223206,
"epoch": 0.18298555377207062,
"grad_norm": 4.877136707305908,
"learning_rate": 7.17948717948718e-06,
"loss": 0.5809,
"mean_token_accuracy": 0.7969581782817841,
"num_tokens": 494600.0,
"step": 57
},
{
"entropy": 1.5793496370315552,
"epoch": 0.18619582664526485,
"grad_norm": 5.73530912399292,
"learning_rate": 7.307692307692308e-06,
"loss": 0.5828,
"mean_token_accuracy": 0.8010603785514832,
"num_tokens": 505041.0,
"step": 58
},
{
"entropy": 1.4806573987007141,
"epoch": 0.18940609951845908,
"grad_norm": 4.669845104217529,
"learning_rate": 7.435897435897437e-06,
"loss": 0.591,
"mean_token_accuracy": 0.8051888644695282,
"num_tokens": 514202.0,
"step": 59
},
{
"entropy": 1.501672387123108,
"epoch": 0.1926163723916533,
"grad_norm": 5.279482364654541,
"learning_rate": 7.564102564102564e-06,
"loss": 0.5285,
"mean_token_accuracy": 0.8189839124679565,
"num_tokens": 522981.0,
"step": 60
},
{
"entropy": 1.4994003772735596,
"epoch": 0.1958266452648475,
"grad_norm": 4.435177803039551,
"learning_rate": 7.692307692307694e-06,
"loss": 0.6004,
"mean_token_accuracy": 0.7918008863925934,
"num_tokens": 532071.0,
"step": 61
},
{
"entropy": 1.4552969932556152,
"epoch": 0.19903691813804172,
"grad_norm": 5.8819403648376465,
"learning_rate": 7.820512820512822e-06,
"loss": 0.6007,
"mean_token_accuracy": 0.7975403964519501,
"num_tokens": 542292.0,
"step": 62
},
{
"entropy": 1.617630124092102,
"epoch": 0.20224719101123595,
"grad_norm": 5.218127727508545,
"learning_rate": 7.948717948717949e-06,
"loss": 0.5843,
"mean_token_accuracy": 0.8035672008991241,
"num_tokens": 551404.0,
"step": 63
},
{
"entropy": 1.5008496046066284,
"epoch": 0.20545746388443017,
"grad_norm": 11.88429069519043,
"learning_rate": 8.076923076923077e-06,
"loss": 0.5689,
"mean_token_accuracy": 0.807811826467514,
"num_tokens": 559745.0,
"step": 64
},
{
"entropy": 1.607909917831421,
"epoch": 0.2086677367576244,
"grad_norm": 4.793018341064453,
"learning_rate": 8.205128205128205e-06,
"loss": 0.5537,
"mean_token_accuracy": 0.8123330175876617,
"num_tokens": 569437.0,
"step": 65
},
{
"entropy": 1.5648667216300964,
"epoch": 0.21187800963081863,
"grad_norm": 4.589395999908447,
"learning_rate": 8.333333333333334e-06,
"loss": 0.5838,
"mean_token_accuracy": 0.8071021139621735,
"num_tokens": 577731.0,
"step": 66
},
{
"entropy": 1.438844621181488,
"epoch": 0.21508828250401285,
"grad_norm": 4.69675350189209,
"learning_rate": 8.461538461538462e-06,
"loss": 0.5286,
"mean_token_accuracy": 0.8204745054244995,
"num_tokens": 585650.0,
"step": 67
},
{
"entropy": 1.507163166999817,
"epoch": 0.21829855537720708,
"grad_norm": 6.146286487579346,
"learning_rate": 8.58974358974359e-06,
"loss": 0.5781,
"mean_token_accuracy": 0.7892117500305176,
"num_tokens": 595188.0,
"step": 68
},
{
"entropy": 1.4560467600822449,
"epoch": 0.22150882825040127,
"grad_norm": 5.049193382263184,
"learning_rate": 8.717948717948719e-06,
"loss": 0.6418,
"mean_token_accuracy": 0.7874622046947479,
"num_tokens": 604332.0,
"step": 69
},
{
"entropy": 1.4594195485115051,
"epoch": 0.2247191011235955,
"grad_norm": 6.110840320587158,
"learning_rate": 8.846153846153847e-06,
"loss": 0.5814,
"mean_token_accuracy": 0.8057654201984406,
"num_tokens": 612359.0,
"step": 70
},
{
"entropy": 1.540019154548645,
"epoch": 0.22792937399678972,
"grad_norm": 6.3223395347595215,
"learning_rate": 8.974358974358976e-06,
"loss": 0.5083,
"mean_token_accuracy": 0.8341903388500214,
"num_tokens": 619746.0,
"step": 71
},
{
"entropy": 1.4772478938102722,
"epoch": 0.23113964686998395,
"grad_norm": 9.083407402038574,
"learning_rate": 9.102564102564104e-06,
"loss": 0.5976,
"mean_token_accuracy": 0.7991544604301453,
"num_tokens": 628789.0,
"step": 72
},
{
"entropy": 1.5935535430908203,
"epoch": 0.23434991974317818,
"grad_norm": 5.82806396484375,
"learning_rate": 9.230769230769232e-06,
"loss": 0.5975,
"mean_token_accuracy": 0.8066189289093018,
"num_tokens": 637168.0,
"step": 73
},
{
"entropy": 1.6355071067810059,
"epoch": 0.2375601926163724,
"grad_norm": 5.713013172149658,
"learning_rate": 9.358974358974359e-06,
"loss": 0.6243,
"mean_token_accuracy": 0.7890200316905975,
"num_tokens": 645489.0,
"step": 74
},
{
"entropy": 1.5683773159980774,
"epoch": 0.24077046548956663,
"grad_norm": 4.871849060058594,
"learning_rate": 9.487179487179487e-06,
"loss": 0.5996,
"mean_token_accuracy": 0.7987681031227112,
"num_tokens": 654474.0,
"step": 75
},
{
"entropy": 1.554315447807312,
"epoch": 0.24398073836276082,
"grad_norm": 4.6450653076171875,
"learning_rate": 9.615384615384616e-06,
"loss": 0.607,
"mean_token_accuracy": 0.7968572080135345,
"num_tokens": 664329.0,
"step": 76
},
{
"entropy": 1.5780853033065796,
"epoch": 0.24719101123595505,
"grad_norm": 5.811628818511963,
"learning_rate": 9.743589743589744e-06,
"loss": 0.632,
"mean_token_accuracy": 0.7882947325706482,
"num_tokens": 673150.0,
"step": 77
},
{
"entropy": 1.7099721431732178,
"epoch": 0.2504012841091493,
"grad_norm": 13.076167106628418,
"learning_rate": 9.871794871794872e-06,
"loss": 0.5605,
"mean_token_accuracy": 0.8057580888271332,
"num_tokens": 682512.0,
"step": 78
},
{
"entropy": 1.4776220321655273,
"epoch": 0.2536115569823435,
"grad_norm": 7.290124416351318,
"learning_rate": 1e-05,
"loss": 0.6197,
"mean_token_accuracy": 0.7889206409454346,
"num_tokens": 691640.0,
"step": 79
},
{
"entropy": 1.739963173866272,
"epoch": 0.2568218298555377,
"grad_norm": 6.372115135192871,
"learning_rate": 9.999988765773283e-06,
"loss": 0.6253,
"mean_token_accuracy": 0.7942008674144745,
"num_tokens": 700384.0,
"step": 80
},
{
"entropy": 1.6132588982582092,
"epoch": 0.26003210272873195,
"grad_norm": 9.141928672790527,
"learning_rate": 9.99995506314361e-06,
"loss": 0.586,
"mean_token_accuracy": 0.8013035655021667,
"num_tokens": 709604.0,
"step": 81
},
{
"entropy": 1.7951747179031372,
"epoch": 0.26324237560192615,
"grad_norm": 15.085265159606934,
"learning_rate": 9.999898892262433e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8083482682704926,
"num_tokens": 717638.0,
"step": 82
},
{
"entropy": 1.7024835348129272,
"epoch": 0.2664526484751204,
"grad_norm": 5.644409656524658,
"learning_rate": 9.99982025338217e-06,
"loss": 0.602,
"mean_token_accuracy": 0.8106231689453125,
"num_tokens": 726166.0,
"step": 83
},
{
"entropy": 1.5966495275497437,
"epoch": 0.2696629213483146,
"grad_norm": 5.191967487335205,
"learning_rate": 9.999719146856191e-06,
"loss": 0.5468,
"mean_token_accuracy": 0.814395397901535,
"num_tokens": 734713.0,
"step": 84
},
{
"entropy": 1.5932486057281494,
"epoch": 0.27287319422150885,
"grad_norm": 8.375090599060059,
"learning_rate": 9.999595573138845e-06,
"loss": 0.5339,
"mean_token_accuracy": 0.81120365858078,
"num_tokens": 742232.0,
"step": 85
},
{
"entropy": 1.7226688861846924,
"epoch": 0.27608346709470305,
"grad_norm": 4.805099964141846,
"learning_rate": 9.99944953278543e-06,
"loss": 0.5941,
"mean_token_accuracy": 0.8079180121421814,
"num_tokens": 750192.0,
"step": 86
},
{
"entropy": 1.6014992594718933,
"epoch": 0.27929373996789725,
"grad_norm": 16.921457290649414,
"learning_rate": 9.99928102645221e-06,
"loss": 0.5497,
"mean_token_accuracy": 0.8063566982746124,
"num_tokens": 758813.0,
"step": 87
},
{
"entropy": 1.560517430305481,
"epoch": 0.2825040128410915,
"grad_norm": 9.258691787719727,
"learning_rate": 9.999090054896397e-06,
"loss": 0.6238,
"mean_token_accuracy": 0.7950149774551392,
"num_tokens": 768181.0,
"step": 88
},
{
"entropy": 1.7196683287620544,
"epoch": 0.2857142857142857,
"grad_norm": 5.792410850524902,
"learning_rate": 9.99887661897616e-06,
"loss": 0.5559,
"mean_token_accuracy": 0.808037519454956,
"num_tokens": 776667.0,
"step": 89
},
{
"entropy": 1.6795648336410522,
"epoch": 0.28892455858747995,
"grad_norm": 6.827014446258545,
"learning_rate": 9.998640719650609e-06,
"loss": 0.5508,
"mean_token_accuracy": 0.8196805119514465,
"num_tokens": 784702.0,
"step": 90
},
{
"entropy": 1.5682292580604553,
"epoch": 0.29213483146067415,
"grad_norm": 4.962204456329346,
"learning_rate": 9.99838235797981e-06,
"loss": 0.5451,
"mean_token_accuracy": 0.8104510009288788,
"num_tokens": 793038.0,
"step": 91
},
{
"entropy": 1.6802659630775452,
"epoch": 0.2953451043338684,
"grad_norm": 5.178422451019287,
"learning_rate": 9.998101535124758e-06,
"loss": 0.5749,
"mean_token_accuracy": 0.8083841502666473,
"num_tokens": 801811.0,
"step": 92
},
{
"entropy": 1.5281678438186646,
"epoch": 0.2985553772070626,
"grad_norm": 5.047668933868408,
"learning_rate": 9.997798252347382e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8163295686244965,
"num_tokens": 810749.0,
"step": 93
},
{
"entropy": 1.4543544054031372,
"epoch": 0.3017656500802568,
"grad_norm": 7.823535919189453,
"learning_rate": 9.997472511010543e-06,
"loss": 0.625,
"mean_token_accuracy": 0.7929400205612183,
"num_tokens": 819856.0,
"step": 94
},
{
"entropy": 1.565036654472351,
"epoch": 0.30497592295345105,
"grad_norm": 6.509371280670166,
"learning_rate": 9.99712431257802e-06,
"loss": 0.5937,
"mean_token_accuracy": 0.7926050424575806,
"num_tokens": 828198.0,
"step": 95
},
{
"entropy": 1.639006495475769,
"epoch": 0.30818619582664525,
"grad_norm": 7.456268310546875,
"learning_rate": 9.99675365861451e-06,
"loss": 0.5543,
"mean_token_accuracy": 0.8075222671031952,
"num_tokens": 837031.0,
"step": 96
},
{
"entropy": 1.4620029926300049,
"epoch": 0.3113964686998395,
"grad_norm": 5.094261646270752,
"learning_rate": 9.996360550785619e-06,
"loss": 0.5775,
"mean_token_accuracy": 0.8004140257835388,
"num_tokens": 846009.0,
"step": 97
},
{
"entropy": 1.5976275205612183,
"epoch": 0.3146067415730337,
"grad_norm": 8.545952796936035,
"learning_rate": 9.995944990857848e-06,
"loss": 0.5532,
"mean_token_accuracy": 0.821792334318161,
"num_tokens": 855269.0,
"step": 98
},
{
"entropy": 1.5488384366035461,
"epoch": 0.31781701444622795,
"grad_norm": 10.916169166564941,
"learning_rate": 9.9955069806986e-06,
"loss": 0.5638,
"mean_token_accuracy": 0.8070700764656067,
"num_tokens": 862934.0,
"step": 99
},
{
"entropy": 1.5096496939659119,
"epoch": 0.32102728731942215,
"grad_norm": 11.81431770324707,
"learning_rate": 9.995046522276152e-06,
"loss": 0.5911,
"mean_token_accuracy": 0.8029804527759552,
"num_tokens": 871122.0,
"step": 100
},
{
"entropy": 1.5772724151611328,
"epoch": 0.32423756019261635,
"grad_norm": 4.687896251678467,
"learning_rate": 9.994563617659665e-06,
"loss": 0.6396,
"mean_token_accuracy": 0.7922047674655914,
"num_tokens": 880220.0,
"step": 101
},
{
"entropy": 1.5804290175437927,
"epoch": 0.3274478330658106,
"grad_norm": 5.048791408538818,
"learning_rate": 9.994058269019163e-06,
"loss": 0.6029,
"mean_token_accuracy": 0.7916356921195984,
"num_tokens": 889551.0,
"step": 102
},
{
"entropy": 1.591521441936493,
"epoch": 0.3306581059390048,
"grad_norm": 11.681397438049316,
"learning_rate": 9.993530478625524e-06,
"loss": 0.5049,
"mean_token_accuracy": 0.8214498460292816,
"num_tokens": 897040.0,
"step": 103
},
{
"entropy": 1.5347830057144165,
"epoch": 0.33386837881219905,
"grad_norm": 10.396520614624023,
"learning_rate": 9.992980248850476e-06,
"loss": 0.5725,
"mean_token_accuracy": 0.8061753809452057,
"num_tokens": 905563.0,
"step": 104
},
{
"entropy": 1.6515621542930603,
"epoch": 0.33707865168539325,
"grad_norm": 4.542778491973877,
"learning_rate": 9.992407582166582e-06,
"loss": 0.6193,
"mean_token_accuracy": 0.7810203433036804,
"num_tokens": 914347.0,
"step": 105
},
{
"entropy": 1.626991093158722,
"epoch": 0.3402889245585875,
"grad_norm": 5.993000507354736,
"learning_rate": 9.99181248114723e-06,
"loss": 0.6076,
"mean_token_accuracy": 0.7984636723995209,
"num_tokens": 923142.0,
"step": 106
},
{
"entropy": 1.5122568607330322,
"epoch": 0.3434991974317817,
"grad_norm": 6.345304489135742,
"learning_rate": 9.991194948466615e-06,
"loss": 0.6133,
"mean_token_accuracy": 0.7982348203659058,
"num_tokens": 931756.0,
"step": 107
},
{
"entropy": 1.4367440938949585,
"epoch": 0.3467094703049759,
"grad_norm": 7.016558647155762,
"learning_rate": 9.990554986899745e-06,
"loss": 0.5683,
"mean_token_accuracy": 0.808490514755249,
"num_tokens": 940192.0,
"step": 108
},
{
"entropy": 1.6145474910736084,
"epoch": 0.34991974317817015,
"grad_norm": 7.367400169372559,
"learning_rate": 9.989892599322404e-06,
"loss": 0.5275,
"mean_token_accuracy": 0.8245007693767548,
"num_tokens": 948340.0,
"step": 109
},
{
"entropy": 1.6267709136009216,
"epoch": 0.35313001605136435,
"grad_norm": 8.42909049987793,
"learning_rate": 9.98920778871116e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8199409544467926,
"num_tokens": 957371.0,
"step": 110
},
{
"entropy": 1.5337051153182983,
"epoch": 0.3563402889245586,
"grad_norm": 5.027129650115967,
"learning_rate": 9.988500558143337e-06,
"loss": 0.6374,
"mean_token_accuracy": 0.7932157814502716,
"num_tokens": 968899.0,
"step": 111
},
{
"entropy": 1.5816736221313477,
"epoch": 0.3595505617977528,
"grad_norm": 5.9406304359436035,
"learning_rate": 9.987770910797014e-06,
"loss": 0.5699,
"mean_token_accuracy": 0.8138624727725983,
"num_tokens": 977002.0,
"step": 112
},
{
"entropy": 1.5768181681632996,
"epoch": 0.36276083467094705,
"grad_norm": 5.563234806060791,
"learning_rate": 9.987018849950996e-06,
"loss": 0.5396,
"mean_token_accuracy": 0.8184849619865417,
"num_tokens": 985349.0,
"step": 113
},
{
"entropy": 1.6216952800750732,
"epoch": 0.36597110754414125,
"grad_norm": 4.94663667678833,
"learning_rate": 9.986244378984817e-06,
"loss": 0.5606,
"mean_token_accuracy": 0.8066204190254211,
"num_tokens": 994148.0,
"step": 114
},
{
"entropy": 1.587065875530243,
"epoch": 0.36918138041733545,
"grad_norm": 6.290283203125,
"learning_rate": 9.985447501378706e-06,
"loss": 0.5305,
"mean_token_accuracy": 0.8203730583190918,
"num_tokens": 1003995.0,
"step": 115
},
{
"entropy": 1.6694093346595764,
"epoch": 0.3723916532905297,
"grad_norm": 7.1124138832092285,
"learning_rate": 9.984628220713587e-06,
"loss": 0.5579,
"mean_token_accuracy": 0.7883734703063965,
"num_tokens": 1012908.0,
"step": 116
},
{
"entropy": 1.710681140422821,
"epoch": 0.3756019261637239,
"grad_norm": 4.218459606170654,
"learning_rate": 9.983786540671052e-06,
"loss": 0.6475,
"mean_token_accuracy": 0.7862134873867035,
"num_tokens": 1021508.0,
"step": 117
},
{
"entropy": 1.6554288864135742,
"epoch": 0.37881219903691815,
"grad_norm": 22.958293914794922,
"learning_rate": 9.98292246503335e-06,
"loss": 0.5268,
"mean_token_accuracy": 0.8189591467380524,
"num_tokens": 1029277.0,
"step": 118
},
{
"entropy": 1.614859402179718,
"epoch": 0.38202247191011235,
"grad_norm": 7.260986328125,
"learning_rate": 9.982035997683372e-06,
"loss": 0.5515,
"mean_token_accuracy": 0.7965718805789948,
"num_tokens": 1037552.0,
"step": 119
},
{
"entropy": 1.623701572418213,
"epoch": 0.3852327447833066,
"grad_norm": 6.900394439697266,
"learning_rate": 9.981127142604628e-06,
"loss": 0.5543,
"mean_token_accuracy": 0.8073444068431854,
"num_tokens": 1046341.0,
"step": 120
},
{
"entropy": 1.6763284802436829,
"epoch": 0.3884430176565008,
"grad_norm": 5.463217735290527,
"learning_rate": 9.980195903881231e-06,
"loss": 0.5907,
"mean_token_accuracy": 0.8022545874118805,
"num_tokens": 1054765.0,
"step": 121
},
{
"entropy": 1.7533277869224548,
"epoch": 0.391653290529695,
"grad_norm": 4.653329372406006,
"learning_rate": 9.979242285697878e-06,
"loss": 0.5433,
"mean_token_accuracy": 0.8024509847164154,
"num_tokens": 1063762.0,
"step": 122
},
{
"entropy": 1.6675923466682434,
"epoch": 0.39486356340288925,
"grad_norm": 4.841404914855957,
"learning_rate": 9.978266292339838e-06,
"loss": 0.5855,
"mean_token_accuracy": 0.8054037988185883,
"num_tokens": 1073721.0,
"step": 123
},
{
"entropy": 1.549901008605957,
"epoch": 0.39807383627608345,
"grad_norm": 4.415830612182617,
"learning_rate": 9.97726792819292e-06,
"loss": 0.6026,
"mean_token_accuracy": 0.8016513884067535,
"num_tokens": 1081712.0,
"step": 124
},
{
"entropy": 1.6741828322410583,
"epoch": 0.4012841091492777,
"grad_norm": 3.7602241039276123,
"learning_rate": 9.976247197743465e-06,
"loss": 0.5501,
"mean_token_accuracy": 0.8158861398696899,
"num_tokens": 1090661.0,
"step": 125
},
{
"entropy": 1.747463345527649,
"epoch": 0.4044943820224719,
"grad_norm": 5.772115707397461,
"learning_rate": 9.975204105578318e-06,
"loss": 0.6089,
"mean_token_accuracy": 0.7918793559074402,
"num_tokens": 1099806.0,
"step": 126
},
{
"entropy": 1.503204584121704,
"epoch": 0.40770465489566615,
"grad_norm": 5.531157970428467,
"learning_rate": 9.974138656384815e-06,
"loss": 0.5032,
"mean_token_accuracy": 0.8174611032009125,
"num_tokens": 1107804.0,
"step": 127
},
{
"entropy": 1.6939732432365417,
"epoch": 0.41091492776886035,
"grad_norm": 6.103886127471924,
"learning_rate": 9.973050854950756e-06,
"loss": 0.5437,
"mean_token_accuracy": 0.8124994933605194,
"num_tokens": 1116670.0,
"step": 128
},
{
"entropy": 1.6444594860076904,
"epoch": 0.41412520064205455,
"grad_norm": 4.691072940826416,
"learning_rate": 9.97194070616438e-06,
"loss": 0.5612,
"mean_token_accuracy": 0.8085650205612183,
"num_tokens": 1125515.0,
"step": 129
},
{
"entropy": 1.577980101108551,
"epoch": 0.4173354735152488,
"grad_norm": 5.344744682312012,
"learning_rate": 9.970808215014357e-06,
"loss": 0.5397,
"mean_token_accuracy": 0.8058919608592987,
"num_tokens": 1133769.0,
"step": 130
},
{
"entropy": 1.7274922728538513,
"epoch": 0.420545746388443,
"grad_norm": 12.869426727294922,
"learning_rate": 9.969653386589749e-06,
"loss": 0.5811,
"mean_token_accuracy": 0.8053319454193115,
"num_tokens": 1143863.0,
"step": 131
},
{
"entropy": 1.695865511894226,
"epoch": 0.42375601926163725,
"grad_norm": 7.400782585144043,
"learning_rate": 9.968476226079997e-06,
"loss": 0.5592,
"mean_token_accuracy": 0.8116855323314667,
"num_tokens": 1152046.0,
"step": 132
},
{
"entropy": 1.7647086381912231,
"epoch": 0.42696629213483145,
"grad_norm": 13.192926406860352,
"learning_rate": 9.967276738774897e-06,
"loss": 0.5565,
"mean_token_accuracy": 0.8083753287792206,
"num_tokens": 1160864.0,
"step": 133
},
{
"entropy": 1.5161982774734497,
"epoch": 0.4301765650080257,
"grad_norm": 4.342813014984131,
"learning_rate": 9.966054930064577e-06,
"loss": 0.5696,
"mean_token_accuracy": 0.8116317987442017,
"num_tokens": 1169164.0,
"step": 134
},
{
"entropy": 1.7910877466201782,
"epoch": 0.4333868378812199,
"grad_norm": 6.658879280090332,
"learning_rate": 9.964810805439464e-06,
"loss": 0.5709,
"mean_token_accuracy": 0.8017919361591339,
"num_tokens": 1179921.0,
"step": 135
},
{
"entropy": 1.600732445716858,
"epoch": 0.43659711075441415,
"grad_norm": 14.430280685424805,
"learning_rate": 9.96354437049027e-06,
"loss": 0.625,
"mean_token_accuracy": 0.7819797992706299,
"num_tokens": 1189616.0,
"step": 136
},
{
"entropy": 1.7231240272521973,
"epoch": 0.43980738362760835,
"grad_norm": 9.525609970092773,
"learning_rate": 9.962255630907964e-06,
"loss": 0.5661,
"mean_token_accuracy": 0.8119661808013916,
"num_tokens": 1197261.0,
"step": 137
},
{
"entropy": 1.6479978561401367,
"epoch": 0.44301765650080255,
"grad_norm": 22.548439025878906,
"learning_rate": 9.96094459248374e-06,
"loss": 0.5784,
"mean_token_accuracy": 0.8013301193714142,
"num_tokens": 1204728.0,
"step": 138
},
{
"entropy": 1.7986060976982117,
"epoch": 0.4462279293739968,
"grad_norm": 5.41772985458374,
"learning_rate": 9.959611261108999e-06,
"loss": 0.546,
"mean_token_accuracy": 0.8117686808109283,
"num_tokens": 1214191.0,
"step": 139
},
{
"entropy": 1.5903725624084473,
"epoch": 0.449438202247191,
"grad_norm": 5.444282054901123,
"learning_rate": 9.95825564277532e-06,
"loss": 0.5919,
"mean_token_accuracy": 0.7964106798171997,
"num_tokens": 1223443.0,
"step": 140
},
{
"entropy": 1.4649049639701843,
"epoch": 0.45264847512038525,
"grad_norm": 10.411364555358887,
"learning_rate": 9.956877743574437e-06,
"loss": 0.5833,
"mean_token_accuracy": 0.8069176971912384,
"num_tokens": 1231905.0,
"step": 141
},
{
"entropy": 1.6031562089920044,
"epoch": 0.45585874799357945,
"grad_norm": 4.023768424987793,
"learning_rate": 9.955477569698197e-06,
"loss": 0.5195,
"mean_token_accuracy": 0.8230155110359192,
"num_tokens": 1241887.0,
"step": 142
},
{
"entropy": 1.6915020942687988,
"epoch": 0.4590690208667737,
"grad_norm": 4.516135215759277,
"learning_rate": 9.954055127438554e-06,
"loss": 0.548,
"mean_token_accuracy": 0.8158730268478394,
"num_tokens": 1250696.0,
"step": 143
},
{
"entropy": 1.5421813130378723,
"epoch": 0.4622792937399679,
"grad_norm": 4.3148722648620605,
"learning_rate": 9.952610423187516e-06,
"loss": 0.6044,
"mean_token_accuracy": 0.794927716255188,
"num_tokens": 1259186.0,
"step": 144
},
{
"entropy": 1.6087906956672668,
"epoch": 0.4654895666131621,
"grad_norm": 10.47445011138916,
"learning_rate": 9.951143463437145e-06,
"loss": 0.6004,
"mean_token_accuracy": 0.7896897196769714,
"num_tokens": 1269293.0,
"step": 145
},
{
"entropy": 1.522541105747223,
"epoch": 0.46869983948635635,
"grad_norm": 5.343850612640381,
"learning_rate": 9.949654254779499e-06,
"loss": 0.5312,
"mean_token_accuracy": 0.8037137985229492,
"num_tokens": 1278821.0,
"step": 146
},
{
"entropy": 1.67844557762146,
"epoch": 0.47191011235955055,
"grad_norm": 6.206827640533447,
"learning_rate": 9.948142803906623e-06,
"loss": 0.6064,
"mean_token_accuracy": 0.7928981781005859,
"num_tokens": 1286812.0,
"step": 147
},
{
"entropy": 1.5753509998321533,
"epoch": 0.4751203852327448,
"grad_norm": 5.666691303253174,
"learning_rate": 9.946609117610508e-06,
"loss": 0.5374,
"mean_token_accuracy": 0.8405792713165283,
"num_tokens": 1297550.0,
"step": 148
},
{
"entropy": 1.699306309223175,
"epoch": 0.478330658105939,
"grad_norm": 5.685131072998047,
"learning_rate": 9.94505320278307e-06,
"loss": 0.6641,
"mean_token_accuracy": 0.7865519523620605,
"num_tokens": 1307649.0,
"step": 149
},
{
"entropy": 1.8110342025756836,
"epoch": 0.48154093097913325,
"grad_norm": 6.575296401977539,
"learning_rate": 9.943475066416105e-06,
"loss": 0.5588,
"mean_token_accuracy": 0.8110237419605255,
"num_tokens": 1317315.0,
"step": 150
},
{
"entropy": 1.5596372485160828,
"epoch": 0.48475120385232745,
"grad_norm": 14.576911926269531,
"learning_rate": 9.94187471560127e-06,
"loss": 0.6137,
"mean_token_accuracy": 0.8028269708156586,
"num_tokens": 1326698.0,
"step": 151
},
{
"entropy": 1.6433868408203125,
"epoch": 0.48796147672552165,
"grad_norm": 4.3727216720581055,
"learning_rate": 9.940252157530048e-06,
"loss": 0.5117,
"mean_token_accuracy": 0.8254751861095428,
"num_tokens": 1334747.0,
"step": 152
},
{
"entropy": 1.7227093577384949,
"epoch": 0.4911717495987159,
"grad_norm": 10.242402076721191,
"learning_rate": 9.938607399493714e-06,
"loss": 0.5957,
"mean_token_accuracy": 0.8011538982391357,
"num_tokens": 1343974.0,
"step": 153
},
{
"entropy": 1.6710272431373596,
"epoch": 0.4943820224719101,
"grad_norm": 9.269742012023926,
"learning_rate": 9.936940448883299e-06,
"loss": 0.564,
"mean_token_accuracy": 0.810530036687851,
"num_tokens": 1352278.0,
"step": 154
},
{
"entropy": 1.6066045761108398,
"epoch": 0.49759229534510435,
"grad_norm": 4.906155109405518,
"learning_rate": 9.935251313189564e-06,
"loss": 0.5821,
"mean_token_accuracy": 0.8002186417579651,
"num_tokens": 1360110.0,
"step": 155
},
{
"entropy": 1.6245338916778564,
"epoch": 0.5008025682182986,
"grad_norm": 4.081302642822266,
"learning_rate": 9.933540000002966e-06,
"loss": 0.5425,
"mean_token_accuracy": 0.8081716001033783,
"num_tokens": 1369064.0,
"step": 156
},
{
"entropy": 1.814435362815857,
"epoch": 0.5040128410914928,
"grad_norm": 4.305764198303223,
"learning_rate": 9.931806517013612e-06,
"loss": 0.5317,
"mean_token_accuracy": 0.8192135095596313,
"num_tokens": 1377809.0,
"step": 157
},
{
"entropy": 1.5969063639640808,
"epoch": 0.507223113964687,
"grad_norm": 7.35665225982666,
"learning_rate": 9.930050872011242e-06,
"loss": 0.583,
"mean_token_accuracy": 0.7939075827598572,
"num_tokens": 1386467.0,
"step": 158
},
{
"entropy": 1.554268717765808,
"epoch": 0.5104333868378812,
"grad_norm": 4.8409271240234375,
"learning_rate": 9.92827307288518e-06,
"loss": 0.5961,
"mean_token_accuracy": 0.801440566778183,
"num_tokens": 1395387.0,
"step": 159
},
{
"entropy": 1.658279836177826,
"epoch": 0.5136436597110754,
"grad_norm": 4.918241024017334,
"learning_rate": 9.926473127624306e-06,
"loss": 0.5237,
"mean_token_accuracy": 0.8125604391098022,
"num_tokens": 1402911.0,
"step": 160
},
{
"entropy": 1.6950554847717285,
"epoch": 0.5168539325842697,
"grad_norm": 4.100020885467529,
"learning_rate": 9.924651044317017e-06,
"loss": 0.5679,
"mean_token_accuracy": 0.8086461424827576,
"num_tokens": 1410709.0,
"step": 161
},
{
"entropy": 1.6739388704299927,
"epoch": 0.5200642054574639,
"grad_norm": 4.0072126388549805,
"learning_rate": 9.922806831151192e-06,
"loss": 0.483,
"mean_token_accuracy": 0.8350411057472229,
"num_tokens": 1419682.0,
"step": 162
},
{
"entropy": 1.7393546104431152,
"epoch": 0.5232744783306581,
"grad_norm": 4.075192451477051,
"learning_rate": 9.920940496414153e-06,
"loss": 0.4849,
"mean_token_accuracy": 0.8328077495098114,
"num_tokens": 1428573.0,
"step": 163
},
{
"entropy": 1.685001015663147,
"epoch": 0.5264847512038523,
"grad_norm": 4.823047637939453,
"learning_rate": 9.919052048492633e-06,
"loss": 0.6149,
"mean_token_accuracy": 0.8053491711616516,
"num_tokens": 1438592.0,
"step": 164
},
{
"entropy": 1.5803438425064087,
"epoch": 0.5296950240770465,
"grad_norm": 6.810695648193359,
"learning_rate": 9.917141495872733e-06,
"loss": 0.5079,
"mean_token_accuracy": 0.8178175091743469,
"num_tokens": 1446506.0,
"step": 165
},
{
"entropy": 1.6139253377914429,
"epoch": 0.5329052969502408,
"grad_norm": 8.459833145141602,
"learning_rate": 9.915208847139883e-06,
"loss": 0.5564,
"mean_token_accuracy": 0.8102662563323975,
"num_tokens": 1454804.0,
"step": 166
},
{
"entropy": 1.501044511795044,
"epoch": 0.536115569823435,
"grad_norm": 56.71458435058594,
"learning_rate": 9.913254110978812e-06,
"loss": 0.5484,
"mean_token_accuracy": 0.8092103004455566,
"num_tokens": 1463547.0,
"step": 167
},
{
"entropy": 1.6031732559204102,
"epoch": 0.5393258426966292,
"grad_norm": 47.60392761230469,
"learning_rate": 9.911277296173498e-06,
"loss": 0.5669,
"mean_token_accuracy": 0.8047437965869904,
"num_tokens": 1472335.0,
"step": 168
},
{
"entropy": 1.471640169620514,
"epoch": 0.5425361155698234,
"grad_norm": 4.785444259643555,
"learning_rate": 9.909278411607134e-06,
"loss": 0.5332,
"mean_token_accuracy": 0.816964328289032,
"num_tokens": 1481165.0,
"step": 169
},
{
"entropy": 1.6395891308784485,
"epoch": 0.5457463884430177,
"grad_norm": 5.150639533996582,
"learning_rate": 9.90725746626209e-06,
"loss": 0.5777,
"mean_token_accuracy": 0.802484005689621,
"num_tokens": 1490534.0,
"step": 170
},
{
"entropy": 1.7222612500190735,
"epoch": 0.5489566613162119,
"grad_norm": 10.601120948791504,
"learning_rate": 9.90521446921987e-06,
"loss": 0.5492,
"mean_token_accuracy": 0.812993735074997,
"num_tokens": 1497850.0,
"step": 171
},
{
"entropy": 1.4922645092010498,
"epoch": 0.5521669341894061,
"grad_norm": 6.482858657836914,
"learning_rate": 9.903149429661072e-06,
"loss": 0.6384,
"mean_token_accuracy": 0.7931837439537048,
"num_tokens": 1506706.0,
"step": 172
},
{
"entropy": 1.649798572063446,
"epoch": 0.5553772070626003,
"grad_norm": 4.392049789428711,
"learning_rate": 9.90106235686534e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8184403777122498,
"num_tokens": 1514792.0,
"step": 173
},
{
"entropy": 1.639334261417389,
"epoch": 0.5585874799357945,
"grad_norm": 5.779803276062012,
"learning_rate": 9.89895326021134e-06,
"loss": 0.5342,
"mean_token_accuracy": 0.8119508326053619,
"num_tokens": 1522723.0,
"step": 174
},
{
"entropy": 1.6969090700149536,
"epoch": 0.5617977528089888,
"grad_norm": 4.983688831329346,
"learning_rate": 9.896822149176695e-06,
"loss": 0.5159,
"mean_token_accuracy": 0.8205990195274353,
"num_tokens": 1531592.0,
"step": 175
},
{
"entropy": 1.609685242176056,
"epoch": 0.565008025682183,
"grad_norm": 5.268664836883545,
"learning_rate": 9.894669033337962e-06,
"loss": 0.5402,
"mean_token_accuracy": 0.8162535429000854,
"num_tokens": 1541237.0,
"step": 176
},
{
"entropy": 1.7183340787887573,
"epoch": 0.5682182985553772,
"grad_norm": 6.601222038269043,
"learning_rate": 9.892493922370575e-06,
"loss": 0.5283,
"mean_token_accuracy": 0.8257523775100708,
"num_tokens": 1548767.0,
"step": 177
},
{
"entropy": 1.793528974056244,
"epoch": 0.5714285714285714,
"grad_norm": 7.592155933380127,
"learning_rate": 9.89029682604881e-06,
"loss": 0.51,
"mean_token_accuracy": 0.8226527869701385,
"num_tokens": 1556348.0,
"step": 178
},
{
"entropy": 1.7014802694320679,
"epoch": 0.5746388443017657,
"grad_norm": 5.251842021942139,
"learning_rate": 9.888077754245741e-06,
"loss": 0.5704,
"mean_token_accuracy": 0.8048693239688873,
"num_tokens": 1565716.0,
"step": 179
},
{
"entropy": 1.56605464220047,
"epoch": 0.5778491171749599,
"grad_norm": 7.1781086921691895,
"learning_rate": 9.88583671693319e-06,
"loss": 0.5432,
"mean_token_accuracy": 0.8167294263839722,
"num_tokens": 1574130.0,
"step": 180
},
{
"entropy": 1.8687368035316467,
"epoch": 0.5810593900481541,
"grad_norm": 6.982422828674316,
"learning_rate": 9.883573724181683e-06,
"loss": 0.5324,
"mean_token_accuracy": 0.8053440749645233,
"num_tokens": 1583316.0,
"step": 181
},
{
"entropy": 1.5925783514976501,
"epoch": 0.5842696629213483,
"grad_norm": 4.656749725341797,
"learning_rate": 9.881288786160413e-06,
"loss": 0.5597,
"mean_token_accuracy": 0.8063121140003204,
"num_tokens": 1591444.0,
"step": 182
},
{
"entropy": 1.5454033613204956,
"epoch": 0.5874799357945425,
"grad_norm": 4.605663299560547,
"learning_rate": 9.878981913137178e-06,
"loss": 0.4796,
"mean_token_accuracy": 0.8154693841934204,
"num_tokens": 1601849.0,
"step": 183
},
{
"entropy": 1.6912750005722046,
"epoch": 0.5906902086677368,
"grad_norm": 6.622162342071533,
"learning_rate": 9.87665311547836e-06,
"loss": 0.5147,
"mean_token_accuracy": 0.8236511945724487,
"num_tokens": 1609557.0,
"step": 184
},
{
"entropy": 1.6364272832870483,
"epoch": 0.593900481540931,
"grad_norm": 5.712879657745361,
"learning_rate": 9.87430240364885e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8128435611724854,
"num_tokens": 1619608.0,
"step": 185
},
{
"entropy": 1.7632077932357788,
"epoch": 0.5971107544141252,
"grad_norm": 4.955244541168213,
"learning_rate": 9.871929788212022e-06,
"loss": 0.5948,
"mean_token_accuracy": 0.7959562540054321,
"num_tokens": 1629727.0,
"step": 186
},
{
"entropy": 1.6786929965019226,
"epoch": 0.6003210272873194,
"grad_norm": 6.2427473068237305,
"learning_rate": 9.869535279829674e-06,
"loss": 0.6028,
"mean_token_accuracy": 0.7966740429401398,
"num_tokens": 1639226.0,
"step": 187
},
{
"entropy": 1.6981608867645264,
"epoch": 0.6035313001605136,
"grad_norm": 4.792971134185791,
"learning_rate": 9.867118889261988e-06,
"loss": 0.5328,
"mean_token_accuracy": 0.81971076130867,
"num_tokens": 1647305.0,
"step": 188
},
{
"entropy": 1.669969916343689,
"epoch": 0.6067415730337079,
"grad_norm": 3.715688943862915,
"learning_rate": 9.864680627367476e-06,
"loss": 0.6066,
"mean_token_accuracy": 0.7924070656299591,
"num_tokens": 1656645.0,
"step": 189
},
{
"entropy": 1.75922429561615,
"epoch": 0.6099518459069021,
"grad_norm": 5.012786388397217,
"learning_rate": 9.862220505102933e-06,
"loss": 0.5591,
"mean_token_accuracy": 0.8037042319774628,
"num_tokens": 1665007.0,
"step": 190
},
{
"entropy": 1.7086448073387146,
"epoch": 0.6131621187800963,
"grad_norm": 6.47388219833374,
"learning_rate": 9.859738533523384e-06,
"loss": 0.5558,
"mean_token_accuracy": 0.8141147196292877,
"num_tokens": 1673184.0,
"step": 191
},
{
"entropy": 1.6764840483665466,
"epoch": 0.6163723916532905,
"grad_norm": 5.120999336242676,
"learning_rate": 9.857234723782044e-06,
"loss": 0.5278,
"mean_token_accuracy": 0.8128564059734344,
"num_tokens": 1682622.0,
"step": 192
},
{
"entropy": 1.7752752900123596,
"epoch": 0.6195826645264848,
"grad_norm": 5.461716175079346,
"learning_rate": 9.854709087130261e-06,
"loss": 0.5447,
"mean_token_accuracy": 0.8150747120380402,
"num_tokens": 1690983.0,
"step": 193
},
{
"entropy": 1.576039433479309,
"epoch": 0.622792937399679,
"grad_norm": 16.93771743774414,
"learning_rate": 9.852161634917463e-06,
"loss": 0.5159,
"mean_token_accuracy": 0.8044776320457458,
"num_tokens": 1699951.0,
"step": 194
},
{
"entropy": 1.6042520999908447,
"epoch": 0.6260032102728732,
"grad_norm": 4.1131062507629395,
"learning_rate": 9.849592378591113e-06,
"loss": 0.5323,
"mean_token_accuracy": 0.8186607956886292,
"num_tokens": 1708057.0,
"step": 195
},
{
"entropy": 1.8272185921669006,
"epoch": 0.6292134831460674,
"grad_norm": 5.804073810577393,
"learning_rate": 9.847001329696653e-06,
"loss": 0.5514,
"mean_token_accuracy": 0.8083562552928925,
"num_tokens": 1717292.0,
"step": 196
},
{
"entropy": 1.5746408700942993,
"epoch": 0.6324237560192616,
"grad_norm": 13.559549331665039,
"learning_rate": 9.844388499877457e-06,
"loss": 0.6,
"mean_token_accuracy": 0.8068454265594482,
"num_tokens": 1725347.0,
"step": 197
},
{
"entropy": 1.698961853981018,
"epoch": 0.6356340288924559,
"grad_norm": 4.889476299285889,
"learning_rate": 9.841753900874774e-06,
"loss": 0.5872,
"mean_token_accuracy": 0.799273282289505,
"num_tokens": 1732840.0,
"step": 198
},
{
"entropy": 1.7588367462158203,
"epoch": 0.6388443017656501,
"grad_norm": 5.369776725769043,
"learning_rate": 9.839097544527674e-06,
"loss": 0.5042,
"mean_token_accuracy": 0.8182494044303894,
"num_tokens": 1740984.0,
"step": 199
},
{
"entropy": 1.730661690235138,
"epoch": 0.6420545746388443,
"grad_norm": 17.513286590576172,
"learning_rate": 9.836419442773004e-06,
"loss": 0.5309,
"mean_token_accuracy": 0.8073310256004333,
"num_tokens": 1749331.0,
"step": 200
},
{
"entropy": 1.7495179772377014,
"epoch": 0.6452648475120385,
"grad_norm": 5.817629814147949,
"learning_rate": 9.833719607645325e-06,
"loss": 0.5107,
"mean_token_accuracy": 0.8234334290027618,
"num_tokens": 1757183.0,
"step": 201
},
{
"entropy": 1.6555100679397583,
"epoch": 0.6484751203852327,
"grad_norm": 5.809082508087158,
"learning_rate": 9.830998051276858e-06,
"loss": 0.5884,
"mean_token_accuracy": 0.795190691947937,
"num_tokens": 1764907.0,
"step": 202
},
{
"entropy": 1.8146103024482727,
"epoch": 0.651685393258427,
"grad_norm": 4.854206085205078,
"learning_rate": 9.82825478589744e-06,
"loss": 0.5527,
"mean_token_accuracy": 0.8077179789543152,
"num_tokens": 1773880.0,
"step": 203
},
{
"entropy": 1.6006608605384827,
"epoch": 0.6548956661316212,
"grad_norm": 4.588094711303711,
"learning_rate": 9.825489823834454e-06,
"loss": 0.5905,
"mean_token_accuracy": 0.7999958395957947,
"num_tokens": 1782825.0,
"step": 204
},
{
"entropy": 1.6996418237686157,
"epoch": 0.6581059390048154,
"grad_norm": 6.713345050811768,
"learning_rate": 9.822703177512783e-06,
"loss": 0.5563,
"mean_token_accuracy": 0.8049568831920624,
"num_tokens": 1790524.0,
"step": 205
},
{
"entropy": 1.5989940166473389,
"epoch": 0.6613162118780096,
"grad_norm": 4.155484199523926,
"learning_rate": 9.819894859454756e-06,
"loss": 0.5546,
"mean_token_accuracy": 0.8159286975860596,
"num_tokens": 1799241.0,
"step": 206
},
{
"entropy": 1.6221102476119995,
"epoch": 0.6645264847512039,
"grad_norm": 5.686789035797119,
"learning_rate": 9.817064882280085e-06,
"loss": 0.5798,
"mean_token_accuracy": 0.8095100820064545,
"num_tokens": 1807291.0,
"step": 207
},
{
"entropy": 1.556957483291626,
"epoch": 0.6677367576243981,
"grad_norm": 3.906980514526367,
"learning_rate": 9.814213258705813e-06,
"loss": 0.6096,
"mean_token_accuracy": 0.8013098835945129,
"num_tokens": 1815356.0,
"step": 208
},
{
"entropy": 1.6848229765892029,
"epoch": 0.6709470304975923,
"grad_norm": 4.213841438293457,
"learning_rate": 9.811340001546252e-06,
"loss": 0.5178,
"mean_token_accuracy": 0.8091440200805664,
"num_tokens": 1823404.0,
"step": 209
},
{
"entropy": 1.767141580581665,
"epoch": 0.6741573033707865,
"grad_norm": 4.5109782218933105,
"learning_rate": 9.808445123712934e-06,
"loss": 0.5404,
"mean_token_accuracy": 0.8120012879371643,
"num_tokens": 1832487.0,
"step": 210
},
{
"entropy": 1.936172902584076,
"epoch": 0.6773675762439807,
"grad_norm": 6.350546360015869,
"learning_rate": 9.805528638214543e-06,
"loss": 0.5833,
"mean_token_accuracy": 0.8069708943367004,
"num_tokens": 1841466.0,
"step": 211
},
{
"entropy": 1.730377733707428,
"epoch": 0.680577849117175,
"grad_norm": 3.720072031021118,
"learning_rate": 9.802590558156863e-06,
"loss": 0.5529,
"mean_token_accuracy": 0.8099434673786163,
"num_tokens": 1851255.0,
"step": 212
},
{
"entropy": 1.8273388147354126,
"epoch": 0.6837881219903692,
"grad_norm": 4.521886348724365,
"learning_rate": 9.799630896742716e-06,
"loss": 0.5624,
"mean_token_accuracy": 0.8054306507110596,
"num_tokens": 1859161.0,
"step": 213
},
{
"entropy": 1.6505126357078552,
"epoch": 0.6869983948635634,
"grad_norm": 3.8699588775634766,
"learning_rate": 9.796649667271905e-06,
"loss": 0.5313,
"mean_token_accuracy": 0.8173911273479462,
"num_tokens": 1869881.0,
"step": 214
},
{
"entropy": 1.7609619498252869,
"epoch": 0.6902086677367576,
"grad_norm": 4.7254228591918945,
"learning_rate": 9.793646883141155e-06,
"loss": 0.5471,
"mean_token_accuracy": 0.8083101809024811,
"num_tokens": 1878217.0,
"step": 215
},
{
"entropy": 1.8547272682189941,
"epoch": 0.6934189406099518,
"grad_norm": 5.278886795043945,
"learning_rate": 9.790622557844047e-06,
"loss": 0.5315,
"mean_token_accuracy": 0.811885803937912,
"num_tokens": 1886890.0,
"step": 216
},
{
"entropy": 1.6623858213424683,
"epoch": 0.6966292134831461,
"grad_norm": 4.21257209777832,
"learning_rate": 9.787576704970965e-06,
"loss": 0.5739,
"mean_token_accuracy": 0.8085341453552246,
"num_tokens": 1895443.0,
"step": 217
},
{
"entropy": 1.8965311646461487,
"epoch": 0.6998394863563403,
"grad_norm": 6.084798336029053,
"learning_rate": 9.784509338209026e-06,
"loss": 0.5523,
"mean_token_accuracy": 0.819883793592453,
"num_tokens": 1903524.0,
"step": 218
},
{
"entropy": 1.78938889503479,
"epoch": 0.7030497592295345,
"grad_norm": 4.32880973815918,
"learning_rate": 9.781420471342035e-06,
"loss": 0.5657,
"mean_token_accuracy": 0.8084932565689087,
"num_tokens": 1912941.0,
"step": 219
},
{
"entropy": 1.7305169105529785,
"epoch": 0.7062600321027287,
"grad_norm": 28.916038513183594,
"learning_rate": 9.778310118250397e-06,
"loss": 0.4682,
"mean_token_accuracy": 0.8382920622825623,
"num_tokens": 1921692.0,
"step": 220
},
{
"entropy": 1.7564342617988586,
"epoch": 0.709470304975923,
"grad_norm": 3.821200370788574,
"learning_rate": 9.77517829291108e-06,
"loss": 0.4967,
"mean_token_accuracy": 0.828892856836319,
"num_tokens": 1930053.0,
"step": 221
},
{
"entropy": 1.6086868047714233,
"epoch": 0.7126805778491172,
"grad_norm": 4.011463642120361,
"learning_rate": 9.772025009397538e-06,
"loss": 0.5361,
"mean_token_accuracy": 0.8106788098812103,
"num_tokens": 1938556.0,
"step": 222
},
{
"entropy": 1.7607861757278442,
"epoch": 0.7158908507223114,
"grad_norm": 3.9651222229003906,
"learning_rate": 9.768850281879651e-06,
"loss": 0.5446,
"mean_token_accuracy": 0.8144137263298035,
"num_tokens": 1946709.0,
"step": 223
},
{
"entropy": 1.8207188248634338,
"epoch": 0.7191011235955056,
"grad_norm": 17.640336990356445,
"learning_rate": 9.765654124623664e-06,
"loss": 0.5415,
"mean_token_accuracy": 0.8262408673763275,
"num_tokens": 1954532.0,
"step": 224
},
{
"entropy": 1.9907708168029785,
"epoch": 0.7223113964686998,
"grad_norm": 16.587417602539062,
"learning_rate": 9.762436551992117e-06,
"loss": 0.5316,
"mean_token_accuracy": 0.8141103982925415,
"num_tokens": 1962607.0,
"step": 225
},
{
"entropy": 1.771414875984192,
"epoch": 0.7255216693418941,
"grad_norm": 5.63399076461792,
"learning_rate": 9.759197578443787e-06,
"loss": 0.5473,
"mean_token_accuracy": 0.8127427995204926,
"num_tokens": 1971317.0,
"step": 226
},
{
"entropy": 1.8039385080337524,
"epoch": 0.7287319422150883,
"grad_norm": 9.91141128540039,
"learning_rate": 9.755937218533622e-06,
"loss": 0.5521,
"mean_token_accuracy": 0.8093193471431732,
"num_tokens": 1979337.0,
"step": 227
},
{
"entropy": 1.8303923606872559,
"epoch": 0.7319422150882825,
"grad_norm": 3.973386526107788,
"learning_rate": 9.752655486912666e-06,
"loss": 0.5292,
"mean_token_accuracy": 0.8189589083194733,
"num_tokens": 1989078.0,
"step": 228
},
{
"entropy": 1.8203710913658142,
"epoch": 0.7351524879614767,
"grad_norm": 4.977687358856201,
"learning_rate": 9.74935239832801e-06,
"loss": 0.5419,
"mean_token_accuracy": 0.8108758628368378,
"num_tokens": 1997064.0,
"step": 229
},
{
"entropy": 1.7490638494491577,
"epoch": 0.7383627608346709,
"grad_norm": 3.8043088912963867,
"learning_rate": 9.746027967622709e-06,
"loss": 0.5665,
"mean_token_accuracy": 0.807654470205307,
"num_tokens": 2005949.0,
"step": 230
},
{
"entropy": 2.048327326774597,
"epoch": 0.7415730337078652,
"grad_norm": 4.386172771453857,
"learning_rate": 9.742682209735727e-06,
"loss": 0.507,
"mean_token_accuracy": 0.8336832225322723,
"num_tokens": 2014059.0,
"step": 231
},
{
"entropy": 1.7495536804199219,
"epoch": 0.7447833065810594,
"grad_norm": 4.291626453399658,
"learning_rate": 9.739315139701868e-06,
"loss": 0.5388,
"mean_token_accuracy": 0.8060157299041748,
"num_tokens": 2023566.0,
"step": 232
},
{
"entropy": 1.949560523033142,
"epoch": 0.7479935794542536,
"grad_norm": 3.818927764892578,
"learning_rate": 9.735926772651703e-06,
"loss": 0.5368,
"mean_token_accuracy": 0.8259284198284149,
"num_tokens": 2033999.0,
"step": 233
},
{
"entropy": 1.6942541599273682,
"epoch": 0.7512038523274478,
"grad_norm": 4.930175304412842,
"learning_rate": 9.732517123811502e-06,
"loss": 0.4826,
"mean_token_accuracy": 0.8296216726303101,
"num_tokens": 2043181.0,
"step": 234
},
{
"entropy": 1.7138220071792603,
"epoch": 0.7544141252006421,
"grad_norm": 12.674991607666016,
"learning_rate": 9.729086208503174e-06,
"loss": 0.6105,
"mean_token_accuracy": 0.8042653799057007,
"num_tokens": 2051883.0,
"step": 235
},
{
"entropy": 1.7605062127113342,
"epoch": 0.7576243980738363,
"grad_norm": 4.540236473083496,
"learning_rate": 9.725634042144192e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8065124750137329,
"num_tokens": 2060315.0,
"step": 236
},
{
"entropy": 1.7666863799095154,
"epoch": 0.7608346709470305,
"grad_norm": 8.36759090423584,
"learning_rate": 9.722160640247523e-06,
"loss": 0.6014,
"mean_token_accuracy": 0.8097147047519684,
"num_tokens": 2070260.0,
"step": 237
},
{
"entropy": 1.674265742301941,
"epoch": 0.7640449438202247,
"grad_norm": 3.945760726928711,
"learning_rate": 9.71866601842156e-06,
"loss": 0.503,
"mean_token_accuracy": 0.8159506320953369,
"num_tokens": 2079997.0,
"step": 238
},
{
"entropy": 1.7028595209121704,
"epoch": 0.7672552166934189,
"grad_norm": 4.027834892272949,
"learning_rate": 9.715150192370054e-06,
"loss": 0.5632,
"mean_token_accuracy": 0.8053656220436096,
"num_tokens": 2088508.0,
"step": 239
},
{
"entropy": 1.7460883855819702,
"epoch": 0.7704654895666132,
"grad_norm": 5.3727312088012695,
"learning_rate": 9.71161317789204e-06,
"loss": 0.536,
"mean_token_accuracy": 0.812755823135376,
"num_tokens": 2097683.0,
"step": 240
},
{
"entropy": 1.7760279774665833,
"epoch": 0.7736757624398074,
"grad_norm": 11.409591674804688,
"learning_rate": 9.708054990881763e-06,
"loss": 0.5327,
"mean_token_accuracy": 0.8147374391555786,
"num_tokens": 2106963.0,
"step": 241
},
{
"entropy": 1.7303342819213867,
"epoch": 0.7768860353130016,
"grad_norm": 3.939631462097168,
"learning_rate": 9.70447564732862e-06,
"loss": 0.5212,
"mean_token_accuracy": 0.8153030872344971,
"num_tokens": 2116407.0,
"step": 242
},
{
"entropy": 1.629269540309906,
"epoch": 0.7800963081861958,
"grad_norm": 6.595477104187012,
"learning_rate": 9.700875163317072e-06,
"loss": 0.5316,
"mean_token_accuracy": 0.8108282685279846,
"num_tokens": 2124321.0,
"step": 243
},
{
"entropy": 1.7000606060028076,
"epoch": 0.78330658105939,
"grad_norm": 4.974279403686523,
"learning_rate": 9.69725355502658e-06,
"loss": 0.532,
"mean_token_accuracy": 0.8205364644527435,
"num_tokens": 2132747.0,
"step": 244
},
{
"entropy": 1.7894864082336426,
"epoch": 0.7865168539325843,
"grad_norm": 4.206717491149902,
"learning_rate": 9.693610838731532e-06,
"loss": 0.5707,
"mean_token_accuracy": 0.8029159605503082,
"num_tokens": 2142048.0,
"step": 245
},
{
"entropy": 1.8813952803611755,
"epoch": 0.7897271268057785,
"grad_norm": 3.395116090774536,
"learning_rate": 9.689947030801168e-06,
"loss": 0.5403,
"mean_token_accuracy": 0.8176522552967072,
"num_tokens": 2151683.0,
"step": 246
},
{
"entropy": 1.795137882232666,
"epoch": 0.7929373996789727,
"grad_norm": 4.789092540740967,
"learning_rate": 9.686262147699507e-06,
"loss": 0.5635,
"mean_token_accuracy": 0.8097488880157471,
"num_tokens": 2160162.0,
"step": 247
},
{
"entropy": 1.7523850202560425,
"epoch": 0.7961476725521669,
"grad_norm": 3.9210240840911865,
"learning_rate": 9.682556205985274e-06,
"loss": 0.5662,
"mean_token_accuracy": 0.802306056022644,
"num_tokens": 2168445.0,
"step": 248
},
{
"entropy": 2.022180676460266,
"epoch": 0.7993579454253612,
"grad_norm": 4.333629608154297,
"learning_rate": 9.678829222311827e-06,
"loss": 0.5617,
"mean_token_accuracy": 0.8150058388710022,
"num_tokens": 2177350.0,
"step": 249
},
{
"entropy": 1.7913747429847717,
"epoch": 0.8025682182985554,
"grad_norm": 6.312867641448975,
"learning_rate": 9.675081213427076e-06,
"loss": 0.5039,
"mean_token_accuracy": 0.8336711227893829,
"num_tokens": 2185344.0,
"step": 250
},
{
"entropy": 1.955717146396637,
"epoch": 0.8057784911717496,
"grad_norm": 4.374723434448242,
"learning_rate": 9.671312196173413e-06,
"loss": 0.5752,
"mean_token_accuracy": 0.8053406774997711,
"num_tokens": 2195099.0,
"step": 251
},
{
"entropy": 1.8786735534667969,
"epoch": 0.8089887640449438,
"grad_norm": 6.004689693450928,
"learning_rate": 9.667522187487635e-06,
"loss": 0.5429,
"mean_token_accuracy": 0.8088372349739075,
"num_tokens": 2203830.0,
"step": 252
},
{
"entropy": 1.984777808189392,
"epoch": 0.812199036918138,
"grad_norm": 5.8004374504089355,
"learning_rate": 9.663711204400872e-06,
"loss": 0.5701,
"mean_token_accuracy": 0.8084724247455597,
"num_tokens": 2212939.0,
"step": 253
},
{
"entropy": 1.973739504814148,
"epoch": 0.8154093097913323,
"grad_norm": 12.577515602111816,
"learning_rate": 9.659879264038499e-06,
"loss": 0.4932,
"mean_token_accuracy": 0.8239920437335968,
"num_tokens": 2221298.0,
"step": 254
},
{
"entropy": 1.9998502731323242,
"epoch": 0.8186195826645265,
"grad_norm": 5.38623046875,
"learning_rate": 9.656026383620076e-06,
"loss": 0.5411,
"mean_token_accuracy": 0.7981366217136383,
"num_tokens": 2230917.0,
"step": 255
},
{
"entropy": 1.9804831147193909,
"epoch": 0.8218298555377207,
"grad_norm": 11.439884185791016,
"learning_rate": 9.65215258045925e-06,
"loss": 0.5161,
"mean_token_accuracy": 0.8303695321083069,
"num_tokens": 2238718.0,
"step": 256
},
{
"entropy": 1.8764265179634094,
"epoch": 0.8250401284109149,
"grad_norm": 6.7170538902282715,
"learning_rate": 9.6482578719637e-06,
"loss": 0.5284,
"mean_token_accuracy": 0.8177531659603119,
"num_tokens": 2247613.0,
"step": 257
},
{
"entropy": 1.9280529618263245,
"epoch": 0.8282504012841091,
"grad_norm": 4.502701759338379,
"learning_rate": 9.644342275635036e-06,
"loss": 0.514,
"mean_token_accuracy": 0.8217505216598511,
"num_tokens": 2255285.0,
"step": 258
},
{
"entropy": 1.780007779598236,
"epoch": 0.8314606741573034,
"grad_norm": 12.708043098449707,
"learning_rate": 9.640405809068743e-06,
"loss": 0.5769,
"mean_token_accuracy": 0.8037888705730438,
"num_tokens": 2263131.0,
"step": 259
},
{
"entropy": 1.878428339958191,
"epoch": 0.8346709470304976,
"grad_norm": 5.675548553466797,
"learning_rate": 9.636448489954077e-06,
"loss": 0.5525,
"mean_token_accuracy": 0.8077818155288696,
"num_tokens": 2271596.0,
"step": 260
},
{
"entropy": 1.684307873249054,
"epoch": 0.8378812199036918,
"grad_norm": 7.081056118011475,
"learning_rate": 9.632470336074009e-06,
"loss": 0.5188,
"mean_token_accuracy": 0.8152169585227966,
"num_tokens": 2280005.0,
"step": 261
},
{
"entropy": 1.9224953651428223,
"epoch": 0.841091492776886,
"grad_norm": 4.184887886047363,
"learning_rate": 9.628471365305134e-06,
"loss": 0.6043,
"mean_token_accuracy": 0.7943918704986572,
"num_tokens": 2289825.0,
"step": 262
},
{
"entropy": 1.864977478981018,
"epoch": 0.8443017656500803,
"grad_norm": 36.45917510986328,
"learning_rate": 9.624451595617588e-06,
"loss": 0.5088,
"mean_token_accuracy": 0.8221506774425507,
"num_tokens": 2298079.0,
"step": 263
},
{
"entropy": 1.9027855396270752,
"epoch": 0.8475120385232745,
"grad_norm": 5.5275187492370605,
"learning_rate": 9.620411045074972e-06,
"loss": 0.5644,
"mean_token_accuracy": 0.8087839186191559,
"num_tokens": 2308579.0,
"step": 264
},
{
"entropy": 1.8372774124145508,
"epoch": 0.8507223113964687,
"grad_norm": 6.006095886230469,
"learning_rate": 9.616349731834271e-06,
"loss": 0.5356,
"mean_token_accuracy": 0.816933810710907,
"num_tokens": 2318463.0,
"step": 265
},
{
"entropy": 1.573245882987976,
"epoch": 0.8539325842696629,
"grad_norm": 4.500341415405273,
"learning_rate": 9.612267674145772e-06,
"loss": 0.5938,
"mean_token_accuracy": 0.8043918609619141,
"num_tokens": 2327089.0,
"step": 266
},
{
"entropy": 1.8638948798179626,
"epoch": 0.8571428571428571,
"grad_norm": 5.24827241897583,
"learning_rate": 9.608164890352977e-06,
"loss": 0.5005,
"mean_token_accuracy": 0.8218279480934143,
"num_tokens": 2335135.0,
"step": 267
},
{
"entropy": 1.8430355787277222,
"epoch": 0.8603531300160514,
"grad_norm": 4.886566638946533,
"learning_rate": 9.604041398892528e-06,
"loss": 0.5614,
"mean_token_accuracy": 0.8217492401599884,
"num_tokens": 2343916.0,
"step": 268
},
{
"entropy": 1.8909154534339905,
"epoch": 0.8635634028892456,
"grad_norm": 7.561775207519531,
"learning_rate": 9.599897218294122e-06,
"loss": 0.5394,
"mean_token_accuracy": 0.8251607716083527,
"num_tokens": 2351407.0,
"step": 269
},
{
"entropy": 1.7426018118858337,
"epoch": 0.8667736757624398,
"grad_norm": 4.487576484680176,
"learning_rate": 9.595732367180422e-06,
"loss": 0.5097,
"mean_token_accuracy": 0.8088293373584747,
"num_tokens": 2359413.0,
"step": 270
},
{
"entropy": 1.844031572341919,
"epoch": 0.869983948635634,
"grad_norm": 4.225221157073975,
"learning_rate": 9.591546864266983e-06,
"loss": 0.5153,
"mean_token_accuracy": 0.8313940167427063,
"num_tokens": 2367432.0,
"step": 271
},
{
"entropy": 1.8397764563560486,
"epoch": 0.8731942215088283,
"grad_norm": 4.163444519042969,
"learning_rate": 9.58734072836216e-06,
"loss": 0.5367,
"mean_token_accuracy": 0.8088929355144501,
"num_tokens": 2376009.0,
"step": 272
},
{
"entropy": 1.9551055431365967,
"epoch": 0.8764044943820225,
"grad_norm": 188.6858367919922,
"learning_rate": 9.583113978367026e-06,
"loss": 0.4763,
"mean_token_accuracy": 0.8416739702224731,
"num_tokens": 2383876.0,
"step": 273
},
{
"entropy": 1.733047068119049,
"epoch": 0.8796147672552167,
"grad_norm": 7.61765718460083,
"learning_rate": 9.578866633275289e-06,
"loss": 0.5334,
"mean_token_accuracy": 0.8125023543834686,
"num_tokens": 2392551.0,
"step": 274
},
{
"entropy": 1.7080771327018738,
"epoch": 0.8828250401284109,
"grad_norm": 4.636716365814209,
"learning_rate": 9.574598712173202e-06,
"loss": 0.558,
"mean_token_accuracy": 0.8121279180049896,
"num_tokens": 2401060.0,
"step": 275
},
{
"entropy": 1.6740916967391968,
"epoch": 0.8860353130016051,
"grad_norm": 4.136287689208984,
"learning_rate": 9.570310234239483e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8157520890235901,
"num_tokens": 2409417.0,
"step": 276
},
{
"entropy": 1.7176891565322876,
"epoch": 0.8892455858747994,
"grad_norm": 7.969487190246582,
"learning_rate": 9.56600121874523e-06,
"loss": 0.5402,
"mean_token_accuracy": 0.8149141073226929,
"num_tokens": 2418325.0,
"step": 277
},
{
"entropy": 1.7098489999771118,
"epoch": 0.8924558587479936,
"grad_norm": 5.37632942199707,
"learning_rate": 9.561671685053818e-06,
"loss": 0.5738,
"mean_token_accuracy": 0.8035610914230347,
"num_tokens": 2427522.0,
"step": 278
},
{
"entropy": 1.633002758026123,
"epoch": 0.8956661316211878,
"grad_norm": 4.541212558746338,
"learning_rate": 9.557321652620839e-06,
"loss": 0.5547,
"mean_token_accuracy": 0.8119913339614868,
"num_tokens": 2437685.0,
"step": 279
},
{
"entropy": 1.6630198955535889,
"epoch": 0.898876404494382,
"grad_norm": 4.517637729644775,
"learning_rate": 9.55295114099399e-06,
"loss": 0.572,
"mean_token_accuracy": 0.8125506043434143,
"num_tokens": 2446482.0,
"step": 280
},
{
"entropy": 1.938102126121521,
"epoch": 0.9020866773675762,
"grad_norm": 5.673907279968262,
"learning_rate": 9.548560169812997e-06,
"loss": 0.5425,
"mean_token_accuracy": 0.8131826519966125,
"num_tokens": 2455118.0,
"step": 281
},
{
"entropy": 1.6700218319892883,
"epoch": 0.9052969502407705,
"grad_norm": 9.876907348632812,
"learning_rate": 9.544148758809528e-06,
"loss": 0.5354,
"mean_token_accuracy": 0.8099010288715363,
"num_tokens": 2464518.0,
"step": 282
},
{
"entropy": 1.8363152146339417,
"epoch": 0.9085072231139647,
"grad_norm": 13.005200386047363,
"learning_rate": 9.539716927807102e-06,
"loss": 0.4792,
"mean_token_accuracy": 0.824327141046524,
"num_tokens": 2473454.0,
"step": 283
},
{
"entropy": 1.775430977344513,
"epoch": 0.9117174959871589,
"grad_norm": 5.766530513763428,
"learning_rate": 9.535264696720993e-06,
"loss": 0.55,
"mean_token_accuracy": 0.8180139362812042,
"num_tokens": 2481651.0,
"step": 284
},
{
"entropy": 1.669650673866272,
"epoch": 0.9149277688603531,
"grad_norm": 6.167349815368652,
"learning_rate": 9.530792085558151e-06,
"loss": 0.4869,
"mean_token_accuracy": 0.8234744071960449,
"num_tokens": 2489837.0,
"step": 285
},
{
"entropy": 1.6679094433784485,
"epoch": 0.9181380417335474,
"grad_norm": 5.257747650146484,
"learning_rate": 9.526299114417108e-06,
"loss": 0.5862,
"mean_token_accuracy": 0.8026742041110992,
"num_tokens": 2498698.0,
"step": 286
},
{
"entropy": 1.8049131035804749,
"epoch": 0.9213483146067416,
"grad_norm": 6.02241325378418,
"learning_rate": 9.521785803487888e-06,
"loss": 0.49,
"mean_token_accuracy": 0.8336274325847626,
"num_tokens": 2506100.0,
"step": 287
},
{
"entropy": 1.7622082233428955,
"epoch": 0.9245585874799358,
"grad_norm": 5.4801764488220215,
"learning_rate": 9.517252173051912e-06,
"loss": 0.5652,
"mean_token_accuracy": 0.8085108995437622,
"num_tokens": 2513985.0,
"step": 288
},
{
"entropy": 1.7733423709869385,
"epoch": 0.92776886035313,
"grad_norm": 4.067267894744873,
"learning_rate": 9.512698243481914e-06,
"loss": 0.5772,
"mean_token_accuracy": 0.8086209297180176,
"num_tokens": 2522092.0,
"step": 289
},
{
"entropy": 1.9992202520370483,
"epoch": 0.9309791332263242,
"grad_norm": 3.6951725482940674,
"learning_rate": 9.508124035241843e-06,
"loss": 0.5263,
"mean_token_accuracy": 0.8235167562961578,
"num_tokens": 2531345.0,
"step": 290
},
{
"entropy": 1.7889231443405151,
"epoch": 0.9341894060995185,
"grad_norm": 3.931403636932373,
"learning_rate": 9.50352956888678e-06,
"loss": 0.5272,
"mean_token_accuracy": 0.8177125155925751,
"num_tokens": 2539270.0,
"step": 291
},
{
"entropy": 1.6634029746055603,
"epoch": 0.9373996789727127,
"grad_norm": 8.0676851272583,
"learning_rate": 9.498914865062831e-06,
"loss": 0.5565,
"mean_token_accuracy": 0.8150179386138916,
"num_tokens": 2547863.0,
"step": 292
},
{
"entropy": 1.5966331362724304,
"epoch": 0.9406099518459069,
"grad_norm": 3.3051044940948486,
"learning_rate": 9.49427994450705e-06,
"loss": 0.5098,
"mean_token_accuracy": 0.819977194070816,
"num_tokens": 2556039.0,
"step": 293
},
{
"entropy": 1.873258113861084,
"epoch": 0.9438202247191011,
"grad_norm": 7.652100563049316,
"learning_rate": 9.489624828047336e-06,
"loss": 0.5398,
"mean_token_accuracy": 0.8114376664161682,
"num_tokens": 2564744.0,
"step": 294
},
{
"entropy": 1.9302705526351929,
"epoch": 0.9470304975922953,
"grad_norm": 4.018932342529297,
"learning_rate": 9.484949536602343e-06,
"loss": 0.5363,
"mean_token_accuracy": 0.8164487481117249,
"num_tokens": 2573899.0,
"step": 295
},
{
"entropy": 1.5373682379722595,
"epoch": 0.9502407704654896,
"grad_norm": 4.182193756103516,
"learning_rate": 9.480254091181385e-06,
"loss": 0.585,
"mean_token_accuracy": 0.7996087670326233,
"num_tokens": 2582879.0,
"step": 296
},
{
"entropy": 1.8067876696586609,
"epoch": 0.9534510433386838,
"grad_norm": 4.5391950607299805,
"learning_rate": 9.47553851288434e-06,
"loss": 0.4947,
"mean_token_accuracy": 0.8318784534931183,
"num_tokens": 2591802.0,
"step": 297
},
{
"entropy": 1.8316718339920044,
"epoch": 0.956661316211878,
"grad_norm": 5.184760570526123,
"learning_rate": 9.470802822901558e-06,
"loss": 0.5586,
"mean_token_accuracy": 0.8131641149520874,
"num_tokens": 2600207.0,
"step": 298
},
{
"entropy": 1.859476923942566,
"epoch": 0.9598715890850722,
"grad_norm": 5.1902852058410645,
"learning_rate": 9.466047042513767e-06,
"loss": 0.5501,
"mean_token_accuracy": 0.8048664629459381,
"num_tokens": 2608420.0,
"step": 299
},
{
"entropy": 1.8378886580467224,
"epoch": 0.9630818619582665,
"grad_norm": 4.642928123474121,
"learning_rate": 9.461271193091971e-06,
"loss": 0.6043,
"mean_token_accuracy": 0.7992973029613495,
"num_tokens": 2617800.0,
"step": 300
},
{
"entropy": 1.8681280612945557,
"epoch": 0.9662921348314607,
"grad_norm": 4.008616924285889,
"learning_rate": 9.45647529609736e-06,
"loss": 0.5605,
"mean_token_accuracy": 0.8001963198184967,
"num_tokens": 2627976.0,
"step": 301
},
{
"entropy": 1.6626688241958618,
"epoch": 0.9695024077046549,
"grad_norm": 13.212479591369629,
"learning_rate": 9.451659373081214e-06,
"loss": 0.5672,
"mean_token_accuracy": 0.8115538358688354,
"num_tokens": 2636906.0,
"step": 302
},
{
"entropy": 1.8471931219100952,
"epoch": 0.9727126805778491,
"grad_norm": 4.148143291473389,
"learning_rate": 9.4468234456848e-06,
"loss": 0.5647,
"mean_token_accuracy": 0.8091489970684052,
"num_tokens": 2645774.0,
"step": 303
},
{
"entropy": 1.7227018475532532,
"epoch": 0.9759229534510433,
"grad_norm": 3.8793492317199707,
"learning_rate": 9.44196753563928e-06,
"loss": 0.5244,
"mean_token_accuracy": 0.8176108598709106,
"num_tokens": 2654629.0,
"step": 304
},
{
"entropy": 1.9042375683784485,
"epoch": 0.9791332263242376,
"grad_norm": 8.690786361694336,
"learning_rate": 9.437091664765611e-06,
"loss": 0.548,
"mean_token_accuracy": 0.8241380155086517,
"num_tokens": 2663716.0,
"step": 305
},
{
"entropy": 1.7460474371910095,
"epoch": 0.9823434991974318,
"grad_norm": 9.691555976867676,
"learning_rate": 9.43219585497445e-06,
"loss": 0.5014,
"mean_token_accuracy": 0.8227463662624359,
"num_tokens": 2672778.0,
"step": 306
},
{
"entropy": 1.878986418247223,
"epoch": 0.985553772070626,
"grad_norm": 4.636747360229492,
"learning_rate": 9.427280128266049e-06,
"loss": 0.5629,
"mean_token_accuracy": 0.8111841678619385,
"num_tokens": 2681537.0,
"step": 307
},
{
"entropy": 1.9950389862060547,
"epoch": 0.9887640449438202,
"grad_norm": 12.368703842163086,
"learning_rate": 9.422344506730168e-06,
"loss": 0.5101,
"mean_token_accuracy": 0.822578638792038,
"num_tokens": 2689579.0,
"step": 308
},
{
"entropy": 1.7799192070960999,
"epoch": 0.9919743178170144,
"grad_norm": 7.675198554992676,
"learning_rate": 9.41738901254596e-06,
"loss": 0.5046,
"mean_token_accuracy": 0.8357812464237213,
"num_tokens": 2697211.0,
"step": 309
},
{
"entropy": 1.8566884994506836,
"epoch": 0.9951845906902087,
"grad_norm": 31.68392562866211,
"learning_rate": 9.412413667981884e-06,
"loss": 0.5595,
"mean_token_accuracy": 0.8127318024635315,
"num_tokens": 2707794.0,
"step": 310
},
{
"entropy": 1.7565560340881348,
"epoch": 0.9983948635634029,
"grad_norm": 5.741061210632324,
"learning_rate": 9.4074184953956e-06,
"loss": 0.6057,
"mean_token_accuracy": 0.8059540390968323,
"num_tokens": 2716378.0,
"step": 311
},
{
"entropy": 1.9112659692764282,
"epoch": 1.0,
"grad_norm": 8.97977066040039,
"learning_rate": 9.402403517233867e-06,
"loss": 0.5477,
"mean_token_accuracy": 0.8098132610321045,
"num_tokens": 2721142.0,
"step": 312
},
{
"entropy": 1.8866798877716064,
"epoch": 1.0032102728731942,
"grad_norm": 3.0067737102508545,
"learning_rate": 9.397368756032445e-06,
"loss": 0.2756,
"mean_token_accuracy": 0.8953090310096741,
"num_tokens": 2729237.0,
"step": 313
},
{
"entropy": 1.5604918003082275,
"epoch": 1.0064205457463884,
"grad_norm": 2.753265380859375,
"learning_rate": 9.392314234415999e-06,
"loss": 0.3299,
"mean_token_accuracy": 0.8884185254573822,
"num_tokens": 2738049.0,
"step": 314
},
{
"entropy": 1.6893478035926819,
"epoch": 1.0096308186195826,
"grad_norm": 5.572351932525635,
"learning_rate": 9.38723997509798e-06,
"loss": 0.3426,
"mean_token_accuracy": 0.8786461353302002,
"num_tokens": 2747474.0,
"step": 315
},
{
"entropy": 1.7195146679878235,
"epoch": 1.0128410914927768,
"grad_norm": 6.990971088409424,
"learning_rate": 9.38214600088054e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.8795572221279144,
"num_tokens": 2755445.0,
"step": 316
},
{
"entropy": 1.5811264514923096,
"epoch": 1.0160513643659712,
"grad_norm": 3.330709218978882,
"learning_rate": 9.37703233465443e-06,
"loss": 0.3017,
"mean_token_accuracy": 0.8671407401561737,
"num_tokens": 2764469.0,
"step": 317
},
{
"entropy": 1.5029963254928589,
"epoch": 1.0192616372391654,
"grad_norm": 9.694075584411621,
"learning_rate": 9.371898999398876e-06,
"loss": 0.3368,
"mean_token_accuracy": 0.884416937828064,
"num_tokens": 2772747.0,
"step": 318
},
{
"entropy": 1.77669358253479,
"epoch": 1.0224719101123596,
"grad_norm": 5.008193492889404,
"learning_rate": 9.366746018181503e-06,
"loss": 0.3311,
"mean_token_accuracy": 0.8841279745101929,
"num_tokens": 2782046.0,
"step": 319
},
{
"entropy": 1.5491546988487244,
"epoch": 1.0256821829855538,
"grad_norm": 3.3377037048339844,
"learning_rate": 9.361573414158215e-06,
"loss": 0.2557,
"mean_token_accuracy": 0.9022665619850159,
"num_tokens": 2790262.0,
"step": 320
},
{
"entropy": 1.423735797405243,
"epoch": 1.028892455858748,
"grad_norm": 4.702206611633301,
"learning_rate": 9.356381210573092e-06,
"loss": 0.3956,
"mean_token_accuracy": 0.8604268729686737,
"num_tokens": 2799962.0,
"step": 321
},
{
"entropy": 1.642482876777649,
"epoch": 1.0321027287319422,
"grad_norm": 4.782188892364502,
"learning_rate": 9.351169430758293e-06,
"loss": 0.226,
"mean_token_accuracy": 0.9207814335823059,
"num_tokens": 2808389.0,
"step": 322
},
{
"entropy": 1.5962989330291748,
"epoch": 1.0353130016051364,
"grad_norm": 4.449636936187744,
"learning_rate": 9.345938098133946e-06,
"loss": 0.316,
"mean_token_accuracy": 0.8767516911029816,
"num_tokens": 2817170.0,
"step": 323
},
{
"entropy": 1.6813729405403137,
"epoch": 1.0385232744783306,
"grad_norm": 3.268564462661743,
"learning_rate": 9.340687236208037e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.8798855543136597,
"num_tokens": 2826388.0,
"step": 324
},
{
"entropy": 1.4963070154190063,
"epoch": 1.0417335473515248,
"grad_norm": 3.9683163166046143,
"learning_rate": 9.33541686857632e-06,
"loss": 0.3418,
"mean_token_accuracy": 0.8698484897613525,
"num_tokens": 2836296.0,
"step": 325
},
{
"entropy": 1.4952961206436157,
"epoch": 1.0449438202247192,
"grad_norm": 4.533268928527832,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3369,
"mean_token_accuracy": 0.8843473196029663,
"num_tokens": 2845829.0,
"step": 326
},
{
"entropy": 1.4619093537330627,
"epoch": 1.0481540930979134,
"grad_norm": 6.075706481933594,
"learning_rate": 9.324817711016609e-06,
"loss": 0.3602,
"mean_token_accuracy": 0.872114509344101,
"num_tokens": 2855278.0,
"step": 327
},
{
"entropy": 1.3780204057693481,
"epoch": 1.0513643659711076,
"grad_norm": 3.4777228832244873,
"learning_rate": 9.31948896871795e-06,
"loss": 0.3687,
"mean_token_accuracy": 0.874431699514389,
"num_tokens": 2863462.0,
"step": 328
},
{
"entropy": 1.508378028869629,
"epoch": 1.0545746388443018,
"grad_norm": 3.8630146980285645,
"learning_rate": 9.31414081597194e-06,
"loss": 0.2862,
"mean_token_accuracy": 0.8832896053791046,
"num_tokens": 2873221.0,
"step": 329
},
{
"entropy": 1.4881436228752136,
"epoch": 1.057784911717496,
"grad_norm": 3.041048526763916,
"learning_rate": 9.30877327681152e-06,
"loss": 0.3019,
"mean_token_accuracy": 0.880987137556076,
"num_tokens": 2882010.0,
"step": 330
},
{
"entropy": 1.5098777413368225,
"epoch": 1.0609951845906902,
"grad_norm": 10.562122344970703,
"learning_rate": 9.303386375356752e-06,
"loss": 0.2991,
"mean_token_accuracy": 0.8782722651958466,
"num_tokens": 2891471.0,
"step": 331
},
{
"entropy": 1.3975720405578613,
"epoch": 1.0642054574638844,
"grad_norm": 5.349188804626465,
"learning_rate": 9.297980135814706e-06,
"loss": 0.3329,
"mean_token_accuracy": 0.8701120913028717,
"num_tokens": 2900424.0,
"step": 332
},
{
"entropy": 1.4648704528808594,
"epoch": 1.0674157303370786,
"grad_norm": 4.09747838973999,
"learning_rate": 9.292554582479349e-06,
"loss": 0.3001,
"mean_token_accuracy": 0.8921301364898682,
"num_tokens": 2908440.0,
"step": 333
},
{
"entropy": 1.433031976222992,
"epoch": 1.0706260032102728,
"grad_norm": 5.114440441131592,
"learning_rate": 9.28710973973144e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.886705756187439,
"num_tokens": 2917238.0,
"step": 334
},
{
"entropy": 1.5841457843780518,
"epoch": 1.0738362760834672,
"grad_norm": 15.462648391723633,
"learning_rate": 9.281645632038417e-06,
"loss": 0.2744,
"mean_token_accuracy": 0.8975834846496582,
"num_tokens": 2925815.0,
"step": 335
},
{
"entropy": 1.5319228768348694,
"epoch": 1.0770465489566614,
"grad_norm": 4.613631725311279,
"learning_rate": 9.276162283954293e-06,
"loss": 0.3216,
"mean_token_accuracy": 0.8822270631790161,
"num_tokens": 2933894.0,
"step": 336
},
{
"entropy": 1.6462609767913818,
"epoch": 1.0802568218298556,
"grad_norm": 3.268683910369873,
"learning_rate": 9.270659720119533e-06,
"loss": 0.3096,
"mean_token_accuracy": 0.892312079668045,
"num_tokens": 2941880.0,
"step": 337
},
{
"entropy": 1.439853310585022,
"epoch": 1.0834670947030498,
"grad_norm": 4.163455009460449,
"learning_rate": 9.265137965260962e-06,
"loss": 0.3484,
"mean_token_accuracy": 0.8765892088413239,
"num_tokens": 2950411.0,
"step": 338
},
{
"entropy": 1.4200059175491333,
"epoch": 1.086677367576244,
"grad_norm": 3.743356466293335,
"learning_rate": 9.259597044191635e-06,
"loss": 0.3198,
"mean_token_accuracy": 0.8831844925880432,
"num_tokens": 2959444.0,
"step": 339
},
{
"entropy": 1.6156633496284485,
"epoch": 1.0898876404494382,
"grad_norm": 4.8488945960998535,
"learning_rate": 9.254036981810741e-06,
"loss": 0.2395,
"mean_token_accuracy": 0.8998830020427704,
"num_tokens": 2967850.0,
"step": 340
},
{
"entropy": 1.6063914895057678,
"epoch": 1.0930979133226324,
"grad_norm": 3.8594541549682617,
"learning_rate": 9.248457803103476e-06,
"loss": 0.2982,
"mean_token_accuracy": 0.8650166988372803,
"num_tokens": 2977216.0,
"step": 341
},
{
"entropy": 1.4595788717269897,
"epoch": 1.0963081861958266,
"grad_norm": 3.777676582336426,
"learning_rate": 9.242859533140947e-06,
"loss": 0.3471,
"mean_token_accuracy": 0.875117301940918,
"num_tokens": 2985963.0,
"step": 342
},
{
"entropy": 1.3544176816940308,
"epoch": 1.0995184590690208,
"grad_norm": 7.397167205810547,
"learning_rate": 9.237242197080045e-06,
"loss": 0.3612,
"mean_token_accuracy": 0.8471320867538452,
"num_tokens": 2995982.0,
"step": 343
},
{
"entropy": 1.4412473440170288,
"epoch": 1.102728731942215,
"grad_norm": 6.994678020477295,
"learning_rate": 9.231605820163343e-06,
"loss": 0.2973,
"mean_token_accuracy": 0.8883349299430847,
"num_tokens": 3004311.0,
"step": 344
},
{
"entropy": 1.2334765791893005,
"epoch": 1.1059390048154094,
"grad_norm": 11.067462921142578,
"learning_rate": 9.225950427718974e-06,
"loss": 0.3466,
"mean_token_accuracy": 0.8650515079498291,
"num_tokens": 3013691.0,
"step": 345
},
{
"entropy": 1.3161783814430237,
"epoch": 1.1091492776886036,
"grad_norm": 4.6484832763671875,
"learning_rate": 9.220276045160524e-06,
"loss": 0.4264,
"mean_token_accuracy": 0.8466585278511047,
"num_tokens": 3024373.0,
"step": 346
},
{
"entropy": 1.307463526725769,
"epoch": 1.1123595505617978,
"grad_norm": 6.318870544433594,
"learning_rate": 9.21458269798691e-06,
"loss": 0.298,
"mean_token_accuracy": 0.8816176652908325,
"num_tokens": 3033068.0,
"step": 347
},
{
"entropy": 1.4655287265777588,
"epoch": 1.115569823434992,
"grad_norm": 3.1357920169830322,
"learning_rate": 9.208870411782276e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.8845996260643005,
"num_tokens": 3043131.0,
"step": 348
},
{
"entropy": 1.343300223350525,
"epoch": 1.1187800963081862,
"grad_norm": 6.207579612731934,
"learning_rate": 9.203139212215868e-06,
"loss": 0.3468,
"mean_token_accuracy": 0.8744291663169861,
"num_tokens": 3051014.0,
"step": 349
},
{
"entropy": 1.7229687571525574,
"epoch": 1.1219903691813804,
"grad_norm": 5.9254536628723145,
"learning_rate": 9.197389125041925e-06,
"loss": 0.3166,
"mean_token_accuracy": 0.8954213857650757,
"num_tokens": 3059323.0,
"step": 350
},
{
"entropy": 1.3070868849754333,
"epoch": 1.1252006420545746,
"grad_norm": 4.462583065032959,
"learning_rate": 9.191620176099559e-06,
"loss": 0.3637,
"mean_token_accuracy": 0.8643763959407806,
"num_tokens": 3068280.0,
"step": 351
},
{
"entropy": 1.20599365234375,
"epoch": 1.1284109149277688,
"grad_norm": 3.932370185852051,
"learning_rate": 9.185832391312644e-06,
"loss": 0.3351,
"mean_token_accuracy": 0.8627801537513733,
"num_tokens": 3079052.0,
"step": 352
},
{
"entropy": 1.3846279382705688,
"epoch": 1.131621187800963,
"grad_norm": 2.907315969467163,
"learning_rate": 9.180025796689692e-06,
"loss": 0.3181,
"mean_token_accuracy": 0.8801276385784149,
"num_tokens": 3088219.0,
"step": 353
},
{
"entropy": 1.3442675471305847,
"epoch": 1.1348314606741572,
"grad_norm": 2.9003660678863525,
"learning_rate": 9.174200418323746e-06,
"loss": 0.2736,
"mean_token_accuracy": 0.887022852897644,
"num_tokens": 3096082.0,
"step": 354
},
{
"entropy": 1.311316728591919,
"epoch": 1.1380417335473516,
"grad_norm": 4.425498962402344,
"learning_rate": 9.168356282392253e-06,
"loss": 0.3228,
"mean_token_accuracy": 0.8825305104255676,
"num_tokens": 3104528.0,
"step": 355
},
{
"entropy": 1.2196236848831177,
"epoch": 1.1412520064205458,
"grad_norm": 16.457611083984375,
"learning_rate": 9.16249341515695e-06,
"loss": 0.311,
"mean_token_accuracy": 0.885350912809372,
"num_tokens": 3112791.0,
"step": 356
},
{
"entropy": 1.3958068490028381,
"epoch": 1.14446227929374,
"grad_norm": 5.1613545417785645,
"learning_rate": 9.156611842963753e-06,
"loss": 0.3306,
"mean_token_accuracy": 0.8757579624652863,
"num_tokens": 3121180.0,
"step": 357
},
{
"entropy": 1.383374571800232,
"epoch": 1.1476725521669342,
"grad_norm": 3.217242956161499,
"learning_rate": 9.150711592242627e-06,
"loss": 0.2985,
"mean_token_accuracy": 0.8951647877693176,
"num_tokens": 3129243.0,
"step": 358
},
{
"entropy": 1.5093455910682678,
"epoch": 1.1508828250401284,
"grad_norm": 7.666992664337158,
"learning_rate": 9.144792689507471e-06,
"loss": 0.2927,
"mean_token_accuracy": 0.901302307844162,
"num_tokens": 3136965.0,
"step": 359
},
{
"entropy": 1.4201687574386597,
"epoch": 1.1540930979133226,
"grad_norm": 2.976116895675659,
"learning_rate": 9.138855161356006e-06,
"loss": 0.2453,
"mean_token_accuracy": 0.8940177857875824,
"num_tokens": 3146272.0,
"step": 360
},
{
"entropy": 1.2107245326042175,
"epoch": 1.1573033707865168,
"grad_norm": 7.402556896209717,
"learning_rate": 9.132899034469648e-06,
"loss": 0.3276,
"mean_token_accuracy": 0.8673433661460876,
"num_tokens": 3156121.0,
"step": 361
},
{
"entropy": 1.3413746356964111,
"epoch": 1.160513643659711,
"grad_norm": 4.934394359588623,
"learning_rate": 9.126924335613385e-06,
"loss": 0.3168,
"mean_token_accuracy": 0.8895758986473083,
"num_tokens": 3164320.0,
"step": 362
},
{
"entropy": 1.3330776691436768,
"epoch": 1.1637239165329052,
"grad_norm": 3.920137643814087,
"learning_rate": 9.120931091635669e-06,
"loss": 0.2967,
"mean_token_accuracy": 0.899406909942627,
"num_tokens": 3172443.0,
"step": 363
},
{
"entropy": 1.3503963947296143,
"epoch": 1.1669341894060996,
"grad_norm": 7.547220706939697,
"learning_rate": 9.114919329468283e-06,
"loss": 0.2409,
"mean_token_accuracy": 0.911847323179245,
"num_tokens": 3180659.0,
"step": 364
},
{
"entropy": 1.4835132360458374,
"epoch": 1.1701444622792938,
"grad_norm": 4.738755226135254,
"learning_rate": 9.108889076126226e-06,
"loss": 0.2998,
"mean_token_accuracy": 0.879132866859436,
"num_tokens": 3188574.0,
"step": 365
},
{
"entropy": 1.345651626586914,
"epoch": 1.173354735152488,
"grad_norm": 7.365373611450195,
"learning_rate": 9.102840358707594e-06,
"loss": 0.2489,
"mean_token_accuracy": 0.9016467928886414,
"num_tokens": 3195985.0,
"step": 366
},
{
"entropy": 1.262892246246338,
"epoch": 1.1765650080256822,
"grad_norm": 3.022606134414673,
"learning_rate": 9.09677320439345e-06,
"loss": 0.3176,
"mean_token_accuracy": 0.8682043254375458,
"num_tokens": 3206735.0,
"step": 367
},
{
"entropy": 1.4119667410850525,
"epoch": 1.1797752808988764,
"grad_norm": 4.2746453285217285,
"learning_rate": 9.090687640447709e-06,
"loss": 0.2945,
"mean_token_accuracy": 0.8927364349365234,
"num_tokens": 3215251.0,
"step": 368
},
{
"entropy": 1.6216139197349548,
"epoch": 1.1829855537720706,
"grad_norm": 22.629234313964844,
"learning_rate": 9.084583694217012e-06,
"loss": 0.2952,
"mean_token_accuracy": 0.8840005397796631,
"num_tokens": 3223744.0,
"step": 369
},
{
"entropy": 1.5899872779846191,
"epoch": 1.1861958266452648,
"grad_norm": 3.4773221015930176,
"learning_rate": 9.07846139313061e-06,
"loss": 0.2761,
"mean_token_accuracy": 0.8875099122524261,
"num_tokens": 3232776.0,
"step": 370
},
{
"entropy": 1.3123191595077515,
"epoch": 1.189406099518459,
"grad_norm": 4.043982982635498,
"learning_rate": 9.072320764700223e-06,
"loss": 0.3154,
"mean_token_accuracy": 0.8860943615436554,
"num_tokens": 3240962.0,
"step": 371
},
{
"entropy": 1.23344486951828,
"epoch": 1.1926163723916532,
"grad_norm": 3.159976005554199,
"learning_rate": 9.066161836519942e-06,
"loss": 0.3036,
"mean_token_accuracy": 0.8750773966312408,
"num_tokens": 3250253.0,
"step": 372
},
{
"entropy": 1.3419618606567383,
"epoch": 1.1958266452648476,
"grad_norm": 3.6338860988616943,
"learning_rate": 9.059984636266082e-06,
"loss": 0.2862,
"mean_token_accuracy": 0.8736852407455444,
"num_tokens": 3259204.0,
"step": 373
},
{
"entropy": 1.4191534519195557,
"epoch": 1.1990369181380418,
"grad_norm": 28.642192840576172,
"learning_rate": 9.053789191697072e-06,
"loss": 0.3138,
"mean_token_accuracy": 0.8769473731517792,
"num_tokens": 3267199.0,
"step": 374
},
{
"entropy": 1.361560583114624,
"epoch": 1.202247191011236,
"grad_norm": 3.961038589477539,
"learning_rate": 9.047575530653324e-06,
"loss": 0.2677,
"mean_token_accuracy": 0.9008506536483765,
"num_tokens": 3275974.0,
"step": 375
},
{
"entropy": 1.390976905822754,
"epoch": 1.2054574638844302,
"grad_norm": 3.8710885047912598,
"learning_rate": 9.041343681057106e-06,
"loss": 0.3181,
"mean_token_accuracy": 0.8792448043823242,
"num_tokens": 3284764.0,
"step": 376
},
{
"entropy": 1.4287749528884888,
"epoch": 1.2086677367576244,
"grad_norm": 3.154195785522461,
"learning_rate": 9.035093670912424e-06,
"loss": 0.2961,
"mean_token_accuracy": 0.8887947499752045,
"num_tokens": 3292669.0,
"step": 377
},
{
"entropy": 1.4230648279190063,
"epoch": 1.2118780096308186,
"grad_norm": 28.748876571655273,
"learning_rate": 9.028825528304892e-06,
"loss": 0.2871,
"mean_token_accuracy": 0.8998285830020905,
"num_tokens": 3300237.0,
"step": 378
},
{
"entropy": 1.361795425415039,
"epoch": 1.2150882825040128,
"grad_norm": 4.176957607269287,
"learning_rate": 9.022539281401601e-06,
"loss": 0.3278,
"mean_token_accuracy": 0.8757419288158417,
"num_tokens": 3309485.0,
"step": 379
},
{
"entropy": 1.3511288166046143,
"epoch": 1.218298555377207,
"grad_norm": 3.7439827919006348,
"learning_rate": 9.016234958451002e-06,
"loss": 0.3049,
"mean_token_accuracy": 0.8841515481472015,
"num_tokens": 3316976.0,
"step": 380
},
{
"entropy": 1.3589674234390259,
"epoch": 1.2215088282504012,
"grad_norm": 8.258329391479492,
"learning_rate": 9.009912587782772e-06,
"loss": 0.2759,
"mean_token_accuracy": 0.8956619203090668,
"num_tokens": 3325227.0,
"step": 381
},
{
"entropy": 1.3772531151771545,
"epoch": 1.2247191011235956,
"grad_norm": 4.925998687744141,
"learning_rate": 9.00357219780769e-06,
"loss": 0.3135,
"mean_token_accuracy": 0.8864164352416992,
"num_tokens": 3333617.0,
"step": 382
},
{
"entropy": 1.4004551768302917,
"epoch": 1.2279293739967898,
"grad_norm": 8.397912979125977,
"learning_rate": 8.997213817017508e-06,
"loss": 0.3237,
"mean_token_accuracy": 0.8806163370609283,
"num_tokens": 3342339.0,
"step": 383
},
{
"entropy": 1.2548794150352478,
"epoch": 1.231139646869984,
"grad_norm": 3.6379387378692627,
"learning_rate": 8.990837473984818e-06,
"loss": 0.3339,
"mean_token_accuracy": 0.8684331476688385,
"num_tokens": 3351825.0,
"step": 384
},
{
"entropy": 1.3366854786872864,
"epoch": 1.2343499197431782,
"grad_norm": 3.443938732147217,
"learning_rate": 8.984443197362938e-06,
"loss": 0.3126,
"mean_token_accuracy": 0.882027268409729,
"num_tokens": 3360884.0,
"step": 385
},
{
"entropy": 1.536583423614502,
"epoch": 1.2375601926163724,
"grad_norm": 3.1750330924987793,
"learning_rate": 8.978031015885767e-06,
"loss": 0.2539,
"mean_token_accuracy": 0.8960652649402618,
"num_tokens": 3369121.0,
"step": 386
},
{
"entropy": 1.4677174091339111,
"epoch": 1.2407704654895666,
"grad_norm": 3.4102578163146973,
"learning_rate": 8.971600958367668e-06,
"loss": 0.305,
"mean_token_accuracy": 0.8863416612148285,
"num_tokens": 3378236.0,
"step": 387
},
{
"entropy": 1.4675362706184387,
"epoch": 1.2439807383627608,
"grad_norm": 3.6679139137268066,
"learning_rate": 8.965153053703325e-06,
"loss": 0.2975,
"mean_token_accuracy": 0.883073091506958,
"num_tokens": 3387737.0,
"step": 388
},
{
"entropy": 1.4074093103408813,
"epoch": 1.247191011235955,
"grad_norm": 3.084782123565674,
"learning_rate": 8.958687330867634e-06,
"loss": 0.3106,
"mean_token_accuracy": 0.8841233551502228,
"num_tokens": 3396914.0,
"step": 389
},
{
"entropy": 1.3245580792427063,
"epoch": 1.2504012841091492,
"grad_norm": 2.8408830165863037,
"learning_rate": 8.952203818915548e-06,
"loss": 0.3442,
"mean_token_accuracy": 0.8646701872348785,
"num_tokens": 3409549.0,
"step": 390
},
{
"entropy": 1.347778558731079,
"epoch": 1.2536115569823436,
"grad_norm": 5.054625988006592,
"learning_rate": 8.94570254698197e-06,
"loss": 0.3257,
"mean_token_accuracy": 0.872046560049057,
"num_tokens": 3419435.0,
"step": 391
},
{
"entropy": 1.2921775579452515,
"epoch": 1.2568218298555376,
"grad_norm": 3.920675277709961,
"learning_rate": 8.939183544281597e-06,
"loss": 0.2803,
"mean_token_accuracy": 0.8850438892841339,
"num_tokens": 3428152.0,
"step": 392
},
{
"entropy": 1.4277359247207642,
"epoch": 1.260032102728732,
"grad_norm": 9.427045822143555,
"learning_rate": 8.932646840108818e-06,
"loss": 0.3121,
"mean_token_accuracy": 0.8868101239204407,
"num_tokens": 3436649.0,
"step": 393
},
{
"entropy": 1.3226105570793152,
"epoch": 1.2632423756019262,
"grad_norm": 3.1199164390563965,
"learning_rate": 8.926092463837557e-06,
"loss": 0.3151,
"mean_token_accuracy": 0.8866923451423645,
"num_tokens": 3446140.0,
"step": 394
},
{
"entropy": 1.4028651714324951,
"epoch": 1.2664526484751204,
"grad_norm": 3.997776746749878,
"learning_rate": 8.919520444921153e-06,
"loss": 0.2777,
"mean_token_accuracy": 0.8968737721443176,
"num_tokens": 3454108.0,
"step": 395
},
{
"entropy": 1.2859973907470703,
"epoch": 1.2696629213483146,
"grad_norm": 10.685837745666504,
"learning_rate": 8.912930812892228e-06,
"loss": 0.3432,
"mean_token_accuracy": 0.8622964024543762,
"num_tokens": 3462566.0,
"step": 396
},
{
"entropy": 1.4156718850135803,
"epoch": 1.2728731942215088,
"grad_norm": 8.489455223083496,
"learning_rate": 8.906323597362547e-06,
"loss": 0.272,
"mean_token_accuracy": 0.9009366929531097,
"num_tokens": 3470245.0,
"step": 397
},
{
"entropy": 1.522966742515564,
"epoch": 1.276083467094703,
"grad_norm": 3.9344754219055176,
"learning_rate": 8.899698828022895e-06,
"loss": 0.3411,
"mean_token_accuracy": 0.8857994973659515,
"num_tokens": 3479251.0,
"step": 398
},
{
"entropy": 1.2589990496635437,
"epoch": 1.2792937399678972,
"grad_norm": 2.7946629524230957,
"learning_rate": 8.893056534642938e-06,
"loss": 0.2591,
"mean_token_accuracy": 0.902241587638855,
"num_tokens": 3487630.0,
"step": 399
},
{
"entropy": 1.5324031114578247,
"epoch": 1.2825040128410916,
"grad_norm": 2.9131085872650146,
"learning_rate": 8.886396747071085e-06,
"loss": 0.2662,
"mean_token_accuracy": 0.898281991481781,
"num_tokens": 3496881.0,
"step": 400
},
{
"entropy": 1.3316246271133423,
"epoch": 1.2857142857142856,
"grad_norm": 4.0149664878845215,
"learning_rate": 8.879719495234363e-06,
"loss": 0.282,
"mean_token_accuracy": 0.8956633508205414,
"num_tokens": 3505613.0,
"step": 401
},
{
"entropy": 1.3576619029045105,
"epoch": 1.28892455858748,
"grad_norm": 6.47138786315918,
"learning_rate": 8.873024809138272e-06,
"loss": 0.3248,
"mean_token_accuracy": 0.8703168630599976,
"num_tokens": 3515196.0,
"step": 402
},
{
"entropy": 1.5192933678627014,
"epoch": 1.2921348314606742,
"grad_norm": 4.7108635902404785,
"learning_rate": 8.866312718866669e-06,
"loss": 0.3272,
"mean_token_accuracy": 0.8735582530498505,
"num_tokens": 3525258.0,
"step": 403
},
{
"entropy": 1.5145000219345093,
"epoch": 1.2953451043338684,
"grad_norm": 7.440964698791504,
"learning_rate": 8.859583254581604e-06,
"loss": 0.3202,
"mean_token_accuracy": 0.8838435411453247,
"num_tokens": 3533799.0,
"step": 404
},
{
"entropy": 1.6401035785675049,
"epoch": 1.2985553772070626,
"grad_norm": 4.604669570922852,
"learning_rate": 8.852836446523213e-06,
"loss": 0.3255,
"mean_token_accuracy": 0.8817890584468842,
"num_tokens": 3541962.0,
"step": 405
},
{
"entropy": 1.5389510989189148,
"epoch": 1.3017656500802568,
"grad_norm": 4.766162872314453,
"learning_rate": 8.846072325009562e-06,
"loss": 0.3256,
"mean_token_accuracy": 0.8761164546012878,
"num_tokens": 3551542.0,
"step": 406
},
{
"entropy": 1.457167625427246,
"epoch": 1.304975922953451,
"grad_norm": 4.539270401000977,
"learning_rate": 8.83929092043652e-06,
"loss": 0.3338,
"mean_token_accuracy": 0.872197687625885,
"num_tokens": 3559941.0,
"step": 407
},
{
"entropy": 1.4390366077423096,
"epoch": 1.3081861958266452,
"grad_norm": 4.6166582107543945,
"learning_rate": 8.832492263277624e-06,
"loss": 0.2873,
"mean_token_accuracy": 0.8764486014842987,
"num_tokens": 3569502.0,
"step": 408
},
{
"entropy": 1.5826404094696045,
"epoch": 1.3113964686998396,
"grad_norm": 35.64375686645508,
"learning_rate": 8.825676384083936e-06,
"loss": 0.327,
"mean_token_accuracy": 0.8866596817970276,
"num_tokens": 3578601.0,
"step": 409
},
{
"entropy": 1.3942549228668213,
"epoch": 1.3146067415730336,
"grad_norm": 3.5190839767456055,
"learning_rate": 8.818843313483907e-06,
"loss": 0.2994,
"mean_token_accuracy": 0.8889473378658295,
"num_tokens": 3587207.0,
"step": 410
},
{
"entropy": 1.5776238441467285,
"epoch": 1.317817014446228,
"grad_norm": 9.147720336914062,
"learning_rate": 8.811993082183243e-06,
"loss": 0.2829,
"mean_token_accuracy": 0.8859248757362366,
"num_tokens": 3595617.0,
"step": 411
},
{
"entropy": 1.596324384212494,
"epoch": 1.3210272873194222,
"grad_norm": 5.241089344024658,
"learning_rate": 8.805125720964766e-06,
"loss": 0.2953,
"mean_token_accuracy": 0.8866595029830933,
"num_tokens": 3604639.0,
"step": 412
},
{
"entropy": 1.3364008069038391,
"epoch": 1.3242375601926164,
"grad_norm": 2.9017364978790283,
"learning_rate": 8.798241260688273e-06,
"loss": 0.3193,
"mean_token_accuracy": 0.8743776082992554,
"num_tokens": 3613839.0,
"step": 413
},
{
"entropy": 1.352913737297058,
"epoch": 1.3274478330658106,
"grad_norm": 3.1927762031555176,
"learning_rate": 8.791339732290398e-06,
"loss": 0.2869,
"mean_token_accuracy": 0.884104460477829,
"num_tokens": 3623670.0,
"step": 414
},
{
"entropy": 1.4007618427276611,
"epoch": 1.3306581059390048,
"grad_norm": 3.5387911796569824,
"learning_rate": 8.784421166784476e-06,
"loss": 0.2834,
"mean_token_accuracy": 0.8924818634986877,
"num_tokens": 3631397.0,
"step": 415
},
{
"entropy": 1.5313379764556885,
"epoch": 1.333868378812199,
"grad_norm": 3.7980873584747314,
"learning_rate": 8.7774855952604e-06,
"loss": 0.284,
"mean_token_accuracy": 0.8888964354991913,
"num_tokens": 3640022.0,
"step": 416
},
{
"entropy": 1.3799718618392944,
"epoch": 1.3370786516853932,
"grad_norm": 3.859992027282715,
"learning_rate": 8.770533048884483e-06,
"loss": 0.2425,
"mean_token_accuracy": 0.9088575541973114,
"num_tokens": 3648177.0,
"step": 417
},
{
"entropy": 1.412042498588562,
"epoch": 1.3402889245585876,
"grad_norm": 4.135337829589844,
"learning_rate": 8.763563558899317e-06,
"loss": 0.2928,
"mean_token_accuracy": 0.8850153088569641,
"num_tokens": 3657888.0,
"step": 418
},
{
"entropy": 1.5607710480690002,
"epoch": 1.3434991974317816,
"grad_norm": 3.7058753967285156,
"learning_rate": 8.756577156623636e-06,
"loss": 0.2907,
"mean_token_accuracy": 0.8865703344345093,
"num_tokens": 3667083.0,
"step": 419
},
{
"entropy": 1.3873555660247803,
"epoch": 1.346709470304976,
"grad_norm": 3.0918662548065186,
"learning_rate": 8.749573873452169e-06,
"loss": 0.2682,
"mean_token_accuracy": 0.8970977663993835,
"num_tokens": 3675648.0,
"step": 420
},
{
"entropy": 1.4942026734352112,
"epoch": 1.3499197431781702,
"grad_norm": 4.505756855010986,
"learning_rate": 8.742553740855507e-06,
"loss": 0.3386,
"mean_token_accuracy": 0.8737636208534241,
"num_tokens": 3684547.0,
"step": 421
},
{
"entropy": 1.3530024290084839,
"epoch": 1.3531300160513644,
"grad_norm": 4.251113414764404,
"learning_rate": 8.735516790379952e-06,
"loss": 0.3532,
"mean_token_accuracy": 0.871391236782074,
"num_tokens": 3693082.0,
"step": 422
},
{
"entropy": 1.3358929753303528,
"epoch": 1.3563402889245586,
"grad_norm": 3.3764379024505615,
"learning_rate": 8.728463053647382e-06,
"loss": 0.2892,
"mean_token_accuracy": 0.8893671631813049,
"num_tokens": 3701174.0,
"step": 423
},
{
"entropy": 1.5129033923149109,
"epoch": 1.3595505617977528,
"grad_norm": 3.298848867416382,
"learning_rate": 8.721392562355113e-06,
"loss": 0.2135,
"mean_token_accuracy": 0.9176328778266907,
"num_tokens": 3710272.0,
"step": 424
},
{
"entropy": 1.3921077251434326,
"epoch": 1.362760834670947,
"grad_norm": 4.316572189331055,
"learning_rate": 8.71430534827574e-06,
"loss": 0.3236,
"mean_token_accuracy": 0.8857169449329376,
"num_tokens": 3720135.0,
"step": 425
},
{
"entropy": 1.37892746925354,
"epoch": 1.3659711075441412,
"grad_norm": 3.766646146774292,
"learning_rate": 8.707201443257015e-06,
"loss": 0.2828,
"mean_token_accuracy": 0.8919144570827484,
"num_tokens": 3728595.0,
"step": 426
},
{
"entropy": 1.2769124507904053,
"epoch": 1.3691813804173354,
"grad_norm": 3.253934144973755,
"learning_rate": 8.700080879221689e-06,
"loss": 0.2761,
"mean_token_accuracy": 0.8954965174198151,
"num_tokens": 3736117.0,
"step": 427
},
{
"entropy": 1.2963144183158875,
"epoch": 1.3723916532905296,
"grad_norm": 6.249416351318359,
"learning_rate": 8.692943688167371e-06,
"loss": 0.2693,
"mean_token_accuracy": 0.8879525661468506,
"num_tokens": 3744390.0,
"step": 428
},
{
"entropy": 1.3017955422401428,
"epoch": 1.375601926163724,
"grad_norm": 2.980830669403076,
"learning_rate": 8.685789902166395e-06,
"loss": 0.3061,
"mean_token_accuracy": 0.8886123299598694,
"num_tokens": 3753837.0,
"step": 429
},
{
"entropy": 1.349017083644867,
"epoch": 1.3788121990369182,
"grad_norm": 5.0209245681762695,
"learning_rate": 8.67861955336566e-06,
"loss": 0.2885,
"mean_token_accuracy": 0.8791326582431793,
"num_tokens": 3763493.0,
"step": 430
},
{
"entropy": 1.4477825164794922,
"epoch": 1.3820224719101124,
"grad_norm": 11.758344650268555,
"learning_rate": 8.671432673986493e-06,
"loss": 0.3331,
"mean_token_accuracy": 0.8793164193630219,
"num_tokens": 3772358.0,
"step": 431
},
{
"entropy": 1.3367245197296143,
"epoch": 1.3852327447833066,
"grad_norm": 4.308809280395508,
"learning_rate": 8.664229296324514e-06,
"loss": 0.3042,
"mean_token_accuracy": 0.8680384755134583,
"num_tokens": 3780490.0,
"step": 432
},
{
"entropy": 1.449657917022705,
"epoch": 1.3884430176565008,
"grad_norm": 4.295592784881592,
"learning_rate": 8.657009452749466e-06,
"loss": 0.2871,
"mean_token_accuracy": 0.8916987478733063,
"num_tokens": 3788463.0,
"step": 433
},
{
"entropy": 1.49513840675354,
"epoch": 1.391653290529695,
"grad_norm": 4.6402201652526855,
"learning_rate": 8.649773175705099e-06,
"loss": 0.3005,
"mean_token_accuracy": 0.8849748373031616,
"num_tokens": 3797066.0,
"step": 434
},
{
"entropy": 1.2568953037261963,
"epoch": 1.3948635634028892,
"grad_norm": 3.7639451026916504,
"learning_rate": 8.642520497709001e-06,
"loss": 0.3126,
"mean_token_accuracy": 0.8799223005771637,
"num_tokens": 3805553.0,
"step": 435
},
{
"entropy": 1.4464783668518066,
"epoch": 1.3980738362760834,
"grad_norm": 3.1986289024353027,
"learning_rate": 8.635251451352463e-06,
"loss": 0.3125,
"mean_token_accuracy": 0.8844713270664215,
"num_tokens": 3814202.0,
"step": 436
},
{
"entropy": 1.4772561192512512,
"epoch": 1.4012841091492776,
"grad_norm": 2.8951005935668945,
"learning_rate": 8.627966069300332e-06,
"loss": 0.2926,
"mean_token_accuracy": 0.8794938027858734,
"num_tokens": 3823416.0,
"step": 437
},
{
"entropy": 1.4769402742385864,
"epoch": 1.404494382022472,
"grad_norm": 4.300238609313965,
"learning_rate": 8.620664384290863e-06,
"loss": 0.3378,
"mean_token_accuracy": 0.8839116096496582,
"num_tokens": 3832081.0,
"step": 438
},
{
"entropy": 1.3235585689544678,
"epoch": 1.4077046548956662,
"grad_norm": 3.8008766174316406,
"learning_rate": 8.613346429135567e-06,
"loss": 0.3076,
"mean_token_accuracy": 0.8767895400524139,
"num_tokens": 3841534.0,
"step": 439
},
{
"entropy": 1.423071026802063,
"epoch": 1.4109149277688604,
"grad_norm": 5.64111852645874,
"learning_rate": 8.606012236719073e-06,
"loss": 0.2855,
"mean_token_accuracy": 0.8974299728870392,
"num_tokens": 3849597.0,
"step": 440
},
{
"entropy": 1.5393714904785156,
"epoch": 1.4141252006420546,
"grad_norm": 3.2510595321655273,
"learning_rate": 8.598661839998972e-06,
"loss": 0.2969,
"mean_token_accuracy": 0.89105424284935,
"num_tokens": 3858231.0,
"step": 441
},
{
"entropy": 1.4409209489822388,
"epoch": 1.4173354735152488,
"grad_norm": 3.6261956691741943,
"learning_rate": 8.591295272005674e-06,
"loss": 0.3012,
"mean_token_accuracy": 0.8888025879859924,
"num_tokens": 3867227.0,
"step": 442
},
{
"entropy": 1.5317130088806152,
"epoch": 1.420545746388443,
"grad_norm": 2.9155311584472656,
"learning_rate": 8.583912565842258e-06,
"loss": 0.2643,
"mean_token_accuracy": 0.9002439677715302,
"num_tokens": 3875137.0,
"step": 443
},
{
"entropy": 1.5092316269874573,
"epoch": 1.4237560192616372,
"grad_norm": 8.434717178344727,
"learning_rate": 8.576513754684318e-06,
"loss": 0.2908,
"mean_token_accuracy": 0.8917935788631439,
"num_tokens": 3883306.0,
"step": 444
},
{
"entropy": 1.4440070390701294,
"epoch": 1.4269662921348314,
"grad_norm": 3.2048768997192383,
"learning_rate": 8.569098871779828e-06,
"loss": 0.2837,
"mean_token_accuracy": 0.8814037442207336,
"num_tokens": 3892331.0,
"step": 445
},
{
"entropy": 1.4215741753578186,
"epoch": 1.4301765650080256,
"grad_norm": 4.36214017868042,
"learning_rate": 8.561667950448973e-06,
"loss": 0.3272,
"mean_token_accuracy": 0.8756458759307861,
"num_tokens": 3901483.0,
"step": 446
},
{
"entropy": 1.5180083513259888,
"epoch": 1.43338683788122,
"grad_norm": 3.3972907066345215,
"learning_rate": 8.554221024084019e-06,
"loss": 0.2934,
"mean_token_accuracy": 0.8790063261985779,
"num_tokens": 3910713.0,
"step": 447
},
{
"entropy": 1.4909613132476807,
"epoch": 1.4365971107544142,
"grad_norm": 3.484736919403076,
"learning_rate": 8.546758126149148e-06,
"loss": 0.3411,
"mean_token_accuracy": 0.8712872564792633,
"num_tokens": 3918429.0,
"step": 448
},
{
"entropy": 1.4828435182571411,
"epoch": 1.4398073836276084,
"grad_norm": 3.011584997177124,
"learning_rate": 8.539279290180315e-06,
"loss": 0.2702,
"mean_token_accuracy": 0.8983392119407654,
"num_tokens": 3927059.0,
"step": 449
},
{
"entropy": 1.3964380025863647,
"epoch": 1.4430176565008026,
"grad_norm": 3.2408764362335205,
"learning_rate": 8.531784549785098e-06,
"loss": 0.3098,
"mean_token_accuracy": 0.8849463164806366,
"num_tokens": 3936625.0,
"step": 450
},
{
"entropy": 1.4703381657600403,
"epoch": 1.4462279293739968,
"grad_norm": 2.8378424644470215,
"learning_rate": 8.524273938642539e-06,
"loss": 0.2708,
"mean_token_accuracy": 0.9015527069568634,
"num_tokens": 3944552.0,
"step": 451
},
{
"entropy": 1.693705976009369,
"epoch": 1.449438202247191,
"grad_norm": 4.766742706298828,
"learning_rate": 8.516747490503001e-06,
"loss": 0.2855,
"mean_token_accuracy": 0.8845842480659485,
"num_tokens": 3953203.0,
"step": 452
},
{
"entropy": 1.4736073017120361,
"epoch": 1.4526484751203852,
"grad_norm": 3.187501907348633,
"learning_rate": 8.509205239188017e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8820372521877289,
"num_tokens": 3961297.0,
"step": 453
},
{
"entropy": 1.4047070145606995,
"epoch": 1.4558587479935794,
"grad_norm": 2.950268507003784,
"learning_rate": 8.501647218590127e-06,
"loss": 0.3094,
"mean_token_accuracy": 0.8798324465751648,
"num_tokens": 3970101.0,
"step": 454
},
{
"entropy": 1.5628494620323181,
"epoch": 1.4590690208667736,
"grad_norm": 5.165965557098389,
"learning_rate": 8.494073462672743e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.8838167488574982,
"num_tokens": 3978005.0,
"step": 455
},
{
"entropy": 1.3936602473258972,
"epoch": 1.462279293739968,
"grad_norm": 3.246717929840088,
"learning_rate": 8.486484005469977e-06,
"loss": 0.3419,
"mean_token_accuracy": 0.8834190964698792,
"num_tokens": 3985914.0,
"step": 456
},
{
"entropy": 1.401951789855957,
"epoch": 1.465489566613162,
"grad_norm": 4.547334671020508,
"learning_rate": 8.478878881086505e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.8777275681495667,
"num_tokens": 3994565.0,
"step": 457
},
{
"entropy": 1.3956224918365479,
"epoch": 1.4686998394863564,
"grad_norm": 3.96466064453125,
"learning_rate": 8.471258123697403e-06,
"loss": 0.4326,
"mean_token_accuracy": 0.8498894572257996,
"num_tokens": 4005280.0,
"step": 458
},
{
"entropy": 1.5551961660385132,
"epoch": 1.4719101123595506,
"grad_norm": 3.1760103702545166,
"learning_rate": 8.463621767547998e-06,
"loss": 0.2698,
"mean_token_accuracy": 0.8973476886749268,
"num_tokens": 4014231.0,
"step": 459
},
{
"entropy": 1.4384177923202515,
"epoch": 1.4751203852327448,
"grad_norm": 2.9949951171875,
"learning_rate": 8.455969846953711e-06,
"loss": 0.2774,
"mean_token_accuracy": 0.8971374332904816,
"num_tokens": 4022904.0,
"step": 460
},
{
"entropy": 1.6596906185150146,
"epoch": 1.478330658105939,
"grad_norm": 3.8711040019989014,
"learning_rate": 8.448302396299906e-06,
"loss": 0.3023,
"mean_token_accuracy": 0.8945289552211761,
"num_tokens": 4031368.0,
"step": 461
},
{
"entropy": 1.5498453974723816,
"epoch": 1.4815409309791332,
"grad_norm": 3.1765880584716797,
"learning_rate": 8.440619450041736e-06,
"loss": 0.2813,
"mean_token_accuracy": 0.8945924937725067,
"num_tokens": 4039373.0,
"step": 462
},
{
"entropy": 1.5488550662994385,
"epoch": 1.4847512038523274,
"grad_norm": 4.509853363037109,
"learning_rate": 8.432921042703985e-06,
"loss": 0.2502,
"mean_token_accuracy": 0.9076395630836487,
"num_tokens": 4047151.0,
"step": 463
},
{
"entropy": 1.3990018367767334,
"epoch": 1.4879614767255216,
"grad_norm": 4.891584396362305,
"learning_rate": 8.425207208880914e-06,
"loss": 0.3393,
"mean_token_accuracy": 0.8612608909606934,
"num_tokens": 4055428.0,
"step": 464
},
{
"entropy": 1.5296116471290588,
"epoch": 1.491171749598716,
"grad_norm": 3.9132232666015625,
"learning_rate": 8.417477983236107e-06,
"loss": 0.2889,
"mean_token_accuracy": 0.9005896151065826,
"num_tokens": 4063567.0,
"step": 465
},
{
"entropy": 1.4369441270828247,
"epoch": 1.49438202247191,
"grad_norm": 4.291438102722168,
"learning_rate": 8.409733400502311e-06,
"loss": 0.343,
"mean_token_accuracy": 0.8742758929729462,
"num_tokens": 4072146.0,
"step": 466
},
{
"entropy": 1.4824082255363464,
"epoch": 1.4975922953451044,
"grad_norm": 3.5444793701171875,
"learning_rate": 8.401973495481289e-06,
"loss": 0.3,
"mean_token_accuracy": 0.8864571452140808,
"num_tokens": 4080309.0,
"step": 467
},
{
"entropy": 1.3830168843269348,
"epoch": 1.5008025682182986,
"grad_norm": 4.718812465667725,
"learning_rate": 8.39419830304365e-06,
"loss": 0.2767,
"mean_token_accuracy": 0.8947529196739197,
"num_tokens": 4088027.0,
"step": 468
},
{
"entropy": 1.501311182975769,
"epoch": 1.5040128410914928,
"grad_norm": 4.948775768280029,
"learning_rate": 8.386407858128707e-06,
"loss": 0.2856,
"mean_token_accuracy": 0.8906499743461609,
"num_tokens": 4097004.0,
"step": 469
},
{
"entropy": 1.4303480386734009,
"epoch": 1.507223113964687,
"grad_norm": 3.6689600944519043,
"learning_rate": 8.378602195744308e-06,
"loss": 0.2981,
"mean_token_accuracy": 0.8818539083003998,
"num_tokens": 4105389.0,
"step": 470
},
{
"entropy": 1.4083414673805237,
"epoch": 1.5104333868378812,
"grad_norm": 3.417105197906494,
"learning_rate": 8.370781350966683e-06,
"loss": 0.3039,
"mean_token_accuracy": 0.8782542049884796,
"num_tokens": 4114039.0,
"step": 471
},
{
"entropy": 1.3129056096076965,
"epoch": 1.5136436597110754,
"grad_norm": 3.7305545806884766,
"learning_rate": 8.362945358940295e-06,
"loss": 0.3669,
"mean_token_accuracy": 0.8697949945926666,
"num_tokens": 4123365.0,
"step": 472
},
{
"entropy": 1.3081690669059753,
"epoch": 1.5168539325842696,
"grad_norm": 3.302035093307495,
"learning_rate": 8.355094254877665e-06,
"loss": 0.3144,
"mean_token_accuracy": 0.8910820186138153,
"num_tokens": 4132153.0,
"step": 473
},
{
"entropy": 1.47750985622406,
"epoch": 1.520064205457464,
"grad_norm": 3.3554630279541016,
"learning_rate": 8.347228074059227e-06,
"loss": 0.2683,
"mean_token_accuracy": 0.9022042751312256,
"num_tokens": 4140393.0,
"step": 474
},
{
"entropy": 1.348096251487732,
"epoch": 1.523274478330658,
"grad_norm": 2.9571056365966797,
"learning_rate": 8.339346851833163e-06,
"loss": 0.3485,
"mean_token_accuracy": 0.8722924292087555,
"num_tokens": 4149933.0,
"step": 475
},
{
"entropy": 1.4152057766914368,
"epoch": 1.5264847512038524,
"grad_norm": 3.6118574142456055,
"learning_rate": 8.33145062361525e-06,
"loss": 0.2283,
"mean_token_accuracy": 0.9181205034255981,
"num_tokens": 4157852.0,
"step": 476
},
{
"entropy": 1.393871009349823,
"epoch": 1.5296950240770464,
"grad_norm": 3.289285898208618,
"learning_rate": 8.323539424888695e-06,
"loss": 0.3666,
"mean_token_accuracy": 0.8745063245296478,
"num_tokens": 4167313.0,
"step": 477
},
{
"entropy": 1.4740851521492004,
"epoch": 1.5329052969502408,
"grad_norm": 2.9904513359069824,
"learning_rate": 8.315613291203977e-06,
"loss": 0.2871,
"mean_token_accuracy": 0.8881636559963226,
"num_tokens": 4175515.0,
"step": 478
},
{
"entropy": 1.4914516806602478,
"epoch": 1.536115569823435,
"grad_norm": 5.311322212219238,
"learning_rate": 8.30767225817869e-06,
"loss": 0.3231,
"mean_token_accuracy": 0.878555953502655,
"num_tokens": 4183781.0,
"step": 479
},
{
"entropy": 1.4010317921638489,
"epoch": 1.5393258426966292,
"grad_norm": 3.215667963027954,
"learning_rate": 8.299716361497377e-06,
"loss": 0.3094,
"mean_token_accuracy": 0.8759619891643524,
"num_tokens": 4191987.0,
"step": 480
},
{
"entropy": 1.506856381893158,
"epoch": 1.5425361155698234,
"grad_norm": 3.147756576538086,
"learning_rate": 8.291745636911382e-06,
"loss": 0.2931,
"mean_token_accuracy": 0.8908743560314178,
"num_tokens": 4200711.0,
"step": 481
},
{
"entropy": 1.324585199356079,
"epoch": 1.5457463884430176,
"grad_norm": 3.0587069988250732,
"learning_rate": 8.283760120238672e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.8850060105323792,
"num_tokens": 4209179.0,
"step": 482
},
{
"entropy": 1.4364397525787354,
"epoch": 1.548956661316212,
"grad_norm": 10.770256042480469,
"learning_rate": 8.27575984736369e-06,
"loss": 0.2789,
"mean_token_accuracy": 0.8973394930362701,
"num_tokens": 4217103.0,
"step": 483
},
{
"entropy": 1.5607190132141113,
"epoch": 1.552166934189406,
"grad_norm": 3.770235538482666,
"learning_rate": 8.26774485423719e-06,
"loss": 0.3243,
"mean_token_accuracy": 0.8835341334342957,
"num_tokens": 4226475.0,
"step": 484
},
{
"entropy": 1.5396313071250916,
"epoch": 1.5553772070626004,
"grad_norm": 3.7008605003356934,
"learning_rate": 8.259715176876069e-06,
"loss": 0.3293,
"mean_token_accuracy": 0.8814294040203094,
"num_tokens": 4234718.0,
"step": 485
},
{
"entropy": 1.359902560710907,
"epoch": 1.5585874799357944,
"grad_norm": 4.006896495819092,
"learning_rate": 8.251670851363214e-06,
"loss": 0.2896,
"mean_token_accuracy": 0.8906521201133728,
"num_tokens": 4243360.0,
"step": 486
},
{
"entropy": 1.4276302456855774,
"epoch": 1.5617977528089888,
"grad_norm": 7.1651716232299805,
"learning_rate": 8.243611913847337e-06,
"loss": 0.2822,
"mean_token_accuracy": 0.8945540487766266,
"num_tokens": 4252105.0,
"step": 487
},
{
"entropy": 1.4439811706542969,
"epoch": 1.565008025682183,
"grad_norm": 6.675753116607666,
"learning_rate": 8.235538400542809e-06,
"loss": 0.2913,
"mean_token_accuracy": 0.8855163156986237,
"num_tokens": 4261011.0,
"step": 488
},
{
"entropy": 1.5044021606445312,
"epoch": 1.5682182985553772,
"grad_norm": 3.4322025775909424,
"learning_rate": 8.2274503477295e-06,
"loss": 0.264,
"mean_token_accuracy": 0.9048478901386261,
"num_tokens": 4268926.0,
"step": 489
},
{
"entropy": 1.449196219444275,
"epoch": 1.5714285714285714,
"grad_norm": 4.121733665466309,
"learning_rate": 8.21934779175262e-06,
"loss": 0.3145,
"mean_token_accuracy": 0.8815996646881104,
"num_tokens": 4277478.0,
"step": 490
},
{
"entropy": 1.583847999572754,
"epoch": 1.5746388443017656,
"grad_norm": 3.647516965866089,
"learning_rate": 8.211230769022552e-06,
"loss": 0.2795,
"mean_token_accuracy": 0.8901920318603516,
"num_tokens": 4287141.0,
"step": 491
},
{
"entropy": 1.7253791093826294,
"epoch": 1.57784911717496,
"grad_norm": 5.149407386779785,
"learning_rate": 8.203099316014679e-06,
"loss": 0.2618,
"mean_token_accuracy": 0.9063104391098022,
"num_tokens": 4295839.0,
"step": 492
},
{
"entropy": 1.8327444195747375,
"epoch": 1.581059390048154,
"grad_norm": 4.351746559143066,
"learning_rate": 8.19495346926924e-06,
"loss": 0.359,
"mean_token_accuracy": 0.8663514256477356,
"num_tokens": 4305643.0,
"step": 493
},
{
"entropy": 1.4635063409805298,
"epoch": 1.5842696629213484,
"grad_norm": 3.7534878253936768,
"learning_rate": 8.18679326539115e-06,
"loss": 0.287,
"mean_token_accuracy": 0.8927922546863556,
"num_tokens": 4314046.0,
"step": 494
},
{
"entropy": 1.347511351108551,
"epoch": 1.5874799357945424,
"grad_norm": 3.258915424346924,
"learning_rate": 8.178618741049841e-06,
"loss": 0.3147,
"mean_token_accuracy": 0.8884759247303009,
"num_tokens": 4322655.0,
"step": 495
},
{
"entropy": 1.5007055401802063,
"epoch": 1.5906902086677368,
"grad_norm": 3.146191120147705,
"learning_rate": 8.170429932979097e-06,
"loss": 0.2464,
"mean_token_accuracy": 0.9067124128341675,
"num_tokens": 4330697.0,
"step": 496
},
{
"entropy": 1.4456924796104431,
"epoch": 1.593900481540931,
"grad_norm": 4.14158821105957,
"learning_rate": 8.162226877976886e-06,
"loss": 0.2741,
"mean_token_accuracy": 0.890965074300766,
"num_tokens": 4338442.0,
"step": 497
},
{
"entropy": 1.3182119727134705,
"epoch": 1.5971107544141252,
"grad_norm": 4.268415451049805,
"learning_rate": 8.154009612905205e-06,
"loss": 0.3057,
"mean_token_accuracy": 0.8871277570724487,
"num_tokens": 4346989.0,
"step": 498
},
{
"entropy": 1.3188686966896057,
"epoch": 1.6003210272873194,
"grad_norm": 3.7496144771575928,
"learning_rate": 8.145778174689897e-06,
"loss": 0.327,
"mean_token_accuracy": 0.8702134490013123,
"num_tokens": 4355824.0,
"step": 499
},
{
"entropy": 1.5009222030639648,
"epoch": 1.6035313001605136,
"grad_norm": 3.6059823036193848,
"learning_rate": 8.137532600320502e-06,
"loss": 0.3169,
"mean_token_accuracy": 0.8813262283802032,
"num_tokens": 4364156.0,
"step": 500
},
{
"entropy": 1.4714254140853882,
"epoch": 1.606741573033708,
"grad_norm": 3.3746225833892822,
"learning_rate": 8.129272926850079e-06,
"loss": 0.3275,
"mean_token_accuracy": 0.8713617920875549,
"num_tokens": 4373790.0,
"step": 501
},
{
"entropy": 1.2796449065208435,
"epoch": 1.609951845906902,
"grad_norm": 8.295031547546387,
"learning_rate": 8.120999191395048e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.8713172674179077,
"num_tokens": 4381975.0,
"step": 502
},
{
"entropy": 1.2624800205230713,
"epoch": 1.6131621187800964,
"grad_norm": 3.607821226119995,
"learning_rate": 8.112711431135014e-06,
"loss": 0.3789,
"mean_token_accuracy": 0.8556053936481476,
"num_tokens": 4393419.0,
"step": 503
},
{
"entropy": 1.4322052001953125,
"epoch": 1.6163723916532904,
"grad_norm": 22.992494583129883,
"learning_rate": 8.10440968331261e-06,
"loss": 0.2864,
"mean_token_accuracy": 0.8980510830879211,
"num_tokens": 4402672.0,
"step": 504
},
{
"entropy": 1.5261085033416748,
"epoch": 1.6195826645264848,
"grad_norm": 3.8417842388153076,
"learning_rate": 8.096093985233323e-06,
"loss": 0.3399,
"mean_token_accuracy": 0.8749682307243347,
"num_tokens": 4413027.0,
"step": 505
},
{
"entropy": 1.5344858169555664,
"epoch": 1.622792937399679,
"grad_norm": 29.822771072387695,
"learning_rate": 8.087764374265325e-06,
"loss": 0.3371,
"mean_token_accuracy": 0.8792240917682648,
"num_tokens": 4422236.0,
"step": 506
},
{
"entropy": 1.4433594942092896,
"epoch": 1.6260032102728732,
"grad_norm": 3.875814914703369,
"learning_rate": 8.079420887839316e-06,
"loss": 0.3143,
"mean_token_accuracy": 0.8798151612281799,
"num_tokens": 4431073.0,
"step": 507
},
{
"entropy": 1.580399751663208,
"epoch": 1.6292134831460674,
"grad_norm": 4.023304462432861,
"learning_rate": 8.071063563448341e-06,
"loss": 0.306,
"mean_token_accuracy": 0.8894245326519012,
"num_tokens": 4439845.0,
"step": 508
},
{
"entropy": 1.366957426071167,
"epoch": 1.6324237560192616,
"grad_norm": 4.008258819580078,
"learning_rate": 8.062692438647628e-06,
"loss": 0.3259,
"mean_token_accuracy": 0.8769682347774506,
"num_tokens": 4447925.0,
"step": 509
},
{
"entropy": 1.3271659016609192,
"epoch": 1.635634028892456,
"grad_norm": 15.104333877563477,
"learning_rate": 8.054307551054427e-06,
"loss": 0.3196,
"mean_token_accuracy": 0.8766265213489532,
"num_tokens": 4459279.0,
"step": 510
},
{
"entropy": 1.3582776188850403,
"epoch": 1.63884430176565,
"grad_norm": 3.408693790435791,
"learning_rate": 8.045908938347828e-06,
"loss": 0.3227,
"mean_token_accuracy": 0.8774316906929016,
"num_tokens": 4467698.0,
"step": 511
},
{
"entropy": 1.440934956073761,
"epoch": 1.6420545746388444,
"grad_norm": 4.110688209533691,
"learning_rate": 8.037496638268599e-06,
"loss": 0.3005,
"mean_token_accuracy": 0.8867291510105133,
"num_tokens": 4477102.0,
"step": 512
},
{
"entropy": 1.7379968762397766,
"epoch": 1.6452648475120384,
"grad_norm": 12.632403373718262,
"learning_rate": 8.029070688619013e-06,
"loss": 0.2961,
"mean_token_accuracy": 0.8926231265068054,
"num_tokens": 4487287.0,
"step": 513
},
{
"entropy": 1.3730071783065796,
"epoch": 1.6484751203852328,
"grad_norm": 2.8596882820129395,
"learning_rate": 8.020631127262681e-06,
"loss": 0.2615,
"mean_token_accuracy": 0.90137779712677,
"num_tokens": 4496229.0,
"step": 514
},
{
"entropy": 1.4635502099990845,
"epoch": 1.651685393258427,
"grad_norm": 3.51971173286438,
"learning_rate": 8.012177992124385e-06,
"loss": 0.3054,
"mean_token_accuracy": 0.8946611881256104,
"num_tokens": 4504147.0,
"step": 515
},
{
"entropy": 1.335837960243225,
"epoch": 1.6548956661316212,
"grad_norm": 2.9270989894866943,
"learning_rate": 8.003711321189895e-06,
"loss": 0.3361,
"mean_token_accuracy": 0.872332900762558,
"num_tokens": 4513383.0,
"step": 516
},
{
"entropy": 1.4679010510444641,
"epoch": 1.6581059390048154,
"grad_norm": 2.6027233600616455,
"learning_rate": 7.995231152505815e-06,
"loss": 0.2406,
"mean_token_accuracy": 0.9143733978271484,
"num_tokens": 4521270.0,
"step": 517
},
{
"entropy": 1.3284828662872314,
"epoch": 1.6613162118780096,
"grad_norm": 5.920923709869385,
"learning_rate": 7.986737524179398e-06,
"loss": 0.3287,
"mean_token_accuracy": 0.8773960471153259,
"num_tokens": 4530869.0,
"step": 518
},
{
"entropy": 1.5258394479751587,
"epoch": 1.664526484751204,
"grad_norm": 10.5205078125,
"learning_rate": 7.978230474378383e-06,
"loss": 0.2435,
"mean_token_accuracy": 0.9132257103919983,
"num_tokens": 4539566.0,
"step": 519
},
{
"entropy": 1.3652837872505188,
"epoch": 1.667736757624398,
"grad_norm": 2.759265661239624,
"learning_rate": 7.96971004133082e-06,
"loss": 0.3594,
"mean_token_accuracy": 0.8666320443153381,
"num_tokens": 4549459.0,
"step": 520
},
{
"entropy": 1.4656896591186523,
"epoch": 1.6709470304975924,
"grad_norm": 4.031230449676514,
"learning_rate": 7.961176263324902e-06,
"loss": 0.3404,
"mean_token_accuracy": 0.8758351504802704,
"num_tokens": 4558053.0,
"step": 521
},
{
"entropy": 1.383994698524475,
"epoch": 1.6741573033707864,
"grad_norm": 12.914740562438965,
"learning_rate": 7.952629178708783e-06,
"loss": 0.3391,
"mean_token_accuracy": 0.8680421113967896,
"num_tokens": 4566800.0,
"step": 522
},
{
"entropy": 1.3812520503997803,
"epoch": 1.6773675762439808,
"grad_norm": 3.0095322132110596,
"learning_rate": 7.944068825890424e-06,
"loss": 0.3193,
"mean_token_accuracy": 0.877071350812912,
"num_tokens": 4575949.0,
"step": 523
},
{
"entropy": 1.4497495293617249,
"epoch": 1.680577849117175,
"grad_norm": 3.192546844482422,
"learning_rate": 7.935495243337397e-06,
"loss": 0.2906,
"mean_token_accuracy": 0.8865289986133575,
"num_tokens": 4586128.0,
"step": 524
},
{
"entropy": 1.4661012291908264,
"epoch": 1.6837881219903692,
"grad_norm": 4.490471363067627,
"learning_rate": 7.92690846957673e-06,
"loss": 0.3304,
"mean_token_accuracy": 0.8626764714717865,
"num_tokens": 4595541.0,
"step": 525
},
{
"entropy": 1.4555456638336182,
"epoch": 1.6869983948635634,
"grad_norm": 3.7944064140319824,
"learning_rate": 7.918308543194735e-06,
"loss": 0.3044,
"mean_token_accuracy": 0.8946430087089539,
"num_tokens": 4604904.0,
"step": 526
},
{
"entropy": 1.4736084938049316,
"epoch": 1.6902086677367576,
"grad_norm": 3.330254077911377,
"learning_rate": 7.909695502836814e-06,
"loss": 0.3478,
"mean_token_accuracy": 0.8704387247562408,
"num_tokens": 4613636.0,
"step": 527
},
{
"entropy": 1.3352216482162476,
"epoch": 1.6934189406099518,
"grad_norm": 3.57326078414917,
"learning_rate": 7.90106938720731e-06,
"loss": 0.2742,
"mean_token_accuracy": 0.9011513888835907,
"num_tokens": 4622451.0,
"step": 528
},
{
"entropy": 1.4958056211471558,
"epoch": 1.696629213483146,
"grad_norm": 3.4741249084472656,
"learning_rate": 7.892430235069317e-06,
"loss": 0.2964,
"mean_token_accuracy": 0.8987282514572144,
"num_tokens": 4630703.0,
"step": 529
},
{
"entropy": 1.4583451747894287,
"epoch": 1.6998394863563404,
"grad_norm": 50.94432067871094,
"learning_rate": 7.883778085244514e-06,
"loss": 0.3036,
"mean_token_accuracy": 0.8844209909439087,
"num_tokens": 4638687.0,
"step": 530
},
{
"entropy": 1.5013746619224548,
"epoch": 1.7030497592295344,
"grad_norm": 2.947014808654785,
"learning_rate": 7.875112976612984e-06,
"loss": 0.3169,
"mean_token_accuracy": 0.8807470500469208,
"num_tokens": 4648423.0,
"step": 531
},
{
"entropy": 1.4312713742256165,
"epoch": 1.7062600321027288,
"grad_norm": 3.6040570735931396,
"learning_rate": 7.866434948113046e-06,
"loss": 0.2446,
"mean_token_accuracy": 0.9076212048530579,
"num_tokens": 4656129.0,
"step": 532
},
{
"entropy": 1.4022682309150696,
"epoch": 1.709470304975923,
"grad_norm": 2.8968658447265625,
"learning_rate": 7.857744038741076e-06,
"loss": 0.2686,
"mean_token_accuracy": 0.8959876894950867,
"num_tokens": 4664283.0,
"step": 533
},
{
"entropy": 1.4385973811149597,
"epoch": 1.7126805778491172,
"grad_norm": 5.81376838684082,
"learning_rate": 7.849040287551331e-06,
"loss": 0.2839,
"mean_token_accuracy": 0.899337649345398,
"num_tokens": 4672631.0,
"step": 534
},
{
"entropy": 1.347984254360199,
"epoch": 1.7158908507223114,
"grad_norm": 3.789095401763916,
"learning_rate": 7.84032373365578e-06,
"loss": 0.3542,
"mean_token_accuracy": 0.851859450340271,
"num_tokens": 4681402.0,
"step": 535
},
{
"entropy": 1.2535653114318848,
"epoch": 1.7191011235955056,
"grad_norm": 3.5626060962677,
"learning_rate": 7.831594416223916e-06,
"loss": 0.2727,
"mean_token_accuracy": 0.8892323970794678,
"num_tokens": 4689845.0,
"step": 536
},
{
"entropy": 1.5344293117523193,
"epoch": 1.7223113964686998,
"grad_norm": 3.13451886177063,
"learning_rate": 7.822852374482597e-06,
"loss": 0.254,
"mean_token_accuracy": 0.912846565246582,
"num_tokens": 4698327.0,
"step": 537
},
{
"entropy": 1.2384685277938843,
"epoch": 1.725521669341894,
"grad_norm": 2.712188482284546,
"learning_rate": 7.814097647715848e-06,
"loss": 0.2943,
"mean_token_accuracy": 0.892603188753128,
"num_tokens": 4708113.0,
"step": 538
},
{
"entropy": 1.2682060599327087,
"epoch": 1.7287319422150884,
"grad_norm": 6.144076824188232,
"learning_rate": 7.805330275264707e-06,
"loss": 0.3334,
"mean_token_accuracy": 0.8756844997406006,
"num_tokens": 4717320.0,
"step": 539
},
{
"entropy": 1.5746580362319946,
"epoch": 1.7319422150882824,
"grad_norm": 4.383955478668213,
"learning_rate": 7.796550296527032e-06,
"loss": 0.2648,
"mean_token_accuracy": 0.8971992433071136,
"num_tokens": 4725706.0,
"step": 540
},
{
"entropy": 1.4106557965278625,
"epoch": 1.7351524879614768,
"grad_norm": 3.649662733078003,
"learning_rate": 7.787757750957335e-06,
"loss": 0.2688,
"mean_token_accuracy": 0.900052547454834,
"num_tokens": 4733543.0,
"step": 541
},
{
"entropy": 1.468329668045044,
"epoch": 1.7383627608346708,
"grad_norm": 3.449719190597534,
"learning_rate": 7.778952678066591e-06,
"loss": 0.3198,
"mean_token_accuracy": 0.874014675617218,
"num_tokens": 4742070.0,
"step": 542
},
{
"entropy": 1.3497502207756042,
"epoch": 1.7415730337078652,
"grad_norm": 6.58897066116333,
"learning_rate": 7.77013511742208e-06,
"loss": 0.3388,
"mean_token_accuracy": 0.8744527399539948,
"num_tokens": 4751985.0,
"step": 543
},
{
"entropy": 1.2957723736763,
"epoch": 1.7447833065810594,
"grad_norm": 4.113377094268799,
"learning_rate": 7.761305108647188e-06,
"loss": 0.2744,
"mean_token_accuracy": 0.8848893344402313,
"num_tokens": 4760494.0,
"step": 544
},
{
"entropy": 1.4583874940872192,
"epoch": 1.7479935794542536,
"grad_norm": 4.075041770935059,
"learning_rate": 7.752462691421245e-06,
"loss": 0.2886,
"mean_token_accuracy": 0.8956989943981171,
"num_tokens": 4769447.0,
"step": 545
},
{
"entropy": 1.2832042574882507,
"epoch": 1.7512038523274478,
"grad_norm": 2.9702274799346924,
"learning_rate": 7.743607905479338e-06,
"loss": 0.2636,
"mean_token_accuracy": 0.9015891551971436,
"num_tokens": 4777454.0,
"step": 546
},
{
"entropy": 1.4110126495361328,
"epoch": 1.754414125200642,
"grad_norm": 3.432813882827759,
"learning_rate": 7.734740790612137e-06,
"loss": 0.2805,
"mean_token_accuracy": 0.8956256210803986,
"num_tokens": 4785393.0,
"step": 547
},
{
"entropy": 1.3326915502548218,
"epoch": 1.7576243980738364,
"grad_norm": 3.350756883621216,
"learning_rate": 7.72586138666571e-06,
"loss": 0.3388,
"mean_token_accuracy": 0.8730327486991882,
"num_tokens": 4795285.0,
"step": 548
},
{
"entropy": 1.433797538280487,
"epoch": 1.7608346709470304,
"grad_norm": 3.5722227096557617,
"learning_rate": 7.716969733541357e-06,
"loss": 0.2556,
"mean_token_accuracy": 0.9026345610618591,
"num_tokens": 4802574.0,
"step": 549
},
{
"entropy": 1.4505755305290222,
"epoch": 1.7640449438202248,
"grad_norm": 3.7785494327545166,
"learning_rate": 7.708065871195413e-06,
"loss": 0.2808,
"mean_token_accuracy": 0.8939539194107056,
"num_tokens": 4811257.0,
"step": 550
},
{
"entropy": 1.4082772731781006,
"epoch": 1.7672552166934188,
"grad_norm": 3.876687526702881,
"learning_rate": 7.699149839639086e-06,
"loss": 0.3146,
"mean_token_accuracy": 0.8838759064674377,
"num_tokens": 4818898.0,
"step": 551
},
{
"entropy": 1.3682604432106018,
"epoch": 1.7704654895666132,
"grad_norm": 16.92738914489746,
"learning_rate": 7.690221678938258e-06,
"loss": 0.2918,
"mean_token_accuracy": 0.8928067088127136,
"num_tokens": 4826651.0,
"step": 552
},
{
"entropy": 1.310297429561615,
"epoch": 1.7736757624398074,
"grad_norm": 2.7511773109436035,
"learning_rate": 7.681281429213328e-06,
"loss": 0.284,
"mean_token_accuracy": 0.8867529034614563,
"num_tokens": 4835780.0,
"step": 553
},
{
"entropy": 1.3972845673561096,
"epoch": 1.7768860353130016,
"grad_norm": 4.055933475494385,
"learning_rate": 7.672329130639007e-06,
"loss": 0.2989,
"mean_token_accuracy": 0.8902477920055389,
"num_tokens": 4844260.0,
"step": 554
},
{
"entropy": 1.6165253520011902,
"epoch": 1.7800963081861958,
"grad_norm": 3.326050043106079,
"learning_rate": 7.663364823444157e-06,
"loss": 0.2543,
"mean_token_accuracy": 0.9065029919147491,
"num_tokens": 4852457.0,
"step": 555
},
{
"entropy": 1.5481394529342651,
"epoch": 1.78330658105939,
"grad_norm": 3.191687822341919,
"learning_rate": 7.654388547911605e-06,
"loss": 0.3421,
"mean_token_accuracy": 0.8783987462520599,
"num_tokens": 4861111.0,
"step": 556
},
{
"entropy": 1.4075528979301453,
"epoch": 1.7865168539325844,
"grad_norm": 4.169843673706055,
"learning_rate": 7.645400344377953e-06,
"loss": 0.3012,
"mean_token_accuracy": 0.8839040398597717,
"num_tokens": 4870169.0,
"step": 557
},
{
"entropy": 1.6146376132965088,
"epoch": 1.7897271268057784,
"grad_norm": 4.2372331619262695,
"learning_rate": 7.63640025323341e-06,
"loss": 0.2876,
"mean_token_accuracy": 0.8897527158260345,
"num_tokens": 4878024.0,
"step": 558
},
{
"entropy": 1.3466166257858276,
"epoch": 1.7929373996789728,
"grad_norm": 2.635805130004883,
"learning_rate": 7.627388314921602e-06,
"loss": 0.2716,
"mean_token_accuracy": 0.8980992436408997,
"num_tokens": 4886585.0,
"step": 559
},
{
"entropy": 1.569422721862793,
"epoch": 1.7961476725521668,
"grad_norm": 4.736917018890381,
"learning_rate": 7.61836456993939e-06,
"loss": 0.3077,
"mean_token_accuracy": 0.8818827569484711,
"num_tokens": 4894236.0,
"step": 560
},
{
"entropy": 1.2981528639793396,
"epoch": 1.7993579454253612,
"grad_norm": 2.962735891342163,
"learning_rate": 7.609329058836694e-06,
"loss": 0.2825,
"mean_token_accuracy": 0.8849244713783264,
"num_tokens": 4904278.0,
"step": 561
},
{
"entropy": 1.3867509961128235,
"epoch": 1.8025682182985554,
"grad_norm": 11.534844398498535,
"learning_rate": 7.600281822216307e-06,
"loss": 0.2866,
"mean_token_accuracy": 0.8904646039009094,
"num_tokens": 4912560.0,
"step": 562
},
{
"entropy": 1.3264847993850708,
"epoch": 1.8057784911717496,
"grad_norm": 3.852452516555786,
"learning_rate": 7.59122290073371e-06,
"loss": 0.3326,
"mean_token_accuracy": 0.8820092082023621,
"num_tokens": 4922110.0,
"step": 563
},
{
"entropy": 1.4996045231819153,
"epoch": 1.8089887640449438,
"grad_norm": 3.772239923477173,
"learning_rate": 7.582152335096896e-06,
"loss": 0.294,
"mean_token_accuracy": 0.8812500834465027,
"num_tokens": 4929180.0,
"step": 564
},
{
"entropy": 1.4106106758117676,
"epoch": 1.812199036918138,
"grad_norm": 3.0418591499328613,
"learning_rate": 7.5730701660661795e-06,
"loss": 0.3007,
"mean_token_accuracy": 0.8839779198169708,
"num_tokens": 4937696.0,
"step": 565
},
{
"entropy": 1.4185760021209717,
"epoch": 1.8154093097913324,
"grad_norm": 4.139153480529785,
"learning_rate": 7.563976434454021e-06,
"loss": 0.313,
"mean_token_accuracy": 0.8874213993549347,
"num_tokens": 4945516.0,
"step": 566
},
{
"entropy": 1.4388535618782043,
"epoch": 1.8186195826645264,
"grad_norm": 3.0319912433624268,
"learning_rate": 7.554871181124836e-06,
"loss": 0.2406,
"mean_token_accuracy": 0.9032376706600189,
"num_tokens": 4953130.0,
"step": 567
},
{
"entropy": 1.437139868736267,
"epoch": 1.8218298555377208,
"grad_norm": 2.6992785930633545,
"learning_rate": 7.5457544469948164e-06,
"loss": 0.281,
"mean_token_accuracy": 0.8920381367206573,
"num_tokens": 4961942.0,
"step": 568
},
{
"entropy": 1.335317313671112,
"epoch": 1.8250401284109148,
"grad_norm": 3.9841814041137695,
"learning_rate": 7.536626273031747e-06,
"loss": 0.2964,
"mean_token_accuracy": 0.8849454522132874,
"num_tokens": 4969921.0,
"step": 569
},
{
"entropy": 1.6164771914482117,
"epoch": 1.8282504012841092,
"grad_norm": 4.4012770652771,
"learning_rate": 7.5274867002548154e-06,
"loss": 0.276,
"mean_token_accuracy": 0.9028495252132416,
"num_tokens": 4978461.0,
"step": 570
},
{
"entropy": 1.535290777683258,
"epoch": 1.8314606741573034,
"grad_norm": 25.161006927490234,
"learning_rate": 7.5183357697344395e-06,
"loss": 0.3082,
"mean_token_accuracy": 0.8934054970741272,
"num_tokens": 4986774.0,
"step": 571
},
{
"entropy": 1.2994403839111328,
"epoch": 1.8346709470304976,
"grad_norm": 2.891382932662964,
"learning_rate": 7.509173522592066e-06,
"loss": 0.334,
"mean_token_accuracy": 0.8758228123188019,
"num_tokens": 4995401.0,
"step": 572
},
{
"entropy": 1.3535541892051697,
"epoch": 1.8378812199036918,
"grad_norm": 3.3566458225250244,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3106,
"mean_token_accuracy": 0.8938397765159607,
"num_tokens": 5003723.0,
"step": 573
},
{
"entropy": 1.4879749417304993,
"epoch": 1.841091492776886,
"grad_norm": 14.512699127197266,
"learning_rate": 7.4908152431812175e-06,
"loss": 0.303,
"mean_token_accuracy": 0.8893711566925049,
"num_tokens": 5012065.0,
"step": 574
},
{
"entropy": 1.439945101737976,
"epoch": 1.8443017656500804,
"grad_norm": 3.0775973796844482,
"learning_rate": 7.481619293409173e-06,
"loss": 0.3103,
"mean_token_accuracy": 0.8844351768493652,
"num_tokens": 5020574.0,
"step": 575
},
{
"entropy": 1.339579463005066,
"epoch": 1.8475120385232744,
"grad_norm": 3.9711005687713623,
"learning_rate": 7.472412192007619e-06,
"loss": 0.3433,
"mean_token_accuracy": 0.8750526309013367,
"num_tokens": 5028805.0,
"step": 576
},
{
"entropy": 1.5097241401672363,
"epoch": 1.8507223113964688,
"grad_norm": 3.872302770614624,
"learning_rate": 7.4631939803504215e-06,
"loss": 0.3172,
"mean_token_accuracy": 0.8816109299659729,
"num_tokens": 5037794.0,
"step": 577
},
{
"entropy": 1.3860506415367126,
"epoch": 1.8539325842696628,
"grad_norm": 3.852682113647461,
"learning_rate": 7.453964699861376e-06,
"loss": 0.3045,
"mean_token_accuracy": 0.8891089558601379,
"num_tokens": 5045600.0,
"step": 578
},
{
"entropy": 1.2616366147994995,
"epoch": 1.8571428571428572,
"grad_norm": 3.328491449356079,
"learning_rate": 7.44472439201401e-06,
"loss": 0.379,
"mean_token_accuracy": 0.8576820492744446,
"num_tokens": 5055136.0,
"step": 579
},
{
"entropy": 1.4181751608848572,
"epoch": 1.8603531300160514,
"grad_norm": 9.961250305175781,
"learning_rate": 7.435473098331411e-06,
"loss": 0.3049,
"mean_token_accuracy": 0.88387331366539,
"num_tokens": 5064465.0,
"step": 580
},
{
"entropy": 1.3726850152015686,
"epoch": 1.8635634028892456,
"grad_norm": 3.580294609069824,
"learning_rate": 7.426210860386032e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.8814990222454071,
"num_tokens": 5073152.0,
"step": 581
},
{
"entropy": 1.542908489704132,
"epoch": 1.8667736757624398,
"grad_norm": 2.881855010986328,
"learning_rate": 7.416937719799502e-06,
"loss": 0.3088,
"mean_token_accuracy": 0.887274444103241,
"num_tokens": 5082323.0,
"step": 582
},
{
"entropy": 1.4042843580245972,
"epoch": 1.869983948635634,
"grad_norm": 6.571777820587158,
"learning_rate": 7.407653718242449e-06,
"loss": 0.3387,
"mean_token_accuracy": 0.8746259808540344,
"num_tokens": 5090828.0,
"step": 583
},
{
"entropy": 1.3728899359703064,
"epoch": 1.8731942215088284,
"grad_norm": 3.124652624130249,
"learning_rate": 7.398358897434303e-06,
"loss": 0.3169,
"mean_token_accuracy": 0.8831090331077576,
"num_tokens": 5099452.0,
"step": 584
},
{
"entropy": 1.6130582094192505,
"epoch": 1.8764044943820224,
"grad_norm": 4.153020858764648,
"learning_rate": 7.3890532991431174e-06,
"loss": 0.2957,
"mean_token_accuracy": 0.8826304972171783,
"num_tokens": 5107713.0,
"step": 585
},
{
"entropy": 1.5210551023483276,
"epoch": 1.8796147672552168,
"grad_norm": 2.822763204574585,
"learning_rate": 7.379736965185369e-06,
"loss": 0.2601,
"mean_token_accuracy": 0.903123527765274,
"num_tokens": 5117399.0,
"step": 586
},
{
"entropy": 1.421782910823822,
"epoch": 1.8828250401284108,
"grad_norm": 4.258233070373535,
"learning_rate": 7.370409937425781e-06,
"loss": 0.3441,
"mean_token_accuracy": 0.8721145987510681,
"num_tokens": 5125660.0,
"step": 587
},
{
"entropy": 1.4874483942985535,
"epoch": 1.8860353130016052,
"grad_norm": 4.4238972663879395,
"learning_rate": 7.361072257777132e-06,
"loss": 0.2908,
"mean_token_accuracy": 0.8859201669692993,
"num_tokens": 5134941.0,
"step": 588
},
{
"entropy": 1.47506844997406,
"epoch": 1.8892455858747994,
"grad_norm": 2.905021905899048,
"learning_rate": 7.3517239682000675e-06,
"loss": 0.3107,
"mean_token_accuracy": 0.8757966160774231,
"num_tokens": 5142323.0,
"step": 589
},
{
"entropy": 1.3628226518630981,
"epoch": 1.8924558587479936,
"grad_norm": 2.5774905681610107,
"learning_rate": 7.342365110702907e-06,
"loss": 0.3283,
"mean_token_accuracy": 0.8776254951953888,
"num_tokens": 5151640.0,
"step": 590
},
{
"entropy": 1.57500422000885,
"epoch": 1.8956661316211878,
"grad_norm": 3.3131790161132812,
"learning_rate": 7.332995727341462e-06,
"loss": 0.3328,
"mean_token_accuracy": 0.8693976402282715,
"num_tokens": 5162302.0,
"step": 591
},
{
"entropy": 1.5631265044212341,
"epoch": 1.898876404494382,
"grad_norm": 3.2233312129974365,
"learning_rate": 7.323615860218844e-06,
"loss": 0.3031,
"mean_token_accuracy": 0.8829487860202789,
"num_tokens": 5171343.0,
"step": 592
},
{
"entropy": 1.474395990371704,
"epoch": 1.9020866773675762,
"grad_norm": 3.0282936096191406,
"learning_rate": 7.314225551485273e-06,
"loss": 0.2859,
"mean_token_accuracy": 0.8863288462162018,
"num_tokens": 5179319.0,
"step": 593
},
{
"entropy": 1.5331798791885376,
"epoch": 1.9052969502407704,
"grad_norm": 3.055107593536377,
"learning_rate": 7.304824843337893e-06,
"loss": 0.2923,
"mean_token_accuracy": 0.8974760174751282,
"num_tokens": 5188212.0,
"step": 594
},
{
"entropy": 1.4196932315826416,
"epoch": 1.9085072231139648,
"grad_norm": 3.611940622329712,
"learning_rate": 7.295413778020579e-06,
"loss": 0.3169,
"mean_token_accuracy": 0.875687837600708,
"num_tokens": 5196437.0,
"step": 595
},
{
"entropy": 1.7519562244415283,
"epoch": 1.9117174959871588,
"grad_norm": 4.035586357116699,
"learning_rate": 7.285992397823747e-06,
"loss": 0.3085,
"mean_token_accuracy": 0.8901273906230927,
"num_tokens": 5205394.0,
"step": 596
},
{
"entropy": 1.423706591129303,
"epoch": 1.9149277688603532,
"grad_norm": 3.4519264698028564,
"learning_rate": 7.276560745084167e-06,
"loss": 0.3381,
"mean_token_accuracy": 0.8672617375850677,
"num_tokens": 5213855.0,
"step": 597
},
{
"entropy": 1.3611206412315369,
"epoch": 1.9181380417335474,
"grad_norm": 3.74892258644104,
"learning_rate": 7.267118862184767e-06,
"loss": 0.3482,
"mean_token_accuracy": 0.8646276295185089,
"num_tokens": 5223352.0,
"step": 598
},
{
"entropy": 1.2371296286582947,
"epoch": 1.9213483146067416,
"grad_norm": 5.596102237701416,
"learning_rate": 7.257666791554448e-06,
"loss": 0.3715,
"mean_token_accuracy": 0.8619127869606018,
"num_tokens": 5233034.0,
"step": 599
},
{
"entropy": 1.4176723957061768,
"epoch": 1.9245585874799358,
"grad_norm": 3.404355525970459,
"learning_rate": 7.248204575667893e-06,
"loss": 0.2603,
"mean_token_accuracy": 0.9018069803714752,
"num_tokens": 5240859.0,
"step": 600
},
{
"entropy": 1.5233874917030334,
"epoch": 1.92776886035313,
"grad_norm": 4.02052116394043,
"learning_rate": 7.2387322570453724e-06,
"loss": 0.2732,
"mean_token_accuracy": 0.8832527995109558,
"num_tokens": 5250145.0,
"step": 601
},
{
"entropy": 1.3011326789855957,
"epoch": 1.9309791332263242,
"grad_norm": 2.9296908378601074,
"learning_rate": 7.229249878252558e-06,
"loss": 0.3317,
"mean_token_accuracy": 0.8786691129207611,
"num_tokens": 5258015.0,
"step": 602
},
{
"entropy": 1.3884427547454834,
"epoch": 1.9341894060995184,
"grad_norm": 2.848806381225586,
"learning_rate": 7.219757481900325e-06,
"loss": 0.2806,
"mean_token_accuracy": 0.893255889415741,
"num_tokens": 5267533.0,
"step": 603
},
{
"entropy": 1.5049036145210266,
"epoch": 1.9373996789727128,
"grad_norm": 3.0091021060943604,
"learning_rate": 7.210255110644569e-06,
"loss": 0.2552,
"mean_token_accuracy": 0.9070955812931061,
"num_tokens": 5275537.0,
"step": 604
},
{
"entropy": 1.367910087108612,
"epoch": 1.9406099518459068,
"grad_norm": 4.554563522338867,
"learning_rate": 7.2007428071860045e-06,
"loss": 0.2996,
"mean_token_accuracy": 0.8829465806484222,
"num_tokens": 5284322.0,
"step": 605
},
{
"entropy": 1.5234755277633667,
"epoch": 1.9438202247191012,
"grad_norm": 7.072104454040527,
"learning_rate": 7.191220614269981e-06,
"loss": 0.2866,
"mean_token_accuracy": 0.8958406448364258,
"num_tokens": 5293226.0,
"step": 606
},
{
"entropy": 1.4429296255111694,
"epoch": 1.9470304975922952,
"grad_norm": 2.972801685333252,
"learning_rate": 7.181688574686292e-06,
"loss": 0.2546,
"mean_token_accuracy": 0.9038203060626984,
"num_tokens": 5301639.0,
"step": 607
},
{
"entropy": 1.3830759525299072,
"epoch": 1.9502407704654896,
"grad_norm": 3.0528810024261475,
"learning_rate": 7.17214673126897e-06,
"loss": 0.3471,
"mean_token_accuracy": 0.8751060366630554,
"num_tokens": 5310539.0,
"step": 608
},
{
"entropy": 1.6103679537773132,
"epoch": 1.9534510433386838,
"grad_norm": 3.2677433490753174,
"learning_rate": 7.162595126896111e-06,
"loss": 0.3372,
"mean_token_accuracy": 0.8642582893371582,
"num_tokens": 5320301.0,
"step": 609
},
{
"entropy": 1.3761922121047974,
"epoch": 1.956661316211878,
"grad_norm": 3.2383453845977783,
"learning_rate": 7.15303380448967e-06,
"loss": 0.3415,
"mean_token_accuracy": 0.8723717033863068,
"num_tokens": 5328446.0,
"step": 610
},
{
"entropy": 1.4116157293319702,
"epoch": 1.9598715890850722,
"grad_norm": 5.559144973754883,
"learning_rate": 7.143462807015271e-06,
"loss": 0.2742,
"mean_token_accuracy": 0.8913781046867371,
"num_tokens": 5337095.0,
"step": 611
},
{
"entropy": 1.2883580923080444,
"epoch": 1.9630818619582664,
"grad_norm": 5.594122886657715,
"learning_rate": 7.133882177482019e-06,
"loss": 0.2767,
"mean_token_accuracy": 0.8971899151802063,
"num_tokens": 5345189.0,
"step": 612
},
{
"entropy": 1.3326961398124695,
"epoch": 1.9662921348314608,
"grad_norm": 3.9941868782043457,
"learning_rate": 7.1242919589422974e-06,
"loss": 0.3223,
"mean_token_accuracy": 0.8792887628078461,
"num_tokens": 5353477.0,
"step": 613
},
{
"entropy": 1.463496744632721,
"epoch": 1.9695024077046548,
"grad_norm": 3.5513710975646973,
"learning_rate": 7.114692194491583e-06,
"loss": 0.3035,
"mean_token_accuracy": 0.883477658033371,
"num_tokens": 5361375.0,
"step": 614
},
{
"entropy": 1.2219607830047607,
"epoch": 1.9727126805778492,
"grad_norm": 3.10086989402771,
"learning_rate": 7.105082927268247e-06,
"loss": 0.3237,
"mean_token_accuracy": 0.8641158044338226,
"num_tokens": 5371062.0,
"step": 615
},
{
"entropy": 1.388469636440277,
"epoch": 1.9759229534510432,
"grad_norm": 4.3963398933410645,
"learning_rate": 7.095464200453366e-06,
"loss": 0.3199,
"mean_token_accuracy": 0.8787851929664612,
"num_tokens": 5380088.0,
"step": 616
},
{
"entropy": 1.4874065518379211,
"epoch": 1.9791332263242376,
"grad_norm": 4.8666205406188965,
"learning_rate": 7.085836057270521e-06,
"loss": 0.2764,
"mean_token_accuracy": 0.8986081182956696,
"num_tokens": 5388417.0,
"step": 617
},
{
"entropy": 1.3346271514892578,
"epoch": 1.9823434991974318,
"grad_norm": 3.119516611099243,
"learning_rate": 7.07619854098561e-06,
"loss": 0.2786,
"mean_token_accuracy": 0.8969616293907166,
"num_tokens": 5396308.0,
"step": 618
},
{
"entropy": 1.4097462892532349,
"epoch": 1.985553772070626,
"grad_norm": 4.254458427429199,
"learning_rate": 7.066551694906651e-06,
"loss": 0.2261,
"mean_token_accuracy": 0.90898796916008,
"num_tokens": 5403926.0,
"step": 619
},
{
"entropy": 1.433124840259552,
"epoch": 1.9887640449438202,
"grad_norm": 5.4737868309021,
"learning_rate": 7.056895562383585e-06,
"loss": 0.315,
"mean_token_accuracy": 0.8711326122283936,
"num_tokens": 5412353.0,
"step": 620
},
{
"entropy": 1.420740008354187,
"epoch": 1.9919743178170144,
"grad_norm": 7.06497049331665,
"learning_rate": 7.047230186808085e-06,
"loss": 0.3284,
"mean_token_accuracy": 0.8794163167476654,
"num_tokens": 5420634.0,
"step": 621
},
{
"entropy": 1.5400715470314026,
"epoch": 1.9951845906902088,
"grad_norm": 2.705223560333252,
"learning_rate": 7.0375556116133605e-06,
"loss": 0.3048,
"mean_token_accuracy": 0.888154536485672,
"num_tokens": 5430245.0,
"step": 622
},
{
"entropy": 1.3293360471725464,
"epoch": 1.9983948635634028,
"grad_norm": 3.812591314315796,
"learning_rate": 7.027871880273959e-06,
"loss": 0.3132,
"mean_token_accuracy": 0.881993293762207,
"num_tokens": 5437894.0,
"step": 623
},
{
"entropy": 1.294339656829834,
"epoch": 2.0,
"grad_norm": 4.192768573760986,
"learning_rate": 7.018179036305574e-06,
"loss": 0.252,
"mean_token_accuracy": 0.9110794067382812,
"num_tokens": 5442284.0,
"step": 624
},
{
"entropy": 1.4277611374855042,
"epoch": 2.0032102728731944,
"grad_norm": 3.0245370864868164,
"learning_rate": 7.008477123264849e-06,
"loss": 0.1706,
"mean_token_accuracy": 0.9453595578670502,
"num_tokens": 5451260.0,
"step": 625
},
{
"entropy": 1.4434685707092285,
"epoch": 2.0064205457463884,
"grad_norm": 3.368790626525879,
"learning_rate": 6.9987661847491786e-06,
"loss": 0.1528,
"mean_token_accuracy": 0.9484553039073944,
"num_tokens": 5459386.0,
"step": 626
},
{
"entropy": 1.2321021556854248,
"epoch": 2.009630818619583,
"grad_norm": 2.4026436805725098,
"learning_rate": 6.989046264396516e-06,
"loss": 0.131,
"mean_token_accuracy": 0.9575425088405609,
"num_tokens": 5467001.0,
"step": 627
},
{
"entropy": 1.2222203612327576,
"epoch": 2.012841091492777,
"grad_norm": 30.97420310974121,
"learning_rate": 6.9793174058851805e-06,
"loss": 0.148,
"mean_token_accuracy": 0.9470765292644501,
"num_tokens": 5475451.0,
"step": 628
},
{
"entropy": 1.422147512435913,
"epoch": 2.016051364365971,
"grad_norm": 7.181051731109619,
"learning_rate": 6.96957965293365e-06,
"loss": 0.1134,
"mean_token_accuracy": 0.9622469842433929,
"num_tokens": 5482704.0,
"step": 629
},
{
"entropy": 1.1400924921035767,
"epoch": 2.019261637239165,
"grad_norm": 2.26750111579895,
"learning_rate": 6.959833049300376e-06,
"loss": 0.146,
"mean_token_accuracy": 0.9331265091896057,
"num_tokens": 5492344.0,
"step": 630
},
{
"entropy": 1.236695945262909,
"epoch": 2.0224719101123596,
"grad_norm": 2.5727972984313965,
"learning_rate": 6.9500776387835785e-06,
"loss": 0.1295,
"mean_token_accuracy": 0.9469403326511383,
"num_tokens": 5501664.0,
"step": 631
},
{
"entropy": 1.193276584148407,
"epoch": 2.0256821829855536,
"grad_norm": 2.4318366050720215,
"learning_rate": 6.940313465221057e-06,
"loss": 0.1117,
"mean_token_accuracy": 0.9551983177661896,
"num_tokens": 5510226.0,
"step": 632
},
{
"entropy": 1.175793468952179,
"epoch": 2.028892455858748,
"grad_norm": 3.1776866912841797,
"learning_rate": 6.9305405724899876e-06,
"loss": 0.1635,
"mean_token_accuracy": 0.9337913990020752,
"num_tokens": 5519131.0,
"step": 633
},
{
"entropy": 1.1542350053787231,
"epoch": 2.0321027287319424,
"grad_norm": 3.319153308868408,
"learning_rate": 6.920759004506723e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9449957609176636,
"num_tokens": 5527804.0,
"step": 634
},
{
"entropy": 1.1090035438537598,
"epoch": 2.0353130016051364,
"grad_norm": 4.43798303604126,
"learning_rate": 6.91096880522661e-06,
"loss": 0.1159,
"mean_token_accuracy": 0.9575115442276001,
"num_tokens": 5535406.0,
"step": 635
},
{
"entropy": 1.0131879448890686,
"epoch": 2.038523274478331,
"grad_norm": 3.432020664215088,
"learning_rate": 6.90117001864377e-06,
"loss": 0.1494,
"mean_token_accuracy": 0.9273790717124939,
"num_tokens": 5545569.0,
"step": 636
},
{
"entropy": 1.1925968527793884,
"epoch": 2.041733547351525,
"grad_norm": 2.716158390045166,
"learning_rate": 6.891362688790925e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9510330855846405,
"num_tokens": 5553431.0,
"step": 637
},
{
"entropy": 1.1236230731010437,
"epoch": 2.044943820224719,
"grad_norm": 4.862785339355469,
"learning_rate": 6.8815468597391785e-06,
"loss": 0.204,
"mean_token_accuracy": 0.915119081735611,
"num_tokens": 5563682.0,
"step": 638
},
{
"entropy": 1.1460051536560059,
"epoch": 2.048154093097913,
"grad_norm": 5.143580436706543,
"learning_rate": 6.871722575597829e-06,
"loss": 0.118,
"mean_token_accuracy": 0.9572257995605469,
"num_tokens": 5571730.0,
"step": 639
},
{
"entropy": 1.2631003260612488,
"epoch": 2.0513643659711076,
"grad_norm": 2.6166398525238037,
"learning_rate": 6.8618898805141744e-06,
"loss": 0.1175,
"mean_token_accuracy": 0.9546981453895569,
"num_tokens": 5579619.0,
"step": 640
},
{
"entropy": 1.212298333644867,
"epoch": 2.0545746388443016,
"grad_norm": 4.810606002807617,
"learning_rate": 6.8520488186733e-06,
"loss": 0.1592,
"mean_token_accuracy": 0.9437867701053619,
"num_tokens": 5587467.0,
"step": 641
},
{
"entropy": 1.1090901494026184,
"epoch": 2.057784911717496,
"grad_norm": 2.9815127849578857,
"learning_rate": 6.8421994342979e-06,
"loss": 0.1286,
"mean_token_accuracy": 0.9465640485286713,
"num_tokens": 5595910.0,
"step": 642
},
{
"entropy": 1.1284350156784058,
"epoch": 2.0609951845906904,
"grad_norm": 2.8620505332946777,
"learning_rate": 6.832341771648057e-06,
"loss": 0.1785,
"mean_token_accuracy": 0.9314178228378296,
"num_tokens": 5605815.0,
"step": 643
},
{
"entropy": 1.2043840885162354,
"epoch": 2.0642054574638844,
"grad_norm": 5.458688735961914,
"learning_rate": 6.822475875021057e-06,
"loss": 0.1485,
"mean_token_accuracy": 0.9459114074707031,
"num_tokens": 5615275.0,
"step": 644
},
{
"entropy": 1.3176180124282837,
"epoch": 2.067415730337079,
"grad_norm": 2.8183062076568604,
"learning_rate": 6.812601788751192e-06,
"loss": 0.1116,
"mean_token_accuracy": 0.9547081887722015,
"num_tokens": 5623783.0,
"step": 645
},
{
"entropy": 1.2037148475646973,
"epoch": 2.070626003210273,
"grad_norm": 2.6667561531066895,
"learning_rate": 6.802719557209547e-06,
"loss": 0.1381,
"mean_token_accuracy": 0.954013854265213,
"num_tokens": 5631939.0,
"step": 646
},
{
"entropy": 1.1883854269981384,
"epoch": 2.073836276083467,
"grad_norm": 2.371598243713379,
"learning_rate": 6.792829224803816e-06,
"loss": 0.1445,
"mean_token_accuracy": 0.9299385249614716,
"num_tokens": 5641553.0,
"step": 647
},
{
"entropy": 1.263631522655487,
"epoch": 2.077046548956661,
"grad_norm": 2.858833074569702,
"learning_rate": 6.782930835978094e-06,
"loss": 0.117,
"mean_token_accuracy": 0.9559842646121979,
"num_tokens": 5650631.0,
"step": 648
},
{
"entropy": 1.2522808909416199,
"epoch": 2.0802568218298556,
"grad_norm": 5.837004661560059,
"learning_rate": 6.773024435212678e-06,
"loss": 0.1141,
"mean_token_accuracy": 0.9567996859550476,
"num_tokens": 5659252.0,
"step": 649
},
{
"entropy": 1.1148146390914917,
"epoch": 2.0834670947030496,
"grad_norm": 3.9768710136413574,
"learning_rate": 6.76311006702387e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.9364666640758514,
"num_tokens": 5668324.0,
"step": 650
},
{
"entropy": 1.3358003497123718,
"epoch": 2.086677367576244,
"grad_norm": 2.3943331241607666,
"learning_rate": 6.753187775963773e-06,
"loss": 0.1011,
"mean_token_accuracy": 0.9628923833370209,
"num_tokens": 5676989.0,
"step": 651
},
{
"entropy": 1.0956073999404907,
"epoch": 2.0898876404494384,
"grad_norm": 2.8093678951263428,
"learning_rate": 6.743257606620094e-06,
"loss": 0.12,
"mean_token_accuracy": 0.953659862279892,
"num_tokens": 5685189.0,
"step": 652
},
{
"entropy": 1.2954540252685547,
"epoch": 2.0930979133226324,
"grad_norm": 2.636671304702759,
"learning_rate": 6.733319603615941e-06,
"loss": 0.1617,
"mean_token_accuracy": 0.9414326548576355,
"num_tokens": 5694647.0,
"step": 653
},
{
"entropy": 1.1856536865234375,
"epoch": 2.096308186195827,
"grad_norm": 3.4469292163848877,
"learning_rate": 6.723373811609628e-06,
"loss": 0.1108,
"mean_token_accuracy": 0.9572050869464874,
"num_tokens": 5701968.0,
"step": 654
},
{
"entropy": 1.061371386051178,
"epoch": 2.099518459069021,
"grad_norm": 3.5963852405548096,
"learning_rate": 6.713420275294467e-06,
"loss": 0.1472,
"mean_token_accuracy": 0.9384236931800842,
"num_tokens": 5710326.0,
"step": 655
},
{
"entropy": 1.1693629026412964,
"epoch": 2.102728731942215,
"grad_norm": 3.4809770584106445,
"learning_rate": 6.703459039398571e-06,
"loss": 0.1198,
"mean_token_accuracy": 0.951190173625946,
"num_tokens": 5718963.0,
"step": 656
},
{
"entropy": 1.1097606420516968,
"epoch": 2.105939004815409,
"grad_norm": 5.711026191711426,
"learning_rate": 6.693490148684654e-06,
"loss": 0.1431,
"mean_token_accuracy": 0.9451210498809814,
"num_tokens": 5727156.0,
"step": 657
},
{
"entropy": 1.307206630706787,
"epoch": 2.1091492776886036,
"grad_norm": 3.316901206970215,
"learning_rate": 6.683513647949826e-06,
"loss": 0.1198,
"mean_token_accuracy": 0.9504996240139008,
"num_tokens": 5735593.0,
"step": 658
},
{
"entropy": 1.1543167233467102,
"epoch": 2.1123595505617976,
"grad_norm": 2.2867281436920166,
"learning_rate": 6.673529582025398e-06,
"loss": 0.1263,
"mean_token_accuracy": 0.9489535987377167,
"num_tokens": 5743790.0,
"step": 659
},
{
"entropy": 1.0919539332389832,
"epoch": 2.115569823434992,
"grad_norm": 2.8135688304901123,
"learning_rate": 6.66353799577667e-06,
"loss": 0.1146,
"mean_token_accuracy": 0.9532299339771271,
"num_tokens": 5751660.0,
"step": 660
},
{
"entropy": 1.1081830263137817,
"epoch": 2.1187800963081864,
"grad_norm": 2.3218095302581787,
"learning_rate": 6.653538934102743e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.9097401201725006,
"num_tokens": 5761720.0,
"step": 661
},
{
"entropy": 1.2457672357559204,
"epoch": 2.1219903691813804,
"grad_norm": 13.804795265197754,
"learning_rate": 6.643532441936307e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9488136768341064,
"num_tokens": 5769431.0,
"step": 662
},
{
"entropy": 1.1809271574020386,
"epoch": 2.125200642054575,
"grad_norm": 6.929125785827637,
"learning_rate": 6.633518564243442e-06,
"loss": 0.0981,
"mean_token_accuracy": 0.9624985456466675,
"num_tokens": 5776379.0,
"step": 663
},
{
"entropy": 1.1943358182907104,
"epoch": 2.128410914927769,
"grad_norm": 3.932882070541382,
"learning_rate": 6.6234973460234184e-06,
"loss": 0.1083,
"mean_token_accuracy": 0.9579845666885376,
"num_tokens": 5785230.0,
"step": 664
},
{
"entropy": 1.1744264364242554,
"epoch": 2.131621187800963,
"grad_norm": 2.690080165863037,
"learning_rate": 6.6134688323084884e-06,
"loss": 0.1561,
"mean_token_accuracy": 0.9294100701808929,
"num_tokens": 5793482.0,
"step": 665
},
{
"entropy": 0.9881645143032074,
"epoch": 2.134831460674157,
"grad_norm": 4.003161430358887,
"learning_rate": 6.603433068163694e-06,
"loss": 0.154,
"mean_token_accuracy": 0.9402081966400146,
"num_tokens": 5803065.0,
"step": 666
},
{
"entropy": 1.1991289258003235,
"epoch": 2.1380417335473516,
"grad_norm": 3.018016815185547,
"learning_rate": 6.593390098686653e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9330925345420837,
"num_tokens": 5813306.0,
"step": 667
},
{
"entropy": 1.0930429697036743,
"epoch": 2.1412520064205456,
"grad_norm": 5.356147289276123,
"learning_rate": 6.583339969007364e-06,
"loss": 0.1285,
"mean_token_accuracy": 0.9488291144371033,
"num_tokens": 5821225.0,
"step": 668
},
{
"entropy": 1.146267056465149,
"epoch": 2.14446227929374,
"grad_norm": 4.305771827697754,
"learning_rate": 6.573282724288001e-06,
"loss": 0.12,
"mean_token_accuracy": 0.9525870680809021,
"num_tokens": 5829679.0,
"step": 669
},
{
"entropy": 1.1834629774093628,
"epoch": 2.1476725521669344,
"grad_norm": 3.5672402381896973,
"learning_rate": 6.563218409722712e-06,
"loss": 0.1158,
"mean_token_accuracy": 0.9621096253395081,
"num_tokens": 5837110.0,
"step": 670
},
{
"entropy": 1.25631844997406,
"epoch": 2.1508828250401284,
"grad_norm": 8.232504844665527,
"learning_rate": 6.553147070537413e-06,
"loss": 0.1041,
"mean_token_accuracy": 0.9604884684085846,
"num_tokens": 5845214.0,
"step": 671
},
{
"entropy": 1.1267945170402527,
"epoch": 2.154093097913323,
"grad_norm": 10.115373611450195,
"learning_rate": 6.543068751989585e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.9541674256324768,
"num_tokens": 5854190.0,
"step": 672
},
{
"entropy": 1.2294913530349731,
"epoch": 2.157303370786517,
"grad_norm": 2.8828227519989014,
"learning_rate": 6.532983499368078e-06,
"loss": 0.1436,
"mean_token_accuracy": 0.9478682279586792,
"num_tokens": 5862906.0,
"step": 673
},
{
"entropy": 1.118057906627655,
"epoch": 2.160513643659711,
"grad_norm": 3.830436944961548,
"learning_rate": 6.522891357992895e-06,
"loss": 0.1177,
"mean_token_accuracy": 0.9477755129337311,
"num_tokens": 5871654.0,
"step": 674
},
{
"entropy": 1.1084845662117004,
"epoch": 2.163723916532905,
"grad_norm": 2.5382556915283203,
"learning_rate": 6.512792373215e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9324210584163666,
"num_tokens": 5880988.0,
"step": 675
},
{
"entropy": 1.1747573018074036,
"epoch": 2.1669341894060996,
"grad_norm": 4.374128818511963,
"learning_rate": 6.502686590416105e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.9460614025592804,
"num_tokens": 5890735.0,
"step": 676
},
{
"entropy": 1.1294305920600891,
"epoch": 2.1701444622792936,
"grad_norm": 2.974681854248047,
"learning_rate": 6.492574055008474e-06,
"loss": 0.1403,
"mean_token_accuracy": 0.9466178119182587,
"num_tokens": 5899074.0,
"step": 677
},
{
"entropy": 1.0897773504257202,
"epoch": 2.173354735152488,
"grad_norm": 2.3436830043792725,
"learning_rate": 6.482454812434711e-06,
"loss": 0.1215,
"mean_token_accuracy": 0.9511640667915344,
"num_tokens": 5907654.0,
"step": 678
},
{
"entropy": 1.1333916187286377,
"epoch": 2.176565008025682,
"grad_norm": 2.5949723720550537,
"learning_rate": 6.472328908167562e-06,
"loss": 0.1084,
"mean_token_accuracy": 0.9622556865215302,
"num_tokens": 5915090.0,
"step": 679
},
{
"entropy": 1.1536172032356262,
"epoch": 2.1797752808988764,
"grad_norm": 2.2703359127044678,
"learning_rate": 6.4621963877097105e-06,
"loss": 0.1148,
"mean_token_accuracy": 0.9524502754211426,
"num_tokens": 5922195.0,
"step": 680
},
{
"entropy": 1.0362255573272705,
"epoch": 2.182985553772071,
"grad_norm": 2.933612823486328,
"learning_rate": 6.452057296593568e-06,
"loss": 0.1539,
"mean_token_accuracy": 0.9349975883960724,
"num_tokens": 5931493.0,
"step": 681
},
{
"entropy": 1.0833754539489746,
"epoch": 2.186195826645265,
"grad_norm": 3.006075382232666,
"learning_rate": 6.441911680381074e-06,
"loss": 0.1322,
"mean_token_accuracy": 0.9482509791851044,
"num_tokens": 5939442.0,
"step": 682
},
{
"entropy": 0.9261104166507721,
"epoch": 2.189406099518459,
"grad_norm": 4.065014362335205,
"learning_rate": 6.431759584663492e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9163274765014648,
"num_tokens": 5950169.0,
"step": 683
},
{
"entropy": 1.121094822883606,
"epoch": 2.192616372391653,
"grad_norm": 2.5804195404052734,
"learning_rate": 6.421601055061195e-06,
"loss": 0.1381,
"mean_token_accuracy": 0.92803093791008,
"num_tokens": 5960170.0,
"step": 684
},
{
"entropy": 1.1522566080093384,
"epoch": 2.1958266452648476,
"grad_norm": 2.320777416229248,
"learning_rate": 6.411436137223479e-06,
"loss": 0.107,
"mean_token_accuracy": 0.9578154981136322,
"num_tokens": 5968542.0,
"step": 685
},
{
"entropy": 1.353051781654358,
"epoch": 2.1990369181380416,
"grad_norm": 4.244365215301514,
"learning_rate": 6.401264876828335e-06,
"loss": 0.0945,
"mean_token_accuracy": 0.9666432440280914,
"num_tokens": 5976718.0,
"step": 686
},
{
"entropy": 1.0453286170959473,
"epoch": 2.202247191011236,
"grad_norm": 6.80914306640625,
"learning_rate": 6.391087319582264e-06,
"loss": 0.1441,
"mean_token_accuracy": 0.9426902532577515,
"num_tokens": 5986164.0,
"step": 687
},
{
"entropy": 1.2862181663513184,
"epoch": 2.20545746388443,
"grad_norm": 2.674420118331909,
"learning_rate": 6.38090351122006e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.9401120543479919,
"num_tokens": 5995535.0,
"step": 688
},
{
"entropy": 1.223171889781952,
"epoch": 2.2086677367576244,
"grad_norm": 2.823160171508789,
"learning_rate": 6.370713497504607e-06,
"loss": 0.1466,
"mean_token_accuracy": 0.9436621367931366,
"num_tokens": 6003683.0,
"step": 689
},
{
"entropy": 1.1387187242507935,
"epoch": 2.211878009630819,
"grad_norm": 3.315049171447754,
"learning_rate": 6.360517324226676e-06,
"loss": 0.117,
"mean_token_accuracy": 0.9554562270641327,
"num_tokens": 6012357.0,
"step": 690
},
{
"entropy": 1.0211151242256165,
"epoch": 2.215088282504013,
"grad_norm": 2.7127597332000732,
"learning_rate": 6.350315037204714e-06,
"loss": 0.1254,
"mean_token_accuracy": 0.9496433734893799,
"num_tokens": 6020588.0,
"step": 691
},
{
"entropy": 1.149724304676056,
"epoch": 2.218298555377207,
"grad_norm": 3.5706706047058105,
"learning_rate": 6.340106682284645e-06,
"loss": 0.1244,
"mean_token_accuracy": 0.9411612749099731,
"num_tokens": 6028693.0,
"step": 692
},
{
"entropy": 1.1703895926475525,
"epoch": 2.221508828250401,
"grad_norm": 3.06144380569458,
"learning_rate": 6.329892305339659e-06,
"loss": 0.143,
"mean_token_accuracy": 0.9452816843986511,
"num_tokens": 6037889.0,
"step": 693
},
{
"entropy": 1.172494113445282,
"epoch": 2.2247191011235956,
"grad_norm": 7.45186185836792,
"learning_rate": 6.319671952270004e-06,
"loss": 0.135,
"mean_token_accuracy": 0.9451717436313629,
"num_tokens": 6045716.0,
"step": 694
},
{
"entropy": 1.128127098083496,
"epoch": 2.2279293739967896,
"grad_norm": 2.54144549369812,
"learning_rate": 6.309445669002787e-06,
"loss": 0.1349,
"mean_token_accuracy": 0.9428853690624237,
"num_tokens": 6054100.0,
"step": 695
},
{
"entropy": 1.1079555749893188,
"epoch": 2.231139646869984,
"grad_norm": 2.946728467941284,
"learning_rate": 6.299213501491761e-06,
"loss": 0.1523,
"mean_token_accuracy": 0.9281862378120422,
"num_tokens": 6063316.0,
"step": 696
},
{
"entropy": 1.2575078010559082,
"epoch": 2.234349919743178,
"grad_norm": 3.1350910663604736,
"learning_rate": 6.288975495717124e-06,
"loss": 0.1302,
"mean_token_accuracy": 0.9531635940074921,
"num_tokens": 6071694.0,
"step": 697
},
{
"entropy": 1.1943337321281433,
"epoch": 2.2375601926163724,
"grad_norm": 3.640120267868042,
"learning_rate": 6.2787316976853045e-06,
"loss": 0.1566,
"mean_token_accuracy": 0.9265032112598419,
"num_tokens": 6081857.0,
"step": 698
},
{
"entropy": 1.1847606897354126,
"epoch": 2.240770465489567,
"grad_norm": 2.607599973678589,
"learning_rate": 6.268482153428763e-06,
"loss": 0.1602,
"mean_token_accuracy": 0.923068642616272,
"num_tokens": 6090548.0,
"step": 699
},
{
"entropy": 1.184391736984253,
"epoch": 2.243980738362761,
"grad_norm": 2.14422869682312,
"learning_rate": 6.258226909005783e-06,
"loss": 0.0921,
"mean_token_accuracy": 0.9700891375541687,
"num_tokens": 6098175.0,
"step": 700
},
{
"entropy": 1.1512706875801086,
"epoch": 2.247191011235955,
"grad_norm": 2.4187541007995605,
"learning_rate": 6.247966010500258e-06,
"loss": 0.1196,
"mean_token_accuracy": 0.9588777124881744,
"num_tokens": 6106064.0,
"step": 701
},
{
"entropy": 0.9829612672328949,
"epoch": 2.250401284109149,
"grad_norm": 3.0330357551574707,
"learning_rate": 6.237699504021495e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9402399659156799,
"num_tokens": 6114865.0,
"step": 702
},
{
"entropy": 1.2972161173820496,
"epoch": 2.2536115569823436,
"grad_norm": 2.9143383502960205,
"learning_rate": 6.227427435703997e-06,
"loss": 0.1475,
"mean_token_accuracy": 0.9466615319252014,
"num_tokens": 6123153.0,
"step": 703
},
{
"entropy": 1.2215816974639893,
"epoch": 2.2568218298555376,
"grad_norm": 4.461921691894531,
"learning_rate": 6.217149851707261e-06,
"loss": 0.1094,
"mean_token_accuracy": 0.9588871896266937,
"num_tokens": 6130391.0,
"step": 704
},
{
"entropy": 1.1384071707725525,
"epoch": 2.260032102728732,
"grad_norm": 2.7823870182037354,
"learning_rate": 6.206866798215571e-06,
"loss": 0.1294,
"mean_token_accuracy": 0.9387631118297577,
"num_tokens": 6139453.0,
"step": 705
},
{
"entropy": 1.029437243938446,
"epoch": 2.263242375601926,
"grad_norm": 2.3868653774261475,
"learning_rate": 6.1965783214377895e-06,
"loss": 0.1529,
"mean_token_accuracy": 0.9455865919589996,
"num_tokens": 6148848.0,
"step": 706
},
{
"entropy": 1.1833890676498413,
"epoch": 2.2664526484751204,
"grad_norm": 6.3576178550720215,
"learning_rate": 6.186284467607149e-06,
"loss": 0.1385,
"mean_token_accuracy": 0.9505321979522705,
"num_tokens": 6156345.0,
"step": 707
},
{
"entropy": 1.0868958830833435,
"epoch": 2.2696629213483144,
"grad_norm": 3.012101650238037,
"learning_rate": 6.175985282981042e-06,
"loss": 0.1352,
"mean_token_accuracy": 0.9491030275821686,
"num_tokens": 6165226.0,
"step": 708
},
{
"entropy": 1.2288443446159363,
"epoch": 2.272873194221509,
"grad_norm": 3.23870587348938,
"learning_rate": 6.165680813840822e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.9390627443790436,
"num_tokens": 6174349.0,
"step": 709
},
{
"entropy": 1.2095741033554077,
"epoch": 2.276083467094703,
"grad_norm": 2.6538543701171875,
"learning_rate": 6.155371106491584e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.9434215128421783,
"num_tokens": 6181879.0,
"step": 710
},
{
"entropy": 1.1225184798240662,
"epoch": 2.279293739967897,
"grad_norm": 4.149666786193848,
"learning_rate": 6.1450562072619635e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9551192224025726,
"num_tokens": 6189233.0,
"step": 711
},
{
"entropy": 0.990815132856369,
"epoch": 2.2825040128410916,
"grad_norm": 2.544497489929199,
"learning_rate": 6.134736162503929e-06,
"loss": 0.156,
"mean_token_accuracy": 0.9360251128673553,
"num_tokens": 6197991.0,
"step": 712
},
{
"entropy": 1.1359922289848328,
"epoch": 2.2857142857142856,
"grad_norm": 2.076765537261963,
"learning_rate": 6.124411018592568e-06,
"loss": 0.1079,
"mean_token_accuracy": 0.9633454084396362,
"num_tokens": 6206731.0,
"step": 713
},
{
"entropy": 1.1928575038909912,
"epoch": 2.28892455858748,
"grad_norm": 3.9280059337615967,
"learning_rate": 6.114080821925885e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9514023959636688,
"num_tokens": 6215379.0,
"step": 714
},
{
"entropy": 0.9379362761974335,
"epoch": 2.292134831460674,
"grad_norm": 3.893974542617798,
"learning_rate": 6.103745618924587e-06,
"loss": 0.1763,
"mean_token_accuracy": 0.9305950999259949,
"num_tokens": 6225119.0,
"step": 715
},
{
"entropy": 1.1294885873794556,
"epoch": 2.2953451043338684,
"grad_norm": 3.6253561973571777,
"learning_rate": 6.09340545603188e-06,
"loss": 0.1135,
"mean_token_accuracy": 0.954678863286972,
"num_tokens": 6233378.0,
"step": 716
},
{
"entropy": 1.2344108819961548,
"epoch": 2.2985553772070624,
"grad_norm": 2.6362674236297607,
"learning_rate": 6.0830603797132574e-06,
"loss": 0.1134,
"mean_token_accuracy": 0.9517610669136047,
"num_tokens": 6242391.0,
"step": 717
},
{
"entropy": 1.053533911705017,
"epoch": 2.301765650080257,
"grad_norm": 2.504680633544922,
"learning_rate": 6.072710436456293e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.9305368363857269,
"num_tokens": 6252383.0,
"step": 718
},
{
"entropy": 1.176681101322174,
"epoch": 2.304975922953451,
"grad_norm": 2.3614320755004883,
"learning_rate": 6.0623556727704306e-06,
"loss": 0.1222,
"mean_token_accuracy": 0.9493369460105896,
"num_tokens": 6260216.0,
"step": 719
},
{
"entropy": 1.2141610383987427,
"epoch": 2.308186195826645,
"grad_norm": 2.8210721015930176,
"learning_rate": 6.051996135186774e-06,
"loss": 0.1233,
"mean_token_accuracy": 0.9536011517047882,
"num_tokens": 6270180.0,
"step": 720
},
{
"entropy": 1.0050411820411682,
"epoch": 2.3113964686998396,
"grad_norm": 2.533144235610962,
"learning_rate": 6.041631870257882e-06,
"loss": 0.137,
"mean_token_accuracy": 0.9331673085689545,
"num_tokens": 6280110.0,
"step": 721
},
{
"entropy": 0.9854940176010132,
"epoch": 2.3146067415730336,
"grad_norm": 7.011678218841553,
"learning_rate": 6.0312629245575534e-06,
"loss": 0.1519,
"mean_token_accuracy": 0.9333108365535736,
"num_tokens": 6289456.0,
"step": 722
},
{
"entropy": 1.0914210677146912,
"epoch": 2.317817014446228,
"grad_norm": 6.578116416931152,
"learning_rate": 6.020889344680627e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.9352452456951141,
"num_tokens": 6297538.0,
"step": 723
},
{
"entropy": 1.0971428155899048,
"epoch": 2.321027287319422,
"grad_norm": 17.007915496826172,
"learning_rate": 6.010511177242757e-06,
"loss": 0.1499,
"mean_token_accuracy": 0.9454688131809235,
"num_tokens": 6305880.0,
"step": 724
},
{
"entropy": 1.052825391292572,
"epoch": 2.3242375601926164,
"grad_norm": 3.563309907913208,
"learning_rate": 6.000128468880223e-06,
"loss": 0.146,
"mean_token_accuracy": 0.9364206492900848,
"num_tokens": 6313624.0,
"step": 725
},
{
"entropy": 1.0826497673988342,
"epoch": 2.3274478330658104,
"grad_norm": 4.190746307373047,
"learning_rate": 5.989741266249701e-06,
"loss": 0.1502,
"mean_token_accuracy": 0.9419477880001068,
"num_tokens": 6322725.0,
"step": 726
},
{
"entropy": 1.2108085751533508,
"epoch": 2.330658105939005,
"grad_norm": 2.0272305011749268,
"learning_rate": 5.979349616028067e-06,
"loss": 0.1171,
"mean_token_accuracy": 0.9540502727031708,
"num_tokens": 6331227.0,
"step": 727
},
{
"entropy": 1.0932866334915161,
"epoch": 2.333868378812199,
"grad_norm": 3.3109891414642334,
"learning_rate": 5.9689535649121855e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.915838360786438,
"num_tokens": 6342205.0,
"step": 728
},
{
"entropy": 1.1593711376190186,
"epoch": 2.337078651685393,
"grad_norm": 2.3071560859680176,
"learning_rate": 5.958553159618693e-06,
"loss": 0.1143,
"mean_token_accuracy": 0.9542573690414429,
"num_tokens": 6349942.0,
"step": 729
},
{
"entropy": 1.1865712404251099,
"epoch": 2.3402889245585876,
"grad_norm": 4.078185558319092,
"learning_rate": 5.948148446883794e-06,
"loss": 0.1875,
"mean_token_accuracy": 0.9212767481803894,
"num_tokens": 6359147.0,
"step": 730
},
{
"entropy": 1.2947837710380554,
"epoch": 2.3434991974317816,
"grad_norm": 15.557731628417969,
"learning_rate": 5.937739473463047e-06,
"loss": 0.1298,
"mean_token_accuracy": 0.951077938079834,
"num_tokens": 6367884.0,
"step": 731
},
{
"entropy": 0.9925798773765564,
"epoch": 2.346709470304976,
"grad_norm": 2.6986594200134277,
"learning_rate": 5.927326286131162e-06,
"loss": 0.1312,
"mean_token_accuracy": 0.9450666606426239,
"num_tokens": 6377022.0,
"step": 732
},
{
"entropy": 1.0658472776412964,
"epoch": 2.34991974317817,
"grad_norm": 10.180908203125,
"learning_rate": 5.916908931681781e-06,
"loss": 0.1768,
"mean_token_accuracy": 0.9244714379310608,
"num_tokens": 6385574.0,
"step": 733
},
{
"entropy": 1.1259647607803345,
"epoch": 2.3531300160513644,
"grad_norm": 4.821587562561035,
"learning_rate": 5.906487456927273e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9284574687480927,
"num_tokens": 6395078.0,
"step": 734
},
{
"entropy": 1.082639455795288,
"epoch": 2.3563402889245584,
"grad_norm": 2.4788339138031006,
"learning_rate": 5.896061908698521e-06,
"loss": 0.1344,
"mean_token_accuracy": 0.9499082267284393,
"num_tokens": 6404040.0,
"step": 735
},
{
"entropy": 1.0406213402748108,
"epoch": 2.359550561797753,
"grad_norm": 3.1999518871307373,
"learning_rate": 5.885632333844714e-06,
"loss": 0.1442,
"mean_token_accuracy": 0.94551220536232,
"num_tokens": 6412489.0,
"step": 736
},
{
"entropy": 1.1878241896629333,
"epoch": 2.362760834670947,
"grad_norm": 2.712620496749878,
"learning_rate": 5.8751987792331365e-06,
"loss": 0.1398,
"mean_token_accuracy": 0.9454472362995148,
"num_tokens": 6421742.0,
"step": 737
},
{
"entropy": 1.1380284428596497,
"epoch": 2.365971107544141,
"grad_norm": 2.5213255882263184,
"learning_rate": 5.864761291748956e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.928494393825531,
"num_tokens": 6430727.0,
"step": 738
},
{
"entropy": 1.2049461007118225,
"epoch": 2.3691813804173356,
"grad_norm": 1.8564283847808838,
"learning_rate": 5.854319918295012e-06,
"loss": 0.0949,
"mean_token_accuracy": 0.9627068936824799,
"num_tokens": 6438650.0,
"step": 739
},
{
"entropy": 1.165435016155243,
"epoch": 2.3723916532905296,
"grad_norm": 2.9071779251098633,
"learning_rate": 5.843874705791607e-06,
"loss": 0.138,
"mean_token_accuracy": 0.95395627617836,
"num_tokens": 6447438.0,
"step": 740
},
{
"entropy": 1.1078984141349792,
"epoch": 2.375601926163724,
"grad_norm": 5.674651145935059,
"learning_rate": 5.833425701176294e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9678383767604828,
"num_tokens": 6455540.0,
"step": 741
},
{
"entropy": 1.1248586177825928,
"epoch": 2.378812199036918,
"grad_norm": 2.420736312866211,
"learning_rate": 5.82297295140367e-06,
"loss": 0.1336,
"mean_token_accuracy": 0.9477813839912415,
"num_tokens": 6464823.0,
"step": 742
},
{
"entropy": 1.1130772829055786,
"epoch": 2.3820224719101124,
"grad_norm": 3.5600759983062744,
"learning_rate": 5.812516503445158e-06,
"loss": 0.1509,
"mean_token_accuracy": 0.927628219127655,
"num_tokens": 6474728.0,
"step": 743
},
{
"entropy": 1.0952720642089844,
"epoch": 2.3852327447833064,
"grad_norm": 2.5865023136138916,
"learning_rate": 5.8020564042888015e-06,
"loss": 0.1442,
"mean_token_accuracy": 0.9443020522594452,
"num_tokens": 6484029.0,
"step": 744
},
{
"entropy": 1.0994407534599304,
"epoch": 2.388443017656501,
"grad_norm": 3.299069881439209,
"learning_rate": 5.79159270093905e-06,
"loss": 0.1609,
"mean_token_accuracy": 0.9384024739265442,
"num_tokens": 6492190.0,
"step": 745
},
{
"entropy": 1.0726031064987183,
"epoch": 2.391653290529695,
"grad_norm": 2.606271266937256,
"learning_rate": 5.781125440416552e-06,
"loss": 0.1508,
"mean_token_accuracy": 0.9255422055721283,
"num_tokens": 6502485.0,
"step": 746
},
{
"entropy": 1.0770372152328491,
"epoch": 2.394863563402889,
"grad_norm": 4.949024200439453,
"learning_rate": 5.770654669757935e-06,
"loss": 0.1276,
"mean_token_accuracy": 0.9429983794689178,
"num_tokens": 6511201.0,
"step": 747
},
{
"entropy": 1.1048730611801147,
"epoch": 2.3980738362760836,
"grad_norm": 2.6892385482788086,
"learning_rate": 5.760180436015604e-06,
"loss": 0.1366,
"mean_token_accuracy": 0.9469018876552582,
"num_tokens": 6519487.0,
"step": 748
},
{
"entropy": 1.2111053466796875,
"epoch": 2.4012841091492776,
"grad_norm": 3.027834892272949,
"learning_rate": 5.749702786257529e-06,
"loss": 0.1306,
"mean_token_accuracy": 0.943552553653717,
"num_tokens": 6527831.0,
"step": 749
},
{
"entropy": 1.0329571962356567,
"epoch": 2.404494382022472,
"grad_norm": 2.4994142055511475,
"learning_rate": 5.739221767567025e-06,
"loss": 0.1709,
"mean_token_accuracy": 0.9160129129886627,
"num_tokens": 6536664.0,
"step": 750
},
{
"entropy": 1.1039209365844727,
"epoch": 2.407704654895666,
"grad_norm": 5.443882465362549,
"learning_rate": 5.7287374270425475e-06,
"loss": 0.1182,
"mean_token_accuracy": 0.951865941286087,
"num_tokens": 6544959.0,
"step": 751
},
{
"entropy": 1.20868319272995,
"epoch": 2.4109149277688604,
"grad_norm": 3.0676169395446777,
"learning_rate": 5.718249811797482e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.9474264085292816,
"num_tokens": 6554395.0,
"step": 752
},
{
"entropy": 1.1987990736961365,
"epoch": 2.4141252006420544,
"grad_norm": 18.106931686401367,
"learning_rate": 5.707758968959923e-06,
"loss": 0.1274,
"mean_token_accuracy": 0.937873363494873,
"num_tokens": 6562668.0,
"step": 753
},
{
"entropy": 1.1869001388549805,
"epoch": 2.417335473515249,
"grad_norm": 3.868323564529419,
"learning_rate": 5.69726494567248e-06,
"loss": 0.162,
"mean_token_accuracy": 0.9423214197158813,
"num_tokens": 6572126.0,
"step": 754
},
{
"entropy": 1.066800206899643,
"epoch": 2.420545746388443,
"grad_norm": 2.391563653945923,
"learning_rate": 5.686767789092041e-06,
"loss": 0.1477,
"mean_token_accuracy": 0.9441542625427246,
"num_tokens": 6580655.0,
"step": 755
},
{
"entropy": 1.1427485942840576,
"epoch": 2.423756019261637,
"grad_norm": 3.986257553100586,
"learning_rate": 5.676267546389587e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.9257921278476715,
"num_tokens": 6589834.0,
"step": 756
},
{
"entropy": 1.1412203907966614,
"epoch": 2.4269662921348316,
"grad_norm": 7.656455993652344,
"learning_rate": 5.6657642647499545e-06,
"loss": 0.1231,
"mean_token_accuracy": 0.9529256224632263,
"num_tokens": 6598073.0,
"step": 757
},
{
"entropy": 1.0917719006538391,
"epoch": 2.4301765650080256,
"grad_norm": 2.823206663131714,
"learning_rate": 5.655257991371646e-06,
"loss": 0.1521,
"mean_token_accuracy": 0.9446594417095184,
"num_tokens": 6606618.0,
"step": 758
},
{
"entropy": 1.1908010244369507,
"epoch": 2.43338683788122,
"grad_norm": 5.725603103637695,
"learning_rate": 5.644748773466606e-06,
"loss": 0.1133,
"mean_token_accuracy": 0.9505617022514343,
"num_tokens": 6614664.0,
"step": 759
},
{
"entropy": 1.1573768258094788,
"epoch": 2.436597110754414,
"grad_norm": 2.2337048053741455,
"learning_rate": 5.6342366582600035e-06,
"loss": 0.1609,
"mean_token_accuracy": 0.9296829104423523,
"num_tokens": 6625043.0,
"step": 760
},
{
"entropy": 1.1481091380119324,
"epoch": 2.4398073836276084,
"grad_norm": 3.4945619106292725,
"learning_rate": 5.62372169299004e-06,
"loss": 0.121,
"mean_token_accuracy": 0.9498609006404877,
"num_tokens": 6633006.0,
"step": 761
},
{
"entropy": 1.1791866421699524,
"epoch": 2.4430176565008024,
"grad_norm": 2.9412434101104736,
"learning_rate": 5.613203924907711e-06,
"loss": 0.1251,
"mean_token_accuracy": 0.9411061108112335,
"num_tokens": 6642848.0,
"step": 762
},
{
"entropy": 1.0868923664093018,
"epoch": 2.446227929373997,
"grad_norm": 3.4383668899536133,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.1517,
"mean_token_accuracy": 0.9447084367275238,
"num_tokens": 6651619.0,
"step": 763
},
{
"entropy": 1.0268099308013916,
"epoch": 2.449438202247191,
"grad_norm": 2.650527238845825,
"learning_rate": 5.592160169372734e-06,
"loss": 0.1282,
"mean_token_accuracy": 0.9500701725482941,
"num_tokens": 6659996.0,
"step": 764
},
{
"entropy": 1.166718602180481,
"epoch": 2.452648475120385,
"grad_norm": 2.866605281829834,
"learning_rate": 5.581634276484211e-06,
"loss": 0.1177,
"mean_token_accuracy": 0.95614093542099,
"num_tokens": 6668090.0,
"step": 765
},
{
"entropy": 1.2826551795005798,
"epoch": 2.4558587479935796,
"grad_norm": 2.394258975982666,
"learning_rate": 5.571105769911159e-06,
"loss": 0.1224,
"mean_token_accuracy": 0.9535133540630341,
"num_tokens": 6676019.0,
"step": 766
},
{
"entropy": 1.192893922328949,
"epoch": 2.4590690208667736,
"grad_norm": 2.396747589111328,
"learning_rate": 5.560574696965425e-06,
"loss": 0.124,
"mean_token_accuracy": 0.9526415169239044,
"num_tokens": 6683999.0,
"step": 767
},
{
"entropy": 1.288866102695465,
"epoch": 2.462279293739968,
"grad_norm": 5.264256000518799,
"learning_rate": 5.550041104970398e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9669314324855804,
"num_tokens": 6692511.0,
"step": 768
},
{
"entropy": 1.1260980367660522,
"epoch": 2.465489566613162,
"grad_norm": 2.4346187114715576,
"learning_rate": 5.539505041260779e-06,
"loss": 0.1496,
"mean_token_accuracy": 0.9367939531803131,
"num_tokens": 6702286.0,
"step": 769
},
{
"entropy": 1.0402022004127502,
"epoch": 2.4686998394863564,
"grad_norm": 5.237890720367432,
"learning_rate": 5.528966553182379e-06,
"loss": 0.126,
"mean_token_accuracy": 0.947739988565445,
"num_tokens": 6710359.0,
"step": 770
},
{
"entropy": 1.0591676533222198,
"epoch": 2.4719101123595504,
"grad_norm": 3.0694470405578613,
"learning_rate": 5.518425688091906e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.9322700500488281,
"num_tokens": 6719975.0,
"step": 771
},
{
"entropy": 1.0547669529914856,
"epoch": 2.475120385232745,
"grad_norm": 5.70684289932251,
"learning_rate": 5.507882493356745e-06,
"loss": 0.1328,
"mean_token_accuracy": 0.9526722431182861,
"num_tokens": 6728954.0,
"step": 772
},
{
"entropy": 1.1029804944992065,
"epoch": 2.478330658105939,
"grad_norm": 2.295919418334961,
"learning_rate": 5.497337016354757e-06,
"loss": 0.1179,
"mean_token_accuracy": 0.9506842195987701,
"num_tokens": 6737394.0,
"step": 773
},
{
"entropy": 1.0925171375274658,
"epoch": 2.481540930979133,
"grad_norm": 6.271547794342041,
"learning_rate": 5.486789304474047e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.9553323984146118,
"num_tokens": 6745760.0,
"step": 774
},
{
"entropy": 1.1927781105041504,
"epoch": 2.4847512038523276,
"grad_norm": 3.240992546081543,
"learning_rate": 5.476239405112775e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9667895436286926,
"num_tokens": 6753422.0,
"step": 775
},
{
"entropy": 1.1005544662475586,
"epoch": 2.4879614767255216,
"grad_norm": 2.522223949432373,
"learning_rate": 5.465687365678921e-06,
"loss": 0.1211,
"mean_token_accuracy": 0.9548394978046417,
"num_tokens": 6762310.0,
"step": 776
},
{
"entropy": 1.1497004628181458,
"epoch": 2.491171749598716,
"grad_norm": 6.606053829193115,
"learning_rate": 5.45513323359009e-06,
"loss": 0.1552,
"mean_token_accuracy": 0.9237475097179413,
"num_tokens": 6771461.0,
"step": 777
},
{
"entropy": 1.1692470908164978,
"epoch": 2.49438202247191,
"grad_norm": 2.561846971511841,
"learning_rate": 5.444577056273284e-06,
"loss": 0.157,
"mean_token_accuracy": 0.9331184327602386,
"num_tokens": 6780794.0,
"step": 778
},
{
"entropy": 1.1475553512573242,
"epoch": 2.4975922953451044,
"grad_norm": 4.43130350112915,
"learning_rate": 5.434018881164702e-06,
"loss": 0.1414,
"mean_token_accuracy": 0.9424753189086914,
"num_tokens": 6789767.0,
"step": 779
},
{
"entropy": 1.1029905378818512,
"epoch": 2.5008025682182984,
"grad_norm": 2.932034969329834,
"learning_rate": 5.423458755709516e-06,
"loss": 0.1641,
"mean_token_accuracy": 0.9238942563533783,
"num_tokens": 6798435.0,
"step": 780
},
{
"entropy": 1.0897286534309387,
"epoch": 2.504012841091493,
"grad_norm": 4.083599090576172,
"learning_rate": 5.412896727361663e-06,
"loss": 0.1381,
"mean_token_accuracy": 0.9539946615695953,
"num_tokens": 6806305.0,
"step": 781
},
{
"entropy": 1.283981204032898,
"epoch": 2.5072231139646872,
"grad_norm": 2.844965696334839,
"learning_rate": 5.402332843583631e-06,
"loss": 0.1294,
"mean_token_accuracy": 0.9530335962772369,
"num_tokens": 6814666.0,
"step": 782
},
{
"entropy": 1.0701724290847778,
"epoch": 2.510433386837881,
"grad_norm": 3.165010690689087,
"learning_rate": 5.391767151846247e-06,
"loss": 0.1805,
"mean_token_accuracy": 0.9041315615177155,
"num_tokens": 6825399.0,
"step": 783
},
{
"entropy": 1.116648256778717,
"epoch": 2.513643659711075,
"grad_norm": 2.8302817344665527,
"learning_rate": 5.381199699628459e-06,
"loss": 0.1319,
"mean_token_accuracy": 0.9495140314102173,
"num_tokens": 6834076.0,
"step": 784
},
{
"entropy": 1.0305944681167603,
"epoch": 2.5168539325842696,
"grad_norm": 2.3221435546875,
"learning_rate": 5.370630534417133e-06,
"loss": 0.1947,
"mean_token_accuracy": 0.9132009148597717,
"num_tokens": 6844788.0,
"step": 785
},
{
"entropy": 1.040054589509964,
"epoch": 2.520064205457464,
"grad_norm": 2.6023216247558594,
"learning_rate": 5.360059703706823e-06,
"loss": 0.1773,
"mean_token_accuracy": 0.9199038743972778,
"num_tokens": 6854025.0,
"step": 786
},
{
"entropy": 1.228873372077942,
"epoch": 2.523274478330658,
"grad_norm": 5.279784679412842,
"learning_rate": 5.349487254999579e-06,
"loss": 0.0966,
"mean_token_accuracy": 0.964431494474411,
"num_tokens": 6862209.0,
"step": 787
},
{
"entropy": 1.2652587294578552,
"epoch": 2.5264847512038524,
"grad_norm": 3.7914271354675293,
"learning_rate": 5.3389132358047115e-06,
"loss": 0.1027,
"mean_token_accuracy": 0.9610486626625061,
"num_tokens": 6870917.0,
"step": 788
},
{
"entropy": 1.2009222507476807,
"epoch": 2.5296950240770464,
"grad_norm": 2.931785821914673,
"learning_rate": 5.328337693638591e-06,
"loss": 0.1315,
"mean_token_accuracy": 0.9512718021869659,
"num_tokens": 6879476.0,
"step": 789
},
{
"entropy": 1.254488468170166,
"epoch": 2.532905296950241,
"grad_norm": 27.21143341064453,
"learning_rate": 5.317760676024436e-06,
"loss": 0.1399,
"mean_token_accuracy": 0.9459502100944519,
"num_tokens": 6888458.0,
"step": 790
},
{
"entropy": 1.2336873412132263,
"epoch": 2.5361155698234352,
"grad_norm": 2.5371665954589844,
"learning_rate": 5.307182230492089e-06,
"loss": 0.1327,
"mean_token_accuracy": 0.9516365826129913,
"num_tokens": 6897670.0,
"step": 791
},
{
"entropy": 0.9518597722053528,
"epoch": 2.539325842696629,
"grad_norm": 2.7813923358917236,
"learning_rate": 5.296602404577814e-06,
"loss": 0.1613,
"mean_token_accuracy": 0.9245217740535736,
"num_tokens": 6906804.0,
"step": 792
},
{
"entropy": 1.167235255241394,
"epoch": 2.542536115569823,
"grad_norm": 2.641925573348999,
"learning_rate": 5.286021245824075e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9582434296607971,
"num_tokens": 6914869.0,
"step": 793
},
{
"entropy": 1.3288288116455078,
"epoch": 2.5457463884430176,
"grad_norm": 2.3714306354522705,
"learning_rate": 5.275438801779328e-06,
"loss": 0.1135,
"mean_token_accuracy": 0.9562829434871674,
"num_tokens": 6924038.0,
"step": 794
},
{
"entropy": 1.1833221912384033,
"epoch": 2.548956661316212,
"grad_norm": 2.5956239700317383,
"learning_rate": 5.264855119997803e-06,
"loss": 0.1405,
"mean_token_accuracy": 0.9424801170825958,
"num_tokens": 6932622.0,
"step": 795
},
{
"entropy": 1.1478038430213928,
"epoch": 2.552166934189406,
"grad_norm": 4.386684417724609,
"learning_rate": 5.254270248039291e-06,
"loss": 0.1419,
"mean_token_accuracy": 0.9366876184940338,
"num_tokens": 6941209.0,
"step": 796
},
{
"entropy": 1.1999590992927551,
"epoch": 2.5553772070626004,
"grad_norm": 2.239611864089966,
"learning_rate": 5.243684233468933e-06,
"loss": 0.1416,
"mean_token_accuracy": 0.9222292900085449,
"num_tokens": 6951363.0,
"step": 797
},
{
"entropy": 1.188470184803009,
"epoch": 2.5585874799357944,
"grad_norm": 2.155332565307617,
"learning_rate": 5.233097123857004e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9411430060863495,
"num_tokens": 6960774.0,
"step": 798
},
{
"entropy": 1.0205163359642029,
"epoch": 2.561797752808989,
"grad_norm": 2.36620831489563,
"learning_rate": 5.222508966778702e-06,
"loss": 0.1414,
"mean_token_accuracy": 0.9473484754562378,
"num_tokens": 6969967.0,
"step": 799
},
{
"entropy": 1.1117953062057495,
"epoch": 2.5650080256821832,
"grad_norm": 83.64684295654297,
"learning_rate": 5.211919809813927e-06,
"loss": 0.1064,
"mean_token_accuracy": 0.9633921980857849,
"num_tokens": 6977751.0,
"step": 800
},
{
"entropy": 1.1597265005111694,
"epoch": 2.568218298555377,
"grad_norm": 7.2872419357299805,
"learning_rate": 5.201329700547077e-06,
"loss": 0.1348,
"mean_token_accuracy": 0.9420501291751862,
"num_tokens": 6986967.0,
"step": 801
},
{
"entropy": 1.062886893749237,
"epoch": 2.571428571428571,
"grad_norm": 3.075824737548828,
"learning_rate": 5.190738686566826e-06,
"loss": 0.1747,
"mean_token_accuracy": 0.9246014654636383,
"num_tokens": 6995637.0,
"step": 802
},
{
"entropy": 1.1086429953575134,
"epoch": 2.5746388443017656,
"grad_norm": 2.755178928375244,
"learning_rate": 5.180146815465915e-06,
"loss": 0.1475,
"mean_token_accuracy": 0.9486549496650696,
"num_tokens": 7005492.0,
"step": 803
},
{
"entropy": 1.2635900974273682,
"epoch": 2.57784911717496,
"grad_norm": 3.3014962673187256,
"learning_rate": 5.169554134840937e-06,
"loss": 0.1298,
"mean_token_accuracy": 0.9536003470420837,
"num_tokens": 7013405.0,
"step": 804
},
{
"entropy": 1.2595646977424622,
"epoch": 2.581059390048154,
"grad_norm": 9.928709030151367,
"learning_rate": 5.158960692292122e-06,
"loss": 0.1435,
"mean_token_accuracy": 0.930577278137207,
"num_tokens": 7022575.0,
"step": 805
},
{
"entropy": 1.2178662419319153,
"epoch": 2.5842696629213484,
"grad_norm": 2.059295177459717,
"learning_rate": 5.148366535423126e-06,
"loss": 0.104,
"mean_token_accuracy": 0.9549370110034943,
"num_tokens": 7031068.0,
"step": 806
},
{
"entropy": 1.182856559753418,
"epoch": 2.5874799357945424,
"grad_norm": 4.150783538818359,
"learning_rate": 5.137771711840811e-06,
"loss": 0.169,
"mean_token_accuracy": 0.9229248464107513,
"num_tokens": 7040170.0,
"step": 807
},
{
"entropy": 1.1488960981369019,
"epoch": 2.590690208667737,
"grad_norm": 2.7888660430908203,
"learning_rate": 5.1271762691550375e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.9194961786270142,
"num_tokens": 7049813.0,
"step": 808
},
{
"entropy": 1.1741920709609985,
"epoch": 2.5939004815409312,
"grad_norm": 4.083916664123535,
"learning_rate": 5.116580254978447e-06,
"loss": 0.123,
"mean_token_accuracy": 0.951829195022583,
"num_tokens": 7057807.0,
"step": 809
},
{
"entropy": 1.0202910602092743,
"epoch": 2.597110754414125,
"grad_norm": 2.8720052242279053,
"learning_rate": 5.1059837169262506e-06,
"loss": 0.1796,
"mean_token_accuracy": 0.9331225156784058,
"num_tokens": 7066075.0,
"step": 810
},
{
"entropy": 1.1626269817352295,
"epoch": 2.600321027287319,
"grad_norm": 3.792236089706421,
"learning_rate": 5.095386702616012e-06,
"loss": 0.1306,
"mean_token_accuracy": 0.9492098689079285,
"num_tokens": 7074889.0,
"step": 811
},
{
"entropy": 1.1408878564834595,
"epoch": 2.6035313001605136,
"grad_norm": 3.076556921005249,
"learning_rate": 5.084789259667437e-06,
"loss": 0.1227,
"mean_token_accuracy": 0.9498015642166138,
"num_tokens": 7083337.0,
"step": 812
},
{
"entropy": 1.1532814502716064,
"epoch": 2.606741573033708,
"grad_norm": 2.449476718902588,
"learning_rate": 5.074191435702155e-06,
"loss": 0.1387,
"mean_token_accuracy": 0.9504629671573639,
"num_tokens": 7092174.0,
"step": 813
},
{
"entropy": 1.2164504528045654,
"epoch": 2.609951845906902,
"grad_norm": 2.563089609146118,
"learning_rate": 5.06359327834351e-06,
"loss": 0.144,
"mean_token_accuracy": 0.9414226114749908,
"num_tokens": 7100611.0,
"step": 814
},
{
"entropy": 1.1398358345031738,
"epoch": 2.6131621187800964,
"grad_norm": 2.054319381713867,
"learning_rate": 5.05299483521634e-06,
"loss": 0.1458,
"mean_token_accuracy": 0.9323011636734009,
"num_tokens": 7108693.0,
"step": 815
},
{
"entropy": 1.2047650218009949,
"epoch": 2.6163723916532904,
"grad_norm": 3.8257853984832764,
"learning_rate": 5.0423961539467754e-06,
"loss": 0.1266,
"mean_token_accuracy": 0.9467697441577911,
"num_tokens": 7117228.0,
"step": 816
},
{
"entropy": 1.2512578964233398,
"epoch": 2.619582664526485,
"grad_norm": 2.4949073791503906,
"learning_rate": 5.031797282162007e-06,
"loss": 0.1164,
"mean_token_accuracy": 0.9542975723743439,
"num_tokens": 7125961.0,
"step": 817
},
{
"entropy": 1.202860414981842,
"epoch": 2.6227929373996792,
"grad_norm": 3.2338783740997314,
"learning_rate": 5.021198267490088e-06,
"loss": 0.1173,
"mean_token_accuracy": 0.9500894248485565,
"num_tokens": 7133976.0,
"step": 818
},
{
"entropy": 1.1789140701293945,
"epoch": 2.626003210272873,
"grad_norm": 2.3607523441314697,
"learning_rate": 5.010599157559713e-06,
"loss": 0.1294,
"mean_token_accuracy": 0.9499906599521637,
"num_tokens": 7144087.0,
"step": 819
},
{
"entropy": 1.3041833639144897,
"epoch": 2.629213483146067,
"grad_norm": 4.608010768890381,
"learning_rate": 5e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.9459123015403748,
"num_tokens": 7153618.0,
"step": 820
},
{
"entropy": 1.1604467630386353,
"epoch": 2.6324237560192616,
"grad_norm": 2.8961966037750244,
"learning_rate": 4.98940084244029e-06,
"loss": 0.1219,
"mean_token_accuracy": 0.9523343443870544,
"num_tokens": 7162203.0,
"step": 821
},
{
"entropy": 1.329535961151123,
"epoch": 2.635634028892456,
"grad_norm": 2.5095512866973877,
"learning_rate": 4.9788017325099134e-06,
"loss": 0.1495,
"mean_token_accuracy": 0.9324021637439728,
"num_tokens": 7171282.0,
"step": 822
},
{
"entropy": 1.0252764225006104,
"epoch": 2.63884430176565,
"grad_norm": 12.967425346374512,
"learning_rate": 4.968202717837996e-06,
"loss": 0.0946,
"mean_token_accuracy": 0.9648852646350861,
"num_tokens": 7179569.0,
"step": 823
},
{
"entropy": 1.1990549564361572,
"epoch": 2.6420545746388444,
"grad_norm": 1.95919930934906,
"learning_rate": 4.957603846053225e-06,
"loss": 0.1448,
"mean_token_accuracy": 0.9189603328704834,
"num_tokens": 7190125.0,
"step": 824
},
{
"entropy": 1.2720578908920288,
"epoch": 2.6452648475120384,
"grad_norm": 3.3589868545532227,
"learning_rate": 4.947005164783661e-06,
"loss": 0.1515,
"mean_token_accuracy": 0.941864937543869,
"num_tokens": 7200048.0,
"step": 825
},
{
"entropy": 1.2720287442207336,
"epoch": 2.648475120385233,
"grad_norm": 4.5492024421691895,
"learning_rate": 4.936406721656492e-06,
"loss": 0.1051,
"mean_token_accuracy": 0.9559166431427002,
"num_tokens": 7208839.0,
"step": 826
},
{
"entropy": 1.2180215120315552,
"epoch": 2.6516853932584272,
"grad_norm": 2.815068483352661,
"learning_rate": 4.925808564297847e-06,
"loss": 0.1149,
"mean_token_accuracy": 0.9604234099388123,
"num_tokens": 7217154.0,
"step": 827
},
{
"entropy": 0.9922587871551514,
"epoch": 2.654895666131621,
"grad_norm": 59.59284210205078,
"learning_rate": 4.915210740332564e-06,
"loss": 0.1447,
"mean_token_accuracy": 0.9238322377204895,
"num_tokens": 7226381.0,
"step": 828
},
{
"entropy": 1.1355258226394653,
"epoch": 2.658105939004815,
"grad_norm": 2.589543342590332,
"learning_rate": 4.9046132973839895e-06,
"loss": 0.1164,
"mean_token_accuracy": 0.956183135509491,
"num_tokens": 7234286.0,
"step": 829
},
{
"entropy": 0.9799058735370636,
"epoch": 2.6613162118780096,
"grad_norm": 3.102374315261841,
"learning_rate": 4.894016283073753e-06,
"loss": 0.1386,
"mean_token_accuracy": 0.9398746490478516,
"num_tokens": 7243474.0,
"step": 830
},
{
"entropy": 1.2898936867713928,
"epoch": 2.664526484751204,
"grad_norm": 3.5710971355438232,
"learning_rate": 4.883419745021554e-06,
"loss": 0.1233,
"mean_token_accuracy": 0.9488505721092224,
"num_tokens": 7251936.0,
"step": 831
},
{
"entropy": 1.2016060948371887,
"epoch": 2.667736757624398,
"grad_norm": 2.7869205474853516,
"learning_rate": 4.872823730844966e-06,
"loss": 0.1072,
"mean_token_accuracy": 0.9632176756858826,
"num_tokens": 7260147.0,
"step": 832
},
{
"entropy": 1.1167156100273132,
"epoch": 2.6709470304975924,
"grad_norm": 2.521682024002075,
"learning_rate": 4.862228288159191e-06,
"loss": 0.1574,
"mean_token_accuracy": 0.9312631487846375,
"num_tokens": 7269675.0,
"step": 833
},
{
"entropy": 1.2334627509117126,
"epoch": 2.6741573033707864,
"grad_norm": 5.0381011962890625,
"learning_rate": 4.851633464576876e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.9190080761909485,
"num_tokens": 7278754.0,
"step": 834
},
{
"entropy": 1.0725289583206177,
"epoch": 2.677367576243981,
"grad_norm": 2.2365589141845703,
"learning_rate": 4.841039307707878e-06,
"loss": 0.1108,
"mean_token_accuracy": 0.9546129703521729,
"num_tokens": 7286779.0,
"step": 835
},
{
"entropy": 1.3398520350456238,
"epoch": 2.6805778491171752,
"grad_norm": 3.4143424034118652,
"learning_rate": 4.8304458651590645e-06,
"loss": 0.1481,
"mean_token_accuracy": 0.939169704914093,
"num_tokens": 7296679.0,
"step": 836
},
{
"entropy": 1.1583664417266846,
"epoch": 2.683788121990369,
"grad_norm": 3.6284713745117188,
"learning_rate": 4.819853184534085e-06,
"loss": 0.1225,
"mean_token_accuracy": 0.9425530731678009,
"num_tokens": 7304789.0,
"step": 837
},
{
"entropy": 1.1957999467849731,
"epoch": 2.686998394863563,
"grad_norm": 3.876544237136841,
"learning_rate": 4.809261313433176e-06,
"loss": 0.138,
"mean_token_accuracy": 0.9428149461746216,
"num_tokens": 7313959.0,
"step": 838
},
{
"entropy": 1.2033058404922485,
"epoch": 2.6902086677367576,
"grad_norm": 2.247842788696289,
"learning_rate": 4.798670299452926e-06,
"loss": 0.1128,
"mean_token_accuracy": 0.9541947841644287,
"num_tokens": 7322037.0,
"step": 839
},
{
"entropy": 1.0784823894500732,
"epoch": 2.693418940609952,
"grad_norm": 3.17934250831604,
"learning_rate": 4.788080190186075e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9339147508144379,
"num_tokens": 7331689.0,
"step": 840
},
{
"entropy": 0.94371497631073,
"epoch": 2.696629213483146,
"grad_norm": 2.5439469814300537,
"learning_rate": 4.7774910332213005e-06,
"loss": 0.1307,
"mean_token_accuracy": 0.9517930448055267,
"num_tokens": 7339675.0,
"step": 841
},
{
"entropy": 1.293647587299347,
"epoch": 2.6998394863563404,
"grad_norm": 3.3014588356018066,
"learning_rate": 4.766902876142996e-06,
"loss": 0.1151,
"mean_token_accuracy": 0.9499547779560089,
"num_tokens": 7348038.0,
"step": 842
},
{
"entropy": 1.1160376071929932,
"epoch": 2.7030497592295344,
"grad_norm": 2.3735413551330566,
"learning_rate": 4.756315766531069e-06,
"loss": 0.1221,
"mean_token_accuracy": 0.949918121099472,
"num_tokens": 7356513.0,
"step": 843
},
{
"entropy": 1.091322898864746,
"epoch": 2.706260032102729,
"grad_norm": 2.5370192527770996,
"learning_rate": 4.74572975196071e-06,
"loss": 0.1349,
"mean_token_accuracy": 0.9399248659610748,
"num_tokens": 7365541.0,
"step": 844
},
{
"entropy": 1.1114393472671509,
"epoch": 2.7094703049759232,
"grad_norm": 2.9894139766693115,
"learning_rate": 4.735144880002199e-06,
"loss": 0.1457,
"mean_token_accuracy": 0.932213693857193,
"num_tokens": 7374308.0,
"step": 845
},
{
"entropy": 1.2670826315879822,
"epoch": 2.712680577849117,
"grad_norm": 3.916121244430542,
"learning_rate": 4.724561198220672e-06,
"loss": 0.116,
"mean_token_accuracy": 0.9497752785682678,
"num_tokens": 7382886.0,
"step": 846
},
{
"entropy": 1.2188202738761902,
"epoch": 2.715890850722311,
"grad_norm": 5.7917094230651855,
"learning_rate": 4.713978754175926e-06,
"loss": 0.1174,
"mean_token_accuracy": 0.9567406475543976,
"num_tokens": 7391546.0,
"step": 847
},
{
"entropy": 1.2436976432800293,
"epoch": 2.7191011235955056,
"grad_norm": 2.361128091812134,
"learning_rate": 4.703397595422188e-06,
"loss": 0.1303,
"mean_token_accuracy": 0.942131757736206,
"num_tokens": 7400026.0,
"step": 848
},
{
"entropy": 1.019951045513153,
"epoch": 2.7223113964687,
"grad_norm": 2.2465875148773193,
"learning_rate": 4.692817769507912e-06,
"loss": 0.1243,
"mean_token_accuracy": 0.9461689591407776,
"num_tokens": 7409147.0,
"step": 849
},
{
"entropy": 1.087328314781189,
"epoch": 2.725521669341894,
"grad_norm": 7.778383255004883,
"learning_rate": 4.682239323975566e-06,
"loss": 0.1628,
"mean_token_accuracy": 0.9359224736690521,
"num_tokens": 7417525.0,
"step": 850
},
{
"entropy": 1.2314087748527527,
"epoch": 2.7287319422150884,
"grad_norm": 2.4920654296875,
"learning_rate": 4.671662306361409e-06,
"loss": 0.12,
"mean_token_accuracy": 0.9538951516151428,
"num_tokens": 7427404.0,
"step": 851
},
{
"entropy": 1.3098554015159607,
"epoch": 2.7319422150882824,
"grad_norm": 2.8234992027282715,
"learning_rate": 4.66108676419529e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.9484634101390839,
"num_tokens": 7435312.0,
"step": 852
},
{
"entropy": 1.2485319375991821,
"epoch": 2.735152487961477,
"grad_norm": 2.5828335285186768,
"learning_rate": 4.6505127450004216e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.957422286272049,
"num_tokens": 7443603.0,
"step": 853
},
{
"entropy": 1.3845638036727905,
"epoch": 2.738362760834671,
"grad_norm": 2.860136032104492,
"learning_rate": 4.6399402962931775e-06,
"loss": 0.1355,
"mean_token_accuracy": 0.9504896700382233,
"num_tokens": 7453049.0,
"step": 854
},
{
"entropy": 1.0702533721923828,
"epoch": 2.741573033707865,
"grad_norm": 2.2271461486816406,
"learning_rate": 4.62936946558287e-06,
"loss": 0.1051,
"mean_token_accuracy": 0.9618172347545624,
"num_tokens": 7461658.0,
"step": 855
},
{
"entropy": 1.1994733810424805,
"epoch": 2.744783306581059,
"grad_norm": 2.68015456199646,
"learning_rate": 4.618800300371543e-06,
"loss": 0.1231,
"mean_token_accuracy": 0.9526884853839874,
"num_tokens": 7470063.0,
"step": 856
},
{
"entropy": 1.1785258650779724,
"epoch": 2.7479935794542536,
"grad_norm": 2.1752262115478516,
"learning_rate": 4.608232848153757e-06,
"loss": 0.1362,
"mean_token_accuracy": 0.9400155544281006,
"num_tokens": 7479325.0,
"step": 857
},
{
"entropy": 1.1845125555992126,
"epoch": 2.751203852327448,
"grad_norm": 3.401946544647217,
"learning_rate": 4.597667156416371e-06,
"loss": 0.1159,
"mean_token_accuracy": 0.955258846282959,
"num_tokens": 7487362.0,
"step": 858
},
{
"entropy": 1.193034589290619,
"epoch": 2.754414125200642,
"grad_norm": 5.033675670623779,
"learning_rate": 4.587103272638339e-06,
"loss": 0.1302,
"mean_token_accuracy": 0.9349705278873444,
"num_tokens": 7496720.0,
"step": 859
},
{
"entropy": 0.996446818113327,
"epoch": 2.7576243980738364,
"grad_norm": 2.372493028640747,
"learning_rate": 4.576541244290484e-06,
"loss": 0.1257,
"mean_token_accuracy": 0.9507549703121185,
"num_tokens": 7505134.0,
"step": 860
},
{
"entropy": 1.076714038848877,
"epoch": 2.7608346709470304,
"grad_norm": 2.4927151203155518,
"learning_rate": 4.565981118835299e-06,
"loss": 0.1592,
"mean_token_accuracy": 0.9250488579273224,
"num_tokens": 7515661.0,
"step": 861
},
{
"entropy": 1.0132020115852356,
"epoch": 2.764044943820225,
"grad_norm": 2.702894926071167,
"learning_rate": 4.555422943726715e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9141054153442383,
"num_tokens": 7524731.0,
"step": 862
},
{
"entropy": 1.2586361765861511,
"epoch": 2.767255216693419,
"grad_norm": 2.5868072509765625,
"learning_rate": 4.5448667664099125e-06,
"loss": 0.1188,
"mean_token_accuracy": 0.9554040431976318,
"num_tokens": 7532750.0,
"step": 863
},
{
"entropy": 1.2303617000579834,
"epoch": 2.770465489566613,
"grad_norm": 2.8830461502075195,
"learning_rate": 4.534312634321081e-06,
"loss": 0.1314,
"mean_token_accuracy": 0.9583016335964203,
"num_tokens": 7542198.0,
"step": 864
},
{
"entropy": 1.2619620561599731,
"epoch": 2.773675762439807,
"grad_norm": 2.6447360515594482,
"learning_rate": 4.523760594887228e-06,
"loss": 0.1126,
"mean_token_accuracy": 0.9589046835899353,
"num_tokens": 7549862.0,
"step": 865
},
{
"entropy": 1.136208415031433,
"epoch": 2.7768860353130016,
"grad_norm": 2.4705092906951904,
"learning_rate": 4.513210695525954e-06,
"loss": 0.1076,
"mean_token_accuracy": 0.956132709980011,
"num_tokens": 7557050.0,
"step": 866
},
{
"entropy": 1.1666526198387146,
"epoch": 2.780096308186196,
"grad_norm": 2.446833610534668,
"learning_rate": 4.5026629836452445e-06,
"loss": 0.1372,
"mean_token_accuracy": 0.9423530399799347,
"num_tokens": 7566074.0,
"step": 867
},
{
"entropy": 1.1064327955245972,
"epoch": 2.78330658105939,
"grad_norm": 3.509469747543335,
"learning_rate": 4.492117506643256e-06,
"loss": 0.0984,
"mean_token_accuracy": 0.9642610847949982,
"num_tokens": 7573235.0,
"step": 868
},
{
"entropy": 1.143051952123642,
"epoch": 2.7865168539325844,
"grad_norm": 5.667890548706055,
"learning_rate": 4.481574311908096e-06,
"loss": 0.125,
"mean_token_accuracy": 0.9446974098682404,
"num_tokens": 7581498.0,
"step": 869
},
{
"entropy": 1.3021160960197449,
"epoch": 2.7897271268057784,
"grad_norm": 4.6017069816589355,
"learning_rate": 4.471033446817623e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.9526508450508118,
"num_tokens": 7590660.0,
"step": 870
},
{
"entropy": 0.9912780523300171,
"epoch": 2.792937399678973,
"grad_norm": 2.9224236011505127,
"learning_rate": 4.460494958739223e-06,
"loss": 0.1789,
"mean_token_accuracy": 0.9244946837425232,
"num_tokens": 7600008.0,
"step": 871
},
{
"entropy": 1.1159600019454956,
"epoch": 2.796147672552167,
"grad_norm": 3.117302417755127,
"learning_rate": 4.449958895029604e-06,
"loss": 0.15,
"mean_token_accuracy": 0.934796154499054,
"num_tokens": 7609102.0,
"step": 872
},
{
"entropy": 1.2988770604133606,
"epoch": 2.799357945425361,
"grad_norm": 2.7682743072509766,
"learning_rate": 4.439425303034576e-06,
"loss": 0.1074,
"mean_token_accuracy": 0.961311399936676,
"num_tokens": 7616956.0,
"step": 873
},
{
"entropy": 1.3484828472137451,
"epoch": 2.802568218298555,
"grad_norm": 2.468888282775879,
"learning_rate": 4.428894230088842e-06,
"loss": 0.1245,
"mean_token_accuracy": 0.9545502364635468,
"num_tokens": 7625302.0,
"step": 874
},
{
"entropy": 1.2678768038749695,
"epoch": 2.8057784911717496,
"grad_norm": 2.3519935607910156,
"learning_rate": 4.418365723515791e-06,
"loss": 0.1015,
"mean_token_accuracy": 0.9572258293628693,
"num_tokens": 7634369.0,
"step": 875
},
{
"entropy": 1.0742478966712952,
"epoch": 2.808988764044944,
"grad_norm": 5.775500774383545,
"learning_rate": 4.407839830627269e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9556642770767212,
"num_tokens": 7643376.0,
"step": 876
},
{
"entropy": 1.1620059609413147,
"epoch": 2.812199036918138,
"grad_norm": 2.9240689277648926,
"learning_rate": 4.397316598723385e-06,
"loss": 0.1285,
"mean_token_accuracy": 0.9445300102233887,
"num_tokens": 7651065.0,
"step": 877
},
{
"entropy": 0.9733690023422241,
"epoch": 2.8154093097913324,
"grad_norm": 2.123840570449829,
"learning_rate": 4.38679607509229e-06,
"loss": 0.1415,
"mean_token_accuracy": 0.9152352511882782,
"num_tokens": 7662378.0,
"step": 878
},
{
"entropy": 1.0626774430274963,
"epoch": 2.8186195826645264,
"grad_norm": 2.542992353439331,
"learning_rate": 4.376278307009962e-06,
"loss": 0.1357,
"mean_token_accuracy": 0.939263254404068,
"num_tokens": 7670141.0,
"step": 879
},
{
"entropy": 1.1844760179519653,
"epoch": 2.821829855537721,
"grad_norm": 2.248908281326294,
"learning_rate": 4.365763341739996e-06,
"loss": 0.1315,
"mean_token_accuracy": 0.9115343689918518,
"num_tokens": 7679575.0,
"step": 880
},
{
"entropy": 1.0241534113883972,
"epoch": 2.825040128410915,
"grad_norm": 2.588132381439209,
"learning_rate": 4.355251226533396e-06,
"loss": 0.1375,
"mean_token_accuracy": 0.9446264207363129,
"num_tokens": 7688269.0,
"step": 881
},
{
"entropy": 1.2254069447517395,
"epoch": 2.828250401284109,
"grad_norm": 2.150693655014038,
"learning_rate": 4.344742008628356e-06,
"loss": 0.1072,
"mean_token_accuracy": 0.957080066204071,
"num_tokens": 7696149.0,
"step": 882
},
{
"entropy": 1.1962010264396667,
"epoch": 2.831460674157303,
"grad_norm": 2.6374399662017822,
"learning_rate": 4.334235735250047e-06,
"loss": 0.1048,
"mean_token_accuracy": 0.954596996307373,
"num_tokens": 7703819.0,
"step": 883
},
{
"entropy": 1.0402805805206299,
"epoch": 2.8346709470304976,
"grad_norm": 2.519651174545288,
"learning_rate": 4.3237324536104165e-06,
"loss": 0.1361,
"mean_token_accuracy": 0.9493061900138855,
"num_tokens": 7712289.0,
"step": 884
},
{
"entropy": 1.2400332689285278,
"epoch": 2.837881219903692,
"grad_norm": 3.449007034301758,
"learning_rate": 4.313232210907959e-06,
"loss": 0.0991,
"mean_token_accuracy": 0.9627736210823059,
"num_tokens": 7720220.0,
"step": 885
},
{
"entropy": 1.1178945302963257,
"epoch": 2.841091492776886,
"grad_norm": 2.6759750843048096,
"learning_rate": 4.302735054327523e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9373737573623657,
"num_tokens": 7729471.0,
"step": 886
},
{
"entropy": 1.1199336647987366,
"epoch": 2.8443017656500804,
"grad_norm": 3.439345359802246,
"learning_rate": 4.292241031040077e-06,
"loss": 0.128,
"mean_token_accuracy": 0.9443832635879517,
"num_tokens": 7737350.0,
"step": 887
},
{
"entropy": 0.9779994189739227,
"epoch": 2.8475120385232744,
"grad_norm": 4.169414520263672,
"learning_rate": 4.28175018820252e-06,
"loss": 0.1363,
"mean_token_accuracy": 0.9478906691074371,
"num_tokens": 7746581.0,
"step": 888
},
{
"entropy": 1.1126951575279236,
"epoch": 2.850722311396469,
"grad_norm": 2.338712692260742,
"learning_rate": 4.271262572957453e-06,
"loss": 0.113,
"mean_token_accuracy": 0.9514721035957336,
"num_tokens": 7754335.0,
"step": 889
},
{
"entropy": 1.04286390542984,
"epoch": 2.853932584269663,
"grad_norm": 2.685058832168579,
"learning_rate": 4.2607782324329776e-06,
"loss": 0.144,
"mean_token_accuracy": 0.9451346397399902,
"num_tokens": 7762713.0,
"step": 890
},
{
"entropy": 1.0703285932540894,
"epoch": 2.857142857142857,
"grad_norm": 2.5074543952941895,
"learning_rate": 4.250297213742473e-06,
"loss": 0.1358,
"mean_token_accuracy": 0.9469152390956879,
"num_tokens": 7771455.0,
"step": 891
},
{
"entropy": 1.0886583030223846,
"epoch": 2.860353130016051,
"grad_norm": 2.3599722385406494,
"learning_rate": 4.239819563984397e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9473360478878021,
"num_tokens": 7780173.0,
"step": 892
},
{
"entropy": 1.0926342606544495,
"epoch": 2.8635634028892456,
"grad_norm": 3.8707892894744873,
"learning_rate": 4.229345330242067e-06,
"loss": 0.114,
"mean_token_accuracy": 0.9592483639717102,
"num_tokens": 7789100.0,
"step": 893
},
{
"entropy": 1.1691067814826965,
"epoch": 2.86677367576244,
"grad_norm": 2.060270309448242,
"learning_rate": 4.21887455958345e-06,
"loss": 0.1028,
"mean_token_accuracy": 0.9459799826145172,
"num_tokens": 7797562.0,
"step": 894
},
{
"entropy": 1.0735175609588623,
"epoch": 2.869983948635634,
"grad_norm": 2.2516980171203613,
"learning_rate": 4.2084072990609505e-06,
"loss": 0.1051,
"mean_token_accuracy": 0.9549958407878876,
"num_tokens": 7805770.0,
"step": 895
},
{
"entropy": 1.1715295910835266,
"epoch": 2.8731942215088284,
"grad_norm": 2.7888271808624268,
"learning_rate": 4.1979435957111984e-06,
"loss": 0.1514,
"mean_token_accuracy": 0.9374975264072418,
"num_tokens": 7814614.0,
"step": 896
},
{
"entropy": 1.0718038082122803,
"epoch": 2.8764044943820224,
"grad_norm": 2.4147868156433105,
"learning_rate": 4.187483496554844e-06,
"loss": 0.1459,
"mean_token_accuracy": 0.9425850808620453,
"num_tokens": 7823306.0,
"step": 897
},
{
"entropy": 0.9856555461883545,
"epoch": 2.879614767255217,
"grad_norm": 2.4258735179901123,
"learning_rate": 4.17702704859633e-06,
"loss": 0.1085,
"mean_token_accuracy": 0.9541434049606323,
"num_tokens": 7831505.0,
"step": 898
},
{
"entropy": 1.1180533170700073,
"epoch": 2.882825040128411,
"grad_norm": 64.09413146972656,
"learning_rate": 4.166574298823707e-06,
"loss": 0.1211,
"mean_token_accuracy": 0.9537391066551208,
"num_tokens": 7839948.0,
"step": 899
},
{
"entropy": 1.0941823720932007,
"epoch": 2.886035313001605,
"grad_norm": 4.044398307800293,
"learning_rate": 4.156125294208396e-06,
"loss": 0.1209,
"mean_token_accuracy": 0.9536958932876587,
"num_tokens": 7847550.0,
"step": 900
},
{
"entropy": 1.1155158281326294,
"epoch": 2.889245585874799,
"grad_norm": 5.201903820037842,
"learning_rate": 4.145680081704989e-06,
"loss": 0.1461,
"mean_token_accuracy": 0.9484029412269592,
"num_tokens": 7856829.0,
"step": 901
},
{
"entropy": 1.0542896389961243,
"epoch": 2.8924558587479936,
"grad_norm": 2.08670973777771,
"learning_rate": 4.135238708251045e-06,
"loss": 0.138,
"mean_token_accuracy": 0.9373140633106232,
"num_tokens": 7865501.0,
"step": 902
},
{
"entropy": 1.058946669101715,
"epoch": 2.895666131621188,
"grad_norm": 2.177431344985962,
"learning_rate": 4.1248012207668635e-06,
"loss": 0.0885,
"mean_token_accuracy": 0.9713162779808044,
"num_tokens": 7873454.0,
"step": 903
},
{
"entropy": 1.1428315043449402,
"epoch": 2.898876404494382,
"grad_norm": 2.48675799369812,
"learning_rate": 4.1143676661552876e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.9380343556404114,
"num_tokens": 7882945.0,
"step": 904
},
{
"entropy": 1.1195711493492126,
"epoch": 2.902086677367576,
"grad_norm": 2.479515552520752,
"learning_rate": 4.103938091301479e-06,
"loss": 0.1243,
"mean_token_accuracy": 0.9570600390434265,
"num_tokens": 7890724.0,
"step": 905
},
{
"entropy": 1.0864347219467163,
"epoch": 2.9052969502407704,
"grad_norm": 2.0829453468322754,
"learning_rate": 4.093512543072729e-06,
"loss": 0.1256,
"mean_token_accuracy": 0.9525614678859711,
"num_tokens": 7899892.0,
"step": 906
},
{
"entropy": 1.057499647140503,
"epoch": 2.908507223113965,
"grad_norm": 2.903841972351074,
"learning_rate": 4.08309106831822e-06,
"loss": 0.1255,
"mean_token_accuracy": 0.9514180123806,
"num_tokens": 7908834.0,
"step": 907
},
{
"entropy": 1.0348553955554962,
"epoch": 2.911717495987159,
"grad_norm": 2.450044631958008,
"learning_rate": 4.07267371386884e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.9236886203289032,
"num_tokens": 7919643.0,
"step": 908
},
{
"entropy": 1.1447957754135132,
"epoch": 2.914927768860353,
"grad_norm": 2.1943976879119873,
"learning_rate": 4.062260526536955e-06,
"loss": 0.1434,
"mean_token_accuracy": 0.9311514496803284,
"num_tokens": 7928989.0,
"step": 909
},
{
"entropy": 1.2298431992530823,
"epoch": 2.918138041733547,
"grad_norm": 3.0917184352874756,
"learning_rate": 4.051851553116208e-06,
"loss": 0.1502,
"mean_token_accuracy": 0.9259648025035858,
"num_tokens": 7939150.0,
"step": 910
},
{
"entropy": 1.2159399390220642,
"epoch": 2.9213483146067416,
"grad_norm": 2.3988490104675293,
"learning_rate": 4.041446840381309e-06,
"loss": 0.1005,
"mean_token_accuracy": 0.9625458717346191,
"num_tokens": 7947256.0,
"step": 911
},
{
"entropy": 0.9480354189872742,
"epoch": 2.924558587479936,
"grad_norm": 2.3145833015441895,
"learning_rate": 4.0310464350878145e-06,
"loss": 0.1322,
"mean_token_accuracy": 0.9379940032958984,
"num_tokens": 7955862.0,
"step": 912
},
{
"entropy": 1.056964099407196,
"epoch": 2.92776886035313,
"grad_norm": 2.7758469581604004,
"learning_rate": 4.0206503839719335e-06,
"loss": 0.1445,
"mean_token_accuracy": 0.9247631430625916,
"num_tokens": 7966052.0,
"step": 913
},
{
"entropy": 1.1305363178253174,
"epoch": 2.930979133226324,
"grad_norm": 2.4620068073272705,
"learning_rate": 4.0102587337503e-06,
"loss": 0.1141,
"mean_token_accuracy": 0.9462770223617554,
"num_tokens": 7974103.0,
"step": 914
},
{
"entropy": 1.148315191268921,
"epoch": 2.9341894060995184,
"grad_norm": 8.483851432800293,
"learning_rate": 3.999871531119779e-06,
"loss": 0.1201,
"mean_token_accuracy": 0.936420351266861,
"num_tokens": 7984482.0,
"step": 915
},
{
"entropy": 0.9917832612991333,
"epoch": 2.937399678972713,
"grad_norm": 2.34243106842041,
"learning_rate": 3.989488822757244e-06,
"loss": 0.1418,
"mean_token_accuracy": 0.9488804042339325,
"num_tokens": 7993276.0,
"step": 916
},
{
"entropy": 1.3016346096992493,
"epoch": 2.940609951845907,
"grad_norm": 5.7551703453063965,
"learning_rate": 3.9791106553193746e-06,
"loss": 0.1081,
"mean_token_accuracy": 0.9585071802139282,
"num_tokens": 8001595.0,
"step": 917
},
{
"entropy": 0.9895669221878052,
"epoch": 2.943820224719101,
"grad_norm": 2.7677927017211914,
"learning_rate": 3.968737075442449e-06,
"loss": 0.1007,
"mean_token_accuracy": 0.9597472846508026,
"num_tokens": 8009133.0,
"step": 918
},
{
"entropy": 1.2291934490203857,
"epoch": 2.947030497592295,
"grad_norm": 2.158738851547241,
"learning_rate": 3.9583681297421194e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9689956903457642,
"num_tokens": 8017442.0,
"step": 919
},
{
"entropy": 1.0079643726348877,
"epoch": 2.9502407704654896,
"grad_norm": 2.449575901031494,
"learning_rate": 3.9480038648132285e-06,
"loss": 0.1454,
"mean_token_accuracy": 0.9376430809497833,
"num_tokens": 8025695.0,
"step": 920
},
{
"entropy": 1.2521326541900635,
"epoch": 2.953451043338684,
"grad_norm": 2.2050304412841797,
"learning_rate": 3.937644327229572e-06,
"loss": 0.0997,
"mean_token_accuracy": 0.9615428447723389,
"num_tokens": 8034102.0,
"step": 921
},
{
"entropy": 1.1765710711479187,
"epoch": 2.956661316211878,
"grad_norm": 2.4940969944000244,
"learning_rate": 3.927289563543709e-06,
"loss": 0.1323,
"mean_token_accuracy": 0.9401477873325348,
"num_tokens": 8043454.0,
"step": 922
},
{
"entropy": 1.1329762935638428,
"epoch": 2.959871589085072,
"grad_norm": 2.112072229385376,
"learning_rate": 3.916939620286743e-06,
"loss": 0.1073,
"mean_token_accuracy": 0.9537553191184998,
"num_tokens": 8051936.0,
"step": 923
},
{
"entropy": 1.1569647192955017,
"epoch": 2.9630818619582664,
"grad_norm": 2.632157325744629,
"learning_rate": 3.906594543968122e-06,
"loss": 0.1351,
"mean_token_accuracy": 0.9533334970474243,
"num_tokens": 8060925.0,
"step": 924
},
{
"entropy": 1.1497327089309692,
"epoch": 2.966292134831461,
"grad_norm": 2.5286219120025635,
"learning_rate": 3.896254381075416e-06,
"loss": 0.143,
"mean_token_accuracy": 0.9448714256286621,
"num_tokens": 8070286.0,
"step": 925
},
{
"entropy": 1.2087596654891968,
"epoch": 2.969502407704655,
"grad_norm": 2.437488317489624,
"learning_rate": 3.885919178074116e-06,
"loss": 0.1167,
"mean_token_accuracy": 0.9381493926048279,
"num_tokens": 8080606.0,
"step": 926
},
{
"entropy": 1.0841256976127625,
"epoch": 2.972712680577849,
"grad_norm": 2.3981645107269287,
"learning_rate": 3.875588981407433e-06,
"loss": 0.1118,
"mean_token_accuracy": 0.9561174213886261,
"num_tokens": 8088771.0,
"step": 927
},
{
"entropy": 1.31938898563385,
"epoch": 2.975922953451043,
"grad_norm": 2.942803382873535,
"learning_rate": 3.865263837496072e-06,
"loss": 0.105,
"mean_token_accuracy": 0.9646160304546356,
"num_tokens": 8097762.0,
"step": 928
},
{
"entropy": 1.1591737866401672,
"epoch": 2.9791332263242376,
"grad_norm": 2.5216867923736572,
"learning_rate": 3.854943792738037e-06,
"loss": 0.1174,
"mean_token_accuracy": 0.9540515542030334,
"num_tokens": 8106011.0,
"step": 929
},
{
"entropy": 1.048449695110321,
"epoch": 2.982343499197432,
"grad_norm": 4.191019058227539,
"learning_rate": 3.844628893508417e-06,
"loss": 0.1492,
"mean_token_accuracy": 0.9386603832244873,
"num_tokens": 8115559.0,
"step": 930
},
{
"entropy": 1.1279476284980774,
"epoch": 2.985553772070626,
"grad_norm": 3.519676685333252,
"learning_rate": 3.834319186159179e-06,
"loss": 0.1268,
"mean_token_accuracy": 0.9528370797634125,
"num_tokens": 8125054.0,
"step": 931
},
{
"entropy": 1.1605662107467651,
"epoch": 2.98876404494382,
"grad_norm": 1.9513523578643799,
"learning_rate": 3.8240147170189575e-06,
"loss": 0.1064,
"mean_token_accuracy": 0.9599840044975281,
"num_tokens": 8134348.0,
"step": 932
},
{
"entropy": 1.1763730645179749,
"epoch": 2.9919743178170144,
"grad_norm": 2.3354673385620117,
"learning_rate": 3.8137155323928526e-06,
"loss": 0.1072,
"mean_token_accuracy": 0.9594251811504364,
"num_tokens": 8142889.0,
"step": 933
},
{
"entropy": 1.0491589307785034,
"epoch": 2.995184590690209,
"grad_norm": 4.819876194000244,
"learning_rate": 3.803421678562213e-06,
"loss": 0.1606,
"mean_token_accuracy": 0.9479463994503021,
"num_tokens": 8151077.0,
"step": 934
},
{
"entropy": 1.0207865834236145,
"epoch": 2.998394863563403,
"grad_norm": 2.169591188430786,
"learning_rate": 3.7931332017844302e-06,
"loss": 0.1391,
"mean_token_accuracy": 0.9430664777755737,
"num_tokens": 8159670.0,
"step": 935
},
{
"entropy": 1.1213672161102295,
"epoch": 3.0,
"grad_norm": 3.4354679584503174,
"learning_rate": 3.7828501482927416e-06,
"loss": 0.1082,
"mean_token_accuracy": 0.9591605067253113,
"num_tokens": 8163426.0,
"step": 936
},
{
"entropy": 1.1103613376617432,
"epoch": 3.0032102728731944,
"grad_norm": 1.608437180519104,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9727907776832581,
"num_tokens": 8172145.0,
"step": 937
},
{
"entropy": 0.972640872001648,
"epoch": 3.0064205457463884,
"grad_norm": 1.5159800052642822,
"learning_rate": 3.7623004959785066e-06,
"loss": 0.0545,
"mean_token_accuracy": 0.9817889928817749,
"num_tokens": 8181034.0,
"step": 938
},
{
"entropy": 1.1834158897399902,
"epoch": 3.009630818619583,
"grad_norm": 1.317238688468933,
"learning_rate": 3.752033989499742e-06,
"loss": 0.0394,
"mean_token_accuracy": 0.9858468174934387,
"num_tokens": 8188630.0,
"step": 939
},
{
"entropy": 1.2065880298614502,
"epoch": 3.012841091492777,
"grad_norm": 1.3101822137832642,
"learning_rate": 3.7417730909942184e-06,
"loss": 0.0406,
"mean_token_accuracy": 0.9882438480854034,
"num_tokens": 8196618.0,
"step": 940
},
{
"entropy": 1.1227167248725891,
"epoch": 3.016051364365971,
"grad_norm": 2.0894644260406494,
"learning_rate": 3.7315178465712364e-06,
"loss": 0.0477,
"mean_token_accuracy": 0.9829187393188477,
"num_tokens": 8205147.0,
"step": 941
},
{
"entropy": 1.0455093383789062,
"epoch": 3.019261637239165,
"grad_norm": 1.322717308998108,
"learning_rate": 3.721268302314698e-06,
"loss": 0.0604,
"mean_token_accuracy": 0.9677430689334869,
"num_tokens": 8214515.0,
"step": 942
},
{
"entropy": 1.129750370979309,
"epoch": 3.0224719101123596,
"grad_norm": 2.6341843605041504,
"learning_rate": 3.7110245042828786e-06,
"loss": 0.0454,
"mean_token_accuracy": 0.9830158650875092,
"num_tokens": 8222371.0,
"step": 943
},
{
"entropy": 0.9655829071998596,
"epoch": 3.0256821829855536,
"grad_norm": 1.5849435329437256,
"learning_rate": 3.70078649850824e-06,
"loss": 0.0614,
"mean_token_accuracy": 0.9761482775211334,
"num_tokens": 8231542.0,
"step": 944
},
{
"entropy": 1.1203289031982422,
"epoch": 3.028892455858748,
"grad_norm": 3.3625926971435547,
"learning_rate": 3.690554330997215e-06,
"loss": 0.0556,
"mean_token_accuracy": 0.9775514900684357,
"num_tokens": 8240850.0,
"step": 945
},
{
"entropy": 0.9774525761604309,
"epoch": 3.0321027287319424,
"grad_norm": 2.0355944633483887,
"learning_rate": 3.6803280477299975e-06,
"loss": 0.0559,
"mean_token_accuracy": 0.9804919064044952,
"num_tokens": 8250103.0,
"step": 946
},
{
"entropy": 0.8719805181026459,
"epoch": 3.0353130016051364,
"grad_norm": 5.178523540496826,
"learning_rate": 3.670107694660343e-06,
"loss": 0.0716,
"mean_token_accuracy": 0.966615617275238,
"num_tokens": 8259351.0,
"step": 947
},
{
"entropy": 1.1192417740821838,
"epoch": 3.038523274478331,
"grad_norm": 1.9812731742858887,
"learning_rate": 3.659893317715355e-06,
"loss": 0.0596,
"mean_token_accuracy": 0.9587984681129456,
"num_tokens": 8269251.0,
"step": 948
},
{
"entropy": 1.0000159740447998,
"epoch": 3.041733547351525,
"grad_norm": 3.3250041007995605,
"learning_rate": 3.6496849627952875e-06,
"loss": 0.0722,
"mean_token_accuracy": 0.972270131111145,
"num_tokens": 8278240.0,
"step": 949
},
{
"entropy": 1.0413598418235779,
"epoch": 3.044943820224719,
"grad_norm": 1.7375184297561646,
"learning_rate": 3.639482675773324e-06,
"loss": 0.0618,
"mean_token_accuracy": 0.9655102491378784,
"num_tokens": 8287483.0,
"step": 950
},
{
"entropy": 0.9738638401031494,
"epoch": 3.048154093097913,
"grad_norm": 1.5906513929367065,
"learning_rate": 3.6292865024953945e-06,
"loss": 0.0573,
"mean_token_accuracy": 0.9786509871482849,
"num_tokens": 8295377.0,
"step": 951
},
{
"entropy": 1.0087870359420776,
"epoch": 3.0513643659711076,
"grad_norm": 2.6421918869018555,
"learning_rate": 3.6190964887799418e-06,
"loss": 0.0447,
"mean_token_accuracy": 0.9821014106273651,
"num_tokens": 8303064.0,
"step": 952
},
{
"entropy": 0.9565515518188477,
"epoch": 3.0545746388443016,
"grad_norm": 14.785826683044434,
"learning_rate": 3.6089126804177373e-06,
"loss": 0.0517,
"mean_token_accuracy": 0.9803934693336487,
"num_tokens": 8311134.0,
"step": 953
},
{
"entropy": 1.0219348073005676,
"epoch": 3.057784911717496,
"grad_norm": 3.0249783992767334,
"learning_rate": 3.5987351231716665e-06,
"loss": 0.0515,
"mean_token_accuracy": 0.9799693524837494,
"num_tokens": 8319515.0,
"step": 954
},
{
"entropy": 1.0492010116577148,
"epoch": 3.0609951845906904,
"grad_norm": 2.180885076522827,
"learning_rate": 3.5885638627765228e-06,
"loss": 0.0732,
"mean_token_accuracy": 0.9596208930015564,
"num_tokens": 8328850.0,
"step": 955
},
{
"entropy": 1.0292112231254578,
"epoch": 3.0642054574638844,
"grad_norm": 2.0570931434631348,
"learning_rate": 3.5783989449388063e-06,
"loss": 0.0559,
"mean_token_accuracy": 0.9788411557674408,
"num_tokens": 8337971.0,
"step": 956
},
{
"entropy": 0.9308125078678131,
"epoch": 3.067415730337079,
"grad_norm": 2.3526527881622314,
"learning_rate": 3.568240415336509e-06,
"loss": 0.062,
"mean_token_accuracy": 0.9766317903995514,
"num_tokens": 8347188.0,
"step": 957
},
{
"entropy": 0.9573177099227905,
"epoch": 3.070626003210273,
"grad_norm": 2.2333858013153076,
"learning_rate": 3.5580883196189265e-06,
"loss": 0.0529,
"mean_token_accuracy": 0.9760076105594635,
"num_tokens": 8355560.0,
"step": 958
},
{
"entropy": 0.910061776638031,
"epoch": 3.073836276083467,
"grad_norm": 2.2188963890075684,
"learning_rate": 3.547942703406433e-06,
"loss": 0.0579,
"mean_token_accuracy": 0.9800683856010437,
"num_tokens": 8364189.0,
"step": 959
},
{
"entropy": 0.8721618950366974,
"epoch": 3.077046548956661,
"grad_norm": 1.6384341716766357,
"learning_rate": 3.5378036122902907e-06,
"loss": 0.0547,
"mean_token_accuracy": 0.9822494089603424,
"num_tokens": 8373325.0,
"step": 960
},
{
"entropy": 1.1164506673812866,
"epoch": 3.0802568218298556,
"grad_norm": 1.643190860748291,
"learning_rate": 3.52767109183244e-06,
"loss": 0.045,
"mean_token_accuracy": 0.9842567443847656,
"num_tokens": 8382070.0,
"step": 961
},
{
"entropy": 0.9429452419281006,
"epoch": 3.0834670947030496,
"grad_norm": 2.5088934898376465,
"learning_rate": 3.5175451875652906e-06,
"loss": 0.0445,
"mean_token_accuracy": 0.9792550504207611,
"num_tokens": 8390101.0,
"step": 962
},
{
"entropy": 0.895278811454773,
"epoch": 3.086677367576244,
"grad_norm": 1.734321117401123,
"learning_rate": 3.507425944991529e-06,
"loss": 0.0407,
"mean_token_accuracy": 0.9858044385910034,
"num_tokens": 8398284.0,
"step": 963
},
{
"entropy": 0.9392586350440979,
"epoch": 3.0898876404494384,
"grad_norm": 2.23935604095459,
"learning_rate": 3.4973134095838943e-06,
"loss": 0.045,
"mean_token_accuracy": 0.9840765595436096,
"num_tokens": 8406504.0,
"step": 964
},
{
"entropy": 0.9843576550483704,
"epoch": 3.0930979133226324,
"grad_norm": 2.003302574157715,
"learning_rate": 3.4872076267850015e-06,
"loss": 0.0574,
"mean_token_accuracy": 0.9778844714164734,
"num_tokens": 8415045.0,
"step": 965
},
{
"entropy": 1.0201025605201721,
"epoch": 3.096308186195827,
"grad_norm": 3.679013967514038,
"learning_rate": 3.4771086420071053e-06,
"loss": 0.0511,
"mean_token_accuracy": 0.980453222990036,
"num_tokens": 8422888.0,
"step": 966
},
{
"entropy": 1.300868034362793,
"epoch": 3.099518459069021,
"grad_norm": 2.1512715816497803,
"learning_rate": 3.4670165006319236e-06,
"loss": 0.0397,
"mean_token_accuracy": 0.9860817492008209,
"num_tokens": 8431965.0,
"step": 967
},
{
"entropy": 1.0145321488380432,
"epoch": 3.102728731942215,
"grad_norm": 5.593048095703125,
"learning_rate": 3.4569312480104157e-06,
"loss": 0.0457,
"mean_token_accuracy": 0.9817441999912262,
"num_tokens": 8440345.0,
"step": 968
},
{
"entropy": 0.9421059787273407,
"epoch": 3.105939004815409,
"grad_norm": 1.5621843338012695,
"learning_rate": 3.4468529294625895e-06,
"loss": 0.0472,
"mean_token_accuracy": 0.977195680141449,
"num_tokens": 8448849.0,
"step": 969
},
{
"entropy": 0.9190913140773773,
"epoch": 3.1091492776886036,
"grad_norm": 1.9628736972808838,
"learning_rate": 3.4367815902772917e-06,
"loss": 0.0582,
"mean_token_accuracy": 0.9760018289089203,
"num_tokens": 8457341.0,
"step": 970
},
{
"entropy": 1.0129391252994537,
"epoch": 3.1123595505617976,
"grad_norm": 1.6691995859146118,
"learning_rate": 3.4267172757120005e-06,
"loss": 0.0511,
"mean_token_accuracy": 0.9774636328220367,
"num_tokens": 8465557.0,
"step": 971
},
{
"entropy": 1.0064606368541718,
"epoch": 3.115569823434992,
"grad_norm": 6.057915687561035,
"learning_rate": 3.416660030992639e-06,
"loss": 0.0569,
"mean_token_accuracy": 0.9772391021251678,
"num_tokens": 8474480.0,
"step": 972
},
{
"entropy": 1.0960939228534698,
"epoch": 3.1187800963081864,
"grad_norm": 1.9116007089614868,
"learning_rate": 3.406609901313349e-06,
"loss": 0.0567,
"mean_token_accuracy": 0.9655565321445465,
"num_tokens": 8484235.0,
"step": 973
},
{
"entropy": 0.9288766980171204,
"epoch": 3.1219903691813804,
"grad_norm": 11.40886116027832,
"learning_rate": 3.396566931836308e-06,
"loss": 0.0723,
"mean_token_accuracy": 0.9614908397197723,
"num_tokens": 8494366.0,
"step": 974
},
{
"entropy": 0.9967909157276154,
"epoch": 3.125200642054575,
"grad_norm": 1.3702481985092163,
"learning_rate": 3.386531167691512e-06,
"loss": 0.053,
"mean_token_accuracy": 0.978669673204422,
"num_tokens": 8504740.0,
"step": 975
},
{
"entropy": 1.1818091869354248,
"epoch": 3.128410914927769,
"grad_norm": 1.1393635272979736,
"learning_rate": 3.3765026539765832e-06,
"loss": 0.0232,
"mean_token_accuracy": 0.9925740659236908,
"num_tokens": 8513110.0,
"step": 976
},
{
"entropy": 0.9709301590919495,
"epoch": 3.131621187800963,
"grad_norm": 2.256098747253418,
"learning_rate": 3.36648143575656e-06,
"loss": 0.0641,
"mean_token_accuracy": 0.9657347202301025,
"num_tokens": 8523348.0,
"step": 977
},
{
"entropy": 0.8825556635856628,
"epoch": 3.134831460674157,
"grad_norm": 7.103281497955322,
"learning_rate": 3.3564675580636946e-06,
"loss": 0.0543,
"mean_token_accuracy": 0.9809089303016663,
"num_tokens": 8531559.0,
"step": 978
},
{
"entropy": 0.8873893618583679,
"epoch": 3.1380417335473516,
"grad_norm": 1.9619849920272827,
"learning_rate": 3.3464610658972584e-06,
"loss": 0.0503,
"mean_token_accuracy": 0.9831579029560089,
"num_tokens": 8539982.0,
"step": 979
},
{
"entropy": 1.0100898146629333,
"epoch": 3.1412520064205456,
"grad_norm": 1.5911046266555786,
"learning_rate": 3.3364620042233316e-06,
"loss": 0.0325,
"mean_token_accuracy": 0.9892257153987885,
"num_tokens": 8548033.0,
"step": 980
},
{
"entropy": 0.9736209511756897,
"epoch": 3.14446227929374,
"grad_norm": 6.580364227294922,
"learning_rate": 3.326470417974604e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9663437604904175,
"num_tokens": 8556845.0,
"step": 981
},
{
"entropy": 0.9981383681297302,
"epoch": 3.1476725521669344,
"grad_norm": 2.606773853302002,
"learning_rate": 3.3164863520501744e-06,
"loss": 0.0511,
"mean_token_accuracy": 0.982105016708374,
"num_tokens": 8565025.0,
"step": 982
},
{
"entropy": 0.8959035575389862,
"epoch": 3.1508828250401284,
"grad_norm": 1.872805118560791,
"learning_rate": 3.3065098513153473e-06,
"loss": 0.0516,
"mean_token_accuracy": 0.9818290174007416,
"num_tokens": 8573640.0,
"step": 983
},
{
"entropy": 1.0557291507720947,
"epoch": 3.154093097913323,
"grad_norm": 1.5255221128463745,
"learning_rate": 3.29654096060143e-06,
"loss": 0.0645,
"mean_token_accuracy": 0.9671049416065216,
"num_tokens": 8581760.0,
"step": 984
},
{
"entropy": 0.899553507566452,
"epoch": 3.157303370786517,
"grad_norm": 1.7938666343688965,
"learning_rate": 3.2865797247055354e-06,
"loss": 0.044,
"mean_token_accuracy": 0.9840608239173889,
"num_tokens": 8589385.0,
"step": 985
},
{
"entropy": 0.9979848265647888,
"epoch": 3.160513643659711,
"grad_norm": 2.103757381439209,
"learning_rate": 3.2766261883903744e-06,
"loss": 0.0547,
"mean_token_accuracy": 0.9734188914299011,
"num_tokens": 8597447.0,
"step": 986
},
{
"entropy": 0.9901096820831299,
"epoch": 3.163723916532905,
"grad_norm": 2.168027877807617,
"learning_rate": 3.266680396384061e-06,
"loss": 0.0601,
"mean_token_accuracy": 0.9723277390003204,
"num_tokens": 8606238.0,
"step": 987
},
{
"entropy": 0.9194740653038025,
"epoch": 3.1669341894060996,
"grad_norm": 2.060809373855591,
"learning_rate": 3.256742393379909e-06,
"loss": 0.0671,
"mean_token_accuracy": 0.9673266708850861,
"num_tokens": 8614880.0,
"step": 988
},
{
"entropy": 1.0402765274047852,
"epoch": 3.1701444622792936,
"grad_norm": 2.8911445140838623,
"learning_rate": 3.2468122240362287e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9752613008022308,
"num_tokens": 8623325.0,
"step": 989
},
{
"entropy": 1.0076416432857513,
"epoch": 3.173354735152488,
"grad_norm": 2.1394102573394775,
"learning_rate": 3.2368899329761316e-06,
"loss": 0.074,
"mean_token_accuracy": 0.9638822078704834,
"num_tokens": 8633417.0,
"step": 990
},
{
"entropy": 0.98023921251297,
"epoch": 3.176565008025682,
"grad_norm": 1.8175908327102661,
"learning_rate": 3.226975564787322e-06,
"loss": 0.069,
"mean_token_accuracy": 0.9612008631229401,
"num_tokens": 8643761.0,
"step": 991
},
{
"entropy": 0.9907765090465546,
"epoch": 3.1797752808988764,
"grad_norm": 1.614745020866394,
"learning_rate": 3.2170691640219077e-06,
"loss": 0.0516,
"mean_token_accuracy": 0.9799723327159882,
"num_tokens": 8652126.0,
"step": 992
},
{
"entropy": 1.0575563311576843,
"epoch": 3.182985553772071,
"grad_norm": 2.4234402179718018,
"learning_rate": 3.2071707751961838e-06,
"loss": 0.0795,
"mean_token_accuracy": 0.9665969610214233,
"num_tokens": 8660420.0,
"step": 993
},
{
"entropy": 1.0722668170928955,
"epoch": 3.186195826645265,
"grad_norm": 2.0058796405792236,
"learning_rate": 3.197280442790455e-06,
"loss": 0.0515,
"mean_token_accuracy": 0.9787575006484985,
"num_tokens": 8669423.0,
"step": 994
},
{
"entropy": 0.9013467729091644,
"epoch": 3.189406099518459,
"grad_norm": 1.7118332386016846,
"learning_rate": 3.187398211248811e-06,
"loss": 0.0772,
"mean_token_accuracy": 0.9585808515548706,
"num_tokens": 8678899.0,
"step": 995
},
{
"entropy": 0.9303914904594421,
"epoch": 3.192616372391653,
"grad_norm": 1.357763409614563,
"learning_rate": 3.1775241249789434e-06,
"loss": 0.0419,
"mean_token_accuracy": 0.9790163636207581,
"num_tokens": 8687793.0,
"step": 996
},
{
"entropy": 0.9078644514083862,
"epoch": 3.1958266452648476,
"grad_norm": 1.659339427947998,
"learning_rate": 3.1676582283519454e-06,
"loss": 0.0566,
"mean_token_accuracy": 0.9744837284088135,
"num_tokens": 8696488.0,
"step": 997
},
{
"entropy": 1.036493569612503,
"epoch": 3.1990369181380416,
"grad_norm": 2.6456494331359863,
"learning_rate": 3.1578005657021004e-06,
"loss": 0.0325,
"mean_token_accuracy": 0.9887253046035767,
"num_tokens": 8704137.0,
"step": 998
},
{
"entropy": 0.9375521242618561,
"epoch": 3.202247191011236,
"grad_norm": 2.551023483276367,
"learning_rate": 3.1479511813267006e-06,
"loss": 0.0474,
"mean_token_accuracy": 0.9822751581668854,
"num_tokens": 8712176.0,
"step": 999
},
{
"entropy": 0.9312825500965118,
"epoch": 3.20545746388443,
"grad_norm": 1.506113052368164,
"learning_rate": 3.1381101194858264e-06,
"loss": 0.0443,
"mean_token_accuracy": 0.9840706288814545,
"num_tokens": 8720821.0,
"step": 1000
},
{
"entropy": 1.1557820439338684,
"epoch": 3.2086677367576244,
"grad_norm": 1.624644160270691,
"learning_rate": 3.1282774244021717e-06,
"loss": 0.0499,
"mean_token_accuracy": 0.9834720492362976,
"num_tokens": 8730703.0,
"step": 1001
},
{
"entropy": 1.0071000158786774,
"epoch": 3.211878009630819,
"grad_norm": 2.3831636905670166,
"learning_rate": 3.118453140260823e-06,
"loss": 0.0435,
"mean_token_accuracy": 0.9820217788219452,
"num_tokens": 8738677.0,
"step": 1002
},
{
"entropy": 1.228837251663208,
"epoch": 3.215088282504013,
"grad_norm": 15.644500732421875,
"learning_rate": 3.1086373112090762e-06,
"loss": 0.0451,
"mean_token_accuracy": 0.9836629033088684,
"num_tokens": 8747024.0,
"step": 1003
},
{
"entropy": 0.9827360510826111,
"epoch": 3.218298555377207,
"grad_norm": 2.2126834392547607,
"learning_rate": 3.0988299813562304e-06,
"loss": 0.0472,
"mean_token_accuracy": 0.9848807752132416,
"num_tokens": 8754758.0,
"step": 1004
},
{
"entropy": 1.0021247863769531,
"epoch": 3.221508828250401,
"grad_norm": 2.114436388015747,
"learning_rate": 3.089031194773392e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9792338311672211,
"num_tokens": 8763540.0,
"step": 1005
},
{
"entropy": 1.1125357747077942,
"epoch": 3.2247191011235956,
"grad_norm": 3.2401301860809326,
"learning_rate": 3.079240995493279e-06,
"loss": 0.0472,
"mean_token_accuracy": 0.9815241992473602,
"num_tokens": 8772746.0,
"step": 1006
},
{
"entropy": 0.9442520439624786,
"epoch": 3.2279293739967896,
"grad_norm": 1.4487026929855347,
"learning_rate": 3.069459427510014e-06,
"loss": 0.0349,
"mean_token_accuracy": 0.9873338937759399,
"num_tokens": 8781167.0,
"step": 1007
},
{
"entropy": 0.9544734358787537,
"epoch": 3.231139646869984,
"grad_norm": 1.6530356407165527,
"learning_rate": 3.0596865347789444e-06,
"loss": 0.0764,
"mean_token_accuracy": 0.9499466717243195,
"num_tokens": 8791131.0,
"step": 1008
},
{
"entropy": 0.9192224740982056,
"epoch": 3.234349919743178,
"grad_norm": 2.113713264465332,
"learning_rate": 3.049922361216422e-06,
"loss": 0.0597,
"mean_token_accuracy": 0.9740520119667053,
"num_tokens": 8799470.0,
"step": 1009
},
{
"entropy": 0.9941827058792114,
"epoch": 3.2375601926163724,
"grad_norm": 2.598651885986328,
"learning_rate": 3.040166950699626e-06,
"loss": 0.0663,
"mean_token_accuracy": 0.979779839515686,
"num_tokens": 8809349.0,
"step": 1010
},
{
"entropy": 1.0034226775169373,
"epoch": 3.240770465489567,
"grad_norm": 1.9838776588439941,
"learning_rate": 3.0304203470663507e-06,
"loss": 0.0445,
"mean_token_accuracy": 0.9826087951660156,
"num_tokens": 8816640.0,
"step": 1011
},
{
"entropy": 1.0576593279838562,
"epoch": 3.243980738362761,
"grad_norm": 2.287182569503784,
"learning_rate": 3.0206825941148203e-06,
"loss": 0.0445,
"mean_token_accuracy": 0.9830202162265778,
"num_tokens": 8824940.0,
"step": 1012
},
{
"entropy": 0.8799232840538025,
"epoch": 3.247191011235955,
"grad_norm": 1.7231913805007935,
"learning_rate": 3.0109537356034856e-06,
"loss": 0.0692,
"mean_token_accuracy": 0.964619368314743,
"num_tokens": 8835331.0,
"step": 1013
},
{
"entropy": 0.915042519569397,
"epoch": 3.250401284109149,
"grad_norm": 2.0848681926727295,
"learning_rate": 3.001233815250823e-06,
"loss": 0.0769,
"mean_token_accuracy": 0.9604291617870331,
"num_tokens": 8845517.0,
"step": 1014
},
{
"entropy": 0.9062038064002991,
"epoch": 3.2536115569823436,
"grad_norm": 2.325322151184082,
"learning_rate": 2.991522876735154e-06,
"loss": 0.088,
"mean_token_accuracy": 0.9585862755775452,
"num_tokens": 8855134.0,
"step": 1015
},
{
"entropy": 0.9652998447418213,
"epoch": 3.2568218298555376,
"grad_norm": 2.039144277572632,
"learning_rate": 2.981820963694427e-06,
"loss": 0.0369,
"mean_token_accuracy": 0.986418753862381,
"num_tokens": 8863208.0,
"step": 1016
},
{
"entropy": 0.9741517305374146,
"epoch": 3.260032102728732,
"grad_norm": 1.9456210136413574,
"learning_rate": 2.9721281197260427e-06,
"loss": 0.0621,
"mean_token_accuracy": 0.9771439135074615,
"num_tokens": 8871148.0,
"step": 1017
},
{
"entropy": 1.0131028890609741,
"epoch": 3.263242375601926,
"grad_norm": 2.170149803161621,
"learning_rate": 2.9624443883866403e-06,
"loss": 0.043,
"mean_token_accuracy": 0.9852020740509033,
"num_tokens": 8880453.0,
"step": 1018
},
{
"entropy": 1.0579794645309448,
"epoch": 3.2664526484751204,
"grad_norm": 1.5485424995422363,
"learning_rate": 2.9527698131919156e-06,
"loss": 0.038,
"mean_token_accuracy": 0.9849701225757599,
"num_tokens": 8888958.0,
"step": 1019
},
{
"entropy": 1.0165627002716064,
"epoch": 3.2696629213483144,
"grad_norm": 2.5804836750030518,
"learning_rate": 2.9431044376164165e-06,
"loss": 0.0444,
"mean_token_accuracy": 0.9853474199771881,
"num_tokens": 8896866.0,
"step": 1020
},
{
"entropy": 0.9434081315994263,
"epoch": 3.272873194221509,
"grad_norm": 1.672654151916504,
"learning_rate": 2.9334483050933506e-06,
"loss": 0.0778,
"mean_token_accuracy": 0.9503156840801239,
"num_tokens": 8906557.0,
"step": 1021
},
{
"entropy": 0.9355335235595703,
"epoch": 3.276083467094703,
"grad_norm": 2.510097026824951,
"learning_rate": 2.9238014590143925e-06,
"loss": 0.0485,
"mean_token_accuracy": 0.9850959777832031,
"num_tokens": 8914286.0,
"step": 1022
},
{
"entropy": 0.8669168055057526,
"epoch": 3.279293739967897,
"grad_norm": 9.337780952453613,
"learning_rate": 2.91416394272948e-06,
"loss": 0.06,
"mean_token_accuracy": 0.9728347659111023,
"num_tokens": 8922690.0,
"step": 1023
},
{
"entropy": 0.9273563027381897,
"epoch": 3.2825040128410916,
"grad_norm": 2.416398525238037,
"learning_rate": 2.904535799546636e-06,
"loss": 0.054,
"mean_token_accuracy": 0.9759286046028137,
"num_tokens": 8931721.0,
"step": 1024
},
{
"entropy": 0.914330780506134,
"epoch": 3.2857142857142856,
"grad_norm": 1.6186331510543823,
"learning_rate": 2.894917072731753e-06,
"loss": 0.0362,
"mean_token_accuracy": 0.9858781099319458,
"num_tokens": 8939293.0,
"step": 1025
},
{
"entropy": 0.9978557825088501,
"epoch": 3.28892455858748,
"grad_norm": 1.6200190782546997,
"learning_rate": 2.8853078055084192e-06,
"loss": 0.0465,
"mean_token_accuracy": 0.9847012162208557,
"num_tokens": 8947140.0,
"step": 1026
},
{
"entropy": 1.1273800134658813,
"epoch": 3.292134831460674,
"grad_norm": 1.6572569608688354,
"learning_rate": 2.8757080410577042e-06,
"loss": 0.0402,
"mean_token_accuracy": 0.9817869663238525,
"num_tokens": 8955096.0,
"step": 1027
},
{
"entropy": 0.9369406700134277,
"epoch": 3.2953451043338684,
"grad_norm": 1.2704312801361084,
"learning_rate": 2.866117822517982e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9776290953159332,
"num_tokens": 8963424.0,
"step": 1028
},
{
"entropy": 1.066481113433838,
"epoch": 3.2985553772070624,
"grad_norm": 2.7775206565856934,
"learning_rate": 2.8565371929847286e-06,
"loss": 0.0698,
"mean_token_accuracy": 0.9550573229789734,
"num_tokens": 8973499.0,
"step": 1029
},
{
"entropy": 1.1146639585494995,
"epoch": 3.301765650080257,
"grad_norm": 2.5436532497406006,
"learning_rate": 2.846966195510332e-06,
"loss": 0.0671,
"mean_token_accuracy": 0.9698319137096405,
"num_tokens": 8982236.0,
"step": 1030
},
{
"entropy": 0.9178177118301392,
"epoch": 3.304975922953451,
"grad_norm": 1.7456978559494019,
"learning_rate": 2.83740487310389e-06,
"loss": 0.0553,
"mean_token_accuracy": 0.9707920253276825,
"num_tokens": 8990852.0,
"step": 1031
},
{
"entropy": 1.0382152795791626,
"epoch": 3.308186195826645,
"grad_norm": 1.6169335842132568,
"learning_rate": 2.82785326873103e-06,
"loss": 0.0378,
"mean_token_accuracy": 0.9834270775318146,
"num_tokens": 8998939.0,
"step": 1032
},
{
"entropy": 0.993056982755661,
"epoch": 3.3113964686998396,
"grad_norm": 1.7227951288223267,
"learning_rate": 2.81831142531371e-06,
"loss": 0.0483,
"mean_token_accuracy": 0.9841017127037048,
"num_tokens": 9008345.0,
"step": 1033
},
{
"entropy": 1.0055307149887085,
"epoch": 3.3146067415730336,
"grad_norm": 1.711963176727295,
"learning_rate": 2.8087793857300193e-06,
"loss": 0.0533,
"mean_token_accuracy": 0.9790107905864716,
"num_tokens": 9017254.0,
"step": 1034
},
{
"entropy": 1.0133417248725891,
"epoch": 3.317817014446228,
"grad_norm": 2.1641650199890137,
"learning_rate": 2.7992571928139984e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9752263724803925,
"num_tokens": 9026056.0,
"step": 1035
},
{
"entropy": 0.9465528428554535,
"epoch": 3.321027287319422,
"grad_norm": 10.7071533203125,
"learning_rate": 2.7897448893554335e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.9831990897655487,
"num_tokens": 9034411.0,
"step": 1036
},
{
"entropy": 0.9765456318855286,
"epoch": 3.3242375601926164,
"grad_norm": 2.850576400756836,
"learning_rate": 2.780242518099675e-06,
"loss": 0.0477,
"mean_token_accuracy": 0.9827427566051483,
"num_tokens": 9043094.0,
"step": 1037
},
{
"entropy": 0.9502105414867401,
"epoch": 3.3274478330658104,
"grad_norm": 3.9407103061676025,
"learning_rate": 2.7707501217474443e-06,
"loss": 0.0483,
"mean_token_accuracy": 0.9828430116176605,
"num_tokens": 9050481.0,
"step": 1038
},
{
"entropy": 0.962793231010437,
"epoch": 3.330658105939005,
"grad_norm": 1.6887401342391968,
"learning_rate": 2.761267742954629e-06,
"loss": 0.0414,
"mean_token_accuracy": 0.9845134019851685,
"num_tokens": 9058366.0,
"step": 1039
},
{
"entropy": 0.9595210254192352,
"epoch": 3.333868378812199,
"grad_norm": 1.9709408283233643,
"learning_rate": 2.7517954243321097e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9848853349685669,
"num_tokens": 9066272.0,
"step": 1040
},
{
"entropy": 1.081430196762085,
"epoch": 3.337078651685393,
"grad_norm": 1.9100728034973145,
"learning_rate": 2.7423332084455543e-06,
"loss": 0.0562,
"mean_token_accuracy": 0.9827399849891663,
"num_tokens": 9074883.0,
"step": 1041
},
{
"entropy": 1.0304247736930847,
"epoch": 3.3402889245585876,
"grad_norm": 2.094480276107788,
"learning_rate": 2.7328811378152355e-06,
"loss": 0.0549,
"mean_token_accuracy": 0.9760044515132904,
"num_tokens": 9083403.0,
"step": 1042
},
{
"entropy": 0.9497547447681427,
"epoch": 3.3434991974317816,
"grad_norm": 2.770940065383911,
"learning_rate": 2.723439254915834e-06,
"loss": 0.0674,
"mean_token_accuracy": 0.9691638946533203,
"num_tokens": 9093305.0,
"step": 1043
},
{
"entropy": 0.9552264213562012,
"epoch": 3.346709470304976,
"grad_norm": 1.8198769092559814,
"learning_rate": 2.714007602176254e-06,
"loss": 0.0561,
"mean_token_accuracy": 0.9818349182605743,
"num_tokens": 9101874.0,
"step": 1044
},
{
"entropy": 1.0001209378242493,
"epoch": 3.34991974317817,
"grad_norm": 2.130765199661255,
"learning_rate": 2.704586221979422e-06,
"loss": 0.0443,
"mean_token_accuracy": 0.9807817041873932,
"num_tokens": 9111703.0,
"step": 1045
},
{
"entropy": 1.0091004967689514,
"epoch": 3.3531300160513644,
"grad_norm": 1.6668496131896973,
"learning_rate": 2.695175156662107e-06,
"loss": 0.0412,
"mean_token_accuracy": 0.9821631014347076,
"num_tokens": 9119776.0,
"step": 1046
},
{
"entropy": 0.9886317849159241,
"epoch": 3.3563402889245584,
"grad_norm": 2.097139596939087,
"learning_rate": 2.6857744485147286e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.972685307264328,
"num_tokens": 9129487.0,
"step": 1047
},
{
"entropy": 0.8918091654777527,
"epoch": 3.359550561797753,
"grad_norm": 3.4406795501708984,
"learning_rate": 2.6763841397811576e-06,
"loss": 0.0656,
"mean_token_accuracy": 0.9593222439289093,
"num_tokens": 9140250.0,
"step": 1048
},
{
"entropy": 1.0260462164878845,
"epoch": 3.362760834670947,
"grad_norm": 1.657132863998413,
"learning_rate": 2.667004272658541e-06,
"loss": 0.052,
"mean_token_accuracy": 0.9808726608753204,
"num_tokens": 9149142.0,
"step": 1049
},
{
"entropy": 0.9491671919822693,
"epoch": 3.365971107544141,
"grad_norm": 2.0722362995147705,
"learning_rate": 2.6576348892970947e-06,
"loss": 0.0456,
"mean_token_accuracy": 0.9820691347122192,
"num_tokens": 9156975.0,
"step": 1050
},
{
"entropy": 1.0107131600379944,
"epoch": 3.3691813804173356,
"grad_norm": 1.412963628768921,
"learning_rate": 2.6482760317999338e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.9728775322437286,
"num_tokens": 9165612.0,
"step": 1051
},
{
"entropy": 0.9788424372673035,
"epoch": 3.3723916532905296,
"grad_norm": 1.7426668405532837,
"learning_rate": 2.638927742222868e-06,
"loss": 0.0434,
"mean_token_accuracy": 0.9835657775402069,
"num_tokens": 9173886.0,
"step": 1052
},
{
"entropy": 0.9652360081672668,
"epoch": 3.375601926163724,
"grad_norm": 1.7111523151397705,
"learning_rate": 2.629590062574221e-06,
"loss": 0.0488,
"mean_token_accuracy": 0.9817163050174713,
"num_tokens": 9182802.0,
"step": 1053
},
{
"entropy": 0.9846494793891907,
"epoch": 3.378812199036918,
"grad_norm": 2.427800178527832,
"learning_rate": 2.6202630348146323e-06,
"loss": 0.0692,
"mean_token_accuracy": 0.970174103975296,
"num_tokens": 9191462.0,
"step": 1054
},
{
"entropy": 1.0558529496192932,
"epoch": 3.3820224719101124,
"grad_norm": 1.4618818759918213,
"learning_rate": 2.610946700856885e-06,
"loss": 0.0409,
"mean_token_accuracy": 0.9835179150104523,
"num_tokens": 9200009.0,
"step": 1055
},
{
"entropy": 0.8598636388778687,
"epoch": 3.3852327447833064,
"grad_norm": 1.6279900074005127,
"learning_rate": 2.6016411025656973e-06,
"loss": 0.0749,
"mean_token_accuracy": 0.9546991288661957,
"num_tokens": 9209206.0,
"step": 1056
},
{
"entropy": 0.9998753070831299,
"epoch": 3.388443017656501,
"grad_norm": 2.4744317531585693,
"learning_rate": 2.592346281757552e-06,
"loss": 0.0532,
"mean_token_accuracy": 0.9732007086277008,
"num_tokens": 9218881.0,
"step": 1057
},
{
"entropy": 0.7904504835605621,
"epoch": 3.391653290529695,
"grad_norm": 1.8489923477172852,
"learning_rate": 2.583062280200501e-06,
"loss": 0.0701,
"mean_token_accuracy": 0.9678979218006134,
"num_tokens": 9227741.0,
"step": 1058
},
{
"entropy": 0.8975243866443634,
"epoch": 3.394863563402889,
"grad_norm": 10.23280143737793,
"learning_rate": 2.5737891396139713e-06,
"loss": 0.0623,
"mean_token_accuracy": 0.9779176414012909,
"num_tokens": 9235749.0,
"step": 1059
},
{
"entropy": 0.9414438307285309,
"epoch": 3.3980738362760836,
"grad_norm": 2.947763681411743,
"learning_rate": 2.5645269016685905e-06,
"loss": 0.0729,
"mean_token_accuracy": 0.9428462088108063,
"num_tokens": 9247768.0,
"step": 1060
},
{
"entropy": 0.9279659390449524,
"epoch": 3.4012841091492776,
"grad_norm": 2.126018762588501,
"learning_rate": 2.5552756079859904e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9858635365962982,
"num_tokens": 9255066.0,
"step": 1061
},
{
"entropy": 1.1867004036903381,
"epoch": 3.404494382022472,
"grad_norm": 1.8234483003616333,
"learning_rate": 2.5460353001386263e-06,
"loss": 0.0393,
"mean_token_accuracy": 0.9828614294528961,
"num_tokens": 9263725.0,
"step": 1062
},
{
"entropy": 0.9649862051010132,
"epoch": 3.407704654895666,
"grad_norm": 1.4239985942840576,
"learning_rate": 2.5368060196495785e-06,
"loss": 0.0272,
"mean_token_accuracy": 0.9885632693767548,
"num_tokens": 9272284.0,
"step": 1063
},
{
"entropy": 1.181826651096344,
"epoch": 3.4109149277688604,
"grad_norm": 1.85142982006073,
"learning_rate": 2.527587807992383e-06,
"loss": 0.0486,
"mean_token_accuracy": 0.9821223318576813,
"num_tokens": 9281554.0,
"step": 1064
},
{
"entropy": 0.816430002450943,
"epoch": 3.4141252006420544,
"grad_norm": 1.6388859748840332,
"learning_rate": 2.5183807065908296e-06,
"loss": 0.0936,
"mean_token_accuracy": 0.9386367201805115,
"num_tokens": 9292509.0,
"step": 1065
},
{
"entropy": 0.7913811504840851,
"epoch": 3.417335473515249,
"grad_norm": 1.6456538438796997,
"learning_rate": 2.5091847568187834e-06,
"loss": 0.0583,
"mean_token_accuracy": 0.9755530953407288,
"num_tokens": 9301615.0,
"step": 1066
},
{
"entropy": 1.036412626504898,
"epoch": 3.420545746388443,
"grad_norm": 1.9353599548339844,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0503,
"mean_token_accuracy": 0.9810832142829895,
"num_tokens": 9310589.0,
"step": 1067
},
{
"entropy": 0.7897546291351318,
"epoch": 3.423756019261637,
"grad_norm": 2.1252787113189697,
"learning_rate": 2.4908264774079355e-06,
"loss": 0.0568,
"mean_token_accuracy": 0.9774205684661865,
"num_tokens": 9319064.0,
"step": 1068
},
{
"entropy": 1.031320035457611,
"epoch": 3.4269662921348316,
"grad_norm": 1.4396086931228638,
"learning_rate": 2.4816642302655634e-06,
"loss": 0.0328,
"mean_token_accuracy": 0.9882897734642029,
"num_tokens": 9327453.0,
"step": 1069
},
{
"entropy": 0.9724080562591553,
"epoch": 3.4301765650080256,
"grad_norm": 2.4586398601531982,
"learning_rate": 2.4725132997451833e-06,
"loss": 0.0551,
"mean_token_accuracy": 0.9825824201107025,
"num_tokens": 9336542.0,
"step": 1070
},
{
"entropy": 0.9164369702339172,
"epoch": 3.43338683788122,
"grad_norm": 2.929311513900757,
"learning_rate": 2.4633737269682546e-06,
"loss": 0.0558,
"mean_token_accuracy": 0.9753805994987488,
"num_tokens": 9345800.0,
"step": 1071
},
{
"entropy": 0.8264903128147125,
"epoch": 3.436597110754414,
"grad_norm": 1.931374192237854,
"learning_rate": 2.454245553005184e-06,
"loss": 0.0537,
"mean_token_accuracy": 0.978405624628067,
"num_tokens": 9354054.0,
"step": 1072
},
{
"entropy": 0.8463916182518005,
"epoch": 3.4398073836276084,
"grad_norm": 1.8540114164352417,
"learning_rate": 2.445128818875166e-06,
"loss": 0.0593,
"mean_token_accuracy": 0.9726223349571228,
"num_tokens": 9363638.0,
"step": 1073
},
{
"entropy": 0.8569443821907043,
"epoch": 3.4430176565008024,
"grad_norm": 1.8228055238723755,
"learning_rate": 2.4360235655459804e-06,
"loss": 0.0325,
"mean_token_accuracy": 0.9868163168430328,
"num_tokens": 9371525.0,
"step": 1074
},
{
"entropy": 0.9562407732009888,
"epoch": 3.446227929373997,
"grad_norm": 1.992489218711853,
"learning_rate": 2.4269298339338205e-06,
"loss": 0.0544,
"mean_token_accuracy": 0.9766569435596466,
"num_tokens": 9381012.0,
"step": 1075
},
{
"entropy": 0.8250246644020081,
"epoch": 3.449438202247191,
"grad_norm": 2.0697720050811768,
"learning_rate": 2.4178476649031057e-06,
"loss": 0.0686,
"mean_token_accuracy": 0.9746454358100891,
"num_tokens": 9389158.0,
"step": 1076
},
{
"entropy": 0.8266815543174744,
"epoch": 3.452648475120385,
"grad_norm": 2.3080739974975586,
"learning_rate": 2.408777099266291e-06,
"loss": 0.0319,
"mean_token_accuracy": 0.9853142201900482,
"num_tokens": 9396753.0,
"step": 1077
},
{
"entropy": 0.9661762714385986,
"epoch": 3.4558587479935796,
"grad_norm": 1.6249902248382568,
"learning_rate": 2.3997181777836955e-06,
"loss": 0.052,
"mean_token_accuracy": 0.9755984842777252,
"num_tokens": 9405294.0,
"step": 1078
},
{
"entropy": 0.904697835445404,
"epoch": 3.4590690208667736,
"grad_norm": 2.562695026397705,
"learning_rate": 2.3906709411633073e-06,
"loss": 0.0473,
"mean_token_accuracy": 0.9813754260540009,
"num_tokens": 9413357.0,
"step": 1079
},
{
"entropy": 1.0254833102226257,
"epoch": 3.462279293739968,
"grad_norm": 1.5701454877853394,
"learning_rate": 2.381635430060611e-06,
"loss": 0.056,
"mean_token_accuracy": 0.9748643338680267,
"num_tokens": 9423096.0,
"step": 1080
},
{
"entropy": 0.9985256493091583,
"epoch": 3.465489566613162,
"grad_norm": 1.7520354986190796,
"learning_rate": 2.3726116850783987e-06,
"loss": 0.0455,
"mean_token_accuracy": 0.9827691316604614,
"num_tokens": 9431857.0,
"step": 1081
},
{
"entropy": 0.9760493338108063,
"epoch": 3.4686998394863564,
"grad_norm": 1.6460782289505005,
"learning_rate": 2.3635997467665905e-06,
"loss": 0.0426,
"mean_token_accuracy": 0.9851132333278656,
"num_tokens": 9439408.0,
"step": 1082
},
{
"entropy": 1.1165828108787537,
"epoch": 3.4719101123595504,
"grad_norm": 1.316416621208191,
"learning_rate": 2.354599655622049e-06,
"loss": 0.0366,
"mean_token_accuracy": 0.9826886057853699,
"num_tokens": 9449020.0,
"step": 1083
},
{
"entropy": 0.8983205258846283,
"epoch": 3.475120385232745,
"grad_norm": 1.3936208486557007,
"learning_rate": 2.3456114520883956e-06,
"loss": 0.0486,
"mean_token_accuracy": 0.9764930009841919,
"num_tokens": 9458218.0,
"step": 1084
},
{
"entropy": 0.9232110977172852,
"epoch": 3.478330658105939,
"grad_norm": 4.920319557189941,
"learning_rate": 2.3366351765558437e-06,
"loss": 0.0515,
"mean_token_accuracy": 0.9800127148628235,
"num_tokens": 9466307.0,
"step": 1085
},
{
"entropy": 0.9811087250709534,
"epoch": 3.481540930979133,
"grad_norm": 1.8882064819335938,
"learning_rate": 2.3276708693609947e-06,
"loss": 0.0456,
"mean_token_accuracy": 0.9796569645404816,
"num_tokens": 9474803.0,
"step": 1086
},
{
"entropy": 0.96051424741745,
"epoch": 3.4847512038523276,
"grad_norm": 2.430860996246338,
"learning_rate": 2.318718570786675e-06,
"loss": 0.056,
"mean_token_accuracy": 0.9816667437553406,
"num_tokens": 9482461.0,
"step": 1087
},
{
"entropy": 0.997778058052063,
"epoch": 3.4879614767255216,
"grad_norm": 3.68143892288208,
"learning_rate": 2.309778321061742e-06,
"loss": 0.0429,
"mean_token_accuracy": 0.9797113835811615,
"num_tokens": 9491235.0,
"step": 1088
},
{
"entropy": 0.8883638083934784,
"epoch": 3.491171749598716,
"grad_norm": 1.9456281661987305,
"learning_rate": 2.3008501603609147e-06,
"loss": 0.0474,
"mean_token_accuracy": 0.9790343940258026,
"num_tokens": 9500344.0,
"step": 1089
},
{
"entropy": 0.9200843572616577,
"epoch": 3.49438202247191,
"grad_norm": 1.5608545541763306,
"learning_rate": 2.2919341288045853e-06,
"loss": 0.0422,
"mean_token_accuracy": 0.9835698008537292,
"num_tokens": 9508039.0,
"step": 1090
},
{
"entropy": 0.8582479655742645,
"epoch": 3.4975922953451044,
"grad_norm": 1.6942861080169678,
"learning_rate": 2.283030266458644e-06,
"loss": 0.0572,
"mean_token_accuracy": 0.9660318493843079,
"num_tokens": 9517616.0,
"step": 1091
},
{
"entropy": 0.9380939602851868,
"epoch": 3.5008025682182984,
"grad_norm": 1.8551942110061646,
"learning_rate": 2.2741386133342923e-06,
"loss": 0.0646,
"mean_token_accuracy": 0.9725143611431122,
"num_tokens": 9527651.0,
"step": 1092
},
{
"entropy": 0.8994499444961548,
"epoch": 3.504012841091493,
"grad_norm": 3.501117706298828,
"learning_rate": 2.265259209387867e-06,
"loss": 0.0894,
"mean_token_accuracy": 0.9523926079273224,
"num_tokens": 9537469.0,
"step": 1093
},
{
"entropy": 0.8829051852226257,
"epoch": 3.5072231139646872,
"grad_norm": 1.714121699333191,
"learning_rate": 2.256392094520664e-06,
"loss": 0.0526,
"mean_token_accuracy": 0.9835188090801239,
"num_tokens": 9545776.0,
"step": 1094
},
{
"entropy": 0.9695333242416382,
"epoch": 3.510433386837881,
"grad_norm": 1.8620688915252686,
"learning_rate": 2.2475373085787568e-06,
"loss": 0.0495,
"mean_token_accuracy": 0.9754804074764252,
"num_tokens": 9555075.0,
"step": 1095
},
{
"entropy": 0.9979302287101746,
"epoch": 3.513643659711075,
"grad_norm": 2.942721366882324,
"learning_rate": 2.238694891352814e-06,
"loss": 0.0654,
"mean_token_accuracy": 0.9712414145469666,
"num_tokens": 9564309.0,
"step": 1096
},
{
"entropy": 0.8716126382350922,
"epoch": 3.5168539325842696,
"grad_norm": 1.671080470085144,
"learning_rate": 2.229864882577921e-06,
"loss": 0.034,
"mean_token_accuracy": 0.9897077977657318,
"num_tokens": 9572682.0,
"step": 1097
},
{
"entropy": 0.8658833503723145,
"epoch": 3.520064205457464,
"grad_norm": 5.941058158874512,
"learning_rate": 2.2210473219334083e-06,
"loss": 0.0424,
"mean_token_accuracy": 0.986923485994339,
"num_tokens": 9581560.0,
"step": 1098
},
{
"entropy": 0.9082388877868652,
"epoch": 3.523274478330658,
"grad_norm": 2.375026226043701,
"learning_rate": 2.2122422490426676e-06,
"loss": 0.0401,
"mean_token_accuracy": 0.9843015968799591,
"num_tokens": 9589017.0,
"step": 1099
},
{
"entropy": 1.0297791361808777,
"epoch": 3.5264847512038524,
"grad_norm": 1.8525582551956177,
"learning_rate": 2.203449703472969e-06,
"loss": 0.0335,
"mean_token_accuracy": 0.9889396727085114,
"num_tokens": 9597697.0,
"step": 1100
},
{
"entropy": 1.1267684698104858,
"epoch": 3.5296950240770464,
"grad_norm": 1.5526533126831055,
"learning_rate": 2.194669724735296e-06,
"loss": 0.0373,
"mean_token_accuracy": 0.9850233793258667,
"num_tokens": 9606293.0,
"step": 1101
},
{
"entropy": 0.9412164390087128,
"epoch": 3.532905296950241,
"grad_norm": 5.0308966636657715,
"learning_rate": 2.1859023522841543e-06,
"loss": 0.0576,
"mean_token_accuracy": 0.9791690409183502,
"num_tokens": 9615202.0,
"step": 1102
},
{
"entropy": 0.9152273833751678,
"epoch": 3.5361155698234352,
"grad_norm": 1.82210373878479,
"learning_rate": 2.1771476255174056e-06,
"loss": 0.0522,
"mean_token_accuracy": 0.9795254766941071,
"num_tokens": 9623925.0,
"step": 1103
},
{
"entropy": 0.9812245965003967,
"epoch": 3.539325842696629,
"grad_norm": 1.7946172952651978,
"learning_rate": 2.1684055837760837e-06,
"loss": 0.0691,
"mean_token_accuracy": 0.9617434144020081,
"num_tokens": 9632749.0,
"step": 1104
},
{
"entropy": 0.8433893024921417,
"epoch": 3.542536115569823,
"grad_norm": 2.0056488513946533,
"learning_rate": 2.159676266344222e-06,
"loss": 0.0499,
"mean_token_accuracy": 0.9789745509624481,
"num_tokens": 9641328.0,
"step": 1105
},
{
"entropy": 0.9740126729011536,
"epoch": 3.5457463884430176,
"grad_norm": 1.5101797580718994,
"learning_rate": 2.1509597124486693e-06,
"loss": 0.0432,
"mean_token_accuracy": 0.9827054440975189,
"num_tokens": 9650409.0,
"step": 1106
},
{
"entropy": 0.9411405026912689,
"epoch": 3.548956661316212,
"grad_norm": 1.9099135398864746,
"learning_rate": 2.1422559612589266e-06,
"loss": 0.0548,
"mean_token_accuracy": 0.9779680073261261,
"num_tokens": 9658267.0,
"step": 1107
},
{
"entropy": 0.9189652502536774,
"epoch": 3.552166934189406,
"grad_norm": 2.395860195159912,
"learning_rate": 2.1335650518869555e-06,
"loss": 0.0549,
"mean_token_accuracy": 0.9741061329841614,
"num_tokens": 9666958.0,
"step": 1108
},
{
"entropy": 0.9332525432109833,
"epoch": 3.5553772070626004,
"grad_norm": 1.5610212087631226,
"learning_rate": 2.124887023387017e-06,
"loss": 0.0476,
"mean_token_accuracy": 0.9716036319732666,
"num_tokens": 9675816.0,
"step": 1109
},
{
"entropy": 0.7910544574260712,
"epoch": 3.5585874799357944,
"grad_norm": 1.3642332553863525,
"learning_rate": 2.1162219147554884e-06,
"loss": 0.0409,
"mean_token_accuracy": 0.9837254583835602,
"num_tokens": 9684953.0,
"step": 1110
},
{
"entropy": 0.9269280433654785,
"epoch": 3.561797752808989,
"grad_norm": 1.9942110776901245,
"learning_rate": 2.1075697649306838e-06,
"loss": 0.0772,
"mean_token_accuracy": 0.9430525600910187,
"num_tokens": 9695435.0,
"step": 1111
},
{
"entropy": 1.0160551369190216,
"epoch": 3.5650080256821832,
"grad_norm": 1.7112821340560913,
"learning_rate": 2.09893061279269e-06,
"loss": 0.0436,
"mean_token_accuracy": 0.9804078638553619,
"num_tokens": 9703957.0,
"step": 1112
},
{
"entropy": 1.1850579977035522,
"epoch": 3.568218298555377,
"grad_norm": 1.7301826477050781,
"learning_rate": 2.0903044971631854e-06,
"loss": 0.0283,
"mean_token_accuracy": 0.9907903373241425,
"num_tokens": 9711468.0,
"step": 1113
},
{
"entropy": 0.8523805141448975,
"epoch": 3.571428571428571,
"grad_norm": 2.1167054176330566,
"learning_rate": 2.0816914568052664e-06,
"loss": 0.0625,
"mean_token_accuracy": 0.966397762298584,
"num_tokens": 9720236.0,
"step": 1114
},
{
"entropy": 1.0446801781654358,
"epoch": 3.5746388443017656,
"grad_norm": 3.2320852279663086,
"learning_rate": 2.0730915304232692e-06,
"loss": 0.0543,
"mean_token_accuracy": 0.9817685484886169,
"num_tokens": 9729457.0,
"step": 1115
},
{
"entropy": 0.9242656528949738,
"epoch": 3.57784911717496,
"grad_norm": 2.0443546772003174,
"learning_rate": 2.0645047566626057e-06,
"loss": 0.0361,
"mean_token_accuracy": 0.9866881966590881,
"num_tokens": 9737693.0,
"step": 1116
},
{
"entropy": 0.9498637318611145,
"epoch": 3.581059390048154,
"grad_norm": 4.602542400360107,
"learning_rate": 2.055931174109579e-06,
"loss": 0.0541,
"mean_token_accuracy": 0.970384955406189,
"num_tokens": 9747416.0,
"step": 1117
},
{
"entropy": 0.8563813269138336,
"epoch": 3.5842696629213484,
"grad_norm": 1.4554708003997803,
"learning_rate": 2.0473708212912167e-06,
"loss": 0.0423,
"mean_token_accuracy": 0.9849738478660583,
"num_tokens": 9756080.0,
"step": 1118
},
{
"entropy": 0.985105961561203,
"epoch": 3.5874799357945424,
"grad_norm": 1.49336838722229,
"learning_rate": 2.0388237366751005e-06,
"loss": 0.0417,
"mean_token_accuracy": 0.9799233675003052,
"num_tokens": 9765320.0,
"step": 1119
},
{
"entropy": 0.9542718529701233,
"epoch": 3.590690208667737,
"grad_norm": 1.775028944015503,
"learning_rate": 2.030289958669181e-06,
"loss": 0.0383,
"mean_token_accuracy": 0.9866673350334167,
"num_tokens": 9773379.0,
"step": 1120
},
{
"entropy": 0.903919130563736,
"epoch": 3.5939004815409312,
"grad_norm": 2.254425048828125,
"learning_rate": 2.02176952562162e-06,
"loss": 0.0619,
"mean_token_accuracy": 0.9764844477176666,
"num_tokens": 9781226.0,
"step": 1121
},
{
"entropy": 1.0569791793823242,
"epoch": 3.597110754414125,
"grad_norm": 1.8237416744232178,
"learning_rate": 2.013262475820602e-06,
"loss": 0.0362,
"mean_token_accuracy": 0.9858551025390625,
"num_tokens": 9789760.0,
"step": 1122
},
{
"entropy": 1.1246366500854492,
"epoch": 3.600321027287319,
"grad_norm": 3.342473268508911,
"learning_rate": 2.004768847494186e-06,
"loss": 0.0345,
"mean_token_accuracy": 0.9862525463104248,
"num_tokens": 9798992.0,
"step": 1123
},
{
"entropy": 0.840887725353241,
"epoch": 3.6035313001605136,
"grad_norm": 1.3073285818099976,
"learning_rate": 1.996288678810105e-06,
"loss": 0.0462,
"mean_token_accuracy": 0.9791007936000824,
"num_tokens": 9807916.0,
"step": 1124
},
{
"entropy": 0.9333318173885345,
"epoch": 3.606741573033708,
"grad_norm": 1.5826224088668823,
"learning_rate": 1.987822007875617e-06,
"loss": 0.0535,
"mean_token_accuracy": 0.9793388843536377,
"num_tokens": 9817036.0,
"step": 1125
},
{
"entropy": 0.9564113020896912,
"epoch": 3.609951845906902,
"grad_norm": 1.7803699970245361,
"learning_rate": 1.979368872737319e-06,
"loss": 0.0324,
"mean_token_accuracy": 0.9891190826892853,
"num_tokens": 9825054.0,
"step": 1126
},
{
"entropy": 0.8959764540195465,
"epoch": 3.6131621187800964,
"grad_norm": 1.7652521133422852,
"learning_rate": 1.9709293113809876e-06,
"loss": 0.0596,
"mean_token_accuracy": 0.9736920297145844,
"num_tokens": 9834378.0,
"step": 1127
},
{
"entropy": 1.0392656326293945,
"epoch": 3.6163723916532904,
"grad_norm": 1.287834644317627,
"learning_rate": 1.962503361731403e-06,
"loss": 0.0332,
"mean_token_accuracy": 0.9876022934913635,
"num_tokens": 9842473.0,
"step": 1128
},
{
"entropy": 0.8851215839385986,
"epoch": 3.619582664526485,
"grad_norm": 2.2874505519866943,
"learning_rate": 1.954091061652172e-06,
"loss": 0.0401,
"mean_token_accuracy": 0.9856323301792145,
"num_tokens": 9850848.0,
"step": 1129
},
{
"entropy": 0.911045104265213,
"epoch": 3.6227929373996792,
"grad_norm": 2.200509548187256,
"learning_rate": 1.945692448945574e-06,
"loss": 0.0427,
"mean_token_accuracy": 0.9832578599452972,
"num_tokens": 9859913.0,
"step": 1130
},
{
"entropy": 0.8001708984375,
"epoch": 3.626003210272873,
"grad_norm": 26.58523178100586,
"learning_rate": 1.9373075613523728e-06,
"loss": 0.0567,
"mean_token_accuracy": 0.9789122343063354,
"num_tokens": 9868320.0,
"step": 1131
},
{
"entropy": 0.8729480504989624,
"epoch": 3.629213483146067,
"grad_norm": 1.7441883087158203,
"learning_rate": 1.928936436551661e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9858026504516602,
"num_tokens": 9877124.0,
"step": 1132
},
{
"entropy": 0.8823387026786804,
"epoch": 3.6324237560192616,
"grad_norm": 1.6312586069107056,
"learning_rate": 1.920579112160685e-06,
"loss": 0.047,
"mean_token_accuracy": 0.9812817573547363,
"num_tokens": 9885575.0,
"step": 1133
},
{
"entropy": 0.839708000421524,
"epoch": 3.635634028892456,
"grad_norm": 3.5764527320861816,
"learning_rate": 1.912235625734676e-06,
"loss": 0.0508,
"mean_token_accuracy": 0.9844213128089905,
"num_tokens": 9894771.0,
"step": 1134
},
{
"entropy": 0.9408073723316193,
"epoch": 3.63884430176565,
"grad_norm": 6.8944010734558105,
"learning_rate": 1.903906014766681e-06,
"loss": 0.0412,
"mean_token_accuracy": 0.9850984215736389,
"num_tokens": 9903863.0,
"step": 1135
},
{
"entropy": 0.9451856017112732,
"epoch": 3.6420545746388444,
"grad_norm": 3.8949129581451416,
"learning_rate": 1.8955903166873924e-06,
"loss": 0.0476,
"mean_token_accuracy": 0.9816523194313049,
"num_tokens": 9911733.0,
"step": 1136
},
{
"entropy": 0.9529353082180023,
"epoch": 3.6452648475120384,
"grad_norm": 1.9870771169662476,
"learning_rate": 1.8872885688649879e-06,
"loss": 0.0546,
"mean_token_accuracy": 0.961370050907135,
"num_tokens": 9921287.0,
"step": 1137
},
{
"entropy": 1.0110719799995422,
"epoch": 3.648475120385233,
"grad_norm": 5.824323654174805,
"learning_rate": 1.8790008086049534e-06,
"loss": 0.0623,
"mean_token_accuracy": 0.9748804271221161,
"num_tokens": 9930558.0,
"step": 1138
},
{
"entropy": 0.9448727667331696,
"epoch": 3.6516853932584272,
"grad_norm": 1.6180834770202637,
"learning_rate": 1.8707270731499223e-06,
"loss": 0.0345,
"mean_token_accuracy": 0.9872872233390808,
"num_tokens": 9938191.0,
"step": 1139
},
{
"entropy": 0.9800795316696167,
"epoch": 3.654895666131621,
"grad_norm": 2.881392240524292,
"learning_rate": 1.862467399679499e-06,
"loss": 0.0468,
"mean_token_accuracy": 0.9823011159896851,
"num_tokens": 9947471.0,
"step": 1140
},
{
"entropy": 0.9238375723361969,
"epoch": 3.658105939004815,
"grad_norm": 2.864609479904175,
"learning_rate": 1.854221825310103e-06,
"loss": 0.0682,
"mean_token_accuracy": 0.9768114387989044,
"num_tokens": 9956161.0,
"step": 1141
},
{
"entropy": 0.8451533913612366,
"epoch": 3.6613162118780096,
"grad_norm": 2.6949856281280518,
"learning_rate": 1.8459903870947954e-06,
"loss": 0.0493,
"mean_token_accuracy": 0.9820267260074615,
"num_tokens": 9964537.0,
"step": 1142
},
{
"entropy": 0.9593810737133026,
"epoch": 3.664526484751204,
"grad_norm": 5.500277042388916,
"learning_rate": 1.8377731220231144e-06,
"loss": 0.049,
"mean_token_accuracy": 0.9815813601016998,
"num_tokens": 9973257.0,
"step": 1143
},
{
"entropy": 0.9055139720439911,
"epoch": 3.667736757624398,
"grad_norm": 2.1441760063171387,
"learning_rate": 1.829570067020906e-06,
"loss": 0.0476,
"mean_token_accuracy": 0.9799624383449554,
"num_tokens": 9981629.0,
"step": 1144
},
{
"entropy": 1.0653272867202759,
"epoch": 3.6709470304975924,
"grad_norm": 1.7149978876113892,
"learning_rate": 1.8213812589501611e-06,
"loss": 0.0316,
"mean_token_accuracy": 0.9889988601207733,
"num_tokens": 9990584.0,
"step": 1145
},
{
"entropy": 0.8833724558353424,
"epoch": 3.6741573033707864,
"grad_norm": 2.4739246368408203,
"learning_rate": 1.813206734608851e-06,
"loss": 0.0589,
"mean_token_accuracy": 0.9782996773719788,
"num_tokens": 9998933.0,
"step": 1146
},
{
"entropy": 1.0559395551681519,
"epoch": 3.677367576243981,
"grad_norm": 1.88206946849823,
"learning_rate": 1.8050465307307602e-06,
"loss": 0.0349,
"mean_token_accuracy": 0.9845419228076935,
"num_tokens": 10007298.0,
"step": 1147
},
{
"entropy": 0.95055291056633,
"epoch": 3.6805778491171752,
"grad_norm": 2.4245777130126953,
"learning_rate": 1.7969006839853227e-06,
"loss": 0.0617,
"mean_token_accuracy": 0.9638779163360596,
"num_tokens": 10017955.0,
"step": 1148
},
{
"entropy": 0.8645216226577759,
"epoch": 3.683788121990369,
"grad_norm": 2.250040054321289,
"learning_rate": 1.78876923097745e-06,
"loss": 0.0566,
"mean_token_accuracy": 0.97188401222229,
"num_tokens": 10027187.0,
"step": 1149
},
{
"entropy": 0.8478440046310425,
"epoch": 3.686998394863563,
"grad_norm": 1.9543880224227905,
"learning_rate": 1.7806522082473809e-06,
"loss": 0.0385,
"mean_token_accuracy": 0.986393392086029,
"num_tokens": 10035355.0,
"step": 1150
},
{
"entropy": 0.8509060144424438,
"epoch": 3.6902086677367576,
"grad_norm": 2.297797679901123,
"learning_rate": 1.7725496522704998e-06,
"loss": 0.0507,
"mean_token_accuracy": 0.977655678987503,
"num_tokens": 10043721.0,
"step": 1151
},
{
"entropy": 0.8556525111198425,
"epoch": 3.693418940609952,
"grad_norm": 1.3254326581954956,
"learning_rate": 1.7644615994571934e-06,
"loss": 0.0468,
"mean_token_accuracy": 0.9736096262931824,
"num_tokens": 10053117.0,
"step": 1152
},
{
"entropy": 1.0885184109210968,
"epoch": 3.696629213483146,
"grad_norm": 1.5111006498336792,
"learning_rate": 1.7563880861526656e-06,
"loss": 0.0522,
"mean_token_accuracy": 0.9707938432693481,
"num_tokens": 10064650.0,
"step": 1153
},
{
"entropy": 0.8594561219215393,
"epoch": 3.6998394863563404,
"grad_norm": 2.320162534713745,
"learning_rate": 1.748329148636787e-06,
"loss": 0.0455,
"mean_token_accuracy": 0.9815157949924469,
"num_tokens": 10073861.0,
"step": 1154
},
{
"entropy": 1.0225560665130615,
"epoch": 3.7030497592295344,
"grad_norm": 2.3292529582977295,
"learning_rate": 1.7402848231239317e-06,
"loss": 0.033,
"mean_token_accuracy": 0.9850149750709534,
"num_tokens": 10081779.0,
"step": 1155
},
{
"entropy": 0.916724443435669,
"epoch": 3.706260032102729,
"grad_norm": 1.684735655784607,
"learning_rate": 1.73225514576281e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.9606063067913055,
"num_tokens": 10092506.0,
"step": 1156
},
{
"entropy": 0.8862948417663574,
"epoch": 3.7094703049759232,
"grad_norm": 1.5901275873184204,
"learning_rate": 1.7242401526363095e-06,
"loss": 0.0479,
"mean_token_accuracy": 0.9779063761234283,
"num_tokens": 10101108.0,
"step": 1157
},
{
"entropy": 0.9733172357082367,
"epoch": 3.712680577849117,
"grad_norm": 2.0854811668395996,
"learning_rate": 1.7162398797613284e-06,
"loss": 0.0524,
"mean_token_accuracy": 0.9822612106800079,
"num_tokens": 10110150.0,
"step": 1158
},
{
"entropy": 0.9979034662246704,
"epoch": 3.715890850722311,
"grad_norm": 2.5162012577056885,
"learning_rate": 1.70825436308862e-06,
"loss": 0.0348,
"mean_token_accuracy": 0.9886457622051239,
"num_tokens": 10118322.0,
"step": 1159
},
{
"entropy": 0.9389398097991943,
"epoch": 3.7191011235955056,
"grad_norm": 2.1497745513916016,
"learning_rate": 1.7002836385026234e-06,
"loss": 0.0479,
"mean_token_accuracy": 0.982035368680954,
"num_tokens": 10127091.0,
"step": 1160
},
{
"entropy": 0.908422589302063,
"epoch": 3.7223113964687,
"grad_norm": 2.7780277729034424,
"learning_rate": 1.692327741821312e-06,
"loss": 0.0343,
"mean_token_accuracy": 0.9829728901386261,
"num_tokens": 10134938.0,
"step": 1161
},
{
"entropy": 0.9549289643764496,
"epoch": 3.725521669341894,
"grad_norm": 1.9559484720230103,
"learning_rate": 1.6843867087960252e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9825837016105652,
"num_tokens": 10143758.0,
"step": 1162
},
{
"entropy": 0.9937103390693665,
"epoch": 3.7287319422150884,
"grad_norm": 1.8624933958053589,
"learning_rate": 1.676460575111306e-06,
"loss": 0.0408,
"mean_token_accuracy": 0.9817036986351013,
"num_tokens": 10152815.0,
"step": 1163
},
{
"entropy": 0.9328649938106537,
"epoch": 3.7319422150882824,
"grad_norm": 1.420202374458313,
"learning_rate": 1.6685493763847515e-06,
"loss": 0.0371,
"mean_token_accuracy": 0.9874887466430664,
"num_tokens": 10161410.0,
"step": 1164
},
{
"entropy": 0.8172413110733032,
"epoch": 3.735152487961477,
"grad_norm": 6.408421516418457,
"learning_rate": 1.6606531481668364e-06,
"loss": 0.059,
"mean_token_accuracy": 0.9759405851364136,
"num_tokens": 10170748.0,
"step": 1165
},
{
"entropy": 1.009339064359665,
"epoch": 3.738362760834671,
"grad_norm": 1.9974569082260132,
"learning_rate": 1.6527719259407743e-06,
"loss": 0.044,
"mean_token_accuracy": 0.9845010340213776,
"num_tokens": 10179326.0,
"step": 1166
},
{
"entropy": 0.9294591248035431,
"epoch": 3.741573033707865,
"grad_norm": 3.3291337490081787,
"learning_rate": 1.6449057451223354e-06,
"loss": 0.0495,
"mean_token_accuracy": 0.9824508726596832,
"num_tokens": 10188628.0,
"step": 1167
},
{
"entropy": 0.9091964364051819,
"epoch": 3.744783306581059,
"grad_norm": 2.6491594314575195,
"learning_rate": 1.6370546410597066e-06,
"loss": 0.0698,
"mean_token_accuracy": 0.9683757126331329,
"num_tokens": 10198174.0,
"step": 1168
},
{
"entropy": 0.8906883001327515,
"epoch": 3.7479935794542536,
"grad_norm": 1.4277008771896362,
"learning_rate": 1.6292186490333172e-06,
"loss": 0.0436,
"mean_token_accuracy": 0.9807183742523193,
"num_tokens": 10206596.0,
"step": 1169
},
{
"entropy": 0.8736357986927032,
"epoch": 3.751203852327448,
"grad_norm": 2.0264739990234375,
"learning_rate": 1.6213978042556938e-06,
"loss": 0.0412,
"mean_token_accuracy": 0.9833411276340485,
"num_tokens": 10214049.0,
"step": 1170
},
{
"entropy": 0.9112011790275574,
"epoch": 3.754414125200642,
"grad_norm": 2.4318811893463135,
"learning_rate": 1.6135921418712959e-06,
"loss": 0.0397,
"mean_token_accuracy": 0.9881952702999115,
"num_tokens": 10222212.0,
"step": 1171
},
{
"entropy": 0.9542613327503204,
"epoch": 3.7576243980738364,
"grad_norm": 3.0339975357055664,
"learning_rate": 1.6058016969563512e-06,
"loss": 0.0602,
"mean_token_accuracy": 0.9773413836956024,
"num_tokens": 10231984.0,
"step": 1172
},
{
"entropy": 0.8619020581245422,
"epoch": 3.7608346709470304,
"grad_norm": 1.3047006130218506,
"learning_rate": 1.5980265045187139e-06,
"loss": 0.042,
"mean_token_accuracy": 0.9838049113750458,
"num_tokens": 10240121.0,
"step": 1173
},
{
"entropy": 0.8658272624015808,
"epoch": 3.764044943820225,
"grad_norm": 9.934652328491211,
"learning_rate": 1.5902665994976896e-06,
"loss": 0.0563,
"mean_token_accuracy": 0.9773064851760864,
"num_tokens": 10248747.0,
"step": 1174
},
{
"entropy": 1.0443784594535828,
"epoch": 3.767255216693419,
"grad_norm": 1.5346416234970093,
"learning_rate": 1.5825220167638945e-06,
"loss": 0.0301,
"mean_token_accuracy": 0.9880676567554474,
"num_tokens": 10257534.0,
"step": 1175
},
{
"entropy": 0.9445019364356995,
"epoch": 3.770465489566613,
"grad_norm": 1.3560583591461182,
"learning_rate": 1.5747927911190858e-06,
"loss": 0.0288,
"mean_token_accuracy": 0.989595890045166,
"num_tokens": 10265790.0,
"step": 1176
},
{
"entropy": 0.8617814779281616,
"epoch": 3.773675762439807,
"grad_norm": 2.822770118713379,
"learning_rate": 1.567078957296016e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.971150130033493,
"num_tokens": 10274497.0,
"step": 1177
},
{
"entropy": 0.9549345076084137,
"epoch": 3.7768860353130016,
"grad_norm": 1.5040322542190552,
"learning_rate": 1.5593805499582659e-06,
"loss": 0.0299,
"mean_token_accuracy": 0.9895735383033752,
"num_tokens": 10282792.0,
"step": 1178
},
{
"entropy": 0.9082767367362976,
"epoch": 3.780096308186196,
"grad_norm": 1.8122907876968384,
"learning_rate": 1.5516976037000941e-06,
"loss": 0.0413,
"mean_token_accuracy": 0.9823250472545624,
"num_tokens": 10290439.0,
"step": 1179
},
{
"entropy": 0.9210187792778015,
"epoch": 3.78330658105939,
"grad_norm": 2.638749599456787,
"learning_rate": 1.544030153046291e-06,
"loss": 0.0424,
"mean_token_accuracy": 0.9761735498905182,
"num_tokens": 10299177.0,
"step": 1180
},
{
"entropy": 0.9334617257118225,
"epoch": 3.7865168539325844,
"grad_norm": 2.229041576385498,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.0538,
"mean_token_accuracy": 0.9787396788597107,
"num_tokens": 10308598.0,
"step": 1181
},
{
"entropy": 0.8418469130992889,
"epoch": 3.7897271268057784,
"grad_norm": 2.188577175140381,
"learning_rate": 1.528741876302598e-06,
"loss": 0.0601,
"mean_token_accuracy": 0.965255469083786,
"num_tokens": 10318063.0,
"step": 1182
},
{
"entropy": 0.9776929020881653,
"epoch": 3.792937399678973,
"grad_norm": 1.7480143308639526,
"learning_rate": 1.5211211189134955e-06,
"loss": 0.0492,
"mean_token_accuracy": 0.9740462601184845,
"num_tokens": 10327620.0,
"step": 1183
},
{
"entropy": 0.8690580427646637,
"epoch": 3.796147672552167,
"grad_norm": 3.2047064304351807,
"learning_rate": 1.5135159945300232e-06,
"loss": 0.0341,
"mean_token_accuracy": 0.9891680479049683,
"num_tokens": 10335916.0,
"step": 1184
},
{
"entropy": 1.0153252184391022,
"epoch": 3.799357945425361,
"grad_norm": 1.8621257543563843,
"learning_rate": 1.5059265373272574e-06,
"loss": 0.0293,
"mean_token_accuracy": 0.989485502243042,
"num_tokens": 10344614.0,
"step": 1185
},
{
"entropy": 1.0363346338272095,
"epoch": 3.802568218298555,
"grad_norm": 1.3696978092193604,
"learning_rate": 1.4983527814098736e-06,
"loss": 0.0344,
"mean_token_accuracy": 0.9842495024204254,
"num_tokens": 10353085.0,
"step": 1186
},
{
"entropy": 0.9636200070381165,
"epoch": 3.8057784911717496,
"grad_norm": 3.6484992504119873,
"learning_rate": 1.4907947608119866e-06,
"loss": 0.0569,
"mean_token_accuracy": 0.9779880046844482,
"num_tokens": 10361502.0,
"step": 1187
},
{
"entropy": 1.1602963209152222,
"epoch": 3.808988764044944,
"grad_norm": 1.8796751499176025,
"learning_rate": 1.4832525094970007e-06,
"loss": 0.051,
"mean_token_accuracy": 0.9757754504680634,
"num_tokens": 10371614.0,
"step": 1188
},
{
"entropy": 1.043602168560028,
"epoch": 3.812199036918138,
"grad_norm": 1.5110524892807007,
"learning_rate": 1.475726061357463e-06,
"loss": 0.0361,
"mean_token_accuracy": 0.9858299195766449,
"num_tokens": 10380855.0,
"step": 1189
},
{
"entropy": 0.9013993740081787,
"epoch": 3.8154093097913324,
"grad_norm": 1.8058748245239258,
"learning_rate": 1.4682154502149025e-06,
"loss": 0.0485,
"mean_token_accuracy": 0.9798838496208191,
"num_tokens": 10390505.0,
"step": 1190
},
{
"entropy": 0.8585264086723328,
"epoch": 3.8186195826645264,
"grad_norm": 1.8328495025634766,
"learning_rate": 1.4607207098196851e-06,
"loss": 0.0622,
"mean_token_accuracy": 0.9619594812393188,
"num_tokens": 10399325.0,
"step": 1191
},
{
"entropy": 0.8735026121139526,
"epoch": 3.821829855537721,
"grad_norm": 1.9175422191619873,
"learning_rate": 1.4532418738508525e-06,
"loss": 0.0476,
"mean_token_accuracy": 0.9814468622207642,
"num_tokens": 10406972.0,
"step": 1192
},
{
"entropy": 0.8725822865962982,
"epoch": 3.825040128410915,
"grad_norm": 2.073087453842163,
"learning_rate": 1.4457789759159813e-06,
"loss": 0.0401,
"mean_token_accuracy": 0.9837340116500854,
"num_tokens": 10416155.0,
"step": 1193
},
{
"entropy": 0.7589974999427795,
"epoch": 3.828250401284109,
"grad_norm": 2.09479022026062,
"learning_rate": 1.4383320495510267e-06,
"loss": 0.0518,
"mean_token_accuracy": 0.9773995578289032,
"num_tokens": 10424983.0,
"step": 1194
},
{
"entropy": 0.7707741558551788,
"epoch": 3.831460674157303,
"grad_norm": 1.806342601776123,
"learning_rate": 1.430901128220174e-06,
"loss": 0.0442,
"mean_token_accuracy": 0.983624279499054,
"num_tokens": 10432712.0,
"step": 1195
},
{
"entropy": 0.9553577899932861,
"epoch": 3.8346709470304976,
"grad_norm": 2.0812230110168457,
"learning_rate": 1.4234862453156839e-06,
"loss": 0.0388,
"mean_token_accuracy": 0.9831459522247314,
"num_tokens": 10440865.0,
"step": 1196
},
{
"entropy": 0.987030029296875,
"epoch": 3.837881219903692,
"grad_norm": 1.751489520072937,
"learning_rate": 1.4160874341577447e-06,
"loss": 0.0409,
"mean_token_accuracy": 0.985140860080719,
"num_tokens": 10448801.0,
"step": 1197
},
{
"entropy": 0.785410076379776,
"epoch": 3.841091492776886,
"grad_norm": 2.0246682167053223,
"learning_rate": 1.4087047279943267e-06,
"loss": 0.051,
"mean_token_accuracy": 0.9750083088874817,
"num_tokens": 10457486.0,
"step": 1198
},
{
"entropy": 0.7790045142173767,
"epoch": 3.8443017656500804,
"grad_norm": 1.6499295234680176,
"learning_rate": 1.4013381600010278e-06,
"loss": 0.0457,
"mean_token_accuracy": 0.9819171726703644,
"num_tokens": 10466429.0,
"step": 1199
},
{
"entropy": 1.0195372998714447,
"epoch": 3.8475120385232744,
"grad_norm": 1.9666370153427124,
"learning_rate": 1.3939877632809279e-06,
"loss": 0.0449,
"mean_token_accuracy": 0.9832908809185028,
"num_tokens": 10474853.0,
"step": 1200
},
{
"entropy": 0.8610271215438843,
"epoch": 3.850722311396469,
"grad_norm": 1.5382120609283447,
"learning_rate": 1.3866535708644335e-06,
"loss": 0.0483,
"mean_token_accuracy": 0.983114629983902,
"num_tokens": 10484253.0,
"step": 1201
},
{
"entropy": 0.8548697829246521,
"epoch": 3.853932584269663,
"grad_norm": 2.9462549686431885,
"learning_rate": 1.3793356157091387e-06,
"loss": 0.0563,
"mean_token_accuracy": 0.9759114682674408,
"num_tokens": 10493150.0,
"step": 1202
},
{
"entropy": 0.981879711151123,
"epoch": 3.857142857142857,
"grad_norm": 2.5988550186157227,
"learning_rate": 1.3720339306996666e-06,
"loss": 0.0318,
"mean_token_accuracy": 0.9906176030635834,
"num_tokens": 10501519.0,
"step": 1203
},
{
"entropy": 0.8637133836746216,
"epoch": 3.860353130016051,
"grad_norm": 1.6211717128753662,
"learning_rate": 1.3647485486475376e-06,
"loss": 0.0371,
"mean_token_accuracy": 0.98866006731987,
"num_tokens": 10509870.0,
"step": 1204
},
{
"entropy": 1.0288158059120178,
"epoch": 3.8635634028892456,
"grad_norm": 1.9696396589279175,
"learning_rate": 1.3574795022910014e-06,
"loss": 0.0465,
"mean_token_accuracy": 0.9840021729469299,
"num_tokens": 10518929.0,
"step": 1205
},
{
"entropy": 0.8618482649326324,
"epoch": 3.86677367576244,
"grad_norm": 2.089820384979248,
"learning_rate": 1.3502268242949025e-06,
"loss": 0.0506,
"mean_token_accuracy": 0.9794828593730927,
"num_tokens": 10527460.0,
"step": 1206
},
{
"entropy": 0.8384652733802795,
"epoch": 3.869983948635634,
"grad_norm": 1.6938061714172363,
"learning_rate": 1.3429905472505344e-06,
"loss": 0.0441,
"mean_token_accuracy": 0.9818322360515594,
"num_tokens": 10535140.0,
"step": 1207
},
{
"entropy": 0.9508244693279266,
"epoch": 3.8731942215088284,
"grad_norm": 2.3421173095703125,
"learning_rate": 1.3357707036754875e-06,
"loss": 0.0552,
"mean_token_accuracy": 0.9806767404079437,
"num_tokens": 10543385.0,
"step": 1208
},
{
"entropy": 0.9171972870826721,
"epoch": 3.8764044943820224,
"grad_norm": 2.3140108585357666,
"learning_rate": 1.3285673260135073e-06,
"loss": 0.0369,
"mean_token_accuracy": 0.9835132956504822,
"num_tokens": 10551332.0,
"step": 1209
},
{
"entropy": 0.9664874970912933,
"epoch": 3.879614767255217,
"grad_norm": 1.4102840423583984,
"learning_rate": 1.321380446634342e-06,
"loss": 0.0299,
"mean_token_accuracy": 0.9883507788181305,
"num_tokens": 10558804.0,
"step": 1210
},
{
"entropy": 0.8216537237167358,
"epoch": 3.882825040128411,
"grad_norm": 6.906599998474121,
"learning_rate": 1.314210097833607e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9810819029808044,
"num_tokens": 10568053.0,
"step": 1211
},
{
"entropy": 0.8050373494625092,
"epoch": 3.886035313001605,
"grad_norm": 5.290853023529053,
"learning_rate": 1.3070563118326295e-06,
"loss": 0.0648,
"mean_token_accuracy": 0.9657208025455475,
"num_tokens": 10576876.0,
"step": 1212
},
{
"entropy": 0.8461142778396606,
"epoch": 3.889245585874799,
"grad_norm": 2.1909284591674805,
"learning_rate": 1.2999191207783129e-06,
"loss": 0.0387,
"mean_token_accuracy": 0.9846838414669037,
"num_tokens": 10585747.0,
"step": 1213
},
{
"entropy": 0.862621545791626,
"epoch": 3.8924558587479936,
"grad_norm": 7.942407608032227,
"learning_rate": 1.2927985567429868e-06,
"loss": 0.0469,
"mean_token_accuracy": 0.9817521870136261,
"num_tokens": 10593612.0,
"step": 1214
},
{
"entropy": 1.056757628917694,
"epoch": 3.895666131621188,
"grad_norm": 1.4919307231903076,
"learning_rate": 1.2856946517242608e-06,
"loss": 0.0428,
"mean_token_accuracy": 0.9853585362434387,
"num_tokens": 10602304.0,
"step": 1215
},
{
"entropy": 0.9187009334564209,
"epoch": 3.898876404494382,
"grad_norm": 1.4443905353546143,
"learning_rate": 1.27860743764489e-06,
"loss": 0.0582,
"mean_token_accuracy": 0.9673136472702026,
"num_tokens": 10611835.0,
"step": 1216
},
{
"entropy": 0.8659194707870483,
"epoch": 3.902086677367576,
"grad_norm": 2.3289132118225098,
"learning_rate": 1.2715369463526173e-06,
"loss": 0.0545,
"mean_token_accuracy": 0.9800289571285248,
"num_tokens": 10620313.0,
"step": 1217
},
{
"entropy": 0.8493823409080505,
"epoch": 3.9052969502407704,
"grad_norm": 2.543363571166992,
"learning_rate": 1.2644832096200498e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9816104173660278,
"num_tokens": 10628548.0,
"step": 1218
},
{
"entropy": 0.8991810083389282,
"epoch": 3.908507223113965,
"grad_norm": 6.281195163726807,
"learning_rate": 1.257446259144494e-06,
"loss": 0.053,
"mean_token_accuracy": 0.9824222326278687,
"num_tokens": 10636626.0,
"step": 1219
},
{
"entropy": 0.9110390543937683,
"epoch": 3.911717495987159,
"grad_norm": 3.237565040588379,
"learning_rate": 1.2504261265478324e-06,
"loss": 0.0467,
"mean_token_accuracy": 0.9759093225002289,
"num_tokens": 10645242.0,
"step": 1220
},
{
"entropy": 0.9176389575004578,
"epoch": 3.914927768860353,
"grad_norm": 1.461086392402649,
"learning_rate": 1.2434228433763657e-06,
"loss": 0.0472,
"mean_token_accuracy": 0.9800988137722015,
"num_tokens": 10654835.0,
"step": 1221
},
{
"entropy": 0.938913106918335,
"epoch": 3.918138041733547,
"grad_norm": 1.900651454925537,
"learning_rate": 1.2364364411006841e-06,
"loss": 0.05,
"mean_token_accuracy": 0.9824710190296173,
"num_tokens": 10663724.0,
"step": 1222
},
{
"entropy": 0.9196376204490662,
"epoch": 3.9213483146067416,
"grad_norm": 1.5418288707733154,
"learning_rate": 1.2294669511155193e-06,
"loss": 0.0393,
"mean_token_accuracy": 0.9803338348865509,
"num_tokens": 10673446.0,
"step": 1223
},
{
"entropy": 0.8525184988975525,
"epoch": 3.924558587479936,
"grad_norm": 2.4837660789489746,
"learning_rate": 1.2225144047396015e-06,
"loss": 0.0702,
"mean_token_accuracy": 0.9695371985435486,
"num_tokens": 10681905.0,
"step": 1224
},
{
"entropy": 0.9096053838729858,
"epoch": 3.92776886035313,
"grad_norm": 27.353504180908203,
"learning_rate": 1.215578833215526e-06,
"loss": 0.0619,
"mean_token_accuracy": 0.9736766219139099,
"num_tokens": 10690825.0,
"step": 1225
},
{
"entropy": 0.8512005805969238,
"epoch": 3.930979133226324,
"grad_norm": 1.7140419483184814,
"learning_rate": 1.2086602677096033e-06,
"loss": 0.0635,
"mean_token_accuracy": 0.9623365104198456,
"num_tokens": 10699282.0,
"step": 1226
},
{
"entropy": 0.9099750518798828,
"epoch": 3.9341894060995184,
"grad_norm": 1.4028735160827637,
"learning_rate": 1.201758739311728e-06,
"loss": 0.0346,
"mean_token_accuracy": 0.9866108298301697,
"num_tokens": 10707893.0,
"step": 1227
},
{
"entropy": 0.8565336465835571,
"epoch": 3.937399678972713,
"grad_norm": 1.4634716510772705,
"learning_rate": 1.1948742790352342e-06,
"loss": 0.044,
"mean_token_accuracy": 0.9821693003177643,
"num_tokens": 10717249.0,
"step": 1228
},
{
"entropy": 0.9397711753845215,
"epoch": 3.940609951845907,
"grad_norm": 1.8528227806091309,
"learning_rate": 1.1880069178167586e-06,
"loss": 0.0764,
"mean_token_accuracy": 0.9538247883319855,
"num_tokens": 10727017.0,
"step": 1229
},
{
"entropy": 0.8851629793643951,
"epoch": 3.943820224719101,
"grad_norm": 1.2845957279205322,
"learning_rate": 1.1811566865160961e-06,
"loss": 0.034,
"mean_token_accuracy": 0.982793927192688,
"num_tokens": 10734428.0,
"step": 1230
},
{
"entropy": 0.8674412071704865,
"epoch": 3.947030497592295,
"grad_norm": 4.364323139190674,
"learning_rate": 1.1743236159160654e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9849294424057007,
"num_tokens": 10742510.0,
"step": 1231
},
{
"entropy": 0.9346510767936707,
"epoch": 3.9502407704654896,
"grad_norm": 1.6875059604644775,
"learning_rate": 1.167507736722377e-06,
"loss": 0.0374,
"mean_token_accuracy": 0.9862508773803711,
"num_tokens": 10750667.0,
"step": 1232
},
{
"entropy": 0.9485654532909393,
"epoch": 3.953451043338684,
"grad_norm": 1.916317105293274,
"learning_rate": 1.1607090795634802e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9837748110294342,
"num_tokens": 10759026.0,
"step": 1233
},
{
"entropy": 0.8760998845100403,
"epoch": 3.956661316211878,
"grad_norm": 1.6034512519836426,
"learning_rate": 1.15392767499044e-06,
"loss": 0.0598,
"mean_token_accuracy": 0.9702793061733246,
"num_tokens": 10769283.0,
"step": 1234
},
{
"entropy": 0.9222274422645569,
"epoch": 3.959871589085072,
"grad_norm": 1.5884339809417725,
"learning_rate": 1.1471635534767877e-06,
"loss": 0.0343,
"mean_token_accuracy": 0.9872113466262817,
"num_tokens": 10777409.0,
"step": 1235
},
{
"entropy": 1.0063312649726868,
"epoch": 3.9630818619582664,
"grad_norm": 1.3955051898956299,
"learning_rate": 1.1404167454183957e-06,
"loss": 0.0289,
"mean_token_accuracy": 0.9890806972980499,
"num_tokens": 10784631.0,
"step": 1236
},
{
"entropy": 1.01164972782135,
"epoch": 3.966292134831461,
"grad_norm": 1.528063416481018,
"learning_rate": 1.133687281133331e-06,
"loss": 0.0481,
"mean_token_accuracy": 0.9762919843196869,
"num_tokens": 10793409.0,
"step": 1237
},
{
"entropy": 1.0118040144443512,
"epoch": 3.969502407704655,
"grad_norm": 1.9172968864440918,
"learning_rate": 1.1269751908617277e-06,
"loss": 0.057,
"mean_token_accuracy": 0.9794862866401672,
"num_tokens": 10801911.0,
"step": 1238
},
{
"entropy": 0.959458976984024,
"epoch": 3.972712680577849,
"grad_norm": 1.7244352102279663,
"learning_rate": 1.1202805047656406e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9820380210876465,
"num_tokens": 10810254.0,
"step": 1239
},
{
"entropy": 1.0892210602760315,
"epoch": 3.975922953451043,
"grad_norm": 1.4209359884262085,
"learning_rate": 1.113603252928917e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9800035953521729,
"num_tokens": 10819473.0,
"step": 1240
},
{
"entropy": 0.8789343237876892,
"epoch": 3.9791332263242376,
"grad_norm": 3.5406527519226074,
"learning_rate": 1.1069434653570633e-06,
"loss": 0.0439,
"mean_token_accuracy": 0.9809198379516602,
"num_tokens": 10827936.0,
"step": 1241
},
{
"entropy": 0.9018587470054626,
"epoch": 3.982343499197432,
"grad_norm": 3.4543299674987793,
"learning_rate": 1.1003011719771046e-06,
"loss": 0.0404,
"mean_token_accuracy": 0.987114280462265,
"num_tokens": 10835439.0,
"step": 1242
},
{
"entropy": 0.952752411365509,
"epoch": 3.985553772070626,
"grad_norm": 1.4983313083648682,
"learning_rate": 1.0936764026374547e-06,
"loss": 0.0359,
"mean_token_accuracy": 0.9853008389472961,
"num_tokens": 10843254.0,
"step": 1243
},
{
"entropy": 0.901978075504303,
"epoch": 3.98876404494382,
"grad_norm": 1.8080071210861206,
"learning_rate": 1.0870691871077738e-06,
"loss": 0.0571,
"mean_token_accuracy": 0.9717438220977783,
"num_tokens": 10852491.0,
"step": 1244
},
{
"entropy": 0.9695694446563721,
"epoch": 3.9919743178170144,
"grad_norm": 1.5428680181503296,
"learning_rate": 1.0804795550788473e-06,
"loss": 0.059,
"mean_token_accuracy": 0.9658881723880768,
"num_tokens": 10861299.0,
"step": 1245
},
{
"entropy": 0.8626547455787659,
"epoch": 3.995184590690209,
"grad_norm": 1.5923748016357422,
"learning_rate": 1.073907536162443e-06,
"loss": 0.0464,
"mean_token_accuracy": 0.9818092882633209,
"num_tokens": 10870030.0,
"step": 1246
},
{
"entropy": 0.9375744163990021,
"epoch": 3.998394863563403,
"grad_norm": 2.5676188468933105,
"learning_rate": 1.0673531598911824e-06,
"loss": 0.058,
"mean_token_accuracy": 0.966584324836731,
"num_tokens": 10880729.0,
"step": 1247
},
{
"entropy": 0.7778120040893555,
"epoch": 4.0,
"grad_norm": 2.0153770446777344,
"learning_rate": 1.0608164557184042e-06,
"loss": 0.0344,
"mean_token_accuracy": 0.9873684048652649,
"num_tokens": 10884568.0,
"step": 1248
},
{
"entropy": 0.8640480041503906,
"epoch": 4.003210272873194,
"grad_norm": 0.6675413250923157,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.0262,
"mean_token_accuracy": 0.9823751747608185,
"num_tokens": 10892829.0,
"step": 1249
},
{
"entropy": 0.9694719612598419,
"epoch": 4.006420545746389,
"grad_norm": 0.7210123538970947,
"learning_rate": 1.0477961810844517e-06,
"loss": 0.0157,
"mean_token_accuracy": 0.9941366910934448,
"num_tokens": 10900762.0,
"step": 1250
},
{
"entropy": 0.9497312605381012,
"epoch": 4.009630818619582,
"grad_norm": 1.7998807430267334,
"learning_rate": 1.0413126691323667e-06,
"loss": 0.0177,
"mean_token_accuracy": 0.9944363832473755,
"num_tokens": 10908632.0,
"step": 1251
},
{
"entropy": 1.0852590799331665,
"epoch": 4.012841091492777,
"grad_norm": 0.7353365421295166,
"learning_rate": 1.0348469462966753e-06,
"loss": 0.0162,
"mean_token_accuracy": 0.9962048232555389,
"num_tokens": 10918033.0,
"step": 1252
},
{
"entropy": 1.0034122467041016,
"epoch": 4.016051364365971,
"grad_norm": 0.7921286821365356,
"learning_rate": 1.0283990416323336e-06,
"loss": 0.0163,
"mean_token_accuracy": 0.995498538017273,
"num_tokens": 10926551.0,
"step": 1253
},
{
"entropy": 0.8701649308204651,
"epoch": 4.019261637239166,
"grad_norm": 0.7349697947502136,
"learning_rate": 1.0219689841142343e-06,
"loss": 0.0274,
"mean_token_accuracy": 0.9848228096961975,
"num_tokens": 10936080.0,
"step": 1254
},
{
"entropy": 0.7810315489768982,
"epoch": 4.022471910112359,
"grad_norm": 0.7971243262290955,
"learning_rate": 1.0155568026370637e-06,
"loss": 0.0309,
"mean_token_accuracy": 0.9813618063926697,
"num_tokens": 10946666.0,
"step": 1255
},
{
"entropy": 0.8642408847808838,
"epoch": 4.025682182985554,
"grad_norm": 1.208862543106079,
"learning_rate": 1.0091625260151827e-06,
"loss": 0.0209,
"mean_token_accuracy": 0.9927391707897186,
"num_tokens": 10954165.0,
"step": 1256
},
{
"entropy": 0.9416361153125763,
"epoch": 4.028892455858748,
"grad_norm": 0.9059441089630127,
"learning_rate": 1.0027861829824953e-06,
"loss": 0.026,
"mean_token_accuracy": 0.9885562062263489,
"num_tokens": 10963217.0,
"step": 1257
},
{
"entropy": 0.832787424325943,
"epoch": 4.032102728731942,
"grad_norm": 1.0136638879776,
"learning_rate": 9.964278021923107e-07,
"loss": 0.0269,
"mean_token_accuracy": 0.9904702007770538,
"num_tokens": 10971882.0,
"step": 1258
},
{
"entropy": 0.8856756091117859,
"epoch": 4.035313001605137,
"grad_norm": 0.8064892292022705,
"learning_rate": 9.900874122172294e-07,
"loss": 0.019,
"mean_token_accuracy": 0.9927627444267273,
"num_tokens": 10980909.0,
"step": 1259
},
{
"entropy": 0.827369898557663,
"epoch": 4.03852327447833,
"grad_norm": 0.593895673751831,
"learning_rate": 9.83765041548998e-07,
"loss": 0.0137,
"mean_token_accuracy": 0.9960170388221741,
"num_tokens": 10989655.0,
"step": 1260
},
{
"entropy": 0.9583877325057983,
"epoch": 4.041733547351525,
"grad_norm": 1.0141297578811646,
"learning_rate": 9.774607185984004e-07,
"loss": 0.0169,
"mean_token_accuracy": 0.9941740334033966,
"num_tokens": 10998018.0,
"step": 1261
},
{
"entropy": 0.8924273252487183,
"epoch": 4.044943820224719,
"grad_norm": 0.8163235187530518,
"learning_rate": 9.711744716951093e-07,
"loss": 0.016,
"mean_token_accuracy": 0.9947156310081482,
"num_tokens": 11006756.0,
"step": 1262
},
{
"entropy": 0.8327958583831787,
"epoch": 4.048154093097914,
"grad_norm": 0.8127865791320801,
"learning_rate": 9.649063290875771e-07,
"loss": 0.016,
"mean_token_accuracy": 0.994948148727417,
"num_tokens": 11014455.0,
"step": 1263
},
{
"entropy": 0.8657267093658447,
"epoch": 4.051364365971107,
"grad_norm": 0.9289994835853577,
"learning_rate": 9.586563189428954e-07,
"loss": 0.0159,
"mean_token_accuracy": 0.9955416023731232,
"num_tokens": 11022936.0,
"step": 1264
},
{
"entropy": 0.8278777003288269,
"epoch": 4.054574638844302,
"grad_norm": 1.0349270105361938,
"learning_rate": 9.524244693466773e-07,
"loss": 0.0174,
"mean_token_accuracy": 0.9945272207260132,
"num_tokens": 11030932.0,
"step": 1265
},
{
"entropy": 0.7735994756221771,
"epoch": 4.057784911717496,
"grad_norm": 0.7948999404907227,
"learning_rate": 9.462108083029287e-07,
"loss": 0.0147,
"mean_token_accuracy": 0.9946495592594147,
"num_tokens": 11038898.0,
"step": 1266
},
{
"entropy": 0.9241797029972076,
"epoch": 4.06099518459069,
"grad_norm": 1.0356444120407104,
"learning_rate": 9.400153637339182e-07,
"loss": 0.0149,
"mean_token_accuracy": 0.9944348335266113,
"num_tokens": 11046059.0,
"step": 1267
},
{
"entropy": 0.9038277566432953,
"epoch": 4.064205457463885,
"grad_norm": 0.8357171416282654,
"learning_rate": 9.338381634800597e-07,
"loss": 0.0144,
"mean_token_accuracy": 0.9948267638683319,
"num_tokens": 11053771.0,
"step": 1268
},
{
"entropy": 0.7923440039157867,
"epoch": 4.067415730337078,
"grad_norm": 1.267638087272644,
"learning_rate": 9.276792352997782e-07,
"loss": 0.0257,
"mean_token_accuracy": 0.9828329980373383,
"num_tokens": 11062458.0,
"step": 1269
},
{
"entropy": 0.845217764377594,
"epoch": 4.070626003210273,
"grad_norm": 1.1078312397003174,
"learning_rate": 9.215386068693927e-07,
"loss": 0.0195,
"mean_token_accuracy": 0.9937855005264282,
"num_tokens": 11070919.0,
"step": 1270
},
{
"entropy": 0.9916780292987823,
"epoch": 4.073836276083467,
"grad_norm": 0.9299560785293579,
"learning_rate": 9.154163057829879e-07,
"loss": 0.0171,
"mean_token_accuracy": 0.993675172328949,
"num_tokens": 11079381.0,
"step": 1271
},
{
"entropy": 0.8888976275920868,
"epoch": 4.077046548956662,
"grad_norm": 0.7998571991920471,
"learning_rate": 9.093123595522929e-07,
"loss": 0.0176,
"mean_token_accuracy": 0.993833065032959,
"num_tokens": 11087689.0,
"step": 1272
},
{
"entropy": 0.9080740213394165,
"epoch": 4.080256821829855,
"grad_norm": 1.1759231090545654,
"learning_rate": 9.032267956065516e-07,
"loss": 0.0266,
"mean_token_accuracy": 0.9817738234996796,
"num_tokens": 11097553.0,
"step": 1273
},
{
"entropy": 0.8172707259654999,
"epoch": 4.08346709470305,
"grad_norm": 7.202850341796875,
"learning_rate": 8.971596412924067e-07,
"loss": 0.016,
"mean_token_accuracy": 0.9940112233161926,
"num_tokens": 11105794.0,
"step": 1274
},
{
"entropy": 0.8537603318691254,
"epoch": 4.086677367576244,
"grad_norm": 0.9416528940200806,
"learning_rate": 8.911109238737748e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9931537806987762,
"num_tokens": 11113728.0,
"step": 1275
},
{
"entropy": 0.8437024652957916,
"epoch": 4.089887640449438,
"grad_norm": 8.162856101989746,
"learning_rate": 8.850806705317183e-07,
"loss": 0.0119,
"mean_token_accuracy": 0.9965576529502869,
"num_tokens": 11121815.0,
"step": 1276
},
{
"entropy": 0.769570529460907,
"epoch": 4.093097913322633,
"grad_norm": 1.77950119972229,
"learning_rate": 8.790689083643328e-07,
"loss": 0.0259,
"mean_token_accuracy": 0.9933834969997406,
"num_tokens": 11130079.0,
"step": 1277
},
{
"entropy": 0.8534726202487946,
"epoch": 4.096308186195826,
"grad_norm": 3.9178075790405273,
"learning_rate": 8.730756643866157e-07,
"loss": 0.0123,
"mean_token_accuracy": 0.9955320656299591,
"num_tokens": 11138390.0,
"step": 1278
},
{
"entropy": 0.78662109375,
"epoch": 4.099518459069021,
"grad_norm": 0.9845862984657288,
"learning_rate": 8.671009655303531e-07,
"loss": 0.0234,
"mean_token_accuracy": 0.9906353950500488,
"num_tokens": 11147129.0,
"step": 1279
},
{
"entropy": 0.789972335100174,
"epoch": 4.102728731942215,
"grad_norm": 0.6823396682739258,
"learning_rate": 8.611448386439936e-07,
"loss": 0.0123,
"mean_token_accuracy": 0.995839536190033,
"num_tokens": 11154604.0,
"step": 1280
},
{
"entropy": 0.7805255651473999,
"epoch": 4.10593900481541,
"grad_norm": 1.8053884506225586,
"learning_rate": 8.552073104925296e-07,
"loss": 0.0394,
"mean_token_accuracy": 0.9804310500621796,
"num_tokens": 11165116.0,
"step": 1281
},
{
"entropy": 0.8384647369384766,
"epoch": 4.109149277688603,
"grad_norm": 1.5999270677566528,
"learning_rate": 8.492884077573749e-07,
"loss": 0.0133,
"mean_token_accuracy": 0.9951248466968536,
"num_tokens": 11173466.0,
"step": 1282
},
{
"entropy": 0.841315507888794,
"epoch": 4.112359550561798,
"grad_norm": 1.1852508783340454,
"learning_rate": 8.433881570362484e-07,
"loss": 0.0235,
"mean_token_accuracy": 0.9929181337356567,
"num_tokens": 11182017.0,
"step": 1283
},
{
"entropy": 0.8237481415271759,
"epoch": 4.115569823434992,
"grad_norm": 0.8450478911399841,
"learning_rate": 8.375065848430508e-07,
"loss": 0.0137,
"mean_token_accuracy": 0.9960514605045319,
"num_tokens": 11190254.0,
"step": 1284
},
{
"entropy": 0.729523628950119,
"epoch": 4.118780096308186,
"grad_norm": 1.372982382774353,
"learning_rate": 8.316437176077491e-07,
"loss": 0.0199,
"mean_token_accuracy": 0.9932019710540771,
"num_tokens": 11198588.0,
"step": 1285
},
{
"entropy": 0.8260809779167175,
"epoch": 4.121990369181381,
"grad_norm": 0.9410303235054016,
"learning_rate": 8.257995816762559e-07,
"loss": 0.015,
"mean_token_accuracy": 0.9945105910301208,
"num_tokens": 11207145.0,
"step": 1286
},
{
"entropy": 0.7663401365280151,
"epoch": 4.125200642054574,
"grad_norm": 1.1343247890472412,
"learning_rate": 8.199742033103091e-07,
"loss": 0.0196,
"mean_token_accuracy": 0.9935542345046997,
"num_tokens": 11216391.0,
"step": 1287
},
{
"entropy": 0.7641996741294861,
"epoch": 4.128410914927769,
"grad_norm": 0.8208123445510864,
"learning_rate": 8.141676086873574e-07,
"loss": 0.016,
"mean_token_accuracy": 0.9957410991191864,
"num_tokens": 11224925.0,
"step": 1288
},
{
"entropy": 0.7193006873130798,
"epoch": 4.131621187800963,
"grad_norm": 1.0635079145431519,
"learning_rate": 8.083798239004408e-07,
"loss": 0.0428,
"mean_token_accuracy": 0.9678496420383453,
"num_tokens": 11235004.0,
"step": 1289
},
{
"entropy": 0.8392094969749451,
"epoch": 4.134831460674158,
"grad_norm": 1.0508886575698853,
"learning_rate": 8.026108749580758e-07,
"loss": 0.0204,
"mean_token_accuracy": 0.9942511916160583,
"num_tokens": 11245098.0,
"step": 1290
},
{
"entropy": 0.8576850295066833,
"epoch": 4.138041733547351,
"grad_norm": 1.0217255353927612,
"learning_rate": 7.968607877841333e-07,
"loss": 0.0132,
"mean_token_accuracy": 0.9947031438350677,
"num_tokens": 11254398.0,
"step": 1291
},
{
"entropy": 0.8804000616073608,
"epoch": 4.141252006420546,
"grad_norm": 2.1171441078186035,
"learning_rate": 7.911295882177256e-07,
"loss": 0.0128,
"mean_token_accuracy": 0.9961491823196411,
"num_tokens": 11262182.0,
"step": 1292
},
{
"entropy": 0.857866108417511,
"epoch": 4.14446227929374,
"grad_norm": 0.8988444209098816,
"learning_rate": 7.854173020130906e-07,
"loss": 0.0273,
"mean_token_accuracy": 0.9844126403331757,
"num_tokens": 11271620.0,
"step": 1293
},
{
"entropy": 0.9568844437599182,
"epoch": 4.147672552166934,
"grad_norm": 0.6712120771408081,
"learning_rate": 7.79723954839477e-07,
"loss": 0.0107,
"mean_token_accuracy": 0.9966877102851868,
"num_tokens": 11279381.0,
"step": 1294
},
{
"entropy": 0.7858869731426239,
"epoch": 4.150882825040128,
"grad_norm": 1.349931001663208,
"learning_rate": 7.740495722810271e-07,
"loss": 0.027,
"mean_token_accuracy": 0.9832697808742523,
"num_tokens": 11289271.0,
"step": 1295
},
{
"entropy": 0.8988691568374634,
"epoch": 4.154093097913322,
"grad_norm": 0.7759506106376648,
"learning_rate": 7.683941798366578e-07,
"loss": 0.0117,
"mean_token_accuracy": 0.9971850514411926,
"num_tokens": 11297331.0,
"step": 1296
},
{
"entropy": 0.8500702083110809,
"epoch": 4.157303370786517,
"grad_norm": 1.1883063316345215,
"learning_rate": 7.627578029199562e-07,
"loss": 0.0257,
"mean_token_accuracy": 0.9848445653915405,
"num_tokens": 11307060.0,
"step": 1297
},
{
"entropy": 0.8567503988742828,
"epoch": 4.160513643659711,
"grad_norm": 1.2109795808792114,
"learning_rate": 7.571404668590532e-07,
"loss": 0.0332,
"mean_token_accuracy": 0.9718556702136993,
"num_tokens": 11316494.0,
"step": 1298
},
{
"entropy": 0.853458046913147,
"epoch": 4.163723916532906,
"grad_norm": 0.9728054404258728,
"learning_rate": 7.515421968965242e-07,
"loss": 0.0186,
"mean_token_accuracy": 0.991171270608902,
"num_tokens": 11324981.0,
"step": 1299
},
{
"entropy": 0.9854492545127869,
"epoch": 4.166934189406099,
"grad_norm": 0.9858706593513489,
"learning_rate": 7.459630181892608e-07,
"loss": 0.0183,
"mean_token_accuracy": 0.9902990758419037,
"num_tokens": 11334714.0,
"step": 1300
},
{
"entropy": 0.8187530934810638,
"epoch": 4.170144462279294,
"grad_norm": 1.1075387001037598,
"learning_rate": 7.404029558083653e-07,
"loss": 0.0249,
"mean_token_accuracy": 0.9834834933280945,
"num_tokens": 11343590.0,
"step": 1301
},
{
"entropy": 0.7375591695308685,
"epoch": 4.173354735152488,
"grad_norm": 1.350464940071106,
"learning_rate": 7.348620347390384e-07,
"loss": 0.0206,
"mean_token_accuracy": 0.9900037944316864,
"num_tokens": 11353244.0,
"step": 1302
},
{
"entropy": 0.9315820038318634,
"epoch": 4.176565008025682,
"grad_norm": 1.5498111248016357,
"learning_rate": 7.293402798804667e-07,
"loss": 0.0159,
"mean_token_accuracy": 0.9929503202438354,
"num_tokens": 11363019.0,
"step": 1303
},
{
"entropy": 0.8681217432022095,
"epoch": 4.179775280898877,
"grad_norm": 0.8983978033065796,
"learning_rate": 7.238377160457094e-07,
"loss": 0.0155,
"mean_token_accuracy": 0.996653288602829,
"num_tokens": 11371486.0,
"step": 1304
},
{
"entropy": 0.8901689052581787,
"epoch": 4.18298555377207,
"grad_norm": 0.8819789290428162,
"learning_rate": 7.183543679615834e-07,
"loss": 0.0149,
"mean_token_accuracy": 0.9940185546875,
"num_tokens": 11379965.0,
"step": 1305
},
{
"entropy": 0.8389869928359985,
"epoch": 4.186195826645265,
"grad_norm": 1.1202179193496704,
"learning_rate": 7.128902602685617e-07,
"loss": 0.0298,
"mean_token_accuracy": 0.9756748974323273,
"num_tokens": 11390299.0,
"step": 1306
},
{
"entropy": 0.849632740020752,
"epoch": 4.189406099518459,
"grad_norm": 1.1636488437652588,
"learning_rate": 7.074454175206524e-07,
"loss": 0.0207,
"mean_token_accuracy": 0.9929619431495667,
"num_tokens": 11398813.0,
"step": 1307
},
{
"entropy": 0.8657627403736115,
"epoch": 4.192616372391654,
"grad_norm": 0.7907083034515381,
"learning_rate": 7.020198641852949e-07,
"loss": 0.0124,
"mean_token_accuracy": 0.9957354366779327,
"num_tokens": 11407073.0,
"step": 1308
},
{
"entropy": 0.890144944190979,
"epoch": 4.195826645264847,
"grad_norm": 0.7237719893455505,
"learning_rate": 6.966136246432492e-07,
"loss": 0.0101,
"mean_token_accuracy": 0.9964252412319183,
"num_tokens": 11416069.0,
"step": 1309
},
{
"entropy": 0.912724643945694,
"epoch": 4.199036918138042,
"grad_norm": 0.8627637028694153,
"learning_rate": 6.912267231884817e-07,
"loss": 0.0187,
"mean_token_accuracy": 0.9939960241317749,
"num_tokens": 11425409.0,
"step": 1310
},
{
"entropy": 0.9045071303844452,
"epoch": 4.202247191011236,
"grad_norm": 0.6950350999832153,
"learning_rate": 6.858591840280627e-07,
"loss": 0.0126,
"mean_token_accuracy": 0.9963579177856445,
"num_tokens": 11433984.0,
"step": 1311
},
{
"entropy": 0.8476213812828064,
"epoch": 4.20545746388443,
"grad_norm": 1.5948781967163086,
"learning_rate": 6.805110312820501e-07,
"loss": 0.0163,
"mean_token_accuracy": 0.9928786754608154,
"num_tokens": 11443158.0,
"step": 1312
},
{
"entropy": 0.9233494997024536,
"epoch": 4.208667736757624,
"grad_norm": 1.3917864561080933,
"learning_rate": 6.751822889833926e-07,
"loss": 0.0191,
"mean_token_accuracy": 0.9936199188232422,
"num_tokens": 11452244.0,
"step": 1313
},
{
"entropy": 0.8223008215427399,
"epoch": 4.211878009630818,
"grad_norm": 0.9445686936378479,
"learning_rate": 6.698729810778065e-07,
"loss": 0.0154,
"mean_token_accuracy": 0.9957300126552582,
"num_tokens": 11459714.0,
"step": 1314
},
{
"entropy": 0.9119621515274048,
"epoch": 4.215088282504013,
"grad_norm": 1.3611754179000854,
"learning_rate": 6.645831314236817e-07,
"loss": 0.0251,
"mean_token_accuracy": 0.9857873618602753,
"num_tokens": 11467671.0,
"step": 1315
},
{
"entropy": 0.8818807601928711,
"epoch": 4.218298555377207,
"grad_norm": 1.3261618614196777,
"learning_rate": 6.593127637919633e-07,
"loss": 0.0194,
"mean_token_accuracy": 0.9928447604179382,
"num_tokens": 11476584.0,
"step": 1316
},
{
"entropy": 0.9259830713272095,
"epoch": 4.221508828250402,
"grad_norm": 1.2929267883300781,
"learning_rate": 6.540619018660555e-07,
"loss": 0.0215,
"mean_token_accuracy": 0.9909241497516632,
"num_tokens": 11485902.0,
"step": 1317
},
{
"entropy": 0.8626327812671661,
"epoch": 4.224719101123595,
"grad_norm": 0.8614633083343506,
"learning_rate": 6.488305692417074e-07,
"loss": 0.015,
"mean_token_accuracy": 0.9946702420711517,
"num_tokens": 11493744.0,
"step": 1318
},
{
"entropy": 0.8261954486370087,
"epoch": 4.22792937399679,
"grad_norm": 0.8554882407188416,
"learning_rate": 6.436187894269086e-07,
"loss": 0.0271,
"mean_token_accuracy": 0.9795649349689484,
"num_tokens": 11502359.0,
"step": 1319
},
{
"entropy": 0.905153900384903,
"epoch": 4.231139646869984,
"grad_norm": 0.7063010931015015,
"learning_rate": 6.384265858417877e-07,
"loss": 0.0263,
"mean_token_accuracy": 0.9765981733798981,
"num_tokens": 11512068.0,
"step": 1320
},
{
"entropy": 0.7651464641094208,
"epoch": 4.234349919743178,
"grad_norm": 2.0079517364501953,
"learning_rate": 6.332539818184985e-07,
"loss": 0.0218,
"mean_token_accuracy": 0.9929872751235962,
"num_tokens": 11520923.0,
"step": 1321
},
{
"entropy": 0.789582222700119,
"epoch": 4.237560192616373,
"grad_norm": 1.0748300552368164,
"learning_rate": 6.281010006011256e-07,
"loss": 0.0247,
"mean_token_accuracy": 0.9863014817237854,
"num_tokens": 11529886.0,
"step": 1322
},
{
"entropy": 0.780224084854126,
"epoch": 4.240770465489566,
"grad_norm": 1.1340675354003906,
"learning_rate": 6.229676653455719e-07,
"loss": 0.0184,
"mean_token_accuracy": 0.9943748712539673,
"num_tokens": 11537909.0,
"step": 1323
},
{
"entropy": 0.8488284349441528,
"epoch": 4.243980738362761,
"grad_norm": 1.945456862449646,
"learning_rate": 6.178539991194599e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9947131276130676,
"num_tokens": 11546221.0,
"step": 1324
},
{
"entropy": 0.7531148791313171,
"epoch": 4.247191011235955,
"grad_norm": 0.9983758330345154,
"learning_rate": 6.127600249020216e-07,
"loss": 0.0153,
"mean_token_accuracy": 0.9957643747329712,
"num_tokens": 11554442.0,
"step": 1325
},
{
"entropy": 0.858670711517334,
"epoch": 4.25040128410915,
"grad_norm": 2.524183988571167,
"learning_rate": 6.076857655840024e-07,
"loss": 0.0215,
"mean_token_accuracy": 0.9911713302135468,
"num_tokens": 11563316.0,
"step": 1326
},
{
"entropy": 0.9219168722629547,
"epoch": 4.253611556982343,
"grad_norm": 0.8876209855079651,
"learning_rate": 6.026312439675553e-07,
"loss": 0.0143,
"mean_token_accuracy": 0.9944645464420319,
"num_tokens": 11571721.0,
"step": 1327
},
{
"entropy": 0.8005822896957397,
"epoch": 4.256821829855538,
"grad_norm": 0.860865592956543,
"learning_rate": 5.975964827661346e-07,
"loss": 0.0297,
"mean_token_accuracy": 0.9751374423503876,
"num_tokens": 11581449.0,
"step": 1328
},
{
"entropy": 0.7818480730056763,
"epoch": 4.260032102728732,
"grad_norm": 4.995467662811279,
"learning_rate": 5.925815046044026e-07,
"loss": 0.0315,
"mean_token_accuracy": 0.9817995131015778,
"num_tokens": 11590914.0,
"step": 1329
},
{
"entropy": 0.781961977481842,
"epoch": 4.263242375601926,
"grad_norm": 1.655155897140503,
"learning_rate": 5.875863320181175e-07,
"loss": 0.0306,
"mean_token_accuracy": 0.9803996980190277,
"num_tokens": 11601404.0,
"step": 1330
},
{
"entropy": 0.9626095294952393,
"epoch": 4.26645264847512,
"grad_norm": 1.3983674049377441,
"learning_rate": 5.826109874540409e-07,
"loss": 0.0179,
"mean_token_accuracy": 0.9947277307510376,
"num_tokens": 11610077.0,
"step": 1331
},
{
"entropy": 0.9014355540275574,
"epoch": 4.269662921348314,
"grad_norm": 2.09114146232605,
"learning_rate": 5.776554932698325e-07,
"loss": 0.0152,
"mean_token_accuracy": 0.9936963021755219,
"num_tokens": 11619265.0,
"step": 1332
},
{
"entropy": 0.8946935832500458,
"epoch": 4.272873194221509,
"grad_norm": 0.841145932674408,
"learning_rate": 5.727198717339511e-07,
"loss": 0.0203,
"mean_token_accuracy": 0.9885965883731842,
"num_tokens": 11628528.0,
"step": 1333
},
{
"entropy": 0.8457264304161072,
"epoch": 4.276083467094703,
"grad_norm": 1.5083551406860352,
"learning_rate": 5.678041450255512e-07,
"loss": 0.016,
"mean_token_accuracy": 0.9952947199344635,
"num_tokens": 11636719.0,
"step": 1334
},
{
"entropy": 0.7869194149971008,
"epoch": 4.279293739967898,
"grad_norm": 1.0477701425552368,
"learning_rate": 5.6290833523439e-07,
"loss": 0.0181,
"mean_token_accuracy": 0.9945695102214813,
"num_tokens": 11645004.0,
"step": 1335
},
{
"entropy": 0.9997333586215973,
"epoch": 4.282504012841091,
"grad_norm": 0.8144447803497314,
"learning_rate": 5.58032464360721e-07,
"loss": 0.0134,
"mean_token_accuracy": 0.9962140023708344,
"num_tokens": 11653557.0,
"step": 1336
},
{
"entropy": 0.8154990077018738,
"epoch": 4.285714285714286,
"grad_norm": 1.150378704071045,
"learning_rate": 5.531765543152002e-07,
"loss": 0.0201,
"mean_token_accuracy": 0.9930236041545868,
"num_tokens": 11662284.0,
"step": 1337
},
{
"entropy": 0.799591451883316,
"epoch": 4.28892455858748,
"grad_norm": 0.8984584808349609,
"learning_rate": 5.483406269187869e-07,
"loss": 0.0145,
"mean_token_accuracy": 0.9946328103542328,
"num_tokens": 11670400.0,
"step": 1338
},
{
"entropy": 0.7851231098175049,
"epoch": 4.292134831460674,
"grad_norm": 1.5604168176651,
"learning_rate": 5.435247039026398e-07,
"loss": 0.0275,
"mean_token_accuracy": 0.9917657375335693,
"num_tokens": 11679997.0,
"step": 1339
},
{
"entropy": 0.941244900226593,
"epoch": 4.295345104333869,
"grad_norm": 2.561316728591919,
"learning_rate": 5.387288069080298e-07,
"loss": 0.0124,
"mean_token_accuracy": 0.9957253336906433,
"num_tokens": 11689727.0,
"step": 1340
},
{
"entropy": 0.7649567723274231,
"epoch": 4.298555377207062,
"grad_norm": 0.8507292866706848,
"learning_rate": 5.33952957486234e-07,
"loss": 0.0158,
"mean_token_accuracy": 0.9926185607910156,
"num_tokens": 11698033.0,
"step": 1341
},
{
"entropy": 0.7722039520740509,
"epoch": 4.301765650080257,
"grad_norm": 0.7529566287994385,
"learning_rate": 5.291971770984428e-07,
"loss": 0.018,
"mean_token_accuracy": 0.9899595379829407,
"num_tokens": 11707024.0,
"step": 1342
},
{
"entropy": 0.8715614974498749,
"epoch": 4.304975922953451,
"grad_norm": 0.7769994139671326,
"learning_rate": 5.244614871156612e-07,
"loss": 0.0128,
"mean_token_accuracy": 0.9952245056629181,
"num_tokens": 11714142.0,
"step": 1343
},
{
"entropy": 0.7915047109127045,
"epoch": 4.308186195826646,
"grad_norm": 1.3179060220718384,
"learning_rate": 5.197459088186163e-07,
"loss": 0.015,
"mean_token_accuracy": 0.9947215914726257,
"num_tokens": 11722750.0,
"step": 1344
},
{
"entropy": 0.8518697619438171,
"epoch": 4.311396468699839,
"grad_norm": 1.0496397018432617,
"learning_rate": 5.150504633976572e-07,
"loss": 0.0274,
"mean_token_accuracy": 0.9796628355979919,
"num_tokens": 11732462.0,
"step": 1345
},
{
"entropy": 0.9217169284820557,
"epoch": 4.314606741573034,
"grad_norm": 1.6036604642868042,
"learning_rate": 5.103751719526639e-07,
"loss": 0.0143,
"mean_token_accuracy": 0.994863748550415,
"num_tokens": 11740413.0,
"step": 1346
},
{
"entropy": 0.8194193542003632,
"epoch": 4.317817014446228,
"grad_norm": 2.075221300125122,
"learning_rate": 5.057200554929509e-07,
"loss": 0.0243,
"mean_token_accuracy": 0.9908171594142914,
"num_tokens": 11749708.0,
"step": 1347
},
{
"entropy": 0.8294661045074463,
"epoch": 4.321027287319422,
"grad_norm": 0.9790740013122559,
"learning_rate": 5.010851349371704e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9910164773464203,
"num_tokens": 11758025.0,
"step": 1348
},
{
"entropy": 0.7852829992771149,
"epoch": 4.324237560192616,
"grad_norm": 1.6573703289031982,
"learning_rate": 4.964704311132224e-07,
"loss": 0.0163,
"mean_token_accuracy": 0.9949574768543243,
"num_tokens": 11765881.0,
"step": 1349
},
{
"entropy": 0.8169362843036652,
"epoch": 4.32744783306581,
"grad_norm": 0.9623476266860962,
"learning_rate": 4.918759647581578e-07,
"loss": 0.0194,
"mean_token_accuracy": 0.9888501465320587,
"num_tokens": 11774323.0,
"step": 1350
},
{
"entropy": 0.8484928011894226,
"epoch": 4.330658105939005,
"grad_norm": 2.397317886352539,
"learning_rate": 4.873017565180871e-07,
"loss": 0.0295,
"mean_token_accuracy": 0.9821926355361938,
"num_tokens": 11783381.0,
"step": 1351
},
{
"entropy": 0.8673693537712097,
"epoch": 4.333868378812199,
"grad_norm": 1.7956311702728271,
"learning_rate": 4.827478269480895e-07,
"loss": 0.0148,
"mean_token_accuracy": 0.994268536567688,
"num_tokens": 11791626.0,
"step": 1352
},
{
"entropy": 0.8526739478111267,
"epoch": 4.337078651685394,
"grad_norm": 1.6738061904907227,
"learning_rate": 4.782141965121129e-07,
"loss": 0.0162,
"mean_token_accuracy": 0.9941445887088776,
"num_tokens": 11800040.0,
"step": 1353
},
{
"entropy": 0.8506563901901245,
"epoch": 4.340288924558587,
"grad_norm": 0.7996606230735779,
"learning_rate": 4.7370088558289175e-07,
"loss": 0.0144,
"mean_token_accuracy": 0.9958418607711792,
"num_tokens": 11809269.0,
"step": 1354
},
{
"entropy": 0.8788428902626038,
"epoch": 4.343499197431782,
"grad_norm": 0.7069100141525269,
"learning_rate": 4.6920791444184934e-07,
"loss": 0.0099,
"mean_token_accuracy": 0.9968018531799316,
"num_tokens": 11817225.0,
"step": 1355
},
{
"entropy": 0.8030937612056732,
"epoch": 4.346709470304976,
"grad_norm": 2.9055588245391846,
"learning_rate": 4.647353032790086e-07,
"loss": 0.0177,
"mean_token_accuracy": 0.9941924810409546,
"num_tokens": 11825674.0,
"step": 1356
},
{
"entropy": 0.9827845692634583,
"epoch": 4.34991974317817,
"grad_norm": 1.107529878616333,
"learning_rate": 4.602830721928997e-07,
"loss": 0.035,
"mean_token_accuracy": 0.9746537506580353,
"num_tokens": 11834822.0,
"step": 1357
},
{
"entropy": 0.9760691821575165,
"epoch": 4.353130016051364,
"grad_norm": 0.5545116662979126,
"learning_rate": 4.558512411904731e-07,
"loss": 0.0106,
"mean_token_accuracy": 0.9977552592754364,
"num_tokens": 11843119.0,
"step": 1358
},
{
"entropy": 0.8231452703475952,
"epoch": 4.356340288924558,
"grad_norm": 1.0874269008636475,
"learning_rate": 4.5143983018700485e-07,
"loss": 0.0256,
"mean_token_accuracy": 0.9907995164394379,
"num_tokens": 11852225.0,
"step": 1359
},
{
"entropy": 0.7386958003044128,
"epoch": 4.359550561797753,
"grad_norm": 1.1277332305908203,
"learning_rate": 4.4704885900601236e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9933885335922241,
"num_tokens": 11860832.0,
"step": 1360
},
{
"entropy": 0.8382771015167236,
"epoch": 4.362760834670947,
"grad_norm": 0.8998692035675049,
"learning_rate": 4.4267834737916295e-07,
"loss": 0.0125,
"mean_token_accuracy": 0.996128261089325,
"num_tokens": 11869460.0,
"step": 1361
},
{
"entropy": 0.9397884905338287,
"epoch": 4.365971107544142,
"grad_norm": 0.9518370628356934,
"learning_rate": 4.3832831494618255e-07,
"loss": 0.0369,
"mean_token_accuracy": 0.9735174477100372,
"num_tokens": 11880031.0,
"step": 1362
},
{
"entropy": 0.7591385245323181,
"epoch": 4.369181380417335,
"grad_norm": 1.1569485664367676,
"learning_rate": 4.33998781254773e-07,
"loss": 0.0183,
"mean_token_accuracy": 0.9944524168968201,
"num_tokens": 11888471.0,
"step": 1363
},
{
"entropy": 0.8310154378414154,
"epoch": 4.37239165329053,
"grad_norm": 1.0223422050476074,
"learning_rate": 4.2968976576051703e-07,
"loss": 0.0136,
"mean_token_accuracy": 0.9942927956581116,
"num_tokens": 11896131.0,
"step": 1364
},
{
"entropy": 0.7848845720291138,
"epoch": 4.375601926163724,
"grad_norm": 0.8517405986785889,
"learning_rate": 4.2540128782679934e-07,
"loss": 0.0177,
"mean_token_accuracy": 0.9930301904678345,
"num_tokens": 11904730.0,
"step": 1365
},
{
"entropy": 0.8888550400733948,
"epoch": 4.378812199036918,
"grad_norm": 1.2937294244766235,
"learning_rate": 4.211333667247125e-07,
"loss": 0.0172,
"mean_token_accuracy": 0.9935269057750702,
"num_tokens": 11913169.0,
"step": 1366
},
{
"entropy": 0.7717286348342896,
"epoch": 4.382022471910112,
"grad_norm": 0.9954974055290222,
"learning_rate": 4.1688602163297564e-07,
"loss": 0.0119,
"mean_token_accuracy": 0.9969731569290161,
"num_tokens": 11922132.0,
"step": 1367
},
{
"entropy": 0.8456141352653503,
"epoch": 4.385232744783306,
"grad_norm": 1.324472427368164,
"learning_rate": 4.126592716378408e-07,
"loss": 0.0102,
"mean_token_accuracy": 0.9973188638687134,
"num_tokens": 11929738.0,
"step": 1368
},
{
"entropy": 0.9674667716026306,
"epoch": 4.388443017656501,
"grad_norm": 0.7544914484024048,
"learning_rate": 4.0845313573301736e-07,
"loss": 0.0134,
"mean_token_accuracy": 0.996004194021225,
"num_tokens": 11938396.0,
"step": 1369
},
{
"entropy": 0.8736143708229065,
"epoch": 4.391653290529695,
"grad_norm": 0.7559876441955566,
"learning_rate": 4.042676328195788e-07,
"loss": 0.0158,
"mean_token_accuracy": 0.9960503578186035,
"num_tokens": 11947418.0,
"step": 1370
},
{
"entropy": 0.8970188498497009,
"epoch": 4.39486356340289,
"grad_norm": 0.8440226316452026,
"learning_rate": 4.001027817058789e-07,
"loss": 0.0153,
"mean_token_accuracy": 0.9943675100803375,
"num_tokens": 11956316.0,
"step": 1371
},
{
"entropy": 0.7634763419628143,
"epoch": 4.398073836276083,
"grad_norm": 1.064709186553955,
"learning_rate": 3.959586011074729e-07,
"loss": 0.0313,
"mean_token_accuracy": 0.9725034534931183,
"num_tokens": 11965704.0,
"step": 1372
},
{
"entropy": 0.8148213326931,
"epoch": 4.401284109149278,
"grad_norm": 1.9659761190414429,
"learning_rate": 3.9183510964702463e-07,
"loss": 0.0152,
"mean_token_accuracy": 0.9943975806236267,
"num_tokens": 11974096.0,
"step": 1373
},
{
"entropy": 0.7761686444282532,
"epoch": 4.404494382022472,
"grad_norm": 1.3696943521499634,
"learning_rate": 3.8773232585422924e-07,
"loss": 0.0131,
"mean_token_accuracy": 0.9964625537395477,
"num_tokens": 11982355.0,
"step": 1374
},
{
"entropy": 0.7501409351825714,
"epoch": 4.407704654895666,
"grad_norm": 0.7303560376167297,
"learning_rate": 3.836502681657289e-07,
"loss": 0.0152,
"mean_token_accuracy": 0.990977019071579,
"num_tokens": 11991634.0,
"step": 1375
},
{
"entropy": 0.8639850616455078,
"epoch": 4.41091492776886,
"grad_norm": 1.1209553480148315,
"learning_rate": 3.795889549250292e-07,
"loss": 0.0138,
"mean_token_accuracy": 0.9946679472923279,
"num_tokens": 12000344.0,
"step": 1376
},
{
"entropy": 0.8161064386367798,
"epoch": 4.414125200642054,
"grad_norm": 2.2693448066711426,
"learning_rate": 3.755484043824131e-07,
"loss": 0.0138,
"mean_token_accuracy": 0.9931889176368713,
"num_tokens": 12009056.0,
"step": 1377
},
{
"entropy": 0.7727282047271729,
"epoch": 4.417335473515249,
"grad_norm": 0.9428284764289856,
"learning_rate": 3.715286346948671e-07,
"loss": 0.0215,
"mean_token_accuracy": 0.9886394441127777,
"num_tokens": 12018012.0,
"step": 1378
},
{
"entropy": 0.6903052926063538,
"epoch": 4.420545746388443,
"grad_norm": 2.0464818477630615,
"learning_rate": 3.675296639259912e-07,
"loss": 0.022,
"mean_token_accuracy": 0.992774486541748,
"num_tokens": 12025935.0,
"step": 1379
},
{
"entropy": 0.9016786813735962,
"epoch": 4.423756019261638,
"grad_norm": 2.821998357772827,
"learning_rate": 3.6355151004592414e-07,
"loss": 0.0207,
"mean_token_accuracy": 0.9895893335342407,
"num_tokens": 12034016.0,
"step": 1380
},
{
"entropy": 0.7792826294898987,
"epoch": 4.426966292134831,
"grad_norm": 0.8225787878036499,
"learning_rate": 3.595941909312595e-07,
"loss": 0.0282,
"mean_token_accuracy": 0.9868153035640717,
"num_tokens": 12042647.0,
"step": 1381
},
{
"entropy": 0.7803854644298553,
"epoch": 4.430176565008026,
"grad_norm": 1.0302590131759644,
"learning_rate": 3.5565772436496336e-07,
"loss": 0.0205,
"mean_token_accuracy": 0.9852829873561859,
"num_tokens": 12051756.0,
"step": 1382
},
{
"entropy": 1.015624314546585,
"epoch": 4.43338683788122,
"grad_norm": 2.2021806240081787,
"learning_rate": 3.517421280363004e-07,
"loss": 0.0162,
"mean_token_accuracy": 0.9948179125785828,
"num_tokens": 12061734.0,
"step": 1383
},
{
"entropy": 0.7805822789669037,
"epoch": 4.436597110754414,
"grad_norm": 0.9023168087005615,
"learning_rate": 3.4784741954074884e-07,
"loss": 0.0197,
"mean_token_accuracy": 0.9920674264431,
"num_tokens": 12071018.0,
"step": 1384
},
{
"entropy": 0.8862617015838623,
"epoch": 4.439807383627608,
"grad_norm": 0.7753137946128845,
"learning_rate": 3.439736163799251e-07,
"loss": 0.0128,
"mean_token_accuracy": 0.9962200224399567,
"num_tokens": 12078519.0,
"step": 1385
},
{
"entropy": 0.7836082875728607,
"epoch": 4.443017656500802,
"grad_norm": 1.1008198261260986,
"learning_rate": 3.4012073596150106e-07,
"loss": 0.0133,
"mean_token_accuracy": 0.9951090216636658,
"num_tokens": 12086876.0,
"step": 1386
},
{
"entropy": 0.7967692613601685,
"epoch": 4.446227929373997,
"grad_norm": 0.8891757130622864,
"learning_rate": 3.362887955991301e-07,
"loss": 0.0219,
"mean_token_accuracy": 0.9891441464424133,
"num_tokens": 12095596.0,
"step": 1387
},
{
"entropy": 0.829216718673706,
"epoch": 4.449438202247191,
"grad_norm": 0.743823230266571,
"learning_rate": 3.3247781251236623e-07,
"loss": 0.0176,
"mean_token_accuracy": 0.9918361604213715,
"num_tokens": 12104120.0,
"step": 1388
},
{
"entropy": 0.8801736235618591,
"epoch": 4.452648475120386,
"grad_norm": 1.7145832777023315,
"learning_rate": 3.2868780382658895e-07,
"loss": 0.0167,
"mean_token_accuracy": 0.9942232072353363,
"num_tokens": 12113979.0,
"step": 1389
},
{
"entropy": 0.7938494682312012,
"epoch": 4.455858747993579,
"grad_norm": 0.9124286770820618,
"learning_rate": 3.2491878657292643e-07,
"loss": 0.0156,
"mean_token_accuracy": 0.994210958480835,
"num_tokens": 12122733.0,
"step": 1390
},
{
"entropy": 0.8369238376617432,
"epoch": 4.459069020866774,
"grad_norm": 1.171193242073059,
"learning_rate": 3.2117077768817395e-07,
"loss": 0.0305,
"mean_token_accuracy": 0.9782870709896088,
"num_tokens": 12131758.0,
"step": 1391
},
{
"entropy": 0.8763419985771179,
"epoch": 4.462279293739968,
"grad_norm": 0.8820005059242249,
"learning_rate": 3.174437940147268e-07,
"loss": 0.0144,
"mean_token_accuracy": 0.9950818717479706,
"num_tokens": 12141100.0,
"step": 1392
},
{
"entropy": 0.8697878122329712,
"epoch": 4.465489566613162,
"grad_norm": 0.9343982934951782,
"learning_rate": 3.1373785230049356e-07,
"loss": 0.0122,
"mean_token_accuracy": 0.9963211715221405,
"num_tokens": 12149229.0,
"step": 1393
},
{
"entropy": 0.8618627786636353,
"epoch": 4.468699839486356,
"grad_norm": 1.007222294807434,
"learning_rate": 3.1005296919883354e-07,
"loss": 0.0212,
"mean_token_accuracy": 0.9941324591636658,
"num_tokens": 12157044.0,
"step": 1394
},
{
"entropy": 0.7762088775634766,
"epoch": 4.47191011235955,
"grad_norm": 1.0554577112197876,
"learning_rate": 3.0638916126846885e-07,
"loss": 0.0203,
"mean_token_accuracy": 0.9922034442424774,
"num_tokens": 12166146.0,
"step": 1395
},
{
"entropy": 0.901738703250885,
"epoch": 4.475120385232745,
"grad_norm": 1.7087578773498535,
"learning_rate": 3.0274644497342133e-07,
"loss": 0.0146,
"mean_token_accuracy": 0.9947327375411987,
"num_tokens": 12174767.0,
"step": 1396
},
{
"entropy": 0.9199126064777374,
"epoch": 4.478330658105939,
"grad_norm": 1.3522827625274658,
"learning_rate": 2.991248366829291e-07,
"loss": 0.0128,
"mean_token_accuracy": 0.9958087801933289,
"num_tokens": 12182552.0,
"step": 1397
},
{
"entropy": 0.7853618264198303,
"epoch": 4.481540930979134,
"grad_norm": 0.7523943781852722,
"learning_rate": 2.955243526713808e-07,
"loss": 0.0137,
"mean_token_accuracy": 0.9948084652423859,
"num_tokens": 12190296.0,
"step": 1398
},
{
"entropy": 0.8466727435588837,
"epoch": 4.484751203852327,
"grad_norm": 1.0168616771697998,
"learning_rate": 2.91945009118238e-07,
"loss": 0.0165,
"mean_token_accuracy": 0.9945223033428192,
"num_tokens": 12198478.0,
"step": 1399
},
{
"entropy": 0.8088905811309814,
"epoch": 4.487961476725522,
"grad_norm": 1.3884814977645874,
"learning_rate": 2.883868221079628e-07,
"loss": 0.0218,
"mean_token_accuracy": 0.991487979888916,
"num_tokens": 12207216.0,
"step": 1400
},
{
"entropy": 0.7696555852890015,
"epoch": 4.491171749598716,
"grad_norm": 1.1995670795440674,
"learning_rate": 2.848498076299483e-07,
"loss": 0.0202,
"mean_token_accuracy": 0.9934026896953583,
"num_tokens": 12215618.0,
"step": 1401
},
{
"entropy": 0.8784035742282867,
"epoch": 4.49438202247191,
"grad_norm": 1.1318795680999756,
"learning_rate": 2.813339815784416e-07,
"loss": 0.014,
"mean_token_accuracy": 0.996482789516449,
"num_tokens": 12223941.0,
"step": 1402
},
{
"entropy": 0.9082882702350616,
"epoch": 4.497592295345104,
"grad_norm": 0.5473995804786682,
"learning_rate": 2.7783935975247867e-07,
"loss": 0.0101,
"mean_token_accuracy": 0.9972693026065826,
"num_tokens": 12232545.0,
"step": 1403
},
{
"entropy": 0.8398851752281189,
"epoch": 4.500802568218298,
"grad_norm": 1.0231767892837524,
"learning_rate": 2.743659578558089e-07,
"loss": 0.0288,
"mean_token_accuracy": 0.9750750958919525,
"num_tokens": 12241817.0,
"step": 1404
},
{
"entropy": 0.6918390095233917,
"epoch": 4.504012841091493,
"grad_norm": 0.8074386119842529,
"learning_rate": 2.7091379149682683e-07,
"loss": 0.0224,
"mean_token_accuracy": 0.9840397238731384,
"num_tokens": 12250369.0,
"step": 1405
},
{
"entropy": 0.8113892078399658,
"epoch": 4.507223113964687,
"grad_norm": 2.0139825344085693,
"learning_rate": 2.6748287618849957e-07,
"loss": 0.018,
"mean_token_accuracy": 0.993849903345108,
"num_tokens": 12258742.0,
"step": 1406
},
{
"entropy": 0.8949583470821381,
"epoch": 4.510433386837882,
"grad_norm": 0.6584023237228394,
"learning_rate": 2.6407322734829763e-07,
"loss": 0.0095,
"mean_token_accuracy": 0.9971945285797119,
"num_tokens": 12266628.0,
"step": 1407
},
{
"entropy": 0.8380038738250732,
"epoch": 4.513643659711075,
"grad_norm": 0.801179826259613,
"learning_rate": 2.6068486029813154e-07,
"loss": 0.0124,
"mean_token_accuracy": 0.9962698221206665,
"num_tokens": 12274399.0,
"step": 1408
},
{
"entropy": 0.7071676850318909,
"epoch": 4.51685393258427,
"grad_norm": 0.8389870524406433,
"learning_rate": 2.573177902642726e-07,
"loss": 0.0233,
"mean_token_accuracy": 0.9872068166732788,
"num_tokens": 12285419.0,
"step": 1409
},
{
"entropy": 0.7678396701812744,
"epoch": 4.520064205457464,
"grad_norm": 0.7854102849960327,
"learning_rate": 2.539720323772926e-07,
"loss": 0.0199,
"mean_token_accuracy": 0.9910922050476074,
"num_tokens": 12293828.0,
"step": 1410
},
{
"entropy": 0.7876458466053009,
"epoch": 4.523274478330658,
"grad_norm": 0.9480336308479309,
"learning_rate": 2.506476016719922e-07,
"loss": 0.0154,
"mean_token_accuracy": 0.9949440360069275,
"num_tokens": 12301568.0,
"step": 1411
},
{
"entropy": 0.8209743797779083,
"epoch": 4.526484751203852,
"grad_norm": 0.8667705059051514,
"learning_rate": 2.473445130873353e-07,
"loss": 0.0159,
"mean_token_accuracy": 0.9932181537151337,
"num_tokens": 12310382.0,
"step": 1412
},
{
"entropy": 0.866468995809555,
"epoch": 4.529695024077046,
"grad_norm": 0.9462944865226746,
"learning_rate": 2.440627814663804e-07,
"loss": 0.0193,
"mean_token_accuracy": 0.9907811284065247,
"num_tokens": 12318521.0,
"step": 1413
},
{
"entropy": 0.7582902610301971,
"epoch": 4.532905296950241,
"grad_norm": 1.1882989406585693,
"learning_rate": 2.4080242155621327e-07,
"loss": 0.018,
"mean_token_accuracy": 0.9941226541996002,
"num_tokens": 12327369.0,
"step": 1414
},
{
"entropy": 0.7556898593902588,
"epoch": 4.536115569823435,
"grad_norm": 2.0978384017944336,
"learning_rate": 2.3756344800788421e-07,
"loss": 0.0217,
"mean_token_accuracy": 0.9932650327682495,
"num_tokens": 12335719.0,
"step": 1415
},
{
"entropy": 0.858602374792099,
"epoch": 4.539325842696629,
"grad_norm": 1.414963960647583,
"learning_rate": 2.343458753763378e-07,
"loss": 0.0195,
"mean_token_accuracy": 0.9922243356704712,
"num_tokens": 12344686.0,
"step": 1416
},
{
"entropy": 0.8799131810665131,
"epoch": 4.542536115569823,
"grad_norm": 0.9761103391647339,
"learning_rate": 2.3114971812034981e-07,
"loss": 0.0148,
"mean_token_accuracy": 0.9959944486618042,
"num_tokens": 12352531.0,
"step": 1417
},
{
"entropy": 0.8054837286472321,
"epoch": 4.545746388443018,
"grad_norm": 0.7446231842041016,
"learning_rate": 2.2797499060246253e-07,
"loss": 0.0148,
"mean_token_accuracy": 0.9948087632656097,
"num_tokens": 12361539.0,
"step": 1418
},
{
"entropy": 0.8667932152748108,
"epoch": 4.548956661316212,
"grad_norm": 2.7349588871002197,
"learning_rate": 2.2482170708892083e-07,
"loss": 0.0201,
"mean_token_accuracy": 0.9932528138160706,
"num_tokens": 12371893.0,
"step": 1419
},
{
"entropy": 0.8341934084892273,
"epoch": 4.552166934189406,
"grad_norm": 0.983849823474884,
"learning_rate": 2.2168988174960382e-07,
"loss": 0.0176,
"mean_token_accuracy": 0.9940782487392426,
"num_tokens": 12381677.0,
"step": 1420
},
{
"entropy": 0.8310158252716064,
"epoch": 4.5553772070626,
"grad_norm": 0.8708466291427612,
"learning_rate": 2.1857952865796616e-07,
"loss": 0.0147,
"mean_token_accuracy": 0.9952942728996277,
"num_tokens": 12390585.0,
"step": 1421
},
{
"entropy": 0.7231708765029907,
"epoch": 4.558587479935794,
"grad_norm": 1.949539065361023,
"learning_rate": 2.1549066179097355e-07,
"loss": 0.0166,
"mean_token_accuracy": 0.9928002655506134,
"num_tokens": 12399388.0,
"step": 1422
},
{
"entropy": 0.8198031783103943,
"epoch": 4.561797752808989,
"grad_norm": 0.8939534425735474,
"learning_rate": 2.124232950290367e-07,
"loss": 0.0168,
"mean_token_accuracy": 0.9933803975582123,
"num_tokens": 12407701.0,
"step": 1423
},
{
"entropy": 0.7757489085197449,
"epoch": 4.565008025682183,
"grad_norm": 0.7762877345085144,
"learning_rate": 2.0937744215595467e-07,
"loss": 0.0239,
"mean_token_accuracy": 0.981669157743454,
"num_tokens": 12417604.0,
"step": 1424
},
{
"entropy": 0.7322670519351959,
"epoch": 4.568218298555378,
"grad_norm": 0.7595078349113464,
"learning_rate": 2.0635311685884675e-07,
"loss": 0.0226,
"mean_token_accuracy": 0.9879952669143677,
"num_tokens": 12427171.0,
"step": 1425
},
{
"entropy": 0.9202703237533569,
"epoch": 4.571428571428571,
"grad_norm": 1.0135276317596436,
"learning_rate": 2.0335033272809612e-07,
"loss": 0.0141,
"mean_token_accuracy": 0.995720237493515,
"num_tokens": 12436454.0,
"step": 1426
},
{
"entropy": 0.6941528022289276,
"epoch": 4.574638844301766,
"grad_norm": 1.0334597826004028,
"learning_rate": 2.0036910325728521e-07,
"loss": 0.0129,
"mean_token_accuracy": 0.9945822060108185,
"num_tokens": 12444854.0,
"step": 1427
},
{
"entropy": 0.7655043005943298,
"epoch": 4.57784911717496,
"grad_norm": 1.1090309619903564,
"learning_rate": 1.9740944184313882e-07,
"loss": 0.0147,
"mean_token_accuracy": 0.993369847536087,
"num_tokens": 12453038.0,
"step": 1428
},
{
"entropy": 0.8777413368225098,
"epoch": 4.581059390048154,
"grad_norm": 1.6091341972351074,
"learning_rate": 1.9447136178545766e-07,
"loss": 0.0162,
"mean_token_accuracy": 0.995211273431778,
"num_tokens": 12460768.0,
"step": 1429
},
{
"entropy": 0.834505558013916,
"epoch": 4.584269662921348,
"grad_norm": 0.9753161668777466,
"learning_rate": 1.9155487628706672e-07,
"loss": 0.019,
"mean_token_accuracy": 0.9906420409679413,
"num_tokens": 12470811.0,
"step": 1430
},
{
"entropy": 0.8439209461212158,
"epoch": 4.587479935794542,
"grad_norm": 0.7517779469490051,
"learning_rate": 1.8865999845374794e-07,
"loss": 0.0156,
"mean_token_accuracy": 0.9933716654777527,
"num_tokens": 12479429.0,
"step": 1431
},
{
"entropy": 0.7574312388896942,
"epoch": 4.590690208667737,
"grad_norm": 1.003862738609314,
"learning_rate": 1.857867412941883e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9921233355998993,
"num_tokens": 12487836.0,
"step": 1432
},
{
"entropy": 0.7914498746395111,
"epoch": 4.593900481540931,
"grad_norm": 0.7261200547218323,
"learning_rate": 1.8293511771991624e-07,
"loss": 0.0132,
"mean_token_accuracy": 0.9958767890930176,
"num_tokens": 12496088.0,
"step": 1433
},
{
"entropy": 0.8081851303577423,
"epoch": 4.597110754414125,
"grad_norm": 1.6329476833343506,
"learning_rate": 1.8010514054524531e-07,
"loss": 0.0278,
"mean_token_accuracy": 0.9796946048736572,
"num_tokens": 12505783.0,
"step": 1434
},
{
"entropy": 0.8041447103023529,
"epoch": 4.600321027287319,
"grad_norm": 0.9152159094810486,
"learning_rate": 1.7729682248721848e-07,
"loss": 0.0194,
"mean_token_accuracy": 0.9919094741344452,
"num_tokens": 12515067.0,
"step": 1435
},
{
"entropy": 0.8927958905696869,
"epoch": 4.603531300160514,
"grad_norm": 1.4165750741958618,
"learning_rate": 1.7451017616554822e-07,
"loss": 0.0255,
"mean_token_accuracy": 0.9866718351840973,
"num_tokens": 12523827.0,
"step": 1436
},
{
"entropy": 0.8060368299484253,
"epoch": 4.606741573033708,
"grad_norm": 0.9053508639335632,
"learning_rate": 1.7174521410256162e-07,
"loss": 0.017,
"mean_token_accuracy": 0.9934398829936981,
"num_tokens": 12532347.0,
"step": 1437
},
{
"entropy": 0.8163271546363831,
"epoch": 4.609951845906902,
"grad_norm": 1.3091577291488647,
"learning_rate": 1.69001948723142e-07,
"loss": 0.02,
"mean_token_accuracy": 0.9915553331375122,
"num_tokens": 12541394.0,
"step": 1438
},
{
"entropy": 0.7544900178909302,
"epoch": 4.613162118780096,
"grad_norm": 1.1534007787704468,
"learning_rate": 1.6628039235467686e-07,
"loss": 0.0214,
"mean_token_accuracy": 0.9864902794361115,
"num_tokens": 12551329.0,
"step": 1439
},
{
"entropy": 0.853958249092102,
"epoch": 4.61637239165329,
"grad_norm": 0.8594070672988892,
"learning_rate": 1.6358055722699662e-07,
"loss": 0.0111,
"mean_token_accuracy": 0.9959721565246582,
"num_tokens": 12558908.0,
"step": 1440
},
{
"entropy": 0.9293086230754852,
"epoch": 4.619582664526485,
"grad_norm": 0.8603348731994629,
"learning_rate": 1.6090245547232707e-07,
"loss": 0.0133,
"mean_token_accuracy": 0.9953678846359253,
"num_tokens": 12566798.0,
"step": 1441
},
{
"entropy": 0.9653850197792053,
"epoch": 4.622792937399679,
"grad_norm": 0.7823249101638794,
"learning_rate": 1.5824609912522825e-07,
"loss": 0.0189,
"mean_token_accuracy": 0.993864506483078,
"num_tokens": 12575250.0,
"step": 1442
},
{
"entropy": 0.8890847563743591,
"epoch": 4.626003210272874,
"grad_norm": 0.6177572011947632,
"learning_rate": 1.5561150012254446e-07,
"loss": 0.0113,
"mean_token_accuracy": 0.9965679347515106,
"num_tokens": 12583333.0,
"step": 1443
},
{
"entropy": 0.8079483807086945,
"epoch": 4.629213483146067,
"grad_norm": 1.4150201082229614,
"learning_rate": 1.5299867030334815e-07,
"loss": 0.0131,
"mean_token_accuracy": 0.9950767457485199,
"num_tokens": 12592056.0,
"step": 1444
},
{
"entropy": 0.8464027941226959,
"epoch": 4.632423756019262,
"grad_norm": 4.529459476470947,
"learning_rate": 1.5040762140888843e-07,
"loss": 0.0148,
"mean_token_accuracy": 0.9947774410247803,
"num_tokens": 12600705.0,
"step": 1445
},
{
"entropy": 0.848853588104248,
"epoch": 4.635634028892456,
"grad_norm": 2.467555284500122,
"learning_rate": 1.4783836508253823e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9937762022018433,
"num_tokens": 12609927.0,
"step": 1446
},
{
"entropy": 0.7718561589717865,
"epoch": 4.63884430176565,
"grad_norm": 3.8097293376922607,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.0186,
"mean_token_accuracy": 0.9952149987220764,
"num_tokens": 12618947.0,
"step": 1447
},
{
"entropy": 0.9486541748046875,
"epoch": 4.642054574638844,
"grad_norm": 0.7113092541694641,
"learning_rate": 1.4276527621795655e-07,
"loss": 0.0116,
"mean_token_accuracy": 0.9962652921676636,
"num_tokens": 12627476.0,
"step": 1448
},
{
"entropy": 0.8682657778263092,
"epoch": 4.645264847512038,
"grad_norm": 0.6125422716140747,
"learning_rate": 1.402614664766172e-07,
"loss": 0.0118,
"mean_token_accuracy": 0.9960006773471832,
"num_tokens": 12635027.0,
"step": 1449
},
{
"entropy": 0.9030327200889587,
"epoch": 4.648475120385233,
"grad_norm": 1.8995373249053955,
"learning_rate": 1.3777949489706898e-07,
"loss": 0.0214,
"mean_token_accuracy": 0.991911381483078,
"num_tokens": 12644120.0,
"step": 1450
},
{
"entropy": 0.9397462904453278,
"epoch": 4.651685393258427,
"grad_norm": 0.7002797722816467,
"learning_rate": 1.353193726325247e-07,
"loss": 0.0123,
"mean_token_accuracy": 0.9955561757087708,
"num_tokens": 12653499.0,
"step": 1451
},
{
"entropy": 0.8843958079814911,
"epoch": 4.654895666131621,
"grad_norm": 1.1001935005187988,
"learning_rate": 1.3288111073801235e-07,
"loss": 0.0124,
"mean_token_accuracy": 0.9950455129146576,
"num_tokens": 12662275.0,
"step": 1452
},
{
"entropy": 0.8470652103424072,
"epoch": 4.658105939004815,
"grad_norm": 1.5061019659042358,
"learning_rate": 1.3046472017032685e-07,
"loss": 0.018,
"mean_token_accuracy": 0.994785338640213,
"num_tokens": 12670692.0,
"step": 1453
},
{
"entropy": 0.8604851365089417,
"epoch": 4.66131621187801,
"grad_norm": 0.8102709650993347,
"learning_rate": 1.280702117879795e-07,
"loss": 0.0205,
"mean_token_accuracy": 0.99430912733078,
"num_tokens": 12679153.0,
"step": 1454
},
{
"entropy": 0.8193712532520294,
"epoch": 4.664526484751204,
"grad_norm": 1.2029215097427368,
"learning_rate": 1.2569759635115086e-07,
"loss": 0.022,
"mean_token_accuracy": 0.9853865206241608,
"num_tokens": 12688175.0,
"step": 1455
},
{
"entropy": 0.792403519153595,
"epoch": 4.667736757624398,
"grad_norm": 2.498199462890625,
"learning_rate": 1.2334688452164122e-07,
"loss": 0.0159,
"mean_token_accuracy": 0.9941717386245728,
"num_tokens": 12696973.0,
"step": 1456
},
{
"entropy": 0.6442500352859497,
"epoch": 4.670947030497592,
"grad_norm": 1.1447046995162964,
"learning_rate": 1.210180868628219e-07,
"loss": 0.0318,
"mean_token_accuracy": 0.9858497381210327,
"num_tokens": 12707211.0,
"step": 1457
},
{
"entropy": 0.8001232743263245,
"epoch": 4.674157303370786,
"grad_norm": 0.8121841549873352,
"learning_rate": 1.1871121383958961e-07,
"loss": 0.0358,
"mean_token_accuracy": 0.9611911177635193,
"num_tokens": 12717263.0,
"step": 1458
},
{
"entropy": 1.0088177621364594,
"epoch": 4.677367576243981,
"grad_norm": 0.7857615351676941,
"learning_rate": 1.1642627581831767e-07,
"loss": 0.0117,
"mean_token_accuracy": 0.9962861239910126,
"num_tokens": 12727899.0,
"step": 1459
},
{
"entropy": 0.7727092504501343,
"epoch": 4.680577849117175,
"grad_norm": 1.3959094285964966,
"learning_rate": 1.1416328306681046e-07,
"loss": 0.026,
"mean_token_accuracy": 0.9916978478431702,
"num_tokens": 12736448.0,
"step": 1460
},
{
"entropy": 0.9188545346260071,
"epoch": 4.68378812199037,
"grad_norm": 1.0266892910003662,
"learning_rate": 1.1192224575425848e-07,
"loss": 0.0156,
"mean_token_accuracy": 0.9939980804920197,
"num_tokens": 12744974.0,
"step": 1461
},
{
"entropy": 0.7683831751346588,
"epoch": 4.686998394863563,
"grad_norm": 1.3337358236312866,
"learning_rate": 1.0970317395119001e-07,
"loss": 0.0275,
"mean_token_accuracy": 0.9839833378791809,
"num_tokens": 12755044.0,
"step": 1462
},
{
"entropy": 1.0480545163154602,
"epoch": 4.690208667736758,
"grad_norm": 1.289138674736023,
"learning_rate": 1.0750607762942622e-07,
"loss": 0.0135,
"mean_token_accuracy": 0.9952871203422546,
"num_tokens": 12764198.0,
"step": 1463
},
{
"entropy": 0.8789570927619934,
"epoch": 4.693418940609952,
"grad_norm": 3.7718007564544678,
"learning_rate": 1.0533096666203946e-07,
"loss": 0.0158,
"mean_token_accuracy": 0.9947271049022675,
"num_tokens": 12772123.0,
"step": 1464
},
{
"entropy": 0.8715825378894806,
"epoch": 4.696629213483146,
"grad_norm": 0.849036693572998,
"learning_rate": 1.0317785082330555e-07,
"loss": 0.0107,
"mean_token_accuracy": 0.9959846436977386,
"num_tokens": 12781152.0,
"step": 1465
},
{
"entropy": 0.8796096742153168,
"epoch": 4.69983948635634,
"grad_norm": 1.3320071697235107,
"learning_rate": 1.0104673978866164e-07,
"loss": 0.0209,
"mean_token_accuracy": 0.9909534156322479,
"num_tokens": 12790728.0,
"step": 1466
},
{
"entropy": 0.8747296929359436,
"epoch": 4.703049759229534,
"grad_norm": 1.0539300441741943,
"learning_rate": 9.89376431346606e-08,
"loss": 0.0173,
"mean_token_accuracy": 0.9945295751094818,
"num_tokens": 12799861.0,
"step": 1467
},
{
"entropy": 0.7705346345901489,
"epoch": 4.706260032102729,
"grad_norm": 1.595797061920166,
"learning_rate": 9.685057033892998e-08,
"loss": 0.0153,
"mean_token_accuracy": 0.9951062500476837,
"num_tokens": 12807796.0,
"step": 1468
},
{
"entropy": 0.8849585354328156,
"epoch": 4.709470304975923,
"grad_norm": 0.6699755787849426,
"learning_rate": 9.478553078013042e-08,
"loss": 0.014,
"mean_token_accuracy": 0.9962992668151855,
"num_tokens": 12816289.0,
"step": 1469
},
{
"entropy": 0.803180068731308,
"epoch": 4.712680577849117,
"grad_norm": 1.6749389171600342,
"learning_rate": 9.274253373791064e-08,
"loss": 0.0215,
"mean_token_accuracy": 0.9928462505340576,
"num_tokens": 12825105.0,
"step": 1470
},
{
"entropy": 0.8213406801223755,
"epoch": 4.715890850722311,
"grad_norm": 1.0112464427947998,
"learning_rate": 9.072158839286748e-08,
"loss": 0.0248,
"mean_token_accuracy": 0.9840686023235321,
"num_tokens": 12834541.0,
"step": 1471
},
{
"entropy": 0.7910043299198151,
"epoch": 4.719101123595506,
"grad_norm": 0.8969994187355042,
"learning_rate": 8.872270382650372e-08,
"loss": 0.0168,
"mean_token_accuracy": 0.9944667518138885,
"num_tokens": 12843003.0,
"step": 1472
},
{
"entropy": 0.8581178486347198,
"epoch": 4.7223113964687,
"grad_norm": 0.8233668208122253,
"learning_rate": 8.674588902118919e-08,
"loss": 0.0206,
"mean_token_accuracy": 0.9897873401641846,
"num_tokens": 12852903.0,
"step": 1473
},
{
"entropy": 0.8728442490100861,
"epoch": 4.725521669341894,
"grad_norm": 0.8017443418502808,
"learning_rate": 8.479115286011752e-08,
"loss": 0.012,
"mean_token_accuracy": 0.995659202337265,
"num_tokens": 12862161.0,
"step": 1474
},
{
"entropy": 0.8244474232196808,
"epoch": 4.728731942215088,
"grad_norm": 1.4547849893569946,
"learning_rate": 8.285850412726837e-08,
"loss": 0.0152,
"mean_token_accuracy": 0.9953339695930481,
"num_tokens": 12870310.0,
"step": 1475
},
{
"entropy": 0.8828493654727936,
"epoch": 4.731942215088282,
"grad_norm": 1.2691254615783691,
"learning_rate": 8.094795150736745e-08,
"loss": 0.0132,
"mean_token_accuracy": 0.9957073032855988,
"num_tokens": 12878220.0,
"step": 1476
},
{
"entropy": 0.8865284621715546,
"epoch": 4.735152487961477,
"grad_norm": 0.8567522764205933,
"learning_rate": 7.905950358584768e-08,
"loss": 0.0198,
"mean_token_accuracy": 0.9906137585639954,
"num_tokens": 12886608.0,
"step": 1477
},
{
"entropy": 0.8262231051921844,
"epoch": 4.738362760834671,
"grad_norm": 1.0661060810089111,
"learning_rate": 7.719316884880922e-08,
"loss": 0.0185,
"mean_token_accuracy": 0.9928100407123566,
"num_tokens": 12895485.0,
"step": 1478
},
{
"entropy": 0.7830840349197388,
"epoch": 4.741573033707866,
"grad_norm": 1.305324912071228,
"learning_rate": 7.534895568298395e-08,
"loss": 0.0164,
"mean_token_accuracy": 0.9932570457458496,
"num_tokens": 12903598.0,
"step": 1479
},
{
"entropy": 0.8163612484931946,
"epoch": 4.744783306581059,
"grad_norm": 0.6662154197692871,
"learning_rate": 7.352687237569489e-08,
"loss": 0.0117,
"mean_token_accuracy": 0.9972147941589355,
"num_tokens": 12911756.0,
"step": 1480
},
{
"entropy": 0.9148772060871124,
"epoch": 4.747993579454254,
"grad_norm": 0.9864494800567627,
"learning_rate": 7.172692711482022e-08,
"loss": 0.0235,
"mean_token_accuracy": 0.9813115894794464,
"num_tokens": 12920570.0,
"step": 1481
},
{
"entropy": 0.9331631064414978,
"epoch": 4.751203852327448,
"grad_norm": 0.6732865571975708,
"learning_rate": 6.994912798875875e-08,
"loss": 0.0111,
"mean_token_accuracy": 0.9955635666847229,
"num_tokens": 12928073.0,
"step": 1482
},
{
"entropy": 0.9015617370605469,
"epoch": 4.754414125200642,
"grad_norm": 1.075960636138916,
"learning_rate": 6.819348298638839e-08,
"loss": 0.0168,
"mean_token_accuracy": 0.9937729239463806,
"num_tokens": 12936594.0,
"step": 1483
},
{
"entropy": 1.0504088997840881,
"epoch": 4.757624398073836,
"grad_norm": 0.8205307722091675,
"learning_rate": 6.6459999997035e-08,
"loss": 0.0288,
"mean_token_accuracy": 0.9749292135238647,
"num_tokens": 12948928.0,
"step": 1484
},
{
"entropy": 0.8226450979709625,
"epoch": 4.76083467094703,
"grad_norm": 0.8952516317367554,
"learning_rate": 6.474868681043578e-08,
"loss": 0.0205,
"mean_token_accuracy": 0.9924160242080688,
"num_tokens": 12957574.0,
"step": 1485
},
{
"entropy": 0.8268488645553589,
"epoch": 4.764044943820225,
"grad_norm": 2.353219747543335,
"learning_rate": 6.305955111670204e-08,
"loss": 0.0216,
"mean_token_accuracy": 0.9924924671649933,
"num_tokens": 12967174.0,
"step": 1486
},
{
"entropy": 1.0455307960510254,
"epoch": 4.767255216693419,
"grad_norm": 1.843535304069519,
"learning_rate": 6.13926005062876e-08,
"loss": 0.0099,
"mean_token_accuracy": 0.9977114200592041,
"num_tokens": 12976414.0,
"step": 1487
},
{
"entropy": 0.9094734787940979,
"epoch": 4.770465489566613,
"grad_norm": 0.8578094244003296,
"learning_rate": 5.974784246995214e-08,
"loss": 0.0146,
"mean_token_accuracy": 0.9947217106819153,
"num_tokens": 12984704.0,
"step": 1488
},
{
"entropy": 0.8561606407165527,
"epoch": 4.773675762439807,
"grad_norm": 1.4484977722167969,
"learning_rate": 5.8125284398730666e-08,
"loss": 0.022,
"mean_token_accuracy": 0.990572988986969,
"num_tokens": 12993088.0,
"step": 1489
},
{
"entropy": 0.8771012425422668,
"epoch": 4.776886035313002,
"grad_norm": 0.8078578114509583,
"learning_rate": 5.6524933583896326e-08,
"loss": 0.0267,
"mean_token_accuracy": 0.9761388897895813,
"num_tokens": 13003109.0,
"step": 1490
},
{
"entropy": 1.0275262296199799,
"epoch": 4.780096308186196,
"grad_norm": 2.095111846923828,
"learning_rate": 5.4946797216931524e-08,
"loss": 0.0144,
"mean_token_accuracy": 0.9961507022380829,
"num_tokens": 13011066.0,
"step": 1491
},
{
"entropy": 0.8347211182117462,
"epoch": 4.78330658105939,
"grad_norm": 1.8958369493484497,
"learning_rate": 5.339088238949186e-08,
"loss": 0.0245,
"mean_token_accuracy": 0.9858895838260651,
"num_tokens": 13019563.0,
"step": 1492
},
{
"entropy": 0.6933950185775757,
"epoch": 4.786516853932584,
"grad_norm": 1.2030583620071411,
"learning_rate": 5.185719609337836e-08,
"loss": 0.0201,
"mean_token_accuracy": 0.9926663935184479,
"num_tokens": 13028661.0,
"step": 1493
},
{
"entropy": 0.8557841181755066,
"epoch": 4.789727126805778,
"grad_norm": 1.0296180248260498,
"learning_rate": 5.034574522050251e-08,
"loss": 0.0129,
"mean_token_accuracy": 0.9967096447944641,
"num_tokens": 13036535.0,
"step": 1494
},
{
"entropy": 0.740024745464325,
"epoch": 4.792937399678973,
"grad_norm": 0.9318440556526184,
"learning_rate": 4.885653656285627e-08,
"loss": 0.0195,
"mean_token_accuracy": 0.9905839264392853,
"num_tokens": 13045182.0,
"step": 1495
},
{
"entropy": 0.7861917316913605,
"epoch": 4.796147672552167,
"grad_norm": 0.8870951533317566,
"learning_rate": 4.73895768124838e-08,
"loss": 0.0134,
"mean_token_accuracy": 0.9954321086406708,
"num_tokens": 13054371.0,
"step": 1496
},
{
"entropy": 0.8475262522697449,
"epoch": 4.799357945425362,
"grad_norm": 3.0993971824645996,
"learning_rate": 4.5944872561448084e-08,
"loss": 0.0143,
"mean_token_accuracy": 0.9950073659420013,
"num_tokens": 13062352.0,
"step": 1497
},
{
"entropy": 0.9056753218173981,
"epoch": 4.802568218298555,
"grad_norm": 0.47460415959358215,
"learning_rate": 4.45224303018027e-08,
"loss": 0.0102,
"mean_token_accuracy": 0.9964375793933868,
"num_tokens": 13070029.0,
"step": 1498
},
{
"entropy": 0.8055447340011597,
"epoch": 4.80577849117175,
"grad_norm": 1.6484469175338745,
"learning_rate": 4.3122256425563444e-08,
"loss": 0.0177,
"mean_token_accuracy": 0.9926461279392242,
"num_tokens": 13078374.0,
"step": 1499
},
{
"entropy": 0.8468793332576752,
"epoch": 4.808988764044944,
"grad_norm": 1.0291019678115845,
"learning_rate": 4.174435722467951e-08,
"loss": 0.0125,
"mean_token_accuracy": 0.99583500623703,
"num_tokens": 13086183.0,
"step": 1500
},
{
"entropy": 0.9175726771354675,
"epoch": 4.8121990369181376,
"grad_norm": 0.8731938004493713,
"learning_rate": 4.038873889100237e-08,
"loss": 0.0103,
"mean_token_accuracy": 0.9962366223335266,
"num_tokens": 13095228.0,
"step": 1501
},
{
"entropy": 0.8306655585765839,
"epoch": 4.815409309791332,
"grad_norm": 0.9269821643829346,
"learning_rate": 3.905540751626191e-08,
"loss": 0.0154,
"mean_token_accuracy": 0.9948339760303497,
"num_tokens": 13104282.0,
"step": 1502
},
{
"entropy": 0.8430630564689636,
"epoch": 4.818619582664526,
"grad_norm": 1.019399642944336,
"learning_rate": 3.77443690920376e-08,
"loss": 0.0288,
"mean_token_accuracy": 0.9845989346504211,
"num_tokens": 13113668.0,
"step": 1503
},
{
"entropy": 0.8044372200965881,
"epoch": 4.821829855537721,
"grad_norm": 0.831784188747406,
"learning_rate": 3.645562950973014e-08,
"loss": 0.0133,
"mean_token_accuracy": 0.9945938289165497,
"num_tokens": 13122339.0,
"step": 1504
},
{
"entropy": 0.813548743724823,
"epoch": 4.825040128410915,
"grad_norm": 0.8253054022789001,
"learning_rate": 3.518919456053649e-08,
"loss": 0.0131,
"mean_token_accuracy": 0.9954196214675903,
"num_tokens": 13130760.0,
"step": 1505
},
{
"entropy": 0.8437825441360474,
"epoch": 4.828250401284109,
"grad_norm": 1.0685738325119019,
"learning_rate": 3.3945069935423234e-08,
"loss": 0.0126,
"mean_token_accuracy": 0.9963714182376862,
"num_tokens": 13139593.0,
"step": 1506
},
{
"entropy": 0.8084733188152313,
"epoch": 4.831460674157303,
"grad_norm": 0.9814333915710449,
"learning_rate": 3.2723261225102164e-08,
"loss": 0.0168,
"mean_token_accuracy": 0.9951196610927582,
"num_tokens": 13147734.0,
"step": 1507
},
{
"entropy": 0.9518248438835144,
"epoch": 4.834670947030498,
"grad_norm": 0.9631071090698242,
"learning_rate": 3.152377392000361e-08,
"loss": 0.0138,
"mean_token_accuracy": 0.9940395951271057,
"num_tokens": 13156608.0,
"step": 1508
},
{
"entropy": 0.8305748105049133,
"epoch": 4.837881219903692,
"grad_norm": 0.647260308265686,
"learning_rate": 3.034661341025258e-08,
"loss": 0.0106,
"mean_token_accuracy": 0.9974878430366516,
"num_tokens": 13164500.0,
"step": 1509
},
{
"entropy": 0.93598473072052,
"epoch": 4.841091492776886,
"grad_norm": 0.5800625681877136,
"learning_rate": 2.9191784985644345e-08,
"loss": 0.0105,
"mean_token_accuracy": 0.995660126209259,
"num_tokens": 13173445.0,
"step": 1510
},
{
"entropy": 0.9177517294883728,
"epoch": 4.84430176565008,
"grad_norm": 0.9533823132514954,
"learning_rate": 2.8059293835620006e-08,
"loss": 0.0126,
"mean_token_accuracy": 0.9962038695812225,
"num_tokens": 13181508.0,
"step": 1511
},
{
"entropy": 0.8395648300647736,
"epoch": 4.847512038523274,
"grad_norm": 0.8162403106689453,
"learning_rate": 2.6949145049245396e-08,
"loss": 0.0135,
"mean_token_accuracy": 0.9970743358135223,
"num_tokens": 13189174.0,
"step": 1512
},
{
"entropy": 0.8557933866977692,
"epoch": 4.850722311396469,
"grad_norm": 1.0253946781158447,
"learning_rate": 2.5861343615184997e-08,
"loss": 0.0323,
"mean_token_accuracy": 0.9815960228443146,
"num_tokens": 13198056.0,
"step": 1513
},
{
"entropy": 0.8704625368118286,
"epoch": 4.853932584269663,
"grad_norm": 0.6910727620124817,
"learning_rate": 2.479589442168251e-08,
"loss": 0.0122,
"mean_token_accuracy": 0.9952109158039093,
"num_tokens": 13206774.0,
"step": 1514
},
{
"entropy": 1.0808364748954773,
"epoch": 4.857142857142857,
"grad_norm": 0.9426855444908142,
"learning_rate": 2.3752802256536423e-08,
"loss": 0.0146,
"mean_token_accuracy": 0.9937877953052521,
"num_tokens": 13215645.0,
"step": 1515
},
{
"entropy": 0.7950730621814728,
"epoch": 4.860353130016051,
"grad_norm": 0.948599100112915,
"learning_rate": 2.2732071807081147e-08,
"loss": 0.0361,
"mean_token_accuracy": 0.9701245427131653,
"num_tokens": 13225286.0,
"step": 1516
},
{
"entropy": 0.7923941910266876,
"epoch": 4.863563402889246,
"grad_norm": 0.886232316493988,
"learning_rate": 2.173370766016314e-08,
"loss": 0.0162,
"mean_token_accuracy": 0.9943108260631561,
"num_tokens": 13234636.0,
"step": 1517
},
{
"entropy": 0.8961697816848755,
"epoch": 4.86677367576244,
"grad_norm": 0.49067121744155884,
"learning_rate": 2.0757714302122035e-08,
"loss": 0.0092,
"mean_token_accuracy": 0.9976347088813782,
"num_tokens": 13243074.0,
"step": 1518
},
{
"entropy": 0.8559750616550446,
"epoch": 4.8699839486356336,
"grad_norm": 0.9737809896469116,
"learning_rate": 1.98040961187701e-08,
"loss": 0.0275,
"mean_token_accuracy": 0.9791916608810425,
"num_tokens": 13252750.0,
"step": 1519
},
{
"entropy": 0.8368252515792847,
"epoch": 4.873194221508828,
"grad_norm": 1.5021547079086304,
"learning_rate": 1.8872857395372812e-08,
"loss": 0.0247,
"mean_token_accuracy": 0.9902662932872772,
"num_tokens": 13262034.0,
"step": 1520
},
{
"entropy": 0.8385051488876343,
"epoch": 4.876404494382022,
"grad_norm": 1.3577289581298828,
"learning_rate": 1.7964002316628316e-08,
"loss": 0.0259,
"mean_token_accuracy": 0.99179607629776,
"num_tokens": 13271283.0,
"step": 1521
},
{
"entropy": 0.8158581852912903,
"epoch": 4.879614767255217,
"grad_norm": 0.9924322366714478,
"learning_rate": 1.7077534966650767e-08,
"loss": 0.0154,
"mean_token_accuracy": 0.9940443634986877,
"num_tokens": 13280203.0,
"step": 1522
},
{
"entropy": 0.9179269969463348,
"epoch": 4.882825040128411,
"grad_norm": 0.9713982939720154,
"learning_rate": 1.6213459328950355e-08,
"loss": 0.0145,
"mean_token_accuracy": 0.9961579740047455,
"num_tokens": 13288465.0,
"step": 1523
},
{
"entropy": 0.7583066523075104,
"epoch": 4.886035313001605,
"grad_norm": 0.917400062084198,
"learning_rate": 1.537177928641498e-08,
"loss": 0.0187,
"mean_token_accuracy": 0.9913864731788635,
"num_tokens": 13297225.0,
"step": 1524
},
{
"entropy": 0.8535176515579224,
"epoch": 4.889245585874799,
"grad_norm": 0.8154090046882629,
"learning_rate": 1.4552498621295264e-08,
"loss": 0.0179,
"mean_token_accuracy": 0.9920974969863892,
"num_tokens": 13305361.0,
"step": 1525
},
{
"entropy": 0.7557698488235474,
"epoch": 4.892455858747994,
"grad_norm": 0.8725979924201965,
"learning_rate": 1.3755621015184018e-08,
"loss": 0.0152,
"mean_token_accuracy": 0.9954307377338409,
"num_tokens": 13314453.0,
"step": 1526
},
{
"entropy": 0.8124125897884369,
"epoch": 4.895666131621188,
"grad_norm": 1.709907054901123,
"learning_rate": 1.2981150049004021e-08,
"loss": 0.0233,
"mean_token_accuracy": 0.9874438941478729,
"num_tokens": 13323863.0,
"step": 1527
},
{
"entropy": 0.8402214646339417,
"epoch": 4.898876404494382,
"grad_norm": 1.5540814399719238,
"learning_rate": 1.2229089202987487e-08,
"loss": 0.0232,
"mean_token_accuracy": 0.9862401783466339,
"num_tokens": 13332876.0,
"step": 1528
},
{
"entropy": 0.6956556737422943,
"epoch": 4.902086677367576,
"grad_norm": 1.3305988311767578,
"learning_rate": 1.1499441856663296e-08,
"loss": 0.0278,
"mean_token_accuracy": 0.9894447326660156,
"num_tokens": 13342874.0,
"step": 1529
},
{
"entropy": 0.7873148918151855,
"epoch": 4.90529695024077,
"grad_norm": 1.0279408693313599,
"learning_rate": 1.0792211288841447e-08,
"loss": 0.0132,
"mean_token_accuracy": 0.9951443374156952,
"num_tokens": 13351380.0,
"step": 1530
},
{
"entropy": 0.8480234742164612,
"epoch": 4.908507223113965,
"grad_norm": 4.671872138977051,
"learning_rate": 1.0107400677596413e-08,
"loss": 0.0135,
"mean_token_accuracy": 0.9976670742034912,
"num_tokens": 13359235.0,
"step": 1531
},
{
"entropy": 0.7645809650421143,
"epoch": 4.911717495987159,
"grad_norm": 0.9382192492485046,
"learning_rate": 9.44501310025603e-09,
"loss": 0.0293,
"mean_token_accuracy": 0.9814907908439636,
"num_tokens": 13368521.0,
"step": 1532
},
{
"entropy": 0.9574826657772064,
"epoch": 4.914927768860353,
"grad_norm": 2.1182193756103516,
"learning_rate": 8.805051533384846e-09,
"loss": 0.0177,
"mean_token_accuracy": 0.994848906993866,
"num_tokens": 13378012.0,
"step": 1533
},
{
"entropy": 0.8288282752037048,
"epoch": 4.918138041733547,
"grad_norm": 1.078923225402832,
"learning_rate": 8.187518852771914e-09,
"loss": 0.0236,
"mean_token_accuracy": 0.9885841608047485,
"num_tokens": 13387681.0,
"step": 1534
},
{
"entropy": 0.7388648986816406,
"epoch": 4.921348314606742,
"grad_norm": 1.584767460823059,
"learning_rate": 7.59241783341913e-09,
"loss": 0.0339,
"mean_token_accuracy": 0.9835653305053711,
"num_tokens": 13397378.0,
"step": 1535
},
{
"entropy": 0.8454699814319611,
"epoch": 4.924558587479936,
"grad_norm": 0.950717031955719,
"learning_rate": 7.019751149525133e-09,
"loss": 0.0138,
"mean_token_accuracy": 0.9948793649673462,
"num_tokens": 13405427.0,
"step": 1536
},
{
"entropy": 0.8609819412231445,
"epoch": 4.9277688603531296,
"grad_norm": 1.4333152770996094,
"learning_rate": 6.469521374477539e-09,
"loss": 0.0131,
"mean_token_accuracy": 0.9950776100158691,
"num_tokens": 13412745.0,
"step": 1537
},
{
"entropy": 0.9362488389015198,
"epoch": 4.930979133226324,
"grad_norm": 0.8439843654632568,
"learning_rate": 5.941730980839056e-09,
"loss": 0.0145,
"mean_token_accuracy": 0.9941385388374329,
"num_tokens": 13421064.0,
"step": 1538
},
{
"entropy": 0.9060971140861511,
"epoch": 4.934189406099518,
"grad_norm": 0.8039864301681519,
"learning_rate": 5.436382340335833e-09,
"loss": 0.0122,
"mean_token_accuracy": 0.9962558746337891,
"num_tokens": 13430385.0,
"step": 1539
},
{
"entropy": 0.8013357520103455,
"epoch": 4.937399678972713,
"grad_norm": 1.3938992023468018,
"learning_rate": 4.9534777238485764e-09,
"loss": 0.0081,
"mean_token_accuracy": 0.9980973601341248,
"num_tokens": 13438726.0,
"step": 1540
},
{
"entropy": 0.8251314759254456,
"epoch": 4.940609951845907,
"grad_norm": 0.7287578582763672,
"learning_rate": 4.493019301401447e-09,
"loss": 0.0228,
"mean_token_accuracy": 0.9862898588180542,
"num_tokens": 13446926.0,
"step": 1541
},
{
"entropy": 0.7694223821163177,
"epoch": 4.943820224719101,
"grad_norm": 1.3944227695465088,
"learning_rate": 4.055009142152066e-09,
"loss": 0.0306,
"mean_token_accuracy": 0.9875506162643433,
"num_tokens": 13455631.0,
"step": 1542
},
{
"entropy": 0.7734574675559998,
"epoch": 4.947030497592295,
"grad_norm": 1.3640856742858887,
"learning_rate": 3.6394492143820847e-09,
"loss": 0.0164,
"mean_token_accuracy": 0.992920994758606,
"num_tokens": 13463784.0,
"step": 1543
},
{
"entropy": 0.8477518856525421,
"epoch": 4.95024077046549,
"grad_norm": 1.2641561031341553,
"learning_rate": 3.2463413854899594e-09,
"loss": 0.0433,
"mean_token_accuracy": 0.9660729467868805,
"num_tokens": 13473673.0,
"step": 1544
},
{
"entropy": 0.7746582925319672,
"epoch": 4.953451043338684,
"grad_norm": 1.149497389793396,
"learning_rate": 2.875687421980966e-09,
"loss": 0.0198,
"mean_token_accuracy": 0.9907875061035156,
"num_tokens": 13482323.0,
"step": 1545
},
{
"entropy": 0.8061198890209198,
"epoch": 4.956661316211878,
"grad_norm": 1.7826182842254639,
"learning_rate": 2.5274889894583156e-09,
"loss": 0.0201,
"mean_token_accuracy": 0.9891678988933563,
"num_tokens": 13491215.0,
"step": 1546
},
{
"entropy": 0.7129835784435272,
"epoch": 4.959871589085072,
"grad_norm": 0.6761153936386108,
"learning_rate": 2.201747652618713e-09,
"loss": 0.0124,
"mean_token_accuracy": 0.9959794282913208,
"num_tokens": 13500246.0,
"step": 1547
},
{
"entropy": 0.7953590452671051,
"epoch": 4.963081861958266,
"grad_norm": 0.9348256587982178,
"learning_rate": 1.8984648752429222e-09,
"loss": 0.0181,
"mean_token_accuracy": 0.9927394986152649,
"num_tokens": 13509290.0,
"step": 1548
},
{
"entropy": 0.7777528762817383,
"epoch": 4.966292134831461,
"grad_norm": 1.0118370056152344,
"learning_rate": 1.6176420201902132e-09,
"loss": 0.0179,
"mean_token_accuracy": 0.9956157803535461,
"num_tokens": 13517919.0,
"step": 1549
},
{
"entropy": 0.7566848695278168,
"epoch": 4.969502407704655,
"grad_norm": 0.8841029405593872,
"learning_rate": 1.3592803493905904e-09,
"loss": 0.0137,
"mean_token_accuracy": 0.9926736652851105,
"num_tokens": 13525933.0,
"step": 1550
},
{
"entropy": 0.7857078313827515,
"epoch": 4.972712680577849,
"grad_norm": 0.8450862169265747,
"learning_rate": 1.1233810238425735e-09,
"loss": 0.0135,
"mean_token_accuracy": 0.9971432387828827,
"num_tokens": 13533747.0,
"step": 1551
},
{
"entropy": 0.8908677697181702,
"epoch": 4.975922953451043,
"grad_norm": 0.5804739594459534,
"learning_rate": 9.099451036048701e-10,
"loss": 0.0088,
"mean_token_accuracy": 0.9979284405708313,
"num_tokens": 13541753.0,
"step": 1552
},
{
"entropy": 0.8221500515937805,
"epoch": 4.979133226324238,
"grad_norm": 1.1342151165008545,
"learning_rate": 7.189735477913795e-10,
"loss": 0.016,
"mean_token_accuracy": 0.9946431219577789,
"num_tokens": 13549463.0,
"step": 1553
},
{
"entropy": 0.8285010457038879,
"epoch": 4.982343499197432,
"grad_norm": 0.6920357942581177,
"learning_rate": 5.504672145700829e-10,
"loss": 0.0111,
"mean_token_accuracy": 0.9967454969882965,
"num_tokens": 13557770.0,
"step": 1554
},
{
"entropy": 0.9648899137973785,
"epoch": 4.9855537720706256,
"grad_norm": 1.078553318977356,
"learning_rate": 4.0442686115582665e-10,
"loss": 0.0209,
"mean_token_accuracy": 0.9916457831859589,
"num_tokens": 13567441.0,
"step": 1555
},
{
"entropy": 0.8371059596538544,
"epoch": 4.98876404494382,
"grad_norm": 0.7630732655525208,
"learning_rate": 2.8085314380976725e-10,
"loss": 0.0127,
"mean_token_accuracy": 0.9956499338150024,
"num_tokens": 13575810.0,
"step": 1556
},
{
"entropy": 0.7578130662441254,
"epoch": 4.991974317817014,
"grad_norm": 0.8679744005203247,
"learning_rate": 1.797466178327101e-10,
"loss": 0.0137,
"mean_token_accuracy": 0.9955244362354279,
"num_tokens": 13583801.0,
"step": 1557
},
{
"entropy": 0.8360298871994019,
"epoch": 4.995184590690209,
"grad_norm": 0.9718247652053833,
"learning_rate": 1.011077375662195e-10,
"loss": 0.0151,
"mean_token_accuracy": 0.9935263097286224,
"num_tokens": 13592100.0,
"step": 1558
},
{
"entropy": 0.870440274477005,
"epoch": 4.998394863563403,
"grad_norm": 1.1290241479873657,
"learning_rate": 4.4936856390398465e-11,
"loss": 0.0158,
"mean_token_accuracy": 0.9939267933368683,
"num_tokens": 13600502.0,
"step": 1559
},
{
"entropy": 0.7823728322982788,
"epoch": 5.0,
"grad_norm": 1.0314245223999023,
"learning_rate": 1.1234226718337405e-11,
"loss": 0.0135,
"mean_token_accuracy": 0.9929947257041931,
"num_tokens": 13605710.0,
"step": 1560
}
],
"logging_steps": 1.0,
"max_steps": 1560,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.906663239306445e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}