leonMW's picture
Model save
1275973 verified
Invalid JSON: Unexpected token 'I', ..."_metric": Infinity, "... is not valid JSON
{
"best_global_step": null,
"best_metric": Infinity,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5664891004562378,
"epoch": 0.05,
"grad_norm": 28.908475875854492,
"learning_rate": 0.0,
"loss": 1.6941,
"mean_token_accuracy": 0.6499292850494385,
"num_tokens": 262040.0,
"step": 1
},
{
"entropy": 0.5717914700508118,
"epoch": 0.1,
"grad_norm": 28.778432846069336,
"learning_rate": 3.3333333333333334e-08,
"loss": 1.6883,
"mean_token_accuracy": 0.644157350063324,
"num_tokens": 524072.0,
"step": 2
},
{
"entropy": 0.5704326629638672,
"epoch": 0.15,
"grad_norm": 29.55396842956543,
"learning_rate": 6.666666666666667e-08,
"loss": 1.6709,
"mean_token_accuracy": 0.6549088954925537,
"num_tokens": 786102.0,
"step": 3
},
{
"entropy": 0.5724536180496216,
"epoch": 0.2,
"grad_norm": 28.995824813842773,
"learning_rate": 1e-07,
"loss": 1.6821,
"mean_token_accuracy": 0.6475409865379333,
"num_tokens": 1048106.0,
"step": 4
},
{
"entropy": 0.5743661522865295,
"epoch": 0.25,
"grad_norm": 26.334909439086914,
"learning_rate": 1.3333333333333334e-07,
"loss": 1.6381,
"mean_token_accuracy": 0.6527131795883179,
"num_tokens": 1310172.0,
"step": 5
},
{
"entropy": 0.5727732181549072,
"epoch": 0.3,
"grad_norm": 28.073936462402344,
"learning_rate": 1.6666666666666665e-07,
"loss": 1.6422,
"mean_token_accuracy": 0.6617563962936401,
"num_tokens": 1572233.0,
"step": 6
},
{
"entropy": 0.57081538438797,
"epoch": 0.35,
"grad_norm": 28.08388900756836,
"learning_rate": 2e-07,
"loss": 1.6665,
"mean_token_accuracy": 0.6370558142662048,
"num_tokens": 1834270.0,
"step": 7
},
{
"entropy": 0.5670446157455444,
"epoch": 0.4,
"grad_norm": 28.71568489074707,
"learning_rate": 2.3333333333333333e-07,
"loss": 1.6712,
"mean_token_accuracy": 0.6549586653709412,
"num_tokens": 2096348.0,
"step": 8
},
{
"entropy": 0.5676782131195068,
"epoch": 0.45,
"grad_norm": 27.27318572998047,
"learning_rate": 2.6666666666666667e-07,
"loss": 1.7086,
"mean_token_accuracy": 0.6470588445663452,
"num_tokens": 2358417.0,
"step": 9
},
{
"entropy": 0.5776336789131165,
"epoch": 0.5,
"grad_norm": 28.332353591918945,
"learning_rate": 3e-07,
"loss": 1.6415,
"mean_token_accuracy": 0.6566205620765686,
"num_tokens": 2620440.0,
"step": 10
},
{
"entropy": 0.572010338306427,
"epoch": 0.55,
"grad_norm": 27.795087814331055,
"learning_rate": 3.333333333333333e-07,
"loss": 1.7014,
"mean_token_accuracy": 0.6489361524581909,
"num_tokens": 2882489.0,
"step": 11
},
{
"entropy": 0.5682384967803955,
"epoch": 0.6,
"grad_norm": 28.871315002441406,
"learning_rate": 3.666666666666666e-07,
"loss": 1.7225,
"mean_token_accuracy": 0.6407634615898132,
"num_tokens": 3144549.0,
"step": 12
},
{
"entropy": 0.5681391954421997,
"epoch": 0.65,
"grad_norm": 27.233278274536133,
"learning_rate": 4e-07,
"loss": 1.5898,
"mean_token_accuracy": 0.6640344858169556,
"num_tokens": 3406634.0,
"step": 13
},
{
"entropy": 0.5735733509063721,
"epoch": 0.7,
"grad_norm": 28.354537963867188,
"learning_rate": 4.3333333333333335e-07,
"loss": 1.6334,
"mean_token_accuracy": 0.6414058208465576,
"num_tokens": 3668652.0,
"step": 14
},
{
"entropy": 0.5718874931335449,
"epoch": 0.75,
"grad_norm": 26.07267189025879,
"learning_rate": 4.6666666666666666e-07,
"loss": 1.6714,
"mean_token_accuracy": 0.6530214548110962,
"num_tokens": 3930718.0,
"step": 15
},
{
"entropy": 0.5689897537231445,
"epoch": 0.8,
"grad_norm": 26.958057403564453,
"learning_rate": 5e-07,
"loss": 1.6341,
"mean_token_accuracy": 0.6614886522293091,
"num_tokens": 4192790.0,
"step": 16
},
{
"entropy": 0.5694440603256226,
"epoch": 0.85,
"grad_norm": 25.388864517211914,
"learning_rate": 5.333333333333333e-07,
"loss": 1.5602,
"mean_token_accuracy": 0.6735086441040039,
"num_tokens": 4454876.0,
"step": 17
},
{
"entropy": 0.5694302916526794,
"epoch": 0.9,
"grad_norm": 27.550378799438477,
"learning_rate": 5.666666666666666e-07,
"loss": 1.572,
"mean_token_accuracy": 0.6659559607505798,
"num_tokens": 4716909.0,
"step": 18
},
{
"entropy": 0.5698995590209961,
"epoch": 0.95,
"grad_norm": 26.377119064331055,
"learning_rate": 6e-07,
"loss": 1.6109,
"mean_token_accuracy": 0.6562277674674988,
"num_tokens": 4978973.0,
"step": 19
},
{
"entropy": 0.566437304019928,
"epoch": 1.0,
"grad_norm": 27.442617416381836,
"learning_rate": 6.333333333333332e-07,
"loss": 1.6484,
"mean_token_accuracy": 0.6565737128257751,
"num_tokens": 5241019.0,
"step": 20
},
{
"epoch": 1.0,
"eval_entropy": 0.573745846748352,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.5927419066429138,
"eval_num_tokens": 5241019.0,
"eval_runtime": 0.5652,
"eval_samples_per_second": 442.286,
"eval_steps_per_second": 1.769,
"step": 20
},
{
"entropy": 0.5741308927536011,
"epoch": 1.05,
"grad_norm": 26.800888061523438,
"learning_rate": 6.666666666666666e-07,
"loss": 1.5418,
"mean_token_accuracy": 0.6670629978179932,
"num_tokens": 5503030.0,
"step": 21
},
{
"entropy": 0.5693758726119995,
"epoch": 1.1,
"grad_norm": 24.638330459594727,
"learning_rate": 7e-07,
"loss": 1.4704,
"mean_token_accuracy": 0.6821052432060242,
"num_tokens": 5765114.0,
"step": 22
},
{
"entropy": 0.5774779915809631,
"epoch": 1.15,
"grad_norm": 23.92709732055664,
"learning_rate": 7.333333333333332e-07,
"loss": 1.3992,
"mean_token_accuracy": 0.7002063989639282,
"num_tokens": 6027183.0,
"step": 23
},
{
"entropy": 0.5733228921890259,
"epoch": 1.2,
"grad_norm": 20.69150733947754,
"learning_rate": 7.666666666666667e-07,
"loss": 1.2931,
"mean_token_accuracy": 0.732215166091919,
"num_tokens": 6289274.0,
"step": 24
},
{
"entropy": 0.5707331895828247,
"epoch": 1.25,
"grad_norm": 21.446800231933594,
"learning_rate": 8e-07,
"loss": 1.3543,
"mean_token_accuracy": 0.7079277038574219,
"num_tokens": 6551338.0,
"step": 25
},
{
"entropy": 0.5755348801612854,
"epoch": 1.3,
"grad_norm": 22.206480026245117,
"learning_rate": 8.333333333333333e-07,
"loss": 1.3262,
"mean_token_accuracy": 0.7222517132759094,
"num_tokens": 6813352.0,
"step": 26
},
{
"entropy": 0.5768681168556213,
"epoch": 1.35,
"grad_norm": 21.231828689575195,
"learning_rate": 8.666666666666667e-07,
"loss": 1.2112,
"mean_token_accuracy": 0.7373400330543518,
"num_tokens": 7075412.0,
"step": 27
},
{
"entropy": 0.5692260265350342,
"epoch": 1.4,
"grad_norm": 22.956790924072266,
"learning_rate": 9e-07,
"loss": 1.3309,
"mean_token_accuracy": 0.7283333539962769,
"num_tokens": 7337436.0,
"step": 28
},
{
"entropy": 0.5696606636047363,
"epoch": 1.45,
"grad_norm": 21.15884780883789,
"learning_rate": 9.333333333333333e-07,
"loss": 1.2323,
"mean_token_accuracy": 0.7210144996643066,
"num_tokens": 7599508.0,
"step": 29
},
{
"entropy": 0.5705867409706116,
"epoch": 1.5,
"grad_norm": 21.3349609375,
"learning_rate": 9.666666666666666e-07,
"loss": 1.2339,
"mean_token_accuracy": 0.7246376872062683,
"num_tokens": 7861547.0,
"step": 30
},
{
"entropy": 0.5731691718101501,
"epoch": 1.55,
"grad_norm": 19.02399253845215,
"learning_rate": 1e-06,
"loss": 1.1907,
"mean_token_accuracy": 0.724764883518219,
"num_tokens": 8123603.0,
"step": 31
},
{
"entropy": 0.575802206993103,
"epoch": 1.6,
"grad_norm": 13.962172508239746,
"learning_rate": 1e-06,
"loss": 0.9259,
"mean_token_accuracy": 0.7593783736228943,
"num_tokens": 8385666.0,
"step": 32
},
{
"entropy": 0.5742422342300415,
"epoch": 1.65,
"grad_norm": 13.67746353149414,
"learning_rate": 1e-06,
"loss": 0.8003,
"mean_token_accuracy": 0.7683302760124207,
"num_tokens": 8647717.0,
"step": 33
},
{
"entropy": 0.5761300325393677,
"epoch": 1.7,
"grad_norm": 13.221238136291504,
"learning_rate": 1e-06,
"loss": 0.8127,
"mean_token_accuracy": 0.7722646594047546,
"num_tokens": 8909783.0,
"step": 34
},
{
"entropy": 0.5746512413024902,
"epoch": 1.75,
"grad_norm": 14.354029655456543,
"learning_rate": 1e-06,
"loss": 0.9135,
"mean_token_accuracy": 0.7408514022827148,
"num_tokens": 9171847.0,
"step": 35
},
{
"entropy": 0.581115186214447,
"epoch": 1.8,
"grad_norm": 13.553462982177734,
"learning_rate": 1e-06,
"loss": 0.8726,
"mean_token_accuracy": 0.7709611654281616,
"num_tokens": 9433868.0,
"step": 36
},
{
"entropy": 0.5802749395370483,
"epoch": 1.85,
"grad_norm": 13.30045223236084,
"learning_rate": 1e-06,
"loss": 0.7273,
"mean_token_accuracy": 0.7865416407585144,
"num_tokens": 9695878.0,
"step": 37
},
{
"entropy": 0.5772420763969421,
"epoch": 1.9,
"grad_norm": 12.090519905090332,
"learning_rate": 1e-06,
"loss": 0.7367,
"mean_token_accuracy": 0.8035824298858643,
"num_tokens": 9957925.0,
"step": 38
},
{
"entropy": 0.5726953744888306,
"epoch": 1.95,
"grad_norm": 12.22325325012207,
"learning_rate": 1e-06,
"loss": 0.794,
"mean_token_accuracy": 0.7968627214431763,
"num_tokens": 10219991.0,
"step": 39
},
{
"entropy": 0.5777957439422607,
"epoch": 2.0,
"grad_norm": 11.300572395324707,
"learning_rate": 1e-06,
"loss": 0.6735,
"mean_token_accuracy": 0.8091511130332947,
"num_tokens": 10482014.0,
"step": 40
},
{
"epoch": 2.0,
"eval_entropy": 0.5787050724029541,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7641128897666931,
"eval_num_tokens": 10482014.0,
"eval_runtime": 0.5646,
"eval_samples_per_second": 442.773,
"eval_steps_per_second": 1.771,
"step": 40
},
{
"entropy": 0.5709609985351562,
"epoch": 2.05,
"grad_norm": 9.544706344604492,
"learning_rate": 1e-06,
"loss": 0.5315,
"mean_token_accuracy": 0.8500468730926514,
"num_tokens": 10744103.0,
"step": 41
},
{
"entropy": 0.5764386653900146,
"epoch": 2.1,
"grad_norm": 10.447105407714844,
"learning_rate": 1e-06,
"loss": 0.7079,
"mean_token_accuracy": 0.8055056929588318,
"num_tokens": 11006169.0,
"step": 42
},
{
"entropy": 0.5761690139770508,
"epoch": 2.15,
"grad_norm": 8.850165367126465,
"learning_rate": 1e-06,
"loss": 0.5672,
"mean_token_accuracy": 0.8491296172142029,
"num_tokens": 11268181.0,
"step": 43
},
{
"entropy": 0.5757296085357666,
"epoch": 2.2,
"grad_norm": 7.652801036834717,
"learning_rate": 1e-06,
"loss": 0.5925,
"mean_token_accuracy": 0.8589305877685547,
"num_tokens": 11530235.0,
"step": 44
},
{
"entropy": 0.5732800960540771,
"epoch": 2.25,
"grad_norm": 8.43525505065918,
"learning_rate": 1e-06,
"loss": 0.5096,
"mean_token_accuracy": 0.8493317365646362,
"num_tokens": 11792276.0,
"step": 45
},
{
"entropy": 0.5726200342178345,
"epoch": 2.3,
"grad_norm": 6.224635601043701,
"learning_rate": 1e-06,
"loss": 0.3717,
"mean_token_accuracy": 0.8944099545478821,
"num_tokens": 12054314.0,
"step": 46
},
{
"entropy": 0.5759143829345703,
"epoch": 2.35,
"grad_norm": 7.955408096313477,
"learning_rate": 1e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8561508059501648,
"num_tokens": 12316329.0,
"step": 47
},
{
"entropy": 0.5743073225021362,
"epoch": 2.4,
"grad_norm": 8.218153953552246,
"learning_rate": 1e-06,
"loss": 0.4752,
"mean_token_accuracy": 0.8561111092567444,
"num_tokens": 12578392.0,
"step": 48
},
{
"entropy": 0.5710327625274658,
"epoch": 2.45,
"grad_norm": 6.69417667388916,
"learning_rate": 1e-06,
"loss": 0.4361,
"mean_token_accuracy": 0.8712534308433533,
"num_tokens": 12840453.0,
"step": 49
},
{
"entropy": 0.5683552622795105,
"epoch": 2.5,
"grad_norm": 7.398046016693115,
"learning_rate": 1e-06,
"loss": 0.4676,
"mean_token_accuracy": 0.8655256628990173,
"num_tokens": 13102537.0,
"step": 50
},
{
"entropy": 0.5726050734519958,
"epoch": 2.55,
"grad_norm": 5.699220657348633,
"learning_rate": 1e-06,
"loss": 0.3854,
"mean_token_accuracy": 0.8902208209037781,
"num_tokens": 13364583.0,
"step": 51
},
{
"entropy": 0.5710792541503906,
"epoch": 2.6,
"grad_norm": 5.051173210144043,
"learning_rate": 1e-06,
"loss": 0.3505,
"mean_token_accuracy": 0.8793442845344543,
"num_tokens": 13626635.0,
"step": 52
},
{
"entropy": 0.5732054710388184,
"epoch": 2.65,
"grad_norm": 4.918524265289307,
"learning_rate": 1e-06,
"loss": 0.351,
"mean_token_accuracy": 0.87595534324646,
"num_tokens": 13888665.0,
"step": 53
},
{
"entropy": 0.5696585178375244,
"epoch": 2.7,
"grad_norm": 5.258333206176758,
"learning_rate": 1e-06,
"loss": 0.3094,
"mean_token_accuracy": 0.8884353637695312,
"num_tokens": 14150728.0,
"step": 54
},
{
"entropy": 0.5700670480728149,
"epoch": 2.75,
"grad_norm": 5.786867618560791,
"learning_rate": 1e-06,
"loss": 0.2947,
"mean_token_accuracy": 0.8788819909095764,
"num_tokens": 14412733.0,
"step": 55
},
{
"entropy": 0.5727378129959106,
"epoch": 2.8,
"grad_norm": 4.969060897827148,
"learning_rate": 1e-06,
"loss": 0.4303,
"mean_token_accuracy": 0.8706739544868469,
"num_tokens": 14674775.0,
"step": 56
},
{
"entropy": 0.5705511569976807,
"epoch": 2.85,
"grad_norm": 6.415738105773926,
"learning_rate": 1e-06,
"loss": 0.2955,
"mean_token_accuracy": 0.8876941204071045,
"num_tokens": 14936877.0,
"step": 57
},
{
"entropy": 0.572034478187561,
"epoch": 2.9,
"grad_norm": 5.3498029708862305,
"learning_rate": 1e-06,
"loss": 0.2835,
"mean_token_accuracy": 0.8952603936195374,
"num_tokens": 15198915.0,
"step": 58
},
{
"entropy": 0.5720596313476562,
"epoch": 2.95,
"grad_norm": 3.9302492141723633,
"learning_rate": 1e-06,
"loss": 0.3066,
"mean_token_accuracy": 0.8984684944152832,
"num_tokens": 15460974.0,
"step": 59
},
{
"entropy": 0.5642393231391907,
"epoch": 3.0,
"grad_norm": 3.9795563220977783,
"learning_rate": 1e-06,
"loss": 0.2578,
"mean_token_accuracy": 0.9144676923751831,
"num_tokens": 15723062.0,
"step": 60
},
{
"epoch": 3.0,
"eval_entropy": 0.5693275332450867,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8682795763015747,
"eval_num_tokens": 15723062.0,
"eval_runtime": 0.5625,
"eval_samples_per_second": 444.468,
"eval_steps_per_second": 1.778,
"step": 60
},
{
"entropy": 0.5673961043357849,
"epoch": 3.05,
"grad_norm": 3.3862600326538086,
"learning_rate": 1e-06,
"loss": 0.2377,
"mean_token_accuracy": 0.9254385828971863,
"num_tokens": 15985152.0,
"step": 61
},
{
"entropy": 0.5655107498168945,
"epoch": 3.1,
"grad_norm": 4.405587196350098,
"learning_rate": 1e-06,
"loss": 0.3086,
"mean_token_accuracy": 0.8858006000518799,
"num_tokens": 16247235.0,
"step": 62
},
{
"entropy": 0.5651353597640991,
"epoch": 3.15,
"grad_norm": 3.514052391052246,
"learning_rate": 1e-06,
"loss": 0.2306,
"mean_token_accuracy": 0.9162125587463379,
"num_tokens": 16509263.0,
"step": 63
},
{
"entropy": 0.5678315162658691,
"epoch": 3.2,
"grad_norm": 4.790067672729492,
"learning_rate": 1e-06,
"loss": 0.249,
"mean_token_accuracy": 0.9055072665214539,
"num_tokens": 16771251.0,
"step": 64
},
{
"entropy": 0.5601281523704529,
"epoch": 3.25,
"grad_norm": 4.446920394897461,
"learning_rate": 1e-06,
"loss": 0.2501,
"mean_token_accuracy": 0.9066374897956848,
"num_tokens": 17033314.0,
"step": 65
},
{
"entropy": 0.5643904209136963,
"epoch": 3.3,
"grad_norm": 3.9066216945648193,
"learning_rate": 1e-06,
"loss": 0.2619,
"mean_token_accuracy": 0.9103641510009766,
"num_tokens": 17295329.0,
"step": 66
},
{
"entropy": 0.5618723630905151,
"epoch": 3.35,
"grad_norm": 3.168095588684082,
"learning_rate": 1e-06,
"loss": 0.2269,
"mean_token_accuracy": 0.9209107160568237,
"num_tokens": 17557360.0,
"step": 67
},
{
"entropy": 0.5572832822799683,
"epoch": 3.4,
"grad_norm": 4.440161228179932,
"learning_rate": 1e-06,
"loss": 0.28,
"mean_token_accuracy": 0.8996282815933228,
"num_tokens": 17819430.0,
"step": 68
},
{
"entropy": 0.5604659914970398,
"epoch": 3.45,
"grad_norm": 3.969372510910034,
"learning_rate": 1e-06,
"loss": 0.2705,
"mean_token_accuracy": 0.9063779711723328,
"num_tokens": 18081468.0,
"step": 69
},
{
"entropy": 0.5578862428665161,
"epoch": 3.5,
"grad_norm": 4.655684947967529,
"learning_rate": 1e-06,
"loss": 0.2104,
"mean_token_accuracy": 0.9167120456695557,
"num_tokens": 18343568.0,
"step": 70
},
{
"entropy": 0.5561624765396118,
"epoch": 3.55,
"grad_norm": 4.448247909545898,
"learning_rate": 1e-06,
"loss": 0.2552,
"mean_token_accuracy": 0.8955500721931458,
"num_tokens": 18605614.0,
"step": 71
},
{
"entropy": 0.5536712408065796,
"epoch": 3.6,
"grad_norm": 4.12972354888916,
"learning_rate": 1e-06,
"loss": 0.2933,
"mean_token_accuracy": 0.8912237286567688,
"num_tokens": 18867660.0,
"step": 72
},
{
"entropy": 0.5511398315429688,
"epoch": 3.65,
"grad_norm": 4.112148284912109,
"learning_rate": 1e-06,
"loss": 0.2228,
"mean_token_accuracy": 0.9108073115348816,
"num_tokens": 19129723.0,
"step": 73
},
{
"entropy": 0.5500166416168213,
"epoch": 3.7,
"grad_norm": 4.219006538391113,
"learning_rate": 1e-06,
"loss": 0.2266,
"mean_token_accuracy": 0.9114799499511719,
"num_tokens": 19391795.0,
"step": 74
},
{
"entropy": 0.5509252548217773,
"epoch": 3.75,
"grad_norm": 5.647234916687012,
"learning_rate": 1e-06,
"loss": 0.1964,
"mean_token_accuracy": 0.9147146940231323,
"num_tokens": 19653855.0,
"step": 75
},
{
"entropy": 0.551358699798584,
"epoch": 3.8,
"grad_norm": 3.1081528663635254,
"learning_rate": 1e-06,
"loss": 0.1849,
"mean_token_accuracy": 0.9201655983924866,
"num_tokens": 19915908.0,
"step": 76
},
{
"entropy": 0.5487557053565979,
"epoch": 3.85,
"grad_norm": 4.483115196228027,
"learning_rate": 1e-06,
"loss": 0.2306,
"mean_token_accuracy": 0.9138851761817932,
"num_tokens": 20177933.0,
"step": 77
},
{
"entropy": 0.5503401160240173,
"epoch": 3.9,
"grad_norm": 3.959207534790039,
"learning_rate": 1e-06,
"loss": 0.1672,
"mean_token_accuracy": 0.9275280833244324,
"num_tokens": 20439976.0,
"step": 78
},
{
"entropy": 0.544685959815979,
"epoch": 3.95,
"grad_norm": 3.581266403198242,
"learning_rate": 1e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9270231127738953,
"num_tokens": 20702019.0,
"step": 79
},
{
"entropy": 0.5437827110290527,
"epoch": 4.0,
"grad_norm": 5.461308002471924,
"learning_rate": 1e-06,
"loss": 0.2157,
"mean_token_accuracy": 0.9198629856109619,
"num_tokens": 20964105.0,
"step": 80
},
{
"epoch": 4.0,
"eval_entropy": 0.5438513159751892,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8924731016159058,
"eval_num_tokens": 20964105.0,
"eval_runtime": 0.5684,
"eval_samples_per_second": 439.865,
"eval_steps_per_second": 1.759,
"step": 80
},
{
"entropy": 0.5403033494949341,
"epoch": 4.05,
"grad_norm": 4.5201497077941895,
"learning_rate": 1e-06,
"loss": 0.2073,
"mean_token_accuracy": 0.9200834035873413,
"num_tokens": 21226170.0,
"step": 81
},
{
"entropy": 0.5385745763778687,
"epoch": 4.1,
"grad_norm": 3.7788710594177246,
"learning_rate": 1e-06,
"loss": 0.2063,
"mean_token_accuracy": 0.9138225317001343,
"num_tokens": 21488232.0,
"step": 82
},
{
"entropy": 0.5423810482025146,
"epoch": 4.15,
"grad_norm": 3.069916248321533,
"learning_rate": 1e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9218472242355347,
"num_tokens": 21750283.0,
"step": 83
},
{
"entropy": 0.5381914973258972,
"epoch": 4.2,
"grad_norm": 4.174190521240234,
"learning_rate": 1e-06,
"loss": 0.206,
"mean_token_accuracy": 0.9193011522293091,
"num_tokens": 22012342.0,
"step": 84
},
{
"entropy": 0.5410702228546143,
"epoch": 4.25,
"grad_norm": 4.735873222351074,
"learning_rate": 1e-06,
"loss": 0.2291,
"mean_token_accuracy": 0.9051008224487305,
"num_tokens": 22274390.0,
"step": 85
},
{
"entropy": 0.5410845279693604,
"epoch": 4.3,
"grad_norm": 2.7942323684692383,
"learning_rate": 1e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9240579605102539,
"num_tokens": 22536477.0,
"step": 86
},
{
"entropy": 0.5370749831199646,
"epoch": 4.35,
"grad_norm": 4.275319576263428,
"learning_rate": 1e-06,
"loss": 0.1859,
"mean_token_accuracy": 0.9190635681152344,
"num_tokens": 22798499.0,
"step": 87
},
{
"entropy": 0.5372530221939087,
"epoch": 4.4,
"grad_norm": 3.8254811763763428,
"learning_rate": 1e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.9244146943092346,
"num_tokens": 23060554.0,
"step": 88
},
{
"entropy": 0.5390424728393555,
"epoch": 4.45,
"grad_norm": 3.9321508407592773,
"learning_rate": 1e-06,
"loss": 0.1901,
"mean_token_accuracy": 0.9273531436920166,
"num_tokens": 23322598.0,
"step": 89
},
{
"entropy": 0.5382624864578247,
"epoch": 4.5,
"grad_norm": 3.039321184158325,
"learning_rate": 1e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9295870065689087,
"num_tokens": 23584635.0,
"step": 90
},
{
"entropy": 0.5385315418243408,
"epoch": 4.55,
"grad_norm": 3.339580774307251,
"learning_rate": 1e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9232121706008911,
"num_tokens": 23846703.0,
"step": 91
},
{
"entropy": 0.5352039337158203,
"epoch": 4.6,
"grad_norm": 3.184174060821533,
"learning_rate": 1e-06,
"loss": 0.1675,
"mean_token_accuracy": 0.9252479076385498,
"num_tokens": 24108772.0,
"step": 92
},
{
"entropy": 0.5365791320800781,
"epoch": 4.65,
"grad_norm": 2.1336984634399414,
"learning_rate": 1e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9318456053733826,
"num_tokens": 24370792.0,
"step": 93
},
{
"entropy": 0.535903811454773,
"epoch": 4.7,
"grad_norm": 3.2332236766815186,
"learning_rate": 1e-06,
"loss": 0.1706,
"mean_token_accuracy": 0.9316656589508057,
"num_tokens": 24632826.0,
"step": 94
},
{
"entropy": 0.5376459360122681,
"epoch": 4.75,
"grad_norm": 4.174566268920898,
"learning_rate": 1e-06,
"loss": 0.1938,
"mean_token_accuracy": 0.9220023155212402,
"num_tokens": 24894906.0,
"step": 95
},
{
"entropy": 0.5379908084869385,
"epoch": 4.8,
"grad_norm": 2.7120425701141357,
"learning_rate": 1e-06,
"loss": 0.1818,
"mean_token_accuracy": 0.9200000166893005,
"num_tokens": 25156928.0,
"step": 96
},
{
"entropy": 0.5380344986915588,
"epoch": 4.85,
"grad_norm": 3.080734968185425,
"learning_rate": 1e-06,
"loss": 0.1615,
"mean_token_accuracy": 0.934725821018219,
"num_tokens": 25419007.0,
"step": 97
},
{
"entropy": 0.5391592979431152,
"epoch": 4.9,
"grad_norm": 2.436408519744873,
"learning_rate": 1e-06,
"loss": 0.1602,
"mean_token_accuracy": 0.9324124455451965,
"num_tokens": 25681033.0,
"step": 98
},
{
"entropy": 0.5356138348579407,
"epoch": 4.95,
"grad_norm": 3.9103612899780273,
"learning_rate": 1e-06,
"loss": 0.1553,
"mean_token_accuracy": 0.9342178106307983,
"num_tokens": 25943105.0,
"step": 99
},
{
"entropy": 0.5300828218460083,
"epoch": 5.0,
"grad_norm": 3.9791324138641357,
"learning_rate": 1e-06,
"loss": 0.1867,
"mean_token_accuracy": 0.9147771596908569,
"num_tokens": 26205142.0,
"step": 100
},
{
"epoch": 5.0,
"eval_entropy": 0.532131552696228,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8958333134651184,
"eval_num_tokens": 26205142.0,
"eval_runtime": 0.5658,
"eval_samples_per_second": 441.867,
"eval_steps_per_second": 1.767,
"step": 100
},
{
"entropy": 0.5331390500068665,
"epoch": 5.05,
"grad_norm": 3.375486373901367,
"learning_rate": 1e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9255132079124451,
"num_tokens": 26467176.0,
"step": 101
},
{
"entropy": 0.5314509868621826,
"epoch": 5.1,
"grad_norm": 3.910857915878296,
"learning_rate": 1e-06,
"loss": 0.1619,
"mean_token_accuracy": 0.9261786341667175,
"num_tokens": 26729216.0,
"step": 102
},
{
"entropy": 0.5281578302383423,
"epoch": 5.15,
"grad_norm": 2.9953229427337646,
"learning_rate": 1e-06,
"loss": 0.169,
"mean_token_accuracy": 0.935003399848938,
"num_tokens": 26991253.0,
"step": 103
},
{
"entropy": 0.5287604331970215,
"epoch": 5.2,
"grad_norm": 2.6056575775146484,
"learning_rate": 1e-06,
"loss": 0.1483,
"mean_token_accuracy": 0.9292088150978088,
"num_tokens": 27253296.0,
"step": 104
},
{
"entropy": 0.5280485153198242,
"epoch": 5.25,
"grad_norm": 3.1244957447052,
"learning_rate": 1e-06,
"loss": 0.1576,
"mean_token_accuracy": 0.9250302314758301,
"num_tokens": 27515312.0,
"step": 105
},
{
"entropy": 0.5249028205871582,
"epoch": 5.3,
"grad_norm": 3.752169370651245,
"learning_rate": 1e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9309778213500977,
"num_tokens": 27777353.0,
"step": 106
},
{
"entropy": 0.5274486541748047,
"epoch": 5.35,
"grad_norm": 2.915797233581543,
"learning_rate": 1e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9288889169692993,
"num_tokens": 28039383.0,
"step": 107
},
{
"entropy": 0.5247786641120911,
"epoch": 5.4,
"grad_norm": 3.0220959186553955,
"learning_rate": 1e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.9305993914604187,
"num_tokens": 28301429.0,
"step": 108
},
{
"entropy": 0.5274587273597717,
"epoch": 5.45,
"grad_norm": 3.330185651779175,
"learning_rate": 1e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9378365278244019,
"num_tokens": 28563471.0,
"step": 109
},
{
"entropy": 0.52198326587677,
"epoch": 5.5,
"grad_norm": 3.4707701206207275,
"learning_rate": 1e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.933920681476593,
"num_tokens": 28825554.0,
"step": 110
},
{
"entropy": 0.5196748971939087,
"epoch": 5.55,
"grad_norm": 3.8048267364501953,
"learning_rate": 1e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9194383025169373,
"num_tokens": 29087632.0,
"step": 111
},
{
"entropy": 0.5231800079345703,
"epoch": 5.6,
"grad_norm": 2.951167106628418,
"learning_rate": 1e-06,
"loss": 0.1463,
"mean_token_accuracy": 0.9328449368476868,
"num_tokens": 29349665.0,
"step": 112
},
{
"entropy": 0.5201854705810547,
"epoch": 5.65,
"grad_norm": 3.4071881771087646,
"learning_rate": 1e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.9292557239532471,
"num_tokens": 29611714.0,
"step": 113
},
{
"entropy": 0.5249330997467041,
"epoch": 5.7,
"grad_norm": 2.2466695308685303,
"learning_rate": 1e-06,
"loss": 0.158,
"mean_token_accuracy": 0.93149334192276,
"num_tokens": 29873775.0,
"step": 114
},
{
"entropy": 0.5218572616577148,
"epoch": 5.75,
"grad_norm": 2.9838244915008545,
"learning_rate": 1e-06,
"loss": 0.1371,
"mean_token_accuracy": 0.9416499137878418,
"num_tokens": 30135859.0,
"step": 115
},
{
"entropy": 0.5222463607788086,
"epoch": 5.8,
"grad_norm": 3.629559278488159,
"learning_rate": 1e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.9253350496292114,
"num_tokens": 30397920.0,
"step": 116
},
{
"entropy": 0.5209097862243652,
"epoch": 5.85,
"grad_norm": 5.2597174644470215,
"learning_rate": 1e-06,
"loss": 0.1736,
"mean_token_accuracy": 0.9180887341499329,
"num_tokens": 30659978.0,
"step": 117
},
{
"entropy": 0.5217398405075073,
"epoch": 5.9,
"grad_norm": 2.7834465503692627,
"learning_rate": 1e-06,
"loss": 0.1539,
"mean_token_accuracy": 0.9320943355560303,
"num_tokens": 30922069.0,
"step": 118
},
{
"entropy": 0.5216600894927979,
"epoch": 5.95,
"grad_norm": 3.517230749130249,
"learning_rate": 1e-06,
"loss": 0.1599,
"mean_token_accuracy": 0.9316734075546265,
"num_tokens": 31184125.0,
"step": 119
},
{
"entropy": 0.5233047604560852,
"epoch": 6.0,
"grad_norm": 4.519037246704102,
"learning_rate": 1e-06,
"loss": 0.1686,
"mean_token_accuracy": 0.9267241358757019,
"num_tokens": 31446144.0,
"step": 120
},
{
"epoch": 6.0,
"eval_entropy": 0.5248987674713135,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8958333134651184,
"eval_num_tokens": 31446144.0,
"eval_runtime": 0.5646,
"eval_samples_per_second": 442.826,
"eval_steps_per_second": 1.771,
"step": 120
},
{
"entropy": 0.5230313539505005,
"epoch": 6.05,
"grad_norm": 3.3974978923797607,
"learning_rate": 1e-06,
"loss": 0.1382,
"mean_token_accuracy": 0.9349269866943359,
"num_tokens": 31708177.0,
"step": 121
},
{
"entropy": 0.5233445167541504,
"epoch": 6.1,
"grad_norm": 3.316178321838379,
"learning_rate": 1e-06,
"loss": 0.1417,
"mean_token_accuracy": 0.9336429834365845,
"num_tokens": 31970211.0,
"step": 122
},
{
"entropy": 0.519654393196106,
"epoch": 6.15,
"grad_norm": 4.040668487548828,
"learning_rate": 1e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9241849780082703,
"num_tokens": 32232255.0,
"step": 123
},
{
"entropy": 0.5215832591056824,
"epoch": 6.2,
"grad_norm": 4.117729663848877,
"learning_rate": 1e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9260615110397339,
"num_tokens": 32494346.0,
"step": 124
},
{
"entropy": 0.5216290950775146,
"epoch": 6.25,
"grad_norm": 3.0051236152648926,
"learning_rate": 1e-06,
"loss": 0.1519,
"mean_token_accuracy": 0.9318435788154602,
"num_tokens": 32756399.0,
"step": 125
},
{
"entropy": 0.5209139585494995,
"epoch": 6.3,
"grad_norm": 2.9851608276367188,
"learning_rate": 1e-06,
"loss": 0.133,
"mean_token_accuracy": 0.9429529905319214,
"num_tokens": 33018449.0,
"step": 126
},
{
"entropy": 0.5213064551353455,
"epoch": 6.35,
"grad_norm": 3.356963872909546,
"learning_rate": 1e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.9313392043113708,
"num_tokens": 33280513.0,
"step": 127
},
{
"entropy": 0.5207207202911377,
"epoch": 6.4,
"grad_norm": 3.217970132827759,
"learning_rate": 1e-06,
"loss": 0.1518,
"mean_token_accuracy": 0.9335684776306152,
"num_tokens": 33542576.0,
"step": 128
},
{
"entropy": 0.5207620859146118,
"epoch": 6.45,
"grad_norm": 4.5813703536987305,
"learning_rate": 1e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9296131730079651,
"num_tokens": 33804614.0,
"step": 129
},
{
"entropy": 0.5200029611587524,
"epoch": 6.5,
"grad_norm": 2.659916400909424,
"learning_rate": 1e-06,
"loss": 0.1566,
"mean_token_accuracy": 0.9301252365112305,
"num_tokens": 34066691.0,
"step": 130
},
{
"entropy": 0.5188630819320679,
"epoch": 6.55,
"grad_norm": 3.103395700454712,
"learning_rate": 1e-06,
"loss": 0.1655,
"mean_token_accuracy": 0.9257642030715942,
"num_tokens": 34328757.0,
"step": 131
},
{
"entropy": 0.5178630948066711,
"epoch": 6.6,
"grad_norm": 3.037834644317627,
"learning_rate": 1e-06,
"loss": 0.1467,
"mean_token_accuracy": 0.9354194402694702,
"num_tokens": 34590819.0,
"step": 132
},
{
"entropy": 0.5198030471801758,
"epoch": 6.65,
"grad_norm": 2.739222526550293,
"learning_rate": 1e-06,
"loss": 0.1211,
"mean_token_accuracy": 0.9462962746620178,
"num_tokens": 34852867.0,
"step": 133
},
{
"entropy": 0.5187867879867554,
"epoch": 6.7,
"grad_norm": 3.5631425380706787,
"learning_rate": 1e-06,
"loss": 0.1459,
"mean_token_accuracy": 0.9339437484741211,
"num_tokens": 35114923.0,
"step": 134
},
{
"entropy": 0.5223994851112366,
"epoch": 6.75,
"grad_norm": 3.349653482437134,
"learning_rate": 1e-06,
"loss": 0.15,
"mean_token_accuracy": 0.9327133297920227,
"num_tokens": 35376948.0,
"step": 135
},
{
"entropy": 0.5207353830337524,
"epoch": 6.8,
"grad_norm": 3.7862677574157715,
"learning_rate": 1e-06,
"loss": 0.1588,
"mean_token_accuracy": 0.9323570728302002,
"num_tokens": 35639008.0,
"step": 136
},
{
"entropy": 0.522723376750946,
"epoch": 6.85,
"grad_norm": 3.227595090866089,
"learning_rate": 1e-06,
"loss": 0.1556,
"mean_token_accuracy": 0.9338235259056091,
"num_tokens": 35901080.0,
"step": 137
},
{
"entropy": 0.5194015502929688,
"epoch": 6.9,
"grad_norm": 3.0805652141571045,
"learning_rate": 1e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.9439759254455566,
"num_tokens": 36163102.0,
"step": 138
},
{
"entropy": 0.5224202871322632,
"epoch": 6.95,
"grad_norm": 2.6702420711517334,
"learning_rate": 1e-06,
"loss": 0.1331,
"mean_token_accuracy": 0.942307710647583,
"num_tokens": 36425129.0,
"step": 139
},
{
"entropy": 0.5183343887329102,
"epoch": 7.0,
"grad_norm": 3.2100484371185303,
"learning_rate": 1e-06,
"loss": 0.1427,
"mean_token_accuracy": 0.9357267022132874,
"num_tokens": 36687152.0,
"step": 140
},
{
"epoch": 7.0,
"eval_entropy": 0.5203882455825806,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9045698642730713,
"eval_num_tokens": 36687152.0,
"eval_runtime": 0.5662,
"eval_samples_per_second": 441.545,
"eval_steps_per_second": 1.766,
"step": 140
},
{
"entropy": 0.5172215104103088,
"epoch": 7.05,
"grad_norm": 2.8229575157165527,
"learning_rate": 1e-06,
"loss": 0.1359,
"mean_token_accuracy": 0.9341161251068115,
"num_tokens": 36949179.0,
"step": 141
},
{
"entropy": 0.5188791751861572,
"epoch": 7.1,
"grad_norm": 3.300265312194824,
"learning_rate": 1e-06,
"loss": 0.1479,
"mean_token_accuracy": 0.9282218813896179,
"num_tokens": 37211215.0,
"step": 142
},
{
"entropy": 0.5181975364685059,
"epoch": 7.15,
"grad_norm": 3.0605766773223877,
"learning_rate": 1e-06,
"loss": 0.1311,
"mean_token_accuracy": 0.9403209686279297,
"num_tokens": 37473208.0,
"step": 143
},
{
"entropy": 0.5172451734542847,
"epoch": 7.2,
"grad_norm": 3.0566470623016357,
"learning_rate": 1e-06,
"loss": 0.1371,
"mean_token_accuracy": 0.9368836283683777,
"num_tokens": 37735223.0,
"step": 144
},
{
"entropy": 0.5153446197509766,
"epoch": 7.25,
"grad_norm": 3.771998167037964,
"learning_rate": 1e-06,
"loss": 0.1531,
"mean_token_accuracy": 0.9337349534034729,
"num_tokens": 37997309.0,
"step": 145
},
{
"entropy": 0.5160014033317566,
"epoch": 7.3,
"grad_norm": 3.9155826568603516,
"learning_rate": 1e-06,
"loss": 0.1457,
"mean_token_accuracy": 0.9334239363670349,
"num_tokens": 38259341.0,
"step": 146
},
{
"entropy": 0.5150690078735352,
"epoch": 7.35,
"grad_norm": 3.842313766479492,
"learning_rate": 1e-06,
"loss": 0.1434,
"mean_token_accuracy": 0.9330238699913025,
"num_tokens": 38521409.0,
"step": 147
},
{
"entropy": 0.5158810615539551,
"epoch": 7.4,
"grad_norm": 3.2817740440368652,
"learning_rate": 1e-06,
"loss": 0.1542,
"mean_token_accuracy": 0.9296690225601196,
"num_tokens": 38783430.0,
"step": 148
},
{
"entropy": 0.5140302777290344,
"epoch": 7.45,
"grad_norm": 3.40156626701355,
"learning_rate": 1e-06,
"loss": 0.1385,
"mean_token_accuracy": 0.9309021234512329,
"num_tokens": 39045487.0,
"step": 149
},
{
"entropy": 0.5199052691459656,
"epoch": 7.5,
"grad_norm": 3.458606004714966,
"learning_rate": 1e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9361202716827393,
"num_tokens": 39307549.0,
"step": 150
},
{
"entropy": 0.5160842537879944,
"epoch": 7.55,
"grad_norm": 2.932157278060913,
"learning_rate": 1e-06,
"loss": 0.1373,
"mean_token_accuracy": 0.9383945465087891,
"num_tokens": 39569617.0,
"step": 151
},
{
"entropy": 0.5171875953674316,
"epoch": 7.6,
"grad_norm": 3.465000867843628,
"learning_rate": 1e-06,
"loss": 0.174,
"mean_token_accuracy": 0.9331210255622864,
"num_tokens": 39831648.0,
"step": 152
},
{
"entropy": 0.5146853923797607,
"epoch": 7.65,
"grad_norm": 5.0309343338012695,
"learning_rate": 1e-06,
"loss": 0.1515,
"mean_token_accuracy": 0.9341809749603271,
"num_tokens": 40093743.0,
"step": 153
},
{
"entropy": 0.5160114765167236,
"epoch": 7.7,
"grad_norm": 4.118295192718506,
"learning_rate": 1e-06,
"loss": 0.1289,
"mean_token_accuracy": 0.9425212144851685,
"num_tokens": 40355801.0,
"step": 154
},
{
"entropy": 0.5127236843109131,
"epoch": 7.75,
"grad_norm": 3.2528462409973145,
"learning_rate": 1e-06,
"loss": 0.1355,
"mean_token_accuracy": 0.94050532579422,
"num_tokens": 40617885.0,
"step": 155
},
{
"entropy": 0.5161481499671936,
"epoch": 7.8,
"grad_norm": 3.1190099716186523,
"learning_rate": 1e-06,
"loss": 0.16,
"mean_token_accuracy": 0.9346323013305664,
"num_tokens": 40879937.0,
"step": 156
},
{
"entropy": 0.5165537595748901,
"epoch": 7.85,
"grad_norm": 2.945587635040283,
"learning_rate": 1e-06,
"loss": 0.1374,
"mean_token_accuracy": 0.9363411664962769,
"num_tokens": 41141983.0,
"step": 157
},
{
"entropy": 0.5178842544555664,
"epoch": 7.9,
"grad_norm": 2.951826572418213,
"learning_rate": 1e-06,
"loss": 0.1364,
"mean_token_accuracy": 0.9384886026382446,
"num_tokens": 41404052.0,
"step": 158
},
{
"entropy": 0.5159619450569153,
"epoch": 7.95,
"grad_norm": 4.019174575805664,
"learning_rate": 1e-06,
"loss": 0.132,
"mean_token_accuracy": 0.9305019378662109,
"num_tokens": 41666100.0,
"step": 159
},
{
"entropy": 0.5143953561782837,
"epoch": 8.0,
"grad_norm": 4.0759196281433105,
"learning_rate": 1e-06,
"loss": 0.143,
"mean_token_accuracy": 0.9287616610527039,
"num_tokens": 41928162.0,
"step": 160
},
{
"epoch": 8.0,
"eval_entropy": 0.5199548602104187,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8958333134651184,
"eval_num_tokens": 41928162.0,
"eval_runtime": 0.5663,
"eval_samples_per_second": 441.44,
"eval_steps_per_second": 1.766,
"step": 160
},
{
"entropy": 0.5157938003540039,
"epoch": 8.05,
"grad_norm": 3.2986180782318115,
"learning_rate": 1e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.9472459554672241,
"num_tokens": 42190242.0,
"step": 161
},
{
"entropy": 0.5174839496612549,
"epoch": 8.1,
"grad_norm": 2.7032060623168945,
"learning_rate": 1e-06,
"loss": 0.1349,
"mean_token_accuracy": 0.9354838728904724,
"num_tokens": 42452270.0,
"step": 162
},
{
"entropy": 0.5150723457336426,
"epoch": 8.15,
"grad_norm": 3.3034956455230713,
"learning_rate": 1e-06,
"loss": 0.1562,
"mean_token_accuracy": 0.9342105388641357,
"num_tokens": 42714340.0,
"step": 163
},
{
"entropy": 0.5165647268295288,
"epoch": 8.2,
"grad_norm": 3.147430181503296,
"learning_rate": 1e-06,
"loss": 0.1267,
"mean_token_accuracy": 0.9487179517745972,
"num_tokens": 42976388.0,
"step": 164
},
{
"entropy": 0.5171810388565063,
"epoch": 8.25,
"grad_norm": 2.788745164871216,
"learning_rate": 1e-06,
"loss": 0.1383,
"mean_token_accuracy": 0.937644362449646,
"num_tokens": 43238449.0,
"step": 165
},
{
"entropy": 0.5187046527862549,
"epoch": 8.3,
"grad_norm": 3.536580801010132,
"learning_rate": 1e-06,
"loss": 0.1401,
"mean_token_accuracy": 0.9310526251792908,
"num_tokens": 43500447.0,
"step": 166
},
{
"entropy": 0.5151098370552063,
"epoch": 8.35,
"grad_norm": 3.484966516494751,
"learning_rate": 1e-06,
"loss": 0.1584,
"mean_token_accuracy": 0.9285714030265808,
"num_tokens": 43762496.0,
"step": 167
},
{
"entropy": 0.5161045789718628,
"epoch": 8.4,
"grad_norm": 2.554356813430786,
"learning_rate": 1e-06,
"loss": 0.1444,
"mean_token_accuracy": 0.9319999814033508,
"num_tokens": 44024542.0,
"step": 168
},
{
"entropy": 0.5149120688438416,
"epoch": 8.45,
"grad_norm": 4.06463623046875,
"learning_rate": 1e-06,
"loss": 0.1288,
"mean_token_accuracy": 0.9389110207557678,
"num_tokens": 44286608.0,
"step": 169
},
{
"entropy": 0.5138819813728333,
"epoch": 8.5,
"grad_norm": 4.850083827972412,
"learning_rate": 1e-06,
"loss": 0.1328,
"mean_token_accuracy": 0.9411404132843018,
"num_tokens": 44548667.0,
"step": 170
},
{
"entropy": 0.513306736946106,
"epoch": 8.55,
"grad_norm": 2.4267070293426514,
"learning_rate": 1e-06,
"loss": 0.1296,
"mean_token_accuracy": 0.9418439865112305,
"num_tokens": 44810703.0,
"step": 171
},
{
"entropy": 0.5128031969070435,
"epoch": 8.6,
"grad_norm": 3.5913071632385254,
"learning_rate": 1e-06,
"loss": 0.1225,
"mean_token_accuracy": 0.9422430992126465,
"num_tokens": 45072719.0,
"step": 172
},
{
"entropy": 0.5119505524635315,
"epoch": 8.65,
"grad_norm": 3.707689046859741,
"learning_rate": 1e-06,
"loss": 0.1477,
"mean_token_accuracy": 0.9355555772781372,
"num_tokens": 45334794.0,
"step": 173
},
{
"entropy": 0.5131097435951233,
"epoch": 8.7,
"grad_norm": 4.792629241943359,
"learning_rate": 1e-06,
"loss": 0.1563,
"mean_token_accuracy": 0.9289617538452148,
"num_tokens": 45596869.0,
"step": 174
},
{
"entropy": 0.51198810338974,
"epoch": 8.75,
"grad_norm": 2.6373438835144043,
"learning_rate": 1e-06,
"loss": 0.1328,
"mean_token_accuracy": 0.9399141669273376,
"num_tokens": 45858928.0,
"step": 175
},
{
"entropy": 0.5113101005554199,
"epoch": 8.8,
"grad_norm": 2.828310966491699,
"learning_rate": 1e-06,
"loss": 0.1292,
"mean_token_accuracy": 0.944847583770752,
"num_tokens": 46120998.0,
"step": 176
},
{
"entropy": 0.514806866645813,
"epoch": 8.85,
"grad_norm": 3.7976365089416504,
"learning_rate": 1e-06,
"loss": 0.1378,
"mean_token_accuracy": 0.9391401410102844,
"num_tokens": 46383019.0,
"step": 177
},
{
"entropy": 0.5154971480369568,
"epoch": 8.9,
"grad_norm": 3.059340000152588,
"learning_rate": 1e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.9379541873931885,
"num_tokens": 46645071.0,
"step": 178
},
{
"entropy": 0.5132753849029541,
"epoch": 8.95,
"grad_norm": 2.7030842304229736,
"learning_rate": 1e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9378563165664673,
"num_tokens": 46907121.0,
"step": 179
},
{
"entropy": 0.5132700204849243,
"epoch": 9.0,
"grad_norm": 3.4913828372955322,
"learning_rate": 1e-06,
"loss": 0.1243,
"mean_token_accuracy": 0.9455108642578125,
"num_tokens": 47169197.0,
"step": 180
},
{
"epoch": 9.0,
"eval_entropy": 0.5133532285690308,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9072580933570862,
"eval_num_tokens": 47169197.0,
"eval_runtime": 0.5634,
"eval_samples_per_second": 443.762,
"eval_steps_per_second": 1.775,
"step": 180
},
{
"entropy": 0.5132405757904053,
"epoch": 9.05,
"grad_norm": 2.907648801803589,
"learning_rate": 1e-06,
"loss": 0.1295,
"mean_token_accuracy": 0.9371069073677063,
"num_tokens": 47431209.0,
"step": 181
},
{
"entropy": 0.5094999670982361,
"epoch": 9.1,
"grad_norm": 3.242464303970337,
"learning_rate": 1e-06,
"loss": 0.1316,
"mean_token_accuracy": 0.9456915259361267,
"num_tokens": 47693282.0,
"step": 182
},
{
"entropy": 0.5093837380409241,
"epoch": 9.15,
"grad_norm": 3.4022397994995117,
"learning_rate": 1e-06,
"loss": 0.119,
"mean_token_accuracy": 0.9439567923545837,
"num_tokens": 47955290.0,
"step": 183
},
{
"entropy": 0.5123763680458069,
"epoch": 9.2,
"grad_norm": 3.2485334873199463,
"learning_rate": 1e-06,
"loss": 0.1219,
"mean_token_accuracy": 0.9474899172782898,
"num_tokens": 48217352.0,
"step": 184
},
{
"entropy": 0.5091462135314941,
"epoch": 9.25,
"grad_norm": 2.531839370727539,
"learning_rate": 1e-06,
"loss": 0.1277,
"mean_token_accuracy": 0.9405654668807983,
"num_tokens": 48479414.0,
"step": 185
},
{
"entropy": 0.5079025030136108,
"epoch": 9.3,
"grad_norm": 4.208319187164307,
"learning_rate": 1e-06,
"loss": 0.1655,
"mean_token_accuracy": 0.9309878349304199,
"num_tokens": 48741485.0,
"step": 186
},
{
"entropy": 0.5062220096588135,
"epoch": 9.35,
"grad_norm": 4.336572647094727,
"learning_rate": 1e-06,
"loss": 0.1605,
"mean_token_accuracy": 0.9226190447807312,
"num_tokens": 49003521.0,
"step": 187
},
{
"entropy": 0.5075182914733887,
"epoch": 9.4,
"grad_norm": 3.8903305530548096,
"learning_rate": 1e-06,
"loss": 0.1279,
"mean_token_accuracy": 0.9393737316131592,
"num_tokens": 49265549.0,
"step": 188
},
{
"entropy": 0.5041744709014893,
"epoch": 9.45,
"grad_norm": 4.592701435089111,
"learning_rate": 1e-06,
"loss": 0.146,
"mean_token_accuracy": 0.9337060451507568,
"num_tokens": 49527625.0,
"step": 189
},
{
"entropy": 0.5066587924957275,
"epoch": 9.5,
"grad_norm": 4.691225528717041,
"learning_rate": 1e-06,
"loss": 0.1354,
"mean_token_accuracy": 0.9368420839309692,
"num_tokens": 49789713.0,
"step": 190
},
{
"entropy": 0.5104098320007324,
"epoch": 9.55,
"grad_norm": 2.6505699157714844,
"learning_rate": 1e-06,
"loss": 0.1374,
"mean_token_accuracy": 0.942954957485199,
"num_tokens": 50051762.0,
"step": 191
},
{
"entropy": 0.5087345242500305,
"epoch": 9.6,
"grad_norm": 3.0128960609436035,
"learning_rate": 1e-06,
"loss": 0.1244,
"mean_token_accuracy": 0.9477911591529846,
"num_tokens": 50313783.0,
"step": 192
},
{
"entropy": 0.5104490518569946,
"epoch": 9.65,
"grad_norm": 2.859647035598755,
"learning_rate": 1e-06,
"loss": 0.1296,
"mean_token_accuracy": 0.9420821070671082,
"num_tokens": 50575839.0,
"step": 193
},
{
"entropy": 0.5095815062522888,
"epoch": 9.7,
"grad_norm": 3.4269556999206543,
"learning_rate": 1e-06,
"loss": 0.1245,
"mean_token_accuracy": 0.9457672238349915,
"num_tokens": 50837911.0,
"step": 194
},
{
"entropy": 0.5142146348953247,
"epoch": 9.75,
"grad_norm": 2.8217012882232666,
"learning_rate": 1e-06,
"loss": 0.1231,
"mean_token_accuracy": 0.9454896450042725,
"num_tokens": 51099983.0,
"step": 195
},
{
"entropy": 0.5132467150688171,
"epoch": 9.8,
"grad_norm": 3.072129964828491,
"learning_rate": 1e-06,
"loss": 0.1275,
"mean_token_accuracy": 0.9424341917037964,
"num_tokens": 51362037.0,
"step": 196
},
{
"entropy": 0.5132461786270142,
"epoch": 9.85,
"grad_norm": 4.272913932800293,
"learning_rate": 1e-06,
"loss": 0.1399,
"mean_token_accuracy": 0.9311075806617737,
"num_tokens": 51624088.0,
"step": 197
},
{
"entropy": 0.5129303932189941,
"epoch": 9.9,
"grad_norm": 4.9169230461120605,
"learning_rate": 1e-06,
"loss": 0.1576,
"mean_token_accuracy": 0.9312573671340942,
"num_tokens": 51886119.0,
"step": 198
},
{
"entropy": 0.5136593580245972,
"epoch": 9.95,
"grad_norm": 2.8221092224121094,
"learning_rate": 1e-06,
"loss": 0.1408,
"mean_token_accuracy": 0.9360189437866211,
"num_tokens": 52148169.0,
"step": 199
},
{
"entropy": 0.513308048248291,
"epoch": 10.0,
"grad_norm": 2.4588990211486816,
"learning_rate": 1e-06,
"loss": 0.1364,
"mean_token_accuracy": 0.9390919208526611,
"num_tokens": 52410205.0,
"step": 200
},
{
"epoch": 10.0,
"eval_entropy": 0.5121233463287354,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8965053558349609,
"eval_num_tokens": 52410205.0,
"eval_runtime": 0.5632,
"eval_samples_per_second": 443.897,
"eval_steps_per_second": 1.776,
"step": 200
},
{
"entropy": 0.5156605243682861,
"epoch": 10.05,
"grad_norm": 3.8599801063537598,
"learning_rate": 1e-06,
"loss": 0.1296,
"mean_token_accuracy": 0.9414870142936707,
"num_tokens": 52672213.0,
"step": 201
},
{
"entropy": 0.5084142684936523,
"epoch": 10.1,
"grad_norm": 3.849475860595703,
"learning_rate": 1e-06,
"loss": 0.1361,
"mean_token_accuracy": 0.9329004287719727,
"num_tokens": 52934291.0,
"step": 202
},
{
"entropy": 0.5113674998283386,
"epoch": 10.15,
"grad_norm": 3.281127691268921,
"learning_rate": 1e-06,
"loss": 0.1253,
"mean_token_accuracy": 0.9463624954223633,
"num_tokens": 53196341.0,
"step": 203
},
{
"entropy": 0.5095717906951904,
"epoch": 10.2,
"grad_norm": 3.2623631954193115,
"learning_rate": 1e-06,
"loss": 0.1342,
"mean_token_accuracy": 0.9394292235374451,
"num_tokens": 53458420.0,
"step": 204
},
{
"entropy": 0.5095763206481934,
"epoch": 10.25,
"grad_norm": 3.0780463218688965,
"learning_rate": 1e-06,
"loss": 0.1136,
"mean_token_accuracy": 0.9502018690109253,
"num_tokens": 53720433.0,
"step": 205
},
{
"entropy": 0.5088077783584595,
"epoch": 10.3,
"grad_norm": 3.142488479614258,
"learning_rate": 1e-06,
"loss": 0.1411,
"mean_token_accuracy": 0.9367321729660034,
"num_tokens": 53982489.0,
"step": 206
},
{
"entropy": 0.5087566375732422,
"epoch": 10.35,
"grad_norm": 3.4320948123931885,
"learning_rate": 1e-06,
"loss": 0.1141,
"mean_token_accuracy": 0.9442488551139832,
"num_tokens": 54244555.0,
"step": 207
},
{
"entropy": 0.5068839192390442,
"epoch": 10.4,
"grad_norm": 4.642038345336914,
"learning_rate": 1e-06,
"loss": 0.168,
"mean_token_accuracy": 0.9365397691726685,
"num_tokens": 54506612.0,
"step": 208
},
{
"entropy": 0.5061399936676025,
"epoch": 10.45,
"grad_norm": 4.175653457641602,
"learning_rate": 1e-06,
"loss": 0.1312,
"mean_token_accuracy": 0.9389256834983826,
"num_tokens": 54768696.0,
"step": 209
},
{
"entropy": 0.5083756446838379,
"epoch": 10.5,
"grad_norm": 3.5277068614959717,
"learning_rate": 1e-06,
"loss": 0.1301,
"mean_token_accuracy": 0.9352391958236694,
"num_tokens": 55030739.0,
"step": 210
},
{
"entropy": 0.505259096622467,
"epoch": 10.55,
"grad_norm": 4.416886806488037,
"learning_rate": 1e-06,
"loss": 0.1353,
"mean_token_accuracy": 0.9358885288238525,
"num_tokens": 55292767.0,
"step": 211
},
{
"entropy": 0.5073595643043518,
"epoch": 10.6,
"grad_norm": 2.7743871212005615,
"learning_rate": 1e-06,
"loss": 0.1397,
"mean_token_accuracy": 0.9359895586967468,
"num_tokens": 55554825.0,
"step": 212
},
{
"entropy": 0.5083353519439697,
"epoch": 10.65,
"grad_norm": 2.932196617126465,
"learning_rate": 1e-06,
"loss": 0.122,
"mean_token_accuracy": 0.9455605745315552,
"num_tokens": 55816862.0,
"step": 213
},
{
"entropy": 0.5083277821540833,
"epoch": 10.7,
"grad_norm": 3.528801441192627,
"learning_rate": 1e-06,
"loss": 0.13,
"mean_token_accuracy": 0.9445459842681885,
"num_tokens": 56078931.0,
"step": 214
},
{
"entropy": 0.5057054162025452,
"epoch": 10.75,
"grad_norm": 4.0908589363098145,
"learning_rate": 1e-06,
"loss": 0.1257,
"mean_token_accuracy": 0.9417199492454529,
"num_tokens": 56340964.0,
"step": 215
},
{
"entropy": 0.5087761878967285,
"epoch": 10.8,
"grad_norm": 3.4696297645568848,
"learning_rate": 1e-06,
"loss": 0.1151,
"mean_token_accuracy": 0.9496581554412842,
"num_tokens": 56603001.0,
"step": 216
},
{
"entropy": 0.5087063908576965,
"epoch": 10.85,
"grad_norm": 3.221892833709717,
"learning_rate": 1e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9435195922851562,
"num_tokens": 56865045.0,
"step": 217
},
{
"entropy": 0.5074091553688049,
"epoch": 10.9,
"grad_norm": 4.037084102630615,
"learning_rate": 1e-06,
"loss": 0.1468,
"mean_token_accuracy": 0.9345403909683228,
"num_tokens": 57127107.0,
"step": 218
},
{
"entropy": 0.510859489440918,
"epoch": 10.95,
"grad_norm": 3.951176166534424,
"learning_rate": 1e-06,
"loss": 0.1251,
"mean_token_accuracy": 0.9449082016944885,
"num_tokens": 57389167.0,
"step": 219
},
{
"entropy": 0.5087305903434753,
"epoch": 11.0,
"grad_norm": 3.742441177368164,
"learning_rate": 1e-06,
"loss": 0.127,
"mean_token_accuracy": 0.945555567741394,
"num_tokens": 57651197.0,
"step": 220
},
{
"epoch": 11.0,
"eval_entropy": 0.5088062286376953,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9099462628364563,
"eval_num_tokens": 57651197.0,
"eval_runtime": 0.5668,
"eval_samples_per_second": 441.055,
"eval_steps_per_second": 1.764,
"step": 220
},
{
"entropy": 0.5034325122833252,
"epoch": 11.05,
"grad_norm": 5.275214672088623,
"learning_rate": 1e-06,
"loss": 0.15,
"mean_token_accuracy": 0.9286394119262695,
"num_tokens": 57913303.0,
"step": 221
},
{
"entropy": 0.505387008190155,
"epoch": 11.1,
"grad_norm": 3.4117355346679688,
"learning_rate": 1e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9479674696922302,
"num_tokens": 58175357.0,
"step": 222
},
{
"entropy": 0.5113849639892578,
"epoch": 11.15,
"grad_norm": 2.8327202796936035,
"learning_rate": 1e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9426156878471375,
"num_tokens": 58437406.0,
"step": 223
},
{
"entropy": 0.5018768310546875,
"epoch": 11.2,
"grad_norm": 3.272310972213745,
"learning_rate": 1e-06,
"loss": 0.1094,
"mean_token_accuracy": 0.9515201449394226,
"num_tokens": 58699480.0,
"step": 224
},
{
"entropy": 0.505340576171875,
"epoch": 11.25,
"grad_norm": 2.4740121364593506,
"learning_rate": 1e-06,
"loss": 0.1209,
"mean_token_accuracy": 0.9433842301368713,
"num_tokens": 58961546.0,
"step": 225
},
{
"entropy": 0.506737232208252,
"epoch": 11.3,
"grad_norm": 3.198965311050415,
"learning_rate": 1e-06,
"loss": 0.1236,
"mean_token_accuracy": 0.9417647123336792,
"num_tokens": 59223575.0,
"step": 226
},
{
"entropy": 0.5045244693756104,
"epoch": 11.35,
"grad_norm": 3.001002550125122,
"learning_rate": 1e-06,
"loss": 0.1265,
"mean_token_accuracy": 0.9461426734924316,
"num_tokens": 59485641.0,
"step": 227
},
{
"entropy": 0.5068516135215759,
"epoch": 11.4,
"grad_norm": 3.9516587257385254,
"learning_rate": 1e-06,
"loss": 0.1325,
"mean_token_accuracy": 0.9444125890731812,
"num_tokens": 59747649.0,
"step": 228
},
{
"entropy": 0.5061191916465759,
"epoch": 11.45,
"grad_norm": 3.9736275672912598,
"learning_rate": 1e-06,
"loss": 0.1202,
"mean_token_accuracy": 0.9477000832557678,
"num_tokens": 60009697.0,
"step": 229
},
{
"entropy": 0.5050452947616577,
"epoch": 11.5,
"grad_norm": 3.388237714767456,
"learning_rate": 1e-06,
"loss": 0.1113,
"mean_token_accuracy": 0.9482221007347107,
"num_tokens": 60271794.0,
"step": 230
},
{
"entropy": 0.5037835836410522,
"epoch": 11.55,
"grad_norm": 4.176617622375488,
"learning_rate": 1e-06,
"loss": 0.1532,
"mean_token_accuracy": 0.933163583278656,
"num_tokens": 60533859.0,
"step": 231
},
{
"entropy": 0.504481852054596,
"epoch": 11.6,
"grad_norm": 4.4760212898254395,
"learning_rate": 1e-06,
"loss": 0.1466,
"mean_token_accuracy": 0.9319999814033508,
"num_tokens": 60795919.0,
"step": 232
},
{
"entropy": 0.5054460763931274,
"epoch": 11.65,
"grad_norm": 2.788715362548828,
"learning_rate": 1e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9475218653678894,
"num_tokens": 61057963.0,
"step": 233
},
{
"entropy": 0.5077073574066162,
"epoch": 11.7,
"grad_norm": 2.838501214981079,
"learning_rate": 1e-06,
"loss": 0.1288,
"mean_token_accuracy": 0.9445010423660278,
"num_tokens": 61319959.0,
"step": 234
},
{
"entropy": 0.5062661170959473,
"epoch": 11.75,
"grad_norm": 3.208291530609131,
"learning_rate": 1e-06,
"loss": 0.1303,
"mean_token_accuracy": 0.9417750239372253,
"num_tokens": 61581991.0,
"step": 235
},
{
"entropy": 0.5048550963401794,
"epoch": 11.8,
"grad_norm": 3.915485382080078,
"learning_rate": 1e-06,
"loss": 0.1204,
"mean_token_accuracy": 0.9506539702415466,
"num_tokens": 61843969.0,
"step": 236
},
{
"entropy": 0.5031319856643677,
"epoch": 11.85,
"grad_norm": 3.0714540481567383,
"learning_rate": 1e-06,
"loss": 0.116,
"mean_token_accuracy": 0.9487970471382141,
"num_tokens": 62106051.0,
"step": 237
},
{
"entropy": 0.5029826164245605,
"epoch": 11.9,
"grad_norm": 3.172436475753784,
"learning_rate": 1e-06,
"loss": 0.1148,
"mean_token_accuracy": 0.9473684430122375,
"num_tokens": 62368113.0,
"step": 238
},
{
"entropy": 0.5026971101760864,
"epoch": 11.95,
"grad_norm": 3.787898540496826,
"learning_rate": 1e-06,
"loss": 0.1188,
"mean_token_accuracy": 0.9513981342315674,
"num_tokens": 62630142.0,
"step": 239
},
{
"entropy": 0.501825213432312,
"epoch": 12.0,
"grad_norm": 3.851665735244751,
"learning_rate": 1e-06,
"loss": 0.1269,
"mean_token_accuracy": 0.9424046277999878,
"num_tokens": 62892256.0,
"step": 240
},
{
"epoch": 12.0,
"eval_entropy": 0.5031265020370483,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9032257795333862,
"eval_num_tokens": 62892256.0,
"eval_runtime": 0.5654,
"eval_samples_per_second": 442.144,
"eval_steps_per_second": 1.769,
"step": 240
},
{
"entropy": 0.5012459754943848,
"epoch": 12.05,
"grad_norm": 4.037276744842529,
"learning_rate": 1e-06,
"loss": 0.1271,
"mean_token_accuracy": 0.9416014552116394,
"num_tokens": 63154312.0,
"step": 241
},
{
"entropy": 0.5014461278915405,
"epoch": 12.1,
"grad_norm": 2.862247943878174,
"learning_rate": 1e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9479434490203857,
"num_tokens": 63416362.0,
"step": 242
},
{
"entropy": 0.5007213354110718,
"epoch": 12.15,
"grad_norm": 2.3149445056915283,
"learning_rate": 1e-06,
"loss": 0.1107,
"mean_token_accuracy": 0.9534450769424438,
"num_tokens": 63678434.0,
"step": 243
},
{
"entropy": 0.5017549991607666,
"epoch": 12.2,
"grad_norm": 3.403278350830078,
"learning_rate": 1e-06,
"loss": 0.1306,
"mean_token_accuracy": 0.941082775592804,
"num_tokens": 63940449.0,
"step": 244
},
{
"entropy": 0.4991673529148102,
"epoch": 12.25,
"grad_norm": 3.251974105834961,
"learning_rate": 1e-06,
"loss": 0.1095,
"mean_token_accuracy": 0.9473365545272827,
"num_tokens": 64202496.0,
"step": 245
},
{
"entropy": 0.4982728958129883,
"epoch": 12.3,
"grad_norm": 3.218226909637451,
"learning_rate": 1e-06,
"loss": 0.1216,
"mean_token_accuracy": 0.9382879734039307,
"num_tokens": 64464563.0,
"step": 246
},
{
"entropy": 0.4956706464290619,
"epoch": 12.35,
"grad_norm": 3.3675098419189453,
"learning_rate": 1e-06,
"loss": 0.1157,
"mean_token_accuracy": 0.9458943605422974,
"num_tokens": 64726628.0,
"step": 247
},
{
"entropy": 0.4967312514781952,
"epoch": 12.4,
"grad_norm": 3.337940216064453,
"learning_rate": 1e-06,
"loss": 0.1248,
"mean_token_accuracy": 0.948885977268219,
"num_tokens": 64988681.0,
"step": 248
},
{
"entropy": 0.4996580481529236,
"epoch": 12.45,
"grad_norm": 3.4728662967681885,
"learning_rate": 1e-06,
"loss": 0.1167,
"mean_token_accuracy": 0.9493753910064697,
"num_tokens": 65250762.0,
"step": 249
},
{
"entropy": 0.49834275245666504,
"epoch": 12.5,
"grad_norm": 5.884078502655029,
"learning_rate": 1e-06,
"loss": 0.1132,
"mean_token_accuracy": 0.9457477927207947,
"num_tokens": 65512785.0,
"step": 250
},
{
"entropy": 0.5003111362457275,
"epoch": 12.55,
"grad_norm": 2.507913112640381,
"learning_rate": 1e-06,
"loss": 0.1098,
"mean_token_accuracy": 0.9487054347991943,
"num_tokens": 65774831.0,
"step": 251
},
{
"entropy": 0.49748995900154114,
"epoch": 12.6,
"grad_norm": 3.47552490234375,
"learning_rate": 1e-06,
"loss": 0.1201,
"mean_token_accuracy": 0.9420111179351807,
"num_tokens": 66036880.0,
"step": 252
},
{
"entropy": 0.4972376227378845,
"epoch": 12.65,
"grad_norm": 4.500434875488281,
"learning_rate": 1e-06,
"loss": 0.1166,
"mean_token_accuracy": 0.9459459185600281,
"num_tokens": 66298966.0,
"step": 253
},
{
"entropy": 0.49823814630508423,
"epoch": 12.7,
"grad_norm": 4.090944290161133,
"learning_rate": 1e-06,
"loss": 0.1162,
"mean_token_accuracy": 0.943792462348938,
"num_tokens": 66560980.0,
"step": 254
},
{
"entropy": 0.4966731369495392,
"epoch": 12.75,
"grad_norm": 4.648547649383545,
"learning_rate": 1e-06,
"loss": 0.1449,
"mean_token_accuracy": 0.937831699848175,
"num_tokens": 66823057.0,
"step": 255
},
{
"entropy": 0.49896514415740967,
"epoch": 12.8,
"grad_norm": 3.447160482406616,
"learning_rate": 1e-06,
"loss": 0.1325,
"mean_token_accuracy": 0.939793586730957,
"num_tokens": 67085064.0,
"step": 256
},
{
"entropy": 0.501239538192749,
"epoch": 12.85,
"grad_norm": 3.2995057106018066,
"learning_rate": 1e-06,
"loss": 0.133,
"mean_token_accuracy": 0.9414660930633545,
"num_tokens": 67347122.0,
"step": 257
},
{
"entropy": 0.4989623427391052,
"epoch": 12.9,
"grad_norm": 3.629384756088257,
"learning_rate": 1e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.9440922141075134,
"num_tokens": 67609153.0,
"step": 258
},
{
"entropy": 0.497197687625885,
"epoch": 12.95,
"grad_norm": 4.829705715179443,
"learning_rate": 1e-06,
"loss": 0.1351,
"mean_token_accuracy": 0.9379671216011047,
"num_tokens": 67871216.0,
"step": 259
},
{
"entropy": 0.49780064821243286,
"epoch": 13.0,
"grad_norm": 4.333249092102051,
"learning_rate": 1e-06,
"loss": 0.1023,
"mean_token_accuracy": 0.9474367499351501,
"num_tokens": 68133284.0,
"step": 260
},
{
"epoch": 13.0,
"eval_entropy": 0.503221333026886,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9052419066429138,
"eval_num_tokens": 68133284.0,
"eval_runtime": 0.561,
"eval_samples_per_second": 445.601,
"eval_steps_per_second": 1.782,
"step": 260
},
{
"entropy": 0.5047539472579956,
"epoch": 13.05,
"grad_norm": 3.8571035861968994,
"learning_rate": 1e-06,
"loss": 0.119,
"mean_token_accuracy": 0.9480260014533997,
"num_tokens": 68395350.0,
"step": 261
},
{
"entropy": 0.5000075101852417,
"epoch": 13.1,
"grad_norm": 3.3609304428100586,
"learning_rate": 1e-06,
"loss": 0.1286,
"mean_token_accuracy": 0.9436339735984802,
"num_tokens": 68657418.0,
"step": 262
},
{
"entropy": 0.5047616958618164,
"epoch": 13.15,
"grad_norm": 2.9678988456726074,
"learning_rate": 1e-06,
"loss": 0.103,
"mean_token_accuracy": 0.9512548446655273,
"num_tokens": 68919456.0,
"step": 263
},
{
"entropy": 0.4988555312156677,
"epoch": 13.2,
"grad_norm": 3.5749735832214355,
"learning_rate": 1e-06,
"loss": 0.1156,
"mean_token_accuracy": 0.948113203048706,
"num_tokens": 69181519.0,
"step": 264
},
{
"entropy": 0.5020204782485962,
"epoch": 13.25,
"grad_norm": 3.25724196434021,
"learning_rate": 1e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9557783007621765,
"num_tokens": 69443544.0,
"step": 265
},
{
"entropy": 0.500059187412262,
"epoch": 13.3,
"grad_norm": 3.753115177154541,
"learning_rate": 1e-06,
"loss": 0.1237,
"mean_token_accuracy": 0.9440914988517761,
"num_tokens": 69705579.0,
"step": 266
},
{
"entropy": 0.5037001967430115,
"epoch": 13.35,
"grad_norm": 3.255347728729248,
"learning_rate": 1e-06,
"loss": 0.1105,
"mean_token_accuracy": 0.9521912336349487,
"num_tokens": 69967586.0,
"step": 267
},
{
"entropy": 0.4993184208869934,
"epoch": 13.4,
"grad_norm": 3.5563864707946777,
"learning_rate": 1e-06,
"loss": 0.1111,
"mean_token_accuracy": 0.9501557350158691,
"num_tokens": 70229652.0,
"step": 268
},
{
"entropy": 0.4983921945095062,
"epoch": 13.45,
"grad_norm": 3.5320169925689697,
"learning_rate": 1e-06,
"loss": 0.1116,
"mean_token_accuracy": 0.9436893463134766,
"num_tokens": 70491691.0,
"step": 269
},
{
"entropy": 0.498714804649353,
"epoch": 13.5,
"grad_norm": 3.004915475845337,
"learning_rate": 1e-06,
"loss": 0.1126,
"mean_token_accuracy": 0.950441300868988,
"num_tokens": 70753724.0,
"step": 270
},
{
"entropy": 0.4960983693599701,
"epoch": 13.55,
"grad_norm": 4.270773887634277,
"learning_rate": 1e-06,
"loss": 0.1157,
"mean_token_accuracy": 0.9450740814208984,
"num_tokens": 71015794.0,
"step": 271
},
{
"entropy": 0.4993487000465393,
"epoch": 13.6,
"grad_norm": 4.245420932769775,
"learning_rate": 1e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9475739002227783,
"num_tokens": 71277883.0,
"step": 272
},
{
"entropy": 0.49819624423980713,
"epoch": 13.65,
"grad_norm": 4.052130222320557,
"learning_rate": 1e-06,
"loss": 0.1226,
"mean_token_accuracy": 0.9484173655509949,
"num_tokens": 71539951.0,
"step": 273
},
{
"entropy": 0.4982229471206665,
"epoch": 13.7,
"grad_norm": 4.078166484832764,
"learning_rate": 1e-06,
"loss": 0.1337,
"mean_token_accuracy": 0.9399612545967102,
"num_tokens": 71802027.0,
"step": 274
},
{
"entropy": 0.5001723766326904,
"epoch": 13.75,
"grad_norm": 3.4441871643066406,
"learning_rate": 1e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9488428831100464,
"num_tokens": 72064064.0,
"step": 275
},
{
"entropy": 0.5003495812416077,
"epoch": 13.8,
"grad_norm": 4.0370097160339355,
"learning_rate": 1e-06,
"loss": 0.1247,
"mean_token_accuracy": 0.9390096664428711,
"num_tokens": 72326115.0,
"step": 276
},
{
"entropy": 0.4975491166114807,
"epoch": 13.85,
"grad_norm": 3.9948337078094482,
"learning_rate": 1e-06,
"loss": 0.121,
"mean_token_accuracy": 0.9438806176185608,
"num_tokens": 72588152.0,
"step": 277
},
{
"entropy": 0.4980151653289795,
"epoch": 13.9,
"grad_norm": 3.5774476528167725,
"learning_rate": 1e-06,
"loss": 0.1213,
"mean_token_accuracy": 0.9462665915489197,
"num_tokens": 72850178.0,
"step": 278
},
{
"entropy": 0.4962918162345886,
"epoch": 13.95,
"grad_norm": 3.5639283657073975,
"learning_rate": 1e-06,
"loss": 0.1243,
"mean_token_accuracy": 0.9504778385162354,
"num_tokens": 73112252.0,
"step": 279
},
{
"entropy": 0.49853619933128357,
"epoch": 14.0,
"grad_norm": 3.286870241165161,
"learning_rate": 1e-06,
"loss": 0.1101,
"mean_token_accuracy": 0.9513888955116272,
"num_tokens": 73374318.0,
"step": 280
},
{
"epoch": 14.0,
"eval_entropy": 0.4983806610107422,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9052419066429138,
"eval_num_tokens": 73374318.0,
"eval_runtime": 0.5649,
"eval_samples_per_second": 442.518,
"eval_steps_per_second": 1.77,
"step": 280
},
{
"entropy": 0.4951111078262329,
"epoch": 14.05,
"grad_norm": 3.988466739654541,
"learning_rate": 1e-06,
"loss": 0.1101,
"mean_token_accuracy": 0.9488795399665833,
"num_tokens": 73636405.0,
"step": 281
},
{
"entropy": 0.498315691947937,
"epoch": 14.1,
"grad_norm": 3.465620517730713,
"learning_rate": 1e-06,
"loss": 0.1114,
"mean_token_accuracy": 0.9480925798416138,
"num_tokens": 73898432.0,
"step": 282
},
{
"entropy": 0.4973567724227905,
"epoch": 14.15,
"grad_norm": 3.7496891021728516,
"learning_rate": 1e-06,
"loss": 0.1083,
"mean_token_accuracy": 0.9504950642585754,
"num_tokens": 74160476.0,
"step": 283
},
{
"entropy": 0.4969732165336609,
"epoch": 14.2,
"grad_norm": 3.5036423206329346,
"learning_rate": 1e-06,
"loss": 0.1131,
"mean_token_accuracy": 0.9461114406585693,
"num_tokens": 74422504.0,
"step": 284
},
{
"entropy": 0.49568063020706177,
"epoch": 14.25,
"grad_norm": 3.9930689334869385,
"learning_rate": 1e-06,
"loss": 0.1154,
"mean_token_accuracy": 0.9531859755516052,
"num_tokens": 74684569.0,
"step": 285
},
{
"entropy": 0.4962500035762787,
"epoch": 14.3,
"grad_norm": 2.8734872341156006,
"learning_rate": 1e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9498327970504761,
"num_tokens": 74946593.0,
"step": 286
},
{
"entropy": 0.4961685836315155,
"epoch": 14.35,
"grad_norm": 3.3552212715148926,
"learning_rate": 1e-06,
"loss": 0.121,
"mean_token_accuracy": 0.9423274993896484,
"num_tokens": 75208633.0,
"step": 287
},
{
"entropy": 0.49256008863449097,
"epoch": 14.4,
"grad_norm": 3.5463008880615234,
"learning_rate": 1e-06,
"loss": 0.111,
"mean_token_accuracy": 0.9447004795074463,
"num_tokens": 75470693.0,
"step": 288
},
{
"entropy": 0.4943183660507202,
"epoch": 14.45,
"grad_norm": 3.921447277069092,
"learning_rate": 1e-06,
"loss": 0.1058,
"mean_token_accuracy": 0.953438401222229,
"num_tokens": 75732748.0,
"step": 289
},
{
"entropy": 0.49364545941352844,
"epoch": 14.5,
"grad_norm": 3.0754876136779785,
"learning_rate": 1e-06,
"loss": 0.1173,
"mean_token_accuracy": 0.9483187794685364,
"num_tokens": 75994815.0,
"step": 290
},
{
"entropy": 0.4927959144115448,
"epoch": 14.55,
"grad_norm": 2.622016191482544,
"learning_rate": 1e-06,
"loss": 0.1096,
"mean_token_accuracy": 0.9544615149497986,
"num_tokens": 76256835.0,
"step": 291
},
{
"entropy": 0.4929812550544739,
"epoch": 14.6,
"grad_norm": 4.265964508056641,
"learning_rate": 1e-06,
"loss": 0.1088,
"mean_token_accuracy": 0.953329861164093,
"num_tokens": 76518873.0,
"step": 292
},
{
"entropy": 0.49077093601226807,
"epoch": 14.65,
"grad_norm": 4.118034839630127,
"learning_rate": 1e-06,
"loss": 0.1189,
"mean_token_accuracy": 0.9410150647163391,
"num_tokens": 76780957.0,
"step": 293
},
{
"entropy": 0.4885583221912384,
"epoch": 14.7,
"grad_norm": 4.893588066101074,
"learning_rate": 1e-06,
"loss": 0.0994,
"mean_token_accuracy": 0.9585448503494263,
"num_tokens": 77042996.0,
"step": 294
},
{
"entropy": 0.48888856172561646,
"epoch": 14.75,
"grad_norm": 4.3738789558410645,
"learning_rate": 1e-06,
"loss": 0.138,
"mean_token_accuracy": 0.9407114386558533,
"num_tokens": 77305052.0,
"step": 295
},
{
"entropy": 0.49154844880104065,
"epoch": 14.8,
"grad_norm": 6.126094341278076,
"learning_rate": 1e-06,
"loss": 0.1172,
"mean_token_accuracy": 0.954346776008606,
"num_tokens": 77567110.0,
"step": 296
},
{
"entropy": 0.490234911441803,
"epoch": 14.85,
"grad_norm": 5.756350994110107,
"learning_rate": 1e-06,
"loss": 0.1215,
"mean_token_accuracy": 0.9456824660301208,
"num_tokens": 77829205.0,
"step": 297
},
{
"entropy": 0.4910707175731659,
"epoch": 14.9,
"grad_norm": 3.7809011936187744,
"learning_rate": 1e-06,
"loss": 0.1062,
"mean_token_accuracy": 0.9476373195648193,
"num_tokens": 78091232.0,
"step": 298
},
{
"entropy": 0.49150994420051575,
"epoch": 14.95,
"grad_norm": 3.2236623764038086,
"learning_rate": 1e-06,
"loss": 0.1213,
"mean_token_accuracy": 0.9479166865348816,
"num_tokens": 78353286.0,
"step": 299
},
{
"entropy": 0.49145615100860596,
"epoch": 15.0,
"grad_norm": 2.271028757095337,
"learning_rate": 1e-06,
"loss": 0.1012,
"mean_token_accuracy": 0.9547767043113708,
"num_tokens": 78615351.0,
"step": 300
},
{
"epoch": 15.0,
"eval_entropy": 0.4905474781990051,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9045698642730713,
"eval_num_tokens": 78615351.0,
"eval_runtime": 0.5672,
"eval_samples_per_second": 440.741,
"eval_steps_per_second": 1.763,
"step": 300
},
{
"entropy": 0.4869128465652466,
"epoch": 15.05,
"grad_norm": 5.4611053466796875,
"learning_rate": 1e-06,
"loss": 0.1218,
"mean_token_accuracy": 0.940838098526001,
"num_tokens": 78877392.0,
"step": 301
},
{
"entropy": 0.491014301776886,
"epoch": 15.1,
"grad_norm": 3.0112688541412354,
"learning_rate": 1e-06,
"loss": 0.1045,
"mean_token_accuracy": 0.9524050354957581,
"num_tokens": 79139432.0,
"step": 302
},
{
"entropy": 0.49170082807540894,
"epoch": 15.15,
"grad_norm": 4.067041873931885,
"learning_rate": 1e-06,
"loss": 0.1254,
"mean_token_accuracy": 0.9467312097549438,
"num_tokens": 79401479.0,
"step": 303
},
{
"entropy": 0.49128258228302,
"epoch": 15.2,
"grad_norm": 3.7372446060180664,
"learning_rate": 1e-06,
"loss": 0.1062,
"mean_token_accuracy": 0.952275276184082,
"num_tokens": 79663544.0,
"step": 304
},
{
"entropy": 0.48871222138404846,
"epoch": 15.25,
"grad_norm": 3.4806947708129883,
"learning_rate": 1e-06,
"loss": 0.1028,
"mean_token_accuracy": 0.9505928754806519,
"num_tokens": 79925622.0,
"step": 305
},
{
"entropy": 0.4901201128959656,
"epoch": 15.3,
"grad_norm": 3.2800400257110596,
"learning_rate": 1e-06,
"loss": 0.1022,
"mean_token_accuracy": 0.9538653492927551,
"num_tokens": 80187687.0,
"step": 306
},
{
"entropy": 0.49247848987579346,
"epoch": 15.35,
"grad_norm": 2.735215663909912,
"learning_rate": 1e-06,
"loss": 0.1091,
"mean_token_accuracy": 0.9488189220428467,
"num_tokens": 80449695.0,
"step": 307
},
{
"entropy": 0.4879264235496521,
"epoch": 15.4,
"grad_norm": 3.8763179779052734,
"learning_rate": 1e-06,
"loss": 0.1018,
"mean_token_accuracy": 0.9574912786483765,
"num_tokens": 80711756.0,
"step": 308
},
{
"entropy": 0.489043265581131,
"epoch": 15.45,
"grad_norm": 3.1737430095672607,
"learning_rate": 1e-06,
"loss": 0.1091,
"mean_token_accuracy": 0.949999988079071,
"num_tokens": 80973817.0,
"step": 309
},
{
"entropy": 0.4896194338798523,
"epoch": 15.5,
"grad_norm": 2.9024124145507812,
"learning_rate": 1e-06,
"loss": 0.1076,
"mean_token_accuracy": 0.9545205235481262,
"num_tokens": 81235839.0,
"step": 310
},
{
"entropy": 0.4866348206996918,
"epoch": 15.55,
"grad_norm": 2.981309175491333,
"learning_rate": 1e-06,
"loss": 0.1096,
"mean_token_accuracy": 0.9505454301834106,
"num_tokens": 81497873.0,
"step": 311
},
{
"entropy": 0.4875825047492981,
"epoch": 15.6,
"grad_norm": 3.687138319015503,
"learning_rate": 1e-06,
"loss": 0.1089,
"mean_token_accuracy": 0.9505016803741455,
"num_tokens": 81759928.0,
"step": 312
},
{
"entropy": 0.48804572224617004,
"epoch": 15.65,
"grad_norm": 3.807471752166748,
"learning_rate": 1e-06,
"loss": 0.1089,
"mean_token_accuracy": 0.9481101632118225,
"num_tokens": 82021983.0,
"step": 313
},
{
"entropy": 0.48681819438934326,
"epoch": 15.7,
"grad_norm": 3.4905779361724854,
"learning_rate": 1e-06,
"loss": 0.099,
"mean_token_accuracy": 0.9566075205802917,
"num_tokens": 82284064.0,
"step": 314
},
{
"entropy": 0.48834747076034546,
"epoch": 15.75,
"grad_norm": 5.331181526184082,
"learning_rate": 1e-06,
"loss": 0.1124,
"mean_token_accuracy": 0.9493902325630188,
"num_tokens": 82546132.0,
"step": 315
},
{
"entropy": 0.48611488938331604,
"epoch": 15.8,
"grad_norm": 3.41743803024292,
"learning_rate": 1e-06,
"loss": 0.1213,
"mean_token_accuracy": 0.9472049474716187,
"num_tokens": 82808203.0,
"step": 316
},
{
"entropy": 0.4868428409099579,
"epoch": 15.85,
"grad_norm": 4.189897537231445,
"learning_rate": 1e-06,
"loss": 0.1092,
"mean_token_accuracy": 0.9522203207015991,
"num_tokens": 83070245.0,
"step": 317
},
{
"entropy": 0.48746997117996216,
"epoch": 15.9,
"grad_norm": 4.698352813720703,
"learning_rate": 1e-06,
"loss": 0.1084,
"mean_token_accuracy": 0.9526795744895935,
"num_tokens": 83332229.0,
"step": 318
},
{
"entropy": 0.4847297966480255,
"epoch": 15.95,
"grad_norm": 3.628556728363037,
"learning_rate": 1e-06,
"loss": 0.116,
"mean_token_accuracy": 0.9458874464035034,
"num_tokens": 83594307.0,
"step": 319
},
{
"entropy": 0.4873000979423523,
"epoch": 16.0,
"grad_norm": 3.9242656230926514,
"learning_rate": 1e-06,
"loss": 0.1136,
"mean_token_accuracy": 0.944888174533844,
"num_tokens": 83856350.0,
"step": 320
},
{
"epoch": 16.0,
"eval_entropy": 0.489590585231781,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9065860509872437,
"eval_num_tokens": 83856350.0,
"eval_runtime": 0.564,
"eval_samples_per_second": 443.283,
"eval_steps_per_second": 1.773,
"step": 320
},
{
"entropy": 0.4858003854751587,
"epoch": 16.05,
"grad_norm": 4.172854423522949,
"learning_rate": 1e-06,
"loss": 0.1193,
"mean_token_accuracy": 0.946601927280426,
"num_tokens": 84118410.0,
"step": 321
},
{
"entropy": 0.48904526233673096,
"epoch": 16.1,
"grad_norm": 3.2139930725097656,
"learning_rate": 1e-06,
"loss": 0.1026,
"mean_token_accuracy": 0.9490445852279663,
"num_tokens": 84380441.0,
"step": 322
},
{
"entropy": 0.48847872018814087,
"epoch": 16.15,
"grad_norm": 3.9387967586517334,
"learning_rate": 1e-06,
"loss": 0.1079,
"mean_token_accuracy": 0.9527458548545837,
"num_tokens": 84642468.0,
"step": 323
},
{
"entropy": 0.48851415514945984,
"epoch": 16.2,
"grad_norm": 3.1942989826202393,
"learning_rate": 1e-06,
"loss": 0.1178,
"mean_token_accuracy": 0.9528598189353943,
"num_tokens": 84904553.0,
"step": 324
},
{
"entropy": 0.4875437021255493,
"epoch": 16.25,
"grad_norm": 4.474672317504883,
"learning_rate": 1e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9609507918357849,
"num_tokens": 85166583.0,
"step": 325
},
{
"entropy": 0.49070078134536743,
"epoch": 16.3,
"grad_norm": 3.77111554145813,
"learning_rate": 1e-06,
"loss": 0.1109,
"mean_token_accuracy": 0.9536523818969727,
"num_tokens": 85428633.0,
"step": 326
},
{
"entropy": 0.4889669418334961,
"epoch": 16.35,
"grad_norm": 3.3292832374572754,
"learning_rate": 1e-06,
"loss": 0.1066,
"mean_token_accuracy": 0.9531335234642029,
"num_tokens": 85690632.0,
"step": 327
},
{
"entropy": 0.48486199975013733,
"epoch": 16.4,
"grad_norm": 3.8034586906433105,
"learning_rate": 1e-06,
"loss": 0.1161,
"mean_token_accuracy": 0.9467918872833252,
"num_tokens": 85952701.0,
"step": 328
},
{
"entropy": 0.4868103563785553,
"epoch": 16.45,
"grad_norm": 2.931748151779175,
"learning_rate": 1e-06,
"loss": 0.0993,
"mean_token_accuracy": 0.9569321274757385,
"num_tokens": 86214758.0,
"step": 329
},
{
"entropy": 0.48670750856399536,
"epoch": 16.5,
"grad_norm": 4.134925842285156,
"learning_rate": 1e-06,
"loss": 0.1098,
"mean_token_accuracy": 0.9492447376251221,
"num_tokens": 86476808.0,
"step": 330
},
{
"entropy": 0.485757052898407,
"epoch": 16.55,
"grad_norm": 3.8004045486450195,
"learning_rate": 1e-06,
"loss": 0.106,
"mean_token_accuracy": 0.9532483220100403,
"num_tokens": 86738817.0,
"step": 331
},
{
"entropy": 0.4827927350997925,
"epoch": 16.6,
"grad_norm": 4.365555286407471,
"learning_rate": 1e-06,
"loss": 0.0962,
"mean_token_accuracy": 0.9560723304748535,
"num_tokens": 87000859.0,
"step": 332
},
{
"entropy": 0.4798928499221802,
"epoch": 16.65,
"grad_norm": 4.611724376678467,
"learning_rate": 1e-06,
"loss": 0.108,
"mean_token_accuracy": 0.9538087248802185,
"num_tokens": 87262950.0,
"step": 333
},
{
"entropy": 0.48096251487731934,
"epoch": 16.7,
"grad_norm": 4.28861665725708,
"learning_rate": 1e-06,
"loss": 0.103,
"mean_token_accuracy": 0.9518492817878723,
"num_tokens": 87525009.0,
"step": 334
},
{
"entropy": 0.484794557094574,
"epoch": 16.75,
"grad_norm": 3.724881172180176,
"learning_rate": 1e-06,
"loss": 0.1138,
"mean_token_accuracy": 0.9499734044075012,
"num_tokens": 87787052.0,
"step": 335
},
{
"entropy": 0.4819689393043518,
"epoch": 16.8,
"grad_norm": 5.316562652587891,
"learning_rate": 1e-06,
"loss": 0.0998,
"mean_token_accuracy": 0.9543702006340027,
"num_tokens": 88049135.0,
"step": 336
},
{
"entropy": 0.4811255931854248,
"epoch": 16.85,
"grad_norm": 4.379755973815918,
"learning_rate": 1e-06,
"loss": 0.1125,
"mean_token_accuracy": 0.9438552856445312,
"num_tokens": 88311199.0,
"step": 337
},
{
"entropy": 0.4819214940071106,
"epoch": 16.9,
"grad_norm": 3.4126381874084473,
"learning_rate": 1e-06,
"loss": 0.1071,
"mean_token_accuracy": 0.9480443596839905,
"num_tokens": 88573241.0,
"step": 338
},
{
"entropy": 0.48113828897476196,
"epoch": 16.95,
"grad_norm": 4.438907146453857,
"learning_rate": 1e-06,
"loss": 0.1032,
"mean_token_accuracy": 0.953698456287384,
"num_tokens": 88835308.0,
"step": 339
},
{
"entropy": 0.4829384684562683,
"epoch": 17.0,
"grad_norm": 4.242271423339844,
"learning_rate": 1e-06,
"loss": 0.1189,
"mean_token_accuracy": 0.949438214302063,
"num_tokens": 89097378.0,
"step": 340
},
{
"epoch": 17.0,
"eval_entropy": 0.48575037717819214,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9059139490127563,
"eval_num_tokens": 89097378.0,
"eval_runtime": 0.5659,
"eval_samples_per_second": 441.736,
"eval_steps_per_second": 1.767,
"step": 340
},
{
"entropy": 0.4829714000225067,
"epoch": 17.05,
"grad_norm": 4.21494197845459,
"learning_rate": 1e-06,
"loss": 0.1039,
"mean_token_accuracy": 0.954402506351471,
"num_tokens": 89359441.0,
"step": 341
},
{
"entropy": 0.4828266501426697,
"epoch": 17.1,
"grad_norm": 3.5206823348999023,
"learning_rate": 1e-06,
"loss": 0.1119,
"mean_token_accuracy": 0.9511111378669739,
"num_tokens": 89621477.0,
"step": 342
},
{
"entropy": 0.48437392711639404,
"epoch": 17.15,
"grad_norm": 4.2214674949646,
"learning_rate": 1e-06,
"loss": 0.1004,
"mean_token_accuracy": 0.9563080072402954,
"num_tokens": 89883571.0,
"step": 343
},
{
"entropy": 0.4833766222000122,
"epoch": 17.2,
"grad_norm": 4.171907901763916,
"learning_rate": 1e-06,
"loss": 0.105,
"mean_token_accuracy": 0.953951895236969,
"num_tokens": 90145619.0,
"step": 344
},
{
"entropy": 0.48499199748039246,
"epoch": 17.25,
"grad_norm": 3.7562005519866943,
"learning_rate": 1e-06,
"loss": 0.1001,
"mean_token_accuracy": 0.9547511339187622,
"num_tokens": 90407683.0,
"step": 345
},
{
"entropy": 0.48458331823349,
"epoch": 17.3,
"grad_norm": 3.6610958576202393,
"learning_rate": 1e-06,
"loss": 0.1032,
"mean_token_accuracy": 0.9532163739204407,
"num_tokens": 90669722.0,
"step": 346
},
{
"entropy": 0.4811995327472687,
"epoch": 17.35,
"grad_norm": 3.4695615768432617,
"learning_rate": 1e-06,
"loss": 0.1127,
"mean_token_accuracy": 0.9553333520889282,
"num_tokens": 90931782.0,
"step": 347
},
{
"entropy": 0.4832395017147064,
"epoch": 17.4,
"grad_norm": 4.198061466217041,
"learning_rate": 1e-06,
"loss": 0.088,
"mean_token_accuracy": 0.9628930687904358,
"num_tokens": 91193833.0,
"step": 348
},
{
"entropy": 0.4821211099624634,
"epoch": 17.45,
"grad_norm": 3.404797315597534,
"learning_rate": 1e-06,
"loss": 0.0964,
"mean_token_accuracy": 0.9528061151504517,
"num_tokens": 91455895.0,
"step": 349
},
{
"entropy": 0.479155957698822,
"epoch": 17.5,
"grad_norm": 5.393930912017822,
"learning_rate": 1e-06,
"loss": 0.1076,
"mean_token_accuracy": 0.9529499411582947,
"num_tokens": 91717992.0,
"step": 350
},
{
"entropy": 0.48262494802474976,
"epoch": 17.55,
"grad_norm": 3.950324535369873,
"learning_rate": 1e-06,
"loss": 0.1016,
"mean_token_accuracy": 0.9538905024528503,
"num_tokens": 91979974.0,
"step": 351
},
{
"entropy": 0.4808007478713989,
"epoch": 17.6,
"grad_norm": 5.840694427490234,
"learning_rate": 1e-06,
"loss": 0.0978,
"mean_token_accuracy": 0.9548532962799072,
"num_tokens": 92242028.0,
"step": 352
},
{
"entropy": 0.4793074131011963,
"epoch": 17.65,
"grad_norm": 4.341586112976074,
"learning_rate": 1e-06,
"loss": 0.1026,
"mean_token_accuracy": 0.9537906050682068,
"num_tokens": 92504072.0,
"step": 353
},
{
"entropy": 0.48200857639312744,
"epoch": 17.7,
"grad_norm": 4.7615485191345215,
"learning_rate": 1e-06,
"loss": 0.0989,
"mean_token_accuracy": 0.9592834115028381,
"num_tokens": 92766111.0,
"step": 354
},
{
"entropy": 0.4793251156806946,
"epoch": 17.75,
"grad_norm": 4.265474796295166,
"learning_rate": 1e-06,
"loss": 0.0984,
"mean_token_accuracy": 0.9533022046089172,
"num_tokens": 93028137.0,
"step": 355
},
{
"entropy": 0.48376020789146423,
"epoch": 17.8,
"grad_norm": 4.087716579437256,
"learning_rate": 1e-06,
"loss": 0.1104,
"mean_token_accuracy": 0.9477089047431946,
"num_tokens": 93290189.0,
"step": 356
},
{
"entropy": 0.48131391406059265,
"epoch": 17.85,
"grad_norm": 3.9392213821411133,
"learning_rate": 1e-06,
"loss": 0.1016,
"mean_token_accuracy": 0.9560810923576355,
"num_tokens": 93552229.0,
"step": 357
},
{
"entropy": 0.4821656346321106,
"epoch": 17.9,
"grad_norm": 4.806204795837402,
"learning_rate": 1e-06,
"loss": 0.1097,
"mean_token_accuracy": 0.9533898234367371,
"num_tokens": 93814276.0,
"step": 358
},
{
"entropy": 0.4840206801891327,
"epoch": 17.95,
"grad_norm": 4.974476337432861,
"learning_rate": 1e-06,
"loss": 0.0998,
"mean_token_accuracy": 0.9556295275688171,
"num_tokens": 94076342.0,
"step": 359
},
{
"entropy": 0.48071110248565674,
"epoch": 18.0,
"grad_norm": 3.907980442047119,
"learning_rate": 1e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9533995389938354,
"num_tokens": 94338409.0,
"step": 360
},
{
"epoch": 18.0,
"eval_entropy": 0.4855648875236511,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9072580933570862,
"eval_num_tokens": 94338409.0,
"eval_runtime": 0.5648,
"eval_samples_per_second": 442.64,
"eval_steps_per_second": 1.771,
"step": 360
},
{
"entropy": 0.4835587739944458,
"epoch": 18.05,
"grad_norm": 3.5856211185455322,
"learning_rate": 1e-06,
"loss": 0.106,
"mean_token_accuracy": 0.9543883800506592,
"num_tokens": 94600449.0,
"step": 361
},
{
"entropy": 0.4841495454311371,
"epoch": 18.1,
"grad_norm": 3.932619094848633,
"learning_rate": 1e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9541892409324646,
"num_tokens": 94862470.0,
"step": 362
},
{
"entropy": 0.48350268602371216,
"epoch": 18.15,
"grad_norm": 3.5396127700805664,
"learning_rate": 1e-06,
"loss": 0.0872,
"mean_token_accuracy": 0.9625223278999329,
"num_tokens": 95124546.0,
"step": 363
},
{
"entropy": 0.48403599858283997,
"epoch": 18.2,
"grad_norm": 3.43064546585083,
"learning_rate": 1e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9590017795562744,
"num_tokens": 95386591.0,
"step": 364
},
{
"entropy": 0.4856931269168854,
"epoch": 18.25,
"grad_norm": 3.188349485397339,
"learning_rate": 1e-06,
"loss": 0.0961,
"mean_token_accuracy": 0.9583789706230164,
"num_tokens": 95648581.0,
"step": 365
},
{
"entropy": 0.48474887013435364,
"epoch": 18.3,
"grad_norm": 2.6797800064086914,
"learning_rate": 1e-06,
"loss": 0.0945,
"mean_token_accuracy": 0.9626865386962891,
"num_tokens": 95910617.0,
"step": 366
},
{
"entropy": 0.48159462213516235,
"epoch": 18.35,
"grad_norm": 4.948982238769531,
"learning_rate": 1e-06,
"loss": 0.1159,
"mean_token_accuracy": 0.9509345889091492,
"num_tokens": 96172692.0,
"step": 367
},
{
"entropy": 0.48261338472366333,
"epoch": 18.4,
"grad_norm": 4.678440093994141,
"learning_rate": 1e-06,
"loss": 0.1114,
"mean_token_accuracy": 0.9544126391410828,
"num_tokens": 96434732.0,
"step": 368
},
{
"entropy": 0.47885391116142273,
"epoch": 18.45,
"grad_norm": 6.533933639526367,
"learning_rate": 1e-06,
"loss": 0.0894,
"mean_token_accuracy": 0.9582701325416565,
"num_tokens": 96696808.0,
"step": 369
},
{
"entropy": 0.47860729694366455,
"epoch": 18.5,
"grad_norm": 4.395998001098633,
"learning_rate": 1e-06,
"loss": 0.1031,
"mean_token_accuracy": 0.9530686140060425,
"num_tokens": 96958885.0,
"step": 370
},
{
"entropy": 0.4792514443397522,
"epoch": 18.55,
"grad_norm": 5.65232515335083,
"learning_rate": 1e-06,
"loss": 0.0967,
"mean_token_accuracy": 0.9528796076774597,
"num_tokens": 97220973.0,
"step": 371
},
{
"entropy": 0.47906193137168884,
"epoch": 18.6,
"grad_norm": 4.153817176818848,
"learning_rate": 1e-06,
"loss": 0.0983,
"mean_token_accuracy": 0.9545454382896423,
"num_tokens": 97483051.0,
"step": 372
},
{
"entropy": 0.4794267416000366,
"epoch": 18.65,
"grad_norm": 4.057419300079346,
"learning_rate": 1e-06,
"loss": 0.0891,
"mean_token_accuracy": 0.9647576808929443,
"num_tokens": 97745101.0,
"step": 373
},
{
"entropy": 0.4788510203361511,
"epoch": 18.7,
"grad_norm": 4.535802841186523,
"learning_rate": 1e-06,
"loss": 0.1052,
"mean_token_accuracy": 0.955997884273529,
"num_tokens": 98007141.0,
"step": 374
},
{
"entropy": 0.4793304204940796,
"epoch": 18.75,
"grad_norm": 3.66812801361084,
"learning_rate": 1e-06,
"loss": 0.0972,
"mean_token_accuracy": 0.95691978931427,
"num_tokens": 98269129.0,
"step": 375
},
{
"entropy": 0.4771343767642975,
"epoch": 18.8,
"grad_norm": 5.437928199768066,
"learning_rate": 1e-06,
"loss": 0.1163,
"mean_token_accuracy": 0.9437780976295471,
"num_tokens": 98531221.0,
"step": 376
},
{
"entropy": 0.4786272644996643,
"epoch": 18.85,
"grad_norm": 7.437087059020996,
"learning_rate": 1e-06,
"loss": 0.1095,
"mean_token_accuracy": 0.9502487778663635,
"num_tokens": 98793257.0,
"step": 377
},
{
"entropy": 0.47760748863220215,
"epoch": 18.9,
"grad_norm": 4.315995216369629,
"learning_rate": 1e-06,
"loss": 0.0996,
"mean_token_accuracy": 0.9553039073944092,
"num_tokens": 99055330.0,
"step": 378
},
{
"entropy": 0.4789770841598511,
"epoch": 18.95,
"grad_norm": 3.436211109161377,
"learning_rate": 1e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9601989984512329,
"num_tokens": 99317366.0,
"step": 379
},
{
"entropy": 0.4768607020378113,
"epoch": 19.0,
"grad_norm": 4.5564093589782715,
"learning_rate": 1e-06,
"loss": 0.1028,
"mean_token_accuracy": 0.9576333165168762,
"num_tokens": 99579427.0,
"step": 380
},
{
"epoch": 19.0,
"eval_entropy": 0.48156681656837463,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9072580933570862,
"eval_num_tokens": 99579427.0,
"eval_runtime": 0.5632,
"eval_samples_per_second": 443.912,
"eval_steps_per_second": 1.776,
"step": 380
},
{
"entropy": 0.4807729125022888,
"epoch": 19.05,
"grad_norm": 2.999615430831909,
"learning_rate": 1e-06,
"loss": 0.0948,
"mean_token_accuracy": 0.9598582983016968,
"num_tokens": 99841483.0,
"step": 381
},
{
"entropy": 0.47727420926094055,
"epoch": 19.1,
"grad_norm": 3.7125136852264404,
"learning_rate": 1e-06,
"loss": 0.0898,
"mean_token_accuracy": 0.9632495045661926,
"num_tokens": 100103528.0,
"step": 382
},
{
"entropy": 0.4780922532081604,
"epoch": 19.15,
"grad_norm": 3.4127087593078613,
"learning_rate": 1e-06,
"loss": 0.1037,
"mean_token_accuracy": 0.955041766166687,
"num_tokens": 100365579.0,
"step": 383
},
{
"entropy": 0.47612839937210083,
"epoch": 19.2,
"grad_norm": 5.690220832824707,
"learning_rate": 1e-06,
"loss": 0.0873,
"mean_token_accuracy": 0.9584121108055115,
"num_tokens": 100627627.0,
"step": 384
},
{
"entropy": 0.4755370616912842,
"epoch": 19.25,
"grad_norm": 4.630006790161133,
"learning_rate": 1e-06,
"loss": 0.1026,
"mean_token_accuracy": 0.9599140882492065,
"num_tokens": 100889716.0,
"step": 385
},
{
"entropy": 0.4773571789264679,
"epoch": 19.3,
"grad_norm": 4.160724639892578,
"learning_rate": 1e-06,
"loss": 0.092,
"mean_token_accuracy": 0.9613651037216187,
"num_tokens": 101151796.0,
"step": 386
},
{
"entropy": 0.4772469997406006,
"epoch": 19.35,
"grad_norm": 4.370746612548828,
"learning_rate": 1e-06,
"loss": 0.0965,
"mean_token_accuracy": 0.9557135105133057,
"num_tokens": 101413822.0,
"step": 387
},
{
"entropy": 0.4769657552242279,
"epoch": 19.4,
"grad_norm": 3.9834535121917725,
"learning_rate": 1e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9644970297813416,
"num_tokens": 101675903.0,
"step": 388
},
{
"entropy": 0.4751873016357422,
"epoch": 19.45,
"grad_norm": 5.0992021560668945,
"learning_rate": 1e-06,
"loss": 0.0923,
"mean_token_accuracy": 0.9595441818237305,
"num_tokens": 101937954.0,
"step": 389
},
{
"entropy": 0.47450515627861023,
"epoch": 19.5,
"grad_norm": 6.339524269104004,
"learning_rate": 1e-06,
"loss": 0.1009,
"mean_token_accuracy": 0.9467408657073975,
"num_tokens": 102200003.0,
"step": 390
},
{
"entropy": 0.4756562411785126,
"epoch": 19.55,
"grad_norm": 4.202500820159912,
"learning_rate": 1e-06,
"loss": 0.0917,
"mean_token_accuracy": 0.9617834687232971,
"num_tokens": 102462034.0,
"step": 391
},
{
"entropy": 0.47639748454093933,
"epoch": 19.6,
"grad_norm": 4.514294147491455,
"learning_rate": 1e-06,
"loss": 0.0952,
"mean_token_accuracy": 0.9570673704147339,
"num_tokens": 102724075.0,
"step": 392
},
{
"entropy": 0.4771527051925659,
"epoch": 19.65,
"grad_norm": 4.23642110824585,
"learning_rate": 1e-06,
"loss": 0.0959,
"mean_token_accuracy": 0.9596773982048035,
"num_tokens": 102986140.0,
"step": 393
},
{
"entropy": 0.476939857006073,
"epoch": 19.7,
"grad_norm": 3.8977198600769043,
"learning_rate": 1e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9618320465087891,
"num_tokens": 103248205.0,
"step": 394
},
{
"entropy": 0.4789218604564667,
"epoch": 19.75,
"grad_norm": 4.690950393676758,
"learning_rate": 1e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9504048824310303,
"num_tokens": 103510180.0,
"step": 395
},
{
"entropy": 0.4752090275287628,
"epoch": 19.8,
"grad_norm": 3.9899864196777344,
"learning_rate": 1e-06,
"loss": 0.091,
"mean_token_accuracy": 0.9608516693115234,
"num_tokens": 103772229.0,
"step": 396
},
{
"entropy": 0.47282856702804565,
"epoch": 19.85,
"grad_norm": 5.252200126647949,
"learning_rate": 1e-06,
"loss": 0.1034,
"mean_token_accuracy": 0.9577922224998474,
"num_tokens": 104034285.0,
"step": 397
},
{
"entropy": 0.47455742955207825,
"epoch": 19.9,
"grad_norm": 7.813296318054199,
"learning_rate": 1e-06,
"loss": 0.1068,
"mean_token_accuracy": 0.9442644119262695,
"num_tokens": 104296355.0,
"step": 398
},
{
"entropy": 0.4774863123893738,
"epoch": 19.95,
"grad_norm": 8.987563133239746,
"learning_rate": 1e-06,
"loss": 0.121,
"mean_token_accuracy": 0.9434475302696228,
"num_tokens": 104558391.0,
"step": 399
},
{
"entropy": 0.4767053723335266,
"epoch": 20.0,
"grad_norm": 5.698646068572998,
"learning_rate": 1e-06,
"loss": 0.0958,
"mean_token_accuracy": 0.9530162215232849,
"num_tokens": 104820444.0,
"step": 400
},
{
"epoch": 20.0,
"eval_entropy": 0.4785197973251343,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9038978219032288,
"eval_num_tokens": 104820444.0,
"eval_runtime": 0.5628,
"eval_samples_per_second": 444.175,
"eval_steps_per_second": 1.777,
"step": 400
},
{
"entropy": 0.47759389877319336,
"epoch": 20.05,
"grad_norm": 5.164842128753662,
"learning_rate": 1e-06,
"loss": 0.0958,
"mean_token_accuracy": 0.9596510529518127,
"num_tokens": 105082475.0,
"step": 401
},
{
"entropy": 0.4755489230155945,
"epoch": 20.1,
"grad_norm": 4.541907787322998,
"learning_rate": 1e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9554093480110168,
"num_tokens": 105344502.0,
"step": 402
},
{
"entropy": 0.4764818847179413,
"epoch": 20.15,
"grad_norm": 4.786900043487549,
"learning_rate": 1e-06,
"loss": 0.1111,
"mean_token_accuracy": 0.9530423283576965,
"num_tokens": 105606574.0,
"step": 403
},
{
"entropy": 0.4768408536911011,
"epoch": 20.2,
"grad_norm": 5.436928749084473,
"learning_rate": 1e-06,
"loss": 0.1035,
"mean_token_accuracy": 0.9548913240432739,
"num_tokens": 105868611.0,
"step": 404
},
{
"entropy": 0.4776271879673004,
"epoch": 20.25,
"grad_norm": 6.8953657150268555,
"learning_rate": 1e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9477487206459045,
"num_tokens": 106130673.0,
"step": 405
},
{
"entropy": 0.47521114349365234,
"epoch": 20.3,
"grad_norm": 5.883774280548096,
"learning_rate": 1e-06,
"loss": 0.1033,
"mean_token_accuracy": 0.948503851890564,
"num_tokens": 106392769.0,
"step": 406
},
{
"entropy": 0.47845274209976196,
"epoch": 20.35,
"grad_norm": 3.9064784049987793,
"learning_rate": 1e-06,
"loss": 0.0857,
"mean_token_accuracy": 0.9641460180282593,
"num_tokens": 106654830.0,
"step": 407
},
{
"entropy": 0.4787122309207916,
"epoch": 20.4,
"grad_norm": 3.2227232456207275,
"learning_rate": 1e-06,
"loss": 0.0992,
"mean_token_accuracy": 0.9579360485076904,
"num_tokens": 106916876.0,
"step": 408
},
{
"entropy": 0.4786139130592346,
"epoch": 20.45,
"grad_norm": 3.6466708183288574,
"learning_rate": 1e-06,
"loss": 0.0906,
"mean_token_accuracy": 0.9564660787582397,
"num_tokens": 107178932.0,
"step": 409
},
{
"entropy": 0.47711610794067383,
"epoch": 20.5,
"grad_norm": 5.3844194412231445,
"learning_rate": 1e-06,
"loss": 0.098,
"mean_token_accuracy": 0.9515488743782043,
"num_tokens": 107440982.0,
"step": 410
},
{
"entropy": 0.47591736912727356,
"epoch": 20.55,
"grad_norm": 4.034522533416748,
"learning_rate": 1e-06,
"loss": 0.0857,
"mean_token_accuracy": 0.9637036919593811,
"num_tokens": 107703057.0,
"step": 411
},
{
"entropy": 0.4796789884567261,
"epoch": 20.6,
"grad_norm": 3.7229764461517334,
"learning_rate": 1e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9600798487663269,
"num_tokens": 107965126.0,
"step": 412
},
{
"entropy": 0.47761350870132446,
"epoch": 20.65,
"grad_norm": 3.5426137447357178,
"learning_rate": 1e-06,
"loss": 0.0894,
"mean_token_accuracy": 0.9674220681190491,
"num_tokens": 108227164.0,
"step": 413
},
{
"entropy": 0.480240136384964,
"epoch": 20.7,
"grad_norm": 3.649472713470459,
"learning_rate": 1e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9633389711380005,
"num_tokens": 108489167.0,
"step": 414
},
{
"entropy": 0.4761279821395874,
"epoch": 20.75,
"grad_norm": 4.2589616775512695,
"learning_rate": 1e-06,
"loss": 0.0914,
"mean_token_accuracy": 0.96128249168396,
"num_tokens": 108751215.0,
"step": 415
},
{
"entropy": 0.47693654894828796,
"epoch": 20.8,
"grad_norm": 4.516826152801514,
"learning_rate": 1e-06,
"loss": 0.0915,
"mean_token_accuracy": 0.9612069129943848,
"num_tokens": 109013235.0,
"step": 416
},
{
"entropy": 0.4774542450904846,
"epoch": 20.85,
"grad_norm": 3.8276429176330566,
"learning_rate": 1e-06,
"loss": 0.0906,
"mean_token_accuracy": 0.9607046246528625,
"num_tokens": 109275304.0,
"step": 417
},
{
"entropy": 0.47494709491729736,
"epoch": 20.9,
"grad_norm": 4.62904167175293,
"learning_rate": 1e-06,
"loss": 0.0916,
"mean_token_accuracy": 0.9534883499145508,
"num_tokens": 109537356.0,
"step": 418
},
{
"entropy": 0.4711452126502991,
"epoch": 20.95,
"grad_norm": 4.15134334564209,
"learning_rate": 1e-06,
"loss": 0.0912,
"mean_token_accuracy": 0.9620253443717957,
"num_tokens": 109799431.0,
"step": 419
},
{
"entropy": 0.47310134768486023,
"epoch": 21.0,
"grad_norm": 6.700887680053711,
"learning_rate": 1e-06,
"loss": 0.0948,
"mean_token_accuracy": 0.9579694271087646,
"num_tokens": 110061460.0,
"step": 420
},
{
"epoch": 21.0,
"eval_entropy": 0.4740469455718994,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9059139490127563,
"eval_num_tokens": 110061460.0,
"eval_runtime": 0.5634,
"eval_samples_per_second": 443.746,
"eval_steps_per_second": 1.775,
"step": 420
},
{
"entropy": 0.470045268535614,
"epoch": 21.05,
"grad_norm": 7.74640417098999,
"learning_rate": 1e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9586715698242188,
"num_tokens": 110323507.0,
"step": 421
},
{
"entropy": 0.47086870670318604,
"epoch": 21.1,
"grad_norm": 4.6416754722595215,
"learning_rate": 1e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9621498584747314,
"num_tokens": 110585553.0,
"step": 422
},
{
"entropy": 0.4706752896308899,
"epoch": 21.15,
"grad_norm": 4.6220703125,
"learning_rate": 1e-06,
"loss": 0.1057,
"mean_token_accuracy": 0.952162504196167,
"num_tokens": 110847573.0,
"step": 423
},
{
"entropy": 0.4702892303466797,
"epoch": 21.2,
"grad_norm": 4.489948272705078,
"learning_rate": 1e-06,
"loss": 0.0966,
"mean_token_accuracy": 0.9588235020637512,
"num_tokens": 111109630.0,
"step": 424
},
{
"entropy": 0.46930834650993347,
"epoch": 21.25,
"grad_norm": 3.6754980087280273,
"learning_rate": 1e-06,
"loss": 0.0863,
"mean_token_accuracy": 0.9629878997802734,
"num_tokens": 111371709.0,
"step": 425
},
{
"entropy": 0.4697002172470093,
"epoch": 21.3,
"grad_norm": 4.992099285125732,
"learning_rate": 1e-06,
"loss": 0.0925,
"mean_token_accuracy": 0.9623864889144897,
"num_tokens": 111633745.0,
"step": 426
},
{
"entropy": 0.469596803188324,
"epoch": 21.35,
"grad_norm": 5.482630729675293,
"learning_rate": 1e-06,
"loss": 0.0905,
"mean_token_accuracy": 0.9579145908355713,
"num_tokens": 111895798.0,
"step": 427
},
{
"entropy": 0.47250843048095703,
"epoch": 21.4,
"grad_norm": 4.3867716789245605,
"learning_rate": 1e-06,
"loss": 0.0899,
"mean_token_accuracy": 0.9563699960708618,
"num_tokens": 112157846.0,
"step": 428
},
{
"entropy": 0.4697571396827698,
"epoch": 21.45,
"grad_norm": 4.48779296875,
"learning_rate": 1e-06,
"loss": 0.0962,
"mean_token_accuracy": 0.9523077011108398,
"num_tokens": 112419937.0,
"step": 429
},
{
"entropy": 0.47207871079444885,
"epoch": 21.5,
"grad_norm": 4.785567760467529,
"learning_rate": 1e-06,
"loss": 0.0903,
"mean_token_accuracy": 0.9592496752738953,
"num_tokens": 112681977.0,
"step": 430
},
{
"entropy": 0.47401899099349976,
"epoch": 21.55,
"grad_norm": 4.775023460388184,
"learning_rate": 1e-06,
"loss": 0.0924,
"mean_token_accuracy": 0.9604700803756714,
"num_tokens": 112944013.0,
"step": 431
},
{
"entropy": 0.4714542031288147,
"epoch": 21.6,
"grad_norm": 3.748880624771118,
"learning_rate": 1e-06,
"loss": 0.0885,
"mean_token_accuracy": 0.9565749168395996,
"num_tokens": 113206076.0,
"step": 432
},
{
"entropy": 0.4722508192062378,
"epoch": 21.65,
"grad_norm": 4.005458831787109,
"learning_rate": 1e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9660633206367493,
"num_tokens": 113468107.0,
"step": 433
},
{
"entropy": 0.47126466035842896,
"epoch": 21.7,
"grad_norm": 4.053618431091309,
"learning_rate": 1e-06,
"loss": 0.0888,
"mean_token_accuracy": 0.9616148471832275,
"num_tokens": 113730145.0,
"step": 434
},
{
"entropy": 0.4720988869667053,
"epoch": 21.75,
"grad_norm": 3.8416616916656494,
"learning_rate": 1e-06,
"loss": 0.0855,
"mean_token_accuracy": 0.9624871611595154,
"num_tokens": 113992189.0,
"step": 435
},
{
"entropy": 0.4716281294822693,
"epoch": 21.8,
"grad_norm": 4.562581539154053,
"learning_rate": 1e-06,
"loss": 0.0901,
"mean_token_accuracy": 0.9555829763412476,
"num_tokens": 114254238.0,
"step": 436
},
{
"entropy": 0.4716986119747162,
"epoch": 21.85,
"grad_norm": 4.10395622253418,
"learning_rate": 1e-06,
"loss": 0.0911,
"mean_token_accuracy": 0.9650474190711975,
"num_tokens": 114516321.0,
"step": 437
},
{
"entropy": 0.47269925475120544,
"epoch": 21.9,
"grad_norm": 4.068876266479492,
"learning_rate": 1e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9637249708175659,
"num_tokens": 114778365.0,
"step": 438
},
{
"entropy": 0.47152841091156006,
"epoch": 21.95,
"grad_norm": 5.2423095703125,
"learning_rate": 1e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9640921354293823,
"num_tokens": 115040434.0,
"step": 439
},
{
"entropy": 0.47160810232162476,
"epoch": 22.0,
"grad_norm": 5.08480167388916,
"learning_rate": 1e-06,
"loss": 0.0932,
"mean_token_accuracy": 0.9635722637176514,
"num_tokens": 115302465.0,
"step": 440
},
{
"epoch": 22.0,
"eval_entropy": 0.4727582335472107,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9119623899459839,
"eval_num_tokens": 115302465.0,
"eval_runtime": 0.5604,
"eval_samples_per_second": 446.076,
"eval_steps_per_second": 1.784,
"step": 440
},
{
"entropy": 0.47174668312072754,
"epoch": 22.05,
"grad_norm": 3.2800099849700928,
"learning_rate": 1e-06,
"loss": 0.0718,
"mean_token_accuracy": 0.9681742191314697,
"num_tokens": 115564519.0,
"step": 441
},
{
"entropy": 0.47139155864715576,
"epoch": 22.1,
"grad_norm": 4.268564701080322,
"learning_rate": 1e-06,
"loss": 0.0982,
"mean_token_accuracy": 0.9564149975776672,
"num_tokens": 115826576.0,
"step": 442
},
{
"entropy": 0.47082388401031494,
"epoch": 22.15,
"grad_norm": 4.853943824768066,
"learning_rate": 1e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9619899392127991,
"num_tokens": 116088628.0,
"step": 443
},
{
"entropy": 0.4726426601409912,
"epoch": 22.2,
"grad_norm": 3.3755438327789307,
"learning_rate": 1e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9618708491325378,
"num_tokens": 116350660.0,
"step": 444
},
{
"entropy": 0.47002309560775757,
"epoch": 22.25,
"grad_norm": 3.2275404930114746,
"learning_rate": 1e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9626911282539368,
"num_tokens": 116612690.0,
"step": 445
},
{
"entropy": 0.4685373902320862,
"epoch": 22.3,
"grad_norm": 5.329719066619873,
"learning_rate": 1e-06,
"loss": 0.0913,
"mean_token_accuracy": 0.9623402953147888,
"num_tokens": 116874737.0,
"step": 446
},
{
"entropy": 0.4670417010784149,
"epoch": 22.35,
"grad_norm": 3.7413110733032227,
"learning_rate": 1e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9646719694137573,
"num_tokens": 117136816.0,
"step": 447
},
{
"entropy": 0.46998491883277893,
"epoch": 22.4,
"grad_norm": 2.7414612770080566,
"learning_rate": 1e-06,
"loss": 0.0775,
"mean_token_accuracy": 0.9648514986038208,
"num_tokens": 117398835.0,
"step": 448
},
{
"entropy": 0.4664579927921295,
"epoch": 22.45,
"grad_norm": 4.6384406089782715,
"learning_rate": 1e-06,
"loss": 0.073,
"mean_token_accuracy": 0.9710144996643066,
"num_tokens": 117660898.0,
"step": 449
},
{
"entropy": 0.46621328592300415,
"epoch": 22.5,
"grad_norm": 5.154250144958496,
"learning_rate": 1e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.9636255502700806,
"num_tokens": 117922970.0,
"step": 450
},
{
"entropy": 0.46722596883773804,
"epoch": 22.55,
"grad_norm": 6.065870761871338,
"learning_rate": 1e-06,
"loss": 0.0897,
"mean_token_accuracy": 0.9548147916793823,
"num_tokens": 118185045.0,
"step": 451
},
{
"entropy": 0.4652860760688782,
"epoch": 22.6,
"grad_norm": 4.755091190338135,
"learning_rate": 1e-06,
"loss": 0.0709,
"mean_token_accuracy": 0.9737588763237,
"num_tokens": 118447114.0,
"step": 452
},
{
"entropy": 0.4672521650791168,
"epoch": 22.65,
"grad_norm": 4.636857509613037,
"learning_rate": 1e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9595959782600403,
"num_tokens": 118709192.0,
"step": 453
},
{
"entropy": 0.46378546953201294,
"epoch": 22.7,
"grad_norm": 6.048754692077637,
"learning_rate": 1e-06,
"loss": 0.0929,
"mean_token_accuracy": 0.9589357972145081,
"num_tokens": 118971217.0,
"step": 454
},
{
"entropy": 0.4664173722267151,
"epoch": 22.75,
"grad_norm": 4.586204528808594,
"learning_rate": 1e-06,
"loss": 0.0926,
"mean_token_accuracy": 0.9641411304473877,
"num_tokens": 119233209.0,
"step": 455
},
{
"entropy": 0.46483659744262695,
"epoch": 22.8,
"grad_norm": 5.882786750793457,
"learning_rate": 1e-06,
"loss": 0.0874,
"mean_token_accuracy": 0.9628571271896362,
"num_tokens": 119495268.0,
"step": 456
},
{
"entropy": 0.46636223793029785,
"epoch": 22.85,
"grad_norm": 8.683144569396973,
"learning_rate": 1e-06,
"loss": 0.1024,
"mean_token_accuracy": 0.9540635943412781,
"num_tokens": 119757295.0,
"step": 457
},
{
"entropy": 0.4629400968551636,
"epoch": 22.9,
"grad_norm": 8.564299583435059,
"learning_rate": 1e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.9609755873680115,
"num_tokens": 120019382.0,
"step": 458
},
{
"entropy": 0.46590715646743774,
"epoch": 22.95,
"grad_norm": 4.7376275062561035,
"learning_rate": 1e-06,
"loss": 0.0898,
"mean_token_accuracy": 0.9637865424156189,
"num_tokens": 120281450.0,
"step": 459
},
{
"entropy": 0.466198205947876,
"epoch": 23.0,
"grad_norm": 4.441730976104736,
"learning_rate": 1e-06,
"loss": 0.0738,
"mean_token_accuracy": 0.9684579372406006,
"num_tokens": 120543491.0,
"step": 460
},
{
"epoch": 23.0,
"eval_entropy": 0.46709567308425903,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9045698642730713,
"eval_num_tokens": 120543491.0,
"eval_runtime": 0.6287,
"eval_samples_per_second": 397.673,
"eval_steps_per_second": 1.591,
"step": 460
},
{
"entropy": 0.46561282873153687,
"epoch": 23.05,
"grad_norm": 3.567425012588501,
"learning_rate": 1e-06,
"loss": 0.0809,
"mean_token_accuracy": 0.9651972055435181,
"num_tokens": 120805544.0,
"step": 461
},
{
"entropy": 0.46857595443725586,
"epoch": 23.1,
"grad_norm": 4.191952228546143,
"learning_rate": 1e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9606382846832275,
"num_tokens": 121067588.0,
"step": 462
},
{
"entropy": 0.4691683053970337,
"epoch": 23.15,
"grad_norm": 5.60888671875,
"learning_rate": 1e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9594594836235046,
"num_tokens": 121329651.0,
"step": 463
},
{
"entropy": 0.46791988611221313,
"epoch": 23.2,
"grad_norm": 5.512171745300293,
"learning_rate": 1e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9642616510391235,
"num_tokens": 121591694.0,
"step": 464
},
{
"entropy": 0.47018688917160034,
"epoch": 23.25,
"grad_norm": 6.818735122680664,
"learning_rate": 1e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.9616252779960632,
"num_tokens": 121853781.0,
"step": 465
},
{
"entropy": 0.4723391532897949,
"epoch": 23.3,
"grad_norm": 3.6118452548980713,
"learning_rate": 1e-06,
"loss": 0.076,
"mean_token_accuracy": 0.9650793671607971,
"num_tokens": 122115769.0,
"step": 466
},
{
"entropy": 0.4676530957221985,
"epoch": 23.35,
"grad_norm": 4.851842880249023,
"learning_rate": 1e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9640449285507202,
"num_tokens": 122377862.0,
"step": 467
},
{
"entropy": 0.46646612882614136,
"epoch": 23.4,
"grad_norm": 5.6723175048828125,
"learning_rate": 1e-06,
"loss": 0.0904,
"mean_token_accuracy": 0.9647058844566345,
"num_tokens": 122639914.0,
"step": 468
},
{
"entropy": 0.46782416105270386,
"epoch": 23.45,
"grad_norm": 6.064637184143066,
"learning_rate": 1e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9623545408248901,
"num_tokens": 122901968.0,
"step": 469
},
{
"entropy": 0.4674442410469055,
"epoch": 23.5,
"grad_norm": 5.13816499710083,
"learning_rate": 1e-06,
"loss": 0.0889,
"mean_token_accuracy": 0.9569685459136963,
"num_tokens": 123164019.0,
"step": 470
},
{
"entropy": 0.4655379354953766,
"epoch": 23.55,
"grad_norm": 5.55079984664917,
"learning_rate": 1e-06,
"loss": 0.0855,
"mean_token_accuracy": 0.9623115658760071,
"num_tokens": 123426103.0,
"step": 471
},
{
"entropy": 0.4674234986305237,
"epoch": 23.6,
"grad_norm": 4.348241806030273,
"learning_rate": 1e-06,
"loss": 0.0795,
"mean_token_accuracy": 0.9644389748573303,
"num_tokens": 123688162.0,
"step": 472
},
{
"entropy": 0.4678855538368225,
"epoch": 23.65,
"grad_norm": 4.124541282653809,
"learning_rate": 1e-06,
"loss": 0.065,
"mean_token_accuracy": 0.9705128073692322,
"num_tokens": 123950216.0,
"step": 473
},
{
"entropy": 0.4669386148452759,
"epoch": 23.7,
"grad_norm": 4.676552772521973,
"learning_rate": 1e-06,
"loss": 0.0745,
"mean_token_accuracy": 0.970704197883606,
"num_tokens": 124212287.0,
"step": 474
},
{
"entropy": 0.46618372201919556,
"epoch": 23.75,
"grad_norm": 4.589600086212158,
"learning_rate": 1e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9622377753257751,
"num_tokens": 124474343.0,
"step": 475
},
{
"entropy": 0.4674683213233948,
"epoch": 23.8,
"grad_norm": 5.328636646270752,
"learning_rate": 1e-06,
"loss": 0.0744,
"mean_token_accuracy": 0.9667332172393799,
"num_tokens": 124736406.0,
"step": 476
},
{
"entropy": 0.4664192497730255,
"epoch": 23.85,
"grad_norm": 6.037339210510254,
"learning_rate": 1e-06,
"loss": 0.0935,
"mean_token_accuracy": 0.9550173282623291,
"num_tokens": 124998444.0,
"step": 477
},
{
"entropy": 0.46771666407585144,
"epoch": 23.9,
"grad_norm": 5.8049468994140625,
"learning_rate": 1e-06,
"loss": 0.0897,
"mean_token_accuracy": 0.9661781191825867,
"num_tokens": 125260481.0,
"step": 478
},
{
"entropy": 0.46755754947662354,
"epoch": 23.95,
"grad_norm": 6.086460113525391,
"learning_rate": 1e-06,
"loss": 0.1008,
"mean_token_accuracy": 0.9578189253807068,
"num_tokens": 125522490.0,
"step": 479
},
{
"entropy": 0.4669610261917114,
"epoch": 24.0,
"grad_norm": 4.249874114990234,
"learning_rate": 1e-06,
"loss": 0.0785,
"mean_token_accuracy": 0.9679803252220154,
"num_tokens": 125784509.0,
"step": 480
},
{
"epoch": 24.0,
"eval_entropy": 0.46940919756889343,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9126344323158264,
"eval_num_tokens": 125784509.0,
"eval_runtime": 0.5641,
"eval_samples_per_second": 443.21,
"eval_steps_per_second": 1.773,
"step": 480
},
{
"entropy": 0.4667162299156189,
"epoch": 24.05,
"grad_norm": 3.9976305961608887,
"learning_rate": 1e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9649001955986023,
"num_tokens": 126046588.0,
"step": 481
},
{
"entropy": 0.46550101041793823,
"epoch": 24.1,
"grad_norm": 3.9286561012268066,
"learning_rate": 1e-06,
"loss": 0.0735,
"mean_token_accuracy": 0.968622088432312,
"num_tokens": 126308647.0,
"step": 482
},
{
"entropy": 0.46659862995147705,
"epoch": 24.15,
"grad_norm": 4.12390661239624,
"learning_rate": 1e-06,
"loss": 0.0806,
"mean_token_accuracy": 0.9664310812950134,
"num_tokens": 126570674.0,
"step": 483
},
{
"entropy": 0.4668567180633545,
"epoch": 24.2,
"grad_norm": 4.626502990722656,
"learning_rate": 1e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9562251567840576,
"num_tokens": 126832696.0,
"step": 484
},
{
"entropy": 0.4655776619911194,
"epoch": 24.25,
"grad_norm": 6.302225589752197,
"learning_rate": 1e-06,
"loss": 0.0797,
"mean_token_accuracy": 0.9599271416664124,
"num_tokens": 127094738.0,
"step": 485
},
{
"entropy": 0.4636232852935791,
"epoch": 24.3,
"grad_norm": 6.734894752502441,
"learning_rate": 1e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9588276147842407,
"num_tokens": 127356764.0,
"step": 486
},
{
"entropy": 0.4671253561973572,
"epoch": 24.35,
"grad_norm": 4.378500938415527,
"learning_rate": 1e-06,
"loss": 0.0689,
"mean_token_accuracy": 0.9688196182250977,
"num_tokens": 127618790.0,
"step": 487
},
{
"entropy": 0.4644678235054016,
"epoch": 24.4,
"grad_norm": 5.7774858474731445,
"learning_rate": 1e-06,
"loss": 0.0958,
"mean_token_accuracy": 0.9631399512290955,
"num_tokens": 127880881.0,
"step": 488
},
{
"entropy": 0.4653833210468292,
"epoch": 24.45,
"grad_norm": 3.9272964000701904,
"learning_rate": 1e-06,
"loss": 0.0625,
"mean_token_accuracy": 0.9740871787071228,
"num_tokens": 128142908.0,
"step": 489
},
{
"entropy": 0.4655180275440216,
"epoch": 24.5,
"grad_norm": 4.535080909729004,
"learning_rate": 1e-06,
"loss": 0.0791,
"mean_token_accuracy": 0.969737708568573,
"num_tokens": 128404955.0,
"step": 490
},
{
"entropy": 0.46757546067237854,
"epoch": 24.55,
"grad_norm": 4.897022724151611,
"learning_rate": 1e-06,
"loss": 0.0694,
"mean_token_accuracy": 0.9702759981155396,
"num_tokens": 128667003.0,
"step": 491
},
{
"entropy": 0.466405987739563,
"epoch": 24.6,
"grad_norm": 4.4283037185668945,
"learning_rate": 1e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9647526741027832,
"num_tokens": 128929025.0,
"step": 492
},
{
"entropy": 0.46565863490104675,
"epoch": 24.65,
"grad_norm": 5.80818510055542,
"learning_rate": 1e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9615384340286255,
"num_tokens": 129191087.0,
"step": 493
},
{
"entropy": 0.4646396338939667,
"epoch": 24.7,
"grad_norm": 5.854940891265869,
"learning_rate": 1e-06,
"loss": 0.0726,
"mean_token_accuracy": 0.9698593616485596,
"num_tokens": 129453140.0,
"step": 494
},
{
"entropy": 0.46409285068511963,
"epoch": 24.75,
"grad_norm": 4.3521552085876465,
"learning_rate": 1e-06,
"loss": 0.071,
"mean_token_accuracy": 0.9737654328346252,
"num_tokens": 129715194.0,
"step": 495
},
{
"entropy": 0.46351325511932373,
"epoch": 24.8,
"grad_norm": 10.49264144897461,
"learning_rate": 1e-06,
"loss": 0.0912,
"mean_token_accuracy": 0.9609375,
"num_tokens": 129977261.0,
"step": 496
},
{
"entropy": 0.4629173278808594,
"epoch": 24.85,
"grad_norm": 5.705246448516846,
"learning_rate": 1e-06,
"loss": 0.0736,
"mean_token_accuracy": 0.9654731750488281,
"num_tokens": 130239319.0,
"step": 497
},
{
"entropy": 0.4622166156768799,
"epoch": 24.9,
"grad_norm": 4.9481706619262695,
"learning_rate": 1e-06,
"loss": 0.0768,
"mean_token_accuracy": 0.9647132754325867,
"num_tokens": 130501400.0,
"step": 498
},
{
"entropy": 0.4619133770465851,
"epoch": 24.95,
"grad_norm": 5.516783714294434,
"learning_rate": 1e-06,
"loss": 0.0972,
"mean_token_accuracy": 0.9551239013671875,
"num_tokens": 130763486.0,
"step": 499
},
{
"entropy": 0.46198371052742004,
"epoch": 25.0,
"grad_norm": 4.832233905792236,
"learning_rate": 1e-06,
"loss": 0.0775,
"mean_token_accuracy": 0.9691321849822998,
"num_tokens": 131025532.0,
"step": 500
},
{
"epoch": 25.0,
"eval_entropy": 0.464277058839798,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9025537371635437,
"eval_num_tokens": 131025532.0,
"eval_runtime": 0.564,
"eval_samples_per_second": 443.297,
"eval_steps_per_second": 1.773,
"step": 500
},
{
"entropy": 0.4616357684135437,
"epoch": 25.05,
"grad_norm": 5.327179431915283,
"learning_rate": 1e-06,
"loss": 0.0762,
"mean_token_accuracy": 0.965786337852478,
"num_tokens": 131287560.0,
"step": 501
},
{
"entropy": 0.4612486958503723,
"epoch": 25.1,
"grad_norm": 5.3239426612854,
"learning_rate": 1e-06,
"loss": 0.0772,
"mean_token_accuracy": 0.9653465151786804,
"num_tokens": 131549633.0,
"step": 502
},
{
"entropy": 0.4629809260368347,
"epoch": 25.15,
"grad_norm": 4.609165191650391,
"learning_rate": 1e-06,
"loss": 0.0789,
"mean_token_accuracy": 0.9663962721824646,
"num_tokens": 131811688.0,
"step": 503
},
{
"entropy": 0.46398937702178955,
"epoch": 25.2,
"grad_norm": 4.2075700759887695,
"learning_rate": 1e-06,
"loss": 0.0781,
"mean_token_accuracy": 0.9686935544013977,
"num_tokens": 132073678.0,
"step": 504
},
{
"entropy": 0.46296095848083496,
"epoch": 25.25,
"grad_norm": 4.920988082885742,
"learning_rate": 1e-06,
"loss": 0.0642,
"mean_token_accuracy": 0.9684147834777832,
"num_tokens": 132335718.0,
"step": 505
},
{
"entropy": 0.45965808629989624,
"epoch": 25.3,
"grad_norm": 3.9255125522613525,
"learning_rate": 1e-06,
"loss": 0.068,
"mean_token_accuracy": 0.9701306819915771,
"num_tokens": 132597786.0,
"step": 506
},
{
"entropy": 0.45947885513305664,
"epoch": 25.35,
"grad_norm": 4.092470169067383,
"learning_rate": 1e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9691497087478638,
"num_tokens": 132859840.0,
"step": 507
},
{
"entropy": 0.45919084548950195,
"epoch": 25.4,
"grad_norm": 4.688226699829102,
"learning_rate": 1e-06,
"loss": 0.071,
"mean_token_accuracy": 0.9719813466072083,
"num_tokens": 133121899.0,
"step": 508
},
{
"entropy": 0.4593312442302704,
"epoch": 25.45,
"grad_norm": 4.132238388061523,
"learning_rate": 1e-06,
"loss": 0.0813,
"mean_token_accuracy": 0.9599350094795227,
"num_tokens": 133383910.0,
"step": 509
},
{
"entropy": 0.45791468024253845,
"epoch": 25.5,
"grad_norm": 3.8919591903686523,
"learning_rate": 1e-06,
"loss": 0.0658,
"mean_token_accuracy": 0.9714285731315613,
"num_tokens": 133645956.0,
"step": 510
},
{
"entropy": 0.45953884720802307,
"epoch": 25.55,
"grad_norm": 6.311083793640137,
"learning_rate": 1e-06,
"loss": 0.0643,
"mean_token_accuracy": 0.9748603105545044,
"num_tokens": 133908014.0,
"step": 511
},
{
"entropy": 0.4577752947807312,
"epoch": 25.6,
"grad_norm": 11.283148765563965,
"learning_rate": 1e-06,
"loss": 0.0902,
"mean_token_accuracy": 0.9606496095657349,
"num_tokens": 134170076.0,
"step": 512
},
{
"entropy": 0.45760929584503174,
"epoch": 25.65,
"grad_norm": 4.889045715332031,
"learning_rate": 1e-06,
"loss": 0.0731,
"mean_token_accuracy": 0.9718985557556152,
"num_tokens": 134432128.0,
"step": 513
},
{
"entropy": 0.45951539278030396,
"epoch": 25.7,
"grad_norm": 4.273900508880615,
"learning_rate": 1e-06,
"loss": 0.0713,
"mean_token_accuracy": 0.9665513038635254,
"num_tokens": 134694191.0,
"step": 514
},
{
"entropy": 0.458004355430603,
"epoch": 25.75,
"grad_norm": 4.518304347991943,
"learning_rate": 1e-06,
"loss": 0.0696,
"mean_token_accuracy": 0.9705690145492554,
"num_tokens": 134956247.0,
"step": 515
},
{
"entropy": 0.4571762979030609,
"epoch": 25.8,
"grad_norm": 5.303156852722168,
"learning_rate": 1e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9643678069114685,
"num_tokens": 135218283.0,
"step": 516
},
{
"entropy": 0.457592248916626,
"epoch": 25.85,
"grad_norm": 4.145455837249756,
"learning_rate": 1e-06,
"loss": 0.0708,
"mean_token_accuracy": 0.9692658185958862,
"num_tokens": 135480369.0,
"step": 517
},
{
"entropy": 0.4571569561958313,
"epoch": 25.9,
"grad_norm": 5.37058162689209,
"learning_rate": 1e-06,
"loss": 0.0775,
"mean_token_accuracy": 0.9683794379234314,
"num_tokens": 135742414.0,
"step": 518
},
{
"entropy": 0.45553267002105713,
"epoch": 25.95,
"grad_norm": 6.640298843383789,
"learning_rate": 1e-06,
"loss": 0.0802,
"mean_token_accuracy": 0.9627623558044434,
"num_tokens": 136004484.0,
"step": 519
},
{
"entropy": 0.45771491527557373,
"epoch": 26.0,
"grad_norm": 5.2289958000183105,
"learning_rate": 1e-06,
"loss": 0.0812,
"mean_token_accuracy": 0.9678688645362854,
"num_tokens": 136266536.0,
"step": 520
},
{
"epoch": 26.0,
"eval_entropy": 0.45934179425239563,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9032257795333862,
"eval_num_tokens": 136266536.0,
"eval_runtime": 0.563,
"eval_samples_per_second": 444.014,
"eval_steps_per_second": 1.776,
"step": 520
},
{
"entropy": 0.4587504267692566,
"epoch": 26.05,
"grad_norm": 3.8674097061157227,
"learning_rate": 1e-06,
"loss": 0.0728,
"mean_token_accuracy": 0.9732477068901062,
"num_tokens": 136528569.0,
"step": 521
},
{
"entropy": 0.45837363600730896,
"epoch": 26.1,
"grad_norm": 5.667929172515869,
"learning_rate": 1e-06,
"loss": 0.0698,
"mean_token_accuracy": 0.9693174958229065,
"num_tokens": 136790660.0,
"step": 522
},
{
"entropy": 0.45661094784736633,
"epoch": 26.15,
"grad_norm": 5.02635383605957,
"learning_rate": 1e-06,
"loss": 0.0727,
"mean_token_accuracy": 0.9678813815116882,
"num_tokens": 137052707.0,
"step": 523
},
{
"entropy": 0.4580841362476349,
"epoch": 26.2,
"grad_norm": 4.592870712280273,
"learning_rate": 1e-06,
"loss": 0.0736,
"mean_token_accuracy": 0.9665246605873108,
"num_tokens": 137314778.0,
"step": 524
},
{
"entropy": 0.45721596479415894,
"epoch": 26.25,
"grad_norm": 3.333099603652954,
"learning_rate": 1e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9748427867889404,
"num_tokens": 137576868.0,
"step": 525
},
{
"entropy": 0.4584483504295349,
"epoch": 26.3,
"grad_norm": 4.201588153839111,
"learning_rate": 1e-06,
"loss": 0.0764,
"mean_token_accuracy": 0.9658351540565491,
"num_tokens": 137838876.0,
"step": 526
},
{
"entropy": 0.45824259519577026,
"epoch": 26.35,
"grad_norm": 4.535568714141846,
"learning_rate": 1e-06,
"loss": 0.0677,
"mean_token_accuracy": 0.9732847809791565,
"num_tokens": 138100918.0,
"step": 527
},
{
"entropy": 0.45829930901527405,
"epoch": 26.4,
"grad_norm": 7.854061603546143,
"learning_rate": 1e-06,
"loss": 0.0634,
"mean_token_accuracy": 0.9765415787696838,
"num_tokens": 138362937.0,
"step": 528
},
{
"entropy": 0.4546545743942261,
"epoch": 26.45,
"grad_norm": 10.19962215423584,
"learning_rate": 1e-06,
"loss": 0.0741,
"mean_token_accuracy": 0.9672130942344666,
"num_tokens": 138625004.0,
"step": 529
},
{
"entropy": 0.4562210440635681,
"epoch": 26.5,
"grad_norm": 7.326644420623779,
"learning_rate": 1e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9659023880958557,
"num_tokens": 138887034.0,
"step": 530
},
{
"entropy": 0.4569784104824066,
"epoch": 26.55,
"grad_norm": 4.9447736740112305,
"learning_rate": 1e-06,
"loss": 0.0741,
"mean_token_accuracy": 0.9705063700675964,
"num_tokens": 139149061.0,
"step": 531
},
{
"entropy": 0.45219576358795166,
"epoch": 26.6,
"grad_norm": 3.8060805797576904,
"learning_rate": 1e-06,
"loss": 0.0724,
"mean_token_accuracy": 0.9715009331703186,
"num_tokens": 139411134.0,
"step": 532
},
{
"entropy": 0.45544517040252686,
"epoch": 26.65,
"grad_norm": 6.335866928100586,
"learning_rate": 1e-06,
"loss": 0.0774,
"mean_token_accuracy": 0.9698432087898254,
"num_tokens": 139673154.0,
"step": 533
},
{
"entropy": 0.4518481492996216,
"epoch": 26.7,
"grad_norm": 6.290351867675781,
"learning_rate": 1e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9635453820228577,
"num_tokens": 139935245.0,
"step": 534
},
{
"entropy": 0.45323824882507324,
"epoch": 26.75,
"grad_norm": 7.986852169036865,
"learning_rate": 1e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9629629850387573,
"num_tokens": 140197320.0,
"step": 535
},
{
"entropy": 0.45260536670684814,
"epoch": 26.8,
"grad_norm": 6.8090105056762695,
"learning_rate": 1e-06,
"loss": 0.0767,
"mean_token_accuracy": 0.9647606611251831,
"num_tokens": 140459384.0,
"step": 536
},
{
"entropy": 0.45437586307525635,
"epoch": 26.85,
"grad_norm": 5.623941898345947,
"learning_rate": 1e-06,
"loss": 0.0723,
"mean_token_accuracy": 0.9687874913215637,
"num_tokens": 140721412.0,
"step": 537
},
{
"entropy": 0.45372387766838074,
"epoch": 26.9,
"grad_norm": 6.018904209136963,
"learning_rate": 1e-06,
"loss": 0.0726,
"mean_token_accuracy": 0.96875,
"num_tokens": 140983473.0,
"step": 538
},
{
"entropy": 0.4547538161277771,
"epoch": 26.95,
"grad_norm": 4.399332046508789,
"learning_rate": 1e-06,
"loss": 0.066,
"mean_token_accuracy": 0.9699872136116028,
"num_tokens": 141245533.0,
"step": 539
},
{
"entropy": 0.4539608359336853,
"epoch": 27.0,
"grad_norm": 4.773989677429199,
"learning_rate": 1e-06,
"loss": 0.0754,
"mean_token_accuracy": 0.9662853479385376,
"num_tokens": 141507556.0,
"step": 540
},
{
"epoch": 27.0,
"eval_entropy": 0.4568031132221222,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9045698642730713,
"eval_num_tokens": 141507556.0,
"eval_runtime": 0.5608,
"eval_samples_per_second": 445.81,
"eval_steps_per_second": 1.783,
"step": 540
},
{
"entropy": 0.4549258053302765,
"epoch": 27.05,
"grad_norm": 5.992208957672119,
"learning_rate": 1e-06,
"loss": 0.0696,
"mean_token_accuracy": 0.9692832827568054,
"num_tokens": 141769581.0,
"step": 541
},
{
"entropy": 0.45245492458343506,
"epoch": 27.1,
"grad_norm": 5.600193500518799,
"learning_rate": 1e-06,
"loss": 0.0684,
"mean_token_accuracy": 0.9731225371360779,
"num_tokens": 142031637.0,
"step": 542
},
{
"entropy": 0.453826367855072,
"epoch": 27.15,
"grad_norm": 6.150087356567383,
"learning_rate": 1e-06,
"loss": 0.0743,
"mean_token_accuracy": 0.9659667015075684,
"num_tokens": 142293677.0,
"step": 543
},
{
"entropy": 0.45176124572753906,
"epoch": 27.2,
"grad_norm": 5.019855976104736,
"learning_rate": 1e-06,
"loss": 0.0599,
"mean_token_accuracy": 0.9740341901779175,
"num_tokens": 142555717.0,
"step": 544
},
{
"entropy": 0.45249661803245544,
"epoch": 27.25,
"grad_norm": 3.5191256999969482,
"learning_rate": 1e-06,
"loss": 0.0639,
"mean_token_accuracy": 0.9764208197593689,
"num_tokens": 142817799.0,
"step": 545
},
{
"entropy": 0.45174354314804077,
"epoch": 27.3,
"grad_norm": 5.357618808746338,
"learning_rate": 1e-06,
"loss": 0.0751,
"mean_token_accuracy": 0.9682329297065735,
"num_tokens": 143079903.0,
"step": 546
},
{
"entropy": 0.4494438171386719,
"epoch": 27.35,
"grad_norm": 3.706282377243042,
"learning_rate": 1e-06,
"loss": 0.0693,
"mean_token_accuracy": 0.9701896905899048,
"num_tokens": 143341978.0,
"step": 547
},
{
"entropy": 0.4479014277458191,
"epoch": 27.4,
"grad_norm": 7.060003280639648,
"learning_rate": 1e-06,
"loss": 0.0689,
"mean_token_accuracy": 0.973009467124939,
"num_tokens": 143604053.0,
"step": 548
},
{
"entropy": 0.44879603385925293,
"epoch": 27.45,
"grad_norm": 6.722444534301758,
"learning_rate": 1e-06,
"loss": 0.0649,
"mean_token_accuracy": 0.974518358707428,
"num_tokens": 143866123.0,
"step": 549
},
{
"entropy": 0.4483737051486969,
"epoch": 27.5,
"grad_norm": 6.251794815063477,
"learning_rate": 1e-06,
"loss": 0.0762,
"mean_token_accuracy": 0.9707950353622437,
"num_tokens": 144128136.0,
"step": 550
},
{
"entropy": 0.4492543339729309,
"epoch": 27.55,
"grad_norm": 4.857920169830322,
"learning_rate": 1e-06,
"loss": 0.059,
"mean_token_accuracy": 0.973440408706665,
"num_tokens": 144390216.0,
"step": 551
},
{
"entropy": 0.448012113571167,
"epoch": 27.6,
"grad_norm": 4.787059307098389,
"learning_rate": 1e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.968769907951355,
"num_tokens": 144652246.0,
"step": 552
},
{
"entropy": 0.4510306119918823,
"epoch": 27.65,
"grad_norm": 7.7825727462768555,
"learning_rate": 1e-06,
"loss": 0.0957,
"mean_token_accuracy": 0.9633767604827881,
"num_tokens": 144914285.0,
"step": 553
},
{
"entropy": 0.4503524899482727,
"epoch": 27.7,
"grad_norm": 6.944081783294678,
"learning_rate": 1e-06,
"loss": 0.0658,
"mean_token_accuracy": 0.9696066975593567,
"num_tokens": 145176325.0,
"step": 554
},
{
"entropy": 0.45102667808532715,
"epoch": 27.75,
"grad_norm": 6.495534896850586,
"learning_rate": 1e-06,
"loss": 0.066,
"mean_token_accuracy": 0.9710467457771301,
"num_tokens": 145438351.0,
"step": 555
},
{
"entropy": 0.45106881856918335,
"epoch": 27.8,
"grad_norm": 5.490335464477539,
"learning_rate": 1e-06,
"loss": 0.0585,
"mean_token_accuracy": 0.9763991832733154,
"num_tokens": 145700394.0,
"step": 556
},
{
"entropy": 0.4523155689239502,
"epoch": 27.85,
"grad_norm": 5.689394950866699,
"learning_rate": 1e-06,
"loss": 0.0652,
"mean_token_accuracy": 0.9750356674194336,
"num_tokens": 145962455.0,
"step": 557
},
{
"entropy": 0.4528997540473938,
"epoch": 27.9,
"grad_norm": 5.096836090087891,
"learning_rate": 1e-06,
"loss": 0.0661,
"mean_token_accuracy": 0.9740124940872192,
"num_tokens": 146224477.0,
"step": 558
},
{
"entropy": 0.4539637267589569,
"epoch": 27.95,
"grad_norm": 6.182263374328613,
"learning_rate": 1e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9613526463508606,
"num_tokens": 146486519.0,
"step": 559
},
{
"entropy": 0.4534997344017029,
"epoch": 28.0,
"grad_norm": 4.790921688079834,
"learning_rate": 1e-06,
"loss": 0.0804,
"mean_token_accuracy": 0.9655537605285645,
"num_tokens": 146748570.0,
"step": 560
},
{
"epoch": 28.0,
"eval_entropy": 0.4555630087852478,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9032257795333862,
"eval_num_tokens": 146748570.0,
"eval_runtime": 0.563,
"eval_samples_per_second": 444.059,
"eval_steps_per_second": 1.776,
"step": 560
},
{
"entropy": 0.4534584879875183,
"epoch": 28.05,
"grad_norm": 11.155506134033203,
"learning_rate": 1e-06,
"loss": 0.078,
"mean_token_accuracy": 0.9656893610954285,
"num_tokens": 147010634.0,
"step": 561
},
{
"entropy": 0.45180657505989075,
"epoch": 28.1,
"grad_norm": 5.834336280822754,
"learning_rate": 1e-06,
"loss": 0.0753,
"mean_token_accuracy": 0.9643248915672302,
"num_tokens": 147272686.0,
"step": 562
},
{
"entropy": 0.44956350326538086,
"epoch": 28.15,
"grad_norm": 5.324082851409912,
"learning_rate": 1e-06,
"loss": 0.0641,
"mean_token_accuracy": 0.9707057476043701,
"num_tokens": 147534715.0,
"step": 563
},
{
"entropy": 0.4504123628139496,
"epoch": 28.2,
"grad_norm": 5.74534273147583,
"learning_rate": 1e-06,
"loss": 0.0716,
"mean_token_accuracy": 0.9698461294174194,
"num_tokens": 147796768.0,
"step": 564
},
{
"entropy": 0.450857013463974,
"epoch": 28.25,
"grad_norm": 7.693669319152832,
"learning_rate": 1e-06,
"loss": 0.0629,
"mean_token_accuracy": 0.9717868566513062,
"num_tokens": 148058824.0,
"step": 565
},
{
"entropy": 0.4519267976284027,
"epoch": 28.3,
"grad_norm": 6.2823662757873535,
"learning_rate": 1e-06,
"loss": 0.0635,
"mean_token_accuracy": 0.9744499921798706,
"num_tokens": 148320892.0,
"step": 566
},
{
"entropy": 0.45379358530044556,
"epoch": 28.35,
"grad_norm": 4.106582164764404,
"learning_rate": 1e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9773442149162292,
"num_tokens": 148582909.0,
"step": 567
},
{
"entropy": 0.4516447186470032,
"epoch": 28.4,
"grad_norm": 3.7747578620910645,
"learning_rate": 1e-06,
"loss": 0.0601,
"mean_token_accuracy": 0.9765396118164062,
"num_tokens": 148844998.0,
"step": 568
},
{
"entropy": 0.4513830542564392,
"epoch": 28.45,
"grad_norm": 6.135379791259766,
"learning_rate": 1e-06,
"loss": 0.0704,
"mean_token_accuracy": 0.9721642136573792,
"num_tokens": 149107061.0,
"step": 569
},
{
"entropy": 0.4540482759475708,
"epoch": 28.5,
"grad_norm": 5.572679042816162,
"learning_rate": 1e-06,
"loss": 0.0591,
"mean_token_accuracy": 0.977356493473053,
"num_tokens": 149369124.0,
"step": 570
},
{
"entropy": 0.4550696015357971,
"epoch": 28.55,
"grad_norm": 4.970890998840332,
"learning_rate": 1e-06,
"loss": 0.076,
"mean_token_accuracy": 0.9676320552825928,
"num_tokens": 149631148.0,
"step": 571
},
{
"entropy": 0.4574969410896301,
"epoch": 28.6,
"grad_norm": 6.8962483406066895,
"learning_rate": 1e-06,
"loss": 0.0753,
"mean_token_accuracy": 0.9676133394241333,
"num_tokens": 149893154.0,
"step": 572
},
{
"entropy": 0.4522814452648163,
"epoch": 28.65,
"grad_norm": 7.506436347961426,
"learning_rate": 1e-06,
"loss": 0.0743,
"mean_token_accuracy": 0.96835857629776,
"num_tokens": 150155231.0,
"step": 573
},
{
"entropy": 0.45204830169677734,
"epoch": 28.7,
"grad_norm": 3.9133265018463135,
"learning_rate": 1e-06,
"loss": 0.0534,
"mean_token_accuracy": 0.9761350154876709,
"num_tokens": 150417278.0,
"step": 574
},
{
"entropy": 0.4525344967842102,
"epoch": 28.75,
"grad_norm": 4.4698967933654785,
"learning_rate": 1e-06,
"loss": 0.0738,
"mean_token_accuracy": 0.9701104760169983,
"num_tokens": 150679278.0,
"step": 575
},
{
"entropy": 0.4537837505340576,
"epoch": 28.8,
"grad_norm": 8.1281156539917,
"learning_rate": 1e-06,
"loss": 0.0671,
"mean_token_accuracy": 0.9691211581230164,
"num_tokens": 150941324.0,
"step": 576
},
{
"entropy": 0.449785053730011,
"epoch": 28.85,
"grad_norm": 6.921140670776367,
"learning_rate": 1e-06,
"loss": 0.048,
"mean_token_accuracy": 0.9780303239822388,
"num_tokens": 151203402.0,
"step": 577
},
{
"entropy": 0.45099085569381714,
"epoch": 28.9,
"grad_norm": 5.108087539672852,
"learning_rate": 1e-06,
"loss": 0.0702,
"mean_token_accuracy": 0.9698275923728943,
"num_tokens": 151465454.0,
"step": 578
},
{
"entropy": 0.4535222053527832,
"epoch": 28.95,
"grad_norm": 7.629486560821533,
"learning_rate": 1e-06,
"loss": 0.0768,
"mean_token_accuracy": 0.9671140909194946,
"num_tokens": 151727537.0,
"step": 579
},
{
"entropy": 0.4530973434448242,
"epoch": 29.0,
"grad_norm": 6.288832187652588,
"learning_rate": 1e-06,
"loss": 0.0673,
"mean_token_accuracy": 0.9699453711509705,
"num_tokens": 151989594.0,
"step": 580
},
{
"epoch": 29.0,
"eval_entropy": 0.45486804842948914,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9005376100540161,
"eval_num_tokens": 151989594.0,
"eval_runtime": 0.5674,
"eval_samples_per_second": 440.627,
"eval_steps_per_second": 1.763,
"step": 580
},
{
"entropy": 0.45344972610473633,
"epoch": 29.05,
"grad_norm": 3.3168556690216064,
"learning_rate": 1e-06,
"loss": 0.0725,
"mean_token_accuracy": 0.9668222069740295,
"num_tokens": 152251621.0,
"step": 581
},
{
"entropy": 0.4522762894630432,
"epoch": 29.1,
"grad_norm": 5.237521171569824,
"learning_rate": 1e-06,
"loss": 0.0749,
"mean_token_accuracy": 0.973809540271759,
"num_tokens": 152513705.0,
"step": 582
},
{
"entropy": 0.45483410358428955,
"epoch": 29.15,
"grad_norm": 5.303705215454102,
"learning_rate": 1e-06,
"loss": 0.0697,
"mean_token_accuracy": 0.9701678156852722,
"num_tokens": 152775775.0,
"step": 583
},
{
"entropy": 0.4540916681289673,
"epoch": 29.2,
"grad_norm": 5.1602630615234375,
"learning_rate": 1e-06,
"loss": 0.0507,
"mean_token_accuracy": 0.9763142466545105,
"num_tokens": 153037802.0,
"step": 584
},
{
"entropy": 0.4509674608707428,
"epoch": 29.25,
"grad_norm": 4.621006965637207,
"learning_rate": 1e-06,
"loss": 0.052,
"mean_token_accuracy": 0.9785344004631042,
"num_tokens": 153299878.0,
"step": 585
},
{
"entropy": 0.4527459144592285,
"epoch": 29.3,
"grad_norm": 5.821897506713867,
"learning_rate": 1e-06,
"loss": 0.0538,
"mean_token_accuracy": 0.9764150977134705,
"num_tokens": 153561922.0,
"step": 586
},
{
"entropy": 0.4510110914707184,
"epoch": 29.35,
"grad_norm": 9.293543815612793,
"learning_rate": 1e-06,
"loss": 0.0557,
"mean_token_accuracy": 0.9750346541404724,
"num_tokens": 153823957.0,
"step": 587
},
{
"entropy": 0.45252758264541626,
"epoch": 29.4,
"grad_norm": 5.370669364929199,
"learning_rate": 1e-06,
"loss": 0.0759,
"mean_token_accuracy": 0.9652448892593384,
"num_tokens": 154086047.0,
"step": 588
},
{
"entropy": 0.45324066281318665,
"epoch": 29.45,
"grad_norm": 6.146717071533203,
"learning_rate": 1e-06,
"loss": 0.0568,
"mean_token_accuracy": 0.97549968957901,
"num_tokens": 154348125.0,
"step": 589
},
{
"entropy": 0.45057687163352966,
"epoch": 29.5,
"grad_norm": 9.826089859008789,
"learning_rate": 1e-06,
"loss": 0.0622,
"mean_token_accuracy": 0.972470223903656,
"num_tokens": 154610194.0,
"step": 590
},
{
"entropy": 0.45304763317108154,
"epoch": 29.55,
"grad_norm": 6.804394721984863,
"learning_rate": 1e-06,
"loss": 0.0695,
"mean_token_accuracy": 0.9685990214347839,
"num_tokens": 154872230.0,
"step": 591
},
{
"entropy": 0.4490482807159424,
"epoch": 29.6,
"grad_norm": 8.995187759399414,
"learning_rate": 1e-06,
"loss": 0.0624,
"mean_token_accuracy": 0.9748928546905518,
"num_tokens": 155134258.0,
"step": 592
},
{
"entropy": 0.44886791706085205,
"epoch": 29.65,
"grad_norm": 10.23698616027832,
"learning_rate": 1e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9673491716384888,
"num_tokens": 155396295.0,
"step": 593
},
{
"entropy": 0.45058315992355347,
"epoch": 29.7,
"grad_norm": 10.80184555053711,
"learning_rate": 1e-06,
"loss": 0.0688,
"mean_token_accuracy": 0.9761580228805542,
"num_tokens": 155658323.0,
"step": 594
},
{
"entropy": 0.4487557113170624,
"epoch": 29.75,
"grad_norm": 7.765211582183838,
"learning_rate": 1e-06,
"loss": 0.0508,
"mean_token_accuracy": 0.9813148975372314,
"num_tokens": 155920361.0,
"step": 595
},
{
"entropy": 0.44877922534942627,
"epoch": 29.8,
"grad_norm": 5.086024761199951,
"learning_rate": 1e-06,
"loss": 0.0614,
"mean_token_accuracy": 0.9721935987472534,
"num_tokens": 156182401.0,
"step": 596
},
{
"entropy": 0.44867563247680664,
"epoch": 29.85,
"grad_norm": 5.775504112243652,
"learning_rate": 1e-06,
"loss": 0.0674,
"mean_token_accuracy": 0.9715116024017334,
"num_tokens": 156444450.0,
"step": 597
},
{
"entropy": 0.4486447274684906,
"epoch": 29.9,
"grad_norm": 9.032340049743652,
"learning_rate": 1e-06,
"loss": 0.0724,
"mean_token_accuracy": 0.9696561098098755,
"num_tokens": 156706493.0,
"step": 598
},
{
"entropy": 0.4501435458660126,
"epoch": 29.95,
"grad_norm": 7.640662670135498,
"learning_rate": 1e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9632047414779663,
"num_tokens": 156968540.0,
"step": 599
},
{
"entropy": 0.4509955048561096,
"epoch": 30.0,
"grad_norm": 3.8913021087646484,
"learning_rate": 1e-06,
"loss": 0.0674,
"mean_token_accuracy": 0.9720497131347656,
"num_tokens": 157230603.0,
"step": 600
},
{
"epoch": 30.0,
"eval_entropy": 0.4519185423851013,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9072580933570862,
"eval_num_tokens": 157230603.0,
"eval_runtime": 0.5621,
"eval_samples_per_second": 444.74,
"eval_steps_per_second": 1.779,
"step": 600
},
{
"entropy": 0.44891834259033203,
"epoch": 30.05,
"grad_norm": 3.6025571823120117,
"learning_rate": 1e-06,
"loss": 0.0532,
"mean_token_accuracy": 0.9758663177490234,
"num_tokens": 157492680.0,
"step": 601
},
{
"entropy": 0.4488842785358429,
"epoch": 30.1,
"grad_norm": 4.095007419586182,
"learning_rate": 1e-06,
"loss": 0.0485,
"mean_token_accuracy": 0.9798251390457153,
"num_tokens": 157754760.0,
"step": 602
},
{
"entropy": 0.44925498962402344,
"epoch": 30.15,
"grad_norm": 4.470601558685303,
"learning_rate": 1e-06,
"loss": 0.0502,
"mean_token_accuracy": 0.9817113280296326,
"num_tokens": 158016818.0,
"step": 603
},
{
"entropy": 0.4520828127861023,
"epoch": 30.2,
"grad_norm": 6.587667465209961,
"learning_rate": 1e-06,
"loss": 0.0553,
"mean_token_accuracy": 0.9785202741622925,
"num_tokens": 158278889.0,
"step": 604
},
{
"entropy": 0.4510212242603302,
"epoch": 30.25,
"grad_norm": 4.29756498336792,
"learning_rate": 1e-06,
"loss": 0.0603,
"mean_token_accuracy": 0.9725936055183411,
"num_tokens": 158540879.0,
"step": 605
},
{
"entropy": 0.45233023166656494,
"epoch": 30.3,
"grad_norm": 5.920616149902344,
"learning_rate": 1e-06,
"loss": 0.0639,
"mean_token_accuracy": 0.9738602042198181,
"num_tokens": 158802919.0,
"step": 606
},
{
"entropy": 0.44946354627609253,
"epoch": 30.35,
"grad_norm": 8.821480751037598,
"learning_rate": 1e-06,
"loss": 0.0544,
"mean_token_accuracy": 0.9739726185798645,
"num_tokens": 159064939.0,
"step": 607
},
{
"entropy": 0.4503335952758789,
"epoch": 30.4,
"grad_norm": 5.011677265167236,
"learning_rate": 1e-06,
"loss": 0.0653,
"mean_token_accuracy": 0.9722222089767456,
"num_tokens": 159326987.0,
"step": 608
},
{
"entropy": 0.4475363492965698,
"epoch": 30.45,
"grad_norm": 4.521021842956543,
"learning_rate": 1e-06,
"loss": 0.0573,
"mean_token_accuracy": 0.9772393703460693,
"num_tokens": 159589074.0,
"step": 609
},
{
"entropy": 0.4487738609313965,
"epoch": 30.5,
"grad_norm": 4.860763072967529,
"learning_rate": 1e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9696969985961914,
"num_tokens": 159851152.0,
"step": 610
},
{
"entropy": 0.4483632445335388,
"epoch": 30.55,
"grad_norm": 7.282129287719727,
"learning_rate": 1e-06,
"loss": 0.0473,
"mean_token_accuracy": 0.9839181303977966,
"num_tokens": 160113212.0,
"step": 611
},
{
"entropy": 0.4463149607181549,
"epoch": 30.6,
"grad_norm": 7.292006969451904,
"learning_rate": 1e-06,
"loss": 0.0877,
"mean_token_accuracy": 0.9573770761489868,
"num_tokens": 160375297.0,
"step": 612
},
{
"entropy": 0.44664695858955383,
"epoch": 30.65,
"grad_norm": 6.004032135009766,
"learning_rate": 1e-06,
"loss": 0.0619,
"mean_token_accuracy": 0.9759036302566528,
"num_tokens": 160637336.0,
"step": 613
},
{
"entropy": 0.4448843002319336,
"epoch": 30.7,
"grad_norm": 7.444438934326172,
"learning_rate": 1e-06,
"loss": 0.0686,
"mean_token_accuracy": 0.9725457429885864,
"num_tokens": 160899395.0,
"step": 614
},
{
"entropy": 0.44413405656814575,
"epoch": 30.75,
"grad_norm": 7.9332804679870605,
"learning_rate": 1e-06,
"loss": 0.0583,
"mean_token_accuracy": 0.9747899174690247,
"num_tokens": 161161469.0,
"step": 615
},
{
"entropy": 0.4475647211074829,
"epoch": 30.8,
"grad_norm": 7.166046142578125,
"learning_rate": 1e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9761489033699036,
"num_tokens": 161423484.0,
"step": 616
},
{
"entropy": 0.44248896837234497,
"epoch": 30.85,
"grad_norm": 4.561134338378906,
"learning_rate": 1e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9668790102005005,
"num_tokens": 161685515.0,
"step": 617
},
{
"entropy": 0.4430382549762726,
"epoch": 30.9,
"grad_norm": 12.0337495803833,
"learning_rate": 1e-06,
"loss": 0.0723,
"mean_token_accuracy": 0.9708737730979919,
"num_tokens": 161947583.0,
"step": 618
},
{
"entropy": 0.44385209679603577,
"epoch": 30.95,
"grad_norm": 5.855442523956299,
"learning_rate": 1e-06,
"loss": 0.0771,
"mean_token_accuracy": 0.9631548523902893,
"num_tokens": 162209583.0,
"step": 619
},
{
"entropy": 0.4453125596046448,
"epoch": 31.0,
"grad_norm": 9.10976791381836,
"learning_rate": 1e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.96875,
"num_tokens": 162471638.0,
"step": 620
},
{
"epoch": 31.0,
"eval_entropy": 0.44676893949508667,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9065860509872437,
"eval_num_tokens": 162471638.0,
"eval_runtime": 0.5669,
"eval_samples_per_second": 440.995,
"eval_steps_per_second": 1.764,
"step": 620
},
{
"entropy": 0.4446421265602112,
"epoch": 31.05,
"grad_norm": 5.643488883972168,
"learning_rate": 1e-06,
"loss": 0.0501,
"mean_token_accuracy": 0.9764078855514526,
"num_tokens": 162733710.0,
"step": 621
},
{
"entropy": 0.4460245370864868,
"epoch": 31.1,
"grad_norm": 7.093123912811279,
"learning_rate": 1e-06,
"loss": 0.0643,
"mean_token_accuracy": 0.971222996711731,
"num_tokens": 162995785.0,
"step": 622
},
{
"entropy": 0.4435853958129883,
"epoch": 31.15,
"grad_norm": 7.843852519989014,
"learning_rate": 1e-06,
"loss": 0.06,
"mean_token_accuracy": 0.9783491492271423,
"num_tokens": 163257856.0,
"step": 623
},
{
"entropy": 0.44595205783843994,
"epoch": 31.2,
"grad_norm": 9.885993003845215,
"learning_rate": 1e-06,
"loss": 0.0622,
"mean_token_accuracy": 0.9701765179634094,
"num_tokens": 163519927.0,
"step": 624
},
{
"entropy": 0.4446321129798889,
"epoch": 31.25,
"grad_norm": 9.069074630737305,
"learning_rate": 1e-06,
"loss": 0.0682,
"mean_token_accuracy": 0.9672130942344666,
"num_tokens": 163781974.0,
"step": 625
},
{
"entropy": 0.44555115699768066,
"epoch": 31.3,
"grad_norm": 7.41979455947876,
"learning_rate": 1e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9729869961738586,
"num_tokens": 164043997.0,
"step": 626
},
{
"entropy": 0.44311100244522095,
"epoch": 31.35,
"grad_norm": 6.325224876403809,
"learning_rate": 1e-06,
"loss": 0.0643,
"mean_token_accuracy": 0.9761525988578796,
"num_tokens": 164306046.0,
"step": 627
},
{
"entropy": 0.44550782442092896,
"epoch": 31.4,
"grad_norm": 5.873880863189697,
"learning_rate": 1e-06,
"loss": 0.0489,
"mean_token_accuracy": 0.9810085296630859,
"num_tokens": 164568133.0,
"step": 628
},
{
"entropy": 0.44575005769729614,
"epoch": 31.45,
"grad_norm": 6.150679111480713,
"learning_rate": 1e-06,
"loss": 0.0692,
"mean_token_accuracy": 0.9675675630569458,
"num_tokens": 164830154.0,
"step": 629
},
{
"entropy": 0.44503775238990784,
"epoch": 31.5,
"grad_norm": 6.830018520355225,
"learning_rate": 1e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9699599742889404,
"num_tokens": 165092212.0,
"step": 630
},
{
"entropy": 0.44527047872543335,
"epoch": 31.55,
"grad_norm": 8.90091323852539,
"learning_rate": 1e-06,
"loss": 0.0623,
"mean_token_accuracy": 0.9720279574394226,
"num_tokens": 165354268.0,
"step": 631
},
{
"entropy": 0.4455764591693878,
"epoch": 31.6,
"grad_norm": 10.846599578857422,
"learning_rate": 1e-06,
"loss": 0.0505,
"mean_token_accuracy": 0.9804713726043701,
"num_tokens": 165616280.0,
"step": 632
},
{
"entropy": 0.44468122720718384,
"epoch": 31.65,
"grad_norm": 6.496395111083984,
"learning_rate": 1e-06,
"loss": 0.0701,
"mean_token_accuracy": 0.9713010191917419,
"num_tokens": 165878342.0,
"step": 633
},
{
"entropy": 0.4464184045791626,
"epoch": 31.7,
"grad_norm": 4.161031723022461,
"learning_rate": 1e-06,
"loss": 0.0486,
"mean_token_accuracy": 0.9813348650932312,
"num_tokens": 166140340.0,
"step": 634
},
{
"entropy": 0.4466201961040497,
"epoch": 31.75,
"grad_norm": 4.4769287109375,
"learning_rate": 1e-06,
"loss": 0.0646,
"mean_token_accuracy": 0.9720670580863953,
"num_tokens": 166402374.0,
"step": 635
},
{
"entropy": 0.4480331540107727,
"epoch": 31.8,
"grad_norm": 5.939889430999756,
"learning_rate": 1e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9711799621582031,
"num_tokens": 166664443.0,
"step": 636
},
{
"entropy": 0.4440794587135315,
"epoch": 31.85,
"grad_norm": 8.899001121520996,
"learning_rate": 1e-06,
"loss": 0.0615,
"mean_token_accuracy": 0.9747899174690247,
"num_tokens": 166926510.0,
"step": 637
},
{
"entropy": 0.4426196217536926,
"epoch": 31.9,
"grad_norm": 7.370424747467041,
"learning_rate": 1e-06,
"loss": 0.0638,
"mean_token_accuracy": 0.9732052683830261,
"num_tokens": 167188553.0,
"step": 638
},
{
"entropy": 0.4424576759338379,
"epoch": 31.95,
"grad_norm": 5.7402801513671875,
"learning_rate": 1e-06,
"loss": 0.0609,
"mean_token_accuracy": 0.9725839495658875,
"num_tokens": 167450605.0,
"step": 639
},
{
"entropy": 0.4405357241630554,
"epoch": 32.0,
"grad_norm": 6.917886734008789,
"learning_rate": 1e-06,
"loss": 0.0561,
"mean_token_accuracy": 0.9764705896377563,
"num_tokens": 167712685.0,
"step": 640
},
{
"epoch": 32.0,
"eval_entropy": 0.44569727778434753,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9086021780967712,
"eval_num_tokens": 167712685.0,
"eval_runtime": 0.5657,
"eval_samples_per_second": 441.926,
"eval_steps_per_second": 1.768,
"step": 640
},
{
"entropy": 0.44468456506729126,
"epoch": 32.05,
"grad_norm": 3.7619173526763916,
"learning_rate": 1e-06,
"loss": 0.0504,
"mean_token_accuracy": 0.9797172546386719,
"num_tokens": 167974740.0,
"step": 641
},
{
"entropy": 0.44157981872558594,
"epoch": 32.1,
"grad_norm": 6.412563323974609,
"learning_rate": 1e-06,
"loss": 0.0458,
"mean_token_accuracy": 0.9774086475372314,
"num_tokens": 168236805.0,
"step": 642
},
{
"entropy": 0.44227516651153564,
"epoch": 32.15,
"grad_norm": 7.628910064697266,
"learning_rate": 1e-06,
"loss": 0.065,
"mean_token_accuracy": 0.966926097869873,
"num_tokens": 168498874.0,
"step": 643
},
{
"entropy": 0.4421759843826294,
"epoch": 32.2,
"grad_norm": 4.509586811065674,
"learning_rate": 1e-06,
"loss": 0.0453,
"mean_token_accuracy": 0.9816176295280457,
"num_tokens": 168760901.0,
"step": 644
},
{
"entropy": 0.44034460186958313,
"epoch": 32.25,
"grad_norm": 4.948095798492432,
"learning_rate": 1e-06,
"loss": 0.0462,
"mean_token_accuracy": 0.9810659289360046,
"num_tokens": 169022986.0,
"step": 645
},
{
"entropy": 0.43732625246047974,
"epoch": 32.3,
"grad_norm": 6.055516719818115,
"learning_rate": 1e-06,
"loss": 0.0719,
"mean_token_accuracy": 0.969675600528717,
"num_tokens": 169285063.0,
"step": 646
},
{
"entropy": 0.44073206186294556,
"epoch": 32.35,
"grad_norm": 6.052289962768555,
"learning_rate": 1e-06,
"loss": 0.0546,
"mean_token_accuracy": 0.979825496673584,
"num_tokens": 169547127.0,
"step": 647
},
{
"entropy": 0.43953657150268555,
"epoch": 32.4,
"grad_norm": 5.309092044830322,
"learning_rate": 1e-06,
"loss": 0.0562,
"mean_token_accuracy": 0.977673351764679,
"num_tokens": 169809191.0,
"step": 648
},
{
"entropy": 0.4400402307510376,
"epoch": 32.45,
"grad_norm": 3.462468385696411,
"learning_rate": 1e-06,
"loss": 0.0536,
"mean_token_accuracy": 0.9798488616943359,
"num_tokens": 170071240.0,
"step": 649
},
{
"entropy": 0.4371636211872101,
"epoch": 32.5,
"grad_norm": 9.595351219177246,
"learning_rate": 1e-06,
"loss": 0.0584,
"mean_token_accuracy": 0.9729729890823364,
"num_tokens": 170333259.0,
"step": 650
},
{
"entropy": 0.43737897276878357,
"epoch": 32.55,
"grad_norm": 6.217532634735107,
"learning_rate": 1e-06,
"loss": 0.0549,
"mean_token_accuracy": 0.9765312075614929,
"num_tokens": 170595335.0,
"step": 651
},
{
"entropy": 0.4356423318386078,
"epoch": 32.6,
"grad_norm": 6.676355361938477,
"learning_rate": 1e-06,
"loss": 0.0569,
"mean_token_accuracy": 0.9793530702590942,
"num_tokens": 170857414.0,
"step": 652
},
{
"entropy": 0.43704909086227417,
"epoch": 32.65,
"grad_norm": 5.60529899597168,
"learning_rate": 1e-06,
"loss": 0.0594,
"mean_token_accuracy": 0.9732779264450073,
"num_tokens": 171119460.0,
"step": 653
},
{
"entropy": 0.43586838245391846,
"epoch": 32.7,
"grad_norm": 9.050461769104004,
"learning_rate": 1e-06,
"loss": 0.0717,
"mean_token_accuracy": 0.9660377502441406,
"num_tokens": 171381511.0,
"step": 654
},
{
"entropy": 0.43764546513557434,
"epoch": 32.75,
"grad_norm": 7.719151020050049,
"learning_rate": 1e-06,
"loss": 0.0519,
"mean_token_accuracy": 0.9790164232254028,
"num_tokens": 171643563.0,
"step": 655
},
{
"entropy": 0.4377431273460388,
"epoch": 32.8,
"grad_norm": 6.429050922393799,
"learning_rate": 1e-06,
"loss": 0.0613,
"mean_token_accuracy": 0.9767326712608337,
"num_tokens": 171905582.0,
"step": 656
},
{
"entropy": 0.4376789927482605,
"epoch": 32.85,
"grad_norm": 7.1417236328125,
"learning_rate": 1e-06,
"loss": 0.0622,
"mean_token_accuracy": 0.970708429813385,
"num_tokens": 172167610.0,
"step": 657
},
{
"entropy": 0.4353598952293396,
"epoch": 32.9,
"grad_norm": 6.490878105163574,
"learning_rate": 1e-06,
"loss": 0.0577,
"mean_token_accuracy": 0.9740012884140015,
"num_tokens": 172429615.0,
"step": 658
},
{
"entropy": 0.43777158856391907,
"epoch": 32.95,
"grad_norm": 8.160463333129883,
"learning_rate": 1e-06,
"loss": 0.0696,
"mean_token_accuracy": 0.9702098965644836,
"num_tokens": 172691685.0,
"step": 659
},
{
"entropy": 0.43828877806663513,
"epoch": 33.0,
"grad_norm": 7.0130391120910645,
"learning_rate": 1e-06,
"loss": 0.0501,
"mean_token_accuracy": 0.9810486435890198,
"num_tokens": 172953696.0,
"step": 660
},
{
"epoch": 33.0,
"eval_entropy": 0.4405567944049835,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9025537371635437,
"eval_num_tokens": 172953696.0,
"eval_runtime": 0.5665,
"eval_samples_per_second": 441.287,
"eval_steps_per_second": 1.765,
"step": 660
},
{
"entropy": 0.4386732578277588,
"epoch": 33.05,
"grad_norm": 4.480530261993408,
"learning_rate": 1e-06,
"loss": 0.062,
"mean_token_accuracy": 0.9707673788070679,
"num_tokens": 173215733.0,
"step": 661
},
{
"entropy": 0.4378315806388855,
"epoch": 33.1,
"grad_norm": 8.64880657196045,
"learning_rate": 1e-06,
"loss": 0.0588,
"mean_token_accuracy": 0.9721043109893799,
"num_tokens": 173477810.0,
"step": 662
},
{
"entropy": 0.438620388507843,
"epoch": 33.15,
"grad_norm": 6.820461750030518,
"learning_rate": 1e-06,
"loss": 0.0623,
"mean_token_accuracy": 0.973120927810669,
"num_tokens": 173739851.0,
"step": 663
},
{
"entropy": 0.4371694326400757,
"epoch": 33.2,
"grad_norm": 5.388397693634033,
"learning_rate": 1e-06,
"loss": 0.0438,
"mean_token_accuracy": 0.9832369685173035,
"num_tokens": 174001910.0,
"step": 664
},
{
"entropy": 0.4353873133659363,
"epoch": 33.25,
"grad_norm": 5.447931289672852,
"learning_rate": 1e-06,
"loss": 0.0507,
"mean_token_accuracy": 0.9806560277938843,
"num_tokens": 174263956.0,
"step": 665
},
{
"entropy": 0.43813008069992065,
"epoch": 33.3,
"grad_norm": 7.453225612640381,
"learning_rate": 1e-06,
"loss": 0.0583,
"mean_token_accuracy": 0.9732201099395752,
"num_tokens": 174525981.0,
"step": 666
},
{
"entropy": 0.4330652356147766,
"epoch": 33.35,
"grad_norm": 4.948639869689941,
"learning_rate": 1e-06,
"loss": 0.055,
"mean_token_accuracy": 0.9784615635871887,
"num_tokens": 174788067.0,
"step": 667
},
{
"entropy": 0.43555963039398193,
"epoch": 33.4,
"grad_norm": 9.127644538879395,
"learning_rate": 1e-06,
"loss": 0.0534,
"mean_token_accuracy": 0.9801324605941772,
"num_tokens": 175050118.0,
"step": 668
},
{
"entropy": 0.4352026581764221,
"epoch": 33.45,
"grad_norm": 10.331482887268066,
"learning_rate": 1e-06,
"loss": 0.0666,
"mean_token_accuracy": 0.9726110696792603,
"num_tokens": 175312189.0,
"step": 669
},
{
"entropy": 0.434938907623291,
"epoch": 33.5,
"grad_norm": 7.918214797973633,
"learning_rate": 1e-06,
"loss": 0.0586,
"mean_token_accuracy": 0.9758656620979309,
"num_tokens": 175574259.0,
"step": 670
},
{
"entropy": 0.43676304817199707,
"epoch": 33.55,
"grad_norm": 7.050904750823975,
"learning_rate": 1e-06,
"loss": 0.0636,
"mean_token_accuracy": 0.9727171659469604,
"num_tokens": 175836318.0,
"step": 671
},
{
"entropy": 0.4355317950248718,
"epoch": 33.6,
"grad_norm": 3.8058547973632812,
"learning_rate": 1e-06,
"loss": 0.0469,
"mean_token_accuracy": 0.9824120402336121,
"num_tokens": 176098338.0,
"step": 672
},
{
"entropy": 0.437080442905426,
"epoch": 33.65,
"grad_norm": 4.726657390594482,
"learning_rate": 1e-06,
"loss": 0.0504,
"mean_token_accuracy": 0.981028139591217,
"num_tokens": 176360400.0,
"step": 673
},
{
"entropy": 0.4355364441871643,
"epoch": 33.7,
"grad_norm": 4.205655097961426,
"learning_rate": 1e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9719271659851074,
"num_tokens": 176622443.0,
"step": 674
},
{
"entropy": 0.43918710947036743,
"epoch": 33.75,
"grad_norm": 8.278221130371094,
"learning_rate": 1e-06,
"loss": 0.0544,
"mean_token_accuracy": 0.9779179692268372,
"num_tokens": 176884456.0,
"step": 675
},
{
"entropy": 0.43781429529190063,
"epoch": 33.8,
"grad_norm": 10.086792945861816,
"learning_rate": 1e-06,
"loss": 0.0579,
"mean_token_accuracy": 0.9812734127044678,
"num_tokens": 177146486.0,
"step": 676
},
{
"entropy": 0.4368705749511719,
"epoch": 33.85,
"grad_norm": 7.0400567054748535,
"learning_rate": 1e-06,
"loss": 0.0574,
"mean_token_accuracy": 0.973964512348175,
"num_tokens": 177408538.0,
"step": 677
},
{
"entropy": 0.4386277198791504,
"epoch": 33.9,
"grad_norm": 5.393922805786133,
"learning_rate": 1e-06,
"loss": 0.0525,
"mean_token_accuracy": 0.9760109782218933,
"num_tokens": 177670590.0,
"step": 678
},
{
"entropy": 0.4350482225418091,
"epoch": 33.95,
"grad_norm": 8.861899375915527,
"learning_rate": 1e-06,
"loss": 0.0541,
"mean_token_accuracy": 0.9747545719146729,
"num_tokens": 177932675.0,
"step": 679
},
{
"entropy": 0.43752387166023254,
"epoch": 34.0,
"grad_norm": 8.595000267028809,
"learning_rate": 1e-06,
"loss": 0.0608,
"mean_token_accuracy": 0.9761388301849365,
"num_tokens": 178194717.0,
"step": 680
},
{
"epoch": 34.0,
"eval_entropy": 0.4389887750148773,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9045698642730713,
"eval_num_tokens": 178194717.0,
"eval_runtime": 0.565,
"eval_samples_per_second": 442.441,
"eval_steps_per_second": 1.77,
"step": 680
},
{
"entropy": 0.4371822476387024,
"epoch": 34.05,
"grad_norm": 5.267615795135498,
"learning_rate": 1e-06,
"loss": 0.0442,
"mean_token_accuracy": 0.9826968908309937,
"num_tokens": 178456788.0,
"step": 681
},
{
"entropy": 0.4385065734386444,
"epoch": 34.1,
"grad_norm": 6.0776567459106445,
"learning_rate": 1e-06,
"loss": 0.0497,
"mean_token_accuracy": 0.9802631735801697,
"num_tokens": 178718822.0,
"step": 682
},
{
"entropy": 0.43671929836273193,
"epoch": 34.15,
"grad_norm": 6.3837175369262695,
"learning_rate": 1e-06,
"loss": 0.0593,
"mean_token_accuracy": 0.9744361042976379,
"num_tokens": 178980877.0,
"step": 683
},
{
"entropy": 0.43611472845077515,
"epoch": 34.2,
"grad_norm": 8.407622337341309,
"learning_rate": 1e-06,
"loss": 0.0491,
"mean_token_accuracy": 0.9801462888717651,
"num_tokens": 179242922.0,
"step": 684
},
{
"entropy": 0.4359371066093445,
"epoch": 34.25,
"grad_norm": 5.882180213928223,
"learning_rate": 1e-06,
"loss": 0.074,
"mean_token_accuracy": 0.9692609310150146,
"num_tokens": 179504978.0,
"step": 685
},
{
"entropy": 0.4321635961532593,
"epoch": 34.3,
"grad_norm": 9.319916725158691,
"learning_rate": 1e-06,
"loss": 0.0675,
"mean_token_accuracy": 0.9795918464660645,
"num_tokens": 179767047.0,
"step": 686
},
{
"entropy": 0.4343339502811432,
"epoch": 34.35,
"grad_norm": 6.66765832901001,
"learning_rate": 1e-06,
"loss": 0.0455,
"mean_token_accuracy": 0.9825620651245117,
"num_tokens": 180029098.0,
"step": 687
},
{
"entropy": 0.43469831347465515,
"epoch": 34.4,
"grad_norm": 6.452228546142578,
"learning_rate": 1e-06,
"loss": 0.0577,
"mean_token_accuracy": 0.9775280952453613,
"num_tokens": 180291151.0,
"step": 688
},
{
"entropy": 0.43252041935920715,
"epoch": 34.45,
"grad_norm": 4.9461822509765625,
"learning_rate": 1e-06,
"loss": 0.0601,
"mean_token_accuracy": 0.9745628237724304,
"num_tokens": 180553169.0,
"step": 689
},
{
"entropy": 0.43243032693862915,
"epoch": 34.5,
"grad_norm": 13.353845596313477,
"learning_rate": 1e-06,
"loss": 0.0657,
"mean_token_accuracy": 0.9685197472572327,
"num_tokens": 180815222.0,
"step": 690
},
{
"entropy": 0.4328831434249878,
"epoch": 34.55,
"grad_norm": 10.329776763916016,
"learning_rate": 1e-06,
"loss": 0.0614,
"mean_token_accuracy": 0.9704106450080872,
"num_tokens": 181077273.0,
"step": 691
},
{
"entropy": 0.43350595235824585,
"epoch": 34.6,
"grad_norm": 8.362601280212402,
"learning_rate": 1e-06,
"loss": 0.0581,
"mean_token_accuracy": 0.9745671153068542,
"num_tokens": 181339285.0,
"step": 692
},
{
"entropy": 0.43465209007263184,
"epoch": 34.65,
"grad_norm": 4.411050796508789,
"learning_rate": 1e-06,
"loss": 0.0474,
"mean_token_accuracy": 0.980923056602478,
"num_tokens": 181601338.0,
"step": 693
},
{
"entropy": 0.4354056715965271,
"epoch": 34.7,
"grad_norm": 4.7629008293151855,
"learning_rate": 1e-06,
"loss": 0.0485,
"mean_token_accuracy": 0.980560302734375,
"num_tokens": 181863383.0,
"step": 694
},
{
"entropy": 0.43551138043403625,
"epoch": 34.75,
"grad_norm": 7.455565929412842,
"learning_rate": 1e-06,
"loss": 0.0518,
"mean_token_accuracy": 0.9773609042167664,
"num_tokens": 182125456.0,
"step": 695
},
{
"entropy": 0.4353017807006836,
"epoch": 34.8,
"grad_norm": 4.370556354522705,
"learning_rate": 1e-06,
"loss": 0.0517,
"mean_token_accuracy": 0.9772007465362549,
"num_tokens": 182387562.0,
"step": 696
},
{
"entropy": 0.4363037943840027,
"epoch": 34.85,
"grad_norm": 5.329375267028809,
"learning_rate": 1e-06,
"loss": 0.0498,
"mean_token_accuracy": 0.978723406791687,
"num_tokens": 182649622.0,
"step": 697
},
{
"entropy": 0.43782755732536316,
"epoch": 34.9,
"grad_norm": 8.173315048217773,
"learning_rate": 1e-06,
"loss": 0.0813,
"mean_token_accuracy": 0.9646511673927307,
"num_tokens": 182911639.0,
"step": 698
},
{
"entropy": 0.434041827917099,
"epoch": 34.95,
"grad_norm": 7.220884323120117,
"learning_rate": 1e-06,
"loss": 0.0546,
"mean_token_accuracy": 0.9753968119621277,
"num_tokens": 183173690.0,
"step": 699
},
{
"entropy": 0.434684693813324,
"epoch": 35.0,
"grad_norm": 5.7697296142578125,
"learning_rate": 1e-06,
"loss": 0.0538,
"mean_token_accuracy": 0.9744229316711426,
"num_tokens": 183435754.0,
"step": 700
},
{
"epoch": 35.0,
"eval_entropy": 0.437461256980896,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8998655676841736,
"eval_num_tokens": 183435754.0,
"eval_runtime": 0.5651,
"eval_samples_per_second": 442.394,
"eval_steps_per_second": 1.77,
"step": 700
},
{
"entropy": 0.435441255569458,
"epoch": 35.05,
"grad_norm": 4.88384485244751,
"learning_rate": 1e-06,
"loss": 0.0589,
"mean_token_accuracy": 0.9778853058815002,
"num_tokens": 183697794.0,
"step": 701
},
{
"entropy": 0.4339606463909149,
"epoch": 35.1,
"grad_norm": 4.933961391448975,
"learning_rate": 1e-06,
"loss": 0.0552,
"mean_token_accuracy": 0.9752303957939148,
"num_tokens": 183959859.0,
"step": 702
},
{
"entropy": 0.4330012798309326,
"epoch": 35.15,
"grad_norm": 7.595359802246094,
"learning_rate": 1e-06,
"loss": 0.055,
"mean_token_accuracy": 0.9751279950141907,
"num_tokens": 184221885.0,
"step": 703
},
{
"entropy": 0.4331626296043396,
"epoch": 35.2,
"grad_norm": 3.9097628593444824,
"learning_rate": 1e-06,
"loss": 0.0441,
"mean_token_accuracy": 0.983565092086792,
"num_tokens": 184483961.0,
"step": 704
},
{
"entropy": 0.43158963322639465,
"epoch": 35.25,
"grad_norm": 5.426919937133789,
"learning_rate": 1e-06,
"loss": 0.0534,
"mean_token_accuracy": 0.977968156337738,
"num_tokens": 184745990.0,
"step": 705
},
{
"entropy": 0.4347250759601593,
"epoch": 35.3,
"grad_norm": 7.520493030548096,
"learning_rate": 1e-06,
"loss": 0.0506,
"mean_token_accuracy": 0.9777158498764038,
"num_tokens": 185008048.0,
"step": 706
},
{
"entropy": 0.4325712323188782,
"epoch": 35.35,
"grad_norm": 6.389200210571289,
"learning_rate": 1e-06,
"loss": 0.0495,
"mean_token_accuracy": 0.9767596125602722,
"num_tokens": 185270114.0,
"step": 707
},
{
"entropy": 0.43263205885887146,
"epoch": 35.4,
"grad_norm": 4.24953031539917,
"learning_rate": 1e-06,
"loss": 0.0524,
"mean_token_accuracy": 0.9763349294662476,
"num_tokens": 185532190.0,
"step": 708
},
{
"entropy": 0.4333184063434601,
"epoch": 35.45,
"grad_norm": 3.9612417221069336,
"learning_rate": 1e-06,
"loss": 0.0518,
"mean_token_accuracy": 0.9805447459220886,
"num_tokens": 185794266.0,
"step": 709
},
{
"entropy": 0.43302151560783386,
"epoch": 35.5,
"grad_norm": 9.319104194641113,
"learning_rate": 1e-06,
"loss": 0.0529,
"mean_token_accuracy": 0.9785074591636658,
"num_tokens": 186056270.0,
"step": 710
},
{
"entropy": 0.4321300983428955,
"epoch": 35.55,
"grad_norm": 8.200410842895508,
"learning_rate": 1e-06,
"loss": 0.0483,
"mean_token_accuracy": 0.9809750318527222,
"num_tokens": 186318314.0,
"step": 711
},
{
"entropy": 0.43272072076797485,
"epoch": 35.6,
"grad_norm": 3.841181516647339,
"learning_rate": 1e-06,
"loss": 0.0489,
"mean_token_accuracy": 0.977331280708313,
"num_tokens": 186580353.0,
"step": 712
},
{
"entropy": 0.4306867718696594,
"epoch": 35.65,
"grad_norm": 4.986248016357422,
"learning_rate": 1e-06,
"loss": 0.061,
"mean_token_accuracy": 0.9757112860679626,
"num_tokens": 186842420.0,
"step": 713
},
{
"entropy": 0.43096476793289185,
"epoch": 35.7,
"grad_norm": 6.300476551055908,
"learning_rate": 1e-06,
"loss": 0.0622,
"mean_token_accuracy": 0.9771689772605896,
"num_tokens": 187104492.0,
"step": 714
},
{
"entropy": 0.43027129769325256,
"epoch": 35.75,
"grad_norm": 8.516679763793945,
"learning_rate": 1e-06,
"loss": 0.0592,
"mean_token_accuracy": 0.9696202278137207,
"num_tokens": 187366533.0,
"step": 715
},
{
"entropy": 0.4307771325111389,
"epoch": 35.8,
"grad_norm": 5.331250190734863,
"learning_rate": 1e-06,
"loss": 0.0524,
"mean_token_accuracy": 0.9772095680236816,
"num_tokens": 187628562.0,
"step": 716
},
{
"entropy": 0.43008124828338623,
"epoch": 35.85,
"grad_norm": 5.206639289855957,
"learning_rate": 1e-06,
"loss": 0.0589,
"mean_token_accuracy": 0.9751424193382263,
"num_tokens": 187890591.0,
"step": 717
},
{
"entropy": 0.43140071630477905,
"epoch": 35.9,
"grad_norm": 6.906051158905029,
"learning_rate": 1e-06,
"loss": 0.0564,
"mean_token_accuracy": 0.9787104725837708,
"num_tokens": 188152663.0,
"step": 718
},
{
"entropy": 0.4307631850242615,
"epoch": 35.95,
"grad_norm": 10.007140159606934,
"learning_rate": 1e-06,
"loss": 0.0484,
"mean_token_accuracy": 0.9796776175498962,
"num_tokens": 188414716.0,
"step": 719
},
{
"entropy": 0.4294223189353943,
"epoch": 36.0,
"grad_norm": 4.129380226135254,
"learning_rate": 1e-06,
"loss": 0.0459,
"mean_token_accuracy": 0.980211079120636,
"num_tokens": 188676759.0,
"step": 720
},
{
"epoch": 36.0,
"eval_entropy": 0.4343888759613037,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9025537371635437,
"eval_num_tokens": 188676759.0,
"eval_runtime": 0.5635,
"eval_samples_per_second": 443.648,
"eval_steps_per_second": 1.775,
"step": 720
},
{
"entropy": 0.4333025813102722,
"epoch": 36.05,
"grad_norm": 7.116022109985352,
"learning_rate": 1e-06,
"loss": 0.0524,
"mean_token_accuracy": 0.9802095293998718,
"num_tokens": 188938806.0,
"step": 721
},
{
"entropy": 0.4314154386520386,
"epoch": 36.1,
"grad_norm": 8.479453086853027,
"learning_rate": 1e-06,
"loss": 0.069,
"mean_token_accuracy": 0.9697193503379822,
"num_tokens": 189200885.0,
"step": 722
},
{
"entropy": 0.4306054711341858,
"epoch": 36.15,
"grad_norm": 5.400243282318115,
"learning_rate": 1e-06,
"loss": 0.0532,
"mean_token_accuracy": 0.9769552946090698,
"num_tokens": 189462943.0,
"step": 723
},
{
"entropy": 0.4325631856918335,
"epoch": 36.2,
"grad_norm": 4.690549850463867,
"learning_rate": 1e-06,
"loss": 0.0597,
"mean_token_accuracy": 0.9788106679916382,
"num_tokens": 189724999.0,
"step": 724
},
{
"entropy": 0.43309372663497925,
"epoch": 36.25,
"grad_norm": 5.27554988861084,
"learning_rate": 1e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9805699586868286,
"num_tokens": 189987037.0,
"step": 725
},
{
"entropy": 0.43083733320236206,
"epoch": 36.3,
"grad_norm": 6.90108060836792,
"learning_rate": 1e-06,
"loss": 0.046,
"mean_token_accuracy": 0.980445384979248,
"num_tokens": 190249042.0,
"step": 726
},
{
"entropy": 0.43399274349212646,
"epoch": 36.35,
"grad_norm": 6.533586502075195,
"learning_rate": 1e-06,
"loss": 0.0521,
"mean_token_accuracy": 0.9788219928741455,
"num_tokens": 190511146.0,
"step": 727
},
{
"entropy": 0.43022042512893677,
"epoch": 36.4,
"grad_norm": 8.063142776489258,
"learning_rate": 1e-06,
"loss": 0.053,
"mean_token_accuracy": 0.9760554432868958,
"num_tokens": 190773227.0,
"step": 728
},
{
"entropy": 0.4320385456085205,
"epoch": 36.45,
"grad_norm": 5.83900260925293,
"learning_rate": 1e-06,
"loss": 0.0455,
"mean_token_accuracy": 0.9809523820877075,
"num_tokens": 191035290.0,
"step": 729
},
{
"entropy": 0.4300408363342285,
"epoch": 36.5,
"grad_norm": 5.8197808265686035,
"learning_rate": 1e-06,
"loss": 0.0668,
"mean_token_accuracy": 0.9678930044174194,
"num_tokens": 191297345.0,
"step": 730
},
{
"entropy": 0.4326372444629669,
"epoch": 36.55,
"grad_norm": 4.969232082366943,
"learning_rate": 1e-06,
"loss": 0.0426,
"mean_token_accuracy": 0.9821896553039551,
"num_tokens": 191559385.0,
"step": 731
},
{
"entropy": 0.4319472312927246,
"epoch": 36.6,
"grad_norm": 5.88218355178833,
"learning_rate": 1e-06,
"loss": 0.0403,
"mean_token_accuracy": 0.9824660420417786,
"num_tokens": 191821416.0,
"step": 732
},
{
"entropy": 0.4315246343612671,
"epoch": 36.65,
"grad_norm": 3.370635747909546,
"learning_rate": 1e-06,
"loss": 0.0444,
"mean_token_accuracy": 0.983902096748352,
"num_tokens": 192083430.0,
"step": 733
},
{
"entropy": 0.43474385142326355,
"epoch": 36.7,
"grad_norm": 3.9786579608917236,
"learning_rate": 1e-06,
"loss": 0.0453,
"mean_token_accuracy": 0.9828022122383118,
"num_tokens": 192345472.0,
"step": 734
},
{
"entropy": 0.4301164150238037,
"epoch": 36.75,
"grad_norm": 5.871670722961426,
"learning_rate": 1e-06,
"loss": 0.0595,
"mean_token_accuracy": 0.9774436354637146,
"num_tokens": 192607528.0,
"step": 735
},
{
"entropy": 0.4330886900424957,
"epoch": 36.8,
"grad_norm": 3.582524061203003,
"learning_rate": 1e-06,
"loss": 0.048,
"mean_token_accuracy": 0.9793762564659119,
"num_tokens": 192869581.0,
"step": 736
},
{
"entropy": 0.4340907037258148,
"epoch": 36.85,
"grad_norm": 8.269258499145508,
"learning_rate": 1e-06,
"loss": 0.0491,
"mean_token_accuracy": 0.9776817560195923,
"num_tokens": 193131662.0,
"step": 737
},
{
"entropy": 0.43378227949142456,
"epoch": 36.9,
"grad_norm": 7.181451797485352,
"learning_rate": 1e-06,
"loss": 0.0462,
"mean_token_accuracy": 0.980169951915741,
"num_tokens": 193393700.0,
"step": 738
},
{
"entropy": 0.4317992925643921,
"epoch": 36.95,
"grad_norm": 4.083510398864746,
"learning_rate": 1e-06,
"loss": 0.0549,
"mean_token_accuracy": 0.9743944406509399,
"num_tokens": 193655738.0,
"step": 739
},
{
"entropy": 0.4305614233016968,
"epoch": 37.0,
"grad_norm": 5.232417106628418,
"learning_rate": 1e-06,
"loss": 0.0575,
"mean_token_accuracy": 0.9750133156776428,
"num_tokens": 193917750.0,
"step": 740
},
{
"epoch": 37.0,
"eval_entropy": 0.43406665325164795,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9052419066429138,
"eval_num_tokens": 193917750.0,
"eval_runtime": 0.5662,
"eval_samples_per_second": 441.571,
"eval_steps_per_second": 1.766,
"step": 740
},
{
"entropy": 0.4319198727607727,
"epoch": 37.05,
"grad_norm": 3.9398393630981445,
"learning_rate": 1e-06,
"loss": 0.0402,
"mean_token_accuracy": 0.9842470288276672,
"num_tokens": 194179798.0,
"step": 741
},
{
"entropy": 0.4315286874771118,
"epoch": 37.1,
"grad_norm": 6.6097025871276855,
"learning_rate": 1e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.9826037883758545,
"num_tokens": 194441843.0,
"step": 742
},
{
"entropy": 0.4326039254665375,
"epoch": 37.15,
"grad_norm": 4.204853057861328,
"learning_rate": 1e-06,
"loss": 0.0429,
"mean_token_accuracy": 0.9815917611122131,
"num_tokens": 194703887.0,
"step": 743
},
{
"entropy": 0.4300069510936737,
"epoch": 37.2,
"grad_norm": 3.7171454429626465,
"learning_rate": 1e-06,
"loss": 0.0437,
"mean_token_accuracy": 0.9782465100288391,
"num_tokens": 194965964.0,
"step": 744
},
{
"entropy": 0.43168967962265015,
"epoch": 37.25,
"grad_norm": 3.8260834217071533,
"learning_rate": 1e-06,
"loss": 0.0364,
"mean_token_accuracy": 0.9823232293128967,
"num_tokens": 195228042.0,
"step": 745
},
{
"entropy": 0.4312366247177124,
"epoch": 37.3,
"grad_norm": 9.227532386779785,
"learning_rate": 1e-06,
"loss": 0.0567,
"mean_token_accuracy": 0.9740932583808899,
"num_tokens": 195490146.0,
"step": 746
},
{
"entropy": 0.43321493268013,
"epoch": 37.35,
"grad_norm": 6.180078983306885,
"learning_rate": 1e-06,
"loss": 0.0491,
"mean_token_accuracy": 0.9776632189750671,
"num_tokens": 195752155.0,
"step": 747
},
{
"entropy": 0.43195170164108276,
"epoch": 37.4,
"grad_norm": 5.982614994049072,
"learning_rate": 1e-06,
"loss": 0.058,
"mean_token_accuracy": 0.9748128056526184,
"num_tokens": 196014217.0,
"step": 748
},
{
"entropy": 0.43084219098091125,
"epoch": 37.45,
"grad_norm": 3.5786056518554688,
"learning_rate": 1e-06,
"loss": 0.0381,
"mean_token_accuracy": 0.9858906269073486,
"num_tokens": 196276280.0,
"step": 749
},
{
"entropy": 0.42843180894851685,
"epoch": 37.5,
"grad_norm": 7.1140360832214355,
"learning_rate": 1e-06,
"loss": 0.0519,
"mean_token_accuracy": 0.9748620390892029,
"num_tokens": 196538306.0,
"step": 750
},
{
"entropy": 0.4283401370048523,
"epoch": 37.55,
"grad_norm": 4.959768772125244,
"learning_rate": 1e-06,
"loss": 0.0352,
"mean_token_accuracy": 0.9865038394927979,
"num_tokens": 196800356.0,
"step": 751
},
{
"entropy": 0.42931511998176575,
"epoch": 37.6,
"grad_norm": 4.487987995147705,
"learning_rate": 1e-06,
"loss": 0.0487,
"mean_token_accuracy": 0.9777777791023254,
"num_tokens": 197062395.0,
"step": 752
},
{
"entropy": 0.43021565675735474,
"epoch": 37.65,
"grad_norm": 5.882363796234131,
"learning_rate": 1e-06,
"loss": 0.0574,
"mean_token_accuracy": 0.9759414196014404,
"num_tokens": 197324405.0,
"step": 753
},
{
"entropy": 0.42972004413604736,
"epoch": 37.7,
"grad_norm": 5.719748020172119,
"learning_rate": 1e-06,
"loss": 0.0518,
"mean_token_accuracy": 0.9800443649291992,
"num_tokens": 197586450.0,
"step": 754
},
{
"entropy": 0.42973631620407104,
"epoch": 37.75,
"grad_norm": 5.8527398109436035,
"learning_rate": 1e-06,
"loss": 0.0561,
"mean_token_accuracy": 0.9730787873268127,
"num_tokens": 197848459.0,
"step": 755
},
{
"entropy": 0.428989052772522,
"epoch": 37.8,
"grad_norm": 6.304094314575195,
"learning_rate": 1e-06,
"loss": 0.0529,
"mean_token_accuracy": 0.9790301322937012,
"num_tokens": 198110512.0,
"step": 756
},
{
"entropy": 0.4285169839859009,
"epoch": 37.85,
"grad_norm": 7.316928863525391,
"learning_rate": 1e-06,
"loss": 0.0656,
"mean_token_accuracy": 0.9671322703361511,
"num_tokens": 198372586.0,
"step": 757
},
{
"entropy": 0.4262317419052124,
"epoch": 37.9,
"grad_norm": 5.824263095855713,
"learning_rate": 1e-06,
"loss": 0.0475,
"mean_token_accuracy": 0.9809321761131287,
"num_tokens": 198634661.0,
"step": 758
},
{
"entropy": 0.43088752031326294,
"epoch": 37.95,
"grad_norm": 4.740274429321289,
"learning_rate": 1e-06,
"loss": 0.0448,
"mean_token_accuracy": 0.9836280345916748,
"num_tokens": 198896715.0,
"step": 759
},
{
"entropy": 0.4321492612361908,
"epoch": 38.0,
"grad_norm": 7.113720417022705,
"learning_rate": 1e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9796651005744934,
"num_tokens": 199158782.0,
"step": 760
},
{
"epoch": 38.0,
"eval_entropy": 0.4316790699958801,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9079301357269287,
"eval_num_tokens": 199158782.0,
"eval_runtime": 0.5603,
"eval_samples_per_second": 446.212,
"eval_steps_per_second": 1.785,
"step": 760
},
{
"entropy": 0.4297526180744171,
"epoch": 38.05,
"grad_norm": 6.399072647094727,
"learning_rate": 1e-06,
"loss": 0.0446,
"mean_token_accuracy": 0.9808707237243652,
"num_tokens": 199420825.0,
"step": 761
},
{
"entropy": 0.4314311742782593,
"epoch": 38.1,
"grad_norm": 5.302173614501953,
"learning_rate": 1e-06,
"loss": 0.0601,
"mean_token_accuracy": 0.9708454608917236,
"num_tokens": 199682902.0,
"step": 762
},
{
"entropy": 0.4309634566307068,
"epoch": 38.15,
"grad_norm": 6.87785005569458,
"learning_rate": 1e-06,
"loss": 0.0541,
"mean_token_accuracy": 0.976190447807312,
"num_tokens": 199944938.0,
"step": 763
},
{
"entropy": 0.43027186393737793,
"epoch": 38.2,
"grad_norm": 5.8634138107299805,
"learning_rate": 1e-06,
"loss": 0.0414,
"mean_token_accuracy": 0.9833240509033203,
"num_tokens": 200207000.0,
"step": 764
},
{
"entropy": 0.4300784468650818,
"epoch": 38.25,
"grad_norm": 17.801645278930664,
"learning_rate": 1e-06,
"loss": 0.033,
"mean_token_accuracy": 0.9908369183540344,
"num_tokens": 200469065.0,
"step": 765
},
{
"entropy": 0.43063056468963623,
"epoch": 38.3,
"grad_norm": 4.167501926422119,
"learning_rate": 1e-06,
"loss": 0.0519,
"mean_token_accuracy": 0.97929847240448,
"num_tokens": 200731067.0,
"step": 766
},
{
"entropy": 0.4308306872844696,
"epoch": 38.35,
"grad_norm": 4.516330718994141,
"learning_rate": 1e-06,
"loss": 0.0485,
"mean_token_accuracy": 0.9816091656684875,
"num_tokens": 200993097.0,
"step": 767
},
{
"entropy": 0.4297667145729065,
"epoch": 38.4,
"grad_norm": 4.7929277420043945,
"learning_rate": 1e-06,
"loss": 0.0488,
"mean_token_accuracy": 0.9790105223655701,
"num_tokens": 201255156.0,
"step": 768
},
{
"entropy": 0.4306723475456238,
"epoch": 38.45,
"grad_norm": 4.585225582122803,
"learning_rate": 1e-06,
"loss": 0.0455,
"mean_token_accuracy": 0.9819868803024292,
"num_tokens": 201517185.0,
"step": 769
},
{
"entropy": 0.4299125671386719,
"epoch": 38.5,
"grad_norm": 4.201162815093994,
"learning_rate": 1e-06,
"loss": 0.0431,
"mean_token_accuracy": 0.9805615544319153,
"num_tokens": 201779233.0,
"step": 770
},
{
"entropy": 0.43096548318862915,
"epoch": 38.55,
"grad_norm": 5.361374855041504,
"learning_rate": 1e-06,
"loss": 0.05,
"mean_token_accuracy": 0.9783693552017212,
"num_tokens": 202041266.0,
"step": 771
},
{
"entropy": 0.4295889735221863,
"epoch": 38.6,
"grad_norm": 7.002900123596191,
"learning_rate": 1e-06,
"loss": 0.0585,
"mean_token_accuracy": 0.9755419492721558,
"num_tokens": 202303295.0,
"step": 772
},
{
"entropy": 0.42901116609573364,
"epoch": 38.65,
"grad_norm": 10.230154037475586,
"learning_rate": 1e-06,
"loss": 0.0523,
"mean_token_accuracy": 0.9762485027313232,
"num_tokens": 202565332.0,
"step": 773
},
{
"entropy": 0.42743608355522156,
"epoch": 38.7,
"grad_norm": 5.833381175994873,
"learning_rate": 1e-06,
"loss": 0.0582,
"mean_token_accuracy": 0.9707192778587341,
"num_tokens": 202827364.0,
"step": 774
},
{
"entropy": 0.42607641220092773,
"epoch": 38.75,
"grad_norm": 6.477557182312012,
"learning_rate": 1e-06,
"loss": 0.0551,
"mean_token_accuracy": 0.974967896938324,
"num_tokens": 203089449.0,
"step": 775
},
{
"entropy": 0.42739659547805786,
"epoch": 38.8,
"grad_norm": 7.6487627029418945,
"learning_rate": 1e-06,
"loss": 0.0333,
"mean_token_accuracy": 0.9863269329071045,
"num_tokens": 203351519.0,
"step": 776
},
{
"entropy": 0.4268617630004883,
"epoch": 38.85,
"grad_norm": 4.835480213165283,
"learning_rate": 1e-06,
"loss": 0.0378,
"mean_token_accuracy": 0.9873916506767273,
"num_tokens": 203613612.0,
"step": 777
},
{
"entropy": 0.4261825680732727,
"epoch": 38.9,
"grad_norm": 5.235621452331543,
"learning_rate": 1e-06,
"loss": 0.0496,
"mean_token_accuracy": 0.98097825050354,
"num_tokens": 203875682.0,
"step": 778
},
{
"entropy": 0.4306997060775757,
"epoch": 38.95,
"grad_norm": 6.902498722076416,
"learning_rate": 1e-06,
"loss": 0.0538,
"mean_token_accuracy": 0.9750712513923645,
"num_tokens": 204137745.0,
"step": 779
},
{
"entropy": 0.4279959797859192,
"epoch": 39.0,
"grad_norm": 5.471578598022461,
"learning_rate": 1e-06,
"loss": 0.0479,
"mean_token_accuracy": 0.9792307615280151,
"num_tokens": 204399803.0,
"step": 780
},
{
"epoch": 39.0,
"eval_entropy": 0.4311733841896057,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9106183052062988,
"eval_num_tokens": 204399803.0,
"eval_runtime": 0.563,
"eval_samples_per_second": 444.061,
"eval_steps_per_second": 1.776,
"step": 780
},
{
"entropy": 0.4303116500377655,
"epoch": 39.05,
"grad_norm": 5.4172492027282715,
"learning_rate": 1e-06,
"loss": 0.0538,
"mean_token_accuracy": 0.9733893275260925,
"num_tokens": 204661857.0,
"step": 781
},
{
"entropy": 0.4293464422225952,
"epoch": 39.1,
"grad_norm": 12.146878242492676,
"learning_rate": 1e-06,
"loss": 0.0566,
"mean_token_accuracy": 0.9732072353363037,
"num_tokens": 204923950.0,
"step": 782
},
{
"entropy": 0.4306670129299164,
"epoch": 39.15,
"grad_norm": 11.113872528076172,
"learning_rate": 1e-06,
"loss": 0.0495,
"mean_token_accuracy": 0.9793368577957153,
"num_tokens": 205186030.0,
"step": 783
},
{
"entropy": 0.4303710460662842,
"epoch": 39.2,
"grad_norm": 4.949894905090332,
"learning_rate": 1e-06,
"loss": 0.046,
"mean_token_accuracy": 0.9795244336128235,
"num_tokens": 205448071.0,
"step": 784
},
{
"entropy": 0.4308937191963196,
"epoch": 39.25,
"grad_norm": 6.6487650871276855,
"learning_rate": 1e-06,
"loss": 0.048,
"mean_token_accuracy": 0.9802955389022827,
"num_tokens": 205710085.0,
"step": 785
},
{
"entropy": 0.42958366870880127,
"epoch": 39.3,
"grad_norm": 6.201844692230225,
"learning_rate": 1e-06,
"loss": 0.0487,
"mean_token_accuracy": 0.97826087474823,
"num_tokens": 205972150.0,
"step": 786
},
{
"entropy": 0.43153223395347595,
"epoch": 39.35,
"grad_norm": 8.383527755737305,
"learning_rate": 1e-06,
"loss": 0.043,
"mean_token_accuracy": 0.9783783555030823,
"num_tokens": 206234177.0,
"step": 787
},
{
"entropy": 0.4291541278362274,
"epoch": 39.4,
"grad_norm": 4.192736625671387,
"learning_rate": 1e-06,
"loss": 0.0371,
"mean_token_accuracy": 0.9854142069816589,
"num_tokens": 206496187.0,
"step": 788
},
{
"entropy": 0.4308379590511322,
"epoch": 39.45,
"grad_norm": 7.078681468963623,
"learning_rate": 1e-06,
"loss": 0.0397,
"mean_token_accuracy": 0.9830328822135925,
"num_tokens": 206758204.0,
"step": 789
},
{
"entropy": 0.42960792779922485,
"epoch": 39.5,
"grad_norm": 3.608894109725952,
"learning_rate": 1e-06,
"loss": 0.0301,
"mean_token_accuracy": 0.9880810379981995,
"num_tokens": 207020244.0,
"step": 790
},
{
"entropy": 0.4298979938030243,
"epoch": 39.55,
"grad_norm": 4.762541770935059,
"learning_rate": 1e-06,
"loss": 0.051,
"mean_token_accuracy": 0.9795918464660645,
"num_tokens": 207282305.0,
"step": 791
},
{
"entropy": 0.43151146173477173,
"epoch": 39.6,
"grad_norm": 3.833782434463501,
"learning_rate": 1e-06,
"loss": 0.0453,
"mean_token_accuracy": 0.982624351978302,
"num_tokens": 207544369.0,
"step": 792
},
{
"entropy": 0.4292357563972473,
"epoch": 39.65,
"grad_norm": 4.718764781951904,
"learning_rate": 1e-06,
"loss": 0.0395,
"mean_token_accuracy": 0.9843843579292297,
"num_tokens": 207806396.0,
"step": 793
},
{
"entropy": 0.4275195598602295,
"epoch": 39.7,
"grad_norm": 4.482115268707275,
"learning_rate": 1e-06,
"loss": 0.0482,
"mean_token_accuracy": 0.980169951915741,
"num_tokens": 208068467.0,
"step": 794
},
{
"entropy": 0.4273834824562073,
"epoch": 39.75,
"grad_norm": 8.38062572479248,
"learning_rate": 1e-06,
"loss": 0.055,
"mean_token_accuracy": 0.9755538702011108,
"num_tokens": 208330534.0,
"step": 795
},
{
"entropy": 0.4268650710582733,
"epoch": 39.8,
"grad_norm": 7.30850887298584,
"learning_rate": 1e-06,
"loss": 0.0434,
"mean_token_accuracy": 0.9825853705406189,
"num_tokens": 208592587.0,
"step": 796
},
{
"entropy": 0.42521435022354126,
"epoch": 39.85,
"grad_norm": 5.413529396057129,
"learning_rate": 1e-06,
"loss": 0.0459,
"mean_token_accuracy": 0.9793140888214111,
"num_tokens": 208854654.0,
"step": 797
},
{
"entropy": 0.4262104332447052,
"epoch": 39.9,
"grad_norm": 8.122570991516113,
"learning_rate": 1e-06,
"loss": 0.0598,
"mean_token_accuracy": 0.9763879776000977,
"num_tokens": 209116715.0,
"step": 798
},
{
"entropy": 0.4243553578853607,
"epoch": 39.95,
"grad_norm": 11.898737907409668,
"learning_rate": 1e-06,
"loss": 0.051,
"mean_token_accuracy": 0.9789808988571167,
"num_tokens": 209378779.0,
"step": 799
},
{
"entropy": 0.42552608251571655,
"epoch": 40.0,
"grad_norm": 4.8858723640441895,
"learning_rate": 1e-06,
"loss": 0.0554,
"mean_token_accuracy": 0.974328339099884,
"num_tokens": 209640816.0,
"step": 800
},
{
"epoch": 40.0,
"eval_entropy": 0.4265006184577942,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8985214829444885,
"eval_num_tokens": 209640816.0,
"eval_runtime": 0.5639,
"eval_samples_per_second": 443.309,
"eval_steps_per_second": 1.773,
"step": 800
},
{
"entropy": 0.42547088861465454,
"epoch": 40.05,
"grad_norm": 3.928428888320923,
"learning_rate": 1e-06,
"loss": 0.0336,
"mean_token_accuracy": 0.9873657822608948,
"num_tokens": 209902893.0,
"step": 801
},
{
"entropy": 0.4265702962875366,
"epoch": 40.1,
"grad_norm": 6.94235372543335,
"learning_rate": 1e-06,
"loss": 0.049,
"mean_token_accuracy": 0.9771819710731506,
"num_tokens": 210164942.0,
"step": 802
},
{
"entropy": 0.42739298939704895,
"epoch": 40.15,
"grad_norm": 10.066420555114746,
"learning_rate": 1e-06,
"loss": 0.0489,
"mean_token_accuracy": 0.9780876636505127,
"num_tokens": 210426975.0,
"step": 803
},
{
"entropy": 0.4282771050930023,
"epoch": 40.2,
"grad_norm": 4.0888848304748535,
"learning_rate": 1e-06,
"loss": 0.0416,
"mean_token_accuracy": 0.9854447245597839,
"num_tokens": 210689060.0,
"step": 804
},
{
"entropy": 0.4287988841533661,
"epoch": 40.25,
"grad_norm": 5.135344505310059,
"learning_rate": 1e-06,
"loss": 0.0305,
"mean_token_accuracy": 0.9879999756813049,
"num_tokens": 210951087.0,
"step": 805
},
{
"entropy": 0.4283785820007324,
"epoch": 40.3,
"grad_norm": 7.806493759155273,
"learning_rate": 1e-06,
"loss": 0.0605,
"mean_token_accuracy": 0.9761051535606384,
"num_tokens": 211213123.0,
"step": 806
},
{
"entropy": 0.42876169085502625,
"epoch": 40.35,
"grad_norm": 4.562885761260986,
"learning_rate": 1e-06,
"loss": 0.0432,
"mean_token_accuracy": 0.9820144176483154,
"num_tokens": 211475167.0,
"step": 807
},
{
"entropy": 0.42846542596817017,
"epoch": 40.4,
"grad_norm": 5.972072124481201,
"learning_rate": 1e-06,
"loss": 0.0439,
"mean_token_accuracy": 0.9850237965583801,
"num_tokens": 211737196.0,
"step": 808
},
{
"entropy": 0.42836448550224304,
"epoch": 40.45,
"grad_norm": 4.6945319175720215,
"learning_rate": 1e-06,
"loss": 0.0395,
"mean_token_accuracy": 0.9843930602073669,
"num_tokens": 211999255.0,
"step": 809
},
{
"entropy": 0.4259374439716339,
"epoch": 40.5,
"grad_norm": 7.984348773956299,
"learning_rate": 1e-06,
"loss": 0.0521,
"mean_token_accuracy": 0.9777777791023254,
"num_tokens": 212261345.0,
"step": 810
},
{
"entropy": 0.4267132580280304,
"epoch": 40.55,
"grad_norm": 5.722595691680908,
"learning_rate": 1e-06,
"loss": 0.0537,
"mean_token_accuracy": 0.9772727489471436,
"num_tokens": 212523401.0,
"step": 811
},
{
"entropy": 0.42591041326522827,
"epoch": 40.6,
"grad_norm": 8.059563636779785,
"learning_rate": 1e-06,
"loss": 0.0512,
"mean_token_accuracy": 0.9775811433792114,
"num_tokens": 212785491.0,
"step": 812
},
{
"entropy": 0.4272247552871704,
"epoch": 40.65,
"grad_norm": 4.5345869064331055,
"learning_rate": 1e-06,
"loss": 0.0423,
"mean_token_accuracy": 0.9814593195915222,
"num_tokens": 213047558.0,
"step": 813
},
{
"entropy": 0.4253997802734375,
"epoch": 40.7,
"grad_norm": 10.771305084228516,
"learning_rate": 1e-06,
"loss": 0.0596,
"mean_token_accuracy": 0.972937285900116,
"num_tokens": 213309600.0,
"step": 814
},
{
"entropy": 0.4264408349990845,
"epoch": 40.75,
"grad_norm": 8.536927223205566,
"learning_rate": 1e-06,
"loss": 0.058,
"mean_token_accuracy": 0.9755164384841919,
"num_tokens": 213571632.0,
"step": 815
},
{
"entropy": 0.42377805709838867,
"epoch": 40.8,
"grad_norm": 6.131271839141846,
"learning_rate": 1e-06,
"loss": 0.0382,
"mean_token_accuracy": 0.9832285046577454,
"num_tokens": 213833689.0,
"step": 816
},
{
"entropy": 0.4256356656551361,
"epoch": 40.85,
"grad_norm": 5.8921613693237305,
"learning_rate": 1e-06,
"loss": 0.0491,
"mean_token_accuracy": 0.9806221127510071,
"num_tokens": 214095715.0,
"step": 817
},
{
"entropy": 0.42740219831466675,
"epoch": 40.9,
"grad_norm": 7.05807638168335,
"learning_rate": 1e-06,
"loss": 0.0419,
"mean_token_accuracy": 0.9825970530509949,
"num_tokens": 214357769.0,
"step": 818
},
{
"entropy": 0.42619818449020386,
"epoch": 40.95,
"grad_norm": 8.339810371398926,
"learning_rate": 1e-06,
"loss": 0.0539,
"mean_token_accuracy": 0.9760000109672546,
"num_tokens": 214619789.0,
"step": 819
},
{
"entropy": 0.4287991523742676,
"epoch": 41.0,
"grad_norm": 6.295149326324463,
"learning_rate": 1e-06,
"loss": 0.0439,
"mean_token_accuracy": 0.981502890586853,
"num_tokens": 214881848.0,
"step": 820
},
{
"epoch": 41.0,
"eval_entropy": 0.42802900075912476,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9032257795333862,
"eval_num_tokens": 214881848.0,
"eval_runtime": 0.5634,
"eval_samples_per_second": 443.721,
"eval_steps_per_second": 1.775,
"step": 820
},
{
"entropy": 0.42603129148483276,
"epoch": 41.05,
"grad_norm": 4.932643890380859,
"learning_rate": 1e-06,
"loss": 0.0385,
"mean_token_accuracy": 0.9828060269355774,
"num_tokens": 215143928.0,
"step": 821
},
{
"entropy": 0.4281240701675415,
"epoch": 41.1,
"grad_norm": 8.819388389587402,
"learning_rate": 1e-06,
"loss": 0.0526,
"mean_token_accuracy": 0.977624773979187,
"num_tokens": 215405967.0,
"step": 822
},
{
"entropy": 0.4279868006706238,
"epoch": 41.15,
"grad_norm": 5.604109764099121,
"learning_rate": 1e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.9837586879730225,
"num_tokens": 215668020.0,
"step": 823
},
{
"entropy": 0.4266362190246582,
"epoch": 41.2,
"grad_norm": 5.590734481811523,
"learning_rate": 1e-06,
"loss": 0.0478,
"mean_token_accuracy": 0.978723406791687,
"num_tokens": 215930074.0,
"step": 824
},
{
"entropy": 0.42430201172828674,
"epoch": 41.25,
"grad_norm": 6.548094749450684,
"learning_rate": 1e-06,
"loss": 0.0404,
"mean_token_accuracy": 0.9815140962600708,
"num_tokens": 216192133.0,
"step": 825
},
{
"entropy": 0.4260609447956085,
"epoch": 41.3,
"grad_norm": 4.57349967956543,
"learning_rate": 1e-06,
"loss": 0.0441,
"mean_token_accuracy": 0.981225848197937,
"num_tokens": 216454174.0,
"step": 826
},
{
"entropy": 0.42658817768096924,
"epoch": 41.35,
"grad_norm": 6.89821195602417,
"learning_rate": 1e-06,
"loss": 0.0463,
"mean_token_accuracy": 0.9805615544319153,
"num_tokens": 216716222.0,
"step": 827
},
{
"entropy": 0.4260250926017761,
"epoch": 41.4,
"grad_norm": 5.968894958496094,
"learning_rate": 1e-06,
"loss": 0.0421,
"mean_token_accuracy": 0.9821937084197998,
"num_tokens": 216978285.0,
"step": 828
},
{
"entropy": 0.4265430271625519,
"epoch": 41.45,
"grad_norm": 4.956072807312012,
"learning_rate": 1e-06,
"loss": 0.0581,
"mean_token_accuracy": 0.9706303477287292,
"num_tokens": 217240373.0,
"step": 829
},
{
"entropy": 0.4250110387802124,
"epoch": 41.5,
"grad_norm": 5.861893177032471,
"learning_rate": 1e-06,
"loss": 0.0393,
"mean_token_accuracy": 0.9804489612579346,
"num_tokens": 217502413.0,
"step": 830
},
{
"entropy": 0.42582571506500244,
"epoch": 41.55,
"grad_norm": 6.167220592498779,
"learning_rate": 1e-06,
"loss": 0.0489,
"mean_token_accuracy": 0.9791246056556702,
"num_tokens": 217764491.0,
"step": 831
},
{
"entropy": 0.42461884021759033,
"epoch": 41.6,
"grad_norm": 6.987247943878174,
"learning_rate": 1e-06,
"loss": 0.0438,
"mean_token_accuracy": 0.9817232489585876,
"num_tokens": 218026550.0,
"step": 832
},
{
"entropy": 0.4255552291870117,
"epoch": 41.65,
"grad_norm": 3.994992256164551,
"learning_rate": 1e-06,
"loss": 0.0361,
"mean_token_accuracy": 0.9873896837234497,
"num_tokens": 218288597.0,
"step": 833
},
{
"entropy": 0.42477789521217346,
"epoch": 41.7,
"grad_norm": 4.242578506469727,
"learning_rate": 1e-06,
"loss": 0.0366,
"mean_token_accuracy": 0.9860182404518127,
"num_tokens": 218550637.0,
"step": 834
},
{
"entropy": 0.42430806159973145,
"epoch": 41.75,
"grad_norm": 4.776451110839844,
"learning_rate": 1e-06,
"loss": 0.0477,
"mean_token_accuracy": 0.9796454310417175,
"num_tokens": 218812720.0,
"step": 835
},
{
"entropy": 0.42646872997283936,
"epoch": 41.8,
"grad_norm": 4.909146785736084,
"learning_rate": 1e-06,
"loss": 0.052,
"mean_token_accuracy": 0.974078357219696,
"num_tokens": 219074719.0,
"step": 836
},
{
"entropy": 0.4244542121887207,
"epoch": 41.85,
"grad_norm": 5.178461074829102,
"learning_rate": 1e-06,
"loss": 0.0413,
"mean_token_accuracy": 0.9836763739585876,
"num_tokens": 219336787.0,
"step": 837
},
{
"entropy": 0.4257306456565857,
"epoch": 41.9,
"grad_norm": 2.8526721000671387,
"learning_rate": 1e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.984000027179718,
"num_tokens": 219598826.0,
"step": 838
},
{
"entropy": 0.4252172112464905,
"epoch": 41.95,
"grad_norm": 3.7361361980438232,
"learning_rate": 1e-06,
"loss": 0.0477,
"mean_token_accuracy": 0.98320072889328,
"num_tokens": 219860823.0,
"step": 839
},
{
"entropy": 0.4218195080757141,
"epoch": 42.0,
"grad_norm": 8.049124717712402,
"learning_rate": 1e-06,
"loss": 0.0335,
"mean_token_accuracy": 0.9856651425361633,
"num_tokens": 220122896.0,
"step": 840
},
{
"epoch": 42.0,
"eval_entropy": 0.4240899682044983,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9005376100540161,
"eval_num_tokens": 220122896.0,
"eval_runtime": 0.5603,
"eval_samples_per_second": 446.2,
"eval_steps_per_second": 1.785,
"step": 840
},
{
"entropy": 0.42247503995895386,
"epoch": 42.05,
"grad_norm": 4.249022483825684,
"learning_rate": 1e-06,
"loss": 0.0493,
"mean_token_accuracy": 0.9803094267845154,
"num_tokens": 220384944.0,
"step": 841
},
{
"entropy": 0.42273712158203125,
"epoch": 42.1,
"grad_norm": 3.7368266582489014,
"learning_rate": 1e-06,
"loss": 0.05,
"mean_token_accuracy": 0.9767295718193054,
"num_tokens": 220646995.0,
"step": 842
},
{
"entropy": 0.4206734001636505,
"epoch": 42.15,
"grad_norm": 2.553225040435791,
"learning_rate": 1e-06,
"loss": 0.0397,
"mean_token_accuracy": 0.9799764156341553,
"num_tokens": 220909055.0,
"step": 843
},
{
"entropy": 0.42262324690818787,
"epoch": 42.2,
"grad_norm": 4.007044792175293,
"learning_rate": 1e-06,
"loss": 0.0338,
"mean_token_accuracy": 0.9858860373497009,
"num_tokens": 221171099.0,
"step": 844
},
{
"entropy": 0.4206918478012085,
"epoch": 42.25,
"grad_norm": 5.613060474395752,
"learning_rate": 1e-06,
"loss": 0.0499,
"mean_token_accuracy": 0.9805825352668762,
"num_tokens": 221433171.0,
"step": 845
},
{
"entropy": 0.4224393963813782,
"epoch": 42.3,
"grad_norm": 7.5523200035095215,
"learning_rate": 1e-06,
"loss": 0.0302,
"mean_token_accuracy": 0.9912751913070679,
"num_tokens": 221695221.0,
"step": 846
},
{
"entropy": 0.4227662980556488,
"epoch": 42.35,
"grad_norm": 3.233724594116211,
"learning_rate": 1e-06,
"loss": 0.0431,
"mean_token_accuracy": 0.9814459085464478,
"num_tokens": 221957278.0,
"step": 847
},
{
"entropy": 0.4223962128162384,
"epoch": 42.4,
"grad_norm": 5.855934143066406,
"learning_rate": 1e-06,
"loss": 0.0398,
"mean_token_accuracy": 0.9815050959587097,
"num_tokens": 222219340.0,
"step": 848
},
{
"entropy": 0.42316097021102905,
"epoch": 42.45,
"grad_norm": 4.901780605316162,
"learning_rate": 1e-06,
"loss": 0.0429,
"mean_token_accuracy": 0.9836879372596741,
"num_tokens": 222481409.0,
"step": 849
},
{
"entropy": 0.42461538314819336,
"epoch": 42.5,
"grad_norm": 5.965307712554932,
"learning_rate": 1e-06,
"loss": 0.044,
"mean_token_accuracy": 0.9818791747093201,
"num_tokens": 222743459.0,
"step": 850
},
{
"entropy": 0.42335671186447144,
"epoch": 42.55,
"grad_norm": 4.5567426681518555,
"learning_rate": 1e-06,
"loss": 0.0458,
"mean_token_accuracy": 0.981582522392273,
"num_tokens": 223005518.0,
"step": 851
},
{
"entropy": 0.4207357168197632,
"epoch": 42.6,
"grad_norm": 6.25122594833374,
"learning_rate": 1e-06,
"loss": 0.0336,
"mean_token_accuracy": 0.9905003309249878,
"num_tokens": 223267591.0,
"step": 852
},
{
"entropy": 0.42162737250328064,
"epoch": 42.65,
"grad_norm": 5.886632442474365,
"learning_rate": 1e-06,
"loss": 0.0428,
"mean_token_accuracy": 0.9807093739509583,
"num_tokens": 223529626.0,
"step": 853
},
{
"entropy": 0.4217204749584198,
"epoch": 42.7,
"grad_norm": 4.245472431182861,
"learning_rate": 1e-06,
"loss": 0.0367,
"mean_token_accuracy": 0.9864498376846313,
"num_tokens": 223791695.0,
"step": 854
},
{
"entropy": 0.42133527994155884,
"epoch": 42.75,
"grad_norm": 3.2346744537353516,
"learning_rate": 1e-06,
"loss": 0.0394,
"mean_token_accuracy": 0.9854153394699097,
"num_tokens": 224053733.0,
"step": 855
},
{
"entropy": 0.4236084222793579,
"epoch": 42.8,
"grad_norm": 3.6470589637756348,
"learning_rate": 1e-06,
"loss": 0.0518,
"mean_token_accuracy": 0.9786067008972168,
"num_tokens": 224315753.0,
"step": 856
},
{
"entropy": 0.42061948776245117,
"epoch": 42.85,
"grad_norm": 5.6351423263549805,
"learning_rate": 1e-06,
"loss": 0.0392,
"mean_token_accuracy": 0.9876466989517212,
"num_tokens": 224577800.0,
"step": 857
},
{
"entropy": 0.4228881597518921,
"epoch": 42.9,
"grad_norm": 2.740384101867676,
"learning_rate": 1e-06,
"loss": 0.0423,
"mean_token_accuracy": 0.9824660420417786,
"num_tokens": 224839831.0,
"step": 858
},
{
"entropy": 0.4235576391220093,
"epoch": 42.95,
"grad_norm": 3.5690433979034424,
"learning_rate": 1e-06,
"loss": 0.0525,
"mean_token_accuracy": 0.9748982191085815,
"num_tokens": 225101898.0,
"step": 859
},
{
"entropy": 0.4226834177970886,
"epoch": 43.0,
"grad_norm": 5.949954986572266,
"learning_rate": 1e-06,
"loss": 0.0388,
"mean_token_accuracy": 0.9843918085098267,
"num_tokens": 225363920.0,
"step": 860
},
{
"epoch": 43.0,
"eval_entropy": 0.42189276218414307,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.899193525314331,
"eval_num_tokens": 225363920.0,
"eval_runtime": 0.5598,
"eval_samples_per_second": 446.553,
"eval_steps_per_second": 1.786,
"step": 860
},
{
"entropy": 0.4208260178565979,
"epoch": 43.05,
"grad_norm": 3.177051544189453,
"learning_rate": 1e-06,
"loss": 0.0438,
"mean_token_accuracy": 0.980252742767334,
"num_tokens": 225625977.0,
"step": 861
},
{
"entropy": 0.42219871282577515,
"epoch": 43.1,
"grad_norm": 4.462610721588135,
"learning_rate": 1e-06,
"loss": 0.0451,
"mean_token_accuracy": 0.9779735803604126,
"num_tokens": 225888027.0,
"step": 862
},
{
"entropy": 0.420362651348114,
"epoch": 43.15,
"grad_norm": 2.9560749530792236,
"learning_rate": 1e-06,
"loss": 0.0343,
"mean_token_accuracy": 0.9873754382133484,
"num_tokens": 226150059.0,
"step": 863
},
{
"entropy": 0.4182065427303314,
"epoch": 43.2,
"grad_norm": 6.363971710205078,
"learning_rate": 1e-06,
"loss": 0.0484,
"mean_token_accuracy": 0.982958972454071,
"num_tokens": 226412141.0,
"step": 864
},
{
"entropy": 0.42202645540237427,
"epoch": 43.25,
"grad_norm": 4.371028900146484,
"learning_rate": 1e-06,
"loss": 0.0365,
"mean_token_accuracy": 0.985382080078125,
"num_tokens": 226674206.0,
"step": 865
},
{
"entropy": 0.419817179441452,
"epoch": 43.3,
"grad_norm": 5.31802225112915,
"learning_rate": 1e-06,
"loss": 0.051,
"mean_token_accuracy": 0.9777777791023254,
"num_tokens": 226936212.0,
"step": 866
},
{
"entropy": 0.42121073603630066,
"epoch": 43.35,
"grad_norm": 5.029830455780029,
"learning_rate": 1e-06,
"loss": 0.0318,
"mean_token_accuracy": 0.9885404109954834,
"num_tokens": 227198232.0,
"step": 867
},
{
"entropy": 0.4217337667942047,
"epoch": 43.4,
"grad_norm": 2.7915053367614746,
"learning_rate": 1e-06,
"loss": 0.0345,
"mean_token_accuracy": 0.9873780608177185,
"num_tokens": 227460304.0,
"step": 868
},
{
"entropy": 0.42147403955459595,
"epoch": 43.45,
"grad_norm": 4.434348106384277,
"learning_rate": 1e-06,
"loss": 0.0575,
"mean_token_accuracy": 0.9757281541824341,
"num_tokens": 227722380.0,
"step": 869
},
{
"entropy": 0.42027783393859863,
"epoch": 43.5,
"grad_norm": 6.0223283767700195,
"learning_rate": 1e-06,
"loss": 0.0434,
"mean_token_accuracy": 0.981566846370697,
"num_tokens": 227984393.0,
"step": 870
},
{
"entropy": 0.4178963899612427,
"epoch": 43.55,
"grad_norm": 4.975872993469238,
"learning_rate": 1e-06,
"loss": 0.0364,
"mean_token_accuracy": 0.9844054579734802,
"num_tokens": 228246459.0,
"step": 871
},
{
"entropy": 0.41726964712142944,
"epoch": 43.6,
"grad_norm": 4.340257167816162,
"learning_rate": 1e-06,
"loss": 0.0346,
"mean_token_accuracy": 0.9853479862213135,
"num_tokens": 228508516.0,
"step": 872
},
{
"entropy": 0.41982972621917725,
"epoch": 43.65,
"grad_norm": 6.434086322784424,
"learning_rate": 1e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9811320900917053,
"num_tokens": 228770542.0,
"step": 873
},
{
"entropy": 0.4189579486846924,
"epoch": 43.7,
"grad_norm": 5.864006519317627,
"learning_rate": 1e-06,
"loss": 0.0444,
"mean_token_accuracy": 0.9811431765556335,
"num_tokens": 229032568.0,
"step": 874
},
{
"entropy": 0.4170803129673004,
"epoch": 43.75,
"grad_norm": 6.057931423187256,
"learning_rate": 1e-06,
"loss": 0.0392,
"mean_token_accuracy": 0.9832439422607422,
"num_tokens": 229294620.0,
"step": 875
},
{
"entropy": 0.41728881001472473,
"epoch": 43.8,
"grad_norm": 2.91896653175354,
"learning_rate": 1e-06,
"loss": 0.0402,
"mean_token_accuracy": 0.9860140085220337,
"num_tokens": 229556687.0,
"step": 876
},
{
"entropy": 0.4162396192550659,
"epoch": 43.85,
"grad_norm": 8.440881729125977,
"learning_rate": 1e-06,
"loss": 0.0462,
"mean_token_accuracy": 0.9821428656578064,
"num_tokens": 229818749.0,
"step": 877
},
{
"entropy": 0.4187852144241333,
"epoch": 43.9,
"grad_norm": 5.375515937805176,
"learning_rate": 1e-06,
"loss": 0.0492,
"mean_token_accuracy": 0.9800754189491272,
"num_tokens": 230080803.0,
"step": 878
},
{
"entropy": 0.41738641262054443,
"epoch": 43.95,
"grad_norm": 7.682968616485596,
"learning_rate": 1e-06,
"loss": 0.0373,
"mean_token_accuracy": 0.9865732789039612,
"num_tokens": 230342878.0,
"step": 879
},
{
"entropy": 0.4192846417427063,
"epoch": 44.0,
"grad_norm": 5.086990833282471,
"learning_rate": 1e-06,
"loss": 0.0388,
"mean_token_accuracy": 0.9832605719566345,
"num_tokens": 230604944.0,
"step": 880
},
{
"epoch": 44.0,
"eval_entropy": 0.42026257514953613,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9079301357269287,
"eval_num_tokens": 230604944.0,
"eval_runtime": 0.5735,
"eval_samples_per_second": 435.937,
"eval_steps_per_second": 1.744,
"step": 880
},
{
"entropy": 0.4180365204811096,
"epoch": 44.05,
"grad_norm": 3.057579755783081,
"learning_rate": 1e-06,
"loss": 0.032,
"mean_token_accuracy": 0.9887217879295349,
"num_tokens": 230867034.0,
"step": 881
},
{
"entropy": 0.4178960621356964,
"epoch": 44.1,
"grad_norm": 5.593236446380615,
"learning_rate": 1e-06,
"loss": 0.0315,
"mean_token_accuracy": 0.9854651093482971,
"num_tokens": 231129083.0,
"step": 882
},
{
"entropy": 0.4179683327674866,
"epoch": 44.15,
"grad_norm": 5.593606948852539,
"learning_rate": 1e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.9800342321395874,
"num_tokens": 231391132.0,
"step": 883
},
{
"entropy": 0.4196028411388397,
"epoch": 44.2,
"grad_norm": 3.4420926570892334,
"learning_rate": 1e-06,
"loss": 0.0384,
"mean_token_accuracy": 0.9837988615036011,
"num_tokens": 231653185.0,
"step": 884
},
{
"entropy": 0.4175465703010559,
"epoch": 44.25,
"grad_norm": 6.346271991729736,
"learning_rate": 1e-06,
"loss": 0.0478,
"mean_token_accuracy": 0.9762389659881592,
"num_tokens": 231915251.0,
"step": 885
},
{
"entropy": 0.416942834854126,
"epoch": 44.3,
"grad_norm": 9.280478477478027,
"learning_rate": 1e-06,
"loss": 0.0313,
"mean_token_accuracy": 0.9847715497016907,
"num_tokens": 232177321.0,
"step": 886
},
{
"entropy": 0.42018964886665344,
"epoch": 44.35,
"grad_norm": 6.544849395751953,
"learning_rate": 1e-06,
"loss": 0.043,
"mean_token_accuracy": 0.9793233275413513,
"num_tokens": 232439378.0,
"step": 887
},
{
"entropy": 0.41923967003822327,
"epoch": 44.4,
"grad_norm": 2.252636432647705,
"learning_rate": 1e-06,
"loss": 0.0417,
"mean_token_accuracy": 0.9808584451675415,
"num_tokens": 232701431.0,
"step": 888
},
{
"entropy": 0.4177750051021576,
"epoch": 44.45,
"grad_norm": 2.6777243614196777,
"learning_rate": 1e-06,
"loss": 0.0395,
"mean_token_accuracy": 0.9847931861877441,
"num_tokens": 232963470.0,
"step": 889
},
{
"entropy": 0.41958120465278625,
"epoch": 44.5,
"grad_norm": 3.8446385860443115,
"learning_rate": 1e-06,
"loss": 0.0369,
"mean_token_accuracy": 0.9842866063117981,
"num_tokens": 233225456.0,
"step": 890
},
{
"entropy": 0.4201487600803375,
"epoch": 44.55,
"grad_norm": 2.9724559783935547,
"learning_rate": 1e-06,
"loss": 0.0331,
"mean_token_accuracy": 0.9868995547294617,
"num_tokens": 233487520.0,
"step": 891
},
{
"entropy": 0.41942694783210754,
"epoch": 44.6,
"grad_norm": 4.230043888092041,
"learning_rate": 1e-06,
"loss": 0.0513,
"mean_token_accuracy": 0.9784172773361206,
"num_tokens": 233749531.0,
"step": 892
},
{
"entropy": 0.41816410422325134,
"epoch": 44.65,
"grad_norm": 7.377174377441406,
"learning_rate": 1e-06,
"loss": 0.0397,
"mean_token_accuracy": 0.9815521836280823,
"num_tokens": 234011564.0,
"step": 893
},
{
"entropy": 0.41752010583877563,
"epoch": 44.7,
"grad_norm": 10.760427474975586,
"learning_rate": 1e-06,
"loss": 0.0304,
"mean_token_accuracy": 0.9876638650894165,
"num_tokens": 234273619.0,
"step": 894
},
{
"entropy": 0.41638725996017456,
"epoch": 44.75,
"grad_norm": 12.622779846191406,
"learning_rate": 1e-06,
"loss": 0.0445,
"mean_token_accuracy": 0.981951892375946,
"num_tokens": 234535642.0,
"step": 895
},
{
"entropy": 0.41678711771965027,
"epoch": 44.8,
"grad_norm": 5.13592004776001,
"learning_rate": 1e-06,
"loss": 0.0378,
"mean_token_accuracy": 0.9822485446929932,
"num_tokens": 234797723.0,
"step": 896
},
{
"entropy": 0.417410671710968,
"epoch": 44.85,
"grad_norm": 4.6480607986450195,
"learning_rate": 1e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9833546876907349,
"num_tokens": 235059779.0,
"step": 897
},
{
"entropy": 0.41528117656707764,
"epoch": 44.9,
"grad_norm": 5.57151985168457,
"learning_rate": 1e-06,
"loss": 0.0402,
"mean_token_accuracy": 0.9841897487640381,
"num_tokens": 235321857.0,
"step": 898
},
{
"entropy": 0.4173772633075714,
"epoch": 44.95,
"grad_norm": 9.17105770111084,
"learning_rate": 1e-06,
"loss": 0.0414,
"mean_token_accuracy": 0.9790310859680176,
"num_tokens": 235583899.0,
"step": 899
},
{
"entropy": 0.4168033003807068,
"epoch": 45.0,
"grad_norm": 3.8450264930725098,
"learning_rate": 1e-06,
"loss": 0.0437,
"mean_token_accuracy": 0.9839246273040771,
"num_tokens": 235845966.0,
"step": 900
},
{
"epoch": 45.0,
"eval_entropy": 0.4168623983860016,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8965053558349609,
"eval_num_tokens": 235845966.0,
"eval_runtime": 0.5778,
"eval_samples_per_second": 432.681,
"eval_steps_per_second": 1.731,
"step": 900
},
{
"entropy": 0.41515421867370605,
"epoch": 45.05,
"grad_norm": 7.109200477600098,
"learning_rate": 1e-06,
"loss": 0.0523,
"mean_token_accuracy": 0.9788933396339417,
"num_tokens": 236108015.0,
"step": 901
},
{
"entropy": 0.41663509607315063,
"epoch": 45.1,
"grad_norm": 10.517169952392578,
"learning_rate": 1e-06,
"loss": 0.0341,
"mean_token_accuracy": 0.9875898361206055,
"num_tokens": 236370040.0,
"step": 902
},
{
"entropy": 0.42009973526000977,
"epoch": 45.15,
"grad_norm": 7.42244815826416,
"learning_rate": 1e-06,
"loss": 0.0381,
"mean_token_accuracy": 0.9828254580497742,
"num_tokens": 236632075.0,
"step": 903
},
{
"entropy": 0.418070524930954,
"epoch": 45.2,
"grad_norm": 6.486195087432861,
"learning_rate": 1e-06,
"loss": 0.0394,
"mean_token_accuracy": 0.9808374643325806,
"num_tokens": 236894110.0,
"step": 904
},
{
"entropy": 0.4179428815841675,
"epoch": 45.25,
"grad_norm": 4.680195331573486,
"learning_rate": 1e-06,
"loss": 0.0365,
"mean_token_accuracy": 0.9855072498321533,
"num_tokens": 237156164.0,
"step": 905
},
{
"entropy": 0.41826122999191284,
"epoch": 45.3,
"grad_norm": 4.5584282875061035,
"learning_rate": 1e-06,
"loss": 0.0345,
"mean_token_accuracy": 0.9875816702842712,
"num_tokens": 237418221.0,
"step": 906
},
{
"entropy": 0.41678982973098755,
"epoch": 45.35,
"grad_norm": 11.087577819824219,
"learning_rate": 1e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9833915829658508,
"num_tokens": 237680288.0,
"step": 907
},
{
"entropy": 0.4179689586162567,
"epoch": 45.4,
"grad_norm": 3.9059412479400635,
"learning_rate": 1e-06,
"loss": 0.039,
"mean_token_accuracy": 0.9850313663482666,
"num_tokens": 237942325.0,
"step": 908
},
{
"entropy": 0.4192636013031006,
"epoch": 45.45,
"grad_norm": 10.241129875183105,
"learning_rate": 1e-06,
"loss": 0.0458,
"mean_token_accuracy": 0.9825620651245117,
"num_tokens": 238204376.0,
"step": 909
},
{
"entropy": 0.41725245118141174,
"epoch": 45.5,
"grad_norm": 7.922611236572266,
"learning_rate": 1e-06,
"loss": 0.0506,
"mean_token_accuracy": 0.9770580530166626,
"num_tokens": 238466385.0,
"step": 910
},
{
"entropy": 0.41635823249816895,
"epoch": 45.55,
"grad_norm": 8.423656463623047,
"learning_rate": 1e-06,
"loss": 0.0423,
"mean_token_accuracy": 0.9865138530731201,
"num_tokens": 238728428.0,
"step": 911
},
{
"entropy": 0.4160480499267578,
"epoch": 45.6,
"grad_norm": 5.975074768066406,
"learning_rate": 1e-06,
"loss": 0.0459,
"mean_token_accuracy": 0.9800771474838257,
"num_tokens": 238990478.0,
"step": 912
},
{
"entropy": 0.4144827425479889,
"epoch": 45.65,
"grad_norm": 3.006824493408203,
"learning_rate": 1e-06,
"loss": 0.0367,
"mean_token_accuracy": 0.9852420091629028,
"num_tokens": 239252534.0,
"step": 913
},
{
"entropy": 0.4175737500190735,
"epoch": 45.7,
"grad_norm": 4.676286697387695,
"learning_rate": 1e-06,
"loss": 0.0336,
"mean_token_accuracy": 0.9869791865348816,
"num_tokens": 239514597.0,
"step": 914
},
{
"entropy": 0.4146580696105957,
"epoch": 45.75,
"grad_norm": 6.910285472869873,
"learning_rate": 1e-06,
"loss": 0.0474,
"mean_token_accuracy": 0.9800514578819275,
"num_tokens": 239776645.0,
"step": 915
},
{
"entropy": 0.4156876802444458,
"epoch": 45.8,
"grad_norm": 11.429252624511719,
"learning_rate": 1e-06,
"loss": 0.0488,
"mean_token_accuracy": 0.9763739109039307,
"num_tokens": 240038723.0,
"step": 916
},
{
"entropy": 0.4177379608154297,
"epoch": 45.85,
"grad_norm": 11.261126518249512,
"learning_rate": 1e-06,
"loss": 0.0525,
"mean_token_accuracy": 0.9810874462127686,
"num_tokens": 240300777.0,
"step": 917
},
{
"entropy": 0.4161033630371094,
"epoch": 45.9,
"grad_norm": 7.90402364730835,
"learning_rate": 1e-06,
"loss": 0.0458,
"mean_token_accuracy": 0.9802880883216858,
"num_tokens": 240562854.0,
"step": 918
},
{
"entropy": 0.4177625775337219,
"epoch": 45.95,
"grad_norm": 5.553004741668701,
"learning_rate": 1e-06,
"loss": 0.0412,
"mean_token_accuracy": 0.9828277230262756,
"num_tokens": 240824930.0,
"step": 919
},
{
"entropy": 0.41690176725387573,
"epoch": 46.0,
"grad_norm": 4.296492099761963,
"learning_rate": 1e-06,
"loss": 0.0408,
"mean_token_accuracy": 0.9832713603973389,
"num_tokens": 241086972.0,
"step": 920
},
{
"epoch": 46.0,
"eval_entropy": 0.41648760437965393,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9025537371635437,
"eval_num_tokens": 241086972.0,
"eval_runtime": 0.5516,
"eval_samples_per_second": 453.233,
"eval_steps_per_second": 1.813,
"step": 920
},
{
"entropy": 0.41621556878089905,
"epoch": 46.05,
"grad_norm": 4.381577014923096,
"learning_rate": 1e-06,
"loss": 0.0412,
"mean_token_accuracy": 0.9823434948921204,
"num_tokens": 241349005.0,
"step": 921
},
{
"entropy": 0.4166816174983978,
"epoch": 46.1,
"grad_norm": 5.124240875244141,
"learning_rate": 1e-06,
"loss": 0.0355,
"mean_token_accuracy": 0.9883086681365967,
"num_tokens": 241611046.0,
"step": 922
},
{
"entropy": 0.4189669191837311,
"epoch": 46.15,
"grad_norm": 5.907957553863525,
"learning_rate": 1e-06,
"loss": 0.0346,
"mean_token_accuracy": 0.9852768182754517,
"num_tokens": 241873106.0,
"step": 923
},
{
"entropy": 0.41633808612823486,
"epoch": 46.2,
"grad_norm": 7.788755893707275,
"learning_rate": 1e-06,
"loss": 0.0462,
"mean_token_accuracy": 0.9764492511749268,
"num_tokens": 242135124.0,
"step": 924
},
{
"entropy": 0.4165252447128296,
"epoch": 46.25,
"grad_norm": 5.588308811187744,
"learning_rate": 1e-06,
"loss": 0.0439,
"mean_token_accuracy": 0.977729856967926,
"num_tokens": 242397208.0,
"step": 925
},
{
"entropy": 0.4190508723258972,
"epoch": 46.3,
"grad_norm": 2.2610719203948975,
"learning_rate": 1e-06,
"loss": 0.0379,
"mean_token_accuracy": 0.9827175140380859,
"num_tokens": 242659248.0,
"step": 926
},
{
"entropy": 0.4179667830467224,
"epoch": 46.35,
"grad_norm": 3.7483508586883545,
"learning_rate": 1e-06,
"loss": 0.0403,
"mean_token_accuracy": 0.9825544953346252,
"num_tokens": 242921347.0,
"step": 927
},
{
"entropy": 0.41781550645828247,
"epoch": 46.4,
"grad_norm": 5.416518688201904,
"learning_rate": 1e-06,
"loss": 0.0388,
"mean_token_accuracy": 0.985497236251831,
"num_tokens": 243183388.0,
"step": 928
},
{
"entropy": 0.41736748814582825,
"epoch": 46.45,
"grad_norm": 3.2504491806030273,
"learning_rate": 1e-06,
"loss": 0.0352,
"mean_token_accuracy": 0.9862027764320374,
"num_tokens": 243445417.0,
"step": 929
},
{
"entropy": 0.41560643911361694,
"epoch": 46.5,
"grad_norm": 5.10044002532959,
"learning_rate": 1e-06,
"loss": 0.0618,
"mean_token_accuracy": 0.9725239872932434,
"num_tokens": 243707476.0,
"step": 930
},
{
"entropy": 0.41733497381210327,
"epoch": 46.55,
"grad_norm": 4.848858833312988,
"learning_rate": 1e-06,
"loss": 0.0356,
"mean_token_accuracy": 0.9850000143051147,
"num_tokens": 243969504.0,
"step": 931
},
{
"entropy": 0.41563642024993896,
"epoch": 46.6,
"grad_norm": 8.018087387084961,
"learning_rate": 1e-06,
"loss": 0.0373,
"mean_token_accuracy": 0.9849973917007446,
"num_tokens": 244231568.0,
"step": 932
},
{
"entropy": 0.415079265832901,
"epoch": 46.65,
"grad_norm": 4.895575523376465,
"learning_rate": 1e-06,
"loss": 0.0393,
"mean_token_accuracy": 0.9828532338142395,
"num_tokens": 244493619.0,
"step": 933
},
{
"entropy": 0.4182128608226776,
"epoch": 46.7,
"grad_norm": 6.3751220703125,
"learning_rate": 1e-06,
"loss": 0.0271,
"mean_token_accuracy": 0.9895547032356262,
"num_tokens": 244755635.0,
"step": 934
},
{
"entropy": 0.4158663749694824,
"epoch": 46.75,
"grad_norm": 4.839704513549805,
"learning_rate": 1e-06,
"loss": 0.05,
"mean_token_accuracy": 0.9784736037254333,
"num_tokens": 245017695.0,
"step": 935
},
{
"entropy": 0.4153270125389099,
"epoch": 46.8,
"grad_norm": 8.145550727844238,
"learning_rate": 1e-06,
"loss": 0.04,
"mean_token_accuracy": 0.9837110638618469,
"num_tokens": 245279766.0,
"step": 936
},
{
"entropy": 0.4160436987876892,
"epoch": 46.85,
"grad_norm": 3.469226598739624,
"learning_rate": 1e-06,
"loss": 0.0365,
"mean_token_accuracy": 0.9860582947731018,
"num_tokens": 245541838.0,
"step": 937
},
{
"entropy": 0.4171416759490967,
"epoch": 46.9,
"grad_norm": 6.618907928466797,
"learning_rate": 1e-06,
"loss": 0.0372,
"mean_token_accuracy": 0.9837837815284729,
"num_tokens": 245803865.0,
"step": 938
},
{
"entropy": 0.41581034660339355,
"epoch": 46.95,
"grad_norm": 3.842113494873047,
"learning_rate": 1e-06,
"loss": 0.0429,
"mean_token_accuracy": 0.9822601675987244,
"num_tokens": 246065914.0,
"step": 939
},
{
"entropy": 0.4186379015445709,
"epoch": 47.0,
"grad_norm": 4.248569965362549,
"learning_rate": 1e-06,
"loss": 0.0399,
"mean_token_accuracy": 0.9806201457977295,
"num_tokens": 246327983.0,
"step": 940
},
{
"epoch": 47.0,
"eval_entropy": 0.41853243112564087,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9012096524238586,
"eval_num_tokens": 246327983.0,
"eval_runtime": 0.5635,
"eval_samples_per_second": 443.637,
"eval_steps_per_second": 1.775,
"step": 940
},
{
"entropy": 0.41847890615463257,
"epoch": 47.05,
"grad_norm": 2.8673055171966553,
"learning_rate": 1e-06,
"loss": 0.0353,
"mean_token_accuracy": 0.9876089692115784,
"num_tokens": 246590029.0,
"step": 941
},
{
"entropy": 0.4161589741706848,
"epoch": 47.1,
"grad_norm": 4.205740928649902,
"learning_rate": 1e-06,
"loss": 0.0353,
"mean_token_accuracy": 0.9830721020698547,
"num_tokens": 246852085.0,
"step": 942
},
{
"entropy": 0.4160376787185669,
"epoch": 47.15,
"grad_norm": 6.36973237991333,
"learning_rate": 1e-06,
"loss": 0.032,
"mean_token_accuracy": 0.9876118302345276,
"num_tokens": 247114098.0,
"step": 943
},
{
"entropy": 0.4143107831478119,
"epoch": 47.2,
"grad_norm": 4.71365213394165,
"learning_rate": 1e-06,
"loss": 0.0306,
"mean_token_accuracy": 0.9879518151283264,
"num_tokens": 247376171.0,
"step": 944
},
{
"entropy": 0.41483235359191895,
"epoch": 47.25,
"grad_norm": 3.322777509689331,
"learning_rate": 1e-06,
"loss": 0.0476,
"mean_token_accuracy": 0.9799240231513977,
"num_tokens": 247638211.0,
"step": 945
},
{
"entropy": 0.41483980417251587,
"epoch": 47.3,
"grad_norm": 2.933046817779541,
"learning_rate": 1e-06,
"loss": 0.0323,
"mean_token_accuracy": 0.986775815486908,
"num_tokens": 247900260.0,
"step": 946
},
{
"entropy": 0.4141683280467987,
"epoch": 47.35,
"grad_norm": 8.35169506072998,
"learning_rate": 1e-06,
"loss": 0.0336,
"mean_token_accuracy": 0.9878836870193481,
"num_tokens": 248162322.0,
"step": 947
},
{
"entropy": 0.41416099667549133,
"epoch": 47.4,
"grad_norm": 8.373310089111328,
"learning_rate": 1e-06,
"loss": 0.0417,
"mean_token_accuracy": 0.9806157350540161,
"num_tokens": 248424339.0,
"step": 948
},
{
"entropy": 0.414173424243927,
"epoch": 47.45,
"grad_norm": 4.492981433868408,
"learning_rate": 1e-06,
"loss": 0.0282,
"mean_token_accuracy": 0.9873417615890503,
"num_tokens": 248686380.0,
"step": 949
},
{
"entropy": 0.4131871461868286,
"epoch": 47.5,
"grad_norm": 3.0811541080474854,
"learning_rate": 1e-06,
"loss": 0.0426,
"mean_token_accuracy": 0.9823922514915466,
"num_tokens": 248948455.0,
"step": 950
},
{
"entropy": 0.417308509349823,
"epoch": 47.55,
"grad_norm": 4.044399738311768,
"learning_rate": 1e-06,
"loss": 0.0418,
"mean_token_accuracy": 0.9816828966140747,
"num_tokens": 249210498.0,
"step": 951
},
{
"entropy": 0.4158850908279419,
"epoch": 47.6,
"grad_norm": 4.268923759460449,
"learning_rate": 1e-06,
"loss": 0.0306,
"mean_token_accuracy": 0.9876543283462524,
"num_tokens": 249472492.0,
"step": 952
},
{
"entropy": 0.41297072172164917,
"epoch": 47.65,
"grad_norm": 8.944028854370117,
"learning_rate": 1e-06,
"loss": 0.0344,
"mean_token_accuracy": 0.9835164546966553,
"num_tokens": 249734524.0,
"step": 953
},
{
"entropy": 0.41369563341140747,
"epoch": 47.7,
"grad_norm": 11.018954277038574,
"learning_rate": 1e-06,
"loss": 0.0594,
"mean_token_accuracy": 0.9824694991111755,
"num_tokens": 249996594.0,
"step": 954
},
{
"entropy": 0.412643164396286,
"epoch": 47.75,
"grad_norm": 12.327390670776367,
"learning_rate": 1e-06,
"loss": 0.0443,
"mean_token_accuracy": 0.9781690239906311,
"num_tokens": 250258673.0,
"step": 955
},
{
"entropy": 0.41407012939453125,
"epoch": 47.8,
"grad_norm": 7.58923864364624,
"learning_rate": 1e-06,
"loss": 0.0414,
"mean_token_accuracy": 0.9808841347694397,
"num_tokens": 250520742.0,
"step": 956
},
{
"entropy": 0.4143466055393219,
"epoch": 47.85,
"grad_norm": 5.423033237457275,
"learning_rate": 1e-06,
"loss": 0.0429,
"mean_token_accuracy": 0.9812453389167786,
"num_tokens": 250782800.0,
"step": 957
},
{
"entropy": 0.41478925943374634,
"epoch": 47.9,
"grad_norm": 7.479618072509766,
"learning_rate": 1e-06,
"loss": 0.041,
"mean_token_accuracy": 0.9829221963882446,
"num_tokens": 251044842.0,
"step": 958
},
{
"entropy": 0.4130868911743164,
"epoch": 47.95,
"grad_norm": 3.688286304473877,
"learning_rate": 1e-06,
"loss": 0.0472,
"mean_token_accuracy": 0.9771689772605896,
"num_tokens": 251306935.0,
"step": 959
},
{
"entropy": 0.41515159606933594,
"epoch": 48.0,
"grad_norm": 8.519615173339844,
"learning_rate": 1e-06,
"loss": 0.0442,
"mean_token_accuracy": 0.9841463565826416,
"num_tokens": 251569003.0,
"step": 960
},
{
"epoch": 48.0,
"eval_entropy": 0.41626524925231934,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8985214829444885,
"eval_num_tokens": 251569003.0,
"eval_runtime": 0.5603,
"eval_samples_per_second": 446.208,
"eval_steps_per_second": 1.785,
"step": 960
},
{
"entropy": 0.4145430326461792,
"epoch": 48.05,
"grad_norm": 6.435347080230713,
"learning_rate": 1e-06,
"loss": 0.0393,
"mean_token_accuracy": 0.9809423089027405,
"num_tokens": 251831056.0,
"step": 961
},
{
"entropy": 0.4150584936141968,
"epoch": 48.1,
"grad_norm": 7.255335807800293,
"learning_rate": 1e-06,
"loss": 0.0434,
"mean_token_accuracy": 0.9832116961479187,
"num_tokens": 252093151.0,
"step": 962
},
{
"entropy": 0.4166991114616394,
"epoch": 48.15,
"grad_norm": 6.78645133972168,
"learning_rate": 1e-06,
"loss": 0.0513,
"mean_token_accuracy": 0.9758485555648804,
"num_tokens": 252355210.0,
"step": 963
},
{
"entropy": 0.4148341417312622,
"epoch": 48.2,
"grad_norm": 15.08552074432373,
"learning_rate": 1e-06,
"loss": 0.0494,
"mean_token_accuracy": 0.9825581312179565,
"num_tokens": 252617271.0,
"step": 964
},
{
"entropy": 0.4165031313896179,
"epoch": 48.25,
"grad_norm": 3.8147826194763184,
"learning_rate": 1e-06,
"loss": 0.0387,
"mean_token_accuracy": 0.9838601350784302,
"num_tokens": 252879318.0,
"step": 965
},
{
"entropy": 0.41597020626068115,
"epoch": 48.3,
"grad_norm": 3.696824550628662,
"learning_rate": 1e-06,
"loss": 0.033,
"mean_token_accuracy": 0.9860774874687195,
"num_tokens": 253141332.0,
"step": 966
},
{
"entropy": 0.4169830083847046,
"epoch": 48.35,
"grad_norm": 4.46604061126709,
"learning_rate": 1e-06,
"loss": 0.0322,
"mean_token_accuracy": 0.9871951341629028,
"num_tokens": 253403367.0,
"step": 967
},
{
"entropy": 0.4176675081253052,
"epoch": 48.4,
"grad_norm": 9.332520484924316,
"learning_rate": 1e-06,
"loss": 0.0479,
"mean_token_accuracy": 0.978586733341217,
"num_tokens": 253665460.0,
"step": 968
},
{
"entropy": 0.4181513786315918,
"epoch": 48.45,
"grad_norm": 3.3263468742370605,
"learning_rate": 1e-06,
"loss": 0.0399,
"mean_token_accuracy": 0.9831246733665466,
"num_tokens": 253927527.0,
"step": 969
},
{
"entropy": 0.415880024433136,
"epoch": 48.5,
"grad_norm": 6.814778804779053,
"learning_rate": 1e-06,
"loss": 0.0372,
"mean_token_accuracy": 0.983660101890564,
"num_tokens": 254189584.0,
"step": 970
},
{
"entropy": 0.4180268943309784,
"epoch": 48.55,
"grad_norm": 5.279390335083008,
"learning_rate": 1e-06,
"loss": 0.0286,
"mean_token_accuracy": 0.9860907793045044,
"num_tokens": 254451675.0,
"step": 971
},
{
"entropy": 0.41518670320510864,
"epoch": 48.6,
"grad_norm": 4.843584060668945,
"learning_rate": 1e-06,
"loss": 0.0416,
"mean_token_accuracy": 0.9814371466636658,
"num_tokens": 254713674.0,
"step": 972
},
{
"entropy": 0.41497260332107544,
"epoch": 48.65,
"grad_norm": 4.6544036865234375,
"learning_rate": 1e-06,
"loss": 0.0347,
"mean_token_accuracy": 0.9851101636886597,
"num_tokens": 254975748.0,
"step": 973
},
{
"entropy": 0.41487622261047363,
"epoch": 48.7,
"grad_norm": 11.355942726135254,
"learning_rate": 1e-06,
"loss": 0.0363,
"mean_token_accuracy": 0.9817721247673035,
"num_tokens": 255237755.0,
"step": 974
},
{
"entropy": 0.4146149754524231,
"epoch": 48.75,
"grad_norm": 5.135185241699219,
"learning_rate": 1e-06,
"loss": 0.0368,
"mean_token_accuracy": 0.9845132827758789,
"num_tokens": 255499803.0,
"step": 975
},
{
"entropy": 0.4158018231391907,
"epoch": 48.8,
"grad_norm": 9.837135314941406,
"learning_rate": 1e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.98525071144104,
"num_tokens": 255761851.0,
"step": 976
},
{
"entropy": 0.41550880670547485,
"epoch": 48.85,
"grad_norm": 2.997006893157959,
"learning_rate": 1e-06,
"loss": 0.0314,
"mean_token_accuracy": 0.9878566861152649,
"num_tokens": 256023893.0,
"step": 977
},
{
"entropy": 0.4196978807449341,
"epoch": 48.9,
"grad_norm": 4.659507751464844,
"learning_rate": 1e-06,
"loss": 0.0395,
"mean_token_accuracy": 0.9817137122154236,
"num_tokens": 256285938.0,
"step": 978
},
{
"entropy": 0.4174753427505493,
"epoch": 48.95,
"grad_norm": 7.701611518859863,
"learning_rate": 1e-06,
"loss": 0.0444,
"mean_token_accuracy": 0.9807291626930237,
"num_tokens": 256547989.0,
"step": 979
},
{
"entropy": 0.41477835178375244,
"epoch": 49.0,
"grad_norm": 5.701596260070801,
"learning_rate": 1e-06,
"loss": 0.0334,
"mean_token_accuracy": 0.9884169697761536,
"num_tokens": 256810037.0,
"step": 980
},
{
"epoch": 49.0,
"eval_entropy": 0.4165271818637848,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.9005376100540161,
"eval_num_tokens": 256810037.0,
"eval_runtime": 0.5618,
"eval_samples_per_second": 445.006,
"eval_steps_per_second": 1.78,
"step": 980
},
{
"entropy": 0.4151040017604828,
"epoch": 49.05,
"grad_norm": 7.52377462387085,
"learning_rate": 1e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.983753502368927,
"num_tokens": 257072085.0,
"step": 981
},
{
"entropy": 0.41499486565589905,
"epoch": 49.1,
"grad_norm": 5.606761455535889,
"learning_rate": 1e-06,
"loss": 0.0359,
"mean_token_accuracy": 0.9828641414642334,
"num_tokens": 257334114.0,
"step": 982
},
{
"entropy": 0.4133911728858948,
"epoch": 49.15,
"grad_norm": 3.4444432258605957,
"learning_rate": 1e-06,
"loss": 0.0296,
"mean_token_accuracy": 0.9844412803649902,
"num_tokens": 257596187.0,
"step": 983
},
{
"entropy": 0.41423529386520386,
"epoch": 49.2,
"grad_norm": 4.566822052001953,
"learning_rate": 1e-06,
"loss": 0.0335,
"mean_token_accuracy": 0.9858490824699402,
"num_tokens": 257858264.0,
"step": 984
},
{
"entropy": 0.41284215450286865,
"epoch": 49.25,
"grad_norm": 10.468485832214355,
"learning_rate": 1e-06,
"loss": 0.0444,
"mean_token_accuracy": 0.9830007553100586,
"num_tokens": 258120309.0,
"step": 985
},
{
"entropy": 0.41363033652305603,
"epoch": 49.3,
"grad_norm": 5.765902996063232,
"learning_rate": 1e-06,
"loss": 0.0312,
"mean_token_accuracy": 0.985897421836853,
"num_tokens": 258382396.0,
"step": 986
},
{
"entropy": 0.4127556085586548,
"epoch": 49.35,
"grad_norm": 7.179581165313721,
"learning_rate": 1e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.9815910458564758,
"num_tokens": 258644477.0,
"step": 987
},
{
"entropy": 0.41511815786361694,
"epoch": 49.4,
"grad_norm": 8.856917381286621,
"learning_rate": 1e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9796854257583618,
"num_tokens": 258906530.0,
"step": 988
},
{
"entropy": 0.41756629943847656,
"epoch": 49.45,
"grad_norm": 5.423385143280029,
"learning_rate": 1e-06,
"loss": 0.0357,
"mean_token_accuracy": 0.9856938719749451,
"num_tokens": 259168554.0,
"step": 989
},
{
"entropy": 0.41290774941444397,
"epoch": 49.5,
"grad_norm": 7.986942768096924,
"learning_rate": 1e-06,
"loss": 0.0399,
"mean_token_accuracy": 0.9850448369979858,
"num_tokens": 259430625.0,
"step": 990
},
{
"entropy": 0.4130344092845917,
"epoch": 49.55,
"grad_norm": 5.943664073944092,
"learning_rate": 1e-06,
"loss": 0.0351,
"mean_token_accuracy": 0.9877350926399231,
"num_tokens": 259692705.0,
"step": 991
},
{
"entropy": 0.4145318269729614,
"epoch": 49.6,
"grad_norm": 6.782146453857422,
"learning_rate": 1e-06,
"loss": 0.0332,
"mean_token_accuracy": 0.9839532971382141,
"num_tokens": 259954702.0,
"step": 992
},
{
"entropy": 0.41375476121902466,
"epoch": 49.65,
"grad_norm": 9.113283157348633,
"learning_rate": 1e-06,
"loss": 0.0471,
"mean_token_accuracy": 0.9787600636482239,
"num_tokens": 260216740.0,
"step": 993
},
{
"entropy": 0.41189008951187134,
"epoch": 49.7,
"grad_norm": 10.127237319946289,
"learning_rate": 1e-06,
"loss": 0.0328,
"mean_token_accuracy": 0.9894217252731323,
"num_tokens": 260478817.0,
"step": 994
},
{
"entropy": 0.4134674072265625,
"epoch": 49.75,
"grad_norm": 3.2917778491973877,
"learning_rate": 1e-06,
"loss": 0.0389,
"mean_token_accuracy": 0.9848576784133911,
"num_tokens": 260740863.0,
"step": 995
},
{
"entropy": 0.4141203761100769,
"epoch": 49.8,
"grad_norm": 12.72966194152832,
"learning_rate": 1e-06,
"loss": 0.0411,
"mean_token_accuracy": 0.9854904413223267,
"num_tokens": 261002915.0,
"step": 996
},
{
"entropy": 0.41340839862823486,
"epoch": 49.85,
"grad_norm": 5.995748519897461,
"learning_rate": 1e-06,
"loss": 0.0478,
"mean_token_accuracy": 0.9802231192588806,
"num_tokens": 261264952.0,
"step": 997
},
{
"entropy": 0.4108712673187256,
"epoch": 49.9,
"grad_norm": 2.843021869659424,
"learning_rate": 1e-06,
"loss": 0.0386,
"mean_token_accuracy": 0.9838079214096069,
"num_tokens": 261526973.0,
"step": 998
},
{
"entropy": 0.41318273544311523,
"epoch": 49.95,
"grad_norm": 5.03126859664917,
"learning_rate": 1e-06,
"loss": 0.0338,
"mean_token_accuracy": 0.982284665107727,
"num_tokens": 261789005.0,
"step": 999
},
{
"entropy": 0.41257742047309875,
"epoch": 50.0,
"grad_norm": 6.049678802490234,
"learning_rate": 1e-06,
"loss": 0.0375,
"mean_token_accuracy": 0.9856985807418823,
"num_tokens": 262051053.0,
"step": 1000
},
{
"epoch": 50.0,
"eval_entropy": 0.41491925716400146,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.8998655676841736,
"eval_num_tokens": 262051053.0,
"eval_runtime": 0.5646,
"eval_samples_per_second": 442.828,
"eval_steps_per_second": 1.771,
"step": 1000
},
{
"epoch": 50.0,
"step": 1000,
"total_flos": 227901702144000.0,
"train_loss": 0.14342150183208285,
"train_runtime": 3480.9498,
"train_samples_per_second": 71.819,
"train_steps_per_second": 0.287
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 227901702144000.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}