clean-subliminal-learning-foxes / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-804)
144af0c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.12656632065773,
"epoch": 0.003738317757009346,
"grad_norm": 0.4271441102027893,
"learning_rate": 0.0002,
"loss": 2.4663805961608887,
"mean_token_accuracy": 0.543229952454567,
"num_tokens": 16235.0,
"step": 1
},
{
"entropy": 1.2336603701114655,
"epoch": 0.007476635514018692,
"grad_norm": 0.38558802008628845,
"learning_rate": 0.0002,
"loss": 2.1421403884887695,
"mean_token_accuracy": 0.5718609094619751,
"num_tokens": 32508.0,
"step": 2
},
{
"entropy": 1.3997833728790283,
"epoch": 0.011214953271028037,
"grad_norm": 0.2918585538864136,
"learning_rate": 0.0002,
"loss": 1.7201573848724365,
"mean_token_accuracy": 0.5951470136642456,
"num_tokens": 48740.0,
"step": 3
},
{
"entropy": 1.3798432350158691,
"epoch": 0.014953271028037384,
"grad_norm": 0.22533445060253143,
"learning_rate": 0.0002,
"loss": 1.409985899925232,
"mean_token_accuracy": 0.6346195936203003,
"num_tokens": 65174.0,
"step": 4
},
{
"entropy": 1.3409797251224518,
"epoch": 0.018691588785046728,
"grad_norm": 0.3003067374229431,
"learning_rate": 0.0002,
"loss": 1.28883695602417,
"mean_token_accuracy": 0.6407334357500076,
"num_tokens": 81213.0,
"step": 5
},
{
"entropy": 1.2644231617450714,
"epoch": 0.022429906542056073,
"grad_norm": 0.1622222661972046,
"learning_rate": 0.0002,
"loss": 1.1853853464126587,
"mean_token_accuracy": 0.6605143547058105,
"num_tokens": 97766.0,
"step": 6
},
{
"entropy": 1.208539754152298,
"epoch": 0.026168224299065422,
"grad_norm": 0.10511886328458786,
"learning_rate": 0.0002,
"loss": 1.1084699630737305,
"mean_token_accuracy": 0.6641467809677124,
"num_tokens": 114186.0,
"step": 7
},
{
"entropy": 1.1391400694847107,
"epoch": 0.029906542056074768,
"grad_norm": 0.10200454294681549,
"learning_rate": 0.0002,
"loss": 1.0516071319580078,
"mean_token_accuracy": 0.6707163900136948,
"num_tokens": 130305.0,
"step": 8
},
{
"entropy": 1.0563430190086365,
"epoch": 0.03364485981308411,
"grad_norm": 0.1273493468761444,
"learning_rate": 0.0002,
"loss": 0.992067813873291,
"mean_token_accuracy": 0.6933889836072922,
"num_tokens": 146652.0,
"step": 9
},
{
"entropy": 0.9964777082204819,
"epoch": 0.037383177570093455,
"grad_norm": 0.1289750188589096,
"learning_rate": 0.0002,
"loss": 0.9485647082328796,
"mean_token_accuracy": 0.6941430121660233,
"num_tokens": 162967.0,
"step": 10
},
{
"entropy": 0.9659603089094162,
"epoch": 0.041121495327102804,
"grad_norm": 0.10667150467634201,
"learning_rate": 0.0002,
"loss": 0.8798340559005737,
"mean_token_accuracy": 0.7052389085292816,
"num_tokens": 179255.0,
"step": 11
},
{
"entropy": 0.9392479658126831,
"epoch": 0.044859813084112146,
"grad_norm": 0.11929332464933395,
"learning_rate": 0.0002,
"loss": 0.8541638851165771,
"mean_token_accuracy": 0.7038426250219345,
"num_tokens": 195430.0,
"step": 12
},
{
"entropy": 0.863442063331604,
"epoch": 0.048598130841121495,
"grad_norm": 1.4121192693710327,
"learning_rate": 0.0002,
"loss": 0.8078625202178955,
"mean_token_accuracy": 0.7139769345521927,
"num_tokens": 211424.0,
"step": 13
},
{
"entropy": 0.8306129276752472,
"epoch": 0.052336448598130844,
"grad_norm": 0.10941090434789658,
"learning_rate": 0.0002,
"loss": 0.7781446576118469,
"mean_token_accuracy": 0.7239344716072083,
"num_tokens": 227810.0,
"step": 14
},
{
"entropy": 0.7757371664047241,
"epoch": 0.056074766355140186,
"grad_norm": 0.10486897826194763,
"learning_rate": 0.0002,
"loss": 0.7468726634979248,
"mean_token_accuracy": 0.7250657230615616,
"num_tokens": 243991.0,
"step": 15
},
{
"entropy": 0.7809797525405884,
"epoch": 0.059813084112149535,
"grad_norm": 0.8654316663742065,
"learning_rate": 0.0002,
"loss": 0.7594712376594543,
"mean_token_accuracy": 0.7155007421970367,
"num_tokens": 260281.0,
"step": 16
},
{
"entropy": 0.7353586554527283,
"epoch": 0.06355140186915888,
"grad_norm": 0.0876963660120964,
"learning_rate": 0.0002,
"loss": 0.7153522372245789,
"mean_token_accuracy": 0.7296042591333389,
"num_tokens": 276669.0,
"step": 17
},
{
"entropy": 0.6980300098657608,
"epoch": 0.06728971962616823,
"grad_norm": 0.07835765182971954,
"learning_rate": 0.0002,
"loss": 0.6894713640213013,
"mean_token_accuracy": 0.7386218756437302,
"num_tokens": 292849.0,
"step": 18
},
{
"entropy": 0.6726928502321243,
"epoch": 0.07102803738317758,
"grad_norm": 0.08941305428743362,
"learning_rate": 0.0002,
"loss": 0.6766728162765503,
"mean_token_accuracy": 0.7433070838451385,
"num_tokens": 309145.0,
"step": 19
},
{
"entropy": 0.6663309931755066,
"epoch": 0.07476635514018691,
"grad_norm": 0.08141425251960754,
"learning_rate": 0.0002,
"loss": 0.6594260334968567,
"mean_token_accuracy": 0.7467465251684189,
"num_tokens": 325653.0,
"step": 20
},
{
"entropy": 0.6754828691482544,
"epoch": 0.07850467289719626,
"grad_norm": 0.08411722630262375,
"learning_rate": 0.0002,
"loss": 0.661962628364563,
"mean_token_accuracy": 0.7418759763240814,
"num_tokens": 341884.0,
"step": 21
},
{
"entropy": 0.6487417817115784,
"epoch": 0.08224299065420561,
"grad_norm": 0.08564816415309906,
"learning_rate": 0.0002,
"loss": 0.6224545240402222,
"mean_token_accuracy": 0.7568920999765396,
"num_tokens": 358367.0,
"step": 22
},
{
"entropy": 0.6594859212636948,
"epoch": 0.08598130841121496,
"grad_norm": 0.08242395520210266,
"learning_rate": 0.0002,
"loss": 0.6353108882904053,
"mean_token_accuracy": 0.748349204659462,
"num_tokens": 374461.0,
"step": 23
},
{
"entropy": 0.6361121833324432,
"epoch": 0.08971962616822429,
"grad_norm": 0.06784524023532867,
"learning_rate": 0.0002,
"loss": 0.6177537441253662,
"mean_token_accuracy": 0.7591407150030136,
"num_tokens": 390663.0,
"step": 24
},
{
"entropy": 0.633724257349968,
"epoch": 0.09345794392523364,
"grad_norm": 0.06730605661869049,
"learning_rate": 0.0002,
"loss": 0.6257245540618896,
"mean_token_accuracy": 0.7586156576871872,
"num_tokens": 407000.0,
"step": 25
},
{
"entropy": 0.6177336722612381,
"epoch": 0.09719626168224299,
"grad_norm": 0.07131887227296829,
"learning_rate": 0.0002,
"loss": 0.6150547862052917,
"mean_token_accuracy": 0.7589291036128998,
"num_tokens": 423358.0,
"step": 26
},
{
"entropy": 0.6160949915647507,
"epoch": 0.10093457943925234,
"grad_norm": 0.06616901606321335,
"learning_rate": 0.0002,
"loss": 0.6125935316085815,
"mean_token_accuracy": 0.7595443874597549,
"num_tokens": 439799.0,
"step": 27
},
{
"entropy": 0.6129452586174011,
"epoch": 0.10467289719626169,
"grad_norm": 0.05841955915093422,
"learning_rate": 0.0002,
"loss": 0.608031153678894,
"mean_token_accuracy": 0.7601521760225296,
"num_tokens": 456163.0,
"step": 28
},
{
"entropy": 0.5918006747961044,
"epoch": 0.10841121495327102,
"grad_norm": 0.06275882571935654,
"learning_rate": 0.0002,
"loss": 0.5862717628479004,
"mean_token_accuracy": 0.7687633484601974,
"num_tokens": 472127.0,
"step": 29
},
{
"entropy": 0.6155861914157867,
"epoch": 0.11214953271028037,
"grad_norm": 0.06225947290658951,
"learning_rate": 0.0002,
"loss": 0.6084246635437012,
"mean_token_accuracy": 0.7598295211791992,
"num_tokens": 488332.0,
"step": 30
},
{
"entropy": 0.6035004556179047,
"epoch": 0.11588785046728972,
"grad_norm": 0.06444618105888367,
"learning_rate": 0.0002,
"loss": 0.5935206413269043,
"mean_token_accuracy": 0.7651257067918777,
"num_tokens": 504710.0,
"step": 31
},
{
"entropy": 0.6106846928596497,
"epoch": 0.11962616822429907,
"grad_norm": 0.0602172389626503,
"learning_rate": 0.0002,
"loss": 0.5996757745742798,
"mean_token_accuracy": 0.760893777012825,
"num_tokens": 521082.0,
"step": 32
},
{
"entropy": 0.5945021361112595,
"epoch": 0.1233644859813084,
"grad_norm": 0.06356704980134964,
"learning_rate": 0.0002,
"loss": 0.5945574045181274,
"mean_token_accuracy": 0.765913113951683,
"num_tokens": 537475.0,
"step": 33
},
{
"entropy": 0.5772300958633423,
"epoch": 0.12710280373831775,
"grad_norm": 0.06089172512292862,
"learning_rate": 0.0002,
"loss": 0.5904273986816406,
"mean_token_accuracy": 0.76410873234272,
"num_tokens": 553508.0,
"step": 34
},
{
"entropy": 0.550044596195221,
"epoch": 0.1308411214953271,
"grad_norm": 0.06109277158975601,
"learning_rate": 0.0002,
"loss": 0.5613641142845154,
"mean_token_accuracy": 0.7737480998039246,
"num_tokens": 569417.0,
"step": 35
},
{
"entropy": 0.5723532140254974,
"epoch": 0.13457943925233645,
"grad_norm": 0.05618736520409584,
"learning_rate": 0.0002,
"loss": 0.5777797698974609,
"mean_token_accuracy": 0.7723707407712936,
"num_tokens": 585786.0,
"step": 36
},
{
"entropy": 0.580461397767067,
"epoch": 0.1383177570093458,
"grad_norm": 0.05472671613097191,
"learning_rate": 0.0002,
"loss": 0.5808417201042175,
"mean_token_accuracy": 0.7668861597776413,
"num_tokens": 602132.0,
"step": 37
},
{
"entropy": 0.5738302320241928,
"epoch": 0.14205607476635515,
"grad_norm": 0.06117068976163864,
"learning_rate": 0.0002,
"loss": 0.57148277759552,
"mean_token_accuracy": 0.774108350276947,
"num_tokens": 618157.0,
"step": 38
},
{
"entropy": 0.5823365598917007,
"epoch": 0.14579439252336449,
"grad_norm": 0.05150913447141647,
"learning_rate": 0.0002,
"loss": 0.5758525729179382,
"mean_token_accuracy": 0.7670020014047623,
"num_tokens": 634401.0,
"step": 39
},
{
"entropy": 0.5688591003417969,
"epoch": 0.14953271028037382,
"grad_norm": 0.054129600524902344,
"learning_rate": 0.0002,
"loss": 0.5642731189727783,
"mean_token_accuracy": 0.7723482251167297,
"num_tokens": 650471.0,
"step": 40
},
{
"entropy": 0.5930688679218292,
"epoch": 0.15327102803738318,
"grad_norm": 0.04651381075382233,
"learning_rate": 0.0002,
"loss": 0.5882899761199951,
"mean_token_accuracy": 0.7660222053527832,
"num_tokens": 667141.0,
"step": 41
},
{
"entropy": 0.5680070519447327,
"epoch": 0.15700934579439252,
"grad_norm": 0.04372819885611534,
"learning_rate": 0.0002,
"loss": 0.5683910846710205,
"mean_token_accuracy": 0.7714007496833801,
"num_tokens": 683716.0,
"step": 42
},
{
"entropy": 0.577846348285675,
"epoch": 0.16074766355140188,
"grad_norm": 0.050794582813978195,
"learning_rate": 0.0002,
"loss": 0.5828132033348083,
"mean_token_accuracy": 0.7683440744876862,
"num_tokens": 700166.0,
"step": 43
},
{
"entropy": 0.5514896064996719,
"epoch": 0.16448598130841122,
"grad_norm": 0.05992089584469795,
"learning_rate": 0.0002,
"loss": 0.563271701335907,
"mean_token_accuracy": 0.7739104330539703,
"num_tokens": 716342.0,
"step": 44
},
{
"entropy": 0.575609490275383,
"epoch": 0.16822429906542055,
"grad_norm": 0.05013341084122658,
"learning_rate": 0.0002,
"loss": 0.5849894285202026,
"mean_token_accuracy": 0.7635113149881363,
"num_tokens": 732893.0,
"step": 45
},
{
"entropy": 0.5762993842363358,
"epoch": 0.17196261682242991,
"grad_norm": 0.048744700849056244,
"learning_rate": 0.0002,
"loss": 0.574410080909729,
"mean_token_accuracy": 0.7676838040351868,
"num_tokens": 749295.0,
"step": 46
},
{
"entropy": 0.5723859369754791,
"epoch": 0.17570093457943925,
"grad_norm": 0.05009591579437256,
"learning_rate": 0.0002,
"loss": 0.5668792128562927,
"mean_token_accuracy": 0.7715302407741547,
"num_tokens": 765549.0,
"step": 47
},
{
"entropy": 0.5764475762844086,
"epoch": 0.17943925233644858,
"grad_norm": 0.04878581687808037,
"learning_rate": 0.0002,
"loss": 0.5665475130081177,
"mean_token_accuracy": 0.7720314264297485,
"num_tokens": 781843.0,
"step": 48
},
{
"entropy": 0.5845135897397995,
"epoch": 0.18317757009345795,
"grad_norm": 0.04589271917939186,
"learning_rate": 0.0002,
"loss": 0.5771698951721191,
"mean_token_accuracy": 0.7694474011659622,
"num_tokens": 798405.0,
"step": 49
},
{
"entropy": 0.569475919008255,
"epoch": 0.18691588785046728,
"grad_norm": 0.04119531437754631,
"learning_rate": 0.0002,
"loss": 0.5674958229064941,
"mean_token_accuracy": 0.7736699432134628,
"num_tokens": 814777.0,
"step": 50
},
{
"entropy": 0.5692360401153564,
"epoch": 0.19065420560747665,
"grad_norm": 0.0399826280772686,
"learning_rate": 0.0002,
"loss": 0.5702151656150818,
"mean_token_accuracy": 0.7684639543294907,
"num_tokens": 831134.0,
"step": 51
},
{
"entropy": 0.5498989522457123,
"epoch": 0.19439252336448598,
"grad_norm": 0.05800061300396919,
"learning_rate": 0.0002,
"loss": 0.5609486699104309,
"mean_token_accuracy": 0.7740016728639603,
"num_tokens": 847344.0,
"step": 52
},
{
"entropy": 0.5662340968847275,
"epoch": 0.19813084112149532,
"grad_norm": 0.047494642436504364,
"learning_rate": 0.0002,
"loss": 0.5766743421554565,
"mean_token_accuracy": 0.7678139507770538,
"num_tokens": 863618.0,
"step": 53
},
{
"entropy": 0.5752062201499939,
"epoch": 0.20186915887850468,
"grad_norm": 0.05196239426732063,
"learning_rate": 0.0002,
"loss": 0.5726749300956726,
"mean_token_accuracy": 0.7699306309223175,
"num_tokens": 879844.0,
"step": 54
},
{
"entropy": 0.5600160509347916,
"epoch": 0.205607476635514,
"grad_norm": 0.04689890146255493,
"learning_rate": 0.0002,
"loss": 0.5549072623252869,
"mean_token_accuracy": 0.7740037143230438,
"num_tokens": 896085.0,
"step": 55
},
{
"entropy": 0.5731441378593445,
"epoch": 0.20934579439252338,
"grad_norm": 0.04465720057487488,
"learning_rate": 0.0002,
"loss": 0.5675906538963318,
"mean_token_accuracy": 0.7729700356721878,
"num_tokens": 912450.0,
"step": 56
},
{
"entropy": 0.5865043848752975,
"epoch": 0.2130841121495327,
"grad_norm": 0.03869406878948212,
"learning_rate": 0.0002,
"loss": 0.5814957022666931,
"mean_token_accuracy": 0.7672637850046158,
"num_tokens": 928895.0,
"step": 57
},
{
"entropy": 0.5643806457519531,
"epoch": 0.21682242990654205,
"grad_norm": 0.03822167217731476,
"learning_rate": 0.0002,
"loss": 0.5589677691459656,
"mean_token_accuracy": 0.7748006731271744,
"num_tokens": 945239.0,
"step": 58
},
{
"entropy": 0.5692119598388672,
"epoch": 0.2205607476635514,
"grad_norm": 0.042791273444890976,
"learning_rate": 0.0002,
"loss": 0.5735809206962585,
"mean_token_accuracy": 0.7694528251886368,
"num_tokens": 961363.0,
"step": 59
},
{
"entropy": 0.5709938555955887,
"epoch": 0.22429906542056074,
"grad_norm": 0.04215843975543976,
"learning_rate": 0.0002,
"loss": 0.5765149593353271,
"mean_token_accuracy": 0.7663712352514267,
"num_tokens": 977455.0,
"step": 60
},
{
"entropy": 0.5535417348146439,
"epoch": 0.22803738317757008,
"grad_norm": 0.046243466436862946,
"learning_rate": 0.0002,
"loss": 0.5625483989715576,
"mean_token_accuracy": 0.7734335362911224,
"num_tokens": 993620.0,
"step": 61
},
{
"entropy": 0.5543283224105835,
"epoch": 0.23177570093457944,
"grad_norm": 0.0379357784986496,
"learning_rate": 0.0002,
"loss": 0.5572807788848877,
"mean_token_accuracy": 0.7759047746658325,
"num_tokens": 1009834.0,
"step": 62
},
{
"entropy": 0.5534257739782333,
"epoch": 0.23551401869158878,
"grad_norm": 0.03617486730217934,
"learning_rate": 0.0002,
"loss": 0.5538501739501953,
"mean_token_accuracy": 0.7762316316366196,
"num_tokens": 1025981.0,
"step": 63
},
{
"entropy": 0.5688228756189346,
"epoch": 0.23925233644859814,
"grad_norm": 0.03479798510670662,
"learning_rate": 0.0002,
"loss": 0.5626167058944702,
"mean_token_accuracy": 0.7745891660451889,
"num_tokens": 1042596.0,
"step": 64
},
{
"entropy": 0.5704841166734695,
"epoch": 0.24299065420560748,
"grad_norm": 0.04157167300581932,
"learning_rate": 0.0002,
"loss": 0.568891704082489,
"mean_token_accuracy": 0.7680116444826126,
"num_tokens": 1058884.0,
"step": 65
},
{
"entropy": 0.5743043571710587,
"epoch": 0.2467289719626168,
"grad_norm": 0.03632580116391182,
"learning_rate": 0.0002,
"loss": 0.5711199045181274,
"mean_token_accuracy": 0.769555926322937,
"num_tokens": 1075319.0,
"step": 66
},
{
"entropy": 0.559576690196991,
"epoch": 0.2504672897196262,
"grad_norm": 0.038374125957489014,
"learning_rate": 0.0002,
"loss": 0.5629530549049377,
"mean_token_accuracy": 0.771178126335144,
"num_tokens": 1091451.0,
"step": 67
},
{
"entropy": 0.5548212379217148,
"epoch": 0.2542056074766355,
"grad_norm": 0.03802485764026642,
"learning_rate": 0.0002,
"loss": 0.5578069686889648,
"mean_token_accuracy": 0.7767467051744461,
"num_tokens": 1107549.0,
"step": 68
},
{
"entropy": 0.5731668472290039,
"epoch": 0.25794392523364484,
"grad_norm": 0.03902502730488777,
"learning_rate": 0.0002,
"loss": 0.5750908255577087,
"mean_token_accuracy": 0.7706117182970047,
"num_tokens": 1123904.0,
"step": 69
},
{
"entropy": 0.5669015496969223,
"epoch": 0.2616822429906542,
"grad_norm": 0.03905792534351349,
"learning_rate": 0.0002,
"loss": 0.5693663358688354,
"mean_token_accuracy": 0.7708643227815628,
"num_tokens": 1139931.0,
"step": 70
},
{
"entropy": 0.5661756098270416,
"epoch": 0.26542056074766357,
"grad_norm": 0.04826045408844948,
"learning_rate": 0.0002,
"loss": 0.5717936754226685,
"mean_token_accuracy": 0.7682332992553711,
"num_tokens": 1156090.0,
"step": 71
},
{
"entropy": 0.568753570318222,
"epoch": 0.2691588785046729,
"grad_norm": 0.03873279318213463,
"learning_rate": 0.0002,
"loss": 0.5717485547065735,
"mean_token_accuracy": 0.7686503529548645,
"num_tokens": 1172312.0,
"step": 72
},
{
"entropy": 0.5719727724790573,
"epoch": 0.27289719626168224,
"grad_norm": 0.039684589952230453,
"learning_rate": 0.0002,
"loss": 0.565541684627533,
"mean_token_accuracy": 0.769890546798706,
"num_tokens": 1188846.0,
"step": 73
},
{
"entropy": 0.5802080780267715,
"epoch": 0.2766355140186916,
"grad_norm": 0.03692556545138359,
"learning_rate": 0.0002,
"loss": 0.5813108682632446,
"mean_token_accuracy": 0.7652633637189865,
"num_tokens": 1205115.0,
"step": 74
},
{
"entropy": 0.5709390044212341,
"epoch": 0.2803738317757009,
"grad_norm": 0.03715148940682411,
"learning_rate": 0.0002,
"loss": 0.5739152431488037,
"mean_token_accuracy": 0.7695163637399673,
"num_tokens": 1221457.0,
"step": 75
},
{
"entropy": 0.5634023249149323,
"epoch": 0.2841121495327103,
"grad_norm": 0.035052694380283356,
"learning_rate": 0.0002,
"loss": 0.5634779334068298,
"mean_token_accuracy": 0.7735425382852554,
"num_tokens": 1237852.0,
"step": 76
},
{
"entropy": 0.5770431756973267,
"epoch": 0.28785046728971964,
"grad_norm": 0.04037750884890556,
"learning_rate": 0.0002,
"loss": 0.5792219042778015,
"mean_token_accuracy": 0.7656148821115494,
"num_tokens": 1253991.0,
"step": 77
},
{
"entropy": 0.5483120232820511,
"epoch": 0.29158878504672897,
"grad_norm": 0.04199967905879021,
"learning_rate": 0.0002,
"loss": 0.5473575592041016,
"mean_token_accuracy": 0.7797968685626984,
"num_tokens": 1270154.0,
"step": 78
},
{
"entropy": 0.5623519718647003,
"epoch": 0.2953271028037383,
"grad_norm": 0.04001434147357941,
"learning_rate": 0.0002,
"loss": 0.5669924020767212,
"mean_token_accuracy": 0.7740958780050278,
"num_tokens": 1286373.0,
"step": 79
},
{
"entropy": 0.5505794137716293,
"epoch": 0.29906542056074764,
"grad_norm": 0.039846453815698624,
"learning_rate": 0.0002,
"loss": 0.5637381076812744,
"mean_token_accuracy": 0.7710813283920288,
"num_tokens": 1302910.0,
"step": 80
},
{
"entropy": 0.573449894785881,
"epoch": 0.30280373831775703,
"grad_norm": 0.03970034047961235,
"learning_rate": 0.0002,
"loss": 0.5817972421646118,
"mean_token_accuracy": 0.767284482717514,
"num_tokens": 1319105.0,
"step": 81
},
{
"entropy": 0.5815064907073975,
"epoch": 0.30654205607476637,
"grad_norm": 0.036917295306921005,
"learning_rate": 0.0002,
"loss": 0.5764390826225281,
"mean_token_accuracy": 0.7660059034824371,
"num_tokens": 1335418.0,
"step": 82
},
{
"entropy": 0.5537111163139343,
"epoch": 0.3102803738317757,
"grad_norm": 0.038016658276319504,
"learning_rate": 0.0002,
"loss": 0.544030487537384,
"mean_token_accuracy": 0.780098170042038,
"num_tokens": 1351471.0,
"step": 83
},
{
"entropy": 0.5532083511352539,
"epoch": 0.31401869158878504,
"grad_norm": 0.03766188770532608,
"learning_rate": 0.0002,
"loss": 0.543038010597229,
"mean_token_accuracy": 0.7815051227807999,
"num_tokens": 1367729.0,
"step": 84
},
{
"entropy": 0.569915771484375,
"epoch": 0.3177570093457944,
"grad_norm": 0.03935057669878006,
"learning_rate": 0.0002,
"loss": 0.5673943758010864,
"mean_token_accuracy": 0.7705481499433517,
"num_tokens": 1384218.0,
"step": 85
},
{
"entropy": 0.5557460188865662,
"epoch": 0.32149532710280376,
"grad_norm": 0.0382615365087986,
"learning_rate": 0.0002,
"loss": 0.5650104284286499,
"mean_token_accuracy": 0.7701956182718277,
"num_tokens": 1400496.0,
"step": 86
},
{
"entropy": 0.5529367923736572,
"epoch": 0.3252336448598131,
"grad_norm": 0.03607897832989693,
"learning_rate": 0.0002,
"loss": 0.5612208843231201,
"mean_token_accuracy": 0.773573562502861,
"num_tokens": 1416728.0,
"step": 87
},
{
"entropy": 0.5617222934961319,
"epoch": 0.32897196261682243,
"grad_norm": 0.0373239666223526,
"learning_rate": 0.0002,
"loss": 0.5661642551422119,
"mean_token_accuracy": 0.7711510807275772,
"num_tokens": 1433091.0,
"step": 88
},
{
"entropy": 0.55742546916008,
"epoch": 0.33271028037383177,
"grad_norm": 0.03938078507781029,
"learning_rate": 0.0002,
"loss": 0.5600550770759583,
"mean_token_accuracy": 0.7730235010385513,
"num_tokens": 1449246.0,
"step": 89
},
{
"entropy": 0.5685389190912247,
"epoch": 0.3364485981308411,
"grad_norm": 0.040714140981435776,
"learning_rate": 0.0002,
"loss": 0.5676398873329163,
"mean_token_accuracy": 0.7700921297073364,
"num_tokens": 1465805.0,
"step": 90
},
{
"entropy": 0.5774114727973938,
"epoch": 0.3401869158878505,
"grad_norm": 0.03398137167096138,
"learning_rate": 0.0002,
"loss": 0.5775306224822998,
"mean_token_accuracy": 0.7659128755331039,
"num_tokens": 1482298.0,
"step": 91
},
{
"entropy": 0.5467455387115479,
"epoch": 0.34392523364485983,
"grad_norm": 0.032925065606832504,
"learning_rate": 0.0002,
"loss": 0.5481046438217163,
"mean_token_accuracy": 0.7773325145244598,
"num_tokens": 1498536.0,
"step": 92
},
{
"entropy": 0.5445878356695175,
"epoch": 0.34766355140186916,
"grad_norm": 0.03473861888051033,
"learning_rate": 0.0002,
"loss": 0.5424526929855347,
"mean_token_accuracy": 0.7816839218139648,
"num_tokens": 1514823.0,
"step": 93
},
{
"entropy": 0.5637122839689255,
"epoch": 0.3514018691588785,
"grad_norm": 0.03804982081055641,
"learning_rate": 0.0002,
"loss": 0.5646781325340271,
"mean_token_accuracy": 0.7692969292402267,
"num_tokens": 1531148.0,
"step": 94
},
{
"entropy": 0.5571535974740982,
"epoch": 0.35514018691588783,
"grad_norm": 0.03457267954945564,
"learning_rate": 0.0002,
"loss": 0.5619444251060486,
"mean_token_accuracy": 0.7773198187351227,
"num_tokens": 1547476.0,
"step": 95
},
{
"entropy": 0.5707617700099945,
"epoch": 0.35887850467289717,
"grad_norm": 0.03933979198336601,
"learning_rate": 0.0002,
"loss": 0.572324275970459,
"mean_token_accuracy": 0.7692963778972626,
"num_tokens": 1563979.0,
"step": 96
},
{
"entropy": 0.556370198726654,
"epoch": 0.36261682242990656,
"grad_norm": 0.03271894529461861,
"learning_rate": 0.0002,
"loss": 0.5558284521102905,
"mean_token_accuracy": 0.7744213789701462,
"num_tokens": 1580311.0,
"step": 97
},
{
"entropy": 0.5528354942798615,
"epoch": 0.3663551401869159,
"grad_norm": 0.03302107751369476,
"learning_rate": 0.0002,
"loss": 0.5553282499313354,
"mean_token_accuracy": 0.77690689265728,
"num_tokens": 1596402.0,
"step": 98
},
{
"entropy": 0.5531659126281738,
"epoch": 0.37009345794392523,
"grad_norm": 0.03468908742070198,
"learning_rate": 0.0002,
"loss": 0.5576953887939453,
"mean_token_accuracy": 0.7762762904167175,
"num_tokens": 1612430.0,
"step": 99
},
{
"entropy": 0.5810890346765518,
"epoch": 0.37383177570093457,
"grad_norm": 0.03342665359377861,
"learning_rate": 0.0002,
"loss": 0.5769139528274536,
"mean_token_accuracy": 0.7672095000743866,
"num_tokens": 1628891.0,
"step": 100
},
{
"entropy": 0.5750298053026199,
"epoch": 0.3775700934579439,
"grad_norm": 0.03441772237420082,
"learning_rate": 0.0002,
"loss": 0.5772010087966919,
"mean_token_accuracy": 0.7646144926548004,
"num_tokens": 1645047.0,
"step": 101
},
{
"entropy": 0.5650183409452438,
"epoch": 0.3813084112149533,
"grad_norm": 0.03096170350909233,
"learning_rate": 0.0002,
"loss": 0.5606149435043335,
"mean_token_accuracy": 0.7738576829433441,
"num_tokens": 1661380.0,
"step": 102
},
{
"entropy": 0.5494536608457565,
"epoch": 0.3850467289719626,
"grad_norm": 0.03677360713481903,
"learning_rate": 0.0002,
"loss": 0.5568496584892273,
"mean_token_accuracy": 0.775225818157196,
"num_tokens": 1677541.0,
"step": 103
},
{
"entropy": 0.5550926774740219,
"epoch": 0.38878504672897196,
"grad_norm": 0.03032948076725006,
"learning_rate": 0.0002,
"loss": 0.558656632900238,
"mean_token_accuracy": 0.7753722071647644,
"num_tokens": 1693849.0,
"step": 104
},
{
"entropy": 0.5538856834173203,
"epoch": 0.3925233644859813,
"grad_norm": 0.033197011798620224,
"learning_rate": 0.0002,
"loss": 0.5585562586784363,
"mean_token_accuracy": 0.7750265747308731,
"num_tokens": 1710410.0,
"step": 105
},
{
"entropy": 0.557091012597084,
"epoch": 0.39626168224299063,
"grad_norm": 0.03343191742897034,
"learning_rate": 0.0002,
"loss": 0.5658184885978699,
"mean_token_accuracy": 0.7713737785816193,
"num_tokens": 1726519.0,
"step": 106
},
{
"entropy": 0.573070839047432,
"epoch": 0.4,
"grad_norm": 0.03520960360765457,
"learning_rate": 0.0002,
"loss": 0.5683936476707458,
"mean_token_accuracy": 0.7706228792667389,
"num_tokens": 1742802.0,
"step": 107
},
{
"entropy": 0.5730053037405014,
"epoch": 0.40373831775700936,
"grad_norm": 0.032127268612384796,
"learning_rate": 0.0002,
"loss": 0.5697438716888428,
"mean_token_accuracy": 0.7664725631475449,
"num_tokens": 1759059.0,
"step": 108
},
{
"entropy": 0.5633453279733658,
"epoch": 0.4074766355140187,
"grad_norm": 0.03088793158531189,
"learning_rate": 0.0002,
"loss": 0.5599843263626099,
"mean_token_accuracy": 0.7760611772537231,
"num_tokens": 1775536.0,
"step": 109
},
{
"entropy": 0.550876572728157,
"epoch": 0.411214953271028,
"grad_norm": 0.032173894345760345,
"learning_rate": 0.0002,
"loss": 0.552717387676239,
"mean_token_accuracy": 0.7752785235643387,
"num_tokens": 1792019.0,
"step": 110
},
{
"entropy": 0.5721830427646637,
"epoch": 0.41495327102803736,
"grad_norm": 0.033584315329790115,
"learning_rate": 0.0002,
"loss": 0.5759853720664978,
"mean_token_accuracy": 0.7664880454540253,
"num_tokens": 1808419.0,
"step": 111
},
{
"entropy": 0.5759546905755997,
"epoch": 0.41869158878504675,
"grad_norm": 0.03846940025687218,
"learning_rate": 0.0002,
"loss": 0.5841522216796875,
"mean_token_accuracy": 0.7626957893371582,
"num_tokens": 1824543.0,
"step": 112
},
{
"entropy": 0.5635320693254471,
"epoch": 0.4224299065420561,
"grad_norm": 0.03328083083033562,
"learning_rate": 0.0002,
"loss": 0.5629671812057495,
"mean_token_accuracy": 0.7737283408641815,
"num_tokens": 1840757.0,
"step": 113
},
{
"entropy": 0.5591580420732498,
"epoch": 0.4261682242990654,
"grad_norm": 0.0327068492770195,
"learning_rate": 0.0002,
"loss": 0.551184356212616,
"mean_token_accuracy": 0.7753513604402542,
"num_tokens": 1857132.0,
"step": 114
},
{
"entropy": 0.5579714924097061,
"epoch": 0.42990654205607476,
"grad_norm": 0.0334380678832531,
"learning_rate": 0.0002,
"loss": 0.5555400252342224,
"mean_token_accuracy": 0.7759147882461548,
"num_tokens": 1873360.0,
"step": 115
},
{
"entropy": 0.5697025954723358,
"epoch": 0.4336448598130841,
"grad_norm": 0.03651506081223488,
"learning_rate": 0.0002,
"loss": 0.568575382232666,
"mean_token_accuracy": 0.7692690938711166,
"num_tokens": 1889933.0,
"step": 116
},
{
"entropy": 0.5710670948028564,
"epoch": 0.4373831775700935,
"grad_norm": 0.03260137885808945,
"learning_rate": 0.0002,
"loss": 0.5754102468490601,
"mean_token_accuracy": 0.7645916491746902,
"num_tokens": 1906415.0,
"step": 117
},
{
"entropy": 0.5612241625785828,
"epoch": 0.4411214953271028,
"grad_norm": 0.030186068266630173,
"learning_rate": 0.0002,
"loss": 0.5625964403152466,
"mean_token_accuracy": 0.7733658254146576,
"num_tokens": 1922692.0,
"step": 118
},
{
"entropy": 0.5558670610189438,
"epoch": 0.44485981308411215,
"grad_norm": 0.0367811918258667,
"learning_rate": 0.0002,
"loss": 0.5577695965766907,
"mean_token_accuracy": 0.772549107670784,
"num_tokens": 1939001.0,
"step": 119
},
{
"entropy": 0.5691811889410019,
"epoch": 0.4485981308411215,
"grad_norm": 0.03843454644083977,
"learning_rate": 0.0002,
"loss": 0.5703588128089905,
"mean_token_accuracy": 0.7689766734838486,
"num_tokens": 1955537.0,
"step": 120
},
{
"entropy": 0.5652327984571457,
"epoch": 0.4523364485981308,
"grad_norm": 0.032110750675201416,
"learning_rate": 0.0002,
"loss": 0.5627662539482117,
"mean_token_accuracy": 0.7731665819883347,
"num_tokens": 1971820.0,
"step": 121
},
{
"entropy": 0.5414326637983322,
"epoch": 0.45607476635514016,
"grad_norm": 0.031934358179569244,
"learning_rate": 0.0002,
"loss": 0.5432534217834473,
"mean_token_accuracy": 0.7791064232587814,
"num_tokens": 1988118.0,
"step": 122
},
{
"entropy": 0.5502553433179855,
"epoch": 0.45981308411214955,
"grad_norm": 0.035253144800662994,
"learning_rate": 0.0002,
"loss": 0.5521403551101685,
"mean_token_accuracy": 0.7760459184646606,
"num_tokens": 2004642.0,
"step": 123
},
{
"entropy": 0.5582242012023926,
"epoch": 0.4635514018691589,
"grad_norm": 0.035558655858039856,
"learning_rate": 0.0002,
"loss": 0.5682451725006104,
"mean_token_accuracy": 0.7699540108442307,
"num_tokens": 2020965.0,
"step": 124
},
{
"entropy": 0.5626089125871658,
"epoch": 0.4672897196261682,
"grad_norm": 0.028148163110017776,
"learning_rate": 0.0002,
"loss": 0.5638797283172607,
"mean_token_accuracy": 0.7697459608316422,
"num_tokens": 2037202.0,
"step": 125
},
{
"entropy": 0.5653271377086639,
"epoch": 0.47102803738317756,
"grad_norm": 0.03597045689821243,
"learning_rate": 0.0002,
"loss": 0.5635451078414917,
"mean_token_accuracy": 0.7696232795715332,
"num_tokens": 2053309.0,
"step": 126
},
{
"entropy": 0.560562789440155,
"epoch": 0.4747663551401869,
"grad_norm": 0.03047817200422287,
"learning_rate": 0.0002,
"loss": 0.5625080466270447,
"mean_token_accuracy": 0.7718035280704498,
"num_tokens": 2069535.0,
"step": 127
},
{
"entropy": 0.5554249584674835,
"epoch": 0.4785046728971963,
"grad_norm": 0.028741145506501198,
"learning_rate": 0.0002,
"loss": 0.5504335165023804,
"mean_token_accuracy": 0.7771810442209244,
"num_tokens": 2085763.0,
"step": 128
},
{
"entropy": 0.5567069947719574,
"epoch": 0.4822429906542056,
"grad_norm": 0.031639862805604935,
"learning_rate": 0.0002,
"loss": 0.5562032461166382,
"mean_token_accuracy": 0.7760691046714783,
"num_tokens": 2102046.0,
"step": 129
},
{
"entropy": 0.5418022572994232,
"epoch": 0.48598130841121495,
"grad_norm": 0.03434485197067261,
"learning_rate": 0.0002,
"loss": 0.5446175932884216,
"mean_token_accuracy": 0.7789350152015686,
"num_tokens": 2118239.0,
"step": 130
},
{
"entropy": 0.5367967188358307,
"epoch": 0.4897196261682243,
"grad_norm": 0.03757743164896965,
"learning_rate": 0.0002,
"loss": 0.5414644479751587,
"mean_token_accuracy": 0.7816939055919647,
"num_tokens": 2134627.0,
"step": 131
},
{
"entropy": 0.5399434715509415,
"epoch": 0.4934579439252336,
"grad_norm": 0.03444533050060272,
"learning_rate": 0.0002,
"loss": 0.5489372611045837,
"mean_token_accuracy": 0.7746081054210663,
"num_tokens": 2150944.0,
"step": 132
},
{
"entropy": 0.5634311139583588,
"epoch": 0.497196261682243,
"grad_norm": 0.028091201558709145,
"learning_rate": 0.0002,
"loss": 0.5653581619262695,
"mean_token_accuracy": 0.7713855057954788,
"num_tokens": 2167218.0,
"step": 133
},
{
"entropy": 0.5568374693393707,
"epoch": 0.5009345794392523,
"grad_norm": 0.029833409935235977,
"learning_rate": 0.0002,
"loss": 0.5585245490074158,
"mean_token_accuracy": 0.7745143622159958,
"num_tokens": 2183449.0,
"step": 134
},
{
"entropy": 0.5839870423078537,
"epoch": 0.5046728971962616,
"grad_norm": 0.03770853579044342,
"learning_rate": 0.0002,
"loss": 0.5719978213310242,
"mean_token_accuracy": 0.7675238102674484,
"num_tokens": 2199875.0,
"step": 135
},
{
"entropy": 0.5689375847578049,
"epoch": 0.508411214953271,
"grad_norm": 0.03635553643107414,
"learning_rate": 0.0002,
"loss": 0.5626992583274841,
"mean_token_accuracy": 0.7723798751831055,
"num_tokens": 2216163.0,
"step": 136
},
{
"entropy": 0.5507294833660126,
"epoch": 0.5121495327102804,
"grad_norm": 0.03596559911966324,
"learning_rate": 0.0002,
"loss": 0.5608267188072205,
"mean_token_accuracy": 0.7710549086332321,
"num_tokens": 2232636.0,
"step": 137
},
{
"entropy": 0.5623424351215363,
"epoch": 0.5158878504672897,
"grad_norm": 0.033818867057561874,
"learning_rate": 0.0002,
"loss": 0.5718593597412109,
"mean_token_accuracy": 0.7696182578802109,
"num_tokens": 2248825.0,
"step": 138
},
{
"entropy": 0.5675409585237503,
"epoch": 0.5196261682242991,
"grad_norm": 0.03331133350729942,
"learning_rate": 0.0002,
"loss": 0.5714356899261475,
"mean_token_accuracy": 0.7693182229995728,
"num_tokens": 2265359.0,
"step": 139
},
{
"entropy": 0.5522013902664185,
"epoch": 0.5233644859813084,
"grad_norm": 0.03208749741315842,
"learning_rate": 0.0002,
"loss": 0.5529259443283081,
"mean_token_accuracy": 0.7765516042709351,
"num_tokens": 2281629.0,
"step": 140
},
{
"entropy": 0.5493837893009186,
"epoch": 0.5271028037383177,
"grad_norm": 0.0305814016610384,
"learning_rate": 0.0002,
"loss": 0.5490883588790894,
"mean_token_accuracy": 0.7763204425573349,
"num_tokens": 2297908.0,
"step": 141
},
{
"entropy": 0.5564678907394409,
"epoch": 0.5308411214953271,
"grad_norm": 0.034225739538669586,
"learning_rate": 0.0002,
"loss": 0.5602461099624634,
"mean_token_accuracy": 0.7709554880857468,
"num_tokens": 2314115.0,
"step": 142
},
{
"entropy": 0.5697164833545685,
"epoch": 0.5345794392523364,
"grad_norm": 0.03395864740014076,
"learning_rate": 0.0002,
"loss": 0.5692602396011353,
"mean_token_accuracy": 0.766906350851059,
"num_tokens": 2330462.0,
"step": 143
},
{
"entropy": 0.5691278576850891,
"epoch": 0.5383177570093458,
"grad_norm": 0.03194013983011246,
"learning_rate": 0.0002,
"loss": 0.562545657157898,
"mean_token_accuracy": 0.7723768651485443,
"num_tokens": 2346630.0,
"step": 144
},
{
"entropy": 0.558807983994484,
"epoch": 0.5420560747663551,
"grad_norm": 0.036789294332265854,
"learning_rate": 0.0002,
"loss": 0.5632457733154297,
"mean_token_accuracy": 0.772635355591774,
"num_tokens": 2362732.0,
"step": 145
},
{
"entropy": 0.5582777112722397,
"epoch": 0.5457943925233645,
"grad_norm": 0.02997492626309395,
"learning_rate": 0.0002,
"loss": 0.5614091753959656,
"mean_token_accuracy": 0.7702963054180145,
"num_tokens": 2379199.0,
"step": 146
},
{
"entropy": 0.5584180504083633,
"epoch": 0.5495327102803739,
"grad_norm": 0.033580392599105835,
"learning_rate": 0.0002,
"loss": 0.5605478286743164,
"mean_token_accuracy": 0.7730905264616013,
"num_tokens": 2395497.0,
"step": 147
},
{
"entropy": 0.5477179437875748,
"epoch": 0.5532710280373832,
"grad_norm": 0.03941367194056511,
"learning_rate": 0.0002,
"loss": 0.5504173636436462,
"mean_token_accuracy": 0.77938412129879,
"num_tokens": 2411648.0,
"step": 148
},
{
"entropy": 0.5601572096347809,
"epoch": 0.5570093457943925,
"grad_norm": 0.030582338571548462,
"learning_rate": 0.0002,
"loss": 0.5634943246841431,
"mean_token_accuracy": 0.7728341221809387,
"num_tokens": 2427925.0,
"step": 149
},
{
"entropy": 0.5869706571102142,
"epoch": 0.5607476635514018,
"grad_norm": 0.036973923444747925,
"learning_rate": 0.0002,
"loss": 0.5785589218139648,
"mean_token_accuracy": 0.765045240521431,
"num_tokens": 2444416.0,
"step": 150
},
{
"entropy": 0.5624907165765762,
"epoch": 0.5644859813084112,
"grad_norm": 0.036355964839458466,
"learning_rate": 0.0002,
"loss": 0.5561196208000183,
"mean_token_accuracy": 0.7752401679754257,
"num_tokens": 2460808.0,
"step": 151
},
{
"entropy": 0.5570034384727478,
"epoch": 0.5682242990654206,
"grad_norm": 0.027923110872507095,
"learning_rate": 0.0002,
"loss": 0.5550441145896912,
"mean_token_accuracy": 0.7757884711027145,
"num_tokens": 2477437.0,
"step": 152
},
{
"entropy": 0.5643865615129471,
"epoch": 0.5719626168224299,
"grad_norm": 0.0321192592382431,
"learning_rate": 0.0002,
"loss": 0.5707546472549438,
"mean_token_accuracy": 0.7692134529352188,
"num_tokens": 2493966.0,
"step": 153
},
{
"entropy": 0.5535547733306885,
"epoch": 0.5757009345794393,
"grad_norm": 0.03465733677148819,
"learning_rate": 0.0002,
"loss": 0.5610126256942749,
"mean_token_accuracy": 0.7733882069587708,
"num_tokens": 2510442.0,
"step": 154
},
{
"entropy": 0.5411207228899002,
"epoch": 0.5794392523364486,
"grad_norm": 0.03268473595380783,
"learning_rate": 0.0002,
"loss": 0.5444988012313843,
"mean_token_accuracy": 0.7791947424411774,
"num_tokens": 2526738.0,
"step": 155
},
{
"entropy": 0.5539679378271103,
"epoch": 0.5831775700934579,
"grad_norm": 0.03345946595072746,
"learning_rate": 0.0002,
"loss": 0.5571167469024658,
"mean_token_accuracy": 0.7733618319034576,
"num_tokens": 2543004.0,
"step": 156
},
{
"entropy": 0.547135517001152,
"epoch": 0.5869158878504673,
"grad_norm": 0.03414901718497276,
"learning_rate": 0.0002,
"loss": 0.5551236867904663,
"mean_token_accuracy": 0.7734578996896744,
"num_tokens": 2559150.0,
"step": 157
},
{
"entropy": 0.5595978051424026,
"epoch": 0.5906542056074766,
"grad_norm": 0.03502917289733887,
"learning_rate": 0.0002,
"loss": 0.5722506046295166,
"mean_token_accuracy": 0.7680937796831131,
"num_tokens": 2575360.0,
"step": 158
},
{
"entropy": 0.56221604347229,
"epoch": 0.594392523364486,
"grad_norm": 0.036693476140499115,
"learning_rate": 0.0002,
"loss": 0.5663124918937683,
"mean_token_accuracy": 0.7699347287416458,
"num_tokens": 2591749.0,
"step": 159
},
{
"entropy": 0.5489411354064941,
"epoch": 0.5981308411214953,
"grad_norm": 0.029823357239365578,
"learning_rate": 0.0002,
"loss": 0.5525665879249573,
"mean_token_accuracy": 0.7778102308511734,
"num_tokens": 2608011.0,
"step": 160
},
{
"entropy": 0.5679098963737488,
"epoch": 0.6018691588785047,
"grad_norm": 0.03129269927740097,
"learning_rate": 0.0002,
"loss": 0.5632325410842896,
"mean_token_accuracy": 0.7711086720228195,
"num_tokens": 2624110.0,
"step": 161
},
{
"entropy": 0.5759385228157043,
"epoch": 0.6056074766355141,
"grad_norm": 0.03027232177555561,
"learning_rate": 0.0002,
"loss": 0.566430926322937,
"mean_token_accuracy": 0.7684105038642883,
"num_tokens": 2640619.0,
"step": 162
},
{
"entropy": 0.5755711048841476,
"epoch": 0.6093457943925233,
"grad_norm": 0.02997921220958233,
"learning_rate": 0.0002,
"loss": 0.5693614482879639,
"mean_token_accuracy": 0.7678638249635696,
"num_tokens": 2656816.0,
"step": 163
},
{
"entropy": 0.5675656646490097,
"epoch": 0.6130841121495327,
"grad_norm": 0.02925792895257473,
"learning_rate": 0.0002,
"loss": 0.5620183348655701,
"mean_token_accuracy": 0.7710973769426346,
"num_tokens": 2673238.0,
"step": 164
},
{
"entropy": 0.5436252653598785,
"epoch": 0.616822429906542,
"grad_norm": 0.030324436724185944,
"learning_rate": 0.0002,
"loss": 0.5462444424629211,
"mean_token_accuracy": 0.779330775141716,
"num_tokens": 2689740.0,
"step": 165
},
{
"entropy": 0.5572406202554703,
"epoch": 0.6205607476635514,
"grad_norm": 0.03400828689336777,
"learning_rate": 0.0002,
"loss": 0.5641958713531494,
"mean_token_accuracy": 0.7692032605409622,
"num_tokens": 2706162.0,
"step": 166
},
{
"entropy": 0.554596871137619,
"epoch": 0.6242990654205608,
"grad_norm": 0.03054538182914257,
"learning_rate": 0.0002,
"loss": 0.556669294834137,
"mean_token_accuracy": 0.7765887379646301,
"num_tokens": 2722464.0,
"step": 167
},
{
"entropy": 0.5644665658473969,
"epoch": 0.6280373831775701,
"grad_norm": 0.03194966912269592,
"learning_rate": 0.0002,
"loss": 0.5671694278717041,
"mean_token_accuracy": 0.7694765031337738,
"num_tokens": 2738958.0,
"step": 168
},
{
"entropy": 0.5491771847009659,
"epoch": 0.6317757009345795,
"grad_norm": 0.03178941458463669,
"learning_rate": 0.0002,
"loss": 0.5497304797172546,
"mean_token_accuracy": 0.7750105261802673,
"num_tokens": 2755355.0,
"step": 169
},
{
"entropy": 0.5742185562849045,
"epoch": 0.6355140186915887,
"grad_norm": 0.027454091235995293,
"learning_rate": 0.0002,
"loss": 0.5754401683807373,
"mean_token_accuracy": 0.7658552527427673,
"num_tokens": 2771556.0,
"step": 170
},
{
"entropy": 0.5589788407087326,
"epoch": 0.6392523364485981,
"grad_norm": 0.029149651527404785,
"learning_rate": 0.0002,
"loss": 0.554992139339447,
"mean_token_accuracy": 0.7758396863937378,
"num_tokens": 2787760.0,
"step": 171
},
{
"entropy": 0.5677189081907272,
"epoch": 0.6429906542056075,
"grad_norm": 0.03037264011800289,
"learning_rate": 0.0002,
"loss": 0.5637961626052856,
"mean_token_accuracy": 0.7705356478691101,
"num_tokens": 2803802.0,
"step": 172
},
{
"entropy": 0.5565283447504044,
"epoch": 0.6467289719626168,
"grad_norm": 0.03331301361322403,
"learning_rate": 0.0002,
"loss": 0.5568943023681641,
"mean_token_accuracy": 0.77414271235466,
"num_tokens": 2820371.0,
"step": 173
},
{
"entropy": 0.5312813073396683,
"epoch": 0.6504672897196262,
"grad_norm": 0.03152315691113472,
"learning_rate": 0.0002,
"loss": 0.5355879664421082,
"mean_token_accuracy": 0.785700336098671,
"num_tokens": 2836694.0,
"step": 174
},
{
"entropy": 0.5379063338041306,
"epoch": 0.6542056074766355,
"grad_norm": 0.037841469049453735,
"learning_rate": 0.0002,
"loss": 0.5525423288345337,
"mean_token_accuracy": 0.7756439745426178,
"num_tokens": 2852864.0,
"step": 175
},
{
"entropy": 0.5613906681537628,
"epoch": 0.6579439252336449,
"grad_norm": 0.035853054374456406,
"learning_rate": 0.0002,
"loss": 0.5655968189239502,
"mean_token_accuracy": 0.7716417163610458,
"num_tokens": 2869313.0,
"step": 176
},
{
"entropy": 0.5639201551675797,
"epoch": 0.6616822429906543,
"grad_norm": 0.026397736743092537,
"learning_rate": 0.0002,
"loss": 0.5627295970916748,
"mean_token_accuracy": 0.7704634070396423,
"num_tokens": 2885495.0,
"step": 177
},
{
"entropy": 0.5702281445264816,
"epoch": 0.6654205607476635,
"grad_norm": 0.03206147998571396,
"learning_rate": 0.0002,
"loss": 0.5647550821304321,
"mean_token_accuracy": 0.7702795714139938,
"num_tokens": 2901765.0,
"step": 178
},
{
"entropy": 0.5528819262981415,
"epoch": 0.6691588785046729,
"grad_norm": 0.03629858419299126,
"learning_rate": 0.0002,
"loss": 0.5473611950874329,
"mean_token_accuracy": 0.7778798639774323,
"num_tokens": 2918124.0,
"step": 179
},
{
"entropy": 0.5617557764053345,
"epoch": 0.6728971962616822,
"grad_norm": 0.03116736188530922,
"learning_rate": 0.0002,
"loss": 0.5709046721458435,
"mean_token_accuracy": 0.7677187621593475,
"num_tokens": 2934418.0,
"step": 180
},
{
"entropy": 0.544835090637207,
"epoch": 0.6766355140186916,
"grad_norm": 0.03548549860715866,
"learning_rate": 0.0002,
"loss": 0.5551706552505493,
"mean_token_accuracy": 0.7762557417154312,
"num_tokens": 2951100.0,
"step": 181
},
{
"entropy": 0.5660403668880463,
"epoch": 0.680373831775701,
"grad_norm": 0.03100365214049816,
"learning_rate": 0.0002,
"loss": 0.5729965567588806,
"mean_token_accuracy": 0.7690318375825882,
"num_tokens": 2967440.0,
"step": 182
},
{
"entropy": 0.5780525356531143,
"epoch": 0.6841121495327103,
"grad_norm": 0.03490225970745087,
"learning_rate": 0.0002,
"loss": 0.5734487771987915,
"mean_token_accuracy": 0.7699766159057617,
"num_tokens": 2983954.0,
"step": 183
},
{
"entropy": 0.5722559094429016,
"epoch": 0.6878504672897197,
"grad_norm": 0.031209329143166542,
"learning_rate": 0.0002,
"loss": 0.5663836002349854,
"mean_token_accuracy": 0.7720828950405121,
"num_tokens": 3000256.0,
"step": 184
},
{
"entropy": 0.5506948530673981,
"epoch": 0.6915887850467289,
"grad_norm": 0.029818221926689148,
"learning_rate": 0.0002,
"loss": 0.5445064306259155,
"mean_token_accuracy": 0.7804610878229141,
"num_tokens": 3016740.0,
"step": 185
},
{
"entropy": 0.5661566108465195,
"epoch": 0.6953271028037383,
"grad_norm": 0.03627892956137657,
"learning_rate": 0.0002,
"loss": 0.5731881260871887,
"mean_token_accuracy": 0.7681418061256409,
"num_tokens": 3033200.0,
"step": 186
},
{
"entropy": 0.5561655461788177,
"epoch": 0.6990654205607477,
"grad_norm": 0.028912672773003578,
"learning_rate": 0.0002,
"loss": 0.559117317199707,
"mean_token_accuracy": 0.7737248986959457,
"num_tokens": 3049728.0,
"step": 187
},
{
"entropy": 0.5450099408626556,
"epoch": 0.702803738317757,
"grad_norm": 0.03303583338856697,
"learning_rate": 0.0002,
"loss": 0.5467768907546997,
"mean_token_accuracy": 0.7775131165981293,
"num_tokens": 3066007.0,
"step": 188
},
{
"entropy": 0.5617918968200684,
"epoch": 0.7065420560747664,
"grad_norm": 0.035768017172813416,
"learning_rate": 0.0002,
"loss": 0.563019871711731,
"mean_token_accuracy": 0.770862489938736,
"num_tokens": 3082324.0,
"step": 189
},
{
"entropy": 0.5339331775903702,
"epoch": 0.7102803738317757,
"grad_norm": 0.031208420172333717,
"learning_rate": 0.0002,
"loss": 0.547924280166626,
"mean_token_accuracy": 0.7771021723747253,
"num_tokens": 3098546.0,
"step": 190
},
{
"entropy": 0.5686406493186951,
"epoch": 0.7140186915887851,
"grad_norm": 0.028388923034071922,
"learning_rate": 0.0002,
"loss": 0.5657324194908142,
"mean_token_accuracy": 0.772287517786026,
"num_tokens": 3114868.0,
"step": 191
},
{
"entropy": 0.5583553314208984,
"epoch": 0.7177570093457943,
"grad_norm": 0.027447570115327835,
"learning_rate": 0.0002,
"loss": 0.5535703897476196,
"mean_token_accuracy": 0.7759178727865219,
"num_tokens": 3131210.0,
"step": 192
},
{
"entropy": 0.5578874051570892,
"epoch": 0.7214953271028037,
"grad_norm": 0.033130839467048645,
"learning_rate": 0.0002,
"loss": 0.5513507723808289,
"mean_token_accuracy": 0.7747978419065475,
"num_tokens": 3147445.0,
"step": 193
},
{
"entropy": 0.5491522252559662,
"epoch": 0.7252336448598131,
"grad_norm": 0.030513031408190727,
"learning_rate": 0.0002,
"loss": 0.5503372550010681,
"mean_token_accuracy": 0.7780584990978241,
"num_tokens": 3163723.0,
"step": 194
},
{
"entropy": 0.5677588433027267,
"epoch": 0.7289719626168224,
"grad_norm": 0.030064091086387634,
"learning_rate": 0.0002,
"loss": 0.5684211850166321,
"mean_token_accuracy": 0.7694611251354218,
"num_tokens": 3180127.0,
"step": 195
},
{
"entropy": 0.5523021966218948,
"epoch": 0.7327102803738318,
"grad_norm": 0.028454501181840897,
"learning_rate": 0.0002,
"loss": 0.5564773082733154,
"mean_token_accuracy": 0.7736252546310425,
"num_tokens": 3196384.0,
"step": 196
},
{
"entropy": 0.5594403147697449,
"epoch": 0.7364485981308411,
"grad_norm": 0.031159594655036926,
"learning_rate": 0.0002,
"loss": 0.5678831934928894,
"mean_token_accuracy": 0.7687141001224518,
"num_tokens": 3212579.0,
"step": 197
},
{
"entropy": 0.5670231431722641,
"epoch": 0.7401869158878505,
"grad_norm": 0.026576390489935875,
"learning_rate": 0.0002,
"loss": 0.5695415735244751,
"mean_token_accuracy": 0.7709443867206573,
"num_tokens": 3229005.0,
"step": 198
},
{
"entropy": 0.5550480484962463,
"epoch": 0.7439252336448599,
"grad_norm": 0.030606523156166077,
"learning_rate": 0.0002,
"loss": 0.5502464771270752,
"mean_token_accuracy": 0.7791616022586823,
"num_tokens": 3245287.0,
"step": 199
},
{
"entropy": 0.5619281828403473,
"epoch": 0.7476635514018691,
"grad_norm": 0.030474133789539337,
"learning_rate": 0.0002,
"loss": 0.5586714148521423,
"mean_token_accuracy": 0.7734764218330383,
"num_tokens": 3261691.0,
"step": 200
},
{
"entropy": 0.5405223369598389,
"epoch": 0.7514018691588785,
"grad_norm": 0.032003577798604965,
"learning_rate": 0.0002,
"loss": 0.5496760010719299,
"mean_token_accuracy": 0.7761346995830536,
"num_tokens": 3277743.0,
"step": 201
},
{
"entropy": 0.5539799779653549,
"epoch": 0.7551401869158878,
"grad_norm": 0.026676569133996964,
"learning_rate": 0.0002,
"loss": 0.5552941560745239,
"mean_token_accuracy": 0.7729017436504364,
"num_tokens": 3293921.0,
"step": 202
},
{
"entropy": 0.5504231303930283,
"epoch": 0.7588785046728972,
"grad_norm": 0.02650677040219307,
"learning_rate": 0.0002,
"loss": 0.5463041663169861,
"mean_token_accuracy": 0.7773067653179169,
"num_tokens": 3310038.0,
"step": 203
},
{
"entropy": 0.5567349493503571,
"epoch": 0.7626168224299066,
"grad_norm": 0.028487270697951317,
"learning_rate": 0.0002,
"loss": 0.5550260543823242,
"mean_token_accuracy": 0.7747003883123398,
"num_tokens": 3326542.0,
"step": 204
},
{
"entropy": 0.5515165776014328,
"epoch": 0.7663551401869159,
"grad_norm": 0.02944660186767578,
"learning_rate": 0.0002,
"loss": 0.5483176708221436,
"mean_token_accuracy": 0.7772196680307388,
"num_tokens": 3342960.0,
"step": 205
},
{
"entropy": 0.5516369044780731,
"epoch": 0.7700934579439253,
"grad_norm": 0.02446347288787365,
"learning_rate": 0.0002,
"loss": 0.5510342121124268,
"mean_token_accuracy": 0.7753156870603561,
"num_tokens": 3359361.0,
"step": 206
},
{
"entropy": 0.562598317861557,
"epoch": 0.7738317757009345,
"grad_norm": 0.032002996653318405,
"learning_rate": 0.0002,
"loss": 0.5551044344902039,
"mean_token_accuracy": 0.7748953849077225,
"num_tokens": 3375695.0,
"step": 207
},
{
"entropy": 0.5636338144540787,
"epoch": 0.7775700934579439,
"grad_norm": 0.032179221510887146,
"learning_rate": 0.0002,
"loss": 0.564883291721344,
"mean_token_accuracy": 0.7722733914852142,
"num_tokens": 3391711.0,
"step": 208
},
{
"entropy": 0.5475672632455826,
"epoch": 0.7813084112149533,
"grad_norm": 0.03206668421626091,
"learning_rate": 0.0002,
"loss": 0.5551382899284363,
"mean_token_accuracy": 0.7726904302835464,
"num_tokens": 3407951.0,
"step": 209
},
{
"entropy": 0.540259450674057,
"epoch": 0.7850467289719626,
"grad_norm": 0.02936564013361931,
"learning_rate": 0.0002,
"loss": 0.5508178472518921,
"mean_token_accuracy": 0.7771763801574707,
"num_tokens": 3424278.0,
"step": 210
},
{
"entropy": 0.5564334988594055,
"epoch": 0.788785046728972,
"grad_norm": 0.03052506223320961,
"learning_rate": 0.0002,
"loss": 0.5652161240577698,
"mean_token_accuracy": 0.770373746752739,
"num_tokens": 3440796.0,
"step": 211
},
{
"entropy": 0.5524326264858246,
"epoch": 0.7925233644859813,
"grad_norm": 0.025716882199048996,
"learning_rate": 0.0002,
"loss": 0.5483862161636353,
"mean_token_accuracy": 0.778383657336235,
"num_tokens": 3457162.0,
"step": 212
},
{
"entropy": 0.5574807077646255,
"epoch": 0.7962616822429907,
"grad_norm": 0.026924515143036842,
"learning_rate": 0.0002,
"loss": 0.5535562634468079,
"mean_token_accuracy": 0.7756220400333405,
"num_tokens": 3473707.0,
"step": 213
},
{
"entropy": 0.558317020535469,
"epoch": 0.8,
"grad_norm": 0.025764374062418938,
"learning_rate": 0.0002,
"loss": 0.560704231262207,
"mean_token_accuracy": 0.7712857127189636,
"num_tokens": 3490125.0,
"step": 214
},
{
"entropy": 0.5554333925247192,
"epoch": 0.8037383177570093,
"grad_norm": 0.028298519551753998,
"learning_rate": 0.0002,
"loss": 0.5522173643112183,
"mean_token_accuracy": 0.7743871361017227,
"num_tokens": 3506505.0,
"step": 215
},
{
"entropy": 0.5587067157030106,
"epoch": 0.8074766355140187,
"grad_norm": 0.02431626431643963,
"learning_rate": 0.0002,
"loss": 0.5544553995132446,
"mean_token_accuracy": 0.7743324339389801,
"num_tokens": 3522958.0,
"step": 216
},
{
"entropy": 0.5645765364170074,
"epoch": 0.811214953271028,
"grad_norm": 0.02611798420548439,
"learning_rate": 0.0002,
"loss": 0.5644361972808838,
"mean_token_accuracy": 0.7711465507745743,
"num_tokens": 3539490.0,
"step": 217
},
{
"entropy": 0.5525356978178024,
"epoch": 0.8149532710280374,
"grad_norm": 0.03383297845721245,
"learning_rate": 0.0002,
"loss": 0.5598211884498596,
"mean_token_accuracy": 0.7742004096508026,
"num_tokens": 3555746.0,
"step": 218
},
{
"entropy": 0.5621150583028793,
"epoch": 0.8186915887850468,
"grad_norm": 0.030269736424088478,
"learning_rate": 0.0002,
"loss": 0.5634778738021851,
"mean_token_accuracy": 0.7692747861146927,
"num_tokens": 3572256.0,
"step": 219
},
{
"entropy": 0.5514157265424728,
"epoch": 0.822429906542056,
"grad_norm": 0.028750412166118622,
"learning_rate": 0.0002,
"loss": 0.5467870831489563,
"mean_token_accuracy": 0.7769519984722137,
"num_tokens": 3588550.0,
"step": 220
},
{
"entropy": 0.5368104577064514,
"epoch": 0.8261682242990654,
"grad_norm": 0.03091045655310154,
"learning_rate": 0.0002,
"loss": 0.5372405648231506,
"mean_token_accuracy": 0.7840253859758377,
"num_tokens": 3604659.0,
"step": 221
},
{
"entropy": 0.5409716814756393,
"epoch": 0.8299065420560747,
"grad_norm": 0.03386515751481056,
"learning_rate": 0.0002,
"loss": 0.548212468624115,
"mean_token_accuracy": 0.7736510932445526,
"num_tokens": 3620843.0,
"step": 222
},
{
"entropy": 0.5629084706306458,
"epoch": 0.8336448598130841,
"grad_norm": 0.040728501975536346,
"learning_rate": 0.0002,
"loss": 0.5746021270751953,
"mean_token_accuracy": 0.7647373080253601,
"num_tokens": 3637324.0,
"step": 223
},
{
"entropy": 0.5369234085083008,
"epoch": 0.8373831775700935,
"grad_norm": 0.029392162337899208,
"learning_rate": 0.0002,
"loss": 0.5397970080375671,
"mean_token_accuracy": 0.7819121479988098,
"num_tokens": 3653633.0,
"step": 224
},
{
"entropy": 0.5768532902002335,
"epoch": 0.8411214953271028,
"grad_norm": 0.033986181020736694,
"learning_rate": 0.0002,
"loss": 0.5701450109481812,
"mean_token_accuracy": 0.7669256031513214,
"num_tokens": 3670158.0,
"step": 225
},
{
"entropy": 0.5465534925460815,
"epoch": 0.8448598130841122,
"grad_norm": 0.034689608961343765,
"learning_rate": 0.0002,
"loss": 0.539010226726532,
"mean_token_accuracy": 0.7829751968383789,
"num_tokens": 3686415.0,
"step": 226
},
{
"entropy": 0.5669656842947006,
"epoch": 0.8485981308411215,
"grad_norm": 0.029157601296901703,
"learning_rate": 0.0002,
"loss": 0.5645594596862793,
"mean_token_accuracy": 0.7721282690763474,
"num_tokens": 3702620.0,
"step": 227
},
{
"entropy": 0.5713803917169571,
"epoch": 0.8523364485981308,
"grad_norm": 0.032975275069475174,
"learning_rate": 0.0002,
"loss": 0.5758609771728516,
"mean_token_accuracy": 0.7657817453145981,
"num_tokens": 3719219.0,
"step": 228
},
{
"entropy": 0.5463247001171112,
"epoch": 0.8560747663551402,
"grad_norm": 0.039444658905267715,
"learning_rate": 0.0002,
"loss": 0.5534209609031677,
"mean_token_accuracy": 0.7726487815380096,
"num_tokens": 3735438.0,
"step": 229
},
{
"entropy": 0.556586429476738,
"epoch": 0.8598130841121495,
"grad_norm": 0.02616702765226364,
"learning_rate": 0.0002,
"loss": 0.5549170970916748,
"mean_token_accuracy": 0.7752689123153687,
"num_tokens": 3751785.0,
"step": 230
},
{
"entropy": 0.5389135032892227,
"epoch": 0.8635514018691589,
"grad_norm": 0.03276278078556061,
"learning_rate": 0.0002,
"loss": 0.5399537086486816,
"mean_token_accuracy": 0.781702533364296,
"num_tokens": 3767826.0,
"step": 231
},
{
"entropy": 0.5364359021186829,
"epoch": 0.8672897196261682,
"grad_norm": 0.026118800044059753,
"learning_rate": 0.0002,
"loss": 0.5382952094078064,
"mean_token_accuracy": 0.780514121055603,
"num_tokens": 3783919.0,
"step": 232
},
{
"entropy": 0.5687360912561417,
"epoch": 0.8710280373831776,
"grad_norm": 0.03209976479411125,
"learning_rate": 0.0002,
"loss": 0.5756676197052002,
"mean_token_accuracy": 0.7664439678192139,
"num_tokens": 3800454.0,
"step": 233
},
{
"entropy": 0.5679410099983215,
"epoch": 0.874766355140187,
"grad_norm": 0.025931114330887794,
"learning_rate": 0.0002,
"loss": 0.5656247138977051,
"mean_token_accuracy": 0.7693636864423752,
"num_tokens": 3816747.0,
"step": 234
},
{
"entropy": 0.557420089840889,
"epoch": 0.8785046728971962,
"grad_norm": 0.02894972637295723,
"learning_rate": 0.0002,
"loss": 0.5490383505821228,
"mean_token_accuracy": 0.7750599384307861,
"num_tokens": 3833058.0,
"step": 235
},
{
"entropy": 0.560372844338417,
"epoch": 0.8822429906542056,
"grad_norm": 0.03646957501769066,
"learning_rate": 0.0002,
"loss": 0.5596282482147217,
"mean_token_accuracy": 0.7726272940635681,
"num_tokens": 3849415.0,
"step": 236
},
{
"entropy": 0.5550010055303574,
"epoch": 0.8859813084112149,
"grad_norm": 0.026594942435622215,
"learning_rate": 0.0002,
"loss": 0.5539083480834961,
"mean_token_accuracy": 0.7734427750110626,
"num_tokens": 3865776.0,
"step": 237
},
{
"entropy": 0.5347648710012436,
"epoch": 0.8897196261682243,
"grad_norm": 0.03385410085320473,
"learning_rate": 0.0002,
"loss": 0.5472573041915894,
"mean_token_accuracy": 0.7766564786434174,
"num_tokens": 3882018.0,
"step": 238
},
{
"entropy": 0.5376404300332069,
"epoch": 0.8934579439252337,
"grad_norm": 0.040597062557935715,
"learning_rate": 0.0002,
"loss": 0.5544540286064148,
"mean_token_accuracy": 0.7728734314441681,
"num_tokens": 3898287.0,
"step": 239
},
{
"entropy": 0.5667798519134521,
"epoch": 0.897196261682243,
"grad_norm": 0.027665674686431885,
"learning_rate": 0.0002,
"loss": 0.5663026571273804,
"mean_token_accuracy": 0.770405575633049,
"num_tokens": 3914775.0,
"step": 240
},
{
"entropy": 0.550272524356842,
"epoch": 0.9009345794392524,
"grad_norm": 0.029484877362847328,
"learning_rate": 0.0002,
"loss": 0.5427078008651733,
"mean_token_accuracy": 0.7818168848752975,
"num_tokens": 3930889.0,
"step": 241
},
{
"entropy": 0.5710694193840027,
"epoch": 0.9046728971962616,
"grad_norm": 0.027631685137748718,
"learning_rate": 0.0002,
"loss": 0.561673641204834,
"mean_token_accuracy": 0.7728846818208694,
"num_tokens": 3947233.0,
"step": 242
},
{
"entropy": 0.5513755828142166,
"epoch": 0.908411214953271,
"grad_norm": 0.030272630974650383,
"learning_rate": 0.0002,
"loss": 0.5467454195022583,
"mean_token_accuracy": 0.7779553532600403,
"num_tokens": 3963468.0,
"step": 243
},
{
"entropy": 0.5469895005226135,
"epoch": 0.9121495327102803,
"grad_norm": 0.03090892918407917,
"learning_rate": 0.0002,
"loss": 0.5560286045074463,
"mean_token_accuracy": 0.7723891735076904,
"num_tokens": 3979910.0,
"step": 244
},
{
"entropy": 0.5544413626194,
"epoch": 0.9158878504672897,
"grad_norm": 0.041499219834804535,
"learning_rate": 0.0002,
"loss": 0.5768874883651733,
"mean_token_accuracy": 0.7659346610307693,
"num_tokens": 3996196.0,
"step": 245
},
{
"entropy": 0.5447600036859512,
"epoch": 0.9196261682242991,
"grad_norm": 0.03076878748834133,
"learning_rate": 0.0002,
"loss": 0.5456743836402893,
"mean_token_accuracy": 0.7770105451345444,
"num_tokens": 4012511.0,
"step": 246
},
{
"entropy": 0.5538895577192307,
"epoch": 0.9233644859813084,
"grad_norm": 0.03173721581697464,
"learning_rate": 0.0002,
"loss": 0.5483969449996948,
"mean_token_accuracy": 0.7781166434288025,
"num_tokens": 4028651.0,
"step": 247
},
{
"entropy": 0.5794132798910141,
"epoch": 0.9271028037383178,
"grad_norm": 0.0297909714281559,
"learning_rate": 0.0002,
"loss": 0.5648066401481628,
"mean_token_accuracy": 0.7718619257211685,
"num_tokens": 4045251.0,
"step": 248
},
{
"entropy": 0.5547907501459122,
"epoch": 0.930841121495327,
"grad_norm": 0.03679649531841278,
"learning_rate": 0.0002,
"loss": 0.5462634563446045,
"mean_token_accuracy": 0.7801699191331863,
"num_tokens": 4061348.0,
"step": 249
},
{
"entropy": 0.5539078116416931,
"epoch": 0.9345794392523364,
"grad_norm": 0.02851703390479088,
"learning_rate": 0.0002,
"loss": 0.5593677163124084,
"mean_token_accuracy": 0.7756806910037994,
"num_tokens": 4077453.0,
"step": 250
},
{
"entropy": 0.5443865954875946,
"epoch": 0.9383177570093458,
"grad_norm": 0.030135581269860268,
"learning_rate": 0.0002,
"loss": 0.5505210161209106,
"mean_token_accuracy": 0.7767539322376251,
"num_tokens": 4093944.0,
"step": 251
},
{
"entropy": 0.5541698932647705,
"epoch": 0.9420560747663551,
"grad_norm": 0.03800193592905998,
"learning_rate": 0.0002,
"loss": 0.5603746175765991,
"mean_token_accuracy": 0.7716375887393951,
"num_tokens": 4110397.0,
"step": 252
},
{
"entropy": 0.5497024953365326,
"epoch": 0.9457943925233645,
"grad_norm": 0.030841615051031113,
"learning_rate": 0.0002,
"loss": 0.5577483177185059,
"mean_token_accuracy": 0.776105210185051,
"num_tokens": 4126788.0,
"step": 253
},
{
"entropy": 0.5452855974435806,
"epoch": 0.9495327102803738,
"grad_norm": 0.027110353112220764,
"learning_rate": 0.0002,
"loss": 0.5468145608901978,
"mean_token_accuracy": 0.7746452689170837,
"num_tokens": 4143252.0,
"step": 254
},
{
"entropy": 0.5483012199401855,
"epoch": 0.9532710280373832,
"grad_norm": 0.02763090282678604,
"learning_rate": 0.0002,
"loss": 0.542940616607666,
"mean_token_accuracy": 0.7776369601488113,
"num_tokens": 4159556.0,
"step": 255
},
{
"entropy": 0.5598485320806503,
"epoch": 0.9570093457943926,
"grad_norm": 0.02750120870769024,
"learning_rate": 0.0002,
"loss": 0.5518869161605835,
"mean_token_accuracy": 0.7762151658535004,
"num_tokens": 4175947.0,
"step": 256
},
{
"entropy": 0.5783872008323669,
"epoch": 0.9607476635514018,
"grad_norm": 0.03151006996631622,
"learning_rate": 0.0002,
"loss": 0.5734107494354248,
"mean_token_accuracy": 0.7695904821157455,
"num_tokens": 4192348.0,
"step": 257
},
{
"entropy": 0.5653168857097626,
"epoch": 0.9644859813084112,
"grad_norm": 0.03166348114609718,
"learning_rate": 0.0002,
"loss": 0.5732910633087158,
"mean_token_accuracy": 0.7679464519023895,
"num_tokens": 4208898.0,
"step": 258
},
{
"entropy": 0.5390284806489944,
"epoch": 0.9682242990654205,
"grad_norm": 0.026950784027576447,
"learning_rate": 0.0002,
"loss": 0.5455009937286377,
"mean_token_accuracy": 0.7775461375713348,
"num_tokens": 4225149.0,
"step": 259
},
{
"entropy": 0.565416008234024,
"epoch": 0.9719626168224299,
"grad_norm": 0.030768675729632378,
"learning_rate": 0.0002,
"loss": 0.5689860582351685,
"mean_token_accuracy": 0.7684348970651627,
"num_tokens": 4241389.0,
"step": 260
},
{
"entropy": 0.5577588826417923,
"epoch": 0.9757009345794393,
"grad_norm": 0.02680326998233795,
"learning_rate": 0.0002,
"loss": 0.5625928640365601,
"mean_token_accuracy": 0.7695075571537018,
"num_tokens": 4257979.0,
"step": 261
},
{
"entropy": 0.55104960501194,
"epoch": 0.9794392523364486,
"grad_norm": 0.027646353468298912,
"learning_rate": 0.0002,
"loss": 0.5484559535980225,
"mean_token_accuracy": 0.7766857743263245,
"num_tokens": 4274290.0,
"step": 262
},
{
"entropy": 0.5638265609741211,
"epoch": 0.983177570093458,
"grad_norm": 0.02871805429458618,
"learning_rate": 0.0002,
"loss": 0.5657901167869568,
"mean_token_accuracy": 0.7715673297643661,
"num_tokens": 4290725.0,
"step": 263
},
{
"entropy": 0.547324076294899,
"epoch": 0.9869158878504672,
"grad_norm": 0.02937854453921318,
"learning_rate": 0.0002,
"loss": 0.55534827709198,
"mean_token_accuracy": 0.7751762270927429,
"num_tokens": 4307326.0,
"step": 264
},
{
"entropy": 0.5487106442451477,
"epoch": 0.9906542056074766,
"grad_norm": 0.02548016607761383,
"learning_rate": 0.0002,
"loss": 0.5505661964416504,
"mean_token_accuracy": 0.7752106785774231,
"num_tokens": 4323823.0,
"step": 265
},
{
"entropy": 0.5634673833847046,
"epoch": 0.994392523364486,
"grad_norm": 0.026015356183052063,
"learning_rate": 0.0002,
"loss": 0.5634418725967407,
"mean_token_accuracy": 0.7709382921457291,
"num_tokens": 4340138.0,
"step": 266
},
{
"entropy": 0.5507746189832687,
"epoch": 0.9981308411214953,
"grad_norm": 0.026798918843269348,
"learning_rate": 0.0002,
"loss": 0.5513297915458679,
"mean_token_accuracy": 0.7769380956888199,
"num_tokens": 4356482.0,
"step": 267
},
{
"entropy": 0.5597052276134491,
"epoch": 1.0,
"grad_norm": 0.0342809222638607,
"learning_rate": 0.0002,
"loss": 0.5571821331977844,
"mean_token_accuracy": 0.774641364812851,
"num_tokens": 4364744.0,
"step": 268
},
{
"entropy": 0.557921290397644,
"epoch": 1.0037383177570094,
"grad_norm": 0.029891351237893105,
"learning_rate": 0.0002,
"loss": 0.5539438128471375,
"mean_token_accuracy": 0.7773818224668503,
"num_tokens": 4380930.0,
"step": 269
},
{
"entropy": 0.5416439026594162,
"epoch": 1.0074766355140188,
"grad_norm": 0.02803446725010872,
"learning_rate": 0.0002,
"loss": 0.5438423752784729,
"mean_token_accuracy": 0.7798180431127548,
"num_tokens": 4397244.0,
"step": 270
},
{
"entropy": 0.5285164415836334,
"epoch": 1.011214953271028,
"grad_norm": 0.03023347444832325,
"learning_rate": 0.0002,
"loss": 0.5358922481536865,
"mean_token_accuracy": 0.7807245850563049,
"num_tokens": 4413671.0,
"step": 271
},
{
"entropy": 0.5514080822467804,
"epoch": 1.0149532710280373,
"grad_norm": 0.027458516880869865,
"learning_rate": 0.0002,
"loss": 0.552421510219574,
"mean_token_accuracy": 0.7761755585670471,
"num_tokens": 4430035.0,
"step": 272
},
{
"entropy": 0.5706226229667664,
"epoch": 1.0186915887850467,
"grad_norm": 0.030846886336803436,
"learning_rate": 0.0002,
"loss": 0.5667564272880554,
"mean_token_accuracy": 0.7689130008220673,
"num_tokens": 4446382.0,
"step": 273
},
{
"entropy": 0.5511225461959839,
"epoch": 1.0224299065420561,
"grad_norm": 0.029439929872751236,
"learning_rate": 0.0002,
"loss": 0.5465920567512512,
"mean_token_accuracy": 0.7808292508125305,
"num_tokens": 4462677.0,
"step": 274
},
{
"entropy": 0.5416547358036041,
"epoch": 1.0261682242990655,
"grad_norm": 0.02822115644812584,
"learning_rate": 0.0002,
"loss": 0.5419396758079529,
"mean_token_accuracy": 0.7816834002733231,
"num_tokens": 4479083.0,
"step": 275
},
{
"entropy": 0.5574266612529755,
"epoch": 1.0299065420560747,
"grad_norm": 0.0327095128595829,
"learning_rate": 0.0002,
"loss": 0.5565608739852905,
"mean_token_accuracy": 0.7745349258184433,
"num_tokens": 4495797.0,
"step": 276
},
{
"entropy": 0.5387104451656342,
"epoch": 1.033644859813084,
"grad_norm": 0.03164896368980408,
"learning_rate": 0.0002,
"loss": 0.5406032800674438,
"mean_token_accuracy": 0.7823146730661392,
"num_tokens": 4512262.0,
"step": 277
},
{
"entropy": 0.5471370071172714,
"epoch": 1.0373831775700935,
"grad_norm": 0.03483380377292633,
"learning_rate": 0.0002,
"loss": 0.5550093054771423,
"mean_token_accuracy": 0.7783246338367462,
"num_tokens": 4528616.0,
"step": 278
},
{
"entropy": 0.5368807017803192,
"epoch": 1.0411214953271029,
"grad_norm": 0.03120633400976658,
"learning_rate": 0.0002,
"loss": 0.5417410731315613,
"mean_token_accuracy": 0.7802102267742157,
"num_tokens": 4544882.0,
"step": 279
},
{
"entropy": 0.5481929332017899,
"epoch": 1.0448598130841122,
"grad_norm": 0.029517389833927155,
"learning_rate": 0.0002,
"loss": 0.5472978353500366,
"mean_token_accuracy": 0.7788140177726746,
"num_tokens": 4561427.0,
"step": 280
},
{
"entropy": 0.5531918853521347,
"epoch": 1.0485981308411214,
"grad_norm": 0.03256995975971222,
"learning_rate": 0.0002,
"loss": 0.5502868890762329,
"mean_token_accuracy": 0.7784827798604965,
"num_tokens": 4577723.0,
"step": 281
},
{
"entropy": 0.5540415197610855,
"epoch": 1.0523364485981308,
"grad_norm": 0.026578353717923164,
"learning_rate": 0.0002,
"loss": 0.555966854095459,
"mean_token_accuracy": 0.775706946849823,
"num_tokens": 4594128.0,
"step": 282
},
{
"entropy": 0.5517027229070663,
"epoch": 1.0560747663551402,
"grad_norm": 0.030103787779808044,
"learning_rate": 0.0002,
"loss": 0.5502108931541443,
"mean_token_accuracy": 0.7753856778144836,
"num_tokens": 4610255.0,
"step": 283
},
{
"entropy": 0.5304621160030365,
"epoch": 1.0598130841121496,
"grad_norm": 0.029368899762630463,
"learning_rate": 0.0002,
"loss": 0.5297666788101196,
"mean_token_accuracy": 0.7840214222669601,
"num_tokens": 4626599.0,
"step": 284
},
{
"entropy": 0.5305260270833969,
"epoch": 1.063551401869159,
"grad_norm": 0.029124870896339417,
"learning_rate": 0.0002,
"loss": 0.5363407135009766,
"mean_token_accuracy": 0.7847000658512115,
"num_tokens": 4642927.0,
"step": 285
},
{
"entropy": 0.5300263911485672,
"epoch": 1.0672897196261681,
"grad_norm": 0.028800450265407562,
"learning_rate": 0.0002,
"loss": 0.52923583984375,
"mean_token_accuracy": 0.7828178703784943,
"num_tokens": 4659455.0,
"step": 286
},
{
"entropy": 0.5497115254402161,
"epoch": 1.0710280373831775,
"grad_norm": 0.03032800555229187,
"learning_rate": 0.0002,
"loss": 0.5526697039604187,
"mean_token_accuracy": 0.7718490660190582,
"num_tokens": 4675747.0,
"step": 287
},
{
"entropy": 0.5266695320606232,
"epoch": 1.074766355140187,
"grad_norm": 0.02653171867132187,
"learning_rate": 0.0002,
"loss": 0.5255345702171326,
"mean_token_accuracy": 0.7853638082742691,
"num_tokens": 4691992.0,
"step": 288
},
{
"entropy": 0.5461495667695999,
"epoch": 1.0785046728971963,
"grad_norm": 0.025956284254789352,
"learning_rate": 0.0002,
"loss": 0.5439239740371704,
"mean_token_accuracy": 0.7808811217546463,
"num_tokens": 4708487.0,
"step": 289
},
{
"entropy": 0.5421788841485977,
"epoch": 1.0822429906542057,
"grad_norm": 0.02735847234725952,
"learning_rate": 0.0002,
"loss": 0.5411931872367859,
"mean_token_accuracy": 0.7771425247192383,
"num_tokens": 4724824.0,
"step": 290
},
{
"entropy": 0.5556438118219376,
"epoch": 1.0859813084112149,
"grad_norm": 0.026816118508577347,
"learning_rate": 0.0002,
"loss": 0.5484311580657959,
"mean_token_accuracy": 0.7775956392288208,
"num_tokens": 4741264.0,
"step": 291
},
{
"entropy": 0.5614602714776993,
"epoch": 1.0897196261682243,
"grad_norm": 0.03428835794329643,
"learning_rate": 0.0002,
"loss": 0.5635286569595337,
"mean_token_accuracy": 0.7734779864549637,
"num_tokens": 4757630.0,
"step": 292
},
{
"entropy": 0.5510146170854568,
"epoch": 1.0934579439252337,
"grad_norm": 0.030845943838357925,
"learning_rate": 0.0002,
"loss": 0.5562302470207214,
"mean_token_accuracy": 0.773259237408638,
"num_tokens": 4773723.0,
"step": 293
},
{
"entropy": 0.5555125325918198,
"epoch": 1.097196261682243,
"grad_norm": 0.028586354106664658,
"learning_rate": 0.0002,
"loss": 0.5588027834892273,
"mean_token_accuracy": 0.7723042815923691,
"num_tokens": 4790204.0,
"step": 294
},
{
"entropy": 0.53548863530159,
"epoch": 1.1009345794392524,
"grad_norm": 0.032421719282865524,
"learning_rate": 0.0002,
"loss": 0.5428792238235474,
"mean_token_accuracy": 0.780792623758316,
"num_tokens": 4806715.0,
"step": 295
},
{
"entropy": 0.5266362577676773,
"epoch": 1.1046728971962616,
"grad_norm": 0.044794633984565735,
"learning_rate": 0.0002,
"loss": 0.5296044945716858,
"mean_token_accuracy": 0.7850557416677475,
"num_tokens": 4822693.0,
"step": 296
},
{
"entropy": 0.547786682844162,
"epoch": 1.108411214953271,
"grad_norm": 0.03065192885696888,
"learning_rate": 0.0002,
"loss": 0.545957088470459,
"mean_token_accuracy": 0.7773084342479706,
"num_tokens": 4838834.0,
"step": 297
},
{
"entropy": 0.5526397377252579,
"epoch": 1.1121495327102804,
"grad_norm": 0.03121815249323845,
"learning_rate": 0.0002,
"loss": 0.5505586862564087,
"mean_token_accuracy": 0.7751570343971252,
"num_tokens": 4854891.0,
"step": 298
},
{
"entropy": 0.556088924407959,
"epoch": 1.1158878504672898,
"grad_norm": 0.03519770875573158,
"learning_rate": 0.0002,
"loss": 0.5572479367256165,
"mean_token_accuracy": 0.7747550010681152,
"num_tokens": 4871140.0,
"step": 299
},
{
"entropy": 0.5376470685005188,
"epoch": 1.1196261682242992,
"grad_norm": 0.03193943575024605,
"learning_rate": 0.0002,
"loss": 0.5455138087272644,
"mean_token_accuracy": 0.7797031998634338,
"num_tokens": 4887274.0,
"step": 300
},
{
"entropy": 0.5635453760623932,
"epoch": 1.1233644859813083,
"grad_norm": 0.041273750364780426,
"learning_rate": 0.0002,
"loss": 0.5696390867233276,
"mean_token_accuracy": 0.76914082467556,
"num_tokens": 4903573.0,
"step": 301
},
{
"entropy": 0.5702975988388062,
"epoch": 1.1271028037383177,
"grad_norm": 0.03010556660592556,
"learning_rate": 0.0002,
"loss": 0.5622550845146179,
"mean_token_accuracy": 0.7727158814668655,
"num_tokens": 4919926.0,
"step": 302
},
{
"entropy": 0.5415271073579788,
"epoch": 1.1308411214953271,
"grad_norm": 0.0310966819524765,
"learning_rate": 0.0002,
"loss": 0.5458844900131226,
"mean_token_accuracy": 0.776058241724968,
"num_tokens": 4936123.0,
"step": 303
},
{
"entropy": 0.5403020679950714,
"epoch": 1.1345794392523365,
"grad_norm": 0.04535767808556557,
"learning_rate": 0.0002,
"loss": 0.5387758612632751,
"mean_token_accuracy": 0.7784536480903625,
"num_tokens": 4952502.0,
"step": 304
},
{
"entropy": 0.5479062646627426,
"epoch": 1.1383177570093457,
"grad_norm": 0.028153905645012856,
"learning_rate": 0.0002,
"loss": 0.5478588938713074,
"mean_token_accuracy": 0.7770532369613647,
"num_tokens": 4968823.0,
"step": 305
},
{
"entropy": 0.5423109382390976,
"epoch": 1.142056074766355,
"grad_norm": 0.03606940805912018,
"learning_rate": 0.0002,
"loss": 0.5508921146392822,
"mean_token_accuracy": 0.7769752442836761,
"num_tokens": 4985183.0,
"step": 306
},
{
"entropy": 0.5484813451766968,
"epoch": 1.1457943925233645,
"grad_norm": 0.02960861474275589,
"learning_rate": 0.0002,
"loss": 0.5549089312553406,
"mean_token_accuracy": 0.7753880023956299,
"num_tokens": 5001335.0,
"step": 307
},
{
"entropy": 0.5498395711183548,
"epoch": 1.1495327102803738,
"grad_norm": 0.036366142332553864,
"learning_rate": 0.0002,
"loss": 0.5471988916397095,
"mean_token_accuracy": 0.7787120938301086,
"num_tokens": 5017387.0,
"step": 308
},
{
"entropy": 0.5530393719673157,
"epoch": 1.1532710280373832,
"grad_norm": 0.029028775170445442,
"learning_rate": 0.0002,
"loss": 0.5492241978645325,
"mean_token_accuracy": 0.7761663198471069,
"num_tokens": 5033567.0,
"step": 309
},
{
"entropy": 0.5492727905511856,
"epoch": 1.1570093457943926,
"grad_norm": 0.03352445736527443,
"learning_rate": 0.0002,
"loss": 0.5540640354156494,
"mean_token_accuracy": 0.7749823033809662,
"num_tokens": 5049801.0,
"step": 310
},
{
"entropy": 0.5666168481111526,
"epoch": 1.1607476635514018,
"grad_norm": 0.035840339958667755,
"learning_rate": 0.0002,
"loss": 0.5706231594085693,
"mean_token_accuracy": 0.7669289708137512,
"num_tokens": 5066204.0,
"step": 311
},
{
"entropy": 0.5425457805395126,
"epoch": 1.1644859813084112,
"grad_norm": 0.03181692957878113,
"learning_rate": 0.0002,
"loss": 0.5458914041519165,
"mean_token_accuracy": 0.7774879634380341,
"num_tokens": 5082493.0,
"step": 312
},
{
"entropy": 0.5557267963886261,
"epoch": 1.1682242990654206,
"grad_norm": 0.035230670124292374,
"learning_rate": 0.0002,
"loss": 0.5475496053695679,
"mean_token_accuracy": 0.7787989675998688,
"num_tokens": 5098639.0,
"step": 313
},
{
"entropy": 0.5714587569236755,
"epoch": 1.17196261682243,
"grad_norm": 0.03392059728503227,
"learning_rate": 0.0002,
"loss": 0.5622156262397766,
"mean_token_accuracy": 0.7719752937555313,
"num_tokens": 5114831.0,
"step": 314
},
{
"entropy": 0.5439812690019608,
"epoch": 1.1757009345794391,
"grad_norm": 0.027537284418940544,
"learning_rate": 0.0002,
"loss": 0.5427182912826538,
"mean_token_accuracy": 0.7786365002393723,
"num_tokens": 5131121.0,
"step": 315
},
{
"entropy": 0.5388712882995605,
"epoch": 1.1794392523364485,
"grad_norm": 0.03216094896197319,
"learning_rate": 0.0002,
"loss": 0.5446818470954895,
"mean_token_accuracy": 0.7791234254837036,
"num_tokens": 5147422.0,
"step": 316
},
{
"entropy": 0.53206005692482,
"epoch": 1.183177570093458,
"grad_norm": 0.032054752111434937,
"learning_rate": 0.0002,
"loss": 0.5439627170562744,
"mean_token_accuracy": 0.7801449149847031,
"num_tokens": 5163884.0,
"step": 317
},
{
"entropy": 0.5308776497840881,
"epoch": 1.1869158878504673,
"grad_norm": 0.032574739307165146,
"learning_rate": 0.0002,
"loss": 0.5392112731933594,
"mean_token_accuracy": 0.777498260140419,
"num_tokens": 5180398.0,
"step": 318
},
{
"entropy": 0.5427455455064774,
"epoch": 1.1906542056074767,
"grad_norm": 0.03152874857187271,
"learning_rate": 0.0002,
"loss": 0.5452929139137268,
"mean_token_accuracy": 0.7787911593914032,
"num_tokens": 5196640.0,
"step": 319
},
{
"entropy": 0.570340633392334,
"epoch": 1.194392523364486,
"grad_norm": 0.03098403289914131,
"learning_rate": 0.0002,
"loss": 0.5688466429710388,
"mean_token_accuracy": 0.7672817558050156,
"num_tokens": 5212767.0,
"step": 320
},
{
"entropy": 0.5646504908800125,
"epoch": 1.1981308411214953,
"grad_norm": 0.032602474093437195,
"learning_rate": 0.0002,
"loss": 0.5595831274986267,
"mean_token_accuracy": 0.7738354504108429,
"num_tokens": 5229143.0,
"step": 321
},
{
"entropy": 0.541440024971962,
"epoch": 1.2018691588785047,
"grad_norm": 0.0346127450466156,
"learning_rate": 0.0002,
"loss": 0.5328572988510132,
"mean_token_accuracy": 0.7842471748590469,
"num_tokens": 5245349.0,
"step": 322
},
{
"entropy": 0.5371421873569489,
"epoch": 1.205607476635514,
"grad_norm": 0.030524473637342453,
"learning_rate": 0.0002,
"loss": 0.5316073894500732,
"mean_token_accuracy": 0.7839267402887344,
"num_tokens": 5261740.0,
"step": 323
},
{
"entropy": 0.5501479953527451,
"epoch": 1.2093457943925234,
"grad_norm": 0.04006117209792137,
"learning_rate": 0.0002,
"loss": 0.5546258687973022,
"mean_token_accuracy": 0.7740581333637238,
"num_tokens": 5278402.0,
"step": 324
},
{
"entropy": 0.5427927225828171,
"epoch": 1.2130841121495326,
"grad_norm": 0.028997933492064476,
"learning_rate": 0.0002,
"loss": 0.546272873878479,
"mean_token_accuracy": 0.77626071870327,
"num_tokens": 5295096.0,
"step": 325
},
{
"entropy": 0.5374629199504852,
"epoch": 1.216822429906542,
"grad_norm": 0.031449392437934875,
"learning_rate": 0.0002,
"loss": 0.5484204292297363,
"mean_token_accuracy": 0.7783177495002747,
"num_tokens": 5311451.0,
"step": 326
},
{
"entropy": 0.5593861639499664,
"epoch": 1.2205607476635514,
"grad_norm": 0.033892612904310226,
"learning_rate": 0.0002,
"loss": 0.5527151823043823,
"mean_token_accuracy": 0.7769543379545212,
"num_tokens": 5327705.0,
"step": 327
},
{
"entropy": 0.5403755158185959,
"epoch": 1.2242990654205608,
"grad_norm": 0.029873648658394814,
"learning_rate": 0.0002,
"loss": 0.5416997075080872,
"mean_token_accuracy": 0.7783119082450867,
"num_tokens": 5344110.0,
"step": 328
},
{
"entropy": 0.5473423600196838,
"epoch": 1.2280373831775702,
"grad_norm": 0.028266677632927895,
"learning_rate": 0.0002,
"loss": 0.5524438619613647,
"mean_token_accuracy": 0.7769231647253036,
"num_tokens": 5360378.0,
"step": 329
},
{
"entropy": 0.5364970713853836,
"epoch": 1.2317757009345796,
"grad_norm": 0.03534099832177162,
"learning_rate": 0.0002,
"loss": 0.5341481566429138,
"mean_token_accuracy": 0.783685103058815,
"num_tokens": 5376600.0,
"step": 330
},
{
"entropy": 0.5472245216369629,
"epoch": 1.2355140186915887,
"grad_norm": 0.030261849984526634,
"learning_rate": 0.0002,
"loss": 0.5478684306144714,
"mean_token_accuracy": 0.7797873020172119,
"num_tokens": 5392761.0,
"step": 331
},
{
"entropy": 0.545607790350914,
"epoch": 1.2392523364485981,
"grad_norm": 0.029436452314257622,
"learning_rate": 0.0002,
"loss": 0.546855628490448,
"mean_token_accuracy": 0.7786357402801514,
"num_tokens": 5409133.0,
"step": 332
},
{
"entropy": 0.5291889756917953,
"epoch": 1.2429906542056075,
"grad_norm": 0.03353505581617355,
"learning_rate": 0.0002,
"loss": 0.5353861451148987,
"mean_token_accuracy": 0.7811570167541504,
"num_tokens": 5425384.0,
"step": 333
},
{
"entropy": 0.5578002631664276,
"epoch": 1.246728971962617,
"grad_norm": 0.03168244659900665,
"learning_rate": 0.0002,
"loss": 0.5618013143539429,
"mean_token_accuracy": 0.7705619186162949,
"num_tokens": 5441708.0,
"step": 334
},
{
"entropy": 0.555315688252449,
"epoch": 1.250467289719626,
"grad_norm": 0.03206615522503853,
"learning_rate": 0.0002,
"loss": 0.5600447654724121,
"mean_token_accuracy": 0.7714688628911972,
"num_tokens": 5457884.0,
"step": 335
},
{
"entropy": 0.5601648688316345,
"epoch": 1.2542056074766355,
"grad_norm": 0.03804044798016548,
"learning_rate": 0.0002,
"loss": 0.5550276637077332,
"mean_token_accuracy": 0.7733457237482071,
"num_tokens": 5474231.0,
"step": 336
},
{
"entropy": 0.542451411485672,
"epoch": 1.2579439252336448,
"grad_norm": 0.029554393142461777,
"learning_rate": 0.0002,
"loss": 0.5353547930717468,
"mean_token_accuracy": 0.7827602028846741,
"num_tokens": 5490557.0,
"step": 337
},
{
"entropy": 0.5396464318037033,
"epoch": 1.2616822429906542,
"grad_norm": 0.02930438332259655,
"learning_rate": 0.0002,
"loss": 0.5352525115013123,
"mean_token_accuracy": 0.782452329993248,
"num_tokens": 5506827.0,
"step": 338
},
{
"entropy": 0.551433265209198,
"epoch": 1.2654205607476636,
"grad_norm": 0.03803868591785431,
"learning_rate": 0.0002,
"loss": 0.5564743280410767,
"mean_token_accuracy": 0.7742451429367065,
"num_tokens": 5523197.0,
"step": 339
},
{
"entropy": 0.5405130237340927,
"epoch": 1.269158878504673,
"grad_norm": 0.03335575759410858,
"learning_rate": 0.0002,
"loss": 0.5447483062744141,
"mean_token_accuracy": 0.777386024594307,
"num_tokens": 5539570.0,
"step": 340
},
{
"entropy": 0.5281671732664108,
"epoch": 1.2728971962616822,
"grad_norm": 0.03668655455112457,
"learning_rate": 0.0002,
"loss": 0.5369662642478943,
"mean_token_accuracy": 0.7818697243928909,
"num_tokens": 5556018.0,
"step": 341
},
{
"entropy": 0.5445946455001831,
"epoch": 1.2766355140186916,
"grad_norm": 0.03418565168976784,
"learning_rate": 0.0002,
"loss": 0.5481922626495361,
"mean_token_accuracy": 0.7817248553037643,
"num_tokens": 5571921.0,
"step": 342
},
{
"entropy": 0.5692614763975143,
"epoch": 1.280373831775701,
"grad_norm": 0.032861191779375076,
"learning_rate": 0.0002,
"loss": 0.5536470413208008,
"mean_token_accuracy": 0.7768330574035645,
"num_tokens": 5588242.0,
"step": 343
},
{
"entropy": 0.5534744560718536,
"epoch": 1.2841121495327104,
"grad_norm": 0.02994309738278389,
"learning_rate": 0.0002,
"loss": 0.5490615367889404,
"mean_token_accuracy": 0.7776058167219162,
"num_tokens": 5604646.0,
"step": 344
},
{
"entropy": 0.5477103441953659,
"epoch": 1.2878504672897195,
"grad_norm": 0.0329648032784462,
"learning_rate": 0.0002,
"loss": 0.5608856678009033,
"mean_token_accuracy": 0.769044816493988,
"num_tokens": 5620822.0,
"step": 345
},
{
"entropy": 0.5447603911161423,
"epoch": 1.291588785046729,
"grad_norm": 0.038630835711956024,
"learning_rate": 0.0002,
"loss": 0.5517427921295166,
"mean_token_accuracy": 0.776050254702568,
"num_tokens": 5637254.0,
"step": 346
},
{
"entropy": 0.5543326735496521,
"epoch": 1.2953271028037383,
"grad_norm": 0.03234436735510826,
"learning_rate": 0.0002,
"loss": 0.5605747103691101,
"mean_token_accuracy": 0.7735925763845444,
"num_tokens": 5653687.0,
"step": 347
},
{
"entropy": 0.5351574122905731,
"epoch": 1.2990654205607477,
"grad_norm": 0.03387833759188652,
"learning_rate": 0.0002,
"loss": 0.5403937697410583,
"mean_token_accuracy": 0.7819892168045044,
"num_tokens": 5670152.0,
"step": 348
},
{
"entropy": 0.5567533820867538,
"epoch": 1.302803738317757,
"grad_norm": 0.0311372522264719,
"learning_rate": 0.0002,
"loss": 0.5512552261352539,
"mean_token_accuracy": 0.7762364596128464,
"num_tokens": 5686422.0,
"step": 349
},
{
"entropy": 0.5508190989494324,
"epoch": 1.3065420560747665,
"grad_norm": 0.027689168229699135,
"learning_rate": 0.0002,
"loss": 0.5455954074859619,
"mean_token_accuracy": 0.7787918448448181,
"num_tokens": 5702832.0,
"step": 350
},
{
"entropy": 0.5493623167276382,
"epoch": 1.3102803738317756,
"grad_norm": 0.03188028931617737,
"learning_rate": 0.0002,
"loss": 0.5508118867874146,
"mean_token_accuracy": 0.7741293609142303,
"num_tokens": 5719201.0,
"step": 351
},
{
"entropy": 0.5517994910478592,
"epoch": 1.314018691588785,
"grad_norm": 0.03255178779363632,
"learning_rate": 0.0002,
"loss": 0.5581218004226685,
"mean_token_accuracy": 0.7717841118574142,
"num_tokens": 5735507.0,
"step": 352
},
{
"entropy": 0.5363009721040726,
"epoch": 1.3177570093457944,
"grad_norm": 0.0318707600235939,
"learning_rate": 0.0002,
"loss": 0.5422943234443665,
"mean_token_accuracy": 0.7783725261688232,
"num_tokens": 5751653.0,
"step": 353
},
{
"entropy": 0.5449318736791611,
"epoch": 1.3214953271028038,
"grad_norm": 0.028741504997015,
"learning_rate": 0.0002,
"loss": 0.539950966835022,
"mean_token_accuracy": 0.7803268283605576,
"num_tokens": 5768167.0,
"step": 354
},
{
"entropy": 0.5602855980396271,
"epoch": 1.325233644859813,
"grad_norm": 0.030420802533626556,
"learning_rate": 0.0002,
"loss": 0.554990291595459,
"mean_token_accuracy": 0.7761643081903458,
"num_tokens": 5784542.0,
"step": 355
},
{
"entropy": 0.56887586414814,
"epoch": 1.3289719626168224,
"grad_norm": 0.03126989305019379,
"learning_rate": 0.0002,
"loss": 0.5672231912612915,
"mean_token_accuracy": 0.7678193151950836,
"num_tokens": 5801095.0,
"step": 356
},
{
"entropy": 0.5738541036844254,
"epoch": 1.3327102803738318,
"grad_norm": 0.03625823184847832,
"learning_rate": 0.0002,
"loss": 0.5728395581245422,
"mean_token_accuracy": 0.7666806429624557,
"num_tokens": 5817738.0,
"step": 357
},
{
"entropy": 0.5436241179704666,
"epoch": 1.3364485981308412,
"grad_norm": 0.03443320468068123,
"learning_rate": 0.0002,
"loss": 0.5367251634597778,
"mean_token_accuracy": 0.7828597128391266,
"num_tokens": 5834159.0,
"step": 358
},
{
"entropy": 0.5450441539287567,
"epoch": 1.3401869158878505,
"grad_norm": 0.02960045635700226,
"learning_rate": 0.0002,
"loss": 0.5478132963180542,
"mean_token_accuracy": 0.7773353010416031,
"num_tokens": 5850353.0,
"step": 359
},
{
"entropy": 0.559371218085289,
"epoch": 1.34392523364486,
"grad_norm": 0.043439071625471115,
"learning_rate": 0.0002,
"loss": 0.5704307556152344,
"mean_token_accuracy": 0.7674223929643631,
"num_tokens": 5866661.0,
"step": 360
},
{
"entropy": 0.5383078157901764,
"epoch": 1.347663551401869,
"grad_norm": 0.031151141971349716,
"learning_rate": 0.0002,
"loss": 0.5475639700889587,
"mean_token_accuracy": 0.7764850705862045,
"num_tokens": 5883147.0,
"step": 361
},
{
"entropy": 0.5361460000276566,
"epoch": 1.3514018691588785,
"grad_norm": 0.0367986336350441,
"learning_rate": 0.0002,
"loss": 0.5413030385971069,
"mean_token_accuracy": 0.7792898863554001,
"num_tokens": 5899337.0,
"step": 362
},
{
"entropy": 0.5393686443567276,
"epoch": 1.355140186915888,
"grad_norm": 0.032062407582998276,
"learning_rate": 0.0002,
"loss": 0.5485578775405884,
"mean_token_accuracy": 0.7746371626853943,
"num_tokens": 5915592.0,
"step": 363
},
{
"entropy": 0.5442528575658798,
"epoch": 1.358878504672897,
"grad_norm": 0.030468052253127098,
"learning_rate": 0.0002,
"loss": 0.5427553653717041,
"mean_token_accuracy": 0.7785662263631821,
"num_tokens": 5931951.0,
"step": 364
},
{
"entropy": 0.5824908316135406,
"epoch": 1.3626168224299064,
"grad_norm": 0.037210624665021896,
"learning_rate": 0.0002,
"loss": 0.5697020292282104,
"mean_token_accuracy": 0.7692236304283142,
"num_tokens": 5948490.0,
"step": 365
},
{
"entropy": 0.5620522499084473,
"epoch": 1.3663551401869158,
"grad_norm": 0.0335218720138073,
"learning_rate": 0.0002,
"loss": 0.5542594194412231,
"mean_token_accuracy": 0.7753977477550507,
"num_tokens": 5964660.0,
"step": 366
},
{
"entropy": 0.5603572577238083,
"epoch": 1.3700934579439252,
"grad_norm": 0.031322672963142395,
"learning_rate": 0.0002,
"loss": 0.5575450658798218,
"mean_token_accuracy": 0.7735055536031723,
"num_tokens": 5981101.0,
"step": 367
},
{
"entropy": 0.5505388826131821,
"epoch": 1.3738317757009346,
"grad_norm": 0.030650589615106583,
"learning_rate": 0.0002,
"loss": 0.5557997822761536,
"mean_token_accuracy": 0.7740475237369537,
"num_tokens": 5997642.0,
"step": 368
},
{
"entropy": 0.5392187088727951,
"epoch": 1.377570093457944,
"grad_norm": 0.030460603535175323,
"learning_rate": 0.0002,
"loss": 0.5474120378494263,
"mean_token_accuracy": 0.7756936997175217,
"num_tokens": 6013826.0,
"step": 369
},
{
"entropy": 0.5465079843997955,
"epoch": 1.3813084112149534,
"grad_norm": 0.03873775899410248,
"learning_rate": 0.0002,
"loss": 0.5496590733528137,
"mean_token_accuracy": 0.7778041809797287,
"num_tokens": 6030111.0,
"step": 370
},
{
"entropy": 0.5502425879240036,
"epoch": 1.3850467289719626,
"grad_norm": 0.027835069224238396,
"learning_rate": 0.0002,
"loss": 0.5515455007553101,
"mean_token_accuracy": 0.7742271274328232,
"num_tokens": 6046613.0,
"step": 371
},
{
"entropy": 0.5496622025966644,
"epoch": 1.388785046728972,
"grad_norm": 0.02913137525320053,
"learning_rate": 0.0002,
"loss": 0.5523219108581543,
"mean_token_accuracy": 0.7767279595136642,
"num_tokens": 6062935.0,
"step": 372
},
{
"entropy": 0.5480591654777527,
"epoch": 1.3925233644859814,
"grad_norm": 0.028895994648337364,
"learning_rate": 0.0002,
"loss": 0.5464932918548584,
"mean_token_accuracy": 0.7779257446527481,
"num_tokens": 6079276.0,
"step": 373
},
{
"entropy": 0.5592564791440964,
"epoch": 1.3962616822429905,
"grad_norm": 0.030813386663794518,
"learning_rate": 0.0002,
"loss": 0.5641001462936401,
"mean_token_accuracy": 0.7706102132797241,
"num_tokens": 6095477.0,
"step": 374
},
{
"entropy": 0.5482244938611984,
"epoch": 1.4,
"grad_norm": 0.034681808203458786,
"learning_rate": 0.0002,
"loss": 0.5535820722579956,
"mean_token_accuracy": 0.7740350067615509,
"num_tokens": 6111503.0,
"step": 375
},
{
"entropy": 0.5437954962253571,
"epoch": 1.4037383177570093,
"grad_norm": 0.029899772256612778,
"learning_rate": 0.0002,
"loss": 0.5384761691093445,
"mean_token_accuracy": 0.7813697308301926,
"num_tokens": 6127666.0,
"step": 376
},
{
"entropy": 0.5516242235898972,
"epoch": 1.4074766355140187,
"grad_norm": 0.03098697029054165,
"learning_rate": 0.0002,
"loss": 0.5510317087173462,
"mean_token_accuracy": 0.7748206406831741,
"num_tokens": 6143974.0,
"step": 377
},
{
"entropy": 0.5456867665052414,
"epoch": 1.411214953271028,
"grad_norm": 0.03481059893965721,
"learning_rate": 0.0002,
"loss": 0.5417442917823792,
"mean_token_accuracy": 0.7805673629045486,
"num_tokens": 6160284.0,
"step": 378
},
{
"entropy": 0.5566543191671371,
"epoch": 1.4149532710280375,
"grad_norm": 0.03302835300564766,
"learning_rate": 0.0002,
"loss": 0.5596388578414917,
"mean_token_accuracy": 0.7757162600755692,
"num_tokens": 6176900.0,
"step": 379
},
{
"entropy": 0.5518665462732315,
"epoch": 1.4186915887850469,
"grad_norm": 0.042512837797403336,
"learning_rate": 0.0002,
"loss": 0.554313600063324,
"mean_token_accuracy": 0.7725758254528046,
"num_tokens": 6193295.0,
"step": 380
},
{
"entropy": 0.5387768298387527,
"epoch": 1.422429906542056,
"grad_norm": 0.031335704028606415,
"learning_rate": 0.0002,
"loss": 0.5456656813621521,
"mean_token_accuracy": 0.7767685800790787,
"num_tokens": 6209473.0,
"step": 381
},
{
"entropy": 0.552179217338562,
"epoch": 1.4261682242990654,
"grad_norm": 0.03560006618499756,
"learning_rate": 0.0002,
"loss": 0.5536052584648132,
"mean_token_accuracy": 0.7741381675004959,
"num_tokens": 6225795.0,
"step": 382
},
{
"entropy": 0.5529111623764038,
"epoch": 1.4299065420560748,
"grad_norm": 0.03298206627368927,
"learning_rate": 0.0002,
"loss": 0.5456759929656982,
"mean_token_accuracy": 0.7785012274980545,
"num_tokens": 6241738.0,
"step": 383
},
{
"entropy": 0.5528014451265335,
"epoch": 1.433644859813084,
"grad_norm": 0.02689899317920208,
"learning_rate": 0.0002,
"loss": 0.5489047765731812,
"mean_token_accuracy": 0.7755105197429657,
"num_tokens": 6258266.0,
"step": 384
},
{
"entropy": 0.5488691926002502,
"epoch": 1.4373831775700934,
"grad_norm": 0.03345772624015808,
"learning_rate": 0.0002,
"loss": 0.5473658442497253,
"mean_token_accuracy": 0.776367112994194,
"num_tokens": 6274629.0,
"step": 385
},
{
"entropy": 0.5326814502477646,
"epoch": 1.4411214953271028,
"grad_norm": 0.0327431820333004,
"learning_rate": 0.0002,
"loss": 0.5437192916870117,
"mean_token_accuracy": 0.7790791392326355,
"num_tokens": 6290843.0,
"step": 386
},
{
"entropy": 0.5463947802782059,
"epoch": 1.4448598130841122,
"grad_norm": 0.029317917302250862,
"learning_rate": 0.0002,
"loss": 0.5482510924339294,
"mean_token_accuracy": 0.7787915766239166,
"num_tokens": 6307390.0,
"step": 387
},
{
"entropy": 0.5279744416475296,
"epoch": 1.4485981308411215,
"grad_norm": 0.032164428383111954,
"learning_rate": 0.0002,
"loss": 0.5396722555160522,
"mean_token_accuracy": 0.7793098241090775,
"num_tokens": 6323780.0,
"step": 388
},
{
"entropy": 0.5401588678359985,
"epoch": 1.452336448598131,
"grad_norm": 0.029884206131100655,
"learning_rate": 0.0002,
"loss": 0.5457247495651245,
"mean_token_accuracy": 0.7772396057844162,
"num_tokens": 6340075.0,
"step": 389
},
{
"entropy": 0.5614192336797714,
"epoch": 1.45607476635514,
"grad_norm": 0.031751908361911774,
"learning_rate": 0.0002,
"loss": 0.5567028522491455,
"mean_token_accuracy": 0.7716124802827835,
"num_tokens": 6356186.0,
"step": 390
},
{
"entropy": 0.5345210433006287,
"epoch": 1.4598130841121495,
"grad_norm": 0.030872350558638573,
"learning_rate": 0.0002,
"loss": 0.5334336757659912,
"mean_token_accuracy": 0.7826623171567917,
"num_tokens": 6372159.0,
"step": 391
},
{
"entropy": 0.5622972398996353,
"epoch": 1.4635514018691589,
"grad_norm": 0.0314875952899456,
"learning_rate": 0.0002,
"loss": 0.5557999610900879,
"mean_token_accuracy": 0.7731751799583435,
"num_tokens": 6388490.0,
"step": 392
},
{
"entropy": 0.5456393212080002,
"epoch": 1.4672897196261683,
"grad_norm": 0.030306922271847725,
"learning_rate": 0.0002,
"loss": 0.5478385090827942,
"mean_token_accuracy": 0.7785396575927734,
"num_tokens": 6404875.0,
"step": 393
},
{
"entropy": 0.553615927696228,
"epoch": 1.4710280373831774,
"grad_norm": 0.03159041702747345,
"learning_rate": 0.0002,
"loss": 0.5525414347648621,
"mean_token_accuracy": 0.7762843668460846,
"num_tokens": 6421373.0,
"step": 394
},
{
"entropy": 0.54654960334301,
"epoch": 1.4747663551401868,
"grad_norm": 0.041343770921230316,
"learning_rate": 0.0002,
"loss": 0.5578322410583496,
"mean_token_accuracy": 0.7733658850193024,
"num_tokens": 6437609.0,
"step": 395
},
{
"entropy": 0.531049445271492,
"epoch": 1.4785046728971962,
"grad_norm": 0.029535705223679543,
"learning_rate": 0.0002,
"loss": 0.5336673855781555,
"mean_token_accuracy": 0.7787897735834122,
"num_tokens": 6453830.0,
"step": 396
},
{
"entropy": 0.5598567724227905,
"epoch": 1.4822429906542056,
"grad_norm": 0.030157895758748055,
"learning_rate": 0.0002,
"loss": 0.558460533618927,
"mean_token_accuracy": 0.7739997208118439,
"num_tokens": 6469831.0,
"step": 397
},
{
"entropy": 0.5455051362514496,
"epoch": 1.485981308411215,
"grad_norm": 0.02824362926185131,
"learning_rate": 0.0002,
"loss": 0.5309131145477295,
"mean_token_accuracy": 0.7840657532215118,
"num_tokens": 6485983.0,
"step": 398
},
{
"entropy": 0.5548417568206787,
"epoch": 1.4897196261682244,
"grad_norm": 0.028244182467460632,
"learning_rate": 0.0002,
"loss": 0.5448263883590698,
"mean_token_accuracy": 0.7788312286138535,
"num_tokens": 6502375.0,
"step": 399
},
{
"entropy": 0.5614428222179413,
"epoch": 1.4934579439252336,
"grad_norm": 0.029092902317643166,
"learning_rate": 0.0002,
"loss": 0.5640357732772827,
"mean_token_accuracy": 0.7694920003414154,
"num_tokens": 6518515.0,
"step": 400
},
{
"entropy": 0.5202381461858749,
"epoch": 1.497196261682243,
"grad_norm": 0.0347515270113945,
"learning_rate": 0.0002,
"loss": 0.5334154963493347,
"mean_token_accuracy": 0.7812663912773132,
"num_tokens": 6534874.0,
"step": 401
},
{
"entropy": 0.5337788164615631,
"epoch": 1.5009345794392523,
"grad_norm": 0.036383189260959625,
"learning_rate": 0.0002,
"loss": 0.5497745871543884,
"mean_token_accuracy": 0.778416782617569,
"num_tokens": 6551531.0,
"step": 402
},
{
"entropy": 0.5441624820232391,
"epoch": 1.5046728971962615,
"grad_norm": 0.029430663213133812,
"learning_rate": 0.0002,
"loss": 0.5452989935874939,
"mean_token_accuracy": 0.7810618728399277,
"num_tokens": 6568009.0,
"step": 403
},
{
"entropy": 0.5418661385774612,
"epoch": 1.508411214953271,
"grad_norm": 0.030562201514840126,
"learning_rate": 0.0002,
"loss": 0.5342137813568115,
"mean_token_accuracy": 0.7829063683748245,
"num_tokens": 6584207.0,
"step": 404
},
{
"entropy": 0.5485459864139557,
"epoch": 1.5121495327102803,
"grad_norm": 0.03423624485731125,
"learning_rate": 0.0002,
"loss": 0.5410490036010742,
"mean_token_accuracy": 0.7787354588508606,
"num_tokens": 6600370.0,
"step": 405
},
{
"entropy": 0.5426456183195114,
"epoch": 1.5158878504672897,
"grad_norm": 0.02885623089969158,
"learning_rate": 0.0002,
"loss": 0.5436002612113953,
"mean_token_accuracy": 0.7796245515346527,
"num_tokens": 6616756.0,
"step": 406
},
{
"entropy": 0.5356003642082214,
"epoch": 1.519626168224299,
"grad_norm": 0.03115919418632984,
"learning_rate": 0.0002,
"loss": 0.5386699438095093,
"mean_token_accuracy": 0.7803057432174683,
"num_tokens": 6632844.0,
"step": 407
},
{
"entropy": 0.5387707352638245,
"epoch": 1.5233644859813085,
"grad_norm": 0.039791349321603775,
"learning_rate": 0.0002,
"loss": 0.5529868006706238,
"mean_token_accuracy": 0.7759213447570801,
"num_tokens": 6649378.0,
"step": 408
},
{
"entropy": 0.5559847801923752,
"epoch": 1.5271028037383179,
"grad_norm": 0.02880096808075905,
"learning_rate": 0.0002,
"loss": 0.5526622533798218,
"mean_token_accuracy": 0.7757584452629089,
"num_tokens": 6665680.0,
"step": 409
},
{
"entropy": 0.5568434447050095,
"epoch": 1.5308411214953273,
"grad_norm": 0.03131592646241188,
"learning_rate": 0.0002,
"loss": 0.5511536002159119,
"mean_token_accuracy": 0.7751762717962265,
"num_tokens": 6682037.0,
"step": 410
},
{
"entropy": 0.5535785406827927,
"epoch": 1.5345794392523364,
"grad_norm": 0.027654770761728287,
"learning_rate": 0.0002,
"loss": 0.5505651831626892,
"mean_token_accuracy": 0.7777209877967834,
"num_tokens": 6698293.0,
"step": 411
},
{
"entropy": 0.5670723766088486,
"epoch": 1.5383177570093458,
"grad_norm": 0.028583014383912086,
"learning_rate": 0.0002,
"loss": 0.562312662601471,
"mean_token_accuracy": 0.7695807963609695,
"num_tokens": 6714701.0,
"step": 412
},
{
"entropy": 0.5622154772281647,
"epoch": 1.542056074766355,
"grad_norm": 0.02976270206272602,
"learning_rate": 0.0002,
"loss": 0.5625367164611816,
"mean_token_accuracy": 0.7716499269008636,
"num_tokens": 6731185.0,
"step": 413
},
{
"entropy": 0.5430750995874405,
"epoch": 1.5457943925233644,
"grad_norm": 0.033997952938079834,
"learning_rate": 0.0002,
"loss": 0.5533574819564819,
"mean_token_accuracy": 0.7739907056093216,
"num_tokens": 6747611.0,
"step": 414
},
{
"entropy": 0.5383965522050858,
"epoch": 1.5495327102803738,
"grad_norm": 0.030417680740356445,
"learning_rate": 0.0002,
"loss": 0.5392584204673767,
"mean_token_accuracy": 0.781003326177597,
"num_tokens": 6764041.0,
"step": 415
},
{
"entropy": 0.5423173159360886,
"epoch": 1.5532710280373832,
"grad_norm": 0.03076282888650894,
"learning_rate": 0.0002,
"loss": 0.5466949343681335,
"mean_token_accuracy": 0.7772891670465469,
"num_tokens": 6780355.0,
"step": 416
},
{
"entropy": 0.5329848676919937,
"epoch": 1.5570093457943925,
"grad_norm": 0.031416404992341995,
"learning_rate": 0.0002,
"loss": 0.5372002720832825,
"mean_token_accuracy": 0.7831790894269943,
"num_tokens": 6796818.0,
"step": 417
},
{
"entropy": 0.5694616734981537,
"epoch": 1.560747663551402,
"grad_norm": 0.03140864148736,
"learning_rate": 0.0002,
"loss": 0.5736896395683289,
"mean_token_accuracy": 0.7680276483297348,
"num_tokens": 6813313.0,
"step": 418
},
{
"entropy": 0.5422861874103546,
"epoch": 1.5644859813084113,
"grad_norm": 0.029503118246793747,
"learning_rate": 0.0002,
"loss": 0.5412414073944092,
"mean_token_accuracy": 0.7787739634513855,
"num_tokens": 6829806.0,
"step": 419
},
{
"entropy": 0.5583456158638,
"epoch": 1.5682242990654207,
"grad_norm": 0.02907589264214039,
"learning_rate": 0.0002,
"loss": 0.5538471937179565,
"mean_token_accuracy": 0.7733865231275558,
"num_tokens": 6846001.0,
"step": 420
},
{
"entropy": 0.541300505399704,
"epoch": 1.5719626168224299,
"grad_norm": 0.030364159494638443,
"learning_rate": 0.0002,
"loss": 0.5440077781677246,
"mean_token_accuracy": 0.7778935730457306,
"num_tokens": 6862199.0,
"step": 421
},
{
"entropy": 0.5432893335819244,
"epoch": 1.5757009345794393,
"grad_norm": 0.030575595796108246,
"learning_rate": 0.0002,
"loss": 0.5458940267562866,
"mean_token_accuracy": 0.7759649753570557,
"num_tokens": 6878579.0,
"step": 422
},
{
"entropy": 0.5597539693117142,
"epoch": 1.5794392523364484,
"grad_norm": 0.03023570403456688,
"learning_rate": 0.0002,
"loss": 0.5611036419868469,
"mean_token_accuracy": 0.771359458565712,
"num_tokens": 6895118.0,
"step": 423
},
{
"entropy": 0.5647385269403458,
"epoch": 1.5831775700934578,
"grad_norm": 0.03682006523013115,
"learning_rate": 0.0002,
"loss": 0.5706467032432556,
"mean_token_accuracy": 0.7648251056671143,
"num_tokens": 6911258.0,
"step": 424
},
{
"entropy": 0.5421442538499832,
"epoch": 1.5869158878504672,
"grad_norm": 0.02758963778614998,
"learning_rate": 0.0002,
"loss": 0.540165364742279,
"mean_token_accuracy": 0.7803500890731812,
"num_tokens": 6927685.0,
"step": 425
},
{
"entropy": 0.529248058795929,
"epoch": 1.5906542056074766,
"grad_norm": 0.03153234347701073,
"learning_rate": 0.0002,
"loss": 0.5238373875617981,
"mean_token_accuracy": 0.7865803390741348,
"num_tokens": 6944032.0,
"step": 426
},
{
"entropy": 0.575338825583458,
"epoch": 1.594392523364486,
"grad_norm": 0.038368549197912216,
"learning_rate": 0.0002,
"loss": 0.5686851739883423,
"mean_token_accuracy": 0.7687085419893265,
"num_tokens": 6960292.0,
"step": 427
},
{
"entropy": 0.5576592534780502,
"epoch": 1.5981308411214954,
"grad_norm": 0.028228625655174255,
"learning_rate": 0.0002,
"loss": 0.5487405061721802,
"mean_token_accuracy": 0.7753542214632034,
"num_tokens": 6976714.0,
"step": 428
},
{
"entropy": 0.5344701558351517,
"epoch": 1.6018691588785048,
"grad_norm": 0.04058045893907547,
"learning_rate": 0.0002,
"loss": 0.5446043014526367,
"mean_token_accuracy": 0.7796988487243652,
"num_tokens": 6993050.0,
"step": 429
},
{
"entropy": 0.5357878655195236,
"epoch": 1.6056074766355142,
"grad_norm": 0.03584378957748413,
"learning_rate": 0.0002,
"loss": 0.5503512620925903,
"mean_token_accuracy": 0.7766520529985428,
"num_tokens": 7009209.0,
"step": 430
},
{
"entropy": 0.5416888147592545,
"epoch": 1.6093457943925233,
"grad_norm": 0.035834796726703644,
"learning_rate": 0.0002,
"loss": 0.5537422895431519,
"mean_token_accuracy": 0.7721364051103592,
"num_tokens": 7025449.0,
"step": 431
},
{
"entropy": 0.5495986640453339,
"epoch": 1.6130841121495327,
"grad_norm": 0.032027650624513626,
"learning_rate": 0.0002,
"loss": 0.5545753836631775,
"mean_token_accuracy": 0.7711912095546722,
"num_tokens": 7041746.0,
"step": 432
},
{
"entropy": 0.545868456363678,
"epoch": 1.616822429906542,
"grad_norm": 0.03172159940004349,
"learning_rate": 0.0002,
"loss": 0.5401636958122253,
"mean_token_accuracy": 0.7796500027179718,
"num_tokens": 7057795.0,
"step": 433
},
{
"entropy": 0.5575663447380066,
"epoch": 1.6205607476635513,
"grad_norm": 0.033373311161994934,
"learning_rate": 0.0002,
"loss": 0.5508802533149719,
"mean_token_accuracy": 0.776265561580658,
"num_tokens": 7074106.0,
"step": 434
},
{
"entropy": 0.552743598818779,
"epoch": 1.6242990654205607,
"grad_norm": 0.028903203085064888,
"learning_rate": 0.0002,
"loss": 0.5493654012680054,
"mean_token_accuracy": 0.7769621759653091,
"num_tokens": 7090537.0,
"step": 435
},
{
"entropy": 0.5319768935441971,
"epoch": 1.62803738317757,
"grad_norm": 0.034539636224508286,
"learning_rate": 0.0002,
"loss": 0.5467936396598816,
"mean_token_accuracy": 0.7773739099502563,
"num_tokens": 7106864.0,
"step": 436
},
{
"entropy": 0.5451867878437042,
"epoch": 1.6317757009345795,
"grad_norm": 0.03423994407057762,
"learning_rate": 0.0002,
"loss": 0.5547507405281067,
"mean_token_accuracy": 0.7716930210590363,
"num_tokens": 7123027.0,
"step": 437
},
{
"entropy": 0.5614334046840668,
"epoch": 1.6355140186915889,
"grad_norm": 0.030570637434720993,
"learning_rate": 0.0002,
"loss": 0.5614769458770752,
"mean_token_accuracy": 0.772892951965332,
"num_tokens": 7139089.0,
"step": 438
},
{
"entropy": 0.5780467242002487,
"epoch": 1.6392523364485982,
"grad_norm": 0.028702719137072563,
"learning_rate": 0.0002,
"loss": 0.5703617334365845,
"mean_token_accuracy": 0.7703514397144318,
"num_tokens": 7155613.0,
"step": 439
},
{
"entropy": 0.5620117634534836,
"epoch": 1.6429906542056076,
"grad_norm": 0.032911110669374466,
"learning_rate": 0.0002,
"loss": 0.5519667863845825,
"mean_token_accuracy": 0.776491329073906,
"num_tokens": 7171940.0,
"step": 440
},
{
"entropy": 0.5613545030355453,
"epoch": 1.6467289719626168,
"grad_norm": 0.02767273783683777,
"learning_rate": 0.0002,
"loss": 0.5548912286758423,
"mean_token_accuracy": 0.7774568051099777,
"num_tokens": 7188459.0,
"step": 441
},
{
"entropy": 0.5349740386009216,
"epoch": 1.6504672897196262,
"grad_norm": 0.03398311510682106,
"learning_rate": 0.0002,
"loss": 0.5359267592430115,
"mean_token_accuracy": 0.7792400866746902,
"num_tokens": 7204742.0,
"step": 442
},
{
"entropy": 0.5435358434915543,
"epoch": 1.6542056074766354,
"grad_norm": 0.03121669590473175,
"learning_rate": 0.0002,
"loss": 0.5480291247367859,
"mean_token_accuracy": 0.7757425308227539,
"num_tokens": 7220970.0,
"step": 443
},
{
"entropy": 0.5408525168895721,
"epoch": 1.6579439252336448,
"grad_norm": 0.03187638521194458,
"learning_rate": 0.0002,
"loss": 0.5458962321281433,
"mean_token_accuracy": 0.7777377218008041,
"num_tokens": 7237303.0,
"step": 444
},
{
"entropy": 0.5296604186296463,
"epoch": 1.6616822429906541,
"grad_norm": 0.033922888338565826,
"learning_rate": 0.0002,
"loss": 0.5350003242492676,
"mean_token_accuracy": 0.7817184776067734,
"num_tokens": 7253313.0,
"step": 445
},
{
"entropy": 0.5386542528867722,
"epoch": 1.6654205607476635,
"grad_norm": 0.03487584367394447,
"learning_rate": 0.0002,
"loss": 0.5504403710365295,
"mean_token_accuracy": 0.7764954715967178,
"num_tokens": 7269689.0,
"step": 446
},
{
"entropy": 0.5447485446929932,
"epoch": 1.669158878504673,
"grad_norm": 0.028691545128822327,
"learning_rate": 0.0002,
"loss": 0.5440992712974548,
"mean_token_accuracy": 0.7813538759946823,
"num_tokens": 7286072.0,
"step": 447
},
{
"entropy": 0.5479656606912613,
"epoch": 1.6728971962616823,
"grad_norm": 0.02881709486246109,
"learning_rate": 0.0002,
"loss": 0.5415880084037781,
"mean_token_accuracy": 0.7795199900865555,
"num_tokens": 7302255.0,
"step": 448
},
{
"entropy": 0.5570111870765686,
"epoch": 1.6766355140186917,
"grad_norm": 0.028915997594594955,
"learning_rate": 0.0002,
"loss": 0.5533952713012695,
"mean_token_accuracy": 0.7753083109855652,
"num_tokens": 7318517.0,
"step": 449
},
{
"entropy": 0.5548125952482224,
"epoch": 1.680373831775701,
"grad_norm": 0.029765961691737175,
"learning_rate": 0.0002,
"loss": 0.5539486408233643,
"mean_token_accuracy": 0.7759220153093338,
"num_tokens": 7334708.0,
"step": 450
},
{
"entropy": 0.5474168807268143,
"epoch": 1.6841121495327103,
"grad_norm": 0.028495540842413902,
"learning_rate": 0.0002,
"loss": 0.542155921459198,
"mean_token_accuracy": 0.7810131311416626,
"num_tokens": 7351081.0,
"step": 451
},
{
"entropy": 0.5660932809114456,
"epoch": 1.6878504672897197,
"grad_norm": 0.029109494760632515,
"learning_rate": 0.0002,
"loss": 0.5608826279640198,
"mean_token_accuracy": 0.7715775072574615,
"num_tokens": 7367731.0,
"step": 452
},
{
"entropy": 0.5341303050518036,
"epoch": 1.6915887850467288,
"grad_norm": 0.0320415273308754,
"learning_rate": 0.0002,
"loss": 0.5458233952522278,
"mean_token_accuracy": 0.7763672173023224,
"num_tokens": 7383855.0,
"step": 453
},
{
"entropy": 0.5321396738290787,
"epoch": 1.6953271028037382,
"grad_norm": 0.02727021649479866,
"learning_rate": 0.0002,
"loss": 0.5336453318595886,
"mean_token_accuracy": 0.7841753661632538,
"num_tokens": 7400413.0,
"step": 454
},
{
"entropy": 0.5274764150381088,
"epoch": 1.6990654205607476,
"grad_norm": 0.03324299305677414,
"learning_rate": 0.0002,
"loss": 0.5358706116676331,
"mean_token_accuracy": 0.7782862633466721,
"num_tokens": 7416652.0,
"step": 455
},
{
"entropy": 0.5659113973379135,
"epoch": 1.702803738317757,
"grad_norm": 0.02792423591017723,
"learning_rate": 0.0002,
"loss": 0.5652596354484558,
"mean_token_accuracy": 0.7699151486158371,
"num_tokens": 7433182.0,
"step": 456
},
{
"entropy": 0.5379252284765244,
"epoch": 1.7065420560747664,
"grad_norm": 0.029364224523305893,
"learning_rate": 0.0002,
"loss": 0.5403070449829102,
"mean_token_accuracy": 0.780923143029213,
"num_tokens": 7449489.0,
"step": 457
},
{
"entropy": 0.5333061218261719,
"epoch": 1.7102803738317758,
"grad_norm": 0.03605153039097786,
"learning_rate": 0.0002,
"loss": 0.5397148728370667,
"mean_token_accuracy": 0.7807264924049377,
"num_tokens": 7465639.0,
"step": 458
},
{
"entropy": 0.5705498605966568,
"epoch": 1.7140186915887852,
"grad_norm": 0.03089967370033264,
"learning_rate": 0.0002,
"loss": 0.5634230375289917,
"mean_token_accuracy": 0.770861804485321,
"num_tokens": 7482026.0,
"step": 459
},
{
"entropy": 0.5468743443489075,
"epoch": 1.7177570093457943,
"grad_norm": 0.030453559011220932,
"learning_rate": 0.0002,
"loss": 0.545179545879364,
"mean_token_accuracy": 0.7774305045604706,
"num_tokens": 7498135.0,
"step": 460
},
{
"entropy": 0.5617033839225769,
"epoch": 1.7214953271028037,
"grad_norm": 0.03324849158525467,
"learning_rate": 0.0002,
"loss": 0.5638455748558044,
"mean_token_accuracy": 0.7687248736619949,
"num_tokens": 7514525.0,
"step": 461
},
{
"entropy": 0.5581229031085968,
"epoch": 1.7252336448598131,
"grad_norm": 0.03176411613821983,
"learning_rate": 0.0002,
"loss": 0.5653245449066162,
"mean_token_accuracy": 0.7685625553131104,
"num_tokens": 7530775.0,
"step": 462
},
{
"entropy": 0.5476332157850266,
"epoch": 1.7289719626168223,
"grad_norm": 0.02840348146855831,
"learning_rate": 0.0002,
"loss": 0.5459728240966797,
"mean_token_accuracy": 0.7803480625152588,
"num_tokens": 7547133.0,
"step": 463
},
{
"entropy": 0.5295307040214539,
"epoch": 1.7327102803738317,
"grad_norm": 0.03073256090283394,
"learning_rate": 0.0002,
"loss": 0.5271958708763123,
"mean_token_accuracy": 0.7856812626123428,
"num_tokens": 7563202.0,
"step": 464
},
{
"entropy": 0.5600748807191849,
"epoch": 1.736448598130841,
"grad_norm": 0.02645997144281864,
"learning_rate": 0.0002,
"loss": 0.5613283514976501,
"mean_token_accuracy": 0.7728501409292221,
"num_tokens": 7579316.0,
"step": 465
},
{
"entropy": 0.5520564913749695,
"epoch": 1.7401869158878505,
"grad_norm": 0.03572427108883858,
"learning_rate": 0.0002,
"loss": 0.5537987947463989,
"mean_token_accuracy": 0.7724860310554504,
"num_tokens": 7595641.0,
"step": 466
},
{
"entropy": 0.5529971420764923,
"epoch": 1.7439252336448599,
"grad_norm": 0.03125125169754028,
"learning_rate": 0.0002,
"loss": 0.5582661628723145,
"mean_token_accuracy": 0.7737809270620346,
"num_tokens": 7611643.0,
"step": 467
},
{
"entropy": 0.5647894889116287,
"epoch": 1.7476635514018692,
"grad_norm": 0.029365174472332,
"learning_rate": 0.0002,
"loss": 0.5628995895385742,
"mean_token_accuracy": 0.770697221159935,
"num_tokens": 7628011.0,
"step": 468
},
{
"entropy": 0.554974377155304,
"epoch": 1.7514018691588786,
"grad_norm": 0.03162689507007599,
"learning_rate": 0.0002,
"loss": 0.5540342330932617,
"mean_token_accuracy": 0.7753277122974396,
"num_tokens": 7644033.0,
"step": 469
},
{
"entropy": 0.5500662177801132,
"epoch": 1.7551401869158878,
"grad_norm": 0.03005298413336277,
"learning_rate": 0.0002,
"loss": 0.5444310307502747,
"mean_token_accuracy": 0.7801364362239838,
"num_tokens": 7660280.0,
"step": 470
},
{
"entropy": 0.5447323620319366,
"epoch": 1.7588785046728972,
"grad_norm": 0.03137346729636192,
"learning_rate": 0.0002,
"loss": 0.5573670864105225,
"mean_token_accuracy": 0.7713485956192017,
"num_tokens": 7676463.0,
"step": 471
},
{
"entropy": 0.5369779318571091,
"epoch": 1.7626168224299066,
"grad_norm": 0.03314938396215439,
"learning_rate": 0.0002,
"loss": 0.5444561839103699,
"mean_token_accuracy": 0.7770639657974243,
"num_tokens": 7692602.0,
"step": 472
},
{
"entropy": 0.5475834012031555,
"epoch": 1.7663551401869158,
"grad_norm": 0.02887626923620701,
"learning_rate": 0.0002,
"loss": 0.548475980758667,
"mean_token_accuracy": 0.7783610373735428,
"num_tokens": 7708846.0,
"step": 473
},
{
"entropy": 0.5512323975563049,
"epoch": 1.7700934579439251,
"grad_norm": 0.029940130189061165,
"learning_rate": 0.0002,
"loss": 0.5473303198814392,
"mean_token_accuracy": 0.7762128710746765,
"num_tokens": 7725069.0,
"step": 474
},
{
"entropy": 0.553005576133728,
"epoch": 1.7738317757009345,
"grad_norm": 0.030464377254247665,
"learning_rate": 0.0002,
"loss": 0.5503718852996826,
"mean_token_accuracy": 0.774563655257225,
"num_tokens": 7741245.0,
"step": 475
},
{
"entropy": 0.5530129075050354,
"epoch": 1.777570093457944,
"grad_norm": 0.03166594356298447,
"learning_rate": 0.0002,
"loss": 0.5523677468299866,
"mean_token_accuracy": 0.7772203087806702,
"num_tokens": 7757438.0,
"step": 476
},
{
"entropy": 0.5589546114206314,
"epoch": 1.7813084112149533,
"grad_norm": 0.031029848381876945,
"learning_rate": 0.0002,
"loss": 0.562568724155426,
"mean_token_accuracy": 0.7697692364454269,
"num_tokens": 7773613.0,
"step": 477
},
{
"entropy": 0.5485216081142426,
"epoch": 1.7850467289719627,
"grad_norm": 0.03148766979575157,
"learning_rate": 0.0002,
"loss": 0.5566563010215759,
"mean_token_accuracy": 0.7735153138637543,
"num_tokens": 7790250.0,
"step": 478
},
{
"entropy": 0.5454483330249786,
"epoch": 1.788785046728972,
"grad_norm": 0.02934390679001808,
"learning_rate": 0.0002,
"loss": 0.5470514297485352,
"mean_token_accuracy": 0.777851864695549,
"num_tokens": 7806794.0,
"step": 479
},
{
"entropy": 0.5577091723680496,
"epoch": 1.7925233644859813,
"grad_norm": 0.032060954719781876,
"learning_rate": 0.0002,
"loss": 0.5573920011520386,
"mean_token_accuracy": 0.7715256214141846,
"num_tokens": 7823378.0,
"step": 480
},
{
"entropy": 0.5442305952310562,
"epoch": 1.7962616822429907,
"grad_norm": 0.027305442839860916,
"learning_rate": 0.0002,
"loss": 0.5404268503189087,
"mean_token_accuracy": 0.7780007869005203,
"num_tokens": 7839749.0,
"step": 481
},
{
"entropy": 0.5555779784917831,
"epoch": 1.8,
"grad_norm": 0.03287232294678688,
"learning_rate": 0.0002,
"loss": 0.5462092161178589,
"mean_token_accuracy": 0.7763689607381821,
"num_tokens": 7855947.0,
"step": 482
},
{
"entropy": 0.5372089967131615,
"epoch": 1.8037383177570092,
"grad_norm": 0.031652286648750305,
"learning_rate": 0.0002,
"loss": 0.5363561511039734,
"mean_token_accuracy": 0.7853012979030609,
"num_tokens": 7872142.0,
"step": 483
},
{
"entropy": 0.5340928807854652,
"epoch": 1.8074766355140186,
"grad_norm": 0.031619228422641754,
"learning_rate": 0.0002,
"loss": 0.5403937697410583,
"mean_token_accuracy": 0.7826676219701767,
"num_tokens": 7888470.0,
"step": 484
},
{
"entropy": 0.5592721551656723,
"epoch": 1.811214953271028,
"grad_norm": 0.03946106135845184,
"learning_rate": 0.0002,
"loss": 0.5722806453704834,
"mean_token_accuracy": 0.7665584683418274,
"num_tokens": 7904942.0,
"step": 485
},
{
"entropy": 0.5392829775810242,
"epoch": 1.8149532710280374,
"grad_norm": 0.04261912405490875,
"learning_rate": 0.0002,
"loss": 0.5484760999679565,
"mean_token_accuracy": 0.7759799510240555,
"num_tokens": 7921095.0,
"step": 486
},
{
"entropy": 0.5537964701652527,
"epoch": 1.8186915887850468,
"grad_norm": 0.029489269480109215,
"learning_rate": 0.0002,
"loss": 0.5515441298484802,
"mean_token_accuracy": 0.7770739942789078,
"num_tokens": 7937493.0,
"step": 487
},
{
"entropy": 0.5820317566394806,
"epoch": 1.8224299065420562,
"grad_norm": 0.032789647579193115,
"learning_rate": 0.0002,
"loss": 0.5696999430656433,
"mean_token_accuracy": 0.766129344701767,
"num_tokens": 7953872.0,
"step": 488
},
{
"entropy": 0.5591157823801041,
"epoch": 1.8261682242990656,
"grad_norm": 0.03274792060256004,
"learning_rate": 0.0002,
"loss": 0.5492164492607117,
"mean_token_accuracy": 0.7776104360818863,
"num_tokens": 7970399.0,
"step": 489
},
{
"entropy": 0.5613900125026703,
"epoch": 1.8299065420560747,
"grad_norm": 0.03268195316195488,
"learning_rate": 0.0002,
"loss": 0.5613545179367065,
"mean_token_accuracy": 0.7726269513368607,
"num_tokens": 7986663.0,
"step": 490
},
{
"entropy": 0.540773555636406,
"epoch": 1.8336448598130841,
"grad_norm": 0.031849462538957596,
"learning_rate": 0.0002,
"loss": 0.5427927374839783,
"mean_token_accuracy": 0.7795483022928238,
"num_tokens": 8002949.0,
"step": 491
},
{
"entropy": 0.5281448513269424,
"epoch": 1.8373831775700935,
"grad_norm": 0.037760283797979355,
"learning_rate": 0.0002,
"loss": 0.5398802161216736,
"mean_token_accuracy": 0.7793932110071182,
"num_tokens": 8018924.0,
"step": 492
},
{
"entropy": 0.5640152990818024,
"epoch": 1.8411214953271027,
"grad_norm": 0.03318220004439354,
"learning_rate": 0.0002,
"loss": 0.5651699900627136,
"mean_token_accuracy": 0.7711258381605148,
"num_tokens": 8035544.0,
"step": 493
},
{
"entropy": 0.5498005002737045,
"epoch": 1.844859813084112,
"grad_norm": 0.0300876684486866,
"learning_rate": 0.0002,
"loss": 0.5483426451683044,
"mean_token_accuracy": 0.777212604880333,
"num_tokens": 8051604.0,
"step": 494
},
{
"entropy": 0.5553054213523865,
"epoch": 1.8485981308411215,
"grad_norm": 0.03142329677939415,
"learning_rate": 0.0002,
"loss": 0.5571571588516235,
"mean_token_accuracy": 0.7740218490362167,
"num_tokens": 8067812.0,
"step": 495
},
{
"entropy": 0.5580199360847473,
"epoch": 1.8523364485981308,
"grad_norm": 0.03293558582663536,
"learning_rate": 0.0002,
"loss": 0.5583306550979614,
"mean_token_accuracy": 0.7746147364377975,
"num_tokens": 8083966.0,
"step": 496
},
{
"entropy": 0.5503615736961365,
"epoch": 1.8560747663551402,
"grad_norm": 0.031184855848550797,
"learning_rate": 0.0002,
"loss": 0.5509845614433289,
"mean_token_accuracy": 0.7762554883956909,
"num_tokens": 8100276.0,
"step": 497
},
{
"entropy": 0.5609902739524841,
"epoch": 1.8598130841121496,
"grad_norm": 0.03478863090276718,
"learning_rate": 0.0002,
"loss": 0.5611089468002319,
"mean_token_accuracy": 0.7710845172405243,
"num_tokens": 8116579.0,
"step": 498
},
{
"entropy": 0.5358163863420486,
"epoch": 1.863551401869159,
"grad_norm": 0.03343072161078453,
"learning_rate": 0.0002,
"loss": 0.5352976322174072,
"mean_token_accuracy": 0.7815191894769669,
"num_tokens": 8132938.0,
"step": 499
},
{
"entropy": 0.5323279201984406,
"epoch": 1.8672897196261682,
"grad_norm": 0.030239535495638847,
"learning_rate": 0.0002,
"loss": 0.5383006930351257,
"mean_token_accuracy": 0.7808633744716644,
"num_tokens": 8149182.0,
"step": 500
},
{
"entropy": 0.557625338435173,
"epoch": 1.8710280373831776,
"grad_norm": 0.031314413994550705,
"learning_rate": 0.0002,
"loss": 0.5607120990753174,
"mean_token_accuracy": 0.7726259678602219,
"num_tokens": 8165713.0,
"step": 501
},
{
"entropy": 0.5501556247472763,
"epoch": 1.874766355140187,
"grad_norm": 0.029330939054489136,
"learning_rate": 0.0002,
"loss": 0.5527728796005249,
"mean_token_accuracy": 0.7722220122814178,
"num_tokens": 8182157.0,
"step": 502
},
{
"entropy": 0.5571380257606506,
"epoch": 1.8785046728971961,
"grad_norm": 0.027965383604168892,
"learning_rate": 0.0002,
"loss": 0.5537632703781128,
"mean_token_accuracy": 0.7755916863679886,
"num_tokens": 8198641.0,
"step": 503
},
{
"entropy": 0.5457630455493927,
"epoch": 1.8822429906542055,
"grad_norm": 0.030688611790537834,
"learning_rate": 0.0002,
"loss": 0.5442954897880554,
"mean_token_accuracy": 0.7765072137117386,
"num_tokens": 8214799.0,
"step": 504
},
{
"entropy": 0.5432839095592499,
"epoch": 1.885981308411215,
"grad_norm": 0.0319070965051651,
"learning_rate": 0.0002,
"loss": 0.5535275936126709,
"mean_token_accuracy": 0.7709672451019287,
"num_tokens": 8230973.0,
"step": 505
},
{
"entropy": 0.5594919174909592,
"epoch": 1.8897196261682243,
"grad_norm": 0.04258793592453003,
"learning_rate": 0.0002,
"loss": 0.5607203841209412,
"mean_token_accuracy": 0.7712259739637375,
"num_tokens": 8247156.0,
"step": 506
},
{
"entropy": 0.5589391887187958,
"epoch": 1.8934579439252337,
"grad_norm": 0.033864762634038925,
"learning_rate": 0.0002,
"loss": 0.5650033950805664,
"mean_token_accuracy": 0.7718524932861328,
"num_tokens": 8263441.0,
"step": 507
},
{
"entropy": 0.5569577813148499,
"epoch": 1.897196261682243,
"grad_norm": 0.03338006138801575,
"learning_rate": 0.0002,
"loss": 0.5555600523948669,
"mean_token_accuracy": 0.7759018540382385,
"num_tokens": 8279848.0,
"step": 508
},
{
"entropy": 0.5524785667657852,
"epoch": 1.9009345794392525,
"grad_norm": 0.034291088581085205,
"learning_rate": 0.0002,
"loss": 0.554389238357544,
"mean_token_accuracy": 0.7732797265052795,
"num_tokens": 8296286.0,
"step": 509
},
{
"entropy": 0.5341912508010864,
"epoch": 1.9046728971962616,
"grad_norm": 0.03332460671663284,
"learning_rate": 0.0002,
"loss": 0.5296705365180969,
"mean_token_accuracy": 0.7850336581468582,
"num_tokens": 8312462.0,
"step": 510
},
{
"entropy": 0.5374017357826233,
"epoch": 1.908411214953271,
"grad_norm": 0.029762303456664085,
"learning_rate": 0.0002,
"loss": 0.5377117395401001,
"mean_token_accuracy": 0.7782561480998993,
"num_tokens": 8328514.0,
"step": 511
},
{
"entropy": 0.5621481090784073,
"epoch": 1.9121495327102802,
"grad_norm": 0.02770383097231388,
"learning_rate": 0.0002,
"loss": 0.556929349899292,
"mean_token_accuracy": 0.7750183939933777,
"num_tokens": 8345018.0,
"step": 512
},
{
"entropy": 0.5308145210146904,
"epoch": 1.9158878504672896,
"grad_norm": 0.031799450516700745,
"learning_rate": 0.0002,
"loss": 0.5367879867553711,
"mean_token_accuracy": 0.7811458259820938,
"num_tokens": 8361450.0,
"step": 513
},
{
"entropy": 0.5505598485469818,
"epoch": 1.919626168224299,
"grad_norm": 0.030035199597477913,
"learning_rate": 0.0002,
"loss": 0.55583256483078,
"mean_token_accuracy": 0.7735087871551514,
"num_tokens": 8378205.0,
"step": 514
},
{
"entropy": 0.5498997569084167,
"epoch": 1.9233644859813084,
"grad_norm": 0.031478267163038254,
"learning_rate": 0.0002,
"loss": 0.554360568523407,
"mean_token_accuracy": 0.7755851894617081,
"num_tokens": 8394730.0,
"step": 515
},
{
"entropy": 0.5447141826152802,
"epoch": 1.9271028037383178,
"grad_norm": 0.034256696701049805,
"learning_rate": 0.0002,
"loss": 0.5524182915687561,
"mean_token_accuracy": 0.7743232250213623,
"num_tokens": 8410799.0,
"step": 516
},
{
"entropy": 0.5548212677240372,
"epoch": 1.9308411214953272,
"grad_norm": 0.0296107679605484,
"learning_rate": 0.0002,
"loss": 0.5498183965682983,
"mean_token_accuracy": 0.7740313857793808,
"num_tokens": 8427372.0,
"step": 517
},
{
"entropy": 0.5684213787317276,
"epoch": 1.9345794392523366,
"grad_norm": 0.03422481194138527,
"learning_rate": 0.0002,
"loss": 0.5559389591217041,
"mean_token_accuracy": 0.7754881531000137,
"num_tokens": 8443822.0,
"step": 518
},
{
"entropy": 0.5545912981033325,
"epoch": 1.938317757009346,
"grad_norm": 0.031684234738349915,
"learning_rate": 0.0002,
"loss": 0.5498573780059814,
"mean_token_accuracy": 0.7783227860927582,
"num_tokens": 8460032.0,
"step": 519
},
{
"entropy": 0.5595797300338745,
"epoch": 1.9420560747663551,
"grad_norm": 0.02719406597316265,
"learning_rate": 0.0002,
"loss": 0.5614221096038818,
"mean_token_accuracy": 0.7715103030204773,
"num_tokens": 8476297.0,
"step": 520
},
{
"entropy": 0.5345963835716248,
"epoch": 1.9457943925233645,
"grad_norm": 0.03023097850382328,
"learning_rate": 0.0002,
"loss": 0.5425735116004944,
"mean_token_accuracy": 0.7805851995944977,
"num_tokens": 8492637.0,
"step": 521
},
{
"entropy": 0.5391188263893127,
"epoch": 1.9495327102803737,
"grad_norm": 0.05476713180541992,
"learning_rate": 0.0002,
"loss": 0.5556075572967529,
"mean_token_accuracy": 0.7749961167573929,
"num_tokens": 8509129.0,
"step": 522
},
{
"entropy": 0.5553655624389648,
"epoch": 1.953271028037383,
"grad_norm": 0.03542236238718033,
"learning_rate": 0.0002,
"loss": 0.5655393004417419,
"mean_token_accuracy": 0.7717009782791138,
"num_tokens": 8525641.0,
"step": 523
},
{
"entropy": 0.5613285005092621,
"epoch": 1.9570093457943925,
"grad_norm": 0.06946822255849838,
"learning_rate": 0.0002,
"loss": 0.5717962384223938,
"mean_token_accuracy": 0.7724136412143707,
"num_tokens": 8542275.0,
"step": 524
},
{
"entropy": 0.5575561076402664,
"epoch": 1.9607476635514018,
"grad_norm": 0.03460278734564781,
"learning_rate": 0.0002,
"loss": 0.5417395830154419,
"mean_token_accuracy": 0.7819567322731018,
"num_tokens": 8558373.0,
"step": 525
},
{
"entropy": 0.5704021006822586,
"epoch": 1.9644859813084112,
"grad_norm": 0.030037706717848778,
"learning_rate": 0.0002,
"loss": 0.5573901534080505,
"mean_token_accuracy": 0.7713392674922943,
"num_tokens": 8574839.0,
"step": 526
},
{
"entropy": 0.5286285877227783,
"epoch": 1.9682242990654206,
"grad_norm": 0.032038215547800064,
"learning_rate": 0.0002,
"loss": 0.5231573581695557,
"mean_token_accuracy": 0.7873097807168961,
"num_tokens": 8591063.0,
"step": 527
},
{
"entropy": 0.535316064953804,
"epoch": 1.97196261682243,
"grad_norm": 0.04137961193919182,
"learning_rate": 0.0002,
"loss": 0.5491993427276611,
"mean_token_accuracy": 0.7760031670331955,
"num_tokens": 8607354.0,
"step": 528
},
{
"entropy": 0.5287620276212692,
"epoch": 1.9757009345794394,
"grad_norm": 0.03144775703549385,
"learning_rate": 0.0002,
"loss": 0.5313848853111267,
"mean_token_accuracy": 0.784307450056076,
"num_tokens": 8623542.0,
"step": 529
},
{
"entropy": 0.5521504878997803,
"epoch": 1.9794392523364486,
"grad_norm": 0.03497127816081047,
"learning_rate": 0.0002,
"loss": 0.5516395568847656,
"mean_token_accuracy": 0.7736653387546539,
"num_tokens": 8639626.0,
"step": 530
},
{
"entropy": 0.5580714792013168,
"epoch": 1.983177570093458,
"grad_norm": 0.030566083267331123,
"learning_rate": 0.0002,
"loss": 0.5535013675689697,
"mean_token_accuracy": 0.7748955637216568,
"num_tokens": 8655957.0,
"step": 531
},
{
"entropy": 0.5411636233329773,
"epoch": 1.9869158878504671,
"grad_norm": 0.03356699272990227,
"learning_rate": 0.0002,
"loss": 0.5376905202865601,
"mean_token_accuracy": 0.7788012474775314,
"num_tokens": 8672109.0,
"step": 532
},
{
"entropy": 0.5470294207334518,
"epoch": 1.9906542056074765,
"grad_norm": 0.0316782146692276,
"learning_rate": 0.0002,
"loss": 0.5445536971092224,
"mean_token_accuracy": 0.7801567167043686,
"num_tokens": 8688512.0,
"step": 533
},
{
"entropy": 0.5573801398277283,
"epoch": 1.994392523364486,
"grad_norm": 0.0308368057012558,
"learning_rate": 0.0002,
"loss": 0.5613093376159668,
"mean_token_accuracy": 0.7755008339881897,
"num_tokens": 8704882.0,
"step": 534
},
{
"entropy": 0.5606262683868408,
"epoch": 1.9981308411214953,
"grad_norm": 0.033759523183107376,
"learning_rate": 0.0002,
"loss": 0.5673450827598572,
"mean_token_accuracy": 0.7693974524736404,
"num_tokens": 8721476.0,
"step": 535
},
{
"entropy": 0.5470572412014008,
"epoch": 2.0,
"grad_norm": 0.045990657061338425,
"learning_rate": 0.0002,
"loss": 0.5525597333908081,
"mean_token_accuracy": 0.7788615226745605,
"num_tokens": 8729601.0,
"step": 536
},
{
"entropy": 0.5381215959787369,
"epoch": 2.0037383177570094,
"grad_norm": 0.03212118148803711,
"learning_rate": 0.0002,
"loss": 0.5325874090194702,
"mean_token_accuracy": 0.7825482040643692,
"num_tokens": 8745950.0,
"step": 537
},
{
"entropy": 0.5637937486171722,
"epoch": 2.007476635514019,
"grad_norm": 0.036541201174259186,
"learning_rate": 0.0002,
"loss": 0.5618294477462769,
"mean_token_accuracy": 0.773602232336998,
"num_tokens": 8762499.0,
"step": 538
},
{
"entropy": 0.5491923093795776,
"epoch": 2.011214953271028,
"grad_norm": 0.033549197018146515,
"learning_rate": 0.0002,
"loss": 0.548430323600769,
"mean_token_accuracy": 0.7764875292778015,
"num_tokens": 8778855.0,
"step": 539
},
{
"entropy": 0.5251094102859497,
"epoch": 2.0149532710280376,
"grad_norm": 0.036079153418540955,
"learning_rate": 0.0002,
"loss": 0.5315405130386353,
"mean_token_accuracy": 0.7840714603662491,
"num_tokens": 8794810.0,
"step": 540
},
{
"entropy": 0.5423221588134766,
"epoch": 2.0186915887850465,
"grad_norm": 0.03329861909151077,
"learning_rate": 0.0002,
"loss": 0.5420343279838562,
"mean_token_accuracy": 0.7797907888889313,
"num_tokens": 8811426.0,
"step": 541
},
{
"entropy": 0.5213563144207001,
"epoch": 2.022429906542056,
"grad_norm": 0.03049337863922119,
"learning_rate": 0.0002,
"loss": 0.5193029642105103,
"mean_token_accuracy": 0.7878206521272659,
"num_tokens": 8827505.0,
"step": 542
},
{
"entropy": 0.5485236346721649,
"epoch": 2.0261682242990653,
"grad_norm": 0.038072168827056885,
"learning_rate": 0.0002,
"loss": 0.5403975248336792,
"mean_token_accuracy": 0.7787782251834869,
"num_tokens": 8843789.0,
"step": 543
},
{
"entropy": 0.5497236847877502,
"epoch": 2.0299065420560747,
"grad_norm": 0.037746790796518326,
"learning_rate": 0.0002,
"loss": 0.5424782037734985,
"mean_token_accuracy": 0.7821084409952164,
"num_tokens": 8860524.0,
"step": 544
},
{
"entropy": 0.5128878131508827,
"epoch": 2.033644859813084,
"grad_norm": 0.03184136748313904,
"learning_rate": 0.0002,
"loss": 0.5119982957839966,
"mean_token_accuracy": 0.7925940603017807,
"num_tokens": 8876520.0,
"step": 545
},
{
"entropy": 0.53415547311306,
"epoch": 2.0373831775700935,
"grad_norm": 0.04230194166302681,
"learning_rate": 0.0002,
"loss": 0.5436858534812927,
"mean_token_accuracy": 0.7798719555139542,
"num_tokens": 8892800.0,
"step": 546
},
{
"entropy": 0.527920126914978,
"epoch": 2.041121495327103,
"grad_norm": 0.035794876515865326,
"learning_rate": 0.0002,
"loss": 0.537831723690033,
"mean_token_accuracy": 0.7832628786563873,
"num_tokens": 8908779.0,
"step": 547
},
{
"entropy": 0.528620719909668,
"epoch": 2.0448598130841122,
"grad_norm": 0.043260980397462845,
"learning_rate": 0.0002,
"loss": 0.5385839343070984,
"mean_token_accuracy": 0.7800839692354202,
"num_tokens": 8925225.0,
"step": 548
},
{
"entropy": 0.5344889611005783,
"epoch": 2.0485981308411216,
"grad_norm": 0.03616830334067345,
"learning_rate": 0.0002,
"loss": 0.5279685258865356,
"mean_token_accuracy": 0.7877432852983475,
"num_tokens": 8941370.0,
"step": 549
},
{
"entropy": 0.5505447387695312,
"epoch": 2.052336448598131,
"grad_norm": 0.03392447903752327,
"learning_rate": 0.0002,
"loss": 0.5464667081832886,
"mean_token_accuracy": 0.778993234038353,
"num_tokens": 8957759.0,
"step": 550
},
{
"entropy": 0.537495419383049,
"epoch": 2.05607476635514,
"grad_norm": 0.03487386927008629,
"learning_rate": 0.0002,
"loss": 0.5327776074409485,
"mean_token_accuracy": 0.7819164842367172,
"num_tokens": 8974120.0,
"step": 551
},
{
"entropy": 0.5181033089756966,
"epoch": 2.0598130841121494,
"grad_norm": 0.03655601665377617,
"learning_rate": 0.0002,
"loss": 0.5197772979736328,
"mean_token_accuracy": 0.7876780480146408,
"num_tokens": 8990084.0,
"step": 552
},
{
"entropy": 0.5097288861870766,
"epoch": 2.0635514018691588,
"grad_norm": 0.04094317555427551,
"learning_rate": 0.0002,
"loss": 0.5214163661003113,
"mean_token_accuracy": 0.7877646237611771,
"num_tokens": 9006115.0,
"step": 553
},
{
"entropy": 0.5392448753118515,
"epoch": 2.067289719626168,
"grad_norm": 0.042336490005254745,
"learning_rate": 0.0002,
"loss": 0.5487770438194275,
"mean_token_accuracy": 0.7746841162443161,
"num_tokens": 9022503.0,
"step": 554
},
{
"entropy": 0.5353204905986786,
"epoch": 2.0710280373831775,
"grad_norm": 0.04751956835389137,
"learning_rate": 0.0002,
"loss": 0.5423939824104309,
"mean_token_accuracy": 0.7819565683603287,
"num_tokens": 9038587.0,
"step": 555
},
{
"entropy": 0.5576211661100388,
"epoch": 2.074766355140187,
"grad_norm": 0.034248773008584976,
"learning_rate": 0.0002,
"loss": 0.5450438261032104,
"mean_token_accuracy": 0.7806050181388855,
"num_tokens": 9054978.0,
"step": 556
},
{
"entropy": 0.5164358094334602,
"epoch": 2.0785046728971963,
"grad_norm": 0.03642895817756653,
"learning_rate": 0.0002,
"loss": 0.5048035979270935,
"mean_token_accuracy": 0.7946237772703171,
"num_tokens": 9071189.0,
"step": 557
},
{
"entropy": 0.5479462146759033,
"epoch": 2.0822429906542057,
"grad_norm": 0.03524266555905342,
"learning_rate": 0.0002,
"loss": 0.5424850583076477,
"mean_token_accuracy": 0.7782812714576721,
"num_tokens": 9087453.0,
"step": 558
},
{
"entropy": 0.5207670480012894,
"epoch": 2.085981308411215,
"grad_norm": 0.04086553677916527,
"learning_rate": 0.0002,
"loss": 0.5275461673736572,
"mean_token_accuracy": 0.7870053201913834,
"num_tokens": 9103538.0,
"step": 559
},
{
"entropy": 0.5350566729903221,
"epoch": 2.0897196261682245,
"grad_norm": 0.036386121064424515,
"learning_rate": 0.0002,
"loss": 0.5380175113677979,
"mean_token_accuracy": 0.7814048826694489,
"num_tokens": 9119858.0,
"step": 560
},
{
"entropy": 0.5368697345256805,
"epoch": 2.0934579439252334,
"grad_norm": 0.039366140961647034,
"learning_rate": 0.0002,
"loss": 0.5444531440734863,
"mean_token_accuracy": 0.7792541682720184,
"num_tokens": 9136204.0,
"step": 561
},
{
"entropy": 0.5295629873871803,
"epoch": 2.097196261682243,
"grad_norm": 0.03559441864490509,
"learning_rate": 0.0002,
"loss": 0.5286230444908142,
"mean_token_accuracy": 0.784547358751297,
"num_tokens": 9152718.0,
"step": 562
},
{
"entropy": 0.5568843930959702,
"epoch": 2.100934579439252,
"grad_norm": 0.034528154879808426,
"learning_rate": 0.0002,
"loss": 0.5466718077659607,
"mean_token_accuracy": 0.7782703340053558,
"num_tokens": 9168840.0,
"step": 563
},
{
"entropy": 0.5514650642871857,
"epoch": 2.1046728971962616,
"grad_norm": 0.034620221704244614,
"learning_rate": 0.0002,
"loss": 0.5481366515159607,
"mean_token_accuracy": 0.7774865627288818,
"num_tokens": 9185012.0,
"step": 564
},
{
"entropy": 0.5468508899211884,
"epoch": 2.108411214953271,
"grad_norm": 0.038367778062820435,
"learning_rate": 0.0002,
"loss": 0.5465208888053894,
"mean_token_accuracy": 0.7787877917289734,
"num_tokens": 9201579.0,
"step": 565
},
{
"entropy": 0.5365718752145767,
"epoch": 2.1121495327102804,
"grad_norm": 0.033649299293756485,
"learning_rate": 0.0002,
"loss": 0.5394605398178101,
"mean_token_accuracy": 0.7824818789958954,
"num_tokens": 9217958.0,
"step": 566
},
{
"entropy": 0.5342001020908356,
"epoch": 2.1158878504672898,
"grad_norm": 0.04148790240287781,
"learning_rate": 0.0002,
"loss": 0.541080892086029,
"mean_token_accuracy": 0.7807753682136536,
"num_tokens": 9234182.0,
"step": 567
},
{
"entropy": 0.5269056260585785,
"epoch": 2.119626168224299,
"grad_norm": 0.031905628740787506,
"learning_rate": 0.0002,
"loss": 0.529283881187439,
"mean_token_accuracy": 0.7837703377008438,
"num_tokens": 9250712.0,
"step": 568
},
{
"entropy": 0.5335036367177963,
"epoch": 2.1233644859813086,
"grad_norm": 0.041321150958538055,
"learning_rate": 0.0002,
"loss": 0.5374078154563904,
"mean_token_accuracy": 0.782123014330864,
"num_tokens": 9266961.0,
"step": 569
},
{
"entropy": 0.5442205667495728,
"epoch": 2.127102803738318,
"grad_norm": 0.034318044781684875,
"learning_rate": 0.0002,
"loss": 0.5429351329803467,
"mean_token_accuracy": 0.7788351625204086,
"num_tokens": 9283528.0,
"step": 570
},
{
"entropy": 0.5432394444942474,
"epoch": 2.130841121495327,
"grad_norm": 0.047397077083587646,
"learning_rate": 0.0002,
"loss": 0.5424203276634216,
"mean_token_accuracy": 0.7810939103364944,
"num_tokens": 9299837.0,
"step": 571
},
{
"entropy": 0.5400207340717316,
"epoch": 2.1345794392523363,
"grad_norm": 0.03500756248831749,
"learning_rate": 0.0002,
"loss": 0.5377690196037292,
"mean_token_accuracy": 0.783811166882515,
"num_tokens": 9315959.0,
"step": 572
},
{
"entropy": 0.5296697020530701,
"epoch": 2.1383177570093457,
"grad_norm": 0.03790782764554024,
"learning_rate": 0.0002,
"loss": 0.5289957523345947,
"mean_token_accuracy": 0.7867159694433212,
"num_tokens": 9332370.0,
"step": 573
},
{
"entropy": 0.5078830569982529,
"epoch": 2.142056074766355,
"grad_norm": 0.045958928763866425,
"learning_rate": 0.0002,
"loss": 0.5104236006736755,
"mean_token_accuracy": 0.7909017950296402,
"num_tokens": 9348594.0,
"step": 574
},
{
"entropy": 0.5188925862312317,
"epoch": 2.1457943925233645,
"grad_norm": 0.03916464373469353,
"learning_rate": 0.0002,
"loss": 0.5316386818885803,
"mean_token_accuracy": 0.7828120291233063,
"num_tokens": 9365046.0,
"step": 575
},
{
"entropy": 0.5045325607061386,
"epoch": 2.149532710280374,
"grad_norm": 0.04434382542967796,
"learning_rate": 0.0002,
"loss": 0.5116738080978394,
"mean_token_accuracy": 0.7905466854572296,
"num_tokens": 9381007.0,
"step": 576
},
{
"entropy": 0.5541563183069229,
"epoch": 2.1532710280373832,
"grad_norm": 0.038000430911779404,
"learning_rate": 0.0002,
"loss": 0.5551270842552185,
"mean_token_accuracy": 0.7762157022953033,
"num_tokens": 9397394.0,
"step": 577
},
{
"entropy": 0.5460502356290817,
"epoch": 2.1570093457943926,
"grad_norm": 0.038676705211400986,
"learning_rate": 0.0002,
"loss": 0.5363121032714844,
"mean_token_accuracy": 0.7802022695541382,
"num_tokens": 9413810.0,
"step": 578
},
{
"entropy": 0.5573510080575943,
"epoch": 2.160747663551402,
"grad_norm": 0.03721381351351738,
"learning_rate": 0.0002,
"loss": 0.5444300174713135,
"mean_token_accuracy": 0.7804805636405945,
"num_tokens": 9430091.0,
"step": 579
},
{
"entropy": 0.5371396392583847,
"epoch": 2.1644859813084114,
"grad_norm": 0.04258019104599953,
"learning_rate": 0.0002,
"loss": 0.5351753234863281,
"mean_token_accuracy": 0.7820869237184525,
"num_tokens": 9446665.0,
"step": 580
},
{
"entropy": 0.5393694788217545,
"epoch": 2.1682242990654204,
"grad_norm": 0.0406467579305172,
"learning_rate": 0.0002,
"loss": 0.5430103540420532,
"mean_token_accuracy": 0.7779065668582916,
"num_tokens": 9463118.0,
"step": 581
},
{
"entropy": 0.5272447615861893,
"epoch": 2.1719626168224297,
"grad_norm": 0.04435638338327408,
"learning_rate": 0.0002,
"loss": 0.5354752540588379,
"mean_token_accuracy": 0.7838975638151169,
"num_tokens": 9479432.0,
"step": 582
},
{
"entropy": 0.5255759209394455,
"epoch": 2.175700934579439,
"grad_norm": 0.03574801981449127,
"learning_rate": 0.0002,
"loss": 0.531680703163147,
"mean_token_accuracy": 0.7842760384082794,
"num_tokens": 9495707.0,
"step": 583
},
{
"entropy": 0.5348410457372665,
"epoch": 2.1794392523364485,
"grad_norm": 0.03383009880781174,
"learning_rate": 0.0002,
"loss": 0.5284703373908997,
"mean_token_accuracy": 0.7889558225870132,
"num_tokens": 9512236.0,
"step": 584
},
{
"entropy": 0.5311737060546875,
"epoch": 2.183177570093458,
"grad_norm": 0.035349104553461075,
"learning_rate": 0.0002,
"loss": 0.5332157611846924,
"mean_token_accuracy": 0.7814211249351501,
"num_tokens": 9528589.0,
"step": 585
},
{
"entropy": 0.5255388617515564,
"epoch": 2.1869158878504673,
"grad_norm": 0.043005745857954025,
"learning_rate": 0.0002,
"loss": 0.5251577496528625,
"mean_token_accuracy": 0.7884248644113541,
"num_tokens": 9544965.0,
"step": 586
},
{
"entropy": 0.5347089469432831,
"epoch": 2.1906542056074767,
"grad_norm": 0.03752923756837845,
"learning_rate": 0.0002,
"loss": 0.5362472534179688,
"mean_token_accuracy": 0.7811613231897354,
"num_tokens": 9561276.0,
"step": 587
},
{
"entropy": 0.5310826078057289,
"epoch": 2.194392523364486,
"grad_norm": 0.05228811874985695,
"learning_rate": 0.0002,
"loss": 0.5329592227935791,
"mean_token_accuracy": 0.7827970087528229,
"num_tokens": 9577509.0,
"step": 588
},
{
"entropy": 0.5254483968019485,
"epoch": 2.1981308411214955,
"grad_norm": 0.03692999482154846,
"learning_rate": 0.0002,
"loss": 0.5311483144760132,
"mean_token_accuracy": 0.7830882370471954,
"num_tokens": 9593982.0,
"step": 589
},
{
"entropy": 0.5360620766878128,
"epoch": 2.201869158878505,
"grad_norm": 0.04609117656946182,
"learning_rate": 0.0002,
"loss": 0.5386216640472412,
"mean_token_accuracy": 0.7802708595991135,
"num_tokens": 9610311.0,
"step": 590
},
{
"entropy": 0.5463242679834366,
"epoch": 2.205607476635514,
"grad_norm": 0.03901510685682297,
"learning_rate": 0.0002,
"loss": 0.5447873473167419,
"mean_token_accuracy": 0.7785727232694626,
"num_tokens": 9626678.0,
"step": 591
},
{
"entropy": 0.5129301249980927,
"epoch": 2.209345794392523,
"grad_norm": 0.043117035180330276,
"learning_rate": 0.0002,
"loss": 0.5128067135810852,
"mean_token_accuracy": 0.7911233007907867,
"num_tokens": 9642843.0,
"step": 592
},
{
"entropy": 0.5312749594449997,
"epoch": 2.2130841121495326,
"grad_norm": 0.03675411641597748,
"learning_rate": 0.0002,
"loss": 0.5329593420028687,
"mean_token_accuracy": 0.7832809239625931,
"num_tokens": 9659218.0,
"step": 593
},
{
"entropy": 0.5422542840242386,
"epoch": 2.216822429906542,
"grad_norm": 0.036754533648490906,
"learning_rate": 0.0002,
"loss": 0.5398430824279785,
"mean_token_accuracy": 0.7803453654050827,
"num_tokens": 9675649.0,
"step": 594
},
{
"entropy": 0.5472271293401718,
"epoch": 2.2205607476635514,
"grad_norm": 0.043753694742918015,
"learning_rate": 0.0002,
"loss": 0.5421810150146484,
"mean_token_accuracy": 0.7812557965517044,
"num_tokens": 9691932.0,
"step": 595
},
{
"entropy": 0.5446718335151672,
"epoch": 2.2242990654205608,
"grad_norm": 0.0450102761387825,
"learning_rate": 0.0002,
"loss": 0.5450670719146729,
"mean_token_accuracy": 0.7795027941465378,
"num_tokens": 9708243.0,
"step": 596
},
{
"entropy": 0.5422708988189697,
"epoch": 2.22803738317757,
"grad_norm": 0.042899005115032196,
"learning_rate": 0.0002,
"loss": 0.5427168011665344,
"mean_token_accuracy": 0.7769834697246552,
"num_tokens": 9724620.0,
"step": 597
},
{
"entropy": 0.5316948816180229,
"epoch": 2.2317757009345796,
"grad_norm": 0.0438719242811203,
"learning_rate": 0.0002,
"loss": 0.5369054675102234,
"mean_token_accuracy": 0.7818674147129059,
"num_tokens": 9740813.0,
"step": 598
},
{
"entropy": 0.5353083610534668,
"epoch": 2.235514018691589,
"grad_norm": 0.045174483209848404,
"learning_rate": 0.0002,
"loss": 0.535564124584198,
"mean_token_accuracy": 0.7826817184686661,
"num_tokens": 9757081.0,
"step": 599
},
{
"entropy": 0.53409144282341,
"epoch": 2.2392523364485983,
"grad_norm": 0.046971406787633896,
"learning_rate": 0.0002,
"loss": 0.5388940572738647,
"mean_token_accuracy": 0.7797097563743591,
"num_tokens": 9773286.0,
"step": 600
},
{
"entropy": 0.5229181125760078,
"epoch": 2.2429906542056073,
"grad_norm": 0.04818117991089821,
"learning_rate": 0.0002,
"loss": 0.5283955931663513,
"mean_token_accuracy": 0.7855319827795029,
"num_tokens": 9789231.0,
"step": 601
},
{
"entropy": 0.5502548068761826,
"epoch": 2.2467289719626167,
"grad_norm": 0.041451770812273026,
"learning_rate": 0.0002,
"loss": 0.5441420078277588,
"mean_token_accuracy": 0.7805446833372116,
"num_tokens": 9805737.0,
"step": 602
},
{
"entropy": 0.5555277764797211,
"epoch": 2.250467289719626,
"grad_norm": 0.03888588771224022,
"learning_rate": 0.0002,
"loss": 0.5571548938751221,
"mean_token_accuracy": 0.7741208076477051,
"num_tokens": 9822370.0,
"step": 603
},
{
"entropy": 0.5331219285726547,
"epoch": 2.2542056074766355,
"grad_norm": 0.050726499408483505,
"learning_rate": 0.0002,
"loss": 0.5355172157287598,
"mean_token_accuracy": 0.7803194671869278,
"num_tokens": 9838846.0,
"step": 604
},
{
"entropy": 0.5391329601407051,
"epoch": 2.257943925233645,
"grad_norm": 0.03473533317446709,
"learning_rate": 0.0002,
"loss": 0.5380818843841553,
"mean_token_accuracy": 0.7837731093168259,
"num_tokens": 9855269.0,
"step": 605
},
{
"entropy": 0.5419459789991379,
"epoch": 2.2616822429906542,
"grad_norm": 0.04428257793188095,
"learning_rate": 0.0002,
"loss": 0.5402700304985046,
"mean_token_accuracy": 0.7803330719470978,
"num_tokens": 9871498.0,
"step": 606
},
{
"entropy": 0.5475794821977615,
"epoch": 2.2654205607476636,
"grad_norm": 0.03847254440188408,
"learning_rate": 0.0002,
"loss": 0.5443584322929382,
"mean_token_accuracy": 0.7776888459920883,
"num_tokens": 9887880.0,
"step": 607
},
{
"entropy": 0.5413693785667419,
"epoch": 2.269158878504673,
"grad_norm": 0.03769246116280556,
"learning_rate": 0.0002,
"loss": 0.5448262095451355,
"mean_token_accuracy": 0.7788306772708893,
"num_tokens": 9904482.0,
"step": 608
},
{
"entropy": 0.5233470648527145,
"epoch": 2.2728971962616824,
"grad_norm": 0.041845668107271194,
"learning_rate": 0.0002,
"loss": 0.5302014946937561,
"mean_token_accuracy": 0.7834525555372238,
"num_tokens": 9920720.0,
"step": 609
},
{
"entropy": 0.526485301554203,
"epoch": 2.2766355140186914,
"grad_norm": 0.04298217222094536,
"learning_rate": 0.0002,
"loss": 0.5376767516136169,
"mean_token_accuracy": 0.7815933078527451,
"num_tokens": 9936855.0,
"step": 610
},
{
"entropy": 0.5407330542802811,
"epoch": 2.2803738317757007,
"grad_norm": 0.03829406201839447,
"learning_rate": 0.0002,
"loss": 0.5375736951828003,
"mean_token_accuracy": 0.7817153483629227,
"num_tokens": 9953359.0,
"step": 611
},
{
"entropy": 0.557465985417366,
"epoch": 2.28411214953271,
"grad_norm": 0.0430569127202034,
"learning_rate": 0.0002,
"loss": 0.5485789775848389,
"mean_token_accuracy": 0.7774669080972672,
"num_tokens": 9969809.0,
"step": 612
},
{
"entropy": 0.5491045266389847,
"epoch": 2.2878504672897195,
"grad_norm": 0.04154661297798157,
"learning_rate": 0.0002,
"loss": 0.5452516078948975,
"mean_token_accuracy": 0.7782464772462845,
"num_tokens": 9986122.0,
"step": 613
},
{
"entropy": 0.5396340191364288,
"epoch": 2.291588785046729,
"grad_norm": 0.03867339715361595,
"learning_rate": 0.0002,
"loss": 0.5436422228813171,
"mean_token_accuracy": 0.7793163359165192,
"num_tokens": 10002373.0,
"step": 614
},
{
"entropy": 0.5227179303765297,
"epoch": 2.2953271028037383,
"grad_norm": 0.055158648639917374,
"learning_rate": 0.0002,
"loss": 0.5356475710868835,
"mean_token_accuracy": 0.7828944474458694,
"num_tokens": 10018532.0,
"step": 615
},
{
"entropy": 0.5101833418011665,
"epoch": 2.2990654205607477,
"grad_norm": 0.04139378294348717,
"learning_rate": 0.0002,
"loss": 0.5111054182052612,
"mean_token_accuracy": 0.7948217988014221,
"num_tokens": 10034449.0,
"step": 616
},
{
"entropy": 0.5332518517971039,
"epoch": 2.302803738317757,
"grad_norm": 0.042138371616601944,
"learning_rate": 0.0002,
"loss": 0.5291332602500916,
"mean_token_accuracy": 0.7875723540782928,
"num_tokens": 10050791.0,
"step": 617
},
{
"entropy": 0.5545465350151062,
"epoch": 2.3065420560747665,
"grad_norm": 0.04594315588474274,
"learning_rate": 0.0002,
"loss": 0.5547114610671997,
"mean_token_accuracy": 0.7752625793218613,
"num_tokens": 10067160.0,
"step": 618
},
{
"entropy": 0.538428008556366,
"epoch": 2.310280373831776,
"grad_norm": 0.038197144865989685,
"learning_rate": 0.0002,
"loss": 0.5356147885322571,
"mean_token_accuracy": 0.7812609076499939,
"num_tokens": 10083623.0,
"step": 619
},
{
"entropy": 0.515357218682766,
"epoch": 2.3140186915887853,
"grad_norm": 0.04305245727300644,
"learning_rate": 0.0002,
"loss": 0.5182097554206848,
"mean_token_accuracy": 0.7897254973649979,
"num_tokens": 10099734.0,
"step": 620
},
{
"entropy": 0.5176303833723068,
"epoch": 2.317757009345794,
"grad_norm": 0.040814559906721115,
"learning_rate": 0.0002,
"loss": 0.5241186618804932,
"mean_token_accuracy": 0.7862492203712463,
"num_tokens": 10115923.0,
"step": 621
},
{
"entropy": 0.5319753438234329,
"epoch": 2.3214953271028036,
"grad_norm": 0.038612622767686844,
"learning_rate": 0.0002,
"loss": 0.5332948565483093,
"mean_token_accuracy": 0.7826831489801407,
"num_tokens": 10132186.0,
"step": 622
},
{
"entropy": 0.5231878906488419,
"epoch": 2.325233644859813,
"grad_norm": 0.04399793595075607,
"learning_rate": 0.0002,
"loss": 0.5220815539360046,
"mean_token_accuracy": 0.7883405387401581,
"num_tokens": 10148176.0,
"step": 623
},
{
"entropy": 0.5503655076026917,
"epoch": 2.3289719626168224,
"grad_norm": 0.03310840204358101,
"learning_rate": 0.0002,
"loss": 0.5424314737319946,
"mean_token_accuracy": 0.7791298031806946,
"num_tokens": 10164602.0,
"step": 624
},
{
"entropy": 0.5562791079282761,
"epoch": 2.3327102803738318,
"grad_norm": 0.046219419687986374,
"learning_rate": 0.0002,
"loss": 0.5487840175628662,
"mean_token_accuracy": 0.7803521156311035,
"num_tokens": 10180910.0,
"step": 625
},
{
"entropy": 0.536386102437973,
"epoch": 2.336448598130841,
"grad_norm": 0.038521721959114075,
"learning_rate": 0.0002,
"loss": 0.5320638418197632,
"mean_token_accuracy": 0.7856791615486145,
"num_tokens": 10197138.0,
"step": 626
},
{
"entropy": 0.5220321416854858,
"epoch": 2.3401869158878505,
"grad_norm": 0.046215180307626724,
"learning_rate": 0.0002,
"loss": 0.5289742946624756,
"mean_token_accuracy": 0.784678503870964,
"num_tokens": 10213246.0,
"step": 627
},
{
"entropy": 0.5178990513086319,
"epoch": 2.34392523364486,
"grad_norm": 0.04778464511036873,
"learning_rate": 0.0002,
"loss": 0.522329568862915,
"mean_token_accuracy": 0.7881183475255966,
"num_tokens": 10229431.0,
"step": 628
},
{
"entropy": 0.5353438407182693,
"epoch": 2.3476635514018693,
"grad_norm": 0.04080234467983246,
"learning_rate": 0.0002,
"loss": 0.5433787107467651,
"mean_token_accuracy": 0.7780589759349823,
"num_tokens": 10245684.0,
"step": 629
},
{
"entropy": 0.5368916243314743,
"epoch": 2.3514018691588783,
"grad_norm": 0.043697554618120193,
"learning_rate": 0.0002,
"loss": 0.541444718837738,
"mean_token_accuracy": 0.7807413637638092,
"num_tokens": 10262210.0,
"step": 630
},
{
"entropy": 0.5506647378206253,
"epoch": 2.3551401869158877,
"grad_norm": 0.038478951901197433,
"learning_rate": 0.0002,
"loss": 0.5461610555648804,
"mean_token_accuracy": 0.7788456082344055,
"num_tokens": 10278611.0,
"step": 631
},
{
"entropy": 0.5395764261484146,
"epoch": 2.358878504672897,
"grad_norm": 0.03904217854142189,
"learning_rate": 0.0002,
"loss": 0.5317508578300476,
"mean_token_accuracy": 0.7833081781864166,
"num_tokens": 10294800.0,
"step": 632
},
{
"entropy": 0.5478651374578476,
"epoch": 2.3626168224299064,
"grad_norm": 0.048824410885572433,
"learning_rate": 0.0002,
"loss": 0.5395293831825256,
"mean_token_accuracy": 0.783235713839531,
"num_tokens": 10311090.0,
"step": 633
},
{
"entropy": 0.5332029610872269,
"epoch": 2.366355140186916,
"grad_norm": 0.04313044250011444,
"learning_rate": 0.0002,
"loss": 0.5401085615158081,
"mean_token_accuracy": 0.778812825679779,
"num_tokens": 10327348.0,
"step": 634
},
{
"entropy": 0.5406146496534348,
"epoch": 2.3700934579439252,
"grad_norm": 0.04600725322961807,
"learning_rate": 0.0002,
"loss": 0.5516705513000488,
"mean_token_accuracy": 0.7761097699403763,
"num_tokens": 10343800.0,
"step": 635
},
{
"entropy": 0.5261052846908569,
"epoch": 2.3738317757009346,
"grad_norm": 0.045134712010622025,
"learning_rate": 0.0002,
"loss": 0.5412300825119019,
"mean_token_accuracy": 0.7802619636058807,
"num_tokens": 10360082.0,
"step": 636
},
{
"entropy": 0.5589279979467392,
"epoch": 2.377570093457944,
"grad_norm": 0.041725922375917435,
"learning_rate": 0.0002,
"loss": 0.5517748594284058,
"mean_token_accuracy": 0.778441995382309,
"num_tokens": 10376345.0,
"step": 637
},
{
"entropy": 0.5504082888364792,
"epoch": 2.3813084112149534,
"grad_norm": 0.03725145012140274,
"learning_rate": 0.0002,
"loss": 0.5404931306838989,
"mean_token_accuracy": 0.7776447534561157,
"num_tokens": 10392870.0,
"step": 638
},
{
"entropy": 0.5359382033348083,
"epoch": 2.385046728971963,
"grad_norm": 0.0364760085940361,
"learning_rate": 0.0002,
"loss": 0.533162534236908,
"mean_token_accuracy": 0.7851890027523041,
"num_tokens": 10409256.0,
"step": 639
},
{
"entropy": 0.5336398631334305,
"epoch": 2.388785046728972,
"grad_norm": 0.036078356206417084,
"learning_rate": 0.0002,
"loss": 0.5374175906181335,
"mean_token_accuracy": 0.7814856320619583,
"num_tokens": 10425831.0,
"step": 640
},
{
"entropy": 0.5284569710493088,
"epoch": 2.392523364485981,
"grad_norm": 0.04704172909259796,
"learning_rate": 0.0002,
"loss": 0.5387214422225952,
"mean_token_accuracy": 0.7815752625465393,
"num_tokens": 10442382.0,
"step": 641
},
{
"entropy": 0.5344073623418808,
"epoch": 2.3962616822429905,
"grad_norm": 0.0398792028427124,
"learning_rate": 0.0002,
"loss": 0.5398225784301758,
"mean_token_accuracy": 0.7818136066198349,
"num_tokens": 10458810.0,
"step": 642
},
{
"entropy": 0.5323895663022995,
"epoch": 2.4,
"grad_norm": 0.037454817444086075,
"learning_rate": 0.0002,
"loss": 0.5368887782096863,
"mean_token_accuracy": 0.7800801247358322,
"num_tokens": 10474692.0,
"step": 643
},
{
"entropy": 0.5394662618637085,
"epoch": 2.4037383177570093,
"grad_norm": 0.03576047718524933,
"learning_rate": 0.0002,
"loss": 0.5351858735084534,
"mean_token_accuracy": 0.7815855145454407,
"num_tokens": 10491015.0,
"step": 644
},
{
"entropy": 0.547369509935379,
"epoch": 2.4074766355140187,
"grad_norm": 0.0398087315261364,
"learning_rate": 0.0002,
"loss": 0.5397285223007202,
"mean_token_accuracy": 0.7805114239454269,
"num_tokens": 10507366.0,
"step": 645
},
{
"entropy": 0.5508280843496323,
"epoch": 2.411214953271028,
"grad_norm": 0.03709566593170166,
"learning_rate": 0.0002,
"loss": 0.5448777675628662,
"mean_token_accuracy": 0.7763405591249466,
"num_tokens": 10523374.0,
"step": 646
},
{
"entropy": 0.5248509049415588,
"epoch": 2.4149532710280375,
"grad_norm": 0.03418833017349243,
"learning_rate": 0.0002,
"loss": 0.5208706855773926,
"mean_token_accuracy": 0.7874817848205566,
"num_tokens": 10539624.0,
"step": 647
},
{
"entropy": 0.5466809421777725,
"epoch": 2.418691588785047,
"grad_norm": 0.039764732122421265,
"learning_rate": 0.0002,
"loss": 0.5513855218887329,
"mean_token_accuracy": 0.776073694229126,
"num_tokens": 10556212.0,
"step": 648
},
{
"entropy": 0.5117013603448868,
"epoch": 2.4224299065420563,
"grad_norm": 0.04086057096719742,
"learning_rate": 0.0002,
"loss": 0.5219972729682922,
"mean_token_accuracy": 0.7889275252819061,
"num_tokens": 10572323.0,
"step": 649
},
{
"entropy": 0.5393745452165604,
"epoch": 2.426168224299065,
"grad_norm": 0.037193622440099716,
"learning_rate": 0.0002,
"loss": 0.5456075668334961,
"mean_token_accuracy": 0.7753270417451859,
"num_tokens": 10588533.0,
"step": 650
},
{
"entropy": 0.5517471730709076,
"epoch": 2.4299065420560746,
"grad_norm": 0.04061353579163551,
"learning_rate": 0.0002,
"loss": 0.5480504035949707,
"mean_token_accuracy": 0.7777185589075089,
"num_tokens": 10604736.0,
"step": 651
},
{
"entropy": 0.5332285165786743,
"epoch": 2.433644859813084,
"grad_norm": 0.037262339144945145,
"learning_rate": 0.0002,
"loss": 0.52723628282547,
"mean_token_accuracy": 0.7820963263511658,
"num_tokens": 10621005.0,
"step": 652
},
{
"entropy": 0.5427125096321106,
"epoch": 2.4373831775700934,
"grad_norm": 0.038290560245513916,
"learning_rate": 0.0002,
"loss": 0.5433245897293091,
"mean_token_accuracy": 0.7764440774917603,
"num_tokens": 10637274.0,
"step": 653
},
{
"entropy": 0.515294149518013,
"epoch": 2.4411214953271028,
"grad_norm": 0.07859813421964645,
"learning_rate": 0.0002,
"loss": 0.5192139744758606,
"mean_token_accuracy": 0.7903406471014023,
"num_tokens": 10653571.0,
"step": 654
},
{
"entropy": 0.5411062091588974,
"epoch": 2.444859813084112,
"grad_norm": 0.04054918885231018,
"learning_rate": 0.0002,
"loss": 0.5439664721488953,
"mean_token_accuracy": 0.7815183401107788,
"num_tokens": 10670139.0,
"step": 655
},
{
"entropy": 0.5487605780363083,
"epoch": 2.4485981308411215,
"grad_norm": 0.04026317596435547,
"learning_rate": 0.0002,
"loss": 0.5495845675468445,
"mean_token_accuracy": 0.7765460163354874,
"num_tokens": 10686846.0,
"step": 656
},
{
"entropy": 0.5351516157388687,
"epoch": 2.452336448598131,
"grad_norm": 0.040862392634153366,
"learning_rate": 0.0002,
"loss": 0.5336912870407104,
"mean_token_accuracy": 0.7818685173988342,
"num_tokens": 10703200.0,
"step": 657
},
{
"entropy": 0.5463723838329315,
"epoch": 2.4560747663551403,
"grad_norm": 0.03873393312096596,
"learning_rate": 0.0002,
"loss": 0.5465680360794067,
"mean_token_accuracy": 0.7760122418403625,
"num_tokens": 10719561.0,
"step": 658
},
{
"entropy": 0.5416133552789688,
"epoch": 2.4598130841121497,
"grad_norm": 0.044795434921979904,
"learning_rate": 0.0002,
"loss": 0.5411824584007263,
"mean_token_accuracy": 0.7804904133081436,
"num_tokens": 10735767.0,
"step": 659
},
{
"entropy": 0.5494029372930527,
"epoch": 2.463551401869159,
"grad_norm": 0.04379895702004433,
"learning_rate": 0.0002,
"loss": 0.5456870198249817,
"mean_token_accuracy": 0.7755402028560638,
"num_tokens": 10751886.0,
"step": 660
},
{
"entropy": 0.5367189347743988,
"epoch": 2.467289719626168,
"grad_norm": 0.03852448984980583,
"learning_rate": 0.0002,
"loss": 0.5393000841140747,
"mean_token_accuracy": 0.7800532579421997,
"num_tokens": 10768210.0,
"step": 661
},
{
"entropy": 0.5270116031169891,
"epoch": 2.4710280373831774,
"grad_norm": 0.03792192041873932,
"learning_rate": 0.0002,
"loss": 0.5289605259895325,
"mean_token_accuracy": 0.7838020473718643,
"num_tokens": 10784434.0,
"step": 662
},
{
"entropy": 0.5338448286056519,
"epoch": 2.474766355140187,
"grad_norm": 0.0350453220307827,
"learning_rate": 0.0002,
"loss": 0.5380920767784119,
"mean_token_accuracy": 0.7818057388067245,
"num_tokens": 10800619.0,
"step": 663
},
{
"entropy": 0.5228566378355026,
"epoch": 2.4785046728971962,
"grad_norm": 0.046152058988809586,
"learning_rate": 0.0002,
"loss": 0.5300622582435608,
"mean_token_accuracy": 0.7793385684490204,
"num_tokens": 10816801.0,
"step": 664
},
{
"entropy": 0.5290849655866623,
"epoch": 2.4822429906542056,
"grad_norm": 0.03659910336136818,
"learning_rate": 0.0002,
"loss": 0.5329374074935913,
"mean_token_accuracy": 0.7838267683982849,
"num_tokens": 10833095.0,
"step": 665
},
{
"entropy": 0.545561358332634,
"epoch": 2.485981308411215,
"grad_norm": 0.04097100347280502,
"learning_rate": 0.0002,
"loss": 0.5479649901390076,
"mean_token_accuracy": 0.7784263789653778,
"num_tokens": 10849473.0,
"step": 666
},
{
"entropy": 0.5502291470766068,
"epoch": 2.4897196261682244,
"grad_norm": 0.04253846034407616,
"learning_rate": 0.0002,
"loss": 0.5466883182525635,
"mean_token_accuracy": 0.7778628617525101,
"num_tokens": 10865837.0,
"step": 667
},
{
"entropy": 0.5474338084459305,
"epoch": 2.493457943925234,
"grad_norm": 0.037734732031822205,
"learning_rate": 0.0002,
"loss": 0.5415964126586914,
"mean_token_accuracy": 0.7777974009513855,
"num_tokens": 10882273.0,
"step": 668
},
{
"entropy": 0.5401993542909622,
"epoch": 2.497196261682243,
"grad_norm": 0.039542876183986664,
"learning_rate": 0.0002,
"loss": 0.5339391231536865,
"mean_token_accuracy": 0.784349262714386,
"num_tokens": 10898780.0,
"step": 669
},
{
"entropy": 0.5420306771993637,
"epoch": 2.500934579439252,
"grad_norm": 0.049927666783332825,
"learning_rate": 0.0002,
"loss": 0.5389054417610168,
"mean_token_accuracy": 0.7841761559247971,
"num_tokens": 10915059.0,
"step": 670
},
{
"entropy": 0.5333422720432281,
"epoch": 2.5046728971962615,
"grad_norm": 0.042702775448560715,
"learning_rate": 0.0002,
"loss": 0.5403023958206177,
"mean_token_accuracy": 0.7792320251464844,
"num_tokens": 10931718.0,
"step": 671
},
{
"entropy": 0.5289912968873978,
"epoch": 2.508411214953271,
"grad_norm": 0.050530027598142624,
"learning_rate": 0.0002,
"loss": 0.5404794216156006,
"mean_token_accuracy": 0.7815851122140884,
"num_tokens": 10948084.0,
"step": 672
},
{
"entropy": 0.5341697633266449,
"epoch": 2.5121495327102803,
"grad_norm": 0.04310121387243271,
"learning_rate": 0.0002,
"loss": 0.5389139652252197,
"mean_token_accuracy": 0.778786912560463,
"num_tokens": 10964373.0,
"step": 673
},
{
"entropy": 0.5569636076688766,
"epoch": 2.5158878504672897,
"grad_norm": 0.03820215165615082,
"learning_rate": 0.0002,
"loss": 0.5578426122665405,
"mean_token_accuracy": 0.7730483710765839,
"num_tokens": 10980732.0,
"step": 674
},
{
"entropy": 0.5347766578197479,
"epoch": 2.519626168224299,
"grad_norm": 0.04349920526146889,
"learning_rate": 0.0002,
"loss": 0.5336275100708008,
"mean_token_accuracy": 0.7815207839012146,
"num_tokens": 10997005.0,
"step": 675
},
{
"entropy": 0.5299794673919678,
"epoch": 2.5233644859813085,
"grad_norm": 0.04003509134054184,
"learning_rate": 0.0002,
"loss": 0.5294742584228516,
"mean_token_accuracy": 0.7869250029325485,
"num_tokens": 11013055.0,
"step": 676
},
{
"entropy": 0.5352783799171448,
"epoch": 2.527102803738318,
"grad_norm": 0.054121218621730804,
"learning_rate": 0.0002,
"loss": 0.5448738932609558,
"mean_token_accuracy": 0.7791888117790222,
"num_tokens": 11029266.0,
"step": 677
},
{
"entropy": 0.5354646146297455,
"epoch": 2.5308411214953273,
"grad_norm": 0.03573855757713318,
"learning_rate": 0.0002,
"loss": 0.5352723002433777,
"mean_token_accuracy": 0.7825258523225784,
"num_tokens": 11045806.0,
"step": 678
},
{
"entropy": 0.556391716003418,
"epoch": 2.5345794392523366,
"grad_norm": 0.04871753975749016,
"learning_rate": 0.0002,
"loss": 0.5602859258651733,
"mean_token_accuracy": 0.7722157090902328,
"num_tokens": 11062035.0,
"step": 679
},
{
"entropy": 0.5508870929479599,
"epoch": 2.538317757009346,
"grad_norm": 0.03932088986039162,
"learning_rate": 0.0002,
"loss": 0.5469393730163574,
"mean_token_accuracy": 0.7782620638608932,
"num_tokens": 11078375.0,
"step": 680
},
{
"entropy": 0.5481788516044617,
"epoch": 2.542056074766355,
"grad_norm": 0.04463294520974159,
"learning_rate": 0.0002,
"loss": 0.5469505190849304,
"mean_token_accuracy": 0.7766976356506348,
"num_tokens": 11094977.0,
"step": 681
},
{
"entropy": 0.5154567137360573,
"epoch": 2.5457943925233644,
"grad_norm": 0.044517725706100464,
"learning_rate": 0.0002,
"loss": 0.5210436582565308,
"mean_token_accuracy": 0.7881979048252106,
"num_tokens": 11110907.0,
"step": 682
},
{
"entropy": 0.5250661969184875,
"epoch": 2.5495327102803738,
"grad_norm": 0.03574059158563614,
"learning_rate": 0.0002,
"loss": 0.5239285826683044,
"mean_token_accuracy": 0.7901371419429779,
"num_tokens": 11127432.0,
"step": 683
},
{
"entropy": 0.541177287697792,
"epoch": 2.553271028037383,
"grad_norm": 0.03583724424242973,
"learning_rate": 0.0002,
"loss": 0.5399287343025208,
"mean_token_accuracy": 0.7795550227165222,
"num_tokens": 11143788.0,
"step": 684
},
{
"entropy": 0.5319067388772964,
"epoch": 2.5570093457943925,
"grad_norm": 0.038700610399246216,
"learning_rate": 0.0002,
"loss": 0.5372647047042847,
"mean_token_accuracy": 0.7816288769245148,
"num_tokens": 11160145.0,
"step": 685
},
{
"entropy": 0.5243031531572342,
"epoch": 2.560747663551402,
"grad_norm": 0.0457780659198761,
"learning_rate": 0.0002,
"loss": 0.5248138308525085,
"mean_token_accuracy": 0.7840212136507034,
"num_tokens": 11176075.0,
"step": 686
},
{
"entropy": 0.5483701825141907,
"epoch": 2.5644859813084113,
"grad_norm": 0.0399782694876194,
"learning_rate": 0.0002,
"loss": 0.5485758185386658,
"mean_token_accuracy": 0.7779590934514999,
"num_tokens": 11192293.0,
"step": 687
},
{
"entropy": 0.5290739685297012,
"epoch": 2.5682242990654207,
"grad_norm": 0.056546278297901154,
"learning_rate": 0.0002,
"loss": 0.5325236320495605,
"mean_token_accuracy": 0.7835103422403336,
"num_tokens": 11208542.0,
"step": 688
},
{
"entropy": 0.5161010921001434,
"epoch": 2.5719626168224297,
"grad_norm": 0.042589396238327026,
"learning_rate": 0.0002,
"loss": 0.5185222625732422,
"mean_token_accuracy": 0.7873405963182449,
"num_tokens": 11224578.0,
"step": 689
},
{
"entropy": 0.5410270541906357,
"epoch": 2.575700934579439,
"grad_norm": 0.05106229707598686,
"learning_rate": 0.0002,
"loss": 0.5452054142951965,
"mean_token_accuracy": 0.7787328362464905,
"num_tokens": 11240887.0,
"step": 690
},
{
"entropy": 0.5375277251005173,
"epoch": 2.5794392523364484,
"grad_norm": 0.03891480341553688,
"learning_rate": 0.0002,
"loss": 0.5347110033035278,
"mean_token_accuracy": 0.7833239287137985,
"num_tokens": 11256921.0,
"step": 691
},
{
"entropy": 0.5428935289382935,
"epoch": 2.583177570093458,
"grad_norm": 0.04642964154481888,
"learning_rate": 0.0002,
"loss": 0.5380253195762634,
"mean_token_accuracy": 0.7818872332572937,
"num_tokens": 11273253.0,
"step": 692
},
{
"entropy": 0.5503559708595276,
"epoch": 2.586915887850467,
"grad_norm": 0.04631572589278221,
"learning_rate": 0.0002,
"loss": 0.5499509572982788,
"mean_token_accuracy": 0.7778131514787674,
"num_tokens": 11289524.0,
"step": 693
},
{
"entropy": 0.5296535789966583,
"epoch": 2.5906542056074766,
"grad_norm": 0.04232152923941612,
"learning_rate": 0.0002,
"loss": 0.5292780995368958,
"mean_token_accuracy": 0.7848498374223709,
"num_tokens": 11305878.0,
"step": 694
},
{
"entropy": 0.5324369296431541,
"epoch": 2.594392523364486,
"grad_norm": 0.04305447265505791,
"learning_rate": 0.0002,
"loss": 0.5328658223152161,
"mean_token_accuracy": 0.7839655876159668,
"num_tokens": 11322266.0,
"step": 695
},
{
"entropy": 0.5353843569755554,
"epoch": 2.5981308411214954,
"grad_norm": 0.04098288714885712,
"learning_rate": 0.0002,
"loss": 0.5361748933792114,
"mean_token_accuracy": 0.7821073234081268,
"num_tokens": 11338684.0,
"step": 696
},
{
"entropy": 0.5268280059099197,
"epoch": 2.601869158878505,
"grad_norm": 0.05113406851887703,
"learning_rate": 0.0002,
"loss": 0.5360528230667114,
"mean_token_accuracy": 0.7813736945390701,
"num_tokens": 11354924.0,
"step": 697
},
{
"entropy": 0.5334519147872925,
"epoch": 2.605607476635514,
"grad_norm": 0.036048226058483124,
"learning_rate": 0.0002,
"loss": 0.5367494225502014,
"mean_token_accuracy": 0.782368615269661,
"num_tokens": 11371138.0,
"step": 698
},
{
"entropy": 0.5625623911619186,
"epoch": 2.6093457943925236,
"grad_norm": 0.04338160157203674,
"learning_rate": 0.0002,
"loss": 0.5562830567359924,
"mean_token_accuracy": 0.7749900668859482,
"num_tokens": 11387674.0,
"step": 699
},
{
"entropy": 0.5387382507324219,
"epoch": 2.613084112149533,
"grad_norm": 0.04549875482916832,
"learning_rate": 0.0002,
"loss": 0.5360974073410034,
"mean_token_accuracy": 0.781986802816391,
"num_tokens": 11403934.0,
"step": 700
},
{
"entropy": 0.5418427735567093,
"epoch": 2.616822429906542,
"grad_norm": 0.04425078630447388,
"learning_rate": 0.0002,
"loss": 0.5500712990760803,
"mean_token_accuracy": 0.7762207537889481,
"num_tokens": 11420207.0,
"step": 701
},
{
"entropy": 0.5345925241708755,
"epoch": 2.6205607476635513,
"grad_norm": 0.0503389798104763,
"learning_rate": 0.0002,
"loss": 0.5410506129264832,
"mean_token_accuracy": 0.7824158221483231,
"num_tokens": 11436366.0,
"step": 702
},
{
"entropy": 0.5293083861470222,
"epoch": 2.6242990654205607,
"grad_norm": 0.03849806264042854,
"learning_rate": 0.0002,
"loss": 0.5313189625740051,
"mean_token_accuracy": 0.7851823717355728,
"num_tokens": 11452692.0,
"step": 703
},
{
"entropy": 0.5381535738706589,
"epoch": 2.62803738317757,
"grad_norm": 0.04830117151141167,
"learning_rate": 0.0002,
"loss": 0.5306882262229919,
"mean_token_accuracy": 0.7875523120164871,
"num_tokens": 11468948.0,
"step": 704
},
{
"entropy": 0.5537677556276321,
"epoch": 2.6317757009345795,
"grad_norm": 0.03648355230689049,
"learning_rate": 0.0002,
"loss": 0.549413800239563,
"mean_token_accuracy": 0.7742456942796707,
"num_tokens": 11485304.0,
"step": 705
},
{
"entropy": 0.5376065969467163,
"epoch": 2.635514018691589,
"grad_norm": 0.03775647282600403,
"learning_rate": 0.0002,
"loss": 0.5347313284873962,
"mean_token_accuracy": 0.7820166647434235,
"num_tokens": 11501515.0,
"step": 706
},
{
"entropy": 0.5389592945575714,
"epoch": 2.6392523364485982,
"grad_norm": 0.03849456459283829,
"learning_rate": 0.0002,
"loss": 0.542040228843689,
"mean_token_accuracy": 0.7777668088674545,
"num_tokens": 11517823.0,
"step": 707
},
{
"entropy": 0.5297961235046387,
"epoch": 2.6429906542056076,
"grad_norm": 0.03884672373533249,
"learning_rate": 0.0002,
"loss": 0.5295203924179077,
"mean_token_accuracy": 0.7848687022924423,
"num_tokens": 11534089.0,
"step": 708
},
{
"entropy": 0.5374749451875687,
"epoch": 2.6467289719626166,
"grad_norm": 0.040985025465488434,
"learning_rate": 0.0002,
"loss": 0.5486632585525513,
"mean_token_accuracy": 0.7780227363109589,
"num_tokens": 11550404.0,
"step": 709
},
{
"entropy": 0.5216163545846939,
"epoch": 2.650467289719626,
"grad_norm": 0.041445303708314896,
"learning_rate": 0.0002,
"loss": 0.5271479487419128,
"mean_token_accuracy": 0.7851904779672623,
"num_tokens": 11566700.0,
"step": 710
},
{
"entropy": 0.548863023519516,
"epoch": 2.6542056074766354,
"grad_norm": 0.03768117353320122,
"learning_rate": 0.0002,
"loss": 0.5421991944313049,
"mean_token_accuracy": 0.7786275446414948,
"num_tokens": 11583296.0,
"step": 711
},
{
"entropy": 0.5540084540843964,
"epoch": 2.6579439252336448,
"grad_norm": 0.03594231605529785,
"learning_rate": 0.0002,
"loss": 0.5558887720108032,
"mean_token_accuracy": 0.775081142783165,
"num_tokens": 11599637.0,
"step": 712
},
{
"entropy": 0.528472974896431,
"epoch": 2.661682242990654,
"grad_norm": 0.03718520700931549,
"learning_rate": 0.0002,
"loss": 0.5246076583862305,
"mean_token_accuracy": 0.7852199673652649,
"num_tokens": 11615767.0,
"step": 713
},
{
"entropy": 0.546594500541687,
"epoch": 2.6654205607476635,
"grad_norm": 0.042944129556417465,
"learning_rate": 0.0002,
"loss": 0.5401133298873901,
"mean_token_accuracy": 0.7802519649267197,
"num_tokens": 11632056.0,
"step": 714
},
{
"entropy": 0.5382472574710846,
"epoch": 2.669158878504673,
"grad_norm": 0.04242360591888428,
"learning_rate": 0.0002,
"loss": 0.5468363761901855,
"mean_token_accuracy": 0.7763016223907471,
"num_tokens": 11648587.0,
"step": 715
},
{
"entropy": 0.5384316891431808,
"epoch": 2.6728971962616823,
"grad_norm": 0.04231888800859451,
"learning_rate": 0.0002,
"loss": 0.5447696447372437,
"mean_token_accuracy": 0.7771705389022827,
"num_tokens": 11665216.0,
"step": 716
},
{
"entropy": 0.536566972732544,
"epoch": 2.6766355140186917,
"grad_norm": 0.051330000162124634,
"learning_rate": 0.0002,
"loss": 0.5337138175964355,
"mean_token_accuracy": 0.7841814905405045,
"num_tokens": 11681565.0,
"step": 717
},
{
"entropy": 0.5605298280715942,
"epoch": 2.680373831775701,
"grad_norm": 0.04393962025642395,
"learning_rate": 0.0002,
"loss": 0.5522550344467163,
"mean_token_accuracy": 0.7745645940303802,
"num_tokens": 11697734.0,
"step": 718
},
{
"entropy": 0.5421400368213654,
"epoch": 2.6841121495327105,
"grad_norm": 0.04087737947702408,
"learning_rate": 0.0002,
"loss": 0.5356095433235168,
"mean_token_accuracy": 0.7823581695556641,
"num_tokens": 11714256.0,
"step": 719
},
{
"entropy": 0.5455932766199112,
"epoch": 2.68785046728972,
"grad_norm": 0.04586983844637871,
"learning_rate": 0.0002,
"loss": 0.5500515699386597,
"mean_token_accuracy": 0.7770348936319351,
"num_tokens": 11730670.0,
"step": 720
},
{
"entropy": 0.521054208278656,
"epoch": 2.691588785046729,
"grad_norm": 0.04511021822690964,
"learning_rate": 0.0002,
"loss": 0.5274732112884521,
"mean_token_accuracy": 0.7863785922527313,
"num_tokens": 11747011.0,
"step": 721
},
{
"entropy": 0.5369152277708054,
"epoch": 2.695327102803738,
"grad_norm": 0.04111414775252342,
"learning_rate": 0.0002,
"loss": 0.5466327667236328,
"mean_token_accuracy": 0.7800845950841904,
"num_tokens": 11763325.0,
"step": 722
},
{
"entropy": 0.5467284768819809,
"epoch": 2.6990654205607476,
"grad_norm": 0.04847726225852966,
"learning_rate": 0.0002,
"loss": 0.5574571490287781,
"mean_token_accuracy": 0.7709622234106064,
"num_tokens": 11779629.0,
"step": 723
},
{
"entropy": 0.556825578212738,
"epoch": 2.702803738317757,
"grad_norm": 0.04135042428970337,
"learning_rate": 0.0002,
"loss": 0.5567163228988647,
"mean_token_accuracy": 0.773699164390564,
"num_tokens": 11795735.0,
"step": 724
},
{
"entropy": 0.5429602861404419,
"epoch": 2.7065420560747664,
"grad_norm": 0.0402897410094738,
"learning_rate": 0.0002,
"loss": 0.5313383936882019,
"mean_token_accuracy": 0.7854284048080444,
"num_tokens": 11812127.0,
"step": 725
},
{
"entropy": 0.5411138385534286,
"epoch": 2.710280373831776,
"grad_norm": 0.04476531967520714,
"learning_rate": 0.0002,
"loss": 0.5395961403846741,
"mean_token_accuracy": 0.7811660319566727,
"num_tokens": 11828424.0,
"step": 726
},
{
"entropy": 0.5500029474496841,
"epoch": 2.714018691588785,
"grad_norm": 0.03904065489768982,
"learning_rate": 0.0002,
"loss": 0.5481054186820984,
"mean_token_accuracy": 0.7797027230262756,
"num_tokens": 11844904.0,
"step": 727
},
{
"entropy": 0.5594752728939056,
"epoch": 2.717757009345794,
"grad_norm": 0.04920347407460213,
"learning_rate": 0.0002,
"loss": 0.5654065012931824,
"mean_token_accuracy": 0.7703305035829544,
"num_tokens": 11861341.0,
"step": 728
},
{
"entropy": 0.5409399420022964,
"epoch": 2.7214953271028035,
"grad_norm": 0.04093843698501587,
"learning_rate": 0.0002,
"loss": 0.5432956218719482,
"mean_token_accuracy": 0.7790299355983734,
"num_tokens": 11877689.0,
"step": 729
},
{
"entropy": 0.5429576933383942,
"epoch": 2.725233644859813,
"grad_norm": 0.049346111714839935,
"learning_rate": 0.0002,
"loss": 0.55011385679245,
"mean_token_accuracy": 0.77861687541008,
"num_tokens": 11893814.0,
"step": 730
},
{
"entropy": 0.5407661944627762,
"epoch": 2.7289719626168223,
"grad_norm": 0.0420721061527729,
"learning_rate": 0.0002,
"loss": 0.5426504015922546,
"mean_token_accuracy": 0.7803787589073181,
"num_tokens": 11910096.0,
"step": 731
},
{
"entropy": 0.5468227863311768,
"epoch": 2.7327102803738317,
"grad_norm": 0.0373503714799881,
"learning_rate": 0.0002,
"loss": 0.5417306423187256,
"mean_token_accuracy": 0.782159686088562,
"num_tokens": 11926285.0,
"step": 732
},
{
"entropy": 0.5427874177694321,
"epoch": 2.736448598130841,
"grad_norm": 0.041012153029441833,
"learning_rate": 0.0002,
"loss": 0.5334447622299194,
"mean_token_accuracy": 0.7827651649713516,
"num_tokens": 11942656.0,
"step": 733
},
{
"entropy": 0.5550535768270493,
"epoch": 2.7401869158878505,
"grad_norm": 0.03842266649007797,
"learning_rate": 0.0002,
"loss": 0.5497796535491943,
"mean_token_accuracy": 0.7729970514774323,
"num_tokens": 11959059.0,
"step": 734
},
{
"entropy": 0.5359070003032684,
"epoch": 2.74392523364486,
"grad_norm": 0.039268966764211655,
"learning_rate": 0.0002,
"loss": 0.5411967039108276,
"mean_token_accuracy": 0.7831978797912598,
"num_tokens": 11975265.0,
"step": 735
},
{
"entropy": 0.5536347031593323,
"epoch": 2.7476635514018692,
"grad_norm": 0.045411862432956696,
"learning_rate": 0.0002,
"loss": 0.5618187189102173,
"mean_token_accuracy": 0.7741181403398514,
"num_tokens": 11991498.0,
"step": 736
},
{
"entropy": 0.5233520418405533,
"epoch": 2.7514018691588786,
"grad_norm": 0.040144748985767365,
"learning_rate": 0.0002,
"loss": 0.5300607681274414,
"mean_token_accuracy": 0.7847813218832016,
"num_tokens": 12007487.0,
"step": 737
},
{
"entropy": 0.5281567052006721,
"epoch": 2.755140186915888,
"grad_norm": 0.04088376462459564,
"learning_rate": 0.0002,
"loss": 0.5294374823570251,
"mean_token_accuracy": 0.7852809429168701,
"num_tokens": 12023900.0,
"step": 738
},
{
"entropy": 0.5510239601135254,
"epoch": 2.7588785046728974,
"grad_norm": 0.04011458903551102,
"learning_rate": 0.0002,
"loss": 0.5465855002403259,
"mean_token_accuracy": 0.7779260277748108,
"num_tokens": 12040338.0,
"step": 739
},
{
"entropy": 0.57439024746418,
"epoch": 2.762616822429907,
"grad_norm": 0.036590199917554855,
"learning_rate": 0.0002,
"loss": 0.5653122663497925,
"mean_token_accuracy": 0.7694305032491684,
"num_tokens": 12056958.0,
"step": 740
},
{
"entropy": 0.5615127831697464,
"epoch": 2.7663551401869158,
"grad_norm": 0.036815449595451355,
"learning_rate": 0.0002,
"loss": 0.550983190536499,
"mean_token_accuracy": 0.7743483930826187,
"num_tokens": 12073644.0,
"step": 741
},
{
"entropy": 0.5349987298250198,
"epoch": 2.770093457943925,
"grad_norm": 0.03783464804291725,
"learning_rate": 0.0002,
"loss": 0.5378219485282898,
"mean_token_accuracy": 0.7834212332963943,
"num_tokens": 12090085.0,
"step": 742
},
{
"entropy": 0.5288607105612755,
"epoch": 2.7738317757009345,
"grad_norm": 0.047371115535497665,
"learning_rate": 0.0002,
"loss": 0.5444093346595764,
"mean_token_accuracy": 0.7794700562953949,
"num_tokens": 12106341.0,
"step": 743
},
{
"entropy": 0.5414262413978577,
"epoch": 2.777570093457944,
"grad_norm": 0.04306622967123985,
"learning_rate": 0.0002,
"loss": 0.548575222492218,
"mean_token_accuracy": 0.7780982106924057,
"num_tokens": 12122689.0,
"step": 744
},
{
"entropy": 0.5265444070100784,
"epoch": 2.7813084112149533,
"grad_norm": 0.038641780614852905,
"learning_rate": 0.0002,
"loss": 0.5287938117980957,
"mean_token_accuracy": 0.7837643325328827,
"num_tokens": 12138802.0,
"step": 745
},
{
"entropy": 0.5466189384460449,
"epoch": 2.7850467289719627,
"grad_norm": 0.0338594987988472,
"learning_rate": 0.0002,
"loss": 0.5439702272415161,
"mean_token_accuracy": 0.7782793641090393,
"num_tokens": 12154981.0,
"step": 746
},
{
"entropy": 0.5158288925886154,
"epoch": 2.788785046728972,
"grad_norm": 0.040148280560970306,
"learning_rate": 0.0002,
"loss": 0.5098775625228882,
"mean_token_accuracy": 0.7936903238296509,
"num_tokens": 12171278.0,
"step": 747
},
{
"entropy": 0.5605306029319763,
"epoch": 2.792523364485981,
"grad_norm": 0.03989556431770325,
"learning_rate": 0.0002,
"loss": 0.5507832169532776,
"mean_token_accuracy": 0.7760983258485794,
"num_tokens": 12187732.0,
"step": 748
},
{
"entropy": 0.561933159828186,
"epoch": 2.7962616822429904,
"grad_norm": 0.04341628775000572,
"learning_rate": 0.0002,
"loss": 0.5628443956375122,
"mean_token_accuracy": 0.7725982367992401,
"num_tokens": 12204073.0,
"step": 749
},
{
"entropy": 0.5275013446807861,
"epoch": 2.8,
"grad_norm": 0.04758904501795769,
"learning_rate": 0.0002,
"loss": 0.5401396751403809,
"mean_token_accuracy": 0.7802035212516785,
"num_tokens": 12220319.0,
"step": 750
},
{
"entropy": 0.5415465384721756,
"epoch": 2.803738317757009,
"grad_norm": 0.04323052614927292,
"learning_rate": 0.0002,
"loss": 0.5467565059661865,
"mean_token_accuracy": 0.7801296561956406,
"num_tokens": 12236798.0,
"step": 751
},
{
"entropy": 0.5384011566638947,
"epoch": 2.8074766355140186,
"grad_norm": 0.04094940423965454,
"learning_rate": 0.0002,
"loss": 0.5408844947814941,
"mean_token_accuracy": 0.7790292948484421,
"num_tokens": 12253226.0,
"step": 752
},
{
"entropy": 0.5556510388851166,
"epoch": 2.811214953271028,
"grad_norm": 0.037975817918777466,
"learning_rate": 0.0002,
"loss": 0.5480787754058838,
"mean_token_accuracy": 0.7771931290626526,
"num_tokens": 12269489.0,
"step": 753
},
{
"entropy": 0.5475790053606033,
"epoch": 2.8149532710280374,
"grad_norm": 0.041421882808208466,
"learning_rate": 0.0002,
"loss": 0.5383135676383972,
"mean_token_accuracy": 0.7827092558145523,
"num_tokens": 12285892.0,
"step": 754
},
{
"entropy": 0.5555797815322876,
"epoch": 2.8186915887850468,
"grad_norm": 0.03941413015127182,
"learning_rate": 0.0002,
"loss": 0.552151083946228,
"mean_token_accuracy": 0.7751595675945282,
"num_tokens": 12302269.0,
"step": 755
},
{
"entropy": 0.5256431847810745,
"epoch": 2.822429906542056,
"grad_norm": 0.040782686322927475,
"learning_rate": 0.0002,
"loss": 0.5262829661369324,
"mean_token_accuracy": 0.7846409976482391,
"num_tokens": 12318521.0,
"step": 756
},
{
"entropy": 0.538894459605217,
"epoch": 2.8261682242990656,
"grad_norm": 0.052266813814640045,
"learning_rate": 0.0002,
"loss": 0.5539013147354126,
"mean_token_accuracy": 0.7756392508745193,
"num_tokens": 12334819.0,
"step": 757
},
{
"entropy": 0.5483682453632355,
"epoch": 2.829906542056075,
"grad_norm": 0.04095127433538437,
"learning_rate": 0.0002,
"loss": 0.5520408749580383,
"mean_token_accuracy": 0.7747367471456528,
"num_tokens": 12351218.0,
"step": 758
},
{
"entropy": 0.5276503935456276,
"epoch": 2.8336448598130843,
"grad_norm": 0.04603305831551552,
"learning_rate": 0.0002,
"loss": 0.5317422151565552,
"mean_token_accuracy": 0.780977338552475,
"num_tokens": 12367390.0,
"step": 759
},
{
"entropy": 0.5502448529005051,
"epoch": 2.8373831775700937,
"grad_norm": 0.04640703275799751,
"learning_rate": 0.0002,
"loss": 0.5535072684288025,
"mean_token_accuracy": 0.7761691957712173,
"num_tokens": 12383960.0,
"step": 760
},
{
"entropy": 0.547056645154953,
"epoch": 2.8411214953271027,
"grad_norm": 0.033438824117183685,
"learning_rate": 0.0002,
"loss": 0.5412831902503967,
"mean_token_accuracy": 0.7795712947845459,
"num_tokens": 12400550.0,
"step": 761
},
{
"entropy": 0.5364657193422318,
"epoch": 2.844859813084112,
"grad_norm": 0.04271340370178223,
"learning_rate": 0.0002,
"loss": 0.5346530079841614,
"mean_token_accuracy": 0.7835509330034256,
"num_tokens": 12417061.0,
"step": 762
},
{
"entropy": 0.5455985963344574,
"epoch": 2.8485981308411215,
"grad_norm": 0.03856063261628151,
"learning_rate": 0.0002,
"loss": 0.5402116179466248,
"mean_token_accuracy": 0.7816472351551056,
"num_tokens": 12433548.0,
"step": 763
},
{
"entropy": 0.532633364200592,
"epoch": 2.852336448598131,
"grad_norm": 0.039442550390958786,
"learning_rate": 0.0002,
"loss": 0.5322520732879639,
"mean_token_accuracy": 0.783360943198204,
"num_tokens": 12449702.0,
"step": 764
},
{
"entropy": 0.5533113479614258,
"epoch": 2.8560747663551402,
"grad_norm": 0.03981044888496399,
"learning_rate": 0.0002,
"loss": 0.5526716113090515,
"mean_token_accuracy": 0.7752720266580582,
"num_tokens": 12465797.0,
"step": 765
},
{
"entropy": 0.5458943992853165,
"epoch": 2.8598130841121496,
"grad_norm": 0.043415430933237076,
"learning_rate": 0.0002,
"loss": 0.5514388084411621,
"mean_token_accuracy": 0.7782578617334366,
"num_tokens": 12482100.0,
"step": 766
},
{
"entropy": 0.5316417217254639,
"epoch": 2.863551401869159,
"grad_norm": 0.03658653050661087,
"learning_rate": 0.0002,
"loss": 0.5376189947128296,
"mean_token_accuracy": 0.7812371999025345,
"num_tokens": 12498442.0,
"step": 767
},
{
"entropy": 0.5365964025259018,
"epoch": 2.867289719626168,
"grad_norm": 0.04015335068106651,
"learning_rate": 0.0002,
"loss": 0.5381023287773132,
"mean_token_accuracy": 0.7802128046751022,
"num_tokens": 12514722.0,
"step": 768
},
{
"entropy": 0.5392501503229141,
"epoch": 2.8710280373831774,
"grad_norm": 0.04526032134890556,
"learning_rate": 0.0002,
"loss": 0.5440354347229004,
"mean_token_accuracy": 0.7788137197494507,
"num_tokens": 12531173.0,
"step": 769
},
{
"entropy": 0.5416650772094727,
"epoch": 2.8747663551401867,
"grad_norm": 0.03573603555560112,
"learning_rate": 0.0002,
"loss": 0.5344440340995789,
"mean_token_accuracy": 0.782467320561409,
"num_tokens": 12547297.0,
"step": 770
},
{
"entropy": 0.537946805357933,
"epoch": 2.878504672897196,
"grad_norm": 0.043754760175943375,
"learning_rate": 0.0002,
"loss": 0.5369762778282166,
"mean_token_accuracy": 0.7813331335783005,
"num_tokens": 12563639.0,
"step": 771
},
{
"entropy": 0.5417525321245193,
"epoch": 2.8822429906542055,
"grad_norm": 0.03892975300550461,
"learning_rate": 0.0002,
"loss": 0.5408830642700195,
"mean_token_accuracy": 0.7807131111621857,
"num_tokens": 12579951.0,
"step": 772
},
{
"entropy": 0.5286070853471756,
"epoch": 2.885981308411215,
"grad_norm": 0.041709210723638535,
"learning_rate": 0.0002,
"loss": 0.5315775275230408,
"mean_token_accuracy": 0.7836516797542572,
"num_tokens": 12596427.0,
"step": 773
},
{
"entropy": 0.5347200036048889,
"epoch": 2.8897196261682243,
"grad_norm": 0.04162106290459633,
"learning_rate": 0.0002,
"loss": 0.5488803386688232,
"mean_token_accuracy": 0.7781624644994736,
"num_tokens": 12612693.0,
"step": 774
},
{
"entropy": 0.5630818009376526,
"epoch": 2.8934579439252337,
"grad_norm": 0.03779264912009239,
"learning_rate": 0.0002,
"loss": 0.5618957281112671,
"mean_token_accuracy": 0.7714088261127472,
"num_tokens": 12629093.0,
"step": 775
},
{
"entropy": 0.5579015165567398,
"epoch": 2.897196261682243,
"grad_norm": 0.04071388393640518,
"learning_rate": 0.0002,
"loss": 0.5509809255599976,
"mean_token_accuracy": 0.7759078145027161,
"num_tokens": 12645440.0,
"step": 776
},
{
"entropy": 0.5593527257442474,
"epoch": 2.9009345794392525,
"grad_norm": 0.041921358555555344,
"learning_rate": 0.0002,
"loss": 0.5505045056343079,
"mean_token_accuracy": 0.7758798003196716,
"num_tokens": 12661819.0,
"step": 777
},
{
"entropy": 0.5402603298425674,
"epoch": 2.904672897196262,
"grad_norm": 0.03740124776959419,
"learning_rate": 0.0002,
"loss": 0.5350624322891235,
"mean_token_accuracy": 0.7829450070858002,
"num_tokens": 12678029.0,
"step": 778
},
{
"entropy": 0.5501836538314819,
"epoch": 2.9084112149532713,
"grad_norm": 0.03699700906872749,
"learning_rate": 0.0002,
"loss": 0.5496166944503784,
"mean_token_accuracy": 0.7787871360778809,
"num_tokens": 12694566.0,
"step": 779
},
{
"entropy": 0.5449737459421158,
"epoch": 2.91214953271028,
"grad_norm": 0.03947729989886284,
"learning_rate": 0.0002,
"loss": 0.5487996935844421,
"mean_token_accuracy": 0.7771195471286774,
"num_tokens": 12711096.0,
"step": 780
},
{
"entropy": 0.509773313999176,
"epoch": 2.9158878504672896,
"grad_norm": 0.04015858471393585,
"learning_rate": 0.0002,
"loss": 0.5180044174194336,
"mean_token_accuracy": 0.7871870398521423,
"num_tokens": 12727181.0,
"step": 781
},
{
"entropy": 0.5145790874958038,
"epoch": 2.919626168224299,
"grad_norm": 0.04480452463030815,
"learning_rate": 0.0002,
"loss": 0.517657995223999,
"mean_token_accuracy": 0.7905906438827515,
"num_tokens": 12743263.0,
"step": 782
},
{
"entropy": 0.536189079284668,
"epoch": 2.9233644859813084,
"grad_norm": 0.0368233323097229,
"learning_rate": 0.0002,
"loss": 0.5374237895011902,
"mean_token_accuracy": 0.7814907878637314,
"num_tokens": 12759582.0,
"step": 783
},
{
"entropy": 0.5301052629947662,
"epoch": 2.9271028037383178,
"grad_norm": 0.036369625478982925,
"learning_rate": 0.0002,
"loss": 0.5254780054092407,
"mean_token_accuracy": 0.7876885831356049,
"num_tokens": 12775680.0,
"step": 784
},
{
"entropy": 0.5395437628030777,
"epoch": 2.930841121495327,
"grad_norm": 0.037106823176145554,
"learning_rate": 0.0002,
"loss": 0.5353831648826599,
"mean_token_accuracy": 0.7856823652982712,
"num_tokens": 12791849.0,
"step": 785
},
{
"entropy": 0.5460378974676132,
"epoch": 2.9345794392523366,
"grad_norm": 0.0374838188290596,
"learning_rate": 0.0002,
"loss": 0.5441444516181946,
"mean_token_accuracy": 0.7800013571977615,
"num_tokens": 12808470.0,
"step": 786
},
{
"entropy": 0.5510992407798767,
"epoch": 2.938317757009346,
"grad_norm": 0.03663073852658272,
"learning_rate": 0.0002,
"loss": 0.5466246604919434,
"mean_token_accuracy": 0.7789618521928787,
"num_tokens": 12824709.0,
"step": 787
},
{
"entropy": 0.5445446521043777,
"epoch": 2.942056074766355,
"grad_norm": 0.03850307688117027,
"learning_rate": 0.0002,
"loss": 0.5457326769828796,
"mean_token_accuracy": 0.779052123427391,
"num_tokens": 12841079.0,
"step": 788
},
{
"entropy": 0.5365033894777298,
"epoch": 2.9457943925233643,
"grad_norm": 0.04035929962992668,
"learning_rate": 0.0002,
"loss": 0.5459482073783875,
"mean_token_accuracy": 0.7797062546014786,
"num_tokens": 12857523.0,
"step": 789
},
{
"entropy": 0.535067155957222,
"epoch": 2.9495327102803737,
"grad_norm": 0.04887193441390991,
"learning_rate": 0.0002,
"loss": 0.5398947596549988,
"mean_token_accuracy": 0.7823842316865921,
"num_tokens": 12874241.0,
"step": 790
},
{
"entropy": 0.5346145331859589,
"epoch": 2.953271028037383,
"grad_norm": 0.03713555634021759,
"learning_rate": 0.0002,
"loss": 0.5383285880088806,
"mean_token_accuracy": 0.7822743952274323,
"num_tokens": 12890347.0,
"step": 791
},
{
"entropy": 0.5538973659276962,
"epoch": 2.9570093457943925,
"grad_norm": 0.042103007435798645,
"learning_rate": 0.0002,
"loss": 0.5548110604286194,
"mean_token_accuracy": 0.7737681418657303,
"num_tokens": 12906728.0,
"step": 792
},
{
"entropy": 0.5500922650098801,
"epoch": 2.960747663551402,
"grad_norm": 0.03705638647079468,
"learning_rate": 0.0002,
"loss": 0.5455094575881958,
"mean_token_accuracy": 0.7803948670625687,
"num_tokens": 12923166.0,
"step": 793
},
{
"entropy": 0.562080979347229,
"epoch": 2.9644859813084112,
"grad_norm": 0.045153554528951645,
"learning_rate": 0.0002,
"loss": 0.5568199157714844,
"mean_token_accuracy": 0.7736331224441528,
"num_tokens": 12939504.0,
"step": 794
},
{
"entropy": 0.5559557229280472,
"epoch": 2.9682242990654206,
"grad_norm": 0.04255378246307373,
"learning_rate": 0.0002,
"loss": 0.5531718134880066,
"mean_token_accuracy": 0.7762871235609055,
"num_tokens": 12955898.0,
"step": 795
},
{
"entropy": 0.5435759872198105,
"epoch": 2.97196261682243,
"grad_norm": 0.03799128159880638,
"learning_rate": 0.0002,
"loss": 0.5441620349884033,
"mean_token_accuracy": 0.7793318778276443,
"num_tokens": 12972346.0,
"step": 796
},
{
"entropy": 0.5359157919883728,
"epoch": 2.9757009345794394,
"grad_norm": 0.05715997889637947,
"learning_rate": 0.0002,
"loss": 0.5515891909599304,
"mean_token_accuracy": 0.7771831452846527,
"num_tokens": 12988848.0,
"step": 797
},
{
"entropy": 0.5230652317404747,
"epoch": 2.979439252336449,
"grad_norm": 0.04036436975002289,
"learning_rate": 0.0002,
"loss": 0.5234889388084412,
"mean_token_accuracy": 0.7856348752975464,
"num_tokens": 13004832.0,
"step": 798
},
{
"entropy": 0.5457260459661484,
"epoch": 2.983177570093458,
"grad_norm": 0.04120893031358719,
"learning_rate": 0.0002,
"loss": 0.5378625392913818,
"mean_token_accuracy": 0.7840824872255325,
"num_tokens": 13021226.0,
"step": 799
},
{
"entropy": 0.5480275601148605,
"epoch": 2.986915887850467,
"grad_norm": 0.050067413598299026,
"learning_rate": 0.0002,
"loss": 0.5414943099021912,
"mean_token_accuracy": 0.7796735763549805,
"num_tokens": 13037664.0,
"step": 800
},
{
"entropy": 0.5385295897722244,
"epoch": 2.9906542056074765,
"grad_norm": 0.03477542847394943,
"learning_rate": 0.0002,
"loss": 0.5353237390518188,
"mean_token_accuracy": 0.7814339101314545,
"num_tokens": 13053836.0,
"step": 801
},
{
"entropy": 0.5408166199922562,
"epoch": 2.994392523364486,
"grad_norm": 0.038822371512651443,
"learning_rate": 0.0002,
"loss": 0.5407392382621765,
"mean_token_accuracy": 0.7796344310045242,
"num_tokens": 13070132.0,
"step": 802
},
{
"entropy": 0.533338338136673,
"epoch": 2.9981308411214953,
"grad_norm": 0.04834038019180298,
"learning_rate": 0.0002,
"loss": 0.5456323027610779,
"mean_token_accuracy": 0.7770627439022064,
"num_tokens": 13086317.0,
"step": 803
},
{
"entropy": 0.520211398601532,
"epoch": 3.0,
"grad_norm": 0.04815197363495827,
"learning_rate": 0.0002,
"loss": 0.5207195281982422,
"mean_token_accuracy": 0.7871742844581604,
"num_tokens": 13094581.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2192829660484076e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}