olmo2_1b_sft_plus_200 / trainer_state.json
suzeva's picture
Upload folder using huggingface_hub
a047379 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4508,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.496292558312416,
"epoch": 0.0022186477342060014,
"grad_norm": 0.6640625,
"learning_rate": 4.990017746228927e-05,
"loss": 1.0869,
"mean_token_accuracy": 0.725723172724247,
"num_tokens": 925119.0,
"step": 10
},
{
"entropy": 1.49339357316494,
"epoch": 0.004437295468412003,
"grad_norm": 0.796875,
"learning_rate": 4.978926353149956e-05,
"loss": 1.0992,
"mean_token_accuracy": 0.72515804246068,
"num_tokens": 1846450.0,
"step": 20
},
{
"entropy": 1.4267458260059356,
"epoch": 0.006655943202618004,
"grad_norm": 0.57421875,
"learning_rate": 4.967834960070985e-05,
"loss": 1.0639,
"mean_token_accuracy": 0.7327800326049327,
"num_tokens": 2786157.0,
"step": 30
},
{
"entropy": 1.409786120057106,
"epoch": 0.008874590936824005,
"grad_norm": 0.57421875,
"learning_rate": 4.9567435669920145e-05,
"loss": 1.0149,
"mean_token_accuracy": 0.7418569244444371,
"num_tokens": 3732821.0,
"step": 40
},
{
"entropy": 1.412144237756729,
"epoch": 0.011093238671030008,
"grad_norm": 0.5703125,
"learning_rate": 4.945652173913044e-05,
"loss": 1.0482,
"mean_token_accuracy": 0.7331000074744225,
"num_tokens": 4691429.0,
"step": 50
},
{
"entropy": 1.434348814189434,
"epoch": 0.013311886405236008,
"grad_norm": 0.55859375,
"learning_rate": 4.934560780834073e-05,
"loss": 1.0588,
"mean_token_accuracy": 0.7321462295949459,
"num_tokens": 5627667.0,
"step": 60
},
{
"entropy": 1.4207842454314232,
"epoch": 0.01553053413944201,
"grad_norm": 0.58203125,
"learning_rate": 4.923469387755102e-05,
"loss": 1.0427,
"mean_token_accuracy": 0.7373643882572651,
"num_tokens": 6554855.0,
"step": 70
},
{
"entropy": 1.403839322924614,
"epoch": 0.01774918187364801,
"grad_norm": 0.5703125,
"learning_rate": 4.9123779946761314e-05,
"loss": 1.0559,
"mean_token_accuracy": 0.7321439690887928,
"num_tokens": 7519970.0,
"step": 80
},
{
"entropy": 1.4707388430833817,
"epoch": 0.019967829607854013,
"grad_norm": 0.5625,
"learning_rate": 4.9012866015971606e-05,
"loss": 1.0769,
"mean_token_accuracy": 0.7276073284447193,
"num_tokens": 8448434.0,
"step": 90
},
{
"entropy": 1.4391525745391847,
"epoch": 0.022186477342060015,
"grad_norm": 1.671875,
"learning_rate": 4.8901952085181905e-05,
"loss": 1.0773,
"mean_token_accuracy": 0.7285938866436481,
"num_tokens": 9372876.0,
"step": 100
},
{
"entropy": 1.4438337981700897,
"epoch": 0.024405125076266018,
"grad_norm": 0.515625,
"learning_rate": 4.87910381543922e-05,
"loss": 1.0468,
"mean_token_accuracy": 0.7326018497347832,
"num_tokens": 10316544.0,
"step": 110
},
{
"entropy": 1.4177550919353963,
"epoch": 0.026623772810472016,
"grad_norm": 0.5859375,
"learning_rate": 4.868012422360249e-05,
"loss": 1.0585,
"mean_token_accuracy": 0.7324540324509143,
"num_tokens": 11247187.0,
"step": 120
},
{
"entropy": 1.471057978272438,
"epoch": 0.02884242054467802,
"grad_norm": 0.6171875,
"learning_rate": 4.856921029281278e-05,
"loss": 1.0761,
"mean_token_accuracy": 0.7275320313870907,
"num_tokens": 12173265.0,
"step": 130
},
{
"entropy": 1.3705433815717698,
"epoch": 0.03106106827888402,
"grad_norm": 0.57421875,
"learning_rate": 4.845829636202307e-05,
"loss": 0.9901,
"mean_token_accuracy": 0.74607959613204,
"num_tokens": 13130818.0,
"step": 140
},
{
"entropy": 1.3908753886818885,
"epoch": 0.03327971601309002,
"grad_norm": 0.58203125,
"learning_rate": 4.8347382431233365e-05,
"loss": 1.0128,
"mean_token_accuracy": 0.7397120602428913,
"num_tokens": 14082298.0,
"step": 150
},
{
"entropy": 1.3889487609267235,
"epoch": 0.03549836374729602,
"grad_norm": 0.58203125,
"learning_rate": 4.823646850044366e-05,
"loss": 1.0346,
"mean_token_accuracy": 0.7391657814383507,
"num_tokens": 15020735.0,
"step": 160
},
{
"entropy": 1.3904974788427353,
"epoch": 0.03771701148150203,
"grad_norm": 0.6484375,
"learning_rate": 4.812555456965395e-05,
"loss": 1.0208,
"mean_token_accuracy": 0.7383547216653824,
"num_tokens": 15966964.0,
"step": 170
},
{
"entropy": 1.4144786164164542,
"epoch": 0.039935659215708026,
"grad_norm": 0.55078125,
"learning_rate": 4.801464063886424e-05,
"loss": 1.0335,
"mean_token_accuracy": 0.7372442841529846,
"num_tokens": 16859851.0,
"step": 180
},
{
"entropy": 1.4213875874876976,
"epoch": 0.042154306949914025,
"grad_norm": 0.609375,
"learning_rate": 4.7903726708074534e-05,
"loss": 1.03,
"mean_token_accuracy": 0.735869013518095,
"num_tokens": 17767909.0,
"step": 190
},
{
"entropy": 1.4285096868872642,
"epoch": 0.04437295468412003,
"grad_norm": 0.54296875,
"learning_rate": 4.7792812777284826e-05,
"loss": 1.0358,
"mean_token_accuracy": 0.7385697923600674,
"num_tokens": 18734689.0,
"step": 200
},
{
"entropy": 1.4109881177544594,
"epoch": 0.04659160241832603,
"grad_norm": 0.5703125,
"learning_rate": 4.768189884649512e-05,
"loss": 1.0404,
"mean_token_accuracy": 0.7372543781995773,
"num_tokens": 19661481.0,
"step": 210
},
{
"entropy": 1.3780835449695588,
"epoch": 0.048810250152532035,
"grad_norm": 0.546875,
"learning_rate": 4.757098491570541e-05,
"loss": 1.0115,
"mean_token_accuracy": 0.7417904600501061,
"num_tokens": 20586364.0,
"step": 220
},
{
"entropy": 1.3467031195759773,
"epoch": 0.051028897886738034,
"grad_norm": 0.52734375,
"learning_rate": 4.74600709849157e-05,
"loss": 0.9833,
"mean_token_accuracy": 0.7490709364414215,
"num_tokens": 21547442.0,
"step": 230
},
{
"entropy": 1.424035993218422,
"epoch": 0.05324754562094403,
"grad_norm": 0.57421875,
"learning_rate": 4.7349157054126e-05,
"loss": 1.0266,
"mean_token_accuracy": 0.7366823427379131,
"num_tokens": 22502716.0,
"step": 240
},
{
"entropy": 1.409425649046898,
"epoch": 0.05546619335515004,
"grad_norm": 0.6640625,
"learning_rate": 4.7238243123336293e-05,
"loss": 1.0304,
"mean_token_accuracy": 0.7378800459206104,
"num_tokens": 23426781.0,
"step": 250
},
{
"entropy": 1.4083105452358722,
"epoch": 0.05768484108935604,
"grad_norm": 0.6015625,
"learning_rate": 4.7127329192546586e-05,
"loss": 1.0399,
"mean_token_accuracy": 0.7385689981281758,
"num_tokens": 24379852.0,
"step": 260
},
{
"entropy": 1.3957985565066338,
"epoch": 0.059903488823562036,
"grad_norm": 0.59765625,
"learning_rate": 4.701641526175688e-05,
"loss": 1.0115,
"mean_token_accuracy": 0.7397762380540371,
"num_tokens": 25322772.0,
"step": 270
},
{
"entropy": 1.376178003847599,
"epoch": 0.06212213655776804,
"grad_norm": 0.56640625,
"learning_rate": 4.690550133096717e-05,
"loss": 1.0075,
"mean_token_accuracy": 0.7439929395914078,
"num_tokens": 26287748.0,
"step": 280
},
{
"entropy": 1.3666997633874416,
"epoch": 0.06434078429197404,
"grad_norm": 0.58203125,
"learning_rate": 4.679458740017746e-05,
"loss": 0.9893,
"mean_token_accuracy": 0.7455881536006927,
"num_tokens": 27239049.0,
"step": 290
},
{
"entropy": 1.3740440711379052,
"epoch": 0.06655943202618005,
"grad_norm": 0.578125,
"learning_rate": 4.6683673469387754e-05,
"loss": 1.0038,
"mean_token_accuracy": 0.7433481432497502,
"num_tokens": 28209460.0,
"step": 300
},
{
"entropy": 1.3969299167394638,
"epoch": 0.06877807976038605,
"grad_norm": 0.546875,
"learning_rate": 4.6572759538598046e-05,
"loss": 1.0058,
"mean_token_accuracy": 0.7407899357378482,
"num_tokens": 29152647.0,
"step": 310
},
{
"entropy": 1.4039423167705536,
"epoch": 0.07099672749459204,
"grad_norm": 0.5625,
"learning_rate": 4.646184560780834e-05,
"loss": 1.0078,
"mean_token_accuracy": 0.7412165470421315,
"num_tokens": 30065474.0,
"step": 320
},
{
"entropy": 1.3215527072548867,
"epoch": 0.07321537522879805,
"grad_norm": 0.5625,
"learning_rate": 4.635093167701863e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.752603680640459,
"num_tokens": 31019135.0,
"step": 330
},
{
"entropy": 1.4225746989250183,
"epoch": 0.07543402296300405,
"grad_norm": 0.6015625,
"learning_rate": 4.624001774622893e-05,
"loss": 1.0285,
"mean_token_accuracy": 0.7391470916569233,
"num_tokens": 31939707.0,
"step": 340
},
{
"entropy": 1.3544291421771049,
"epoch": 0.07765267069721005,
"grad_norm": 0.546875,
"learning_rate": 4.612910381543922e-05,
"loss": 0.968,
"mean_token_accuracy": 0.7495346136391163,
"num_tokens": 32871530.0,
"step": 350
},
{
"entropy": 1.3839760735630988,
"epoch": 0.07987131843141605,
"grad_norm": 0.59375,
"learning_rate": 4.6018189884649514e-05,
"loss": 0.9895,
"mean_token_accuracy": 0.7453494131565094,
"num_tokens": 33813137.0,
"step": 360
},
{
"entropy": 1.414746204763651,
"epoch": 0.08208996616562206,
"grad_norm": 0.609375,
"learning_rate": 4.5907275953859806e-05,
"loss": 1.025,
"mean_token_accuracy": 0.7388280339539051,
"num_tokens": 34771343.0,
"step": 370
},
{
"entropy": 1.3732567429542542,
"epoch": 0.08430861389982805,
"grad_norm": 0.55859375,
"learning_rate": 4.57963620230701e-05,
"loss": 1.0058,
"mean_token_accuracy": 0.7429468773305417,
"num_tokens": 35713295.0,
"step": 380
},
{
"entropy": 1.3798431143164636,
"epoch": 0.08652726163403406,
"grad_norm": 0.62109375,
"learning_rate": 4.568544809228039e-05,
"loss": 1.0094,
"mean_token_accuracy": 0.7400036215782165,
"num_tokens": 36651252.0,
"step": 390
},
{
"entropy": 1.3924534171819687,
"epoch": 0.08874590936824006,
"grad_norm": 0.5625,
"learning_rate": 4.557453416149068e-05,
"loss": 0.9963,
"mean_token_accuracy": 0.7455349668860436,
"num_tokens": 37592125.0,
"step": 400
},
{
"entropy": 1.4052727609872817,
"epoch": 0.09096455710244605,
"grad_norm": 0.55859375,
"learning_rate": 4.5463620230700974e-05,
"loss": 1.0201,
"mean_token_accuracy": 0.7395130477845668,
"num_tokens": 38534139.0,
"step": 410
},
{
"entropy": 1.418926975131035,
"epoch": 0.09318320483665206,
"grad_norm": 0.59765625,
"learning_rate": 4.5352706299911266e-05,
"loss": 1.0096,
"mean_token_accuracy": 0.7400068089365959,
"num_tokens": 39478516.0,
"step": 420
},
{
"entropy": 1.3901023603975773,
"epoch": 0.09540185257085806,
"grad_norm": 0.54296875,
"learning_rate": 4.5241792369121565e-05,
"loss": 1.0051,
"mean_token_accuracy": 0.7408906109631062,
"num_tokens": 40443771.0,
"step": 430
},
{
"entropy": 1.3361934393644332,
"epoch": 0.09762050030506407,
"grad_norm": 0.578125,
"learning_rate": 4.513087843833186e-05,
"loss": 0.9356,
"mean_token_accuracy": 0.7576181195676327,
"num_tokens": 41385687.0,
"step": 440
},
{
"entropy": 1.4129181623458862,
"epoch": 0.09983914803927006,
"grad_norm": 0.578125,
"learning_rate": 4.501996450754215e-05,
"loss": 1.0427,
"mean_token_accuracy": 0.7341138951480388,
"num_tokens": 42316542.0,
"step": 450
},
{
"entropy": 1.3746674314141274,
"epoch": 0.10205779577347607,
"grad_norm": 0.57421875,
"learning_rate": 4.490905057675244e-05,
"loss": 1.0078,
"mean_token_accuracy": 0.7429340846836567,
"num_tokens": 43252482.0,
"step": 460
},
{
"entropy": 1.3764732837677003,
"epoch": 0.10427644350768207,
"grad_norm": 0.51953125,
"learning_rate": 4.4798136645962734e-05,
"loss": 1.0107,
"mean_token_accuracy": 0.7416411705315114,
"num_tokens": 44226551.0,
"step": 470
},
{
"entropy": 1.4007945582270622,
"epoch": 0.10649509124188807,
"grad_norm": 0.5703125,
"learning_rate": 4.4687222715173026e-05,
"loss": 1.0161,
"mean_token_accuracy": 0.7399431690573692,
"num_tokens": 45171661.0,
"step": 480
},
{
"entropy": 1.400729776918888,
"epoch": 0.10871373897609407,
"grad_norm": 0.58203125,
"learning_rate": 4.457630878438332e-05,
"loss": 1.01,
"mean_token_accuracy": 0.7416381858289242,
"num_tokens": 46102823.0,
"step": 490
},
{
"entropy": 1.3740653365850448,
"epoch": 0.11093238671030008,
"grad_norm": 0.640625,
"learning_rate": 4.446539485359361e-05,
"loss": 0.9948,
"mean_token_accuracy": 0.7436435185372829,
"num_tokens": 47038182.0,
"step": 500
},
{
"entropy": 1.385279569029808,
"epoch": 0.11315103444450607,
"grad_norm": 0.53125,
"learning_rate": 4.43544809228039e-05,
"loss": 0.9812,
"mean_token_accuracy": 0.7462167225778102,
"num_tokens": 47978197.0,
"step": 510
},
{
"entropy": 1.355229352414608,
"epoch": 0.11536968217871207,
"grad_norm": 0.498046875,
"learning_rate": 4.42435669920142e-05,
"loss": 0.9756,
"mean_token_accuracy": 0.748258039355278,
"num_tokens": 48929164.0,
"step": 520
},
{
"entropy": 1.340594267845154,
"epoch": 0.11758832991291808,
"grad_norm": 0.55078125,
"learning_rate": 4.4132653061224493e-05,
"loss": 0.9557,
"mean_token_accuracy": 0.7537197224795819,
"num_tokens": 49872840.0,
"step": 530
},
{
"entropy": 1.3749348096549512,
"epoch": 0.11980697764712407,
"grad_norm": 0.53125,
"learning_rate": 4.4021739130434786e-05,
"loss": 0.9964,
"mean_token_accuracy": 0.7432300426065922,
"num_tokens": 50820730.0,
"step": 540
},
{
"entropy": 1.3660012029111386,
"epoch": 0.12202562538133008,
"grad_norm": 0.6015625,
"learning_rate": 4.391082519964508e-05,
"loss": 0.9864,
"mean_token_accuracy": 0.7461238898336887,
"num_tokens": 51758109.0,
"step": 550
},
{
"entropy": 1.3850644059479236,
"epoch": 0.12424427311553608,
"grad_norm": 0.546875,
"learning_rate": 4.379991126885537e-05,
"loss": 0.9998,
"mean_token_accuracy": 0.7451605953276157,
"num_tokens": 52706955.0,
"step": 560
},
{
"entropy": 1.3438964366912842,
"epoch": 0.1264629208497421,
"grad_norm": 0.5703125,
"learning_rate": 4.368899733806566e-05,
"loss": 0.9759,
"mean_token_accuracy": 0.7506989397108554,
"num_tokens": 53654927.0,
"step": 570
},
{
"entropy": 1.3823294579982757,
"epoch": 0.12868156858394808,
"grad_norm": 0.55078125,
"learning_rate": 4.3578083407275954e-05,
"loss": 1.002,
"mean_token_accuracy": 0.7399756357073783,
"num_tokens": 54607725.0,
"step": 580
},
{
"entropy": 1.3431190609931947,
"epoch": 0.13090021631815407,
"grad_norm": 0.5546875,
"learning_rate": 4.3467169476486246e-05,
"loss": 0.9922,
"mean_token_accuracy": 0.7472275733947754,
"num_tokens": 55544472.0,
"step": 590
},
{
"entropy": 1.386824431270361,
"epoch": 0.1331188640523601,
"grad_norm": 0.5625,
"learning_rate": 4.335625554569654e-05,
"loss": 1.0184,
"mean_token_accuracy": 0.7395146794617176,
"num_tokens": 56492504.0,
"step": 600
},
{
"entropy": 1.3801176637411117,
"epoch": 0.13533751178656608,
"grad_norm": 0.58203125,
"learning_rate": 4.324534161490684e-05,
"loss": 1.0153,
"mean_token_accuracy": 0.740266764163971,
"num_tokens": 57446818.0,
"step": 610
},
{
"entropy": 1.3780507385730743,
"epoch": 0.1375561595207721,
"grad_norm": 0.5625,
"learning_rate": 4.313442768411713e-05,
"loss": 1.0282,
"mean_token_accuracy": 0.7379110969603062,
"num_tokens": 58388320.0,
"step": 620
},
{
"entropy": 1.3641229078173638,
"epoch": 0.1397748072549781,
"grad_norm": 0.546875,
"learning_rate": 4.302351375332742e-05,
"loss": 0.9693,
"mean_token_accuracy": 0.7492371432483196,
"num_tokens": 59335625.0,
"step": 630
},
{
"entropy": 1.3909922763705254,
"epoch": 0.1419934549891841,
"grad_norm": 0.55078125,
"learning_rate": 4.2912599822537714e-05,
"loss": 1.0178,
"mean_token_accuracy": 0.7371242880821228,
"num_tokens": 60271739.0,
"step": 640
},
{
"entropy": 1.358926948904991,
"epoch": 0.1442121027233901,
"grad_norm": 0.5390625,
"learning_rate": 4.2801685891748006e-05,
"loss": 0.9545,
"mean_token_accuracy": 0.7560696460306644,
"num_tokens": 61229087.0,
"step": 650
},
{
"entropy": 1.3929373525083064,
"epoch": 0.1464307504575961,
"grad_norm": 0.55859375,
"learning_rate": 4.26907719609583e-05,
"loss": 1.0238,
"mean_token_accuracy": 0.7393973417580127,
"num_tokens": 62178123.0,
"step": 660
},
{
"entropy": 1.327741453051567,
"epoch": 0.1486493981918021,
"grad_norm": 0.57421875,
"learning_rate": 4.257985803016859e-05,
"loss": 0.9456,
"mean_token_accuracy": 0.7535205587744713,
"num_tokens": 63126423.0,
"step": 670
},
{
"entropy": 1.3679818481206893,
"epoch": 0.1508680459260081,
"grad_norm": 0.54296875,
"learning_rate": 4.246894409937888e-05,
"loss": 0.9728,
"mean_token_accuracy": 0.7467674180865288,
"num_tokens": 64079693.0,
"step": 680
},
{
"entropy": 1.381383018195629,
"epoch": 0.1530866936602141,
"grad_norm": 0.5703125,
"learning_rate": 4.2358030168589174e-05,
"loss": 1.0088,
"mean_token_accuracy": 0.7418314486742019,
"num_tokens": 65018338.0,
"step": 690
},
{
"entropy": 1.3862957283854485,
"epoch": 0.1553053413944201,
"grad_norm": 0.6171875,
"learning_rate": 4.224711623779947e-05,
"loss": 1.0144,
"mean_token_accuracy": 0.7403538078069687,
"num_tokens": 65952052.0,
"step": 700
},
{
"entropy": 1.3384159475564956,
"epoch": 0.1575239891286261,
"grad_norm": 0.56640625,
"learning_rate": 4.2136202307009765e-05,
"loss": 0.9922,
"mean_token_accuracy": 0.7461440391838551,
"num_tokens": 66921819.0,
"step": 710
},
{
"entropy": 1.3640660651028156,
"epoch": 0.1597426368628321,
"grad_norm": 0.546875,
"learning_rate": 4.202528837622006e-05,
"loss": 0.9682,
"mean_token_accuracy": 0.7501711919903755,
"num_tokens": 67876968.0,
"step": 720
},
{
"entropy": 1.4149701073765755,
"epoch": 0.1619612845970381,
"grad_norm": 0.71875,
"learning_rate": 4.191437444543035e-05,
"loss": 0.9993,
"mean_token_accuracy": 0.7446250684559346,
"num_tokens": 68806954.0,
"step": 730
},
{
"entropy": 1.3790323272347451,
"epoch": 0.16417993233124412,
"grad_norm": 0.50390625,
"learning_rate": 4.180346051464064e-05,
"loss": 1.0043,
"mean_token_accuracy": 0.7418724097311497,
"num_tokens": 69713797.0,
"step": 740
},
{
"entropy": 1.3732078664004803,
"epoch": 0.1663985800654501,
"grad_norm": 0.56640625,
"learning_rate": 4.1692546583850934e-05,
"loss": 0.9737,
"mean_token_accuracy": 0.7472748421132565,
"num_tokens": 70644616.0,
"step": 750
},
{
"entropy": 1.3796279937028886,
"epoch": 0.1686172277996561,
"grad_norm": 0.5703125,
"learning_rate": 4.1581632653061226e-05,
"loss": 0.9888,
"mean_token_accuracy": 0.7464235134422779,
"num_tokens": 71614683.0,
"step": 760
},
{
"entropy": 1.3841315507888794,
"epoch": 0.17083587553386212,
"grad_norm": 0.5546875,
"learning_rate": 4.147071872227152e-05,
"loss": 1.0505,
"mean_token_accuracy": 0.7321801386773586,
"num_tokens": 72565413.0,
"step": 770
},
{
"entropy": 1.3817786656320095,
"epoch": 0.1730545232680681,
"grad_norm": 0.55078125,
"learning_rate": 4.135980479148181e-05,
"loss": 0.9787,
"mean_token_accuracy": 0.7469952210783959,
"num_tokens": 73526877.0,
"step": 780
},
{
"entropy": 1.3620466977357863,
"epoch": 0.1752731710022741,
"grad_norm": 0.5625,
"learning_rate": 4.124889086069211e-05,
"loss": 0.9949,
"mean_token_accuracy": 0.7441352687776088,
"num_tokens": 74473784.0,
"step": 790
},
{
"entropy": 1.3590239346027375,
"epoch": 0.17749181873648012,
"grad_norm": 0.546875,
"learning_rate": 4.11379769299024e-05,
"loss": 0.9791,
"mean_token_accuracy": 0.748184335231781,
"num_tokens": 75417836.0,
"step": 800
},
{
"entropy": 1.3708465218544006,
"epoch": 0.17971046647068611,
"grad_norm": 0.55859375,
"learning_rate": 4.1027062999112693e-05,
"loss": 0.9631,
"mean_token_accuracy": 0.7510820157825947,
"num_tokens": 76354490.0,
"step": 810
},
{
"entropy": 1.341279798746109,
"epoch": 0.1819291142048921,
"grad_norm": 0.57421875,
"learning_rate": 4.0916149068322986e-05,
"loss": 0.9668,
"mean_token_accuracy": 0.7506601929664611,
"num_tokens": 77302002.0,
"step": 820
},
{
"entropy": 1.3692625604569912,
"epoch": 0.18414776193909813,
"grad_norm": 0.5546875,
"learning_rate": 4.080523513753328e-05,
"loss": 0.9652,
"mean_token_accuracy": 0.7492272712290287,
"num_tokens": 78250737.0,
"step": 830
},
{
"entropy": 1.3755904287099838,
"epoch": 0.18636640967330412,
"grad_norm": 0.53515625,
"learning_rate": 4.069432120674357e-05,
"loss": 0.9899,
"mean_token_accuracy": 0.7458423741161824,
"num_tokens": 79188862.0,
"step": 840
},
{
"entropy": 1.4084200143814087,
"epoch": 0.1885850574075101,
"grad_norm": 0.56640625,
"learning_rate": 4.058340727595386e-05,
"loss": 1.0189,
"mean_token_accuracy": 0.739814518392086,
"num_tokens": 80106826.0,
"step": 850
},
{
"entropy": 1.3865490198135375,
"epoch": 0.19080370514171613,
"grad_norm": 0.52734375,
"learning_rate": 4.0472493345164154e-05,
"loss": 0.9995,
"mean_token_accuracy": 0.7435623817145824,
"num_tokens": 81042920.0,
"step": 860
},
{
"entropy": 1.4000697553157806,
"epoch": 0.19302235287592212,
"grad_norm": 0.5625,
"learning_rate": 4.0361579414374446e-05,
"loss": 0.9822,
"mean_token_accuracy": 0.7476979814469814,
"num_tokens": 81961887.0,
"step": 870
},
{
"entropy": 1.4024762332439422,
"epoch": 0.19524100061012814,
"grad_norm": 0.59765625,
"learning_rate": 4.025066548358474e-05,
"loss": 1.0031,
"mean_token_accuracy": 0.7444592162966728,
"num_tokens": 82897270.0,
"step": 880
},
{
"entropy": 1.3742952913045883,
"epoch": 0.19745964834433413,
"grad_norm": 0.52734375,
"learning_rate": 4.013975155279504e-05,
"loss": 0.9859,
"mean_token_accuracy": 0.7470481149852276,
"num_tokens": 83862674.0,
"step": 890
},
{
"entropy": 1.3406959801912308,
"epoch": 0.19967829607854012,
"grad_norm": 0.55078125,
"learning_rate": 4.002883762200533e-05,
"loss": 0.9555,
"mean_token_accuracy": 0.7523404717445373,
"num_tokens": 84815713.0,
"step": 900
},
{
"entropy": 1.4000651821494103,
"epoch": 0.20189694381274614,
"grad_norm": 0.56640625,
"learning_rate": 3.991792369121562e-05,
"loss": 0.9945,
"mean_token_accuracy": 0.7464812904596329,
"num_tokens": 85748577.0,
"step": 910
},
{
"entropy": 1.3798070877790451,
"epoch": 0.20411559154695214,
"grad_norm": 0.55078125,
"learning_rate": 3.9807009760425914e-05,
"loss": 1.0066,
"mean_token_accuracy": 0.7432169638574123,
"num_tokens": 86686607.0,
"step": 920
},
{
"entropy": 1.3416421085596084,
"epoch": 0.20633423928115813,
"grad_norm": 0.828125,
"learning_rate": 3.9696095829636206e-05,
"loss": 0.9644,
"mean_token_accuracy": 0.7486262872815133,
"num_tokens": 87638726.0,
"step": 930
},
{
"entropy": 1.3670515537261962,
"epoch": 0.20855288701536415,
"grad_norm": 0.498046875,
"learning_rate": 3.95851818988465e-05,
"loss": 0.9849,
"mean_token_accuracy": 0.7475749678909779,
"num_tokens": 88601047.0,
"step": 940
},
{
"entropy": 1.379362154006958,
"epoch": 0.21077153474957014,
"grad_norm": 0.57421875,
"learning_rate": 3.947426796805679e-05,
"loss": 0.9476,
"mean_token_accuracy": 0.7543636500835419,
"num_tokens": 89520415.0,
"step": 950
},
{
"entropy": 1.371240857243538,
"epoch": 0.21299018248377613,
"grad_norm": 0.5625,
"learning_rate": 3.936335403726708e-05,
"loss": 0.9664,
"mean_token_accuracy": 0.7490607380867005,
"num_tokens": 90455065.0,
"step": 960
},
{
"entropy": 1.340223667025566,
"epoch": 0.21520883021798215,
"grad_norm": 0.55078125,
"learning_rate": 3.9252440106477374e-05,
"loss": 0.9414,
"mean_token_accuracy": 0.75420788154006,
"num_tokens": 91408476.0,
"step": 970
},
{
"entropy": 1.358856461942196,
"epoch": 0.21742747795218814,
"grad_norm": 0.5625,
"learning_rate": 3.914152617568767e-05,
"loss": 0.9898,
"mean_token_accuracy": 0.748216237872839,
"num_tokens": 92379704.0,
"step": 980
},
{
"entropy": 1.3565750516951085,
"epoch": 0.21964612568639413,
"grad_norm": 0.51953125,
"learning_rate": 3.9030612244897965e-05,
"loss": 0.947,
"mean_token_accuracy": 0.7539278566837311,
"num_tokens": 93330334.0,
"step": 990
},
{
"entropy": 1.3714484706521035,
"epoch": 0.22186477342060015,
"grad_norm": 0.55859375,
"learning_rate": 3.891969831410826e-05,
"loss": 0.9702,
"mean_token_accuracy": 0.7506582617759705,
"num_tokens": 94283931.0,
"step": 1000
},
{
"entropy": 1.4159359961748124,
"epoch": 0.22408342115480614,
"grad_norm": 0.58203125,
"learning_rate": 3.880878438331855e-05,
"loss": 1.0096,
"mean_token_accuracy": 0.7425235278904438,
"num_tokens": 95207171.0,
"step": 1010
},
{
"entropy": 1.3379707857966423,
"epoch": 0.22630206888901214,
"grad_norm": 0.54296875,
"learning_rate": 3.869787045252884e-05,
"loss": 0.9549,
"mean_token_accuracy": 0.7548819564282894,
"num_tokens": 96148468.0,
"step": 1020
},
{
"entropy": 1.36017052680254,
"epoch": 0.22852071662321816,
"grad_norm": 0.53125,
"learning_rate": 3.8586956521739134e-05,
"loss": 0.9901,
"mean_token_accuracy": 0.7463661000132561,
"num_tokens": 97119881.0,
"step": 1030
},
{
"entropy": 1.3724884755909443,
"epoch": 0.23073936435742415,
"grad_norm": 0.51171875,
"learning_rate": 3.8476042590949426e-05,
"loss": 0.9822,
"mean_token_accuracy": 0.7475468330085278,
"num_tokens": 98077875.0,
"step": 1040
},
{
"entropy": 1.3405052460730076,
"epoch": 0.23295801209163014,
"grad_norm": 0.51953125,
"learning_rate": 3.836512866015972e-05,
"loss": 0.9432,
"mean_token_accuracy": 0.7541234731674195,
"num_tokens": 99031833.0,
"step": 1050
},
{
"entropy": 1.3757181286811828,
"epoch": 0.23517665982583616,
"grad_norm": 0.54296875,
"learning_rate": 3.825421472937001e-05,
"loss": 0.9888,
"mean_token_accuracy": 0.7472271144390106,
"num_tokens": 99991709.0,
"step": 1060
},
{
"entropy": 1.3773247390985488,
"epoch": 0.23739530756004215,
"grad_norm": 0.55859375,
"learning_rate": 3.814330079858031e-05,
"loss": 0.9933,
"mean_token_accuracy": 0.7446820683777332,
"num_tokens": 100941262.0,
"step": 1070
},
{
"entropy": 1.3600140511989594,
"epoch": 0.23961395529424814,
"grad_norm": 0.54296875,
"learning_rate": 3.80323868677906e-05,
"loss": 0.9708,
"mean_token_accuracy": 0.7481269456446171,
"num_tokens": 101897106.0,
"step": 1080
},
{
"entropy": 1.4162090666592122,
"epoch": 0.24183260302845416,
"grad_norm": 0.5546875,
"learning_rate": 3.7921472937000893e-05,
"loss": 0.9589,
"mean_token_accuracy": 0.7518798463046551,
"num_tokens": 102824359.0,
"step": 1090
},
{
"entropy": 1.342430242151022,
"epoch": 0.24405125076266015,
"grad_norm": 0.546875,
"learning_rate": 3.7810559006211186e-05,
"loss": 0.9439,
"mean_token_accuracy": 0.7558625318109989,
"num_tokens": 103772965.0,
"step": 1100
},
{
"entropy": 1.3550659596920014,
"epoch": 0.24626989849686615,
"grad_norm": 0.578125,
"learning_rate": 3.769964507542148e-05,
"loss": 0.9645,
"mean_token_accuracy": 0.7508764907717704,
"num_tokens": 104691589.0,
"step": 1110
},
{
"entropy": 1.3936431795358657,
"epoch": 0.24848854623107217,
"grad_norm": 0.58203125,
"learning_rate": 3.758873114463177e-05,
"loss": 0.989,
"mean_token_accuracy": 0.7461673654615879,
"num_tokens": 105581821.0,
"step": 1120
},
{
"entropy": 1.38260547965765,
"epoch": 0.2507071939652782,
"grad_norm": 0.62109375,
"learning_rate": 3.747781721384206e-05,
"loss": 0.9923,
"mean_token_accuracy": 0.7444244168698788,
"num_tokens": 106548122.0,
"step": 1130
},
{
"entropy": 1.3805961057543754,
"epoch": 0.2529258416994842,
"grad_norm": 0.5703125,
"learning_rate": 3.7366903283052354e-05,
"loss": 0.988,
"mean_token_accuracy": 0.746028533577919,
"num_tokens": 107471780.0,
"step": 1140
},
{
"entropy": 1.3697818227112293,
"epoch": 0.25514448943369017,
"grad_norm": 0.5625,
"learning_rate": 3.7255989352262646e-05,
"loss": 0.9879,
"mean_token_accuracy": 0.7449548006057739,
"num_tokens": 108423848.0,
"step": 1150
},
{
"entropy": 1.3857608392834664,
"epoch": 0.25736313716789616,
"grad_norm": 0.5546875,
"learning_rate": 3.714507542147294e-05,
"loss": 0.9909,
"mean_token_accuracy": 0.7463208839297295,
"num_tokens": 109339085.0,
"step": 1160
},
{
"entropy": 1.4123259857296944,
"epoch": 0.25958178490210215,
"grad_norm": 0.52734375,
"learning_rate": 3.703416149068323e-05,
"loss": 1.0152,
"mean_token_accuracy": 0.7402355149388313,
"num_tokens": 110293198.0,
"step": 1170
},
{
"entropy": 1.3887243419885635,
"epoch": 0.26180043263630814,
"grad_norm": 0.5859375,
"learning_rate": 3.692324755989352e-05,
"loss": 1.0135,
"mean_token_accuracy": 0.7396637931466102,
"num_tokens": 111236146.0,
"step": 1180
},
{
"entropy": 1.3697876557707787,
"epoch": 0.2640190803705142,
"grad_norm": 0.54296875,
"learning_rate": 3.6812333629103815e-05,
"loss": 1.0056,
"mean_token_accuracy": 0.7433505475521087,
"num_tokens": 112180565.0,
"step": 1190
},
{
"entropy": 1.3680458456277846,
"epoch": 0.2662377281047202,
"grad_norm": 0.57421875,
"learning_rate": 3.670141969831411e-05,
"loss": 0.9455,
"mean_token_accuracy": 0.7568574421107769,
"num_tokens": 113127830.0,
"step": 1200
},
{
"entropy": 1.3552896961569787,
"epoch": 0.2684563758389262,
"grad_norm": 0.6015625,
"learning_rate": 3.65905057675244e-05,
"loss": 0.9666,
"mean_token_accuracy": 0.7498848676681519,
"num_tokens": 114067246.0,
"step": 1210
},
{
"entropy": 1.3530599243938923,
"epoch": 0.27067502357313217,
"grad_norm": 0.55859375,
"learning_rate": 3.64795918367347e-05,
"loss": 0.9768,
"mean_token_accuracy": 0.7468112826347351,
"num_tokens": 114993058.0,
"step": 1220
},
{
"entropy": 1.3884347334504128,
"epoch": 0.27289367130733816,
"grad_norm": 0.515625,
"learning_rate": 3.636867790594499e-05,
"loss": 1.0055,
"mean_token_accuracy": 0.7407384052872658,
"num_tokens": 115913621.0,
"step": 1230
},
{
"entropy": 1.3924200147390366,
"epoch": 0.2751123190415442,
"grad_norm": 0.56640625,
"learning_rate": 3.625776397515528e-05,
"loss": 1.0014,
"mean_token_accuracy": 0.7461141526699067,
"num_tokens": 116873252.0,
"step": 1240
},
{
"entropy": 1.3493129260838033,
"epoch": 0.2773309667757502,
"grad_norm": 0.54296875,
"learning_rate": 3.6146850044365574e-05,
"loss": 0.9607,
"mean_token_accuracy": 0.7491537302732467,
"num_tokens": 117827994.0,
"step": 1250
},
{
"entropy": 1.382804460823536,
"epoch": 0.2795496145099562,
"grad_norm": 0.5703125,
"learning_rate": 3.6035936113575866e-05,
"loss": 0.9795,
"mean_token_accuracy": 0.7502268873155117,
"num_tokens": 118765286.0,
"step": 1260
},
{
"entropy": 1.374519681930542,
"epoch": 0.2817682622441622,
"grad_norm": 0.546875,
"learning_rate": 3.592502218278616e-05,
"loss": 0.9933,
"mean_token_accuracy": 0.7442196063697338,
"num_tokens": 119728068.0,
"step": 1270
},
{
"entropy": 1.3972402699291706,
"epoch": 0.2839869099783682,
"grad_norm": 0.56640625,
"learning_rate": 3.581410825199645e-05,
"loss": 1.0037,
"mean_token_accuracy": 0.7407265052199363,
"num_tokens": 120663567.0,
"step": 1280
},
{
"entropy": 1.379422479122877,
"epoch": 0.28620555771257417,
"grad_norm": 0.54296875,
"learning_rate": 3.570319432120674e-05,
"loss": 0.9921,
"mean_token_accuracy": 0.7444989711046219,
"num_tokens": 121604187.0,
"step": 1290
},
{
"entropy": 1.3605211839079856,
"epoch": 0.2884242054467802,
"grad_norm": 0.55859375,
"learning_rate": 3.5592280390417035e-05,
"loss": 0.9541,
"mean_token_accuracy": 0.7547078765928745,
"num_tokens": 122549091.0,
"step": 1300
},
{
"entropy": 1.358756284415722,
"epoch": 0.2906428531809862,
"grad_norm": 0.546875,
"learning_rate": 3.548136645962733e-05,
"loss": 0.9763,
"mean_token_accuracy": 0.7475451476871967,
"num_tokens": 123493867.0,
"step": 1310
},
{
"entropy": 1.3583389446139336,
"epoch": 0.2928615009151922,
"grad_norm": 0.5625,
"learning_rate": 3.537045252883762e-05,
"loss": 0.9564,
"mean_token_accuracy": 0.7530794121325016,
"num_tokens": 124444992.0,
"step": 1320
},
{
"entropy": 1.3372597798705101,
"epoch": 0.2950801486493982,
"grad_norm": 0.56640625,
"learning_rate": 3.525953859804791e-05,
"loss": 0.917,
"mean_token_accuracy": 0.7617659427225589,
"num_tokens": 125376281.0,
"step": 1330
},
{
"entropy": 1.3307228960096835,
"epoch": 0.2972987963836042,
"grad_norm": 0.5390625,
"learning_rate": 3.514862466725821e-05,
"loss": 0.9606,
"mean_token_accuracy": 0.749411403387785,
"num_tokens": 126299926.0,
"step": 1340
},
{
"entropy": 1.3589562863111495,
"epoch": 0.29951744411781017,
"grad_norm": 0.5703125,
"learning_rate": 3.50377107364685e-05,
"loss": 0.9547,
"mean_token_accuracy": 0.753935182094574,
"num_tokens": 127248113.0,
"step": 1350
},
{
"entropy": 1.3731069147586823,
"epoch": 0.3017360918520162,
"grad_norm": 0.58984375,
"learning_rate": 3.4926796805678794e-05,
"loss": 0.9724,
"mean_token_accuracy": 0.7470672108232975,
"num_tokens": 128181913.0,
"step": 1360
},
{
"entropy": 1.3970228135585785,
"epoch": 0.3039547395862222,
"grad_norm": 0.5625,
"learning_rate": 3.481588287488909e-05,
"loss": 0.9808,
"mean_token_accuracy": 0.7479516059160233,
"num_tokens": 129129397.0,
"step": 1370
},
{
"entropy": 1.3645412735641003,
"epoch": 0.3061733873204282,
"grad_norm": 0.5625,
"learning_rate": 3.470496894409938e-05,
"loss": 0.9904,
"mean_token_accuracy": 0.7432928495109081,
"num_tokens": 130072657.0,
"step": 1380
},
{
"entropy": 1.3820607632398605,
"epoch": 0.3083920350546342,
"grad_norm": 0.60546875,
"learning_rate": 3.459405501330967e-05,
"loss": 0.9456,
"mean_token_accuracy": 0.7521802820265293,
"num_tokens": 131009716.0,
"step": 1390
},
{
"entropy": 1.370594221353531,
"epoch": 0.3106106827888402,
"grad_norm": 0.58984375,
"learning_rate": 3.448314108251996e-05,
"loss": 0.9802,
"mean_token_accuracy": 0.7480019509792328,
"num_tokens": 131933570.0,
"step": 1400
},
{
"entropy": 1.4096333682537079,
"epoch": 0.3128293305230462,
"grad_norm": 0.5625,
"learning_rate": 3.4372227151730255e-05,
"loss": 0.9865,
"mean_token_accuracy": 0.7449732661247254,
"num_tokens": 132867719.0,
"step": 1410
},
{
"entropy": 1.3462214186787604,
"epoch": 0.3150479782572522,
"grad_norm": 0.55078125,
"learning_rate": 3.426131322094055e-05,
"loss": 0.965,
"mean_token_accuracy": 0.7521069377660752,
"num_tokens": 133800807.0,
"step": 1420
},
{
"entropy": 1.3778568729758263,
"epoch": 0.3172666259914582,
"grad_norm": 0.54296875,
"learning_rate": 3.415039929015084e-05,
"loss": 0.984,
"mean_token_accuracy": 0.7461141437292099,
"num_tokens": 134735911.0,
"step": 1430
},
{
"entropy": 1.3803854644298554,
"epoch": 0.3194852737256642,
"grad_norm": 0.53125,
"learning_rate": 3.403948535936114e-05,
"loss": 1.0009,
"mean_token_accuracy": 0.7454333089292049,
"num_tokens": 135679919.0,
"step": 1440
},
{
"entropy": 1.4075747832655907,
"epoch": 0.3217039214598702,
"grad_norm": 0.55859375,
"learning_rate": 3.392857142857143e-05,
"loss": 0.9977,
"mean_token_accuracy": 0.7427182622253895,
"num_tokens": 136600159.0,
"step": 1450
},
{
"entropy": 1.3631702698767185,
"epoch": 0.3239225691940762,
"grad_norm": 0.54296875,
"learning_rate": 3.381765749778172e-05,
"loss": 0.9834,
"mean_token_accuracy": 0.7481319233775139,
"num_tokens": 137552890.0,
"step": 1460
},
{
"entropy": 1.3479732781648637,
"epoch": 0.3261412169282822,
"grad_norm": 0.54296875,
"learning_rate": 3.3706743566992015e-05,
"loss": 0.9415,
"mean_token_accuracy": 0.7547242395579815,
"num_tokens": 138503386.0,
"step": 1470
},
{
"entropy": 1.3664841935038567,
"epoch": 0.32835986466248823,
"grad_norm": 0.57421875,
"learning_rate": 3.359582963620231e-05,
"loss": 0.9414,
"mean_token_accuracy": 0.7544668681919575,
"num_tokens": 139436618.0,
"step": 1480
},
{
"entropy": 1.369805136322975,
"epoch": 0.3305785123966942,
"grad_norm": 0.515625,
"learning_rate": 3.34849157054126e-05,
"loss": 0.9659,
"mean_token_accuracy": 0.748472998291254,
"num_tokens": 140386106.0,
"step": 1490
},
{
"entropy": 1.3472276948392392,
"epoch": 0.3327971601309002,
"grad_norm": 0.5078125,
"learning_rate": 3.337400177462289e-05,
"loss": 0.9306,
"mean_token_accuracy": 0.7600706323981286,
"num_tokens": 141318351.0,
"step": 1500
},
{
"entropy": 1.3616932608187198,
"epoch": 0.3350158078651062,
"grad_norm": 0.640625,
"learning_rate": 3.326308784383318e-05,
"loss": 0.949,
"mean_token_accuracy": 0.7525821574032306,
"num_tokens": 142261086.0,
"step": 1510
},
{
"entropy": 1.3529250659048557,
"epoch": 0.3372344555993122,
"grad_norm": 0.5390625,
"learning_rate": 3.3152173913043475e-05,
"loss": 0.9622,
"mean_token_accuracy": 0.7509421311318875,
"num_tokens": 143210426.0,
"step": 1520
},
{
"entropy": 1.3494533702731133,
"epoch": 0.33945310333351825,
"grad_norm": 0.6015625,
"learning_rate": 3.3041259982253774e-05,
"loss": 0.9719,
"mean_token_accuracy": 0.7484220921993255,
"num_tokens": 144159675.0,
"step": 1530
},
{
"entropy": 1.3390969790518283,
"epoch": 0.34167175106772424,
"grad_norm": 0.5625,
"learning_rate": 3.2930346051464066e-05,
"loss": 0.951,
"mean_token_accuracy": 0.7507020443677902,
"num_tokens": 145086625.0,
"step": 1540
},
{
"entropy": 1.3771871954202652,
"epoch": 0.34389039880193023,
"grad_norm": 0.60546875,
"learning_rate": 3.281943212067436e-05,
"loss": 0.9817,
"mean_token_accuracy": 0.7470030762255192,
"num_tokens": 145986719.0,
"step": 1550
},
{
"entropy": 1.3694410175085068,
"epoch": 0.3461090465361362,
"grad_norm": 1.421875,
"learning_rate": 3.270851818988465e-05,
"loss": 0.9611,
"mean_token_accuracy": 0.7503976099193096,
"num_tokens": 146918044.0,
"step": 1560
},
{
"entropy": 1.3784858211874962,
"epoch": 0.3483276942703422,
"grad_norm": 0.5703125,
"learning_rate": 3.259760425909494e-05,
"loss": 0.9773,
"mean_token_accuracy": 0.7474872335791588,
"num_tokens": 147883804.0,
"step": 1570
},
{
"entropy": 1.3432278633117676,
"epoch": 0.3505463420045482,
"grad_norm": 0.56640625,
"learning_rate": 3.2486690328305235e-05,
"loss": 0.936,
"mean_token_accuracy": 0.755360123515129,
"num_tokens": 148818674.0,
"step": 1580
},
{
"entropy": 1.3762368500232696,
"epoch": 0.35276498973875425,
"grad_norm": 0.53125,
"learning_rate": 3.237577639751553e-05,
"loss": 1.0127,
"mean_token_accuracy": 0.7423109777271748,
"num_tokens": 149775495.0,
"step": 1590
},
{
"entropy": 1.389584618806839,
"epoch": 0.35498363747296025,
"grad_norm": 0.64453125,
"learning_rate": 3.226486246672582e-05,
"loss": 0.956,
"mean_token_accuracy": 0.7540929049253464,
"num_tokens": 150715437.0,
"step": 1600
},
{
"entropy": 1.3511891454458236,
"epoch": 0.35720228520716624,
"grad_norm": 0.55078125,
"learning_rate": 3.215394853593611e-05,
"loss": 0.9424,
"mean_token_accuracy": 0.7547151155769825,
"num_tokens": 151648229.0,
"step": 1610
},
{
"entropy": 1.342407089471817,
"epoch": 0.35942093294137223,
"grad_norm": 0.5390625,
"learning_rate": 3.204303460514641e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.75165830925107,
"num_tokens": 152568041.0,
"step": 1620
},
{
"entropy": 1.3556662440299987,
"epoch": 0.3616395806755782,
"grad_norm": 0.55859375,
"learning_rate": 3.19321206743567e-05,
"loss": 0.9579,
"mean_token_accuracy": 0.7535672217607499,
"num_tokens": 153481444.0,
"step": 1630
},
{
"entropy": 1.3596419125795365,
"epoch": 0.3638582284097842,
"grad_norm": 0.5390625,
"learning_rate": 3.1821206743566994e-05,
"loss": 0.9707,
"mean_token_accuracy": 0.7496115677058697,
"num_tokens": 154436286.0,
"step": 1640
},
{
"entropy": 1.3691009670495986,
"epoch": 0.36607687614399026,
"grad_norm": 0.5546875,
"learning_rate": 3.171029281277729e-05,
"loss": 0.9829,
"mean_token_accuracy": 0.7472112305462361,
"num_tokens": 155369838.0,
"step": 1650
},
{
"entropy": 1.3596091173589229,
"epoch": 0.36829552387819625,
"grad_norm": 0.546875,
"learning_rate": 3.159937888198758e-05,
"loss": 0.9514,
"mean_token_accuracy": 0.7531104668974876,
"num_tokens": 156331805.0,
"step": 1660
},
{
"entropy": 1.4006396278738975,
"epoch": 0.37051417161240224,
"grad_norm": 0.5390625,
"learning_rate": 3.148846495119787e-05,
"loss": 1.0172,
"mean_token_accuracy": 0.7395706221461296,
"num_tokens": 157249437.0,
"step": 1670
},
{
"entropy": 1.3770634673535824,
"epoch": 0.37273281934660824,
"grad_norm": 0.5546875,
"learning_rate": 3.137755102040816e-05,
"loss": 0.9982,
"mean_token_accuracy": 0.7440236747264862,
"num_tokens": 158190770.0,
"step": 1680
},
{
"entropy": 1.370580254495144,
"epoch": 0.3749514670808142,
"grad_norm": 0.51171875,
"learning_rate": 3.1266637089618455e-05,
"loss": 0.9632,
"mean_token_accuracy": 0.7505429275333881,
"num_tokens": 159111105.0,
"step": 1690
},
{
"entropy": 1.3719338580965996,
"epoch": 0.3771701148150202,
"grad_norm": 0.58203125,
"learning_rate": 3.115572315882875e-05,
"loss": 0.9831,
"mean_token_accuracy": 0.7460063569247722,
"num_tokens": 160054633.0,
"step": 1700
},
{
"entropy": 1.4081777222454548,
"epoch": 0.37938876254922627,
"grad_norm": 0.53125,
"learning_rate": 3.1044809228039046e-05,
"loss": 1.0079,
"mean_token_accuracy": 0.7412950038909912,
"num_tokens": 161001374.0,
"step": 1710
},
{
"entropy": 1.3786864325404167,
"epoch": 0.38160741028343226,
"grad_norm": 0.546875,
"learning_rate": 3.093389529724934e-05,
"loss": 1.0045,
"mean_token_accuracy": 0.7446122042834759,
"num_tokens": 161931135.0,
"step": 1720
},
{
"entropy": 1.3475178599357605,
"epoch": 0.38382605801763825,
"grad_norm": 0.58984375,
"learning_rate": 3.082298136645963e-05,
"loss": 0.9488,
"mean_token_accuracy": 0.7558258168399334,
"num_tokens": 162879415.0,
"step": 1730
},
{
"entropy": 1.369861949980259,
"epoch": 0.38604470575184424,
"grad_norm": 0.56640625,
"learning_rate": 3.071206743566992e-05,
"loss": 0.965,
"mean_token_accuracy": 0.7485872730612755,
"num_tokens": 163815621.0,
"step": 1740
},
{
"entropy": 1.3579731062054634,
"epoch": 0.38826335348605023,
"grad_norm": 0.55859375,
"learning_rate": 3.0601153504880215e-05,
"loss": 0.9566,
"mean_token_accuracy": 0.7517626143991947,
"num_tokens": 164760214.0,
"step": 1750
},
{
"entropy": 1.3689505890011788,
"epoch": 0.3904820012202563,
"grad_norm": 0.53125,
"learning_rate": 3.0490239574090507e-05,
"loss": 0.9746,
"mean_token_accuracy": 0.7488324150443078,
"num_tokens": 165725369.0,
"step": 1760
},
{
"entropy": 1.3397907942533493,
"epoch": 0.3927006489544623,
"grad_norm": 0.51953125,
"learning_rate": 3.03793256433008e-05,
"loss": 0.9336,
"mean_token_accuracy": 0.7559916451573372,
"num_tokens": 166673239.0,
"step": 1770
},
{
"entropy": 1.3467686548829079,
"epoch": 0.39491929668866826,
"grad_norm": 0.51953125,
"learning_rate": 3.026841171251109e-05,
"loss": 0.9639,
"mean_token_accuracy": 0.7545785017311573,
"num_tokens": 167629364.0,
"step": 1780
},
{
"entropy": 1.3575761772692203,
"epoch": 0.39713794442287426,
"grad_norm": 0.5703125,
"learning_rate": 3.0157497781721383e-05,
"loss": 0.9724,
"mean_token_accuracy": 0.7488372251391411,
"num_tokens": 168566244.0,
"step": 1790
},
{
"entropy": 1.353706033527851,
"epoch": 0.39935659215708025,
"grad_norm": 0.6015625,
"learning_rate": 3.0046583850931682e-05,
"loss": 0.9412,
"mean_token_accuracy": 0.7558333098888397,
"num_tokens": 169511821.0,
"step": 1800
},
{
"entropy": 1.3735353089869022,
"epoch": 0.40157523989128624,
"grad_norm": 0.5625,
"learning_rate": 2.9935669920141974e-05,
"loss": 0.985,
"mean_token_accuracy": 0.7467327207326889,
"num_tokens": 170438159.0,
"step": 1810
},
{
"entropy": 1.383265955746174,
"epoch": 0.4037938876254923,
"grad_norm": 0.51171875,
"learning_rate": 2.9824755989352266e-05,
"loss": 0.9906,
"mean_token_accuracy": 0.7453700192272663,
"num_tokens": 171372232.0,
"step": 1820
},
{
"entropy": 1.3484601899981499,
"epoch": 0.4060125353596983,
"grad_norm": 0.5390625,
"learning_rate": 2.971384205856256e-05,
"loss": 0.9455,
"mean_token_accuracy": 0.7536688603460788,
"num_tokens": 172306458.0,
"step": 1830
},
{
"entropy": 1.3571648687124251,
"epoch": 0.40823118309390427,
"grad_norm": 0.57421875,
"learning_rate": 2.960292812777285e-05,
"loss": 0.9957,
"mean_token_accuracy": 0.7446161836385727,
"num_tokens": 173255617.0,
"step": 1840
},
{
"entropy": 1.3686935976147652,
"epoch": 0.41044983082811026,
"grad_norm": 0.5390625,
"learning_rate": 2.9492014196983143e-05,
"loss": 0.9407,
"mean_token_accuracy": 0.7566944785416126,
"num_tokens": 174208124.0,
"step": 1850
},
{
"entropy": 1.3695332050323485,
"epoch": 0.41266847856231625,
"grad_norm": 0.55859375,
"learning_rate": 2.9381100266193435e-05,
"loss": 0.973,
"mean_token_accuracy": 0.7486338473856449,
"num_tokens": 175159867.0,
"step": 1860
},
{
"entropy": 1.3632485464215278,
"epoch": 0.41488712629652225,
"grad_norm": 0.5703125,
"learning_rate": 2.9270186335403727e-05,
"loss": 0.9524,
"mean_token_accuracy": 0.7515506997704506,
"num_tokens": 176092841.0,
"step": 1870
},
{
"entropy": 1.3715375781059265,
"epoch": 0.4171057740307283,
"grad_norm": 0.5546875,
"learning_rate": 2.915927240461402e-05,
"loss": 0.981,
"mean_token_accuracy": 0.7478602975606918,
"num_tokens": 177053245.0,
"step": 1880
},
{
"entropy": 1.3497217521071434,
"epoch": 0.4193244217649343,
"grad_norm": 0.5703125,
"learning_rate": 2.9048358473824318e-05,
"loss": 0.9605,
"mean_token_accuracy": 0.7520354442298413,
"num_tokens": 178014206.0,
"step": 1890
},
{
"entropy": 1.380847369134426,
"epoch": 0.4215430694991403,
"grad_norm": 0.53515625,
"learning_rate": 2.893744454303461e-05,
"loss": 0.9716,
"mean_token_accuracy": 0.7520315021276474,
"num_tokens": 178955937.0,
"step": 1900
},
{
"entropy": 1.3706847220659255,
"epoch": 0.42376171723334627,
"grad_norm": 0.51953125,
"learning_rate": 2.8826530612244902e-05,
"loss": 0.9787,
"mean_token_accuracy": 0.7502063922584057,
"num_tokens": 179913308.0,
"step": 1910
},
{
"entropy": 1.386249950528145,
"epoch": 0.42598036496755226,
"grad_norm": 0.58203125,
"learning_rate": 2.8715616681455194e-05,
"loss": 0.9614,
"mean_token_accuracy": 0.7522901840507984,
"num_tokens": 180862001.0,
"step": 1920
},
{
"entropy": 1.387051635980606,
"epoch": 0.42819901270175825,
"grad_norm": 0.54296875,
"learning_rate": 2.8604702750665487e-05,
"loss": 0.9775,
"mean_token_accuracy": 0.7487853363156318,
"num_tokens": 181811443.0,
"step": 1930
},
{
"entropy": 1.3455969080328942,
"epoch": 0.4304176604359643,
"grad_norm": 0.49609375,
"learning_rate": 2.849378881987578e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.751628965884447,
"num_tokens": 182766304.0,
"step": 1940
},
{
"entropy": 1.364281751215458,
"epoch": 0.4326363081701703,
"grad_norm": 0.5390625,
"learning_rate": 2.838287488908607e-05,
"loss": 0.9469,
"mean_token_accuracy": 0.7545228533446788,
"num_tokens": 183713237.0,
"step": 1950
},
{
"entropy": 1.3550898402929306,
"epoch": 0.4348549559043763,
"grad_norm": 0.60546875,
"learning_rate": 2.8271960958296363e-05,
"loss": 0.9731,
"mean_token_accuracy": 0.7482604801654815,
"num_tokens": 184646169.0,
"step": 1960
},
{
"entropy": 1.3582610800862311,
"epoch": 0.4370736036385823,
"grad_norm": 0.57421875,
"learning_rate": 2.8161047027506655e-05,
"loss": 0.948,
"mean_token_accuracy": 0.7556835524737835,
"num_tokens": 185593240.0,
"step": 1970
},
{
"entropy": 1.4051440745592116,
"epoch": 0.43929225137278827,
"grad_norm": 0.58984375,
"learning_rate": 2.8050133096716947e-05,
"loss": 1.0018,
"mean_token_accuracy": 0.7416196145117283,
"num_tokens": 186511580.0,
"step": 1980
},
{
"entropy": 1.3595298886299134,
"epoch": 0.4415108991069943,
"grad_norm": 0.55859375,
"learning_rate": 2.7939219165927243e-05,
"loss": 0.966,
"mean_token_accuracy": 0.7499643869698047,
"num_tokens": 187463868.0,
"step": 1990
},
{
"entropy": 1.344205194711685,
"epoch": 0.4437295468412003,
"grad_norm": 0.5078125,
"learning_rate": 2.7828305235137535e-05,
"loss": 0.9686,
"mean_token_accuracy": 0.7512450948357582,
"num_tokens": 188409504.0,
"step": 2000
},
{
"entropy": 1.4076050415635109,
"epoch": 0.4459481945754063,
"grad_norm": 0.55859375,
"learning_rate": 2.7717391304347827e-05,
"loss": 1.0067,
"mean_token_accuracy": 0.7415082044899464,
"num_tokens": 189355779.0,
"step": 2010
},
{
"entropy": 1.3575765684247016,
"epoch": 0.4481668423096123,
"grad_norm": 0.5625,
"learning_rate": 2.760647737355812e-05,
"loss": 0.9601,
"mean_token_accuracy": 0.7506938494741917,
"num_tokens": 190278778.0,
"step": 2020
},
{
"entropy": 1.3836154788732529,
"epoch": 0.4503854900438183,
"grad_norm": 0.5546875,
"learning_rate": 2.749556344276841e-05,
"loss": 1.0157,
"mean_token_accuracy": 0.7393032193183899,
"num_tokens": 191241326.0,
"step": 2030
},
{
"entropy": 1.391631406545639,
"epoch": 0.4526041377780243,
"grad_norm": 0.5625,
"learning_rate": 2.7384649511978703e-05,
"loss": 0.9776,
"mean_token_accuracy": 0.7508174151182174,
"num_tokens": 192214681.0,
"step": 2040
},
{
"entropy": 1.3410830795764923,
"epoch": 0.4548227855122303,
"grad_norm": 0.55859375,
"learning_rate": 2.7273735581188996e-05,
"loss": 0.943,
"mean_token_accuracy": 0.7552683062851429,
"num_tokens": 193166398.0,
"step": 2050
},
{
"entropy": 1.3428148820996284,
"epoch": 0.4570414332464363,
"grad_norm": 0.58203125,
"learning_rate": 2.7162821650399288e-05,
"loss": 0.9664,
"mean_token_accuracy": 0.750061446428299,
"num_tokens": 194098519.0,
"step": 2060
},
{
"entropy": 1.379681558907032,
"epoch": 0.4592600809806423,
"grad_norm": 0.6015625,
"learning_rate": 2.7051907719609583e-05,
"loss": 0.9832,
"mean_token_accuracy": 0.7464658364653587,
"num_tokens": 195004862.0,
"step": 2070
},
{
"entropy": 1.3508182168006897,
"epoch": 0.4614787287148483,
"grad_norm": 0.53125,
"learning_rate": 2.694099378881988e-05,
"loss": 0.9602,
"mean_token_accuracy": 0.7505707196891308,
"num_tokens": 195942523.0,
"step": 2080
},
{
"entropy": 1.3505974404513836,
"epoch": 0.4636973764490543,
"grad_norm": 0.57421875,
"learning_rate": 2.683007985803017e-05,
"loss": 0.9358,
"mean_token_accuracy": 0.7566464401781559,
"num_tokens": 196877376.0,
"step": 2090
},
{
"entropy": 1.3485953092575074,
"epoch": 0.4659160241832603,
"grad_norm": 0.55078125,
"learning_rate": 2.6719165927240463e-05,
"loss": 0.9482,
"mean_token_accuracy": 0.7524454712867736,
"num_tokens": 197810400.0,
"step": 2100
},
{
"entropy": 1.3627469688653946,
"epoch": 0.4681346719174663,
"grad_norm": 0.57421875,
"learning_rate": 2.6608251996450755e-05,
"loss": 0.9897,
"mean_token_accuracy": 0.7450599886476994,
"num_tokens": 198757408.0,
"step": 2110
},
{
"entropy": 1.3389364905655383,
"epoch": 0.4703533196516723,
"grad_norm": 0.5546875,
"learning_rate": 2.6497338065661047e-05,
"loss": 0.9129,
"mean_token_accuracy": 0.7605686038732529,
"num_tokens": 199692340.0,
"step": 2120
},
{
"entropy": 1.3748140163719653,
"epoch": 0.4725719673858783,
"grad_norm": 0.5703125,
"learning_rate": 2.638642413487134e-05,
"loss": 0.955,
"mean_token_accuracy": 0.7520599849522114,
"num_tokens": 200650664.0,
"step": 2130
},
{
"entropy": 1.3346731156110763,
"epoch": 0.4747906151200843,
"grad_norm": 0.56640625,
"learning_rate": 2.627551020408163e-05,
"loss": 0.9212,
"mean_token_accuracy": 0.7582221433520318,
"num_tokens": 201597933.0,
"step": 2140
},
{
"entropy": 1.3664929166436195,
"epoch": 0.4770092628542903,
"grad_norm": 0.55078125,
"learning_rate": 2.6164596273291924e-05,
"loss": 0.9675,
"mean_token_accuracy": 0.7481563113629818,
"num_tokens": 202551711.0,
"step": 2150
},
{
"entropy": 1.3843952640891075,
"epoch": 0.4792279105884963,
"grad_norm": 0.53125,
"learning_rate": 2.6053682342502216e-05,
"loss": 1.0013,
"mean_token_accuracy": 0.7428773507475853,
"num_tokens": 203475006.0,
"step": 2160
},
{
"entropy": 1.3748216979205607,
"epoch": 0.48144655832270233,
"grad_norm": 0.53125,
"learning_rate": 2.5942768411712515e-05,
"loss": 0.9858,
"mean_token_accuracy": 0.7472008153796196,
"num_tokens": 204414096.0,
"step": 2170
},
{
"entropy": 1.3418536871671676,
"epoch": 0.4836652060569083,
"grad_norm": 0.54296875,
"learning_rate": 2.5831854480922807e-05,
"loss": 0.9614,
"mean_token_accuracy": 0.7489035427570343,
"num_tokens": 205346877.0,
"step": 2180
},
{
"entropy": 1.334491142630577,
"epoch": 0.4858838537911143,
"grad_norm": 0.52734375,
"learning_rate": 2.57209405501331e-05,
"loss": 0.952,
"mean_token_accuracy": 0.7544978365302086,
"num_tokens": 206299157.0,
"step": 2190
},
{
"entropy": 1.3800398319959641,
"epoch": 0.4881025015253203,
"grad_norm": 0.56640625,
"learning_rate": 2.561002661934339e-05,
"loss": 0.9965,
"mean_token_accuracy": 0.7438121646642685,
"num_tokens": 207235480.0,
"step": 2200
},
{
"entropy": 1.3536552309989929,
"epoch": 0.4903211492595263,
"grad_norm": 0.578125,
"learning_rate": 2.5499112688553683e-05,
"loss": 0.9518,
"mean_token_accuracy": 0.7553939551115036,
"num_tokens": 208192869.0,
"step": 2210
},
{
"entropy": 1.3550305306911468,
"epoch": 0.4925397969937323,
"grad_norm": 0.55859375,
"learning_rate": 2.5388198757763975e-05,
"loss": 0.962,
"mean_token_accuracy": 0.7509313143789769,
"num_tokens": 209163216.0,
"step": 2220
},
{
"entropy": 1.3438825502991676,
"epoch": 0.49475844472793834,
"grad_norm": 0.5234375,
"learning_rate": 2.5277284826974267e-05,
"loss": 0.9612,
"mean_token_accuracy": 0.7521602623164654,
"num_tokens": 210107351.0,
"step": 2230
},
{
"entropy": 1.3802131339907646,
"epoch": 0.49697709246214433,
"grad_norm": 0.53125,
"learning_rate": 2.516637089618456e-05,
"loss": 0.9761,
"mean_token_accuracy": 0.7472608901560307,
"num_tokens": 211041572.0,
"step": 2240
},
{
"entropy": 1.340398869663477,
"epoch": 0.4991957401963503,
"grad_norm": 0.5703125,
"learning_rate": 2.5055456965394852e-05,
"loss": 0.9588,
"mean_token_accuracy": 0.7512741200625896,
"num_tokens": 211977487.0,
"step": 2250
},
{
"entropy": 1.3554712682962418,
"epoch": 0.5014143879305564,
"grad_norm": 0.515625,
"learning_rate": 2.4944543034605147e-05,
"loss": 0.9593,
"mean_token_accuracy": 0.7513454340398311,
"num_tokens": 212944715.0,
"step": 2260
},
{
"entropy": 1.3268218383193016,
"epoch": 0.5036330356647624,
"grad_norm": 0.484375,
"learning_rate": 2.483362910381544e-05,
"loss": 0.9323,
"mean_token_accuracy": 0.7580919787287712,
"num_tokens": 213894148.0,
"step": 2270
},
{
"entropy": 1.3675538420677185,
"epoch": 0.5058516833989684,
"grad_norm": 0.5625,
"learning_rate": 2.4722715173025735e-05,
"loss": 0.9662,
"mean_token_accuracy": 0.7483490623533726,
"num_tokens": 214849284.0,
"step": 2280
},
{
"entropy": 1.3869496576488018,
"epoch": 0.5080703311331743,
"grad_norm": 0.58203125,
"learning_rate": 2.4611801242236027e-05,
"loss": 0.9855,
"mean_token_accuracy": 0.7459334179759025,
"num_tokens": 215795497.0,
"step": 2290
},
{
"entropy": 1.3464835032820701,
"epoch": 0.5102889788673803,
"grad_norm": 0.52734375,
"learning_rate": 2.450088731144632e-05,
"loss": 0.9559,
"mean_token_accuracy": 0.7510037913918495,
"num_tokens": 216730353.0,
"step": 2300
},
{
"entropy": 1.3536331675946713,
"epoch": 0.5125076266015863,
"grad_norm": 0.54296875,
"learning_rate": 2.438997338065661e-05,
"loss": 0.9768,
"mean_token_accuracy": 0.7474269300699234,
"num_tokens": 217652371.0,
"step": 2310
},
{
"entropy": 1.3597574278712272,
"epoch": 0.5147262743357923,
"grad_norm": 0.5390625,
"learning_rate": 2.4279059449866903e-05,
"loss": 0.9739,
"mean_token_accuracy": 0.7476049326360226,
"num_tokens": 218599573.0,
"step": 2320
},
{
"entropy": 1.3576934173703195,
"epoch": 0.5169449220699983,
"grad_norm": 0.54296875,
"learning_rate": 2.41681455190772e-05,
"loss": 0.9472,
"mean_token_accuracy": 0.7533730484545231,
"num_tokens": 219526034.0,
"step": 2330
},
{
"entropy": 1.3639849349856377,
"epoch": 0.5191635698042043,
"grad_norm": 0.55078125,
"learning_rate": 2.405723158828749e-05,
"loss": 1.0025,
"mean_token_accuracy": 0.7426088079810143,
"num_tokens": 220457735.0,
"step": 2340
},
{
"entropy": 1.3650838419795037,
"epoch": 0.5213822175384103,
"grad_norm": 0.5546875,
"learning_rate": 2.3946317657497783e-05,
"loss": 0.9795,
"mean_token_accuracy": 0.7475734516978264,
"num_tokens": 221414161.0,
"step": 2350
},
{
"entropy": 1.381436189264059,
"epoch": 0.5236008652726163,
"grad_norm": 0.5625,
"learning_rate": 2.3835403726708075e-05,
"loss": 0.9928,
"mean_token_accuracy": 0.744963239133358,
"num_tokens": 222354700.0,
"step": 2360
},
{
"entropy": 1.3839796632528305,
"epoch": 0.5258195130068224,
"grad_norm": 0.609375,
"learning_rate": 2.372448979591837e-05,
"loss": 0.9714,
"mean_token_accuracy": 0.7483658462762832,
"num_tokens": 223258825.0,
"step": 2370
},
{
"entropy": 1.3521512404084206,
"epoch": 0.5280381607410284,
"grad_norm": 0.5390625,
"learning_rate": 2.3613575865128663e-05,
"loss": 0.9965,
"mean_token_accuracy": 0.7439669594168663,
"num_tokens": 224207000.0,
"step": 2380
},
{
"entropy": 1.3651843503117562,
"epoch": 0.5302568084752344,
"grad_norm": 0.5546875,
"learning_rate": 2.3502661934338955e-05,
"loss": 0.9609,
"mean_token_accuracy": 0.7512055054306984,
"num_tokens": 225168371.0,
"step": 2390
},
{
"entropy": 1.3762024179100991,
"epoch": 0.5324754562094404,
"grad_norm": 0.67578125,
"learning_rate": 2.3391748003549247e-05,
"loss": 0.9818,
"mean_token_accuracy": 0.7457237169146538,
"num_tokens": 226082017.0,
"step": 2400
},
{
"entropy": 1.3472841560840607,
"epoch": 0.5346941039436464,
"grad_norm": 0.5234375,
"learning_rate": 2.328083407275954e-05,
"loss": 0.9581,
"mean_token_accuracy": 0.7504221297800541,
"num_tokens": 227034510.0,
"step": 2410
},
{
"entropy": 1.3381285414099693,
"epoch": 0.5369127516778524,
"grad_norm": 0.54296875,
"learning_rate": 2.3169920141969835e-05,
"loss": 0.9492,
"mean_token_accuracy": 0.7552238062024117,
"num_tokens": 228002765.0,
"step": 2420
},
{
"entropy": 1.3511281102895736,
"epoch": 0.5391313994120583,
"grad_norm": 0.5390625,
"learning_rate": 2.3059006211180127e-05,
"loss": 0.9393,
"mean_token_accuracy": 0.7557635813951492,
"num_tokens": 228965429.0,
"step": 2430
},
{
"entropy": 1.3392139934003353,
"epoch": 0.5413500471462643,
"grad_norm": 0.5859375,
"learning_rate": 2.294809228039042e-05,
"loss": 0.9343,
"mean_token_accuracy": 0.7574323169887066,
"num_tokens": 229896813.0,
"step": 2440
},
{
"entropy": 1.3686351031064987,
"epoch": 0.5435686948804703,
"grad_norm": 0.59765625,
"learning_rate": 2.283717834960071e-05,
"loss": 0.937,
"mean_token_accuracy": 0.7549462541937828,
"num_tokens": 230834752.0,
"step": 2450
},
{
"entropy": 1.339580136537552,
"epoch": 0.5457873426146763,
"grad_norm": 0.51171875,
"learning_rate": 2.2726264418811003e-05,
"loss": 0.9429,
"mean_token_accuracy": 0.7544058203697205,
"num_tokens": 231770668.0,
"step": 2460
},
{
"entropy": 1.3737561523914337,
"epoch": 0.5480059903488823,
"grad_norm": 0.5703125,
"learning_rate": 2.26153504880213e-05,
"loss": 0.9539,
"mean_token_accuracy": 0.752394187450409,
"num_tokens": 232706737.0,
"step": 2470
},
{
"entropy": 1.3709870815277099,
"epoch": 0.5502246380830884,
"grad_norm": 0.53125,
"learning_rate": 2.250443655723159e-05,
"loss": 0.9915,
"mean_token_accuracy": 0.7455207951366901,
"num_tokens": 233667028.0,
"step": 2480
},
{
"entropy": 1.3425948224961757,
"epoch": 0.5524432858172944,
"grad_norm": 0.5625,
"learning_rate": 2.2393522626441883e-05,
"loss": 0.9351,
"mean_token_accuracy": 0.7563626609742642,
"num_tokens": 234615377.0,
"step": 2490
},
{
"entropy": 1.382601398229599,
"epoch": 0.5546619335515004,
"grad_norm": 0.57421875,
"learning_rate": 2.2282608695652175e-05,
"loss": 0.9837,
"mean_token_accuracy": 0.7450042508542538,
"num_tokens": 235554965.0,
"step": 2500
},
{
"entropy": 1.3571919694542884,
"epoch": 0.5568805812857064,
"grad_norm": 0.60546875,
"learning_rate": 2.2171694764862467e-05,
"loss": 0.9675,
"mean_token_accuracy": 0.7487703949213028,
"num_tokens": 236510004.0,
"step": 2510
},
{
"entropy": 1.3876874506473542,
"epoch": 0.5590992290199124,
"grad_norm": 0.55859375,
"learning_rate": 2.206078083407276e-05,
"loss": 0.9859,
"mean_token_accuracy": 0.7465429671108723,
"num_tokens": 237426607.0,
"step": 2520
},
{
"entropy": 1.3840069979429246,
"epoch": 0.5613178767541184,
"grad_norm": 0.57421875,
"learning_rate": 2.1949866903283052e-05,
"loss": 0.9573,
"mean_token_accuracy": 0.7508729174733162,
"num_tokens": 238388324.0,
"step": 2530
},
{
"entropy": 1.3555088877677917,
"epoch": 0.5635365244883244,
"grad_norm": 0.55859375,
"learning_rate": 2.1838952972493347e-05,
"loss": 0.9508,
"mean_token_accuracy": 0.7524322152137757,
"num_tokens": 239305641.0,
"step": 2540
},
{
"entropy": 1.4074857875704765,
"epoch": 0.5657551722225304,
"grad_norm": 0.56640625,
"learning_rate": 2.172803904170364e-05,
"loss": 0.9851,
"mean_token_accuracy": 0.7465929470956325,
"num_tokens": 240221454.0,
"step": 2550
},
{
"entropy": 1.3208380579948424,
"epoch": 0.5679738199567363,
"grad_norm": 0.546875,
"learning_rate": 2.161712511091393e-05,
"loss": 0.9432,
"mean_token_accuracy": 0.754857836663723,
"num_tokens": 241170406.0,
"step": 2560
},
{
"entropy": 1.36037939786911,
"epoch": 0.5701924676909423,
"grad_norm": 0.53515625,
"learning_rate": 2.1506211180124224e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.7514487348496914,
"num_tokens": 242100616.0,
"step": 2570
},
{
"entropy": 1.3895253077149392,
"epoch": 0.5724111154251483,
"grad_norm": 0.52734375,
"learning_rate": 2.1395297249334516e-05,
"loss": 0.9847,
"mean_token_accuracy": 0.7462774030864239,
"num_tokens": 243038631.0,
"step": 2580
},
{
"entropy": 1.3492624297738076,
"epoch": 0.5746297631593543,
"grad_norm": 0.5390625,
"learning_rate": 2.1284383318544808e-05,
"loss": 0.9428,
"mean_token_accuracy": 0.7537525497376919,
"num_tokens": 244000961.0,
"step": 2590
},
{
"entropy": 1.357532762736082,
"epoch": 0.5768484108935604,
"grad_norm": 0.51171875,
"learning_rate": 2.1173469387755103e-05,
"loss": 0.9577,
"mean_token_accuracy": 0.7505512781441211,
"num_tokens": 244952283.0,
"step": 2600
},
{
"entropy": 1.3500149488449096,
"epoch": 0.5790670586277664,
"grad_norm": 0.5625,
"learning_rate": 2.1062555456965396e-05,
"loss": 0.9333,
"mean_token_accuracy": 0.757443331182003,
"num_tokens": 245897365.0,
"step": 2610
},
{
"entropy": 1.3495190888643265,
"epoch": 0.5812857063619724,
"grad_norm": 0.5390625,
"learning_rate": 2.0951641526175688e-05,
"loss": 0.9582,
"mean_token_accuracy": 0.7513453289866447,
"num_tokens": 246833155.0,
"step": 2620
},
{
"entropy": 1.3707278072834015,
"epoch": 0.5835043540961784,
"grad_norm": 0.55078125,
"learning_rate": 2.084072759538598e-05,
"loss": 0.9817,
"mean_token_accuracy": 0.7466557987034321,
"num_tokens": 247796159.0,
"step": 2630
},
{
"entropy": 1.340453139692545,
"epoch": 0.5857230018303844,
"grad_norm": 0.57421875,
"learning_rate": 2.0729813664596272e-05,
"loss": 0.9589,
"mean_token_accuracy": 0.7525463417172432,
"num_tokens": 248736277.0,
"step": 2640
},
{
"entropy": 1.3754316791892052,
"epoch": 0.5879416495645904,
"grad_norm": 0.51171875,
"learning_rate": 2.0618899733806567e-05,
"loss": 0.9697,
"mean_token_accuracy": 0.7479398109018802,
"num_tokens": 249662809.0,
"step": 2650
},
{
"entropy": 1.3680956415832042,
"epoch": 0.5901602972987964,
"grad_norm": 0.5625,
"learning_rate": 2.050798580301686e-05,
"loss": 0.9565,
"mean_token_accuracy": 0.7505876325070858,
"num_tokens": 250581187.0,
"step": 2660
},
{
"entropy": 1.372042527794838,
"epoch": 0.5923789450330024,
"grad_norm": 0.578125,
"learning_rate": 2.0397071872227152e-05,
"loss": 0.9573,
"mean_token_accuracy": 0.7499286234378815,
"num_tokens": 251494513.0,
"step": 2670
},
{
"entropy": 1.3676451787352562,
"epoch": 0.5945975927672084,
"grad_norm": 0.53515625,
"learning_rate": 2.0286157941437444e-05,
"loss": 1.0024,
"mean_token_accuracy": 0.7436093680560589,
"num_tokens": 252450478.0,
"step": 2680
},
{
"entropy": 1.321447344124317,
"epoch": 0.5968162405014144,
"grad_norm": 0.8046875,
"learning_rate": 2.0175244010647736e-05,
"loss": 0.9297,
"mean_token_accuracy": 0.7592410154640674,
"num_tokens": 253388200.0,
"step": 2690
},
{
"entropy": 1.367007777094841,
"epoch": 0.5990348882356203,
"grad_norm": 0.55859375,
"learning_rate": 2.006433007985803e-05,
"loss": 0.951,
"mean_token_accuracy": 0.7534979909658432,
"num_tokens": 254303611.0,
"step": 2700
},
{
"entropy": 1.3557642981410027,
"epoch": 0.6012535359698263,
"grad_norm": 0.55078125,
"learning_rate": 1.9953416149068324e-05,
"loss": 0.9455,
"mean_token_accuracy": 0.7532623074948788,
"num_tokens": 255267315.0,
"step": 2710
},
{
"entropy": 1.3694660350680352,
"epoch": 0.6034721837040324,
"grad_norm": 0.58203125,
"learning_rate": 1.9842502218278616e-05,
"loss": 0.9634,
"mean_token_accuracy": 0.7510243773460388,
"num_tokens": 256198305.0,
"step": 2720
},
{
"entropy": 1.3600564405322075,
"epoch": 0.6056908314382384,
"grad_norm": 0.53515625,
"learning_rate": 1.9731588287488908e-05,
"loss": 0.9597,
"mean_token_accuracy": 0.7512127391993999,
"num_tokens": 257177077.0,
"step": 2730
},
{
"entropy": 1.3179180152714252,
"epoch": 0.6079094791724444,
"grad_norm": 0.55859375,
"learning_rate": 1.9620674356699203e-05,
"loss": 0.935,
"mean_token_accuracy": 0.757300040870905,
"num_tokens": 258134360.0,
"step": 2740
},
{
"entropy": 1.35346722304821,
"epoch": 0.6101281269066504,
"grad_norm": 0.52734375,
"learning_rate": 1.9509760425909496e-05,
"loss": 0.945,
"mean_token_accuracy": 0.7527749851346016,
"num_tokens": 259094913.0,
"step": 2750
},
{
"entropy": 1.3553345277905464,
"epoch": 0.6123467746408564,
"grad_norm": 0.51953125,
"learning_rate": 1.9398846495119788e-05,
"loss": 0.9358,
"mean_token_accuracy": 0.7586644418537617,
"num_tokens": 260019758.0,
"step": 2760
},
{
"entropy": 1.3732210516929626,
"epoch": 0.6145654223750624,
"grad_norm": 0.57421875,
"learning_rate": 1.928793256433008e-05,
"loss": 0.9604,
"mean_token_accuracy": 0.7491915933787823,
"num_tokens": 260944733.0,
"step": 2770
},
{
"entropy": 1.3680385306477547,
"epoch": 0.6167840701092684,
"grad_norm": 0.56640625,
"learning_rate": 1.9177018633540372e-05,
"loss": 0.9676,
"mean_token_accuracy": 0.7478097401559353,
"num_tokens": 261873242.0,
"step": 2780
},
{
"entropy": 1.3868214182555676,
"epoch": 0.6190027178434744,
"grad_norm": 0.515625,
"learning_rate": 1.9066104702750667e-05,
"loss": 0.9516,
"mean_token_accuracy": 0.7528703935444355,
"num_tokens": 262814898.0,
"step": 2790
},
{
"entropy": 1.3595042198896408,
"epoch": 0.6212213655776804,
"grad_norm": 0.546875,
"learning_rate": 1.895519077196096e-05,
"loss": 0.9479,
"mean_token_accuracy": 0.7566476508975029,
"num_tokens": 263753086.0,
"step": 2800
},
{
"entropy": 1.33022148758173,
"epoch": 0.6234400133118864,
"grad_norm": 0.515625,
"learning_rate": 1.8844276841171252e-05,
"loss": 0.9321,
"mean_token_accuracy": 0.7572297543287277,
"num_tokens": 264700281.0,
"step": 2810
},
{
"entropy": 1.3536405637860298,
"epoch": 0.6256586610460924,
"grad_norm": 0.55078125,
"learning_rate": 1.8733362910381544e-05,
"loss": 0.9419,
"mean_token_accuracy": 0.7534777402877808,
"num_tokens": 265647657.0,
"step": 2820
},
{
"entropy": 1.3843684569001198,
"epoch": 0.6278773087802985,
"grad_norm": 0.57421875,
"learning_rate": 1.862244897959184e-05,
"loss": 0.9653,
"mean_token_accuracy": 0.7501688152551651,
"num_tokens": 266578441.0,
"step": 2830
},
{
"entropy": 1.3544567473232747,
"epoch": 0.6300959565145045,
"grad_norm": 0.515625,
"learning_rate": 1.851153504880213e-05,
"loss": 0.9457,
"mean_token_accuracy": 0.7535403810441494,
"num_tokens": 267530874.0,
"step": 2840
},
{
"entropy": 1.3669777683913709,
"epoch": 0.6323146042487104,
"grad_norm": 0.53125,
"learning_rate": 1.8400621118012424e-05,
"loss": 0.9422,
"mean_token_accuracy": 0.7547655880451203,
"num_tokens": 268461165.0,
"step": 2850
},
{
"entropy": 1.3307232797145843,
"epoch": 0.6345332519829164,
"grad_norm": 0.5625,
"learning_rate": 1.8289707187222716e-05,
"loss": 0.9172,
"mean_token_accuracy": 0.7619568608701229,
"num_tokens": 269407213.0,
"step": 2860
},
{
"entropy": 1.367132118344307,
"epoch": 0.6367518997171224,
"grad_norm": 0.54296875,
"learning_rate": 1.8178793256433008e-05,
"loss": 0.9508,
"mean_token_accuracy": 0.7541770383715629,
"num_tokens": 270328899.0,
"step": 2870
},
{
"entropy": 1.3831279791891575,
"epoch": 0.6389705474513284,
"grad_norm": 0.5625,
"learning_rate": 1.8067879325643303e-05,
"loss": 0.9817,
"mean_token_accuracy": 0.746974790096283,
"num_tokens": 271265664.0,
"step": 2880
},
{
"entropy": 1.3699244022369386,
"epoch": 0.6411891951855344,
"grad_norm": 0.5390625,
"learning_rate": 1.7956965394853596e-05,
"loss": 0.9696,
"mean_token_accuracy": 0.7492304258048534,
"num_tokens": 272186809.0,
"step": 2890
},
{
"entropy": 1.3639655753970146,
"epoch": 0.6434078429197404,
"grad_norm": 0.52734375,
"learning_rate": 1.7846051464063888e-05,
"loss": 0.9668,
"mean_token_accuracy": 0.7505564413964748,
"num_tokens": 273108290.0,
"step": 2900
},
{
"entropy": 1.403120481967926,
"epoch": 0.6456264906539464,
"grad_norm": 0.55078125,
"learning_rate": 1.773513753327418e-05,
"loss": 0.9769,
"mean_token_accuracy": 0.746848201751709,
"num_tokens": 274041696.0,
"step": 2910
},
{
"entropy": 1.4035961225628852,
"epoch": 0.6478451383881524,
"grad_norm": 0.53515625,
"learning_rate": 1.7624223602484475e-05,
"loss": 1.028,
"mean_token_accuracy": 0.7384353429079056,
"num_tokens": 274987065.0,
"step": 2920
},
{
"entropy": 1.3617764726281165,
"epoch": 0.6500637861223584,
"grad_norm": 0.55859375,
"learning_rate": 1.7513309671694767e-05,
"loss": 0.9566,
"mean_token_accuracy": 0.7519726864993572,
"num_tokens": 275931121.0,
"step": 2930
},
{
"entropy": 1.3632805831730366,
"epoch": 0.6522824338565644,
"grad_norm": 0.57421875,
"learning_rate": 1.740239574090506e-05,
"loss": 0.9548,
"mean_token_accuracy": 0.751621701568365,
"num_tokens": 276887903.0,
"step": 2940
},
{
"entropy": 1.3548810198903083,
"epoch": 0.6545010815907705,
"grad_norm": 0.51953125,
"learning_rate": 1.7291481810115352e-05,
"loss": 0.9696,
"mean_token_accuracy": 0.7500473111867905,
"num_tokens": 277797512.0,
"step": 2950
},
{
"entropy": 1.3761564634740353,
"epoch": 0.6567197293249765,
"grad_norm": 0.53125,
"learning_rate": 1.7180567879325644e-05,
"loss": 0.9801,
"mean_token_accuracy": 0.7459054350852966,
"num_tokens": 278758206.0,
"step": 2960
},
{
"entropy": 1.3604497477412223,
"epoch": 0.6589383770591825,
"grad_norm": 0.58203125,
"learning_rate": 1.706965394853594e-05,
"loss": 0.9701,
"mean_token_accuracy": 0.7494505539536476,
"num_tokens": 279682051.0,
"step": 2970
},
{
"entropy": 1.365204595029354,
"epoch": 0.6611570247933884,
"grad_norm": 0.51953125,
"learning_rate": 1.695874001774623e-05,
"loss": 0.9354,
"mean_token_accuracy": 0.7586885608732701,
"num_tokens": 280616177.0,
"step": 2980
},
{
"entropy": 1.3509592905640602,
"epoch": 0.6633756725275944,
"grad_norm": 0.5625,
"learning_rate": 1.6847826086956524e-05,
"loss": 0.9502,
"mean_token_accuracy": 0.7536125592887402,
"num_tokens": 281562822.0,
"step": 2990
},
{
"entropy": 1.361519905924797,
"epoch": 0.6655943202618004,
"grad_norm": 0.52734375,
"learning_rate": 1.6736912156166816e-05,
"loss": 0.9355,
"mean_token_accuracy": 0.755986961722374,
"num_tokens": 282503409.0,
"step": 3000
},
{
"entropy": 1.351868186891079,
"epoch": 0.6678129679960064,
"grad_norm": 0.53125,
"learning_rate": 1.6625998225377108e-05,
"loss": 0.9363,
"mean_token_accuracy": 0.7572803579270839,
"num_tokens": 283411000.0,
"step": 3010
},
{
"entropy": 1.3627968370914458,
"epoch": 0.6700316157302124,
"grad_norm": 0.5234375,
"learning_rate": 1.6515084294587403e-05,
"loss": 0.954,
"mean_token_accuracy": 0.7517252512276172,
"num_tokens": 284363870.0,
"step": 3020
},
{
"entropy": 1.350551488995552,
"epoch": 0.6722502634644184,
"grad_norm": 0.51953125,
"learning_rate": 1.6404170363797696e-05,
"loss": 0.9243,
"mean_token_accuracy": 0.7591398231685161,
"num_tokens": 285307777.0,
"step": 3030
},
{
"entropy": 1.3500189259648323,
"epoch": 0.6744689111986244,
"grad_norm": 0.55078125,
"learning_rate": 1.6293256433007988e-05,
"loss": 0.938,
"mean_token_accuracy": 0.7556370176374912,
"num_tokens": 286253628.0,
"step": 3040
},
{
"entropy": 1.344145791977644,
"epoch": 0.6766875589328304,
"grad_norm": 0.53515625,
"learning_rate": 1.618234250221828e-05,
"loss": 0.9545,
"mean_token_accuracy": 0.7536003112792968,
"num_tokens": 287183850.0,
"step": 3050
},
{
"entropy": 1.3805773958563805,
"epoch": 0.6789062066670365,
"grad_norm": 0.546875,
"learning_rate": 1.6071428571428572e-05,
"loss": 0.9733,
"mean_token_accuracy": 0.7494976818561554,
"num_tokens": 288100206.0,
"step": 3060
},
{
"entropy": 1.344843527674675,
"epoch": 0.6811248544012425,
"grad_norm": 0.5078125,
"learning_rate": 1.5960514640638864e-05,
"loss": 0.9458,
"mean_token_accuracy": 0.7523748345673085,
"num_tokens": 289028085.0,
"step": 3070
},
{
"entropy": 1.3617902539670468,
"epoch": 0.6833435021354485,
"grad_norm": 0.5,
"learning_rate": 1.5849600709849156e-05,
"loss": 0.9616,
"mean_token_accuracy": 0.7512977905571461,
"num_tokens": 289972169.0,
"step": 3080
},
{
"entropy": 1.3891084134578704,
"epoch": 0.6855621498696545,
"grad_norm": 0.515625,
"learning_rate": 1.573868677905945e-05,
"loss": 0.984,
"mean_token_accuracy": 0.7475369438529015,
"num_tokens": 290890546.0,
"step": 3090
},
{
"entropy": 1.3548466391861438,
"epoch": 0.6877807976038605,
"grad_norm": 0.58984375,
"learning_rate": 1.5627772848269744e-05,
"loss": 0.9156,
"mean_token_accuracy": 0.7610405057668685,
"num_tokens": 291831317.0,
"step": 3100
},
{
"entropy": 1.3873838737607003,
"epoch": 0.6899994453380665,
"grad_norm": 0.5234375,
"learning_rate": 1.5516858917480036e-05,
"loss": 0.9849,
"mean_token_accuracy": 0.7466968774795533,
"num_tokens": 292773910.0,
"step": 3110
},
{
"entropy": 1.374309216439724,
"epoch": 0.6922180930722724,
"grad_norm": 0.56640625,
"learning_rate": 1.5405944986690328e-05,
"loss": 0.9691,
"mean_token_accuracy": 0.7497981458902359,
"num_tokens": 293706792.0,
"step": 3120
},
{
"entropy": 1.3540133006870747,
"epoch": 0.6944367408064784,
"grad_norm": 0.54296875,
"learning_rate": 1.529503105590062e-05,
"loss": 0.9675,
"mean_token_accuracy": 0.7486146375536918,
"num_tokens": 294649185.0,
"step": 3130
},
{
"entropy": 1.3608009479939938,
"epoch": 0.6966553885406844,
"grad_norm": 0.5546875,
"learning_rate": 1.5184117125110914e-05,
"loss": 0.9602,
"mean_token_accuracy": 0.7509834311902523,
"num_tokens": 295571599.0,
"step": 3140
},
{
"entropy": 1.3526781380176545,
"epoch": 0.6988740362748904,
"grad_norm": 0.54296875,
"learning_rate": 1.5073203194321208e-05,
"loss": 0.9838,
"mean_token_accuracy": 0.7449854724109173,
"num_tokens": 296510262.0,
"step": 3150
},
{
"entropy": 1.3786792248487472,
"epoch": 0.7010926840090964,
"grad_norm": 0.5546875,
"learning_rate": 1.4962289263531502e-05,
"loss": 0.9576,
"mean_token_accuracy": 0.7524838514626027,
"num_tokens": 297462659.0,
"step": 3160
},
{
"entropy": 1.37396137714386,
"epoch": 0.7033113317433024,
"grad_norm": 0.51953125,
"learning_rate": 1.4851375332741794e-05,
"loss": 0.9655,
"mean_token_accuracy": 0.7482963159680367,
"num_tokens": 298410420.0,
"step": 3170
},
{
"entropy": 1.3479665741324425,
"epoch": 0.7055299794775085,
"grad_norm": 0.5234375,
"learning_rate": 1.4740461401952086e-05,
"loss": 0.9735,
"mean_token_accuracy": 0.7480811208486557,
"num_tokens": 299335997.0,
"step": 3180
},
{
"entropy": 1.4149656519293785,
"epoch": 0.7077486272117145,
"grad_norm": 0.4921875,
"learning_rate": 1.4629547471162378e-05,
"loss": 0.9868,
"mean_token_accuracy": 0.7450309813022613,
"num_tokens": 300300688.0,
"step": 3190
},
{
"entropy": 1.3166653901338576,
"epoch": 0.7099672749459205,
"grad_norm": 0.52734375,
"learning_rate": 1.4518633540372672e-05,
"loss": 0.9244,
"mean_token_accuracy": 0.7599585182964802,
"num_tokens": 301275137.0,
"step": 3200
},
{
"entropy": 1.3608283437788486,
"epoch": 0.7121859226801265,
"grad_norm": 0.53125,
"learning_rate": 1.4407719609582964e-05,
"loss": 0.9792,
"mean_token_accuracy": 0.7479373283684254,
"num_tokens": 302208727.0,
"step": 3210
},
{
"entropy": 1.3675961531698704,
"epoch": 0.7144045704143325,
"grad_norm": 0.53125,
"learning_rate": 1.4296805678793256e-05,
"loss": 0.9522,
"mean_token_accuracy": 0.7523340001702309,
"num_tokens": 303160006.0,
"step": 3220
},
{
"entropy": 1.3157847836613654,
"epoch": 0.7166232181485385,
"grad_norm": 0.51953125,
"learning_rate": 1.4185891748003548e-05,
"loss": 0.9469,
"mean_token_accuracy": 0.7520846240222454,
"num_tokens": 304102524.0,
"step": 3230
},
{
"entropy": 1.3752693004906178,
"epoch": 0.7188418658827445,
"grad_norm": 0.52734375,
"learning_rate": 1.4074977817213844e-05,
"loss": 0.9426,
"mean_token_accuracy": 0.753889911621809,
"num_tokens": 305042287.0,
"step": 3240
},
{
"entropy": 1.3292134046554565,
"epoch": 0.7210605136169504,
"grad_norm": 0.61328125,
"learning_rate": 1.3964063886424136e-05,
"loss": 0.9464,
"mean_token_accuracy": 0.754455479234457,
"num_tokens": 305988003.0,
"step": 3250
},
{
"entropy": 1.3723205238580705,
"epoch": 0.7232791613511564,
"grad_norm": 0.578125,
"learning_rate": 1.3853149955634428e-05,
"loss": 0.9942,
"mean_token_accuracy": 0.7461909614503384,
"num_tokens": 306927584.0,
"step": 3260
},
{
"entropy": 1.3628524258732795,
"epoch": 0.7254978090853624,
"grad_norm": 0.56640625,
"learning_rate": 1.374223602484472e-05,
"loss": 0.9594,
"mean_token_accuracy": 0.7528522469103336,
"num_tokens": 307863697.0,
"step": 3270
},
{
"entropy": 1.353959833085537,
"epoch": 0.7277164568195684,
"grad_norm": 0.5390625,
"learning_rate": 1.3631322094055012e-05,
"loss": 0.9472,
"mean_token_accuracy": 0.7561062417924405,
"num_tokens": 308808276.0,
"step": 3280
},
{
"entropy": 1.3523946583271027,
"epoch": 0.7299351045537745,
"grad_norm": 0.470703125,
"learning_rate": 1.3520408163265308e-05,
"loss": 0.9578,
"mean_token_accuracy": 0.7514262087643147,
"num_tokens": 309773086.0,
"step": 3290
},
{
"entropy": 1.3321008674800396,
"epoch": 0.7321537522879805,
"grad_norm": 0.54296875,
"learning_rate": 1.34094942324756e-05,
"loss": 0.9513,
"mean_token_accuracy": 0.7530274912714958,
"num_tokens": 310728967.0,
"step": 3300
},
{
"entropy": 1.3726357147097588,
"epoch": 0.7343724000221865,
"grad_norm": 0.55078125,
"learning_rate": 1.3298580301685892e-05,
"loss": 0.9519,
"mean_token_accuracy": 0.7526456661522388,
"num_tokens": 311671029.0,
"step": 3310
},
{
"entropy": 1.3460698679089547,
"epoch": 0.7365910477563925,
"grad_norm": 0.59765625,
"learning_rate": 1.3187666370896184e-05,
"loss": 0.977,
"mean_token_accuracy": 0.7480454221367836,
"num_tokens": 312608775.0,
"step": 3320
},
{
"entropy": 1.358740784227848,
"epoch": 0.7388096954905985,
"grad_norm": 0.54296875,
"learning_rate": 1.3076752440106476e-05,
"loss": 0.9388,
"mean_token_accuracy": 0.7564548753201962,
"num_tokens": 313562050.0,
"step": 3330
},
{
"entropy": 1.3844745293259622,
"epoch": 0.7410283432248045,
"grad_norm": 0.5703125,
"learning_rate": 1.2965838509316772e-05,
"loss": 0.9834,
"mean_token_accuracy": 0.7466944210231304,
"num_tokens": 314509120.0,
"step": 3340
},
{
"entropy": 1.3659690007567407,
"epoch": 0.7432469909590105,
"grad_norm": 0.51171875,
"learning_rate": 1.2854924578527064e-05,
"loss": 0.941,
"mean_token_accuracy": 0.7546365484595299,
"num_tokens": 315486453.0,
"step": 3350
},
{
"entropy": 1.3873593926429748,
"epoch": 0.7454656386932165,
"grad_norm": 0.5390625,
"learning_rate": 1.2744010647737356e-05,
"loss": 0.985,
"mean_token_accuracy": 0.7476673908531666,
"num_tokens": 316431477.0,
"step": 3360
},
{
"entropy": 1.3676550433039665,
"epoch": 0.7476842864274225,
"grad_norm": 0.5234375,
"learning_rate": 1.2633096716947648e-05,
"loss": 0.9627,
"mean_token_accuracy": 0.7511092610657215,
"num_tokens": 317357092.0,
"step": 3370
},
{
"entropy": 1.3835733927786351,
"epoch": 0.7499029341616285,
"grad_norm": 0.55078125,
"learning_rate": 1.2522182786157944e-05,
"loss": 0.948,
"mean_token_accuracy": 0.7537398427724838,
"num_tokens": 318280117.0,
"step": 3380
},
{
"entropy": 1.3766888722777366,
"epoch": 0.7521215818958344,
"grad_norm": 0.55859375,
"learning_rate": 1.2411268855368236e-05,
"loss": 0.9683,
"mean_token_accuracy": 0.7515489347279072,
"num_tokens": 319203557.0,
"step": 3390
},
{
"entropy": 1.357860617339611,
"epoch": 0.7543402296300404,
"grad_norm": 0.515625,
"learning_rate": 1.2300354924578528e-05,
"loss": 0.9535,
"mean_token_accuracy": 0.752687606215477,
"num_tokens": 320136852.0,
"step": 3400
},
{
"entropy": 1.3469961121678353,
"epoch": 0.7565588773642465,
"grad_norm": 0.54296875,
"learning_rate": 1.218944099378882e-05,
"loss": 0.9459,
"mean_token_accuracy": 0.7537472225725651,
"num_tokens": 321106324.0,
"step": 3410
},
{
"entropy": 1.334907030314207,
"epoch": 0.7587775250984525,
"grad_norm": 0.55078125,
"learning_rate": 1.2078527062999114e-05,
"loss": 0.9359,
"mean_token_accuracy": 0.7552877001464366,
"num_tokens": 322042151.0,
"step": 3420
},
{
"entropy": 1.3580046392977239,
"epoch": 0.7609961728326585,
"grad_norm": 0.53125,
"learning_rate": 1.1967613132209406e-05,
"loss": 0.9404,
"mean_token_accuracy": 0.7551106229424477,
"num_tokens": 322952370.0,
"step": 3430
},
{
"entropy": 1.3473434820771217,
"epoch": 0.7632148205668645,
"grad_norm": 0.54296875,
"learning_rate": 1.18566992014197e-05,
"loss": 0.9527,
"mean_token_accuracy": 0.7552920714020729,
"num_tokens": 323909905.0,
"step": 3440
},
{
"entropy": 1.388922219723463,
"epoch": 0.7654334683010705,
"grad_norm": 0.56640625,
"learning_rate": 1.1745785270629992e-05,
"loss": 0.9835,
"mean_token_accuracy": 0.7475064925849437,
"num_tokens": 324843712.0,
"step": 3450
},
{
"entropy": 1.3555053889751434,
"epoch": 0.7676521160352765,
"grad_norm": 0.55078125,
"learning_rate": 1.1634871339840284e-05,
"loss": 0.9791,
"mean_token_accuracy": 0.747514633089304,
"num_tokens": 325763726.0,
"step": 3460
},
{
"entropy": 1.4119513988494874,
"epoch": 0.7698707637694825,
"grad_norm": 0.5703125,
"learning_rate": 1.1523957409050576e-05,
"loss": 0.9634,
"mean_token_accuracy": 0.752115435898304,
"num_tokens": 326704959.0,
"step": 3470
},
{
"entropy": 1.358751341700554,
"epoch": 0.7720894115036885,
"grad_norm": 0.5078125,
"learning_rate": 1.141304347826087e-05,
"loss": 0.9502,
"mean_token_accuracy": 0.752843676507473,
"num_tokens": 327654129.0,
"step": 3480
},
{
"entropy": 1.373246306180954,
"epoch": 0.7743080592378945,
"grad_norm": 0.56640625,
"learning_rate": 1.1302129547471162e-05,
"loss": 0.9896,
"mean_token_accuracy": 0.7471863307058811,
"num_tokens": 328577246.0,
"step": 3490
},
{
"entropy": 1.3401599921286107,
"epoch": 0.7765267069721005,
"grad_norm": 0.51171875,
"learning_rate": 1.1191215616681455e-05,
"loss": 0.9046,
"mean_token_accuracy": 0.7622545510530472,
"num_tokens": 329531531.0,
"step": 3500
},
{
"entropy": 1.403013862669468,
"epoch": 0.7787453547063065,
"grad_norm": 0.5234375,
"learning_rate": 1.1080301685891748e-05,
"loss": 1.0199,
"mean_token_accuracy": 0.7393336437642575,
"num_tokens": 330493898.0,
"step": 3510
},
{
"entropy": 1.3496449366211891,
"epoch": 0.7809640024405126,
"grad_norm": 0.5859375,
"learning_rate": 1.096938775510204e-05,
"loss": 0.9422,
"mean_token_accuracy": 0.7540981650352478,
"num_tokens": 331418294.0,
"step": 3520
},
{
"entropy": 1.344119517505169,
"epoch": 0.7831826501747186,
"grad_norm": 0.53515625,
"learning_rate": 1.0858473824312334e-05,
"loss": 0.9386,
"mean_token_accuracy": 0.757073562592268,
"num_tokens": 332378464.0,
"step": 3530
},
{
"entropy": 1.3490448504686356,
"epoch": 0.7854012979089245,
"grad_norm": 0.53125,
"learning_rate": 1.0747559893522626e-05,
"loss": 0.9457,
"mean_token_accuracy": 0.7535859100520611,
"num_tokens": 333318703.0,
"step": 3540
},
{
"entropy": 1.3718070283532142,
"epoch": 0.7876199456431305,
"grad_norm": 0.546875,
"learning_rate": 1.063664596273292e-05,
"loss": 0.9685,
"mean_token_accuracy": 0.7511322259902954,
"num_tokens": 334255047.0,
"step": 3550
},
{
"entropy": 1.3711335480213165,
"epoch": 0.7898385933773365,
"grad_norm": 0.515625,
"learning_rate": 1.0525732031943212e-05,
"loss": 0.9743,
"mean_token_accuracy": 0.7471988372504711,
"num_tokens": 335189458.0,
"step": 3560
},
{
"entropy": 1.3890349462628364,
"epoch": 0.7920572411115425,
"grad_norm": 0.53125,
"learning_rate": 1.0414818101153505e-05,
"loss": 0.9872,
"mean_token_accuracy": 0.7452017098665238,
"num_tokens": 336139155.0,
"step": 3570
},
{
"entropy": 1.336748766899109,
"epoch": 0.7942758888457485,
"grad_norm": 0.5390625,
"learning_rate": 1.0303904170363798e-05,
"loss": 0.9194,
"mean_token_accuracy": 0.7595400720834732,
"num_tokens": 337103166.0,
"step": 3580
},
{
"entropy": 1.3947007723152638,
"epoch": 0.7964945365799545,
"grad_norm": 0.53515625,
"learning_rate": 1.019299023957409e-05,
"loss": 0.9857,
"mean_token_accuracy": 0.7481105640530586,
"num_tokens": 338049665.0,
"step": 3590
},
{
"entropy": 1.3394004009664058,
"epoch": 0.7987131843141605,
"grad_norm": 0.5546875,
"learning_rate": 1.0082076308784384e-05,
"loss": 0.9501,
"mean_token_accuracy": 0.7537369303405285,
"num_tokens": 339030359.0,
"step": 3600
},
{
"entropy": 1.4002343088388443,
"epoch": 0.8009318320483665,
"grad_norm": 0.5625,
"learning_rate": 9.971162377994676e-06,
"loss": 0.9899,
"mean_token_accuracy": 0.7460181936621666,
"num_tokens": 339965846.0,
"step": 3610
},
{
"entropy": 1.3751978531479836,
"epoch": 0.8031504797825725,
"grad_norm": 0.53125,
"learning_rate": 9.86024844720497e-06,
"loss": 0.9663,
"mean_token_accuracy": 0.7495487280189991,
"num_tokens": 340909085.0,
"step": 3620
},
{
"entropy": 1.3296589955687523,
"epoch": 0.8053691275167785,
"grad_norm": 0.5390625,
"learning_rate": 9.749334516415262e-06,
"loss": 0.9116,
"mean_token_accuracy": 0.7615578956902027,
"num_tokens": 341836396.0,
"step": 3630
},
{
"entropy": 1.3545545935630798,
"epoch": 0.8075877752509846,
"grad_norm": 0.5546875,
"learning_rate": 9.638420585625555e-06,
"loss": 0.946,
"mean_token_accuracy": 0.7542130470275878,
"num_tokens": 342759623.0,
"step": 3640
},
{
"entropy": 1.3891134530305862,
"epoch": 0.8098064229851906,
"grad_norm": 0.57421875,
"learning_rate": 9.527506654835848e-06,
"loss": 1.0098,
"mean_token_accuracy": 0.7399868927896023,
"num_tokens": 343714548.0,
"step": 3650
},
{
"entropy": 1.3653203830122949,
"epoch": 0.8120250707193966,
"grad_norm": 0.53125,
"learning_rate": 9.41659272404614e-06,
"loss": 0.9689,
"mean_token_accuracy": 0.747505272179842,
"num_tokens": 344676512.0,
"step": 3660
},
{
"entropy": 1.3524901941418648,
"epoch": 0.8142437184536025,
"grad_norm": 0.5625,
"learning_rate": 9.305678793256434e-06,
"loss": 0.9416,
"mean_token_accuracy": 0.7552405230700969,
"num_tokens": 345609814.0,
"step": 3670
},
{
"entropy": 1.3355680212378502,
"epoch": 0.8164623661878085,
"grad_norm": 0.55078125,
"learning_rate": 9.194764862466726e-06,
"loss": 0.9409,
"mean_token_accuracy": 0.7546605832874775,
"num_tokens": 346552116.0,
"step": 3680
},
{
"entropy": 1.3585198432207108,
"epoch": 0.8186810139220145,
"grad_norm": 0.53125,
"learning_rate": 9.08385093167702e-06,
"loss": 0.9451,
"mean_token_accuracy": 0.7566642910242081,
"num_tokens": 347476547.0,
"step": 3690
},
{
"entropy": 1.3646457374095917,
"epoch": 0.8208996616562205,
"grad_norm": 0.56640625,
"learning_rate": 8.972937000887312e-06,
"loss": 0.9328,
"mean_token_accuracy": 0.757544395327568,
"num_tokens": 348402286.0,
"step": 3700
},
{
"entropy": 1.4008646070957185,
"epoch": 0.8231183093904265,
"grad_norm": 0.55859375,
"learning_rate": 8.862023070097605e-06,
"loss": 1.0166,
"mean_token_accuracy": 0.7399243280291558,
"num_tokens": 349350422.0,
"step": 3710
},
{
"entropy": 1.305922406166792,
"epoch": 0.8253369571246325,
"grad_norm": 0.53125,
"learning_rate": 8.751109139307898e-06,
"loss": 0.9002,
"mean_token_accuracy": 0.7653868660330773,
"num_tokens": 350307992.0,
"step": 3720
},
{
"entropy": 1.344923496246338,
"epoch": 0.8275556048588385,
"grad_norm": 0.52734375,
"learning_rate": 8.64019520851819e-06,
"loss": 0.923,
"mean_token_accuracy": 0.7606289356946945,
"num_tokens": 351239736.0,
"step": 3730
},
{
"entropy": 1.356829535961151,
"epoch": 0.8297742525930445,
"grad_norm": 0.54296875,
"learning_rate": 8.529281277728483e-06,
"loss": 0.9208,
"mean_token_accuracy": 0.7572924271225929,
"num_tokens": 352175438.0,
"step": 3740
},
{
"entropy": 1.371825471520424,
"epoch": 0.8319929003272506,
"grad_norm": 0.55859375,
"learning_rate": 8.418367346938775e-06,
"loss": 0.9769,
"mean_token_accuracy": 0.7484906286001205,
"num_tokens": 353093469.0,
"step": 3750
},
{
"entropy": 1.3505297660827638,
"epoch": 0.8342115480614566,
"grad_norm": 0.55078125,
"learning_rate": 8.307453416149069e-06,
"loss": 0.9634,
"mean_token_accuracy": 0.748659697920084,
"num_tokens": 354042260.0,
"step": 3760
},
{
"entropy": 1.3741331085562707,
"epoch": 0.8364301957956626,
"grad_norm": 0.5390625,
"learning_rate": 8.19653948535936e-06,
"loss": 0.982,
"mean_token_accuracy": 0.7466577455401421,
"num_tokens": 354960941.0,
"step": 3770
},
{
"entropy": 1.3521684527397155,
"epoch": 0.8386488435298686,
"grad_norm": 0.54296875,
"learning_rate": 8.085625554569655e-06,
"loss": 0.951,
"mean_token_accuracy": 0.7545395441353321,
"num_tokens": 355899405.0,
"step": 3780
},
{
"entropy": 1.3922821909189225,
"epoch": 0.8408674912640746,
"grad_norm": 0.5390625,
"learning_rate": 7.974711623779947e-06,
"loss": 0.9774,
"mean_token_accuracy": 0.7484460555016994,
"num_tokens": 356837111.0,
"step": 3790
},
{
"entropy": 1.341869878768921,
"epoch": 0.8430861389982806,
"grad_norm": 0.50390625,
"learning_rate": 7.863797692990239e-06,
"loss": 0.9371,
"mean_token_accuracy": 0.7555838227272034,
"num_tokens": 357775995.0,
"step": 3800
},
{
"entropy": 1.3769854843616485,
"epoch": 0.8453047867324865,
"grad_norm": 0.5546875,
"learning_rate": 7.752883762200533e-06,
"loss": 0.9788,
"mean_token_accuracy": 0.7470424689352513,
"num_tokens": 358730556.0,
"step": 3810
},
{
"entropy": 1.3654131770133973,
"epoch": 0.8475234344666925,
"grad_norm": 0.59375,
"learning_rate": 7.641969831410825e-06,
"loss": 0.9543,
"mean_token_accuracy": 0.7543313026428222,
"num_tokens": 359702857.0,
"step": 3820
},
{
"entropy": 1.3479675091803074,
"epoch": 0.8497420822008985,
"grad_norm": 0.5078125,
"learning_rate": 7.5310559006211186e-06,
"loss": 0.9434,
"mean_token_accuracy": 0.7545025050640106,
"num_tokens": 360637451.0,
"step": 3830
},
{
"entropy": 1.368970339745283,
"epoch": 0.8519607299351045,
"grad_norm": 0.53515625,
"learning_rate": 7.420141969831411e-06,
"loss": 0.9585,
"mean_token_accuracy": 0.7518557466566562,
"num_tokens": 361574227.0,
"step": 3840
},
{
"entropy": 1.3598952896893024,
"epoch": 0.8541793776693105,
"grad_norm": 0.5234375,
"learning_rate": 7.3092280390417045e-06,
"loss": 0.9427,
"mean_token_accuracy": 0.754470182955265,
"num_tokens": 362506193.0,
"step": 3850
},
{
"entropy": 1.3638700023293495,
"epoch": 0.8563980254035165,
"grad_norm": 0.5390625,
"learning_rate": 7.198314108251997e-06,
"loss": 0.9701,
"mean_token_accuracy": 0.7472980074584484,
"num_tokens": 363441282.0,
"step": 3860
},
{
"entropy": 1.3546796232461928,
"epoch": 0.8586166731377226,
"grad_norm": 0.4765625,
"learning_rate": 7.0874001774622905e-06,
"loss": 0.9753,
"mean_token_accuracy": 0.7478179946541786,
"num_tokens": 364393822.0,
"step": 3870
},
{
"entropy": 1.350717130303383,
"epoch": 0.8608353208719286,
"grad_norm": 0.55859375,
"learning_rate": 6.976486246672583e-06,
"loss": 0.9486,
"mean_token_accuracy": 0.7572776488959789,
"num_tokens": 365331779.0,
"step": 3880
},
{
"entropy": 1.3585814163088799,
"epoch": 0.8630539686061346,
"grad_norm": 0.5859375,
"learning_rate": 6.865572315882875e-06,
"loss": 0.9629,
"mean_token_accuracy": 0.748991634696722,
"num_tokens": 366257855.0,
"step": 3890
},
{
"entropy": 1.3992498129606248,
"epoch": 0.8652726163403406,
"grad_norm": 0.58203125,
"learning_rate": 6.7546583850931686e-06,
"loss": 0.9949,
"mean_token_accuracy": 0.7451303206384182,
"num_tokens": 367181436.0,
"step": 3900
},
{
"entropy": 1.3461244717240333,
"epoch": 0.8674912640745466,
"grad_norm": 0.52734375,
"learning_rate": 6.643744454303461e-06,
"loss": 0.9381,
"mean_token_accuracy": 0.7554601080715656,
"num_tokens": 368128436.0,
"step": 3910
},
{
"entropy": 1.3403765760362147,
"epoch": 0.8697099118087526,
"grad_norm": 0.5390625,
"learning_rate": 6.532830523513754e-06,
"loss": 0.9265,
"mean_token_accuracy": 0.7592454843223095,
"num_tokens": 369092081.0,
"step": 3920
},
{
"entropy": 1.3784636914730073,
"epoch": 0.8719285595429586,
"grad_norm": 0.54296875,
"learning_rate": 6.421916592724047e-06,
"loss": 0.9607,
"mean_token_accuracy": 0.751040443778038,
"num_tokens": 370022755.0,
"step": 3930
},
{
"entropy": 1.3625924080610274,
"epoch": 0.8741472072771646,
"grad_norm": 0.56640625,
"learning_rate": 6.31100266193434e-06,
"loss": 0.9696,
"mean_token_accuracy": 0.7501497231423855,
"num_tokens": 370963738.0,
"step": 3940
},
{
"entropy": 1.3465173587203025,
"epoch": 0.8763658550113705,
"grad_norm": 0.56640625,
"learning_rate": 6.200088731144632e-06,
"loss": 0.9578,
"mean_token_accuracy": 0.7541100673377514,
"num_tokens": 371888931.0,
"step": 3950
},
{
"entropy": 1.3527381241321563,
"epoch": 0.8785845027455765,
"grad_norm": 0.5390625,
"learning_rate": 6.089174800354925e-06,
"loss": 0.9467,
"mean_token_accuracy": 0.7544343665242195,
"num_tokens": 372849693.0,
"step": 3960
},
{
"entropy": 1.3818270325660706,
"epoch": 0.8808031504797825,
"grad_norm": 0.55078125,
"learning_rate": 5.978260869565218e-06,
"loss": 0.9551,
"mean_token_accuracy": 0.7534758277237416,
"num_tokens": 373792435.0,
"step": 3970
},
{
"entropy": 1.3504199832677841,
"epoch": 0.8830217982139886,
"grad_norm": 0.5234375,
"learning_rate": 5.867346938775511e-06,
"loss": 0.9445,
"mean_token_accuracy": 0.756835724413395,
"num_tokens": 374746035.0,
"step": 3980
},
{
"entropy": 1.3496798947453499,
"epoch": 0.8852404459481946,
"grad_norm": 0.53125,
"learning_rate": 5.756433007985803e-06,
"loss": 0.9419,
"mean_token_accuracy": 0.7544379711151123,
"num_tokens": 375703477.0,
"step": 3990
},
{
"entropy": 1.3732656255364417,
"epoch": 0.8874590936824006,
"grad_norm": 0.546875,
"learning_rate": 5.645519077196096e-06,
"loss": 0.9604,
"mean_token_accuracy": 0.751821743696928,
"num_tokens": 376636971.0,
"step": 4000
},
{
"entropy": 1.375483873486519,
"epoch": 0.8896777414166066,
"grad_norm": 0.52734375,
"learning_rate": 5.534605146406389e-06,
"loss": 0.9671,
"mean_token_accuracy": 0.7487936913967133,
"num_tokens": 377588517.0,
"step": 4010
},
{
"entropy": 1.3773034647107125,
"epoch": 0.8918963891508126,
"grad_norm": 0.55859375,
"learning_rate": 5.423691215616682e-06,
"loss": 0.9667,
"mean_token_accuracy": 0.7501926451921463,
"num_tokens": 378524822.0,
"step": 4020
},
{
"entropy": 1.3265659905970097,
"epoch": 0.8941150368850186,
"grad_norm": 0.54296875,
"learning_rate": 5.312777284826975e-06,
"loss": 0.9452,
"mean_token_accuracy": 0.7550780981779098,
"num_tokens": 379505593.0,
"step": 4030
},
{
"entropy": 1.335418175160885,
"epoch": 0.8963336846192246,
"grad_norm": 0.58203125,
"learning_rate": 5.201863354037268e-06,
"loss": 0.9482,
"mean_token_accuracy": 0.7534942403435707,
"num_tokens": 380473052.0,
"step": 4040
},
{
"entropy": 1.3610256776213645,
"epoch": 0.8985523323534306,
"grad_norm": 0.5546875,
"learning_rate": 5.090949423247561e-06,
"loss": 0.958,
"mean_token_accuracy": 0.7533226810395718,
"num_tokens": 381443551.0,
"step": 4050
},
{
"entropy": 1.3507319584488868,
"epoch": 0.9007709800876366,
"grad_norm": 0.55078125,
"learning_rate": 4.980035492457853e-06,
"loss": 0.9489,
"mean_token_accuracy": 0.7544699974358082,
"num_tokens": 382378572.0,
"step": 4060
},
{
"entropy": 1.3752561420202256,
"epoch": 0.9029896278218426,
"grad_norm": 0.53515625,
"learning_rate": 4.869121561668146e-06,
"loss": 0.9519,
"mean_token_accuracy": 0.7520841076970101,
"num_tokens": 383311592.0,
"step": 4070
},
{
"entropy": 1.3476091951131821,
"epoch": 0.9052082755560485,
"grad_norm": 0.5234375,
"learning_rate": 4.758207630878438e-06,
"loss": 0.9415,
"mean_token_accuracy": 0.7553776867687703,
"num_tokens": 384248057.0,
"step": 4080
},
{
"entropy": 1.3605633400380612,
"epoch": 0.9074269232902545,
"grad_norm": 0.56640625,
"learning_rate": 4.647293700088731e-06,
"loss": 0.9244,
"mean_token_accuracy": 0.7569485224783421,
"num_tokens": 385189392.0,
"step": 4090
},
{
"entropy": 1.365138278901577,
"epoch": 0.9096455710244606,
"grad_norm": 0.5078125,
"learning_rate": 4.536379769299024e-06,
"loss": 0.9531,
"mean_token_accuracy": 0.7533132433891296,
"num_tokens": 386110731.0,
"step": 4100
},
{
"entropy": 1.3613657392561436,
"epoch": 0.9118642187586666,
"grad_norm": 0.55078125,
"learning_rate": 4.425465838509317e-06,
"loss": 0.943,
"mean_token_accuracy": 0.7544411860406399,
"num_tokens": 387057495.0,
"step": 4110
},
{
"entropy": 1.3290772818028926,
"epoch": 0.9140828664928726,
"grad_norm": 0.5546875,
"learning_rate": 4.31455190771961e-06,
"loss": 0.9183,
"mean_token_accuracy": 0.7608602307736874,
"num_tokens": 388021016.0,
"step": 4120
},
{
"entropy": 1.3475232422351837,
"epoch": 0.9163015142270786,
"grad_norm": 0.52734375,
"learning_rate": 4.203637976929903e-06,
"loss": 0.9411,
"mean_token_accuracy": 0.7554165907204151,
"num_tokens": 388924467.0,
"step": 4130
},
{
"entropy": 1.3602249071002006,
"epoch": 0.9185201619612846,
"grad_norm": 0.51953125,
"learning_rate": 4.092724046140196e-06,
"loss": 0.9307,
"mean_token_accuracy": 0.7554832518100738,
"num_tokens": 389879904.0,
"step": 4140
},
{
"entropy": 1.332291903346777,
"epoch": 0.9207388096954906,
"grad_norm": 0.5234375,
"learning_rate": 3.981810115350488e-06,
"loss": 0.9559,
"mean_token_accuracy": 0.752610693871975,
"num_tokens": 390857519.0,
"step": 4150
},
{
"entropy": 1.3272889666259289,
"epoch": 0.9229574574296966,
"grad_norm": 0.53125,
"learning_rate": 3.870896184560781e-06,
"loss": 0.9306,
"mean_token_accuracy": 0.7583662964403629,
"num_tokens": 391792594.0,
"step": 4160
},
{
"entropy": 1.3560280472040176,
"epoch": 0.9251761051639026,
"grad_norm": 0.5390625,
"learning_rate": 3.759982253771074e-06,
"loss": 0.9577,
"mean_token_accuracy": 0.7537056483328343,
"num_tokens": 392749013.0,
"step": 4170
},
{
"entropy": 1.3662122264504433,
"epoch": 0.9273947528981086,
"grad_norm": 0.53125,
"learning_rate": 3.6490683229813664e-06,
"loss": 0.9715,
"mean_token_accuracy": 0.7491789266467095,
"num_tokens": 393666251.0,
"step": 4180
},
{
"entropy": 1.31452574133873,
"epoch": 0.9296134006323146,
"grad_norm": 0.5390625,
"learning_rate": 3.5381543921916594e-06,
"loss": 0.9006,
"mean_token_accuracy": 0.7624947860836983,
"num_tokens": 394596882.0,
"step": 4190
},
{
"entropy": 1.3401482120156287,
"epoch": 0.9318320483665206,
"grad_norm": 0.53125,
"learning_rate": 3.4272404614019524e-06,
"loss": 0.9282,
"mean_token_accuracy": 0.756990148127079,
"num_tokens": 395526506.0,
"step": 4200
},
{
"entropy": 1.4013796046376228,
"epoch": 0.9340506961007266,
"grad_norm": 0.54296875,
"learning_rate": 3.3163265306122454e-06,
"loss": 0.9932,
"mean_token_accuracy": 0.7460516929626465,
"num_tokens": 396460730.0,
"step": 4210
},
{
"entropy": 1.3630273953080176,
"epoch": 0.9362693438349327,
"grad_norm": 0.5390625,
"learning_rate": 3.2054125998225384e-06,
"loss": 0.9513,
"mean_token_accuracy": 0.7526130631566048,
"num_tokens": 397410429.0,
"step": 4220
},
{
"entropy": 1.2898547686636448,
"epoch": 0.9384879915691386,
"grad_norm": 0.5078125,
"learning_rate": 3.094498669032831e-06,
"loss": 0.9228,
"mean_token_accuracy": 0.7592225328087807,
"num_tokens": 398358920.0,
"step": 4230
},
{
"entropy": 1.3584180302917956,
"epoch": 0.9407066393033446,
"grad_norm": 0.62890625,
"learning_rate": 2.9835847382431235e-06,
"loss": 0.9683,
"mean_token_accuracy": 0.7511270597577095,
"num_tokens": 399332691.0,
"step": 4240
},
{
"entropy": 1.3851253606379033,
"epoch": 0.9429252870375506,
"grad_norm": 0.55078125,
"learning_rate": 2.872670807453416e-06,
"loss": 0.974,
"mean_token_accuracy": 0.7480023667216301,
"num_tokens": 400266541.0,
"step": 4250
},
{
"entropy": 1.3538463555276394,
"epoch": 0.9451439347717566,
"grad_norm": 0.52734375,
"learning_rate": 2.761756876663709e-06,
"loss": 0.9443,
"mean_token_accuracy": 0.7554497793316841,
"num_tokens": 401201822.0,
"step": 4260
},
{
"entropy": 1.3774816602468491,
"epoch": 0.9473625825059626,
"grad_norm": 0.5390625,
"learning_rate": 2.650842945874002e-06,
"loss": 0.9843,
"mean_token_accuracy": 0.746273136138916,
"num_tokens": 402152611.0,
"step": 4270
},
{
"entropy": 1.3171171061694622,
"epoch": 0.9495812302401686,
"grad_norm": 0.5234375,
"learning_rate": 2.539929015084295e-06,
"loss": 0.9111,
"mean_token_accuracy": 0.7632594168186188,
"num_tokens": 403118617.0,
"step": 4280
},
{
"entropy": 1.3414073579013348,
"epoch": 0.9517998779743746,
"grad_norm": 0.55078125,
"learning_rate": 2.4290150842945875e-06,
"loss": 0.9402,
"mean_token_accuracy": 0.7527715168893337,
"num_tokens": 404056789.0,
"step": 4290
},
{
"entropy": 1.3378793716430664,
"epoch": 0.9540185257085806,
"grad_norm": 0.546875,
"learning_rate": 2.3181011535048805e-06,
"loss": 0.9353,
"mean_token_accuracy": 0.7563605636358262,
"num_tokens": 405004371.0,
"step": 4300
},
{
"entropy": 1.372169415652752,
"epoch": 0.9562371734427866,
"grad_norm": 0.51953125,
"learning_rate": 2.207187222715173e-06,
"loss": 0.9436,
"mean_token_accuracy": 0.7551277004182338,
"num_tokens": 405922059.0,
"step": 4310
},
{
"entropy": 1.3533624187111855,
"epoch": 0.9584558211769926,
"grad_norm": 0.5390625,
"learning_rate": 2.096273291925466e-06,
"loss": 0.9497,
"mean_token_accuracy": 0.7533909723162651,
"num_tokens": 406838792.0,
"step": 4320
},
{
"entropy": 1.3719367325305938,
"epoch": 0.9606744689111987,
"grad_norm": 0.51953125,
"learning_rate": 1.9853593611357586e-06,
"loss": 0.9919,
"mean_token_accuracy": 0.7434499144554139,
"num_tokens": 407786498.0,
"step": 4330
},
{
"entropy": 1.3563473880290986,
"epoch": 0.9628931166454047,
"grad_norm": 0.5234375,
"learning_rate": 1.8744454303460516e-06,
"loss": 0.9401,
"mean_token_accuracy": 0.7530663572251797,
"num_tokens": 408736948.0,
"step": 4340
},
{
"entropy": 1.347538560628891,
"epoch": 0.9651117643796107,
"grad_norm": 0.53125,
"learning_rate": 1.7635314995563443e-06,
"loss": 0.933,
"mean_token_accuracy": 0.7575333446264267,
"num_tokens": 409664542.0,
"step": 4350
},
{
"entropy": 1.3749746069312097,
"epoch": 0.9673304121138167,
"grad_norm": 0.51953125,
"learning_rate": 1.6526175687666373e-06,
"loss": 0.9698,
"mean_token_accuracy": 0.7511622585356236,
"num_tokens": 410602122.0,
"step": 4360
},
{
"entropy": 1.3442941211163997,
"epoch": 0.9695490598480226,
"grad_norm": 0.52734375,
"learning_rate": 1.54170363797693e-06,
"loss": 0.9572,
"mean_token_accuracy": 0.7501766428351402,
"num_tokens": 411528742.0,
"step": 4370
},
{
"entropy": 1.3314830370247364,
"epoch": 0.9717677075822286,
"grad_norm": 0.52734375,
"learning_rate": 1.4307897071872228e-06,
"loss": 0.9528,
"mean_token_accuracy": 0.7543077766895294,
"num_tokens": 412496866.0,
"step": 4380
},
{
"entropy": 1.362931652367115,
"epoch": 0.9739863553164346,
"grad_norm": 0.5390625,
"learning_rate": 1.3198757763975156e-06,
"loss": 0.9539,
"mean_token_accuracy": 0.7541424036026001,
"num_tokens": 413426074.0,
"step": 4390
},
{
"entropy": 1.314641258120537,
"epoch": 0.9762050030506406,
"grad_norm": 0.53515625,
"learning_rate": 1.2089618456078084e-06,
"loss": 0.9326,
"mean_token_accuracy": 0.7566425338387489,
"num_tokens": 414366936.0,
"step": 4400
},
{
"entropy": 1.3944153673946857,
"epoch": 0.9784236507848466,
"grad_norm": 0.546875,
"learning_rate": 1.0980479148181013e-06,
"loss": 0.9887,
"mean_token_accuracy": 0.7444652430713177,
"num_tokens": 415305379.0,
"step": 4410
},
{
"entropy": 1.3453952841460706,
"epoch": 0.9806422985190526,
"grad_norm": 0.49609375,
"learning_rate": 9.871339840283939e-07,
"loss": 0.9563,
"mean_token_accuracy": 0.7523396387696266,
"num_tokens": 416266511.0,
"step": 4420
},
{
"entropy": 1.3440303832292557,
"epoch": 0.9828609462532586,
"grad_norm": 0.53515625,
"learning_rate": 8.762200532386869e-07,
"loss": 0.9456,
"mean_token_accuracy": 0.7547272637486457,
"num_tokens": 417231789.0,
"step": 4430
},
{
"entropy": 1.367350959777832,
"epoch": 0.9850795939874646,
"grad_norm": 0.5546875,
"learning_rate": 7.653061224489796e-07,
"loss": 0.9693,
"mean_token_accuracy": 0.7508193962275982,
"num_tokens": 418194063.0,
"step": 4440
},
{
"entropy": 1.3651387616991997,
"epoch": 0.9872982417216707,
"grad_norm": 0.5703125,
"learning_rate": 6.543921916592724e-07,
"loss": 0.9463,
"mean_token_accuracy": 0.7529610082507133,
"num_tokens": 419114425.0,
"step": 4450
},
{
"entropy": 1.347716721892357,
"epoch": 0.9895168894558767,
"grad_norm": 0.490234375,
"learning_rate": 5.434782608695653e-07,
"loss": 0.9493,
"mean_token_accuracy": 0.7544530227780342,
"num_tokens": 420062804.0,
"step": 4460
},
{
"entropy": 1.323940635472536,
"epoch": 0.9917355371900827,
"grad_norm": 0.5234375,
"learning_rate": 4.3256433007985804e-07,
"loss": 0.9176,
"mean_token_accuracy": 0.7596350736916065,
"num_tokens": 420985488.0,
"step": 4470
},
{
"entropy": 1.3813497826457024,
"epoch": 0.9939541849242887,
"grad_norm": 0.54296875,
"learning_rate": 3.2165039929015086e-07,
"loss": 0.9854,
"mean_token_accuracy": 0.7467473462224007,
"num_tokens": 421920339.0,
"step": 4480
},
{
"entropy": 1.3907534167170525,
"epoch": 0.9961728326584947,
"grad_norm": 0.52734375,
"learning_rate": 2.1073646850044365e-07,
"loss": 0.9956,
"mean_token_accuracy": 0.7431350871920586,
"num_tokens": 422875342.0,
"step": 4490
},
{
"entropy": 1.4102192774415017,
"epoch": 0.9983914803927006,
"grad_norm": 0.546875,
"learning_rate": 9.982253771073646e-08,
"loss": 1.0,
"mean_token_accuracy": 0.7438700333237648,
"num_tokens": 423810727.0,
"step": 4500
}
],
"logging_steps": 10,
"max_steps": 4508,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.665075636310376e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}