{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4508, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.496292558312416, "epoch": 0.0022186477342060014, "grad_norm": 0.6640625, "learning_rate": 4.990017746228927e-05, "loss": 1.0869, "mean_token_accuracy": 0.725723172724247, "num_tokens": 925119.0, "step": 10 }, { "entropy": 1.49339357316494, "epoch": 0.004437295468412003, "grad_norm": 0.796875, "learning_rate": 4.978926353149956e-05, "loss": 1.0992, "mean_token_accuracy": 0.72515804246068, "num_tokens": 1846450.0, "step": 20 }, { "entropy": 1.4267458260059356, "epoch": 0.006655943202618004, "grad_norm": 0.57421875, "learning_rate": 4.967834960070985e-05, "loss": 1.0639, "mean_token_accuracy": 0.7327800326049327, "num_tokens": 2786157.0, "step": 30 }, { "entropy": 1.409786120057106, "epoch": 0.008874590936824005, "grad_norm": 0.57421875, "learning_rate": 4.9567435669920145e-05, "loss": 1.0149, "mean_token_accuracy": 0.7418569244444371, "num_tokens": 3732821.0, "step": 40 }, { "entropy": 1.412144237756729, "epoch": 0.011093238671030008, "grad_norm": 0.5703125, "learning_rate": 4.945652173913044e-05, "loss": 1.0482, "mean_token_accuracy": 0.7331000074744225, "num_tokens": 4691429.0, "step": 50 }, { "entropy": 1.434348814189434, "epoch": 0.013311886405236008, "grad_norm": 0.55859375, "learning_rate": 4.934560780834073e-05, "loss": 1.0588, "mean_token_accuracy": 0.7321462295949459, "num_tokens": 5627667.0, "step": 60 }, { "entropy": 1.4207842454314232, "epoch": 0.01553053413944201, "grad_norm": 0.58203125, "learning_rate": 4.923469387755102e-05, "loss": 1.0427, "mean_token_accuracy": 0.7373643882572651, "num_tokens": 6554855.0, "step": 70 }, { "entropy": 1.403839322924614, "epoch": 0.01774918187364801, "grad_norm": 0.5703125, "learning_rate": 4.9123779946761314e-05, "loss": 1.0559, "mean_token_accuracy": 0.7321439690887928, "num_tokens": 7519970.0, "step": 80 }, { "entropy": 1.4707388430833817, "epoch": 0.019967829607854013, "grad_norm": 0.5625, "learning_rate": 4.9012866015971606e-05, "loss": 1.0769, "mean_token_accuracy": 0.7276073284447193, "num_tokens": 8448434.0, "step": 90 }, { "entropy": 1.4391525745391847, "epoch": 0.022186477342060015, "grad_norm": 1.671875, "learning_rate": 4.8901952085181905e-05, "loss": 1.0773, "mean_token_accuracy": 0.7285938866436481, "num_tokens": 9372876.0, "step": 100 }, { "entropy": 1.4438337981700897, "epoch": 0.024405125076266018, "grad_norm": 0.515625, "learning_rate": 4.87910381543922e-05, "loss": 1.0468, "mean_token_accuracy": 0.7326018497347832, "num_tokens": 10316544.0, "step": 110 }, { "entropy": 1.4177550919353963, "epoch": 0.026623772810472016, "grad_norm": 0.5859375, "learning_rate": 4.868012422360249e-05, "loss": 1.0585, "mean_token_accuracy": 0.7324540324509143, "num_tokens": 11247187.0, "step": 120 }, { "entropy": 1.471057978272438, "epoch": 0.02884242054467802, "grad_norm": 0.6171875, "learning_rate": 4.856921029281278e-05, "loss": 1.0761, "mean_token_accuracy": 0.7275320313870907, "num_tokens": 12173265.0, "step": 130 }, { "entropy": 1.3705433815717698, "epoch": 0.03106106827888402, "grad_norm": 0.57421875, "learning_rate": 4.845829636202307e-05, "loss": 0.9901, "mean_token_accuracy": 0.74607959613204, "num_tokens": 13130818.0, "step": 140 }, { "entropy": 1.3908753886818885, "epoch": 0.03327971601309002, "grad_norm": 0.58203125, "learning_rate": 4.8347382431233365e-05, "loss": 1.0128, "mean_token_accuracy": 0.7397120602428913, "num_tokens": 14082298.0, "step": 150 }, { "entropy": 1.3889487609267235, "epoch": 0.03549836374729602, "grad_norm": 0.58203125, "learning_rate": 4.823646850044366e-05, "loss": 1.0346, "mean_token_accuracy": 0.7391657814383507, "num_tokens": 15020735.0, "step": 160 }, { "entropy": 1.3904974788427353, "epoch": 0.03771701148150203, "grad_norm": 0.6484375, "learning_rate": 4.812555456965395e-05, "loss": 1.0208, "mean_token_accuracy": 0.7383547216653824, "num_tokens": 15966964.0, "step": 170 }, { "entropy": 1.4144786164164542, "epoch": 0.039935659215708026, "grad_norm": 0.55078125, "learning_rate": 4.801464063886424e-05, "loss": 1.0335, "mean_token_accuracy": 0.7372442841529846, "num_tokens": 16859851.0, "step": 180 }, { "entropy": 1.4213875874876976, "epoch": 0.042154306949914025, "grad_norm": 0.609375, "learning_rate": 4.7903726708074534e-05, "loss": 1.03, "mean_token_accuracy": 0.735869013518095, "num_tokens": 17767909.0, "step": 190 }, { "entropy": 1.4285096868872642, "epoch": 0.04437295468412003, "grad_norm": 0.54296875, "learning_rate": 4.7792812777284826e-05, "loss": 1.0358, "mean_token_accuracy": 0.7385697923600674, "num_tokens": 18734689.0, "step": 200 }, { "entropy": 1.4109881177544594, "epoch": 0.04659160241832603, "grad_norm": 0.5703125, "learning_rate": 4.768189884649512e-05, "loss": 1.0404, "mean_token_accuracy": 0.7372543781995773, "num_tokens": 19661481.0, "step": 210 }, { "entropy": 1.3780835449695588, "epoch": 0.048810250152532035, "grad_norm": 0.546875, "learning_rate": 4.757098491570541e-05, "loss": 1.0115, "mean_token_accuracy": 0.7417904600501061, "num_tokens": 20586364.0, "step": 220 }, { "entropy": 1.3467031195759773, "epoch": 0.051028897886738034, "grad_norm": 0.52734375, "learning_rate": 4.74600709849157e-05, "loss": 0.9833, "mean_token_accuracy": 0.7490709364414215, "num_tokens": 21547442.0, "step": 230 }, { "entropy": 1.424035993218422, "epoch": 0.05324754562094403, "grad_norm": 0.57421875, "learning_rate": 4.7349157054126e-05, "loss": 1.0266, "mean_token_accuracy": 0.7366823427379131, "num_tokens": 22502716.0, "step": 240 }, { "entropy": 1.409425649046898, "epoch": 0.05546619335515004, "grad_norm": 0.6640625, "learning_rate": 4.7238243123336293e-05, "loss": 1.0304, "mean_token_accuracy": 0.7378800459206104, "num_tokens": 23426781.0, "step": 250 }, { "entropy": 1.4083105452358722, "epoch": 0.05768484108935604, "grad_norm": 0.6015625, "learning_rate": 4.7127329192546586e-05, "loss": 1.0399, "mean_token_accuracy": 0.7385689981281758, "num_tokens": 24379852.0, "step": 260 }, { "entropy": 1.3957985565066338, "epoch": 0.059903488823562036, "grad_norm": 0.59765625, "learning_rate": 4.701641526175688e-05, "loss": 1.0115, "mean_token_accuracy": 0.7397762380540371, "num_tokens": 25322772.0, "step": 270 }, { "entropy": 1.376178003847599, "epoch": 0.06212213655776804, "grad_norm": 0.56640625, "learning_rate": 4.690550133096717e-05, "loss": 1.0075, "mean_token_accuracy": 0.7439929395914078, "num_tokens": 26287748.0, "step": 280 }, { "entropy": 1.3666997633874416, "epoch": 0.06434078429197404, "grad_norm": 0.58203125, "learning_rate": 4.679458740017746e-05, "loss": 0.9893, "mean_token_accuracy": 0.7455881536006927, "num_tokens": 27239049.0, "step": 290 }, { "entropy": 1.3740440711379052, "epoch": 0.06655943202618005, "grad_norm": 0.578125, "learning_rate": 4.6683673469387754e-05, "loss": 1.0038, "mean_token_accuracy": 0.7433481432497502, "num_tokens": 28209460.0, "step": 300 }, { "entropy": 1.3969299167394638, "epoch": 0.06877807976038605, "grad_norm": 0.546875, "learning_rate": 4.6572759538598046e-05, "loss": 1.0058, "mean_token_accuracy": 0.7407899357378482, "num_tokens": 29152647.0, "step": 310 }, { "entropy": 1.4039423167705536, "epoch": 0.07099672749459204, "grad_norm": 0.5625, "learning_rate": 4.646184560780834e-05, "loss": 1.0078, "mean_token_accuracy": 0.7412165470421315, "num_tokens": 30065474.0, "step": 320 }, { "entropy": 1.3215527072548867, "epoch": 0.07321537522879805, "grad_norm": 0.5625, "learning_rate": 4.635093167701863e-05, "loss": 0.9572, "mean_token_accuracy": 0.752603680640459, "num_tokens": 31019135.0, "step": 330 }, { "entropy": 1.4225746989250183, "epoch": 0.07543402296300405, "grad_norm": 0.6015625, "learning_rate": 4.624001774622893e-05, "loss": 1.0285, "mean_token_accuracy": 0.7391470916569233, "num_tokens": 31939707.0, "step": 340 }, { "entropy": 1.3544291421771049, "epoch": 0.07765267069721005, "grad_norm": 0.546875, "learning_rate": 4.612910381543922e-05, "loss": 0.968, "mean_token_accuracy": 0.7495346136391163, "num_tokens": 32871530.0, "step": 350 }, { "entropy": 1.3839760735630988, "epoch": 0.07987131843141605, "grad_norm": 0.59375, "learning_rate": 4.6018189884649514e-05, "loss": 0.9895, "mean_token_accuracy": 0.7453494131565094, "num_tokens": 33813137.0, "step": 360 }, { "entropy": 1.414746204763651, "epoch": 0.08208996616562206, "grad_norm": 0.609375, "learning_rate": 4.5907275953859806e-05, "loss": 1.025, "mean_token_accuracy": 0.7388280339539051, "num_tokens": 34771343.0, "step": 370 }, { "entropy": 1.3732567429542542, "epoch": 0.08430861389982805, "grad_norm": 0.55859375, "learning_rate": 4.57963620230701e-05, "loss": 1.0058, "mean_token_accuracy": 0.7429468773305417, "num_tokens": 35713295.0, "step": 380 }, { "entropy": 1.3798431143164636, "epoch": 0.08652726163403406, "grad_norm": 0.62109375, "learning_rate": 4.568544809228039e-05, "loss": 1.0094, "mean_token_accuracy": 0.7400036215782165, "num_tokens": 36651252.0, "step": 390 }, { "entropy": 1.3924534171819687, "epoch": 0.08874590936824006, "grad_norm": 0.5625, "learning_rate": 4.557453416149068e-05, "loss": 0.9963, "mean_token_accuracy": 0.7455349668860436, "num_tokens": 37592125.0, "step": 400 }, { "entropy": 1.4052727609872817, "epoch": 0.09096455710244605, "grad_norm": 0.55859375, "learning_rate": 4.5463620230700974e-05, "loss": 1.0201, "mean_token_accuracy": 0.7395130477845668, "num_tokens": 38534139.0, "step": 410 }, { "entropy": 1.418926975131035, "epoch": 0.09318320483665206, "grad_norm": 0.59765625, "learning_rate": 4.5352706299911266e-05, "loss": 1.0096, "mean_token_accuracy": 0.7400068089365959, "num_tokens": 39478516.0, "step": 420 }, { "entropy": 1.3901023603975773, "epoch": 0.09540185257085806, "grad_norm": 0.54296875, "learning_rate": 4.5241792369121565e-05, "loss": 1.0051, "mean_token_accuracy": 0.7408906109631062, "num_tokens": 40443771.0, "step": 430 }, { "entropy": 1.3361934393644332, "epoch": 0.09762050030506407, "grad_norm": 0.578125, "learning_rate": 4.513087843833186e-05, "loss": 0.9356, "mean_token_accuracy": 0.7576181195676327, "num_tokens": 41385687.0, "step": 440 }, { "entropy": 1.4129181623458862, "epoch": 0.09983914803927006, "grad_norm": 0.578125, "learning_rate": 4.501996450754215e-05, "loss": 1.0427, "mean_token_accuracy": 0.7341138951480388, "num_tokens": 42316542.0, "step": 450 }, { "entropy": 1.3746674314141274, "epoch": 0.10205779577347607, "grad_norm": 0.57421875, "learning_rate": 4.490905057675244e-05, "loss": 1.0078, "mean_token_accuracy": 0.7429340846836567, "num_tokens": 43252482.0, "step": 460 }, { "entropy": 1.3764732837677003, "epoch": 0.10427644350768207, "grad_norm": 0.51953125, "learning_rate": 4.4798136645962734e-05, "loss": 1.0107, "mean_token_accuracy": 0.7416411705315114, "num_tokens": 44226551.0, "step": 470 }, { "entropy": 1.4007945582270622, "epoch": 0.10649509124188807, "grad_norm": 0.5703125, "learning_rate": 4.4687222715173026e-05, "loss": 1.0161, "mean_token_accuracy": 0.7399431690573692, "num_tokens": 45171661.0, "step": 480 }, { "entropy": 1.400729776918888, "epoch": 0.10871373897609407, "grad_norm": 0.58203125, "learning_rate": 4.457630878438332e-05, "loss": 1.01, "mean_token_accuracy": 0.7416381858289242, "num_tokens": 46102823.0, "step": 490 }, { "entropy": 1.3740653365850448, "epoch": 0.11093238671030008, "grad_norm": 0.640625, "learning_rate": 4.446539485359361e-05, "loss": 0.9948, "mean_token_accuracy": 0.7436435185372829, "num_tokens": 47038182.0, "step": 500 }, { "entropy": 1.385279569029808, "epoch": 0.11315103444450607, "grad_norm": 0.53125, "learning_rate": 4.43544809228039e-05, "loss": 0.9812, "mean_token_accuracy": 0.7462167225778102, "num_tokens": 47978197.0, "step": 510 }, { "entropy": 1.355229352414608, "epoch": 0.11536968217871207, "grad_norm": 0.498046875, "learning_rate": 4.42435669920142e-05, "loss": 0.9756, "mean_token_accuracy": 0.748258039355278, "num_tokens": 48929164.0, "step": 520 }, { "entropy": 1.340594267845154, "epoch": 0.11758832991291808, "grad_norm": 0.55078125, "learning_rate": 4.4132653061224493e-05, "loss": 0.9557, "mean_token_accuracy": 0.7537197224795819, "num_tokens": 49872840.0, "step": 530 }, { "entropy": 1.3749348096549512, "epoch": 0.11980697764712407, "grad_norm": 0.53125, "learning_rate": 4.4021739130434786e-05, "loss": 0.9964, "mean_token_accuracy": 0.7432300426065922, "num_tokens": 50820730.0, "step": 540 }, { "entropy": 1.3660012029111386, "epoch": 0.12202562538133008, "grad_norm": 0.6015625, "learning_rate": 4.391082519964508e-05, "loss": 0.9864, "mean_token_accuracy": 0.7461238898336887, "num_tokens": 51758109.0, "step": 550 }, { "entropy": 1.3850644059479236, "epoch": 0.12424427311553608, "grad_norm": 0.546875, "learning_rate": 4.379991126885537e-05, "loss": 0.9998, "mean_token_accuracy": 0.7451605953276157, "num_tokens": 52706955.0, "step": 560 }, { "entropy": 1.3438964366912842, "epoch": 0.1264629208497421, "grad_norm": 0.5703125, "learning_rate": 4.368899733806566e-05, "loss": 0.9759, "mean_token_accuracy": 0.7506989397108554, "num_tokens": 53654927.0, "step": 570 }, { "entropy": 1.3823294579982757, "epoch": 0.12868156858394808, "grad_norm": 0.55078125, "learning_rate": 4.3578083407275954e-05, "loss": 1.002, "mean_token_accuracy": 0.7399756357073783, "num_tokens": 54607725.0, "step": 580 }, { "entropy": 1.3431190609931947, "epoch": 0.13090021631815407, "grad_norm": 0.5546875, "learning_rate": 4.3467169476486246e-05, "loss": 0.9922, "mean_token_accuracy": 0.7472275733947754, "num_tokens": 55544472.0, "step": 590 }, { "entropy": 1.386824431270361, "epoch": 0.1331188640523601, "grad_norm": 0.5625, "learning_rate": 4.335625554569654e-05, "loss": 1.0184, "mean_token_accuracy": 0.7395146794617176, "num_tokens": 56492504.0, "step": 600 }, { "entropy": 1.3801176637411117, "epoch": 0.13533751178656608, "grad_norm": 0.58203125, "learning_rate": 4.324534161490684e-05, "loss": 1.0153, "mean_token_accuracy": 0.740266764163971, "num_tokens": 57446818.0, "step": 610 }, { "entropy": 1.3780507385730743, "epoch": 0.1375561595207721, "grad_norm": 0.5625, "learning_rate": 4.313442768411713e-05, "loss": 1.0282, "mean_token_accuracy": 0.7379110969603062, "num_tokens": 58388320.0, "step": 620 }, { "entropy": 1.3641229078173638, "epoch": 0.1397748072549781, "grad_norm": 0.546875, "learning_rate": 4.302351375332742e-05, "loss": 0.9693, "mean_token_accuracy": 0.7492371432483196, "num_tokens": 59335625.0, "step": 630 }, { "entropy": 1.3909922763705254, "epoch": 0.1419934549891841, "grad_norm": 0.55078125, "learning_rate": 4.2912599822537714e-05, "loss": 1.0178, "mean_token_accuracy": 0.7371242880821228, "num_tokens": 60271739.0, "step": 640 }, { "entropy": 1.358926948904991, "epoch": 0.1442121027233901, "grad_norm": 0.5390625, "learning_rate": 4.2801685891748006e-05, "loss": 0.9545, "mean_token_accuracy": 0.7560696460306644, "num_tokens": 61229087.0, "step": 650 }, { "entropy": 1.3929373525083064, "epoch": 0.1464307504575961, "grad_norm": 0.55859375, "learning_rate": 4.26907719609583e-05, "loss": 1.0238, "mean_token_accuracy": 0.7393973417580127, "num_tokens": 62178123.0, "step": 660 }, { "entropy": 1.327741453051567, "epoch": 0.1486493981918021, "grad_norm": 0.57421875, "learning_rate": 4.257985803016859e-05, "loss": 0.9456, "mean_token_accuracy": 0.7535205587744713, "num_tokens": 63126423.0, "step": 670 }, { "entropy": 1.3679818481206893, "epoch": 0.1508680459260081, "grad_norm": 0.54296875, "learning_rate": 4.246894409937888e-05, "loss": 0.9728, "mean_token_accuracy": 0.7467674180865288, "num_tokens": 64079693.0, "step": 680 }, { "entropy": 1.381383018195629, "epoch": 0.1530866936602141, "grad_norm": 0.5703125, "learning_rate": 4.2358030168589174e-05, "loss": 1.0088, "mean_token_accuracy": 0.7418314486742019, "num_tokens": 65018338.0, "step": 690 }, { "entropy": 1.3862957283854485, "epoch": 0.1553053413944201, "grad_norm": 0.6171875, "learning_rate": 4.224711623779947e-05, "loss": 1.0144, "mean_token_accuracy": 0.7403538078069687, "num_tokens": 65952052.0, "step": 700 }, { "entropy": 1.3384159475564956, "epoch": 0.1575239891286261, "grad_norm": 0.56640625, "learning_rate": 4.2136202307009765e-05, "loss": 0.9922, "mean_token_accuracy": 0.7461440391838551, "num_tokens": 66921819.0, "step": 710 }, { "entropy": 1.3640660651028156, "epoch": 0.1597426368628321, "grad_norm": 0.546875, "learning_rate": 4.202528837622006e-05, "loss": 0.9682, "mean_token_accuracy": 0.7501711919903755, "num_tokens": 67876968.0, "step": 720 }, { "entropy": 1.4149701073765755, "epoch": 0.1619612845970381, "grad_norm": 0.71875, "learning_rate": 4.191437444543035e-05, "loss": 0.9993, "mean_token_accuracy": 0.7446250684559346, "num_tokens": 68806954.0, "step": 730 }, { "entropy": 1.3790323272347451, "epoch": 0.16417993233124412, "grad_norm": 0.50390625, "learning_rate": 4.180346051464064e-05, "loss": 1.0043, "mean_token_accuracy": 0.7418724097311497, "num_tokens": 69713797.0, "step": 740 }, { "entropy": 1.3732078664004803, "epoch": 0.1663985800654501, "grad_norm": 0.56640625, "learning_rate": 4.1692546583850934e-05, "loss": 0.9737, "mean_token_accuracy": 0.7472748421132565, "num_tokens": 70644616.0, "step": 750 }, { "entropy": 1.3796279937028886, "epoch": 0.1686172277996561, "grad_norm": 0.5703125, "learning_rate": 4.1581632653061226e-05, "loss": 0.9888, "mean_token_accuracy": 0.7464235134422779, "num_tokens": 71614683.0, "step": 760 }, { "entropy": 1.3841315507888794, "epoch": 0.17083587553386212, "grad_norm": 0.5546875, "learning_rate": 4.147071872227152e-05, "loss": 1.0505, "mean_token_accuracy": 0.7321801386773586, "num_tokens": 72565413.0, "step": 770 }, { "entropy": 1.3817786656320095, "epoch": 0.1730545232680681, "grad_norm": 0.55078125, "learning_rate": 4.135980479148181e-05, "loss": 0.9787, "mean_token_accuracy": 0.7469952210783959, "num_tokens": 73526877.0, "step": 780 }, { "entropy": 1.3620466977357863, "epoch": 0.1752731710022741, "grad_norm": 0.5625, "learning_rate": 4.124889086069211e-05, "loss": 0.9949, "mean_token_accuracy": 0.7441352687776088, "num_tokens": 74473784.0, "step": 790 }, { "entropy": 1.3590239346027375, "epoch": 0.17749181873648012, "grad_norm": 0.546875, "learning_rate": 4.11379769299024e-05, "loss": 0.9791, "mean_token_accuracy": 0.748184335231781, "num_tokens": 75417836.0, "step": 800 }, { "entropy": 1.3708465218544006, "epoch": 0.17971046647068611, "grad_norm": 0.55859375, "learning_rate": 4.1027062999112693e-05, "loss": 0.9631, "mean_token_accuracy": 0.7510820157825947, "num_tokens": 76354490.0, "step": 810 }, { "entropy": 1.341279798746109, "epoch": 0.1819291142048921, "grad_norm": 0.57421875, "learning_rate": 4.0916149068322986e-05, "loss": 0.9668, "mean_token_accuracy": 0.7506601929664611, "num_tokens": 77302002.0, "step": 820 }, { "entropy": 1.3692625604569912, "epoch": 0.18414776193909813, "grad_norm": 0.5546875, "learning_rate": 4.080523513753328e-05, "loss": 0.9652, "mean_token_accuracy": 0.7492272712290287, "num_tokens": 78250737.0, "step": 830 }, { "entropy": 1.3755904287099838, "epoch": 0.18636640967330412, "grad_norm": 0.53515625, "learning_rate": 4.069432120674357e-05, "loss": 0.9899, "mean_token_accuracy": 0.7458423741161824, "num_tokens": 79188862.0, "step": 840 }, { "entropy": 1.4084200143814087, "epoch": 0.1885850574075101, "grad_norm": 0.56640625, "learning_rate": 4.058340727595386e-05, "loss": 1.0189, "mean_token_accuracy": 0.739814518392086, "num_tokens": 80106826.0, "step": 850 }, { "entropy": 1.3865490198135375, "epoch": 0.19080370514171613, "grad_norm": 0.52734375, "learning_rate": 4.0472493345164154e-05, "loss": 0.9995, "mean_token_accuracy": 0.7435623817145824, "num_tokens": 81042920.0, "step": 860 }, { "entropy": 1.4000697553157806, "epoch": 0.19302235287592212, "grad_norm": 0.5625, "learning_rate": 4.0361579414374446e-05, "loss": 0.9822, "mean_token_accuracy": 0.7476979814469814, "num_tokens": 81961887.0, "step": 870 }, { "entropy": 1.4024762332439422, "epoch": 0.19524100061012814, "grad_norm": 0.59765625, "learning_rate": 4.025066548358474e-05, "loss": 1.0031, "mean_token_accuracy": 0.7444592162966728, "num_tokens": 82897270.0, "step": 880 }, { "entropy": 1.3742952913045883, "epoch": 0.19745964834433413, "grad_norm": 0.52734375, "learning_rate": 4.013975155279504e-05, "loss": 0.9859, "mean_token_accuracy": 0.7470481149852276, "num_tokens": 83862674.0, "step": 890 }, { "entropy": 1.3406959801912308, "epoch": 0.19967829607854012, "grad_norm": 0.55078125, "learning_rate": 4.002883762200533e-05, "loss": 0.9555, "mean_token_accuracy": 0.7523404717445373, "num_tokens": 84815713.0, "step": 900 }, { "entropy": 1.4000651821494103, "epoch": 0.20189694381274614, "grad_norm": 0.56640625, "learning_rate": 3.991792369121562e-05, "loss": 0.9945, "mean_token_accuracy": 0.7464812904596329, "num_tokens": 85748577.0, "step": 910 }, { "entropy": 1.3798070877790451, "epoch": 0.20411559154695214, "grad_norm": 0.55078125, "learning_rate": 3.9807009760425914e-05, "loss": 1.0066, "mean_token_accuracy": 0.7432169638574123, "num_tokens": 86686607.0, "step": 920 }, { "entropy": 1.3416421085596084, "epoch": 0.20633423928115813, "grad_norm": 0.828125, "learning_rate": 3.9696095829636206e-05, "loss": 0.9644, "mean_token_accuracy": 0.7486262872815133, "num_tokens": 87638726.0, "step": 930 }, { "entropy": 1.3670515537261962, "epoch": 0.20855288701536415, "grad_norm": 0.498046875, "learning_rate": 3.95851818988465e-05, "loss": 0.9849, "mean_token_accuracy": 0.7475749678909779, "num_tokens": 88601047.0, "step": 940 }, { "entropy": 1.379362154006958, "epoch": 0.21077153474957014, "grad_norm": 0.57421875, "learning_rate": 3.947426796805679e-05, "loss": 0.9476, "mean_token_accuracy": 0.7543636500835419, "num_tokens": 89520415.0, "step": 950 }, { "entropy": 1.371240857243538, "epoch": 0.21299018248377613, "grad_norm": 0.5625, "learning_rate": 3.936335403726708e-05, "loss": 0.9664, "mean_token_accuracy": 0.7490607380867005, "num_tokens": 90455065.0, "step": 960 }, { "entropy": 1.340223667025566, "epoch": 0.21520883021798215, "grad_norm": 0.55078125, "learning_rate": 3.9252440106477374e-05, "loss": 0.9414, "mean_token_accuracy": 0.75420788154006, "num_tokens": 91408476.0, "step": 970 }, { "entropy": 1.358856461942196, "epoch": 0.21742747795218814, "grad_norm": 0.5625, "learning_rate": 3.914152617568767e-05, "loss": 0.9898, "mean_token_accuracy": 0.748216237872839, "num_tokens": 92379704.0, "step": 980 }, { "entropy": 1.3565750516951085, "epoch": 0.21964612568639413, "grad_norm": 0.51953125, "learning_rate": 3.9030612244897965e-05, "loss": 0.947, "mean_token_accuracy": 0.7539278566837311, "num_tokens": 93330334.0, "step": 990 }, { "entropy": 1.3714484706521035, "epoch": 0.22186477342060015, "grad_norm": 0.55859375, "learning_rate": 3.891969831410826e-05, "loss": 0.9702, "mean_token_accuracy": 0.7506582617759705, "num_tokens": 94283931.0, "step": 1000 }, { "entropy": 1.4159359961748124, "epoch": 0.22408342115480614, "grad_norm": 0.58203125, "learning_rate": 3.880878438331855e-05, "loss": 1.0096, "mean_token_accuracy": 0.7425235278904438, "num_tokens": 95207171.0, "step": 1010 }, { "entropy": 1.3379707857966423, "epoch": 0.22630206888901214, "grad_norm": 0.54296875, "learning_rate": 3.869787045252884e-05, "loss": 0.9549, "mean_token_accuracy": 0.7548819564282894, "num_tokens": 96148468.0, "step": 1020 }, { "entropy": 1.36017052680254, "epoch": 0.22852071662321816, "grad_norm": 0.53125, "learning_rate": 3.8586956521739134e-05, "loss": 0.9901, "mean_token_accuracy": 0.7463661000132561, "num_tokens": 97119881.0, "step": 1030 }, { "entropy": 1.3724884755909443, "epoch": 0.23073936435742415, "grad_norm": 0.51171875, "learning_rate": 3.8476042590949426e-05, "loss": 0.9822, "mean_token_accuracy": 0.7475468330085278, "num_tokens": 98077875.0, "step": 1040 }, { "entropy": 1.3405052460730076, "epoch": 0.23295801209163014, "grad_norm": 0.51953125, "learning_rate": 3.836512866015972e-05, "loss": 0.9432, "mean_token_accuracy": 0.7541234731674195, "num_tokens": 99031833.0, "step": 1050 }, { "entropy": 1.3757181286811828, "epoch": 0.23517665982583616, "grad_norm": 0.54296875, "learning_rate": 3.825421472937001e-05, "loss": 0.9888, "mean_token_accuracy": 0.7472271144390106, "num_tokens": 99991709.0, "step": 1060 }, { "entropy": 1.3773247390985488, "epoch": 0.23739530756004215, "grad_norm": 0.55859375, "learning_rate": 3.814330079858031e-05, "loss": 0.9933, "mean_token_accuracy": 0.7446820683777332, "num_tokens": 100941262.0, "step": 1070 }, { "entropy": 1.3600140511989594, "epoch": 0.23961395529424814, "grad_norm": 0.54296875, "learning_rate": 3.80323868677906e-05, "loss": 0.9708, "mean_token_accuracy": 0.7481269456446171, "num_tokens": 101897106.0, "step": 1080 }, { "entropy": 1.4162090666592122, "epoch": 0.24183260302845416, "grad_norm": 0.5546875, "learning_rate": 3.7921472937000893e-05, "loss": 0.9589, "mean_token_accuracy": 0.7518798463046551, "num_tokens": 102824359.0, "step": 1090 }, { "entropy": 1.342430242151022, "epoch": 0.24405125076266015, "grad_norm": 0.546875, "learning_rate": 3.7810559006211186e-05, "loss": 0.9439, "mean_token_accuracy": 0.7558625318109989, "num_tokens": 103772965.0, "step": 1100 }, { "entropy": 1.3550659596920014, "epoch": 0.24626989849686615, "grad_norm": 0.578125, "learning_rate": 3.769964507542148e-05, "loss": 0.9645, "mean_token_accuracy": 0.7508764907717704, "num_tokens": 104691589.0, "step": 1110 }, { "entropy": 1.3936431795358657, "epoch": 0.24848854623107217, "grad_norm": 0.58203125, "learning_rate": 3.758873114463177e-05, "loss": 0.989, "mean_token_accuracy": 0.7461673654615879, "num_tokens": 105581821.0, "step": 1120 }, { "entropy": 1.38260547965765, "epoch": 0.2507071939652782, "grad_norm": 0.62109375, "learning_rate": 3.747781721384206e-05, "loss": 0.9923, "mean_token_accuracy": 0.7444244168698788, "num_tokens": 106548122.0, "step": 1130 }, { "entropy": 1.3805961057543754, "epoch": 0.2529258416994842, "grad_norm": 0.5703125, "learning_rate": 3.7366903283052354e-05, "loss": 0.988, "mean_token_accuracy": 0.746028533577919, "num_tokens": 107471780.0, "step": 1140 }, { "entropy": 1.3697818227112293, "epoch": 0.25514448943369017, "grad_norm": 0.5625, "learning_rate": 3.7255989352262646e-05, "loss": 0.9879, "mean_token_accuracy": 0.7449548006057739, "num_tokens": 108423848.0, "step": 1150 }, { "entropy": 1.3857608392834664, "epoch": 0.25736313716789616, "grad_norm": 0.5546875, "learning_rate": 3.714507542147294e-05, "loss": 0.9909, "mean_token_accuracy": 0.7463208839297295, "num_tokens": 109339085.0, "step": 1160 }, { "entropy": 1.4123259857296944, "epoch": 0.25958178490210215, "grad_norm": 0.52734375, "learning_rate": 3.703416149068323e-05, "loss": 1.0152, "mean_token_accuracy": 0.7402355149388313, "num_tokens": 110293198.0, "step": 1170 }, { "entropy": 1.3887243419885635, "epoch": 0.26180043263630814, "grad_norm": 0.5859375, "learning_rate": 3.692324755989352e-05, "loss": 1.0135, "mean_token_accuracy": 0.7396637931466102, "num_tokens": 111236146.0, "step": 1180 }, { "entropy": 1.3697876557707787, "epoch": 0.2640190803705142, "grad_norm": 0.54296875, "learning_rate": 3.6812333629103815e-05, "loss": 1.0056, "mean_token_accuracy": 0.7433505475521087, "num_tokens": 112180565.0, "step": 1190 }, { "entropy": 1.3680458456277846, "epoch": 0.2662377281047202, "grad_norm": 0.57421875, "learning_rate": 3.670141969831411e-05, "loss": 0.9455, "mean_token_accuracy": 0.7568574421107769, "num_tokens": 113127830.0, "step": 1200 }, { "entropy": 1.3552896961569787, "epoch": 0.2684563758389262, "grad_norm": 0.6015625, "learning_rate": 3.65905057675244e-05, "loss": 0.9666, "mean_token_accuracy": 0.7498848676681519, "num_tokens": 114067246.0, "step": 1210 }, { "entropy": 1.3530599243938923, "epoch": 0.27067502357313217, "grad_norm": 0.55859375, "learning_rate": 3.64795918367347e-05, "loss": 0.9768, "mean_token_accuracy": 0.7468112826347351, "num_tokens": 114993058.0, "step": 1220 }, { "entropy": 1.3884347334504128, "epoch": 0.27289367130733816, "grad_norm": 0.515625, "learning_rate": 3.636867790594499e-05, "loss": 1.0055, "mean_token_accuracy": 0.7407384052872658, "num_tokens": 115913621.0, "step": 1230 }, { "entropy": 1.3924200147390366, "epoch": 0.2751123190415442, "grad_norm": 0.56640625, "learning_rate": 3.625776397515528e-05, "loss": 1.0014, "mean_token_accuracy": 0.7461141526699067, "num_tokens": 116873252.0, "step": 1240 }, { "entropy": 1.3493129260838033, "epoch": 0.2773309667757502, "grad_norm": 0.54296875, "learning_rate": 3.6146850044365574e-05, "loss": 0.9607, "mean_token_accuracy": 0.7491537302732467, "num_tokens": 117827994.0, "step": 1250 }, { "entropy": 1.382804460823536, "epoch": 0.2795496145099562, "grad_norm": 0.5703125, "learning_rate": 3.6035936113575866e-05, "loss": 0.9795, "mean_token_accuracy": 0.7502268873155117, "num_tokens": 118765286.0, "step": 1260 }, { "entropy": 1.374519681930542, "epoch": 0.2817682622441622, "grad_norm": 0.546875, "learning_rate": 3.592502218278616e-05, "loss": 0.9933, "mean_token_accuracy": 0.7442196063697338, "num_tokens": 119728068.0, "step": 1270 }, { "entropy": 1.3972402699291706, "epoch": 0.2839869099783682, "grad_norm": 0.56640625, "learning_rate": 3.581410825199645e-05, "loss": 1.0037, "mean_token_accuracy": 0.7407265052199363, "num_tokens": 120663567.0, "step": 1280 }, { "entropy": 1.379422479122877, "epoch": 0.28620555771257417, "grad_norm": 0.54296875, "learning_rate": 3.570319432120674e-05, "loss": 0.9921, "mean_token_accuracy": 0.7444989711046219, "num_tokens": 121604187.0, "step": 1290 }, { "entropy": 1.3605211839079856, "epoch": 0.2884242054467802, "grad_norm": 0.55859375, "learning_rate": 3.5592280390417035e-05, "loss": 0.9541, "mean_token_accuracy": 0.7547078765928745, "num_tokens": 122549091.0, "step": 1300 }, { "entropy": 1.358756284415722, "epoch": 0.2906428531809862, "grad_norm": 0.546875, "learning_rate": 3.548136645962733e-05, "loss": 0.9763, "mean_token_accuracy": 0.7475451476871967, "num_tokens": 123493867.0, "step": 1310 }, { "entropy": 1.3583389446139336, "epoch": 0.2928615009151922, "grad_norm": 0.5625, "learning_rate": 3.537045252883762e-05, "loss": 0.9564, "mean_token_accuracy": 0.7530794121325016, "num_tokens": 124444992.0, "step": 1320 }, { "entropy": 1.3372597798705101, "epoch": 0.2950801486493982, "grad_norm": 0.56640625, "learning_rate": 3.525953859804791e-05, "loss": 0.917, "mean_token_accuracy": 0.7617659427225589, "num_tokens": 125376281.0, "step": 1330 }, { "entropy": 1.3307228960096835, "epoch": 0.2972987963836042, "grad_norm": 0.5390625, "learning_rate": 3.514862466725821e-05, "loss": 0.9606, "mean_token_accuracy": 0.749411403387785, "num_tokens": 126299926.0, "step": 1340 }, { "entropy": 1.3589562863111495, "epoch": 0.29951744411781017, "grad_norm": 0.5703125, "learning_rate": 3.50377107364685e-05, "loss": 0.9547, "mean_token_accuracy": 0.753935182094574, "num_tokens": 127248113.0, "step": 1350 }, { "entropy": 1.3731069147586823, "epoch": 0.3017360918520162, "grad_norm": 0.58984375, "learning_rate": 3.4926796805678794e-05, "loss": 0.9724, "mean_token_accuracy": 0.7470672108232975, "num_tokens": 128181913.0, "step": 1360 }, { "entropy": 1.3970228135585785, "epoch": 0.3039547395862222, "grad_norm": 0.5625, "learning_rate": 3.481588287488909e-05, "loss": 0.9808, "mean_token_accuracy": 0.7479516059160233, "num_tokens": 129129397.0, "step": 1370 }, { "entropy": 1.3645412735641003, "epoch": 0.3061733873204282, "grad_norm": 0.5625, "learning_rate": 3.470496894409938e-05, "loss": 0.9904, "mean_token_accuracy": 0.7432928495109081, "num_tokens": 130072657.0, "step": 1380 }, { "entropy": 1.3820607632398605, "epoch": 0.3083920350546342, "grad_norm": 0.60546875, "learning_rate": 3.459405501330967e-05, "loss": 0.9456, "mean_token_accuracy": 0.7521802820265293, "num_tokens": 131009716.0, "step": 1390 }, { "entropy": 1.370594221353531, "epoch": 0.3106106827888402, "grad_norm": 0.58984375, "learning_rate": 3.448314108251996e-05, "loss": 0.9802, "mean_token_accuracy": 0.7480019509792328, "num_tokens": 131933570.0, "step": 1400 }, { "entropy": 1.4096333682537079, "epoch": 0.3128293305230462, "grad_norm": 0.5625, "learning_rate": 3.4372227151730255e-05, "loss": 0.9865, "mean_token_accuracy": 0.7449732661247254, "num_tokens": 132867719.0, "step": 1410 }, { "entropy": 1.3462214186787604, "epoch": 0.3150479782572522, "grad_norm": 0.55078125, "learning_rate": 3.426131322094055e-05, "loss": 0.965, "mean_token_accuracy": 0.7521069377660752, "num_tokens": 133800807.0, "step": 1420 }, { "entropy": 1.3778568729758263, "epoch": 0.3172666259914582, "grad_norm": 0.54296875, "learning_rate": 3.415039929015084e-05, "loss": 0.984, "mean_token_accuracy": 0.7461141437292099, "num_tokens": 134735911.0, "step": 1430 }, { "entropy": 1.3803854644298554, "epoch": 0.3194852737256642, "grad_norm": 0.53125, "learning_rate": 3.403948535936114e-05, "loss": 1.0009, "mean_token_accuracy": 0.7454333089292049, "num_tokens": 135679919.0, "step": 1440 }, { "entropy": 1.4075747832655907, "epoch": 0.3217039214598702, "grad_norm": 0.55859375, "learning_rate": 3.392857142857143e-05, "loss": 0.9977, "mean_token_accuracy": 0.7427182622253895, "num_tokens": 136600159.0, "step": 1450 }, { "entropy": 1.3631702698767185, "epoch": 0.3239225691940762, "grad_norm": 0.54296875, "learning_rate": 3.381765749778172e-05, "loss": 0.9834, "mean_token_accuracy": 0.7481319233775139, "num_tokens": 137552890.0, "step": 1460 }, { "entropy": 1.3479732781648637, "epoch": 0.3261412169282822, "grad_norm": 0.54296875, "learning_rate": 3.3706743566992015e-05, "loss": 0.9415, "mean_token_accuracy": 0.7547242395579815, "num_tokens": 138503386.0, "step": 1470 }, { "entropy": 1.3664841935038567, "epoch": 0.32835986466248823, "grad_norm": 0.57421875, "learning_rate": 3.359582963620231e-05, "loss": 0.9414, "mean_token_accuracy": 0.7544668681919575, "num_tokens": 139436618.0, "step": 1480 }, { "entropy": 1.369805136322975, "epoch": 0.3305785123966942, "grad_norm": 0.515625, "learning_rate": 3.34849157054126e-05, "loss": 0.9659, "mean_token_accuracy": 0.748472998291254, "num_tokens": 140386106.0, "step": 1490 }, { "entropy": 1.3472276948392392, "epoch": 0.3327971601309002, "grad_norm": 0.5078125, "learning_rate": 3.337400177462289e-05, "loss": 0.9306, "mean_token_accuracy": 0.7600706323981286, "num_tokens": 141318351.0, "step": 1500 }, { "entropy": 1.3616932608187198, "epoch": 0.3350158078651062, "grad_norm": 0.640625, "learning_rate": 3.326308784383318e-05, "loss": 0.949, "mean_token_accuracy": 0.7525821574032306, "num_tokens": 142261086.0, "step": 1510 }, { "entropy": 1.3529250659048557, "epoch": 0.3372344555993122, "grad_norm": 0.5390625, "learning_rate": 3.3152173913043475e-05, "loss": 0.9622, "mean_token_accuracy": 0.7509421311318875, "num_tokens": 143210426.0, "step": 1520 }, { "entropy": 1.3494533702731133, "epoch": 0.33945310333351825, "grad_norm": 0.6015625, "learning_rate": 3.3041259982253774e-05, "loss": 0.9719, "mean_token_accuracy": 0.7484220921993255, "num_tokens": 144159675.0, "step": 1530 }, { "entropy": 1.3390969790518283, "epoch": 0.34167175106772424, "grad_norm": 0.5625, "learning_rate": 3.2930346051464066e-05, "loss": 0.951, "mean_token_accuracy": 0.7507020443677902, "num_tokens": 145086625.0, "step": 1540 }, { "entropy": 1.3771871954202652, "epoch": 0.34389039880193023, "grad_norm": 0.60546875, "learning_rate": 3.281943212067436e-05, "loss": 0.9817, "mean_token_accuracy": 0.7470030762255192, "num_tokens": 145986719.0, "step": 1550 }, { "entropy": 1.3694410175085068, "epoch": 0.3461090465361362, "grad_norm": 1.421875, "learning_rate": 3.270851818988465e-05, "loss": 0.9611, "mean_token_accuracy": 0.7503976099193096, "num_tokens": 146918044.0, "step": 1560 }, { "entropy": 1.3784858211874962, "epoch": 0.3483276942703422, "grad_norm": 0.5703125, "learning_rate": 3.259760425909494e-05, "loss": 0.9773, "mean_token_accuracy": 0.7474872335791588, "num_tokens": 147883804.0, "step": 1570 }, { "entropy": 1.3432278633117676, "epoch": 0.3505463420045482, "grad_norm": 0.56640625, "learning_rate": 3.2486690328305235e-05, "loss": 0.936, "mean_token_accuracy": 0.755360123515129, "num_tokens": 148818674.0, "step": 1580 }, { "entropy": 1.3762368500232696, "epoch": 0.35276498973875425, "grad_norm": 0.53125, "learning_rate": 3.237577639751553e-05, "loss": 1.0127, "mean_token_accuracy": 0.7423109777271748, "num_tokens": 149775495.0, "step": 1590 }, { "entropy": 1.389584618806839, "epoch": 0.35498363747296025, "grad_norm": 0.64453125, "learning_rate": 3.226486246672582e-05, "loss": 0.956, "mean_token_accuracy": 0.7540929049253464, "num_tokens": 150715437.0, "step": 1600 }, { "entropy": 1.3511891454458236, "epoch": 0.35720228520716624, "grad_norm": 0.55078125, "learning_rate": 3.215394853593611e-05, "loss": 0.9424, "mean_token_accuracy": 0.7547151155769825, "num_tokens": 151648229.0, "step": 1610 }, { "entropy": 1.342407089471817, "epoch": 0.35942093294137223, "grad_norm": 0.5390625, "learning_rate": 3.204303460514641e-05, "loss": 0.9572, "mean_token_accuracy": 0.75165830925107, "num_tokens": 152568041.0, "step": 1620 }, { "entropy": 1.3556662440299987, "epoch": 0.3616395806755782, "grad_norm": 0.55859375, "learning_rate": 3.19321206743567e-05, "loss": 0.9579, "mean_token_accuracy": 0.7535672217607499, "num_tokens": 153481444.0, "step": 1630 }, { "entropy": 1.3596419125795365, "epoch": 0.3638582284097842, "grad_norm": 0.5390625, "learning_rate": 3.1821206743566994e-05, "loss": 0.9707, "mean_token_accuracy": 0.7496115677058697, "num_tokens": 154436286.0, "step": 1640 }, { "entropy": 1.3691009670495986, "epoch": 0.36607687614399026, "grad_norm": 0.5546875, "learning_rate": 3.171029281277729e-05, "loss": 0.9829, "mean_token_accuracy": 0.7472112305462361, "num_tokens": 155369838.0, "step": 1650 }, { "entropy": 1.3596091173589229, "epoch": 0.36829552387819625, "grad_norm": 0.546875, "learning_rate": 3.159937888198758e-05, "loss": 0.9514, "mean_token_accuracy": 0.7531104668974876, "num_tokens": 156331805.0, "step": 1660 }, { "entropy": 1.4006396278738975, "epoch": 0.37051417161240224, "grad_norm": 0.5390625, "learning_rate": 3.148846495119787e-05, "loss": 1.0172, "mean_token_accuracy": 0.7395706221461296, "num_tokens": 157249437.0, "step": 1670 }, { "entropy": 1.3770634673535824, "epoch": 0.37273281934660824, "grad_norm": 0.5546875, "learning_rate": 3.137755102040816e-05, "loss": 0.9982, "mean_token_accuracy": 0.7440236747264862, "num_tokens": 158190770.0, "step": 1680 }, { "entropy": 1.370580254495144, "epoch": 0.3749514670808142, "grad_norm": 0.51171875, "learning_rate": 3.1266637089618455e-05, "loss": 0.9632, "mean_token_accuracy": 0.7505429275333881, "num_tokens": 159111105.0, "step": 1690 }, { "entropy": 1.3719338580965996, "epoch": 0.3771701148150202, "grad_norm": 0.58203125, "learning_rate": 3.115572315882875e-05, "loss": 0.9831, "mean_token_accuracy": 0.7460063569247722, "num_tokens": 160054633.0, "step": 1700 }, { "entropy": 1.4081777222454548, "epoch": 0.37938876254922627, "grad_norm": 0.53125, "learning_rate": 3.1044809228039046e-05, "loss": 1.0079, "mean_token_accuracy": 0.7412950038909912, "num_tokens": 161001374.0, "step": 1710 }, { "entropy": 1.3786864325404167, "epoch": 0.38160741028343226, "grad_norm": 0.546875, "learning_rate": 3.093389529724934e-05, "loss": 1.0045, "mean_token_accuracy": 0.7446122042834759, "num_tokens": 161931135.0, "step": 1720 }, { "entropy": 1.3475178599357605, "epoch": 0.38382605801763825, "grad_norm": 0.58984375, "learning_rate": 3.082298136645963e-05, "loss": 0.9488, "mean_token_accuracy": 0.7558258168399334, "num_tokens": 162879415.0, "step": 1730 }, { "entropy": 1.369861949980259, "epoch": 0.38604470575184424, "grad_norm": 0.56640625, "learning_rate": 3.071206743566992e-05, "loss": 0.965, "mean_token_accuracy": 0.7485872730612755, "num_tokens": 163815621.0, "step": 1740 }, { "entropy": 1.3579731062054634, "epoch": 0.38826335348605023, "grad_norm": 0.55859375, "learning_rate": 3.0601153504880215e-05, "loss": 0.9566, "mean_token_accuracy": 0.7517626143991947, "num_tokens": 164760214.0, "step": 1750 }, { "entropy": 1.3689505890011788, "epoch": 0.3904820012202563, "grad_norm": 0.53125, "learning_rate": 3.0490239574090507e-05, "loss": 0.9746, "mean_token_accuracy": 0.7488324150443078, "num_tokens": 165725369.0, "step": 1760 }, { "entropy": 1.3397907942533493, "epoch": 0.3927006489544623, "grad_norm": 0.51953125, "learning_rate": 3.03793256433008e-05, "loss": 0.9336, "mean_token_accuracy": 0.7559916451573372, "num_tokens": 166673239.0, "step": 1770 }, { "entropy": 1.3467686548829079, "epoch": 0.39491929668866826, "grad_norm": 0.51953125, "learning_rate": 3.026841171251109e-05, "loss": 0.9639, "mean_token_accuracy": 0.7545785017311573, "num_tokens": 167629364.0, "step": 1780 }, { "entropy": 1.3575761772692203, "epoch": 0.39713794442287426, "grad_norm": 0.5703125, "learning_rate": 3.0157497781721383e-05, "loss": 0.9724, "mean_token_accuracy": 0.7488372251391411, "num_tokens": 168566244.0, "step": 1790 }, { "entropy": 1.353706033527851, "epoch": 0.39935659215708025, "grad_norm": 0.6015625, "learning_rate": 3.0046583850931682e-05, "loss": 0.9412, "mean_token_accuracy": 0.7558333098888397, "num_tokens": 169511821.0, "step": 1800 }, { "entropy": 1.3735353089869022, "epoch": 0.40157523989128624, "grad_norm": 0.5625, "learning_rate": 2.9935669920141974e-05, "loss": 0.985, "mean_token_accuracy": 0.7467327207326889, "num_tokens": 170438159.0, "step": 1810 }, { "entropy": 1.383265955746174, "epoch": 0.4037938876254923, "grad_norm": 0.51171875, "learning_rate": 2.9824755989352266e-05, "loss": 0.9906, "mean_token_accuracy": 0.7453700192272663, "num_tokens": 171372232.0, "step": 1820 }, { "entropy": 1.3484601899981499, "epoch": 0.4060125353596983, "grad_norm": 0.5390625, "learning_rate": 2.971384205856256e-05, "loss": 0.9455, "mean_token_accuracy": 0.7536688603460788, "num_tokens": 172306458.0, "step": 1830 }, { "entropy": 1.3571648687124251, "epoch": 0.40823118309390427, "grad_norm": 0.57421875, "learning_rate": 2.960292812777285e-05, "loss": 0.9957, "mean_token_accuracy": 0.7446161836385727, "num_tokens": 173255617.0, "step": 1840 }, { "entropy": 1.3686935976147652, "epoch": 0.41044983082811026, "grad_norm": 0.5390625, "learning_rate": 2.9492014196983143e-05, "loss": 0.9407, "mean_token_accuracy": 0.7566944785416126, "num_tokens": 174208124.0, "step": 1850 }, { "entropy": 1.3695332050323485, "epoch": 0.41266847856231625, "grad_norm": 0.55859375, "learning_rate": 2.9381100266193435e-05, "loss": 0.973, "mean_token_accuracy": 0.7486338473856449, "num_tokens": 175159867.0, "step": 1860 }, { "entropy": 1.3632485464215278, "epoch": 0.41488712629652225, "grad_norm": 0.5703125, "learning_rate": 2.9270186335403727e-05, "loss": 0.9524, "mean_token_accuracy": 0.7515506997704506, "num_tokens": 176092841.0, "step": 1870 }, { "entropy": 1.3715375781059265, "epoch": 0.4171057740307283, "grad_norm": 0.5546875, "learning_rate": 2.915927240461402e-05, "loss": 0.981, "mean_token_accuracy": 0.7478602975606918, "num_tokens": 177053245.0, "step": 1880 }, { "entropy": 1.3497217521071434, "epoch": 0.4193244217649343, "grad_norm": 0.5703125, "learning_rate": 2.9048358473824318e-05, "loss": 0.9605, "mean_token_accuracy": 0.7520354442298413, "num_tokens": 178014206.0, "step": 1890 }, { "entropy": 1.380847369134426, "epoch": 0.4215430694991403, "grad_norm": 0.53515625, "learning_rate": 2.893744454303461e-05, "loss": 0.9716, "mean_token_accuracy": 0.7520315021276474, "num_tokens": 178955937.0, "step": 1900 }, { "entropy": 1.3706847220659255, "epoch": 0.42376171723334627, "grad_norm": 0.51953125, "learning_rate": 2.8826530612244902e-05, "loss": 0.9787, "mean_token_accuracy": 0.7502063922584057, "num_tokens": 179913308.0, "step": 1910 }, { "entropy": 1.386249950528145, "epoch": 0.42598036496755226, "grad_norm": 0.58203125, "learning_rate": 2.8715616681455194e-05, "loss": 0.9614, "mean_token_accuracy": 0.7522901840507984, "num_tokens": 180862001.0, "step": 1920 }, { "entropy": 1.387051635980606, "epoch": 0.42819901270175825, "grad_norm": 0.54296875, "learning_rate": 2.8604702750665487e-05, "loss": 0.9775, "mean_token_accuracy": 0.7487853363156318, "num_tokens": 181811443.0, "step": 1930 }, { "entropy": 1.3455969080328942, "epoch": 0.4304176604359643, "grad_norm": 0.49609375, "learning_rate": 2.849378881987578e-05, "loss": 0.9572, "mean_token_accuracy": 0.751628965884447, "num_tokens": 182766304.0, "step": 1940 }, { "entropy": 1.364281751215458, "epoch": 0.4326363081701703, "grad_norm": 0.5390625, "learning_rate": 2.838287488908607e-05, "loss": 0.9469, "mean_token_accuracy": 0.7545228533446788, "num_tokens": 183713237.0, "step": 1950 }, { "entropy": 1.3550898402929306, "epoch": 0.4348549559043763, "grad_norm": 0.60546875, "learning_rate": 2.8271960958296363e-05, "loss": 0.9731, "mean_token_accuracy": 0.7482604801654815, "num_tokens": 184646169.0, "step": 1960 }, { "entropy": 1.3582610800862311, "epoch": 0.4370736036385823, "grad_norm": 0.57421875, "learning_rate": 2.8161047027506655e-05, "loss": 0.948, "mean_token_accuracy": 0.7556835524737835, "num_tokens": 185593240.0, "step": 1970 }, { "entropy": 1.4051440745592116, "epoch": 0.43929225137278827, "grad_norm": 0.58984375, "learning_rate": 2.8050133096716947e-05, "loss": 1.0018, "mean_token_accuracy": 0.7416196145117283, "num_tokens": 186511580.0, "step": 1980 }, { "entropy": 1.3595298886299134, "epoch": 0.4415108991069943, "grad_norm": 0.55859375, "learning_rate": 2.7939219165927243e-05, "loss": 0.966, "mean_token_accuracy": 0.7499643869698047, "num_tokens": 187463868.0, "step": 1990 }, { "entropy": 1.344205194711685, "epoch": 0.4437295468412003, "grad_norm": 0.5078125, "learning_rate": 2.7828305235137535e-05, "loss": 0.9686, "mean_token_accuracy": 0.7512450948357582, "num_tokens": 188409504.0, "step": 2000 }, { "entropy": 1.4076050415635109, "epoch": 0.4459481945754063, "grad_norm": 0.55859375, "learning_rate": 2.7717391304347827e-05, "loss": 1.0067, "mean_token_accuracy": 0.7415082044899464, "num_tokens": 189355779.0, "step": 2010 }, { "entropy": 1.3575765684247016, "epoch": 0.4481668423096123, "grad_norm": 0.5625, "learning_rate": 2.760647737355812e-05, "loss": 0.9601, "mean_token_accuracy": 0.7506938494741917, "num_tokens": 190278778.0, "step": 2020 }, { "entropy": 1.3836154788732529, "epoch": 0.4503854900438183, "grad_norm": 0.5546875, "learning_rate": 2.749556344276841e-05, "loss": 1.0157, "mean_token_accuracy": 0.7393032193183899, "num_tokens": 191241326.0, "step": 2030 }, { "entropy": 1.391631406545639, "epoch": 0.4526041377780243, "grad_norm": 0.5625, "learning_rate": 2.7384649511978703e-05, "loss": 0.9776, "mean_token_accuracy": 0.7508174151182174, "num_tokens": 192214681.0, "step": 2040 }, { "entropy": 1.3410830795764923, "epoch": 0.4548227855122303, "grad_norm": 0.55859375, "learning_rate": 2.7273735581188996e-05, "loss": 0.943, "mean_token_accuracy": 0.7552683062851429, "num_tokens": 193166398.0, "step": 2050 }, { "entropy": 1.3428148820996284, "epoch": 0.4570414332464363, "grad_norm": 0.58203125, "learning_rate": 2.7162821650399288e-05, "loss": 0.9664, "mean_token_accuracy": 0.750061446428299, "num_tokens": 194098519.0, "step": 2060 }, { "entropy": 1.379681558907032, "epoch": 0.4592600809806423, "grad_norm": 0.6015625, "learning_rate": 2.7051907719609583e-05, "loss": 0.9832, "mean_token_accuracy": 0.7464658364653587, "num_tokens": 195004862.0, "step": 2070 }, { "entropy": 1.3508182168006897, "epoch": 0.4614787287148483, "grad_norm": 0.53125, "learning_rate": 2.694099378881988e-05, "loss": 0.9602, "mean_token_accuracy": 0.7505707196891308, "num_tokens": 195942523.0, "step": 2080 }, { "entropy": 1.3505974404513836, "epoch": 0.4636973764490543, "grad_norm": 0.57421875, "learning_rate": 2.683007985803017e-05, "loss": 0.9358, "mean_token_accuracy": 0.7566464401781559, "num_tokens": 196877376.0, "step": 2090 }, { "entropy": 1.3485953092575074, "epoch": 0.4659160241832603, "grad_norm": 0.55078125, "learning_rate": 2.6719165927240463e-05, "loss": 0.9482, "mean_token_accuracy": 0.7524454712867736, "num_tokens": 197810400.0, "step": 2100 }, { "entropy": 1.3627469688653946, "epoch": 0.4681346719174663, "grad_norm": 0.57421875, "learning_rate": 2.6608251996450755e-05, "loss": 0.9897, "mean_token_accuracy": 0.7450599886476994, "num_tokens": 198757408.0, "step": 2110 }, { "entropy": 1.3389364905655383, "epoch": 0.4703533196516723, "grad_norm": 0.5546875, "learning_rate": 2.6497338065661047e-05, "loss": 0.9129, "mean_token_accuracy": 0.7605686038732529, "num_tokens": 199692340.0, "step": 2120 }, { "entropy": 1.3748140163719653, "epoch": 0.4725719673858783, "grad_norm": 0.5703125, "learning_rate": 2.638642413487134e-05, "loss": 0.955, "mean_token_accuracy": 0.7520599849522114, "num_tokens": 200650664.0, "step": 2130 }, { "entropy": 1.3346731156110763, "epoch": 0.4747906151200843, "grad_norm": 0.56640625, "learning_rate": 2.627551020408163e-05, "loss": 0.9212, "mean_token_accuracy": 0.7582221433520318, "num_tokens": 201597933.0, "step": 2140 }, { "entropy": 1.3664929166436195, "epoch": 0.4770092628542903, "grad_norm": 0.55078125, "learning_rate": 2.6164596273291924e-05, "loss": 0.9675, "mean_token_accuracy": 0.7481563113629818, "num_tokens": 202551711.0, "step": 2150 }, { "entropy": 1.3843952640891075, "epoch": 0.4792279105884963, "grad_norm": 0.53125, "learning_rate": 2.6053682342502216e-05, "loss": 1.0013, "mean_token_accuracy": 0.7428773507475853, "num_tokens": 203475006.0, "step": 2160 }, { "entropy": 1.3748216979205607, "epoch": 0.48144655832270233, "grad_norm": 0.53125, "learning_rate": 2.5942768411712515e-05, "loss": 0.9858, "mean_token_accuracy": 0.7472008153796196, "num_tokens": 204414096.0, "step": 2170 }, { "entropy": 1.3418536871671676, "epoch": 0.4836652060569083, "grad_norm": 0.54296875, "learning_rate": 2.5831854480922807e-05, "loss": 0.9614, "mean_token_accuracy": 0.7489035427570343, "num_tokens": 205346877.0, "step": 2180 }, { "entropy": 1.334491142630577, "epoch": 0.4858838537911143, "grad_norm": 0.52734375, "learning_rate": 2.57209405501331e-05, "loss": 0.952, "mean_token_accuracy": 0.7544978365302086, "num_tokens": 206299157.0, "step": 2190 }, { "entropy": 1.3800398319959641, "epoch": 0.4881025015253203, "grad_norm": 0.56640625, "learning_rate": 2.561002661934339e-05, "loss": 0.9965, "mean_token_accuracy": 0.7438121646642685, "num_tokens": 207235480.0, "step": 2200 }, { "entropy": 1.3536552309989929, "epoch": 0.4903211492595263, "grad_norm": 0.578125, "learning_rate": 2.5499112688553683e-05, "loss": 0.9518, "mean_token_accuracy": 0.7553939551115036, "num_tokens": 208192869.0, "step": 2210 }, { "entropy": 1.3550305306911468, "epoch": 0.4925397969937323, "grad_norm": 0.55859375, "learning_rate": 2.5388198757763975e-05, "loss": 0.962, "mean_token_accuracy": 0.7509313143789769, "num_tokens": 209163216.0, "step": 2220 }, { "entropy": 1.3438825502991676, "epoch": 0.49475844472793834, "grad_norm": 0.5234375, "learning_rate": 2.5277284826974267e-05, "loss": 0.9612, "mean_token_accuracy": 0.7521602623164654, "num_tokens": 210107351.0, "step": 2230 }, { "entropy": 1.3802131339907646, "epoch": 0.49697709246214433, "grad_norm": 0.53125, "learning_rate": 2.516637089618456e-05, "loss": 0.9761, "mean_token_accuracy": 0.7472608901560307, "num_tokens": 211041572.0, "step": 2240 }, { "entropy": 1.340398869663477, "epoch": 0.4991957401963503, "grad_norm": 0.5703125, "learning_rate": 2.5055456965394852e-05, "loss": 0.9588, "mean_token_accuracy": 0.7512741200625896, "num_tokens": 211977487.0, "step": 2250 }, { "entropy": 1.3554712682962418, "epoch": 0.5014143879305564, "grad_norm": 0.515625, "learning_rate": 2.4944543034605147e-05, "loss": 0.9593, "mean_token_accuracy": 0.7513454340398311, "num_tokens": 212944715.0, "step": 2260 }, { "entropy": 1.3268218383193016, "epoch": 0.5036330356647624, "grad_norm": 0.484375, "learning_rate": 2.483362910381544e-05, "loss": 0.9323, "mean_token_accuracy": 0.7580919787287712, "num_tokens": 213894148.0, "step": 2270 }, { "entropy": 1.3675538420677185, "epoch": 0.5058516833989684, "grad_norm": 0.5625, "learning_rate": 2.4722715173025735e-05, "loss": 0.9662, "mean_token_accuracy": 0.7483490623533726, "num_tokens": 214849284.0, "step": 2280 }, { "entropy": 1.3869496576488018, "epoch": 0.5080703311331743, "grad_norm": 0.58203125, "learning_rate": 2.4611801242236027e-05, "loss": 0.9855, "mean_token_accuracy": 0.7459334179759025, "num_tokens": 215795497.0, "step": 2290 }, { "entropy": 1.3464835032820701, "epoch": 0.5102889788673803, "grad_norm": 0.52734375, "learning_rate": 2.450088731144632e-05, "loss": 0.9559, "mean_token_accuracy": 0.7510037913918495, "num_tokens": 216730353.0, "step": 2300 }, { "entropy": 1.3536331675946713, "epoch": 0.5125076266015863, "grad_norm": 0.54296875, "learning_rate": 2.438997338065661e-05, "loss": 0.9768, "mean_token_accuracy": 0.7474269300699234, "num_tokens": 217652371.0, "step": 2310 }, { "entropy": 1.3597574278712272, "epoch": 0.5147262743357923, "grad_norm": 0.5390625, "learning_rate": 2.4279059449866903e-05, "loss": 0.9739, "mean_token_accuracy": 0.7476049326360226, "num_tokens": 218599573.0, "step": 2320 }, { "entropy": 1.3576934173703195, "epoch": 0.5169449220699983, "grad_norm": 0.54296875, "learning_rate": 2.41681455190772e-05, "loss": 0.9472, "mean_token_accuracy": 0.7533730484545231, "num_tokens": 219526034.0, "step": 2330 }, { "entropy": 1.3639849349856377, "epoch": 0.5191635698042043, "grad_norm": 0.55078125, "learning_rate": 2.405723158828749e-05, "loss": 1.0025, "mean_token_accuracy": 0.7426088079810143, "num_tokens": 220457735.0, "step": 2340 }, { "entropy": 1.3650838419795037, "epoch": 0.5213822175384103, "grad_norm": 0.5546875, "learning_rate": 2.3946317657497783e-05, "loss": 0.9795, "mean_token_accuracy": 0.7475734516978264, "num_tokens": 221414161.0, "step": 2350 }, { "entropy": 1.381436189264059, "epoch": 0.5236008652726163, "grad_norm": 0.5625, "learning_rate": 2.3835403726708075e-05, "loss": 0.9928, "mean_token_accuracy": 0.744963239133358, "num_tokens": 222354700.0, "step": 2360 }, { "entropy": 1.3839796632528305, "epoch": 0.5258195130068224, "grad_norm": 0.609375, "learning_rate": 2.372448979591837e-05, "loss": 0.9714, "mean_token_accuracy": 0.7483658462762832, "num_tokens": 223258825.0, "step": 2370 }, { "entropy": 1.3521512404084206, "epoch": 0.5280381607410284, "grad_norm": 0.5390625, "learning_rate": 2.3613575865128663e-05, "loss": 0.9965, "mean_token_accuracy": 0.7439669594168663, "num_tokens": 224207000.0, "step": 2380 }, { "entropy": 1.3651843503117562, "epoch": 0.5302568084752344, "grad_norm": 0.5546875, "learning_rate": 2.3502661934338955e-05, "loss": 0.9609, "mean_token_accuracy": 0.7512055054306984, "num_tokens": 225168371.0, "step": 2390 }, { "entropy": 1.3762024179100991, "epoch": 0.5324754562094404, "grad_norm": 0.67578125, "learning_rate": 2.3391748003549247e-05, "loss": 0.9818, "mean_token_accuracy": 0.7457237169146538, "num_tokens": 226082017.0, "step": 2400 }, { "entropy": 1.3472841560840607, "epoch": 0.5346941039436464, "grad_norm": 0.5234375, "learning_rate": 2.328083407275954e-05, "loss": 0.9581, "mean_token_accuracy": 0.7504221297800541, "num_tokens": 227034510.0, "step": 2410 }, { "entropy": 1.3381285414099693, "epoch": 0.5369127516778524, "grad_norm": 0.54296875, "learning_rate": 2.3169920141969835e-05, "loss": 0.9492, "mean_token_accuracy": 0.7552238062024117, "num_tokens": 228002765.0, "step": 2420 }, { "entropy": 1.3511281102895736, "epoch": 0.5391313994120583, "grad_norm": 0.5390625, "learning_rate": 2.3059006211180127e-05, "loss": 0.9393, "mean_token_accuracy": 0.7557635813951492, "num_tokens": 228965429.0, "step": 2430 }, { "entropy": 1.3392139934003353, "epoch": 0.5413500471462643, "grad_norm": 0.5859375, "learning_rate": 2.294809228039042e-05, "loss": 0.9343, "mean_token_accuracy": 0.7574323169887066, "num_tokens": 229896813.0, "step": 2440 }, { "entropy": 1.3686351031064987, "epoch": 0.5435686948804703, "grad_norm": 0.59765625, "learning_rate": 2.283717834960071e-05, "loss": 0.937, "mean_token_accuracy": 0.7549462541937828, "num_tokens": 230834752.0, "step": 2450 }, { "entropy": 1.339580136537552, "epoch": 0.5457873426146763, "grad_norm": 0.51171875, "learning_rate": 2.2726264418811003e-05, "loss": 0.9429, "mean_token_accuracy": 0.7544058203697205, "num_tokens": 231770668.0, "step": 2460 }, { "entropy": 1.3737561523914337, "epoch": 0.5480059903488823, "grad_norm": 0.5703125, "learning_rate": 2.26153504880213e-05, "loss": 0.9539, "mean_token_accuracy": 0.752394187450409, "num_tokens": 232706737.0, "step": 2470 }, { "entropy": 1.3709870815277099, "epoch": 0.5502246380830884, "grad_norm": 0.53125, "learning_rate": 2.250443655723159e-05, "loss": 0.9915, "mean_token_accuracy": 0.7455207951366901, "num_tokens": 233667028.0, "step": 2480 }, { "entropy": 1.3425948224961757, "epoch": 0.5524432858172944, "grad_norm": 0.5625, "learning_rate": 2.2393522626441883e-05, "loss": 0.9351, "mean_token_accuracy": 0.7563626609742642, "num_tokens": 234615377.0, "step": 2490 }, { "entropy": 1.382601398229599, "epoch": 0.5546619335515004, "grad_norm": 0.57421875, "learning_rate": 2.2282608695652175e-05, "loss": 0.9837, "mean_token_accuracy": 0.7450042508542538, "num_tokens": 235554965.0, "step": 2500 }, { "entropy": 1.3571919694542884, "epoch": 0.5568805812857064, "grad_norm": 0.60546875, "learning_rate": 2.2171694764862467e-05, "loss": 0.9675, "mean_token_accuracy": 0.7487703949213028, "num_tokens": 236510004.0, "step": 2510 }, { "entropy": 1.3876874506473542, "epoch": 0.5590992290199124, "grad_norm": 0.55859375, "learning_rate": 2.206078083407276e-05, "loss": 0.9859, "mean_token_accuracy": 0.7465429671108723, "num_tokens": 237426607.0, "step": 2520 }, { "entropy": 1.3840069979429246, "epoch": 0.5613178767541184, "grad_norm": 0.57421875, "learning_rate": 2.1949866903283052e-05, "loss": 0.9573, "mean_token_accuracy": 0.7508729174733162, "num_tokens": 238388324.0, "step": 2530 }, { "entropy": 1.3555088877677917, "epoch": 0.5635365244883244, "grad_norm": 0.55859375, "learning_rate": 2.1838952972493347e-05, "loss": 0.9508, "mean_token_accuracy": 0.7524322152137757, "num_tokens": 239305641.0, "step": 2540 }, { "entropy": 1.4074857875704765, "epoch": 0.5657551722225304, "grad_norm": 0.56640625, "learning_rate": 2.172803904170364e-05, "loss": 0.9851, "mean_token_accuracy": 0.7465929470956325, "num_tokens": 240221454.0, "step": 2550 }, { "entropy": 1.3208380579948424, "epoch": 0.5679738199567363, "grad_norm": 0.546875, "learning_rate": 2.161712511091393e-05, "loss": 0.9432, "mean_token_accuracy": 0.754857836663723, "num_tokens": 241170406.0, "step": 2560 }, { "entropy": 1.36037939786911, "epoch": 0.5701924676909423, "grad_norm": 0.53515625, "learning_rate": 2.1506211180124224e-05, "loss": 0.9572, "mean_token_accuracy": 0.7514487348496914, "num_tokens": 242100616.0, "step": 2570 }, { "entropy": 1.3895253077149392, "epoch": 0.5724111154251483, "grad_norm": 0.52734375, "learning_rate": 2.1395297249334516e-05, "loss": 0.9847, "mean_token_accuracy": 0.7462774030864239, "num_tokens": 243038631.0, "step": 2580 }, { "entropy": 1.3492624297738076, "epoch": 0.5746297631593543, "grad_norm": 0.5390625, "learning_rate": 2.1284383318544808e-05, "loss": 0.9428, "mean_token_accuracy": 0.7537525497376919, "num_tokens": 244000961.0, "step": 2590 }, { "entropy": 1.357532762736082, "epoch": 0.5768484108935604, "grad_norm": 0.51171875, "learning_rate": 2.1173469387755103e-05, "loss": 0.9577, "mean_token_accuracy": 0.7505512781441211, "num_tokens": 244952283.0, "step": 2600 }, { "entropy": 1.3500149488449096, "epoch": 0.5790670586277664, "grad_norm": 0.5625, "learning_rate": 2.1062555456965396e-05, "loss": 0.9333, "mean_token_accuracy": 0.757443331182003, "num_tokens": 245897365.0, "step": 2610 }, { "entropy": 1.3495190888643265, "epoch": 0.5812857063619724, "grad_norm": 0.5390625, "learning_rate": 2.0951641526175688e-05, "loss": 0.9582, "mean_token_accuracy": 0.7513453289866447, "num_tokens": 246833155.0, "step": 2620 }, { "entropy": 1.3707278072834015, "epoch": 0.5835043540961784, "grad_norm": 0.55078125, "learning_rate": 2.084072759538598e-05, "loss": 0.9817, "mean_token_accuracy": 0.7466557987034321, "num_tokens": 247796159.0, "step": 2630 }, { "entropy": 1.340453139692545, "epoch": 0.5857230018303844, "grad_norm": 0.57421875, "learning_rate": 2.0729813664596272e-05, "loss": 0.9589, "mean_token_accuracy": 0.7525463417172432, "num_tokens": 248736277.0, "step": 2640 }, { "entropy": 1.3754316791892052, "epoch": 0.5879416495645904, "grad_norm": 0.51171875, "learning_rate": 2.0618899733806567e-05, "loss": 0.9697, "mean_token_accuracy": 0.7479398109018802, "num_tokens": 249662809.0, "step": 2650 }, { "entropy": 1.3680956415832042, "epoch": 0.5901602972987964, "grad_norm": 0.5625, "learning_rate": 2.050798580301686e-05, "loss": 0.9565, "mean_token_accuracy": 0.7505876325070858, "num_tokens": 250581187.0, "step": 2660 }, { "entropy": 1.372042527794838, "epoch": 0.5923789450330024, "grad_norm": 0.578125, "learning_rate": 2.0397071872227152e-05, "loss": 0.9573, "mean_token_accuracy": 0.7499286234378815, "num_tokens": 251494513.0, "step": 2670 }, { "entropy": 1.3676451787352562, "epoch": 0.5945975927672084, "grad_norm": 0.53515625, "learning_rate": 2.0286157941437444e-05, "loss": 1.0024, "mean_token_accuracy": 0.7436093680560589, "num_tokens": 252450478.0, "step": 2680 }, { "entropy": 1.321447344124317, "epoch": 0.5968162405014144, "grad_norm": 0.8046875, "learning_rate": 2.0175244010647736e-05, "loss": 0.9297, "mean_token_accuracy": 0.7592410154640674, "num_tokens": 253388200.0, "step": 2690 }, { "entropy": 1.367007777094841, "epoch": 0.5990348882356203, "grad_norm": 0.55859375, "learning_rate": 2.006433007985803e-05, "loss": 0.951, "mean_token_accuracy": 0.7534979909658432, "num_tokens": 254303611.0, "step": 2700 }, { "entropy": 1.3557642981410027, "epoch": 0.6012535359698263, "grad_norm": 0.55078125, "learning_rate": 1.9953416149068324e-05, "loss": 0.9455, "mean_token_accuracy": 0.7532623074948788, "num_tokens": 255267315.0, "step": 2710 }, { "entropy": 1.3694660350680352, "epoch": 0.6034721837040324, "grad_norm": 0.58203125, "learning_rate": 1.9842502218278616e-05, "loss": 0.9634, "mean_token_accuracy": 0.7510243773460388, "num_tokens": 256198305.0, "step": 2720 }, { "entropy": 1.3600564405322075, "epoch": 0.6056908314382384, "grad_norm": 0.53515625, "learning_rate": 1.9731588287488908e-05, "loss": 0.9597, "mean_token_accuracy": 0.7512127391993999, "num_tokens": 257177077.0, "step": 2730 }, { "entropy": 1.3179180152714252, "epoch": 0.6079094791724444, "grad_norm": 0.55859375, "learning_rate": 1.9620674356699203e-05, "loss": 0.935, "mean_token_accuracy": 0.757300040870905, "num_tokens": 258134360.0, "step": 2740 }, { "entropy": 1.35346722304821, "epoch": 0.6101281269066504, "grad_norm": 0.52734375, "learning_rate": 1.9509760425909496e-05, "loss": 0.945, "mean_token_accuracy": 0.7527749851346016, "num_tokens": 259094913.0, "step": 2750 }, { "entropy": 1.3553345277905464, "epoch": 0.6123467746408564, "grad_norm": 0.51953125, "learning_rate": 1.9398846495119788e-05, "loss": 0.9358, "mean_token_accuracy": 0.7586644418537617, "num_tokens": 260019758.0, "step": 2760 }, { "entropy": 1.3732210516929626, "epoch": 0.6145654223750624, "grad_norm": 0.57421875, "learning_rate": 1.928793256433008e-05, "loss": 0.9604, "mean_token_accuracy": 0.7491915933787823, "num_tokens": 260944733.0, "step": 2770 }, { "entropy": 1.3680385306477547, "epoch": 0.6167840701092684, "grad_norm": 0.56640625, "learning_rate": 1.9177018633540372e-05, "loss": 0.9676, "mean_token_accuracy": 0.7478097401559353, "num_tokens": 261873242.0, "step": 2780 }, { "entropy": 1.3868214182555676, "epoch": 0.6190027178434744, "grad_norm": 0.515625, "learning_rate": 1.9066104702750667e-05, "loss": 0.9516, "mean_token_accuracy": 0.7528703935444355, "num_tokens": 262814898.0, "step": 2790 }, { "entropy": 1.3595042198896408, "epoch": 0.6212213655776804, "grad_norm": 0.546875, "learning_rate": 1.895519077196096e-05, "loss": 0.9479, "mean_token_accuracy": 0.7566476508975029, "num_tokens": 263753086.0, "step": 2800 }, { "entropy": 1.33022148758173, "epoch": 0.6234400133118864, "grad_norm": 0.515625, "learning_rate": 1.8844276841171252e-05, "loss": 0.9321, "mean_token_accuracy": 0.7572297543287277, "num_tokens": 264700281.0, "step": 2810 }, { "entropy": 1.3536405637860298, "epoch": 0.6256586610460924, "grad_norm": 0.55078125, "learning_rate": 1.8733362910381544e-05, "loss": 0.9419, "mean_token_accuracy": 0.7534777402877808, "num_tokens": 265647657.0, "step": 2820 }, { "entropy": 1.3843684569001198, "epoch": 0.6278773087802985, "grad_norm": 0.57421875, "learning_rate": 1.862244897959184e-05, "loss": 0.9653, "mean_token_accuracy": 0.7501688152551651, "num_tokens": 266578441.0, "step": 2830 }, { "entropy": 1.3544567473232747, "epoch": 0.6300959565145045, "grad_norm": 0.515625, "learning_rate": 1.851153504880213e-05, "loss": 0.9457, "mean_token_accuracy": 0.7535403810441494, "num_tokens": 267530874.0, "step": 2840 }, { "entropy": 1.3669777683913709, "epoch": 0.6323146042487104, "grad_norm": 0.53125, "learning_rate": 1.8400621118012424e-05, "loss": 0.9422, "mean_token_accuracy": 0.7547655880451203, "num_tokens": 268461165.0, "step": 2850 }, { "entropy": 1.3307232797145843, "epoch": 0.6345332519829164, "grad_norm": 0.5625, "learning_rate": 1.8289707187222716e-05, "loss": 0.9172, "mean_token_accuracy": 0.7619568608701229, "num_tokens": 269407213.0, "step": 2860 }, { "entropy": 1.367132118344307, "epoch": 0.6367518997171224, "grad_norm": 0.54296875, "learning_rate": 1.8178793256433008e-05, "loss": 0.9508, "mean_token_accuracy": 0.7541770383715629, "num_tokens": 270328899.0, "step": 2870 }, { "entropy": 1.3831279791891575, "epoch": 0.6389705474513284, "grad_norm": 0.5625, "learning_rate": 1.8067879325643303e-05, "loss": 0.9817, "mean_token_accuracy": 0.746974790096283, "num_tokens": 271265664.0, "step": 2880 }, { "entropy": 1.3699244022369386, "epoch": 0.6411891951855344, "grad_norm": 0.5390625, "learning_rate": 1.7956965394853596e-05, "loss": 0.9696, "mean_token_accuracy": 0.7492304258048534, "num_tokens": 272186809.0, "step": 2890 }, { "entropy": 1.3639655753970146, "epoch": 0.6434078429197404, "grad_norm": 0.52734375, "learning_rate": 1.7846051464063888e-05, "loss": 0.9668, "mean_token_accuracy": 0.7505564413964748, "num_tokens": 273108290.0, "step": 2900 }, { "entropy": 1.403120481967926, "epoch": 0.6456264906539464, "grad_norm": 0.55078125, "learning_rate": 1.773513753327418e-05, "loss": 0.9769, "mean_token_accuracy": 0.746848201751709, "num_tokens": 274041696.0, "step": 2910 }, { "entropy": 1.4035961225628852, "epoch": 0.6478451383881524, "grad_norm": 0.53515625, "learning_rate": 1.7624223602484475e-05, "loss": 1.028, "mean_token_accuracy": 0.7384353429079056, "num_tokens": 274987065.0, "step": 2920 }, { "entropy": 1.3617764726281165, "epoch": 0.6500637861223584, "grad_norm": 0.55859375, "learning_rate": 1.7513309671694767e-05, "loss": 0.9566, "mean_token_accuracy": 0.7519726864993572, "num_tokens": 275931121.0, "step": 2930 }, { "entropy": 1.3632805831730366, "epoch": 0.6522824338565644, "grad_norm": 0.57421875, "learning_rate": 1.740239574090506e-05, "loss": 0.9548, "mean_token_accuracy": 0.751621701568365, "num_tokens": 276887903.0, "step": 2940 }, { "entropy": 1.3548810198903083, "epoch": 0.6545010815907705, "grad_norm": 0.51953125, "learning_rate": 1.7291481810115352e-05, "loss": 0.9696, "mean_token_accuracy": 0.7500473111867905, "num_tokens": 277797512.0, "step": 2950 }, { "entropy": 1.3761564634740353, "epoch": 0.6567197293249765, "grad_norm": 0.53125, "learning_rate": 1.7180567879325644e-05, "loss": 0.9801, "mean_token_accuracy": 0.7459054350852966, "num_tokens": 278758206.0, "step": 2960 }, { "entropy": 1.3604497477412223, "epoch": 0.6589383770591825, "grad_norm": 0.58203125, "learning_rate": 1.706965394853594e-05, "loss": 0.9701, "mean_token_accuracy": 0.7494505539536476, "num_tokens": 279682051.0, "step": 2970 }, { "entropy": 1.365204595029354, "epoch": 0.6611570247933884, "grad_norm": 0.51953125, "learning_rate": 1.695874001774623e-05, "loss": 0.9354, "mean_token_accuracy": 0.7586885608732701, "num_tokens": 280616177.0, "step": 2980 }, { "entropy": 1.3509592905640602, "epoch": 0.6633756725275944, "grad_norm": 0.5625, "learning_rate": 1.6847826086956524e-05, "loss": 0.9502, "mean_token_accuracy": 0.7536125592887402, "num_tokens": 281562822.0, "step": 2990 }, { "entropy": 1.361519905924797, "epoch": 0.6655943202618004, "grad_norm": 0.52734375, "learning_rate": 1.6736912156166816e-05, "loss": 0.9355, "mean_token_accuracy": 0.755986961722374, "num_tokens": 282503409.0, "step": 3000 }, { "entropy": 1.351868186891079, "epoch": 0.6678129679960064, "grad_norm": 0.53125, "learning_rate": 1.6625998225377108e-05, "loss": 0.9363, "mean_token_accuracy": 0.7572803579270839, "num_tokens": 283411000.0, "step": 3010 }, { "entropy": 1.3627968370914458, "epoch": 0.6700316157302124, "grad_norm": 0.5234375, "learning_rate": 1.6515084294587403e-05, "loss": 0.954, "mean_token_accuracy": 0.7517252512276172, "num_tokens": 284363870.0, "step": 3020 }, { "entropy": 1.350551488995552, "epoch": 0.6722502634644184, "grad_norm": 0.51953125, "learning_rate": 1.6404170363797696e-05, "loss": 0.9243, "mean_token_accuracy": 0.7591398231685161, "num_tokens": 285307777.0, "step": 3030 }, { "entropy": 1.3500189259648323, "epoch": 0.6744689111986244, "grad_norm": 0.55078125, "learning_rate": 1.6293256433007988e-05, "loss": 0.938, "mean_token_accuracy": 0.7556370176374912, "num_tokens": 286253628.0, "step": 3040 }, { "entropy": 1.344145791977644, "epoch": 0.6766875589328304, "grad_norm": 0.53515625, "learning_rate": 1.618234250221828e-05, "loss": 0.9545, "mean_token_accuracy": 0.7536003112792968, "num_tokens": 287183850.0, "step": 3050 }, { "entropy": 1.3805773958563805, "epoch": 0.6789062066670365, "grad_norm": 0.546875, "learning_rate": 1.6071428571428572e-05, "loss": 0.9733, "mean_token_accuracy": 0.7494976818561554, "num_tokens": 288100206.0, "step": 3060 }, { "entropy": 1.344843527674675, "epoch": 0.6811248544012425, "grad_norm": 0.5078125, "learning_rate": 1.5960514640638864e-05, "loss": 0.9458, "mean_token_accuracy": 0.7523748345673085, "num_tokens": 289028085.0, "step": 3070 }, { "entropy": 1.3617902539670468, "epoch": 0.6833435021354485, "grad_norm": 0.5, "learning_rate": 1.5849600709849156e-05, "loss": 0.9616, "mean_token_accuracy": 0.7512977905571461, "num_tokens": 289972169.0, "step": 3080 }, { "entropy": 1.3891084134578704, "epoch": 0.6855621498696545, "grad_norm": 0.515625, "learning_rate": 1.573868677905945e-05, "loss": 0.984, "mean_token_accuracy": 0.7475369438529015, "num_tokens": 290890546.0, "step": 3090 }, { "entropy": 1.3548466391861438, "epoch": 0.6877807976038605, "grad_norm": 0.58984375, "learning_rate": 1.5627772848269744e-05, "loss": 0.9156, "mean_token_accuracy": 0.7610405057668685, "num_tokens": 291831317.0, "step": 3100 }, { "entropy": 1.3873838737607003, "epoch": 0.6899994453380665, "grad_norm": 0.5234375, "learning_rate": 1.5516858917480036e-05, "loss": 0.9849, "mean_token_accuracy": 0.7466968774795533, "num_tokens": 292773910.0, "step": 3110 }, { "entropy": 1.374309216439724, "epoch": 0.6922180930722724, "grad_norm": 0.56640625, "learning_rate": 1.5405944986690328e-05, "loss": 0.9691, "mean_token_accuracy": 0.7497981458902359, "num_tokens": 293706792.0, "step": 3120 }, { "entropy": 1.3540133006870747, "epoch": 0.6944367408064784, "grad_norm": 0.54296875, "learning_rate": 1.529503105590062e-05, "loss": 0.9675, "mean_token_accuracy": 0.7486146375536918, "num_tokens": 294649185.0, "step": 3130 }, { "entropy": 1.3608009479939938, "epoch": 0.6966553885406844, "grad_norm": 0.5546875, "learning_rate": 1.5184117125110914e-05, "loss": 0.9602, "mean_token_accuracy": 0.7509834311902523, "num_tokens": 295571599.0, "step": 3140 }, { "entropy": 1.3526781380176545, "epoch": 0.6988740362748904, "grad_norm": 0.54296875, "learning_rate": 1.5073203194321208e-05, "loss": 0.9838, "mean_token_accuracy": 0.7449854724109173, "num_tokens": 296510262.0, "step": 3150 }, { "entropy": 1.3786792248487472, "epoch": 0.7010926840090964, "grad_norm": 0.5546875, "learning_rate": 1.4962289263531502e-05, "loss": 0.9576, "mean_token_accuracy": 0.7524838514626027, "num_tokens": 297462659.0, "step": 3160 }, { "entropy": 1.37396137714386, "epoch": 0.7033113317433024, "grad_norm": 0.51953125, "learning_rate": 1.4851375332741794e-05, "loss": 0.9655, "mean_token_accuracy": 0.7482963159680367, "num_tokens": 298410420.0, "step": 3170 }, { "entropy": 1.3479665741324425, "epoch": 0.7055299794775085, "grad_norm": 0.5234375, "learning_rate": 1.4740461401952086e-05, "loss": 0.9735, "mean_token_accuracy": 0.7480811208486557, "num_tokens": 299335997.0, "step": 3180 }, { "entropy": 1.4149656519293785, "epoch": 0.7077486272117145, "grad_norm": 0.4921875, "learning_rate": 1.4629547471162378e-05, "loss": 0.9868, "mean_token_accuracy": 0.7450309813022613, "num_tokens": 300300688.0, "step": 3190 }, { "entropy": 1.3166653901338576, "epoch": 0.7099672749459205, "grad_norm": 0.52734375, "learning_rate": 1.4518633540372672e-05, "loss": 0.9244, "mean_token_accuracy": 0.7599585182964802, "num_tokens": 301275137.0, "step": 3200 }, { "entropy": 1.3608283437788486, "epoch": 0.7121859226801265, "grad_norm": 0.53125, "learning_rate": 1.4407719609582964e-05, "loss": 0.9792, "mean_token_accuracy": 0.7479373283684254, "num_tokens": 302208727.0, "step": 3210 }, { "entropy": 1.3675961531698704, "epoch": 0.7144045704143325, "grad_norm": 0.53125, "learning_rate": 1.4296805678793256e-05, "loss": 0.9522, "mean_token_accuracy": 0.7523340001702309, "num_tokens": 303160006.0, "step": 3220 }, { "entropy": 1.3157847836613654, "epoch": 0.7166232181485385, "grad_norm": 0.51953125, "learning_rate": 1.4185891748003548e-05, "loss": 0.9469, "mean_token_accuracy": 0.7520846240222454, "num_tokens": 304102524.0, "step": 3230 }, { "entropy": 1.3752693004906178, "epoch": 0.7188418658827445, "grad_norm": 0.52734375, "learning_rate": 1.4074977817213844e-05, "loss": 0.9426, "mean_token_accuracy": 0.753889911621809, "num_tokens": 305042287.0, "step": 3240 }, { "entropy": 1.3292134046554565, "epoch": 0.7210605136169504, "grad_norm": 0.61328125, "learning_rate": 1.3964063886424136e-05, "loss": 0.9464, "mean_token_accuracy": 0.754455479234457, "num_tokens": 305988003.0, "step": 3250 }, { "entropy": 1.3723205238580705, "epoch": 0.7232791613511564, "grad_norm": 0.578125, "learning_rate": 1.3853149955634428e-05, "loss": 0.9942, "mean_token_accuracy": 0.7461909614503384, "num_tokens": 306927584.0, "step": 3260 }, { "entropy": 1.3628524258732795, "epoch": 0.7254978090853624, "grad_norm": 0.56640625, "learning_rate": 1.374223602484472e-05, "loss": 0.9594, "mean_token_accuracy": 0.7528522469103336, "num_tokens": 307863697.0, "step": 3270 }, { "entropy": 1.353959833085537, "epoch": 0.7277164568195684, "grad_norm": 0.5390625, "learning_rate": 1.3631322094055012e-05, "loss": 0.9472, "mean_token_accuracy": 0.7561062417924405, "num_tokens": 308808276.0, "step": 3280 }, { "entropy": 1.3523946583271027, "epoch": 0.7299351045537745, "grad_norm": 0.470703125, "learning_rate": 1.3520408163265308e-05, "loss": 0.9578, "mean_token_accuracy": 0.7514262087643147, "num_tokens": 309773086.0, "step": 3290 }, { "entropy": 1.3321008674800396, "epoch": 0.7321537522879805, "grad_norm": 0.54296875, "learning_rate": 1.34094942324756e-05, "loss": 0.9513, "mean_token_accuracy": 0.7530274912714958, "num_tokens": 310728967.0, "step": 3300 }, { "entropy": 1.3726357147097588, "epoch": 0.7343724000221865, "grad_norm": 0.55078125, "learning_rate": 1.3298580301685892e-05, "loss": 0.9519, "mean_token_accuracy": 0.7526456661522388, "num_tokens": 311671029.0, "step": 3310 }, { "entropy": 1.3460698679089547, "epoch": 0.7365910477563925, "grad_norm": 0.59765625, "learning_rate": 1.3187666370896184e-05, "loss": 0.977, "mean_token_accuracy": 0.7480454221367836, "num_tokens": 312608775.0, "step": 3320 }, { "entropy": 1.358740784227848, "epoch": 0.7388096954905985, "grad_norm": 0.54296875, "learning_rate": 1.3076752440106476e-05, "loss": 0.9388, "mean_token_accuracy": 0.7564548753201962, "num_tokens": 313562050.0, "step": 3330 }, { "entropy": 1.3844745293259622, "epoch": 0.7410283432248045, "grad_norm": 0.5703125, "learning_rate": 1.2965838509316772e-05, "loss": 0.9834, "mean_token_accuracy": 0.7466944210231304, "num_tokens": 314509120.0, "step": 3340 }, { "entropy": 1.3659690007567407, "epoch": 0.7432469909590105, "grad_norm": 0.51171875, "learning_rate": 1.2854924578527064e-05, "loss": 0.941, "mean_token_accuracy": 0.7546365484595299, "num_tokens": 315486453.0, "step": 3350 }, { "entropy": 1.3873593926429748, "epoch": 0.7454656386932165, "grad_norm": 0.5390625, "learning_rate": 1.2744010647737356e-05, "loss": 0.985, "mean_token_accuracy": 0.7476673908531666, "num_tokens": 316431477.0, "step": 3360 }, { "entropy": 1.3676550433039665, "epoch": 0.7476842864274225, "grad_norm": 0.5234375, "learning_rate": 1.2633096716947648e-05, "loss": 0.9627, "mean_token_accuracy": 0.7511092610657215, "num_tokens": 317357092.0, "step": 3370 }, { "entropy": 1.3835733927786351, "epoch": 0.7499029341616285, "grad_norm": 0.55078125, "learning_rate": 1.2522182786157944e-05, "loss": 0.948, "mean_token_accuracy": 0.7537398427724838, "num_tokens": 318280117.0, "step": 3380 }, { "entropy": 1.3766888722777366, "epoch": 0.7521215818958344, "grad_norm": 0.55859375, "learning_rate": 1.2411268855368236e-05, "loss": 0.9683, "mean_token_accuracy": 0.7515489347279072, "num_tokens": 319203557.0, "step": 3390 }, { "entropy": 1.357860617339611, "epoch": 0.7543402296300404, "grad_norm": 0.515625, "learning_rate": 1.2300354924578528e-05, "loss": 0.9535, "mean_token_accuracy": 0.752687606215477, "num_tokens": 320136852.0, "step": 3400 }, { "entropy": 1.3469961121678353, "epoch": 0.7565588773642465, "grad_norm": 0.54296875, "learning_rate": 1.218944099378882e-05, "loss": 0.9459, "mean_token_accuracy": 0.7537472225725651, "num_tokens": 321106324.0, "step": 3410 }, { "entropy": 1.334907030314207, "epoch": 0.7587775250984525, "grad_norm": 0.55078125, "learning_rate": 1.2078527062999114e-05, "loss": 0.9359, "mean_token_accuracy": 0.7552877001464366, "num_tokens": 322042151.0, "step": 3420 }, { "entropy": 1.3580046392977239, "epoch": 0.7609961728326585, "grad_norm": 0.53125, "learning_rate": 1.1967613132209406e-05, "loss": 0.9404, "mean_token_accuracy": 0.7551106229424477, "num_tokens": 322952370.0, "step": 3430 }, { "entropy": 1.3473434820771217, "epoch": 0.7632148205668645, "grad_norm": 0.54296875, "learning_rate": 1.18566992014197e-05, "loss": 0.9527, "mean_token_accuracy": 0.7552920714020729, "num_tokens": 323909905.0, "step": 3440 }, { "entropy": 1.388922219723463, "epoch": 0.7654334683010705, "grad_norm": 0.56640625, "learning_rate": 1.1745785270629992e-05, "loss": 0.9835, "mean_token_accuracy": 0.7475064925849437, "num_tokens": 324843712.0, "step": 3450 }, { "entropy": 1.3555053889751434, "epoch": 0.7676521160352765, "grad_norm": 0.55078125, "learning_rate": 1.1634871339840284e-05, "loss": 0.9791, "mean_token_accuracy": 0.747514633089304, "num_tokens": 325763726.0, "step": 3460 }, { "entropy": 1.4119513988494874, "epoch": 0.7698707637694825, "grad_norm": 0.5703125, "learning_rate": 1.1523957409050576e-05, "loss": 0.9634, "mean_token_accuracy": 0.752115435898304, "num_tokens": 326704959.0, "step": 3470 }, { "entropy": 1.358751341700554, "epoch": 0.7720894115036885, "grad_norm": 0.5078125, "learning_rate": 1.141304347826087e-05, "loss": 0.9502, "mean_token_accuracy": 0.752843676507473, "num_tokens": 327654129.0, "step": 3480 }, { "entropy": 1.373246306180954, "epoch": 0.7743080592378945, "grad_norm": 0.56640625, "learning_rate": 1.1302129547471162e-05, "loss": 0.9896, "mean_token_accuracy": 0.7471863307058811, "num_tokens": 328577246.0, "step": 3490 }, { "entropy": 1.3401599921286107, "epoch": 0.7765267069721005, "grad_norm": 0.51171875, "learning_rate": 1.1191215616681455e-05, "loss": 0.9046, "mean_token_accuracy": 0.7622545510530472, "num_tokens": 329531531.0, "step": 3500 }, { "entropy": 1.403013862669468, "epoch": 0.7787453547063065, "grad_norm": 0.5234375, "learning_rate": 1.1080301685891748e-05, "loss": 1.0199, "mean_token_accuracy": 0.7393336437642575, "num_tokens": 330493898.0, "step": 3510 }, { "entropy": 1.3496449366211891, "epoch": 0.7809640024405126, "grad_norm": 0.5859375, "learning_rate": 1.096938775510204e-05, "loss": 0.9422, "mean_token_accuracy": 0.7540981650352478, "num_tokens": 331418294.0, "step": 3520 }, { "entropy": 1.344119517505169, "epoch": 0.7831826501747186, "grad_norm": 0.53515625, "learning_rate": 1.0858473824312334e-05, "loss": 0.9386, "mean_token_accuracy": 0.757073562592268, "num_tokens": 332378464.0, "step": 3530 }, { "entropy": 1.3490448504686356, "epoch": 0.7854012979089245, "grad_norm": 0.53125, "learning_rate": 1.0747559893522626e-05, "loss": 0.9457, "mean_token_accuracy": 0.7535859100520611, "num_tokens": 333318703.0, "step": 3540 }, { "entropy": 1.3718070283532142, "epoch": 0.7876199456431305, "grad_norm": 0.546875, "learning_rate": 1.063664596273292e-05, "loss": 0.9685, "mean_token_accuracy": 0.7511322259902954, "num_tokens": 334255047.0, "step": 3550 }, { "entropy": 1.3711335480213165, "epoch": 0.7898385933773365, "grad_norm": 0.515625, "learning_rate": 1.0525732031943212e-05, "loss": 0.9743, "mean_token_accuracy": 0.7471988372504711, "num_tokens": 335189458.0, "step": 3560 }, { "entropy": 1.3890349462628364, "epoch": 0.7920572411115425, "grad_norm": 0.53125, "learning_rate": 1.0414818101153505e-05, "loss": 0.9872, "mean_token_accuracy": 0.7452017098665238, "num_tokens": 336139155.0, "step": 3570 }, { "entropy": 1.336748766899109, "epoch": 0.7942758888457485, "grad_norm": 0.5390625, "learning_rate": 1.0303904170363798e-05, "loss": 0.9194, "mean_token_accuracy": 0.7595400720834732, "num_tokens": 337103166.0, "step": 3580 }, { "entropy": 1.3947007723152638, "epoch": 0.7964945365799545, "grad_norm": 0.53515625, "learning_rate": 1.019299023957409e-05, "loss": 0.9857, "mean_token_accuracy": 0.7481105640530586, "num_tokens": 338049665.0, "step": 3590 }, { "entropy": 1.3394004009664058, "epoch": 0.7987131843141605, "grad_norm": 0.5546875, "learning_rate": 1.0082076308784384e-05, "loss": 0.9501, "mean_token_accuracy": 0.7537369303405285, "num_tokens": 339030359.0, "step": 3600 }, { "entropy": 1.4002343088388443, "epoch": 0.8009318320483665, "grad_norm": 0.5625, "learning_rate": 9.971162377994676e-06, "loss": 0.9899, "mean_token_accuracy": 0.7460181936621666, "num_tokens": 339965846.0, "step": 3610 }, { "entropy": 1.3751978531479836, "epoch": 0.8031504797825725, "grad_norm": 0.53125, "learning_rate": 9.86024844720497e-06, "loss": 0.9663, "mean_token_accuracy": 0.7495487280189991, "num_tokens": 340909085.0, "step": 3620 }, { "entropy": 1.3296589955687523, "epoch": 0.8053691275167785, "grad_norm": 0.5390625, "learning_rate": 9.749334516415262e-06, "loss": 0.9116, "mean_token_accuracy": 0.7615578956902027, "num_tokens": 341836396.0, "step": 3630 }, { "entropy": 1.3545545935630798, "epoch": 0.8075877752509846, "grad_norm": 0.5546875, "learning_rate": 9.638420585625555e-06, "loss": 0.946, "mean_token_accuracy": 0.7542130470275878, "num_tokens": 342759623.0, "step": 3640 }, { "entropy": 1.3891134530305862, "epoch": 0.8098064229851906, "grad_norm": 0.57421875, "learning_rate": 9.527506654835848e-06, "loss": 1.0098, "mean_token_accuracy": 0.7399868927896023, "num_tokens": 343714548.0, "step": 3650 }, { "entropy": 1.3653203830122949, "epoch": 0.8120250707193966, "grad_norm": 0.53125, "learning_rate": 9.41659272404614e-06, "loss": 0.9689, "mean_token_accuracy": 0.747505272179842, "num_tokens": 344676512.0, "step": 3660 }, { "entropy": 1.3524901941418648, "epoch": 0.8142437184536025, "grad_norm": 0.5625, "learning_rate": 9.305678793256434e-06, "loss": 0.9416, "mean_token_accuracy": 0.7552405230700969, "num_tokens": 345609814.0, "step": 3670 }, { "entropy": 1.3355680212378502, "epoch": 0.8164623661878085, "grad_norm": 0.55078125, "learning_rate": 9.194764862466726e-06, "loss": 0.9409, "mean_token_accuracy": 0.7546605832874775, "num_tokens": 346552116.0, "step": 3680 }, { "entropy": 1.3585198432207108, "epoch": 0.8186810139220145, "grad_norm": 0.53125, "learning_rate": 9.08385093167702e-06, "loss": 0.9451, "mean_token_accuracy": 0.7566642910242081, "num_tokens": 347476547.0, "step": 3690 }, { "entropy": 1.3646457374095917, "epoch": 0.8208996616562205, "grad_norm": 0.56640625, "learning_rate": 8.972937000887312e-06, "loss": 0.9328, "mean_token_accuracy": 0.757544395327568, "num_tokens": 348402286.0, "step": 3700 }, { "entropy": 1.4008646070957185, "epoch": 0.8231183093904265, "grad_norm": 0.55859375, "learning_rate": 8.862023070097605e-06, "loss": 1.0166, "mean_token_accuracy": 0.7399243280291558, "num_tokens": 349350422.0, "step": 3710 }, { "entropy": 1.305922406166792, "epoch": 0.8253369571246325, "grad_norm": 0.53125, "learning_rate": 8.751109139307898e-06, "loss": 0.9002, "mean_token_accuracy": 0.7653868660330773, "num_tokens": 350307992.0, "step": 3720 }, { "entropy": 1.344923496246338, "epoch": 0.8275556048588385, "grad_norm": 0.52734375, "learning_rate": 8.64019520851819e-06, "loss": 0.923, "mean_token_accuracy": 0.7606289356946945, "num_tokens": 351239736.0, "step": 3730 }, { "entropy": 1.356829535961151, "epoch": 0.8297742525930445, "grad_norm": 0.54296875, "learning_rate": 8.529281277728483e-06, "loss": 0.9208, "mean_token_accuracy": 0.7572924271225929, "num_tokens": 352175438.0, "step": 3740 }, { "entropy": 1.371825471520424, "epoch": 0.8319929003272506, "grad_norm": 0.55859375, "learning_rate": 8.418367346938775e-06, "loss": 0.9769, "mean_token_accuracy": 0.7484906286001205, "num_tokens": 353093469.0, "step": 3750 }, { "entropy": 1.3505297660827638, "epoch": 0.8342115480614566, "grad_norm": 0.55078125, "learning_rate": 8.307453416149069e-06, "loss": 0.9634, "mean_token_accuracy": 0.748659697920084, "num_tokens": 354042260.0, "step": 3760 }, { "entropy": 1.3741331085562707, "epoch": 0.8364301957956626, "grad_norm": 0.5390625, "learning_rate": 8.19653948535936e-06, "loss": 0.982, "mean_token_accuracy": 0.7466577455401421, "num_tokens": 354960941.0, "step": 3770 }, { "entropy": 1.3521684527397155, "epoch": 0.8386488435298686, "grad_norm": 0.54296875, "learning_rate": 8.085625554569655e-06, "loss": 0.951, "mean_token_accuracy": 0.7545395441353321, "num_tokens": 355899405.0, "step": 3780 }, { "entropy": 1.3922821909189225, "epoch": 0.8408674912640746, "grad_norm": 0.5390625, "learning_rate": 7.974711623779947e-06, "loss": 0.9774, "mean_token_accuracy": 0.7484460555016994, "num_tokens": 356837111.0, "step": 3790 }, { "entropy": 1.341869878768921, "epoch": 0.8430861389982806, "grad_norm": 0.50390625, "learning_rate": 7.863797692990239e-06, "loss": 0.9371, "mean_token_accuracy": 0.7555838227272034, "num_tokens": 357775995.0, "step": 3800 }, { "entropy": 1.3769854843616485, "epoch": 0.8453047867324865, "grad_norm": 0.5546875, "learning_rate": 7.752883762200533e-06, "loss": 0.9788, "mean_token_accuracy": 0.7470424689352513, "num_tokens": 358730556.0, "step": 3810 }, { "entropy": 1.3654131770133973, "epoch": 0.8475234344666925, "grad_norm": 0.59375, "learning_rate": 7.641969831410825e-06, "loss": 0.9543, "mean_token_accuracy": 0.7543313026428222, "num_tokens": 359702857.0, "step": 3820 }, { "entropy": 1.3479675091803074, "epoch": 0.8497420822008985, "grad_norm": 0.5078125, "learning_rate": 7.5310559006211186e-06, "loss": 0.9434, "mean_token_accuracy": 0.7545025050640106, "num_tokens": 360637451.0, "step": 3830 }, { "entropy": 1.368970339745283, "epoch": 0.8519607299351045, "grad_norm": 0.53515625, "learning_rate": 7.420141969831411e-06, "loss": 0.9585, "mean_token_accuracy": 0.7518557466566562, "num_tokens": 361574227.0, "step": 3840 }, { "entropy": 1.3598952896893024, "epoch": 0.8541793776693105, "grad_norm": 0.5234375, "learning_rate": 7.3092280390417045e-06, "loss": 0.9427, "mean_token_accuracy": 0.754470182955265, "num_tokens": 362506193.0, "step": 3850 }, { "entropy": 1.3638700023293495, "epoch": 0.8563980254035165, "grad_norm": 0.5390625, "learning_rate": 7.198314108251997e-06, "loss": 0.9701, "mean_token_accuracy": 0.7472980074584484, "num_tokens": 363441282.0, "step": 3860 }, { "entropy": 1.3546796232461928, "epoch": 0.8586166731377226, "grad_norm": 0.4765625, "learning_rate": 7.0874001774622905e-06, "loss": 0.9753, "mean_token_accuracy": 0.7478179946541786, "num_tokens": 364393822.0, "step": 3870 }, { "entropy": 1.350717130303383, "epoch": 0.8608353208719286, "grad_norm": 0.55859375, "learning_rate": 6.976486246672583e-06, "loss": 0.9486, "mean_token_accuracy": 0.7572776488959789, "num_tokens": 365331779.0, "step": 3880 }, { "entropy": 1.3585814163088799, "epoch": 0.8630539686061346, "grad_norm": 0.5859375, "learning_rate": 6.865572315882875e-06, "loss": 0.9629, "mean_token_accuracy": 0.748991634696722, "num_tokens": 366257855.0, "step": 3890 }, { "entropy": 1.3992498129606248, "epoch": 0.8652726163403406, "grad_norm": 0.58203125, "learning_rate": 6.7546583850931686e-06, "loss": 0.9949, "mean_token_accuracy": 0.7451303206384182, "num_tokens": 367181436.0, "step": 3900 }, { "entropy": 1.3461244717240333, "epoch": 0.8674912640745466, "grad_norm": 0.52734375, "learning_rate": 6.643744454303461e-06, "loss": 0.9381, "mean_token_accuracy": 0.7554601080715656, "num_tokens": 368128436.0, "step": 3910 }, { "entropy": 1.3403765760362147, "epoch": 0.8697099118087526, "grad_norm": 0.5390625, "learning_rate": 6.532830523513754e-06, "loss": 0.9265, "mean_token_accuracy": 0.7592454843223095, "num_tokens": 369092081.0, "step": 3920 }, { "entropy": 1.3784636914730073, "epoch": 0.8719285595429586, "grad_norm": 0.54296875, "learning_rate": 6.421916592724047e-06, "loss": 0.9607, "mean_token_accuracy": 0.751040443778038, "num_tokens": 370022755.0, "step": 3930 }, { "entropy": 1.3625924080610274, "epoch": 0.8741472072771646, "grad_norm": 0.56640625, "learning_rate": 6.31100266193434e-06, "loss": 0.9696, "mean_token_accuracy": 0.7501497231423855, "num_tokens": 370963738.0, "step": 3940 }, { "entropy": 1.3465173587203025, "epoch": 0.8763658550113705, "grad_norm": 0.56640625, "learning_rate": 6.200088731144632e-06, "loss": 0.9578, "mean_token_accuracy": 0.7541100673377514, "num_tokens": 371888931.0, "step": 3950 }, { "entropy": 1.3527381241321563, "epoch": 0.8785845027455765, "grad_norm": 0.5390625, "learning_rate": 6.089174800354925e-06, "loss": 0.9467, "mean_token_accuracy": 0.7544343665242195, "num_tokens": 372849693.0, "step": 3960 }, { "entropy": 1.3818270325660706, "epoch": 0.8808031504797825, "grad_norm": 0.55078125, "learning_rate": 5.978260869565218e-06, "loss": 0.9551, "mean_token_accuracy": 0.7534758277237416, "num_tokens": 373792435.0, "step": 3970 }, { "entropy": 1.3504199832677841, "epoch": 0.8830217982139886, "grad_norm": 0.5234375, "learning_rate": 5.867346938775511e-06, "loss": 0.9445, "mean_token_accuracy": 0.756835724413395, "num_tokens": 374746035.0, "step": 3980 }, { "entropy": 1.3496798947453499, "epoch": 0.8852404459481946, "grad_norm": 0.53125, "learning_rate": 5.756433007985803e-06, "loss": 0.9419, "mean_token_accuracy": 0.7544379711151123, "num_tokens": 375703477.0, "step": 3990 }, { "entropy": 1.3732656255364417, "epoch": 0.8874590936824006, "grad_norm": 0.546875, "learning_rate": 5.645519077196096e-06, "loss": 0.9604, "mean_token_accuracy": 0.751821743696928, "num_tokens": 376636971.0, "step": 4000 }, { "entropy": 1.375483873486519, "epoch": 0.8896777414166066, "grad_norm": 0.52734375, "learning_rate": 5.534605146406389e-06, "loss": 0.9671, "mean_token_accuracy": 0.7487936913967133, "num_tokens": 377588517.0, "step": 4010 }, { "entropy": 1.3773034647107125, "epoch": 0.8918963891508126, "grad_norm": 0.55859375, "learning_rate": 5.423691215616682e-06, "loss": 0.9667, "mean_token_accuracy": 0.7501926451921463, "num_tokens": 378524822.0, "step": 4020 }, { "entropy": 1.3265659905970097, "epoch": 0.8941150368850186, "grad_norm": 0.54296875, "learning_rate": 5.312777284826975e-06, "loss": 0.9452, "mean_token_accuracy": 0.7550780981779098, "num_tokens": 379505593.0, "step": 4030 }, { "entropy": 1.335418175160885, "epoch": 0.8963336846192246, "grad_norm": 0.58203125, "learning_rate": 5.201863354037268e-06, "loss": 0.9482, "mean_token_accuracy": 0.7534942403435707, "num_tokens": 380473052.0, "step": 4040 }, { "entropy": 1.3610256776213645, "epoch": 0.8985523323534306, "grad_norm": 0.5546875, "learning_rate": 5.090949423247561e-06, "loss": 0.958, "mean_token_accuracy": 0.7533226810395718, "num_tokens": 381443551.0, "step": 4050 }, { "entropy": 1.3507319584488868, "epoch": 0.9007709800876366, "grad_norm": 0.55078125, "learning_rate": 4.980035492457853e-06, "loss": 0.9489, "mean_token_accuracy": 0.7544699974358082, "num_tokens": 382378572.0, "step": 4060 }, { "entropy": 1.3752561420202256, "epoch": 0.9029896278218426, "grad_norm": 0.53515625, "learning_rate": 4.869121561668146e-06, "loss": 0.9519, "mean_token_accuracy": 0.7520841076970101, "num_tokens": 383311592.0, "step": 4070 }, { "entropy": 1.3476091951131821, "epoch": 0.9052082755560485, "grad_norm": 0.5234375, "learning_rate": 4.758207630878438e-06, "loss": 0.9415, "mean_token_accuracy": 0.7553776867687703, "num_tokens": 384248057.0, "step": 4080 }, { "entropy": 1.3605633400380612, "epoch": 0.9074269232902545, "grad_norm": 0.56640625, "learning_rate": 4.647293700088731e-06, "loss": 0.9244, "mean_token_accuracy": 0.7569485224783421, "num_tokens": 385189392.0, "step": 4090 }, { "entropy": 1.365138278901577, "epoch": 0.9096455710244606, "grad_norm": 0.5078125, "learning_rate": 4.536379769299024e-06, "loss": 0.9531, "mean_token_accuracy": 0.7533132433891296, "num_tokens": 386110731.0, "step": 4100 }, { "entropy": 1.3613657392561436, "epoch": 0.9118642187586666, "grad_norm": 0.55078125, "learning_rate": 4.425465838509317e-06, "loss": 0.943, "mean_token_accuracy": 0.7544411860406399, "num_tokens": 387057495.0, "step": 4110 }, { "entropy": 1.3290772818028926, "epoch": 0.9140828664928726, "grad_norm": 0.5546875, "learning_rate": 4.31455190771961e-06, "loss": 0.9183, "mean_token_accuracy": 0.7608602307736874, "num_tokens": 388021016.0, "step": 4120 }, { "entropy": 1.3475232422351837, "epoch": 0.9163015142270786, "grad_norm": 0.52734375, "learning_rate": 4.203637976929903e-06, "loss": 0.9411, "mean_token_accuracy": 0.7554165907204151, "num_tokens": 388924467.0, "step": 4130 }, { "entropy": 1.3602249071002006, "epoch": 0.9185201619612846, "grad_norm": 0.51953125, "learning_rate": 4.092724046140196e-06, "loss": 0.9307, "mean_token_accuracy": 0.7554832518100738, "num_tokens": 389879904.0, "step": 4140 }, { "entropy": 1.332291903346777, "epoch": 0.9207388096954906, "grad_norm": 0.5234375, "learning_rate": 3.981810115350488e-06, "loss": 0.9559, "mean_token_accuracy": 0.752610693871975, "num_tokens": 390857519.0, "step": 4150 }, { "entropy": 1.3272889666259289, "epoch": 0.9229574574296966, "grad_norm": 0.53125, "learning_rate": 3.870896184560781e-06, "loss": 0.9306, "mean_token_accuracy": 0.7583662964403629, "num_tokens": 391792594.0, "step": 4160 }, { "entropy": 1.3560280472040176, "epoch": 0.9251761051639026, "grad_norm": 0.5390625, "learning_rate": 3.759982253771074e-06, "loss": 0.9577, "mean_token_accuracy": 0.7537056483328343, "num_tokens": 392749013.0, "step": 4170 }, { "entropy": 1.3662122264504433, "epoch": 0.9273947528981086, "grad_norm": 0.53125, "learning_rate": 3.6490683229813664e-06, "loss": 0.9715, "mean_token_accuracy": 0.7491789266467095, "num_tokens": 393666251.0, "step": 4180 }, { "entropy": 1.31452574133873, "epoch": 0.9296134006323146, "grad_norm": 0.5390625, "learning_rate": 3.5381543921916594e-06, "loss": 0.9006, "mean_token_accuracy": 0.7624947860836983, "num_tokens": 394596882.0, "step": 4190 }, { "entropy": 1.3401482120156287, "epoch": 0.9318320483665206, "grad_norm": 0.53125, "learning_rate": 3.4272404614019524e-06, "loss": 0.9282, "mean_token_accuracy": 0.756990148127079, "num_tokens": 395526506.0, "step": 4200 }, { "entropy": 1.4013796046376228, "epoch": 0.9340506961007266, "grad_norm": 0.54296875, "learning_rate": 3.3163265306122454e-06, "loss": 0.9932, "mean_token_accuracy": 0.7460516929626465, "num_tokens": 396460730.0, "step": 4210 }, { "entropy": 1.3630273953080176, "epoch": 0.9362693438349327, "grad_norm": 0.5390625, "learning_rate": 3.2054125998225384e-06, "loss": 0.9513, "mean_token_accuracy": 0.7526130631566048, "num_tokens": 397410429.0, "step": 4220 }, { "entropy": 1.2898547686636448, "epoch": 0.9384879915691386, "grad_norm": 0.5078125, "learning_rate": 3.094498669032831e-06, "loss": 0.9228, "mean_token_accuracy": 0.7592225328087807, "num_tokens": 398358920.0, "step": 4230 }, { "entropy": 1.3584180302917956, "epoch": 0.9407066393033446, "grad_norm": 0.62890625, "learning_rate": 2.9835847382431235e-06, "loss": 0.9683, "mean_token_accuracy": 0.7511270597577095, "num_tokens": 399332691.0, "step": 4240 }, { "entropy": 1.3851253606379033, "epoch": 0.9429252870375506, "grad_norm": 0.55078125, "learning_rate": 2.872670807453416e-06, "loss": 0.974, "mean_token_accuracy": 0.7480023667216301, "num_tokens": 400266541.0, "step": 4250 }, { "entropy": 1.3538463555276394, "epoch": 0.9451439347717566, "grad_norm": 0.52734375, "learning_rate": 2.761756876663709e-06, "loss": 0.9443, "mean_token_accuracy": 0.7554497793316841, "num_tokens": 401201822.0, "step": 4260 }, { "entropy": 1.3774816602468491, "epoch": 0.9473625825059626, "grad_norm": 0.5390625, "learning_rate": 2.650842945874002e-06, "loss": 0.9843, "mean_token_accuracy": 0.746273136138916, "num_tokens": 402152611.0, "step": 4270 }, { "entropy": 1.3171171061694622, "epoch": 0.9495812302401686, "grad_norm": 0.5234375, "learning_rate": 2.539929015084295e-06, "loss": 0.9111, "mean_token_accuracy": 0.7632594168186188, "num_tokens": 403118617.0, "step": 4280 }, { "entropy": 1.3414073579013348, "epoch": 0.9517998779743746, "grad_norm": 0.55078125, "learning_rate": 2.4290150842945875e-06, "loss": 0.9402, "mean_token_accuracy": 0.7527715168893337, "num_tokens": 404056789.0, "step": 4290 }, { "entropy": 1.3378793716430664, "epoch": 0.9540185257085806, "grad_norm": 0.546875, "learning_rate": 2.3181011535048805e-06, "loss": 0.9353, "mean_token_accuracy": 0.7563605636358262, "num_tokens": 405004371.0, "step": 4300 }, { "entropy": 1.372169415652752, "epoch": 0.9562371734427866, "grad_norm": 0.51953125, "learning_rate": 2.207187222715173e-06, "loss": 0.9436, "mean_token_accuracy": 0.7551277004182338, "num_tokens": 405922059.0, "step": 4310 }, { "entropy": 1.3533624187111855, "epoch": 0.9584558211769926, "grad_norm": 0.5390625, "learning_rate": 2.096273291925466e-06, "loss": 0.9497, "mean_token_accuracy": 0.7533909723162651, "num_tokens": 406838792.0, "step": 4320 }, { "entropy": 1.3719367325305938, "epoch": 0.9606744689111987, "grad_norm": 0.51953125, "learning_rate": 1.9853593611357586e-06, "loss": 0.9919, "mean_token_accuracy": 0.7434499144554139, "num_tokens": 407786498.0, "step": 4330 }, { "entropy": 1.3563473880290986, "epoch": 0.9628931166454047, "grad_norm": 0.5234375, "learning_rate": 1.8744454303460516e-06, "loss": 0.9401, "mean_token_accuracy": 0.7530663572251797, "num_tokens": 408736948.0, "step": 4340 }, { "entropy": 1.347538560628891, "epoch": 0.9651117643796107, "grad_norm": 0.53125, "learning_rate": 1.7635314995563443e-06, "loss": 0.933, "mean_token_accuracy": 0.7575333446264267, "num_tokens": 409664542.0, "step": 4350 }, { "entropy": 1.3749746069312097, "epoch": 0.9673304121138167, "grad_norm": 0.51953125, "learning_rate": 1.6526175687666373e-06, "loss": 0.9698, "mean_token_accuracy": 0.7511622585356236, "num_tokens": 410602122.0, "step": 4360 }, { "entropy": 1.3442941211163997, "epoch": 0.9695490598480226, "grad_norm": 0.52734375, "learning_rate": 1.54170363797693e-06, "loss": 0.9572, "mean_token_accuracy": 0.7501766428351402, "num_tokens": 411528742.0, "step": 4370 }, { "entropy": 1.3314830370247364, "epoch": 0.9717677075822286, "grad_norm": 0.52734375, "learning_rate": 1.4307897071872228e-06, "loss": 0.9528, "mean_token_accuracy": 0.7543077766895294, "num_tokens": 412496866.0, "step": 4380 }, { "entropy": 1.362931652367115, "epoch": 0.9739863553164346, "grad_norm": 0.5390625, "learning_rate": 1.3198757763975156e-06, "loss": 0.9539, "mean_token_accuracy": 0.7541424036026001, "num_tokens": 413426074.0, "step": 4390 }, { "entropy": 1.314641258120537, "epoch": 0.9762050030506406, "grad_norm": 0.53515625, "learning_rate": 1.2089618456078084e-06, "loss": 0.9326, "mean_token_accuracy": 0.7566425338387489, "num_tokens": 414366936.0, "step": 4400 }, { "entropy": 1.3944153673946857, "epoch": 0.9784236507848466, "grad_norm": 0.546875, "learning_rate": 1.0980479148181013e-06, "loss": 0.9887, "mean_token_accuracy": 0.7444652430713177, "num_tokens": 415305379.0, "step": 4410 }, { "entropy": 1.3453952841460706, "epoch": 0.9806422985190526, "grad_norm": 0.49609375, "learning_rate": 9.871339840283939e-07, "loss": 0.9563, "mean_token_accuracy": 0.7523396387696266, "num_tokens": 416266511.0, "step": 4420 }, { "entropy": 1.3440303832292557, "epoch": 0.9828609462532586, "grad_norm": 0.53515625, "learning_rate": 8.762200532386869e-07, "loss": 0.9456, "mean_token_accuracy": 0.7547272637486457, "num_tokens": 417231789.0, "step": 4430 }, { "entropy": 1.367350959777832, "epoch": 0.9850795939874646, "grad_norm": 0.5546875, "learning_rate": 7.653061224489796e-07, "loss": 0.9693, "mean_token_accuracy": 0.7508193962275982, "num_tokens": 418194063.0, "step": 4440 }, { "entropy": 1.3651387616991997, "epoch": 0.9872982417216707, "grad_norm": 0.5703125, "learning_rate": 6.543921916592724e-07, "loss": 0.9463, "mean_token_accuracy": 0.7529610082507133, "num_tokens": 419114425.0, "step": 4450 }, { "entropy": 1.347716721892357, "epoch": 0.9895168894558767, "grad_norm": 0.490234375, "learning_rate": 5.434782608695653e-07, "loss": 0.9493, "mean_token_accuracy": 0.7544530227780342, "num_tokens": 420062804.0, "step": 4460 }, { "entropy": 1.323940635472536, "epoch": 0.9917355371900827, "grad_norm": 0.5234375, "learning_rate": 4.3256433007985804e-07, "loss": 0.9176, "mean_token_accuracy": 0.7596350736916065, "num_tokens": 420985488.0, "step": 4470 }, { "entropy": 1.3813497826457024, "epoch": 0.9939541849242887, "grad_norm": 0.54296875, "learning_rate": 3.2165039929015086e-07, "loss": 0.9854, "mean_token_accuracy": 0.7467473462224007, "num_tokens": 421920339.0, "step": 4480 }, { "entropy": 1.3907534167170525, "epoch": 0.9961728326584947, "grad_norm": 0.52734375, "learning_rate": 2.1073646850044365e-07, "loss": 0.9956, "mean_token_accuracy": 0.7431350871920586, "num_tokens": 422875342.0, "step": 4490 }, { "entropy": 1.4102192774415017, "epoch": 0.9983914803927006, "grad_norm": 0.546875, "learning_rate": 9.982253771073646e-08, "loss": 1.0, "mean_token_accuracy": 0.7438700333237648, "num_tokens": 423810727.0, "step": 4500 } ], "logging_steps": 10, "max_steps": 4508, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.665075636310376e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }