| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 4508, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.496292558312416, | |
| "epoch": 0.0022186477342060014, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 4.990017746228927e-05, | |
| "loss": 1.0869, | |
| "mean_token_accuracy": 0.725723172724247, | |
| "num_tokens": 925119.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.49339357316494, | |
| "epoch": 0.004437295468412003, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 4.978926353149956e-05, | |
| "loss": 1.0992, | |
| "mean_token_accuracy": 0.72515804246068, | |
| "num_tokens": 1846450.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.4267458260059356, | |
| "epoch": 0.006655943202618004, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.967834960070985e-05, | |
| "loss": 1.0639, | |
| "mean_token_accuracy": 0.7327800326049327, | |
| "num_tokens": 2786157.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.409786120057106, | |
| "epoch": 0.008874590936824005, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.9567435669920145e-05, | |
| "loss": 1.0149, | |
| "mean_token_accuracy": 0.7418569244444371, | |
| "num_tokens": 3732821.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.412144237756729, | |
| "epoch": 0.011093238671030008, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.945652173913044e-05, | |
| "loss": 1.0482, | |
| "mean_token_accuracy": 0.7331000074744225, | |
| "num_tokens": 4691429.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.434348814189434, | |
| "epoch": 0.013311886405236008, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.934560780834073e-05, | |
| "loss": 1.0588, | |
| "mean_token_accuracy": 0.7321462295949459, | |
| "num_tokens": 5627667.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.4207842454314232, | |
| "epoch": 0.01553053413944201, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.923469387755102e-05, | |
| "loss": 1.0427, | |
| "mean_token_accuracy": 0.7373643882572651, | |
| "num_tokens": 6554855.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.403839322924614, | |
| "epoch": 0.01774918187364801, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.9123779946761314e-05, | |
| "loss": 1.0559, | |
| "mean_token_accuracy": 0.7321439690887928, | |
| "num_tokens": 7519970.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.4707388430833817, | |
| "epoch": 0.019967829607854013, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.9012866015971606e-05, | |
| "loss": 1.0769, | |
| "mean_token_accuracy": 0.7276073284447193, | |
| "num_tokens": 8448434.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.4391525745391847, | |
| "epoch": 0.022186477342060015, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 4.8901952085181905e-05, | |
| "loss": 1.0773, | |
| "mean_token_accuracy": 0.7285938866436481, | |
| "num_tokens": 9372876.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.4438337981700897, | |
| "epoch": 0.024405125076266018, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 4.87910381543922e-05, | |
| "loss": 1.0468, | |
| "mean_token_accuracy": 0.7326018497347832, | |
| "num_tokens": 10316544.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.4177550919353963, | |
| "epoch": 0.026623772810472016, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.868012422360249e-05, | |
| "loss": 1.0585, | |
| "mean_token_accuracy": 0.7324540324509143, | |
| "num_tokens": 11247187.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.471057978272438, | |
| "epoch": 0.02884242054467802, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 4.856921029281278e-05, | |
| "loss": 1.0761, | |
| "mean_token_accuracy": 0.7275320313870907, | |
| "num_tokens": 12173265.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.3705433815717698, | |
| "epoch": 0.03106106827888402, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.845829636202307e-05, | |
| "loss": 0.9901, | |
| "mean_token_accuracy": 0.74607959613204, | |
| "num_tokens": 13130818.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.3908753886818885, | |
| "epoch": 0.03327971601309002, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.8347382431233365e-05, | |
| "loss": 1.0128, | |
| "mean_token_accuracy": 0.7397120602428913, | |
| "num_tokens": 14082298.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.3889487609267235, | |
| "epoch": 0.03549836374729602, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.823646850044366e-05, | |
| "loss": 1.0346, | |
| "mean_token_accuracy": 0.7391657814383507, | |
| "num_tokens": 15020735.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.3904974788427353, | |
| "epoch": 0.03771701148150203, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 4.812555456965395e-05, | |
| "loss": 1.0208, | |
| "mean_token_accuracy": 0.7383547216653824, | |
| "num_tokens": 15966964.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.4144786164164542, | |
| "epoch": 0.039935659215708026, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.801464063886424e-05, | |
| "loss": 1.0335, | |
| "mean_token_accuracy": 0.7372442841529846, | |
| "num_tokens": 16859851.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.4213875874876976, | |
| "epoch": 0.042154306949914025, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.7903726708074534e-05, | |
| "loss": 1.03, | |
| "mean_token_accuracy": 0.735869013518095, | |
| "num_tokens": 17767909.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.4285096868872642, | |
| "epoch": 0.04437295468412003, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.7792812777284826e-05, | |
| "loss": 1.0358, | |
| "mean_token_accuracy": 0.7385697923600674, | |
| "num_tokens": 18734689.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.4109881177544594, | |
| "epoch": 0.04659160241832603, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.768189884649512e-05, | |
| "loss": 1.0404, | |
| "mean_token_accuracy": 0.7372543781995773, | |
| "num_tokens": 19661481.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.3780835449695588, | |
| "epoch": 0.048810250152532035, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.757098491570541e-05, | |
| "loss": 1.0115, | |
| "mean_token_accuracy": 0.7417904600501061, | |
| "num_tokens": 20586364.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.3467031195759773, | |
| "epoch": 0.051028897886738034, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.74600709849157e-05, | |
| "loss": 0.9833, | |
| "mean_token_accuracy": 0.7490709364414215, | |
| "num_tokens": 21547442.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.424035993218422, | |
| "epoch": 0.05324754562094403, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.7349157054126e-05, | |
| "loss": 1.0266, | |
| "mean_token_accuracy": 0.7366823427379131, | |
| "num_tokens": 22502716.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.409425649046898, | |
| "epoch": 0.05546619335515004, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 4.7238243123336293e-05, | |
| "loss": 1.0304, | |
| "mean_token_accuracy": 0.7378800459206104, | |
| "num_tokens": 23426781.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.4083105452358722, | |
| "epoch": 0.05768484108935604, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.7127329192546586e-05, | |
| "loss": 1.0399, | |
| "mean_token_accuracy": 0.7385689981281758, | |
| "num_tokens": 24379852.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.3957985565066338, | |
| "epoch": 0.059903488823562036, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 4.701641526175688e-05, | |
| "loss": 1.0115, | |
| "mean_token_accuracy": 0.7397762380540371, | |
| "num_tokens": 25322772.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.376178003847599, | |
| "epoch": 0.06212213655776804, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.690550133096717e-05, | |
| "loss": 1.0075, | |
| "mean_token_accuracy": 0.7439929395914078, | |
| "num_tokens": 26287748.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.3666997633874416, | |
| "epoch": 0.06434078429197404, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.679458740017746e-05, | |
| "loss": 0.9893, | |
| "mean_token_accuracy": 0.7455881536006927, | |
| "num_tokens": 27239049.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.3740440711379052, | |
| "epoch": 0.06655943202618005, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.6683673469387754e-05, | |
| "loss": 1.0038, | |
| "mean_token_accuracy": 0.7433481432497502, | |
| "num_tokens": 28209460.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.3969299167394638, | |
| "epoch": 0.06877807976038605, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.6572759538598046e-05, | |
| "loss": 1.0058, | |
| "mean_token_accuracy": 0.7407899357378482, | |
| "num_tokens": 29152647.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.4039423167705536, | |
| "epoch": 0.07099672749459204, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.646184560780834e-05, | |
| "loss": 1.0078, | |
| "mean_token_accuracy": 0.7412165470421315, | |
| "num_tokens": 30065474.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.3215527072548867, | |
| "epoch": 0.07321537522879805, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.635093167701863e-05, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.752603680640459, | |
| "num_tokens": 31019135.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.4225746989250183, | |
| "epoch": 0.07543402296300405, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.624001774622893e-05, | |
| "loss": 1.0285, | |
| "mean_token_accuracy": 0.7391470916569233, | |
| "num_tokens": 31939707.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.3544291421771049, | |
| "epoch": 0.07765267069721005, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.612910381543922e-05, | |
| "loss": 0.968, | |
| "mean_token_accuracy": 0.7495346136391163, | |
| "num_tokens": 32871530.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.3839760735630988, | |
| "epoch": 0.07987131843141605, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 4.6018189884649514e-05, | |
| "loss": 0.9895, | |
| "mean_token_accuracy": 0.7453494131565094, | |
| "num_tokens": 33813137.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.414746204763651, | |
| "epoch": 0.08208996616562206, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.5907275953859806e-05, | |
| "loss": 1.025, | |
| "mean_token_accuracy": 0.7388280339539051, | |
| "num_tokens": 34771343.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.3732567429542542, | |
| "epoch": 0.08430861389982805, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.57963620230701e-05, | |
| "loss": 1.0058, | |
| "mean_token_accuracy": 0.7429468773305417, | |
| "num_tokens": 35713295.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.3798431143164636, | |
| "epoch": 0.08652726163403406, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.568544809228039e-05, | |
| "loss": 1.0094, | |
| "mean_token_accuracy": 0.7400036215782165, | |
| "num_tokens": 36651252.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.3924534171819687, | |
| "epoch": 0.08874590936824006, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.557453416149068e-05, | |
| "loss": 0.9963, | |
| "mean_token_accuracy": 0.7455349668860436, | |
| "num_tokens": 37592125.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.4052727609872817, | |
| "epoch": 0.09096455710244605, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.5463620230700974e-05, | |
| "loss": 1.0201, | |
| "mean_token_accuracy": 0.7395130477845668, | |
| "num_tokens": 38534139.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.418926975131035, | |
| "epoch": 0.09318320483665206, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 4.5352706299911266e-05, | |
| "loss": 1.0096, | |
| "mean_token_accuracy": 0.7400068089365959, | |
| "num_tokens": 39478516.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.3901023603975773, | |
| "epoch": 0.09540185257085806, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.5241792369121565e-05, | |
| "loss": 1.0051, | |
| "mean_token_accuracy": 0.7408906109631062, | |
| "num_tokens": 40443771.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.3361934393644332, | |
| "epoch": 0.09762050030506407, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.513087843833186e-05, | |
| "loss": 0.9356, | |
| "mean_token_accuracy": 0.7576181195676327, | |
| "num_tokens": 41385687.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.4129181623458862, | |
| "epoch": 0.09983914803927006, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 4.501996450754215e-05, | |
| "loss": 1.0427, | |
| "mean_token_accuracy": 0.7341138951480388, | |
| "num_tokens": 42316542.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.3746674314141274, | |
| "epoch": 0.10205779577347607, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.490905057675244e-05, | |
| "loss": 1.0078, | |
| "mean_token_accuracy": 0.7429340846836567, | |
| "num_tokens": 43252482.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.3764732837677003, | |
| "epoch": 0.10427644350768207, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 4.4798136645962734e-05, | |
| "loss": 1.0107, | |
| "mean_token_accuracy": 0.7416411705315114, | |
| "num_tokens": 44226551.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.4007945582270622, | |
| "epoch": 0.10649509124188807, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.4687222715173026e-05, | |
| "loss": 1.0161, | |
| "mean_token_accuracy": 0.7399431690573692, | |
| "num_tokens": 45171661.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.400729776918888, | |
| "epoch": 0.10871373897609407, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.457630878438332e-05, | |
| "loss": 1.01, | |
| "mean_token_accuracy": 0.7416381858289242, | |
| "num_tokens": 46102823.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.3740653365850448, | |
| "epoch": 0.11093238671030008, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 4.446539485359361e-05, | |
| "loss": 0.9948, | |
| "mean_token_accuracy": 0.7436435185372829, | |
| "num_tokens": 47038182.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.385279569029808, | |
| "epoch": 0.11315103444450607, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.43544809228039e-05, | |
| "loss": 0.9812, | |
| "mean_token_accuracy": 0.7462167225778102, | |
| "num_tokens": 47978197.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.355229352414608, | |
| "epoch": 0.11536968217871207, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 4.42435669920142e-05, | |
| "loss": 0.9756, | |
| "mean_token_accuracy": 0.748258039355278, | |
| "num_tokens": 48929164.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.340594267845154, | |
| "epoch": 0.11758832991291808, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.4132653061224493e-05, | |
| "loss": 0.9557, | |
| "mean_token_accuracy": 0.7537197224795819, | |
| "num_tokens": 49872840.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.3749348096549512, | |
| "epoch": 0.11980697764712407, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.4021739130434786e-05, | |
| "loss": 0.9964, | |
| "mean_token_accuracy": 0.7432300426065922, | |
| "num_tokens": 50820730.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.3660012029111386, | |
| "epoch": 0.12202562538133008, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 4.391082519964508e-05, | |
| "loss": 0.9864, | |
| "mean_token_accuracy": 0.7461238898336887, | |
| "num_tokens": 51758109.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.3850644059479236, | |
| "epoch": 0.12424427311553608, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.379991126885537e-05, | |
| "loss": 0.9998, | |
| "mean_token_accuracy": 0.7451605953276157, | |
| "num_tokens": 52706955.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.3438964366912842, | |
| "epoch": 0.1264629208497421, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.368899733806566e-05, | |
| "loss": 0.9759, | |
| "mean_token_accuracy": 0.7506989397108554, | |
| "num_tokens": 53654927.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.3823294579982757, | |
| "epoch": 0.12868156858394808, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.3578083407275954e-05, | |
| "loss": 1.002, | |
| "mean_token_accuracy": 0.7399756357073783, | |
| "num_tokens": 54607725.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.3431190609931947, | |
| "epoch": 0.13090021631815407, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 4.3467169476486246e-05, | |
| "loss": 0.9922, | |
| "mean_token_accuracy": 0.7472275733947754, | |
| "num_tokens": 55544472.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.386824431270361, | |
| "epoch": 0.1331188640523601, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.335625554569654e-05, | |
| "loss": 1.0184, | |
| "mean_token_accuracy": 0.7395146794617176, | |
| "num_tokens": 56492504.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.3801176637411117, | |
| "epoch": 0.13533751178656608, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 4.324534161490684e-05, | |
| "loss": 1.0153, | |
| "mean_token_accuracy": 0.740266764163971, | |
| "num_tokens": 57446818.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.3780507385730743, | |
| "epoch": 0.1375561595207721, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.313442768411713e-05, | |
| "loss": 1.0282, | |
| "mean_token_accuracy": 0.7379110969603062, | |
| "num_tokens": 58388320.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.3641229078173638, | |
| "epoch": 0.1397748072549781, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.302351375332742e-05, | |
| "loss": 0.9693, | |
| "mean_token_accuracy": 0.7492371432483196, | |
| "num_tokens": 59335625.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.3909922763705254, | |
| "epoch": 0.1419934549891841, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.2912599822537714e-05, | |
| "loss": 1.0178, | |
| "mean_token_accuracy": 0.7371242880821228, | |
| "num_tokens": 60271739.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.358926948904991, | |
| "epoch": 0.1442121027233901, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 4.2801685891748006e-05, | |
| "loss": 0.9545, | |
| "mean_token_accuracy": 0.7560696460306644, | |
| "num_tokens": 61229087.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.3929373525083064, | |
| "epoch": 0.1464307504575961, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.26907719609583e-05, | |
| "loss": 1.0238, | |
| "mean_token_accuracy": 0.7393973417580127, | |
| "num_tokens": 62178123.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.327741453051567, | |
| "epoch": 0.1486493981918021, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.257985803016859e-05, | |
| "loss": 0.9456, | |
| "mean_token_accuracy": 0.7535205587744713, | |
| "num_tokens": 63126423.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.3679818481206893, | |
| "epoch": 0.1508680459260081, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.246894409937888e-05, | |
| "loss": 0.9728, | |
| "mean_token_accuracy": 0.7467674180865288, | |
| "num_tokens": 64079693.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.381383018195629, | |
| "epoch": 0.1530866936602141, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.2358030168589174e-05, | |
| "loss": 1.0088, | |
| "mean_token_accuracy": 0.7418314486742019, | |
| "num_tokens": 65018338.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.3862957283854485, | |
| "epoch": 0.1553053413944201, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 4.224711623779947e-05, | |
| "loss": 1.0144, | |
| "mean_token_accuracy": 0.7403538078069687, | |
| "num_tokens": 65952052.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.3384159475564956, | |
| "epoch": 0.1575239891286261, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.2136202307009765e-05, | |
| "loss": 0.9922, | |
| "mean_token_accuracy": 0.7461440391838551, | |
| "num_tokens": 66921819.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.3640660651028156, | |
| "epoch": 0.1597426368628321, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.202528837622006e-05, | |
| "loss": 0.9682, | |
| "mean_token_accuracy": 0.7501711919903755, | |
| "num_tokens": 67876968.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.4149701073765755, | |
| "epoch": 0.1619612845970381, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 4.191437444543035e-05, | |
| "loss": 0.9993, | |
| "mean_token_accuracy": 0.7446250684559346, | |
| "num_tokens": 68806954.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.3790323272347451, | |
| "epoch": 0.16417993233124412, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 4.180346051464064e-05, | |
| "loss": 1.0043, | |
| "mean_token_accuracy": 0.7418724097311497, | |
| "num_tokens": 69713797.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.3732078664004803, | |
| "epoch": 0.1663985800654501, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.1692546583850934e-05, | |
| "loss": 0.9737, | |
| "mean_token_accuracy": 0.7472748421132565, | |
| "num_tokens": 70644616.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.3796279937028886, | |
| "epoch": 0.1686172277996561, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.1581632653061226e-05, | |
| "loss": 0.9888, | |
| "mean_token_accuracy": 0.7464235134422779, | |
| "num_tokens": 71614683.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.3841315507888794, | |
| "epoch": 0.17083587553386212, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 4.147071872227152e-05, | |
| "loss": 1.0505, | |
| "mean_token_accuracy": 0.7321801386773586, | |
| "num_tokens": 72565413.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.3817786656320095, | |
| "epoch": 0.1730545232680681, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.135980479148181e-05, | |
| "loss": 0.9787, | |
| "mean_token_accuracy": 0.7469952210783959, | |
| "num_tokens": 73526877.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.3620466977357863, | |
| "epoch": 0.1752731710022741, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.124889086069211e-05, | |
| "loss": 0.9949, | |
| "mean_token_accuracy": 0.7441352687776088, | |
| "num_tokens": 74473784.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.3590239346027375, | |
| "epoch": 0.17749181873648012, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.11379769299024e-05, | |
| "loss": 0.9791, | |
| "mean_token_accuracy": 0.748184335231781, | |
| "num_tokens": 75417836.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.3708465218544006, | |
| "epoch": 0.17971046647068611, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.1027062999112693e-05, | |
| "loss": 0.9631, | |
| "mean_token_accuracy": 0.7510820157825947, | |
| "num_tokens": 76354490.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.341279798746109, | |
| "epoch": 0.1819291142048921, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.0916149068322986e-05, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.7506601929664611, | |
| "num_tokens": 77302002.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.3692625604569912, | |
| "epoch": 0.18414776193909813, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 4.080523513753328e-05, | |
| "loss": 0.9652, | |
| "mean_token_accuracy": 0.7492272712290287, | |
| "num_tokens": 78250737.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.3755904287099838, | |
| "epoch": 0.18636640967330412, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.069432120674357e-05, | |
| "loss": 0.9899, | |
| "mean_token_accuracy": 0.7458423741161824, | |
| "num_tokens": 79188862.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.4084200143814087, | |
| "epoch": 0.1885850574075101, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.058340727595386e-05, | |
| "loss": 1.0189, | |
| "mean_token_accuracy": 0.739814518392086, | |
| "num_tokens": 80106826.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.3865490198135375, | |
| "epoch": 0.19080370514171613, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.0472493345164154e-05, | |
| "loss": 0.9995, | |
| "mean_token_accuracy": 0.7435623817145824, | |
| "num_tokens": 81042920.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.4000697553157806, | |
| "epoch": 0.19302235287592212, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.0361579414374446e-05, | |
| "loss": 0.9822, | |
| "mean_token_accuracy": 0.7476979814469814, | |
| "num_tokens": 81961887.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.4024762332439422, | |
| "epoch": 0.19524100061012814, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 4.025066548358474e-05, | |
| "loss": 1.0031, | |
| "mean_token_accuracy": 0.7444592162966728, | |
| "num_tokens": 82897270.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.3742952913045883, | |
| "epoch": 0.19745964834433413, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.013975155279504e-05, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7470481149852276, | |
| "num_tokens": 83862674.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.3406959801912308, | |
| "epoch": 0.19967829607854012, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.002883762200533e-05, | |
| "loss": 0.9555, | |
| "mean_token_accuracy": 0.7523404717445373, | |
| "num_tokens": 84815713.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.4000651821494103, | |
| "epoch": 0.20189694381274614, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.991792369121562e-05, | |
| "loss": 0.9945, | |
| "mean_token_accuracy": 0.7464812904596329, | |
| "num_tokens": 85748577.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.3798070877790451, | |
| "epoch": 0.20411559154695214, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.9807009760425914e-05, | |
| "loss": 1.0066, | |
| "mean_token_accuracy": 0.7432169638574123, | |
| "num_tokens": 86686607.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.3416421085596084, | |
| "epoch": 0.20633423928115813, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 3.9696095829636206e-05, | |
| "loss": 0.9644, | |
| "mean_token_accuracy": 0.7486262872815133, | |
| "num_tokens": 87638726.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.3670515537261962, | |
| "epoch": 0.20855288701536415, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 3.95851818988465e-05, | |
| "loss": 0.9849, | |
| "mean_token_accuracy": 0.7475749678909779, | |
| "num_tokens": 88601047.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.379362154006958, | |
| "epoch": 0.21077153474957014, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.947426796805679e-05, | |
| "loss": 0.9476, | |
| "mean_token_accuracy": 0.7543636500835419, | |
| "num_tokens": 89520415.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.371240857243538, | |
| "epoch": 0.21299018248377613, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.936335403726708e-05, | |
| "loss": 0.9664, | |
| "mean_token_accuracy": 0.7490607380867005, | |
| "num_tokens": 90455065.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.340223667025566, | |
| "epoch": 0.21520883021798215, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.9252440106477374e-05, | |
| "loss": 0.9414, | |
| "mean_token_accuracy": 0.75420788154006, | |
| "num_tokens": 91408476.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.358856461942196, | |
| "epoch": 0.21742747795218814, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.914152617568767e-05, | |
| "loss": 0.9898, | |
| "mean_token_accuracy": 0.748216237872839, | |
| "num_tokens": 92379704.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.3565750516951085, | |
| "epoch": 0.21964612568639413, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 3.9030612244897965e-05, | |
| "loss": 0.947, | |
| "mean_token_accuracy": 0.7539278566837311, | |
| "num_tokens": 93330334.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.3714484706521035, | |
| "epoch": 0.22186477342060015, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.891969831410826e-05, | |
| "loss": 0.9702, | |
| "mean_token_accuracy": 0.7506582617759705, | |
| "num_tokens": 94283931.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.4159359961748124, | |
| "epoch": 0.22408342115480614, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.880878438331855e-05, | |
| "loss": 1.0096, | |
| "mean_token_accuracy": 0.7425235278904438, | |
| "num_tokens": 95207171.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.3379707857966423, | |
| "epoch": 0.22630206888901214, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.869787045252884e-05, | |
| "loss": 0.9549, | |
| "mean_token_accuracy": 0.7548819564282894, | |
| "num_tokens": 96148468.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.36017052680254, | |
| "epoch": 0.22852071662321816, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.8586956521739134e-05, | |
| "loss": 0.9901, | |
| "mean_token_accuracy": 0.7463661000132561, | |
| "num_tokens": 97119881.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.3724884755909443, | |
| "epoch": 0.23073936435742415, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 3.8476042590949426e-05, | |
| "loss": 0.9822, | |
| "mean_token_accuracy": 0.7475468330085278, | |
| "num_tokens": 98077875.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.3405052460730076, | |
| "epoch": 0.23295801209163014, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 3.836512866015972e-05, | |
| "loss": 0.9432, | |
| "mean_token_accuracy": 0.7541234731674195, | |
| "num_tokens": 99031833.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.3757181286811828, | |
| "epoch": 0.23517665982583616, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.825421472937001e-05, | |
| "loss": 0.9888, | |
| "mean_token_accuracy": 0.7472271144390106, | |
| "num_tokens": 99991709.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.3773247390985488, | |
| "epoch": 0.23739530756004215, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.814330079858031e-05, | |
| "loss": 0.9933, | |
| "mean_token_accuracy": 0.7446820683777332, | |
| "num_tokens": 100941262.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.3600140511989594, | |
| "epoch": 0.23961395529424814, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.80323868677906e-05, | |
| "loss": 0.9708, | |
| "mean_token_accuracy": 0.7481269456446171, | |
| "num_tokens": 101897106.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.4162090666592122, | |
| "epoch": 0.24183260302845416, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.7921472937000893e-05, | |
| "loss": 0.9589, | |
| "mean_token_accuracy": 0.7518798463046551, | |
| "num_tokens": 102824359.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.342430242151022, | |
| "epoch": 0.24405125076266015, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.7810559006211186e-05, | |
| "loss": 0.9439, | |
| "mean_token_accuracy": 0.7558625318109989, | |
| "num_tokens": 103772965.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.3550659596920014, | |
| "epoch": 0.24626989849686615, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.769964507542148e-05, | |
| "loss": 0.9645, | |
| "mean_token_accuracy": 0.7508764907717704, | |
| "num_tokens": 104691589.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.3936431795358657, | |
| "epoch": 0.24848854623107217, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.758873114463177e-05, | |
| "loss": 0.989, | |
| "mean_token_accuracy": 0.7461673654615879, | |
| "num_tokens": 105581821.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.38260547965765, | |
| "epoch": 0.2507071939652782, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 3.747781721384206e-05, | |
| "loss": 0.9923, | |
| "mean_token_accuracy": 0.7444244168698788, | |
| "num_tokens": 106548122.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.3805961057543754, | |
| "epoch": 0.2529258416994842, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.7366903283052354e-05, | |
| "loss": 0.988, | |
| "mean_token_accuracy": 0.746028533577919, | |
| "num_tokens": 107471780.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.3697818227112293, | |
| "epoch": 0.25514448943369017, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.7255989352262646e-05, | |
| "loss": 0.9879, | |
| "mean_token_accuracy": 0.7449548006057739, | |
| "num_tokens": 108423848.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.3857608392834664, | |
| "epoch": 0.25736313716789616, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.714507542147294e-05, | |
| "loss": 0.9909, | |
| "mean_token_accuracy": 0.7463208839297295, | |
| "num_tokens": 109339085.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.4123259857296944, | |
| "epoch": 0.25958178490210215, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 3.703416149068323e-05, | |
| "loss": 1.0152, | |
| "mean_token_accuracy": 0.7402355149388313, | |
| "num_tokens": 110293198.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.3887243419885635, | |
| "epoch": 0.26180043263630814, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.692324755989352e-05, | |
| "loss": 1.0135, | |
| "mean_token_accuracy": 0.7396637931466102, | |
| "num_tokens": 111236146.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.3697876557707787, | |
| "epoch": 0.2640190803705142, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.6812333629103815e-05, | |
| "loss": 1.0056, | |
| "mean_token_accuracy": 0.7433505475521087, | |
| "num_tokens": 112180565.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.3680458456277846, | |
| "epoch": 0.2662377281047202, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.670141969831411e-05, | |
| "loss": 0.9455, | |
| "mean_token_accuracy": 0.7568574421107769, | |
| "num_tokens": 113127830.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.3552896961569787, | |
| "epoch": 0.2684563758389262, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.65905057675244e-05, | |
| "loss": 0.9666, | |
| "mean_token_accuracy": 0.7498848676681519, | |
| "num_tokens": 114067246.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.3530599243938923, | |
| "epoch": 0.27067502357313217, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.64795918367347e-05, | |
| "loss": 0.9768, | |
| "mean_token_accuracy": 0.7468112826347351, | |
| "num_tokens": 114993058.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.3884347334504128, | |
| "epoch": 0.27289367130733816, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 3.636867790594499e-05, | |
| "loss": 1.0055, | |
| "mean_token_accuracy": 0.7407384052872658, | |
| "num_tokens": 115913621.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.3924200147390366, | |
| "epoch": 0.2751123190415442, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.625776397515528e-05, | |
| "loss": 1.0014, | |
| "mean_token_accuracy": 0.7461141526699067, | |
| "num_tokens": 116873252.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.3493129260838033, | |
| "epoch": 0.2773309667757502, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.6146850044365574e-05, | |
| "loss": 0.9607, | |
| "mean_token_accuracy": 0.7491537302732467, | |
| "num_tokens": 117827994.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.382804460823536, | |
| "epoch": 0.2795496145099562, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.6035936113575866e-05, | |
| "loss": 0.9795, | |
| "mean_token_accuracy": 0.7502268873155117, | |
| "num_tokens": 118765286.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.374519681930542, | |
| "epoch": 0.2817682622441622, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.592502218278616e-05, | |
| "loss": 0.9933, | |
| "mean_token_accuracy": 0.7442196063697338, | |
| "num_tokens": 119728068.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.3972402699291706, | |
| "epoch": 0.2839869099783682, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.581410825199645e-05, | |
| "loss": 1.0037, | |
| "mean_token_accuracy": 0.7407265052199363, | |
| "num_tokens": 120663567.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.379422479122877, | |
| "epoch": 0.28620555771257417, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.570319432120674e-05, | |
| "loss": 0.9921, | |
| "mean_token_accuracy": 0.7444989711046219, | |
| "num_tokens": 121604187.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.3605211839079856, | |
| "epoch": 0.2884242054467802, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.5592280390417035e-05, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7547078765928745, | |
| "num_tokens": 122549091.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.358756284415722, | |
| "epoch": 0.2906428531809862, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.548136645962733e-05, | |
| "loss": 0.9763, | |
| "mean_token_accuracy": 0.7475451476871967, | |
| "num_tokens": 123493867.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.3583389446139336, | |
| "epoch": 0.2928615009151922, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.537045252883762e-05, | |
| "loss": 0.9564, | |
| "mean_token_accuracy": 0.7530794121325016, | |
| "num_tokens": 124444992.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.3372597798705101, | |
| "epoch": 0.2950801486493982, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.525953859804791e-05, | |
| "loss": 0.917, | |
| "mean_token_accuracy": 0.7617659427225589, | |
| "num_tokens": 125376281.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.3307228960096835, | |
| "epoch": 0.2972987963836042, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.514862466725821e-05, | |
| "loss": 0.9606, | |
| "mean_token_accuracy": 0.749411403387785, | |
| "num_tokens": 126299926.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.3589562863111495, | |
| "epoch": 0.29951744411781017, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.50377107364685e-05, | |
| "loss": 0.9547, | |
| "mean_token_accuracy": 0.753935182094574, | |
| "num_tokens": 127248113.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.3731069147586823, | |
| "epoch": 0.3017360918520162, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.4926796805678794e-05, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.7470672108232975, | |
| "num_tokens": 128181913.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.3970228135585785, | |
| "epoch": 0.3039547395862222, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.481588287488909e-05, | |
| "loss": 0.9808, | |
| "mean_token_accuracy": 0.7479516059160233, | |
| "num_tokens": 129129397.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.3645412735641003, | |
| "epoch": 0.3061733873204282, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.470496894409938e-05, | |
| "loss": 0.9904, | |
| "mean_token_accuracy": 0.7432928495109081, | |
| "num_tokens": 130072657.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.3820607632398605, | |
| "epoch": 0.3083920350546342, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.459405501330967e-05, | |
| "loss": 0.9456, | |
| "mean_token_accuracy": 0.7521802820265293, | |
| "num_tokens": 131009716.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.370594221353531, | |
| "epoch": 0.3106106827888402, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.448314108251996e-05, | |
| "loss": 0.9802, | |
| "mean_token_accuracy": 0.7480019509792328, | |
| "num_tokens": 131933570.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.4096333682537079, | |
| "epoch": 0.3128293305230462, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.4372227151730255e-05, | |
| "loss": 0.9865, | |
| "mean_token_accuracy": 0.7449732661247254, | |
| "num_tokens": 132867719.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.3462214186787604, | |
| "epoch": 0.3150479782572522, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.426131322094055e-05, | |
| "loss": 0.965, | |
| "mean_token_accuracy": 0.7521069377660752, | |
| "num_tokens": 133800807.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.3778568729758263, | |
| "epoch": 0.3172666259914582, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.415039929015084e-05, | |
| "loss": 0.984, | |
| "mean_token_accuracy": 0.7461141437292099, | |
| "num_tokens": 134735911.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.3803854644298554, | |
| "epoch": 0.3194852737256642, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.403948535936114e-05, | |
| "loss": 1.0009, | |
| "mean_token_accuracy": 0.7454333089292049, | |
| "num_tokens": 135679919.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.4075747832655907, | |
| "epoch": 0.3217039214598702, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.392857142857143e-05, | |
| "loss": 0.9977, | |
| "mean_token_accuracy": 0.7427182622253895, | |
| "num_tokens": 136600159.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.3631702698767185, | |
| "epoch": 0.3239225691940762, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.381765749778172e-05, | |
| "loss": 0.9834, | |
| "mean_token_accuracy": 0.7481319233775139, | |
| "num_tokens": 137552890.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.3479732781648637, | |
| "epoch": 0.3261412169282822, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.3706743566992015e-05, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.7547242395579815, | |
| "num_tokens": 138503386.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.3664841935038567, | |
| "epoch": 0.32835986466248823, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 3.359582963620231e-05, | |
| "loss": 0.9414, | |
| "mean_token_accuracy": 0.7544668681919575, | |
| "num_tokens": 139436618.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.369805136322975, | |
| "epoch": 0.3305785123966942, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 3.34849157054126e-05, | |
| "loss": 0.9659, | |
| "mean_token_accuracy": 0.748472998291254, | |
| "num_tokens": 140386106.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.3472276948392392, | |
| "epoch": 0.3327971601309002, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 3.337400177462289e-05, | |
| "loss": 0.9306, | |
| "mean_token_accuracy": 0.7600706323981286, | |
| "num_tokens": 141318351.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.3616932608187198, | |
| "epoch": 0.3350158078651062, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 3.326308784383318e-05, | |
| "loss": 0.949, | |
| "mean_token_accuracy": 0.7525821574032306, | |
| "num_tokens": 142261086.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.3529250659048557, | |
| "epoch": 0.3372344555993122, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.3152173913043475e-05, | |
| "loss": 0.9622, | |
| "mean_token_accuracy": 0.7509421311318875, | |
| "num_tokens": 143210426.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.3494533702731133, | |
| "epoch": 0.33945310333351825, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.3041259982253774e-05, | |
| "loss": 0.9719, | |
| "mean_token_accuracy": 0.7484220921993255, | |
| "num_tokens": 144159675.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.3390969790518283, | |
| "epoch": 0.34167175106772424, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 3.2930346051464066e-05, | |
| "loss": 0.951, | |
| "mean_token_accuracy": 0.7507020443677902, | |
| "num_tokens": 145086625.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.3771871954202652, | |
| "epoch": 0.34389039880193023, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.281943212067436e-05, | |
| "loss": 0.9817, | |
| "mean_token_accuracy": 0.7470030762255192, | |
| "num_tokens": 145986719.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.3694410175085068, | |
| "epoch": 0.3461090465361362, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 3.270851818988465e-05, | |
| "loss": 0.9611, | |
| "mean_token_accuracy": 0.7503976099193096, | |
| "num_tokens": 146918044.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.3784858211874962, | |
| "epoch": 0.3483276942703422, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.259760425909494e-05, | |
| "loss": 0.9773, | |
| "mean_token_accuracy": 0.7474872335791588, | |
| "num_tokens": 147883804.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.3432278633117676, | |
| "epoch": 0.3505463420045482, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.2486690328305235e-05, | |
| "loss": 0.936, | |
| "mean_token_accuracy": 0.755360123515129, | |
| "num_tokens": 148818674.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.3762368500232696, | |
| "epoch": 0.35276498973875425, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.237577639751553e-05, | |
| "loss": 1.0127, | |
| "mean_token_accuracy": 0.7423109777271748, | |
| "num_tokens": 149775495.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.389584618806839, | |
| "epoch": 0.35498363747296025, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 3.226486246672582e-05, | |
| "loss": 0.956, | |
| "mean_token_accuracy": 0.7540929049253464, | |
| "num_tokens": 150715437.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.3511891454458236, | |
| "epoch": 0.35720228520716624, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 3.215394853593611e-05, | |
| "loss": 0.9424, | |
| "mean_token_accuracy": 0.7547151155769825, | |
| "num_tokens": 151648229.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.342407089471817, | |
| "epoch": 0.35942093294137223, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.204303460514641e-05, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.75165830925107, | |
| "num_tokens": 152568041.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.3556662440299987, | |
| "epoch": 0.3616395806755782, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.19321206743567e-05, | |
| "loss": 0.9579, | |
| "mean_token_accuracy": 0.7535672217607499, | |
| "num_tokens": 153481444.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.3596419125795365, | |
| "epoch": 0.3638582284097842, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.1821206743566994e-05, | |
| "loss": 0.9707, | |
| "mean_token_accuracy": 0.7496115677058697, | |
| "num_tokens": 154436286.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.3691009670495986, | |
| "epoch": 0.36607687614399026, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.171029281277729e-05, | |
| "loss": 0.9829, | |
| "mean_token_accuracy": 0.7472112305462361, | |
| "num_tokens": 155369838.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.3596091173589229, | |
| "epoch": 0.36829552387819625, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.159937888198758e-05, | |
| "loss": 0.9514, | |
| "mean_token_accuracy": 0.7531104668974876, | |
| "num_tokens": 156331805.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.4006396278738975, | |
| "epoch": 0.37051417161240224, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.148846495119787e-05, | |
| "loss": 1.0172, | |
| "mean_token_accuracy": 0.7395706221461296, | |
| "num_tokens": 157249437.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.3770634673535824, | |
| "epoch": 0.37273281934660824, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.137755102040816e-05, | |
| "loss": 0.9982, | |
| "mean_token_accuracy": 0.7440236747264862, | |
| "num_tokens": 158190770.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.370580254495144, | |
| "epoch": 0.3749514670808142, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 3.1266637089618455e-05, | |
| "loss": 0.9632, | |
| "mean_token_accuracy": 0.7505429275333881, | |
| "num_tokens": 159111105.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.3719338580965996, | |
| "epoch": 0.3771701148150202, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.115572315882875e-05, | |
| "loss": 0.9831, | |
| "mean_token_accuracy": 0.7460063569247722, | |
| "num_tokens": 160054633.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.4081777222454548, | |
| "epoch": 0.37938876254922627, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.1044809228039046e-05, | |
| "loss": 1.0079, | |
| "mean_token_accuracy": 0.7412950038909912, | |
| "num_tokens": 161001374.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.3786864325404167, | |
| "epoch": 0.38160741028343226, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.093389529724934e-05, | |
| "loss": 1.0045, | |
| "mean_token_accuracy": 0.7446122042834759, | |
| "num_tokens": 161931135.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.3475178599357605, | |
| "epoch": 0.38382605801763825, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 3.082298136645963e-05, | |
| "loss": 0.9488, | |
| "mean_token_accuracy": 0.7558258168399334, | |
| "num_tokens": 162879415.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.369861949980259, | |
| "epoch": 0.38604470575184424, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 3.071206743566992e-05, | |
| "loss": 0.965, | |
| "mean_token_accuracy": 0.7485872730612755, | |
| "num_tokens": 163815621.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.3579731062054634, | |
| "epoch": 0.38826335348605023, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 3.0601153504880215e-05, | |
| "loss": 0.9566, | |
| "mean_token_accuracy": 0.7517626143991947, | |
| "num_tokens": 164760214.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.3689505890011788, | |
| "epoch": 0.3904820012202563, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.0490239574090507e-05, | |
| "loss": 0.9746, | |
| "mean_token_accuracy": 0.7488324150443078, | |
| "num_tokens": 165725369.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.3397907942533493, | |
| "epoch": 0.3927006489544623, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 3.03793256433008e-05, | |
| "loss": 0.9336, | |
| "mean_token_accuracy": 0.7559916451573372, | |
| "num_tokens": 166673239.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.3467686548829079, | |
| "epoch": 0.39491929668866826, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 3.026841171251109e-05, | |
| "loss": 0.9639, | |
| "mean_token_accuracy": 0.7545785017311573, | |
| "num_tokens": 167629364.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.3575761772692203, | |
| "epoch": 0.39713794442287426, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 3.0157497781721383e-05, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.7488372251391411, | |
| "num_tokens": 168566244.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.353706033527851, | |
| "epoch": 0.39935659215708025, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 3.0046583850931682e-05, | |
| "loss": 0.9412, | |
| "mean_token_accuracy": 0.7558333098888397, | |
| "num_tokens": 169511821.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.3735353089869022, | |
| "epoch": 0.40157523989128624, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.9935669920141974e-05, | |
| "loss": 0.985, | |
| "mean_token_accuracy": 0.7467327207326889, | |
| "num_tokens": 170438159.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.383265955746174, | |
| "epoch": 0.4037938876254923, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 2.9824755989352266e-05, | |
| "loss": 0.9906, | |
| "mean_token_accuracy": 0.7453700192272663, | |
| "num_tokens": 171372232.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.3484601899981499, | |
| "epoch": 0.4060125353596983, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.971384205856256e-05, | |
| "loss": 0.9455, | |
| "mean_token_accuracy": 0.7536688603460788, | |
| "num_tokens": 172306458.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.3571648687124251, | |
| "epoch": 0.40823118309390427, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.960292812777285e-05, | |
| "loss": 0.9957, | |
| "mean_token_accuracy": 0.7446161836385727, | |
| "num_tokens": 173255617.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.3686935976147652, | |
| "epoch": 0.41044983082811026, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.9492014196983143e-05, | |
| "loss": 0.9407, | |
| "mean_token_accuracy": 0.7566944785416126, | |
| "num_tokens": 174208124.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.3695332050323485, | |
| "epoch": 0.41266847856231625, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.9381100266193435e-05, | |
| "loss": 0.973, | |
| "mean_token_accuracy": 0.7486338473856449, | |
| "num_tokens": 175159867.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.3632485464215278, | |
| "epoch": 0.41488712629652225, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.9270186335403727e-05, | |
| "loss": 0.9524, | |
| "mean_token_accuracy": 0.7515506997704506, | |
| "num_tokens": 176092841.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.3715375781059265, | |
| "epoch": 0.4171057740307283, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.915927240461402e-05, | |
| "loss": 0.981, | |
| "mean_token_accuracy": 0.7478602975606918, | |
| "num_tokens": 177053245.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.3497217521071434, | |
| "epoch": 0.4193244217649343, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.9048358473824318e-05, | |
| "loss": 0.9605, | |
| "mean_token_accuracy": 0.7520354442298413, | |
| "num_tokens": 178014206.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.380847369134426, | |
| "epoch": 0.4215430694991403, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.893744454303461e-05, | |
| "loss": 0.9716, | |
| "mean_token_accuracy": 0.7520315021276474, | |
| "num_tokens": 178955937.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.3706847220659255, | |
| "epoch": 0.42376171723334627, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 2.8826530612244902e-05, | |
| "loss": 0.9787, | |
| "mean_token_accuracy": 0.7502063922584057, | |
| "num_tokens": 179913308.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.386249950528145, | |
| "epoch": 0.42598036496755226, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.8715616681455194e-05, | |
| "loss": 0.9614, | |
| "mean_token_accuracy": 0.7522901840507984, | |
| "num_tokens": 180862001.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.387051635980606, | |
| "epoch": 0.42819901270175825, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 2.8604702750665487e-05, | |
| "loss": 0.9775, | |
| "mean_token_accuracy": 0.7487853363156318, | |
| "num_tokens": 181811443.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.3455969080328942, | |
| "epoch": 0.4304176604359643, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 2.849378881987578e-05, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.751628965884447, | |
| "num_tokens": 182766304.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.364281751215458, | |
| "epoch": 0.4326363081701703, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.838287488908607e-05, | |
| "loss": 0.9469, | |
| "mean_token_accuracy": 0.7545228533446788, | |
| "num_tokens": 183713237.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.3550898402929306, | |
| "epoch": 0.4348549559043763, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.8271960958296363e-05, | |
| "loss": 0.9731, | |
| "mean_token_accuracy": 0.7482604801654815, | |
| "num_tokens": 184646169.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.3582610800862311, | |
| "epoch": 0.4370736036385823, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.8161047027506655e-05, | |
| "loss": 0.948, | |
| "mean_token_accuracy": 0.7556835524737835, | |
| "num_tokens": 185593240.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.4051440745592116, | |
| "epoch": 0.43929225137278827, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.8050133096716947e-05, | |
| "loss": 1.0018, | |
| "mean_token_accuracy": 0.7416196145117283, | |
| "num_tokens": 186511580.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.3595298886299134, | |
| "epoch": 0.4415108991069943, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.7939219165927243e-05, | |
| "loss": 0.966, | |
| "mean_token_accuracy": 0.7499643869698047, | |
| "num_tokens": 187463868.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.344205194711685, | |
| "epoch": 0.4437295468412003, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 2.7828305235137535e-05, | |
| "loss": 0.9686, | |
| "mean_token_accuracy": 0.7512450948357582, | |
| "num_tokens": 188409504.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.4076050415635109, | |
| "epoch": 0.4459481945754063, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.7717391304347827e-05, | |
| "loss": 1.0067, | |
| "mean_token_accuracy": 0.7415082044899464, | |
| "num_tokens": 189355779.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.3575765684247016, | |
| "epoch": 0.4481668423096123, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.760647737355812e-05, | |
| "loss": 0.9601, | |
| "mean_token_accuracy": 0.7506938494741917, | |
| "num_tokens": 190278778.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 1.3836154788732529, | |
| "epoch": 0.4503854900438183, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.749556344276841e-05, | |
| "loss": 1.0157, | |
| "mean_token_accuracy": 0.7393032193183899, | |
| "num_tokens": 191241326.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.391631406545639, | |
| "epoch": 0.4526041377780243, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.7384649511978703e-05, | |
| "loss": 0.9776, | |
| "mean_token_accuracy": 0.7508174151182174, | |
| "num_tokens": 192214681.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.3410830795764923, | |
| "epoch": 0.4548227855122303, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.7273735581188996e-05, | |
| "loss": 0.943, | |
| "mean_token_accuracy": 0.7552683062851429, | |
| "num_tokens": 193166398.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.3428148820996284, | |
| "epoch": 0.4570414332464363, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.7162821650399288e-05, | |
| "loss": 0.9664, | |
| "mean_token_accuracy": 0.750061446428299, | |
| "num_tokens": 194098519.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.379681558907032, | |
| "epoch": 0.4592600809806423, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.7051907719609583e-05, | |
| "loss": 0.9832, | |
| "mean_token_accuracy": 0.7464658364653587, | |
| "num_tokens": 195004862.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.3508182168006897, | |
| "epoch": 0.4614787287148483, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.694099378881988e-05, | |
| "loss": 0.9602, | |
| "mean_token_accuracy": 0.7505707196891308, | |
| "num_tokens": 195942523.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.3505974404513836, | |
| "epoch": 0.4636973764490543, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.683007985803017e-05, | |
| "loss": 0.9358, | |
| "mean_token_accuracy": 0.7566464401781559, | |
| "num_tokens": 196877376.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.3485953092575074, | |
| "epoch": 0.4659160241832603, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.6719165927240463e-05, | |
| "loss": 0.9482, | |
| "mean_token_accuracy": 0.7524454712867736, | |
| "num_tokens": 197810400.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.3627469688653946, | |
| "epoch": 0.4681346719174663, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.6608251996450755e-05, | |
| "loss": 0.9897, | |
| "mean_token_accuracy": 0.7450599886476994, | |
| "num_tokens": 198757408.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.3389364905655383, | |
| "epoch": 0.4703533196516723, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.6497338065661047e-05, | |
| "loss": 0.9129, | |
| "mean_token_accuracy": 0.7605686038732529, | |
| "num_tokens": 199692340.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.3748140163719653, | |
| "epoch": 0.4725719673858783, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.638642413487134e-05, | |
| "loss": 0.955, | |
| "mean_token_accuracy": 0.7520599849522114, | |
| "num_tokens": 200650664.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 1.3346731156110763, | |
| "epoch": 0.4747906151200843, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 2.627551020408163e-05, | |
| "loss": 0.9212, | |
| "mean_token_accuracy": 0.7582221433520318, | |
| "num_tokens": 201597933.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.3664929166436195, | |
| "epoch": 0.4770092628542903, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.6164596273291924e-05, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.7481563113629818, | |
| "num_tokens": 202551711.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.3843952640891075, | |
| "epoch": 0.4792279105884963, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.6053682342502216e-05, | |
| "loss": 1.0013, | |
| "mean_token_accuracy": 0.7428773507475853, | |
| "num_tokens": 203475006.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.3748216979205607, | |
| "epoch": 0.48144655832270233, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.5942768411712515e-05, | |
| "loss": 0.9858, | |
| "mean_token_accuracy": 0.7472008153796196, | |
| "num_tokens": 204414096.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.3418536871671676, | |
| "epoch": 0.4836652060569083, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 2.5831854480922807e-05, | |
| "loss": 0.9614, | |
| "mean_token_accuracy": 0.7489035427570343, | |
| "num_tokens": 205346877.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 1.334491142630577, | |
| "epoch": 0.4858838537911143, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.57209405501331e-05, | |
| "loss": 0.952, | |
| "mean_token_accuracy": 0.7544978365302086, | |
| "num_tokens": 206299157.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.3800398319959641, | |
| "epoch": 0.4881025015253203, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 2.561002661934339e-05, | |
| "loss": 0.9965, | |
| "mean_token_accuracy": 0.7438121646642685, | |
| "num_tokens": 207235480.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.3536552309989929, | |
| "epoch": 0.4903211492595263, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.5499112688553683e-05, | |
| "loss": 0.9518, | |
| "mean_token_accuracy": 0.7553939551115036, | |
| "num_tokens": 208192869.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.3550305306911468, | |
| "epoch": 0.4925397969937323, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.5388198757763975e-05, | |
| "loss": 0.962, | |
| "mean_token_accuracy": 0.7509313143789769, | |
| "num_tokens": 209163216.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.3438825502991676, | |
| "epoch": 0.49475844472793834, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 2.5277284826974267e-05, | |
| "loss": 0.9612, | |
| "mean_token_accuracy": 0.7521602623164654, | |
| "num_tokens": 210107351.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.3802131339907646, | |
| "epoch": 0.49697709246214433, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.516637089618456e-05, | |
| "loss": 0.9761, | |
| "mean_token_accuracy": 0.7472608901560307, | |
| "num_tokens": 211041572.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 1.340398869663477, | |
| "epoch": 0.4991957401963503, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.5055456965394852e-05, | |
| "loss": 0.9588, | |
| "mean_token_accuracy": 0.7512741200625896, | |
| "num_tokens": 211977487.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.3554712682962418, | |
| "epoch": 0.5014143879305564, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 2.4944543034605147e-05, | |
| "loss": 0.9593, | |
| "mean_token_accuracy": 0.7513454340398311, | |
| "num_tokens": 212944715.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.3268218383193016, | |
| "epoch": 0.5036330356647624, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 2.483362910381544e-05, | |
| "loss": 0.9323, | |
| "mean_token_accuracy": 0.7580919787287712, | |
| "num_tokens": 213894148.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.3675538420677185, | |
| "epoch": 0.5058516833989684, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.4722715173025735e-05, | |
| "loss": 0.9662, | |
| "mean_token_accuracy": 0.7483490623533726, | |
| "num_tokens": 214849284.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 1.3869496576488018, | |
| "epoch": 0.5080703311331743, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.4611801242236027e-05, | |
| "loss": 0.9855, | |
| "mean_token_accuracy": 0.7459334179759025, | |
| "num_tokens": 215795497.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 1.3464835032820701, | |
| "epoch": 0.5102889788673803, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.450088731144632e-05, | |
| "loss": 0.9559, | |
| "mean_token_accuracy": 0.7510037913918495, | |
| "num_tokens": 216730353.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.3536331675946713, | |
| "epoch": 0.5125076266015863, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 2.438997338065661e-05, | |
| "loss": 0.9768, | |
| "mean_token_accuracy": 0.7474269300699234, | |
| "num_tokens": 217652371.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 1.3597574278712272, | |
| "epoch": 0.5147262743357923, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.4279059449866903e-05, | |
| "loss": 0.9739, | |
| "mean_token_accuracy": 0.7476049326360226, | |
| "num_tokens": 218599573.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 1.3576934173703195, | |
| "epoch": 0.5169449220699983, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 2.41681455190772e-05, | |
| "loss": 0.9472, | |
| "mean_token_accuracy": 0.7533730484545231, | |
| "num_tokens": 219526034.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.3639849349856377, | |
| "epoch": 0.5191635698042043, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.405723158828749e-05, | |
| "loss": 1.0025, | |
| "mean_token_accuracy": 0.7426088079810143, | |
| "num_tokens": 220457735.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.3650838419795037, | |
| "epoch": 0.5213822175384103, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.3946317657497783e-05, | |
| "loss": 0.9795, | |
| "mean_token_accuracy": 0.7475734516978264, | |
| "num_tokens": 221414161.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.381436189264059, | |
| "epoch": 0.5236008652726163, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.3835403726708075e-05, | |
| "loss": 0.9928, | |
| "mean_token_accuracy": 0.744963239133358, | |
| "num_tokens": 222354700.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 1.3839796632528305, | |
| "epoch": 0.5258195130068224, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 2.372448979591837e-05, | |
| "loss": 0.9714, | |
| "mean_token_accuracy": 0.7483658462762832, | |
| "num_tokens": 223258825.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 1.3521512404084206, | |
| "epoch": 0.5280381607410284, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.3613575865128663e-05, | |
| "loss": 0.9965, | |
| "mean_token_accuracy": 0.7439669594168663, | |
| "num_tokens": 224207000.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.3651843503117562, | |
| "epoch": 0.5302568084752344, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 2.3502661934338955e-05, | |
| "loss": 0.9609, | |
| "mean_token_accuracy": 0.7512055054306984, | |
| "num_tokens": 225168371.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.3762024179100991, | |
| "epoch": 0.5324754562094404, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 2.3391748003549247e-05, | |
| "loss": 0.9818, | |
| "mean_token_accuracy": 0.7457237169146538, | |
| "num_tokens": 226082017.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.3472841560840607, | |
| "epoch": 0.5346941039436464, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 2.328083407275954e-05, | |
| "loss": 0.9581, | |
| "mean_token_accuracy": 0.7504221297800541, | |
| "num_tokens": 227034510.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 1.3381285414099693, | |
| "epoch": 0.5369127516778524, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 2.3169920141969835e-05, | |
| "loss": 0.9492, | |
| "mean_token_accuracy": 0.7552238062024117, | |
| "num_tokens": 228002765.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.3511281102895736, | |
| "epoch": 0.5391313994120583, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.3059006211180127e-05, | |
| "loss": 0.9393, | |
| "mean_token_accuracy": 0.7557635813951492, | |
| "num_tokens": 228965429.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.3392139934003353, | |
| "epoch": 0.5413500471462643, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.294809228039042e-05, | |
| "loss": 0.9343, | |
| "mean_token_accuracy": 0.7574323169887066, | |
| "num_tokens": 229896813.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.3686351031064987, | |
| "epoch": 0.5435686948804703, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 2.283717834960071e-05, | |
| "loss": 0.937, | |
| "mean_token_accuracy": 0.7549462541937828, | |
| "num_tokens": 230834752.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.339580136537552, | |
| "epoch": 0.5457873426146763, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 2.2726264418811003e-05, | |
| "loss": 0.9429, | |
| "mean_token_accuracy": 0.7544058203697205, | |
| "num_tokens": 231770668.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 1.3737561523914337, | |
| "epoch": 0.5480059903488823, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 2.26153504880213e-05, | |
| "loss": 0.9539, | |
| "mean_token_accuracy": 0.752394187450409, | |
| "num_tokens": 232706737.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 1.3709870815277099, | |
| "epoch": 0.5502246380830884, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.250443655723159e-05, | |
| "loss": 0.9915, | |
| "mean_token_accuracy": 0.7455207951366901, | |
| "num_tokens": 233667028.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.3425948224961757, | |
| "epoch": 0.5524432858172944, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.2393522626441883e-05, | |
| "loss": 0.9351, | |
| "mean_token_accuracy": 0.7563626609742642, | |
| "num_tokens": 234615377.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 1.382601398229599, | |
| "epoch": 0.5546619335515004, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.2282608695652175e-05, | |
| "loss": 0.9837, | |
| "mean_token_accuracy": 0.7450042508542538, | |
| "num_tokens": 235554965.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.3571919694542884, | |
| "epoch": 0.5568805812857064, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.2171694764862467e-05, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.7487703949213028, | |
| "num_tokens": 236510004.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 1.3876874506473542, | |
| "epoch": 0.5590992290199124, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.206078083407276e-05, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7465429671108723, | |
| "num_tokens": 237426607.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 1.3840069979429246, | |
| "epoch": 0.5613178767541184, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.1949866903283052e-05, | |
| "loss": 0.9573, | |
| "mean_token_accuracy": 0.7508729174733162, | |
| "num_tokens": 238388324.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.3555088877677917, | |
| "epoch": 0.5635365244883244, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.1838952972493347e-05, | |
| "loss": 0.9508, | |
| "mean_token_accuracy": 0.7524322152137757, | |
| "num_tokens": 239305641.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.4074857875704765, | |
| "epoch": 0.5657551722225304, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 2.172803904170364e-05, | |
| "loss": 0.9851, | |
| "mean_token_accuracy": 0.7465929470956325, | |
| "num_tokens": 240221454.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.3208380579948424, | |
| "epoch": 0.5679738199567363, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.161712511091393e-05, | |
| "loss": 0.9432, | |
| "mean_token_accuracy": 0.754857836663723, | |
| "num_tokens": 241170406.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 1.36037939786911, | |
| "epoch": 0.5701924676909423, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.1506211180124224e-05, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.7514487348496914, | |
| "num_tokens": 242100616.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 1.3895253077149392, | |
| "epoch": 0.5724111154251483, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.1395297249334516e-05, | |
| "loss": 0.9847, | |
| "mean_token_accuracy": 0.7462774030864239, | |
| "num_tokens": 243038631.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 1.3492624297738076, | |
| "epoch": 0.5746297631593543, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.1284383318544808e-05, | |
| "loss": 0.9428, | |
| "mean_token_accuracy": 0.7537525497376919, | |
| "num_tokens": 244000961.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 1.357532762736082, | |
| "epoch": 0.5768484108935604, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 2.1173469387755103e-05, | |
| "loss": 0.9577, | |
| "mean_token_accuracy": 0.7505512781441211, | |
| "num_tokens": 244952283.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.3500149488449096, | |
| "epoch": 0.5790670586277664, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.1062555456965396e-05, | |
| "loss": 0.9333, | |
| "mean_token_accuracy": 0.757443331182003, | |
| "num_tokens": 245897365.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 1.3495190888643265, | |
| "epoch": 0.5812857063619724, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.0951641526175688e-05, | |
| "loss": 0.9582, | |
| "mean_token_accuracy": 0.7513453289866447, | |
| "num_tokens": 246833155.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 1.3707278072834015, | |
| "epoch": 0.5835043540961784, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.084072759538598e-05, | |
| "loss": 0.9817, | |
| "mean_token_accuracy": 0.7466557987034321, | |
| "num_tokens": 247796159.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.340453139692545, | |
| "epoch": 0.5857230018303844, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 2.0729813664596272e-05, | |
| "loss": 0.9589, | |
| "mean_token_accuracy": 0.7525463417172432, | |
| "num_tokens": 248736277.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 1.3754316791892052, | |
| "epoch": 0.5879416495645904, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 2.0618899733806567e-05, | |
| "loss": 0.9697, | |
| "mean_token_accuracy": 0.7479398109018802, | |
| "num_tokens": 249662809.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.3680956415832042, | |
| "epoch": 0.5901602972987964, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 2.050798580301686e-05, | |
| "loss": 0.9565, | |
| "mean_token_accuracy": 0.7505876325070858, | |
| "num_tokens": 250581187.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 1.372042527794838, | |
| "epoch": 0.5923789450330024, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.0397071872227152e-05, | |
| "loss": 0.9573, | |
| "mean_token_accuracy": 0.7499286234378815, | |
| "num_tokens": 251494513.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 1.3676451787352562, | |
| "epoch": 0.5945975927672084, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.0286157941437444e-05, | |
| "loss": 1.0024, | |
| "mean_token_accuracy": 0.7436093680560589, | |
| "num_tokens": 252450478.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 1.321447344124317, | |
| "epoch": 0.5968162405014144, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 2.0175244010647736e-05, | |
| "loss": 0.9297, | |
| "mean_token_accuracy": 0.7592410154640674, | |
| "num_tokens": 253388200.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 1.367007777094841, | |
| "epoch": 0.5990348882356203, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.006433007985803e-05, | |
| "loss": 0.951, | |
| "mean_token_accuracy": 0.7534979909658432, | |
| "num_tokens": 254303611.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.3557642981410027, | |
| "epoch": 0.6012535359698263, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.9953416149068324e-05, | |
| "loss": 0.9455, | |
| "mean_token_accuracy": 0.7532623074948788, | |
| "num_tokens": 255267315.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 1.3694660350680352, | |
| "epoch": 0.6034721837040324, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.9842502218278616e-05, | |
| "loss": 0.9634, | |
| "mean_token_accuracy": 0.7510243773460388, | |
| "num_tokens": 256198305.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 1.3600564405322075, | |
| "epoch": 0.6056908314382384, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.9731588287488908e-05, | |
| "loss": 0.9597, | |
| "mean_token_accuracy": 0.7512127391993999, | |
| "num_tokens": 257177077.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 1.3179180152714252, | |
| "epoch": 0.6079094791724444, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.9620674356699203e-05, | |
| "loss": 0.935, | |
| "mean_token_accuracy": 0.757300040870905, | |
| "num_tokens": 258134360.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 1.35346722304821, | |
| "epoch": 0.6101281269066504, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.9509760425909496e-05, | |
| "loss": 0.945, | |
| "mean_token_accuracy": 0.7527749851346016, | |
| "num_tokens": 259094913.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.3553345277905464, | |
| "epoch": 0.6123467746408564, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.9398846495119788e-05, | |
| "loss": 0.9358, | |
| "mean_token_accuracy": 0.7586644418537617, | |
| "num_tokens": 260019758.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.3732210516929626, | |
| "epoch": 0.6145654223750624, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.928793256433008e-05, | |
| "loss": 0.9604, | |
| "mean_token_accuracy": 0.7491915933787823, | |
| "num_tokens": 260944733.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 1.3680385306477547, | |
| "epoch": 0.6167840701092684, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.9177018633540372e-05, | |
| "loss": 0.9676, | |
| "mean_token_accuracy": 0.7478097401559353, | |
| "num_tokens": 261873242.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 1.3868214182555676, | |
| "epoch": 0.6190027178434744, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.9066104702750667e-05, | |
| "loss": 0.9516, | |
| "mean_token_accuracy": 0.7528703935444355, | |
| "num_tokens": 262814898.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 1.3595042198896408, | |
| "epoch": 0.6212213655776804, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.895519077196096e-05, | |
| "loss": 0.9479, | |
| "mean_token_accuracy": 0.7566476508975029, | |
| "num_tokens": 263753086.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.33022148758173, | |
| "epoch": 0.6234400133118864, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.8844276841171252e-05, | |
| "loss": 0.9321, | |
| "mean_token_accuracy": 0.7572297543287277, | |
| "num_tokens": 264700281.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.3536405637860298, | |
| "epoch": 0.6256586610460924, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.8733362910381544e-05, | |
| "loss": 0.9419, | |
| "mean_token_accuracy": 0.7534777402877808, | |
| "num_tokens": 265647657.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 1.3843684569001198, | |
| "epoch": 0.6278773087802985, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.862244897959184e-05, | |
| "loss": 0.9653, | |
| "mean_token_accuracy": 0.7501688152551651, | |
| "num_tokens": 266578441.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 1.3544567473232747, | |
| "epoch": 0.6300959565145045, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.851153504880213e-05, | |
| "loss": 0.9457, | |
| "mean_token_accuracy": 0.7535403810441494, | |
| "num_tokens": 267530874.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 1.3669777683913709, | |
| "epoch": 0.6323146042487104, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.8400621118012424e-05, | |
| "loss": 0.9422, | |
| "mean_token_accuracy": 0.7547655880451203, | |
| "num_tokens": 268461165.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.3307232797145843, | |
| "epoch": 0.6345332519829164, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.8289707187222716e-05, | |
| "loss": 0.9172, | |
| "mean_token_accuracy": 0.7619568608701229, | |
| "num_tokens": 269407213.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.367132118344307, | |
| "epoch": 0.6367518997171224, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.8178793256433008e-05, | |
| "loss": 0.9508, | |
| "mean_token_accuracy": 0.7541770383715629, | |
| "num_tokens": 270328899.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 1.3831279791891575, | |
| "epoch": 0.6389705474513284, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.8067879325643303e-05, | |
| "loss": 0.9817, | |
| "mean_token_accuracy": 0.746974790096283, | |
| "num_tokens": 271265664.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 1.3699244022369386, | |
| "epoch": 0.6411891951855344, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.7956965394853596e-05, | |
| "loss": 0.9696, | |
| "mean_token_accuracy": 0.7492304258048534, | |
| "num_tokens": 272186809.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 1.3639655753970146, | |
| "epoch": 0.6434078429197404, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.7846051464063888e-05, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.7505564413964748, | |
| "num_tokens": 273108290.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.403120481967926, | |
| "epoch": 0.6456264906539464, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.773513753327418e-05, | |
| "loss": 0.9769, | |
| "mean_token_accuracy": 0.746848201751709, | |
| "num_tokens": 274041696.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.4035961225628852, | |
| "epoch": 0.6478451383881524, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.7624223602484475e-05, | |
| "loss": 1.028, | |
| "mean_token_accuracy": 0.7384353429079056, | |
| "num_tokens": 274987065.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.3617764726281165, | |
| "epoch": 0.6500637861223584, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.7513309671694767e-05, | |
| "loss": 0.9566, | |
| "mean_token_accuracy": 0.7519726864993572, | |
| "num_tokens": 275931121.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.3632805831730366, | |
| "epoch": 0.6522824338565644, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.740239574090506e-05, | |
| "loss": 0.9548, | |
| "mean_token_accuracy": 0.751621701568365, | |
| "num_tokens": 276887903.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 1.3548810198903083, | |
| "epoch": 0.6545010815907705, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.7291481810115352e-05, | |
| "loss": 0.9696, | |
| "mean_token_accuracy": 0.7500473111867905, | |
| "num_tokens": 277797512.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.3761564634740353, | |
| "epoch": 0.6567197293249765, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.7180567879325644e-05, | |
| "loss": 0.9801, | |
| "mean_token_accuracy": 0.7459054350852966, | |
| "num_tokens": 278758206.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.3604497477412223, | |
| "epoch": 0.6589383770591825, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.706965394853594e-05, | |
| "loss": 0.9701, | |
| "mean_token_accuracy": 0.7494505539536476, | |
| "num_tokens": 279682051.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.365204595029354, | |
| "epoch": 0.6611570247933884, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.695874001774623e-05, | |
| "loss": 0.9354, | |
| "mean_token_accuracy": 0.7586885608732701, | |
| "num_tokens": 280616177.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.3509592905640602, | |
| "epoch": 0.6633756725275944, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.6847826086956524e-05, | |
| "loss": 0.9502, | |
| "mean_token_accuracy": 0.7536125592887402, | |
| "num_tokens": 281562822.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.361519905924797, | |
| "epoch": 0.6655943202618004, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.6736912156166816e-05, | |
| "loss": 0.9355, | |
| "mean_token_accuracy": 0.755986961722374, | |
| "num_tokens": 282503409.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.351868186891079, | |
| "epoch": 0.6678129679960064, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.6625998225377108e-05, | |
| "loss": 0.9363, | |
| "mean_token_accuracy": 0.7572803579270839, | |
| "num_tokens": 283411000.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 1.3627968370914458, | |
| "epoch": 0.6700316157302124, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.6515084294587403e-05, | |
| "loss": 0.954, | |
| "mean_token_accuracy": 0.7517252512276172, | |
| "num_tokens": 284363870.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.350551488995552, | |
| "epoch": 0.6722502634644184, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.6404170363797696e-05, | |
| "loss": 0.9243, | |
| "mean_token_accuracy": 0.7591398231685161, | |
| "num_tokens": 285307777.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 1.3500189259648323, | |
| "epoch": 0.6744689111986244, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.6293256433007988e-05, | |
| "loss": 0.938, | |
| "mean_token_accuracy": 0.7556370176374912, | |
| "num_tokens": 286253628.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.344145791977644, | |
| "epoch": 0.6766875589328304, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.618234250221828e-05, | |
| "loss": 0.9545, | |
| "mean_token_accuracy": 0.7536003112792968, | |
| "num_tokens": 287183850.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.3805773958563805, | |
| "epoch": 0.6789062066670365, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.6071428571428572e-05, | |
| "loss": 0.9733, | |
| "mean_token_accuracy": 0.7494976818561554, | |
| "num_tokens": 288100206.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 1.344843527674675, | |
| "epoch": 0.6811248544012425, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.5960514640638864e-05, | |
| "loss": 0.9458, | |
| "mean_token_accuracy": 0.7523748345673085, | |
| "num_tokens": 289028085.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.3617902539670468, | |
| "epoch": 0.6833435021354485, | |
| "grad_norm": 0.5, | |
| "learning_rate": 1.5849600709849156e-05, | |
| "loss": 0.9616, | |
| "mean_token_accuracy": 0.7512977905571461, | |
| "num_tokens": 289972169.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 1.3891084134578704, | |
| "epoch": 0.6855621498696545, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.573868677905945e-05, | |
| "loss": 0.984, | |
| "mean_token_accuracy": 0.7475369438529015, | |
| "num_tokens": 290890546.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 1.3548466391861438, | |
| "epoch": 0.6877807976038605, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.5627772848269744e-05, | |
| "loss": 0.9156, | |
| "mean_token_accuracy": 0.7610405057668685, | |
| "num_tokens": 291831317.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.3873838737607003, | |
| "epoch": 0.6899994453380665, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.5516858917480036e-05, | |
| "loss": 0.9849, | |
| "mean_token_accuracy": 0.7466968774795533, | |
| "num_tokens": 292773910.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.374309216439724, | |
| "epoch": 0.6922180930722724, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.5405944986690328e-05, | |
| "loss": 0.9691, | |
| "mean_token_accuracy": 0.7497981458902359, | |
| "num_tokens": 293706792.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 1.3540133006870747, | |
| "epoch": 0.6944367408064784, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.529503105590062e-05, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.7486146375536918, | |
| "num_tokens": 294649185.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 1.3608009479939938, | |
| "epoch": 0.6966553885406844, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.5184117125110914e-05, | |
| "loss": 0.9602, | |
| "mean_token_accuracy": 0.7509834311902523, | |
| "num_tokens": 295571599.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 1.3526781380176545, | |
| "epoch": 0.6988740362748904, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.5073203194321208e-05, | |
| "loss": 0.9838, | |
| "mean_token_accuracy": 0.7449854724109173, | |
| "num_tokens": 296510262.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.3786792248487472, | |
| "epoch": 0.7010926840090964, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.4962289263531502e-05, | |
| "loss": 0.9576, | |
| "mean_token_accuracy": 0.7524838514626027, | |
| "num_tokens": 297462659.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 1.37396137714386, | |
| "epoch": 0.7033113317433024, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.4851375332741794e-05, | |
| "loss": 0.9655, | |
| "mean_token_accuracy": 0.7482963159680367, | |
| "num_tokens": 298410420.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 1.3479665741324425, | |
| "epoch": 0.7055299794775085, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.4740461401952086e-05, | |
| "loss": 0.9735, | |
| "mean_token_accuracy": 0.7480811208486557, | |
| "num_tokens": 299335997.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 1.4149656519293785, | |
| "epoch": 0.7077486272117145, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 1.4629547471162378e-05, | |
| "loss": 0.9868, | |
| "mean_token_accuracy": 0.7450309813022613, | |
| "num_tokens": 300300688.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 1.3166653901338576, | |
| "epoch": 0.7099672749459205, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.4518633540372672e-05, | |
| "loss": 0.9244, | |
| "mean_token_accuracy": 0.7599585182964802, | |
| "num_tokens": 301275137.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.3608283437788486, | |
| "epoch": 0.7121859226801265, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.4407719609582964e-05, | |
| "loss": 0.9792, | |
| "mean_token_accuracy": 0.7479373283684254, | |
| "num_tokens": 302208727.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 1.3675961531698704, | |
| "epoch": 0.7144045704143325, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.4296805678793256e-05, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7523340001702309, | |
| "num_tokens": 303160006.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 1.3157847836613654, | |
| "epoch": 0.7166232181485385, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.4185891748003548e-05, | |
| "loss": 0.9469, | |
| "mean_token_accuracy": 0.7520846240222454, | |
| "num_tokens": 304102524.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 1.3752693004906178, | |
| "epoch": 0.7188418658827445, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.4074977817213844e-05, | |
| "loss": 0.9426, | |
| "mean_token_accuracy": 0.753889911621809, | |
| "num_tokens": 305042287.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 1.3292134046554565, | |
| "epoch": 0.7210605136169504, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 1.3964063886424136e-05, | |
| "loss": 0.9464, | |
| "mean_token_accuracy": 0.754455479234457, | |
| "num_tokens": 305988003.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.3723205238580705, | |
| "epoch": 0.7232791613511564, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 1.3853149955634428e-05, | |
| "loss": 0.9942, | |
| "mean_token_accuracy": 0.7461909614503384, | |
| "num_tokens": 306927584.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.3628524258732795, | |
| "epoch": 0.7254978090853624, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.374223602484472e-05, | |
| "loss": 0.9594, | |
| "mean_token_accuracy": 0.7528522469103336, | |
| "num_tokens": 307863697.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.353959833085537, | |
| "epoch": 0.7277164568195684, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.3631322094055012e-05, | |
| "loss": 0.9472, | |
| "mean_token_accuracy": 0.7561062417924405, | |
| "num_tokens": 308808276.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 1.3523946583271027, | |
| "epoch": 0.7299351045537745, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 1.3520408163265308e-05, | |
| "loss": 0.9578, | |
| "mean_token_accuracy": 0.7514262087643147, | |
| "num_tokens": 309773086.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 1.3321008674800396, | |
| "epoch": 0.7321537522879805, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.34094942324756e-05, | |
| "loss": 0.9513, | |
| "mean_token_accuracy": 0.7530274912714958, | |
| "num_tokens": 310728967.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.3726357147097588, | |
| "epoch": 0.7343724000221865, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.3298580301685892e-05, | |
| "loss": 0.9519, | |
| "mean_token_accuracy": 0.7526456661522388, | |
| "num_tokens": 311671029.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.3460698679089547, | |
| "epoch": 0.7365910477563925, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.3187666370896184e-05, | |
| "loss": 0.977, | |
| "mean_token_accuracy": 0.7480454221367836, | |
| "num_tokens": 312608775.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.358740784227848, | |
| "epoch": 0.7388096954905985, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.3076752440106476e-05, | |
| "loss": 0.9388, | |
| "mean_token_accuracy": 0.7564548753201962, | |
| "num_tokens": 313562050.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 1.3844745293259622, | |
| "epoch": 0.7410283432248045, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.2965838509316772e-05, | |
| "loss": 0.9834, | |
| "mean_token_accuracy": 0.7466944210231304, | |
| "num_tokens": 314509120.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.3659690007567407, | |
| "epoch": 0.7432469909590105, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 1.2854924578527064e-05, | |
| "loss": 0.941, | |
| "mean_token_accuracy": 0.7546365484595299, | |
| "num_tokens": 315486453.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.3873593926429748, | |
| "epoch": 0.7454656386932165, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.2744010647737356e-05, | |
| "loss": 0.985, | |
| "mean_token_accuracy": 0.7476673908531666, | |
| "num_tokens": 316431477.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 1.3676550433039665, | |
| "epoch": 0.7476842864274225, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.2633096716947648e-05, | |
| "loss": 0.9627, | |
| "mean_token_accuracy": 0.7511092610657215, | |
| "num_tokens": 317357092.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 1.3835733927786351, | |
| "epoch": 0.7499029341616285, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.2522182786157944e-05, | |
| "loss": 0.948, | |
| "mean_token_accuracy": 0.7537398427724838, | |
| "num_tokens": 318280117.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 1.3766888722777366, | |
| "epoch": 0.7521215818958344, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.2411268855368236e-05, | |
| "loss": 0.9683, | |
| "mean_token_accuracy": 0.7515489347279072, | |
| "num_tokens": 319203557.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 1.357860617339611, | |
| "epoch": 0.7543402296300404, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.2300354924578528e-05, | |
| "loss": 0.9535, | |
| "mean_token_accuracy": 0.752687606215477, | |
| "num_tokens": 320136852.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.3469961121678353, | |
| "epoch": 0.7565588773642465, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.218944099378882e-05, | |
| "loss": 0.9459, | |
| "mean_token_accuracy": 0.7537472225725651, | |
| "num_tokens": 321106324.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.334907030314207, | |
| "epoch": 0.7587775250984525, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.2078527062999114e-05, | |
| "loss": 0.9359, | |
| "mean_token_accuracy": 0.7552877001464366, | |
| "num_tokens": 322042151.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 1.3580046392977239, | |
| "epoch": 0.7609961728326585, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.1967613132209406e-05, | |
| "loss": 0.9404, | |
| "mean_token_accuracy": 0.7551106229424477, | |
| "num_tokens": 322952370.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 1.3473434820771217, | |
| "epoch": 0.7632148205668645, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.18566992014197e-05, | |
| "loss": 0.9527, | |
| "mean_token_accuracy": 0.7552920714020729, | |
| "num_tokens": 323909905.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 1.388922219723463, | |
| "epoch": 0.7654334683010705, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.1745785270629992e-05, | |
| "loss": 0.9835, | |
| "mean_token_accuracy": 0.7475064925849437, | |
| "num_tokens": 324843712.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.3555053889751434, | |
| "epoch": 0.7676521160352765, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.1634871339840284e-05, | |
| "loss": 0.9791, | |
| "mean_token_accuracy": 0.747514633089304, | |
| "num_tokens": 325763726.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 1.4119513988494874, | |
| "epoch": 0.7698707637694825, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.1523957409050576e-05, | |
| "loss": 0.9634, | |
| "mean_token_accuracy": 0.752115435898304, | |
| "num_tokens": 326704959.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.358751341700554, | |
| "epoch": 0.7720894115036885, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.141304347826087e-05, | |
| "loss": 0.9502, | |
| "mean_token_accuracy": 0.752843676507473, | |
| "num_tokens": 327654129.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.373246306180954, | |
| "epoch": 0.7743080592378945, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.1302129547471162e-05, | |
| "loss": 0.9896, | |
| "mean_token_accuracy": 0.7471863307058811, | |
| "num_tokens": 328577246.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 1.3401599921286107, | |
| "epoch": 0.7765267069721005, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 1.1191215616681455e-05, | |
| "loss": 0.9046, | |
| "mean_token_accuracy": 0.7622545510530472, | |
| "num_tokens": 329531531.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.403013862669468, | |
| "epoch": 0.7787453547063065, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.1080301685891748e-05, | |
| "loss": 1.0199, | |
| "mean_token_accuracy": 0.7393336437642575, | |
| "num_tokens": 330493898.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 1.3496449366211891, | |
| "epoch": 0.7809640024405126, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.096938775510204e-05, | |
| "loss": 0.9422, | |
| "mean_token_accuracy": 0.7540981650352478, | |
| "num_tokens": 331418294.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 1.344119517505169, | |
| "epoch": 0.7831826501747186, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.0858473824312334e-05, | |
| "loss": 0.9386, | |
| "mean_token_accuracy": 0.757073562592268, | |
| "num_tokens": 332378464.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 1.3490448504686356, | |
| "epoch": 0.7854012979089245, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.0747559893522626e-05, | |
| "loss": 0.9457, | |
| "mean_token_accuracy": 0.7535859100520611, | |
| "num_tokens": 333318703.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 1.3718070283532142, | |
| "epoch": 0.7876199456431305, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.063664596273292e-05, | |
| "loss": 0.9685, | |
| "mean_token_accuracy": 0.7511322259902954, | |
| "num_tokens": 334255047.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.3711335480213165, | |
| "epoch": 0.7898385933773365, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.0525732031943212e-05, | |
| "loss": 0.9743, | |
| "mean_token_accuracy": 0.7471988372504711, | |
| "num_tokens": 335189458.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 1.3890349462628364, | |
| "epoch": 0.7920572411115425, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.0414818101153505e-05, | |
| "loss": 0.9872, | |
| "mean_token_accuracy": 0.7452017098665238, | |
| "num_tokens": 336139155.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 1.336748766899109, | |
| "epoch": 0.7942758888457485, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.0303904170363798e-05, | |
| "loss": 0.9194, | |
| "mean_token_accuracy": 0.7595400720834732, | |
| "num_tokens": 337103166.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 1.3947007723152638, | |
| "epoch": 0.7964945365799545, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.019299023957409e-05, | |
| "loss": 0.9857, | |
| "mean_token_accuracy": 0.7481105640530586, | |
| "num_tokens": 338049665.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.3394004009664058, | |
| "epoch": 0.7987131843141605, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.0082076308784384e-05, | |
| "loss": 0.9501, | |
| "mean_token_accuracy": 0.7537369303405285, | |
| "num_tokens": 339030359.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.4002343088388443, | |
| "epoch": 0.8009318320483665, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 9.971162377994676e-06, | |
| "loss": 0.9899, | |
| "mean_token_accuracy": 0.7460181936621666, | |
| "num_tokens": 339965846.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.3751978531479836, | |
| "epoch": 0.8031504797825725, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 9.86024844720497e-06, | |
| "loss": 0.9663, | |
| "mean_token_accuracy": 0.7495487280189991, | |
| "num_tokens": 340909085.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 1.3296589955687523, | |
| "epoch": 0.8053691275167785, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 9.749334516415262e-06, | |
| "loss": 0.9116, | |
| "mean_token_accuracy": 0.7615578956902027, | |
| "num_tokens": 341836396.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.3545545935630798, | |
| "epoch": 0.8075877752509846, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 9.638420585625555e-06, | |
| "loss": 0.946, | |
| "mean_token_accuracy": 0.7542130470275878, | |
| "num_tokens": 342759623.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.3891134530305862, | |
| "epoch": 0.8098064229851906, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 9.527506654835848e-06, | |
| "loss": 1.0098, | |
| "mean_token_accuracy": 0.7399868927896023, | |
| "num_tokens": 343714548.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.3653203830122949, | |
| "epoch": 0.8120250707193966, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 9.41659272404614e-06, | |
| "loss": 0.9689, | |
| "mean_token_accuracy": 0.747505272179842, | |
| "num_tokens": 344676512.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.3524901941418648, | |
| "epoch": 0.8142437184536025, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 9.305678793256434e-06, | |
| "loss": 0.9416, | |
| "mean_token_accuracy": 0.7552405230700969, | |
| "num_tokens": 345609814.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 1.3355680212378502, | |
| "epoch": 0.8164623661878085, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 9.194764862466726e-06, | |
| "loss": 0.9409, | |
| "mean_token_accuracy": 0.7546605832874775, | |
| "num_tokens": 346552116.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.3585198432207108, | |
| "epoch": 0.8186810139220145, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 9.08385093167702e-06, | |
| "loss": 0.9451, | |
| "mean_token_accuracy": 0.7566642910242081, | |
| "num_tokens": 347476547.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 1.3646457374095917, | |
| "epoch": 0.8208996616562205, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 8.972937000887312e-06, | |
| "loss": 0.9328, | |
| "mean_token_accuracy": 0.757544395327568, | |
| "num_tokens": 348402286.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.4008646070957185, | |
| "epoch": 0.8231183093904265, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 8.862023070097605e-06, | |
| "loss": 1.0166, | |
| "mean_token_accuracy": 0.7399243280291558, | |
| "num_tokens": 349350422.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 1.305922406166792, | |
| "epoch": 0.8253369571246325, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 8.751109139307898e-06, | |
| "loss": 0.9002, | |
| "mean_token_accuracy": 0.7653868660330773, | |
| "num_tokens": 350307992.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 1.344923496246338, | |
| "epoch": 0.8275556048588385, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 8.64019520851819e-06, | |
| "loss": 0.923, | |
| "mean_token_accuracy": 0.7606289356946945, | |
| "num_tokens": 351239736.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 1.356829535961151, | |
| "epoch": 0.8297742525930445, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 8.529281277728483e-06, | |
| "loss": 0.9208, | |
| "mean_token_accuracy": 0.7572924271225929, | |
| "num_tokens": 352175438.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.371825471520424, | |
| "epoch": 0.8319929003272506, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 8.418367346938775e-06, | |
| "loss": 0.9769, | |
| "mean_token_accuracy": 0.7484906286001205, | |
| "num_tokens": 353093469.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.3505297660827638, | |
| "epoch": 0.8342115480614566, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 8.307453416149069e-06, | |
| "loss": 0.9634, | |
| "mean_token_accuracy": 0.748659697920084, | |
| "num_tokens": 354042260.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 1.3741331085562707, | |
| "epoch": 0.8364301957956626, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 8.19653948535936e-06, | |
| "loss": 0.982, | |
| "mean_token_accuracy": 0.7466577455401421, | |
| "num_tokens": 354960941.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.3521684527397155, | |
| "epoch": 0.8386488435298686, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 8.085625554569655e-06, | |
| "loss": 0.951, | |
| "mean_token_accuracy": 0.7545395441353321, | |
| "num_tokens": 355899405.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 1.3922821909189225, | |
| "epoch": 0.8408674912640746, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 7.974711623779947e-06, | |
| "loss": 0.9774, | |
| "mean_token_accuracy": 0.7484460555016994, | |
| "num_tokens": 356837111.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 1.341869878768921, | |
| "epoch": 0.8430861389982806, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 7.863797692990239e-06, | |
| "loss": 0.9371, | |
| "mean_token_accuracy": 0.7555838227272034, | |
| "num_tokens": 357775995.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.3769854843616485, | |
| "epoch": 0.8453047867324865, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 7.752883762200533e-06, | |
| "loss": 0.9788, | |
| "mean_token_accuracy": 0.7470424689352513, | |
| "num_tokens": 358730556.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 1.3654131770133973, | |
| "epoch": 0.8475234344666925, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 7.641969831410825e-06, | |
| "loss": 0.9543, | |
| "mean_token_accuracy": 0.7543313026428222, | |
| "num_tokens": 359702857.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 1.3479675091803074, | |
| "epoch": 0.8497420822008985, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 7.5310559006211186e-06, | |
| "loss": 0.9434, | |
| "mean_token_accuracy": 0.7545025050640106, | |
| "num_tokens": 360637451.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 1.368970339745283, | |
| "epoch": 0.8519607299351045, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 7.420141969831411e-06, | |
| "loss": 0.9585, | |
| "mean_token_accuracy": 0.7518557466566562, | |
| "num_tokens": 361574227.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 1.3598952896893024, | |
| "epoch": 0.8541793776693105, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 7.3092280390417045e-06, | |
| "loss": 0.9427, | |
| "mean_token_accuracy": 0.754470182955265, | |
| "num_tokens": 362506193.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.3638700023293495, | |
| "epoch": 0.8563980254035165, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 7.198314108251997e-06, | |
| "loss": 0.9701, | |
| "mean_token_accuracy": 0.7472980074584484, | |
| "num_tokens": 363441282.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 1.3546796232461928, | |
| "epoch": 0.8586166731377226, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 7.0874001774622905e-06, | |
| "loss": 0.9753, | |
| "mean_token_accuracy": 0.7478179946541786, | |
| "num_tokens": 364393822.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 1.350717130303383, | |
| "epoch": 0.8608353208719286, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 6.976486246672583e-06, | |
| "loss": 0.9486, | |
| "mean_token_accuracy": 0.7572776488959789, | |
| "num_tokens": 365331779.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 1.3585814163088799, | |
| "epoch": 0.8630539686061346, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 6.865572315882875e-06, | |
| "loss": 0.9629, | |
| "mean_token_accuracy": 0.748991634696722, | |
| "num_tokens": 366257855.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 1.3992498129606248, | |
| "epoch": 0.8652726163403406, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 6.7546583850931686e-06, | |
| "loss": 0.9949, | |
| "mean_token_accuracy": 0.7451303206384182, | |
| "num_tokens": 367181436.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.3461244717240333, | |
| "epoch": 0.8674912640745466, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 6.643744454303461e-06, | |
| "loss": 0.9381, | |
| "mean_token_accuracy": 0.7554601080715656, | |
| "num_tokens": 368128436.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 1.3403765760362147, | |
| "epoch": 0.8697099118087526, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 6.532830523513754e-06, | |
| "loss": 0.9265, | |
| "mean_token_accuracy": 0.7592454843223095, | |
| "num_tokens": 369092081.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 1.3784636914730073, | |
| "epoch": 0.8719285595429586, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 6.421916592724047e-06, | |
| "loss": 0.9607, | |
| "mean_token_accuracy": 0.751040443778038, | |
| "num_tokens": 370022755.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 1.3625924080610274, | |
| "epoch": 0.8741472072771646, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 6.31100266193434e-06, | |
| "loss": 0.9696, | |
| "mean_token_accuracy": 0.7501497231423855, | |
| "num_tokens": 370963738.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 1.3465173587203025, | |
| "epoch": 0.8763658550113705, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 6.200088731144632e-06, | |
| "loss": 0.9578, | |
| "mean_token_accuracy": 0.7541100673377514, | |
| "num_tokens": 371888931.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.3527381241321563, | |
| "epoch": 0.8785845027455765, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 6.089174800354925e-06, | |
| "loss": 0.9467, | |
| "mean_token_accuracy": 0.7544343665242195, | |
| "num_tokens": 372849693.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 1.3818270325660706, | |
| "epoch": 0.8808031504797825, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 5.978260869565218e-06, | |
| "loss": 0.9551, | |
| "mean_token_accuracy": 0.7534758277237416, | |
| "num_tokens": 373792435.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.3504199832677841, | |
| "epoch": 0.8830217982139886, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 5.867346938775511e-06, | |
| "loss": 0.9445, | |
| "mean_token_accuracy": 0.756835724413395, | |
| "num_tokens": 374746035.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 1.3496798947453499, | |
| "epoch": 0.8852404459481946, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 5.756433007985803e-06, | |
| "loss": 0.9419, | |
| "mean_token_accuracy": 0.7544379711151123, | |
| "num_tokens": 375703477.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 1.3732656255364417, | |
| "epoch": 0.8874590936824006, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 5.645519077196096e-06, | |
| "loss": 0.9604, | |
| "mean_token_accuracy": 0.751821743696928, | |
| "num_tokens": 376636971.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.375483873486519, | |
| "epoch": 0.8896777414166066, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 5.534605146406389e-06, | |
| "loss": 0.9671, | |
| "mean_token_accuracy": 0.7487936913967133, | |
| "num_tokens": 377588517.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 1.3773034647107125, | |
| "epoch": 0.8918963891508126, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 5.423691215616682e-06, | |
| "loss": 0.9667, | |
| "mean_token_accuracy": 0.7501926451921463, | |
| "num_tokens": 378524822.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 1.3265659905970097, | |
| "epoch": 0.8941150368850186, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 5.312777284826975e-06, | |
| "loss": 0.9452, | |
| "mean_token_accuracy": 0.7550780981779098, | |
| "num_tokens": 379505593.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 1.335418175160885, | |
| "epoch": 0.8963336846192246, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 5.201863354037268e-06, | |
| "loss": 0.9482, | |
| "mean_token_accuracy": 0.7534942403435707, | |
| "num_tokens": 380473052.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 1.3610256776213645, | |
| "epoch": 0.8985523323534306, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 5.090949423247561e-06, | |
| "loss": 0.958, | |
| "mean_token_accuracy": 0.7533226810395718, | |
| "num_tokens": 381443551.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.3507319584488868, | |
| "epoch": 0.9007709800876366, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.980035492457853e-06, | |
| "loss": 0.9489, | |
| "mean_token_accuracy": 0.7544699974358082, | |
| "num_tokens": 382378572.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 1.3752561420202256, | |
| "epoch": 0.9029896278218426, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.869121561668146e-06, | |
| "loss": 0.9519, | |
| "mean_token_accuracy": 0.7520841076970101, | |
| "num_tokens": 383311592.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 1.3476091951131821, | |
| "epoch": 0.9052082755560485, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 4.758207630878438e-06, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.7553776867687703, | |
| "num_tokens": 384248057.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 1.3605633400380612, | |
| "epoch": 0.9074269232902545, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.647293700088731e-06, | |
| "loss": 0.9244, | |
| "mean_token_accuracy": 0.7569485224783421, | |
| "num_tokens": 385189392.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 1.365138278901577, | |
| "epoch": 0.9096455710244606, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 4.536379769299024e-06, | |
| "loss": 0.9531, | |
| "mean_token_accuracy": 0.7533132433891296, | |
| "num_tokens": 386110731.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.3613657392561436, | |
| "epoch": 0.9118642187586666, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.425465838509317e-06, | |
| "loss": 0.943, | |
| "mean_token_accuracy": 0.7544411860406399, | |
| "num_tokens": 387057495.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 1.3290772818028926, | |
| "epoch": 0.9140828664928726, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 4.31455190771961e-06, | |
| "loss": 0.9183, | |
| "mean_token_accuracy": 0.7608602307736874, | |
| "num_tokens": 388021016.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 1.3475232422351837, | |
| "epoch": 0.9163015142270786, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.203637976929903e-06, | |
| "loss": 0.9411, | |
| "mean_token_accuracy": 0.7554165907204151, | |
| "num_tokens": 388924467.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 1.3602249071002006, | |
| "epoch": 0.9185201619612846, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 4.092724046140196e-06, | |
| "loss": 0.9307, | |
| "mean_token_accuracy": 0.7554832518100738, | |
| "num_tokens": 389879904.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 1.332291903346777, | |
| "epoch": 0.9207388096954906, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 3.981810115350488e-06, | |
| "loss": 0.9559, | |
| "mean_token_accuracy": 0.752610693871975, | |
| "num_tokens": 390857519.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.3272889666259289, | |
| "epoch": 0.9229574574296966, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.870896184560781e-06, | |
| "loss": 0.9306, | |
| "mean_token_accuracy": 0.7583662964403629, | |
| "num_tokens": 391792594.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 1.3560280472040176, | |
| "epoch": 0.9251761051639026, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.759982253771074e-06, | |
| "loss": 0.9577, | |
| "mean_token_accuracy": 0.7537056483328343, | |
| "num_tokens": 392749013.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 1.3662122264504433, | |
| "epoch": 0.9273947528981086, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.6490683229813664e-06, | |
| "loss": 0.9715, | |
| "mean_token_accuracy": 0.7491789266467095, | |
| "num_tokens": 393666251.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 1.31452574133873, | |
| "epoch": 0.9296134006323146, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.5381543921916594e-06, | |
| "loss": 0.9006, | |
| "mean_token_accuracy": 0.7624947860836983, | |
| "num_tokens": 394596882.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 1.3401482120156287, | |
| "epoch": 0.9318320483665206, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 3.4272404614019524e-06, | |
| "loss": 0.9282, | |
| "mean_token_accuracy": 0.756990148127079, | |
| "num_tokens": 395526506.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.4013796046376228, | |
| "epoch": 0.9340506961007266, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.3163265306122454e-06, | |
| "loss": 0.9932, | |
| "mean_token_accuracy": 0.7460516929626465, | |
| "num_tokens": 396460730.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 1.3630273953080176, | |
| "epoch": 0.9362693438349327, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.2054125998225384e-06, | |
| "loss": 0.9513, | |
| "mean_token_accuracy": 0.7526130631566048, | |
| "num_tokens": 397410429.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 1.2898547686636448, | |
| "epoch": 0.9384879915691386, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 3.094498669032831e-06, | |
| "loss": 0.9228, | |
| "mean_token_accuracy": 0.7592225328087807, | |
| "num_tokens": 398358920.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 1.3584180302917956, | |
| "epoch": 0.9407066393033446, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 2.9835847382431235e-06, | |
| "loss": 0.9683, | |
| "mean_token_accuracy": 0.7511270597577095, | |
| "num_tokens": 399332691.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 1.3851253606379033, | |
| "epoch": 0.9429252870375506, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.872670807453416e-06, | |
| "loss": 0.974, | |
| "mean_token_accuracy": 0.7480023667216301, | |
| "num_tokens": 400266541.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.3538463555276394, | |
| "epoch": 0.9451439347717566, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.761756876663709e-06, | |
| "loss": 0.9443, | |
| "mean_token_accuracy": 0.7554497793316841, | |
| "num_tokens": 401201822.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 1.3774816602468491, | |
| "epoch": 0.9473625825059626, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.650842945874002e-06, | |
| "loss": 0.9843, | |
| "mean_token_accuracy": 0.746273136138916, | |
| "num_tokens": 402152611.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 1.3171171061694622, | |
| "epoch": 0.9495812302401686, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 2.539929015084295e-06, | |
| "loss": 0.9111, | |
| "mean_token_accuracy": 0.7632594168186188, | |
| "num_tokens": 403118617.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 1.3414073579013348, | |
| "epoch": 0.9517998779743746, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.4290150842945875e-06, | |
| "loss": 0.9402, | |
| "mean_token_accuracy": 0.7527715168893337, | |
| "num_tokens": 404056789.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 1.3378793716430664, | |
| "epoch": 0.9540185257085806, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.3181011535048805e-06, | |
| "loss": 0.9353, | |
| "mean_token_accuracy": 0.7563605636358262, | |
| "num_tokens": 405004371.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.372169415652752, | |
| "epoch": 0.9562371734427866, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 2.207187222715173e-06, | |
| "loss": 0.9436, | |
| "mean_token_accuracy": 0.7551277004182338, | |
| "num_tokens": 405922059.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 1.3533624187111855, | |
| "epoch": 0.9584558211769926, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.096273291925466e-06, | |
| "loss": 0.9497, | |
| "mean_token_accuracy": 0.7533909723162651, | |
| "num_tokens": 406838792.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 1.3719367325305938, | |
| "epoch": 0.9606744689111987, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.9853593611357586e-06, | |
| "loss": 0.9919, | |
| "mean_token_accuracy": 0.7434499144554139, | |
| "num_tokens": 407786498.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 1.3563473880290986, | |
| "epoch": 0.9628931166454047, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.8744454303460516e-06, | |
| "loss": 0.9401, | |
| "mean_token_accuracy": 0.7530663572251797, | |
| "num_tokens": 408736948.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 1.347538560628891, | |
| "epoch": 0.9651117643796107, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.7635314995563443e-06, | |
| "loss": 0.933, | |
| "mean_token_accuracy": 0.7575333446264267, | |
| "num_tokens": 409664542.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.3749746069312097, | |
| "epoch": 0.9673304121138167, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.6526175687666373e-06, | |
| "loss": 0.9698, | |
| "mean_token_accuracy": 0.7511622585356236, | |
| "num_tokens": 410602122.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 1.3442941211163997, | |
| "epoch": 0.9695490598480226, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.54170363797693e-06, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.7501766428351402, | |
| "num_tokens": 411528742.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 1.3314830370247364, | |
| "epoch": 0.9717677075822286, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.4307897071872228e-06, | |
| "loss": 0.9528, | |
| "mean_token_accuracy": 0.7543077766895294, | |
| "num_tokens": 412496866.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 1.362931652367115, | |
| "epoch": 0.9739863553164346, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.3198757763975156e-06, | |
| "loss": 0.9539, | |
| "mean_token_accuracy": 0.7541424036026001, | |
| "num_tokens": 413426074.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 1.314641258120537, | |
| "epoch": 0.9762050030506406, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.2089618456078084e-06, | |
| "loss": 0.9326, | |
| "mean_token_accuracy": 0.7566425338387489, | |
| "num_tokens": 414366936.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.3944153673946857, | |
| "epoch": 0.9784236507848466, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.0980479148181013e-06, | |
| "loss": 0.9887, | |
| "mean_token_accuracy": 0.7444652430713177, | |
| "num_tokens": 415305379.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 1.3453952841460706, | |
| "epoch": 0.9806422985190526, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 9.871339840283939e-07, | |
| "loss": 0.9563, | |
| "mean_token_accuracy": 0.7523396387696266, | |
| "num_tokens": 416266511.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 1.3440303832292557, | |
| "epoch": 0.9828609462532586, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 8.762200532386869e-07, | |
| "loss": 0.9456, | |
| "mean_token_accuracy": 0.7547272637486457, | |
| "num_tokens": 417231789.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 1.367350959777832, | |
| "epoch": 0.9850795939874646, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 7.653061224489796e-07, | |
| "loss": 0.9693, | |
| "mean_token_accuracy": 0.7508193962275982, | |
| "num_tokens": 418194063.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 1.3651387616991997, | |
| "epoch": 0.9872982417216707, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 6.543921916592724e-07, | |
| "loss": 0.9463, | |
| "mean_token_accuracy": 0.7529610082507133, | |
| "num_tokens": 419114425.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.347716721892357, | |
| "epoch": 0.9895168894558767, | |
| "grad_norm": 0.490234375, | |
| "learning_rate": 5.434782608695653e-07, | |
| "loss": 0.9493, | |
| "mean_token_accuracy": 0.7544530227780342, | |
| "num_tokens": 420062804.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 1.323940635472536, | |
| "epoch": 0.9917355371900827, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 4.3256433007985804e-07, | |
| "loss": 0.9176, | |
| "mean_token_accuracy": 0.7596350736916065, | |
| "num_tokens": 420985488.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 1.3813497826457024, | |
| "epoch": 0.9939541849242887, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 3.2165039929015086e-07, | |
| "loss": 0.9854, | |
| "mean_token_accuracy": 0.7467473462224007, | |
| "num_tokens": 421920339.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 1.3907534167170525, | |
| "epoch": 0.9961728326584947, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.1073646850044365e-07, | |
| "loss": 0.9956, | |
| "mean_token_accuracy": 0.7431350871920586, | |
| "num_tokens": 422875342.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 1.4102192774415017, | |
| "epoch": 0.9983914803927006, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 9.982253771073646e-08, | |
| "loss": 1.0, | |
| "mean_token_accuracy": 0.7438700333237648, | |
| "num_tokens": 423810727.0, | |
| "step": 4500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4508, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.665075636310376e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |