sft_math / trainer_state.json
mmm128's picture
Backup current best SFT math checkpoint
d796dbc verified
Raw
History Blame Contribute Delete
213 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3587304668543143,
"eval_steps": 200,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.1698873918503523,
"epoch": 0.0019411821799475881,
"grad_norm": 28.75,
"learning_rate": 1.7241379310344828e-07,
"loss": 0.9568568229675293,
"mean_token_accuracy": 0.8273445844650269,
"num_tokens": 68060.0,
"step": 5
},
{
"entropy": 0.1770024599507451,
"epoch": 0.0038823643598951763,
"grad_norm": 35.0,
"learning_rate": 3.8793103448275865e-07,
"loss": 0.9151897430419922,
"mean_token_accuracy": 0.8321746543049813,
"num_tokens": 138267.0,
"step": 10
},
{
"entropy": 0.19476149305701257,
"epoch": 0.005823546539842764,
"grad_norm": 30.75,
"learning_rate": 6.034482758620691e-07,
"loss": 0.8955170631408691,
"mean_token_accuracy": 0.8212856188416481,
"num_tokens": 217527.0,
"step": 15
},
{
"entropy": 0.19077662620693445,
"epoch": 0.007764728719790353,
"grad_norm": 29.875,
"learning_rate": 8.189655172413794e-07,
"loss": 0.9730923652648926,
"mean_token_accuracy": 0.822833463549614,
"num_tokens": 280569.0,
"step": 20
},
{
"entropy": 0.1851712815463543,
"epoch": 0.00970591089973794,
"grad_norm": 31.0,
"learning_rate": 1.0344827586206898e-06,
"loss": 0.8731012344360352,
"mean_token_accuracy": 0.828074723482132,
"num_tokens": 361410.0,
"step": 25
},
{
"entropy": 0.21108674593269824,
"epoch": 0.011647093079685528,
"grad_norm": 30.875,
"learning_rate": 1.25e-06,
"loss": 0.9689438819885254,
"mean_token_accuracy": 0.8158086076378822,
"num_tokens": 433416.0,
"step": 30
},
{
"entropy": 0.1849596280604601,
"epoch": 0.013588275259633116,
"grad_norm": 23.375,
"learning_rate": 1.4655172413793104e-06,
"loss": 0.8792544364929199,
"mean_token_accuracy": 0.8349022850394249,
"num_tokens": 509913.0,
"step": 35
},
{
"entropy": 0.22154813222587108,
"epoch": 0.015529457439580705,
"grad_norm": 24.125,
"learning_rate": 1.681034482758621e-06,
"loss": 0.8865782737731933,
"mean_token_accuracy": 0.8225297197699547,
"num_tokens": 574517.0,
"step": 40
},
{
"entropy": 0.21101174745708703,
"epoch": 0.017470639619528293,
"grad_norm": 21.125,
"learning_rate": 1.896551724137931e-06,
"loss": 0.8394794464111328,
"mean_token_accuracy": 0.8262531071901321,
"num_tokens": 646659.0,
"step": 45
},
{
"entropy": 0.22667213380336762,
"epoch": 0.01941182179947588,
"grad_norm": 18.875,
"learning_rate": 2.1120689655172416e-06,
"loss": 0.8253890037536621,
"mean_token_accuracy": 0.8253167048096657,
"num_tokens": 726398.0,
"step": 50
},
{
"entropy": 0.23717499971389772,
"epoch": 0.02135300397942347,
"grad_norm": 17.25,
"learning_rate": 2.327586206896552e-06,
"loss": 0.7434019565582275,
"mean_token_accuracy": 0.8295832589268685,
"num_tokens": 809917.0,
"step": 55
},
{
"entropy": 0.25738408528268336,
"epoch": 0.023294186159371056,
"grad_norm": 13.8125,
"learning_rate": 2.543103448275862e-06,
"loss": 0.7521392345428467,
"mean_token_accuracy": 0.8236480697989463,
"num_tokens": 886659.0,
"step": 60
},
{
"entropy": 0.22672694064676763,
"epoch": 0.025235368339318644,
"grad_norm": 12.5625,
"learning_rate": 2.7586206896551725e-06,
"loss": 0.7187893390655518,
"mean_token_accuracy": 0.8356023579835892,
"num_tokens": 953610.0,
"step": 65
},
{
"entropy": 0.26906434297561643,
"epoch": 0.02717655051926623,
"grad_norm": 11.9375,
"learning_rate": 2.9741379310344832e-06,
"loss": 0.7249302387237548,
"mean_token_accuracy": 0.8295665010809898,
"num_tokens": 1024103.0,
"step": 70
},
{
"entropy": 0.2811603628098965,
"epoch": 0.029117732699213823,
"grad_norm": 10.5,
"learning_rate": 3.1896551724137935e-06,
"loss": 0.7414528846740722,
"mean_token_accuracy": 0.8267218798398972,
"num_tokens": 1104806.0,
"step": 75
},
{
"entropy": 0.2909968294203281,
"epoch": 0.03105891487916141,
"grad_norm": 10.3125,
"learning_rate": 3.4051724137931034e-06,
"loss": 0.6241181850433349,
"mean_token_accuracy": 0.8339074313640594,
"num_tokens": 1185581.0,
"step": 80
},
{
"entropy": 0.3096594549715519,
"epoch": 0.033000097059108995,
"grad_norm": 7.6875,
"learning_rate": 3.620689655172414e-06,
"loss": 0.5764092445373535,
"mean_token_accuracy": 0.834169502556324,
"num_tokens": 1277766.0,
"step": 85
},
{
"entropy": 0.3442493978887796,
"epoch": 0.034941279239056586,
"grad_norm": 6.3125,
"learning_rate": 3.8362068965517245e-06,
"loss": 0.6330607414245606,
"mean_token_accuracy": 0.8328578889369964,
"num_tokens": 1343195.0,
"step": 90
},
{
"entropy": 0.3338872347027063,
"epoch": 0.03688246141900418,
"grad_norm": 3.9375,
"learning_rate": 4.051724137931034e-06,
"loss": 0.5495956897735595,
"mean_token_accuracy": 0.8446441546082497,
"num_tokens": 1422307.0,
"step": 95
},
{
"entropy": 0.3676390130072832,
"epoch": 0.03882364359895176,
"grad_norm": 3.78125,
"learning_rate": 4.267241379310345e-06,
"loss": 0.5221522331237793,
"mean_token_accuracy": 0.8460515171289444,
"num_tokens": 1502407.0,
"step": 100
},
{
"entropy": 0.3870431698858738,
"epoch": 0.04076482577889935,
"grad_norm": 2.875,
"learning_rate": 4.482758620689656e-06,
"loss": 0.5109545230865479,
"mean_token_accuracy": 0.8474476292729378,
"num_tokens": 1588898.0,
"step": 105
},
{
"entropy": 0.4107159897685051,
"epoch": 0.04270600795884694,
"grad_norm": 2.109375,
"learning_rate": 4.698275862068966e-06,
"loss": 0.5011598587036132,
"mean_token_accuracy": 0.8443124324083329,
"num_tokens": 1680514.0,
"step": 110
},
{
"entropy": 0.39204273708164694,
"epoch": 0.04464719013879453,
"grad_norm": 2.125,
"learning_rate": 4.9137931034482765e-06,
"loss": 0.4943844795227051,
"mean_token_accuracy": 0.8546174108982086,
"num_tokens": 1751043.0,
"step": 115
},
{
"entropy": 0.44253255538642405,
"epoch": 0.04658837231874211,
"grad_norm": 1.96875,
"learning_rate": 5.129310344827587e-06,
"loss": 0.5116967678070068,
"mean_token_accuracy": 0.8467695400118828,
"num_tokens": 1828152.0,
"step": 120
},
{
"entropy": 0.4137393295764923,
"epoch": 0.0485295544986897,
"grad_norm": 2.421875,
"learning_rate": 5.344827586206896e-06,
"loss": 0.4703562259674072,
"mean_token_accuracy": 0.8589524060487748,
"num_tokens": 1903581.0,
"step": 125
},
{
"entropy": 0.43903482854366305,
"epoch": 0.05047073667863729,
"grad_norm": 2.375,
"learning_rate": 5.560344827586207e-06,
"loss": 0.4887136936187744,
"mean_token_accuracy": 0.8527244672179222,
"num_tokens": 1976772.0,
"step": 130
},
{
"entropy": 0.4222589176148176,
"epoch": 0.05241191885858488,
"grad_norm": 2.265625,
"learning_rate": 5.775862068965518e-06,
"loss": 0.45414047241210936,
"mean_token_accuracy": 0.8595154702663421,
"num_tokens": 2045499.0,
"step": 135
},
{
"entropy": 0.4219055060297251,
"epoch": 0.05435310103853246,
"grad_norm": 1.9765625,
"learning_rate": 5.9913793103448284e-06,
"loss": 0.49632959365844725,
"mean_token_accuracy": 0.8565252378582955,
"num_tokens": 2120623.0,
"step": 140
},
{
"entropy": 0.42592958733439445,
"epoch": 0.056294283218480054,
"grad_norm": 1.828125,
"learning_rate": 6.206896551724138e-06,
"loss": 0.4315296173095703,
"mean_token_accuracy": 0.8575032651424408,
"num_tokens": 2198334.0,
"step": 145
},
{
"entropy": 0.40059518739581107,
"epoch": 0.058235465398427645,
"grad_norm": 2.015625,
"learning_rate": 6.422413793103449e-06,
"loss": 0.4771871566772461,
"mean_token_accuracy": 0.8683709859848022,
"num_tokens": 2265434.0,
"step": 150
},
{
"entropy": 0.41876095086336135,
"epoch": 0.06017664757837523,
"grad_norm": 1.34375,
"learning_rate": 6.63793103448276e-06,
"loss": 0.444645357131958,
"mean_token_accuracy": 0.8565793663263321,
"num_tokens": 2358742.0,
"step": 155
},
{
"entropy": 0.4004307441413403,
"epoch": 0.06211782975832282,
"grad_norm": 1.8828125,
"learning_rate": 6.853448275862069e-06,
"loss": 0.4320836544036865,
"mean_token_accuracy": 0.8631758615374565,
"num_tokens": 2437938.0,
"step": 160
},
{
"entropy": 0.41059495508670807,
"epoch": 0.0640590119382704,
"grad_norm": 1.8359375,
"learning_rate": 7.0689655172413796e-06,
"loss": 0.450638484954834,
"mean_token_accuracy": 0.8649105593562126,
"num_tokens": 2506611.0,
"step": 165
},
{
"entropy": 0.4375029347836971,
"epoch": 0.06600019411821799,
"grad_norm": 1.734375,
"learning_rate": 7.28448275862069e-06,
"loss": 0.48232545852661135,
"mean_token_accuracy": 0.8586594820022583,
"num_tokens": 2585776.0,
"step": 170
},
{
"entropy": 0.39430369548499583,
"epoch": 0.06794137629816559,
"grad_norm": 1.859375,
"learning_rate": 7.500000000000001e-06,
"loss": 0.42594180107116697,
"mean_token_accuracy": 0.8716334730386734,
"num_tokens": 2649428.0,
"step": 175
},
{
"entropy": 0.4384324595332146,
"epoch": 0.06988255847811317,
"grad_norm": 1.796875,
"learning_rate": 7.715517241379312e-06,
"loss": 0.45015506744384765,
"mean_token_accuracy": 0.8587341636419297,
"num_tokens": 2741939.0,
"step": 180
},
{
"entropy": 0.4197473503649235,
"epoch": 0.07182374065806076,
"grad_norm": 1.78125,
"learning_rate": 7.93103448275862e-06,
"loss": 0.430635404586792,
"mean_token_accuracy": 0.8667460069060325,
"num_tokens": 2814243.0,
"step": 185
},
{
"entropy": 0.40914484262466433,
"epoch": 0.07376492283800835,
"grad_norm": 2.0625,
"learning_rate": 8.146551724137932e-06,
"loss": 0.43552379608154296,
"mean_token_accuracy": 0.865776352584362,
"num_tokens": 2885496.0,
"step": 190
},
{
"entropy": 0.4126306913793087,
"epoch": 0.07570610501795594,
"grad_norm": 1.953125,
"learning_rate": 8.362068965517242e-06,
"loss": 0.44997596740722656,
"mean_token_accuracy": 0.8635187759995461,
"num_tokens": 2952067.0,
"step": 195
},
{
"entropy": 0.39164264835417273,
"epoch": 0.07764728719790352,
"grad_norm": 1.8046875,
"learning_rate": 8.577586206896551e-06,
"loss": 0.4397777557373047,
"mean_token_accuracy": 0.8692673921585083,
"num_tokens": 3032945.0,
"step": 200
},
{
"epoch": 0.07764728719790352,
"eval_entropy": 0.40086069169184285,
"eval_loss": 0.42599618434906006,
"eval_mean_token_accuracy": 0.8688706356933251,
"eval_num_tokens": 3032945.0,
"eval_runtime": 60.1253,
"eval_samples_per_second": 35.742,
"eval_steps_per_second": 35.742,
"step": 200
},
{
"entropy": 0.43272491060197354,
"epoch": 0.0795884693778511,
"grad_norm": 1.8828125,
"learning_rate": 8.793103448275862e-06,
"loss": 0.44966917037963866,
"mean_token_accuracy": 0.8635564997792244,
"num_tokens": 3099465.0,
"step": 205
},
{
"entropy": 0.4191489264369011,
"epoch": 0.0815296515577987,
"grad_norm": 1.734375,
"learning_rate": 9.008620689655173e-06,
"loss": 0.45826034545898436,
"mean_token_accuracy": 0.8656236171722412,
"num_tokens": 3165471.0,
"step": 210
},
{
"entropy": 0.3851976301521063,
"epoch": 0.08347083373774629,
"grad_norm": 1.9921875,
"learning_rate": 9.224137931034484e-06,
"loss": 0.41442441940307617,
"mean_token_accuracy": 0.8754843935370445,
"num_tokens": 3250159.0,
"step": 215
},
{
"entropy": 0.4017783209681511,
"epoch": 0.08541201591769387,
"grad_norm": 2.03125,
"learning_rate": 9.439655172413794e-06,
"loss": 0.4402902603149414,
"mean_token_accuracy": 0.8709232717752456,
"num_tokens": 3336018.0,
"step": 220
},
{
"entropy": 0.38373975045979025,
"epoch": 0.08735319809764146,
"grad_norm": 1.8515625,
"learning_rate": 9.655172413793105e-06,
"loss": 0.3947699546813965,
"mean_token_accuracy": 0.8761502489447593,
"num_tokens": 3402332.0,
"step": 225
},
{
"entropy": 0.3711710192263126,
"epoch": 0.08929438027758906,
"grad_norm": 1.4453125,
"learning_rate": 9.870689655172414e-06,
"loss": 0.38801021575927735,
"mean_token_accuracy": 0.8783514246344566,
"num_tokens": 3490759.0,
"step": 230
},
{
"entropy": 0.37838716208934786,
"epoch": 0.09123556245753664,
"grad_norm": 1.6875,
"learning_rate": 9.999998243530697e-06,
"loss": 0.4078525543212891,
"mean_token_accuracy": 0.8711143419146538,
"num_tokens": 3581453.0,
"step": 235
},
{
"entropy": 0.37663319632411,
"epoch": 0.09317674463748422,
"grad_norm": 2.5,
"learning_rate": 9.999978483265213e-06,
"loss": 0.4262491226196289,
"mean_token_accuracy": 0.8772962838411331,
"num_tokens": 3646168.0,
"step": 240
},
{
"entropy": 0.4050050787627697,
"epoch": 0.09511792681743182,
"grad_norm": 1.53125,
"learning_rate": 9.999936767234675e-06,
"loss": 0.4260216236114502,
"mean_token_accuracy": 0.8673088252544403,
"num_tokens": 3731064.0,
"step": 245
},
{
"entropy": 0.384697999432683,
"epoch": 0.0970591089973794,
"grad_norm": 1.9921875,
"learning_rate": 9.999873095622266e-06,
"loss": 0.3961009979248047,
"mean_token_accuracy": 0.8749020054936409,
"num_tokens": 3798798.0,
"step": 250
},
{
"entropy": 0.4126061208546162,
"epoch": 0.09900029117732699,
"grad_norm": 1.4375,
"learning_rate": 9.999787468707579e-06,
"loss": 0.43981060981750486,
"mean_token_accuracy": 0.8663996770977974,
"num_tokens": 3886350.0,
"step": 255
},
{
"entropy": 0.40845385044813154,
"epoch": 0.10094147335727457,
"grad_norm": 1.5078125,
"learning_rate": 9.999679886866614e-06,
"loss": 0.41130051612854,
"mean_token_accuracy": 0.8713549628853798,
"num_tokens": 3959484.0,
"step": 260
},
{
"entropy": 0.40083046182990073,
"epoch": 0.10288265553722217,
"grad_norm": 1.8671875,
"learning_rate": 9.999550350571785e-06,
"loss": 0.4313651561737061,
"mean_token_accuracy": 0.8690039083361626,
"num_tokens": 4039205.0,
"step": 265
},
{
"entropy": 0.35567491836845877,
"epoch": 0.10482383771716976,
"grad_norm": 1.625,
"learning_rate": 9.999398860391906e-06,
"loss": 0.39510302543640136,
"mean_token_accuracy": 0.8858051553368569,
"num_tokens": 4114526.0,
"step": 270
},
{
"entropy": 0.3589722190052271,
"epoch": 0.10676501989711734,
"grad_norm": 1.671875,
"learning_rate": 9.9992254169922e-06,
"loss": 0.377154541015625,
"mean_token_accuracy": 0.8847770616412163,
"num_tokens": 4180520.0,
"step": 275
},
{
"entropy": 0.37694212086498735,
"epoch": 0.10870620207706493,
"grad_norm": 2.078125,
"learning_rate": 9.99903002113428e-06,
"loss": 0.3913008213043213,
"mean_token_accuracy": 0.8760873630642891,
"num_tokens": 4256660.0,
"step": 280
},
{
"entropy": 0.43231718949973585,
"epoch": 0.11064738425701252,
"grad_norm": 1.59375,
"learning_rate": 9.99881267367617e-06,
"loss": 0.4287400722503662,
"mean_token_accuracy": 0.8621754452586174,
"num_tokens": 4326362.0,
"step": 285
},
{
"entropy": 0.4439574245363474,
"epoch": 0.11258856643696011,
"grad_norm": 1.765625,
"learning_rate": 9.998573375572277e-06,
"loss": 0.44347705841064455,
"mean_token_accuracy": 0.8591988816857338,
"num_tokens": 4401568.0,
"step": 290
},
{
"entropy": 0.4327508192509413,
"epoch": 0.11452974861690769,
"grad_norm": 1.53125,
"learning_rate": 9.998312127873398e-06,
"loss": 0.41773738861083987,
"mean_token_accuracy": 0.8604527100920677,
"num_tokens": 4482468.0,
"step": 295
},
{
"entropy": 0.39415039904415605,
"epoch": 0.11647093079685529,
"grad_norm": 1.734375,
"learning_rate": 9.99802893172672e-06,
"loss": 0.37184581756591795,
"mean_token_accuracy": 0.875768692791462,
"num_tokens": 4566311.0,
"step": 300
},
{
"entropy": 0.3908670715987682,
"epoch": 0.11841211297680287,
"grad_norm": 1.421875,
"learning_rate": 9.997723788375803e-06,
"loss": 0.4179991722106934,
"mean_token_accuracy": 0.8736939936876297,
"num_tokens": 4639335.0,
"step": 305
},
{
"entropy": 0.37057909071445466,
"epoch": 0.12035329515675046,
"grad_norm": 1.5703125,
"learning_rate": 9.997396699160586e-06,
"loss": 0.3718397855758667,
"mean_token_accuracy": 0.8778384670615196,
"num_tokens": 4729786.0,
"step": 310
},
{
"entropy": 0.3775249246507883,
"epoch": 0.12229447733669804,
"grad_norm": 1.25,
"learning_rate": 9.997047665517373e-06,
"loss": 0.36892924308776853,
"mean_token_accuracy": 0.8788423538208008,
"num_tokens": 4815579.0,
"step": 315
},
{
"entropy": 0.42527642734348775,
"epoch": 0.12423565951664564,
"grad_norm": 1.859375,
"learning_rate": 9.996676688978832e-06,
"loss": 0.4455845832824707,
"mean_token_accuracy": 0.8667929217219352,
"num_tokens": 4890387.0,
"step": 320
},
{
"entropy": 0.3964430205523968,
"epoch": 0.1261768416965932,
"grad_norm": 1.5859375,
"learning_rate": 9.996283771173982e-06,
"loss": 0.4093163967132568,
"mean_token_accuracy": 0.871640557050705,
"num_tokens": 4963050.0,
"step": 325
},
{
"entropy": 0.41448891162872314,
"epoch": 0.1281180238765408,
"grad_norm": 1.453125,
"learning_rate": 9.995868913828198e-06,
"loss": 0.4085641860961914,
"mean_token_accuracy": 0.8690432503819465,
"num_tokens": 5041868.0,
"step": 330
},
{
"entropy": 0.4093653842806816,
"epoch": 0.1300592060564884,
"grad_norm": 1.65625,
"learning_rate": 9.995432118763182e-06,
"loss": 0.4269090175628662,
"mean_token_accuracy": 0.8645370990037918,
"num_tokens": 5131350.0,
"step": 335
},
{
"entropy": 0.40297048091888427,
"epoch": 0.13200038823643598,
"grad_norm": 1.671875,
"learning_rate": 9.994973387896983e-06,
"loss": 0.4135453224182129,
"mean_token_accuracy": 0.8721030279994011,
"num_tokens": 5206454.0,
"step": 340
},
{
"entropy": 0.3713659271597862,
"epoch": 0.13394157041638358,
"grad_norm": 1.234375,
"learning_rate": 9.994492723243965e-06,
"loss": 0.38209033012390137,
"mean_token_accuracy": 0.8801575794816017,
"num_tokens": 5290337.0,
"step": 345
},
{
"entropy": 0.43902772702276704,
"epoch": 0.13588275259633117,
"grad_norm": 1.8046875,
"learning_rate": 9.993990126914808e-06,
"loss": 0.45044879913330077,
"mean_token_accuracy": 0.8585825085639953,
"num_tokens": 5355252.0,
"step": 350
},
{
"entropy": 0.403434070199728,
"epoch": 0.13782393477627874,
"grad_norm": 1.5859375,
"learning_rate": 9.9934656011165e-06,
"loss": 0.4331518650054932,
"mean_token_accuracy": 0.8729702636599541,
"num_tokens": 5421254.0,
"step": 355
},
{
"entropy": 0.3791210547089577,
"epoch": 0.13976511695622634,
"grad_norm": 1.5859375,
"learning_rate": 9.992919148152323e-06,
"loss": 0.4140181064605713,
"mean_token_accuracy": 0.8779570132493972,
"num_tokens": 5506667.0,
"step": 360
},
{
"entropy": 0.3965904530137777,
"epoch": 0.14170629913617394,
"grad_norm": 1.484375,
"learning_rate": 9.992350770421849e-06,
"loss": 0.40141940116882324,
"mean_token_accuracy": 0.873149348795414,
"num_tokens": 5582156.0,
"step": 365
},
{
"entropy": 0.3853264570236206,
"epoch": 0.1436474813161215,
"grad_norm": 2.015625,
"learning_rate": 9.991760470420917e-06,
"loss": 0.386338210105896,
"mean_token_accuracy": 0.8758631706237793,
"num_tokens": 5651606.0,
"step": 370
},
{
"entropy": 0.3922650724649429,
"epoch": 0.1455886634960691,
"grad_norm": 1.484375,
"learning_rate": 9.99114825074164e-06,
"loss": 0.41069760322570803,
"mean_token_accuracy": 0.8746298983693123,
"num_tokens": 5746987.0,
"step": 375
},
{
"entropy": 0.43557493686676024,
"epoch": 0.1475298456760167,
"grad_norm": 2.390625,
"learning_rate": 9.990514114072379e-06,
"loss": 0.44701457023620605,
"mean_token_accuracy": 0.8603363439440728,
"num_tokens": 5820511.0,
"step": 380
},
{
"entropy": 0.3883111171424389,
"epoch": 0.14947102785596428,
"grad_norm": 1.4921875,
"learning_rate": 9.989858063197735e-06,
"loss": 0.40341935157775877,
"mean_token_accuracy": 0.8721037909388543,
"num_tokens": 5901615.0,
"step": 385
},
{
"entropy": 0.39608169682323935,
"epoch": 0.15141221003591188,
"grad_norm": 2.21875,
"learning_rate": 9.989180100998543e-06,
"loss": 0.4208333492279053,
"mean_token_accuracy": 0.8722649529576302,
"num_tokens": 5974272.0,
"step": 390
},
{
"entropy": 0.37846892662346365,
"epoch": 0.15335339221585945,
"grad_norm": 1.40625,
"learning_rate": 9.988480230451849e-06,
"loss": 0.38179306983947753,
"mean_token_accuracy": 0.8771921068429946,
"num_tokens": 6060716.0,
"step": 395
},
{
"entropy": 0.37789811603724954,
"epoch": 0.15529457439580704,
"grad_norm": 2.03125,
"learning_rate": 9.987758454630909e-06,
"loss": 0.39535834789276125,
"mean_token_accuracy": 0.8785205245018005,
"num_tokens": 6126916.0,
"step": 400
},
{
"epoch": 0.15529457439580704,
"eval_entropy": 0.3923966215794283,
"eval_loss": 0.39556533098220825,
"eval_mean_token_accuracy": 0.876580595429302,
"eval_num_tokens": 6126916.0,
"eval_runtime": 60.1557,
"eval_samples_per_second": 35.724,
"eval_steps_per_second": 35.724,
"step": 400
},
{
"entropy": 0.43358618319034575,
"epoch": 0.15723575657575464,
"grad_norm": 1.6953125,
"learning_rate": 9.98701477670516e-06,
"loss": 0.4627527236938477,
"mean_token_accuracy": 0.8672218635678292,
"num_tokens": 6191091.0,
"step": 405
},
{
"entropy": 0.41946529373526575,
"epoch": 0.1591769387557022,
"grad_norm": 1.84375,
"learning_rate": 9.986249199940221e-06,
"loss": 0.4011059284210205,
"mean_token_accuracy": 0.8665074944496155,
"num_tokens": 6271540.0,
"step": 410
},
{
"entropy": 0.40461285747587683,
"epoch": 0.1611181209356498,
"grad_norm": 1.8828125,
"learning_rate": 9.985461727697873e-06,
"loss": 0.4005119800567627,
"mean_token_accuracy": 0.8737778559327125,
"num_tokens": 6335828.0,
"step": 415
},
{
"entropy": 0.42256330624222754,
"epoch": 0.1630593031155974,
"grad_norm": 1.9140625,
"learning_rate": 9.98465236343604e-06,
"loss": 0.447072172164917,
"mean_token_accuracy": 0.8665044084191322,
"num_tokens": 6398004.0,
"step": 420
},
{
"entropy": 0.3777090422809124,
"epoch": 0.16500048529554498,
"grad_norm": 1.6953125,
"learning_rate": 9.98382111070878e-06,
"loss": 0.40898308753967283,
"mean_token_accuracy": 0.8773329868912697,
"num_tokens": 6475011.0,
"step": 425
},
{
"entropy": 0.3920007921755314,
"epoch": 0.16694166747549258,
"grad_norm": 1.9375,
"learning_rate": 9.982967973166269e-06,
"loss": 0.36671743392944334,
"mean_token_accuracy": 0.8770380824804306,
"num_tokens": 6537403.0,
"step": 430
},
{
"entropy": 0.38834148123860357,
"epoch": 0.16888284965544018,
"grad_norm": 1.5625,
"learning_rate": 9.982092954554776e-06,
"loss": 0.40844144821166994,
"mean_token_accuracy": 0.8771779343485833,
"num_tokens": 6605063.0,
"step": 435
},
{
"entropy": 0.3952221803367138,
"epoch": 0.17082403183538775,
"grad_norm": 1.5859375,
"learning_rate": 9.981196058716662e-06,
"loss": 0.42590937614440916,
"mean_token_accuracy": 0.8744154885411263,
"num_tokens": 6679167.0,
"step": 440
},
{
"entropy": 0.38398993872106074,
"epoch": 0.17276521401533534,
"grad_norm": 1.5703125,
"learning_rate": 9.98027728959035e-06,
"loss": 0.39303438663482665,
"mean_token_accuracy": 0.8744676560163498,
"num_tokens": 6761337.0,
"step": 445
},
{
"entropy": 0.3808287113904953,
"epoch": 0.17470639619528291,
"grad_norm": 1.4296875,
"learning_rate": 9.979336651210314e-06,
"loss": 0.3940417289733887,
"mean_token_accuracy": 0.8789507359266281,
"num_tokens": 6834935.0,
"step": 450
},
{
"entropy": 0.4267194837331772,
"epoch": 0.1766475783752305,
"grad_norm": 1.765625,
"learning_rate": 9.978374147707055e-06,
"loss": 0.4340329647064209,
"mean_token_accuracy": 0.8673587426543236,
"num_tokens": 6904193.0,
"step": 455
},
{
"entropy": 0.41968510262668135,
"epoch": 0.1785887605551781,
"grad_norm": 1.6640625,
"learning_rate": 9.977389783307095e-06,
"loss": 0.4462919235229492,
"mean_token_accuracy": 0.8695808529853821,
"num_tokens": 6971958.0,
"step": 460
},
{
"entropy": 0.38464379906654356,
"epoch": 0.18052994273512568,
"grad_norm": 1.4140625,
"learning_rate": 9.976383562332946e-06,
"loss": 0.40283498764038084,
"mean_token_accuracy": 0.880063496530056,
"num_tokens": 7053470.0,
"step": 465
},
{
"entropy": 0.364135454967618,
"epoch": 0.18247112491507328,
"grad_norm": 1.640625,
"learning_rate": 9.975355489203097e-06,
"loss": 0.4077275276184082,
"mean_token_accuracy": 0.8819017142057419,
"num_tokens": 7126534.0,
"step": 470
},
{
"entropy": 0.3730104427784681,
"epoch": 0.18441230709502088,
"grad_norm": 1.7421875,
"learning_rate": 9.974305568431994e-06,
"loss": 0.3929471969604492,
"mean_token_accuracy": 0.8784888133406639,
"num_tokens": 7201674.0,
"step": 475
},
{
"entropy": 0.3986961957067251,
"epoch": 0.18635348927496845,
"grad_norm": 1.5703125,
"learning_rate": 9.973233804630022e-06,
"loss": 0.4142603874206543,
"mean_token_accuracy": 0.8733824387192726,
"num_tokens": 7271021.0,
"step": 480
},
{
"entropy": 0.4345793057233095,
"epoch": 0.18829467145491605,
"grad_norm": 1.7890625,
"learning_rate": 9.972140202503477e-06,
"loss": 0.45402941703796384,
"mean_token_accuracy": 0.8599157705903053,
"num_tokens": 7347794.0,
"step": 485
},
{
"entropy": 0.39443473145365715,
"epoch": 0.19023585363486364,
"grad_norm": 1.71875,
"learning_rate": 9.971024766854554e-06,
"loss": 0.4239619731903076,
"mean_token_accuracy": 0.8729955241084099,
"num_tokens": 7430791.0,
"step": 490
},
{
"entropy": 0.39781664684414864,
"epoch": 0.19217703581481121,
"grad_norm": 1.640625,
"learning_rate": 9.969887502581324e-06,
"loss": 0.41582446098327636,
"mean_token_accuracy": 0.8728051796555519,
"num_tokens": 7509803.0,
"step": 495
},
{
"entropy": 0.45657297112047673,
"epoch": 0.1941182179947588,
"grad_norm": 1.265625,
"learning_rate": 9.96872841467771e-06,
"loss": 0.446823263168335,
"mean_token_accuracy": 0.8597154960036277,
"num_tokens": 7601591.0,
"step": 500
},
{
"entropy": 0.3968469314277172,
"epoch": 0.19605940017470638,
"grad_norm": 1.703125,
"learning_rate": 9.967547508233466e-06,
"loss": 0.4176668643951416,
"mean_token_accuracy": 0.872602291405201,
"num_tokens": 7666352.0,
"step": 505
},
{
"entropy": 0.41542044915258886,
"epoch": 0.19800058235465398,
"grad_norm": 1.515625,
"learning_rate": 9.966344788434154e-06,
"loss": 0.43819799423217776,
"mean_token_accuracy": 0.865588866174221,
"num_tokens": 7742267.0,
"step": 510
},
{
"entropy": 0.39156174324452875,
"epoch": 0.19994176453460158,
"grad_norm": 1.3125,
"learning_rate": 9.965120260561126e-06,
"loss": 0.39728028774261476,
"mean_token_accuracy": 0.8775914892554283,
"num_tokens": 7819899.0,
"step": 515
},
{
"entropy": 0.3775805365294218,
"epoch": 0.20188294671454915,
"grad_norm": 1.9375,
"learning_rate": 9.963873929991492e-06,
"loss": 0.4102130889892578,
"mean_token_accuracy": 0.878247183561325,
"num_tokens": 7887873.0,
"step": 520
},
{
"entropy": 0.39370285868644717,
"epoch": 0.20382412889449675,
"grad_norm": 1.4375,
"learning_rate": 9.962605802198105e-06,
"loss": 0.4014415264129639,
"mean_token_accuracy": 0.8733704462647438,
"num_tokens": 7961482.0,
"step": 525
},
{
"entropy": 0.40123398676514627,
"epoch": 0.20576531107444435,
"grad_norm": 1.9453125,
"learning_rate": 9.961315882749531e-06,
"loss": 0.42458133697509765,
"mean_token_accuracy": 0.875715845823288,
"num_tokens": 8022441.0,
"step": 530
},
{
"entropy": 0.3925070337951183,
"epoch": 0.20770649325439192,
"grad_norm": 1.5234375,
"learning_rate": 9.960004177310029e-06,
"loss": 0.38911452293396,
"mean_token_accuracy": 0.8763631775975227,
"num_tokens": 8098552.0,
"step": 535
},
{
"entropy": 0.39059548266232014,
"epoch": 0.20964767543433951,
"grad_norm": 1.21875,
"learning_rate": 9.958670691639523e-06,
"loss": 0.41231446266174315,
"mean_token_accuracy": 0.8729776293039322,
"num_tokens": 8193252.0,
"step": 540
},
{
"entropy": 0.3952123038470745,
"epoch": 0.2115888576142871,
"grad_norm": 2.09375,
"learning_rate": 9.957315431593578e-06,
"loss": 0.41542778015136717,
"mean_token_accuracy": 0.8778386160731315,
"num_tokens": 8259794.0,
"step": 545
},
{
"entropy": 0.39136257991194723,
"epoch": 0.21353003979423468,
"grad_norm": 1.734375,
"learning_rate": 9.955938403123372e-06,
"loss": 0.4131179332733154,
"mean_token_accuracy": 0.8729422584176063,
"num_tokens": 8325306.0,
"step": 550
},
{
"entropy": 0.36876106821000576,
"epoch": 0.21547122197418228,
"grad_norm": 2.15625,
"learning_rate": 9.954539612275676e-06,
"loss": 0.3939671516418457,
"mean_token_accuracy": 0.8807895123958588,
"num_tokens": 8403551.0,
"step": 555
},
{
"entropy": 0.41602297611534594,
"epoch": 0.21741240415412985,
"grad_norm": 1.609375,
"learning_rate": 9.95311906519282e-06,
"loss": 0.43773713111877444,
"mean_token_accuracy": 0.8707596242427826,
"num_tokens": 8474927.0,
"step": 560
},
{
"entropy": 0.4392675504088402,
"epoch": 0.21935358633407745,
"grad_norm": 1.3984375,
"learning_rate": 9.951676768112673e-06,
"loss": 0.43816194534301756,
"mean_token_accuracy": 0.8632524207234382,
"num_tokens": 8551768.0,
"step": 565
},
{
"entropy": 0.4103314906358719,
"epoch": 0.22129476851402505,
"grad_norm": 1.6953125,
"learning_rate": 9.950212727368606e-06,
"loss": 0.43707122802734377,
"mean_token_accuracy": 0.8690274521708489,
"num_tokens": 8623280.0,
"step": 570
},
{
"entropy": 0.35621660873293876,
"epoch": 0.22323595069397262,
"grad_norm": 2.390625,
"learning_rate": 9.948726949389474e-06,
"loss": 0.39322140216827395,
"mean_token_accuracy": 0.8832426086068154,
"num_tokens": 8701428.0,
"step": 575
},
{
"entropy": 0.40779992677271365,
"epoch": 0.22517713287392022,
"grad_norm": 1.6796875,
"learning_rate": 9.947219440699584e-06,
"loss": 0.42441439628601074,
"mean_token_accuracy": 0.8702899888157845,
"num_tokens": 8769812.0,
"step": 580
},
{
"entropy": 0.42166984751820563,
"epoch": 0.22711831505386781,
"grad_norm": 1.7890625,
"learning_rate": 9.945690207918667e-06,
"loss": 0.41438679695129393,
"mean_token_accuracy": 0.8685426101088524,
"num_tokens": 8850435.0,
"step": 585
},
{
"entropy": 0.3902070872485638,
"epoch": 0.22905949723381538,
"grad_norm": 1.6796875,
"learning_rate": 9.944139257761845e-06,
"loss": 0.3842545747756958,
"mean_token_accuracy": 0.8773440137505532,
"num_tokens": 8903430.0,
"step": 590
},
{
"entropy": 0.43535452634096145,
"epoch": 0.23100067941376298,
"grad_norm": 1.5703125,
"learning_rate": 9.942566597039608e-06,
"loss": 0.42905964851379397,
"mean_token_accuracy": 0.862843619287014,
"num_tokens": 8976102.0,
"step": 595
},
{
"entropy": 0.42619109377264974,
"epoch": 0.23294186159371058,
"grad_norm": 1.9140625,
"learning_rate": 9.940972232657782e-06,
"loss": 0.4514484882354736,
"mean_token_accuracy": 0.8609629839658737,
"num_tokens": 9073966.0,
"step": 600
},
{
"epoch": 0.23294186159371058,
"eval_entropy": 0.3833293942704263,
"eval_loss": 0.38786980509757996,
"eval_mean_token_accuracy": 0.8783254230255413,
"eval_num_tokens": 9073966.0,
"eval_runtime": 60.1116,
"eval_samples_per_second": 35.75,
"eval_steps_per_second": 35.75,
"step": 600
},
{
"entropy": 0.3839044734835625,
"epoch": 0.23488304377365815,
"grad_norm": 2.203125,
"learning_rate": 9.93935617161749e-06,
"loss": 0.4186872959136963,
"mean_token_accuracy": 0.8785676345229149,
"num_tokens": 9145748.0,
"step": 605
},
{
"entropy": 0.39257055819034575,
"epoch": 0.23682422595360575,
"grad_norm": 1.609375,
"learning_rate": 9.937718421015137e-06,
"loss": 0.418427848815918,
"mean_token_accuracy": 0.8697977751493454,
"num_tokens": 9232179.0,
"step": 610
},
{
"entropy": 0.377314992621541,
"epoch": 0.23876540813355335,
"grad_norm": 1.875,
"learning_rate": 9.936058988042367e-06,
"loss": 0.4139708042144775,
"mean_token_accuracy": 0.8783795028924942,
"num_tokens": 9308281.0,
"step": 615
},
{
"entropy": 0.38419472984969616,
"epoch": 0.24070659031350092,
"grad_norm": 1.4453125,
"learning_rate": 9.934377879986035e-06,
"loss": 0.40369882583618166,
"mean_token_accuracy": 0.8746202811598778,
"num_tokens": 9393206.0,
"step": 620
},
{
"entropy": 0.3744822334498167,
"epoch": 0.24264777249344852,
"grad_norm": 1.734375,
"learning_rate": 9.932675104228177e-06,
"loss": 0.40184435844421384,
"mean_token_accuracy": 0.8763082399964333,
"num_tokens": 9485199.0,
"step": 625
},
{
"entropy": 0.42811642177402975,
"epoch": 0.2445889546733961,
"grad_norm": 1.6015625,
"learning_rate": 9.930950668245971e-06,
"loss": 0.44397845268249514,
"mean_token_accuracy": 0.8665265038609504,
"num_tokens": 9565581.0,
"step": 630
},
{
"entropy": 0.39702326618134975,
"epoch": 0.24653013685334368,
"grad_norm": 2.0625,
"learning_rate": 9.929204579611716e-06,
"loss": 0.38111956119537355,
"mean_token_accuracy": 0.8766680151224137,
"num_tokens": 9636638.0,
"step": 635
},
{
"entropy": 0.4137250851839781,
"epoch": 0.24847131903329128,
"grad_norm": 1.59375,
"learning_rate": 9.927436845992782e-06,
"loss": 0.4052375316619873,
"mean_token_accuracy": 0.8680448547005654,
"num_tokens": 9714886.0,
"step": 640
},
{
"entropy": 0.394980476424098,
"epoch": 0.2504125012132389,
"grad_norm": 1.21875,
"learning_rate": 9.925647475151596e-06,
"loss": 0.40479207038879395,
"mean_token_accuracy": 0.8760687246918678,
"num_tokens": 9793866.0,
"step": 645
},
{
"entropy": 0.3940431509166956,
"epoch": 0.2523536833931864,
"grad_norm": 1.5,
"learning_rate": 9.923836474945592e-06,
"loss": 0.3884091854095459,
"mean_token_accuracy": 0.8729974597692489,
"num_tokens": 9880497.0,
"step": 650
},
{
"entropy": 0.42443324290215967,
"epoch": 0.254294865573134,
"grad_norm": 1.7109375,
"learning_rate": 9.92200385332718e-06,
"loss": 0.4349085330963135,
"mean_token_accuracy": 0.8659611865878105,
"num_tokens": 9947758.0,
"step": 655
},
{
"entropy": 0.43983132056891916,
"epoch": 0.2562360477530816,
"grad_norm": 1.6875,
"learning_rate": 9.92014961834372e-06,
"loss": 0.46071972846984866,
"mean_token_accuracy": 0.8601326540112495,
"num_tokens": 10016198.0,
"step": 660
},
{
"entropy": 0.3981770180165768,
"epoch": 0.2581772299330292,
"grad_norm": 1.4765625,
"learning_rate": 9.918273778137477e-06,
"loss": 0.41265163421630857,
"mean_token_accuracy": 0.8710227489471436,
"num_tokens": 10111907.0,
"step": 665
},
{
"entropy": 0.41508678197860716,
"epoch": 0.2601184121129768,
"grad_norm": 1.6875,
"learning_rate": 9.916376340945584e-06,
"loss": 0.4117740631103516,
"mean_token_accuracy": 0.8687305614352226,
"num_tokens": 10191989.0,
"step": 670
},
{
"entropy": 0.4320628222078085,
"epoch": 0.2620595942929244,
"grad_norm": 1.2734375,
"learning_rate": 9.91445731510002e-06,
"loss": 0.4121531963348389,
"mean_token_accuracy": 0.8719193398952484,
"num_tokens": 10273505.0,
"step": 675
},
{
"entropy": 0.3792607393115759,
"epoch": 0.26400077647287196,
"grad_norm": 2.296875,
"learning_rate": 9.91251670902755e-06,
"loss": 0.3988708257675171,
"mean_token_accuracy": 0.8824092477560044,
"num_tokens": 10342792.0,
"step": 680
},
{
"entropy": 0.37850567921996114,
"epoch": 0.26594195865281955,
"grad_norm": 1.46875,
"learning_rate": 9.910554531249714e-06,
"loss": 0.3946362018585205,
"mean_token_accuracy": 0.8791269809007645,
"num_tokens": 10409365.0,
"step": 685
},
{
"entropy": 0.36963232718408107,
"epoch": 0.26788314083276715,
"grad_norm": 1.375,
"learning_rate": 9.90857079038277e-06,
"loss": 0.37774851322174074,
"mean_token_accuracy": 0.8818046569824218,
"num_tokens": 10497911.0,
"step": 690
},
{
"entropy": 0.3968418601900339,
"epoch": 0.26982432301271475,
"grad_norm": 1.765625,
"learning_rate": 9.906565495137665e-06,
"loss": 0.3911430835723877,
"mean_token_accuracy": 0.875646622478962,
"num_tokens": 10569784.0,
"step": 695
},
{
"entropy": 0.384658931568265,
"epoch": 0.27176550519266235,
"grad_norm": 1.4609375,
"learning_rate": 9.904538654319998e-06,
"loss": 0.4171136379241943,
"mean_token_accuracy": 0.8759941428899765,
"num_tokens": 10638722.0,
"step": 700
},
{
"entropy": 0.3830408491194248,
"epoch": 0.27370668737260995,
"grad_norm": 1.734375,
"learning_rate": 9.90249027682997e-06,
"loss": 0.43827214241027834,
"mean_token_accuracy": 0.8736877083778382,
"num_tokens": 10717045.0,
"step": 705
},
{
"entropy": 0.42484647817909715,
"epoch": 0.2756478695525575,
"grad_norm": 1.5234375,
"learning_rate": 9.900420371662364e-06,
"loss": 0.42806663513183596,
"mean_token_accuracy": 0.8680253028869629,
"num_tokens": 10786078.0,
"step": 710
},
{
"entropy": 0.36103312484920025,
"epoch": 0.2775890517325051,
"grad_norm": 1.453125,
"learning_rate": 9.898328947906489e-06,
"loss": 0.3689872741699219,
"mean_token_accuracy": 0.8823491036891937,
"num_tokens": 10863918.0,
"step": 715
},
{
"entropy": 0.4158777046948671,
"epoch": 0.2795302339124527,
"grad_norm": 1.6328125,
"learning_rate": 9.896216014746141e-06,
"loss": 0.40418004989624023,
"mean_token_accuracy": 0.8711874485015869,
"num_tokens": 10952093.0,
"step": 720
},
{
"entropy": 0.388820331171155,
"epoch": 0.2814714160924003,
"grad_norm": 1.609375,
"learning_rate": 9.894081581459579e-06,
"loss": 0.40212116241455076,
"mean_token_accuracy": 0.8809416055679321,
"num_tokens": 11026123.0,
"step": 725
},
{
"entropy": 0.4209954336285591,
"epoch": 0.2834125982723479,
"grad_norm": 1.6484375,
"learning_rate": 9.891925657419463e-06,
"loss": 0.4366124153137207,
"mean_token_accuracy": 0.870456813275814,
"num_tokens": 11090901.0,
"step": 730
},
{
"entropy": 0.40942220725119116,
"epoch": 0.2853537804522954,
"grad_norm": 1.3515625,
"learning_rate": 9.889748252092827e-06,
"loss": 0.40485477447509766,
"mean_token_accuracy": 0.8676602795720101,
"num_tokens": 11178026.0,
"step": 735
},
{
"entropy": 0.3782723072916269,
"epoch": 0.287294962632243,
"grad_norm": 1.671875,
"learning_rate": 9.887549375041031e-06,
"loss": 0.40353665351867674,
"mean_token_accuracy": 0.8802446350455284,
"num_tokens": 11231711.0,
"step": 740
},
{
"entropy": 0.39903284460306165,
"epoch": 0.2892361448121906,
"grad_norm": 1.734375,
"learning_rate": 9.885329035919724e-06,
"loss": 0.4090695858001709,
"mean_token_accuracy": 0.868949045240879,
"num_tokens": 11327844.0,
"step": 745
},
{
"entropy": 0.4031669870018959,
"epoch": 0.2911773269921382,
"grad_norm": 2.125,
"learning_rate": 9.883087244478796e-06,
"loss": 0.45818114280700684,
"mean_token_accuracy": 0.8704651057720184,
"num_tokens": 11401714.0,
"step": 750
},
{
"entropy": 0.43706582076847555,
"epoch": 0.2931185091720858,
"grad_norm": 1.71875,
"learning_rate": 9.880824010562338e-06,
"loss": 0.42615551948547364,
"mean_token_accuracy": 0.860453313589096,
"num_tokens": 11489610.0,
"step": 755
},
{
"entropy": 0.41881459429860113,
"epoch": 0.2950596913520334,
"grad_norm": 1.2578125,
"learning_rate": 9.878539344108599e-06,
"loss": 0.42671880722045896,
"mean_token_accuracy": 0.8694243490695953,
"num_tokens": 11574592.0,
"step": 760
},
{
"entropy": 0.42061977460980415,
"epoch": 0.29700087353198096,
"grad_norm": 1.59375,
"learning_rate": 9.876233255149945e-06,
"loss": 0.4513099193572998,
"mean_token_accuracy": 0.8744151741266251,
"num_tokens": 11641646.0,
"step": 765
},
{
"entropy": 0.39190227575600145,
"epoch": 0.29894205571192856,
"grad_norm": 1.84375,
"learning_rate": 9.873905753812807e-06,
"loss": 0.39388408660888674,
"mean_token_accuracy": 0.8745140552520752,
"num_tokens": 11714807.0,
"step": 770
},
{
"entropy": 0.4235379956662655,
"epoch": 0.30088323789187615,
"grad_norm": 1.9609375,
"learning_rate": 9.871556850317641e-06,
"loss": 0.45146808624267576,
"mean_token_accuracy": 0.8673876538872719,
"num_tokens": 11781200.0,
"step": 775
},
{
"entropy": 0.4271043732762337,
"epoch": 0.30282442007182375,
"grad_norm": 2.078125,
"learning_rate": 9.86918655497889e-06,
"loss": 0.41930394172668456,
"mean_token_accuracy": 0.8670791104435921,
"num_tokens": 11859281.0,
"step": 780
},
{
"entropy": 0.3889836758375168,
"epoch": 0.30476560225177135,
"grad_norm": 1.9921875,
"learning_rate": 9.866794878204926e-06,
"loss": 0.42397122383117675,
"mean_token_accuracy": 0.875930380821228,
"num_tokens": 11935604.0,
"step": 785
},
{
"entropy": 0.407536294311285,
"epoch": 0.3067067844317189,
"grad_norm": 1.7265625,
"learning_rate": 9.864381830498013e-06,
"loss": 0.4073331356048584,
"mean_token_accuracy": 0.8763293012976646,
"num_tokens": 12000505.0,
"step": 790
},
{
"entropy": 0.3989451553672552,
"epoch": 0.3086479666116665,
"grad_norm": 1.5703125,
"learning_rate": 9.861947422454262e-06,
"loss": 0.40294957160949707,
"mean_token_accuracy": 0.8775883078575134,
"num_tokens": 12073943.0,
"step": 795
},
{
"entropy": 0.3732746794819832,
"epoch": 0.3105891487916141,
"grad_norm": 1.7265625,
"learning_rate": 9.85949166476357e-06,
"loss": 0.36971125602722166,
"mean_token_accuracy": 0.8844305410981178,
"num_tokens": 12139003.0,
"step": 800
},
{
"epoch": 0.3105891487916141,
"eval_entropy": 0.381350220199683,
"eval_loss": 0.38283953070640564,
"eval_mean_token_accuracy": 0.8796889461046488,
"eval_num_tokens": 12139003.0,
"eval_runtime": 60.3322,
"eval_samples_per_second": 35.619,
"eval_steps_per_second": 35.619,
"step": 800
},
{
"entropy": 0.41282732523977755,
"epoch": 0.3125303309715617,
"grad_norm": 1.9375,
"learning_rate": 9.857014568209597e-06,
"loss": 0.46287264823913576,
"mean_token_accuracy": 0.8671122461557388,
"num_tokens": 12216310.0,
"step": 805
},
{
"entropy": 0.3740253135561943,
"epoch": 0.3144715131515093,
"grad_norm": 2.046875,
"learning_rate": 9.854516143669699e-06,
"loss": 0.3972623348236084,
"mean_token_accuracy": 0.8769651532173157,
"num_tokens": 12284513.0,
"step": 810
},
{
"entropy": 0.3666670624166727,
"epoch": 0.3164126953314569,
"grad_norm": 1.390625,
"learning_rate": 9.851996402114886e-06,
"loss": 0.3955537796020508,
"mean_token_accuracy": 0.8804457679390907,
"num_tokens": 12376220.0,
"step": 815
},
{
"entropy": 0.40160666182637217,
"epoch": 0.3183538775114044,
"grad_norm": 1.5703125,
"learning_rate": 9.849455354609777e-06,
"loss": 0.41783933639526366,
"mean_token_accuracy": 0.8718173667788506,
"num_tokens": 12465139.0,
"step": 820
},
{
"entropy": 0.395163032412529,
"epoch": 0.320295059691352,
"grad_norm": 1.46875,
"learning_rate": 9.846893012312549e-06,
"loss": 0.4353921413421631,
"mean_token_accuracy": 0.8741151168942451,
"num_tokens": 12543594.0,
"step": 825
},
{
"entropy": 0.38790931962430475,
"epoch": 0.3222362418712996,
"grad_norm": 1.4453125,
"learning_rate": 9.844309386474886e-06,
"loss": 0.4091060638427734,
"mean_token_accuracy": 0.8755813196301461,
"num_tokens": 12633984.0,
"step": 830
},
{
"entropy": 0.35505941733717916,
"epoch": 0.3241774240512472,
"grad_norm": 2.15625,
"learning_rate": 9.841704488441934e-06,
"loss": 0.34843366146087645,
"mean_token_accuracy": 0.8839788928627967,
"num_tokens": 12696787.0,
"step": 835
},
{
"entropy": 0.4340564154088497,
"epoch": 0.3261186062311948,
"grad_norm": 1.8125,
"learning_rate": 9.83907832965225e-06,
"loss": 0.4248070240020752,
"mean_token_accuracy": 0.8684971421957016,
"num_tokens": 12758495.0,
"step": 840
},
{
"entropy": 0.4072086077183485,
"epoch": 0.32805978841114236,
"grad_norm": 1.609375,
"learning_rate": 9.836430921637746e-06,
"loss": 0.4239677906036377,
"mean_token_accuracy": 0.871665708720684,
"num_tokens": 12824603.0,
"step": 845
},
{
"entropy": 0.4058460295200348,
"epoch": 0.33000097059108996,
"grad_norm": 1.6328125,
"learning_rate": 9.833762276023646e-06,
"loss": 0.437244176864624,
"mean_token_accuracy": 0.8710885986685752,
"num_tokens": 12915482.0,
"step": 850
},
{
"entropy": 0.4212725304067135,
"epoch": 0.33194215277103756,
"grad_norm": 1.703125,
"learning_rate": 9.831072404528433e-06,
"loss": 0.4174651622772217,
"mean_token_accuracy": 0.8688189521431923,
"num_tokens": 12993367.0,
"step": 855
},
{
"entropy": 0.3895297344774008,
"epoch": 0.33388333495098516,
"grad_norm": 1.671875,
"learning_rate": 9.828361318963794e-06,
"loss": 0.39536495208740235,
"mean_token_accuracy": 0.8768842920660973,
"num_tokens": 13065810.0,
"step": 860
},
{
"entropy": 0.4025235269218683,
"epoch": 0.33582451713093275,
"grad_norm": 1.9609375,
"learning_rate": 9.825629031234574e-06,
"loss": 0.37229766845703127,
"mean_token_accuracy": 0.8749150916934013,
"num_tokens": 13133471.0,
"step": 865
},
{
"entropy": 0.3760003004223108,
"epoch": 0.33776569931088035,
"grad_norm": 1.453125,
"learning_rate": 9.822875553338715e-06,
"loss": 0.3896082639694214,
"mean_token_accuracy": 0.8785295352339745,
"num_tokens": 13211617.0,
"step": 870
},
{
"entropy": 0.3616090904921293,
"epoch": 0.3397068814908279,
"grad_norm": 1.546875,
"learning_rate": 9.820100897367214e-06,
"loss": 0.3726183891296387,
"mean_token_accuracy": 0.8849639266729354,
"num_tokens": 13286555.0,
"step": 875
},
{
"entropy": 0.352078976854682,
"epoch": 0.3416480636707755,
"grad_norm": 1.8125,
"learning_rate": 9.81730507550406e-06,
"loss": 0.3919616937637329,
"mean_token_accuracy": 0.8840990662574768,
"num_tokens": 13355200.0,
"step": 880
},
{
"entropy": 0.38022752702236173,
"epoch": 0.3435892458507231,
"grad_norm": 1.5625,
"learning_rate": 9.81448810002619e-06,
"loss": 0.40322446823120117,
"mean_token_accuracy": 0.8776090621948243,
"num_tokens": 13427836.0,
"step": 885
},
{
"entropy": 0.4132396575063467,
"epoch": 0.3455304280306707,
"grad_norm": 1.4296875,
"learning_rate": 9.811649983303425e-06,
"loss": 0.4324185371398926,
"mean_token_accuracy": 0.8720195293426514,
"num_tokens": 13496990.0,
"step": 890
},
{
"entropy": 0.36122960932552817,
"epoch": 0.3474716102106183,
"grad_norm": 1.765625,
"learning_rate": 9.808790737798426e-06,
"loss": 0.39123167991638186,
"mean_token_accuracy": 0.8842917993664742,
"num_tokens": 13572848.0,
"step": 895
},
{
"entropy": 0.3831039108335972,
"epoch": 0.34941279239056583,
"grad_norm": 1.8984375,
"learning_rate": 9.805910376066631e-06,
"loss": 0.37870833873748777,
"mean_token_accuracy": 0.874284490942955,
"num_tokens": 13652434.0,
"step": 900
},
{
"entropy": 0.40437927283346653,
"epoch": 0.3513539745705134,
"grad_norm": 1.875,
"learning_rate": 9.803008910756203e-06,
"loss": 0.4461234092712402,
"mean_token_accuracy": 0.8702120751142501,
"num_tokens": 13730423.0,
"step": 905
},
{
"entropy": 0.4021365400403738,
"epoch": 0.353295156750461,
"grad_norm": 1.84375,
"learning_rate": 9.800086354607975e-06,
"loss": 0.4367063999176025,
"mean_token_accuracy": 0.8725798889994621,
"num_tokens": 13794147.0,
"step": 910
},
{
"entropy": 0.36978473588824273,
"epoch": 0.3552363389304086,
"grad_norm": 1.6015625,
"learning_rate": 9.797142720455391e-06,
"loss": 0.3837603569030762,
"mean_token_accuracy": 0.882180480659008,
"num_tokens": 13874189.0,
"step": 915
},
{
"entropy": 0.40101403892040255,
"epoch": 0.3571775211103562,
"grad_norm": 1.6953125,
"learning_rate": 9.794178021224459e-06,
"loss": 0.4223616123199463,
"mean_token_accuracy": 0.8726651340723037,
"num_tokens": 13945289.0,
"step": 920
},
{
"entropy": 0.35331339165568354,
"epoch": 0.3591187032903038,
"grad_norm": 1.6953125,
"learning_rate": 9.79119226993368e-06,
"loss": 0.4017838478088379,
"mean_token_accuracy": 0.8861800834536553,
"num_tokens": 14028418.0,
"step": 925
},
{
"entropy": 0.3538735806941986,
"epoch": 0.36105988547025136,
"grad_norm": 1.671875,
"learning_rate": 9.788185479694004e-06,
"loss": 0.382387375831604,
"mean_token_accuracy": 0.8858973324298859,
"num_tokens": 14098572.0,
"step": 930
},
{
"entropy": 0.3838920842856169,
"epoch": 0.36300106765019896,
"grad_norm": 1.515625,
"learning_rate": 9.785157663708761e-06,
"loss": 0.37942454814910886,
"mean_token_accuracy": 0.8802381858229638,
"num_tokens": 14180314.0,
"step": 935
},
{
"entropy": 0.41736325100064275,
"epoch": 0.36494224983014656,
"grad_norm": 2.40625,
"learning_rate": 9.782108835273612e-06,
"loss": 0.42386960983276367,
"mean_token_accuracy": 0.8687801375985146,
"num_tokens": 14255569.0,
"step": 940
},
{
"entropy": 0.3824776504188776,
"epoch": 0.36688343201009416,
"grad_norm": 1.7109375,
"learning_rate": 9.779039007776487e-06,
"loss": 0.39833407402038573,
"mean_token_accuracy": 0.8781590893864631,
"num_tokens": 14323554.0,
"step": 945
},
{
"entropy": 0.34414222836494446,
"epoch": 0.36882461419004176,
"grad_norm": 1.609375,
"learning_rate": 9.775948194697528e-06,
"loss": 0.3766109704971313,
"mean_token_accuracy": 0.8860784068703651,
"num_tokens": 14404112.0,
"step": 950
},
{
"entropy": 0.3819138675928116,
"epoch": 0.3707657963699893,
"grad_norm": 1.7265625,
"learning_rate": 9.772836409609025e-06,
"loss": 0.4195257663726807,
"mean_token_accuracy": 0.8752670779824256,
"num_tokens": 14493874.0,
"step": 955
},
{
"entropy": 0.3910291727632284,
"epoch": 0.3727069785499369,
"grad_norm": 1.5390625,
"learning_rate": 9.76970366617536e-06,
"loss": 0.42772369384765624,
"mean_token_accuracy": 0.8762600779533386,
"num_tokens": 14563770.0,
"step": 960
},
{
"entropy": 0.39120335690677166,
"epoch": 0.3746481607298845,
"grad_norm": 1.3828125,
"learning_rate": 9.76654997815295e-06,
"loss": 0.4206050395965576,
"mean_token_accuracy": 0.8764881610870361,
"num_tokens": 14649946.0,
"step": 965
},
{
"entropy": 0.37538095470517874,
"epoch": 0.3765893429098321,
"grad_norm": 1.2421875,
"learning_rate": 9.763375359390181e-06,
"loss": 0.40073623657226565,
"mean_token_accuracy": 0.87882329672575,
"num_tokens": 14732764.0,
"step": 970
},
{
"entropy": 0.4004466250538826,
"epoch": 0.3785305250897797,
"grad_norm": 1.34375,
"learning_rate": 9.760179823827347e-06,
"loss": 0.41023030281066897,
"mean_token_accuracy": 0.8705371245741844,
"num_tokens": 14814893.0,
"step": 975
},
{
"entropy": 0.4013238321989775,
"epoch": 0.3804717072697273,
"grad_norm": 1.7265625,
"learning_rate": 9.756963385496599e-06,
"loss": 0.44197940826416016,
"mean_token_accuracy": 0.8728833734989166,
"num_tokens": 14893153.0,
"step": 980
},
{
"entropy": 0.38930566757917406,
"epoch": 0.38241288944967483,
"grad_norm": 1.4921875,
"learning_rate": 9.753726058521868e-06,
"loss": 0.40584554672241213,
"mean_token_accuracy": 0.8758206337690353,
"num_tokens": 14968880.0,
"step": 985
},
{
"entropy": 0.4486214961856604,
"epoch": 0.38435407162962243,
"grad_norm": 1.71875,
"learning_rate": 9.750467857118811e-06,
"loss": 0.4342947959899902,
"mean_token_accuracy": 0.8598737180233001,
"num_tokens": 15055751.0,
"step": 990
},
{
"entropy": 0.3913938831537962,
"epoch": 0.38629525380957,
"grad_norm": 1.796875,
"learning_rate": 9.747188795594755e-06,
"loss": 0.43964052200317383,
"mean_token_accuracy": 0.8771824568510056,
"num_tokens": 15129042.0,
"step": 995
},
{
"entropy": 0.3813592415302992,
"epoch": 0.3882364359895176,
"grad_norm": 1.578125,
"learning_rate": 9.743888888348618e-06,
"loss": 0.3965798854827881,
"mean_token_accuracy": 0.8770193979144096,
"num_tokens": 15213724.0,
"step": 1000
},
{
"epoch": 0.3882364359895176,
"eval_entropy": 0.3787456456798906,
"eval_loss": 0.3797132670879364,
"eval_mean_token_accuracy": 0.8805272205589316,
"eval_num_tokens": 15213724.0,
"eval_runtime": 60.2207,
"eval_samples_per_second": 35.685,
"eval_steps_per_second": 35.685,
"step": 1000
},
{
"entropy": 0.36478537507355213,
"epoch": 0.3901776181694652,
"grad_norm": 1.6015625,
"learning_rate": 9.740568149870864e-06,
"loss": 0.38557422161102295,
"mean_token_accuracy": 0.8830441504716873,
"num_tokens": 15293346.0,
"step": 1005
},
{
"entropy": 0.3718357976526022,
"epoch": 0.39211880034941277,
"grad_norm": 1.3203125,
"learning_rate": 9.737226594743425e-06,
"loss": 0.41036291122436525,
"mean_token_accuracy": 0.8785412311553955,
"num_tokens": 15381773.0,
"step": 1010
},
{
"entropy": 0.3791601274162531,
"epoch": 0.39405998252936036,
"grad_norm": 1.4140625,
"learning_rate": 9.733864237639645e-06,
"loss": 0.39489567279815674,
"mean_token_accuracy": 0.8787289649248123,
"num_tokens": 15463663.0,
"step": 1015
},
{
"entropy": 0.40409386418759824,
"epoch": 0.39600116470930796,
"grad_norm": 1.5,
"learning_rate": 9.730481093324209e-06,
"loss": 0.39559972286224365,
"mean_token_accuracy": 0.873577019572258,
"num_tokens": 15545078.0,
"step": 1020
},
{
"entropy": 0.38083929792046545,
"epoch": 0.39794234688925556,
"grad_norm": 1.3359375,
"learning_rate": 9.72707717665309e-06,
"loss": 0.3976888179779053,
"mean_token_accuracy": 0.876065094769001,
"num_tokens": 15634558.0,
"step": 1025
},
{
"entropy": 0.39706564620137214,
"epoch": 0.39988352906920316,
"grad_norm": 1.96875,
"learning_rate": 9.723652502573465e-06,
"loss": 0.39628422260284424,
"mean_token_accuracy": 0.8746212273836136,
"num_tokens": 15706364.0,
"step": 1030
},
{
"entropy": 0.4297271855175495,
"epoch": 0.40182471124915076,
"grad_norm": 1.8203125,
"learning_rate": 9.720207086123674e-06,
"loss": 0.44258790016174315,
"mean_token_accuracy": 0.8682567358016968,
"num_tokens": 15786703.0,
"step": 1035
},
{
"entropy": 0.37680495008826254,
"epoch": 0.4037658934290983,
"grad_norm": 1.34375,
"learning_rate": 9.716740942433127e-06,
"loss": 0.3932778358459473,
"mean_token_accuracy": 0.8771912530064583,
"num_tokens": 15883251.0,
"step": 1040
},
{
"entropy": 0.4019945695996284,
"epoch": 0.4057070756090459,
"grad_norm": 1.359375,
"learning_rate": 9.713254086722259e-06,
"loss": 0.39323198795318604,
"mean_token_accuracy": 0.8740618228912354,
"num_tokens": 15962211.0,
"step": 1045
},
{
"entropy": 0.3781829860061407,
"epoch": 0.4076482577889935,
"grad_norm": 1.4921875,
"learning_rate": 9.709746534302453e-06,
"loss": 0.4020181655883789,
"mean_token_accuracy": 0.8759587749838829,
"num_tokens": 16040844.0,
"step": 1050
},
{
"entropy": 0.35617672353982927,
"epoch": 0.4095894399689411,
"grad_norm": 1.453125,
"learning_rate": 9.706218300575975e-06,
"loss": 0.3751538276672363,
"mean_token_accuracy": 0.8814701676368714,
"num_tokens": 16117566.0,
"step": 1055
},
{
"entropy": 0.39271861389279367,
"epoch": 0.4115306221488887,
"grad_norm": 1.7265625,
"learning_rate": 9.702669401035904e-06,
"loss": 0.3784984827041626,
"mean_token_accuracy": 0.877305480837822,
"num_tokens": 16197812.0,
"step": 1060
},
{
"entropy": 0.3672247972339392,
"epoch": 0.41347180432883623,
"grad_norm": 2.15625,
"learning_rate": 9.699099851266071e-06,
"loss": 0.3595015525817871,
"mean_token_accuracy": 0.8823614567518234,
"num_tokens": 16275198.0,
"step": 1065
},
{
"entropy": 0.369810114428401,
"epoch": 0.41541298650878383,
"grad_norm": 1.359375,
"learning_rate": 9.695509666940978e-06,
"loss": 0.405411958694458,
"mean_token_accuracy": 0.8822488501667977,
"num_tokens": 16350965.0,
"step": 1070
},
{
"entropy": 0.3897046368569136,
"epoch": 0.41735416868873143,
"grad_norm": 1.4609375,
"learning_rate": 9.691898863825749e-06,
"loss": 0.38735527992248536,
"mean_token_accuracy": 0.8744328498840332,
"num_tokens": 16436304.0,
"step": 1075
},
{
"entropy": 0.3776374412700534,
"epoch": 0.41929535086867903,
"grad_norm": 1.7890625,
"learning_rate": 9.688267457776032e-06,
"loss": 0.39870805740356446,
"mean_token_accuracy": 0.8785615637898445,
"num_tokens": 16501266.0,
"step": 1080
},
{
"entropy": 0.38694494068622587,
"epoch": 0.4212365330486266,
"grad_norm": 1.59375,
"learning_rate": 9.684615464737964e-06,
"loss": 0.39393253326416017,
"mean_token_accuracy": 0.8791399031877518,
"num_tokens": 16560531.0,
"step": 1085
},
{
"entropy": 0.38168427646160125,
"epoch": 0.4231777152285742,
"grad_norm": 1.6796875,
"learning_rate": 9.680942900748067e-06,
"loss": 0.4125086784362793,
"mean_token_accuracy": 0.878045716881752,
"num_tokens": 16627433.0,
"step": 1090
},
{
"entropy": 0.3892548579722643,
"epoch": 0.42511889740852177,
"grad_norm": 1.359375,
"learning_rate": 9.677249781933205e-06,
"loss": 0.40183658599853517,
"mean_token_accuracy": 0.8756108567118644,
"num_tokens": 16731006.0,
"step": 1095
},
{
"entropy": 0.3736507400870323,
"epoch": 0.42706007958846937,
"grad_norm": 2.28125,
"learning_rate": 9.673536124510496e-06,
"loss": 0.40765180587768557,
"mean_token_accuracy": 0.8797045171260833,
"num_tokens": 16801170.0,
"step": 1100
},
{
"entropy": 0.3525055509060621,
"epoch": 0.42900126176841696,
"grad_norm": 1.3984375,
"learning_rate": 9.669801944787249e-06,
"loss": 0.3724426031112671,
"mean_token_accuracy": 0.8849082082509995,
"num_tokens": 16881282.0,
"step": 1105
},
{
"entropy": 0.39476602226495744,
"epoch": 0.43094244394836456,
"grad_norm": 1.3125,
"learning_rate": 9.66604725916089e-06,
"loss": 0.3921769380569458,
"mean_token_accuracy": 0.8737779542803764,
"num_tokens": 16965112.0,
"step": 1110
},
{
"entropy": 0.38296737633645533,
"epoch": 0.43288362612831216,
"grad_norm": 1.65625,
"learning_rate": 9.662272084118887e-06,
"loss": 0.39906389713287355,
"mean_token_accuracy": 0.8742195263504982,
"num_tokens": 17033565.0,
"step": 1115
},
{
"entropy": 0.4264007180929184,
"epoch": 0.4348248083082597,
"grad_norm": 1.7109375,
"learning_rate": 9.658476436238683e-06,
"loss": 0.4375418186187744,
"mean_token_accuracy": 0.8661627262830734,
"num_tokens": 17110727.0,
"step": 1120
},
{
"entropy": 0.3653211809694767,
"epoch": 0.4367659904882073,
"grad_norm": 1.390625,
"learning_rate": 9.654660332187621e-06,
"loss": 0.3593518972396851,
"mean_token_accuracy": 0.882878914475441,
"num_tokens": 17189104.0,
"step": 1125
},
{
"entropy": 0.33642441034317017,
"epoch": 0.4387071726681549,
"grad_norm": 1.546875,
"learning_rate": 9.65082378872287e-06,
"loss": 0.36505885124206544,
"mean_token_accuracy": 0.8923633351922036,
"num_tokens": 17256636.0,
"step": 1130
},
{
"entropy": 0.3748783551156521,
"epoch": 0.4406483548481025,
"grad_norm": 1.6484375,
"learning_rate": 9.646966822691351e-06,
"loss": 0.4033698558807373,
"mean_token_accuracy": 0.8803606644272804,
"num_tokens": 17333913.0,
"step": 1135
},
{
"entropy": 0.3871772147715092,
"epoch": 0.4425895370280501,
"grad_norm": 1.484375,
"learning_rate": 9.643089451029666e-06,
"loss": 0.39040231704711914,
"mean_token_accuracy": 0.8764987051486969,
"num_tokens": 17402674.0,
"step": 1140
},
{
"entropy": 0.4355839218944311,
"epoch": 0.4445307192079977,
"grad_norm": 1.9609375,
"learning_rate": 9.639191690764018e-06,
"loss": 0.40796470642089844,
"mean_token_accuracy": 0.8702881962060929,
"num_tokens": 17476753.0,
"step": 1145
},
{
"entropy": 0.37759856693446636,
"epoch": 0.44647190138794524,
"grad_norm": 1.3515625,
"learning_rate": 9.635273559010148e-06,
"loss": 0.38673570156097414,
"mean_token_accuracy": 0.8773683786392212,
"num_tokens": 17554472.0,
"step": 1150
},
{
"entropy": 0.3581284359097481,
"epoch": 0.44841308356789283,
"grad_norm": 1.390625,
"learning_rate": 9.63133507297324e-06,
"loss": 0.37009692192077637,
"mean_token_accuracy": 0.8856742799282074,
"num_tokens": 17637236.0,
"step": 1155
},
{
"entropy": 0.47716011516749857,
"epoch": 0.45035426574784043,
"grad_norm": 1.84375,
"learning_rate": 9.627376249947866e-06,
"loss": 0.491148042678833,
"mean_token_accuracy": 0.8575920403003693,
"num_tokens": 17714880.0,
"step": 1160
},
{
"entropy": 0.38734299391508104,
"epoch": 0.45229544792778803,
"grad_norm": 1.7734375,
"learning_rate": 9.623397107317897e-06,
"loss": 0.4355041980743408,
"mean_token_accuracy": 0.8793953686952591,
"num_tokens": 17786198.0,
"step": 1165
},
{
"entropy": 0.3389087375253439,
"epoch": 0.45423663010773563,
"grad_norm": 1.5625,
"learning_rate": 9.619397662556434e-06,
"loss": 0.35656213760375977,
"mean_token_accuracy": 0.8894602239131928,
"num_tokens": 17851917.0,
"step": 1170
},
{
"entropy": 0.3687282849103212,
"epoch": 0.4561778122876832,
"grad_norm": 1.5859375,
"learning_rate": 9.615377933225727e-06,
"loss": 0.4001771450042725,
"mean_token_accuracy": 0.8769694566726685,
"num_tokens": 17922400.0,
"step": 1175
},
{
"entropy": 0.37209414653480055,
"epoch": 0.45811899446763077,
"grad_norm": 1.3984375,
"learning_rate": 9.611337936977096e-06,
"loss": 0.39912428855896,
"mean_token_accuracy": 0.8808334246277809,
"num_tokens": 17996715.0,
"step": 1180
},
{
"entropy": 0.38213921934366224,
"epoch": 0.46006017664757837,
"grad_norm": 1.390625,
"learning_rate": 9.607277691550862e-06,
"loss": 0.41675233840942383,
"mean_token_accuracy": 0.8742595329880715,
"num_tokens": 18084477.0,
"step": 1185
},
{
"entropy": 0.39352951049804685,
"epoch": 0.46200135882752597,
"grad_norm": 2.3125,
"learning_rate": 9.60319721477626e-06,
"loss": 0.4071157932281494,
"mean_token_accuracy": 0.8733987167477608,
"num_tokens": 18152478.0,
"step": 1190
},
{
"entropy": 0.392200979962945,
"epoch": 0.46394254100747356,
"grad_norm": 1.5390625,
"learning_rate": 9.59909652457136e-06,
"loss": 0.4249094486236572,
"mean_token_accuracy": 0.8737818524241447,
"num_tokens": 18219295.0,
"step": 1195
},
{
"entropy": 0.35255367681384087,
"epoch": 0.46588372318742116,
"grad_norm": 1.609375,
"learning_rate": 9.594975638943006e-06,
"loss": 0.3529276132583618,
"mean_token_accuracy": 0.8894811898469925,
"num_tokens": 18289986.0,
"step": 1200
},
{
"epoch": 0.46588372318742116,
"eval_entropy": 0.3770919001001211,
"eval_loss": 0.37714096903800964,
"eval_mean_token_accuracy": 0.8812256991003435,
"eval_num_tokens": 18289986.0,
"eval_runtime": 60.2886,
"eval_samples_per_second": 35.645,
"eval_steps_per_second": 35.645,
"step": 1200
},
{
"entropy": 0.38377687335014343,
"epoch": 0.4678249053673687,
"grad_norm": 1.9140625,
"learning_rate": 9.59083457598671e-06,
"loss": 0.4212928771972656,
"mean_token_accuracy": 0.8758307337760926,
"num_tokens": 18359707.0,
"step": 1205
},
{
"entropy": 0.3596473693847656,
"epoch": 0.4697660875473163,
"grad_norm": 1.4375,
"learning_rate": 9.586673353886591e-06,
"loss": 0.3813552141189575,
"mean_token_accuracy": 0.8841730430722237,
"num_tokens": 18435661.0,
"step": 1210
},
{
"entropy": 0.4140443943440914,
"epoch": 0.4717072697272639,
"grad_norm": 1.46875,
"learning_rate": 9.582491990915292e-06,
"loss": 0.4197361469268799,
"mean_token_accuracy": 0.8726279020309449,
"num_tokens": 18509573.0,
"step": 1215
},
{
"entropy": 0.3907853942364454,
"epoch": 0.4736484519072115,
"grad_norm": 1.625,
"learning_rate": 9.578290505433896e-06,
"loss": 0.4191273212432861,
"mean_token_accuracy": 0.8725411191582679,
"num_tokens": 18608110.0,
"step": 1220
},
{
"entropy": 0.3810266986489296,
"epoch": 0.4755896340871591,
"grad_norm": 1.4296875,
"learning_rate": 9.57406891589185e-06,
"loss": 0.38829352855682375,
"mean_token_accuracy": 0.8763051420450211,
"num_tokens": 18689794.0,
"step": 1225
},
{
"entropy": 0.420042909681797,
"epoch": 0.4775308162671067,
"grad_norm": 1.703125,
"learning_rate": 9.569827240826876e-06,
"loss": 0.40959844589233396,
"mean_token_accuracy": 0.8703769713640213,
"num_tokens": 18758060.0,
"step": 1230
},
{
"entropy": 0.36773351952433586,
"epoch": 0.47947199844705424,
"grad_norm": 2.328125,
"learning_rate": 9.565565498864902e-06,
"loss": 0.3752429962158203,
"mean_token_accuracy": 0.8811664238572121,
"num_tokens": 18824217.0,
"step": 1235
},
{
"entropy": 0.40542583018541334,
"epoch": 0.48141318062700184,
"grad_norm": 1.3515625,
"learning_rate": 9.561283708719968e-06,
"loss": 0.4128578662872314,
"mean_token_accuracy": 0.8699263706803322,
"num_tokens": 18910883.0,
"step": 1240
},
{
"entropy": 0.37737762071192266,
"epoch": 0.48335436280694943,
"grad_norm": 1.5234375,
"learning_rate": 9.55698188919415e-06,
"loss": 0.39167325496673583,
"mean_token_accuracy": 0.879218578338623,
"num_tokens": 18993242.0,
"step": 1245
},
{
"entropy": 0.38517537601292134,
"epoch": 0.48529554498689703,
"grad_norm": 1.921875,
"learning_rate": 9.552660059177477e-06,
"loss": 0.38378689289093015,
"mean_token_accuracy": 0.879303851723671,
"num_tokens": 19061334.0,
"step": 1250
},
{
"entropy": 0.37100368924438953,
"epoch": 0.48723672716684463,
"grad_norm": 1.828125,
"learning_rate": 9.548318237647849e-06,
"loss": 0.4200906753540039,
"mean_token_accuracy": 0.8786957338452339,
"num_tokens": 19128848.0,
"step": 1255
},
{
"entropy": 0.38898602277040484,
"epoch": 0.4891779093467922,
"grad_norm": 1.46875,
"learning_rate": 9.543956443670947e-06,
"loss": 0.4141817569732666,
"mean_token_accuracy": 0.8764576897025108,
"num_tokens": 19202756.0,
"step": 1260
},
{
"entropy": 0.3760422389954329,
"epoch": 0.49111909152673977,
"grad_norm": 2.109375,
"learning_rate": 9.539574696400165e-06,
"loss": 0.3719266653060913,
"mean_token_accuracy": 0.8817565947771072,
"num_tokens": 19268958.0,
"step": 1265
},
{
"entropy": 0.36780366376042367,
"epoch": 0.49306027370668737,
"grad_norm": 1.8203125,
"learning_rate": 9.535173015076501e-06,
"loss": 0.39360432624816893,
"mean_token_accuracy": 0.8779341161251069,
"num_tokens": 19352111.0,
"step": 1270
},
{
"entropy": 0.4413019739091396,
"epoch": 0.49500145588663497,
"grad_norm": 1.7109375,
"learning_rate": 9.5307514190285e-06,
"loss": 0.4407999515533447,
"mean_token_accuracy": 0.8592960745096206,
"num_tokens": 19441509.0,
"step": 1275
},
{
"entropy": 0.39783000349998476,
"epoch": 0.49694263806658256,
"grad_norm": 1.234375,
"learning_rate": 9.526309927672148e-06,
"loss": 0.42558717727661133,
"mean_token_accuracy": 0.8755456551909446,
"num_tokens": 19525598.0,
"step": 1280
},
{
"entropy": 0.36931919269263747,
"epoch": 0.49888382024653016,
"grad_norm": 1.3984375,
"learning_rate": 9.521848560510796e-06,
"loss": 0.38771824836730956,
"mean_token_accuracy": 0.8801409855484963,
"num_tokens": 19612547.0,
"step": 1285
},
{
"entropy": 0.41849659457802774,
"epoch": 0.5008250024264778,
"grad_norm": 1.6953125,
"learning_rate": 9.517367337135076e-06,
"loss": 0.43532710075378417,
"mean_token_accuracy": 0.8676797851920128,
"num_tokens": 19689731.0,
"step": 1290
},
{
"entropy": 0.41437376402318477,
"epoch": 0.5027661846064253,
"grad_norm": 1.6796875,
"learning_rate": 9.51286627722281e-06,
"loss": 0.4385324478149414,
"mean_token_accuracy": 0.8706857517361641,
"num_tokens": 19756661.0,
"step": 1295
},
{
"entropy": 0.4086436625570059,
"epoch": 0.5047073667863728,
"grad_norm": 1.6171875,
"learning_rate": 9.508345400538926e-06,
"loss": 0.4336398124694824,
"mean_token_accuracy": 0.8683807790279389,
"num_tokens": 19821704.0,
"step": 1300
},
{
"entropy": 0.3883740194141865,
"epoch": 0.5066485489663205,
"grad_norm": 1.3828125,
"learning_rate": 9.503804726935369e-06,
"loss": 0.39049382209777833,
"mean_token_accuracy": 0.8757125899195671,
"num_tokens": 19902414.0,
"step": 1305
},
{
"entropy": 0.39095442183315754,
"epoch": 0.508589731146268,
"grad_norm": 1.6640625,
"learning_rate": 9.499244276351019e-06,
"loss": 0.38634843826293946,
"mean_token_accuracy": 0.8751242905855179,
"num_tokens": 19984638.0,
"step": 1310
},
{
"entropy": 0.3666827451437712,
"epoch": 0.5105309133262157,
"grad_norm": 1.6171875,
"learning_rate": 9.494664068811597e-06,
"loss": 0.40018815994262696,
"mean_token_accuracy": 0.880105035007,
"num_tokens": 20064261.0,
"step": 1315
},
{
"entropy": 0.34377430453896524,
"epoch": 0.5124720955061632,
"grad_norm": 1.8046875,
"learning_rate": 9.490064124429584e-06,
"loss": 0.36567790508270265,
"mean_token_accuracy": 0.8907153263688088,
"num_tokens": 20126962.0,
"step": 1320
},
{
"entropy": 0.3555528115481138,
"epoch": 0.5144132776861109,
"grad_norm": 1.3046875,
"learning_rate": 9.485444463404125e-06,
"loss": 0.3725638151168823,
"mean_token_accuracy": 0.8841280445456505,
"num_tokens": 20215463.0,
"step": 1325
},
{
"entropy": 0.3422409202903509,
"epoch": 0.5163544598660584,
"grad_norm": 1.53125,
"learning_rate": 9.480805106020947e-06,
"loss": 0.3722813129425049,
"mean_token_accuracy": 0.8891440883278847,
"num_tokens": 20298577.0,
"step": 1330
},
{
"entropy": 0.3823659881949425,
"epoch": 0.518295642046006,
"grad_norm": 1.5234375,
"learning_rate": 9.476146072652262e-06,
"loss": 0.39306447505950926,
"mean_token_accuracy": 0.876395545899868,
"num_tokens": 20374990.0,
"step": 1335
},
{
"entropy": 0.388584029302001,
"epoch": 0.5202368242259536,
"grad_norm": 1.5,
"learning_rate": 9.471467383756692e-06,
"loss": 0.41069755554199217,
"mean_token_accuracy": 0.8789796933531762,
"num_tokens": 20446548.0,
"step": 1340
},
{
"entropy": 0.36739424169063567,
"epoch": 0.5221780064059012,
"grad_norm": 1.8671875,
"learning_rate": 9.46676905987916e-06,
"loss": 0.3884859085083008,
"mean_token_accuracy": 0.8806875750422478,
"num_tokens": 20517149.0,
"step": 1345
},
{
"entropy": 0.35578424148261545,
"epoch": 0.5241191885858488,
"grad_norm": 1.5,
"learning_rate": 9.462051121650816e-06,
"loss": 0.3846778869628906,
"mean_token_accuracy": 0.8805378764867783,
"num_tokens": 20596629.0,
"step": 1350
},
{
"entropy": 0.375444458052516,
"epoch": 0.5260603707657964,
"grad_norm": 1.8984375,
"learning_rate": 9.457313589788937e-06,
"loss": 0.40492801666259765,
"mean_token_accuracy": 0.8799885243177414,
"num_tokens": 20660631.0,
"step": 1355
},
{
"entropy": 0.38559874445199965,
"epoch": 0.5280015529457439,
"grad_norm": 1.984375,
"learning_rate": 9.452556485096839e-06,
"loss": 0.4140150547027588,
"mean_token_accuracy": 0.8767204716801643,
"num_tokens": 20723882.0,
"step": 1360
},
{
"entropy": 0.40360996387898923,
"epoch": 0.5299427351256916,
"grad_norm": 1.5234375,
"learning_rate": 9.447779828463788e-06,
"loss": 0.38798012733459475,
"mean_token_accuracy": 0.8741889104247094,
"num_tokens": 20801012.0,
"step": 1365
},
{
"entropy": 0.3853685542941093,
"epoch": 0.5318839173056391,
"grad_norm": 1.4140625,
"learning_rate": 9.442983640864904e-06,
"loss": 0.39840006828308105,
"mean_token_accuracy": 0.8812712132930756,
"num_tokens": 20870494.0,
"step": 1370
},
{
"entropy": 0.41988850980997083,
"epoch": 0.5338250994855868,
"grad_norm": 1.859375,
"learning_rate": 9.43816794336107e-06,
"loss": 0.42660999298095703,
"mean_token_accuracy": 0.8691768750548363,
"num_tokens": 20932159.0,
"step": 1375
},
{
"entropy": 0.35770875252783296,
"epoch": 0.5357662816655343,
"grad_norm": 1.3203125,
"learning_rate": 9.433332757098844e-06,
"loss": 0.35865347385406493,
"mean_token_accuracy": 0.8853553980588913,
"num_tokens": 21012568.0,
"step": 1380
},
{
"entropy": 0.3706452056765556,
"epoch": 0.5377074638454818,
"grad_norm": 1.734375,
"learning_rate": 9.428478103310358e-06,
"loss": 0.40013108253479,
"mean_token_accuracy": 0.8823366552591324,
"num_tokens": 21083094.0,
"step": 1385
},
{
"entropy": 0.39769635573029516,
"epoch": 0.5396486460254295,
"grad_norm": 1.390625,
"learning_rate": 9.423604003313232e-06,
"loss": 0.4011887550354004,
"mean_token_accuracy": 0.8764868810772896,
"num_tokens": 21161361.0,
"step": 1390
},
{
"entropy": 0.39253461360931396,
"epoch": 0.541589828205377,
"grad_norm": 1.8828125,
"learning_rate": 9.418710478510478e-06,
"loss": 0.41046462059020994,
"mean_token_accuracy": 0.878234452009201,
"num_tokens": 21225113.0,
"step": 1395
},
{
"entropy": 0.3744351703673601,
"epoch": 0.5435310103853247,
"grad_norm": 1.5625,
"learning_rate": 9.413797550390403e-06,
"loss": 0.37674736976623535,
"mean_token_accuracy": 0.8828163802623749,
"num_tokens": 21295691.0,
"step": 1400
},
{
"epoch": 0.5435310103853247,
"eval_entropy": 0.3686942668842293,
"eval_loss": 0.3755117952823639,
"eval_mean_token_accuracy": 0.8816876367060402,
"eval_num_tokens": 21295691.0,
"eval_runtime": 60.3519,
"eval_samples_per_second": 35.608,
"eval_steps_per_second": 35.608,
"step": 1400
},
{
"entropy": 0.34990762211382387,
"epoch": 0.5454721925652722,
"grad_norm": 1.2109375,
"learning_rate": 9.40886524052652e-06,
"loss": 0.3539942979812622,
"mean_token_accuracy": 0.8850773021578788,
"num_tokens": 21376676.0,
"step": 1405
},
{
"entropy": 0.41086711175739765,
"epoch": 0.5474133747452199,
"grad_norm": 1.6328125,
"learning_rate": 9.403913570577448e-06,
"loss": 0.43075881004333494,
"mean_token_accuracy": 0.871852807700634,
"num_tokens": 21451444.0,
"step": 1410
},
{
"entropy": 0.37052917703986166,
"epoch": 0.5493545569251674,
"grad_norm": 1.453125,
"learning_rate": 9.398942562286822e-06,
"loss": 0.38300988674163816,
"mean_token_accuracy": 0.8779854521155357,
"num_tokens": 21536871.0,
"step": 1415
},
{
"entropy": 0.3861477542668581,
"epoch": 0.551295739105115,
"grad_norm": 1.7265625,
"learning_rate": 9.393952237483195e-06,
"loss": 0.40117707252502444,
"mean_token_accuracy": 0.8765692830085754,
"num_tokens": 21605987.0,
"step": 1420
},
{
"entropy": 0.4081678859889507,
"epoch": 0.5532369212850626,
"grad_norm": 1.7890625,
"learning_rate": 9.38894261807994e-06,
"loss": 0.42310566902160646,
"mean_token_accuracy": 0.8693249508738518,
"num_tokens": 21692774.0,
"step": 1425
},
{
"entropy": 0.3717794116586447,
"epoch": 0.5551781034650102,
"grad_norm": 1.4453125,
"learning_rate": 9.383913726075157e-06,
"loss": 0.38362655639648435,
"mean_token_accuracy": 0.8794390082359314,
"num_tokens": 21774027.0,
"step": 1430
},
{
"entropy": 0.3897536873817444,
"epoch": 0.5571192856449578,
"grad_norm": 1.59375,
"learning_rate": 9.378865583551575e-06,
"loss": 0.40027127265930174,
"mean_token_accuracy": 0.874237485229969,
"num_tokens": 21855865.0,
"step": 1435
},
{
"entropy": 0.4386448211967945,
"epoch": 0.5590604678249054,
"grad_norm": 1.8984375,
"learning_rate": 9.373798212676459e-06,
"loss": 0.44517908096313474,
"mean_token_accuracy": 0.8637019321322441,
"num_tokens": 21937592.0,
"step": 1440
},
{
"entropy": 0.40399301163852214,
"epoch": 0.5610016500048529,
"grad_norm": 2.234375,
"learning_rate": 9.368711635701499e-06,
"loss": 0.42911725044250487,
"mean_token_accuracy": 0.8717393398284912,
"num_tokens": 22016228.0,
"step": 1445
},
{
"entropy": 0.33674396723508837,
"epoch": 0.5629428321848006,
"grad_norm": 1.625,
"learning_rate": 9.363605874962735e-06,
"loss": 0.3449155569076538,
"mean_token_accuracy": 0.8916645109653473,
"num_tokens": 22091155.0,
"step": 1450
},
{
"entropy": 0.34950118474662306,
"epoch": 0.5648840143647481,
"grad_norm": 1.6875,
"learning_rate": 9.358480952880438e-06,
"loss": 0.37925631999969484,
"mean_token_accuracy": 0.8876421838998795,
"num_tokens": 22168063.0,
"step": 1455
},
{
"entropy": 0.3646994840353727,
"epoch": 0.5668251965446958,
"grad_norm": 1.359375,
"learning_rate": 9.35333689195902e-06,
"loss": 0.3887592554092407,
"mean_token_accuracy": 0.8781901568174362,
"num_tokens": 22247190.0,
"step": 1460
},
{
"entropy": 0.42925515584647655,
"epoch": 0.5687663787246433,
"grad_norm": 1.7265625,
"learning_rate": 9.34817371478694e-06,
"loss": 0.4361258983612061,
"mean_token_accuracy": 0.8652834072709084,
"num_tokens": 22327454.0,
"step": 1465
},
{
"entropy": 0.40622838474810125,
"epoch": 0.5707075609045908,
"grad_norm": 1.53125,
"learning_rate": 9.342991444036593e-06,
"loss": 0.4456647872924805,
"mean_token_accuracy": 0.8694571733474732,
"num_tokens": 22412083.0,
"step": 1470
},
{
"entropy": 0.42173517793416976,
"epoch": 0.5726487430845385,
"grad_norm": 1.421875,
"learning_rate": 9.337790102464224e-06,
"loss": 0.454360818862915,
"mean_token_accuracy": 0.8657700821757317,
"num_tokens": 22490065.0,
"step": 1475
},
{
"entropy": 0.4561560284346342,
"epoch": 0.574589925264486,
"grad_norm": 1.40625,
"learning_rate": 9.332569712909816e-06,
"loss": 0.4739046573638916,
"mean_token_accuracy": 0.8589961290359497,
"num_tokens": 22578402.0,
"step": 1480
},
{
"entropy": 0.38373861461877823,
"epoch": 0.5765311074444337,
"grad_norm": 1.6796875,
"learning_rate": 9.327330298296998e-06,
"loss": 0.3775209665298462,
"mean_token_accuracy": 0.8786651358008385,
"num_tokens": 22657716.0,
"step": 1485
},
{
"entropy": 0.34858159013092516,
"epoch": 0.5784722896243812,
"grad_norm": 1.5,
"learning_rate": 9.32207188163294e-06,
"loss": 0.36159141063690187,
"mean_token_accuracy": 0.8849190220236778,
"num_tokens": 22727213.0,
"step": 1490
},
{
"entropy": 0.36950380988419057,
"epoch": 0.5804134718043288,
"grad_norm": 1.84375,
"learning_rate": 9.316794486008254e-06,
"loss": 0.41820201873779295,
"mean_token_accuracy": 0.8807887002825737,
"num_tokens": 22796084.0,
"step": 1495
},
{
"entropy": 0.3770993869751692,
"epoch": 0.5823546539842764,
"grad_norm": 1.5234375,
"learning_rate": 9.31149813459689e-06,
"loss": 0.3539431571960449,
"mean_token_accuracy": 0.8795977741479873,
"num_tokens": 22870251.0,
"step": 1500
},
{
"entropy": 0.3788142062723637,
"epoch": 0.584295836164224,
"grad_norm": 1.828125,
"learning_rate": 9.306182850656037e-06,
"loss": 0.3946338415145874,
"mean_token_accuracy": 0.8801519960165024,
"num_tokens": 22951081.0,
"step": 1505
},
{
"entropy": 0.3841311365365982,
"epoch": 0.5862370183441716,
"grad_norm": 1.46875,
"learning_rate": 9.300848657526024e-06,
"loss": 0.38277838230133054,
"mean_token_accuracy": 0.8772434189915657,
"num_tokens": 23034217.0,
"step": 1510
},
{
"entropy": 0.3577498983591795,
"epoch": 0.5881782005241192,
"grad_norm": 2.0,
"learning_rate": 9.29549557863021e-06,
"loss": 0.37149059772491455,
"mean_token_accuracy": 0.8867600724101067,
"num_tokens": 23103243.0,
"step": 1515
},
{
"entropy": 0.36544432379305364,
"epoch": 0.5901193827040668,
"grad_norm": 1.65625,
"learning_rate": 9.29012363747488e-06,
"loss": 0.3911574840545654,
"mean_token_accuracy": 0.8809396475553513,
"num_tokens": 23180618.0,
"step": 1520
},
{
"entropy": 0.37773900777101516,
"epoch": 0.5920605648840144,
"grad_norm": 1.6015625,
"learning_rate": 9.284732857649154e-06,
"loss": 0.40440049171447756,
"mean_token_accuracy": 0.8771207213401795,
"num_tokens": 23274254.0,
"step": 1525
},
{
"entropy": 0.41926471069455146,
"epoch": 0.5940017470639619,
"grad_norm": 1.78125,
"learning_rate": 9.279323262824871e-06,
"loss": 0.43068270683288573,
"mean_token_accuracy": 0.8644863858819007,
"num_tokens": 23354103.0,
"step": 1530
},
{
"entropy": 0.3747733347117901,
"epoch": 0.5959429292439096,
"grad_norm": 1.5703125,
"learning_rate": 9.273894876756497e-06,
"loss": 0.3952503204345703,
"mean_token_accuracy": 0.8833319827914238,
"num_tokens": 23420187.0,
"step": 1535
},
{
"entropy": 0.3898849368095398,
"epoch": 0.5978841114238571,
"grad_norm": 1.4375,
"learning_rate": 9.268447723281003e-06,
"loss": 0.4146092891693115,
"mean_token_accuracy": 0.8798678085207939,
"num_tokens": 23491179.0,
"step": 1540
},
{
"entropy": 0.34879231434315444,
"epoch": 0.5998252936038048,
"grad_norm": 1.4375,
"learning_rate": 9.262981826317778e-06,
"loss": 0.37036240100860596,
"mean_token_accuracy": 0.8908288046717644,
"num_tokens": 23561490.0,
"step": 1545
},
{
"entropy": 0.4013595413416624,
"epoch": 0.6017664757837523,
"grad_norm": 1.90625,
"learning_rate": 9.257497209868516e-06,
"loss": 0.42535991668701173,
"mean_token_accuracy": 0.8715372681617737,
"num_tokens": 23643197.0,
"step": 1550
},
{
"entropy": 0.3767758123576641,
"epoch": 0.6037076579636999,
"grad_norm": 1.875,
"learning_rate": 9.251993898017109e-06,
"loss": 0.3970643997192383,
"mean_token_accuracy": 0.8821586266160011,
"num_tokens": 23714513.0,
"step": 1555
},
{
"entropy": 0.40486165285110476,
"epoch": 0.6056488401436475,
"grad_norm": 1.515625,
"learning_rate": 9.246471914929547e-06,
"loss": 0.41384401321411135,
"mean_token_accuracy": 0.8695383608341217,
"num_tokens": 23801743.0,
"step": 1560
},
{
"entropy": 0.3538258448243141,
"epoch": 0.607590022323595,
"grad_norm": 1.6328125,
"learning_rate": 9.240931284853807e-06,
"loss": 0.3868009090423584,
"mean_token_accuracy": 0.8842133894562721,
"num_tokens": 23893948.0,
"step": 1565
},
{
"entropy": 0.3908126030117273,
"epoch": 0.6095312045035427,
"grad_norm": 1.8125,
"learning_rate": 9.235372032119747e-06,
"loss": 0.40709576606750486,
"mean_token_accuracy": 0.8742722377181054,
"num_tokens": 23959291.0,
"step": 1570
},
{
"entropy": 0.39328810535371306,
"epoch": 0.6114723866834902,
"grad_norm": 1.625,
"learning_rate": 9.229794181139002e-06,
"loss": 0.40347847938537595,
"mean_token_accuracy": 0.874020305275917,
"num_tokens": 24028375.0,
"step": 1575
},
{
"entropy": 0.38035779893398286,
"epoch": 0.6134135688634378,
"grad_norm": 1.4765625,
"learning_rate": 9.224197756404875e-06,
"loss": 0.39300010204315183,
"mean_token_accuracy": 0.8796775847673416,
"num_tokens": 24106755.0,
"step": 1580
},
{
"entropy": 0.35121305733919145,
"epoch": 0.6153547510433854,
"grad_norm": 2.296875,
"learning_rate": 9.218582782492228e-06,
"loss": 0.39762823581695556,
"mean_token_accuracy": 0.8853226408362389,
"num_tokens": 24174787.0,
"step": 1585
},
{
"entropy": 0.3741837713867426,
"epoch": 0.617295933223333,
"grad_norm": 1.7578125,
"learning_rate": 9.212949284057378e-06,
"loss": 0.39895901679992674,
"mean_token_accuracy": 0.8801515579223633,
"num_tokens": 24253990.0,
"step": 1590
},
{
"entropy": 0.4153418317437172,
"epoch": 0.6192371154032806,
"grad_norm": 1.7890625,
"learning_rate": 9.207297285837984e-06,
"loss": 0.4323587894439697,
"mean_token_accuracy": 0.8745242461562157,
"num_tokens": 24326608.0,
"step": 1595
},
{
"entropy": 0.42888959534466264,
"epoch": 0.6211782975832282,
"grad_norm": 1.3125,
"learning_rate": 9.201626812652942e-06,
"loss": 0.4193469524383545,
"mean_token_accuracy": 0.8636892691254616,
"num_tokens": 24408096.0,
"step": 1600
},
{
"epoch": 0.6211782975832282,
"eval_entropy": 0.3693768273520714,
"eval_loss": 0.3738563656806946,
"eval_mean_token_accuracy": 0.8822050338265174,
"eval_num_tokens": 24408096.0,
"eval_runtime": 60.0738,
"eval_samples_per_second": 35.773,
"eval_steps_per_second": 35.773,
"step": 1600
},
{
"entropy": 0.4069465111941099,
"epoch": 0.6231194797631757,
"grad_norm": 1.3828125,
"learning_rate": 9.195937889402276e-06,
"loss": 0.3946805238723755,
"mean_token_accuracy": 0.8711962580680848,
"num_tokens": 24477273.0,
"step": 1605
},
{
"entropy": 0.3497770603746176,
"epoch": 0.6250606619431234,
"grad_norm": 1.53125,
"learning_rate": 9.190230541067023e-06,
"loss": 0.3609620094299316,
"mean_token_accuracy": 0.8848605647683143,
"num_tokens": 24556472.0,
"step": 1610
},
{
"entropy": 0.4083269018679857,
"epoch": 0.6270018441230709,
"grad_norm": 1.484375,
"learning_rate": 9.184504792709134e-06,
"loss": 0.4195822238922119,
"mean_token_accuracy": 0.873286210000515,
"num_tokens": 24633310.0,
"step": 1615
},
{
"entropy": 0.3741916142404079,
"epoch": 0.6289430263030186,
"grad_norm": 1.71875,
"learning_rate": 9.178760669471351e-06,
"loss": 0.3778867244720459,
"mean_token_accuracy": 0.8781406879425049,
"num_tokens": 24702572.0,
"step": 1620
},
{
"entropy": 0.34799036718904974,
"epoch": 0.6308842084829661,
"grad_norm": 1.3828125,
"learning_rate": 9.17299819657711e-06,
"loss": 0.36290202140808103,
"mean_token_accuracy": 0.8880066946148872,
"num_tokens": 24786717.0,
"step": 1625
},
{
"entropy": 0.35105147287249566,
"epoch": 0.6328253906629138,
"grad_norm": 1.9296875,
"learning_rate": 9.167217399330418e-06,
"loss": 0.367209792137146,
"mean_token_accuracy": 0.886526557803154,
"num_tokens": 24861736.0,
"step": 1630
},
{
"entropy": 0.3578195352107286,
"epoch": 0.6347665728428613,
"grad_norm": 1.375,
"learning_rate": 9.161418303115749e-06,
"loss": 0.3651568412780762,
"mean_token_accuracy": 0.8801181107759476,
"num_tokens": 24932067.0,
"step": 1635
},
{
"entropy": 0.4024165827780962,
"epoch": 0.6367077550228089,
"grad_norm": 1.546875,
"learning_rate": 9.155600933397932e-06,
"loss": 0.4195927619934082,
"mean_token_accuracy": 0.8746752873063087,
"num_tokens": 25003342.0,
"step": 1640
},
{
"entropy": 0.41863835491240026,
"epoch": 0.6386489372027565,
"grad_norm": 1.234375,
"learning_rate": 9.149765315722039e-06,
"loss": 0.4207592964172363,
"mean_token_accuracy": 0.8699864789843559,
"num_tokens": 25089543.0,
"step": 1645
},
{
"entropy": 0.3774807959794998,
"epoch": 0.640590119382704,
"grad_norm": 1.484375,
"learning_rate": 9.14391147571327e-06,
"loss": 0.38125219345092776,
"mean_token_accuracy": 0.8813193202018738,
"num_tokens": 25164250.0,
"step": 1650
},
{
"entropy": 0.36004649810492995,
"epoch": 0.6425313015626517,
"grad_norm": 1.4609375,
"learning_rate": 9.13803943907684e-06,
"loss": 0.38634524345397947,
"mean_token_accuracy": 0.8815308138728142,
"num_tokens": 25235584.0,
"step": 1655
},
{
"entropy": 0.40895739644765855,
"epoch": 0.6444724837425992,
"grad_norm": 1.390625,
"learning_rate": 9.132149231597874e-06,
"loss": 0.42175993919372556,
"mean_token_accuracy": 0.8735464856028556,
"num_tokens": 25326121.0,
"step": 1660
},
{
"entropy": 0.42022631838917734,
"epoch": 0.6464136659225468,
"grad_norm": 1.3671875,
"learning_rate": 9.126240879141286e-06,
"loss": 0.4283411502838135,
"mean_token_accuracy": 0.8683241337537766,
"num_tokens": 25416532.0,
"step": 1665
},
{
"entropy": 0.3419806692749262,
"epoch": 0.6483548481024944,
"grad_norm": 1.3515625,
"learning_rate": 9.120314407651665e-06,
"loss": 0.3869215726852417,
"mean_token_accuracy": 0.8876996964216233,
"num_tokens": 25500339.0,
"step": 1670
},
{
"entropy": 0.37519195675849915,
"epoch": 0.650296030282442,
"grad_norm": 1.65625,
"learning_rate": 9.114369843153168e-06,
"loss": 0.38437614440917967,
"mean_token_accuracy": 0.880204701423645,
"num_tokens": 25571598.0,
"step": 1675
},
{
"entropy": 0.34165109843015673,
"epoch": 0.6522372124623896,
"grad_norm": 1.7734375,
"learning_rate": 9.108407211749397e-06,
"loss": 0.3734029531478882,
"mean_token_accuracy": 0.8863589748740196,
"num_tokens": 25647870.0,
"step": 1680
},
{
"entropy": 0.3643860913813114,
"epoch": 0.6541783946423372,
"grad_norm": 1.4765625,
"learning_rate": 9.102426539623295e-06,
"loss": 0.3877432107925415,
"mean_token_accuracy": 0.8784330353140831,
"num_tokens": 25729611.0,
"step": 1685
},
{
"entropy": 0.4150772735476494,
"epoch": 0.6561195768222847,
"grad_norm": 1.78125,
"learning_rate": 9.09642785303702e-06,
"loss": 0.4420276641845703,
"mean_token_accuracy": 0.8656805634498597,
"num_tokens": 25808840.0,
"step": 1690
},
{
"entropy": 0.3560687083750963,
"epoch": 0.6580607590022324,
"grad_norm": 1.7421875,
"learning_rate": 9.090411178331835e-06,
"loss": 0.37286901473999023,
"mean_token_accuracy": 0.8856526881456375,
"num_tokens": 25887901.0,
"step": 1695
},
{
"entropy": 0.4048729032278061,
"epoch": 0.6600019411821799,
"grad_norm": 1.2734375,
"learning_rate": 9.084376541927995e-06,
"loss": 0.4281449317932129,
"mean_token_accuracy": 0.8717523291707039,
"num_tokens": 25980223.0,
"step": 1700
},
{
"entropy": 0.4058201160281897,
"epoch": 0.6619431233621276,
"grad_norm": 1.28125,
"learning_rate": 9.078323970324626e-06,
"loss": 0.42533535957336427,
"mean_token_accuracy": 0.8724991276860237,
"num_tokens": 26057550.0,
"step": 1705
},
{
"entropy": 0.40643964521586895,
"epoch": 0.6638843055420751,
"grad_norm": 1.640625,
"learning_rate": 9.072253490099607e-06,
"loss": 0.4063755512237549,
"mean_token_accuracy": 0.8733890399336814,
"num_tokens": 26131468.0,
"step": 1710
},
{
"entropy": 0.37885321527719495,
"epoch": 0.6658254877220227,
"grad_norm": 1.53125,
"learning_rate": 9.066165127909463e-06,
"loss": 0.39308197498321534,
"mean_token_accuracy": 0.881773728132248,
"num_tokens": 26209570.0,
"step": 1715
},
{
"entropy": 0.39982022494077685,
"epoch": 0.6677666699019703,
"grad_norm": 1.359375,
"learning_rate": 9.060058910489237e-06,
"loss": 0.4166593551635742,
"mean_token_accuracy": 0.875648008286953,
"num_tokens": 26291210.0,
"step": 1720
},
{
"entropy": 0.38358601108193396,
"epoch": 0.6697078520819179,
"grad_norm": 1.6328125,
"learning_rate": 9.053934864652382e-06,
"loss": 0.39159939289093015,
"mean_token_accuracy": 0.8792028650641441,
"num_tokens": 26363096.0,
"step": 1725
},
{
"entropy": 0.39065288491547107,
"epoch": 0.6716490342618655,
"grad_norm": 1.6875,
"learning_rate": 9.047793017290635e-06,
"loss": 0.41971278190612793,
"mean_token_accuracy": 0.8771908909082413,
"num_tokens": 26449438.0,
"step": 1730
},
{
"entropy": 0.36197944805026055,
"epoch": 0.673590216441813,
"grad_norm": 1.78125,
"learning_rate": 9.041633395373902e-06,
"loss": 0.3651232957839966,
"mean_token_accuracy": 0.8863715797662735,
"num_tokens": 26506251.0,
"step": 1735
},
{
"entropy": 0.41872271075844764,
"epoch": 0.6755313986217607,
"grad_norm": 1.5234375,
"learning_rate": 9.035456025950145e-06,
"loss": 0.4293703556060791,
"mean_token_accuracy": 0.8711474344134331,
"num_tokens": 26577535.0,
"step": 1740
},
{
"entropy": 0.3581832841038704,
"epoch": 0.6774725808017082,
"grad_norm": 1.609375,
"learning_rate": 9.029260936145252e-06,
"loss": 0.3745636224746704,
"mean_token_accuracy": 0.8827520579099655,
"num_tokens": 26652699.0,
"step": 1745
},
{
"entropy": 0.43668837919831277,
"epoch": 0.6794137629816558,
"grad_norm": 1.65625,
"learning_rate": 9.02304815316293e-06,
"loss": 0.45046534538269045,
"mean_token_accuracy": 0.865936142206192,
"num_tokens": 26735591.0,
"step": 1750
},
{
"entropy": 0.3559565614908934,
"epoch": 0.6813549451616034,
"grad_norm": 1.421875,
"learning_rate": 9.016817704284575e-06,
"loss": 0.36423630714416505,
"mean_token_accuracy": 0.8824115738272666,
"num_tokens": 26812459.0,
"step": 1755
},
{
"entropy": 0.3483701661229134,
"epoch": 0.683296127341551,
"grad_norm": 1.8203125,
"learning_rate": 9.010569616869159e-06,
"loss": 0.37481648921966554,
"mean_token_accuracy": 0.8892971143126488,
"num_tokens": 26882592.0,
"step": 1760
},
{
"entropy": 0.3975631568580866,
"epoch": 0.6852373095214986,
"grad_norm": 1.484375,
"learning_rate": 9.004303918353107e-06,
"loss": 0.39717047214508056,
"mean_token_accuracy": 0.8726603716611863,
"num_tokens": 26954080.0,
"step": 1765
},
{
"entropy": 0.38138355370610955,
"epoch": 0.6871784917014462,
"grad_norm": 2.234375,
"learning_rate": 8.998020636250181e-06,
"loss": 0.39662230014801025,
"mean_token_accuracy": 0.8773909747600556,
"num_tokens": 27025611.0,
"step": 1770
},
{
"entropy": 0.35980530045926573,
"epoch": 0.6891196738813937,
"grad_norm": 1.90625,
"learning_rate": 8.991719798151354e-06,
"loss": 0.38723225593566896,
"mean_token_accuracy": 0.8855857968330383,
"num_tokens": 27106998.0,
"step": 1775
},
{
"entropy": 0.39718156717717645,
"epoch": 0.6910608560613414,
"grad_norm": 1.3203125,
"learning_rate": 8.985401431724685e-06,
"loss": 0.42195706367492675,
"mean_token_accuracy": 0.870930427312851,
"num_tokens": 27191593.0,
"step": 1780
},
{
"entropy": 0.39791759476065636,
"epoch": 0.6930020382412889,
"grad_norm": 1.4453125,
"learning_rate": 8.979065564715209e-06,
"loss": 0.3908670902252197,
"mean_token_accuracy": 0.877900630235672,
"num_tokens": 27259061.0,
"step": 1785
},
{
"entropy": 0.37184464260935784,
"epoch": 0.6949432204212366,
"grad_norm": 1.328125,
"learning_rate": 8.972712224944808e-06,
"loss": 0.3723410367965698,
"mean_token_accuracy": 0.8796270757913589,
"num_tokens": 27345514.0,
"step": 1790
},
{
"entropy": 0.39280957020819185,
"epoch": 0.6968844026011841,
"grad_norm": 1.3671875,
"learning_rate": 8.966341440312088e-06,
"loss": 0.37746195793151854,
"mean_token_accuracy": 0.8742925137281418,
"num_tokens": 27434611.0,
"step": 1795
},
{
"entropy": 0.38702532537281514,
"epoch": 0.6988255847811317,
"grad_norm": 1.34375,
"learning_rate": 8.959953238792261e-06,
"loss": 0.4323995113372803,
"mean_token_accuracy": 0.876301246881485,
"num_tokens": 27522141.0,
"step": 1800
},
{
"epoch": 0.6988255847811317,
"eval_entropy": 0.36904463945099675,
"eval_loss": 0.37259599566459656,
"eval_mean_token_accuracy": 0.8823383979156108,
"eval_num_tokens": 27522141.0,
"eval_runtime": 60.1232,
"eval_samples_per_second": 35.743,
"eval_steps_per_second": 35.743,
"step": 1800
},
{
"entropy": 0.39694005586206915,
"epoch": 0.7007667669610793,
"grad_norm": 1.6640625,
"learning_rate": 8.953547648437016e-06,
"loss": 0.422884464263916,
"mean_token_accuracy": 0.8706113517284393,
"num_tokens": 27606238.0,
"step": 1805
},
{
"entropy": 0.35907841585576533,
"epoch": 0.7027079491410269,
"grad_norm": 1.6171875,
"learning_rate": 8.947124697374403e-06,
"loss": 0.37867820262908936,
"mean_token_accuracy": 0.8819711148738861,
"num_tokens": 27698297.0,
"step": 1810
},
{
"entropy": 0.39380453154444695,
"epoch": 0.7046491313209745,
"grad_norm": 1.1328125,
"learning_rate": 8.940684413808704e-06,
"loss": 0.41552581787109377,
"mean_token_accuracy": 0.8773353233933449,
"num_tokens": 27783292.0,
"step": 1815
},
{
"entropy": 0.3948865693062544,
"epoch": 0.706590313500922,
"grad_norm": 2.0,
"learning_rate": 8.93422682602031e-06,
"loss": 0.45133333206176757,
"mean_token_accuracy": 0.8750575929880142,
"num_tokens": 27857682.0,
"step": 1820
},
{
"entropy": 0.39443247877061366,
"epoch": 0.7085314956808697,
"grad_norm": 1.8671875,
"learning_rate": 8.927751962365603e-06,
"loss": 0.39142508506774903,
"mean_token_accuracy": 0.8749705284833909,
"num_tokens": 27933338.0,
"step": 1825
},
{
"entropy": 0.38654340282082555,
"epoch": 0.7104726778608172,
"grad_norm": 1.7890625,
"learning_rate": 8.921259851276816e-06,
"loss": 0.38780851364135743,
"mean_token_accuracy": 0.8745802566409111,
"num_tokens": 28004374.0,
"step": 1830
},
{
"entropy": 0.3340354781597853,
"epoch": 0.7124138600407648,
"grad_norm": 1.8125,
"learning_rate": 8.91475052126193e-06,
"loss": 0.34950056076049807,
"mean_token_accuracy": 0.8917890131473541,
"num_tokens": 28076071.0,
"step": 1835
},
{
"entropy": 0.36462055034935476,
"epoch": 0.7143550422207124,
"grad_norm": 1.1796875,
"learning_rate": 8.90822400090453e-06,
"loss": 0.36106727123260496,
"mean_token_accuracy": 0.879101251065731,
"num_tokens": 28167857.0,
"step": 1840
},
{
"entropy": 0.3714527040719986,
"epoch": 0.71629622440066,
"grad_norm": 1.921875,
"learning_rate": 8.90168031886369e-06,
"loss": 0.3883594274520874,
"mean_token_accuracy": 0.881637692451477,
"num_tokens": 28228771.0,
"step": 1845
},
{
"entropy": 0.39277232214808466,
"epoch": 0.7182374065806076,
"grad_norm": 1.734375,
"learning_rate": 8.895119503873841e-06,
"loss": 0.4170830726623535,
"mean_token_accuracy": 0.8729140803217887,
"num_tokens": 28299510.0,
"step": 1850
},
{
"entropy": 0.3991117935627699,
"epoch": 0.7201785887605552,
"grad_norm": 2.59375,
"learning_rate": 8.888541584744652e-06,
"loss": 0.3907686710357666,
"mean_token_accuracy": 0.8788457185029983,
"num_tokens": 28356716.0,
"step": 1855
},
{
"entropy": 0.33190413266420365,
"epoch": 0.7221197709405027,
"grad_norm": 1.3125,
"learning_rate": 8.881946590360893e-06,
"loss": 0.3549908399581909,
"mean_token_accuracy": 0.8904741749167442,
"num_tokens": 28425961.0,
"step": 1860
},
{
"entropy": 0.3761907495558262,
"epoch": 0.7240609531204504,
"grad_norm": 1.7265625,
"learning_rate": 8.875334549682322e-06,
"loss": 0.40765061378479006,
"mean_token_accuracy": 0.8756383866071701,
"num_tokens": 28492753.0,
"step": 1865
},
{
"entropy": 0.3859711352735758,
"epoch": 0.7260021353003979,
"grad_norm": 1.640625,
"learning_rate": 8.868705491743543e-06,
"loss": 0.40584306716918944,
"mean_token_accuracy": 0.8751093596220016,
"num_tokens": 28574648.0,
"step": 1870
},
{
"entropy": 0.3750033970922232,
"epoch": 0.7279433174803456,
"grad_norm": 1.375,
"learning_rate": 8.862059445653892e-06,
"loss": 0.42207088470458987,
"mean_token_accuracy": 0.8791605412960053,
"num_tokens": 28673368.0,
"step": 1875
},
{
"entropy": 0.33736986815929415,
"epoch": 0.7298844996602931,
"grad_norm": 1.984375,
"learning_rate": 8.855396440597299e-06,
"loss": 0.33533928394317625,
"mean_token_accuracy": 0.8882556319236755,
"num_tokens": 28745333.0,
"step": 1880
},
{
"entropy": 0.38949261195957663,
"epoch": 0.7318256818402407,
"grad_norm": 1.46875,
"learning_rate": 8.848716505832163e-06,
"loss": 0.39729306697845457,
"mean_token_accuracy": 0.8767626166343689,
"num_tokens": 28823783.0,
"step": 1885
},
{
"entropy": 0.373487963527441,
"epoch": 0.7337668640201883,
"grad_norm": 1.578125,
"learning_rate": 8.842019670691226e-06,
"loss": 0.3975057601928711,
"mean_token_accuracy": 0.8789292603731156,
"num_tokens": 28899576.0,
"step": 1890
},
{
"entropy": 0.3453738629817963,
"epoch": 0.7357080462001359,
"grad_norm": 2.0625,
"learning_rate": 8.835305964581442e-06,
"loss": 0.38850131034851076,
"mean_token_accuracy": 0.8864782005548477,
"num_tokens": 28979338.0,
"step": 1895
},
{
"entropy": 0.3513793833553791,
"epoch": 0.7376492283800835,
"grad_norm": 1.734375,
"learning_rate": 8.828575416983853e-06,
"loss": 0.3649607181549072,
"mean_token_accuracy": 0.8849209144711494,
"num_tokens": 29038858.0,
"step": 1900
},
{
"entropy": 0.3707513175904751,
"epoch": 0.739590410560031,
"grad_norm": 1.609375,
"learning_rate": 8.821828057453448e-06,
"loss": 0.3917756795883179,
"mean_token_accuracy": 0.8805187106132507,
"num_tokens": 29121454.0,
"step": 1905
},
{
"entropy": 0.3511063469573855,
"epoch": 0.7415315927399786,
"grad_norm": 1.9296875,
"learning_rate": 8.81506391561904e-06,
"loss": 0.3545810699462891,
"mean_token_accuracy": 0.8827590346336365,
"num_tokens": 29192959.0,
"step": 1910
},
{
"entropy": 0.3934715397655964,
"epoch": 0.7434727749199262,
"grad_norm": 1.4609375,
"learning_rate": 8.80828302118314e-06,
"loss": 0.44544425010681155,
"mean_token_accuracy": 0.873169532418251,
"num_tokens": 29275025.0,
"step": 1915
},
{
"entropy": 0.37588623352348804,
"epoch": 0.7454139570998738,
"grad_norm": 1.8046875,
"learning_rate": 8.801485403921823e-06,
"loss": 0.4109992027282715,
"mean_token_accuracy": 0.8753042757511139,
"num_tokens": 29359266.0,
"step": 1920
},
{
"entropy": 0.3526043064892292,
"epoch": 0.7473551392798214,
"grad_norm": 1.625,
"learning_rate": 8.794671093684595e-06,
"loss": 0.3500061988830566,
"mean_token_accuracy": 0.8878669127821922,
"num_tokens": 29415745.0,
"step": 1925
},
{
"entropy": 0.39229949191212654,
"epoch": 0.749296321459769,
"grad_norm": 1.453125,
"learning_rate": 8.787840120394261e-06,
"loss": 0.4506565570831299,
"mean_token_accuracy": 0.873441505432129,
"num_tokens": 29492482.0,
"step": 1930
},
{
"entropy": 0.41463610120117667,
"epoch": 0.7512375036397166,
"grad_norm": 1.5703125,
"learning_rate": 8.7809925140468e-06,
"loss": 0.4298503875732422,
"mean_token_accuracy": 0.8726509675383568,
"num_tokens": 29572784.0,
"step": 1935
},
{
"entropy": 0.44622854702174664,
"epoch": 0.7531786858196642,
"grad_norm": 1.578125,
"learning_rate": 8.774128304711232e-06,
"loss": 0.47462167739868166,
"mean_token_accuracy": 0.858974027633667,
"num_tokens": 29664399.0,
"step": 1940
},
{
"entropy": 0.36902854703366755,
"epoch": 0.7551198679996117,
"grad_norm": 1.625,
"learning_rate": 8.767247522529473e-06,
"loss": 0.38140344619750977,
"mean_token_accuracy": 0.8812808141112327,
"num_tokens": 29743761.0,
"step": 1945
},
{
"entropy": 0.3770481664687395,
"epoch": 0.7570610501795594,
"grad_norm": 1.5625,
"learning_rate": 8.760350197716228e-06,
"loss": 0.37451202869415284,
"mean_token_accuracy": 0.8845255061984062,
"num_tokens": 29805351.0,
"step": 1950
},
{
"entropy": 0.42605091743171214,
"epoch": 0.7590022323595069,
"grad_norm": 1.71875,
"learning_rate": 8.75343636055883e-06,
"loss": 0.434804630279541,
"mean_token_accuracy": 0.8679893091320992,
"num_tokens": 29876830.0,
"step": 1955
},
{
"entropy": 0.4243669960647821,
"epoch": 0.7609434145394546,
"grad_norm": 1.6953125,
"learning_rate": 8.746506041417133e-06,
"loss": 0.41442170143127444,
"mean_token_accuracy": 0.8689531117677689,
"num_tokens": 29952810.0,
"step": 1960
},
{
"entropy": 0.40345251336693766,
"epoch": 0.7628845967194021,
"grad_norm": 1.828125,
"learning_rate": 8.739559270723353e-06,
"loss": 0.3906730651855469,
"mean_token_accuracy": 0.8731215000152588,
"num_tokens": 30017592.0,
"step": 1965
},
{
"entropy": 0.40580461621284486,
"epoch": 0.7648257788993497,
"grad_norm": 1.4375,
"learning_rate": 8.732596078981957e-06,
"loss": 0.40709662437438965,
"mean_token_accuracy": 0.8757615357637405,
"num_tokens": 30091851.0,
"step": 1970
},
{
"entropy": 0.3563157990574837,
"epoch": 0.7667669610792973,
"grad_norm": 1.53125,
"learning_rate": 8.72561649676952e-06,
"loss": 0.36572675704956054,
"mean_token_accuracy": 0.8835931360721588,
"num_tokens": 30167070.0,
"step": 1975
},
{
"entropy": 0.4045840006321669,
"epoch": 0.7687081432592449,
"grad_norm": 2.1875,
"learning_rate": 8.718620554734582e-06,
"loss": 0.4593046188354492,
"mean_token_accuracy": 0.8688464492559433,
"num_tokens": 30232066.0,
"step": 1980
},
{
"entropy": 0.38409191854298114,
"epoch": 0.7706493254391925,
"grad_norm": 1.65625,
"learning_rate": 8.71160828359753e-06,
"loss": 0.40677800178527834,
"mean_token_accuracy": 0.8754825726151466,
"num_tokens": 30303663.0,
"step": 1985
},
{
"entropy": 0.36517377346754076,
"epoch": 0.77259050761914,
"grad_norm": 1.5,
"learning_rate": 8.704579714150451e-06,
"loss": 0.38115544319152833,
"mean_token_accuracy": 0.8828602716326713,
"num_tokens": 30371090.0,
"step": 1990
},
{
"entropy": 0.3954622160643339,
"epoch": 0.7745316897990876,
"grad_norm": 1.234375,
"learning_rate": 8.697534877257003e-06,
"loss": 0.4024034023284912,
"mean_token_accuracy": 0.8711701706051826,
"num_tokens": 30462563.0,
"step": 1995
},
{
"entropy": 0.3614397499710321,
"epoch": 0.7764728719790353,
"grad_norm": 1.9375,
"learning_rate": 8.690473803852277e-06,
"loss": 0.38828601837158205,
"mean_token_accuracy": 0.8840885296463966,
"num_tokens": 30537774.0,
"step": 2000
},
{
"epoch": 0.7764728719790353,
"eval_entropy": 0.37249069839105764,
"eval_loss": 0.3714829683303833,
"eval_mean_token_accuracy": 0.8825830673411481,
"eval_num_tokens": 30537774.0,
"eval_runtime": 60.1567,
"eval_samples_per_second": 35.723,
"eval_steps_per_second": 35.723,
"step": 2000
},
{
"entropy": 0.4046864528208971,
"epoch": 0.7784140541589828,
"grad_norm": 1.59375,
"learning_rate": 8.683396524942655e-06,
"loss": 0.4361577033996582,
"mean_token_accuracy": 0.8703169271349906,
"num_tokens": 30629222.0,
"step": 2005
},
{
"entropy": 0.3440706986933947,
"epoch": 0.7803552363389304,
"grad_norm": 2.046875,
"learning_rate": 8.676303071605692e-06,
"loss": 0.3639081954956055,
"mean_token_accuracy": 0.8900219470262527,
"num_tokens": 30691162.0,
"step": 2010
},
{
"entropy": 0.3819488488137722,
"epoch": 0.782296418518878,
"grad_norm": 1.9453125,
"learning_rate": 8.669193474989957e-06,
"loss": 0.3750166654586792,
"mean_token_accuracy": 0.8811448708176612,
"num_tokens": 30763162.0,
"step": 2015
},
{
"entropy": 0.41373511366546156,
"epoch": 0.7842376006988255,
"grad_norm": 1.5625,
"learning_rate": 8.66206776631491e-06,
"loss": 0.4189298152923584,
"mean_token_accuracy": 0.8678776487708092,
"num_tokens": 30851103.0,
"step": 2020
},
{
"entropy": 0.38263436295092107,
"epoch": 0.7861787828787732,
"grad_norm": 1.2265625,
"learning_rate": 8.654925976870766e-06,
"loss": 0.4248814582824707,
"mean_token_accuracy": 0.8735449090600014,
"num_tokens": 30940204.0,
"step": 2025
},
{
"entropy": 0.4140047915279865,
"epoch": 0.7881199650587207,
"grad_norm": 1.5625,
"learning_rate": 8.647768138018348e-06,
"loss": 0.41850671768188474,
"mean_token_accuracy": 0.8704892829060554,
"num_tokens": 31020160.0,
"step": 2030
},
{
"entropy": 0.35883037857711314,
"epoch": 0.7900611472386684,
"grad_norm": 1.90625,
"learning_rate": 8.640594281188958e-06,
"loss": 0.3723835229873657,
"mean_token_accuracy": 0.8855434998869895,
"num_tokens": 31099746.0,
"step": 2035
},
{
"entropy": 0.36093359626829624,
"epoch": 0.7920023294186159,
"grad_norm": 1.390625,
"learning_rate": 8.633404437884235e-06,
"loss": 0.3619117498397827,
"mean_token_accuracy": 0.8832022443413734,
"num_tokens": 31175731.0,
"step": 2040
},
{
"entropy": 0.38728207871317866,
"epoch": 0.7939435115985636,
"grad_norm": 1.6953125,
"learning_rate": 8.626198639676014e-06,
"loss": 0.38774235248565675,
"mean_token_accuracy": 0.8787285834550858,
"num_tokens": 31258521.0,
"step": 2045
},
{
"entropy": 0.36480732820928097,
"epoch": 0.7958846937785111,
"grad_norm": 1.71875,
"learning_rate": 8.618976918206196e-06,
"loss": 0.3832773447036743,
"mean_token_accuracy": 0.8871650651097298,
"num_tokens": 31332899.0,
"step": 2050
},
{
"entropy": 0.37207553833723067,
"epoch": 0.7978258759584587,
"grad_norm": 1.609375,
"learning_rate": 8.611739305186602e-06,
"loss": 0.4212314605712891,
"mean_token_accuracy": 0.877054350078106,
"num_tokens": 31409631.0,
"step": 2055
},
{
"entropy": 0.4014189802110195,
"epoch": 0.7997670581384063,
"grad_norm": 1.5390625,
"learning_rate": 8.604485832398833e-06,
"loss": 0.4188095569610596,
"mean_token_accuracy": 0.8725872606039047,
"num_tokens": 31486529.0,
"step": 2060
},
{
"entropy": 0.3518418502062559,
"epoch": 0.8017082403183539,
"grad_norm": 1.4609375,
"learning_rate": 8.597216531694136e-06,
"loss": 0.3803000211715698,
"mean_token_accuracy": 0.8850871086120605,
"num_tokens": 31557811.0,
"step": 2065
},
{
"entropy": 0.38242518045008184,
"epoch": 0.8036494224983015,
"grad_norm": 1.890625,
"learning_rate": 8.589931434993262e-06,
"loss": 0.4062291145324707,
"mean_token_accuracy": 0.8756525501608848,
"num_tokens": 31627170.0,
"step": 2070
},
{
"entropy": 0.3889755714684725,
"epoch": 0.8055906046782491,
"grad_norm": 1.6875,
"learning_rate": 8.58263057428632e-06,
"loss": 0.3767040014266968,
"mean_token_accuracy": 0.8771411761641502,
"num_tokens": 31696937.0,
"step": 2075
},
{
"entropy": 0.3841622915118933,
"epoch": 0.8075317868581966,
"grad_norm": 2.03125,
"learning_rate": 8.575313981632645e-06,
"loss": 0.4042715549468994,
"mean_token_accuracy": 0.877042506635189,
"num_tokens": 31769548.0,
"step": 2080
},
{
"entropy": 0.3571667678654194,
"epoch": 0.8094729690381443,
"grad_norm": 1.4453125,
"learning_rate": 8.567981689160654e-06,
"loss": 0.3828322172164917,
"mean_token_accuracy": 0.8810154914855957,
"num_tokens": 31843626.0,
"step": 2085
},
{
"entropy": 0.4046927910298109,
"epoch": 0.8114141512180918,
"grad_norm": 1.8046875,
"learning_rate": 8.560633729067705e-06,
"loss": 0.4062997341156006,
"mean_token_accuracy": 0.8745594829320907,
"num_tokens": 31926157.0,
"step": 2090
},
{
"entropy": 0.3526596352458,
"epoch": 0.8133553333980394,
"grad_norm": 1.953125,
"learning_rate": 8.55327013361995e-06,
"loss": 0.3879395484924316,
"mean_token_accuracy": 0.8853856906294822,
"num_tokens": 31985096.0,
"step": 2095
},
{
"entropy": 0.37672988660633566,
"epoch": 0.815296515577987,
"grad_norm": 1.5859375,
"learning_rate": 8.545890935152204e-06,
"loss": 0.3643826961517334,
"mean_token_accuracy": 0.8801594600081444,
"num_tokens": 32071262.0,
"step": 2100
},
{
"entropy": 0.3610169466584921,
"epoch": 0.8172376977579345,
"grad_norm": 1.5546875,
"learning_rate": 8.538496166067798e-06,
"loss": 0.37534480094909667,
"mean_token_accuracy": 0.8826367557048798,
"num_tokens": 32154499.0,
"step": 2105
},
{
"entropy": 0.3441557249054313,
"epoch": 0.8191788799378822,
"grad_norm": 1.765625,
"learning_rate": 8.531085858838434e-06,
"loss": 0.34778728485107424,
"mean_token_accuracy": 0.8875968590378761,
"num_tokens": 32218854.0,
"step": 2110
},
{
"entropy": 0.361727100238204,
"epoch": 0.8211200621178297,
"grad_norm": 1.5078125,
"learning_rate": 8.523660046004043e-06,
"loss": 0.36833460330963136,
"mean_token_accuracy": 0.8847725585103035,
"num_tokens": 32290653.0,
"step": 2115
},
{
"entropy": 0.37507129870355127,
"epoch": 0.8230612442977774,
"grad_norm": 1.8828125,
"learning_rate": 8.516218760172647e-06,
"loss": 0.4152214050292969,
"mean_token_accuracy": 0.8757175728678703,
"num_tokens": 32369260.0,
"step": 2120
},
{
"entropy": 0.38906187675893306,
"epoch": 0.8250024264777249,
"grad_norm": 1.390625,
"learning_rate": 8.508762034020211e-06,
"loss": 0.40627117156982423,
"mean_token_accuracy": 0.8743843853473663,
"num_tokens": 32461339.0,
"step": 2125
},
{
"entropy": 0.3820859346538782,
"epoch": 0.8269436086576725,
"grad_norm": 1.8359375,
"learning_rate": 8.501289900290499e-06,
"loss": 0.3897759437561035,
"mean_token_accuracy": 0.8774882882833481,
"num_tokens": 32541252.0,
"step": 2130
},
{
"entropy": 0.43145383819937705,
"epoch": 0.8288847908376201,
"grad_norm": 1.78125,
"learning_rate": 8.49380239179494e-06,
"loss": 0.4624598503112793,
"mean_token_accuracy": 0.8640148594975472,
"num_tokens": 32626380.0,
"step": 2135
},
{
"entropy": 0.38095561824738977,
"epoch": 0.8308259730175677,
"grad_norm": 1.546875,
"learning_rate": 8.486299541412466e-06,
"loss": 0.4128393650054932,
"mean_token_accuracy": 0.8786048114299774,
"num_tokens": 32702475.0,
"step": 2140
},
{
"entropy": 0.38069754019379615,
"epoch": 0.8327671551975153,
"grad_norm": 1.6953125,
"learning_rate": 8.478781382089387e-06,
"loss": 0.41826744079589845,
"mean_token_accuracy": 0.8762264057993889,
"num_tokens": 32798281.0,
"step": 2145
},
{
"entropy": 0.4370048839598894,
"epoch": 0.8347083373774629,
"grad_norm": 1.8046875,
"learning_rate": 8.471247946839229e-06,
"loss": 0.4501640796661377,
"mean_token_accuracy": 0.8669643774628639,
"num_tokens": 32865902.0,
"step": 2150
},
{
"entropy": 0.35586942471563815,
"epoch": 0.8366495195574105,
"grad_norm": 1.4609375,
"learning_rate": 8.463699268742604e-06,
"loss": 0.3725292444229126,
"mean_token_accuracy": 0.8853255197405815,
"num_tokens": 32936532.0,
"step": 2155
},
{
"entropy": 0.3503182210028172,
"epoch": 0.8385907017373581,
"grad_norm": 1.9921875,
"learning_rate": 8.456135380947055e-06,
"loss": 0.3832036733627319,
"mean_token_accuracy": 0.8870538592338562,
"num_tokens": 33000960.0,
"step": 2160
},
{
"entropy": 0.41169595904648304,
"epoch": 0.8405318839173056,
"grad_norm": 1.3671875,
"learning_rate": 8.448556316666912e-06,
"loss": 0.4174903392791748,
"mean_token_accuracy": 0.8725608646869659,
"num_tokens": 33086991.0,
"step": 2165
},
{
"entropy": 0.3502325866371393,
"epoch": 0.8424730660972533,
"grad_norm": 1.6484375,
"learning_rate": 8.44096210918315e-06,
"loss": 0.356764554977417,
"mean_token_accuracy": 0.8890736445784568,
"num_tokens": 33149859.0,
"step": 2170
},
{
"entropy": 0.4082874767482281,
"epoch": 0.8444142482772008,
"grad_norm": 1.5859375,
"learning_rate": 8.43335279184324e-06,
"loss": 0.41769680976867674,
"mean_token_accuracy": 0.8735889151692391,
"num_tokens": 33235890.0,
"step": 2175
},
{
"entropy": 0.3691970378160477,
"epoch": 0.8463554304571484,
"grad_norm": 1.5,
"learning_rate": 8.425728398061002e-06,
"loss": 0.4044227600097656,
"mean_token_accuracy": 0.8845529943704605,
"num_tokens": 33293736.0,
"step": 2180
},
{
"entropy": 0.3638597309589386,
"epoch": 0.848296612637096,
"grad_norm": 1.7890625,
"learning_rate": 8.418088961316459e-06,
"loss": 0.3457561254501343,
"mean_token_accuracy": 0.8865492403507232,
"num_tokens": 33355455.0,
"step": 2185
},
{
"entropy": 0.39855882450938224,
"epoch": 0.8502377948170435,
"grad_norm": 1.4921875,
"learning_rate": 8.410434515155694e-06,
"loss": 0.40858187675476076,
"mean_token_accuracy": 0.8782258868217468,
"num_tokens": 33428022.0,
"step": 2190
},
{
"entropy": 0.345845440402627,
"epoch": 0.8521789769969912,
"grad_norm": 1.6328125,
"learning_rate": 8.402765093190693e-06,
"loss": 0.35137181282043456,
"mean_token_accuracy": 0.8922701835632324,
"num_tokens": 33495081.0,
"step": 2195
},
{
"entropy": 0.3340891394764185,
"epoch": 0.8541201591769387,
"grad_norm": 1.3671875,
"learning_rate": 8.395080729099206e-06,
"loss": 0.3650421380996704,
"mean_token_accuracy": 0.8865584105253219,
"num_tokens": 33588202.0,
"step": 2200
},
{
"epoch": 0.8541201591769387,
"eval_entropy": 0.36483841773214426,
"eval_loss": 0.3705015778541565,
"eval_mean_token_accuracy": 0.8827846525241297,
"eval_num_tokens": 33588202.0,
"eval_runtime": 60.1598,
"eval_samples_per_second": 35.722,
"eval_steps_per_second": 35.722,
"step": 2200
},
{
"entropy": 0.3953329209238291,
"epoch": 0.8560613413568864,
"grad_norm": 1.65625,
"learning_rate": 8.3873814566246e-06,
"loss": 0.4315669536590576,
"mean_token_accuracy": 0.8746445998549461,
"num_tokens": 33668354.0,
"step": 2205
},
{
"entropy": 0.37631992548704146,
"epoch": 0.8580025235368339,
"grad_norm": 1.6015625,
"learning_rate": 8.379667309575699e-06,
"loss": 0.41765918731689455,
"mean_token_accuracy": 0.8796603456139565,
"num_tokens": 33733737.0,
"step": 2210
},
{
"entropy": 0.3429785013198853,
"epoch": 0.8599437057167815,
"grad_norm": 1.609375,
"learning_rate": 8.371938321826654e-06,
"loss": 0.35924372673034666,
"mean_token_accuracy": 0.8863778650760651,
"num_tokens": 33813616.0,
"step": 2215
},
{
"entropy": 0.36748342849314214,
"epoch": 0.8618848878967291,
"grad_norm": 1.1953125,
"learning_rate": 8.364194527316776e-06,
"loss": 0.38543248176574707,
"mean_token_accuracy": 0.8795094177126884,
"num_tokens": 33893625.0,
"step": 2220
},
{
"entropy": 0.3699775494635105,
"epoch": 0.8638260700766767,
"grad_norm": 1.3203125,
"learning_rate": 8.356435960050398e-06,
"loss": 0.3805511474609375,
"mean_token_accuracy": 0.8810431718826294,
"num_tokens": 33969465.0,
"step": 2225
},
{
"entropy": 0.39027220420539377,
"epoch": 0.8657672522566243,
"grad_norm": 1.765625,
"learning_rate": 8.348662654096724e-06,
"loss": 0.3937405586242676,
"mean_token_accuracy": 0.875699220597744,
"num_tokens": 34037321.0,
"step": 2230
},
{
"entropy": 0.380586925894022,
"epoch": 0.8677084344365719,
"grad_norm": 1.96875,
"learning_rate": 8.340874643589676e-06,
"loss": 0.39784080982208253,
"mean_token_accuracy": 0.8761513873934745,
"num_tokens": 34115202.0,
"step": 2235
},
{
"entropy": 0.38007759377360345,
"epoch": 0.8696496166165194,
"grad_norm": 1.3359375,
"learning_rate": 8.333071962727745e-06,
"loss": 0.3872611284255981,
"mean_token_accuracy": 0.8754698395729065,
"num_tokens": 34202914.0,
"step": 2240
},
{
"entropy": 0.36750674396753313,
"epoch": 0.8715907987964671,
"grad_norm": 1.4765625,
"learning_rate": 8.325254645773849e-06,
"loss": 0.36534600257873534,
"mean_token_accuracy": 0.8795874208211899,
"num_tokens": 34276806.0,
"step": 2245
},
{
"entropy": 0.3650043081492186,
"epoch": 0.8735319809764146,
"grad_norm": 1.59375,
"learning_rate": 8.317422727055165e-06,
"loss": 0.3911173105239868,
"mean_token_accuracy": 0.8810791179537774,
"num_tokens": 34349979.0,
"step": 2250
},
{
"entropy": 0.4252156797796488,
"epoch": 0.8754731631563623,
"grad_norm": 1.421875,
"learning_rate": 8.309576240962998e-06,
"loss": 0.3878526449203491,
"mean_token_accuracy": 0.870864699780941,
"num_tokens": 34431756.0,
"step": 2255
},
{
"entropy": 0.3478477492928505,
"epoch": 0.8774143453363098,
"grad_norm": 1.703125,
"learning_rate": 8.301715221952615e-06,
"loss": 0.3578909635543823,
"mean_token_accuracy": 0.8878472730517387,
"num_tokens": 34507200.0,
"step": 2260
},
{
"entropy": 0.3685021881014109,
"epoch": 0.8793555275162575,
"grad_norm": 1.59375,
"learning_rate": 8.293839704543103e-06,
"loss": 0.39955284595489504,
"mean_token_accuracy": 0.880965618789196,
"num_tokens": 34586189.0,
"step": 2265
},
{
"entropy": 0.35574909709393976,
"epoch": 0.881296709696205,
"grad_norm": 1.859375,
"learning_rate": 8.285949723317214e-06,
"loss": 0.38354690074920655,
"mean_token_accuracy": 0.8842941373586655,
"num_tokens": 34664914.0,
"step": 2270
},
{
"entropy": 0.3691468223929405,
"epoch": 0.8832378918761525,
"grad_norm": 1.3984375,
"learning_rate": 8.27804531292121e-06,
"loss": 0.3860490322113037,
"mean_token_accuracy": 0.8787114471197128,
"num_tokens": 34748974.0,
"step": 2275
},
{
"entropy": 0.3915288481861353,
"epoch": 0.8851790740561002,
"grad_norm": 2.015625,
"learning_rate": 8.270126508064717e-06,
"loss": 0.4229584217071533,
"mean_token_accuracy": 0.8710038289427757,
"num_tokens": 34823550.0,
"step": 2280
},
{
"entropy": 0.4291458610445261,
"epoch": 0.8871202562360477,
"grad_norm": 1.3359375,
"learning_rate": 8.262193343520567e-06,
"loss": 0.43143463134765625,
"mean_token_accuracy": 0.8686526745557785,
"num_tokens": 34917555.0,
"step": 2285
},
{
"entropy": 0.37965994626283645,
"epoch": 0.8890614384159954,
"grad_norm": 1.5078125,
"learning_rate": 8.254245854124652e-06,
"loss": 0.3806295394897461,
"mean_token_accuracy": 0.881553427875042,
"num_tokens": 34990801.0,
"step": 2290
},
{
"entropy": 0.4057528983801603,
"epoch": 0.8910026205959429,
"grad_norm": 1.875,
"learning_rate": 8.246284074775763e-06,
"loss": 0.41382293701171874,
"mean_token_accuracy": 0.8720936447381973,
"num_tokens": 35074294.0,
"step": 2295
},
{
"entropy": 0.36701224111020564,
"epoch": 0.8929438027758905,
"grad_norm": 1.53125,
"learning_rate": 8.23830804043544e-06,
"loss": 0.39014787673950196,
"mean_token_accuracy": 0.8813544929027557,
"num_tokens": 35142936.0,
"step": 2300
},
{
"entropy": 0.36863389015197756,
"epoch": 0.8948849849558381,
"grad_norm": 1.921875,
"learning_rate": 8.230317786127822e-06,
"loss": 0.4085258483886719,
"mean_token_accuracy": 0.8811587393283844,
"num_tokens": 35228369.0,
"step": 2305
},
{
"entropy": 0.3572548136115074,
"epoch": 0.8968261671357857,
"grad_norm": 1.6640625,
"learning_rate": 8.22231334693949e-06,
"loss": 0.40120840072631836,
"mean_token_accuracy": 0.8830894485116005,
"num_tokens": 35306656.0,
"step": 2310
},
{
"entropy": 0.3663245867937803,
"epoch": 0.8987673493157333,
"grad_norm": 1.65625,
"learning_rate": 8.21429475801931e-06,
"loss": 0.3976627826690674,
"mean_token_accuracy": 0.8808062687516213,
"num_tokens": 35383007.0,
"step": 2315
},
{
"entropy": 0.40247388668358325,
"epoch": 0.9007085314956809,
"grad_norm": 1.609375,
"learning_rate": 8.20626205457829e-06,
"loss": 0.4049511909484863,
"mean_token_accuracy": 0.8711845085024834,
"num_tokens": 35453462.0,
"step": 2320
},
{
"entropy": 0.3913619853556156,
"epoch": 0.9026497136756284,
"grad_norm": 1.8203125,
"learning_rate": 8.198215271889405e-06,
"loss": 0.3979458808898926,
"mean_token_accuracy": 0.879303203523159,
"num_tokens": 35524043.0,
"step": 2325
},
{
"entropy": 0.37322167456150057,
"epoch": 0.9045908958555761,
"grad_norm": 1.5703125,
"learning_rate": 8.190154445287466e-06,
"loss": 0.41640191078186034,
"mean_token_accuracy": 0.8771696910262108,
"num_tokens": 35609328.0,
"step": 2330
},
{
"entropy": 0.39078370332717893,
"epoch": 0.9065320780355236,
"grad_norm": 1.421875,
"learning_rate": 8.182079610168945e-06,
"loss": 0.37838523387908934,
"mean_token_accuracy": 0.8762111157178879,
"num_tokens": 35680622.0,
"step": 2335
},
{
"entropy": 0.39326913058757784,
"epoch": 0.9084732602154713,
"grad_norm": 1.8203125,
"learning_rate": 8.173990801991834e-06,
"loss": 0.38826932907104494,
"mean_token_accuracy": 0.8793436914682389,
"num_tokens": 35744201.0,
"step": 2340
},
{
"entropy": 0.38700769394636153,
"epoch": 0.9104144423954188,
"grad_norm": 1.3125,
"learning_rate": 8.165888056275478e-06,
"loss": 0.4147165298461914,
"mean_token_accuracy": 0.8736557975411415,
"num_tokens": 35822206.0,
"step": 2345
},
{
"entropy": 0.3916376482695341,
"epoch": 0.9123556245753665,
"grad_norm": 1.4375,
"learning_rate": 8.157771408600427e-06,
"loss": 0.40491595268249514,
"mean_token_accuracy": 0.8760656327009201,
"num_tokens": 35898583.0,
"step": 2350
},
{
"entropy": 0.37616121433675287,
"epoch": 0.914296806755314,
"grad_norm": 1.859375,
"learning_rate": 8.149640894608277e-06,
"loss": 0.39853510856628416,
"mean_token_accuracy": 0.8797665163874626,
"num_tokens": 35962197.0,
"step": 2355
},
{
"entropy": 0.3872214786708355,
"epoch": 0.9162379889352615,
"grad_norm": 1.6953125,
"learning_rate": 8.141496550001512e-06,
"loss": 0.4320131778717041,
"mean_token_accuracy": 0.8749197080731392,
"num_tokens": 36048634.0,
"step": 2360
},
{
"entropy": 0.3654011983424425,
"epoch": 0.9181791711152092,
"grad_norm": 1.8125,
"learning_rate": 8.13333841054335e-06,
"loss": 0.4127936363220215,
"mean_token_accuracy": 0.8794704377651215,
"num_tokens": 36114078.0,
"step": 2365
},
{
"entropy": 0.4052185159176588,
"epoch": 0.9201203532951567,
"grad_norm": 1.671875,
"learning_rate": 8.125166512057583e-06,
"loss": 0.4502895355224609,
"mean_token_accuracy": 0.8730918914079666,
"num_tokens": 36185468.0,
"step": 2370
},
{
"entropy": 0.3805313348770142,
"epoch": 0.9220615354751044,
"grad_norm": 1.71875,
"learning_rate": 8.116980890428421e-06,
"loss": 0.4319314956665039,
"mean_token_accuracy": 0.8795101106166839,
"num_tokens": 36262273.0,
"step": 2375
},
{
"entropy": 0.379262937605381,
"epoch": 0.9240027176550519,
"grad_norm": 1.828125,
"learning_rate": 8.108781581600337e-06,
"loss": 0.3972128391265869,
"mean_token_accuracy": 0.8769020855426788,
"num_tokens": 36339772.0,
"step": 2380
},
{
"entropy": 0.3275274306535721,
"epoch": 0.9259438998349995,
"grad_norm": 1.8828125,
"learning_rate": 8.100568621577907e-06,
"loss": 0.349655294418335,
"mean_token_accuracy": 0.8939395412802696,
"num_tokens": 36405190.0,
"step": 2385
},
{
"entropy": 0.37873089760541917,
"epoch": 0.9278850820149471,
"grad_norm": 1.65625,
"learning_rate": 8.092342046425647e-06,
"loss": 0.41008243560791013,
"mean_token_accuracy": 0.8806984931230545,
"num_tokens": 36471551.0,
"step": 2390
},
{
"entropy": 0.42233099043369293,
"epoch": 0.9298262641948947,
"grad_norm": 1.5,
"learning_rate": 8.084101892267866e-06,
"loss": 0.43898987770080566,
"mean_token_accuracy": 0.8671094790101052,
"num_tokens": 36545141.0,
"step": 2395
},
{
"entropy": 0.377986478433013,
"epoch": 0.9317674463748423,
"grad_norm": 1.6015625,
"learning_rate": 8.075848195288495e-06,
"loss": 0.4050844669342041,
"mean_token_accuracy": 0.8788811087608337,
"num_tokens": 36621766.0,
"step": 2400
},
{
"epoch": 0.9317674463748423,
"eval_entropy": 0.3660563061428991,
"eval_loss": 0.3696966767311096,
"eval_mean_token_accuracy": 0.8829624101609727,
"eval_num_tokens": 36621766.0,
"eval_runtime": 60.0474,
"eval_samples_per_second": 35.788,
"eval_steps_per_second": 35.788,
"step": 2400
},
{
"entropy": 0.41329049319028854,
"epoch": 0.9337086285547899,
"grad_norm": 1.3203125,
"learning_rate": 8.06758099173094e-06,
"loss": 0.3864266872406006,
"mean_token_accuracy": 0.8733444228768349,
"num_tokens": 36692402.0,
"step": 2405
},
{
"entropy": 0.35147353522479535,
"epoch": 0.9356498107347374,
"grad_norm": 1.84375,
"learning_rate": 8.059300317897907e-06,
"loss": 0.3865788698196411,
"mean_token_accuracy": 0.8899516001343727,
"num_tokens": 36762687.0,
"step": 2410
},
{
"entropy": 0.35971076525747775,
"epoch": 0.9375909929146851,
"grad_norm": 2.015625,
"learning_rate": 8.051006210151264e-06,
"loss": 0.38848717212677003,
"mean_token_accuracy": 0.8857970297336578,
"num_tokens": 36829409.0,
"step": 2415
},
{
"entropy": 0.36762615144252775,
"epoch": 0.9395321750946326,
"grad_norm": 1.7578125,
"learning_rate": 8.04269870491186e-06,
"loss": 0.38152570724487306,
"mean_token_accuracy": 0.8803352236747741,
"num_tokens": 36920865.0,
"step": 2420
},
{
"entropy": 0.37836971804499625,
"epoch": 0.9414733572745803,
"grad_norm": 1.6875,
"learning_rate": 8.03437783865938e-06,
"loss": 0.3982245683670044,
"mean_token_accuracy": 0.8782578885555268,
"num_tokens": 36993918.0,
"step": 2425
},
{
"entropy": 0.40916073732078073,
"epoch": 0.9434145394545278,
"grad_norm": 1.3203125,
"learning_rate": 8.02604364793218e-06,
"loss": 0.40181870460510255,
"mean_token_accuracy": 0.8741835564374923,
"num_tokens": 37067466.0,
"step": 2430
},
{
"entropy": 0.38971280567348004,
"epoch": 0.9453557216344753,
"grad_norm": 1.4375,
"learning_rate": 8.017696169327121e-06,
"loss": 0.3853023052215576,
"mean_token_accuracy": 0.8737384587526321,
"num_tokens": 37158825.0,
"step": 2435
},
{
"entropy": 0.36188525408506395,
"epoch": 0.947296903814423,
"grad_norm": 1.8046875,
"learning_rate": 8.009335439499418e-06,
"loss": 0.39717903137207033,
"mean_token_accuracy": 0.8839860737323761,
"num_tokens": 37231768.0,
"step": 2440
},
{
"entropy": 0.3343341175466776,
"epoch": 0.9492380859943705,
"grad_norm": 1.5078125,
"learning_rate": 8.000961495162474e-06,
"loss": 0.34873759746551514,
"mean_token_accuracy": 0.8942202180624008,
"num_tokens": 37295787.0,
"step": 2445
},
{
"entropy": 0.37651418149471283,
"epoch": 0.9511792681743182,
"grad_norm": 2.0625,
"learning_rate": 7.992574373087717e-06,
"loss": 0.3985455989837646,
"mean_token_accuracy": 0.8804239287972451,
"num_tokens": 37365031.0,
"step": 2450
},
{
"entropy": 0.3872853074222803,
"epoch": 0.9531204503542657,
"grad_norm": 1.859375,
"learning_rate": 7.984174110104442e-06,
"loss": 0.3960126876831055,
"mean_token_accuracy": 0.8773596182465553,
"num_tokens": 37440723.0,
"step": 2455
},
{
"entropy": 0.36501435153186323,
"epoch": 0.9550616325342134,
"grad_norm": 1.546875,
"learning_rate": 7.975760743099648e-06,
"loss": 0.3613110065460205,
"mean_token_accuracy": 0.8814436718821526,
"num_tokens": 37517552.0,
"step": 2460
},
{
"entropy": 0.4213182792067528,
"epoch": 0.9570028147141609,
"grad_norm": 2.125,
"learning_rate": 7.967334309017876e-06,
"loss": 0.42275075912475585,
"mean_token_accuracy": 0.8684807687997818,
"num_tokens": 37576304.0,
"step": 2465
},
{
"entropy": 0.36460405923426153,
"epoch": 0.9589439968941085,
"grad_norm": 1.4453125,
"learning_rate": 7.958894844861044e-06,
"loss": 0.4192854881286621,
"mean_token_accuracy": 0.882645896077156,
"num_tokens": 37649463.0,
"step": 2470
},
{
"entropy": 0.3699316095560789,
"epoch": 0.9608851790740561,
"grad_norm": 1.53125,
"learning_rate": 7.950442387688295e-06,
"loss": 0.39672675132751467,
"mean_token_accuracy": 0.8789965286850929,
"num_tokens": 37727011.0,
"step": 2475
},
{
"entropy": 0.3613630454987288,
"epoch": 0.9628263612540037,
"grad_norm": 1.65625,
"learning_rate": 7.941976974615817e-06,
"loss": 0.35828289985656736,
"mean_token_accuracy": 0.8861474558711052,
"num_tokens": 37799274.0,
"step": 2480
},
{
"entropy": 0.4028384655714035,
"epoch": 0.9647675434339513,
"grad_norm": 1.453125,
"learning_rate": 7.933498642816698e-06,
"loss": 0.39244048595428466,
"mean_token_accuracy": 0.8733719438314438,
"num_tokens": 37872790.0,
"step": 2485
},
{
"entropy": 0.37262568436563015,
"epoch": 0.9667087256138989,
"grad_norm": 1.328125,
"learning_rate": 7.925007429520745e-06,
"loss": 0.3869138240814209,
"mean_token_accuracy": 0.8780170202255249,
"num_tokens": 37949478.0,
"step": 2490
},
{
"entropy": 0.3532130911946297,
"epoch": 0.9686499077938464,
"grad_norm": 1.65625,
"learning_rate": 7.916503372014339e-06,
"loss": 0.3645073175430298,
"mean_token_accuracy": 0.8856014132499694,
"num_tokens": 38010035.0,
"step": 2495
},
{
"entropy": 0.4101907879114151,
"epoch": 0.9705910899737941,
"grad_norm": 1.4453125,
"learning_rate": 7.90798650764026e-06,
"loss": 0.43153948783874513,
"mean_token_accuracy": 0.868617196381092,
"num_tokens": 38091317.0,
"step": 2500
},
{
"entropy": 0.37458378039300444,
"epoch": 0.9725322721537416,
"grad_norm": 1.578125,
"learning_rate": 7.899456873797519e-06,
"loss": 0.4130906105041504,
"mean_token_accuracy": 0.8811309933662415,
"num_tokens": 38156010.0,
"step": 2505
},
{
"entropy": 0.33361660987138747,
"epoch": 0.9744734543336893,
"grad_norm": 1.5078125,
"learning_rate": 7.890914507941209e-06,
"loss": 0.3599473714828491,
"mean_token_accuracy": 0.891946268081665,
"num_tokens": 38227058.0,
"step": 2510
},
{
"entropy": 0.36410921774804594,
"epoch": 0.9764146365136368,
"grad_norm": 1.71875,
"learning_rate": 7.882359447582323e-06,
"loss": 0.36246566772460936,
"mean_token_accuracy": 0.8795957028865814,
"num_tokens": 38308578.0,
"step": 2515
},
{
"entropy": 0.40007474571466445,
"epoch": 0.9783558186935843,
"grad_norm": 1.7734375,
"learning_rate": 7.873791730287607e-06,
"loss": 0.416595458984375,
"mean_token_accuracy": 0.8698130205273629,
"num_tokens": 38393316.0,
"step": 2520
},
{
"entropy": 0.37237811721861364,
"epoch": 0.980297000873532,
"grad_norm": 1.65625,
"learning_rate": 7.865211393679374e-06,
"loss": 0.3867233991622925,
"mean_token_accuracy": 0.8821268856525422,
"num_tokens": 38469288.0,
"step": 2525
},
{
"entropy": 0.3845432631671429,
"epoch": 0.9822381830534795,
"grad_norm": 1.3515625,
"learning_rate": 7.856618475435361e-06,
"loss": 0.3905576944351196,
"mean_token_accuracy": 0.8774156749248505,
"num_tokens": 38543267.0,
"step": 2530
},
{
"entropy": 0.40176920518279075,
"epoch": 0.9841793652334272,
"grad_norm": 1.546875,
"learning_rate": 7.848013013288548e-06,
"loss": 0.41007471084594727,
"mean_token_accuracy": 0.8690975129604339,
"num_tokens": 38626344.0,
"step": 2535
},
{
"entropy": 0.38942315727472304,
"epoch": 0.9861205474133747,
"grad_norm": 1.6796875,
"learning_rate": 7.839395045027e-06,
"loss": 0.40326895713806155,
"mean_token_accuracy": 0.8756881758570672,
"num_tokens": 38701064.0,
"step": 2540
},
{
"entropy": 0.3606115547940135,
"epoch": 0.9880617295933223,
"grad_norm": 1.59375,
"learning_rate": 7.830764608493697e-06,
"loss": 0.36026384830474856,
"mean_token_accuracy": 0.8839934259653092,
"num_tokens": 38779059.0,
"step": 2545
},
{
"entropy": 0.39264477528631686,
"epoch": 0.9900029117732699,
"grad_norm": 1.3203125,
"learning_rate": 7.822121741586368e-06,
"loss": 0.4251199245452881,
"mean_token_accuracy": 0.8718041434884072,
"num_tokens": 38880681.0,
"step": 2550
},
{
"entropy": 0.40420532748103144,
"epoch": 0.9919440939532175,
"grad_norm": 1.7109375,
"learning_rate": 7.813466482257327e-06,
"loss": 0.4312422752380371,
"mean_token_accuracy": 0.8763286352157593,
"num_tokens": 38941161.0,
"step": 2555
},
{
"entropy": 0.3369973488152027,
"epoch": 0.9938852761331651,
"grad_norm": 1.40625,
"learning_rate": 7.804798868513306e-06,
"loss": 0.35411407947540285,
"mean_token_accuracy": 0.8899437338113785,
"num_tokens": 39018011.0,
"step": 2560
},
{
"entropy": 0.39636878967285155,
"epoch": 0.9958264583131127,
"grad_norm": 1.59375,
"learning_rate": 7.796118938415289e-06,
"loss": 0.407199764251709,
"mean_token_accuracy": 0.8719804942607879,
"num_tokens": 39101097.0,
"step": 2565
},
{
"entropy": 0.3830322280526161,
"epoch": 0.9977676404930603,
"grad_norm": 1.8046875,
"learning_rate": 7.78742673007834e-06,
"loss": 0.38955183029174806,
"mean_token_accuracy": 0.8754953861236572,
"num_tokens": 39180075.0,
"step": 2570
},
{
"entropy": 0.37996360957622527,
"epoch": 0.9997088226730079,
"grad_norm": 1.8046875,
"learning_rate": 7.77872228167144e-06,
"loss": 0.4175414085388184,
"mean_token_accuracy": 0.8776975840330123,
"num_tokens": 39256747.0,
"step": 2575
},
{
"entropy": 0.3622729911615974,
"epoch": 1.001552945743958,
"grad_norm": 1.5,
"learning_rate": 7.770005631417316e-06,
"loss": 0.3494336366653442,
"mean_token_accuracy": 0.8844640803964514,
"num_tokens": 39331281.0,
"step": 2580
},
{
"entropy": 0.3717500135302544,
"epoch": 1.0034941279239056,
"grad_norm": 1.7890625,
"learning_rate": 7.761276817592283e-06,
"loss": 0.38556852340698244,
"mean_token_accuracy": 0.8811951488256454,
"num_tokens": 39403180.0,
"step": 2585
},
{
"entropy": 0.3536002866923809,
"epoch": 1.0054353101038533,
"grad_norm": 1.5078125,
"learning_rate": 7.752535878526057e-06,
"loss": 0.3865217208862305,
"mean_token_accuracy": 0.8846583724021911,
"num_tokens": 39482653.0,
"step": 2590
},
{
"entropy": 0.36924818605184556,
"epoch": 1.0073764922838009,
"grad_norm": 1.359375,
"learning_rate": 7.743782852601609e-06,
"loss": 0.3744253873825073,
"mean_token_accuracy": 0.8835659891366958,
"num_tokens": 39571122.0,
"step": 2595
},
{
"entropy": 0.3826246250420809,
"epoch": 1.0093176744637484,
"grad_norm": 1.3125,
"learning_rate": 7.735017778254976e-06,
"loss": 0.3962560176849365,
"mean_token_accuracy": 0.8752669557929039,
"num_tokens": 39643625.0,
"step": 2600
},
{
"epoch": 1.0093176744637484,
"eval_entropy": 0.3590102044439915,
"eval_loss": 0.3693583905696869,
"eval_mean_token_accuracy": 0.8832231578354283,
"eval_num_tokens": 39643625.0,
"eval_runtime": 60.1285,
"eval_samples_per_second": 35.74,
"eval_steps_per_second": 35.74,
"step": 2600
},
{
"entropy": 0.3363046307116747,
"epoch": 1.011258856643696,
"grad_norm": 1.5859375,
"learning_rate": 7.726240693975112e-06,
"loss": 0.3622615814208984,
"mean_token_accuracy": 0.890743799507618,
"num_tokens": 39727345.0,
"step": 2605
},
{
"entropy": 0.39671580009162427,
"epoch": 1.0132000388236435,
"grad_norm": 1.640625,
"learning_rate": 7.7174516383037e-06,
"loss": 0.42294821739196775,
"mean_token_accuracy": 0.8709942042827606,
"num_tokens": 39809382.0,
"step": 2610
},
{
"entropy": 0.38348409347236156,
"epoch": 1.0151412210035913,
"grad_norm": 1.8046875,
"learning_rate": 7.70865064983499e-06,
"loss": 0.40362987518310545,
"mean_token_accuracy": 0.8796380490064621,
"num_tokens": 39870908.0,
"step": 2615
},
{
"entropy": 0.35954158157110216,
"epoch": 1.0170824031835388,
"grad_norm": 1.5859375,
"learning_rate": 7.699837767215642e-06,
"loss": 0.391841459274292,
"mean_token_accuracy": 0.8825426653027535,
"num_tokens": 39946284.0,
"step": 2620
},
{
"entropy": 0.3796717070043087,
"epoch": 1.0190235853634864,
"grad_norm": 1.8203125,
"learning_rate": 7.691013029144535e-06,
"loss": 0.4171717643737793,
"mean_token_accuracy": 0.8788355842232705,
"num_tokens": 40017489.0,
"step": 2625
},
{
"entropy": 0.35411719866096975,
"epoch": 1.020964767543434,
"grad_norm": 1.265625,
"learning_rate": 7.682176474372613e-06,
"loss": 0.36679236888885497,
"mean_token_accuracy": 0.8839921057224274,
"num_tokens": 40091956.0,
"step": 2630
},
{
"entropy": 0.4047669190913439,
"epoch": 1.0229059497233814,
"grad_norm": 1.78125,
"learning_rate": 7.673328141702708e-06,
"loss": 0.42531418800354004,
"mean_token_accuracy": 0.8716289401054382,
"num_tokens": 40174273.0,
"step": 2635
},
{
"entropy": 0.3872047744691372,
"epoch": 1.0248471319033292,
"grad_norm": 1.453125,
"learning_rate": 7.664468069989363e-06,
"loss": 0.39284777641296387,
"mean_token_accuracy": 0.8739194989204406,
"num_tokens": 40259401.0,
"step": 2640
},
{
"entropy": 0.39128339402377604,
"epoch": 1.0267883140832768,
"grad_norm": 1.8515625,
"learning_rate": 7.655596298138683e-06,
"loss": 0.3992388963699341,
"mean_token_accuracy": 0.8772461920976639,
"num_tokens": 40326313.0,
"step": 2645
},
{
"entropy": 0.36866898983716967,
"epoch": 1.0287294962632243,
"grad_norm": 1.65625,
"learning_rate": 7.646712865108143e-06,
"loss": 0.376071572303772,
"mean_token_accuracy": 0.885202445089817,
"num_tokens": 40402194.0,
"step": 2650
},
{
"entropy": 0.3736342485994101,
"epoch": 1.0306706784431718,
"grad_norm": 1.59375,
"learning_rate": 7.637817809906422e-06,
"loss": 0.38311469554901123,
"mean_token_accuracy": 0.8761677891016006,
"num_tokens": 40482708.0,
"step": 2655
},
{
"entropy": 0.36633356250822546,
"epoch": 1.0326118606231194,
"grad_norm": 1.8671875,
"learning_rate": 7.628911171593236e-06,
"loss": 0.39987525939941404,
"mean_token_accuracy": 0.8770500838756561,
"num_tokens": 40550320.0,
"step": 2660
},
{
"entropy": 0.3814588252454996,
"epoch": 1.0345530428030671,
"grad_norm": 1.8203125,
"learning_rate": 7.6199929892791666e-06,
"loss": 0.42825708389282224,
"mean_token_accuracy": 0.8766391202807426,
"num_tokens": 40621121.0,
"step": 2665
},
{
"entropy": 0.3263087157160044,
"epoch": 1.0364942249830147,
"grad_norm": 1.8046875,
"learning_rate": 7.611063302125485e-06,
"loss": 0.3370352745056152,
"mean_token_accuracy": 0.892676542699337,
"num_tokens": 40675017.0,
"step": 2670
},
{
"entropy": 0.35688314363360407,
"epoch": 1.0384354071629622,
"grad_norm": 1.59375,
"learning_rate": 7.602122149343982e-06,
"loss": 0.37260828018188474,
"mean_token_accuracy": 0.8857112199068069,
"num_tokens": 40735182.0,
"step": 2675
},
{
"entropy": 0.3657310428097844,
"epoch": 1.0403765893429098,
"grad_norm": 1.4296875,
"learning_rate": 7.593169570196798e-06,
"loss": 0.38663344383239745,
"mean_token_accuracy": 0.8788129478693009,
"num_tokens": 40812276.0,
"step": 2680
},
{
"entropy": 0.39327623806893824,
"epoch": 1.0423177715228573,
"grad_norm": 2.078125,
"learning_rate": 7.5842056039962465e-06,
"loss": 0.40496459007263186,
"mean_token_accuracy": 0.8742495253682137,
"num_tokens": 40883935.0,
"step": 2685
},
{
"entropy": 0.37283147126436234,
"epoch": 1.044258953702805,
"grad_norm": 1.5859375,
"learning_rate": 7.575230290104643e-06,
"loss": 0.38010687828063966,
"mean_token_accuracy": 0.8804807871580124,
"num_tokens": 40965277.0,
"step": 2690
},
{
"entropy": 0.33802379108965397,
"epoch": 1.0462001358827526,
"grad_norm": 1.5078125,
"learning_rate": 7.566243667934132e-06,
"loss": 0.34528648853302,
"mean_token_accuracy": 0.8925616145133972,
"num_tokens": 41036874.0,
"step": 2695
},
{
"entropy": 0.34530715458095074,
"epoch": 1.0481413180627002,
"grad_norm": 1.4375,
"learning_rate": 7.557245776946522e-06,
"loss": 0.3618237257003784,
"mean_token_accuracy": 0.8869366824626923,
"num_tokens": 41123295.0,
"step": 2700
},
{
"entropy": 0.3728719219565392,
"epoch": 1.0500825002426477,
"grad_norm": 1.53125,
"learning_rate": 7.548236656653095e-06,
"loss": 0.3764779567718506,
"mean_token_accuracy": 0.8791951701045037,
"num_tokens": 41206755.0,
"step": 2705
},
{
"entropy": 0.36930376179516317,
"epoch": 1.0520236824225955,
"grad_norm": 1.3671875,
"learning_rate": 7.539216346614448e-06,
"loss": 0.3768768310546875,
"mean_token_accuracy": 0.8802413672208786,
"num_tokens": 41295129.0,
"step": 2710
},
{
"entropy": 0.3499265480786562,
"epoch": 1.053964864602543,
"grad_norm": 2.046875,
"learning_rate": 7.530184886440312e-06,
"loss": 0.3675286293029785,
"mean_token_accuracy": 0.889797542989254,
"num_tokens": 41374363.0,
"step": 2715
},
{
"entropy": 0.3654515855014324,
"epoch": 1.0559060467824906,
"grad_norm": 1.53125,
"learning_rate": 7.521142315789382e-06,
"loss": 0.3843737840652466,
"mean_token_accuracy": 0.8779830664396286,
"num_tokens": 41452026.0,
"step": 2720
},
{
"entropy": 0.36771729625761507,
"epoch": 1.057847228962438,
"grad_norm": 2.03125,
"learning_rate": 7.512088674369143e-06,
"loss": 0.3673874378204346,
"mean_token_accuracy": 0.8848532065749168,
"num_tokens": 41516536.0,
"step": 2725
},
{
"entropy": 0.4460260573774576,
"epoch": 1.0597884111423856,
"grad_norm": 1.546875,
"learning_rate": 7.503024001935686e-06,
"loss": 0.45882291793823243,
"mean_token_accuracy": 0.8644292831420899,
"num_tokens": 41595307.0,
"step": 2730
},
{
"entropy": 0.34162113182246684,
"epoch": 1.0617295933223332,
"grad_norm": 1.375,
"learning_rate": 7.493948338293549e-06,
"loss": 0.35989553928375245,
"mean_token_accuracy": 0.8857067421078682,
"num_tokens": 41675093.0,
"step": 2735
},
{
"entropy": 0.34600385688245294,
"epoch": 1.063670775502281,
"grad_norm": 1.453125,
"learning_rate": 7.4848617232955275e-06,
"loss": 0.36208953857421877,
"mean_token_accuracy": 0.8861552521586418,
"num_tokens": 41751969.0,
"step": 2740
},
{
"entropy": 0.34471417032182217,
"epoch": 1.0656119576822285,
"grad_norm": 1.453125,
"learning_rate": 7.475764196842516e-06,
"loss": 0.3590202331542969,
"mean_token_accuracy": 0.8894360795617103,
"num_tokens": 41826333.0,
"step": 2745
},
{
"entropy": 0.35084208101034164,
"epoch": 1.067553139862176,
"grad_norm": 2.15625,
"learning_rate": 7.466655798883313e-06,
"loss": 0.3687446117401123,
"mean_token_accuracy": 0.8872736170887947,
"num_tokens": 41908430.0,
"step": 2750
},
{
"entropy": 0.3609664674848318,
"epoch": 1.0694943220421236,
"grad_norm": 2.171875,
"learning_rate": 7.457536569414459e-06,
"loss": 0.3871330738067627,
"mean_token_accuracy": 0.8852574542164803,
"num_tokens": 41985011.0,
"step": 2755
},
{
"entropy": 0.34639161452651024,
"epoch": 1.0714355042220713,
"grad_norm": 1.4375,
"learning_rate": 7.448406548480063e-06,
"loss": 0.3695810794830322,
"mean_token_accuracy": 0.8897538051009178,
"num_tokens": 42048911.0,
"step": 2760
},
{
"entropy": 0.32954322583973406,
"epoch": 1.0733766864020189,
"grad_norm": 1.5,
"learning_rate": 7.439265776171611e-06,
"loss": 0.3176077365875244,
"mean_token_accuracy": 0.8935502767562866,
"num_tokens": 42120003.0,
"step": 2765
},
{
"entropy": 0.37422714903950693,
"epoch": 1.0753178685819664,
"grad_norm": 1.59375,
"learning_rate": 7.430114292627808e-06,
"loss": 0.350958251953125,
"mean_token_accuracy": 0.8825857222080231,
"num_tokens": 42189503.0,
"step": 2770
},
{
"entropy": 0.35138509757816794,
"epoch": 1.077259050761914,
"grad_norm": 1.9921875,
"learning_rate": 7.420952138034392e-06,
"loss": 0.3909478187561035,
"mean_token_accuracy": 0.8854555234313011,
"num_tokens": 42251724.0,
"step": 2775
},
{
"entropy": 0.3574231918901205,
"epoch": 1.0792002329418615,
"grad_norm": 1.3125,
"learning_rate": 7.411779352623958e-06,
"loss": 0.36853466033935545,
"mean_token_accuracy": 0.8846323460340499,
"num_tokens": 42328165.0,
"step": 2780
},
{
"entropy": 0.3312282390892506,
"epoch": 1.0811414151218093,
"grad_norm": 1.265625,
"learning_rate": 7.402595976675785e-06,
"loss": 0.34425904750823977,
"mean_token_accuracy": 0.8917144045233727,
"num_tokens": 42416101.0,
"step": 2785
},
{
"entropy": 0.38315938860177995,
"epoch": 1.0830825973017568,
"grad_norm": 1.4375,
"learning_rate": 7.393402050515652e-06,
"loss": 0.41192307472229006,
"mean_token_accuracy": 0.8757176354527474,
"num_tokens": 42490192.0,
"step": 2790
},
{
"entropy": 0.3734312802553177,
"epoch": 1.0850237794817044,
"grad_norm": 1.6484375,
"learning_rate": 7.384197614515672e-06,
"loss": 0.3864989519119263,
"mean_token_accuracy": 0.8778089836239815,
"num_tokens": 42579078.0,
"step": 2795
},
{
"entropy": 0.3602697692811489,
"epoch": 1.086964961661652,
"grad_norm": 1.515625,
"learning_rate": 7.3749827090941074e-06,
"loss": 0.4144554615020752,
"mean_token_accuracy": 0.8825942382216454,
"num_tokens": 42656738.0,
"step": 2800
},
{
"epoch": 1.086964961661652,
"eval_entropy": 0.3603161519775561,
"eval_loss": 0.3686440587043762,
"eval_mean_token_accuracy": 0.8834110738205987,
"eval_num_tokens": 42656738.0,
"eval_runtime": 60.0795,
"eval_samples_per_second": 35.769,
"eval_steps_per_second": 35.769,
"step": 2800
},
{
"entropy": 0.37382765375077726,
"epoch": 1.0889061438415994,
"grad_norm": 1.8046875,
"learning_rate": 7.365757374715188e-06,
"loss": 0.4022432804107666,
"mean_token_accuracy": 0.878273893892765,
"num_tokens": 42727874.0,
"step": 2805
},
{
"entropy": 0.3878149565309286,
"epoch": 1.0908473260215472,
"grad_norm": 1.453125,
"learning_rate": 7.356521651888946e-06,
"loss": 0.4143357276916504,
"mean_token_accuracy": 0.8759766072034836,
"num_tokens": 42805952.0,
"step": 2810
},
{
"entropy": 0.3720662288367748,
"epoch": 1.0927885082014948,
"grad_norm": 1.765625,
"learning_rate": 7.347275581171027e-06,
"loss": 0.3936682939529419,
"mean_token_accuracy": 0.8800241187214851,
"num_tokens": 42879670.0,
"step": 2815
},
{
"entropy": 0.3727020751684904,
"epoch": 1.0947296903814423,
"grad_norm": 1.53125,
"learning_rate": 7.338019203162516e-06,
"loss": 0.40426788330078123,
"mean_token_accuracy": 0.8778004497289658,
"num_tokens": 42966404.0,
"step": 2820
},
{
"entropy": 0.37748123742640016,
"epoch": 1.0966708725613898,
"grad_norm": 1.6875,
"learning_rate": 7.3287525585097615e-06,
"loss": 0.3956634044647217,
"mean_token_accuracy": 0.877564987540245,
"num_tokens": 43043314.0,
"step": 2825
},
{
"entropy": 0.3898195032030344,
"epoch": 1.0986120547413374,
"grad_norm": 2.390625,
"learning_rate": 7.319475687904193e-06,
"loss": 0.39679808616638185,
"mean_token_accuracy": 0.8783272713422775,
"num_tokens": 43108948.0,
"step": 2830
},
{
"entropy": 0.3532901670783758,
"epoch": 1.1005532369212851,
"grad_norm": 1.5859375,
"learning_rate": 7.310188632082145e-06,
"loss": 0.3547484874725342,
"mean_token_accuracy": 0.8868882149457932,
"num_tokens": 43182120.0,
"step": 2835
},
{
"entropy": 0.38809507302939894,
"epoch": 1.1024944191012327,
"grad_norm": 2.046875,
"learning_rate": 7.300891431824673e-06,
"loss": 0.4074056148529053,
"mean_token_accuracy": 0.8722813636064529,
"num_tokens": 43263374.0,
"step": 2840
},
{
"entropy": 0.35386649817228316,
"epoch": 1.1044356012811802,
"grad_norm": 1.265625,
"learning_rate": 7.291584127957384e-06,
"loss": 0.3566242456436157,
"mean_token_accuracy": 0.8883859798312187,
"num_tokens": 43334896.0,
"step": 2845
},
{
"entropy": 0.37594650611281394,
"epoch": 1.1063767834611278,
"grad_norm": 1.6875,
"learning_rate": 7.282266761350249e-06,
"loss": 0.3671935319900513,
"mean_token_accuracy": 0.8861946225166321,
"num_tokens": 43395764.0,
"step": 2850
},
{
"entropy": 0.37138679772615435,
"epoch": 1.1083179656410753,
"grad_norm": 1.3359375,
"learning_rate": 7.272939372917427e-06,
"loss": 0.3758493185043335,
"mean_token_accuracy": 0.8805965319275856,
"num_tokens": 43483273.0,
"step": 2855
},
{
"entropy": 0.3592002343386412,
"epoch": 1.110259147821023,
"grad_norm": 1.609375,
"learning_rate": 7.263602003617083e-06,
"loss": 0.36438665390014646,
"mean_token_accuracy": 0.8856978788971901,
"num_tokens": 43554609.0,
"step": 2860
},
{
"entropy": 0.3399000741541386,
"epoch": 1.1122003300009706,
"grad_norm": 1.5625,
"learning_rate": 7.2542546944512106e-06,
"loss": 0.3749422550201416,
"mean_token_accuracy": 0.8887553334236145,
"num_tokens": 43626500.0,
"step": 2865
},
{
"entropy": 0.40647769123315813,
"epoch": 1.1141415121809182,
"grad_norm": 1.9453125,
"learning_rate": 7.244897486465451e-06,
"loss": 0.43062515258789064,
"mean_token_accuracy": 0.8718539297580719,
"num_tokens": 43696284.0,
"step": 2870
},
{
"entropy": 0.3513967592269182,
"epoch": 1.1160826943608657,
"grad_norm": 1.3984375,
"learning_rate": 7.2355304207489154e-06,
"loss": 0.35802536010742186,
"mean_token_accuracy": 0.88879015147686,
"num_tokens": 43768064.0,
"step": 2875
},
{
"entropy": 0.3477201282978058,
"epoch": 1.1180238765408133,
"grad_norm": 1.828125,
"learning_rate": 7.226153538433996e-06,
"loss": 0.37060644626617434,
"mean_token_accuracy": 0.8868695870041847,
"num_tokens": 43841997.0,
"step": 2880
},
{
"entropy": 0.3926592905074358,
"epoch": 1.119965058720761,
"grad_norm": 1.84375,
"learning_rate": 7.216766880696199e-06,
"loss": 0.4085033893585205,
"mean_token_accuracy": 0.8761053428053855,
"num_tokens": 43920068.0,
"step": 2885
},
{
"entropy": 0.34527620263397696,
"epoch": 1.1219062409007086,
"grad_norm": 1.640625,
"learning_rate": 7.207370488753949e-06,
"loss": 0.35770795345306394,
"mean_token_accuracy": 0.8901533395051956,
"num_tokens": 43989892.0,
"step": 2890
},
{
"entropy": 0.3740640126168728,
"epoch": 1.123847423080656,
"grad_norm": 1.375,
"learning_rate": 7.197964403868421e-06,
"loss": 0.39128780364990234,
"mean_token_accuracy": 0.8786321595311165,
"num_tokens": 44067873.0,
"step": 2895
},
{
"entropy": 0.3401097748428583,
"epoch": 1.1257886052606036,
"grad_norm": 1.6875,
"learning_rate": 7.188548667343347e-06,
"loss": 0.357515287399292,
"mean_token_accuracy": 0.8893807768821717,
"num_tokens": 44142522.0,
"step": 2900
},
{
"entropy": 0.39471787922084334,
"epoch": 1.1277297874405514,
"grad_norm": 1.6328125,
"learning_rate": 7.179123320524848e-06,
"loss": 0.3968302488327026,
"mean_token_accuracy": 0.8748754128813744,
"num_tokens": 44228212.0,
"step": 2905
},
{
"entropy": 0.4300340283662081,
"epoch": 1.129670969620499,
"grad_norm": 1.5546875,
"learning_rate": 7.169688404801241e-06,
"loss": 0.4560871124267578,
"mean_token_accuracy": 0.8694811254739762,
"num_tokens": 44294114.0,
"step": 2910
},
{
"entropy": 0.3595341399312019,
"epoch": 1.1316121518004465,
"grad_norm": 2.078125,
"learning_rate": 7.160243961602863e-06,
"loss": 0.3778635025024414,
"mean_token_accuracy": 0.8835319861769676,
"num_tokens": 44355840.0,
"step": 2915
},
{
"entropy": 0.42457632496953013,
"epoch": 1.133553333980394,
"grad_norm": 2.375,
"learning_rate": 7.150790032401887e-06,
"loss": 0.4247872829437256,
"mean_token_accuracy": 0.8706662476062774,
"num_tokens": 44426127.0,
"step": 2920
},
{
"entropy": 0.4208326905965805,
"epoch": 1.1354945161603416,
"grad_norm": 1.5703125,
"learning_rate": 7.1413266587121434e-06,
"loss": 0.42088823318481444,
"mean_token_accuracy": 0.871395905315876,
"num_tokens": 44497833.0,
"step": 2925
},
{
"entropy": 0.356390430778265,
"epoch": 1.1374356983402891,
"grad_norm": 1.4765625,
"learning_rate": 7.13185388208893e-06,
"loss": 0.37389678955078126,
"mean_token_accuracy": 0.8866265177726745,
"num_tokens": 44582547.0,
"step": 2930
},
{
"entropy": 0.3736116912215948,
"epoch": 1.139376880520237,
"grad_norm": 1.828125,
"learning_rate": 7.122371744128839e-06,
"loss": 0.3963154792785645,
"mean_token_accuracy": 0.8842655003070832,
"num_tokens": 44655812.0,
"step": 2935
},
{
"entropy": 0.43113922215998174,
"epoch": 1.1413180627001844,
"grad_norm": 1.6328125,
"learning_rate": 7.112880286469568e-06,
"loss": 0.42786569595336915,
"mean_token_accuracy": 0.8678357198834419,
"num_tokens": 44730175.0,
"step": 2940
},
{
"entropy": 0.34794704206287863,
"epoch": 1.143259244880132,
"grad_norm": 1.4453125,
"learning_rate": 7.103379550789741e-06,
"loss": 0.35416512489318847,
"mean_token_accuracy": 0.8896363779902459,
"num_tokens": 44794755.0,
"step": 2945
},
{
"entropy": 0.40246716812253,
"epoch": 1.1452004270600795,
"grad_norm": 1.5234375,
"learning_rate": 7.093869578808719e-06,
"loss": 0.41913704872131347,
"mean_token_accuracy": 0.868728120625019,
"num_tokens": 44866536.0,
"step": 2950
},
{
"entropy": 0.3885481279343367,
"epoch": 1.1471416092400273,
"grad_norm": 1.8671875,
"learning_rate": 7.084350412286424e-06,
"loss": 0.40956454277038573,
"mean_token_accuracy": 0.8750770896673202,
"num_tokens": 44942021.0,
"step": 2955
},
{
"entropy": 0.3983582962304354,
"epoch": 1.1490827914199748,
"grad_norm": 1.578125,
"learning_rate": 7.074822093023154e-06,
"loss": 0.4057170391082764,
"mean_token_accuracy": 0.8758019611239434,
"num_tokens": 45016428.0,
"step": 2960
},
{
"entropy": 0.3897486738860607,
"epoch": 1.1510239735999224,
"grad_norm": 1.921875,
"learning_rate": 7.065284662859395e-06,
"loss": 0.4297188282012939,
"mean_token_accuracy": 0.8763071224093437,
"num_tokens": 45082753.0,
"step": 2965
},
{
"entropy": 0.3621146373450756,
"epoch": 1.15296515577987,
"grad_norm": 1.6796875,
"learning_rate": 7.055738163675645e-06,
"loss": 0.35830867290496826,
"mean_token_accuracy": 0.8859908595681191,
"num_tokens": 45147776.0,
"step": 2970
},
{
"entropy": 0.40969758927822114,
"epoch": 1.1549063379598175,
"grad_norm": 1.5234375,
"learning_rate": 7.046182637392221e-06,
"loss": 0.3900305271148682,
"mean_token_accuracy": 0.8708938717842102,
"num_tokens": 45223891.0,
"step": 2975
},
{
"entropy": 0.37424799539148806,
"epoch": 1.156847520139765,
"grad_norm": 1.828125,
"learning_rate": 7.036618125969081e-06,
"loss": 0.3869047164916992,
"mean_token_accuracy": 0.8798339098691941,
"num_tokens": 45294331.0,
"step": 2980
},
{
"entropy": 0.38800337798893453,
"epoch": 1.1587887023197128,
"grad_norm": 1.90625,
"learning_rate": 7.027044671405643e-06,
"loss": 0.3859901428222656,
"mean_token_accuracy": 0.8810177177190781,
"num_tokens": 45348349.0,
"step": 2985
},
{
"entropy": 0.3485205162316561,
"epoch": 1.1607298844996603,
"grad_norm": 1.4296875,
"learning_rate": 7.017462315740586e-06,
"loss": 0.3649015188217163,
"mean_token_accuracy": 0.8860207587480545,
"num_tokens": 45425964.0,
"step": 2990
},
{
"entropy": 0.37424386143684385,
"epoch": 1.1626710666796078,
"grad_norm": 1.6484375,
"learning_rate": 7.007871101051686e-06,
"loss": 0.3772335767745972,
"mean_token_accuracy": 0.8799161404371262,
"num_tokens": 45500293.0,
"step": 2995
},
{
"entropy": 0.38352062441408635,
"epoch": 1.1646122488595554,
"grad_norm": 1.9296875,
"learning_rate": 6.998271069455612e-06,
"loss": 0.40156922340393064,
"mean_token_accuracy": 0.8821331828832626,
"num_tokens": 45571727.0,
"step": 3000
},
{
"epoch": 1.1646122488595554,
"eval_entropy": 0.3600764439036925,
"eval_loss": 0.36817678809165955,
"eval_mean_token_accuracy": 0.883426463426463,
"eval_num_tokens": 45571727.0,
"eval_runtime": 60.1453,
"eval_samples_per_second": 35.73,
"eval_steps_per_second": 35.73,
"step": 3000
},
{
"entropy": 0.4300002858042717,
"epoch": 1.1665534310395032,
"grad_norm": 1.4296875,
"learning_rate": 6.988662263107755e-06,
"loss": 0.4532319068908691,
"mean_token_accuracy": 0.8672842562198639,
"num_tokens": 45648648.0,
"step": 3005
},
{
"entropy": 0.3812822367995977,
"epoch": 1.1684946132194507,
"grad_norm": 1.640625,
"learning_rate": 6.979044724202034e-06,
"loss": 0.39993724822998045,
"mean_token_accuracy": 0.8782149285078049,
"num_tokens": 45743015.0,
"step": 3010
},
{
"entropy": 0.41226282604038716,
"epoch": 1.1704357953993982,
"grad_norm": 1.5234375,
"learning_rate": 6.969418494970717e-06,
"loss": 0.4353823661804199,
"mean_token_accuracy": 0.8674470081925392,
"num_tokens": 45826008.0,
"step": 3015
},
{
"entropy": 0.37322292029857634,
"epoch": 1.1723769775793458,
"grad_norm": 1.7265625,
"learning_rate": 6.9597836176842315e-06,
"loss": 0.4075223445892334,
"mean_token_accuracy": 0.8766345664858818,
"num_tokens": 45907989.0,
"step": 3020
},
{
"entropy": 0.3779715023934841,
"epoch": 1.1743181597592933,
"grad_norm": 1.609375,
"learning_rate": 6.9501401346509786e-06,
"loss": 0.4066688060760498,
"mean_token_accuracy": 0.8800197467207909,
"num_tokens": 45976593.0,
"step": 3025
},
{
"entropy": 0.364545364305377,
"epoch": 1.176259341939241,
"grad_norm": 1.453125,
"learning_rate": 6.940488088217152e-06,
"loss": 0.37837910652160645,
"mean_token_accuracy": 0.8811485067009925,
"num_tokens": 46067425.0,
"step": 3030
},
{
"entropy": 0.34584682770073416,
"epoch": 1.1782005241191886,
"grad_norm": 1.609375,
"learning_rate": 6.930827520766544e-06,
"loss": 0.3524082899093628,
"mean_token_accuracy": 0.8913029715418815,
"num_tokens": 46141435.0,
"step": 3035
},
{
"entropy": 0.38420435078442094,
"epoch": 1.1801417062991362,
"grad_norm": 1.7890625,
"learning_rate": 6.921158474720368e-06,
"loss": 0.3806861400604248,
"mean_token_accuracy": 0.8749532103538513,
"num_tokens": 46222095.0,
"step": 3040
},
{
"entropy": 0.376201831176877,
"epoch": 1.1820828884790837,
"grad_norm": 1.5625,
"learning_rate": 6.911480992537072e-06,
"loss": 0.4178003311157227,
"mean_token_accuracy": 0.8752377212047577,
"num_tokens": 46312000.0,
"step": 3045
},
{
"entropy": 0.38689825385808946,
"epoch": 1.1840240706590313,
"grad_norm": 2.09375,
"learning_rate": 6.901795116712136e-06,
"loss": 0.40619282722473143,
"mean_token_accuracy": 0.8773537456989289,
"num_tokens": 46381015.0,
"step": 3050
},
{
"entropy": 0.39098729118704795,
"epoch": 1.185965252838979,
"grad_norm": 1.546875,
"learning_rate": 6.892100889777913e-06,
"loss": 0.42108306884765623,
"mean_token_accuracy": 0.8786390334367752,
"num_tokens": 46464894.0,
"step": 3055
},
{
"entropy": 0.3601615995168686,
"epoch": 1.1879064350189266,
"grad_norm": 1.5546875,
"learning_rate": 6.882398354303416e-06,
"loss": 0.3870659351348877,
"mean_token_accuracy": 0.8846402570605278,
"num_tokens": 46545575.0,
"step": 3060
},
{
"entropy": 0.3909476988017559,
"epoch": 1.189847617198874,
"grad_norm": 2.125,
"learning_rate": 6.872687552894145e-06,
"loss": 0.3942322969436646,
"mean_token_accuracy": 0.8762227043509483,
"num_tokens": 46620397.0,
"step": 3065
},
{
"entropy": 0.36160071194171906,
"epoch": 1.1917887993788217,
"grad_norm": 1.484375,
"learning_rate": 6.8629685281919025e-06,
"loss": 0.35771043300628663,
"mean_token_accuracy": 0.8830681905150414,
"num_tokens": 46695823.0,
"step": 3070
},
{
"entropy": 0.40609239749610426,
"epoch": 1.1937299815587692,
"grad_norm": 1.4453125,
"learning_rate": 6.853241322874593e-06,
"loss": 0.40566306114196776,
"mean_token_accuracy": 0.8745755672454834,
"num_tokens": 46763659.0,
"step": 3075
},
{
"entropy": 0.39826103691011666,
"epoch": 1.195671163738717,
"grad_norm": 1.8046875,
"learning_rate": 6.843505979656049e-06,
"loss": 0.42182149887084963,
"mean_token_accuracy": 0.878571617603302,
"num_tokens": 46827063.0,
"step": 3080
},
{
"entropy": 0.3527070388197899,
"epoch": 1.1976123459186645,
"grad_norm": 1.7421875,
"learning_rate": 6.8337625412858364e-06,
"loss": 0.3918677806854248,
"mean_token_accuracy": 0.8872169196605683,
"num_tokens": 46902371.0,
"step": 3085
},
{
"entropy": 0.360038623213768,
"epoch": 1.199553528098612,
"grad_norm": 1.4140625,
"learning_rate": 6.824011050549067e-06,
"loss": 0.36493072509765623,
"mean_token_accuracy": 0.8841731250286102,
"num_tokens": 46982779.0,
"step": 3090
},
{
"entropy": 0.372321966663003,
"epoch": 1.2014947102785596,
"grad_norm": 1.1796875,
"learning_rate": 6.814251550266216e-06,
"loss": 0.39631216526031493,
"mean_token_accuracy": 0.8795213535428047,
"num_tokens": 47072025.0,
"step": 3095
},
{
"entropy": 0.44918127730488777,
"epoch": 1.2034358924585074,
"grad_norm": 1.5,
"learning_rate": 6.8044840832929216e-06,
"loss": 0.4901744365692139,
"mean_token_accuracy": 0.8591332510113716,
"num_tokens": 47134711.0,
"step": 3100
},
{
"entropy": 0.36903586611151695,
"epoch": 1.205377074638455,
"grad_norm": 1.734375,
"learning_rate": 6.794708692519815e-06,
"loss": 0.36009137630462645,
"mean_token_accuracy": 0.8829508319497108,
"num_tokens": 47211803.0,
"step": 3105
},
{
"entropy": 0.4135138522833586,
"epoch": 1.2073182568184024,
"grad_norm": 1.5703125,
"learning_rate": 6.784925420872315e-06,
"loss": 0.4357631683349609,
"mean_token_accuracy": 0.8690931290388108,
"num_tokens": 47289477.0,
"step": 3110
},
{
"entropy": 0.3864825196564198,
"epoch": 1.20925943899835,
"grad_norm": 1.5078125,
"learning_rate": 6.775134311310449e-06,
"loss": 0.3875833034515381,
"mean_token_accuracy": 0.8817471221089364,
"num_tokens": 47361495.0,
"step": 3115
},
{
"entropy": 0.3780834227800369,
"epoch": 1.2112006211782975,
"grad_norm": 2.578125,
"learning_rate": 6.765335406828664e-06,
"loss": 0.4267258167266846,
"mean_token_accuracy": 0.8809913843870163,
"num_tokens": 47423556.0,
"step": 3120
},
{
"entropy": 0.36974840685725213,
"epoch": 1.213141803358245,
"grad_norm": 1.75,
"learning_rate": 6.755528750455634e-06,
"loss": 0.36568589210510255,
"mean_token_accuracy": 0.8837969750165939,
"num_tokens": 47502054.0,
"step": 3125
},
{
"entropy": 0.40455227382481096,
"epoch": 1.2150829855381928,
"grad_norm": 1.9765625,
"learning_rate": 6.745714385254072e-06,
"loss": 0.4230593204498291,
"mean_token_accuracy": 0.8713468372821808,
"num_tokens": 47576658.0,
"step": 3130
},
{
"entropy": 0.3805558536201715,
"epoch": 1.2170241677181404,
"grad_norm": 1.75,
"learning_rate": 6.735892354320544e-06,
"loss": 0.38716301918029783,
"mean_token_accuracy": 0.8806325614452362,
"num_tokens": 47646232.0,
"step": 3135
},
{
"entropy": 0.36968096643686293,
"epoch": 1.218965349898088,
"grad_norm": 1.3125,
"learning_rate": 6.726062700785273e-06,
"loss": 0.39945073127746583,
"mean_token_accuracy": 0.8774180024862289,
"num_tokens": 47741132.0,
"step": 3140
},
{
"entropy": 0.35996747594326733,
"epoch": 1.2209065320780355,
"grad_norm": 1.8046875,
"learning_rate": 6.716225467811961e-06,
"loss": 0.37158637046813964,
"mean_token_accuracy": 0.8840801179409027,
"num_tokens": 47812661.0,
"step": 3145
},
{
"entropy": 0.37774690724909304,
"epoch": 1.2228477142579832,
"grad_norm": 1.796875,
"learning_rate": 6.706380698597588e-06,
"loss": 0.3942166805267334,
"mean_token_accuracy": 0.8794585153460502,
"num_tokens": 47883300.0,
"step": 3150
},
{
"entropy": 0.4131374925374985,
"epoch": 1.2247888964379308,
"grad_norm": 1.6875,
"learning_rate": 6.696528436372229e-06,
"loss": 0.4139698505401611,
"mean_token_accuracy": 0.869027565419674,
"num_tokens": 47979410.0,
"step": 3155
},
{
"entropy": 0.3713895071297884,
"epoch": 1.2267300786178783,
"grad_norm": 2.390625,
"learning_rate": 6.68666872439886e-06,
"loss": 0.3731879472732544,
"mean_token_accuracy": 0.880556121468544,
"num_tokens": 48058170.0,
"step": 3160
},
{
"entropy": 0.3848850384354591,
"epoch": 1.2286712607978258,
"grad_norm": 1.8125,
"learning_rate": 6.67680160597317e-06,
"loss": 0.40471110343933103,
"mean_token_accuracy": 0.8750801667571068,
"num_tokens": 48127200.0,
"step": 3165
},
{
"entropy": 0.3463645543903112,
"epoch": 1.2306124429777734,
"grad_norm": 1.6171875,
"learning_rate": 6.666927124423374e-06,
"loss": 0.3593963623046875,
"mean_token_accuracy": 0.8887131616473198,
"num_tokens": 48190947.0,
"step": 3170
},
{
"entropy": 0.3704676777124405,
"epoch": 1.232553625157721,
"grad_norm": 1.7265625,
"learning_rate": 6.657045323110017e-06,
"loss": 0.3847299337387085,
"mean_token_accuracy": 0.8824770480394364,
"num_tokens": 48268615.0,
"step": 3175
},
{
"entropy": 0.3637780986726284,
"epoch": 1.2344948073376687,
"grad_norm": 1.59375,
"learning_rate": 6.647156245425789e-06,
"loss": 0.3874013423919678,
"mean_token_accuracy": 0.8841297894716262,
"num_tokens": 48336348.0,
"step": 3180
},
{
"entropy": 0.35034383423626425,
"epoch": 1.2364359895176162,
"grad_norm": 1.4765625,
"learning_rate": 6.637259934795328e-06,
"loss": 0.34986927509307864,
"mean_token_accuracy": 0.8901937618851662,
"num_tokens": 48406226.0,
"step": 3185
},
{
"entropy": 0.403315170109272,
"epoch": 1.2383771716975638,
"grad_norm": 1.2421875,
"learning_rate": 6.627356434675035e-06,
"loss": 0.4066962718963623,
"mean_token_accuracy": 0.8722446888685227,
"num_tokens": 48490073.0,
"step": 3190
},
{
"entropy": 0.36379750072956085,
"epoch": 1.2403183538775113,
"grad_norm": 1.3984375,
"learning_rate": 6.6174457885528855e-06,
"loss": 0.3708995819091797,
"mean_token_accuracy": 0.8828730553388595,
"num_tokens": 48561708.0,
"step": 3195
},
{
"entropy": 0.35466758720576763,
"epoch": 1.242259536057459,
"grad_norm": 1.859375,
"learning_rate": 6.607528039948226e-06,
"loss": 0.36141531467437743,
"mean_token_accuracy": 0.8850849062204361,
"num_tokens": 48629826.0,
"step": 3200
},
{
"epoch": 1.242259536057459,
"eval_entropy": 0.3607052234181308,
"eval_loss": 0.3680853247642517,
"eval_mean_token_accuracy": 0.8834105204598413,
"eval_num_tokens": 48629826.0,
"eval_runtime": 60.0565,
"eval_samples_per_second": 35.783,
"eval_steps_per_second": 35.783,
"step": 3200
},
{
"entropy": 0.33488245457410815,
"epoch": 1.2442007182374066,
"grad_norm": 1.609375,
"learning_rate": 6.597603232411597e-06,
"loss": 0.40059671401977537,
"mean_token_accuracy": 0.8868094369769096,
"num_tokens": 48705277.0,
"step": 3205
},
{
"entropy": 0.36259912960231305,
"epoch": 1.2461419004173542,
"grad_norm": 1.7265625,
"learning_rate": 6.587671409524534e-06,
"loss": 0.36700074672698973,
"mean_token_accuracy": 0.8851820915937424,
"num_tokens": 48773921.0,
"step": 3210
},
{
"entropy": 0.3843076877295971,
"epoch": 1.2480830825973017,
"grad_norm": 1.59375,
"learning_rate": 6.577732614899379e-06,
"loss": 0.4054192066192627,
"mean_token_accuracy": 0.8755981966853141,
"num_tokens": 48859799.0,
"step": 3215
},
{
"entropy": 0.38492829352617264,
"epoch": 1.2500242647772493,
"grad_norm": 1.6328125,
"learning_rate": 6.56778689217909e-06,
"loss": 0.39413578510284425,
"mean_token_accuracy": 0.8772288784384727,
"num_tokens": 48930817.0,
"step": 3220
},
{
"entropy": 0.3711765740066767,
"epoch": 1.2519654469571968,
"grad_norm": 1.3515625,
"learning_rate": 6.5578342850370415e-06,
"loss": 0.37443616390228274,
"mean_token_accuracy": 0.8799751400947571,
"num_tokens": 49002171.0,
"step": 3225
},
{
"entropy": 0.3800849601626396,
"epoch": 1.2539066291371446,
"grad_norm": 1.5234375,
"learning_rate": 6.547874837176847e-06,
"loss": 0.3951963186264038,
"mean_token_accuracy": 0.88048807233572,
"num_tokens": 49073741.0,
"step": 3230
},
{
"entropy": 0.3790770899504423,
"epoch": 1.255847811317092,
"grad_norm": 1.4765625,
"learning_rate": 6.537908592332147e-06,
"loss": 0.40506410598754883,
"mean_token_accuracy": 0.8772617995738983,
"num_tokens": 49148184.0,
"step": 3235
},
{
"entropy": 0.3464452028274536,
"epoch": 1.2577889934970397,
"grad_norm": 1.484375,
"learning_rate": 6.5279355942664435e-06,
"loss": 0.3766259908676147,
"mean_token_accuracy": 0.8875595390796661,
"num_tokens": 49218339.0,
"step": 3240
},
{
"entropy": 0.3986640240997076,
"epoch": 1.2597301756769874,
"grad_norm": 1.9921875,
"learning_rate": 6.51795588677288e-06,
"loss": 0.3935344696044922,
"mean_token_accuracy": 0.87731524258852,
"num_tokens": 49279139.0,
"step": 3245
},
{
"entropy": 0.3633753590285778,
"epoch": 1.261671357856935,
"grad_norm": 1.6953125,
"learning_rate": 6.5079695136740706e-06,
"loss": 0.3786989688873291,
"mean_token_accuracy": 0.8843644946813584,
"num_tokens": 49352390.0,
"step": 3250
},
{
"entropy": 0.38015848845243455,
"epoch": 1.2636125400368825,
"grad_norm": 1.3203125,
"learning_rate": 6.497976518821896e-06,
"loss": 0.4066456317901611,
"mean_token_accuracy": 0.878155305981636,
"num_tokens": 49446216.0,
"step": 3255
},
{
"entropy": 0.39295368976891043,
"epoch": 1.26555372221683,
"grad_norm": 1.7578125,
"learning_rate": 6.487976946097314e-06,
"loss": 0.3828210115432739,
"mean_token_accuracy": 0.8781291946768761,
"num_tokens": 49523614.0,
"step": 3260
},
{
"entropy": 0.3769407343119383,
"epoch": 1.2674949043967776,
"grad_norm": 1.40625,
"learning_rate": 6.477970839410166e-06,
"loss": 0.40603952407836913,
"mean_token_accuracy": 0.8798100754618645,
"num_tokens": 49591786.0,
"step": 3265
},
{
"entropy": 0.3537591304630041,
"epoch": 1.2694360865767251,
"grad_norm": 1.25,
"learning_rate": 6.46795824269899e-06,
"loss": 0.3576634407043457,
"mean_token_accuracy": 0.8874867498874665,
"num_tokens": 49663637.0,
"step": 3270
},
{
"entropy": 0.35550354048609734,
"epoch": 1.271377268756673,
"grad_norm": 1.8359375,
"learning_rate": 6.457939199930815e-06,
"loss": 0.39648468494415284,
"mean_token_accuracy": 0.8848752856254578,
"num_tokens": 49731508.0,
"step": 3275
},
{
"entropy": 0.3586549339815974,
"epoch": 1.2733184509366204,
"grad_norm": 1.9296875,
"learning_rate": 6.4479137551009855e-06,
"loss": 0.3832548141479492,
"mean_token_accuracy": 0.8830386832356453,
"num_tokens": 49813544.0,
"step": 3280
},
{
"entropy": 0.3636878037825227,
"epoch": 1.275259633116568,
"grad_norm": 1.640625,
"learning_rate": 6.437881952232947e-06,
"loss": 0.3801161766052246,
"mean_token_accuracy": 0.8825449839234352,
"num_tokens": 49885620.0,
"step": 3285
},
{
"entropy": 0.3816155593842268,
"epoch": 1.2772008152965155,
"grad_norm": 1.8359375,
"learning_rate": 6.427843835378074e-06,
"loss": 0.3867227554321289,
"mean_token_accuracy": 0.87795270383358,
"num_tokens": 49964261.0,
"step": 3290
},
{
"entropy": 0.33171985633671286,
"epoch": 1.2791419974764633,
"grad_norm": 1.7734375,
"learning_rate": 6.417799448615465e-06,
"loss": 0.3791377544403076,
"mean_token_accuracy": 0.8870480135083199,
"num_tokens": 50051116.0,
"step": 3295
},
{
"entropy": 0.33659284114837645,
"epoch": 1.2810831796564108,
"grad_norm": 1.59375,
"learning_rate": 6.407748836051746e-06,
"loss": 0.35617640018463137,
"mean_token_accuracy": 0.8889725834131241,
"num_tokens": 50125047.0,
"step": 3300
},
{
"entropy": 0.3994648285210133,
"epoch": 1.2830243618363584,
"grad_norm": 2.0,
"learning_rate": 6.397692041820885e-06,
"loss": 0.37363758087158205,
"mean_token_accuracy": 0.8738816857337952,
"num_tokens": 50185527.0,
"step": 3305
},
{
"entropy": 0.3728124268352985,
"epoch": 1.284965544016306,
"grad_norm": 1.3984375,
"learning_rate": 6.387629110083995e-06,
"loss": 0.37665843963623047,
"mean_token_accuracy": 0.8807444587349892,
"num_tokens": 50267257.0,
"step": 3310
},
{
"entropy": 0.39132245220243933,
"epoch": 1.2869067261962535,
"grad_norm": 1.6015625,
"learning_rate": 6.377560085029139e-06,
"loss": 0.3918001651763916,
"mean_token_accuracy": 0.8778573974967003,
"num_tokens": 50339377.0,
"step": 3315
},
{
"entropy": 0.3549855757504702,
"epoch": 1.288847908376201,
"grad_norm": 1.375,
"learning_rate": 6.367485010871136e-06,
"loss": 0.3473607301712036,
"mean_token_accuracy": 0.8883946269750596,
"num_tokens": 50412920.0,
"step": 3320
},
{
"entropy": 0.40118363983929156,
"epoch": 1.2907890905561488,
"grad_norm": 1.7734375,
"learning_rate": 6.35740393185137e-06,
"loss": 0.4183527946472168,
"mean_token_accuracy": 0.8735328048467637,
"num_tokens": 50493105.0,
"step": 3325
},
{
"entropy": 0.3577863838523626,
"epoch": 1.2927302727360963,
"grad_norm": 1.21875,
"learning_rate": 6.347316892237592e-06,
"loss": 0.36974031925201417,
"mean_token_accuracy": 0.8823448717594147,
"num_tokens": 50597292.0,
"step": 3330
},
{
"entropy": 0.4042118158191442,
"epoch": 1.2946714549160439,
"grad_norm": 1.4609375,
"learning_rate": 6.3372239363237255e-06,
"loss": 0.3996162414550781,
"mean_token_accuracy": 0.8742659211158752,
"num_tokens": 50669226.0,
"step": 3335
},
{
"entropy": 0.38877438604831693,
"epoch": 1.2966126370959914,
"grad_norm": 1.4140625,
"learning_rate": 6.327125108429677e-06,
"loss": 0.3838073492050171,
"mean_token_accuracy": 0.8778223499655724,
"num_tokens": 50740937.0,
"step": 3340
},
{
"entropy": 0.3652618743479252,
"epoch": 1.2985538192759392,
"grad_norm": 1.609375,
"learning_rate": 6.317020452901134e-06,
"loss": 0.40174212455749514,
"mean_token_accuracy": 0.8852205485105514,
"num_tokens": 50800354.0,
"step": 3345
},
{
"entropy": 0.36313771940767764,
"epoch": 1.3004950014558867,
"grad_norm": 1.75,
"learning_rate": 6.3069100141093755e-06,
"loss": 0.40732836723327637,
"mean_token_accuracy": 0.8836523965001106,
"num_tokens": 50874364.0,
"step": 3350
},
{
"entropy": 0.3877572625875473,
"epoch": 1.3024361836358342,
"grad_norm": 1.4609375,
"learning_rate": 6.2967938364510794e-06,
"loss": 0.3883176326751709,
"mean_token_accuracy": 0.8740043297410012,
"num_tokens": 50953290.0,
"step": 3355
},
{
"entropy": 0.3541896607726812,
"epoch": 1.3043773658157818,
"grad_norm": 1.71875,
"learning_rate": 6.2866719643481185e-06,
"loss": 0.40380287170410156,
"mean_token_accuracy": 0.8862088546156883,
"num_tokens": 51013828.0,
"step": 3360
},
{
"entropy": 0.3846803639084101,
"epoch": 1.3063185479957293,
"grad_norm": 1.5546875,
"learning_rate": 6.2765444422473735e-06,
"loss": 0.4024141788482666,
"mean_token_accuracy": 0.8768733203411102,
"num_tokens": 51088099.0,
"step": 3365
},
{
"entropy": 0.382393941283226,
"epoch": 1.3082597301756769,
"grad_norm": 1.5390625,
"learning_rate": 6.2664113146205355e-06,
"loss": 0.4033693313598633,
"mean_token_accuracy": 0.8758635804057121,
"num_tokens": 51174151.0,
"step": 3370
},
{
"entropy": 0.35187431797385216,
"epoch": 1.3102009123556246,
"grad_norm": 1.53125,
"learning_rate": 6.256272625963908e-06,
"loss": 0.3925636291503906,
"mean_token_accuracy": 0.8831515818834305,
"num_tokens": 51253871.0,
"step": 3375
},
{
"entropy": 0.3575460772961378,
"epoch": 1.3121420945355722,
"grad_norm": 1.7578125,
"learning_rate": 6.24612842079822e-06,
"loss": 0.3699699878692627,
"mean_token_accuracy": 0.8861239358782769,
"num_tokens": 51320927.0,
"step": 3380
},
{
"entropy": 0.3574929475784302,
"epoch": 1.3140832767155197,
"grad_norm": 1.40625,
"learning_rate": 6.235978743668415e-06,
"loss": 0.3928325653076172,
"mean_token_accuracy": 0.8840924382209778,
"num_tokens": 51393313.0,
"step": 3385
},
{
"entropy": 0.4037714671343565,
"epoch": 1.3160244588954673,
"grad_norm": 1.75,
"learning_rate": 6.2258236391434735e-06,
"loss": 0.43996176719665525,
"mean_token_accuracy": 0.8732839792966842,
"num_tokens": 51469149.0,
"step": 3390
},
{
"entropy": 0.3767301281914115,
"epoch": 1.317965641075415,
"grad_norm": 1.671875,
"learning_rate": 6.215663151816204e-06,
"loss": 0.41208739280700685,
"mean_token_accuracy": 0.8773611128330231,
"num_tokens": 51549599.0,
"step": 3395
},
{
"entropy": 0.41424218341708186,
"epoch": 1.3199068232553626,
"grad_norm": 1.390625,
"learning_rate": 6.205497326303054e-06,
"loss": 0.4277363300323486,
"mean_token_accuracy": 0.8679974019527436,
"num_tokens": 51642096.0,
"step": 3400
},
{
"epoch": 1.3199068232553626,
"eval_entropy": 0.3615535780503174,
"eval_loss": 0.36774712800979614,
"eval_mean_token_accuracy": 0.8836295662535352,
"eval_num_tokens": 51642096.0,
"eval_runtime": 60.122,
"eval_samples_per_second": 35.744,
"eval_steps_per_second": 35.744,
"step": 3400
},
{
"entropy": 0.3396712843328714,
"epoch": 1.3218480054353101,
"grad_norm": 2.0,
"learning_rate": 6.1953262072439104e-06,
"loss": 0.36101136207580564,
"mean_token_accuracy": 0.8887990996241569,
"num_tokens": 51720729.0,
"step": 3405
},
{
"entropy": 0.37931633833795786,
"epoch": 1.3237891876152577,
"grad_norm": 1.5625,
"learning_rate": 6.185149839301904e-06,
"loss": 0.4054765224456787,
"mean_token_accuracy": 0.8779459938406944,
"num_tokens": 51794420.0,
"step": 3410
},
{
"entropy": 0.3843179401010275,
"epoch": 1.3257303697952052,
"grad_norm": 1.5,
"learning_rate": 6.1749682671632185e-06,
"loss": 0.40850515365600587,
"mean_token_accuracy": 0.8769529685378075,
"num_tokens": 51877828.0,
"step": 3415
},
{
"entropy": 0.3709686040878296,
"epoch": 1.3276715519751527,
"grad_norm": 1.4609375,
"learning_rate": 6.1647815355368845e-06,
"loss": 0.38035385608673095,
"mean_token_accuracy": 0.8827380672097206,
"num_tokens": 51949655.0,
"step": 3420
},
{
"entropy": 0.3752392638474703,
"epoch": 1.3296127341551005,
"grad_norm": 1.6328125,
"learning_rate": 6.154589689154594e-06,
"loss": 0.36831343173980713,
"mean_token_accuracy": 0.8784654662013054,
"num_tokens": 52034397.0,
"step": 3425
},
{
"entropy": 0.4212960582226515,
"epoch": 1.331553916335048,
"grad_norm": 1.921875,
"learning_rate": 6.144392772770498e-06,
"loss": 0.4582382678985596,
"mean_token_accuracy": 0.8687367781996727,
"num_tokens": 52108021.0,
"step": 3430
},
{
"entropy": 0.3721700422465801,
"epoch": 1.3334950985149956,
"grad_norm": 1.703125,
"learning_rate": 6.134190831161004e-06,
"loss": 0.39261841773986816,
"mean_token_accuracy": 0.8782809346914291,
"num_tokens": 52189452.0,
"step": 3435
},
{
"entropy": 0.41867862418293955,
"epoch": 1.3354362806949434,
"grad_norm": 1.5703125,
"learning_rate": 6.123983909124597e-06,
"loss": 0.4275325298309326,
"mean_token_accuracy": 0.8698067650198936,
"num_tokens": 52269123.0,
"step": 3440
},
{
"entropy": 0.32477850653231144,
"epoch": 1.337377462874891,
"grad_norm": 1.65625,
"learning_rate": 6.113772051481622e-06,
"loss": 0.3294957399368286,
"mean_token_accuracy": 0.8947040095925332,
"num_tokens": 52329549.0,
"step": 3445
},
{
"entropy": 0.3995737452059984,
"epoch": 1.3393186450548384,
"grad_norm": 1.3125,
"learning_rate": 6.103555303074105e-06,
"loss": 0.42353267669677735,
"mean_token_accuracy": 0.8714441776275634,
"num_tokens": 52432623.0,
"step": 3450
},
{
"entropy": 0.3620085157454014,
"epoch": 1.341259827234786,
"grad_norm": 1.5234375,
"learning_rate": 6.093333708765541e-06,
"loss": 0.37543137073516847,
"mean_token_accuracy": 0.8843591079115868,
"num_tokens": 52505394.0,
"step": 3455
},
{
"entropy": 0.40583874434232714,
"epoch": 1.3432010094147335,
"grad_norm": 1.9921875,
"learning_rate": 6.08310731344071e-06,
"loss": 0.40748982429504393,
"mean_token_accuracy": 0.8733288407325744,
"num_tokens": 52574341.0,
"step": 3460
},
{
"entropy": 0.37172621488571167,
"epoch": 1.345142191594681,
"grad_norm": 1.390625,
"learning_rate": 6.072876162005474e-06,
"loss": 0.3841069221496582,
"mean_token_accuracy": 0.8812505900859833,
"num_tokens": 52650739.0,
"step": 3465
},
{
"entropy": 0.36620298847556115,
"epoch": 1.3470833737746286,
"grad_norm": 1.7265625,
"learning_rate": 6.062640299386573e-06,
"loss": 0.37988154888153075,
"mean_token_accuracy": 0.8840025961399078,
"num_tokens": 52730667.0,
"step": 3470
},
{
"entropy": 0.37304456941783426,
"epoch": 1.3490245559545764,
"grad_norm": 1.6953125,
"learning_rate": 6.052399770531441e-06,
"loss": 0.3846965551376343,
"mean_token_accuracy": 0.8781289547681809,
"num_tokens": 52816922.0,
"step": 3475
},
{
"entropy": 0.37507508173584936,
"epoch": 1.350965738134524,
"grad_norm": 1.390625,
"learning_rate": 6.042154620408003e-06,
"loss": 0.39843082427978516,
"mean_token_accuracy": 0.8789984509348869,
"num_tokens": 52893991.0,
"step": 3480
},
{
"entropy": 0.33831214234232904,
"epoch": 1.3529069203144715,
"grad_norm": 1.890625,
"learning_rate": 6.0319048940044715e-06,
"loss": 0.35968937873840334,
"mean_token_accuracy": 0.8883291095495224,
"num_tokens": 52959658.0,
"step": 3485
},
{
"entropy": 0.39217614494264125,
"epoch": 1.3548481024944192,
"grad_norm": 1.5234375,
"learning_rate": 6.021650636329159e-06,
"loss": 0.395078182220459,
"mean_token_accuracy": 0.8747363820672035,
"num_tokens": 53030794.0,
"step": 3490
},
{
"entropy": 0.36045850329101087,
"epoch": 1.3567892846743668,
"grad_norm": 1.2890625,
"learning_rate": 6.011391892410272e-06,
"loss": 0.39100329875946044,
"mean_token_accuracy": 0.8819140180945396,
"num_tokens": 53118526.0,
"step": 3495
},
{
"entropy": 0.393280316144228,
"epoch": 1.3587304668543143,
"grad_norm": 1.7109375,
"learning_rate": 6.0011287072957205e-06,
"loss": 0.39736104011535645,
"mean_token_accuracy": 0.8757255643606185,
"num_tokens": 53188265.0,
"step": 3500
}
],
"logging_steps": 5,
"max_steps": 7728,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.926527421264753e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}