| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.3587304668543143, |
| "eval_steps": 200, |
| "global_step": 3500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.1698873918503523, |
| "epoch": 0.0019411821799475881, |
| "grad_norm": 28.75, |
| "learning_rate": 1.7241379310344828e-07, |
| "loss": 0.9568568229675293, |
| "mean_token_accuracy": 0.8273445844650269, |
| "num_tokens": 68060.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 0.1770024599507451, |
| "epoch": 0.0038823643598951763, |
| "grad_norm": 35.0, |
| "learning_rate": 3.8793103448275865e-07, |
| "loss": 0.9151897430419922, |
| "mean_token_accuracy": 0.8321746543049813, |
| "num_tokens": 138267.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.19476149305701257, |
| "epoch": 0.005823546539842764, |
| "grad_norm": 30.75, |
| "learning_rate": 6.034482758620691e-07, |
| "loss": 0.8955170631408691, |
| "mean_token_accuracy": 0.8212856188416481, |
| "num_tokens": 217527.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.19077662620693445, |
| "epoch": 0.007764728719790353, |
| "grad_norm": 29.875, |
| "learning_rate": 8.189655172413794e-07, |
| "loss": 0.9730923652648926, |
| "mean_token_accuracy": 0.822833463549614, |
| "num_tokens": 280569.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.1851712815463543, |
| "epoch": 0.00970591089973794, |
| "grad_norm": 31.0, |
| "learning_rate": 1.0344827586206898e-06, |
| "loss": 0.8731012344360352, |
| "mean_token_accuracy": 0.828074723482132, |
| "num_tokens": 361410.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.21108674593269824, |
| "epoch": 0.011647093079685528, |
| "grad_norm": 30.875, |
| "learning_rate": 1.25e-06, |
| "loss": 0.9689438819885254, |
| "mean_token_accuracy": 0.8158086076378822, |
| "num_tokens": 433416.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.1849596280604601, |
| "epoch": 0.013588275259633116, |
| "grad_norm": 23.375, |
| "learning_rate": 1.4655172413793104e-06, |
| "loss": 0.8792544364929199, |
| "mean_token_accuracy": 0.8349022850394249, |
| "num_tokens": 509913.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.22154813222587108, |
| "epoch": 0.015529457439580705, |
| "grad_norm": 24.125, |
| "learning_rate": 1.681034482758621e-06, |
| "loss": 0.8865782737731933, |
| "mean_token_accuracy": 0.8225297197699547, |
| "num_tokens": 574517.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.21101174745708703, |
| "epoch": 0.017470639619528293, |
| "grad_norm": 21.125, |
| "learning_rate": 1.896551724137931e-06, |
| "loss": 0.8394794464111328, |
| "mean_token_accuracy": 0.8262531071901321, |
| "num_tokens": 646659.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.22667213380336762, |
| "epoch": 0.01941182179947588, |
| "grad_norm": 18.875, |
| "learning_rate": 2.1120689655172416e-06, |
| "loss": 0.8253890037536621, |
| "mean_token_accuracy": 0.8253167048096657, |
| "num_tokens": 726398.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.23717499971389772, |
| "epoch": 0.02135300397942347, |
| "grad_norm": 17.25, |
| "learning_rate": 2.327586206896552e-06, |
| "loss": 0.7434019565582275, |
| "mean_token_accuracy": 0.8295832589268685, |
| "num_tokens": 809917.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.25738408528268336, |
| "epoch": 0.023294186159371056, |
| "grad_norm": 13.8125, |
| "learning_rate": 2.543103448275862e-06, |
| "loss": 0.7521392345428467, |
| "mean_token_accuracy": 0.8236480697989463, |
| "num_tokens": 886659.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.22672694064676763, |
| "epoch": 0.025235368339318644, |
| "grad_norm": 12.5625, |
| "learning_rate": 2.7586206896551725e-06, |
| "loss": 0.7187893390655518, |
| "mean_token_accuracy": 0.8356023579835892, |
| "num_tokens": 953610.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.26906434297561643, |
| "epoch": 0.02717655051926623, |
| "grad_norm": 11.9375, |
| "learning_rate": 2.9741379310344832e-06, |
| "loss": 0.7249302387237548, |
| "mean_token_accuracy": 0.8295665010809898, |
| "num_tokens": 1024103.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.2811603628098965, |
| "epoch": 0.029117732699213823, |
| "grad_norm": 10.5, |
| "learning_rate": 3.1896551724137935e-06, |
| "loss": 0.7414528846740722, |
| "mean_token_accuracy": 0.8267218798398972, |
| "num_tokens": 1104806.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.2909968294203281, |
| "epoch": 0.03105891487916141, |
| "grad_norm": 10.3125, |
| "learning_rate": 3.4051724137931034e-06, |
| "loss": 0.6241181850433349, |
| "mean_token_accuracy": 0.8339074313640594, |
| "num_tokens": 1185581.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.3096594549715519, |
| "epoch": 0.033000097059108995, |
| "grad_norm": 7.6875, |
| "learning_rate": 3.620689655172414e-06, |
| "loss": 0.5764092445373535, |
| "mean_token_accuracy": 0.834169502556324, |
| "num_tokens": 1277766.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.3442493978887796, |
| "epoch": 0.034941279239056586, |
| "grad_norm": 6.3125, |
| "learning_rate": 3.8362068965517245e-06, |
| "loss": 0.6330607414245606, |
| "mean_token_accuracy": 0.8328578889369964, |
| "num_tokens": 1343195.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.3338872347027063, |
| "epoch": 0.03688246141900418, |
| "grad_norm": 3.9375, |
| "learning_rate": 4.051724137931034e-06, |
| "loss": 0.5495956897735595, |
| "mean_token_accuracy": 0.8446441546082497, |
| "num_tokens": 1422307.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.3676390130072832, |
| "epoch": 0.03882364359895176, |
| "grad_norm": 3.78125, |
| "learning_rate": 4.267241379310345e-06, |
| "loss": 0.5221522331237793, |
| "mean_token_accuracy": 0.8460515171289444, |
| "num_tokens": 1502407.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.3870431698858738, |
| "epoch": 0.04076482577889935, |
| "grad_norm": 2.875, |
| "learning_rate": 4.482758620689656e-06, |
| "loss": 0.5109545230865479, |
| "mean_token_accuracy": 0.8474476292729378, |
| "num_tokens": 1588898.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 0.4107159897685051, |
| "epoch": 0.04270600795884694, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.698275862068966e-06, |
| "loss": 0.5011598587036132, |
| "mean_token_accuracy": 0.8443124324083329, |
| "num_tokens": 1680514.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.39204273708164694, |
| "epoch": 0.04464719013879453, |
| "grad_norm": 2.125, |
| "learning_rate": 4.9137931034482765e-06, |
| "loss": 0.4943844795227051, |
| "mean_token_accuracy": 0.8546174108982086, |
| "num_tokens": 1751043.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 0.44253255538642405, |
| "epoch": 0.04658837231874211, |
| "grad_norm": 1.96875, |
| "learning_rate": 5.129310344827587e-06, |
| "loss": 0.5116967678070068, |
| "mean_token_accuracy": 0.8467695400118828, |
| "num_tokens": 1828152.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.4137393295764923, |
| "epoch": 0.0485295544986897, |
| "grad_norm": 2.421875, |
| "learning_rate": 5.344827586206896e-06, |
| "loss": 0.4703562259674072, |
| "mean_token_accuracy": 0.8589524060487748, |
| "num_tokens": 1903581.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.43903482854366305, |
| "epoch": 0.05047073667863729, |
| "grad_norm": 2.375, |
| "learning_rate": 5.560344827586207e-06, |
| "loss": 0.4887136936187744, |
| "mean_token_accuracy": 0.8527244672179222, |
| "num_tokens": 1976772.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.4222589176148176, |
| "epoch": 0.05241191885858488, |
| "grad_norm": 2.265625, |
| "learning_rate": 5.775862068965518e-06, |
| "loss": 0.45414047241210936, |
| "mean_token_accuracy": 0.8595154702663421, |
| "num_tokens": 2045499.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 0.4219055060297251, |
| "epoch": 0.05435310103853246, |
| "grad_norm": 1.9765625, |
| "learning_rate": 5.9913793103448284e-06, |
| "loss": 0.49632959365844725, |
| "mean_token_accuracy": 0.8565252378582955, |
| "num_tokens": 2120623.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.42592958733439445, |
| "epoch": 0.056294283218480054, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.206896551724138e-06, |
| "loss": 0.4315296173095703, |
| "mean_token_accuracy": 0.8575032651424408, |
| "num_tokens": 2198334.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 0.40059518739581107, |
| "epoch": 0.058235465398427645, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.422413793103449e-06, |
| "loss": 0.4771871566772461, |
| "mean_token_accuracy": 0.8683709859848022, |
| "num_tokens": 2265434.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.41876095086336135, |
| "epoch": 0.06017664757837523, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.63793103448276e-06, |
| "loss": 0.444645357131958, |
| "mean_token_accuracy": 0.8565793663263321, |
| "num_tokens": 2358742.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 0.4004307441413403, |
| "epoch": 0.06211782975832282, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.853448275862069e-06, |
| "loss": 0.4320836544036865, |
| "mean_token_accuracy": 0.8631758615374565, |
| "num_tokens": 2437938.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.41059495508670807, |
| "epoch": 0.0640590119382704, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.0689655172413796e-06, |
| "loss": 0.450638484954834, |
| "mean_token_accuracy": 0.8649105593562126, |
| "num_tokens": 2506611.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 0.4375029347836971, |
| "epoch": 0.06600019411821799, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.28448275862069e-06, |
| "loss": 0.48232545852661135, |
| "mean_token_accuracy": 0.8586594820022583, |
| "num_tokens": 2585776.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.39430369548499583, |
| "epoch": 0.06794137629816559, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.42594180107116697, |
| "mean_token_accuracy": 0.8716334730386734, |
| "num_tokens": 2649428.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.4384324595332146, |
| "epoch": 0.06988255847811317, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.715517241379312e-06, |
| "loss": 0.45015506744384765, |
| "mean_token_accuracy": 0.8587341636419297, |
| "num_tokens": 2741939.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.4197473503649235, |
| "epoch": 0.07182374065806076, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.93103448275862e-06, |
| "loss": 0.430635404586792, |
| "mean_token_accuracy": 0.8667460069060325, |
| "num_tokens": 2814243.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 0.40914484262466433, |
| "epoch": 0.07376492283800835, |
| "grad_norm": 2.0625, |
| "learning_rate": 8.146551724137932e-06, |
| "loss": 0.43552379608154296, |
| "mean_token_accuracy": 0.865776352584362, |
| "num_tokens": 2885496.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.4126306913793087, |
| "epoch": 0.07570610501795594, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.362068965517242e-06, |
| "loss": 0.44997596740722656, |
| "mean_token_accuracy": 0.8635187759995461, |
| "num_tokens": 2952067.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 0.39164264835417273, |
| "epoch": 0.07764728719790352, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.577586206896551e-06, |
| "loss": 0.4397777557373047, |
| "mean_token_accuracy": 0.8692673921585083, |
| "num_tokens": 3032945.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07764728719790352, |
| "eval_entropy": 0.40086069169184285, |
| "eval_loss": 0.42599618434906006, |
| "eval_mean_token_accuracy": 0.8688706356933251, |
| "eval_num_tokens": 3032945.0, |
| "eval_runtime": 60.1253, |
| "eval_samples_per_second": 35.742, |
| "eval_steps_per_second": 35.742, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.43272491060197354, |
| "epoch": 0.0795884693778511, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.793103448275862e-06, |
| "loss": 0.44966917037963866, |
| "mean_token_accuracy": 0.8635564997792244, |
| "num_tokens": 3099465.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 0.4191489264369011, |
| "epoch": 0.0815296515577987, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.008620689655173e-06, |
| "loss": 0.45826034545898436, |
| "mean_token_accuracy": 0.8656236171722412, |
| "num_tokens": 3165471.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.3851976301521063, |
| "epoch": 0.08347083373774629, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.224137931034484e-06, |
| "loss": 0.41442441940307617, |
| "mean_token_accuracy": 0.8754843935370445, |
| "num_tokens": 3250159.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 0.4017783209681511, |
| "epoch": 0.08541201591769387, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.439655172413794e-06, |
| "loss": 0.4402902603149414, |
| "mean_token_accuracy": 0.8709232717752456, |
| "num_tokens": 3336018.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.38373975045979025, |
| "epoch": 0.08735319809764146, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.655172413793105e-06, |
| "loss": 0.3947699546813965, |
| "mean_token_accuracy": 0.8761502489447593, |
| "num_tokens": 3402332.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.3711710192263126, |
| "epoch": 0.08929438027758906, |
| "grad_norm": 1.4453125, |
| "learning_rate": 9.870689655172414e-06, |
| "loss": 0.38801021575927735, |
| "mean_token_accuracy": 0.8783514246344566, |
| "num_tokens": 3490759.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.37838716208934786, |
| "epoch": 0.09123556245753664, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.999998243530697e-06, |
| "loss": 0.4078525543212891, |
| "mean_token_accuracy": 0.8711143419146538, |
| "num_tokens": 3581453.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 0.37663319632411, |
| "epoch": 0.09317674463748422, |
| "grad_norm": 2.5, |
| "learning_rate": 9.999978483265213e-06, |
| "loss": 0.4262491226196289, |
| "mean_token_accuracy": 0.8772962838411331, |
| "num_tokens": 3646168.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.4050050787627697, |
| "epoch": 0.09511792681743182, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.999936767234675e-06, |
| "loss": 0.4260216236114502, |
| "mean_token_accuracy": 0.8673088252544403, |
| "num_tokens": 3731064.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 0.384697999432683, |
| "epoch": 0.0970591089973794, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.999873095622266e-06, |
| "loss": 0.3961009979248047, |
| "mean_token_accuracy": 0.8749020054936409, |
| "num_tokens": 3798798.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.4126061208546162, |
| "epoch": 0.09900029117732699, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.999787468707579e-06, |
| "loss": 0.43981060981750486, |
| "mean_token_accuracy": 0.8663996770977974, |
| "num_tokens": 3886350.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 0.40845385044813154, |
| "epoch": 0.10094147335727457, |
| "grad_norm": 1.5078125, |
| "learning_rate": 9.999679886866614e-06, |
| "loss": 0.41130051612854, |
| "mean_token_accuracy": 0.8713549628853798, |
| "num_tokens": 3959484.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.40083046182990073, |
| "epoch": 0.10288265553722217, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.999550350571785e-06, |
| "loss": 0.4313651561737061, |
| "mean_token_accuracy": 0.8690039083361626, |
| "num_tokens": 4039205.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 0.35567491836845877, |
| "epoch": 0.10482383771716976, |
| "grad_norm": 1.625, |
| "learning_rate": 9.999398860391906e-06, |
| "loss": 0.39510302543640136, |
| "mean_token_accuracy": 0.8858051553368569, |
| "num_tokens": 4114526.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.3589722190052271, |
| "epoch": 0.10676501989711734, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.9992254169922e-06, |
| "loss": 0.377154541015625, |
| "mean_token_accuracy": 0.8847770616412163, |
| "num_tokens": 4180520.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.37694212086498735, |
| "epoch": 0.10870620207706493, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.99903002113428e-06, |
| "loss": 0.3913008213043213, |
| "mean_token_accuracy": 0.8760873630642891, |
| "num_tokens": 4256660.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.43231718949973585, |
| "epoch": 0.11064738425701252, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.99881267367617e-06, |
| "loss": 0.4287400722503662, |
| "mean_token_accuracy": 0.8621754452586174, |
| "num_tokens": 4326362.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 0.4439574245363474, |
| "epoch": 0.11258856643696011, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.998573375572277e-06, |
| "loss": 0.44347705841064455, |
| "mean_token_accuracy": 0.8591988816857338, |
| "num_tokens": 4401568.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.4327508192509413, |
| "epoch": 0.11452974861690769, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.998312127873398e-06, |
| "loss": 0.41773738861083987, |
| "mean_token_accuracy": 0.8604527100920677, |
| "num_tokens": 4482468.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 0.39415039904415605, |
| "epoch": 0.11647093079685529, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.99802893172672e-06, |
| "loss": 0.37184581756591795, |
| "mean_token_accuracy": 0.875768692791462, |
| "num_tokens": 4566311.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.3908670715987682, |
| "epoch": 0.11841211297680287, |
| "grad_norm": 1.421875, |
| "learning_rate": 9.997723788375803e-06, |
| "loss": 0.4179991722106934, |
| "mean_token_accuracy": 0.8736939936876297, |
| "num_tokens": 4639335.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 0.37057909071445466, |
| "epoch": 0.12035329515675046, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.997396699160586e-06, |
| "loss": 0.3718397855758667, |
| "mean_token_accuracy": 0.8778384670615196, |
| "num_tokens": 4729786.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.3775249246507883, |
| "epoch": 0.12229447733669804, |
| "grad_norm": 1.25, |
| "learning_rate": 9.997047665517373e-06, |
| "loss": 0.36892924308776853, |
| "mean_token_accuracy": 0.8788423538208008, |
| "num_tokens": 4815579.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 0.42527642734348775, |
| "epoch": 0.12423565951664564, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.996676688978832e-06, |
| "loss": 0.4455845832824707, |
| "mean_token_accuracy": 0.8667929217219352, |
| "num_tokens": 4890387.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.3964430205523968, |
| "epoch": 0.1261768416965932, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.996283771173982e-06, |
| "loss": 0.4093163967132568, |
| "mean_token_accuracy": 0.871640557050705, |
| "num_tokens": 4963050.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.41448891162872314, |
| "epoch": 0.1281180238765408, |
| "grad_norm": 1.453125, |
| "learning_rate": 9.995868913828198e-06, |
| "loss": 0.4085641860961914, |
| "mean_token_accuracy": 0.8690432503819465, |
| "num_tokens": 5041868.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.4093653842806816, |
| "epoch": 0.1300592060564884, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.995432118763182e-06, |
| "loss": 0.4269090175628662, |
| "mean_token_accuracy": 0.8645370990037918, |
| "num_tokens": 5131350.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.40297048091888427, |
| "epoch": 0.13200038823643598, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.994973387896983e-06, |
| "loss": 0.4135453224182129, |
| "mean_token_accuracy": 0.8721030279994011, |
| "num_tokens": 5206454.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.3713659271597862, |
| "epoch": 0.13394157041638358, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.994492723243965e-06, |
| "loss": 0.38209033012390137, |
| "mean_token_accuracy": 0.8801575794816017, |
| "num_tokens": 5290337.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 0.43902772702276704, |
| "epoch": 0.13588275259633117, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.993990126914808e-06, |
| "loss": 0.45044879913330077, |
| "mean_token_accuracy": 0.8585825085639953, |
| "num_tokens": 5355252.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.403434070199728, |
| "epoch": 0.13782393477627874, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.9934656011165e-06, |
| "loss": 0.4331518650054932, |
| "mean_token_accuracy": 0.8729702636599541, |
| "num_tokens": 5421254.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 0.3791210547089577, |
| "epoch": 0.13976511695622634, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.992919148152323e-06, |
| "loss": 0.4140181064605713, |
| "mean_token_accuracy": 0.8779570132493972, |
| "num_tokens": 5506667.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.3965904530137777, |
| "epoch": 0.14170629913617394, |
| "grad_norm": 1.484375, |
| "learning_rate": 9.992350770421849e-06, |
| "loss": 0.40141940116882324, |
| "mean_token_accuracy": 0.873149348795414, |
| "num_tokens": 5582156.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 0.3853264570236206, |
| "epoch": 0.1436474813161215, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.991760470420917e-06, |
| "loss": 0.386338210105896, |
| "mean_token_accuracy": 0.8758631706237793, |
| "num_tokens": 5651606.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.3922650724649429, |
| "epoch": 0.1455886634960691, |
| "grad_norm": 1.484375, |
| "learning_rate": 9.99114825074164e-06, |
| "loss": 0.41069760322570803, |
| "mean_token_accuracy": 0.8746298983693123, |
| "num_tokens": 5746987.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.43557493686676024, |
| "epoch": 0.1475298456760167, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.990514114072379e-06, |
| "loss": 0.44701457023620605, |
| "mean_token_accuracy": 0.8603363439440728, |
| "num_tokens": 5820511.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.3883111171424389, |
| "epoch": 0.14947102785596428, |
| "grad_norm": 1.4921875, |
| "learning_rate": 9.989858063197735e-06, |
| "loss": 0.40341935157775877, |
| "mean_token_accuracy": 0.8721037909388543, |
| "num_tokens": 5901615.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 0.39608169682323935, |
| "epoch": 0.15141221003591188, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.989180100998543e-06, |
| "loss": 0.4208333492279053, |
| "mean_token_accuracy": 0.8722649529576302, |
| "num_tokens": 5974272.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.37846892662346365, |
| "epoch": 0.15335339221585945, |
| "grad_norm": 1.40625, |
| "learning_rate": 9.988480230451849e-06, |
| "loss": 0.38179306983947753, |
| "mean_token_accuracy": 0.8771921068429946, |
| "num_tokens": 6060716.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 0.37789811603724954, |
| "epoch": 0.15529457439580704, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.987758454630909e-06, |
| "loss": 0.39535834789276125, |
| "mean_token_accuracy": 0.8785205245018005, |
| "num_tokens": 6126916.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.15529457439580704, |
| "eval_entropy": 0.3923966215794283, |
| "eval_loss": 0.39556533098220825, |
| "eval_mean_token_accuracy": 0.876580595429302, |
| "eval_num_tokens": 6126916.0, |
| "eval_runtime": 60.1557, |
| "eval_samples_per_second": 35.724, |
| "eval_steps_per_second": 35.724, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.43358618319034575, |
| "epoch": 0.15723575657575464, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.98701477670516e-06, |
| "loss": 0.4627527236938477, |
| "mean_token_accuracy": 0.8672218635678292, |
| "num_tokens": 6191091.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 0.41946529373526575, |
| "epoch": 0.1591769387557022, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.986249199940221e-06, |
| "loss": 0.4011059284210205, |
| "mean_token_accuracy": 0.8665074944496155, |
| "num_tokens": 6271540.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.40461285747587683, |
| "epoch": 0.1611181209356498, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.985461727697873e-06, |
| "loss": 0.4005119800567627, |
| "mean_token_accuracy": 0.8737778559327125, |
| "num_tokens": 6335828.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 0.42256330624222754, |
| "epoch": 0.1630593031155974, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.98465236343604e-06, |
| "loss": 0.447072172164917, |
| "mean_token_accuracy": 0.8665044084191322, |
| "num_tokens": 6398004.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.3777090422809124, |
| "epoch": 0.16500048529554498, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.98382111070878e-06, |
| "loss": 0.40898308753967283, |
| "mean_token_accuracy": 0.8773329868912697, |
| "num_tokens": 6475011.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.3920007921755314, |
| "epoch": 0.16694166747549258, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.982967973166269e-06, |
| "loss": 0.36671743392944334, |
| "mean_token_accuracy": 0.8770380824804306, |
| "num_tokens": 6537403.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.38834148123860357, |
| "epoch": 0.16888284965544018, |
| "grad_norm": 1.5625, |
| "learning_rate": 9.982092954554776e-06, |
| "loss": 0.40844144821166994, |
| "mean_token_accuracy": 0.8771779343485833, |
| "num_tokens": 6605063.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 0.3952221803367138, |
| "epoch": 0.17082403183538775, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.981196058716662e-06, |
| "loss": 0.42590937614440916, |
| "mean_token_accuracy": 0.8744154885411263, |
| "num_tokens": 6679167.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.38398993872106074, |
| "epoch": 0.17276521401533534, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.98027728959035e-06, |
| "loss": 0.39303438663482665, |
| "mean_token_accuracy": 0.8744676560163498, |
| "num_tokens": 6761337.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 0.3808287113904953, |
| "epoch": 0.17470639619528291, |
| "grad_norm": 1.4296875, |
| "learning_rate": 9.979336651210314e-06, |
| "loss": 0.3940417289733887, |
| "mean_token_accuracy": 0.8789507359266281, |
| "num_tokens": 6834935.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.4267194837331772, |
| "epoch": 0.1766475783752305, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.978374147707055e-06, |
| "loss": 0.4340329647064209, |
| "mean_token_accuracy": 0.8673587426543236, |
| "num_tokens": 6904193.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 0.41968510262668135, |
| "epoch": 0.1785887605551781, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.977389783307095e-06, |
| "loss": 0.4462919235229492, |
| "mean_token_accuracy": 0.8695808529853821, |
| "num_tokens": 6971958.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.38464379906654356, |
| "epoch": 0.18052994273512568, |
| "grad_norm": 1.4140625, |
| "learning_rate": 9.976383562332946e-06, |
| "loss": 0.40283498764038084, |
| "mean_token_accuracy": 0.880063496530056, |
| "num_tokens": 7053470.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 0.364135454967618, |
| "epoch": 0.18247112491507328, |
| "grad_norm": 1.640625, |
| "learning_rate": 9.975355489203097e-06, |
| "loss": 0.4077275276184082, |
| "mean_token_accuracy": 0.8819017142057419, |
| "num_tokens": 7126534.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.3730104427784681, |
| "epoch": 0.18441230709502088, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.974305568431994e-06, |
| "loss": 0.3929471969604492, |
| "mean_token_accuracy": 0.8784888133406639, |
| "num_tokens": 7201674.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 0.3986961957067251, |
| "epoch": 0.18635348927496845, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.973233804630022e-06, |
| "loss": 0.4142603874206543, |
| "mean_token_accuracy": 0.8733824387192726, |
| "num_tokens": 7271021.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.4345793057233095, |
| "epoch": 0.18829467145491605, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.972140202503477e-06, |
| "loss": 0.45402941703796384, |
| "mean_token_accuracy": 0.8599157705903053, |
| "num_tokens": 7347794.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 0.39443473145365715, |
| "epoch": 0.19023585363486364, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.971024766854554e-06, |
| "loss": 0.4239619731903076, |
| "mean_token_accuracy": 0.8729955241084099, |
| "num_tokens": 7430791.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.39781664684414864, |
| "epoch": 0.19217703581481121, |
| "grad_norm": 1.640625, |
| "learning_rate": 9.969887502581324e-06, |
| "loss": 0.41582446098327636, |
| "mean_token_accuracy": 0.8728051796555519, |
| "num_tokens": 7509803.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 0.45657297112047673, |
| "epoch": 0.1941182179947588, |
| "grad_norm": 1.265625, |
| "learning_rate": 9.96872841467771e-06, |
| "loss": 0.446823263168335, |
| "mean_token_accuracy": 0.8597154960036277, |
| "num_tokens": 7601591.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.3968469314277172, |
| "epoch": 0.19605940017470638, |
| "grad_norm": 1.703125, |
| "learning_rate": 9.967547508233466e-06, |
| "loss": 0.4176668643951416, |
| "mean_token_accuracy": 0.872602291405201, |
| "num_tokens": 7666352.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 0.41542044915258886, |
| "epoch": 0.19800058235465398, |
| "grad_norm": 1.515625, |
| "learning_rate": 9.966344788434154e-06, |
| "loss": 0.43819799423217776, |
| "mean_token_accuracy": 0.865588866174221, |
| "num_tokens": 7742267.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.39156174324452875, |
| "epoch": 0.19994176453460158, |
| "grad_norm": 1.3125, |
| "learning_rate": 9.965120260561126e-06, |
| "loss": 0.39728028774261476, |
| "mean_token_accuracy": 0.8775914892554283, |
| "num_tokens": 7819899.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 0.3775805365294218, |
| "epoch": 0.20188294671454915, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.963873929991492e-06, |
| "loss": 0.4102130889892578, |
| "mean_token_accuracy": 0.878247183561325, |
| "num_tokens": 7887873.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.39370285868644717, |
| "epoch": 0.20382412889449675, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.962605802198105e-06, |
| "loss": 0.4014415264129639, |
| "mean_token_accuracy": 0.8733704462647438, |
| "num_tokens": 7961482.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 0.40123398676514627, |
| "epoch": 0.20576531107444435, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.961315882749531e-06, |
| "loss": 0.42458133697509765, |
| "mean_token_accuracy": 0.875715845823288, |
| "num_tokens": 8022441.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.3925070337951183, |
| "epoch": 0.20770649325439192, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.960004177310029e-06, |
| "loss": 0.38911452293396, |
| "mean_token_accuracy": 0.8763631775975227, |
| "num_tokens": 8098552.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 0.39059548266232014, |
| "epoch": 0.20964767543433951, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.958670691639523e-06, |
| "loss": 0.41231446266174315, |
| "mean_token_accuracy": 0.8729776293039322, |
| "num_tokens": 8193252.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.3952123038470745, |
| "epoch": 0.2115888576142871, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.957315431593578e-06, |
| "loss": 0.41542778015136717, |
| "mean_token_accuracy": 0.8778386160731315, |
| "num_tokens": 8259794.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 0.39136257991194723, |
| "epoch": 0.21353003979423468, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.955938403123372e-06, |
| "loss": 0.4131179332733154, |
| "mean_token_accuracy": 0.8729422584176063, |
| "num_tokens": 8325306.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.36876106821000576, |
| "epoch": 0.21547122197418228, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.954539612275676e-06, |
| "loss": 0.3939671516418457, |
| "mean_token_accuracy": 0.8807895123958588, |
| "num_tokens": 8403551.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 0.41602297611534594, |
| "epoch": 0.21741240415412985, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.95311906519282e-06, |
| "loss": 0.43773713111877444, |
| "mean_token_accuracy": 0.8707596242427826, |
| "num_tokens": 8474927.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.4392675504088402, |
| "epoch": 0.21935358633407745, |
| "grad_norm": 1.3984375, |
| "learning_rate": 9.951676768112673e-06, |
| "loss": 0.43816194534301756, |
| "mean_token_accuracy": 0.8632524207234382, |
| "num_tokens": 8551768.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 0.4103314906358719, |
| "epoch": 0.22129476851402505, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.950212727368606e-06, |
| "loss": 0.43707122802734377, |
| "mean_token_accuracy": 0.8690274521708489, |
| "num_tokens": 8623280.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.35621660873293876, |
| "epoch": 0.22323595069397262, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.948726949389474e-06, |
| "loss": 0.39322140216827395, |
| "mean_token_accuracy": 0.8832426086068154, |
| "num_tokens": 8701428.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 0.40779992677271365, |
| "epoch": 0.22517713287392022, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.947219440699584e-06, |
| "loss": 0.42441439628601074, |
| "mean_token_accuracy": 0.8702899888157845, |
| "num_tokens": 8769812.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.42166984751820563, |
| "epoch": 0.22711831505386781, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.945690207918667e-06, |
| "loss": 0.41438679695129393, |
| "mean_token_accuracy": 0.8685426101088524, |
| "num_tokens": 8850435.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 0.3902070872485638, |
| "epoch": 0.22905949723381538, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.944139257761845e-06, |
| "loss": 0.3842545747756958, |
| "mean_token_accuracy": 0.8773440137505532, |
| "num_tokens": 8903430.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.43535452634096145, |
| "epoch": 0.23100067941376298, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.942566597039608e-06, |
| "loss": 0.42905964851379397, |
| "mean_token_accuracy": 0.862843619287014, |
| "num_tokens": 8976102.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 0.42619109377264974, |
| "epoch": 0.23294186159371058, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.940972232657782e-06, |
| "loss": 0.4514484882354736, |
| "mean_token_accuracy": 0.8609629839658737, |
| "num_tokens": 9073966.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.23294186159371058, |
| "eval_entropy": 0.3833293942704263, |
| "eval_loss": 0.38786980509757996, |
| "eval_mean_token_accuracy": 0.8783254230255413, |
| "eval_num_tokens": 9073966.0, |
| "eval_runtime": 60.1116, |
| "eval_samples_per_second": 35.75, |
| "eval_steps_per_second": 35.75, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.3839044734835625, |
| "epoch": 0.23488304377365815, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.93935617161749e-06, |
| "loss": 0.4186872959136963, |
| "mean_token_accuracy": 0.8785676345229149, |
| "num_tokens": 9145748.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 0.39257055819034575, |
| "epoch": 0.23682422595360575, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.937718421015137e-06, |
| "loss": 0.418427848815918, |
| "mean_token_accuracy": 0.8697977751493454, |
| "num_tokens": 9232179.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.377314992621541, |
| "epoch": 0.23876540813355335, |
| "grad_norm": 1.875, |
| "learning_rate": 9.936058988042367e-06, |
| "loss": 0.4139708042144775, |
| "mean_token_accuracy": 0.8783795028924942, |
| "num_tokens": 9308281.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 0.38419472984969616, |
| "epoch": 0.24070659031350092, |
| "grad_norm": 1.4453125, |
| "learning_rate": 9.934377879986035e-06, |
| "loss": 0.40369882583618166, |
| "mean_token_accuracy": 0.8746202811598778, |
| "num_tokens": 9393206.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.3744822334498167, |
| "epoch": 0.24264777249344852, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.932675104228177e-06, |
| "loss": 0.40184435844421384, |
| "mean_token_accuracy": 0.8763082399964333, |
| "num_tokens": 9485199.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 0.42811642177402975, |
| "epoch": 0.2445889546733961, |
| "grad_norm": 1.6015625, |
| "learning_rate": 9.930950668245971e-06, |
| "loss": 0.44397845268249514, |
| "mean_token_accuracy": 0.8665265038609504, |
| "num_tokens": 9565581.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.39702326618134975, |
| "epoch": 0.24653013685334368, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.929204579611716e-06, |
| "loss": 0.38111956119537355, |
| "mean_token_accuracy": 0.8766680151224137, |
| "num_tokens": 9636638.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 0.4137250851839781, |
| "epoch": 0.24847131903329128, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.927436845992782e-06, |
| "loss": 0.4052375316619873, |
| "mean_token_accuracy": 0.8680448547005654, |
| "num_tokens": 9714886.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.394980476424098, |
| "epoch": 0.2504125012132389, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.925647475151596e-06, |
| "loss": 0.40479207038879395, |
| "mean_token_accuracy": 0.8760687246918678, |
| "num_tokens": 9793866.0, |
| "step": 645 |
| }, |
| { |
| "entropy": 0.3940431509166956, |
| "epoch": 0.2523536833931864, |
| "grad_norm": 1.5, |
| "learning_rate": 9.923836474945592e-06, |
| "loss": 0.3884091854095459, |
| "mean_token_accuracy": 0.8729974597692489, |
| "num_tokens": 9880497.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.42443324290215967, |
| "epoch": 0.254294865573134, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.92200385332718e-06, |
| "loss": 0.4349085330963135, |
| "mean_token_accuracy": 0.8659611865878105, |
| "num_tokens": 9947758.0, |
| "step": 655 |
| }, |
| { |
| "entropy": 0.43983132056891916, |
| "epoch": 0.2562360477530816, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.92014961834372e-06, |
| "loss": 0.46071972846984866, |
| "mean_token_accuracy": 0.8601326540112495, |
| "num_tokens": 10016198.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.3981770180165768, |
| "epoch": 0.2581772299330292, |
| "grad_norm": 1.4765625, |
| "learning_rate": 9.918273778137477e-06, |
| "loss": 0.41265163421630857, |
| "mean_token_accuracy": 0.8710227489471436, |
| "num_tokens": 10111907.0, |
| "step": 665 |
| }, |
| { |
| "entropy": 0.41508678197860716, |
| "epoch": 0.2601184121129768, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.916376340945584e-06, |
| "loss": 0.4117740631103516, |
| "mean_token_accuracy": 0.8687305614352226, |
| "num_tokens": 10191989.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.4320628222078085, |
| "epoch": 0.2620595942929244, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.91445731510002e-06, |
| "loss": 0.4121531963348389, |
| "mean_token_accuracy": 0.8719193398952484, |
| "num_tokens": 10273505.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 0.3792607393115759, |
| "epoch": 0.26400077647287196, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.91251670902755e-06, |
| "loss": 0.3988708257675171, |
| "mean_token_accuracy": 0.8824092477560044, |
| "num_tokens": 10342792.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.37850567921996114, |
| "epoch": 0.26594195865281955, |
| "grad_norm": 1.46875, |
| "learning_rate": 9.910554531249714e-06, |
| "loss": 0.3946362018585205, |
| "mean_token_accuracy": 0.8791269809007645, |
| "num_tokens": 10409365.0, |
| "step": 685 |
| }, |
| { |
| "entropy": 0.36963232718408107, |
| "epoch": 0.26788314083276715, |
| "grad_norm": 1.375, |
| "learning_rate": 9.90857079038277e-06, |
| "loss": 0.37774851322174074, |
| "mean_token_accuracy": 0.8818046569824218, |
| "num_tokens": 10497911.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.3968418601900339, |
| "epoch": 0.26982432301271475, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.906565495137665e-06, |
| "loss": 0.3911430835723877, |
| "mean_token_accuracy": 0.875646622478962, |
| "num_tokens": 10569784.0, |
| "step": 695 |
| }, |
| { |
| "entropy": 0.384658931568265, |
| "epoch": 0.27176550519266235, |
| "grad_norm": 1.4609375, |
| "learning_rate": 9.904538654319998e-06, |
| "loss": 0.4171136379241943, |
| "mean_token_accuracy": 0.8759941428899765, |
| "num_tokens": 10638722.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.3830408491194248, |
| "epoch": 0.27370668737260995, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.90249027682997e-06, |
| "loss": 0.43827214241027834, |
| "mean_token_accuracy": 0.8736877083778382, |
| "num_tokens": 10717045.0, |
| "step": 705 |
| }, |
| { |
| "entropy": 0.42484647817909715, |
| "epoch": 0.2756478695525575, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.900420371662364e-06, |
| "loss": 0.42806663513183596, |
| "mean_token_accuracy": 0.8680253028869629, |
| "num_tokens": 10786078.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.36103312484920025, |
| "epoch": 0.2775890517325051, |
| "grad_norm": 1.453125, |
| "learning_rate": 9.898328947906489e-06, |
| "loss": 0.3689872741699219, |
| "mean_token_accuracy": 0.8823491036891937, |
| "num_tokens": 10863918.0, |
| "step": 715 |
| }, |
| { |
| "entropy": 0.4158777046948671, |
| "epoch": 0.2795302339124527, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.896216014746141e-06, |
| "loss": 0.40418004989624023, |
| "mean_token_accuracy": 0.8711874485015869, |
| "num_tokens": 10952093.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.388820331171155, |
| "epoch": 0.2814714160924003, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.894081581459579e-06, |
| "loss": 0.40212116241455076, |
| "mean_token_accuracy": 0.8809416055679321, |
| "num_tokens": 11026123.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 0.4209954336285591, |
| "epoch": 0.2834125982723479, |
| "grad_norm": 1.6484375, |
| "learning_rate": 9.891925657419463e-06, |
| "loss": 0.4366124153137207, |
| "mean_token_accuracy": 0.870456813275814, |
| "num_tokens": 11090901.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.40942220725119116, |
| "epoch": 0.2853537804522954, |
| "grad_norm": 1.3515625, |
| "learning_rate": 9.889748252092827e-06, |
| "loss": 0.40485477447509766, |
| "mean_token_accuracy": 0.8676602795720101, |
| "num_tokens": 11178026.0, |
| "step": 735 |
| }, |
| { |
| "entropy": 0.3782723072916269, |
| "epoch": 0.287294962632243, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.887549375041031e-06, |
| "loss": 0.40353665351867674, |
| "mean_token_accuracy": 0.8802446350455284, |
| "num_tokens": 11231711.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.39903284460306165, |
| "epoch": 0.2892361448121906, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.885329035919724e-06, |
| "loss": 0.4090695858001709, |
| "mean_token_accuracy": 0.868949045240879, |
| "num_tokens": 11327844.0, |
| "step": 745 |
| }, |
| { |
| "entropy": 0.4031669870018959, |
| "epoch": 0.2911773269921382, |
| "grad_norm": 2.125, |
| "learning_rate": 9.883087244478796e-06, |
| "loss": 0.45818114280700684, |
| "mean_token_accuracy": 0.8704651057720184, |
| "num_tokens": 11401714.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.43706582076847555, |
| "epoch": 0.2931185091720858, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.880824010562338e-06, |
| "loss": 0.42615551948547364, |
| "mean_token_accuracy": 0.860453313589096, |
| "num_tokens": 11489610.0, |
| "step": 755 |
| }, |
| { |
| "entropy": 0.41881459429860113, |
| "epoch": 0.2950596913520334, |
| "grad_norm": 1.2578125, |
| "learning_rate": 9.878539344108599e-06, |
| "loss": 0.42671880722045896, |
| "mean_token_accuracy": 0.8694243490695953, |
| "num_tokens": 11574592.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.42061977460980415, |
| "epoch": 0.29700087353198096, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.876233255149945e-06, |
| "loss": 0.4513099193572998, |
| "mean_token_accuracy": 0.8744151741266251, |
| "num_tokens": 11641646.0, |
| "step": 765 |
| }, |
| { |
| "entropy": 0.39190227575600145, |
| "epoch": 0.29894205571192856, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.873905753812807e-06, |
| "loss": 0.39388408660888674, |
| "mean_token_accuracy": 0.8745140552520752, |
| "num_tokens": 11714807.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.4235379956662655, |
| "epoch": 0.30088323789187615, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.871556850317641e-06, |
| "loss": 0.45146808624267576, |
| "mean_token_accuracy": 0.8673876538872719, |
| "num_tokens": 11781200.0, |
| "step": 775 |
| }, |
| { |
| "entropy": 0.4271043732762337, |
| "epoch": 0.30282442007182375, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.86918655497889e-06, |
| "loss": 0.41930394172668456, |
| "mean_token_accuracy": 0.8670791104435921, |
| "num_tokens": 11859281.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.3889836758375168, |
| "epoch": 0.30476560225177135, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.866794878204926e-06, |
| "loss": 0.42397122383117675, |
| "mean_token_accuracy": 0.875930380821228, |
| "num_tokens": 11935604.0, |
| "step": 785 |
| }, |
| { |
| "entropy": 0.407536294311285, |
| "epoch": 0.3067067844317189, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.864381830498013e-06, |
| "loss": 0.4073331356048584, |
| "mean_token_accuracy": 0.8763293012976646, |
| "num_tokens": 12000505.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.3989451553672552, |
| "epoch": 0.3086479666116665, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.861947422454262e-06, |
| "loss": 0.40294957160949707, |
| "mean_token_accuracy": 0.8775883078575134, |
| "num_tokens": 12073943.0, |
| "step": 795 |
| }, |
| { |
| "entropy": 0.3732746794819832, |
| "epoch": 0.3105891487916141, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.85949166476357e-06, |
| "loss": 0.36971125602722166, |
| "mean_token_accuracy": 0.8844305410981178, |
| "num_tokens": 12139003.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.3105891487916141, |
| "eval_entropy": 0.381350220199683, |
| "eval_loss": 0.38283953070640564, |
| "eval_mean_token_accuracy": 0.8796889461046488, |
| "eval_num_tokens": 12139003.0, |
| "eval_runtime": 60.3322, |
| "eval_samples_per_second": 35.619, |
| "eval_steps_per_second": 35.619, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.41282732523977755, |
| "epoch": 0.3125303309715617, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.857014568209597e-06, |
| "loss": 0.46287264823913576, |
| "mean_token_accuracy": 0.8671122461557388, |
| "num_tokens": 12216310.0, |
| "step": 805 |
| }, |
| { |
| "entropy": 0.3740253135561943, |
| "epoch": 0.3144715131515093, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.854516143669699e-06, |
| "loss": 0.3972623348236084, |
| "mean_token_accuracy": 0.8769651532173157, |
| "num_tokens": 12284513.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.3666670624166727, |
| "epoch": 0.3164126953314569, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.851996402114886e-06, |
| "loss": 0.3955537796020508, |
| "mean_token_accuracy": 0.8804457679390907, |
| "num_tokens": 12376220.0, |
| "step": 815 |
| }, |
| { |
| "entropy": 0.40160666182637217, |
| "epoch": 0.3183538775114044, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.849455354609777e-06, |
| "loss": 0.41783933639526366, |
| "mean_token_accuracy": 0.8718173667788506, |
| "num_tokens": 12465139.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.395163032412529, |
| "epoch": 0.320295059691352, |
| "grad_norm": 1.46875, |
| "learning_rate": 9.846893012312549e-06, |
| "loss": 0.4353921413421631, |
| "mean_token_accuracy": 0.8741151168942451, |
| "num_tokens": 12543594.0, |
| "step": 825 |
| }, |
| { |
| "entropy": 0.38790931962430475, |
| "epoch": 0.3222362418712996, |
| "grad_norm": 1.4453125, |
| "learning_rate": 9.844309386474886e-06, |
| "loss": 0.4091060638427734, |
| "mean_token_accuracy": 0.8755813196301461, |
| "num_tokens": 12633984.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.35505941733717916, |
| "epoch": 0.3241774240512472, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.841704488441934e-06, |
| "loss": 0.34843366146087645, |
| "mean_token_accuracy": 0.8839788928627967, |
| "num_tokens": 12696787.0, |
| "step": 835 |
| }, |
| { |
| "entropy": 0.4340564154088497, |
| "epoch": 0.3261186062311948, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.83907832965225e-06, |
| "loss": 0.4248070240020752, |
| "mean_token_accuracy": 0.8684971421957016, |
| "num_tokens": 12758495.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.4072086077183485, |
| "epoch": 0.32805978841114236, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.836430921637746e-06, |
| "loss": 0.4239677906036377, |
| "mean_token_accuracy": 0.871665708720684, |
| "num_tokens": 12824603.0, |
| "step": 845 |
| }, |
| { |
| "entropy": 0.4058460295200348, |
| "epoch": 0.33000097059108996, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.833762276023646e-06, |
| "loss": 0.437244176864624, |
| "mean_token_accuracy": 0.8710885986685752, |
| "num_tokens": 12915482.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.4212725304067135, |
| "epoch": 0.33194215277103756, |
| "grad_norm": 1.703125, |
| "learning_rate": 9.831072404528433e-06, |
| "loss": 0.4174651622772217, |
| "mean_token_accuracy": 0.8688189521431923, |
| "num_tokens": 12993367.0, |
| "step": 855 |
| }, |
| { |
| "entropy": 0.3895297344774008, |
| "epoch": 0.33388333495098516, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.828361318963794e-06, |
| "loss": 0.39536495208740235, |
| "mean_token_accuracy": 0.8768842920660973, |
| "num_tokens": 13065810.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.4025235269218683, |
| "epoch": 0.33582451713093275, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.825629031234574e-06, |
| "loss": 0.37229766845703127, |
| "mean_token_accuracy": 0.8749150916934013, |
| "num_tokens": 13133471.0, |
| "step": 865 |
| }, |
| { |
| "entropy": 0.3760003004223108, |
| "epoch": 0.33776569931088035, |
| "grad_norm": 1.453125, |
| "learning_rate": 9.822875553338715e-06, |
| "loss": 0.3896082639694214, |
| "mean_token_accuracy": 0.8785295352339745, |
| "num_tokens": 13211617.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.3616090904921293, |
| "epoch": 0.3397068814908279, |
| "grad_norm": 1.546875, |
| "learning_rate": 9.820100897367214e-06, |
| "loss": 0.3726183891296387, |
| "mean_token_accuracy": 0.8849639266729354, |
| "num_tokens": 13286555.0, |
| "step": 875 |
| }, |
| { |
| "entropy": 0.352078976854682, |
| "epoch": 0.3416480636707755, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.81730507550406e-06, |
| "loss": 0.3919616937637329, |
| "mean_token_accuracy": 0.8840990662574768, |
| "num_tokens": 13355200.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.38022752702236173, |
| "epoch": 0.3435892458507231, |
| "grad_norm": 1.5625, |
| "learning_rate": 9.81448810002619e-06, |
| "loss": 0.40322446823120117, |
| "mean_token_accuracy": 0.8776090621948243, |
| "num_tokens": 13427836.0, |
| "step": 885 |
| }, |
| { |
| "entropy": 0.4132396575063467, |
| "epoch": 0.3455304280306707, |
| "grad_norm": 1.4296875, |
| "learning_rate": 9.811649983303425e-06, |
| "loss": 0.4324185371398926, |
| "mean_token_accuracy": 0.8720195293426514, |
| "num_tokens": 13496990.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.36122960932552817, |
| "epoch": 0.3474716102106183, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.808790737798426e-06, |
| "loss": 0.39123167991638186, |
| "mean_token_accuracy": 0.8842917993664742, |
| "num_tokens": 13572848.0, |
| "step": 895 |
| }, |
| { |
| "entropy": 0.3831039108335972, |
| "epoch": 0.34941279239056583, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.805910376066631e-06, |
| "loss": 0.37870833873748777, |
| "mean_token_accuracy": 0.874284490942955, |
| "num_tokens": 13652434.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.40437927283346653, |
| "epoch": 0.3513539745705134, |
| "grad_norm": 1.875, |
| "learning_rate": 9.803008910756203e-06, |
| "loss": 0.4461234092712402, |
| "mean_token_accuracy": 0.8702120751142501, |
| "num_tokens": 13730423.0, |
| "step": 905 |
| }, |
| { |
| "entropy": 0.4021365400403738, |
| "epoch": 0.353295156750461, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.800086354607975e-06, |
| "loss": 0.4367063999176025, |
| "mean_token_accuracy": 0.8725798889994621, |
| "num_tokens": 13794147.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.36978473588824273, |
| "epoch": 0.3552363389304086, |
| "grad_norm": 1.6015625, |
| "learning_rate": 9.797142720455391e-06, |
| "loss": 0.3837603569030762, |
| "mean_token_accuracy": 0.882180480659008, |
| "num_tokens": 13874189.0, |
| "step": 915 |
| }, |
| { |
| "entropy": 0.40101403892040255, |
| "epoch": 0.3571775211103562, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.794178021224459e-06, |
| "loss": 0.4223616123199463, |
| "mean_token_accuracy": 0.8726651340723037, |
| "num_tokens": 13945289.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.35331339165568354, |
| "epoch": 0.3591187032903038, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.79119226993368e-06, |
| "loss": 0.4017838478088379, |
| "mean_token_accuracy": 0.8861800834536553, |
| "num_tokens": 14028418.0, |
| "step": 925 |
| }, |
| { |
| "entropy": 0.3538735806941986, |
| "epoch": 0.36105988547025136, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.788185479694004e-06, |
| "loss": 0.382387375831604, |
| "mean_token_accuracy": 0.8858973324298859, |
| "num_tokens": 14098572.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.3838920842856169, |
| "epoch": 0.36300106765019896, |
| "grad_norm": 1.515625, |
| "learning_rate": 9.785157663708761e-06, |
| "loss": 0.37942454814910886, |
| "mean_token_accuracy": 0.8802381858229638, |
| "num_tokens": 14180314.0, |
| "step": 935 |
| }, |
| { |
| "entropy": 0.41736325100064275, |
| "epoch": 0.36494224983014656, |
| "grad_norm": 2.40625, |
| "learning_rate": 9.782108835273612e-06, |
| "loss": 0.42386960983276367, |
| "mean_token_accuracy": 0.8687801375985146, |
| "num_tokens": 14255569.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.3824776504188776, |
| "epoch": 0.36688343201009416, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.779039007776487e-06, |
| "loss": 0.39833407402038573, |
| "mean_token_accuracy": 0.8781590893864631, |
| "num_tokens": 14323554.0, |
| "step": 945 |
| }, |
| { |
| "entropy": 0.34414222836494446, |
| "epoch": 0.36882461419004176, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.775948194697528e-06, |
| "loss": 0.3766109704971313, |
| "mean_token_accuracy": 0.8860784068703651, |
| "num_tokens": 14404112.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.3819138675928116, |
| "epoch": 0.3707657963699893, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.772836409609025e-06, |
| "loss": 0.4195257663726807, |
| "mean_token_accuracy": 0.8752670779824256, |
| "num_tokens": 14493874.0, |
| "step": 955 |
| }, |
| { |
| "entropy": 0.3910291727632284, |
| "epoch": 0.3727069785499369, |
| "grad_norm": 1.5390625, |
| "learning_rate": 9.76970366617536e-06, |
| "loss": 0.42772369384765624, |
| "mean_token_accuracy": 0.8762600779533386, |
| "num_tokens": 14563770.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.39120335690677166, |
| "epoch": 0.3746481607298845, |
| "grad_norm": 1.3828125, |
| "learning_rate": 9.76654997815295e-06, |
| "loss": 0.4206050395965576, |
| "mean_token_accuracy": 0.8764881610870361, |
| "num_tokens": 14649946.0, |
| "step": 965 |
| }, |
| { |
| "entropy": 0.37538095470517874, |
| "epoch": 0.3765893429098321, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.763375359390181e-06, |
| "loss": 0.40073623657226565, |
| "mean_token_accuracy": 0.87882329672575, |
| "num_tokens": 14732764.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.4004466250538826, |
| "epoch": 0.3785305250897797, |
| "grad_norm": 1.34375, |
| "learning_rate": 9.760179823827347e-06, |
| "loss": 0.41023030281066897, |
| "mean_token_accuracy": 0.8705371245741844, |
| "num_tokens": 14814893.0, |
| "step": 975 |
| }, |
| { |
| "entropy": 0.4013238321989775, |
| "epoch": 0.3804717072697273, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.756963385496599e-06, |
| "loss": 0.44197940826416016, |
| "mean_token_accuracy": 0.8728833734989166, |
| "num_tokens": 14893153.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.38930566757917406, |
| "epoch": 0.38241288944967483, |
| "grad_norm": 1.4921875, |
| "learning_rate": 9.753726058521868e-06, |
| "loss": 0.40584554672241213, |
| "mean_token_accuracy": 0.8758206337690353, |
| "num_tokens": 14968880.0, |
| "step": 985 |
| }, |
| { |
| "entropy": 0.4486214961856604, |
| "epoch": 0.38435407162962243, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.750467857118811e-06, |
| "loss": 0.4342947959899902, |
| "mean_token_accuracy": 0.8598737180233001, |
| "num_tokens": 15055751.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.3913938831537962, |
| "epoch": 0.38629525380957, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.747188795594755e-06, |
| "loss": 0.43964052200317383, |
| "mean_token_accuracy": 0.8771824568510056, |
| "num_tokens": 15129042.0, |
| "step": 995 |
| }, |
| { |
| "entropy": 0.3813592415302992, |
| "epoch": 0.3882364359895176, |
| "grad_norm": 1.578125, |
| "learning_rate": 9.743888888348618e-06, |
| "loss": 0.3965798854827881, |
| "mean_token_accuracy": 0.8770193979144096, |
| "num_tokens": 15213724.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3882364359895176, |
| "eval_entropy": 0.3787456456798906, |
| "eval_loss": 0.3797132670879364, |
| "eval_mean_token_accuracy": 0.8805272205589316, |
| "eval_num_tokens": 15213724.0, |
| "eval_runtime": 60.2207, |
| "eval_samples_per_second": 35.685, |
| "eval_steps_per_second": 35.685, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.36478537507355213, |
| "epoch": 0.3901776181694652, |
| "grad_norm": 1.6015625, |
| "learning_rate": 9.740568149870864e-06, |
| "loss": 0.38557422161102295, |
| "mean_token_accuracy": 0.8830441504716873, |
| "num_tokens": 15293346.0, |
| "step": 1005 |
| }, |
| { |
| "entropy": 0.3718357976526022, |
| "epoch": 0.39211880034941277, |
| "grad_norm": 1.3203125, |
| "learning_rate": 9.737226594743425e-06, |
| "loss": 0.41036291122436525, |
| "mean_token_accuracy": 0.8785412311553955, |
| "num_tokens": 15381773.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.3791601274162531, |
| "epoch": 0.39405998252936036, |
| "grad_norm": 1.4140625, |
| "learning_rate": 9.733864237639645e-06, |
| "loss": 0.39489567279815674, |
| "mean_token_accuracy": 0.8787289649248123, |
| "num_tokens": 15463663.0, |
| "step": 1015 |
| }, |
| { |
| "entropy": 0.40409386418759824, |
| "epoch": 0.39600116470930796, |
| "grad_norm": 1.5, |
| "learning_rate": 9.730481093324209e-06, |
| "loss": 0.39559972286224365, |
| "mean_token_accuracy": 0.873577019572258, |
| "num_tokens": 15545078.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.38083929792046545, |
| "epoch": 0.39794234688925556, |
| "grad_norm": 1.3359375, |
| "learning_rate": 9.72707717665309e-06, |
| "loss": 0.3976888179779053, |
| "mean_token_accuracy": 0.876065094769001, |
| "num_tokens": 15634558.0, |
| "step": 1025 |
| }, |
| { |
| "entropy": 0.39706564620137214, |
| "epoch": 0.39988352906920316, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.723652502573465e-06, |
| "loss": 0.39628422260284424, |
| "mean_token_accuracy": 0.8746212273836136, |
| "num_tokens": 15706364.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.4297271855175495, |
| "epoch": 0.40182471124915076, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.720207086123674e-06, |
| "loss": 0.44258790016174315, |
| "mean_token_accuracy": 0.8682567358016968, |
| "num_tokens": 15786703.0, |
| "step": 1035 |
| }, |
| { |
| "entropy": 0.37680495008826254, |
| "epoch": 0.4037658934290983, |
| "grad_norm": 1.34375, |
| "learning_rate": 9.716740942433127e-06, |
| "loss": 0.3932778358459473, |
| "mean_token_accuracy": 0.8771912530064583, |
| "num_tokens": 15883251.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.4019945695996284, |
| "epoch": 0.4057070756090459, |
| "grad_norm": 1.359375, |
| "learning_rate": 9.713254086722259e-06, |
| "loss": 0.39323198795318604, |
| "mean_token_accuracy": 0.8740618228912354, |
| "num_tokens": 15962211.0, |
| "step": 1045 |
| }, |
| { |
| "entropy": 0.3781829860061407, |
| "epoch": 0.4076482577889935, |
| "grad_norm": 1.4921875, |
| "learning_rate": 9.709746534302453e-06, |
| "loss": 0.4020181655883789, |
| "mean_token_accuracy": 0.8759587749838829, |
| "num_tokens": 16040844.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.35617672353982927, |
| "epoch": 0.4095894399689411, |
| "grad_norm": 1.453125, |
| "learning_rate": 9.706218300575975e-06, |
| "loss": 0.3751538276672363, |
| "mean_token_accuracy": 0.8814701676368714, |
| "num_tokens": 16117566.0, |
| "step": 1055 |
| }, |
| { |
| "entropy": 0.39271861389279367, |
| "epoch": 0.4115306221488887, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.702669401035904e-06, |
| "loss": 0.3784984827041626, |
| "mean_token_accuracy": 0.877305480837822, |
| "num_tokens": 16197812.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.3672247972339392, |
| "epoch": 0.41347180432883623, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.699099851266071e-06, |
| "loss": 0.3595015525817871, |
| "mean_token_accuracy": 0.8823614567518234, |
| "num_tokens": 16275198.0, |
| "step": 1065 |
| }, |
| { |
| "entropy": 0.369810114428401, |
| "epoch": 0.41541298650878383, |
| "grad_norm": 1.359375, |
| "learning_rate": 9.695509666940978e-06, |
| "loss": 0.405411958694458, |
| "mean_token_accuracy": 0.8822488501667977, |
| "num_tokens": 16350965.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.3897046368569136, |
| "epoch": 0.41735416868873143, |
| "grad_norm": 1.4609375, |
| "learning_rate": 9.691898863825749e-06, |
| "loss": 0.38735527992248536, |
| "mean_token_accuracy": 0.8744328498840332, |
| "num_tokens": 16436304.0, |
| "step": 1075 |
| }, |
| { |
| "entropy": 0.3776374412700534, |
| "epoch": 0.41929535086867903, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.688267457776032e-06, |
| "loss": 0.39870805740356446, |
| "mean_token_accuracy": 0.8785615637898445, |
| "num_tokens": 16501266.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.38694494068622587, |
| "epoch": 0.4212365330486266, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.684615464737964e-06, |
| "loss": 0.39393253326416017, |
| "mean_token_accuracy": 0.8791399031877518, |
| "num_tokens": 16560531.0, |
| "step": 1085 |
| }, |
| { |
| "entropy": 0.38168427646160125, |
| "epoch": 0.4231777152285742, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.680942900748067e-06, |
| "loss": 0.4125086784362793, |
| "mean_token_accuracy": 0.878045716881752, |
| "num_tokens": 16627433.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.3892548579722643, |
| "epoch": 0.42511889740852177, |
| "grad_norm": 1.359375, |
| "learning_rate": 9.677249781933205e-06, |
| "loss": 0.40183658599853517, |
| "mean_token_accuracy": 0.8756108567118644, |
| "num_tokens": 16731006.0, |
| "step": 1095 |
| }, |
| { |
| "entropy": 0.3736507400870323, |
| "epoch": 0.42706007958846937, |
| "grad_norm": 2.28125, |
| "learning_rate": 9.673536124510496e-06, |
| "loss": 0.40765180587768557, |
| "mean_token_accuracy": 0.8797045171260833, |
| "num_tokens": 16801170.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.3525055509060621, |
| "epoch": 0.42900126176841696, |
| "grad_norm": 1.3984375, |
| "learning_rate": 9.669801944787249e-06, |
| "loss": 0.3724426031112671, |
| "mean_token_accuracy": 0.8849082082509995, |
| "num_tokens": 16881282.0, |
| "step": 1105 |
| }, |
| { |
| "entropy": 0.39476602226495744, |
| "epoch": 0.43094244394836456, |
| "grad_norm": 1.3125, |
| "learning_rate": 9.66604725916089e-06, |
| "loss": 0.3921769380569458, |
| "mean_token_accuracy": 0.8737779542803764, |
| "num_tokens": 16965112.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.38296737633645533, |
| "epoch": 0.43288362612831216, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.662272084118887e-06, |
| "loss": 0.39906389713287355, |
| "mean_token_accuracy": 0.8742195263504982, |
| "num_tokens": 17033565.0, |
| "step": 1115 |
| }, |
| { |
| "entropy": 0.4264007180929184, |
| "epoch": 0.4348248083082597, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.658476436238683e-06, |
| "loss": 0.4375418186187744, |
| "mean_token_accuracy": 0.8661627262830734, |
| "num_tokens": 17110727.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.3653211809694767, |
| "epoch": 0.4367659904882073, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.654660332187621e-06, |
| "loss": 0.3593518972396851, |
| "mean_token_accuracy": 0.882878914475441, |
| "num_tokens": 17189104.0, |
| "step": 1125 |
| }, |
| { |
| "entropy": 0.33642441034317017, |
| "epoch": 0.4387071726681549, |
| "grad_norm": 1.546875, |
| "learning_rate": 9.65082378872287e-06, |
| "loss": 0.36505885124206544, |
| "mean_token_accuracy": 0.8923633351922036, |
| "num_tokens": 17256636.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.3748783551156521, |
| "epoch": 0.4406483548481025, |
| "grad_norm": 1.6484375, |
| "learning_rate": 9.646966822691351e-06, |
| "loss": 0.4033698558807373, |
| "mean_token_accuracy": 0.8803606644272804, |
| "num_tokens": 17333913.0, |
| "step": 1135 |
| }, |
| { |
| "entropy": 0.3871772147715092, |
| "epoch": 0.4425895370280501, |
| "grad_norm": 1.484375, |
| "learning_rate": 9.643089451029666e-06, |
| "loss": 0.39040231704711914, |
| "mean_token_accuracy": 0.8764987051486969, |
| "num_tokens": 17402674.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.4355839218944311, |
| "epoch": 0.4445307192079977, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.639191690764018e-06, |
| "loss": 0.40796470642089844, |
| "mean_token_accuracy": 0.8702881962060929, |
| "num_tokens": 17476753.0, |
| "step": 1145 |
| }, |
| { |
| "entropy": 0.37759856693446636, |
| "epoch": 0.44647190138794524, |
| "grad_norm": 1.3515625, |
| "learning_rate": 9.635273559010148e-06, |
| "loss": 0.38673570156097414, |
| "mean_token_accuracy": 0.8773683786392212, |
| "num_tokens": 17554472.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.3581284359097481, |
| "epoch": 0.44841308356789283, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.63133507297324e-06, |
| "loss": 0.37009692192077637, |
| "mean_token_accuracy": 0.8856742799282074, |
| "num_tokens": 17637236.0, |
| "step": 1155 |
| }, |
| { |
| "entropy": 0.47716011516749857, |
| "epoch": 0.45035426574784043, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.627376249947866e-06, |
| "loss": 0.491148042678833, |
| "mean_token_accuracy": 0.8575920403003693, |
| "num_tokens": 17714880.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.38734299391508104, |
| "epoch": 0.45229544792778803, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.623397107317897e-06, |
| "loss": 0.4355041980743408, |
| "mean_token_accuracy": 0.8793953686952591, |
| "num_tokens": 17786198.0, |
| "step": 1165 |
| }, |
| { |
| "entropy": 0.3389087375253439, |
| "epoch": 0.45423663010773563, |
| "grad_norm": 1.5625, |
| "learning_rate": 9.619397662556434e-06, |
| "loss": 0.35656213760375977, |
| "mean_token_accuracy": 0.8894602239131928, |
| "num_tokens": 17851917.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.3687282849103212, |
| "epoch": 0.4561778122876832, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.615377933225727e-06, |
| "loss": 0.4001771450042725, |
| "mean_token_accuracy": 0.8769694566726685, |
| "num_tokens": 17922400.0, |
| "step": 1175 |
| }, |
| { |
| "entropy": 0.37209414653480055, |
| "epoch": 0.45811899446763077, |
| "grad_norm": 1.3984375, |
| "learning_rate": 9.611337936977096e-06, |
| "loss": 0.39912428855896, |
| "mean_token_accuracy": 0.8808334246277809, |
| "num_tokens": 17996715.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.38213921934366224, |
| "epoch": 0.46006017664757837, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.607277691550862e-06, |
| "loss": 0.41675233840942383, |
| "mean_token_accuracy": 0.8742595329880715, |
| "num_tokens": 18084477.0, |
| "step": 1185 |
| }, |
| { |
| "entropy": 0.39352951049804685, |
| "epoch": 0.46200135882752597, |
| "grad_norm": 2.3125, |
| "learning_rate": 9.60319721477626e-06, |
| "loss": 0.4071157932281494, |
| "mean_token_accuracy": 0.8733987167477608, |
| "num_tokens": 18152478.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.392200979962945, |
| "epoch": 0.46394254100747356, |
| "grad_norm": 1.5390625, |
| "learning_rate": 9.59909652457136e-06, |
| "loss": 0.4249094486236572, |
| "mean_token_accuracy": 0.8737818524241447, |
| "num_tokens": 18219295.0, |
| "step": 1195 |
| }, |
| { |
| "entropy": 0.35255367681384087, |
| "epoch": 0.46588372318742116, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.594975638943006e-06, |
| "loss": 0.3529276132583618, |
| "mean_token_accuracy": 0.8894811898469925, |
| "num_tokens": 18289986.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.46588372318742116, |
| "eval_entropy": 0.3770919001001211, |
| "eval_loss": 0.37714096903800964, |
| "eval_mean_token_accuracy": 0.8812256991003435, |
| "eval_num_tokens": 18289986.0, |
| "eval_runtime": 60.2886, |
| "eval_samples_per_second": 35.645, |
| "eval_steps_per_second": 35.645, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.38377687335014343, |
| "epoch": 0.4678249053673687, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.59083457598671e-06, |
| "loss": 0.4212928771972656, |
| "mean_token_accuracy": 0.8758307337760926, |
| "num_tokens": 18359707.0, |
| "step": 1205 |
| }, |
| { |
| "entropy": 0.3596473693847656, |
| "epoch": 0.4697660875473163, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.586673353886591e-06, |
| "loss": 0.3813552141189575, |
| "mean_token_accuracy": 0.8841730430722237, |
| "num_tokens": 18435661.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.4140443943440914, |
| "epoch": 0.4717072697272639, |
| "grad_norm": 1.46875, |
| "learning_rate": 9.582491990915292e-06, |
| "loss": 0.4197361469268799, |
| "mean_token_accuracy": 0.8726279020309449, |
| "num_tokens": 18509573.0, |
| "step": 1215 |
| }, |
| { |
| "entropy": 0.3907853942364454, |
| "epoch": 0.4736484519072115, |
| "grad_norm": 1.625, |
| "learning_rate": 9.578290505433896e-06, |
| "loss": 0.4191273212432861, |
| "mean_token_accuracy": 0.8725411191582679, |
| "num_tokens": 18608110.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.3810266986489296, |
| "epoch": 0.4755896340871591, |
| "grad_norm": 1.4296875, |
| "learning_rate": 9.57406891589185e-06, |
| "loss": 0.38829352855682375, |
| "mean_token_accuracy": 0.8763051420450211, |
| "num_tokens": 18689794.0, |
| "step": 1225 |
| }, |
| { |
| "entropy": 0.420042909681797, |
| "epoch": 0.4775308162671067, |
| "grad_norm": 1.703125, |
| "learning_rate": 9.569827240826876e-06, |
| "loss": 0.40959844589233396, |
| "mean_token_accuracy": 0.8703769713640213, |
| "num_tokens": 18758060.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.36773351952433586, |
| "epoch": 0.47947199844705424, |
| "grad_norm": 2.328125, |
| "learning_rate": 9.565565498864902e-06, |
| "loss": 0.3752429962158203, |
| "mean_token_accuracy": 0.8811664238572121, |
| "num_tokens": 18824217.0, |
| "step": 1235 |
| }, |
| { |
| "entropy": 0.40542583018541334, |
| "epoch": 0.48141318062700184, |
| "grad_norm": 1.3515625, |
| "learning_rate": 9.561283708719968e-06, |
| "loss": 0.4128578662872314, |
| "mean_token_accuracy": 0.8699263706803322, |
| "num_tokens": 18910883.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.37737762071192266, |
| "epoch": 0.48335436280694943, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.55698188919415e-06, |
| "loss": 0.39167325496673583, |
| "mean_token_accuracy": 0.879218578338623, |
| "num_tokens": 18993242.0, |
| "step": 1245 |
| }, |
| { |
| "entropy": 0.38517537601292134, |
| "epoch": 0.48529554498689703, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.552660059177477e-06, |
| "loss": 0.38378689289093015, |
| "mean_token_accuracy": 0.879303851723671, |
| "num_tokens": 19061334.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.37100368924438953, |
| "epoch": 0.48723672716684463, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.548318237647849e-06, |
| "loss": 0.4200906753540039, |
| "mean_token_accuracy": 0.8786957338452339, |
| "num_tokens": 19128848.0, |
| "step": 1255 |
| }, |
| { |
| "entropy": 0.38898602277040484, |
| "epoch": 0.4891779093467922, |
| "grad_norm": 1.46875, |
| "learning_rate": 9.543956443670947e-06, |
| "loss": 0.4141817569732666, |
| "mean_token_accuracy": 0.8764576897025108, |
| "num_tokens": 19202756.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.3760422389954329, |
| "epoch": 0.49111909152673977, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.539574696400165e-06, |
| "loss": 0.3719266653060913, |
| "mean_token_accuracy": 0.8817565947771072, |
| "num_tokens": 19268958.0, |
| "step": 1265 |
| }, |
| { |
| "entropy": 0.36780366376042367, |
| "epoch": 0.49306027370668737, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.535173015076501e-06, |
| "loss": 0.39360432624816893, |
| "mean_token_accuracy": 0.8779341161251069, |
| "num_tokens": 19352111.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.4413019739091396, |
| "epoch": 0.49500145588663497, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.5307514190285e-06, |
| "loss": 0.4407999515533447, |
| "mean_token_accuracy": 0.8592960745096206, |
| "num_tokens": 19441509.0, |
| "step": 1275 |
| }, |
| { |
| "entropy": 0.39783000349998476, |
| "epoch": 0.49694263806658256, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.526309927672148e-06, |
| "loss": 0.42558717727661133, |
| "mean_token_accuracy": 0.8755456551909446, |
| "num_tokens": 19525598.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.36931919269263747, |
| "epoch": 0.49888382024653016, |
| "grad_norm": 1.3984375, |
| "learning_rate": 9.521848560510796e-06, |
| "loss": 0.38771824836730956, |
| "mean_token_accuracy": 0.8801409855484963, |
| "num_tokens": 19612547.0, |
| "step": 1285 |
| }, |
| { |
| "entropy": 0.41849659457802774, |
| "epoch": 0.5008250024264778, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.517367337135076e-06, |
| "loss": 0.43532710075378417, |
| "mean_token_accuracy": 0.8676797851920128, |
| "num_tokens": 19689731.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.41437376402318477, |
| "epoch": 0.5027661846064253, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.51286627722281e-06, |
| "loss": 0.4385324478149414, |
| "mean_token_accuracy": 0.8706857517361641, |
| "num_tokens": 19756661.0, |
| "step": 1295 |
| }, |
| { |
| "entropy": 0.4086436625570059, |
| "epoch": 0.5047073667863728, |
| "grad_norm": 1.6171875, |
| "learning_rate": 9.508345400538926e-06, |
| "loss": 0.4336398124694824, |
| "mean_token_accuracy": 0.8683807790279389, |
| "num_tokens": 19821704.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.3883740194141865, |
| "epoch": 0.5066485489663205, |
| "grad_norm": 1.3828125, |
| "learning_rate": 9.503804726935369e-06, |
| "loss": 0.39049382209777833, |
| "mean_token_accuracy": 0.8757125899195671, |
| "num_tokens": 19902414.0, |
| "step": 1305 |
| }, |
| { |
| "entropy": 0.39095442183315754, |
| "epoch": 0.508589731146268, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.499244276351019e-06, |
| "loss": 0.38634843826293946, |
| "mean_token_accuracy": 0.8751242905855179, |
| "num_tokens": 19984638.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.3666827451437712, |
| "epoch": 0.5105309133262157, |
| "grad_norm": 1.6171875, |
| "learning_rate": 9.494664068811597e-06, |
| "loss": 0.40018815994262696, |
| "mean_token_accuracy": 0.880105035007, |
| "num_tokens": 20064261.0, |
| "step": 1315 |
| }, |
| { |
| "entropy": 0.34377430453896524, |
| "epoch": 0.5124720955061632, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.490064124429584e-06, |
| "loss": 0.36567790508270265, |
| "mean_token_accuracy": 0.8907153263688088, |
| "num_tokens": 20126962.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.3555528115481138, |
| "epoch": 0.5144132776861109, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.485444463404125e-06, |
| "loss": 0.3725638151168823, |
| "mean_token_accuracy": 0.8841280445456505, |
| "num_tokens": 20215463.0, |
| "step": 1325 |
| }, |
| { |
| "entropy": 0.3422409202903509, |
| "epoch": 0.5163544598660584, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.480805106020947e-06, |
| "loss": 0.3722813129425049, |
| "mean_token_accuracy": 0.8891440883278847, |
| "num_tokens": 20298577.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.3823659881949425, |
| "epoch": 0.518295642046006, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.476146072652262e-06, |
| "loss": 0.39306447505950926, |
| "mean_token_accuracy": 0.876395545899868, |
| "num_tokens": 20374990.0, |
| "step": 1335 |
| }, |
| { |
| "entropy": 0.388584029302001, |
| "epoch": 0.5202368242259536, |
| "grad_norm": 1.5, |
| "learning_rate": 9.471467383756692e-06, |
| "loss": 0.41069755554199217, |
| "mean_token_accuracy": 0.8789796933531762, |
| "num_tokens": 20446548.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.36739424169063567, |
| "epoch": 0.5221780064059012, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.46676905987916e-06, |
| "loss": 0.3884859085083008, |
| "mean_token_accuracy": 0.8806875750422478, |
| "num_tokens": 20517149.0, |
| "step": 1345 |
| }, |
| { |
| "entropy": 0.35578424148261545, |
| "epoch": 0.5241191885858488, |
| "grad_norm": 1.5, |
| "learning_rate": 9.462051121650816e-06, |
| "loss": 0.3846778869628906, |
| "mean_token_accuracy": 0.8805378764867783, |
| "num_tokens": 20596629.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.375444458052516, |
| "epoch": 0.5260603707657964, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.457313589788937e-06, |
| "loss": 0.40492801666259765, |
| "mean_token_accuracy": 0.8799885243177414, |
| "num_tokens": 20660631.0, |
| "step": 1355 |
| }, |
| { |
| "entropy": 0.38559874445199965, |
| "epoch": 0.5280015529457439, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.452556485096839e-06, |
| "loss": 0.4140150547027588, |
| "mean_token_accuracy": 0.8767204716801643, |
| "num_tokens": 20723882.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.40360996387898923, |
| "epoch": 0.5299427351256916, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.447779828463788e-06, |
| "loss": 0.38798012733459475, |
| "mean_token_accuracy": 0.8741889104247094, |
| "num_tokens": 20801012.0, |
| "step": 1365 |
| }, |
| { |
| "entropy": 0.3853685542941093, |
| "epoch": 0.5318839173056391, |
| "grad_norm": 1.4140625, |
| "learning_rate": 9.442983640864904e-06, |
| "loss": 0.39840006828308105, |
| "mean_token_accuracy": 0.8812712132930756, |
| "num_tokens": 20870494.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.41988850980997083, |
| "epoch": 0.5338250994855868, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.43816794336107e-06, |
| "loss": 0.42660999298095703, |
| "mean_token_accuracy": 0.8691768750548363, |
| "num_tokens": 20932159.0, |
| "step": 1375 |
| }, |
| { |
| "entropy": 0.35770875252783296, |
| "epoch": 0.5357662816655343, |
| "grad_norm": 1.3203125, |
| "learning_rate": 9.433332757098844e-06, |
| "loss": 0.35865347385406493, |
| "mean_token_accuracy": 0.8853553980588913, |
| "num_tokens": 21012568.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.3706452056765556, |
| "epoch": 0.5377074638454818, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.428478103310358e-06, |
| "loss": 0.40013108253479, |
| "mean_token_accuracy": 0.8823366552591324, |
| "num_tokens": 21083094.0, |
| "step": 1385 |
| }, |
| { |
| "entropy": 0.39769635573029516, |
| "epoch": 0.5396486460254295, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.423604003313232e-06, |
| "loss": 0.4011887550354004, |
| "mean_token_accuracy": 0.8764868810772896, |
| "num_tokens": 21161361.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.39253461360931396, |
| "epoch": 0.541589828205377, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.418710478510478e-06, |
| "loss": 0.41046462059020994, |
| "mean_token_accuracy": 0.878234452009201, |
| "num_tokens": 21225113.0, |
| "step": 1395 |
| }, |
| { |
| "entropy": 0.3744351703673601, |
| "epoch": 0.5435310103853247, |
| "grad_norm": 1.5625, |
| "learning_rate": 9.413797550390403e-06, |
| "loss": 0.37674736976623535, |
| "mean_token_accuracy": 0.8828163802623749, |
| "num_tokens": 21295691.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5435310103853247, |
| "eval_entropy": 0.3686942668842293, |
| "eval_loss": 0.3755117952823639, |
| "eval_mean_token_accuracy": 0.8816876367060402, |
| "eval_num_tokens": 21295691.0, |
| "eval_runtime": 60.3519, |
| "eval_samples_per_second": 35.608, |
| "eval_steps_per_second": 35.608, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.34990762211382387, |
| "epoch": 0.5454721925652722, |
| "grad_norm": 1.2109375, |
| "learning_rate": 9.40886524052652e-06, |
| "loss": 0.3539942979812622, |
| "mean_token_accuracy": 0.8850773021578788, |
| "num_tokens": 21376676.0, |
| "step": 1405 |
| }, |
| { |
| "entropy": 0.41086711175739765, |
| "epoch": 0.5474133747452199, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.403913570577448e-06, |
| "loss": 0.43075881004333494, |
| "mean_token_accuracy": 0.871852807700634, |
| "num_tokens": 21451444.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.37052917703986166, |
| "epoch": 0.5493545569251674, |
| "grad_norm": 1.453125, |
| "learning_rate": 9.398942562286822e-06, |
| "loss": 0.38300988674163816, |
| "mean_token_accuracy": 0.8779854521155357, |
| "num_tokens": 21536871.0, |
| "step": 1415 |
| }, |
| { |
| "entropy": 0.3861477542668581, |
| "epoch": 0.551295739105115, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.393952237483195e-06, |
| "loss": 0.40117707252502444, |
| "mean_token_accuracy": 0.8765692830085754, |
| "num_tokens": 21605987.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.4081678859889507, |
| "epoch": 0.5532369212850626, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.38894261807994e-06, |
| "loss": 0.42310566902160646, |
| "mean_token_accuracy": 0.8693249508738518, |
| "num_tokens": 21692774.0, |
| "step": 1425 |
| }, |
| { |
| "entropy": 0.3717794116586447, |
| "epoch": 0.5551781034650102, |
| "grad_norm": 1.4453125, |
| "learning_rate": 9.383913726075157e-06, |
| "loss": 0.38362655639648435, |
| "mean_token_accuracy": 0.8794390082359314, |
| "num_tokens": 21774027.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.3897536873817444, |
| "epoch": 0.5571192856449578, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.378865583551575e-06, |
| "loss": 0.40027127265930174, |
| "mean_token_accuracy": 0.874237485229969, |
| "num_tokens": 21855865.0, |
| "step": 1435 |
| }, |
| { |
| "entropy": 0.4386448211967945, |
| "epoch": 0.5590604678249054, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.373798212676459e-06, |
| "loss": 0.44517908096313474, |
| "mean_token_accuracy": 0.8637019321322441, |
| "num_tokens": 21937592.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.40399301163852214, |
| "epoch": 0.5610016500048529, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.368711635701499e-06, |
| "loss": 0.42911725044250487, |
| "mean_token_accuracy": 0.8717393398284912, |
| "num_tokens": 22016228.0, |
| "step": 1445 |
| }, |
| { |
| "entropy": 0.33674396723508837, |
| "epoch": 0.5629428321848006, |
| "grad_norm": 1.625, |
| "learning_rate": 9.363605874962735e-06, |
| "loss": 0.3449155569076538, |
| "mean_token_accuracy": 0.8916645109653473, |
| "num_tokens": 22091155.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.34950118474662306, |
| "epoch": 0.5648840143647481, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.358480952880438e-06, |
| "loss": 0.37925631999969484, |
| "mean_token_accuracy": 0.8876421838998795, |
| "num_tokens": 22168063.0, |
| "step": 1455 |
| }, |
| { |
| "entropy": 0.3646994840353727, |
| "epoch": 0.5668251965446958, |
| "grad_norm": 1.359375, |
| "learning_rate": 9.35333689195902e-06, |
| "loss": 0.3887592554092407, |
| "mean_token_accuracy": 0.8781901568174362, |
| "num_tokens": 22247190.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.42925515584647655, |
| "epoch": 0.5687663787246433, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.34817371478694e-06, |
| "loss": 0.4361258983612061, |
| "mean_token_accuracy": 0.8652834072709084, |
| "num_tokens": 22327454.0, |
| "step": 1465 |
| }, |
| { |
| "entropy": 0.40622838474810125, |
| "epoch": 0.5707075609045908, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.342991444036593e-06, |
| "loss": 0.4456647872924805, |
| "mean_token_accuracy": 0.8694571733474732, |
| "num_tokens": 22412083.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.42173517793416976, |
| "epoch": 0.5726487430845385, |
| "grad_norm": 1.421875, |
| "learning_rate": 9.337790102464224e-06, |
| "loss": 0.454360818862915, |
| "mean_token_accuracy": 0.8657700821757317, |
| "num_tokens": 22490065.0, |
| "step": 1475 |
| }, |
| { |
| "entropy": 0.4561560284346342, |
| "epoch": 0.574589925264486, |
| "grad_norm": 1.40625, |
| "learning_rate": 9.332569712909816e-06, |
| "loss": 0.4739046573638916, |
| "mean_token_accuracy": 0.8589961290359497, |
| "num_tokens": 22578402.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.38373861461877823, |
| "epoch": 0.5765311074444337, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.327330298296998e-06, |
| "loss": 0.3775209665298462, |
| "mean_token_accuracy": 0.8786651358008385, |
| "num_tokens": 22657716.0, |
| "step": 1485 |
| }, |
| { |
| "entropy": 0.34858159013092516, |
| "epoch": 0.5784722896243812, |
| "grad_norm": 1.5, |
| "learning_rate": 9.32207188163294e-06, |
| "loss": 0.36159141063690187, |
| "mean_token_accuracy": 0.8849190220236778, |
| "num_tokens": 22727213.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.36950380988419057, |
| "epoch": 0.5804134718043288, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.316794486008254e-06, |
| "loss": 0.41820201873779295, |
| "mean_token_accuracy": 0.8807887002825737, |
| "num_tokens": 22796084.0, |
| "step": 1495 |
| }, |
| { |
| "entropy": 0.3770993869751692, |
| "epoch": 0.5823546539842764, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.31149813459689e-06, |
| "loss": 0.3539431571960449, |
| "mean_token_accuracy": 0.8795977741479873, |
| "num_tokens": 22870251.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.3788142062723637, |
| "epoch": 0.584295836164224, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.306182850656037e-06, |
| "loss": 0.3946338415145874, |
| "mean_token_accuracy": 0.8801519960165024, |
| "num_tokens": 22951081.0, |
| "step": 1505 |
| }, |
| { |
| "entropy": 0.3841311365365982, |
| "epoch": 0.5862370183441716, |
| "grad_norm": 1.46875, |
| "learning_rate": 9.300848657526024e-06, |
| "loss": 0.38277838230133054, |
| "mean_token_accuracy": 0.8772434189915657, |
| "num_tokens": 23034217.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.3577498983591795, |
| "epoch": 0.5881782005241192, |
| "grad_norm": 2.0, |
| "learning_rate": 9.29549557863021e-06, |
| "loss": 0.37149059772491455, |
| "mean_token_accuracy": 0.8867600724101067, |
| "num_tokens": 23103243.0, |
| "step": 1515 |
| }, |
| { |
| "entropy": 0.36544432379305364, |
| "epoch": 0.5901193827040668, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.29012363747488e-06, |
| "loss": 0.3911574840545654, |
| "mean_token_accuracy": 0.8809396475553513, |
| "num_tokens": 23180618.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.37773900777101516, |
| "epoch": 0.5920605648840144, |
| "grad_norm": 1.6015625, |
| "learning_rate": 9.284732857649154e-06, |
| "loss": 0.40440049171447756, |
| "mean_token_accuracy": 0.8771207213401795, |
| "num_tokens": 23274254.0, |
| "step": 1525 |
| }, |
| { |
| "entropy": 0.41926471069455146, |
| "epoch": 0.5940017470639619, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.279323262824871e-06, |
| "loss": 0.43068270683288573, |
| "mean_token_accuracy": 0.8644863858819007, |
| "num_tokens": 23354103.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.3747733347117901, |
| "epoch": 0.5959429292439096, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.273894876756497e-06, |
| "loss": 0.3952503204345703, |
| "mean_token_accuracy": 0.8833319827914238, |
| "num_tokens": 23420187.0, |
| "step": 1535 |
| }, |
| { |
| "entropy": 0.3898849368095398, |
| "epoch": 0.5978841114238571, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.268447723281003e-06, |
| "loss": 0.4146092891693115, |
| "mean_token_accuracy": 0.8798678085207939, |
| "num_tokens": 23491179.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.34879231434315444, |
| "epoch": 0.5998252936038048, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.262981826317778e-06, |
| "loss": 0.37036240100860596, |
| "mean_token_accuracy": 0.8908288046717644, |
| "num_tokens": 23561490.0, |
| "step": 1545 |
| }, |
| { |
| "entropy": 0.4013595413416624, |
| "epoch": 0.6017664757837523, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.257497209868516e-06, |
| "loss": 0.42535991668701173, |
| "mean_token_accuracy": 0.8715372681617737, |
| "num_tokens": 23643197.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.3767758123576641, |
| "epoch": 0.6037076579636999, |
| "grad_norm": 1.875, |
| "learning_rate": 9.251993898017109e-06, |
| "loss": 0.3970643997192383, |
| "mean_token_accuracy": 0.8821586266160011, |
| "num_tokens": 23714513.0, |
| "step": 1555 |
| }, |
| { |
| "entropy": 0.40486165285110476, |
| "epoch": 0.6056488401436475, |
| "grad_norm": 1.515625, |
| "learning_rate": 9.246471914929547e-06, |
| "loss": 0.41384401321411135, |
| "mean_token_accuracy": 0.8695383608341217, |
| "num_tokens": 23801743.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.3538258448243141, |
| "epoch": 0.607590022323595, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.240931284853807e-06, |
| "loss": 0.3868009090423584, |
| "mean_token_accuracy": 0.8842133894562721, |
| "num_tokens": 23893948.0, |
| "step": 1565 |
| }, |
| { |
| "entropy": 0.3908126030117273, |
| "epoch": 0.6095312045035427, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.235372032119747e-06, |
| "loss": 0.40709576606750486, |
| "mean_token_accuracy": 0.8742722377181054, |
| "num_tokens": 23959291.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.39328810535371306, |
| "epoch": 0.6114723866834902, |
| "grad_norm": 1.625, |
| "learning_rate": 9.229794181139002e-06, |
| "loss": 0.40347847938537595, |
| "mean_token_accuracy": 0.874020305275917, |
| "num_tokens": 24028375.0, |
| "step": 1575 |
| }, |
| { |
| "entropy": 0.38035779893398286, |
| "epoch": 0.6134135688634378, |
| "grad_norm": 1.4765625, |
| "learning_rate": 9.224197756404875e-06, |
| "loss": 0.39300010204315183, |
| "mean_token_accuracy": 0.8796775847673416, |
| "num_tokens": 24106755.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.35121305733919145, |
| "epoch": 0.6153547510433854, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.218582782492228e-06, |
| "loss": 0.39762823581695556, |
| "mean_token_accuracy": 0.8853226408362389, |
| "num_tokens": 24174787.0, |
| "step": 1585 |
| }, |
| { |
| "entropy": 0.3741837713867426, |
| "epoch": 0.617295933223333, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.212949284057378e-06, |
| "loss": 0.39895901679992674, |
| "mean_token_accuracy": 0.8801515579223633, |
| "num_tokens": 24253990.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.4153418317437172, |
| "epoch": 0.6192371154032806, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.207297285837984e-06, |
| "loss": 0.4323587894439697, |
| "mean_token_accuracy": 0.8745242461562157, |
| "num_tokens": 24326608.0, |
| "step": 1595 |
| }, |
| { |
| "entropy": 0.42888959534466264, |
| "epoch": 0.6211782975832282, |
| "grad_norm": 1.3125, |
| "learning_rate": 9.201626812652942e-06, |
| "loss": 0.4193469524383545, |
| "mean_token_accuracy": 0.8636892691254616, |
| "num_tokens": 24408096.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.6211782975832282, |
| "eval_entropy": 0.3693768273520714, |
| "eval_loss": 0.3738563656806946, |
| "eval_mean_token_accuracy": 0.8822050338265174, |
| "eval_num_tokens": 24408096.0, |
| "eval_runtime": 60.0738, |
| "eval_samples_per_second": 35.773, |
| "eval_steps_per_second": 35.773, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.4069465111941099, |
| "epoch": 0.6231194797631757, |
| "grad_norm": 1.3828125, |
| "learning_rate": 9.195937889402276e-06, |
| "loss": 0.3946805238723755, |
| "mean_token_accuracy": 0.8711962580680848, |
| "num_tokens": 24477273.0, |
| "step": 1605 |
| }, |
| { |
| "entropy": 0.3497770603746176, |
| "epoch": 0.6250606619431234, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.190230541067023e-06, |
| "loss": 0.3609620094299316, |
| "mean_token_accuracy": 0.8848605647683143, |
| "num_tokens": 24556472.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.4083269018679857, |
| "epoch": 0.6270018441230709, |
| "grad_norm": 1.484375, |
| "learning_rate": 9.184504792709134e-06, |
| "loss": 0.4195822238922119, |
| "mean_token_accuracy": 0.873286210000515, |
| "num_tokens": 24633310.0, |
| "step": 1615 |
| }, |
| { |
| "entropy": 0.3741916142404079, |
| "epoch": 0.6289430263030186, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.178760669471351e-06, |
| "loss": 0.3778867244720459, |
| "mean_token_accuracy": 0.8781406879425049, |
| "num_tokens": 24702572.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.34799036718904974, |
| "epoch": 0.6308842084829661, |
| "grad_norm": 1.3828125, |
| "learning_rate": 9.17299819657711e-06, |
| "loss": 0.36290202140808103, |
| "mean_token_accuracy": 0.8880066946148872, |
| "num_tokens": 24786717.0, |
| "step": 1625 |
| }, |
| { |
| "entropy": 0.35105147287249566, |
| "epoch": 0.6328253906629138, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.167217399330418e-06, |
| "loss": 0.367209792137146, |
| "mean_token_accuracy": 0.886526557803154, |
| "num_tokens": 24861736.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.3578195352107286, |
| "epoch": 0.6347665728428613, |
| "grad_norm": 1.375, |
| "learning_rate": 9.161418303115749e-06, |
| "loss": 0.3651568412780762, |
| "mean_token_accuracy": 0.8801181107759476, |
| "num_tokens": 24932067.0, |
| "step": 1635 |
| }, |
| { |
| "entropy": 0.4024165827780962, |
| "epoch": 0.6367077550228089, |
| "grad_norm": 1.546875, |
| "learning_rate": 9.155600933397932e-06, |
| "loss": 0.4195927619934082, |
| "mean_token_accuracy": 0.8746752873063087, |
| "num_tokens": 25003342.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.41863835491240026, |
| "epoch": 0.6386489372027565, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.149765315722039e-06, |
| "loss": 0.4207592964172363, |
| "mean_token_accuracy": 0.8699864789843559, |
| "num_tokens": 25089543.0, |
| "step": 1645 |
| }, |
| { |
| "entropy": 0.3774807959794998, |
| "epoch": 0.640590119382704, |
| "grad_norm": 1.484375, |
| "learning_rate": 9.14391147571327e-06, |
| "loss": 0.38125219345092776, |
| "mean_token_accuracy": 0.8813193202018738, |
| "num_tokens": 25164250.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.36004649810492995, |
| "epoch": 0.6425313015626517, |
| "grad_norm": 1.4609375, |
| "learning_rate": 9.13803943907684e-06, |
| "loss": 0.38634524345397947, |
| "mean_token_accuracy": 0.8815308138728142, |
| "num_tokens": 25235584.0, |
| "step": 1655 |
| }, |
| { |
| "entropy": 0.40895739644765855, |
| "epoch": 0.6444724837425992, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.132149231597874e-06, |
| "loss": 0.42175993919372556, |
| "mean_token_accuracy": 0.8735464856028556, |
| "num_tokens": 25326121.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.42022631838917734, |
| "epoch": 0.6464136659225468, |
| "grad_norm": 1.3671875, |
| "learning_rate": 9.126240879141286e-06, |
| "loss": 0.4283411502838135, |
| "mean_token_accuracy": 0.8683241337537766, |
| "num_tokens": 25416532.0, |
| "step": 1665 |
| }, |
| { |
| "entropy": 0.3419806692749262, |
| "epoch": 0.6483548481024944, |
| "grad_norm": 1.3515625, |
| "learning_rate": 9.120314407651665e-06, |
| "loss": 0.3869215726852417, |
| "mean_token_accuracy": 0.8876996964216233, |
| "num_tokens": 25500339.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.37519195675849915, |
| "epoch": 0.650296030282442, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.114369843153168e-06, |
| "loss": 0.38437614440917967, |
| "mean_token_accuracy": 0.880204701423645, |
| "num_tokens": 25571598.0, |
| "step": 1675 |
| }, |
| { |
| "entropy": 0.34165109843015673, |
| "epoch": 0.6522372124623896, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.108407211749397e-06, |
| "loss": 0.3734029531478882, |
| "mean_token_accuracy": 0.8863589748740196, |
| "num_tokens": 25647870.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.3643860913813114, |
| "epoch": 0.6541783946423372, |
| "grad_norm": 1.4765625, |
| "learning_rate": 9.102426539623295e-06, |
| "loss": 0.3877432107925415, |
| "mean_token_accuracy": 0.8784330353140831, |
| "num_tokens": 25729611.0, |
| "step": 1685 |
| }, |
| { |
| "entropy": 0.4150772735476494, |
| "epoch": 0.6561195768222847, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.09642785303702e-06, |
| "loss": 0.4420276641845703, |
| "mean_token_accuracy": 0.8656805634498597, |
| "num_tokens": 25808840.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.3560687083750963, |
| "epoch": 0.6580607590022324, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.090411178331835e-06, |
| "loss": 0.37286901473999023, |
| "mean_token_accuracy": 0.8856526881456375, |
| "num_tokens": 25887901.0, |
| "step": 1695 |
| }, |
| { |
| "entropy": 0.4048729032278061, |
| "epoch": 0.6600019411821799, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.084376541927995e-06, |
| "loss": 0.4281449317932129, |
| "mean_token_accuracy": 0.8717523291707039, |
| "num_tokens": 25980223.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.4058201160281897, |
| "epoch": 0.6619431233621276, |
| "grad_norm": 1.28125, |
| "learning_rate": 9.078323970324626e-06, |
| "loss": 0.42533535957336427, |
| "mean_token_accuracy": 0.8724991276860237, |
| "num_tokens": 26057550.0, |
| "step": 1705 |
| }, |
| { |
| "entropy": 0.40643964521586895, |
| "epoch": 0.6638843055420751, |
| "grad_norm": 1.640625, |
| "learning_rate": 9.072253490099607e-06, |
| "loss": 0.4063755512237549, |
| "mean_token_accuracy": 0.8733890399336814, |
| "num_tokens": 26131468.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.37885321527719495, |
| "epoch": 0.6658254877220227, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.066165127909463e-06, |
| "loss": 0.39308197498321534, |
| "mean_token_accuracy": 0.881773728132248, |
| "num_tokens": 26209570.0, |
| "step": 1715 |
| }, |
| { |
| "entropy": 0.39982022494077685, |
| "epoch": 0.6677666699019703, |
| "grad_norm": 1.359375, |
| "learning_rate": 9.060058910489237e-06, |
| "loss": 0.4166593551635742, |
| "mean_token_accuracy": 0.875648008286953, |
| "num_tokens": 26291210.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.38358601108193396, |
| "epoch": 0.6697078520819179, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.053934864652382e-06, |
| "loss": 0.39159939289093015, |
| "mean_token_accuracy": 0.8792028650641441, |
| "num_tokens": 26363096.0, |
| "step": 1725 |
| }, |
| { |
| "entropy": 0.39065288491547107, |
| "epoch": 0.6716490342618655, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.047793017290635e-06, |
| "loss": 0.41971278190612793, |
| "mean_token_accuracy": 0.8771908909082413, |
| "num_tokens": 26449438.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.36197944805026055, |
| "epoch": 0.673590216441813, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.041633395373902e-06, |
| "loss": 0.3651232957839966, |
| "mean_token_accuracy": 0.8863715797662735, |
| "num_tokens": 26506251.0, |
| "step": 1735 |
| }, |
| { |
| "entropy": 0.41872271075844764, |
| "epoch": 0.6755313986217607, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.035456025950145e-06, |
| "loss": 0.4293703556060791, |
| "mean_token_accuracy": 0.8711474344134331, |
| "num_tokens": 26577535.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.3581832841038704, |
| "epoch": 0.6774725808017082, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.029260936145252e-06, |
| "loss": 0.3745636224746704, |
| "mean_token_accuracy": 0.8827520579099655, |
| "num_tokens": 26652699.0, |
| "step": 1745 |
| }, |
| { |
| "entropy": 0.43668837919831277, |
| "epoch": 0.6794137629816558, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.02304815316293e-06, |
| "loss": 0.45046534538269045, |
| "mean_token_accuracy": 0.865936142206192, |
| "num_tokens": 26735591.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.3559565614908934, |
| "epoch": 0.6813549451616034, |
| "grad_norm": 1.421875, |
| "learning_rate": 9.016817704284575e-06, |
| "loss": 0.36423630714416505, |
| "mean_token_accuracy": 0.8824115738272666, |
| "num_tokens": 26812459.0, |
| "step": 1755 |
| }, |
| { |
| "entropy": 0.3483701661229134, |
| "epoch": 0.683296127341551, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.010569616869159e-06, |
| "loss": 0.37481648921966554, |
| "mean_token_accuracy": 0.8892971143126488, |
| "num_tokens": 26882592.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.3975631568580866, |
| "epoch": 0.6852373095214986, |
| "grad_norm": 1.484375, |
| "learning_rate": 9.004303918353107e-06, |
| "loss": 0.39717047214508056, |
| "mean_token_accuracy": 0.8726603716611863, |
| "num_tokens": 26954080.0, |
| "step": 1765 |
| }, |
| { |
| "entropy": 0.38138355370610955, |
| "epoch": 0.6871784917014462, |
| "grad_norm": 2.234375, |
| "learning_rate": 8.998020636250181e-06, |
| "loss": 0.39662230014801025, |
| "mean_token_accuracy": 0.8773909747600556, |
| "num_tokens": 27025611.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.35980530045926573, |
| "epoch": 0.6891196738813937, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.991719798151354e-06, |
| "loss": 0.38723225593566896, |
| "mean_token_accuracy": 0.8855857968330383, |
| "num_tokens": 27106998.0, |
| "step": 1775 |
| }, |
| { |
| "entropy": 0.39718156717717645, |
| "epoch": 0.6910608560613414, |
| "grad_norm": 1.3203125, |
| "learning_rate": 8.985401431724685e-06, |
| "loss": 0.42195706367492675, |
| "mean_token_accuracy": 0.870930427312851, |
| "num_tokens": 27191593.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.39791759476065636, |
| "epoch": 0.6930020382412889, |
| "grad_norm": 1.4453125, |
| "learning_rate": 8.979065564715209e-06, |
| "loss": 0.3908670902252197, |
| "mean_token_accuracy": 0.877900630235672, |
| "num_tokens": 27259061.0, |
| "step": 1785 |
| }, |
| { |
| "entropy": 0.37184464260935784, |
| "epoch": 0.6949432204212366, |
| "grad_norm": 1.328125, |
| "learning_rate": 8.972712224944808e-06, |
| "loss": 0.3723410367965698, |
| "mean_token_accuracy": 0.8796270757913589, |
| "num_tokens": 27345514.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.39280957020819185, |
| "epoch": 0.6968844026011841, |
| "grad_norm": 1.3671875, |
| "learning_rate": 8.966341440312088e-06, |
| "loss": 0.37746195793151854, |
| "mean_token_accuracy": 0.8742925137281418, |
| "num_tokens": 27434611.0, |
| "step": 1795 |
| }, |
| { |
| "entropy": 0.38702532537281514, |
| "epoch": 0.6988255847811317, |
| "grad_norm": 1.34375, |
| "learning_rate": 8.959953238792261e-06, |
| "loss": 0.4323995113372803, |
| "mean_token_accuracy": 0.876301246881485, |
| "num_tokens": 27522141.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6988255847811317, |
| "eval_entropy": 0.36904463945099675, |
| "eval_loss": 0.37259599566459656, |
| "eval_mean_token_accuracy": 0.8823383979156108, |
| "eval_num_tokens": 27522141.0, |
| "eval_runtime": 60.1232, |
| "eval_samples_per_second": 35.743, |
| "eval_steps_per_second": 35.743, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.39694005586206915, |
| "epoch": 0.7007667669610793, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.953547648437016e-06, |
| "loss": 0.422884464263916, |
| "mean_token_accuracy": 0.8706113517284393, |
| "num_tokens": 27606238.0, |
| "step": 1805 |
| }, |
| { |
| "entropy": 0.35907841585576533, |
| "epoch": 0.7027079491410269, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.947124697374403e-06, |
| "loss": 0.37867820262908936, |
| "mean_token_accuracy": 0.8819711148738861, |
| "num_tokens": 27698297.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.39380453154444695, |
| "epoch": 0.7046491313209745, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.940684413808704e-06, |
| "loss": 0.41552581787109377, |
| "mean_token_accuracy": 0.8773353233933449, |
| "num_tokens": 27783292.0, |
| "step": 1815 |
| }, |
| { |
| "entropy": 0.3948865693062544, |
| "epoch": 0.706590313500922, |
| "grad_norm": 2.0, |
| "learning_rate": 8.93422682602031e-06, |
| "loss": 0.45133333206176757, |
| "mean_token_accuracy": 0.8750575929880142, |
| "num_tokens": 27857682.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.39443247877061366, |
| "epoch": 0.7085314956808697, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.927751962365603e-06, |
| "loss": 0.39142508506774903, |
| "mean_token_accuracy": 0.8749705284833909, |
| "num_tokens": 27933338.0, |
| "step": 1825 |
| }, |
| { |
| "entropy": 0.38654340282082555, |
| "epoch": 0.7104726778608172, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.921259851276816e-06, |
| "loss": 0.38780851364135743, |
| "mean_token_accuracy": 0.8745802566409111, |
| "num_tokens": 28004374.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.3340354781597853, |
| "epoch": 0.7124138600407648, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.91475052126193e-06, |
| "loss": 0.34950056076049807, |
| "mean_token_accuracy": 0.8917890131473541, |
| "num_tokens": 28076071.0, |
| "step": 1835 |
| }, |
| { |
| "entropy": 0.36462055034935476, |
| "epoch": 0.7143550422207124, |
| "grad_norm": 1.1796875, |
| "learning_rate": 8.90822400090453e-06, |
| "loss": 0.36106727123260496, |
| "mean_token_accuracy": 0.879101251065731, |
| "num_tokens": 28167857.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.3714527040719986, |
| "epoch": 0.71629622440066, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.90168031886369e-06, |
| "loss": 0.3883594274520874, |
| "mean_token_accuracy": 0.881637692451477, |
| "num_tokens": 28228771.0, |
| "step": 1845 |
| }, |
| { |
| "entropy": 0.39277232214808466, |
| "epoch": 0.7182374065806076, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.895119503873841e-06, |
| "loss": 0.4170830726623535, |
| "mean_token_accuracy": 0.8729140803217887, |
| "num_tokens": 28299510.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.3991117935627699, |
| "epoch": 0.7201785887605552, |
| "grad_norm": 2.59375, |
| "learning_rate": 8.888541584744652e-06, |
| "loss": 0.3907686710357666, |
| "mean_token_accuracy": 0.8788457185029983, |
| "num_tokens": 28356716.0, |
| "step": 1855 |
| }, |
| { |
| "entropy": 0.33190413266420365, |
| "epoch": 0.7221197709405027, |
| "grad_norm": 1.3125, |
| "learning_rate": 8.881946590360893e-06, |
| "loss": 0.3549908399581909, |
| "mean_token_accuracy": 0.8904741749167442, |
| "num_tokens": 28425961.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.3761907495558262, |
| "epoch": 0.7240609531204504, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.875334549682322e-06, |
| "loss": 0.40765061378479006, |
| "mean_token_accuracy": 0.8756383866071701, |
| "num_tokens": 28492753.0, |
| "step": 1865 |
| }, |
| { |
| "entropy": 0.3859711352735758, |
| "epoch": 0.7260021353003979, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.868705491743543e-06, |
| "loss": 0.40584306716918944, |
| "mean_token_accuracy": 0.8751093596220016, |
| "num_tokens": 28574648.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.3750033970922232, |
| "epoch": 0.7279433174803456, |
| "grad_norm": 1.375, |
| "learning_rate": 8.862059445653892e-06, |
| "loss": 0.42207088470458987, |
| "mean_token_accuracy": 0.8791605412960053, |
| "num_tokens": 28673368.0, |
| "step": 1875 |
| }, |
| { |
| "entropy": 0.33736986815929415, |
| "epoch": 0.7298844996602931, |
| "grad_norm": 1.984375, |
| "learning_rate": 8.855396440597299e-06, |
| "loss": 0.33533928394317625, |
| "mean_token_accuracy": 0.8882556319236755, |
| "num_tokens": 28745333.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.38949261195957663, |
| "epoch": 0.7318256818402407, |
| "grad_norm": 1.46875, |
| "learning_rate": 8.848716505832163e-06, |
| "loss": 0.39729306697845457, |
| "mean_token_accuracy": 0.8767626166343689, |
| "num_tokens": 28823783.0, |
| "step": 1885 |
| }, |
| { |
| "entropy": 0.373487963527441, |
| "epoch": 0.7337668640201883, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.842019670691226e-06, |
| "loss": 0.3975057601928711, |
| "mean_token_accuracy": 0.8789292603731156, |
| "num_tokens": 28899576.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.3453738629817963, |
| "epoch": 0.7357080462001359, |
| "grad_norm": 2.0625, |
| "learning_rate": 8.835305964581442e-06, |
| "loss": 0.38850131034851076, |
| "mean_token_accuracy": 0.8864782005548477, |
| "num_tokens": 28979338.0, |
| "step": 1895 |
| }, |
| { |
| "entropy": 0.3513793833553791, |
| "epoch": 0.7376492283800835, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.828575416983853e-06, |
| "loss": 0.3649607181549072, |
| "mean_token_accuracy": 0.8849209144711494, |
| "num_tokens": 29038858.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.3707513175904751, |
| "epoch": 0.739590410560031, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.821828057453448e-06, |
| "loss": 0.3917756795883179, |
| "mean_token_accuracy": 0.8805187106132507, |
| "num_tokens": 29121454.0, |
| "step": 1905 |
| }, |
| { |
| "entropy": 0.3511063469573855, |
| "epoch": 0.7415315927399786, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.81506391561904e-06, |
| "loss": 0.3545810699462891, |
| "mean_token_accuracy": 0.8827590346336365, |
| "num_tokens": 29192959.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.3934715397655964, |
| "epoch": 0.7434727749199262, |
| "grad_norm": 1.4609375, |
| "learning_rate": 8.80828302118314e-06, |
| "loss": 0.44544425010681155, |
| "mean_token_accuracy": 0.873169532418251, |
| "num_tokens": 29275025.0, |
| "step": 1915 |
| }, |
| { |
| "entropy": 0.37588623352348804, |
| "epoch": 0.7454139570998738, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.801485403921823e-06, |
| "loss": 0.4109992027282715, |
| "mean_token_accuracy": 0.8753042757511139, |
| "num_tokens": 29359266.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.3526043064892292, |
| "epoch": 0.7473551392798214, |
| "grad_norm": 1.625, |
| "learning_rate": 8.794671093684595e-06, |
| "loss": 0.3500061988830566, |
| "mean_token_accuracy": 0.8878669127821922, |
| "num_tokens": 29415745.0, |
| "step": 1925 |
| }, |
| { |
| "entropy": 0.39229949191212654, |
| "epoch": 0.749296321459769, |
| "grad_norm": 1.453125, |
| "learning_rate": 8.787840120394261e-06, |
| "loss": 0.4506565570831299, |
| "mean_token_accuracy": 0.873441505432129, |
| "num_tokens": 29492482.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.41463610120117667, |
| "epoch": 0.7512375036397166, |
| "grad_norm": 1.5703125, |
| "learning_rate": 8.7809925140468e-06, |
| "loss": 0.4298503875732422, |
| "mean_token_accuracy": 0.8726509675383568, |
| "num_tokens": 29572784.0, |
| "step": 1935 |
| }, |
| { |
| "entropy": 0.44622854702174664, |
| "epoch": 0.7531786858196642, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.774128304711232e-06, |
| "loss": 0.47462167739868166, |
| "mean_token_accuracy": 0.858974027633667, |
| "num_tokens": 29664399.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.36902854703366755, |
| "epoch": 0.7551198679996117, |
| "grad_norm": 1.625, |
| "learning_rate": 8.767247522529473e-06, |
| "loss": 0.38140344619750977, |
| "mean_token_accuracy": 0.8812808141112327, |
| "num_tokens": 29743761.0, |
| "step": 1945 |
| }, |
| { |
| "entropy": 0.3770481664687395, |
| "epoch": 0.7570610501795594, |
| "grad_norm": 1.5625, |
| "learning_rate": 8.760350197716228e-06, |
| "loss": 0.37451202869415284, |
| "mean_token_accuracy": 0.8845255061984062, |
| "num_tokens": 29805351.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.42605091743171214, |
| "epoch": 0.7590022323595069, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.75343636055883e-06, |
| "loss": 0.434804630279541, |
| "mean_token_accuracy": 0.8679893091320992, |
| "num_tokens": 29876830.0, |
| "step": 1955 |
| }, |
| { |
| "entropy": 0.4243669960647821, |
| "epoch": 0.7609434145394546, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.746506041417133e-06, |
| "loss": 0.41442170143127444, |
| "mean_token_accuracy": 0.8689531117677689, |
| "num_tokens": 29952810.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.40345251336693766, |
| "epoch": 0.7628845967194021, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.739559270723353e-06, |
| "loss": 0.3906730651855469, |
| "mean_token_accuracy": 0.8731215000152588, |
| "num_tokens": 30017592.0, |
| "step": 1965 |
| }, |
| { |
| "entropy": 0.40580461621284486, |
| "epoch": 0.7648257788993497, |
| "grad_norm": 1.4375, |
| "learning_rate": 8.732596078981957e-06, |
| "loss": 0.40709662437438965, |
| "mean_token_accuracy": 0.8757615357637405, |
| "num_tokens": 30091851.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.3563157990574837, |
| "epoch": 0.7667669610792973, |
| "grad_norm": 1.53125, |
| "learning_rate": 8.72561649676952e-06, |
| "loss": 0.36572675704956054, |
| "mean_token_accuracy": 0.8835931360721588, |
| "num_tokens": 30167070.0, |
| "step": 1975 |
| }, |
| { |
| "entropy": 0.4045840006321669, |
| "epoch": 0.7687081432592449, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.718620554734582e-06, |
| "loss": 0.4593046188354492, |
| "mean_token_accuracy": 0.8688464492559433, |
| "num_tokens": 30232066.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.38409191854298114, |
| "epoch": 0.7706493254391925, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.71160828359753e-06, |
| "loss": 0.40677800178527834, |
| "mean_token_accuracy": 0.8754825726151466, |
| "num_tokens": 30303663.0, |
| "step": 1985 |
| }, |
| { |
| "entropy": 0.36517377346754076, |
| "epoch": 0.77259050761914, |
| "grad_norm": 1.5, |
| "learning_rate": 8.704579714150451e-06, |
| "loss": 0.38115544319152833, |
| "mean_token_accuracy": 0.8828602716326713, |
| "num_tokens": 30371090.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.3954622160643339, |
| "epoch": 0.7745316897990876, |
| "grad_norm": 1.234375, |
| "learning_rate": 8.697534877257003e-06, |
| "loss": 0.4024034023284912, |
| "mean_token_accuracy": 0.8711701706051826, |
| "num_tokens": 30462563.0, |
| "step": 1995 |
| }, |
| { |
| "entropy": 0.3614397499710321, |
| "epoch": 0.7764728719790353, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.690473803852277e-06, |
| "loss": 0.38828601837158205, |
| "mean_token_accuracy": 0.8840885296463966, |
| "num_tokens": 30537774.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7764728719790353, |
| "eval_entropy": 0.37249069839105764, |
| "eval_loss": 0.3714829683303833, |
| "eval_mean_token_accuracy": 0.8825830673411481, |
| "eval_num_tokens": 30537774.0, |
| "eval_runtime": 60.1567, |
| "eval_samples_per_second": 35.723, |
| "eval_steps_per_second": 35.723, |
| "step": 2000 |
| }, |
| { |
| "entropy": 0.4046864528208971, |
| "epoch": 0.7784140541589828, |
| "grad_norm": 1.59375, |
| "learning_rate": 8.683396524942655e-06, |
| "loss": 0.4361577033996582, |
| "mean_token_accuracy": 0.8703169271349906, |
| "num_tokens": 30629222.0, |
| "step": 2005 |
| }, |
| { |
| "entropy": 0.3440706986933947, |
| "epoch": 0.7803552363389304, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.676303071605692e-06, |
| "loss": 0.3639081954956055, |
| "mean_token_accuracy": 0.8900219470262527, |
| "num_tokens": 30691162.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 0.3819488488137722, |
| "epoch": 0.782296418518878, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.669193474989957e-06, |
| "loss": 0.3750166654586792, |
| "mean_token_accuracy": 0.8811448708176612, |
| "num_tokens": 30763162.0, |
| "step": 2015 |
| }, |
| { |
| "entropy": 0.41373511366546156, |
| "epoch": 0.7842376006988255, |
| "grad_norm": 1.5625, |
| "learning_rate": 8.66206776631491e-06, |
| "loss": 0.4189298152923584, |
| "mean_token_accuracy": 0.8678776487708092, |
| "num_tokens": 30851103.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 0.38263436295092107, |
| "epoch": 0.7861787828787732, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.654925976870766e-06, |
| "loss": 0.4248814582824707, |
| "mean_token_accuracy": 0.8735449090600014, |
| "num_tokens": 30940204.0, |
| "step": 2025 |
| }, |
| { |
| "entropy": 0.4140047915279865, |
| "epoch": 0.7881199650587207, |
| "grad_norm": 1.5625, |
| "learning_rate": 8.647768138018348e-06, |
| "loss": 0.41850671768188474, |
| "mean_token_accuracy": 0.8704892829060554, |
| "num_tokens": 31020160.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 0.35883037857711314, |
| "epoch": 0.7900611472386684, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.640594281188958e-06, |
| "loss": 0.3723835229873657, |
| "mean_token_accuracy": 0.8855434998869895, |
| "num_tokens": 31099746.0, |
| "step": 2035 |
| }, |
| { |
| "entropy": 0.36093359626829624, |
| "epoch": 0.7920023294186159, |
| "grad_norm": 1.390625, |
| "learning_rate": 8.633404437884235e-06, |
| "loss": 0.3619117498397827, |
| "mean_token_accuracy": 0.8832022443413734, |
| "num_tokens": 31175731.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 0.38728207871317866, |
| "epoch": 0.7939435115985636, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.626198639676014e-06, |
| "loss": 0.38774235248565675, |
| "mean_token_accuracy": 0.8787285834550858, |
| "num_tokens": 31258521.0, |
| "step": 2045 |
| }, |
| { |
| "entropy": 0.36480732820928097, |
| "epoch": 0.7958846937785111, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.618976918206196e-06, |
| "loss": 0.3832773447036743, |
| "mean_token_accuracy": 0.8871650651097298, |
| "num_tokens": 31332899.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 0.37207553833723067, |
| "epoch": 0.7978258759584587, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.611739305186602e-06, |
| "loss": 0.4212314605712891, |
| "mean_token_accuracy": 0.877054350078106, |
| "num_tokens": 31409631.0, |
| "step": 2055 |
| }, |
| { |
| "entropy": 0.4014189802110195, |
| "epoch": 0.7997670581384063, |
| "grad_norm": 1.5390625, |
| "learning_rate": 8.604485832398833e-06, |
| "loss": 0.4188095569610596, |
| "mean_token_accuracy": 0.8725872606039047, |
| "num_tokens": 31486529.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 0.3518418502062559, |
| "epoch": 0.8017082403183539, |
| "grad_norm": 1.4609375, |
| "learning_rate": 8.597216531694136e-06, |
| "loss": 0.3803000211715698, |
| "mean_token_accuracy": 0.8850871086120605, |
| "num_tokens": 31557811.0, |
| "step": 2065 |
| }, |
| { |
| "entropy": 0.38242518045008184, |
| "epoch": 0.8036494224983015, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.589931434993262e-06, |
| "loss": 0.4062291145324707, |
| "mean_token_accuracy": 0.8756525501608848, |
| "num_tokens": 31627170.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 0.3889755714684725, |
| "epoch": 0.8055906046782491, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.58263057428632e-06, |
| "loss": 0.3767040014266968, |
| "mean_token_accuracy": 0.8771411761641502, |
| "num_tokens": 31696937.0, |
| "step": 2075 |
| }, |
| { |
| "entropy": 0.3841622915118933, |
| "epoch": 0.8075317868581966, |
| "grad_norm": 2.03125, |
| "learning_rate": 8.575313981632645e-06, |
| "loss": 0.4042715549468994, |
| "mean_token_accuracy": 0.877042506635189, |
| "num_tokens": 31769548.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 0.3571667678654194, |
| "epoch": 0.8094729690381443, |
| "grad_norm": 1.4453125, |
| "learning_rate": 8.567981689160654e-06, |
| "loss": 0.3828322172164917, |
| "mean_token_accuracy": 0.8810154914855957, |
| "num_tokens": 31843626.0, |
| "step": 2085 |
| }, |
| { |
| "entropy": 0.4046927910298109, |
| "epoch": 0.8114141512180918, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.560633729067705e-06, |
| "loss": 0.4062997341156006, |
| "mean_token_accuracy": 0.8745594829320907, |
| "num_tokens": 31926157.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 0.3526596352458, |
| "epoch": 0.8133553333980394, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.55327013361995e-06, |
| "loss": 0.3879395484924316, |
| "mean_token_accuracy": 0.8853856906294822, |
| "num_tokens": 31985096.0, |
| "step": 2095 |
| }, |
| { |
| "entropy": 0.37672988660633566, |
| "epoch": 0.815296515577987, |
| "grad_norm": 1.5859375, |
| "learning_rate": 8.545890935152204e-06, |
| "loss": 0.3643826961517334, |
| "mean_token_accuracy": 0.8801594600081444, |
| "num_tokens": 32071262.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 0.3610169466584921, |
| "epoch": 0.8172376977579345, |
| "grad_norm": 1.5546875, |
| "learning_rate": 8.538496166067798e-06, |
| "loss": 0.37534480094909667, |
| "mean_token_accuracy": 0.8826367557048798, |
| "num_tokens": 32154499.0, |
| "step": 2105 |
| }, |
| { |
| "entropy": 0.3441557249054313, |
| "epoch": 0.8191788799378822, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.531085858838434e-06, |
| "loss": 0.34778728485107424, |
| "mean_token_accuracy": 0.8875968590378761, |
| "num_tokens": 32218854.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 0.361727100238204, |
| "epoch": 0.8211200621178297, |
| "grad_norm": 1.5078125, |
| "learning_rate": 8.523660046004043e-06, |
| "loss": 0.36833460330963136, |
| "mean_token_accuracy": 0.8847725585103035, |
| "num_tokens": 32290653.0, |
| "step": 2115 |
| }, |
| { |
| "entropy": 0.37507129870355127, |
| "epoch": 0.8230612442977774, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.516218760172647e-06, |
| "loss": 0.4152214050292969, |
| "mean_token_accuracy": 0.8757175728678703, |
| "num_tokens": 32369260.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 0.38906187675893306, |
| "epoch": 0.8250024264777249, |
| "grad_norm": 1.390625, |
| "learning_rate": 8.508762034020211e-06, |
| "loss": 0.40627117156982423, |
| "mean_token_accuracy": 0.8743843853473663, |
| "num_tokens": 32461339.0, |
| "step": 2125 |
| }, |
| { |
| "entropy": 0.3820859346538782, |
| "epoch": 0.8269436086576725, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.501289900290499e-06, |
| "loss": 0.3897759437561035, |
| "mean_token_accuracy": 0.8774882882833481, |
| "num_tokens": 32541252.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 0.43145383819937705, |
| "epoch": 0.8288847908376201, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.49380239179494e-06, |
| "loss": 0.4624598503112793, |
| "mean_token_accuracy": 0.8640148594975472, |
| "num_tokens": 32626380.0, |
| "step": 2135 |
| }, |
| { |
| "entropy": 0.38095561824738977, |
| "epoch": 0.8308259730175677, |
| "grad_norm": 1.546875, |
| "learning_rate": 8.486299541412466e-06, |
| "loss": 0.4128393650054932, |
| "mean_token_accuracy": 0.8786048114299774, |
| "num_tokens": 32702475.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 0.38069754019379615, |
| "epoch": 0.8327671551975153, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.478781382089387e-06, |
| "loss": 0.41826744079589845, |
| "mean_token_accuracy": 0.8762264057993889, |
| "num_tokens": 32798281.0, |
| "step": 2145 |
| }, |
| { |
| "entropy": 0.4370048839598894, |
| "epoch": 0.8347083373774629, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.471247946839229e-06, |
| "loss": 0.4501640796661377, |
| "mean_token_accuracy": 0.8669643774628639, |
| "num_tokens": 32865902.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 0.35586942471563815, |
| "epoch": 0.8366495195574105, |
| "grad_norm": 1.4609375, |
| "learning_rate": 8.463699268742604e-06, |
| "loss": 0.3725292444229126, |
| "mean_token_accuracy": 0.8853255197405815, |
| "num_tokens": 32936532.0, |
| "step": 2155 |
| }, |
| { |
| "entropy": 0.3503182210028172, |
| "epoch": 0.8385907017373581, |
| "grad_norm": 1.9921875, |
| "learning_rate": 8.456135380947055e-06, |
| "loss": 0.3832036733627319, |
| "mean_token_accuracy": 0.8870538592338562, |
| "num_tokens": 33000960.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 0.41169595904648304, |
| "epoch": 0.8405318839173056, |
| "grad_norm": 1.3671875, |
| "learning_rate": 8.448556316666912e-06, |
| "loss": 0.4174903392791748, |
| "mean_token_accuracy": 0.8725608646869659, |
| "num_tokens": 33086991.0, |
| "step": 2165 |
| }, |
| { |
| "entropy": 0.3502325866371393, |
| "epoch": 0.8424730660972533, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.44096210918315e-06, |
| "loss": 0.356764554977417, |
| "mean_token_accuracy": 0.8890736445784568, |
| "num_tokens": 33149859.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 0.4082874767482281, |
| "epoch": 0.8444142482772008, |
| "grad_norm": 1.5859375, |
| "learning_rate": 8.43335279184324e-06, |
| "loss": 0.41769680976867674, |
| "mean_token_accuracy": 0.8735889151692391, |
| "num_tokens": 33235890.0, |
| "step": 2175 |
| }, |
| { |
| "entropy": 0.3691970378160477, |
| "epoch": 0.8463554304571484, |
| "grad_norm": 1.5, |
| "learning_rate": 8.425728398061002e-06, |
| "loss": 0.4044227600097656, |
| "mean_token_accuracy": 0.8845529943704605, |
| "num_tokens": 33293736.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 0.3638597309589386, |
| "epoch": 0.848296612637096, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.418088961316459e-06, |
| "loss": 0.3457561254501343, |
| "mean_token_accuracy": 0.8865492403507232, |
| "num_tokens": 33355455.0, |
| "step": 2185 |
| }, |
| { |
| "entropy": 0.39855882450938224, |
| "epoch": 0.8502377948170435, |
| "grad_norm": 1.4921875, |
| "learning_rate": 8.410434515155694e-06, |
| "loss": 0.40858187675476076, |
| "mean_token_accuracy": 0.8782258868217468, |
| "num_tokens": 33428022.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 0.345845440402627, |
| "epoch": 0.8521789769969912, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.402765093190693e-06, |
| "loss": 0.35137181282043456, |
| "mean_token_accuracy": 0.8922701835632324, |
| "num_tokens": 33495081.0, |
| "step": 2195 |
| }, |
| { |
| "entropy": 0.3340891394764185, |
| "epoch": 0.8541201591769387, |
| "grad_norm": 1.3671875, |
| "learning_rate": 8.395080729099206e-06, |
| "loss": 0.3650421380996704, |
| "mean_token_accuracy": 0.8865584105253219, |
| "num_tokens": 33588202.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.8541201591769387, |
| "eval_entropy": 0.36483841773214426, |
| "eval_loss": 0.3705015778541565, |
| "eval_mean_token_accuracy": 0.8827846525241297, |
| "eval_num_tokens": 33588202.0, |
| "eval_runtime": 60.1598, |
| "eval_samples_per_second": 35.722, |
| "eval_steps_per_second": 35.722, |
| "step": 2200 |
| }, |
| { |
| "entropy": 0.3953329209238291, |
| "epoch": 0.8560613413568864, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.3873814566246e-06, |
| "loss": 0.4315669536590576, |
| "mean_token_accuracy": 0.8746445998549461, |
| "num_tokens": 33668354.0, |
| "step": 2205 |
| }, |
| { |
| "entropy": 0.37631992548704146, |
| "epoch": 0.8580025235368339, |
| "grad_norm": 1.6015625, |
| "learning_rate": 8.379667309575699e-06, |
| "loss": 0.41765918731689455, |
| "mean_token_accuracy": 0.8796603456139565, |
| "num_tokens": 33733737.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 0.3429785013198853, |
| "epoch": 0.8599437057167815, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.371938321826654e-06, |
| "loss": 0.35924372673034666, |
| "mean_token_accuracy": 0.8863778650760651, |
| "num_tokens": 33813616.0, |
| "step": 2215 |
| }, |
| { |
| "entropy": 0.36748342849314214, |
| "epoch": 0.8618848878967291, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.364194527316776e-06, |
| "loss": 0.38543248176574707, |
| "mean_token_accuracy": 0.8795094177126884, |
| "num_tokens": 33893625.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 0.3699775494635105, |
| "epoch": 0.8638260700766767, |
| "grad_norm": 1.3203125, |
| "learning_rate": 8.356435960050398e-06, |
| "loss": 0.3805511474609375, |
| "mean_token_accuracy": 0.8810431718826294, |
| "num_tokens": 33969465.0, |
| "step": 2225 |
| }, |
| { |
| "entropy": 0.39027220420539377, |
| "epoch": 0.8657672522566243, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.348662654096724e-06, |
| "loss": 0.3937405586242676, |
| "mean_token_accuracy": 0.875699220597744, |
| "num_tokens": 34037321.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 0.380586925894022, |
| "epoch": 0.8677084344365719, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.340874643589676e-06, |
| "loss": 0.39784080982208253, |
| "mean_token_accuracy": 0.8761513873934745, |
| "num_tokens": 34115202.0, |
| "step": 2235 |
| }, |
| { |
| "entropy": 0.38007759377360345, |
| "epoch": 0.8696496166165194, |
| "grad_norm": 1.3359375, |
| "learning_rate": 8.333071962727745e-06, |
| "loss": 0.3872611284255981, |
| "mean_token_accuracy": 0.8754698395729065, |
| "num_tokens": 34202914.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 0.36750674396753313, |
| "epoch": 0.8715907987964671, |
| "grad_norm": 1.4765625, |
| "learning_rate": 8.325254645773849e-06, |
| "loss": 0.36534600257873534, |
| "mean_token_accuracy": 0.8795874208211899, |
| "num_tokens": 34276806.0, |
| "step": 2245 |
| }, |
| { |
| "entropy": 0.3650043081492186, |
| "epoch": 0.8735319809764146, |
| "grad_norm": 1.59375, |
| "learning_rate": 8.317422727055165e-06, |
| "loss": 0.3911173105239868, |
| "mean_token_accuracy": 0.8810791179537774, |
| "num_tokens": 34349979.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 0.4252156797796488, |
| "epoch": 0.8754731631563623, |
| "grad_norm": 1.421875, |
| "learning_rate": 8.309576240962998e-06, |
| "loss": 0.3878526449203491, |
| "mean_token_accuracy": 0.870864699780941, |
| "num_tokens": 34431756.0, |
| "step": 2255 |
| }, |
| { |
| "entropy": 0.3478477492928505, |
| "epoch": 0.8774143453363098, |
| "grad_norm": 1.703125, |
| "learning_rate": 8.301715221952615e-06, |
| "loss": 0.3578909635543823, |
| "mean_token_accuracy": 0.8878472730517387, |
| "num_tokens": 34507200.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 0.3685021881014109, |
| "epoch": 0.8793555275162575, |
| "grad_norm": 1.59375, |
| "learning_rate": 8.293839704543103e-06, |
| "loss": 0.39955284595489504, |
| "mean_token_accuracy": 0.880965618789196, |
| "num_tokens": 34586189.0, |
| "step": 2265 |
| }, |
| { |
| "entropy": 0.35574909709393976, |
| "epoch": 0.881296709696205, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.285949723317214e-06, |
| "loss": 0.38354690074920655, |
| "mean_token_accuracy": 0.8842941373586655, |
| "num_tokens": 34664914.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 0.3691468223929405, |
| "epoch": 0.8832378918761525, |
| "grad_norm": 1.3984375, |
| "learning_rate": 8.27804531292121e-06, |
| "loss": 0.3860490322113037, |
| "mean_token_accuracy": 0.8787114471197128, |
| "num_tokens": 34748974.0, |
| "step": 2275 |
| }, |
| { |
| "entropy": 0.3915288481861353, |
| "epoch": 0.8851790740561002, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.270126508064717e-06, |
| "loss": 0.4229584217071533, |
| "mean_token_accuracy": 0.8710038289427757, |
| "num_tokens": 34823550.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 0.4291458610445261, |
| "epoch": 0.8871202562360477, |
| "grad_norm": 1.3359375, |
| "learning_rate": 8.262193343520567e-06, |
| "loss": 0.43143463134765625, |
| "mean_token_accuracy": 0.8686526745557785, |
| "num_tokens": 34917555.0, |
| "step": 2285 |
| }, |
| { |
| "entropy": 0.37965994626283645, |
| "epoch": 0.8890614384159954, |
| "grad_norm": 1.5078125, |
| "learning_rate": 8.254245854124652e-06, |
| "loss": 0.3806295394897461, |
| "mean_token_accuracy": 0.881553427875042, |
| "num_tokens": 34990801.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 0.4057528983801603, |
| "epoch": 0.8910026205959429, |
| "grad_norm": 1.875, |
| "learning_rate": 8.246284074775763e-06, |
| "loss": 0.41382293701171874, |
| "mean_token_accuracy": 0.8720936447381973, |
| "num_tokens": 35074294.0, |
| "step": 2295 |
| }, |
| { |
| "entropy": 0.36701224111020564, |
| "epoch": 0.8929438027758905, |
| "grad_norm": 1.53125, |
| "learning_rate": 8.23830804043544e-06, |
| "loss": 0.39014787673950196, |
| "mean_token_accuracy": 0.8813544929027557, |
| "num_tokens": 35142936.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 0.36863389015197756, |
| "epoch": 0.8948849849558381, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.230317786127822e-06, |
| "loss": 0.4085258483886719, |
| "mean_token_accuracy": 0.8811587393283844, |
| "num_tokens": 35228369.0, |
| "step": 2305 |
| }, |
| { |
| "entropy": 0.3572548136115074, |
| "epoch": 0.8968261671357857, |
| "grad_norm": 1.6640625, |
| "learning_rate": 8.22231334693949e-06, |
| "loss": 0.40120840072631836, |
| "mean_token_accuracy": 0.8830894485116005, |
| "num_tokens": 35306656.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 0.3663245867937803, |
| "epoch": 0.8987673493157333, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.21429475801931e-06, |
| "loss": 0.3976627826690674, |
| "mean_token_accuracy": 0.8808062687516213, |
| "num_tokens": 35383007.0, |
| "step": 2315 |
| }, |
| { |
| "entropy": 0.40247388668358325, |
| "epoch": 0.9007085314956809, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.20626205457829e-06, |
| "loss": 0.4049511909484863, |
| "mean_token_accuracy": 0.8711845085024834, |
| "num_tokens": 35453462.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 0.3913619853556156, |
| "epoch": 0.9026497136756284, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.198215271889405e-06, |
| "loss": 0.3979458808898926, |
| "mean_token_accuracy": 0.879303203523159, |
| "num_tokens": 35524043.0, |
| "step": 2325 |
| }, |
| { |
| "entropy": 0.37322167456150057, |
| "epoch": 0.9045908958555761, |
| "grad_norm": 1.5703125, |
| "learning_rate": 8.190154445287466e-06, |
| "loss": 0.41640191078186034, |
| "mean_token_accuracy": 0.8771696910262108, |
| "num_tokens": 35609328.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 0.39078370332717893, |
| "epoch": 0.9065320780355236, |
| "grad_norm": 1.421875, |
| "learning_rate": 8.182079610168945e-06, |
| "loss": 0.37838523387908934, |
| "mean_token_accuracy": 0.8762111157178879, |
| "num_tokens": 35680622.0, |
| "step": 2335 |
| }, |
| { |
| "entropy": 0.39326913058757784, |
| "epoch": 0.9084732602154713, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.173990801991834e-06, |
| "loss": 0.38826932907104494, |
| "mean_token_accuracy": 0.8793436914682389, |
| "num_tokens": 35744201.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 0.38700769394636153, |
| "epoch": 0.9104144423954188, |
| "grad_norm": 1.3125, |
| "learning_rate": 8.165888056275478e-06, |
| "loss": 0.4147165298461914, |
| "mean_token_accuracy": 0.8736557975411415, |
| "num_tokens": 35822206.0, |
| "step": 2345 |
| }, |
| { |
| "entropy": 0.3916376482695341, |
| "epoch": 0.9123556245753665, |
| "grad_norm": 1.4375, |
| "learning_rate": 8.157771408600427e-06, |
| "loss": 0.40491595268249514, |
| "mean_token_accuracy": 0.8760656327009201, |
| "num_tokens": 35898583.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 0.37616121433675287, |
| "epoch": 0.914296806755314, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.149640894608277e-06, |
| "loss": 0.39853510856628416, |
| "mean_token_accuracy": 0.8797665163874626, |
| "num_tokens": 35962197.0, |
| "step": 2355 |
| }, |
| { |
| "entropy": 0.3872214786708355, |
| "epoch": 0.9162379889352615, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.141496550001512e-06, |
| "loss": 0.4320131778717041, |
| "mean_token_accuracy": 0.8749197080731392, |
| "num_tokens": 36048634.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 0.3654011983424425, |
| "epoch": 0.9181791711152092, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.13333841054335e-06, |
| "loss": 0.4127936363220215, |
| "mean_token_accuracy": 0.8794704377651215, |
| "num_tokens": 36114078.0, |
| "step": 2365 |
| }, |
| { |
| "entropy": 0.4052185159176588, |
| "epoch": 0.9201203532951567, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.125166512057583e-06, |
| "loss": 0.4502895355224609, |
| "mean_token_accuracy": 0.8730918914079666, |
| "num_tokens": 36185468.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 0.3805313348770142, |
| "epoch": 0.9220615354751044, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.116980890428421e-06, |
| "loss": 0.4319314956665039, |
| "mean_token_accuracy": 0.8795101106166839, |
| "num_tokens": 36262273.0, |
| "step": 2375 |
| }, |
| { |
| "entropy": 0.379262937605381, |
| "epoch": 0.9240027176550519, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.108781581600337e-06, |
| "loss": 0.3972128391265869, |
| "mean_token_accuracy": 0.8769020855426788, |
| "num_tokens": 36339772.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 0.3275274306535721, |
| "epoch": 0.9259438998349995, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.100568621577907e-06, |
| "loss": 0.349655294418335, |
| "mean_token_accuracy": 0.8939395412802696, |
| "num_tokens": 36405190.0, |
| "step": 2385 |
| }, |
| { |
| "entropy": 0.37873089760541917, |
| "epoch": 0.9278850820149471, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.092342046425647e-06, |
| "loss": 0.41008243560791013, |
| "mean_token_accuracy": 0.8806984931230545, |
| "num_tokens": 36471551.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 0.42233099043369293, |
| "epoch": 0.9298262641948947, |
| "grad_norm": 1.5, |
| "learning_rate": 8.084101892267866e-06, |
| "loss": 0.43898987770080566, |
| "mean_token_accuracy": 0.8671094790101052, |
| "num_tokens": 36545141.0, |
| "step": 2395 |
| }, |
| { |
| "entropy": 0.377986478433013, |
| "epoch": 0.9317674463748423, |
| "grad_norm": 1.6015625, |
| "learning_rate": 8.075848195288495e-06, |
| "loss": 0.4050844669342041, |
| "mean_token_accuracy": 0.8788811087608337, |
| "num_tokens": 36621766.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.9317674463748423, |
| "eval_entropy": 0.3660563061428991, |
| "eval_loss": 0.3696966767311096, |
| "eval_mean_token_accuracy": 0.8829624101609727, |
| "eval_num_tokens": 36621766.0, |
| "eval_runtime": 60.0474, |
| "eval_samples_per_second": 35.788, |
| "eval_steps_per_second": 35.788, |
| "step": 2400 |
| }, |
| { |
| "entropy": 0.41329049319028854, |
| "epoch": 0.9337086285547899, |
| "grad_norm": 1.3203125, |
| "learning_rate": 8.06758099173094e-06, |
| "loss": 0.3864266872406006, |
| "mean_token_accuracy": 0.8733444228768349, |
| "num_tokens": 36692402.0, |
| "step": 2405 |
| }, |
| { |
| "entropy": 0.35147353522479535, |
| "epoch": 0.9356498107347374, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.059300317897907e-06, |
| "loss": 0.3865788698196411, |
| "mean_token_accuracy": 0.8899516001343727, |
| "num_tokens": 36762687.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 0.35971076525747775, |
| "epoch": 0.9375909929146851, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.051006210151264e-06, |
| "loss": 0.38848717212677003, |
| "mean_token_accuracy": 0.8857970297336578, |
| "num_tokens": 36829409.0, |
| "step": 2415 |
| }, |
| { |
| "entropy": 0.36762615144252775, |
| "epoch": 0.9395321750946326, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.04269870491186e-06, |
| "loss": 0.38152570724487306, |
| "mean_token_accuracy": 0.8803352236747741, |
| "num_tokens": 36920865.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 0.37836971804499625, |
| "epoch": 0.9414733572745803, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.03437783865938e-06, |
| "loss": 0.3982245683670044, |
| "mean_token_accuracy": 0.8782578885555268, |
| "num_tokens": 36993918.0, |
| "step": 2425 |
| }, |
| { |
| "entropy": 0.40916073732078073, |
| "epoch": 0.9434145394545278, |
| "grad_norm": 1.3203125, |
| "learning_rate": 8.02604364793218e-06, |
| "loss": 0.40181870460510255, |
| "mean_token_accuracy": 0.8741835564374923, |
| "num_tokens": 37067466.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 0.38971280567348004, |
| "epoch": 0.9453557216344753, |
| "grad_norm": 1.4375, |
| "learning_rate": 8.017696169327121e-06, |
| "loss": 0.3853023052215576, |
| "mean_token_accuracy": 0.8737384587526321, |
| "num_tokens": 37158825.0, |
| "step": 2435 |
| }, |
| { |
| "entropy": 0.36188525408506395, |
| "epoch": 0.947296903814423, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.009335439499418e-06, |
| "loss": 0.39717903137207033, |
| "mean_token_accuracy": 0.8839860737323761, |
| "num_tokens": 37231768.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 0.3343341175466776, |
| "epoch": 0.9492380859943705, |
| "grad_norm": 1.5078125, |
| "learning_rate": 8.000961495162474e-06, |
| "loss": 0.34873759746551514, |
| "mean_token_accuracy": 0.8942202180624008, |
| "num_tokens": 37295787.0, |
| "step": 2445 |
| }, |
| { |
| "entropy": 0.37651418149471283, |
| "epoch": 0.9511792681743182, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.992574373087717e-06, |
| "loss": 0.3985455989837646, |
| "mean_token_accuracy": 0.8804239287972451, |
| "num_tokens": 37365031.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 0.3872853074222803, |
| "epoch": 0.9531204503542657, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.984174110104442e-06, |
| "loss": 0.3960126876831055, |
| "mean_token_accuracy": 0.8773596182465553, |
| "num_tokens": 37440723.0, |
| "step": 2455 |
| }, |
| { |
| "entropy": 0.36501435153186323, |
| "epoch": 0.9550616325342134, |
| "grad_norm": 1.546875, |
| "learning_rate": 7.975760743099648e-06, |
| "loss": 0.3613110065460205, |
| "mean_token_accuracy": 0.8814436718821526, |
| "num_tokens": 37517552.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 0.4213182792067528, |
| "epoch": 0.9570028147141609, |
| "grad_norm": 2.125, |
| "learning_rate": 7.967334309017876e-06, |
| "loss": 0.42275075912475585, |
| "mean_token_accuracy": 0.8684807687997818, |
| "num_tokens": 37576304.0, |
| "step": 2465 |
| }, |
| { |
| "entropy": 0.36460405923426153, |
| "epoch": 0.9589439968941085, |
| "grad_norm": 1.4453125, |
| "learning_rate": 7.958894844861044e-06, |
| "loss": 0.4192854881286621, |
| "mean_token_accuracy": 0.882645896077156, |
| "num_tokens": 37649463.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 0.3699316095560789, |
| "epoch": 0.9608851790740561, |
| "grad_norm": 1.53125, |
| "learning_rate": 7.950442387688295e-06, |
| "loss": 0.39672675132751467, |
| "mean_token_accuracy": 0.8789965286850929, |
| "num_tokens": 37727011.0, |
| "step": 2475 |
| }, |
| { |
| "entropy": 0.3613630454987288, |
| "epoch": 0.9628263612540037, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.941976974615817e-06, |
| "loss": 0.35828289985656736, |
| "mean_token_accuracy": 0.8861474558711052, |
| "num_tokens": 37799274.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 0.4028384655714035, |
| "epoch": 0.9647675434339513, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.933498642816698e-06, |
| "loss": 0.39244048595428466, |
| "mean_token_accuracy": 0.8733719438314438, |
| "num_tokens": 37872790.0, |
| "step": 2485 |
| }, |
| { |
| "entropy": 0.37262568436563015, |
| "epoch": 0.9667087256138989, |
| "grad_norm": 1.328125, |
| "learning_rate": 7.925007429520745e-06, |
| "loss": 0.3869138240814209, |
| "mean_token_accuracy": 0.8780170202255249, |
| "num_tokens": 37949478.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 0.3532130911946297, |
| "epoch": 0.9686499077938464, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.916503372014339e-06, |
| "loss": 0.3645073175430298, |
| "mean_token_accuracy": 0.8856014132499694, |
| "num_tokens": 38010035.0, |
| "step": 2495 |
| }, |
| { |
| "entropy": 0.4101907879114151, |
| "epoch": 0.9705910899737941, |
| "grad_norm": 1.4453125, |
| "learning_rate": 7.90798650764026e-06, |
| "loss": 0.43153948783874513, |
| "mean_token_accuracy": 0.868617196381092, |
| "num_tokens": 38091317.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 0.37458378039300444, |
| "epoch": 0.9725322721537416, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.899456873797519e-06, |
| "loss": 0.4130906105041504, |
| "mean_token_accuracy": 0.8811309933662415, |
| "num_tokens": 38156010.0, |
| "step": 2505 |
| }, |
| { |
| "entropy": 0.33361660987138747, |
| "epoch": 0.9744734543336893, |
| "grad_norm": 1.5078125, |
| "learning_rate": 7.890914507941209e-06, |
| "loss": 0.3599473714828491, |
| "mean_token_accuracy": 0.891946268081665, |
| "num_tokens": 38227058.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 0.36410921774804594, |
| "epoch": 0.9764146365136368, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.882359447582323e-06, |
| "loss": 0.36246566772460936, |
| "mean_token_accuracy": 0.8795957028865814, |
| "num_tokens": 38308578.0, |
| "step": 2515 |
| }, |
| { |
| "entropy": 0.40007474571466445, |
| "epoch": 0.9783558186935843, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.873791730287607e-06, |
| "loss": 0.416595458984375, |
| "mean_token_accuracy": 0.8698130205273629, |
| "num_tokens": 38393316.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 0.37237811721861364, |
| "epoch": 0.980297000873532, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.865211393679374e-06, |
| "loss": 0.3867233991622925, |
| "mean_token_accuracy": 0.8821268856525422, |
| "num_tokens": 38469288.0, |
| "step": 2525 |
| }, |
| { |
| "entropy": 0.3845432631671429, |
| "epoch": 0.9822381830534795, |
| "grad_norm": 1.3515625, |
| "learning_rate": 7.856618475435361e-06, |
| "loss": 0.3905576944351196, |
| "mean_token_accuracy": 0.8774156749248505, |
| "num_tokens": 38543267.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 0.40176920518279075, |
| "epoch": 0.9841793652334272, |
| "grad_norm": 1.546875, |
| "learning_rate": 7.848013013288548e-06, |
| "loss": 0.41007471084594727, |
| "mean_token_accuracy": 0.8690975129604339, |
| "num_tokens": 38626344.0, |
| "step": 2535 |
| }, |
| { |
| "entropy": 0.38942315727472304, |
| "epoch": 0.9861205474133747, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.839395045027e-06, |
| "loss": 0.40326895713806155, |
| "mean_token_accuracy": 0.8756881758570672, |
| "num_tokens": 38701064.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 0.3606115547940135, |
| "epoch": 0.9880617295933223, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.830764608493697e-06, |
| "loss": 0.36026384830474856, |
| "mean_token_accuracy": 0.8839934259653092, |
| "num_tokens": 38779059.0, |
| "step": 2545 |
| }, |
| { |
| "entropy": 0.39264477528631686, |
| "epoch": 0.9900029117732699, |
| "grad_norm": 1.3203125, |
| "learning_rate": 7.822121741586368e-06, |
| "loss": 0.4251199245452881, |
| "mean_token_accuracy": 0.8718041434884072, |
| "num_tokens": 38880681.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 0.40420532748103144, |
| "epoch": 0.9919440939532175, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.813466482257327e-06, |
| "loss": 0.4312422752380371, |
| "mean_token_accuracy": 0.8763286352157593, |
| "num_tokens": 38941161.0, |
| "step": 2555 |
| }, |
| { |
| "entropy": 0.3369973488152027, |
| "epoch": 0.9938852761331651, |
| "grad_norm": 1.40625, |
| "learning_rate": 7.804798868513306e-06, |
| "loss": 0.35411407947540285, |
| "mean_token_accuracy": 0.8899437338113785, |
| "num_tokens": 39018011.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 0.39636878967285155, |
| "epoch": 0.9958264583131127, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.796118938415289e-06, |
| "loss": 0.407199764251709, |
| "mean_token_accuracy": 0.8719804942607879, |
| "num_tokens": 39101097.0, |
| "step": 2565 |
| }, |
| { |
| "entropy": 0.3830322280526161, |
| "epoch": 0.9977676404930603, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.78742673007834e-06, |
| "loss": 0.38955183029174806, |
| "mean_token_accuracy": 0.8754953861236572, |
| "num_tokens": 39180075.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 0.37996360957622527, |
| "epoch": 0.9997088226730079, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.77872228167144e-06, |
| "loss": 0.4175414085388184, |
| "mean_token_accuracy": 0.8776975840330123, |
| "num_tokens": 39256747.0, |
| "step": 2575 |
| }, |
| { |
| "entropy": 0.3622729911615974, |
| "epoch": 1.001552945743958, |
| "grad_norm": 1.5, |
| "learning_rate": 7.770005631417316e-06, |
| "loss": 0.3494336366653442, |
| "mean_token_accuracy": 0.8844640803964514, |
| "num_tokens": 39331281.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 0.3717500135302544, |
| "epoch": 1.0034941279239056, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.761276817592283e-06, |
| "loss": 0.38556852340698244, |
| "mean_token_accuracy": 0.8811951488256454, |
| "num_tokens": 39403180.0, |
| "step": 2585 |
| }, |
| { |
| "entropy": 0.3536002866923809, |
| "epoch": 1.0054353101038533, |
| "grad_norm": 1.5078125, |
| "learning_rate": 7.752535878526057e-06, |
| "loss": 0.3865217208862305, |
| "mean_token_accuracy": 0.8846583724021911, |
| "num_tokens": 39482653.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 0.36924818605184556, |
| "epoch": 1.0073764922838009, |
| "grad_norm": 1.359375, |
| "learning_rate": 7.743782852601609e-06, |
| "loss": 0.3744253873825073, |
| "mean_token_accuracy": 0.8835659891366958, |
| "num_tokens": 39571122.0, |
| "step": 2595 |
| }, |
| { |
| "entropy": 0.3826246250420809, |
| "epoch": 1.0093176744637484, |
| "grad_norm": 1.3125, |
| "learning_rate": 7.735017778254976e-06, |
| "loss": 0.3962560176849365, |
| "mean_token_accuracy": 0.8752669557929039, |
| "num_tokens": 39643625.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.0093176744637484, |
| "eval_entropy": 0.3590102044439915, |
| "eval_loss": 0.3693583905696869, |
| "eval_mean_token_accuracy": 0.8832231578354283, |
| "eval_num_tokens": 39643625.0, |
| "eval_runtime": 60.1285, |
| "eval_samples_per_second": 35.74, |
| "eval_steps_per_second": 35.74, |
| "step": 2600 |
| }, |
| { |
| "entropy": 0.3363046307116747, |
| "epoch": 1.011258856643696, |
| "grad_norm": 1.5859375, |
| "learning_rate": 7.726240693975112e-06, |
| "loss": 0.3622615814208984, |
| "mean_token_accuracy": 0.890743799507618, |
| "num_tokens": 39727345.0, |
| "step": 2605 |
| }, |
| { |
| "entropy": 0.39671580009162427, |
| "epoch": 1.0132000388236435, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.7174516383037e-06, |
| "loss": 0.42294821739196775, |
| "mean_token_accuracy": 0.8709942042827606, |
| "num_tokens": 39809382.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 0.38348409347236156, |
| "epoch": 1.0151412210035913, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.70865064983499e-06, |
| "loss": 0.40362987518310545, |
| "mean_token_accuracy": 0.8796380490064621, |
| "num_tokens": 39870908.0, |
| "step": 2615 |
| }, |
| { |
| "entropy": 0.35954158157110216, |
| "epoch": 1.0170824031835388, |
| "grad_norm": 1.5859375, |
| "learning_rate": 7.699837767215642e-06, |
| "loss": 0.391841459274292, |
| "mean_token_accuracy": 0.8825426653027535, |
| "num_tokens": 39946284.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 0.3796717070043087, |
| "epoch": 1.0190235853634864, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.691013029144535e-06, |
| "loss": 0.4171717643737793, |
| "mean_token_accuracy": 0.8788355842232705, |
| "num_tokens": 40017489.0, |
| "step": 2625 |
| }, |
| { |
| "entropy": 0.35411719866096975, |
| "epoch": 1.020964767543434, |
| "grad_norm": 1.265625, |
| "learning_rate": 7.682176474372613e-06, |
| "loss": 0.36679236888885497, |
| "mean_token_accuracy": 0.8839921057224274, |
| "num_tokens": 40091956.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 0.4047669190913439, |
| "epoch": 1.0229059497233814, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.673328141702708e-06, |
| "loss": 0.42531418800354004, |
| "mean_token_accuracy": 0.8716289401054382, |
| "num_tokens": 40174273.0, |
| "step": 2635 |
| }, |
| { |
| "entropy": 0.3872047744691372, |
| "epoch": 1.0248471319033292, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.664468069989363e-06, |
| "loss": 0.39284777641296387, |
| "mean_token_accuracy": 0.8739194989204406, |
| "num_tokens": 40259401.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 0.39128339402377604, |
| "epoch": 1.0267883140832768, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.655596298138683e-06, |
| "loss": 0.3992388963699341, |
| "mean_token_accuracy": 0.8772461920976639, |
| "num_tokens": 40326313.0, |
| "step": 2645 |
| }, |
| { |
| "entropy": 0.36866898983716967, |
| "epoch": 1.0287294962632243, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.646712865108143e-06, |
| "loss": 0.376071572303772, |
| "mean_token_accuracy": 0.885202445089817, |
| "num_tokens": 40402194.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 0.3736342485994101, |
| "epoch": 1.0306706784431718, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.637817809906422e-06, |
| "loss": 0.38311469554901123, |
| "mean_token_accuracy": 0.8761677891016006, |
| "num_tokens": 40482708.0, |
| "step": 2655 |
| }, |
| { |
| "entropy": 0.36633356250822546, |
| "epoch": 1.0326118606231194, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.628911171593236e-06, |
| "loss": 0.39987525939941404, |
| "mean_token_accuracy": 0.8770500838756561, |
| "num_tokens": 40550320.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 0.3814588252454996, |
| "epoch": 1.0345530428030671, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.6199929892791666e-06, |
| "loss": 0.42825708389282224, |
| "mean_token_accuracy": 0.8766391202807426, |
| "num_tokens": 40621121.0, |
| "step": 2665 |
| }, |
| { |
| "entropy": 0.3263087157160044, |
| "epoch": 1.0364942249830147, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.611063302125485e-06, |
| "loss": 0.3370352745056152, |
| "mean_token_accuracy": 0.892676542699337, |
| "num_tokens": 40675017.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 0.35688314363360407, |
| "epoch": 1.0384354071629622, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.602122149343982e-06, |
| "loss": 0.37260828018188474, |
| "mean_token_accuracy": 0.8857112199068069, |
| "num_tokens": 40735182.0, |
| "step": 2675 |
| }, |
| { |
| "entropy": 0.3657310428097844, |
| "epoch": 1.0403765893429098, |
| "grad_norm": 1.4296875, |
| "learning_rate": 7.593169570196798e-06, |
| "loss": 0.38663344383239745, |
| "mean_token_accuracy": 0.8788129478693009, |
| "num_tokens": 40812276.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 0.39327623806893824, |
| "epoch": 1.0423177715228573, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.5842056039962465e-06, |
| "loss": 0.40496459007263186, |
| "mean_token_accuracy": 0.8742495253682137, |
| "num_tokens": 40883935.0, |
| "step": 2685 |
| }, |
| { |
| "entropy": 0.37283147126436234, |
| "epoch": 1.044258953702805, |
| "grad_norm": 1.5859375, |
| "learning_rate": 7.575230290104643e-06, |
| "loss": 0.38010687828063966, |
| "mean_token_accuracy": 0.8804807871580124, |
| "num_tokens": 40965277.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 0.33802379108965397, |
| "epoch": 1.0462001358827526, |
| "grad_norm": 1.5078125, |
| "learning_rate": 7.566243667934132e-06, |
| "loss": 0.34528648853302, |
| "mean_token_accuracy": 0.8925616145133972, |
| "num_tokens": 41036874.0, |
| "step": 2695 |
| }, |
| { |
| "entropy": 0.34530715458095074, |
| "epoch": 1.0481413180627002, |
| "grad_norm": 1.4375, |
| "learning_rate": 7.557245776946522e-06, |
| "loss": 0.3618237257003784, |
| "mean_token_accuracy": 0.8869366824626923, |
| "num_tokens": 41123295.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 0.3728719219565392, |
| "epoch": 1.0500825002426477, |
| "grad_norm": 1.53125, |
| "learning_rate": 7.548236656653095e-06, |
| "loss": 0.3764779567718506, |
| "mean_token_accuracy": 0.8791951701045037, |
| "num_tokens": 41206755.0, |
| "step": 2705 |
| }, |
| { |
| "entropy": 0.36930376179516317, |
| "epoch": 1.0520236824225955, |
| "grad_norm": 1.3671875, |
| "learning_rate": 7.539216346614448e-06, |
| "loss": 0.3768768310546875, |
| "mean_token_accuracy": 0.8802413672208786, |
| "num_tokens": 41295129.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 0.3499265480786562, |
| "epoch": 1.053964864602543, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.530184886440312e-06, |
| "loss": 0.3675286293029785, |
| "mean_token_accuracy": 0.889797542989254, |
| "num_tokens": 41374363.0, |
| "step": 2715 |
| }, |
| { |
| "entropy": 0.3654515855014324, |
| "epoch": 1.0559060467824906, |
| "grad_norm": 1.53125, |
| "learning_rate": 7.521142315789382e-06, |
| "loss": 0.3843737840652466, |
| "mean_token_accuracy": 0.8779830664396286, |
| "num_tokens": 41452026.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 0.36771729625761507, |
| "epoch": 1.057847228962438, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.512088674369143e-06, |
| "loss": 0.3673874378204346, |
| "mean_token_accuracy": 0.8848532065749168, |
| "num_tokens": 41516536.0, |
| "step": 2725 |
| }, |
| { |
| "entropy": 0.4460260573774576, |
| "epoch": 1.0597884111423856, |
| "grad_norm": 1.546875, |
| "learning_rate": 7.503024001935686e-06, |
| "loss": 0.45882291793823243, |
| "mean_token_accuracy": 0.8644292831420899, |
| "num_tokens": 41595307.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 0.34162113182246684, |
| "epoch": 1.0617295933223332, |
| "grad_norm": 1.375, |
| "learning_rate": 7.493948338293549e-06, |
| "loss": 0.35989553928375245, |
| "mean_token_accuracy": 0.8857067421078682, |
| "num_tokens": 41675093.0, |
| "step": 2735 |
| }, |
| { |
| "entropy": 0.34600385688245294, |
| "epoch": 1.063670775502281, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.4848617232955275e-06, |
| "loss": 0.36208953857421877, |
| "mean_token_accuracy": 0.8861552521586418, |
| "num_tokens": 41751969.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 0.34471417032182217, |
| "epoch": 1.0656119576822285, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.475764196842516e-06, |
| "loss": 0.3590202331542969, |
| "mean_token_accuracy": 0.8894360795617103, |
| "num_tokens": 41826333.0, |
| "step": 2745 |
| }, |
| { |
| "entropy": 0.35084208101034164, |
| "epoch": 1.067553139862176, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.466655798883313e-06, |
| "loss": 0.3687446117401123, |
| "mean_token_accuracy": 0.8872736170887947, |
| "num_tokens": 41908430.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 0.3609664674848318, |
| "epoch": 1.0694943220421236, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.457536569414459e-06, |
| "loss": 0.3871330738067627, |
| "mean_token_accuracy": 0.8852574542164803, |
| "num_tokens": 41985011.0, |
| "step": 2755 |
| }, |
| { |
| "entropy": 0.34639161452651024, |
| "epoch": 1.0714355042220713, |
| "grad_norm": 1.4375, |
| "learning_rate": 7.448406548480063e-06, |
| "loss": 0.3695810794830322, |
| "mean_token_accuracy": 0.8897538051009178, |
| "num_tokens": 42048911.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 0.32954322583973406, |
| "epoch": 1.0733766864020189, |
| "grad_norm": 1.5, |
| "learning_rate": 7.439265776171611e-06, |
| "loss": 0.3176077365875244, |
| "mean_token_accuracy": 0.8935502767562866, |
| "num_tokens": 42120003.0, |
| "step": 2765 |
| }, |
| { |
| "entropy": 0.37422714903950693, |
| "epoch": 1.0753178685819664, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.430114292627808e-06, |
| "loss": 0.350958251953125, |
| "mean_token_accuracy": 0.8825857222080231, |
| "num_tokens": 42189503.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 0.35138509757816794, |
| "epoch": 1.077259050761914, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.420952138034392e-06, |
| "loss": 0.3909478187561035, |
| "mean_token_accuracy": 0.8854555234313011, |
| "num_tokens": 42251724.0, |
| "step": 2775 |
| }, |
| { |
| "entropy": 0.3574231918901205, |
| "epoch": 1.0792002329418615, |
| "grad_norm": 1.3125, |
| "learning_rate": 7.411779352623958e-06, |
| "loss": 0.36853466033935545, |
| "mean_token_accuracy": 0.8846323460340499, |
| "num_tokens": 42328165.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 0.3312282390892506, |
| "epoch": 1.0811414151218093, |
| "grad_norm": 1.265625, |
| "learning_rate": 7.402595976675785e-06, |
| "loss": 0.34425904750823977, |
| "mean_token_accuracy": 0.8917144045233727, |
| "num_tokens": 42416101.0, |
| "step": 2785 |
| }, |
| { |
| "entropy": 0.38315938860177995, |
| "epoch": 1.0830825973017568, |
| "grad_norm": 1.4375, |
| "learning_rate": 7.393402050515652e-06, |
| "loss": 0.41192307472229006, |
| "mean_token_accuracy": 0.8757176354527474, |
| "num_tokens": 42490192.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 0.3734312802553177, |
| "epoch": 1.0850237794817044, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.384197614515672e-06, |
| "loss": 0.3864989519119263, |
| "mean_token_accuracy": 0.8778089836239815, |
| "num_tokens": 42579078.0, |
| "step": 2795 |
| }, |
| { |
| "entropy": 0.3602697692811489, |
| "epoch": 1.086964961661652, |
| "grad_norm": 1.515625, |
| "learning_rate": 7.3749827090941074e-06, |
| "loss": 0.4144554615020752, |
| "mean_token_accuracy": 0.8825942382216454, |
| "num_tokens": 42656738.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.086964961661652, |
| "eval_entropy": 0.3603161519775561, |
| "eval_loss": 0.3686440587043762, |
| "eval_mean_token_accuracy": 0.8834110738205987, |
| "eval_num_tokens": 42656738.0, |
| "eval_runtime": 60.0795, |
| "eval_samples_per_second": 35.769, |
| "eval_steps_per_second": 35.769, |
| "step": 2800 |
| }, |
| { |
| "entropy": 0.37382765375077726, |
| "epoch": 1.0889061438415994, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.365757374715188e-06, |
| "loss": 0.4022432804107666, |
| "mean_token_accuracy": 0.878273893892765, |
| "num_tokens": 42727874.0, |
| "step": 2805 |
| }, |
| { |
| "entropy": 0.3878149565309286, |
| "epoch": 1.0908473260215472, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.356521651888946e-06, |
| "loss": 0.4143357276916504, |
| "mean_token_accuracy": 0.8759766072034836, |
| "num_tokens": 42805952.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 0.3720662288367748, |
| "epoch": 1.0927885082014948, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.347275581171027e-06, |
| "loss": 0.3936682939529419, |
| "mean_token_accuracy": 0.8800241187214851, |
| "num_tokens": 42879670.0, |
| "step": 2815 |
| }, |
| { |
| "entropy": 0.3727020751684904, |
| "epoch": 1.0947296903814423, |
| "grad_norm": 1.53125, |
| "learning_rate": 7.338019203162516e-06, |
| "loss": 0.40426788330078123, |
| "mean_token_accuracy": 0.8778004497289658, |
| "num_tokens": 42966404.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 0.37748123742640016, |
| "epoch": 1.0966708725613898, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.3287525585097615e-06, |
| "loss": 0.3956634044647217, |
| "mean_token_accuracy": 0.877564987540245, |
| "num_tokens": 43043314.0, |
| "step": 2825 |
| }, |
| { |
| "entropy": 0.3898195032030344, |
| "epoch": 1.0986120547413374, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.319475687904193e-06, |
| "loss": 0.39679808616638185, |
| "mean_token_accuracy": 0.8783272713422775, |
| "num_tokens": 43108948.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 0.3532901670783758, |
| "epoch": 1.1005532369212851, |
| "grad_norm": 1.5859375, |
| "learning_rate": 7.310188632082145e-06, |
| "loss": 0.3547484874725342, |
| "mean_token_accuracy": 0.8868882149457932, |
| "num_tokens": 43182120.0, |
| "step": 2835 |
| }, |
| { |
| "entropy": 0.38809507302939894, |
| "epoch": 1.1024944191012327, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.300891431824673e-06, |
| "loss": 0.4074056148529053, |
| "mean_token_accuracy": 0.8722813636064529, |
| "num_tokens": 43263374.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 0.35386649817228316, |
| "epoch": 1.1044356012811802, |
| "grad_norm": 1.265625, |
| "learning_rate": 7.291584127957384e-06, |
| "loss": 0.3566242456436157, |
| "mean_token_accuracy": 0.8883859798312187, |
| "num_tokens": 43334896.0, |
| "step": 2845 |
| }, |
| { |
| "entropy": 0.37594650611281394, |
| "epoch": 1.1063767834611278, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.282266761350249e-06, |
| "loss": 0.3671935319900513, |
| "mean_token_accuracy": 0.8861946225166321, |
| "num_tokens": 43395764.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 0.37138679772615435, |
| "epoch": 1.1083179656410753, |
| "grad_norm": 1.3359375, |
| "learning_rate": 7.272939372917427e-06, |
| "loss": 0.3758493185043335, |
| "mean_token_accuracy": 0.8805965319275856, |
| "num_tokens": 43483273.0, |
| "step": 2855 |
| }, |
| { |
| "entropy": 0.3592002343386412, |
| "epoch": 1.110259147821023, |
| "grad_norm": 1.609375, |
| "learning_rate": 7.263602003617083e-06, |
| "loss": 0.36438665390014646, |
| "mean_token_accuracy": 0.8856978788971901, |
| "num_tokens": 43554609.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 0.3399000741541386, |
| "epoch": 1.1122003300009706, |
| "grad_norm": 1.5625, |
| "learning_rate": 7.2542546944512106e-06, |
| "loss": 0.3749422550201416, |
| "mean_token_accuracy": 0.8887553334236145, |
| "num_tokens": 43626500.0, |
| "step": 2865 |
| }, |
| { |
| "entropy": 0.40647769123315813, |
| "epoch": 1.1141415121809182, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.244897486465451e-06, |
| "loss": 0.43062515258789064, |
| "mean_token_accuracy": 0.8718539297580719, |
| "num_tokens": 43696284.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 0.3513967592269182, |
| "epoch": 1.1160826943608657, |
| "grad_norm": 1.3984375, |
| "learning_rate": 7.2355304207489154e-06, |
| "loss": 0.35802536010742186, |
| "mean_token_accuracy": 0.88879015147686, |
| "num_tokens": 43768064.0, |
| "step": 2875 |
| }, |
| { |
| "entropy": 0.3477201282978058, |
| "epoch": 1.1180238765408133, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.226153538433996e-06, |
| "loss": 0.37060644626617434, |
| "mean_token_accuracy": 0.8868695870041847, |
| "num_tokens": 43841997.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 0.3926592905074358, |
| "epoch": 1.119965058720761, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.216766880696199e-06, |
| "loss": 0.4085033893585205, |
| "mean_token_accuracy": 0.8761053428053855, |
| "num_tokens": 43920068.0, |
| "step": 2885 |
| }, |
| { |
| "entropy": 0.34527620263397696, |
| "epoch": 1.1219062409007086, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.207370488753949e-06, |
| "loss": 0.35770795345306394, |
| "mean_token_accuracy": 0.8901533395051956, |
| "num_tokens": 43989892.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 0.3740640126168728, |
| "epoch": 1.123847423080656, |
| "grad_norm": 1.375, |
| "learning_rate": 7.197964403868421e-06, |
| "loss": 0.39128780364990234, |
| "mean_token_accuracy": 0.8786321595311165, |
| "num_tokens": 44067873.0, |
| "step": 2895 |
| }, |
| { |
| "entropy": 0.3401097748428583, |
| "epoch": 1.1257886052606036, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.188548667343347e-06, |
| "loss": 0.357515287399292, |
| "mean_token_accuracy": 0.8893807768821717, |
| "num_tokens": 44142522.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 0.39471787922084334, |
| "epoch": 1.1277297874405514, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.179123320524848e-06, |
| "loss": 0.3968302488327026, |
| "mean_token_accuracy": 0.8748754128813744, |
| "num_tokens": 44228212.0, |
| "step": 2905 |
| }, |
| { |
| "entropy": 0.4300340283662081, |
| "epoch": 1.129670969620499, |
| "grad_norm": 1.5546875, |
| "learning_rate": 7.169688404801241e-06, |
| "loss": 0.4560871124267578, |
| "mean_token_accuracy": 0.8694811254739762, |
| "num_tokens": 44294114.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 0.3595341399312019, |
| "epoch": 1.1316121518004465, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.160243961602863e-06, |
| "loss": 0.3778635025024414, |
| "mean_token_accuracy": 0.8835319861769676, |
| "num_tokens": 44355840.0, |
| "step": 2915 |
| }, |
| { |
| "entropy": 0.42457632496953013, |
| "epoch": 1.133553333980394, |
| "grad_norm": 2.375, |
| "learning_rate": 7.150790032401887e-06, |
| "loss": 0.4247872829437256, |
| "mean_token_accuracy": 0.8706662476062774, |
| "num_tokens": 44426127.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 0.4208326905965805, |
| "epoch": 1.1354945161603416, |
| "grad_norm": 1.5703125, |
| "learning_rate": 7.1413266587121434e-06, |
| "loss": 0.42088823318481444, |
| "mean_token_accuracy": 0.871395905315876, |
| "num_tokens": 44497833.0, |
| "step": 2925 |
| }, |
| { |
| "entropy": 0.356390430778265, |
| "epoch": 1.1374356983402891, |
| "grad_norm": 1.4765625, |
| "learning_rate": 7.13185388208893e-06, |
| "loss": 0.37389678955078126, |
| "mean_token_accuracy": 0.8866265177726745, |
| "num_tokens": 44582547.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 0.3736116912215948, |
| "epoch": 1.139376880520237, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.122371744128839e-06, |
| "loss": 0.3963154792785645, |
| "mean_token_accuracy": 0.8842655003070832, |
| "num_tokens": 44655812.0, |
| "step": 2935 |
| }, |
| { |
| "entropy": 0.43113922215998174, |
| "epoch": 1.1413180627001844, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.112880286469568e-06, |
| "loss": 0.42786569595336915, |
| "mean_token_accuracy": 0.8678357198834419, |
| "num_tokens": 44730175.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 0.34794704206287863, |
| "epoch": 1.143259244880132, |
| "grad_norm": 1.4453125, |
| "learning_rate": 7.103379550789741e-06, |
| "loss": 0.35416512489318847, |
| "mean_token_accuracy": 0.8896363779902459, |
| "num_tokens": 44794755.0, |
| "step": 2945 |
| }, |
| { |
| "entropy": 0.40246716812253, |
| "epoch": 1.1452004270600795, |
| "grad_norm": 1.5234375, |
| "learning_rate": 7.093869578808719e-06, |
| "loss": 0.41913704872131347, |
| "mean_token_accuracy": 0.868728120625019, |
| "num_tokens": 44866536.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 0.3885481279343367, |
| "epoch": 1.1471416092400273, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.084350412286424e-06, |
| "loss": 0.40956454277038573, |
| "mean_token_accuracy": 0.8750770896673202, |
| "num_tokens": 44942021.0, |
| "step": 2955 |
| }, |
| { |
| "entropy": 0.3983582962304354, |
| "epoch": 1.1490827914199748, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.074822093023154e-06, |
| "loss": 0.4057170391082764, |
| "mean_token_accuracy": 0.8758019611239434, |
| "num_tokens": 45016428.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 0.3897486738860607, |
| "epoch": 1.1510239735999224, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.065284662859395e-06, |
| "loss": 0.4297188282012939, |
| "mean_token_accuracy": 0.8763071224093437, |
| "num_tokens": 45082753.0, |
| "step": 2965 |
| }, |
| { |
| "entropy": 0.3621146373450756, |
| "epoch": 1.15296515577987, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.055738163675645e-06, |
| "loss": 0.35830867290496826, |
| "mean_token_accuracy": 0.8859908595681191, |
| "num_tokens": 45147776.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 0.40969758927822114, |
| "epoch": 1.1549063379598175, |
| "grad_norm": 1.5234375, |
| "learning_rate": 7.046182637392221e-06, |
| "loss": 0.3900305271148682, |
| "mean_token_accuracy": 0.8708938717842102, |
| "num_tokens": 45223891.0, |
| "step": 2975 |
| }, |
| { |
| "entropy": 0.37424799539148806, |
| "epoch": 1.156847520139765, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.036618125969081e-06, |
| "loss": 0.3869047164916992, |
| "mean_token_accuracy": 0.8798339098691941, |
| "num_tokens": 45294331.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 0.38800337798893453, |
| "epoch": 1.1587887023197128, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.027044671405643e-06, |
| "loss": 0.3859901428222656, |
| "mean_token_accuracy": 0.8810177177190781, |
| "num_tokens": 45348349.0, |
| "step": 2985 |
| }, |
| { |
| "entropy": 0.3485205162316561, |
| "epoch": 1.1607298844996603, |
| "grad_norm": 1.4296875, |
| "learning_rate": 7.017462315740586e-06, |
| "loss": 0.3649015188217163, |
| "mean_token_accuracy": 0.8860207587480545, |
| "num_tokens": 45425964.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 0.37424386143684385, |
| "epoch": 1.1626710666796078, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.007871101051686e-06, |
| "loss": 0.3772335767745972, |
| "mean_token_accuracy": 0.8799161404371262, |
| "num_tokens": 45500293.0, |
| "step": 2995 |
| }, |
| { |
| "entropy": 0.38352062441408635, |
| "epoch": 1.1646122488595554, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.998271069455612e-06, |
| "loss": 0.40156922340393064, |
| "mean_token_accuracy": 0.8821331828832626, |
| "num_tokens": 45571727.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.1646122488595554, |
| "eval_entropy": 0.3600764439036925, |
| "eval_loss": 0.36817678809165955, |
| "eval_mean_token_accuracy": 0.883426463426463, |
| "eval_num_tokens": 45571727.0, |
| "eval_runtime": 60.1453, |
| "eval_samples_per_second": 35.73, |
| "eval_steps_per_second": 35.73, |
| "step": 3000 |
| }, |
| { |
| "entropy": 0.4300002858042717, |
| "epoch": 1.1665534310395032, |
| "grad_norm": 1.4296875, |
| "learning_rate": 6.988662263107755e-06, |
| "loss": 0.4532319068908691, |
| "mean_token_accuracy": 0.8672842562198639, |
| "num_tokens": 45648648.0, |
| "step": 3005 |
| }, |
| { |
| "entropy": 0.3812822367995977, |
| "epoch": 1.1684946132194507, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.979044724202034e-06, |
| "loss": 0.39993724822998045, |
| "mean_token_accuracy": 0.8782149285078049, |
| "num_tokens": 45743015.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 0.41226282604038716, |
| "epoch": 1.1704357953993982, |
| "grad_norm": 1.5234375, |
| "learning_rate": 6.969418494970717e-06, |
| "loss": 0.4353823661804199, |
| "mean_token_accuracy": 0.8674470081925392, |
| "num_tokens": 45826008.0, |
| "step": 3015 |
| }, |
| { |
| "entropy": 0.37322292029857634, |
| "epoch": 1.1723769775793458, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.9597836176842315e-06, |
| "loss": 0.4075223445892334, |
| "mean_token_accuracy": 0.8766345664858818, |
| "num_tokens": 45907989.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 0.3779715023934841, |
| "epoch": 1.1743181597592933, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.9501401346509786e-06, |
| "loss": 0.4066688060760498, |
| "mean_token_accuracy": 0.8800197467207909, |
| "num_tokens": 45976593.0, |
| "step": 3025 |
| }, |
| { |
| "entropy": 0.364545364305377, |
| "epoch": 1.176259341939241, |
| "grad_norm": 1.453125, |
| "learning_rate": 6.940488088217152e-06, |
| "loss": 0.37837910652160645, |
| "mean_token_accuracy": 0.8811485067009925, |
| "num_tokens": 46067425.0, |
| "step": 3030 |
| }, |
| { |
| "entropy": 0.34584682770073416, |
| "epoch": 1.1782005241191886, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.930827520766544e-06, |
| "loss": 0.3524082899093628, |
| "mean_token_accuracy": 0.8913029715418815, |
| "num_tokens": 46141435.0, |
| "step": 3035 |
| }, |
| { |
| "entropy": 0.38420435078442094, |
| "epoch": 1.1801417062991362, |
| "grad_norm": 1.7890625, |
| "learning_rate": 6.921158474720368e-06, |
| "loss": 0.3806861400604248, |
| "mean_token_accuracy": 0.8749532103538513, |
| "num_tokens": 46222095.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 0.376201831176877, |
| "epoch": 1.1820828884790837, |
| "grad_norm": 1.5625, |
| "learning_rate": 6.911480992537072e-06, |
| "loss": 0.4178003311157227, |
| "mean_token_accuracy": 0.8752377212047577, |
| "num_tokens": 46312000.0, |
| "step": 3045 |
| }, |
| { |
| "entropy": 0.38689825385808946, |
| "epoch": 1.1840240706590313, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.901795116712136e-06, |
| "loss": 0.40619282722473143, |
| "mean_token_accuracy": 0.8773537456989289, |
| "num_tokens": 46381015.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 0.39098729118704795, |
| "epoch": 1.185965252838979, |
| "grad_norm": 1.546875, |
| "learning_rate": 6.892100889777913e-06, |
| "loss": 0.42108306884765623, |
| "mean_token_accuracy": 0.8786390334367752, |
| "num_tokens": 46464894.0, |
| "step": 3055 |
| }, |
| { |
| "entropy": 0.3601615995168686, |
| "epoch": 1.1879064350189266, |
| "grad_norm": 1.5546875, |
| "learning_rate": 6.882398354303416e-06, |
| "loss": 0.3870659351348877, |
| "mean_token_accuracy": 0.8846402570605278, |
| "num_tokens": 46545575.0, |
| "step": 3060 |
| }, |
| { |
| "entropy": 0.3909476988017559, |
| "epoch": 1.189847617198874, |
| "grad_norm": 2.125, |
| "learning_rate": 6.872687552894145e-06, |
| "loss": 0.3942322969436646, |
| "mean_token_accuracy": 0.8762227043509483, |
| "num_tokens": 46620397.0, |
| "step": 3065 |
| }, |
| { |
| "entropy": 0.36160071194171906, |
| "epoch": 1.1917887993788217, |
| "grad_norm": 1.484375, |
| "learning_rate": 6.8629685281919025e-06, |
| "loss": 0.35771043300628663, |
| "mean_token_accuracy": 0.8830681905150414, |
| "num_tokens": 46695823.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 0.40609239749610426, |
| "epoch": 1.1937299815587692, |
| "grad_norm": 1.4453125, |
| "learning_rate": 6.853241322874593e-06, |
| "loss": 0.40566306114196776, |
| "mean_token_accuracy": 0.8745755672454834, |
| "num_tokens": 46763659.0, |
| "step": 3075 |
| }, |
| { |
| "entropy": 0.39826103691011666, |
| "epoch": 1.195671163738717, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.843505979656049e-06, |
| "loss": 0.42182149887084963, |
| "mean_token_accuracy": 0.878571617603302, |
| "num_tokens": 46827063.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 0.3527070388197899, |
| "epoch": 1.1976123459186645, |
| "grad_norm": 1.7421875, |
| "learning_rate": 6.8337625412858364e-06, |
| "loss": 0.3918677806854248, |
| "mean_token_accuracy": 0.8872169196605683, |
| "num_tokens": 46902371.0, |
| "step": 3085 |
| }, |
| { |
| "entropy": 0.360038623213768, |
| "epoch": 1.199553528098612, |
| "grad_norm": 1.4140625, |
| "learning_rate": 6.824011050549067e-06, |
| "loss": 0.36493072509765623, |
| "mean_token_accuracy": 0.8841731250286102, |
| "num_tokens": 46982779.0, |
| "step": 3090 |
| }, |
| { |
| "entropy": 0.372321966663003, |
| "epoch": 1.2014947102785596, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.814251550266216e-06, |
| "loss": 0.39631216526031493, |
| "mean_token_accuracy": 0.8795213535428047, |
| "num_tokens": 47072025.0, |
| "step": 3095 |
| }, |
| { |
| "entropy": 0.44918127730488777, |
| "epoch": 1.2034358924585074, |
| "grad_norm": 1.5, |
| "learning_rate": 6.8044840832929216e-06, |
| "loss": 0.4901744365692139, |
| "mean_token_accuracy": 0.8591332510113716, |
| "num_tokens": 47134711.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 0.36903586611151695, |
| "epoch": 1.205377074638455, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.794708692519815e-06, |
| "loss": 0.36009137630462645, |
| "mean_token_accuracy": 0.8829508319497108, |
| "num_tokens": 47211803.0, |
| "step": 3105 |
| }, |
| { |
| "entropy": 0.4135138522833586, |
| "epoch": 1.2073182568184024, |
| "grad_norm": 1.5703125, |
| "learning_rate": 6.784925420872315e-06, |
| "loss": 0.4357631683349609, |
| "mean_token_accuracy": 0.8690931290388108, |
| "num_tokens": 47289477.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 0.3864825196564198, |
| "epoch": 1.20925943899835, |
| "grad_norm": 1.5078125, |
| "learning_rate": 6.775134311310449e-06, |
| "loss": 0.3875833034515381, |
| "mean_token_accuracy": 0.8817471221089364, |
| "num_tokens": 47361495.0, |
| "step": 3115 |
| }, |
| { |
| "entropy": 0.3780834227800369, |
| "epoch": 1.2112006211782975, |
| "grad_norm": 2.578125, |
| "learning_rate": 6.765335406828664e-06, |
| "loss": 0.4267258167266846, |
| "mean_token_accuracy": 0.8809913843870163, |
| "num_tokens": 47423556.0, |
| "step": 3120 |
| }, |
| { |
| "entropy": 0.36974840685725213, |
| "epoch": 1.213141803358245, |
| "grad_norm": 1.75, |
| "learning_rate": 6.755528750455634e-06, |
| "loss": 0.36568589210510255, |
| "mean_token_accuracy": 0.8837969750165939, |
| "num_tokens": 47502054.0, |
| "step": 3125 |
| }, |
| { |
| "entropy": 0.40455227382481096, |
| "epoch": 1.2150829855381928, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.745714385254072e-06, |
| "loss": 0.4230593204498291, |
| "mean_token_accuracy": 0.8713468372821808, |
| "num_tokens": 47576658.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 0.3805558536201715, |
| "epoch": 1.2170241677181404, |
| "grad_norm": 1.75, |
| "learning_rate": 6.735892354320544e-06, |
| "loss": 0.38716301918029783, |
| "mean_token_accuracy": 0.8806325614452362, |
| "num_tokens": 47646232.0, |
| "step": 3135 |
| }, |
| { |
| "entropy": 0.36968096643686293, |
| "epoch": 1.218965349898088, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.726062700785273e-06, |
| "loss": 0.39945073127746583, |
| "mean_token_accuracy": 0.8774180024862289, |
| "num_tokens": 47741132.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 0.35996747594326733, |
| "epoch": 1.2209065320780355, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.716225467811961e-06, |
| "loss": 0.37158637046813964, |
| "mean_token_accuracy": 0.8840801179409027, |
| "num_tokens": 47812661.0, |
| "step": 3145 |
| }, |
| { |
| "entropy": 0.37774690724909304, |
| "epoch": 1.2228477142579832, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.706380698597588e-06, |
| "loss": 0.3942166805267334, |
| "mean_token_accuracy": 0.8794585153460502, |
| "num_tokens": 47883300.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 0.4131374925374985, |
| "epoch": 1.2247888964379308, |
| "grad_norm": 1.6875, |
| "learning_rate": 6.696528436372229e-06, |
| "loss": 0.4139698505401611, |
| "mean_token_accuracy": 0.869027565419674, |
| "num_tokens": 47979410.0, |
| "step": 3155 |
| }, |
| { |
| "entropy": 0.3713895071297884, |
| "epoch": 1.2267300786178783, |
| "grad_norm": 2.390625, |
| "learning_rate": 6.68666872439886e-06, |
| "loss": 0.3731879472732544, |
| "mean_token_accuracy": 0.880556121468544, |
| "num_tokens": 48058170.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 0.3848850384354591, |
| "epoch": 1.2286712607978258, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.67680160597317e-06, |
| "loss": 0.40471110343933103, |
| "mean_token_accuracy": 0.8750801667571068, |
| "num_tokens": 48127200.0, |
| "step": 3165 |
| }, |
| { |
| "entropy": 0.3463645543903112, |
| "epoch": 1.2306124429777734, |
| "grad_norm": 1.6171875, |
| "learning_rate": 6.666927124423374e-06, |
| "loss": 0.3593963623046875, |
| "mean_token_accuracy": 0.8887131616473198, |
| "num_tokens": 48190947.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 0.3704676777124405, |
| "epoch": 1.232553625157721, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.657045323110017e-06, |
| "loss": 0.3847299337387085, |
| "mean_token_accuracy": 0.8824770480394364, |
| "num_tokens": 48268615.0, |
| "step": 3175 |
| }, |
| { |
| "entropy": 0.3637780986726284, |
| "epoch": 1.2344948073376687, |
| "grad_norm": 1.59375, |
| "learning_rate": 6.647156245425789e-06, |
| "loss": 0.3874013423919678, |
| "mean_token_accuracy": 0.8841297894716262, |
| "num_tokens": 48336348.0, |
| "step": 3180 |
| }, |
| { |
| "entropy": 0.35034383423626425, |
| "epoch": 1.2364359895176162, |
| "grad_norm": 1.4765625, |
| "learning_rate": 6.637259934795328e-06, |
| "loss": 0.34986927509307864, |
| "mean_token_accuracy": 0.8901937618851662, |
| "num_tokens": 48406226.0, |
| "step": 3185 |
| }, |
| { |
| "entropy": 0.403315170109272, |
| "epoch": 1.2383771716975638, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.627356434675035e-06, |
| "loss": 0.4066962718963623, |
| "mean_token_accuracy": 0.8722446888685227, |
| "num_tokens": 48490073.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 0.36379750072956085, |
| "epoch": 1.2403183538775113, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.6174457885528855e-06, |
| "loss": 0.3708995819091797, |
| "mean_token_accuracy": 0.8828730553388595, |
| "num_tokens": 48561708.0, |
| "step": 3195 |
| }, |
| { |
| "entropy": 0.35466758720576763, |
| "epoch": 1.242259536057459, |
| "grad_norm": 1.859375, |
| "learning_rate": 6.607528039948226e-06, |
| "loss": 0.36141531467437743, |
| "mean_token_accuracy": 0.8850849062204361, |
| "num_tokens": 48629826.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.242259536057459, |
| "eval_entropy": 0.3607052234181308, |
| "eval_loss": 0.3680853247642517, |
| "eval_mean_token_accuracy": 0.8834105204598413, |
| "eval_num_tokens": 48629826.0, |
| "eval_runtime": 60.0565, |
| "eval_samples_per_second": 35.783, |
| "eval_steps_per_second": 35.783, |
| "step": 3200 |
| }, |
| { |
| "entropy": 0.33488245457410815, |
| "epoch": 1.2442007182374066, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.597603232411597e-06, |
| "loss": 0.40059671401977537, |
| "mean_token_accuracy": 0.8868094369769096, |
| "num_tokens": 48705277.0, |
| "step": 3205 |
| }, |
| { |
| "entropy": 0.36259912960231305, |
| "epoch": 1.2461419004173542, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.587671409524534e-06, |
| "loss": 0.36700074672698973, |
| "mean_token_accuracy": 0.8851820915937424, |
| "num_tokens": 48773921.0, |
| "step": 3210 |
| }, |
| { |
| "entropy": 0.3843076877295971, |
| "epoch": 1.2480830825973017, |
| "grad_norm": 1.59375, |
| "learning_rate": 6.577732614899379e-06, |
| "loss": 0.4054192066192627, |
| "mean_token_accuracy": 0.8755981966853141, |
| "num_tokens": 48859799.0, |
| "step": 3215 |
| }, |
| { |
| "entropy": 0.38492829352617264, |
| "epoch": 1.2500242647772493, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.56778689217909e-06, |
| "loss": 0.39413578510284425, |
| "mean_token_accuracy": 0.8772288784384727, |
| "num_tokens": 48930817.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 0.3711765740066767, |
| "epoch": 1.2519654469571968, |
| "grad_norm": 1.3515625, |
| "learning_rate": 6.5578342850370415e-06, |
| "loss": 0.37443616390228274, |
| "mean_token_accuracy": 0.8799751400947571, |
| "num_tokens": 49002171.0, |
| "step": 3225 |
| }, |
| { |
| "entropy": 0.3800849601626396, |
| "epoch": 1.2539066291371446, |
| "grad_norm": 1.5234375, |
| "learning_rate": 6.547874837176847e-06, |
| "loss": 0.3951963186264038, |
| "mean_token_accuracy": 0.88048807233572, |
| "num_tokens": 49073741.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 0.3790770899504423, |
| "epoch": 1.255847811317092, |
| "grad_norm": 1.4765625, |
| "learning_rate": 6.537908592332147e-06, |
| "loss": 0.40506410598754883, |
| "mean_token_accuracy": 0.8772617995738983, |
| "num_tokens": 49148184.0, |
| "step": 3235 |
| }, |
| { |
| "entropy": 0.3464452028274536, |
| "epoch": 1.2577889934970397, |
| "grad_norm": 1.484375, |
| "learning_rate": 6.5279355942664435e-06, |
| "loss": 0.3766259908676147, |
| "mean_token_accuracy": 0.8875595390796661, |
| "num_tokens": 49218339.0, |
| "step": 3240 |
| }, |
| { |
| "entropy": 0.3986640240997076, |
| "epoch": 1.2597301756769874, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.51795588677288e-06, |
| "loss": 0.3935344696044922, |
| "mean_token_accuracy": 0.87731524258852, |
| "num_tokens": 49279139.0, |
| "step": 3245 |
| }, |
| { |
| "entropy": 0.3633753590285778, |
| "epoch": 1.261671357856935, |
| "grad_norm": 1.6953125, |
| "learning_rate": 6.5079695136740706e-06, |
| "loss": 0.3786989688873291, |
| "mean_token_accuracy": 0.8843644946813584, |
| "num_tokens": 49352390.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 0.38015848845243455, |
| "epoch": 1.2636125400368825, |
| "grad_norm": 1.3203125, |
| "learning_rate": 6.497976518821896e-06, |
| "loss": 0.4066456317901611, |
| "mean_token_accuracy": 0.878155305981636, |
| "num_tokens": 49446216.0, |
| "step": 3255 |
| }, |
| { |
| "entropy": 0.39295368976891043, |
| "epoch": 1.26555372221683, |
| "grad_norm": 1.7578125, |
| "learning_rate": 6.487976946097314e-06, |
| "loss": 0.3828210115432739, |
| "mean_token_accuracy": 0.8781291946768761, |
| "num_tokens": 49523614.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 0.3769407343119383, |
| "epoch": 1.2674949043967776, |
| "grad_norm": 1.40625, |
| "learning_rate": 6.477970839410166e-06, |
| "loss": 0.40603952407836913, |
| "mean_token_accuracy": 0.8798100754618645, |
| "num_tokens": 49591786.0, |
| "step": 3265 |
| }, |
| { |
| "entropy": 0.3537591304630041, |
| "epoch": 1.2694360865767251, |
| "grad_norm": 1.25, |
| "learning_rate": 6.46795824269899e-06, |
| "loss": 0.3576634407043457, |
| "mean_token_accuracy": 0.8874867498874665, |
| "num_tokens": 49663637.0, |
| "step": 3270 |
| }, |
| { |
| "entropy": 0.35550354048609734, |
| "epoch": 1.271377268756673, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.457939199930815e-06, |
| "loss": 0.39648468494415284, |
| "mean_token_accuracy": 0.8848752856254578, |
| "num_tokens": 49731508.0, |
| "step": 3275 |
| }, |
| { |
| "entropy": 0.3586549339815974, |
| "epoch": 1.2733184509366204, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.4479137551009855e-06, |
| "loss": 0.3832548141479492, |
| "mean_token_accuracy": 0.8830386832356453, |
| "num_tokens": 49813544.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 0.3636878037825227, |
| "epoch": 1.275259633116568, |
| "grad_norm": 1.640625, |
| "learning_rate": 6.437881952232947e-06, |
| "loss": 0.3801161766052246, |
| "mean_token_accuracy": 0.8825449839234352, |
| "num_tokens": 49885620.0, |
| "step": 3285 |
| }, |
| { |
| "entropy": 0.3816155593842268, |
| "epoch": 1.2772008152965155, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.427843835378074e-06, |
| "loss": 0.3867227554321289, |
| "mean_token_accuracy": 0.87795270383358, |
| "num_tokens": 49964261.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 0.33171985633671286, |
| "epoch": 1.2791419974764633, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.417799448615465e-06, |
| "loss": 0.3791377544403076, |
| "mean_token_accuracy": 0.8870480135083199, |
| "num_tokens": 50051116.0, |
| "step": 3295 |
| }, |
| { |
| "entropy": 0.33659284114837645, |
| "epoch": 1.2810831796564108, |
| "grad_norm": 1.59375, |
| "learning_rate": 6.407748836051746e-06, |
| "loss": 0.35617640018463137, |
| "mean_token_accuracy": 0.8889725834131241, |
| "num_tokens": 50125047.0, |
| "step": 3300 |
| }, |
| { |
| "entropy": 0.3994648285210133, |
| "epoch": 1.2830243618363584, |
| "grad_norm": 2.0, |
| "learning_rate": 6.397692041820885e-06, |
| "loss": 0.37363758087158205, |
| "mean_token_accuracy": 0.8738816857337952, |
| "num_tokens": 50185527.0, |
| "step": 3305 |
| }, |
| { |
| "entropy": 0.3728124268352985, |
| "epoch": 1.284965544016306, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.387629110083995e-06, |
| "loss": 0.37665843963623047, |
| "mean_token_accuracy": 0.8807444587349892, |
| "num_tokens": 50267257.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 0.39132245220243933, |
| "epoch": 1.2869067261962535, |
| "grad_norm": 1.6015625, |
| "learning_rate": 6.377560085029139e-06, |
| "loss": 0.3918001651763916, |
| "mean_token_accuracy": 0.8778573974967003, |
| "num_tokens": 50339377.0, |
| "step": 3315 |
| }, |
| { |
| "entropy": 0.3549855757504702, |
| "epoch": 1.288847908376201, |
| "grad_norm": 1.375, |
| "learning_rate": 6.367485010871136e-06, |
| "loss": 0.3473607301712036, |
| "mean_token_accuracy": 0.8883946269750596, |
| "num_tokens": 50412920.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 0.40118363983929156, |
| "epoch": 1.2907890905561488, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.35740393185137e-06, |
| "loss": 0.4183527946472168, |
| "mean_token_accuracy": 0.8735328048467637, |
| "num_tokens": 50493105.0, |
| "step": 3325 |
| }, |
| { |
| "entropy": 0.3577863838523626, |
| "epoch": 1.2927302727360963, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.347316892237592e-06, |
| "loss": 0.36974031925201417, |
| "mean_token_accuracy": 0.8823448717594147, |
| "num_tokens": 50597292.0, |
| "step": 3330 |
| }, |
| { |
| "entropy": 0.4042118158191442, |
| "epoch": 1.2946714549160439, |
| "grad_norm": 1.4609375, |
| "learning_rate": 6.3372239363237255e-06, |
| "loss": 0.3996162414550781, |
| "mean_token_accuracy": 0.8742659211158752, |
| "num_tokens": 50669226.0, |
| "step": 3335 |
| }, |
| { |
| "entropy": 0.38877438604831693, |
| "epoch": 1.2966126370959914, |
| "grad_norm": 1.4140625, |
| "learning_rate": 6.327125108429677e-06, |
| "loss": 0.3838073492050171, |
| "mean_token_accuracy": 0.8778223499655724, |
| "num_tokens": 50740937.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 0.3652618743479252, |
| "epoch": 1.2985538192759392, |
| "grad_norm": 1.609375, |
| "learning_rate": 6.317020452901134e-06, |
| "loss": 0.40174212455749514, |
| "mean_token_accuracy": 0.8852205485105514, |
| "num_tokens": 50800354.0, |
| "step": 3345 |
| }, |
| { |
| "entropy": 0.36313771940767764, |
| "epoch": 1.3004950014558867, |
| "grad_norm": 1.75, |
| "learning_rate": 6.3069100141093755e-06, |
| "loss": 0.40732836723327637, |
| "mean_token_accuracy": 0.8836523965001106, |
| "num_tokens": 50874364.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 0.3877572625875473, |
| "epoch": 1.3024361836358342, |
| "grad_norm": 1.4609375, |
| "learning_rate": 6.2967938364510794e-06, |
| "loss": 0.3883176326751709, |
| "mean_token_accuracy": 0.8740043297410012, |
| "num_tokens": 50953290.0, |
| "step": 3355 |
| }, |
| { |
| "entropy": 0.3541896607726812, |
| "epoch": 1.3043773658157818, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.2866719643481185e-06, |
| "loss": 0.40380287170410156, |
| "mean_token_accuracy": 0.8862088546156883, |
| "num_tokens": 51013828.0, |
| "step": 3360 |
| }, |
| { |
| "entropy": 0.3846803639084101, |
| "epoch": 1.3063185479957293, |
| "grad_norm": 1.5546875, |
| "learning_rate": 6.2765444422473735e-06, |
| "loss": 0.4024141788482666, |
| "mean_token_accuracy": 0.8768733203411102, |
| "num_tokens": 51088099.0, |
| "step": 3365 |
| }, |
| { |
| "entropy": 0.382393941283226, |
| "epoch": 1.3082597301756769, |
| "grad_norm": 1.5390625, |
| "learning_rate": 6.2664113146205355e-06, |
| "loss": 0.4033693313598633, |
| "mean_token_accuracy": 0.8758635804057121, |
| "num_tokens": 51174151.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 0.35187431797385216, |
| "epoch": 1.3102009123556246, |
| "grad_norm": 1.53125, |
| "learning_rate": 6.256272625963908e-06, |
| "loss": 0.3925636291503906, |
| "mean_token_accuracy": 0.8831515818834305, |
| "num_tokens": 51253871.0, |
| "step": 3375 |
| }, |
| { |
| "entropy": 0.3575460772961378, |
| "epoch": 1.3121420945355722, |
| "grad_norm": 1.7578125, |
| "learning_rate": 6.24612842079822e-06, |
| "loss": 0.3699699878692627, |
| "mean_token_accuracy": 0.8861239358782769, |
| "num_tokens": 51320927.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 0.3574929475784302, |
| "epoch": 1.3140832767155197, |
| "grad_norm": 1.40625, |
| "learning_rate": 6.235978743668415e-06, |
| "loss": 0.3928325653076172, |
| "mean_token_accuracy": 0.8840924382209778, |
| "num_tokens": 51393313.0, |
| "step": 3385 |
| }, |
| { |
| "entropy": 0.4037714671343565, |
| "epoch": 1.3160244588954673, |
| "grad_norm": 1.75, |
| "learning_rate": 6.2258236391434735e-06, |
| "loss": 0.43996176719665525, |
| "mean_token_accuracy": 0.8732839792966842, |
| "num_tokens": 51469149.0, |
| "step": 3390 |
| }, |
| { |
| "entropy": 0.3767301281914115, |
| "epoch": 1.317965641075415, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.215663151816204e-06, |
| "loss": 0.41208739280700685, |
| "mean_token_accuracy": 0.8773611128330231, |
| "num_tokens": 51549599.0, |
| "step": 3395 |
| }, |
| { |
| "entropy": 0.41424218341708186, |
| "epoch": 1.3199068232553626, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.205497326303054e-06, |
| "loss": 0.4277363300323486, |
| "mean_token_accuracy": 0.8679974019527436, |
| "num_tokens": 51642096.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.3199068232553626, |
| "eval_entropy": 0.3615535780503174, |
| "eval_loss": 0.36774712800979614, |
| "eval_mean_token_accuracy": 0.8836295662535352, |
| "eval_num_tokens": 51642096.0, |
| "eval_runtime": 60.122, |
| "eval_samples_per_second": 35.744, |
| "eval_steps_per_second": 35.744, |
| "step": 3400 |
| }, |
| { |
| "entropy": 0.3396712843328714, |
| "epoch": 1.3218480054353101, |
| "grad_norm": 2.0, |
| "learning_rate": 6.1953262072439104e-06, |
| "loss": 0.36101136207580564, |
| "mean_token_accuracy": 0.8887990996241569, |
| "num_tokens": 51720729.0, |
| "step": 3405 |
| }, |
| { |
| "entropy": 0.37931633833795786, |
| "epoch": 1.3237891876152577, |
| "grad_norm": 1.5625, |
| "learning_rate": 6.185149839301904e-06, |
| "loss": 0.4054765224456787, |
| "mean_token_accuracy": 0.8779459938406944, |
| "num_tokens": 51794420.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 0.3843179401010275, |
| "epoch": 1.3257303697952052, |
| "grad_norm": 1.5, |
| "learning_rate": 6.1749682671632185e-06, |
| "loss": 0.40850515365600587, |
| "mean_token_accuracy": 0.8769529685378075, |
| "num_tokens": 51877828.0, |
| "step": 3415 |
| }, |
| { |
| "entropy": 0.3709686040878296, |
| "epoch": 1.3276715519751527, |
| "grad_norm": 1.4609375, |
| "learning_rate": 6.1647815355368845e-06, |
| "loss": 0.38035385608673095, |
| "mean_token_accuracy": 0.8827380672097206, |
| "num_tokens": 51949655.0, |
| "step": 3420 |
| }, |
| { |
| "entropy": 0.3752392638474703, |
| "epoch": 1.3296127341551005, |
| "grad_norm": 1.6328125, |
| "learning_rate": 6.154589689154594e-06, |
| "loss": 0.36831343173980713, |
| "mean_token_accuracy": 0.8784654662013054, |
| "num_tokens": 52034397.0, |
| "step": 3425 |
| }, |
| { |
| "entropy": 0.4212960582226515, |
| "epoch": 1.331553916335048, |
| "grad_norm": 1.921875, |
| "learning_rate": 6.144392772770498e-06, |
| "loss": 0.4582382678985596, |
| "mean_token_accuracy": 0.8687367781996727, |
| "num_tokens": 52108021.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 0.3721700422465801, |
| "epoch": 1.3334950985149956, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.134190831161004e-06, |
| "loss": 0.39261841773986816, |
| "mean_token_accuracy": 0.8782809346914291, |
| "num_tokens": 52189452.0, |
| "step": 3435 |
| }, |
| { |
| "entropy": 0.41867862418293955, |
| "epoch": 1.3354362806949434, |
| "grad_norm": 1.5703125, |
| "learning_rate": 6.123983909124597e-06, |
| "loss": 0.4275325298309326, |
| "mean_token_accuracy": 0.8698067650198936, |
| "num_tokens": 52269123.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 0.32477850653231144, |
| "epoch": 1.337377462874891, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.113772051481622e-06, |
| "loss": 0.3294957399368286, |
| "mean_token_accuracy": 0.8947040095925332, |
| "num_tokens": 52329549.0, |
| "step": 3445 |
| }, |
| { |
| "entropy": 0.3995737452059984, |
| "epoch": 1.3393186450548384, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.103555303074105e-06, |
| "loss": 0.42353267669677735, |
| "mean_token_accuracy": 0.8714441776275634, |
| "num_tokens": 52432623.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 0.3620085157454014, |
| "epoch": 1.341259827234786, |
| "grad_norm": 1.5234375, |
| "learning_rate": 6.093333708765541e-06, |
| "loss": 0.37543137073516847, |
| "mean_token_accuracy": 0.8843591079115868, |
| "num_tokens": 52505394.0, |
| "step": 3455 |
| }, |
| { |
| "entropy": 0.40583874434232714, |
| "epoch": 1.3432010094147335, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.08310731344071e-06, |
| "loss": 0.40748982429504393, |
| "mean_token_accuracy": 0.8733288407325744, |
| "num_tokens": 52574341.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 0.37172621488571167, |
| "epoch": 1.345142191594681, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.072876162005474e-06, |
| "loss": 0.3841069221496582, |
| "mean_token_accuracy": 0.8812505900859833, |
| "num_tokens": 52650739.0, |
| "step": 3465 |
| }, |
| { |
| "entropy": 0.36620298847556115, |
| "epoch": 1.3470833737746286, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.062640299386573e-06, |
| "loss": 0.37988154888153075, |
| "mean_token_accuracy": 0.8840025961399078, |
| "num_tokens": 52730667.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 0.37304456941783426, |
| "epoch": 1.3490245559545764, |
| "grad_norm": 1.6953125, |
| "learning_rate": 6.052399770531441e-06, |
| "loss": 0.3846965551376343, |
| "mean_token_accuracy": 0.8781289547681809, |
| "num_tokens": 52816922.0, |
| "step": 3475 |
| }, |
| { |
| "entropy": 0.37507508173584936, |
| "epoch": 1.350965738134524, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.042154620408003e-06, |
| "loss": 0.39843082427978516, |
| "mean_token_accuracy": 0.8789984509348869, |
| "num_tokens": 52893991.0, |
| "step": 3480 |
| }, |
| { |
| "entropy": 0.33831214234232904, |
| "epoch": 1.3529069203144715, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.0319048940044715e-06, |
| "loss": 0.35968937873840334, |
| "mean_token_accuracy": 0.8883291095495224, |
| "num_tokens": 52959658.0, |
| "step": 3485 |
| }, |
| { |
| "entropy": 0.39217614494264125, |
| "epoch": 1.3548481024944192, |
| "grad_norm": 1.5234375, |
| "learning_rate": 6.021650636329159e-06, |
| "loss": 0.395078182220459, |
| "mean_token_accuracy": 0.8747363820672035, |
| "num_tokens": 53030794.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 0.36045850329101087, |
| "epoch": 1.3567892846743668, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.011391892410272e-06, |
| "loss": 0.39100329875946044, |
| "mean_token_accuracy": 0.8819140180945396, |
| "num_tokens": 53118526.0, |
| "step": 3495 |
| }, |
| { |
| "entropy": 0.393280316144228, |
| "epoch": 1.3587304668543143, |
| "grad_norm": 1.7109375, |
| "learning_rate": 6.0011287072957205e-06, |
| "loss": 0.39736104011535645, |
| "mean_token_accuracy": 0.8757255643606185, |
| "num_tokens": 53188265.0, |
| "step": 3500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 7728, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.926527421264753e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|