{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9980806142034548,
  "eval_steps": 500,
  "global_step": 780,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025591810620601407,
      "grad_norm": 2.7761764526367188,
      "learning_rate": 1.976923076923077e-05,
      "loss": 4.4287,
      "mean_token_accuracy": 0.22894721738994123,
      "num_tokens": 894781.0,
      "step": 10
    },
    {
      "epoch": 0.05118362124120281,
      "grad_norm": 2.137922525405884,
      "learning_rate": 1.9512820512820515e-05,
      "loss": 4.1941,
      "mean_token_accuracy": 0.2595341790467501,
      "num_tokens": 1805355.0,
      "step": 20
    },
    {
      "epoch": 0.07677543186180422,
      "grad_norm": 2.520721673965454,
      "learning_rate": 1.9256410256410258e-05,
      "loss": 4.0212,
      "mean_token_accuracy": 0.2825487531721592,
      "num_tokens": 2699319.0,
      "step": 30
    },
    {
      "epoch": 0.10236724248240563,
      "grad_norm": 1.547048807144165,
      "learning_rate": 1.9e-05,
      "loss": 3.8859,
      "mean_token_accuracy": 0.30224928483366964,
      "num_tokens": 3588550.0,
      "step": 40
    },
    {
      "epoch": 0.12795905310300704,
      "grad_norm": 1.5576727390289307,
      "learning_rate": 1.8743589743589744e-05,
      "loss": 3.7614,
      "mean_token_accuracy": 0.320540402084589,
      "num_tokens": 4483812.0,
      "step": 50
    },
    {
      "epoch": 0.15355086372360843,
      "grad_norm": 1.8138513565063477,
      "learning_rate": 1.848717948717949e-05,
      "loss": 3.6902,
      "mean_token_accuracy": 0.33278593569993975,
      "num_tokens": 5372889.0,
      "step": 60
    },
    {
      "epoch": 0.17914267434420986,
      "grad_norm": 1.1237279176712036,
      "learning_rate": 1.823076923076923e-05,
      "loss": 3.5589,
      "mean_token_accuracy": 0.347703804820776,
      "num_tokens": 6277651.0,
      "step": 70
    },
    {
      "epoch": 0.20473448496481125,
      "grad_norm": 0.9200040698051453,
      "learning_rate": 1.7974358974358977e-05,
      "loss": 3.4994,
      "mean_token_accuracy": 0.35774786919355395,
      "num_tokens": 7172606.0,
      "step": 80
    },
    {
      "epoch": 0.23032629558541268,
      "grad_norm": 1.232081413269043,
      "learning_rate": 1.7717948717948717e-05,
      "loss": 3.4412,
      "mean_token_accuracy": 0.3667620047926903,
      "num_tokens": 8065928.0,
      "step": 90
    },
    {
      "epoch": 0.2559181062060141,
      "grad_norm": 1.0582596063613892,
      "learning_rate": 1.7461538461538464e-05,
      "loss": 3.3965,
      "mean_token_accuracy": 0.37178524360060694,
      "num_tokens": 8952519.0,
      "step": 100
    },
    {
      "epoch": 0.28150991682661547,
      "grad_norm": 0.8860539793968201,
      "learning_rate": 1.7205128205128207e-05,
      "loss": 3.3236,
      "mean_token_accuracy": 0.37989369705319403,
      "num_tokens": 9847582.0,
      "step": 110
    },
    {
      "epoch": 0.30710172744721687,
      "grad_norm": 5.248290538787842,
      "learning_rate": 1.694871794871795e-05,
      "loss": 3.3115,
      "mean_token_accuracy": 0.38339284956455233,
      "num_tokens": 10742170.0,
      "step": 120
    },
    {
      "epoch": 0.3326935380678183,
      "grad_norm": 1.340420126914978,
      "learning_rate": 1.6692307692307694e-05,
      "loss": 3.2598,
      "mean_token_accuracy": 0.3880648836493492,
      "num_tokens": 11637955.0,
      "step": 130
    },
    {
      "epoch": 0.3582853486884197,
      "grad_norm": 0.9110859036445618,
      "learning_rate": 1.6435897435897437e-05,
      "loss": 3.2249,
      "mean_token_accuracy": 0.3904557466506958,
      "num_tokens": 12537106.0,
      "step": 140
    },
    {
      "epoch": 0.3838771593090211,
      "grad_norm": 0.9896584749221802,
      "learning_rate": 1.617948717948718e-05,
      "loss": 3.1932,
      "mean_token_accuracy": 0.39675025418400767,
      "num_tokens": 13426821.0,
      "step": 150
    },
    {
      "epoch": 0.4094689699296225,
      "grad_norm": 0.8233770728111267,
      "learning_rate": 1.5923076923076924e-05,
      "loss": 3.1574,
      "mean_token_accuracy": 0.399780885130167,
      "num_tokens": 14313654.0,
      "step": 160
    },
    {
      "epoch": 0.4350607805502239,
      "grad_norm": 1.1695690155029297,
      "learning_rate": 1.5666666666666667e-05,
      "loss": 3.1298,
      "mean_token_accuracy": 0.40367085859179497,
      "num_tokens": 15225624.0,
      "step": 170
    },
    {
      "epoch": 0.46065259117082535,
      "grad_norm": 0.776798665523529,
      "learning_rate": 1.5410256410256414e-05,
      "loss": 3.1148,
      "mean_token_accuracy": 0.4049819767475128,
      "num_tokens": 16121750.0,
      "step": 180
    },
    {
      "epoch": 0.48624440179142675,
      "grad_norm": 0.8073769807815552,
      "learning_rate": 1.5153846153846155e-05,
      "loss": 3.0744,
      "mean_token_accuracy": 0.4091757610440254,
      "num_tokens": 17023953.0,
      "step": 190
    },
    {
      "epoch": 0.5118362124120281,
      "grad_norm": 0.9491840600967407,
      "learning_rate": 1.4897435897435898e-05,
      "loss": 3.0747,
      "mean_token_accuracy": 0.40831351578235625,
      "num_tokens": 17922120.0,
      "step": 200
    },
    {
      "epoch": 0.5374280230326296,
      "grad_norm": 0.9127055406570435,
      "learning_rate": 1.4641025641025642e-05,
      "loss": 3.0611,
      "mean_token_accuracy": 0.4100494936108589,
      "num_tokens": 18823381.0,
      "step": 210
    },
    {
      "epoch": 0.5630198336532309,
      "grad_norm": 0.7686980366706848,
      "learning_rate": 1.4384615384615387e-05,
      "loss": 3.0241,
      "mean_token_accuracy": 0.4129943989217281,
      "num_tokens": 19724559.0,
      "step": 220
    },
    {
      "epoch": 0.5886116442738324,
      "grad_norm": 1.0187700986862183,
      "learning_rate": 1.412820512820513e-05,
      "loss": 3.0255,
      "mean_token_accuracy": 0.41373484060168264,
      "num_tokens": 20614866.0,
      "step": 230
    },
    {
      "epoch": 0.6142034548944337,
      "grad_norm": 0.7729346752166748,
      "learning_rate": 1.3871794871794873e-05,
      "loss": 2.9829,
      "mean_token_accuracy": 0.42028677016496657,
      "num_tokens": 21514400.0,
      "step": 240
    },
    {
      "epoch": 0.6397952655150352,
      "grad_norm": 6.206243991851807,
      "learning_rate": 1.3615384615384616e-05,
      "loss": 2.9783,
      "mean_token_accuracy": 0.42037207037210467,
      "num_tokens": 22404222.0,
      "step": 250
    },
    {
      "epoch": 0.6653870761356366,
      "grad_norm": 0.6800546050071716,
      "learning_rate": 1.335897435897436e-05,
      "loss": 2.9643,
      "mean_token_accuracy": 0.4217521704733372,
      "num_tokens": 23302463.0,
      "step": 260
    },
    {
      "epoch": 0.690978886756238,
      "grad_norm": 1.250061273574829,
      "learning_rate": 1.3102564102564103e-05,
      "loss": 2.9509,
      "mean_token_accuracy": 0.4231578640639782,
      "num_tokens": 24196568.0,
      "step": 270
    },
    {
      "epoch": 0.7165706973768394,
      "grad_norm": 0.8788368701934814,
      "learning_rate": 1.2846153846153848e-05,
      "loss": 2.9386,
      "mean_token_accuracy": 0.4258805990219116,
      "num_tokens": 25084205.0,
      "step": 280
    },
    {
      "epoch": 0.7421625079974408,
      "grad_norm": 2.8421788215637207,
      "learning_rate": 1.2589743589743591e-05,
      "loss": 2.9336,
      "mean_token_accuracy": 0.4254921153187752,
      "num_tokens": 25975991.0,
      "step": 290
    },
    {
      "epoch": 0.7677543186180422,
      "grad_norm": 0.8528095483779907,
      "learning_rate": 1.2333333333333334e-05,
      "loss": 2.926,
      "mean_token_accuracy": 0.426171388477087,
      "num_tokens": 26866598.0,
      "step": 300
    },
    {
      "epoch": 0.7933461292386437,
      "grad_norm": 0.7494258880615234,
      "learning_rate": 1.2076923076923078e-05,
      "loss": 2.9165,
      "mean_token_accuracy": 0.42724777534604075,
      "num_tokens": 27765200.0,
      "step": 310
    },
    {
      "epoch": 0.818937939859245,
      "grad_norm": 1.1572643518447876,
      "learning_rate": 1.1820512820512821e-05,
      "loss": 2.8875,
      "mean_token_accuracy": 0.43193317875266074,
      "num_tokens": 28663860.0,
      "step": 320
    },
    {
      "epoch": 0.8445297504798465,
      "grad_norm": 0.7244220972061157,
      "learning_rate": 1.1564102564102566e-05,
      "loss": 2.8848,
      "mean_token_accuracy": 0.43283705189824107,
      "num_tokens": 29561062.0,
      "step": 330
    },
    {
      "epoch": 0.8701215611004478,
      "grad_norm": 0.9269903898239136,
      "learning_rate": 1.1307692307692309e-05,
      "loss": 2.8685,
      "mean_token_accuracy": 0.4354383051395416,
      "num_tokens": 30454457.0,
      "step": 340
    },
    {
      "epoch": 0.8957133717210493,
      "grad_norm": 1.2391330003738403,
      "learning_rate": 1.1051282051282052e-05,
      "loss": 2.8566,
      "mean_token_accuracy": 0.4357985772192478,
      "num_tokens": 31357895.0,
      "step": 350
    },
    {
      "epoch": 0.9213051823416507,
      "grad_norm": 0.60429447889328,
      "learning_rate": 1.0794871794871796e-05,
      "loss": 2.8737,
      "mean_token_accuracy": 0.43362868800759313,
      "num_tokens": 32253327.0,
      "step": 360
    },
    {
      "epoch": 0.946896992962252,
      "grad_norm": 0.7335871458053589,
      "learning_rate": 1.0538461538461539e-05,
      "loss": 2.8323,
      "mean_token_accuracy": 0.4392602853477001,
      "num_tokens": 33148158.0,
      "step": 370
    },
    {
      "epoch": 0.9724888035828535,
      "grad_norm": 0.8495442867279053,
      "learning_rate": 1.0282051282051282e-05,
      "loss": 2.8493,
      "mean_token_accuracy": 0.4362662024796009,
      "num_tokens": 34042486.0,
      "step": 380
    },
    {
      "epoch": 0.9980806142034548,
      "grad_norm": 1.2509167194366455,
      "learning_rate": 1.0025641025641027e-05,
      "loss": 2.8359,
      "mean_token_accuracy": 0.44098760187625885,
      "num_tokens": 34924151.0,
      "step": 390
    },
    {
      "epoch": 1.0255918106206015,
      "grad_norm": 0.5853167176246643,
      "learning_rate": 9.76923076923077e-06,
      "loss": 3.0907,
      "mean_token_accuracy": 0.44089484498614356,
      "num_tokens": 35865261.0,
      "step": 400
    },
    {
      "epoch": 1.051183621241203,
      "grad_norm": 0.8962728381156921,
      "learning_rate": 9.512820512820514e-06,
      "loss": 2.81,
      "mean_token_accuracy": 0.44187747687101364,
      "num_tokens": 36762778.0,
      "step": 410
    },
    {
      "epoch": 1.0767754318618041,
      "grad_norm": 2.0159361362457275,
      "learning_rate": 9.256410256410257e-06,
      "loss": 2.8121,
      "mean_token_accuracy": 0.44206427708268164,
      "num_tokens": 37666100.0,
      "step": 420
    },
    {
      "epoch": 1.1023672424824056,
      "grad_norm": 1.0803464651107788,
      "learning_rate": 9e-06,
      "loss": 2.8162,
      "mean_token_accuracy": 0.4421087481081486,
      "num_tokens": 38570373.0,
      "step": 430
    },
    {
      "epoch": 1.127959053103007,
      "grad_norm": 1.8244508504867554,
      "learning_rate": 8.743589743589743e-06,
      "loss": 2.7878,
      "mean_token_accuracy": 0.443483180552721,
      "num_tokens": 39463722.0,
      "step": 440
    },
    {
      "epoch": 1.1535508637236085,
      "grad_norm": 1.025512933731079,
      "learning_rate": 8.487179487179488e-06,
      "loss": 2.7997,
      "mean_token_accuracy": 0.44501683712005613,
      "num_tokens": 40356114.0,
      "step": 450
    },
    {
      "epoch": 1.17914267434421,
      "grad_norm": 1.3213328123092651,
      "learning_rate": 8.230769230769232e-06,
      "loss": 2.7966,
      "mean_token_accuracy": 0.44384807869791987,
      "num_tokens": 41253467.0,
      "step": 460
    },
    {
      "epoch": 1.2047344849648112,
      "grad_norm": 0.6755945086479187,
      "learning_rate": 7.974358974358975e-06,
      "loss": 2.7788,
      "mean_token_accuracy": 0.44546112343668937,
      "num_tokens": 42147423.0,
      "step": 470
    },
    {
      "epoch": 1.2303262955854126,
      "grad_norm": 0.815871000289917,
      "learning_rate": 7.717948717948718e-06,
      "loss": 2.7702,
      "mean_token_accuracy": 0.4467897318303585,
      "num_tokens": 43043613.0,
      "step": 480
    },
    {
      "epoch": 1.255918106206014,
      "grad_norm": 0.8647878766059875,
      "learning_rate": 7.461538461538462e-06,
      "loss": 2.7593,
      "mean_token_accuracy": 0.4478010691702366,
      "num_tokens": 43928134.0,
      "step": 490
    },
    {
      "epoch": 1.2815099168266155,
      "grad_norm": 0.5179564356803894,
      "learning_rate": 7.205128205128206e-06,
      "loss": 2.7684,
      "mean_token_accuracy": 0.4487785018980503,
      "num_tokens": 44810604.0,
      "step": 500
    },
    {
      "epoch": 1.307101727447217,
      "grad_norm": 1.2333202362060547,
      "learning_rate": 6.948717948717949e-06,
      "loss": 2.755,
      "mean_token_accuracy": 0.4496650531888008,
      "num_tokens": 45704269.0,
      "step": 510
    },
    {
      "epoch": 1.3326935380678182,
      "grad_norm": 0.9590532779693604,
      "learning_rate": 6.692307692307692e-06,
      "loss": 2.7565,
      "mean_token_accuracy": 0.44860857501626017,
      "num_tokens": 46599574.0,
      "step": 520
    },
    {
      "epoch": 1.3582853486884197,
      "grad_norm": 0.5098512172698975,
      "learning_rate": 6.435897435897437e-06,
      "loss": 2.7424,
      "mean_token_accuracy": 0.4517396934330463,
      "num_tokens": 47500789.0,
      "step": 530
    },
    {
      "epoch": 1.383877159309021,
      "grad_norm": 0.6403014659881592,
      "learning_rate": 6.17948717948718e-06,
      "loss": 2.7484,
      "mean_token_accuracy": 0.45020677894353867,
      "num_tokens": 48397237.0,
      "step": 540
    },
    {
      "epoch": 1.4094689699296226,
      "grad_norm": 2.570819854736328,
      "learning_rate": 5.923076923076924e-06,
      "loss": 2.7382,
      "mean_token_accuracy": 0.45186189860105513,
      "num_tokens": 49288748.0,
      "step": 550
    },
    {
      "epoch": 1.435060780550224,
      "grad_norm": 0.6260067224502563,
      "learning_rate": 5.666666666666667e-06,
      "loss": 2.7378,
      "mean_token_accuracy": 0.45138209462165835,
      "num_tokens": 50172619.0,
      "step": 560
    },
    {
      "epoch": 1.4606525911708252,
      "grad_norm": 0.6413472294807434,
      "learning_rate": 5.41025641025641e-06,
      "loss": 2.7317,
      "mean_token_accuracy": 0.45287573114037516,
      "num_tokens": 51083437.0,
      "step": 570
    },
    {
      "epoch": 1.4862444017914267,
      "grad_norm": 0.9748353362083435,
      "learning_rate": 5.1538461538461534e-06,
      "loss": 2.7163,
      "mean_token_accuracy": 0.4528719700872898,
      "num_tokens": 51987422.0,
      "step": 580
    },
    {
      "epoch": 1.5118362124120281,
      "grad_norm": 0.7616235017776489,
      "learning_rate": 4.8974358974358975e-06,
      "loss": 2.7247,
      "mean_token_accuracy": 0.45309568718075754,
      "num_tokens": 52875548.0,
      "step": 590
    },
    {
      "epoch": 1.5374280230326296,
      "grad_norm": 1.3642046451568604,
      "learning_rate": 4.641025641025642e-06,
      "loss": 2.7189,
      "mean_token_accuracy": 0.4543063327670097,
      "num_tokens": 53777731.0,
      "step": 600
    },
    {
      "epoch": 1.563019833653231,
      "grad_norm": 2.9087612628936768,
      "learning_rate": 4.384615384615385e-06,
      "loss": 2.7221,
      "mean_token_accuracy": 0.4528664395213127,
      "num_tokens": 54672687.0,
      "step": 610
    },
    {
      "epoch": 1.5886116442738323,
      "grad_norm": 1.5256329774856567,
      "learning_rate": 4.128205128205128e-06,
      "loss": 2.7249,
      "mean_token_accuracy": 0.4534047245979309,
      "num_tokens": 55567899.0,
      "step": 620
    },
    {
      "epoch": 1.6142034548944337,
      "grad_norm": 0.6487675905227661,
      "learning_rate": 3.871794871794872e-06,
      "loss": 2.715,
      "mean_token_accuracy": 0.45388809889554976,
      "num_tokens": 56472732.0,
      "step": 630
    },
    {
      "epoch": 1.6397952655150352,
      "grad_norm": 0.6692759990692139,
      "learning_rate": 3.6153846153846156e-06,
      "loss": 2.7033,
      "mean_token_accuracy": 0.45569391921162605,
      "num_tokens": 57378122.0,
      "step": 640
    },
    {
      "epoch": 1.6653870761356366,
      "grad_norm": 0.5085057020187378,
      "learning_rate": 3.358974358974359e-06,
      "loss": 2.7164,
      "mean_token_accuracy": 0.45427701622247696,
      "num_tokens": 58274016.0,
      "step": 650
    },
    {
      "epoch": 1.690978886756238,
      "grad_norm": 0.6099756360054016,
      "learning_rate": 3.102564102564103e-06,
      "loss": 2.7047,
      "mean_token_accuracy": 0.45659793838858603,
      "num_tokens": 59171947.0,
      "step": 660
    },
    {
      "epoch": 1.7165706973768393,
      "grad_norm": 0.7250745892524719,
      "learning_rate": 2.846153846153846e-06,
      "loss": 2.7102,
      "mean_token_accuracy": 0.45642822831869123,
      "num_tokens": 60068960.0,
      "step": 670
    },
    {
      "epoch": 1.7421625079974408,
      "grad_norm": 0.6664665937423706,
      "learning_rate": 2.5897435897435903e-06,
      "loss": 2.7096,
      "mean_token_accuracy": 0.4557917319238186,
      "num_tokens": 60968874.0,
      "step": 680
    },
    {
      "epoch": 1.7677543186180422,
      "grad_norm": 1.3182893991470337,
      "learning_rate": 2.3333333333333336e-06,
      "loss": 2.7186,
      "mean_token_accuracy": 0.4538013473153114,
      "num_tokens": 61865465.0,
      "step": 690
    },
    {
      "epoch": 1.7933461292386437,
      "grad_norm": 0.7066709995269775,
      "learning_rate": 2.0769230769230773e-06,
      "loss": 2.7083,
      "mean_token_accuracy": 0.4556036002933979,
      "num_tokens": 62751515.0,
      "step": 700
    },
    {
      "epoch": 1.8189379398592451,
      "grad_norm": 0.633770227432251,
      "learning_rate": 1.8205128205128205e-06,
      "loss": 2.708,
      "mean_token_accuracy": 0.4562342181801796,
      "num_tokens": 63653226.0,
      "step": 710
    },
    {
      "epoch": 1.8445297504798464,
      "grad_norm": 0.6355544924736023,
      "learning_rate": 1.5641025641025642e-06,
      "loss": 2.6983,
      "mean_token_accuracy": 0.45753874629735947,
      "num_tokens": 64553626.0,
      "step": 720
    },
    {
      "epoch": 1.8701215611004478,
      "grad_norm": 0.6770824193954468,
      "learning_rate": 1.307692307692308e-06,
      "loss": 2.7195,
      "mean_token_accuracy": 0.45460380911827086,
      "num_tokens": 65440945.0,
      "step": 730
    },
    {
      "epoch": 1.8957133717210493,
      "grad_norm": 0.9229477643966675,
      "learning_rate": 1.0512820512820514e-06,
      "loss": 2.7056,
      "mean_token_accuracy": 0.4557946674525738,
      "num_tokens": 66330885.0,
      "step": 740
    },
    {
      "epoch": 1.9213051823416507,
      "grad_norm": 0.6344442367553711,
      "learning_rate": 7.948717948717949e-07,
      "loss": 2.6962,
      "mean_token_accuracy": 0.4569088116288185,
      "num_tokens": 67222508.0,
      "step": 750
    },
    {
      "epoch": 1.9468969929622522,
      "grad_norm": 0.6643022894859314,
      "learning_rate": 5.384615384615386e-07,
      "loss": 2.6917,
      "mean_token_accuracy": 0.45792855247855185,
      "num_tokens": 68113106.0,
      "step": 760
    },
    {
      "epoch": 1.9724888035828534,
      "grad_norm": 0.7332549095153809,
      "learning_rate": 2.820512820512821e-07,
      "loss": 2.7041,
      "mean_token_accuracy": 0.4560040533542633,
      "num_tokens": 69005071.0,
      "step": 770
    },
    {
      "epoch": 1.9980806142034548,
      "grad_norm": 0.48741206526756287,
      "learning_rate": 2.5641025641025643e-08,
      "loss": 2.6798,
      "mean_token_accuracy": 0.4586730174720287,
      "num_tokens": 69895190.0,
      "step": 780
    }
  ],
  "logging_steps": 10,
  "max_steps": 780,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.2469045882126336e+16,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}