{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.20149103364900262,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020149103364900263,
      "grad_norm": 18.375,
      "learning_rate": 1.998791053798106e-05,
      "loss": 1.9277,
      "mean_token_accuracy": 0.679860633611679,
      "num_tokens": 9373.0,
      "step": 10
    },
    {
      "epoch": 0.004029820672980053,
      "grad_norm": 13.5625,
      "learning_rate": 1.9974477802404462e-05,
      "loss": 1.2796,
      "mean_token_accuracy": 0.7233692526817321,
      "num_tokens": 20789.0,
      "step": 20
    },
    {
      "epoch": 0.006044731009470079,
      "grad_norm": 13.0,
      "learning_rate": 1.996104506682786e-05,
      "loss": 1.2607,
      "mean_token_accuracy": 0.7299719333648682,
      "num_tokens": 32661.0,
      "step": 30
    },
    {
      "epoch": 0.008059641345960105,
      "grad_norm": 12.8125,
      "learning_rate": 1.994761233125126e-05,
      "loss": 1.2356,
      "mean_token_accuracy": 0.7324558198451996,
      "num_tokens": 43049.0,
      "step": 40
    },
    {
      "epoch": 0.01007455168245013,
      "grad_norm": 12.125,
      "learning_rate": 1.9934179595674662e-05,
      "loss": 1.1324,
      "mean_token_accuracy": 0.7531639993190765,
      "num_tokens": 52956.0,
      "step": 50
    },
    {
      "epoch": 0.012089462018940157,
      "grad_norm": 16.125,
      "learning_rate": 1.992074686009806e-05,
      "loss": 1.1775,
      "mean_token_accuracy": 0.7408373892307282,
      "num_tokens": 63513.0,
      "step": 60
    },
    {
      "epoch": 0.014104372355430184,
      "grad_norm": 14.0625,
      "learning_rate": 1.990731412452146e-05,
      "loss": 1.2446,
      "mean_token_accuracy": 0.7307547807693482,
      "num_tokens": 74794.0,
      "step": 70
    },
    {
      "epoch": 0.01611928269192021,
      "grad_norm": 11.875,
      "learning_rate": 1.989388138894486e-05,
      "loss": 1.2428,
      "mean_token_accuracy": 0.7255984365940094,
      "num_tokens": 86903.0,
      "step": 80
    },
    {
      "epoch": 0.018134193028410236,
      "grad_norm": 14.4375,
      "learning_rate": 1.988044865336826e-05,
      "loss": 1.2766,
      "mean_token_accuracy": 0.7225647568702698,
      "num_tokens": 97159.0,
      "step": 90
    },
    {
      "epoch": 0.02014910336490026,
      "grad_norm": 12.5625,
      "learning_rate": 1.986701591779166e-05,
      "loss": 1.1458,
      "mean_token_accuracy": 0.7415299773216247,
      "num_tokens": 107437.0,
      "step": 100
    },
    {
      "epoch": 0.02216401370139029,
      "grad_norm": 16.75,
      "learning_rate": 1.985358318221506e-05,
      "loss": 1.2748,
      "mean_token_accuracy": 0.7202155470848084,
      "num_tokens": 117867.0,
      "step": 110
    },
    {
      "epoch": 0.024178924037880314,
      "grad_norm": 18.125,
      "learning_rate": 1.984015044663846e-05,
      "loss": 1.1689,
      "mean_token_accuracy": 0.7355200052261353,
      "num_tokens": 128288.0,
      "step": 120
    },
    {
      "epoch": 0.02619383437437034,
      "grad_norm": 12.6875,
      "learning_rate": 1.982671771106186e-05,
      "loss": 1.2324,
      "mean_token_accuracy": 0.7224856972694397,
      "num_tokens": 139627.0,
      "step": 130
    },
    {
      "epoch": 0.028208744710860368,
      "grad_norm": 11.5625,
      "learning_rate": 1.981328497548526e-05,
      "loss": 1.1365,
      "mean_token_accuracy": 0.7402825653553009,
      "num_tokens": 150498.0,
      "step": 140
    },
    {
      "epoch": 0.030223655047350393,
      "grad_norm": 14.75,
      "learning_rate": 1.979985223990866e-05,
      "loss": 1.1178,
      "mean_token_accuracy": 0.7426175236701965,
      "num_tokens": 161754.0,
      "step": 150
    },
    {
      "epoch": 0.03223856538384042,
      "grad_norm": 11.4375,
      "learning_rate": 1.978641950433206e-05,
      "loss": 1.2596,
      "mean_token_accuracy": 0.7134447395801544,
      "num_tokens": 173087.0,
      "step": 160
    },
    {
      "epoch": 0.03425347572033045,
      "grad_norm": 12.75,
      "learning_rate": 1.9772986768755458e-05,
      "loss": 1.0652,
      "mean_token_accuracy": 0.7474986433982849,
      "num_tokens": 184747.0,
      "step": 170
    },
    {
      "epoch": 0.03626838605682047,
      "grad_norm": 11.8125,
      "learning_rate": 1.9759554033178857e-05,
      "loss": 1.1436,
      "mean_token_accuracy": 0.7323237180709838,
      "num_tokens": 195331.0,
      "step": 180
    },
    {
      "epoch": 0.0382832963933105,
      "grad_norm": 9.875,
      "learning_rate": 1.974612129760226e-05,
      "loss": 1.0312,
      "mean_token_accuracy": 0.7625056743621826,
      "num_tokens": 208260.0,
      "step": 190
    },
    {
      "epoch": 0.04029820672980052,
      "grad_norm": 14.9375,
      "learning_rate": 1.9732688562025658e-05,
      "loss": 1.0084,
      "mean_token_accuracy": 0.7631498157978058,
      "num_tokens": 218822.0,
      "step": 200
    },
    {
      "epoch": 0.04231311706629055,
      "grad_norm": 11.625,
      "learning_rate": 1.9719255826449057e-05,
      "loss": 0.9813,
      "mean_token_accuracy": 0.7651655077934265,
      "num_tokens": 228580.0,
      "step": 210
    },
    {
      "epoch": 0.04432802740278058,
      "grad_norm": 17.875,
      "learning_rate": 1.970582309087246e-05,
      "loss": 1.07,
      "mean_token_accuracy": 0.7532146275043488,
      "num_tokens": 239159.0,
      "step": 220
    },
    {
      "epoch": 0.046342937739270604,
      "grad_norm": 11.25,
      "learning_rate": 1.9692390355295858e-05,
      "loss": 1.113,
      "mean_token_accuracy": 0.7436384916305542,
      "num_tokens": 251695.0,
      "step": 230
    },
    {
      "epoch": 0.04835784807576063,
      "grad_norm": 13.375,
      "learning_rate": 1.9678957619719257e-05,
      "loss": 0.929,
      "mean_token_accuracy": 0.7755303025245667,
      "num_tokens": 261128.0,
      "step": 240
    },
    {
      "epoch": 0.050372758412250654,
      "grad_norm": 12.8125,
      "learning_rate": 1.9665524884142656e-05,
      "loss": 1.0999,
      "mean_token_accuracy": 0.7514171898365021,
      "num_tokens": 271560.0,
      "step": 250
    },
    {
      "epoch": 0.05238766874874068,
      "grad_norm": 13.0,
      "learning_rate": 1.9652092148566058e-05,
      "loss": 1.0339,
      "mean_token_accuracy": 0.7604846298694611,
      "num_tokens": 282223.0,
      "step": 260
    },
    {
      "epoch": 0.054402579085230704,
      "grad_norm": 12.9375,
      "learning_rate": 1.9638659412989457e-05,
      "loss": 1.0473,
      "mean_token_accuracy": 0.7622893512248993,
      "num_tokens": 292726.0,
      "step": 270
    },
    {
      "epoch": 0.056417489421720736,
      "grad_norm": 15.0,
      "learning_rate": 1.9625226677412856e-05,
      "loss": 0.9894,
      "mean_token_accuracy": 0.764206200838089,
      "num_tokens": 303785.0,
      "step": 280
    },
    {
      "epoch": 0.05843239975821076,
      "grad_norm": 10.3125,
      "learning_rate": 1.9611793941836258e-05,
      "loss": 1.109,
      "mean_token_accuracy": 0.7469749927520752,
      "num_tokens": 314725.0,
      "step": 290
    },
    {
      "epoch": 0.060447310094700786,
      "grad_norm": 12.625,
      "learning_rate": 1.9598361206259657e-05,
      "loss": 1.2098,
      "mean_token_accuracy": 0.718773603439331,
      "num_tokens": 326635.0,
      "step": 300
    },
    {
      "epoch": 0.06246222043119081,
      "grad_norm": 11.125,
      "learning_rate": 1.9584928470683055e-05,
      "loss": 1.1025,
      "mean_token_accuracy": 0.7460452795028687,
      "num_tokens": 337866.0,
      "step": 310
    },
    {
      "epoch": 0.06447713076768084,
      "grad_norm": 10.6875,
      "learning_rate": 1.9571495735106458e-05,
      "loss": 1.0772,
      "mean_token_accuracy": 0.7526730418205261,
      "num_tokens": 348512.0,
      "step": 320
    },
    {
      "epoch": 0.06649204110417087,
      "grad_norm": 13.375,
      "learning_rate": 1.9558062999529857e-05,
      "loss": 1.157,
      "mean_token_accuracy": 0.7320702195167541,
      "num_tokens": 360281.0,
      "step": 330
    },
    {
      "epoch": 0.0685069514406609,
      "grad_norm": 12.0,
      "learning_rate": 1.9544630263953255e-05,
      "loss": 1.0157,
      "mean_token_accuracy": 0.760700649023056,
      "num_tokens": 371068.0,
      "step": 340
    },
    {
      "epoch": 0.07052186177715092,
      "grad_norm": 17.375,
      "learning_rate": 1.9531197528376654e-05,
      "loss": 0.8851,
      "mean_token_accuracy": 0.7925353944301605,
      "num_tokens": 380947.0,
      "step": 350
    },
    {
      "epoch": 0.07253677211364094,
      "grad_norm": 11.3125,
      "learning_rate": 1.9517764792800056e-05,
      "loss": 1.0325,
      "mean_token_accuracy": 0.7617525398731232,
      "num_tokens": 391552.0,
      "step": 360
    },
    {
      "epoch": 0.07455168245013097,
      "grad_norm": 10.9375,
      "learning_rate": 1.9504332057223455e-05,
      "loss": 0.9852,
      "mean_token_accuracy": 0.7655075788497925,
      "num_tokens": 403321.0,
      "step": 370
    },
    {
      "epoch": 0.076566592786621,
      "grad_norm": 11.1875,
      "learning_rate": 1.9490899321646854e-05,
      "loss": 1.0527,
      "mean_token_accuracy": 0.7569321393966675,
      "num_tokens": 414435.0,
      "step": 380
    },
    {
      "epoch": 0.07858150312311102,
      "grad_norm": 14.6875,
      "learning_rate": 1.9477466586070256e-05,
      "loss": 0.9602,
      "mean_token_accuracy": 0.7720924854278565,
      "num_tokens": 423506.0,
      "step": 390
    },
    {
      "epoch": 0.08059641345960104,
      "grad_norm": 11.0,
      "learning_rate": 1.9464033850493655e-05,
      "loss": 1.0475,
      "mean_token_accuracy": 0.7505548059940338,
      "num_tokens": 436556.0,
      "step": 400
    },
    {
      "epoch": 0.08261132379609107,
      "grad_norm": 11.9375,
      "learning_rate": 1.9450601114917054e-05,
      "loss": 1.0775,
      "mean_token_accuracy": 0.7474610984325409,
      "num_tokens": 448248.0,
      "step": 410
    },
    {
      "epoch": 0.0846262341325811,
      "grad_norm": 10.25,
      "learning_rate": 1.9437168379340453e-05,
      "loss": 1.0487,
      "mean_token_accuracy": 0.7566307663917542,
      "num_tokens": 460102.0,
      "step": 420
    },
    {
      "epoch": 0.08664114446907113,
      "grad_norm": 12.0625,
      "learning_rate": 1.9423735643763855e-05,
      "loss": 0.9919,
      "mean_token_accuracy": 0.7676237523555756,
      "num_tokens": 471590.0,
      "step": 430
    },
    {
      "epoch": 0.08865605480556116,
      "grad_norm": 13.125,
      "learning_rate": 1.9410302908187254e-05,
      "loss": 1.0473,
      "mean_token_accuracy": 0.7525161623954773,
      "num_tokens": 482096.0,
      "step": 440
    },
    {
      "epoch": 0.09067096514205118,
      "grad_norm": 13.4375,
      "learning_rate": 1.9396870172610653e-05,
      "loss": 1.0347,
      "mean_token_accuracy": 0.7515169024467468,
      "num_tokens": 493585.0,
      "step": 450
    },
    {
      "epoch": 0.09268587547854121,
      "grad_norm": 10.9375,
      "learning_rate": 1.9383437437034055e-05,
      "loss": 1.0487,
      "mean_token_accuracy": 0.7547510921955108,
      "num_tokens": 505989.0,
      "step": 460
    },
    {
      "epoch": 0.09470078581503123,
      "grad_norm": 12.25,
      "learning_rate": 1.9370004701457454e-05,
      "loss": 1.018,
      "mean_token_accuracy": 0.7596003413200378,
      "num_tokens": 516900.0,
      "step": 470
    },
    {
      "epoch": 0.09671569615152126,
      "grad_norm": 11.1875,
      "learning_rate": 1.9356571965880853e-05,
      "loss": 0.9797,
      "mean_token_accuracy": 0.7699940800666809,
      "num_tokens": 526427.0,
      "step": 480
    },
    {
      "epoch": 0.09873060648801128,
      "grad_norm": 10.3125,
      "learning_rate": 1.9343139230304255e-05,
      "loss": 1.0817,
      "mean_token_accuracy": 0.7470319092273712,
      "num_tokens": 537981.0,
      "step": 490
    },
    {
      "epoch": 0.10074551682450131,
      "grad_norm": 13.25,
      "learning_rate": 1.9329706494727654e-05,
      "loss": 1.0089,
      "mean_token_accuracy": 0.7595715343952179,
      "num_tokens": 549174.0,
      "step": 500
    },
    {
      "epoch": 0.10276042716099133,
      "grad_norm": 12.625,
      "learning_rate": 1.9316273759151052e-05,
      "loss": 1.0164,
      "mean_token_accuracy": 0.7571583390235901,
      "num_tokens": 559988.0,
      "step": 510
    },
    {
      "epoch": 0.10477533749748136,
      "grad_norm": 14.3125,
      "learning_rate": 1.930284102357445e-05,
      "loss": 1.1148,
      "mean_token_accuracy": 0.7423564851284027,
      "num_tokens": 571510.0,
      "step": 520
    },
    {
      "epoch": 0.10679024783397138,
      "grad_norm": 14.6875,
      "learning_rate": 1.9289408287997854e-05,
      "loss": 1.053,
      "mean_token_accuracy": 0.7485374748706818,
      "num_tokens": 583020.0,
      "step": 530
    },
    {
      "epoch": 0.10880515817046141,
      "grad_norm": 11.4375,
      "learning_rate": 1.9275975552421252e-05,
      "loss": 0.9756,
      "mean_token_accuracy": 0.7606720209121705,
      "num_tokens": 594042.0,
      "step": 540
    },
    {
      "epoch": 0.11082006850695145,
      "grad_norm": 13.3125,
      "learning_rate": 1.926254281684465e-05,
      "loss": 0.9514,
      "mean_token_accuracy": 0.7702824532985687,
      "num_tokens": 605932.0,
      "step": 550
    },
    {
      "epoch": 0.11283497884344147,
      "grad_norm": 10.625,
      "learning_rate": 1.9249110081268053e-05,
      "loss": 1.0008,
      "mean_token_accuracy": 0.7583375632762909,
      "num_tokens": 617431.0,
      "step": 560
    },
    {
      "epoch": 0.1148498891799315,
      "grad_norm": 10.875,
      "learning_rate": 1.9235677345691452e-05,
      "loss": 0.998,
      "mean_token_accuracy": 0.7597042858600617,
      "num_tokens": 629827.0,
      "step": 570
    },
    {
      "epoch": 0.11686479951642152,
      "grad_norm": 12.5625,
      "learning_rate": 1.9222244610114851e-05,
      "loss": 0.9512,
      "mean_token_accuracy": 0.7806954503059387,
      "num_tokens": 640144.0,
      "step": 580
    },
    {
      "epoch": 0.11887970985291155,
      "grad_norm": 10.5625,
      "learning_rate": 1.920881187453825e-05,
      "loss": 0.9292,
      "mean_token_accuracy": 0.7761410176753998,
      "num_tokens": 652386.0,
      "step": 590
    },
    {
      "epoch": 0.12089462018940157,
      "grad_norm": 11.0,
      "learning_rate": 1.9195379138961652e-05,
      "loss": 1.0768,
      "mean_token_accuracy": 0.7544383645057678,
      "num_tokens": 663460.0,
      "step": 600
    },
    {
      "epoch": 0.1229095305258916,
      "grad_norm": 14.3125,
      "learning_rate": 1.918194640338505e-05,
      "loss": 0.8975,
      "mean_token_accuracy": 0.7799494147300721,
      "num_tokens": 673425.0,
      "step": 610
    },
    {
      "epoch": 0.12492444086238162,
      "grad_norm": 10.375,
      "learning_rate": 1.916851366780845e-05,
      "loss": 0.899,
      "mean_token_accuracy": 0.7885317802429199,
      "num_tokens": 683817.0,
      "step": 620
    },
    {
      "epoch": 0.12693935119887165,
      "grad_norm": 13.375,
      "learning_rate": 1.9155080932231852e-05,
      "loss": 0.998,
      "mean_token_accuracy": 0.7671383440494537,
      "num_tokens": 694196.0,
      "step": 630
    },
    {
      "epoch": 0.12895426153536169,
      "grad_norm": 11.625,
      "learning_rate": 1.914164819665525e-05,
      "loss": 0.9808,
      "mean_token_accuracy": 0.7700311303138733,
      "num_tokens": 704564.0,
      "step": 640
    },
    {
      "epoch": 0.1309691718718517,
      "grad_norm": 13.25,
      "learning_rate": 1.912821546107865e-05,
      "loss": 1.0077,
      "mean_token_accuracy": 0.7643253684043885,
      "num_tokens": 715775.0,
      "step": 650
    },
    {
      "epoch": 0.13298408220834174,
      "grad_norm": 13.625,
      "learning_rate": 1.911478272550205e-05,
      "loss": 0.9457,
      "mean_token_accuracy": 0.7678769171237946,
      "num_tokens": 726005.0,
      "step": 660
    },
    {
      "epoch": 0.13499899254483175,
      "grad_norm": 13.5,
      "learning_rate": 1.910134998992545e-05,
      "loss": 1.0155,
      "mean_token_accuracy": 0.7607427120208741,
      "num_tokens": 738053.0,
      "step": 670
    },
    {
      "epoch": 0.1370139028813218,
      "grad_norm": 11.6875,
      "learning_rate": 1.908791725434885e-05,
      "loss": 0.9395,
      "mean_token_accuracy": 0.7723658442497253,
      "num_tokens": 748480.0,
      "step": 680
    },
    {
      "epoch": 0.1390288132178118,
      "grad_norm": 15.6875,
      "learning_rate": 1.907448451877225e-05,
      "loss": 0.9639,
      "mean_token_accuracy": 0.7676171123981476,
      "num_tokens": 759972.0,
      "step": 690
    },
    {
      "epoch": 0.14104372355430184,
      "grad_norm": 12.875,
      "learning_rate": 1.906105178319565e-05,
      "loss": 0.9557,
      "mean_token_accuracy": 0.7719902992248535,
      "num_tokens": 771123.0,
      "step": 700
    },
    {
      "epoch": 0.14305863389079185,
      "grad_norm": 12.8125,
      "learning_rate": 1.904761904761905e-05,
      "loss": 1.0022,
      "mean_token_accuracy": 0.7667870819568634,
      "num_tokens": 782532.0,
      "step": 710
    },
    {
      "epoch": 0.1450735442272819,
      "grad_norm": 11.0625,
      "learning_rate": 1.903418631204245e-05,
      "loss": 0.9519,
      "mean_token_accuracy": 0.7708106875419617,
      "num_tokens": 794067.0,
      "step": 720
    },
    {
      "epoch": 0.14708845456377193,
      "grad_norm": 14.125,
      "learning_rate": 1.902075357646585e-05,
      "loss": 0.9718,
      "mean_token_accuracy": 0.766555666923523,
      "num_tokens": 804871.0,
      "step": 730
    },
    {
      "epoch": 0.14910336490026194,
      "grad_norm": 12.8125,
      "learning_rate": 1.900732084088925e-05,
      "loss": 0.9852,
      "mean_token_accuracy": 0.7678309619426728,
      "num_tokens": 815050.0,
      "step": 740
    },
    {
      "epoch": 0.15111827523675198,
      "grad_norm": 11.3125,
      "learning_rate": 1.8993888105312648e-05,
      "loss": 0.9951,
      "mean_token_accuracy": 0.7627758264541626,
      "num_tokens": 826248.0,
      "step": 750
    },
    {
      "epoch": 0.153133185573242,
      "grad_norm": 17.25,
      "learning_rate": 1.8980455369736047e-05,
      "loss": 1.0433,
      "mean_token_accuracy": 0.7571396887302398,
      "num_tokens": 835706.0,
      "step": 760
    },
    {
      "epoch": 0.15514809590973203,
      "grad_norm": 10.9375,
      "learning_rate": 1.896702263415945e-05,
      "loss": 1.0518,
      "mean_token_accuracy": 0.7517435431480408,
      "num_tokens": 847261.0,
      "step": 770
    },
    {
      "epoch": 0.15716300624622204,
      "grad_norm": 10.4375,
      "learning_rate": 1.8953589898582848e-05,
      "loss": 0.9629,
      "mean_token_accuracy": 0.7732720315456391,
      "num_tokens": 858655.0,
      "step": 780
    },
    {
      "epoch": 0.15917791658271208,
      "grad_norm": 12.5,
      "learning_rate": 1.8940157163006247e-05,
      "loss": 1.0231,
      "mean_token_accuracy": 0.7555422127246857,
      "num_tokens": 870002.0,
      "step": 790
    },
    {
      "epoch": 0.1611928269192021,
      "grad_norm": 11.0,
      "learning_rate": 1.892672442742965e-05,
      "loss": 1.1283,
      "mean_token_accuracy": 0.7441882312297821,
      "num_tokens": 881131.0,
      "step": 800
    },
    {
      "epoch": 0.16320773725569213,
      "grad_norm": 12.4375,
      "learning_rate": 1.8913291691853048e-05,
      "loss": 1.0252,
      "mean_token_accuracy": 0.7630669414997101,
      "num_tokens": 893437.0,
      "step": 810
    },
    {
      "epoch": 0.16522264759218214,
      "grad_norm": 11.0,
      "learning_rate": 1.8899858956276447e-05,
      "loss": 1.0528,
      "mean_token_accuracy": 0.7483877301216125,
      "num_tokens": 904976.0,
      "step": 820
    },
    {
      "epoch": 0.16723755792867218,
      "grad_norm": 12.375,
      "learning_rate": 1.8886426220699846e-05,
      "loss": 0.8715,
      "mean_token_accuracy": 0.7899761021137237,
      "num_tokens": 915631.0,
      "step": 830
    },
    {
      "epoch": 0.1692524682651622,
      "grad_norm": 13.375,
      "learning_rate": 1.8872993485123248e-05,
      "loss": 1.0548,
      "mean_token_accuracy": 0.7494987368583679,
      "num_tokens": 927141.0,
      "step": 840
    },
    {
      "epoch": 0.17126737860165223,
      "grad_norm": 11.0,
      "learning_rate": 1.8859560749546647e-05,
      "loss": 0.9579,
      "mean_token_accuracy": 0.7668360054492951,
      "num_tokens": 938792.0,
      "step": 850
    },
    {
      "epoch": 0.17328228893814226,
      "grad_norm": 13.125,
      "learning_rate": 1.8846128013970046e-05,
      "loss": 0.8595,
      "mean_token_accuracy": 0.7870603501796722,
      "num_tokens": 949894.0,
      "step": 860
    },
    {
      "epoch": 0.17529719927463228,
      "grad_norm": 12.625,
      "learning_rate": 1.8832695278393448e-05,
      "loss": 0.9216,
      "mean_token_accuracy": 0.7846542239189148,
      "num_tokens": 961003.0,
      "step": 870
    },
    {
      "epoch": 0.17731210961112231,
      "grad_norm": 12.125,
      "learning_rate": 1.8819262542816847e-05,
      "loss": 1.0052,
      "mean_token_accuracy": 0.7603223979473114,
      "num_tokens": 971577.0,
      "step": 880
    },
    {
      "epoch": 0.17932701994761233,
      "grad_norm": 12.4375,
      "learning_rate": 1.8805829807240245e-05,
      "loss": 0.9299,
      "mean_token_accuracy": 0.7757908642292023,
      "num_tokens": 982234.0,
      "step": 890
    },
    {
      "epoch": 0.18134193028410237,
      "grad_norm": 11.0,
      "learning_rate": 1.8792397071663648e-05,
      "loss": 1.0312,
      "mean_token_accuracy": 0.7591780245304107,
      "num_tokens": 992997.0,
      "step": 900
    },
    {
      "epoch": 0.18335684062059238,
      "grad_norm": 10.5625,
      "learning_rate": 1.8778964336087047e-05,
      "loss": 0.8999,
      "mean_token_accuracy": 0.779550439119339,
      "num_tokens": 1004102.0,
      "step": 910
    },
    {
      "epoch": 0.18537175095708242,
      "grad_norm": 12.625,
      "learning_rate": 1.8765531600510445e-05,
      "loss": 0.8892,
      "mean_token_accuracy": 0.7890210688114166,
      "num_tokens": 1015447.0,
      "step": 920
    },
    {
      "epoch": 0.18738666129357243,
      "grad_norm": 12.125,
      "learning_rate": 1.8752098864933844e-05,
      "loss": 1.0344,
      "mean_token_accuracy": 0.7584980130195618,
      "num_tokens": 1026939.0,
      "step": 930
    },
    {
      "epoch": 0.18940157163006247,
      "grad_norm": 10.9375,
      "learning_rate": 1.8738666129357246e-05,
      "loss": 0.9686,
      "mean_token_accuracy": 0.7649740993976593,
      "num_tokens": 1037937.0,
      "step": 940
    },
    {
      "epoch": 0.19141648196655248,
      "grad_norm": 8.75,
      "learning_rate": 1.8725233393780645e-05,
      "loss": 1.0364,
      "mean_token_accuracy": 0.7554452955722809,
      "num_tokens": 1049173.0,
      "step": 950
    },
    {
      "epoch": 0.19343139230304252,
      "grad_norm": 13.625,
      "learning_rate": 1.8711800658204044e-05,
      "loss": 1.0173,
      "mean_token_accuracy": 0.7559767007827759,
      "num_tokens": 1060166.0,
      "step": 960
    },
    {
      "epoch": 0.19544630263953255,
      "grad_norm": 11.1875,
      "learning_rate": 1.8698367922627446e-05,
      "loss": 0.9464,
      "mean_token_accuracy": 0.7735530078411103,
      "num_tokens": 1070458.0,
      "step": 970
    },
    {
      "epoch": 0.19746121297602257,
      "grad_norm": 11.1875,
      "learning_rate": 1.8684935187050845e-05,
      "loss": 0.9397,
      "mean_token_accuracy": 0.7724673867225647,
      "num_tokens": 1081477.0,
      "step": 980
    },
    {
      "epoch": 0.1994761233125126,
      "grad_norm": 15.1875,
      "learning_rate": 1.8671502451474244e-05,
      "loss": 1.0769,
      "mean_token_accuracy": 0.7459556341171265,
      "num_tokens": 1094205.0,
      "step": 990
    },
    {
      "epoch": 0.20149103364900262,
      "grad_norm": 16.5,
      "learning_rate": 1.8658069715897643e-05,
      "loss": 0.9763,
      "mean_token_accuracy": 0.7707934081554413,
      "num_tokens": 1104929.0,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 14889,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1337180456005632.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}