| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.30223655047350395, |
| "eval_steps": 500, |
| "global_step": 1500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0020149103364900263, |
| "grad_norm": 18.375, |
| "learning_rate": 1.998791053798106e-05, |
| "loss": 1.9277, |
| "mean_token_accuracy": 0.679860633611679, |
| "num_tokens": 9373.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.004029820672980053, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.9974477802404462e-05, |
| "loss": 1.2796, |
| "mean_token_accuracy": 0.7233692526817321, |
| "num_tokens": 20789.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.006044731009470079, |
| "grad_norm": 13.0, |
| "learning_rate": 1.996104506682786e-05, |
| "loss": 1.2607, |
| "mean_token_accuracy": 0.7299719333648682, |
| "num_tokens": 32661.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.008059641345960105, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.994761233125126e-05, |
| "loss": 1.2356, |
| "mean_token_accuracy": 0.7324558198451996, |
| "num_tokens": 43049.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.01007455168245013, |
| "grad_norm": 12.125, |
| "learning_rate": 1.9934179595674662e-05, |
| "loss": 1.1324, |
| "mean_token_accuracy": 0.7531639993190765, |
| "num_tokens": 52956.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.012089462018940157, |
| "grad_norm": 16.125, |
| "learning_rate": 1.992074686009806e-05, |
| "loss": 1.1775, |
| "mean_token_accuracy": 0.7408373892307282, |
| "num_tokens": 63513.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.014104372355430184, |
| "grad_norm": 14.0625, |
| "learning_rate": 1.990731412452146e-05, |
| "loss": 1.2446, |
| "mean_token_accuracy": 0.7307547807693482, |
| "num_tokens": 74794.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.01611928269192021, |
| "grad_norm": 11.875, |
| "learning_rate": 1.989388138894486e-05, |
| "loss": 1.2428, |
| "mean_token_accuracy": 0.7255984365940094, |
| "num_tokens": 86903.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.018134193028410236, |
| "grad_norm": 14.4375, |
| "learning_rate": 1.988044865336826e-05, |
| "loss": 1.2766, |
| "mean_token_accuracy": 0.7225647568702698, |
| "num_tokens": 97159.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.02014910336490026, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.986701591779166e-05, |
| "loss": 1.1458, |
| "mean_token_accuracy": 0.7415299773216247, |
| "num_tokens": 107437.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02216401370139029, |
| "grad_norm": 16.75, |
| "learning_rate": 1.985358318221506e-05, |
| "loss": 1.2748, |
| "mean_token_accuracy": 0.7202155470848084, |
| "num_tokens": 117867.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.024178924037880314, |
| "grad_norm": 18.125, |
| "learning_rate": 1.984015044663846e-05, |
| "loss": 1.1689, |
| "mean_token_accuracy": 0.7355200052261353, |
| "num_tokens": 128288.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.02619383437437034, |
| "grad_norm": 12.6875, |
| "learning_rate": 1.982671771106186e-05, |
| "loss": 1.2324, |
| "mean_token_accuracy": 0.7224856972694397, |
| "num_tokens": 139627.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.028208744710860368, |
| "grad_norm": 11.5625, |
| "learning_rate": 1.981328497548526e-05, |
| "loss": 1.1365, |
| "mean_token_accuracy": 0.7402825653553009, |
| "num_tokens": 150498.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.030223655047350393, |
| "grad_norm": 14.75, |
| "learning_rate": 1.979985223990866e-05, |
| "loss": 1.1178, |
| "mean_token_accuracy": 0.7426175236701965, |
| "num_tokens": 161754.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03223856538384042, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.978641950433206e-05, |
| "loss": 1.2596, |
| "mean_token_accuracy": 0.7134447395801544, |
| "num_tokens": 173087.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.03425347572033045, |
| "grad_norm": 12.75, |
| "learning_rate": 1.9772986768755458e-05, |
| "loss": 1.0652, |
| "mean_token_accuracy": 0.7474986433982849, |
| "num_tokens": 184747.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.03626838605682047, |
| "grad_norm": 11.8125, |
| "learning_rate": 1.9759554033178857e-05, |
| "loss": 1.1436, |
| "mean_token_accuracy": 0.7323237180709838, |
| "num_tokens": 195331.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0382832963933105, |
| "grad_norm": 9.875, |
| "learning_rate": 1.974612129760226e-05, |
| "loss": 1.0312, |
| "mean_token_accuracy": 0.7625056743621826, |
| "num_tokens": 208260.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.04029820672980052, |
| "grad_norm": 14.9375, |
| "learning_rate": 1.9732688562025658e-05, |
| "loss": 1.0084, |
| "mean_token_accuracy": 0.7631498157978058, |
| "num_tokens": 218822.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04231311706629055, |
| "grad_norm": 11.625, |
| "learning_rate": 1.9719255826449057e-05, |
| "loss": 0.9813, |
| "mean_token_accuracy": 0.7651655077934265, |
| "num_tokens": 228580.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.04432802740278058, |
| "grad_norm": 17.875, |
| "learning_rate": 1.970582309087246e-05, |
| "loss": 1.07, |
| "mean_token_accuracy": 0.7532146275043488, |
| "num_tokens": 239159.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.046342937739270604, |
| "grad_norm": 11.25, |
| "learning_rate": 1.9692390355295858e-05, |
| "loss": 1.113, |
| "mean_token_accuracy": 0.7436384916305542, |
| "num_tokens": 251695.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.04835784807576063, |
| "grad_norm": 13.375, |
| "learning_rate": 1.9678957619719257e-05, |
| "loss": 0.929, |
| "mean_token_accuracy": 0.7755303025245667, |
| "num_tokens": 261128.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.050372758412250654, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.9665524884142656e-05, |
| "loss": 1.0999, |
| "mean_token_accuracy": 0.7514171898365021, |
| "num_tokens": 271560.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.05238766874874068, |
| "grad_norm": 13.0, |
| "learning_rate": 1.9652092148566058e-05, |
| "loss": 1.0339, |
| "mean_token_accuracy": 0.7604846298694611, |
| "num_tokens": 282223.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.054402579085230704, |
| "grad_norm": 12.9375, |
| "learning_rate": 1.9638659412989457e-05, |
| "loss": 1.0473, |
| "mean_token_accuracy": 0.7622893512248993, |
| "num_tokens": 292726.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.056417489421720736, |
| "grad_norm": 15.0, |
| "learning_rate": 1.9625226677412856e-05, |
| "loss": 0.9894, |
| "mean_token_accuracy": 0.764206200838089, |
| "num_tokens": 303785.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.05843239975821076, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.9611793941836258e-05, |
| "loss": 1.109, |
| "mean_token_accuracy": 0.7469749927520752, |
| "num_tokens": 314725.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.060447310094700786, |
| "grad_norm": 12.625, |
| "learning_rate": 1.9598361206259657e-05, |
| "loss": 1.2098, |
| "mean_token_accuracy": 0.718773603439331, |
| "num_tokens": 326635.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.06246222043119081, |
| "grad_norm": 11.125, |
| "learning_rate": 1.9584928470683055e-05, |
| "loss": 1.1025, |
| "mean_token_accuracy": 0.7460452795028687, |
| "num_tokens": 337866.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.06447713076768084, |
| "grad_norm": 10.6875, |
| "learning_rate": 1.9571495735106458e-05, |
| "loss": 1.0772, |
| "mean_token_accuracy": 0.7526730418205261, |
| "num_tokens": 348512.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.06649204110417087, |
| "grad_norm": 13.375, |
| "learning_rate": 1.9558062999529857e-05, |
| "loss": 1.157, |
| "mean_token_accuracy": 0.7320702195167541, |
| "num_tokens": 360281.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0685069514406609, |
| "grad_norm": 12.0, |
| "learning_rate": 1.9544630263953255e-05, |
| "loss": 1.0157, |
| "mean_token_accuracy": 0.760700649023056, |
| "num_tokens": 371068.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.07052186177715092, |
| "grad_norm": 17.375, |
| "learning_rate": 1.9531197528376654e-05, |
| "loss": 0.8851, |
| "mean_token_accuracy": 0.7925353944301605, |
| "num_tokens": 380947.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.07253677211364094, |
| "grad_norm": 11.3125, |
| "learning_rate": 1.9517764792800056e-05, |
| "loss": 1.0325, |
| "mean_token_accuracy": 0.7617525398731232, |
| "num_tokens": 391552.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.07455168245013097, |
| "grad_norm": 10.9375, |
| "learning_rate": 1.9504332057223455e-05, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.7655075788497925, |
| "num_tokens": 403321.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.076566592786621, |
| "grad_norm": 11.1875, |
| "learning_rate": 1.9490899321646854e-05, |
| "loss": 1.0527, |
| "mean_token_accuracy": 0.7569321393966675, |
| "num_tokens": 414435.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.07858150312311102, |
| "grad_norm": 14.6875, |
| "learning_rate": 1.9477466586070256e-05, |
| "loss": 0.9602, |
| "mean_token_accuracy": 0.7720924854278565, |
| "num_tokens": 423506.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.08059641345960104, |
| "grad_norm": 11.0, |
| "learning_rate": 1.9464033850493655e-05, |
| "loss": 1.0475, |
| "mean_token_accuracy": 0.7505548059940338, |
| "num_tokens": 436556.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.08261132379609107, |
| "grad_norm": 11.9375, |
| "learning_rate": 1.9450601114917054e-05, |
| "loss": 1.0775, |
| "mean_token_accuracy": 0.7474610984325409, |
| "num_tokens": 448248.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0846262341325811, |
| "grad_norm": 10.25, |
| "learning_rate": 1.9437168379340453e-05, |
| "loss": 1.0487, |
| "mean_token_accuracy": 0.7566307663917542, |
| "num_tokens": 460102.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.08664114446907113, |
| "grad_norm": 12.0625, |
| "learning_rate": 1.9423735643763855e-05, |
| "loss": 0.9919, |
| "mean_token_accuracy": 0.7676237523555756, |
| "num_tokens": 471590.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.08865605480556116, |
| "grad_norm": 13.125, |
| "learning_rate": 1.9410302908187254e-05, |
| "loss": 1.0473, |
| "mean_token_accuracy": 0.7525161623954773, |
| "num_tokens": 482096.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.09067096514205118, |
| "grad_norm": 13.4375, |
| "learning_rate": 1.9396870172610653e-05, |
| "loss": 1.0347, |
| "mean_token_accuracy": 0.7515169024467468, |
| "num_tokens": 493585.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.09268587547854121, |
| "grad_norm": 10.9375, |
| "learning_rate": 1.9383437437034055e-05, |
| "loss": 1.0487, |
| "mean_token_accuracy": 0.7547510921955108, |
| "num_tokens": 505989.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.09470078581503123, |
| "grad_norm": 12.25, |
| "learning_rate": 1.9370004701457454e-05, |
| "loss": 1.018, |
| "mean_token_accuracy": 0.7596003413200378, |
| "num_tokens": 516900.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.09671569615152126, |
| "grad_norm": 11.1875, |
| "learning_rate": 1.9356571965880853e-05, |
| "loss": 0.9797, |
| "mean_token_accuracy": 0.7699940800666809, |
| "num_tokens": 526427.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.09873060648801128, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.9343139230304255e-05, |
| "loss": 1.0817, |
| "mean_token_accuracy": 0.7470319092273712, |
| "num_tokens": 537981.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.10074551682450131, |
| "grad_norm": 13.25, |
| "learning_rate": 1.9329706494727654e-05, |
| "loss": 1.0089, |
| "mean_token_accuracy": 0.7595715343952179, |
| "num_tokens": 549174.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.10276042716099133, |
| "grad_norm": 12.625, |
| "learning_rate": 1.9316273759151052e-05, |
| "loss": 1.0164, |
| "mean_token_accuracy": 0.7571583390235901, |
| "num_tokens": 559988.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.10477533749748136, |
| "grad_norm": 14.3125, |
| "learning_rate": 1.930284102357445e-05, |
| "loss": 1.1148, |
| "mean_token_accuracy": 0.7423564851284027, |
| "num_tokens": 571510.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.10679024783397138, |
| "grad_norm": 14.6875, |
| "learning_rate": 1.9289408287997854e-05, |
| "loss": 1.053, |
| "mean_token_accuracy": 0.7485374748706818, |
| "num_tokens": 583020.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.10880515817046141, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.9275975552421252e-05, |
| "loss": 0.9756, |
| "mean_token_accuracy": 0.7606720209121705, |
| "num_tokens": 594042.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.11082006850695145, |
| "grad_norm": 13.3125, |
| "learning_rate": 1.926254281684465e-05, |
| "loss": 0.9514, |
| "mean_token_accuracy": 0.7702824532985687, |
| "num_tokens": 605932.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.11283497884344147, |
| "grad_norm": 10.625, |
| "learning_rate": 1.9249110081268053e-05, |
| "loss": 1.0008, |
| "mean_token_accuracy": 0.7583375632762909, |
| "num_tokens": 617431.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.1148498891799315, |
| "grad_norm": 10.875, |
| "learning_rate": 1.9235677345691452e-05, |
| "loss": 0.998, |
| "mean_token_accuracy": 0.7597042858600617, |
| "num_tokens": 629827.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.11686479951642152, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.922224461011485e-05, |
| "loss": 0.9512, |
| "mean_token_accuracy": 0.7806954503059387, |
| "num_tokens": 640144.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.11887970985291155, |
| "grad_norm": 10.5625, |
| "learning_rate": 1.920881187453825e-05, |
| "loss": 0.9292, |
| "mean_token_accuracy": 0.7761410176753998, |
| "num_tokens": 652386.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.12089462018940157, |
| "grad_norm": 11.0, |
| "learning_rate": 1.9195379138961652e-05, |
| "loss": 1.0768, |
| "mean_token_accuracy": 0.7544383645057678, |
| "num_tokens": 663460.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1229095305258916, |
| "grad_norm": 14.3125, |
| "learning_rate": 1.918194640338505e-05, |
| "loss": 0.8975, |
| "mean_token_accuracy": 0.7799494147300721, |
| "num_tokens": 673425.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.12492444086238162, |
| "grad_norm": 10.375, |
| "learning_rate": 1.916851366780845e-05, |
| "loss": 0.899, |
| "mean_token_accuracy": 0.7885317802429199, |
| "num_tokens": 683817.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.12693935119887165, |
| "grad_norm": 13.375, |
| "learning_rate": 1.9155080932231852e-05, |
| "loss": 0.998, |
| "mean_token_accuracy": 0.7671383440494537, |
| "num_tokens": 694196.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.12895426153536169, |
| "grad_norm": 11.625, |
| "learning_rate": 1.914164819665525e-05, |
| "loss": 0.9808, |
| "mean_token_accuracy": 0.7700311303138733, |
| "num_tokens": 704564.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.1309691718718517, |
| "grad_norm": 13.25, |
| "learning_rate": 1.912821546107865e-05, |
| "loss": 1.0077, |
| "mean_token_accuracy": 0.7643253684043885, |
| "num_tokens": 715775.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.13298408220834174, |
| "grad_norm": 13.625, |
| "learning_rate": 1.911478272550205e-05, |
| "loss": 0.9457, |
| "mean_token_accuracy": 0.7678769171237946, |
| "num_tokens": 726005.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.13499899254483175, |
| "grad_norm": 13.5, |
| "learning_rate": 1.910134998992545e-05, |
| "loss": 1.0155, |
| "mean_token_accuracy": 0.7607427120208741, |
| "num_tokens": 738053.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.1370139028813218, |
| "grad_norm": 11.6875, |
| "learning_rate": 1.908791725434885e-05, |
| "loss": 0.9395, |
| "mean_token_accuracy": 0.7723658442497253, |
| "num_tokens": 748480.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.1390288132178118, |
| "grad_norm": 15.6875, |
| "learning_rate": 1.907448451877225e-05, |
| "loss": 0.9639, |
| "mean_token_accuracy": 0.7676171123981476, |
| "num_tokens": 759972.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.14104372355430184, |
| "grad_norm": 12.875, |
| "learning_rate": 1.906105178319565e-05, |
| "loss": 0.9557, |
| "mean_token_accuracy": 0.7719902992248535, |
| "num_tokens": 771123.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.14305863389079185, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.904761904761905e-05, |
| "loss": 1.0022, |
| "mean_token_accuracy": 0.7667870819568634, |
| "num_tokens": 782532.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1450735442272819, |
| "grad_norm": 11.0625, |
| "learning_rate": 1.903418631204245e-05, |
| "loss": 0.9519, |
| "mean_token_accuracy": 0.7708106875419617, |
| "num_tokens": 794067.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.14708845456377193, |
| "grad_norm": 14.125, |
| "learning_rate": 1.902075357646585e-05, |
| "loss": 0.9718, |
| "mean_token_accuracy": 0.766555666923523, |
| "num_tokens": 804871.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.14910336490026194, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.900732084088925e-05, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.7678309619426728, |
| "num_tokens": 815050.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.15111827523675198, |
| "grad_norm": 11.3125, |
| "learning_rate": 1.8993888105312648e-05, |
| "loss": 0.9951, |
| "mean_token_accuracy": 0.7627758264541626, |
| "num_tokens": 826248.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.153133185573242, |
| "grad_norm": 17.25, |
| "learning_rate": 1.8980455369736047e-05, |
| "loss": 1.0433, |
| "mean_token_accuracy": 0.7571396887302398, |
| "num_tokens": 835706.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.15514809590973203, |
| "grad_norm": 10.9375, |
| "learning_rate": 1.896702263415945e-05, |
| "loss": 1.0518, |
| "mean_token_accuracy": 0.7517435431480408, |
| "num_tokens": 847261.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.15716300624622204, |
| "grad_norm": 10.4375, |
| "learning_rate": 1.8953589898582848e-05, |
| "loss": 0.9629, |
| "mean_token_accuracy": 0.7732720315456391, |
| "num_tokens": 858655.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.15917791658271208, |
| "grad_norm": 12.5, |
| "learning_rate": 1.8940157163006247e-05, |
| "loss": 1.0231, |
| "mean_token_accuracy": 0.7555422127246857, |
| "num_tokens": 870002.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.1611928269192021, |
| "grad_norm": 11.0, |
| "learning_rate": 1.892672442742965e-05, |
| "loss": 1.1283, |
| "mean_token_accuracy": 0.7441882312297821, |
| "num_tokens": 881131.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.16320773725569213, |
| "grad_norm": 12.4375, |
| "learning_rate": 1.8913291691853048e-05, |
| "loss": 1.0252, |
| "mean_token_accuracy": 0.7630669414997101, |
| "num_tokens": 893437.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.16522264759218214, |
| "grad_norm": 11.0, |
| "learning_rate": 1.8899858956276447e-05, |
| "loss": 1.0528, |
| "mean_token_accuracy": 0.7483877301216125, |
| "num_tokens": 904976.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.16723755792867218, |
| "grad_norm": 12.375, |
| "learning_rate": 1.8886426220699846e-05, |
| "loss": 0.8715, |
| "mean_token_accuracy": 0.7899761021137237, |
| "num_tokens": 915631.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.1692524682651622, |
| "grad_norm": 13.375, |
| "learning_rate": 1.8872993485123248e-05, |
| "loss": 1.0548, |
| "mean_token_accuracy": 0.7494987368583679, |
| "num_tokens": 927141.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.17126737860165223, |
| "grad_norm": 11.0, |
| "learning_rate": 1.8859560749546647e-05, |
| "loss": 0.9579, |
| "mean_token_accuracy": 0.7668360054492951, |
| "num_tokens": 938792.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.17328228893814226, |
| "grad_norm": 13.125, |
| "learning_rate": 1.8846128013970046e-05, |
| "loss": 0.8595, |
| "mean_token_accuracy": 0.7870603501796722, |
| "num_tokens": 949894.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.17529719927463228, |
| "grad_norm": 12.625, |
| "learning_rate": 1.8832695278393448e-05, |
| "loss": 0.9216, |
| "mean_token_accuracy": 0.7846542239189148, |
| "num_tokens": 961003.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.17731210961112231, |
| "grad_norm": 12.125, |
| "learning_rate": 1.8819262542816847e-05, |
| "loss": 1.0052, |
| "mean_token_accuracy": 0.7603223979473114, |
| "num_tokens": 971577.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.17932701994761233, |
| "grad_norm": 12.4375, |
| "learning_rate": 1.8805829807240245e-05, |
| "loss": 0.9299, |
| "mean_token_accuracy": 0.7757908642292023, |
| "num_tokens": 982234.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.18134193028410237, |
| "grad_norm": 11.0, |
| "learning_rate": 1.8792397071663648e-05, |
| "loss": 1.0312, |
| "mean_token_accuracy": 0.7591780245304107, |
| "num_tokens": 992997.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.18335684062059238, |
| "grad_norm": 10.5625, |
| "learning_rate": 1.8778964336087047e-05, |
| "loss": 0.8999, |
| "mean_token_accuracy": 0.779550439119339, |
| "num_tokens": 1004102.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.18537175095708242, |
| "grad_norm": 12.625, |
| "learning_rate": 1.8765531600510445e-05, |
| "loss": 0.8892, |
| "mean_token_accuracy": 0.7890210688114166, |
| "num_tokens": 1015447.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.18738666129357243, |
| "grad_norm": 12.125, |
| "learning_rate": 1.8752098864933844e-05, |
| "loss": 1.0344, |
| "mean_token_accuracy": 0.7584980130195618, |
| "num_tokens": 1026939.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.18940157163006247, |
| "grad_norm": 10.9375, |
| "learning_rate": 1.8738666129357246e-05, |
| "loss": 0.9686, |
| "mean_token_accuracy": 0.7649740993976593, |
| "num_tokens": 1037937.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.19141648196655248, |
| "grad_norm": 8.75, |
| "learning_rate": 1.8725233393780645e-05, |
| "loss": 1.0364, |
| "mean_token_accuracy": 0.7554452955722809, |
| "num_tokens": 1049173.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.19343139230304252, |
| "grad_norm": 13.625, |
| "learning_rate": 1.8711800658204044e-05, |
| "loss": 1.0173, |
| "mean_token_accuracy": 0.7559767007827759, |
| "num_tokens": 1060166.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.19544630263953255, |
| "grad_norm": 11.1875, |
| "learning_rate": 1.8698367922627446e-05, |
| "loss": 0.9464, |
| "mean_token_accuracy": 0.7735530078411103, |
| "num_tokens": 1070458.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.19746121297602257, |
| "grad_norm": 11.1875, |
| "learning_rate": 1.8684935187050845e-05, |
| "loss": 0.9397, |
| "mean_token_accuracy": 0.7724673867225647, |
| "num_tokens": 1081477.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.1994761233125126, |
| "grad_norm": 15.1875, |
| "learning_rate": 1.8671502451474244e-05, |
| "loss": 1.0769, |
| "mean_token_accuracy": 0.7459556341171265, |
| "num_tokens": 1094205.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.20149103364900262, |
| "grad_norm": 16.5, |
| "learning_rate": 1.8658069715897643e-05, |
| "loss": 0.9763, |
| "mean_token_accuracy": 0.7707934081554413, |
| "num_tokens": 1104929.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.20350594398549265, |
| "grad_norm": 9.9375, |
| "learning_rate": 1.8644636980321045e-05, |
| "loss": 0.9065, |
| "mean_token_accuracy": 0.7750193297863006, |
| "num_tokens": 1115780.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.20552085432198267, |
| "grad_norm": 15.5, |
| "learning_rate": 1.8631204244744444e-05, |
| "loss": 0.9421, |
| "mean_token_accuracy": 0.7709006071090698, |
| "num_tokens": 1127078.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.2075357646584727, |
| "grad_norm": 11.5625, |
| "learning_rate": 1.8617771509167843e-05, |
| "loss": 1.0089, |
| "mean_token_accuracy": 0.7673897624015809, |
| "num_tokens": 1138685.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.20955067499496272, |
| "grad_norm": 11.875, |
| "learning_rate": 1.8604338773591245e-05, |
| "loss": 0.9082, |
| "mean_token_accuracy": 0.7804294168949127, |
| "num_tokens": 1149508.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.21156558533145275, |
| "grad_norm": 13.1875, |
| "learning_rate": 1.8590906038014644e-05, |
| "loss": 0.9128, |
| "mean_token_accuracy": 0.7730132281780243, |
| "num_tokens": 1159971.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.21358049566794277, |
| "grad_norm": 15.5625, |
| "learning_rate": 1.8577473302438043e-05, |
| "loss": 0.8863, |
| "mean_token_accuracy": 0.7842482626438141, |
| "num_tokens": 1170506.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.2155954060044328, |
| "grad_norm": 10.1875, |
| "learning_rate": 1.8564040566861445e-05, |
| "loss": 1.0306, |
| "mean_token_accuracy": 0.7470630705356598, |
| "num_tokens": 1183402.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.21761031634092282, |
| "grad_norm": 13.4375, |
| "learning_rate": 1.8550607831284844e-05, |
| "loss": 0.9829, |
| "mean_token_accuracy": 0.7678338825702667, |
| "num_tokens": 1193700.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.21962522667741285, |
| "grad_norm": 10.875, |
| "learning_rate": 1.8537175095708242e-05, |
| "loss": 1.0178, |
| "mean_token_accuracy": 0.7664987504482269, |
| "num_tokens": 1204501.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.2216401370139029, |
| "grad_norm": 10.9375, |
| "learning_rate": 1.852374236013164e-05, |
| "loss": 0.9276, |
| "mean_token_accuracy": 0.7776144444942474, |
| "num_tokens": 1214622.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2236550473503929, |
| "grad_norm": 12.125, |
| "learning_rate": 1.8510309624555044e-05, |
| "loss": 0.9235, |
| "mean_token_accuracy": 0.7812209010124207, |
| "num_tokens": 1225266.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.22566995768688294, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.8496876888978442e-05, |
| "loss": 0.8635, |
| "mean_token_accuracy": 0.7839280545711518, |
| "num_tokens": 1236214.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.22768486802337295, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.848344415340184e-05, |
| "loss": 0.9995, |
| "mean_token_accuracy": 0.7634225428104401, |
| "num_tokens": 1248434.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.229699778359863, |
| "grad_norm": 14.4375, |
| "learning_rate": 1.8470011417825243e-05, |
| "loss": 0.8734, |
| "mean_token_accuracy": 0.7929128646850586, |
| "num_tokens": 1258925.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.231714688696353, |
| "grad_norm": 11.375, |
| "learning_rate": 1.845657868224864e-05, |
| "loss": 0.8612, |
| "mean_token_accuracy": 0.7883239209651947, |
| "num_tokens": 1268877.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.23372959903284304, |
| "grad_norm": 9.375, |
| "learning_rate": 1.844314594667204e-05, |
| "loss": 0.8697, |
| "mean_token_accuracy": 0.782884806394577, |
| "num_tokens": 1280712.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.23574450936933306, |
| "grad_norm": 12.4375, |
| "learning_rate": 1.842971321109544e-05, |
| "loss": 0.9373, |
| "mean_token_accuracy": 0.7709940969944, |
| "num_tokens": 1291740.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.2377594197058231, |
| "grad_norm": 12.4375, |
| "learning_rate": 1.8416280475518842e-05, |
| "loss": 1.0077, |
| "mean_token_accuracy": 0.7596822798252105, |
| "num_tokens": 1303009.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.2397743300423131, |
| "grad_norm": 9.5625, |
| "learning_rate": 1.840284773994224e-05, |
| "loss": 0.9671, |
| "mean_token_accuracy": 0.7675224483013153, |
| "num_tokens": 1314524.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.24178924037880314, |
| "grad_norm": 14.6875, |
| "learning_rate": 1.838941500436564e-05, |
| "loss": 0.8832, |
| "mean_token_accuracy": 0.7861056625843048, |
| "num_tokens": 1327497.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.24380415071529318, |
| "grad_norm": 10.75, |
| "learning_rate": 1.8375982268789042e-05, |
| "loss": 0.8841, |
| "mean_token_accuracy": 0.785036051273346, |
| "num_tokens": 1338614.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.2458190610517832, |
| "grad_norm": 13.9375, |
| "learning_rate": 1.836254953321244e-05, |
| "loss": 0.9576, |
| "mean_token_accuracy": 0.77821044921875, |
| "num_tokens": 1348997.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.24783397138827323, |
| "grad_norm": 11.3125, |
| "learning_rate": 1.834911679763584e-05, |
| "loss": 0.9204, |
| "mean_token_accuracy": 0.7739447593688965, |
| "num_tokens": 1360384.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.24984888172476324, |
| "grad_norm": 11.8125, |
| "learning_rate": 1.833568406205924e-05, |
| "loss": 0.9523, |
| "mean_token_accuracy": 0.7746530413627625, |
| "num_tokens": 1371506.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.25186379206125326, |
| "grad_norm": 11.6875, |
| "learning_rate": 1.832225132648264e-05, |
| "loss": 1.0415, |
| "mean_token_accuracy": 0.7526679396629333, |
| "num_tokens": 1383841.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.2538787023977433, |
| "grad_norm": 11.0, |
| "learning_rate": 1.830881859090604e-05, |
| "loss": 1.0038, |
| "mean_token_accuracy": 0.7654858827590942, |
| "num_tokens": 1395211.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.25589361273423333, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.829538585532944e-05, |
| "loss": 0.9847, |
| "mean_token_accuracy": 0.769145131111145, |
| "num_tokens": 1405181.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.25790852307072337, |
| "grad_norm": 10.8125, |
| "learning_rate": 1.828195311975284e-05, |
| "loss": 1.0403, |
| "mean_token_accuracy": 0.7538439452648162, |
| "num_tokens": 1415965.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.25992343340721336, |
| "grad_norm": 10.25, |
| "learning_rate": 1.826852038417624e-05, |
| "loss": 0.8642, |
| "mean_token_accuracy": 0.7828892707824707, |
| "num_tokens": 1427838.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.2619383437437034, |
| "grad_norm": 16.125, |
| "learning_rate": 1.825508764859964e-05, |
| "loss": 1.0695, |
| "mean_token_accuracy": 0.7503586292266846, |
| "num_tokens": 1438672.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.26395325408019343, |
| "grad_norm": 9.375, |
| "learning_rate": 1.824165491302304e-05, |
| "loss": 0.9433, |
| "mean_token_accuracy": 0.7743871629238128, |
| "num_tokens": 1450338.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.2659681644166835, |
| "grad_norm": 12.5, |
| "learning_rate": 1.8228222177446436e-05, |
| "loss": 1.0234, |
| "mean_token_accuracy": 0.7584192335605622, |
| "num_tokens": 1462159.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.2679830747531735, |
| "grad_norm": 12.3125, |
| "learning_rate": 1.8214789441869838e-05, |
| "loss": 0.9743, |
| "mean_token_accuracy": 0.765831732749939, |
| "num_tokens": 1475528.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.2699979850896635, |
| "grad_norm": 12.125, |
| "learning_rate": 1.8201356706293237e-05, |
| "loss": 0.9147, |
| "mean_token_accuracy": 0.7787733376026154, |
| "num_tokens": 1484980.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.27201289542615353, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.818792397071664e-05, |
| "loss": 0.9997, |
| "mean_token_accuracy": 0.7686746776103973, |
| "num_tokens": 1496744.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.2740278057626436, |
| "grad_norm": 11.1875, |
| "learning_rate": 1.8174491235140038e-05, |
| "loss": 0.8834, |
| "mean_token_accuracy": 0.791484820842743, |
| "num_tokens": 1507317.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.2760427160991336, |
| "grad_norm": 12.0, |
| "learning_rate": 1.8161058499563437e-05, |
| "loss": 0.9816, |
| "mean_token_accuracy": 0.7709372580051422, |
| "num_tokens": 1519459.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.2780576264356236, |
| "grad_norm": 12.0, |
| "learning_rate": 1.814762576398684e-05, |
| "loss": 0.9477, |
| "mean_token_accuracy": 0.7731155812740326, |
| "num_tokens": 1530464.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.28007253677211363, |
| "grad_norm": 14.625, |
| "learning_rate": 1.8134193028410235e-05, |
| "loss": 0.9117, |
| "mean_token_accuracy": 0.780947208404541, |
| "num_tokens": 1541480.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.2820874471086037, |
| "grad_norm": 12.125, |
| "learning_rate": 1.8120760292833637e-05, |
| "loss": 0.8446, |
| "mean_token_accuracy": 0.7891036987304687, |
| "num_tokens": 1552611.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.2841023574450937, |
| "grad_norm": 15.5625, |
| "learning_rate": 1.8107327557257036e-05, |
| "loss": 0.8572, |
| "mean_token_accuracy": 0.7868121325969696, |
| "num_tokens": 1563258.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.2861172677815837, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.8093894821680438e-05, |
| "loss": 0.8396, |
| "mean_token_accuracy": 0.7922836720943451, |
| "num_tokens": 1575060.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.28813217811807373, |
| "grad_norm": 16.25, |
| "learning_rate": 1.8080462086103837e-05, |
| "loss": 0.9779, |
| "mean_token_accuracy": 0.7661596953868866, |
| "num_tokens": 1586846.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.2901470884545638, |
| "grad_norm": 11.9375, |
| "learning_rate": 1.8067029350527236e-05, |
| "loss": 0.9174, |
| "mean_token_accuracy": 0.7865382909774781, |
| "num_tokens": 1597526.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.2921619987910538, |
| "grad_norm": 12.125, |
| "learning_rate": 1.8053596614950638e-05, |
| "loss": 1.1157, |
| "mean_token_accuracy": 0.733438128232956, |
| "num_tokens": 1608463.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.29417690912754385, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.8040163879374037e-05, |
| "loss": 0.9306, |
| "mean_token_accuracy": 0.7765897631645202, |
| "num_tokens": 1619939.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.29619181946403383, |
| "grad_norm": 11.125, |
| "learning_rate": 1.8026731143797435e-05, |
| "loss": 0.9663, |
| "mean_token_accuracy": 0.773787796497345, |
| "num_tokens": 1630503.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.2982067298005239, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.8013298408220838e-05, |
| "loss": 0.8462, |
| "mean_token_accuracy": 0.793005895614624, |
| "num_tokens": 1641658.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.3002216401370139, |
| "grad_norm": 15.875, |
| "learning_rate": 1.7999865672644233e-05, |
| "loss": 0.8524, |
| "mean_token_accuracy": 0.7874381899833679, |
| "num_tokens": 1652188.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.30223655047350395, |
| "grad_norm": 10.9375, |
| "learning_rate": 1.7986432937067635e-05, |
| "loss": 1.0263, |
| "mean_token_accuracy": 0.7567296206951142, |
| "num_tokens": 1663193.0, |
| "step": 1500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 14889, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2012425910605824.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |