{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.10074551682450131,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020149103364900263,
      "grad_norm": 18.375,
      "learning_rate": 1.998791053798106e-05,
      "loss": 1.9277,
      "mean_token_accuracy": 0.679860633611679,
      "num_tokens": 9373.0,
      "step": 10
    },
    {
      "epoch": 0.004029820672980053,
      "grad_norm": 13.5625,
      "learning_rate": 1.9974477802404462e-05,
      "loss": 1.2796,
      "mean_token_accuracy": 0.7233692526817321,
      "num_tokens": 20789.0,
      "step": 20
    },
    {
      "epoch": 0.006044731009470079,
      "grad_norm": 13.0,
      "learning_rate": 1.996104506682786e-05,
      "loss": 1.2607,
      "mean_token_accuracy": 0.7299719333648682,
      "num_tokens": 32661.0,
      "step": 30
    },
    {
      "epoch": 0.008059641345960105,
      "grad_norm": 12.8125,
      "learning_rate": 1.994761233125126e-05,
      "loss": 1.2356,
      "mean_token_accuracy": 0.7324558198451996,
      "num_tokens": 43049.0,
      "step": 40
    },
    {
      "epoch": 0.01007455168245013,
      "grad_norm": 12.125,
      "learning_rate": 1.9934179595674662e-05,
      "loss": 1.1324,
      "mean_token_accuracy": 0.7531639993190765,
      "num_tokens": 52956.0,
      "step": 50
    },
    {
      "epoch": 0.012089462018940157,
      "grad_norm": 16.125,
      "learning_rate": 1.992074686009806e-05,
      "loss": 1.1775,
      "mean_token_accuracy": 0.7408373892307282,
      "num_tokens": 63513.0,
      "step": 60
    },
    {
      "epoch": 0.014104372355430184,
      "grad_norm": 14.0625,
      "learning_rate": 1.990731412452146e-05,
      "loss": 1.2446,
      "mean_token_accuracy": 0.7307547807693482,
      "num_tokens": 74794.0,
      "step": 70
    },
    {
      "epoch": 0.01611928269192021,
      "grad_norm": 11.875,
      "learning_rate": 1.989388138894486e-05,
      "loss": 1.2428,
      "mean_token_accuracy": 0.7255984365940094,
      "num_tokens": 86903.0,
      "step": 80
    },
    {
      "epoch": 0.018134193028410236,
      "grad_norm": 14.4375,
      "learning_rate": 1.988044865336826e-05,
      "loss": 1.2766,
      "mean_token_accuracy": 0.7225647568702698,
      "num_tokens": 97159.0,
      "step": 90
    },
    {
      "epoch": 0.02014910336490026,
      "grad_norm": 12.5625,
      "learning_rate": 1.986701591779166e-05,
      "loss": 1.1458,
      "mean_token_accuracy": 0.7415299773216247,
      "num_tokens": 107437.0,
      "step": 100
    },
    {
      "epoch": 0.02216401370139029,
      "grad_norm": 16.75,
      "learning_rate": 1.985358318221506e-05,
      "loss": 1.2748,
      "mean_token_accuracy": 0.7202155470848084,
      "num_tokens": 117867.0,
      "step": 110
    },
    {
      "epoch": 0.024178924037880314,
      "grad_norm": 18.125,
      "learning_rate": 1.984015044663846e-05,
      "loss": 1.1689,
      "mean_token_accuracy": 0.7355200052261353,
      "num_tokens": 128288.0,
      "step": 120
    },
    {
      "epoch": 0.02619383437437034,
      "grad_norm": 12.6875,
      "learning_rate": 1.982671771106186e-05,
      "loss": 1.2324,
      "mean_token_accuracy": 0.7224856972694397,
      "num_tokens": 139627.0,
      "step": 130
    },
    {
      "epoch": 0.028208744710860368,
      "grad_norm": 11.5625,
      "learning_rate": 1.981328497548526e-05,
      "loss": 1.1365,
      "mean_token_accuracy": 0.7402825653553009,
      "num_tokens": 150498.0,
      "step": 140
    },
    {
      "epoch": 0.030223655047350393,
      "grad_norm": 14.75,
      "learning_rate": 1.979985223990866e-05,
      "loss": 1.1178,
      "mean_token_accuracy": 0.7426175236701965,
      "num_tokens": 161754.0,
      "step": 150
    },
    {
      "epoch": 0.03223856538384042,
      "grad_norm": 11.4375,
      "learning_rate": 1.978641950433206e-05,
      "loss": 1.2596,
      "mean_token_accuracy": 0.7134447395801544,
      "num_tokens": 173087.0,
      "step": 160
    },
    {
      "epoch": 0.03425347572033045,
      "grad_norm": 12.75,
      "learning_rate": 1.9772986768755458e-05,
      "loss": 1.0652,
      "mean_token_accuracy": 0.7474986433982849,
      "num_tokens": 184747.0,
      "step": 170
    },
    {
      "epoch": 0.03626838605682047,
      "grad_norm": 11.8125,
      "learning_rate": 1.9759554033178857e-05,
      "loss": 1.1436,
      "mean_token_accuracy": 0.7323237180709838,
      "num_tokens": 195331.0,
      "step": 180
    },
    {
      "epoch": 0.0382832963933105,
      "grad_norm": 9.875,
      "learning_rate": 1.974612129760226e-05,
      "loss": 1.0312,
      "mean_token_accuracy": 0.7625056743621826,
      "num_tokens": 208260.0,
      "step": 190
    },
    {
      "epoch": 0.04029820672980052,
      "grad_norm": 14.9375,
      "learning_rate": 1.9732688562025658e-05,
      "loss": 1.0084,
      "mean_token_accuracy": 0.7631498157978058,
      "num_tokens": 218822.0,
      "step": 200
    },
    {
      "epoch": 0.04231311706629055,
      "grad_norm": 11.625,
      "learning_rate": 1.9719255826449057e-05,
      "loss": 0.9813,
      "mean_token_accuracy": 0.7651655077934265,
      "num_tokens": 228580.0,
      "step": 210
    },
    {
      "epoch": 0.04432802740278058,
      "grad_norm": 17.875,
      "learning_rate": 1.970582309087246e-05,
      "loss": 1.07,
      "mean_token_accuracy": 0.7532146275043488,
      "num_tokens": 239159.0,
      "step": 220
    },
    {
      "epoch": 0.046342937739270604,
      "grad_norm": 11.25,
      "learning_rate": 1.9692390355295858e-05,
      "loss": 1.113,
      "mean_token_accuracy": 0.7436384916305542,
      "num_tokens": 251695.0,
      "step": 230
    },
    {
      "epoch": 0.04835784807576063,
      "grad_norm": 13.375,
      "learning_rate": 1.9678957619719257e-05,
      "loss": 0.929,
      "mean_token_accuracy": 0.7755303025245667,
      "num_tokens": 261128.0,
      "step": 240
    },
    {
      "epoch": 0.050372758412250654,
      "grad_norm": 12.8125,
      "learning_rate": 1.9665524884142656e-05,
      "loss": 1.0999,
      "mean_token_accuracy": 0.7514171898365021,
      "num_tokens": 271560.0,
      "step": 250
    },
    {
      "epoch": 0.05238766874874068,
      "grad_norm": 13.0,
      "learning_rate": 1.9652092148566058e-05,
      "loss": 1.0339,
      "mean_token_accuracy": 0.7604846298694611,
      "num_tokens": 282223.0,
      "step": 260
    },
    {
      "epoch": 0.054402579085230704,
      "grad_norm": 12.9375,
      "learning_rate": 1.9638659412989457e-05,
      "loss": 1.0473,
      "mean_token_accuracy": 0.7622893512248993,
      "num_tokens": 292726.0,
      "step": 270
    },
    {
      "epoch": 0.056417489421720736,
      "grad_norm": 15.0,
      "learning_rate": 1.9625226677412856e-05,
      "loss": 0.9894,
      "mean_token_accuracy": 0.764206200838089,
      "num_tokens": 303785.0,
      "step": 280
    },
    {
      "epoch": 0.05843239975821076,
      "grad_norm": 10.3125,
      "learning_rate": 1.9611793941836258e-05,
      "loss": 1.109,
      "mean_token_accuracy": 0.7469749927520752,
      "num_tokens": 314725.0,
      "step": 290
    },
    {
      "epoch": 0.060447310094700786,
      "grad_norm": 12.625,
      "learning_rate": 1.9598361206259657e-05,
      "loss": 1.2098,
      "mean_token_accuracy": 0.718773603439331,
      "num_tokens": 326635.0,
      "step": 300
    },
    {
      "epoch": 0.06246222043119081,
      "grad_norm": 11.125,
      "learning_rate": 1.9584928470683055e-05,
      "loss": 1.1025,
      "mean_token_accuracy": 0.7460452795028687,
      "num_tokens": 337866.0,
      "step": 310
    },
    {
      "epoch": 0.06447713076768084,
      "grad_norm": 10.6875,
      "learning_rate": 1.9571495735106458e-05,
      "loss": 1.0772,
      "mean_token_accuracy": 0.7526730418205261,
      "num_tokens": 348512.0,
      "step": 320
    },
    {
      "epoch": 0.06649204110417087,
      "grad_norm": 13.375,
      "learning_rate": 1.9558062999529857e-05,
      "loss": 1.157,
      "mean_token_accuracy": 0.7320702195167541,
      "num_tokens": 360281.0,
      "step": 330
    },
    {
      "epoch": 0.0685069514406609,
      "grad_norm": 12.0,
      "learning_rate": 1.9544630263953255e-05,
      "loss": 1.0157,
      "mean_token_accuracy": 0.760700649023056,
      "num_tokens": 371068.0,
      "step": 340
    },
    {
      "epoch": 0.07052186177715092,
      "grad_norm": 17.375,
      "learning_rate": 1.9531197528376654e-05,
      "loss": 0.8851,
      "mean_token_accuracy": 0.7925353944301605,
      "num_tokens": 380947.0,
      "step": 350
    },
    {
      "epoch": 0.07253677211364094,
      "grad_norm": 11.3125,
      "learning_rate": 1.9517764792800056e-05,
      "loss": 1.0325,
      "mean_token_accuracy": 0.7617525398731232,
      "num_tokens": 391552.0,
      "step": 360
    },
    {
      "epoch": 0.07455168245013097,
      "grad_norm": 10.9375,
      "learning_rate": 1.9504332057223455e-05,
      "loss": 0.9852,
      "mean_token_accuracy": 0.7655075788497925,
      "num_tokens": 403321.0,
      "step": 370
    },
    {
      "epoch": 0.076566592786621,
      "grad_norm": 11.1875,
      "learning_rate": 1.9490899321646854e-05,
      "loss": 1.0527,
      "mean_token_accuracy": 0.7569321393966675,
      "num_tokens": 414435.0,
      "step": 380
    },
    {
      "epoch": 0.07858150312311102,
      "grad_norm": 14.6875,
      "learning_rate": 1.9477466586070256e-05,
      "loss": 0.9602,
      "mean_token_accuracy": 0.7720924854278565,
      "num_tokens": 423506.0,
      "step": 390
    },
    {
      "epoch": 0.08059641345960104,
      "grad_norm": 11.0,
      "learning_rate": 1.9464033850493655e-05,
      "loss": 1.0475,
      "mean_token_accuracy": 0.7505548059940338,
      "num_tokens": 436556.0,
      "step": 400
    },
    {
      "epoch": 0.08261132379609107,
      "grad_norm": 11.9375,
      "learning_rate": 1.9450601114917054e-05,
      "loss": 1.0775,
      "mean_token_accuracy": 0.7474610984325409,
      "num_tokens": 448248.0,
      "step": 410
    },
    {
      "epoch": 0.0846262341325811,
      "grad_norm": 10.25,
      "learning_rate": 1.9437168379340453e-05,
      "loss": 1.0487,
      "mean_token_accuracy": 0.7566307663917542,
      "num_tokens": 460102.0,
      "step": 420
    },
    {
      "epoch": 0.08664114446907113,
      "grad_norm": 12.0625,
      "learning_rate": 1.9423735643763855e-05,
      "loss": 0.9919,
      "mean_token_accuracy": 0.7676237523555756,
      "num_tokens": 471590.0,
      "step": 430
    },
    {
      "epoch": 0.08865605480556116,
      "grad_norm": 13.125,
      "learning_rate": 1.9410302908187254e-05,
      "loss": 1.0473,
      "mean_token_accuracy": 0.7525161623954773,
      "num_tokens": 482096.0,
      "step": 440
    },
    {
      "epoch": 0.09067096514205118,
      "grad_norm": 13.4375,
      "learning_rate": 1.9396870172610653e-05,
      "loss": 1.0347,
      "mean_token_accuracy": 0.7515169024467468,
      "num_tokens": 493585.0,
      "step": 450
    },
    {
      "epoch": 0.09268587547854121,
      "grad_norm": 10.9375,
      "learning_rate": 1.9383437437034055e-05,
      "loss": 1.0487,
      "mean_token_accuracy": 0.7547510921955108,
      "num_tokens": 505989.0,
      "step": 460
    },
    {
      "epoch": 0.09470078581503123,
      "grad_norm": 12.25,
      "learning_rate": 1.9370004701457454e-05,
      "loss": 1.018,
      "mean_token_accuracy": 0.7596003413200378,
      "num_tokens": 516900.0,
      "step": 470
    },
    {
      "epoch": 0.09671569615152126,
      "grad_norm": 11.1875,
      "learning_rate": 1.9356571965880853e-05,
      "loss": 0.9797,
      "mean_token_accuracy": 0.7699940800666809,
      "num_tokens": 526427.0,
      "step": 480
    },
    {
      "epoch": 0.09873060648801128,
      "grad_norm": 10.3125,
      "learning_rate": 1.9343139230304255e-05,
      "loss": 1.0817,
      "mean_token_accuracy": 0.7470319092273712,
      "num_tokens": 537981.0,
      "step": 490
    },
    {
      "epoch": 0.10074551682450131,
      "grad_norm": 13.25,
      "learning_rate": 1.9329706494727654e-05,
      "loss": 1.0089,
      "mean_token_accuracy": 0.7595715343952179,
      "num_tokens": 549174.0,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 14889,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 668729881817088.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}