{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10074551682450131, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020149103364900263, "grad_norm": 18.375, "learning_rate": 1.998791053798106e-05, "loss": 1.9277, "mean_token_accuracy": 0.679860633611679, "num_tokens": 9373.0, "step": 10 }, { "epoch": 0.004029820672980053, "grad_norm": 13.5625, "learning_rate": 1.9974477802404462e-05, "loss": 1.2796, "mean_token_accuracy": 0.7233692526817321, "num_tokens": 20789.0, "step": 20 }, { "epoch": 0.006044731009470079, "grad_norm": 13.0, "learning_rate": 1.996104506682786e-05, "loss": 1.2607, "mean_token_accuracy": 0.7299719333648682, "num_tokens": 32661.0, "step": 30 }, { "epoch": 0.008059641345960105, "grad_norm": 12.8125, "learning_rate": 1.994761233125126e-05, "loss": 1.2356, "mean_token_accuracy": 0.7324558198451996, "num_tokens": 43049.0, "step": 40 }, { "epoch": 0.01007455168245013, "grad_norm": 12.125, "learning_rate": 1.9934179595674662e-05, "loss": 1.1324, "mean_token_accuracy": 0.7531639993190765, "num_tokens": 52956.0, "step": 50 }, { "epoch": 0.012089462018940157, "grad_norm": 16.125, "learning_rate": 1.992074686009806e-05, "loss": 1.1775, "mean_token_accuracy": 0.7408373892307282, "num_tokens": 63513.0, "step": 60 }, { "epoch": 0.014104372355430184, "grad_norm": 14.0625, "learning_rate": 1.990731412452146e-05, "loss": 1.2446, "mean_token_accuracy": 0.7307547807693482, "num_tokens": 74794.0, "step": 70 }, { "epoch": 0.01611928269192021, "grad_norm": 11.875, "learning_rate": 1.989388138894486e-05, "loss": 1.2428, "mean_token_accuracy": 0.7255984365940094, "num_tokens": 86903.0, "step": 80 }, { "epoch": 0.018134193028410236, "grad_norm": 14.4375, "learning_rate": 1.988044865336826e-05, "loss": 1.2766, "mean_token_accuracy": 0.7225647568702698, "num_tokens": 97159.0, "step": 90 }, { "epoch": 0.02014910336490026, "grad_norm": 12.5625, "learning_rate": 1.986701591779166e-05, "loss": 1.1458, "mean_token_accuracy": 0.7415299773216247, "num_tokens": 107437.0, "step": 100 }, { "epoch": 0.02216401370139029, "grad_norm": 16.75, "learning_rate": 1.985358318221506e-05, "loss": 1.2748, "mean_token_accuracy": 0.7202155470848084, "num_tokens": 117867.0, "step": 110 }, { "epoch": 0.024178924037880314, "grad_norm": 18.125, "learning_rate": 1.984015044663846e-05, "loss": 1.1689, "mean_token_accuracy": 0.7355200052261353, "num_tokens": 128288.0, "step": 120 }, { "epoch": 0.02619383437437034, "grad_norm": 12.6875, "learning_rate": 1.982671771106186e-05, "loss": 1.2324, "mean_token_accuracy": 0.7224856972694397, "num_tokens": 139627.0, "step": 130 }, { "epoch": 0.028208744710860368, "grad_norm": 11.5625, "learning_rate": 1.981328497548526e-05, "loss": 1.1365, "mean_token_accuracy": 0.7402825653553009, "num_tokens": 150498.0, "step": 140 }, { "epoch": 0.030223655047350393, "grad_norm": 14.75, "learning_rate": 1.979985223990866e-05, "loss": 1.1178, "mean_token_accuracy": 0.7426175236701965, "num_tokens": 161754.0, "step": 150 }, { "epoch": 0.03223856538384042, "grad_norm": 11.4375, "learning_rate": 1.978641950433206e-05, "loss": 1.2596, "mean_token_accuracy": 0.7134447395801544, "num_tokens": 173087.0, "step": 160 }, { "epoch": 0.03425347572033045, "grad_norm": 12.75, "learning_rate": 1.9772986768755458e-05, "loss": 1.0652, "mean_token_accuracy": 0.7474986433982849, "num_tokens": 184747.0, "step": 170 }, { "epoch": 0.03626838605682047, "grad_norm": 11.8125, "learning_rate": 1.9759554033178857e-05, "loss": 1.1436, "mean_token_accuracy": 0.7323237180709838, "num_tokens": 195331.0, "step": 180 }, { "epoch": 0.0382832963933105, "grad_norm": 9.875, "learning_rate": 1.974612129760226e-05, "loss": 1.0312, "mean_token_accuracy": 0.7625056743621826, "num_tokens": 208260.0, "step": 190 }, { "epoch": 0.04029820672980052, "grad_norm": 14.9375, "learning_rate": 1.9732688562025658e-05, "loss": 1.0084, "mean_token_accuracy": 0.7631498157978058, "num_tokens": 218822.0, "step": 200 }, { "epoch": 0.04231311706629055, "grad_norm": 11.625, "learning_rate": 1.9719255826449057e-05, "loss": 0.9813, "mean_token_accuracy": 0.7651655077934265, "num_tokens": 228580.0, "step": 210 }, { "epoch": 0.04432802740278058, "grad_norm": 17.875, "learning_rate": 1.970582309087246e-05, "loss": 1.07, "mean_token_accuracy": 0.7532146275043488, "num_tokens": 239159.0, "step": 220 }, { "epoch": 0.046342937739270604, "grad_norm": 11.25, "learning_rate": 1.9692390355295858e-05, "loss": 1.113, "mean_token_accuracy": 0.7436384916305542, "num_tokens": 251695.0, "step": 230 }, { "epoch": 0.04835784807576063, "grad_norm": 13.375, "learning_rate": 1.9678957619719257e-05, "loss": 0.929, "mean_token_accuracy": 0.7755303025245667, "num_tokens": 261128.0, "step": 240 }, { "epoch": 0.050372758412250654, "grad_norm": 12.8125, "learning_rate": 1.9665524884142656e-05, "loss": 1.0999, "mean_token_accuracy": 0.7514171898365021, "num_tokens": 271560.0, "step": 250 }, { "epoch": 0.05238766874874068, "grad_norm": 13.0, "learning_rate": 1.9652092148566058e-05, "loss": 1.0339, "mean_token_accuracy": 0.7604846298694611, "num_tokens": 282223.0, "step": 260 }, { "epoch": 0.054402579085230704, "grad_norm": 12.9375, "learning_rate": 1.9638659412989457e-05, "loss": 1.0473, "mean_token_accuracy": 0.7622893512248993, "num_tokens": 292726.0, "step": 270 }, { "epoch": 0.056417489421720736, "grad_norm": 15.0, "learning_rate": 1.9625226677412856e-05, "loss": 0.9894, "mean_token_accuracy": 0.764206200838089, "num_tokens": 303785.0, "step": 280 }, { "epoch": 0.05843239975821076, "grad_norm": 10.3125, "learning_rate": 1.9611793941836258e-05, "loss": 1.109, "mean_token_accuracy": 0.7469749927520752, "num_tokens": 314725.0, "step": 290 }, { "epoch": 0.060447310094700786, "grad_norm": 12.625, "learning_rate": 1.9598361206259657e-05, "loss": 1.2098, "mean_token_accuracy": 0.718773603439331, "num_tokens": 326635.0, "step": 300 }, { "epoch": 0.06246222043119081, "grad_norm": 11.125, "learning_rate": 1.9584928470683055e-05, "loss": 1.1025, "mean_token_accuracy": 0.7460452795028687, "num_tokens": 337866.0, "step": 310 }, { "epoch": 0.06447713076768084, "grad_norm": 10.6875, "learning_rate": 1.9571495735106458e-05, "loss": 1.0772, "mean_token_accuracy": 0.7526730418205261, "num_tokens": 348512.0, "step": 320 }, { "epoch": 0.06649204110417087, "grad_norm": 13.375, "learning_rate": 1.9558062999529857e-05, "loss": 1.157, "mean_token_accuracy": 0.7320702195167541, "num_tokens": 360281.0, "step": 330 }, { "epoch": 0.0685069514406609, "grad_norm": 12.0, "learning_rate": 1.9544630263953255e-05, "loss": 1.0157, "mean_token_accuracy": 0.760700649023056, "num_tokens": 371068.0, "step": 340 }, { "epoch": 0.07052186177715092, "grad_norm": 17.375, "learning_rate": 1.9531197528376654e-05, "loss": 0.8851, "mean_token_accuracy": 0.7925353944301605, "num_tokens": 380947.0, "step": 350 }, { "epoch": 0.07253677211364094, "grad_norm": 11.3125, "learning_rate": 1.9517764792800056e-05, "loss": 1.0325, "mean_token_accuracy": 0.7617525398731232, "num_tokens": 391552.0, "step": 360 }, { "epoch": 0.07455168245013097, "grad_norm": 10.9375, "learning_rate": 1.9504332057223455e-05, "loss": 0.9852, "mean_token_accuracy": 0.7655075788497925, "num_tokens": 403321.0, "step": 370 }, { "epoch": 0.076566592786621, "grad_norm": 11.1875, "learning_rate": 1.9490899321646854e-05, "loss": 1.0527, "mean_token_accuracy": 0.7569321393966675, "num_tokens": 414435.0, "step": 380 }, { "epoch": 0.07858150312311102, "grad_norm": 14.6875, "learning_rate": 1.9477466586070256e-05, "loss": 0.9602, "mean_token_accuracy": 0.7720924854278565, "num_tokens": 423506.0, "step": 390 }, { "epoch": 0.08059641345960104, "grad_norm": 11.0, "learning_rate": 1.9464033850493655e-05, "loss": 1.0475, "mean_token_accuracy": 0.7505548059940338, "num_tokens": 436556.0, "step": 400 }, { "epoch": 0.08261132379609107, "grad_norm": 11.9375, "learning_rate": 1.9450601114917054e-05, "loss": 1.0775, "mean_token_accuracy": 0.7474610984325409, "num_tokens": 448248.0, "step": 410 }, { "epoch": 0.0846262341325811, "grad_norm": 10.25, "learning_rate": 1.9437168379340453e-05, "loss": 1.0487, "mean_token_accuracy": 0.7566307663917542, "num_tokens": 460102.0, "step": 420 }, { "epoch": 0.08664114446907113, "grad_norm": 12.0625, "learning_rate": 1.9423735643763855e-05, "loss": 0.9919, "mean_token_accuracy": 0.7676237523555756, "num_tokens": 471590.0, "step": 430 }, { "epoch": 0.08865605480556116, "grad_norm": 13.125, "learning_rate": 1.9410302908187254e-05, "loss": 1.0473, "mean_token_accuracy": 0.7525161623954773, "num_tokens": 482096.0, "step": 440 }, { "epoch": 0.09067096514205118, "grad_norm": 13.4375, "learning_rate": 1.9396870172610653e-05, "loss": 1.0347, "mean_token_accuracy": 0.7515169024467468, "num_tokens": 493585.0, "step": 450 }, { "epoch": 0.09268587547854121, "grad_norm": 10.9375, "learning_rate": 1.9383437437034055e-05, "loss": 1.0487, "mean_token_accuracy": 0.7547510921955108, "num_tokens": 505989.0, "step": 460 }, { "epoch": 0.09470078581503123, "grad_norm": 12.25, "learning_rate": 1.9370004701457454e-05, "loss": 1.018, "mean_token_accuracy": 0.7596003413200378, "num_tokens": 516900.0, "step": 470 }, { "epoch": 0.09671569615152126, "grad_norm": 11.1875, "learning_rate": 1.9356571965880853e-05, "loss": 0.9797, "mean_token_accuracy": 0.7699940800666809, "num_tokens": 526427.0, "step": 480 }, { "epoch": 0.09873060648801128, "grad_norm": 10.3125, "learning_rate": 1.9343139230304255e-05, "loss": 1.0817, "mean_token_accuracy": 0.7470319092273712, "num_tokens": 537981.0, "step": 490 }, { "epoch": 0.10074551682450131, "grad_norm": 13.25, "learning_rate": 1.9329706494727654e-05, "loss": 1.0089, "mean_token_accuracy": 0.7595715343952179, "num_tokens": 549174.0, "step": 500 } ], "logging_steps": 10, "max_steps": 14889, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 668729881817088.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }