{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988901220865705, "eval_steps": 100, "global_step": 225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022197558268590455, "grad_norm": 2.2235984897572694, "learning_rate": 4.347826086956522e-06, "loss": 1.0965, "mean_token_accuracy": 0.7105070888996124, "step": 5 }, { "epoch": 0.04439511653718091, "grad_norm": 1.0371581479889551, "learning_rate": 8.695652173913044e-06, "loss": 1.0434, "mean_token_accuracy": 0.7201980158686638, "step": 10 }, { "epoch": 0.06659267480577137, "grad_norm": 1.3071296340196783, "learning_rate": 1.3043478260869566e-05, "loss": 0.9843, "mean_token_accuracy": 0.7260666146874428, "step": 15 }, { "epoch": 0.08879023307436182, "grad_norm": 0.7154957235445495, "learning_rate": 1.739130434782609e-05, "loss": 0.918, "mean_token_accuracy": 0.7386834695935249, "step": 20 }, { "epoch": 0.11098779134295228, "grad_norm": 0.6557057916957052, "learning_rate": 1.999516282291988e-05, "loss": 0.8655, "mean_token_accuracy": 0.7502432510256767, "step": 25 }, { "epoch": 0.13318534961154274, "grad_norm": 0.5942751563907254, "learning_rate": 1.9940798309400527e-05, "loss": 0.8476, "mean_token_accuracy": 0.7531737372279167, "step": 30 }, { "epoch": 0.15538290788013318, "grad_norm": 0.4957314057726332, "learning_rate": 1.982635248222264e-05, "loss": 0.8302, "mean_token_accuracy": 0.7568761467933655, "step": 35 }, { "epoch": 0.17758046614872364, "grad_norm": 0.5120800804849761, "learning_rate": 1.9652517041934357e-05, "loss": 0.8213, "mean_token_accuracy": 0.7587681159377098, "step": 40 }, { "epoch": 0.1997780244173141, "grad_norm": 0.3881900403766342, "learning_rate": 1.9420342634699893e-05, "loss": 0.8043, "mean_token_accuracy": 0.7615880772471428, "step": 45 }, { "epoch": 0.22197558268590456, "grad_norm": 0.4192634272720123, "learning_rate": 1.913123250228619e-05, "loss": 0.7988, "mean_token_accuracy": 0.7634198889136314, "step": 50 }, { "epoch": 0.244173140954495, "grad_norm": 0.3732616490096885, "learning_rate": 1.878693400099269e-05, "loss": 0.7936, "mean_token_accuracy": 0.7645810097455978, "step": 55 }, { "epoch": 0.2663706992230855, "grad_norm": 0.4016469613549646, "learning_rate": 1.8389528040783014e-05, "loss": 0.7885, "mean_token_accuracy": 0.7654082521796226, "step": 60 }, { "epoch": 0.2885682574916759, "grad_norm": 0.3914922958543853, "learning_rate": 1.7941416508447537e-05, "loss": 0.7832, "mean_token_accuracy": 0.7660864099860192, "step": 65 }, { "epoch": 0.31076581576026635, "grad_norm": 0.41242564338129056, "learning_rate": 1.7445307750810153e-05, "loss": 0.7843, "mean_token_accuracy": 0.7660787045955658, "step": 70 }, { "epoch": 0.33296337402885684, "grad_norm": 0.40771645930304695, "learning_rate": 1.690420020571747e-05, "loss": 0.7852, "mean_token_accuracy": 0.765475058555603, "step": 75 }, { "epoch": 0.3551609322974473, "grad_norm": 0.4015081855730916, "learning_rate": 1.6321364279743267e-05, "loss": 0.7897, "mean_token_accuracy": 0.7640682518482208, "step": 80 }, { "epoch": 0.37735849056603776, "grad_norm": 0.44299751358195016, "learning_rate": 1.570032258213783e-05, "loss": 0.7825, "mean_token_accuracy": 0.7656505450606346, "step": 85 }, { "epoch": 0.3995560488346282, "grad_norm": 0.4192853533608115, "learning_rate": 1.50448286344864e-05, "loss": 0.7807, "mean_token_accuracy": 0.765812449157238, "step": 90 }, { "epoch": 0.42175360710321863, "grad_norm": 0.3631740147185924, "learning_rate": 1.4358844184753713e-05, "loss": 0.7587, "mean_token_accuracy": 0.7719828367233277, "step": 95 }, { "epoch": 0.4439511653718091, "grad_norm": 0.402770649577875, "learning_rate": 1.3646515262826551e-05, "loss": 0.7717, "mean_token_accuracy": 0.7688061684370041, "step": 100 }, { "epoch": 0.4439511653718091, "eval_loss": 0.7915011048316956, "eval_mean_token_accuracy": 0.7592891256014506, "eval_runtime": 3.409, "eval_samples_per_second": 37.841, "eval_steps_per_second": 1.76, "step": 100 }, { "epoch": 0.46614872364039955, "grad_norm": 0.3807177377985598, "learning_rate": 1.2912147122272523e-05, "loss": 0.7718, "mean_token_accuracy": 0.7677725195884705, "step": 105 }, { "epoch": 0.48834628190899, "grad_norm": 0.39305302441426365, "learning_rate": 1.2160178219764838e-05, "loss": 0.7673, "mean_token_accuracy": 0.7697135150432587, "step": 110 }, { "epoch": 0.5105438401775805, "grad_norm": 0.3538843727039592, "learning_rate": 1.1395153389439232e-05, "loss": 0.7774, "mean_token_accuracy": 0.7661349773406982, "step": 115 }, { "epoch": 0.532741398446171, "grad_norm": 0.39909651538501784, "learning_rate": 1.0621696374314807e-05, "loss": 0.7612, "mean_token_accuracy": 0.7710714146494866, "step": 120 }, { "epoch": 0.5549389567147613, "grad_norm": 0.3924767091876128, "learning_rate": 9.844481880796492e-06, "loss": 0.7647, "mean_token_accuracy": 0.7698081240057946, "step": 125 }, { "epoch": 0.5771365149833518, "grad_norm": 0.37938597040417427, "learning_rate": 9.068207325159285e-06, "loss": 0.7446, "mean_token_accuracy": 0.7747560039162635, "step": 130 }, { "epoch": 0.5993340732519423, "grad_norm": 0.3854698311289992, "learning_rate": 8.297564442776014e-06, "loss": 0.7555, "mean_token_accuracy": 0.7720320910215378, "step": 135 }, { "epoch": 0.6215316315205327, "grad_norm": 0.3547942293439342, "learning_rate": 7.537210931679988e-06, "loss": 0.7531, "mean_token_accuracy": 0.7726601183414459, "step": 140 }, { "epoch": 0.6437291897891232, "grad_norm": 0.34912810948700074, "learning_rate": 6.791742301846325e-06, "loss": 0.774, "mean_token_accuracy": 0.7670607671141625, "step": 145 }, { "epoch": 0.6659267480577137, "grad_norm": 0.3681331061431648, "learning_rate": 6.065664100332478e-06, "loss": 0.7651, "mean_token_accuracy": 0.7690967857837677, "step": 150 }, { "epoch": 0.6881243063263041, "grad_norm": 0.368954099366331, "learning_rate": 5.3633646801467255e-06, "loss": 0.7524, "mean_token_accuracy": 0.7736147075891495, "step": 155 }, { "epoch": 0.7103218645948945, "grad_norm": 0.35992878085709806, "learning_rate": 4.689088677427249e-06, "loss": 0.7373, "mean_token_accuracy": 0.7774115353822708, "step": 160 }, { "epoch": 0.732519422863485, "grad_norm": 0.3614111664312195, "learning_rate": 4.046911357233343e-06, "loss": 0.7524, "mean_token_accuracy": 0.773401352763176, "step": 165 }, { "epoch": 0.7547169811320755, "grad_norm": 0.36349455300535294, "learning_rate": 3.440713983000601e-06, "loss": 0.7558, "mean_token_accuracy": 0.772033941745758, "step": 170 }, { "epoch": 0.7769145394006659, "grad_norm": 0.3397807098524205, "learning_rate": 2.8741603585249312e-06, "loss": 0.7597, "mean_token_accuracy": 0.7703836098313331, "step": 175 }, { "epoch": 0.7991120976692564, "grad_norm": 0.34012440219961076, "learning_rate": 2.3506746842535244e-06, "loss": 0.7574, "mean_token_accuracy": 0.7716947227716446, "step": 180 }, { "epoch": 0.8213096559378469, "grad_norm": 0.33440840345953365, "learning_rate": 1.8734208617174986e-06, "loss": 0.7558, "mean_token_accuracy": 0.772092518210411, "step": 185 }, { "epoch": 0.8435072142064373, "grad_norm": 0.34135151245134615, "learning_rate": 1.4452833711883629e-06, "loss": 0.7507, "mean_token_accuracy": 0.7737182468175888, "step": 190 }, { "epoch": 0.8657047724750278, "grad_norm": 0.3426195190218489, "learning_rate": 1.0688498381320855e-06, "loss": 0.7506, "mean_token_accuracy": 0.7732807129621506, "step": 195 }, { "epoch": 0.8879023307436182, "grad_norm": 0.33276209433043014, "learning_rate": 7.463953938275859e-07, "loss": 0.7412, "mean_token_accuracy": 0.7763293862342835, "step": 200 }, { "epoch": 0.8879023307436182, "eval_loss": 0.7738624811172485, "eval_mean_token_accuracy": 0.7629910707473755, "eval_runtime": 3.3184, "eval_samples_per_second": 38.875, "eval_steps_per_second": 1.808, "step": 200 }, { "epoch": 0.9100998890122086, "grad_norm": 0.32120176766623043, "learning_rate": 4.798689246727006e-07, "loss": 0.7435, "mean_token_accuracy": 0.7754677474498749, "step": 205 }, { "epoch": 0.9322974472807991, "grad_norm": 0.33184265142796177, "learning_rate": 2.708812932856253e-07, "loss": 0.7341, "mean_token_accuracy": 0.7786281272768975, "step": 210 }, { "epoch": 0.9544950055493896, "grad_norm": 0.3272741862810846, "learning_rate": 1.206956025924333e-07, "loss": 0.7431, "mean_token_accuracy": 0.7758487805724144, "step": 215 }, { "epoch": 0.97669256381798, "grad_norm": 0.32591262892422984, "learning_rate": 3.0219561743707326e-08, "loss": 0.7404, "mean_token_accuracy": 0.7761501207947731, "step": 220 }, { "epoch": 0.9988901220865705, "grad_norm": 0.3313176690461435, "learning_rate": 0.0, "loss": 0.754, "mean_token_accuracy": 0.7721990346908569, "step": 225 }, { "epoch": 0.9988901220865705, "step": 225, "total_flos": 76874092904448.0, "train_loss": 0.7931315400865343, "train_runtime": 3405.4017, "train_samples_per_second": 6.35, "train_steps_per_second": 0.066 } ], "logging_steps": 5, "max_steps": 225, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 76874092904448.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }