| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9988901220865705, |
| "eval_steps": 100, |
| "global_step": 225, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.022197558268590455, |
| "grad_norm": 2.2235984897572694, |
| "learning_rate": 4.347826086956522e-06, |
| "loss": 1.0965, |
| "mean_token_accuracy": 0.7105070888996124, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04439511653718091, |
| "grad_norm": 1.0371581479889551, |
| "learning_rate": 8.695652173913044e-06, |
| "loss": 1.0434, |
| "mean_token_accuracy": 0.7201980158686638, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06659267480577137, |
| "grad_norm": 1.3071296340196783, |
| "learning_rate": 1.3043478260869566e-05, |
| "loss": 0.9843, |
| "mean_token_accuracy": 0.7260666146874428, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08879023307436182, |
| "grad_norm": 0.7154957235445495, |
| "learning_rate": 1.739130434782609e-05, |
| "loss": 0.918, |
| "mean_token_accuracy": 0.7386834695935249, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11098779134295228, |
| "grad_norm": 0.6557057916957052, |
| "learning_rate": 1.999516282291988e-05, |
| "loss": 0.8655, |
| "mean_token_accuracy": 0.7502432510256767, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.13318534961154274, |
| "grad_norm": 0.5942751563907254, |
| "learning_rate": 1.9940798309400527e-05, |
| "loss": 0.8476, |
| "mean_token_accuracy": 0.7531737372279167, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.15538290788013318, |
| "grad_norm": 0.4957314057726332, |
| "learning_rate": 1.982635248222264e-05, |
| "loss": 0.8302, |
| "mean_token_accuracy": 0.7568761467933655, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.17758046614872364, |
| "grad_norm": 0.5120800804849761, |
| "learning_rate": 1.9652517041934357e-05, |
| "loss": 0.8213, |
| "mean_token_accuracy": 0.7587681159377098, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1997780244173141, |
| "grad_norm": 0.3881900403766342, |
| "learning_rate": 1.9420342634699893e-05, |
| "loss": 0.8043, |
| "mean_token_accuracy": 0.7615880772471428, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.22197558268590456, |
| "grad_norm": 0.4192634272720123, |
| "learning_rate": 1.913123250228619e-05, |
| "loss": 0.7988, |
| "mean_token_accuracy": 0.7634198889136314, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.244173140954495, |
| "grad_norm": 0.3732616490096885, |
| "learning_rate": 1.878693400099269e-05, |
| "loss": 0.7936, |
| "mean_token_accuracy": 0.7645810097455978, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.2663706992230855, |
| "grad_norm": 0.4016469613549646, |
| "learning_rate": 1.8389528040783014e-05, |
| "loss": 0.7885, |
| "mean_token_accuracy": 0.7654082521796226, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2885682574916759, |
| "grad_norm": 0.3914922958543853, |
| "learning_rate": 1.7941416508447537e-05, |
| "loss": 0.7832, |
| "mean_token_accuracy": 0.7660864099860192, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.31076581576026635, |
| "grad_norm": 0.41242564338129056, |
| "learning_rate": 1.7445307750810153e-05, |
| "loss": 0.7843, |
| "mean_token_accuracy": 0.7660787045955658, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.33296337402885684, |
| "grad_norm": 0.40771645930304695, |
| "learning_rate": 1.690420020571747e-05, |
| "loss": 0.7852, |
| "mean_token_accuracy": 0.765475058555603, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.3551609322974473, |
| "grad_norm": 0.4015081855730916, |
| "learning_rate": 1.6321364279743267e-05, |
| "loss": 0.7897, |
| "mean_token_accuracy": 0.7640682518482208, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 0.44299751358195016, |
| "learning_rate": 1.570032258213783e-05, |
| "loss": 0.7825, |
| "mean_token_accuracy": 0.7656505450606346, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.3995560488346282, |
| "grad_norm": 0.4192853533608115, |
| "learning_rate": 1.50448286344864e-05, |
| "loss": 0.7807, |
| "mean_token_accuracy": 0.765812449157238, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.42175360710321863, |
| "grad_norm": 0.3631740147185924, |
| "learning_rate": 1.4358844184753713e-05, |
| "loss": 0.7587, |
| "mean_token_accuracy": 0.7719828367233277, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.4439511653718091, |
| "grad_norm": 0.402770649577875, |
| "learning_rate": 1.3646515262826551e-05, |
| "loss": 0.7717, |
| "mean_token_accuracy": 0.7688061684370041, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4439511653718091, |
| "eval_loss": 0.7915011048316956, |
| "eval_mean_token_accuracy": 0.7592891256014506, |
| "eval_runtime": 3.409, |
| "eval_samples_per_second": 37.841, |
| "eval_steps_per_second": 1.76, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.46614872364039955, |
| "grad_norm": 0.3807177377985598, |
| "learning_rate": 1.2912147122272523e-05, |
| "loss": 0.7718, |
| "mean_token_accuracy": 0.7677725195884705, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.48834628190899, |
| "grad_norm": 0.39305302441426365, |
| "learning_rate": 1.2160178219764838e-05, |
| "loss": 0.7673, |
| "mean_token_accuracy": 0.7697135150432587, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5105438401775805, |
| "grad_norm": 0.3538843727039592, |
| "learning_rate": 1.1395153389439232e-05, |
| "loss": 0.7774, |
| "mean_token_accuracy": 0.7661349773406982, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.532741398446171, |
| "grad_norm": 0.39909651538501784, |
| "learning_rate": 1.0621696374314807e-05, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7710714146494866, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5549389567147613, |
| "grad_norm": 0.3924767091876128, |
| "learning_rate": 9.844481880796492e-06, |
| "loss": 0.7647, |
| "mean_token_accuracy": 0.7698081240057946, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5771365149833518, |
| "grad_norm": 0.37938597040417427, |
| "learning_rate": 9.068207325159285e-06, |
| "loss": 0.7446, |
| "mean_token_accuracy": 0.7747560039162635, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5993340732519423, |
| "grad_norm": 0.3854698311289992, |
| "learning_rate": 8.297564442776014e-06, |
| "loss": 0.7555, |
| "mean_token_accuracy": 0.7720320910215378, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.6215316315205327, |
| "grad_norm": 0.3547942293439342, |
| "learning_rate": 7.537210931679988e-06, |
| "loss": 0.7531, |
| "mean_token_accuracy": 0.7726601183414459, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6437291897891232, |
| "grad_norm": 0.34912810948700074, |
| "learning_rate": 6.791742301846325e-06, |
| "loss": 0.774, |
| "mean_token_accuracy": 0.7670607671141625, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6659267480577137, |
| "grad_norm": 0.3681331061431648, |
| "learning_rate": 6.065664100332478e-06, |
| "loss": 0.7651, |
| "mean_token_accuracy": 0.7690967857837677, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6881243063263041, |
| "grad_norm": 0.368954099366331, |
| "learning_rate": 5.3633646801467255e-06, |
| "loss": 0.7524, |
| "mean_token_accuracy": 0.7736147075891495, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.7103218645948945, |
| "grad_norm": 0.35992878085709806, |
| "learning_rate": 4.689088677427249e-06, |
| "loss": 0.7373, |
| "mean_token_accuracy": 0.7774115353822708, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.732519422863485, |
| "grad_norm": 0.3614111664312195, |
| "learning_rate": 4.046911357233343e-06, |
| "loss": 0.7524, |
| "mean_token_accuracy": 0.773401352763176, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.36349455300535294, |
| "learning_rate": 3.440713983000601e-06, |
| "loss": 0.7558, |
| "mean_token_accuracy": 0.772033941745758, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.7769145394006659, |
| "grad_norm": 0.3397807098524205, |
| "learning_rate": 2.8741603585249312e-06, |
| "loss": 0.7597, |
| "mean_token_accuracy": 0.7703836098313331, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.7991120976692564, |
| "grad_norm": 0.34012440219961076, |
| "learning_rate": 2.3506746842535244e-06, |
| "loss": 0.7574, |
| "mean_token_accuracy": 0.7716947227716446, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8213096559378469, |
| "grad_norm": 0.33440840345953365, |
| "learning_rate": 1.8734208617174986e-06, |
| "loss": 0.7558, |
| "mean_token_accuracy": 0.772092518210411, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.8435072142064373, |
| "grad_norm": 0.34135151245134615, |
| "learning_rate": 1.4452833711883629e-06, |
| "loss": 0.7507, |
| "mean_token_accuracy": 0.7737182468175888, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8657047724750278, |
| "grad_norm": 0.3426195190218489, |
| "learning_rate": 1.0688498381320855e-06, |
| "loss": 0.7506, |
| "mean_token_accuracy": 0.7732807129621506, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.8879023307436182, |
| "grad_norm": 0.33276209433043014, |
| "learning_rate": 7.463953938275859e-07, |
| "loss": 0.7412, |
| "mean_token_accuracy": 0.7763293862342835, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8879023307436182, |
| "eval_loss": 0.7738624811172485, |
| "eval_mean_token_accuracy": 0.7629910707473755, |
| "eval_runtime": 3.3184, |
| "eval_samples_per_second": 38.875, |
| "eval_steps_per_second": 1.808, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9100998890122086, |
| "grad_norm": 0.32120176766623043, |
| "learning_rate": 4.798689246727006e-07, |
| "loss": 0.7435, |
| "mean_token_accuracy": 0.7754677474498749, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.9322974472807991, |
| "grad_norm": 0.33184265142796177, |
| "learning_rate": 2.708812932856253e-07, |
| "loss": 0.7341, |
| "mean_token_accuracy": 0.7786281272768975, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9544950055493896, |
| "grad_norm": 0.3272741862810846, |
| "learning_rate": 1.206956025924333e-07, |
| "loss": 0.7431, |
| "mean_token_accuracy": 0.7758487805724144, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.97669256381798, |
| "grad_norm": 0.32591262892422984, |
| "learning_rate": 3.0219561743707326e-08, |
| "loss": 0.7404, |
| "mean_token_accuracy": 0.7761501207947731, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9988901220865705, |
| "grad_norm": 0.3313176690461435, |
| "learning_rate": 0.0, |
| "loss": 0.754, |
| "mean_token_accuracy": 0.7721990346908569, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.9988901220865705, |
| "step": 225, |
| "total_flos": 76874092904448.0, |
| "train_loss": 0.7931315400865343, |
| "train_runtime": 3405.4017, |
| "train_samples_per_second": 6.35, |
| "train_steps_per_second": 0.066 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 225, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 76874092904448.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|