| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9974102848686645, | |
| "eval_steps": 100, | |
| "global_step": 337, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014798372179060304, | |
| "grad_norm": 2.529757296078759, | |
| "learning_rate": 2.9411764705882355e-06, | |
| "loss": 1.0934, | |
| "mean_token_accuracy": 0.7109986682692578, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.029596744358120607, | |
| "grad_norm": 1.422813372633783, | |
| "learning_rate": 5.882352941176471e-06, | |
| "loss": 1.0769, | |
| "mean_token_accuracy": 0.713703674632456, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04439511653718091, | |
| "grad_norm": 1.2509269497287367, | |
| "learning_rate": 8.823529411764707e-06, | |
| "loss": 0.9836, | |
| "mean_token_accuracy": 0.7299454424541987, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.059193488716241215, | |
| "grad_norm": 0.8914426503111362, | |
| "learning_rate": 1.1764705882352942e-05, | |
| "loss": 0.959, | |
| "mean_token_accuracy": 0.7300663921992366, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07399186089530152, | |
| "grad_norm": 0.8067170719112181, | |
| "learning_rate": 1.4705882352941179e-05, | |
| "loss": 0.8946, | |
| "mean_token_accuracy": 0.7439782021567598, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08879023307436182, | |
| "grad_norm": 0.7305058664290756, | |
| "learning_rate": 1.7647058823529414e-05, | |
| "loss": 0.8868, | |
| "mean_token_accuracy": 0.7440409923806053, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10358860525342212, | |
| "grad_norm": 0.6295417023897449, | |
| "learning_rate": 1.9999462497359468e-05, | |
| "loss": 0.846, | |
| "mean_token_accuracy": 0.7537203381992988, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.11838697743248243, | |
| "grad_norm": 0.6117165691743598, | |
| "learning_rate": 1.9980655971335944e-05, | |
| "loss": 0.8244, | |
| "mean_token_accuracy": 0.7585093434278123, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.13318534961154274, | |
| "grad_norm": 0.5437820723713919, | |
| "learning_rate": 1.993503206718859e-05, | |
| "loss": 0.8234, | |
| "mean_token_accuracy": 0.7580775511378283, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.14798372179060304, | |
| "grad_norm": 0.5994518377351506, | |
| "learning_rate": 1.986271337340182e-05, | |
| "loss": 0.8061, | |
| "mean_token_accuracy": 0.7622137778953486, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16278209396966334, | |
| "grad_norm": 0.5610382931843831, | |
| "learning_rate": 1.976389420563607e-05, | |
| "loss": 0.8147, | |
| "mean_token_accuracy": 0.7596355209938828, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.17758046614872364, | |
| "grad_norm": 0.541749021280417, | |
| "learning_rate": 1.9638840084614182e-05, | |
| "loss": 0.8023, | |
| "mean_token_accuracy": 0.7626221131913979, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.19237883832778394, | |
| "grad_norm": 0.5927391227082073, | |
| "learning_rate": 1.9487887022684336e-05, | |
| "loss": 0.792, | |
| "mean_token_accuracy": 0.7640808288437386, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.20717721050684423, | |
| "grad_norm": 0.49483109919597235, | |
| "learning_rate": 1.9311440620976597e-05, | |
| "loss": 0.7952, | |
| "mean_token_accuracy": 0.763416504405295, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.22197558268590456, | |
| "grad_norm": 0.5209603398907325, | |
| "learning_rate": 1.9109974979578852e-05, | |
| "loss": 0.7767, | |
| "mean_token_accuracy": 0.7679532679973347, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.23677395486496486, | |
| "grad_norm": 0.559301673802474, | |
| "learning_rate": 1.8884031423660492e-05, | |
| "loss": 0.7859, | |
| "mean_token_accuracy": 0.7658941659362738, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.25157232704402516, | |
| "grad_norm": 0.48738299737003077, | |
| "learning_rate": 1.8634217048966638e-05, | |
| "loss": 0.7901, | |
| "mean_token_accuracy": 0.7640363065528991, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2663706992230855, | |
| "grad_norm": 0.5037354193246141, | |
| "learning_rate": 1.836120309059107e-05, | |
| "loss": 0.7653, | |
| "mean_token_accuracy": 0.7711097394533366, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.28116907140214575, | |
| "grad_norm": 0.5099324231686295, | |
| "learning_rate": 1.8065723119410885e-05, | |
| "loss": 0.773, | |
| "mean_token_accuracy": 0.7688236610178055, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.2959674435812061, | |
| "grad_norm": 0.5162546937987671, | |
| "learning_rate": 1.77485710710289e-05, | |
| "loss": 0.777, | |
| "mean_token_accuracy": 0.7669513366327537, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2959674435812061, | |
| "eval_loss": 0.7993948459625244, | |
| "eval_mean_token_accuracy": 0.7520157117000811, | |
| "eval_runtime": 22.4251, | |
| "eval_samples_per_second": 5.752, | |
| "eval_steps_per_second": 0.401, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.31076581576026635, | |
| "grad_norm": 0.5153662692401914, | |
| "learning_rate": 1.741059911251997e-05, | |
| "loss": 0.7743, | |
| "mean_token_accuracy": 0.7680899400071219, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.3255641879393267, | |
| "grad_norm": 0.531301740012161, | |
| "learning_rate": 1.7052715352713076e-05, | |
| "loss": 0.7702, | |
| "mean_token_accuracy": 0.768589950092964, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.340362560118387, | |
| "grad_norm": 0.5079587378149193, | |
| "learning_rate": 1.667588140216154e-05, | |
| "loss": 0.781, | |
| "mean_token_accuracy": 0.7663125944717821, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3551609322974473, | |
| "grad_norm": 0.5075617093383449, | |
| "learning_rate": 1.628110978935756e-05, | |
| "loss": 0.786, | |
| "mean_token_accuracy": 0.7641918327864261, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3699593044765076, | |
| "grad_norm": 0.4920553775684615, | |
| "learning_rate": 1.586946124013354e-05, | |
| "loss": 0.7722, | |
| "mean_token_accuracy": 0.7675306826676355, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.38475767665556787, | |
| "grad_norm": 0.46844086408604685, | |
| "learning_rate": 1.5442041827560274e-05, | |
| "loss": 0.7702, | |
| "mean_token_accuracy": 0.7681226440401921, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3995560488346282, | |
| "grad_norm": 0.5006979961590039, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.7784, | |
| "mean_token_accuracy": 0.7660222443686391, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.41435442101368847, | |
| "grad_norm": 0.6042379639918534, | |
| "learning_rate": 1.4544523495299843e-05, | |
| "loss": 0.7479, | |
| "mean_token_accuracy": 0.7747889658516649, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4291527931927488, | |
| "grad_norm": 0.4755237937840668, | |
| "learning_rate": 1.4076836149416889e-05, | |
| "loss": 0.7624, | |
| "mean_token_accuracy": 0.7699293798855323, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.4439511653718091, | |
| "grad_norm": 0.46503094702295095, | |
| "learning_rate": 1.3598194608050011e-05, | |
| "loss": 0.7616, | |
| "mean_token_accuracy": 0.7709603055918677, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4587495375508694, | |
| "grad_norm": 0.5346016990534304, | |
| "learning_rate": 1.3109884950114007e-05, | |
| "loss": 0.7484, | |
| "mean_token_accuracy": 0.7737660069728891, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4735479097299297, | |
| "grad_norm": 0.5047241034043195, | |
| "learning_rate": 1.2613219232128608e-05, | |
| "loss": 0.7785, | |
| "mean_token_accuracy": 0.7653063232140367, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.48834628190899, | |
| "grad_norm": 0.5151988912608395, | |
| "learning_rate": 1.2109531962807333e-05, | |
| "loss": 0.7586, | |
| "mean_token_accuracy": 0.7715861263931194, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5031446540880503, | |
| "grad_norm": 0.5207635149659061, | |
| "learning_rate": 1.1600176517318742e-05, | |
| "loss": 0.7698, | |
| "mean_token_accuracy": 0.767696240618356, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5179430262671106, | |
| "grad_norm": 0.5221377484405474, | |
| "learning_rate": 1.1086521500854746e-05, | |
| "loss": 0.7645, | |
| "mean_token_accuracy": 0.7690029290294697, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.532741398446171, | |
| "grad_norm": 0.47399176489700884, | |
| "learning_rate": 1.0569947071276847e-05, | |
| "loss": 0.7509, | |
| "mean_token_accuracy": 0.7737958764535031, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5475397706252312, | |
| "grad_norm": 0.4801259627614154, | |
| "learning_rate": 1.0051841230721065e-05, | |
| "loss": 0.7571, | |
| "mean_token_accuracy": 0.7718575689607972, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5623381428042915, | |
| "grad_norm": 0.5442241692987562, | |
| "learning_rate": 9.533596096125826e-06, | |
| "loss": 0.7475, | |
| "mean_token_accuracy": 0.7735494724831667, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5771365149833518, | |
| "grad_norm": 0.5129988043441257, | |
| "learning_rate": 9.016604158703654e-06, | |
| "loss": 0.7367, | |
| "mean_token_accuracy": 0.7760403846737656, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5919348871624122, | |
| "grad_norm": 0.49802077786714777, | |
| "learning_rate": 8.502254542407186e-06, | |
| "loss": 0.751, | |
| "mean_token_accuracy": 0.7725336897742148, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5919348871624122, | |
| "eval_loss": 0.7738236784934998, | |
| "eval_mean_token_accuracy": 0.7564002649424092, | |
| "eval_runtime": 22.6977, | |
| "eval_samples_per_second": 5.683, | |
| "eval_steps_per_second": 0.397, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6067332593414725, | |
| "grad_norm": 0.4725178576549685, | |
| "learning_rate": 7.991929271442817e-06, | |
| "loss": 0.7408, | |
| "mean_token_accuracy": 0.7754658210611487, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6215316315205327, | |
| "grad_norm": 0.4467549184477307, | |
| "learning_rate": 7.48699955686089e-06, | |
| "loss": 0.7481, | |
| "mean_token_accuracy": 0.7734976134620747, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.636330003699593, | |
| "grad_norm": 0.5092175414486388, | |
| "learning_rate": 6.988822112200157e-06, | |
| "loss": 0.775, | |
| "mean_token_accuracy": 0.7658959697158333, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6511283758786534, | |
| "grad_norm": 0.42984120956628935, | |
| "learning_rate": 6.498735508086094e-06, | |
| "loss": 0.7498, | |
| "mean_token_accuracy": 0.7733468993741717, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6659267480577137, | |
| "grad_norm": 0.4796464573610777, | |
| "learning_rate": 6.018056575578075e-06, | |
| "loss": 0.7592, | |
| "mean_token_accuracy": 0.7700744726453849, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.680725120236774, | |
| "grad_norm": 0.44466012402594945, | |
| "learning_rate": 5.548076867929331e-06, | |
| "loss": 0.7379, | |
| "mean_token_accuracy": 0.7766688186626635, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6955234924158342, | |
| "grad_norm": 0.420118342029257, | |
| "learning_rate": 5.090059190266779e-06, | |
| "loss": 0.7407, | |
| "mean_token_accuracy": 0.7765309135683108, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7103218645948945, | |
| "grad_norm": 0.469412147263681, | |
| "learning_rate": 4.645234206515171e-06, | |
| "loss": 0.7318, | |
| "mean_token_accuracy": 0.7780594268298044, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7251202367739549, | |
| "grad_norm": 0.4551442738979974, | |
| "learning_rate": 4.214797132682597e-06, | |
| "loss": 0.7441, | |
| "mean_token_accuracy": 0.7748968283755631, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7399186089530152, | |
| "grad_norm": 0.4831064222375685, | |
| "learning_rate": 3.799904525392251e-06, | |
| "loss": 0.7445, | |
| "mean_token_accuracy": 0.7748052156178381, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7547169811320755, | |
| "grad_norm": 0.47051009367436814, | |
| "learning_rate": 3.401671174289469e-06, | |
| "loss": 0.7489, | |
| "mean_token_accuracy": 0.7735863356272306, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7695153533111357, | |
| "grad_norm": 0.45299706282059216, | |
| "learning_rate": 3.021167106673928e-06, | |
| "loss": 0.7445, | |
| "mean_token_accuracy": 0.7743176597062353, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7843137254901961, | |
| "grad_norm": 0.45515424714196745, | |
| "learning_rate": 2.6594147124053983e-06, | |
| "loss": 0.7611, | |
| "mean_token_accuracy": 0.7692758287124456, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.7991120976692564, | |
| "grad_norm": 0.4410676246388664, | |
| "learning_rate": 2.317385996808195e-06, | |
| "loss": 0.7446, | |
| "mean_token_accuracy": 0.7746800378847397, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8139104698483167, | |
| "grad_norm": 0.43167561987963765, | |
| "learning_rate": 1.9959999689556407e-06, | |
| "loss": 0.7435, | |
| "mean_token_accuracy": 0.7749901909589475, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8287088420273769, | |
| "grad_norm": 0.4155407434016717, | |
| "learning_rate": 1.6961201723520248e-06, | |
| "loss": 0.745, | |
| "mean_token_accuracy": 0.7748366486384513, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8435072142064373, | |
| "grad_norm": 0.4064131784578557, | |
| "learning_rate": 1.4185523646469822e-06, | |
| "loss": 0.7454, | |
| "mean_token_accuracy": 0.7744148351411485, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.8583055863854976, | |
| "grad_norm": 0.45134117555841247, | |
| "learning_rate": 1.1640423526166987e-06, | |
| "loss": 0.7421, | |
| "mean_token_accuracy": 0.7751366333358114, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8731039585645579, | |
| "grad_norm": 0.44061180190834115, | |
| "learning_rate": 9.332739882292752e-07, | |
| "loss": 0.739, | |
| "mean_token_accuracy": 0.7758671125550064, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.8879023307436182, | |
| "grad_norm": 0.4169382521249663, | |
| "learning_rate": 7.268673311786378e-07, | |
| "loss": 0.7311, | |
| "mean_token_accuracy": 0.7785074040925413, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8879023307436182, | |
| "eval_loss": 0.7653124928474426, | |
| "eval_mean_token_accuracy": 0.7593830888021523, | |
| "eval_runtime": 22.2878, | |
| "eval_samples_per_second": 5.788, | |
| "eval_steps_per_second": 0.404, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9027007029226785, | |
| "grad_norm": 0.410258398874674, | |
| "learning_rate": 5.453769828241872e-07, | |
| "loss": 0.7392, | |
| "mean_token_accuracy": 0.7761676627734904, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9174990751017388, | |
| "grad_norm": 0.40249101480415633, | |
| "learning_rate": 3.8929059601275463e-07, | |
| "loss": 0.7121, | |
| "mean_token_accuracy": 0.783584436550681, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9322974472807991, | |
| "grad_norm": 0.414157547251248, | |
| "learning_rate": 2.5902756478688674e-07, | |
| "loss": 0.7401, | |
| "mean_token_accuracy": 0.7763153434034343, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9470958194598594, | |
| "grad_norm": 0.41182543636624114, | |
| "learning_rate": 1.5493789750014032e-07, | |
| "loss": 0.7374, | |
| "mean_token_accuracy": 0.776741892879153, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9618941916389198, | |
| "grad_norm": 0.4219118289563983, | |
| "learning_rate": 7.730127636723539e-08, | |
| "loss": 0.7238, | |
| "mean_token_accuracy": 0.7803358503518802, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.97669256381798, | |
| "grad_norm": 0.3965539812845053, | |
| "learning_rate": 2.6326305976001054e-08, | |
| "loss": 0.7384, | |
| "mean_token_accuracy": 0.7762797398356506, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9914909359970403, | |
| "grad_norm": 0.42732278784108485, | |
| "learning_rate": 2.149952780321485e-09, | |
| "loss": 0.7369, | |
| "mean_token_accuracy": 0.7763332392093869, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.9974102848686645, | |
| "mean_token_accuracy": 0.7690880825852714, | |
| "step": 337, | |
| "total_flos": 76774385909760.0, | |
| "train_loss": 0.7816961303309092, | |
| "train_runtime": 6971.6118, | |
| "train_samples_per_second": 3.102, | |
| "train_steps_per_second": 0.048 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 337, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 76774385909760.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |