{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 169,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029585798816568046,
      "grad_norm": 2.3596792221069336,
      "learning_rate": 4.705882352941177e-06,
      "loss": 1.1077,
      "mean_token_accuracy": 0.7105089992284774,
      "num_tokens": 2621440.0,
      "step": 5
    },
    {
      "epoch": 0.05917159763313609,
      "grad_norm": 1.7160693407058716,
      "learning_rate": 1.0588235294117648e-05,
      "loss": 1.0353,
      "mean_token_accuracy": 0.7203521847724914,
      "num_tokens": 5242880.0,
      "step": 10
    },
    {
      "epoch": 0.08875739644970414,
      "grad_norm": 0.8708027601242065,
      "learning_rate": 1.647058823529412e-05,
      "loss": 0.9553,
      "mean_token_accuracy": 0.7316273808479309,
      "num_tokens": 7861273.0,
      "step": 15
    },
    {
      "epoch": 0.11834319526627218,
      "grad_norm": 0.6267297267913818,
      "learning_rate": 1.999145758387301e-05,
      "loss": 0.8899,
      "mean_token_accuracy": 0.743945425748825,
      "num_tokens": 10474605.0,
      "step": 20
    },
    {
      "epoch": 0.14792899408284024,
      "grad_norm": 0.4869539439678192,
      "learning_rate": 1.9895522933272028e-05,
      "loss": 0.8455,
      "mean_token_accuracy": 0.7542784661054611,
      "num_tokens": 13096045.0,
      "step": 25
    },
    {
      "epoch": 0.17751479289940827,
      "grad_norm": 0.4492546319961548,
      "learning_rate": 1.9694002659393306e-05,
      "loss": 0.8389,
      "mean_token_accuracy": 0.7545322090387344,
      "num_tokens": 15717485.0,
      "step": 30
    },
    {
      "epoch": 0.20710059171597633,
      "grad_norm": 0.4301901161670685,
      "learning_rate": 1.9389046991574298e-05,
      "loss": 0.8268,
      "mean_token_accuracy": 0.7568192929029465,
      "num_tokens": 18338925.0,
      "step": 35
    },
    {
      "epoch": 0.23668639053254437,
      "grad_norm": 0.37857791781425476,
      "learning_rate": 1.898390981891979e-05,
      "loss": 0.7956,
      "mean_token_accuracy": 0.7649114817380905,
      "num_tokens": 20960365.0,
      "step": 40
    },
    {
      "epoch": 0.26627218934911245,
      "grad_norm": 0.3774871230125427,
      "learning_rate": 1.8482913971175737e-05,
      "loss": 0.8079,
      "mean_token_accuracy": 0.7604979366064072,
      "num_tokens": 23581805.0,
      "step": 45
    },
    {
      "epoch": 0.2958579881656805,
      "grad_norm": 0.3935665488243103,
      "learning_rate": 1.789140509396394e-05,
      "loss": 0.7961,
      "mean_token_accuracy": 0.7637095510959625,
      "num_tokens": 26203245.0,
      "step": 50
    },
    {
      "epoch": 0.3254437869822485,
      "grad_norm": 0.3560955226421356,
      "learning_rate": 1.7215694610530624e-05,
      "loss": 0.7946,
      "mean_token_accuracy": 0.7636954367160798,
      "num_tokens": 28824685.0,
      "step": 55
    },
    {
      "epoch": 0.35502958579881655,
      "grad_norm": 0.37536928057670593,
      "learning_rate": 1.646299237860941e-05,
      "loss": 0.7938,
      "mean_token_accuracy": 0.7634146034717559,
      "num_tokens": 31443682.0,
      "step": 60
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.3460127115249634,
      "learning_rate": 1.5641329760952514e-05,
      "loss": 0.7639,
      "mean_token_accuracy": 0.7715859562158585,
      "num_tokens": 34061327.0,
      "step": 65
    },
    {
      "epoch": 0.41420118343195267,
      "grad_norm": 0.37248924374580383,
      "learning_rate": 1.4759473930370738e-05,
      "loss": 0.7771,
      "mean_token_accuracy": 0.7680761635303497,
      "num_tokens": 36682767.0,
      "step": 70
    },
    {
      "epoch": 0.4437869822485207,
      "grad_norm": 0.34658947587013245,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 0.8028,
      "mean_token_accuracy": 0.7603934347629547,
      "num_tokens": 39300751.0,
      "step": 75
    },
    {
      "epoch": 0.47337278106508873,
      "grad_norm": 0.3706357479095459,
      "learning_rate": 1.2853362242491054e-05,
      "loss": 0.7738,
      "mean_token_accuracy": 0.7687152832746506,
      "num_tokens": 41918913.0,
      "step": 80
    },
    {
      "epoch": 0.5029585798816568,
      "grad_norm": 0.3420180380344391,
      "learning_rate": 1.1849444672715587e-05,
      "loss": 0.779,
      "mean_token_accuracy": 0.7665890276432037,
      "num_tokens": 44540169.0,
      "step": 85
    },
    {
      "epoch": 0.5325443786982249,
      "grad_norm": 0.3875311315059662,
      "learning_rate": 1.0825793454723325e-05,
      "loss": 0.7683,
      "mean_token_accuracy": 0.7695774495601654,
      "num_tokens": 47160969.0,
      "step": 90
    },
    {
      "epoch": 0.5621301775147929,
      "grad_norm": 0.35641783475875854,
      "learning_rate": 9.79333098772446e-06,
      "loss": 0.7692,
      "mean_token_accuracy": 0.7692601472139359,
      "num_tokens": 49782409.0,
      "step": 95
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 0.3525794446468353,
      "learning_rate": 8.763073687306523e-06,
      "loss": 0.7853,
      "step": 100
    },
    {
      "epoch": 0.591715976331361,
      "eval_loss": 0.7879331111907959,
      "eval_mean_token_accuracy": 0.77275630235672,
      "eval_num_tokens": 52403849.0,
      "eval_runtime": 1.3797,
      "eval_samples_per_second": 93.496,
      "eval_steps_per_second": 3.624,
      "step": 100
    },
    {
      "epoch": 0.621301775147929,
      "grad_norm": 0.31415921449661255,
      "learning_rate": 7.746014439841941e-06,
      "loss": 0.7483,
      "mean_token_accuracy": 0.7696478188037872,
      "num_tokens": 55025289.0,
      "step": 105
    },
    {
      "epoch": 0.650887573964497,
      "grad_norm": 0.35946062207221985,
      "learning_rate": 6.7530053079531664e-06,
      "loss": 0.751,
      "mean_token_accuracy": 0.773971700668335,
      "num_tokens": 57641987.0,
      "step": 110
    },
    {
      "epoch": 0.6804733727810651,
      "grad_norm": 0.3247712552547455,
      "learning_rate": 5.794641738572925e-06,
      "loss": 0.766,
      "mean_token_accuracy": 0.7699923694133759,
      "num_tokens": 60263427.0,
      "step": 115
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 0.32254937291145325,
      "learning_rate": 4.881149509103993e-06,
      "loss": 0.7655,
      "mean_token_accuracy": 0.7701400071382523,
      "num_tokens": 62882216.0,
      "step": 120
    },
    {
      "epoch": 0.7396449704142012,
      "grad_norm": 0.3108745515346527,
      "learning_rate": 4.0222756179675915e-06,
      "loss": 0.7772,
      "mean_token_accuracy": 0.7666765838861466,
      "num_tokens": 65503656.0,
      "step": 125
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.3257134258747101,
      "learning_rate": 3.2271842837425917e-06,
      "loss": 0.7507,
      "mean_token_accuracy": 0.7745073974132538,
      "num_tokens": 68125096.0,
      "step": 130
    },
    {
      "epoch": 0.7988165680473372,
      "grad_norm": 0.29293152689933777,
      "learning_rate": 2.504359162588741e-06,
      "loss": 0.7436,
      "mean_token_accuracy": 0.7756482750177384,
      "num_tokens": 70746536.0,
      "step": 135
    },
    {
      "epoch": 0.8284023668639053,
      "grad_norm": 0.3014591932296753,
      "learning_rate": 1.861512827298051e-06,
      "loss": 0.7438,
      "mean_token_accuracy": 0.7760634958744049,
      "num_tokens": 73365315.0,
      "step": 140
    },
    {
      "epoch": 0.8579881656804734,
      "grad_norm": 0.2900739312171936,
      "learning_rate": 1.305504473836331e-06,
      "loss": 0.7585,
      "mean_token_accuracy": 0.7716891765594482,
      "num_tokens": 75986755.0,
      "step": 145
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 0.2970161437988281,
      "learning_rate": 8.42266733449425e-07,
      "loss": 0.7436,
      "mean_token_accuracy": 0.7757604539394378,
      "num_tokens": 78608195.0,
      "step": 150
    },
    {
      "epoch": 0.9171597633136095,
      "grad_norm": 0.29888418316841125,
      "learning_rate": 4.7674237125185597e-07,
      "loss": 0.7513,
      "mean_token_accuracy": 0.7735760033130645,
      "num_tokens": 81229635.0,
      "step": 155
    },
    {
      "epoch": 0.9467455621301775,
      "grad_norm": 0.28909653425216675,
      "learning_rate": 2.1283154672645522e-07,
      "loss": 0.7583,
      "mean_token_accuracy": 0.7715224415063858,
      "num_tokens": 83851075.0,
      "step": 160
    },
    {
      "epoch": 0.9763313609467456,
      "grad_norm": 0.2939208447933197,
      "learning_rate": 5.3350198867574424e-08,
      "loss": 0.7567,
      "mean_token_accuracy": 0.7722329139709473,
      "num_tokens": 86472515.0,
      "step": 165
    },
    {
      "epoch": 1.0,
      "mean_token_accuracy": 0.7728699259459972,
      "num_tokens": 88569667.0,
      "step": 169,
      "total_flos": 6.966137809639834e+17,
      "train_loss": 0.80340930978222,
      "train_runtime": 717.7254,
      "train_samples_per_second": 30.133,
      "train_steps_per_second": 0.235
    }
  ],
  "logging_steps": 5,
  "max_steps": 169,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.966137809639834e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}