{
  "best_global_step": 420,
  "best_metric": 0.03601359203457832,
  "best_model_checkpoint": "/content/models/gemma_qlora_lmh/checkpoint-420",
  "epoch": 1.6535433070866141,
  "eval_steps": 20,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.0512761652469633,
      "epoch": 0.07874015748031496,
      "grad_norm": 2.0831573009490967,
      "learning_rate": 9.625984251968504e-06,
      "loss": 0.6806,
      "mean_token_accuracy": 0.8287491604685784,
      "num_tokens": 80574.0,
      "step": 20
    },
    {
      "epoch": 0.07874015748031496,
      "eval_entropy": 2.0480873808264732,
      "eval_loss": 0.16752119362354279,
      "eval_mean_token_accuracy": 0.9204465709626675,
      "eval_num_tokens": 80574.0,
      "eval_runtime": 14.4058,
      "eval_samples_per_second": 70.388,
      "eval_steps_per_second": 4.443,
      "step": 20
    },
    {
      "entropy": 2.043068042397499,
      "epoch": 0.15748031496062992,
      "grad_norm": 3.2172703742980957,
      "learning_rate": 9.23228346456693e-06,
      "loss": 0.1758,
      "mean_token_accuracy": 0.9078111112117767,
      "num_tokens": 163585.0,
      "step": 40
    },
    {
      "epoch": 0.15748031496062992,
      "eval_entropy": 1.998575333505869,
      "eval_loss": 0.13858385384082794,
      "eval_mean_token_accuracy": 0.9343411969020963,
      "eval_num_tokens": 163585.0,
      "eval_runtime": 14.4076,
      "eval_samples_per_second": 70.38,
      "eval_steps_per_second": 4.442,
      "step": 40
    },
    {
      "entropy": 2.0217554807662963,
      "epoch": 0.23622047244094488,
      "grad_norm": 6.096222877502441,
      "learning_rate": 8.838582677165355e-06,
      "loss": 0.1675,
      "mean_token_accuracy": 0.922796231508255,
      "num_tokens": 247496.0,
      "step": 60
    },
    {
      "epoch": 0.23622047244094488,
      "eval_entropy": 2.0012808348983526,
      "eval_loss": 0.17316989600658417,
      "eval_mean_token_accuracy": 0.9123187400400639,
      "eval_num_tokens": 247496.0,
      "eval_runtime": 14.4234,
      "eval_samples_per_second": 70.302,
      "eval_steps_per_second": 4.437,
      "step": 60
    },
    {
      "entropy": 2.01616033911705,
      "epoch": 0.31496062992125984,
      "grad_norm": 3.203735113143921,
      "learning_rate": 8.444881889763782e-06,
      "loss": 0.1077,
      "mean_token_accuracy": 0.9547338560223579,
      "num_tokens": 330519.0,
      "step": 80
    },
    {
      "epoch": 0.31496062992125984,
      "eval_entropy": 1.967957517132163,
      "eval_loss": 0.10772992670536041,
      "eval_mean_token_accuracy": 0.9575660079717636,
      "eval_num_tokens": 330519.0,
      "eval_runtime": 14.4579,
      "eval_samples_per_second": 70.135,
      "eval_steps_per_second": 4.427,
      "step": 80
    },
    {
      "entropy": 1.9572902411222457,
      "epoch": 0.3937007874015748,
      "grad_norm": 12.593894004821777,
      "learning_rate": 8.051181102362205e-06,
      "loss": 0.0977,
      "mean_token_accuracy": 0.9625995993614197,
      "num_tokens": 412053.0,
      "step": 100
    },
    {
      "epoch": 0.3937007874015748,
      "eval_entropy": 1.9561998695135117,
      "eval_loss": 0.10965846478939056,
      "eval_mean_token_accuracy": 0.9605293860659003,
      "eval_num_tokens": 412053.0,
      "eval_runtime": 14.4568,
      "eval_samples_per_second": 70.14,
      "eval_steps_per_second": 4.427,
      "step": 100
    },
    {
      "entropy": 1.9785019993782043,
      "epoch": 0.47244094488188976,
      "grad_norm": 5.114565849304199,
      "learning_rate": 7.65748031496063e-06,
      "loss": 0.0684,
      "mean_token_accuracy": 0.9732592537999153,
      "num_tokens": 492803.0,
      "step": 120
    },
    {
      "epoch": 0.47244094488188976,
      "eval_entropy": 1.9417847488075495,
      "eval_loss": 0.11676711589097977,
      "eval_mean_token_accuracy": 0.970998496748507,
      "eval_num_tokens": 492803.0,
      "eval_runtime": 15.073,
      "eval_samples_per_second": 67.273,
      "eval_steps_per_second": 4.246,
      "step": 120
    },
    {
      "entropy": 1.9924108654260635,
      "epoch": 0.5511811023622047,
      "grad_norm": 4.5052642822265625,
      "learning_rate": 7.263779527559056e-06,
      "loss": 0.0727,
      "mean_token_accuracy": 0.9757537111639977,
      "num_tokens": 574646.0,
      "step": 140
    },
    {
      "epoch": 0.5511811023622047,
      "eval_entropy": 1.9667269736528397,
      "eval_loss": 0.05474039167165756,
      "eval_mean_token_accuracy": 0.981005135923624,
      "eval_num_tokens": 574646.0,
      "eval_runtime": 14.4351,
      "eval_samples_per_second": 70.245,
      "eval_steps_per_second": 4.434,
      "step": 140
    },
    {
      "entropy": 1.9721805155277252,
      "epoch": 0.6299212598425197,
      "grad_norm": 3.2773079872131348,
      "learning_rate": 6.870078740157481e-06,
      "loss": 0.0417,
      "mean_token_accuracy": 0.9847830027341843,
      "num_tokens": 656089.0,
      "step": 160
    },
    {
      "epoch": 0.6299212598425197,
      "eval_entropy": 1.9357626736164093,
      "eval_loss": 0.05493154749274254,
      "eval_mean_token_accuracy": 0.9821033654734492,
      "eval_num_tokens": 656089.0,
      "eval_runtime": 14.7904,
      "eval_samples_per_second": 68.558,
      "eval_steps_per_second": 4.327,
      "step": 160
    },
    {
      "entropy": 1.980790689587593,
      "epoch": 0.7086614173228346,
      "grad_norm": 3.153142213821411,
      "learning_rate": 6.476377952755906e-06,
      "loss": 0.0717,
      "mean_token_accuracy": 0.9758390337228775,
      "num_tokens": 736840.0,
      "step": 180
    },
    {
      "epoch": 0.7086614173228346,
      "eval_entropy": 1.9667537324130535,
      "eval_loss": 0.05807032063603401,
      "eval_mean_token_accuracy": 0.9831889141350985,
      "eval_num_tokens": 736840.0,
      "eval_runtime": 14.7032,
      "eval_samples_per_second": 68.965,
      "eval_steps_per_second": 4.353,
      "step": 180
    },
    {
      "entropy": 1.990926167368889,
      "epoch": 0.7874015748031497,
      "grad_norm": 2.1746864318847656,
      "learning_rate": 6.082677165354331e-06,
      "loss": 0.0596,
      "mean_token_accuracy": 0.9820288747549057,
      "num_tokens": 818671.0,
      "step": 200
    },
    {
      "epoch": 0.7874015748031497,
      "eval_entropy": 1.9952750019729137,
      "eval_loss": 0.039442677050828934,
      "eval_mean_token_accuracy": 0.9868771303445101,
      "eval_num_tokens": 818671.0,
      "eval_runtime": 14.4324,
      "eval_samples_per_second": 70.259,
      "eval_steps_per_second": 4.434,
      "step": 200
    },
    {
      "entropy": 1.9992768168449402,
      "epoch": 0.8661417322834646,
      "grad_norm": 3.7657172679901123,
      "learning_rate": 5.6889763779527565e-06,
      "loss": 0.0434,
      "mean_token_accuracy": 0.9879351630806923,
      "num_tokens": 900585.0,
      "step": 220
    },
    {
      "epoch": 0.8661417322834646,
      "eval_entropy": 1.9681989178061485,
      "eval_loss": 0.06415939331054688,
      "eval_mean_token_accuracy": 0.9848531475290656,
      "eval_num_tokens": 900585.0,
      "eval_runtime": 14.4299,
      "eval_samples_per_second": 70.271,
      "eval_steps_per_second": 4.435,
      "step": 220
    },
    {
      "entropy": 1.9873531699180602,
      "epoch": 0.9448818897637795,
      "grad_norm": 1.2447177171707153,
      "learning_rate": 5.295275590551181e-06,
      "loss": 0.0495,
      "mean_token_accuracy": 0.9891579717397689,
      "num_tokens": 982502.0,
      "step": 240
    },
    {
      "epoch": 0.9448818897637795,
      "eval_entropy": 1.9764121137559414,
      "eval_loss": 0.05618462339043617,
      "eval_mean_token_accuracy": 0.984872730448842,
      "eval_num_tokens": 982502.0,
      "eval_runtime": 14.5073,
      "eval_samples_per_second": 69.896,
      "eval_steps_per_second": 4.412,
      "step": 240
    },
    {
      "entropy": 1.9799151510000228,
      "epoch": 1.0236220472440944,
      "grad_norm": 2.651578426361084,
      "learning_rate": 4.901574803149607e-06,
      "loss": 0.037,
      "mean_token_accuracy": 0.9879837512969971,
      "num_tokens": 1061407.0,
      "step": 260
    },
    {
      "epoch": 1.0236220472440944,
      "eval_entropy": 1.9760331977158785,
      "eval_loss": 0.04186183959245682,
      "eval_mean_token_accuracy": 0.9875151747837663,
      "eval_num_tokens": 1061407.0,
      "eval_runtime": 14.3962,
      "eval_samples_per_second": 70.435,
      "eval_steps_per_second": 4.446,
      "step": 260
    },
    {
      "entropy": 1.9939074516296387,
      "epoch": 1.1023622047244095,
      "grad_norm": 2.1040918827056885,
      "learning_rate": 4.507874015748032e-06,
      "loss": 0.0287,
      "mean_token_accuracy": 0.9914279609918595,
      "num_tokens": 1144556.0,
      "step": 280
    },
    {
      "epoch": 1.1023622047244095,
      "eval_entropy": 1.9513436201959848,
      "eval_loss": 0.03779410198330879,
      "eval_mean_token_accuracy": 0.9891293849796057,
      "eval_num_tokens": 1144556.0,
      "eval_runtime": 14.4692,
      "eval_samples_per_second": 70.08,
      "eval_steps_per_second": 4.423,
      "step": 280
    },
    {
      "entropy": 1.9635676503181458,
      "epoch": 1.1811023622047245,
      "grad_norm": 2.812749147415161,
      "learning_rate": 4.114173228346457e-06,
      "loss": 0.0257,
      "mean_token_accuracy": 0.9924521818757057,
      "num_tokens": 1228054.0,
      "step": 300
    },
    {
      "epoch": 1.1811023622047245,
      "eval_entropy": 1.9615301713347435,
      "eval_loss": 0.043686393648386,
      "eval_mean_token_accuracy": 0.9885605089366436,
      "eval_num_tokens": 1228054.0,
      "eval_runtime": 15.1051,
      "eval_samples_per_second": 67.13,
      "eval_steps_per_second": 4.237,
      "step": 300
    },
    {
      "entropy": 1.9595869064331055,
      "epoch": 1.2598425196850394,
      "grad_norm": 0.0898560956120491,
      "learning_rate": 3.7204724409448824e-06,
      "loss": 0.016,
      "mean_token_accuracy": 0.9946568146347999,
      "num_tokens": 1310062.0,
      "step": 320
    },
    {
      "epoch": 1.2598425196850394,
      "eval_entropy": 1.9571783430874348,
      "eval_loss": 0.04049532115459442,
      "eval_mean_token_accuracy": 0.9888405678793788,
      "eval_num_tokens": 1310062.0,
      "eval_runtime": 14.8004,
      "eval_samples_per_second": 68.512,
      "eval_steps_per_second": 4.324,
      "step": 320
    },
    {
      "entropy": 1.9704302370548248,
      "epoch": 1.3385826771653544,
      "grad_norm": 0.1551266759634018,
      "learning_rate": 3.3267716535433077e-06,
      "loss": 0.0272,
      "mean_token_accuracy": 0.991837514936924,
      "num_tokens": 1392015.0,
      "step": 340
    },
    {
      "epoch": 1.3385826771653544,
      "eval_entropy": 1.9510800130665302,
      "eval_loss": 0.040886349976062775,
      "eval_mean_token_accuracy": 0.9888559766113758,
      "eval_num_tokens": 1392015.0,
      "eval_runtime": 14.4788,
      "eval_samples_per_second": 70.033,
      "eval_steps_per_second": 4.42,
      "step": 340
    },
    {
      "entropy": 1.9628229856491088,
      "epoch": 1.4173228346456692,
      "grad_norm": 0.49692580103874207,
      "learning_rate": 2.9330708661417322e-06,
      "loss": 0.0242,
      "mean_token_accuracy": 0.99283407330513,
      "num_tokens": 1472981.0,
      "step": 360
    },
    {
      "epoch": 1.4173228346456692,
      "eval_entropy": 1.943079276010394,
      "eval_loss": 0.03727104514837265,
      "eval_mean_token_accuracy": 0.9910982735455036,
      "eval_num_tokens": 1472981.0,
      "eval_runtime": 14.4234,
      "eval_samples_per_second": 70.303,
      "eval_steps_per_second": 4.437,
      "step": 360
    },
    {
      "entropy": 1.957309964299202,
      "epoch": 1.4960629921259843,
      "grad_norm": 2.6952269077301025,
      "learning_rate": 2.5393700787401576e-06,
      "loss": 0.0304,
      "mean_token_accuracy": 0.9920013844966888,
      "num_tokens": 1554129.0,
      "step": 380
    },
    {
      "epoch": 1.4960629921259843,
      "eval_entropy": 1.942601079121232,
      "eval_loss": 0.03651127964258194,
      "eval_mean_token_accuracy": 0.9905505627393723,
      "eval_num_tokens": 1554129.0,
      "eval_runtime": 14.4247,
      "eval_samples_per_second": 70.296,
      "eval_steps_per_second": 4.437,
      "step": 380
    },
    {
      "entropy": 1.9591941982507706,
      "epoch": 1.574803149606299,
      "grad_norm": 4.7762556076049805,
      "learning_rate": 2.145669291338583e-06,
      "loss": 0.0251,
      "mean_token_accuracy": 0.9915260329842568,
      "num_tokens": 1636074.0,
      "step": 400
    },
    {
      "epoch": 1.574803149606299,
      "eval_entropy": 1.9433052614331245,
      "eval_loss": 0.039551958441734314,
      "eval_mean_token_accuracy": 0.9907209984958172,
      "eval_num_tokens": 1636074.0,
      "eval_runtime": 14.4728,
      "eval_samples_per_second": 70.063,
      "eval_steps_per_second": 4.422,
      "step": 400
    },
    {
      "entropy": 1.972610130906105,
      "epoch": 1.6535433070866141,
      "grad_norm": 6.626554012298584,
      "learning_rate": 1.7519685039370079e-06,
      "loss": 0.0172,
      "mean_token_accuracy": 0.9950673922896385,
      "num_tokens": 1718697.0,
      "step": 420
    },
    {
      "epoch": 1.6535433070866141,
      "eval_entropy": 1.9396098591387272,
      "eval_loss": 0.03601359203457832,
      "eval_mean_token_accuracy": 0.9912841068580747,
      "eval_num_tokens": 1718697.0,
      "eval_runtime": 15.1369,
      "eval_samples_per_second": 66.989,
      "eval_steps_per_second": 4.228,
      "step": 420
    }
  ],
  "logging_steps": 20,
  "max_steps": 508,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.5052021162399744e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}