{ "best_global_step": 420, "best_metric": 0.03601359203457832, "best_model_checkpoint": "/content/models/gemma_qlora_lmh/checkpoint-420", "epoch": 1.6535433070866141, "eval_steps": 20, "global_step": 420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.0512761652469633, "epoch": 0.07874015748031496, "grad_norm": 2.0831573009490967, "learning_rate": 9.625984251968504e-06, "loss": 0.6806, "mean_token_accuracy": 0.8287491604685784, "num_tokens": 80574.0, "step": 20 }, { "epoch": 0.07874015748031496, "eval_entropy": 2.0480873808264732, "eval_loss": 0.16752119362354279, "eval_mean_token_accuracy": 0.9204465709626675, "eval_num_tokens": 80574.0, "eval_runtime": 14.4058, "eval_samples_per_second": 70.388, "eval_steps_per_second": 4.443, "step": 20 }, { "entropy": 2.043068042397499, "epoch": 0.15748031496062992, "grad_norm": 3.2172703742980957, "learning_rate": 9.23228346456693e-06, "loss": 0.1758, "mean_token_accuracy": 0.9078111112117767, "num_tokens": 163585.0, "step": 40 }, { "epoch": 0.15748031496062992, "eval_entropy": 1.998575333505869, "eval_loss": 0.13858385384082794, "eval_mean_token_accuracy": 0.9343411969020963, "eval_num_tokens": 163585.0, "eval_runtime": 14.4076, "eval_samples_per_second": 70.38, "eval_steps_per_second": 4.442, "step": 40 }, { "entropy": 2.0217554807662963, "epoch": 0.23622047244094488, "grad_norm": 6.096222877502441, "learning_rate": 8.838582677165355e-06, "loss": 0.1675, "mean_token_accuracy": 0.922796231508255, "num_tokens": 247496.0, "step": 60 }, { "epoch": 0.23622047244094488, "eval_entropy": 2.0012808348983526, "eval_loss": 0.17316989600658417, "eval_mean_token_accuracy": 0.9123187400400639, "eval_num_tokens": 247496.0, "eval_runtime": 14.4234, "eval_samples_per_second": 70.302, "eval_steps_per_second": 4.437, "step": 60 }, { "entropy": 2.01616033911705, "epoch": 0.31496062992125984, "grad_norm": 3.203735113143921, "learning_rate": 8.444881889763782e-06, "loss": 0.1077, "mean_token_accuracy": 0.9547338560223579, "num_tokens": 330519.0, "step": 80 }, { "epoch": 0.31496062992125984, "eval_entropy": 1.967957517132163, "eval_loss": 0.10772992670536041, "eval_mean_token_accuracy": 0.9575660079717636, "eval_num_tokens": 330519.0, "eval_runtime": 14.4579, "eval_samples_per_second": 70.135, "eval_steps_per_second": 4.427, "step": 80 }, { "entropy": 1.9572902411222457, "epoch": 0.3937007874015748, "grad_norm": 12.593894004821777, "learning_rate": 8.051181102362205e-06, "loss": 0.0977, "mean_token_accuracy": 0.9625995993614197, "num_tokens": 412053.0, "step": 100 }, { "epoch": 0.3937007874015748, "eval_entropy": 1.9561998695135117, "eval_loss": 0.10965846478939056, "eval_mean_token_accuracy": 0.9605293860659003, "eval_num_tokens": 412053.0, "eval_runtime": 14.4568, "eval_samples_per_second": 70.14, "eval_steps_per_second": 4.427, "step": 100 }, { "entropy": 1.9785019993782043, "epoch": 0.47244094488188976, "grad_norm": 5.114565849304199, "learning_rate": 7.65748031496063e-06, "loss": 0.0684, "mean_token_accuracy": 0.9732592537999153, "num_tokens": 492803.0, "step": 120 }, { "epoch": 0.47244094488188976, "eval_entropy": 1.9417847488075495, "eval_loss": 0.11676711589097977, "eval_mean_token_accuracy": 0.970998496748507, "eval_num_tokens": 492803.0, "eval_runtime": 15.073, "eval_samples_per_second": 67.273, "eval_steps_per_second": 4.246, "step": 120 }, { "entropy": 1.9924108654260635, "epoch": 0.5511811023622047, "grad_norm": 4.5052642822265625, "learning_rate": 7.263779527559056e-06, "loss": 0.0727, "mean_token_accuracy": 0.9757537111639977, "num_tokens": 574646.0, "step": 140 }, { "epoch": 0.5511811023622047, "eval_entropy": 1.9667269736528397, "eval_loss": 0.05474039167165756, "eval_mean_token_accuracy": 0.981005135923624, "eval_num_tokens": 574646.0, "eval_runtime": 14.4351, "eval_samples_per_second": 70.245, "eval_steps_per_second": 4.434, "step": 140 }, { "entropy": 1.9721805155277252, "epoch": 0.6299212598425197, "grad_norm": 3.2773079872131348, "learning_rate": 6.870078740157481e-06, "loss": 0.0417, "mean_token_accuracy": 0.9847830027341843, "num_tokens": 656089.0, "step": 160 }, { "epoch": 0.6299212598425197, "eval_entropy": 1.9357626736164093, "eval_loss": 0.05493154749274254, "eval_mean_token_accuracy": 0.9821033654734492, "eval_num_tokens": 656089.0, "eval_runtime": 14.7904, "eval_samples_per_second": 68.558, "eval_steps_per_second": 4.327, "step": 160 }, { "entropy": 1.980790689587593, "epoch": 0.7086614173228346, "grad_norm": 3.153142213821411, "learning_rate": 6.476377952755906e-06, "loss": 0.0717, "mean_token_accuracy": 0.9758390337228775, "num_tokens": 736840.0, "step": 180 }, { "epoch": 0.7086614173228346, "eval_entropy": 1.9667537324130535, "eval_loss": 0.05807032063603401, "eval_mean_token_accuracy": 0.9831889141350985, "eval_num_tokens": 736840.0, "eval_runtime": 14.7032, "eval_samples_per_second": 68.965, "eval_steps_per_second": 4.353, "step": 180 }, { "entropy": 1.990926167368889, "epoch": 0.7874015748031497, "grad_norm": 2.1746864318847656, "learning_rate": 6.082677165354331e-06, "loss": 0.0596, "mean_token_accuracy": 0.9820288747549057, "num_tokens": 818671.0, "step": 200 }, { "epoch": 0.7874015748031497, "eval_entropy": 1.9952750019729137, "eval_loss": 0.039442677050828934, "eval_mean_token_accuracy": 0.9868771303445101, "eval_num_tokens": 818671.0, "eval_runtime": 14.4324, "eval_samples_per_second": 70.259, "eval_steps_per_second": 4.434, "step": 200 }, { "entropy": 1.9992768168449402, "epoch": 0.8661417322834646, "grad_norm": 3.7657172679901123, "learning_rate": 5.6889763779527565e-06, "loss": 0.0434, "mean_token_accuracy": 0.9879351630806923, "num_tokens": 900585.0, "step": 220 }, { "epoch": 0.8661417322834646, "eval_entropy": 1.9681989178061485, "eval_loss": 0.06415939331054688, "eval_mean_token_accuracy": 0.9848531475290656, "eval_num_tokens": 900585.0, "eval_runtime": 14.4299, "eval_samples_per_second": 70.271, "eval_steps_per_second": 4.435, "step": 220 }, { "entropy": 1.9873531699180602, "epoch": 0.9448818897637795, "grad_norm": 1.2447177171707153, "learning_rate": 5.295275590551181e-06, "loss": 0.0495, "mean_token_accuracy": 0.9891579717397689, "num_tokens": 982502.0, "step": 240 }, { "epoch": 0.9448818897637795, "eval_entropy": 1.9764121137559414, "eval_loss": 0.05618462339043617, "eval_mean_token_accuracy": 0.984872730448842, "eval_num_tokens": 982502.0, "eval_runtime": 14.5073, "eval_samples_per_second": 69.896, "eval_steps_per_second": 4.412, "step": 240 }, { "entropy": 1.9799151510000228, "epoch": 1.0236220472440944, "grad_norm": 2.651578426361084, "learning_rate": 4.901574803149607e-06, "loss": 0.037, "mean_token_accuracy": 0.9879837512969971, "num_tokens": 1061407.0, "step": 260 }, { "epoch": 1.0236220472440944, "eval_entropy": 1.9760331977158785, "eval_loss": 0.04186183959245682, "eval_mean_token_accuracy": 0.9875151747837663, "eval_num_tokens": 1061407.0, "eval_runtime": 14.3962, "eval_samples_per_second": 70.435, "eval_steps_per_second": 4.446, "step": 260 }, { "entropy": 1.9939074516296387, "epoch": 1.1023622047244095, "grad_norm": 2.1040918827056885, "learning_rate": 4.507874015748032e-06, "loss": 0.0287, "mean_token_accuracy": 0.9914279609918595, "num_tokens": 1144556.0, "step": 280 }, { "epoch": 1.1023622047244095, "eval_entropy": 1.9513436201959848, "eval_loss": 0.03779410198330879, "eval_mean_token_accuracy": 0.9891293849796057, "eval_num_tokens": 1144556.0, "eval_runtime": 14.4692, "eval_samples_per_second": 70.08, "eval_steps_per_second": 4.423, "step": 280 }, { "entropy": 1.9635676503181458, "epoch": 1.1811023622047245, "grad_norm": 2.812749147415161, "learning_rate": 4.114173228346457e-06, "loss": 0.0257, "mean_token_accuracy": 0.9924521818757057, "num_tokens": 1228054.0, "step": 300 }, { "epoch": 1.1811023622047245, "eval_entropy": 1.9615301713347435, "eval_loss": 0.043686393648386, "eval_mean_token_accuracy": 0.9885605089366436, "eval_num_tokens": 1228054.0, "eval_runtime": 15.1051, "eval_samples_per_second": 67.13, "eval_steps_per_second": 4.237, "step": 300 }, { "entropy": 1.9595869064331055, "epoch": 1.2598425196850394, "grad_norm": 0.0898560956120491, "learning_rate": 3.7204724409448824e-06, "loss": 0.016, "mean_token_accuracy": 0.9946568146347999, "num_tokens": 1310062.0, "step": 320 }, { "epoch": 1.2598425196850394, "eval_entropy": 1.9571783430874348, "eval_loss": 0.04049532115459442, "eval_mean_token_accuracy": 0.9888405678793788, "eval_num_tokens": 1310062.0, "eval_runtime": 14.8004, "eval_samples_per_second": 68.512, "eval_steps_per_second": 4.324, "step": 320 }, { "entropy": 1.9704302370548248, "epoch": 1.3385826771653544, "grad_norm": 0.1551266759634018, "learning_rate": 3.3267716535433077e-06, "loss": 0.0272, "mean_token_accuracy": 0.991837514936924, "num_tokens": 1392015.0, "step": 340 }, { "epoch": 1.3385826771653544, "eval_entropy": 1.9510800130665302, "eval_loss": 0.040886349976062775, "eval_mean_token_accuracy": 0.9888559766113758, "eval_num_tokens": 1392015.0, "eval_runtime": 14.4788, "eval_samples_per_second": 70.033, "eval_steps_per_second": 4.42, "step": 340 }, { "entropy": 1.9628229856491088, "epoch": 1.4173228346456692, "grad_norm": 0.49692580103874207, "learning_rate": 2.9330708661417322e-06, "loss": 0.0242, "mean_token_accuracy": 0.99283407330513, "num_tokens": 1472981.0, "step": 360 }, { "epoch": 1.4173228346456692, "eval_entropy": 1.943079276010394, "eval_loss": 0.03727104514837265, "eval_mean_token_accuracy": 0.9910982735455036, "eval_num_tokens": 1472981.0, "eval_runtime": 14.4234, "eval_samples_per_second": 70.303, "eval_steps_per_second": 4.437, "step": 360 }, { "entropy": 1.957309964299202, "epoch": 1.4960629921259843, "grad_norm": 2.6952269077301025, "learning_rate": 2.5393700787401576e-06, "loss": 0.0304, "mean_token_accuracy": 0.9920013844966888, "num_tokens": 1554129.0, "step": 380 }, { "epoch": 1.4960629921259843, "eval_entropy": 1.942601079121232, "eval_loss": 0.03651127964258194, "eval_mean_token_accuracy": 0.9905505627393723, "eval_num_tokens": 1554129.0, "eval_runtime": 14.4247, "eval_samples_per_second": 70.296, "eval_steps_per_second": 4.437, "step": 380 }, { "entropy": 1.9591941982507706, "epoch": 1.574803149606299, "grad_norm": 4.7762556076049805, "learning_rate": 2.145669291338583e-06, "loss": 0.0251, "mean_token_accuracy": 0.9915260329842568, "num_tokens": 1636074.0, "step": 400 }, { "epoch": 1.574803149606299, "eval_entropy": 1.9433052614331245, "eval_loss": 0.039551958441734314, "eval_mean_token_accuracy": 0.9907209984958172, "eval_num_tokens": 1636074.0, "eval_runtime": 14.4728, "eval_samples_per_second": 70.063, "eval_steps_per_second": 4.422, "step": 400 }, { "entropy": 1.972610130906105, "epoch": 1.6535433070866141, "grad_norm": 6.626554012298584, "learning_rate": 1.7519685039370079e-06, "loss": 0.0172, "mean_token_accuracy": 0.9950673922896385, "num_tokens": 1718697.0, "step": 420 }, { "epoch": 1.6535433070866141, "eval_entropy": 1.9396098591387272, "eval_loss": 0.03601359203457832, "eval_mean_token_accuracy": 0.9912841068580747, "eval_num_tokens": 1718697.0, "eval_runtime": 15.1369, "eval_samples_per_second": 66.989, "eval_steps_per_second": 4.228, "step": 420 } ], "logging_steps": 20, "max_steps": 508, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5052021162399744e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }