{ "best_global_step": 420, "best_metric": 0.03255658224225044, "best_model_checkpoint": "/content/models/gemma_qlora_lmh/checkpoint-420", "epoch": 1.6535433070866141, "eval_steps": 20, "global_step": 420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.4301040887832643, "epoch": 0.07874015748031496, "grad_norm": 3.0547735691070557, "learning_rate": 9.625984251968504e-06, "loss": 0.5992, "mean_token_accuracy": 0.7930104970932007, "num_tokens": 75454.0, "step": 20 }, { "epoch": 0.07874015748031496, "eval_entropy": 2.0184655766934156, "eval_loss": 0.17359499633312225, "eval_mean_token_accuracy": 0.9012619638815522, "eval_num_tokens": 75454.0, "eval_runtime": 13.9893, "eval_samples_per_second": 72.484, "eval_steps_per_second": 4.575, "step": 20 }, { "entropy": 2.124652886390686, "epoch": 0.15748031496062992, "grad_norm": 4.66159725189209, "learning_rate": 9.23228346456693e-06, "loss": 0.206, "mean_token_accuracy": 0.8831128552556038, "num_tokens": 153345.0, "step": 40 }, { "epoch": 0.15748031496062992, "eval_entropy": 2.156722355633974, "eval_loss": 0.1739385426044464, "eval_mean_token_accuracy": 0.8968315636739135, "eval_num_tokens": 153345.0, "eval_runtime": 14.0205, "eval_samples_per_second": 72.323, "eval_steps_per_second": 4.565, "step": 40 }, { "entropy": 2.0995097190141676, "epoch": 0.23622047244094488, "grad_norm": 4.2743754386901855, "learning_rate": 8.838582677165355e-06, "loss": 0.1494, "mean_token_accuracy": 0.9216955065727234, "num_tokens": 232136.0, "step": 60 }, { "epoch": 0.23622047244094488, "eval_entropy": 2.0814744140952826, "eval_loss": 0.13872948288917542, "eval_mean_token_accuracy": 0.9309953525662422, "eval_num_tokens": 232136.0, "eval_runtime": 13.9666, "eval_samples_per_second": 72.602, "eval_steps_per_second": 4.582, "step": 60 }, { "entropy": 2.114528650045395, "epoch": 0.31496062992125984, "grad_norm": 2.626481294631958, "learning_rate": 8.444881889763782e-06, "loss": 0.1602, "mean_token_accuracy": 0.9276395246386528, "num_tokens": 310039.0, "step": 80 }, { "epoch": 0.31496062992125984, "eval_entropy": 2.021162658929825, "eval_loss": 0.10903553664684296, "eval_mean_token_accuracy": 0.9508337117731571, "eval_num_tokens": 310039.0, "eval_runtime": 13.9195, "eval_samples_per_second": 72.848, "eval_steps_per_second": 4.598, "step": 80 }, { "entropy": 2.0373571157455443, "epoch": 0.3937007874015748, "grad_norm": 12.651391983032227, "learning_rate": 8.051181102362205e-06, "loss": 0.1181, "mean_token_accuracy": 0.9546803295612335, "num_tokens": 386453.0, "step": 100 }, { "epoch": 0.3937007874015748, "eval_entropy": 2.0321336779743433, "eval_loss": 0.08907214552164078, "eval_mean_token_accuracy": 0.962163164280355, "eval_num_tokens": 386453.0, "eval_runtime": 13.8806, "eval_samples_per_second": 73.051, "eval_steps_per_second": 4.611, "step": 100 }, { "entropy": 2.084933453798294, "epoch": 0.47244094488188976, "grad_norm": 1.525368094444275, "learning_rate": 7.65748031496063e-06, "loss": 0.1062, "mean_token_accuracy": 0.95157091319561, "num_tokens": 462083.0, "step": 120 }, { "epoch": 0.47244094488188976, "eval_entropy": 2.055444575846195, "eval_loss": 0.07883646339178085, "eval_mean_token_accuracy": 0.9678388619795442, "eval_num_tokens": 462083.0, "eval_runtime": 13.8682, "eval_samples_per_second": 73.117, "eval_steps_per_second": 4.615, "step": 120 }, { "entropy": 2.041289675235748, "epoch": 0.5511811023622047, "grad_norm": 4.715269565582275, "learning_rate": 7.263779527559056e-06, "loss": 0.0791, "mean_token_accuracy": 0.9711190596222877, "num_tokens": 538806.0, "step": 140 }, { "epoch": 0.5511811023622047, "eval_entropy": 2.0219028927385807, "eval_loss": 0.06344127655029297, "eval_mean_token_accuracy": 0.9835457233712077, "eval_num_tokens": 538806.0, "eval_runtime": 13.8631, "eval_samples_per_second": 73.144, "eval_steps_per_second": 4.617, "step": 140 }, { "entropy": 2.046231508255005, "epoch": 0.6299212598425197, "grad_norm": 4.157348155975342, "learning_rate": 6.870078740157481e-06, "loss": 0.0639, "mean_token_accuracy": 0.9775616720318794, "num_tokens": 615129.0, "step": 160 }, { "epoch": 0.6299212598425197, "eval_entropy": 2.0033062752336264, "eval_loss": 0.053461696952581406, "eval_mean_token_accuracy": 0.9832161571830511, "eval_num_tokens": 615129.0, "eval_runtime": 13.9444, "eval_samples_per_second": 72.717, "eval_steps_per_second": 4.59, "step": 160 }, { "entropy": 2.062005800008774, "epoch": 0.7086614173228346, "grad_norm": 10.413922309875488, "learning_rate": 6.476377952755906e-06, "loss": 0.0747, "mean_token_accuracy": 0.9757226049900055, "num_tokens": 690760.0, "step": 180 }, { "epoch": 0.7086614173228346, "eval_entropy": 2.062701778486371, "eval_loss": 0.07900257408618927, "eval_mean_token_accuracy": 0.9750550417229533, "eval_num_tokens": 690760.0, "eval_runtime": 14.2787, "eval_samples_per_second": 71.015, "eval_steps_per_second": 4.482, "step": 180 }, { "entropy": 2.078292927145958, "epoch": 0.7874015748031497, "grad_norm": 3.501204252243042, "learning_rate": 6.082677165354331e-06, "loss": 0.052, "mean_token_accuracy": 0.9841863334178924, "num_tokens": 767471.0, "step": 200 }, { "epoch": 0.7874015748031497, "eval_entropy": 2.0887723341584206, "eval_loss": 0.046214085072278976, "eval_mean_token_accuracy": 0.985442828387022, "eval_num_tokens": 767471.0, "eval_runtime": 13.9887, "eval_samples_per_second": 72.487, "eval_steps_per_second": 4.575, "step": 200 }, { "entropy": 2.1700605511665345, "epoch": 0.8661417322834646, "grad_norm": 5.373133659362793, "learning_rate": 5.6889763779527565e-06, "loss": 0.0646, "mean_token_accuracy": 0.9767223253846169, "num_tokens": 844265.0, "step": 220 }, { "epoch": 0.8661417322834646, "eval_entropy": 2.134835472330451, "eval_loss": 0.06774821132421494, "eval_mean_token_accuracy": 0.9768518777564168, "eval_num_tokens": 844265.0, "eval_runtime": 13.9674, "eval_samples_per_second": 72.598, "eval_steps_per_second": 4.582, "step": 220 }, { "entropy": 2.130521237850189, "epoch": 0.9448818897637795, "grad_norm": 0.8587220311164856, "learning_rate": 5.295275590551181e-06, "loss": 0.0431, "mean_token_accuracy": 0.9848933383822441, "num_tokens": 921062.0, "step": 240 }, { "epoch": 0.9448818897637795, "eval_entropy": 2.1466477904468775, "eval_loss": 0.04291221499443054, "eval_mean_token_accuracy": 0.9876205483451486, "eval_num_tokens": 921062.0, "eval_runtime": 13.8812, "eval_samples_per_second": 73.048, "eval_steps_per_second": 4.611, "step": 240 }, { "entropy": 2.068550485372543, "epoch": 1.0236220472440944, "grad_norm": 7.186275005340576, "learning_rate": 4.901574803149607e-06, "loss": 0.0398, "mean_token_accuracy": 0.9878435462713242, "num_tokens": 994943.0, "step": 260 }, { "epoch": 1.0236220472440944, "eval_entropy": 2.0358662642538548, "eval_loss": 0.06262390315532684, "eval_mean_token_accuracy": 0.9817966390401125, "eval_num_tokens": 994943.0, "eval_runtime": 13.9135, "eval_samples_per_second": 72.879, "eval_steps_per_second": 4.6, "step": 260 }, { "entropy": 2.0602549403905868, "epoch": 1.1023622047244095, "grad_norm": 3.056312084197998, "learning_rate": 4.507874015748032e-06, "loss": 0.0296, "mean_token_accuracy": 0.9905342325568199, "num_tokens": 1072972.0, "step": 280 }, { "epoch": 1.1023622047244095, "eval_entropy": 2.0256000570952892, "eval_loss": 0.0363699272274971, "eval_mean_token_accuracy": 0.989073995500803, "eval_num_tokens": 1072972.0, "eval_runtime": 13.9066, "eval_samples_per_second": 72.915, "eval_steps_per_second": 4.602, "step": 280 }, { "entropy": 2.0230892926454542, "epoch": 1.1811023622047245, "grad_norm": 4.540070533752441, "learning_rate": 4.114173228346457e-06, "loss": 0.0248, "mean_token_accuracy": 0.9916361093521118, "num_tokens": 1151350.0, "step": 300 }, { "epoch": 1.1811023622047245, "eval_entropy": 2.0147312097251415, "eval_loss": 0.04147057980298996, "eval_mean_token_accuracy": 0.9891073293983936, "eval_num_tokens": 1151350.0, "eval_runtime": 13.9315, "eval_samples_per_second": 72.785, "eval_steps_per_second": 4.594, "step": 300 }, { "entropy": 2.022981768846512, "epoch": 1.2598425196850394, "grad_norm": 0.8380900025367737, "learning_rate": 3.7204724409448824e-06, "loss": 0.018, "mean_token_accuracy": 0.9942479804158211, "num_tokens": 1228238.0, "step": 320 }, { "epoch": 1.2598425196850394, "eval_entropy": 2.0086006112396717, "eval_loss": 0.04451654851436615, "eval_mean_token_accuracy": 0.9909367645159364, "eval_num_tokens": 1228238.0, "eval_runtime": 13.9575, "eval_samples_per_second": 72.649, "eval_steps_per_second": 4.585, "step": 320 }, { "entropy": 2.0186730861663817, "epoch": 1.3385826771653544, "grad_norm": 3.016096591949463, "learning_rate": 3.3267716535433077e-06, "loss": 0.0331, "mean_token_accuracy": 0.9891796618700027, "num_tokens": 1305071.0, "step": 340 }, { "epoch": 1.3385826771653544, "eval_entropy": 1.9903168231248856, "eval_loss": 0.03695274144411087, "eval_mean_token_accuracy": 0.9910112516954541, "eval_num_tokens": 1305071.0, "eval_runtime": 13.9564, "eval_samples_per_second": 72.655, "eval_steps_per_second": 4.586, "step": 340 }, { "entropy": 2.017111986875534, "epoch": 1.4173228346456692, "grad_norm": 2.0211527347564697, "learning_rate": 2.9330708661417322e-06, "loss": 0.0202, "mean_token_accuracy": 0.9945956841111183, "num_tokens": 1380917.0, "step": 360 }, { "epoch": 1.4173228346456692, "eval_entropy": 1.9944983646273613, "eval_loss": 0.037537336349487305, "eval_mean_token_accuracy": 0.9904407253488898, "eval_num_tokens": 1380917.0, "eval_runtime": 13.9126, "eval_samples_per_second": 72.884, "eval_steps_per_second": 4.6, "step": 360 }, { "entropy": 2.0372937440872194, "epoch": 1.4960629921259843, "grad_norm": 1.1146146059036255, "learning_rate": 2.5393700787401576e-06, "loss": 0.0411, "mean_token_accuracy": 0.9902476906776428, "num_tokens": 1456945.0, "step": 380 }, { "epoch": 1.4960629921259843, "eval_entropy": 2.0192780885845423, "eval_loss": 0.03898231312632561, "eval_mean_token_accuracy": 0.9909690143540502, "eval_num_tokens": 1456945.0, "eval_runtime": 14.0021, "eval_samples_per_second": 72.418, "eval_steps_per_second": 4.571, "step": 380 }, { "entropy": 2.0159901797771456, "epoch": 1.574803149606299, "grad_norm": 5.674668312072754, "learning_rate": 2.145669291338583e-06, "loss": 0.0286, "mean_token_accuracy": 0.9905779853463172, "num_tokens": 1533770.0, "step": 400 }, { "epoch": 1.574803149606299, "eval_entropy": 1.9870711751282215, "eval_loss": 0.0370485782623291, "eval_mean_token_accuracy": 0.9909984050318599, "eval_num_tokens": 1533770.0, "eval_runtime": 14.0218, "eval_samples_per_second": 72.316, "eval_steps_per_second": 4.564, "step": 400 }, { "entropy": 2.021164360642433, "epoch": 1.6535433070866141, "grad_norm": 3.0989601612091064, "learning_rate": 1.7519685039370079e-06, "loss": 0.0234, "mean_token_accuracy": 0.9928320273756981, "num_tokens": 1611273.0, "step": 420 }, { "epoch": 1.6535433070866141, "eval_entropy": 1.9803004171699286, "eval_loss": 0.03255658224225044, "eval_mean_token_accuracy": 0.9918564734980464, "eval_num_tokens": 1611273.0, "eval_runtime": 14.0125, "eval_samples_per_second": 72.364, "eval_steps_per_second": 4.567, "step": 420 } ], "logging_steps": 20, "max_steps": 508, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.364004978659123e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }