{
  "best_global_step": 420,
  "best_metric": 0.03255658224225044,
  "best_model_checkpoint": "/content/models/gemma_qlora_lmh/checkpoint-420",
  "epoch": 1.6535433070866141,
  "eval_steps": 20,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.4301040887832643,
      "epoch": 0.07874015748031496,
      "grad_norm": 3.0547735691070557,
      "learning_rate": 9.625984251968504e-06,
      "loss": 0.5992,
      "mean_token_accuracy": 0.7930104970932007,
      "num_tokens": 75454.0,
      "step": 20
    },
    {
      "epoch": 0.07874015748031496,
      "eval_entropy": 2.0184655766934156,
      "eval_loss": 0.17359499633312225,
      "eval_mean_token_accuracy": 0.9012619638815522,
      "eval_num_tokens": 75454.0,
      "eval_runtime": 13.9893,
      "eval_samples_per_second": 72.484,
      "eval_steps_per_second": 4.575,
      "step": 20
    },
    {
      "entropy": 2.124652886390686,
      "epoch": 0.15748031496062992,
      "grad_norm": 4.66159725189209,
      "learning_rate": 9.23228346456693e-06,
      "loss": 0.206,
      "mean_token_accuracy": 0.8831128552556038,
      "num_tokens": 153345.0,
      "step": 40
    },
    {
      "epoch": 0.15748031496062992,
      "eval_entropy": 2.156722355633974,
      "eval_loss": 0.1739385426044464,
      "eval_mean_token_accuracy": 0.8968315636739135,
      "eval_num_tokens": 153345.0,
      "eval_runtime": 14.0205,
      "eval_samples_per_second": 72.323,
      "eval_steps_per_second": 4.565,
      "step": 40
    },
    {
      "entropy": 2.0995097190141676,
      "epoch": 0.23622047244094488,
      "grad_norm": 4.2743754386901855,
      "learning_rate": 8.838582677165355e-06,
      "loss": 0.1494,
      "mean_token_accuracy": 0.9216955065727234,
      "num_tokens": 232136.0,
      "step": 60
    },
    {
      "epoch": 0.23622047244094488,
      "eval_entropy": 2.0814744140952826,
      "eval_loss": 0.13872948288917542,
      "eval_mean_token_accuracy": 0.9309953525662422,
      "eval_num_tokens": 232136.0,
      "eval_runtime": 13.9666,
      "eval_samples_per_second": 72.602,
      "eval_steps_per_second": 4.582,
      "step": 60
    },
    {
      "entropy": 2.114528650045395,
      "epoch": 0.31496062992125984,
      "grad_norm": 2.626481294631958,
      "learning_rate": 8.444881889763782e-06,
      "loss": 0.1602,
      "mean_token_accuracy": 0.9276395246386528,
      "num_tokens": 310039.0,
      "step": 80
    },
    {
      "epoch": 0.31496062992125984,
      "eval_entropy": 2.021162658929825,
      "eval_loss": 0.10903553664684296,
      "eval_mean_token_accuracy": 0.9508337117731571,
      "eval_num_tokens": 310039.0,
      "eval_runtime": 13.9195,
      "eval_samples_per_second": 72.848,
      "eval_steps_per_second": 4.598,
      "step": 80
    },
    {
      "entropy": 2.0373571157455443,
      "epoch": 0.3937007874015748,
      "grad_norm": 12.651391983032227,
      "learning_rate": 8.051181102362205e-06,
      "loss": 0.1181,
      "mean_token_accuracy": 0.9546803295612335,
      "num_tokens": 386453.0,
      "step": 100
    },
    {
      "epoch": 0.3937007874015748,
      "eval_entropy": 2.0321336779743433,
      "eval_loss": 0.08907214552164078,
      "eval_mean_token_accuracy": 0.962163164280355,
      "eval_num_tokens": 386453.0,
      "eval_runtime": 13.8806,
      "eval_samples_per_second": 73.051,
      "eval_steps_per_second": 4.611,
      "step": 100
    },
    {
      "entropy": 2.084933453798294,
      "epoch": 0.47244094488188976,
      "grad_norm": 1.525368094444275,
      "learning_rate": 7.65748031496063e-06,
      "loss": 0.1062,
      "mean_token_accuracy": 0.95157091319561,
      "num_tokens": 462083.0,
      "step": 120
    },
    {
      "epoch": 0.47244094488188976,
      "eval_entropy": 2.055444575846195,
      "eval_loss": 0.07883646339178085,
      "eval_mean_token_accuracy": 0.9678388619795442,
      "eval_num_tokens": 462083.0,
      "eval_runtime": 13.8682,
      "eval_samples_per_second": 73.117,
      "eval_steps_per_second": 4.615,
      "step": 120
    },
    {
      "entropy": 2.041289675235748,
      "epoch": 0.5511811023622047,
      "grad_norm": 4.715269565582275,
      "learning_rate": 7.263779527559056e-06,
      "loss": 0.0791,
      "mean_token_accuracy": 0.9711190596222877,
      "num_tokens": 538806.0,
      "step": 140
    },
    {
      "epoch": 0.5511811023622047,
      "eval_entropy": 2.0219028927385807,
      "eval_loss": 0.06344127655029297,
      "eval_mean_token_accuracy": 0.9835457233712077,
      "eval_num_tokens": 538806.0,
      "eval_runtime": 13.8631,
      "eval_samples_per_second": 73.144,
      "eval_steps_per_second": 4.617,
      "step": 140
    },
    {
      "entropy": 2.046231508255005,
      "epoch": 0.6299212598425197,
      "grad_norm": 4.157348155975342,
      "learning_rate": 6.870078740157481e-06,
      "loss": 0.0639,
      "mean_token_accuracy": 0.9775616720318794,
      "num_tokens": 615129.0,
      "step": 160
    },
    {
      "epoch": 0.6299212598425197,
      "eval_entropy": 2.0033062752336264,
      "eval_loss": 0.053461696952581406,
      "eval_mean_token_accuracy": 0.9832161571830511,
      "eval_num_tokens": 615129.0,
      "eval_runtime": 13.9444,
      "eval_samples_per_second": 72.717,
      "eval_steps_per_second": 4.59,
      "step": 160
    },
    {
      "entropy": 2.062005800008774,
      "epoch": 0.7086614173228346,
      "grad_norm": 10.413922309875488,
      "learning_rate": 6.476377952755906e-06,
      "loss": 0.0747,
      "mean_token_accuracy": 0.9757226049900055,
      "num_tokens": 690760.0,
      "step": 180
    },
    {
      "epoch": 0.7086614173228346,
      "eval_entropy": 2.062701778486371,
      "eval_loss": 0.07900257408618927,
      "eval_mean_token_accuracy": 0.9750550417229533,
      "eval_num_tokens": 690760.0,
      "eval_runtime": 14.2787,
      "eval_samples_per_second": 71.015,
      "eval_steps_per_second": 4.482,
      "step": 180
    },
    {
      "entropy": 2.078292927145958,
      "epoch": 0.7874015748031497,
      "grad_norm": 3.501204252243042,
      "learning_rate": 6.082677165354331e-06,
      "loss": 0.052,
      "mean_token_accuracy": 0.9841863334178924,
      "num_tokens": 767471.0,
      "step": 200
    },
    {
      "epoch": 0.7874015748031497,
      "eval_entropy": 2.0887723341584206,
      "eval_loss": 0.046214085072278976,
      "eval_mean_token_accuracy": 0.985442828387022,
      "eval_num_tokens": 767471.0,
      "eval_runtime": 13.9887,
      "eval_samples_per_second": 72.487,
      "eval_steps_per_second": 4.575,
      "step": 200
    },
    {
      "entropy": 2.1700605511665345,
      "epoch": 0.8661417322834646,
      "grad_norm": 5.373133659362793,
      "learning_rate": 5.6889763779527565e-06,
      "loss": 0.0646,
      "mean_token_accuracy": 0.9767223253846169,
      "num_tokens": 844265.0,
      "step": 220
    },
    {
      "epoch": 0.8661417322834646,
      "eval_entropy": 2.134835472330451,
      "eval_loss": 0.06774821132421494,
      "eval_mean_token_accuracy": 0.9768518777564168,
      "eval_num_tokens": 844265.0,
      "eval_runtime": 13.9674,
      "eval_samples_per_second": 72.598,
      "eval_steps_per_second": 4.582,
      "step": 220
    },
    {
      "entropy": 2.130521237850189,
      "epoch": 0.9448818897637795,
      "grad_norm": 0.8587220311164856,
      "learning_rate": 5.295275590551181e-06,
      "loss": 0.0431,
      "mean_token_accuracy": 0.9848933383822441,
      "num_tokens": 921062.0,
      "step": 240
    },
    {
      "epoch": 0.9448818897637795,
      "eval_entropy": 2.1466477904468775,
      "eval_loss": 0.04291221499443054,
      "eval_mean_token_accuracy": 0.9876205483451486,
      "eval_num_tokens": 921062.0,
      "eval_runtime": 13.8812,
      "eval_samples_per_second": 73.048,
      "eval_steps_per_second": 4.611,
      "step": 240
    },
    {
      "entropy": 2.068550485372543,
      "epoch": 1.0236220472440944,
      "grad_norm": 7.186275005340576,
      "learning_rate": 4.901574803149607e-06,
      "loss": 0.0398,
      "mean_token_accuracy": 0.9878435462713242,
      "num_tokens": 994943.0,
      "step": 260
    },
    {
      "epoch": 1.0236220472440944,
      "eval_entropy": 2.0358662642538548,
      "eval_loss": 0.06262390315532684,
      "eval_mean_token_accuracy": 0.9817966390401125,
      "eval_num_tokens": 994943.0,
      "eval_runtime": 13.9135,
      "eval_samples_per_second": 72.879,
      "eval_steps_per_second": 4.6,
      "step": 260
    },
    {
      "entropy": 2.0602549403905868,
      "epoch": 1.1023622047244095,
      "grad_norm": 3.056312084197998,
      "learning_rate": 4.507874015748032e-06,
      "loss": 0.0296,
      "mean_token_accuracy": 0.9905342325568199,
      "num_tokens": 1072972.0,
      "step": 280
    },
    {
      "epoch": 1.1023622047244095,
      "eval_entropy": 2.0256000570952892,
      "eval_loss": 0.0363699272274971,
      "eval_mean_token_accuracy": 0.989073995500803,
      "eval_num_tokens": 1072972.0,
      "eval_runtime": 13.9066,
      "eval_samples_per_second": 72.915,
      "eval_steps_per_second": 4.602,
      "step": 280
    },
    {
      "entropy": 2.0230892926454542,
      "epoch": 1.1811023622047245,
      "grad_norm": 4.540070533752441,
      "learning_rate": 4.114173228346457e-06,
      "loss": 0.0248,
      "mean_token_accuracy": 0.9916361093521118,
      "num_tokens": 1151350.0,
      "step": 300
    },
    {
      "epoch": 1.1811023622047245,
      "eval_entropy": 2.0147312097251415,
      "eval_loss": 0.04147057980298996,
      "eval_mean_token_accuracy": 0.9891073293983936,
      "eval_num_tokens": 1151350.0,
      "eval_runtime": 13.9315,
      "eval_samples_per_second": 72.785,
      "eval_steps_per_second": 4.594,
      "step": 300
    },
    {
      "entropy": 2.022981768846512,
      "epoch": 1.2598425196850394,
      "grad_norm": 0.8380900025367737,
      "learning_rate": 3.7204724409448824e-06,
      "loss": 0.018,
      "mean_token_accuracy": 0.9942479804158211,
      "num_tokens": 1228238.0,
      "step": 320
    },
    {
      "epoch": 1.2598425196850394,
      "eval_entropy": 2.0086006112396717,
      "eval_loss": 0.04451654851436615,
      "eval_mean_token_accuracy": 0.9909367645159364,
      "eval_num_tokens": 1228238.0,
      "eval_runtime": 13.9575,
      "eval_samples_per_second": 72.649,
      "eval_steps_per_second": 4.585,
      "step": 320
    },
    {
      "entropy": 2.0186730861663817,
      "epoch": 1.3385826771653544,
      "grad_norm": 3.016096591949463,
      "learning_rate": 3.3267716535433077e-06,
      "loss": 0.0331,
      "mean_token_accuracy": 0.9891796618700027,
      "num_tokens": 1305071.0,
      "step": 340
    },
    {
      "epoch": 1.3385826771653544,
      "eval_entropy": 1.9903168231248856,
      "eval_loss": 0.03695274144411087,
      "eval_mean_token_accuracy": 0.9910112516954541,
      "eval_num_tokens": 1305071.0,
      "eval_runtime": 13.9564,
      "eval_samples_per_second": 72.655,
      "eval_steps_per_second": 4.586,
      "step": 340
    },
    {
      "entropy": 2.017111986875534,
      "epoch": 1.4173228346456692,
      "grad_norm": 2.0211527347564697,
      "learning_rate": 2.9330708661417322e-06,
      "loss": 0.0202,
      "mean_token_accuracy": 0.9945956841111183,
      "num_tokens": 1380917.0,
      "step": 360
    },
    {
      "epoch": 1.4173228346456692,
      "eval_entropy": 1.9944983646273613,
      "eval_loss": 0.037537336349487305,
      "eval_mean_token_accuracy": 0.9904407253488898,
      "eval_num_tokens": 1380917.0,
      "eval_runtime": 13.9126,
      "eval_samples_per_second": 72.884,
      "eval_steps_per_second": 4.6,
      "step": 360
    },
    {
      "entropy": 2.0372937440872194,
      "epoch": 1.4960629921259843,
      "grad_norm": 1.1146146059036255,
      "learning_rate": 2.5393700787401576e-06,
      "loss": 0.0411,
      "mean_token_accuracy": 0.9902476906776428,
      "num_tokens": 1456945.0,
      "step": 380
    },
    {
      "epoch": 1.4960629921259843,
      "eval_entropy": 2.0192780885845423,
      "eval_loss": 0.03898231312632561,
      "eval_mean_token_accuracy": 0.9909690143540502,
      "eval_num_tokens": 1456945.0,
      "eval_runtime": 14.0021,
      "eval_samples_per_second": 72.418,
      "eval_steps_per_second": 4.571,
      "step": 380
    },
    {
      "entropy": 2.0159901797771456,
      "epoch": 1.574803149606299,
      "grad_norm": 5.674668312072754,
      "learning_rate": 2.145669291338583e-06,
      "loss": 0.0286,
      "mean_token_accuracy": 0.9905779853463172,
      "num_tokens": 1533770.0,
      "step": 400
    },
    {
      "epoch": 1.574803149606299,
      "eval_entropy": 1.9870711751282215,
      "eval_loss": 0.0370485782623291,
      "eval_mean_token_accuracy": 0.9909984050318599,
      "eval_num_tokens": 1533770.0,
      "eval_runtime": 14.0218,
      "eval_samples_per_second": 72.316,
      "eval_steps_per_second": 4.564,
      "step": 400
    },
    {
      "entropy": 2.021164360642433,
      "epoch": 1.6535433070866141,
      "grad_norm": 3.0989601612091064,
      "learning_rate": 1.7519685039370079e-06,
      "loss": 0.0234,
      "mean_token_accuracy": 0.9928320273756981,
      "num_tokens": 1611273.0,
      "step": 420
    },
    {
      "epoch": 1.6535433070866141,
      "eval_entropy": 1.9803004171699286,
      "eval_loss": 0.03255658224225044,
      "eval_mean_token_accuracy": 0.9918564734980464,
      "eval_num_tokens": 1611273.0,
      "eval_runtime": 14.0125,
      "eval_samples_per_second": 72.364,
      "eval_steps_per_second": 4.567,
      "step": 420
    }
  ],
  "logging_steps": 20,
  "max_steps": 508,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.364004978659123e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}