stack_exc_multilabel_lm_head / trainer_state.json
TanDutta's picture
Upload folder using huggingface_hub
d863ac0 verified
{
"best_global_step": 420,
"best_metric": 0.03255658224225044,
"best_model_checkpoint": "/content/models/gemma_qlora_lmh/checkpoint-420",
"epoch": 1.6535433070866141,
"eval_steps": 20,
"global_step": 420,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.4301040887832643,
"epoch": 0.07874015748031496,
"grad_norm": 3.0547735691070557,
"learning_rate": 9.625984251968504e-06,
"loss": 0.5992,
"mean_token_accuracy": 0.7930104970932007,
"num_tokens": 75454.0,
"step": 20
},
{
"epoch": 0.07874015748031496,
"eval_entropy": 2.0184655766934156,
"eval_loss": 0.17359499633312225,
"eval_mean_token_accuracy": 0.9012619638815522,
"eval_num_tokens": 75454.0,
"eval_runtime": 13.9893,
"eval_samples_per_second": 72.484,
"eval_steps_per_second": 4.575,
"step": 20
},
{
"entropy": 2.124652886390686,
"epoch": 0.15748031496062992,
"grad_norm": 4.66159725189209,
"learning_rate": 9.23228346456693e-06,
"loss": 0.206,
"mean_token_accuracy": 0.8831128552556038,
"num_tokens": 153345.0,
"step": 40
},
{
"epoch": 0.15748031496062992,
"eval_entropy": 2.156722355633974,
"eval_loss": 0.1739385426044464,
"eval_mean_token_accuracy": 0.8968315636739135,
"eval_num_tokens": 153345.0,
"eval_runtime": 14.0205,
"eval_samples_per_second": 72.323,
"eval_steps_per_second": 4.565,
"step": 40
},
{
"entropy": 2.0995097190141676,
"epoch": 0.23622047244094488,
"grad_norm": 4.2743754386901855,
"learning_rate": 8.838582677165355e-06,
"loss": 0.1494,
"mean_token_accuracy": 0.9216955065727234,
"num_tokens": 232136.0,
"step": 60
},
{
"epoch": 0.23622047244094488,
"eval_entropy": 2.0814744140952826,
"eval_loss": 0.13872948288917542,
"eval_mean_token_accuracy": 0.9309953525662422,
"eval_num_tokens": 232136.0,
"eval_runtime": 13.9666,
"eval_samples_per_second": 72.602,
"eval_steps_per_second": 4.582,
"step": 60
},
{
"entropy": 2.114528650045395,
"epoch": 0.31496062992125984,
"grad_norm": 2.626481294631958,
"learning_rate": 8.444881889763782e-06,
"loss": 0.1602,
"mean_token_accuracy": 0.9276395246386528,
"num_tokens": 310039.0,
"step": 80
},
{
"epoch": 0.31496062992125984,
"eval_entropy": 2.021162658929825,
"eval_loss": 0.10903553664684296,
"eval_mean_token_accuracy": 0.9508337117731571,
"eval_num_tokens": 310039.0,
"eval_runtime": 13.9195,
"eval_samples_per_second": 72.848,
"eval_steps_per_second": 4.598,
"step": 80
},
{
"entropy": 2.0373571157455443,
"epoch": 0.3937007874015748,
"grad_norm": 12.651391983032227,
"learning_rate": 8.051181102362205e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9546803295612335,
"num_tokens": 386453.0,
"step": 100
},
{
"epoch": 0.3937007874015748,
"eval_entropy": 2.0321336779743433,
"eval_loss": 0.08907214552164078,
"eval_mean_token_accuracy": 0.962163164280355,
"eval_num_tokens": 386453.0,
"eval_runtime": 13.8806,
"eval_samples_per_second": 73.051,
"eval_steps_per_second": 4.611,
"step": 100
},
{
"entropy": 2.084933453798294,
"epoch": 0.47244094488188976,
"grad_norm": 1.525368094444275,
"learning_rate": 7.65748031496063e-06,
"loss": 0.1062,
"mean_token_accuracy": 0.95157091319561,
"num_tokens": 462083.0,
"step": 120
},
{
"epoch": 0.47244094488188976,
"eval_entropy": 2.055444575846195,
"eval_loss": 0.07883646339178085,
"eval_mean_token_accuracy": 0.9678388619795442,
"eval_num_tokens": 462083.0,
"eval_runtime": 13.8682,
"eval_samples_per_second": 73.117,
"eval_steps_per_second": 4.615,
"step": 120
},
{
"entropy": 2.041289675235748,
"epoch": 0.5511811023622047,
"grad_norm": 4.715269565582275,
"learning_rate": 7.263779527559056e-06,
"loss": 0.0791,
"mean_token_accuracy": 0.9711190596222877,
"num_tokens": 538806.0,
"step": 140
},
{
"epoch": 0.5511811023622047,
"eval_entropy": 2.0219028927385807,
"eval_loss": 0.06344127655029297,
"eval_mean_token_accuracy": 0.9835457233712077,
"eval_num_tokens": 538806.0,
"eval_runtime": 13.8631,
"eval_samples_per_second": 73.144,
"eval_steps_per_second": 4.617,
"step": 140
},
{
"entropy": 2.046231508255005,
"epoch": 0.6299212598425197,
"grad_norm": 4.157348155975342,
"learning_rate": 6.870078740157481e-06,
"loss": 0.0639,
"mean_token_accuracy": 0.9775616720318794,
"num_tokens": 615129.0,
"step": 160
},
{
"epoch": 0.6299212598425197,
"eval_entropy": 2.0033062752336264,
"eval_loss": 0.053461696952581406,
"eval_mean_token_accuracy": 0.9832161571830511,
"eval_num_tokens": 615129.0,
"eval_runtime": 13.9444,
"eval_samples_per_second": 72.717,
"eval_steps_per_second": 4.59,
"step": 160
},
{
"entropy": 2.062005800008774,
"epoch": 0.7086614173228346,
"grad_norm": 10.413922309875488,
"learning_rate": 6.476377952755906e-06,
"loss": 0.0747,
"mean_token_accuracy": 0.9757226049900055,
"num_tokens": 690760.0,
"step": 180
},
{
"epoch": 0.7086614173228346,
"eval_entropy": 2.062701778486371,
"eval_loss": 0.07900257408618927,
"eval_mean_token_accuracy": 0.9750550417229533,
"eval_num_tokens": 690760.0,
"eval_runtime": 14.2787,
"eval_samples_per_second": 71.015,
"eval_steps_per_second": 4.482,
"step": 180
},
{
"entropy": 2.078292927145958,
"epoch": 0.7874015748031497,
"grad_norm": 3.501204252243042,
"learning_rate": 6.082677165354331e-06,
"loss": 0.052,
"mean_token_accuracy": 0.9841863334178924,
"num_tokens": 767471.0,
"step": 200
},
{
"epoch": 0.7874015748031497,
"eval_entropy": 2.0887723341584206,
"eval_loss": 0.046214085072278976,
"eval_mean_token_accuracy": 0.985442828387022,
"eval_num_tokens": 767471.0,
"eval_runtime": 13.9887,
"eval_samples_per_second": 72.487,
"eval_steps_per_second": 4.575,
"step": 200
},
{
"entropy": 2.1700605511665345,
"epoch": 0.8661417322834646,
"grad_norm": 5.373133659362793,
"learning_rate": 5.6889763779527565e-06,
"loss": 0.0646,
"mean_token_accuracy": 0.9767223253846169,
"num_tokens": 844265.0,
"step": 220
},
{
"epoch": 0.8661417322834646,
"eval_entropy": 2.134835472330451,
"eval_loss": 0.06774821132421494,
"eval_mean_token_accuracy": 0.9768518777564168,
"eval_num_tokens": 844265.0,
"eval_runtime": 13.9674,
"eval_samples_per_second": 72.598,
"eval_steps_per_second": 4.582,
"step": 220
},
{
"entropy": 2.130521237850189,
"epoch": 0.9448818897637795,
"grad_norm": 0.8587220311164856,
"learning_rate": 5.295275590551181e-06,
"loss": 0.0431,
"mean_token_accuracy": 0.9848933383822441,
"num_tokens": 921062.0,
"step": 240
},
{
"epoch": 0.9448818897637795,
"eval_entropy": 2.1466477904468775,
"eval_loss": 0.04291221499443054,
"eval_mean_token_accuracy": 0.9876205483451486,
"eval_num_tokens": 921062.0,
"eval_runtime": 13.8812,
"eval_samples_per_second": 73.048,
"eval_steps_per_second": 4.611,
"step": 240
},
{
"entropy": 2.068550485372543,
"epoch": 1.0236220472440944,
"grad_norm": 7.186275005340576,
"learning_rate": 4.901574803149607e-06,
"loss": 0.0398,
"mean_token_accuracy": 0.9878435462713242,
"num_tokens": 994943.0,
"step": 260
},
{
"epoch": 1.0236220472440944,
"eval_entropy": 2.0358662642538548,
"eval_loss": 0.06262390315532684,
"eval_mean_token_accuracy": 0.9817966390401125,
"eval_num_tokens": 994943.0,
"eval_runtime": 13.9135,
"eval_samples_per_second": 72.879,
"eval_steps_per_second": 4.6,
"step": 260
},
{
"entropy": 2.0602549403905868,
"epoch": 1.1023622047244095,
"grad_norm": 3.056312084197998,
"learning_rate": 4.507874015748032e-06,
"loss": 0.0296,
"mean_token_accuracy": 0.9905342325568199,
"num_tokens": 1072972.0,
"step": 280
},
{
"epoch": 1.1023622047244095,
"eval_entropy": 2.0256000570952892,
"eval_loss": 0.0363699272274971,
"eval_mean_token_accuracy": 0.989073995500803,
"eval_num_tokens": 1072972.0,
"eval_runtime": 13.9066,
"eval_samples_per_second": 72.915,
"eval_steps_per_second": 4.602,
"step": 280
},
{
"entropy": 2.0230892926454542,
"epoch": 1.1811023622047245,
"grad_norm": 4.540070533752441,
"learning_rate": 4.114173228346457e-06,
"loss": 0.0248,
"mean_token_accuracy": 0.9916361093521118,
"num_tokens": 1151350.0,
"step": 300
},
{
"epoch": 1.1811023622047245,
"eval_entropy": 2.0147312097251415,
"eval_loss": 0.04147057980298996,
"eval_mean_token_accuracy": 0.9891073293983936,
"eval_num_tokens": 1151350.0,
"eval_runtime": 13.9315,
"eval_samples_per_second": 72.785,
"eval_steps_per_second": 4.594,
"step": 300
},
{
"entropy": 2.022981768846512,
"epoch": 1.2598425196850394,
"grad_norm": 0.8380900025367737,
"learning_rate": 3.7204724409448824e-06,
"loss": 0.018,
"mean_token_accuracy": 0.9942479804158211,
"num_tokens": 1228238.0,
"step": 320
},
{
"epoch": 1.2598425196850394,
"eval_entropy": 2.0086006112396717,
"eval_loss": 0.04451654851436615,
"eval_mean_token_accuracy": 0.9909367645159364,
"eval_num_tokens": 1228238.0,
"eval_runtime": 13.9575,
"eval_samples_per_second": 72.649,
"eval_steps_per_second": 4.585,
"step": 320
},
{
"entropy": 2.0186730861663817,
"epoch": 1.3385826771653544,
"grad_norm": 3.016096591949463,
"learning_rate": 3.3267716535433077e-06,
"loss": 0.0331,
"mean_token_accuracy": 0.9891796618700027,
"num_tokens": 1305071.0,
"step": 340
},
{
"epoch": 1.3385826771653544,
"eval_entropy": 1.9903168231248856,
"eval_loss": 0.03695274144411087,
"eval_mean_token_accuracy": 0.9910112516954541,
"eval_num_tokens": 1305071.0,
"eval_runtime": 13.9564,
"eval_samples_per_second": 72.655,
"eval_steps_per_second": 4.586,
"step": 340
},
{
"entropy": 2.017111986875534,
"epoch": 1.4173228346456692,
"grad_norm": 2.0211527347564697,
"learning_rate": 2.9330708661417322e-06,
"loss": 0.0202,
"mean_token_accuracy": 0.9945956841111183,
"num_tokens": 1380917.0,
"step": 360
},
{
"epoch": 1.4173228346456692,
"eval_entropy": 1.9944983646273613,
"eval_loss": 0.037537336349487305,
"eval_mean_token_accuracy": 0.9904407253488898,
"eval_num_tokens": 1380917.0,
"eval_runtime": 13.9126,
"eval_samples_per_second": 72.884,
"eval_steps_per_second": 4.6,
"step": 360
},
{
"entropy": 2.0372937440872194,
"epoch": 1.4960629921259843,
"grad_norm": 1.1146146059036255,
"learning_rate": 2.5393700787401576e-06,
"loss": 0.0411,
"mean_token_accuracy": 0.9902476906776428,
"num_tokens": 1456945.0,
"step": 380
},
{
"epoch": 1.4960629921259843,
"eval_entropy": 2.0192780885845423,
"eval_loss": 0.03898231312632561,
"eval_mean_token_accuracy": 0.9909690143540502,
"eval_num_tokens": 1456945.0,
"eval_runtime": 14.0021,
"eval_samples_per_second": 72.418,
"eval_steps_per_second": 4.571,
"step": 380
},
{
"entropy": 2.0159901797771456,
"epoch": 1.574803149606299,
"grad_norm": 5.674668312072754,
"learning_rate": 2.145669291338583e-06,
"loss": 0.0286,
"mean_token_accuracy": 0.9905779853463172,
"num_tokens": 1533770.0,
"step": 400
},
{
"epoch": 1.574803149606299,
"eval_entropy": 1.9870711751282215,
"eval_loss": 0.0370485782623291,
"eval_mean_token_accuracy": 0.9909984050318599,
"eval_num_tokens": 1533770.0,
"eval_runtime": 14.0218,
"eval_samples_per_second": 72.316,
"eval_steps_per_second": 4.564,
"step": 400
},
{
"entropy": 2.021164360642433,
"epoch": 1.6535433070866141,
"grad_norm": 3.0989601612091064,
"learning_rate": 1.7519685039370079e-06,
"loss": 0.0234,
"mean_token_accuracy": 0.9928320273756981,
"num_tokens": 1611273.0,
"step": 420
},
{
"epoch": 1.6535433070866141,
"eval_entropy": 1.9803004171699286,
"eval_loss": 0.03255658224225044,
"eval_mean_token_accuracy": 0.9918564734980464,
"eval_num_tokens": 1611273.0,
"eval_runtime": 14.0125,
"eval_samples_per_second": 72.364,
"eval_steps_per_second": 4.567,
"step": 420
}
],
"logging_steps": 20,
"max_steps": 508,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.364004978659123e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}