Qwen2.5-0.5B-Capybara / checkpoint-42 /trainer_state.json
BurnyCoder's picture
Upload folder using huggingface_hub
bce1339 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08502024291497975,
"eval_steps": 500,
"global_step": 42,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020242914979757085,
"grad_norm": 13.587785720825195,
"learning_rate": 2e-05,
"loss": 1.903,
"mean_token_accuracy": 0.5894419699907303,
"num_tokens": 23998.0,
"step": 1
},
{
"epoch": 0.004048582995951417,
"grad_norm": 10.837627410888672,
"learning_rate": 1.998650472334683e-05,
"loss": 1.7773,
"mean_token_accuracy": 0.6223782598972321,
"num_tokens": 46596.0,
"step": 2
},
{
"epoch": 0.006072874493927126,
"grad_norm": 5.157018184661865,
"learning_rate": 1.9973009446693658e-05,
"loss": 1.9274,
"mean_token_accuracy": 0.5814870595932007,
"num_tokens": 71542.0,
"step": 3
},
{
"epoch": 0.008097165991902834,
"grad_norm": 5.115257263183594,
"learning_rate": 1.9959514170040488e-05,
"loss": 1.6163,
"mean_token_accuracy": 0.6391739100217819,
"num_tokens": 90870.0,
"step": 4
},
{
"epoch": 0.010121457489878543,
"grad_norm": 4.290038585662842,
"learning_rate": 1.9946018893387314e-05,
"loss": 1.5241,
"mean_token_accuracy": 0.6456017643213272,
"num_tokens": 115219.0,
"step": 5
},
{
"epoch": 0.012145748987854251,
"grad_norm": 3.599583864212036,
"learning_rate": 1.9932523616734144e-05,
"loss": 1.363,
"mean_token_accuracy": 0.6773215681314468,
"num_tokens": 136447.0,
"step": 6
},
{
"epoch": 0.01417004048582996,
"grad_norm": 3.737048625946045,
"learning_rate": 1.9919028340080974e-05,
"loss": 1.6063,
"mean_token_accuracy": 0.6296929270029068,
"num_tokens": 159775.0,
"step": 7
},
{
"epoch": 0.016194331983805668,
"grad_norm": 3.6092047691345215,
"learning_rate": 1.9905533063427804e-05,
"loss": 1.5649,
"mean_token_accuracy": 0.639984205365181,
"num_tokens": 184127.0,
"step": 8
},
{
"epoch": 0.018218623481781375,
"grad_norm": 3.934776782989502,
"learning_rate": 1.989203778677463e-05,
"loss": 1.8535,
"mean_token_accuracy": 0.594915583729744,
"num_tokens": 206169.0,
"step": 9
},
{
"epoch": 0.020242914979757085,
"grad_norm": 3.7700464725494385,
"learning_rate": 1.987854251012146e-05,
"loss": 1.4936,
"mean_token_accuracy": 0.6473288685083389,
"num_tokens": 226067.0,
"step": 10
},
{
"epoch": 0.022267206477732792,
"grad_norm": 3.7429916858673096,
"learning_rate": 1.986504723346829e-05,
"loss": 1.5662,
"mean_token_accuracy": 0.6233761757612228,
"num_tokens": 248812.0,
"step": 11
},
{
"epoch": 0.024291497975708502,
"grad_norm": 3.451914072036743,
"learning_rate": 1.9851551956815116e-05,
"loss": 1.7484,
"mean_token_accuracy": 0.6058520972728729,
"num_tokens": 272961.0,
"step": 12
},
{
"epoch": 0.02631578947368421,
"grad_norm": 3.706216335296631,
"learning_rate": 1.9838056680161946e-05,
"loss": 1.4828,
"mean_token_accuracy": 0.6424361318349838,
"num_tokens": 293964.0,
"step": 13
},
{
"epoch": 0.02834008097165992,
"grad_norm": 3.402036428451538,
"learning_rate": 1.9824561403508773e-05,
"loss": 1.6059,
"mean_token_accuracy": 0.6380074322223663,
"num_tokens": 314414.0,
"step": 14
},
{
"epoch": 0.030364372469635626,
"grad_norm": 3.5934112071990967,
"learning_rate": 1.9811066126855602e-05,
"loss": 1.6123,
"mean_token_accuracy": 0.6258052587509155,
"num_tokens": 338761.0,
"step": 15
},
{
"epoch": 0.032388663967611336,
"grad_norm": 3.2826409339904785,
"learning_rate": 1.979757085020243e-05,
"loss": 1.5093,
"mean_token_accuracy": 0.647676095366478,
"num_tokens": 364154.0,
"step": 16
},
{
"epoch": 0.03441295546558704,
"grad_norm": 3.411837339401245,
"learning_rate": 1.978407557354926e-05,
"loss": 1.5875,
"mean_token_accuracy": 0.6337466537952423,
"num_tokens": 387522.0,
"step": 17
},
{
"epoch": 0.03643724696356275,
"grad_norm": 3.537415027618408,
"learning_rate": 1.977058029689609e-05,
"loss": 1.6839,
"mean_token_accuracy": 0.6119341999292374,
"num_tokens": 410636.0,
"step": 18
},
{
"epoch": 0.038461538461538464,
"grad_norm": 3.6170666217803955,
"learning_rate": 1.9757085020242915e-05,
"loss": 1.3929,
"mean_token_accuracy": 0.6720004975795746,
"num_tokens": 430858.0,
"step": 19
},
{
"epoch": 0.04048582995951417,
"grad_norm": 3.725717782974243,
"learning_rate": 1.9743589743589745e-05,
"loss": 1.3478,
"mean_token_accuracy": 0.6779870688915253,
"num_tokens": 451526.0,
"step": 20
},
{
"epoch": 0.04251012145748988,
"grad_norm": 3.7410740852355957,
"learning_rate": 1.9730094466936575e-05,
"loss": 1.5027,
"mean_token_accuracy": 0.6415430754423141,
"num_tokens": 472555.0,
"step": 21
},
{
"epoch": 0.044534412955465584,
"grad_norm": 3.8375744819641113,
"learning_rate": 1.9716599190283405e-05,
"loss": 1.3805,
"mean_token_accuracy": 0.6554747521877289,
"num_tokens": 492412.0,
"step": 22
},
{
"epoch": 0.0465587044534413,
"grad_norm": 3.4310216903686523,
"learning_rate": 1.970310391363023e-05,
"loss": 1.3993,
"mean_token_accuracy": 0.6516353040933609,
"num_tokens": 517688.0,
"step": 23
},
{
"epoch": 0.048582995951417005,
"grad_norm": 3.4065134525299072,
"learning_rate": 1.968960863697706e-05,
"loss": 1.4144,
"mean_token_accuracy": 0.6639417558908463,
"num_tokens": 539605.0,
"step": 24
},
{
"epoch": 0.05060728744939271,
"grad_norm": 3.4423940181732178,
"learning_rate": 1.9676113360323887e-05,
"loss": 1.6237,
"mean_token_accuracy": 0.6199875771999359,
"num_tokens": 563276.0,
"step": 25
},
{
"epoch": 0.05263157894736842,
"grad_norm": 3.211747407913208,
"learning_rate": 1.9662618083670717e-05,
"loss": 1.4059,
"mean_token_accuracy": 0.6552923172712326,
"num_tokens": 586603.0,
"step": 26
},
{
"epoch": 0.05465587044534413,
"grad_norm": 3.1153526306152344,
"learning_rate": 1.9649122807017544e-05,
"loss": 1.2644,
"mean_token_accuracy": 0.6816990375518799,
"num_tokens": 612691.0,
"step": 27
},
{
"epoch": 0.05668016194331984,
"grad_norm": 3.2474708557128906,
"learning_rate": 1.9635627530364373e-05,
"loss": 1.4524,
"mean_token_accuracy": 0.6650048345327377,
"num_tokens": 636325.0,
"step": 28
},
{
"epoch": 0.058704453441295545,
"grad_norm": 3.521009683609009,
"learning_rate": 1.9622132253711203e-05,
"loss": 1.3588,
"mean_token_accuracy": 0.6608386486768723,
"num_tokens": 657410.0,
"step": 29
},
{
"epoch": 0.06072874493927125,
"grad_norm": 3.240419387817383,
"learning_rate": 1.960863697705803e-05,
"loss": 1.6196,
"mean_token_accuracy": 0.634381040930748,
"num_tokens": 678587.0,
"step": 30
},
{
"epoch": 0.06275303643724696,
"grad_norm": 3.0680091381073,
"learning_rate": 1.959514170040486e-05,
"loss": 1.537,
"mean_token_accuracy": 0.6444396674633026,
"num_tokens": 700383.0,
"step": 31
},
{
"epoch": 0.06477732793522267,
"grad_norm": 3.087522029876709,
"learning_rate": 1.958164642375169e-05,
"loss": 1.6414,
"mean_token_accuracy": 0.6257035434246063,
"num_tokens": 723769.0,
"step": 32
},
{
"epoch": 0.06680161943319839,
"grad_norm": 3.2430222034454346,
"learning_rate": 1.9568151147098516e-05,
"loss": 1.5166,
"mean_token_accuracy": 0.6486149281263351,
"num_tokens": 747387.0,
"step": 33
},
{
"epoch": 0.06882591093117409,
"grad_norm": 3.1888442039489746,
"learning_rate": 1.9554655870445346e-05,
"loss": 1.4071,
"mean_token_accuracy": 0.6582628488540649,
"num_tokens": 771537.0,
"step": 34
},
{
"epoch": 0.0708502024291498,
"grad_norm": 2.9818553924560547,
"learning_rate": 1.9541160593792176e-05,
"loss": 1.4723,
"mean_token_accuracy": 0.6373147964477539,
"num_tokens": 794954.0,
"step": 35
},
{
"epoch": 0.0728744939271255,
"grad_norm": 2.8076112270355225,
"learning_rate": 1.9527665317139005e-05,
"loss": 1.5494,
"mean_token_accuracy": 0.6327401697635651,
"num_tokens": 820451.0,
"step": 36
},
{
"epoch": 0.07489878542510121,
"grad_norm": 3.305832862854004,
"learning_rate": 1.9514170040485832e-05,
"loss": 1.5983,
"mean_token_accuracy": 0.6266501545906067,
"num_tokens": 840241.0,
"step": 37
},
{
"epoch": 0.07692307692307693,
"grad_norm": 2.9532933235168457,
"learning_rate": 1.9500674763832662e-05,
"loss": 1.4127,
"mean_token_accuracy": 0.6481295526027679,
"num_tokens": 862831.0,
"step": 38
},
{
"epoch": 0.07894736842105263,
"grad_norm": 2.7358744144439697,
"learning_rate": 1.9487179487179488e-05,
"loss": 1.4085,
"mean_token_accuracy": 0.6563303023576736,
"num_tokens": 887709.0,
"step": 39
},
{
"epoch": 0.08097165991902834,
"grad_norm": 2.550145149230957,
"learning_rate": 1.9473684210526318e-05,
"loss": 1.398,
"mean_token_accuracy": 0.6520788222551346,
"num_tokens": 914680.0,
"step": 40
},
{
"epoch": 0.08299595141700405,
"grad_norm": 2.6927826404571533,
"learning_rate": 1.9460188933873144e-05,
"loss": 1.618,
"mean_token_accuracy": 0.624066486954689,
"num_tokens": 939210.0,
"step": 41
},
{
"epoch": 0.08502024291497975,
"grad_norm": 2.7264392375946045,
"learning_rate": 1.9446693657219974e-05,
"loss": 1.3058,
"mean_token_accuracy": 0.6675033718347549,
"num_tokens": 960279.0,
"step": 42
}
],
"logging_steps": 1,
"max_steps": 1482,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 42,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2947625944952832.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}