1 / checkpoint-58 /trainer_state.json
Affinetop1's picture
Upload folder using huggingface_hub
092a79a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 58,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017391304347826087,
"grad_norm": 55.45934295654297,
"learning_rate": 0.0,
"loss": 7.4024,
"mean_token_accuracy": 0.1686166636645794,
"num_tokens": 6224.0,
"step": 1
},
{
"epoch": 0.034782608695652174,
"grad_norm": 50.480812072753906,
"learning_rate": 3.3333333333333335e-05,
"loss": 7.0353,
"mean_token_accuracy": 0.18751946836709976,
"num_tokens": 11886.0,
"step": 2
},
{
"epoch": 0.05217391304347826,
"grad_norm": 47.20847702026367,
"learning_rate": 6.666666666666667e-05,
"loss": 6.7961,
"mean_token_accuracy": 0.1992376409471035,
"num_tokens": 17328.0,
"step": 3
},
{
"epoch": 0.06956521739130435,
"grad_norm": 38.443172454833984,
"learning_rate": 0.0001,
"loss": 5.9767,
"mean_token_accuracy": 0.21114255115389824,
"num_tokens": 22752.0,
"step": 4
},
{
"epoch": 0.08695652173913043,
"grad_norm": 22.228315353393555,
"learning_rate": 9.99266096766761e-05,
"loss": 4.9025,
"mean_token_accuracy": 0.24107784777879715,
"num_tokens": 29004.0,
"step": 5
},
{
"epoch": 0.10434782608695652,
"grad_norm": 13.45175552368164,
"learning_rate": 9.970667809068476e-05,
"loss": 4.2551,
"mean_token_accuracy": 0.3103659078478813,
"num_tokens": 35240.0,
"step": 6
},
{
"epoch": 0.12173913043478261,
"grad_norm": 11.450566291809082,
"learning_rate": 9.93409226131462e-05,
"loss": 3.4704,
"mean_token_accuracy": 0.3919261246919632,
"num_tokens": 41848.0,
"step": 7
},
{
"epoch": 0.1391304347826087,
"grad_norm": 11.856537818908691,
"learning_rate": 9.883053626240502e-05,
"loss": 3.3263,
"mean_token_accuracy": 0.42777257412672043,
"num_tokens": 48131.0,
"step": 8
},
{
"epoch": 0.1565217391304348,
"grad_norm": 11.830099105834961,
"learning_rate": 9.81771838126524e-05,
"loss": 2.3787,
"mean_token_accuracy": 0.5440730974078178,
"num_tokens": 55437.0,
"step": 9
},
{
"epoch": 0.17391304347826086,
"grad_norm": 3.2861013412475586,
"learning_rate": 9.738299636377862e-05,
"loss": 1.752,
"mean_token_accuracy": 0.6369369179010391,
"num_tokens": 61534.0,
"step": 10
},
{
"epoch": 0.19130434782608696,
"grad_norm": 3.029527425765991,
"learning_rate": 9.645056439016827e-05,
"loss": 1.4911,
"mean_token_accuracy": 0.67353655397892,
"num_tokens": 67205.0,
"step": 11
},
{
"epoch": 0.20869565217391303,
"grad_norm": 2.049616813659668,
"learning_rate": 9.538292929111113e-05,
"loss": 1.5348,
"mean_token_accuracy": 0.6536547541618347,
"num_tokens": 73359.0,
"step": 12
},
{
"epoch": 0.22608695652173913,
"grad_norm": 2.315335750579834,
"learning_rate": 9.418357347038998e-05,
"loss": 1.3448,
"mean_token_accuracy": 0.6856418550014496,
"num_tokens": 79250.0,
"step": 13
},
{
"epoch": 0.24347826086956523,
"grad_norm": 2.0546271800994873,
"learning_rate": 9.285640897740315e-05,
"loss": 1.4246,
"mean_token_accuracy": 0.664088174700737,
"num_tokens": 84981.0,
"step": 14
},
{
"epoch": 0.2608695652173913,
"grad_norm": 1.8393396139144897,
"learning_rate": 9.140576474687264e-05,
"loss": 1.3907,
"mean_token_accuracy": 0.6777825355529785,
"num_tokens": 91084.0,
"step": 15
},
{
"epoch": 0.2782608695652174,
"grad_norm": 1.3824291229248047,
"learning_rate": 8.983637247875872e-05,
"loss": 1.0379,
"mean_token_accuracy": 0.7471358180046082,
"num_tokens": 98793.0,
"step": 16
},
{
"epoch": 0.2956521739130435,
"grad_norm": 1.8309787511825562,
"learning_rate": 8.815335120443822e-05,
"loss": 1.0902,
"mean_token_accuracy": 0.7380675822496414,
"num_tokens": 105183.0,
"step": 17
},
{
"epoch": 0.3130434782608696,
"grad_norm": 1.8205807209014893,
"learning_rate": 8.636219058948823e-05,
"loss": 1.0416,
"mean_token_accuracy": 0.7440497726202011,
"num_tokens": 111071.0,
"step": 18
},
{
"epoch": 0.33043478260869563,
"grad_norm": 1.7150083780288696,
"learning_rate": 8.446873302753784e-05,
"loss": 0.9923,
"mean_token_accuracy": 0.7531551718711853,
"num_tokens": 117451.0,
"step": 19
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.842894434928894,
"learning_rate": 8.247915458359473e-05,
"loss": 0.9871,
"mean_token_accuracy": 0.7675948143005371,
"num_tokens": 123384.0,
"step": 20
},
{
"epoch": 0.3652173913043478,
"grad_norm": 1.4929476976394653,
"learning_rate": 8.039994484900463e-05,
"loss": 0.804,
"mean_token_accuracy": 0.7979889959096909,
"num_tokens": 129294.0,
"step": 21
},
{
"epoch": 0.3826086956521739,
"grad_norm": 1.15473473072052,
"learning_rate": 7.82378857737533e-05,
"loss": 0.8532,
"mean_token_accuracy": 0.793354332447052,
"num_tokens": 135724.0,
"step": 22
},
{
"epoch": 0.4,
"grad_norm": 1.2569639682769775,
"learning_rate": 7.600002954515532e-05,
"loss": 0.8879,
"mean_token_accuracy": 0.796680137515068,
"num_tokens": 141797.0,
"step": 23
},
{
"epoch": 0.41739130434782606,
"grad_norm": 1.0053095817565918,
"learning_rate": 7.369367558508489e-05,
"loss": 0.8051,
"mean_token_accuracy": 0.8075380921363831,
"num_tokens": 147918.0,
"step": 24
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.793317437171936,
"learning_rate": 7.132634674077883e-05,
"loss": 0.9785,
"mean_token_accuracy": 0.7718029171228409,
"num_tokens": 154434.0,
"step": 25
},
{
"epoch": 0.45217391304347826,
"grad_norm": 0.7299309968948364,
"learning_rate": 6.890576474687263e-05,
"loss": 1.0357,
"mean_token_accuracy": 0.744295209646225,
"num_tokens": 161540.0,
"step": 26
},
{
"epoch": 0.46956521739130436,
"grad_norm": 1.4765475988388062,
"learning_rate": 6.643982503870693e-05,
"loss": 0.6015,
"mean_token_accuracy": 0.8558884114027023,
"num_tokens": 167030.0,
"step": 27
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.7446058392524719,
"learning_rate": 6.393657099905855e-05,
"loss": 0.7297,
"mean_token_accuracy": 0.8198393434286118,
"num_tokens": 172575.0,
"step": 28
},
{
"epoch": 0.5043478260869565,
"grad_norm": 0.662484347820282,
"learning_rate": 6.140416772229784e-05,
"loss": 0.7215,
"mean_token_accuracy": 0.8261076658964157,
"num_tokens": 178395.0,
"step": 29
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.0676500797271729,
"learning_rate": 5.88508753815478e-05,
"loss": 0.7392,
"mean_token_accuracy": 0.8211972415447235,
"num_tokens": 184121.0,
"step": 30
},
{
"epoch": 0.5391304347826087,
"grad_norm": 1.291377067565918,
"learning_rate": 5.628502228571633e-05,
"loss": 0.6282,
"mean_token_accuracy": 0.8491714000701904,
"num_tokens": 190440.0,
"step": 31
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.5546766519546509,
"learning_rate": 5.3714977714283674e-05,
"loss": 0.6955,
"mean_token_accuracy": 0.8281570971012115,
"num_tokens": 197221.0,
"step": 32
},
{
"epoch": 0.5739130434782609,
"grad_norm": 0.5453093647956848,
"learning_rate": 5.114912461845223e-05,
"loss": 0.7473,
"mean_token_accuracy": 0.817421019077301,
"num_tokens": 203874.0,
"step": 33
},
{
"epoch": 0.591304347826087,
"grad_norm": 0.46395328640937805,
"learning_rate": 4.859583227770218e-05,
"loss": 0.7886,
"mean_token_accuracy": 0.805167943239212,
"num_tokens": 210064.0,
"step": 34
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.4362604022026062,
"learning_rate": 4.606342900094147e-05,
"loss": 0.7002,
"mean_token_accuracy": 0.829146608710289,
"num_tokens": 216569.0,
"step": 35
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.4113505184650421,
"learning_rate": 4.3560174961293097e-05,
"loss": 0.8059,
"mean_token_accuracy": 0.8010579198598862,
"num_tokens": 222896.0,
"step": 36
},
{
"epoch": 0.6434782608695652,
"grad_norm": 0.682939350605011,
"learning_rate": 4.109423525312738e-05,
"loss": 0.6212,
"mean_token_accuracy": 0.8504037708044052,
"num_tokens": 229011.0,
"step": 37
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.5376163125038147,
"learning_rate": 3.8673653259221166e-05,
"loss": 0.8097,
"mean_token_accuracy": 0.8016069531440735,
"num_tokens": 235909.0,
"step": 38
},
{
"epoch": 0.6782608695652174,
"grad_norm": 0.47806745767593384,
"learning_rate": 3.630632441491512e-05,
"loss": 0.7105,
"mean_token_accuracy": 0.8202503025531769,
"num_tokens": 242513.0,
"step": 39
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.582610011100769,
"learning_rate": 3.399997045484469e-05,
"loss": 0.7017,
"mean_token_accuracy": 0.8234356045722961,
"num_tokens": 248157.0,
"step": 40
},
{
"epoch": 0.7130434782608696,
"grad_norm": 0.5676279664039612,
"learning_rate": 3.176211422624672e-05,
"loss": 0.6786,
"mean_token_accuracy": 0.8357308208942413,
"num_tokens": 253917.0,
"step": 41
},
{
"epoch": 0.7304347826086957,
"grad_norm": 0.43341803550720215,
"learning_rate": 2.9600055150995398e-05,
"loss": 0.7935,
"mean_token_accuracy": 0.802367627620697,
"num_tokens": 260731.0,
"step": 42
},
{
"epoch": 0.7478260869565218,
"grad_norm": 0.5170316100120544,
"learning_rate": 2.7520845416405282e-05,
"loss": 0.6804,
"mean_token_accuracy": 0.8378360271453857,
"num_tokens": 266433.0,
"step": 43
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.48307615518569946,
"learning_rate": 2.5531266972462177e-05,
"loss": 0.7748,
"mean_token_accuracy": 0.8051830232143402,
"num_tokens": 273029.0,
"step": 44
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.45667076110839844,
"learning_rate": 2.36378094105118e-05,
"loss": 0.5937,
"mean_token_accuracy": 0.8383966088294983,
"num_tokens": 279655.0,
"step": 45
},
{
"epoch": 0.8,
"grad_norm": 0.41323214769363403,
"learning_rate": 2.1846648795561774e-05,
"loss": 0.6688,
"mean_token_accuracy": 0.8343894928693771,
"num_tokens": 286066.0,
"step": 46
},
{
"epoch": 0.8173913043478261,
"grad_norm": 0.6046397089958191,
"learning_rate": 2.0163627521241292e-05,
"loss": 0.614,
"mean_token_accuracy": 0.8473068177700043,
"num_tokens": 291788.0,
"step": 47
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.3877752721309662,
"learning_rate": 1.8594235253127375e-05,
"loss": 0.4869,
"mean_token_accuracy": 0.8654044568538666,
"num_tokens": 298877.0,
"step": 48
},
{
"epoch": 0.8521739130434782,
"grad_norm": 0.38120725750923157,
"learning_rate": 1.7143591022596845e-05,
"loss": 0.6164,
"mean_token_accuracy": 0.8455094546079636,
"num_tokens": 304903.0,
"step": 49
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.4243324398994446,
"learning_rate": 1.5816426529610035e-05,
"loss": 0.6793,
"mean_token_accuracy": 0.8279502987861633,
"num_tokens": 310741.0,
"step": 50
},
{
"epoch": 0.8869565217391304,
"grad_norm": 0.4153580963611603,
"learning_rate": 1.4617070708888881e-05,
"loss": 0.6215,
"mean_token_accuracy": 0.8429995030164719,
"num_tokens": 315959.0,
"step": 51
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.3602670133113861,
"learning_rate": 1.3549435609831752e-05,
"loss": 0.7285,
"mean_token_accuracy": 0.822294071316719,
"num_tokens": 322024.0,
"step": 52
},
{
"epoch": 0.9217391304347826,
"grad_norm": 0.4163176119327545,
"learning_rate": 1.2617003636221395e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.8162952065467834,
"num_tokens": 328585.0,
"step": 53
},
{
"epoch": 0.9391304347826087,
"grad_norm": 0.4420251250267029,
"learning_rate": 1.1822816187347623e-05,
"loss": 0.7539,
"mean_token_accuracy": 0.813583567738533,
"num_tokens": 334774.0,
"step": 54
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.4156906008720398,
"learning_rate": 1.1169463737594995e-05,
"loss": 0.6769,
"mean_token_accuracy": 0.8312394767999649,
"num_tokens": 340913.0,
"step": 55
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.4741043150424957,
"learning_rate": 1.0659077386853816e-05,
"loss": 0.6661,
"mean_token_accuracy": 0.8269830048084259,
"num_tokens": 347461.0,
"step": 56
},
{
"epoch": 0.991304347826087,
"grad_norm": 0.4164107143878937,
"learning_rate": 1.0293321909315242e-05,
"loss": 0.6548,
"mean_token_accuracy": 0.8303200602531433,
"num_tokens": 353498.0,
"step": 57
},
{
"epoch": 1.0,
"grad_norm": 0.6555842757225037,
"learning_rate": 1.0073390323323897e-05,
"loss": 0.9622,
"mean_token_accuracy": 0.8200015425682068,
"num_tokens": 355699.0,
"step": 58
}
],
"logging_steps": 1,
"max_steps": 58,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.441734033785367e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}