{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.976,
"eval_steps": 500,
"global_step": 93,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.032,
"grad_norm": 6.08677457937853,
"learning_rate": 4.000000000000001e-06,
"loss": 0.8709,
"step": 1
},
{
"epoch": 0.064,
"grad_norm": 5.862503603501722,
"learning_rate": 8.000000000000001e-06,
"loss": 0.8625,
"step": 2
},
{
"epoch": 0.096,
"grad_norm": 4.5723036455815205,
"learning_rate": 1.2e-05,
"loss": 0.839,
"step": 3
},
{
"epoch": 0.128,
"grad_norm": 2.0695238404961547,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7376,
"step": 4
},
{
"epoch": 0.16,
"grad_norm": 5.589012282844458,
"learning_rate": 2e-05,
"loss": 0.825,
"step": 5
},
{
"epoch": 0.192,
"grad_norm": 8.490554966738163,
"learning_rate": 2.4e-05,
"loss": 0.8478,
"step": 6
},
{
"epoch": 0.224,
"grad_norm": 6.29894861103046,
"learning_rate": 2.8e-05,
"loss": 0.7658,
"step": 7
},
{
"epoch": 0.256,
"grad_norm": 3.558435531359445,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.7555,
"step": 8
},
{
"epoch": 0.288,
"grad_norm": 2.722863714076688,
"learning_rate": 3.6e-05,
"loss": 0.7274,
"step": 9
},
{
"epoch": 0.32,
"grad_norm": 2.088640250697761,
"learning_rate": 4e-05,
"loss": 0.6627,
"step": 10
},
{
"epoch": 0.352,
"grad_norm": 1.5096876602344986,
"learning_rate": 3.998567509632663e-05,
"loss": 0.6817,
"step": 11
},
{
"epoch": 0.384,
"grad_norm": 1.6896637753034105,
"learning_rate": 3.9942720905593045e-05,
"loss": 0.6761,
"step": 12
},
{
"epoch": 0.416,
"grad_norm": 1.5060644054718806,
"learning_rate": 3.98711989592637e-05,
"loss": 0.6519,
"step": 13
},
{
"epoch": 0.448,
"grad_norm": 1.4049366508424377,
"learning_rate": 3.9771211711837774e-05,
"loss": 0.6333,
"step": 14
},
{
"epoch": 0.48,
"grad_norm": 1.411091031460123,
"learning_rate": 3.9642902394084056e-05,
"loss": 0.5874,
"step": 15
},
{
"epoch": 0.512,
"grad_norm": 0.8883192699998052,
"learning_rate": 3.948645480786427e-05,
"loss": 0.6116,
"step": 16
},
{
"epoch": 0.544,
"grad_norm": 1.129257283929009,
"learning_rate": 3.930209306283867e-05,
"loss": 0.5852,
"step": 17
},
{
"epoch": 0.576,
"grad_norm": 0.901306643045694,
"learning_rate": 3.909008125543111e-05,
"loss": 0.5821,
"step": 18
},
{
"epoch": 0.608,
"grad_norm": 0.9465933700636773,
"learning_rate": 3.885072309051346e-05,
"loss": 0.5833,
"step": 19
},
{
"epoch": 0.64,
"grad_norm": 0.8046120689103757,
"learning_rate": 3.858436144635131e-05,
"loss": 0.5597,
"step": 20
},
{
"epoch": 0.672,
"grad_norm": 0.9228859026973337,
"learning_rate": 3.829137788343415e-05,
"loss": 0.5571,
"step": 21
},
{
"epoch": 0.704,
"grad_norm": 0.9202261747479373,
"learning_rate": 3.797219209789365e-05,
"loss": 0.6007,
"step": 22
},
{
"epoch": 0.736,
"grad_norm": 0.9336252436543613,
"learning_rate": 3.762726132029298e-05,
"loss": 0.5618,
"step": 23
},
{
"epoch": 0.768,
"grad_norm": 1.2080051557421407,
"learning_rate": 3.725707966064846e-05,
"loss": 0.6057,
"step": 24
},
{
"epoch": 0.8,
"grad_norm": 1.1231601773572062,
"learning_rate": 3.686217740062169e-05,
"loss": 0.5603,
"step": 25
},
{
"epoch": 0.832,
"grad_norm": 0.8382981788712457,
"learning_rate": 3.644312023389621e-05,
"loss": 0.5491,
"step": 26
},
{
"epoch": 0.864,
"grad_norm": 1.1862939971134692,
"learning_rate": 3.600050845582669e-05,
"loss": 0.5887,
"step": 27
},
{
"epoch": 0.896,
"grad_norm": 1.0957634618366314,
"learning_rate": 3.5534976103521716e-05,
"loss": 0.5958,
"step": 28
},
{
"epoch": 0.928,
"grad_norm": 0.6590817382859444,
"learning_rate": 3.504719004759163e-05,
"loss": 0.5528,
"step": 29
},
{
"epoch": 0.96,
"grad_norm": 0.875749033008583,
"learning_rate": 3.4537849036862874e-05,
"loss": 0.565,
"step": 30
},
{
"epoch": 0.992,
"grad_norm": 0.783563683071678,
"learning_rate": 3.400768269742702e-05,
"loss": 0.5595,
"step": 31
},
{
"epoch": 1.024,
"grad_norm": 1.4075572002103194,
"learning_rate": 3.345745048745838e-05,
"loss": 0.919,
"step": 32
},
{
"epoch": 1.056,
"grad_norm": 0.7062499705952857,
"learning_rate": 3.288794060929754e-05,
"loss": 0.4404,
"step": 33
},
{
"epoch": 1.088,
"grad_norm": 0.9967541412280615,
"learning_rate": 3.229996888035908e-05,
"loss": 0.4984,
"step": 34
},
{
"epoch": 1.12,
"grad_norm": 0.8310704240859569,
"learning_rate": 3.169437756448095e-05,
"loss": 0.4807,
"step": 35
},
{
"epoch": 1.152,
"grad_norm": 0.7016314595292313,
"learning_rate": 3.107203416538969e-05,
"loss": 0.4703,
"step": 36
},
{
"epoch": 1.184,
"grad_norm": 1.4372935885534768,
"learning_rate": 3.0433830184009694e-05,
"loss": 0.4739,
"step": 37
},
{
"epoch": 1.216,
"grad_norm": 0.8650508524606009,
"learning_rate": 2.9780679841396668e-05,
"loss": 0.4525,
"step": 38
},
{
"epoch": 1.248,
"grad_norm": 1.0766618304625992,
"learning_rate": 2.9113518769124836e-05,
"loss": 0.4987,
"step": 39
},
{
"epoch": 1.28,
"grad_norm": 0.6767070574855524,
"learning_rate": 2.843330266900368e-05,
"loss": 0.4475,
"step": 40
},
{
"epoch": 1.312,
"grad_norm": 1.026763400096595,
"learning_rate": 2.774100594404435e-05,
"loss": 0.4667,
"step": 41
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.9002290625499892,
"learning_rate": 2.703762030263666e-05,
"loss": 0.4916,
"step": 42
},
{
"epoch": 1.376,
"grad_norm": 0.9903797867735974,
"learning_rate": 2.632415333793648e-05,
"loss": 0.4771,
"step": 43
},
{
"epoch": 1.408,
"grad_norm": 0.7218070561744779,
"learning_rate": 2.5601627084498146e-05,
"loss": 0.407,
"step": 44
},
{
"epoch": 1.44,
"grad_norm": 0.8313194407823631,
"learning_rate": 2.4871076554219838e-05,
"loss": 0.4442,
"step": 45
},
{
"epoch": 1.472,
"grad_norm": 0.8236730874850681,
"learning_rate": 2.413354825369906e-05,
"loss": 0.5223,
"step": 46
},
{
"epoch": 1.504,
"grad_norm": 0.6125019115754542,
"learning_rate": 2.3390098685121938e-05,
"loss": 0.42,
"step": 47
},
{
"epoch": 1.536,
"grad_norm": 0.6737099841054438,
"learning_rate": 2.264179283283405e-05,
"loss": 0.4665,
"step": 48
},
{
"epoch": 1.568,
"grad_norm": 0.5907056602966384,
"learning_rate": 2.1889702637760627e-05,
"loss": 0.4445,
"step": 49
},
{
"epoch": 1.6,
"grad_norm": 0.6965345367451425,
"learning_rate": 2.1134905461861486e-05,
"loss": 0.5221,
"step": 50
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.5006105897500711,
"learning_rate": 2.0378482544820383e-05,
"loss": 0.4218,
"step": 51
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.6174830888739168,
"learning_rate": 1.9621517455179627e-05,
"loss": 0.476,
"step": 52
},
{
"epoch": 1.696,
"grad_norm": 0.503990909520708,
"learning_rate": 1.886509453813852e-05,
"loss": 0.4275,
"step": 53
},
{
"epoch": 1.728,
"grad_norm": 0.5974502021657286,
"learning_rate": 1.8110297362239376e-05,
"loss": 0.4757,
"step": 54
},
{
"epoch": 1.76,
"grad_norm": 0.551329913445271,
"learning_rate": 1.735820716716596e-05,
"loss": 0.4757,
"step": 55
},
{
"epoch": 1.792,
"grad_norm": 0.579064058765967,
"learning_rate": 1.660990131487807e-05,
"loss": 0.4182,
"step": 56
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.548961385728647,
"learning_rate": 1.586645174630094e-05,
"loss": 0.4731,
"step": 57
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.5764590922933827,
"learning_rate": 1.5128923445780163e-05,
"loss": 0.4271,
"step": 58
},
{
"epoch": 1.888,
"grad_norm": 0.5781209646736115,
"learning_rate": 1.4398372915501862e-05,
"loss": 0.4644,
"step": 59
},
{
"epoch": 1.92,
"grad_norm": 0.42862823912049036,
"learning_rate": 1.3675846662063521e-05,
"loss": 0.4071,
"step": 60
},
{
"epoch": 1.952,
"grad_norm": 0.569159061133135,
"learning_rate": 1.296237969736334e-05,
"loss": 0.4561,
"step": 61
},
{
"epoch": 1.984,
"grad_norm": 0.47530501617214926,
"learning_rate": 1.2258994055955658e-05,
"loss": 0.3817,
"step": 62
},
{
"epoch": 2.016,
"grad_norm": 0.9251054706846084,
"learning_rate": 1.156669733099632e-05,
"loss": 0.7898,
"step": 63
},
{
"epoch": 2.048,
"grad_norm": 0.5716376901076641,
"learning_rate": 1.0886481230875172e-05,
"loss": 0.3525,
"step": 64
},
{
"epoch": 2.08,
"grad_norm": 0.5177936896448316,
"learning_rate": 1.0219320158603337e-05,
"loss": 0.3394,
"step": 65
},
{
"epoch": 2.112,
"grad_norm": 0.5247806458447061,
"learning_rate": 9.566169815990311e-06,
"loss": 0.3834,
"step": 66
},
{
"epoch": 2.144,
"grad_norm": 0.5455234154497576,
"learning_rate": 8.92796583461031e-06,
"loss": 0.3577,
"step": 67
},
{
"epoch": 2.176,
"grad_norm": 0.5925096096878631,
"learning_rate": 8.305622435519058e-06,
"loss": 0.3831,
"step": 68
},
{
"epoch": 2.208,
"grad_norm": 0.6820182428585542,
"learning_rate": 7.70003111964093e-06,
"loss": 0.376,
"step": 69
},
{
"epoch": 2.24,
"grad_norm": 0.6320620904769954,
"learning_rate": 7.112059390702459e-06,
"loss": 0.3715,
"step": 70
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.4928197929862798,
"learning_rate": 6.542549512541623e-06,
"loss": 0.3713,
"step": 71
},
{
"epoch": 2.304,
"grad_norm": 0.4767133735569691,
"learning_rate": 5.9923173025729895e-06,
"loss": 0.3303,
"step": 72
},
{
"epoch": 2.336,
"grad_norm": 0.7059312169326228,
"learning_rate": 5.462150963137125e-06,
"loss": 0.4568,
"step": 73
},
{
"epoch": 2.368,
"grad_norm": 0.5003954153261982,
"learning_rate": 4.952809952408375e-06,
"loss": 0.3514,
"step": 74
},
{
"epoch": 2.4,
"grad_norm": 0.5944579046423205,
"learning_rate": 4.465023896478293e-06,
"loss": 0.3627,
"step": 75
},
{
"epoch": 2.432,
"grad_norm": 0.4180864471254852,
"learning_rate": 3.999491544173311e-06,
"loss": 0.3054,
"step": 76
},
{
"epoch": 2.464,
"grad_norm": 0.44019542396333683,
"learning_rate": 3.5568797661038004e-06,
"loss": 0.375,
"step": 77
},
{
"epoch": 2.496,
"grad_norm": 0.4294310051147678,
"learning_rate": 3.137822599378315e-06,
"loss": 0.3537,
"step": 78
},
{
"epoch": 2.528,
"grad_norm": 0.3888176996168452,
"learning_rate": 2.7429203393515426e-06,
"loss": 0.378,
"step": 79
},
{
"epoch": 2.56,
"grad_norm": 0.3576643131385393,
"learning_rate": 2.372738679707023e-06,
"loss": 0.3232,
"step": 80
},
{
"epoch": 2.592,
"grad_norm": 0.3696160994931973,
"learning_rate": 2.02780790210636e-06,
"loss": 0.3542,
"step": 81
},
{
"epoch": 2.624,
"grad_norm": 0.36882809964571234,
"learning_rate": 1.7086221165658544e-06,
"loss": 0.351,
"step": 82
},
{
"epoch": 2.656,
"grad_norm": 0.34603236345776744,
"learning_rate": 1.4156385536486973e-06,
"loss": 0.3212,
"step": 83
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.3805023899734686,
"learning_rate": 1.1492769094865475e-06,
"loss": 0.3744,
"step": 84
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.3005431184449355,
"learning_rate": 9.099187445688984e-07,
"loss": 0.3071,
"step": 85
},
{
"epoch": 2.752,
"grad_norm": 0.3625697661026582,
"learning_rate": 6.979069371613345e-07,
"loss": 0.3755,
"step": 86
},
{
"epoch": 2.784,
"grad_norm": 0.30922554074419895,
"learning_rate": 5.135451921357337e-07,
"loss": 0.2993,
"step": 87
},
{
"epoch": 2.816,
"grad_norm": 0.3209396414531254,
"learning_rate": 3.570976059159481e-07,
"loss": 0.3725,
"step": 88
},
{
"epoch": 2.848,
"grad_norm": 0.3188890164441534,
"learning_rate": 2.2878828816222942e-07,
"loss": 0.3691,
"step": 89
},
{
"epoch": 2.88,
"grad_norm": 0.30765746035254077,
"learning_rate": 1.2880104073630163e-07,
"loss": 0.3218,
"step": 90
},
{
"epoch": 2.912,
"grad_norm": 0.3085708159717203,
"learning_rate": 5.7279094406959e-08,
"loss": 0.3625,
"step": 91
},
{
"epoch": 2.944,
"grad_norm": 0.3092205424287526,
"learning_rate": 1.4324903673370583e-08,
"loss": 0.3543,
"step": 92
},
{
"epoch": 2.976,
"grad_norm": 0.3121521435612877,
"learning_rate": 0.0,
"loss": 0.3917,
"step": 93
},
{
"epoch": 2.976,
"step": 93,
"total_flos": 1.818538711009198e+17,
"train_loss": 0.5000655266546434,
"train_runtime": 9080.5183,
"train_samples_per_second": 0.99,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1.0,
"max_steps": 93,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.818538711009198e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}