{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9911012235817576,
"eval_steps": 500,
"global_step": 672,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04449388209121246,
"grad_norm": 17.919046764867105,
"learning_rate": 5e-06,
"loss": 0.7727,
"step": 10
},
{
"epoch": 0.08898776418242492,
"grad_norm": 2.5068176722694706,
"learning_rate": 5e-06,
"loss": 0.7067,
"step": 20
},
{
"epoch": 0.13348164627363737,
"grad_norm": 0.8188606020387743,
"learning_rate": 5e-06,
"loss": 0.6732,
"step": 30
},
{
"epoch": 0.17797552836484984,
"grad_norm": 0.7944470655707544,
"learning_rate": 5e-06,
"loss": 0.6514,
"step": 40
},
{
"epoch": 0.22246941045606228,
"grad_norm": 0.9562761756923293,
"learning_rate": 5e-06,
"loss": 0.6433,
"step": 50
},
{
"epoch": 0.26696329254727474,
"grad_norm": 0.8767792470489679,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 60
},
{
"epoch": 0.3114571746384872,
"grad_norm": 1.337143723222155,
"learning_rate": 5e-06,
"loss": 0.615,
"step": 70
},
{
"epoch": 0.3559510567296997,
"grad_norm": 0.5940615588218296,
"learning_rate": 5e-06,
"loss": 0.6111,
"step": 80
},
{
"epoch": 0.40044493882091214,
"grad_norm": 0.6245704159321135,
"learning_rate": 5e-06,
"loss": 0.6,
"step": 90
},
{
"epoch": 0.44493882091212456,
"grad_norm": 0.5683316859611832,
"learning_rate": 5e-06,
"loss": 0.6087,
"step": 100
},
{
"epoch": 0.489432703003337,
"grad_norm": 0.5159069244539627,
"learning_rate": 5e-06,
"loss": 0.5983,
"step": 110
},
{
"epoch": 0.5339265850945495,
"grad_norm": 0.5072121076395835,
"learning_rate": 5e-06,
"loss": 0.6023,
"step": 120
},
{
"epoch": 0.578420467185762,
"grad_norm": 0.5274934145200447,
"learning_rate": 5e-06,
"loss": 0.5919,
"step": 130
},
{
"epoch": 0.6229143492769744,
"grad_norm": 0.5544559368876572,
"learning_rate": 5e-06,
"loss": 0.5974,
"step": 140
},
{
"epoch": 0.6674082313681868,
"grad_norm": 0.4929894123721837,
"learning_rate": 5e-06,
"loss": 0.5945,
"step": 150
},
{
"epoch": 0.7119021134593994,
"grad_norm": 1.1362385626723797,
"learning_rate": 5e-06,
"loss": 0.5892,
"step": 160
},
{
"epoch": 0.7563959955506118,
"grad_norm": 0.4793988319315333,
"learning_rate": 5e-06,
"loss": 0.5856,
"step": 170
},
{
"epoch": 0.8008898776418243,
"grad_norm": 0.5137409156397603,
"learning_rate": 5e-06,
"loss": 0.5879,
"step": 180
},
{
"epoch": 0.8453837597330367,
"grad_norm": 0.7484432139988203,
"learning_rate": 5e-06,
"loss": 0.5897,
"step": 190
},
{
"epoch": 0.8898776418242491,
"grad_norm": 0.4983308283425539,
"learning_rate": 5e-06,
"loss": 0.5754,
"step": 200
},
{
"epoch": 0.9343715239154616,
"grad_norm": 0.5985618062120786,
"learning_rate": 5e-06,
"loss": 0.5821,
"step": 210
},
{
"epoch": 0.978865406006674,
"grad_norm": 0.4660686039944073,
"learning_rate": 5e-06,
"loss": 0.581,
"step": 220
},
{
"epoch": 0.996662958843159,
"eval_loss": 0.5854274034500122,
"eval_runtime": 240.2197,
"eval_samples_per_second": 25.21,
"eval_steps_per_second": 0.395,
"step": 224
},
{
"epoch": 1.0239154616240267,
"grad_norm": 0.7457465164497729,
"learning_rate": 5e-06,
"loss": 0.5763,
"step": 230
},
{
"epoch": 1.068409343715239,
"grad_norm": 0.6326576381402257,
"learning_rate": 5e-06,
"loss": 0.5449,
"step": 240
},
{
"epoch": 1.1129032258064515,
"grad_norm": 0.6388396264873892,
"learning_rate": 5e-06,
"loss": 0.5301,
"step": 250
},
{
"epoch": 1.1573971078976641,
"grad_norm": 0.752126204770771,
"learning_rate": 5e-06,
"loss": 0.5374,
"step": 260
},
{
"epoch": 1.2018909899888766,
"grad_norm": 0.4673289459919312,
"learning_rate": 5e-06,
"loss": 0.5314,
"step": 270
},
{
"epoch": 1.246384872080089,
"grad_norm": 0.5707602520384042,
"learning_rate": 5e-06,
"loss": 0.5371,
"step": 280
},
{
"epoch": 1.2908787541713014,
"grad_norm": 0.7449098291403021,
"learning_rate": 5e-06,
"loss": 0.5393,
"step": 290
},
{
"epoch": 1.3353726362625138,
"grad_norm": 0.6185110765439527,
"learning_rate": 5e-06,
"loss": 0.5361,
"step": 300
},
{
"epoch": 1.3798665183537264,
"grad_norm": 0.6947624284326104,
"learning_rate": 5e-06,
"loss": 0.5353,
"step": 310
},
{
"epoch": 1.4243604004449388,
"grad_norm": 0.5200108023651202,
"learning_rate": 5e-06,
"loss": 0.5316,
"step": 320
},
{
"epoch": 1.4688542825361512,
"grad_norm": 0.47510706811194214,
"learning_rate": 5e-06,
"loss": 0.5352,
"step": 330
},
{
"epoch": 1.5133481646273639,
"grad_norm": 0.4867636105327538,
"learning_rate": 5e-06,
"loss": 0.5415,
"step": 340
},
{
"epoch": 1.557842046718576,
"grad_norm": 0.48217592935887066,
"learning_rate": 5e-06,
"loss": 0.5339,
"step": 350
},
{
"epoch": 1.6023359288097887,
"grad_norm": 0.4650078322874499,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 360
},
{
"epoch": 1.6468298109010011,
"grad_norm": 0.570457650374032,
"learning_rate": 5e-06,
"loss": 0.5333,
"step": 370
},
{
"epoch": 1.6913236929922135,
"grad_norm": 0.5230883279688195,
"learning_rate": 5e-06,
"loss": 0.5347,
"step": 380
},
{
"epoch": 1.7358175750834262,
"grad_norm": 0.5808698181708927,
"learning_rate": 5e-06,
"loss": 0.5338,
"step": 390
},
{
"epoch": 1.7803114571746383,
"grad_norm": 0.6131929934071662,
"learning_rate": 5e-06,
"loss": 0.5392,
"step": 400
},
{
"epoch": 1.824805339265851,
"grad_norm": 0.6516997090789,
"learning_rate": 5e-06,
"loss": 0.52,
"step": 410
},
{
"epoch": 1.8692992213570634,
"grad_norm": 0.5459884768353754,
"learning_rate": 5e-06,
"loss": 0.5306,
"step": 420
},
{
"epoch": 1.9137931034482758,
"grad_norm": 0.5136522179463594,
"learning_rate": 5e-06,
"loss": 0.5369,
"step": 430
},
{
"epoch": 1.9582869855394884,
"grad_norm": 0.5400184881508431,
"learning_rate": 5e-06,
"loss": 0.5298,
"step": 440
},
{
"epoch": 1.9983314794215796,
"eval_loss": 0.5757958889007568,
"eval_runtime": 242.08,
"eval_samples_per_second": 25.017,
"eval_steps_per_second": 0.392,
"step": 449
},
{
"epoch": 2.0033370411568407,
"grad_norm": 0.7649398226033791,
"learning_rate": 5e-06,
"loss": 0.5412,
"step": 450
},
{
"epoch": 2.0478309232480534,
"grad_norm": 0.5503149371785776,
"learning_rate": 5e-06,
"loss": 0.4782,
"step": 460
},
{
"epoch": 2.092324805339266,
"grad_norm": 0.5497911599700889,
"learning_rate": 5e-06,
"loss": 0.4801,
"step": 470
},
{
"epoch": 2.136818687430478,
"grad_norm": 0.5261406383405891,
"learning_rate": 5e-06,
"loss": 0.4753,
"step": 480
},
{
"epoch": 2.181312569521691,
"grad_norm": 0.7163879690094836,
"learning_rate": 5e-06,
"loss": 0.4844,
"step": 490
},
{
"epoch": 2.225806451612903,
"grad_norm": 0.5297217476527686,
"learning_rate": 5e-06,
"loss": 0.4822,
"step": 500
},
{
"epoch": 2.2703003337041157,
"grad_norm": 0.5953010609328895,
"learning_rate": 5e-06,
"loss": 0.4899,
"step": 510
},
{
"epoch": 2.3147942157953283,
"grad_norm": 0.5148912353492243,
"learning_rate": 5e-06,
"loss": 0.4939,
"step": 520
},
{
"epoch": 2.3592880978865405,
"grad_norm": 0.6069152681341892,
"learning_rate": 5e-06,
"loss": 0.4835,
"step": 530
},
{
"epoch": 2.403781979977753,
"grad_norm": 0.6540469956921977,
"learning_rate": 5e-06,
"loss": 0.4889,
"step": 540
},
{
"epoch": 2.4482758620689653,
"grad_norm": 0.5359141705186573,
"learning_rate": 5e-06,
"loss": 0.4865,
"step": 550
},
{
"epoch": 2.492769744160178,
"grad_norm": 0.532021339209882,
"learning_rate": 5e-06,
"loss": 0.483,
"step": 560
},
{
"epoch": 2.5372636262513906,
"grad_norm": 0.5745397487010325,
"learning_rate": 5e-06,
"loss": 0.4797,
"step": 570
},
{
"epoch": 2.5817575083426028,
"grad_norm": 0.5721099629533181,
"learning_rate": 5e-06,
"loss": 0.477,
"step": 580
},
{
"epoch": 2.6262513904338154,
"grad_norm": 0.49094180466012677,
"learning_rate": 5e-06,
"loss": 0.4833,
"step": 590
},
{
"epoch": 2.6707452725250276,
"grad_norm": 0.4909218814568728,
"learning_rate": 5e-06,
"loss": 0.4897,
"step": 600
},
{
"epoch": 2.71523915461624,
"grad_norm": 0.5181636597841739,
"learning_rate": 5e-06,
"loss": 0.4893,
"step": 610
},
{
"epoch": 2.759733036707453,
"grad_norm": 0.46959422158183145,
"learning_rate": 5e-06,
"loss": 0.4809,
"step": 620
},
{
"epoch": 2.804226918798665,
"grad_norm": 0.5864283756615662,
"learning_rate": 5e-06,
"loss": 0.4888,
"step": 630
},
{
"epoch": 2.8487208008898777,
"grad_norm": 0.4555937611441811,
"learning_rate": 5e-06,
"loss": 0.4844,
"step": 640
},
{
"epoch": 2.89321468298109,
"grad_norm": 0.4648877683489992,
"learning_rate": 5e-06,
"loss": 0.4836,
"step": 650
},
{
"epoch": 2.9377085650723025,
"grad_norm": 0.645091025204656,
"learning_rate": 5e-06,
"loss": 0.4916,
"step": 660
},
{
"epoch": 2.982202447163515,
"grad_norm": 0.5297809492883717,
"learning_rate": 5e-06,
"loss": 0.483,
"step": 670
},
{
"epoch": 2.9911012235817576,
"eval_loss": 0.5784014463424683,
"eval_runtime": 240.5667,
"eval_samples_per_second": 25.174,
"eval_steps_per_second": 0.395,
"step": 672
},
{
"epoch": 2.9911012235817576,
"step": 672,
"total_flos": 1125415649280000.0,
"train_loss": 0.5457742023503497,
"train_runtime": 40203.7661,
"train_samples_per_second": 8.586,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 672,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1125415649280000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}