{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983268265476855,
"eval_steps": 500,
"global_step": 672,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.044617958728388175,
"grad_norm": 0.44321257723607643,
"learning_rate": 5e-06,
"loss": 0.7473,
"step": 10
},
{
"epoch": 0.08923591745677635,
"grad_norm": 1.338530090223911,
"learning_rate": 5e-06,
"loss": 0.6749,
"step": 20
},
{
"epoch": 0.13385387618516453,
"grad_norm": 0.286982469204418,
"learning_rate": 5e-06,
"loss": 0.6518,
"step": 30
},
{
"epoch": 0.1784718349135527,
"grad_norm": 0.22191132313947864,
"learning_rate": 5e-06,
"loss": 0.6309,
"step": 40
},
{
"epoch": 0.22308979364194087,
"grad_norm": 0.22313526614443369,
"learning_rate": 5e-06,
"loss": 0.6242,
"step": 50
},
{
"epoch": 0.26770775237032907,
"grad_norm": 0.21768014018210444,
"learning_rate": 5e-06,
"loss": 0.6177,
"step": 60
},
{
"epoch": 0.3123257110987172,
"grad_norm": 0.20654128227881519,
"learning_rate": 5e-06,
"loss": 0.607,
"step": 70
},
{
"epoch": 0.3569436698271054,
"grad_norm": 0.2096515326020643,
"learning_rate": 5e-06,
"loss": 0.608,
"step": 80
},
{
"epoch": 0.4015616285554936,
"grad_norm": 0.214474799126283,
"learning_rate": 5e-06,
"loss": 0.5995,
"step": 90
},
{
"epoch": 0.44617958728388174,
"grad_norm": 0.2204481199549279,
"learning_rate": 5e-06,
"loss": 0.6024,
"step": 100
},
{
"epoch": 0.49079754601226994,
"grad_norm": 0.22134589080039557,
"learning_rate": 5e-06,
"loss": 0.584,
"step": 110
},
{
"epoch": 0.5354155047406581,
"grad_norm": 0.21238352336285804,
"learning_rate": 5e-06,
"loss": 0.5869,
"step": 120
},
{
"epoch": 0.5800334634690463,
"grad_norm": 0.22914773839860497,
"learning_rate": 5e-06,
"loss": 0.5869,
"step": 130
},
{
"epoch": 0.6246514221974344,
"grad_norm": 0.23025215144092048,
"learning_rate": 5e-06,
"loss": 0.5874,
"step": 140
},
{
"epoch": 0.6692693809258227,
"grad_norm": 0.21502067226392538,
"learning_rate": 5e-06,
"loss": 0.5779,
"step": 150
},
{
"epoch": 0.7138873396542108,
"grad_norm": 0.22383555331430666,
"learning_rate": 5e-06,
"loss": 0.5841,
"step": 160
},
{
"epoch": 0.758505298382599,
"grad_norm": 0.22090378539108488,
"learning_rate": 5e-06,
"loss": 0.5779,
"step": 170
},
{
"epoch": 0.8031232571109872,
"grad_norm": 0.22032432955026485,
"learning_rate": 5e-06,
"loss": 0.574,
"step": 180
},
{
"epoch": 0.8477412158393753,
"grad_norm": 0.24073566899355314,
"learning_rate": 5e-06,
"loss": 0.5733,
"step": 190
},
{
"epoch": 0.8923591745677635,
"grad_norm": 0.2140757527533294,
"learning_rate": 5e-06,
"loss": 0.572,
"step": 200
},
{
"epoch": 0.9369771332961517,
"grad_norm": 0.22679886715463268,
"learning_rate": 5e-06,
"loss": 0.5717,
"step": 210
},
{
"epoch": 0.9815950920245399,
"grad_norm": 0.2529726282581025,
"learning_rate": 5e-06,
"loss": 0.5714,
"step": 220
},
{
"epoch": 0.9994422755158952,
"eval_loss": 0.5709418058395386,
"eval_runtime": 223.3167,
"eval_samples_per_second": 27.042,
"eval_steps_per_second": 0.425,
"step": 224
},
{
"epoch": 1.0262130507529281,
"grad_norm": 0.2607396745733898,
"learning_rate": 5e-06,
"loss": 0.6088,
"step": 230
},
{
"epoch": 1.0708310094813163,
"grad_norm": 0.2163251119039884,
"learning_rate": 5e-06,
"loss": 0.5447,
"step": 240
},
{
"epoch": 1.1154489682097044,
"grad_norm": 0.22159094074691163,
"learning_rate": 5e-06,
"loss": 0.5438,
"step": 250
},
{
"epoch": 1.1600669269380925,
"grad_norm": 0.21627035107572698,
"learning_rate": 5e-06,
"loss": 0.5458,
"step": 260
},
{
"epoch": 1.2046848856664807,
"grad_norm": 0.23271897339521683,
"learning_rate": 5e-06,
"loss": 0.5483,
"step": 270
},
{
"epoch": 1.2493028443948688,
"grad_norm": 0.24002851008302872,
"learning_rate": 5e-06,
"loss": 0.5442,
"step": 280
},
{
"epoch": 1.2939208031232572,
"grad_norm": 0.2453568380812131,
"learning_rate": 5e-06,
"loss": 0.5474,
"step": 290
},
{
"epoch": 1.3385387618516453,
"grad_norm": 0.21246695532602308,
"learning_rate": 5e-06,
"loss": 0.5327,
"step": 300
},
{
"epoch": 1.3831567205800335,
"grad_norm": 0.23086082729025076,
"learning_rate": 5e-06,
"loss": 0.5388,
"step": 310
},
{
"epoch": 1.4277746793084216,
"grad_norm": 0.22600113921388443,
"learning_rate": 5e-06,
"loss": 0.5392,
"step": 320
},
{
"epoch": 1.4723926380368098,
"grad_norm": 0.2289968612823342,
"learning_rate": 5e-06,
"loss": 0.5353,
"step": 330
},
{
"epoch": 1.5170105967651981,
"grad_norm": 0.23941957998072452,
"learning_rate": 5e-06,
"loss": 0.5388,
"step": 340
},
{
"epoch": 1.561628555493586,
"grad_norm": 0.2302773313085982,
"learning_rate": 5e-06,
"loss": 0.5321,
"step": 350
},
{
"epoch": 1.6062465142219744,
"grad_norm": 0.23125753346116468,
"learning_rate": 5e-06,
"loss": 0.5355,
"step": 360
},
{
"epoch": 1.6508644729503625,
"grad_norm": 0.22766972806947575,
"learning_rate": 5e-06,
"loss": 0.5327,
"step": 370
},
{
"epoch": 1.6954824316787507,
"grad_norm": 0.21840397423259322,
"learning_rate": 5e-06,
"loss": 0.5405,
"step": 380
},
{
"epoch": 1.7401003904071388,
"grad_norm": 0.22699613496146168,
"learning_rate": 5e-06,
"loss": 0.5337,
"step": 390
},
{
"epoch": 1.784718349135527,
"grad_norm": 0.23126490164298122,
"learning_rate": 5e-06,
"loss": 0.5325,
"step": 400
},
{
"epoch": 1.8293363078639153,
"grad_norm": 0.23691741692227147,
"learning_rate": 5e-06,
"loss": 0.5315,
"step": 410
},
{
"epoch": 1.8739542665923032,
"grad_norm": 0.22142710480950437,
"learning_rate": 5e-06,
"loss": 0.5281,
"step": 420
},
{
"epoch": 1.9185722253206916,
"grad_norm": 0.29222348999396186,
"learning_rate": 5e-06,
"loss": 0.5316,
"step": 430
},
{
"epoch": 1.9631901840490797,
"grad_norm": 0.2348302665783151,
"learning_rate": 5e-06,
"loss": 0.5317,
"step": 440
},
{
"epoch": 1.9988845510317903,
"eval_loss": 0.5504088997840881,
"eval_runtime": 225.1824,
"eval_samples_per_second": 26.818,
"eval_steps_per_second": 0.422,
"step": 448
},
{
"epoch": 2.007808142777468,
"grad_norm": 0.3168047860383918,
"learning_rate": 5e-06,
"loss": 0.5741,
"step": 450
},
{
"epoch": 2.0524261015058562,
"grad_norm": 0.2834626392548171,
"learning_rate": 5e-06,
"loss": 0.5027,
"step": 460
},
{
"epoch": 2.097044060234244,
"grad_norm": 0.2569670194124399,
"learning_rate": 5e-06,
"loss": 0.5015,
"step": 470
},
{
"epoch": 2.1416620189626325,
"grad_norm": 0.23763594409899297,
"learning_rate": 5e-06,
"loss": 0.4997,
"step": 480
},
{
"epoch": 2.1862799776910204,
"grad_norm": 0.23783136640612776,
"learning_rate": 5e-06,
"loss": 0.5066,
"step": 490
},
{
"epoch": 2.230897936419409,
"grad_norm": 0.23752043284775223,
"learning_rate": 5e-06,
"loss": 0.5,
"step": 500
},
{
"epoch": 2.275515895147797,
"grad_norm": 0.2204952509796257,
"learning_rate": 5e-06,
"loss": 0.5048,
"step": 510
},
{
"epoch": 2.320133853876185,
"grad_norm": 0.24946351851573337,
"learning_rate": 5e-06,
"loss": 0.4972,
"step": 520
},
{
"epoch": 2.3647518126045735,
"grad_norm": 0.22710269863348215,
"learning_rate": 5e-06,
"loss": 0.512,
"step": 530
},
{
"epoch": 2.4093697713329614,
"grad_norm": 0.24573604962686163,
"learning_rate": 5e-06,
"loss": 0.5102,
"step": 540
},
{
"epoch": 2.4539877300613497,
"grad_norm": 0.2454775848592334,
"learning_rate": 5e-06,
"loss": 0.5103,
"step": 550
},
{
"epoch": 2.4986056887897377,
"grad_norm": 0.2410815694953051,
"learning_rate": 5e-06,
"loss": 0.496,
"step": 560
},
{
"epoch": 2.543223647518126,
"grad_norm": 0.23866340070030165,
"learning_rate": 5e-06,
"loss": 0.5098,
"step": 570
},
{
"epoch": 2.5878416062465144,
"grad_norm": 0.24075487463764883,
"learning_rate": 5e-06,
"loss": 0.4983,
"step": 580
},
{
"epoch": 2.6324595649749023,
"grad_norm": 0.2502533834047079,
"learning_rate": 5e-06,
"loss": 0.5024,
"step": 590
},
{
"epoch": 2.6770775237032907,
"grad_norm": 0.2660873218372233,
"learning_rate": 5e-06,
"loss": 0.5018,
"step": 600
},
{
"epoch": 2.721695482431679,
"grad_norm": 0.260710830944657,
"learning_rate": 5e-06,
"loss": 0.4986,
"step": 610
},
{
"epoch": 2.766313441160067,
"grad_norm": 0.23395448772416363,
"learning_rate": 5e-06,
"loss": 0.5044,
"step": 620
},
{
"epoch": 2.810931399888455,
"grad_norm": 0.24309792455207027,
"learning_rate": 5e-06,
"loss": 0.4959,
"step": 630
},
{
"epoch": 2.8555493586168432,
"grad_norm": 0.23050931374816222,
"learning_rate": 5e-06,
"loss": 0.5051,
"step": 640
},
{
"epoch": 2.9001673173452316,
"grad_norm": 0.23428496858539824,
"learning_rate": 5e-06,
"loss": 0.4967,
"step": 650
},
{
"epoch": 2.9447852760736195,
"grad_norm": 0.22967426268231442,
"learning_rate": 5e-06,
"loss": 0.4981,
"step": 660
},
{
"epoch": 2.989403234802008,
"grad_norm": 0.2561574384965489,
"learning_rate": 5e-06,
"loss": 0.4944,
"step": 670
},
{
"epoch": 2.9983268265476855,
"eval_loss": 0.5417667031288147,
"eval_runtime": 227.6813,
"eval_samples_per_second": 26.524,
"eval_steps_per_second": 0.417,
"step": 672
},
{
"epoch": 2.9983268265476855,
"step": 672,
"total_flos": 1125415649280000.0,
"train_loss": 0.5495298178423018,
"train_runtime": 36802.8232,
"train_samples_per_second": 9.353,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 672,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1125415649280000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}