{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9969593310528317,
"eval_steps": 500,
"global_step": 656,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03040668947168377,
"grad_norm": 5.294622334039074,
"learning_rate": 5e-06,
"loss": 0.9257,
"step": 10
},
{
"epoch": 0.06081337894336754,
"grad_norm": 1.6301473096686117,
"learning_rate": 5e-06,
"loss": 0.8122,
"step": 20
},
{
"epoch": 0.09122006841505131,
"grad_norm": 1.106205167846798,
"learning_rate": 5e-06,
"loss": 0.7764,
"step": 30
},
{
"epoch": 0.12162675788673508,
"grad_norm": 1.0271191708023029,
"learning_rate": 5e-06,
"loss": 0.7512,
"step": 40
},
{
"epoch": 0.15203344735841884,
"grad_norm": 0.8176114852390686,
"learning_rate": 5e-06,
"loss": 0.7338,
"step": 50
},
{
"epoch": 0.18244013683010263,
"grad_norm": 0.9260828555935472,
"learning_rate": 5e-06,
"loss": 0.721,
"step": 60
},
{
"epoch": 0.2128468263017864,
"grad_norm": 0.9770222831832209,
"learning_rate": 5e-06,
"loss": 0.7094,
"step": 70
},
{
"epoch": 0.24325351577347015,
"grad_norm": 0.6562302483289237,
"learning_rate": 5e-06,
"loss": 0.7012,
"step": 80
},
{
"epoch": 0.27366020524515394,
"grad_norm": 0.6984752490925699,
"learning_rate": 5e-06,
"loss": 0.6979,
"step": 90
},
{
"epoch": 0.3040668947168377,
"grad_norm": 0.8518484569843605,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 100
},
{
"epoch": 0.33447358418852147,
"grad_norm": 0.8741274500386024,
"learning_rate": 5e-06,
"loss": 0.69,
"step": 110
},
{
"epoch": 0.36488027366020526,
"grad_norm": 0.7325820852264833,
"learning_rate": 5e-06,
"loss": 0.6811,
"step": 120
},
{
"epoch": 0.395286963131889,
"grad_norm": 0.5709370572770963,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 130
},
{
"epoch": 0.4256936526035728,
"grad_norm": 0.6057188325485896,
"learning_rate": 5e-06,
"loss": 0.6818,
"step": 140
},
{
"epoch": 0.45610034207525657,
"grad_norm": 0.5863099510897374,
"learning_rate": 5e-06,
"loss": 0.6766,
"step": 150
},
{
"epoch": 0.4865070315469403,
"grad_norm": 0.7170797428745375,
"learning_rate": 5e-06,
"loss": 0.684,
"step": 160
},
{
"epoch": 0.5169137210186241,
"grad_norm": 0.559188071891075,
"learning_rate": 5e-06,
"loss": 0.6763,
"step": 170
},
{
"epoch": 0.5473204104903079,
"grad_norm": 0.5679992922149982,
"learning_rate": 5e-06,
"loss": 0.6845,
"step": 180
},
{
"epoch": 0.5777270999619917,
"grad_norm": 0.6193525571471781,
"learning_rate": 5e-06,
"loss": 0.671,
"step": 190
},
{
"epoch": 0.6081337894336754,
"grad_norm": 0.5431044581567518,
"learning_rate": 5e-06,
"loss": 0.6707,
"step": 200
},
{
"epoch": 0.6385404789053591,
"grad_norm": 0.5871938765797495,
"learning_rate": 5e-06,
"loss": 0.6721,
"step": 210
},
{
"epoch": 0.6689471683770429,
"grad_norm": 0.5980280104992193,
"learning_rate": 5e-06,
"loss": 0.6716,
"step": 220
},
{
"epoch": 0.6993538578487267,
"grad_norm": 0.49868159695154574,
"learning_rate": 5e-06,
"loss": 0.6686,
"step": 230
},
{
"epoch": 0.7297605473204105,
"grad_norm": 0.6213515255920768,
"learning_rate": 5e-06,
"loss": 0.6684,
"step": 240
},
{
"epoch": 0.7601672367920943,
"grad_norm": 0.7065699853711176,
"learning_rate": 5e-06,
"loss": 0.6637,
"step": 250
},
{
"epoch": 0.790573926263778,
"grad_norm": 0.5298004443539811,
"learning_rate": 5e-06,
"loss": 0.6681,
"step": 260
},
{
"epoch": 0.8209806157354618,
"grad_norm": 0.853485338030159,
"learning_rate": 5e-06,
"loss": 0.6619,
"step": 270
},
{
"epoch": 0.8513873052071456,
"grad_norm": 0.6474661712665346,
"learning_rate": 5e-06,
"loss": 0.6611,
"step": 280
},
{
"epoch": 0.8817939946788294,
"grad_norm": 0.5350576553467696,
"learning_rate": 5e-06,
"loss": 0.6595,
"step": 290
},
{
"epoch": 0.9122006841505131,
"grad_norm": 0.6461920255035306,
"learning_rate": 5e-06,
"loss": 0.6622,
"step": 300
},
{
"epoch": 0.9426073736221969,
"grad_norm": 0.7113283875706861,
"learning_rate": 5e-06,
"loss": 0.6565,
"step": 310
},
{
"epoch": 0.9730140630938806,
"grad_norm": 0.5398831103354343,
"learning_rate": 5e-06,
"loss": 0.6579,
"step": 320
},
{
"epoch": 0.9973394146712277,
"eval_loss": 0.6590341925621033,
"eval_runtime": 349.1132,
"eval_samples_per_second": 25.381,
"eval_steps_per_second": 0.398,
"step": 328
},
{
"epoch": 1.0057012542759407,
"grad_norm": 0.6541787183575265,
"learning_rate": 5e-06,
"loss": 0.7147,
"step": 330
},
{
"epoch": 1.0361079437476244,
"grad_norm": 0.5849575356382846,
"learning_rate": 5e-06,
"loss": 0.6206,
"step": 340
},
{
"epoch": 1.0665146332193083,
"grad_norm": 0.6188318897860179,
"learning_rate": 5e-06,
"loss": 0.6123,
"step": 350
},
{
"epoch": 1.096921322690992,
"grad_norm": 0.5529845628714145,
"learning_rate": 5e-06,
"loss": 0.6235,
"step": 360
},
{
"epoch": 1.1273280121626759,
"grad_norm": 0.6471216585957622,
"learning_rate": 5e-06,
"loss": 0.6149,
"step": 370
},
{
"epoch": 1.1577347016343595,
"grad_norm": 0.5348917582128644,
"learning_rate": 5e-06,
"loss": 0.6151,
"step": 380
},
{
"epoch": 1.1881413911060434,
"grad_norm": 0.5102391958573069,
"learning_rate": 5e-06,
"loss": 0.611,
"step": 390
},
{
"epoch": 1.2185480805777271,
"grad_norm": 0.5738175958206242,
"learning_rate": 5e-06,
"loss": 0.6123,
"step": 400
},
{
"epoch": 1.2489547700494108,
"grad_norm": 0.7700452193739229,
"learning_rate": 5e-06,
"loss": 0.6099,
"step": 410
},
{
"epoch": 1.2793614595210947,
"grad_norm": 0.6437425461349855,
"learning_rate": 5e-06,
"loss": 0.6143,
"step": 420
},
{
"epoch": 1.3097681489927784,
"grad_norm": 0.7772459625346615,
"learning_rate": 5e-06,
"loss": 0.6162,
"step": 430
},
{
"epoch": 1.340174838464462,
"grad_norm": 0.7382002997370456,
"learning_rate": 5e-06,
"loss": 0.6139,
"step": 440
},
{
"epoch": 1.370581527936146,
"grad_norm": 0.6650239061593356,
"learning_rate": 5e-06,
"loss": 0.6153,
"step": 450
},
{
"epoch": 1.4009882174078299,
"grad_norm": 0.4904795314706487,
"learning_rate": 5e-06,
"loss": 0.6115,
"step": 460
},
{
"epoch": 1.4313949068795135,
"grad_norm": 0.6074182358853202,
"learning_rate": 5e-06,
"loss": 0.6133,
"step": 470
},
{
"epoch": 1.4618015963511972,
"grad_norm": 0.6115080516205853,
"learning_rate": 5e-06,
"loss": 0.6151,
"step": 480
},
{
"epoch": 1.4922082858228811,
"grad_norm": 0.6245371375579123,
"learning_rate": 5e-06,
"loss": 0.6146,
"step": 490
},
{
"epoch": 1.5226149752945648,
"grad_norm": 0.6563433070038904,
"learning_rate": 5e-06,
"loss": 0.6182,
"step": 500
},
{
"epoch": 1.5530216647662485,
"grad_norm": 0.5157535145442664,
"learning_rate": 5e-06,
"loss": 0.6119,
"step": 510
},
{
"epoch": 1.5834283542379324,
"grad_norm": 0.5467230049625755,
"learning_rate": 5e-06,
"loss": 0.6146,
"step": 520
},
{
"epoch": 1.6138350437096163,
"grad_norm": 0.5809725331645209,
"learning_rate": 5e-06,
"loss": 0.612,
"step": 530
},
{
"epoch": 1.6442417331812997,
"grad_norm": 0.7935716363732433,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 540
},
{
"epoch": 1.6746484226529836,
"grad_norm": 0.5375427777402721,
"learning_rate": 5e-06,
"loss": 0.6116,
"step": 550
},
{
"epoch": 1.7050551121246675,
"grad_norm": 0.6075737303881514,
"learning_rate": 5e-06,
"loss": 0.6087,
"step": 560
},
{
"epoch": 1.7354618015963512,
"grad_norm": 0.5457501519081878,
"learning_rate": 5e-06,
"loss": 0.6146,
"step": 570
},
{
"epoch": 1.765868491068035,
"grad_norm": 0.5359013635410763,
"learning_rate": 5e-06,
"loss": 0.6121,
"step": 580
},
{
"epoch": 1.7962751805397188,
"grad_norm": 0.5731354873642086,
"learning_rate": 5e-06,
"loss": 0.6114,
"step": 590
},
{
"epoch": 1.8266818700114025,
"grad_norm": 0.5798625309299036,
"learning_rate": 5e-06,
"loss": 0.6156,
"step": 600
},
{
"epoch": 1.8570885594830862,
"grad_norm": 0.6682632356720307,
"learning_rate": 5e-06,
"loss": 0.612,
"step": 610
},
{
"epoch": 1.88749524895477,
"grad_norm": 0.5386120296701589,
"learning_rate": 5e-06,
"loss": 0.6097,
"step": 620
},
{
"epoch": 1.917901938426454,
"grad_norm": 0.5373342158868872,
"learning_rate": 5e-06,
"loss": 0.6029,
"step": 630
},
{
"epoch": 1.9483086278981376,
"grad_norm": 0.5069132774726652,
"learning_rate": 5e-06,
"loss": 0.6107,
"step": 640
},
{
"epoch": 1.9787153173698213,
"grad_norm": 0.6365254431868149,
"learning_rate": 5e-06,
"loss": 0.6084,
"step": 650
},
{
"epoch": 1.9969593310528317,
"eval_loss": 0.6481617093086243,
"eval_runtime": 349.4697,
"eval_samples_per_second": 25.356,
"eval_steps_per_second": 0.398,
"step": 656
},
{
"epoch": 1.9969593310528317,
"step": 656,
"total_flos": 1098615053352960.0,
"train_loss": 0.6551533584914556,
"train_runtime": 38635.1705,
"train_samples_per_second": 8.715,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 656,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1098615053352960.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}