{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9969620253164557,
"eval_steps": 500,
"global_step": 986,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020253164556962026,
"grad_norm": 2.8830449086777117,
"learning_rate": 5e-06,
"loss": 0.7568,
"step": 10
},
{
"epoch": 0.04050632911392405,
"grad_norm": 2.303422506767745,
"learning_rate": 5e-06,
"loss": 0.6504,
"step": 20
},
{
"epoch": 0.060759493670886074,
"grad_norm": 1.6860772420358063,
"learning_rate": 5e-06,
"loss": 0.6288,
"step": 30
},
{
"epoch": 0.0810126582278481,
"grad_norm": 2.0970267075945785,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 40
},
{
"epoch": 0.10126582278481013,
"grad_norm": 1.6054998154511506,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 50
},
{
"epoch": 0.12151898734177215,
"grad_norm": 1.838238268065728,
"learning_rate": 5e-06,
"loss": 0.5983,
"step": 60
},
{
"epoch": 0.14177215189873418,
"grad_norm": 1.6213305380578262,
"learning_rate": 5e-06,
"loss": 0.5938,
"step": 70
},
{
"epoch": 0.1620253164556962,
"grad_norm": 2.1780011248652023,
"learning_rate": 5e-06,
"loss": 0.5882,
"step": 80
},
{
"epoch": 0.18227848101265823,
"grad_norm": 2.103534064613827,
"learning_rate": 5e-06,
"loss": 0.5863,
"step": 90
},
{
"epoch": 0.20253164556962025,
"grad_norm": 1.8335202039731435,
"learning_rate": 5e-06,
"loss": 0.5888,
"step": 100
},
{
"epoch": 0.22278481012658227,
"grad_norm": 1.523297129174684,
"learning_rate": 5e-06,
"loss": 0.5809,
"step": 110
},
{
"epoch": 0.2430379746835443,
"grad_norm": 2.0170719153012904,
"learning_rate": 5e-06,
"loss": 0.5788,
"step": 120
},
{
"epoch": 0.26329113924050634,
"grad_norm": 1.7622257691924685,
"learning_rate": 5e-06,
"loss": 0.5777,
"step": 130
},
{
"epoch": 0.28354430379746837,
"grad_norm": 1.7181353902048122,
"learning_rate": 5e-06,
"loss": 0.5672,
"step": 140
},
{
"epoch": 0.3037974683544304,
"grad_norm": 1.4401641944683279,
"learning_rate": 5e-06,
"loss": 0.5793,
"step": 150
},
{
"epoch": 0.3240506329113924,
"grad_norm": 1.4242865070941313,
"learning_rate": 5e-06,
"loss": 0.5788,
"step": 160
},
{
"epoch": 0.34430379746835443,
"grad_norm": 1.7980269158065236,
"learning_rate": 5e-06,
"loss": 0.5794,
"step": 170
},
{
"epoch": 0.36455696202531646,
"grad_norm": 1.4779433518605618,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 180
},
{
"epoch": 0.3848101265822785,
"grad_norm": 1.8693708290648419,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 190
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.8949198189001324,
"learning_rate": 5e-06,
"loss": 0.5695,
"step": 200
},
{
"epoch": 0.4253164556962025,
"grad_norm": 1.435742719558278,
"learning_rate": 5e-06,
"loss": 0.5649,
"step": 210
},
{
"epoch": 0.44556962025316454,
"grad_norm": 1.5468357652273146,
"learning_rate": 5e-06,
"loss": 0.5692,
"step": 220
},
{
"epoch": 0.46582278481012657,
"grad_norm": 1.4031725584973305,
"learning_rate": 5e-06,
"loss": 0.5682,
"step": 230
},
{
"epoch": 0.4860759493670886,
"grad_norm": 1.4275164314242066,
"learning_rate": 5e-06,
"loss": 0.5643,
"step": 240
},
{
"epoch": 0.5063291139240507,
"grad_norm": 1.2176273594878788,
"learning_rate": 5e-06,
"loss": 0.5678,
"step": 250
},
{
"epoch": 0.5265822784810127,
"grad_norm": 1.2690186560536825,
"learning_rate": 5e-06,
"loss": 0.5548,
"step": 260
},
{
"epoch": 0.5468354430379747,
"grad_norm": 1.83660932215292,
"learning_rate": 5e-06,
"loss": 0.5644,
"step": 270
},
{
"epoch": 0.5670886075949367,
"grad_norm": 1.6169905329132488,
"learning_rate": 5e-06,
"loss": 0.5593,
"step": 280
},
{
"epoch": 0.5873417721518988,
"grad_norm": 1.4338044643322776,
"learning_rate": 5e-06,
"loss": 0.5586,
"step": 290
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.7201385340518724,
"learning_rate": 5e-06,
"loss": 0.5582,
"step": 300
},
{
"epoch": 0.6278481012658228,
"grad_norm": 1.2567147302918276,
"learning_rate": 5e-06,
"loss": 0.5637,
"step": 310
},
{
"epoch": 0.6481012658227848,
"grad_norm": 1.2477603311793903,
"learning_rate": 5e-06,
"loss": 0.5599,
"step": 320
},
{
"epoch": 0.6683544303797468,
"grad_norm": 1.4335133075994282,
"learning_rate": 5e-06,
"loss": 0.5602,
"step": 330
},
{
"epoch": 0.6886075949367089,
"grad_norm": 1.4578243700932763,
"learning_rate": 5e-06,
"loss": 0.5604,
"step": 340
},
{
"epoch": 0.7088607594936709,
"grad_norm": 1.2082777377225689,
"learning_rate": 5e-06,
"loss": 0.5616,
"step": 350
},
{
"epoch": 0.7291139240506329,
"grad_norm": 1.231089840082507,
"learning_rate": 5e-06,
"loss": 0.5636,
"step": 360
},
{
"epoch": 0.7493670886075949,
"grad_norm": 1.3997981963290846,
"learning_rate": 5e-06,
"loss": 0.5562,
"step": 370
},
{
"epoch": 0.769620253164557,
"grad_norm": 1.2047345664692388,
"learning_rate": 5e-06,
"loss": 0.5527,
"step": 380
},
{
"epoch": 0.789873417721519,
"grad_norm": 1.4342172334673526,
"learning_rate": 5e-06,
"loss": 0.5555,
"step": 390
},
{
"epoch": 0.810126582278481,
"grad_norm": 1.2558989205908657,
"learning_rate": 5e-06,
"loss": 0.556,
"step": 400
},
{
"epoch": 0.830379746835443,
"grad_norm": 1.2117418919687244,
"learning_rate": 5e-06,
"loss": 0.5502,
"step": 410
},
{
"epoch": 0.850632911392405,
"grad_norm": 1.185811964971908,
"learning_rate": 5e-06,
"loss": 0.5581,
"step": 420
},
{
"epoch": 0.8708860759493671,
"grad_norm": 1.1218795286529273,
"learning_rate": 5e-06,
"loss": 0.5517,
"step": 430
},
{
"epoch": 0.8911392405063291,
"grad_norm": 1.3887849252274234,
"learning_rate": 5e-06,
"loss": 0.5573,
"step": 440
},
{
"epoch": 0.9113924050632911,
"grad_norm": 1.0476717005428378,
"learning_rate": 5e-06,
"loss": 0.5485,
"step": 450
},
{
"epoch": 0.9316455696202531,
"grad_norm": 1.06285887248084,
"learning_rate": 5e-06,
"loss": 0.5527,
"step": 460
},
{
"epoch": 0.9518987341772152,
"grad_norm": 1.1062346230921074,
"learning_rate": 5e-06,
"loss": 0.5536,
"step": 470
},
{
"epoch": 0.9721518987341772,
"grad_norm": 1.1370240328967087,
"learning_rate": 5e-06,
"loss": 0.5506,
"step": 480
},
{
"epoch": 0.9924050632911392,
"grad_norm": 1.1112526055158258,
"learning_rate": 5e-06,
"loss": 0.5472,
"step": 490
},
{
"epoch": 0.9984810126582279,
"eval_loss": 0.06877367943525314,
"eval_runtime": 510.2093,
"eval_samples_per_second": 26.076,
"eval_steps_per_second": 0.408,
"step": 493
},
{
"epoch": 1.0126582278481013,
"grad_norm": 1.896508901425431,
"learning_rate": 5e-06,
"loss": 0.5062,
"step": 500
},
{
"epoch": 1.0329113924050632,
"grad_norm": 1.29214738627033,
"learning_rate": 5e-06,
"loss": 0.4713,
"step": 510
},
{
"epoch": 1.0531645569620254,
"grad_norm": 1.3143564009223339,
"learning_rate": 5e-06,
"loss": 0.4665,
"step": 520
},
{
"epoch": 1.0734177215189873,
"grad_norm": 1.6998357379137725,
"learning_rate": 5e-06,
"loss": 0.4683,
"step": 530
},
{
"epoch": 1.0936708860759494,
"grad_norm": 1.2775472369900311,
"learning_rate": 5e-06,
"loss": 0.4641,
"step": 540
},
{
"epoch": 1.1139240506329113,
"grad_norm": 1.2895233559545232,
"learning_rate": 5e-06,
"loss": 0.4664,
"step": 550
},
{
"epoch": 1.1341772151898735,
"grad_norm": 2.336669368486411,
"learning_rate": 5e-06,
"loss": 0.4633,
"step": 560
},
{
"epoch": 1.1544303797468354,
"grad_norm": 1.552813991949337,
"learning_rate": 5e-06,
"loss": 0.4693,
"step": 570
},
{
"epoch": 1.1746835443037975,
"grad_norm": 1.7264516276776805,
"learning_rate": 5e-06,
"loss": 0.4677,
"step": 580
},
{
"epoch": 1.1949367088607594,
"grad_norm": 1.5668329127595755,
"learning_rate": 5e-06,
"loss": 0.4611,
"step": 590
},
{
"epoch": 1.2151898734177216,
"grad_norm": 1.5420195271384818,
"learning_rate": 5e-06,
"loss": 0.4683,
"step": 600
},
{
"epoch": 1.2354430379746835,
"grad_norm": 1.4025799668342696,
"learning_rate": 5e-06,
"loss": 0.4648,
"step": 610
},
{
"epoch": 1.2556962025316456,
"grad_norm": 1.410087562343117,
"learning_rate": 5e-06,
"loss": 0.4691,
"step": 620
},
{
"epoch": 1.2759493670886077,
"grad_norm": 1.3134227418822069,
"learning_rate": 5e-06,
"loss": 0.4699,
"step": 630
},
{
"epoch": 1.2962025316455696,
"grad_norm": 1.265890889850941,
"learning_rate": 5e-06,
"loss": 0.47,
"step": 640
},
{
"epoch": 1.3164556962025316,
"grad_norm": 1.347312704270352,
"learning_rate": 5e-06,
"loss": 0.4747,
"step": 650
},
{
"epoch": 1.3367088607594937,
"grad_norm": 1.5665345840182998,
"learning_rate": 5e-06,
"loss": 0.4637,
"step": 660
},
{
"epoch": 1.3569620253164558,
"grad_norm": 1.4061410250755932,
"learning_rate": 5e-06,
"loss": 0.4768,
"step": 670
},
{
"epoch": 1.3772151898734177,
"grad_norm": 1.2663454266336562,
"learning_rate": 5e-06,
"loss": 0.4733,
"step": 680
},
{
"epoch": 1.3974683544303796,
"grad_norm": 1.1821039871004464,
"learning_rate": 5e-06,
"loss": 0.473,
"step": 690
},
{
"epoch": 1.4177215189873418,
"grad_norm": 1.156981338736933,
"learning_rate": 5e-06,
"loss": 0.4712,
"step": 700
},
{
"epoch": 1.437974683544304,
"grad_norm": 1.2263612576805232,
"learning_rate": 5e-06,
"loss": 0.477,
"step": 710
},
{
"epoch": 1.4582278481012658,
"grad_norm": 1.2801673642156481,
"learning_rate": 5e-06,
"loss": 0.4772,
"step": 720
},
{
"epoch": 1.4784810126582277,
"grad_norm": 1.2617356305246052,
"learning_rate": 5e-06,
"loss": 0.4799,
"step": 730
},
{
"epoch": 1.4987341772151899,
"grad_norm": 1.2354318184866413,
"learning_rate": 5e-06,
"loss": 0.4728,
"step": 740
},
{
"epoch": 1.518987341772152,
"grad_norm": 1.2511733882832696,
"learning_rate": 5e-06,
"loss": 0.4757,
"step": 750
},
{
"epoch": 1.539240506329114,
"grad_norm": 1.2772298337747716,
"learning_rate": 5e-06,
"loss": 0.4768,
"step": 760
},
{
"epoch": 1.5594936708860758,
"grad_norm": 1.2992839040784614,
"learning_rate": 5e-06,
"loss": 0.4759,
"step": 770
},
{
"epoch": 1.579746835443038,
"grad_norm": 1.7049378845615897,
"learning_rate": 5e-06,
"loss": 0.4766,
"step": 780
},
{
"epoch": 1.6,
"grad_norm": 1.6571454832942571,
"learning_rate": 5e-06,
"loss": 0.4789,
"step": 790
},
{
"epoch": 1.620253164556962,
"grad_norm": 1.9054844094236882,
"learning_rate": 5e-06,
"loss": 0.4736,
"step": 800
},
{
"epoch": 1.640506329113924,
"grad_norm": 1.5464356008924167,
"learning_rate": 5e-06,
"loss": 0.466,
"step": 810
},
{
"epoch": 1.660759493670886,
"grad_norm": 1.5111838450770525,
"learning_rate": 5e-06,
"loss": 0.4743,
"step": 820
},
{
"epoch": 1.6810126582278482,
"grad_norm": 1.42468780507972,
"learning_rate": 5e-06,
"loss": 0.4703,
"step": 830
},
{
"epoch": 1.70126582278481,
"grad_norm": 1.3971247658469674,
"learning_rate": 5e-06,
"loss": 0.4711,
"step": 840
},
{
"epoch": 1.721518987341772,
"grad_norm": 1.3361669855974696,
"learning_rate": 5e-06,
"loss": 0.4734,
"step": 850
},
{
"epoch": 1.7417721518987341,
"grad_norm": 1.2864567717133562,
"learning_rate": 5e-06,
"loss": 0.4733,
"step": 860
},
{
"epoch": 1.7620253164556963,
"grad_norm": 1.379392846668321,
"learning_rate": 5e-06,
"loss": 0.4764,
"step": 870
},
{
"epoch": 1.7822784810126582,
"grad_norm": 1.2161972975914068,
"learning_rate": 5e-06,
"loss": 0.477,
"step": 880
},
{
"epoch": 1.80253164556962,
"grad_norm": 1.1457621158991818,
"learning_rate": 5e-06,
"loss": 0.4741,
"step": 890
},
{
"epoch": 1.8227848101265822,
"grad_norm": 1.165588356364841,
"learning_rate": 5e-06,
"loss": 0.4744,
"step": 900
},
{
"epoch": 1.8430379746835444,
"grad_norm": 1.232051152752489,
"learning_rate": 5e-06,
"loss": 0.477,
"step": 910
},
{
"epoch": 1.8632911392405065,
"grad_norm": 1.2278118260508522,
"learning_rate": 5e-06,
"loss": 0.4764,
"step": 920
},
{
"epoch": 1.8835443037974684,
"grad_norm": 1.1460939229882365,
"learning_rate": 5e-06,
"loss": 0.4734,
"step": 930
},
{
"epoch": 1.9037974683544303,
"grad_norm": 1.1502258411053914,
"learning_rate": 5e-06,
"loss": 0.4775,
"step": 940
},
{
"epoch": 1.9240506329113924,
"grad_norm": 1.3240628061111426,
"learning_rate": 5e-06,
"loss": 0.4785,
"step": 950
},
{
"epoch": 1.9443037974683546,
"grad_norm": 1.4234146644601138,
"learning_rate": 5e-06,
"loss": 0.4827,
"step": 960
},
{
"epoch": 1.9645569620253165,
"grad_norm": 1.1410192422559635,
"learning_rate": 5e-06,
"loss": 0.4807,
"step": 970
},
{
"epoch": 1.9848101265822784,
"grad_norm": 1.233279816619483,
"learning_rate": 5e-06,
"loss": 0.4813,
"step": 980
},
{
"epoch": 1.9969620253164557,
"eval_loss": 0.06898781657218933,
"eval_runtime": 512.3069,
"eval_samples_per_second": 25.969,
"eval_steps_per_second": 0.406,
"step": 986
},
{
"epoch": 1.9969620253164557,
"step": 986,
"total_flos": 1651377344348160.0,
"train_loss": 0.5238038238598899,
"train_runtime": 56319.7775,
"train_samples_per_second": 8.976,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 986,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1651377344348160.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}