{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99775617053104,
"eval_steps": 500,
"global_step": 1002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029917726252804786,
"grad_norm": 3.1024108625766647,
"learning_rate": 5e-06,
"loss": 0.7821,
"step": 10
},
{
"epoch": 0.05983545250560957,
"grad_norm": 2.0154961377920744,
"learning_rate": 5e-06,
"loss": 0.709,
"step": 20
},
{
"epoch": 0.08975317875841436,
"grad_norm": 4.098110110830773,
"learning_rate": 5e-06,
"loss": 0.716,
"step": 30
},
{
"epoch": 0.11967090501121914,
"grad_norm": 1.3839237187441724,
"learning_rate": 5e-06,
"loss": 0.6884,
"step": 40
},
{
"epoch": 0.14958863126402394,
"grad_norm": 1.6642156675636246,
"learning_rate": 5e-06,
"loss": 0.6742,
"step": 50
},
{
"epoch": 0.17950635751682872,
"grad_norm": 0.8382342346916078,
"learning_rate": 5e-06,
"loss": 0.6578,
"step": 60
},
{
"epoch": 0.2094240837696335,
"grad_norm": 1.0785514818670323,
"learning_rate": 5e-06,
"loss": 0.6526,
"step": 70
},
{
"epoch": 0.2393418100224383,
"grad_norm": 0.6806480615454938,
"learning_rate": 5e-06,
"loss": 0.6534,
"step": 80
},
{
"epoch": 0.26925953627524307,
"grad_norm": 0.6822998110330022,
"learning_rate": 5e-06,
"loss": 0.6396,
"step": 90
},
{
"epoch": 0.2991772625280479,
"grad_norm": 0.6407539938463268,
"learning_rate": 5e-06,
"loss": 0.6336,
"step": 100
},
{
"epoch": 0.32909498878085264,
"grad_norm": 0.6023614683907215,
"learning_rate": 5e-06,
"loss": 0.6345,
"step": 110
},
{
"epoch": 0.35901271503365745,
"grad_norm": 1.2991344935679592,
"learning_rate": 5e-06,
"loss": 0.6284,
"step": 120
},
{
"epoch": 0.3889304412864622,
"grad_norm": 1.2836405178889319,
"learning_rate": 5e-06,
"loss": 0.6206,
"step": 130
},
{
"epoch": 0.418848167539267,
"grad_norm": 0.5455928257982399,
"learning_rate": 5e-06,
"loss": 0.626,
"step": 140
},
{
"epoch": 0.4487658937920718,
"grad_norm": 0.8705277514817991,
"learning_rate": 5e-06,
"loss": 0.6243,
"step": 150
},
{
"epoch": 0.4786836200448766,
"grad_norm": 0.6015797080220293,
"learning_rate": 5e-06,
"loss": 0.6239,
"step": 160
},
{
"epoch": 0.5086013462976814,
"grad_norm": 0.5799933689114735,
"learning_rate": 5e-06,
"loss": 0.6242,
"step": 170
},
{
"epoch": 0.5385190725504861,
"grad_norm": 0.7181956170370547,
"learning_rate": 5e-06,
"loss": 0.6187,
"step": 180
},
{
"epoch": 0.5684367988032909,
"grad_norm": 1.1457159660009166,
"learning_rate": 5e-06,
"loss": 0.6249,
"step": 190
},
{
"epoch": 0.5983545250560958,
"grad_norm": 0.8081668167489676,
"learning_rate": 5e-06,
"loss": 0.6262,
"step": 200
},
{
"epoch": 0.6282722513089005,
"grad_norm": 0.494100005031363,
"learning_rate": 5e-06,
"loss": 0.6115,
"step": 210
},
{
"epoch": 0.6581899775617053,
"grad_norm": 0.49011911418037035,
"learning_rate": 5e-06,
"loss": 0.6154,
"step": 220
},
{
"epoch": 0.6881077038145101,
"grad_norm": 0.5619549814669743,
"learning_rate": 5e-06,
"loss": 0.6182,
"step": 230
},
{
"epoch": 0.7180254300673149,
"grad_norm": 0.47978866623290667,
"learning_rate": 5e-06,
"loss": 0.6136,
"step": 240
},
{
"epoch": 0.7479431563201197,
"grad_norm": 0.4834220711675942,
"learning_rate": 5e-06,
"loss": 0.6092,
"step": 250
},
{
"epoch": 0.7778608825729244,
"grad_norm": 0.5842572852141615,
"learning_rate": 5e-06,
"loss": 0.6183,
"step": 260
},
{
"epoch": 0.8077786088257293,
"grad_norm": 0.5124590739903697,
"learning_rate": 5e-06,
"loss": 0.6091,
"step": 270
},
{
"epoch": 0.837696335078534,
"grad_norm": 0.571561034937845,
"learning_rate": 5e-06,
"loss": 0.6089,
"step": 280
},
{
"epoch": 0.8676140613313388,
"grad_norm": 0.50639640711907,
"learning_rate": 5e-06,
"loss": 0.6075,
"step": 290
},
{
"epoch": 0.8975317875841436,
"grad_norm": 0.47922109749059255,
"learning_rate": 5e-06,
"loss": 0.6113,
"step": 300
},
{
"epoch": 0.9274495138369484,
"grad_norm": 0.6105089487425442,
"learning_rate": 5e-06,
"loss": 0.6123,
"step": 310
},
{
"epoch": 0.9573672400897532,
"grad_norm": 0.5056360719681718,
"learning_rate": 5e-06,
"loss": 0.6156,
"step": 320
},
{
"epoch": 0.9872849663425579,
"grad_norm": 0.48485936511026195,
"learning_rate": 5e-06,
"loss": 0.6008,
"step": 330
},
{
"epoch": 0.9992520568436799,
"eval_loss": 0.6066673398017883,
"eval_runtime": 179.254,
"eval_samples_per_second": 50.242,
"eval_steps_per_second": 0.396,
"step": 334
},
{
"epoch": 1.0172026925953628,
"grad_norm": 0.6922596264407767,
"learning_rate": 5e-06,
"loss": 0.5765,
"step": 340
},
{
"epoch": 1.0471204188481675,
"grad_norm": 0.49267857964550554,
"learning_rate": 5e-06,
"loss": 0.5603,
"step": 350
},
{
"epoch": 1.0770381451009723,
"grad_norm": 0.49751934654599217,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 360
},
{
"epoch": 1.106955871353777,
"grad_norm": 0.6837370299507576,
"learning_rate": 5e-06,
"loss": 0.5612,
"step": 370
},
{
"epoch": 1.136873597606582,
"grad_norm": 0.47693359913296135,
"learning_rate": 5e-06,
"loss": 0.5539,
"step": 380
},
{
"epoch": 1.1667913238593868,
"grad_norm": 0.648584636579785,
"learning_rate": 5e-06,
"loss": 0.5622,
"step": 390
},
{
"epoch": 1.1967090501121915,
"grad_norm": 0.5670898037838431,
"learning_rate": 5e-06,
"loss": 0.554,
"step": 400
},
{
"epoch": 1.2266267763649963,
"grad_norm": 0.5703136712557304,
"learning_rate": 5e-06,
"loss": 0.5609,
"step": 410
},
{
"epoch": 1.256544502617801,
"grad_norm": 0.6102946772879293,
"learning_rate": 5e-06,
"loss": 0.559,
"step": 420
},
{
"epoch": 1.2864622288706058,
"grad_norm": 0.5772564364701466,
"learning_rate": 5e-06,
"loss": 0.5603,
"step": 430
},
{
"epoch": 1.3163799551234106,
"grad_norm": 0.450931906177121,
"learning_rate": 5e-06,
"loss": 0.5601,
"step": 440
},
{
"epoch": 1.3462976813762153,
"grad_norm": 0.5054266947188414,
"learning_rate": 5e-06,
"loss": 0.5572,
"step": 450
},
{
"epoch": 1.37621540762902,
"grad_norm": 0.48292336210926307,
"learning_rate": 5e-06,
"loss": 0.5587,
"step": 460
},
{
"epoch": 1.406133133881825,
"grad_norm": 0.5249447315851029,
"learning_rate": 5e-06,
"loss": 0.5575,
"step": 470
},
{
"epoch": 1.4360508601346298,
"grad_norm": 0.5140359027555277,
"learning_rate": 5e-06,
"loss": 0.5546,
"step": 480
},
{
"epoch": 1.4659685863874345,
"grad_norm": 0.5354245491327346,
"learning_rate": 5e-06,
"loss": 0.5578,
"step": 490
},
{
"epoch": 1.4958863126402393,
"grad_norm": 0.5682468395490031,
"learning_rate": 5e-06,
"loss": 0.5602,
"step": 500
},
{
"epoch": 1.5258040388930443,
"grad_norm": 0.5900754898799866,
"learning_rate": 5e-06,
"loss": 0.5616,
"step": 510
},
{
"epoch": 1.555721765145849,
"grad_norm": 0.4996963352974884,
"learning_rate": 5e-06,
"loss": 0.5557,
"step": 520
},
{
"epoch": 1.5856394913986538,
"grad_norm": 0.5075965675534208,
"learning_rate": 5e-06,
"loss": 0.5573,
"step": 530
},
{
"epoch": 1.6155572176514585,
"grad_norm": 0.6139589992144621,
"learning_rate": 5e-06,
"loss": 0.5601,
"step": 540
},
{
"epoch": 1.6454749439042633,
"grad_norm": 0.5479413460230939,
"learning_rate": 5e-06,
"loss": 0.5549,
"step": 550
},
{
"epoch": 1.675392670157068,
"grad_norm": 0.5917928857307124,
"learning_rate": 5e-06,
"loss": 0.553,
"step": 560
},
{
"epoch": 1.7053103964098728,
"grad_norm": 0.5837997645694308,
"learning_rate": 5e-06,
"loss": 0.557,
"step": 570
},
{
"epoch": 1.7352281226626776,
"grad_norm": 0.5008632390644033,
"learning_rate": 5e-06,
"loss": 0.5497,
"step": 580
},
{
"epoch": 1.7651458489154823,
"grad_norm": 0.5321160422034145,
"learning_rate": 5e-06,
"loss": 0.554,
"step": 590
},
{
"epoch": 1.795063575168287,
"grad_norm": 0.5334951007964042,
"learning_rate": 5e-06,
"loss": 0.5594,
"step": 600
},
{
"epoch": 1.824981301421092,
"grad_norm": 0.5210632059648287,
"learning_rate": 5e-06,
"loss": 0.56,
"step": 610
},
{
"epoch": 1.8548990276738968,
"grad_norm": 0.5043445814465957,
"learning_rate": 5e-06,
"loss": 0.5575,
"step": 620
},
{
"epoch": 1.8848167539267016,
"grad_norm": 0.613976408483885,
"learning_rate": 5e-06,
"loss": 0.5583,
"step": 630
},
{
"epoch": 1.9147344801795063,
"grad_norm": 0.5363674682216356,
"learning_rate": 5e-06,
"loss": 0.5643,
"step": 640
},
{
"epoch": 1.9446522064323113,
"grad_norm": 0.5061979916241445,
"learning_rate": 5e-06,
"loss": 0.5617,
"step": 650
},
{
"epoch": 1.974569932685116,
"grad_norm": 0.5681791964553616,
"learning_rate": 5e-06,
"loss": 0.5567,
"step": 660
},
{
"epoch": 1.9985041136873598,
"eval_loss": 0.5981965661048889,
"eval_runtime": 179.6452,
"eval_samples_per_second": 50.132,
"eval_steps_per_second": 0.395,
"step": 668
},
{
"epoch": 2.004487658937921,
"grad_norm": 0.7426343168495358,
"learning_rate": 5e-06,
"loss": 0.5487,
"step": 670
},
{
"epoch": 2.0344053851907256,
"grad_norm": 0.8360352246281931,
"learning_rate": 5e-06,
"loss": 0.509,
"step": 680
},
{
"epoch": 2.0643231114435303,
"grad_norm": 0.5586788759798959,
"learning_rate": 5e-06,
"loss": 0.5107,
"step": 690
},
{
"epoch": 2.094240837696335,
"grad_norm": 0.5334495185578801,
"learning_rate": 5e-06,
"loss": 0.5083,
"step": 700
},
{
"epoch": 2.12415856394914,
"grad_norm": 0.5451729047772684,
"learning_rate": 5e-06,
"loss": 0.5103,
"step": 710
},
{
"epoch": 2.1540762902019446,
"grad_norm": 0.5384304992109907,
"learning_rate": 5e-06,
"loss": 0.5083,
"step": 720
},
{
"epoch": 2.1839940164547493,
"grad_norm": 0.6897130607106118,
"learning_rate": 5e-06,
"loss": 0.5087,
"step": 730
},
{
"epoch": 2.213911742707554,
"grad_norm": 0.4914356466355389,
"learning_rate": 5e-06,
"loss": 0.5028,
"step": 740
},
{
"epoch": 2.243829468960359,
"grad_norm": 0.4859395300154547,
"learning_rate": 5e-06,
"loss": 0.5107,
"step": 750
},
{
"epoch": 2.273747195213164,
"grad_norm": 0.5080772320686796,
"learning_rate": 5e-06,
"loss": 0.5085,
"step": 760
},
{
"epoch": 2.303664921465969,
"grad_norm": 0.5928831782745143,
"learning_rate": 5e-06,
"loss": 0.5065,
"step": 770
},
{
"epoch": 2.3335826477187736,
"grad_norm": 0.5292235530530627,
"learning_rate": 5e-06,
"loss": 0.5064,
"step": 780
},
{
"epoch": 2.3635003739715783,
"grad_norm": 0.5612309376826925,
"learning_rate": 5e-06,
"loss": 0.5047,
"step": 790
},
{
"epoch": 2.393418100224383,
"grad_norm": 0.7840153330542589,
"learning_rate": 5e-06,
"loss": 0.5107,
"step": 800
},
{
"epoch": 2.423335826477188,
"grad_norm": 0.563462388459372,
"learning_rate": 5e-06,
"loss": 0.5074,
"step": 810
},
{
"epoch": 2.4532535527299926,
"grad_norm": 0.6037763909811528,
"learning_rate": 5e-06,
"loss": 0.5027,
"step": 820
},
{
"epoch": 2.4831712789827973,
"grad_norm": 0.512144492512373,
"learning_rate": 5e-06,
"loss": 0.5059,
"step": 830
},
{
"epoch": 2.513089005235602,
"grad_norm": 0.5423885548086916,
"learning_rate": 5e-06,
"loss": 0.5087,
"step": 840
},
{
"epoch": 2.543006731488407,
"grad_norm": 0.5435953936785038,
"learning_rate": 5e-06,
"loss": 0.5097,
"step": 850
},
{
"epoch": 2.5729244577412116,
"grad_norm": 0.5702298253328785,
"learning_rate": 5e-06,
"loss": 0.5139,
"step": 860
},
{
"epoch": 2.6028421839940163,
"grad_norm": 0.5598880036547539,
"learning_rate": 5e-06,
"loss": 0.5108,
"step": 870
},
{
"epoch": 2.632759910246821,
"grad_norm": 0.5408931335977699,
"learning_rate": 5e-06,
"loss": 0.5123,
"step": 880
},
{
"epoch": 2.662677636499626,
"grad_norm": 0.5650280550038018,
"learning_rate": 5e-06,
"loss": 0.5024,
"step": 890
},
{
"epoch": 2.6925953627524306,
"grad_norm": 0.5162753582027944,
"learning_rate": 5e-06,
"loss": 0.5086,
"step": 900
},
{
"epoch": 2.7225130890052354,
"grad_norm": 0.562255619044257,
"learning_rate": 5e-06,
"loss": 0.5091,
"step": 910
},
{
"epoch": 2.75243081525804,
"grad_norm": 0.525530348245258,
"learning_rate": 5e-06,
"loss": 0.5101,
"step": 920
},
{
"epoch": 2.7823485415108453,
"grad_norm": 0.5416570745705145,
"learning_rate": 5e-06,
"loss": 0.5099,
"step": 930
},
{
"epoch": 2.81226626776365,
"grad_norm": 0.5094752288812316,
"learning_rate": 5e-06,
"loss": 0.5091,
"step": 940
},
{
"epoch": 2.842183994016455,
"grad_norm": 0.48328984684734566,
"learning_rate": 5e-06,
"loss": 0.5115,
"step": 950
},
{
"epoch": 2.8721017202692596,
"grad_norm": 0.5309507328482131,
"learning_rate": 5e-06,
"loss": 0.5145,
"step": 960
},
{
"epoch": 2.9020194465220643,
"grad_norm": 0.5645199920156511,
"learning_rate": 5e-06,
"loss": 0.507,
"step": 970
},
{
"epoch": 2.931937172774869,
"grad_norm": 0.6341772078202893,
"learning_rate": 5e-06,
"loss": 0.5164,
"step": 980
},
{
"epoch": 2.961854899027674,
"grad_norm": 0.5241928497019043,
"learning_rate": 5e-06,
"loss": 0.5075,
"step": 990
},
{
"epoch": 2.9917726252804786,
"grad_norm": 0.5568699384966846,
"learning_rate": 5e-06,
"loss": 0.5124,
"step": 1000
},
{
"epoch": 2.99775617053104,
"eval_loss": 0.6033644080162048,
"eval_runtime": 179.8158,
"eval_samples_per_second": 50.085,
"eval_steps_per_second": 0.395,
"step": 1002
},
{
"epoch": 2.99775617053104,
"step": 1002,
"total_flos": 1677968560619520.0,
"train_loss": 0.5679433697949865,
"train_runtime": 29958.3489,
"train_samples_per_second": 17.134,
"train_steps_per_second": 0.033
}
],
"logging_steps": 10,
"max_steps": 1002,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1677968560619520.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}