{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 2000,
"global_step": 873,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 7.051288604736328,
"eval_runtime": 17.1259,
"eval_samples_per_second": 0.058,
"eval_steps_per_second": 0.058,
"step": 0
},
{
"epoch": 0.03436426116838488,
"grad_norm": 32.725921630859375,
"learning_rate": 3.5000000000000004e-06,
"loss": 7.176,
"step": 10
},
{
"epoch": 0.06872852233676977,
"grad_norm": 10.35120677947998,
"learning_rate": 8.500000000000002e-06,
"loss": 6.1643,
"step": 20
},
{
"epoch": 0.10309278350515463,
"grad_norm": 10.239090919494629,
"learning_rate": 1.3500000000000001e-05,
"loss": 5.8296,
"step": 30
},
{
"epoch": 0.13745704467353953,
"grad_norm": 10.433135032653809,
"learning_rate": 1.85e-05,
"loss": 5.737,
"step": 40
},
{
"epoch": 0.1718213058419244,
"grad_norm": 9.619898796081543,
"learning_rate": 2.35e-05,
"loss": 5.7242,
"step": 50
},
{
"epoch": 0.20618556701030927,
"grad_norm": 9.597282409667969,
"learning_rate": 2.8499999999999998e-05,
"loss": 5.6166,
"step": 60
},
{
"epoch": 0.24054982817869416,
"grad_norm": 9.673783302307129,
"learning_rate": 3.35e-05,
"loss": 5.5887,
"step": 70
},
{
"epoch": 0.27491408934707906,
"grad_norm": 8.612630844116211,
"learning_rate": 3.85e-05,
"loss": 5.6619,
"step": 80
},
{
"epoch": 0.30927835051546393,
"grad_norm": 8.954395294189453,
"learning_rate": 4.35e-05,
"loss": 5.6506,
"step": 90
},
{
"epoch": 0.3436426116838488,
"grad_norm": 8.672375679016113,
"learning_rate": 4.85e-05,
"loss": 5.5449,
"step": 100
},
{
"epoch": 0.37800687285223367,
"grad_norm": 9.550030708312988,
"learning_rate": 4.954721862871928e-05,
"loss": 5.6273,
"step": 110
},
{
"epoch": 0.41237113402061853,
"grad_norm": 8.500064849853516,
"learning_rate": 4.890038809831824e-05,
"loss": 5.5225,
"step": 120
},
{
"epoch": 0.44673539518900346,
"grad_norm": 8.339930534362793,
"learning_rate": 4.8253557567917204e-05,
"loss": 5.5552,
"step": 130
},
{
"epoch": 0.48109965635738833,
"grad_norm": 8.283628463745117,
"learning_rate": 4.760672703751617e-05,
"loss": 5.3781,
"step": 140
},
{
"epoch": 0.5154639175257731,
"grad_norm": 9.112236976623535,
"learning_rate": 4.6959896507115136e-05,
"loss": 5.4563,
"step": 150
},
{
"epoch": 0.5498281786941581,
"grad_norm": 8.985079765319824,
"learning_rate": 4.6313065976714105e-05,
"loss": 5.5249,
"step": 160
},
{
"epoch": 0.584192439862543,
"grad_norm": 7.019378185272217,
"learning_rate": 4.566623544631307e-05,
"loss": 5.4304,
"step": 170
},
{
"epoch": 0.6185567010309279,
"grad_norm": 7.323549270629883,
"learning_rate": 4.5019404915912036e-05,
"loss": 5.4806,
"step": 180
},
{
"epoch": 0.6529209621993127,
"grad_norm": 7.620777130126953,
"learning_rate": 4.4372574385511e-05,
"loss": 5.3499,
"step": 190
},
{
"epoch": 0.6872852233676976,
"grad_norm": 7.8983659744262695,
"learning_rate": 4.372574385510996e-05,
"loss": 5.3936,
"step": 200
},
{
"epoch": 0.7216494845360825,
"grad_norm": 6.915121555328369,
"learning_rate": 4.307891332470893e-05,
"loss": 5.3653,
"step": 210
},
{
"epoch": 0.7560137457044673,
"grad_norm": 7.591084957122803,
"learning_rate": 4.243208279430789e-05,
"loss": 5.4473,
"step": 220
},
{
"epoch": 0.7903780068728522,
"grad_norm": 7.623355865478516,
"learning_rate": 4.178525226390686e-05,
"loss": 5.4019,
"step": 230
},
{
"epoch": 0.8247422680412371,
"grad_norm": 7.67550802230835,
"learning_rate": 4.113842173350582e-05,
"loss": 5.3478,
"step": 240
},
{
"epoch": 0.8591065292096219,
"grad_norm": 7.541503429412842,
"learning_rate": 4.049159120310479e-05,
"loss": 5.3985,
"step": 250
},
{
"epoch": 0.8934707903780069,
"grad_norm": 7.1159749031066895,
"learning_rate": 3.9844760672703754e-05,
"loss": 5.3093,
"step": 260
},
{
"epoch": 0.9278350515463918,
"grad_norm": 7.516855716705322,
"learning_rate": 3.9197930142302716e-05,
"loss": 5.3952,
"step": 270
},
{
"epoch": 0.9621993127147767,
"grad_norm": 7.933532238006592,
"learning_rate": 3.855109961190168e-05,
"loss": 5.2822,
"step": 280
},
{
"epoch": 0.9965635738831615,
"grad_norm": 7.439053535461426,
"learning_rate": 3.790426908150065e-05,
"loss": 5.3152,
"step": 290
},
{
"epoch": 1.0309278350515463,
"grad_norm": 9.026961326599121,
"learning_rate": 3.7257438551099616e-05,
"loss": 4.8395,
"step": 300
},
{
"epoch": 1.0652920962199313,
"grad_norm": 8.444412231445312,
"learning_rate": 3.661060802069858e-05,
"loss": 4.7306,
"step": 310
},
{
"epoch": 1.0996563573883162,
"grad_norm": 8.353560447692871,
"learning_rate": 3.596377749029755e-05,
"loss": 4.7795,
"step": 320
},
{
"epoch": 1.134020618556701,
"grad_norm": 8.427460670471191,
"learning_rate": 3.531694695989651e-05,
"loss": 4.736,
"step": 330
},
{
"epoch": 1.168384879725086,
"grad_norm": 8.1294584274292,
"learning_rate": 3.467011642949548e-05,
"loss": 4.8136,
"step": 340
},
{
"epoch": 1.2027491408934707,
"grad_norm": 8.170726776123047,
"learning_rate": 3.4023285899094434e-05,
"loss": 4.7125,
"step": 350
},
{
"epoch": 1.2371134020618557,
"grad_norm": 8.133183479309082,
"learning_rate": 3.33764553686934e-05,
"loss": 4.8097,
"step": 360
},
{
"epoch": 1.2714776632302405,
"grad_norm": 8.4751615524292,
"learning_rate": 3.2729624838292365e-05,
"loss": 4.76,
"step": 370
},
{
"epoch": 1.3058419243986255,
"grad_norm": 8.671772003173828,
"learning_rate": 3.2082794307891334e-05,
"loss": 4.6785,
"step": 380
},
{
"epoch": 1.3402061855670104,
"grad_norm": 9.14929485321045,
"learning_rate": 3.14359637774903e-05,
"loss": 4.6647,
"step": 390
},
{
"epoch": 1.3745704467353952,
"grad_norm": 7.652307033538818,
"learning_rate": 3.0789133247089265e-05,
"loss": 4.7723,
"step": 400
},
{
"epoch": 1.40893470790378,
"grad_norm": 8.373456001281738,
"learning_rate": 3.014230271668823e-05,
"loss": 4.6346,
"step": 410
},
{
"epoch": 1.443298969072165,
"grad_norm": 8.60721492767334,
"learning_rate": 2.9495472186287193e-05,
"loss": 4.7384,
"step": 420
},
{
"epoch": 1.47766323024055,
"grad_norm": 8.466670036315918,
"learning_rate": 2.8848641655886162e-05,
"loss": 4.8056,
"step": 430
},
{
"epoch": 1.5120274914089347,
"grad_norm": 8.389398574829102,
"learning_rate": 2.8201811125485124e-05,
"loss": 4.691,
"step": 440
},
{
"epoch": 1.5463917525773194,
"grad_norm": 8.483616828918457,
"learning_rate": 2.755498059508409e-05,
"loss": 4.7388,
"step": 450
},
{
"epoch": 1.5807560137457046,
"grad_norm": 8.77346134185791,
"learning_rate": 2.6908150064683052e-05,
"loss": 4.7401,
"step": 460
},
{
"epoch": 1.6151202749140894,
"grad_norm": 9.041746139526367,
"learning_rate": 2.626131953428202e-05,
"loss": 4.7421,
"step": 470
},
{
"epoch": 1.6494845360824741,
"grad_norm": 8.592238426208496,
"learning_rate": 2.5614489003880986e-05,
"loss": 4.7643,
"step": 480
},
{
"epoch": 1.6838487972508591,
"grad_norm": 7.6761956214904785,
"learning_rate": 2.496765847347995e-05,
"loss": 4.7175,
"step": 490
},
{
"epoch": 1.718213058419244,
"grad_norm": 9.062220573425293,
"learning_rate": 2.4320827943078914e-05,
"loss": 4.7023,
"step": 500
},
{
"epoch": 1.7525773195876289,
"grad_norm": 8.060118675231934,
"learning_rate": 2.367399741267788e-05,
"loss": 4.7484,
"step": 510
},
{
"epoch": 1.7869415807560136,
"grad_norm": 8.655328750610352,
"learning_rate": 2.3027166882276842e-05,
"loss": 4.7492,
"step": 520
},
{
"epoch": 1.8213058419243986,
"grad_norm": 7.456566333770752,
"learning_rate": 2.238033635187581e-05,
"loss": 4.7337,
"step": 530
},
{
"epoch": 1.8556701030927836,
"grad_norm": 8.35741138458252,
"learning_rate": 2.1733505821474777e-05,
"loss": 4.8222,
"step": 540
},
{
"epoch": 1.8900343642611683,
"grad_norm": 8.883995056152344,
"learning_rate": 2.108667529107374e-05,
"loss": 4.7534,
"step": 550
},
{
"epoch": 1.9243986254295533,
"grad_norm": 8.559647560119629,
"learning_rate": 2.0439844760672704e-05,
"loss": 4.6819,
"step": 560
},
{
"epoch": 1.9587628865979383,
"grad_norm": 8.394923210144043,
"learning_rate": 1.979301423027167e-05,
"loss": 4.6777,
"step": 570
},
{
"epoch": 1.993127147766323,
"grad_norm": 9.052810668945312,
"learning_rate": 1.9146183699870636e-05,
"loss": 4.7,
"step": 580
},
{
"epoch": 2.027491408934708,
"grad_norm": 9.817968368530273,
"learning_rate": 1.8499353169469598e-05,
"loss": 4.0928,
"step": 590
},
{
"epoch": 2.0618556701030926,
"grad_norm": 11.832119941711426,
"learning_rate": 1.7852522639068563e-05,
"loss": 3.8996,
"step": 600
},
{
"epoch": 2.0962199312714778,
"grad_norm": 11.534992218017578,
"learning_rate": 1.720569210866753e-05,
"loss": 3.8028,
"step": 610
},
{
"epoch": 2.1305841924398625,
"grad_norm": 11.591385841369629,
"learning_rate": 1.6558861578266498e-05,
"loss": 3.7741,
"step": 620
},
{
"epoch": 2.1649484536082473,
"grad_norm": 11.821063995361328,
"learning_rate": 1.591203104786546e-05,
"loss": 3.756,
"step": 630
},
{
"epoch": 2.1993127147766325,
"grad_norm": 12.23161792755127,
"learning_rate": 1.5265200517464426e-05,
"loss": 3.7957,
"step": 640
},
{
"epoch": 2.2336769759450172,
"grad_norm": 12.386809349060059,
"learning_rate": 1.4618369987063391e-05,
"loss": 3.7536,
"step": 650
},
{
"epoch": 2.268041237113402,
"grad_norm": 12.17062759399414,
"learning_rate": 1.3971539456662355e-05,
"loss": 3.7806,
"step": 660
},
{
"epoch": 2.3024054982817868,
"grad_norm": 15.317811012268066,
"learning_rate": 1.332470892626132e-05,
"loss": 3.794,
"step": 670
},
{
"epoch": 2.336769759450172,
"grad_norm": 11.287023544311523,
"learning_rate": 1.2677878395860285e-05,
"loss": 3.7329,
"step": 680
},
{
"epoch": 2.3711340206185567,
"grad_norm": 11.834717750549316,
"learning_rate": 1.203104786545925e-05,
"loss": 3.8154,
"step": 690
},
{
"epoch": 2.4054982817869415,
"grad_norm": 12.480812072753906,
"learning_rate": 1.1384217335058216e-05,
"loss": 3.7529,
"step": 700
},
{
"epoch": 2.4398625429553267,
"grad_norm": 11.966567039489746,
"learning_rate": 1.073738680465718e-05,
"loss": 3.8161,
"step": 710
},
{
"epoch": 2.4742268041237114,
"grad_norm": 11.70124340057373,
"learning_rate": 1.0090556274256145e-05,
"loss": 3.8291,
"step": 720
},
{
"epoch": 2.508591065292096,
"grad_norm": 10.707592010498047,
"learning_rate": 9.44372574385511e-06,
"loss": 3.8132,
"step": 730
},
{
"epoch": 2.542955326460481,
"grad_norm": 11.80911922454834,
"learning_rate": 8.796895213454076e-06,
"loss": 3.8293,
"step": 740
},
{
"epoch": 2.5773195876288657,
"grad_norm": 11.74314022064209,
"learning_rate": 8.15006468305304e-06,
"loss": 3.7905,
"step": 750
},
{
"epoch": 2.611683848797251,
"grad_norm": 12.890970230102539,
"learning_rate": 7.503234152652006e-06,
"loss": 3.7617,
"step": 760
},
{
"epoch": 2.6460481099656357,
"grad_norm": 12.482461929321289,
"learning_rate": 6.856403622250971e-06,
"loss": 3.7767,
"step": 770
},
{
"epoch": 2.680412371134021,
"grad_norm": 12.128081321716309,
"learning_rate": 6.2095730918499354e-06,
"loss": 3.8105,
"step": 780
},
{
"epoch": 2.7147766323024056,
"grad_norm": 11.817726135253906,
"learning_rate": 5.5627425614489e-06,
"loss": 3.7593,
"step": 790
},
{
"epoch": 2.7491408934707904,
"grad_norm": 13.080018043518066,
"learning_rate": 4.915912031047866e-06,
"loss": 3.8518,
"step": 800
},
{
"epoch": 2.783505154639175,
"grad_norm": 11.133337020874023,
"learning_rate": 4.2690815006468305e-06,
"loss": 3.7509,
"step": 810
},
{
"epoch": 2.81786941580756,
"grad_norm": 11.332104682922363,
"learning_rate": 3.6222509702457957e-06,
"loss": 3.76,
"step": 820
},
{
"epoch": 2.852233676975945,
"grad_norm": 12.596721649169922,
"learning_rate": 2.975420439844761e-06,
"loss": 3.7612,
"step": 830
},
{
"epoch": 2.88659793814433,
"grad_norm": 11.145185470581055,
"learning_rate": 2.328589909443726e-06,
"loss": 3.6734,
"step": 840
},
{
"epoch": 2.9209621993127146,
"grad_norm": 13.382460594177246,
"learning_rate": 1.6817593790426907e-06,
"loss": 3.7865,
"step": 850
},
{
"epoch": 2.9553264604811,
"grad_norm": 11.197080612182617,
"learning_rate": 1.0349288486416561e-06,
"loss": 3.7716,
"step": 860
},
{
"epoch": 2.9896907216494846,
"grad_norm": 12.549001693725586,
"learning_rate": 3.8809831824062096e-07,
"loss": 3.7189,
"step": 870
},
{
"epoch": 3.0,
"step": 873,
"total_flos": 0.0,
"train_loss": 4.699677870445645,
"train_runtime": 1249.7659,
"train_samples_per_second": 5.581,
"train_steps_per_second": 0.699
}
],
"logging_steps": 10,
"max_steps": 873,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 4000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}