{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8,
"eval_steps": 100,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0125,
"grad_norm": 0.08136817067861557,
"learning_rate": 0.0004,
"loss": 1.158,
"step": 25
},
{
"epoch": 0.025,
"grad_norm": 0.0653829574584961,
"learning_rate": 0.0004998852503731983,
"loss": 1.0957,
"step": 50
},
{
"epoch": 0.0375,
"grad_norm": 0.11592712253332138,
"learning_rate": 0.0004993848168027977,
"loss": 0.9276,
"step": 75
},
{
"epoch": 0.05,
"grad_norm": 0.08926476538181305,
"learning_rate": 0.0004984880506341147,
"loss": 1.0337,
"step": 100
},
{
"epoch": 0.05,
"eval_loss": 0.9111642837524414,
"eval_runtime": 843.8058,
"eval_samples_per_second": 1.3,
"eval_steps_per_second": 0.021,
"step": 100
},
{
"epoch": 0.0625,
"grad_norm": 0.08632034063339233,
"learning_rate": 0.0004971963770447935,
"loss": 1.0219,
"step": 125
},
{
"epoch": 0.075,
"grad_norm": 0.12949417531490326,
"learning_rate": 0.0004955118488155782,
"loss": 0.784,
"step": 150
},
{
"epoch": 0.0875,
"grad_norm": 0.09222520887851715,
"learning_rate": 0.0004934371430679492,
"loss": 1.0,
"step": 175
},
{
"epoch": 0.1,
"grad_norm": 0.10075929015874863,
"learning_rate": 0.0004909755570095319,
"loss": 0.9617,
"step": 200
},
{
"epoch": 0.1,
"eval_loss": 0.9380430579185486,
"eval_runtime": 856.156,
"eval_samples_per_second": 1.281,
"eval_steps_per_second": 0.021,
"step": 200
},
{
"epoch": 0.1125,
"grad_norm": 0.09567111730575562,
"learning_rate": 0.0004881310026940389,
"loss": 0.7051,
"step": 225
},
{
"epoch": 0.125,
"grad_norm": 0.09723825007677078,
"learning_rate": 0.0004849080008040734,
"loss": 0.9906,
"step": 250
},
{
"epoch": 0.1375,
"grad_norm": 0.09430444985628128,
"learning_rate": 0.00048131167346667446,
"loss": 0.9113,
"step": 275
},
{
"epoch": 0.15,
"grad_norm": 0.09439756721258163,
"learning_rate": 0.00047734773611302284,
"loss": 0.674,
"step": 300
},
{
"epoch": 0.15,
"eval_loss": 0.9544369578361511,
"eval_runtime": 850.9461,
"eval_samples_per_second": 1.289,
"eval_steps_per_second": 0.021,
"step": 300
},
{
"epoch": 0.1625,
"grad_norm": 0.10961435735225677,
"learning_rate": 0.0004730224883952422,
"loss": 0.9701,
"step": 325
},
{
"epoch": 0.175,
"grad_norm": 0.09628895670175552,
"learning_rate": 0.0004683428041747334,
"loss": 0.8976,
"step": 350
},
{
"epoch": 0.1875,
"grad_norm": 0.11085063964128494,
"learning_rate": 0.0004633161205979517,
"loss": 0.6683,
"step": 375
},
{
"epoch": 0.2,
"grad_norm": 0.10140710324048996,
"learning_rate": 0.0004579504262769877,
"loss": 0.9373,
"step": 400
},
{
"epoch": 0.2,
"eval_loss": 0.968166172504425,
"eval_runtime": 853.0258,
"eval_samples_per_second": 1.286,
"eval_steps_per_second": 0.021,
"step": 400
},
{
"epoch": 0.2125,
"grad_norm": 0.13868793845176697,
"learning_rate": 0.0004522542485937369,
"loss": 0.8822,
"step": 425
},
{
"epoch": 0.225,
"grad_norm": 0.11853040754795074,
"learning_rate": 0.00044623664014783386,
"loss": 0.6483,
"step": 450
},
{
"epoch": 0.2375,
"grad_norm": 0.11668186634778976,
"learning_rate": 0.00043990716436988924,
"loss": 0.9374,
"step": 475
},
{
"epoch": 0.25,
"grad_norm": 0.11911585181951523,
"learning_rate": 0.0004332758803228925,
"loss": 0.8434,
"step": 500
},
{
"epoch": 0.25,
"eval_loss": 0.9753186702728271,
"eval_runtime": 851.2812,
"eval_samples_per_second": 1.289,
"eval_steps_per_second": 0.021,
"step": 500
},
{
"epoch": 0.2625,
"grad_norm": 0.09807440638542175,
"learning_rate": 0.00042635332671593575,
"loss": 0.6661,
"step": 525
},
{
"epoch": 0.275,
"grad_norm": 0.09675773978233337,
"learning_rate": 0.00041915050515566445,
"loss": 0.8999,
"step": 550
},
{
"epoch": 0.2875,
"grad_norm": 0.0979737937450409,
"learning_rate": 0.00041167886266207167,
"loss": 0.8616,
"step": 575
},
{
"epoch": 0.3,
"grad_norm": 0.08990875631570816,
"learning_rate": 0.0004039502734764241,
"loss": 0.8167,
"step": 600
},
{
"epoch": 0.3,
"eval_loss": 0.9926204681396484,
"eval_runtime": 851.6484,
"eval_samples_per_second": 1.288,
"eval_steps_per_second": 0.021,
"step": 600
},
{
"epoch": 0.3125,
"grad_norm": 0.09505137801170349,
"learning_rate": 0.0003959770201902294,
"loss": 0.8191,
"step": 625
},
{
"epoch": 0.325,
"grad_norm": 0.09105059504508972,
"learning_rate": 0.0003877717742252371,
"loss": 0.7203,
"step": 650
},
{
"epoch": 0.3375,
"grad_norm": 0.09274734556674957,
"learning_rate": 0.00037934757569549495,
"loss": 0.8154,
"step": 675
},
{
"epoch": 0.35,
"grad_norm": 0.10026416182518005,
"learning_rate": 0.00037071781268346345,
"loss": 0.8336,
"step": 700
},
{
"epoch": 0.35,
"eval_loss": 0.9959912896156311,
"eval_runtime": 849.7662,
"eval_samples_per_second": 1.291,
"eval_steps_per_second": 0.021,
"step": 700
},
{
"epoch": 0.3625,
"grad_norm": 0.09517039358615875,
"learning_rate": 0.00036189619996312495,
"loss": 0.6845,
"step": 725
},
{
"epoch": 0.375,
"grad_norm": 0.11264201998710632,
"learning_rate": 0.00035289675720390174,
"loss": 0.8445,
"step": 750
},
{
"epoch": 0.3875,
"grad_norm": 0.0979958102107048,
"learning_rate": 0.00034373378669002105,
"loss": 0.8269,
"step": 775
},
{
"epoch": 0.4,
"grad_norm": 0.10223093628883362,
"learning_rate": 0.00033442185059073706,
"loss": 0.6517,
"step": 800
},
{
"epoch": 0.4,
"eval_loss": 1.006990671157837,
"eval_runtime": 845.1184,
"eval_samples_per_second": 1.298,
"eval_steps_per_second": 0.021,
"step": 800
},
{
"epoch": 0.4125,
"grad_norm": 0.09464213252067566,
"learning_rate": 0.00032497574781753367,
"loss": 0.8455,
"step": 825
},
{
"epoch": 0.425,
"grad_norm": 0.10187377035617828,
"learning_rate": 0.000315410490505086,
"loss": 0.8217,
"step": 850
},
{
"epoch": 0.4375,
"grad_norm": 0.09793733805418015,
"learning_rate": 0.0003057412801533589,
"loss": 0.6218,
"step": 875
},
{
"epoch": 0.45,
"grad_norm": 0.09902466833591461,
"learning_rate": 0.0002959834834687587,
"loss": 0.869,
"step": 900
},
{
"epoch": 0.45,
"eval_loss": 1.0097707509994507,
"eval_runtime": 847.3981,
"eval_samples_per_second": 1.295,
"eval_steps_per_second": 0.021,
"step": 900
},
{
"epoch": 0.4625,
"grad_norm": 0.10546339303255081,
"learning_rate": 0.00028615260794273236,
"loss": 0.8204,
"step": 925
},
{
"epoch": 0.475,
"grad_norm": 0.10600671917200089,
"learning_rate": 0.00027626427720662416,
"loss": 0.5917,
"step": 950
},
{
"epoch": 0.4875,
"grad_norm": 0.09627766162157059,
"learning_rate": 0.00026633420620195917,
"loss": 0.8667,
"step": 975
},
{
"epoch": 0.5,
"grad_norm": 0.09961821138858795,
"learning_rate": 0.00025637817620561263,
"loss": 0.8215,
"step": 1000
},
{
"epoch": 0.5,
"eval_loss": 1.023415446281433,
"eval_runtime": 844.006,
"eval_samples_per_second": 1.3,
"eval_steps_per_second": 0.021,
"step": 1000
},
{
"epoch": 0.5125,
"grad_norm": 0.09977111220359802,
"learning_rate": 0.0002464120097495559,
"loss": 0.5829,
"step": 1025
},
{
"epoch": 0.525,
"grad_norm": 0.10276953876018524,
"learning_rate": 0.00023645154547503855,
"loss": 0.8857,
"step": 1050
},
{
"epoch": 0.5375,
"grad_norm": 0.10077586770057678,
"learning_rate": 0.00022651261296116894,
"loss": 0.8018,
"step": 1075
},
{
"epoch": 0.55,
"grad_norm": 0.09652134776115417,
"learning_rate": 0.00021661100756789666,
"loss": 0.561,
"step": 1100
},
{
"epoch": 0.55,
"eval_loss": 1.0231441259384155,
"eval_runtime": 845.0138,
"eval_samples_per_second": 1.298,
"eval_steps_per_second": 0.021,
"step": 1100
},
{
"epoch": 0.5625,
"grad_norm": 0.0990302637219429,
"learning_rate": 0.00020676246533337764,
"loss": 0.9092,
"step": 1125
},
{
"epoch": 0.575,
"grad_norm": 0.11152709275484085,
"learning_rate": 0.00019698263796561526,
"loss": 0.8159,
"step": 1150
},
{
"epoch": 0.5875,
"grad_norm": 0.10092920064926147,
"learning_rate": 0.00018728706796812333,
"loss": 0.7329,
"step": 1175
},
{
"epoch": 0.6,
"grad_norm": 0.10138432681560516,
"learning_rate": 0.00017769116393914037,
"loss": 0.7873,
"step": 1200
},
{
"epoch": 0.6,
"eval_loss": 1.0180704593658447,
"eval_runtime": 843.0775,
"eval_samples_per_second": 1.301,
"eval_steps_per_second": 0.021,
"step": 1200
},
{
"epoch": 0.6125,
"grad_norm": 0.10279645770788193,
"learning_rate": 0.00016821017608365264,
"loss": 0.7186,
"step": 1225
},
{
"epoch": 0.625,
"grad_norm": 0.10766831040382385,
"learning_rate": 0.00015885917197714112,
"loss": 0.7201,
"step": 1250
},
{
"epoch": 0.6375,
"grad_norm": 0.10177863389253616,
"learning_rate": 0.00014965301261957238,
"loss": 0.8009,
"step": 1275
},
{
"epoch": 0.65,
"grad_norm": 0.0983508974313736,
"learning_rate": 0.00014060632881768558,
"loss": 0.7023,
"step": 1300
},
{
"epoch": 0.65,
"eval_loss": 1.0266674757003784,
"eval_runtime": 854.7884,
"eval_samples_per_second": 1.283,
"eval_steps_per_second": 0.021,
"step": 1300
},
{
"epoch": 0.6625,
"grad_norm": 0.10579918324947357,
"learning_rate": 0.00013173349793311424,
"loss": 0.7624,
"step": 1325
},
{
"epoch": 0.675,
"grad_norm": 0.10350169986486435,
"learning_rate": 0.0001230486210332916,
"loss": 0.7857,
"step": 1350
},
{
"epoch": 0.6875,
"grad_norm": 0.10701841115951538,
"learning_rate": 0.00011456550048145536,
"loss": 0.6771,
"step": 1375
},
{
"epoch": 0.7,
"grad_norm": 0.10333641618490219,
"learning_rate": 0.00010629761800136473,
"loss": 0.7669,
"step": 1400
},
{
"epoch": 0.7,
"eval_loss": 1.0311139822006226,
"eval_runtime": 850.5438,
"eval_samples_per_second": 1.29,
"eval_steps_per_second": 0.021,
"step": 1400
},
{
"epoch": 0.7125,
"grad_norm": 0.10531915724277496,
"learning_rate": 9.82581132515907e-05,
"loss": 0.7869,
"step": 1425
},
{
"epoch": 0.725,
"grad_norm": 0.08779594302177429,
"learning_rate": 9.045976294343145e-05,
"loss": 0.6651,
"step": 1450
},
{
"epoch": 0.7375,
"grad_norm": 0.09893123060464859,
"learning_rate": 8.291496053563699e-05,
"loss": 0.7938,
"step": 1475
},
{
"epoch": 0.75,
"grad_norm": 0.09708540141582489,
"learning_rate": 7.563569653821565e-05,
"loss": 0.7873,
"step": 1500
},
{
"epoch": 0.75,
"eval_loss": 1.0259246826171875,
"eval_runtime": 843.8488,
"eval_samples_per_second": 1.3,
"eval_steps_per_second": 0.021,
"step": 1500
},
{
"epoch": 0.7625,
"grad_norm": 0.1001739501953125,
"learning_rate": 6.863353945662288e-05,
"loss": 0.6202,
"step": 1525
},
{
"epoch": 0.775,
"grad_norm": 0.10724864155054092,
"learning_rate": 6.191961740661687e-05,
"loss": 0.8107,
"step": 1550
},
{
"epoch": 0.7875,
"grad_norm": 0.10225515067577362,
"learning_rate": 5.550460042899982e-05,
"loss": 0.8041,
"step": 1575
},
{
"epoch": 0.8,
"grad_norm": 0.09861259162425995,
"learning_rate": 4.9398683532350855e-05,
"loss": 0.5894,
"step": 1600
},
{
"epoch": 0.8,
"eval_loss": 1.0317810773849487,
"eval_runtime": 852.0414,
"eval_samples_per_second": 1.287,
"eval_steps_per_second": 0.021,
"step": 1600
}
],
"logging_steps": 25,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.608283832933417e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}