test / trainer_state.json
freQuensy23's picture
Upload 7 files
a1c53d5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.979827089337174,
"eval_steps": 1200,
"global_step": 21500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.7204610951008645,
"grad_norm": 0.19696219265460968,
"learning_rate": 0.0004967269843558504,
"loss": 0.115,
"step": 500
},
{
"epoch": 1.440922190201729,
"grad_norm": 0.18757623434066772,
"learning_rate": 0.0004931223415759498,
"loss": 0.0605,
"step": 1000
},
{
"epoch": 1.729106628242075,
"eval_loss": 0.051301125437021255,
"eval_runtime": 20.8394,
"eval_samples_per_second": 111.039,
"eval_steps_per_second": 0.096,
"step": 1200
},
{
"epoch": 2.161383285302594,
"grad_norm": 0.23383216559886932,
"learning_rate": 0.0004895176987960493,
"loss": 0.06,
"step": 1500
},
{
"epoch": 2.881844380403458,
"grad_norm": 0.8578475713729858,
"learning_rate": 0.00048591305601614884,
"loss": 0.045,
"step": 2000
},
{
"epoch": 3.4582132564841497,
"eval_loss": 0.048763249069452286,
"eval_runtime": 21.4687,
"eval_samples_per_second": 107.785,
"eval_steps_per_second": 0.093,
"step": 2400
},
{
"epoch": 3.602305475504323,
"grad_norm": 0.5358479619026184,
"learning_rate": 0.0004823084132362483,
"loss": 0.034,
"step": 2500
},
{
"epoch": 4.322766570605188,
"grad_norm": 0.3984196186065674,
"learning_rate": 0.0004787037704563478,
"loss": 0.0278,
"step": 3000
},
{
"epoch": 5.043227665706052,
"grad_norm": 0.5611603856086731,
"learning_rate": 0.00047509912767644725,
"loss": 0.021,
"step": 3500
},
{
"epoch": 5.187319884726225,
"eval_loss": 0.05793336406350136,
"eval_runtime": 21.4122,
"eval_samples_per_second": 108.069,
"eval_steps_per_second": 0.093,
"step": 3600
},
{
"epoch": 5.763688760806916,
"grad_norm": 0.33251306414604187,
"learning_rate": 0.0004714944848965468,
"loss": 0.019,
"step": 4000
},
{
"epoch": 6.484149855907781,
"grad_norm": 0.5683927536010742,
"learning_rate": 0.00046788984211664625,
"loss": 0.0168,
"step": 4500
},
{
"epoch": 6.916426512968299,
"eval_loss": 0.047444652765989304,
"eval_runtime": 21.5287,
"eval_samples_per_second": 107.484,
"eval_steps_per_second": 0.093,
"step": 4800
},
{
"epoch": 7.204610951008646,
"grad_norm": 1.6696492433547974,
"learning_rate": 0.0004642851993367457,
"loss": 0.0153,
"step": 5000
},
{
"epoch": 7.92507204610951,
"grad_norm": 0.6783491373062134,
"learning_rate": 0.0004606805565568452,
"loss": 0.0116,
"step": 5500
},
{
"epoch": 8.645533141210375,
"grad_norm": 0.4771524667739868,
"learning_rate": 0.0004570759137769447,
"loss": 0.0118,
"step": 6000
},
{
"epoch": 8.645533141210375,
"eval_loss": 0.06448203325271606,
"eval_runtime": 20.7245,
"eval_samples_per_second": 111.655,
"eval_steps_per_second": 0.097,
"step": 6000
},
{
"epoch": 9.36599423631124,
"grad_norm": 0.45867717266082764,
"learning_rate": 0.0004534712709970442,
"loss": 0.0095,
"step": 6500
},
{
"epoch": 10.086455331412104,
"grad_norm": 1.0143071413040161,
"learning_rate": 0.0004498666282171437,
"loss": 0.0081,
"step": 7000
},
{
"epoch": 10.37463976945245,
"eval_loss": 0.059642400592565536,
"eval_runtime": 20.5013,
"eval_samples_per_second": 112.871,
"eval_steps_per_second": 0.098,
"step": 7200
},
{
"epoch": 10.806916426512968,
"grad_norm": 0.34545987844467163,
"learning_rate": 0.0004462619854372432,
"loss": 0.0077,
"step": 7500
},
{
"epoch": 11.527377521613833,
"grad_norm": 0.6745367050170898,
"learning_rate": 0.00044265734265734266,
"loss": 0.0073,
"step": 8000
},
{
"epoch": 12.103746397694524,
"eval_loss": 0.057360123842954636,
"eval_runtime": 21.5407,
"eval_samples_per_second": 107.425,
"eval_steps_per_second": 0.093,
"step": 8400
},
{
"epoch": 12.247838616714697,
"grad_norm": 0.3190229535102844,
"learning_rate": 0.0004390526998774422,
"loss": 0.0065,
"step": 8500
},
{
"epoch": 12.968299711815561,
"grad_norm": 0.20763935148715973,
"learning_rate": 0.00043544805709754166,
"loss": 0.0064,
"step": 9000
},
{
"epoch": 13.688760806916427,
"grad_norm": 0.11372426152229309,
"learning_rate": 0.00043184341431764113,
"loss": 0.0059,
"step": 9500
},
{
"epoch": 13.832853025936599,
"eval_loss": 0.08313994109630585,
"eval_runtime": 21.5595,
"eval_samples_per_second": 107.331,
"eval_steps_per_second": 0.093,
"step": 9600
},
{
"epoch": 14.409221902017292,
"grad_norm": 0.6901423335075378,
"learning_rate": 0.0004282387715377406,
"loss": 0.0055,
"step": 10000
},
{
"epoch": 15.129682997118156,
"grad_norm": 0.5882952213287354,
"learning_rate": 0.0004246341287578401,
"loss": 0.005,
"step": 10500
},
{
"epoch": 15.561959654178674,
"eval_loss": 0.06821350008249283,
"eval_runtime": 20.3166,
"eval_samples_per_second": 113.897,
"eval_steps_per_second": 0.098,
"step": 10800
},
{
"epoch": 15.85014409221902,
"grad_norm": 0.4642440676689148,
"learning_rate": 0.0004210294859779396,
"loss": 0.0049,
"step": 11000
},
{
"epoch": 16.570605187319885,
"grad_norm": 0.9032358527183533,
"learning_rate": 0.00041742484319803907,
"loss": 0.0048,
"step": 11500
},
{
"epoch": 17.29106628242075,
"grad_norm": 0.5521640777587891,
"learning_rate": 0.00041382020041813854,
"loss": 0.0046,
"step": 12000
},
{
"epoch": 17.29106628242075,
"eval_loss": 0.08423992991447449,
"eval_runtime": 21.1812,
"eval_samples_per_second": 109.248,
"eval_steps_per_second": 0.094,
"step": 12000
},
{
"epoch": 18.011527377521613,
"grad_norm": 0.7376463413238525,
"learning_rate": 0.000410215557638238,
"loss": 0.0044,
"step": 12500
},
{
"epoch": 18.73198847262248,
"grad_norm": 1.1471983194351196,
"learning_rate": 0.0004066109148583376,
"loss": 0.0045,
"step": 13000
},
{
"epoch": 19.020172910662826,
"eval_loss": 0.07880275696516037,
"eval_runtime": 21.5701,
"eval_samples_per_second": 107.278,
"eval_steps_per_second": 0.093,
"step": 13200
},
{
"epoch": 19.45244956772334,
"grad_norm": 0.053835347294807434,
"learning_rate": 0.00040300627207843706,
"loss": 0.0041,
"step": 13500
},
{
"epoch": 20.172910662824208,
"grad_norm": 0.7777488231658936,
"learning_rate": 0.00039940162929853653,
"loss": 0.0042,
"step": 14000
},
{
"epoch": 20.7492795389049,
"eval_loss": 0.062229253351688385,
"eval_runtime": 20.4938,
"eval_samples_per_second": 112.912,
"eval_steps_per_second": 0.098,
"step": 14400
},
{
"epoch": 20.89337175792507,
"grad_norm": 0.14320553839206696,
"learning_rate": 0.000395796986518636,
"loss": 0.004,
"step": 14500
},
{
"epoch": 21.613832853025936,
"grad_norm": 0.3327866494655609,
"learning_rate": 0.00039219234373873553,
"loss": 0.004,
"step": 15000
},
{
"epoch": 22.334293948126803,
"grad_norm": 0.29509493708610535,
"learning_rate": 0.000388587700958835,
"loss": 0.0037,
"step": 15500
},
{
"epoch": 22.478386167146976,
"eval_loss": 0.07450389117002487,
"eval_runtime": 21.8716,
"eval_samples_per_second": 105.799,
"eval_steps_per_second": 0.091,
"step": 15600
},
{
"epoch": 23.054755043227665,
"grad_norm": 0.5017435550689697,
"learning_rate": 0.00038498305817893447,
"loss": 0.0038,
"step": 16000
},
{
"epoch": 23.77521613832853,
"grad_norm": 0.05931377038359642,
"learning_rate": 0.00038137841539903394,
"loss": 0.0038,
"step": 16500
},
{
"epoch": 24.207492795389047,
"eval_loss": 0.09549176692962646,
"eval_runtime": 21.5513,
"eval_samples_per_second": 107.372,
"eval_steps_per_second": 0.093,
"step": 16800
},
{
"epoch": 24.495677233429394,
"grad_norm": 0.13349242508411407,
"learning_rate": 0.0003777737726191334,
"loss": 0.0034,
"step": 17000
},
{
"epoch": 25.21613832853026,
"grad_norm": 0.19320227205753326,
"learning_rate": 0.00037416912983923294,
"loss": 0.0034,
"step": 17500
},
{
"epoch": 25.936599423631122,
"grad_norm": 0.24608492851257324,
"learning_rate": 0.0003705644870593324,
"loss": 0.0034,
"step": 18000
},
{
"epoch": 25.936599423631122,
"eval_loss": 0.10036125034093857,
"eval_runtime": 22.0387,
"eval_samples_per_second": 104.997,
"eval_steps_per_second": 0.091,
"step": 18000
},
{
"epoch": 26.65706051873199,
"grad_norm": 0.11887585371732712,
"learning_rate": 0.0003669598442794319,
"loss": 0.0033,
"step": 18500
},
{
"epoch": 27.377521613832855,
"grad_norm": 0.5103694796562195,
"learning_rate": 0.0003633552014995314,
"loss": 0.0031,
"step": 19000
},
{
"epoch": 27.665706051873197,
"eval_loss": 0.0853080227971077,
"eval_runtime": 21.6671,
"eval_samples_per_second": 106.798,
"eval_steps_per_second": 0.092,
"step": 19200
},
{
"epoch": 28.097982708933717,
"grad_norm": 0.9122279286384583,
"learning_rate": 0.00035975055871963093,
"loss": 0.0034,
"step": 19500
},
{
"epoch": 28.818443804034583,
"grad_norm": 0.028490234166383743,
"learning_rate": 0.0003561459159397304,
"loss": 0.0035,
"step": 20000
},
{
"epoch": 29.394812680115272,
"eval_loss": 0.05787323787808418,
"eval_runtime": 21.1854,
"eval_samples_per_second": 109.226,
"eval_steps_per_second": 0.094,
"step": 20400
},
{
"epoch": 29.538904899135446,
"grad_norm": 0.32352131605148315,
"learning_rate": 0.0003525412731598299,
"loss": 0.0036,
"step": 20500
},
{
"epoch": 30.259365994236312,
"grad_norm": 0.43146830797195435,
"learning_rate": 0.00034893663037992935,
"loss": 0.0032,
"step": 21000
},
{
"epoch": 30.979827089337174,
"grad_norm": 0.22915582358837128,
"learning_rate": 0.0003453319876000288,
"loss": 0.0026,
"step": 21500
}
],
"logging_steps": 500,
"max_steps": 69400,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.477693523839612e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}