cterdam's picture
Upload folder using huggingface_hub
1ccb927 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"grad_norm": 3.5600266456604004,
"learning_rate": 1.9333333333333333e-05,
"loss": 1.6248,
"step": 100
},
{
"epoch": 0.07,
"eval_loss": 1.5200175046920776,
"eval_runtime": 39.9101,
"eval_samples_per_second": 25.056,
"eval_steps_per_second": 3.132,
"step": 100
},
{
"epoch": 0.13,
"grad_norm": 2.83909273147583,
"learning_rate": 1.866666666666667e-05,
"loss": 1.5193,
"step": 200
},
{
"epoch": 0.13,
"eval_loss": 1.5256463289260864,
"eval_runtime": 39.9255,
"eval_samples_per_second": 25.047,
"eval_steps_per_second": 3.131,
"step": 200
},
{
"epoch": 0.2,
"grad_norm": 2.1943609714508057,
"learning_rate": 1.8e-05,
"loss": 1.4986,
"step": 300
},
{
"epoch": 0.2,
"eval_loss": 1.5103366374969482,
"eval_runtime": 39.9391,
"eval_samples_per_second": 25.038,
"eval_steps_per_second": 3.13,
"step": 300
},
{
"epoch": 0.27,
"grad_norm": 3.015270709991455,
"learning_rate": 1.7333333333333336e-05,
"loss": 1.4868,
"step": 400
},
{
"epoch": 0.27,
"eval_loss": 1.4908276796340942,
"eval_runtime": 39.9824,
"eval_samples_per_second": 25.011,
"eval_steps_per_second": 3.126,
"step": 400
},
{
"epoch": 0.33,
"grad_norm": 3.107252836227417,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.4795,
"step": 500
},
{
"epoch": 0.33,
"eval_loss": 1.459694743156433,
"eval_runtime": 39.9779,
"eval_samples_per_second": 25.014,
"eval_steps_per_second": 3.127,
"step": 500
},
{
"epoch": 0.4,
"grad_norm": 3.612938642501831,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.4927,
"step": 600
},
{
"epoch": 0.4,
"eval_loss": 1.496130347251892,
"eval_runtime": 39.9928,
"eval_samples_per_second": 25.004,
"eval_steps_per_second": 3.126,
"step": 600
},
{
"epoch": 0.47,
"grad_norm": 2.127037763595581,
"learning_rate": 1.5333333333333334e-05,
"loss": 1.4549,
"step": 700
},
{
"epoch": 0.47,
"eval_loss": 1.4545286893844604,
"eval_runtime": 39.9652,
"eval_samples_per_second": 25.022,
"eval_steps_per_second": 3.128,
"step": 700
},
{
"epoch": 0.53,
"grad_norm": 2.8459372520446777,
"learning_rate": 1.4666666666666666e-05,
"loss": 1.4764,
"step": 800
},
{
"epoch": 0.53,
"eval_loss": 1.472076177597046,
"eval_runtime": 39.984,
"eval_samples_per_second": 25.01,
"eval_steps_per_second": 3.126,
"step": 800
},
{
"epoch": 0.6,
"grad_norm": 2.973015546798706,
"learning_rate": 1.4e-05,
"loss": 1.4576,
"step": 900
},
{
"epoch": 0.6,
"eval_loss": 1.4507238864898682,
"eval_runtime": 39.9652,
"eval_samples_per_second": 25.022,
"eval_steps_per_second": 3.128,
"step": 900
},
{
"epoch": 0.67,
"grad_norm": 3.1685054302215576,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.4547,
"step": 1000
},
{
"epoch": 0.67,
"eval_loss": 1.4341034889221191,
"eval_runtime": 39.9594,
"eval_samples_per_second": 25.025,
"eval_steps_per_second": 3.128,
"step": 1000
},
{
"epoch": 0.73,
"grad_norm": 2.5751569271087646,
"learning_rate": 1.2666666666666667e-05,
"loss": 1.4304,
"step": 1100
},
{
"epoch": 0.73,
"eval_loss": 1.4542651176452637,
"eval_runtime": 39.9141,
"eval_samples_per_second": 25.054,
"eval_steps_per_second": 3.132,
"step": 1100
},
{
"epoch": 0.8,
"grad_norm": 3.4009389877319336,
"learning_rate": 1.2e-05,
"loss": 1.4094,
"step": 1200
},
{
"epoch": 0.8,
"eval_loss": 1.4490467309951782,
"eval_runtime": 39.8559,
"eval_samples_per_second": 25.09,
"eval_steps_per_second": 3.136,
"step": 1200
},
{
"epoch": 0.87,
"grad_norm": 3.2304933071136475,
"learning_rate": 1.1333333333333334e-05,
"loss": 1.4144,
"step": 1300
},
{
"epoch": 0.87,
"eval_loss": 1.411392092704773,
"eval_runtime": 39.8705,
"eval_samples_per_second": 25.081,
"eval_steps_per_second": 3.135,
"step": 1300
},
{
"epoch": 0.93,
"grad_norm": 2.266749858856201,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.4275,
"step": 1400
},
{
"epoch": 0.93,
"eval_loss": 1.4107253551483154,
"eval_runtime": 39.8875,
"eval_samples_per_second": 25.071,
"eval_steps_per_second": 3.134,
"step": 1400
},
{
"epoch": 1.0,
"grad_norm": 2.8319966793060303,
"learning_rate": 1e-05,
"loss": 1.4112,
"step": 1500
},
{
"epoch": 1.0,
"eval_loss": 1.4000786542892456,
"eval_runtime": 39.906,
"eval_samples_per_second": 25.059,
"eval_steps_per_second": 3.132,
"step": 1500
},
{
"epoch": 1.07,
"grad_norm": 3.5198869705200195,
"learning_rate": 9.333333333333334e-06,
"loss": 1.3564,
"step": 1600
},
{
"epoch": 1.07,
"eval_loss": 1.402136206626892,
"eval_runtime": 39.9894,
"eval_samples_per_second": 25.007,
"eval_steps_per_second": 3.126,
"step": 1600
},
{
"epoch": 1.13,
"grad_norm": 3.098515510559082,
"learning_rate": 8.666666666666668e-06,
"loss": 1.3579,
"step": 1700
},
{
"epoch": 1.13,
"eval_loss": 1.4018568992614746,
"eval_runtime": 39.9664,
"eval_samples_per_second": 25.021,
"eval_steps_per_second": 3.128,
"step": 1700
},
{
"epoch": 1.2,
"grad_norm": 2.2909343242645264,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3538,
"step": 1800
},
{
"epoch": 1.2,
"eval_loss": 1.3881511688232422,
"eval_runtime": 39.9759,
"eval_samples_per_second": 25.015,
"eval_steps_per_second": 3.127,
"step": 1800
},
{
"epoch": 1.27,
"grad_norm": 3.0837056636810303,
"learning_rate": 7.333333333333333e-06,
"loss": 1.3425,
"step": 1900
},
{
"epoch": 1.27,
"eval_loss": 1.3771216869354248,
"eval_runtime": 39.9844,
"eval_samples_per_second": 25.01,
"eval_steps_per_second": 3.126,
"step": 1900
},
{
"epoch": 1.33,
"grad_norm": 2.912759780883789,
"learning_rate": 6.666666666666667e-06,
"loss": 1.3369,
"step": 2000
},
{
"epoch": 1.33,
"eval_loss": 1.3808449506759644,
"eval_runtime": 39.9353,
"eval_samples_per_second": 25.04,
"eval_steps_per_second": 3.13,
"step": 2000
},
{
"epoch": 1.4,
"grad_norm": 4.19409704208374,
"learning_rate": 6.006666666666667e-06,
"loss": 1.3237,
"step": 2100
},
{
"epoch": 1.4,
"eval_loss": 1.3673079013824463,
"eval_runtime": 39.8846,
"eval_samples_per_second": 25.072,
"eval_steps_per_second": 3.134,
"step": 2100
},
{
"epoch": 1.47,
"grad_norm": 3.5358569622039795,
"learning_rate": 5.3400000000000005e-06,
"loss": 1.3182,
"step": 2200
},
{
"epoch": 1.47,
"eval_loss": 1.414589524269104,
"eval_runtime": 39.8676,
"eval_samples_per_second": 25.083,
"eval_steps_per_second": 3.135,
"step": 2200
},
{
"epoch": 1.53,
"grad_norm": 3.819859743118286,
"learning_rate": 4.673333333333333e-06,
"loss": 1.3162,
"step": 2300
},
{
"epoch": 1.53,
"eval_loss": 1.4112467765808105,
"eval_runtime": 39.885,
"eval_samples_per_second": 25.072,
"eval_steps_per_second": 3.134,
"step": 2300
},
{
"epoch": 1.6,
"grad_norm": 2.633864402770996,
"learning_rate": 4.006666666666667e-06,
"loss": 1.3305,
"step": 2400
},
{
"epoch": 1.6,
"eval_loss": 1.3370815515518188,
"eval_runtime": 39.8724,
"eval_samples_per_second": 25.08,
"eval_steps_per_second": 3.135,
"step": 2400
},
{
"epoch": 1.67,
"grad_norm": 2.9776573181152344,
"learning_rate": 3.3400000000000006e-06,
"loss": 1.3137,
"step": 2500
},
{
"epoch": 1.67,
"eval_loss": 1.3839720487594604,
"eval_runtime": 39.8922,
"eval_samples_per_second": 25.068,
"eval_steps_per_second": 3.133,
"step": 2500
},
{
"epoch": 1.73,
"grad_norm": 3.757050037384033,
"learning_rate": 2.6733333333333333e-06,
"loss": 1.2883,
"step": 2600
},
{
"epoch": 1.73,
"eval_loss": 1.3844949007034302,
"eval_runtime": 39.9563,
"eval_samples_per_second": 25.027,
"eval_steps_per_second": 3.128,
"step": 2600
},
{
"epoch": 1.8,
"grad_norm": 2.7019670009613037,
"learning_rate": 2.006666666666667e-06,
"loss": 1.2819,
"step": 2700
},
{
"epoch": 1.8,
"eval_loss": 1.3609910011291504,
"eval_runtime": 39.9383,
"eval_samples_per_second": 25.039,
"eval_steps_per_second": 3.13,
"step": 2700
},
{
"epoch": 1.87,
"grad_norm": 3.4379355907440186,
"learning_rate": 1.34e-06,
"loss": 1.3003,
"step": 2800
},
{
"epoch": 1.87,
"eval_loss": 1.3372739553451538,
"eval_runtime": 39.9408,
"eval_samples_per_second": 25.037,
"eval_steps_per_second": 3.13,
"step": 2800
},
{
"epoch": 1.93,
"grad_norm": 3.176496744155884,
"learning_rate": 6.733333333333334e-07,
"loss": 1.2928,
"step": 2900
},
{
"epoch": 1.93,
"eval_loss": 1.3514271974563599,
"eval_runtime": 39.9216,
"eval_samples_per_second": 25.049,
"eval_steps_per_second": 3.131,
"step": 2900
},
{
"epoch": 2.0,
"grad_norm": 2.788795232772827,
"learning_rate": 6.666666666666667e-09,
"loss": 1.2878,
"step": 3000
},
{
"epoch": 2.0,
"eval_loss": 1.3470672369003296,
"eval_runtime": 39.9358,
"eval_samples_per_second": 25.04,
"eval_steps_per_second": 3.13,
"step": 3000
}
],
"logging_steps": 100,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1500,
"total_flos": 2.83206569951232e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}