{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 12.795905310300704,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3198976327575176,
"grad_norm": 4.1601386070251465,
"learning_rate": 5e-06,
"loss": 10.3279,
"step": 500
},
{
"epoch": 0.6397952655150352,
"grad_norm": 4.366061687469482,
"learning_rate": 1e-05,
"loss": 9.3834,
"step": 1000
},
{
"epoch": 0.9596928982725528,
"grad_norm": 4.784337043762207,
"learning_rate": 1.5e-05,
"loss": 8.8888,
"step": 1500
},
{
"epoch": 1.2795905310300704,
"grad_norm": 3.9968652725219727,
"learning_rate": 2e-05,
"loss": 8.6568,
"step": 2000
},
{
"epoch": 1.599488163787588,
"grad_norm": 4.402552127838135,
"learning_rate": 2.5e-05,
"loss": 8.5473,
"step": 2500
},
{
"epoch": 1.9193857965451055,
"grad_norm": 4.639041423797607,
"learning_rate": 3e-05,
"loss": 8.4044,
"step": 3000
},
{
"epoch": 2.239283429302623,
"grad_norm": 5.651747226715088,
"learning_rate": 3.5e-05,
"loss": 8.2868,
"step": 3500
},
{
"epoch": 2.5591810620601407,
"grad_norm": 4.6999359130859375,
"learning_rate": 4e-05,
"loss": 8.1766,
"step": 4000
},
{
"epoch": 2.8790786948176583,
"grad_norm": 4.838181495666504,
"learning_rate": 4.499e-05,
"loss": 8.1118,
"step": 4500
},
{
"epoch": 3.198976327575176,
"grad_norm": 4.238831996917725,
"learning_rate": 4.999e-05,
"loss": 8.0038,
"step": 5000
},
{
"epoch": 3.5188739603326935,
"grad_norm": 4.455530643463135,
"learning_rate": 5.499000000000001e-05,
"loss": 7.9014,
"step": 5500
},
{
"epoch": 3.838771593090211,
"grad_norm": 5.811736583709717,
"learning_rate": 5.999e-05,
"loss": 7.8352,
"step": 6000
},
{
"epoch": 4.158669225847729,
"grad_norm": 4.998301982879639,
"learning_rate": 6.498e-05,
"loss": 7.7613,
"step": 6500
},
{
"epoch": 4.478566858605246,
"grad_norm": 5.011510848999023,
"learning_rate": 6.998e-05,
"loss": 7.6554,
"step": 7000
},
{
"epoch": 4.798464491362764,
"grad_norm": 4.750300884246826,
"learning_rate": 7.498e-05,
"loss": 7.6109,
"step": 7500
},
{
"epoch": 5.1183621241202815,
"grad_norm": 6.24017858505249,
"learning_rate": 7.998e-05,
"loss": 7.5186,
"step": 8000
},
{
"epoch": 5.438259756877799,
"grad_norm": 6.061458587646484,
"learning_rate": 8.497000000000001e-05,
"loss": 7.3966,
"step": 8500
},
{
"epoch": 5.758157389635317,
"grad_norm": 7.151447772979736,
"learning_rate": 8.997000000000001e-05,
"loss": 7.2877,
"step": 9000
},
{
"epoch": 6.078055022392834,
"grad_norm": 7.578985214233398,
"learning_rate": 9.497000000000001e-05,
"loss": 7.1542,
"step": 9500
},
{
"epoch": 6.397952655150352,
"grad_norm": 5.948920726776123,
"learning_rate": 9.997e-05,
"loss": 7.0008,
"step": 10000
},
{
"epoch": 6.717850287907869,
"grad_norm": 8.036959648132324,
"learning_rate": 9.982896551724137e-05,
"loss": 6.8966,
"step": 10500
},
{
"epoch": 7.037747920665387,
"grad_norm": 7.160433292388916,
"learning_rate": 9.965655172413794e-05,
"loss": 6.7509,
"step": 11000
},
{
"epoch": 7.357645553422905,
"grad_norm": 5.934999465942383,
"learning_rate": 9.948413793103449e-05,
"loss": 6.5833,
"step": 11500
},
{
"epoch": 7.677543186180422,
"grad_norm": 7.745622634887695,
"learning_rate": 9.931172413793104e-05,
"loss": 6.4975,
"step": 12000
},
{
"epoch": 7.99744081893794,
"grad_norm": 7.0418477058410645,
"learning_rate": 9.91393103448276e-05,
"loss": 6.4261,
"step": 12500
},
{
"epoch": 8.317338451695457,
"grad_norm": 6.101259708404541,
"learning_rate": 9.896689655172414e-05,
"loss": 6.2092,
"step": 13000
},
{
"epoch": 8.637236084452976,
"grad_norm": 7.289799213409424,
"learning_rate": 9.87944827586207e-05,
"loss": 6.1436,
"step": 13500
},
{
"epoch": 8.957133717210493,
"grad_norm": 8.126811027526855,
"learning_rate": 9.862206896551725e-05,
"loss": 6.0456,
"step": 14000
},
{
"epoch": 9.277031349968011,
"grad_norm": 8.221816062927246,
"learning_rate": 9.845000000000001e-05,
"loss": 5.9141,
"step": 14500
},
{
"epoch": 9.596928982725528,
"grad_norm": 7.361550331115723,
"learning_rate": 9.827793103448277e-05,
"loss": 5.8326,
"step": 15000
},
{
"epoch": 9.916826615483046,
"grad_norm": 7.1737775802612305,
"learning_rate": 9.810551724137932e-05,
"loss": 5.7974,
"step": 15500
},
{
"epoch": 10.236724248240563,
"grad_norm": 9.80185604095459,
"learning_rate": 9.793310344827586e-05,
"loss": 5.6282,
"step": 16000
},
{
"epoch": 10.556621880998081,
"grad_norm": 7.2062153816223145,
"learning_rate": 9.776068965517242e-05,
"loss": 5.5619,
"step": 16500
},
{
"epoch": 10.876519513755598,
"grad_norm": 10.801878929138184,
"learning_rate": 9.758827586206896e-05,
"loss": 5.5155,
"step": 17000
},
{
"epoch": 11.196417146513117,
"grad_norm": 8.48509693145752,
"learning_rate": 9.741586206896553e-05,
"loss": 5.4259,
"step": 17500
},
{
"epoch": 11.516314779270633,
"grad_norm": 8.47572135925293,
"learning_rate": 9.724344827586207e-05,
"loss": 5.3205,
"step": 18000
},
{
"epoch": 11.836212412028152,
"grad_norm": 6.122796535491943,
"learning_rate": 9.707103448275863e-05,
"loss": 5.3025,
"step": 18500
},
{
"epoch": 12.156110044785668,
"grad_norm": 8.210710525512695,
"learning_rate": 9.689896551724139e-05,
"loss": 5.2264,
"step": 19000
},
{
"epoch": 12.476007677543187,
"grad_norm": 7.857537746429443,
"learning_rate": 9.672655172413794e-05,
"loss": 5.1395,
"step": 19500
},
{
"epoch": 12.795905310300704,
"grad_norm": 7.743075370788574,
"learning_rate": 9.655413793103448e-05,
"loss": 5.1109,
"step": 20000
}
],
"logging_steps": 500,
"max_steps": 300000,
"num_input_tokens_seen": 0,
"num_train_epochs": 192,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2481858007638016e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}