m1-32b / trainer_state.json
Can111's picture
Initial upload of qwen2.5-32b-instruct_deepseek-reasoner_2004_03-10-21_lr1e-5_wd1e-4_epo5_len32768_tbs1
d55c213 verified
raw
history blame
6.33 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.924302788844622,
"eval_steps": 500,
"global_step": 310,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1593625498007968,
"grad_norm": 1.439923644065857,
"learning_rate": 6.25e-06,
"loss": 0.8005,
"step": 10
},
{
"epoch": 0.3187250996015936,
"grad_norm": 1.1607290506362915,
"learning_rate": 9.995433337085492e-06,
"loss": 0.6192,
"step": 20
},
{
"epoch": 0.47808764940239046,
"grad_norm": 0.735303521156311,
"learning_rate": 9.944154131125643e-06,
"loss": 0.5349,
"step": 30
},
{
"epoch": 0.6374501992031872,
"grad_norm": 1.0546119213104248,
"learning_rate": 9.836474315195148e-06,
"loss": 0.5105,
"step": 40
},
{
"epoch": 0.796812749003984,
"grad_norm": 0.655317485332489,
"learning_rate": 9.673622250534155e-06,
"loss": 0.4992,
"step": 50
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.8014914393424988,
"learning_rate": 9.457455677726447e-06,
"loss": 0.4943,
"step": 60
},
{
"epoch": 1.1115537848605577,
"grad_norm": 0.8364565372467041,
"learning_rate": 9.190440524459203e-06,
"loss": 0.4301,
"step": 70
},
{
"epoch": 1.2709163346613546,
"grad_norm": 0.6313614249229431,
"learning_rate": 8.87562277536726e-06,
"loss": 0.3869,
"step": 80
},
{
"epoch": 1.4302788844621515,
"grad_norm": 0.7729827761650085,
"learning_rate": 8.516593724857598e-06,
"loss": 0.3895,
"step": 90
},
{
"epoch": 1.5896414342629481,
"grad_norm": 0.5305516123771667,
"learning_rate": 8.117449009293668e-06,
"loss": 0.3809,
"step": 100
},
{
"epoch": 1.749003984063745,
"grad_norm": 0.6976670026779175,
"learning_rate": 7.682741885881314e-06,
"loss": 0.3707,
"step": 110
},
{
"epoch": 1.908366533864542,
"grad_norm": 0.5881310701370239,
"learning_rate": 7.217431291229068e-06,
"loss": 0.3831,
"step": 120
},
{
"epoch": 2.0637450199203187,
"grad_norm": 0.5917549729347229,
"learning_rate": 6.726825272106539e-06,
"loss": 0.3343,
"step": 130
},
{
"epoch": 2.2231075697211153,
"grad_norm": 0.6392484903335571,
"learning_rate": 6.216520433716544e-06,
"loss": 0.2776,
"step": 140
},
{
"epoch": 2.3824701195219125,
"grad_norm": 0.5469350814819336,
"learning_rate": 5.69233809622687e-06,
"loss": 0.2751,
"step": 150
},
{
"epoch": 2.541832669322709,
"grad_norm": 0.5329071879386902,
"learning_rate": 5.160257887858278e-06,
"loss": 0.2758,
"step": 160
},
{
"epoch": 2.7011952191235062,
"grad_norm": 0.608709454536438,
"learning_rate": 4.626349532067879e-06,
"loss": 0.2711,
"step": 170
},
{
"epoch": 2.860557768924303,
"grad_norm": 0.5087049603462219,
"learning_rate": 4.096703606968007e-06,
"loss": 0.2685,
"step": 180
},
{
"epoch": 3.0159362549800797,
"grad_norm": 0.7022324800491333,
"learning_rate": 3.5773620668448384e-06,
"loss": 0.2626,
"step": 190
},
{
"epoch": 3.1752988047808763,
"grad_norm": 0.5048023462295532,
"learning_rate": 3.074249318355046e-06,
"loss": 0.1978,
"step": 200
},
{
"epoch": 3.3346613545816735,
"grad_norm": 0.4734826385974884,
"learning_rate": 2.5931046376510875e-06,
"loss": 0.1886,
"step": 210
},
{
"epoch": 3.49402390438247,
"grad_norm": 0.6656137108802795,
"learning_rate": 2.139416699389153e-06,
"loss": 0.1918,
"step": 220
},
{
"epoch": 3.653386454183267,
"grad_norm": 0.4610200524330139,
"learning_rate": 1.7183609644824096e-06,
"loss": 0.1908,
"step": 230
},
{
"epoch": 3.812749003984064,
"grad_norm": 0.5110896229743958,
"learning_rate": 1.3347406408508695e-06,
"loss": 0.1758,
"step": 240
},
{
"epoch": 3.9721115537848606,
"grad_norm": 0.4129928946495056,
"learning_rate": 9.929318906602176e-07,
"loss": 0.1944,
"step": 250
},
{
"epoch": 4.127490039840637,
"grad_norm": 0.39533188939094543,
"learning_rate": 6.968339090999188e-07,
"loss": 0.1561,
"step": 260
},
{
"epoch": 4.286852589641434,
"grad_norm": 0.4790317118167877,
"learning_rate": 4.4982444417866753e-07,
"loss": 0.1381,
"step": 270
},
{
"epoch": 4.446215139442231,
"grad_norm": 0.39792048931121826,
"learning_rate": 2.547212649466568e-07,
"loss": 0.1532,
"step": 280
},
{
"epoch": 4.605577689243028,
"grad_norm": 0.4457632899284363,
"learning_rate": 1.1375001769728e-07,
"loss": 0.153,
"step": 290
},
{
"epoch": 4.764940239043825,
"grad_norm": 0.36862707138061523,
"learning_rate": 2.8518836829732332e-08,
"loss": 0.1556,
"step": 300
},
{
"epoch": 4.924302788844622,
"grad_norm": 0.44045692682266235,
"learning_rate": 0.0,
"loss": 0.1487,
"step": 310
},
{
"epoch": 4.924302788844622,
"step": 310,
"total_flos": 238832327327744.0,
"train_loss": 0.3164117013254473,
"train_runtime": 47203.2069,
"train_samples_per_second": 0.212,
"train_steps_per_second": 0.007
}
],
"logging_steps": 10,
"max_steps": 310,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 238832327327744.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}