{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 50,
"global_step": 663,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04524886877828054,
"grad_norm": 5.21430778503418,
"learning_rate": 4.9321266968325794e-05,
"loss": 2.927,
"step": 10
},
{
"epoch": 0.09049773755656108,
"grad_norm": 3.602487087249756,
"learning_rate": 4.856711915535445e-05,
"loss": 0.2562,
"step": 20
},
{
"epoch": 0.13574660633484162,
"grad_norm": 3.16632080078125,
"learning_rate": 4.781297134238311e-05,
"loss": 0.1797,
"step": 30
},
{
"epoch": 0.18099547511312217,
"grad_norm": 1.254840612411499,
"learning_rate": 4.705882352941177e-05,
"loss": 0.1615,
"step": 40
},
{
"epoch": 0.22624434389140272,
"grad_norm": 1.1034722328186035,
"learning_rate": 4.6304675716440425e-05,
"loss": 0.1336,
"step": 50
},
{
"epoch": 0.22624434389140272,
"eval_loss": 0.1453334242105484,
"eval_runtime": 19.792,
"eval_samples_per_second": 44.563,
"eval_steps_per_second": 1.415,
"step": 50
},
{
"epoch": 0.27149321266968324,
"grad_norm": 1.1389213800430298,
"learning_rate": 4.555052790346908e-05,
"loss": 0.1375,
"step": 60
},
{
"epoch": 0.3167420814479638,
"grad_norm": 1.7688931226730347,
"learning_rate": 4.479638009049774e-05,
"loss": 0.1198,
"step": 70
},
{
"epoch": 0.36199095022624433,
"grad_norm": 0.8274220824241638,
"learning_rate": 4.40422322775264e-05,
"loss": 0.097,
"step": 80
},
{
"epoch": 0.4072398190045249,
"grad_norm": 1.46064293384552,
"learning_rate": 4.328808446455506e-05,
"loss": 0.1155,
"step": 90
},
{
"epoch": 0.45248868778280543,
"grad_norm": 1.8560250997543335,
"learning_rate": 4.2533936651583714e-05,
"loss": 0.1168,
"step": 100
},
{
"epoch": 0.45248868778280543,
"eval_loss": 0.13205984234809875,
"eval_runtime": 19.7942,
"eval_samples_per_second": 44.559,
"eval_steps_per_second": 1.415,
"step": 100
},
{
"epoch": 0.497737556561086,
"grad_norm": 0.6780909895896912,
"learning_rate": 4.177978883861237e-05,
"loss": 0.1156,
"step": 110
},
{
"epoch": 0.5429864253393665,
"grad_norm": 1.2922134399414062,
"learning_rate": 4.1025641025641023e-05,
"loss": 0.1099,
"step": 120
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.5779380798339844,
"learning_rate": 4.027149321266969e-05,
"loss": 0.1054,
"step": 130
},
{
"epoch": 0.6334841628959276,
"grad_norm": 1.124670386314392,
"learning_rate": 3.951734539969834e-05,
"loss": 0.1158,
"step": 140
},
{
"epoch": 0.6787330316742082,
"grad_norm": 0.8543263077735901,
"learning_rate": 3.8763197586727004e-05,
"loss": 0.1013,
"step": 150
},
{
"epoch": 0.6787330316742082,
"eval_loss": 0.11915399879217148,
"eval_runtime": 19.7455,
"eval_samples_per_second": 44.668,
"eval_steps_per_second": 1.418,
"step": 150
},
{
"epoch": 0.7239819004524887,
"grad_norm": 1.6712355613708496,
"learning_rate": 3.8009049773755655e-05,
"loss": 0.1095,
"step": 160
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.8462319374084473,
"learning_rate": 3.725490196078432e-05,
"loss": 0.1032,
"step": 170
},
{
"epoch": 0.8144796380090498,
"grad_norm": 1.3506959676742554,
"learning_rate": 3.650075414781297e-05,
"loss": 0.0801,
"step": 180
},
{
"epoch": 0.8597285067873304,
"grad_norm": 1.9755982160568237,
"learning_rate": 3.574660633484163e-05,
"loss": 0.1174,
"step": 190
},
{
"epoch": 0.9049773755656109,
"grad_norm": 0.8696920275688171,
"learning_rate": 3.4992458521870286e-05,
"loss": 0.0995,
"step": 200
},
{
"epoch": 0.9049773755656109,
"eval_loss": 0.09578042477369308,
"eval_runtime": 19.7545,
"eval_samples_per_second": 44.648,
"eval_steps_per_second": 1.417,
"step": 200
},
{
"epoch": 0.9502262443438914,
"grad_norm": 0.7832978963851929,
"learning_rate": 3.4238310708898944e-05,
"loss": 0.0921,
"step": 210
},
{
"epoch": 0.995475113122172,
"grad_norm": 2.3884148597717285,
"learning_rate": 3.34841628959276e-05,
"loss": 0.0944,
"step": 220
},
{
"epoch": 1.0407239819004526,
"grad_norm": 2.8667216300964355,
"learning_rate": 3.273001508295626e-05,
"loss": 0.0677,
"step": 230
},
{
"epoch": 1.085972850678733,
"grad_norm": 1.0837510824203491,
"learning_rate": 3.197586726998492e-05,
"loss": 0.0705,
"step": 240
},
{
"epoch": 1.1312217194570136,
"grad_norm": 1.6083077192306519,
"learning_rate": 3.1221719457013576e-05,
"loss": 0.0599,
"step": 250
},
{
"epoch": 1.1312217194570136,
"eval_loss": 0.09875330328941345,
"eval_runtime": 19.7671,
"eval_samples_per_second": 44.62,
"eval_steps_per_second": 1.416,
"step": 250
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.8606009483337402,
"learning_rate": 3.046757164404223e-05,
"loss": 0.0563,
"step": 260
},
{
"epoch": 1.2217194570135748,
"grad_norm": 1.5137906074523926,
"learning_rate": 2.971342383107089e-05,
"loss": 0.0774,
"step": 270
},
{
"epoch": 1.2669683257918551,
"grad_norm": 3.468083381652832,
"learning_rate": 2.8959276018099553e-05,
"loss": 0.0636,
"step": 280
},
{
"epoch": 1.3122171945701357,
"grad_norm": 2.216883659362793,
"learning_rate": 2.8205128205128207e-05,
"loss": 0.0578,
"step": 290
},
{
"epoch": 1.3574660633484164,
"grad_norm": 0.9391270875930786,
"learning_rate": 2.7450980392156865e-05,
"loss": 0.0688,
"step": 300
},
{
"epoch": 1.3574660633484164,
"eval_loss": 0.10202794522047043,
"eval_runtime": 19.7869,
"eval_samples_per_second": 44.575,
"eval_steps_per_second": 1.415,
"step": 300
},
{
"epoch": 1.4027149321266967,
"grad_norm": 1.7586933374404907,
"learning_rate": 2.6696832579185523e-05,
"loss": 0.0762,
"step": 310
},
{
"epoch": 1.4479638009049773,
"grad_norm": 1.774359941482544,
"learning_rate": 2.594268476621418e-05,
"loss": 0.0513,
"step": 320
},
{
"epoch": 1.493212669683258,
"grad_norm": 1.3426671028137207,
"learning_rate": 2.5188536953242835e-05,
"loss": 0.0694,
"step": 330
},
{
"epoch": 1.5384615384615383,
"grad_norm": 2.611431121826172,
"learning_rate": 2.4434389140271493e-05,
"loss": 0.0553,
"step": 340
},
{
"epoch": 1.5837104072398192,
"grad_norm": 1.2032498121261597,
"learning_rate": 2.368024132730015e-05,
"loss": 0.0649,
"step": 350
},
{
"epoch": 1.5837104072398192,
"eval_loss": 0.08488748222589493,
"eval_runtime": 19.7102,
"eval_samples_per_second": 44.748,
"eval_steps_per_second": 1.421,
"step": 350
},
{
"epoch": 1.6289592760180995,
"grad_norm": 1.0457937717437744,
"learning_rate": 2.292609351432881e-05,
"loss": 0.0656,
"step": 360
},
{
"epoch": 1.6742081447963801,
"grad_norm": 1.1490514278411865,
"learning_rate": 2.2171945701357466e-05,
"loss": 0.0459,
"step": 370
},
{
"epoch": 1.7194570135746607,
"grad_norm": 2.64288592338562,
"learning_rate": 2.1417797888386124e-05,
"loss": 0.0634,
"step": 380
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.5465795993804932,
"learning_rate": 2.0663650075414782e-05,
"loss": 0.0457,
"step": 390
},
{
"epoch": 1.8099547511312217,
"grad_norm": 2.7211270332336426,
"learning_rate": 1.990950226244344e-05,
"loss": 0.0541,
"step": 400
},
{
"epoch": 1.8099547511312217,
"eval_loss": 0.08079428225755692,
"eval_runtime": 19.7977,
"eval_samples_per_second": 44.551,
"eval_steps_per_second": 1.414,
"step": 400
},
{
"epoch": 1.8552036199095023,
"grad_norm": 1.789421558380127,
"learning_rate": 1.9155354449472098e-05,
"loss": 0.0573,
"step": 410
},
{
"epoch": 1.9004524886877827,
"grad_norm": 2.3791332244873047,
"learning_rate": 1.8401206636500756e-05,
"loss": 0.0654,
"step": 420
},
{
"epoch": 1.9457013574660633,
"grad_norm": 1.8045274019241333,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.0567,
"step": 430
},
{
"epoch": 1.990950226244344,
"grad_norm": 1.0020647048950195,
"learning_rate": 1.689291101055807e-05,
"loss": 0.056,
"step": 440
},
{
"epoch": 2.0361990950226243,
"grad_norm": 1.4606750011444092,
"learning_rate": 1.613876319758673e-05,
"loss": 0.0357,
"step": 450
},
{
"epoch": 2.0361990950226243,
"eval_loss": 0.09197434037923813,
"eval_runtime": 19.7791,
"eval_samples_per_second": 44.592,
"eval_steps_per_second": 1.416,
"step": 450
},
{
"epoch": 2.081447963800905,
"grad_norm": 1.4176522493362427,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0487,
"step": 460
},
{
"epoch": 2.1266968325791855,
"grad_norm": 0.9721936583518982,
"learning_rate": 1.4630467571644043e-05,
"loss": 0.0264,
"step": 470
},
{
"epoch": 2.171945701357466,
"grad_norm": 1.8826217651367188,
"learning_rate": 1.3876319758672701e-05,
"loss": 0.0337,
"step": 480
},
{
"epoch": 2.2171945701357467,
"grad_norm": 2.2242259979248047,
"learning_rate": 1.3122171945701359e-05,
"loss": 0.0486,
"step": 490
},
{
"epoch": 2.262443438914027,
"grad_norm": 3.4401419162750244,
"learning_rate": 1.2368024132730017e-05,
"loss": 0.0371,
"step": 500
},
{
"epoch": 2.262443438914027,
"eval_loss": 0.10297037661075592,
"eval_runtime": 19.804,
"eval_samples_per_second": 44.536,
"eval_steps_per_second": 1.414,
"step": 500
},
{
"epoch": 2.3076923076923075,
"grad_norm": 2.3715732097625732,
"learning_rate": 1.1613876319758673e-05,
"loss": 0.0303,
"step": 510
},
{
"epoch": 2.3529411764705883,
"grad_norm": 1.1995147466659546,
"learning_rate": 1.0859728506787331e-05,
"loss": 0.0375,
"step": 520
},
{
"epoch": 2.3981900452488687,
"grad_norm": 2.4895598888397217,
"learning_rate": 1.0105580693815989e-05,
"loss": 0.0336,
"step": 530
},
{
"epoch": 2.4434389140271495,
"grad_norm": 1.5218836069107056,
"learning_rate": 9.351432880844647e-06,
"loss": 0.0412,
"step": 540
},
{
"epoch": 2.48868778280543,
"grad_norm": 1.484147071838379,
"learning_rate": 8.597285067873303e-06,
"loss": 0.0437,
"step": 550
},
{
"epoch": 2.48868778280543,
"eval_loss": 0.09078551828861237,
"eval_runtime": 19.7816,
"eval_samples_per_second": 44.587,
"eval_steps_per_second": 1.415,
"step": 550
},
{
"epoch": 2.5339366515837103,
"grad_norm": 1.726880669593811,
"learning_rate": 7.84313725490196e-06,
"loss": 0.0282,
"step": 560
},
{
"epoch": 2.579185520361991,
"grad_norm": 1.269982933998108,
"learning_rate": 7.0889894419306185e-06,
"loss": 0.0353,
"step": 570
},
{
"epoch": 2.6244343891402715,
"grad_norm": 0.8152230381965637,
"learning_rate": 6.334841628959276e-06,
"loss": 0.0251,
"step": 580
},
{
"epoch": 2.669683257918552,
"grad_norm": 1.2272216081619263,
"learning_rate": 5.580693815987934e-06,
"loss": 0.0251,
"step": 590
},
{
"epoch": 2.7149321266968327,
"grad_norm": 1.86264967918396,
"learning_rate": 4.826546003016592e-06,
"loss": 0.0213,
"step": 600
},
{
"epoch": 2.7149321266968327,
"eval_loss": 0.10036029666662216,
"eval_runtime": 19.7588,
"eval_samples_per_second": 44.638,
"eval_steps_per_second": 1.417,
"step": 600
},
{
"epoch": 2.760180995475113,
"grad_norm": 2.0232512950897217,
"learning_rate": 4.072398190045249e-06,
"loss": 0.0317,
"step": 610
},
{
"epoch": 2.8054298642533935,
"grad_norm": 1.125870943069458,
"learning_rate": 3.3182503770739065e-06,
"loss": 0.0328,
"step": 620
},
{
"epoch": 2.8506787330316743,
"grad_norm": 2.3638086318969727,
"learning_rate": 2.564102564102564e-06,
"loss": 0.0318,
"step": 630
},
{
"epoch": 2.8959276018099547,
"grad_norm": 1.220841646194458,
"learning_rate": 1.809954751131222e-06,
"loss": 0.0265,
"step": 640
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.9988038539886475,
"learning_rate": 1.0558069381598795e-06,
"loss": 0.0266,
"step": 650
},
{
"epoch": 2.9411764705882355,
"eval_loss": 0.0963086187839508,
"eval_runtime": 19.7526,
"eval_samples_per_second": 44.652,
"eval_steps_per_second": 1.418,
"step": 650
},
{
"epoch": 2.986425339366516,
"grad_norm": 2.0754244327545166,
"learning_rate": 3.01659125188537e-07,
"loss": 0.0218,
"step": 660
}
],
"logging_steps": 10,
"max_steps": 663,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.2915495153893376e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}