{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 200,
"global_step": 480,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_runtime": 13.9532,
"eval_samples_per_second": 0.072,
"eval_steps_per_second": 0.072,
"step": 0
},
{
"epoch": 0.15873015873015872,
"grad_norm": 11.557395935058594,
"learning_rate": 1.6000000000000003e-05,
"loss": 8.1827,
"step": 5
},
{
"epoch": 0.31746031746031744,
"grad_norm": 11.404980659484863,
"learning_rate": 1.9831578947368423e-05,
"loss": 6.9238,
"step": 10
},
{
"epoch": 0.47619047619047616,
"grad_norm": 7.607983589172363,
"learning_rate": 1.962105263157895e-05,
"loss": 5.6257,
"step": 15
},
{
"epoch": 0.6349206349206349,
"grad_norm": 9.123591423034668,
"learning_rate": 1.9410526315789476e-05,
"loss": 5.757,
"step": 20
},
{
"epoch": 0.7936507936507936,
"grad_norm": 8.275577545166016,
"learning_rate": 1.9200000000000003e-05,
"loss": 5.8446,
"step": 25
},
{
"epoch": 0.9523809523809523,
"grad_norm": 7.894503593444824,
"learning_rate": 1.898947368421053e-05,
"loss": 5.9334,
"step": 30
},
{
"epoch": 1.0952380952380953,
"grad_norm": 8.614194869995117,
"learning_rate": 1.8778947368421056e-05,
"loss": 4.9961,
"step": 35
},
{
"epoch": 1.253968253968254,
"grad_norm": 7.895678520202637,
"learning_rate": 1.856842105263158e-05,
"loss": 4.9552,
"step": 40
},
{
"epoch": 1.4126984126984126,
"grad_norm": 8.537894248962402,
"learning_rate": 1.8357894736842105e-05,
"loss": 4.4348,
"step": 45
},
{
"epoch": 1.5714285714285714,
"grad_norm": 14.754180908203125,
"learning_rate": 1.8147368421052632e-05,
"loss": 5.5887,
"step": 50
},
{
"epoch": 1.7301587301587302,
"grad_norm": 12.931550979614258,
"learning_rate": 1.793684210526316e-05,
"loss": 4.3393,
"step": 55
},
{
"epoch": 1.8888888888888888,
"grad_norm": 12.019343376159668,
"learning_rate": 1.7726315789473685e-05,
"loss": 4.4514,
"step": 60
},
{
"epoch": 2.0317460317460316,
"grad_norm": 9.587310791015625,
"learning_rate": 1.751578947368421e-05,
"loss": 4.4192,
"step": 65
},
{
"epoch": 2.1904761904761907,
"grad_norm": 15.39806842803955,
"learning_rate": 1.7305263157894738e-05,
"loss": 3.321,
"step": 70
},
{
"epoch": 2.3492063492063493,
"grad_norm": 33.02901077270508,
"learning_rate": 1.7094736842105265e-05,
"loss": 3.5273,
"step": 75
},
{
"epoch": 2.507936507936508,
"grad_norm": 15.809943199157715,
"learning_rate": 1.688421052631579e-05,
"loss": 3.627,
"step": 80
},
{
"epoch": 2.6666666666666665,
"grad_norm": 17.844499588012695,
"learning_rate": 1.6673684210526318e-05,
"loss": 3.4103,
"step": 85
},
{
"epoch": 2.825396825396825,
"grad_norm": 16.915332794189453,
"learning_rate": 1.6463157894736844e-05,
"loss": 3.2948,
"step": 90
},
{
"epoch": 2.984126984126984,
"grad_norm": 10.49189567565918,
"learning_rate": 1.6252631578947367e-05,
"loss": 3.841,
"step": 95
},
{
"epoch": 3.126984126984127,
"grad_norm": 10.832942962646484,
"learning_rate": 1.6042105263157897e-05,
"loss": 2.8493,
"step": 100
},
{
"epoch": 3.2857142857142856,
"grad_norm": 21.16595458984375,
"learning_rate": 1.5831578947368424e-05,
"loss": 2.5387,
"step": 105
},
{
"epoch": 3.4444444444444446,
"grad_norm": 14.9147310256958,
"learning_rate": 1.5621052631578947e-05,
"loss": 2.615,
"step": 110
},
{
"epoch": 3.6031746031746033,
"grad_norm": 18.327787399291992,
"learning_rate": 1.5410526315789477e-05,
"loss": 2.1285,
"step": 115
},
{
"epoch": 3.761904761904762,
"grad_norm": 9.476289749145508,
"learning_rate": 1.5200000000000002e-05,
"loss": 2.2728,
"step": 120
},
{
"epoch": 3.9206349206349205,
"grad_norm": 17.919769287109375,
"learning_rate": 1.4989473684210527e-05,
"loss": 2.5896,
"step": 125
},
{
"epoch": 4.063492063492063,
"grad_norm": 14.85531997680664,
"learning_rate": 1.4778947368421055e-05,
"loss": 2.1744,
"step": 130
},
{
"epoch": 4.222222222222222,
"grad_norm": 14.478163719177246,
"learning_rate": 1.456842105263158e-05,
"loss": 1.7097,
"step": 135
},
{
"epoch": 4.380952380952381,
"grad_norm": 11.578816413879395,
"learning_rate": 1.4357894736842106e-05,
"loss": 1.5367,
"step": 140
},
{
"epoch": 4.5396825396825395,
"grad_norm": 17.49262809753418,
"learning_rate": 1.4147368421052631e-05,
"loss": 1.5296,
"step": 145
},
{
"epoch": 4.698412698412699,
"grad_norm": 14.988396644592285,
"learning_rate": 1.393684210526316e-05,
"loss": 1.7586,
"step": 150
},
{
"epoch": 4.857142857142857,
"grad_norm": 14.392678260803223,
"learning_rate": 1.3726315789473686e-05,
"loss": 1.6262,
"step": 155
},
{
"epoch": 5.0,
"grad_norm": 20.406312942504883,
"learning_rate": 1.3515789473684211e-05,
"loss": 1.6957,
"step": 160
},
{
"epoch": 5.158730158730159,
"grad_norm": 11.343132972717285,
"learning_rate": 1.3305263157894739e-05,
"loss": 1.1688,
"step": 165
},
{
"epoch": 5.317460317460317,
"grad_norm": 19.622802734375,
"learning_rate": 1.3094736842105264e-05,
"loss": 1.2816,
"step": 170
},
{
"epoch": 5.476190476190476,
"grad_norm": 8.312881469726562,
"learning_rate": 1.288421052631579e-05,
"loss": 1.1862,
"step": 175
},
{
"epoch": 5.634920634920634,
"grad_norm": 9.719124794006348,
"learning_rate": 1.2673684210526315e-05,
"loss": 1.1252,
"step": 180
},
{
"epoch": 5.7936507936507935,
"grad_norm": 13.568297386169434,
"learning_rate": 1.2463157894736844e-05,
"loss": 1.4237,
"step": 185
},
{
"epoch": 5.9523809523809526,
"grad_norm": 14.834338188171387,
"learning_rate": 1.225263157894737e-05,
"loss": 1.1826,
"step": 190
},
{
"epoch": 6.095238095238095,
"grad_norm": 9.984978675842285,
"learning_rate": 1.2042105263157895e-05,
"loss": 0.7894,
"step": 195
},
{
"epoch": 6.253968253968254,
"grad_norm": 9.38455581665039,
"learning_rate": 1.1831578947368423e-05,
"loss": 0.631,
"step": 200
},
{
"epoch": 6.253968253968254,
"eval_runtime": 1.8467,
"eval_samples_per_second": 0.542,
"eval_steps_per_second": 0.542,
"step": 200
},
{
"epoch": 6.412698412698413,
"grad_norm": 10.59284496307373,
"learning_rate": 1.1621052631578948e-05,
"loss": 0.5435,
"step": 205
},
{
"epoch": 6.571428571428571,
"grad_norm": 11.207695007324219,
"learning_rate": 1.1410526315789475e-05,
"loss": 0.7223,
"step": 210
},
{
"epoch": 6.73015873015873,
"grad_norm": 8.157690048217773,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.6299,
"step": 215
},
{
"epoch": 6.888888888888889,
"grad_norm": 11.521504402160645,
"learning_rate": 1.0989473684210528e-05,
"loss": 0.68,
"step": 220
},
{
"epoch": 7.031746031746032,
"grad_norm": 15.177350997924805,
"learning_rate": 1.0778947368421053e-05,
"loss": 0.5935,
"step": 225
},
{
"epoch": 7.190476190476191,
"grad_norm": 12.173933029174805,
"learning_rate": 1.0568421052631579e-05,
"loss": 0.3321,
"step": 230
},
{
"epoch": 7.349206349206349,
"grad_norm": 8.02884292602539,
"learning_rate": 1.0357894736842107e-05,
"loss": 0.3183,
"step": 235
},
{
"epoch": 7.507936507936508,
"grad_norm": 7.30162239074707,
"learning_rate": 1.0147368421052632e-05,
"loss": 0.2462,
"step": 240
},
{
"epoch": 7.666666666666667,
"grad_norm": 8.202823638916016,
"learning_rate": 9.936842105263159e-06,
"loss": 0.4269,
"step": 245
},
{
"epoch": 7.825396825396825,
"grad_norm": 12.977221488952637,
"learning_rate": 9.726315789473685e-06,
"loss": 0.4175,
"step": 250
},
{
"epoch": 7.984126984126984,
"grad_norm": 13.560741424560547,
"learning_rate": 9.515789473684212e-06,
"loss": 0.4621,
"step": 255
},
{
"epoch": 8.126984126984127,
"grad_norm": 4.011098384857178,
"learning_rate": 9.305263157894737e-06,
"loss": 0.2328,
"step": 260
},
{
"epoch": 8.285714285714286,
"grad_norm": 6.631120681762695,
"learning_rate": 9.094736842105263e-06,
"loss": 0.1861,
"step": 265
},
{
"epoch": 8.444444444444445,
"grad_norm": 5.168328285217285,
"learning_rate": 8.884210526315792e-06,
"loss": 0.1793,
"step": 270
},
{
"epoch": 8.603174603174603,
"grad_norm": 10.629039764404297,
"learning_rate": 8.673684210526316e-06,
"loss": 0.2486,
"step": 275
},
{
"epoch": 8.761904761904763,
"grad_norm": 13.89322566986084,
"learning_rate": 8.463157894736843e-06,
"loss": 0.167,
"step": 280
},
{
"epoch": 8.920634920634921,
"grad_norm": 6.970239639282227,
"learning_rate": 8.25263157894737e-06,
"loss": 0.1983,
"step": 285
},
{
"epoch": 9.063492063492063,
"grad_norm": 3.4170796871185303,
"learning_rate": 8.042105263157896e-06,
"loss": 0.1928,
"step": 290
},
{
"epoch": 9.222222222222221,
"grad_norm": 11.420437812805176,
"learning_rate": 7.831578947368421e-06,
"loss": 0.1943,
"step": 295
},
{
"epoch": 9.380952380952381,
"grad_norm": 5.474252223968506,
"learning_rate": 7.621052631578948e-06,
"loss": 0.1197,
"step": 300
},
{
"epoch": 9.53968253968254,
"grad_norm": 47.27220916748047,
"learning_rate": 7.410526315789475e-06,
"loss": 0.3147,
"step": 305
},
{
"epoch": 9.698412698412698,
"grad_norm": 9.262266159057617,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.0838,
"step": 310
},
{
"epoch": 9.857142857142858,
"grad_norm": 2.843402862548828,
"learning_rate": 6.989473684210527e-06,
"loss": 0.1181,
"step": 315
},
{
"epoch": 10.0,
"grad_norm": 10.928123474121094,
"learning_rate": 6.778947368421053e-06,
"loss": 0.1583,
"step": 320
},
{
"epoch": 10.158730158730158,
"grad_norm": 1.7071508169174194,
"learning_rate": 6.568421052631579e-06,
"loss": 0.0808,
"step": 325
},
{
"epoch": 10.317460317460318,
"grad_norm": 6.591403007507324,
"learning_rate": 6.357894736842106e-06,
"loss": 0.1076,
"step": 330
},
{
"epoch": 10.476190476190476,
"grad_norm": 4.758854389190674,
"learning_rate": 6.1473684210526316e-06,
"loss": 0.085,
"step": 335
},
{
"epoch": 10.634920634920634,
"grad_norm": 8.381784439086914,
"learning_rate": 5.936842105263159e-06,
"loss": 0.1412,
"step": 340
},
{
"epoch": 10.793650793650794,
"grad_norm": 6.775882244110107,
"learning_rate": 5.726315789473685e-06,
"loss": 0.1122,
"step": 345
},
{
"epoch": 10.952380952380953,
"grad_norm": 3.3244922161102295,
"learning_rate": 5.515789473684211e-06,
"loss": 0.0922,
"step": 350
},
{
"epoch": 11.095238095238095,
"grad_norm": 2.986769437789917,
"learning_rate": 5.305263157894738e-06,
"loss": 0.051,
"step": 355
},
{
"epoch": 11.253968253968253,
"grad_norm": 2.7891147136688232,
"learning_rate": 5.0947368421052635e-06,
"loss": 0.0607,
"step": 360
},
{
"epoch": 11.412698412698413,
"grad_norm": 1.6444604396820068,
"learning_rate": 4.88421052631579e-06,
"loss": 0.0738,
"step": 365
},
{
"epoch": 11.571428571428571,
"grad_norm": 3.8520383834838867,
"learning_rate": 4.6736842105263166e-06,
"loss": 0.0699,
"step": 370
},
{
"epoch": 11.73015873015873,
"grad_norm": 2.9614264965057373,
"learning_rate": 4.463157894736842e-06,
"loss": 0.0662,
"step": 375
},
{
"epoch": 11.88888888888889,
"grad_norm": 0.9366450309753418,
"learning_rate": 4.252631578947369e-06,
"loss": 0.038,
"step": 380
},
{
"epoch": 12.031746031746032,
"grad_norm": 1.4501631259918213,
"learning_rate": 4.042105263157895e-06,
"loss": 0.0415,
"step": 385
},
{
"epoch": 12.19047619047619,
"grad_norm": 1.08451509475708,
"learning_rate": 3.831578947368421e-06,
"loss": 0.0382,
"step": 390
},
{
"epoch": 12.34920634920635,
"grad_norm": 3.214855670928955,
"learning_rate": 3.621052631578948e-06,
"loss": 0.0337,
"step": 395
},
{
"epoch": 12.507936507936508,
"grad_norm": 1.9067870378494263,
"learning_rate": 3.410526315789474e-06,
"loss": 0.0524,
"step": 400
},
{
"epoch": 12.507936507936508,
"eval_runtime": 1.7856,
"eval_samples_per_second": 0.56,
"eval_steps_per_second": 0.56,
"step": 400
},
{
"epoch": 12.666666666666666,
"grad_norm": 1.6618458032608032,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0591,
"step": 405
},
{
"epoch": 12.825396825396826,
"grad_norm": 1.5796409845352173,
"learning_rate": 2.9894736842105264e-06,
"loss": 0.0835,
"step": 410
},
{
"epoch": 12.984126984126984,
"grad_norm": 0.4131016433238983,
"learning_rate": 2.7789473684210525e-06,
"loss": 0.0579,
"step": 415
},
{
"epoch": 13.126984126984127,
"grad_norm": 0.8320081830024719,
"learning_rate": 2.568421052631579e-06,
"loss": 0.0559,
"step": 420
},
{
"epoch": 13.285714285714286,
"grad_norm": 0.18294575810432434,
"learning_rate": 2.357894736842105e-06,
"loss": 0.0245,
"step": 425
},
{
"epoch": 13.444444444444445,
"grad_norm": 1.5417789220809937,
"learning_rate": 2.1473684210526317e-06,
"loss": 0.0367,
"step": 430
},
{
"epoch": 13.603174603174603,
"grad_norm": 0.5289078950881958,
"learning_rate": 1.936842105263158e-06,
"loss": 0.0327,
"step": 435
},
{
"epoch": 13.761904761904763,
"grad_norm": 0.751720666885376,
"learning_rate": 1.7263157894736842e-06,
"loss": 0.0243,
"step": 440
},
{
"epoch": 13.920634920634921,
"grad_norm": 0.7442598938941956,
"learning_rate": 1.5157894736842108e-06,
"loss": 0.033,
"step": 445
},
{
"epoch": 14.063492063492063,
"grad_norm": 0.3151313066482544,
"learning_rate": 1.3052631578947369e-06,
"loss": 0.0253,
"step": 450
},
{
"epoch": 14.222222222222221,
"grad_norm": 0.2672920525074005,
"learning_rate": 1.0947368421052632e-06,
"loss": 0.0198,
"step": 455
},
{
"epoch": 14.380952380952381,
"grad_norm": 0.6730213165283203,
"learning_rate": 8.842105263157895e-07,
"loss": 0.0212,
"step": 460
},
{
"epoch": 14.53968253968254,
"grad_norm": 0.5566405653953552,
"learning_rate": 6.736842105263158e-07,
"loss": 0.0176,
"step": 465
},
{
"epoch": 14.698412698412698,
"grad_norm": 0.37914007902145386,
"learning_rate": 4.631578947368422e-07,
"loss": 0.0341,
"step": 470
},
{
"epoch": 14.857142857142858,
"grad_norm": 0.2741248905658722,
"learning_rate": 2.5263157894736846e-07,
"loss": 0.0247,
"step": 475
},
{
"epoch": 15.0,
"grad_norm": 0.30271536111831665,
"learning_rate": 4.2105263157894737e-08,
"loss": 0.0212,
"step": 480
},
{
"epoch": 15.0,
"step": 480,
"total_flos": 0.0,
"train_loss": 1.4350806780159473,
"train_runtime": 3024.1986,
"train_samples_per_second": 2.475,
"train_steps_per_second": 0.159
}
],
"logging_steps": 5,
"max_steps": 480,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}