{
"best_metric": 0.0417679101228714,
"best_model_checkpoint": "./finetune-vit-base-patch16-224/checkpoint-1200",
"epoch": 4.0,
"eval_steps": 400,
"global_step": 1408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028409090909090908,
"grad_norm": 246460.15625,
"learning_rate": 4.9644886363636365e-05,
"loss": 1.1228,
"step": 10
},
{
"epoch": 0.056818181818181816,
"grad_norm": 226149.65625,
"learning_rate": 4.9289772727272735e-05,
"loss": 0.9359,
"step": 20
},
{
"epoch": 0.08522727272727272,
"grad_norm": 229797.6875,
"learning_rate": 4.893465909090909e-05,
"loss": 0.9185,
"step": 30
},
{
"epoch": 0.11363636363636363,
"grad_norm": 274111.03125,
"learning_rate": 4.857954545454545e-05,
"loss": 0.9599,
"step": 40
},
{
"epoch": 0.14204545454545456,
"grad_norm": 189042.953125,
"learning_rate": 4.822443181818182e-05,
"loss": 0.9459,
"step": 50
},
{
"epoch": 0.17045454545454544,
"grad_norm": 233362.859375,
"learning_rate": 4.7869318181818185e-05,
"loss": 0.9634,
"step": 60
},
{
"epoch": 0.19886363636363635,
"grad_norm": 267175.90625,
"learning_rate": 4.751420454545455e-05,
"loss": 0.8705,
"step": 70
},
{
"epoch": 0.22727272727272727,
"grad_norm": 211430.734375,
"learning_rate": 4.715909090909091e-05,
"loss": 0.9014,
"step": 80
},
{
"epoch": 0.2556818181818182,
"grad_norm": 238574.546875,
"learning_rate": 4.6803977272727274e-05,
"loss": 0.8607,
"step": 90
},
{
"epoch": 0.2840909090909091,
"grad_norm": 260448.125,
"learning_rate": 4.6448863636363636e-05,
"loss": 0.8127,
"step": 100
},
{
"epoch": 0.3125,
"grad_norm": 168009.265625,
"learning_rate": 4.609375e-05,
"loss": 0.8228,
"step": 110
},
{
"epoch": 0.3409090909090909,
"grad_norm": 232205.125,
"learning_rate": 4.573863636363637e-05,
"loss": 0.8704,
"step": 120
},
{
"epoch": 0.3693181818181818,
"grad_norm": 302465.9375,
"learning_rate": 4.538352272727273e-05,
"loss": 0.8896,
"step": 130
},
{
"epoch": 0.3977272727272727,
"grad_norm": 210630.53125,
"learning_rate": 4.5028409090909094e-05,
"loss": 0.8732,
"step": 140
},
{
"epoch": 0.42613636363636365,
"grad_norm": 171584.9375,
"learning_rate": 4.4673295454545457e-05,
"loss": 0.7886,
"step": 150
},
{
"epoch": 0.45454545454545453,
"grad_norm": 255000.359375,
"learning_rate": 4.431818181818182e-05,
"loss": 0.9411,
"step": 160
},
{
"epoch": 0.48295454545454547,
"grad_norm": 244293.703125,
"learning_rate": 4.396306818181818e-05,
"loss": 0.8608,
"step": 170
},
{
"epoch": 0.5113636363636364,
"grad_norm": 235527.875,
"learning_rate": 4.360795454545455e-05,
"loss": 0.8106,
"step": 180
},
{
"epoch": 0.5397727272727273,
"grad_norm": 234210.1875,
"learning_rate": 4.3252840909090914e-05,
"loss": 0.795,
"step": 190
},
{
"epoch": 0.5681818181818182,
"grad_norm": 182797.875,
"learning_rate": 4.289772727272727e-05,
"loss": 0.7926,
"step": 200
},
{
"epoch": 0.5965909090909091,
"grad_norm": 324642.5,
"learning_rate": 4.254261363636364e-05,
"loss": 0.778,
"step": 210
},
{
"epoch": 0.625,
"grad_norm": 359272.71875,
"learning_rate": 4.21875e-05,
"loss": 0.7829,
"step": 220
},
{
"epoch": 0.6534090909090909,
"grad_norm": 279676.875,
"learning_rate": 4.1832386363636365e-05,
"loss": 0.8244,
"step": 230
},
{
"epoch": 0.6818181818181818,
"grad_norm": 259783.71875,
"learning_rate": 4.1477272727272734e-05,
"loss": 0.7465,
"step": 240
},
{
"epoch": 0.7102272727272727,
"grad_norm": 184817.609375,
"learning_rate": 4.112215909090909e-05,
"loss": 0.7447,
"step": 250
},
{
"epoch": 0.7386363636363636,
"grad_norm": 221672.1875,
"learning_rate": 4.076704545454545e-05,
"loss": 0.8206,
"step": 260
},
{
"epoch": 0.7670454545454546,
"grad_norm": 251710.0,
"learning_rate": 4.041193181818182e-05,
"loss": 0.8222,
"step": 270
},
{
"epoch": 0.7954545454545454,
"grad_norm": 287394.75,
"learning_rate": 4.0056818181818185e-05,
"loss": 0.8751,
"step": 280
},
{
"epoch": 0.8238636363636364,
"grad_norm": 261405.84375,
"learning_rate": 3.970170454545455e-05,
"loss": 0.8049,
"step": 290
},
{
"epoch": 0.8522727272727273,
"grad_norm": 339216.5,
"learning_rate": 3.934659090909091e-05,
"loss": 0.7734,
"step": 300
},
{
"epoch": 0.8806818181818182,
"grad_norm": 253168.921875,
"learning_rate": 3.899147727272727e-05,
"loss": 0.7916,
"step": 310
},
{
"epoch": 0.9090909090909091,
"grad_norm": 243938.09375,
"learning_rate": 3.8636363636363636e-05,
"loss": 0.8075,
"step": 320
},
{
"epoch": 0.9375,
"grad_norm": 224975.296875,
"learning_rate": 3.828125e-05,
"loss": 0.724,
"step": 330
},
{
"epoch": 0.9659090909090909,
"grad_norm": 314409.71875,
"learning_rate": 3.792613636363637e-05,
"loss": 0.86,
"step": 340
},
{
"epoch": 0.9943181818181818,
"grad_norm": 254573.59375,
"learning_rate": 3.757102272727273e-05,
"loss": 0.7882,
"step": 350
},
{
"epoch": 1.0227272727272727,
"grad_norm": 239098.109375,
"learning_rate": 3.721590909090909e-05,
"loss": 0.5987,
"step": 360
},
{
"epoch": 1.0511363636363635,
"grad_norm": 212271.015625,
"learning_rate": 3.6860795454545456e-05,
"loss": 0.5594,
"step": 370
},
{
"epoch": 1.0795454545454546,
"grad_norm": 258443.203125,
"learning_rate": 3.650568181818182e-05,
"loss": 0.5778,
"step": 380
},
{
"epoch": 1.1079545454545454,
"grad_norm": 251415.8125,
"learning_rate": 3.615056818181818e-05,
"loss": 0.5707,
"step": 390
},
{
"epoch": 1.1363636363636362,
"grad_norm": 191828.046875,
"learning_rate": 3.579545454545455e-05,
"loss": 0.6151,
"step": 400
},
{
"epoch": 1.1363636363636362,
"eval_f1": 0.7879138483446066,
"eval_loss": 0.5355119705200195,
"eval_runtime": 204.0556,
"eval_samples_per_second": 55.063,
"eval_steps_per_second": 3.445,
"step": 400
},
{
"epoch": 1.1647727272727273,
"grad_norm": 233159.53125,
"learning_rate": 3.5440340909090914e-05,
"loss": 0.548,
"step": 410
},
{
"epoch": 1.1931818181818181,
"grad_norm": 206000.609375,
"learning_rate": 3.508522727272727e-05,
"loss": 0.5118,
"step": 420
},
{
"epoch": 1.2215909090909092,
"grad_norm": 262176.0625,
"learning_rate": 3.473011363636364e-05,
"loss": 0.5221,
"step": 430
},
{
"epoch": 1.25,
"grad_norm": 225265.671875,
"learning_rate": 3.4375e-05,
"loss": 0.5489,
"step": 440
},
{
"epoch": 1.2784090909090908,
"grad_norm": 261512.140625,
"learning_rate": 3.4019886363636365e-05,
"loss": 0.5682,
"step": 450
},
{
"epoch": 1.3068181818181819,
"grad_norm": 336397.46875,
"learning_rate": 3.3664772727272734e-05,
"loss": 0.5585,
"step": 460
},
{
"epoch": 1.3352272727272727,
"grad_norm": 253634.796875,
"learning_rate": 3.330965909090909e-05,
"loss": 0.5239,
"step": 470
},
{
"epoch": 1.3636363636363638,
"grad_norm": 253387.1875,
"learning_rate": 3.295454545454545e-05,
"loss": 0.5411,
"step": 480
},
{
"epoch": 1.3920454545454546,
"grad_norm": 175611.75,
"learning_rate": 3.259943181818182e-05,
"loss": 0.4704,
"step": 490
},
{
"epoch": 1.4204545454545454,
"grad_norm": 210382.125,
"learning_rate": 3.2244318181818185e-05,
"loss": 0.4668,
"step": 500
},
{
"epoch": 1.4488636363636362,
"grad_norm": 207340.484375,
"learning_rate": 3.188920454545455e-05,
"loss": 0.5243,
"step": 510
},
{
"epoch": 1.4772727272727273,
"grad_norm": 211227.53125,
"learning_rate": 3.153409090909091e-05,
"loss": 0.5158,
"step": 520
},
{
"epoch": 1.5056818181818183,
"grad_norm": 263875.125,
"learning_rate": 3.117897727272727e-05,
"loss": 0.5264,
"step": 530
},
{
"epoch": 1.5340909090909092,
"grad_norm": 250973.984375,
"learning_rate": 3.0823863636363636e-05,
"loss": 0.4892,
"step": 540
},
{
"epoch": 1.5625,
"grad_norm": 210192.90625,
"learning_rate": 3.0468750000000002e-05,
"loss": 0.565,
"step": 550
},
{
"epoch": 1.5909090909090908,
"grad_norm": 277090.34375,
"learning_rate": 3.0113636363636365e-05,
"loss": 0.5501,
"step": 560
},
{
"epoch": 1.6193181818181817,
"grad_norm": 262420.625,
"learning_rate": 2.975852272727273e-05,
"loss": 0.4802,
"step": 570
},
{
"epoch": 1.6477272727272727,
"grad_norm": 247244.59375,
"learning_rate": 2.940340909090909e-05,
"loss": 0.4778,
"step": 580
},
{
"epoch": 1.6761363636363638,
"grad_norm": 238716.140625,
"learning_rate": 2.9048295454545453e-05,
"loss": 0.4998,
"step": 590
},
{
"epoch": 1.7045454545454546,
"grad_norm": 288676.875,
"learning_rate": 2.869318181818182e-05,
"loss": 0.4763,
"step": 600
},
{
"epoch": 1.7329545454545454,
"grad_norm": 254478.03125,
"learning_rate": 2.8338068181818185e-05,
"loss": 0.4912,
"step": 610
},
{
"epoch": 1.7613636363636362,
"grad_norm": 295674.3125,
"learning_rate": 2.7982954545454548e-05,
"loss": 0.4892,
"step": 620
},
{
"epoch": 1.7897727272727273,
"grad_norm": 279737.21875,
"learning_rate": 2.7627840909090914e-05,
"loss": 0.4677,
"step": 630
},
{
"epoch": 1.8181818181818183,
"grad_norm": 325599.34375,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.4977,
"step": 640
},
{
"epoch": 1.8465909090909092,
"grad_norm": 303249.375,
"learning_rate": 2.6917613636363636e-05,
"loss": 0.5212,
"step": 650
},
{
"epoch": 1.875,
"grad_norm": 269595.21875,
"learning_rate": 2.6562500000000002e-05,
"loss": 0.5283,
"step": 660
},
{
"epoch": 1.9034090909090908,
"grad_norm": 274965.3125,
"learning_rate": 2.6207386363636365e-05,
"loss": 0.5194,
"step": 670
},
{
"epoch": 1.9318181818181817,
"grad_norm": 250650.328125,
"learning_rate": 2.585227272727273e-05,
"loss": 0.5274,
"step": 680
},
{
"epoch": 1.9602272727272727,
"grad_norm": 232058.15625,
"learning_rate": 2.549715909090909e-05,
"loss": 0.5002,
"step": 690
},
{
"epoch": 1.9886363636363638,
"grad_norm": 251402.0,
"learning_rate": 2.5142045454545453e-05,
"loss": 0.4618,
"step": 700
},
{
"epoch": 2.0170454545454546,
"grad_norm": 192832.578125,
"learning_rate": 2.478693181818182e-05,
"loss": 0.3425,
"step": 710
},
{
"epoch": 2.0454545454545454,
"grad_norm": 200086.390625,
"learning_rate": 2.4431818181818185e-05,
"loss": 0.2832,
"step": 720
},
{
"epoch": 2.0738636363636362,
"grad_norm": 162459.609375,
"learning_rate": 2.4076704545454544e-05,
"loss": 0.2102,
"step": 730
},
{
"epoch": 2.102272727272727,
"grad_norm": 132360.765625,
"learning_rate": 2.372159090909091e-05,
"loss": 0.2097,
"step": 740
},
{
"epoch": 2.1306818181818183,
"grad_norm": 146930.046875,
"learning_rate": 2.3366477272727273e-05,
"loss": 0.1884,
"step": 750
},
{
"epoch": 2.159090909090909,
"grad_norm": 246238.796875,
"learning_rate": 2.3011363636363636e-05,
"loss": 0.1969,
"step": 760
},
{
"epoch": 2.1875,
"grad_norm": 232657.203125,
"learning_rate": 2.2656250000000002e-05,
"loss": 0.1925,
"step": 770
},
{
"epoch": 2.215909090909091,
"grad_norm": 227103.3125,
"learning_rate": 2.2301136363636365e-05,
"loss": 0.1851,
"step": 780
},
{
"epoch": 2.2443181818181817,
"grad_norm": 171326.71875,
"learning_rate": 2.1946022727272727e-05,
"loss": 0.2253,
"step": 790
},
{
"epoch": 2.2727272727272725,
"grad_norm": 121495.1953125,
"learning_rate": 2.1590909090909093e-05,
"loss": 0.1867,
"step": 800
},
{
"epoch": 2.2727272727272725,
"eval_f1": 0.9550551797792809,
"eval_loss": 0.17148956656455994,
"eval_runtime": 203.949,
"eval_samples_per_second": 55.092,
"eval_steps_per_second": 3.447,
"step": 800
},
{
"epoch": 2.3011363636363638,
"grad_norm": 238023.546875,
"learning_rate": 2.1235795454545456e-05,
"loss": 0.2143,
"step": 810
},
{
"epoch": 2.3295454545454546,
"grad_norm": 215472.78125,
"learning_rate": 2.088068181818182e-05,
"loss": 0.1681,
"step": 820
},
{
"epoch": 2.3579545454545454,
"grad_norm": 185951.046875,
"learning_rate": 2.0525568181818185e-05,
"loss": 0.2,
"step": 830
},
{
"epoch": 2.3863636363636362,
"grad_norm": 288287.34375,
"learning_rate": 2.0170454545454544e-05,
"loss": 0.1899,
"step": 840
},
{
"epoch": 2.4147727272727275,
"grad_norm": 184342.796875,
"learning_rate": 1.981534090909091e-05,
"loss": 0.1898,
"step": 850
},
{
"epoch": 2.4431818181818183,
"grad_norm": 143657.375,
"learning_rate": 1.9460227272727273e-05,
"loss": 0.1707,
"step": 860
},
{
"epoch": 2.471590909090909,
"grad_norm": 142439.578125,
"learning_rate": 1.9105113636363636e-05,
"loss": 0.1505,
"step": 870
},
{
"epoch": 2.5,
"grad_norm": 255553.71875,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.2047,
"step": 880
},
{
"epoch": 2.528409090909091,
"grad_norm": 217335.078125,
"learning_rate": 1.8394886363636364e-05,
"loss": 0.18,
"step": 890
},
{
"epoch": 2.5568181818181817,
"grad_norm": 143375.3125,
"learning_rate": 1.8039772727272727e-05,
"loss": 0.2372,
"step": 900
},
{
"epoch": 2.5852272727272725,
"grad_norm": 325331.0625,
"learning_rate": 1.7684659090909093e-05,
"loss": 0.2047,
"step": 910
},
{
"epoch": 2.6136363636363638,
"grad_norm": 160601.78125,
"learning_rate": 1.7329545454545456e-05,
"loss": 0.1999,
"step": 920
},
{
"epoch": 2.6420454545454546,
"grad_norm": 114873.859375,
"learning_rate": 1.697443181818182e-05,
"loss": 0.1736,
"step": 930
},
{
"epoch": 2.6704545454545454,
"grad_norm": 191060.78125,
"learning_rate": 1.6619318181818185e-05,
"loss": 0.1809,
"step": 940
},
{
"epoch": 2.6988636363636362,
"grad_norm": 303838.96875,
"learning_rate": 1.6264204545454544e-05,
"loss": 0.238,
"step": 950
},
{
"epoch": 2.7272727272727275,
"grad_norm": 92415.265625,
"learning_rate": 1.590909090909091e-05,
"loss": 0.137,
"step": 960
},
{
"epoch": 2.7556818181818183,
"grad_norm": 227939.296875,
"learning_rate": 1.5553977272727273e-05,
"loss": 0.1811,
"step": 970
},
{
"epoch": 2.784090909090909,
"grad_norm": 244860.359375,
"learning_rate": 1.5198863636363636e-05,
"loss": 0.2235,
"step": 980
},
{
"epoch": 2.8125,
"grad_norm": 199524.078125,
"learning_rate": 1.484375e-05,
"loss": 0.1885,
"step": 990
},
{
"epoch": 2.840909090909091,
"grad_norm": 245456.046875,
"learning_rate": 1.4488636363636366e-05,
"loss": 0.2261,
"step": 1000
},
{
"epoch": 2.8693181818181817,
"grad_norm": 291130.96875,
"learning_rate": 1.4133522727272727e-05,
"loss": 0.1767,
"step": 1010
},
{
"epoch": 2.8977272727272725,
"grad_norm": 119223.3046875,
"learning_rate": 1.3778409090909091e-05,
"loss": 0.1589,
"step": 1020
},
{
"epoch": 2.9261363636363638,
"grad_norm": 205424.078125,
"learning_rate": 1.3423295454545456e-05,
"loss": 0.1666,
"step": 1030
},
{
"epoch": 2.9545454545454546,
"grad_norm": 177895.84375,
"learning_rate": 1.3068181818181819e-05,
"loss": 0.1572,
"step": 1040
},
{
"epoch": 2.9829545454545454,
"grad_norm": 337598.78125,
"learning_rate": 1.2713068181818183e-05,
"loss": 0.1938,
"step": 1050
},
{
"epoch": 3.0113636363636362,
"grad_norm": 173000.0,
"learning_rate": 1.2357954545454546e-05,
"loss": 0.1126,
"step": 1060
},
{
"epoch": 3.039772727272727,
"grad_norm": 97144.171875,
"learning_rate": 1.200284090909091e-05,
"loss": 0.0462,
"step": 1070
},
{
"epoch": 3.0681818181818183,
"grad_norm": 54899.234375,
"learning_rate": 1.1647727272727273e-05,
"loss": 0.0615,
"step": 1080
},
{
"epoch": 3.096590909090909,
"grad_norm": 36492.046875,
"learning_rate": 1.1292613636363637e-05,
"loss": 0.0491,
"step": 1090
},
{
"epoch": 3.125,
"grad_norm": 37996.1953125,
"learning_rate": 1.09375e-05,
"loss": 0.0562,
"step": 1100
},
{
"epoch": 3.153409090909091,
"grad_norm": 190393.703125,
"learning_rate": 1.0582386363636364e-05,
"loss": 0.054,
"step": 1110
},
{
"epoch": 3.1818181818181817,
"grad_norm": 179904.40625,
"learning_rate": 1.0227272727272729e-05,
"loss": 0.0728,
"step": 1120
},
{
"epoch": 3.210227272727273,
"grad_norm": 100628.515625,
"learning_rate": 9.872159090909091e-06,
"loss": 0.0625,
"step": 1130
},
{
"epoch": 3.2386363636363638,
"grad_norm": 118374.3984375,
"learning_rate": 9.517045454545454e-06,
"loss": 0.0569,
"step": 1140
},
{
"epoch": 3.2670454545454546,
"grad_norm": 75175.8359375,
"learning_rate": 9.161931818181818e-06,
"loss": 0.0436,
"step": 1150
},
{
"epoch": 3.2954545454545454,
"grad_norm": 158238.78125,
"learning_rate": 8.806818181818183e-06,
"loss": 0.079,
"step": 1160
},
{
"epoch": 3.3238636363636362,
"grad_norm": 68349.515625,
"learning_rate": 8.451704545454546e-06,
"loss": 0.056,
"step": 1170
},
{
"epoch": 3.3522727272727275,
"grad_norm": 43816.8671875,
"learning_rate": 8.09659090909091e-06,
"loss": 0.0443,
"step": 1180
},
{
"epoch": 3.3806818181818183,
"grad_norm": 61632.68359375,
"learning_rate": 7.741477272727273e-06,
"loss": 0.0554,
"step": 1190
},
{
"epoch": 3.409090909090909,
"grad_norm": 60831.44140625,
"learning_rate": 7.386363636363637e-06,
"loss": 0.0871,
"step": 1200
},
{
"epoch": 3.409090909090909,
"eval_f1": 0.9917230331078676,
"eval_loss": 0.0417679101228714,
"eval_runtime": 204.1321,
"eval_samples_per_second": 55.043,
"eval_steps_per_second": 3.444,
"step": 1200
},
{
"epoch": 3.4375,
"grad_norm": 90207.28125,
"learning_rate": 7.031250000000001e-06,
"loss": 0.0676,
"step": 1210
},
{
"epoch": 3.465909090909091,
"grad_norm": 63487.5546875,
"learning_rate": 6.676136363636363e-06,
"loss": 0.0346,
"step": 1220
},
{
"epoch": 3.4943181818181817,
"grad_norm": 83902.515625,
"learning_rate": 6.321022727272729e-06,
"loss": 0.0587,
"step": 1230
},
{
"epoch": 3.5227272727272725,
"grad_norm": 26082.44921875,
"learning_rate": 5.965909090909091e-06,
"loss": 0.0385,
"step": 1240
},
{
"epoch": 3.5511363636363638,
"grad_norm": 71738.4140625,
"learning_rate": 5.610795454545455e-06,
"loss": 0.0497,
"step": 1250
},
{
"epoch": 3.5795454545454546,
"grad_norm": 115759.3671875,
"learning_rate": 5.255681818181818e-06,
"loss": 0.0679,
"step": 1260
},
{
"epoch": 3.6079545454545454,
"grad_norm": 49416.74609375,
"learning_rate": 4.900568181818182e-06,
"loss": 0.0565,
"step": 1270
},
{
"epoch": 3.6363636363636362,
"grad_norm": 164339.484375,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.0374,
"step": 1280
},
{
"epoch": 3.6647727272727275,
"grad_norm": 74746.796875,
"learning_rate": 4.190340909090909e-06,
"loss": 0.0382,
"step": 1290
},
{
"epoch": 3.6931818181818183,
"grad_norm": 29929.04296875,
"learning_rate": 3.835227272727273e-06,
"loss": 0.039,
"step": 1300
},
{
"epoch": 3.721590909090909,
"grad_norm": 59106.06640625,
"learning_rate": 3.480113636363636e-06,
"loss": 0.0376,
"step": 1310
},
{
"epoch": 3.75,
"grad_norm": 187797.71875,
"learning_rate": 3.125e-06,
"loss": 0.056,
"step": 1320
},
{
"epoch": 3.778409090909091,
"grad_norm": 42829.46875,
"learning_rate": 2.7698863636363637e-06,
"loss": 0.0434,
"step": 1330
},
{
"epoch": 3.8068181818181817,
"grad_norm": 252679.109375,
"learning_rate": 2.4147727272727273e-06,
"loss": 0.0502,
"step": 1340
},
{
"epoch": 3.8352272727272725,
"grad_norm": 35090.86328125,
"learning_rate": 2.059659090909091e-06,
"loss": 0.0686,
"step": 1350
},
{
"epoch": 3.8636363636363638,
"grad_norm": 287442.9375,
"learning_rate": 1.7045454545454546e-06,
"loss": 0.0579,
"step": 1360
},
{
"epoch": 3.8920454545454546,
"grad_norm": 241179.890625,
"learning_rate": 1.3494318181818183e-06,
"loss": 0.065,
"step": 1370
},
{
"epoch": 3.9204545454545454,
"grad_norm": 20388.59765625,
"learning_rate": 9.943181818181819e-07,
"loss": 0.0281,
"step": 1380
},
{
"epoch": 3.9488636363636362,
"grad_norm": 44893.046875,
"learning_rate": 6.392045454545455e-07,
"loss": 0.0297,
"step": 1390
},
{
"epoch": 3.9772727272727275,
"grad_norm": 30813.0546875,
"learning_rate": 2.840909090909091e-07,
"loss": 0.048,
"step": 1400
},
{
"epoch": 4.0,
"step": 1408,
"total_flos": 3.4828624117074493e+18,
"train_loss": 0.40373469023457303,
"train_runtime": 1995.1511,
"train_samples_per_second": 22.527,
"train_steps_per_second": 0.706
}
],
"logging_steps": 10,
"max_steps": 1408,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.4828624117074493e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}