{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 100.0,
"eval_steps": 100,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.32,
"grad_norm": 22.61836051940918,
"learning_rate": 5.193376768186383e-06,
"loss": 28.116,
"step": 10
},
{
"epoch": 0.64,
"grad_norm": 12.9418363571167,
"learning_rate": 6.666429955495757e-06,
"loss": 26.7925,
"step": 20
},
{
"epoch": 0.96,
"grad_norm": 10.848026275634766,
"learning_rate": 7.528110831639518e-06,
"loss": 25.6575,
"step": 30
},
{
"epoch": 1.256,
"grad_norm": 10.735931396484375,
"learning_rate": 8.085678584707068e-06,
"loss": 23.0539,
"step": 40
},
{
"epoch": 1.576,
"grad_norm": 10.395103454589844,
"learning_rate": 8.57076623169122e-06,
"loss": 24.4311,
"step": 50
},
{
"epoch": 1.896,
"grad_norm": 10.887977600097656,
"learning_rate": 8.965446093665792e-06,
"loss": 24.2035,
"step": 60
},
{
"epoch": 2.192,
"grad_norm": 10.915943145751953,
"learning_rate": 9.267156540931188e-06,
"loss": 21.8915,
"step": 70
},
{
"epoch": 2.512,
"grad_norm": 10.198206901550293,
"learning_rate": 9.558731772016444e-06,
"loss": 23.4517,
"step": 80
},
{
"epoch": 2.832,
"grad_norm": 10.81621265411377,
"learning_rate": 9.815086334040616e-06,
"loss": 23.2951,
"step": 90
},
{
"epoch": 3.128,
"grad_norm": 12.31503677368164,
"learning_rate": 9.999997695170515e-06,
"loss": 21.3033,
"step": 100
},
{
"epoch": 3.448,
"grad_norm": 12.238049507141113,
"learning_rate": 9.999721118489014e-06,
"loss": 22.7153,
"step": 110
},
{
"epoch": 3.768,
"grad_norm": 10.42602252960205,
"learning_rate": 9.998983608373495e-06,
"loss": 22.6426,
"step": 120
},
{
"epoch": 4.064,
"grad_norm": 11.45156192779541,
"learning_rate": 9.997785240371542e-06,
"loss": 20.7914,
"step": 130
},
{
"epoch": 4.384,
"grad_norm": 12.555192947387695,
"learning_rate": 9.996126137239184e-06,
"loss": 22.1085,
"step": 140
},
{
"epoch": 4.704,
"grad_norm": 11.370450019836426,
"learning_rate": 9.994006468928306e-06,
"loss": 22.0096,
"step": 150
},
{
"epoch": 5.0,
"grad_norm": 12.388717651367188,
"learning_rate": 9.991705162704631e-06,
"loss": 20.3489,
"step": 160
},
{
"epoch": 5.32,
"grad_norm": 11.950061798095703,
"learning_rate": 9.988711057723813e-06,
"loss": 21.4641,
"step": 170
},
{
"epoch": 5.64,
"grad_norm": 12.537701606750488,
"learning_rate": 9.985257147135564e-06,
"loss": 21.4924,
"step": 180
},
{
"epoch": 5.96,
"grad_norm": 11.869683265686035,
"learning_rate": 9.98134378474467e-06,
"loss": 21.3469,
"step": 190
},
{
"epoch": 6.256,
"grad_norm": 13.779632568359375,
"learning_rate": 9.97742925764817e-06,
"loss": 19.3486,
"step": 200
},
{
"epoch": 6.576,
"grad_norm": 13.29927921295166,
"learning_rate": 9.97264408009259e-06,
"loss": 20.8448,
"step": 210
},
{
"epoch": 6.896,
"grad_norm": 13.365044593811035,
"learning_rate": 9.967400742766507e-06,
"loss": 20.8125,
"step": 220
},
{
"epoch": 7.192,
"grad_norm": 13.839238166809082,
"learning_rate": 9.962290455518914e-06,
"loss": 19.0171,
"step": 230
},
{
"epoch": 7.5120000000000005,
"grad_norm": 15.155104637145996,
"learning_rate": 9.956178133102374e-06,
"loss": 20.2994,
"step": 240
},
{
"epoch": 7.832,
"grad_norm": 15.683795928955078,
"learning_rate": 9.949609337621082e-06,
"loss": 20.2594,
"step": 250
},
{
"epoch": 8.128,
"grad_norm": 15.537983894348145,
"learning_rate": 9.94258474195617e-06,
"loss": 18.5829,
"step": 260
},
{
"epoch": 8.448,
"grad_norm": 16.607322692871094,
"learning_rate": 9.935105065679127e-06,
"loss": 19.7508,
"step": 270
},
{
"epoch": 8.768,
"grad_norm": 15.696822166442871,
"learning_rate": 9.927171074978072e-06,
"loss": 19.5839,
"step": 280
},
{
"epoch": 9.064,
"grad_norm": 16.520605087280273,
"learning_rate": 9.91964271527259e-06,
"loss": 18.1771,
"step": 290
},
{
"epoch": 9.384,
"grad_norm": 17.410171508789062,
"learning_rate": 9.910847804623571e-06,
"loss": 19.0556,
"step": 300
},
{
"epoch": 9.704,
"grad_norm": 17.677932739257812,
"learning_rate": 9.901601064367343e-06,
"loss": 19.0936,
"step": 310
},
{
"epoch": 10.0,
"grad_norm": 17.897781372070312,
"learning_rate": 9.892893465727831e-06,
"loss": 17.7231,
"step": 320
},
{
"epoch": 10.32,
"grad_norm": 19.219694137573242,
"learning_rate": 9.88279089692669e-06,
"loss": 18.4743,
"step": 330
},
{
"epoch": 10.64,
"grad_norm": 18.7277774810791,
"learning_rate": 9.872239372555743e-06,
"loss": 18.4844,
"step": 340
},
{
"epoch": 10.96,
"grad_norm": 21.37215232849121,
"learning_rate": 9.86123997347095e-06,
"loss": 18.4988,
"step": 350
},
{
"epoch": 11.256,
"grad_norm": 23.783323287963867,
"learning_rate": 9.850958511714194e-06,
"loss": 16.5981,
"step": 360
},
{
"epoch": 11.576,
"grad_norm": 23.457868576049805,
"learning_rate": 9.839111292652133e-06,
"loss": 17.8158,
"step": 370
},
{
"epoch": 11.896,
"grad_norm": 23.80459213256836,
"learning_rate": 9.826819592384226e-06,
"loss": 17.8393,
"step": 380
},
{
"epoch": 12.192,
"grad_norm": 24.82889747619629,
"learning_rate": 9.814084670023088e-06,
"loss": 16.1185,
"step": 390
},
{
"epoch": 12.512,
"grad_norm": 28.92336082458496,
"learning_rate": 9.800907830083227e-06,
"loss": 17.25,
"step": 400
},
{
"epoch": 12.832,
"grad_norm": 27.11651611328125,
"learning_rate": 9.787290422347427e-06,
"loss": 17.2422,
"step": 410
},
{
"epoch": 13.128,
"grad_norm": 30.726104736328125,
"learning_rate": 9.774659221884884e-06,
"loss": 15.6421,
"step": 420
},
{
"epoch": 13.448,
"grad_norm": 27.514354705810547,
"learning_rate": 9.760208615506392e-06,
"loss": 16.4979,
"step": 430
},
{
"epoch": 13.768,
"grad_norm": 31.63400650024414,
"learning_rate": 9.745321610396716e-06,
"loss": 16.5669,
"step": 440
},
{
"epoch": 14.064,
"grad_norm": 31.240859985351562,
"learning_rate": 9.731551444345466e-06,
"loss": 15.132,
"step": 450
},
{
"epoch": 14.384,
"grad_norm": 34.0392951965332,
"learning_rate": 9.715839519747009e-06,
"loss": 15.7479,
"step": 460
},
{
"epoch": 14.704,
"grad_norm": 31.50341033935547,
"learning_rate": 9.69969574141008e-06,
"loss": 15.8802,
"step": 470
},
{
"epoch": 15.0,
"grad_norm": 30.895503997802734,
"learning_rate": 9.684798471835533e-06,
"loss": 14.6161,
"step": 480
},
{
"epoch": 15.32,
"grad_norm": 34.22053146362305,
"learning_rate": 9.667838763776003e-06,
"loss": 15.0823,
"step": 490
},
{
"epoch": 15.64,
"grad_norm": 40.52189636230469,
"learning_rate": 9.650452118983454e-06,
"loss": 15.0497,
"step": 500
},
{
"epoch": 15.96,
"grad_norm": 37.59934616088867,
"learning_rate": 9.632640318476236e-06,
"loss": 15.1599,
"step": 510
},
{
"epoch": 16.256,
"grad_norm": 44.10334396362305,
"learning_rate": 9.614405186823954e-06,
"loss": 13.3485,
"step": 520
},
{
"epoch": 16.576,
"grad_norm": 39.68657302856445,
"learning_rate": 9.595748591960559e-06,
"loss": 14.352,
"step": 530
},
{
"epoch": 16.896,
"grad_norm": 35.99330520629883,
"learning_rate": 9.576672444993012e-06,
"loss": 14.4162,
"step": 540
},
{
"epoch": 17.192,
"grad_norm": 46.91948699951172,
"learning_rate": 9.55914680983895e-06,
"loss": 12.9685,
"step": 550
},
{
"epoch": 17.512,
"grad_norm": 46.87045669555664,
"learning_rate": 9.539278932734926e-06,
"loss": 13.6413,
"step": 560
},
{
"epoch": 17.832,
"grad_norm": 48.02497100830078,
"learning_rate": 9.51899728805278e-06,
"loss": 13.5406,
"step": 570
},
{
"epoch": 18.128,
"grad_norm": 44.37493896484375,
"learning_rate": 9.500391752799266e-06,
"loss": 12.2899,
"step": 580
},
{
"epoch": 18.448,
"grad_norm": 47.000877380371094,
"learning_rate": 9.479329708285107e-06,
"loss": 12.7494,
"step": 590
},
{
"epoch": 18.768,
"grad_norm": 47.82246780395508,
"learning_rate": 9.4578600371516e-06,
"loss": 13.0171,
"step": 600
},
{
"epoch": 19.064,
"grad_norm": 44.77831268310547,
"learning_rate": 9.43819062920712e-06,
"loss": 11.778,
"step": 610
},
{
"epoch": 19.384,
"grad_norm": 47.652130126953125,
"learning_rate": 9.415952560807661e-06,
"loss": 12.1909,
"step": 620
},
{
"epoch": 19.704,
"grad_norm": 45.41845703125,
"learning_rate": 9.393313357890357e-06,
"loss": 12.3073,
"step": 630
},
{
"epoch": 20.0,
"grad_norm": 55.55602264404297,
"learning_rate": 9.370275339524604e-06,
"loss": 11.2453,
"step": 640
},
{
"epoch": 20.32,
"grad_norm": 54.18324279785156,
"learning_rate": 9.34684086563286e-06,
"loss": 11.3862,
"step": 650
},
{
"epoch": 20.64,
"grad_norm": 52.06657028198242,
"learning_rate": 9.3230123367489e-06,
"loss": 11.2495,
"step": 660
},
{
"epoch": 20.96,
"grad_norm": 52.56513977050781,
"learning_rate": 9.298792193771915e-06,
"loss": 11.4311,
"step": 670
},
{
"epoch": 21.256,
"grad_norm": 55.839622497558594,
"learning_rate": 9.276661284776841e-06,
"loss": 9.9001,
"step": 680
},
{
"epoch": 21.576,
"grad_norm": 54.41539764404297,
"learning_rate": 9.251703943168894e-06,
"loss": 10.7236,
"step": 690
},
{
"epoch": 21.896,
"grad_norm": 54.44001007080078,
"learning_rate": 9.226362292014552e-06,
"loss": 10.793,
"step": 700
},
{
"epoch": 22.192,
"grad_norm": 56.16716003417969,
"learning_rate": 9.203228366013518e-06,
"loss": 9.4557,
"step": 710
},
{
"epoch": 22.512,
"grad_norm": 62.475990295410156,
"learning_rate": 9.177163710741043e-06,
"loss": 10.1059,
"step": 720
},
{
"epoch": 22.832,
"grad_norm": 60.76034164428711,
"learning_rate": 9.150722381525448e-06,
"loss": 10.0807,
"step": 730
},
{
"epoch": 23.128,
"grad_norm": 57.365806579589844,
"learning_rate": 9.126605366828865e-06,
"loss": 9.0266,
"step": 740
},
{
"epoch": 23.448,
"grad_norm": 65.74173736572266,
"learning_rate": 9.099455850827047e-06,
"loss": 9.4513,
"step": 750
},
{
"epoch": 23.768,
"grad_norm": 63.232688903808594,
"learning_rate": 9.071937620964472e-06,
"loss": 9.3833,
"step": 760
},
{
"epoch": 24.064,
"grad_norm": 55.957969665527344,
"learning_rate": 9.044053496098546e-06,
"loss": 8.5141,
"step": 770
},
{
"epoch": 24.384,
"grad_norm": 59.8948860168457,
"learning_rate": 9.015806332567492e-06,
"loss": 9.0655,
"step": 780
},
{
"epoch": 24.704,
"grad_norm": 62.30965042114258,
"learning_rate": 8.987199023897762e-06,
"loss": 8.6284,
"step": 790
},
{
"epoch": 25.0,
"grad_norm": 65.89315795898438,
"learning_rate": 8.961146943234696e-06,
"loss": 8.191,
"step": 800
},
{
"epoch": 25.32,
"grad_norm": 57.95423126220703,
"learning_rate": 8.931863462353107e-06,
"loss": 8.1047,
"step": 810
},
{
"epoch": 25.64,
"grad_norm": 70.34273529052734,
"learning_rate": 8.902228435104725e-06,
"loss": 8.6509,
"step": 820
},
{
"epoch": 25.96,
"grad_norm": 53.95098114013672,
"learning_rate": 8.872244897183212e-06,
"loss": 8.2501,
"step": 830
},
{
"epoch": 26.256,
"grad_norm": 62.12706756591797,
"learning_rate": 8.844964274202623e-06,
"loss": 7.0761,
"step": 840
},
{
"epoch": 26.576,
"grad_norm": 61.21213150024414,
"learning_rate": 8.814327056944412e-06,
"loss": 7.9348,
"step": 850
},
{
"epoch": 26.896,
"grad_norm": 72.10478210449219,
"learning_rate": 8.783350333278674e-06,
"loss": 7.8212,
"step": 860
},
{
"epoch": 27.192,
"grad_norm": 69.69745635986328,
"learning_rate": 8.755183625881033e-06,
"loss": 6.836,
"step": 870
},
{
"epoch": 27.512,
"grad_norm": 65.09381103515625,
"learning_rate": 8.723570610500632e-06,
"loss": 7.0539,
"step": 880
},
{
"epoch": 27.832,
"grad_norm": 56.70985794067383,
"learning_rate": 8.691627385440082e-06,
"loss": 7.3318,
"step": 890
},
{
"epoch": 28.128,
"grad_norm": 54.06085205078125,
"learning_rate": 8.659357222835593e-06,
"loss": 6.5638,
"step": 900
},
{
"epoch": 28.448,
"grad_norm": 63.56743240356445,
"learning_rate": 8.626763428313554e-06,
"loss": 6.803,
"step": 910
},
{
"epoch": 28.768,
"grad_norm": 56.118526458740234,
"learning_rate": 8.593849340651913e-06,
"loss": 6.8041,
"step": 920
},
{
"epoch": 29.064,
"grad_norm": 56.52422332763672,
"learning_rate": 8.56395559706677e-06,
"loss": 6.3728,
"step": 930
},
{
"epoch": 29.384,
"grad_norm": 61.43674850463867,
"learning_rate": 8.530442268002263e-06,
"loss": 6.4516,
"step": 940
},
{
"epoch": 29.704,
"grad_norm": 54.28837203979492,
"learning_rate": 8.496618512552564e-06,
"loss": 6.2512,
"step": 950
},
{
"epoch": 30.0,
"grad_norm": 61.01597213745117,
"learning_rate": 8.465914581069757e-06,
"loss": 6.0489,
"step": 960
},
{
"epoch": 30.32,
"grad_norm": 52.78199768066406,
"learning_rate": 8.431510586925946e-06,
"loss": 6.3215,
"step": 970
},
{
"epoch": 30.64,
"grad_norm": 66.41915130615234,
"learning_rate": 8.39680630056266e-06,
"loss": 6.177,
"step": 980
},
{
"epoch": 30.96,
"grad_norm": 73.75415802001953,
"learning_rate": 8.361805276948188e-06,
"loss": 6.2312,
"step": 990
},
{
"epoch": 31.256,
"grad_norm": 61.83228302001953,
"learning_rate": 8.3300536080273e-06,
"loss": 5.5059,
"step": 1000
},
{
"epoch": 31.576,
"grad_norm": 56.67393493652344,
"learning_rate": 8.294498686142642e-06,
"loss": 5.6519,
"step": 1010
},
{
"epoch": 31.896,
"grad_norm": 62.11791229248047,
"learning_rate": 8.258657506989624e-06,
"loss": 5.9147,
"step": 1020
},
{
"epoch": 32.192,
"grad_norm": 58.08525085449219,
"learning_rate": 8.222533741995281e-06,
"loss": 5.0185,
"step": 1030
},
{
"epoch": 32.512,
"grad_norm": 59.7594108581543,
"learning_rate": 8.186131091533624e-06,
"loss": 5.5372,
"step": 1040
},
{
"epoch": 32.832,
"grad_norm": 57.91405487060547,
"learning_rate": 8.149453284546562e-06,
"loss": 5.7239,
"step": 1050
},
{
"epoch": 33.128,
"grad_norm": 56.105133056640625,
"learning_rate": 8.116211104118957e-06,
"loss": 5.0643,
"step": 1060
},
{
"epoch": 33.448,
"grad_norm": 57.08987045288086,
"learning_rate": 8.079020873606434e-06,
"loss": 5.1807,
"step": 1070
},
{
"epoch": 33.768,
"grad_norm": 53.906761169433594,
"learning_rate": 8.04156645851083e-06,
"loss": 5.3584,
"step": 1080
},
{
"epoch": 34.064,
"grad_norm": 50.738651275634766,
"learning_rate": 8.00763477756982e-06,
"loss": 4.6149,
"step": 1090
},
{
"epoch": 34.384,
"grad_norm": 62.51714324951172,
"learning_rate": 7.969689003871167e-06,
"loss": 5.1416,
"step": 1100
},
{
"epoch": 34.704,
"grad_norm": 64.98294830322266,
"learning_rate": 7.931490245108871e-06,
"loss": 5.001,
"step": 1110
},
{
"epoch": 35.0,
"grad_norm": 48.71992111206055,
"learning_rate": 7.89689829349943e-06,
"loss": 4.5248,
"step": 1120
},
{
"epoch": 35.32,
"grad_norm": 49.821617126464844,
"learning_rate": 7.858229664328653e-06,
"loss": 4.6947,
"step": 1130
},
{
"epoch": 35.64,
"grad_norm": 60.61543655395508,
"learning_rate": 7.819319467543181e-06,
"loss": 4.7697,
"step": 1140
},
{
"epoch": 35.96,
"grad_norm": 55.65413284301758,
"learning_rate": 7.780171688947942e-06,
"loss": 4.7358,
"step": 1150
},
{
"epoch": 36.256,
"grad_norm": 58.530906677246094,
"learning_rate": 7.740790338684804e-06,
"loss": 4.1803,
"step": 1160
},
{
"epoch": 36.576,
"grad_norm": 59.444671630859375,
"learning_rate": 7.701179450821773e-06,
"loss": 4.4415,
"step": 1170
},
{
"epoch": 36.896,
"grad_norm": 49.884071350097656,
"learning_rate": 7.661343082939769e-06,
"loss": 4.7063,
"step": 1180
},
{
"epoch": 37.192,
"grad_norm": 52.532249450683594,
"learning_rate": 7.625300938639757e-06,
"loss": 4.1235,
"step": 1190
},
{
"epoch": 37.512,
"grad_norm": 51.9701042175293,
"learning_rate": 7.58504741974322e-06,
"loss": 4.4617,
"step": 1200
},
{
"epoch": 37.832,
"grad_norm": 49.44038391113281,
"learning_rate": 7.544580316928487e-06,
"loss": 4.4452,
"step": 1210
},
{
"epoch": 38.128,
"grad_norm": 50.28534698486328,
"learning_rate": 7.507980735780601e-06,
"loss": 3.9453,
"step": 1220
},
{
"epoch": 38.448,
"grad_norm": 46.916019439697266,
"learning_rate": 7.467119261530757e-06,
"loss": 4.2222,
"step": 1230
},
{
"epoch": 38.768,
"grad_norm": 49.836448669433594,
"learning_rate": 7.426056283451572e-06,
"loss": 4.3332,
"step": 1240
},
{
"epoch": 39.064,
"grad_norm": 44.598793029785156,
"learning_rate": 7.388930793516118e-06,
"loss": 3.9014,
"step": 1250
},
{
"epoch": 39.384,
"grad_norm": 42.805973052978516,
"learning_rate": 7.347496563305526e-06,
"loss": 4.1539,
"step": 1260
},
{
"epoch": 39.704,
"grad_norm": 48.368324279785156,
"learning_rate": 7.3058730829363485e-06,
"loss": 3.9781,
"step": 1270
},
{
"epoch": 40.0,
"grad_norm": 50.3718147277832,
"learning_rate": 7.264064616151436e-06,
"loss": 3.7175,
"step": 1280
},
{
"epoch": 40.32,
"grad_norm": 52.81575012207031,
"learning_rate": 7.222075445642904e-06,
"loss": 3.8827,
"step": 1290
},
{
"epoch": 40.64,
"grad_norm": 53.90591812133789,
"learning_rate": 7.17990987261344e-06,
"loss": 3.9467,
"step": 1300
},
{
"epoch": 40.96,
"grad_norm": 46.753875732421875,
"learning_rate": 7.1375722163356945e-06,
"loss": 4.0143,
"step": 1310
},
{
"epoch": 41.256,
"grad_norm": 43.555267333984375,
"learning_rate": 7.099324778599362e-06,
"loss": 3.396,
"step": 1320
},
{
"epoch": 41.576,
"grad_norm": 43.159061431884766,
"learning_rate": 7.056672126521037e-06,
"loss": 3.888,
"step": 1330
},
{
"epoch": 41.896,
"grad_norm": 48.97645950317383,
"learning_rate": 7.0138600151762305e-06,
"loss": 3.8203,
"step": 1340
},
{
"epoch": 42.192,
"grad_norm": 50.10870361328125,
"learning_rate": 6.975196401579741e-06,
"loss": 3.5406,
"step": 1350
},
{
"epoch": 42.512,
"grad_norm": 44.10620880126953,
"learning_rate": 6.9320934128234985e-06,
"loss": 3.4479,
"step": 1360
},
{
"epoch": 42.832,
"grad_norm": 49.795814514160156,
"learning_rate": 6.8888437261459315e-06,
"loss": 3.7759,
"step": 1370
},
{
"epoch": 43.128,
"grad_norm": 44.46809005737305,
"learning_rate": 6.849797242773753e-06,
"loss": 3.2658,
"step": 1380
},
{
"epoch": 43.448,
"grad_norm": 45.29841232299805,
"learning_rate": 6.80628104764508e-06,
"loss": 3.4485,
"step": 1390
},
{
"epoch": 43.768,
"grad_norm": 43.17744445800781,
"learning_rate": 6.762631042310571e-06,
"loss": 3.6471,
"step": 1400
},
{
"epoch": 44.064,
"grad_norm": 43.51730728149414,
"learning_rate": 6.718851698102184e-06,
"loss": 3.2223,
"step": 1410
},
{
"epoch": 44.384,
"grad_norm": 44.667606353759766,
"learning_rate": 6.674947499600837e-06,
"loss": 3.4478,
"step": 1420
},
{
"epoch": 44.704,
"grad_norm": 43.00602340698242,
"learning_rate": 6.630922944177019e-06,
"loss": 3.2787,
"step": 1430
},
{
"epoch": 45.0,
"grad_norm": 41.20823669433594,
"learning_rate": 6.591201666146107e-06,
"loss": 3.2353,
"step": 1440
},
{
"epoch": 45.32,
"grad_norm": 44.2363395690918,
"learning_rate": 6.546960866608958e-06,
"loss": 3.3232,
"step": 1450
},
{
"epoch": 45.64,
"grad_norm": 34.35457992553711,
"learning_rate": 6.5026128205881235e-06,
"loss": 3.2552,
"step": 1460
},
{
"epoch": 45.96,
"grad_norm": 42.78106689453125,
"learning_rate": 6.458162070920059e-06,
"loss": 3.6029,
"step": 1470
},
{
"epoch": 46.256,
"grad_norm": 41.89685821533203,
"learning_rate": 6.41807234774012e-06,
"loss": 3.1435,
"step": 1480
},
{
"epoch": 46.576,
"grad_norm": 35.95709991455078,
"learning_rate": 6.373439013960264e-06,
"loss": 3.0505,
"step": 1490
},
{
"epoch": 46.896,
"grad_norm": 40.73233413696289,
"learning_rate": 6.328716208581277e-06,
"loss": 3.2412,
"step": 1500
},
{
"epoch": 47.192,
"grad_norm": 38.4586296081543,
"learning_rate": 6.288392971723836e-06,
"loss": 3.0233,
"step": 1510
},
{
"epoch": 47.512,
"grad_norm": 39.029319763183594,
"learning_rate": 6.243512798779037e-06,
"loss": 3.2573,
"step": 1520
},
{
"epoch": 47.832,
"grad_norm": 43.8737678527832,
"learning_rate": 6.198556463356931e-06,
"loss": 3.1004,
"step": 1530
},
{
"epoch": 48.128,
"grad_norm": 41.60302734375,
"learning_rate": 6.1535285706047075e-06,
"loss": 2.905,
"step": 1540
},
{
"epoch": 48.448,
"grad_norm": 41.740875244140625,
"learning_rate": 6.108433732999604e-06,
"loss": 3.1313,
"step": 1550
},
{
"epoch": 48.768,
"grad_norm": 38.86418151855469,
"learning_rate": 6.063276569876421e-06,
"loss": 3.115,
"step": 1560
},
{
"epoch": 49.064,
"grad_norm": 42.09383010864258,
"learning_rate": 6.022585657780215e-06,
"loss": 2.8219,
"step": 1570
},
{
"epoch": 49.384,
"grad_norm": 31.668691635131836,
"learning_rate": 5.9773228249238725e-06,
"loss": 2.9306,
"step": 1580
},
{
"epoch": 49.704,
"grad_norm": 40.560951232910156,
"learning_rate": 5.932011097026472e-06,
"loss": 3.0078,
"step": 1590
},
{
"epoch": 50.0,
"grad_norm": 39.417659759521484,
"learning_rate": 5.891192572805754e-06,
"loss": 2.7878,
"step": 1600
},
{
"epoch": 50.32,
"grad_norm": 38.46611022949219,
"learning_rate": 5.84580073556307e-06,
"loss": 3.0789,
"step": 1610
},
{
"epoch": 50.64,
"grad_norm": 43.09978485107422,
"learning_rate": 5.80037347587612e-06,
"loss": 2.9716,
"step": 1620
},
{
"epoch": 50.96,
"grad_norm": 37.992034912109375,
"learning_rate": 5.754915447131693e-06,
"loss": 3.1029,
"step": 1630
},
{
"epoch": 51.256,
"grad_norm": 34.79606628417969,
"learning_rate": 5.713980762289322e-06,
"loss": 2.7127,
"step": 1640
},
{
"epoch": 51.576,
"grad_norm": 34.46636962890625,
"learning_rate": 5.668477103320299e-06,
"loss": 2.9521,
"step": 1650
},
{
"epoch": 51.896,
"grad_norm": 39.093299865722656,
"learning_rate": 5.6229561862306635e-06,
"loss": 2.8132,
"step": 1660
},
{
"epoch": 52.192,
"grad_norm": 36.822364807128906,
"learning_rate": 5.577422674001093e-06,
"loss": 2.6165,
"step": 1670
},
{
"epoch": 52.512,
"grad_norm": 37.18339157104492,
"learning_rate": 5.531881230902461e-06,
"loss": 2.8924,
"step": 1680
},
{
"epoch": 52.832,
"grad_norm": 34.324920654296875,
"learning_rate": 5.486336522018052e-06,
"loss": 2.7838,
"step": 1690
},
{
"epoch": 53.128,
"grad_norm": 34.57423782348633,
"learning_rate": 5.445347347743854e-06,
"loss": 2.5569,
"step": 1700
},
{
"epoch": 53.448,
"grad_norm": 34.08185958862305,
"learning_rate": 5.399809286983922e-06,
"loss": 2.6638,
"step": 1710
},
{
"epoch": 53.768,
"grad_norm": 35.969120025634766,
"learning_rate": 5.354281489359898e-06,
"loss": 2.8466,
"step": 1720
},
{
"epoch": 54.064,
"grad_norm": 36.20486831665039,
"learning_rate": 5.313319101044968e-06,
"loss": 2.6815,
"step": 1730
},
{
"epoch": 54.384,
"grad_norm": 34.20916748046875,
"learning_rate": 5.267823650578826e-06,
"loss": 2.8057,
"step": 1740
},
{
"epoch": 54.704,
"grad_norm": 35.21307373046875,
"learning_rate": 5.222351983329293e-06,
"loss": 2.5881,
"step": 1750
},
{
"epoch": 55.0,
"grad_norm": 37.0841064453125,
"learning_rate": 5.181451667296815e-06,
"loss": 2.5513,
"step": 1760
},
{
"epoch": 55.32,
"grad_norm": 32.83190155029297,
"learning_rate": 5.136038018373682e-06,
"loss": 2.7037,
"step": 1770
},
{
"epoch": 55.64,
"grad_norm": 36.691349029541016,
"learning_rate": 5.0906616522602225e-06,
"loss": 2.4978,
"step": 1780
},
{
"epoch": 55.96,
"grad_norm": 37.95656204223633,
"learning_rate": 5.045327217129888e-06,
"loss": 2.7158,
"step": 1790
},
{
"epoch": 56.256,
"grad_norm": 37.772701263427734,
"learning_rate": 5.000039356860898e-06,
"loss": 2.458,
"step": 1800
},
{
"epoch": 56.576,
"grad_norm": 33.042057037353516,
"learning_rate": 4.954802710560514e-06,
"loss": 2.6958,
"step": 1810
},
{
"epoch": 56.896,
"grad_norm": 34.64514923095703,
"learning_rate": 4.909621912089858e-06,
"loss": 2.6069,
"step": 1820
},
{
"epoch": 57.192,
"grad_norm": 30.218544006347656,
"learning_rate": 4.869010768644479e-06,
"loss": 2.4799,
"step": 1830
},
{
"epoch": 57.512,
"grad_norm": 33.19715881347656,
"learning_rate": 4.82394882646648e-06,
"loss": 2.638,
"step": 1840
},
{
"epoch": 57.832,
"grad_norm": 34.21620178222656,
"learning_rate": 4.778956136266724e-06,
"loss": 2.5429,
"step": 1850
},
{
"epoch": 58.128,
"grad_norm": 30.74477195739746,
"learning_rate": 4.738525734914598e-06,
"loss": 2.2018,
"step": 1860
},
{
"epoch": 58.448,
"grad_norm": 29.80803680419922,
"learning_rate": 4.693677314670168e-06,
"loss": 2.5071,
"step": 1870
},
{
"epoch": 58.768,
"grad_norm": 28.387727737426758,
"learning_rate": 4.648911490896609e-06,
"loss": 2.5778,
"step": 1880
},
{
"epoch": 59.064,
"grad_norm": 38.00436782836914,
"learning_rate": 4.608696659690191e-06,
"loss": 2.3583,
"step": 1890
},
{
"epoch": 59.384,
"grad_norm": 28.832168579101562,
"learning_rate": 4.5641003952558335e-06,
"loss": 2.4408,
"step": 1900
},
{
"epoch": 59.704,
"grad_norm": 30.9281005859375,
"learning_rate": 4.519600000633706e-06,
"loss": 2.572,
"step": 1910
},
{
"epoch": 60.0,
"grad_norm": 29.808483123779297,
"learning_rate": 4.4752000342662435e-06,
"loss": 2.2828,
"step": 1920
},
{
"epoch": 60.32,
"grad_norm": 30.235679626464844,
"learning_rate": 4.430905044308421e-06,
"loss": 2.4467,
"step": 1930
},
{
"epoch": 60.64,
"grad_norm": 34.53474044799805,
"learning_rate": 4.386719568161842e-06,
"loss": 2.6138,
"step": 1940
},
{
"epoch": 60.96,
"grad_norm": 28.18803596496582,
"learning_rate": 4.342648132009955e-06,
"loss": 2.3741,
"step": 1950
},
{
"epoch": 61.256,
"grad_norm": 36.68701934814453,
"learning_rate": 4.3030850751531636e-06,
"loss": 2.265,
"step": 1960
},
{
"epoch": 61.576,
"grad_norm": 28.751218795776367,
"learning_rate": 4.259242742415551e-06,
"loss": 2.323,
"step": 1970
},
{
"epoch": 61.896,
"grad_norm": 36.796180725097656,
"learning_rate": 4.215527507888797e-06,
"loss": 2.4313,
"step": 1980
},
{
"epoch": 62.192,
"grad_norm": 32.65000534057617,
"learning_rate": 4.1762961671398e-06,
"loss": 2.2107,
"step": 1990
},
{
"epoch": 62.512,
"grad_norm": 29.814340591430664,
"learning_rate": 4.1328347450114085e-06,
"loss": 2.3156,
"step": 2000
},
{
"epoch": 62.832,
"grad_norm": 30.403644561767578,
"learning_rate": 4.08951336982354e-06,
"loss": 2.4868,
"step": 2010
},
{
"epoch": 63.128,
"grad_norm": 30.64937400817871,
"learning_rate": 4.050647540325295e-06,
"loss": 2.1469,
"step": 2020
},
{
"epoch": 63.448,
"grad_norm": 33.04718017578125,
"learning_rate": 4.0076044678776885e-06,
"loss": 2.2953,
"step": 2030
},
{
"epoch": 63.768,
"grad_norm": 26.519309997558594,
"learning_rate": 3.964714270459319e-06,
"loss": 2.2162,
"step": 2040
},
{
"epoch": 64.064,
"grad_norm": 30.9384822845459,
"learning_rate": 3.921981341570459e-06,
"loss": 2.1822,
"step": 2050
},
{
"epoch": 64.384,
"grad_norm": 27.196205139160156,
"learning_rate": 3.87941005860143e-06,
"loss": 2.2589,
"step": 2060
},
{
"epoch": 64.704,
"grad_norm": 29.418684005737305,
"learning_rate": 3.837004782384188e-06,
"loss": 2.2247,
"step": 2070
},
{
"epoch": 65.0,
"grad_norm": 31.906282424926758,
"learning_rate": 3.798985560092915e-06,
"loss": 2.122,
"step": 2080
},
{
"epoch": 65.32,
"grad_norm": 30.91475486755371,
"learning_rate": 3.7569076495338595e-06,
"loss": 2.2306,
"step": 2090
},
{
"epoch": 65.64,
"grad_norm": 29.103853225708008,
"learning_rate": 3.7150082943834076e-06,
"loss": 2.3974,
"step": 2100
},
{
"epoch": 65.96,
"grad_norm": 30.623212814331055,
"learning_rate": 3.673291786643919e-06,
"loss": 2.1196,
"step": 2110
},
{
"epoch": 66.256,
"grad_norm": 36.47549819946289,
"learning_rate": 3.635906796468296e-06,
"loss": 2.0359,
"step": 2120
},
{
"epoch": 66.576,
"grad_norm": 32.82271957397461,
"learning_rate": 3.5945494558412943e-06,
"loss": 2.1794,
"step": 2130
},
{
"epoch": 66.896,
"grad_norm": 28.202482223510742,
"learning_rate": 3.5533873019468813e-06,
"loss": 2.1184,
"step": 2140
},
{
"epoch": 67.192,
"grad_norm": 25.35958480834961,
"learning_rate": 3.5165117334443868e-06,
"loss": 2.019,
"step": 2150
},
{
"epoch": 67.512,
"grad_norm": 32.23173141479492,
"learning_rate": 3.4757320338886956e-06,
"loss": 2.1757,
"step": 2160
},
{
"epoch": 67.832,
"grad_norm": 29.19985008239746,
"learning_rate": 3.4351596922470677e-06,
"loss": 2.165,
"step": 2170
},
{
"epoch": 68.128,
"grad_norm": 33.79531478881836,
"learning_rate": 3.394798864587904e-06,
"loss": 1.9395,
"step": 2180
},
{
"epoch": 68.448,
"grad_norm": 25.360973358154297,
"learning_rate": 3.3546536853129603e-06,
"loss": 2.1175,
"step": 2190
},
{
"epoch": 68.768,
"grad_norm": 31.0283145904541,
"learning_rate": 3.314728266733837e-06,
"loss": 2.1479,
"step": 2200
},
{
"epoch": 69.064,
"grad_norm": 28.047056198120117,
"learning_rate": 3.2789866660993095e-06,
"loss": 1.9538,
"step": 2210
},
{
"epoch": 69.384,
"grad_norm": 29.004770278930664,
"learning_rate": 3.2394900412984527e-06,
"loss": 2.1279,
"step": 2220
},
{
"epoch": 69.704,
"grad_norm": 29.955669403076172,
"learning_rate": 3.20022497409649e-06,
"loss": 2.1065,
"step": 2230
},
{
"epoch": 70.0,
"grad_norm": 25.94803810119629,
"learning_rate": 3.1650877201708363e-06,
"loss": 1.975,
"step": 2240
},
{
"epoch": 70.32,
"grad_norm": 27.678482055664062,
"learning_rate": 3.126273673511262e-06,
"loss": 2.0469,
"step": 2250
},
{
"epoch": 70.64,
"grad_norm": 26.973514556884766,
"learning_rate": 3.0877027818824744e-06,
"loss": 1.9968,
"step": 2260
},
{
"epoch": 70.96,
"grad_norm": 25.811176300048828,
"learning_rate": 3.0493789963323385e-06,
"loss": 2.0934,
"step": 2270
},
{
"epoch": 71.256,
"grad_norm": 24.189441680908203,
"learning_rate": 3.015102110184209e-06,
"loss": 1.8912,
"step": 2280
},
{
"epoch": 71.576,
"grad_norm": 28.846141815185547,
"learning_rate": 2.9772586203393617e-06,
"loss": 1.9996,
"step": 2290
},
{
"epoch": 71.896,
"grad_norm": 29.31644630432129,
"learning_rate": 2.939673550031339e-06,
"loss": 2.1061,
"step": 2300
},
{
"epoch": 72.192,
"grad_norm": 24.74772071838379,
"learning_rate": 2.902350749324414e-06,
"loss": 1.8613,
"step": 2310
},
{
"epoch": 72.512,
"grad_norm": 26.519685745239258,
"learning_rate": 2.8652940414170094e-06,
"loss": 1.9042,
"step": 2320
},
{
"epoch": 72.832,
"grad_norm": 27.277027130126953,
"learning_rate": 2.828507222250073e-06,
"loss": 2.1886,
"step": 2330
},
{
"epoch": 73.128,
"grad_norm": 25.948633193969727,
"learning_rate": 2.795632954945905e-06,
"loss": 1.8885,
"step": 2340
},
{
"epoch": 73.448,
"grad_norm": 27.202346801757812,
"learning_rate": 2.7593692828767417e-06,
"loss": 2.0617,
"step": 2350
},
{
"epoch": 73.768,
"grad_norm": 25.095088958740234,
"learning_rate": 2.723386350056646e-06,
"loss": 2.0202,
"step": 2360
},
{
"epoch": 74.064,
"grad_norm": 27.66188621520996,
"learning_rate": 2.6912447896126063e-06,
"loss": 1.9708,
"step": 2370
},
{
"epoch": 74.384,
"grad_norm": 27.14434814453125,
"learning_rate": 2.655805392079069e-06,
"loss": 1.9453,
"step": 2380
},
{
"epoch": 74.704,
"grad_norm": 27.399051666259766,
"learning_rate": 2.6206573424653525e-06,
"loss": 1.85,
"step": 2390
},
{
"epoch": 75.0,
"grad_norm": 27.855464935302734,
"learning_rate": 2.5892761766588475e-06,
"loss": 1.9639,
"step": 2400
},
{
"epoch": 75.32,
"grad_norm": 22.55164909362793,
"learning_rate": 2.554691582301635e-06,
"loss": 1.8897,
"step": 2410
},
{
"epoch": 75.64,
"grad_norm": 28.89771842956543,
"learning_rate": 2.5204086935584972e-06,
"loss": 2.0627,
"step": 2420
},
{
"epoch": 75.96,
"grad_norm": 27.81890106201172,
"learning_rate": 2.486431022231434e-06,
"loss": 1.9662,
"step": 2430
},
{
"epoch": 76.256,
"grad_norm": 23.57813835144043,
"learning_rate": 2.4527620488572007e-06,
"loss": 1.8852,
"step": 2440
},
{
"epoch": 76.576,
"grad_norm": 28.515785217285156,
"learning_rate": 2.4194052223507637e-06,
"loss": 1.9537,
"step": 2450
},
{
"epoch": 76.896,
"grad_norm": 21.694969177246094,
"learning_rate": 2.3863639596520146e-06,
"loss": 1.9615,
"step": 2460
},
{
"epoch": 77.192,
"grad_norm": 22.404096603393555,
"learning_rate": 2.3568994283363583e-06,
"loss": 1.7868,
"step": 2470
},
{
"epoch": 77.512,
"grad_norm": 22.37274932861328,
"learning_rate": 2.324467034489848e-06,
"loss": 1.92,
"step": 2480
},
{
"epoch": 77.832,
"grad_norm": 22.426904678344727,
"learning_rate": 2.29235992953927e-06,
"loss": 1.9547,
"step": 2490
},
{
"epoch": 78.128,
"grad_norm": 27.522668838500977,
"learning_rate": 2.2637443760789118e-06,
"loss": 1.8528,
"step": 2500
},
{
"epoch": 78.448,
"grad_norm": 28.163259506225586,
"learning_rate": 2.232264353203013e-06,
"loss": 1.9261,
"step": 2510
},
{
"epoch": 78.768,
"grad_norm": 25.449567794799805,
"learning_rate": 2.2011190640969375e-06,
"loss": 1.9074,
"step": 2520
},
{
"epoch": 79.064,
"grad_norm": 23.236602783203125,
"learning_rate": 2.173377138853083e-06,
"loss": 1.821,
"step": 2530
},
{
"epoch": 79.384,
"grad_norm": 25.74871826171875,
"learning_rate": 2.142876604855393e-06,
"loss": 1.9702,
"step": 2540
},
{
"epoch": 79.704,
"grad_norm": 27.608022689819336,
"learning_rate": 2.1127199611509476e-06,
"loss": 1.8918,
"step": 2550
},
{
"epoch": 80.0,
"grad_norm": 26.690242767333984,
"learning_rate": 2.0829102968657067e-06,
"loss": 1.7362,
"step": 2560
},
{
"epoch": 80.32,
"grad_norm": 25.989137649536133,
"learning_rate": 2.0534506655824563e-06,
"loss": 1.8662,
"step": 2570
},
{
"epoch": 80.64,
"grad_norm": 26.44927215576172,
"learning_rate": 2.024344085027995e-06,
"loss": 1.8962,
"step": 2580
},
{
"epoch": 80.96,
"grad_norm": 26.649433135986328,
"learning_rate": 1.9955935367640172e-06,
"loss": 1.9202,
"step": 2590
},
{
"epoch": 81.256,
"grad_norm": 25.331037521362305,
"learning_rate": 1.9700248858111003e-06,
"loss": 1.781,
"step": 2600
},
{
"epoch": 81.576,
"grad_norm": 27.306407928466797,
"learning_rate": 1.941958882239165e-06,
"loss": 1.9203,
"step": 2610
},
{
"epoch": 81.896,
"grad_norm": 22.897323608398438,
"learning_rate": 1.914257350168377e-06,
"loss": 1.8839,
"step": 2620
},
{
"epoch": 82.192,
"grad_norm": 23.076894760131836,
"learning_rate": 1.8896399405188986e-06,
"loss": 1.6662,
"step": 2630
},
{
"epoch": 82.512,
"grad_norm": 23.84672737121582,
"learning_rate": 1.8626386908906847e-06,
"loss": 1.8701,
"step": 2640
},
{
"epoch": 82.832,
"grad_norm": 25.28089714050293,
"learning_rate": 1.8360100380067636e-06,
"loss": 1.9854,
"step": 2650
},
{
"epoch": 83.128,
"grad_norm": 24.182842254638672,
"learning_rate": 1.8123650758918262e-06,
"loss": 1.6976,
"step": 2660
},
{
"epoch": 83.448,
"grad_norm": 26.943639755249023,
"learning_rate": 1.7864518399551195e-06,
"loss": 2.0291,
"step": 2670
},
{
"epoch": 83.768,
"grad_norm": 22.0482234954834,
"learning_rate": 1.7609190050389946e-06,
"loss": 1.8355,
"step": 2680
},
{
"epoch": 84.064,
"grad_norm": 28.498458862304688,
"learning_rate": 1.7357691866249377e-06,
"loss": 1.7042,
"step": 2690
},
{
"epoch": 84.384,
"grad_norm": 23.913034439086914,
"learning_rate": 1.7110049609597555e-06,
"loss": 1.8195,
"step": 2700
},
{
"epoch": 84.704,
"grad_norm": 24.15852928161621,
"learning_rate": 1.6866288647916768e-06,
"loss": 1.8004,
"step": 2710
},
{
"epoch": 85.0,
"grad_norm": 26.641592025756836,
"learning_rate": 1.665024293548139e-06,
"loss": 1.7754,
"step": 2720
},
{
"epoch": 85.32,
"grad_norm": 25.73943328857422,
"learning_rate": 1.641392489564914e-06,
"loss": 1.863,
"step": 2730
},
{
"epoch": 85.64,
"grad_norm": 21.554733276367188,
"learning_rate": 1.6181559459019353e-06,
"loss": 1.8378,
"step": 2740
},
{
"epoch": 85.96,
"grad_norm": 23.64627456665039,
"learning_rate": 1.5953170428177997e-06,
"loss": 1.8041,
"step": 2750
},
{
"epoch": 86.256,
"grad_norm": 27.766212463378906,
"learning_rate": 1.5751039472006447e-06,
"loss": 1.7524,
"step": 2760
},
{
"epoch": 86.576,
"grad_norm": 19.842952728271484,
"learning_rate": 1.5530269727513447e-06,
"loss": 1.8589,
"step": 2770
},
{
"epoch": 86.896,
"grad_norm": 23.708539962768555,
"learning_rate": 1.531354310432403e-06,
"loss": 1.7687,
"step": 2780
},
{
"epoch": 87.192,
"grad_norm": 27.59642791748047,
"learning_rate": 1.5121964369565362e-06,
"loss": 1.7757,
"step": 2790
},
{
"epoch": 87.512,
"grad_norm": 23.417882919311523,
"learning_rate": 1.4912980495391515e-06,
"loss": 1.8026,
"step": 2800
},
{
"epoch": 87.832,
"grad_norm": 23.49812889099121,
"learning_rate": 1.4708102975161232e-06,
"loss": 1.7793,
"step": 2810
},
{
"epoch": 88.128,
"grad_norm": 20.413761138916016,
"learning_rate": 1.450735279570825e-06,
"loss": 1.644,
"step": 2820
},
{
"epoch": 88.448,
"grad_norm": 20.448760986328125,
"learning_rate": 1.431075052107801e-06,
"loss": 1.7853,
"step": 2830
},
{
"epoch": 88.768,
"grad_norm": 24.541154861450195,
"learning_rate": 1.411831629042118e-06,
"loss": 1.85,
"step": 2840
},
{
"epoch": 89.064,
"grad_norm": 25.394201278686523,
"learning_rate": 1.3948705461409444e-06,
"loss": 1.6874,
"step": 2850
},
{
"epoch": 89.384,
"grad_norm": 23.580543518066406,
"learning_rate": 1.3764244466885141e-06,
"loss": 1.8601,
"step": 2860
},
{
"epoch": 89.704,
"grad_norm": 23.068803787231445,
"learning_rate": 1.3584007498228602e-06,
"loss": 1.813,
"step": 2870
},
{
"epoch": 90.0,
"grad_norm": 24.027877807617188,
"learning_rate": 1.3425421036992098e-06,
"loss": 1.7416,
"step": 2880
},
{
"epoch": 90.32,
"grad_norm": 27.069442749023438,
"learning_rate": 1.3253260223227138e-06,
"loss": 1.715,
"step": 2890
},
{
"epoch": 90.64,
"grad_norm": 22.877151489257812,
"learning_rate": 1.30853757785121e-06,
"loss": 1.8249,
"step": 2900
},
{
"epoch": 90.96,
"grad_norm": 24.492151260375977,
"learning_rate": 1.292178490025803e-06,
"loss": 1.7188,
"step": 2910
},
{
"epoch": 91.256,
"grad_norm": 20.234020233154297,
"learning_rate": 1.2778237968421458e-06,
"loss": 1.5356,
"step": 2920
},
{
"epoch": 91.576,
"grad_norm": 25.770517349243164,
"learning_rate": 1.2622850668726402e-06,
"loss": 1.6071,
"step": 2930
},
{
"epoch": 91.896,
"grad_norm": 28.51846694946289,
"learning_rate": 1.2471804314714615e-06,
"loss": 1.7475,
"step": 2940
},
{
"epoch": 92.192,
"grad_norm": 22.339611053466797,
"learning_rate": 1.232511437897046e-06,
"loss": 1.6772,
"step": 2950
},
{
"epoch": 92.512,
"grad_norm": 25.455184936523438,
"learning_rate": 1.2182795887824252e-06,
"loss": 1.9423,
"step": 2960
},
{
"epoch": 92.832,
"grad_norm": 22.2073974609375,
"learning_rate": 1.2044863419812993e-06,
"loss": 1.6468,
"step": 2970
},
{
"epoch": 93.128,
"grad_norm": 23.767728805541992,
"learning_rate": 1.1924485935515493e-06,
"loss": 1.6161,
"step": 2980
},
{
"epoch": 93.448,
"grad_norm": 22.131113052368164,
"learning_rate": 1.179492546504283e-06,
"loss": 1.7268,
"step": 2990
},
{
"epoch": 93.768,
"grad_norm": 20.83936882019043,
"learning_rate": 1.1669790749599883e-06,
"loss": 1.6866,
"step": 3000
},
{
"epoch": 94.064,
"grad_norm": 21.060152053833008,
"learning_rate": 1.1560964129965143e-06,
"loss": 1.6455,
"step": 3010
},
{
"epoch": 94.384,
"grad_norm": 26.631736755371094,
"learning_rate": 1.144427328773634e-06,
"loss": 1.7459,
"step": 3020
},
{
"epoch": 94.704,
"grad_norm": 22.8407039642334,
"learning_rate": 1.1332044119943799e-06,
"loss": 1.7396,
"step": 3030
},
{
"epoch": 95.0,
"grad_norm": 24.34178352355957,
"learning_rate": 1.1234862111733938e-06,
"loss": 1.5881,
"step": 3040
},
{
"epoch": 95.32,
"grad_norm": 23.25417709350586,
"learning_rate": 1.1131141419006206e-06,
"loss": 1.6469,
"step": 3050
},
{
"epoch": 95.64,
"grad_norm": 21.618032455444336,
"learning_rate": 1.1031914476690958e-06,
"loss": 1.7556,
"step": 3060
},
{
"epoch": 95.96,
"grad_norm": 25.758623123168945,
"learning_rate": 1.093719144919928e-06,
"loss": 1.745,
"step": 3070
},
{
"epoch": 96.256,
"grad_norm": 24.91753578186035,
"learning_rate": 1.0846982039579245e-06,
"loss": 1.6264,
"step": 3080
},
{
"epoch": 96.576,
"grad_norm": 23.212156295776367,
"learning_rate": 1.0761295488521977e-06,
"loss": 1.7324,
"step": 3090
},
{
"epoch": 96.896,
"grad_norm": 24.846576690673828,
"learning_rate": 1.0680140573415042e-06,
"loss": 1.7467,
"step": 3100
},
{
"epoch": 97.192,
"grad_norm": 24.324583053588867,
"learning_rate": 1.0610982579009123e-06,
"loss": 1.5182,
"step": 3110
},
{
"epoch": 97.512,
"grad_norm": 26.072097778320312,
"learning_rate": 1.0538460290681764e-06,
"loss": 1.7966,
"step": 3120
},
{
"epoch": 97.832,
"grad_norm": 21.74355125427246,
"learning_rate": 1.0470492464651147e-06,
"loss": 1.6975,
"step": 3130
},
{
"epoch": 98.128,
"grad_norm": 24.43443489074707,
"learning_rate": 1.0413221250522967e-06,
"loss": 1.6646,
"step": 3140
},
{
"epoch": 98.448,
"grad_norm": 21.463241577148438,
"learning_rate": 1.0353925697965052e-06,
"loss": 1.7691,
"step": 3150
},
{
"epoch": 98.768,
"grad_norm": 21.25942039489746,
"learning_rate": 1.0299203510689471e-06,
"loss": 1.7827,
"step": 3160
},
{
"epoch": 99.064,
"grad_norm": 23.53223419189453,
"learning_rate": 1.0253868412111096e-06,
"loss": 1.6516,
"step": 3170
},
{
"epoch": 99.384,
"grad_norm": 21.707815170288086,
"learning_rate": 1.020785067442512e-06,
"loss": 1.6417,
"step": 3180
},
{
"epoch": 99.704,
"grad_norm": 23.734600067138672,
"learning_rate": 1.0166421265362642e-06,
"loss": 1.7752,
"step": 3190
},
{
"epoch": 100.0,
"grad_norm": 22.472688674926758,
"learning_rate": 1.0129584428786632e-06,
"loss": 1.6273,
"step": 3200
}
],
"logging_steps": 10,
"max_steps": 3200,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}