{
"best_metric": 0.015329813584685326,
"best_model_checkpoint": "./results_high/checkpoint-3200",
"epoch": 4.954668733049206,
"eval_steps": 400,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030995738086013174,
"grad_norm": 18.685022354125977,
"learning_rate": 0.0002,
"loss": 28.2054,
"step": 20
},
{
"epoch": 0.06199147617202635,
"grad_norm": 3.494961738586426,
"learning_rate": 0.0004,
"loss": 20.2888,
"step": 40
},
{
"epoch": 0.09298721425803952,
"grad_norm": 4.7338151931762695,
"learning_rate": 0.0006,
"loss": 18.4692,
"step": 60
},
{
"epoch": 0.1239829523440527,
"grad_norm": 4.808109283447266,
"learning_rate": 0.0008,
"loss": 17.1405,
"step": 80
},
{
"epoch": 0.15497869043006587,
"grad_norm": 5.874823570251465,
"learning_rate": 0.001,
"loss": 15.9045,
"step": 100
},
{
"epoch": 0.18597442851607904,
"grad_norm": 5.856851577758789,
"learning_rate": 0.0009998989386555814,
"loss": 14.6319,
"step": 120
},
{
"epoch": 0.2169701666020922,
"grad_norm": 5.156916618347168,
"learning_rate": 0.0009995957954759072,
"loss": 14.0393,
"step": 140
},
{
"epoch": 0.2479659046881054,
"grad_norm": 5.812623023986816,
"learning_rate": 0.0009990906930052064,
"loss": 12.2195,
"step": 160
},
{
"epoch": 0.27896164277411856,
"grad_norm": 4.946173667907715,
"learning_rate": 0.000998383835428818,
"loss": 10.9657,
"step": 180
},
{
"epoch": 0.30995738086013175,
"grad_norm": 6.92509651184082,
"learning_rate": 0.0009974755084906502,
"loss": 9.8633,
"step": 200
},
{
"epoch": 0.3409531189461449,
"grad_norm": 5.058427333831787,
"learning_rate": 0.0009963660793776688,
"loss": 8.7671,
"step": 220
},
{
"epoch": 0.3719488570321581,
"grad_norm": 5.186002731323242,
"learning_rate": 0.0009950559965714648,
"loss": 7.8407,
"step": 240
},
{
"epoch": 0.40294459511817127,
"grad_norm": 5.172844886779785,
"learning_rate": 0.0009935457896669568,
"loss": 6.9268,
"step": 260
},
{
"epoch": 0.4339403332041844,
"grad_norm": 5.07811164855957,
"learning_rate": 0.0009918360691583054,
"loss": 6.2128,
"step": 280
},
{
"epoch": 0.4649360712901976,
"grad_norm": 5.332867622375488,
"learning_rate": 0.0009899275261921235,
"loss": 5.8147,
"step": 300
},
{
"epoch": 0.4959318093762108,
"grad_norm": 5.0256171226501465,
"learning_rate": 0.000987820932288083,
"loss": 5.2,
"step": 320
},
{
"epoch": 0.5269275474622239,
"grad_norm": 4.997400760650635,
"learning_rate": 0.0009855171390270323,
"loss": 4.8953,
"step": 340
},
{
"epoch": 0.5579232855482371,
"grad_norm": 5.24421501159668,
"learning_rate": 0.0009830170777067485,
"loss": 4.34,
"step": 360
},
{
"epoch": 0.5889190236342503,
"grad_norm": 5.273136615753174,
"learning_rate": 0.000980321758965464,
"loss": 4.2298,
"step": 380
},
{
"epoch": 0.6199147617202635,
"grad_norm": 5.154964923858643,
"learning_rate": 0.0009774322723733215,
"loss": 3.8631,
"step": 400
},
{
"epoch": 0.6199147617202635,
"eval_loss": 0.21621239185333252,
"eval_runtime": 1322.5637,
"eval_samples_per_second": 6.939,
"eval_steps_per_second": 0.868,
"step": 400
},
{
"epoch": 0.6509104998062767,
"grad_norm": 4.869210720062256,
"learning_rate": 0.0009743497859919196,
"loss": 3.7908,
"step": 420
},
{
"epoch": 0.6819062378922898,
"grad_norm": 4.612276077270508,
"learning_rate": 0.0009710755459021297,
"loss": 3.4467,
"step": 440
},
{
"epoch": 0.712901975978303,
"grad_norm": 4.885288238525391,
"learning_rate": 0.0009676108757003736,
"loss": 3.3109,
"step": 460
},
{
"epoch": 0.7438977140643162,
"grad_norm": 4.861220359802246,
"learning_rate": 0.0009639571759635653,
"loss": 3.175,
"step": 480
},
{
"epoch": 0.7748934521503293,
"grad_norm": 4.333089351654053,
"learning_rate": 0.0009601159236829353,
"loss": 3.0602,
"step": 500
},
{
"epoch": 0.8058891902363425,
"grad_norm": 4.754833221435547,
"learning_rate": 0.0009560886716669619,
"loss": 2.9202,
"step": 520
},
{
"epoch": 0.8368849283223557,
"grad_norm": 4.719883918762207,
"learning_rate": 0.0009518770479136578,
"loss": 2.7844,
"step": 540
},
{
"epoch": 0.8678806664083688,
"grad_norm": 4.607946872711182,
"learning_rate": 0.0009474827549524574,
"loss": 2.649,
"step": 560
},
{
"epoch": 0.898876404494382,
"grad_norm": 4.616612434387207,
"learning_rate": 0.0009429075691559787,
"loss": 2.5995,
"step": 580
},
{
"epoch": 0.9298721425803952,
"grad_norm": 4.475257396697998,
"learning_rate": 0.0009381533400219318,
"loss": 2.3863,
"step": 600
},
{
"epoch": 0.9608678806664084,
"grad_norm": 4.479520797729492,
"learning_rate": 0.0009332219894254686,
"loss": 2.3413,
"step": 620
},
{
"epoch": 0.9918636187524216,
"grad_norm": 4.254257678985596,
"learning_rate": 0.0009281155108422732,
"loss": 2.1925,
"step": 640
},
{
"epoch": 1.0216970166602093,
"grad_norm": 4.349820613861084,
"learning_rate": 0.0009228359685427095,
"loss": 2.0963,
"step": 660
},
{
"epoch": 1.0526927547462224,
"grad_norm": 4.13181209564209,
"learning_rate": 0.0009173854967573479,
"loss": 2.0195,
"step": 680
},
{
"epoch": 1.0836884928322355,
"grad_norm": 3.947432279586792,
"learning_rate": 0.0009117662988142137,
"loss": 1.9265,
"step": 700
},
{
"epoch": 1.1146842309182488,
"grad_norm": 4.134565353393555,
"learning_rate": 0.0009059806462481021,
"loss": 1.9981,
"step": 720
},
{
"epoch": 1.1456799690042618,
"grad_norm": 4.007950305938721,
"learning_rate": 0.0009000308778823195,
"loss": 1.8992,
"step": 740
},
{
"epoch": 1.1766757070902751,
"grad_norm": 3.916898012161255,
"learning_rate": 0.000893919398883226,
"loss": 1.8171,
"step": 760
},
{
"epoch": 1.2076714451762882,
"grad_norm": 4.240416049957275,
"learning_rate": 0.0008876486797879579,
"loss": 1.7744,
"step": 780
},
{
"epoch": 1.2386671832623015,
"grad_norm": 3.7714171409606934,
"learning_rate": 0.000881221255505724,
"loss": 1.7344,
"step": 800
},
{
"epoch": 1.2386671832623015,
"eval_loss": 0.08633554726839066,
"eval_runtime": 1321.2599,
"eval_samples_per_second": 6.946,
"eval_steps_per_second": 0.869,
"step": 800
},
{
"epoch": 1.2696629213483146,
"grad_norm": 3.9192988872528076,
"learning_rate": 0.0008746397242930808,
"loss": 1.7184,
"step": 820
},
{
"epoch": 1.300658659434328,
"grad_norm": 3.8376963138580322,
"learning_rate": 0.0008679067467035988,
"loss": 1.7126,
"step": 840
},
{
"epoch": 1.331654397520341,
"grad_norm": 4.027160167694092,
"learning_rate": 0.0008610250445123471,
"loss": 1.6277,
"step": 860
},
{
"epoch": 1.362650135606354,
"grad_norm": 3.6905505657196045,
"learning_rate": 0.0008539973996156264,
"loss": 1.5709,
"step": 880
},
{
"epoch": 1.3936458736923674,
"grad_norm": 3.442706346511841,
"learning_rate": 0.0008468266529064025,
"loss": 1.5135,
"step": 900
},
{
"epoch": 1.4246416117783804,
"grad_norm": 3.874969005584717,
"learning_rate": 0.000839515703125887,
"loss": 1.4924,
"step": 920
},
{
"epoch": 1.4556373498643937,
"grad_norm": 4.037222385406494,
"learning_rate": 0.0008320675056917352,
"loss": 1.4229,
"step": 940
},
{
"epoch": 1.4866330879504068,
"grad_norm": 3.6256048679351807,
"learning_rate": 0.0008244850715033315,
"loss": 1.4245,
"step": 960
},
{
"epoch": 1.51762882603642,
"grad_norm": 3.6055657863616943,
"learning_rate": 0.0008167714657246485,
"loss": 1.3645,
"step": 980
},
{
"epoch": 1.5486245641224332,
"grad_norm": 3.3049917221069336,
"learning_rate": 0.0008089298065451672,
"loss": 1.3765,
"step": 1000
},
{
"epoch": 1.5796203022084463,
"grad_norm": 3.7329185009002686,
"learning_rate": 0.0008009632639193642,
"loss": 1.3048,
"step": 1020
},
{
"epoch": 1.6106160402944596,
"grad_norm": 3.6136186122894287,
"learning_rate": 0.0007928750582852722,
"loss": 1.2793,
"step": 1040
},
{
"epoch": 1.6416117783804727,
"grad_norm": 3.500743865966797,
"learning_rate": 0.0007846684592626323,
"loss": 1.2545,
"step": 1060
},
{
"epoch": 1.6726075164664858,
"grad_norm": 3.393615961074829,
"learning_rate": 0.0007763467843311658,
"loss": 1.2531,
"step": 1080
},
{
"epoch": 1.703603254552499,
"grad_norm": 3.132471799850464,
"learning_rate": 0.0007679133974894983,
"loss": 1.192,
"step": 1100
},
{
"epoch": 1.7345989926385124,
"grad_norm": 3.4565207958221436,
"learning_rate": 0.0007593717078952787,
"loss": 1.1406,
"step": 1120
},
{
"epoch": 1.7655947307245254,
"grad_norm": 3.3600223064422607,
"learning_rate": 0.0007507251684870432,
"loss": 1.1335,
"step": 1140
},
{
"epoch": 1.7965904688105385,
"grad_norm": 3.5565154552459717,
"learning_rate": 0.0007419772745883799,
"loss": 1.142,
"step": 1160
},
{
"epoch": 1.8275862068965516,
"grad_norm": 3.155515193939209,
"learning_rate": 0.0007331315624949624,
"loss": 1.0728,
"step": 1180
},
{
"epoch": 1.858581944982565,
"grad_norm": 3.3215274810791016,
"learning_rate": 0.0007241916080450163,
"loss": 1.0707,
"step": 1200
},
{
"epoch": 1.858581944982565,
"eval_loss": 0.048645660281181335,
"eval_runtime": 1327.8,
"eval_samples_per_second": 6.911,
"eval_steps_per_second": 0.865,
"step": 1200
},
{
"epoch": 1.8895776830685782,
"grad_norm": 3.2068769931793213,
"learning_rate": 0.0007151610251738044,
"loss": 1.0147,
"step": 1220
},
{
"epoch": 1.9205734211545913,
"grad_norm": 2.9419779777526855,
"learning_rate": 0.0007060434644527105,
"loss": 1.0187,
"step": 1240
},
{
"epoch": 1.9515691592406044,
"grad_norm": 3.336106777191162,
"learning_rate": 0.0006968426116135118,
"loss": 1.0282,
"step": 1260
},
{
"epoch": 1.9825648973266174,
"grad_norm": 3.152125597000122,
"learning_rate": 0.0006875621860584389,
"loss": 0.9777,
"step": 1280
},
{
"epoch": 2.012398295234405,
"grad_norm": 2.975085735321045,
"learning_rate": 0.0006782059393566253,
"loss": 0.9048,
"step": 1300
},
{
"epoch": 2.0433940333204186,
"grad_norm": 3.0185718536376953,
"learning_rate": 0.000668777653727553,
"loss": 0.9297,
"step": 1320
},
{
"epoch": 2.0743897714064317,
"grad_norm": 2.8180739879608154,
"learning_rate": 0.0006592811405121065,
"loss": 0.8795,
"step": 1340
},
{
"epoch": 2.1053855094924447,
"grad_norm": 2.9423587322235107,
"learning_rate": 0.0006497202386318572,
"loss": 0.8589,
"step": 1360
},
{
"epoch": 2.136381247578458,
"grad_norm": 2.615948438644409,
"learning_rate": 0.0006400988130371969,
"loss": 0.8508,
"step": 1380
},
{
"epoch": 2.167376985664471,
"grad_norm": 2.6553754806518555,
"learning_rate": 0.0006304207531449486,
"loss": 0.8377,
"step": 1400
},
{
"epoch": 2.1983727237504844,
"grad_norm": 2.841156482696533,
"learning_rate": 0.0006206899712660886,
"loss": 0.813,
"step": 1420
},
{
"epoch": 2.2293684618364975,
"grad_norm": 2.705960988998413,
"learning_rate": 0.0006109104010242127,
"loss": 0.7873,
"step": 1440
},
{
"epoch": 2.2603641999225106,
"grad_norm": 2.7412073612213135,
"learning_rate": 0.0006010859957653868,
"loss": 0.7773,
"step": 1460
},
{
"epoch": 2.2913599380085237,
"grad_norm": 2.5611062049865723,
"learning_rate": 0.0005912207269600251,
"loss": 0.7602,
"step": 1480
},
{
"epoch": 2.322355676094537,
"grad_norm": 2.5611677169799805,
"learning_rate": 0.0005813185825974419,
"loss": 0.7453,
"step": 1500
},
{
"epoch": 2.3533514141805503,
"grad_norm": 2.4461872577667236,
"learning_rate": 0.0005713835655737244,
"loss": 0.7208,
"step": 1520
},
{
"epoch": 2.3843471522665634,
"grad_norm": 2.6245830059051514,
"learning_rate": 0.0005614196920735821,
"loss": 0.7292,
"step": 1540
},
{
"epoch": 2.4153428903525764,
"grad_norm": 2.444567918777466,
"learning_rate": 0.0005514309899468208,
"loss": 0.6917,
"step": 1560
},
{
"epoch": 2.4463386284385895,
"grad_norm": 2.399683952331543,
"learning_rate": 0.0005414214970801041,
"loss": 0.6623,
"step": 1580
},
{
"epoch": 2.477334366524603,
"grad_norm": 2.5869252681732178,
"learning_rate": 0.0005313952597646568,
"loss": 0.6501,
"step": 1600
},
{
"epoch": 2.477334366524603,
"eval_loss": 0.030375245958566666,
"eval_runtime": 1322.7101,
"eval_samples_per_second": 6.938,
"eval_steps_per_second": 0.868,
"step": 1600
},
{
"epoch": 2.508330104610616,
"grad_norm": 2.286961078643799,
"learning_rate": 0.0005213563310605686,
"loss": 0.6232,
"step": 1620
},
{
"epoch": 2.539325842696629,
"grad_norm": 2.2856411933898926,
"learning_rate": 0.0005113087691583649,
"loss": 0.6375,
"step": 1640
},
{
"epoch": 2.5703215807826423,
"grad_norm": 2.464272975921631,
"learning_rate": 0.000501256635738502,
"loss": 0.6252,
"step": 1660
},
{
"epoch": 2.601317318868656,
"grad_norm": 2.1272389888763428,
"learning_rate": 0.0004912039943294501,
"loss": 0.5947,
"step": 1680
},
{
"epoch": 2.632313056954669,
"grad_norm": 2.389514446258545,
"learning_rate": 0.0004811549086650327,
"loss": 0.5886,
"step": 1700
},
{
"epoch": 2.663308795040682,
"grad_norm": 2.169243097305298,
"learning_rate": 0.0004711134410416794,
"loss": 0.577,
"step": 1720
},
{
"epoch": 2.694304533126695,
"grad_norm": 2.2765533924102783,
"learning_rate": 0.0004610836506762617,
"loss": 0.5496,
"step": 1740
},
{
"epoch": 2.725300271212708,
"grad_norm": 2.2728376388549805,
"learning_rate": 0.00045106959206517425,
"loss": 0.536,
"step": 1760
},
{
"epoch": 2.7562960092987216,
"grad_norm": 2.093844413757324,
"learning_rate": 0.0004410753133453222,
"loss": 0.5255,
"step": 1780
},
{
"epoch": 2.7872917473847347,
"grad_norm": 2.17061185836792,
"learning_rate": 0.000431104854657681,
"loss": 0.5153,
"step": 1800
},
{
"epoch": 2.818287485470748,
"grad_norm": 2.016740560531616,
"learning_rate": 0.0004211622465140887,
"loss": 0.5014,
"step": 1820
},
{
"epoch": 2.849283223556761,
"grad_norm": 1.8642460107803345,
"learning_rate": 0.00041125150816792946,
"loss": 0.5038,
"step": 1840
},
{
"epoch": 2.880278961642774,
"grad_norm": 2.1162288188934326,
"learning_rate": 0.00040137664598936855,
"loss": 0.4808,
"step": 1860
},
{
"epoch": 2.9112746997287875,
"grad_norm": 2.0367867946624756,
"learning_rate": 0.00039154165184579736,
"loss": 0.4777,
"step": 1880
},
{
"epoch": 2.9422704378148006,
"grad_norm": 2.0988779067993164,
"learning_rate": 0.0003817505014881378,
"loss": 0.4737,
"step": 1900
},
{
"epoch": 2.9732661759008137,
"grad_norm": 1.8902864456176758,
"learning_rate": 0.0003720071529436637,
"loss": 0.4467,
"step": 1920
},
{
"epoch": 3.0030995738086013,
"grad_norm": 1.977570652961731,
"learning_rate": 0.0003623155449159876,
"loss": 0.4367,
"step": 1940
},
{
"epoch": 3.0340953118946143,
"grad_norm": 1.571098804473877,
"learning_rate": 0.00035267959519285686,
"loss": 0.4162,
"step": 1960
},
{
"epoch": 3.065091049980628,
"grad_norm": 1.6555993556976318,
"learning_rate": 0.0003431031990624063,
"loss": 0.4047,
"step": 1980
},
{
"epoch": 3.096086788066641,
"grad_norm": 1.785839557647705,
"learning_rate": 0.00033359022773850675,
"loss": 0.4096,
"step": 2000
},
{
"epoch": 3.096086788066641,
"eval_loss": 0.020494887605309486,
"eval_runtime": 1323.6591,
"eval_samples_per_second": 6.933,
"eval_steps_per_second": 0.867,
"step": 2000
},
{
"epoch": 3.127082526152654,
"grad_norm": 1.5646486282348633,
"learning_rate": 0.00032414452679584377,
"loss": 0.4,
"step": 2020
},
{
"epoch": 3.158078264238667,
"grad_norm": 1.4028712511062622,
"learning_rate": 0.0003147699146153621,
"loss": 0.378,
"step": 2040
},
{
"epoch": 3.18907400232468,
"grad_norm": 1.4465529918670654,
"learning_rate": 0.00030547018084070343,
"loss": 0.3819,
"step": 2060
},
{
"epoch": 3.2200697404106937,
"grad_norm": 1.5283536911010742,
"learning_rate": 0.0002962490848462596,
"loss": 0.38,
"step": 2080
},
{
"epoch": 3.251065478496707,
"grad_norm": 1.420145034790039,
"learning_rate": 0.00028711035421746366,
"loss": 0.367,
"step": 2100
},
{
"epoch": 3.28206121658272,
"grad_norm": 1.378266453742981,
"learning_rate": 0.00027805768324393014,
"loss": 0.3509,
"step": 2120
},
{
"epoch": 3.313056954668733,
"grad_norm": 1.202911138534546,
"learning_rate": 0.00026909473142605524,
"loss": 0.3552,
"step": 2140
},
{
"epoch": 3.344052692754746,
"grad_norm": 1.2163655757904053,
"learning_rate": 0.00026022512199568205,
"loss": 0.3461,
"step": 2160
},
{
"epoch": 3.3750484308407596,
"grad_norm": 1.2958979606628418,
"learning_rate": 0.0002514524404514248,
"loss": 0.3452,
"step": 2180
},
{
"epoch": 3.4060441689267726,
"grad_norm": 1.3039333820343018,
"learning_rate": 0.00024278023310924675,
"loss": 0.3358,
"step": 2200
},
{
"epoch": 3.4370399070127857,
"grad_norm": 1.1213936805725098,
"learning_rate": 0.00023421200566888095,
"loss": 0.3309,
"step": 2220
},
{
"epoch": 3.468035645098799,
"grad_norm": 1.1700717210769653,
"learning_rate": 0.00022575122179666497,
"loss": 0.3186,
"step": 2240
},
{
"epoch": 3.4990313831848123,
"grad_norm": 1.2684606313705444,
"learning_rate": 0.0002174013017253701,
"loss": 0.3239,
"step": 2260
},
{
"epoch": 3.5300271212708254,
"grad_norm": 0.9373674392700195,
"learning_rate": 0.00020916562087158964,
"loss": 0.3155,
"step": 2280
},
{
"epoch": 3.5610228593568385,
"grad_norm": 1.221389889717102,
"learning_rate": 0.00020104750847124077,
"loss": 0.3133,
"step": 2300
},
{
"epoch": 3.5920185974428516,
"grad_norm": 1.0687464475631714,
"learning_rate": 0.00019305024623373618,
"loss": 0.3086,
"step": 2320
},
{
"epoch": 3.6230143355288646,
"grad_norm": 1.025343656539917,
"learning_rate": 0.00018517706701536997,
"loss": 0.306,
"step": 2340
},
{
"epoch": 3.654010073614878,
"grad_norm": 0.9350593090057373,
"learning_rate": 0.00017743115351244882,
"loss": 0.3072,
"step": 2360
},
{
"epoch": 3.6850058117008913,
"grad_norm": 1.0027642250061035,
"learning_rate": 0.00016981563697470158,
"loss": 0.2976,
"step": 2380
},
{
"epoch": 3.7160015497869043,
"grad_norm": 0.7607480883598328,
"learning_rate": 0.00016233359593948777,
"loss": 0.2864,
"step": 2400
},
{
"epoch": 3.7160015497869043,
"eval_loss": 0.016527026891708374,
"eval_runtime": 1322.2158,
"eval_samples_per_second": 6.941,
"eval_steps_per_second": 0.868,
"step": 2400
},
{
"epoch": 3.7469972878729174,
"grad_norm": 0.8243028521537781,
"learning_rate": 0.00015498805498731144,
"loss": 0.2848,
"step": 2420
},
{
"epoch": 3.7779930259589305,
"grad_norm": 0.7183871269226074,
"learning_rate": 0.00014778198351914852,
"loss": 0.2758,
"step": 2440
},
{
"epoch": 3.808988764044944,
"grad_norm": 0.71489417552948,
"learning_rate": 0.00014071829455608172,
"loss": 0.2759,
"step": 2460
},
{
"epoch": 3.839984502130957,
"grad_norm": 0.7238665223121643,
"learning_rate": 0.00013379984356172347,
"loss": 0.2687,
"step": 2480
},
{
"epoch": 3.87098024021697,
"grad_norm": 0.5687535405158997,
"learning_rate": 0.00012702942728790896,
"loss": 0.273,
"step": 2500
},
{
"epoch": 3.9019759783029833,
"grad_norm": 0.7781446576118469,
"learning_rate": 0.00012040978264412178,
"loss": 0.2694,
"step": 2520
},
{
"epoch": 3.9329717163889963,
"grad_norm": 0.6078444123268127,
"learning_rate": 0.000113943585591111,
"loss": 0.2708,
"step": 2540
},
{
"epoch": 3.96396745447501,
"grad_norm": 0.48452651500701904,
"learning_rate": 0.00010763345005914649,
"loss": 0.2612,
"step": 2560
},
{
"epoch": 3.994963192561023,
"grad_norm": 0.46242251992225647,
"learning_rate": 0.0001014819268913495,
"loss": 0.2647,
"step": 2580
},
{
"epoch": 4.02479659046881,
"grad_norm": 0.4267343282699585,
"learning_rate": 9.549150281252633e-05,
"loss": 0.2441,
"step": 2600
},
{
"epoch": 4.055792328554824,
"grad_norm": 0.40898454189300537,
"learning_rate": 8.966459942392108e-05,
"loss": 0.253,
"step": 2620
},
{
"epoch": 4.086788066640837,
"grad_norm": 0.4370998740196228,
"learning_rate": 8.400357222429472e-05,
"loss": 0.249,
"step": 2640
},
{
"epoch": 4.11778380472685,
"grad_norm": 0.39429062604904175,
"learning_rate": 7.851070965772572e-05,
"loss": 0.2449,
"step": 2660
},
{
"epoch": 4.148779542812863,
"grad_norm": 0.3252967894077301,
"learning_rate": 7.318823218851667e-05,
"loss": 0.2452,
"step": 2680
},
{
"epoch": 4.179775280898877,
"grad_norm": 0.35145244002342224,
"learning_rate": 6.803829140358236e-05,
"loss": 0.246,
"step": 2700
},
{
"epoch": 4.2107710189848895,
"grad_norm": 0.447158545255661,
"learning_rate": 6.306296914268039e-05,
"loss": 0.249,
"step": 2720
},
{
"epoch": 4.241766757070903,
"grad_norm": 0.3183532655239105,
"learning_rate": 5.8264276656837145e-05,
"loss": 0.2435,
"step": 2740
},
{
"epoch": 4.272762495156916,
"grad_norm": 0.3511893153190613,
"learning_rate": 5.36441537953089e-05,
"loss": 0.2475,
"step": 2760
},
{
"epoch": 4.303758233242929,
"grad_norm": 0.49244144558906555,
"learning_rate": 4.920446822140673e-05,
"loss": 0.2485,
"step": 2780
},
{
"epoch": 4.334753971328942,
"grad_norm": 0.30061423778533936,
"learning_rate": 4.494701465750217e-05,
"loss": 0.2515,
"step": 2800
},
{
"epoch": 4.334753971328942,
"eval_loss": 0.015458072535693645,
"eval_runtime": 1323.2432,
"eval_samples_per_second": 6.935,
"eval_steps_per_second": 0.868,
"step": 2800
},
{
"epoch": 4.365749709414955,
"grad_norm": 0.34871456027030945,
"learning_rate": 4.087351415951917e-05,
"loss": 0.2434,
"step": 2820
},
{
"epoch": 4.396745447500969,
"grad_norm": 0.39381858706474304,
"learning_rate": 3.698561342120499e-05,
"loss": 0.2476,
"step": 2840
},
{
"epoch": 4.4277411855869815,
"grad_norm": 0.38118091225624084,
"learning_rate": 3.3284884108461864e-05,
"loss": 0.245,
"step": 2860
},
{
"epoch": 4.458736923672995,
"grad_norm": 0.31697818636894226,
"learning_rate": 2.9772822224008513e-05,
"loss": 0.2432,
"step": 2880
},
{
"epoch": 4.4897326617590085,
"grad_norm": 0.30253851413726807,
"learning_rate": 2.6450847502627883e-05,
"loss": 0.2462,
"step": 2900
},
{
"epoch": 4.520728399845021,
"grad_norm": 0.2832024097442627,
"learning_rate": 2.3320302837245844e-05,
"loss": 0.2445,
"step": 2920
},
{
"epoch": 4.551724137931035,
"grad_norm": 0.259616494178772,
"learning_rate": 2.0382453736072835e-05,
"loss": 0.248,
"step": 2940
},
{
"epoch": 4.582719876017047,
"grad_norm": 0.361459881067276,
"learning_rate": 1.7638487811028614e-05,
"loss": 0.2444,
"step": 2960
},
{
"epoch": 4.613715614103061,
"grad_norm": 0.33997073769569397,
"learning_rate": 1.5089514297654594e-05,
"loss": 0.2402,
"step": 2980
},
{
"epoch": 4.644711352189074,
"grad_norm": 0.33987364172935486,
"learning_rate": 1.2736563606711382e-05,
"loss": 0.2456,
"step": 3000
},
{
"epoch": 4.675707090275087,
"grad_norm": 0.3494039475917816,
"learning_rate": 1.0580586907639911e-05,
"loss": 0.2415,
"step": 3020
},
{
"epoch": 4.7067028283611005,
"grad_norm": 0.27838659286499023,
"learning_rate": 8.622455744054958e-06,
"loss": 0.2442,
"step": 3040
},
{
"epoch": 4.737698566447113,
"grad_norm": 0.26378950476646423,
"learning_rate": 6.8629616814283035e-06,
"loss": 0.2412,
"step": 3060
},
{
"epoch": 4.768694304533127,
"grad_norm": 0.3408554494380951,
"learning_rate": 5.302815987101917e-06,
"loss": 0.2405,
"step": 3080
},
{
"epoch": 4.79969004261914,
"grad_norm": 0.3381432592868805,
"learning_rate": 3.942649342761117e-06,
"loss": 0.2406,
"step": 3100
},
{
"epoch": 4.830685780705153,
"grad_norm": 0.2882293164730072,
"learning_rate": 2.7830115894847407e-06,
"loss": 0.2448,
"step": 3120
},
{
"epoch": 4.861681518791166,
"grad_norm": 0.26501309871673584,
"learning_rate": 1.8243715054744313e-06,
"loss": 0.2427,
"step": 3140
},
{
"epoch": 4.892677256877179,
"grad_norm": 0.25310027599334717,
"learning_rate": 1.067116616552899e-06,
"loss": 0.2463,
"step": 3160
},
{
"epoch": 4.9236729949631926,
"grad_norm": 0.31968578696250916,
"learning_rate": 5.115530395087276e-07,
"loss": 0.2432,
"step": 3180
},
{
"epoch": 4.954668733049206,
"grad_norm": 0.2826690375804901,
"learning_rate": 1.5790535835003005e-07,
"loss": 0.2409,
"step": 3200
},
{
"epoch": 4.954668733049206,
"eval_loss": 0.015329813584685326,
"eval_runtime": 1323.5417,
"eval_samples_per_second": 6.934,
"eval_steps_per_second": 0.867,
"step": 3200
}
],
"logging_steps": 20,
"max_steps": 3225,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 400,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8917658856433517e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}