{
"best_metric": 1.2152043581008911,
"best_model_checkpoint": "./output/checkpoint-4650",
"epoch": 0.3073567321039064,
"eval_steps": 150,
"global_step": 4650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006609822195782933,
"grad_norm": 7.413546562194824,
"learning_rate": 2.2360679774997904e-06,
"loss": 1.2392,
"step": 10
},
{
"epoch": 0.0013219644391565867,
"grad_norm": 7.08538818359375,
"learning_rate": 4.472135954999581e-06,
"loss": 1.2951,
"step": 20
},
{
"epoch": 0.00198294665873488,
"grad_norm": 15.173999786376953,
"learning_rate": 6.70820393249937e-06,
"loss": 1.3208,
"step": 30
},
{
"epoch": 0.0026439288783131733,
"grad_norm": 7.055360317230225,
"learning_rate": 8.944271909999161e-06,
"loss": 1.2641,
"step": 40
},
{
"epoch": 0.003304911097891467,
"grad_norm": 8.638155937194824,
"learning_rate": 1.118033988749895e-05,
"loss": 1.2835,
"step": 50
},
{
"epoch": 0.00396589331746976,
"grad_norm": 7.482174396514893,
"learning_rate": 1.341640786499874e-05,
"loss": 1.1253,
"step": 60
},
{
"epoch": 0.0046268755370480535,
"grad_norm": 11.88020133972168,
"learning_rate": 1.565247584249853e-05,
"loss": 1.1857,
"step": 70
},
{
"epoch": 0.005287857756626347,
"grad_norm": 13.985732078552246,
"learning_rate": 1.7888543819998323e-05,
"loss": 1.3447,
"step": 80
},
{
"epoch": 0.00594883997620464,
"grad_norm": 7.102285861968994,
"learning_rate": 2.0124611797498112e-05,
"loss": 1.3336,
"step": 90
},
{
"epoch": 0.006609822195782934,
"grad_norm": 7.646865367889404,
"learning_rate": 2.23606797749979e-05,
"loss": 1.176,
"step": 100
},
{
"epoch": 0.007270804415361227,
"grad_norm": 6.750139236450195,
"learning_rate": 2.236044998500671e-05,
"loss": 1.2429,
"step": 110
},
{
"epoch": 0.00793178663493952,
"grad_norm": 10.000078201293945,
"learning_rate": 2.235976062447891e-05,
"loss": 1.3139,
"step": 120
},
{
"epoch": 0.008592768854517813,
"grad_norm": 12.12943172454834,
"learning_rate": 2.2358611721751407e-05,
"loss": 1.3145,
"step": 130
},
{
"epoch": 0.009253751074096107,
"grad_norm": 7.1956071853637695,
"learning_rate": 2.2357003324051093e-05,
"loss": 1.3055,
"step": 140
},
{
"epoch": 0.009914733293674401,
"grad_norm": 6.159770965576172,
"learning_rate": 2.23549354974929e-05,
"loss": 1.3298,
"step": 150
},
{
"epoch": 0.009914733293674401,
"eval_loss": 1.3606581687927246,
"eval_runtime": 45.5267,
"eval_samples_per_second": 11.005,
"eval_steps_per_second": 11.005,
"step": 150
},
{
"epoch": 0.010575715513252693,
"grad_norm": 15.24757194519043,
"learning_rate": 2.2352408327077078e-05,
"loss": 1.303,
"step": 160
},
{
"epoch": 0.011236697732830987,
"grad_norm": 10.154984474182129,
"learning_rate": 2.2349421916685704e-05,
"loss": 1.2568,
"step": 170
},
{
"epoch": 0.01189767995240928,
"grad_norm": 7.64827299118042,
"learning_rate": 2.234597638907841e-05,
"loss": 1.27,
"step": 180
},
{
"epoch": 0.012558662171987573,
"grad_norm": 10.21170711517334,
"learning_rate": 2.2342071885887346e-05,
"loss": 1.2995,
"step": 190
},
{
"epoch": 0.013219644391565867,
"grad_norm": 10.44480037689209,
"learning_rate": 2.2337708567611343e-05,
"loss": 1.3509,
"step": 200
},
{
"epoch": 0.01388062661114416,
"grad_norm": 7.435905456542969,
"learning_rate": 2.233288661360932e-05,
"loss": 1.1597,
"step": 210
},
{
"epoch": 0.014541608830722454,
"grad_norm": 16.616416931152344,
"learning_rate": 2.232760622209293e-05,
"loss": 1.2589,
"step": 220
},
{
"epoch": 0.015202591050300748,
"grad_norm": 13.498307228088379,
"learning_rate": 2.2321867610118378e-05,
"loss": 1.3307,
"step": 230
},
{
"epoch": 0.01586357326987904,
"grad_norm": 7.282419681549072,
"learning_rate": 2.231567101357753e-05,
"loss": 1.3213,
"step": 240
},
{
"epoch": 0.016524555489457332,
"grad_norm": 12.302486419677734,
"learning_rate": 2.2309016687188194e-05,
"loss": 1.3124,
"step": 250
},
{
"epoch": 0.017185537709035626,
"grad_norm": 8.877416610717773,
"learning_rate": 2.230190490448367e-05,
"loss": 1.1267,
"step": 260
},
{
"epoch": 0.01784651992861392,
"grad_norm": 10.397753715515137,
"learning_rate": 2.229433595780149e-05,
"loss": 1.3197,
"step": 270
},
{
"epoch": 0.018507502148192214,
"grad_norm": 9.187607765197754,
"learning_rate": 2.2286310158271407e-05,
"loss": 1.1703,
"step": 280
},
{
"epoch": 0.019168484367770508,
"grad_norm": 7.458565711975098,
"learning_rate": 2.22778278358026e-05,
"loss": 1.2126,
"step": 290
},
{
"epoch": 0.019829466587348802,
"grad_norm": 11.090981483459473,
"learning_rate": 2.2268889339070124e-05,
"loss": 1.1683,
"step": 300
},
{
"epoch": 0.019829466587348802,
"eval_loss": 1.3488467931747437,
"eval_runtime": 55.8106,
"eval_samples_per_second": 8.977,
"eval_steps_per_second": 8.977,
"step": 300
},
{
"epoch": 0.020490448806927093,
"grad_norm": 10.89608383178711,
"learning_rate": 2.2259495035500576e-05,
"loss": 1.4133,
"step": 310
},
{
"epoch": 0.021151431026505386,
"grad_norm": 7.514070510864258,
"learning_rate": 2.2249645311256972e-05,
"loss": 1.2241,
"step": 320
},
{
"epoch": 0.02181241324608368,
"grad_norm": 12.841883659362793,
"learning_rate": 2.2239340571222904e-05,
"loss": 1.2928,
"step": 330
},
{
"epoch": 0.022473395465661974,
"grad_norm": 13.028974533081055,
"learning_rate": 2.2228581238985868e-05,
"loss": 1.2704,
"step": 340
},
{
"epoch": 0.02313437768524027,
"grad_norm": 11.415493965148926,
"learning_rate": 2.2217367756819878e-05,
"loss": 1.2951,
"step": 350
},
{
"epoch": 0.02379535990481856,
"grad_norm": 14.492388725280762,
"learning_rate": 2.2205700585667257e-05,
"loss": 1.2643,
"step": 360
},
{
"epoch": 0.024456342124396853,
"grad_norm": 10.009002685546875,
"learning_rate": 2.2193580205119724e-05,
"loss": 1.2515,
"step": 370
},
{
"epoch": 0.025117324343975147,
"grad_norm": 8.66943073272705,
"learning_rate": 2.2181007113398642e-05,
"loss": 1.1653,
"step": 380
},
{
"epoch": 0.02577830656355344,
"grad_norm": 13.82745361328125,
"learning_rate": 2.216798182733457e-05,
"loss": 1.3251,
"step": 390
},
{
"epoch": 0.026439288783131735,
"grad_norm": 9.831866264343262,
"learning_rate": 2.2154504882346002e-05,
"loss": 1.3099,
"step": 400
},
{
"epoch": 0.02710027100271003,
"grad_norm": 6.000834941864014,
"learning_rate": 2.214057683241736e-05,
"loss": 1.2919,
"step": 410
},
{
"epoch": 0.02776125322228832,
"grad_norm": 5.438742160797119,
"learning_rate": 2.2126198250076225e-05,
"loss": 1.1859,
"step": 420
},
{
"epoch": 0.028422235441866613,
"grad_norm": 11.776556968688965,
"learning_rate": 2.2111369726369802e-05,
"loss": 1.339,
"step": 430
},
{
"epoch": 0.029083217661444907,
"grad_norm": 7.697872638702393,
"learning_rate": 2.2096091870840613e-05,
"loss": 1.2235,
"step": 440
},
{
"epoch": 0.0297441998810232,
"grad_norm": 12.47408676147461,
"learning_rate": 2.2080365311501466e-05,
"loss": 1.0851,
"step": 450
},
{
"epoch": 0.0297441998810232,
"eval_loss": 1.3441540002822876,
"eval_runtime": 45.42,
"eval_samples_per_second": 11.03,
"eval_steps_per_second": 11.03,
"step": 450
},
{
"epoch": 0.030405182100601495,
"grad_norm": 5.456786155700684,
"learning_rate": 2.206419069480962e-05,
"loss": 1.2224,
"step": 460
},
{
"epoch": 0.031066164320179786,
"grad_norm": 17.571989059448242,
"learning_rate": 2.2047568685640212e-05,
"loss": 1.355,
"step": 470
},
{
"epoch": 0.03172714653975808,
"grad_norm": 10.6810302734375,
"learning_rate": 2.203049996725894e-05,
"loss": 1.3274,
"step": 480
},
{
"epoch": 0.032388128759336374,
"grad_norm": 7.424011707305908,
"learning_rate": 2.2012985241293954e-05,
"loss": 1.1497,
"step": 490
},
{
"epoch": 0.033049110978914664,
"grad_norm": 12.73671817779541,
"learning_rate": 2.1995025227707044e-05,
"loss": 1.3728,
"step": 500
},
{
"epoch": 0.03371009319849296,
"grad_norm": 8.181777000427246,
"learning_rate": 2.1976620664764027e-05,
"loss": 1.2332,
"step": 510
},
{
"epoch": 0.03437107541807125,
"grad_norm": 13.738442420959473,
"learning_rate": 2.1957772309004394e-05,
"loss": 1.2833,
"step": 520
},
{
"epoch": 0.03503205763764955,
"grad_norm": 13.703083992004395,
"learning_rate": 2.1938480935210228e-05,
"loss": 1.4239,
"step": 530
},
{
"epoch": 0.03569303985722784,
"grad_norm": 7.870193004608154,
"learning_rate": 2.1918747336374347e-05,
"loss": 1.4103,
"step": 540
},
{
"epoch": 0.03635402207680613,
"grad_norm": 8.396446228027344,
"learning_rate": 2.189857232366771e-05,
"loss": 1.2522,
"step": 550
},
{
"epoch": 0.03701500429638443,
"grad_norm": 12.225940704345703,
"learning_rate": 2.1877956726406063e-05,
"loss": 1.3464,
"step": 560
},
{
"epoch": 0.03767598651596272,
"grad_norm": 11.3760347366333,
"learning_rate": 2.1856901392015874e-05,
"loss": 1.2843,
"step": 570
},
{
"epoch": 0.038336968735541016,
"grad_norm": 11.334436416625977,
"learning_rate": 2.183540718599946e-05,
"loss": 1.2579,
"step": 580
},
{
"epoch": 0.03899795095511931,
"grad_norm": 10.890923500061035,
"learning_rate": 2.1813474991899453e-05,
"loss": 1.1799,
"step": 590
},
{
"epoch": 0.039658933174697604,
"grad_norm": 9.872835159301758,
"learning_rate": 2.1791105711262442e-05,
"loss": 1.1629,
"step": 600
},
{
"epoch": 0.039658933174697604,
"eval_loss": 1.3372266292572021,
"eval_runtime": 56.8438,
"eval_samples_per_second": 8.814,
"eval_steps_per_second": 8.814,
"step": 600
},
{
"epoch": 0.040319915394275894,
"grad_norm": 11.447709083557129,
"learning_rate": 2.1768300263601945e-05,
"loss": 1.2011,
"step": 610
},
{
"epoch": 0.040980897613854185,
"grad_norm": 12.056636810302734,
"learning_rate": 2.174505958636059e-05,
"loss": 1.2068,
"step": 620
},
{
"epoch": 0.04164187983343248,
"grad_norm": 8.074010848999023,
"learning_rate": 2.1721384634871592e-05,
"loss": 1.1598,
"step": 630
},
{
"epoch": 0.04230286205301077,
"grad_norm": 11.10396957397461,
"learning_rate": 2.169727638231948e-05,
"loss": 1.0609,
"step": 640
},
{
"epoch": 0.04296384427258907,
"grad_norm": 7.929290771484375,
"learning_rate": 2.1672735819700084e-05,
"loss": 1.1761,
"step": 650
},
{
"epoch": 0.04362482649216736,
"grad_norm": 12.149751663208008,
"learning_rate": 2.1647763955779823e-05,
"loss": 1.35,
"step": 660
},
{
"epoch": 0.04428580871174565,
"grad_norm": 12.335487365722656,
"learning_rate": 2.1622361817054213e-05,
"loss": 1.2615,
"step": 670
},
{
"epoch": 0.04494679093132395,
"grad_norm": 10.838406562805176,
"learning_rate": 2.1596530447705676e-05,
"loss": 1.1423,
"step": 680
},
{
"epoch": 0.04560777315090224,
"grad_norm": 11.29602336883545,
"learning_rate": 2.157027090956064e-05,
"loss": 1.2088,
"step": 690
},
{
"epoch": 0.04626875537048054,
"grad_norm": 6.865326881408691,
"learning_rate": 2.1543584282045862e-05,
"loss": 1.2449,
"step": 700
},
{
"epoch": 0.04692973759005883,
"grad_norm": 11.23728084564209,
"learning_rate": 2.1516471662144077e-05,
"loss": 1.3072,
"step": 710
},
{
"epoch": 0.04759071980963712,
"grad_norm": 9.809483528137207,
"learning_rate": 2.1488934164348898e-05,
"loss": 1.2592,
"step": 720
},
{
"epoch": 0.048251702029215415,
"grad_norm": 12.237908363342285,
"learning_rate": 2.1460972920619e-05,
"loss": 1.2014,
"step": 730
},
{
"epoch": 0.048912684248793706,
"grad_norm": 12.795587539672852,
"learning_rate": 2.143258908033159e-05,
"loss": 1.2433,
"step": 740
},
{
"epoch": 0.049573666468372,
"grad_norm": 13.611194610595703,
"learning_rate": 2.140378381023518e-05,
"loss": 1.2548,
"step": 750
},
{
"epoch": 0.049573666468372,
"eval_loss": 1.3183883428573608,
"eval_runtime": 55.6542,
"eval_samples_per_second": 9.002,
"eval_steps_per_second": 9.002,
"step": 750
},
{
"epoch": 0.050234648687950294,
"grad_norm": 4.964775085449219,
"learning_rate": 2.1374558294401597e-05,
"loss": 1.2587,
"step": 760
},
{
"epoch": 0.050895630907528584,
"grad_norm": 13.402926445007324,
"learning_rate": 2.134491373417733e-05,
"loss": 1.1855,
"step": 770
},
{
"epoch": 0.05155661312710688,
"grad_norm": 8.38901138305664,
"learning_rate": 2.1314851348134134e-05,
"loss": 1.3289,
"step": 780
},
{
"epoch": 0.05221759534668517,
"grad_norm": 6.840709686279297,
"learning_rate": 2.1284372372018963e-05,
"loss": 1.1234,
"step": 790
},
{
"epoch": 0.05287857756626347,
"grad_norm": 6.543496608734131,
"learning_rate": 2.125347805870314e-05,
"loss": 1.2149,
"step": 800
},
{
"epoch": 0.05353955978584176,
"grad_norm": 7.223635196685791,
"learning_rate": 2.122216967813088e-05,
"loss": 1.0977,
"step": 810
},
{
"epoch": 0.05420054200542006,
"grad_norm": 10.436606407165527,
"learning_rate": 2.1190448517267087e-05,
"loss": 1.1564,
"step": 820
},
{
"epoch": 0.05486152422499835,
"grad_norm": 17.590259552001953,
"learning_rate": 2.115831588004444e-05,
"loss": 1.3229,
"step": 830
},
{
"epoch": 0.05552250644457664,
"grad_norm": 11.749155044555664,
"learning_rate": 2.1125773087309798e-05,
"loss": 1.2345,
"step": 840
},
{
"epoch": 0.056183488664154936,
"grad_norm": 11.912696838378906,
"learning_rate": 2.1092821476769906e-05,
"loss": 1.1779,
"step": 850
},
{
"epoch": 0.05684447088373323,
"grad_norm": 5.420770168304443,
"learning_rate": 2.1059462402936416e-05,
"loss": 1.2414,
"step": 860
},
{
"epoch": 0.057505453103311524,
"grad_norm": 4.887539863586426,
"learning_rate": 2.102569723707019e-05,
"loss": 1.1046,
"step": 870
},
{
"epoch": 0.058166435322889815,
"grad_norm": 9.325897216796875,
"learning_rate": 2.0991527367124955e-05,
"loss": 1.3145,
"step": 880
},
{
"epoch": 0.058827417542468105,
"grad_norm": 14.635684967041016,
"learning_rate": 2.095695419769022e-05,
"loss": 1.3592,
"step": 890
},
{
"epoch": 0.0594883997620464,
"grad_norm": 8.91545295715332,
"learning_rate": 2.0921979149933576e-05,
"loss": 1.3035,
"step": 900
},
{
"epoch": 0.0594883997620464,
"eval_loss": 1.3120555877685547,
"eval_runtime": 52.1726,
"eval_samples_per_second": 9.603,
"eval_steps_per_second": 9.603,
"step": 900
},
{
"epoch": 0.06014938198162469,
"grad_norm": 6.539499759674072,
"learning_rate": 2.0886603661542245e-05,
"loss": 1.2819,
"step": 910
},
{
"epoch": 0.06081036420120299,
"grad_norm": 5.03954553604126,
"learning_rate": 2.0850829186663994e-05,
"loss": 1.2467,
"step": 920
},
{
"epoch": 0.06147134642078128,
"grad_norm": 12.52458381652832,
"learning_rate": 2.0814657195847375e-05,
"loss": 1.1568,
"step": 930
},
{
"epoch": 0.06213232864035957,
"grad_norm": 11.251747131347656,
"learning_rate": 2.077808917598125e-05,
"loss": 1.1703,
"step": 940
},
{
"epoch": 0.06279331085993786,
"grad_norm": 10.658408164978027,
"learning_rate": 2.0741126630233687e-05,
"loss": 1.1074,
"step": 950
},
{
"epoch": 0.06345429307951617,
"grad_norm": 6.95957612991333,
"learning_rate": 2.070377107799017e-05,
"loss": 1.1635,
"step": 960
},
{
"epoch": 0.06411527529909446,
"grad_norm": 10.898233413696289,
"learning_rate": 2.0666024054791137e-05,
"loss": 1.2801,
"step": 970
},
{
"epoch": 0.06477625751867275,
"grad_norm": 12.640921592712402,
"learning_rate": 2.0627887112268875e-05,
"loss": 1.2982,
"step": 980
},
{
"epoch": 0.06543723973825104,
"grad_norm": 6.845248699188232,
"learning_rate": 2.0589361818083712e-05,
"loss": 1.0552,
"step": 990
},
{
"epoch": 0.06609822195782933,
"grad_norm": 12.774737358093262,
"learning_rate": 2.0550449755859598e-05,
"loss": 1.149,
"step": 1000
},
{
"epoch": 0.06675920417740763,
"grad_norm": 12.460762977600098,
"learning_rate": 2.0511152525119014e-05,
"loss": 1.0864,
"step": 1010
},
{
"epoch": 0.06742018639698592,
"grad_norm": 12.369227409362793,
"learning_rate": 2.0471471741217183e-05,
"loss": 1.2691,
"step": 1020
},
{
"epoch": 0.06808116861656421,
"grad_norm": 15.577491760253906,
"learning_rate": 2.0431409035275724e-05,
"loss": 1.3091,
"step": 1030
},
{
"epoch": 0.0687421508361425,
"grad_norm": 8.849650382995605,
"learning_rate": 2.0390966054115558e-05,
"loss": 1.2703,
"step": 1040
},
{
"epoch": 0.0694031330557208,
"grad_norm": 13.82666015625,
"learning_rate": 2.035014446018924e-05,
"loss": 1.388,
"step": 1050
},
{
"epoch": 0.0694031330557208,
"eval_loss": 1.303145170211792,
"eval_runtime": 53.8965,
"eval_samples_per_second": 9.296,
"eval_steps_per_second": 9.296,
"step": 1050
},
{
"epoch": 0.0700641152752991,
"grad_norm": 11.953422546386719,
"learning_rate": 2.0308945931512606e-05,
"loss": 1.1849,
"step": 1060
},
{
"epoch": 0.07072509749487739,
"grad_norm": 6.583851337432861,
"learning_rate": 2.0267372161595806e-05,
"loss": 1.2334,
"step": 1070
},
{
"epoch": 0.07138607971445568,
"grad_norm": 10.967381477355957,
"learning_rate": 2.022542485937369e-05,
"loss": 1.146,
"step": 1080
},
{
"epoch": 0.07204706193403397,
"grad_norm": 11.6732177734375,
"learning_rate": 2.0183105749135553e-05,
"loss": 1.1601,
"step": 1090
},
{
"epoch": 0.07270804415361226,
"grad_norm": 11.63559341430664,
"learning_rate": 2.0140416570454266e-05,
"loss": 1.2845,
"step": 1100
},
{
"epoch": 0.07336902637319057,
"grad_norm": 8.482784271240234,
"learning_rate": 2.0097359078114767e-05,
"loss": 1.1344,
"step": 1110
},
{
"epoch": 0.07403000859276886,
"grad_norm": 11.602831840515137,
"learning_rate": 2.0053935042041915e-05,
"loss": 1.2167,
"step": 1120
},
{
"epoch": 0.07469099081234715,
"grad_norm": 6.016249179840088,
"learning_rate": 2.001014624722775e-05,
"loss": 1.2611,
"step": 1130
},
{
"epoch": 0.07535197303192544,
"grad_norm": 6.9794020652771,
"learning_rate": 1.996599449365813e-05,
"loss": 1.0101,
"step": 1140
},
{
"epoch": 0.07601295525150373,
"grad_norm": 10.84961986541748,
"learning_rate": 1.9921481596238703e-05,
"loss": 1.1906,
"step": 1150
},
{
"epoch": 0.07667393747108203,
"grad_norm": 13.637924194335938,
"learning_rate": 1.9876609384720335e-05,
"loss": 1.2617,
"step": 1160
},
{
"epoch": 0.07733491969066032,
"grad_norm": 11.967713356018066,
"learning_rate": 1.9831379703623903e-05,
"loss": 1.1903,
"step": 1170
},
{
"epoch": 0.07799590191023861,
"grad_norm": 12.296497344970703,
"learning_rate": 1.978579441216443e-05,
"loss": 0.9757,
"step": 1180
},
{
"epoch": 0.0786568841298169,
"grad_norm": 12.823221206665039,
"learning_rate": 1.9739855384174708e-05,
"loss": 1.2341,
"step": 1190
},
{
"epoch": 0.07931786634939521,
"grad_norm": 9.349319458007812,
"learning_rate": 1.969356450802825e-05,
"loss": 1.1929,
"step": 1200
},
{
"epoch": 0.07931786634939521,
"eval_loss": 1.3002644777297974,
"eval_runtime": 46.8524,
"eval_samples_per_second": 10.693,
"eval_steps_per_second": 10.693,
"step": 1200
},
{
"epoch": 0.0799788485689735,
"grad_norm": 6.869687080383301,
"learning_rate": 1.964692368656166e-05,
"loss": 0.9831,
"step": 1210
},
{
"epoch": 0.08063983078855179,
"grad_norm": 12.35352897644043,
"learning_rate": 1.9599934836996435e-05,
"loss": 1.1827,
"step": 1220
},
{
"epoch": 0.08130081300813008,
"grad_norm": 14.163335800170898,
"learning_rate": 1.9552599890860126e-05,
"loss": 1.2183,
"step": 1230
},
{
"epoch": 0.08196179522770837,
"grad_norm": 14.357596397399902,
"learning_rate": 1.9504920793906985e-05,
"loss": 1.1122,
"step": 1240
},
{
"epoch": 0.08262277744728667,
"grad_norm": 12.211373329162598,
"learning_rate": 1.945689950603793e-05,
"loss": 1.1785,
"step": 1250
},
{
"epoch": 0.08328375966686496,
"grad_norm": 9.271207809448242,
"learning_rate": 1.9408538001220032e-05,
"loss": 1.3458,
"step": 1260
},
{
"epoch": 0.08394474188644326,
"grad_norm": 8.985238075256348,
"learning_rate": 1.9359838267405318e-05,
"loss": 1.2764,
"step": 1270
},
{
"epoch": 0.08460572410602155,
"grad_norm": 6.032650947570801,
"learning_rate": 1.931080230644911e-05,
"loss": 1.1252,
"step": 1280
},
{
"epoch": 0.08526670632559984,
"grad_norm": 8.561097145080566,
"learning_rate": 1.926143213402771e-05,
"loss": 1.1761,
"step": 1290
},
{
"epoch": 0.08592768854517814,
"grad_norm": 11.316914558410645,
"learning_rate": 1.921172977955552e-05,
"loss": 1.2844,
"step": 1300
},
{
"epoch": 0.08658867076475643,
"grad_norm": 11.52777099609375,
"learning_rate": 1.9161697286101677e-05,
"loss": 1.3252,
"step": 1310
},
{
"epoch": 0.08724965298433472,
"grad_norm": 7.112990379333496,
"learning_rate": 1.9111336710306013e-05,
"loss": 1.2886,
"step": 1320
},
{
"epoch": 0.08791063520391301,
"grad_norm": 11.982434272766113,
"learning_rate": 1.9060650122294554e-05,
"loss": 1.2249,
"step": 1330
},
{
"epoch": 0.0885716174234913,
"grad_norm": 5.956284046173096,
"learning_rate": 1.9009639605594407e-05,
"loss": 1.1993,
"step": 1340
},
{
"epoch": 0.08923259964306961,
"grad_norm": 6.896420955657959,
"learning_rate": 1.8958307257048116e-05,
"loss": 1.2083,
"step": 1350
},
{
"epoch": 0.08923259964306961,
"eval_loss": 1.2925916910171509,
"eval_runtime": 53.3979,
"eval_samples_per_second": 9.382,
"eval_steps_per_second": 9.382,
"step": 1350
},
{
"epoch": 0.0898935818626479,
"grad_norm": 11.231532096862793,
"learning_rate": 1.890665518672748e-05,
"loss": 1.3071,
"step": 1360
},
{
"epoch": 0.09055456408222619,
"grad_norm": 8.269697189331055,
"learning_rate": 1.88546855178468e-05,
"loss": 1.3681,
"step": 1370
},
{
"epoch": 0.09121554630180448,
"grad_norm": 9.768874168395996,
"learning_rate": 1.880240038667561e-05,
"loss": 1.1444,
"step": 1380
},
{
"epoch": 0.09187652852138277,
"grad_norm": 12.701289176940918,
"learning_rate": 1.874980194245087e-05,
"loss": 1.2358,
"step": 1390
},
{
"epoch": 0.09253751074096107,
"grad_norm": 7.481356620788574,
"learning_rate": 1.8696892347288606e-05,
"loss": 1.2474,
"step": 1400
},
{
"epoch": 0.09319849296053936,
"grad_norm": 5.565570831298828,
"learning_rate": 1.864367377609504e-05,
"loss": 1.3041,
"step": 1410
},
{
"epoch": 0.09385947518011765,
"grad_norm": 11.658685684204102,
"learning_rate": 1.8590148416477198e-05,
"loss": 1.2475,
"step": 1420
},
{
"epoch": 0.09452045739969595,
"grad_norm": 7.721464157104492,
"learning_rate": 1.8536318468652962e-05,
"loss": 1.2889,
"step": 1430
},
{
"epoch": 0.09518143961927424,
"grad_norm": 13.417887687683105,
"learning_rate": 1.8482186145360648e-05,
"loss": 1.0137,
"step": 1440
},
{
"epoch": 0.09584242183885254,
"grad_norm": 12.11631965637207,
"learning_rate": 1.8427753671768056e-05,
"loss": 1.1422,
"step": 1450
},
{
"epoch": 0.09650340405843083,
"grad_norm": 10.596673965454102,
"learning_rate": 1.8373023285380966e-05,
"loss": 1.3137,
"step": 1460
},
{
"epoch": 0.09716438627800912,
"grad_norm": 7.0566558837890625,
"learning_rate": 1.8317997235951204e-05,
"loss": 1.1111,
"step": 1470
},
{
"epoch": 0.09782536849758741,
"grad_norm": 11.534781455993652,
"learning_rate": 1.8262677785384142e-05,
"loss": 1.207,
"step": 1480
},
{
"epoch": 0.0984863507171657,
"grad_norm": 10.579961776733398,
"learning_rate": 1.8207067207645716e-05,
"loss": 1.0107,
"step": 1490
},
{
"epoch": 0.099147332936744,
"grad_norm": 11.584352493286133,
"learning_rate": 1.815116778866897e-05,
"loss": 1.3272,
"step": 1500
},
{
"epoch": 0.099147332936744,
"eval_loss": 1.2920811176300049,
"eval_runtime": 56.3843,
"eval_samples_per_second": 8.885,
"eval_steps_per_second": 8.885,
"step": 1500
},
{
"epoch": 0.0998083151563223,
"grad_norm": 12.167766571044922,
"learning_rate": 1.8094981826260064e-05,
"loss": 1.1052,
"step": 1510
},
{
"epoch": 0.10046929737590059,
"grad_norm": 6.422857284545898,
"learning_rate": 1.8038511630003865e-05,
"loss": 1.2341,
"step": 1520
},
{
"epoch": 0.10113027959547888,
"grad_norm": 11.502632141113281,
"learning_rate": 1.798175952116895e-05,
"loss": 1.2251,
"step": 1530
},
{
"epoch": 0.10179126181505717,
"grad_norm": 13.205157279968262,
"learning_rate": 1.7924727832612227e-05,
"loss": 1.2488,
"step": 1540
},
{
"epoch": 0.10245224403463547,
"grad_norm": 7.521269798278809,
"learning_rate": 1.786741890868305e-05,
"loss": 1.2128,
"step": 1550
},
{
"epoch": 0.10311322625421376,
"grad_norm": 7.006454944610596,
"learning_rate": 1.7809835105126807e-05,
"loss": 1.1772,
"step": 1560
},
{
"epoch": 0.10377420847379205,
"grad_norm": 10.070454597473145,
"learning_rate": 1.7751978788988123e-05,
"loss": 1.2622,
"step": 1570
},
{
"epoch": 0.10443519069337034,
"grad_norm": 5.716686248779297,
"learning_rate": 1.7693852338513545e-05,
"loss": 1.2284,
"step": 1580
},
{
"epoch": 0.10509617291294863,
"grad_norm": 9.35854721069336,
"learning_rate": 1.7635458143053794e-05,
"loss": 1.1278,
"step": 1590
},
{
"epoch": 0.10575715513252694,
"grad_norm": 8.222880363464355,
"learning_rate": 1.7576798602965525e-05,
"loss": 1.2629,
"step": 1600
},
{
"epoch": 0.10641813735210523,
"grad_norm": 7.391974925994873,
"learning_rate": 1.7517876129512677e-05,
"loss": 1.1084,
"step": 1610
},
{
"epoch": 0.10707911957168352,
"grad_norm": 9.882158279418945,
"learning_rate": 1.7458693144767353e-05,
"loss": 1.1754,
"step": 1620
},
{
"epoch": 0.10774010179126181,
"grad_norm": 6.603885173797607,
"learning_rate": 1.7399252081510248e-05,
"loss": 1.2642,
"step": 1630
},
{
"epoch": 0.10840108401084012,
"grad_norm": 9.928793907165527,
"learning_rate": 1.733955538313066e-05,
"loss": 1.2299,
"step": 1640
},
{
"epoch": 0.1090620662304184,
"grad_norm": 13.607159614562988,
"learning_rate": 1.7279605503526047e-05,
"loss": 1.3297,
"step": 1650
},
{
"epoch": 0.1090620662304184,
"eval_loss": 1.2833280563354492,
"eval_runtime": 56.0628,
"eval_samples_per_second": 8.936,
"eval_steps_per_second": 8.936,
"step": 1650
},
{
"epoch": 0.1097230484499967,
"grad_norm": 12.829073905944824,
"learning_rate": 1.721940490700115e-05,
"loss": 1.1734,
"step": 1660
},
{
"epoch": 0.11038403066957499,
"grad_norm": 5.9544548988342285,
"learning_rate": 1.7158956068166697e-05,
"loss": 1.0935,
"step": 1670
},
{
"epoch": 0.11104501288915328,
"grad_norm": 7.440855503082275,
"learning_rate": 1.7098261471837696e-05,
"loss": 1.22,
"step": 1680
},
{
"epoch": 0.11170599510873158,
"grad_norm": 5.567168235778809,
"learning_rate": 1.7037323612931272e-05,
"loss": 1.1423,
"step": 1690
},
{
"epoch": 0.11236697732830987,
"grad_norm": 5.937944412231445,
"learning_rate": 1.697614499636414e-05,
"loss": 1.148,
"step": 1700
},
{
"epoch": 0.11302795954788816,
"grad_norm": 6.795397758483887,
"learning_rate": 1.6914728136949594e-05,
"loss": 1.2881,
"step": 1710
},
{
"epoch": 0.11368894176746645,
"grad_norm": 8.981378555297852,
"learning_rate": 1.6853075559294172e-05,
"loss": 1.1772,
"step": 1720
},
{
"epoch": 0.11434992398704474,
"grad_norm": 9.995403289794922,
"learning_rate": 1.6791189797693877e-05,
"loss": 1.1541,
"step": 1730
},
{
"epoch": 0.11501090620662305,
"grad_norm": 12.851771354675293,
"learning_rate": 1.6729073396029965e-05,
"loss": 1.2167,
"step": 1740
},
{
"epoch": 0.11567188842620134,
"grad_norm": 12.812955856323242,
"learning_rate": 1.666672890766442e-05,
"loss": 1.1763,
"step": 1750
},
{
"epoch": 0.11633287064577963,
"grad_norm": 8.584874153137207,
"learning_rate": 1.660415889533497e-05,
"loss": 1.2797,
"step": 1760
},
{
"epoch": 0.11699385286535792,
"grad_norm": 8.92071533203125,
"learning_rate": 1.6541365931049757e-05,
"loss": 1.23,
"step": 1770
},
{
"epoch": 0.11765483508493621,
"grad_norm": 5.1022210121154785,
"learning_rate": 1.6478352595981594e-05,
"loss": 1.0536,
"step": 1780
},
{
"epoch": 0.11831581730451451,
"grad_norm": 8.801514625549316,
"learning_rate": 1.6415121480361884e-05,
"loss": 1.0129,
"step": 1790
},
{
"epoch": 0.1189767995240928,
"grad_norm": 11.475573539733887,
"learning_rate": 1.635167518337413e-05,
"loss": 1.2538,
"step": 1800
},
{
"epoch": 0.1189767995240928,
"eval_loss": 1.278364896774292,
"eval_runtime": 47.0777,
"eval_samples_per_second": 10.642,
"eval_steps_per_second": 10.642,
"step": 1800
},
{
"epoch": 0.1196377817436711,
"grad_norm": 10.728155136108398,
"learning_rate": 1.6288016313047095e-05,
"loss": 1.2208,
"step": 1810
},
{
"epoch": 0.12029876396324939,
"grad_norm": 12.165102005004883,
"learning_rate": 1.6224147486147602e-05,
"loss": 1.3179,
"step": 1820
},
{
"epoch": 0.12095974618282768,
"grad_norm": 10.370355606079102,
"learning_rate": 1.616007132807298e-05,
"loss": 1.226,
"step": 1830
},
{
"epoch": 0.12162072840240598,
"grad_norm": 13.64041519165039,
"learning_rate": 1.6095790472743107e-05,
"loss": 1.287,
"step": 1840
},
{
"epoch": 0.12228171062198427,
"grad_norm": 9.342700958251953,
"learning_rate": 1.6031307562492174e-05,
"loss": 1.2169,
"step": 1850
},
{
"epoch": 0.12294269284156256,
"grad_norm": 5.222902297973633,
"learning_rate": 1.5966625247960068e-05,
"loss": 1.2688,
"step": 1860
},
{
"epoch": 0.12360367506114085,
"grad_norm": 6.980830669403076,
"learning_rate": 1.5901746187983387e-05,
"loss": 1.1797,
"step": 1870
},
{
"epoch": 0.12426465728071914,
"grad_norm": 10.581820487976074,
"learning_rate": 1.5836673049486175e-05,
"loss": 1.1752,
"step": 1880
},
{
"epoch": 0.12492563950029745,
"grad_norm": 10.523150444030762,
"learning_rate": 1.577140850737029e-05,
"loss": 1.2042,
"step": 1890
},
{
"epoch": 0.12558662171987572,
"grad_norm": 6.221709251403809,
"learning_rate": 1.5705955244405423e-05,
"loss": 1.1912,
"step": 1900
},
{
"epoch": 0.12624760393945403,
"grad_norm": 10.54680347442627,
"learning_rate": 1.564031595111886e-05,
"loss": 1.2476,
"step": 1910
},
{
"epoch": 0.12690858615903233,
"grad_norm": 5.043491840362549,
"learning_rate": 1.557449332568485e-05,
"loss": 1.2221,
"step": 1920
},
{
"epoch": 0.1275695683786106,
"grad_norm": 10.203733444213867,
"learning_rate": 1.5508490073813722e-05,
"loss": 1.1716,
"step": 1930
},
{
"epoch": 0.1282305505981889,
"grad_norm": 7.249475955963135,
"learning_rate": 1.5442308908640636e-05,
"loss": 1.1548,
"step": 1940
},
{
"epoch": 0.1288915328177672,
"grad_norm": 11.740514755249023,
"learning_rate": 1.537595255061408e-05,
"loss": 1.1863,
"step": 1950
},
{
"epoch": 0.1288915328177672,
"eval_loss": 1.2681256532669067,
"eval_runtime": 53.9387,
"eval_samples_per_second": 9.288,
"eval_steps_per_second": 9.288,
"step": 1950
},
{
"epoch": 0.1295525150373455,
"grad_norm": 9.638320922851562,
"learning_rate": 1.5309423727384037e-05,
"loss": 1.2506,
"step": 1960
},
{
"epoch": 0.1302134972569238,
"grad_norm": 7.702147483825684,
"learning_rate": 1.5242725173689851e-05,
"loss": 1.1908,
"step": 1970
},
{
"epoch": 0.13087447947650208,
"grad_norm": 15.315128326416016,
"learning_rate": 1.5175859631247827e-05,
"loss": 1.1775,
"step": 1980
},
{
"epoch": 0.13153546169608038,
"grad_norm": 6.902062892913818,
"learning_rate": 1.5108829848638515e-05,
"loss": 1.1696,
"step": 1990
},
{
"epoch": 0.13219644391565866,
"grad_norm": 10.421862602233887,
"learning_rate": 1.5041638581193741e-05,
"loss": 1.1456,
"step": 2000
},
{
"epoch": 0.13285742613523696,
"grad_norm": 12.304083824157715,
"learning_rate": 1.4974288590883346e-05,
"loss": 1.0899,
"step": 2010
},
{
"epoch": 0.13351840835481527,
"grad_norm": 6.598790645599365,
"learning_rate": 1.4906782646201634e-05,
"loss": 1.1023,
"step": 2020
},
{
"epoch": 0.13417939057439354,
"grad_norm": 10.214670181274414,
"learning_rate": 1.4839123522053591e-05,
"loss": 1.1551,
"step": 2030
},
{
"epoch": 0.13484037279397185,
"grad_norm": 9.92830753326416,
"learning_rate": 1.4771313999640806e-05,
"loss": 1.1611,
"step": 2040
},
{
"epoch": 0.13550135501355012,
"grad_norm": 11.352734565734863,
"learning_rate": 1.4703356866347155e-05,
"loss": 1.1261,
"step": 2050
},
{
"epoch": 0.13616233723312843,
"grad_norm": 9.193647384643555,
"learning_rate": 1.4635254915624214e-05,
"loss": 1.1497,
"step": 2060
},
{
"epoch": 0.13682331945270673,
"grad_norm": 8.309967994689941,
"learning_rate": 1.4567010946876445e-05,
"loss": 1.2163,
"step": 2070
},
{
"epoch": 0.137484301672285,
"grad_norm": 9.005535125732422,
"learning_rate": 1.4498627765346109e-05,
"loss": 1.1769,
"step": 2080
},
{
"epoch": 0.1381452838918633,
"grad_norm": 6.557043552398682,
"learning_rate": 1.4430108181997962e-05,
"loss": 1.093,
"step": 2090
},
{
"epoch": 0.1388062661114416,
"grad_norm": 7.859200954437256,
"learning_rate": 1.4361455013403695e-05,
"loss": 1.2585,
"step": 2100
},
{
"epoch": 0.1388062661114416,
"eval_loss": 1.2679221630096436,
"eval_runtime": 46.9201,
"eval_samples_per_second": 10.678,
"eval_steps_per_second": 10.678,
"step": 2100
},
{
"epoch": 0.1394672483310199,
"grad_norm": 12.011978149414062,
"learning_rate": 1.4292671081626183e-05,
"loss": 1.2173,
"step": 2110
},
{
"epoch": 0.1401282305505982,
"grad_norm": 9.485074996948242,
"learning_rate": 1.4223759214103443e-05,
"loss": 1.2501,
"step": 2120
},
{
"epoch": 0.14078921277017648,
"grad_norm": 11.757882118225098,
"learning_rate": 1.4154722243532445e-05,
"loss": 1.1974,
"step": 2130
},
{
"epoch": 0.14145019498975478,
"grad_norm": 13.57962703704834,
"learning_rate": 1.4085563007752654e-05,
"loss": 1.1892,
"step": 2140
},
{
"epoch": 0.14211117720933306,
"grad_norm": 9.708785057067871,
"learning_rate": 1.4016284349629364e-05,
"loss": 1.225,
"step": 2150
},
{
"epoch": 0.14277215942891136,
"grad_norm": 10.492091178894043,
"learning_rate": 1.3946889116936874e-05,
"loss": 1.208,
"step": 2160
},
{
"epoch": 0.14343314164848966,
"grad_norm": 7.376300811767578,
"learning_rate": 1.3877380162241394e-05,
"loss": 1.1689,
"step": 2170
},
{
"epoch": 0.14409412386806794,
"grad_norm": 6.636634349822998,
"learning_rate": 1.3807760342783804e-05,
"loss": 1.1393,
"step": 2180
},
{
"epoch": 0.14475510608764625,
"grad_norm": 12.17708969116211,
"learning_rate": 1.37380325203622e-05,
"loss": 1.2818,
"step": 2190
},
{
"epoch": 0.14541608830722452,
"grad_norm": 12.49779987335205,
"learning_rate": 1.3668199561214252e-05,
"loss": 1.133,
"step": 2200
},
{
"epoch": 0.14607707052680283,
"grad_norm": 6.741744518280029,
"learning_rate": 1.35982643358994e-05,
"loss": 1.1637,
"step": 2210
},
{
"epoch": 0.14673805274638113,
"grad_norm": 9.643292427062988,
"learning_rate": 1.3528229719180835e-05,
"loss": 1.2758,
"step": 2220
},
{
"epoch": 0.1473990349659594,
"grad_norm": 10.941937446594238,
"learning_rate": 1.3458098589907348e-05,
"loss": 1.268,
"step": 2230
},
{
"epoch": 0.1480600171855377,
"grad_norm": 11.461699485778809,
"learning_rate": 1.3387873830894973e-05,
"loss": 1.0558,
"step": 2240
},
{
"epoch": 0.148720999405116,
"grad_norm": 6.023902893066406,
"learning_rate": 1.3317558328808506e-05,
"loss": 1.1131,
"step": 2250
},
{
"epoch": 0.148720999405116,
"eval_loss": 1.259637475013733,
"eval_runtime": 52.7273,
"eval_samples_per_second": 9.502,
"eval_steps_per_second": 9.502,
"step": 2250
},
{
"epoch": 0.1493819816246943,
"grad_norm": 11.362767219543457,
"learning_rate": 1.3247154974042827e-05,
"loss": 1.2487,
"step": 2260
},
{
"epoch": 0.1500429638442726,
"grad_norm": 12.16934585571289,
"learning_rate": 1.3176666660604102e-05,
"loss": 1.3317,
"step": 2270
},
{
"epoch": 0.15070394606385087,
"grad_norm": 7.8326849937438965,
"learning_rate": 1.3106096285990812e-05,
"loss": 1.1973,
"step": 2280
},
{
"epoch": 0.15136492828342918,
"grad_norm": 7.108518600463867,
"learning_rate": 1.3035446751074653e-05,
"loss": 1.1605,
"step": 2290
},
{
"epoch": 0.15202591050300746,
"grad_norm": 11.288322448730469,
"learning_rate": 1.2964720959981287e-05,
"loss": 1.1857,
"step": 2300
},
{
"epoch": 0.15268689272258576,
"grad_norm": 5.468815803527832,
"learning_rate": 1.2893921819970972e-05,
"loss": 1.2428,
"step": 2310
},
{
"epoch": 0.15334787494216406,
"grad_norm": 11.970479011535645,
"learning_rate": 1.2823052241319061e-05,
"loss": 1.2249,
"step": 2320
},
{
"epoch": 0.15400885716174234,
"grad_norm": 9.788006782531738,
"learning_rate": 1.2752115137196341e-05,
"loss": 1.1832,
"step": 2330
},
{
"epoch": 0.15466983938132065,
"grad_norm": 5.940231800079346,
"learning_rate": 1.2681113423549334e-05,
"loss": 1.0796,
"step": 2340
},
{
"epoch": 0.15533082160089895,
"grad_norm": 5.606922149658203,
"learning_rate": 1.2610050018980385e-05,
"loss": 0.9388,
"step": 2350
},
{
"epoch": 0.15599180382047723,
"grad_norm": 6.812578201293945,
"learning_rate": 1.2538927844627726e-05,
"loss": 1.12,
"step": 2360
},
{
"epoch": 0.15665278604005553,
"grad_norm": 10.468450546264648,
"learning_rate": 1.2467749824045373e-05,
"loss": 1.1143,
"step": 2370
},
{
"epoch": 0.1573137682596338,
"grad_norm": 6.699043273925781,
"learning_rate": 1.2396518883082966e-05,
"loss": 1.1317,
"step": 2380
},
{
"epoch": 0.1579747504792121,
"grad_norm": 11.339058876037598,
"learning_rate": 1.2325237949765496e-05,
"loss": 1.1824,
"step": 2390
},
{
"epoch": 0.15863573269879042,
"grad_norm": 6.434577941894531,
"learning_rate": 1.225390995417295e-05,
"loss": 1.0624,
"step": 2400
},
{
"epoch": 0.15863573269879042,
"eval_loss": 1.253835678100586,
"eval_runtime": 47.0266,
"eval_samples_per_second": 10.654,
"eval_steps_per_second": 10.654,
"step": 2400
},
{
"epoch": 0.1592967149183687,
"grad_norm": 10.957035064697266,
"learning_rate": 1.2182537828319848e-05,
"loss": 1.265,
"step": 2410
},
{
"epoch": 0.159957697137947,
"grad_norm": 12.669862747192383,
"learning_rate": 1.2111124506034739e-05,
"loss": 1.1453,
"step": 2420
},
{
"epoch": 0.16061867935752527,
"grad_norm": 12.645952224731445,
"learning_rate": 1.2039672922839598e-05,
"loss": 1.1506,
"step": 2430
},
{
"epoch": 0.16127966157710358,
"grad_norm": 12.920147895812988,
"learning_rate": 1.196818601582915e-05,
"loss": 1.0976,
"step": 2440
},
{
"epoch": 0.16194064379668188,
"grad_norm": 13.062854766845703,
"learning_rate": 1.189666672355015e-05,
"loss": 1.3518,
"step": 2450
},
{
"epoch": 0.16260162601626016,
"grad_norm": 5.583253860473633,
"learning_rate": 1.1825117985880576e-05,
"loss": 1.0854,
"step": 2460
},
{
"epoch": 0.16326260823583846,
"grad_norm": 12.410826683044434,
"learning_rate": 1.1753542743908802e-05,
"loss": 1.1561,
"step": 2470
},
{
"epoch": 0.16392359045541674,
"grad_norm": 11.445279121398926,
"learning_rate": 1.1681943939812688e-05,
"loss": 1.3584,
"step": 2480
},
{
"epoch": 0.16458457267499504,
"grad_norm": 6.8058342933654785,
"learning_rate": 1.1610324516738626e-05,
"loss": 1.2373,
"step": 2490
},
{
"epoch": 0.16524555489457335,
"grad_norm": 10.376558303833008,
"learning_rate": 1.1538687418680596e-05,
"loss": 1.0921,
"step": 2500
},
{
"epoch": 0.16590653711415163,
"grad_norm": 6.7869791984558105,
"learning_rate": 1.1467035590359106e-05,
"loss": 1.2743,
"step": 2510
},
{
"epoch": 0.16656751933372993,
"grad_norm": 12.313713073730469,
"learning_rate": 1.139537197710018e-05,
"loss": 1.1243,
"step": 2520
},
{
"epoch": 0.1672285015533082,
"grad_norm": 11.535476684570312,
"learning_rate": 1.1323699524714278e-05,
"loss": 1.2232,
"step": 2530
},
{
"epoch": 0.1678894837728865,
"grad_norm": 9.248635292053223,
"learning_rate": 1.1252021179375192e-05,
"loss": 1.0689,
"step": 2540
},
{
"epoch": 0.16855046599246482,
"grad_norm": 10.689653396606445,
"learning_rate": 1.118033988749895e-05,
"loss": 1.2617,
"step": 2550
},
{
"epoch": 0.16855046599246482,
"eval_loss": 1.2488397359848022,
"eval_runtime": 52.0382,
"eval_samples_per_second": 9.628,
"eval_steps_per_second": 9.628,
"step": 2550
},
{
"epoch": 0.1692114482120431,
"grad_norm": 12.502510070800781,
"learning_rate": 1.1108658595622709e-05,
"loss": 1.2023,
"step": 2560
},
{
"epoch": 0.1698724304316214,
"grad_norm": 11.087409973144531,
"learning_rate": 1.1036980250283621e-05,
"loss": 1.2207,
"step": 2570
},
{
"epoch": 0.17053341265119967,
"grad_norm": 9.92039680480957,
"learning_rate": 1.096530779789772e-05,
"loss": 1.1602,
"step": 2580
},
{
"epoch": 0.17119439487077798,
"grad_norm": 5.836206912994385,
"learning_rate": 1.0893644184638797e-05,
"loss": 1.0523,
"step": 2590
},
{
"epoch": 0.17185537709035628,
"grad_norm": 12.243383407592773,
"learning_rate": 1.0821992356317307e-05,
"loss": 1.2196,
"step": 2600
},
{
"epoch": 0.17251635930993456,
"grad_norm": 6.7921366691589355,
"learning_rate": 1.0750355258259273e-05,
"loss": 1.2333,
"step": 2610
},
{
"epoch": 0.17317734152951286,
"grad_norm": 11.758354187011719,
"learning_rate": 1.0678735835185219e-05,
"loss": 1.1695,
"step": 2620
},
{
"epoch": 0.17383832374909114,
"grad_norm": 12.446253776550293,
"learning_rate": 1.06071370310891e-05,
"loss": 1.1428,
"step": 2630
},
{
"epoch": 0.17449930596866944,
"grad_norm": 7.370149612426758,
"learning_rate": 1.0535561789117327e-05,
"loss": 1.262,
"step": 2640
},
{
"epoch": 0.17516028818824775,
"grad_norm": 10.489151954650879,
"learning_rate": 1.0464013051447755e-05,
"loss": 1.0921,
"step": 2650
},
{
"epoch": 0.17582127040782602,
"grad_norm": 10.34467887878418,
"learning_rate": 1.0392493759168751e-05,
"loss": 1.1942,
"step": 2660
},
{
"epoch": 0.17648225262740433,
"grad_norm": 11.04796314239502,
"learning_rate": 1.0321006852158306e-05,
"loss": 1.0937,
"step": 2670
},
{
"epoch": 0.1771432348469826,
"grad_norm": 12.193102836608887,
"learning_rate": 1.0249555268963164e-05,
"loss": 1.1015,
"step": 2680
},
{
"epoch": 0.1778042170665609,
"grad_norm": 11.928840637207031,
"learning_rate": 1.0178141946678054e-05,
"loss": 1.2069,
"step": 2690
},
{
"epoch": 0.17846519928613921,
"grad_norm": 6.055873870849609,
"learning_rate": 1.0106769820824951e-05,
"loss": 1.0915,
"step": 2700
},
{
"epoch": 0.17846519928613921,
"eval_loss": 1.246018409729004,
"eval_runtime": 47.997,
"eval_samples_per_second": 10.438,
"eval_steps_per_second": 10.438,
"step": 2700
},
{
"epoch": 0.1791261815057175,
"grad_norm": 7.3669586181640625,
"learning_rate": 1.0035441825232406e-05,
"loss": 1.0824,
"step": 2710
},
{
"epoch": 0.1797871637252958,
"grad_norm": 12.520928382873535,
"learning_rate": 9.964160891914937e-06,
"loss": 1.1395,
"step": 2720
},
{
"epoch": 0.18044814594487407,
"grad_norm": 6.952485084533691,
"learning_rate": 9.892929950952532e-06,
"loss": 1.1727,
"step": 2730
},
{
"epoch": 0.18110912816445238,
"grad_norm": 10.507661819458008,
"learning_rate": 9.821751930370177e-06,
"loss": 1.184,
"step": 2740
},
{
"epoch": 0.18177011038403068,
"grad_norm": 12.77137279510498,
"learning_rate": 9.750629756017514e-06,
"loss": 1.228,
"step": 2750
},
{
"epoch": 0.18243109260360896,
"grad_norm": 7.609248161315918,
"learning_rate": 9.679566351448571e-06,
"loss": 1.1315,
"step": 2760
},
{
"epoch": 0.18309207482318726,
"grad_norm": 11.428009986877441,
"learning_rate": 9.608564637801562e-06,
"loss": 1.041,
"step": 2770
},
{
"epoch": 0.18375305704276554,
"grad_norm": 12.582087516784668,
"learning_rate": 9.537627533678842e-06,
"loss": 1.1608,
"step": 2780
},
{
"epoch": 0.18441403926234384,
"grad_norm": 10.488136291503906,
"learning_rate": 9.466757955026925e-06,
"loss": 1.0935,
"step": 2790
},
{
"epoch": 0.18507502148192215,
"grad_norm": 12.54319953918457,
"learning_rate": 9.395958815016618e-06,
"loss": 1.1654,
"step": 2800
},
{
"epoch": 0.18573600370150042,
"grad_norm": 10.314374923706055,
"learning_rate": 9.325233023923252e-06,
"loss": 1.2293,
"step": 2810
},
{
"epoch": 0.18639698592107873,
"grad_norm": 7.015604496002197,
"learning_rate": 9.25458348900709e-06,
"loss": 1.0994,
"step": 2820
},
{
"epoch": 0.187057968140657,
"grad_norm": 6.349636554718018,
"learning_rate": 9.1840131143938e-06,
"loss": 1.2272,
"step": 2830
},
{
"epoch": 0.1877189503602353,
"grad_norm": 9.584831237792969,
"learning_rate": 9.113524800955074e-06,
"loss": 1.1187,
"step": 2840
},
{
"epoch": 0.1883799325798136,
"grad_norm": 4.967813491821289,
"learning_rate": 9.043121446189398e-06,
"loss": 1.0012,
"step": 2850
},
{
"epoch": 0.1883799325798136,
"eval_loss": 1.2398909330368042,
"eval_runtime": 53.5377,
"eval_samples_per_second": 9.358,
"eval_steps_per_second": 9.358,
"step": 2850
},
{
"epoch": 0.1890409147993919,
"grad_norm": 11.762967109680176,
"learning_rate": 8.972805944102928e-06,
"loss": 1.1628,
"step": 2860
},
{
"epoch": 0.1897018970189702,
"grad_norm": 9.806082725524902,
"learning_rate": 8.902581185090555e-06,
"loss": 1.0982,
"step": 2870
},
{
"epoch": 0.19036287923854847,
"grad_norm": 5.619679927825928,
"learning_rate": 8.832450055817064e-06,
"loss": 1.1545,
"step": 2880
},
{
"epoch": 0.19102386145812678,
"grad_norm": 12.290181159973145,
"learning_rate": 8.7624154390985e-06,
"loss": 1.1625,
"step": 2890
},
{
"epoch": 0.19168484367770508,
"grad_norm": 12.353217124938965,
"learning_rate": 8.692480213783649e-06,
"loss": 1.159,
"step": 2900
},
{
"epoch": 0.19234582589728336,
"grad_norm": 9.661192893981934,
"learning_rate": 8.622647254635703e-06,
"loss": 1.2334,
"step": 2910
},
{
"epoch": 0.19300680811686166,
"grad_norm": 10.236005783081055,
"learning_rate": 8.552919432214097e-06,
"loss": 1.1434,
"step": 2920
},
{
"epoch": 0.19366779033643994,
"grad_norm": 11.429096221923828,
"learning_rate": 8.483299612756505e-06,
"loss": 1.2204,
"step": 2930
},
{
"epoch": 0.19432877255601824,
"grad_norm": 7.723197937011719,
"learning_rate": 8.413790658061028e-06,
"loss": 1.2049,
"step": 2940
},
{
"epoch": 0.19498975477559655,
"grad_norm": 9.042826652526855,
"learning_rate": 8.344395425368537e-06,
"loss": 1.1231,
"step": 2950
},
{
"epoch": 0.19565073699517482,
"grad_norm": 11.260157585144043,
"learning_rate": 8.275116767245251e-06,
"loss": 1.1543,
"step": 2960
},
{
"epoch": 0.19631171921475313,
"grad_norm": 5.6008830070495605,
"learning_rate": 8.205957531465456e-06,
"loss": 1.0243,
"step": 2970
},
{
"epoch": 0.1969727014343314,
"grad_norm": 5.492390155792236,
"learning_rate": 8.136920560894458e-06,
"loss": 1.2962,
"step": 2980
},
{
"epoch": 0.1976336836539097,
"grad_norm": 10.791748046875,
"learning_rate": 8.068008693371723e-06,
"loss": 1.0384,
"step": 2990
},
{
"epoch": 0.198294665873488,
"grad_norm": 6.472116470336914,
"learning_rate": 7.999224761594206e-06,
"loss": 1.0479,
"step": 3000
},
{
"epoch": 0.198294665873488,
"eval_loss": 1.2349213361740112,
"eval_runtime": 53.0521,
"eval_samples_per_second": 9.444,
"eval_steps_per_second": 9.444,
"step": 3000
},
{
"epoch": 0.1989556480930663,
"grad_norm": 7.443964958190918,
"learning_rate": 7.930571592999942e-06,
"loss": 1.1367,
"step": 3010
},
{
"epoch": 0.1996166303126446,
"grad_norm": 7.271074295043945,
"learning_rate": 7.86205200965179e-06,
"loss": 1.1435,
"step": 3020
},
{
"epoch": 0.20027761253222287,
"grad_norm": 12.19694995880127,
"learning_rate": 7.793668828121457e-06,
"loss": 1.274,
"step": 3030
},
{
"epoch": 0.20093859475180118,
"grad_norm": 6.130085468292236,
"learning_rate": 7.725424859373688e-06,
"loss": 1.1887,
"step": 3040
},
{
"epoch": 0.20159957697137948,
"grad_norm": 8.441886901855469,
"learning_rate": 7.65732290865075e-06,
"loss": 1.1228,
"step": 3050
},
{
"epoch": 0.20226055919095776,
"grad_norm": 10.298881530761719,
"learning_rate": 7.589365775357096e-06,
"loss": 1.1681,
"step": 3060
},
{
"epoch": 0.20292154141053606,
"grad_norm": 5.6892218589782715,
"learning_rate": 7.52155625294431e-06,
"loss": 1.1967,
"step": 3070
},
{
"epoch": 0.20358252363011434,
"grad_norm": 4.733664035797119,
"learning_rate": 7.453897128796269e-06,
"loss": 0.9874,
"step": 3080
},
{
"epoch": 0.20424350584969264,
"grad_norm": 6.695845603942871,
"learning_rate": 7.386391184114558e-06,
"loss": 1.2284,
"step": 3090
},
{
"epoch": 0.20490448806927095,
"grad_norm": 11.191842079162598,
"learning_rate": 7.319041193804161e-06,
"loss": 1.2232,
"step": 3100
},
{
"epoch": 0.20556547028884922,
"grad_norm": 6.132591724395752,
"learning_rate": 7.2518499263593866e-06,
"loss": 1.12,
"step": 3110
},
{
"epoch": 0.20622645250842753,
"grad_norm": 11.867471694946289,
"learning_rate": 7.184820143750079e-06,
"loss": 1.1889,
"step": 3120
},
{
"epoch": 0.2068874347280058,
"grad_norm": 10.931007385253906,
"learning_rate": 7.117954601308052e-06,
"loss": 1.2347,
"step": 3130
},
{
"epoch": 0.2075484169475841,
"grad_norm": 12.895480155944824,
"learning_rate": 7.051256047613866e-06,
"loss": 1.216,
"step": 3140
},
{
"epoch": 0.2082093991671624,
"grad_norm": 10.634278297424316,
"learning_rate": 6.984727224383822e-06,
"loss": 1.1687,
"step": 3150
},
{
"epoch": 0.2082093991671624,
"eval_loss": 1.2307320833206177,
"eval_runtime": 58.1752,
"eval_samples_per_second": 8.612,
"eval_steps_per_second": 8.612,
"step": 3150
},
{
"epoch": 0.2088703813867407,
"grad_norm": 11.298223495483398,
"learning_rate": 6.918370866357266e-06,
"loss": 1.1429,
"step": 3160
},
{
"epoch": 0.209531363606319,
"grad_norm": 5.801537036895752,
"learning_rate": 6.852189701184183e-06,
"loss": 1.1809,
"step": 3170
},
{
"epoch": 0.21019234582589727,
"grad_norm": 11.565352439880371,
"learning_rate": 6.786186449313051e-06,
"loss": 1.1068,
"step": 3180
},
{
"epoch": 0.21085332804547557,
"grad_norm": 9.563201904296875,
"learning_rate": 6.720363823879042e-06,
"loss": 1.1438,
"step": 3190
},
{
"epoch": 0.21151431026505388,
"grad_norm": 3.7967348098754883,
"learning_rate": 6.6547245305924765e-06,
"loss": 1.1022,
"step": 3200
},
{
"epoch": 0.21217529248463216,
"grad_norm": 9.867331504821777,
"learning_rate": 6.589271267627615e-06,
"loss": 1.0329,
"step": 3210
},
{
"epoch": 0.21283627470421046,
"grad_norm": 10.908332824707031,
"learning_rate": 6.524006725511727e-06,
"loss": 1.0811,
"step": 3220
},
{
"epoch": 0.21349725692378874,
"grad_norm": 11.866363525390625,
"learning_rate": 6.4589335870145165e-06,
"loss": 1.1611,
"step": 3230
},
{
"epoch": 0.21415823914336704,
"grad_norm": 12.108943939208984,
"learning_rate": 6.394054527037837e-06,
"loss": 1.1558,
"step": 3240
},
{
"epoch": 0.21481922136294535,
"grad_norm": 11.09125804901123,
"learning_rate": 6.329372212505727e-06,
"loss": 1.1853,
"step": 3250
},
{
"epoch": 0.21548020358252362,
"grad_norm": 12.74525260925293,
"learning_rate": 6.264889302254797e-06,
"loss": 1.1862,
"step": 3260
},
{
"epoch": 0.21614118580210193,
"grad_norm": 9.876714706420898,
"learning_rate": 6.200608446924922e-06,
"loss": 1.1651,
"step": 3270
},
{
"epoch": 0.21680216802168023,
"grad_norm": 9.700896263122559,
"learning_rate": 6.136532288850295e-06,
"loss": 1.2345,
"step": 3280
},
{
"epoch": 0.2174631502412585,
"grad_norm": 10.941569328308105,
"learning_rate": 6.072663461950806e-06,
"loss": 1.0379,
"step": 3290
},
{
"epoch": 0.2181241324608368,
"grad_norm": 13.29504108428955,
"learning_rate": 6.009004591623776e-06,
"loss": 1.1251,
"step": 3300
},
{
"epoch": 0.2181241324608368,
"eval_loss": 1.2260839939117432,
"eval_runtime": 47.8562,
"eval_samples_per_second": 10.469,
"eval_steps_per_second": 10.469,
"step": 3300
},
{
"epoch": 0.2187851146804151,
"grad_norm": 8.1751708984375,
"learning_rate": 5.945558294636019e-06,
"loss": 1.1452,
"step": 3310
},
{
"epoch": 0.2194460968999934,
"grad_norm": 12.451173782348633,
"learning_rate": 5.882327179016307e-06,
"loss": 1.217,
"step": 3320
},
{
"epoch": 0.2201070791195717,
"grad_norm": 11.116937637329102,
"learning_rate": 5.819313843948146e-06,
"loss": 1.1602,
"step": 3330
},
{
"epoch": 0.22076806133914997,
"grad_norm": 10.272557258605957,
"learning_rate": 5.756520879662929e-06,
"loss": 1.2616,
"step": 3340
},
{
"epoch": 0.22142904355872828,
"grad_norm": 10.73164176940918,
"learning_rate": 5.693950867333488e-06,
"loss": 1.2448,
"step": 3350
},
{
"epoch": 0.22209002577830655,
"grad_norm": 11.405309677124023,
"learning_rate": 5.6316063789679415e-06,
"loss": 1.2419,
"step": 3360
},
{
"epoch": 0.22275100799788486,
"grad_norm": 6.117231369018555,
"learning_rate": 5.569489977304029e-06,
"loss": 1.2027,
"step": 3370
},
{
"epoch": 0.22341199021746316,
"grad_norm": 12.008468627929688,
"learning_rate": 5.507604215703729e-06,
"loss": 1.1525,
"step": 3380
},
{
"epoch": 0.22407297243704144,
"grad_norm": 6.268473148345947,
"learning_rate": 5.44595163804831e-06,
"loss": 1.1422,
"step": 3390
},
{
"epoch": 0.22473395465661974,
"grad_norm": 14.515848159790039,
"learning_rate": 5.384534778633763e-06,
"loss": 1.0998,
"step": 3400
},
{
"epoch": 0.22539493687619802,
"grad_norm": 10.610064506530762,
"learning_rate": 5.323356162066626e-06,
"loss": 1.2074,
"step": 3410
},
{
"epoch": 0.22605591909577633,
"grad_norm": 11.648080825805664,
"learning_rate": 5.262418303160206e-06,
"loss": 1.0755,
"step": 3420
},
{
"epoch": 0.22671690131535463,
"grad_norm": 6.210646629333496,
"learning_rate": 5.201723706831204e-06,
"loss": 1.1203,
"step": 3430
},
{
"epoch": 0.2273778835349329,
"grad_norm": 4.218708038330078,
"learning_rate": 5.141274867996755e-06,
"loss": 0.9939,
"step": 3440
},
{
"epoch": 0.2280388657545112,
"grad_norm": 8.179903030395508,
"learning_rate": 5.081074271471855e-06,
"loss": 1.0597,
"step": 3450
},
{
"epoch": 0.2280388657545112,
"eval_loss": 1.2263822555541992,
"eval_runtime": 52.855,
"eval_samples_per_second": 9.479,
"eval_steps_per_second": 9.479,
"step": 3450
},
{
"epoch": 0.2286998479740895,
"grad_norm": 13.975303649902344,
"learning_rate": 5.021124391867241e-06,
"loss": 1.1898,
"step": 3460
},
{
"epoch": 0.2293608301936678,
"grad_norm": 11.902430534362793,
"learning_rate": 4.961427693487654e-06,
"loss": 1.2382,
"step": 3470
},
{
"epoch": 0.2300218124132461,
"grad_norm": 7.363813877105713,
"learning_rate": 4.901986630230549e-06,
"loss": 1.1337,
"step": 3480
},
{
"epoch": 0.23068279463282437,
"grad_norm": 14.231773376464844,
"learning_rate": 4.842803645485228e-06,
"loss": 1.2631,
"step": 3490
},
{
"epoch": 0.23134377685240268,
"grad_norm": 13.055315971374512,
"learning_rate": 4.7838811720323795e-06,
"loss": 1.2307,
"step": 3500
},
{
"epoch": 0.23200475907198095,
"grad_norm": 11.109673500061035,
"learning_rate": 4.725221631944109e-06,
"loss": 1.0673,
"step": 3510
},
{
"epoch": 0.23266574129155926,
"grad_norm": 9.12000560760498,
"learning_rate": 4.666827436484355e-06,
"loss": 1.2818,
"step": 3520
},
{
"epoch": 0.23332672351113756,
"grad_norm": 11.266242980957031,
"learning_rate": 4.60870098600978e-06,
"loss": 0.9892,
"step": 3530
},
{
"epoch": 0.23398770573071584,
"grad_norm": 13.089488983154297,
"learning_rate": 4.550844669871095e-06,
"loss": 1.1585,
"step": 3540
},
{
"epoch": 0.23464868795029414,
"grad_norm": 9.938103675842285,
"learning_rate": 4.493260866314851e-06,
"loss": 1.1734,
"step": 3550
},
{
"epoch": 0.23530967016987242,
"grad_norm": 10.093935012817383,
"learning_rate": 4.435951942385671e-06,
"loss": 1.1185,
"step": 3560
},
{
"epoch": 0.23597065238945072,
"grad_norm": 4.782352924346924,
"learning_rate": 4.378920253828953e-06,
"loss": 1.1413,
"step": 3570
},
{
"epoch": 0.23663163460902903,
"grad_norm": 11.091765403747559,
"learning_rate": 4.322168144994041e-06,
"loss": 1.2909,
"step": 3580
},
{
"epoch": 0.2372926168286073,
"grad_norm": 10.81592845916748,
"learning_rate": 4.265697948737836e-06,
"loss": 1.2501,
"step": 3590
},
{
"epoch": 0.2379535990481856,
"grad_norm": 11.043889045715332,
"learning_rate": 4.209511986328935e-06,
"loss": 1.1757,
"step": 3600
},
{
"epoch": 0.2379535990481856,
"eval_loss": 1.223681092262268,
"eval_runtime": 54.0238,
"eval_samples_per_second": 9.274,
"eval_steps_per_second": 9.274,
"step": 3600
},
{
"epoch": 0.2386145812677639,
"grad_norm": 6.890323638916016,
"learning_rate": 4.153612567352186e-06,
"loss": 1.0562,
"step": 3610
},
{
"epoch": 0.2392755634873422,
"grad_norm": 8.741559028625488,
"learning_rate": 4.098001989613763e-06,
"loss": 1.1737,
"step": 3620
},
{
"epoch": 0.2399365457069205,
"grad_norm": 12.617691993713379,
"learning_rate": 4.042682539046698e-06,
"loss": 1.2365,
"step": 3630
},
{
"epoch": 0.24059752792649877,
"grad_norm": 6.839216232299805,
"learning_rate": 3.987656489616937e-06,
"loss": 1.1941,
"step": 3640
},
{
"epoch": 0.24125851014607708,
"grad_norm": 10.760446548461914,
"learning_rate": 3.932926103229849e-06,
"loss": 1.1187,
"step": 3650
},
{
"epoch": 0.24191949236565535,
"grad_norm": 7.493879795074463,
"learning_rate": 3.878493629637249e-06,
"loss": 1.1193,
"step": 3660
},
{
"epoch": 0.24258047458523366,
"grad_norm": 8.233012199401855,
"learning_rate": 3.824361306344942e-06,
"loss": 1.1905,
"step": 3670
},
{
"epoch": 0.24324145680481196,
"grad_norm": 8.992157936096191,
"learning_rate": 3.7705313585207056e-06,
"loss": 1.0877,
"step": 3680
},
{
"epoch": 0.24390243902439024,
"grad_norm": 13.892884254455566,
"learning_rate": 3.717005998902859e-06,
"loss": 1.1345,
"step": 3690
},
{
"epoch": 0.24456342124396854,
"grad_norm": 10.53703784942627,
"learning_rate": 3.6637874277092946e-06,
"loss": 1.1473,
"step": 3700
},
{
"epoch": 0.24522440346354682,
"grad_norm": 5.2873406410217285,
"learning_rate": 3.610877832547034e-06,
"loss": 1.0317,
"step": 3710
},
{
"epoch": 0.24588538568312512,
"grad_norm": 8.536104202270508,
"learning_rate": 3.5582793883222923e-06,
"loss": 1.0296,
"step": 3720
},
{
"epoch": 0.24654636790270343,
"grad_norm": 7.4764227867126465,
"learning_rate": 3.5059942571511037e-06,
"loss": 1.0728,
"step": 3730
},
{
"epoch": 0.2472073501222817,
"grad_norm": 9.194038391113281,
"learning_rate": 3.4540245882704213e-06,
"loss": 1.1157,
"step": 3740
},
{
"epoch": 0.24786833234186,
"grad_norm": 10.502184867858887,
"learning_rate": 3.4023725179497848e-06,
"loss": 1.1923,
"step": 3750
},
{
"epoch": 0.24786833234186,
"eval_loss": 1.2212793827056885,
"eval_runtime": 53.4315,
"eval_samples_per_second": 9.376,
"eval_steps_per_second": 9.376,
"step": 3750
},
{
"epoch": 0.24852931456143829,
"grad_norm": 7.8659234046936035,
"learning_rate": 3.351040169403499e-06,
"loss": 1.0991,
"step": 3760
},
{
"epoch": 0.2491902967810166,
"grad_norm": 8.55827808380127,
"learning_rate": 3.30002965270335e-06,
"loss": 1.0168,
"step": 3770
},
{
"epoch": 0.2498512790005949,
"grad_norm": 10.08139705657959,
"learning_rate": 3.2493430646918865e-06,
"loss": 1.188,
"step": 3780
},
{
"epoch": 0.25051226122017317,
"grad_norm": 7.772961139678955,
"learning_rate": 3.1989824888962225e-06,
"loss": 1.1373,
"step": 3790
},
{
"epoch": 0.25117324343975145,
"grad_norm": 7.485221862792969,
"learning_rate": 3.1489499954423797e-06,
"loss": 1.2637,
"step": 3800
},
{
"epoch": 0.2518342256593298,
"grad_norm": 14.595245361328125,
"learning_rate": 3.0992476409701936e-06,
"loss": 1.1433,
"step": 3810
},
{
"epoch": 0.25249520787890806,
"grad_norm": 11.104635238647461,
"learning_rate": 3.0498774685487882e-06,
"loss": 1.1773,
"step": 3820
},
{
"epoch": 0.25315619009848633,
"grad_norm": 6.462589263916016,
"learning_rate": 3.000841507592583e-06,
"loss": 1.0087,
"step": 3830
},
{
"epoch": 0.25381717231806467,
"grad_norm": 12.072765350341797,
"learning_rate": 2.9521417737778717e-06,
"loss": 1.0804,
"step": 3840
},
{
"epoch": 0.25447815453764294,
"grad_norm": 11.500109672546387,
"learning_rate": 2.9037802689599704e-06,
"loss": 1.1597,
"step": 3850
},
{
"epoch": 0.2551391367572212,
"grad_norm": 8.149591445922852,
"learning_rate": 2.855758981090918e-06,
"loss": 1.2028,
"step": 3860
},
{
"epoch": 0.25580011897679955,
"grad_norm": 11.354681015014648,
"learning_rate": 2.8080798841377743e-06,
"loss": 1.1725,
"step": 3870
},
{
"epoch": 0.2564611011963778,
"grad_norm": 9.085524559020996,
"learning_rate": 2.7607449380014703e-06,
"loss": 1.2511,
"step": 3880
},
{
"epoch": 0.2571220834159561,
"grad_norm": 10.283825874328613,
"learning_rate": 2.713756088436244e-06,
"loss": 1.1444,
"step": 3890
},
{
"epoch": 0.2577830656355344,
"grad_norm": 11.607617378234863,
"learning_rate": 2.6671152669696515e-06,
"loss": 1.1419,
"step": 3900
},
{
"epoch": 0.2577830656355344,
"eval_loss": 1.2201364040374756,
"eval_runtime": 55.3983,
"eval_samples_per_second": 9.044,
"eval_steps_per_second": 9.044,
"step": 3900
},
{
"epoch": 0.2584440478551127,
"grad_norm": 7.006284713745117,
"learning_rate": 2.6208243908231916e-06,
"loss": 1.0414,
"step": 3910
},
{
"epoch": 0.259105030074691,
"grad_norm": 10.41873550415039,
"learning_rate": 2.57488536283347e-06,
"loss": 1.1597,
"step": 3920
},
{
"epoch": 0.25976601229426927,
"grad_norm": 9.293778419494629,
"learning_rate": 2.5293000713739977e-06,
"loss": 1.182,
"step": 3930
},
{
"epoch": 0.2604269945138476,
"grad_norm": 11.898356437683105,
"learning_rate": 2.4840703902775642e-06,
"loss": 1.2502,
"step": 3940
},
{
"epoch": 0.2610879767334259,
"grad_norm": 9.323407173156738,
"learning_rate": 2.4391981787592005e-06,
"loss": 1.0892,
"step": 3950
},
{
"epoch": 0.26174895895300415,
"grad_norm": 11.664414405822754,
"learning_rate": 2.3946852813397737e-06,
"loss": 1.1837,
"step": 3960
},
{
"epoch": 0.2624099411725825,
"grad_norm": 11.392061233520508,
"learning_rate": 2.3505335277701494e-06,
"loss": 1.0029,
"step": 3970
},
{
"epoch": 0.26307092339216076,
"grad_norm": 10.388303756713867,
"learning_rate": 2.306744732955991e-06,
"loss": 1.172,
"step": 3980
},
{
"epoch": 0.26373190561173904,
"grad_norm": 11.332767486572266,
"learning_rate": 2.2633206968831374e-06,
"loss": 1.1951,
"step": 3990
},
{
"epoch": 0.2643928878313173,
"grad_norm": 4.8323259353637695,
"learning_rate": 2.220263204543635e-06,
"loss": 1.0181,
"step": 4000
},
{
"epoch": 0.26505387005089565,
"grad_norm": 11.138567924499512,
"learning_rate": 2.1775740258623492e-06,
"loss": 1.1295,
"step": 4010
},
{
"epoch": 0.2657148522704739,
"grad_norm": 7.644820690155029,
"learning_rate": 2.1352549156242126e-06,
"loss": 1.1392,
"step": 4020
},
{
"epoch": 0.2663758344900522,
"grad_norm": 11.998611450195312,
"learning_rate": 2.0933076134020958e-06,
"loss": 1.1516,
"step": 4030
},
{
"epoch": 0.26703681670963053,
"grad_norm": 9.40128231048584,
"learning_rate": 2.0517338434852946e-06,
"loss": 1.1157,
"step": 4040
},
{
"epoch": 0.2676977989292088,
"grad_norm": 7.291782379150391,
"learning_rate": 2.010535314808659e-06,
"loss": 1.1069,
"step": 4050
},
{
"epoch": 0.2676977989292088,
"eval_loss": 1.2179657220840454,
"eval_runtime": 53.213,
"eval_samples_per_second": 9.415,
"eval_steps_per_second": 9.415,
"step": 4050
},
{
"epoch": 0.2683587811487871,
"grad_norm": 11.658596992492676,
"learning_rate": 1.9697137208823396e-06,
"loss": 1.172,
"step": 4060
},
{
"epoch": 0.2690197633683654,
"grad_norm": 5.082404613494873,
"learning_rate": 1.9292707397221775e-06,
"loss": 1.1331,
"step": 4070
},
{
"epoch": 0.2696807455879437,
"grad_norm": 13.126559257507324,
"learning_rate": 1.8892080337807171e-06,
"loss": 1.1899,
"step": 4080
},
{
"epoch": 0.27034172780752197,
"grad_norm": 11.264731407165527,
"learning_rate": 1.8495272498788887e-06,
"loss": 1.0929,
"step": 4090
},
{
"epoch": 0.27100271002710025,
"grad_norm": 12.232498168945312,
"learning_rate": 1.8102300191383008e-06,
"loss": 1.1517,
"step": 4100
},
{
"epoch": 0.2716636922466786,
"grad_norm": 6.517210483551025,
"learning_rate": 1.7713179569141897e-06,
"loss": 1.1451,
"step": 4110
},
{
"epoch": 0.27232467446625686,
"grad_norm": 10.073516845703125,
"learning_rate": 1.7327926627290298e-06,
"loss": 1.1757,
"step": 4120
},
{
"epoch": 0.27298565668583513,
"grad_norm": 10.904183387756348,
"learning_rate": 1.6946557202067662e-06,
"loss": 1.201,
"step": 4130
},
{
"epoch": 0.27364663890541346,
"grad_norm": 9.502151489257812,
"learning_rate": 1.6569086970077352e-06,
"loss": 1.1649,
"step": 4140
},
{
"epoch": 0.27430762112499174,
"grad_norm": 12.71923542022705,
"learning_rate": 1.6195531447642177e-06,
"loss": 1.2048,
"step": 4150
},
{
"epoch": 0.27496860334457,
"grad_norm": 13.27767562866211,
"learning_rate": 1.582590599016653e-06,
"loss": 1.0894,
"step": 4160
},
{
"epoch": 0.27562958556414835,
"grad_norm": 12.859643936157227,
"learning_rate": 1.5460225791505258e-06,
"loss": 1.1565,
"step": 4170
},
{
"epoch": 0.2762905677837266,
"grad_norm": 6.589792728424072,
"learning_rate": 1.509850588333905e-06,
"loss": 1.0296,
"step": 4180
},
{
"epoch": 0.2769515500033049,
"grad_norm": 13.752243995666504,
"learning_rate": 1.4740761134556557e-06,
"loss": 1.312,
"step": 4190
},
{
"epoch": 0.2776125322228832,
"grad_norm": 12.691303253173828,
"learning_rate": 1.4387006250643236e-06,
"loss": 1.1494,
"step": 4200
},
{
"epoch": 0.2776125322228832,
"eval_loss": 1.2168010473251343,
"eval_runtime": 51.4283,
"eval_samples_per_second": 9.742,
"eval_steps_per_second": 9.742,
"step": 4200
},
{
"epoch": 0.2782735144424615,
"grad_norm": 11.23477840423584,
"learning_rate": 1.4037255773076804e-06,
"loss": 1.0421,
"step": 4210
},
{
"epoch": 0.2789344966620398,
"grad_norm": 10.921051979064941,
"learning_rate": 1.3691524078729481e-06,
"loss": 1.055,
"step": 4220
},
{
"epoch": 0.27959547888161806,
"grad_norm": 7.342863082885742,
"learning_rate": 1.3349825379277099e-06,
"loss": 1.2973,
"step": 4230
},
{
"epoch": 0.2802564611011964,
"grad_norm": 11.837105751037598,
"learning_rate": 1.3012173720614862e-06,
"loss": 1.2177,
"step": 4240
},
{
"epoch": 0.2809174433207747,
"grad_norm": 13.415239334106445,
"learning_rate": 1.267858298227995e-06,
"loss": 1.1455,
"step": 4250
},
{
"epoch": 0.28157842554035295,
"grad_norm": 11.301210403442383,
"learning_rate": 1.2349066876881063e-06,
"loss": 1.1602,
"step": 4260
},
{
"epoch": 0.2822394077599313,
"grad_norm": 5.907723903656006,
"learning_rate": 1.202363894953462e-06,
"loss": 1.1053,
"step": 4270
},
{
"epoch": 0.28290038997950956,
"grad_norm": 12.926289558410645,
"learning_rate": 1.1702312577308133e-06,
"loss": 1.2056,
"step": 4280
},
{
"epoch": 0.28356137219908784,
"grad_norm": 10.026867866516113,
"learning_rate": 1.1385100968670189e-06,
"loss": 1.1685,
"step": 4290
},
{
"epoch": 0.2842223544186661,
"grad_norm": 12.193798065185547,
"learning_rate": 1.107201716294762e-06,
"loss": 1.1253,
"step": 4300
},
{
"epoch": 0.28488333663824444,
"grad_norm": 6.5807294845581055,
"learning_rate": 1.076307402978938e-06,
"loss": 1.1252,
"step": 4310
},
{
"epoch": 0.2855443188578227,
"grad_norm": 11.568461418151855,
"learning_rate": 1.0458284268637652e-06,
"loss": 1.2131,
"step": 4320
},
{
"epoch": 0.286205301077401,
"grad_norm": 5.46840238571167,
"learning_rate": 1.0157660408205728e-06,
"loss": 1.0678,
"step": 4330
},
{
"epoch": 0.28686628329697933,
"grad_norm": 13.20085334777832,
"learning_rate": 9.861214805963042e-07,
"loss": 1.1974,
"step": 4340
},
{
"epoch": 0.2875272655165576,
"grad_norm": 13.585931777954102,
"learning_rate": 9.568959647627223e-07,
"loss": 1.1664,
"step": 4350
},
{
"epoch": 0.2875272655165576,
"eval_loss": 1.21638822555542,
"eval_runtime": 51.7738,
"eval_samples_per_second": 9.677,
"eval_steps_per_second": 9.677,
"step": 4350
},
{
"epoch": 0.2881882477361359,
"grad_norm": 7.628300189971924,
"learning_rate": 9.280906946663111e-07,
"loss": 1.0584,
"step": 4360
},
{
"epoch": 0.2888492299557142,
"grad_norm": 8.380716323852539,
"learning_rate": 8.997068543789051e-07,
"loss": 1.1137,
"step": 4370
},
{
"epoch": 0.2895102121752925,
"grad_norm": 12.071667671203613,
"learning_rate": 8.717456106490042e-07,
"loss": 1.0887,
"step": 4380
},
{
"epoch": 0.29017119439487077,
"grad_norm": 6.33940315246582,
"learning_rate": 8.442081128538243e-07,
"loss": 1.0145,
"step": 4390
},
{
"epoch": 0.29083217661444905,
"grad_norm": 9.972112655639648,
"learning_rate": 8.170954929520389e-07,
"loss": 1.1362,
"step": 4400
},
{
"epoch": 0.2914931588340274,
"grad_norm": 12.998346328735352,
"learning_rate": 7.904088654372622e-07,
"loss": 1.148,
"step": 4410
},
{
"epoch": 0.29215414105360565,
"grad_norm": 5.646799087524414,
"learning_rate": 7.641493272922243e-07,
"loss": 1.1281,
"step": 4420
},
{
"epoch": 0.29281512327318393,
"grad_norm": 10.702962875366211,
"learning_rate": 7.383179579436903e-07,
"loss": 1.1785,
"step": 4430
},
{
"epoch": 0.29347610549276226,
"grad_norm": 5.956870079040527,
"learning_rate": 7.129158192180766e-07,
"loss": 1.1568,
"step": 4440
},
{
"epoch": 0.29413708771234054,
"grad_norm": 11.048665046691895,
"learning_rate": 6.879439552978142e-07,
"loss": 1.0652,
"step": 4450
},
{
"epoch": 0.2947980699319188,
"grad_norm": 5.649775505065918,
"learning_rate": 6.634033926784221e-07,
"loss": 1.1235,
"step": 4460
},
{
"epoch": 0.29545905215149715,
"grad_norm": 11.055773735046387,
"learning_rate": 6.392951401263069e-07,
"loss": 1.285,
"step": 4470
},
{
"epoch": 0.2961200343710754,
"grad_norm": 7.027043342590332,
"learning_rate": 6.156201886373113e-07,
"loss": 1.209,
"step": 4480
},
{
"epoch": 0.2967810165906537,
"grad_norm": 11.43958854675293,
"learning_rate": 5.923795113959569e-07,
"loss": 1.2139,
"step": 4490
},
{
"epoch": 0.297441998810232,
"grad_norm": 11.668280601501465,
"learning_rate": 5.695740637354591e-07,
"loss": 1.2407,
"step": 4500
},
{
"epoch": 0.297441998810232,
"eval_loss": 1.2155283689498901,
"eval_runtime": 48.0067,
"eval_samples_per_second": 10.436,
"eval_steps_per_second": 10.436,
"step": 4500
},
{
"epoch": 0.2981029810298103,
"grad_norm": 10.411969184875488,
"learning_rate": 5.472047830984499e-07,
"loss": 1.1499,
"step": 4510
},
{
"epoch": 0.2987639632493886,
"grad_norm": 6.937885761260986,
"learning_rate": 5.252725889984403e-07,
"loss": 1.0297,
"step": 4520
},
{
"epoch": 0.29942494546896686,
"grad_norm": 10.743237495422363,
"learning_rate": 5.037783829820298e-07,
"loss": 1.1198,
"step": 4530
},
{
"epoch": 0.3000859276885452,
"grad_norm": 5.665622234344482,
"learning_rate": 4.827230485918372e-07,
"loss": 1.0459,
"step": 4540
},
{
"epoch": 0.30074690990812347,
"grad_norm": 9.720799446105957,
"learning_rate": 4.6210745133019236e-07,
"loss": 1.1943,
"step": 4550
},
{
"epoch": 0.30140789212770175,
"grad_norm": 11.57904052734375,
"learning_rate": 4.419324386235529e-07,
"loss": 1.2007,
"step": 4560
},
{
"epoch": 0.3020688743472801,
"grad_norm": 10.47191333770752,
"learning_rate": 4.2219883978767386e-07,
"loss": 1.1754,
"step": 4570
},
{
"epoch": 0.30272985656685836,
"grad_norm": 8.371639251708984,
"learning_rate": 4.029074659935082e-07,
"loss": 1.0829,
"step": 4580
},
{
"epoch": 0.30339083878643663,
"grad_norm": 11.640840530395508,
"learning_rate": 3.8405911023387444e-07,
"loss": 1.0573,
"step": 4590
},
{
"epoch": 0.3040518210060149,
"grad_norm": 14.082575798034668,
"learning_rate": 3.6565454729085526e-07,
"loss": 1.2711,
"step": 4600
},
{
"epoch": 0.30471280322559324,
"grad_norm": 8.940695762634277,
"learning_rate": 3.4769453370394753e-07,
"loss": 1.1595,
"step": 4610
},
{
"epoch": 0.3053737854451715,
"grad_norm": 7.7234954833984375,
"learning_rate": 3.301798077389637e-07,
"loss": 1.2151,
"step": 4620
},
{
"epoch": 0.3060347676647498,
"grad_norm": 4.756081581115723,
"learning_rate": 3.1311108935768926e-07,
"loss": 1.173,
"step": 4630
},
{
"epoch": 0.30669574988432813,
"grad_norm": 10.524628639221191,
"learning_rate": 2.964890801882817e-07,
"loss": 1.0992,
"step": 4640
},
{
"epoch": 0.3073567321039064,
"grad_norm": 6.618716716766357,
"learning_rate": 2.8031446349643393e-07,
"loss": 1.1152,
"step": 4650
},
{
"epoch": 0.3073567321039064,
"eval_loss": 1.2152043581008911,
"eval_runtime": 53.4713,
"eval_samples_per_second": 9.37,
"eval_steps_per_second": 9.37,
"step": 4650
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2041141329494016e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}