{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4705.895522388059,
"eval_steps": 1000.0,
"global_step": 80000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 11.776119402985074,
"grad_norm": 2824.542724609375,
"learning_rate": 6.333333333333334e-07,
"loss": 4464.7338,
"step": 200
},
{
"epoch": 23.53731343283582,
"grad_norm": 4310.4130859375,
"learning_rate": 1.3e-06,
"loss": 4264.3419,
"step": 400
},
{
"epoch": 35.298507462686565,
"grad_norm": 4456.80908203125,
"learning_rate": 1.9666666666666668e-06,
"loss": 3192.1663,
"step": 600
},
{
"epoch": 47.059701492537314,
"grad_norm": 3549.643310546875,
"learning_rate": 2.6333333333333332e-06,
"loss": 1894.5492,
"step": 800
},
{
"epoch": 58.83582089552239,
"grad_norm": 2974.13720703125,
"learning_rate": 3.3000000000000006e-06,
"loss": 1464.722,
"step": 1000
},
{
"epoch": 70.59701492537313,
"grad_norm": 2222.13427734375,
"learning_rate": 3.966666666666667e-06,
"loss": 1245.5541,
"step": 1200
},
{
"epoch": 82.35820895522389,
"grad_norm": 1658.634033203125,
"learning_rate": 4.633333333333334e-06,
"loss": 1112.0441,
"step": 1400
},
{
"epoch": 94.11940298507463,
"grad_norm": 800.1384887695312,
"learning_rate": 5.300000000000001e-06,
"loss": 1024.3591,
"step": 1600
},
{
"epoch": 105.8955223880597,
"grad_norm": 472.2015686035156,
"learning_rate": 5.966666666666667e-06,
"loss": 959.4488,
"step": 1800
},
{
"epoch": 117.65671641791045,
"grad_norm": 395.40087890625,
"learning_rate": 6.633333333333334e-06,
"loss": 921.3352,
"step": 2000
},
{
"epoch": 129.4179104477612,
"grad_norm": 214.59837341308594,
"learning_rate": 7.3e-06,
"loss": 892.8059,
"step": 2200
},
{
"epoch": 141.17910447761193,
"grad_norm": 69.00880432128906,
"learning_rate": 7.966666666666668e-06,
"loss": 879.1008,
"step": 2400
},
{
"epoch": 152.955223880597,
"grad_norm": 168.0197296142578,
"learning_rate": 8.633333333333334e-06,
"loss": 871.8709,
"step": 2600
},
{
"epoch": 164.71641791044777,
"grad_norm": 300.1562194824219,
"learning_rate": 9.3e-06,
"loss": 865.7897,
"step": 2800
},
{
"epoch": 176.47761194029852,
"grad_norm": 96.16655731201172,
"learning_rate": 9.966666666666667e-06,
"loss": 858.404,
"step": 3000
},
{
"epoch": 188.23880597014926,
"grad_norm": 158.73681640625,
"learning_rate": 9.975324675324676e-06,
"loss": 829.5743,
"step": 3200
},
{
"epoch": 200.0,
"grad_norm": 175.4840087890625,
"learning_rate": 9.94935064935065e-06,
"loss": 763.1604,
"step": 3400
},
{
"epoch": 211.77611940298507,
"grad_norm": 258.41546630859375,
"learning_rate": 9.923376623376623e-06,
"loss": 667.2331,
"step": 3600
},
{
"epoch": 223.53731343283582,
"grad_norm": 301.9793701171875,
"learning_rate": 9.897402597402598e-06,
"loss": 580.1272,
"step": 3800
},
{
"epoch": 235.29850746268656,
"grad_norm": 365.6127014160156,
"learning_rate": 9.871428571428572e-06,
"loss": 509.478,
"step": 4000
},
{
"epoch": 247.0597014925373,
"grad_norm": 580.1858520507812,
"learning_rate": 9.845454545454546e-06,
"loss": 456.7661,
"step": 4200
},
{
"epoch": 258.8358208955224,
"grad_norm": 405.80657958984375,
"learning_rate": 9.81948051948052e-06,
"loss": 413.5111,
"step": 4400
},
{
"epoch": 270.5970149253731,
"grad_norm": 558.102783203125,
"learning_rate": 9.793506493506494e-06,
"loss": 381.9194,
"step": 4600
},
{
"epoch": 282.35820895522386,
"grad_norm": 537.9163208007812,
"learning_rate": 9.767532467532468e-06,
"loss": 352.0065,
"step": 4800
},
{
"epoch": 294.1194029850746,
"grad_norm": 477.7449951171875,
"learning_rate": 9.741558441558442e-06,
"loss": 318.2346,
"step": 5000
},
{
"epoch": 305.8955223880597,
"grad_norm": 515.5437622070312,
"learning_rate": 9.715584415584415e-06,
"loss": 295.5646,
"step": 5200
},
{
"epoch": 317.65671641791045,
"grad_norm": 390.6947021484375,
"learning_rate": 9.68961038961039e-06,
"loss": 274.0082,
"step": 5400
},
{
"epoch": 329.4179104477612,
"grad_norm": 398.3788146972656,
"learning_rate": 9.663636363636364e-06,
"loss": 256.2253,
"step": 5600
},
{
"epoch": 341.17910447761193,
"grad_norm": 522.6358642578125,
"learning_rate": 9.637662337662338e-06,
"loss": 241.4042,
"step": 5800
},
{
"epoch": 352.95522388059703,
"grad_norm": 464.41546630859375,
"learning_rate": 9.611688311688311e-06,
"loss": 225.5165,
"step": 6000
},
{
"epoch": 364.7164179104478,
"grad_norm": 859.1548461914062,
"learning_rate": 9.585714285714286e-06,
"loss": 208.6093,
"step": 6200
},
{
"epoch": 376.4776119402985,
"grad_norm": 590.9322509765625,
"learning_rate": 9.55974025974026e-06,
"loss": 197.202,
"step": 6400
},
{
"epoch": 388.23880597014926,
"grad_norm": 498.85870361328125,
"learning_rate": 9.533766233766234e-06,
"loss": 182.2606,
"step": 6600
},
{
"epoch": 400.0,
"grad_norm": 310.8315124511719,
"learning_rate": 9.507792207792209e-06,
"loss": 177.0946,
"step": 6800
},
{
"epoch": 411.7761194029851,
"grad_norm": 503.26812744140625,
"learning_rate": 9.481818181818182e-06,
"loss": 167.5973,
"step": 7000
},
{
"epoch": 423.53731343283584,
"grad_norm": 564.5276489257812,
"learning_rate": 9.455844155844158e-06,
"loss": 157.7437,
"step": 7200
},
{
"epoch": 435.2985074626866,
"grad_norm": 325.49566650390625,
"learning_rate": 9.429870129870131e-06,
"loss": 146.1547,
"step": 7400
},
{
"epoch": 447.05970149253733,
"grad_norm": 446.79388427734375,
"learning_rate": 9.403896103896105e-06,
"loss": 138.5613,
"step": 7600
},
{
"epoch": 458.8358208955224,
"grad_norm": 582.2409057617188,
"learning_rate": 9.37792207792208e-06,
"loss": 133.6703,
"step": 7800
},
{
"epoch": 470.5970149253731,
"grad_norm": 503.8074951171875,
"learning_rate": 9.351948051948054e-06,
"loss": 125.3042,
"step": 8000
},
{
"epoch": 482.35820895522386,
"grad_norm": 523.983642578125,
"learning_rate": 9.326103896103897e-06,
"loss": 119.7444,
"step": 8200
},
{
"epoch": 494.1194029850746,
"grad_norm": 797.6224975585938,
"learning_rate": 9.30012987012987e-06,
"loss": 112.4131,
"step": 8400
},
{
"epoch": 505.8955223880597,
"grad_norm": 491.4894714355469,
"learning_rate": 9.274155844155846e-06,
"loss": 112.5327,
"step": 8600
},
{
"epoch": 517.6567164179105,
"grad_norm": 503.34698486328125,
"learning_rate": 9.24818181818182e-06,
"loss": 101.2935,
"step": 8800
},
{
"epoch": 529.4179104477612,
"grad_norm": 418.223876953125,
"learning_rate": 9.222207792207793e-06,
"loss": 97.0487,
"step": 9000
},
{
"epoch": 541.179104477612,
"grad_norm": 475.534423828125,
"learning_rate": 9.196233766233767e-06,
"loss": 95.3661,
"step": 9200
},
{
"epoch": 552.955223880597,
"grad_norm": 406.5447082519531,
"learning_rate": 9.170259740259742e-06,
"loss": 91.4412,
"step": 9400
},
{
"epoch": 564.7164179104477,
"grad_norm": 394.3955078125,
"learning_rate": 9.144285714285716e-06,
"loss": 87.2209,
"step": 9600
},
{
"epoch": 576.4776119402985,
"grad_norm": 426.0124206542969,
"learning_rate": 9.118311688311689e-06,
"loss": 81.9725,
"step": 9800
},
{
"epoch": 588.2388059701492,
"grad_norm": 499.22869873046875,
"learning_rate": 9.092337662337664e-06,
"loss": 82.7026,
"step": 10000
},
{
"epoch": 600.0,
"grad_norm": 378.9816589355469,
"learning_rate": 9.066493506493508e-06,
"loss": 78.9563,
"step": 10200
},
{
"epoch": 611.776119402985,
"grad_norm": 440.2572021484375,
"learning_rate": 9.040519480519482e-06,
"loss": 74.7577,
"step": 10400
},
{
"epoch": 623.5373134328358,
"grad_norm": 492.1644592285156,
"learning_rate": 9.014545454545455e-06,
"loss": 72.7476,
"step": 10600
},
{
"epoch": 635.2985074626865,
"grad_norm": 412.267822265625,
"learning_rate": 8.98857142857143e-06,
"loss": 70.3234,
"step": 10800
},
{
"epoch": 647.0597014925373,
"grad_norm": 439.10736083984375,
"learning_rate": 8.962597402597404e-06,
"loss": 69.0891,
"step": 11000
},
{
"epoch": 658.8358208955224,
"grad_norm": 349.216064453125,
"learning_rate": 8.936623376623378e-06,
"loss": 63.9953,
"step": 11200
},
{
"epoch": 670.5970149253732,
"grad_norm": 414.66741943359375,
"learning_rate": 8.910649350649351e-06,
"loss": 65.5338,
"step": 11400
},
{
"epoch": 682.3582089552239,
"grad_norm": 379.8702392578125,
"learning_rate": 8.884675324675326e-06,
"loss": 62.1837,
"step": 11600
},
{
"epoch": 694.1194029850747,
"grad_norm": 338.27685546875,
"learning_rate": 8.858831168831168e-06,
"loss": 61.4667,
"step": 11800
},
{
"epoch": 705.8955223880597,
"grad_norm": 375.4732360839844,
"learning_rate": 8.832857142857143e-06,
"loss": 59.5297,
"step": 12000
},
{
"epoch": 717.6567164179105,
"grad_norm": 441.6203308105469,
"learning_rate": 8.806883116883119e-06,
"loss": 55.1731,
"step": 12200
},
{
"epoch": 729.4179104477612,
"grad_norm": 393.1416320800781,
"learning_rate": 8.780909090909092e-06,
"loss": 57.3119,
"step": 12400
},
{
"epoch": 741.179104477612,
"grad_norm": 281.7967224121094,
"learning_rate": 8.754935064935066e-06,
"loss": 53.6714,
"step": 12600
},
{
"epoch": 752.955223880597,
"grad_norm": 441.4906921386719,
"learning_rate": 8.72896103896104e-06,
"loss": 52.6136,
"step": 12800
},
{
"epoch": 764.7164179104477,
"grad_norm": 514.225830078125,
"learning_rate": 8.702987012987015e-06,
"loss": 51.2136,
"step": 13000
},
{
"epoch": 776.4776119402985,
"grad_norm": 358.94439697265625,
"learning_rate": 8.677012987012988e-06,
"loss": 50.1962,
"step": 13200
},
{
"epoch": 788.2388059701492,
"grad_norm": 403.89691162109375,
"learning_rate": 8.651038961038962e-06,
"loss": 47.0859,
"step": 13400
},
{
"epoch": 800.0,
"grad_norm": 324.1061096191406,
"learning_rate": 8.625064935064935e-06,
"loss": 46.5822,
"step": 13600
},
{
"epoch": 811.776119402985,
"grad_norm": 297.4976501464844,
"learning_rate": 8.59909090909091e-06,
"loss": 47.6056,
"step": 13800
},
{
"epoch": 823.5373134328358,
"grad_norm": 350.8284912109375,
"learning_rate": 8.573116883116884e-06,
"loss": 42.6472,
"step": 14000
},
{
"epoch": 835.2985074626865,
"grad_norm": 291.62103271484375,
"learning_rate": 8.547142857142858e-06,
"loss": 43.605,
"step": 14200
},
{
"epoch": 847.0597014925373,
"grad_norm": 450.9121398925781,
"learning_rate": 8.521168831168833e-06,
"loss": 42.5453,
"step": 14400
},
{
"epoch": 858.8358208955224,
"grad_norm": 312.8192138671875,
"learning_rate": 8.495324675324677e-06,
"loss": 41.1642,
"step": 14600
},
{
"epoch": 870.5970149253732,
"grad_norm": 346.3138427734375,
"learning_rate": 8.46935064935065e-06,
"loss": 39.1892,
"step": 14800
},
{
"epoch": 882.3582089552239,
"grad_norm": 365.8443908691406,
"learning_rate": 8.443376623376624e-06,
"loss": 43.7121,
"step": 15000
},
{
"epoch": 894.1194029850747,
"grad_norm": 335.4299011230469,
"learning_rate": 8.417402597402599e-06,
"loss": 38.9868,
"step": 15200
},
{
"epoch": 905.8955223880597,
"grad_norm": 406.51416015625,
"learning_rate": 8.391428571428573e-06,
"loss": 38.384,
"step": 15400
},
{
"epoch": 917.6567164179105,
"grad_norm": 242.07505798339844,
"learning_rate": 8.365454545454546e-06,
"loss": 35.6221,
"step": 15600
},
{
"epoch": 929.4179104477612,
"grad_norm": 450.2956237792969,
"learning_rate": 8.33948051948052e-06,
"loss": 38.2171,
"step": 15800
},
{
"epoch": 941.179104477612,
"grad_norm": 367.16973876953125,
"learning_rate": 8.313636363636365e-06,
"loss": 35.7045,
"step": 16000
},
{
"epoch": 952.955223880597,
"grad_norm": 263.6134948730469,
"learning_rate": 8.287662337662339e-06,
"loss": 34.7425,
"step": 16200
},
{
"epoch": 964.7164179104477,
"grad_norm": 249.75967407226562,
"learning_rate": 8.261688311688312e-06,
"loss": 32.8498,
"step": 16400
},
{
"epoch": 976.4776119402985,
"grad_norm": 447.4524841308594,
"learning_rate": 8.235714285714287e-06,
"loss": 32.5018,
"step": 16600
},
{
"epoch": 988.2388059701492,
"grad_norm": 375.5469055175781,
"learning_rate": 8.209740259740261e-06,
"loss": 33.4849,
"step": 16800
},
{
"epoch": 1000.0,
"grad_norm": 235.9437713623047,
"learning_rate": 8.183896103896105e-06,
"loss": 34.2183,
"step": 17000
},
{
"epoch": 1011.776119402985,
"grad_norm": 388.0195617675781,
"learning_rate": 8.15792207792208e-06,
"loss": 31.4981,
"step": 17200
},
{
"epoch": 1023.5373134328358,
"grad_norm": 252.05894470214844,
"learning_rate": 8.131948051948053e-06,
"loss": 31.8987,
"step": 17400
},
{
"epoch": 1035.2985074626865,
"grad_norm": 244.4244842529297,
"learning_rate": 8.105974025974027e-06,
"loss": 33.8581,
"step": 17600
},
{
"epoch": 1047.0597014925372,
"grad_norm": 397.1546936035156,
"learning_rate": 8.08e-06,
"loss": 30.2649,
"step": 17800
},
{
"epoch": 1058.8358208955224,
"grad_norm": 338.2776184082031,
"learning_rate": 8.054025974025976e-06,
"loss": 30.8272,
"step": 18000
},
{
"epoch": 1070.597014925373,
"grad_norm": 653.6060791015625,
"learning_rate": 8.02805194805195e-06,
"loss": 28.4248,
"step": 18200
},
{
"epoch": 1082.358208955224,
"grad_norm": 317.8432922363281,
"learning_rate": 8.002077922077923e-06,
"loss": 29.007,
"step": 18400
},
{
"epoch": 1094.1194029850747,
"grad_norm": 362.0513916015625,
"learning_rate": 7.976103896103897e-06,
"loss": 29.2207,
"step": 18600
},
{
"epoch": 1105.8955223880596,
"grad_norm": 377.53857421875,
"learning_rate": 7.950129870129872e-06,
"loss": 27.3833,
"step": 18800
},
{
"epoch": 1117.6567164179105,
"grad_norm": 298.9675598144531,
"learning_rate": 7.924155844155845e-06,
"loss": 29.8298,
"step": 19000
},
{
"epoch": 1129.4179104477612,
"grad_norm": 333.05206298828125,
"learning_rate": 7.898181818181819e-06,
"loss": 27.2648,
"step": 19200
},
{
"epoch": 1141.1791044776119,
"grad_norm": 227.57530212402344,
"learning_rate": 7.872207792207793e-06,
"loss": 25.75,
"step": 19400
},
{
"epoch": 1152.955223880597,
"grad_norm": 345.0716552734375,
"learning_rate": 7.846233766233768e-06,
"loss": 25.266,
"step": 19600
},
{
"epoch": 1164.7164179104477,
"grad_norm": 302.9230651855469,
"learning_rate": 7.820259740259741e-06,
"loss": 25.8148,
"step": 19800
},
{
"epoch": 1176.4776119402984,
"grad_norm": 346.07598876953125,
"learning_rate": 7.794285714285715e-06,
"loss": 25.3416,
"step": 20000
},
{
"epoch": 1188.2388059701493,
"grad_norm": 354.4360046386719,
"learning_rate": 7.76831168831169e-06,
"loss": 24.6638,
"step": 20200
},
{
"epoch": 1200.0,
"grad_norm": 368.4764709472656,
"learning_rate": 7.742337662337664e-06,
"loss": 24.0062,
"step": 20400
},
{
"epoch": 1211.7761194029852,
"grad_norm": 248.7994384765625,
"learning_rate": 7.716363636363637e-06,
"loss": 23.8738,
"step": 20600
},
{
"epoch": 1223.5373134328358,
"grad_norm": 283.0020751953125,
"learning_rate": 7.690389610389611e-06,
"loss": 24.369,
"step": 20800
},
{
"epoch": 1235.2985074626865,
"grad_norm": 490.0590515136719,
"learning_rate": 7.664415584415586e-06,
"loss": 22.5971,
"step": 21000
},
{
"epoch": 1247.0597014925372,
"grad_norm": 338.49102783203125,
"learning_rate": 7.63844155844156e-06,
"loss": 23.1806,
"step": 21200
},
{
"epoch": 1258.8358208955224,
"grad_norm": 201.65057373046875,
"learning_rate": 7.612467532467533e-06,
"loss": 21.7898,
"step": 21400
},
{
"epoch": 1270.597014925373,
"grad_norm": 176.30361938476562,
"learning_rate": 7.586493506493508e-06,
"loss": 22.0611,
"step": 21600
},
{
"epoch": 1282.358208955224,
"grad_norm": 145.49542236328125,
"learning_rate": 7.560519480519481e-06,
"loss": 21.875,
"step": 21800
},
{
"epoch": 1294.1194029850747,
"grad_norm": 271.5858154296875,
"learning_rate": 7.534545454545456e-06,
"loss": 21.199,
"step": 22000
},
{
"epoch": 1305.8955223880596,
"grad_norm": 682.5701904296875,
"learning_rate": 7.508571428571429e-06,
"loss": 22.5543,
"step": 22200
},
{
"epoch": 1317.6567164179105,
"grad_norm": 629.8733520507812,
"learning_rate": 7.482597402597404e-06,
"loss": 21.7533,
"step": 22400
},
{
"epoch": 1329.4179104477612,
"grad_norm": 447.5409240722656,
"learning_rate": 7.456623376623377e-06,
"loss": 20.4621,
"step": 22600
},
{
"epoch": 1341.1791044776119,
"grad_norm": 190.6186065673828,
"learning_rate": 7.430649350649352e-06,
"loss": 20.3926,
"step": 22800
},
{
"epoch": 1352.955223880597,
"grad_norm": 480.3921813964844,
"learning_rate": 7.404675324675325e-06,
"loss": 21.3989,
"step": 23000
},
{
"epoch": 1364.7164179104477,
"grad_norm": 432.7933044433594,
"learning_rate": 7.3787012987013e-06,
"loss": 20.2977,
"step": 23200
},
{
"epoch": 1376.4776119402984,
"grad_norm": 263.83331298828125,
"learning_rate": 7.352727272727273e-06,
"loss": 21.8715,
"step": 23400
},
{
"epoch": 1388.2388059701493,
"grad_norm": 225.55624389648438,
"learning_rate": 7.326753246753248e-06,
"loss": 19.0286,
"step": 23600
},
{
"epoch": 1400.0,
"grad_norm": 271.21759033203125,
"learning_rate": 7.300779220779221e-06,
"loss": 20.0349,
"step": 23800
},
{
"epoch": 1411.7761194029852,
"grad_norm": 273.9780578613281,
"learning_rate": 7.274805194805196e-06,
"loss": 17.947,
"step": 24000
},
{
"epoch": 1423.5373134328358,
"grad_norm": 464.6848449707031,
"learning_rate": 7.24883116883117e-06,
"loss": 19.3846,
"step": 24200
},
{
"epoch": 1435.2985074626865,
"grad_norm": 376.6822814941406,
"learning_rate": 7.222857142857144e-06,
"loss": 17.7826,
"step": 24400
},
{
"epoch": 1447.0597014925372,
"grad_norm": 236.9201202392578,
"learning_rate": 7.196883116883118e-06,
"loss": 18.7341,
"step": 24600
},
{
"epoch": 1458.8358208955224,
"grad_norm": 269.235595703125,
"learning_rate": 7.171038961038962e-06,
"loss": 16.5044,
"step": 24800
},
{
"epoch": 1470.597014925373,
"grad_norm": 224.2851104736328,
"learning_rate": 7.145064935064936e-06,
"loss": 17.9869,
"step": 25000
},
{
"epoch": 1482.358208955224,
"grad_norm": 424.98834228515625,
"learning_rate": 7.11909090909091e-06,
"loss": 17.7338,
"step": 25200
},
{
"epoch": 1494.1194029850747,
"grad_norm": 244.76785278320312,
"learning_rate": 7.093116883116884e-06,
"loss": 15.3534,
"step": 25400
},
{
"epoch": 1505.8955223880596,
"grad_norm": 89.87413787841797,
"learning_rate": 7.067142857142858e-06,
"loss": 17.2717,
"step": 25600
},
{
"epoch": 1517.6567164179105,
"grad_norm": 278.63165283203125,
"learning_rate": 7.041168831168832e-06,
"loss": 17.7088,
"step": 25800
},
{
"epoch": 1529.4179104477612,
"grad_norm": 257.7251281738281,
"learning_rate": 7.015194805194806e-06,
"loss": 16.9006,
"step": 26000
},
{
"epoch": 1541.1791044776119,
"grad_norm": 352.7020263671875,
"learning_rate": 6.98935064935065e-06,
"loss": 17.136,
"step": 26200
},
{
"epoch": 1552.955223880597,
"grad_norm": 124.77161407470703,
"learning_rate": 6.9633766233766244e-06,
"loss": 17.4147,
"step": 26400
},
{
"epoch": 1564.7164179104477,
"grad_norm": 321.99993896484375,
"learning_rate": 6.937402597402598e-06,
"loss": 15.1883,
"step": 26600
},
{
"epoch": 1576.4776119402984,
"grad_norm": 211.46424865722656,
"learning_rate": 6.911428571428572e-06,
"loss": 14.7355,
"step": 26800
},
{
"epoch": 1588.2388059701493,
"grad_norm": 147.95535278320312,
"learning_rate": 6.885454545454546e-06,
"loss": 15.0797,
"step": 27000
},
{
"epoch": 1600.0,
"grad_norm": 359.3253173828125,
"learning_rate": 6.85948051948052e-06,
"loss": 15.7509,
"step": 27200
},
{
"epoch": 1611.7761194029852,
"grad_norm": 360.4601135253906,
"learning_rate": 6.833506493506494e-06,
"loss": 16.636,
"step": 27400
},
{
"epoch": 1623.5373134328358,
"grad_norm": 217.2965545654297,
"learning_rate": 6.807532467532468e-06,
"loss": 15.6866,
"step": 27600
},
{
"epoch": 1635.2985074626865,
"grad_norm": 220.64784240722656,
"learning_rate": 6.781558441558442e-06,
"loss": 15.5503,
"step": 27800
},
{
"epoch": 1647.0597014925372,
"grad_norm": 258.07672119140625,
"learning_rate": 6.755584415584416e-06,
"loss": 14.64,
"step": 28000
},
{
"epoch": 1658.8358208955224,
"grad_norm": 255.86795043945312,
"learning_rate": 6.72961038961039e-06,
"loss": 14.2442,
"step": 28200
},
{
"epoch": 1670.597014925373,
"grad_norm": 183.0586395263672,
"learning_rate": 6.703636363636364e-06,
"loss": 14.277,
"step": 28400
},
{
"epoch": 1682.358208955224,
"grad_norm": 133.0940399169922,
"learning_rate": 6.677662337662339e-06,
"loss": 14.0652,
"step": 28600
},
{
"epoch": 1694.1194029850747,
"grad_norm": 292.6916198730469,
"learning_rate": 6.651688311688312e-06,
"loss": 13.645,
"step": 28800
},
{
"epoch": 1705.8955223880596,
"grad_norm": 305.353515625,
"learning_rate": 6.625714285714287e-06,
"loss": 14.4846,
"step": 29000
},
{
"epoch": 1717.6567164179105,
"grad_norm": 242.2213592529297,
"learning_rate": 6.59974025974026e-06,
"loss": 14.0155,
"step": 29200
},
{
"epoch": 1729.4179104477612,
"grad_norm": 309.6208801269531,
"learning_rate": 6.573766233766235e-06,
"loss": 13.813,
"step": 29400
},
{
"epoch": 1741.1791044776119,
"grad_norm": 295.8955078125,
"learning_rate": 6.547792207792208e-06,
"loss": 14.3597,
"step": 29600
},
{
"epoch": 1752.955223880597,
"grad_norm": 221.8771514892578,
"learning_rate": 6.521818181818183e-06,
"loss": 14.1165,
"step": 29800
},
{
"epoch": 1764.7164179104477,
"grad_norm": 276.3559875488281,
"learning_rate": 6.495844155844156e-06,
"loss": 12.6467,
"step": 30000
},
{
"epoch": 1776.4776119402984,
"grad_norm": 242.14724731445312,
"learning_rate": 6.469870129870131e-06,
"loss": 13.3587,
"step": 30200
},
{
"epoch": 1788.2388059701493,
"grad_norm": 170.22076416015625,
"learning_rate": 6.443896103896104e-06,
"loss": 13.2378,
"step": 30400
},
{
"epoch": 1800.0,
"grad_norm": 164.98995971679688,
"learning_rate": 6.417922077922079e-06,
"loss": 12.556,
"step": 30600
},
{
"epoch": 1811.7761194029852,
"grad_norm": 355.3738098144531,
"learning_rate": 6.391948051948052e-06,
"loss": 11.3157,
"step": 30800
},
{
"epoch": 1823.5373134328358,
"grad_norm": 220.3183135986328,
"learning_rate": 6.365974025974027e-06,
"loss": 13.424,
"step": 31000
},
{
"epoch": 1835.2985074626865,
"grad_norm": 261.7161865234375,
"learning_rate": 6.34e-06,
"loss": 11.4347,
"step": 31200
},
{
"epoch": 1847.0597014925372,
"grad_norm": 155.91165161132812,
"learning_rate": 6.314025974025975e-06,
"loss": 10.8361,
"step": 31400
},
{
"epoch": 1858.8358208955224,
"grad_norm": 93.3238754272461,
"learning_rate": 6.288051948051948e-06,
"loss": 12.3992,
"step": 31600
},
{
"epoch": 1870.597014925373,
"grad_norm": 249.3748016357422,
"learning_rate": 6.262077922077923e-06,
"loss": 11.965,
"step": 31800
},
{
"epoch": 1882.358208955224,
"grad_norm": 242.0140838623047,
"learning_rate": 6.236103896103896e-06,
"loss": 11.4908,
"step": 32000
},
{
"epoch": 1894.1194029850747,
"grad_norm": 463.1481018066406,
"learning_rate": 6.210129870129871e-06,
"loss": 12.7511,
"step": 32200
},
{
"epoch": 1905.8955223880596,
"grad_norm": 265.88739013671875,
"learning_rate": 6.184155844155845e-06,
"loss": 11.0652,
"step": 32400
},
{
"epoch": 1917.6567164179105,
"grad_norm": 167.36598205566406,
"learning_rate": 6.1584415584415595e-06,
"loss": 10.4137,
"step": 32600
},
{
"epoch": 1929.4179104477612,
"grad_norm": 173.92787170410156,
"learning_rate": 6.132467532467533e-06,
"loss": 10.7277,
"step": 32800
},
{
"epoch": 1941.1791044776119,
"grad_norm": 294.4163818359375,
"learning_rate": 6.1064935064935075e-06,
"loss": 11.8032,
"step": 33000
},
{
"epoch": 1952.955223880597,
"grad_norm": 243.70274353027344,
"learning_rate": 6.080519480519481e-06,
"loss": 10.3227,
"step": 33200
},
{
"epoch": 1964.7164179104477,
"grad_norm": 120.62760162353516,
"learning_rate": 6.0545454545454555e-06,
"loss": 10.37,
"step": 33400
},
{
"epoch": 1976.4776119402984,
"grad_norm": 130.5196075439453,
"learning_rate": 6.028571428571429e-06,
"loss": 11.8498,
"step": 33600
},
{
"epoch": 1988.2388059701493,
"grad_norm": 254.48614501953125,
"learning_rate": 6.0025974025974035e-06,
"loss": 10.7649,
"step": 33800
},
{
"epoch": 2000.0,
"grad_norm": 292.97216796875,
"learning_rate": 5.976623376623377e-06,
"loss": 11.4211,
"step": 34000
},
{
"epoch": 2011.7761194029852,
"grad_norm": 497.4891052246094,
"learning_rate": 5.9506493506493515e-06,
"loss": 11.9636,
"step": 34200
},
{
"epoch": 2023.5373134328358,
"grad_norm": 316.7022705078125,
"learning_rate": 5.924675324675325e-06,
"loss": 10.3258,
"step": 34400
},
{
"epoch": 2035.2985074626865,
"grad_norm": 263.2421569824219,
"learning_rate": 5.8987012987012994e-06,
"loss": 11.0163,
"step": 34600
},
{
"epoch": 2047.0597014925372,
"grad_norm": 149.36387634277344,
"learning_rate": 5.872727272727273e-06,
"loss": 9.7479,
"step": 34800
},
{
"epoch": 2058.8358208955224,
"grad_norm": 294.4871826171875,
"learning_rate": 5.8467532467532474e-06,
"loss": 10.7836,
"step": 35000
},
{
"epoch": 2070.597014925373,
"grad_norm": 311.5248107910156,
"learning_rate": 5.820779220779221e-06,
"loss": 10.6797,
"step": 35200
},
{
"epoch": 2082.3582089552237,
"grad_norm": 136.69500732421875,
"learning_rate": 5.7948051948051954e-06,
"loss": 10.9252,
"step": 35400
},
{
"epoch": 2094.1194029850744,
"grad_norm": 293.40399169921875,
"learning_rate": 5.768831168831169e-06,
"loss": 9.608,
"step": 35600
},
{
"epoch": 2105.89552238806,
"grad_norm": 232.2513427734375,
"learning_rate": 5.742987012987013e-06,
"loss": 10.0404,
"step": 35800
},
{
"epoch": 2117.6567164179105,
"grad_norm": 178.5306396484375,
"learning_rate": 5.717012987012988e-06,
"loss": 9.612,
"step": 36000
},
{
"epoch": 2129.417910447761,
"grad_norm": 323.44146728515625,
"learning_rate": 5.691168831168831e-06,
"loss": 8.6739,
"step": 36200
},
{
"epoch": 2141.179104477612,
"grad_norm": 101.05858612060547,
"learning_rate": 5.665194805194806e-06,
"loss": 9.2302,
"step": 36400
},
{
"epoch": 2152.955223880597,
"grad_norm": 164.43310546875,
"learning_rate": 5.63922077922078e-06,
"loss": 9.1987,
"step": 36600
},
{
"epoch": 2164.716417910448,
"grad_norm": 182.63916015625,
"learning_rate": 5.613246753246754e-06,
"loss": 9.7936,
"step": 36800
},
{
"epoch": 2176.4776119402986,
"grad_norm": 201.4703369140625,
"learning_rate": 5.587272727272728e-06,
"loss": 9.6401,
"step": 37000
},
{
"epoch": 2188.2388059701493,
"grad_norm": 151.9721221923828,
"learning_rate": 5.561298701298702e-06,
"loss": 10.4346,
"step": 37200
},
{
"epoch": 2200.0,
"grad_norm": 244.78732299804688,
"learning_rate": 5.535324675324676e-06,
"loss": 10.2022,
"step": 37400
},
{
"epoch": 2211.776119402985,
"grad_norm": 233.7654266357422,
"learning_rate": 5.50935064935065e-06,
"loss": 10.1548,
"step": 37600
},
{
"epoch": 2223.5373134328356,
"grad_norm": 188.21788024902344,
"learning_rate": 5.483376623376624e-06,
"loss": 9.9679,
"step": 37800
},
{
"epoch": 2235.2985074626868,
"grad_norm": 285.9569091796875,
"learning_rate": 5.457402597402598e-06,
"loss": 9.8509,
"step": 38000
},
{
"epoch": 2247.0597014925374,
"grad_norm": 264.8778991699219,
"learning_rate": 5.431428571428572e-06,
"loss": 9.9817,
"step": 38200
},
{
"epoch": 2258.8358208955224,
"grad_norm": 182.24191284179688,
"learning_rate": 5.405454545454546e-06,
"loss": 9.4971,
"step": 38400
},
{
"epoch": 2270.597014925373,
"grad_norm": 219.8632354736328,
"learning_rate": 5.37948051948052e-06,
"loss": 8.164,
"step": 38600
},
{
"epoch": 2282.3582089552237,
"grad_norm": 292.8862609863281,
"learning_rate": 5.353506493506494e-06,
"loss": 8.4305,
"step": 38800
},
{
"epoch": 2294.1194029850744,
"grad_norm": 243.6376190185547,
"learning_rate": 5.327532467532468e-06,
"loss": 10.2729,
"step": 39000
},
{
"epoch": 2305.89552238806,
"grad_norm": 120.5704574584961,
"learning_rate": 5.301558441558442e-06,
"loss": 8.4079,
"step": 39200
},
{
"epoch": 2317.6567164179105,
"grad_norm": 333.58612060546875,
"learning_rate": 5.275584415584416e-06,
"loss": 8.1809,
"step": 39400
},
{
"epoch": 2329.417910447761,
"grad_norm": 77.8395004272461,
"learning_rate": 5.24961038961039e-06,
"loss": 9.0445,
"step": 39600
},
{
"epoch": 2341.179104477612,
"grad_norm": 172.45359802246094,
"learning_rate": 5.223636363636364e-06,
"loss": 9.6053,
"step": 39800
},
{
"epoch": 2352.955223880597,
"grad_norm": 195.2128143310547,
"learning_rate": 5.197662337662338e-06,
"loss": 8.1167,
"step": 40000
},
{
"epoch": 2364.716417910448,
"grad_norm": 154.22743225097656,
"learning_rate": 5.171688311688312e-06,
"loss": 7.8072,
"step": 40200
},
{
"epoch": 2376.4776119402986,
"grad_norm": 83.33777618408203,
"learning_rate": 5.145714285714286e-06,
"loss": 8.0217,
"step": 40400
},
{
"epoch": 2388.2388059701493,
"grad_norm": 102.02108764648438,
"learning_rate": 5.11974025974026e-06,
"loss": 8.8543,
"step": 40600
},
{
"epoch": 2400.0,
"grad_norm": 68.74557495117188,
"learning_rate": 5.0937662337662345e-06,
"loss": 8.7576,
"step": 40800
},
{
"epoch": 2411.776119402985,
"grad_norm": 251.68568420410156,
"learning_rate": 5.067792207792208e-06,
"loss": 8.4178,
"step": 41000
},
{
"epoch": 2423.5373134328356,
"grad_norm": 176.3686065673828,
"learning_rate": 5.0418181818181825e-06,
"loss": 7.5952,
"step": 41200
},
{
"epoch": 2435.2985074626868,
"grad_norm": 355.08428955078125,
"learning_rate": 5.015974025974026e-06,
"loss": 7.7425,
"step": 41400
},
{
"epoch": 2447.0597014925374,
"grad_norm": 114.10618591308594,
"learning_rate": 4.9900000000000005e-06,
"loss": 9.2298,
"step": 41600
},
{
"epoch": 2458.8358208955224,
"grad_norm": 85.14331817626953,
"learning_rate": 4.964025974025974e-06,
"loss": 8.2393,
"step": 41800
},
{
"epoch": 2470.597014925373,
"grad_norm": 223.14291381835938,
"learning_rate": 4.9380519480519485e-06,
"loss": 9.5785,
"step": 42000
},
{
"epoch": 2482.3582089552237,
"grad_norm": 156.1779022216797,
"learning_rate": 4.912077922077922e-06,
"loss": 8.3694,
"step": 42200
},
{
"epoch": 2494.1194029850744,
"grad_norm": 163.2755126953125,
"learning_rate": 4.8862337662337665e-06,
"loss": 8.2274,
"step": 42400
},
{
"epoch": 2505.89552238806,
"grad_norm": 448.78741455078125,
"learning_rate": 4.860259740259741e-06,
"loss": 7.7257,
"step": 42600
},
{
"epoch": 2517.6567164179105,
"grad_norm": 224.7237548828125,
"learning_rate": 4.8342857142857145e-06,
"loss": 7.8661,
"step": 42800
},
{
"epoch": 2529.417910447761,
"grad_norm": 218.48426818847656,
"learning_rate": 4.808311688311689e-06,
"loss": 8.1052,
"step": 43000
},
{
"epoch": 2541.179104477612,
"grad_norm": 285.8219909667969,
"learning_rate": 4.7823376623376625e-06,
"loss": 7.2237,
"step": 43200
},
{
"epoch": 2552.955223880597,
"grad_norm": 121.75675964355469,
"learning_rate": 4.756363636363637e-06,
"loss": 7.3067,
"step": 43400
},
{
"epoch": 2564.716417910448,
"grad_norm": 441.2803039550781,
"learning_rate": 4.7303896103896104e-06,
"loss": 7.4268,
"step": 43600
},
{
"epoch": 2576.4776119402986,
"grad_norm": 250.1401824951172,
"learning_rate": 4.704415584415585e-06,
"loss": 7.4162,
"step": 43800
},
{
"epoch": 2588.2388059701493,
"grad_norm": 225.75238037109375,
"learning_rate": 4.6784415584415584e-06,
"loss": 7.9644,
"step": 44000
},
{
"epoch": 2600.0,
"grad_norm": 199.13861083984375,
"learning_rate": 4.652597402597403e-06,
"loss": 8.048,
"step": 44200
},
{
"epoch": 2611.776119402985,
"grad_norm": 128.95506286621094,
"learning_rate": 4.626623376623377e-06,
"loss": 6.9158,
"step": 44400
},
{
"epoch": 2623.5373134328356,
"grad_norm": 365.9570007324219,
"learning_rate": 4.600649350649351e-06,
"loss": 6.4058,
"step": 44600
},
{
"epoch": 2635.2985074626868,
"grad_norm": 114.73458862304688,
"learning_rate": 4.574675324675325e-06,
"loss": 8.2035,
"step": 44800
},
{
"epoch": 2647.0597014925374,
"grad_norm": 158.98861694335938,
"learning_rate": 4.548701298701299e-06,
"loss": 6.3892,
"step": 45000
},
{
"epoch": 2658.8358208955224,
"grad_norm": 203.47166442871094,
"learning_rate": 4.522727272727273e-06,
"loss": 7.2696,
"step": 45200
},
{
"epoch": 2670.597014925373,
"grad_norm": 231.44822692871094,
"learning_rate": 4.496753246753247e-06,
"loss": 7.5676,
"step": 45400
},
{
"epoch": 2682.3582089552237,
"grad_norm": 247.14039611816406,
"learning_rate": 4.470779220779221e-06,
"loss": 6.668,
"step": 45600
},
{
"epoch": 2694.1194029850744,
"grad_norm": 170.46774291992188,
"learning_rate": 4.444805194805195e-06,
"loss": 6.9474,
"step": 45800
},
{
"epoch": 2705.89552238806,
"grad_norm": 191.91915893554688,
"learning_rate": 4.418831168831169e-06,
"loss": 6.3278,
"step": 46000
},
{
"epoch": 2717.6567164179105,
"grad_norm": 105.89759826660156,
"learning_rate": 4.392857142857143e-06,
"loss": 7.3746,
"step": 46200
},
{
"epoch": 2729.417910447761,
"grad_norm": 86.0598373413086,
"learning_rate": 4.366883116883117e-06,
"loss": 7.079,
"step": 46400
},
{
"epoch": 2741.179104477612,
"grad_norm": 272.9900207519531,
"learning_rate": 4.340909090909091e-06,
"loss": 6.9619,
"step": 46600
},
{
"epoch": 2752.955223880597,
"grad_norm": 113.84228515625,
"learning_rate": 4.314935064935065e-06,
"loss": 6.7595,
"step": 46800
},
{
"epoch": 2764.716417910448,
"grad_norm": 464.052734375,
"learning_rate": 4.28896103896104e-06,
"loss": 6.9025,
"step": 47000
},
{
"epoch": 2776.4776119402986,
"grad_norm": 73.1541519165039,
"learning_rate": 4.262987012987013e-06,
"loss": 6.753,
"step": 47200
},
{
"epoch": 2788.2388059701493,
"grad_norm": 42.5888786315918,
"learning_rate": 4.237012987012988e-06,
"loss": 6.2469,
"step": 47400
},
{
"epoch": 2800.0,
"grad_norm": 87.32052612304688,
"learning_rate": 4.211038961038961e-06,
"loss": 7.9924,
"step": 47600
},
{
"epoch": 2811.776119402985,
"grad_norm": 324.5746154785156,
"learning_rate": 4.185064935064936e-06,
"loss": 5.9469,
"step": 47800
},
{
"epoch": 2823.5373134328356,
"grad_norm": 51.54016876220703,
"learning_rate": 4.159090909090909e-06,
"loss": 7.4489,
"step": 48000
},
{
"epoch": 2835.2985074626868,
"grad_norm": 176.7570343017578,
"learning_rate": 4.1331168831168836e-06,
"loss": 6.7529,
"step": 48200
},
{
"epoch": 2847.0597014925374,
"grad_norm": 107.7917709350586,
"learning_rate": 4.107142857142857e-06,
"loss": 6.1709,
"step": 48400
},
{
"epoch": 2858.8358208955224,
"grad_norm": 218.55908203125,
"learning_rate": 4.0812987012987016e-06,
"loss": 6.4473,
"step": 48600
},
{
"epoch": 2870.597014925373,
"grad_norm": 317.35137939453125,
"learning_rate": 4.055324675324675e-06,
"loss": 7.2964,
"step": 48800
},
{
"epoch": 2882.3582089552237,
"grad_norm": 407.064697265625,
"learning_rate": 4.0293506493506495e-06,
"loss": 6.5469,
"step": 49000
},
{
"epoch": 2894.1194029850744,
"grad_norm": 89.26599884033203,
"learning_rate": 4.003376623376624e-06,
"loss": 6.4736,
"step": 49200
},
{
"epoch": 2905.89552238806,
"grad_norm": 25.349979400634766,
"learning_rate": 3.9774025974025975e-06,
"loss": 6.6709,
"step": 49400
},
{
"epoch": 2917.6567164179105,
"grad_norm": 152.76893615722656,
"learning_rate": 3.951428571428572e-06,
"loss": 6.769,
"step": 49600
},
{
"epoch": 2929.417910447761,
"grad_norm": 369.66729736328125,
"learning_rate": 3.9254545454545455e-06,
"loss": 6.8757,
"step": 49800
},
{
"epoch": 2941.179104477612,
"grad_norm": 163.01708984375,
"learning_rate": 3.89948051948052e-06,
"loss": 5.9855,
"step": 50000
},
{
"epoch": 2952.955223880597,
"grad_norm": 359.50164794921875,
"learning_rate": 3.8735064935064935e-06,
"loss": 6.7189,
"step": 50200
},
{
"epoch": 2964.716417910448,
"grad_norm": 135.31906127929688,
"learning_rate": 3.847532467532468e-06,
"loss": 6.1798,
"step": 50400
},
{
"epoch": 2976.4776119402986,
"grad_norm": 77.5290756225586,
"learning_rate": 3.8215584415584415e-06,
"loss": 5.9281,
"step": 50600
},
{
"epoch": 2988.2388059701493,
"grad_norm": 103.02841186523438,
"learning_rate": 3.7955844155844155e-06,
"loss": 6.068,
"step": 50800
},
{
"epoch": 3000.0,
"grad_norm": 348.920166015625,
"learning_rate": 3.76961038961039e-06,
"loss": 5.7779,
"step": 51000
},
{
"epoch": 3011.776119402985,
"grad_norm": 356.079345703125,
"learning_rate": 3.743636363636364e-06,
"loss": 6.8131,
"step": 51200
},
{
"epoch": 3023.5373134328356,
"grad_norm": 201.86148071289062,
"learning_rate": 3.717662337662338e-06,
"loss": 6.2735,
"step": 51400
},
{
"epoch": 3035.2985074626868,
"grad_norm": 181.3549041748047,
"learning_rate": 3.691688311688312e-06,
"loss": 6.2642,
"step": 51600
},
{
"epoch": 3047.0597014925374,
"grad_norm": 158.26319885253906,
"learning_rate": 3.665714285714286e-06,
"loss": 7.6419,
"step": 51800
},
{
"epoch": 3058.8358208955224,
"grad_norm": 417.2932434082031,
"learning_rate": 3.63974025974026e-06,
"loss": 5.9062,
"step": 52000
},
{
"epoch": 3070.597014925373,
"grad_norm": 123.00536346435547,
"learning_rate": 3.613766233766234e-06,
"loss": 6.0384,
"step": 52200
},
{
"epoch": 3082.3582089552237,
"grad_norm": 212.98793029785156,
"learning_rate": 3.587792207792208e-06,
"loss": 5.6856,
"step": 52400
},
{
"epoch": 3094.1194029850744,
"grad_norm": 232.1907501220703,
"learning_rate": 3.561818181818182e-06,
"loss": 6.196,
"step": 52600
},
{
"epoch": 3105.89552238806,
"grad_norm": 40.089229583740234,
"learning_rate": 3.535844155844156e-06,
"loss": 5.7956,
"step": 52800
},
{
"epoch": 3117.6567164179105,
"grad_norm": 286.09320068359375,
"learning_rate": 3.5100000000000003e-06,
"loss": 6.1736,
"step": 53000
},
{
"epoch": 3129.417910447761,
"grad_norm": 247.46432495117188,
"learning_rate": 3.4840259740259743e-06,
"loss": 5.9662,
"step": 53200
},
{
"epoch": 3141.179104477612,
"grad_norm": 175.70138549804688,
"learning_rate": 3.4580519480519483e-06,
"loss": 5.5742,
"step": 53400
},
{
"epoch": 3152.955223880597,
"grad_norm": 244.6918487548828,
"learning_rate": 3.4320779220779223e-06,
"loss": 5.4242,
"step": 53600
},
{
"epoch": 3164.716417910448,
"grad_norm": 290.1295166015625,
"learning_rate": 3.4061038961038963e-06,
"loss": 5.9908,
"step": 53800
},
{
"epoch": 3176.4776119402986,
"grad_norm": 191.1642303466797,
"learning_rate": 3.3801298701298702e-06,
"loss": 6.4687,
"step": 54000
},
{
"epoch": 3188.2388059701493,
"grad_norm": 230.18724060058594,
"learning_rate": 3.3541558441558442e-06,
"loss": 5.8824,
"step": 54200
},
{
"epoch": 3200.0,
"grad_norm": 189.81280517578125,
"learning_rate": 3.3281818181818182e-06,
"loss": 6.0779,
"step": 54400
},
{
"epoch": 3211.776119402985,
"grad_norm": 139.64651489257812,
"learning_rate": 3.3022077922077927e-06,
"loss": 6.1833,
"step": 54600
},
{
"epoch": 3223.5373134328356,
"grad_norm": 191.9344024658203,
"learning_rate": 3.2762337662337666e-06,
"loss": 5.7443,
"step": 54800
},
{
"epoch": 3235.2985074626868,
"grad_norm": 313.61773681640625,
"learning_rate": 3.2502597402597406e-06,
"loss": 5.4124,
"step": 55000
},
{
"epoch": 3247.0597014925374,
"grad_norm": 139.777099609375,
"learning_rate": 3.2242857142857146e-06,
"loss": 5.4102,
"step": 55200
},
{
"epoch": 3258.8358208955224,
"grad_norm": 142.17034912109375,
"learning_rate": 3.1983116883116886e-06,
"loss": 5.2255,
"step": 55400
},
{
"epoch": 3270.597014925373,
"grad_norm": 159.3903350830078,
"learning_rate": 3.1724675324675326e-06,
"loss": 5.1369,
"step": 55600
},
{
"epoch": 3282.3582089552237,
"grad_norm": 171.2684326171875,
"learning_rate": 3.1464935064935066e-06,
"loss": 5.5156,
"step": 55800
},
{
"epoch": 3294.1194029850744,
"grad_norm": 177.32545471191406,
"learning_rate": 3.1205194805194806e-06,
"loss": 6.1047,
"step": 56000
},
{
"epoch": 3305.89552238806,
"grad_norm": 127.48875427246094,
"learning_rate": 3.094545454545455e-06,
"loss": 5.5533,
"step": 56200
},
{
"epoch": 3317.6567164179105,
"grad_norm": 284.4790954589844,
"learning_rate": 3.068571428571429e-06,
"loss": 5.096,
"step": 56400
},
{
"epoch": 3329.417910447761,
"grad_norm": 148.5039520263672,
"learning_rate": 3.042597402597403e-06,
"loss": 5.6225,
"step": 56600
},
{
"epoch": 3341.179104477612,
"grad_norm": 414.6415710449219,
"learning_rate": 3.016623376623377e-06,
"loss": 6.35,
"step": 56800
},
{
"epoch": 3352.955223880597,
"grad_norm": 62.76802062988281,
"learning_rate": 2.990649350649351e-06,
"loss": 5.5579,
"step": 57000
},
{
"epoch": 3364.716417910448,
"grad_norm": 31.516719818115234,
"learning_rate": 2.964675324675325e-06,
"loss": 5.7893,
"step": 57200
},
{
"epoch": 3376.4776119402986,
"grad_norm": 151.5336456298828,
"learning_rate": 2.938701298701299e-06,
"loss": 6.144,
"step": 57400
},
{
"epoch": 3388.2388059701493,
"grad_norm": 457.56573486328125,
"learning_rate": 2.912727272727273e-06,
"loss": 5.7772,
"step": 57600
},
{
"epoch": 3400.0,
"grad_norm": 81.55277252197266,
"learning_rate": 2.8867532467532474e-06,
"loss": 5.0447,
"step": 57800
},
{
"epoch": 3411.776119402985,
"grad_norm": 22.124055862426758,
"learning_rate": 2.8607792207792214e-06,
"loss": 6.2737,
"step": 58000
},
{
"epoch": 3423.5373134328356,
"grad_norm": 284.54803466796875,
"learning_rate": 2.8348051948051954e-06,
"loss": 5.392,
"step": 58200
},
{
"epoch": 3435.2985074626868,
"grad_norm": 455.4921569824219,
"learning_rate": 2.8088311688311694e-06,
"loss": 5.9439,
"step": 58400
},
{
"epoch": 3447.0597014925374,
"grad_norm": 328.2806701660156,
"learning_rate": 2.7829870129870134e-06,
"loss": 5.8568,
"step": 58600
},
{
"epoch": 3458.8358208955224,
"grad_norm": 186.59173583984375,
"learning_rate": 2.7570129870129874e-06,
"loss": 5.8562,
"step": 58800
},
{
"epoch": 3470.597014925373,
"grad_norm": 225.7448272705078,
"learning_rate": 2.7310389610389614e-06,
"loss": 4.7843,
"step": 59000
},
{
"epoch": 3482.3582089552237,
"grad_norm": 453.7283935546875,
"learning_rate": 2.7050649350649354e-06,
"loss": 5.9007,
"step": 59200
},
{
"epoch": 3494.1194029850744,
"grad_norm": 185.39083862304688,
"learning_rate": 2.6790909090909094e-06,
"loss": 5.8159,
"step": 59400
},
{
"epoch": 3505.89552238806,
"grad_norm": 95.3510513305664,
"learning_rate": 2.6531168831168833e-06,
"loss": 5.2905,
"step": 59600
},
{
"epoch": 3517.6567164179105,
"grad_norm": 29.410858154296875,
"learning_rate": 2.6271428571428573e-06,
"loss": 4.9938,
"step": 59800
},
{
"epoch": 3529.417910447761,
"grad_norm": 87.61227416992188,
"learning_rate": 2.6011688311688318e-06,
"loss": 5.5285,
"step": 60000
},
{
"epoch": 3541.179104477612,
"grad_norm": 13.053986549377441,
"learning_rate": 2.5751948051948058e-06,
"loss": 5.7445,
"step": 60200
},
{
"epoch": 3552.955223880597,
"grad_norm": 104.69475555419922,
"learning_rate": 2.5493506493506497e-06,
"loss": 5.7425,
"step": 60400
},
{
"epoch": 3564.716417910448,
"grad_norm": 206.36949157714844,
"learning_rate": 2.5233766233766237e-06,
"loss": 5.2375,
"step": 60600
},
{
"epoch": 3576.4776119402986,
"grad_norm": 375.29266357421875,
"learning_rate": 2.4974025974025977e-06,
"loss": 4.803,
"step": 60800
},
{
"epoch": 3588.2388059701493,
"grad_norm": 55.10752868652344,
"learning_rate": 2.4714285714285717e-06,
"loss": 4.9683,
"step": 61000
},
{
"epoch": 3600.0,
"grad_norm": 129.48883056640625,
"learning_rate": 2.4454545454545457e-06,
"loss": 4.7734,
"step": 61200
},
{
"epoch": 3611.776119402985,
"grad_norm": 232.613525390625,
"learning_rate": 2.4194805194805197e-06,
"loss": 4.9529,
"step": 61400
},
{
"epoch": 3623.5373134328356,
"grad_norm": 119.18389892578125,
"learning_rate": 2.3935064935064937e-06,
"loss": 4.6153,
"step": 61600
},
{
"epoch": 3635.2985074626868,
"grad_norm": 163.57119750976562,
"learning_rate": 2.3675324675324677e-06,
"loss": 5.8858,
"step": 61800
},
{
"epoch": 3647.0597014925374,
"grad_norm": 179.8179473876953,
"learning_rate": 2.3415584415584417e-06,
"loss": 4.7752,
"step": 62000
},
{
"epoch": 3658.8358208955224,
"grad_norm": 476.3257751464844,
"learning_rate": 2.3155844155844157e-06,
"loss": 5.3414,
"step": 62200
},
{
"epoch": 3670.597014925373,
"grad_norm": 52.362213134765625,
"learning_rate": 2.2896103896103897e-06,
"loss": 5.1212,
"step": 62400
},
{
"epoch": 3682.3582089552237,
"grad_norm": 146.75779724121094,
"learning_rate": 2.2636363636363637e-06,
"loss": 5.0903,
"step": 62600
},
{
"epoch": 3694.1194029850744,
"grad_norm": 194.1846160888672,
"learning_rate": 2.237662337662338e-06,
"loss": 5.1435,
"step": 62800
},
{
"epoch": 3705.89552238806,
"grad_norm": 353.49639892578125,
"learning_rate": 2.211688311688312e-06,
"loss": 5.3565,
"step": 63000
},
{
"epoch": 3717.6567164179105,
"grad_norm": 170.0306396484375,
"learning_rate": 2.185844155844156e-06,
"loss": 5.4508,
"step": 63200
},
{
"epoch": 3729.417910447761,
"grad_norm": 105.09954071044922,
"learning_rate": 2.15987012987013e-06,
"loss": 4.764,
"step": 63400
},
{
"epoch": 3741.179104477612,
"grad_norm": 110.3379135131836,
"learning_rate": 2.133896103896104e-06,
"loss": 4.7869,
"step": 63600
},
{
"epoch": 3752.955223880597,
"grad_norm": 195.00747680664062,
"learning_rate": 2.107922077922078e-06,
"loss": 5.006,
"step": 63800
},
{
"epoch": 3764.716417910448,
"grad_norm": 384.640380859375,
"learning_rate": 2.081948051948052e-06,
"loss": 4.3629,
"step": 64000
},
{
"epoch": 3776.4776119402986,
"grad_norm": 151.75579833984375,
"learning_rate": 2.055974025974026e-06,
"loss": 5.1427,
"step": 64200
},
{
"epoch": 3788.2388059701493,
"grad_norm": 321.9924621582031,
"learning_rate": 2.0300000000000005e-06,
"loss": 4.5263,
"step": 64400
},
{
"epoch": 3800.0,
"grad_norm": 71.38465118408203,
"learning_rate": 2.0040259740259745e-06,
"loss": 5.9976,
"step": 64600
},
{
"epoch": 3811.776119402985,
"grad_norm": 142.11964416503906,
"learning_rate": 1.9780519480519485e-06,
"loss": 4.5058,
"step": 64800
},
{
"epoch": 3823.5373134328356,
"grad_norm": 255.7351531982422,
"learning_rate": 1.9520779220779225e-06,
"loss": 5.2054,
"step": 65000
},
{
"epoch": 3835.2985074626868,
"grad_norm": 139.64971923828125,
"learning_rate": 1.9261038961038964e-06,
"loss": 4.5154,
"step": 65200
},
{
"epoch": 3847.0597014925374,
"grad_norm": 269.8912658691406,
"learning_rate": 1.9001298701298704e-06,
"loss": 4.7901,
"step": 65400
},
{
"epoch": 3858.8358208955224,
"grad_norm": 143.27041625976562,
"learning_rate": 1.8741558441558444e-06,
"loss": 5.0364,
"step": 65600
},
{
"epoch": 3870.597014925373,
"grad_norm": 85.00979614257812,
"learning_rate": 1.8481818181818184e-06,
"loss": 5.7018,
"step": 65800
},
{
"epoch": 3882.3582089552237,
"grad_norm": 23.321130752563477,
"learning_rate": 1.8222077922077924e-06,
"loss": 5.2634,
"step": 66000
},
{
"epoch": 3894.1194029850744,
"grad_norm": 116.6059799194336,
"learning_rate": 1.7963636363636366e-06,
"loss": 5.1182,
"step": 66200
},
{
"epoch": 3905.89552238806,
"grad_norm": 250.52786254882812,
"learning_rate": 1.7703896103896106e-06,
"loss": 4.8144,
"step": 66400
},
{
"epoch": 3917.6567164179105,
"grad_norm": 153.89578247070312,
"learning_rate": 1.7444155844155846e-06,
"loss": 4.1859,
"step": 66600
},
{
"epoch": 3929.417910447761,
"grad_norm": 377.2282409667969,
"learning_rate": 1.7184415584415586e-06,
"loss": 4.8385,
"step": 66800
},
{
"epoch": 3941.179104477612,
"grad_norm": 122.95838928222656,
"learning_rate": 1.6924675324675326e-06,
"loss": 3.9718,
"step": 67000
},
{
"epoch": 3952.955223880597,
"grad_norm": 231.4821014404297,
"learning_rate": 1.6664935064935068e-06,
"loss": 4.6111,
"step": 67200
},
{
"epoch": 3964.716417910448,
"grad_norm": 237.96945190429688,
"learning_rate": 1.6405194805194808e-06,
"loss": 5.2436,
"step": 67400
},
{
"epoch": 3976.4776119402986,
"grad_norm": 110.74050903320312,
"learning_rate": 1.6145454545454548e-06,
"loss": 4.6773,
"step": 67600
},
{
"epoch": 3988.2388059701493,
"grad_norm": 68.52436828613281,
"learning_rate": 1.5885714285714288e-06,
"loss": 4.4819,
"step": 67800
},
{
"epoch": 4000.0,
"grad_norm": 87.57453155517578,
"learning_rate": 1.5625974025974028e-06,
"loss": 4.9292,
"step": 68000
},
{
"epoch": 4011.776119402985,
"grad_norm": 288.13714599609375,
"learning_rate": 1.5366233766233768e-06,
"loss": 5.3013,
"step": 68200
},
{
"epoch": 4023.5373134328356,
"grad_norm": 266.3171081542969,
"learning_rate": 1.5106493506493508e-06,
"loss": 4.7125,
"step": 68400
},
{
"epoch": 4035.2985074626868,
"grad_norm": 136.16607666015625,
"learning_rate": 1.4846753246753248e-06,
"loss": 4.4004,
"step": 68600
},
{
"epoch": 4047.0597014925374,
"grad_norm": 199.1755828857422,
"learning_rate": 1.4587012987012988e-06,
"loss": 5.0152,
"step": 68800
},
{
"epoch": 4058.8358208955224,
"grad_norm": 135.2139434814453,
"learning_rate": 1.4327272727272728e-06,
"loss": 4.592,
"step": 69000
},
{
"epoch": 4070.597014925373,
"grad_norm": 256.96954345703125,
"learning_rate": 1.406753246753247e-06,
"loss": 4.1364,
"step": 69200
},
{
"epoch": 4082.3582089552237,
"grad_norm": 151.81385803222656,
"learning_rate": 1.380779220779221e-06,
"loss": 4.638,
"step": 69400
},
{
"epoch": 4094.1194029850744,
"grad_norm": 140.18310546875,
"learning_rate": 1.354805194805195e-06,
"loss": 5.1307,
"step": 69600
},
{
"epoch": 4105.895522388059,
"grad_norm": 131.49514770507812,
"learning_rate": 1.328831168831169e-06,
"loss": 4.4532,
"step": 69800
},
{
"epoch": 4117.6567164179105,
"grad_norm": 86.51961517333984,
"learning_rate": 1.302857142857143e-06,
"loss": 4.4996,
"step": 70000
},
{
"epoch": 4129.417910447762,
"grad_norm": 232.16627502441406,
"learning_rate": 1.2770129870129871e-06,
"loss": 5.0817,
"step": 70200
},
{
"epoch": 4141.179104477612,
"grad_norm": 111.115478515625,
"learning_rate": 1.2511688311688313e-06,
"loss": 4.4479,
"step": 70400
},
{
"epoch": 4152.955223880597,
"grad_norm": 136.0243377685547,
"learning_rate": 1.2251948051948053e-06,
"loss": 4.9805,
"step": 70600
},
{
"epoch": 4164.7164179104475,
"grad_norm": 240.4090576171875,
"learning_rate": 1.1992207792207793e-06,
"loss": 5.1758,
"step": 70800
},
{
"epoch": 4176.477611940299,
"grad_norm": 162.47177124023438,
"learning_rate": 1.1732467532467533e-06,
"loss": 4.4003,
"step": 71000
},
{
"epoch": 4188.238805970149,
"grad_norm": 172.05398559570312,
"learning_rate": 1.1472727272727275e-06,
"loss": 4.5982,
"step": 71200
},
{
"epoch": 4200.0,
"grad_norm": 115.58950805664062,
"learning_rate": 1.1212987012987015e-06,
"loss": 4.9562,
"step": 71400
},
{
"epoch": 4211.776119402985,
"grad_norm": 612.0722045898438,
"learning_rate": 1.0954545454545455e-06,
"loss": 4.5789,
"step": 71600
},
{
"epoch": 4223.537313432836,
"grad_norm": 142.39805603027344,
"learning_rate": 1.0694805194805195e-06,
"loss": 5.3481,
"step": 71800
},
{
"epoch": 4235.298507462687,
"grad_norm": 151.8544158935547,
"learning_rate": 1.0435064935064935e-06,
"loss": 4.9694,
"step": 72000
},
{
"epoch": 4247.059701492537,
"grad_norm": 78.40233612060547,
"learning_rate": 1.0175324675324675e-06,
"loss": 4.6901,
"step": 72200
},
{
"epoch": 4258.835820895522,
"grad_norm": 128.51756286621094,
"learning_rate": 9.915584415584415e-07,
"loss": 4.44,
"step": 72400
},
{
"epoch": 4270.5970149253735,
"grad_norm": 98.647216796875,
"learning_rate": 9.655844155844157e-07,
"loss": 4.1811,
"step": 72600
},
{
"epoch": 4282.358208955224,
"grad_norm": 142.39537048339844,
"learning_rate": 9.396103896103898e-07,
"loss": 3.957,
"step": 72800
},
{
"epoch": 4294.119402985075,
"grad_norm": 172.83412170410156,
"learning_rate": 9.136363636363638e-07,
"loss": 5.0366,
"step": 73000
},
{
"epoch": 4305.895522388059,
"grad_norm": 323.51953125,
"learning_rate": 8.876623376623378e-07,
"loss": 4.8918,
"step": 73200
},
{
"epoch": 4317.6567164179105,
"grad_norm": 175.51437377929688,
"learning_rate": 8.616883116883118e-07,
"loss": 4.8388,
"step": 73400
},
{
"epoch": 4329.417910447762,
"grad_norm": 10.666017532348633,
"learning_rate": 8.357142857142858e-07,
"loss": 4.7345,
"step": 73600
},
{
"epoch": 4341.179104477612,
"grad_norm": 236.7921142578125,
"learning_rate": 8.097402597402599e-07,
"loss": 4.5804,
"step": 73800
},
{
"epoch": 4352.955223880597,
"grad_norm": 124.93741607666016,
"learning_rate": 7.837662337662339e-07,
"loss": 5.0424,
"step": 74000
},
{
"epoch": 4364.7164179104475,
"grad_norm": 321.4673156738281,
"learning_rate": 7.577922077922079e-07,
"loss": 4.7424,
"step": 74200
},
{
"epoch": 4376.477611940299,
"grad_norm": 566.9447021484375,
"learning_rate": 7.318181818181819e-07,
"loss": 4.2368,
"step": 74400
},
{
"epoch": 4388.238805970149,
"grad_norm": 170.28578186035156,
"learning_rate": 7.058441558441559e-07,
"loss": 4.5343,
"step": 74600
},
{
"epoch": 4400.0,
"grad_norm": 45.38258743286133,
"learning_rate": 6.7987012987013e-07,
"loss": 4.2724,
"step": 74800
},
{
"epoch": 4411.776119402985,
"grad_norm": 207.35653686523438,
"learning_rate": 6.538961038961039e-07,
"loss": 4.1466,
"step": 75000
},
{
"epoch": 4423.537313432836,
"grad_norm": 195.24972534179688,
"learning_rate": 6.279220779220779e-07,
"loss": 4.8058,
"step": 75200
},
{
"epoch": 4435.298507462687,
"grad_norm": 21.271228790283203,
"learning_rate": 6.01948051948052e-07,
"loss": 4.2456,
"step": 75400
},
{
"epoch": 4447.059701492537,
"grad_norm": 128.87979125976562,
"learning_rate": 5.75974025974026e-07,
"loss": 4.609,
"step": 75600
},
{
"epoch": 4458.835820895522,
"grad_norm": 125.3255844116211,
"learning_rate": 5.5e-07,
"loss": 3.8383,
"step": 75800
},
{
"epoch": 4470.5970149253735,
"grad_norm": 176.71116638183594,
"learning_rate": 5.24025974025974e-07,
"loss": 4.3682,
"step": 76000
},
{
"epoch": 4482.358208955224,
"grad_norm": 174.38621520996094,
"learning_rate": 4.980519480519481e-07,
"loss": 3.9024,
"step": 76200
},
{
"epoch": 4494.119402985075,
"grad_norm": 150.01458740234375,
"learning_rate": 4.720779220779221e-07,
"loss": 4.1331,
"step": 76400
},
{
"epoch": 4505.895522388059,
"grad_norm": 209.55963134765625,
"learning_rate": 4.461038961038961e-07,
"loss": 4.5891,
"step": 76600
},
{
"epoch": 4517.6567164179105,
"grad_norm": 332.1000061035156,
"learning_rate": 4.201298701298702e-07,
"loss": 4.1596,
"step": 76800
},
{
"epoch": 4529.417910447762,
"grad_norm": 267.723388671875,
"learning_rate": 3.941558441558442e-07,
"loss": 3.6498,
"step": 77000
},
{
"epoch": 4541.179104477612,
"grad_norm": 221.1051483154297,
"learning_rate": 3.681818181818182e-07,
"loss": 3.9301,
"step": 77200
},
{
"epoch": 4552.955223880597,
"grad_norm": 102.58394622802734,
"learning_rate": 3.4220779220779225e-07,
"loss": 4.2587,
"step": 77400
},
{
"epoch": 4564.7164179104475,
"grad_norm": 221.03683471679688,
"learning_rate": 3.1623376623376624e-07,
"loss": 3.8038,
"step": 77600
},
{
"epoch": 4576.477611940299,
"grad_norm": 199.16111755371094,
"learning_rate": 2.902597402597403e-07,
"loss": 4.4641,
"step": 77800
},
{
"epoch": 4588.238805970149,
"grad_norm": 30.24901580810547,
"learning_rate": 2.6428571428571433e-07,
"loss": 4.233,
"step": 78000
},
{
"epoch": 4600.0,
"grad_norm": 146.84005737304688,
"learning_rate": 2.384415584415585e-07,
"loss": 4.2298,
"step": 78200
},
{
"epoch": 4611.776119402985,
"grad_norm": 215.6741180419922,
"learning_rate": 2.1246753246753248e-07,
"loss": 4.4618,
"step": 78400
},
{
"epoch": 4623.537313432836,
"grad_norm": 318.6520080566406,
"learning_rate": 1.864935064935065e-07,
"loss": 4.4153,
"step": 78600
},
{
"epoch": 4635.298507462687,
"grad_norm": 80.2079849243164,
"learning_rate": 1.6051948051948055e-07,
"loss": 4.2413,
"step": 78800
},
{
"epoch": 4647.059701492537,
"grad_norm": 118.82848358154297,
"learning_rate": 1.3454545454545457e-07,
"loss": 4.164,
"step": 79000
},
{
"epoch": 4658.835820895522,
"grad_norm": 56.74250411987305,
"learning_rate": 1.0857142857142857e-07,
"loss": 4.7498,
"step": 79200
},
{
"epoch": 4670.5970149253735,
"grad_norm": 164.66262817382812,
"learning_rate": 8.259740259740261e-08,
"loss": 4.4375,
"step": 79400
},
{
"epoch": 4682.358208955224,
"grad_norm": 267.6045227050781,
"learning_rate": 5.662337662337663e-08,
"loss": 4.8022,
"step": 79600
},
{
"epoch": 4694.119402985075,
"grad_norm": 136.69151306152344,
"learning_rate": 3.0649350649350655e-08,
"loss": 3.8398,
"step": 79800
},
{
"epoch": 4705.895522388059,
"grad_norm": 251.12142944335938,
"learning_rate": 4.675324675324676e-09,
"loss": 4.6735,
"step": 80000
}
],
"logging_steps": 200,
"max_steps": 80000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5000,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.852250804395515e+20,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}