{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.423800044238001,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022119000221190004,
"grad_norm": 14340.359375,
"learning_rate": 2.5e-06,
"loss": 6523.6544,
"step": 100
},
{
"epoch": 0.04423800044238001,
"grad_norm": 1850.3282470703125,
"learning_rate": 5e-06,
"loss": 350.7309,
"step": 200
},
{
"epoch": 0.06635700066357,
"grad_norm": 1415.2620849609375,
"learning_rate": 7.5e-06,
"loss": 321.6054,
"step": 300
},
{
"epoch": 0.08847600088476001,
"grad_norm": 3297.33984375,
"learning_rate": 1e-05,
"loss": 260.2301,
"step": 400
},
{
"epoch": 0.11059500110595001,
"grad_norm": 2892.188232421875,
"learning_rate": 1.25e-05,
"loss": 308.3714,
"step": 500
},
{
"epoch": 0.13271400132714,
"grad_norm": 1215.732666015625,
"learning_rate": 1.5e-05,
"loss": 343.5222,
"step": 600
},
{
"epoch": 0.15483300154833002,
"grad_norm": 1504.814453125,
"learning_rate": 1.75e-05,
"loss": 301.771,
"step": 700
},
{
"epoch": 0.17695200176952003,
"grad_norm": 714.8418579101562,
"learning_rate": 2e-05,
"loss": 290.3531,
"step": 800
},
{
"epoch": 0.19907100199071,
"grad_norm": 2755.97998046875,
"learning_rate": 2.25e-05,
"loss": 179.3829,
"step": 900
},
{
"epoch": 0.22119000221190002,
"grad_norm": 353.36248779296875,
"learning_rate": 2.5e-05,
"loss": 310.0189,
"step": 1000
},
{
"epoch": 0.24330900243309003,
"grad_norm": 418.4841613769531,
"learning_rate": 2.7500000000000004e-05,
"loss": 237.1839,
"step": 1100
},
{
"epoch": 0.26542800265428,
"grad_norm": 259.7605285644531,
"learning_rate": 3e-05,
"loss": 221.7796,
"step": 1200
},
{
"epoch": 0.28754700287547,
"grad_norm": 997.5048217773438,
"learning_rate": 3.2500000000000004e-05,
"loss": 345.3388,
"step": 1300
},
{
"epoch": 0.30966600309666004,
"grad_norm": 36418.90625,
"learning_rate": 3.5e-05,
"loss": 342.6086,
"step": 1400
},
{
"epoch": 0.33178500331785005,
"grad_norm": 863.9041748046875,
"learning_rate": 3.7500000000000003e-05,
"loss": 233.7479,
"step": 1500
},
{
"epoch": 0.35390400353904006,
"grad_norm": 622.4481811523438,
"learning_rate": 4e-05,
"loss": 134.1319,
"step": 1600
},
{
"epoch": 0.37602300376023,
"grad_norm": 699.130859375,
"learning_rate": 4.25e-05,
"loss": 192.9184,
"step": 1700
},
{
"epoch": 0.39814200398142,
"grad_norm": 893.842041015625,
"learning_rate": 4.5e-05,
"loss": 309.9599,
"step": 1800
},
{
"epoch": 0.42026100420261003,
"grad_norm": 478.645751953125,
"learning_rate": 4.75e-05,
"loss": 160.4654,
"step": 1900
},
{
"epoch": 0.44238000442380004,
"grad_norm": 833.3300170898438,
"learning_rate": 5e-05,
"loss": 184.012,
"step": 2000
},
{
"epoch": 0.46449900464499005,
"grad_norm": 7655.88037109375,
"learning_rate": 4.972222222222223e-05,
"loss": 257.931,
"step": 2100
},
{
"epoch": 0.48661800486618007,
"grad_norm": 466.54302978515625,
"learning_rate": 4.9444444444444446e-05,
"loss": 154.0851,
"step": 2200
},
{
"epoch": 0.5087370050873701,
"grad_norm": 948.7927856445312,
"learning_rate": 4.9166666666666665e-05,
"loss": 170.1584,
"step": 2300
},
{
"epoch": 0.53085600530856,
"grad_norm": 767.844482421875,
"learning_rate": 4.888888888888889e-05,
"loss": 179.1037,
"step": 2400
},
{
"epoch": 0.5529750055297501,
"grad_norm": 1689.810791015625,
"learning_rate": 4.8611111111111115e-05,
"loss": 197.1341,
"step": 2500
},
{
"epoch": 0.57509400575094,
"grad_norm": 1661.3369140625,
"learning_rate": 4.8333333333333334e-05,
"loss": 411.5847,
"step": 2600
},
{
"epoch": 0.59721300597213,
"grad_norm": 833.5604858398438,
"learning_rate": 4.805555555555556e-05,
"loss": 119.3549,
"step": 2700
},
{
"epoch": 0.6193320061933201,
"grad_norm": 562038.25,
"learning_rate": 4.7777777777777784e-05,
"loss": 245.144,
"step": 2800
},
{
"epoch": 0.64145100641451,
"grad_norm": 1275.5999755859375,
"learning_rate": 4.75e-05,
"loss": 197.1938,
"step": 2900
},
{
"epoch": 0.6635700066357001,
"grad_norm": 464.8160095214844,
"learning_rate": 4.722222222222222e-05,
"loss": 206.2149,
"step": 3000
},
{
"epoch": 0.68568900685689,
"grad_norm": 296.2396240234375,
"learning_rate": 4.6944444444444446e-05,
"loss": 146.9992,
"step": 3100
},
{
"epoch": 0.7078080070780801,
"grad_norm": 490.5688171386719,
"learning_rate": 4.666666666666667e-05,
"loss": 121.8763,
"step": 3200
},
{
"epoch": 0.7299270072992701,
"grad_norm": 320.01116943359375,
"learning_rate": 4.638888888888889e-05,
"loss": 82.6486,
"step": 3300
},
{
"epoch": 0.75204600752046,
"grad_norm": 269.9767150878906,
"learning_rate": 4.6111111111111115e-05,
"loss": 144.595,
"step": 3400
},
{
"epoch": 0.7741650077416501,
"grad_norm": 662.4879150390625,
"learning_rate": 4.5833333333333334e-05,
"loss": 140.9867,
"step": 3500
},
{
"epoch": 0.79628400796284,
"grad_norm": 882.5130615234375,
"learning_rate": 4.555555555555556e-05,
"loss": 130.253,
"step": 3600
},
{
"epoch": 0.8184030081840301,
"grad_norm": 631.7805786132812,
"learning_rate": 4.527777777777778e-05,
"loss": 95.0868,
"step": 3700
},
{
"epoch": 0.8405220084052201,
"grad_norm": 197.107421875,
"learning_rate": 4.5e-05,
"loss": 106.4198,
"step": 3800
},
{
"epoch": 0.8626410086264101,
"grad_norm": 3084.970947265625,
"learning_rate": 4.472222222222223e-05,
"loss": 145.6145,
"step": 3900
},
{
"epoch": 0.8847600088476001,
"grad_norm": 703.7938232421875,
"learning_rate": 4.4444444444444447e-05,
"loss": 133.7272,
"step": 4000
},
{
"epoch": 0.90687900906879,
"grad_norm": 546.3758544921875,
"learning_rate": 4.4166666666666665e-05,
"loss": 190.9529,
"step": 4100
},
{
"epoch": 0.9289980092899801,
"grad_norm": 415.634765625,
"learning_rate": 4.388888888888889e-05,
"loss": 149.3702,
"step": 4200
},
{
"epoch": 0.9511170095111701,
"grad_norm": 614.1394653320312,
"learning_rate": 4.3611111111111116e-05,
"loss": 148.1649,
"step": 4300
},
{
"epoch": 0.9732360097323601,
"grad_norm": 957.6664428710938,
"learning_rate": 4.3333333333333334e-05,
"loss": 134.7802,
"step": 4400
},
{
"epoch": 0.9953550099535501,
"grad_norm": 700.3091430664062,
"learning_rate": 4.305555555555556e-05,
"loss": 128.7425,
"step": 4500
},
{
"epoch": 1.0,
"eval_loss": 229.09039306640625,
"eval_runtime": 13.3799,
"eval_samples_per_second": 150.226,
"eval_steps_per_second": 37.594,
"step": 4521
},
{
"epoch": 1.0174740101747402,
"grad_norm": 159.52822875976562,
"learning_rate": 4.277777777777778e-05,
"loss": 160.3704,
"step": 4600
},
{
"epoch": 1.03959301039593,
"grad_norm": 1282.0911865234375,
"learning_rate": 4.25e-05,
"loss": 135.6942,
"step": 4700
},
{
"epoch": 1.06171201061712,
"grad_norm": 542.7197265625,
"learning_rate": 4.222222222222222e-05,
"loss": 174.3882,
"step": 4800
},
{
"epoch": 1.08383101083831,
"grad_norm": 1587.159423828125,
"learning_rate": 4.194444444444445e-05,
"loss": 153.1831,
"step": 4900
},
{
"epoch": 1.1059500110595002,
"grad_norm": 695.9743041992188,
"learning_rate": 4.166666666666667e-05,
"loss": 109.4728,
"step": 5000
},
{
"epoch": 1.1280690112806901,
"grad_norm": 515.8330078125,
"learning_rate": 4.138888888888889e-05,
"loss": 136.7678,
"step": 5100
},
{
"epoch": 1.15018801150188,
"grad_norm": 586.1018676757812,
"learning_rate": 4.111111111111111e-05,
"loss": 84.0752,
"step": 5200
},
{
"epoch": 1.17230701172307,
"grad_norm": 995.541259765625,
"learning_rate": 4.0833333333333334e-05,
"loss": 155.4204,
"step": 5300
},
{
"epoch": 1.1944260119442602,
"grad_norm": 742.328125,
"learning_rate": 4.055555555555556e-05,
"loss": 72.3888,
"step": 5400
},
{
"epoch": 1.2165450121654502,
"grad_norm": 684.4664916992188,
"learning_rate": 4.027777777777778e-05,
"loss": 144.5675,
"step": 5500
},
{
"epoch": 1.2386640123866401,
"grad_norm": 810.091552734375,
"learning_rate": 4e-05,
"loss": 105.8335,
"step": 5600
},
{
"epoch": 1.26078301260783,
"grad_norm": 1068.27783203125,
"learning_rate": 3.972222222222222e-05,
"loss": 108.972,
"step": 5700
},
{
"epoch": 1.28290201282902,
"grad_norm": 902.8329467773438,
"learning_rate": 3.944444444444445e-05,
"loss": 114.8165,
"step": 5800
},
{
"epoch": 1.3050210130502102,
"grad_norm": 1491.9244384765625,
"learning_rate": 3.9166666666666665e-05,
"loss": 88.0044,
"step": 5900
},
{
"epoch": 1.3271400132714002,
"grad_norm": 441.6331481933594,
"learning_rate": 3.888888888888889e-05,
"loss": 105.6323,
"step": 6000
},
{
"epoch": 1.3492590134925901,
"grad_norm": 121.39036560058594,
"learning_rate": 3.8611111111111116e-05,
"loss": 140.8393,
"step": 6100
},
{
"epoch": 1.37137801371378,
"grad_norm": 839.1984252929688,
"learning_rate": 3.8333333333333334e-05,
"loss": 97.4277,
"step": 6200
},
{
"epoch": 1.39349701393497,
"grad_norm": 612.2739868164062,
"learning_rate": 3.805555555555555e-05,
"loss": 80.9864,
"step": 6300
},
{
"epoch": 1.4156160141561602,
"grad_norm": 588.4722290039062,
"learning_rate": 3.777777777777778e-05,
"loss": 104.521,
"step": 6400
},
{
"epoch": 1.4377350143773502,
"grad_norm": 325.037353515625,
"learning_rate": 3.7500000000000003e-05,
"loss": 131.7437,
"step": 6500
},
{
"epoch": 1.4598540145985401,
"grad_norm": 196.6481170654297,
"learning_rate": 3.722222222222222e-05,
"loss": 117.8643,
"step": 6600
},
{
"epoch": 1.48197301481973,
"grad_norm": 424.9925231933594,
"learning_rate": 3.694444444444445e-05,
"loss": 130.4872,
"step": 6700
},
{
"epoch": 1.50409201504092,
"grad_norm": 796.464599609375,
"learning_rate": 3.6666666666666666e-05,
"loss": 115.8529,
"step": 6800
},
{
"epoch": 1.5262110152621102,
"grad_norm": 2076.1787109375,
"learning_rate": 3.638888888888889e-05,
"loss": 185.1337,
"step": 6900
},
{
"epoch": 1.5483300154833002,
"grad_norm": 383.5282287597656,
"learning_rate": 3.611111111111111e-05,
"loss": 78.6264,
"step": 7000
},
{
"epoch": 1.5704490157044901,
"grad_norm": 591.9740600585938,
"learning_rate": 3.5833333333333335e-05,
"loss": 86.4212,
"step": 7100
},
{
"epoch": 1.5925680159256803,
"grad_norm": 815.822998046875,
"learning_rate": 3.555555555555556e-05,
"loss": 105.2421,
"step": 7200
},
{
"epoch": 1.61468701614687,
"grad_norm": 698.5433959960938,
"learning_rate": 3.527777777777778e-05,
"loss": 107.0973,
"step": 7300
},
{
"epoch": 1.6368060163680602,
"grad_norm": 260.39154052734375,
"learning_rate": 3.5e-05,
"loss": 102.5785,
"step": 7400
},
{
"epoch": 1.6589250165892502,
"grad_norm": 8398.65234375,
"learning_rate": 3.472222222222222e-05,
"loss": 145.6558,
"step": 7500
},
{
"epoch": 1.6810440168104401,
"grad_norm": 929.0120239257812,
"learning_rate": 3.444444444444445e-05,
"loss": 93.9839,
"step": 7600
},
{
"epoch": 1.7031630170316303,
"grad_norm": 627.1079711914062,
"learning_rate": 3.4166666666666666e-05,
"loss": 125.3684,
"step": 7700
},
{
"epoch": 1.72528201725282,
"grad_norm": 408.4013671875,
"learning_rate": 3.388888888888889e-05,
"loss": 91.5595,
"step": 7800
},
{
"epoch": 1.7474010174740102,
"grad_norm": 304.0116882324219,
"learning_rate": 3.3611111111111116e-05,
"loss": 79.996,
"step": 7900
},
{
"epoch": 1.7695200176952002,
"grad_norm": 985.1416625976562,
"learning_rate": 3.3333333333333335e-05,
"loss": 106.5055,
"step": 8000
},
{
"epoch": 1.7916390179163901,
"grad_norm": 381.7845764160156,
"learning_rate": 3.3055555555555553e-05,
"loss": 184.2213,
"step": 8100
},
{
"epoch": 1.8137580181375803,
"grad_norm": 739.7052612304688,
"learning_rate": 3.277777777777778e-05,
"loss": 81.3863,
"step": 8200
},
{
"epoch": 1.83587701835877,
"grad_norm": 315.7475280761719,
"learning_rate": 3.2500000000000004e-05,
"loss": 92.0976,
"step": 8300
},
{
"epoch": 1.8579960185799602,
"grad_norm": 326.6717529296875,
"learning_rate": 3.222222222222223e-05,
"loss": 112.162,
"step": 8400
},
{
"epoch": 1.8801150188011502,
"grad_norm": 322.20086669921875,
"learning_rate": 3.194444444444444e-05,
"loss": 111.4809,
"step": 8500
},
{
"epoch": 1.9022340190223401,
"grad_norm": 1028.286376953125,
"learning_rate": 3.1666666666666666e-05,
"loss": 56.9394,
"step": 8600
},
{
"epoch": 1.9243530192435303,
"grad_norm": 696.5010986328125,
"learning_rate": 3.138888888888889e-05,
"loss": 100.1622,
"step": 8700
},
{
"epoch": 1.94647201946472,
"grad_norm": 227.0893096923828,
"learning_rate": 3.111111111111111e-05,
"loss": 102.4857,
"step": 8800
},
{
"epoch": 1.9685910196859102,
"grad_norm": 571.6776123046875,
"learning_rate": 3.0833333333333335e-05,
"loss": 97.7735,
"step": 8900
},
{
"epoch": 1.9907100199071002,
"grad_norm": 2159.34716796875,
"learning_rate": 3.055555555555556e-05,
"loss": 138.7627,
"step": 9000
},
{
"epoch": 2.0,
"eval_loss": 284.751708984375,
"eval_runtime": 13.2962,
"eval_samples_per_second": 151.171,
"eval_steps_per_second": 37.83,
"step": 9042
},
{
"epoch": 2.01282902012829,
"grad_norm": 425.97235107421875,
"learning_rate": 3.0277777777777776e-05,
"loss": 110.9788,
"step": 9100
},
{
"epoch": 2.0349480203494803,
"grad_norm": 428.1395568847656,
"learning_rate": 3e-05,
"loss": 88.1351,
"step": 9200
},
{
"epoch": 2.05706702057067,
"grad_norm": 409.4471130371094,
"learning_rate": 2.9722222222222223e-05,
"loss": 99.0849,
"step": 9300
},
{
"epoch": 2.07918602079186,
"grad_norm": 296.4221496582031,
"learning_rate": 2.9444444444444448e-05,
"loss": 104.3382,
"step": 9400
},
{
"epoch": 2.1013050210130504,
"grad_norm": 740.9823608398438,
"learning_rate": 2.916666666666667e-05,
"loss": 72.3218,
"step": 9500
},
{
"epoch": 2.12342402123424,
"grad_norm": 547.3069458007812,
"learning_rate": 2.8888888888888888e-05,
"loss": 56.0866,
"step": 9600
},
{
"epoch": 2.1455430214554303,
"grad_norm": 448.69793701171875,
"learning_rate": 2.861111111111111e-05,
"loss": 92.5643,
"step": 9700
},
{
"epoch": 2.16766202167662,
"grad_norm": 626.5888671875,
"learning_rate": 2.8333333333333335e-05,
"loss": 90.8,
"step": 9800
},
{
"epoch": 2.18978102189781,
"grad_norm": 935.4774780273438,
"learning_rate": 2.8055555555555557e-05,
"loss": 56.0015,
"step": 9900
},
{
"epoch": 2.2119000221190004,
"grad_norm": 578.9419555664062,
"learning_rate": 2.777777777777778e-05,
"loss": 63.4371,
"step": 10000
},
{
"epoch": 2.23401902234019,
"grad_norm": 434.9461364746094,
"learning_rate": 2.7500000000000004e-05,
"loss": 81.4173,
"step": 10100
},
{
"epoch": 2.2561380225613803,
"grad_norm": 823.6088256835938,
"learning_rate": 2.7222222222222223e-05,
"loss": 91.1232,
"step": 10200
},
{
"epoch": 2.2782570227825705,
"grad_norm": 12951.28515625,
"learning_rate": 2.6944444444444445e-05,
"loss": 83.2001,
"step": 10300
},
{
"epoch": 2.30037602300376,
"grad_norm": 466.7561950683594,
"learning_rate": 2.6666666666666667e-05,
"loss": 105.8661,
"step": 10400
},
{
"epoch": 2.3224950232249504,
"grad_norm": 439.0936584472656,
"learning_rate": 2.6388888888888892e-05,
"loss": 131.327,
"step": 10500
},
{
"epoch": 2.34461402344614,
"grad_norm": 822.1676635742188,
"learning_rate": 2.6111111111111114e-05,
"loss": 93.9335,
"step": 10600
},
{
"epoch": 2.3667330236673303,
"grad_norm": 1097.796875,
"learning_rate": 2.5833333333333336e-05,
"loss": 114.3786,
"step": 10700
},
{
"epoch": 2.3888520238885205,
"grad_norm": 513.4930419921875,
"learning_rate": 2.5555555555555554e-05,
"loss": 117.5649,
"step": 10800
},
{
"epoch": 2.41097102410971,
"grad_norm": 488.8702087402344,
"learning_rate": 2.527777777777778e-05,
"loss": 93.1184,
"step": 10900
},
{
"epoch": 2.4330900243309004,
"grad_norm": 16564.607421875,
"learning_rate": 2.5e-05,
"loss": 101.7624,
"step": 11000
},
{
"epoch": 2.45520902455209,
"grad_norm": 905.677978515625,
"learning_rate": 2.4722222222222223e-05,
"loss": 110.5879,
"step": 11100
},
{
"epoch": 2.4773280247732803,
"grad_norm": 289.8318786621094,
"learning_rate": 2.4444444444444445e-05,
"loss": 61.5242,
"step": 11200
},
{
"epoch": 2.4994470249944705,
"grad_norm": 457.4600524902344,
"learning_rate": 2.4166666666666667e-05,
"loss": 92.5947,
"step": 11300
},
{
"epoch": 2.52156602521566,
"grad_norm": 1546.554931640625,
"learning_rate": 2.3888888888888892e-05,
"loss": 131.2207,
"step": 11400
},
{
"epoch": 2.5436850254368504,
"grad_norm": 452.04412841796875,
"learning_rate": 2.361111111111111e-05,
"loss": 66.9818,
"step": 11500
},
{
"epoch": 2.56580402565804,
"grad_norm": 7368.85595703125,
"learning_rate": 2.3333333333333336e-05,
"loss": 128.0609,
"step": 11600
},
{
"epoch": 2.5879230258792303,
"grad_norm": 833.9178466796875,
"learning_rate": 2.3055555555555558e-05,
"loss": 82.7991,
"step": 11700
},
{
"epoch": 2.6100420261004205,
"grad_norm": 488.8749084472656,
"learning_rate": 2.277777777777778e-05,
"loss": 70.6396,
"step": 11800
},
{
"epoch": 2.63216102632161,
"grad_norm": 525.4971923828125,
"learning_rate": 2.25e-05,
"loss": 81.1729,
"step": 11900
},
{
"epoch": 2.6542800265428004,
"grad_norm": 506.8924865722656,
"learning_rate": 2.2222222222222223e-05,
"loss": 70.9919,
"step": 12000
},
{
"epoch": 2.67639902676399,
"grad_norm": 272.2956848144531,
"learning_rate": 2.1944444444444445e-05,
"loss": 83.741,
"step": 12100
},
{
"epoch": 2.6985180269851803,
"grad_norm": 153.6388702392578,
"learning_rate": 2.1666666666666667e-05,
"loss": 82.1675,
"step": 12200
},
{
"epoch": 2.7206370272063705,
"grad_norm": 668.1585693359375,
"learning_rate": 2.138888888888889e-05,
"loss": 93.7683,
"step": 12300
},
{
"epoch": 2.74275602742756,
"grad_norm": 386.3387756347656,
"learning_rate": 2.111111111111111e-05,
"loss": 69.9565,
"step": 12400
},
{
"epoch": 2.7648750276487504,
"grad_norm": 456.8686828613281,
"learning_rate": 2.0833333333333336e-05,
"loss": 86.7724,
"step": 12500
},
{
"epoch": 2.78699402786994,
"grad_norm": 300.98541259765625,
"learning_rate": 2.0555555555555555e-05,
"loss": 59.863,
"step": 12600
},
{
"epoch": 2.8091130280911303,
"grad_norm": 434.6181945800781,
"learning_rate": 2.027777777777778e-05,
"loss": 92.5719,
"step": 12700
},
{
"epoch": 2.8312320283123205,
"grad_norm": 1306.5218505859375,
"learning_rate": 2e-05,
"loss": 72.4254,
"step": 12800
},
{
"epoch": 2.85335102853351,
"grad_norm": 255.54293823242188,
"learning_rate": 1.9722222222222224e-05,
"loss": 55.263,
"step": 12900
},
{
"epoch": 2.8754700287547004,
"grad_norm": 1816.9591064453125,
"learning_rate": 1.9444444444444445e-05,
"loss": 72.7496,
"step": 13000
},
{
"epoch": 2.89758902897589,
"grad_norm": 831.1156616210938,
"learning_rate": 1.9166666666666667e-05,
"loss": 92.1811,
"step": 13100
},
{
"epoch": 2.9197080291970803,
"grad_norm": 2121.381591796875,
"learning_rate": 1.888888888888889e-05,
"loss": 84.3183,
"step": 13200
},
{
"epoch": 2.9418270294182705,
"grad_norm": 348.0662841796875,
"learning_rate": 1.861111111111111e-05,
"loss": 79.233,
"step": 13300
},
{
"epoch": 2.96394602963946,
"grad_norm": 502.2306823730469,
"learning_rate": 1.8333333333333333e-05,
"loss": 86.6864,
"step": 13400
},
{
"epoch": 2.9860650298606504,
"grad_norm": 498.3459777832031,
"learning_rate": 1.8055555555555555e-05,
"loss": 67.7695,
"step": 13500
},
{
"epoch": 3.0,
"eval_loss": 190.48678588867188,
"eval_runtime": 13.6446,
"eval_samples_per_second": 147.311,
"eval_steps_per_second": 36.864,
"step": 13563
},
{
"epoch": 3.00818403008184,
"grad_norm": 691.843505859375,
"learning_rate": 1.777777777777778e-05,
"loss": 85.3921,
"step": 13600
},
{
"epoch": 3.0303030303030303,
"grad_norm": 1307.369140625,
"learning_rate": 1.75e-05,
"loss": 68.4007,
"step": 13700
},
{
"epoch": 3.0524220305242205,
"grad_norm": 329.44451904296875,
"learning_rate": 1.7222222222222224e-05,
"loss": 74.87,
"step": 13800
},
{
"epoch": 3.07454103074541,
"grad_norm": 2009.1265869140625,
"learning_rate": 1.6944444444444446e-05,
"loss": 78.6698,
"step": 13900
},
{
"epoch": 3.0966600309666004,
"grad_norm": 893.1031494140625,
"learning_rate": 1.6666666666666667e-05,
"loss": 72.2353,
"step": 14000
},
{
"epoch": 3.11877903118779,
"grad_norm": 455.1803283691406,
"learning_rate": 1.638888888888889e-05,
"loss": 54.7878,
"step": 14100
},
{
"epoch": 3.1408980314089803,
"grad_norm": 494.5647888183594,
"learning_rate": 1.6111111111111115e-05,
"loss": 70.0231,
"step": 14200
},
{
"epoch": 3.1630170316301705,
"grad_norm": 356.6091613769531,
"learning_rate": 1.5833333333333333e-05,
"loss": 77.7633,
"step": 14300
},
{
"epoch": 3.18513603185136,
"grad_norm": 1459.70458984375,
"learning_rate": 1.5555555555555555e-05,
"loss": 85.1146,
"step": 14400
},
{
"epoch": 3.2072550320725504,
"grad_norm": 1403.0411376953125,
"learning_rate": 1.527777777777778e-05,
"loss": 81.8396,
"step": 14500
},
{
"epoch": 3.22937403229374,
"grad_norm": 618.7579345703125,
"learning_rate": 1.5e-05,
"loss": 60.2011,
"step": 14600
},
{
"epoch": 3.2514930325149303,
"grad_norm": 283.14459228515625,
"learning_rate": 1.4722222222222224e-05,
"loss": 51.0029,
"step": 14700
},
{
"epoch": 3.2736120327361204,
"grad_norm": 217.41241455078125,
"learning_rate": 1.4444444444444444e-05,
"loss": 74.5451,
"step": 14800
},
{
"epoch": 3.29573103295731,
"grad_norm": 205.61912536621094,
"learning_rate": 1.4166666666666668e-05,
"loss": 89.2503,
"step": 14900
},
{
"epoch": 3.3178500331785004,
"grad_norm": 1901.0181884765625,
"learning_rate": 1.388888888888889e-05,
"loss": 60.1773,
"step": 15000
},
{
"epoch": 3.33996903339969,
"grad_norm": 3059.44677734375,
"learning_rate": 1.3611111111111111e-05,
"loss": 74.0471,
"step": 15100
},
{
"epoch": 3.3620880336208803,
"grad_norm": 742.3741455078125,
"learning_rate": 1.3333333333333333e-05,
"loss": 70.3482,
"step": 15200
},
{
"epoch": 3.3842070338420704,
"grad_norm": 1228.872802734375,
"learning_rate": 1.3055555555555557e-05,
"loss": 70.8862,
"step": 15300
},
{
"epoch": 3.40632603406326,
"grad_norm": 255.646728515625,
"learning_rate": 1.2777777777777777e-05,
"loss": 91.8017,
"step": 15400
},
{
"epoch": 3.4284450342844504,
"grad_norm": 112406.4921875,
"learning_rate": 1.25e-05,
"loss": 134.3668,
"step": 15500
},
{
"epoch": 3.4505640345056405,
"grad_norm": 3105.224609375,
"learning_rate": 1.2222222222222222e-05,
"loss": 65.372,
"step": 15600
},
{
"epoch": 3.4726830347268303,
"grad_norm": 357.5060119628906,
"learning_rate": 1.1944444444444446e-05,
"loss": 76.7241,
"step": 15700
},
{
"epoch": 3.4948020349480204,
"grad_norm": 347.73797607421875,
"learning_rate": 1.1666666666666668e-05,
"loss": 66.5545,
"step": 15800
},
{
"epoch": 3.5169210351692106,
"grad_norm": 747.213134765625,
"learning_rate": 1.138888888888889e-05,
"loss": 85.4001,
"step": 15900
},
{
"epoch": 3.5390400353904004,
"grad_norm": 637.4097290039062,
"learning_rate": 1.1111111111111112e-05,
"loss": 67.5741,
"step": 16000
},
{
"epoch": 3.56115903561159,
"grad_norm": 539.6690673828125,
"learning_rate": 1.0833333333333334e-05,
"loss": 58.0714,
"step": 16100
},
{
"epoch": 3.5832780358327803,
"grad_norm": 164.90731811523438,
"learning_rate": 1.0555555555555555e-05,
"loss": 71.1992,
"step": 16200
},
{
"epoch": 3.6053970360539704,
"grad_norm": 1147.67822265625,
"learning_rate": 1.0277777777777777e-05,
"loss": 69.1552,
"step": 16300
},
{
"epoch": 3.6275160362751606,
"grad_norm": 1046.4427490234375,
"learning_rate": 1e-05,
"loss": 76.3746,
"step": 16400
},
{
"epoch": 3.6496350364963503,
"grad_norm": 694.0726318359375,
"learning_rate": 9.722222222222223e-06,
"loss": 72.2259,
"step": 16500
},
{
"epoch": 3.6717540367175405,
"grad_norm": 791.6326293945312,
"learning_rate": 9.444444444444445e-06,
"loss": 61.315,
"step": 16600
},
{
"epoch": 3.6938730369387303,
"grad_norm": 2267.64501953125,
"learning_rate": 9.166666666666666e-06,
"loss": 70.8261,
"step": 16700
},
{
"epoch": 3.7159920371599204,
"grad_norm": 420.0555114746094,
"learning_rate": 8.88888888888889e-06,
"loss": 82.5815,
"step": 16800
},
{
"epoch": 3.7381110373811106,
"grad_norm": 846.6657104492188,
"learning_rate": 8.611111111111112e-06,
"loss": 76.4155,
"step": 16900
},
{
"epoch": 3.7602300376023003,
"grad_norm": 518.072998046875,
"learning_rate": 8.333333333333334e-06,
"loss": 77.4135,
"step": 17000
},
{
"epoch": 3.7823490378234905,
"grad_norm": 1067.0804443359375,
"learning_rate": 8.055555555555557e-06,
"loss": 78.471,
"step": 17100
},
{
"epoch": 3.8044680380446803,
"grad_norm": 1737.0146484375,
"learning_rate": 7.777777777777777e-06,
"loss": 57.732,
"step": 17200
},
{
"epoch": 3.8265870382658704,
"grad_norm": 377.42120361328125,
"learning_rate": 7.5e-06,
"loss": 51.0151,
"step": 17300
},
{
"epoch": 3.8487060384870606,
"grad_norm": 544.689697265625,
"learning_rate": 7.222222222222222e-06,
"loss": 73.9021,
"step": 17400
},
{
"epoch": 3.8708250387082503,
"grad_norm": 198.7846221923828,
"learning_rate": 6.944444444444445e-06,
"loss": 55.6872,
"step": 17500
},
{
"epoch": 3.8929440389294405,
"grad_norm": 169.698486328125,
"learning_rate": 6.666666666666667e-06,
"loss": 77.5166,
"step": 17600
},
{
"epoch": 3.9150630391506303,
"grad_norm": 671.6690673828125,
"learning_rate": 6.3888888888888885e-06,
"loss": 71.126,
"step": 17700
},
{
"epoch": 3.9371820393718204,
"grad_norm": 267.1748046875,
"learning_rate": 6.111111111111111e-06,
"loss": 68.3726,
"step": 17800
},
{
"epoch": 3.9593010395930106,
"grad_norm": 958.6556396484375,
"learning_rate": 5.833333333333334e-06,
"loss": 48.5082,
"step": 17900
},
{
"epoch": 3.9814200398142003,
"grad_norm": 981.7296142578125,
"learning_rate": 5.555555555555556e-06,
"loss": 75.4993,
"step": 18000
},
{
"epoch": 4.0,
"eval_loss": 184.00929260253906,
"eval_runtime": 13.5718,
"eval_samples_per_second": 148.101,
"eval_steps_per_second": 37.062,
"step": 18084
},
{
"epoch": 4.00353904003539,
"grad_norm": 1069.6796875,
"learning_rate": 5.277777777777778e-06,
"loss": 68.5521,
"step": 18100
},
{
"epoch": 4.02565804025658,
"grad_norm": 358.9207763671875,
"learning_rate": 5e-06,
"loss": 61.5938,
"step": 18200
},
{
"epoch": 4.04777704047777,
"grad_norm": 1874.0775146484375,
"learning_rate": 4.722222222222222e-06,
"loss": 61.3293,
"step": 18300
},
{
"epoch": 4.069896040698961,
"grad_norm": 1064.0521240234375,
"learning_rate": 4.444444444444445e-06,
"loss": 61.2526,
"step": 18400
},
{
"epoch": 4.092015040920151,
"grad_norm": 530.5929565429688,
"learning_rate": 4.166666666666667e-06,
"loss": 70.5009,
"step": 18500
},
{
"epoch": 4.11413404114134,
"grad_norm": 630.8464965820312,
"learning_rate": 3.888888888888889e-06,
"loss": 73.1008,
"step": 18600
},
{
"epoch": 4.13625304136253,
"grad_norm": 806.4486083984375,
"learning_rate": 3.611111111111111e-06,
"loss": 61.8402,
"step": 18700
},
{
"epoch": 4.15837204158372,
"grad_norm": 267.43902587890625,
"learning_rate": 3.3333333333333333e-06,
"loss": 50.5993,
"step": 18800
},
{
"epoch": 4.180491041804911,
"grad_norm": 289.7193298339844,
"learning_rate": 3.0555555555555556e-06,
"loss": 82.3094,
"step": 18900
},
{
"epoch": 4.202610042026101,
"grad_norm": 287.5179748535156,
"learning_rate": 2.777777777777778e-06,
"loss": 67.3702,
"step": 19000
},
{
"epoch": 4.22472904224729,
"grad_norm": 723.5756225585938,
"learning_rate": 2.5e-06,
"loss": 69.6824,
"step": 19100
},
{
"epoch": 4.24684804246848,
"grad_norm": 1228.8450927734375,
"learning_rate": 2.2222222222222225e-06,
"loss": 67.9151,
"step": 19200
},
{
"epoch": 4.26896704268967,
"grad_norm": 187.32969665527344,
"learning_rate": 1.9444444444444444e-06,
"loss": 63.4367,
"step": 19300
},
{
"epoch": 4.291086042910861,
"grad_norm": 354.1274108886719,
"learning_rate": 1.6666666666666667e-06,
"loss": 74.4323,
"step": 19400
},
{
"epoch": 4.313205043132051,
"grad_norm": 1628.03857421875,
"learning_rate": 1.388888888888889e-06,
"loss": 74.6173,
"step": 19500
},
{
"epoch": 4.33532404335324,
"grad_norm": 286.3065185546875,
"learning_rate": 1.1111111111111112e-06,
"loss": 66.8454,
"step": 19600
},
{
"epoch": 4.35744304357443,
"grad_norm": 140.7686767578125,
"learning_rate": 8.333333333333333e-07,
"loss": 40.7083,
"step": 19700
},
{
"epoch": 4.37956204379562,
"grad_norm": 907.8124389648438,
"learning_rate": 5.555555555555556e-07,
"loss": 79.8728,
"step": 19800
},
{
"epoch": 4.401681044016811,
"grad_norm": 498.08941650390625,
"learning_rate": 2.777777777777778e-07,
"loss": 68.5423,
"step": 19900
},
{
"epoch": 4.423800044238001,
"grad_norm": 380.51446533203125,
"learning_rate": 0.0,
"loss": 58.604,
"step": 20000
}
],
"logging_steps": 100,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}