{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 430.7713928222656,
"learning_rate": 9.996e-06,
"loss": 9.3758,
"step": 5
},
{
"epoch": 0.001,
"grad_norm": 115.31803894042969,
"learning_rate": 9.991000000000001e-06,
"loss": 7.1112,
"step": 10
},
{
"epoch": 0.0015,
"grad_norm": 107.39344787597656,
"learning_rate": 9.986e-06,
"loss": 7.7463,
"step": 15
},
{
"epoch": 0.002,
"grad_norm": 81.79249572753906,
"learning_rate": 9.981000000000002e-06,
"loss": 6.9878,
"step": 20
},
{
"epoch": 0.0025,
"grad_norm": 81.8421859741211,
"learning_rate": 9.976e-06,
"loss": 5.8778,
"step": 25
},
{
"epoch": 0.003,
"grad_norm": 97.48958587646484,
"learning_rate": 9.971e-06,
"loss": 6.7617,
"step": 30
},
{
"epoch": 0.0035,
"grad_norm": 67.10735321044922,
"learning_rate": 9.966e-06,
"loss": 6.6815,
"step": 35
},
{
"epoch": 0.004,
"grad_norm": 91.79243469238281,
"learning_rate": 9.961e-06,
"loss": 5.5256,
"step": 40
},
{
"epoch": 0.0045,
"grad_norm": 73.99298858642578,
"learning_rate": 9.956000000000001e-06,
"loss": 6.301,
"step": 45
},
{
"epoch": 0.005,
"grad_norm": 82.86575317382812,
"learning_rate": 9.951e-06,
"loss": 6.0791,
"step": 50
},
{
"epoch": 0.0055,
"grad_norm": 79.45413208007812,
"learning_rate": 9.946000000000001e-06,
"loss": 6.4806,
"step": 55
},
{
"epoch": 0.006,
"grad_norm": 93.1746597290039,
"learning_rate": 9.941e-06,
"loss": 6.0065,
"step": 60
},
{
"epoch": 0.0065,
"grad_norm": 80.89791107177734,
"learning_rate": 9.936000000000001e-06,
"loss": 7.1122,
"step": 65
},
{
"epoch": 0.007,
"grad_norm": 71.02462005615234,
"learning_rate": 9.931e-06,
"loss": 5.8262,
"step": 70
},
{
"epoch": 0.0075,
"grad_norm": 88.24502563476562,
"learning_rate": 9.926000000000001e-06,
"loss": 6.1168,
"step": 75
},
{
"epoch": 0.008,
"grad_norm": 85.9001693725586,
"learning_rate": 9.921e-06,
"loss": 6.6863,
"step": 80
},
{
"epoch": 0.0085,
"grad_norm": 86.51998138427734,
"learning_rate": 9.916000000000001e-06,
"loss": 6.4471,
"step": 85
},
{
"epoch": 0.009,
"grad_norm": 66.9721450805664,
"learning_rate": 9.911e-06,
"loss": 6.1214,
"step": 90
},
{
"epoch": 0.0095,
"grad_norm": 81.62995147705078,
"learning_rate": 9.906000000000001e-06,
"loss": 7.0519,
"step": 95
},
{
"epoch": 0.01,
"grad_norm": 76.39604949951172,
"learning_rate": 9.901e-06,
"loss": 5.5445,
"step": 100
},
{
"epoch": 0.0105,
"grad_norm": 84.44085693359375,
"learning_rate": 9.896000000000001e-06,
"loss": 6.6272,
"step": 105
},
{
"epoch": 0.011,
"grad_norm": 82.27601623535156,
"learning_rate": 9.891e-06,
"loss": 6.0362,
"step": 110
},
{
"epoch": 0.0115,
"grad_norm": 83.45719146728516,
"learning_rate": 9.886000000000002e-06,
"loss": 6.0516,
"step": 115
},
{
"epoch": 0.012,
"grad_norm": 69.73688507080078,
"learning_rate": 9.881e-06,
"loss": 5.4801,
"step": 120
},
{
"epoch": 0.0125,
"grad_norm": 61.0240364074707,
"learning_rate": 9.876000000000002e-06,
"loss": 4.7676,
"step": 125
},
{
"epoch": 0.013,
"grad_norm": 110.38282775878906,
"learning_rate": 9.871000000000001e-06,
"loss": 6.1181,
"step": 130
},
{
"epoch": 0.0135,
"grad_norm": 68.89127349853516,
"learning_rate": 9.866000000000002e-06,
"loss": 5.1128,
"step": 135
},
{
"epoch": 0.014,
"grad_norm": 120.67707061767578,
"learning_rate": 9.861000000000001e-06,
"loss": 5.8327,
"step": 140
},
{
"epoch": 0.0145,
"grad_norm": 84.3951644897461,
"learning_rate": 9.856000000000002e-06,
"loss": 6.9349,
"step": 145
},
{
"epoch": 0.015,
"grad_norm": 73.58941650390625,
"learning_rate": 9.851000000000001e-06,
"loss": 5.4627,
"step": 150
},
{
"epoch": 0.0155,
"grad_norm": 72.00989532470703,
"learning_rate": 9.846000000000002e-06,
"loss": 5.6341,
"step": 155
},
{
"epoch": 0.016,
"grad_norm": 78.54756927490234,
"learning_rate": 9.841000000000001e-06,
"loss": 5.5729,
"step": 160
},
{
"epoch": 0.0165,
"grad_norm": 94.46847534179688,
"learning_rate": 9.836e-06,
"loss": 6.7456,
"step": 165
},
{
"epoch": 0.017,
"grad_norm": 68.8740463256836,
"learning_rate": 9.831000000000001e-06,
"loss": 6.0283,
"step": 170
},
{
"epoch": 0.0175,
"grad_norm": 67.59810638427734,
"learning_rate": 9.826e-06,
"loss": 5.9708,
"step": 175
},
{
"epoch": 0.018,
"grad_norm": 82.12776184082031,
"learning_rate": 9.821000000000001e-06,
"loss": 6.137,
"step": 180
},
{
"epoch": 0.0185,
"grad_norm": 86.02208709716797,
"learning_rate": 9.816e-06,
"loss": 5.9061,
"step": 185
},
{
"epoch": 0.019,
"grad_norm": 68.40846252441406,
"learning_rate": 9.811e-06,
"loss": 5.9535,
"step": 190
},
{
"epoch": 0.0195,
"grad_norm": 71.01241302490234,
"learning_rate": 9.806e-06,
"loss": 5.3895,
"step": 195
},
{
"epoch": 0.02,
"grad_norm": 69.2813720703125,
"learning_rate": 9.801e-06,
"loss": 5.8592,
"step": 200
},
{
"epoch": 0.0205,
"grad_norm": 91.32173156738281,
"learning_rate": 9.796e-06,
"loss": 6.4962,
"step": 205
},
{
"epoch": 0.021,
"grad_norm": 72.96836853027344,
"learning_rate": 9.791e-06,
"loss": 5.7814,
"step": 210
},
{
"epoch": 0.0215,
"grad_norm": 67.66992950439453,
"learning_rate": 9.786e-06,
"loss": 5.7865,
"step": 215
},
{
"epoch": 0.022,
"grad_norm": 66.08545684814453,
"learning_rate": 9.781e-06,
"loss": 5.8699,
"step": 220
},
{
"epoch": 0.0225,
"grad_norm": 88.2244873046875,
"learning_rate": 9.776000000000001e-06,
"loss": 6.0525,
"step": 225
},
{
"epoch": 0.023,
"grad_norm": 77.0357666015625,
"learning_rate": 9.771e-06,
"loss": 5.8639,
"step": 230
},
{
"epoch": 0.0235,
"grad_norm": 67.81790161132812,
"learning_rate": 9.766000000000001e-06,
"loss": 5.2133,
"step": 235
},
{
"epoch": 0.024,
"grad_norm": 86.99850463867188,
"learning_rate": 9.761e-06,
"loss": 5.5895,
"step": 240
},
{
"epoch": 0.0245,
"grad_norm": 75.6593017578125,
"learning_rate": 9.756000000000001e-06,
"loss": 5.8113,
"step": 245
},
{
"epoch": 0.025,
"grad_norm": 68.56720733642578,
"learning_rate": 9.751e-06,
"loss": 5.2984,
"step": 250
},
{
"epoch": 0.0255,
"grad_norm": 66.91255187988281,
"learning_rate": 9.746000000000001e-06,
"loss": 5.224,
"step": 255
},
{
"epoch": 0.026,
"grad_norm": 98.80131530761719,
"learning_rate": 9.741e-06,
"loss": 5.8157,
"step": 260
},
{
"epoch": 0.0265,
"grad_norm": 74.00731658935547,
"learning_rate": 9.736000000000001e-06,
"loss": 5.6423,
"step": 265
},
{
"epoch": 0.027,
"grad_norm": 66.43975830078125,
"learning_rate": 9.731e-06,
"loss": 4.4168,
"step": 270
},
{
"epoch": 0.0275,
"grad_norm": 78.20140838623047,
"learning_rate": 9.726000000000001e-06,
"loss": 6.0293,
"step": 275
},
{
"epoch": 0.028,
"grad_norm": 76.42058563232422,
"learning_rate": 9.721e-06,
"loss": 6.1047,
"step": 280
},
{
"epoch": 0.0285,
"grad_norm": 69.54177856445312,
"learning_rate": 9.716000000000002e-06,
"loss": 5.6279,
"step": 285
},
{
"epoch": 0.029,
"grad_norm": 71.5013656616211,
"learning_rate": 9.711e-06,
"loss": 5.3354,
"step": 290
},
{
"epoch": 0.0295,
"grad_norm": 69.59915161132812,
"learning_rate": 9.706000000000002e-06,
"loss": 4.9435,
"step": 295
},
{
"epoch": 0.03,
"grad_norm": 68.15699005126953,
"learning_rate": 9.701e-06,
"loss": 6.8088,
"step": 300
},
{
"epoch": 0.0305,
"grad_norm": 83.83426666259766,
"learning_rate": 9.696000000000002e-06,
"loss": 6.3353,
"step": 305
},
{
"epoch": 0.031,
"grad_norm": 100.34281921386719,
"learning_rate": 9.691000000000001e-06,
"loss": 6.3657,
"step": 310
},
{
"epoch": 0.0315,
"grad_norm": 60.91353225708008,
"learning_rate": 9.686000000000002e-06,
"loss": 5.495,
"step": 315
},
{
"epoch": 0.032,
"grad_norm": 68.33106231689453,
"learning_rate": 9.681000000000001e-06,
"loss": 6.231,
"step": 320
},
{
"epoch": 0.0325,
"grad_norm": 56.71453094482422,
"learning_rate": 9.676e-06,
"loss": 5.9889,
"step": 325
},
{
"epoch": 0.033,
"grad_norm": 68.593505859375,
"learning_rate": 9.671000000000001e-06,
"loss": 6.3372,
"step": 330
},
{
"epoch": 0.0335,
"grad_norm": 66.93341064453125,
"learning_rate": 9.666e-06,
"loss": 5.4936,
"step": 335
},
{
"epoch": 0.034,
"grad_norm": 70.41497802734375,
"learning_rate": 9.661000000000001e-06,
"loss": 6.5437,
"step": 340
},
{
"epoch": 0.0345,
"grad_norm": 55.9371452331543,
"learning_rate": 9.656e-06,
"loss": 6.0847,
"step": 345
},
{
"epoch": 0.035,
"grad_norm": 72.59474182128906,
"learning_rate": 9.651e-06,
"loss": 5.2704,
"step": 350
},
{
"epoch": 0.0355,
"grad_norm": 65.90139770507812,
"learning_rate": 9.646e-06,
"loss": 5.6678,
"step": 355
},
{
"epoch": 0.036,
"grad_norm": 55.99635696411133,
"learning_rate": 9.641e-06,
"loss": 5.2667,
"step": 360
},
{
"epoch": 0.0365,
"grad_norm": 61.77568817138672,
"learning_rate": 9.636e-06,
"loss": 4.7788,
"step": 365
},
{
"epoch": 0.037,
"grad_norm": 62.45391845703125,
"learning_rate": 9.631e-06,
"loss": 5.3709,
"step": 370
},
{
"epoch": 0.0375,
"grad_norm": 77.93061828613281,
"learning_rate": 9.626e-06,
"loss": 6.911,
"step": 375
},
{
"epoch": 0.038,
"grad_norm": 77.62232208251953,
"learning_rate": 9.621e-06,
"loss": 6.6592,
"step": 380
},
{
"epoch": 0.0385,
"grad_norm": 67.21658325195312,
"learning_rate": 9.616e-06,
"loss": 5.5068,
"step": 385
},
{
"epoch": 0.039,
"grad_norm": 61.39186477661133,
"learning_rate": 9.611e-06,
"loss": 6.015,
"step": 390
},
{
"epoch": 0.0395,
"grad_norm": 68.21427154541016,
"learning_rate": 9.606000000000001e-06,
"loss": 5.2284,
"step": 395
},
{
"epoch": 0.04,
"grad_norm": 70.14752197265625,
"learning_rate": 9.601e-06,
"loss": 6.014,
"step": 400
},
{
"epoch": 0.0405,
"grad_norm": 70.03300476074219,
"learning_rate": 9.596000000000001e-06,
"loss": 5.2755,
"step": 405
},
{
"epoch": 0.041,
"grad_norm": 71.4956283569336,
"learning_rate": 9.591e-06,
"loss": 4.7642,
"step": 410
},
{
"epoch": 0.0415,
"grad_norm": 65.81194305419922,
"learning_rate": 9.586000000000001e-06,
"loss": 5.8357,
"step": 415
},
{
"epoch": 0.042,
"grad_norm": 80.47106170654297,
"learning_rate": 9.581e-06,
"loss": 5.1515,
"step": 420
},
{
"epoch": 0.0425,
"grad_norm": 78.7089614868164,
"learning_rate": 9.576000000000001e-06,
"loss": 5.3348,
"step": 425
},
{
"epoch": 0.043,
"grad_norm": 75.13068389892578,
"learning_rate": 9.571e-06,
"loss": 5.3623,
"step": 430
},
{
"epoch": 0.0435,
"grad_norm": 69.73881530761719,
"learning_rate": 9.566000000000001e-06,
"loss": 5.8583,
"step": 435
},
{
"epoch": 0.044,
"grad_norm": 60.87718963623047,
"learning_rate": 9.561e-06,
"loss": 5.3451,
"step": 440
},
{
"epoch": 0.0445,
"grad_norm": 68.39297485351562,
"learning_rate": 9.556000000000001e-06,
"loss": 5.2354,
"step": 445
},
{
"epoch": 0.045,
"grad_norm": 80.71501159667969,
"learning_rate": 9.551e-06,
"loss": 5.6821,
"step": 450
},
{
"epoch": 0.0455,
"grad_norm": 75.60702514648438,
"learning_rate": 9.546000000000001e-06,
"loss": 6.4379,
"step": 455
},
{
"epoch": 0.046,
"grad_norm": 65.37830352783203,
"learning_rate": 9.541e-06,
"loss": 5.7551,
"step": 460
},
{
"epoch": 0.0465,
"grad_norm": 63.455440521240234,
"learning_rate": 9.536000000000002e-06,
"loss": 5.6863,
"step": 465
},
{
"epoch": 0.047,
"grad_norm": 66.560546875,
"learning_rate": 9.531e-06,
"loss": 5.3967,
"step": 470
},
{
"epoch": 0.0475,
"grad_norm": 66.29322052001953,
"learning_rate": 9.526000000000002e-06,
"loss": 6.5743,
"step": 475
},
{
"epoch": 0.048,
"grad_norm": 73.60453033447266,
"learning_rate": 9.521e-06,
"loss": 5.4218,
"step": 480
},
{
"epoch": 0.0485,
"grad_norm": 63.51287841796875,
"learning_rate": 9.516e-06,
"loss": 5.9678,
"step": 485
},
{
"epoch": 0.049,
"grad_norm": 55.5192985534668,
"learning_rate": 9.511000000000001e-06,
"loss": 5.7875,
"step": 490
},
{
"epoch": 0.0495,
"grad_norm": 62.022682189941406,
"learning_rate": 9.506e-06,
"loss": 7.1336,
"step": 495
},
{
"epoch": 0.05,
"grad_norm": 57.97554397583008,
"learning_rate": 9.501000000000001e-06,
"loss": 5.1286,
"step": 500
},
{
"epoch": 0.0505,
"grad_norm": 58.80134582519531,
"learning_rate": 9.496e-06,
"loss": 5.0009,
"step": 505
},
{
"epoch": 0.051,
"grad_norm": 57.95747756958008,
"learning_rate": 9.491000000000001e-06,
"loss": 4.9228,
"step": 510
},
{
"epoch": 0.0515,
"grad_norm": 60.494632720947266,
"learning_rate": 9.486e-06,
"loss": 5.7521,
"step": 515
},
{
"epoch": 0.052,
"grad_norm": 53.30976104736328,
"learning_rate": 9.481000000000001e-06,
"loss": 5.8064,
"step": 520
},
{
"epoch": 0.0525,
"grad_norm": 71.98876190185547,
"learning_rate": 9.476e-06,
"loss": 4.4395,
"step": 525
},
{
"epoch": 0.053,
"grad_norm": 59.24486541748047,
"learning_rate": 9.471000000000001e-06,
"loss": 5.1754,
"step": 530
},
{
"epoch": 0.0535,
"grad_norm": 83.90294647216797,
"learning_rate": 9.466e-06,
"loss": 5.4962,
"step": 535
},
{
"epoch": 0.054,
"grad_norm": 60.665740966796875,
"learning_rate": 9.461000000000001e-06,
"loss": 5.3369,
"step": 540
},
{
"epoch": 0.0545,
"grad_norm": 74.39241790771484,
"learning_rate": 9.456e-06,
"loss": 5.2247,
"step": 545
},
{
"epoch": 0.055,
"grad_norm": 66.4027328491211,
"learning_rate": 9.451000000000002e-06,
"loss": 5.2229,
"step": 550
},
{
"epoch": 0.0555,
"grad_norm": 61.06802749633789,
"learning_rate": 9.446e-06,
"loss": 5.8417,
"step": 555
},
{
"epoch": 0.056,
"grad_norm": 57.60587692260742,
"learning_rate": 9.441000000000002e-06,
"loss": 4.9819,
"step": 560
},
{
"epoch": 0.0565,
"grad_norm": 73.18106079101562,
"learning_rate": 9.436e-06,
"loss": 6.4829,
"step": 565
},
{
"epoch": 0.057,
"grad_norm": 70.94274139404297,
"learning_rate": 9.431000000000002e-06,
"loss": 5.8026,
"step": 570
},
{
"epoch": 0.0575,
"grad_norm": 72.48558044433594,
"learning_rate": 9.426000000000001e-06,
"loss": 5.5023,
"step": 575
},
{
"epoch": 0.058,
"grad_norm": 46.47978591918945,
"learning_rate": 9.421000000000002e-06,
"loss": 4.4358,
"step": 580
},
{
"epoch": 0.0585,
"grad_norm": 59.145790100097656,
"learning_rate": 9.416000000000001e-06,
"loss": 5.5065,
"step": 585
},
{
"epoch": 0.059,
"grad_norm": 59.508522033691406,
"learning_rate": 9.411000000000002e-06,
"loss": 4.9478,
"step": 590
},
{
"epoch": 0.0595,
"grad_norm": 83.52436065673828,
"learning_rate": 9.406000000000001e-06,
"loss": 5.4239,
"step": 595
},
{
"epoch": 0.06,
"grad_norm": 63.06760025024414,
"learning_rate": 9.401000000000002e-06,
"loss": 5.2395,
"step": 600
},
{
"epoch": 0.0605,
"grad_norm": 77.39131927490234,
"learning_rate": 9.396000000000001e-06,
"loss": 4.9941,
"step": 605
},
{
"epoch": 0.061,
"grad_norm": 72.02144622802734,
"learning_rate": 9.391e-06,
"loss": 6.3654,
"step": 610
},
{
"epoch": 0.0615,
"grad_norm": 78.32617950439453,
"learning_rate": 9.386000000000001e-06,
"loss": 5.6073,
"step": 615
},
{
"epoch": 0.062,
"grad_norm": 64.85309600830078,
"learning_rate": 9.381e-06,
"loss": 5.1284,
"step": 620
},
{
"epoch": 0.0625,
"grad_norm": 65.42955017089844,
"learning_rate": 9.376000000000001e-06,
"loss": 5.4364,
"step": 625
},
{
"epoch": 0.063,
"grad_norm": 72.40003204345703,
"learning_rate": 9.371e-06,
"loss": 4.9955,
"step": 630
},
{
"epoch": 0.0635,
"grad_norm": 85.38143920898438,
"learning_rate": 9.366000000000001e-06,
"loss": 6.3228,
"step": 635
},
{
"epoch": 0.064,
"grad_norm": 63.02680587768555,
"learning_rate": 9.361e-06,
"loss": 5.5719,
"step": 640
},
{
"epoch": 0.0645,
"grad_norm": 71.41946411132812,
"learning_rate": 9.356e-06,
"loss": 6.0309,
"step": 645
},
{
"epoch": 0.065,
"grad_norm": 61.37795639038086,
"learning_rate": 9.351e-06,
"loss": 5.2103,
"step": 650
},
{
"epoch": 0.0655,
"grad_norm": 49.167747497558594,
"learning_rate": 9.346e-06,
"loss": 4.906,
"step": 655
},
{
"epoch": 0.066,
"grad_norm": 61.07039260864258,
"learning_rate": 9.341000000000001e-06,
"loss": 4.6488,
"step": 660
},
{
"epoch": 0.0665,
"grad_norm": 57.769874572753906,
"learning_rate": 9.336e-06,
"loss": 6.048,
"step": 665
},
{
"epoch": 0.067,
"grad_norm": 60.57929229736328,
"learning_rate": 9.331000000000001e-06,
"loss": 5.1988,
"step": 670
},
{
"epoch": 0.0675,
"grad_norm": 51.742740631103516,
"learning_rate": 9.326e-06,
"loss": 4.782,
"step": 675
},
{
"epoch": 0.068,
"grad_norm": 66.58326721191406,
"learning_rate": 9.321000000000001e-06,
"loss": 5.2994,
"step": 680
},
{
"epoch": 0.0685,
"grad_norm": 51.75037384033203,
"learning_rate": 9.316e-06,
"loss": 6.0222,
"step": 685
},
{
"epoch": 0.069,
"grad_norm": 74.25454711914062,
"learning_rate": 9.311000000000001e-06,
"loss": 5.7173,
"step": 690
},
{
"epoch": 0.0695,
"grad_norm": 77.25601959228516,
"learning_rate": 9.306e-06,
"loss": 5.3204,
"step": 695
},
{
"epoch": 0.07,
"grad_norm": 83.57421875,
"learning_rate": 9.301000000000001e-06,
"loss": 4.3073,
"step": 700
},
{
"epoch": 0.0705,
"grad_norm": 72.01386260986328,
"learning_rate": 9.296e-06,
"loss": 6.278,
"step": 705
},
{
"epoch": 0.071,
"grad_norm": 58.889522552490234,
"learning_rate": 9.291000000000001e-06,
"loss": 5.3862,
"step": 710
},
{
"epoch": 0.0715,
"grad_norm": 55.614715576171875,
"learning_rate": 9.286e-06,
"loss": 5.1135,
"step": 715
},
{
"epoch": 0.072,
"grad_norm": 66.86405181884766,
"learning_rate": 9.281000000000001e-06,
"loss": 6.0137,
"step": 720
},
{
"epoch": 0.0725,
"grad_norm": 73.3634033203125,
"learning_rate": 9.276e-06,
"loss": 5.3904,
"step": 725
},
{
"epoch": 0.073,
"grad_norm": 65.3800277709961,
"learning_rate": 9.271000000000002e-06,
"loss": 5.104,
"step": 730
},
{
"epoch": 0.0735,
"grad_norm": 57.99812316894531,
"learning_rate": 9.266e-06,
"loss": 6.0557,
"step": 735
},
{
"epoch": 0.074,
"grad_norm": 72.21461486816406,
"learning_rate": 9.261000000000002e-06,
"loss": 5.2916,
"step": 740
},
{
"epoch": 0.0745,
"grad_norm": 46.72555160522461,
"learning_rate": 9.256e-06,
"loss": 4.765,
"step": 745
},
{
"epoch": 0.075,
"grad_norm": 56.59814453125,
"learning_rate": 9.251000000000002e-06,
"loss": 5.1524,
"step": 750
},
{
"epoch": 0.0755,
"grad_norm": 51.47397994995117,
"learning_rate": 9.246000000000001e-06,
"loss": 4.679,
"step": 755
},
{
"epoch": 0.076,
"grad_norm": 66.05585479736328,
"learning_rate": 9.241000000000002e-06,
"loss": 5.2957,
"step": 760
},
{
"epoch": 0.0765,
"grad_norm": 95.89804077148438,
"learning_rate": 9.236000000000001e-06,
"loss": 6.7028,
"step": 765
},
{
"epoch": 0.077,
"grad_norm": 55.590572357177734,
"learning_rate": 9.231000000000002e-06,
"loss": 5.4986,
"step": 770
},
{
"epoch": 0.0775,
"grad_norm": 64.92922973632812,
"learning_rate": 9.226000000000001e-06,
"loss": 4.4751,
"step": 775
},
{
"epoch": 0.078,
"grad_norm": 77.49915313720703,
"learning_rate": 9.221e-06,
"loss": 5.5307,
"step": 780
},
{
"epoch": 0.0785,
"grad_norm": 71.59280395507812,
"learning_rate": 9.216000000000001e-06,
"loss": 5.7804,
"step": 785
},
{
"epoch": 0.079,
"grad_norm": 47.804176330566406,
"learning_rate": 9.211e-06,
"loss": 4.3967,
"step": 790
},
{
"epoch": 0.0795,
"grad_norm": 70.14527130126953,
"learning_rate": 9.206000000000001e-06,
"loss": 5.6562,
"step": 795
},
{
"epoch": 0.08,
"grad_norm": 65.86378479003906,
"learning_rate": 9.201e-06,
"loss": 4.1849,
"step": 800
},
{
"epoch": 0.0805,
"grad_norm": 75.2146987915039,
"learning_rate": 9.196e-06,
"loss": 4.4274,
"step": 805
},
{
"epoch": 0.081,
"grad_norm": 69.72351837158203,
"learning_rate": 9.191e-06,
"loss": 5.9283,
"step": 810
},
{
"epoch": 0.0815,
"grad_norm": 70.36526489257812,
"learning_rate": 9.186e-06,
"loss": 5.6442,
"step": 815
},
{
"epoch": 0.082,
"grad_norm": 61.410221099853516,
"learning_rate": 9.181e-06,
"loss": 4.9186,
"step": 820
},
{
"epoch": 0.0825,
"grad_norm": 62.3479118347168,
"learning_rate": 9.176e-06,
"loss": 4.9193,
"step": 825
},
{
"epoch": 0.083,
"grad_norm": 77.28399658203125,
"learning_rate": 9.171e-06,
"loss": 4.8504,
"step": 830
},
{
"epoch": 0.0835,
"grad_norm": 57.84767532348633,
"learning_rate": 9.166e-06,
"loss": 6.5448,
"step": 835
},
{
"epoch": 0.084,
"grad_norm": 70.60618591308594,
"learning_rate": 9.161000000000001e-06,
"loss": 4.4438,
"step": 840
},
{
"epoch": 0.0845,
"grad_norm": 60.7105827331543,
"learning_rate": 9.156e-06,
"loss": 5.6495,
"step": 845
},
{
"epoch": 0.085,
"grad_norm": 129.35887145996094,
"learning_rate": 9.151000000000001e-06,
"loss": 5.5979,
"step": 850
},
{
"epoch": 0.0855,
"grad_norm": 91.48816680908203,
"learning_rate": 9.146e-06,
"loss": 5.0048,
"step": 855
},
{
"epoch": 0.086,
"grad_norm": 52.0786018371582,
"learning_rate": 9.141000000000001e-06,
"loss": 5.2717,
"step": 860
},
{
"epoch": 0.0865,
"grad_norm": 65.6240005493164,
"learning_rate": 9.136e-06,
"loss": 5.4725,
"step": 865
},
{
"epoch": 0.087,
"grad_norm": 72.78899383544922,
"learning_rate": 9.131000000000001e-06,
"loss": 5.6896,
"step": 870
},
{
"epoch": 0.0875,
"grad_norm": 59.107357025146484,
"learning_rate": 9.126e-06,
"loss": 5.9547,
"step": 875
},
{
"epoch": 0.088,
"grad_norm": 74.16963958740234,
"learning_rate": 9.121000000000001e-06,
"loss": 5.3027,
"step": 880
},
{
"epoch": 0.0885,
"grad_norm": 70.86022186279297,
"learning_rate": 9.116e-06,
"loss": 5.8291,
"step": 885
},
{
"epoch": 0.089,
"grad_norm": 52.165042877197266,
"learning_rate": 9.111000000000001e-06,
"loss": 5.4198,
"step": 890
},
{
"epoch": 0.0895,
"grad_norm": 63.6991081237793,
"learning_rate": 9.106e-06,
"loss": 5.4296,
"step": 895
},
{
"epoch": 0.09,
"grad_norm": 50.96390914916992,
"learning_rate": 9.101000000000001e-06,
"loss": 4.4431,
"step": 900
},
{
"epoch": 0.0905,
"grad_norm": 82.38723754882812,
"learning_rate": 9.096e-06,
"loss": 5.2341,
"step": 905
},
{
"epoch": 0.091,
"grad_norm": 53.057498931884766,
"learning_rate": 9.091000000000002e-06,
"loss": 5.6911,
"step": 910
},
{
"epoch": 0.0915,
"grad_norm": 61.33327102661133,
"learning_rate": 9.086e-06,
"loss": 5.1844,
"step": 915
},
{
"epoch": 0.092,
"grad_norm": 82.66905975341797,
"learning_rate": 9.081000000000002e-06,
"loss": 5.5432,
"step": 920
},
{
"epoch": 0.0925,
"grad_norm": 58.46932601928711,
"learning_rate": 9.076000000000001e-06,
"loss": 4.5893,
"step": 925
},
{
"epoch": 0.093,
"grad_norm": 49.836265563964844,
"learning_rate": 9.071000000000002e-06,
"loss": 5.752,
"step": 930
},
{
"epoch": 0.0935,
"grad_norm": 58.416526794433594,
"learning_rate": 9.066000000000001e-06,
"loss": 4.6827,
"step": 935
},
{
"epoch": 0.094,
"grad_norm": 65.9136962890625,
"learning_rate": 9.061e-06,
"loss": 5.8126,
"step": 940
},
{
"epoch": 0.0945,
"grad_norm": 52.34477233886719,
"learning_rate": 9.056000000000001e-06,
"loss": 5.0553,
"step": 945
},
{
"epoch": 0.095,
"grad_norm": 54.70780563354492,
"learning_rate": 9.051e-06,
"loss": 4.8854,
"step": 950
},
{
"epoch": 0.0955,
"grad_norm": 51.67803192138672,
"learning_rate": 9.046000000000001e-06,
"loss": 4.9881,
"step": 955
},
{
"epoch": 0.096,
"grad_norm": 52.68552017211914,
"learning_rate": 9.041e-06,
"loss": 5.656,
"step": 960
},
{
"epoch": 0.0965,
"grad_norm": 80.78492736816406,
"learning_rate": 9.036e-06,
"loss": 6.34,
"step": 965
},
{
"epoch": 0.097,
"grad_norm": 59.34943389892578,
"learning_rate": 9.031e-06,
"loss": 5.2822,
"step": 970
},
{
"epoch": 0.0975,
"grad_norm": 62.7556037902832,
"learning_rate": 9.026e-06,
"loss": 4.8106,
"step": 975
},
{
"epoch": 0.098,
"grad_norm": 50.799312591552734,
"learning_rate": 9.021e-06,
"loss": 5.4295,
"step": 980
},
{
"epoch": 0.0985,
"grad_norm": 57.404300689697266,
"learning_rate": 9.016e-06,
"loss": 5.6952,
"step": 985
},
{
"epoch": 0.099,
"grad_norm": 74.7780532836914,
"learning_rate": 9.011e-06,
"loss": 3.9573,
"step": 990
},
{
"epoch": 0.0995,
"grad_norm": 59.960636138916016,
"learning_rate": 9.006e-06,
"loss": 5.8612,
"step": 995
},
{
"epoch": 0.1,
"grad_norm": 79.70378112792969,
"learning_rate": 9.001e-06,
"loss": 5.2286,
"step": 1000
},
{
"epoch": 0.1005,
"grad_norm": 69.96094512939453,
"learning_rate": 8.996e-06,
"loss": 5.5803,
"step": 1005
},
{
"epoch": 0.101,
"grad_norm": 69.7931900024414,
"learning_rate": 8.991e-06,
"loss": 5.2732,
"step": 1010
},
{
"epoch": 0.1015,
"grad_norm": 96.09075164794922,
"learning_rate": 8.986e-06,
"loss": 5.2639,
"step": 1015
},
{
"epoch": 0.102,
"grad_norm": 51.90883255004883,
"learning_rate": 8.981000000000001e-06,
"loss": 4.0375,
"step": 1020
},
{
"epoch": 0.1025,
"grad_norm": 62.40837478637695,
"learning_rate": 8.976e-06,
"loss": 5.5941,
"step": 1025
},
{
"epoch": 0.103,
"grad_norm": 57.33722686767578,
"learning_rate": 8.971000000000001e-06,
"loss": 4.6877,
"step": 1030
},
{
"epoch": 0.1035,
"grad_norm": 69.11624145507812,
"learning_rate": 8.966e-06,
"loss": 4.684,
"step": 1035
},
{
"epoch": 0.104,
"grad_norm": 61.289737701416016,
"learning_rate": 8.961000000000001e-06,
"loss": 5.9492,
"step": 1040
},
{
"epoch": 0.1045,
"grad_norm": 61.19724655151367,
"learning_rate": 8.956e-06,
"loss": 5.7873,
"step": 1045
},
{
"epoch": 0.105,
"grad_norm": 70.41716766357422,
"learning_rate": 8.951000000000001e-06,
"loss": 4.8212,
"step": 1050
},
{
"epoch": 0.1055,
"grad_norm": 63.66280746459961,
"learning_rate": 8.946e-06,
"loss": 4.8835,
"step": 1055
},
{
"epoch": 0.106,
"grad_norm": 88.53239440917969,
"learning_rate": 8.941000000000001e-06,
"loss": 6.0866,
"step": 1060
},
{
"epoch": 0.1065,
"grad_norm": 59.35116958618164,
"learning_rate": 8.936e-06,
"loss": 5.0788,
"step": 1065
},
{
"epoch": 0.107,
"grad_norm": 56.121864318847656,
"learning_rate": 8.931000000000001e-06,
"loss": 5.2856,
"step": 1070
},
{
"epoch": 0.1075,
"grad_norm": 55.64980697631836,
"learning_rate": 8.926e-06,
"loss": 5.9382,
"step": 1075
},
{
"epoch": 0.108,
"grad_norm": 65.16487121582031,
"learning_rate": 8.921000000000001e-06,
"loss": 4.5096,
"step": 1080
},
{
"epoch": 0.1085,
"grad_norm": 55.02686309814453,
"learning_rate": 8.916e-06,
"loss": 5.0421,
"step": 1085
},
{
"epoch": 0.109,
"grad_norm": 65.21896362304688,
"learning_rate": 8.911000000000002e-06,
"loss": 4.7471,
"step": 1090
},
{
"epoch": 0.1095,
"grad_norm": 84.38319396972656,
"learning_rate": 8.906e-06,
"loss": 5.9924,
"step": 1095
},
{
"epoch": 0.11,
"grad_norm": 84.8278579711914,
"learning_rate": 8.901e-06,
"loss": 4.4728,
"step": 1100
},
{
"epoch": 0.1105,
"grad_norm": 55.53620910644531,
"learning_rate": 8.896000000000001e-06,
"loss": 4.709,
"step": 1105
},
{
"epoch": 0.111,
"grad_norm": 65.74577331542969,
"learning_rate": 8.891e-06,
"loss": 5.1826,
"step": 1110
},
{
"epoch": 0.1115,
"grad_norm": 68.25616455078125,
"learning_rate": 8.886000000000001e-06,
"loss": 7.1784,
"step": 1115
},
{
"epoch": 0.112,
"grad_norm": 51.782901763916016,
"learning_rate": 8.881e-06,
"loss": 5.2915,
"step": 1120
},
{
"epoch": 0.1125,
"grad_norm": 61.892391204833984,
"learning_rate": 8.876e-06,
"loss": 6.0908,
"step": 1125
},
{
"epoch": 0.113,
"grad_norm": 54.9797248840332,
"learning_rate": 8.871e-06,
"loss": 4.4188,
"step": 1130
},
{
"epoch": 0.1135,
"grad_norm": 78.96302795410156,
"learning_rate": 8.866000000000001e-06,
"loss": 4.8732,
"step": 1135
},
{
"epoch": 0.114,
"grad_norm": 67.2728500366211,
"learning_rate": 8.861e-06,
"loss": 5.6544,
"step": 1140
},
{
"epoch": 0.1145,
"grad_norm": 53.67226028442383,
"learning_rate": 8.856000000000001e-06,
"loss": 4.7992,
"step": 1145
},
{
"epoch": 0.115,
"grad_norm": 63.462032318115234,
"learning_rate": 8.851e-06,
"loss": 5.5585,
"step": 1150
},
{
"epoch": 0.1155,
"grad_norm": 85.72888946533203,
"learning_rate": 8.846000000000001e-06,
"loss": 5.7742,
"step": 1155
},
{
"epoch": 0.116,
"grad_norm": 60.94392013549805,
"learning_rate": 8.841e-06,
"loss": 5.0155,
"step": 1160
},
{
"epoch": 0.1165,
"grad_norm": 56.96271896362305,
"learning_rate": 8.836000000000001e-06,
"loss": 4.5992,
"step": 1165
},
{
"epoch": 0.117,
"grad_norm": 50.74510192871094,
"learning_rate": 8.831e-06,
"loss": 4.5616,
"step": 1170
},
{
"epoch": 0.1175,
"grad_norm": 90.70756530761719,
"learning_rate": 8.826000000000002e-06,
"loss": 6.4825,
"step": 1175
},
{
"epoch": 0.118,
"grad_norm": 59.96350860595703,
"learning_rate": 8.821e-06,
"loss": 4.4625,
"step": 1180
},
{
"epoch": 0.1185,
"grad_norm": 89.02561950683594,
"learning_rate": 8.816000000000002e-06,
"loss": 5.7029,
"step": 1185
},
{
"epoch": 0.119,
"grad_norm": 61.081298828125,
"learning_rate": 8.811000000000001e-06,
"loss": 5.1897,
"step": 1190
},
{
"epoch": 0.1195,
"grad_norm": 75.82427978515625,
"learning_rate": 8.806000000000002e-06,
"loss": 5.4992,
"step": 1195
},
{
"epoch": 0.12,
"grad_norm": 53.01462936401367,
"learning_rate": 8.801000000000001e-06,
"loss": 6.01,
"step": 1200
},
{
"epoch": 0.1205,
"grad_norm": 68.6491928100586,
"learning_rate": 8.796000000000002e-06,
"loss": 5.4257,
"step": 1205
},
{
"epoch": 0.121,
"grad_norm": 67.61992645263672,
"learning_rate": 8.791000000000001e-06,
"loss": 5.0935,
"step": 1210
},
{
"epoch": 0.1215,
"grad_norm": 75.9515609741211,
"learning_rate": 8.786000000000002e-06,
"loss": 4.7788,
"step": 1215
},
{
"epoch": 0.122,
"grad_norm": 55.77494812011719,
"learning_rate": 8.781000000000001e-06,
"loss": 5.2895,
"step": 1220
},
{
"epoch": 0.1225,
"grad_norm": 78.90836334228516,
"learning_rate": 8.776e-06,
"loss": 5.7502,
"step": 1225
},
{
"epoch": 0.123,
"grad_norm": 54.38423156738281,
"learning_rate": 8.771000000000001e-06,
"loss": 5.2534,
"step": 1230
},
{
"epoch": 0.1235,
"grad_norm": 48.77484130859375,
"learning_rate": 8.766e-06,
"loss": 5.0617,
"step": 1235
},
{
"epoch": 0.124,
"grad_norm": 73.08646392822266,
"learning_rate": 8.761000000000001e-06,
"loss": 5.6851,
"step": 1240
},
{
"epoch": 0.1245,
"grad_norm": 49.47875213623047,
"learning_rate": 8.756e-06,
"loss": 4.7621,
"step": 1245
},
{
"epoch": 0.125,
"grad_norm": 42.435768127441406,
"learning_rate": 8.751000000000001e-06,
"loss": 4.2181,
"step": 1250
},
{
"epoch": 0.1255,
"grad_norm": 59.38146209716797,
"learning_rate": 8.746e-06,
"loss": 5.838,
"step": 1255
},
{
"epoch": 0.126,
"grad_norm": 60.928714752197266,
"learning_rate": 8.741e-06,
"loss": 4.903,
"step": 1260
},
{
"epoch": 0.1265,
"grad_norm": 62.644805908203125,
"learning_rate": 8.736e-06,
"loss": 6.1991,
"step": 1265
},
{
"epoch": 0.127,
"grad_norm": 62.247047424316406,
"learning_rate": 8.731e-06,
"loss": 5.3907,
"step": 1270
},
{
"epoch": 0.1275,
"grad_norm": 61.00815200805664,
"learning_rate": 8.726e-06,
"loss": 5.2265,
"step": 1275
},
{
"epoch": 0.128,
"grad_norm": 57.970314025878906,
"learning_rate": 8.721e-06,
"loss": 5.0272,
"step": 1280
},
{
"epoch": 0.1285,
"grad_norm": 78.71075439453125,
"learning_rate": 8.716000000000001e-06,
"loss": 5.3924,
"step": 1285
},
{
"epoch": 0.129,
"grad_norm": 45.33870315551758,
"learning_rate": 8.711e-06,
"loss": 4.1664,
"step": 1290
},
{
"epoch": 0.1295,
"grad_norm": 59.856685638427734,
"learning_rate": 8.706000000000001e-06,
"loss": 4.9717,
"step": 1295
},
{
"epoch": 0.13,
"grad_norm": 70.93965911865234,
"learning_rate": 8.701e-06,
"loss": 4.1831,
"step": 1300
},
{
"epoch": 0.1305,
"grad_norm": 54.16468811035156,
"learning_rate": 8.696000000000001e-06,
"loss": 5.3112,
"step": 1305
},
{
"epoch": 0.131,
"grad_norm": 77.19535064697266,
"learning_rate": 8.691e-06,
"loss": 5.0046,
"step": 1310
},
{
"epoch": 0.1315,
"grad_norm": 73.0103988647461,
"learning_rate": 8.686000000000001e-06,
"loss": 5.0891,
"step": 1315
},
{
"epoch": 0.132,
"grad_norm": 79.43903350830078,
"learning_rate": 8.681e-06,
"loss": 5.621,
"step": 1320
},
{
"epoch": 0.1325,
"grad_norm": 69.93892669677734,
"learning_rate": 8.676000000000001e-06,
"loss": 4.6726,
"step": 1325
},
{
"epoch": 0.133,
"grad_norm": 56.596439361572266,
"learning_rate": 8.671e-06,
"loss": 4.7146,
"step": 1330
},
{
"epoch": 0.1335,
"grad_norm": 59.12910079956055,
"learning_rate": 8.666000000000001e-06,
"loss": 5.683,
"step": 1335
},
{
"epoch": 0.134,
"grad_norm": 65.23446655273438,
"learning_rate": 8.661e-06,
"loss": 5.9475,
"step": 1340
},
{
"epoch": 0.1345,
"grad_norm": 60.68814468383789,
"learning_rate": 8.656000000000001e-06,
"loss": 4.8507,
"step": 1345
},
{
"epoch": 0.135,
"grad_norm": 58.06928634643555,
"learning_rate": 8.651e-06,
"loss": 5.7808,
"step": 1350
},
{
"epoch": 0.1355,
"grad_norm": 58.640655517578125,
"learning_rate": 8.646000000000002e-06,
"loss": 5.1012,
"step": 1355
},
{
"epoch": 0.136,
"grad_norm": 65.93994140625,
"learning_rate": 8.641e-06,
"loss": 4.8662,
"step": 1360
},
{
"epoch": 0.1365,
"grad_norm": 66.67878723144531,
"learning_rate": 8.636000000000002e-06,
"loss": 5.6589,
"step": 1365
},
{
"epoch": 0.137,
"grad_norm": 68.66671752929688,
"learning_rate": 8.631000000000001e-06,
"loss": 4.7309,
"step": 1370
},
{
"epoch": 0.1375,
"grad_norm": 64.89599609375,
"learning_rate": 8.626000000000002e-06,
"loss": 4.832,
"step": 1375
},
{
"epoch": 0.138,
"grad_norm": 70.13557434082031,
"learning_rate": 8.621000000000001e-06,
"loss": 5.6079,
"step": 1380
},
{
"epoch": 0.1385,
"grad_norm": 61.200347900390625,
"learning_rate": 8.616000000000002e-06,
"loss": 5.5559,
"step": 1385
},
{
"epoch": 0.139,
"grad_norm": 64.06344604492188,
"learning_rate": 8.611000000000001e-06,
"loss": 4.7863,
"step": 1390
},
{
"epoch": 0.1395,
"grad_norm": 81.44152069091797,
"learning_rate": 8.606e-06,
"loss": 6.071,
"step": 1395
},
{
"epoch": 0.14,
"grad_norm": 52.7180290222168,
"learning_rate": 8.601000000000001e-06,
"loss": 5.2613,
"step": 1400
},
{
"epoch": 0.1405,
"grad_norm": 72.11871337890625,
"learning_rate": 8.596e-06,
"loss": 4.4381,
"step": 1405
},
{
"epoch": 0.141,
"grad_norm": 85.8184814453125,
"learning_rate": 8.591000000000001e-06,
"loss": 6.2463,
"step": 1410
},
{
"epoch": 0.1415,
"grad_norm": 59.726993560791016,
"learning_rate": 8.586e-06,
"loss": 4.9436,
"step": 1415
},
{
"epoch": 0.142,
"grad_norm": 55.018795013427734,
"learning_rate": 8.581e-06,
"loss": 4.7814,
"step": 1420
},
{
"epoch": 0.1425,
"grad_norm": 56.400421142578125,
"learning_rate": 8.576e-06,
"loss": 4.6833,
"step": 1425
},
{
"epoch": 0.143,
"grad_norm": 46.20795822143555,
"learning_rate": 8.571e-06,
"loss": 3.7731,
"step": 1430
},
{
"epoch": 0.1435,
"grad_norm": 57.5872917175293,
"learning_rate": 8.566e-06,
"loss": 5.7952,
"step": 1435
},
{
"epoch": 0.144,
"grad_norm": 60.95462417602539,
"learning_rate": 8.561e-06,
"loss": 6.742,
"step": 1440
},
{
"epoch": 0.1445,
"grad_norm": 80.12348175048828,
"learning_rate": 8.556e-06,
"loss": 5.3024,
"step": 1445
},
{
"epoch": 0.145,
"grad_norm": 59.4682731628418,
"learning_rate": 8.551e-06,
"loss": 3.4684,
"step": 1450
},
{
"epoch": 0.1455,
"grad_norm": 87.38482666015625,
"learning_rate": 8.546000000000001e-06,
"loss": 5.9045,
"step": 1455
},
{
"epoch": 0.146,
"grad_norm": 56.61424255371094,
"learning_rate": 8.541e-06,
"loss": 5.1945,
"step": 1460
},
{
"epoch": 0.1465,
"grad_norm": 51.18035125732422,
"learning_rate": 8.536000000000001e-06,
"loss": 4.7717,
"step": 1465
},
{
"epoch": 0.147,
"grad_norm": 71.49605560302734,
"learning_rate": 8.531e-06,
"loss": 5.6623,
"step": 1470
},
{
"epoch": 0.1475,
"grad_norm": 70.1756362915039,
"learning_rate": 8.526000000000001e-06,
"loss": 4.667,
"step": 1475
},
{
"epoch": 0.148,
"grad_norm": 52.19149398803711,
"learning_rate": 8.521e-06,
"loss": 4.177,
"step": 1480
},
{
"epoch": 0.1485,
"grad_norm": 71.5694351196289,
"learning_rate": 8.516000000000001e-06,
"loss": 4.8019,
"step": 1485
},
{
"epoch": 0.149,
"grad_norm": 62.04697036743164,
"learning_rate": 8.511e-06,
"loss": 5.1555,
"step": 1490
},
{
"epoch": 0.1495,
"grad_norm": 79.89663696289062,
"learning_rate": 8.506000000000001e-06,
"loss": 5.8517,
"step": 1495
},
{
"epoch": 0.15,
"grad_norm": 93.36337280273438,
"learning_rate": 8.501e-06,
"loss": 6.1533,
"step": 1500
},
{
"epoch": 0.1505,
"grad_norm": 54.96908187866211,
"learning_rate": 8.496000000000001e-06,
"loss": 4.6604,
"step": 1505
},
{
"epoch": 0.151,
"grad_norm": 61.35822296142578,
"learning_rate": 8.491e-06,
"loss": 5.0536,
"step": 1510
},
{
"epoch": 0.1515,
"grad_norm": 66.18203735351562,
"learning_rate": 8.486000000000001e-06,
"loss": 4.9661,
"step": 1515
},
{
"epoch": 0.152,
"grad_norm": 50.83629608154297,
"learning_rate": 8.481e-06,
"loss": 5.2605,
"step": 1520
},
{
"epoch": 0.1525,
"grad_norm": 68.134033203125,
"learning_rate": 8.476000000000002e-06,
"loss": 5.9831,
"step": 1525
},
{
"epoch": 0.153,
"grad_norm": 55.83713150024414,
"learning_rate": 8.471e-06,
"loss": 5.2597,
"step": 1530
},
{
"epoch": 0.1535,
"grad_norm": 49.900516510009766,
"learning_rate": 8.466000000000002e-06,
"loss": 5.0297,
"step": 1535
},
{
"epoch": 0.154,
"grad_norm": 55.53334045410156,
"learning_rate": 8.461e-06,
"loss": 4.7405,
"step": 1540
},
{
"epoch": 0.1545,
"grad_norm": 51.795074462890625,
"learning_rate": 8.456000000000002e-06,
"loss": 4.7491,
"step": 1545
},
{
"epoch": 0.155,
"grad_norm": 43.57197570800781,
"learning_rate": 8.451000000000001e-06,
"loss": 5.0043,
"step": 1550
},
{
"epoch": 0.1555,
"grad_norm": 51.24105453491211,
"learning_rate": 8.446e-06,
"loss": 4.9249,
"step": 1555
},
{
"epoch": 0.156,
"grad_norm": 65.57850646972656,
"learning_rate": 8.441000000000001e-06,
"loss": 4.7407,
"step": 1560
},
{
"epoch": 0.1565,
"grad_norm": 64.2021255493164,
"learning_rate": 8.436e-06,
"loss": 5.7441,
"step": 1565
},
{
"epoch": 0.157,
"grad_norm": 65.62643432617188,
"learning_rate": 8.431000000000001e-06,
"loss": 5.1524,
"step": 1570
},
{
"epoch": 0.1575,
"grad_norm": 60.66287612915039,
"learning_rate": 8.426e-06,
"loss": 6.0503,
"step": 1575
},
{
"epoch": 0.158,
"grad_norm": 59.155853271484375,
"learning_rate": 8.421e-06,
"loss": 6.0681,
"step": 1580
},
{
"epoch": 0.1585,
"grad_norm": 54.823699951171875,
"learning_rate": 8.416e-06,
"loss": 4.6115,
"step": 1585
},
{
"epoch": 0.159,
"grad_norm": 58.49845504760742,
"learning_rate": 8.411e-06,
"loss": 4.8539,
"step": 1590
},
{
"epoch": 0.1595,
"grad_norm": 63.808223724365234,
"learning_rate": 8.406e-06,
"loss": 4.931,
"step": 1595
},
{
"epoch": 0.16,
"grad_norm": 49.438392639160156,
"learning_rate": 8.401e-06,
"loss": 4.449,
"step": 1600
},
{
"epoch": 0.1605,
"grad_norm": 58.23267364501953,
"learning_rate": 8.396e-06,
"loss": 5.3667,
"step": 1605
},
{
"epoch": 0.161,
"grad_norm": 64.92132568359375,
"learning_rate": 8.391e-06,
"loss": 5.3969,
"step": 1610
},
{
"epoch": 0.1615,
"grad_norm": 76.33795928955078,
"learning_rate": 8.386e-06,
"loss": 5.0351,
"step": 1615
},
{
"epoch": 0.162,
"grad_norm": 68.49414825439453,
"learning_rate": 8.381e-06,
"loss": 6.1171,
"step": 1620
},
{
"epoch": 0.1625,
"grad_norm": 77.73714447021484,
"learning_rate": 8.376e-06,
"loss": 4.633,
"step": 1625
},
{
"epoch": 0.163,
"grad_norm": 73.00968933105469,
"learning_rate": 8.371e-06,
"loss": 5.813,
"step": 1630
},
{
"epoch": 0.1635,
"grad_norm": 71.55262756347656,
"learning_rate": 8.366000000000001e-06,
"loss": 5.2287,
"step": 1635
},
{
"epoch": 0.164,
"grad_norm": 67.30160522460938,
"learning_rate": 8.361e-06,
"loss": 5.2435,
"step": 1640
},
{
"epoch": 0.1645,
"grad_norm": 59.34938430786133,
"learning_rate": 8.356000000000001e-06,
"loss": 5.6554,
"step": 1645
},
{
"epoch": 0.165,
"grad_norm": 50.35622024536133,
"learning_rate": 8.351e-06,
"loss": 5.1153,
"step": 1650
},
{
"epoch": 0.1655,
"grad_norm": 70.94762420654297,
"learning_rate": 8.346000000000001e-06,
"loss": 4.9395,
"step": 1655
},
{
"epoch": 0.166,
"grad_norm": 71.47914123535156,
"learning_rate": 8.341e-06,
"loss": 4.832,
"step": 1660
},
{
"epoch": 0.1665,
"grad_norm": 67.92778015136719,
"learning_rate": 8.336000000000001e-06,
"loss": 4.3402,
"step": 1665
},
{
"epoch": 0.167,
"grad_norm": 67.71968841552734,
"learning_rate": 8.331e-06,
"loss": 5.1193,
"step": 1670
},
{
"epoch": 0.1675,
"grad_norm": 48.13265609741211,
"learning_rate": 8.326000000000001e-06,
"loss": 5.1677,
"step": 1675
},
{
"epoch": 0.168,
"grad_norm": 57.75625991821289,
"learning_rate": 8.321e-06,
"loss": 5.2929,
"step": 1680
},
{
"epoch": 0.1685,
"grad_norm": 59.2353630065918,
"learning_rate": 8.316000000000001e-06,
"loss": 4.6238,
"step": 1685
},
{
"epoch": 0.169,
"grad_norm": 72.31878662109375,
"learning_rate": 8.311e-06,
"loss": 4.9348,
"step": 1690
},
{
"epoch": 0.1695,
"grad_norm": 66.85990905761719,
"learning_rate": 8.306000000000001e-06,
"loss": 5.2399,
"step": 1695
},
{
"epoch": 0.17,
"grad_norm": 65.82227325439453,
"learning_rate": 8.301e-06,
"loss": 3.9255,
"step": 1700
},
{
"epoch": 0.1705,
"grad_norm": 64.09220886230469,
"learning_rate": 8.296000000000002e-06,
"loss": 5.0496,
"step": 1705
},
{
"epoch": 0.171,
"grad_norm": 70.31346893310547,
"learning_rate": 8.291e-06,
"loss": 5.5094,
"step": 1710
},
{
"epoch": 0.1715,
"grad_norm": 58.26755142211914,
"learning_rate": 8.286e-06,
"loss": 4.0126,
"step": 1715
},
{
"epoch": 0.172,
"grad_norm": 53.582611083984375,
"learning_rate": 8.281e-06,
"loss": 4.79,
"step": 1720
},
{
"epoch": 0.1725,
"grad_norm": 56.40678405761719,
"learning_rate": 8.276e-06,
"loss": 5.1313,
"step": 1725
},
{
"epoch": 0.173,
"grad_norm": 54.7972526550293,
"learning_rate": 8.271000000000001e-06,
"loss": 5.046,
"step": 1730
},
{
"epoch": 0.1735,
"grad_norm": 52.5406379699707,
"learning_rate": 8.266e-06,
"loss": 4.7231,
"step": 1735
},
{
"epoch": 0.174,
"grad_norm": 48.16586685180664,
"learning_rate": 8.261e-06,
"loss": 5.1313,
"step": 1740
},
{
"epoch": 0.1745,
"grad_norm": 47.292720794677734,
"learning_rate": 8.256e-06,
"loss": 4.6004,
"step": 1745
},
{
"epoch": 0.175,
"grad_norm": 60.821266174316406,
"learning_rate": 8.251e-06,
"loss": 5.5364,
"step": 1750
},
{
"epoch": 0.1755,
"grad_norm": 93.91549682617188,
"learning_rate": 8.246e-06,
"loss": 5.5144,
"step": 1755
},
{
"epoch": 0.176,
"grad_norm": 114.97464752197266,
"learning_rate": 8.241000000000001e-06,
"loss": 5.0595,
"step": 1760
},
{
"epoch": 0.1765,
"grad_norm": 64.73751831054688,
"learning_rate": 8.236e-06,
"loss": 4.3854,
"step": 1765
},
{
"epoch": 0.177,
"grad_norm": 64.93399810791016,
"learning_rate": 8.231000000000001e-06,
"loss": 5.1058,
"step": 1770
},
{
"epoch": 0.1775,
"grad_norm": 67.42242431640625,
"learning_rate": 8.226e-06,
"loss": 4.8133,
"step": 1775
},
{
"epoch": 0.178,
"grad_norm": 52.42443084716797,
"learning_rate": 8.221000000000001e-06,
"loss": 5.0636,
"step": 1780
},
{
"epoch": 0.1785,
"grad_norm": 79.13347625732422,
"learning_rate": 8.216e-06,
"loss": 5.2212,
"step": 1785
},
{
"epoch": 0.179,
"grad_norm": 67.81745910644531,
"learning_rate": 8.211000000000002e-06,
"loss": 5.2105,
"step": 1790
},
{
"epoch": 0.1795,
"grad_norm": 62.96812057495117,
"learning_rate": 8.206e-06,
"loss": 5.3702,
"step": 1795
},
{
"epoch": 0.18,
"grad_norm": 76.28816223144531,
"learning_rate": 8.201000000000002e-06,
"loss": 4.6774,
"step": 1800
},
{
"epoch": 0.1805,
"grad_norm": 129.661865234375,
"learning_rate": 8.196e-06,
"loss": 6.0545,
"step": 1805
},
{
"epoch": 0.181,
"grad_norm": 68.19229888916016,
"learning_rate": 8.191000000000002e-06,
"loss": 5.403,
"step": 1810
},
{
"epoch": 0.1815,
"grad_norm": 65.75102996826172,
"learning_rate": 8.186000000000001e-06,
"loss": 4.7501,
"step": 1815
},
{
"epoch": 0.182,
"grad_norm": 61.759761810302734,
"learning_rate": 8.181000000000002e-06,
"loss": 4.7871,
"step": 1820
},
{
"epoch": 0.1825,
"grad_norm": 66.68859100341797,
"learning_rate": 8.176000000000001e-06,
"loss": 4.5813,
"step": 1825
},
{
"epoch": 0.183,
"grad_norm": 71.10906219482422,
"learning_rate": 8.171000000000002e-06,
"loss": 4.9447,
"step": 1830
},
{
"epoch": 0.1835,
"grad_norm": 68.92731475830078,
"learning_rate": 8.166000000000001e-06,
"loss": 5.2777,
"step": 1835
},
{
"epoch": 0.184,
"grad_norm": 68.01397705078125,
"learning_rate": 8.161e-06,
"loss": 5.3131,
"step": 1840
},
{
"epoch": 0.1845,
"grad_norm": 63.41286849975586,
"learning_rate": 8.156000000000001e-06,
"loss": 4.3475,
"step": 1845
},
{
"epoch": 0.185,
"grad_norm": 70.69223022460938,
"learning_rate": 8.151e-06,
"loss": 5.0628,
"step": 1850
},
{
"epoch": 0.1855,
"grad_norm": 84.73713684082031,
"learning_rate": 8.146000000000001e-06,
"loss": 6.1366,
"step": 1855
},
{
"epoch": 0.186,
"grad_norm": 73.59104919433594,
"learning_rate": 8.141e-06,
"loss": 4.4199,
"step": 1860
},
{
"epoch": 0.1865,
"grad_norm": 71.16675567626953,
"learning_rate": 8.136000000000001e-06,
"loss": 4.4485,
"step": 1865
},
{
"epoch": 0.187,
"grad_norm": 58.05774688720703,
"learning_rate": 8.131e-06,
"loss": 5.5044,
"step": 1870
},
{
"epoch": 0.1875,
"grad_norm": 55.99666976928711,
"learning_rate": 8.126e-06,
"loss": 3.8585,
"step": 1875
},
{
"epoch": 0.188,
"grad_norm": 69.0136947631836,
"learning_rate": 8.121e-06,
"loss": 4.2551,
"step": 1880
},
{
"epoch": 0.1885,
"grad_norm": 43.44075012207031,
"learning_rate": 8.116e-06,
"loss": 4.511,
"step": 1885
},
{
"epoch": 0.189,
"grad_norm": 60.94331741333008,
"learning_rate": 8.111e-06,
"loss": 5.4013,
"step": 1890
},
{
"epoch": 0.1895,
"grad_norm": 64.44482421875,
"learning_rate": 8.106e-06,
"loss": 4.2638,
"step": 1895
},
{
"epoch": 0.19,
"grad_norm": 52.195465087890625,
"learning_rate": 8.101000000000001e-06,
"loss": 3.9795,
"step": 1900
},
{
"epoch": 0.1905,
"grad_norm": 76.83987426757812,
"learning_rate": 8.096e-06,
"loss": 4.3378,
"step": 1905
},
{
"epoch": 0.191,
"grad_norm": 71.9560317993164,
"learning_rate": 8.091000000000001e-06,
"loss": 5.4242,
"step": 1910
},
{
"epoch": 0.1915,
"grad_norm": 66.99281311035156,
"learning_rate": 8.086e-06,
"loss": 5.5902,
"step": 1915
},
{
"epoch": 0.192,
"grad_norm": 56.23039245605469,
"learning_rate": 8.081000000000001e-06,
"loss": 5.7197,
"step": 1920
},
{
"epoch": 0.1925,
"grad_norm": 80.09093475341797,
"learning_rate": 8.076e-06,
"loss": 5.1211,
"step": 1925
},
{
"epoch": 0.193,
"grad_norm": 65.79684448242188,
"learning_rate": 8.071000000000001e-06,
"loss": 4.8171,
"step": 1930
},
{
"epoch": 0.1935,
"grad_norm": 69.53257751464844,
"learning_rate": 8.066e-06,
"loss": 4.4696,
"step": 1935
},
{
"epoch": 0.194,
"grad_norm": 61.77272033691406,
"learning_rate": 8.061000000000001e-06,
"loss": 4.1851,
"step": 1940
},
{
"epoch": 0.1945,
"grad_norm": 77.31800842285156,
"learning_rate": 8.056e-06,
"loss": 5.2775,
"step": 1945
},
{
"epoch": 0.195,
"grad_norm": 56.944435119628906,
"learning_rate": 8.051000000000001e-06,
"loss": 4.62,
"step": 1950
},
{
"epoch": 0.1955,
"grad_norm": 88.08844757080078,
"learning_rate": 8.046e-06,
"loss": 6.7407,
"step": 1955
},
{
"epoch": 0.196,
"grad_norm": 69.57597351074219,
"learning_rate": 8.041000000000001e-06,
"loss": 4.6671,
"step": 1960
},
{
"epoch": 0.1965,
"grad_norm": 65.85086822509766,
"learning_rate": 8.036e-06,
"loss": 4.2913,
"step": 1965
},
{
"epoch": 0.197,
"grad_norm": 71.38687133789062,
"learning_rate": 8.031000000000002e-06,
"loss": 4.5964,
"step": 1970
},
{
"epoch": 0.1975,
"grad_norm": 72.68925476074219,
"learning_rate": 8.026e-06,
"loss": 4.7652,
"step": 1975
},
{
"epoch": 0.198,
"grad_norm": 56.921199798583984,
"learning_rate": 8.021000000000002e-06,
"loss": 5.8899,
"step": 1980
},
{
"epoch": 0.1985,
"grad_norm": 60.6448860168457,
"learning_rate": 8.016e-06,
"loss": 4.4663,
"step": 1985
},
{
"epoch": 0.199,
"grad_norm": 77.55535888671875,
"learning_rate": 8.011000000000002e-06,
"loss": 4.9308,
"step": 1990
},
{
"epoch": 0.1995,
"grad_norm": 73.72390747070312,
"learning_rate": 8.006000000000001e-06,
"loss": 5.127,
"step": 1995
},
{
"epoch": 0.2,
"grad_norm": 80.28380584716797,
"learning_rate": 8.001000000000002e-06,
"loss": 5.1619,
"step": 2000
}
],
"logging_steps": 5,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}