{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 531.25,
"eval_steps": 500,
"global_step": 8500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.625,
"grad_norm": 0.9679650664329529,
"learning_rate": 1.8e-07,
"loss": 1.4147,
"step": 10
},
{
"epoch": 1.25,
"grad_norm": 0.9715470671653748,
"learning_rate": 3.8e-07,
"loss": 1.4102,
"step": 20
},
{
"epoch": 1.875,
"grad_norm": 0.9312183260917664,
"learning_rate": 5.8e-07,
"loss": 1.4117,
"step": 30
},
{
"epoch": 2.5,
"grad_norm": 0.9487130641937256,
"learning_rate": 7.8e-07,
"loss": 1.4086,
"step": 40
},
{
"epoch": 3.125,
"grad_norm": 0.8825913071632385,
"learning_rate": 9.8e-07,
"loss": 1.3962,
"step": 50
},
{
"epoch": 3.75,
"grad_norm": 0.8918140530586243,
"learning_rate": 1.18e-06,
"loss": 1.3851,
"step": 60
},
{
"epoch": 4.375,
"grad_norm": 0.8738917708396912,
"learning_rate": 1.3800000000000001e-06,
"loss": 1.3686,
"step": 70
},
{
"epoch": 5.0,
"grad_norm": 0.9906870126724243,
"learning_rate": 1.5800000000000003e-06,
"loss": 1.3532,
"step": 80
},
{
"epoch": 5.625,
"grad_norm": 1.1455429792404175,
"learning_rate": 1.7800000000000001e-06,
"loss": 1.3182,
"step": 90
},
{
"epoch": 6.25,
"grad_norm": 1.236258864402771,
"learning_rate": 1.98e-06,
"loss": 1.2925,
"step": 100
},
{
"epoch": 6.875,
"grad_norm": 1.327654242515564,
"learning_rate": 2.1800000000000003e-06,
"loss": 1.258,
"step": 110
},
{
"epoch": 7.5,
"grad_norm": 1.2924314737319946,
"learning_rate": 2.38e-06,
"loss": 1.2277,
"step": 120
},
{
"epoch": 8.125,
"grad_norm": 1.167385220527649,
"learning_rate": 2.5800000000000003e-06,
"loss": 1.1915,
"step": 130
},
{
"epoch": 8.75,
"grad_norm": 0.9448131322860718,
"learning_rate": 2.78e-06,
"loss": 1.1615,
"step": 140
},
{
"epoch": 9.375,
"grad_norm": 0.6841535568237305,
"learning_rate": 2.9800000000000003e-06,
"loss": 1.1314,
"step": 150
},
{
"epoch": 10.0,
"grad_norm": 0.41633787751197815,
"learning_rate": 3.1800000000000005e-06,
"loss": 1.1138,
"step": 160
},
{
"epoch": 10.625,
"grad_norm": 0.3072815537452698,
"learning_rate": 3.38e-06,
"loss": 1.1009,
"step": 170
},
{
"epoch": 11.25,
"grad_norm": 0.28385525941848755,
"learning_rate": 3.58e-06,
"loss": 1.0909,
"step": 180
},
{
"epoch": 11.875,
"grad_norm": 0.26593610644340515,
"learning_rate": 3.7800000000000002e-06,
"loss": 1.0844,
"step": 190
},
{
"epoch": 12.5,
"grad_norm": 0.2532358169555664,
"learning_rate": 3.98e-06,
"loss": 1.0767,
"step": 200
},
{
"epoch": 13.125,
"grad_norm": 0.2549743354320526,
"learning_rate": 4.18e-06,
"loss": 1.0693,
"step": 210
},
{
"epoch": 13.75,
"grad_norm": 0.22734442353248596,
"learning_rate": 4.38e-06,
"loss": 1.0659,
"step": 220
},
{
"epoch": 14.375,
"grad_norm": 0.22863343358039856,
"learning_rate": 4.58e-06,
"loss": 1.0584,
"step": 230
},
{
"epoch": 15.0,
"grad_norm": 0.22417008876800537,
"learning_rate": 4.780000000000001e-06,
"loss": 1.0538,
"step": 240
},
{
"epoch": 15.625,
"grad_norm": 0.20044955611228943,
"learning_rate": 4.98e-06,
"loss": 1.0501,
"step": 250
},
{
"epoch": 16.25,
"grad_norm": 0.20292679965496063,
"learning_rate": 5.18e-06,
"loss": 1.0471,
"step": 260
},
{
"epoch": 16.875,
"grad_norm": 0.18847720324993134,
"learning_rate": 5.38e-06,
"loss": 1.0408,
"step": 270
},
{
"epoch": 17.5,
"grad_norm": 0.16966596245765686,
"learning_rate": 5.580000000000001e-06,
"loss": 1.0372,
"step": 280
},
{
"epoch": 18.125,
"grad_norm": 0.189141184091568,
"learning_rate": 5.78e-06,
"loss": 1.0352,
"step": 290
},
{
"epoch": 18.75,
"grad_norm": 0.18442021310329437,
"learning_rate": 5.98e-06,
"loss": 1.0313,
"step": 300
},
{
"epoch": 19.375,
"grad_norm": 0.1687787026166916,
"learning_rate": 6.18e-06,
"loss": 1.0297,
"step": 310
},
{
"epoch": 20.0,
"grad_norm": 0.20706340670585632,
"learning_rate": 6.38e-06,
"loss": 1.0266,
"step": 320
},
{
"epoch": 20.625,
"grad_norm": 0.17096994817256927,
"learning_rate": 6.58e-06,
"loss": 1.0222,
"step": 330
},
{
"epoch": 21.25,
"grad_norm": 0.18136513233184814,
"learning_rate": 6.78e-06,
"loss": 1.0181,
"step": 340
},
{
"epoch": 21.875,
"grad_norm": 0.18053822219371796,
"learning_rate": 6.98e-06,
"loss": 1.0154,
"step": 350
},
{
"epoch": 22.5,
"grad_norm": 0.2154332548379898,
"learning_rate": 7.180000000000001e-06,
"loss": 1.0116,
"step": 360
},
{
"epoch": 23.125,
"grad_norm": 0.23920832574367523,
"learning_rate": 7.3800000000000005e-06,
"loss": 1.0056,
"step": 370
},
{
"epoch": 23.75,
"grad_norm": 0.24692080914974213,
"learning_rate": 7.580000000000001e-06,
"loss": 0.9982,
"step": 380
},
{
"epoch": 24.375,
"grad_norm": 0.32462307810783386,
"learning_rate": 7.78e-06,
"loss": 0.988,
"step": 390
},
{
"epoch": 25.0,
"grad_norm": 0.3690284788608551,
"learning_rate": 7.98e-06,
"loss": 0.9721,
"step": 400
},
{
"epoch": 25.625,
"grad_norm": 0.44107159972190857,
"learning_rate": 8.18e-06,
"loss": 0.9429,
"step": 410
},
{
"epoch": 26.25,
"grad_norm": 0.5854696035385132,
"learning_rate": 8.380000000000001e-06,
"loss": 0.8986,
"step": 420
},
{
"epoch": 26.875,
"grad_norm": 0.7961719632148743,
"learning_rate": 8.580000000000001e-06,
"loss": 0.8538,
"step": 430
},
{
"epoch": 27.5,
"grad_norm": 0.9634988307952881,
"learning_rate": 8.78e-06,
"loss": 0.8043,
"step": 440
},
{
"epoch": 28.125,
"grad_norm": 0.8321980834007263,
"learning_rate": 8.98e-06,
"loss": 0.7597,
"step": 450
},
{
"epoch": 28.75,
"grad_norm": 0.784283459186554,
"learning_rate": 9.180000000000002e-06,
"loss": 0.7166,
"step": 460
},
{
"epoch": 29.375,
"grad_norm": 0.8551204800605774,
"learning_rate": 9.38e-06,
"loss": 0.6809,
"step": 470
},
{
"epoch": 30.0,
"grad_norm": 0.774193525314331,
"learning_rate": 9.58e-06,
"loss": 0.6511,
"step": 480
},
{
"epoch": 30.625,
"grad_norm": 0.8994619846343994,
"learning_rate": 9.78e-06,
"loss": 0.6267,
"step": 490
},
{
"epoch": 31.25,
"grad_norm": 1.0884357690811157,
"learning_rate": 9.980000000000001e-06,
"loss": 0.6061,
"step": 500
},
{
"epoch": 31.875,
"grad_norm": 1.0175591707229614,
"learning_rate": 1.018e-05,
"loss": 0.5858,
"step": 510
},
{
"epoch": 32.5,
"grad_norm": 1.1407521963119507,
"learning_rate": 1.038e-05,
"loss": 0.5714,
"step": 520
},
{
"epoch": 33.125,
"grad_norm": 1.042667269706726,
"learning_rate": 1.058e-05,
"loss": 0.5586,
"step": 530
},
{
"epoch": 33.75,
"grad_norm": 0.9843167662620544,
"learning_rate": 1.0780000000000002e-05,
"loss": 0.5492,
"step": 540
},
{
"epoch": 34.375,
"grad_norm": 1.0415880680084229,
"learning_rate": 1.098e-05,
"loss": 0.5431,
"step": 550
},
{
"epoch": 35.0,
"grad_norm": 1.1198561191558838,
"learning_rate": 1.118e-05,
"loss": 0.5375,
"step": 560
},
{
"epoch": 35.625,
"grad_norm": 1.2082597017288208,
"learning_rate": 1.1380000000000001e-05,
"loss": 0.5338,
"step": 570
},
{
"epoch": 36.25,
"grad_norm": 1.2071870565414429,
"learning_rate": 1.1580000000000001e-05,
"loss": 0.5298,
"step": 580
},
{
"epoch": 36.875,
"grad_norm": 1.3737869262695312,
"learning_rate": 1.178e-05,
"loss": 0.5287,
"step": 590
},
{
"epoch": 37.5,
"grad_norm": 1.0583442449569702,
"learning_rate": 1.198e-05,
"loss": 0.5243,
"step": 600
},
{
"epoch": 38.125,
"grad_norm": 1.0685936212539673,
"learning_rate": 1.2180000000000002e-05,
"loss": 0.5229,
"step": 610
},
{
"epoch": 38.75,
"grad_norm": 1.2425378561019897,
"learning_rate": 1.238e-05,
"loss": 0.5215,
"step": 620
},
{
"epoch": 39.375,
"grad_norm": 1.1828131675720215,
"learning_rate": 1.258e-05,
"loss": 0.5194,
"step": 630
},
{
"epoch": 40.0,
"grad_norm": 1.0766721963882446,
"learning_rate": 1.278e-05,
"loss": 0.519,
"step": 640
},
{
"epoch": 40.625,
"grad_norm": 1.0481817722320557,
"learning_rate": 1.2980000000000001e-05,
"loss": 0.5154,
"step": 650
},
{
"epoch": 41.25,
"grad_norm": 1.0572658777236938,
"learning_rate": 1.3180000000000001e-05,
"loss": 0.5159,
"step": 660
},
{
"epoch": 41.875,
"grad_norm": 1.203803539276123,
"learning_rate": 1.338e-05,
"loss": 0.5121,
"step": 670
},
{
"epoch": 42.5,
"grad_norm": 0.872924268245697,
"learning_rate": 1.358e-05,
"loss": 0.512,
"step": 680
},
{
"epoch": 43.125,
"grad_norm": 1.162695050239563,
"learning_rate": 1.3780000000000002e-05,
"loss": 0.5064,
"step": 690
},
{
"epoch": 43.75,
"grad_norm": 1.1628501415252686,
"learning_rate": 1.3980000000000002e-05,
"loss": 0.5082,
"step": 700
},
{
"epoch": 44.375,
"grad_norm": 1.217319369316101,
"learning_rate": 1.4180000000000001e-05,
"loss": 0.5067,
"step": 710
},
{
"epoch": 45.0,
"grad_norm": 1.1384787559509277,
"learning_rate": 1.4380000000000001e-05,
"loss": 0.506,
"step": 720
},
{
"epoch": 45.625,
"grad_norm": 1.2870819568634033,
"learning_rate": 1.4580000000000003e-05,
"loss": 0.505,
"step": 730
},
{
"epoch": 46.25,
"grad_norm": 1.1514242887496948,
"learning_rate": 1.4779999999999999e-05,
"loss": 0.5064,
"step": 740
},
{
"epoch": 46.875,
"grad_norm": 1.144319772720337,
"learning_rate": 1.4979999999999999e-05,
"loss": 0.5017,
"step": 750
},
{
"epoch": 47.5,
"grad_norm": 1.22275710105896,
"learning_rate": 1.518e-05,
"loss": 0.5013,
"step": 760
},
{
"epoch": 48.125,
"grad_norm": 1.118972659111023,
"learning_rate": 1.538e-05,
"loss": 0.4997,
"step": 770
},
{
"epoch": 48.75,
"grad_norm": 1.051975965499878,
"learning_rate": 1.558e-05,
"loss": 0.5008,
"step": 780
},
{
"epoch": 49.375,
"grad_norm": 1.4525421857833862,
"learning_rate": 1.578e-05,
"loss": 0.4996,
"step": 790
},
{
"epoch": 50.0,
"grad_norm": 1.183190941810608,
"learning_rate": 1.598e-05,
"loss": 0.4963,
"step": 800
},
{
"epoch": 50.625,
"grad_norm": 1.2612169981002808,
"learning_rate": 1.618e-05,
"loss": 0.4974,
"step": 810
},
{
"epoch": 51.25,
"grad_norm": 0.9744483828544617,
"learning_rate": 1.6380000000000002e-05,
"loss": 0.4954,
"step": 820
},
{
"epoch": 51.875,
"grad_norm": 1.1227869987487793,
"learning_rate": 1.658e-05,
"loss": 0.4959,
"step": 830
},
{
"epoch": 52.5,
"grad_norm": 1.043039321899414,
"learning_rate": 1.6780000000000002e-05,
"loss": 0.4927,
"step": 840
},
{
"epoch": 53.125,
"grad_norm": 1.2920984029769897,
"learning_rate": 1.698e-05,
"loss": 0.4918,
"step": 850
},
{
"epoch": 53.75,
"grad_norm": 1.2774907350540161,
"learning_rate": 1.718e-05,
"loss": 0.4907,
"step": 860
},
{
"epoch": 54.375,
"grad_norm": 1.1010966300964355,
"learning_rate": 1.7380000000000003e-05,
"loss": 0.4914,
"step": 870
},
{
"epoch": 55.0,
"grad_norm": 1.0284886360168457,
"learning_rate": 1.758e-05,
"loss": 0.4873,
"step": 880
},
{
"epoch": 55.625,
"grad_norm": 1.3238409757614136,
"learning_rate": 1.7780000000000003e-05,
"loss": 0.4883,
"step": 890
},
{
"epoch": 56.25,
"grad_norm": 1.127139687538147,
"learning_rate": 1.798e-05,
"loss": 0.4871,
"step": 900
},
{
"epoch": 56.875,
"grad_norm": 1.3855187892913818,
"learning_rate": 1.818e-05,
"loss": 0.4828,
"step": 910
},
{
"epoch": 57.5,
"grad_norm": 1.536786437034607,
"learning_rate": 1.838e-05,
"loss": 0.4812,
"step": 920
},
{
"epoch": 58.125,
"grad_norm": 2.120702028274536,
"learning_rate": 1.858e-05,
"loss": 0.4776,
"step": 930
},
{
"epoch": 58.75,
"grad_norm": 1.6191856861114502,
"learning_rate": 1.878e-05,
"loss": 0.4761,
"step": 940
},
{
"epoch": 59.375,
"grad_norm": 1.6021475791931152,
"learning_rate": 1.898e-05,
"loss": 0.4743,
"step": 950
},
{
"epoch": 60.0,
"grad_norm": 1.604246735572815,
"learning_rate": 1.918e-05,
"loss": 0.4705,
"step": 960
},
{
"epoch": 60.625,
"grad_norm": 3.4168691635131836,
"learning_rate": 1.938e-05,
"loss": 0.4732,
"step": 970
},
{
"epoch": 61.25,
"grad_norm": 1.997071623802185,
"learning_rate": 1.9580000000000002e-05,
"loss": 0.4709,
"step": 980
},
{
"epoch": 61.875,
"grad_norm": 2.351092576980591,
"learning_rate": 1.978e-05,
"loss": 0.4678,
"step": 990
},
{
"epoch": 62.5,
"grad_norm": 1.9072397947311401,
"learning_rate": 1.9980000000000002e-05,
"loss": 0.469,
"step": 1000
},
{
"epoch": 63.125,
"grad_norm": 1.777255892753601,
"learning_rate": 2.0180000000000003e-05,
"loss": 0.4642,
"step": 1010
},
{
"epoch": 63.75,
"grad_norm": 1.3945808410644531,
"learning_rate": 2.038e-05,
"loss": 0.4613,
"step": 1020
},
{
"epoch": 64.375,
"grad_norm": 1.5424234867095947,
"learning_rate": 2.0580000000000003e-05,
"loss": 0.4621,
"step": 1030
},
{
"epoch": 65.0,
"grad_norm": 1.4399698972702026,
"learning_rate": 2.078e-05,
"loss": 0.4596,
"step": 1040
},
{
"epoch": 65.625,
"grad_norm": 1.5211741924285889,
"learning_rate": 2.098e-05,
"loss": 0.4571,
"step": 1050
},
{
"epoch": 66.25,
"grad_norm": 1.6879644393920898,
"learning_rate": 2.118e-05,
"loss": 0.4535,
"step": 1060
},
{
"epoch": 66.875,
"grad_norm": 1.7169924974441528,
"learning_rate": 2.138e-05,
"loss": 0.4506,
"step": 1070
},
{
"epoch": 67.5,
"grad_norm": 2.5419564247131348,
"learning_rate": 2.158e-05,
"loss": 0.4497,
"step": 1080
},
{
"epoch": 68.125,
"grad_norm": 1.9424971342086792,
"learning_rate": 2.178e-05,
"loss": 0.446,
"step": 1090
},
{
"epoch": 68.75,
"grad_norm": 4.030938148498535,
"learning_rate": 2.198e-05,
"loss": 0.4459,
"step": 1100
},
{
"epoch": 69.375,
"grad_norm": 2.3670308589935303,
"learning_rate": 2.218e-05,
"loss": 0.4454,
"step": 1110
},
{
"epoch": 70.0,
"grad_norm": 2.562795400619507,
"learning_rate": 2.2380000000000003e-05,
"loss": 0.4397,
"step": 1120
},
{
"epoch": 70.625,
"grad_norm": 2.4600791931152344,
"learning_rate": 2.258e-05,
"loss": 0.4359,
"step": 1130
},
{
"epoch": 71.25,
"grad_norm": 2.77167010307312,
"learning_rate": 2.2780000000000002e-05,
"loss": 0.4325,
"step": 1140
},
{
"epoch": 71.875,
"grad_norm": 3.150618314743042,
"learning_rate": 2.298e-05,
"loss": 0.4285,
"step": 1150
},
{
"epoch": 72.5,
"grad_norm": 2.5932984352111816,
"learning_rate": 2.318e-05,
"loss": 0.4247,
"step": 1160
},
{
"epoch": 73.125,
"grad_norm": 2.8484175205230713,
"learning_rate": 2.3380000000000003e-05,
"loss": 0.4199,
"step": 1170
},
{
"epoch": 73.75,
"grad_norm": 3.6509993076324463,
"learning_rate": 2.358e-05,
"loss": 0.4183,
"step": 1180
},
{
"epoch": 74.375,
"grad_norm": 5.74982213973999,
"learning_rate": 2.3780000000000003e-05,
"loss": 0.4146,
"step": 1190
},
{
"epoch": 75.0,
"grad_norm": 5.700360298156738,
"learning_rate": 2.398e-05,
"loss": 0.4322,
"step": 1200
},
{
"epoch": 75.625,
"grad_norm": 3.3000857830047607,
"learning_rate": 2.418e-05,
"loss": 0.4228,
"step": 1210
},
{
"epoch": 76.25,
"grad_norm": 2.5206165313720703,
"learning_rate": 2.438e-05,
"loss": 0.4072,
"step": 1220
},
{
"epoch": 76.875,
"grad_norm": 4.3755083084106445,
"learning_rate": 2.4580000000000002e-05,
"loss": 0.4025,
"step": 1230
},
{
"epoch": 77.5,
"grad_norm": 3.973033905029297,
"learning_rate": 2.478e-05,
"loss": 0.3953,
"step": 1240
},
{
"epoch": 78.125,
"grad_norm": 3.5819246768951416,
"learning_rate": 2.498e-05,
"loss": 0.3909,
"step": 1250
},
{
"epoch": 78.75,
"grad_norm": 3.183445930480957,
"learning_rate": 2.5180000000000003e-05,
"loss": 0.3852,
"step": 1260
},
{
"epoch": 79.375,
"grad_norm": 3.4347057342529297,
"learning_rate": 2.5380000000000004e-05,
"loss": 0.3785,
"step": 1270
},
{
"epoch": 80.0,
"grad_norm": 5.1768388748168945,
"learning_rate": 2.5580000000000002e-05,
"loss": 0.3717,
"step": 1280
},
{
"epoch": 80.625,
"grad_norm": 3.9438490867614746,
"learning_rate": 2.5779999999999997e-05,
"loss": 0.3681,
"step": 1290
},
{
"epoch": 81.25,
"grad_norm": 4.442440986633301,
"learning_rate": 2.598e-05,
"loss": 0.3571,
"step": 1300
},
{
"epoch": 81.875,
"grad_norm": 5.6028900146484375,
"learning_rate": 2.618e-05,
"loss": 0.3553,
"step": 1310
},
{
"epoch": 82.5,
"grad_norm": 5.537353038787842,
"learning_rate": 2.6379999999999998e-05,
"loss": 0.3515,
"step": 1320
},
{
"epoch": 83.125,
"grad_norm": 6.435239315032959,
"learning_rate": 2.658e-05,
"loss": 0.3486,
"step": 1330
},
{
"epoch": 83.75,
"grad_norm": 3.413828134536743,
"learning_rate": 2.678e-05,
"loss": 0.3373,
"step": 1340
},
{
"epoch": 84.375,
"grad_norm": 3.34212327003479,
"learning_rate": 2.698e-05,
"loss": 0.3297,
"step": 1350
},
{
"epoch": 85.0,
"grad_norm": 3.8223774433135986,
"learning_rate": 2.718e-05,
"loss": 0.322,
"step": 1360
},
{
"epoch": 85.625,
"grad_norm": 4.211275577545166,
"learning_rate": 2.738e-05,
"loss": 0.314,
"step": 1370
},
{
"epoch": 86.25,
"grad_norm": 5.035346984863281,
"learning_rate": 2.758e-05,
"loss": 0.3089,
"step": 1380
},
{
"epoch": 86.875,
"grad_norm": 5.702706813812256,
"learning_rate": 2.778e-05,
"loss": 0.3017,
"step": 1390
},
{
"epoch": 87.5,
"grad_norm": 3.9658496379852295,
"learning_rate": 2.798e-05,
"loss": 0.3002,
"step": 1400
},
{
"epoch": 88.125,
"grad_norm": 3.6658170223236084,
"learning_rate": 2.818e-05,
"loss": 0.2877,
"step": 1410
},
{
"epoch": 88.75,
"grad_norm": 4.639245986938477,
"learning_rate": 2.8380000000000003e-05,
"loss": 0.2773,
"step": 1420
},
{
"epoch": 89.375,
"grad_norm": 4.7095947265625,
"learning_rate": 2.858e-05,
"loss": 0.2739,
"step": 1430
},
{
"epoch": 90.0,
"grad_norm": 5.2461700439453125,
"learning_rate": 2.8780000000000002e-05,
"loss": 0.267,
"step": 1440
},
{
"epoch": 90.625,
"grad_norm": 9.186333656311035,
"learning_rate": 2.898e-05,
"loss": 0.2638,
"step": 1450
},
{
"epoch": 91.25,
"grad_norm": 5.92840051651001,
"learning_rate": 2.9180000000000002e-05,
"loss": 0.2639,
"step": 1460
},
{
"epoch": 91.875,
"grad_norm": 4.675635814666748,
"learning_rate": 2.9380000000000003e-05,
"loss": 0.2485,
"step": 1470
},
{
"epoch": 92.5,
"grad_norm": 4.761613845825195,
"learning_rate": 2.958e-05,
"loss": 0.2392,
"step": 1480
},
{
"epoch": 93.125,
"grad_norm": 5.566701889038086,
"learning_rate": 2.9780000000000003e-05,
"loss": 0.2341,
"step": 1490
},
{
"epoch": 93.75,
"grad_norm": 4.253500461578369,
"learning_rate": 2.998e-05,
"loss": 0.2222,
"step": 1500
},
{
"epoch": 94.375,
"grad_norm": 4.602695465087891,
"learning_rate": 3.0180000000000002e-05,
"loss": 0.2183,
"step": 1510
},
{
"epoch": 95.0,
"grad_norm": 5.938004493713379,
"learning_rate": 3.0380000000000004e-05,
"loss": 0.213,
"step": 1520
},
{
"epoch": 95.625,
"grad_norm": 6.689606189727783,
"learning_rate": 3.058e-05,
"loss": 0.2123,
"step": 1530
},
{
"epoch": 96.25,
"grad_norm": 5.87599515914917,
"learning_rate": 3.078e-05,
"loss": 0.2011,
"step": 1540
},
{
"epoch": 96.875,
"grad_norm": 5.971210956573486,
"learning_rate": 3.0980000000000005e-05,
"loss": 0.1966,
"step": 1550
},
{
"epoch": 97.5,
"grad_norm": 5.859025001525879,
"learning_rate": 3.118e-05,
"loss": 0.1946,
"step": 1560
},
{
"epoch": 98.125,
"grad_norm": 5.950936317443848,
"learning_rate": 3.138e-05,
"loss": 0.1814,
"step": 1570
},
{
"epoch": 98.75,
"grad_norm": 4.768659591674805,
"learning_rate": 3.1580000000000006e-05,
"loss": 0.1768,
"step": 1580
},
{
"epoch": 99.375,
"grad_norm": 5.677441596984863,
"learning_rate": 3.1780000000000004e-05,
"loss": 0.1727,
"step": 1590
},
{
"epoch": 100.0,
"grad_norm": 4.38816499710083,
"learning_rate": 3.198e-05,
"loss": 0.162,
"step": 1600
},
{
"epoch": 100.625,
"grad_norm": 4.571051120758057,
"learning_rate": 3.218e-05,
"loss": 0.1606,
"step": 1610
},
{
"epoch": 101.25,
"grad_norm": 6.406070709228516,
"learning_rate": 3.238e-05,
"loss": 0.1494,
"step": 1620
},
{
"epoch": 101.875,
"grad_norm": 4.923046588897705,
"learning_rate": 3.2579999999999996e-05,
"loss": 0.1479,
"step": 1630
},
{
"epoch": 102.5,
"grad_norm": 6.735141277313232,
"learning_rate": 3.278e-05,
"loss": 0.1468,
"step": 1640
},
{
"epoch": 103.125,
"grad_norm": 5.201277732849121,
"learning_rate": 3.298e-05,
"loss": 0.137,
"step": 1650
},
{
"epoch": 103.75,
"grad_norm": 5.127175331115723,
"learning_rate": 3.318e-05,
"loss": 0.1377,
"step": 1660
},
{
"epoch": 104.375,
"grad_norm": 5.3002471923828125,
"learning_rate": 3.338e-05,
"loss": 0.1277,
"step": 1670
},
{
"epoch": 105.0,
"grad_norm": 5.290287017822266,
"learning_rate": 3.358e-05,
"loss": 0.1224,
"step": 1680
},
{
"epoch": 105.625,
"grad_norm": 6.6339311599731445,
"learning_rate": 3.378e-05,
"loss": 0.121,
"step": 1690
},
{
"epoch": 106.25,
"grad_norm": 6.3351826667785645,
"learning_rate": 3.398e-05,
"loss": 0.1202,
"step": 1700
},
{
"epoch": 106.875,
"grad_norm": 6.30771017074585,
"learning_rate": 3.418e-05,
"loss": 0.1182,
"step": 1710
},
{
"epoch": 107.5,
"grad_norm": 5.454580307006836,
"learning_rate": 3.438e-05,
"loss": 0.1132,
"step": 1720
},
{
"epoch": 108.125,
"grad_norm": 5.418821811676025,
"learning_rate": 3.4580000000000004e-05,
"loss": 0.1064,
"step": 1730
},
{
"epoch": 108.75,
"grad_norm": 4.996350288391113,
"learning_rate": 3.478e-05,
"loss": 0.0993,
"step": 1740
},
{
"epoch": 109.375,
"grad_norm": 6.961830139160156,
"learning_rate": 3.498e-05,
"loss": 0.0994,
"step": 1750
},
{
"epoch": 110.0,
"grad_norm": 6.240096569061279,
"learning_rate": 3.518e-05,
"loss": 0.1032,
"step": 1760
},
{
"epoch": 110.625,
"grad_norm": 5.9896111488342285,
"learning_rate": 3.5380000000000003e-05,
"loss": 0.0931,
"step": 1770
},
{
"epoch": 111.25,
"grad_norm": 4.381375312805176,
"learning_rate": 3.558e-05,
"loss": 0.0899,
"step": 1780
},
{
"epoch": 111.875,
"grad_norm": 4.847299098968506,
"learning_rate": 3.578e-05,
"loss": 0.0845,
"step": 1790
},
{
"epoch": 112.5,
"grad_norm": 6.442471981048584,
"learning_rate": 3.5980000000000004e-05,
"loss": 0.0903,
"step": 1800
},
{
"epoch": 113.125,
"grad_norm": 6.36176872253418,
"learning_rate": 3.618e-05,
"loss": 0.0871,
"step": 1810
},
{
"epoch": 113.75,
"grad_norm": 5.581753730773926,
"learning_rate": 3.638e-05,
"loss": 0.0832,
"step": 1820
},
{
"epoch": 114.375,
"grad_norm": 5.413025379180908,
"learning_rate": 3.6580000000000006e-05,
"loss": 0.0894,
"step": 1830
},
{
"epoch": 115.0,
"grad_norm": 3.545975685119629,
"learning_rate": 3.6780000000000004e-05,
"loss": 0.0816,
"step": 1840
},
{
"epoch": 115.625,
"grad_norm": 5.415703773498535,
"learning_rate": 3.698e-05,
"loss": 0.0847,
"step": 1850
},
{
"epoch": 116.25,
"grad_norm": 5.82867956161499,
"learning_rate": 3.7180000000000007e-05,
"loss": 0.0781,
"step": 1860
},
{
"epoch": 116.875,
"grad_norm": 6.405238151550293,
"learning_rate": 3.7380000000000005e-05,
"loss": 0.082,
"step": 1870
},
{
"epoch": 117.5,
"grad_norm": 3.8289811611175537,
"learning_rate": 3.758e-05,
"loss": 0.0766,
"step": 1880
},
{
"epoch": 118.125,
"grad_norm": 4.225410461425781,
"learning_rate": 3.778000000000001e-05,
"loss": 0.0753,
"step": 1890
},
{
"epoch": 118.75,
"grad_norm": 3.565117120742798,
"learning_rate": 3.7980000000000006e-05,
"loss": 0.0714,
"step": 1900
},
{
"epoch": 119.375,
"grad_norm": 4.679031848907471,
"learning_rate": 3.818e-05,
"loss": 0.0727,
"step": 1910
},
{
"epoch": 120.0,
"grad_norm": 3.9762325286865234,
"learning_rate": 3.838e-05,
"loss": 0.0746,
"step": 1920
},
{
"epoch": 120.625,
"grad_norm": 5.354043960571289,
"learning_rate": 3.858e-05,
"loss": 0.0715,
"step": 1930
},
{
"epoch": 121.25,
"grad_norm": 4.858035564422607,
"learning_rate": 3.878e-05,
"loss": 0.0736,
"step": 1940
},
{
"epoch": 121.875,
"grad_norm": 5.547657012939453,
"learning_rate": 3.898e-05,
"loss": 0.0747,
"step": 1950
},
{
"epoch": 122.5,
"grad_norm": 4.109276294708252,
"learning_rate": 3.918e-05,
"loss": 0.0671,
"step": 1960
},
{
"epoch": 123.125,
"grad_norm": 2.984168291091919,
"learning_rate": 3.938e-05,
"loss": 0.0675,
"step": 1970
},
{
"epoch": 123.75,
"grad_norm": 3.5227620601654053,
"learning_rate": 3.958e-05,
"loss": 0.0704,
"step": 1980
},
{
"epoch": 124.375,
"grad_norm": 4.113119125366211,
"learning_rate": 3.978e-05,
"loss": 0.0684,
"step": 1990
},
{
"epoch": 125.0,
"grad_norm": 4.130417823791504,
"learning_rate": 3.998e-05,
"loss": 0.0696,
"step": 2000
},
{
"epoch": 125.625,
"grad_norm": 4.076992034912109,
"learning_rate": 4.018e-05,
"loss": 0.0648,
"step": 2010
},
{
"epoch": 126.25,
"grad_norm": 3.623624801635742,
"learning_rate": 4.038e-05,
"loss": 0.0634,
"step": 2020
},
{
"epoch": 126.875,
"grad_norm": 3.0136911869049072,
"learning_rate": 4.058e-05,
"loss": 0.0619,
"step": 2030
},
{
"epoch": 127.5,
"grad_norm": 2.3207767009735107,
"learning_rate": 4.078e-05,
"loss": 0.0602,
"step": 2040
},
{
"epoch": 128.125,
"grad_norm": 6.006433963775635,
"learning_rate": 4.0980000000000004e-05,
"loss": 0.0618,
"step": 2050
},
{
"epoch": 128.75,
"grad_norm": 4.211705684661865,
"learning_rate": 4.118e-05,
"loss": 0.0614,
"step": 2060
},
{
"epoch": 129.375,
"grad_norm": 3.0991692543029785,
"learning_rate": 4.138e-05,
"loss": 0.0596,
"step": 2070
},
{
"epoch": 130.0,
"grad_norm": 3.5333359241485596,
"learning_rate": 4.1580000000000005e-05,
"loss": 0.0594,
"step": 2080
},
{
"epoch": 130.625,
"grad_norm": 2.464125394821167,
"learning_rate": 4.178e-05,
"loss": 0.0579,
"step": 2090
},
{
"epoch": 131.25,
"grad_norm": 3.499553680419922,
"learning_rate": 4.198e-05,
"loss": 0.058,
"step": 2100
},
{
"epoch": 131.875,
"grad_norm": 4.001912593841553,
"learning_rate": 4.2180000000000006e-05,
"loss": 0.0585,
"step": 2110
},
{
"epoch": 132.5,
"grad_norm": 3.2040934562683105,
"learning_rate": 4.2380000000000004e-05,
"loss": 0.0597,
"step": 2120
},
{
"epoch": 133.125,
"grad_norm": 3.650881767272949,
"learning_rate": 4.258e-05,
"loss": 0.0594,
"step": 2130
},
{
"epoch": 133.75,
"grad_norm": 3.5435853004455566,
"learning_rate": 4.278e-05,
"loss": 0.0577,
"step": 2140
},
{
"epoch": 134.375,
"grad_norm": 3.881361484527588,
"learning_rate": 4.2980000000000005e-05,
"loss": 0.0586,
"step": 2150
},
{
"epoch": 135.0,
"grad_norm": 3.0621840953826904,
"learning_rate": 4.318e-05,
"loss": 0.0562,
"step": 2160
},
{
"epoch": 135.625,
"grad_norm": 3.5643880367279053,
"learning_rate": 4.338e-05,
"loss": 0.0573,
"step": 2170
},
{
"epoch": 136.25,
"grad_norm": 3.4029245376586914,
"learning_rate": 4.3580000000000006e-05,
"loss": 0.0538,
"step": 2180
},
{
"epoch": 136.875,
"grad_norm": 2.941638469696045,
"learning_rate": 4.3780000000000004e-05,
"loss": 0.0518,
"step": 2190
},
{
"epoch": 137.5,
"grad_norm": 3.19802188873291,
"learning_rate": 4.398e-05,
"loss": 0.0578,
"step": 2200
},
{
"epoch": 138.125,
"grad_norm": 3.2176332473754883,
"learning_rate": 4.418000000000001e-05,
"loss": 0.0553,
"step": 2210
},
{
"epoch": 138.75,
"grad_norm": 3.1325228214263916,
"learning_rate": 4.438e-05,
"loss": 0.0544,
"step": 2220
},
{
"epoch": 139.375,
"grad_norm": 2.721820116043091,
"learning_rate": 4.458e-05,
"loss": 0.0537,
"step": 2230
},
{
"epoch": 140.0,
"grad_norm": 2.2297821044921875,
"learning_rate": 4.478e-05,
"loss": 0.0536,
"step": 2240
},
{
"epoch": 140.625,
"grad_norm": 3.736509323120117,
"learning_rate": 4.498e-05,
"loss": 0.0568,
"step": 2250
},
{
"epoch": 141.25,
"grad_norm": 3.414687156677246,
"learning_rate": 4.518e-05,
"loss": 0.0535,
"step": 2260
},
{
"epoch": 141.875,
"grad_norm": 3.533870220184326,
"learning_rate": 4.538e-05,
"loss": 0.0528,
"step": 2270
},
{
"epoch": 142.5,
"grad_norm": 2.922818422317505,
"learning_rate": 4.558e-05,
"loss": 0.0509,
"step": 2280
},
{
"epoch": 143.125,
"grad_norm": 3.248502731323242,
"learning_rate": 4.578e-05,
"loss": 0.0499,
"step": 2290
},
{
"epoch": 143.75,
"grad_norm": 2.737330913543701,
"learning_rate": 4.5980000000000004e-05,
"loss": 0.0504,
"step": 2300
},
{
"epoch": 144.375,
"grad_norm": 2.7490787506103516,
"learning_rate": 4.618e-05,
"loss": 0.0494,
"step": 2310
},
{
"epoch": 145.0,
"grad_norm": 3.3917601108551025,
"learning_rate": 4.638e-05,
"loss": 0.0529,
"step": 2320
},
{
"epoch": 145.625,
"grad_norm": 3.115227699279785,
"learning_rate": 4.6580000000000005e-05,
"loss": 0.0487,
"step": 2330
},
{
"epoch": 146.25,
"grad_norm": 3.6642770767211914,
"learning_rate": 4.678e-05,
"loss": 0.0511,
"step": 2340
},
{
"epoch": 146.875,
"grad_norm": 3.4796688556671143,
"learning_rate": 4.698e-05,
"loss": 0.048,
"step": 2350
},
{
"epoch": 147.5,
"grad_norm": 2.7523436546325684,
"learning_rate": 4.718e-05,
"loss": 0.0478,
"step": 2360
},
{
"epoch": 148.125,
"grad_norm": 3.309631824493408,
"learning_rate": 4.7380000000000004e-05,
"loss": 0.0489,
"step": 2370
},
{
"epoch": 148.75,
"grad_norm": 3.5280392169952393,
"learning_rate": 4.758e-05,
"loss": 0.045,
"step": 2380
},
{
"epoch": 149.375,
"grad_norm": 3.055738925933838,
"learning_rate": 4.778e-05,
"loss": 0.0443,
"step": 2390
},
{
"epoch": 150.0,
"grad_norm": 2.935150146484375,
"learning_rate": 4.7980000000000005e-05,
"loss": 0.047,
"step": 2400
},
{
"epoch": 150.625,
"grad_norm": 3.540233612060547,
"learning_rate": 4.818e-05,
"loss": 0.0483,
"step": 2410
},
{
"epoch": 151.25,
"grad_norm": 3.3195087909698486,
"learning_rate": 4.838e-05,
"loss": 0.0461,
"step": 2420
},
{
"epoch": 151.875,
"grad_norm": 3.5009474754333496,
"learning_rate": 4.8580000000000006e-05,
"loss": 0.0478,
"step": 2430
},
{
"epoch": 152.5,
"grad_norm": 3.110968589782715,
"learning_rate": 4.8780000000000004e-05,
"loss": 0.0476,
"step": 2440
},
{
"epoch": 153.125,
"grad_norm": 2.5114879608154297,
"learning_rate": 4.898e-05,
"loss": 0.0457,
"step": 2450
},
{
"epoch": 153.75,
"grad_norm": 2.591670513153076,
"learning_rate": 4.918000000000001e-05,
"loss": 0.0425,
"step": 2460
},
{
"epoch": 154.375,
"grad_norm": 2.149576187133789,
"learning_rate": 4.9380000000000005e-05,
"loss": 0.0432,
"step": 2470
},
{
"epoch": 155.0,
"grad_norm": 2.866494655609131,
"learning_rate": 4.958e-05,
"loss": 0.047,
"step": 2480
},
{
"epoch": 155.625,
"grad_norm": 3.465266227722168,
"learning_rate": 4.978e-05,
"loss": 0.0458,
"step": 2490
},
{
"epoch": 156.25,
"grad_norm": 2.855782985687256,
"learning_rate": 4.9980000000000006e-05,
"loss": 0.043,
"step": 2500
},
{
"epoch": 156.875,
"grad_norm": 2.906052350997925,
"learning_rate": 5.0180000000000004e-05,
"loss": 0.042,
"step": 2510
},
{
"epoch": 157.5,
"grad_norm": 3.16371488571167,
"learning_rate": 5.038e-05,
"loss": 0.0427,
"step": 2520
},
{
"epoch": 158.125,
"grad_norm": 2.54278826713562,
"learning_rate": 5.058000000000001e-05,
"loss": 0.0433,
"step": 2530
},
{
"epoch": 158.75,
"grad_norm": 2.1072380542755127,
"learning_rate": 5.0780000000000005e-05,
"loss": 0.0378,
"step": 2540
},
{
"epoch": 159.375,
"grad_norm": 2.749347448348999,
"learning_rate": 5.098e-05,
"loss": 0.0412,
"step": 2550
},
{
"epoch": 160.0,
"grad_norm": 3.022982120513916,
"learning_rate": 5.118000000000001e-05,
"loss": 0.0413,
"step": 2560
},
{
"epoch": 160.625,
"grad_norm": 3.038039207458496,
"learning_rate": 5.1380000000000006e-05,
"loss": 0.0418,
"step": 2570
},
{
"epoch": 161.25,
"grad_norm": 2.538886070251465,
"learning_rate": 5.1580000000000004e-05,
"loss": 0.0407,
"step": 2580
},
{
"epoch": 161.875,
"grad_norm": 2.79771089553833,
"learning_rate": 5.178000000000001e-05,
"loss": 0.0421,
"step": 2590
},
{
"epoch": 162.5,
"grad_norm": 2.6892521381378174,
"learning_rate": 5.198000000000001e-05,
"loss": 0.0408,
"step": 2600
},
{
"epoch": 163.125,
"grad_norm": 2.829843759536743,
"learning_rate": 5.2180000000000005e-05,
"loss": 0.0386,
"step": 2610
},
{
"epoch": 163.75,
"grad_norm": 2.272169828414917,
"learning_rate": 5.238000000000001e-05,
"loss": 0.0392,
"step": 2620
},
{
"epoch": 164.375,
"grad_norm": 2.683228015899658,
"learning_rate": 5.258000000000001e-05,
"loss": 0.0403,
"step": 2630
},
{
"epoch": 165.0,
"grad_norm": 2.4979324340820312,
"learning_rate": 5.2780000000000006e-05,
"loss": 0.0412,
"step": 2640
},
{
"epoch": 165.625,
"grad_norm": 2.7030258178710938,
"learning_rate": 5.2980000000000004e-05,
"loss": 0.0386,
"step": 2650
},
{
"epoch": 166.25,
"grad_norm": 2.9168074131011963,
"learning_rate": 5.318000000000001e-05,
"loss": 0.041,
"step": 2660
},
{
"epoch": 166.875,
"grad_norm": 2.2548749446868896,
"learning_rate": 5.338000000000001e-05,
"loss": 0.0386,
"step": 2670
},
{
"epoch": 167.5,
"grad_norm": 2.6179001331329346,
"learning_rate": 5.3580000000000005e-05,
"loss": 0.0399,
"step": 2680
},
{
"epoch": 168.125,
"grad_norm": 3.1817469596862793,
"learning_rate": 5.378e-05,
"loss": 0.0391,
"step": 2690
},
{
"epoch": 168.75,
"grad_norm": 2.606260061264038,
"learning_rate": 5.3979999999999995e-05,
"loss": 0.0358,
"step": 2700
},
{
"epoch": 169.375,
"grad_norm": 2.6046321392059326,
"learning_rate": 5.418e-05,
"loss": 0.0366,
"step": 2710
},
{
"epoch": 170.0,
"grad_norm": 2.150594711303711,
"learning_rate": 5.438e-05,
"loss": 0.0366,
"step": 2720
},
{
"epoch": 170.625,
"grad_norm": 1.9119679927825928,
"learning_rate": 5.4579999999999996e-05,
"loss": 0.0359,
"step": 2730
},
{
"epoch": 171.25,
"grad_norm": 2.6968297958374023,
"learning_rate": 5.478e-05,
"loss": 0.0358,
"step": 2740
},
{
"epoch": 171.875,
"grad_norm": 2.433364152908325,
"learning_rate": 5.498e-05,
"loss": 0.0396,
"step": 2750
},
{
"epoch": 172.5,
"grad_norm": 2.7723114490509033,
"learning_rate": 5.518e-05,
"loss": 0.0369,
"step": 2760
},
{
"epoch": 173.125,
"grad_norm": 1.9324524402618408,
"learning_rate": 5.538e-05,
"loss": 0.0366,
"step": 2770
},
{
"epoch": 173.75,
"grad_norm": 2.4898505210876465,
"learning_rate": 5.558e-05,
"loss": 0.0357,
"step": 2780
},
{
"epoch": 174.375,
"grad_norm": 3.377042293548584,
"learning_rate": 5.578e-05,
"loss": 0.0356,
"step": 2790
},
{
"epoch": 175.0,
"grad_norm": 2.3189809322357178,
"learning_rate": 5.5979999999999996e-05,
"loss": 0.0383,
"step": 2800
},
{
"epoch": 175.625,
"grad_norm": 2.4106035232543945,
"learning_rate": 5.618e-05,
"loss": 0.0377,
"step": 2810
},
{
"epoch": 176.25,
"grad_norm": 2.3675427436828613,
"learning_rate": 5.638e-05,
"loss": 0.034,
"step": 2820
},
{
"epoch": 176.875,
"grad_norm": 2.3263936042785645,
"learning_rate": 5.658e-05,
"loss": 0.0329,
"step": 2830
},
{
"epoch": 177.5,
"grad_norm": 2.6326184272766113,
"learning_rate": 5.678e-05,
"loss": 0.0372,
"step": 2840
},
{
"epoch": 178.125,
"grad_norm": 2.5026683807373047,
"learning_rate": 5.698e-05,
"loss": 0.0384,
"step": 2850
},
{
"epoch": 178.75,
"grad_norm": 2.7007641792297363,
"learning_rate": 5.718e-05,
"loss": 0.0345,
"step": 2860
},
{
"epoch": 179.375,
"grad_norm": 2.948171854019165,
"learning_rate": 5.738e-05,
"loss": 0.0371,
"step": 2870
},
{
"epoch": 180.0,
"grad_norm": 2.368053674697876,
"learning_rate": 5.758e-05,
"loss": 0.0358,
"step": 2880
},
{
"epoch": 180.625,
"grad_norm": 2.625312328338623,
"learning_rate": 5.778e-05,
"loss": 0.0382,
"step": 2890
},
{
"epoch": 181.25,
"grad_norm": 2.2241172790527344,
"learning_rate": 5.7980000000000004e-05,
"loss": 0.0346,
"step": 2900
},
{
"epoch": 181.875,
"grad_norm": 2.2202515602111816,
"learning_rate": 5.818e-05,
"loss": 0.0391,
"step": 2910
},
{
"epoch": 182.5,
"grad_norm": 2.5838396549224854,
"learning_rate": 5.838e-05,
"loss": 0.0332,
"step": 2920
},
{
"epoch": 183.125,
"grad_norm": 2.4340357780456543,
"learning_rate": 5.858e-05,
"loss": 0.0341,
"step": 2930
},
{
"epoch": 183.75,
"grad_norm": 3.3191001415252686,
"learning_rate": 5.878e-05,
"loss": 0.0372,
"step": 2940
},
{
"epoch": 184.375,
"grad_norm": 2.798825263977051,
"learning_rate": 5.898e-05,
"loss": 0.0344,
"step": 2950
},
{
"epoch": 185.0,
"grad_norm": 2.0992839336395264,
"learning_rate": 5.918e-05,
"loss": 0.0364,
"step": 2960
},
{
"epoch": 185.625,
"grad_norm": 2.3140695095062256,
"learning_rate": 5.9380000000000004e-05,
"loss": 0.0345,
"step": 2970
},
{
"epoch": 186.25,
"grad_norm": 2.1252496242523193,
"learning_rate": 5.958e-05,
"loss": 0.0341,
"step": 2980
},
{
"epoch": 186.875,
"grad_norm": 1.9925975799560547,
"learning_rate": 5.978e-05,
"loss": 0.0371,
"step": 2990
},
{
"epoch": 187.5,
"grad_norm": 1.8534867763519287,
"learning_rate": 5.9980000000000005e-05,
"loss": 0.0324,
"step": 3000
},
{
"epoch": 188.125,
"grad_norm": 1.8940081596374512,
"learning_rate": 6.018e-05,
"loss": 0.0313,
"step": 3010
},
{
"epoch": 188.75,
"grad_norm": 3.098815679550171,
"learning_rate": 6.038e-05,
"loss": 0.0316,
"step": 3020
},
{
"epoch": 189.375,
"grad_norm": 2.562849521636963,
"learning_rate": 6.0580000000000006e-05,
"loss": 0.034,
"step": 3030
},
{
"epoch": 190.0,
"grad_norm": 2.3118202686309814,
"learning_rate": 6.0780000000000004e-05,
"loss": 0.0324,
"step": 3040
},
{
"epoch": 190.625,
"grad_norm": 1.8349565267562866,
"learning_rate": 6.098e-05,
"loss": 0.0316,
"step": 3050
},
{
"epoch": 191.25,
"grad_norm": 2.3919525146484375,
"learning_rate": 6.118000000000001e-05,
"loss": 0.0341,
"step": 3060
},
{
"epoch": 191.875,
"grad_norm": 2.795734405517578,
"learning_rate": 6.138e-05,
"loss": 0.0321,
"step": 3070
},
{
"epoch": 192.5,
"grad_norm": 2.4285318851470947,
"learning_rate": 6.158e-05,
"loss": 0.0338,
"step": 3080
},
{
"epoch": 193.125,
"grad_norm": 2.724107265472412,
"learning_rate": 6.178000000000001e-05,
"loss": 0.0325,
"step": 3090
},
{
"epoch": 193.75,
"grad_norm": 2.212014675140381,
"learning_rate": 6.198e-05,
"loss": 0.0297,
"step": 3100
},
{
"epoch": 194.375,
"grad_norm": 1.8803651332855225,
"learning_rate": 6.218e-05,
"loss": 0.0298,
"step": 3110
},
{
"epoch": 195.0,
"grad_norm": 1.7469961643218994,
"learning_rate": 6.238000000000001e-05,
"loss": 0.0291,
"step": 3120
},
{
"epoch": 195.625,
"grad_norm": 2.5273945331573486,
"learning_rate": 6.258e-05,
"loss": 0.0309,
"step": 3130
},
{
"epoch": 196.25,
"grad_norm": 2.398287773132324,
"learning_rate": 6.278e-05,
"loss": 0.0317,
"step": 3140
},
{
"epoch": 196.875,
"grad_norm": 1.9407683610916138,
"learning_rate": 6.298000000000001e-05,
"loss": 0.0299,
"step": 3150
},
{
"epoch": 197.5,
"grad_norm": 1.6159769296646118,
"learning_rate": 6.318e-05,
"loss": 0.0286,
"step": 3160
},
{
"epoch": 198.125,
"grad_norm": 2.744300603866577,
"learning_rate": 6.338e-05,
"loss": 0.0303,
"step": 3170
},
{
"epoch": 198.75,
"grad_norm": 2.6293482780456543,
"learning_rate": 6.358000000000001e-05,
"loss": 0.0328,
"step": 3180
},
{
"epoch": 199.375,
"grad_norm": 2.2811481952667236,
"learning_rate": 6.378e-05,
"loss": 0.0328,
"step": 3190
},
{
"epoch": 200.0,
"grad_norm": 2.951794385910034,
"learning_rate": 6.398000000000001e-05,
"loss": 0.033,
"step": 3200
},
{
"epoch": 200.625,
"grad_norm": 2.3573927879333496,
"learning_rate": 6.418000000000001e-05,
"loss": 0.0312,
"step": 3210
},
{
"epoch": 201.25,
"grad_norm": 2.088592529296875,
"learning_rate": 6.438e-05,
"loss": 0.0314,
"step": 3220
},
{
"epoch": 201.875,
"grad_norm": 2.646054983139038,
"learning_rate": 6.458000000000001e-05,
"loss": 0.0295,
"step": 3230
},
{
"epoch": 202.5,
"grad_norm": 2.5917739868164062,
"learning_rate": 6.478000000000001e-05,
"loss": 0.032,
"step": 3240
},
{
"epoch": 203.125,
"grad_norm": 2.122236490249634,
"learning_rate": 6.498e-05,
"loss": 0.0306,
"step": 3250
},
{
"epoch": 203.75,
"grad_norm": 2.2258174419403076,
"learning_rate": 6.518000000000001e-05,
"loss": 0.0289,
"step": 3260
},
{
"epoch": 204.375,
"grad_norm": 2.1164627075195312,
"learning_rate": 6.538000000000001e-05,
"loss": 0.0298,
"step": 3270
},
{
"epoch": 205.0,
"grad_norm": 2.397019386291504,
"learning_rate": 6.558e-05,
"loss": 0.0298,
"step": 3280
},
{
"epoch": 205.625,
"grad_norm": 2.260453701019287,
"learning_rate": 6.578000000000001e-05,
"loss": 0.0279,
"step": 3290
},
{
"epoch": 206.25,
"grad_norm": 2.1338107585906982,
"learning_rate": 6.598e-05,
"loss": 0.0292,
"step": 3300
},
{
"epoch": 206.875,
"grad_norm": 1.875387191772461,
"learning_rate": 6.618e-05,
"loss": 0.0276,
"step": 3310
},
{
"epoch": 207.5,
"grad_norm": 1.619683027267456,
"learning_rate": 6.638e-05,
"loss": 0.0286,
"step": 3320
},
{
"epoch": 208.125,
"grad_norm": 2.5062685012817383,
"learning_rate": 6.658e-05,
"loss": 0.031,
"step": 3330
},
{
"epoch": 208.75,
"grad_norm": 2.3004539012908936,
"learning_rate": 6.678e-05,
"loss": 0.0305,
"step": 3340
},
{
"epoch": 209.375,
"grad_norm": 2.2835469245910645,
"learning_rate": 6.698e-05,
"loss": 0.0281,
"step": 3350
},
{
"epoch": 210.0,
"grad_norm": 2.0576257705688477,
"learning_rate": 6.718e-05,
"loss": 0.0318,
"step": 3360
},
{
"epoch": 210.625,
"grad_norm": 2.0494043827056885,
"learning_rate": 6.738e-05,
"loss": 0.0284,
"step": 3370
},
{
"epoch": 211.25,
"grad_norm": 1.5460221767425537,
"learning_rate": 6.758e-05,
"loss": 0.0274,
"step": 3380
},
{
"epoch": 211.875,
"grad_norm": 2.422177791595459,
"learning_rate": 6.778e-05,
"loss": 0.0287,
"step": 3390
},
{
"epoch": 212.5,
"grad_norm": 2.38964581489563,
"learning_rate": 6.798e-05,
"loss": 0.0329,
"step": 3400
},
{
"epoch": 213.125,
"grad_norm": 2.0634000301361084,
"learning_rate": 6.818e-05,
"loss": 0.0273,
"step": 3410
},
{
"epoch": 213.75,
"grad_norm": 2.5334651470184326,
"learning_rate": 6.838e-05,
"loss": 0.0312,
"step": 3420
},
{
"epoch": 214.375,
"grad_norm": 2.527052402496338,
"learning_rate": 6.858e-05,
"loss": 0.0297,
"step": 3430
},
{
"epoch": 215.0,
"grad_norm": 2.3704299926757812,
"learning_rate": 6.878e-05,
"loss": 0.0313,
"step": 3440
},
{
"epoch": 215.625,
"grad_norm": 1.926483154296875,
"learning_rate": 6.898e-05,
"loss": 0.0256,
"step": 3450
},
{
"epoch": 216.25,
"grad_norm": 1.56046724319458,
"learning_rate": 6.918e-05,
"loss": 0.0278,
"step": 3460
},
{
"epoch": 216.875,
"grad_norm": 1.8307677507400513,
"learning_rate": 6.938e-05,
"loss": 0.0269,
"step": 3470
},
{
"epoch": 217.5,
"grad_norm": 1.9908180236816406,
"learning_rate": 6.958e-05,
"loss": 0.0276,
"step": 3480
},
{
"epoch": 218.125,
"grad_norm": 2.067988395690918,
"learning_rate": 6.978e-05,
"loss": 0.0268,
"step": 3490
},
{
"epoch": 218.75,
"grad_norm": 1.8545929193496704,
"learning_rate": 6.998e-05,
"loss": 0.0268,
"step": 3500
},
{
"epoch": 219.375,
"grad_norm": 2.052927017211914,
"learning_rate": 7.018e-05,
"loss": 0.0253,
"step": 3510
},
{
"epoch": 220.0,
"grad_norm": 2.1113545894622803,
"learning_rate": 7.038e-05,
"loss": 0.0245,
"step": 3520
},
{
"epoch": 220.625,
"grad_norm": 1.541675329208374,
"learning_rate": 7.058e-05,
"loss": 0.0253,
"step": 3530
},
{
"epoch": 221.25,
"grad_norm": 1.7272151708602905,
"learning_rate": 7.078e-05,
"loss": 0.0251,
"step": 3540
},
{
"epoch": 221.875,
"grad_norm": 1.7178980112075806,
"learning_rate": 7.098e-05,
"loss": 0.026,
"step": 3550
},
{
"epoch": 222.5,
"grad_norm": 2.246424913406372,
"learning_rate": 7.118e-05,
"loss": 0.0267,
"step": 3560
},
{
"epoch": 223.125,
"grad_norm": 1.9230071306228638,
"learning_rate": 7.138e-05,
"loss": 0.0268,
"step": 3570
},
{
"epoch": 223.75,
"grad_norm": 1.9361920356750488,
"learning_rate": 7.158e-05,
"loss": 0.0268,
"step": 3580
},
{
"epoch": 224.375,
"grad_norm": 1.6865476369857788,
"learning_rate": 7.178000000000001e-05,
"loss": 0.0248,
"step": 3590
},
{
"epoch": 225.0,
"grad_norm": 2.019584894180298,
"learning_rate": 7.198e-05,
"loss": 0.0258,
"step": 3600
},
{
"epoch": 225.625,
"grad_norm": 1.8740990161895752,
"learning_rate": 7.218e-05,
"loss": 0.0243,
"step": 3610
},
{
"epoch": 226.25,
"grad_norm": 2.088883399963379,
"learning_rate": 7.238000000000001e-05,
"loss": 0.0253,
"step": 3620
},
{
"epoch": 226.875,
"grad_norm": 2.107874870300293,
"learning_rate": 7.258e-05,
"loss": 0.0265,
"step": 3630
},
{
"epoch": 227.5,
"grad_norm": 1.690873622894287,
"learning_rate": 7.278e-05,
"loss": 0.0262,
"step": 3640
},
{
"epoch": 228.125,
"grad_norm": 2.7033252716064453,
"learning_rate": 7.298000000000001e-05,
"loss": 0.025,
"step": 3650
},
{
"epoch": 228.75,
"grad_norm": 1.91816246509552,
"learning_rate": 7.318e-05,
"loss": 0.0265,
"step": 3660
},
{
"epoch": 229.375,
"grad_norm": 1.9548629522323608,
"learning_rate": 7.338e-05,
"loss": 0.0251,
"step": 3670
},
{
"epoch": 230.0,
"grad_norm": 1.911120891571045,
"learning_rate": 7.358000000000001e-05,
"loss": 0.0245,
"step": 3680
},
{
"epoch": 230.625,
"grad_norm": 1.6720895767211914,
"learning_rate": 7.378e-05,
"loss": 0.0252,
"step": 3690
},
{
"epoch": 231.25,
"grad_norm": 1.9147329330444336,
"learning_rate": 7.398e-05,
"loss": 0.0247,
"step": 3700
},
{
"epoch": 231.875,
"grad_norm": 2.1456077098846436,
"learning_rate": 7.418000000000001e-05,
"loss": 0.0252,
"step": 3710
},
{
"epoch": 232.5,
"grad_norm": 1.9418590068817139,
"learning_rate": 7.438e-05,
"loss": 0.0257,
"step": 3720
},
{
"epoch": 233.125,
"grad_norm": 1.9458227157592773,
"learning_rate": 7.458000000000001e-05,
"loss": 0.0271,
"step": 3730
},
{
"epoch": 233.75,
"grad_norm": 1.9564207792282104,
"learning_rate": 7.478e-05,
"loss": 0.0262,
"step": 3740
},
{
"epoch": 234.375,
"grad_norm": 1.4478167295455933,
"learning_rate": 7.498e-05,
"loss": 0.0254,
"step": 3750
},
{
"epoch": 235.0,
"grad_norm": 2.14218807220459,
"learning_rate": 7.518000000000001e-05,
"loss": 0.0251,
"step": 3760
},
{
"epoch": 235.625,
"grad_norm": 2.029665946960449,
"learning_rate": 7.538e-05,
"loss": 0.0276,
"step": 3770
},
{
"epoch": 236.25,
"grad_norm": 1.8243962526321411,
"learning_rate": 7.558e-05,
"loss": 0.0267,
"step": 3780
},
{
"epoch": 236.875,
"grad_norm": 1.6162742376327515,
"learning_rate": 7.578000000000001e-05,
"loss": 0.0224,
"step": 3790
},
{
"epoch": 237.5,
"grad_norm": 2.0405139923095703,
"learning_rate": 7.598e-05,
"loss": 0.0248,
"step": 3800
},
{
"epoch": 238.125,
"grad_norm": 1.9894390106201172,
"learning_rate": 7.618e-05,
"loss": 0.0239,
"step": 3810
},
{
"epoch": 238.75,
"grad_norm": 1.7805562019348145,
"learning_rate": 7.638000000000001e-05,
"loss": 0.0245,
"step": 3820
},
{
"epoch": 239.375,
"grad_norm": 2.0249173641204834,
"learning_rate": 7.658e-05,
"loss": 0.0221,
"step": 3830
},
{
"epoch": 240.0,
"grad_norm": 1.8023134469985962,
"learning_rate": 7.678000000000001e-05,
"loss": 0.0234,
"step": 3840
},
{
"epoch": 240.625,
"grad_norm": 1.5592528581619263,
"learning_rate": 7.698000000000001e-05,
"loss": 0.0245,
"step": 3850
},
{
"epoch": 241.25,
"grad_norm": 2.1557257175445557,
"learning_rate": 7.718e-05,
"loss": 0.0243,
"step": 3860
},
{
"epoch": 241.875,
"grad_norm": 1.9655349254608154,
"learning_rate": 7.738000000000001e-05,
"loss": 0.0223,
"step": 3870
},
{
"epoch": 242.5,
"grad_norm": 1.616184115409851,
"learning_rate": 7.758000000000001e-05,
"loss": 0.0249,
"step": 3880
},
{
"epoch": 243.125,
"grad_norm": 2.146557331085205,
"learning_rate": 7.778e-05,
"loss": 0.0243,
"step": 3890
},
{
"epoch": 243.75,
"grad_norm": 1.6077772378921509,
"learning_rate": 7.798000000000001e-05,
"loss": 0.0237,
"step": 3900
},
{
"epoch": 244.375,
"grad_norm": 2.073211431503296,
"learning_rate": 7.818000000000001e-05,
"loss": 0.0211,
"step": 3910
},
{
"epoch": 245.0,
"grad_norm": 1.7445831298828125,
"learning_rate": 7.838e-05,
"loss": 0.0225,
"step": 3920
},
{
"epoch": 245.625,
"grad_norm": 1.5558561086654663,
"learning_rate": 7.858000000000001e-05,
"loss": 0.0215,
"step": 3930
},
{
"epoch": 246.25,
"grad_norm": 1.4040555953979492,
"learning_rate": 7.878e-05,
"loss": 0.0219,
"step": 3940
},
{
"epoch": 246.875,
"grad_norm": 1.6972527503967285,
"learning_rate": 7.897999999999999e-05,
"loss": 0.0247,
"step": 3950
},
{
"epoch": 247.5,
"grad_norm": 1.862613320350647,
"learning_rate": 7.918e-05,
"loss": 0.0235,
"step": 3960
},
{
"epoch": 248.125,
"grad_norm": 1.9567930698394775,
"learning_rate": 7.938e-05,
"loss": 0.0243,
"step": 3970
},
{
"epoch": 248.75,
"grad_norm": 1.8193110227584839,
"learning_rate": 7.958e-05,
"loss": 0.0232,
"step": 3980
},
{
"epoch": 249.375,
"grad_norm": 1.8279744386672974,
"learning_rate": 7.978e-05,
"loss": 0.022,
"step": 3990
},
{
"epoch": 250.0,
"grad_norm": 1.9170351028442383,
"learning_rate": 7.998e-05,
"loss": 0.0238,
"step": 4000
},
{
"epoch": 250.625,
"grad_norm": 1.7806050777435303,
"learning_rate": 8.018e-05,
"loss": 0.0224,
"step": 4010
},
{
"epoch": 251.25,
"grad_norm": 1.618657112121582,
"learning_rate": 8.038e-05,
"loss": 0.0228,
"step": 4020
},
{
"epoch": 251.875,
"grad_norm": 1.3448606729507446,
"learning_rate": 8.058e-05,
"loss": 0.0213,
"step": 4030
},
{
"epoch": 252.5,
"grad_norm": 2.1564993858337402,
"learning_rate": 8.078e-05,
"loss": 0.0224,
"step": 4040
},
{
"epoch": 253.125,
"grad_norm": 1.9321818351745605,
"learning_rate": 8.098e-05,
"loss": 0.0258,
"step": 4050
},
{
"epoch": 253.75,
"grad_norm": 1.6877397298812866,
"learning_rate": 8.118e-05,
"loss": 0.0235,
"step": 4060
},
{
"epoch": 254.375,
"grad_norm": 1.899335265159607,
"learning_rate": 8.138e-05,
"loss": 0.0243,
"step": 4070
},
{
"epoch": 255.0,
"grad_norm": 1.6680128574371338,
"learning_rate": 8.158e-05,
"loss": 0.0247,
"step": 4080
},
{
"epoch": 255.625,
"grad_norm": 1.4403914213180542,
"learning_rate": 8.178e-05,
"loss": 0.0224,
"step": 4090
},
{
"epoch": 256.25,
"grad_norm": 1.8112647533416748,
"learning_rate": 8.198e-05,
"loss": 0.022,
"step": 4100
},
{
"epoch": 256.875,
"grad_norm": 1.6451849937438965,
"learning_rate": 8.218e-05,
"loss": 0.0216,
"step": 4110
},
{
"epoch": 257.5,
"grad_norm": 1.4328521490097046,
"learning_rate": 8.238000000000001e-05,
"loss": 0.0217,
"step": 4120
},
{
"epoch": 258.125,
"grad_norm": 1.8865714073181152,
"learning_rate": 8.258e-05,
"loss": 0.0218,
"step": 4130
},
{
"epoch": 258.75,
"grad_norm": 1.6151604652404785,
"learning_rate": 8.278e-05,
"loss": 0.0207,
"step": 4140
},
{
"epoch": 259.375,
"grad_norm": 1.576856017112732,
"learning_rate": 8.298000000000001e-05,
"loss": 0.0227,
"step": 4150
},
{
"epoch": 260.0,
"grad_norm": 1.9383561611175537,
"learning_rate": 8.318e-05,
"loss": 0.0211,
"step": 4160
},
{
"epoch": 260.625,
"grad_norm": 1.417213797569275,
"learning_rate": 8.338e-05,
"loss": 0.0218,
"step": 4170
},
{
"epoch": 261.25,
"grad_norm": 1.4880584478378296,
"learning_rate": 8.358e-05,
"loss": 0.0215,
"step": 4180
},
{
"epoch": 261.875,
"grad_norm": 1.7698973417282104,
"learning_rate": 8.378e-05,
"loss": 0.0209,
"step": 4190
},
{
"epoch": 262.5,
"grad_norm": 1.4688743352890015,
"learning_rate": 8.398e-05,
"loss": 0.022,
"step": 4200
},
{
"epoch": 263.125,
"grad_norm": 1.563480019569397,
"learning_rate": 8.418e-05,
"loss": 0.021,
"step": 4210
},
{
"epoch": 263.75,
"grad_norm": 1.6026536226272583,
"learning_rate": 8.438e-05,
"loss": 0.0196,
"step": 4220
},
{
"epoch": 264.375,
"grad_norm": 1.390167236328125,
"learning_rate": 8.458e-05,
"loss": 0.0203,
"step": 4230
},
{
"epoch": 265.0,
"grad_norm": 1.3945834636688232,
"learning_rate": 8.478e-05,
"loss": 0.0187,
"step": 4240
},
{
"epoch": 265.625,
"grad_norm": 1.6028813123703003,
"learning_rate": 8.498e-05,
"loss": 0.0211,
"step": 4250
},
{
"epoch": 266.25,
"grad_norm": 1.5985839366912842,
"learning_rate": 8.518000000000001e-05,
"loss": 0.021,
"step": 4260
},
{
"epoch": 266.875,
"grad_norm": 1.3894219398498535,
"learning_rate": 8.538e-05,
"loss": 0.0203,
"step": 4270
},
{
"epoch": 267.5,
"grad_norm": 1.9198909997940063,
"learning_rate": 8.558e-05,
"loss": 0.0217,
"step": 4280
},
{
"epoch": 268.125,
"grad_norm": 1.6992826461791992,
"learning_rate": 8.578000000000001e-05,
"loss": 0.0218,
"step": 4290
},
{
"epoch": 268.75,
"grad_norm": 1.5295377969741821,
"learning_rate": 8.598e-05,
"loss": 0.0209,
"step": 4300
},
{
"epoch": 269.375,
"grad_norm": 1.9647233486175537,
"learning_rate": 8.618e-05,
"loss": 0.0208,
"step": 4310
},
{
"epoch": 270.0,
"grad_norm": 1.6796159744262695,
"learning_rate": 8.638000000000001e-05,
"loss": 0.0207,
"step": 4320
},
{
"epoch": 270.625,
"grad_norm": 1.7937408685684204,
"learning_rate": 8.658e-05,
"loss": 0.0212,
"step": 4330
},
{
"epoch": 271.25,
"grad_norm": 1.944583535194397,
"learning_rate": 8.678e-05,
"loss": 0.0214,
"step": 4340
},
{
"epoch": 271.875,
"grad_norm": 1.501273512840271,
"learning_rate": 8.698000000000001e-05,
"loss": 0.0227,
"step": 4350
},
{
"epoch": 272.5,
"grad_norm": 1.630289077758789,
"learning_rate": 8.718e-05,
"loss": 0.0187,
"step": 4360
},
{
"epoch": 273.125,
"grad_norm": 1.558972716331482,
"learning_rate": 8.738000000000001e-05,
"loss": 0.0201,
"step": 4370
},
{
"epoch": 273.75,
"grad_norm": 1.319100260734558,
"learning_rate": 8.758000000000001e-05,
"loss": 0.0199,
"step": 4380
},
{
"epoch": 274.375,
"grad_norm": 1.687119722366333,
"learning_rate": 8.778e-05,
"loss": 0.0222,
"step": 4390
},
{
"epoch": 275.0,
"grad_norm": 1.5666712522506714,
"learning_rate": 8.798000000000001e-05,
"loss": 0.0218,
"step": 4400
},
{
"epoch": 275.625,
"grad_norm": 1.374186396598816,
"learning_rate": 8.818000000000001e-05,
"loss": 0.0201,
"step": 4410
},
{
"epoch": 276.25,
"grad_norm": 1.4911551475524902,
"learning_rate": 8.838e-05,
"loss": 0.0197,
"step": 4420
},
{
"epoch": 276.875,
"grad_norm": 1.391093134880066,
"learning_rate": 8.858000000000001e-05,
"loss": 0.019,
"step": 4430
},
{
"epoch": 277.5,
"grad_norm": 1.5734379291534424,
"learning_rate": 8.878000000000001e-05,
"loss": 0.0182,
"step": 4440
},
{
"epoch": 278.125,
"grad_norm": 1.5925443172454834,
"learning_rate": 8.898e-05,
"loss": 0.0196,
"step": 4450
},
{
"epoch": 278.75,
"grad_norm": 1.6269075870513916,
"learning_rate": 8.918000000000001e-05,
"loss": 0.0211,
"step": 4460
},
{
"epoch": 279.375,
"grad_norm": 1.5029900074005127,
"learning_rate": 8.938e-05,
"loss": 0.0208,
"step": 4470
},
{
"epoch": 280.0,
"grad_norm": 1.56442391872406,
"learning_rate": 8.958e-05,
"loss": 0.0196,
"step": 4480
},
{
"epoch": 280.625,
"grad_norm": 1.3483182191848755,
"learning_rate": 8.978000000000001e-05,
"loss": 0.0185,
"step": 4490
},
{
"epoch": 281.25,
"grad_norm": 1.2249255180358887,
"learning_rate": 8.998e-05,
"loss": 0.0182,
"step": 4500
},
{
"epoch": 281.875,
"grad_norm": 1.206023097038269,
"learning_rate": 9.018000000000001e-05,
"loss": 0.0191,
"step": 4510
},
{
"epoch": 282.5,
"grad_norm": 1.9158329963684082,
"learning_rate": 9.038000000000001e-05,
"loss": 0.0202,
"step": 4520
},
{
"epoch": 283.125,
"grad_norm": 1.6515963077545166,
"learning_rate": 9.058e-05,
"loss": 0.0199,
"step": 4530
},
{
"epoch": 283.75,
"grad_norm": 1.7891855239868164,
"learning_rate": 9.078000000000001e-05,
"loss": 0.0213,
"step": 4540
},
{
"epoch": 284.375,
"grad_norm": 1.5916194915771484,
"learning_rate": 9.098000000000001e-05,
"loss": 0.0204,
"step": 4550
},
{
"epoch": 285.0,
"grad_norm": 1.6548500061035156,
"learning_rate": 9.118e-05,
"loss": 0.0206,
"step": 4560
},
{
"epoch": 285.625,
"grad_norm": 1.7890138626098633,
"learning_rate": 9.138e-05,
"loss": 0.0216,
"step": 4570
},
{
"epoch": 286.25,
"grad_norm": 1.3698619604110718,
"learning_rate": 9.158e-05,
"loss": 0.021,
"step": 4580
},
{
"epoch": 286.875,
"grad_norm": 1.3164348602294922,
"learning_rate": 9.178e-05,
"loss": 0.0201,
"step": 4590
},
{
"epoch": 287.5,
"grad_norm": 1.2602595090866089,
"learning_rate": 9.198e-05,
"loss": 0.0206,
"step": 4600
},
{
"epoch": 288.125,
"grad_norm": 1.6356364488601685,
"learning_rate": 9.218e-05,
"loss": 0.0206,
"step": 4610
},
{
"epoch": 288.75,
"grad_norm": 1.339037299156189,
"learning_rate": 9.238e-05,
"loss": 0.0194,
"step": 4620
},
{
"epoch": 289.375,
"grad_norm": 1.5343581438064575,
"learning_rate": 9.258e-05,
"loss": 0.0214,
"step": 4630
},
{
"epoch": 290.0,
"grad_norm": 1.7950295209884644,
"learning_rate": 9.278e-05,
"loss": 0.02,
"step": 4640
},
{
"epoch": 290.625,
"grad_norm": 1.346240758895874,
"learning_rate": 9.298e-05,
"loss": 0.0202,
"step": 4650
},
{
"epoch": 291.25,
"grad_norm": 1.1901124715805054,
"learning_rate": 9.318e-05,
"loss": 0.0176,
"step": 4660
},
{
"epoch": 291.875,
"grad_norm": 1.3559141159057617,
"learning_rate": 9.338e-05,
"loss": 0.0165,
"step": 4670
},
{
"epoch": 292.5,
"grad_norm": 1.342185139656067,
"learning_rate": 9.358e-05,
"loss": 0.0191,
"step": 4680
},
{
"epoch": 293.125,
"grad_norm": 1.5401999950408936,
"learning_rate": 9.378e-05,
"loss": 0.0193,
"step": 4690
},
{
"epoch": 293.75,
"grad_norm": 1.4412999153137207,
"learning_rate": 9.398e-05,
"loss": 0.0191,
"step": 4700
},
{
"epoch": 294.375,
"grad_norm": 1.2340666055679321,
"learning_rate": 9.418e-05,
"loss": 0.0182,
"step": 4710
},
{
"epoch": 295.0,
"grad_norm": 1.1816933155059814,
"learning_rate": 9.438e-05,
"loss": 0.0175,
"step": 4720
},
{
"epoch": 295.625,
"grad_norm": 1.2440204620361328,
"learning_rate": 9.458e-05,
"loss": 0.0178,
"step": 4730
},
{
"epoch": 296.25,
"grad_norm": 1.4980961084365845,
"learning_rate": 9.478e-05,
"loss": 0.0173,
"step": 4740
},
{
"epoch": 296.875,
"grad_norm": 1.4015268087387085,
"learning_rate": 9.498e-05,
"loss": 0.0198,
"step": 4750
},
{
"epoch": 297.5,
"grad_norm": 1.420882225036621,
"learning_rate": 9.518000000000001e-05,
"loss": 0.019,
"step": 4760
},
{
"epoch": 298.125,
"grad_norm": 1.2662218809127808,
"learning_rate": 9.538e-05,
"loss": 0.0195,
"step": 4770
},
{
"epoch": 298.75,
"grad_norm": 1.528330683708191,
"learning_rate": 9.558e-05,
"loss": 0.0213,
"step": 4780
},
{
"epoch": 299.375,
"grad_norm": 1.3324357271194458,
"learning_rate": 9.578000000000001e-05,
"loss": 0.0194,
"step": 4790
},
{
"epoch": 300.0,
"grad_norm": 1.3170146942138672,
"learning_rate": 9.598e-05,
"loss": 0.0186,
"step": 4800
},
{
"epoch": 300.625,
"grad_norm": 1.4495036602020264,
"learning_rate": 9.618e-05,
"loss": 0.0178,
"step": 4810
},
{
"epoch": 301.25,
"grad_norm": 1.6242793798446655,
"learning_rate": 9.638000000000001e-05,
"loss": 0.0204,
"step": 4820
},
{
"epoch": 301.875,
"grad_norm": 1.4832464456558228,
"learning_rate": 9.658e-05,
"loss": 0.0203,
"step": 4830
},
{
"epoch": 302.5,
"grad_norm": 1.3549563884735107,
"learning_rate": 9.678e-05,
"loss": 0.0177,
"step": 4840
},
{
"epoch": 303.125,
"grad_norm": 1.804412841796875,
"learning_rate": 9.698000000000001e-05,
"loss": 0.021,
"step": 4850
},
{
"epoch": 303.75,
"grad_norm": 1.5907257795333862,
"learning_rate": 9.718e-05,
"loss": 0.0209,
"step": 4860
},
{
"epoch": 304.375,
"grad_norm": 1.4540935754776,
"learning_rate": 9.738e-05,
"loss": 0.017,
"step": 4870
},
{
"epoch": 305.0,
"grad_norm": 1.223158597946167,
"learning_rate": 9.758000000000001e-05,
"loss": 0.018,
"step": 4880
},
{
"epoch": 305.625,
"grad_norm": 1.2038943767547607,
"learning_rate": 9.778e-05,
"loss": 0.0176,
"step": 4890
},
{
"epoch": 306.25,
"grad_norm": 1.110867977142334,
"learning_rate": 9.798000000000001e-05,
"loss": 0.0173,
"step": 4900
},
{
"epoch": 306.875,
"grad_norm": 1.414939522743225,
"learning_rate": 9.818000000000001e-05,
"loss": 0.017,
"step": 4910
},
{
"epoch": 307.5,
"grad_norm": 1.3866313695907593,
"learning_rate": 9.838e-05,
"loss": 0.02,
"step": 4920
},
{
"epoch": 308.125,
"grad_norm": 1.5799922943115234,
"learning_rate": 9.858000000000001e-05,
"loss": 0.0162,
"step": 4930
},
{
"epoch": 308.75,
"grad_norm": 1.261763334274292,
"learning_rate": 9.878e-05,
"loss": 0.0189,
"step": 4940
},
{
"epoch": 309.375,
"grad_norm": 1.474787950515747,
"learning_rate": 9.898e-05,
"loss": 0.0181,
"step": 4950
},
{
"epoch": 310.0,
"grad_norm": 1.287822961807251,
"learning_rate": 9.918000000000001e-05,
"loss": 0.0184,
"step": 4960
},
{
"epoch": 310.625,
"grad_norm": 1.0713199377059937,
"learning_rate": 9.938e-05,
"loss": 0.0179,
"step": 4970
},
{
"epoch": 311.25,
"grad_norm": 1.2200391292572021,
"learning_rate": 9.958e-05,
"loss": 0.018,
"step": 4980
},
{
"epoch": 311.875,
"grad_norm": 1.5587009191513062,
"learning_rate": 9.978000000000001e-05,
"loss": 0.0199,
"step": 4990
},
{
"epoch": 312.5,
"grad_norm": 1.4640460014343262,
"learning_rate": 9.998e-05,
"loss": 0.017,
"step": 5000
},
{
"epoch": 313.125,
"grad_norm": 1.4215519428253174,
"learning_rate": 9.999999778549045e-05,
"loss": 0.0171,
"step": 5010
},
{
"epoch": 313.75,
"grad_norm": 1.1879425048828125,
"learning_rate": 9.999999013039593e-05,
"loss": 0.016,
"step": 5020
},
{
"epoch": 314.375,
"grad_norm": 1.231829047203064,
"learning_rate": 9.999997700737766e-05,
"loss": 0.0158,
"step": 5030
},
{
"epoch": 315.0,
"grad_norm": 1.224221110343933,
"learning_rate": 9.999995841643709e-05,
"loss": 0.0164,
"step": 5040
},
{
"epoch": 315.625,
"grad_norm": 1.491013765335083,
"learning_rate": 9.999993435757623e-05,
"loss": 0.0166,
"step": 5050
},
{
"epoch": 316.25,
"grad_norm": 1.2551881074905396,
"learning_rate": 9.999990483079773e-05,
"loss": 0.0187,
"step": 5060
},
{
"epoch": 316.875,
"grad_norm": 1.3919192552566528,
"learning_rate": 9.999986983610481e-05,
"loss": 0.0167,
"step": 5070
},
{
"epoch": 317.5,
"grad_norm": 1.145408272743225,
"learning_rate": 9.99998293735013e-05,
"loss": 0.0169,
"step": 5080
},
{
"epoch": 318.125,
"grad_norm": 1.5774271488189697,
"learning_rate": 9.999978344299161e-05,
"loss": 0.0171,
"step": 5090
},
{
"epoch": 318.75,
"grad_norm": 1.4125555753707886,
"learning_rate": 9.99997320445808e-05,
"loss": 0.0191,
"step": 5100
},
{
"epoch": 319.375,
"grad_norm": 1.110128402709961,
"learning_rate": 9.999967517827444e-05,
"loss": 0.0159,
"step": 5110
},
{
"epoch": 320.0,
"grad_norm": 1.3442533016204834,
"learning_rate": 9.999961284407879e-05,
"loss": 0.0177,
"step": 5120
},
{
"epoch": 320.625,
"grad_norm": 1.3384839296340942,
"learning_rate": 9.999954504200067e-05,
"loss": 0.0154,
"step": 5130
},
{
"epoch": 321.25,
"grad_norm": 1.1482480764389038,
"learning_rate": 9.999947177204744e-05,
"loss": 0.0166,
"step": 5140
},
{
"epoch": 321.875,
"grad_norm": 1.2519944906234741,
"learning_rate": 9.999939303422718e-05,
"loss": 0.0172,
"step": 5150
},
{
"epoch": 322.5,
"grad_norm": 1.3870333433151245,
"learning_rate": 9.999930882854847e-05,
"loss": 0.0168,
"step": 5160
},
{
"epoch": 323.125,
"grad_norm": 1.366909146308899,
"learning_rate": 9.999921915502051e-05,
"loss": 0.016,
"step": 5170
},
{
"epoch": 323.75,
"grad_norm": 1.1931958198547363,
"learning_rate": 9.99991240136531e-05,
"loss": 0.0186,
"step": 5180
},
{
"epoch": 324.375,
"grad_norm": 1.1246201992034912,
"learning_rate": 9.999902340445668e-05,
"loss": 0.0151,
"step": 5190
},
{
"epoch": 325.0,
"grad_norm": 1.2969485521316528,
"learning_rate": 9.999891732744224e-05,
"loss": 0.0154,
"step": 5200
},
{
"epoch": 325.625,
"grad_norm": 1.1869677305221558,
"learning_rate": 9.999880578262135e-05,
"loss": 0.0167,
"step": 5210
},
{
"epoch": 326.25,
"grad_norm": 1.221058964729309,
"learning_rate": 9.999868877000624e-05,
"loss": 0.0164,
"step": 5220
},
{
"epoch": 326.875,
"grad_norm": 1.2182931900024414,
"learning_rate": 9.99985662896097e-05,
"loss": 0.0175,
"step": 5230
},
{
"epoch": 327.5,
"grad_norm": 1.2568279504776,
"learning_rate": 9.999843834144513e-05,
"loss": 0.0159,
"step": 5240
},
{
"epoch": 328.125,
"grad_norm": 1.254540205001831,
"learning_rate": 9.99983049255265e-05,
"loss": 0.0161,
"step": 5250
},
{
"epoch": 328.75,
"grad_norm": 1.2322643995285034,
"learning_rate": 9.999816604186843e-05,
"loss": 0.0168,
"step": 5260
},
{
"epoch": 329.375,
"grad_norm": 0.9582310914993286,
"learning_rate": 9.999802169048609e-05,
"loss": 0.0149,
"step": 5270
},
{
"epoch": 330.0,
"grad_norm": 0.900672197341919,
"learning_rate": 9.999787187139527e-05,
"loss": 0.0141,
"step": 5280
},
{
"epoch": 330.625,
"grad_norm": 1.049651026725769,
"learning_rate": 9.999771658461234e-05,
"loss": 0.0153,
"step": 5290
},
{
"epoch": 331.25,
"grad_norm": 1.0110572576522827,
"learning_rate": 9.999755583015431e-05,
"loss": 0.0145,
"step": 5300
},
{
"epoch": 331.875,
"grad_norm": 1.1884170770645142,
"learning_rate": 9.999738960803874e-05,
"loss": 0.0152,
"step": 5310
},
{
"epoch": 332.5,
"grad_norm": 1.4686788320541382,
"learning_rate": 9.99972179182838e-05,
"loss": 0.0136,
"step": 5320
},
{
"epoch": 333.125,
"grad_norm": 1.0699830055236816,
"learning_rate": 9.99970407609083e-05,
"loss": 0.0161,
"step": 5330
},
{
"epoch": 333.75,
"grad_norm": 1.8003672361373901,
"learning_rate": 9.999685813593159e-05,
"loss": 0.0177,
"step": 5340
},
{
"epoch": 334.375,
"grad_norm": 1.38191556930542,
"learning_rate": 9.999667004337362e-05,
"loss": 0.0161,
"step": 5350
},
{
"epoch": 335.0,
"grad_norm": 1.199036717414856,
"learning_rate": 9.9996476483255e-05,
"loss": 0.0164,
"step": 5360
},
{
"epoch": 335.625,
"grad_norm": 1.1064685583114624,
"learning_rate": 9.999627745559688e-05,
"loss": 0.0153,
"step": 5370
},
{
"epoch": 336.25,
"grad_norm": 0.968438982963562,
"learning_rate": 9.999607296042101e-05,
"loss": 0.015,
"step": 5380
},
{
"epoch": 336.875,
"grad_norm": 1.3204340934753418,
"learning_rate": 9.99958629977498e-05,
"loss": 0.0144,
"step": 5390
},
{
"epoch": 337.5,
"grad_norm": 1.0026376247406006,
"learning_rate": 9.999564756760615e-05,
"loss": 0.0144,
"step": 5400
},
{
"epoch": 338.125,
"grad_norm": 1.094014048576355,
"learning_rate": 9.999542667001366e-05,
"loss": 0.0143,
"step": 5410
},
{
"epoch": 338.75,
"grad_norm": 1.0915470123291016,
"learning_rate": 9.999520030499647e-05,
"loss": 0.0138,
"step": 5420
},
{
"epoch": 339.375,
"grad_norm": 1.0048651695251465,
"learning_rate": 9.999496847257936e-05,
"loss": 0.0146,
"step": 5430
},
{
"epoch": 340.0,
"grad_norm": 1.138767123222351,
"learning_rate": 9.999473117278764e-05,
"loss": 0.0162,
"step": 5440
},
{
"epoch": 340.625,
"grad_norm": 1.3121551275253296,
"learning_rate": 9.999448840564731e-05,
"loss": 0.0144,
"step": 5450
},
{
"epoch": 341.25,
"grad_norm": 1.2357908487319946,
"learning_rate": 9.999424017118488e-05,
"loss": 0.0155,
"step": 5460
},
{
"epoch": 341.875,
"grad_norm": 1.4110485315322876,
"learning_rate": 9.999398646942751e-05,
"loss": 0.0171,
"step": 5470
},
{
"epoch": 342.5,
"grad_norm": 1.231876015663147,
"learning_rate": 9.999372730040296e-05,
"loss": 0.0148,
"step": 5480
},
{
"epoch": 343.125,
"grad_norm": 1.1513409614562988,
"learning_rate": 9.999346266413953e-05,
"loss": 0.0155,
"step": 5490
},
{
"epoch": 343.75,
"grad_norm": 1.0324758291244507,
"learning_rate": 9.99931925606662e-05,
"loss": 0.0155,
"step": 5500
},
{
"epoch": 344.375,
"grad_norm": 1.2001458406448364,
"learning_rate": 9.99929169900125e-05,
"loss": 0.0142,
"step": 5510
},
{
"epoch": 345.0,
"grad_norm": 0.9090719819068909,
"learning_rate": 9.999263595220855e-05,
"loss": 0.0133,
"step": 5520
},
{
"epoch": 345.625,
"grad_norm": 0.9517356157302856,
"learning_rate": 9.99923494472851e-05,
"loss": 0.0131,
"step": 5530
},
{
"epoch": 346.25,
"grad_norm": 0.9557884931564331,
"learning_rate": 9.999205747527348e-05,
"loss": 0.0153,
"step": 5540
},
{
"epoch": 346.875,
"grad_norm": 1.039165735244751,
"learning_rate": 9.999176003620561e-05,
"loss": 0.0141,
"step": 5550
},
{
"epoch": 347.5,
"grad_norm": 0.930853545665741,
"learning_rate": 9.999145713011405e-05,
"loss": 0.0143,
"step": 5560
},
{
"epoch": 348.125,
"grad_norm": 0.956095278263092,
"learning_rate": 9.999114875703186e-05,
"loss": 0.0141,
"step": 5570
},
{
"epoch": 348.75,
"grad_norm": 0.771486222743988,
"learning_rate": 9.999083491699281e-05,
"loss": 0.0143,
"step": 5580
},
{
"epoch": 349.375,
"grad_norm": 0.6893032193183899,
"learning_rate": 9.999051561003123e-05,
"loss": 0.0144,
"step": 5590
},
{
"epoch": 350.0,
"grad_norm": 1.0121644735336304,
"learning_rate": 9.999019083618202e-05,
"loss": 0.0151,
"step": 5600
},
{
"epoch": 350.625,
"grad_norm": 1.1058743000030518,
"learning_rate": 9.99898605954807e-05,
"loss": 0.0162,
"step": 5610
},
{
"epoch": 351.25,
"grad_norm": 1.0109678506851196,
"learning_rate": 9.998952488796338e-05,
"loss": 0.015,
"step": 5620
},
{
"epoch": 351.875,
"grad_norm": 0.8328022360801697,
"learning_rate": 9.998918371366676e-05,
"loss": 0.0142,
"step": 5630
},
{
"epoch": 352.5,
"grad_norm": 0.836746096611023,
"learning_rate": 9.99888370726282e-05,
"loss": 0.0137,
"step": 5640
},
{
"epoch": 353.125,
"grad_norm": 0.9082058072090149,
"learning_rate": 9.998848496488556e-05,
"loss": 0.0141,
"step": 5650
},
{
"epoch": 353.75,
"grad_norm": 0.9380905628204346,
"learning_rate": 9.998812739047736e-05,
"loss": 0.0149,
"step": 5660
},
{
"epoch": 354.375,
"grad_norm": 0.9345435500144958,
"learning_rate": 9.99877643494427e-05,
"loss": 0.0144,
"step": 5670
},
{
"epoch": 355.0,
"grad_norm": 0.8377882242202759,
"learning_rate": 9.998739584182128e-05,
"loss": 0.0151,
"step": 5680
},
{
"epoch": 355.625,
"grad_norm": 1.1241296529769897,
"learning_rate": 9.998702186765342e-05,
"loss": 0.0145,
"step": 5690
},
{
"epoch": 356.25,
"grad_norm": 1.0022445917129517,
"learning_rate": 9.998664242698e-05,
"loss": 0.0137,
"step": 5700
},
{
"epoch": 356.875,
"grad_norm": 1.14398992061615,
"learning_rate": 9.998625751984251e-05,
"loss": 0.0122,
"step": 5710
},
{
"epoch": 357.5,
"grad_norm": 1.511240839958191,
"learning_rate": 9.998586714628307e-05,
"loss": 0.0141,
"step": 5720
},
{
"epoch": 358.125,
"grad_norm": 1.257946252822876,
"learning_rate": 9.998547130634432e-05,
"loss": 0.0157,
"step": 5730
},
{
"epoch": 358.75,
"grad_norm": 1.1702454090118408,
"learning_rate": 9.99850700000696e-05,
"loss": 0.0144,
"step": 5740
},
{
"epoch": 359.375,
"grad_norm": 0.8067399859428406,
"learning_rate": 9.998466322750278e-05,
"loss": 0.0136,
"step": 5750
},
{
"epoch": 360.0,
"grad_norm": 0.8550326228141785,
"learning_rate": 9.998425098868834e-05,
"loss": 0.0129,
"step": 5760
},
{
"epoch": 360.625,
"grad_norm": 0.9919332265853882,
"learning_rate": 9.998383328367136e-05,
"loss": 0.013,
"step": 5770
},
{
"epoch": 361.25,
"grad_norm": 0.9598110914230347,
"learning_rate": 9.99834101124975e-05,
"loss": 0.0136,
"step": 5780
},
{
"epoch": 361.875,
"grad_norm": 0.8677031397819519,
"learning_rate": 9.998298147521309e-05,
"loss": 0.0137,
"step": 5790
},
{
"epoch": 362.5,
"grad_norm": 0.9038897156715393,
"learning_rate": 9.998254737186496e-05,
"loss": 0.0124,
"step": 5800
},
{
"epoch": 363.125,
"grad_norm": 0.9390170574188232,
"learning_rate": 9.99821078025006e-05,
"loss": 0.0119,
"step": 5810
},
{
"epoch": 363.75,
"grad_norm": 1.011299967765808,
"learning_rate": 9.998166276716807e-05,
"loss": 0.0131,
"step": 5820
},
{
"epoch": 364.375,
"grad_norm": 0.7727632522583008,
"learning_rate": 9.998121226591606e-05,
"loss": 0.0124,
"step": 5830
},
{
"epoch": 365.0,
"grad_norm": 0.9111457467079163,
"learning_rate": 9.998075629879382e-05,
"loss": 0.0122,
"step": 5840
},
{
"epoch": 365.625,
"grad_norm": 0.8254387378692627,
"learning_rate": 9.99802948658512e-05,
"loss": 0.0122,
"step": 5850
},
{
"epoch": 366.25,
"grad_norm": 0.8419124484062195,
"learning_rate": 9.99798279671387e-05,
"loss": 0.0136,
"step": 5860
},
{
"epoch": 366.875,
"grad_norm": 0.9950329661369324,
"learning_rate": 9.997935560270734e-05,
"loss": 0.0139,
"step": 5870
},
{
"epoch": 367.5,
"grad_norm": 0.8446523547172546,
"learning_rate": 9.997887777260879e-05,
"loss": 0.0128,
"step": 5880
},
{
"epoch": 368.125,
"grad_norm": 0.8795507550239563,
"learning_rate": 9.997839447689532e-05,
"loss": 0.0142,
"step": 5890
},
{
"epoch": 368.75,
"grad_norm": 0.9794557094573975,
"learning_rate": 9.997790571561978e-05,
"loss": 0.0134,
"step": 5900
},
{
"epoch": 369.375,
"grad_norm": 0.9027246236801147,
"learning_rate": 9.99774114888356e-05,
"loss": 0.0126,
"step": 5910
},
{
"epoch": 370.0,
"grad_norm": 0.8756938576698303,
"learning_rate": 9.997691179659684e-05,
"loss": 0.014,
"step": 5920
},
{
"epoch": 370.625,
"grad_norm": 1.2023380994796753,
"learning_rate": 9.997640663895815e-05,
"loss": 0.0131,
"step": 5930
},
{
"epoch": 371.25,
"grad_norm": 1.141804814338684,
"learning_rate": 9.997589601597477e-05,
"loss": 0.015,
"step": 5940
},
{
"epoch": 371.875,
"grad_norm": 0.9179847836494446,
"learning_rate": 9.997537992770252e-05,
"loss": 0.0126,
"step": 5950
},
{
"epoch": 372.5,
"grad_norm": 0.8151926398277283,
"learning_rate": 9.997485837419788e-05,
"loss": 0.013,
"step": 5960
},
{
"epoch": 373.125,
"grad_norm": 0.6601715683937073,
"learning_rate": 9.997433135551786e-05,
"loss": 0.0123,
"step": 5970
},
{
"epoch": 373.75,
"grad_norm": 0.8281500935554504,
"learning_rate": 9.997379887172009e-05,
"loss": 0.0115,
"step": 5980
},
{
"epoch": 374.375,
"grad_norm": 0.8727806806564331,
"learning_rate": 9.997326092286281e-05,
"loss": 0.0128,
"step": 5990
},
{
"epoch": 375.0,
"grad_norm": 0.8489688038825989,
"learning_rate": 9.997271750900486e-05,
"loss": 0.0129,
"step": 6000
},
{
"epoch": 375.625,
"grad_norm": 0.6510198712348938,
"learning_rate": 9.997216863020565e-05,
"loss": 0.0117,
"step": 6010
},
{
"epoch": 376.25,
"grad_norm": 0.8793591856956482,
"learning_rate": 9.99716142865252e-05,
"loss": 0.012,
"step": 6020
},
{
"epoch": 376.875,
"grad_norm": 0.7070950269699097,
"learning_rate": 9.997105447802415e-05,
"loss": 0.0118,
"step": 6030
},
{
"epoch": 377.5,
"grad_norm": 0.8314371109008789,
"learning_rate": 9.997048920476373e-05,
"loss": 0.0118,
"step": 6040
},
{
"epoch": 378.125,
"grad_norm": 0.761350154876709,
"learning_rate": 9.996991846680572e-05,
"loss": 0.0127,
"step": 6050
},
{
"epoch": 378.75,
"grad_norm": 0.7484061121940613,
"learning_rate": 9.996934226421257e-05,
"loss": 0.0119,
"step": 6060
},
{
"epoch": 379.375,
"grad_norm": 0.7929844260215759,
"learning_rate": 9.996876059704726e-05,
"loss": 0.012,
"step": 6070
},
{
"epoch": 380.0,
"grad_norm": 0.8181713819503784,
"learning_rate": 9.996817346537343e-05,
"loss": 0.0142,
"step": 6080
},
{
"epoch": 380.625,
"grad_norm": 0.9369438290596008,
"learning_rate": 9.996758086925526e-05,
"loss": 0.0132,
"step": 6090
},
{
"epoch": 381.25,
"grad_norm": 0.8046433925628662,
"learning_rate": 9.996698280875759e-05,
"loss": 0.012,
"step": 6100
},
{
"epoch": 381.875,
"grad_norm": 0.7803655862808228,
"learning_rate": 9.99663792839458e-05,
"loss": 0.0134,
"step": 6110
},
{
"epoch": 382.5,
"grad_norm": 0.7660366296768188,
"learning_rate": 9.99657702948859e-05,
"loss": 0.0124,
"step": 6120
},
{
"epoch": 383.125,
"grad_norm": 0.6417670845985413,
"learning_rate": 9.996515584164448e-05,
"loss": 0.012,
"step": 6130
},
{
"epoch": 383.75,
"grad_norm": 0.8960108160972595,
"learning_rate": 9.996453592428873e-05,
"loss": 0.0117,
"step": 6140
},
{
"epoch": 384.375,
"grad_norm": 0.8871966600418091,
"learning_rate": 9.996391054288646e-05,
"loss": 0.0116,
"step": 6150
},
{
"epoch": 385.0,
"grad_norm": 0.8760678172111511,
"learning_rate": 9.996327969750605e-05,
"loss": 0.0117,
"step": 6160
},
{
"epoch": 385.625,
"grad_norm": 0.865280032157898,
"learning_rate": 9.996264338821649e-05,
"loss": 0.011,
"step": 6170
},
{
"epoch": 386.25,
"grad_norm": 1.1085981130599976,
"learning_rate": 9.996200161508735e-05,
"loss": 0.0128,
"step": 6180
},
{
"epoch": 386.875,
"grad_norm": 1.0455905199050903,
"learning_rate": 9.996135437818885e-05,
"loss": 0.0121,
"step": 6190
},
{
"epoch": 387.5,
"grad_norm": 0.8136721253395081,
"learning_rate": 9.996070167759175e-05,
"loss": 0.013,
"step": 6200
},
{
"epoch": 388.125,
"grad_norm": 0.7488872408866882,
"learning_rate": 9.996004351336743e-05,
"loss": 0.0126,
"step": 6210
},
{
"epoch": 388.75,
"grad_norm": 0.8310092091560364,
"learning_rate": 9.995937988558785e-05,
"loss": 0.0136,
"step": 6220
},
{
"epoch": 389.375,
"grad_norm": 0.8811050653457642,
"learning_rate": 9.995871079432561e-05,
"loss": 0.0132,
"step": 6230
},
{
"epoch": 390.0,
"grad_norm": 0.9369884133338928,
"learning_rate": 9.995803623965389e-05,
"loss": 0.0133,
"step": 6240
},
{
"epoch": 390.625,
"grad_norm": 0.9472755193710327,
"learning_rate": 9.995735622164641e-05,
"loss": 0.0132,
"step": 6250
},
{
"epoch": 391.25,
"grad_norm": 1.1913206577301025,
"learning_rate": 9.995667074037758e-05,
"loss": 0.0134,
"step": 6260
},
{
"epoch": 391.875,
"grad_norm": 0.8896439075469971,
"learning_rate": 9.995597979592232e-05,
"loss": 0.0134,
"step": 6270
},
{
"epoch": 392.5,
"grad_norm": 0.8965170383453369,
"learning_rate": 9.995528338835625e-05,
"loss": 0.0124,
"step": 6280
},
{
"epoch": 393.125,
"grad_norm": 0.8789317011833191,
"learning_rate": 9.995458151775547e-05,
"loss": 0.0126,
"step": 6290
},
{
"epoch": 393.75,
"grad_norm": 0.7865223288536072,
"learning_rate": 9.995387418419677e-05,
"loss": 0.0119,
"step": 6300
},
{
"epoch": 394.375,
"grad_norm": 0.7527452111244202,
"learning_rate": 9.99531613877575e-05,
"loss": 0.0118,
"step": 6310
},
{
"epoch": 395.0,
"grad_norm": 0.7900567650794983,
"learning_rate": 9.995244312851559e-05,
"loss": 0.0116,
"step": 6320
},
{
"epoch": 395.625,
"grad_norm": 0.7366781234741211,
"learning_rate": 9.995171940654961e-05,
"loss": 0.0112,
"step": 6330
},
{
"epoch": 396.25,
"grad_norm": 0.8073196411132812,
"learning_rate": 9.995099022193871e-05,
"loss": 0.0116,
"step": 6340
},
{
"epoch": 396.875,
"grad_norm": 0.924555242061615,
"learning_rate": 9.995025557476261e-05,
"loss": 0.0109,
"step": 6350
},
{
"epoch": 397.5,
"grad_norm": 0.8284614682197571,
"learning_rate": 9.994951546510165e-05,
"loss": 0.0117,
"step": 6360
},
{
"epoch": 398.125,
"grad_norm": 0.8100062012672424,
"learning_rate": 9.994876989303679e-05,
"loss": 0.0127,
"step": 6370
},
{
"epoch": 398.75,
"grad_norm": 0.9377039670944214,
"learning_rate": 9.994801885864955e-05,
"loss": 0.0122,
"step": 6380
},
{
"epoch": 399.375,
"grad_norm": 0.9842908978462219,
"learning_rate": 9.994726236202205e-05,
"loss": 0.013,
"step": 6390
},
{
"epoch": 400.0,
"grad_norm": 1.1019262075424194,
"learning_rate": 9.994650040323704e-05,
"loss": 0.0134,
"step": 6400
},
{
"epoch": 400.625,
"grad_norm": 1.0751221179962158,
"learning_rate": 9.994573298237784e-05,
"loss": 0.0118,
"step": 6410
},
{
"epoch": 401.25,
"grad_norm": 0.898923933506012,
"learning_rate": 9.994496009952837e-05,
"loss": 0.012,
"step": 6420
},
{
"epoch": 401.875,
"grad_norm": 0.8281941413879395,
"learning_rate": 9.994418175477316e-05,
"loss": 0.0124,
"step": 6430
},
{
"epoch": 402.5,
"grad_norm": 0.692079484462738,
"learning_rate": 9.994339794819733e-05,
"loss": 0.011,
"step": 6440
},
{
"epoch": 403.125,
"grad_norm": 0.7526706457138062,
"learning_rate": 9.994260867988658e-05,
"loss": 0.0121,
"step": 6450
},
{
"epoch": 403.75,
"grad_norm": 0.8704769015312195,
"learning_rate": 9.994181394992723e-05,
"loss": 0.0109,
"step": 6460
},
{
"epoch": 404.375,
"grad_norm": 0.8282954096794128,
"learning_rate": 9.994101375840618e-05,
"loss": 0.0107,
"step": 6470
},
{
"epoch": 405.0,
"grad_norm": 0.7742241621017456,
"learning_rate": 9.994020810541098e-05,
"loss": 0.0115,
"step": 6480
},
{
"epoch": 405.625,
"grad_norm": 0.7262750267982483,
"learning_rate": 9.99393969910297e-05,
"loss": 0.011,
"step": 6490
},
{
"epoch": 406.25,
"grad_norm": 0.8099271655082703,
"learning_rate": 9.993858041535104e-05,
"loss": 0.0126,
"step": 6500
},
{
"epoch": 406.875,
"grad_norm": 0.8308644890785217,
"learning_rate": 9.99377583784643e-05,
"loss": 0.0119,
"step": 6510
},
{
"epoch": 407.5,
"grad_norm": 0.900124728679657,
"learning_rate": 9.993693088045939e-05,
"loss": 0.0112,
"step": 6520
},
{
"epoch": 408.125,
"grad_norm": 0.8921932578086853,
"learning_rate": 9.99360979214268e-05,
"loss": 0.0112,
"step": 6530
},
{
"epoch": 408.75,
"grad_norm": 0.9405972361564636,
"learning_rate": 9.99352595014576e-05,
"loss": 0.0107,
"step": 6540
},
{
"epoch": 409.375,
"grad_norm": 0.8436768651008606,
"learning_rate": 9.993441562064354e-05,
"loss": 0.0113,
"step": 6550
},
{
"epoch": 410.0,
"grad_norm": 0.804934024810791,
"learning_rate": 9.993356627907685e-05,
"loss": 0.0117,
"step": 6560
},
{
"epoch": 410.625,
"grad_norm": 0.945950984954834,
"learning_rate": 9.99327114768504e-05,
"loss": 0.0125,
"step": 6570
},
{
"epoch": 411.25,
"grad_norm": 0.925611674785614,
"learning_rate": 9.99318512140577e-05,
"loss": 0.0121,
"step": 6580
},
{
"epoch": 411.875,
"grad_norm": 0.9319164156913757,
"learning_rate": 9.993098549079284e-05,
"loss": 0.012,
"step": 6590
},
{
"epoch": 412.5,
"grad_norm": 1.0740889310836792,
"learning_rate": 9.993011430715047e-05,
"loss": 0.0137,
"step": 6600
},
{
"epoch": 413.125,
"grad_norm": 1.1442779302597046,
"learning_rate": 9.992923766322586e-05,
"loss": 0.0125,
"step": 6610
},
{
"epoch": 413.75,
"grad_norm": 0.8353562355041504,
"learning_rate": 9.99283555591149e-05,
"loss": 0.0119,
"step": 6620
},
{
"epoch": 414.375,
"grad_norm": 0.720020592212677,
"learning_rate": 9.992746799491404e-05,
"loss": 0.012,
"step": 6630
},
{
"epoch": 415.0,
"grad_norm": 0.7117792367935181,
"learning_rate": 9.992657497072033e-05,
"loss": 0.0118,
"step": 6640
},
{
"epoch": 415.625,
"grad_norm": 0.8013281226158142,
"learning_rate": 9.992567648663147e-05,
"loss": 0.0134,
"step": 6650
},
{
"epoch": 416.25,
"grad_norm": 0.8130918145179749,
"learning_rate": 9.992477254274568e-05,
"loss": 0.0118,
"step": 6660
},
{
"epoch": 416.875,
"grad_norm": 0.7213727235794067,
"learning_rate": 9.992386313916183e-05,
"loss": 0.0111,
"step": 6670
},
{
"epoch": 417.5,
"grad_norm": 0.6564821004867554,
"learning_rate": 9.992294827597934e-05,
"loss": 0.0113,
"step": 6680
},
{
"epoch": 418.125,
"grad_norm": 0.6905478239059448,
"learning_rate": 9.992202795329831e-05,
"loss": 0.012,
"step": 6690
},
{
"epoch": 418.75,
"grad_norm": 0.821371853351593,
"learning_rate": 9.992110217121936e-05,
"loss": 0.0128,
"step": 6700
},
{
"epoch": 419.375,
"grad_norm": 0.8834856152534485,
"learning_rate": 9.992017092984372e-05,
"loss": 0.0106,
"step": 6710
},
{
"epoch": 420.0,
"grad_norm": 0.8281375765800476,
"learning_rate": 9.991923422927326e-05,
"loss": 0.0115,
"step": 6720
},
{
"epoch": 420.625,
"grad_norm": 0.7310401797294617,
"learning_rate": 9.991829206961037e-05,
"loss": 0.0101,
"step": 6730
},
{
"epoch": 421.25,
"grad_norm": 0.7845788598060608,
"learning_rate": 9.991734445095813e-05,
"loss": 0.0105,
"step": 6740
},
{
"epoch": 421.875,
"grad_norm": 0.8412182331085205,
"learning_rate": 9.991639137342015e-05,
"loss": 0.0111,
"step": 6750
},
{
"epoch": 422.5,
"grad_norm": 0.7537260055541992,
"learning_rate": 9.991543283710064e-05,
"loss": 0.0113,
"step": 6760
},
{
"epoch": 423.125,
"grad_norm": 0.6647925972938538,
"learning_rate": 9.991446884210445e-05,
"loss": 0.0119,
"step": 6770
},
{
"epoch": 423.75,
"grad_norm": 0.7035212516784668,
"learning_rate": 9.9913499388537e-05,
"loss": 0.0097,
"step": 6780
},
{
"epoch": 424.375,
"grad_norm": 0.7553647756576538,
"learning_rate": 9.99125244765043e-05,
"loss": 0.01,
"step": 6790
},
{
"epoch": 425.0,
"grad_norm": 0.7420441508293152,
"learning_rate": 9.991154410611296e-05,
"loss": 0.0114,
"step": 6800
},
{
"epoch": 425.625,
"grad_norm": 0.6657722592353821,
"learning_rate": 9.99105582774702e-05,
"loss": 0.0109,
"step": 6810
},
{
"epoch": 426.25,
"grad_norm": 0.7254708409309387,
"learning_rate": 9.990956699068384e-05,
"loss": 0.0116,
"step": 6820
},
{
"epoch": 426.875,
"grad_norm": 0.8595172166824341,
"learning_rate": 9.990857024586224e-05,
"loss": 0.0113,
"step": 6830
},
{
"epoch": 427.5,
"grad_norm": 0.9384058117866516,
"learning_rate": 9.990756804311446e-05,
"loss": 0.0112,
"step": 6840
},
{
"epoch": 428.125,
"grad_norm": 0.8805230855941772,
"learning_rate": 9.990656038255006e-05,
"loss": 0.0097,
"step": 6850
},
{
"epoch": 428.75,
"grad_norm": 0.8175788521766663,
"learning_rate": 9.990554726427926e-05,
"loss": 0.0111,
"step": 6860
},
{
"epoch": 429.375,
"grad_norm": 0.8853816390037537,
"learning_rate": 9.990452868841284e-05,
"loss": 0.0119,
"step": 6870
},
{
"epoch": 430.0,
"grad_norm": 0.8857107758522034,
"learning_rate": 9.99035046550622e-05,
"loss": 0.0111,
"step": 6880
},
{
"epoch": 430.625,
"grad_norm": 0.7299500107765198,
"learning_rate": 9.99024751643393e-05,
"loss": 0.0113,
"step": 6890
},
{
"epoch": 431.25,
"grad_norm": 0.6400433778762817,
"learning_rate": 9.990144021635677e-05,
"loss": 0.0106,
"step": 6900
},
{
"epoch": 431.875,
"grad_norm": 0.6998341083526611,
"learning_rate": 9.990039981122775e-05,
"loss": 0.0117,
"step": 6910
},
{
"epoch": 432.5,
"grad_norm": 0.6614553928375244,
"learning_rate": 9.989935394906602e-05,
"loss": 0.0108,
"step": 6920
},
{
"epoch": 433.125,
"grad_norm": 0.8393372893333435,
"learning_rate": 9.989830262998598e-05,
"loss": 0.013,
"step": 6930
},
{
"epoch": 433.75,
"grad_norm": 0.7657507061958313,
"learning_rate": 9.989724585410259e-05,
"loss": 0.0115,
"step": 6940
},
{
"epoch": 434.375,
"grad_norm": 0.6534095406532288,
"learning_rate": 9.989618362153139e-05,
"loss": 0.0116,
"step": 6950
},
{
"epoch": 435.0,
"grad_norm": 0.5554938316345215,
"learning_rate": 9.989511593238859e-05,
"loss": 0.0101,
"step": 6960
},
{
"epoch": 435.625,
"grad_norm": 0.633482813835144,
"learning_rate": 9.98940427867909e-05,
"loss": 0.0105,
"step": 6970
},
{
"epoch": 436.25,
"grad_norm": 0.5705388784408569,
"learning_rate": 9.989296418485573e-05,
"loss": 0.0127,
"step": 6980
},
{
"epoch": 436.875,
"grad_norm": 0.560118556022644,
"learning_rate": 9.989188012670101e-05,
"loss": 0.0102,
"step": 6990
},
{
"epoch": 437.5,
"grad_norm": 0.5680054426193237,
"learning_rate": 9.989079061244528e-05,
"loss": 0.0108,
"step": 7000
},
{
"epoch": 438.125,
"grad_norm": 0.6862987875938416,
"learning_rate": 9.988969564220769e-05,
"loss": 0.011,
"step": 7010
},
{
"epoch": 438.75,
"grad_norm": 0.6537038683891296,
"learning_rate": 9.988859521610801e-05,
"loss": 0.011,
"step": 7020
},
{
"epoch": 439.375,
"grad_norm": 0.7102747559547424,
"learning_rate": 9.988748933426656e-05,
"loss": 0.0114,
"step": 7030
},
{
"epoch": 440.0,
"grad_norm": 0.7743424773216248,
"learning_rate": 9.988637799680428e-05,
"loss": 0.0114,
"step": 7040
},
{
"epoch": 440.625,
"grad_norm": 0.7385320663452148,
"learning_rate": 9.98852612038427e-05,
"loss": 0.0102,
"step": 7050
},
{
"epoch": 441.25,
"grad_norm": 0.7324809432029724,
"learning_rate": 9.988413895550397e-05,
"loss": 0.0095,
"step": 7060
},
{
"epoch": 441.875,
"grad_norm": 0.6916730999946594,
"learning_rate": 9.98830112519108e-05,
"loss": 0.012,
"step": 7070
},
{
"epoch": 442.5,
"grad_norm": 0.5611207485198975,
"learning_rate": 9.98818780931865e-05,
"loss": 0.0099,
"step": 7080
},
{
"epoch": 443.125,
"grad_norm": 0.6533907055854797,
"learning_rate": 9.988073947945502e-05,
"loss": 0.0097,
"step": 7090
},
{
"epoch": 443.75,
"grad_norm": 0.8114432096481323,
"learning_rate": 9.987959541084087e-05,
"loss": 0.0096,
"step": 7100
},
{
"epoch": 444.375,
"grad_norm": 0.5615887641906738,
"learning_rate": 9.987844588746915e-05,
"loss": 0.0085,
"step": 7110
},
{
"epoch": 445.0,
"grad_norm": 0.6930294632911682,
"learning_rate": 9.987729090946558e-05,
"loss": 0.0096,
"step": 7120
},
{
"epoch": 445.625,
"grad_norm": 0.7661396265029907,
"learning_rate": 9.987613047695647e-05,
"loss": 0.0099,
"step": 7130
},
{
"epoch": 446.25,
"grad_norm": 0.7148370146751404,
"learning_rate": 9.987496459006871e-05,
"loss": 0.0092,
"step": 7140
},
{
"epoch": 446.875,
"grad_norm": 0.9166419506072998,
"learning_rate": 9.987379324892982e-05,
"loss": 0.0113,
"step": 7150
},
{
"epoch": 447.5,
"grad_norm": 0.8479866981506348,
"learning_rate": 9.987261645366788e-05,
"loss": 0.0101,
"step": 7160
},
{
"epoch": 448.125,
"grad_norm": 0.6642943620681763,
"learning_rate": 9.987143420441158e-05,
"loss": 0.01,
"step": 7170
},
{
"epoch": 448.75,
"grad_norm": 0.6536929607391357,
"learning_rate": 9.987024650129022e-05,
"loss": 0.0102,
"step": 7180
},
{
"epoch": 449.375,
"grad_norm": 0.8466352820396423,
"learning_rate": 9.986905334443368e-05,
"loss": 0.0117,
"step": 7190
},
{
"epoch": 450.0,
"grad_norm": 0.7626696228981018,
"learning_rate": 9.986785473397245e-05,
"loss": 0.0103,
"step": 7200
},
{
"epoch": 450.625,
"grad_norm": 0.7776815891265869,
"learning_rate": 9.98666506700376e-05,
"loss": 0.0115,
"step": 7210
},
{
"epoch": 451.25,
"grad_norm": 1.0069994926452637,
"learning_rate": 9.986544115276081e-05,
"loss": 0.0128,
"step": 7220
},
{
"epoch": 451.875,
"grad_norm": 0.8917898535728455,
"learning_rate": 9.986422618227433e-05,
"loss": 0.0109,
"step": 7230
},
{
"epoch": 452.5,
"grad_norm": 0.7967373728752136,
"learning_rate": 9.986300575871106e-05,
"loss": 0.0116,
"step": 7240
},
{
"epoch": 453.125,
"grad_norm": 0.6768915057182312,
"learning_rate": 9.986177988220444e-05,
"loss": 0.0099,
"step": 7250
},
{
"epoch": 453.75,
"grad_norm": 0.7261281609535217,
"learning_rate": 9.986054855288856e-05,
"loss": 0.0103,
"step": 7260
},
{
"epoch": 454.375,
"grad_norm": 0.7023577094078064,
"learning_rate": 9.985931177089802e-05,
"loss": 0.0112,
"step": 7270
},
{
"epoch": 455.0,
"grad_norm": 0.5902547836303711,
"learning_rate": 9.985806953636814e-05,
"loss": 0.0098,
"step": 7280
},
{
"epoch": 455.625,
"grad_norm": 0.6153225302696228,
"learning_rate": 9.985682184943471e-05,
"loss": 0.0111,
"step": 7290
},
{
"epoch": 456.25,
"grad_norm": 0.6180372834205627,
"learning_rate": 9.98555687102342e-05,
"loss": 0.0096,
"step": 7300
},
{
"epoch": 456.875,
"grad_norm": 0.7004512548446655,
"learning_rate": 9.985431011890367e-05,
"loss": 0.0107,
"step": 7310
},
{
"epoch": 457.5,
"grad_norm": 0.8018707036972046,
"learning_rate": 9.985304607558075e-05,
"loss": 0.0104,
"step": 7320
},
{
"epoch": 458.125,
"grad_norm": 0.6335276365280151,
"learning_rate": 9.985177658040364e-05,
"loss": 0.0102,
"step": 7330
},
{
"epoch": 458.75,
"grad_norm": 0.8146379590034485,
"learning_rate": 9.985050163351119e-05,
"loss": 0.0106,
"step": 7340
},
{
"epoch": 459.375,
"grad_norm": 0.7131094336509705,
"learning_rate": 9.984922123504286e-05,
"loss": 0.0093,
"step": 7350
},
{
"epoch": 460.0,
"grad_norm": 0.647261381149292,
"learning_rate": 9.984793538513862e-05,
"loss": 0.0103,
"step": 7360
},
{
"epoch": 460.625,
"grad_norm": 0.6319265961647034,
"learning_rate": 9.984664408393912e-05,
"loss": 0.01,
"step": 7370
},
{
"epoch": 461.25,
"grad_norm": 0.5086030960083008,
"learning_rate": 9.984534733158556e-05,
"loss": 0.0105,
"step": 7380
},
{
"epoch": 461.875,
"grad_norm": 0.6072356104850769,
"learning_rate": 9.984404512821977e-05,
"loss": 0.0089,
"step": 7390
},
{
"epoch": 462.5,
"grad_norm": 0.6429985165596008,
"learning_rate": 9.984273747398411e-05,
"loss": 0.0102,
"step": 7400
},
{
"epoch": 463.125,
"grad_norm": 0.5790389776229858,
"learning_rate": 9.984142436902165e-05,
"loss": 0.0104,
"step": 7410
},
{
"epoch": 463.75,
"grad_norm": 0.701302170753479,
"learning_rate": 9.984010581347596e-05,
"loss": 0.0089,
"step": 7420
},
{
"epoch": 464.375,
"grad_norm": 0.6150535941123962,
"learning_rate": 9.983878180749121e-05,
"loss": 0.0098,
"step": 7430
},
{
"epoch": 465.0,
"grad_norm": 0.6264737248420715,
"learning_rate": 9.983745235121222e-05,
"loss": 0.0093,
"step": 7440
},
{
"epoch": 465.625,
"grad_norm": 0.5422685146331787,
"learning_rate": 9.983611744478438e-05,
"loss": 0.0104,
"step": 7450
},
{
"epoch": 466.25,
"grad_norm": 0.6225709915161133,
"learning_rate": 9.983477708835365e-05,
"loss": 0.0101,
"step": 7460
},
{
"epoch": 466.875,
"grad_norm": 0.5819153785705566,
"learning_rate": 9.983343128206664e-05,
"loss": 0.0106,
"step": 7470
},
{
"epoch": 467.5,
"grad_norm": 0.7224307060241699,
"learning_rate": 9.983208002607049e-05,
"loss": 0.0107,
"step": 7480
},
{
"epoch": 468.125,
"grad_norm": 0.7039912939071655,
"learning_rate": 9.9830723320513e-05,
"loss": 0.0103,
"step": 7490
},
{
"epoch": 468.75,
"grad_norm": 0.6855049133300781,
"learning_rate": 9.982936116554254e-05,
"loss": 0.0088,
"step": 7500
},
{
"epoch": 469.375,
"grad_norm": 0.6290692687034607,
"learning_rate": 9.982799356130803e-05,
"loss": 0.0106,
"step": 7510
},
{
"epoch": 470.0,
"grad_norm": 0.5659773945808411,
"learning_rate": 9.982662050795908e-05,
"loss": 0.0106,
"step": 7520
},
{
"epoch": 470.625,
"grad_norm": 0.5781753063201904,
"learning_rate": 9.982524200564583e-05,
"loss": 0.0104,
"step": 7530
},
{
"epoch": 471.25,
"grad_norm": 0.6644128561019897,
"learning_rate": 9.982385805451901e-05,
"loss": 0.0103,
"step": 7540
},
{
"epoch": 471.875,
"grad_norm": 0.7858973145484924,
"learning_rate": 9.982246865472998e-05,
"loss": 0.0093,
"step": 7550
},
{
"epoch": 472.5,
"grad_norm": 0.7751241326332092,
"learning_rate": 9.982107380643069e-05,
"loss": 0.0101,
"step": 7560
},
{
"epoch": 473.125,
"grad_norm": 0.8384363055229187,
"learning_rate": 9.981967350977368e-05,
"loss": 0.0107,
"step": 7570
},
{
"epoch": 473.75,
"grad_norm": 0.8584528565406799,
"learning_rate": 9.981826776491208e-05,
"loss": 0.0095,
"step": 7580
},
{
"epoch": 474.375,
"grad_norm": 0.995509922504425,
"learning_rate": 9.98168565719996e-05,
"loss": 0.0115,
"step": 7590
},
{
"epoch": 475.0,
"grad_norm": 0.8218001127243042,
"learning_rate": 9.98154399311906e-05,
"loss": 0.011,
"step": 7600
},
{
"epoch": 475.625,
"grad_norm": 0.7269605994224548,
"learning_rate": 9.981401784263997e-05,
"loss": 0.0103,
"step": 7610
},
{
"epoch": 476.25,
"grad_norm": 0.6630864143371582,
"learning_rate": 9.981259030650326e-05,
"loss": 0.0092,
"step": 7620
},
{
"epoch": 476.875,
"grad_norm": 0.7081972360610962,
"learning_rate": 9.981115732293655e-05,
"loss": 0.0084,
"step": 7630
},
{
"epoch": 477.5,
"grad_norm": 0.6908837556838989,
"learning_rate": 9.980971889209659e-05,
"loss": 0.0096,
"step": 7640
},
{
"epoch": 478.125,
"grad_norm": 0.6863625645637512,
"learning_rate": 9.980827501414064e-05,
"loss": 0.0094,
"step": 7650
},
{
"epoch": 478.75,
"grad_norm": 0.628754734992981,
"learning_rate": 9.980682568922663e-05,
"loss": 0.0087,
"step": 7660
},
{
"epoch": 479.375,
"grad_norm": 0.6461851000785828,
"learning_rate": 9.980537091751304e-05,
"loss": 0.0091,
"step": 7670
},
{
"epoch": 480.0,
"grad_norm": 0.6353027820587158,
"learning_rate": 9.980391069915897e-05,
"loss": 0.009,
"step": 7680
},
{
"epoch": 480.625,
"grad_norm": 0.5868967175483704,
"learning_rate": 9.98024450343241e-05,
"loss": 0.0101,
"step": 7690
},
{
"epoch": 481.25,
"grad_norm": 0.6688029766082764,
"learning_rate": 9.980097392316872e-05,
"loss": 0.0083,
"step": 7700
},
{
"epoch": 481.875,
"grad_norm": 0.5620129108428955,
"learning_rate": 9.97994973658537e-05,
"loss": 0.0088,
"step": 7710
},
{
"epoch": 482.5,
"grad_norm": 0.6990760564804077,
"learning_rate": 9.979801536254054e-05,
"loss": 0.008,
"step": 7720
},
{
"epoch": 483.125,
"grad_norm": 0.5271959900856018,
"learning_rate": 9.979652791339127e-05,
"loss": 0.01,
"step": 7730
},
{
"epoch": 483.75,
"grad_norm": 0.717219352722168,
"learning_rate": 9.97950350185686e-05,
"loss": 0.0104,
"step": 7740
},
{
"epoch": 484.375,
"grad_norm": 0.5886634588241577,
"learning_rate": 9.979353667823574e-05,
"loss": 0.0086,
"step": 7750
},
{
"epoch": 485.0,
"grad_norm": 0.7227773070335388,
"learning_rate": 9.979203289255658e-05,
"loss": 0.0094,
"step": 7760
},
{
"epoch": 485.625,
"grad_norm": 0.6355369687080383,
"learning_rate": 9.979052366169557e-05,
"loss": 0.0098,
"step": 7770
},
{
"epoch": 486.25,
"grad_norm": 0.6813123226165771,
"learning_rate": 9.978900898581775e-05,
"loss": 0.01,
"step": 7780
},
{
"epoch": 486.875,
"grad_norm": 0.659970223903656,
"learning_rate": 9.978748886508875e-05,
"loss": 0.0088,
"step": 7790
},
{
"epoch": 487.5,
"grad_norm": 0.7737880349159241,
"learning_rate": 9.978596329967484e-05,
"loss": 0.0106,
"step": 7800
},
{
"epoch": 488.125,
"grad_norm": 0.7581619024276733,
"learning_rate": 9.978443228974284e-05,
"loss": 0.0087,
"step": 7810
},
{
"epoch": 488.75,
"grad_norm": 0.7430512309074402,
"learning_rate": 9.978289583546015e-05,
"loss": 0.0093,
"step": 7820
},
{
"epoch": 489.375,
"grad_norm": 0.6579586863517761,
"learning_rate": 9.978135393699484e-05,
"loss": 0.0092,
"step": 7830
},
{
"epoch": 490.0,
"grad_norm": 0.6156346797943115,
"learning_rate": 9.977980659451548e-05,
"loss": 0.0099,
"step": 7840
},
{
"epoch": 490.625,
"grad_norm": 0.6920315623283386,
"learning_rate": 9.977825380819135e-05,
"loss": 0.0101,
"step": 7850
},
{
"epoch": 491.25,
"grad_norm": 0.7143272161483765,
"learning_rate": 9.97766955781922e-05,
"loss": 0.0102,
"step": 7860
},
{
"epoch": 491.875,
"grad_norm": 0.6715136170387268,
"learning_rate": 9.977513190468848e-05,
"loss": 0.0092,
"step": 7870
},
{
"epoch": 492.5,
"grad_norm": 0.792335569858551,
"learning_rate": 9.977356278785116e-05,
"loss": 0.0094,
"step": 7880
},
{
"epoch": 493.125,
"grad_norm": 0.8089608550071716,
"learning_rate": 9.977198822785184e-05,
"loss": 0.0099,
"step": 7890
},
{
"epoch": 493.75,
"grad_norm": 0.727393627166748,
"learning_rate": 9.977040822486273e-05,
"loss": 0.0093,
"step": 7900
},
{
"epoch": 494.375,
"grad_norm": 0.7314863204956055,
"learning_rate": 9.97688227790566e-05,
"loss": 0.01,
"step": 7910
},
{
"epoch": 495.0,
"grad_norm": 0.6197735667228699,
"learning_rate": 9.976723189060684e-05,
"loss": 0.0093,
"step": 7920
},
{
"epoch": 495.625,
"grad_norm": 0.6258811950683594,
"learning_rate": 9.976563555968742e-05,
"loss": 0.0089,
"step": 7930
},
{
"epoch": 496.25,
"grad_norm": 0.6613799929618835,
"learning_rate": 9.976403378647292e-05,
"loss": 0.0099,
"step": 7940
},
{
"epoch": 496.875,
"grad_norm": 0.5219643115997314,
"learning_rate": 9.97624265711385e-05,
"loss": 0.0102,
"step": 7950
},
{
"epoch": 497.5,
"grad_norm": 0.5938867330551147,
"learning_rate": 9.976081391385993e-05,
"loss": 0.0101,
"step": 7960
},
{
"epoch": 498.125,
"grad_norm": 0.5493279099464417,
"learning_rate": 9.975919581481356e-05,
"loss": 0.01,
"step": 7970
},
{
"epoch": 498.75,
"grad_norm": 0.5064048767089844,
"learning_rate": 9.975757227417634e-05,
"loss": 0.0092,
"step": 7980
},
{
"epoch": 499.375,
"grad_norm": 0.5940008163452148,
"learning_rate": 9.975594329212586e-05,
"loss": 0.0097,
"step": 7990
},
{
"epoch": 500.0,
"grad_norm": 0.5561034083366394,
"learning_rate": 9.97543088688402e-05,
"loss": 0.0092,
"step": 8000
},
{
"epoch": 500.625,
"grad_norm": 0.587040901184082,
"learning_rate": 9.975266900449814e-05,
"loss": 0.0105,
"step": 8010
},
{
"epoch": 501.25,
"grad_norm": 0.6578340530395508,
"learning_rate": 9.975102369927898e-05,
"loss": 0.0088,
"step": 8020
},
{
"epoch": 501.875,
"grad_norm": 0.6301031708717346,
"learning_rate": 9.974937295336269e-05,
"loss": 0.0096,
"step": 8030
},
{
"epoch": 502.5,
"grad_norm": 0.49646562337875366,
"learning_rate": 9.974771676692975e-05,
"loss": 0.0094,
"step": 8040
},
{
"epoch": 503.125,
"grad_norm": 0.5952965021133423,
"learning_rate": 9.974605514016131e-05,
"loss": 0.0088,
"step": 8050
},
{
"epoch": 503.75,
"grad_norm": 0.6772691607475281,
"learning_rate": 9.974438807323907e-05,
"loss": 0.0093,
"step": 8060
},
{
"epoch": 504.375,
"grad_norm": 0.5597459673881531,
"learning_rate": 9.974271556634535e-05,
"loss": 0.0088,
"step": 8070
},
{
"epoch": 505.0,
"grad_norm": 0.8469547033309937,
"learning_rate": 9.974103761966302e-05,
"loss": 0.0106,
"step": 8080
},
{
"epoch": 505.625,
"grad_norm": 0.775303065776825,
"learning_rate": 9.973935423337563e-05,
"loss": 0.0097,
"step": 8090
},
{
"epoch": 506.25,
"grad_norm": 0.7015887498855591,
"learning_rate": 9.973766540766722e-05,
"loss": 0.0095,
"step": 8100
},
{
"epoch": 506.875,
"grad_norm": 0.6640006303787231,
"learning_rate": 9.97359711427225e-05,
"loss": 0.0111,
"step": 8110
},
{
"epoch": 507.5,
"grad_norm": 0.6578481793403625,
"learning_rate": 9.973427143872677e-05,
"loss": 0.0088,
"step": 8120
},
{
"epoch": 508.125,
"grad_norm": 0.6807109713554382,
"learning_rate": 9.973256629586589e-05,
"loss": 0.0102,
"step": 8130
},
{
"epoch": 508.75,
"grad_norm": 0.5422506332397461,
"learning_rate": 9.973085571432632e-05,
"loss": 0.0101,
"step": 8140
},
{
"epoch": 509.375,
"grad_norm": 0.5136811137199402,
"learning_rate": 9.972913969429513e-05,
"loss": 0.0097,
"step": 8150
},
{
"epoch": 510.0,
"grad_norm": 0.693134069442749,
"learning_rate": 9.972741823596e-05,
"loss": 0.0094,
"step": 8160
},
{
"epoch": 510.625,
"grad_norm": 0.611960232257843,
"learning_rate": 9.972569133950917e-05,
"loss": 0.0089,
"step": 8170
},
{
"epoch": 511.25,
"grad_norm": 0.617396354675293,
"learning_rate": 9.972395900513151e-05,
"loss": 0.0088,
"step": 8180
},
{
"epoch": 511.875,
"grad_norm": 0.6016327738761902,
"learning_rate": 9.972222123301645e-05,
"loss": 0.0095,
"step": 8190
},
{
"epoch": 512.5,
"grad_norm": 0.5470365881919861,
"learning_rate": 9.972047802335403e-05,
"loss": 0.0096,
"step": 8200
},
{
"epoch": 513.125,
"grad_norm": 0.6275759935379028,
"learning_rate": 9.971872937633488e-05,
"loss": 0.0085,
"step": 8210
},
{
"epoch": 513.75,
"grad_norm": 0.5876614451408386,
"learning_rate": 9.971697529215024e-05,
"loss": 0.0093,
"step": 8220
},
{
"epoch": 514.375,
"grad_norm": 0.57300865650177,
"learning_rate": 9.971521577099192e-05,
"loss": 0.0091,
"step": 8230
},
{
"epoch": 515.0,
"grad_norm": 0.6590330600738525,
"learning_rate": 9.971345081305236e-05,
"loss": 0.0094,
"step": 8240
},
{
"epoch": 515.625,
"grad_norm": 0.7168742418289185,
"learning_rate": 9.971168041852456e-05,
"loss": 0.0091,
"step": 8250
},
{
"epoch": 516.25,
"grad_norm": 0.7002500295639038,
"learning_rate": 9.970990458760215e-05,
"loss": 0.0082,
"step": 8260
},
{
"epoch": 516.875,
"grad_norm": 0.5979912877082825,
"learning_rate": 9.970812332047929e-05,
"loss": 0.0083,
"step": 8270
},
{
"epoch": 517.5,
"grad_norm": 0.6995880603790283,
"learning_rate": 9.97063366173508e-05,
"loss": 0.0083,
"step": 8280
},
{
"epoch": 518.125,
"grad_norm": 0.6054606437683105,
"learning_rate": 9.970454447841207e-05,
"loss": 0.0086,
"step": 8290
},
{
"epoch": 518.75,
"grad_norm": 0.6761727333068848,
"learning_rate": 9.970274690385909e-05,
"loss": 0.0091,
"step": 8300
},
{
"epoch": 519.375,
"grad_norm": 0.7297013401985168,
"learning_rate": 9.970094389388844e-05,
"loss": 0.0101,
"step": 8310
},
{
"epoch": 520.0,
"grad_norm": 0.6933302879333496,
"learning_rate": 9.969913544869728e-05,
"loss": 0.009,
"step": 8320
},
{
"epoch": 520.625,
"grad_norm": 0.632068932056427,
"learning_rate": 9.96973215684834e-05,
"loss": 0.0092,
"step": 8330
},
{
"epoch": 521.25,
"grad_norm": 0.5213248133659363,
"learning_rate": 9.969550225344513e-05,
"loss": 0.0095,
"step": 8340
},
{
"epoch": 521.875,
"grad_norm": 0.5387685298919678,
"learning_rate": 9.969367750378147e-05,
"loss": 0.0072,
"step": 8350
},
{
"epoch": 522.5,
"grad_norm": 0.5790697336196899,
"learning_rate": 9.969184731969194e-05,
"loss": 0.0098,
"step": 8360
},
{
"epoch": 523.125,
"grad_norm": 0.6181520819664001,
"learning_rate": 9.96900117013767e-05,
"loss": 0.0094,
"step": 8370
},
{
"epoch": 523.75,
"grad_norm": 0.6647499799728394,
"learning_rate": 9.96881706490365e-05,
"loss": 0.0092,
"step": 8380
},
{
"epoch": 524.375,
"grad_norm": 0.5274850726127625,
"learning_rate": 9.968632416287265e-05,
"loss": 0.0092,
"step": 8390
},
{
"epoch": 525.0,
"grad_norm": 0.5954369902610779,
"learning_rate": 9.96844722430871e-05,
"loss": 0.0083,
"step": 8400
},
{
"epoch": 525.625,
"grad_norm": 0.5637514591217041,
"learning_rate": 9.968261488988235e-05,
"loss": 0.0096,
"step": 8410
},
{
"epoch": 526.25,
"grad_norm": 0.5467987656593323,
"learning_rate": 9.968075210346155e-05,
"loss": 0.0087,
"step": 8420
},
{
"epoch": 526.875,
"grad_norm": 0.6766216158866882,
"learning_rate": 9.967888388402839e-05,
"loss": 0.0098,
"step": 8430
},
{
"epoch": 527.5,
"grad_norm": 0.689804196357727,
"learning_rate": 9.967701023178717e-05,
"loss": 0.0094,
"step": 8440
},
{
"epoch": 528.125,
"grad_norm": 0.6711739301681519,
"learning_rate": 9.967513114694282e-05,
"loss": 0.0098,
"step": 8450
},
{
"epoch": 528.75,
"grad_norm": 0.7609061002731323,
"learning_rate": 9.967324662970079e-05,
"loss": 0.0091,
"step": 8460
},
{
"epoch": 529.375,
"grad_norm": 0.6599430441856384,
"learning_rate": 9.96713566802672e-05,
"loss": 0.0096,
"step": 8470
},
{
"epoch": 530.0,
"grad_norm": 0.6817207932472229,
"learning_rate": 9.966946129884873e-05,
"loss": 0.0093,
"step": 8480
},
{
"epoch": 530.625,
"grad_norm": 0.8081104755401611,
"learning_rate": 9.966756048565265e-05,
"loss": 0.01,
"step": 8490
},
{
"epoch": 531.25,
"grad_norm": 0.6982617378234863,
"learning_rate": 9.966565424088681e-05,
"loss": 0.0088,
"step": 8500
}
],
"logging_steps": 10,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 6250,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}