{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 250.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.25,
"grad_norm": 5.177168846130371,
"learning_rate": 1.8e-07,
"loss": 0.6721,
"step": 10
},
{
"epoch": 0.5,
"grad_norm": 5.208155155181885,
"learning_rate": 3.8e-07,
"loss": 0.6798,
"step": 20
},
{
"epoch": 0.75,
"grad_norm": 4.579753875732422,
"learning_rate": 5.8e-07,
"loss": 0.6639,
"step": 30
},
{
"epoch": 1.0,
"grad_norm": 4.642096042633057,
"learning_rate": 7.8e-07,
"loss": 0.6353,
"step": 40
},
{
"epoch": 1.25,
"grad_norm": 3.5934982299804688,
"learning_rate": 9.8e-07,
"loss": 0.5288,
"step": 50
},
{
"epoch": 1.5,
"grad_norm": 2.046332597732544,
"learning_rate": 1.18e-06,
"loss": 0.4528,
"step": 60
},
{
"epoch": 1.75,
"grad_norm": 1.644978404045105,
"learning_rate": 1.3800000000000001e-06,
"loss": 0.3442,
"step": 70
},
{
"epoch": 2.0,
"grad_norm": 0.8051798343658447,
"learning_rate": 1.5800000000000003e-06,
"loss": 0.2862,
"step": 80
},
{
"epoch": 2.25,
"grad_norm": 0.4936715066432953,
"learning_rate": 1.7800000000000001e-06,
"loss": 0.2241,
"step": 90
},
{
"epoch": 2.5,
"grad_norm": 0.37280142307281494,
"learning_rate": 1.98e-06,
"loss": 0.1893,
"step": 100
},
{
"epoch": 2.75,
"grad_norm": 0.3533601760864258,
"learning_rate": 2.1800000000000003e-06,
"loss": 0.1706,
"step": 110
},
{
"epoch": 3.0,
"grad_norm": 0.2663685083389282,
"learning_rate": 2.38e-06,
"loss": 0.1497,
"step": 120
},
{
"epoch": 3.25,
"grad_norm": 0.28943586349487305,
"learning_rate": 2.5800000000000003e-06,
"loss": 0.1377,
"step": 130
},
{
"epoch": 3.5,
"grad_norm": 0.20089849829673767,
"learning_rate": 2.78e-06,
"loss": 0.1256,
"step": 140
},
{
"epoch": 3.75,
"grad_norm": 0.17530840635299683,
"learning_rate": 2.9800000000000003e-06,
"loss": 0.1201,
"step": 150
},
{
"epoch": 4.0,
"grad_norm": 0.19372163712978363,
"learning_rate": 3.1800000000000005e-06,
"loss": 0.1136,
"step": 160
},
{
"epoch": 4.25,
"grad_norm": 0.13657791912555695,
"learning_rate": 3.38e-06,
"loss": 0.1059,
"step": 170
},
{
"epoch": 4.5,
"grad_norm": 0.1357901245355606,
"learning_rate": 3.58e-06,
"loss": 0.099,
"step": 180
},
{
"epoch": 4.75,
"grad_norm": 0.1591852754354477,
"learning_rate": 3.7800000000000002e-06,
"loss": 0.0986,
"step": 190
},
{
"epoch": 5.0,
"grad_norm": 0.12721344828605652,
"learning_rate": 3.98e-06,
"loss": 0.0935,
"step": 200
},
{
"epoch": 5.25,
"grad_norm": 0.14519330859184265,
"learning_rate": 4.18e-06,
"loss": 0.0862,
"step": 210
},
{
"epoch": 5.5,
"grad_norm": 0.17094707489013672,
"learning_rate": 4.38e-06,
"loss": 0.0864,
"step": 220
},
{
"epoch": 5.75,
"grad_norm": 0.11924029141664505,
"learning_rate": 4.58e-06,
"loss": 0.082,
"step": 230
},
{
"epoch": 6.0,
"grad_norm": 0.13231466710567474,
"learning_rate": 4.780000000000001e-06,
"loss": 0.0787,
"step": 240
},
{
"epoch": 6.25,
"grad_norm": 0.17676876485347748,
"learning_rate": 4.98e-06,
"loss": 0.0751,
"step": 250
},
{
"epoch": 6.5,
"grad_norm": 0.12683098018169403,
"learning_rate": 5.18e-06,
"loss": 0.0704,
"step": 260
},
{
"epoch": 6.75,
"grad_norm": 0.14490821957588196,
"learning_rate": 5.38e-06,
"loss": 0.0658,
"step": 270
},
{
"epoch": 7.0,
"grad_norm": 0.11153838038444519,
"learning_rate": 5.580000000000001e-06,
"loss": 0.0626,
"step": 280
},
{
"epoch": 7.25,
"grad_norm": 0.11997738480567932,
"learning_rate": 5.78e-06,
"loss": 0.0613,
"step": 290
},
{
"epoch": 7.5,
"grad_norm": 0.15803208947181702,
"learning_rate": 5.98e-06,
"loss": 0.0598,
"step": 300
},
{
"epoch": 7.75,
"grad_norm": 0.1401054412126541,
"learning_rate": 6.18e-06,
"loss": 0.0555,
"step": 310
},
{
"epoch": 8.0,
"grad_norm": 0.17630530893802643,
"learning_rate": 6.38e-06,
"loss": 0.0532,
"step": 320
},
{
"epoch": 8.25,
"grad_norm": 0.18545646965503693,
"learning_rate": 6.58e-06,
"loss": 0.0492,
"step": 330
},
{
"epoch": 8.5,
"grad_norm": 0.1365729719400406,
"learning_rate": 6.78e-06,
"loss": 0.0509,
"step": 340
},
{
"epoch": 8.75,
"grad_norm": 0.14690732955932617,
"learning_rate": 6.98e-06,
"loss": 0.0455,
"step": 350
},
{
"epoch": 9.0,
"grad_norm": 0.13823232054710388,
"learning_rate": 7.180000000000001e-06,
"loss": 0.0432,
"step": 360
},
{
"epoch": 9.25,
"grad_norm": 0.14099512994289398,
"learning_rate": 7.3800000000000005e-06,
"loss": 0.0434,
"step": 370
},
{
"epoch": 9.5,
"grad_norm": 0.1966245025396347,
"learning_rate": 7.580000000000001e-06,
"loss": 0.0404,
"step": 380
},
{
"epoch": 9.75,
"grad_norm": 0.2028568536043167,
"learning_rate": 7.78e-06,
"loss": 0.0411,
"step": 390
},
{
"epoch": 10.0,
"grad_norm": 0.14070606231689453,
"learning_rate": 7.98e-06,
"loss": 0.039,
"step": 400
},
{
"epoch": 10.25,
"grad_norm": 0.15250281989574432,
"learning_rate": 8.18e-06,
"loss": 0.0353,
"step": 410
},
{
"epoch": 10.5,
"grad_norm": 0.13883410394191742,
"learning_rate": 8.380000000000001e-06,
"loss": 0.0344,
"step": 420
},
{
"epoch": 10.75,
"grad_norm": 0.16541792452335358,
"learning_rate": 8.580000000000001e-06,
"loss": 0.0359,
"step": 430
},
{
"epoch": 11.0,
"grad_norm": 0.1294703632593155,
"learning_rate": 8.78e-06,
"loss": 0.0336,
"step": 440
},
{
"epoch": 11.25,
"grad_norm": 0.1688312292098999,
"learning_rate": 8.98e-06,
"loss": 0.0341,
"step": 450
},
{
"epoch": 11.5,
"grad_norm": 0.1562804877758026,
"learning_rate": 9.180000000000002e-06,
"loss": 0.0315,
"step": 460
},
{
"epoch": 11.75,
"grad_norm": 0.12476273626089096,
"learning_rate": 9.38e-06,
"loss": 0.0311,
"step": 470
},
{
"epoch": 12.0,
"grad_norm": 0.1468283087015152,
"learning_rate": 9.58e-06,
"loss": 0.0305,
"step": 480
},
{
"epoch": 12.25,
"grad_norm": 0.1696883887052536,
"learning_rate": 9.78e-06,
"loss": 0.0295,
"step": 490
},
{
"epoch": 12.5,
"grad_norm": 0.16157524287700653,
"learning_rate": 9.980000000000001e-06,
"loss": 0.0296,
"step": 500
},
{
"epoch": 12.75,
"grad_norm": 0.2175641655921936,
"learning_rate": 1.018e-05,
"loss": 0.0284,
"step": 510
},
{
"epoch": 13.0,
"grad_norm": 0.17267122864723206,
"learning_rate": 1.038e-05,
"loss": 0.0278,
"step": 520
},
{
"epoch": 13.25,
"grad_norm": 0.202761709690094,
"learning_rate": 1.058e-05,
"loss": 0.0272,
"step": 530
},
{
"epoch": 13.5,
"grad_norm": 0.1282481700181961,
"learning_rate": 1.0780000000000002e-05,
"loss": 0.0263,
"step": 540
},
{
"epoch": 13.75,
"grad_norm": 0.1510225236415863,
"learning_rate": 1.098e-05,
"loss": 0.0261,
"step": 550
},
{
"epoch": 14.0,
"grad_norm": 0.15521079301834106,
"learning_rate": 1.118e-05,
"loss": 0.0262,
"step": 560
},
{
"epoch": 14.25,
"grad_norm": 0.16990794241428375,
"learning_rate": 1.1380000000000001e-05,
"loss": 0.0251,
"step": 570
},
{
"epoch": 14.5,
"grad_norm": 0.15591584146022797,
"learning_rate": 1.1580000000000001e-05,
"loss": 0.0253,
"step": 580
},
{
"epoch": 14.75,
"grad_norm": 0.14896942675113678,
"learning_rate": 1.178e-05,
"loss": 0.0258,
"step": 590
},
{
"epoch": 15.0,
"grad_norm": 0.18454653024673462,
"learning_rate": 1.198e-05,
"loss": 0.0241,
"step": 600
},
{
"epoch": 15.25,
"grad_norm": 0.18142494559288025,
"learning_rate": 1.2180000000000002e-05,
"loss": 0.025,
"step": 610
},
{
"epoch": 15.5,
"grad_norm": 0.1759718656539917,
"learning_rate": 1.238e-05,
"loss": 0.023,
"step": 620
},
{
"epoch": 15.75,
"grad_norm": 0.12727558612823486,
"learning_rate": 1.258e-05,
"loss": 0.0242,
"step": 630
},
{
"epoch": 16.0,
"grad_norm": 0.1535845398902893,
"learning_rate": 1.278e-05,
"loss": 0.0236,
"step": 640
},
{
"epoch": 16.25,
"grad_norm": 0.19321507215499878,
"learning_rate": 1.2980000000000001e-05,
"loss": 0.022,
"step": 650
},
{
"epoch": 16.5,
"grad_norm": 0.2249348759651184,
"learning_rate": 1.3180000000000001e-05,
"loss": 0.0229,
"step": 660
},
{
"epoch": 16.75,
"grad_norm": 0.19657425582408905,
"learning_rate": 1.338e-05,
"loss": 0.0223,
"step": 670
},
{
"epoch": 17.0,
"grad_norm": 0.17625083029270172,
"learning_rate": 1.358e-05,
"loss": 0.022,
"step": 680
},
{
"epoch": 17.25,
"grad_norm": 0.2059263288974762,
"learning_rate": 1.3780000000000002e-05,
"loss": 0.0204,
"step": 690
},
{
"epoch": 17.5,
"grad_norm": 0.2298121601343155,
"learning_rate": 1.3980000000000002e-05,
"loss": 0.0214,
"step": 700
},
{
"epoch": 17.75,
"grad_norm": 0.18725836277008057,
"learning_rate": 1.4180000000000001e-05,
"loss": 0.0213,
"step": 710
},
{
"epoch": 18.0,
"grad_norm": 0.21040500700473785,
"learning_rate": 1.4380000000000001e-05,
"loss": 0.0221,
"step": 720
},
{
"epoch": 18.25,
"grad_norm": 0.20119509100914001,
"learning_rate": 1.4580000000000003e-05,
"loss": 0.0208,
"step": 730
},
{
"epoch": 18.5,
"grad_norm": 0.1417151838541031,
"learning_rate": 1.4779999999999999e-05,
"loss": 0.0223,
"step": 740
},
{
"epoch": 18.75,
"grad_norm": 0.16525112092494965,
"learning_rate": 1.4979999999999999e-05,
"loss": 0.0191,
"step": 750
},
{
"epoch": 19.0,
"grad_norm": 0.1953587532043457,
"learning_rate": 1.518e-05,
"loss": 0.0192,
"step": 760
},
{
"epoch": 19.25,
"grad_norm": 0.19152410328388214,
"learning_rate": 1.538e-05,
"loss": 0.0187,
"step": 770
},
{
"epoch": 19.5,
"grad_norm": 0.1645529419183731,
"learning_rate": 1.558e-05,
"loss": 0.02,
"step": 780
},
{
"epoch": 19.75,
"grad_norm": 0.16479845345020294,
"learning_rate": 1.578e-05,
"loss": 0.0197,
"step": 790
},
{
"epoch": 20.0,
"grad_norm": 0.20477408170700073,
"learning_rate": 1.598e-05,
"loss": 0.019,
"step": 800
},
{
"epoch": 20.25,
"grad_norm": 0.1593884378671646,
"learning_rate": 1.618e-05,
"loss": 0.0182,
"step": 810
},
{
"epoch": 20.5,
"grad_norm": 0.2143949717283249,
"learning_rate": 1.6380000000000002e-05,
"loss": 0.0192,
"step": 820
},
{
"epoch": 20.75,
"grad_norm": 0.21650566160678864,
"learning_rate": 1.658e-05,
"loss": 0.0201,
"step": 830
},
{
"epoch": 21.0,
"grad_norm": 0.1762174516916275,
"learning_rate": 1.6780000000000002e-05,
"loss": 0.018,
"step": 840
},
{
"epoch": 21.25,
"grad_norm": 0.252549409866333,
"learning_rate": 1.698e-05,
"loss": 0.0184,
"step": 850
},
{
"epoch": 21.5,
"grad_norm": 0.26648053526878357,
"learning_rate": 1.718e-05,
"loss": 0.0179,
"step": 860
},
{
"epoch": 21.75,
"grad_norm": 0.18754205107688904,
"learning_rate": 1.7380000000000003e-05,
"loss": 0.0187,
"step": 870
},
{
"epoch": 22.0,
"grad_norm": 0.18613967299461365,
"learning_rate": 1.758e-05,
"loss": 0.0177,
"step": 880
},
{
"epoch": 22.25,
"grad_norm": 0.20721137523651123,
"learning_rate": 1.7780000000000003e-05,
"loss": 0.018,
"step": 890
},
{
"epoch": 22.5,
"grad_norm": 0.21724781394004822,
"learning_rate": 1.798e-05,
"loss": 0.0185,
"step": 900
},
{
"epoch": 22.75,
"grad_norm": 0.18366505205631256,
"learning_rate": 1.818e-05,
"loss": 0.0164,
"step": 910
},
{
"epoch": 23.0,
"grad_norm": 0.1680997759103775,
"learning_rate": 1.838e-05,
"loss": 0.0171,
"step": 920
},
{
"epoch": 23.25,
"grad_norm": 0.19146476686000824,
"learning_rate": 1.858e-05,
"loss": 0.0174,
"step": 930
},
{
"epoch": 23.5,
"grad_norm": 0.21596679091453552,
"learning_rate": 1.878e-05,
"loss": 0.0173,
"step": 940
},
{
"epoch": 23.75,
"grad_norm": 0.1829003393650055,
"learning_rate": 1.898e-05,
"loss": 0.017,
"step": 950
},
{
"epoch": 24.0,
"grad_norm": 0.1917005032300949,
"learning_rate": 1.918e-05,
"loss": 0.0162,
"step": 960
},
{
"epoch": 24.25,
"grad_norm": 0.1833629459142685,
"learning_rate": 1.938e-05,
"loss": 0.0163,
"step": 970
},
{
"epoch": 24.5,
"grad_norm": 0.23003654181957245,
"learning_rate": 1.9580000000000002e-05,
"loss": 0.0171,
"step": 980
},
{
"epoch": 24.75,
"grad_norm": 0.16011449694633484,
"learning_rate": 1.978e-05,
"loss": 0.0161,
"step": 990
},
{
"epoch": 25.0,
"grad_norm": 0.19529913365840912,
"learning_rate": 1.9980000000000002e-05,
"loss": 0.0188,
"step": 1000
},
{
"epoch": 25.25,
"grad_norm": 0.1939581334590912,
"learning_rate": 2.0180000000000003e-05,
"loss": 0.016,
"step": 1010
},
{
"epoch": 25.5,
"grad_norm": 0.2373170107603073,
"learning_rate": 2.038e-05,
"loss": 0.0153,
"step": 1020
},
{
"epoch": 25.75,
"grad_norm": 0.29833200573921204,
"learning_rate": 2.0580000000000003e-05,
"loss": 0.0162,
"step": 1030
},
{
"epoch": 26.0,
"grad_norm": 0.19248932600021362,
"learning_rate": 2.078e-05,
"loss": 0.0165,
"step": 1040
},
{
"epoch": 26.25,
"grad_norm": 0.16877706348896027,
"learning_rate": 2.098e-05,
"loss": 0.015,
"step": 1050
},
{
"epoch": 26.5,
"grad_norm": 0.17076095938682556,
"learning_rate": 2.118e-05,
"loss": 0.015,
"step": 1060
},
{
"epoch": 26.75,
"grad_norm": 0.11618935316801071,
"learning_rate": 2.138e-05,
"loss": 0.0145,
"step": 1070
},
{
"epoch": 27.0,
"grad_norm": 0.16996583342552185,
"learning_rate": 2.158e-05,
"loss": 0.0154,
"step": 1080
},
{
"epoch": 27.25,
"grad_norm": 0.15623579919338226,
"learning_rate": 2.178e-05,
"loss": 0.0152,
"step": 1090
},
{
"epoch": 27.5,
"grad_norm": 0.22260257601737976,
"learning_rate": 2.198e-05,
"loss": 0.0159,
"step": 1100
},
{
"epoch": 27.75,
"grad_norm": 0.1553281843662262,
"learning_rate": 2.218e-05,
"loss": 0.0151,
"step": 1110
},
{
"epoch": 28.0,
"grad_norm": 0.23849648237228394,
"learning_rate": 2.2380000000000003e-05,
"loss": 0.0144,
"step": 1120
},
{
"epoch": 28.25,
"grad_norm": 0.3228089213371277,
"learning_rate": 2.258e-05,
"loss": 0.0143,
"step": 1130
},
{
"epoch": 28.5,
"grad_norm": 0.1590012162923813,
"learning_rate": 2.2780000000000002e-05,
"loss": 0.0148,
"step": 1140
},
{
"epoch": 28.75,
"grad_norm": 0.2641109824180603,
"learning_rate": 2.298e-05,
"loss": 0.0151,
"step": 1150
},
{
"epoch": 29.0,
"grad_norm": 0.20940740406513214,
"learning_rate": 2.318e-05,
"loss": 0.0149,
"step": 1160
},
{
"epoch": 29.25,
"grad_norm": 0.2249547690153122,
"learning_rate": 2.3380000000000003e-05,
"loss": 0.0149,
"step": 1170
},
{
"epoch": 29.5,
"grad_norm": 0.23424924910068512,
"learning_rate": 2.358e-05,
"loss": 0.0155,
"step": 1180
},
{
"epoch": 29.75,
"grad_norm": 0.24567143619060516,
"learning_rate": 2.3780000000000003e-05,
"loss": 0.0151,
"step": 1190
},
{
"epoch": 30.0,
"grad_norm": 0.2167378067970276,
"learning_rate": 2.398e-05,
"loss": 0.0151,
"step": 1200
},
{
"epoch": 30.25,
"grad_norm": 0.24560105800628662,
"learning_rate": 2.418e-05,
"loss": 0.0156,
"step": 1210
},
{
"epoch": 30.5,
"grad_norm": 0.2878526747226715,
"learning_rate": 2.438e-05,
"loss": 0.0145,
"step": 1220
},
{
"epoch": 30.75,
"grad_norm": 0.16258525848388672,
"learning_rate": 2.4580000000000002e-05,
"loss": 0.0144,
"step": 1230
},
{
"epoch": 31.0,
"grad_norm": 0.18433189392089844,
"learning_rate": 2.478e-05,
"loss": 0.0156,
"step": 1240
},
{
"epoch": 31.25,
"grad_norm": 0.23272426426410675,
"learning_rate": 2.498e-05,
"loss": 0.0147,
"step": 1250
},
{
"epoch": 31.5,
"grad_norm": 0.27221301198005676,
"learning_rate": 2.5180000000000003e-05,
"loss": 0.0151,
"step": 1260
},
{
"epoch": 31.75,
"grad_norm": 0.19055114686489105,
"learning_rate": 2.5380000000000004e-05,
"loss": 0.0134,
"step": 1270
},
{
"epoch": 32.0,
"grad_norm": 0.24308407306671143,
"learning_rate": 2.5580000000000002e-05,
"loss": 0.0141,
"step": 1280
},
{
"epoch": 32.25,
"grad_norm": 0.19755953550338745,
"learning_rate": 2.5779999999999997e-05,
"loss": 0.0145,
"step": 1290
},
{
"epoch": 32.5,
"grad_norm": 0.22123226523399353,
"learning_rate": 2.598e-05,
"loss": 0.0128,
"step": 1300
},
{
"epoch": 32.75,
"grad_norm": 0.2962028980255127,
"learning_rate": 2.618e-05,
"loss": 0.0135,
"step": 1310
},
{
"epoch": 33.0,
"grad_norm": 0.2808900475502014,
"learning_rate": 2.6379999999999998e-05,
"loss": 0.0141,
"step": 1320
},
{
"epoch": 33.25,
"grad_norm": 0.2979101240634918,
"learning_rate": 2.658e-05,
"loss": 0.0131,
"step": 1330
},
{
"epoch": 33.5,
"grad_norm": 0.2694721817970276,
"learning_rate": 2.678e-05,
"loss": 0.0134,
"step": 1340
},
{
"epoch": 33.75,
"grad_norm": 0.2524833381175995,
"learning_rate": 2.698e-05,
"loss": 0.0135,
"step": 1350
},
{
"epoch": 34.0,
"grad_norm": 0.2554227113723755,
"learning_rate": 2.718e-05,
"loss": 0.0132,
"step": 1360
},
{
"epoch": 34.25,
"grad_norm": 0.2688157260417938,
"learning_rate": 2.738e-05,
"loss": 0.0125,
"step": 1370
},
{
"epoch": 34.5,
"grad_norm": 0.20932850241661072,
"learning_rate": 2.758e-05,
"loss": 0.0137,
"step": 1380
},
{
"epoch": 34.75,
"grad_norm": 0.19536396861076355,
"learning_rate": 2.778e-05,
"loss": 0.0134,
"step": 1390
},
{
"epoch": 35.0,
"grad_norm": 0.19312022626399994,
"learning_rate": 2.798e-05,
"loss": 0.015,
"step": 1400
},
{
"epoch": 35.25,
"grad_norm": 0.15348872542381287,
"learning_rate": 2.818e-05,
"loss": 0.0134,
"step": 1410
},
{
"epoch": 35.5,
"grad_norm": 0.19372671842575073,
"learning_rate": 2.8380000000000003e-05,
"loss": 0.0132,
"step": 1420
},
{
"epoch": 35.75,
"grad_norm": 0.16821452975273132,
"learning_rate": 2.858e-05,
"loss": 0.0124,
"step": 1430
},
{
"epoch": 36.0,
"grad_norm": 0.14121121168136597,
"learning_rate": 2.8780000000000002e-05,
"loss": 0.0131,
"step": 1440
},
{
"epoch": 36.25,
"grad_norm": 0.14743903279304504,
"learning_rate": 2.898e-05,
"loss": 0.0122,
"step": 1450
},
{
"epoch": 36.5,
"grad_norm": 0.1689988076686859,
"learning_rate": 2.9180000000000002e-05,
"loss": 0.0121,
"step": 1460
},
{
"epoch": 36.75,
"grad_norm": 0.1635463982820511,
"learning_rate": 2.9380000000000003e-05,
"loss": 0.0123,
"step": 1470
},
{
"epoch": 37.0,
"grad_norm": 0.2134884148836136,
"learning_rate": 2.958e-05,
"loss": 0.0118,
"step": 1480
},
{
"epoch": 37.25,
"grad_norm": 0.1917855590581894,
"learning_rate": 2.9780000000000003e-05,
"loss": 0.0135,
"step": 1490
},
{
"epoch": 37.5,
"grad_norm": 0.25814926624298096,
"learning_rate": 2.998e-05,
"loss": 0.0119,
"step": 1500
},
{
"epoch": 37.75,
"grad_norm": 0.19456756114959717,
"learning_rate": 3.0180000000000002e-05,
"loss": 0.0126,
"step": 1510
},
{
"epoch": 38.0,
"grad_norm": 0.16085229814052582,
"learning_rate": 3.0380000000000004e-05,
"loss": 0.0128,
"step": 1520
},
{
"epoch": 38.25,
"grad_norm": 0.1836978793144226,
"learning_rate": 3.058e-05,
"loss": 0.0127,
"step": 1530
},
{
"epoch": 38.5,
"grad_norm": 0.2007584571838379,
"learning_rate": 3.078e-05,
"loss": 0.012,
"step": 1540
},
{
"epoch": 38.75,
"grad_norm": 0.2957303524017334,
"learning_rate": 3.0980000000000005e-05,
"loss": 0.0116,
"step": 1550
},
{
"epoch": 39.0,
"grad_norm": 0.3204158544540405,
"learning_rate": 3.118e-05,
"loss": 0.0126,
"step": 1560
},
{
"epoch": 39.25,
"grad_norm": 0.1719009280204773,
"learning_rate": 3.138e-05,
"loss": 0.011,
"step": 1570
},
{
"epoch": 39.5,
"grad_norm": 0.16781948506832123,
"learning_rate": 3.1580000000000006e-05,
"loss": 0.0117,
"step": 1580
},
{
"epoch": 39.75,
"grad_norm": 0.21803607046604156,
"learning_rate": 3.1780000000000004e-05,
"loss": 0.0122,
"step": 1590
},
{
"epoch": 40.0,
"grad_norm": 0.2320670336484909,
"learning_rate": 3.198e-05,
"loss": 0.0126,
"step": 1600
},
{
"epoch": 40.25,
"grad_norm": 0.24788905680179596,
"learning_rate": 3.218e-05,
"loss": 0.0124,
"step": 1610
},
{
"epoch": 40.5,
"grad_norm": 0.1828049123287201,
"learning_rate": 3.238e-05,
"loss": 0.0112,
"step": 1620
},
{
"epoch": 40.75,
"grad_norm": 0.23975656926631927,
"learning_rate": 3.2579999999999996e-05,
"loss": 0.0124,
"step": 1630
},
{
"epoch": 41.0,
"grad_norm": 0.21705782413482666,
"learning_rate": 3.278e-05,
"loss": 0.0112,
"step": 1640
},
{
"epoch": 41.25,
"grad_norm": 0.22534944117069244,
"learning_rate": 3.298e-05,
"loss": 0.0116,
"step": 1650
},
{
"epoch": 41.5,
"grad_norm": 0.16665105521678925,
"learning_rate": 3.318e-05,
"loss": 0.0134,
"step": 1660
},
{
"epoch": 41.75,
"grad_norm": 0.2458990514278412,
"learning_rate": 3.338e-05,
"loss": 0.012,
"step": 1670
},
{
"epoch": 42.0,
"grad_norm": 0.192895770072937,
"learning_rate": 3.358e-05,
"loss": 0.0119,
"step": 1680
},
{
"epoch": 42.25,
"grad_norm": 0.21721677482128143,
"learning_rate": 3.378e-05,
"loss": 0.0126,
"step": 1690
},
{
"epoch": 42.5,
"grad_norm": 0.2175186723470688,
"learning_rate": 3.398e-05,
"loss": 0.0124,
"step": 1700
},
{
"epoch": 42.75,
"grad_norm": 0.1788182556629181,
"learning_rate": 3.418e-05,
"loss": 0.0119,
"step": 1710
},
{
"epoch": 43.0,
"grad_norm": 0.17151236534118652,
"learning_rate": 3.438e-05,
"loss": 0.0115,
"step": 1720
},
{
"epoch": 43.25,
"grad_norm": 0.22371003031730652,
"learning_rate": 3.4580000000000004e-05,
"loss": 0.0121,
"step": 1730
},
{
"epoch": 43.5,
"grad_norm": 0.26781198382377625,
"learning_rate": 3.478e-05,
"loss": 0.0116,
"step": 1740
},
{
"epoch": 43.75,
"grad_norm": 0.21033993363380432,
"learning_rate": 3.498e-05,
"loss": 0.0106,
"step": 1750
},
{
"epoch": 44.0,
"grad_norm": 0.22407804429531097,
"learning_rate": 3.518e-05,
"loss": 0.0109,
"step": 1760
},
{
"epoch": 44.25,
"grad_norm": 0.18357867002487183,
"learning_rate": 3.5380000000000003e-05,
"loss": 0.0107,
"step": 1770
},
{
"epoch": 44.5,
"grad_norm": 0.1971215456724167,
"learning_rate": 3.558e-05,
"loss": 0.0112,
"step": 1780
},
{
"epoch": 44.75,
"grad_norm": 0.18872156739234924,
"learning_rate": 3.578e-05,
"loss": 0.0105,
"step": 1790
},
{
"epoch": 45.0,
"grad_norm": 0.17367962002754211,
"learning_rate": 3.5980000000000004e-05,
"loss": 0.0115,
"step": 1800
},
{
"epoch": 45.25,
"grad_norm": 0.14945487678050995,
"learning_rate": 3.618e-05,
"loss": 0.0113,
"step": 1810
},
{
"epoch": 45.5,
"grad_norm": 0.1590161919593811,
"learning_rate": 3.638e-05,
"loss": 0.0103,
"step": 1820
},
{
"epoch": 45.75,
"grad_norm": 0.18627125024795532,
"learning_rate": 3.6580000000000006e-05,
"loss": 0.0111,
"step": 1830
},
{
"epoch": 46.0,
"grad_norm": 0.19844526052474976,
"learning_rate": 3.6780000000000004e-05,
"loss": 0.0111,
"step": 1840
},
{
"epoch": 46.25,
"grad_norm": 0.17142851650714874,
"learning_rate": 3.698e-05,
"loss": 0.0114,
"step": 1850
},
{
"epoch": 46.5,
"grad_norm": 0.1916825920343399,
"learning_rate": 3.7180000000000007e-05,
"loss": 0.0099,
"step": 1860
},
{
"epoch": 46.75,
"grad_norm": 0.1983088254928589,
"learning_rate": 3.7380000000000005e-05,
"loss": 0.0102,
"step": 1870
},
{
"epoch": 47.0,
"grad_norm": 0.11927555501461029,
"learning_rate": 3.758e-05,
"loss": 0.0099,
"step": 1880
},
{
"epoch": 47.25,
"grad_norm": 0.1939011514186859,
"learning_rate": 3.778000000000001e-05,
"loss": 0.0106,
"step": 1890
},
{
"epoch": 47.5,
"grad_norm": 0.20579646527767181,
"learning_rate": 3.7980000000000006e-05,
"loss": 0.01,
"step": 1900
},
{
"epoch": 47.75,
"grad_norm": 0.19019336998462677,
"learning_rate": 3.818e-05,
"loss": 0.0102,
"step": 1910
},
{
"epoch": 48.0,
"grad_norm": 0.17853817343711853,
"learning_rate": 3.838e-05,
"loss": 0.0111,
"step": 1920
},
{
"epoch": 48.25,
"grad_norm": 0.2036396712064743,
"learning_rate": 3.858e-05,
"loss": 0.0106,
"step": 1930
},
{
"epoch": 48.5,
"grad_norm": 0.1510358452796936,
"learning_rate": 3.878e-05,
"loss": 0.0104,
"step": 1940
},
{
"epoch": 48.75,
"grad_norm": 0.21868108212947845,
"learning_rate": 3.898e-05,
"loss": 0.0107,
"step": 1950
},
{
"epoch": 49.0,
"grad_norm": 0.12815332412719727,
"learning_rate": 3.918e-05,
"loss": 0.0099,
"step": 1960
},
{
"epoch": 49.25,
"grad_norm": 0.22570985555648804,
"learning_rate": 3.938e-05,
"loss": 0.0102,
"step": 1970
},
{
"epoch": 49.5,
"grad_norm": 0.18665990233421326,
"learning_rate": 3.958e-05,
"loss": 0.0109,
"step": 1980
},
{
"epoch": 49.75,
"grad_norm": 0.24190504848957062,
"learning_rate": 3.978e-05,
"loss": 0.011,
"step": 1990
},
{
"epoch": 50.0,
"grad_norm": 0.2541164755821228,
"learning_rate": 3.998e-05,
"loss": 0.011,
"step": 2000
},
{
"epoch": 50.25,
"grad_norm": 0.25958317518234253,
"learning_rate": 4.018e-05,
"loss": 0.0096,
"step": 2010
},
{
"epoch": 50.5,
"grad_norm": 0.2289685308933258,
"learning_rate": 4.038e-05,
"loss": 0.0106,
"step": 2020
},
{
"epoch": 50.75,
"grad_norm": 0.19097794592380524,
"learning_rate": 4.058e-05,
"loss": 0.0105,
"step": 2030
},
{
"epoch": 51.0,
"grad_norm": 0.19423332810401917,
"learning_rate": 4.078e-05,
"loss": 0.0101,
"step": 2040
},
{
"epoch": 51.25,
"grad_norm": 0.20464728772640228,
"learning_rate": 4.0980000000000004e-05,
"loss": 0.0102,
"step": 2050
},
{
"epoch": 51.5,
"grad_norm": 0.22027207911014557,
"learning_rate": 4.118e-05,
"loss": 0.0094,
"step": 2060
},
{
"epoch": 51.75,
"grad_norm": 0.2023499310016632,
"learning_rate": 4.138e-05,
"loss": 0.0096,
"step": 2070
},
{
"epoch": 52.0,
"grad_norm": 0.2296619862318039,
"learning_rate": 4.1580000000000005e-05,
"loss": 0.0097,
"step": 2080
},
{
"epoch": 52.25,
"grad_norm": 0.25673484802246094,
"learning_rate": 4.178e-05,
"loss": 0.0099,
"step": 2090
},
{
"epoch": 52.5,
"grad_norm": 0.14860525727272034,
"learning_rate": 4.198e-05,
"loss": 0.0093,
"step": 2100
},
{
"epoch": 52.75,
"grad_norm": 0.20391502976417542,
"learning_rate": 4.2180000000000006e-05,
"loss": 0.0099,
"step": 2110
},
{
"epoch": 53.0,
"grad_norm": 0.1730753779411316,
"learning_rate": 4.2380000000000004e-05,
"loss": 0.0106,
"step": 2120
},
{
"epoch": 53.25,
"grad_norm": 0.24567775428295135,
"learning_rate": 4.258e-05,
"loss": 0.0101,
"step": 2130
},
{
"epoch": 53.5,
"grad_norm": 0.20263069868087769,
"learning_rate": 4.278e-05,
"loss": 0.01,
"step": 2140
},
{
"epoch": 53.75,
"grad_norm": 0.23891495168209076,
"learning_rate": 4.2980000000000005e-05,
"loss": 0.0102,
"step": 2150
},
{
"epoch": 54.0,
"grad_norm": 0.21011893451213837,
"learning_rate": 4.318e-05,
"loss": 0.0102,
"step": 2160
},
{
"epoch": 54.25,
"grad_norm": 0.19230227172374725,
"learning_rate": 4.338e-05,
"loss": 0.0103,
"step": 2170
},
{
"epoch": 54.5,
"grad_norm": 0.2491769939661026,
"learning_rate": 4.3580000000000006e-05,
"loss": 0.0093,
"step": 2180
},
{
"epoch": 54.75,
"grad_norm": 0.22918479144573212,
"learning_rate": 4.3780000000000004e-05,
"loss": 0.0088,
"step": 2190
},
{
"epoch": 55.0,
"grad_norm": 0.19413116574287415,
"learning_rate": 4.398e-05,
"loss": 0.01,
"step": 2200
},
{
"epoch": 55.25,
"grad_norm": 0.19914408028125763,
"learning_rate": 4.418000000000001e-05,
"loss": 0.0097,
"step": 2210
},
{
"epoch": 55.5,
"grad_norm": 0.2266189604997635,
"learning_rate": 4.438e-05,
"loss": 0.0101,
"step": 2220
},
{
"epoch": 55.75,
"grad_norm": 0.19024869799613953,
"learning_rate": 4.458e-05,
"loss": 0.0101,
"step": 2230
},
{
"epoch": 56.0,
"grad_norm": 0.2472565621137619,
"learning_rate": 4.478e-05,
"loss": 0.0105,
"step": 2240
},
{
"epoch": 56.25,
"grad_norm": 0.20708179473876953,
"learning_rate": 4.498e-05,
"loss": 0.0104,
"step": 2250
},
{
"epoch": 56.5,
"grad_norm": 0.22921694815158844,
"learning_rate": 4.518e-05,
"loss": 0.0109,
"step": 2260
},
{
"epoch": 56.75,
"grad_norm": 0.16786304116249084,
"learning_rate": 4.538e-05,
"loss": 0.0094,
"step": 2270
},
{
"epoch": 57.0,
"grad_norm": 0.20425142347812653,
"learning_rate": 4.558e-05,
"loss": 0.0093,
"step": 2280
},
{
"epoch": 57.25,
"grad_norm": 0.18586701154708862,
"learning_rate": 4.578e-05,
"loss": 0.0092,
"step": 2290
},
{
"epoch": 57.5,
"grad_norm": 0.22823484241962433,
"learning_rate": 4.5980000000000004e-05,
"loss": 0.0096,
"step": 2300
},
{
"epoch": 57.75,
"grad_norm": 0.21421211957931519,
"learning_rate": 4.618e-05,
"loss": 0.0095,
"step": 2310
},
{
"epoch": 58.0,
"grad_norm": 0.3175329267978668,
"learning_rate": 4.638e-05,
"loss": 0.0112,
"step": 2320
},
{
"epoch": 58.25,
"grad_norm": 0.206922248005867,
"learning_rate": 4.6580000000000005e-05,
"loss": 0.0093,
"step": 2330
},
{
"epoch": 58.5,
"grad_norm": 0.15221217274665833,
"learning_rate": 4.678e-05,
"loss": 0.0107,
"step": 2340
},
{
"epoch": 58.75,
"grad_norm": 0.15896166861057281,
"learning_rate": 4.698e-05,
"loss": 0.0086,
"step": 2350
},
{
"epoch": 59.0,
"grad_norm": 0.20901215076446533,
"learning_rate": 4.718e-05,
"loss": 0.0088,
"step": 2360
},
{
"epoch": 59.25,
"grad_norm": 0.263406902551651,
"learning_rate": 4.7380000000000004e-05,
"loss": 0.0092,
"step": 2370
},
{
"epoch": 59.5,
"grad_norm": 0.21237947046756744,
"learning_rate": 4.758e-05,
"loss": 0.0084,
"step": 2380
},
{
"epoch": 59.75,
"grad_norm": 0.17976641654968262,
"learning_rate": 4.778e-05,
"loss": 0.0082,
"step": 2390
},
{
"epoch": 60.0,
"grad_norm": 0.21562369167804718,
"learning_rate": 4.7980000000000005e-05,
"loss": 0.0095,
"step": 2400
},
{
"epoch": 60.25,
"grad_norm": 0.2006334811449051,
"learning_rate": 4.818e-05,
"loss": 0.0089,
"step": 2410
},
{
"epoch": 60.5,
"grad_norm": 0.1672360748052597,
"learning_rate": 4.838e-05,
"loss": 0.0087,
"step": 2420
},
{
"epoch": 60.75,
"grad_norm": 0.19504928588867188,
"learning_rate": 4.8580000000000006e-05,
"loss": 0.0088,
"step": 2430
},
{
"epoch": 61.0,
"grad_norm": 0.14196740090847015,
"learning_rate": 4.8780000000000004e-05,
"loss": 0.0094,
"step": 2440
},
{
"epoch": 61.25,
"grad_norm": 0.18832427263259888,
"learning_rate": 4.898e-05,
"loss": 0.0097,
"step": 2450
},
{
"epoch": 61.5,
"grad_norm": 0.1406528502702713,
"learning_rate": 4.918000000000001e-05,
"loss": 0.008,
"step": 2460
},
{
"epoch": 61.75,
"grad_norm": 0.17165546119213104,
"learning_rate": 4.9380000000000005e-05,
"loss": 0.0096,
"step": 2470
},
{
"epoch": 62.0,
"grad_norm": 0.1912836730480194,
"learning_rate": 4.958e-05,
"loss": 0.0099,
"step": 2480
},
{
"epoch": 62.25,
"grad_norm": 0.239595428109169,
"learning_rate": 4.978e-05,
"loss": 0.0093,
"step": 2490
},
{
"epoch": 62.5,
"grad_norm": 0.16011711955070496,
"learning_rate": 4.9980000000000006e-05,
"loss": 0.0083,
"step": 2500
},
{
"epoch": 62.75,
"grad_norm": 0.17068330943584442,
"learning_rate": 5.0180000000000004e-05,
"loss": 0.0084,
"step": 2510
},
{
"epoch": 63.0,
"grad_norm": 0.1504756510257721,
"learning_rate": 5.038e-05,
"loss": 0.0085,
"step": 2520
},
{
"epoch": 63.25,
"grad_norm": 0.20306769013404846,
"learning_rate": 5.058000000000001e-05,
"loss": 0.0093,
"step": 2530
},
{
"epoch": 63.5,
"grad_norm": 0.17090564966201782,
"learning_rate": 5.0780000000000005e-05,
"loss": 0.0075,
"step": 2540
},
{
"epoch": 63.75,
"grad_norm": 0.1755877137184143,
"learning_rate": 5.098e-05,
"loss": 0.0086,
"step": 2550
},
{
"epoch": 64.0,
"grad_norm": 0.11523690074682236,
"learning_rate": 5.118000000000001e-05,
"loss": 0.0084,
"step": 2560
},
{
"epoch": 64.25,
"grad_norm": 0.15952451527118683,
"learning_rate": 5.1380000000000006e-05,
"loss": 0.0085,
"step": 2570
},
{
"epoch": 64.5,
"grad_norm": 0.11506156623363495,
"learning_rate": 5.1580000000000004e-05,
"loss": 0.008,
"step": 2580
},
{
"epoch": 64.75,
"grad_norm": 0.21272993087768555,
"learning_rate": 5.178000000000001e-05,
"loss": 0.0083,
"step": 2590
},
{
"epoch": 65.0,
"grad_norm": 0.18400269746780396,
"learning_rate": 5.198000000000001e-05,
"loss": 0.0083,
"step": 2600
},
{
"epoch": 65.25,
"grad_norm": 0.1559101790189743,
"learning_rate": 5.2180000000000005e-05,
"loss": 0.0075,
"step": 2610
},
{
"epoch": 65.5,
"grad_norm": 0.1637546718120575,
"learning_rate": 5.238000000000001e-05,
"loss": 0.0083,
"step": 2620
},
{
"epoch": 65.75,
"grad_norm": 0.22429779171943665,
"learning_rate": 5.258000000000001e-05,
"loss": 0.0087,
"step": 2630
},
{
"epoch": 66.0,
"grad_norm": 0.20996035635471344,
"learning_rate": 5.2780000000000006e-05,
"loss": 0.0088,
"step": 2640
},
{
"epoch": 66.25,
"grad_norm": 0.15640440583229065,
"learning_rate": 5.2980000000000004e-05,
"loss": 0.0079,
"step": 2650
},
{
"epoch": 66.5,
"grad_norm": 0.22142156958580017,
"learning_rate": 5.318000000000001e-05,
"loss": 0.0091,
"step": 2660
},
{
"epoch": 66.75,
"grad_norm": 0.17605368793010712,
"learning_rate": 5.338000000000001e-05,
"loss": 0.0081,
"step": 2670
},
{
"epoch": 67.0,
"grad_norm": 0.1534302979707718,
"learning_rate": 5.3580000000000005e-05,
"loss": 0.0086,
"step": 2680
},
{
"epoch": 67.25,
"grad_norm": 0.18950927257537842,
"learning_rate": 5.378e-05,
"loss": 0.0086,
"step": 2690
},
{
"epoch": 67.5,
"grad_norm": 0.17753522098064423,
"learning_rate": 5.3979999999999995e-05,
"loss": 0.0075,
"step": 2700
},
{
"epoch": 67.75,
"grad_norm": 0.14533065259456635,
"learning_rate": 5.418e-05,
"loss": 0.0079,
"step": 2710
},
{
"epoch": 68.0,
"grad_norm": 0.17737789452075958,
"learning_rate": 5.438e-05,
"loss": 0.0081,
"step": 2720
},
{
"epoch": 68.25,
"grad_norm": 0.16454491019248962,
"learning_rate": 5.4579999999999996e-05,
"loss": 0.0079,
"step": 2730
},
{
"epoch": 68.5,
"grad_norm": 0.1953648179769516,
"learning_rate": 5.478e-05,
"loss": 0.0078,
"step": 2740
},
{
"epoch": 68.75,
"grad_norm": 0.20770540833473206,
"learning_rate": 5.498e-05,
"loss": 0.009,
"step": 2750
},
{
"epoch": 69.0,
"grad_norm": 0.18003995716571808,
"learning_rate": 5.518e-05,
"loss": 0.0079,
"step": 2760
},
{
"epoch": 69.25,
"grad_norm": 0.1802724152803421,
"learning_rate": 5.538e-05,
"loss": 0.008,
"step": 2770
},
{
"epoch": 69.5,
"grad_norm": 0.17148584127426147,
"learning_rate": 5.558e-05,
"loss": 0.0079,
"step": 2780
},
{
"epoch": 69.75,
"grad_norm": 0.21797288954257965,
"learning_rate": 5.578e-05,
"loss": 0.0075,
"step": 2790
},
{
"epoch": 70.0,
"grad_norm": 0.19681450724601746,
"learning_rate": 5.5979999999999996e-05,
"loss": 0.0085,
"step": 2800
},
{
"epoch": 70.25,
"grad_norm": 0.18321318924427032,
"learning_rate": 5.618e-05,
"loss": 0.0087,
"step": 2810
},
{
"epoch": 70.5,
"grad_norm": 0.18249335885047913,
"learning_rate": 5.638e-05,
"loss": 0.0073,
"step": 2820
},
{
"epoch": 70.75,
"grad_norm": 0.13530372083187103,
"learning_rate": 5.658e-05,
"loss": 0.007,
"step": 2830
},
{
"epoch": 71.0,
"grad_norm": 0.23054936528205872,
"learning_rate": 5.678e-05,
"loss": 0.0086,
"step": 2840
},
{
"epoch": 71.25,
"grad_norm": 0.2308930903673172,
"learning_rate": 5.698e-05,
"loss": 0.0086,
"step": 2850
},
{
"epoch": 71.5,
"grad_norm": 0.2094513475894928,
"learning_rate": 5.718e-05,
"loss": 0.0077,
"step": 2860
},
{
"epoch": 71.75,
"grad_norm": 0.17534701526165009,
"learning_rate": 5.738e-05,
"loss": 0.0076,
"step": 2870
},
{
"epoch": 72.0,
"grad_norm": 0.22989408671855927,
"learning_rate": 5.758e-05,
"loss": 0.0079,
"step": 2880
},
{
"epoch": 72.25,
"grad_norm": 0.2179604172706604,
"learning_rate": 5.778e-05,
"loss": 0.0093,
"step": 2890
},
{
"epoch": 72.5,
"grad_norm": 0.24805155396461487,
"learning_rate": 5.7980000000000004e-05,
"loss": 0.0084,
"step": 2900
},
{
"epoch": 72.75,
"grad_norm": 0.20227353274822235,
"learning_rate": 5.818e-05,
"loss": 0.01,
"step": 2910
},
{
"epoch": 73.0,
"grad_norm": 0.17762471735477448,
"learning_rate": 5.838e-05,
"loss": 0.0075,
"step": 2920
},
{
"epoch": 73.25,
"grad_norm": 0.25949570536613464,
"learning_rate": 5.858e-05,
"loss": 0.0079,
"step": 2930
},
{
"epoch": 73.5,
"grad_norm": 0.2676275670528412,
"learning_rate": 5.878e-05,
"loss": 0.0086,
"step": 2940
},
{
"epoch": 73.75,
"grad_norm": 0.1574457585811615,
"learning_rate": 5.898e-05,
"loss": 0.0079,
"step": 2950
},
{
"epoch": 74.0,
"grad_norm": 0.28759798407554626,
"learning_rate": 5.918e-05,
"loss": 0.0089,
"step": 2960
},
{
"epoch": 74.25,
"grad_norm": 0.18525859713554382,
"learning_rate": 5.9380000000000004e-05,
"loss": 0.0086,
"step": 2970
},
{
"epoch": 74.5,
"grad_norm": 0.1620161384344101,
"learning_rate": 5.958e-05,
"loss": 0.0084,
"step": 2980
},
{
"epoch": 74.75,
"grad_norm": 0.20352789759635925,
"learning_rate": 5.978e-05,
"loss": 0.0093,
"step": 2990
},
{
"epoch": 75.0,
"grad_norm": 0.2062734216451645,
"learning_rate": 5.9980000000000005e-05,
"loss": 0.0082,
"step": 3000
},
{
"epoch": 75.25,
"grad_norm": 0.18606604635715485,
"learning_rate": 6.018e-05,
"loss": 0.008,
"step": 3010
},
{
"epoch": 75.5,
"grad_norm": 0.15150688588619232,
"learning_rate": 6.038e-05,
"loss": 0.0075,
"step": 3020
},
{
"epoch": 75.75,
"grad_norm": 0.17146310210227966,
"learning_rate": 6.0580000000000006e-05,
"loss": 0.0077,
"step": 3030
},
{
"epoch": 76.0,
"grad_norm": 0.17717711627483368,
"learning_rate": 6.0780000000000004e-05,
"loss": 0.0078,
"step": 3040
},
{
"epoch": 76.25,
"grad_norm": 0.22910268604755402,
"learning_rate": 6.098e-05,
"loss": 0.0078,
"step": 3050
},
{
"epoch": 76.5,
"grad_norm": 0.1869814544916153,
"learning_rate": 6.118000000000001e-05,
"loss": 0.0081,
"step": 3060
},
{
"epoch": 76.75,
"grad_norm": 0.23087920248508453,
"learning_rate": 6.138e-05,
"loss": 0.0076,
"step": 3070
},
{
"epoch": 77.0,
"grad_norm": 0.17326343059539795,
"learning_rate": 6.158e-05,
"loss": 0.0078,
"step": 3080
},
{
"epoch": 77.25,
"grad_norm": 0.17539773881435394,
"learning_rate": 6.178000000000001e-05,
"loss": 0.0077,
"step": 3090
},
{
"epoch": 77.5,
"grad_norm": 0.16601373255252838,
"learning_rate": 6.198e-05,
"loss": 0.0071,
"step": 3100
},
{
"epoch": 77.75,
"grad_norm": 0.23548623919487,
"learning_rate": 6.218e-05,
"loss": 0.0072,
"step": 3110
},
{
"epoch": 78.0,
"grad_norm": 0.20448268949985504,
"learning_rate": 6.238000000000001e-05,
"loss": 0.0072,
"step": 3120
},
{
"epoch": 78.25,
"grad_norm": 0.18967969715595245,
"learning_rate": 6.258e-05,
"loss": 0.008,
"step": 3130
},
{
"epoch": 78.5,
"grad_norm": 0.15069612860679626,
"learning_rate": 6.278e-05,
"loss": 0.0078,
"step": 3140
},
{
"epoch": 78.75,
"grad_norm": 0.18829141557216644,
"learning_rate": 6.298000000000001e-05,
"loss": 0.0075,
"step": 3150
},
{
"epoch": 79.0,
"grad_norm": 0.25839942693710327,
"learning_rate": 6.318e-05,
"loss": 0.0071,
"step": 3160
},
{
"epoch": 79.25,
"grad_norm": 0.23017530143260956,
"learning_rate": 6.338e-05,
"loss": 0.0074,
"step": 3170
},
{
"epoch": 79.5,
"grad_norm": 0.21601979434490204,
"learning_rate": 6.358000000000001e-05,
"loss": 0.0085,
"step": 3180
},
{
"epoch": 79.75,
"grad_norm": 0.16384194791316986,
"learning_rate": 6.378e-05,
"loss": 0.0082,
"step": 3190
},
{
"epoch": 80.0,
"grad_norm": 0.18998737633228302,
"learning_rate": 6.398000000000001e-05,
"loss": 0.0083,
"step": 3200
},
{
"epoch": 80.25,
"grad_norm": 0.16958178579807281,
"learning_rate": 6.418000000000001e-05,
"loss": 0.0074,
"step": 3210
},
{
"epoch": 80.5,
"grad_norm": 0.17526504397392273,
"learning_rate": 6.438e-05,
"loss": 0.0078,
"step": 3220
},
{
"epoch": 80.75,
"grad_norm": 0.1629568487405777,
"learning_rate": 6.458000000000001e-05,
"loss": 0.007,
"step": 3230
},
{
"epoch": 81.0,
"grad_norm": 0.14959284663200378,
"learning_rate": 6.478000000000001e-05,
"loss": 0.0079,
"step": 3240
},
{
"epoch": 81.25,
"grad_norm": 0.12923310697078705,
"learning_rate": 6.498e-05,
"loss": 0.0074,
"step": 3250
},
{
"epoch": 81.5,
"grad_norm": 0.24613654613494873,
"learning_rate": 6.518000000000001e-05,
"loss": 0.0077,
"step": 3260
},
{
"epoch": 81.75,
"grad_norm": 0.248945415019989,
"learning_rate": 6.538000000000001e-05,
"loss": 0.0078,
"step": 3270
},
{
"epoch": 82.0,
"grad_norm": 0.2111428678035736,
"learning_rate": 6.558e-05,
"loss": 0.0078,
"step": 3280
},
{
"epoch": 82.25,
"grad_norm": 0.1845024675130844,
"learning_rate": 6.578000000000001e-05,
"loss": 0.007,
"step": 3290
},
{
"epoch": 82.5,
"grad_norm": 0.16099268198013306,
"learning_rate": 6.598e-05,
"loss": 0.0074,
"step": 3300
},
{
"epoch": 82.75,
"grad_norm": 0.16541269421577454,
"learning_rate": 6.618e-05,
"loss": 0.0071,
"step": 3310
},
{
"epoch": 83.0,
"grad_norm": 0.1501871943473816,
"learning_rate": 6.638e-05,
"loss": 0.0073,
"step": 3320
},
{
"epoch": 83.25,
"grad_norm": 0.17568759620189667,
"learning_rate": 6.658e-05,
"loss": 0.0083,
"step": 3330
},
{
"epoch": 83.5,
"grad_norm": 0.209712415933609,
"learning_rate": 6.678e-05,
"loss": 0.0077,
"step": 3340
},
{
"epoch": 83.75,
"grad_norm": 0.23022456467151642,
"learning_rate": 6.698e-05,
"loss": 0.0075,
"step": 3350
},
{
"epoch": 84.0,
"grad_norm": 0.19351983070373535,
"learning_rate": 6.718e-05,
"loss": 0.0084,
"step": 3360
},
{
"epoch": 84.25,
"grad_norm": 0.21300102770328522,
"learning_rate": 6.738e-05,
"loss": 0.0075,
"step": 3370
},
{
"epoch": 84.5,
"grad_norm": 0.11744840443134308,
"learning_rate": 6.758e-05,
"loss": 0.0071,
"step": 3380
},
{
"epoch": 84.75,
"grad_norm": 0.18190492689609528,
"learning_rate": 6.778e-05,
"loss": 0.0076,
"step": 3390
},
{
"epoch": 85.0,
"grad_norm": 0.17487181723117828,
"learning_rate": 6.798e-05,
"loss": 0.008,
"step": 3400
},
{
"epoch": 85.25,
"grad_norm": 0.09983476251363754,
"learning_rate": 6.818e-05,
"loss": 0.0068,
"step": 3410
},
{
"epoch": 85.5,
"grad_norm": 0.16663005948066711,
"learning_rate": 6.838e-05,
"loss": 0.0082,
"step": 3420
},
{
"epoch": 85.75,
"grad_norm": 0.20401281118392944,
"learning_rate": 6.858e-05,
"loss": 0.0076,
"step": 3430
},
{
"epoch": 86.0,
"grad_norm": 0.2125725895166397,
"learning_rate": 6.878e-05,
"loss": 0.0086,
"step": 3440
},
{
"epoch": 86.25,
"grad_norm": 0.14450369775295258,
"learning_rate": 6.898e-05,
"loss": 0.0065,
"step": 3450
},
{
"epoch": 86.5,
"grad_norm": 0.1645418107509613,
"learning_rate": 6.918e-05,
"loss": 0.0078,
"step": 3460
},
{
"epoch": 86.75,
"grad_norm": 0.15646201372146606,
"learning_rate": 6.938e-05,
"loss": 0.0073,
"step": 3470
},
{
"epoch": 87.0,
"grad_norm": 0.17534413933753967,
"learning_rate": 6.958e-05,
"loss": 0.0076,
"step": 3480
},
{
"epoch": 87.25,
"grad_norm": 0.13481944799423218,
"learning_rate": 6.978e-05,
"loss": 0.007,
"step": 3490
},
{
"epoch": 87.5,
"grad_norm": 0.10460406541824341,
"learning_rate": 6.998e-05,
"loss": 0.0074,
"step": 3500
},
{
"epoch": 87.75,
"grad_norm": 0.1374213844537735,
"learning_rate": 7.018e-05,
"loss": 0.0069,
"step": 3510
},
{
"epoch": 88.0,
"grad_norm": 0.12865151464939117,
"learning_rate": 7.038e-05,
"loss": 0.0063,
"step": 3520
},
{
"epoch": 88.25,
"grad_norm": 0.17576292157173157,
"learning_rate": 7.058e-05,
"loss": 0.007,
"step": 3530
},
{
"epoch": 88.5,
"grad_norm": 0.18292437493801117,
"learning_rate": 7.078e-05,
"loss": 0.0068,
"step": 3540
},
{
"epoch": 88.75,
"grad_norm": 0.18838587403297424,
"learning_rate": 7.098e-05,
"loss": 0.0075,
"step": 3550
},
{
"epoch": 89.0,
"grad_norm": 0.1894351989030838,
"learning_rate": 7.118e-05,
"loss": 0.0081,
"step": 3560
},
{
"epoch": 89.25,
"grad_norm": 0.14305658638477325,
"learning_rate": 7.138e-05,
"loss": 0.0072,
"step": 3570
},
{
"epoch": 89.5,
"grad_norm": 0.1975056380033493,
"learning_rate": 7.158e-05,
"loss": 0.0079,
"step": 3580
},
{
"epoch": 89.75,
"grad_norm": 0.11977282166481018,
"learning_rate": 7.178000000000001e-05,
"loss": 0.0067,
"step": 3590
},
{
"epoch": 90.0,
"grad_norm": 0.1803884357213974,
"learning_rate": 7.198e-05,
"loss": 0.0071,
"step": 3600
},
{
"epoch": 90.25,
"grad_norm": 0.17560359835624695,
"learning_rate": 7.218e-05,
"loss": 0.0067,
"step": 3610
},
{
"epoch": 90.5,
"grad_norm": 0.12173809111118317,
"learning_rate": 7.238000000000001e-05,
"loss": 0.0068,
"step": 3620
},
{
"epoch": 90.75,
"grad_norm": 0.1920517534017563,
"learning_rate": 7.258e-05,
"loss": 0.0073,
"step": 3630
},
{
"epoch": 91.0,
"grad_norm": 0.18857932090759277,
"learning_rate": 7.278e-05,
"loss": 0.0074,
"step": 3640
},
{
"epoch": 91.25,
"grad_norm": 0.196843221783638,
"learning_rate": 7.298000000000001e-05,
"loss": 0.0066,
"step": 3650
},
{
"epoch": 91.5,
"grad_norm": 0.16514630615711212,
"learning_rate": 7.318e-05,
"loss": 0.0069,
"step": 3660
},
{
"epoch": 91.75,
"grad_norm": 0.14540424942970276,
"learning_rate": 7.338e-05,
"loss": 0.0066,
"step": 3670
},
{
"epoch": 92.0,
"grad_norm": 0.1184212788939476,
"learning_rate": 7.358000000000001e-05,
"loss": 0.0066,
"step": 3680
},
{
"epoch": 92.25,
"grad_norm": 0.15860192477703094,
"learning_rate": 7.378e-05,
"loss": 0.0072,
"step": 3690
},
{
"epoch": 92.5,
"grad_norm": 0.12642768025398254,
"learning_rate": 7.398e-05,
"loss": 0.0067,
"step": 3700
},
{
"epoch": 92.75,
"grad_norm": 0.14021170139312744,
"learning_rate": 7.418000000000001e-05,
"loss": 0.0064,
"step": 3710
},
{
"epoch": 93.0,
"grad_norm": 0.16662588715553284,
"learning_rate": 7.438e-05,
"loss": 0.0065,
"step": 3720
},
{
"epoch": 93.25,
"grad_norm": 0.1377324014902115,
"learning_rate": 7.458000000000001e-05,
"loss": 0.0073,
"step": 3730
},
{
"epoch": 93.5,
"grad_norm": 0.14351031184196472,
"learning_rate": 7.478e-05,
"loss": 0.0068,
"step": 3740
},
{
"epoch": 93.75,
"grad_norm": 0.14035643637180328,
"learning_rate": 7.498e-05,
"loss": 0.0065,
"step": 3750
},
{
"epoch": 94.0,
"grad_norm": 0.17440786957740784,
"learning_rate": 7.518000000000001e-05,
"loss": 0.007,
"step": 3760
},
{
"epoch": 94.25,
"grad_norm": 0.17953602969646454,
"learning_rate": 7.538e-05,
"loss": 0.0081,
"step": 3770
},
{
"epoch": 94.5,
"grad_norm": 0.19098593294620514,
"learning_rate": 7.558e-05,
"loss": 0.0078,
"step": 3780
},
{
"epoch": 94.75,
"grad_norm": 0.1300899088382721,
"learning_rate": 7.578000000000001e-05,
"loss": 0.0063,
"step": 3790
},
{
"epoch": 95.0,
"grad_norm": 0.2068023532629013,
"learning_rate": 7.598e-05,
"loss": 0.0073,
"step": 3800
},
{
"epoch": 95.25,
"grad_norm": 0.17585402727127075,
"learning_rate": 7.618e-05,
"loss": 0.0069,
"step": 3810
},
{
"epoch": 95.5,
"grad_norm": 0.22300763428211212,
"learning_rate": 7.638000000000001e-05,
"loss": 0.0075,
"step": 3820
},
{
"epoch": 95.75,
"grad_norm": 0.185755655169487,
"learning_rate": 7.658e-05,
"loss": 0.0068,
"step": 3830
},
{
"epoch": 96.0,
"grad_norm": 0.1492988020181656,
"learning_rate": 7.678000000000001e-05,
"loss": 0.0071,
"step": 3840
},
{
"epoch": 96.25,
"grad_norm": 0.15368859469890594,
"learning_rate": 7.698000000000001e-05,
"loss": 0.0077,
"step": 3850
},
{
"epoch": 96.5,
"grad_norm": 0.19075649976730347,
"learning_rate": 7.718e-05,
"loss": 0.0075,
"step": 3860
},
{
"epoch": 96.75,
"grad_norm": 0.13382676243782043,
"learning_rate": 7.738000000000001e-05,
"loss": 0.0066,
"step": 3870
},
{
"epoch": 97.0,
"grad_norm": 0.16297952830791473,
"learning_rate": 7.758000000000001e-05,
"loss": 0.0077,
"step": 3880
},
{
"epoch": 97.25,
"grad_norm": 0.12056317180395126,
"learning_rate": 7.778e-05,
"loss": 0.0074,
"step": 3890
},
{
"epoch": 97.5,
"grad_norm": 0.17979000508785248,
"learning_rate": 7.798000000000001e-05,
"loss": 0.0073,
"step": 3900
},
{
"epoch": 97.75,
"grad_norm": 0.18691030144691467,
"learning_rate": 7.818000000000001e-05,
"loss": 0.0064,
"step": 3910
},
{
"epoch": 98.0,
"grad_norm": 0.1338334083557129,
"learning_rate": 7.838e-05,
"loss": 0.0067,
"step": 3920
},
{
"epoch": 98.25,
"grad_norm": 0.1993681937456131,
"learning_rate": 7.858000000000001e-05,
"loss": 0.0068,
"step": 3930
},
{
"epoch": 98.5,
"grad_norm": 0.14984053373336792,
"learning_rate": 7.878e-05,
"loss": 0.0072,
"step": 3940
},
{
"epoch": 98.75,
"grad_norm": 0.15467676520347595,
"learning_rate": 7.897999999999999e-05,
"loss": 0.0075,
"step": 3950
},
{
"epoch": 99.0,
"grad_norm": 0.13584424555301666,
"learning_rate": 7.918e-05,
"loss": 0.0067,
"step": 3960
},
{
"epoch": 99.25,
"grad_norm": 0.10644155740737915,
"learning_rate": 7.938e-05,
"loss": 0.0072,
"step": 3970
},
{
"epoch": 99.5,
"grad_norm": 0.1576027125120163,
"learning_rate": 7.958e-05,
"loss": 0.0068,
"step": 3980
},
{
"epoch": 99.75,
"grad_norm": 0.12459024786949158,
"learning_rate": 7.978e-05,
"loss": 0.0063,
"step": 3990
},
{
"epoch": 100.0,
"grad_norm": 0.1432129591703415,
"learning_rate": 7.998e-05,
"loss": 0.007,
"step": 4000
},
{
"epoch": 100.25,
"grad_norm": 0.13156336545944214,
"learning_rate": 8.018e-05,
"loss": 0.0063,
"step": 4010
},
{
"epoch": 100.5,
"grad_norm": 0.19238434731960297,
"learning_rate": 8.038e-05,
"loss": 0.0067,
"step": 4020
},
{
"epoch": 100.75,
"grad_norm": 0.14078587293624878,
"learning_rate": 8.058e-05,
"loss": 0.0065,
"step": 4030
},
{
"epoch": 101.0,
"grad_norm": 0.13242961466312408,
"learning_rate": 8.078e-05,
"loss": 0.0065,
"step": 4040
},
{
"epoch": 101.25,
"grad_norm": 0.14315347373485565,
"learning_rate": 8.098e-05,
"loss": 0.0071,
"step": 4050
},
{
"epoch": 101.5,
"grad_norm": 0.17269261181354523,
"learning_rate": 8.118e-05,
"loss": 0.0064,
"step": 4060
},
{
"epoch": 101.75,
"grad_norm": 0.12596949934959412,
"learning_rate": 8.138e-05,
"loss": 0.0075,
"step": 4070
},
{
"epoch": 102.0,
"grad_norm": 0.14768068492412567,
"learning_rate": 8.158e-05,
"loss": 0.0076,
"step": 4080
},
{
"epoch": 102.25,
"grad_norm": 0.16106919944286346,
"learning_rate": 8.178e-05,
"loss": 0.0068,
"step": 4090
},
{
"epoch": 102.5,
"grad_norm": 0.11802563816308975,
"learning_rate": 8.198e-05,
"loss": 0.0068,
"step": 4100
},
{
"epoch": 102.75,
"grad_norm": 0.14049091935157776,
"learning_rate": 8.218e-05,
"loss": 0.0064,
"step": 4110
},
{
"epoch": 103.0,
"grad_norm": 0.1250571757555008,
"learning_rate": 8.238000000000001e-05,
"loss": 0.0065,
"step": 4120
},
{
"epoch": 103.25,
"grad_norm": 0.1342255175113678,
"learning_rate": 8.258e-05,
"loss": 0.0063,
"step": 4130
},
{
"epoch": 103.5,
"grad_norm": 0.1357700079679489,
"learning_rate": 8.278e-05,
"loss": 0.006,
"step": 4140
},
{
"epoch": 103.75,
"grad_norm": 0.1375548541545868,
"learning_rate": 8.298000000000001e-05,
"loss": 0.0073,
"step": 4150
},
{
"epoch": 104.0,
"grad_norm": 0.12512585520744324,
"learning_rate": 8.318e-05,
"loss": 0.0065,
"step": 4160
},
{
"epoch": 104.25,
"grad_norm": 0.1292533129453659,
"learning_rate": 8.338e-05,
"loss": 0.0065,
"step": 4170
},
{
"epoch": 104.5,
"grad_norm": 0.12111157923936844,
"learning_rate": 8.358e-05,
"loss": 0.0071,
"step": 4180
},
{
"epoch": 104.75,
"grad_norm": 0.17220772802829742,
"learning_rate": 8.378e-05,
"loss": 0.0065,
"step": 4190
},
{
"epoch": 105.0,
"grad_norm": 0.14518342912197113,
"learning_rate": 8.398e-05,
"loss": 0.0067,
"step": 4200
},
{
"epoch": 105.25,
"grad_norm": 0.16509418189525604,
"learning_rate": 8.418e-05,
"loss": 0.0072,
"step": 4210
},
{
"epoch": 105.5,
"grad_norm": 0.13074707984924316,
"learning_rate": 8.438e-05,
"loss": 0.0063,
"step": 4220
},
{
"epoch": 105.75,
"grad_norm": 0.1503017544746399,
"learning_rate": 8.458e-05,
"loss": 0.0065,
"step": 4230
},
{
"epoch": 106.0,
"grad_norm": 0.17117547988891602,
"learning_rate": 8.478e-05,
"loss": 0.0058,
"step": 4240
},
{
"epoch": 106.25,
"grad_norm": 0.28769177198410034,
"learning_rate": 8.498e-05,
"loss": 0.0071,
"step": 4250
},
{
"epoch": 106.5,
"grad_norm": 0.18290570378303528,
"learning_rate": 8.518000000000001e-05,
"loss": 0.0069,
"step": 4260
},
{
"epoch": 106.75,
"grad_norm": 0.2097172886133194,
"learning_rate": 8.538e-05,
"loss": 0.0065,
"step": 4270
},
{
"epoch": 107.0,
"grad_norm": 0.23110030591487885,
"learning_rate": 8.558e-05,
"loss": 0.0066,
"step": 4280
},
{
"epoch": 107.25,
"grad_norm": 0.21445296704769135,
"learning_rate": 8.578000000000001e-05,
"loss": 0.0071,
"step": 4290
},
{
"epoch": 107.5,
"grad_norm": 0.15120886266231537,
"learning_rate": 8.598e-05,
"loss": 0.0067,
"step": 4300
},
{
"epoch": 107.75,
"grad_norm": 0.21303877234458923,
"learning_rate": 8.618e-05,
"loss": 0.0068,
"step": 4310
},
{
"epoch": 108.0,
"grad_norm": 0.1451047956943512,
"learning_rate": 8.638000000000001e-05,
"loss": 0.0061,
"step": 4320
},
{
"epoch": 108.25,
"grad_norm": 0.14858797192573547,
"learning_rate": 8.658e-05,
"loss": 0.0065,
"step": 4330
},
{
"epoch": 108.5,
"grad_norm": 0.12267820537090302,
"learning_rate": 8.678e-05,
"loss": 0.0058,
"step": 4340
},
{
"epoch": 108.75,
"grad_norm": 0.14928346872329712,
"learning_rate": 8.698000000000001e-05,
"loss": 0.0073,
"step": 4350
},
{
"epoch": 109.0,
"grad_norm": 0.1512640118598938,
"learning_rate": 8.718e-05,
"loss": 0.0055,
"step": 4360
},
{
"epoch": 109.25,
"grad_norm": 0.13059866428375244,
"learning_rate": 8.738000000000001e-05,
"loss": 0.0062,
"step": 4370
},
{
"epoch": 109.5,
"grad_norm": 0.12359509617090225,
"learning_rate": 8.758000000000001e-05,
"loss": 0.0063,
"step": 4380
},
{
"epoch": 109.75,
"grad_norm": 0.146541029214859,
"learning_rate": 8.778e-05,
"loss": 0.0069,
"step": 4390
},
{
"epoch": 110.0,
"grad_norm": 0.1751328557729721,
"learning_rate": 8.798000000000001e-05,
"loss": 0.0072,
"step": 4400
},
{
"epoch": 110.25,
"grad_norm": 0.2347930371761322,
"learning_rate": 8.818000000000001e-05,
"loss": 0.0065,
"step": 4410
},
{
"epoch": 110.5,
"grad_norm": 0.18729887902736664,
"learning_rate": 8.838e-05,
"loss": 0.0064,
"step": 4420
},
{
"epoch": 110.75,
"grad_norm": 0.11705614626407623,
"learning_rate": 8.858000000000001e-05,
"loss": 0.0063,
"step": 4430
},
{
"epoch": 111.0,
"grad_norm": 0.1382816731929779,
"learning_rate": 8.878000000000001e-05,
"loss": 0.0057,
"step": 4440
},
{
"epoch": 111.25,
"grad_norm": 0.15040716528892517,
"learning_rate": 8.898e-05,
"loss": 0.0059,
"step": 4450
},
{
"epoch": 111.5,
"grad_norm": 0.15252092480659485,
"learning_rate": 8.918000000000001e-05,
"loss": 0.0065,
"step": 4460
},
{
"epoch": 111.75,
"grad_norm": 0.15340879559516907,
"learning_rate": 8.938e-05,
"loss": 0.0063,
"step": 4470
},
{
"epoch": 112.0,
"grad_norm": 0.13012273609638214,
"learning_rate": 8.958e-05,
"loss": 0.0064,
"step": 4480
},
{
"epoch": 112.25,
"grad_norm": 0.14281576871871948,
"learning_rate": 8.978000000000001e-05,
"loss": 0.0058,
"step": 4490
},
{
"epoch": 112.5,
"grad_norm": 0.14264865219593048,
"learning_rate": 8.998e-05,
"loss": 0.0062,
"step": 4500
},
{
"epoch": 112.75,
"grad_norm": 0.1958669275045395,
"learning_rate": 9.018000000000001e-05,
"loss": 0.0066,
"step": 4510
},
{
"epoch": 113.0,
"grad_norm": 0.10879232734441757,
"learning_rate": 9.038000000000001e-05,
"loss": 0.0065,
"step": 4520
},
{
"epoch": 113.25,
"grad_norm": 0.2231931984424591,
"learning_rate": 9.058e-05,
"loss": 0.0062,
"step": 4530
},
{
"epoch": 113.5,
"grad_norm": 0.16552019119262695,
"learning_rate": 9.078000000000001e-05,
"loss": 0.0066,
"step": 4540
},
{
"epoch": 113.75,
"grad_norm": 0.15375559031963348,
"learning_rate": 9.098000000000001e-05,
"loss": 0.0072,
"step": 4550
},
{
"epoch": 114.0,
"grad_norm": 0.20676645636558533,
"learning_rate": 9.118e-05,
"loss": 0.0072,
"step": 4560
},
{
"epoch": 114.25,
"grad_norm": 0.15286394953727722,
"learning_rate": 9.138e-05,
"loss": 0.0069,
"step": 4570
},
{
"epoch": 114.5,
"grad_norm": 0.15226341784000397,
"learning_rate": 9.158e-05,
"loss": 0.0066,
"step": 4580
},
{
"epoch": 114.75,
"grad_norm": 0.12615948915481567,
"learning_rate": 9.178e-05,
"loss": 0.0066,
"step": 4590
},
{
"epoch": 115.0,
"grad_norm": 0.1715383529663086,
"learning_rate": 9.198e-05,
"loss": 0.0072,
"step": 4600
},
{
"epoch": 115.25,
"grad_norm": 0.13157600164413452,
"learning_rate": 9.218e-05,
"loss": 0.0071,
"step": 4610
},
{
"epoch": 115.5,
"grad_norm": 0.16462917625904083,
"learning_rate": 9.238e-05,
"loss": 0.0067,
"step": 4620
},
{
"epoch": 115.75,
"grad_norm": 0.2137746512889862,
"learning_rate": 9.258e-05,
"loss": 0.0072,
"step": 4630
},
{
"epoch": 116.0,
"grad_norm": 0.14793118834495544,
"learning_rate": 9.278e-05,
"loss": 0.0067,
"step": 4640
},
{
"epoch": 116.25,
"grad_norm": 0.1574595421552658,
"learning_rate": 9.298e-05,
"loss": 0.007,
"step": 4650
},
{
"epoch": 116.5,
"grad_norm": 0.12151467055082321,
"learning_rate": 9.318e-05,
"loss": 0.0065,
"step": 4660
},
{
"epoch": 116.75,
"grad_norm": 0.18424446880817413,
"learning_rate": 9.338e-05,
"loss": 0.0056,
"step": 4670
},
{
"epoch": 117.0,
"grad_norm": 0.181967630982399,
"learning_rate": 9.358e-05,
"loss": 0.0064,
"step": 4680
},
{
"epoch": 117.25,
"grad_norm": 0.15449753403663635,
"learning_rate": 9.378e-05,
"loss": 0.0062,
"step": 4690
},
{
"epoch": 117.5,
"grad_norm": 0.17288358509540558,
"learning_rate": 9.398e-05,
"loss": 0.0065,
"step": 4700
},
{
"epoch": 117.75,
"grad_norm": 0.22410866618156433,
"learning_rate": 9.418e-05,
"loss": 0.0065,
"step": 4710
},
{
"epoch": 118.0,
"grad_norm": 0.1840396225452423,
"learning_rate": 9.438e-05,
"loss": 0.0063,
"step": 4720
},
{
"epoch": 118.25,
"grad_norm": 0.1368318498134613,
"learning_rate": 9.458e-05,
"loss": 0.0062,
"step": 4730
},
{
"epoch": 118.5,
"grad_norm": 0.19932417571544647,
"learning_rate": 9.478e-05,
"loss": 0.006,
"step": 4740
},
{
"epoch": 118.75,
"grad_norm": 0.13300760090351105,
"learning_rate": 9.498e-05,
"loss": 0.0066,
"step": 4750
},
{
"epoch": 119.0,
"grad_norm": 0.17009101808071136,
"learning_rate": 9.518000000000001e-05,
"loss": 0.0063,
"step": 4760
},
{
"epoch": 119.25,
"grad_norm": 0.14035862684249878,
"learning_rate": 9.538e-05,
"loss": 0.0073,
"step": 4770
},
{
"epoch": 119.5,
"grad_norm": 0.15153725445270538,
"learning_rate": 9.558e-05,
"loss": 0.0076,
"step": 4780
},
{
"epoch": 119.75,
"grad_norm": 0.16562645137310028,
"learning_rate": 9.578000000000001e-05,
"loss": 0.007,
"step": 4790
},
{
"epoch": 120.0,
"grad_norm": 0.19395475089550018,
"learning_rate": 9.598e-05,
"loss": 0.0067,
"step": 4800
},
{
"epoch": 120.25,
"grad_norm": 0.19338366389274597,
"learning_rate": 9.618e-05,
"loss": 0.0063,
"step": 4810
},
{
"epoch": 120.5,
"grad_norm": 0.18101127445697784,
"learning_rate": 9.638000000000001e-05,
"loss": 0.0071,
"step": 4820
},
{
"epoch": 120.75,
"grad_norm": 0.15528017282485962,
"learning_rate": 9.658e-05,
"loss": 0.0068,
"step": 4830
},
{
"epoch": 121.0,
"grad_norm": 0.15108326077461243,
"learning_rate": 9.678e-05,
"loss": 0.0058,
"step": 4840
},
{
"epoch": 121.25,
"grad_norm": 0.14880536496639252,
"learning_rate": 9.698000000000001e-05,
"loss": 0.0076,
"step": 4850
},
{
"epoch": 121.5,
"grad_norm": 0.18922747671604156,
"learning_rate": 9.718e-05,
"loss": 0.0071,
"step": 4860
},
{
"epoch": 121.75,
"grad_norm": 0.15255926549434662,
"learning_rate": 9.738e-05,
"loss": 0.0055,
"step": 4870
},
{
"epoch": 122.0,
"grad_norm": 0.16512243449687958,
"learning_rate": 9.758000000000001e-05,
"loss": 0.006,
"step": 4880
},
{
"epoch": 122.25,
"grad_norm": 0.1157233789563179,
"learning_rate": 9.778e-05,
"loss": 0.006,
"step": 4890
},
{
"epoch": 122.5,
"grad_norm": 0.15160076320171356,
"learning_rate": 9.798000000000001e-05,
"loss": 0.0059,
"step": 4900
},
{
"epoch": 122.75,
"grad_norm": 0.13103358447551727,
"learning_rate": 9.818000000000001e-05,
"loss": 0.0058,
"step": 4910
},
{
"epoch": 123.0,
"grad_norm": 0.11951006203889847,
"learning_rate": 9.838e-05,
"loss": 0.0071,
"step": 4920
},
{
"epoch": 123.25,
"grad_norm": 0.15750989317893982,
"learning_rate": 9.858000000000001e-05,
"loss": 0.0053,
"step": 4930
},
{
"epoch": 123.5,
"grad_norm": 0.13397419452667236,
"learning_rate": 9.878e-05,
"loss": 0.0066,
"step": 4940
},
{
"epoch": 123.75,
"grad_norm": 0.15722912549972534,
"learning_rate": 9.898e-05,
"loss": 0.006,
"step": 4950
},
{
"epoch": 124.0,
"grad_norm": 0.1683134287595749,
"learning_rate": 9.918000000000001e-05,
"loss": 0.0061,
"step": 4960
},
{
"epoch": 124.25,
"grad_norm": 0.19436003267765045,
"learning_rate": 9.938e-05,
"loss": 0.0061,
"step": 4970
},
{
"epoch": 124.5,
"grad_norm": 0.17469929158687592,
"learning_rate": 9.958e-05,
"loss": 0.0065,
"step": 4980
},
{
"epoch": 124.75,
"grad_norm": 0.19975730776786804,
"learning_rate": 9.978000000000001e-05,
"loss": 0.0075,
"step": 4990
},
{
"epoch": 125.0,
"grad_norm": 0.12145998328924179,
"learning_rate": 9.998e-05,
"loss": 0.0057,
"step": 5000
},
{
"epoch": 125.25,
"grad_norm": 0.12310753762722015,
"learning_rate": 9.999999778549045e-05,
"loss": 0.0057,
"step": 5010
},
{
"epoch": 125.5,
"grad_norm": 0.17047083377838135,
"learning_rate": 9.999999013039593e-05,
"loss": 0.0058,
"step": 5020
},
{
"epoch": 125.75,
"grad_norm": 0.1985018253326416,
"learning_rate": 9.999997700737766e-05,
"loss": 0.0061,
"step": 5030
},
{
"epoch": 126.0,
"grad_norm": 0.12021245807409286,
"learning_rate": 9.999995841643709e-05,
"loss": 0.0064,
"step": 5040
},
{
"epoch": 126.25,
"grad_norm": 0.08459141105413437,
"learning_rate": 9.999993435757623e-05,
"loss": 0.0061,
"step": 5050
},
{
"epoch": 126.5,
"grad_norm": 0.12869714200496674,
"learning_rate": 9.999990483079773e-05,
"loss": 0.0065,
"step": 5060
},
{
"epoch": 126.75,
"grad_norm": 0.10929016023874283,
"learning_rate": 9.999986983610481e-05,
"loss": 0.0059,
"step": 5070
},
{
"epoch": 127.0,
"grad_norm": 0.14434878528118134,
"learning_rate": 9.99998293735013e-05,
"loss": 0.0061,
"step": 5080
},
{
"epoch": 127.25,
"grad_norm": 0.17825250327587128,
"learning_rate": 9.999978344299161e-05,
"loss": 0.0061,
"step": 5090
},
{
"epoch": 127.5,
"grad_norm": 0.1418701559305191,
"learning_rate": 9.99997320445808e-05,
"loss": 0.0068,
"step": 5100
},
{
"epoch": 127.75,
"grad_norm": 0.13784301280975342,
"learning_rate": 9.999967517827444e-05,
"loss": 0.0058,
"step": 5110
},
{
"epoch": 128.0,
"grad_norm": 0.1485631763935089,
"learning_rate": 9.999961284407879e-05,
"loss": 0.0066,
"step": 5120
},
{
"epoch": 128.25,
"grad_norm": 0.13838165998458862,
"learning_rate": 9.999954504200067e-05,
"loss": 0.0053,
"step": 5130
},
{
"epoch": 128.5,
"grad_norm": 0.098316490650177,
"learning_rate": 9.999947177204744e-05,
"loss": 0.0062,
"step": 5140
},
{
"epoch": 128.75,
"grad_norm": 0.15371425449848175,
"learning_rate": 9.999939303422718e-05,
"loss": 0.0061,
"step": 5150
},
{
"epoch": 129.0,
"grad_norm": 0.12473967671394348,
"learning_rate": 9.999930882854847e-05,
"loss": 0.0058,
"step": 5160
},
{
"epoch": 129.25,
"grad_norm": 0.12960635125637054,
"learning_rate": 9.999921915502051e-05,
"loss": 0.0057,
"step": 5170
},
{
"epoch": 129.5,
"grad_norm": 0.14831651747226715,
"learning_rate": 9.99991240136531e-05,
"loss": 0.0073,
"step": 5180
},
{
"epoch": 129.75,
"grad_norm": 0.11215781420469284,
"learning_rate": 9.999902340445668e-05,
"loss": 0.0057,
"step": 5190
},
{
"epoch": 130.0,
"grad_norm": 0.10951609164476395,
"learning_rate": 9.999891732744224e-05,
"loss": 0.0062,
"step": 5200
},
{
"epoch": 130.25,
"grad_norm": 0.12410330027341843,
"learning_rate": 9.999880578262135e-05,
"loss": 0.0063,
"step": 5210
},
{
"epoch": 130.5,
"grad_norm": 0.16753678023815155,
"learning_rate": 9.999868877000624e-05,
"loss": 0.0065,
"step": 5220
},
{
"epoch": 130.75,
"grad_norm": 0.1503169685602188,
"learning_rate": 9.99985662896097e-05,
"loss": 0.0068,
"step": 5230
},
{
"epoch": 131.0,
"grad_norm": 0.17394974827766418,
"learning_rate": 9.999843834144513e-05,
"loss": 0.0069,
"step": 5240
},
{
"epoch": 131.25,
"grad_norm": 0.1436404585838318,
"learning_rate": 9.99983049255265e-05,
"loss": 0.0066,
"step": 5250
},
{
"epoch": 131.5,
"grad_norm": 0.13780523836612701,
"learning_rate": 9.999816604186843e-05,
"loss": 0.0066,
"step": 5260
},
{
"epoch": 131.75,
"grad_norm": 0.11955001950263977,
"learning_rate": 9.999802169048609e-05,
"loss": 0.0058,
"step": 5270
},
{
"epoch": 132.0,
"grad_norm": 0.11023246496915817,
"learning_rate": 9.999787187139527e-05,
"loss": 0.0054,
"step": 5280
},
{
"epoch": 132.25,
"grad_norm": 0.13991482555866241,
"learning_rate": 9.999771658461234e-05,
"loss": 0.0059,
"step": 5290
},
{
"epoch": 132.5,
"grad_norm": 0.11659090220928192,
"learning_rate": 9.999755583015431e-05,
"loss": 0.0055,
"step": 5300
},
{
"epoch": 132.75,
"grad_norm": 0.1481342911720276,
"learning_rate": 9.999738960803874e-05,
"loss": 0.0059,
"step": 5310
},
{
"epoch": 133.0,
"grad_norm": 0.12136317044496536,
"learning_rate": 9.99972179182838e-05,
"loss": 0.0047,
"step": 5320
},
{
"epoch": 133.25,
"grad_norm": 0.12952932715415955,
"learning_rate": 9.99970407609083e-05,
"loss": 0.0059,
"step": 5330
},
{
"epoch": 133.5,
"grad_norm": 0.14714136719703674,
"learning_rate": 9.999685813593159e-05,
"loss": 0.006,
"step": 5340
},
{
"epoch": 133.75,
"grad_norm": 0.15123462677001953,
"learning_rate": 9.999667004337362e-05,
"loss": 0.0051,
"step": 5350
},
{
"epoch": 134.0,
"grad_norm": 0.17769788205623627,
"learning_rate": 9.9996476483255e-05,
"loss": 0.0059,
"step": 5360
},
{
"epoch": 134.25,
"grad_norm": 0.15275105834007263,
"learning_rate": 9.999627745559688e-05,
"loss": 0.0055,
"step": 5370
},
{
"epoch": 134.5,
"grad_norm": 0.16935890913009644,
"learning_rate": 9.999607296042101e-05,
"loss": 0.0059,
"step": 5380
},
{
"epoch": 134.75,
"grad_norm": 0.14174121618270874,
"learning_rate": 9.99958629977498e-05,
"loss": 0.0053,
"step": 5390
},
{
"epoch": 135.0,
"grad_norm": 0.1606847494840622,
"learning_rate": 9.999564756760615e-05,
"loss": 0.0055,
"step": 5400
},
{
"epoch": 135.25,
"grad_norm": 0.15848489105701447,
"learning_rate": 9.999542667001366e-05,
"loss": 0.0056,
"step": 5410
},
{
"epoch": 135.5,
"grad_norm": 0.1053905189037323,
"learning_rate": 9.999520030499647e-05,
"loss": 0.0052,
"step": 5420
},
{
"epoch": 135.75,
"grad_norm": 0.16355837881565094,
"learning_rate": 9.999496847257936e-05,
"loss": 0.0055,
"step": 5430
},
{
"epoch": 136.0,
"grad_norm": 0.18699996173381805,
"learning_rate": 9.999473117278764e-05,
"loss": 0.0059,
"step": 5440
},
{
"epoch": 136.25,
"grad_norm": 0.16889815032482147,
"learning_rate": 9.999448840564731e-05,
"loss": 0.0054,
"step": 5450
},
{
"epoch": 136.5,
"grad_norm": 0.15516693890094757,
"learning_rate": 9.999424017118488e-05,
"loss": 0.0057,
"step": 5460
},
{
"epoch": 136.75,
"grad_norm": 0.14914794266223907,
"learning_rate": 9.999398646942751e-05,
"loss": 0.0061,
"step": 5470
},
{
"epoch": 137.0,
"grad_norm": 0.17972686886787415,
"learning_rate": 9.999372730040296e-05,
"loss": 0.0054,
"step": 5480
},
{
"epoch": 137.25,
"grad_norm": 0.1406720131635666,
"learning_rate": 9.999346266413953e-05,
"loss": 0.0059,
"step": 5490
},
{
"epoch": 137.5,
"grad_norm": 0.1266246736049652,
"learning_rate": 9.99931925606662e-05,
"loss": 0.0061,
"step": 5500
},
{
"epoch": 137.75,
"grad_norm": 0.1418047994375229,
"learning_rate": 9.99929169900125e-05,
"loss": 0.0056,
"step": 5510
},
{
"epoch": 138.0,
"grad_norm": 0.1325012743473053,
"learning_rate": 9.999263595220855e-05,
"loss": 0.0051,
"step": 5520
},
{
"epoch": 138.25,
"grad_norm": 0.11973172426223755,
"learning_rate": 9.99923494472851e-05,
"loss": 0.0051,
"step": 5530
},
{
"epoch": 138.5,
"grad_norm": 0.1425570398569107,
"learning_rate": 9.999205747527348e-05,
"loss": 0.0063,
"step": 5540
},
{
"epoch": 138.75,
"grad_norm": 0.14493736624717712,
"learning_rate": 9.999176003620561e-05,
"loss": 0.0055,
"step": 5550
},
{
"epoch": 139.0,
"grad_norm": 0.10896685719490051,
"learning_rate": 9.999145713011405e-05,
"loss": 0.0056,
"step": 5560
},
{
"epoch": 139.25,
"grad_norm": 0.126507967710495,
"learning_rate": 9.999114875703186e-05,
"loss": 0.0052,
"step": 5570
},
{
"epoch": 139.5,
"grad_norm": 0.09839679300785065,
"learning_rate": 9.999083491699281e-05,
"loss": 0.0057,
"step": 5580
},
{
"epoch": 139.75,
"grad_norm": 0.10790673643350601,
"learning_rate": 9.999051561003123e-05,
"loss": 0.0057,
"step": 5590
},
{
"epoch": 140.0,
"grad_norm": 0.11031424254179001,
"learning_rate": 9.999019083618202e-05,
"loss": 0.0057,
"step": 5600
},
{
"epoch": 140.25,
"grad_norm": 0.12621337175369263,
"learning_rate": 9.99898605954807e-05,
"loss": 0.0062,
"step": 5610
},
{
"epoch": 140.5,
"grad_norm": 0.09760873019695282,
"learning_rate": 9.998952488796338e-05,
"loss": 0.0057,
"step": 5620
},
{
"epoch": 140.75,
"grad_norm": 0.10049641132354736,
"learning_rate": 9.998918371366676e-05,
"loss": 0.0055,
"step": 5630
},
{
"epoch": 141.0,
"grad_norm": 0.11897911131381989,
"learning_rate": 9.99888370726282e-05,
"loss": 0.0052,
"step": 5640
},
{
"epoch": 141.25,
"grad_norm": 0.12692591547966003,
"learning_rate": 9.998848496488556e-05,
"loss": 0.0055,
"step": 5650
},
{
"epoch": 141.5,
"grad_norm": 0.09676604717969894,
"learning_rate": 9.998812739047736e-05,
"loss": 0.0061,
"step": 5660
},
{
"epoch": 141.75,
"grad_norm": 0.10841045528650284,
"learning_rate": 9.99877643494427e-05,
"loss": 0.0062,
"step": 5670
},
{
"epoch": 142.0,
"grad_norm": 0.13137775659561157,
"learning_rate": 9.998739584182128e-05,
"loss": 0.0065,
"step": 5680
},
{
"epoch": 142.25,
"grad_norm": 0.11271249502897263,
"learning_rate": 9.998702186765342e-05,
"loss": 0.0055,
"step": 5690
},
{
"epoch": 142.5,
"grad_norm": 0.12125517427921295,
"learning_rate": 9.998664242698e-05,
"loss": 0.0055,
"step": 5700
},
{
"epoch": 142.75,
"grad_norm": 0.15787778794765472,
"learning_rate": 9.998625751984251e-05,
"loss": 0.0048,
"step": 5710
},
{
"epoch": 143.0,
"grad_norm": 0.1311769336462021,
"learning_rate": 9.998586714628307e-05,
"loss": 0.0056,
"step": 5720
},
{
"epoch": 143.25,
"grad_norm": 0.1340465098619461,
"learning_rate": 9.998547130634432e-05,
"loss": 0.006,
"step": 5730
},
{
"epoch": 143.5,
"grad_norm": 0.11795949935913086,
"learning_rate": 9.99850700000696e-05,
"loss": 0.0057,
"step": 5740
},
{
"epoch": 143.75,
"grad_norm": 0.16333207488059998,
"learning_rate": 9.998466322750278e-05,
"loss": 0.0056,
"step": 5750
},
{
"epoch": 144.0,
"grad_norm": 0.19970041513442993,
"learning_rate": 9.998425098868834e-05,
"loss": 0.0057,
"step": 5760
},
{
"epoch": 144.25,
"grad_norm": 0.22557619214057922,
"learning_rate": 9.998383328367136e-05,
"loss": 0.0059,
"step": 5770
},
{
"epoch": 144.5,
"grad_norm": 0.22150106728076935,
"learning_rate": 9.99834101124975e-05,
"loss": 0.0062,
"step": 5780
},
{
"epoch": 144.75,
"grad_norm": 0.20753586292266846,
"learning_rate": 9.998298147521309e-05,
"loss": 0.0065,
"step": 5790
},
{
"epoch": 145.0,
"grad_norm": 0.17864994704723358,
"learning_rate": 9.998254737186496e-05,
"loss": 0.0057,
"step": 5800
},
{
"epoch": 145.25,
"grad_norm": 0.17263974249362946,
"learning_rate": 9.99821078025006e-05,
"loss": 0.0058,
"step": 5810
},
{
"epoch": 145.5,
"grad_norm": 0.16186940670013428,
"learning_rate": 9.998166276716807e-05,
"loss": 0.0063,
"step": 5820
},
{
"epoch": 145.75,
"grad_norm": 0.1366291344165802,
"learning_rate": 9.998121226591606e-05,
"loss": 0.0057,
"step": 5830
},
{
"epoch": 146.0,
"grad_norm": 0.14409442245960236,
"learning_rate": 9.998075629879382e-05,
"loss": 0.0057,
"step": 5840
},
{
"epoch": 146.25,
"grad_norm": 0.1769361048936844,
"learning_rate": 9.99802948658512e-05,
"loss": 0.0056,
"step": 5850
},
{
"epoch": 146.5,
"grad_norm": 0.178902268409729,
"learning_rate": 9.99798279671387e-05,
"loss": 0.0065,
"step": 5860
},
{
"epoch": 146.75,
"grad_norm": 0.18864601850509644,
"learning_rate": 9.997935560270734e-05,
"loss": 0.0066,
"step": 5870
},
{
"epoch": 147.0,
"grad_norm": 0.10840289294719696,
"learning_rate": 9.997887777260879e-05,
"loss": 0.0055,
"step": 5880
},
{
"epoch": 147.25,
"grad_norm": 0.1299472600221634,
"learning_rate": 9.997839447689532e-05,
"loss": 0.0062,
"step": 5890
},
{
"epoch": 147.5,
"grad_norm": 0.09526827186346054,
"learning_rate": 9.997790571561978e-05,
"loss": 0.0057,
"step": 5900
},
{
"epoch": 147.75,
"grad_norm": 0.104829341173172,
"learning_rate": 9.99774114888356e-05,
"loss": 0.0052,
"step": 5910
},
{
"epoch": 148.0,
"grad_norm": 0.11738763749599457,
"learning_rate": 9.997691179659684e-05,
"loss": 0.0056,
"step": 5920
},
{
"epoch": 148.25,
"grad_norm": 0.09024183452129364,
"learning_rate": 9.997640663895815e-05,
"loss": 0.0052,
"step": 5930
},
{
"epoch": 148.5,
"grad_norm": 0.10453230142593384,
"learning_rate": 9.997589601597477e-05,
"loss": 0.0062,
"step": 5940
},
{
"epoch": 148.75,
"grad_norm": 0.11493031680583954,
"learning_rate": 9.997537992770252e-05,
"loss": 0.0048,
"step": 5950
},
{
"epoch": 149.0,
"grad_norm": 0.1389797329902649,
"learning_rate": 9.997485837419788e-05,
"loss": 0.0055,
"step": 5960
},
{
"epoch": 149.25,
"grad_norm": 0.12700024247169495,
"learning_rate": 9.997433135551786e-05,
"loss": 0.0054,
"step": 5970
},
{
"epoch": 149.5,
"grad_norm": 0.13564901053905487,
"learning_rate": 9.997379887172009e-05,
"loss": 0.0047,
"step": 5980
},
{
"epoch": 149.75,
"grad_norm": 0.12888741493225098,
"learning_rate": 9.997326092286281e-05,
"loss": 0.0055,
"step": 5990
},
{
"epoch": 150.0,
"grad_norm": 0.10119718313217163,
"learning_rate": 9.997271750900486e-05,
"loss": 0.0056,
"step": 6000
},
{
"epoch": 150.25,
"grad_norm": 0.12381160259246826,
"learning_rate": 9.997216863020565e-05,
"loss": 0.0051,
"step": 6010
},
{
"epoch": 150.5,
"grad_norm": 0.12457701563835144,
"learning_rate": 9.99716142865252e-05,
"loss": 0.0052,
"step": 6020
},
{
"epoch": 150.75,
"grad_norm": 0.1514553427696228,
"learning_rate": 9.997105447802415e-05,
"loss": 0.0054,
"step": 6030
},
{
"epoch": 151.0,
"grad_norm": 0.1348123848438263,
"learning_rate": 9.997048920476373e-05,
"loss": 0.0054,
"step": 6040
},
{
"epoch": 151.25,
"grad_norm": 0.14996957778930664,
"learning_rate": 9.996991846680572e-05,
"loss": 0.0057,
"step": 6050
},
{
"epoch": 151.5,
"grad_norm": 0.09939752519130707,
"learning_rate": 9.996934226421257e-05,
"loss": 0.0052,
"step": 6060
},
{
"epoch": 151.75,
"grad_norm": 0.10740762203931808,
"learning_rate": 9.996876059704726e-05,
"loss": 0.005,
"step": 6070
},
{
"epoch": 152.0,
"grad_norm": 0.13272447884082794,
"learning_rate": 9.996817346537343e-05,
"loss": 0.0063,
"step": 6080
},
{
"epoch": 152.25,
"grad_norm": 0.10130985081195831,
"learning_rate": 9.996758086925526e-05,
"loss": 0.0056,
"step": 6090
},
{
"epoch": 152.5,
"grad_norm": 0.13598772883415222,
"learning_rate": 9.996698280875759e-05,
"loss": 0.0053,
"step": 6100
},
{
"epoch": 152.75,
"grad_norm": 0.1303255409002304,
"learning_rate": 9.99663792839458e-05,
"loss": 0.0058,
"step": 6110
},
{
"epoch": 153.0,
"grad_norm": 0.15227794647216797,
"learning_rate": 9.99657702948859e-05,
"loss": 0.0056,
"step": 6120
},
{
"epoch": 153.25,
"grad_norm": 0.12363743036985397,
"learning_rate": 9.996515584164448e-05,
"loss": 0.0053,
"step": 6130
},
{
"epoch": 153.5,
"grad_norm": 0.16186046600341797,
"learning_rate": 9.996453592428873e-05,
"loss": 0.005,
"step": 6140
},
{
"epoch": 153.75,
"grad_norm": 0.11398918926715851,
"learning_rate": 9.996391054288646e-05,
"loss": 0.0051,
"step": 6150
},
{
"epoch": 154.0,
"grad_norm": 0.14045920968055725,
"learning_rate": 9.996327969750605e-05,
"loss": 0.0051,
"step": 6160
},
{
"epoch": 154.25,
"grad_norm": 0.14219066500663757,
"learning_rate": 9.996264338821649e-05,
"loss": 0.0045,
"step": 6170
},
{
"epoch": 154.5,
"grad_norm": 0.1310894936323166,
"learning_rate": 9.996200161508735e-05,
"loss": 0.0052,
"step": 6180
},
{
"epoch": 154.75,
"grad_norm": 0.10034388303756714,
"learning_rate": 9.996135437818885e-05,
"loss": 0.0048,
"step": 6190
},
{
"epoch": 155.0,
"grad_norm": 0.15445446968078613,
"learning_rate": 9.996070167759175e-05,
"loss": 0.0056,
"step": 6200
},
{
"epoch": 155.25,
"grad_norm": 0.11739563941955566,
"learning_rate": 9.996004351336743e-05,
"loss": 0.0055,
"step": 6210
},
{
"epoch": 155.5,
"grad_norm": 0.15179182589054108,
"learning_rate": 9.995937988558785e-05,
"loss": 0.0058,
"step": 6220
},
{
"epoch": 155.75,
"grad_norm": 0.14104419946670532,
"learning_rate": 9.995871079432561e-05,
"loss": 0.0054,
"step": 6230
},
{
"epoch": 156.0,
"grad_norm": 0.1537674367427826,
"learning_rate": 9.995803623965389e-05,
"loss": 0.0056,
"step": 6240
},
{
"epoch": 156.25,
"grad_norm": 0.1653551310300827,
"learning_rate": 9.995735622164641e-05,
"loss": 0.0057,
"step": 6250
},
{
"epoch": 156.5,
"grad_norm": 0.1293126344680786,
"learning_rate": 9.995667074037758e-05,
"loss": 0.0054,
"step": 6260
},
{
"epoch": 156.75,
"grad_norm": 0.1530045121908188,
"learning_rate": 9.995597979592232e-05,
"loss": 0.006,
"step": 6270
},
{
"epoch": 157.0,
"grad_norm": 0.14181704819202423,
"learning_rate": 9.995528338835625e-05,
"loss": 0.0052,
"step": 6280
},
{
"epoch": 157.25,
"grad_norm": 0.16457095742225647,
"learning_rate": 9.995458151775547e-05,
"loss": 0.0055,
"step": 6290
},
{
"epoch": 157.5,
"grad_norm": 0.16033579409122467,
"learning_rate": 9.995387418419677e-05,
"loss": 0.0051,
"step": 6300
},
{
"epoch": 157.75,
"grad_norm": 0.11192826926708221,
"learning_rate": 9.99531613877575e-05,
"loss": 0.0055,
"step": 6310
},
{
"epoch": 158.0,
"grad_norm": 0.15627609193325043,
"learning_rate": 9.995244312851559e-05,
"loss": 0.0055,
"step": 6320
},
{
"epoch": 158.25,
"grad_norm": 0.14025883376598358,
"learning_rate": 9.995171940654961e-05,
"loss": 0.0051,
"step": 6330
},
{
"epoch": 158.5,
"grad_norm": 0.09506193548440933,
"learning_rate": 9.995099022193871e-05,
"loss": 0.0054,
"step": 6340
},
{
"epoch": 158.75,
"grad_norm": 0.12658117711544037,
"learning_rate": 9.995025557476261e-05,
"loss": 0.0051,
"step": 6350
},
{
"epoch": 159.0,
"grad_norm": 0.12233058363199234,
"learning_rate": 9.994951546510165e-05,
"loss": 0.0055,
"step": 6360
},
{
"epoch": 159.25,
"grad_norm": 0.14143706858158112,
"learning_rate": 9.994876989303679e-05,
"loss": 0.0062,
"step": 6370
},
{
"epoch": 159.5,
"grad_norm": 0.15059718489646912,
"learning_rate": 9.994801885864955e-05,
"loss": 0.0058,
"step": 6380
},
{
"epoch": 159.75,
"grad_norm": 0.13234072923660278,
"learning_rate": 9.994726236202205e-05,
"loss": 0.0062,
"step": 6390
},
{
"epoch": 160.0,
"grad_norm": 0.17327646911144257,
"learning_rate": 9.994650040323704e-05,
"loss": 0.0059,
"step": 6400
},
{
"epoch": 160.25,
"grad_norm": 0.13921305537223816,
"learning_rate": 9.994573298237784e-05,
"loss": 0.0049,
"step": 6410
},
{
"epoch": 160.5,
"grad_norm": 0.16460886597633362,
"learning_rate": 9.994496009952837e-05,
"loss": 0.0049,
"step": 6420
},
{
"epoch": 160.75,
"grad_norm": 0.15839236974716187,
"learning_rate": 9.994418175477316e-05,
"loss": 0.0056,
"step": 6430
},
{
"epoch": 161.0,
"grad_norm": 0.1224624365568161,
"learning_rate": 9.994339794819733e-05,
"loss": 0.0048,
"step": 6440
},
{
"epoch": 161.25,
"grad_norm": 0.13407361507415771,
"learning_rate": 9.994260867988658e-05,
"loss": 0.0055,
"step": 6450
},
{
"epoch": 161.5,
"grad_norm": 0.14540457725524902,
"learning_rate": 9.994181394992723e-05,
"loss": 0.0046,
"step": 6460
},
{
"epoch": 161.75,
"grad_norm": 0.12441486120223999,
"learning_rate": 9.994101375840618e-05,
"loss": 0.0047,
"step": 6470
},
{
"epoch": 162.0,
"grad_norm": 0.12421895563602448,
"learning_rate": 9.994020810541098e-05,
"loss": 0.0051,
"step": 6480
},
{
"epoch": 162.25,
"grad_norm": 0.11112942546606064,
"learning_rate": 9.99393969910297e-05,
"loss": 0.0051,
"step": 6490
},
{
"epoch": 162.5,
"grad_norm": 0.12117798626422882,
"learning_rate": 9.993858041535104e-05,
"loss": 0.0057,
"step": 6500
},
{
"epoch": 162.75,
"grad_norm": 0.1292831152677536,
"learning_rate": 9.99377583784643e-05,
"loss": 0.0052,
"step": 6510
},
{
"epoch": 163.0,
"grad_norm": 0.13252988457679749,
"learning_rate": 9.993693088045939e-05,
"loss": 0.0051,
"step": 6520
},
{
"epoch": 163.25,
"grad_norm": 0.13480907678604126,
"learning_rate": 9.99360979214268e-05,
"loss": 0.0049,
"step": 6530
},
{
"epoch": 163.5,
"grad_norm": 0.08185603469610214,
"learning_rate": 9.99352595014576e-05,
"loss": 0.0047,
"step": 6540
},
{
"epoch": 163.75,
"grad_norm": 0.07064332067966461,
"learning_rate": 9.993441562064354e-05,
"loss": 0.0049,
"step": 6550
},
{
"epoch": 164.0,
"grad_norm": 0.11800257861614227,
"learning_rate": 9.993356627907685e-05,
"loss": 0.0049,
"step": 6560
},
{
"epoch": 164.25,
"grad_norm": 0.1276804357767105,
"learning_rate": 9.99327114768504e-05,
"loss": 0.0055,
"step": 6570
},
{
"epoch": 164.5,
"grad_norm": 0.17329150438308716,
"learning_rate": 9.99318512140577e-05,
"loss": 0.0053,
"step": 6580
},
{
"epoch": 164.75,
"grad_norm": 0.15143054723739624,
"learning_rate": 9.993098549079284e-05,
"loss": 0.0055,
"step": 6590
},
{
"epoch": 165.0,
"grad_norm": 0.1650630086660385,
"learning_rate": 9.993011430715047e-05,
"loss": 0.006,
"step": 6600
},
{
"epoch": 165.25,
"grad_norm": 0.13941645622253418,
"learning_rate": 9.992923766322586e-05,
"loss": 0.0049,
"step": 6610
},
{
"epoch": 165.5,
"grad_norm": 0.13879020512104034,
"learning_rate": 9.99283555591149e-05,
"loss": 0.005,
"step": 6620
},
{
"epoch": 165.75,
"grad_norm": 0.1108022928237915,
"learning_rate": 9.992746799491404e-05,
"loss": 0.0054,
"step": 6630
},
{
"epoch": 166.0,
"grad_norm": 0.16898183524608612,
"learning_rate": 9.992657497072033e-05,
"loss": 0.0055,
"step": 6640
},
{
"epoch": 166.25,
"grad_norm": 0.1466725915670395,
"learning_rate": 9.992567648663147e-05,
"loss": 0.0066,
"step": 6650
},
{
"epoch": 166.5,
"grad_norm": 0.10014578700065613,
"learning_rate": 9.992477254274568e-05,
"loss": 0.0057,
"step": 6660
},
{
"epoch": 166.75,
"grad_norm": 0.11596689373254776,
"learning_rate": 9.992386313916183e-05,
"loss": 0.0051,
"step": 6670
},
{
"epoch": 167.0,
"grad_norm": 0.1346360445022583,
"learning_rate": 9.992294827597934e-05,
"loss": 0.0054,
"step": 6680
},
{
"epoch": 167.25,
"grad_norm": 0.12456992268562317,
"learning_rate": 9.992202795329831e-05,
"loss": 0.0058,
"step": 6690
},
{
"epoch": 167.5,
"grad_norm": 0.15003210306167603,
"learning_rate": 9.992110217121936e-05,
"loss": 0.0064,
"step": 6700
},
{
"epoch": 167.75,
"grad_norm": 0.12493447959423065,
"learning_rate": 9.992017092984372e-05,
"loss": 0.0048,
"step": 6710
},
{
"epoch": 168.0,
"grad_norm": 0.13486067950725555,
"learning_rate": 9.991923422927326e-05,
"loss": 0.0052,
"step": 6720
},
{
"epoch": 168.25,
"grad_norm": 0.12454357743263245,
"learning_rate": 9.991829206961037e-05,
"loss": 0.0047,
"step": 6730
},
{
"epoch": 168.5,
"grad_norm": 0.12296856194734573,
"learning_rate": 9.991734445095813e-05,
"loss": 0.0048,
"step": 6740
},
{
"epoch": 168.75,
"grad_norm": 0.12852822244167328,
"learning_rate": 9.991639137342015e-05,
"loss": 0.0052,
"step": 6750
},
{
"epoch": 169.0,
"grad_norm": 0.10896472632884979,
"learning_rate": 9.991543283710064e-05,
"loss": 0.0056,
"step": 6760
},
{
"epoch": 169.25,
"grad_norm": 0.09551511704921722,
"learning_rate": 9.991446884210445e-05,
"loss": 0.0055,
"step": 6770
},
{
"epoch": 169.5,
"grad_norm": 0.11103704571723938,
"learning_rate": 9.9913499388537e-05,
"loss": 0.0045,
"step": 6780
},
{
"epoch": 169.75,
"grad_norm": 0.09321358799934387,
"learning_rate": 9.99125244765043e-05,
"loss": 0.0045,
"step": 6790
},
{
"epoch": 170.0,
"grad_norm": 0.1304449886083603,
"learning_rate": 9.991154410611296e-05,
"loss": 0.0051,
"step": 6800
},
{
"epoch": 170.25,
"grad_norm": 0.1056373193860054,
"learning_rate": 9.99105582774702e-05,
"loss": 0.0048,
"step": 6810
},
{
"epoch": 170.5,
"grad_norm": 0.15499410033226013,
"learning_rate": 9.990956699068384e-05,
"loss": 0.0054,
"step": 6820
},
{
"epoch": 170.75,
"grad_norm": 0.1866844743490219,
"learning_rate": 9.990857024586224e-05,
"loss": 0.0054,
"step": 6830
},
{
"epoch": 171.0,
"grad_norm": 0.16530998051166534,
"learning_rate": 9.990756804311446e-05,
"loss": 0.0051,
"step": 6840
},
{
"epoch": 171.25,
"grad_norm": 0.11174263060092926,
"learning_rate": 9.990656038255006e-05,
"loss": 0.0044,
"step": 6850
},
{
"epoch": 171.5,
"grad_norm": 0.13130134344100952,
"learning_rate": 9.990554726427926e-05,
"loss": 0.0051,
"step": 6860
},
{
"epoch": 171.75,
"grad_norm": 0.13618028163909912,
"learning_rate": 9.990452868841284e-05,
"loss": 0.0055,
"step": 6870
},
{
"epoch": 172.0,
"grad_norm": 0.12057960033416748,
"learning_rate": 9.99035046550622e-05,
"loss": 0.0051,
"step": 6880
},
{
"epoch": 172.25,
"grad_norm": 0.13933198153972626,
"learning_rate": 9.99024751643393e-05,
"loss": 0.0051,
"step": 6890
},
{
"epoch": 172.5,
"grad_norm": 0.11323478817939758,
"learning_rate": 9.990144021635677e-05,
"loss": 0.0048,
"step": 6900
},
{
"epoch": 172.75,
"grad_norm": 0.12394394725561142,
"learning_rate": 9.990039981122775e-05,
"loss": 0.0053,
"step": 6910
},
{
"epoch": 173.0,
"grad_norm": 0.12509888410568237,
"learning_rate": 9.989935394906602e-05,
"loss": 0.0049,
"step": 6920
},
{
"epoch": 173.25,
"grad_norm": 0.150846928358078,
"learning_rate": 9.989830262998598e-05,
"loss": 0.0063,
"step": 6930
},
{
"epoch": 173.5,
"grad_norm": 0.15578363835811615,
"learning_rate": 9.989724585410259e-05,
"loss": 0.0053,
"step": 6940
},
{
"epoch": 173.75,
"grad_norm": 0.1302061229944229,
"learning_rate": 9.989618362153139e-05,
"loss": 0.0057,
"step": 6950
},
{
"epoch": 174.0,
"grad_norm": 0.13165602087974548,
"learning_rate": 9.989511593238859e-05,
"loss": 0.0048,
"step": 6960
},
{
"epoch": 174.25,
"grad_norm": 0.1607247292995453,
"learning_rate": 9.98940427867909e-05,
"loss": 0.0051,
"step": 6970
},
{
"epoch": 174.5,
"grad_norm": 0.17222370207309723,
"learning_rate": 9.989296418485573e-05,
"loss": 0.0067,
"step": 6980
},
{
"epoch": 174.75,
"grad_norm": 0.14066697657108307,
"learning_rate": 9.989188012670101e-05,
"loss": 0.0052,
"step": 6990
},
{
"epoch": 175.0,
"grad_norm": 0.12622103095054626,
"learning_rate": 9.989079061244528e-05,
"loss": 0.0058,
"step": 7000
},
{
"epoch": 175.25,
"grad_norm": 0.16112670302391052,
"learning_rate": 9.988969564220769e-05,
"loss": 0.0054,
"step": 7010
},
{
"epoch": 175.5,
"grad_norm": 0.09230020642280579,
"learning_rate": 9.988859521610801e-05,
"loss": 0.0054,
"step": 7020
},
{
"epoch": 175.75,
"grad_norm": 0.14121113717556,
"learning_rate": 9.988748933426656e-05,
"loss": 0.0056,
"step": 7030
},
{
"epoch": 176.0,
"grad_norm": 0.09799350798130035,
"learning_rate": 9.988637799680428e-05,
"loss": 0.0057,
"step": 7040
},
{
"epoch": 176.25,
"grad_norm": 0.09402919560670853,
"learning_rate": 9.98852612038427e-05,
"loss": 0.0047,
"step": 7050
},
{
"epoch": 176.5,
"grad_norm": 0.1271122395992279,
"learning_rate": 9.988413895550397e-05,
"loss": 0.0046,
"step": 7060
},
{
"epoch": 176.75,
"grad_norm": 0.09895417094230652,
"learning_rate": 9.98830112519108e-05,
"loss": 0.0059,
"step": 7070
},
{
"epoch": 177.0,
"grad_norm": 0.12824778258800507,
"learning_rate": 9.98818780931865e-05,
"loss": 0.0049,
"step": 7080
},
{
"epoch": 177.25,
"grad_norm": 0.12342114001512527,
"learning_rate": 9.988073947945502e-05,
"loss": 0.0049,
"step": 7090
},
{
"epoch": 177.5,
"grad_norm": 0.143673375248909,
"learning_rate": 9.987959541084087e-05,
"loss": 0.0047,
"step": 7100
},
{
"epoch": 177.75,
"grad_norm": 0.14878948032855988,
"learning_rate": 9.987844588746915e-05,
"loss": 0.0041,
"step": 7110
},
{
"epoch": 178.0,
"grad_norm": 0.12074983865022659,
"learning_rate": 9.987729090946558e-05,
"loss": 0.0048,
"step": 7120
},
{
"epoch": 178.25,
"grad_norm": 0.12571795284748077,
"learning_rate": 9.987613047695647e-05,
"loss": 0.005,
"step": 7130
},
{
"epoch": 178.5,
"grad_norm": 0.1860690414905548,
"learning_rate": 9.987496459006871e-05,
"loss": 0.0047,
"step": 7140
},
{
"epoch": 178.75,
"grad_norm": 0.11447081714868546,
"learning_rate": 9.987379324892982e-05,
"loss": 0.0059,
"step": 7150
},
{
"epoch": 179.0,
"grad_norm": 0.2062373012304306,
"learning_rate": 9.987261645366788e-05,
"loss": 0.0049,
"step": 7160
},
{
"epoch": 179.25,
"grad_norm": 0.13676029443740845,
"learning_rate": 9.987143420441158e-05,
"loss": 0.005,
"step": 7170
},
{
"epoch": 179.5,
"grad_norm": 0.17207178473472595,
"learning_rate": 9.987024650129022e-05,
"loss": 0.0051,
"step": 7180
},
{
"epoch": 179.75,
"grad_norm": 0.17590519785881042,
"learning_rate": 9.986905334443368e-05,
"loss": 0.0059,
"step": 7190
},
{
"epoch": 180.0,
"grad_norm": 0.13459520041942596,
"learning_rate": 9.986785473397245e-05,
"loss": 0.005,
"step": 7200
},
{
"epoch": 180.25,
"grad_norm": 0.21301501989364624,
"learning_rate": 9.98666506700376e-05,
"loss": 0.0056,
"step": 7210
},
{
"epoch": 180.5,
"grad_norm": 0.13290734589099884,
"learning_rate": 9.986544115276081e-05,
"loss": 0.0066,
"step": 7220
},
{
"epoch": 180.75,
"grad_norm": 0.12449201196432114,
"learning_rate": 9.986422618227433e-05,
"loss": 0.0053,
"step": 7230
},
{
"epoch": 181.0,
"grad_norm": 0.119524285197258,
"learning_rate": 9.986300575871106e-05,
"loss": 0.0056,
"step": 7240
},
{
"epoch": 181.25,
"grad_norm": 0.10814197361469269,
"learning_rate": 9.986177988220444e-05,
"loss": 0.0047,
"step": 7250
},
{
"epoch": 181.5,
"grad_norm": 0.12408486753702164,
"learning_rate": 9.986054855288856e-05,
"loss": 0.005,
"step": 7260
},
{
"epoch": 181.75,
"grad_norm": 0.1282089203596115,
"learning_rate": 9.985931177089802e-05,
"loss": 0.0056,
"step": 7270
},
{
"epoch": 182.0,
"grad_norm": 0.17553548514842987,
"learning_rate": 9.985806953636814e-05,
"loss": 0.005,
"step": 7280
},
{
"epoch": 182.25,
"grad_norm": 0.10986651480197906,
"learning_rate": 9.985682184943471e-05,
"loss": 0.0056,
"step": 7290
},
{
"epoch": 182.5,
"grad_norm": 0.11029662936925888,
"learning_rate": 9.98555687102342e-05,
"loss": 0.0047,
"step": 7300
},
{
"epoch": 182.75,
"grad_norm": 0.1303234100341797,
"learning_rate": 9.985431011890367e-05,
"loss": 0.0052,
"step": 7310
},
{
"epoch": 183.0,
"grad_norm": 0.11000842601060867,
"learning_rate": 9.985304607558075e-05,
"loss": 0.0053,
"step": 7320
},
{
"epoch": 183.25,
"grad_norm": 0.11282704770565033,
"learning_rate": 9.985177658040364e-05,
"loss": 0.0049,
"step": 7330
},
{
"epoch": 183.5,
"grad_norm": 0.11639503389596939,
"learning_rate": 9.985050163351119e-05,
"loss": 0.005,
"step": 7340
},
{
"epoch": 183.75,
"grad_norm": 0.12452349811792374,
"learning_rate": 9.984922123504286e-05,
"loss": 0.0044,
"step": 7350
},
{
"epoch": 184.0,
"grad_norm": 0.09971412271261215,
"learning_rate": 9.984793538513862e-05,
"loss": 0.0052,
"step": 7360
},
{
"epoch": 184.25,
"grad_norm": 0.08956573903560638,
"learning_rate": 9.984664408393912e-05,
"loss": 0.0053,
"step": 7370
},
{
"epoch": 184.5,
"grad_norm": 0.0963820144534111,
"learning_rate": 9.984534733158556e-05,
"loss": 0.0053,
"step": 7380
},
{
"epoch": 184.75,
"grad_norm": 0.11035646498203278,
"learning_rate": 9.984404512821977e-05,
"loss": 0.0045,
"step": 7390
},
{
"epoch": 185.0,
"grad_norm": 0.10871769487857819,
"learning_rate": 9.984273747398411e-05,
"loss": 0.0052,
"step": 7400
},
{
"epoch": 185.25,
"grad_norm": 0.10968684405088425,
"learning_rate": 9.984142436902165e-05,
"loss": 0.0057,
"step": 7410
},
{
"epoch": 185.5,
"grad_norm": 0.135623961687088,
"learning_rate": 9.984010581347596e-05,
"loss": 0.0045,
"step": 7420
},
{
"epoch": 185.75,
"grad_norm": 0.10947667807340622,
"learning_rate": 9.983878180749121e-05,
"loss": 0.005,
"step": 7430
},
{
"epoch": 186.0,
"grad_norm": 0.10079862177371979,
"learning_rate": 9.983745235121222e-05,
"loss": 0.0046,
"step": 7440
},
{
"epoch": 186.25,
"grad_norm": 0.16193941235542297,
"learning_rate": 9.983611744478438e-05,
"loss": 0.0054,
"step": 7450
},
{
"epoch": 186.5,
"grad_norm": 0.11646270751953125,
"learning_rate": 9.983477708835365e-05,
"loss": 0.0054,
"step": 7460
},
{
"epoch": 186.75,
"grad_norm": 0.1582486480474472,
"learning_rate": 9.983343128206664e-05,
"loss": 0.0055,
"step": 7470
},
{
"epoch": 187.0,
"grad_norm": 0.15796583890914917,
"learning_rate": 9.983208002607049e-05,
"loss": 0.0054,
"step": 7480
},
{
"epoch": 187.25,
"grad_norm": 0.15638047456741333,
"learning_rate": 9.9830723320513e-05,
"loss": 0.0054,
"step": 7490
},
{
"epoch": 187.5,
"grad_norm": 0.12270047515630722,
"learning_rate": 9.982936116554254e-05,
"loss": 0.0045,
"step": 7500
},
{
"epoch": 187.75,
"grad_norm": 0.12344437092542648,
"learning_rate": 9.982799356130803e-05,
"loss": 0.0056,
"step": 7510
},
{
"epoch": 188.0,
"grad_norm": 0.13426025211811066,
"learning_rate": 9.982662050795908e-05,
"loss": 0.0055,
"step": 7520
},
{
"epoch": 188.25,
"grad_norm": 0.18254458904266357,
"learning_rate": 9.982524200564583e-05,
"loss": 0.0058,
"step": 7530
},
{
"epoch": 188.5,
"grad_norm": 0.1355525702238083,
"learning_rate": 9.982385805451901e-05,
"loss": 0.0052,
"step": 7540
},
{
"epoch": 188.75,
"grad_norm": 0.12746183574199677,
"learning_rate": 9.982246865472998e-05,
"loss": 0.0048,
"step": 7550
},
{
"epoch": 189.0,
"grad_norm": 0.07315339148044586,
"learning_rate": 9.982107380643069e-05,
"loss": 0.0051,
"step": 7560
},
{
"epoch": 189.25,
"grad_norm": 0.08937343209981918,
"learning_rate": 9.981967350977368e-05,
"loss": 0.0054,
"step": 7570
},
{
"epoch": 189.5,
"grad_norm": 0.1072855219244957,
"learning_rate": 9.981826776491208e-05,
"loss": 0.0046,
"step": 7580
},
{
"epoch": 189.75,
"grad_norm": 0.08609167486429214,
"learning_rate": 9.98168565719996e-05,
"loss": 0.0057,
"step": 7590
},
{
"epoch": 190.0,
"grad_norm": 0.1458943635225296,
"learning_rate": 9.98154399311906e-05,
"loss": 0.0052,
"step": 7600
},
{
"epoch": 190.25,
"grad_norm": 0.14894481003284454,
"learning_rate": 9.981401784263997e-05,
"loss": 0.0051,
"step": 7610
},
{
"epoch": 190.5,
"grad_norm": 0.12157489359378815,
"learning_rate": 9.981259030650326e-05,
"loss": 0.0043,
"step": 7620
},
{
"epoch": 190.75,
"grad_norm": 0.09578462690114975,
"learning_rate": 9.981115732293655e-05,
"loss": 0.0043,
"step": 7630
},
{
"epoch": 191.0,
"grad_norm": 0.1009032130241394,
"learning_rate": 9.980971889209659e-05,
"loss": 0.0047,
"step": 7640
},
{
"epoch": 191.25,
"grad_norm": 0.10971588641405106,
"learning_rate": 9.980827501414064e-05,
"loss": 0.0048,
"step": 7650
},
{
"epoch": 191.5,
"grad_norm": 0.12458957731723785,
"learning_rate": 9.980682568922663e-05,
"loss": 0.0042,
"step": 7660
},
{
"epoch": 191.75,
"grad_norm": 0.17035630345344543,
"learning_rate": 9.980537091751304e-05,
"loss": 0.0046,
"step": 7670
},
{
"epoch": 192.0,
"grad_norm": 0.12632252275943756,
"learning_rate": 9.980391069915897e-05,
"loss": 0.0047,
"step": 7680
},
{
"epoch": 192.25,
"grad_norm": 0.08305094391107559,
"learning_rate": 9.98024450343241e-05,
"loss": 0.0051,
"step": 7690
},
{
"epoch": 192.5,
"grad_norm": 0.10795965045690536,
"learning_rate": 9.980097392316872e-05,
"loss": 0.0041,
"step": 7700
},
{
"epoch": 192.75,
"grad_norm": 0.1059790700674057,
"learning_rate": 9.97994973658537e-05,
"loss": 0.0042,
"step": 7710
},
{
"epoch": 193.0,
"grad_norm": 0.09630829840898514,
"learning_rate": 9.979801536254054e-05,
"loss": 0.0038,
"step": 7720
},
{
"epoch": 193.25,
"grad_norm": 0.09148659557104111,
"learning_rate": 9.979652791339127e-05,
"loss": 0.0051,
"step": 7730
},
{
"epoch": 193.5,
"grad_norm": 0.11520194262266159,
"learning_rate": 9.97950350185686e-05,
"loss": 0.0053,
"step": 7740
},
{
"epoch": 193.75,
"grad_norm": 0.1046760156750679,
"learning_rate": 9.979353667823574e-05,
"loss": 0.0041,
"step": 7750
},
{
"epoch": 194.0,
"grad_norm": 0.13308505713939667,
"learning_rate": 9.979203289255658e-05,
"loss": 0.0046,
"step": 7760
},
{
"epoch": 194.25,
"grad_norm": 0.11976161599159241,
"learning_rate": 9.979052366169557e-05,
"loss": 0.005,
"step": 7770
},
{
"epoch": 194.5,
"grad_norm": 0.1054573506116867,
"learning_rate": 9.978900898581775e-05,
"loss": 0.0049,
"step": 7780
},
{
"epoch": 194.75,
"grad_norm": 0.14348535239696503,
"learning_rate": 9.978748886508875e-05,
"loss": 0.0043,
"step": 7790
},
{
"epoch": 195.0,
"grad_norm": 0.1736174374818802,
"learning_rate": 9.978596329967484e-05,
"loss": 0.0052,
"step": 7800
},
{
"epoch": 195.25,
"grad_norm": 0.15408484637737274,
"learning_rate": 9.978443228974284e-05,
"loss": 0.0043,
"step": 7810
},
{
"epoch": 195.5,
"grad_norm": 0.12470567971467972,
"learning_rate": 9.978289583546015e-05,
"loss": 0.0047,
"step": 7820
},
{
"epoch": 195.75,
"grad_norm": 0.1008351519703865,
"learning_rate": 9.978135393699484e-05,
"loss": 0.0045,
"step": 7830
},
{
"epoch": 196.0,
"grad_norm": 0.13738684356212616,
"learning_rate": 9.977980659451548e-05,
"loss": 0.005,
"step": 7840
},
{
"epoch": 196.25,
"grad_norm": 0.10673101246356964,
"learning_rate": 9.977825380819135e-05,
"loss": 0.005,
"step": 7850
},
{
"epoch": 196.5,
"grad_norm": 0.12118956446647644,
"learning_rate": 9.97766955781922e-05,
"loss": 0.005,
"step": 7860
},
{
"epoch": 196.75,
"grad_norm": 0.14345556497573853,
"learning_rate": 9.977513190468848e-05,
"loss": 0.0044,
"step": 7870
},
{
"epoch": 197.0,
"grad_norm": 0.13606630265712738,
"learning_rate": 9.977356278785116e-05,
"loss": 0.0044,
"step": 7880
},
{
"epoch": 197.25,
"grad_norm": 0.14485426247119904,
"learning_rate": 9.977198822785184e-05,
"loss": 0.0048,
"step": 7890
},
{
"epoch": 197.5,
"grad_norm": 0.11165472120046616,
"learning_rate": 9.977040822486273e-05,
"loss": 0.0045,
"step": 7900
},
{
"epoch": 197.75,
"grad_norm": 0.11101426929235458,
"learning_rate": 9.97688227790566e-05,
"loss": 0.0048,
"step": 7910
},
{
"epoch": 198.0,
"grad_norm": 0.1383507251739502,
"learning_rate": 9.976723189060684e-05,
"loss": 0.0048,
"step": 7920
},
{
"epoch": 198.25,
"grad_norm": 0.07337084412574768,
"learning_rate": 9.976563555968742e-05,
"loss": 0.0044,
"step": 7930
},
{
"epoch": 198.5,
"grad_norm": 0.10197046399116516,
"learning_rate": 9.976403378647292e-05,
"loss": 0.0052,
"step": 7940
},
{
"epoch": 198.75,
"grad_norm": 0.08910421282052994,
"learning_rate": 9.97624265711385e-05,
"loss": 0.0052,
"step": 7950
},
{
"epoch": 199.0,
"grad_norm": 0.12083287537097931,
"learning_rate": 9.976081391385993e-05,
"loss": 0.0052,
"step": 7960
},
{
"epoch": 199.25,
"grad_norm": 0.08918462693691254,
"learning_rate": 9.975919581481356e-05,
"loss": 0.0051,
"step": 7970
},
{
"epoch": 199.5,
"grad_norm": 0.10877599567174911,
"learning_rate": 9.975757227417634e-05,
"loss": 0.0047,
"step": 7980
},
{
"epoch": 199.75,
"grad_norm": 0.09586022794246674,
"learning_rate": 9.975594329212586e-05,
"loss": 0.005,
"step": 7990
},
{
"epoch": 200.0,
"grad_norm": 0.15994898974895477,
"learning_rate": 9.97543088688402e-05,
"loss": 0.0049,
"step": 8000
},
{
"epoch": 200.25,
"grad_norm": 0.14788265526294708,
"learning_rate": 9.975266900449814e-05,
"loss": 0.0056,
"step": 8010
},
{
"epoch": 200.5,
"grad_norm": 0.11368973553180695,
"learning_rate": 9.975102369927898e-05,
"loss": 0.0045,
"step": 8020
},
{
"epoch": 200.75,
"grad_norm": 0.14189907908439636,
"learning_rate": 9.974937295336269e-05,
"loss": 0.005,
"step": 8030
},
{
"epoch": 201.0,
"grad_norm": 0.12064416706562042,
"learning_rate": 9.974771676692975e-05,
"loss": 0.0049,
"step": 8040
},
{
"epoch": 201.25,
"grad_norm": 0.09696459025144577,
"learning_rate": 9.974605514016131e-05,
"loss": 0.0043,
"step": 8050
},
{
"epoch": 201.5,
"grad_norm": 0.10713174939155579,
"learning_rate": 9.974438807323907e-05,
"loss": 0.0046,
"step": 8060
},
{
"epoch": 201.75,
"grad_norm": 0.10264813154935837,
"learning_rate": 9.974271556634535e-05,
"loss": 0.0044,
"step": 8070
},
{
"epoch": 202.0,
"grad_norm": 0.08025900274515152,
"learning_rate": 9.974103761966302e-05,
"loss": 0.0055,
"step": 8080
},
{
"epoch": 202.25,
"grad_norm": 0.13613669574260712,
"learning_rate": 9.973935423337563e-05,
"loss": 0.005,
"step": 8090
},
{
"epoch": 202.5,
"grad_norm": 0.11066287010908127,
"learning_rate": 9.973766540766722e-05,
"loss": 0.005,
"step": 8100
},
{
"epoch": 202.75,
"grad_norm": 0.09373009204864502,
"learning_rate": 9.97359711427225e-05,
"loss": 0.0063,
"step": 8110
},
{
"epoch": 203.0,
"grad_norm": 0.10568234324455261,
"learning_rate": 9.973427143872677e-05,
"loss": 0.0049,
"step": 8120
},
{
"epoch": 203.25,
"grad_norm": 0.08303306251764297,
"learning_rate": 9.973256629586589e-05,
"loss": 0.0056,
"step": 8130
},
{
"epoch": 203.5,
"grad_norm": 0.08859831839799881,
"learning_rate": 9.973085571432632e-05,
"loss": 0.0055,
"step": 8140
},
{
"epoch": 203.75,
"grad_norm": 0.12644809484481812,
"learning_rate": 9.972913969429513e-05,
"loss": 0.0056,
"step": 8150
},
{
"epoch": 204.0,
"grad_norm": 0.12281641364097595,
"learning_rate": 9.972741823596e-05,
"loss": 0.0052,
"step": 8160
},
{
"epoch": 204.25,
"grad_norm": 0.1079707145690918,
"learning_rate": 9.972569133950917e-05,
"loss": 0.0048,
"step": 8170
},
{
"epoch": 204.5,
"grad_norm": 0.13612797856330872,
"learning_rate": 9.972395900513151e-05,
"loss": 0.0047,
"step": 8180
},
{
"epoch": 204.75,
"grad_norm": 0.1213703528046608,
"learning_rate": 9.972222123301645e-05,
"loss": 0.0052,
"step": 8190
},
{
"epoch": 205.0,
"grad_norm": 0.10602298378944397,
"learning_rate": 9.972047802335403e-05,
"loss": 0.0051,
"step": 8200
},
{
"epoch": 205.25,
"grad_norm": 0.09890609234571457,
"learning_rate": 9.971872937633488e-05,
"loss": 0.0045,
"step": 8210
},
{
"epoch": 205.5,
"grad_norm": 0.10948968678712845,
"learning_rate": 9.971697529215024e-05,
"loss": 0.005,
"step": 8220
},
{
"epoch": 205.75,
"grad_norm": 0.11318536102771759,
"learning_rate": 9.971521577099192e-05,
"loss": 0.005,
"step": 8230
},
{
"epoch": 206.0,
"grad_norm": 0.14533638954162598,
"learning_rate": 9.971345081305236e-05,
"loss": 0.0051,
"step": 8240
},
{
"epoch": 206.25,
"grad_norm": 0.16806240379810333,
"learning_rate": 9.971168041852456e-05,
"loss": 0.0047,
"step": 8250
},
{
"epoch": 206.5,
"grad_norm": 0.12269231677055359,
"learning_rate": 9.970990458760215e-05,
"loss": 0.0044,
"step": 8260
},
{
"epoch": 206.75,
"grad_norm": 0.19708134233951569,
"learning_rate": 9.970812332047929e-05,
"loss": 0.0045,
"step": 8270
},
{
"epoch": 207.0,
"grad_norm": 0.16831305623054504,
"learning_rate": 9.97063366173508e-05,
"loss": 0.0046,
"step": 8280
},
{
"epoch": 207.25,
"grad_norm": 0.09102386981248856,
"learning_rate": 9.970454447841207e-05,
"loss": 0.0046,
"step": 8290
},
{
"epoch": 207.5,
"grad_norm": 0.1855050027370453,
"learning_rate": 9.970274690385909e-05,
"loss": 0.005,
"step": 8300
},
{
"epoch": 207.75,
"grad_norm": 0.1450817734003067,
"learning_rate": 9.970094389388844e-05,
"loss": 0.0054,
"step": 8310
},
{
"epoch": 208.0,
"grad_norm": 0.1687973290681839,
"learning_rate": 9.969913544869728e-05,
"loss": 0.0049,
"step": 8320
},
{
"epoch": 208.25,
"grad_norm": 0.12717373669147491,
"learning_rate": 9.96973215684834e-05,
"loss": 0.005,
"step": 8330
},
{
"epoch": 208.5,
"grad_norm": 0.1274053007364273,
"learning_rate": 9.969550225344513e-05,
"loss": 0.0051,
"step": 8340
},
{
"epoch": 208.75,
"grad_norm": 0.15039260685443878,
"learning_rate": 9.969367750378147e-05,
"loss": 0.0036,
"step": 8350
},
{
"epoch": 209.0,
"grad_norm": 0.12453170120716095,
"learning_rate": 9.969184731969194e-05,
"loss": 0.0052,
"step": 8360
},
{
"epoch": 209.25,
"grad_norm": 0.10263194143772125,
"learning_rate": 9.96900117013767e-05,
"loss": 0.005,
"step": 8370
},
{
"epoch": 209.5,
"grad_norm": 0.10451891273260117,
"learning_rate": 9.96881706490365e-05,
"loss": 0.0048,
"step": 8380
},
{
"epoch": 209.75,
"grad_norm": 0.11797595024108887,
"learning_rate": 9.968632416287265e-05,
"loss": 0.0047,
"step": 8390
},
{
"epoch": 210.0,
"grad_norm": 0.1403282731771469,
"learning_rate": 9.96844722430871e-05,
"loss": 0.0042,
"step": 8400
},
{
"epoch": 210.25,
"grad_norm": 0.12103106081485748,
"learning_rate": 9.968261488988235e-05,
"loss": 0.0052,
"step": 8410
},
{
"epoch": 210.5,
"grad_norm": 0.09587433189153671,
"learning_rate": 9.968075210346155e-05,
"loss": 0.0043,
"step": 8420
},
{
"epoch": 210.75,
"grad_norm": 0.09224146604537964,
"learning_rate": 9.967888388402839e-05,
"loss": 0.0049,
"step": 8430
},
{
"epoch": 211.0,
"grad_norm": 0.09856747835874557,
"learning_rate": 9.967701023178717e-05,
"loss": 0.0045,
"step": 8440
},
{
"epoch": 211.25,
"grad_norm": 0.1124839186668396,
"learning_rate": 9.967513114694282e-05,
"loss": 0.0045,
"step": 8450
},
{
"epoch": 211.5,
"grad_norm": 0.10403812676668167,
"learning_rate": 9.967324662970079e-05,
"loss": 0.0043,
"step": 8460
},
{
"epoch": 211.75,
"grad_norm": 0.09139248728752136,
"learning_rate": 9.96713566802672e-05,
"loss": 0.0047,
"step": 8470
},
{
"epoch": 212.0,
"grad_norm": 0.08583129942417145,
"learning_rate": 9.966946129884873e-05,
"loss": 0.0044,
"step": 8480
},
{
"epoch": 212.25,
"grad_norm": 0.10125056654214859,
"learning_rate": 9.966756048565265e-05,
"loss": 0.0047,
"step": 8490
},
{
"epoch": 212.5,
"grad_norm": 0.1086161881685257,
"learning_rate": 9.966565424088681e-05,
"loss": 0.0042,
"step": 8500
},
{
"epoch": 212.75,
"grad_norm": 0.1101728305220604,
"learning_rate": 9.96637425647597e-05,
"loss": 0.0045,
"step": 8510
},
{
"epoch": 213.0,
"grad_norm": 0.11788172274827957,
"learning_rate": 9.966182545748038e-05,
"loss": 0.0045,
"step": 8520
},
{
"epoch": 213.25,
"grad_norm": 0.10477957874536514,
"learning_rate": 9.96599029192585e-05,
"loss": 0.0046,
"step": 8530
},
{
"epoch": 213.5,
"grad_norm": 0.10119979828596115,
"learning_rate": 9.965797495030428e-05,
"loss": 0.0044,
"step": 8540
},
{
"epoch": 213.75,
"grad_norm": 0.10286411643028259,
"learning_rate": 9.96560415508286e-05,
"loss": 0.0048,
"step": 8550
},
{
"epoch": 214.0,
"grad_norm": 0.11209660023450851,
"learning_rate": 9.965410272104286e-05,
"loss": 0.0041,
"step": 8560
},
{
"epoch": 214.25,
"grad_norm": 0.15167979896068573,
"learning_rate": 9.96521584611591e-05,
"loss": 0.0046,
"step": 8570
},
{
"epoch": 214.5,
"grad_norm": 0.1361161321401596,
"learning_rate": 9.965020877138994e-05,
"loss": 0.0055,
"step": 8580
},
{
"epoch": 214.75,
"grad_norm": 0.1173548623919487,
"learning_rate": 9.964825365194861e-05,
"loss": 0.0057,
"step": 8590
},
{
"epoch": 215.0,
"grad_norm": 0.10933533310890198,
"learning_rate": 9.96462931030489e-05,
"loss": 0.0047,
"step": 8600
},
{
"epoch": 215.25,
"grad_norm": 0.1078348457813263,
"learning_rate": 9.96443271249052e-05,
"loss": 0.0044,
"step": 8610
},
{
"epoch": 215.5,
"grad_norm": 0.12997107207775116,
"learning_rate": 9.964235571773255e-05,
"loss": 0.0042,
"step": 8620
},
{
"epoch": 215.75,
"grad_norm": 0.08457321673631668,
"learning_rate": 9.96403788817465e-05,
"loss": 0.0044,
"step": 8630
},
{
"epoch": 216.0,
"grad_norm": 0.15407255291938782,
"learning_rate": 9.963839661716325e-05,
"loss": 0.0055,
"step": 8640
},
{
"epoch": 216.25,
"grad_norm": 0.1310333013534546,
"learning_rate": 9.963640892419958e-05,
"loss": 0.0051,
"step": 8650
},
{
"epoch": 216.5,
"grad_norm": 0.08272965252399445,
"learning_rate": 9.963441580307286e-05,
"loss": 0.0043,
"step": 8660
},
{
"epoch": 216.75,
"grad_norm": 0.0824747085571289,
"learning_rate": 9.963241725400104e-05,
"loss": 0.0042,
"step": 8670
},
{
"epoch": 217.0,
"grad_norm": 0.08268961310386658,
"learning_rate": 9.963041327720271e-05,
"loss": 0.0037,
"step": 8680
},
{
"epoch": 217.25,
"grad_norm": 0.07103843986988068,
"learning_rate": 9.962840387289697e-05,
"loss": 0.0051,
"step": 8690
},
{
"epoch": 217.5,
"grad_norm": 0.09084175527095795,
"learning_rate": 9.962638904130363e-05,
"loss": 0.0044,
"step": 8700
},
{
"epoch": 217.75,
"grad_norm": 0.10540860146284103,
"learning_rate": 9.962436878264298e-05,
"loss": 0.0047,
"step": 8710
},
{
"epoch": 218.0,
"grad_norm": 0.10905332863330841,
"learning_rate": 9.962234309713598e-05,
"loss": 0.0046,
"step": 8720
},
{
"epoch": 218.25,
"grad_norm": 0.1034293845295906,
"learning_rate": 9.962031198500414e-05,
"loss": 0.0041,
"step": 8730
},
{
"epoch": 218.5,
"grad_norm": 0.09964323043823242,
"learning_rate": 9.961827544646958e-05,
"loss": 0.0045,
"step": 8740
},
{
"epoch": 218.75,
"grad_norm": 0.09900356829166412,
"learning_rate": 9.961623348175501e-05,
"loss": 0.0045,
"step": 8750
},
{
"epoch": 219.0,
"grad_norm": 0.10930373519659042,
"learning_rate": 9.961418609108377e-05,
"loss": 0.0046,
"step": 8760
},
{
"epoch": 219.25,
"grad_norm": 0.09773886203765869,
"learning_rate": 9.961213327467971e-05,
"loss": 0.0046,
"step": 8770
},
{
"epoch": 219.5,
"grad_norm": 0.12222948670387268,
"learning_rate": 9.961007503276736e-05,
"loss": 0.0045,
"step": 8780
},
{
"epoch": 219.75,
"grad_norm": 0.07556351274251938,
"learning_rate": 9.960801136557179e-05,
"loss": 0.004,
"step": 8790
},
{
"epoch": 220.0,
"grad_norm": 0.08961526304483414,
"learning_rate": 9.960594227331866e-05,
"loss": 0.0037,
"step": 8800
},
{
"epoch": 220.25,
"grad_norm": 0.10761409252882004,
"learning_rate": 9.960386775623429e-05,
"loss": 0.0048,
"step": 8810
},
{
"epoch": 220.5,
"grad_norm": 0.07478926330804825,
"learning_rate": 9.96017878145455e-05,
"loss": 0.0042,
"step": 8820
},
{
"epoch": 220.75,
"grad_norm": 0.09129362553358078,
"learning_rate": 9.959970244847977e-05,
"loss": 0.0042,
"step": 8830
},
{
"epoch": 221.0,
"grad_norm": 0.10972625017166138,
"learning_rate": 9.959761165826518e-05,
"loss": 0.0047,
"step": 8840
},
{
"epoch": 221.25,
"grad_norm": 0.10915403813123703,
"learning_rate": 9.959551544413033e-05,
"loss": 0.0049,
"step": 8850
},
{
"epoch": 221.5,
"grad_norm": 0.13377898931503296,
"learning_rate": 9.959341380630448e-05,
"loss": 0.005,
"step": 8860
},
{
"epoch": 221.75,
"grad_norm": 0.11009661853313446,
"learning_rate": 9.959130674501746e-05,
"loss": 0.0044,
"step": 8870
},
{
"epoch": 222.0,
"grad_norm": 0.11041966080665588,
"learning_rate": 9.958919426049968e-05,
"loss": 0.0045,
"step": 8880
},
{
"epoch": 222.25,
"grad_norm": 0.13888955116271973,
"learning_rate": 9.958707635298219e-05,
"loss": 0.0045,
"step": 8890
},
{
"epoch": 222.5,
"grad_norm": 0.13241493701934814,
"learning_rate": 9.958495302269657e-05,
"loss": 0.0045,
"step": 8900
},
{
"epoch": 222.75,
"grad_norm": 0.090728759765625,
"learning_rate": 9.958282426987503e-05,
"loss": 0.0043,
"step": 8910
},
{
"epoch": 223.0,
"grad_norm": 0.09145260602235794,
"learning_rate": 9.95806900947504e-05,
"loss": 0.0044,
"step": 8920
},
{
"epoch": 223.25,
"grad_norm": 0.09865462779998779,
"learning_rate": 9.957855049755604e-05,
"loss": 0.0046,
"step": 8930
},
{
"epoch": 223.5,
"grad_norm": 0.1020873486995697,
"learning_rate": 9.957640547852593e-05,
"loss": 0.0041,
"step": 8940
},
{
"epoch": 223.75,
"grad_norm": 0.1301255077123642,
"learning_rate": 9.957425503789466e-05,
"loss": 0.0041,
"step": 8950
},
{
"epoch": 224.0,
"grad_norm": 0.12889862060546875,
"learning_rate": 9.957209917589738e-05,
"loss": 0.0045,
"step": 8960
},
{
"epoch": 224.25,
"grad_norm": 0.11002857983112335,
"learning_rate": 9.956993789276987e-05,
"loss": 0.0044,
"step": 8970
},
{
"epoch": 224.5,
"grad_norm": 0.08376175165176392,
"learning_rate": 9.956777118874847e-05,
"loss": 0.0049,
"step": 8980
},
{
"epoch": 224.75,
"grad_norm": 0.1019083708524704,
"learning_rate": 9.956559906407016e-05,
"loss": 0.0042,
"step": 8990
},
{
"epoch": 225.0,
"grad_norm": 0.08716961741447449,
"learning_rate": 9.956342151897245e-05,
"loss": 0.0054,
"step": 9000
},
{
"epoch": 225.25,
"grad_norm": 0.08603795617818832,
"learning_rate": 9.956123855369346e-05,
"loss": 0.0039,
"step": 9010
},
{
"epoch": 225.5,
"grad_norm": 0.11349231004714966,
"learning_rate": 9.955905016847196e-05,
"loss": 0.0046,
"step": 9020
},
{
"epoch": 225.75,
"grad_norm": 0.10906950384378433,
"learning_rate": 9.955685636354723e-05,
"loss": 0.0043,
"step": 9030
},
{
"epoch": 226.0,
"grad_norm": 0.08074238151311874,
"learning_rate": 9.95546571391592e-05,
"loss": 0.0041,
"step": 9040
},
{
"epoch": 226.25,
"grad_norm": 0.1182880699634552,
"learning_rate": 9.955245249554837e-05,
"loss": 0.0045,
"step": 9050
},
{
"epoch": 226.5,
"grad_norm": 0.11833614856004715,
"learning_rate": 9.955024243295582e-05,
"loss": 0.0047,
"step": 9060
},
{
"epoch": 226.75,
"grad_norm": 0.12007834017276764,
"learning_rate": 9.954802695162328e-05,
"loss": 0.0048,
"step": 9070
},
{
"epoch": 227.0,
"grad_norm": 0.10048998892307281,
"learning_rate": 9.954580605179302e-05,
"loss": 0.0042,
"step": 9080
},
{
"epoch": 227.25,
"grad_norm": 0.10070258378982544,
"learning_rate": 9.954357973370788e-05,
"loss": 0.0042,
"step": 9090
},
{
"epoch": 227.5,
"grad_norm": 0.10562805086374283,
"learning_rate": 9.954134799761135e-05,
"loss": 0.006,
"step": 9100
},
{
"epoch": 227.75,
"grad_norm": 0.11274793744087219,
"learning_rate": 9.953911084374748e-05,
"loss": 0.0044,
"step": 9110
},
{
"epoch": 228.0,
"grad_norm": 0.15728285908699036,
"learning_rate": 9.953686827236093e-05,
"loss": 0.0045,
"step": 9120
},
{
"epoch": 228.25,
"grad_norm": 0.11849649250507355,
"learning_rate": 9.953462028369695e-05,
"loss": 0.0046,
"step": 9130
},
{
"epoch": 228.5,
"grad_norm": 0.10420042276382446,
"learning_rate": 9.953236687800136e-05,
"loss": 0.0046,
"step": 9140
},
{
"epoch": 228.75,
"grad_norm": 0.10627323389053345,
"learning_rate": 9.95301080555206e-05,
"loss": 0.0044,
"step": 9150
},
{
"epoch": 229.0,
"grad_norm": 0.11721424013376236,
"learning_rate": 9.952784381650171e-05,
"loss": 0.0051,
"step": 9160
},
{
"epoch": 229.25,
"grad_norm": 0.1566528081893921,
"learning_rate": 9.952557416119226e-05,
"loss": 0.0044,
"step": 9170
},
{
"epoch": 229.5,
"grad_norm": 0.12469837069511414,
"learning_rate": 9.95232990898405e-05,
"loss": 0.0046,
"step": 9180
},
{
"epoch": 229.75,
"grad_norm": 0.11129660159349442,
"learning_rate": 9.95210186026952e-05,
"loss": 0.0044,
"step": 9190
},
{
"epoch": 230.0,
"grad_norm": 0.1258237212896347,
"learning_rate": 9.951873270000576e-05,
"loss": 0.0044,
"step": 9200
},
{
"epoch": 230.25,
"grad_norm": 0.1056128740310669,
"learning_rate": 9.951644138202216e-05,
"loss": 0.0048,
"step": 9210
},
{
"epoch": 230.5,
"grad_norm": 0.12693685293197632,
"learning_rate": 9.951414464899498e-05,
"loss": 0.0045,
"step": 9220
},
{
"epoch": 230.75,
"grad_norm": 0.1353299915790558,
"learning_rate": 9.951184250117538e-05,
"loss": 0.0048,
"step": 9230
},
{
"epoch": 231.0,
"grad_norm": 0.14081411063671112,
"learning_rate": 9.950953493881513e-05,
"loss": 0.0046,
"step": 9240
},
{
"epoch": 231.25,
"grad_norm": 0.1451917141675949,
"learning_rate": 9.950722196216658e-05,
"loss": 0.0044,
"step": 9250
},
{
"epoch": 231.5,
"grad_norm": 0.11318142712116241,
"learning_rate": 9.950490357148265e-05,
"loss": 0.0048,
"step": 9260
},
{
"epoch": 231.75,
"grad_norm": 0.13487468659877777,
"learning_rate": 9.950257976701692e-05,
"loss": 0.0047,
"step": 9270
},
{
"epoch": 232.0,
"grad_norm": 0.1256389319896698,
"learning_rate": 9.950025054902348e-05,
"loss": 0.0042,
"step": 9280
},
{
"epoch": 232.25,
"grad_norm": 0.10590405017137527,
"learning_rate": 9.949791591775706e-05,
"loss": 0.004,
"step": 9290
},
{
"epoch": 232.5,
"grad_norm": 0.08192436397075653,
"learning_rate": 9.949557587347298e-05,
"loss": 0.0051,
"step": 9300
},
{
"epoch": 232.75,
"grad_norm": 0.12231657654047012,
"learning_rate": 9.949323041642713e-05,
"loss": 0.004,
"step": 9310
},
{
"epoch": 233.0,
"grad_norm": 0.08818791806697845,
"learning_rate": 9.949087954687602e-05,
"loss": 0.0047,
"step": 9320
},
{
"epoch": 233.25,
"grad_norm": 0.1179974153637886,
"learning_rate": 9.948852326507672e-05,
"loss": 0.0048,
"step": 9330
},
{
"epoch": 233.5,
"grad_norm": 0.09120003879070282,
"learning_rate": 9.948616157128694e-05,
"loss": 0.0041,
"step": 9340
},
{
"epoch": 233.75,
"grad_norm": 0.09142210334539413,
"learning_rate": 9.948379446576493e-05,
"loss": 0.0042,
"step": 9350
},
{
"epoch": 234.0,
"grad_norm": 0.15579193830490112,
"learning_rate": 9.948142194876952e-05,
"loss": 0.0044,
"step": 9360
},
{
"epoch": 234.25,
"grad_norm": 0.09148753434419632,
"learning_rate": 9.947904402056024e-05,
"loss": 0.0045,
"step": 9370
},
{
"epoch": 234.5,
"grad_norm": 0.10289934277534485,
"learning_rate": 9.947666068139708e-05,
"loss": 0.0055,
"step": 9380
},
{
"epoch": 234.75,
"grad_norm": 0.11406154930591583,
"learning_rate": 9.947427193154071e-05,
"loss": 0.0043,
"step": 9390
},
{
"epoch": 235.0,
"grad_norm": 0.13602414727210999,
"learning_rate": 9.947187777125233e-05,
"loss": 0.0055,
"step": 9400
},
{
"epoch": 235.25,
"grad_norm": 0.10251234471797943,
"learning_rate": 9.946947820079377e-05,
"loss": 0.0043,
"step": 9410
},
{
"epoch": 235.5,
"grad_norm": 0.11955104768276215,
"learning_rate": 9.946707322042747e-05,
"loss": 0.0038,
"step": 9420
},
{
"epoch": 235.75,
"grad_norm": 0.1253117173910141,
"learning_rate": 9.94646628304164e-05,
"loss": 0.0045,
"step": 9430
},
{
"epoch": 236.0,
"grad_norm": 0.13483910262584686,
"learning_rate": 9.946224703102418e-05,
"loss": 0.0046,
"step": 9440
},
{
"epoch": 236.25,
"grad_norm": 0.13329099118709564,
"learning_rate": 9.945982582251498e-05,
"loss": 0.0042,
"step": 9450
},
{
"epoch": 236.5,
"grad_norm": 0.11717459559440613,
"learning_rate": 9.94573992051536e-05,
"loss": 0.0046,
"step": 9460
},
{
"epoch": 236.75,
"grad_norm": 0.14019109308719635,
"learning_rate": 9.94549671792054e-05,
"loss": 0.0051,
"step": 9470
},
{
"epoch": 237.0,
"grad_norm": 0.13346461951732635,
"learning_rate": 9.945252974493635e-05,
"loss": 0.004,
"step": 9480
},
{
"epoch": 237.25,
"grad_norm": 0.14633530378341675,
"learning_rate": 9.9450086902613e-05,
"loss": 0.0045,
"step": 9490
},
{
"epoch": 237.5,
"grad_norm": 0.11529278010129929,
"learning_rate": 9.944763865250248e-05,
"loss": 0.0043,
"step": 9500
},
{
"epoch": 237.75,
"grad_norm": 0.11980587244033813,
"learning_rate": 9.944518499487254e-05,
"loss": 0.0039,
"step": 9510
},
{
"epoch": 238.0,
"grad_norm": 0.1261284202337265,
"learning_rate": 9.944272592999151e-05,
"loss": 0.0046,
"step": 9520
},
{
"epoch": 238.25,
"grad_norm": 0.09479007124900818,
"learning_rate": 9.94402614581283e-05,
"loss": 0.0037,
"step": 9530
},
{
"epoch": 238.5,
"grad_norm": 0.12620887160301208,
"learning_rate": 9.943779157955244e-05,
"loss": 0.0048,
"step": 9540
},
{
"epoch": 238.75,
"grad_norm": 0.1151387169957161,
"learning_rate": 9.943531629453403e-05,
"loss": 0.0043,
"step": 9550
},
{
"epoch": 239.0,
"grad_norm": 0.14887531101703644,
"learning_rate": 9.943283560334375e-05,
"loss": 0.0038,
"step": 9560
},
{
"epoch": 239.25,
"grad_norm": 0.13858802616596222,
"learning_rate": 9.943034950625288e-05,
"loss": 0.0038,
"step": 9570
},
{
"epoch": 239.5,
"grad_norm": 0.11963634192943573,
"learning_rate": 9.942785800353332e-05,
"loss": 0.0047,
"step": 9580
},
{
"epoch": 239.75,
"grad_norm": 0.13546070456504822,
"learning_rate": 9.942536109545751e-05,
"loss": 0.0041,
"step": 9590
},
{
"epoch": 240.0,
"grad_norm": 0.12385343760251999,
"learning_rate": 9.942285878229853e-05,
"loss": 0.0038,
"step": 9600
},
{
"epoch": 240.25,
"grad_norm": 0.10541026294231415,
"learning_rate": 9.942035106433001e-05,
"loss": 0.0042,
"step": 9610
},
{
"epoch": 240.5,
"grad_norm": 0.10453888773918152,
"learning_rate": 9.94178379418262e-05,
"loss": 0.0047,
"step": 9620
},
{
"epoch": 240.75,
"grad_norm": 0.10397256165742874,
"learning_rate": 9.941531941506194e-05,
"loss": 0.0041,
"step": 9630
},
{
"epoch": 241.0,
"grad_norm": 0.09260208904743195,
"learning_rate": 9.941279548431263e-05,
"loss": 0.0042,
"step": 9640
},
{
"epoch": 241.25,
"grad_norm": 0.11214695125818253,
"learning_rate": 9.941026614985431e-05,
"loss": 0.0044,
"step": 9650
},
{
"epoch": 241.5,
"grad_norm": 0.10193338245153427,
"learning_rate": 9.940773141196357e-05,
"loss": 0.0039,
"step": 9660
},
{
"epoch": 241.75,
"grad_norm": 0.09992244839668274,
"learning_rate": 9.94051912709176e-05,
"loss": 0.004,
"step": 9670
},
{
"epoch": 242.0,
"grad_norm": 0.1305544674396515,
"learning_rate": 9.940264572699421e-05,
"loss": 0.0037,
"step": 9680
},
{
"epoch": 242.25,
"grad_norm": 0.11810918897390366,
"learning_rate": 9.940009478047174e-05,
"loss": 0.0047,
"step": 9690
},
{
"epoch": 242.5,
"grad_norm": 0.08566198498010635,
"learning_rate": 9.939753843162918e-05,
"loss": 0.0036,
"step": 9700
},
{
"epoch": 242.75,
"grad_norm": 0.09429420530796051,
"learning_rate": 9.939497668074609e-05,
"loss": 0.0044,
"step": 9710
},
{
"epoch": 243.0,
"grad_norm": 0.09578400105237961,
"learning_rate": 9.93924095281026e-05,
"loss": 0.0046,
"step": 9720
},
{
"epoch": 243.25,
"grad_norm": 0.09713154286146164,
"learning_rate": 9.938983697397948e-05,
"loss": 0.004,
"step": 9730
},
{
"epoch": 243.5,
"grad_norm": 0.10905840992927551,
"learning_rate": 9.938725901865805e-05,
"loss": 0.004,
"step": 9740
},
{
"epoch": 243.75,
"grad_norm": 0.11417087912559509,
"learning_rate": 9.93846756624202e-05,
"loss": 0.0043,
"step": 9750
},
{
"epoch": 244.0,
"grad_norm": 0.1219557598233223,
"learning_rate": 9.938208690554849e-05,
"loss": 0.0039,
"step": 9760
},
{
"epoch": 244.25,
"grad_norm": 0.11892379820346832,
"learning_rate": 9.9379492748326e-05,
"loss": 0.0039,
"step": 9770
},
{
"epoch": 244.5,
"grad_norm": 0.13812822103500366,
"learning_rate": 9.937689319103641e-05,
"loss": 0.0037,
"step": 9780
},
{
"epoch": 244.75,
"grad_norm": 0.08162184059619904,
"learning_rate": 9.937428823396404e-05,
"loss": 0.0039,
"step": 9790
},
{
"epoch": 245.0,
"grad_norm": 0.11298015713691711,
"learning_rate": 9.937167787739372e-05,
"loss": 0.0041,
"step": 9800
},
{
"epoch": 245.25,
"grad_norm": 0.10408628731966019,
"learning_rate": 9.936906212161095e-05,
"loss": 0.0044,
"step": 9810
},
{
"epoch": 245.5,
"grad_norm": 0.1286010593175888,
"learning_rate": 9.936644096690176e-05,
"loss": 0.0044,
"step": 9820
},
{
"epoch": 245.75,
"grad_norm": 0.12994638085365295,
"learning_rate": 9.936381441355282e-05,
"loss": 0.0049,
"step": 9830
},
{
"epoch": 246.0,
"grad_norm": 0.13015589118003845,
"learning_rate": 9.936118246185136e-05,
"loss": 0.005,
"step": 9840
},
{
"epoch": 246.25,
"grad_norm": 0.1136460080742836,
"learning_rate": 9.935854511208518e-05,
"loss": 0.0047,
"step": 9850
},
{
"epoch": 246.5,
"grad_norm": 0.12648144364356995,
"learning_rate": 9.935590236454272e-05,
"loss": 0.0044,
"step": 9860
},
{
"epoch": 246.75,
"grad_norm": 0.11533254384994507,
"learning_rate": 9.935325421951298e-05,
"loss": 0.0042,
"step": 9870
},
{
"epoch": 247.0,
"grad_norm": 0.11005179584026337,
"learning_rate": 9.935060067728557e-05,
"loss": 0.0032,
"step": 9880
},
{
"epoch": 247.25,
"grad_norm": 0.1126861721277237,
"learning_rate": 9.934794173815067e-05,
"loss": 0.0043,
"step": 9890
},
{
"epoch": 247.5,
"grad_norm": 0.11560443043708801,
"learning_rate": 9.934527740239906e-05,
"loss": 0.0045,
"step": 9900
},
{
"epoch": 247.75,
"grad_norm": 0.11774428933858871,
"learning_rate": 9.934260767032209e-05,
"loss": 0.0043,
"step": 9910
},
{
"epoch": 248.0,
"grad_norm": 0.11532504856586456,
"learning_rate": 9.933993254221172e-05,
"loss": 0.0045,
"step": 9920
},
{
"epoch": 248.25,
"grad_norm": 0.08799305558204651,
"learning_rate": 9.933725201836053e-05,
"loss": 0.0049,
"step": 9930
},
{
"epoch": 248.5,
"grad_norm": 0.11146137118339539,
"learning_rate": 9.933456609906162e-05,
"loss": 0.0039,
"step": 9940
},
{
"epoch": 248.75,
"grad_norm": 0.08155910670757294,
"learning_rate": 9.933187478460875e-05,
"loss": 0.0047,
"step": 9950
},
{
"epoch": 249.0,
"grad_norm": 0.08949435502290726,
"learning_rate": 9.93291780752962e-05,
"loss": 0.0048,
"step": 9960
},
{
"epoch": 249.25,
"grad_norm": 0.10293368995189667,
"learning_rate": 9.932647597141893e-05,
"loss": 0.004,
"step": 9970
},
{
"epoch": 249.5,
"grad_norm": 0.09809095412492752,
"learning_rate": 9.932376847327239e-05,
"loss": 0.004,
"step": 9980
},
{
"epoch": 249.75,
"grad_norm": 0.08202514797449112,
"learning_rate": 9.932105558115268e-05,
"loss": 0.0036,
"step": 9990
},
{
"epoch": 250.0,
"grad_norm": 0.12790925800800323,
"learning_rate": 9.931833729535651e-05,
"loss": 0.0048,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2500,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}