serving_10k / trainer_state.json
Dongkkka's picture
Upload folder using huggingface_hub
af530e5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"grad_norm": 1.775665283203125,
"learning_rate": 1.8e-07,
"loss": 1.2459,
"step": 10
},
{
"grad_norm": 1.8006603717803955,
"learning_rate": 3.8e-07,
"loss": 1.2474,
"step": 20
},
{
"grad_norm": 1.7776480913162231,
"learning_rate": 5.8e-07,
"loss": 1.2534,
"step": 30
},
{
"grad_norm": 1.6365658044815063,
"learning_rate": 7.8e-07,
"loss": 1.2435,
"step": 40
},
{
"grad_norm": 1.465846061706543,
"learning_rate": 9.8e-07,
"loss": 1.2257,
"step": 50
},
{
"grad_norm": 1.0099663734436035,
"learning_rate": 1.18e-06,
"loss": 1.205,
"step": 60
},
{
"grad_norm": 0.7630435824394226,
"learning_rate": 1.3800000000000001e-06,
"loss": 1.1777,
"step": 70
},
{
"grad_norm": 0.4639379680156708,
"learning_rate": 1.5800000000000003e-06,
"loss": 1.1724,
"step": 80
},
{
"grad_norm": 0.36149483919143677,
"learning_rate": 1.7800000000000001e-06,
"loss": 1.1488,
"step": 90
},
{
"grad_norm": 0.37011831998825073,
"learning_rate": 1.98e-06,
"loss": 1.1551,
"step": 100
},
{
"grad_norm": 0.28454360365867615,
"learning_rate": 2.1800000000000003e-06,
"loss": 1.1449,
"step": 110
},
{
"grad_norm": 0.31042224168777466,
"learning_rate": 2.38e-06,
"loss": 1.146,
"step": 120
},
{
"grad_norm": 0.30090591311454773,
"learning_rate": 2.5800000000000003e-06,
"loss": 1.1378,
"step": 130
},
{
"grad_norm": 0.3730885684490204,
"learning_rate": 2.78e-06,
"loss": 1.1428,
"step": 140
},
{
"grad_norm": 0.2819376289844513,
"learning_rate": 2.9800000000000003e-06,
"loss": 1.1497,
"step": 150
},
{
"grad_norm": 0.33462652564048767,
"learning_rate": 3.1800000000000005e-06,
"loss": 1.1373,
"step": 160
},
{
"grad_norm": 0.21172603964805603,
"learning_rate": 3.38e-06,
"loss": 1.1329,
"step": 170
},
{
"grad_norm": 0.2958638370037079,
"learning_rate": 3.58e-06,
"loss": 1.1405,
"step": 180
},
{
"grad_norm": 0.3197462260723114,
"learning_rate": 3.7800000000000002e-06,
"loss": 1.1269,
"step": 190
},
{
"grad_norm": 0.36853522062301636,
"learning_rate": 3.98e-06,
"loss": 1.1312,
"step": 200
},
{
"grad_norm": 0.41355302929878235,
"learning_rate": 4.18e-06,
"loss": 1.1263,
"step": 210
},
{
"grad_norm": 0.41098693013191223,
"learning_rate": 4.38e-06,
"loss": 1.1298,
"step": 220
},
{
"grad_norm": 0.2925293445587158,
"learning_rate": 4.58e-06,
"loss": 1.1182,
"step": 230
},
{
"grad_norm": 0.5053501129150391,
"learning_rate": 4.780000000000001e-06,
"loss": 1.1138,
"step": 240
},
{
"grad_norm": 0.3710362911224365,
"learning_rate": 4.98e-06,
"loss": 1.0948,
"step": 250
},
{
"grad_norm": 0.37328803539276123,
"learning_rate": 5.18e-06,
"loss": 1.0996,
"step": 260
},
{
"grad_norm": 0.39009347558021545,
"learning_rate": 5.38e-06,
"loss": 1.0835,
"step": 270
},
{
"grad_norm": 0.4309180974960327,
"learning_rate": 5.580000000000001e-06,
"loss": 1.0791,
"step": 280
},
{
"grad_norm": 0.4729507863521576,
"learning_rate": 5.78e-06,
"loss": 1.0567,
"step": 290
},
{
"grad_norm": 0.6347916722297668,
"learning_rate": 5.98e-06,
"loss": 1.0571,
"step": 300
},
{
"grad_norm": 0.5822917819023132,
"learning_rate": 6.18e-06,
"loss": 1.0539,
"step": 310
},
{
"grad_norm": 0.47035670280456543,
"learning_rate": 6.38e-06,
"loss": 1.0484,
"step": 320
},
{
"grad_norm": 0.4800194501876831,
"learning_rate": 6.58e-06,
"loss": 1.0384,
"step": 330
},
{
"grad_norm": 0.5258088111877441,
"learning_rate": 6.78e-06,
"loss": 1.0289,
"step": 340
},
{
"grad_norm": 0.5151435732841492,
"learning_rate": 6.98e-06,
"loss": 1.0326,
"step": 350
},
{
"grad_norm": 0.7264977097511292,
"learning_rate": 7.180000000000001e-06,
"loss": 1.0114,
"step": 360
},
{
"grad_norm": 0.8875923156738281,
"learning_rate": 7.3800000000000005e-06,
"loss": 0.9882,
"step": 370
},
{
"grad_norm": 0.8583943843841553,
"learning_rate": 7.580000000000001e-06,
"loss": 0.9699,
"step": 380
},
{
"grad_norm": 0.8830875754356384,
"learning_rate": 7.78e-06,
"loss": 0.9492,
"step": 390
},
{
"grad_norm": 1.2209216356277466,
"learning_rate": 7.98e-06,
"loss": 0.9143,
"step": 400
},
{
"grad_norm": 1.1229692697525024,
"learning_rate": 8.18e-06,
"loss": 0.8809,
"step": 410
},
{
"grad_norm": 1.2567274570465088,
"learning_rate": 8.380000000000001e-06,
"loss": 0.8308,
"step": 420
},
{
"grad_norm": 1.3022184371948242,
"learning_rate": 8.580000000000001e-06,
"loss": 0.8125,
"step": 430
},
{
"grad_norm": 1.4625277519226074,
"learning_rate": 8.78e-06,
"loss": 0.7822,
"step": 440
},
{
"grad_norm": 1.5630838871002197,
"learning_rate": 8.98e-06,
"loss": 0.7605,
"step": 450
},
{
"grad_norm": 1.3062169551849365,
"learning_rate": 9.180000000000002e-06,
"loss": 0.7316,
"step": 460
},
{
"grad_norm": 1.480367660522461,
"learning_rate": 9.38e-06,
"loss": 0.7145,
"step": 470
},
{
"grad_norm": 1.6373467445373535,
"learning_rate": 9.58e-06,
"loss": 0.6868,
"step": 480
},
{
"grad_norm": 1.2964894771575928,
"learning_rate": 9.78e-06,
"loss": 0.6551,
"step": 490
},
{
"grad_norm": 1.2476259469985962,
"learning_rate": 9.980000000000001e-06,
"loss": 0.6387,
"step": 500
},
{
"grad_norm": 1.2335081100463867,
"learning_rate": 1.018e-05,
"loss": 0.614,
"step": 510
},
{
"grad_norm": 1.3435138463974,
"learning_rate": 1.038e-05,
"loss": 0.6005,
"step": 520
},
{
"grad_norm": 1.5091711282730103,
"learning_rate": 1.058e-05,
"loss": 0.5758,
"step": 530
},
{
"grad_norm": 1.3913429975509644,
"learning_rate": 1.0780000000000002e-05,
"loss": 0.5535,
"step": 540
},
{
"grad_norm": 1.5673116445541382,
"learning_rate": 1.098e-05,
"loss": 0.5258,
"step": 550
},
{
"grad_norm": 1.2061394453048706,
"learning_rate": 1.118e-05,
"loss": 0.5042,
"step": 560
},
{
"grad_norm": 1.7407894134521484,
"learning_rate": 1.1380000000000001e-05,
"loss": 0.4777,
"step": 570
},
{
"grad_norm": 1.718432068824768,
"learning_rate": 1.1580000000000001e-05,
"loss": 0.4431,
"step": 580
},
{
"grad_norm": 1.4845730066299438,
"learning_rate": 1.178e-05,
"loss": 0.4007,
"step": 590
},
{
"grad_norm": 1.5443795919418335,
"learning_rate": 1.198e-05,
"loss": 0.3775,
"step": 600
},
{
"grad_norm": 1.5730947256088257,
"learning_rate": 1.2180000000000002e-05,
"loss": 0.3443,
"step": 610
},
{
"grad_norm": 1.5342825651168823,
"learning_rate": 1.238e-05,
"loss": 0.3318,
"step": 620
},
{
"grad_norm": 1.8967719078063965,
"learning_rate": 1.258e-05,
"loss": 0.3008,
"step": 630
},
{
"grad_norm": 2.1637165546417236,
"learning_rate": 1.278e-05,
"loss": 0.2707,
"step": 640
},
{
"grad_norm": 2.3026540279388428,
"learning_rate": 1.2980000000000001e-05,
"loss": 0.2594,
"step": 650
},
{
"grad_norm": 1.9306834936141968,
"learning_rate": 1.3180000000000001e-05,
"loss": 0.2457,
"step": 660
},
{
"grad_norm": 1.636735200881958,
"learning_rate": 1.338e-05,
"loss": 0.2194,
"step": 670
},
{
"grad_norm": 1.8122118711471558,
"learning_rate": 1.358e-05,
"loss": 0.2064,
"step": 680
},
{
"grad_norm": 1.933800220489502,
"learning_rate": 1.3780000000000002e-05,
"loss": 0.2038,
"step": 690
},
{
"grad_norm": 1.6010997295379639,
"learning_rate": 1.3980000000000002e-05,
"loss": 0.193,
"step": 700
},
{
"grad_norm": 2.043034791946411,
"learning_rate": 1.4180000000000001e-05,
"loss": 0.2065,
"step": 710
},
{
"grad_norm": 1.6404719352722168,
"learning_rate": 1.4380000000000001e-05,
"loss": 0.1811,
"step": 720
},
{
"grad_norm": 2.222703456878662,
"learning_rate": 1.4580000000000003e-05,
"loss": 0.1607,
"step": 730
},
{
"grad_norm": 1.5441687107086182,
"learning_rate": 1.4779999999999999e-05,
"loss": 0.1745,
"step": 740
},
{
"grad_norm": 1.9696680307388306,
"learning_rate": 1.4979999999999999e-05,
"loss": 0.1705,
"step": 750
},
{
"grad_norm": 1.7248599529266357,
"learning_rate": 1.518e-05,
"loss": 0.171,
"step": 760
},
{
"grad_norm": 2.310361385345459,
"learning_rate": 1.538e-05,
"loss": 0.1628,
"step": 770
},
{
"grad_norm": 1.7767536640167236,
"learning_rate": 1.558e-05,
"loss": 0.1599,
"step": 780
},
{
"grad_norm": 1.3809053897857666,
"learning_rate": 1.578e-05,
"loss": 0.1561,
"step": 790
},
{
"grad_norm": 1.8111076354980469,
"learning_rate": 1.598e-05,
"loss": 0.1543,
"step": 800
},
{
"grad_norm": 2.080385684967041,
"learning_rate": 1.618e-05,
"loss": 0.1486,
"step": 810
},
{
"grad_norm": 1.756594181060791,
"learning_rate": 1.6380000000000002e-05,
"loss": 0.1459,
"step": 820
},
{
"grad_norm": 1.6584662199020386,
"learning_rate": 1.658e-05,
"loss": 0.1443,
"step": 830
},
{
"grad_norm": 1.4734628200531006,
"learning_rate": 1.6780000000000002e-05,
"loss": 0.1365,
"step": 840
},
{
"grad_norm": 2.3412086963653564,
"learning_rate": 1.698e-05,
"loss": 0.141,
"step": 850
},
{
"grad_norm": 1.933574914932251,
"learning_rate": 1.718e-05,
"loss": 0.1488,
"step": 860
},
{
"grad_norm": 2.0521068572998047,
"learning_rate": 1.7380000000000003e-05,
"loss": 0.1337,
"step": 870
},
{
"grad_norm": 1.7398793697357178,
"learning_rate": 1.758e-05,
"loss": 0.1379,
"step": 880
},
{
"grad_norm": 1.6098215579986572,
"learning_rate": 1.7780000000000003e-05,
"loss": 0.125,
"step": 890
},
{
"grad_norm": 1.6967601776123047,
"learning_rate": 1.798e-05,
"loss": 0.1185,
"step": 900
},
{
"grad_norm": 1.6858727931976318,
"learning_rate": 1.818e-05,
"loss": 0.1179,
"step": 910
},
{
"grad_norm": 1.334216833114624,
"learning_rate": 1.838e-05,
"loss": 0.1197,
"step": 920
},
{
"grad_norm": 1.690228819847107,
"learning_rate": 1.858e-05,
"loss": 0.1209,
"step": 930
},
{
"grad_norm": 2.309962511062622,
"learning_rate": 1.878e-05,
"loss": 0.1162,
"step": 940
},
{
"grad_norm": 1.2557134628295898,
"learning_rate": 1.898e-05,
"loss": 0.1202,
"step": 950
},
{
"grad_norm": 1.8468115329742432,
"learning_rate": 1.918e-05,
"loss": 0.1206,
"step": 960
},
{
"grad_norm": 2.1588213443756104,
"learning_rate": 1.938e-05,
"loss": 0.1183,
"step": 970
},
{
"grad_norm": 1.9463591575622559,
"learning_rate": 1.9580000000000002e-05,
"loss": 0.1196,
"step": 980
},
{
"grad_norm": 1.909501075744629,
"learning_rate": 1.978e-05,
"loss": 0.118,
"step": 990
},
{
"grad_norm": 1.6289474964141846,
"learning_rate": 1.9980000000000002e-05,
"loss": 0.118,
"step": 1000
},
{
"grad_norm": 1.5920497179031372,
"learning_rate": 2.0180000000000003e-05,
"loss": 0.1128,
"step": 1010
},
{
"grad_norm": 1.5876258611679077,
"learning_rate": 2.038e-05,
"loss": 0.1152,
"step": 1020
},
{
"grad_norm": 1.4641977548599243,
"learning_rate": 2.0580000000000003e-05,
"loss": 0.1165,
"step": 1030
},
{
"grad_norm": 1.3542155027389526,
"learning_rate": 2.078e-05,
"loss": 0.1063,
"step": 1040
},
{
"grad_norm": 1.9370126724243164,
"learning_rate": 2.098e-05,
"loss": 0.1041,
"step": 1050
},
{
"grad_norm": 1.5336304903030396,
"learning_rate": 2.118e-05,
"loss": 0.1033,
"step": 1060
},
{
"grad_norm": 1.911483883857727,
"learning_rate": 2.138e-05,
"loss": 0.1026,
"step": 1070
},
{
"grad_norm": 1.7599859237670898,
"learning_rate": 2.158e-05,
"loss": 0.0954,
"step": 1080
},
{
"grad_norm": 1.656410813331604,
"learning_rate": 2.178e-05,
"loss": 0.0997,
"step": 1090
},
{
"grad_norm": 2.2407729625701904,
"learning_rate": 2.198e-05,
"loss": 0.1049,
"step": 1100
},
{
"grad_norm": 1.480260968208313,
"learning_rate": 2.218e-05,
"loss": 0.0916,
"step": 1110
},
{
"grad_norm": 1.7908304929733276,
"learning_rate": 2.2380000000000003e-05,
"loss": 0.1032,
"step": 1120
},
{
"grad_norm": 2.3818678855895996,
"learning_rate": 2.258e-05,
"loss": 0.1005,
"step": 1130
},
{
"grad_norm": 1.8040275573730469,
"learning_rate": 2.2780000000000002e-05,
"loss": 0.098,
"step": 1140
},
{
"grad_norm": 1.6693925857543945,
"learning_rate": 2.298e-05,
"loss": 0.1035,
"step": 1150
},
{
"grad_norm": 1.7212457656860352,
"learning_rate": 2.318e-05,
"loss": 0.095,
"step": 1160
},
{
"grad_norm": 1.7259304523468018,
"learning_rate": 2.3380000000000003e-05,
"loss": 0.0978,
"step": 1170
},
{
"grad_norm": 1.6136237382888794,
"learning_rate": 2.358e-05,
"loss": 0.0983,
"step": 1180
},
{
"grad_norm": 1.4225982427597046,
"learning_rate": 2.3780000000000003e-05,
"loss": 0.0957,
"step": 1190
},
{
"grad_norm": 1.3380581140518188,
"learning_rate": 2.398e-05,
"loss": 0.0985,
"step": 1200
},
{
"grad_norm": 1.51802659034729,
"learning_rate": 2.418e-05,
"loss": 0.0929,
"step": 1210
},
{
"grad_norm": 1.5779268741607666,
"learning_rate": 2.438e-05,
"loss": 0.0872,
"step": 1220
},
{
"grad_norm": 1.709097981452942,
"learning_rate": 2.4580000000000002e-05,
"loss": 0.0881,
"step": 1230
},
{
"grad_norm": 1.5976459980010986,
"learning_rate": 2.478e-05,
"loss": 0.0982,
"step": 1240
},
{
"grad_norm": 1.5077290534973145,
"learning_rate": 2.498e-05,
"loss": 0.0947,
"step": 1250
},
{
"grad_norm": 1.671633005142212,
"learning_rate": 2.5180000000000003e-05,
"loss": 0.0803,
"step": 1260
},
{
"grad_norm": 1.733736515045166,
"learning_rate": 2.5380000000000004e-05,
"loss": 0.0978,
"step": 1270
},
{
"grad_norm": 1.736934781074524,
"learning_rate": 2.5580000000000002e-05,
"loss": 0.0887,
"step": 1280
},
{
"grad_norm": 1.7765599489212036,
"learning_rate": 2.5779999999999997e-05,
"loss": 0.0922,
"step": 1290
},
{
"grad_norm": 1.616905689239502,
"learning_rate": 2.598e-05,
"loss": 0.0906,
"step": 1300
},
{
"grad_norm": 1.6434909105300903,
"learning_rate": 2.618e-05,
"loss": 0.0817,
"step": 1310
},
{
"grad_norm": 1.4403036832809448,
"learning_rate": 2.6379999999999998e-05,
"loss": 0.1014,
"step": 1320
},
{
"grad_norm": 1.3139845132827759,
"learning_rate": 2.658e-05,
"loss": 0.0843,
"step": 1330
},
{
"grad_norm": 1.5314061641693115,
"learning_rate": 2.678e-05,
"loss": 0.0789,
"step": 1340
},
{
"grad_norm": 1.670033574104309,
"learning_rate": 2.698e-05,
"loss": 0.0892,
"step": 1350
},
{
"grad_norm": 1.468614935874939,
"learning_rate": 2.718e-05,
"loss": 0.0926,
"step": 1360
},
{
"grad_norm": 1.516029715538025,
"learning_rate": 2.738e-05,
"loss": 0.0826,
"step": 1370
},
{
"grad_norm": 1.4946731328964233,
"learning_rate": 2.758e-05,
"loss": 0.0782,
"step": 1380
},
{
"grad_norm": 1.6159133911132812,
"learning_rate": 2.778e-05,
"loss": 0.0743,
"step": 1390
},
{
"grad_norm": 1.2443039417266846,
"learning_rate": 2.798e-05,
"loss": 0.0766,
"step": 1400
},
{
"grad_norm": 1.6355807781219482,
"learning_rate": 2.818e-05,
"loss": 0.0783,
"step": 1410
},
{
"grad_norm": 1.1233558654785156,
"learning_rate": 2.8380000000000003e-05,
"loss": 0.0802,
"step": 1420
},
{
"grad_norm": 1.4807885885238647,
"learning_rate": 2.858e-05,
"loss": 0.0916,
"step": 1430
},
{
"grad_norm": 1.8424674272537231,
"learning_rate": 2.8780000000000002e-05,
"loss": 0.0784,
"step": 1440
},
{
"grad_norm": 1.2739874124526978,
"learning_rate": 2.898e-05,
"loss": 0.0726,
"step": 1450
},
{
"grad_norm": 1.5551581382751465,
"learning_rate": 2.9180000000000002e-05,
"loss": 0.0829,
"step": 1460
},
{
"grad_norm": 1.5547388792037964,
"learning_rate": 2.9380000000000003e-05,
"loss": 0.0751,
"step": 1470
},
{
"grad_norm": 1.2586749792099,
"learning_rate": 2.958e-05,
"loss": 0.0763,
"step": 1480
},
{
"grad_norm": 1.2998220920562744,
"learning_rate": 2.9780000000000003e-05,
"loss": 0.0787,
"step": 1490
},
{
"grad_norm": 1.1537818908691406,
"learning_rate": 2.998e-05,
"loss": 0.0751,
"step": 1500
},
{
"grad_norm": 1.569019079208374,
"learning_rate": 3.0180000000000002e-05,
"loss": 0.0791,
"step": 1510
},
{
"grad_norm": 1.9538917541503906,
"learning_rate": 3.0380000000000004e-05,
"loss": 0.0779,
"step": 1520
},
{
"grad_norm": 1.8163516521453857,
"learning_rate": 3.058e-05,
"loss": 0.075,
"step": 1530
},
{
"grad_norm": 1.3041982650756836,
"learning_rate": 3.078e-05,
"loss": 0.0784,
"step": 1540
},
{
"grad_norm": 1.7077465057373047,
"learning_rate": 3.0980000000000005e-05,
"loss": 0.0773,
"step": 1550
},
{
"grad_norm": 1.2418750524520874,
"learning_rate": 3.118e-05,
"loss": 0.0768,
"step": 1560
},
{
"grad_norm": 1.398686170578003,
"learning_rate": 3.138e-05,
"loss": 0.0774,
"step": 1570
},
{
"grad_norm": 1.4768542051315308,
"learning_rate": 3.1580000000000006e-05,
"loss": 0.0725,
"step": 1580
},
{
"grad_norm": 1.1355159282684326,
"learning_rate": 3.1780000000000004e-05,
"loss": 0.073,
"step": 1590
},
{
"grad_norm": 1.2840240001678467,
"learning_rate": 3.198e-05,
"loss": 0.0796,
"step": 1600
},
{
"grad_norm": 1.4005167484283447,
"learning_rate": 3.218e-05,
"loss": 0.0717,
"step": 1610
},
{
"grad_norm": 1.4848135709762573,
"learning_rate": 3.238e-05,
"loss": 0.0699,
"step": 1620
},
{
"grad_norm": 1.502197265625,
"learning_rate": 3.2579999999999996e-05,
"loss": 0.0775,
"step": 1630
},
{
"grad_norm": 1.7683912515640259,
"learning_rate": 3.278e-05,
"loss": 0.0768,
"step": 1640
},
{
"grad_norm": 1.1830830574035645,
"learning_rate": 3.298e-05,
"loss": 0.0702,
"step": 1650
},
{
"grad_norm": 1.4201654195785522,
"learning_rate": 3.318e-05,
"loss": 0.0685,
"step": 1660
},
{
"grad_norm": 1.1277825832366943,
"learning_rate": 3.338e-05,
"loss": 0.0732,
"step": 1670
},
{
"grad_norm": 0.9588015079498291,
"learning_rate": 3.358e-05,
"loss": 0.0702,
"step": 1680
},
{
"grad_norm": 1.0491678714752197,
"learning_rate": 3.378e-05,
"loss": 0.0699,
"step": 1690
},
{
"grad_norm": 1.3812892436981201,
"learning_rate": 3.398e-05,
"loss": 0.0852,
"step": 1700
},
{
"grad_norm": 1.305217981338501,
"learning_rate": 3.418e-05,
"loss": 0.0723,
"step": 1710
},
{
"grad_norm": 1.474246859550476,
"learning_rate": 3.438e-05,
"loss": 0.0721,
"step": 1720
},
{
"grad_norm": 1.3952361345291138,
"learning_rate": 3.4580000000000004e-05,
"loss": 0.0744,
"step": 1730
},
{
"grad_norm": 1.1891484260559082,
"learning_rate": 3.478e-05,
"loss": 0.0667,
"step": 1740
},
{
"grad_norm": 1.1835684776306152,
"learning_rate": 3.498e-05,
"loss": 0.0666,
"step": 1750
},
{
"grad_norm": 1.1999907493591309,
"learning_rate": 3.518e-05,
"loss": 0.0648,
"step": 1760
},
{
"grad_norm": 1.2668675184249878,
"learning_rate": 3.5380000000000003e-05,
"loss": 0.0813,
"step": 1770
},
{
"grad_norm": 1.2897546291351318,
"learning_rate": 3.558e-05,
"loss": 0.0716,
"step": 1780
},
{
"grad_norm": 1.3775722980499268,
"learning_rate": 3.578e-05,
"loss": 0.0694,
"step": 1790
},
{
"grad_norm": 1.5125720500946045,
"learning_rate": 3.5980000000000004e-05,
"loss": 0.0769,
"step": 1800
},
{
"grad_norm": 1.5418047904968262,
"learning_rate": 3.618e-05,
"loss": 0.0681,
"step": 1810
},
{
"grad_norm": 1.5127960443496704,
"learning_rate": 3.638e-05,
"loss": 0.0696,
"step": 1820
},
{
"grad_norm": 1.3286449909210205,
"learning_rate": 3.6580000000000006e-05,
"loss": 0.0658,
"step": 1830
},
{
"grad_norm": 1.3748996257781982,
"learning_rate": 3.6780000000000004e-05,
"loss": 0.0652,
"step": 1840
},
{
"grad_norm": 1.3369362354278564,
"learning_rate": 3.698e-05,
"loss": 0.0674,
"step": 1850
},
{
"grad_norm": 1.2467151880264282,
"learning_rate": 3.7180000000000007e-05,
"loss": 0.0633,
"step": 1860
},
{
"grad_norm": 1.494481086730957,
"learning_rate": 3.7380000000000005e-05,
"loss": 0.0675,
"step": 1870
},
{
"grad_norm": 1.4153687953948975,
"learning_rate": 3.758e-05,
"loss": 0.0638,
"step": 1880
},
{
"grad_norm": 1.2345432043075562,
"learning_rate": 3.778000000000001e-05,
"loss": 0.0602,
"step": 1890
},
{
"grad_norm": 1.743789553642273,
"learning_rate": 3.7980000000000006e-05,
"loss": 0.0685,
"step": 1900
},
{
"grad_norm": 1.361466646194458,
"learning_rate": 3.818e-05,
"loss": 0.0684,
"step": 1910
},
{
"grad_norm": 1.256642460823059,
"learning_rate": 3.838e-05,
"loss": 0.0611,
"step": 1920
},
{
"grad_norm": 1.037825107574463,
"learning_rate": 3.858e-05,
"loss": 0.0638,
"step": 1930
},
{
"grad_norm": 1.2323062419891357,
"learning_rate": 3.878e-05,
"loss": 0.0794,
"step": 1940
},
{
"grad_norm": 1.1717441082000732,
"learning_rate": 3.898e-05,
"loss": 0.0642,
"step": 1950
},
{
"grad_norm": 1.1457865238189697,
"learning_rate": 3.918e-05,
"loss": 0.0638,
"step": 1960
},
{
"grad_norm": 1.339479923248291,
"learning_rate": 3.938e-05,
"loss": 0.0666,
"step": 1970
},
{
"grad_norm": 1.3312828540802002,
"learning_rate": 3.958e-05,
"loss": 0.071,
"step": 1980
},
{
"grad_norm": 0.9382027387619019,
"learning_rate": 3.978e-05,
"loss": 0.0596,
"step": 1990
},
{
"grad_norm": 1.4129154682159424,
"learning_rate": 3.998e-05,
"loss": 0.0636,
"step": 2000
},
{
"grad_norm": 1.1233935356140137,
"learning_rate": 4.018e-05,
"loss": 0.0666,
"step": 2010
},
{
"grad_norm": 1.1201488971710205,
"learning_rate": 4.038e-05,
"loss": 0.0678,
"step": 2020
},
{
"grad_norm": 1.1399140357971191,
"learning_rate": 4.058e-05,
"loss": 0.058,
"step": 2030
},
{
"grad_norm": 1.1462037563323975,
"learning_rate": 4.078e-05,
"loss": 0.0653,
"step": 2040
},
{
"grad_norm": 1.2740496397018433,
"learning_rate": 4.0980000000000004e-05,
"loss": 0.0672,
"step": 2050
},
{
"grad_norm": 1.1629321575164795,
"learning_rate": 4.118e-05,
"loss": 0.0634,
"step": 2060
},
{
"grad_norm": 1.2404067516326904,
"learning_rate": 4.138e-05,
"loss": 0.0683,
"step": 2070
},
{
"grad_norm": 0.708964467048645,
"learning_rate": 4.1580000000000005e-05,
"loss": 0.0676,
"step": 2080
},
{
"grad_norm": 1.0040842294692993,
"learning_rate": 4.178e-05,
"loss": 0.0668,
"step": 2090
},
{
"grad_norm": 1.1013442277908325,
"learning_rate": 4.198e-05,
"loss": 0.063,
"step": 2100
},
{
"grad_norm": 1.5293811559677124,
"learning_rate": 4.2180000000000006e-05,
"loss": 0.0628,
"step": 2110
},
{
"grad_norm": 1.1530494689941406,
"learning_rate": 4.2380000000000004e-05,
"loss": 0.0641,
"step": 2120
},
{
"grad_norm": 1.0142579078674316,
"learning_rate": 4.258e-05,
"loss": 0.0673,
"step": 2130
},
{
"grad_norm": 1.135764718055725,
"learning_rate": 4.278e-05,
"loss": 0.0632,
"step": 2140
},
{
"grad_norm": 1.46830153465271,
"learning_rate": 4.2980000000000005e-05,
"loss": 0.0673,
"step": 2150
},
{
"grad_norm": 1.0555006265640259,
"learning_rate": 4.318e-05,
"loss": 0.0569,
"step": 2160
},
{
"grad_norm": 1.275568962097168,
"learning_rate": 4.338e-05,
"loss": 0.0644,
"step": 2170
},
{
"grad_norm": 1.327375888824463,
"learning_rate": 4.3580000000000006e-05,
"loss": 0.0634,
"step": 2180
},
{
"grad_norm": 0.8595468401908875,
"learning_rate": 4.3780000000000004e-05,
"loss": 0.0615,
"step": 2190
},
{
"grad_norm": 1.0512006282806396,
"learning_rate": 4.398e-05,
"loss": 0.0561,
"step": 2200
},
{
"grad_norm": 1.0317003726959229,
"learning_rate": 4.418000000000001e-05,
"loss": 0.0576,
"step": 2210
},
{
"grad_norm": 1.185249924659729,
"learning_rate": 4.438e-05,
"loss": 0.0602,
"step": 2220
},
{
"grad_norm": 0.8673859238624573,
"learning_rate": 4.458e-05,
"loss": 0.0601,
"step": 2230
},
{
"grad_norm": 0.9111462831497192,
"learning_rate": 4.478e-05,
"loss": 0.059,
"step": 2240
},
{
"grad_norm": 1.0911729335784912,
"learning_rate": 4.498e-05,
"loss": 0.0677,
"step": 2250
},
{
"grad_norm": 1.2300727367401123,
"learning_rate": 4.518e-05,
"loss": 0.0695,
"step": 2260
},
{
"grad_norm": 0.9249147772789001,
"learning_rate": 4.538e-05,
"loss": 0.0661,
"step": 2270
},
{
"grad_norm": 0.862099826335907,
"learning_rate": 4.558e-05,
"loss": 0.0576,
"step": 2280
},
{
"grad_norm": 1.1415400505065918,
"learning_rate": 4.578e-05,
"loss": 0.0545,
"step": 2290
},
{
"grad_norm": 1.0691193342208862,
"learning_rate": 4.5980000000000004e-05,
"loss": 0.0615,
"step": 2300
},
{
"grad_norm": 1.158877968788147,
"learning_rate": 4.618e-05,
"loss": 0.061,
"step": 2310
},
{
"grad_norm": 0.9803627133369446,
"learning_rate": 4.638e-05,
"loss": 0.0595,
"step": 2320
},
{
"grad_norm": 1.003619909286499,
"learning_rate": 4.6580000000000005e-05,
"loss": 0.0642,
"step": 2330
},
{
"grad_norm": 1.0040209293365479,
"learning_rate": 4.678e-05,
"loss": 0.061,
"step": 2340
},
{
"grad_norm": 1.0538854598999023,
"learning_rate": 4.698e-05,
"loss": 0.0581,
"step": 2350
},
{
"grad_norm": 0.9938404560089111,
"learning_rate": 4.718e-05,
"loss": 0.064,
"step": 2360
},
{
"grad_norm": 1.2104016542434692,
"learning_rate": 4.7380000000000004e-05,
"loss": 0.0603,
"step": 2370
},
{
"grad_norm": 0.9382691383361816,
"learning_rate": 4.758e-05,
"loss": 0.0636,
"step": 2380
},
{
"grad_norm": 0.9005857706069946,
"learning_rate": 4.778e-05,
"loss": 0.0591,
"step": 2390
},
{
"grad_norm": 0.9913654923439026,
"learning_rate": 4.7980000000000005e-05,
"loss": 0.0614,
"step": 2400
},
{
"grad_norm": 0.6588058471679688,
"learning_rate": 4.818e-05,
"loss": 0.0547,
"step": 2410
},
{
"grad_norm": 0.9043067097663879,
"learning_rate": 4.838e-05,
"loss": 0.0591,
"step": 2420
},
{
"grad_norm": 0.9156412482261658,
"learning_rate": 4.8580000000000006e-05,
"loss": 0.0586,
"step": 2430
},
{
"grad_norm": 1.0626698732376099,
"learning_rate": 4.8780000000000004e-05,
"loss": 0.0579,
"step": 2440
},
{
"grad_norm": 0.9202826023101807,
"learning_rate": 4.898e-05,
"loss": 0.0588,
"step": 2450
},
{
"grad_norm": 1.0640308856964111,
"learning_rate": 4.918000000000001e-05,
"loss": 0.0656,
"step": 2460
},
{
"grad_norm": 1.06102454662323,
"learning_rate": 4.9380000000000005e-05,
"loss": 0.0576,
"step": 2470
},
{
"grad_norm": 0.7095378637313843,
"learning_rate": 4.958e-05,
"loss": 0.0634,
"step": 2480
},
{
"grad_norm": 1.026294469833374,
"learning_rate": 4.978e-05,
"loss": 0.0552,
"step": 2490
},
{
"grad_norm": 1.0467168092727661,
"learning_rate": 4.9980000000000006e-05,
"loss": 0.0592,
"step": 2500
},
{
"grad_norm": 1.0738788843154907,
"learning_rate": 5.0180000000000004e-05,
"loss": 0.0609,
"step": 2510
},
{
"grad_norm": 0.7375679612159729,
"learning_rate": 5.038e-05,
"loss": 0.0582,
"step": 2520
},
{
"grad_norm": 0.9701960682868958,
"learning_rate": 5.058000000000001e-05,
"loss": 0.0543,
"step": 2530
},
{
"grad_norm": 0.776954710483551,
"learning_rate": 5.0780000000000005e-05,
"loss": 0.0532,
"step": 2540
},
{
"grad_norm": 1.1171388626098633,
"learning_rate": 5.098e-05,
"loss": 0.0611,
"step": 2550
},
{
"grad_norm": 0.9650934934616089,
"learning_rate": 5.118000000000001e-05,
"loss": 0.0539,
"step": 2560
},
{
"grad_norm": 1.0291506052017212,
"learning_rate": 5.1380000000000006e-05,
"loss": 0.0562,
"step": 2570
},
{
"grad_norm": 0.8755670785903931,
"learning_rate": 5.1580000000000004e-05,
"loss": 0.06,
"step": 2580
},
{
"grad_norm": 0.8407630324363708,
"learning_rate": 5.178000000000001e-05,
"loss": 0.0539,
"step": 2590
},
{
"grad_norm": 0.8454659581184387,
"learning_rate": 5.198000000000001e-05,
"loss": 0.0577,
"step": 2600
},
{
"grad_norm": 0.981410801410675,
"learning_rate": 5.2180000000000005e-05,
"loss": 0.0563,
"step": 2610
},
{
"grad_norm": 1.130122184753418,
"learning_rate": 5.238000000000001e-05,
"loss": 0.0526,
"step": 2620
},
{
"grad_norm": 0.9934293627738953,
"learning_rate": 5.258000000000001e-05,
"loss": 0.0573,
"step": 2630
},
{
"grad_norm": 0.7207489609718323,
"learning_rate": 5.2780000000000006e-05,
"loss": 0.0559,
"step": 2640
},
{
"grad_norm": 0.8188751935958862,
"learning_rate": 5.2980000000000004e-05,
"loss": 0.0582,
"step": 2650
},
{
"grad_norm": 0.6484973430633545,
"learning_rate": 5.318000000000001e-05,
"loss": 0.0524,
"step": 2660
},
{
"grad_norm": 0.6975863575935364,
"learning_rate": 5.338000000000001e-05,
"loss": 0.0545,
"step": 2670
},
{
"grad_norm": 1.1552317142486572,
"learning_rate": 5.3580000000000005e-05,
"loss": 0.0534,
"step": 2680
},
{
"grad_norm": 0.7495611906051636,
"learning_rate": 5.378e-05,
"loss": 0.0593,
"step": 2690
},
{
"grad_norm": 1.096688151359558,
"learning_rate": 5.3979999999999995e-05,
"loss": 0.0571,
"step": 2700
},
{
"grad_norm": 0.9496958255767822,
"learning_rate": 5.418e-05,
"loss": 0.059,
"step": 2710
},
{
"grad_norm": 0.788066565990448,
"learning_rate": 5.438e-05,
"loss": 0.0535,
"step": 2720
},
{
"grad_norm": 0.6263362765312195,
"learning_rate": 5.4579999999999996e-05,
"loss": 0.0538,
"step": 2730
},
{
"grad_norm": 0.7248815894126892,
"learning_rate": 5.478e-05,
"loss": 0.0556,
"step": 2740
},
{
"grad_norm": 0.7064611315727234,
"learning_rate": 5.498e-05,
"loss": 0.0541,
"step": 2750
},
{
"grad_norm": 0.9526064395904541,
"learning_rate": 5.518e-05,
"loss": 0.0549,
"step": 2760
},
{
"grad_norm": 0.7567411065101624,
"learning_rate": 5.538e-05,
"loss": 0.0489,
"step": 2770
},
{
"grad_norm": 0.797159731388092,
"learning_rate": 5.558e-05,
"loss": 0.0553,
"step": 2780
},
{
"grad_norm": 0.7693145275115967,
"learning_rate": 5.578e-05,
"loss": 0.0551,
"step": 2790
},
{
"grad_norm": 0.7668411731719971,
"learning_rate": 5.5979999999999996e-05,
"loss": 0.0502,
"step": 2800
},
{
"grad_norm": 0.6621181964874268,
"learning_rate": 5.618e-05,
"loss": 0.0486,
"step": 2810
},
{
"grad_norm": 0.7623941898345947,
"learning_rate": 5.638e-05,
"loss": 0.0448,
"step": 2820
},
{
"grad_norm": 0.7547678351402283,
"learning_rate": 5.658e-05,
"loss": 0.0512,
"step": 2830
},
{
"grad_norm": 0.9330014586448669,
"learning_rate": 5.678e-05,
"loss": 0.0554,
"step": 2840
},
{
"grad_norm": 0.9201127290725708,
"learning_rate": 5.698e-05,
"loss": 0.05,
"step": 2850
},
{
"grad_norm": 0.7783554792404175,
"learning_rate": 5.718e-05,
"loss": 0.0514,
"step": 2860
},
{
"grad_norm": 0.8040813207626343,
"learning_rate": 5.738e-05,
"loss": 0.0511,
"step": 2870
},
{
"grad_norm": 0.6171097755432129,
"learning_rate": 5.758e-05,
"loss": 0.0476,
"step": 2880
},
{
"grad_norm": 0.7062954306602478,
"learning_rate": 5.778e-05,
"loss": 0.0474,
"step": 2890
},
{
"grad_norm": 0.6481232643127441,
"learning_rate": 5.7980000000000004e-05,
"loss": 0.0498,
"step": 2900
},
{
"grad_norm": 0.6301347613334656,
"learning_rate": 5.818e-05,
"loss": 0.0483,
"step": 2910
},
{
"grad_norm": 0.764008104801178,
"learning_rate": 5.838e-05,
"loss": 0.0462,
"step": 2920
},
{
"grad_norm": 0.8574206829071045,
"learning_rate": 5.858e-05,
"loss": 0.0562,
"step": 2930
},
{
"grad_norm": 0.7253652215003967,
"learning_rate": 5.878e-05,
"loss": 0.0583,
"step": 2940
},
{
"grad_norm": 0.8518454432487488,
"learning_rate": 5.898e-05,
"loss": 0.056,
"step": 2950
},
{
"grad_norm": 0.8673760890960693,
"learning_rate": 5.918e-05,
"loss": 0.0591,
"step": 2960
},
{
"grad_norm": 0.6929979920387268,
"learning_rate": 5.9380000000000004e-05,
"loss": 0.0491,
"step": 2970
},
{
"grad_norm": 0.8658909797668457,
"learning_rate": 5.958e-05,
"loss": 0.0496,
"step": 2980
},
{
"grad_norm": 0.7409300208091736,
"learning_rate": 5.978e-05,
"loss": 0.0499,
"step": 2990
},
{
"grad_norm": 0.40856173634529114,
"learning_rate": 5.9980000000000005e-05,
"loss": 0.0508,
"step": 3000
},
{
"grad_norm": 0.6541692614555359,
"learning_rate": 6.018e-05,
"loss": 0.0552,
"step": 3010
},
{
"grad_norm": 0.817074179649353,
"learning_rate": 6.038e-05,
"loss": 0.0504,
"step": 3020
},
{
"grad_norm": 0.548643171787262,
"learning_rate": 6.0580000000000006e-05,
"loss": 0.0468,
"step": 3030
},
{
"grad_norm": 0.6240131855010986,
"learning_rate": 6.0780000000000004e-05,
"loss": 0.0444,
"step": 3040
},
{
"grad_norm": 0.7566686868667603,
"learning_rate": 6.098e-05,
"loss": 0.0499,
"step": 3050
},
{
"grad_norm": 0.7670771479606628,
"learning_rate": 6.118000000000001e-05,
"loss": 0.0515,
"step": 3060
},
{
"grad_norm": 0.7754244208335876,
"learning_rate": 6.138e-05,
"loss": 0.054,
"step": 3070
},
{
"grad_norm": 0.5756589770317078,
"learning_rate": 6.158e-05,
"loss": 0.0512,
"step": 3080
},
{
"grad_norm": 0.8729175329208374,
"learning_rate": 6.178000000000001e-05,
"loss": 0.0526,
"step": 3090
},
{
"grad_norm": 0.667744517326355,
"learning_rate": 6.198e-05,
"loss": 0.0543,
"step": 3100
},
{
"grad_norm": 0.9713404178619385,
"learning_rate": 6.218e-05,
"loss": 0.0573,
"step": 3110
},
{
"grad_norm": 0.6793529987335205,
"learning_rate": 6.238000000000001e-05,
"loss": 0.0482,
"step": 3120
},
{
"grad_norm": 0.6212384700775146,
"learning_rate": 6.258e-05,
"loss": 0.0573,
"step": 3130
},
{
"grad_norm": 0.5435833930969238,
"learning_rate": 6.278e-05,
"loss": 0.0474,
"step": 3140
},
{
"grad_norm": 0.7197127938270569,
"learning_rate": 6.298000000000001e-05,
"loss": 0.0519,
"step": 3150
},
{
"grad_norm": 0.685274064540863,
"learning_rate": 6.318e-05,
"loss": 0.0434,
"step": 3160
},
{
"grad_norm": 0.7319912910461426,
"learning_rate": 6.338e-05,
"loss": 0.0488,
"step": 3170
},
{
"grad_norm": 0.7035712003707886,
"learning_rate": 6.358000000000001e-05,
"loss": 0.0491,
"step": 3180
},
{
"grad_norm": 0.6836076378822327,
"learning_rate": 6.378e-05,
"loss": 0.0509,
"step": 3190
},
{
"grad_norm": 0.5685492753982544,
"learning_rate": 6.398000000000001e-05,
"loss": 0.0433,
"step": 3200
},
{
"grad_norm": 0.9179131388664246,
"learning_rate": 6.418000000000001e-05,
"loss": 0.0435,
"step": 3210
},
{
"grad_norm": 0.896883487701416,
"learning_rate": 6.438e-05,
"loss": 0.0505,
"step": 3220
},
{
"grad_norm": 0.6876042485237122,
"learning_rate": 6.458000000000001e-05,
"loss": 0.0559,
"step": 3230
},
{
"grad_norm": 0.676445722579956,
"learning_rate": 6.478000000000001e-05,
"loss": 0.0481,
"step": 3240
},
{
"grad_norm": 0.732628345489502,
"learning_rate": 6.498e-05,
"loss": 0.0488,
"step": 3250
},
{
"grad_norm": 0.6861891150474548,
"learning_rate": 6.518000000000001e-05,
"loss": 0.0502,
"step": 3260
},
{
"grad_norm": 0.8076179027557373,
"learning_rate": 6.538000000000001e-05,
"loss": 0.0501,
"step": 3270
},
{
"grad_norm": 0.6303666234016418,
"learning_rate": 6.558e-05,
"loss": 0.0445,
"step": 3280
},
{
"grad_norm": 0.6976994872093201,
"learning_rate": 6.578000000000001e-05,
"loss": 0.0499,
"step": 3290
},
{
"grad_norm": 0.5723156332969666,
"learning_rate": 6.598e-05,
"loss": 0.0506,
"step": 3300
},
{
"grad_norm": 0.711640477180481,
"learning_rate": 6.618e-05,
"loss": 0.0477,
"step": 3310
},
{
"grad_norm": 0.5589656829833984,
"learning_rate": 6.638e-05,
"loss": 0.0509,
"step": 3320
},
{
"grad_norm": 0.6375987529754639,
"learning_rate": 6.658e-05,
"loss": 0.0475,
"step": 3330
},
{
"grad_norm": 0.7278190851211548,
"learning_rate": 6.678e-05,
"loss": 0.0471,
"step": 3340
},
{
"grad_norm": 0.767208456993103,
"learning_rate": 6.698e-05,
"loss": 0.0474,
"step": 3350
},
{
"grad_norm": 0.7896096706390381,
"learning_rate": 6.718e-05,
"loss": 0.0453,
"step": 3360
},
{
"grad_norm": 0.7294151782989502,
"learning_rate": 6.738e-05,
"loss": 0.0483,
"step": 3370
},
{
"grad_norm": 0.76553875207901,
"learning_rate": 6.758e-05,
"loss": 0.0449,
"step": 3380
},
{
"grad_norm": 0.7237085700035095,
"learning_rate": 6.778e-05,
"loss": 0.0493,
"step": 3390
},
{
"grad_norm": 0.5056716799736023,
"learning_rate": 6.798e-05,
"loss": 0.051,
"step": 3400
},
{
"grad_norm": 0.6128408908843994,
"learning_rate": 6.818e-05,
"loss": 0.0525,
"step": 3410
},
{
"grad_norm": 0.5406836271286011,
"learning_rate": 6.838e-05,
"loss": 0.0462,
"step": 3420
},
{
"grad_norm": 0.46201321482658386,
"learning_rate": 6.858e-05,
"loss": 0.0432,
"step": 3430
},
{
"grad_norm": 0.8275691866874695,
"learning_rate": 6.878e-05,
"loss": 0.05,
"step": 3440
},
{
"grad_norm": 0.9060169458389282,
"learning_rate": 6.898e-05,
"loss": 0.0417,
"step": 3450
},
{
"grad_norm": 0.7133957147598267,
"learning_rate": 6.918e-05,
"loss": 0.0469,
"step": 3460
},
{
"grad_norm": 0.5484327077865601,
"learning_rate": 6.938e-05,
"loss": 0.053,
"step": 3470
},
{
"grad_norm": 0.5235493183135986,
"learning_rate": 6.958e-05,
"loss": 0.048,
"step": 3480
},
{
"grad_norm": 0.3897517919540405,
"learning_rate": 6.978e-05,
"loss": 0.045,
"step": 3490
},
{
"grad_norm": 0.5649837255477905,
"learning_rate": 6.998e-05,
"loss": 0.0441,
"step": 3500
},
{
"grad_norm": 0.7963085174560547,
"learning_rate": 7.018e-05,
"loss": 0.0515,
"step": 3510
},
{
"grad_norm": 0.5288142561912537,
"learning_rate": 7.038e-05,
"loss": 0.0459,
"step": 3520
},
{
"grad_norm": 0.6564542651176453,
"learning_rate": 7.058e-05,
"loss": 0.0465,
"step": 3530
},
{
"grad_norm": 0.7467551827430725,
"learning_rate": 7.078e-05,
"loss": 0.0454,
"step": 3540
},
{
"grad_norm": 0.7674255967140198,
"learning_rate": 7.098e-05,
"loss": 0.0457,
"step": 3550
},
{
"grad_norm": 0.8723062872886658,
"learning_rate": 7.118e-05,
"loss": 0.0477,
"step": 3560
},
{
"grad_norm": 0.6423082947731018,
"learning_rate": 7.138e-05,
"loss": 0.051,
"step": 3570
},
{
"grad_norm": 0.5300627946853638,
"learning_rate": 7.158e-05,
"loss": 0.0486,
"step": 3580
},
{
"grad_norm": 0.5693717002868652,
"learning_rate": 7.178000000000001e-05,
"loss": 0.0419,
"step": 3590
},
{
"grad_norm": 0.7759952545166016,
"learning_rate": 7.198e-05,
"loss": 0.0452,
"step": 3600
},
{
"grad_norm": 0.6738252639770508,
"learning_rate": 7.218e-05,
"loss": 0.0469,
"step": 3610
},
{
"grad_norm": 0.6549544930458069,
"learning_rate": 7.238000000000001e-05,
"loss": 0.0427,
"step": 3620
},
{
"grad_norm": 0.6660674214363098,
"learning_rate": 7.258e-05,
"loss": 0.0468,
"step": 3630
},
{
"grad_norm": 0.5806365013122559,
"learning_rate": 7.278e-05,
"loss": 0.0447,
"step": 3640
},
{
"grad_norm": 0.6439737677574158,
"learning_rate": 7.298000000000001e-05,
"loss": 0.0416,
"step": 3650
},
{
"grad_norm": 0.624728798866272,
"learning_rate": 7.318e-05,
"loss": 0.0434,
"step": 3660
},
{
"grad_norm": 0.5165579319000244,
"learning_rate": 7.338e-05,
"loss": 0.0424,
"step": 3670
},
{
"grad_norm": 0.6652499437332153,
"learning_rate": 7.358000000000001e-05,
"loss": 0.0413,
"step": 3680
},
{
"grad_norm": 0.6600234508514404,
"learning_rate": 7.378e-05,
"loss": 0.0481,
"step": 3690
},
{
"grad_norm": 0.6090612411499023,
"learning_rate": 7.398e-05,
"loss": 0.0395,
"step": 3700
},
{
"grad_norm": 0.6924229264259338,
"learning_rate": 7.418000000000001e-05,
"loss": 0.0433,
"step": 3710
},
{
"grad_norm": 0.6175999045372009,
"learning_rate": 7.438e-05,
"loss": 0.0454,
"step": 3720
},
{
"grad_norm": 0.6091599464416504,
"learning_rate": 7.458000000000001e-05,
"loss": 0.0481,
"step": 3730
},
{
"grad_norm": 0.7810183167457581,
"learning_rate": 7.478e-05,
"loss": 0.0483,
"step": 3740
},
{
"grad_norm": 0.6066230535507202,
"learning_rate": 7.498e-05,
"loss": 0.0465,
"step": 3750
},
{
"grad_norm": 0.5959266424179077,
"learning_rate": 7.518000000000001e-05,
"loss": 0.0469,
"step": 3760
},
{
"grad_norm": 0.6197426915168762,
"learning_rate": 7.538e-05,
"loss": 0.0394,
"step": 3770
},
{
"grad_norm": 0.5359181761741638,
"learning_rate": 7.558e-05,
"loss": 0.046,
"step": 3780
},
{
"grad_norm": 0.5866822600364685,
"learning_rate": 7.578000000000001e-05,
"loss": 0.043,
"step": 3790
},
{
"grad_norm": 0.5869863629341125,
"learning_rate": 7.598e-05,
"loss": 0.047,
"step": 3800
},
{
"grad_norm": 0.7168889045715332,
"learning_rate": 7.618e-05,
"loss": 0.053,
"step": 3810
},
{
"grad_norm": 0.6113885641098022,
"learning_rate": 7.638000000000001e-05,
"loss": 0.0523,
"step": 3820
},
{
"grad_norm": 0.5637705326080322,
"learning_rate": 7.658e-05,
"loss": 0.0414,
"step": 3830
},
{
"grad_norm": 0.4776048958301544,
"learning_rate": 7.678000000000001e-05,
"loss": 0.0438,
"step": 3840
},
{
"grad_norm": 0.5855521559715271,
"learning_rate": 7.698000000000001e-05,
"loss": 0.0448,
"step": 3850
},
{
"grad_norm": 0.4712742269039154,
"learning_rate": 7.718e-05,
"loss": 0.0449,
"step": 3860
},
{
"grad_norm": 0.6781105995178223,
"learning_rate": 7.738000000000001e-05,
"loss": 0.0385,
"step": 3870
},
{
"grad_norm": 0.6678699851036072,
"learning_rate": 7.758000000000001e-05,
"loss": 0.0405,
"step": 3880
},
{
"grad_norm": 0.675919234752655,
"learning_rate": 7.778e-05,
"loss": 0.0475,
"step": 3890
},
{
"grad_norm": 0.5031142234802246,
"learning_rate": 7.798000000000001e-05,
"loss": 0.0491,
"step": 3900
},
{
"grad_norm": 0.5640633702278137,
"learning_rate": 7.818000000000001e-05,
"loss": 0.045,
"step": 3910
},
{
"grad_norm": 0.6376245021820068,
"learning_rate": 7.838e-05,
"loss": 0.0519,
"step": 3920
},
{
"grad_norm": 0.5042374730110168,
"learning_rate": 7.858000000000001e-05,
"loss": 0.0433,
"step": 3930
},
{
"grad_norm": 0.7852765917778015,
"learning_rate": 7.878e-05,
"loss": 0.0462,
"step": 3940
},
{
"grad_norm": 0.6827335357666016,
"learning_rate": 7.897999999999999e-05,
"loss": 0.0451,
"step": 3950
},
{
"grad_norm": 0.5106844305992126,
"learning_rate": 7.918e-05,
"loss": 0.0472,
"step": 3960
},
{
"grad_norm": 0.5510252714157104,
"learning_rate": 7.938e-05,
"loss": 0.0414,
"step": 3970
},
{
"grad_norm": 0.4865681231021881,
"learning_rate": 7.958e-05,
"loss": 0.0411,
"step": 3980
},
{
"grad_norm": 0.5142320990562439,
"learning_rate": 7.978e-05,
"loss": 0.0409,
"step": 3990
},
{
"grad_norm": 0.6441251039505005,
"learning_rate": 7.998e-05,
"loss": 0.0458,
"step": 4000
},
{
"grad_norm": 0.47978806495666504,
"learning_rate": 8.018e-05,
"loss": 0.0423,
"step": 4010
},
{
"grad_norm": 0.5475010871887207,
"learning_rate": 8.038e-05,
"loss": 0.0417,
"step": 4020
},
{
"grad_norm": 0.5535333156585693,
"learning_rate": 8.058e-05,
"loss": 0.0463,
"step": 4030
},
{
"grad_norm": 0.4661189615726471,
"learning_rate": 8.078e-05,
"loss": 0.0451,
"step": 4040
},
{
"grad_norm": 0.6663604974746704,
"learning_rate": 8.098e-05,
"loss": 0.0472,
"step": 4050
},
{
"grad_norm": 0.6188867092132568,
"learning_rate": 8.118e-05,
"loss": 0.0447,
"step": 4060
},
{
"grad_norm": 0.43748292326927185,
"learning_rate": 8.138e-05,
"loss": 0.0377,
"step": 4070
},
{
"grad_norm": 0.50865238904953,
"learning_rate": 8.158e-05,
"loss": 0.0427,
"step": 4080
},
{
"grad_norm": 0.43123728036880493,
"learning_rate": 8.178e-05,
"loss": 0.0384,
"step": 4090
},
{
"grad_norm": 0.48116111755371094,
"learning_rate": 8.198e-05,
"loss": 0.0381,
"step": 4100
},
{
"grad_norm": 0.5601455569267273,
"learning_rate": 8.218e-05,
"loss": 0.0457,
"step": 4110
},
{
"grad_norm": 0.35396626591682434,
"learning_rate": 8.238000000000001e-05,
"loss": 0.0456,
"step": 4120
},
{
"grad_norm": 0.6245121955871582,
"learning_rate": 8.258e-05,
"loss": 0.0468,
"step": 4130
},
{
"grad_norm": 0.47480565309524536,
"learning_rate": 8.278e-05,
"loss": 0.0424,
"step": 4140
},
{
"grad_norm": 0.4660918712615967,
"learning_rate": 8.298000000000001e-05,
"loss": 0.0433,
"step": 4150
},
{
"grad_norm": 0.577488362789154,
"learning_rate": 8.318e-05,
"loss": 0.0407,
"step": 4160
},
{
"grad_norm": 0.5647264719009399,
"learning_rate": 8.338e-05,
"loss": 0.043,
"step": 4170
},
{
"grad_norm": 0.5393707752227783,
"learning_rate": 8.358e-05,
"loss": 0.0429,
"step": 4180
},
{
"grad_norm": 0.70014488697052,
"learning_rate": 8.378e-05,
"loss": 0.0473,
"step": 4190
},
{
"grad_norm": 0.5518887042999268,
"learning_rate": 8.398e-05,
"loss": 0.0404,
"step": 4200
},
{
"grad_norm": 0.6201269626617432,
"learning_rate": 8.418e-05,
"loss": 0.0399,
"step": 4210
},
{
"grad_norm": 0.5491284728050232,
"learning_rate": 8.438e-05,
"loss": 0.0414,
"step": 4220
},
{
"grad_norm": 0.4668818712234497,
"learning_rate": 8.458e-05,
"loss": 0.0408,
"step": 4230
},
{
"grad_norm": 0.5356974005699158,
"learning_rate": 8.478e-05,
"loss": 0.0393,
"step": 4240
},
{
"grad_norm": 0.5943256616592407,
"learning_rate": 8.498e-05,
"loss": 0.042,
"step": 4250
},
{
"grad_norm": 0.5515814423561096,
"learning_rate": 8.518000000000001e-05,
"loss": 0.0438,
"step": 4260
},
{
"grad_norm": 0.5101014375686646,
"learning_rate": 8.538e-05,
"loss": 0.0434,
"step": 4270
},
{
"grad_norm": 0.6161578893661499,
"learning_rate": 8.558e-05,
"loss": 0.0437,
"step": 4280
},
{
"grad_norm": 0.5971099138259888,
"learning_rate": 8.578000000000001e-05,
"loss": 0.0387,
"step": 4290
},
{
"grad_norm": 0.5258501172065735,
"learning_rate": 8.598e-05,
"loss": 0.0381,
"step": 4300
},
{
"grad_norm": 0.5870957374572754,
"learning_rate": 8.618e-05,
"loss": 0.0396,
"step": 4310
},
{
"grad_norm": 0.6256932616233826,
"learning_rate": 8.638000000000001e-05,
"loss": 0.0451,
"step": 4320
},
{
"grad_norm": 0.4102240800857544,
"learning_rate": 8.658e-05,
"loss": 0.0405,
"step": 4330
},
{
"grad_norm": 0.4915444552898407,
"learning_rate": 8.678e-05,
"loss": 0.0428,
"step": 4340
},
{
"grad_norm": 0.4968762993812561,
"learning_rate": 8.698000000000001e-05,
"loss": 0.046,
"step": 4350
},
{
"grad_norm": 0.5967429280281067,
"learning_rate": 8.718e-05,
"loss": 0.0393,
"step": 4360
},
{
"grad_norm": 0.6591717004776001,
"learning_rate": 8.738000000000001e-05,
"loss": 0.0439,
"step": 4370
},
{
"grad_norm": 0.7125138640403748,
"learning_rate": 8.758000000000001e-05,
"loss": 0.0461,
"step": 4380
},
{
"grad_norm": 0.46841591596603394,
"learning_rate": 8.778e-05,
"loss": 0.0497,
"step": 4390
},
{
"grad_norm": 0.5526397824287415,
"learning_rate": 8.798000000000001e-05,
"loss": 0.0401,
"step": 4400
},
{
"grad_norm": 0.489277184009552,
"learning_rate": 8.818000000000001e-05,
"loss": 0.0417,
"step": 4410
},
{
"grad_norm": 0.5580271482467651,
"learning_rate": 8.838e-05,
"loss": 0.0425,
"step": 4420
},
{
"grad_norm": 0.4590274691581726,
"learning_rate": 8.858000000000001e-05,
"loss": 0.0393,
"step": 4430
},
{
"grad_norm": 0.5382079482078552,
"learning_rate": 8.878000000000001e-05,
"loss": 0.043,
"step": 4440
},
{
"grad_norm": 0.43965789675712585,
"learning_rate": 8.898e-05,
"loss": 0.0438,
"step": 4450
},
{
"grad_norm": 0.41453346610069275,
"learning_rate": 8.918000000000001e-05,
"loss": 0.042,
"step": 4460
},
{
"grad_norm": 0.48576247692108154,
"learning_rate": 8.938e-05,
"loss": 0.0396,
"step": 4470
},
{
"grad_norm": 0.6486697196960449,
"learning_rate": 8.958e-05,
"loss": 0.0422,
"step": 4480
},
{
"grad_norm": 0.6449689269065857,
"learning_rate": 8.978000000000001e-05,
"loss": 0.0482,
"step": 4490
},
{
"grad_norm": 0.5332819223403931,
"learning_rate": 8.998e-05,
"loss": 0.0428,
"step": 4500
},
{
"grad_norm": 0.550387978553772,
"learning_rate": 9.018000000000001e-05,
"loss": 0.0371,
"step": 4510
},
{
"grad_norm": 0.4800110459327698,
"learning_rate": 9.038000000000001e-05,
"loss": 0.0372,
"step": 4520
},
{
"grad_norm": 0.510522723197937,
"learning_rate": 9.058e-05,
"loss": 0.0436,
"step": 4530
},
{
"grad_norm": 0.6461642384529114,
"learning_rate": 9.078000000000001e-05,
"loss": 0.0423,
"step": 4540
},
{
"grad_norm": 0.4169749915599823,
"learning_rate": 9.098000000000001e-05,
"loss": 0.0365,
"step": 4550
},
{
"grad_norm": 0.5735549330711365,
"learning_rate": 9.118e-05,
"loss": 0.0383,
"step": 4560
},
{
"grad_norm": 0.46670055389404297,
"learning_rate": 9.138e-05,
"loss": 0.0371,
"step": 4570
},
{
"grad_norm": 0.49771854281425476,
"learning_rate": 9.158e-05,
"loss": 0.0458,
"step": 4580
},
{
"grad_norm": 0.576575517654419,
"learning_rate": 9.178e-05,
"loss": 0.0455,
"step": 4590
},
{
"grad_norm": 0.45334720611572266,
"learning_rate": 9.198e-05,
"loss": 0.0403,
"step": 4600
},
{
"grad_norm": 0.44037169218063354,
"learning_rate": 9.218e-05,
"loss": 0.0464,
"step": 4610
},
{
"grad_norm": 0.4453979730606079,
"learning_rate": 9.238e-05,
"loss": 0.0389,
"step": 4620
},
{
"grad_norm": 0.5467743277549744,
"learning_rate": 9.258e-05,
"loss": 0.0411,
"step": 4630
},
{
"grad_norm": 0.47528183460235596,
"learning_rate": 9.278e-05,
"loss": 0.0418,
"step": 4640
},
{
"grad_norm": 0.3945184051990509,
"learning_rate": 9.298e-05,
"loss": 0.0376,
"step": 4650
},
{
"grad_norm": 0.43459224700927734,
"learning_rate": 9.318e-05,
"loss": 0.0398,
"step": 4660
},
{
"grad_norm": 0.4490455389022827,
"learning_rate": 9.338e-05,
"loss": 0.0382,
"step": 4670
},
{
"grad_norm": 0.4714946448802948,
"learning_rate": 9.358e-05,
"loss": 0.0343,
"step": 4680
},
{
"grad_norm": 0.39870744943618774,
"learning_rate": 9.378e-05,
"loss": 0.0388,
"step": 4690
},
{
"grad_norm": 0.5481004118919373,
"learning_rate": 9.398e-05,
"loss": 0.0507,
"step": 4700
},
{
"grad_norm": 0.5729978680610657,
"learning_rate": 9.418e-05,
"loss": 0.0411,
"step": 4710
},
{
"grad_norm": 0.5097001194953918,
"learning_rate": 9.438e-05,
"loss": 0.0354,
"step": 4720
},
{
"grad_norm": 0.3913422226905823,
"learning_rate": 9.458e-05,
"loss": 0.037,
"step": 4730
},
{
"grad_norm": 0.6350584626197815,
"learning_rate": 9.478e-05,
"loss": 0.039,
"step": 4740
},
{
"grad_norm": 0.4887065589427948,
"learning_rate": 9.498e-05,
"loss": 0.0407,
"step": 4750
},
{
"grad_norm": 0.42942601442337036,
"learning_rate": 9.518000000000001e-05,
"loss": 0.0426,
"step": 4760
},
{
"grad_norm": 0.40548574924468994,
"learning_rate": 9.538e-05,
"loss": 0.0452,
"step": 4770
},
{
"grad_norm": 0.35316094756126404,
"learning_rate": 9.558e-05,
"loss": 0.0399,
"step": 4780
},
{
"grad_norm": 0.47131964564323425,
"learning_rate": 9.578000000000001e-05,
"loss": 0.0428,
"step": 4790
},
{
"grad_norm": 0.45955830812454224,
"learning_rate": 9.598e-05,
"loss": 0.0415,
"step": 4800
},
{
"grad_norm": 0.459089457988739,
"learning_rate": 9.618e-05,
"loss": 0.0373,
"step": 4810
},
{
"grad_norm": 0.4864519238471985,
"learning_rate": 9.638000000000001e-05,
"loss": 0.0402,
"step": 4820
},
{
"grad_norm": 0.44014737010002136,
"learning_rate": 9.658e-05,
"loss": 0.039,
"step": 4830
},
{
"grad_norm": 0.505915105342865,
"learning_rate": 9.678e-05,
"loss": 0.04,
"step": 4840
},
{
"grad_norm": 0.41672059893608093,
"learning_rate": 9.698000000000001e-05,
"loss": 0.0387,
"step": 4850
},
{
"grad_norm": 0.5977463722229004,
"learning_rate": 9.718e-05,
"loss": 0.0381,
"step": 4860
},
{
"grad_norm": 0.4902133345603943,
"learning_rate": 9.738e-05,
"loss": 0.0445,
"step": 4870
},
{
"grad_norm": 0.35810279846191406,
"learning_rate": 9.758000000000001e-05,
"loss": 0.0383,
"step": 4880
},
{
"grad_norm": 0.41253653168678284,
"learning_rate": 9.778e-05,
"loss": 0.0391,
"step": 4890
},
{
"grad_norm": 0.35557398200035095,
"learning_rate": 9.798000000000001e-05,
"loss": 0.038,
"step": 4900
},
{
"grad_norm": 0.5121879577636719,
"learning_rate": 9.818000000000001e-05,
"loss": 0.0365,
"step": 4910
},
{
"grad_norm": 0.45584094524383545,
"learning_rate": 9.838e-05,
"loss": 0.0408,
"step": 4920
},
{
"grad_norm": 0.5642129182815552,
"learning_rate": 9.858000000000001e-05,
"loss": 0.04,
"step": 4930
},
{
"grad_norm": 0.44973066449165344,
"learning_rate": 9.878e-05,
"loss": 0.0344,
"step": 4940
},
{
"grad_norm": 0.6327542066574097,
"learning_rate": 9.898e-05,
"loss": 0.0423,
"step": 4950
},
{
"grad_norm": 0.4707786738872528,
"learning_rate": 9.918000000000001e-05,
"loss": 0.0419,
"step": 4960
},
{
"grad_norm": 0.49904462695121765,
"learning_rate": 9.938e-05,
"loss": 0.0351,
"step": 4970
},
{
"grad_norm": 0.3730181157588959,
"learning_rate": 9.958e-05,
"loss": 0.0435,
"step": 4980
},
{
"grad_norm": 0.3837527632713318,
"learning_rate": 9.978000000000001e-05,
"loss": 0.0394,
"step": 4990
},
{
"grad_norm": 0.49388110637664795,
"learning_rate": 9.998e-05,
"loss": 0.0392,
"step": 5000
},
{
"grad_norm": 0.3429860472679138,
"learning_rate": 9.999999778549045e-05,
"loss": 0.0388,
"step": 5010
},
{
"grad_norm": 0.6041615009307861,
"learning_rate": 9.999999013039593e-05,
"loss": 0.0436,
"step": 5020
},
{
"grad_norm": 0.36285310983657837,
"learning_rate": 9.999997700737766e-05,
"loss": 0.0367,
"step": 5030
},
{
"grad_norm": 0.35924023389816284,
"learning_rate": 9.999995841643709e-05,
"loss": 0.0352,
"step": 5040
},
{
"grad_norm": 0.3420441746711731,
"learning_rate": 9.999993435757623e-05,
"loss": 0.0377,
"step": 5050
},
{
"grad_norm": 0.43935585021972656,
"learning_rate": 9.999990483079773e-05,
"loss": 0.0406,
"step": 5060
},
{
"grad_norm": 0.47324880957603455,
"learning_rate": 9.999986983610481e-05,
"loss": 0.0373,
"step": 5070
},
{
"grad_norm": 0.5538941621780396,
"learning_rate": 9.99998293735013e-05,
"loss": 0.0355,
"step": 5080
},
{
"grad_norm": 0.45285844802856445,
"learning_rate": 9.999978344299161e-05,
"loss": 0.0405,
"step": 5090
},
{
"grad_norm": 0.5444912314414978,
"learning_rate": 9.99997320445808e-05,
"loss": 0.0391,
"step": 5100
},
{
"grad_norm": 0.3988666236400604,
"learning_rate": 9.999967517827444e-05,
"loss": 0.0414,
"step": 5110
},
{
"grad_norm": 0.42463627457618713,
"learning_rate": 9.999961284407879e-05,
"loss": 0.0359,
"step": 5120
},
{
"grad_norm": 0.40492162108421326,
"learning_rate": 9.999954504200067e-05,
"loss": 0.0356,
"step": 5130
},
{
"grad_norm": 0.46432259678840637,
"learning_rate": 9.999947177204744e-05,
"loss": 0.0404,
"step": 5140
},
{
"grad_norm": 0.4111925959587097,
"learning_rate": 9.999939303422718e-05,
"loss": 0.0406,
"step": 5150
},
{
"grad_norm": 0.38475123047828674,
"learning_rate": 9.999930882854847e-05,
"loss": 0.0418,
"step": 5160
},
{
"grad_norm": 0.41682934761047363,
"learning_rate": 9.999921915502051e-05,
"loss": 0.0369,
"step": 5170
},
{
"grad_norm": 0.43813854455947876,
"learning_rate": 9.99991240136531e-05,
"loss": 0.0334,
"step": 5180
},
{
"grad_norm": 0.42981648445129395,
"learning_rate": 9.999902340445668e-05,
"loss": 0.0401,
"step": 5190
},
{
"grad_norm": 0.4623315930366516,
"learning_rate": 9.999891732744224e-05,
"loss": 0.0364,
"step": 5200
},
{
"grad_norm": 0.49400535225868225,
"learning_rate": 9.999880578262135e-05,
"loss": 0.0428,
"step": 5210
},
{
"grad_norm": 0.4747942388057709,
"learning_rate": 9.999868877000624e-05,
"loss": 0.046,
"step": 5220
},
{
"grad_norm": 0.3685579001903534,
"learning_rate": 9.99985662896097e-05,
"loss": 0.0398,
"step": 5230
},
{
"grad_norm": 0.4534323811531067,
"learning_rate": 9.999843834144513e-05,
"loss": 0.0344,
"step": 5240
},
{
"grad_norm": 0.4030906558036804,
"learning_rate": 9.99983049255265e-05,
"loss": 0.0393,
"step": 5250
},
{
"grad_norm": 0.4210641384124756,
"learning_rate": 9.999816604186843e-05,
"loss": 0.0365,
"step": 5260
},
{
"grad_norm": 0.419874906539917,
"learning_rate": 9.999802169048609e-05,
"loss": 0.0389,
"step": 5270
},
{
"grad_norm": 0.34282663464546204,
"learning_rate": 9.999787187139527e-05,
"loss": 0.0375,
"step": 5280
},
{
"grad_norm": 0.3982928693294525,
"learning_rate": 9.999771658461234e-05,
"loss": 0.0392,
"step": 5290
},
{
"grad_norm": 0.3975090980529785,
"learning_rate": 9.999755583015431e-05,
"loss": 0.0428,
"step": 5300
},
{
"grad_norm": 0.37944069504737854,
"learning_rate": 9.999738960803874e-05,
"loss": 0.0454,
"step": 5310
},
{
"grad_norm": 0.32097867131233215,
"learning_rate": 9.99972179182838e-05,
"loss": 0.0449,
"step": 5320
},
{
"grad_norm": 0.33090895414352417,
"learning_rate": 9.99970407609083e-05,
"loss": 0.0364,
"step": 5330
},
{
"grad_norm": 0.4796665608882904,
"learning_rate": 9.999685813593159e-05,
"loss": 0.0394,
"step": 5340
},
{
"grad_norm": 0.3169485032558441,
"learning_rate": 9.999667004337362e-05,
"loss": 0.0392,
"step": 5350
},
{
"grad_norm": 0.36470603942871094,
"learning_rate": 9.9996476483255e-05,
"loss": 0.0353,
"step": 5360
},
{
"grad_norm": 0.44357529282569885,
"learning_rate": 9.999627745559688e-05,
"loss": 0.0348,
"step": 5370
},
{
"grad_norm": 0.36827611923217773,
"learning_rate": 9.999607296042101e-05,
"loss": 0.0344,
"step": 5380
},
{
"grad_norm": 0.37521809339523315,
"learning_rate": 9.99958629977498e-05,
"loss": 0.0341,
"step": 5390
},
{
"grad_norm": 0.3604615330696106,
"learning_rate": 9.999564756760615e-05,
"loss": 0.0312,
"step": 5400
},
{
"grad_norm": 0.37528300285339355,
"learning_rate": 9.999542667001366e-05,
"loss": 0.0341,
"step": 5410
},
{
"grad_norm": 0.3395298719406128,
"learning_rate": 9.999520030499647e-05,
"loss": 0.034,
"step": 5420
},
{
"grad_norm": 0.32314980030059814,
"learning_rate": 9.999496847257936e-05,
"loss": 0.0382,
"step": 5430
},
{
"grad_norm": 0.46882694959640503,
"learning_rate": 9.999473117278764e-05,
"loss": 0.0359,
"step": 5440
},
{
"grad_norm": 0.48731929063796997,
"learning_rate": 9.999448840564731e-05,
"loss": 0.0376,
"step": 5450
},
{
"grad_norm": 0.47560223937034607,
"learning_rate": 9.999424017118488e-05,
"loss": 0.038,
"step": 5460
},
{
"grad_norm": 0.35583850741386414,
"learning_rate": 9.999398646942751e-05,
"loss": 0.0346,
"step": 5470
},
{
"grad_norm": 0.370250940322876,
"learning_rate": 9.999372730040296e-05,
"loss": 0.0375,
"step": 5480
},
{
"grad_norm": 0.4444587528705597,
"learning_rate": 9.999346266413953e-05,
"loss": 0.0366,
"step": 5490
},
{
"grad_norm": 0.4800468683242798,
"learning_rate": 9.99931925606662e-05,
"loss": 0.0336,
"step": 5500
},
{
"grad_norm": 0.4077254831790924,
"learning_rate": 9.99929169900125e-05,
"loss": 0.039,
"step": 5510
},
{
"grad_norm": 0.47895848751068115,
"learning_rate": 9.999263595220855e-05,
"loss": 0.0392,
"step": 5520
},
{
"grad_norm": 0.3881273567676544,
"learning_rate": 9.99923494472851e-05,
"loss": 0.0355,
"step": 5530
},
{
"grad_norm": 0.34213364124298096,
"learning_rate": 9.999205747527348e-05,
"loss": 0.0343,
"step": 5540
},
{
"grad_norm": 0.30999526381492615,
"learning_rate": 9.999176003620561e-05,
"loss": 0.0349,
"step": 5550
},
{
"grad_norm": 0.42547470331192017,
"learning_rate": 9.999145713011405e-05,
"loss": 0.0334,
"step": 5560
},
{
"grad_norm": 0.43339619040489197,
"learning_rate": 9.999114875703186e-05,
"loss": 0.0355,
"step": 5570
},
{
"grad_norm": 0.4451631009578705,
"learning_rate": 9.999083491699281e-05,
"loss": 0.0321,
"step": 5580
},
{
"grad_norm": 0.2986989915370941,
"learning_rate": 9.999051561003123e-05,
"loss": 0.0353,
"step": 5590
},
{
"grad_norm": 0.36763715744018555,
"learning_rate": 9.999019083618202e-05,
"loss": 0.0314,
"step": 5600
},
{
"grad_norm": 0.4679538607597351,
"learning_rate": 9.99898605954807e-05,
"loss": 0.0337,
"step": 5610
},
{
"grad_norm": 0.444879949092865,
"learning_rate": 9.998952488796338e-05,
"loss": 0.0351,
"step": 5620
},
{
"grad_norm": 0.33788660168647766,
"learning_rate": 9.998918371366676e-05,
"loss": 0.0336,
"step": 5630
},
{
"grad_norm": 0.28990015387535095,
"learning_rate": 9.99888370726282e-05,
"loss": 0.031,
"step": 5640
},
{
"grad_norm": 0.421210378408432,
"learning_rate": 9.998848496488556e-05,
"loss": 0.0319,
"step": 5650
},
{
"grad_norm": 0.25194352865219116,
"learning_rate": 9.998812739047736e-05,
"loss": 0.0411,
"step": 5660
},
{
"grad_norm": 0.3490672707557678,
"learning_rate": 9.99877643494427e-05,
"loss": 0.0328,
"step": 5670
},
{
"grad_norm": 0.33266201615333557,
"learning_rate": 9.998739584182128e-05,
"loss": 0.0349,
"step": 5680
},
{
"grad_norm": 0.3868013024330139,
"learning_rate": 9.998702186765342e-05,
"loss": 0.0319,
"step": 5690
},
{
"grad_norm": 0.48441991209983826,
"learning_rate": 9.998664242698e-05,
"loss": 0.0363,
"step": 5700
},
{
"grad_norm": 0.47828468680381775,
"learning_rate": 9.998625751984251e-05,
"loss": 0.0348,
"step": 5710
},
{
"grad_norm": 0.3215515911579132,
"learning_rate": 9.998586714628307e-05,
"loss": 0.0348,
"step": 5720
},
{
"grad_norm": 0.3726402521133423,
"learning_rate": 9.998547130634432e-05,
"loss": 0.0338,
"step": 5730
},
{
"grad_norm": 0.30773892998695374,
"learning_rate": 9.99850700000696e-05,
"loss": 0.0371,
"step": 5740
},
{
"grad_norm": 0.2897832691669464,
"learning_rate": 9.998466322750278e-05,
"loss": 0.0365,
"step": 5750
},
{
"grad_norm": 0.37654203176498413,
"learning_rate": 9.998425098868834e-05,
"loss": 0.034,
"step": 5760
},
{
"grad_norm": 0.4271163046360016,
"learning_rate": 9.998383328367136e-05,
"loss": 0.0364,
"step": 5770
},
{
"grad_norm": 0.3594057261943817,
"learning_rate": 9.99834101124975e-05,
"loss": 0.0344,
"step": 5780
},
{
"grad_norm": 0.324118971824646,
"learning_rate": 9.998298147521309e-05,
"loss": 0.0351,
"step": 5790
},
{
"grad_norm": 0.3425390422344208,
"learning_rate": 9.998254737186496e-05,
"loss": 0.0338,
"step": 5800
},
{
"grad_norm": 0.326735258102417,
"learning_rate": 9.99821078025006e-05,
"loss": 0.0307,
"step": 5810
},
{
"grad_norm": 0.3162924349308014,
"learning_rate": 9.998166276716807e-05,
"loss": 0.0343,
"step": 5820
},
{
"grad_norm": 0.4421290457248688,
"learning_rate": 9.998121226591606e-05,
"loss": 0.0373,
"step": 5830
},
{
"grad_norm": 0.39257335662841797,
"learning_rate": 9.998075629879382e-05,
"loss": 0.035,
"step": 5840
},
{
"grad_norm": 0.3666342496871948,
"learning_rate": 9.99802948658512e-05,
"loss": 0.0352,
"step": 5850
},
{
"grad_norm": 0.27250564098358154,
"learning_rate": 9.99798279671387e-05,
"loss": 0.0379,
"step": 5860
},
{
"grad_norm": 0.3464193344116211,
"learning_rate": 9.997935560270734e-05,
"loss": 0.0382,
"step": 5870
},
{
"grad_norm": 0.49353811144828796,
"learning_rate": 9.997887777260879e-05,
"loss": 0.0343,
"step": 5880
},
{
"grad_norm": 0.3261691927909851,
"learning_rate": 9.997839447689532e-05,
"loss": 0.0317,
"step": 5890
},
{
"grad_norm": 0.49990391731262207,
"learning_rate": 9.997790571561978e-05,
"loss": 0.0357,
"step": 5900
},
{
"grad_norm": 0.4302642345428467,
"learning_rate": 9.99774114888356e-05,
"loss": 0.0366,
"step": 5910
},
{
"grad_norm": 0.28723835945129395,
"learning_rate": 9.997691179659684e-05,
"loss": 0.0319,
"step": 5920
},
{
"grad_norm": 0.4203507900238037,
"learning_rate": 9.997640663895815e-05,
"loss": 0.0301,
"step": 5930
},
{
"grad_norm": 0.38473060727119446,
"learning_rate": 9.997589601597477e-05,
"loss": 0.0369,
"step": 5940
},
{
"grad_norm": 0.3344627022743225,
"learning_rate": 9.997537992770252e-05,
"loss": 0.0337,
"step": 5950
},
{
"grad_norm": 0.393898606300354,
"learning_rate": 9.997485837419788e-05,
"loss": 0.0312,
"step": 5960
},
{
"grad_norm": 0.36444637179374695,
"learning_rate": 9.997433135551786e-05,
"loss": 0.0361,
"step": 5970
},
{
"grad_norm": 0.34656956791877747,
"learning_rate": 9.997379887172009e-05,
"loss": 0.0333,
"step": 5980
},
{
"grad_norm": 0.40818262100219727,
"learning_rate": 9.997326092286281e-05,
"loss": 0.0349,
"step": 5990
},
{
"grad_norm": 0.3876807987689972,
"learning_rate": 9.997271750900486e-05,
"loss": 0.0381,
"step": 6000
},
{
"grad_norm": 0.3406454622745514,
"learning_rate": 9.997216863020565e-05,
"loss": 0.0307,
"step": 6010
},
{
"grad_norm": 0.45904242992401123,
"learning_rate": 9.99716142865252e-05,
"loss": 0.0313,
"step": 6020
},
{
"grad_norm": 0.51307612657547,
"learning_rate": 9.997105447802415e-05,
"loss": 0.0313,
"step": 6030
},
{
"grad_norm": 0.3594149351119995,
"learning_rate": 9.997048920476373e-05,
"loss": 0.0328,
"step": 6040
},
{
"grad_norm": 0.3178771734237671,
"learning_rate": 9.996991846680572e-05,
"loss": 0.0332,
"step": 6050
},
{
"grad_norm": 0.3893943428993225,
"learning_rate": 9.996934226421257e-05,
"loss": 0.0301,
"step": 6060
},
{
"grad_norm": 0.3164651691913605,
"learning_rate": 9.996876059704726e-05,
"loss": 0.033,
"step": 6070
},
{
"grad_norm": 0.2855364978313446,
"learning_rate": 9.996817346537343e-05,
"loss": 0.0354,
"step": 6080
},
{
"grad_norm": 0.37176141142845154,
"learning_rate": 9.996758086925526e-05,
"loss": 0.0284,
"step": 6090
},
{
"grad_norm": 0.36542609333992004,
"learning_rate": 9.996698280875759e-05,
"loss": 0.0399,
"step": 6100
},
{
"grad_norm": 0.30383336544036865,
"learning_rate": 9.99663792839458e-05,
"loss": 0.0331,
"step": 6110
},
{
"grad_norm": 0.33855536580085754,
"learning_rate": 9.99657702948859e-05,
"loss": 0.04,
"step": 6120
},
{
"grad_norm": 0.34331974387168884,
"learning_rate": 9.996515584164448e-05,
"loss": 0.0337,
"step": 6130
},
{
"grad_norm": 0.3528672158718109,
"learning_rate": 9.996453592428873e-05,
"loss": 0.0328,
"step": 6140
},
{
"grad_norm": 0.3226683437824249,
"learning_rate": 9.996391054288646e-05,
"loss": 0.038,
"step": 6150
},
{
"grad_norm": 0.3772316873073578,
"learning_rate": 9.996327969750605e-05,
"loss": 0.0338,
"step": 6160
},
{
"grad_norm": 0.38018396496772766,
"learning_rate": 9.996264338821649e-05,
"loss": 0.036,
"step": 6170
},
{
"grad_norm": 0.4035049378871918,
"learning_rate": 9.996200161508735e-05,
"loss": 0.034,
"step": 6180
},
{
"grad_norm": 0.3025064170360565,
"learning_rate": 9.996135437818885e-05,
"loss": 0.0306,
"step": 6190
},
{
"grad_norm": 0.4211691915988922,
"learning_rate": 9.996070167759175e-05,
"loss": 0.0325,
"step": 6200
},
{
"grad_norm": 0.33900928497314453,
"learning_rate": 9.996004351336743e-05,
"loss": 0.0316,
"step": 6210
},
{
"grad_norm": 0.4741729199886322,
"learning_rate": 9.995937988558785e-05,
"loss": 0.0391,
"step": 6220
},
{
"grad_norm": 0.38156992197036743,
"learning_rate": 9.995871079432561e-05,
"loss": 0.0352,
"step": 6230
},
{
"grad_norm": 0.3592539131641388,
"learning_rate": 9.995803623965389e-05,
"loss": 0.0319,
"step": 6240
},
{
"grad_norm": 0.324923574924469,
"learning_rate": 9.995735622164641e-05,
"loss": 0.0367,
"step": 6250
},
{
"grad_norm": 0.35206273198127747,
"learning_rate": 9.995667074037758e-05,
"loss": 0.0374,
"step": 6260
},
{
"grad_norm": 0.33310672640800476,
"learning_rate": 9.995597979592232e-05,
"loss": 0.031,
"step": 6270
},
{
"grad_norm": 0.3474056124687195,
"learning_rate": 9.995528338835625e-05,
"loss": 0.0349,
"step": 6280
},
{
"grad_norm": 0.39525800943374634,
"learning_rate": 9.995458151775547e-05,
"loss": 0.03,
"step": 6290
},
{
"grad_norm": 0.28876927495002747,
"learning_rate": 9.995387418419677e-05,
"loss": 0.0368,
"step": 6300
},
{
"grad_norm": 0.4322184920310974,
"learning_rate": 9.99531613877575e-05,
"loss": 0.0343,
"step": 6310
},
{
"grad_norm": 0.42007580399513245,
"learning_rate": 9.995244312851559e-05,
"loss": 0.0344,
"step": 6320
},
{
"grad_norm": 0.31896668672561646,
"learning_rate": 9.995171940654961e-05,
"loss": 0.036,
"step": 6330
},
{
"grad_norm": 0.2959060072898865,
"learning_rate": 9.995099022193871e-05,
"loss": 0.0357,
"step": 6340
},
{
"grad_norm": 0.43493425846099854,
"learning_rate": 9.995025557476261e-05,
"loss": 0.0365,
"step": 6350
},
{
"grad_norm": 0.3504331409931183,
"learning_rate": 9.994951546510165e-05,
"loss": 0.0383,
"step": 6360
},
{
"grad_norm": 0.36251556873321533,
"learning_rate": 9.994876989303679e-05,
"loss": 0.0348,
"step": 6370
},
{
"grad_norm": 0.3220044672489166,
"learning_rate": 9.994801885864955e-05,
"loss": 0.0363,
"step": 6380
},
{
"grad_norm": 0.3531849682331085,
"learning_rate": 9.994726236202205e-05,
"loss": 0.0352,
"step": 6390
},
{
"grad_norm": 0.43758726119995117,
"learning_rate": 9.994650040323704e-05,
"loss": 0.0317,
"step": 6400
},
{
"grad_norm": 0.29453524947166443,
"learning_rate": 9.994573298237784e-05,
"loss": 0.039,
"step": 6410
},
{
"grad_norm": 0.371326208114624,
"learning_rate": 9.994496009952837e-05,
"loss": 0.0361,
"step": 6420
},
{
"grad_norm": 0.3863460421562195,
"learning_rate": 9.994418175477316e-05,
"loss": 0.0331,
"step": 6430
},
{
"grad_norm": 0.40759512782096863,
"learning_rate": 9.994339794819733e-05,
"loss": 0.0331,
"step": 6440
},
{
"grad_norm": 0.2871116101741791,
"learning_rate": 9.994260867988658e-05,
"loss": 0.0327,
"step": 6450
},
{
"grad_norm": 0.21499332785606384,
"learning_rate": 9.994181394992723e-05,
"loss": 0.033,
"step": 6460
},
{
"grad_norm": 0.2656130790710449,
"learning_rate": 9.994101375840618e-05,
"loss": 0.0331,
"step": 6470
},
{
"grad_norm": 0.3623477816581726,
"learning_rate": 9.994020810541098e-05,
"loss": 0.0358,
"step": 6480
},
{
"grad_norm": 0.33246755599975586,
"learning_rate": 9.99393969910297e-05,
"loss": 0.0344,
"step": 6490
},
{
"grad_norm": 0.3610251247882843,
"learning_rate": 9.993858041535104e-05,
"loss": 0.0323,
"step": 6500
},
{
"grad_norm": 0.3781639039516449,
"learning_rate": 9.99377583784643e-05,
"loss": 0.0316,
"step": 6510
},
{
"grad_norm": 0.2506084442138672,
"learning_rate": 9.993693088045939e-05,
"loss": 0.03,
"step": 6520
},
{
"grad_norm": 0.38919296860694885,
"learning_rate": 9.99360979214268e-05,
"loss": 0.0318,
"step": 6530
},
{
"grad_norm": 0.33336395025253296,
"learning_rate": 9.99352595014576e-05,
"loss": 0.0322,
"step": 6540
},
{
"grad_norm": 0.4244031310081482,
"learning_rate": 9.993441562064354e-05,
"loss": 0.0329,
"step": 6550
},
{
"grad_norm": 0.3944956958293915,
"learning_rate": 9.993356627907685e-05,
"loss": 0.0351,
"step": 6560
},
{
"grad_norm": 0.3432598412036896,
"learning_rate": 9.99327114768504e-05,
"loss": 0.0381,
"step": 6570
},
{
"grad_norm": 0.4182208776473999,
"learning_rate": 9.99318512140577e-05,
"loss": 0.0314,
"step": 6580
},
{
"grad_norm": 0.3147868812084198,
"learning_rate": 9.993098549079284e-05,
"loss": 0.033,
"step": 6590
},
{
"grad_norm": 0.3362464904785156,
"learning_rate": 9.993011430715047e-05,
"loss": 0.0365,
"step": 6600
},
{
"grad_norm": 0.2871691882610321,
"learning_rate": 9.992923766322586e-05,
"loss": 0.0355,
"step": 6610
},
{
"grad_norm": 0.3950765132904053,
"learning_rate": 9.99283555591149e-05,
"loss": 0.0271,
"step": 6620
},
{
"grad_norm": 0.2880813479423523,
"learning_rate": 9.992746799491404e-05,
"loss": 0.0368,
"step": 6630
},
{
"grad_norm": 0.31202730536460876,
"learning_rate": 9.992657497072033e-05,
"loss": 0.0342,
"step": 6640
},
{
"grad_norm": 0.2865760326385498,
"learning_rate": 9.992567648663147e-05,
"loss": 0.0298,
"step": 6650
},
{
"grad_norm": 0.3159760534763336,
"learning_rate": 9.992477254274568e-05,
"loss": 0.0352,
"step": 6660
},
{
"grad_norm": 0.3384312391281128,
"learning_rate": 9.992386313916183e-05,
"loss": 0.0352,
"step": 6670
},
{
"grad_norm": 0.31035810708999634,
"learning_rate": 9.992294827597934e-05,
"loss": 0.032,
"step": 6680
},
{
"grad_norm": 0.29633647203445435,
"learning_rate": 9.992202795329831e-05,
"loss": 0.0334,
"step": 6690
},
{
"grad_norm": 0.3398205637931824,
"learning_rate": 9.992110217121936e-05,
"loss": 0.0324,
"step": 6700
},
{
"grad_norm": 0.37763530015945435,
"learning_rate": 9.992017092984372e-05,
"loss": 0.0332,
"step": 6710
},
{
"grad_norm": 0.26065415143966675,
"learning_rate": 9.991923422927326e-05,
"loss": 0.0313,
"step": 6720
},
{
"grad_norm": 0.3471277356147766,
"learning_rate": 9.991829206961037e-05,
"loss": 0.0302,
"step": 6730
},
{
"grad_norm": 0.3465164601802826,
"learning_rate": 9.991734445095813e-05,
"loss": 0.031,
"step": 6740
},
{
"grad_norm": 0.3133346140384674,
"learning_rate": 9.991639137342015e-05,
"loss": 0.0288,
"step": 6750
},
{
"grad_norm": 0.46403151750564575,
"learning_rate": 9.991543283710064e-05,
"loss": 0.0296,
"step": 6760
},
{
"grad_norm": 0.3120069205760956,
"learning_rate": 9.991446884210445e-05,
"loss": 0.0291,
"step": 6770
},
{
"grad_norm": 0.30843591690063477,
"learning_rate": 9.9913499388537e-05,
"loss": 0.0338,
"step": 6780
},
{
"grad_norm": 0.2730610966682434,
"learning_rate": 9.99125244765043e-05,
"loss": 0.032,
"step": 6790
},
{
"grad_norm": 0.3308141827583313,
"learning_rate": 9.991154410611296e-05,
"loss": 0.0318,
"step": 6800
},
{
"grad_norm": 0.3219098150730133,
"learning_rate": 9.99105582774702e-05,
"loss": 0.0318,
"step": 6810
},
{
"grad_norm": 0.34381741285324097,
"learning_rate": 9.990956699068384e-05,
"loss": 0.0342,
"step": 6820
},
{
"grad_norm": 0.30304059386253357,
"learning_rate": 9.990857024586224e-05,
"loss": 0.0287,
"step": 6830
},
{
"grad_norm": 0.2646781802177429,
"learning_rate": 9.990756804311446e-05,
"loss": 0.0323,
"step": 6840
},
{
"grad_norm": 0.29807525873184204,
"learning_rate": 9.990656038255006e-05,
"loss": 0.0336,
"step": 6850
},
{
"grad_norm": 0.33460119366645813,
"learning_rate": 9.990554726427926e-05,
"loss": 0.0356,
"step": 6860
},
{
"grad_norm": 0.3203860819339752,
"learning_rate": 9.990452868841284e-05,
"loss": 0.0276,
"step": 6870
},
{
"grad_norm": 0.2722104489803314,
"learning_rate": 9.99035046550622e-05,
"loss": 0.0343,
"step": 6880
},
{
"grad_norm": 0.25526782870292664,
"learning_rate": 9.99024751643393e-05,
"loss": 0.0283,
"step": 6890
},
{
"grad_norm": 0.32612594962120056,
"learning_rate": 9.990144021635677e-05,
"loss": 0.028,
"step": 6900
},
{
"grad_norm": 0.39642319083213806,
"learning_rate": 9.990039981122775e-05,
"loss": 0.0286,
"step": 6910
},
{
"grad_norm": 0.32006964087486267,
"learning_rate": 9.989935394906602e-05,
"loss": 0.031,
"step": 6920
},
{
"grad_norm": 0.2804211974143982,
"learning_rate": 9.989830262998598e-05,
"loss": 0.0322,
"step": 6930
},
{
"grad_norm": 0.2989294230937958,
"learning_rate": 9.989724585410259e-05,
"loss": 0.0283,
"step": 6940
},
{
"grad_norm": 0.44074177742004395,
"learning_rate": 9.989618362153139e-05,
"loss": 0.0297,
"step": 6950
},
{
"grad_norm": 0.32931166887283325,
"learning_rate": 9.989511593238859e-05,
"loss": 0.0339,
"step": 6960
},
{
"grad_norm": 0.2922728657722473,
"learning_rate": 9.98940427867909e-05,
"loss": 0.0286,
"step": 6970
},
{
"grad_norm": 0.3765396773815155,
"learning_rate": 9.989296418485573e-05,
"loss": 0.0332,
"step": 6980
},
{
"grad_norm": 0.3679473400115967,
"learning_rate": 9.989188012670101e-05,
"loss": 0.0314,
"step": 6990
},
{
"grad_norm": 0.21550998091697693,
"learning_rate": 9.989079061244528e-05,
"loss": 0.0314,
"step": 7000
},
{
"grad_norm": 0.29178258776664734,
"learning_rate": 9.988969564220769e-05,
"loss": 0.0343,
"step": 7010
},
{
"grad_norm": 0.32072561979293823,
"learning_rate": 9.988859521610801e-05,
"loss": 0.0321,
"step": 7020
},
{
"grad_norm": 0.30814284086227417,
"learning_rate": 9.988748933426656e-05,
"loss": 0.0276,
"step": 7030
},
{
"grad_norm": 0.32844078540802,
"learning_rate": 9.988637799680428e-05,
"loss": 0.0304,
"step": 7040
},
{
"grad_norm": 0.29593050479888916,
"learning_rate": 9.98852612038427e-05,
"loss": 0.0297,
"step": 7050
},
{
"grad_norm": 0.35126709938049316,
"learning_rate": 9.988413895550397e-05,
"loss": 0.0319,
"step": 7060
},
{
"grad_norm": 0.3708306550979614,
"learning_rate": 9.98830112519108e-05,
"loss": 0.0296,
"step": 7070
},
{
"grad_norm": 0.24658213555812836,
"learning_rate": 9.98818780931865e-05,
"loss": 0.0299,
"step": 7080
},
{
"grad_norm": 0.22519266605377197,
"learning_rate": 9.988073947945502e-05,
"loss": 0.0309,
"step": 7090
},
{
"grad_norm": 0.29005545377731323,
"learning_rate": 9.987959541084087e-05,
"loss": 0.0255,
"step": 7100
},
{
"grad_norm": 0.36423617601394653,
"learning_rate": 9.987844588746915e-05,
"loss": 0.0309,
"step": 7110
},
{
"grad_norm": 0.2325596660375595,
"learning_rate": 9.987729090946558e-05,
"loss": 0.0302,
"step": 7120
},
{
"grad_norm": 0.43327176570892334,
"learning_rate": 9.987613047695647e-05,
"loss": 0.0268,
"step": 7130
},
{
"grad_norm": 0.3482636511325836,
"learning_rate": 9.987496459006871e-05,
"loss": 0.0328,
"step": 7140
},
{
"grad_norm": 0.34811919927597046,
"learning_rate": 9.987379324892982e-05,
"loss": 0.0274,
"step": 7150
},
{
"grad_norm": 0.3028563857078552,
"learning_rate": 9.987261645366788e-05,
"loss": 0.0288,
"step": 7160
},
{
"grad_norm": 0.2302742302417755,
"learning_rate": 9.987143420441158e-05,
"loss": 0.0254,
"step": 7170
},
{
"grad_norm": 0.33132651448249817,
"learning_rate": 9.987024650129022e-05,
"loss": 0.0305,
"step": 7180
},
{
"grad_norm": 0.438442200422287,
"learning_rate": 9.986905334443368e-05,
"loss": 0.0289,
"step": 7190
},
{
"grad_norm": 0.26672929525375366,
"learning_rate": 9.986785473397245e-05,
"loss": 0.0296,
"step": 7200
},
{
"grad_norm": 0.2794930636882782,
"learning_rate": 9.98666506700376e-05,
"loss": 0.0293,
"step": 7210
},
{
"grad_norm": 0.3462786078453064,
"learning_rate": 9.986544115276081e-05,
"loss": 0.0293,
"step": 7220
},
{
"grad_norm": 0.27141034603118896,
"learning_rate": 9.986422618227433e-05,
"loss": 0.0312,
"step": 7230
},
{
"grad_norm": 0.3272283971309662,
"learning_rate": 9.986300575871106e-05,
"loss": 0.0263,
"step": 7240
},
{
"grad_norm": 0.4185367524623871,
"learning_rate": 9.986177988220444e-05,
"loss": 0.0255,
"step": 7250
},
{
"grad_norm": 0.29810070991516113,
"learning_rate": 9.986054855288856e-05,
"loss": 0.0272,
"step": 7260
},
{
"grad_norm": 0.3437705934047699,
"learning_rate": 9.985931177089802e-05,
"loss": 0.0285,
"step": 7270
},
{
"grad_norm": 0.32581207156181335,
"learning_rate": 9.985806953636814e-05,
"loss": 0.0254,
"step": 7280
},
{
"grad_norm": 0.28992629051208496,
"learning_rate": 9.985682184943471e-05,
"loss": 0.0291,
"step": 7290
},
{
"grad_norm": 0.2846454679965973,
"learning_rate": 9.98555687102342e-05,
"loss": 0.029,
"step": 7300
},
{
"grad_norm": 0.3287263810634613,
"learning_rate": 9.985431011890367e-05,
"loss": 0.0291,
"step": 7310
},
{
"grad_norm": 0.2946445643901825,
"learning_rate": 9.985304607558075e-05,
"loss": 0.0297,
"step": 7320
},
{
"grad_norm": 0.32676783204078674,
"learning_rate": 9.985177658040364e-05,
"loss": 0.0338,
"step": 7330
},
{
"grad_norm": 0.2921066880226135,
"learning_rate": 9.985050163351119e-05,
"loss": 0.0287,
"step": 7340
},
{
"grad_norm": 0.39218196272850037,
"learning_rate": 9.984922123504286e-05,
"loss": 0.0326,
"step": 7350
},
{
"grad_norm": 0.26967233419418335,
"learning_rate": 9.984793538513862e-05,
"loss": 0.0263,
"step": 7360
},
{
"grad_norm": 0.40346407890319824,
"learning_rate": 9.984664408393912e-05,
"loss": 0.0304,
"step": 7370
},
{
"grad_norm": 0.34608927369117737,
"learning_rate": 9.984534733158556e-05,
"loss": 0.0321,
"step": 7380
},
{
"grad_norm": 0.32998940348625183,
"learning_rate": 9.984404512821977e-05,
"loss": 0.0317,
"step": 7390
},
{
"grad_norm": 0.31583118438720703,
"learning_rate": 9.984273747398411e-05,
"loss": 0.0298,
"step": 7400
},
{
"grad_norm": 0.2871437966823578,
"learning_rate": 9.984142436902165e-05,
"loss": 0.0328,
"step": 7410
},
{
"grad_norm": 0.29042500257492065,
"learning_rate": 9.984010581347596e-05,
"loss": 0.029,
"step": 7420
},
{
"grad_norm": 0.3463146686553955,
"learning_rate": 9.983878180749121e-05,
"loss": 0.0386,
"step": 7430
},
{
"grad_norm": 0.3153989315032959,
"learning_rate": 9.983745235121222e-05,
"loss": 0.0342,
"step": 7440
},
{
"grad_norm": 0.36380794644355774,
"learning_rate": 9.983611744478438e-05,
"loss": 0.033,
"step": 7450
},
{
"grad_norm": 0.3831520080566406,
"learning_rate": 9.983477708835365e-05,
"loss": 0.0322,
"step": 7460
},
{
"grad_norm": 0.322574257850647,
"learning_rate": 9.983343128206664e-05,
"loss": 0.0355,
"step": 7470
},
{
"grad_norm": 0.3847709000110626,
"learning_rate": 9.983208002607049e-05,
"loss": 0.0291,
"step": 7480
},
{
"grad_norm": 0.2655700147151947,
"learning_rate": 9.9830723320513e-05,
"loss": 0.032,
"step": 7490
},
{
"grad_norm": 0.3310350477695465,
"learning_rate": 9.982936116554254e-05,
"loss": 0.0318,
"step": 7500
},
{
"grad_norm": 0.32061687111854553,
"learning_rate": 9.982799356130803e-05,
"loss": 0.027,
"step": 7510
},
{
"grad_norm": 0.28159672021865845,
"learning_rate": 9.982662050795908e-05,
"loss": 0.0262,
"step": 7520
},
{
"grad_norm": 0.3665085434913635,
"learning_rate": 9.982524200564583e-05,
"loss": 0.0312,
"step": 7530
},
{
"grad_norm": 0.30298393964767456,
"learning_rate": 9.982385805451901e-05,
"loss": 0.0293,
"step": 7540
},
{
"grad_norm": 0.3483893871307373,
"learning_rate": 9.982246865472998e-05,
"loss": 0.0287,
"step": 7550
},
{
"grad_norm": 0.38968947529792786,
"learning_rate": 9.982107380643069e-05,
"loss": 0.0319,
"step": 7560
},
{
"grad_norm": 0.2924145758152008,
"learning_rate": 9.981967350977368e-05,
"loss": 0.026,
"step": 7570
},
{
"grad_norm": 0.32797256112098694,
"learning_rate": 9.981826776491208e-05,
"loss": 0.0322,
"step": 7580
},
{
"grad_norm": 0.26752641797065735,
"learning_rate": 9.98168565719996e-05,
"loss": 0.0249,
"step": 7590
},
{
"grad_norm": 0.32022371888160706,
"learning_rate": 9.98154399311906e-05,
"loss": 0.0267,
"step": 7600
},
{
"grad_norm": 0.35032230615615845,
"learning_rate": 9.981401784263997e-05,
"loss": 0.0278,
"step": 7610
},
{
"grad_norm": 0.29520300030708313,
"learning_rate": 9.981259030650326e-05,
"loss": 0.0296,
"step": 7620
},
{
"grad_norm": 0.3218529224395752,
"learning_rate": 9.981115732293655e-05,
"loss": 0.0276,
"step": 7630
},
{
"grad_norm": 0.4104684591293335,
"learning_rate": 9.980971889209659e-05,
"loss": 0.0385,
"step": 7640
},
{
"grad_norm": 0.24052345752716064,
"learning_rate": 9.980827501414064e-05,
"loss": 0.0337,
"step": 7650
},
{
"grad_norm": 0.29084518551826477,
"learning_rate": 9.980682568922663e-05,
"loss": 0.0252,
"step": 7660
},
{
"grad_norm": 0.24746528267860413,
"learning_rate": 9.980537091751304e-05,
"loss": 0.0281,
"step": 7670
},
{
"grad_norm": 0.27151262760162354,
"learning_rate": 9.980391069915897e-05,
"loss": 0.0286,
"step": 7680
},
{
"grad_norm": 0.34403955936431885,
"learning_rate": 9.98024450343241e-05,
"loss": 0.0306,
"step": 7690
},
{
"grad_norm": 0.32236459851264954,
"learning_rate": 9.980097392316872e-05,
"loss": 0.029,
"step": 7700
},
{
"grad_norm": 0.24938872456550598,
"learning_rate": 9.97994973658537e-05,
"loss": 0.0279,
"step": 7710
},
{
"grad_norm": 0.3244172930717468,
"learning_rate": 9.979801536254054e-05,
"loss": 0.0242,
"step": 7720
},
{
"grad_norm": 0.27534356713294983,
"learning_rate": 9.979652791339127e-05,
"loss": 0.0288,
"step": 7730
},
{
"grad_norm": 0.27200597524642944,
"learning_rate": 9.97950350185686e-05,
"loss": 0.0277,
"step": 7740
},
{
"grad_norm": 0.21152366697788239,
"learning_rate": 9.979353667823574e-05,
"loss": 0.0273,
"step": 7750
},
{
"grad_norm": 0.27074524760246277,
"learning_rate": 9.979203289255658e-05,
"loss": 0.0299,
"step": 7760
},
{
"grad_norm": 0.2672007381916046,
"learning_rate": 9.979052366169557e-05,
"loss": 0.0264,
"step": 7770
},
{
"grad_norm": 0.3108447790145874,
"learning_rate": 9.978900898581775e-05,
"loss": 0.029,
"step": 7780
},
{
"grad_norm": 0.2584184408187866,
"learning_rate": 9.978748886508875e-05,
"loss": 0.0259,
"step": 7790
},
{
"grad_norm": 0.32770732045173645,
"learning_rate": 9.978596329967484e-05,
"loss": 0.0303,
"step": 7800
},
{
"grad_norm": 0.26243487000465393,
"learning_rate": 9.978443228974284e-05,
"loss": 0.0302,
"step": 7810
},
{
"grad_norm": 0.35991379618644714,
"learning_rate": 9.978289583546015e-05,
"loss": 0.0262,
"step": 7820
},
{
"grad_norm": 0.23951007425785065,
"learning_rate": 9.978135393699484e-05,
"loss": 0.0257,
"step": 7830
},
{
"grad_norm": 0.2510838210582733,
"learning_rate": 9.977980659451548e-05,
"loss": 0.0271,
"step": 7840
},
{
"grad_norm": 0.3507087826728821,
"learning_rate": 9.977825380819135e-05,
"loss": 0.0231,
"step": 7850
},
{
"grad_norm": 0.2924744188785553,
"learning_rate": 9.97766955781922e-05,
"loss": 0.0252,
"step": 7860
},
{
"grad_norm": 0.2155923694372177,
"learning_rate": 9.977513190468848e-05,
"loss": 0.0252,
"step": 7870
},
{
"grad_norm": 0.21585436165332794,
"learning_rate": 9.977356278785116e-05,
"loss": 0.031,
"step": 7880
},
{
"grad_norm": 0.26633843779563904,
"learning_rate": 9.977198822785184e-05,
"loss": 0.0247,
"step": 7890
},
{
"grad_norm": 0.2820769250392914,
"learning_rate": 9.977040822486273e-05,
"loss": 0.0283,
"step": 7900
},
{
"grad_norm": 0.38163092732429504,
"learning_rate": 9.97688227790566e-05,
"loss": 0.0317,
"step": 7910
},
{
"grad_norm": 0.2948736846446991,
"learning_rate": 9.976723189060684e-05,
"loss": 0.0275,
"step": 7920
},
{
"grad_norm": 0.32506266236305237,
"learning_rate": 9.976563555968742e-05,
"loss": 0.0286,
"step": 7930
},
{
"grad_norm": 0.19829989969730377,
"learning_rate": 9.976403378647292e-05,
"loss": 0.029,
"step": 7940
},
{
"grad_norm": 0.27721431851387024,
"learning_rate": 9.97624265711385e-05,
"loss": 0.0234,
"step": 7950
},
{
"grad_norm": 0.24248407781124115,
"learning_rate": 9.976081391385993e-05,
"loss": 0.0261,
"step": 7960
},
{
"grad_norm": 0.21007753908634186,
"learning_rate": 9.975919581481356e-05,
"loss": 0.0262,
"step": 7970
},
{
"grad_norm": 0.3008323907852173,
"learning_rate": 9.975757227417634e-05,
"loss": 0.0283,
"step": 7980
},
{
"grad_norm": 0.2978932857513428,
"learning_rate": 9.975594329212586e-05,
"loss": 0.0247,
"step": 7990
},
{
"grad_norm": 0.26832762360572815,
"learning_rate": 9.97543088688402e-05,
"loss": 0.0249,
"step": 8000
},
{
"grad_norm": 0.3029903769493103,
"learning_rate": 9.975266900449814e-05,
"loss": 0.0264,
"step": 8010
},
{
"grad_norm": 0.22478513419628143,
"learning_rate": 9.975102369927898e-05,
"loss": 0.0289,
"step": 8020
},
{
"grad_norm": 0.28864771127700806,
"learning_rate": 9.974937295336269e-05,
"loss": 0.0249,
"step": 8030
},
{
"grad_norm": 0.3094823360443115,
"learning_rate": 9.974771676692975e-05,
"loss": 0.0305,
"step": 8040
},
{
"grad_norm": 0.32838520407676697,
"learning_rate": 9.974605514016131e-05,
"loss": 0.0252,
"step": 8050
},
{
"grad_norm": 0.3053399622440338,
"learning_rate": 9.974438807323907e-05,
"loss": 0.0238,
"step": 8060
},
{
"grad_norm": 0.23975878953933716,
"learning_rate": 9.974271556634535e-05,
"loss": 0.027,
"step": 8070
},
{
"grad_norm": 0.2973889410495758,
"learning_rate": 9.974103761966302e-05,
"loss": 0.0277,
"step": 8080
},
{
"grad_norm": 0.32135510444641113,
"learning_rate": 9.973935423337563e-05,
"loss": 0.024,
"step": 8090
},
{
"grad_norm": 0.33895865082740784,
"learning_rate": 9.973766540766722e-05,
"loss": 0.0272,
"step": 8100
},
{
"grad_norm": 0.29293546080589294,
"learning_rate": 9.97359711427225e-05,
"loss": 0.0315,
"step": 8110
},
{
"grad_norm": 0.2531964182853699,
"learning_rate": 9.973427143872677e-05,
"loss": 0.0277,
"step": 8120
},
{
"grad_norm": 0.2604624629020691,
"learning_rate": 9.973256629586589e-05,
"loss": 0.0264,
"step": 8130
},
{
"grad_norm": 0.2898333668708801,
"learning_rate": 9.973085571432632e-05,
"loss": 0.0284,
"step": 8140
},
{
"grad_norm": 0.2965547442436218,
"learning_rate": 9.972913969429513e-05,
"loss": 0.0273,
"step": 8150
},
{
"grad_norm": 0.28335511684417725,
"learning_rate": 9.972741823596e-05,
"loss": 0.0273,
"step": 8160
},
{
"grad_norm": 0.32163968682289124,
"learning_rate": 9.972569133950917e-05,
"loss": 0.0261,
"step": 8170
},
{
"grad_norm": 0.33677390217781067,
"learning_rate": 9.972395900513151e-05,
"loss": 0.0264,
"step": 8180
},
{
"grad_norm": 0.2846772372722626,
"learning_rate": 9.972222123301645e-05,
"loss": 0.0273,
"step": 8190
},
{
"grad_norm": 0.28376877307891846,
"learning_rate": 9.972047802335403e-05,
"loss": 0.0235,
"step": 8200
},
{
"grad_norm": 0.30323314666748047,
"learning_rate": 9.971872937633488e-05,
"loss": 0.0251,
"step": 8210
},
{
"grad_norm": 0.2867453396320343,
"learning_rate": 9.971697529215024e-05,
"loss": 0.024,
"step": 8220
},
{
"grad_norm": 0.37948867678642273,
"learning_rate": 9.971521577099192e-05,
"loss": 0.0253,
"step": 8230
},
{
"grad_norm": 0.30224281549453735,
"learning_rate": 9.971345081305236e-05,
"loss": 0.0243,
"step": 8240
},
{
"grad_norm": 0.2565627992153168,
"learning_rate": 9.971168041852456e-05,
"loss": 0.0236,
"step": 8250
},
{
"grad_norm": 0.21409475803375244,
"learning_rate": 9.970990458760215e-05,
"loss": 0.0254,
"step": 8260
},
{
"grad_norm": 0.34124553203582764,
"learning_rate": 9.970812332047929e-05,
"loss": 0.0295,
"step": 8270
},
{
"grad_norm": 0.23786534368991852,
"learning_rate": 9.97063366173508e-05,
"loss": 0.0276,
"step": 8280
},
{
"grad_norm": 0.19712774455547333,
"learning_rate": 9.970454447841207e-05,
"loss": 0.0261,
"step": 8290
},
{
"grad_norm": 0.21682223677635193,
"learning_rate": 9.970274690385909e-05,
"loss": 0.0231,
"step": 8300
},
{
"grad_norm": 0.36284199357032776,
"learning_rate": 9.970094389388844e-05,
"loss": 0.027,
"step": 8310
},
{
"grad_norm": 0.27997058629989624,
"learning_rate": 9.969913544869728e-05,
"loss": 0.0263,
"step": 8320
},
{
"grad_norm": 0.21517546474933624,
"learning_rate": 9.96973215684834e-05,
"loss": 0.0247,
"step": 8330
},
{
"grad_norm": 0.367024689912796,
"learning_rate": 9.969550225344513e-05,
"loss": 0.0251,
"step": 8340
},
{
"grad_norm": 0.3357943594455719,
"learning_rate": 9.969367750378147e-05,
"loss": 0.0262,
"step": 8350
},
{
"grad_norm": 0.3117486536502838,
"learning_rate": 9.969184731969194e-05,
"loss": 0.0238,
"step": 8360
},
{
"grad_norm": 0.21269969642162323,
"learning_rate": 9.96900117013767e-05,
"loss": 0.0246,
"step": 8370
},
{
"grad_norm": 0.33218929171562195,
"learning_rate": 9.96881706490365e-05,
"loss": 0.0256,
"step": 8380
},
{
"grad_norm": 0.2647198736667633,
"learning_rate": 9.968632416287265e-05,
"loss": 0.0239,
"step": 8390
},
{
"grad_norm": 0.31344446539878845,
"learning_rate": 9.96844722430871e-05,
"loss": 0.0241,
"step": 8400
},
{
"grad_norm": 0.33891257643699646,
"learning_rate": 9.968261488988235e-05,
"loss": 0.0252,
"step": 8410
},
{
"grad_norm": 0.304499089717865,
"learning_rate": 9.968075210346155e-05,
"loss": 0.0253,
"step": 8420
},
{
"grad_norm": 0.292344868183136,
"learning_rate": 9.967888388402839e-05,
"loss": 0.0231,
"step": 8430
},
{
"grad_norm": 0.23222769796848297,
"learning_rate": 9.967701023178717e-05,
"loss": 0.0267,
"step": 8440
},
{
"grad_norm": 0.22055861353874207,
"learning_rate": 9.967513114694282e-05,
"loss": 0.0263,
"step": 8450
},
{
"grad_norm": 0.32641440629959106,
"learning_rate": 9.967324662970079e-05,
"loss": 0.0288,
"step": 8460
},
{
"grad_norm": 0.28375282883644104,
"learning_rate": 9.96713566802672e-05,
"loss": 0.0286,
"step": 8470
},
{
"grad_norm": 0.24904537200927734,
"learning_rate": 9.966946129884873e-05,
"loss": 0.0254,
"step": 8480
},
{
"grad_norm": 0.30935221910476685,
"learning_rate": 9.966756048565265e-05,
"loss": 0.0251,
"step": 8490
},
{
"grad_norm": 0.35785824060440063,
"learning_rate": 9.966565424088681e-05,
"loss": 0.0268,
"step": 8500
},
{
"grad_norm": 0.2504684329032898,
"learning_rate": 9.96637425647597e-05,
"loss": 0.0264,
"step": 8510
},
{
"grad_norm": 0.32707479596138,
"learning_rate": 9.966182545748038e-05,
"loss": 0.0277,
"step": 8520
},
{
"grad_norm": 0.22163079679012299,
"learning_rate": 9.96599029192585e-05,
"loss": 0.0294,
"step": 8530
},
{
"grad_norm": 0.34566739201545715,
"learning_rate": 9.965797495030428e-05,
"loss": 0.0294,
"step": 8540
},
{
"grad_norm": 0.23257708549499512,
"learning_rate": 9.96560415508286e-05,
"loss": 0.0258,
"step": 8550
},
{
"grad_norm": 0.3470739722251892,
"learning_rate": 9.965410272104286e-05,
"loss": 0.0279,
"step": 8560
},
{
"grad_norm": 0.24112451076507568,
"learning_rate": 9.96521584611591e-05,
"loss": 0.0223,
"step": 8570
},
{
"grad_norm": 0.3075414299964905,
"learning_rate": 9.965020877138994e-05,
"loss": 0.0263,
"step": 8580
},
{
"grad_norm": 0.28961965441703796,
"learning_rate": 9.964825365194861e-05,
"loss": 0.0255,
"step": 8590
},
{
"grad_norm": 0.2600545883178711,
"learning_rate": 9.96462931030489e-05,
"loss": 0.0235,
"step": 8600
},
{
"grad_norm": 0.3363104462623596,
"learning_rate": 9.96443271249052e-05,
"loss": 0.0247,
"step": 8610
},
{
"grad_norm": 0.33082354068756104,
"learning_rate": 9.964235571773255e-05,
"loss": 0.0262,
"step": 8620
},
{
"grad_norm": 0.27957209944725037,
"learning_rate": 9.96403788817465e-05,
"loss": 0.0211,
"step": 8630
},
{
"grad_norm": 0.2972489595413208,
"learning_rate": 9.963839661716325e-05,
"loss": 0.0243,
"step": 8640
},
{
"grad_norm": 0.2208458036184311,
"learning_rate": 9.963640892419958e-05,
"loss": 0.0266,
"step": 8650
},
{
"grad_norm": 0.3168970048427582,
"learning_rate": 9.963441580307286e-05,
"loss": 0.0271,
"step": 8660
},
{
"grad_norm": 0.24892112612724304,
"learning_rate": 9.963241725400104e-05,
"loss": 0.0282,
"step": 8670
},
{
"grad_norm": 0.2974965572357178,
"learning_rate": 9.963041327720271e-05,
"loss": 0.0262,
"step": 8680
},
{
"grad_norm": 0.3146728575229645,
"learning_rate": 9.962840387289697e-05,
"loss": 0.0259,
"step": 8690
},
{
"grad_norm": 0.3100884258747101,
"learning_rate": 9.962638904130363e-05,
"loss": 0.0234,
"step": 8700
},
{
"grad_norm": 0.2764372229576111,
"learning_rate": 9.962436878264298e-05,
"loss": 0.0228,
"step": 8710
},
{
"grad_norm": 0.294094055891037,
"learning_rate": 9.962234309713598e-05,
"loss": 0.0211,
"step": 8720
},
{
"grad_norm": 0.2615277171134949,
"learning_rate": 9.962031198500414e-05,
"loss": 0.0278,
"step": 8730
},
{
"grad_norm": 0.20549480617046356,
"learning_rate": 9.961827544646958e-05,
"loss": 0.0231,
"step": 8740
},
{
"grad_norm": 0.269010066986084,
"learning_rate": 9.961623348175501e-05,
"loss": 0.0279,
"step": 8750
},
{
"grad_norm": 0.3127162754535675,
"learning_rate": 9.961418609108377e-05,
"loss": 0.0237,
"step": 8760
},
{
"grad_norm": 0.2381407767534256,
"learning_rate": 9.961213327467971e-05,
"loss": 0.0212,
"step": 8770
},
{
"grad_norm": 0.26799631118774414,
"learning_rate": 9.961007503276736e-05,
"loss": 0.0237,
"step": 8780
},
{
"grad_norm": 0.24201858043670654,
"learning_rate": 9.960801136557179e-05,
"loss": 0.0218,
"step": 8790
},
{
"grad_norm": 0.3450324535369873,
"learning_rate": 9.960594227331866e-05,
"loss": 0.028,
"step": 8800
},
{
"grad_norm": 0.2894059121608734,
"learning_rate": 9.960386775623429e-05,
"loss": 0.0252,
"step": 8810
},
{
"grad_norm": 0.31548407673835754,
"learning_rate": 9.96017878145455e-05,
"loss": 0.0272,
"step": 8820
},
{
"grad_norm": 0.32599905133247375,
"learning_rate": 9.959970244847977e-05,
"loss": 0.0299,
"step": 8830
},
{
"grad_norm": 0.22628341615200043,
"learning_rate": 9.959761165826518e-05,
"loss": 0.0276,
"step": 8840
},
{
"grad_norm": 0.27729442715644836,
"learning_rate": 9.959551544413033e-05,
"loss": 0.0292,
"step": 8850
},
{
"grad_norm": 0.3411165773868561,
"learning_rate": 9.959341380630448e-05,
"loss": 0.0249,
"step": 8860
},
{
"grad_norm": 0.2997261583805084,
"learning_rate": 9.959130674501746e-05,
"loss": 0.0253,
"step": 8870
},
{
"grad_norm": 0.3570755124092102,
"learning_rate": 9.958919426049968e-05,
"loss": 0.0232,
"step": 8880
},
{
"grad_norm": 0.3273327052593231,
"learning_rate": 9.958707635298219e-05,
"loss": 0.0272,
"step": 8890
},
{
"grad_norm": 0.30449461936950684,
"learning_rate": 9.958495302269657e-05,
"loss": 0.0212,
"step": 8900
},
{
"grad_norm": 0.2833212912082672,
"learning_rate": 9.958282426987503e-05,
"loss": 0.0264,
"step": 8910
},
{
"grad_norm": 0.22627991437911987,
"learning_rate": 9.95806900947504e-05,
"loss": 0.0229,
"step": 8920
},
{
"grad_norm": 0.25467512011528015,
"learning_rate": 9.957855049755604e-05,
"loss": 0.0225,
"step": 8930
},
{
"grad_norm": 0.3754052221775055,
"learning_rate": 9.957640547852593e-05,
"loss": 0.0221,
"step": 8940
},
{
"grad_norm": 0.400193452835083,
"learning_rate": 9.957425503789466e-05,
"loss": 0.0266,
"step": 8950
},
{
"grad_norm": 0.2970482409000397,
"learning_rate": 9.957209917589738e-05,
"loss": 0.0281,
"step": 8960
},
{
"grad_norm": 0.22289486229419708,
"learning_rate": 9.956993789276987e-05,
"loss": 0.0247,
"step": 8970
},
{
"grad_norm": 0.21260538697242737,
"learning_rate": 9.956777118874847e-05,
"loss": 0.0254,
"step": 8980
},
{
"grad_norm": 0.28637486696243286,
"learning_rate": 9.956559906407016e-05,
"loss": 0.026,
"step": 8990
},
{
"grad_norm": 0.2537452280521393,
"learning_rate": 9.956342151897245e-05,
"loss": 0.0281,
"step": 9000
},
{
"grad_norm": 0.21656367182731628,
"learning_rate": 9.956123855369346e-05,
"loss": 0.0277,
"step": 9010
},
{
"grad_norm": 0.2879554331302643,
"learning_rate": 9.955905016847196e-05,
"loss": 0.0221,
"step": 9020
},
{
"grad_norm": 0.3109210133552551,
"learning_rate": 9.955685636354723e-05,
"loss": 0.0283,
"step": 9030
},
{
"grad_norm": 0.303078830242157,
"learning_rate": 9.95546571391592e-05,
"loss": 0.0281,
"step": 9040
},
{
"grad_norm": 0.27327966690063477,
"learning_rate": 9.955245249554837e-05,
"loss": 0.0235,
"step": 9050
},
{
"grad_norm": 0.2259787619113922,
"learning_rate": 9.955024243295582e-05,
"loss": 0.0237,
"step": 9060
},
{
"grad_norm": 0.2695671319961548,
"learning_rate": 9.954802695162328e-05,
"loss": 0.0248,
"step": 9070
},
{
"grad_norm": 0.22920946776866913,
"learning_rate": 9.954580605179302e-05,
"loss": 0.0254,
"step": 9080
},
{
"grad_norm": 0.24427971243858337,
"learning_rate": 9.954357973370788e-05,
"loss": 0.0257,
"step": 9090
},
{
"grad_norm": 0.2971542775630951,
"learning_rate": 9.954134799761135e-05,
"loss": 0.0241,
"step": 9100
},
{
"grad_norm": 0.22257231175899506,
"learning_rate": 9.953911084374748e-05,
"loss": 0.0248,
"step": 9110
},
{
"grad_norm": 0.25738465785980225,
"learning_rate": 9.953686827236093e-05,
"loss": 0.0235,
"step": 9120
},
{
"grad_norm": 0.3056095540523529,
"learning_rate": 9.953462028369695e-05,
"loss": 0.0243,
"step": 9130
},
{
"grad_norm": 0.29317033290863037,
"learning_rate": 9.953236687800136e-05,
"loss": 0.0256,
"step": 9140
},
{
"grad_norm": 0.35253703594207764,
"learning_rate": 9.95301080555206e-05,
"loss": 0.0226,
"step": 9150
},
{
"grad_norm": 0.3137780725955963,
"learning_rate": 9.952784381650171e-05,
"loss": 0.0245,
"step": 9160
},
{
"grad_norm": 0.3835451006889343,
"learning_rate": 9.952557416119226e-05,
"loss": 0.0278,
"step": 9170
},
{
"grad_norm": 0.3224237859249115,
"learning_rate": 9.95232990898405e-05,
"loss": 0.023,
"step": 9180
},
{
"grad_norm": 0.30361151695251465,
"learning_rate": 9.95210186026952e-05,
"loss": 0.0252,
"step": 9190
},
{
"grad_norm": 0.21164710819721222,
"learning_rate": 9.951873270000576e-05,
"loss": 0.0255,
"step": 9200
},
{
"grad_norm": 0.3063027560710907,
"learning_rate": 9.951644138202216e-05,
"loss": 0.0257,
"step": 9210
},
{
"grad_norm": 0.35758069157600403,
"learning_rate": 9.951414464899498e-05,
"loss": 0.0264,
"step": 9220
},
{
"grad_norm": 0.25073498487472534,
"learning_rate": 9.951184250117538e-05,
"loss": 0.0226,
"step": 9230
},
{
"grad_norm": 0.2150561809539795,
"learning_rate": 9.950953493881513e-05,
"loss": 0.0242,
"step": 9240
},
{
"grad_norm": 0.2752856910228729,
"learning_rate": 9.950722196216658e-05,
"loss": 0.0225,
"step": 9250
},
{
"grad_norm": 0.34514814615249634,
"learning_rate": 9.950490357148265e-05,
"loss": 0.0228,
"step": 9260
},
{
"grad_norm": 0.27709946036338806,
"learning_rate": 9.950257976701692e-05,
"loss": 0.0242,
"step": 9270
},
{
"grad_norm": 0.2694006860256195,
"learning_rate": 9.950025054902348e-05,
"loss": 0.0244,
"step": 9280
},
{
"grad_norm": 0.2841184139251709,
"learning_rate": 9.949791591775706e-05,
"loss": 0.0245,
"step": 9290
},
{
"grad_norm": 0.3109279274940491,
"learning_rate": 9.949557587347298e-05,
"loss": 0.0245,
"step": 9300
},
{
"grad_norm": 0.2491903007030487,
"learning_rate": 9.949323041642713e-05,
"loss": 0.0228,
"step": 9310
},
{
"grad_norm": 0.27171915769577026,
"learning_rate": 9.949087954687602e-05,
"loss": 0.0205,
"step": 9320
},
{
"grad_norm": 0.25222888588905334,
"learning_rate": 9.948852326507672e-05,
"loss": 0.0223,
"step": 9330
},
{
"grad_norm": 0.2523558735847473,
"learning_rate": 9.948616157128694e-05,
"loss": 0.0209,
"step": 9340
},
{
"grad_norm": 0.2467038780450821,
"learning_rate": 9.948379446576493e-05,
"loss": 0.0236,
"step": 9350
},
{
"grad_norm": 0.23207561671733856,
"learning_rate": 9.948142194876952e-05,
"loss": 0.0214,
"step": 9360
},
{
"grad_norm": 0.23598060011863708,
"learning_rate": 9.947904402056024e-05,
"loss": 0.0189,
"step": 9370
},
{
"grad_norm": 0.27153506875038147,
"learning_rate": 9.947666068139708e-05,
"loss": 0.0227,
"step": 9380
},
{
"grad_norm": 0.22271192073822021,
"learning_rate": 9.947427193154071e-05,
"loss": 0.0204,
"step": 9390
},
{
"grad_norm": 0.23430928587913513,
"learning_rate": 9.947187777125233e-05,
"loss": 0.0234,
"step": 9400
},
{
"grad_norm": 0.265509694814682,
"learning_rate": 9.946947820079377e-05,
"loss": 0.0191,
"step": 9410
},
{
"grad_norm": 0.26050254702568054,
"learning_rate": 9.946707322042747e-05,
"loss": 0.0236,
"step": 9420
},
{
"grad_norm": 0.2333969622850418,
"learning_rate": 9.94646628304164e-05,
"loss": 0.0243,
"step": 9430
},
{
"grad_norm": 0.2169397622346878,
"learning_rate": 9.946224703102418e-05,
"loss": 0.0219,
"step": 9440
},
{
"grad_norm": 0.2744732201099396,
"learning_rate": 9.945982582251498e-05,
"loss": 0.0233,
"step": 9450
},
{
"grad_norm": 0.3101717233657837,
"learning_rate": 9.94573992051536e-05,
"loss": 0.0206,
"step": 9460
},
{
"grad_norm": 0.2404516339302063,
"learning_rate": 9.94549671792054e-05,
"loss": 0.021,
"step": 9470
},
{
"grad_norm": 0.25397542119026184,
"learning_rate": 9.945252974493635e-05,
"loss": 0.0228,
"step": 9480
},
{
"grad_norm": 0.31194770336151123,
"learning_rate": 9.9450086902613e-05,
"loss": 0.0229,
"step": 9490
},
{
"grad_norm": 0.23232460021972656,
"learning_rate": 9.944763865250248e-05,
"loss": 0.0227,
"step": 9500
},
{
"grad_norm": 0.2093714326620102,
"learning_rate": 9.944518499487254e-05,
"loss": 0.0239,
"step": 9510
},
{
"grad_norm": 0.2016279250383377,
"learning_rate": 9.944272592999151e-05,
"loss": 0.0193,
"step": 9520
},
{
"grad_norm": 0.20296460390090942,
"learning_rate": 9.94402614581283e-05,
"loss": 0.0168,
"step": 9530
},
{
"grad_norm": 0.21995308995246887,
"learning_rate": 9.943779157955244e-05,
"loss": 0.0194,
"step": 9540
},
{
"grad_norm": 0.2858385741710663,
"learning_rate": 9.943531629453403e-05,
"loss": 0.0269,
"step": 9550
},
{
"grad_norm": 0.33944380283355713,
"learning_rate": 9.943283560334375e-05,
"loss": 0.023,
"step": 9560
},
{
"grad_norm": 0.22793933749198914,
"learning_rate": 9.943034950625288e-05,
"loss": 0.0229,
"step": 9570
},
{
"grad_norm": 0.2835240960121155,
"learning_rate": 9.942785800353332e-05,
"loss": 0.025,
"step": 9580
},
{
"grad_norm": 0.2578336298465729,
"learning_rate": 9.942536109545751e-05,
"loss": 0.025,
"step": 9590
},
{
"grad_norm": 0.2056572139263153,
"learning_rate": 9.942285878229853e-05,
"loss": 0.02,
"step": 9600
},
{
"grad_norm": 0.24234223365783691,
"learning_rate": 9.942035106433001e-05,
"loss": 0.0239,
"step": 9610
},
{
"grad_norm": 0.24196766316890717,
"learning_rate": 9.94178379418262e-05,
"loss": 0.0184,
"step": 9620
},
{
"grad_norm": 0.23192943632602692,
"learning_rate": 9.941531941506194e-05,
"loss": 0.0197,
"step": 9630
},
{
"grad_norm": 0.22634382545948029,
"learning_rate": 9.941279548431263e-05,
"loss": 0.0247,
"step": 9640
},
{
"grad_norm": 0.2701261043548584,
"learning_rate": 9.941026614985431e-05,
"loss": 0.0253,
"step": 9650
},
{
"grad_norm": 0.22548985481262207,
"learning_rate": 9.940773141196357e-05,
"loss": 0.0237,
"step": 9660
},
{
"grad_norm": 0.27719661593437195,
"learning_rate": 9.94051912709176e-05,
"loss": 0.0195,
"step": 9670
},
{
"grad_norm": 0.2582361102104187,
"learning_rate": 9.940264572699421e-05,
"loss": 0.0187,
"step": 9680
},
{
"grad_norm": 0.22791557013988495,
"learning_rate": 9.940009478047174e-05,
"loss": 0.0246,
"step": 9690
},
{
"grad_norm": 0.28019043803215027,
"learning_rate": 9.939753843162918e-05,
"loss": 0.0219,
"step": 9700
},
{
"grad_norm": 0.311816930770874,
"learning_rate": 9.939497668074609e-05,
"loss": 0.023,
"step": 9710
},
{
"grad_norm": 0.21537867188453674,
"learning_rate": 9.93924095281026e-05,
"loss": 0.0217,
"step": 9720
},
{
"grad_norm": 0.3489777147769928,
"learning_rate": 9.938983697397948e-05,
"loss": 0.0215,
"step": 9730
},
{
"grad_norm": 0.30278676748275757,
"learning_rate": 9.938725901865805e-05,
"loss": 0.0222,
"step": 9740
},
{
"grad_norm": 0.3652346134185791,
"learning_rate": 9.93846756624202e-05,
"loss": 0.0257,
"step": 9750
},
{
"grad_norm": 0.2883793115615845,
"learning_rate": 9.938208690554849e-05,
"loss": 0.0223,
"step": 9760
},
{
"grad_norm": 0.29903727769851685,
"learning_rate": 9.9379492748326e-05,
"loss": 0.0213,
"step": 9770
},
{
"grad_norm": 0.24020390212535858,
"learning_rate": 9.937689319103641e-05,
"loss": 0.0197,
"step": 9780
},
{
"grad_norm": 0.2210279256105423,
"learning_rate": 9.937428823396404e-05,
"loss": 0.021,
"step": 9790
},
{
"grad_norm": 0.25513386726379395,
"learning_rate": 9.937167787739372e-05,
"loss": 0.021,
"step": 9800
},
{
"grad_norm": 0.2967779040336609,
"learning_rate": 9.936906212161095e-05,
"loss": 0.0299,
"step": 9810
},
{
"grad_norm": 0.28899675607681274,
"learning_rate": 9.936644096690176e-05,
"loss": 0.0238,
"step": 9820
},
{
"grad_norm": 0.2674974799156189,
"learning_rate": 9.936381441355282e-05,
"loss": 0.0258,
"step": 9830
},
{
"grad_norm": 0.15966667234897614,
"learning_rate": 9.936118246185136e-05,
"loss": 0.0239,
"step": 9840
},
{
"grad_norm": 0.34273138642311096,
"learning_rate": 9.935854511208518e-05,
"loss": 0.0249,
"step": 9850
},
{
"grad_norm": 0.26348111033439636,
"learning_rate": 9.935590236454272e-05,
"loss": 0.0227,
"step": 9860
},
{
"grad_norm": 0.19656908512115479,
"learning_rate": 9.935325421951298e-05,
"loss": 0.0182,
"step": 9870
},
{
"grad_norm": 0.2591570019721985,
"learning_rate": 9.935060067728557e-05,
"loss": 0.02,
"step": 9880
},
{
"grad_norm": 0.24237704277038574,
"learning_rate": 9.934794173815067e-05,
"loss": 0.0205,
"step": 9890
},
{
"grad_norm": 0.281578928232193,
"learning_rate": 9.934527740239906e-05,
"loss": 0.0216,
"step": 9900
},
{
"grad_norm": 0.237837016582489,
"learning_rate": 9.934260767032209e-05,
"loss": 0.0218,
"step": 9910
},
{
"grad_norm": 0.25344088673591614,
"learning_rate": 9.933993254221172e-05,
"loss": 0.0209,
"step": 9920
},
{
"grad_norm": 0.19064785540103912,
"learning_rate": 9.933725201836053e-05,
"loss": 0.0214,
"step": 9930
},
{
"grad_norm": 0.17528073489665985,
"learning_rate": 9.933456609906162e-05,
"loss": 0.0177,
"step": 9940
},
{
"grad_norm": 0.2770511507987976,
"learning_rate": 9.933187478460875e-05,
"loss": 0.0258,
"step": 9950
},
{
"grad_norm": 0.25209683179855347,
"learning_rate": 9.93291780752962e-05,
"loss": 0.0226,
"step": 9960
},
{
"grad_norm": 0.19165830314159393,
"learning_rate": 9.932647597141893e-05,
"loss": 0.0208,
"step": 9970
},
{
"grad_norm": 0.2270399034023285,
"learning_rate": 9.932376847327239e-05,
"loss": 0.0164,
"step": 9980
},
{
"grad_norm": 0.2793896198272705,
"learning_rate": 9.932105558115268e-05,
"loss": 0.0212,
"step": 9990
},
{
"grad_norm": 0.27653828263282776,
"learning_rate": 9.931833729535651e-05,
"loss": 0.0225,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}