starcoder-1b-finetuned-ds-3 / trainer_state.json
rohitc33's picture
Upload folder using huggingface_hub
0ea5f15 verified
{
"best_metric": 0.35205078125,
"best_model_checkpoint": "./results/checkpoint-7094",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 10641,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.7654,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.862,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 146.36835642526796,
"learning_rate": 4.8e-06,
"loss": 2.4598,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 254.1399327894623,
"learning_rate": 1.02e-05,
"loss": 3.048,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 55.794104847848445,
"learning_rate": 1.6199999999999997e-05,
"loss": 2.4278,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 29.62062990171776,
"learning_rate": 2.2199999999999998e-05,
"loss": 0.9157,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 36.90337952409684,
"learning_rate": 2.8199999999999998e-05,
"loss": 1.0977,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 70.28513966911416,
"learning_rate": 3.42e-05,
"loss": 1.4851,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 76.38058442522734,
"learning_rate": 4.02e-05,
"loss": 0.9004,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 215.04691118737435,
"learning_rate": 4.62e-05,
"loss": 1.3784,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 156.18636353053705,
"learning_rate": 5.2199999999999995e-05,
"loss": 1.9046,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 83.58499534326243,
"learning_rate": 5.82e-05,
"loss": 1.8243,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 58.89968090715743,
"learning_rate": 6.419999999999999e-05,
"loss": 3.5648,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 279.1237096487612,
"learning_rate": 7.02e-05,
"loss": 1.3321,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 14.282470404736001,
"learning_rate": 7.62e-05,
"loss": 0.7693,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 21.416320260069092,
"learning_rate": 8.22e-05,
"loss": 0.7282,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 7.027757159631835,
"learning_rate": 8.819999999999999e-05,
"loss": 0.8376,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 28.808627534490917,
"learning_rate": 9.419999999999999e-05,
"loss": 0.8771,
"step": 180
},
{
"epoch": 0.05,
"grad_norm": 55.14542562374617,
"learning_rate": 0.0001002,
"loss": 1.2092,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 12.485175678803063,
"learning_rate": 0.00010619999999999998,
"loss": 0.7898,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 30.98880210639734,
"learning_rate": 0.00011219999999999999,
"loss": 1.1421,
"step": 210
},
{
"epoch": 0.06,
"grad_norm": 34.4471000818379,
"learning_rate": 0.0001182,
"loss": 0.755,
"step": 220
},
{
"epoch": 0.06,
"grad_norm": 25.20646584085208,
"learning_rate": 0.00012419999999999998,
"loss": 0.8171,
"step": 230
},
{
"epoch": 0.07,
"grad_norm": 57.11988442886105,
"learning_rate": 0.0001302,
"loss": 0.9672,
"step": 240
},
{
"epoch": 0.07,
"grad_norm": 31.266794385874547,
"learning_rate": 0.0001362,
"loss": 0.863,
"step": 250
},
{
"epoch": 0.07,
"grad_norm": 28.801620787803333,
"learning_rate": 0.0001422,
"loss": 0.9292,
"step": 260
},
{
"epoch": 0.08,
"grad_norm": 18.379715667965055,
"learning_rate": 0.0001482,
"loss": 0.6885,
"step": 270
},
{
"epoch": 0.08,
"grad_norm": 26.615184398415803,
"learning_rate": 0.00015419999999999998,
"loss": 0.8698,
"step": 280
},
{
"epoch": 0.08,
"grad_norm": 27.84018584708001,
"learning_rate": 0.0001602,
"loss": 0.7403,
"step": 290
},
{
"epoch": 0.08,
"grad_norm": 83.6950577392233,
"learning_rate": 0.0001662,
"loss": 1.7649,
"step": 300
},
{
"epoch": 0.09,
"grad_norm": 62.62507175586115,
"learning_rate": 0.00017219999999999998,
"loss": 1.5992,
"step": 310
},
{
"epoch": 0.09,
"grad_norm": 34.83362360351182,
"learning_rate": 0.00017819999999999997,
"loss": 3.7618,
"step": 320
},
{
"epoch": 0.09,
"grad_norm": 0.1566836386456626,
"learning_rate": 0.00018419999999999998,
"loss": 1.1883,
"step": 330
},
{
"epoch": 0.1,
"grad_norm": 44.53868085198857,
"learning_rate": 0.0001902,
"loss": 1.5033,
"step": 340
},
{
"epoch": 0.1,
"grad_norm": 16.47571390018737,
"learning_rate": 0.0001962,
"loss": 0.7708,
"step": 350
},
{
"epoch": 0.1,
"grad_norm": 90.50256233776733,
"learning_rate": 0.0002022,
"loss": 1.0146,
"step": 360
},
{
"epoch": 0.1,
"grad_norm": 16.535117663656802,
"learning_rate": 0.00020819999999999996,
"loss": 0.673,
"step": 370
},
{
"epoch": 0.11,
"grad_norm": 41.013763949361135,
"learning_rate": 0.00021419999999999998,
"loss": 0.7536,
"step": 380
},
{
"epoch": 0.11,
"grad_norm": 50.83635956076198,
"learning_rate": 0.00022019999999999999,
"loss": 0.8832,
"step": 390
},
{
"epoch": 0.11,
"grad_norm": 51.192653868723845,
"learning_rate": 0.00022619999999999997,
"loss": 0.756,
"step": 400
},
{
"epoch": 0.12,
"grad_norm": 95.02104619699388,
"learning_rate": 0.00023219999999999998,
"loss": 1.0475,
"step": 410
},
{
"epoch": 0.12,
"grad_norm": 14.755367161274899,
"learning_rate": 0.0002382,
"loss": 0.8114,
"step": 420
},
{
"epoch": 0.12,
"grad_norm": 26.824869427969787,
"learning_rate": 0.00024419999999999997,
"loss": 0.7827,
"step": 430
},
{
"epoch": 0.12,
"grad_norm": 10.199008591764807,
"learning_rate": 0.00025019999999999996,
"loss": 0.7201,
"step": 440
},
{
"epoch": 0.13,
"grad_norm": 16.099046337033606,
"learning_rate": 0.0002562,
"loss": 0.7852,
"step": 450
},
{
"epoch": 0.13,
"grad_norm": 72.84156691472333,
"learning_rate": 0.0002622,
"loss": 0.7819,
"step": 460
},
{
"epoch": 0.13,
"grad_norm": 23.060057287801556,
"learning_rate": 0.00026819999999999996,
"loss": 1.1294,
"step": 470
},
{
"epoch": 0.14,
"grad_norm": 18.372892721573056,
"learning_rate": 0.0002742,
"loss": 0.8509,
"step": 480
},
{
"epoch": 0.14,
"grad_norm": 44.80535522734965,
"learning_rate": 0.0002802,
"loss": 1.3075,
"step": 490
},
{
"epoch": 0.14,
"grad_norm": 53.039346895060866,
"learning_rate": 0.00028619999999999996,
"loss": 0.8981,
"step": 500
},
{
"epoch": 0.14,
"grad_norm": 8.933950393723551,
"learning_rate": 0.00029219999999999995,
"loss": 0.8596,
"step": 510
},
{
"epoch": 0.15,
"grad_norm": 54.86618147368649,
"learning_rate": 0.0002982,
"loss": 0.8419,
"step": 520
},
{
"epoch": 0.15,
"grad_norm": 52.0156886597986,
"learning_rate": 0.00029984658094681473,
"loss": 1.0992,
"step": 530
},
{
"epoch": 0.15,
"grad_norm": 48.61185102339785,
"learning_rate": 0.00029962741087083574,
"loss": 0.8373,
"step": 540
},
{
"epoch": 0.16,
"grad_norm": 85.88381074144633,
"learning_rate": 0.0002994082407948568,
"loss": 0.9952,
"step": 550
},
{
"epoch": 0.16,
"grad_norm": 46.828573117866156,
"learning_rate": 0.0002991890707188778,
"loss": 1.2285,
"step": 560
},
{
"epoch": 0.16,
"grad_norm": 4.319911680589826,
"learning_rate": 0.00029896990064289886,
"loss": 1.0784,
"step": 570
},
{
"epoch": 0.16,
"grad_norm": 49.18948488974743,
"learning_rate": 0.0002987507305669199,
"loss": 0.6879,
"step": 580
},
{
"epoch": 0.17,
"grad_norm": 14.790075292508273,
"learning_rate": 0.0002985315604909409,
"loss": 0.7182,
"step": 590
},
{
"epoch": 0.17,
"grad_norm": 81.77111956641443,
"learning_rate": 0.000298312390414962,
"loss": 0.6341,
"step": 600
},
{
"epoch": 0.17,
"grad_norm": 5.015074610499661,
"learning_rate": 0.00029809322033898304,
"loss": 0.8418,
"step": 610
},
{
"epoch": 0.17,
"grad_norm": 43.326414318341016,
"learning_rate": 0.00029787405026300405,
"loss": 0.7158,
"step": 620
},
{
"epoch": 0.18,
"grad_norm": 21.749351652802584,
"learning_rate": 0.0002976548801870251,
"loss": 0.5541,
"step": 630
},
{
"epoch": 0.18,
"grad_norm": 104.8554633037212,
"learning_rate": 0.00029743571011104616,
"loss": 0.7715,
"step": 640
},
{
"epoch": 0.18,
"grad_norm": 53.28037677060509,
"learning_rate": 0.00029721654003506717,
"loss": 0.7089,
"step": 650
},
{
"epoch": 0.19,
"grad_norm": 60.68620976609669,
"learning_rate": 0.00029699736995908823,
"loss": 0.7251,
"step": 660
},
{
"epoch": 0.19,
"grad_norm": 83.67598838205309,
"learning_rate": 0.0002967781998831093,
"loss": 0.9626,
"step": 670
},
{
"epoch": 0.19,
"grad_norm": 22.217129800838155,
"learning_rate": 0.0002965590298071303,
"loss": 0.7762,
"step": 680
},
{
"epoch": 0.19,
"grad_norm": 86.4481164773122,
"learning_rate": 0.00029633985973115135,
"loss": 0.6535,
"step": 690
},
{
"epoch": 0.2,
"grad_norm": 40.75531136163561,
"learning_rate": 0.00029612068965517236,
"loss": 0.6025,
"step": 700
},
{
"epoch": 0.2,
"grad_norm": 56.971296704966576,
"learning_rate": 0.0002959015195791934,
"loss": 0.7087,
"step": 710
},
{
"epoch": 0.2,
"grad_norm": 30.278786860468646,
"learning_rate": 0.0002956823495032145,
"loss": 0.8943,
"step": 720
},
{
"epoch": 0.21,
"grad_norm": 14.890198931647719,
"learning_rate": 0.0002954631794272355,
"loss": 0.8818,
"step": 730
},
{
"epoch": 0.21,
"grad_norm": 39.88836482589719,
"learning_rate": 0.00029524400935125654,
"loss": 0.9544,
"step": 740
},
{
"epoch": 0.21,
"grad_norm": 102.4851007431489,
"learning_rate": 0.0002950248392752776,
"loss": 0.7461,
"step": 750
},
{
"epoch": 0.21,
"grad_norm": 43.156700980283695,
"learning_rate": 0.0002948056691992986,
"loss": 0.7074,
"step": 760
},
{
"epoch": 0.22,
"grad_norm": 49.376028998432666,
"learning_rate": 0.00029458649912331966,
"loss": 0.6382,
"step": 770
},
{
"epoch": 0.22,
"grad_norm": 31.433406542237964,
"learning_rate": 0.0002943673290473407,
"loss": 0.732,
"step": 780
},
{
"epoch": 0.22,
"grad_norm": 44.3715979494319,
"learning_rate": 0.0002941481589713617,
"loss": 0.6883,
"step": 790
},
{
"epoch": 0.23,
"grad_norm": 17.06187425481664,
"learning_rate": 0.0002939289888953828,
"loss": 0.648,
"step": 800
},
{
"epoch": 0.23,
"grad_norm": 8.777853690776,
"learning_rate": 0.00029370981881940384,
"loss": 0.4979,
"step": 810
},
{
"epoch": 0.23,
"grad_norm": 4.880875214974237,
"learning_rate": 0.00029349064874342485,
"loss": 0.743,
"step": 820
},
{
"epoch": 0.23,
"grad_norm": 9.720423607604369,
"learning_rate": 0.0002932714786674459,
"loss": 0.645,
"step": 830
},
{
"epoch": 0.24,
"grad_norm": 9.066124973757804,
"learning_rate": 0.00029305230859146697,
"loss": 0.4977,
"step": 840
},
{
"epoch": 0.24,
"grad_norm": 23.89066736642937,
"learning_rate": 0.00029283313851548797,
"loss": 0.4926,
"step": 850
},
{
"epoch": 0.24,
"grad_norm": 41.695686510198016,
"learning_rate": 0.00029261396843950903,
"loss": 0.7886,
"step": 860
},
{
"epoch": 0.25,
"grad_norm": 66.53797988620009,
"learning_rate": 0.00029239479836353004,
"loss": 0.5658,
"step": 870
},
{
"epoch": 0.25,
"grad_norm": 55.97718756593597,
"learning_rate": 0.0002921756282875511,
"loss": 0.5484,
"step": 880
},
{
"epoch": 0.25,
"grad_norm": 108.03497298548182,
"learning_rate": 0.00029195645821157215,
"loss": 0.7978,
"step": 890
},
{
"epoch": 0.25,
"grad_norm": 40.14088131595207,
"learning_rate": 0.00029173728813559316,
"loss": 0.6178,
"step": 900
},
{
"epoch": 0.26,
"grad_norm": 48.46822333526267,
"learning_rate": 0.00029151811805961427,
"loss": 0.565,
"step": 910
},
{
"epoch": 0.26,
"grad_norm": 16.17488665111733,
"learning_rate": 0.0002912989479836353,
"loss": 0.6886,
"step": 920
},
{
"epoch": 0.26,
"grad_norm": 26.592963237868908,
"learning_rate": 0.0002910797779076563,
"loss": 0.6966,
"step": 930
},
{
"epoch": 0.27,
"grad_norm": 27.00009471161443,
"learning_rate": 0.0002908606078316774,
"loss": 0.6178,
"step": 940
},
{
"epoch": 0.27,
"grad_norm": 28.195576793914913,
"learning_rate": 0.0002906414377556984,
"loss": 0.4663,
"step": 950
},
{
"epoch": 0.27,
"grad_norm": 23.538388615789113,
"learning_rate": 0.00029042226767971946,
"loss": 0.7135,
"step": 960
},
{
"epoch": 0.27,
"grad_norm": 20.353666443170358,
"learning_rate": 0.00029020309760374046,
"loss": 0.708,
"step": 970
},
{
"epoch": 0.28,
"grad_norm": 30.98820805802448,
"learning_rate": 0.0002899839275277615,
"loss": 0.5138,
"step": 980
},
{
"epoch": 0.28,
"grad_norm": 49.96332795879376,
"learning_rate": 0.0002897647574517826,
"loss": 0.5238,
"step": 990
},
{
"epoch": 0.28,
"grad_norm": 36.472514659373466,
"learning_rate": 0.0002895455873758036,
"loss": 0.7198,
"step": 1000
},
{
"epoch": 0.28,
"grad_norm": 5.727047796229911,
"learning_rate": 0.00028932641729982465,
"loss": 0.5427,
"step": 1010
},
{
"epoch": 0.29,
"grad_norm": 6.39978136661481,
"learning_rate": 0.0002891072472238457,
"loss": 0.6386,
"step": 1020
},
{
"epoch": 0.29,
"grad_norm": 26.496446288276136,
"learning_rate": 0.0002888880771478667,
"loss": 0.4019,
"step": 1030
},
{
"epoch": 0.29,
"grad_norm": 96.50801416396253,
"learning_rate": 0.00028866890707188777,
"loss": 0.8864,
"step": 1040
},
{
"epoch": 0.3,
"grad_norm": 26.933228918624025,
"learning_rate": 0.00028844973699590883,
"loss": 0.8963,
"step": 1050
},
{
"epoch": 0.3,
"grad_norm": 20.70682008967005,
"learning_rate": 0.00028825248392752773,
"loss": 0.6956,
"step": 1060
},
{
"epoch": 0.3,
"grad_norm": 46.724804444462556,
"learning_rate": 0.00028803331385154874,
"loss": 0.8669,
"step": 1070
},
{
"epoch": 0.3,
"grad_norm": 5.5954267342032225,
"learning_rate": 0.00028781414377556985,
"loss": 0.9219,
"step": 1080
},
{
"epoch": 0.31,
"grad_norm": 60.59949911543188,
"learning_rate": 0.00028759497369959086,
"loss": 0.6557,
"step": 1090
},
{
"epoch": 0.31,
"grad_norm": 37.05416363332341,
"learning_rate": 0.00028737580362361186,
"loss": 0.7692,
"step": 1100
},
{
"epoch": 0.31,
"grad_norm": 14.82417785628839,
"learning_rate": 0.000287156633547633,
"loss": 0.5669,
"step": 1110
},
{
"epoch": 0.32,
"grad_norm": 34.71923857782051,
"learning_rate": 0.000286937463471654,
"loss": 0.7427,
"step": 1120
},
{
"epoch": 0.32,
"grad_norm": 4.620541208209977,
"learning_rate": 0.00028671829339567504,
"loss": 0.4811,
"step": 1130
},
{
"epoch": 0.32,
"grad_norm": 65.72414363498706,
"learning_rate": 0.00028649912331969604,
"loss": 0.6696,
"step": 1140
},
{
"epoch": 0.32,
"grad_norm": 31.255536486132993,
"learning_rate": 0.0002862799532437171,
"loss": 0.6253,
"step": 1150
},
{
"epoch": 0.33,
"grad_norm": 14.845236948350237,
"learning_rate": 0.00028606078316773816,
"loss": 0.5473,
"step": 1160
},
{
"epoch": 0.33,
"grad_norm": 62.44169640686947,
"learning_rate": 0.00028584161309175917,
"loss": 0.6686,
"step": 1170
},
{
"epoch": 0.33,
"grad_norm": 105.6997308934719,
"learning_rate": 0.0002856224430157802,
"loss": 0.7738,
"step": 1180
},
{
"epoch": 0.34,
"grad_norm": 75.19754745372326,
"learning_rate": 0.0002854032729398013,
"loss": 0.5995,
"step": 1190
},
{
"epoch": 0.34,
"grad_norm": 34.71806881877583,
"learning_rate": 0.0002851841028638223,
"loss": 0.5701,
"step": 1200
},
{
"epoch": 0.34,
"grad_norm": 48.159897749652075,
"learning_rate": 0.00028496493278784335,
"loss": 0.4724,
"step": 1210
},
{
"epoch": 0.34,
"grad_norm": 6.544316071386722,
"learning_rate": 0.0002847457627118644,
"loss": 0.6537,
"step": 1220
},
{
"epoch": 0.35,
"grad_norm": 21.365400503740027,
"learning_rate": 0.0002845265926358854,
"loss": 0.4979,
"step": 1230
},
{
"epoch": 0.35,
"grad_norm": 5.248896963680607,
"learning_rate": 0.00028430742255990647,
"loss": 0.4174,
"step": 1240
},
{
"epoch": 0.35,
"grad_norm": 7.535922720940304,
"learning_rate": 0.00028408825248392753,
"loss": 0.7422,
"step": 1250
},
{
"epoch": 0.36,
"grad_norm": 22.985477551377006,
"learning_rate": 0.00028386908240794854,
"loss": 0.6509,
"step": 1260
},
{
"epoch": 0.36,
"grad_norm": 44.34169162733889,
"learning_rate": 0.0002836499123319696,
"loss": 0.6908,
"step": 1270
},
{
"epoch": 0.36,
"grad_norm": 41.93808698385512,
"learning_rate": 0.00028343074225599065,
"loss": 0.5134,
"step": 1280
},
{
"epoch": 0.36,
"grad_norm": 9.954650885829432,
"learning_rate": 0.00028321157218001166,
"loss": 0.4772,
"step": 1290
},
{
"epoch": 0.37,
"grad_norm": 14.453953812006848,
"learning_rate": 0.0002829924021040327,
"loss": 0.6207,
"step": 1300
},
{
"epoch": 0.37,
"grad_norm": 5.273785434123745,
"learning_rate": 0.0002827732320280537,
"loss": 0.5753,
"step": 1310
},
{
"epoch": 0.37,
"grad_norm": 4.858445515524344,
"learning_rate": 0.0002825540619520748,
"loss": 0.5052,
"step": 1320
},
{
"epoch": 0.37,
"grad_norm": 9.635141224231973,
"learning_rate": 0.00028233489187609584,
"loss": 0.4951,
"step": 1330
},
{
"epoch": 0.38,
"grad_norm": 43.41263842722645,
"learning_rate": 0.00028211572180011685,
"loss": 0.5636,
"step": 1340
},
{
"epoch": 0.38,
"grad_norm": 15.495027516009957,
"learning_rate": 0.0002818965517241379,
"loss": 0.466,
"step": 1350
},
{
"epoch": 0.38,
"grad_norm": 64.28182805477857,
"learning_rate": 0.00028167738164815896,
"loss": 0.5672,
"step": 1360
},
{
"epoch": 0.39,
"grad_norm": 64.9516330457715,
"learning_rate": 0.00028145821157217997,
"loss": 0.569,
"step": 1370
},
{
"epoch": 0.39,
"grad_norm": 18.376532518493583,
"learning_rate": 0.00028123904149620103,
"loss": 0.4619,
"step": 1380
},
{
"epoch": 0.39,
"grad_norm": 8.562914609442593,
"learning_rate": 0.0002810198714202221,
"loss": 0.545,
"step": 1390
},
{
"epoch": 0.39,
"grad_norm": 60.21363972002377,
"learning_rate": 0.0002808007013442431,
"loss": 0.3975,
"step": 1400
},
{
"epoch": 0.4,
"grad_norm": 28.047906754092985,
"learning_rate": 0.00028058153126826415,
"loss": 0.6251,
"step": 1410
},
{
"epoch": 0.4,
"grad_norm": 35.10586789783855,
"learning_rate": 0.0002803623611922852,
"loss": 0.6277,
"step": 1420
},
{
"epoch": 0.4,
"grad_norm": 12.797561679192818,
"learning_rate": 0.0002801431911163062,
"loss": 0.5604,
"step": 1430
},
{
"epoch": 0.41,
"grad_norm": 45.77020206433535,
"learning_rate": 0.0002799240210403273,
"loss": 0.5095,
"step": 1440
},
{
"epoch": 0.41,
"grad_norm": 8.950325235661051,
"learning_rate": 0.00027970485096434833,
"loss": 0.6742,
"step": 1450
},
{
"epoch": 0.41,
"grad_norm": 89.39440158187,
"learning_rate": 0.00027948568088836934,
"loss": 0.6291,
"step": 1460
},
{
"epoch": 0.41,
"grad_norm": 42.25863783297151,
"learning_rate": 0.0002792665108123904,
"loss": 0.5981,
"step": 1470
},
{
"epoch": 0.42,
"grad_norm": 70.1528669289255,
"learning_rate": 0.0002790473407364114,
"loss": 0.6403,
"step": 1480
},
{
"epoch": 0.42,
"grad_norm": 46.97367429458831,
"learning_rate": 0.00027882817066043246,
"loss": 0.4118,
"step": 1490
},
{
"epoch": 0.42,
"grad_norm": 40.541175129277065,
"learning_rate": 0.0002786090005844535,
"loss": 0.5049,
"step": 1500
},
{
"epoch": 0.43,
"grad_norm": 15.523999358550466,
"learning_rate": 0.0002783898305084745,
"loss": 0.4035,
"step": 1510
},
{
"epoch": 0.43,
"grad_norm": 10.185134626898725,
"learning_rate": 0.0002781706604324956,
"loss": 0.3807,
"step": 1520
},
{
"epoch": 0.43,
"grad_norm": 41.67374680359159,
"learning_rate": 0.00027795149035651664,
"loss": 0.6452,
"step": 1530
},
{
"epoch": 0.43,
"grad_norm": 17.98431769773285,
"learning_rate": 0.00027773232028053765,
"loss": 0.4049,
"step": 1540
},
{
"epoch": 0.44,
"grad_norm": 28.42010603664099,
"learning_rate": 0.0002775131502045587,
"loss": 0.5067,
"step": 1550
},
{
"epoch": 0.44,
"grad_norm": 8.903302713389115,
"learning_rate": 0.00027729398012857977,
"loss": 0.6382,
"step": 1560
},
{
"epoch": 0.44,
"grad_norm": 65.95394817146764,
"learning_rate": 0.00027707481005260077,
"loss": 0.651,
"step": 1570
},
{
"epoch": 0.45,
"grad_norm": 13.423645923612371,
"learning_rate": 0.00027685563997662183,
"loss": 0.4343,
"step": 1580
},
{
"epoch": 0.45,
"grad_norm": 19.870165040548233,
"learning_rate": 0.0002766364699006429,
"loss": 0.5016,
"step": 1590
},
{
"epoch": 0.45,
"grad_norm": 7.295472062733364,
"learning_rate": 0.0002764172998246639,
"loss": 0.4203,
"step": 1600
},
{
"epoch": 0.45,
"grad_norm": 13.426653279288725,
"learning_rate": 0.00027619812974868495,
"loss": 0.5413,
"step": 1610
},
{
"epoch": 0.46,
"grad_norm": 35.81973630385564,
"learning_rate": 0.00027597895967270596,
"loss": 0.536,
"step": 1620
},
{
"epoch": 0.46,
"grad_norm": 15.789511808342628,
"learning_rate": 0.000275759789596727,
"loss": 0.3641,
"step": 1630
},
{
"epoch": 0.46,
"grad_norm": 3.438056573729943,
"learning_rate": 0.0002755406195207481,
"loss": 0.3846,
"step": 1640
},
{
"epoch": 0.47,
"grad_norm": 43.693835213902354,
"learning_rate": 0.0002753214494447691,
"loss": 0.6304,
"step": 1650
},
{
"epoch": 0.47,
"grad_norm": 41.69843964932105,
"learning_rate": 0.00027510227936879014,
"loss": 0.5184,
"step": 1660
},
{
"epoch": 0.47,
"grad_norm": 26.698062076363627,
"learning_rate": 0.0002748831092928112,
"loss": 0.5844,
"step": 1670
},
{
"epoch": 0.47,
"grad_norm": 1.7012189494264895,
"learning_rate": 0.0002746639392168322,
"loss": 0.4043,
"step": 1680
},
{
"epoch": 0.48,
"grad_norm": 52.6718491008472,
"learning_rate": 0.0002744447691408533,
"loss": 0.5674,
"step": 1690
},
{
"epoch": 0.48,
"grad_norm": 14.061719898202849,
"learning_rate": 0.0002742255990648743,
"loss": 0.4724,
"step": 1700
},
{
"epoch": 0.48,
"grad_norm": 3.847972088161487,
"learning_rate": 0.00027400642898889533,
"loss": 0.3894,
"step": 1710
},
{
"epoch": 0.48,
"grad_norm": 10.733410570701619,
"learning_rate": 0.0002737872589129164,
"loss": 0.3706,
"step": 1720
},
{
"epoch": 0.49,
"grad_norm": 5.582634401736958,
"learning_rate": 0.00027356808883693745,
"loss": 0.6391,
"step": 1730
},
{
"epoch": 0.49,
"grad_norm": 26.185314688453758,
"learning_rate": 0.0002733489187609585,
"loss": 0.6093,
"step": 1740
},
{
"epoch": 0.49,
"grad_norm": 5.754244803162683,
"learning_rate": 0.0002731297486849795,
"loss": 0.4868,
"step": 1750
},
{
"epoch": 0.5,
"grad_norm": 39.5290991827534,
"learning_rate": 0.00027291057860900057,
"loss": 0.3798,
"step": 1760
},
{
"epoch": 0.5,
"grad_norm": 50.24696278971589,
"learning_rate": 0.00027269140853302163,
"loss": 0.5672,
"step": 1770
},
{
"epoch": 0.5,
"grad_norm": 40.15272638877307,
"learning_rate": 0.00027247223845704263,
"loss": 0.4471,
"step": 1780
},
{
"epoch": 0.5,
"grad_norm": 28.70418219314436,
"learning_rate": 0.00027225306838106364,
"loss": 0.6382,
"step": 1790
},
{
"epoch": 0.51,
"grad_norm": 28.561604385347348,
"learning_rate": 0.00027203389830508475,
"loss": 0.4176,
"step": 1800
},
{
"epoch": 0.51,
"grad_norm": 17.815402549033355,
"learning_rate": 0.00027181472822910576,
"loss": 0.5882,
"step": 1810
},
{
"epoch": 0.51,
"grad_norm": 15.342408938615899,
"learning_rate": 0.0002715955581531268,
"loss": 0.5019,
"step": 1820
},
{
"epoch": 0.52,
"grad_norm": 23.598442882592035,
"learning_rate": 0.0002713763880771479,
"loss": 0.556,
"step": 1830
},
{
"epoch": 0.52,
"grad_norm": 67.23786205262353,
"learning_rate": 0.0002711572180011689,
"loss": 0.5685,
"step": 1840
},
{
"epoch": 0.52,
"grad_norm": 4.243022538273344,
"learning_rate": 0.00027093804792518994,
"loss": 0.4733,
"step": 1850
},
{
"epoch": 0.52,
"grad_norm": 13.180044758827014,
"learning_rate": 0.000270718877849211,
"loss": 0.4605,
"step": 1860
},
{
"epoch": 0.53,
"grad_norm": 22.165941849190077,
"learning_rate": 0.000270499707773232,
"loss": 0.5884,
"step": 1870
},
{
"epoch": 0.53,
"grad_norm": 63.37369359897044,
"learning_rate": 0.00027028053769725306,
"loss": 0.5035,
"step": 1880
},
{
"epoch": 0.53,
"grad_norm": 7.126542990903908,
"learning_rate": 0.00027006136762127407,
"loss": 0.5265,
"step": 1890
},
{
"epoch": 0.54,
"grad_norm": 18.999488189785442,
"learning_rate": 0.0002698421975452951,
"loss": 0.4029,
"step": 1900
},
{
"epoch": 0.54,
"grad_norm": 74.7735357892713,
"learning_rate": 0.0002696230274693162,
"loss": 0.5864,
"step": 1910
},
{
"epoch": 0.54,
"grad_norm": 4.537363990276932,
"learning_rate": 0.0002694038573933372,
"loss": 0.4943,
"step": 1920
},
{
"epoch": 0.54,
"grad_norm": 5.4900230565856125,
"learning_rate": 0.00026918468731735825,
"loss": 0.4046,
"step": 1930
},
{
"epoch": 0.55,
"grad_norm": 13.0055044025215,
"learning_rate": 0.0002689655172413793,
"loss": 0.4548,
"step": 1940
},
{
"epoch": 0.55,
"grad_norm": 38.12513704333924,
"learning_rate": 0.0002687463471654003,
"loss": 0.4879,
"step": 1950
},
{
"epoch": 0.55,
"grad_norm": 39.26658731655891,
"learning_rate": 0.0002685271770894214,
"loss": 0.4619,
"step": 1960
},
{
"epoch": 0.56,
"grad_norm": 25.60070282398106,
"learning_rate": 0.00026830800701344243,
"loss": 0.3977,
"step": 1970
},
{
"epoch": 0.56,
"grad_norm": 26.2056587948436,
"learning_rate": 0.00026808883693746344,
"loss": 0.6761,
"step": 1980
},
{
"epoch": 0.56,
"grad_norm": 6.11452999302761,
"learning_rate": 0.0002678696668614845,
"loss": 0.4193,
"step": 1990
},
{
"epoch": 0.56,
"grad_norm": 33.901640420477705,
"learning_rate": 0.00026765049678550556,
"loss": 0.3582,
"step": 2000
},
{
"epoch": 0.57,
"grad_norm": 39.05264118867418,
"learning_rate": 0.00026743132670952656,
"loss": 0.6291,
"step": 2010
},
{
"epoch": 0.57,
"grad_norm": 37.194416649503715,
"learning_rate": 0.0002672121566335476,
"loss": 0.5405,
"step": 2020
},
{
"epoch": 0.57,
"grad_norm": 10.433072414980515,
"learning_rate": 0.0002669929865575686,
"loss": 0.3794,
"step": 2030
},
{
"epoch": 0.58,
"grad_norm": 13.092304284069662,
"learning_rate": 0.0002667738164815897,
"loss": 0.5334,
"step": 2040
},
{
"epoch": 0.58,
"grad_norm": 63.86550661120767,
"learning_rate": 0.00026655464640561074,
"loss": 0.5122,
"step": 2050
},
{
"epoch": 0.58,
"grad_norm": 2.517042940759129,
"learning_rate": 0.00026633547632963175,
"loss": 0.5369,
"step": 2060
},
{
"epoch": 0.58,
"grad_norm": 5.619466465382078,
"learning_rate": 0.0002661163062536528,
"loss": 0.5914,
"step": 2070
},
{
"epoch": 0.59,
"grad_norm": 8.934697851639797,
"learning_rate": 0.00026589713617767387,
"loss": 0.4139,
"step": 2080
},
{
"epoch": 0.59,
"grad_norm": 6.230035836028844,
"learning_rate": 0.00026567796610169487,
"loss": 0.5029,
"step": 2090
},
{
"epoch": 0.59,
"grad_norm": 15.356060765823438,
"learning_rate": 0.00026545879602571593,
"loss": 0.3824,
"step": 2100
},
{
"epoch": 0.59,
"grad_norm": 7.576984306147784,
"learning_rate": 0.000265239625949737,
"loss": 0.6175,
"step": 2110
},
{
"epoch": 0.6,
"grad_norm": 28.748362595639772,
"learning_rate": 0.000265020455873758,
"loss": 0.5614,
"step": 2120
},
{
"epoch": 0.6,
"grad_norm": 16.32965992610039,
"learning_rate": 0.00026480128579777905,
"loss": 0.5116,
"step": 2130
},
{
"epoch": 0.6,
"grad_norm": 3.2753516984217614,
"learning_rate": 0.0002645821157218001,
"loss": 0.4541,
"step": 2140
},
{
"epoch": 0.61,
"grad_norm": 42.92699392822083,
"learning_rate": 0.0002643629456458211,
"loss": 0.4284,
"step": 2150
},
{
"epoch": 0.61,
"grad_norm": 17.612783953748007,
"learning_rate": 0.0002641437755698422,
"loss": 0.465,
"step": 2160
},
{
"epoch": 0.61,
"grad_norm": 8.549512475388743,
"learning_rate": 0.00026392460549386324,
"loss": 0.409,
"step": 2170
},
{
"epoch": 0.61,
"grad_norm": 12.657379379480137,
"learning_rate": 0.00026370543541788424,
"loss": 0.5745,
"step": 2180
},
{
"epoch": 0.62,
"grad_norm": 19.533607385657547,
"learning_rate": 0.0002634862653419053,
"loss": 0.493,
"step": 2190
},
{
"epoch": 0.62,
"grad_norm": 30.019312017902323,
"learning_rate": 0.0002632670952659263,
"loss": 0.486,
"step": 2200
},
{
"epoch": 0.62,
"grad_norm": 15.192669371979543,
"learning_rate": 0.00026304792518994736,
"loss": 0.4214,
"step": 2210
},
{
"epoch": 0.63,
"grad_norm": 32.465439529928304,
"learning_rate": 0.0002628287551139684,
"loss": 0.5712,
"step": 2220
},
{
"epoch": 0.63,
"grad_norm": 36.91886247399315,
"learning_rate": 0.00026260958503798943,
"loss": 0.4334,
"step": 2230
},
{
"epoch": 0.63,
"grad_norm": 23.910769075350032,
"learning_rate": 0.0002623904149620105,
"loss": 0.5462,
"step": 2240
},
{
"epoch": 0.63,
"grad_norm": 67.93569743336582,
"learning_rate": 0.00026217124488603155,
"loss": 0.4493,
"step": 2250
},
{
"epoch": 0.64,
"grad_norm": 2.532118414700022,
"learning_rate": 0.00026195207481005255,
"loss": 0.4534,
"step": 2260
},
{
"epoch": 0.64,
"grad_norm": 29.266963293461014,
"learning_rate": 0.0002617329047340736,
"loss": 0.4529,
"step": 2270
},
{
"epoch": 0.64,
"grad_norm": 48.97288077592708,
"learning_rate": 0.00026153565166569257,
"loss": 0.5841,
"step": 2280
},
{
"epoch": 0.65,
"grad_norm": 20.00706276686128,
"learning_rate": 0.00026131648158971357,
"loss": 0.5449,
"step": 2290
},
{
"epoch": 0.65,
"grad_norm": 52.88683636279855,
"learning_rate": 0.00026109731151373463,
"loss": 0.5451,
"step": 2300
},
{
"epoch": 0.65,
"grad_norm": 19.058421611399435,
"learning_rate": 0.0002608781414377557,
"loss": 0.4933,
"step": 2310
},
{
"epoch": 0.65,
"grad_norm": 12.662298018994834,
"learning_rate": 0.0002606589713617767,
"loss": 0.4933,
"step": 2320
},
{
"epoch": 0.66,
"grad_norm": 45.53858660601269,
"learning_rate": 0.00026043980128579776,
"loss": 0.53,
"step": 2330
},
{
"epoch": 0.66,
"grad_norm": 26.19222918219647,
"learning_rate": 0.0002602206312098188,
"loss": 0.4293,
"step": 2340
},
{
"epoch": 0.66,
"grad_norm": 12.902630462226897,
"learning_rate": 0.0002600014611338398,
"loss": 0.4571,
"step": 2350
},
{
"epoch": 0.67,
"grad_norm": 30.841311400161104,
"learning_rate": 0.0002597822910578609,
"loss": 0.3776,
"step": 2360
},
{
"epoch": 0.67,
"grad_norm": 2.354115675571604,
"learning_rate": 0.00025956312098188194,
"loss": 0.3751,
"step": 2370
},
{
"epoch": 0.67,
"grad_norm": 11.162047701554282,
"learning_rate": 0.00025934395090590294,
"loss": 0.2605,
"step": 2380
},
{
"epoch": 0.67,
"grad_norm": 42.422050961854865,
"learning_rate": 0.000259124780829924,
"loss": 0.5152,
"step": 2390
},
{
"epoch": 0.68,
"grad_norm": 38.9786626728249,
"learning_rate": 0.000258905610753945,
"loss": 0.3935,
"step": 2400
},
{
"epoch": 0.68,
"grad_norm": 38.1773503077784,
"learning_rate": 0.00025868644067796607,
"loss": 0.5644,
"step": 2410
},
{
"epoch": 0.68,
"grad_norm": 14.79430105184665,
"learning_rate": 0.0002584672706019871,
"loss": 0.5293,
"step": 2420
},
{
"epoch": 0.69,
"grad_norm": 34.794933030659074,
"learning_rate": 0.00025824810052600813,
"loss": 0.5646,
"step": 2430
},
{
"epoch": 0.69,
"grad_norm": 11.775866235902662,
"learning_rate": 0.0002580289304500292,
"loss": 0.4189,
"step": 2440
},
{
"epoch": 0.69,
"grad_norm": 74.35669253529362,
"learning_rate": 0.00025780976037405025,
"loss": 0.6112,
"step": 2450
},
{
"epoch": 0.69,
"grad_norm": 38.88263346213307,
"learning_rate": 0.00025759059029807125,
"loss": 0.4503,
"step": 2460
},
{
"epoch": 0.7,
"grad_norm": 37.361077582565066,
"learning_rate": 0.0002573714202220923,
"loss": 0.4393,
"step": 2470
},
{
"epoch": 0.7,
"grad_norm": 1.7092125809415342,
"learning_rate": 0.00025715225014611337,
"loss": 0.4243,
"step": 2480
},
{
"epoch": 0.7,
"grad_norm": 57.33578620565298,
"learning_rate": 0.0002569330800701344,
"loss": 0.4961,
"step": 2490
},
{
"epoch": 0.7,
"grad_norm": 24.953469848220585,
"learning_rate": 0.00025671390999415543,
"loss": 0.5143,
"step": 2500
},
{
"epoch": 0.71,
"grad_norm": 69.87186379679335,
"learning_rate": 0.0002564947399181765,
"loss": 0.5994,
"step": 2510
},
{
"epoch": 0.71,
"grad_norm": 9.423503958754821,
"learning_rate": 0.0002562755698421975,
"loss": 0.3865,
"step": 2520
},
{
"epoch": 0.71,
"grad_norm": 35.63761411276129,
"learning_rate": 0.00025605639976621856,
"loss": 0.4244,
"step": 2530
},
{
"epoch": 0.72,
"grad_norm": 7.718201160525164,
"learning_rate": 0.0002558372296902396,
"loss": 0.303,
"step": 2540
},
{
"epoch": 0.72,
"grad_norm": 4.591243708143018,
"learning_rate": 0.0002556180596142607,
"loss": 0.5215,
"step": 2550
},
{
"epoch": 0.72,
"grad_norm": 22.307015902915715,
"learning_rate": 0.0002553988895382817,
"loss": 0.4571,
"step": 2560
},
{
"epoch": 0.72,
"grad_norm": 22.779301668637764,
"learning_rate": 0.0002551797194623027,
"loss": 0.4807,
"step": 2570
},
{
"epoch": 0.73,
"grad_norm": 12.190202028042945,
"learning_rate": 0.0002549605493863238,
"loss": 0.3605,
"step": 2580
},
{
"epoch": 0.73,
"grad_norm": 7.251641577848608,
"learning_rate": 0.0002547413793103448,
"loss": 0.6469,
"step": 2590
},
{
"epoch": 0.73,
"grad_norm": 1.5118716411197952,
"learning_rate": 0.00025452220923436586,
"loss": 0.3641,
"step": 2600
},
{
"epoch": 0.74,
"grad_norm": 3.4089258042614086,
"learning_rate": 0.0002543030391583869,
"loss": 0.6651,
"step": 2610
},
{
"epoch": 0.74,
"grad_norm": 2.9557173441623856,
"learning_rate": 0.00025408386908240793,
"loss": 0.6309,
"step": 2620
},
{
"epoch": 0.74,
"grad_norm": 29.838754199710966,
"learning_rate": 0.000253864699006429,
"loss": 0.4806,
"step": 2630
},
{
"epoch": 0.74,
"grad_norm": 27.703367909949506,
"learning_rate": 0.00025364552893045,
"loss": 0.5419,
"step": 2640
},
{
"epoch": 0.75,
"grad_norm": 30.019133853453674,
"learning_rate": 0.00025342635885447105,
"loss": 0.5585,
"step": 2650
},
{
"epoch": 0.75,
"grad_norm": 34.77564734000214,
"learning_rate": 0.0002532071887784921,
"loss": 0.4483,
"step": 2660
},
{
"epoch": 0.75,
"grad_norm": 47.783274628573835,
"learning_rate": 0.0002529880187025131,
"loss": 0.5153,
"step": 2670
},
{
"epoch": 0.76,
"grad_norm": 26.23337473603945,
"learning_rate": 0.0002527688486265342,
"loss": 0.4699,
"step": 2680
},
{
"epoch": 0.76,
"grad_norm": 29.216617715519472,
"learning_rate": 0.00025254967855055523,
"loss": 0.5015,
"step": 2690
},
{
"epoch": 0.76,
"grad_norm": 62.76087495760497,
"learning_rate": 0.00025233050847457624,
"loss": 0.3711,
"step": 2700
},
{
"epoch": 0.76,
"grad_norm": 45.88533063155937,
"learning_rate": 0.0002521113383985973,
"loss": 0.6509,
"step": 2710
},
{
"epoch": 0.77,
"grad_norm": 20.44584239605378,
"learning_rate": 0.00025189216832261836,
"loss": 0.3635,
"step": 2720
},
{
"epoch": 0.77,
"grad_norm": 16.861314315606865,
"learning_rate": 0.00025167299824663936,
"loss": 0.4719,
"step": 2730
},
{
"epoch": 0.77,
"grad_norm": 28.52875293425469,
"learning_rate": 0.0002514538281706604,
"loss": 0.689,
"step": 2740
},
{
"epoch": 0.78,
"grad_norm": 35.22222000713686,
"learning_rate": 0.0002512346580946815,
"loss": 0.6445,
"step": 2750
},
{
"epoch": 0.78,
"grad_norm": 35.45687930499127,
"learning_rate": 0.0002510154880187025,
"loss": 0.4704,
"step": 2760
},
{
"epoch": 0.78,
"grad_norm": 8.810315004780433,
"learning_rate": 0.00025079631794272354,
"loss": 0.5576,
"step": 2770
},
{
"epoch": 0.78,
"grad_norm": 35.77216936747473,
"learning_rate": 0.0002505771478667446,
"loss": 0.3533,
"step": 2780
},
{
"epoch": 0.79,
"grad_norm": 6.045353414942304,
"learning_rate": 0.0002503579777907656,
"loss": 0.4113,
"step": 2790
},
{
"epoch": 0.79,
"grad_norm": 45.51398057288223,
"learning_rate": 0.00025013880771478667,
"loss": 0.4071,
"step": 2800
},
{
"epoch": 0.79,
"grad_norm": 36.84264931235842,
"learning_rate": 0.00024991963763880767,
"loss": 0.6049,
"step": 2810
},
{
"epoch": 0.8,
"grad_norm": 7.376752961081483,
"learning_rate": 0.00024970046756282873,
"loss": 0.4454,
"step": 2820
},
{
"epoch": 0.8,
"grad_norm": 45.296630887337315,
"learning_rate": 0.0002494812974868498,
"loss": 0.4267,
"step": 2830
},
{
"epoch": 0.8,
"grad_norm": 43.62273117063915,
"learning_rate": 0.0002492621274108708,
"loss": 0.4359,
"step": 2840
},
{
"epoch": 0.8,
"grad_norm": 51.720566303652305,
"learning_rate": 0.00024904295733489185,
"loss": 0.4941,
"step": 2850
},
{
"epoch": 0.81,
"grad_norm": 24.249504968694367,
"learning_rate": 0.0002488237872589129,
"loss": 0.345,
"step": 2860
},
{
"epoch": 0.81,
"grad_norm": 14.634006430151054,
"learning_rate": 0.0002486046171829339,
"loss": 0.4586,
"step": 2870
},
{
"epoch": 0.81,
"grad_norm": 11.217467398348905,
"learning_rate": 0.000248385447106955,
"loss": 0.4376,
"step": 2880
},
{
"epoch": 0.81,
"grad_norm": 45.30561601980238,
"learning_rate": 0.00024816627703097604,
"loss": 0.3944,
"step": 2890
},
{
"epoch": 0.82,
"grad_norm": 15.499098833410399,
"learning_rate": 0.00024794710695499704,
"loss": 0.4469,
"step": 2900
},
{
"epoch": 0.82,
"grad_norm": 15.594634721523647,
"learning_rate": 0.0002477279368790181,
"loss": 0.4524,
"step": 2910
},
{
"epoch": 0.82,
"grad_norm": 4.415769490237178,
"learning_rate": 0.00024750876680303916,
"loss": 0.5186,
"step": 2920
},
{
"epoch": 0.83,
"grad_norm": 50.952551749899285,
"learning_rate": 0.00024728959672706016,
"loss": 0.3835,
"step": 2930
},
{
"epoch": 0.83,
"grad_norm": 44.37013904045829,
"learning_rate": 0.0002470704266510812,
"loss": 0.3692,
"step": 2940
},
{
"epoch": 0.83,
"grad_norm": 5.538761067578604,
"learning_rate": 0.0002468512565751023,
"loss": 0.4466,
"step": 2950
},
{
"epoch": 0.83,
"grad_norm": 36.623608048249956,
"learning_rate": 0.0002466320864991233,
"loss": 0.4119,
"step": 2960
},
{
"epoch": 0.84,
"grad_norm": 14.12931554239451,
"learning_rate": 0.00024641291642314435,
"loss": 0.4094,
"step": 2970
},
{
"epoch": 0.84,
"grad_norm": 5.17846897516002,
"learning_rate": 0.00024619374634716535,
"loss": 0.6419,
"step": 2980
},
{
"epoch": 0.84,
"grad_norm": 17.27825883448974,
"learning_rate": 0.0002459745762711864,
"loss": 0.4454,
"step": 2990
},
{
"epoch": 0.85,
"grad_norm": 39.317445442045795,
"learning_rate": 0.00024575540619520747,
"loss": 0.4505,
"step": 3000
},
{
"epoch": 0.85,
"grad_norm": 69.64730678545516,
"learning_rate": 0.0002455362361192285,
"loss": 0.4583,
"step": 3010
},
{
"epoch": 0.85,
"grad_norm": 28.844952718130543,
"learning_rate": 0.00024531706604324953,
"loss": 0.6161,
"step": 3020
},
{
"epoch": 0.85,
"grad_norm": 19.0210560197049,
"learning_rate": 0.0002450978959672706,
"loss": 0.4669,
"step": 3030
},
{
"epoch": 0.86,
"grad_norm": 6.173405033604054,
"learning_rate": 0.0002448787258912916,
"loss": 0.4409,
"step": 3040
},
{
"epoch": 0.86,
"grad_norm": 52.77800432968733,
"learning_rate": 0.00024465955581531266,
"loss": 0.4491,
"step": 3050
},
{
"epoch": 0.86,
"grad_norm": 44.46853107524498,
"learning_rate": 0.0002444403857393337,
"loss": 0.3192,
"step": 3060
},
{
"epoch": 0.87,
"grad_norm": 5.0160255873775546,
"learning_rate": 0.0002442212156633547,
"loss": 0.4516,
"step": 3070
},
{
"epoch": 0.87,
"grad_norm": 23.9601395551555,
"learning_rate": 0.00024400204558737578,
"loss": 0.5556,
"step": 3080
},
{
"epoch": 0.87,
"grad_norm": 29.267790594083895,
"learning_rate": 0.00024378287551139684,
"loss": 0.592,
"step": 3090
},
{
"epoch": 0.87,
"grad_norm": 23.012129032542134,
"learning_rate": 0.00024356370543541787,
"loss": 0.3747,
"step": 3100
},
{
"epoch": 0.88,
"grad_norm": 14.454095097216845,
"learning_rate": 0.0002433445353594389,
"loss": 0.384,
"step": 3110
},
{
"epoch": 0.88,
"grad_norm": 26.97816432687431,
"learning_rate": 0.00024312536528345993,
"loss": 0.3297,
"step": 3120
},
{
"epoch": 0.88,
"grad_norm": 9.801537735404679,
"learning_rate": 0.000242906195207481,
"loss": 0.5374,
"step": 3130
},
{
"epoch": 0.89,
"grad_norm": 6.8951700119996,
"learning_rate": 0.00024268702513150203,
"loss": 0.5083,
"step": 3140
},
{
"epoch": 0.89,
"grad_norm": 14.817012261931632,
"learning_rate": 0.00024246785505552306,
"loss": 0.5883,
"step": 3150
},
{
"epoch": 0.89,
"grad_norm": 41.27878717469242,
"learning_rate": 0.00024224868497954412,
"loss": 0.3978,
"step": 3160
},
{
"epoch": 0.89,
"grad_norm": 37.80902917455334,
"learning_rate": 0.00024202951490356515,
"loss": 0.5849,
"step": 3170
},
{
"epoch": 0.9,
"grad_norm": 18.49777920142327,
"learning_rate": 0.00024181034482758618,
"loss": 0.4706,
"step": 3180
},
{
"epoch": 0.9,
"grad_norm": 19.061143834111856,
"learning_rate": 0.00024159117475160724,
"loss": 0.4619,
"step": 3190
},
{
"epoch": 0.9,
"grad_norm": 22.2453112157315,
"learning_rate": 0.00024137200467562827,
"loss": 0.3842,
"step": 3200
},
{
"epoch": 0.9,
"grad_norm": 50.79421799621206,
"learning_rate": 0.0002411528345996493,
"loss": 0.3655,
"step": 3210
},
{
"epoch": 0.91,
"grad_norm": 32.50988571681092,
"learning_rate": 0.00024093366452367034,
"loss": 0.5652,
"step": 3220
},
{
"epoch": 0.91,
"grad_norm": 3.5054157276555955,
"learning_rate": 0.0002407144944476914,
"loss": 0.6236,
"step": 3230
},
{
"epoch": 0.91,
"grad_norm": 34.425678272389796,
"learning_rate": 0.00024049532437171243,
"loss": 0.4448,
"step": 3240
},
{
"epoch": 0.92,
"grad_norm": 27.486059433114335,
"learning_rate": 0.00024027615429573346,
"loss": 0.4669,
"step": 3250
},
{
"epoch": 0.92,
"grad_norm": 19.22828927795651,
"learning_rate": 0.00024005698421975452,
"loss": 0.4644,
"step": 3260
},
{
"epoch": 0.92,
"grad_norm": 21.0363603349132,
"learning_rate": 0.00023983781414377555,
"loss": 0.431,
"step": 3270
},
{
"epoch": 0.92,
"grad_norm": 3.5723838037716984,
"learning_rate": 0.00023961864406779658,
"loss": 0.5383,
"step": 3280
},
{
"epoch": 0.93,
"grad_norm": 13.546990268361519,
"learning_rate": 0.00023939947399181761,
"loss": 0.3201,
"step": 3290
},
{
"epoch": 0.93,
"grad_norm": 24.38084566112433,
"learning_rate": 0.00023918030391583867,
"loss": 0.3129,
"step": 3300
},
{
"epoch": 0.93,
"grad_norm": 25.645694818995487,
"learning_rate": 0.0002389611338398597,
"loss": 0.5026,
"step": 3310
},
{
"epoch": 0.94,
"grad_norm": 66.5222383607227,
"learning_rate": 0.00023874196376388074,
"loss": 0.5204,
"step": 3320
},
{
"epoch": 0.94,
"grad_norm": 24.167963861089163,
"learning_rate": 0.0002385227936879018,
"loss": 0.2994,
"step": 3330
},
{
"epoch": 0.94,
"grad_norm": 2.9923273025019665,
"learning_rate": 0.00023830362361192283,
"loss": 0.3327,
"step": 3340
},
{
"epoch": 0.94,
"grad_norm": 7.036402375923249,
"learning_rate": 0.00023808445353594386,
"loss": 0.4824,
"step": 3350
},
{
"epoch": 0.95,
"grad_norm": 4.932257241329014,
"learning_rate": 0.00023786528345996492,
"loss": 0.3872,
"step": 3360
},
{
"epoch": 0.95,
"grad_norm": 8.597901439972688,
"learning_rate": 0.00023764611338398595,
"loss": 0.3997,
"step": 3370
},
{
"epoch": 0.95,
"grad_norm": 11.315382409369429,
"learning_rate": 0.00023742694330800698,
"loss": 0.5147,
"step": 3380
},
{
"epoch": 0.96,
"grad_norm": 14.52575221014949,
"learning_rate": 0.00023720777323202802,
"loss": 0.3314,
"step": 3390
},
{
"epoch": 0.96,
"grad_norm": 43.682402501837984,
"learning_rate": 0.00023698860315604907,
"loss": 0.4339,
"step": 3400
},
{
"epoch": 0.96,
"grad_norm": 53.44210886125679,
"learning_rate": 0.0002367694330800701,
"loss": 0.4009,
"step": 3410
},
{
"epoch": 0.96,
"grad_norm": 13.63857046515992,
"learning_rate": 0.00023655026300409114,
"loss": 0.4517,
"step": 3420
},
{
"epoch": 0.97,
"grad_norm": 18.188881596196584,
"learning_rate": 0.0002363310929281122,
"loss": 0.7472,
"step": 3430
},
{
"epoch": 0.97,
"grad_norm": 15.622454098714014,
"learning_rate": 0.00023611192285213323,
"loss": 0.4332,
"step": 3440
},
{
"epoch": 0.97,
"grad_norm": 23.030560101084287,
"learning_rate": 0.00023589275277615426,
"loss": 0.4659,
"step": 3450
},
{
"epoch": 0.98,
"grad_norm": 11.217028241840945,
"learning_rate": 0.0002356735827001753,
"loss": 0.4288,
"step": 3460
},
{
"epoch": 0.98,
"grad_norm": 20.08432954445042,
"learning_rate": 0.00023545441262419635,
"loss": 0.277,
"step": 3470
},
{
"epoch": 0.98,
"grad_norm": 74.65434882424645,
"learning_rate": 0.00023523524254821738,
"loss": 0.635,
"step": 3480
},
{
"epoch": 0.98,
"grad_norm": 40.495503288549955,
"learning_rate": 0.00023501607247223842,
"loss": 0.388,
"step": 3490
},
{
"epoch": 0.99,
"grad_norm": 12.752161600605927,
"learning_rate": 0.0002347969023962595,
"loss": 0.5092,
"step": 3500
},
{
"epoch": 0.99,
"grad_norm": 14.53893507733024,
"learning_rate": 0.0002345777323202805,
"loss": 0.4765,
"step": 3510
},
{
"epoch": 0.99,
"grad_norm": 4.316017393789724,
"learning_rate": 0.00023435856224430154,
"loss": 0.3659,
"step": 3520
},
{
"epoch": 1.0,
"grad_norm": 30.899825163472197,
"learning_rate": 0.00023413939216832257,
"loss": 0.4516,
"step": 3530
},
{
"epoch": 1.0,
"grad_norm": 39.99478400763079,
"learning_rate": 0.00023392022209234366,
"loss": 0.381,
"step": 3540
},
{
"epoch": 1.0,
"eval_0_f1": 0.6956680014561338,
"eval_0_precision": 0.607631160572337,
"eval_0_recall": 0.8135376756066411,
"eval_1_f1": 0.8688421713209915,
"eval_1_precision": 0.9267068273092369,
"eval_1_recall": 0.8177790903721205,
"eval_accuracy": 0.8166867668018858,
"eval_loss": 0.3857421875,
"eval_runtime": 546.7196,
"eval_samples_per_second": 16.683,
"eval_steps_per_second": 2.782,
"step": 3547
},
{
"epoch": 1.0,
"grad_norm": 5.0389158685922375,
"learning_rate": 0.00023370105201636466,
"loss": 0.4146,
"step": 3550
},
{
"epoch": 1.0,
"grad_norm": 11.9478624203234,
"learning_rate": 0.0002334818819403857,
"loss": 0.219,
"step": 3560
},
{
"epoch": 1.01,
"grad_norm": 2.574561125344869,
"learning_rate": 0.00023326271186440678,
"loss": 0.1326,
"step": 3570
},
{
"epoch": 1.01,
"grad_norm": 6.645296190271201,
"learning_rate": 0.0002330435417884278,
"loss": 0.3714,
"step": 3580
},
{
"epoch": 1.01,
"grad_norm": 21.165312017263854,
"learning_rate": 0.00023282437171244884,
"loss": 0.186,
"step": 3590
},
{
"epoch": 1.01,
"grad_norm": 33.09937520843868,
"learning_rate": 0.0002326052016364699,
"loss": 0.189,
"step": 3600
},
{
"epoch": 1.02,
"grad_norm": 28.917213358449875,
"learning_rate": 0.00023238603156049094,
"loss": 0.2064,
"step": 3610
},
{
"epoch": 1.02,
"grad_norm": 6.362982741180963,
"learning_rate": 0.00023216686148451197,
"loss": 0.3743,
"step": 3620
},
{
"epoch": 1.02,
"grad_norm": 15.278355603224561,
"learning_rate": 0.000231947691408533,
"loss": 0.339,
"step": 3630
},
{
"epoch": 1.03,
"grad_norm": 19.75892836736991,
"learning_rate": 0.00023172852133255406,
"loss": 0.3234,
"step": 3640
},
{
"epoch": 1.03,
"grad_norm": 27.481948501720865,
"learning_rate": 0.0002315093512565751,
"loss": 0.3744,
"step": 3650
},
{
"epoch": 1.03,
"grad_norm": 4.553602693616155,
"learning_rate": 0.00023129018118059612,
"loss": 0.292,
"step": 3660
},
{
"epoch": 1.03,
"grad_norm": 5.428732611717056,
"learning_rate": 0.00023107101110461718,
"loss": 0.2631,
"step": 3670
},
{
"epoch": 1.04,
"grad_norm": 5.050323966343426,
"learning_rate": 0.00023085184102863821,
"loss": 0.4004,
"step": 3680
},
{
"epoch": 1.04,
"grad_norm": 24.80793600752628,
"learning_rate": 0.00023063267095265925,
"loss": 0.5381,
"step": 3690
},
{
"epoch": 1.04,
"grad_norm": 18.258408123657016,
"learning_rate": 0.00023041350087668028,
"loss": 0.3396,
"step": 3700
},
{
"epoch": 1.05,
"grad_norm": 29.325589345887487,
"learning_rate": 0.00023019433080070134,
"loss": 0.3759,
"step": 3710
},
{
"epoch": 1.05,
"grad_norm": 5.83589085044179,
"learning_rate": 0.00022997516072472237,
"loss": 0.3515,
"step": 3720
},
{
"epoch": 1.05,
"grad_norm": 31.709214076980373,
"learning_rate": 0.0002297559906487434,
"loss": 0.3038,
"step": 3730
},
{
"epoch": 1.05,
"grad_norm": 5.331144297018398,
"learning_rate": 0.00022953682057276446,
"loss": 0.2608,
"step": 3740
},
{
"epoch": 1.06,
"grad_norm": 10.008440999378909,
"learning_rate": 0.0002293176504967855,
"loss": 0.392,
"step": 3750
},
{
"epoch": 1.06,
"grad_norm": 24.06127490696333,
"learning_rate": 0.00022909848042080652,
"loss": 0.464,
"step": 3760
},
{
"epoch": 1.06,
"grad_norm": 36.47161723251647,
"learning_rate": 0.00022887931034482758,
"loss": 0.3144,
"step": 3770
},
{
"epoch": 1.07,
"grad_norm": 44.318502303445214,
"learning_rate": 0.00022866014026884862,
"loss": 0.2178,
"step": 3780
},
{
"epoch": 1.07,
"grad_norm": 21.011798626587794,
"learning_rate": 0.00022844097019286965,
"loss": 0.2655,
"step": 3790
},
{
"epoch": 1.07,
"grad_norm": 7.015038983544465,
"learning_rate": 0.00022822180011689068,
"loss": 0.33,
"step": 3800
},
{
"epoch": 1.07,
"grad_norm": 9.837625310344656,
"learning_rate": 0.00022800263004091174,
"loss": 0.3804,
"step": 3810
},
{
"epoch": 1.08,
"grad_norm": 3.5246300872909493,
"learning_rate": 0.00022778345996493277,
"loss": 0.3144,
"step": 3820
},
{
"epoch": 1.08,
"grad_norm": 7.333333429866329,
"learning_rate": 0.0002275642898889538,
"loss": 0.208,
"step": 3830
},
{
"epoch": 1.08,
"grad_norm": 2.9276220226615655,
"learning_rate": 0.00022734511981297486,
"loss": 0.1965,
"step": 3840
},
{
"epoch": 1.09,
"grad_norm": 3.296027114122367,
"learning_rate": 0.0002271259497369959,
"loss": 0.1641,
"step": 3850
},
{
"epoch": 1.09,
"grad_norm": 63.98473163919884,
"learning_rate": 0.00022690677966101693,
"loss": 0.2204,
"step": 3860
},
{
"epoch": 1.09,
"grad_norm": 25.79236796313587,
"learning_rate": 0.00022668760958503796,
"loss": 0.3985,
"step": 3870
},
{
"epoch": 1.09,
"grad_norm": 47.19895401753889,
"learning_rate": 0.00022646843950905902,
"loss": 0.3724,
"step": 3880
},
{
"epoch": 1.1,
"grad_norm": 11.740165468615665,
"learning_rate": 0.00022624926943308005,
"loss": 0.3228,
"step": 3890
},
{
"epoch": 1.1,
"grad_norm": 1.7482155895929372,
"learning_rate": 0.00022603009935710108,
"loss": 0.3069,
"step": 3900
},
{
"epoch": 1.1,
"grad_norm": 31.032450056833344,
"learning_rate": 0.00022581092928112214,
"loss": 0.2573,
"step": 3910
},
{
"epoch": 1.11,
"grad_norm": 11.15417567821661,
"learning_rate": 0.00022559175920514317,
"loss": 0.2432,
"step": 3920
},
{
"epoch": 1.11,
"grad_norm": 9.77526286134104,
"learning_rate": 0.0002253725891291642,
"loss": 0.202,
"step": 3930
},
{
"epoch": 1.11,
"grad_norm": 43.631843372398045,
"learning_rate": 0.00022515341905318524,
"loss": 0.3633,
"step": 3940
},
{
"epoch": 1.11,
"grad_norm": 24.908344949793975,
"learning_rate": 0.0002249342489772063,
"loss": 0.2629,
"step": 3950
},
{
"epoch": 1.12,
"grad_norm": 1.9235099739993984,
"learning_rate": 0.00022471507890122733,
"loss": 0.312,
"step": 3960
},
{
"epoch": 1.12,
"grad_norm": 8.901642976423531,
"learning_rate": 0.00022449590882524836,
"loss": 0.2134,
"step": 3970
},
{
"epoch": 1.12,
"grad_norm": 35.49143975289104,
"learning_rate": 0.00022427673874926942,
"loss": 0.3581,
"step": 3980
},
{
"epoch": 1.12,
"grad_norm": 9.880151735005258,
"learning_rate": 0.00022405756867329045,
"loss": 0.2105,
"step": 3990
},
{
"epoch": 1.13,
"grad_norm": 11.472473033640894,
"learning_rate": 0.00022383839859731148,
"loss": 0.2771,
"step": 4000
},
{
"epoch": 1.13,
"grad_norm": 7.564996853716761,
"learning_rate": 0.00022361922852133254,
"loss": 0.1729,
"step": 4010
},
{
"epoch": 1.13,
"grad_norm": 1.6868590263390821,
"learning_rate": 0.00022340005844535357,
"loss": 0.4633,
"step": 4020
},
{
"epoch": 1.14,
"grad_norm": 28.229059784881606,
"learning_rate": 0.0002231808883693746,
"loss": 0.5021,
"step": 4030
},
{
"epoch": 1.14,
"grad_norm": 17.572412568669705,
"learning_rate": 0.00022296171829339564,
"loss": 0.2666,
"step": 4040
},
{
"epoch": 1.14,
"grad_norm": 11.720230148855956,
"learning_rate": 0.0002227425482174167,
"loss": 0.3706,
"step": 4050
},
{
"epoch": 1.14,
"grad_norm": 57.76450102261794,
"learning_rate": 0.00022252337814143773,
"loss": 0.3514,
"step": 4060
},
{
"epoch": 1.15,
"grad_norm": 8.937408336756231,
"learning_rate": 0.00022230420806545876,
"loss": 0.3542,
"step": 4070
},
{
"epoch": 1.15,
"grad_norm": 40.358482259032456,
"learning_rate": 0.00022208503798947982,
"loss": 0.3049,
"step": 4080
},
{
"epoch": 1.15,
"grad_norm": 22.567330463151773,
"learning_rate": 0.00022186586791350085,
"loss": 0.2778,
"step": 4090
},
{
"epoch": 1.16,
"grad_norm": 27.666117112861954,
"learning_rate": 0.00022164669783752188,
"loss": 0.2204,
"step": 4100
},
{
"epoch": 1.16,
"grad_norm": 34.568905094183656,
"learning_rate": 0.00022142752776154292,
"loss": 0.4829,
"step": 4110
},
{
"epoch": 1.16,
"grad_norm": 37.20888182696178,
"learning_rate": 0.00022120835768556397,
"loss": 0.3751,
"step": 4120
},
{
"epoch": 1.16,
"grad_norm": 2.628119016749957,
"learning_rate": 0.000220989187609585,
"loss": 0.2531,
"step": 4130
},
{
"epoch": 1.17,
"grad_norm": 12.787898954063717,
"learning_rate": 0.00022077001753360604,
"loss": 0.2568,
"step": 4140
},
{
"epoch": 1.17,
"grad_norm": 13.740422198890055,
"learning_rate": 0.0002205508474576271,
"loss": 0.1521,
"step": 4150
},
{
"epoch": 1.17,
"grad_norm": 2.754528026569796,
"learning_rate": 0.00022033167738164813,
"loss": 0.3939,
"step": 4160
},
{
"epoch": 1.18,
"grad_norm": 17.13803914567701,
"learning_rate": 0.00022011250730566916,
"loss": 0.2548,
"step": 4170
},
{
"epoch": 1.18,
"grad_norm": 8.755380723031136,
"learning_rate": 0.00021989333722969025,
"loss": 0.22,
"step": 4180
},
{
"epoch": 1.18,
"grad_norm": 11.327279230065335,
"learning_rate": 0.00021967416715371128,
"loss": 0.1881,
"step": 4190
},
{
"epoch": 1.18,
"grad_norm": 11.76478464724261,
"learning_rate": 0.00021945499707773229,
"loss": 0.4543,
"step": 4200
},
{
"epoch": 1.19,
"grad_norm": 21.089139286448926,
"learning_rate": 0.00021923582700175332,
"loss": 0.412,
"step": 4210
},
{
"epoch": 1.19,
"grad_norm": 4.35685071056218,
"learning_rate": 0.0002190166569257744,
"loss": 0.2452,
"step": 4220
},
{
"epoch": 1.19,
"grad_norm": 16.567825119225937,
"learning_rate": 0.00021879748684979544,
"loss": 0.2714,
"step": 4230
},
{
"epoch": 1.2,
"grad_norm": 51.4802446925637,
"learning_rate": 0.00021857831677381644,
"loss": 0.3585,
"step": 4240
},
{
"epoch": 1.2,
"grad_norm": 50.02592512359097,
"learning_rate": 0.00021835914669783753,
"loss": 0.4534,
"step": 4250
},
{
"epoch": 1.2,
"grad_norm": 6.384047108249605,
"learning_rate": 0.00021813997662185856,
"loss": 0.3206,
"step": 4260
},
{
"epoch": 1.2,
"grad_norm": 9.283525914668507,
"learning_rate": 0.0002179208065458796,
"loss": 0.3393,
"step": 4270
},
{
"epoch": 1.21,
"grad_norm": 45.654861594884274,
"learning_rate": 0.0002177016364699006,
"loss": 0.335,
"step": 4280
},
{
"epoch": 1.21,
"grad_norm": 3.5003233393976263,
"learning_rate": 0.00021748246639392168,
"loss": 0.3477,
"step": 4290
},
{
"epoch": 1.21,
"grad_norm": 14.979317988693346,
"learning_rate": 0.00021726329631794271,
"loss": 0.2205,
"step": 4300
},
{
"epoch": 1.22,
"grad_norm": 9.173367064164506,
"learning_rate": 0.00021704412624196375,
"loss": 0.4878,
"step": 4310
},
{
"epoch": 1.22,
"grad_norm": 13.812041376988086,
"learning_rate": 0.0002168249561659848,
"loss": 0.3115,
"step": 4320
},
{
"epoch": 1.22,
"grad_norm": 5.145857101646077,
"learning_rate": 0.0002166277030976037,
"loss": 0.1912,
"step": 4330
},
{
"epoch": 1.22,
"grad_norm": 1.5222263707561177,
"learning_rate": 0.00021640853302162474,
"loss": 0.2155,
"step": 4340
},
{
"epoch": 1.23,
"grad_norm": 20.94214337532918,
"learning_rate": 0.00021618936294564583,
"loss": 0.1663,
"step": 4350
},
{
"epoch": 1.23,
"grad_norm": 46.6321807054184,
"learning_rate": 0.00021597019286966686,
"loss": 0.2966,
"step": 4360
},
{
"epoch": 1.23,
"grad_norm": 4.374585543579176,
"learning_rate": 0.00021575102279368786,
"loss": 0.3326,
"step": 4370
},
{
"epoch": 1.23,
"grad_norm": 5.550072623324231,
"learning_rate": 0.0002155318527177089,
"loss": 0.1478,
"step": 4380
},
{
"epoch": 1.24,
"grad_norm": 0.6613601079645897,
"learning_rate": 0.00021531268264172998,
"loss": 0.2295,
"step": 4390
},
{
"epoch": 1.24,
"grad_norm": 3.000033161148002,
"learning_rate": 0.00021509351256575101,
"loss": 0.5028,
"step": 4400
},
{
"epoch": 1.24,
"grad_norm": 38.67630998438632,
"learning_rate": 0.00021487434248977205,
"loss": 0.304,
"step": 4410
},
{
"epoch": 1.25,
"grad_norm": 31.0474775238587,
"learning_rate": 0.0002146551724137931,
"loss": 0.3397,
"step": 4420
},
{
"epoch": 1.25,
"grad_norm": 21.038657972290533,
"learning_rate": 0.00021443600233781414,
"loss": 0.2639,
"step": 4430
},
{
"epoch": 1.25,
"grad_norm": 29.03727828295053,
"learning_rate": 0.00021421683226183517,
"loss": 0.269,
"step": 4440
},
{
"epoch": 1.25,
"grad_norm": 4.822263791040293,
"learning_rate": 0.0002139976621858562,
"loss": 0.2482,
"step": 4450
},
{
"epoch": 1.26,
"grad_norm": 8.31426730608056,
"learning_rate": 0.00021377849210987726,
"loss": 0.2249,
"step": 4460
},
{
"epoch": 1.26,
"grad_norm": 43.20888606905504,
"learning_rate": 0.0002135593220338983,
"loss": 0.3654,
"step": 4470
},
{
"epoch": 1.26,
"grad_norm": 2.446800168689743,
"learning_rate": 0.00021334015195791932,
"loss": 0.3182,
"step": 4480
},
{
"epoch": 1.27,
"grad_norm": 7.6988347063951466,
"learning_rate": 0.00021312098188194038,
"loss": 0.3373,
"step": 4490
},
{
"epoch": 1.27,
"grad_norm": 31.935307455130687,
"learning_rate": 0.00021290181180596142,
"loss": 0.3036,
"step": 4500
},
{
"epoch": 1.27,
"grad_norm": 20.726406179500756,
"learning_rate": 0.00021268264172998245,
"loss": 0.3627,
"step": 4510
},
{
"epoch": 1.27,
"grad_norm": 7.097138033352009,
"learning_rate": 0.0002124634716540035,
"loss": 0.2623,
"step": 4520
},
{
"epoch": 1.28,
"grad_norm": 3.8873072142785183,
"learning_rate": 0.00021224430157802454,
"loss": 0.2236,
"step": 4530
},
{
"epoch": 1.28,
"grad_norm": 14.527770370800262,
"learning_rate": 0.00021202513150204557,
"loss": 0.137,
"step": 4540
},
{
"epoch": 1.28,
"grad_norm": 7.558415335390357,
"learning_rate": 0.0002118059614260666,
"loss": 0.217,
"step": 4550
},
{
"epoch": 1.29,
"grad_norm": 29.89529000773257,
"learning_rate": 0.00021158679135008766,
"loss": 0.251,
"step": 4560
},
{
"epoch": 1.29,
"grad_norm": 52.64728318565253,
"learning_rate": 0.0002113676212741087,
"loss": 0.3333,
"step": 4570
},
{
"epoch": 1.29,
"grad_norm": 30.152792328177426,
"learning_rate": 0.00021114845119812973,
"loss": 0.2724,
"step": 4580
},
{
"epoch": 1.29,
"grad_norm": 2.5341334779748967,
"learning_rate": 0.00021092928112215079,
"loss": 0.269,
"step": 4590
},
{
"epoch": 1.3,
"grad_norm": 22.55021824043886,
"learning_rate": 0.00021071011104617182,
"loss": 0.0765,
"step": 4600
},
{
"epoch": 1.3,
"grad_norm": 20.277146842395847,
"learning_rate": 0.00021049094097019285,
"loss": 0.2189,
"step": 4610
},
{
"epoch": 1.3,
"grad_norm": 31.332786897175072,
"learning_rate": 0.00021027177089421388,
"loss": 0.4537,
"step": 4620
},
{
"epoch": 1.31,
"grad_norm": 4.0878545777289395,
"learning_rate": 0.00021005260081823494,
"loss": 0.2703,
"step": 4630
},
{
"epoch": 1.31,
"grad_norm": 26.911824724655006,
"learning_rate": 0.00020983343074225597,
"loss": 0.2727,
"step": 4640
},
{
"epoch": 1.31,
"grad_norm": 2.5043488479928016,
"learning_rate": 0.000209614260666277,
"loss": 0.1888,
"step": 4650
},
{
"epoch": 1.31,
"grad_norm": 38.26894575743672,
"learning_rate": 0.00020939509059029806,
"loss": 0.334,
"step": 4660
},
{
"epoch": 1.32,
"grad_norm": 3.073803318378424,
"learning_rate": 0.0002091759205143191,
"loss": 0.3018,
"step": 4670
},
{
"epoch": 1.32,
"grad_norm": 50.99829894564531,
"learning_rate": 0.00020895675043834013,
"loss": 0.309,
"step": 4680
},
{
"epoch": 1.32,
"grad_norm": 33.03113078011947,
"learning_rate": 0.0002087375803623612,
"loss": 0.3267,
"step": 4690
},
{
"epoch": 1.33,
"grad_norm": 49.72639701138489,
"learning_rate": 0.00020851841028638222,
"loss": 0.3428,
"step": 4700
},
{
"epoch": 1.33,
"grad_norm": 15.250514847558827,
"learning_rate": 0.00020829924021040325,
"loss": 0.2442,
"step": 4710
},
{
"epoch": 1.33,
"grad_norm": 29.630372731951933,
"learning_rate": 0.00020808007013442428,
"loss": 0.4006,
"step": 4720
},
{
"epoch": 1.33,
"grad_norm": 2.6025635165269834,
"learning_rate": 0.00020786090005844534,
"loss": 0.1417,
"step": 4730
},
{
"epoch": 1.34,
"grad_norm": 46.58385970647689,
"learning_rate": 0.00020764172998246637,
"loss": 0.3178,
"step": 4740
},
{
"epoch": 1.34,
"grad_norm": 33.748707418723626,
"learning_rate": 0.0002074225599064874,
"loss": 0.4147,
"step": 4750
},
{
"epoch": 1.34,
"grad_norm": 8.516868922828099,
"learning_rate": 0.00020720338983050846,
"loss": 0.2285,
"step": 4760
},
{
"epoch": 1.34,
"grad_norm": 56.676130895938115,
"learning_rate": 0.0002069842197545295,
"loss": 0.3276,
"step": 4770
},
{
"epoch": 1.35,
"grad_norm": 57.88752849086389,
"learning_rate": 0.00020676504967855053,
"loss": 0.4268,
"step": 4780
},
{
"epoch": 1.35,
"grad_norm": 2.2518432859009083,
"learning_rate": 0.00020654587960257156,
"loss": 0.2271,
"step": 4790
},
{
"epoch": 1.35,
"grad_norm": 8.476321740016418,
"learning_rate": 0.00020632670952659262,
"loss": 0.2991,
"step": 4800
},
{
"epoch": 1.36,
"grad_norm": 5.939165676183437,
"learning_rate": 0.00020610753945061365,
"loss": 0.2207,
"step": 4810
},
{
"epoch": 1.36,
"grad_norm": 6.721301020536598,
"learning_rate": 0.00020588836937463468,
"loss": 0.1735,
"step": 4820
},
{
"epoch": 1.36,
"grad_norm": 3.2082370683760044,
"learning_rate": 0.00020566919929865574,
"loss": 0.4545,
"step": 4830
},
{
"epoch": 1.36,
"grad_norm": 29.422853677429167,
"learning_rate": 0.00020545002922267678,
"loss": 0.4142,
"step": 4840
},
{
"epoch": 1.37,
"grad_norm": 5.299481055699104,
"learning_rate": 0.0002052308591466978,
"loss": 0.3986,
"step": 4850
},
{
"epoch": 1.37,
"grad_norm": 22.75931248540711,
"learning_rate": 0.00020501168907071887,
"loss": 0.4092,
"step": 4860
},
{
"epoch": 1.37,
"grad_norm": 6.466191538331885,
"learning_rate": 0.0002047925189947399,
"loss": 0.1388,
"step": 4870
},
{
"epoch": 1.38,
"grad_norm": 8.95128170273167,
"learning_rate": 0.00020457334891876093,
"loss": 0.4025,
"step": 4880
},
{
"epoch": 1.38,
"grad_norm": 30.917184250084812,
"learning_rate": 0.00020435417884278196,
"loss": 0.3814,
"step": 4890
},
{
"epoch": 1.38,
"grad_norm": 20.91623684082827,
"learning_rate": 0.00020413500876680302,
"loss": 0.3143,
"step": 4900
},
{
"epoch": 1.38,
"grad_norm": 28.994187069448067,
"learning_rate": 0.00020391583869082405,
"loss": 0.2026,
"step": 4910
},
{
"epoch": 1.39,
"grad_norm": 42.11122790207425,
"learning_rate": 0.00020369666861484509,
"loss": 0.3098,
"step": 4920
},
{
"epoch": 1.39,
"grad_norm": 36.159828304877706,
"learning_rate": 0.00020347749853886614,
"loss": 0.2509,
"step": 4930
},
{
"epoch": 1.39,
"grad_norm": 20.23189475386178,
"learning_rate": 0.00020325832846288718,
"loss": 0.3435,
"step": 4940
},
{
"epoch": 1.4,
"grad_norm": 9.446387490878042,
"learning_rate": 0.0002030391583869082,
"loss": 0.3598,
"step": 4950
},
{
"epoch": 1.4,
"grad_norm": 31.356335012679175,
"learning_rate": 0.00020281998831092924,
"loss": 0.3427,
"step": 4960
},
{
"epoch": 1.4,
"grad_norm": 25.111026271060013,
"learning_rate": 0.0002026008182349503,
"loss": 0.2731,
"step": 4970
},
{
"epoch": 1.4,
"grad_norm": 10.288548511014797,
"learning_rate": 0.00020238164815897133,
"loss": 0.2741,
"step": 4980
},
{
"epoch": 1.41,
"grad_norm": 18.078048168470804,
"learning_rate": 0.00020216247808299236,
"loss": 0.1965,
"step": 4990
},
{
"epoch": 1.41,
"grad_norm": 2.5818740774616953,
"learning_rate": 0.00020194330800701345,
"loss": 0.423,
"step": 5000
},
{
"epoch": 1.41,
"grad_norm": 5.029346814046525,
"learning_rate": 0.00020172413793103448,
"loss": 0.1981,
"step": 5010
},
{
"epoch": 1.42,
"grad_norm": 3.69515764136778,
"learning_rate": 0.0002015049678550555,
"loss": 0.3407,
"step": 5020
},
{
"epoch": 1.42,
"grad_norm": 1.1185073469260922,
"learning_rate": 0.00020128579777907652,
"loss": 0.3759,
"step": 5030
},
{
"epoch": 1.42,
"grad_norm": 25.82929883275312,
"learning_rate": 0.0002010666277030976,
"loss": 0.3386,
"step": 5040
},
{
"epoch": 1.42,
"grad_norm": 5.658058990862141,
"learning_rate": 0.00020084745762711864,
"loss": 0.3067,
"step": 5050
},
{
"epoch": 1.43,
"grad_norm": 33.789807219043574,
"learning_rate": 0.00020062828755113964,
"loss": 0.3116,
"step": 5060
},
{
"epoch": 1.43,
"grad_norm": 29.665346856502566,
"learning_rate": 0.00020040911747516073,
"loss": 0.4307,
"step": 5070
},
{
"epoch": 1.43,
"grad_norm": 2.953173260130455,
"learning_rate": 0.00020018994739918176,
"loss": 0.2401,
"step": 5080
},
{
"epoch": 1.44,
"grad_norm": 59.25975112660892,
"learning_rate": 0.0001999707773232028,
"loss": 0.3712,
"step": 5090
},
{
"epoch": 1.44,
"grad_norm": 16.61695534436786,
"learning_rate": 0.00019975160724722385,
"loss": 0.2978,
"step": 5100
},
{
"epoch": 1.44,
"grad_norm": 27.59304302639443,
"learning_rate": 0.00019953243717124488,
"loss": 0.2207,
"step": 5110
},
{
"epoch": 1.44,
"grad_norm": 33.3996635406056,
"learning_rate": 0.00019931326709526592,
"loss": 0.4006,
"step": 5120
},
{
"epoch": 1.45,
"grad_norm": 7.909724279076434,
"learning_rate": 0.00019909409701928695,
"loss": 0.3282,
"step": 5130
},
{
"epoch": 1.45,
"grad_norm": 10.96614010093427,
"learning_rate": 0.000198874926943308,
"loss": 0.3641,
"step": 5140
},
{
"epoch": 1.45,
"grad_norm": 4.747888769171669,
"learning_rate": 0.00019865575686732904,
"loss": 0.2596,
"step": 5150
},
{
"epoch": 1.45,
"grad_norm": 17.19969500744997,
"learning_rate": 0.00019843658679135007,
"loss": 0.2359,
"step": 5160
},
{
"epoch": 1.46,
"grad_norm": 12.445792367631444,
"learning_rate": 0.00019821741671537113,
"loss": 0.291,
"step": 5170
},
{
"epoch": 1.46,
"grad_norm": 16.38158373051574,
"learning_rate": 0.00019799824663939216,
"loss": 0.2152,
"step": 5180
},
{
"epoch": 1.46,
"grad_norm": 19.55308310137066,
"learning_rate": 0.0001977790765634132,
"loss": 0.245,
"step": 5190
},
{
"epoch": 1.47,
"grad_norm": 4.151920692809044,
"learning_rate": 0.00019755990648743423,
"loss": 0.263,
"step": 5200
},
{
"epoch": 1.47,
"grad_norm": 23.195604876425826,
"learning_rate": 0.00019734073641145528,
"loss": 0.3534,
"step": 5210
},
{
"epoch": 1.47,
"grad_norm": 21.589521451918323,
"learning_rate": 0.00019712156633547632,
"loss": 0.1662,
"step": 5220
},
{
"epoch": 1.47,
"grad_norm": 11.887752916702121,
"learning_rate": 0.00019690239625949735,
"loss": 0.2105,
"step": 5230
},
{
"epoch": 1.48,
"grad_norm": 5.176318946475123,
"learning_rate": 0.0001966832261835184,
"loss": 0.2695,
"step": 5240
},
{
"epoch": 1.48,
"grad_norm": 11.690634456574996,
"learning_rate": 0.00019646405610753944,
"loss": 0.3302,
"step": 5250
},
{
"epoch": 1.48,
"grad_norm": 62.85698360348048,
"learning_rate": 0.00019624488603156047,
"loss": 0.5215,
"step": 5260
},
{
"epoch": 1.49,
"grad_norm": 13.466880633693304,
"learning_rate": 0.00019602571595558153,
"loss": 0.2985,
"step": 5270
},
{
"epoch": 1.49,
"grad_norm": 1.965805579763693,
"learning_rate": 0.00019580654587960256,
"loss": 0.2372,
"step": 5280
},
{
"epoch": 1.49,
"grad_norm": 12.499426672870545,
"learning_rate": 0.0001955873758036236,
"loss": 0.1587,
"step": 5290
},
{
"epoch": 1.49,
"grad_norm": 15.307208098887806,
"learning_rate": 0.00019536820572764463,
"loss": 0.207,
"step": 5300
},
{
"epoch": 1.5,
"grad_norm": 14.010835079736815,
"learning_rate": 0.00019514903565166569,
"loss": 0.1688,
"step": 5310
},
{
"epoch": 1.5,
"grad_norm": 7.104347674810669,
"learning_rate": 0.00019492986557568672,
"loss": 0.1888,
"step": 5320
},
{
"epoch": 1.5,
"grad_norm": 1.5642209100774738,
"learning_rate": 0.00019473261250730565,
"loss": 0.2465,
"step": 5330
},
{
"epoch": 1.51,
"grad_norm": 32.89294792453513,
"learning_rate": 0.0001945134424313267,
"loss": 0.4685,
"step": 5340
},
{
"epoch": 1.51,
"grad_norm": 16.400791636569856,
"learning_rate": 0.00019429427235534774,
"loss": 0.2564,
"step": 5350
},
{
"epoch": 1.51,
"grad_norm": 6.986562239990336,
"learning_rate": 0.00019407510227936877,
"loss": 0.2917,
"step": 5360
},
{
"epoch": 1.51,
"grad_norm": 11.557550672896545,
"learning_rate": 0.00019385593220338983,
"loss": 0.3427,
"step": 5370
},
{
"epoch": 1.52,
"grad_norm": 2.1712768411403927,
"learning_rate": 0.00019363676212741086,
"loss": 0.2312,
"step": 5380
},
{
"epoch": 1.52,
"grad_norm": 3.4188406927450874,
"learning_rate": 0.0001934175920514319,
"loss": 0.2822,
"step": 5390
},
{
"epoch": 1.52,
"grad_norm": 18.41553181032857,
"learning_rate": 0.00019319842197545293,
"loss": 0.3085,
"step": 5400
},
{
"epoch": 1.53,
"grad_norm": 8.925374304814394,
"learning_rate": 0.000192979251899474,
"loss": 0.3014,
"step": 5410
},
{
"epoch": 1.53,
"grad_norm": 2.21057714781379,
"learning_rate": 0.00019276008182349502,
"loss": 0.2019,
"step": 5420
},
{
"epoch": 1.53,
"grad_norm": 7.995029854183915,
"learning_rate": 0.00019254091174751605,
"loss": 0.3143,
"step": 5430
},
{
"epoch": 1.53,
"grad_norm": 24.818437530389712,
"learning_rate": 0.0001923217416715371,
"loss": 0.2312,
"step": 5440
},
{
"epoch": 1.54,
"grad_norm": 4.630710193156958,
"learning_rate": 0.00019210257159555814,
"loss": 0.2946,
"step": 5450
},
{
"epoch": 1.54,
"grad_norm": 18.89029942903239,
"learning_rate": 0.00019188340151957917,
"loss": 0.2129,
"step": 5460
},
{
"epoch": 1.54,
"grad_norm": 21.18937099773853,
"learning_rate": 0.0001916642314436002,
"loss": 0.3114,
"step": 5470
},
{
"epoch": 1.54,
"grad_norm": 44.57514569240804,
"learning_rate": 0.00019144506136762127,
"loss": 0.4142,
"step": 5480
},
{
"epoch": 1.55,
"grad_norm": 10.696173605726317,
"learning_rate": 0.0001912258912916423,
"loss": 0.1754,
"step": 5490
},
{
"epoch": 1.55,
"grad_norm": 40.73458136589298,
"learning_rate": 0.00019100672121566333,
"loss": 0.2926,
"step": 5500
},
{
"epoch": 1.55,
"grad_norm": 39.31886057121872,
"learning_rate": 0.0001907875511396844,
"loss": 0.266,
"step": 5510
},
{
"epoch": 1.56,
"grad_norm": 2.9842826591961664,
"learning_rate": 0.00019056838106370542,
"loss": 0.2224,
"step": 5520
},
{
"epoch": 1.56,
"grad_norm": 9.331926869782274,
"learning_rate": 0.00019034921098772645,
"loss": 0.1006,
"step": 5530
},
{
"epoch": 1.56,
"grad_norm": 0.42527725447521897,
"learning_rate": 0.00019013004091174748,
"loss": 0.2903,
"step": 5540
},
{
"epoch": 1.56,
"grad_norm": 2.2444076494386254,
"learning_rate": 0.00018991087083576854,
"loss": 0.1155,
"step": 5550
},
{
"epoch": 1.57,
"grad_norm": 37.94628287997447,
"learning_rate": 0.00018969170075978958,
"loss": 0.4778,
"step": 5560
},
{
"epoch": 1.57,
"grad_norm": 42.20359422033777,
"learning_rate": 0.0001894725306838106,
"loss": 0.268,
"step": 5570
},
{
"epoch": 1.57,
"grad_norm": 5.954129998100243,
"learning_rate": 0.00018925336060783167,
"loss": 0.2858,
"step": 5580
},
{
"epoch": 1.58,
"grad_norm": 12.545115176449832,
"learning_rate": 0.0001890341905318527,
"loss": 0.4231,
"step": 5590
},
{
"epoch": 1.58,
"grad_norm": 4.6634905746414965,
"learning_rate": 0.00018881502045587373,
"loss": 0.2802,
"step": 5600
},
{
"epoch": 1.58,
"grad_norm": 25.89963576909228,
"learning_rate": 0.0001885958503798948,
"loss": 0.3474,
"step": 5610
},
{
"epoch": 1.58,
"grad_norm": 8.132022049717532,
"learning_rate": 0.00018837668030391582,
"loss": 0.2969,
"step": 5620
},
{
"epoch": 1.59,
"grad_norm": 9.15480977443254,
"learning_rate": 0.00018815751022793685,
"loss": 0.2564,
"step": 5630
},
{
"epoch": 1.59,
"grad_norm": 8.525165992903926,
"learning_rate": 0.00018793834015195789,
"loss": 0.3067,
"step": 5640
},
{
"epoch": 1.59,
"grad_norm": 32.8997919695663,
"learning_rate": 0.00018771917007597894,
"loss": 0.3231,
"step": 5650
},
{
"epoch": 1.6,
"grad_norm": 13.880130230974345,
"learning_rate": 0.00018749999999999998,
"loss": 0.233,
"step": 5660
},
{
"epoch": 1.6,
"grad_norm": 5.059925820076422,
"learning_rate": 0.000187280829924021,
"loss": 0.274,
"step": 5670
},
{
"epoch": 1.6,
"grad_norm": 33.92209798070997,
"learning_rate": 0.00018706165984804207,
"loss": 0.1971,
"step": 5680
},
{
"epoch": 1.6,
"grad_norm": 11.486919782540255,
"learning_rate": 0.0001868424897720631,
"loss": 0.5099,
"step": 5690
},
{
"epoch": 1.61,
"grad_norm": 1.118181876078142,
"learning_rate": 0.00018662331969608413,
"loss": 0.1531,
"step": 5700
},
{
"epoch": 1.61,
"grad_norm": 18.07168190463933,
"learning_rate": 0.00018640414962010516,
"loss": 0.4196,
"step": 5710
},
{
"epoch": 1.61,
"grad_norm": 8.865176806627508,
"learning_rate": 0.00018618497954412622,
"loss": 0.2771,
"step": 5720
},
{
"epoch": 1.62,
"grad_norm": 35.74999402744737,
"learning_rate": 0.00018596580946814726,
"loss": 0.3592,
"step": 5730
},
{
"epoch": 1.62,
"grad_norm": 14.709743095476123,
"learning_rate": 0.0001857466393921683,
"loss": 0.2372,
"step": 5740
},
{
"epoch": 1.62,
"grad_norm": 8.547571860162597,
"learning_rate": 0.00018552746931618935,
"loss": 0.1769,
"step": 5750
},
{
"epoch": 1.62,
"grad_norm": 29.188932438810735,
"learning_rate": 0.00018530829924021038,
"loss": 0.1595,
"step": 5760
},
{
"epoch": 1.63,
"grad_norm": 9.72627004103299,
"learning_rate": 0.0001850891291642314,
"loss": 0.267,
"step": 5770
},
{
"epoch": 1.63,
"grad_norm": 17.585505956653314,
"learning_rate": 0.0001848699590882525,
"loss": 0.2313,
"step": 5780
},
{
"epoch": 1.63,
"grad_norm": 11.872045841554243,
"learning_rate": 0.0001846507890122735,
"loss": 0.2371,
"step": 5790
},
{
"epoch": 1.64,
"grad_norm": 17.194223427069954,
"learning_rate": 0.00018443161893629453,
"loss": 0.3299,
"step": 5800
},
{
"epoch": 1.64,
"grad_norm": 11.83536538733706,
"learning_rate": 0.00018421244886031557,
"loss": 0.1985,
"step": 5810
},
{
"epoch": 1.64,
"grad_norm": 3.4341999458410153,
"learning_rate": 0.00018399327878433665,
"loss": 0.2886,
"step": 5820
},
{
"epoch": 1.64,
"grad_norm": 4.533321919018761,
"learning_rate": 0.00018377410870835768,
"loss": 0.2209,
"step": 5830
},
{
"epoch": 1.65,
"grad_norm": 15.786307762774168,
"learning_rate": 0.0001835549386323787,
"loss": 0.1963,
"step": 5840
},
{
"epoch": 1.65,
"grad_norm": 4.999680638451794,
"learning_rate": 0.00018333576855639977,
"loss": 0.3231,
"step": 5850
},
{
"epoch": 1.65,
"grad_norm": 7.345856055836037,
"learning_rate": 0.0001831165984804208,
"loss": 0.3497,
"step": 5860
},
{
"epoch": 1.65,
"grad_norm": 24.05112698349709,
"learning_rate": 0.00018289742840444184,
"loss": 0.1703,
"step": 5870
},
{
"epoch": 1.66,
"grad_norm": 5.14777448497475,
"learning_rate": 0.00018267825832846284,
"loss": 0.2877,
"step": 5880
},
{
"epoch": 1.66,
"grad_norm": 22.321301335050478,
"learning_rate": 0.00018245908825248393,
"loss": 0.3838,
"step": 5890
},
{
"epoch": 1.66,
"grad_norm": 13.553653717813507,
"learning_rate": 0.00018223991817650496,
"loss": 0.2386,
"step": 5900
},
{
"epoch": 1.67,
"grad_norm": 8.36423228043391,
"learning_rate": 0.000182020748100526,
"loss": 0.3274,
"step": 5910
},
{
"epoch": 1.67,
"grad_norm": 32.214527042072234,
"learning_rate": 0.00018180157802454705,
"loss": 0.2649,
"step": 5920
},
{
"epoch": 1.67,
"grad_norm": 7.310510867463555,
"learning_rate": 0.00018158240794856808,
"loss": 0.3419,
"step": 5930
},
{
"epoch": 1.67,
"grad_norm": 26.843414736420463,
"learning_rate": 0.00018136323787258912,
"loss": 0.3201,
"step": 5940
},
{
"epoch": 1.68,
"grad_norm": 17.750295021021458,
"learning_rate": 0.00018114406779661015,
"loss": 0.2248,
"step": 5950
},
{
"epoch": 1.68,
"grad_norm": 0.8761970753193081,
"learning_rate": 0.0001809248977206312,
"loss": 0.1681,
"step": 5960
},
{
"epoch": 1.68,
"grad_norm": 8.023645949418535,
"learning_rate": 0.00018070572764465224,
"loss": 0.3362,
"step": 5970
},
{
"epoch": 1.69,
"grad_norm": 21.33676597686644,
"learning_rate": 0.00018048655756867327,
"loss": 0.1318,
"step": 5980
},
{
"epoch": 1.69,
"grad_norm": 17.955586403704974,
"learning_rate": 0.00018026738749269433,
"loss": 0.3109,
"step": 5990
},
{
"epoch": 1.69,
"grad_norm": 37.44497964171056,
"learning_rate": 0.00018004821741671536,
"loss": 0.224,
"step": 6000
},
{
"epoch": 1.69,
"grad_norm": 21.77593890358771,
"learning_rate": 0.0001798290473407364,
"loss": 0.3034,
"step": 6010
},
{
"epoch": 1.7,
"grad_norm": 0.7352368145267243,
"learning_rate": 0.00017960987726475745,
"loss": 0.1274,
"step": 6020
},
{
"epoch": 1.7,
"grad_norm": 21.416372952522156,
"learning_rate": 0.00017939070718877849,
"loss": 0.2799,
"step": 6030
},
{
"epoch": 1.7,
"grad_norm": 6.43140669106781,
"learning_rate": 0.00017917153711279952,
"loss": 0.3944,
"step": 6040
},
{
"epoch": 1.71,
"grad_norm": 21.589922507704756,
"learning_rate": 0.00017895236703682055,
"loss": 0.4062,
"step": 6050
},
{
"epoch": 1.71,
"grad_norm": 11.40138578000891,
"learning_rate": 0.0001787331969608416,
"loss": 0.244,
"step": 6060
},
{
"epoch": 1.71,
"grad_norm": 17.091859469856562,
"learning_rate": 0.00017851402688486264,
"loss": 0.2566,
"step": 6070
},
{
"epoch": 1.71,
"grad_norm": 11.99466531174607,
"learning_rate": 0.00017829485680888367,
"loss": 0.2817,
"step": 6080
},
{
"epoch": 1.72,
"grad_norm": 20.51599983452521,
"learning_rate": 0.00017807568673290473,
"loss": 0.2728,
"step": 6090
},
{
"epoch": 1.72,
"grad_norm": 43.54037932343941,
"learning_rate": 0.00017785651665692576,
"loss": 0.3911,
"step": 6100
},
{
"epoch": 1.72,
"grad_norm": 14.595613560758464,
"learning_rate": 0.0001776373465809468,
"loss": 0.2645,
"step": 6110
},
{
"epoch": 1.73,
"grad_norm": 24.91815308399374,
"learning_rate": 0.00017741817650496783,
"loss": 0.2268,
"step": 6120
},
{
"epoch": 1.73,
"grad_norm": 9.720913254909993,
"learning_rate": 0.0001771990064289889,
"loss": 0.3635,
"step": 6130
},
{
"epoch": 1.73,
"grad_norm": 29.77336977828609,
"learning_rate": 0.00017697983635300992,
"loss": 0.2624,
"step": 6140
},
{
"epoch": 1.73,
"grad_norm": 35.22870133705211,
"learning_rate": 0.00017676066627703095,
"loss": 0.3233,
"step": 6150
},
{
"epoch": 1.74,
"grad_norm": 1.3492806118043075,
"learning_rate": 0.000176541496201052,
"loss": 0.2593,
"step": 6160
},
{
"epoch": 1.74,
"grad_norm": 12.854084741350912,
"learning_rate": 0.00017632232612507304,
"loss": 0.288,
"step": 6170
},
{
"epoch": 1.74,
"grad_norm": 26.51581773086576,
"learning_rate": 0.00017610315604909407,
"loss": 0.4244,
"step": 6180
},
{
"epoch": 1.75,
"grad_norm": 8.677592676470999,
"learning_rate": 0.00017588398597311513,
"loss": 0.2529,
"step": 6190
},
{
"epoch": 1.75,
"grad_norm": 49.69265782318678,
"learning_rate": 0.00017566481589713617,
"loss": 0.2555,
"step": 6200
},
{
"epoch": 1.75,
"grad_norm": 9.137310587778737,
"learning_rate": 0.0001754456458211572,
"loss": 0.2237,
"step": 6210
},
{
"epoch": 1.75,
"grad_norm": 2.5627703512715154,
"learning_rate": 0.00017522647574517823,
"loss": 0.2253,
"step": 6220
},
{
"epoch": 1.76,
"grad_norm": 1.3427155017491879,
"learning_rate": 0.0001750073056691993,
"loss": 0.4267,
"step": 6230
},
{
"epoch": 1.76,
"grad_norm": 32.026763021399056,
"learning_rate": 0.00017478813559322032,
"loss": 0.2694,
"step": 6240
},
{
"epoch": 1.76,
"grad_norm": 1.8952720705907915,
"learning_rate": 0.00017456896551724135,
"loss": 0.2647,
"step": 6250
},
{
"epoch": 1.76,
"grad_norm": 8.293829114055725,
"learning_rate": 0.0001743497954412624,
"loss": 0.3184,
"step": 6260
},
{
"epoch": 1.77,
"grad_norm": 21.630558018188975,
"learning_rate": 0.00017413062536528344,
"loss": 0.2528,
"step": 6270
},
{
"epoch": 1.77,
"grad_norm": 11.263065393245803,
"learning_rate": 0.00017391145528930448,
"loss": 0.3655,
"step": 6280
},
{
"epoch": 1.77,
"grad_norm": 0.5383693893588692,
"learning_rate": 0.0001736922852133255,
"loss": 0.2871,
"step": 6290
},
{
"epoch": 1.78,
"grad_norm": 8.222238464796305,
"learning_rate": 0.00017347311513734657,
"loss": 0.2595,
"step": 6300
},
{
"epoch": 1.78,
"grad_norm": 17.415154838487343,
"learning_rate": 0.0001732539450613676,
"loss": 0.3208,
"step": 6310
},
{
"epoch": 1.78,
"grad_norm": 2.384813430472559,
"learning_rate": 0.00017303477498538863,
"loss": 0.1697,
"step": 6320
},
{
"epoch": 1.78,
"grad_norm": 3.430324248430251,
"learning_rate": 0.0001728156049094097,
"loss": 0.3111,
"step": 6330
},
{
"epoch": 1.79,
"grad_norm": 29.695298798800554,
"learning_rate": 0.00017259643483343072,
"loss": 0.2624,
"step": 6340
},
{
"epoch": 1.79,
"grad_norm": 3.4530119939476016,
"learning_rate": 0.00017239918176504965,
"loss": 0.5039,
"step": 6350
},
{
"epoch": 1.79,
"grad_norm": 14.522683359889644,
"learning_rate": 0.0001721800116890707,
"loss": 0.2896,
"step": 6360
},
{
"epoch": 1.8,
"grad_norm": 11.419748641445885,
"learning_rate": 0.00017196084161309175,
"loss": 0.2413,
"step": 6370
},
{
"epoch": 1.8,
"grad_norm": 12.845162868376475,
"learning_rate": 0.00017174167153711278,
"loss": 0.2085,
"step": 6380
},
{
"epoch": 1.8,
"grad_norm": 50.97954814308535,
"learning_rate": 0.0001715225014611338,
"loss": 0.4096,
"step": 6390
},
{
"epoch": 1.8,
"grad_norm": 3.776354286239877,
"learning_rate": 0.00017130333138515487,
"loss": 0.2438,
"step": 6400
},
{
"epoch": 1.81,
"grad_norm": 3.71168428040738,
"learning_rate": 0.0001710841613091759,
"loss": 0.2612,
"step": 6410
},
{
"epoch": 1.81,
"grad_norm": 32.131330999922675,
"learning_rate": 0.00017086499123319693,
"loss": 0.4542,
"step": 6420
},
{
"epoch": 1.81,
"grad_norm": 6.736921728341746,
"learning_rate": 0.000170645821157218,
"loss": 0.2303,
"step": 6430
},
{
"epoch": 1.82,
"grad_norm": 15.468427433397974,
"learning_rate": 0.00017042665108123902,
"loss": 0.3181,
"step": 6440
},
{
"epoch": 1.82,
"grad_norm": 64.72524937993848,
"learning_rate": 0.00017020748100526006,
"loss": 0.2804,
"step": 6450
},
{
"epoch": 1.82,
"grad_norm": 5.219058854813203,
"learning_rate": 0.0001699883109292811,
"loss": 0.2497,
"step": 6460
},
{
"epoch": 1.82,
"grad_norm": 1.6037966913707118,
"learning_rate": 0.00016976914085330215,
"loss": 0.2066,
"step": 6470
},
{
"epoch": 1.83,
"grad_norm": 6.240768583919815,
"learning_rate": 0.00016954997077732318,
"loss": 0.2428,
"step": 6480
},
{
"epoch": 1.83,
"grad_norm": 23.240452777334195,
"learning_rate": 0.0001693308007013442,
"loss": 0.2075,
"step": 6490
},
{
"epoch": 1.83,
"grad_norm": 39.5379292284798,
"learning_rate": 0.00016911163062536527,
"loss": 0.2936,
"step": 6500
},
{
"epoch": 1.84,
"grad_norm": 32.798181084355704,
"learning_rate": 0.0001688924605493863,
"loss": 0.3773,
"step": 6510
},
{
"epoch": 1.84,
"grad_norm": 17.376010201609784,
"learning_rate": 0.00016867329047340733,
"loss": 0.3442,
"step": 6520
},
{
"epoch": 1.84,
"grad_norm": 12.08974541668813,
"learning_rate": 0.0001684541203974284,
"loss": 0.3321,
"step": 6530
},
{
"epoch": 1.84,
"grad_norm": 34.210468800599315,
"learning_rate": 0.00016823495032144942,
"loss": 0.2846,
"step": 6540
},
{
"epoch": 1.85,
"grad_norm": 13.86396524279559,
"learning_rate": 0.00016801578024547046,
"loss": 0.2522,
"step": 6550
},
{
"epoch": 1.85,
"grad_norm": 2.9623178774098693,
"learning_rate": 0.0001677966101694915,
"loss": 0.3122,
"step": 6560
},
{
"epoch": 1.85,
"grad_norm": 16.202793893907323,
"learning_rate": 0.00016757744009351255,
"loss": 0.2785,
"step": 6570
},
{
"epoch": 1.86,
"grad_norm": 8.818952376048744,
"learning_rate": 0.00016735827001753358,
"loss": 0.2893,
"step": 6580
},
{
"epoch": 1.86,
"grad_norm": 9.840941433124744,
"learning_rate": 0.0001671390999415546,
"loss": 0.4644,
"step": 6590
},
{
"epoch": 1.86,
"grad_norm": 22.487991211369078,
"learning_rate": 0.0001669199298655757,
"loss": 0.3367,
"step": 6600
},
{
"epoch": 1.86,
"grad_norm": 29.408750577985327,
"learning_rate": 0.0001667007597895967,
"loss": 0.433,
"step": 6610
},
{
"epoch": 1.87,
"grad_norm": 44.84702783843238,
"learning_rate": 0.00016648158971361773,
"loss": 0.288,
"step": 6620
},
{
"epoch": 1.87,
"grad_norm": 35.39092595728219,
"learning_rate": 0.00016626241963763877,
"loss": 0.401,
"step": 6630
},
{
"epoch": 1.87,
"grad_norm": 9.939332121236001,
"learning_rate": 0.00016604324956165985,
"loss": 0.1682,
"step": 6640
},
{
"epoch": 1.87,
"grad_norm": 44.968039933166224,
"learning_rate": 0.00016582407948568089,
"loss": 0.2435,
"step": 6650
},
{
"epoch": 1.88,
"grad_norm": 8.897239396543608,
"learning_rate": 0.0001656049094097019,
"loss": 0.2616,
"step": 6660
},
{
"epoch": 1.88,
"grad_norm": 24.54192700316105,
"learning_rate": 0.00016538573933372298,
"loss": 0.2454,
"step": 6670
},
{
"epoch": 1.88,
"grad_norm": 1.9484839859694942,
"learning_rate": 0.000165166569257744,
"loss": 0.2232,
"step": 6680
},
{
"epoch": 1.89,
"grad_norm": 15.628379568346645,
"learning_rate": 0.00016494739918176504,
"loss": 0.2314,
"step": 6690
},
{
"epoch": 1.89,
"grad_norm": 23.499563414114768,
"learning_rate": 0.0001647282291057861,
"loss": 0.1903,
"step": 6700
},
{
"epoch": 1.89,
"grad_norm": 19.39538243318877,
"learning_rate": 0.00016450905902980713,
"loss": 0.2385,
"step": 6710
},
{
"epoch": 1.89,
"grad_norm": 5.238154008583709,
"learning_rate": 0.00016428988895382816,
"loss": 0.3917,
"step": 6720
},
{
"epoch": 1.9,
"grad_norm": 24.81933612902287,
"learning_rate": 0.0001640707188778492,
"loss": 0.2389,
"step": 6730
},
{
"epoch": 1.9,
"grad_norm": 4.608132814218328,
"learning_rate": 0.00016385154880187025,
"loss": 0.1413,
"step": 6740
},
{
"epoch": 1.9,
"grad_norm": 11.881773803892107,
"learning_rate": 0.0001636323787258913,
"loss": 0.1785,
"step": 6750
},
{
"epoch": 1.91,
"grad_norm": 14.177125312181635,
"learning_rate": 0.00016341320864991232,
"loss": 0.2461,
"step": 6760
},
{
"epoch": 1.91,
"grad_norm": 33.173507263725085,
"learning_rate": 0.00016319403857393338,
"loss": 0.5047,
"step": 6770
},
{
"epoch": 1.91,
"grad_norm": 37.82372264857794,
"learning_rate": 0.0001629748684979544,
"loss": 0.3656,
"step": 6780
},
{
"epoch": 1.91,
"grad_norm": 23.51699250829612,
"learning_rate": 0.00016275569842197544,
"loss": 0.3609,
"step": 6790
},
{
"epoch": 1.92,
"grad_norm": 26.427233006930997,
"learning_rate": 0.00016253652834599647,
"loss": 0.2522,
"step": 6800
},
{
"epoch": 1.92,
"grad_norm": 0.8480665720492925,
"learning_rate": 0.00016231735827001753,
"loss": 0.1934,
"step": 6810
},
{
"epoch": 1.92,
"grad_norm": 1.0073865565621205,
"learning_rate": 0.00016209818819403856,
"loss": 0.2325,
"step": 6820
},
{
"epoch": 1.93,
"grad_norm": 7.079199003953245,
"learning_rate": 0.0001618790181180596,
"loss": 0.2733,
"step": 6830
},
{
"epoch": 1.93,
"grad_norm": 4.227514966678838,
"learning_rate": 0.00016165984804208066,
"loss": 0.5935,
"step": 6840
},
{
"epoch": 1.93,
"grad_norm": 2.3825703295584146,
"learning_rate": 0.0001614406779661017,
"loss": 0.2733,
"step": 6850
},
{
"epoch": 1.93,
"grad_norm": 1.8576971315426782,
"learning_rate": 0.00016122150789012272,
"loss": 0.303,
"step": 6860
},
{
"epoch": 1.94,
"grad_norm": 33.6413124083341,
"learning_rate": 0.00016100233781414378,
"loss": 0.2274,
"step": 6870
},
{
"epoch": 1.94,
"grad_norm": 0.9887468524380643,
"learning_rate": 0.0001607831677381648,
"loss": 0.2919,
"step": 6880
},
{
"epoch": 1.94,
"grad_norm": 4.991113672687678,
"learning_rate": 0.00016056399766218584,
"loss": 0.2303,
"step": 6890
},
{
"epoch": 1.95,
"grad_norm": 56.489557072844796,
"learning_rate": 0.00016034482758620688,
"loss": 0.3899,
"step": 6900
},
{
"epoch": 1.95,
"grad_norm": 2.8149639916154947,
"learning_rate": 0.00016012565751022793,
"loss": 0.2179,
"step": 6910
},
{
"epoch": 1.95,
"grad_norm": 9.821507341872895,
"learning_rate": 0.00015990648743424897,
"loss": 0.3172,
"step": 6920
},
{
"epoch": 1.95,
"grad_norm": 2.8569862825069285,
"learning_rate": 0.00015968731735827,
"loss": 0.256,
"step": 6930
},
{
"epoch": 1.96,
"grad_norm": 2.1928517803266643,
"learning_rate": 0.00015946814728229106,
"loss": 0.1468,
"step": 6940
},
{
"epoch": 1.96,
"grad_norm": 8.161581000646946,
"learning_rate": 0.0001592489772063121,
"loss": 0.3608,
"step": 6950
},
{
"epoch": 1.96,
"grad_norm": 1.8284875998450847,
"learning_rate": 0.00015902980713033312,
"loss": 0.2207,
"step": 6960
},
{
"epoch": 1.97,
"grad_norm": 3.6951898202003726,
"learning_rate": 0.00015881063705435415,
"loss": 0.2749,
"step": 6970
},
{
"epoch": 1.97,
"grad_norm": 17.687512857327995,
"learning_rate": 0.0001585914669783752,
"loss": 0.2825,
"step": 6980
},
{
"epoch": 1.97,
"grad_norm": 4.61555546951409,
"learning_rate": 0.00015837229690239624,
"loss": 0.3753,
"step": 6990
},
{
"epoch": 1.97,
"grad_norm": 28.47716869865466,
"learning_rate": 0.00015815312682641728,
"loss": 0.3437,
"step": 7000
},
{
"epoch": 1.98,
"grad_norm": 9.853541461506175,
"learning_rate": 0.00015793395675043834,
"loss": 0.2261,
"step": 7010
},
{
"epoch": 1.98,
"grad_norm": 3.150395806350278,
"learning_rate": 0.00015771478667445937,
"loss": 0.3094,
"step": 7020
},
{
"epoch": 1.98,
"grad_norm": 8.382086348656976,
"learning_rate": 0.0001574956165984804,
"loss": 0.3093,
"step": 7030
},
{
"epoch": 1.98,
"grad_norm": 2.722468973867923,
"learning_rate": 0.00015727644652250143,
"loss": 0.1783,
"step": 7040
},
{
"epoch": 1.99,
"grad_norm": 7.546076976068019,
"learning_rate": 0.0001570572764465225,
"loss": 0.1107,
"step": 7050
},
{
"epoch": 1.99,
"grad_norm": 20.5642182254047,
"learning_rate": 0.00015683810637054352,
"loss": 0.4277,
"step": 7060
},
{
"epoch": 1.99,
"grad_norm": 23.175588346263925,
"learning_rate": 0.00015661893629456455,
"loss": 0.4047,
"step": 7070
},
{
"epoch": 2.0,
"grad_norm": 16.76827959394083,
"learning_rate": 0.00015639976621858561,
"loss": 0.2191,
"step": 7080
},
{
"epoch": 2.0,
"grad_norm": 39.47975455838656,
"learning_rate": 0.00015618059614260665,
"loss": 0.3542,
"step": 7090
},
{
"epoch": 2.0,
"eval_0_f1": 0.7521064301552106,
"eval_0_precision": 0.7848218417399352,
"eval_0_recall": 0.7220093656875266,
"eval_1_f1": 0.9185843285755897,
"eval_1_precision": 0.9061781609195402,
"eval_1_recall": 0.9313349084465445,
"eval_accuracy": 0.8774257208639403,
"eval_loss": 0.35205078125,
"eval_runtime": 546.1666,
"eval_samples_per_second": 16.7,
"eval_steps_per_second": 2.785,
"step": 7094
},
{
"epoch": 2.0,
"grad_norm": 3.9467741467334223,
"learning_rate": 0.00015596142606662768,
"loss": 0.185,
"step": 7100
},
{
"epoch": 2.0,
"grad_norm": 2.796842409513772,
"learning_rate": 0.00015574225599064874,
"loss": 0.1507,
"step": 7110
},
{
"epoch": 2.01,
"grad_norm": 1.4260048578862903,
"learning_rate": 0.00015552308591466977,
"loss": 0.1149,
"step": 7120
},
{
"epoch": 2.01,
"grad_norm": 1.5807242351519994,
"learning_rate": 0.0001553039158386908,
"loss": 0.0837,
"step": 7130
},
{
"epoch": 2.01,
"grad_norm": 0.04513928456377307,
"learning_rate": 0.00015508474576271183,
"loss": 0.1494,
"step": 7140
},
{
"epoch": 2.02,
"grad_norm": 40.003470604804754,
"learning_rate": 0.0001548655756867329,
"loss": 0.1058,
"step": 7150
},
{
"epoch": 2.02,
"grad_norm": 5.841175038437886,
"learning_rate": 0.00015464640561075392,
"loss": 0.1506,
"step": 7160
},
{
"epoch": 2.02,
"grad_norm": 7.276698167932587,
"learning_rate": 0.00015442723553477496,
"loss": 0.0631,
"step": 7170
},
{
"epoch": 2.02,
"grad_norm": 1.3902255476639265,
"learning_rate": 0.00015420806545879602,
"loss": 0.0569,
"step": 7180
},
{
"epoch": 2.03,
"grad_norm": 8.321999591495654,
"learning_rate": 0.00015398889538281705,
"loss": 0.2596,
"step": 7190
},
{
"epoch": 2.03,
"grad_norm": 4.274705509444957,
"learning_rate": 0.00015376972530683808,
"loss": 0.0755,
"step": 7200
},
{
"epoch": 2.03,
"grad_norm": 14.284605361939498,
"learning_rate": 0.0001535505552308591,
"loss": 0.0506,
"step": 7210
},
{
"epoch": 2.04,
"grad_norm": 1.2721793288961767,
"learning_rate": 0.00015333138515488017,
"loss": 0.1444,
"step": 7220
},
{
"epoch": 2.04,
"grad_norm": 10.887784732379894,
"learning_rate": 0.0001531122150789012,
"loss": 0.0952,
"step": 7230
},
{
"epoch": 2.04,
"grad_norm": 0.33776382671575805,
"learning_rate": 0.00015289304500292223,
"loss": 0.1503,
"step": 7240
},
{
"epoch": 2.04,
"grad_norm": 0.7362979177108379,
"learning_rate": 0.00015267387492694332,
"loss": 0.0826,
"step": 7250
},
{
"epoch": 2.05,
"grad_norm": 12.73307715279125,
"learning_rate": 0.00015245470485096433,
"loss": 0.2583,
"step": 7260
},
{
"epoch": 2.05,
"grad_norm": 38.30889802059039,
"learning_rate": 0.00015223553477498536,
"loss": 0.2281,
"step": 7270
},
{
"epoch": 2.05,
"grad_norm": 21.730855964037335,
"learning_rate": 0.00015201636469900644,
"loss": 0.3252,
"step": 7280
},
{
"epoch": 2.06,
"grad_norm": 19.535045567591606,
"learning_rate": 0.00015179719462302748,
"loss": 0.1412,
"step": 7290
},
{
"epoch": 2.06,
"grad_norm": 6.082430108448023,
"learning_rate": 0.00015157802454704848,
"loss": 0.1301,
"step": 7300
},
{
"epoch": 2.06,
"grad_norm": 3.5121677910383875,
"learning_rate": 0.0001513588544710695,
"loss": 0.2337,
"step": 7310
},
{
"epoch": 2.06,
"grad_norm": 18.706966962801445,
"learning_rate": 0.0001511396843950906,
"loss": 0.0847,
"step": 7320
},
{
"epoch": 2.07,
"grad_norm": 1.4783230754439916,
"learning_rate": 0.00015092051431911163,
"loss": 0.1419,
"step": 7330
},
{
"epoch": 2.07,
"grad_norm": 2.9639753705286136,
"learning_rate": 0.00015070134424313264,
"loss": 0.0583,
"step": 7340
},
{
"epoch": 2.07,
"grad_norm": 23.160696704392283,
"learning_rate": 0.00015048217416715372,
"loss": 0.2117,
"step": 7350
},
{
"epoch": 2.07,
"grad_norm": 12.771447911890823,
"learning_rate": 0.00015026300409117475,
"loss": 0.0548,
"step": 7360
},
{
"epoch": 2.08,
"grad_norm": 11.531079583730829,
"learning_rate": 0.00015004383401519579,
"loss": 0.1,
"step": 7370
},
{
"epoch": 2.08,
"grad_norm": 1.5725191403592071,
"learning_rate": 0.00014982466393921682,
"loss": 0.0763,
"step": 7380
},
{
"epoch": 2.08,
"grad_norm": 19.62591146141424,
"learning_rate": 0.00014960549386323785,
"loss": 0.2172,
"step": 7390
},
{
"epoch": 2.09,
"grad_norm": 4.328008880202292,
"learning_rate": 0.0001493863237872589,
"loss": 0.1791,
"step": 7400
},
{
"epoch": 2.09,
"grad_norm": 2.542706232558499,
"learning_rate": 0.00014916715371127994,
"loss": 0.1069,
"step": 7410
},
{
"epoch": 2.09,
"grad_norm": 5.01099611371998,
"learning_rate": 0.00014894798363530097,
"loss": 0.0867,
"step": 7420
},
{
"epoch": 2.09,
"grad_norm": 1.4225594957009309,
"learning_rate": 0.00014872881355932203,
"loss": 0.1617,
"step": 7430
},
{
"epoch": 2.1,
"grad_norm": 1.4366748859319889,
"learning_rate": 0.00014850964348334306,
"loss": 0.1247,
"step": 7440
},
{
"epoch": 2.1,
"grad_norm": 0.4798934066028662,
"learning_rate": 0.0001482904734073641,
"loss": 0.1457,
"step": 7450
},
{
"epoch": 2.1,
"grad_norm": 0.8614048372726179,
"learning_rate": 0.00014807130333138516,
"loss": 0.1488,
"step": 7460
},
{
"epoch": 2.11,
"grad_norm": 1.5048632731892742,
"learning_rate": 0.0001478521332554062,
"loss": 0.0832,
"step": 7470
},
{
"epoch": 2.11,
"grad_norm": 2.6843269650855808,
"learning_rate": 0.00014763296317942722,
"loss": 0.0961,
"step": 7480
},
{
"epoch": 2.11,
"grad_norm": 16.8741628191936,
"learning_rate": 0.00014741379310344825,
"loss": 0.2136,
"step": 7490
},
{
"epoch": 2.11,
"grad_norm": 0.807139340475033,
"learning_rate": 0.0001471946230274693,
"loss": 0.1095,
"step": 7500
},
{
"epoch": 2.12,
"grad_norm": 2.2184905450148986,
"learning_rate": 0.00014697545295149034,
"loss": 0.1902,
"step": 7510
},
{
"epoch": 2.12,
"grad_norm": 5.771979829819086,
"learning_rate": 0.00014675628287551137,
"loss": 0.138,
"step": 7520
},
{
"epoch": 2.12,
"grad_norm": 5.97570946689331,
"learning_rate": 0.00014653711279953243,
"loss": 0.21,
"step": 7530
},
{
"epoch": 2.13,
"grad_norm": 5.577046113368879,
"learning_rate": 0.00014631794272355347,
"loss": 0.2296,
"step": 7540
},
{
"epoch": 2.13,
"grad_norm": 0.16147618396834,
"learning_rate": 0.0001460987726475745,
"loss": 0.1041,
"step": 7550
},
{
"epoch": 2.13,
"grad_norm": 1.3482389383340438,
"learning_rate": 0.00014587960257159553,
"loss": 0.0832,
"step": 7560
},
{
"epoch": 2.13,
"grad_norm": 5.133780628362138,
"learning_rate": 0.0001456604324956166,
"loss": 0.0916,
"step": 7570
},
{
"epoch": 2.14,
"grad_norm": 2.729454474197146,
"learning_rate": 0.00014544126241963762,
"loss": 0.1806,
"step": 7580
},
{
"epoch": 2.14,
"grad_norm": 2.72773715656119,
"learning_rate": 0.00014522209234365865,
"loss": 0.0647,
"step": 7590
},
{
"epoch": 2.14,
"grad_norm": 3.507884957259747,
"learning_rate": 0.0001450248392752776,
"loss": 0.2701,
"step": 7600
},
{
"epoch": 2.15,
"grad_norm": 19.081923498359,
"learning_rate": 0.00014480566919929864,
"loss": 0.0962,
"step": 7610
},
{
"epoch": 2.15,
"grad_norm": 45.75244080209308,
"learning_rate": 0.00014458649912331968,
"loss": 0.2713,
"step": 7620
},
{
"epoch": 2.15,
"grad_norm": 6.360305569438668,
"learning_rate": 0.00014436732904734073,
"loss": 0.0377,
"step": 7630
},
{
"epoch": 2.15,
"grad_norm": 11.812868187605755,
"learning_rate": 0.00014414815897136177,
"loss": 0.0538,
"step": 7640
},
{
"epoch": 2.16,
"grad_norm": 11.581826556875212,
"learning_rate": 0.0001439289888953828,
"loss": 0.1113,
"step": 7650
},
{
"epoch": 2.16,
"grad_norm": 3.0769487150537067,
"learning_rate": 0.00014370981881940383,
"loss": 0.1045,
"step": 7660
},
{
"epoch": 2.16,
"grad_norm": 19.551871190286114,
"learning_rate": 0.0001434906487434249,
"loss": 0.1375,
"step": 7670
},
{
"epoch": 2.17,
"grad_norm": 4.0482427664543925,
"learning_rate": 0.00014327147866744592,
"loss": 0.1863,
"step": 7680
},
{
"epoch": 2.17,
"grad_norm": 3.1977613453815654,
"learning_rate": 0.00014305230859146695,
"loss": 0.0756,
"step": 7690
},
{
"epoch": 2.17,
"grad_norm": 0.9737985079221272,
"learning_rate": 0.000142833138515488,
"loss": 0.0916,
"step": 7700
},
{
"epoch": 2.17,
"grad_norm": 1.984146795188883,
"learning_rate": 0.00014261396843950904,
"loss": 0.2625,
"step": 7710
},
{
"epoch": 2.18,
"grad_norm": 1.72026076920096,
"learning_rate": 0.00014239479836353008,
"loss": 0.1073,
"step": 7720
},
{
"epoch": 2.18,
"grad_norm": 0.555301240735939,
"learning_rate": 0.00014217562828755114,
"loss": 0.0962,
"step": 7730
},
{
"epoch": 2.18,
"grad_norm": 12.306241433553296,
"learning_rate": 0.00014195645821157217,
"loss": 0.312,
"step": 7740
},
{
"epoch": 2.18,
"grad_norm": 39.31295240290497,
"learning_rate": 0.0001417372881355932,
"loss": 0.1382,
"step": 7750
},
{
"epoch": 2.19,
"grad_norm": 18.239083580266996,
"learning_rate": 0.00014151811805961423,
"loss": 0.1959,
"step": 7760
},
{
"epoch": 2.19,
"grad_norm": 3.6063732707629277,
"learning_rate": 0.0001412989479836353,
"loss": 0.1882,
"step": 7770
},
{
"epoch": 2.19,
"grad_norm": 22.59443633796688,
"learning_rate": 0.00014107977790765632,
"loss": 0.1411,
"step": 7780
},
{
"epoch": 2.2,
"grad_norm": 9.943061970841525,
"learning_rate": 0.00014086060783167738,
"loss": 0.2073,
"step": 7790
},
{
"epoch": 2.2,
"grad_norm": 13.334986026618791,
"learning_rate": 0.00014064143775569841,
"loss": 0.1452,
"step": 7800
},
{
"epoch": 2.2,
"grad_norm": 18.672479936744026,
"learning_rate": 0.00014042226767971945,
"loss": 0.1604,
"step": 7810
},
{
"epoch": 2.2,
"grad_norm": 8.209768975033892,
"learning_rate": 0.0001402030976037405,
"loss": 0.0633,
"step": 7820
},
{
"epoch": 2.21,
"grad_norm": 7.929920880167193,
"learning_rate": 0.00013998392752776154,
"loss": 0.0759,
"step": 7830
},
{
"epoch": 2.21,
"grad_norm": 5.155314191153851,
"learning_rate": 0.00013976475745178257,
"loss": 0.1334,
"step": 7840
},
{
"epoch": 2.21,
"grad_norm": 12.293469154578297,
"learning_rate": 0.00013954558737580363,
"loss": 0.1731,
"step": 7850
},
{
"epoch": 2.22,
"grad_norm": 27.793799163143525,
"learning_rate": 0.00013932641729982466,
"loss": 0.1718,
"step": 7860
},
{
"epoch": 2.22,
"grad_norm": 18.160732235536603,
"learning_rate": 0.0001391072472238457,
"loss": 0.3889,
"step": 7870
},
{
"epoch": 2.22,
"grad_norm": 6.719945966655997,
"learning_rate": 0.00013888807714786672,
"loss": 0.0903,
"step": 7880
},
{
"epoch": 2.22,
"grad_norm": 4.702233374552434,
"learning_rate": 0.00013866890707188778,
"loss": 0.1029,
"step": 7890
},
{
"epoch": 2.23,
"grad_norm": 1.6814706649468594,
"learning_rate": 0.00013844973699590882,
"loss": 0.1609,
"step": 7900
},
{
"epoch": 2.23,
"grad_norm": 35.183209348221745,
"learning_rate": 0.00013823056691992985,
"loss": 0.0987,
"step": 7910
},
{
"epoch": 2.23,
"grad_norm": 0.3853650828148091,
"learning_rate": 0.0001380113968439509,
"loss": 0.092,
"step": 7920
},
{
"epoch": 2.24,
"grad_norm": 2.1900815019481827,
"learning_rate": 0.00013779222676797194,
"loss": 0.0721,
"step": 7930
},
{
"epoch": 2.24,
"grad_norm": 2.719013304384676,
"learning_rate": 0.00013757305669199297,
"loss": 0.2757,
"step": 7940
},
{
"epoch": 2.24,
"grad_norm": 1.9065891840102538,
"learning_rate": 0.000137353886616014,
"loss": 0.0894,
"step": 7950
},
{
"epoch": 2.24,
"grad_norm": 0.7041589905973044,
"learning_rate": 0.00013713471654003506,
"loss": 0.1567,
"step": 7960
},
{
"epoch": 2.25,
"grad_norm": 13.09370538833684,
"learning_rate": 0.0001369155464640561,
"loss": 0.1153,
"step": 7970
},
{
"epoch": 2.25,
"grad_norm": 0.10005792474473749,
"learning_rate": 0.00013669637638807713,
"loss": 0.0889,
"step": 7980
},
{
"epoch": 2.25,
"grad_norm": 8.650143720385165,
"learning_rate": 0.00013647720631209818,
"loss": 0.102,
"step": 7990
},
{
"epoch": 2.26,
"grad_norm": 13.834552459976706,
"learning_rate": 0.00013625803623611922,
"loss": 0.1991,
"step": 8000
},
{
"epoch": 2.26,
"grad_norm": 7.1136160697882636,
"learning_rate": 0.00013603886616014025,
"loss": 0.1605,
"step": 8010
},
{
"epoch": 2.26,
"grad_norm": 3.409491115278956,
"learning_rate": 0.00013581969608416128,
"loss": 0.182,
"step": 8020
},
{
"epoch": 2.26,
"grad_norm": 11.603050853164602,
"learning_rate": 0.00013560052600818234,
"loss": 0.1393,
"step": 8030
},
{
"epoch": 2.27,
"grad_norm": 91.11285901526837,
"learning_rate": 0.00013538135593220337,
"loss": 0.5593,
"step": 8040
},
{
"epoch": 2.27,
"grad_norm": 9.002577439722662,
"learning_rate": 0.0001351621858562244,
"loss": 0.2636,
"step": 8050
},
{
"epoch": 2.27,
"grad_norm": 2.8423642348776945,
"learning_rate": 0.00013494301578024546,
"loss": 0.2049,
"step": 8060
},
{
"epoch": 2.28,
"grad_norm": 9.017487980947873,
"learning_rate": 0.0001347238457042665,
"loss": 0.2001,
"step": 8070
},
{
"epoch": 2.28,
"grad_norm": 5.463991248637368,
"learning_rate": 0.00013450467562828753,
"loss": 0.1163,
"step": 8080
},
{
"epoch": 2.28,
"grad_norm": 35.9490561961924,
"learning_rate": 0.00013428550555230859,
"loss": 0.1705,
"step": 8090
},
{
"epoch": 2.28,
"grad_norm": 1.5214346506002336,
"learning_rate": 0.00013406633547632962,
"loss": 0.1706,
"step": 8100
},
{
"epoch": 2.29,
"grad_norm": 6.553206900744027,
"learning_rate": 0.00013384716540035068,
"loss": 0.1406,
"step": 8110
},
{
"epoch": 2.29,
"grad_norm": 10.219338883484347,
"learning_rate": 0.00013362799532437168,
"loss": 0.2204,
"step": 8120
},
{
"epoch": 2.29,
"grad_norm": 18.748997475525492,
"learning_rate": 0.00013340882524839274,
"loss": 0.2109,
"step": 8130
},
{
"epoch": 2.29,
"grad_norm": 1.3727170176969377,
"learning_rate": 0.0001331896551724138,
"loss": 0.1122,
"step": 8140
},
{
"epoch": 2.3,
"grad_norm": 10.948203912180993,
"learning_rate": 0.00013297048509643483,
"loss": 0.3022,
"step": 8150
},
{
"epoch": 2.3,
"grad_norm": 2.056636685995259,
"learning_rate": 0.00013275131502045586,
"loss": 0.239,
"step": 8160
},
{
"epoch": 2.3,
"grad_norm": 6.518651435362685,
"learning_rate": 0.0001325321449444769,
"loss": 0.3234,
"step": 8170
},
{
"epoch": 2.31,
"grad_norm": 5.475635275763899,
"learning_rate": 0.00013231297486849796,
"loss": 0.2198,
"step": 8180
},
{
"epoch": 2.31,
"grad_norm": 3.671883643297412,
"learning_rate": 0.000132093804792519,
"loss": 0.1707,
"step": 8190
},
{
"epoch": 2.31,
"grad_norm": 0.4579375750027517,
"learning_rate": 0.00013187463471654002,
"loss": 0.2133,
"step": 8200
},
{
"epoch": 2.31,
"grad_norm": 5.194066447141225,
"learning_rate": 0.00013165546464056108,
"loss": 0.2439,
"step": 8210
},
{
"epoch": 2.32,
"grad_norm": 10.494958205761325,
"learning_rate": 0.0001314362945645821,
"loss": 0.244,
"step": 8220
},
{
"epoch": 2.32,
"grad_norm": 3.2710044167508534,
"learning_rate": 0.00013121712448860314,
"loss": 0.1088,
"step": 8230
},
{
"epoch": 2.32,
"grad_norm": 16.056832749725743,
"learning_rate": 0.00013099795441262417,
"loss": 0.2221,
"step": 8240
},
{
"epoch": 2.33,
"grad_norm": 0.7672515772397378,
"learning_rate": 0.00013077878433664523,
"loss": 0.0554,
"step": 8250
},
{
"epoch": 2.33,
"grad_norm": 21.726819743293646,
"learning_rate": 0.00013055961426066627,
"loss": 0.1701,
"step": 8260
},
{
"epoch": 2.33,
"grad_norm": 11.287063948392506,
"learning_rate": 0.0001303404441846873,
"loss": 0.2342,
"step": 8270
},
{
"epoch": 2.33,
"grad_norm": 22.058917311910815,
"learning_rate": 0.00013012127410870836,
"loss": 0.163,
"step": 8280
},
{
"epoch": 2.34,
"grad_norm": 3.2943302484351142,
"learning_rate": 0.0001299021040327294,
"loss": 0.1142,
"step": 8290
},
{
"epoch": 2.34,
"grad_norm": 41.01468692106424,
"learning_rate": 0.00012968293395675042,
"loss": 0.1652,
"step": 8300
},
{
"epoch": 2.34,
"grad_norm": 15.110863439212581,
"learning_rate": 0.00012946376388077145,
"loss": 0.2891,
"step": 8310
},
{
"epoch": 2.35,
"grad_norm": 4.692836606354725,
"learning_rate": 0.0001292445938047925,
"loss": 0.0777,
"step": 8320
},
{
"epoch": 2.35,
"grad_norm": 42.35071301518718,
"learning_rate": 0.00012902542372881354,
"loss": 0.2289,
"step": 8330
},
{
"epoch": 2.35,
"grad_norm": 3.3851205501937085,
"learning_rate": 0.00012880625365283458,
"loss": 0.0574,
"step": 8340
},
{
"epoch": 2.35,
"grad_norm": 32.382836448835434,
"learning_rate": 0.00012858708357685564,
"loss": 0.2681,
"step": 8350
},
{
"epoch": 2.36,
"grad_norm": 11.990285837961236,
"learning_rate": 0.00012836791350087667,
"loss": 0.1145,
"step": 8360
},
{
"epoch": 2.36,
"grad_norm": 6.231424331853902,
"learning_rate": 0.0001281487434248977,
"loss": 0.1645,
"step": 8370
},
{
"epoch": 2.36,
"grad_norm": 8.823978019308194,
"learning_rate": 0.00012792957334891876,
"loss": 0.0895,
"step": 8380
},
{
"epoch": 2.37,
"grad_norm": 3.5791569065379147,
"learning_rate": 0.0001277104032729398,
"loss": 0.2226,
"step": 8390
},
{
"epoch": 2.37,
"grad_norm": 4.904760306159147,
"learning_rate": 0.00012749123319696082,
"loss": 0.1586,
"step": 8400
},
{
"epoch": 2.37,
"grad_norm": 35.790471287396194,
"learning_rate": 0.00012727206312098185,
"loss": 0.1451,
"step": 8410
},
{
"epoch": 2.37,
"grad_norm": 19.49158058941717,
"learning_rate": 0.0001270528930450029,
"loss": 0.1129,
"step": 8420
},
{
"epoch": 2.38,
"grad_norm": 1.2301380360175656,
"learning_rate": 0.00012683372296902397,
"loss": 0.2055,
"step": 8430
},
{
"epoch": 2.38,
"grad_norm": 6.081015675249448,
"learning_rate": 0.00012661455289304498,
"loss": 0.1039,
"step": 8440
},
{
"epoch": 2.38,
"grad_norm": 8.51374019556884,
"learning_rate": 0.00012639538281706604,
"loss": 0.0764,
"step": 8450
},
{
"epoch": 2.39,
"grad_norm": 22.61234712969463,
"learning_rate": 0.00012617621274108707,
"loss": 0.1144,
"step": 8460
},
{
"epoch": 2.39,
"grad_norm": 16.325546182379608,
"learning_rate": 0.00012595704266510813,
"loss": 0.2252,
"step": 8470
},
{
"epoch": 2.39,
"grad_norm": 10.684407579259915,
"learning_rate": 0.00012573787258912916,
"loss": 0.1617,
"step": 8480
},
{
"epoch": 2.39,
"grad_norm": 18.8401359355114,
"learning_rate": 0.0001255187025131502,
"loss": 0.1327,
"step": 8490
},
{
"epoch": 2.4,
"grad_norm": 4.428339354625936,
"learning_rate": 0.00012529953243717125,
"loss": 0.2793,
"step": 8500
},
{
"epoch": 2.4,
"grad_norm": 34.97712900138805,
"learning_rate": 0.00012508036236119228,
"loss": 0.1734,
"step": 8510
},
{
"epoch": 2.4,
"grad_norm": 10.842732736668664,
"learning_rate": 0.00012486119228521331,
"loss": 0.172,
"step": 8520
},
{
"epoch": 2.4,
"grad_norm": 3.8204570700978753,
"learning_rate": 0.00012464202220923435,
"loss": 0.1893,
"step": 8530
},
{
"epoch": 2.41,
"grad_norm": 1.7847088171149714,
"learning_rate": 0.0001244228521332554,
"loss": 0.1119,
"step": 8540
},
{
"epoch": 2.41,
"grad_norm": 10.013026009815832,
"learning_rate": 0.00012420368205727644,
"loss": 0.1488,
"step": 8550
},
{
"epoch": 2.41,
"grad_norm": 0.9956055302547419,
"learning_rate": 0.00012398451198129747,
"loss": 0.167,
"step": 8560
},
{
"epoch": 2.42,
"grad_norm": 15.708190043930621,
"learning_rate": 0.00012376534190531853,
"loss": 0.103,
"step": 8570
},
{
"epoch": 2.42,
"grad_norm": 9.516127340363248,
"learning_rate": 0.00012354617182933956,
"loss": 0.1761,
"step": 8580
},
{
"epoch": 2.42,
"grad_norm": 11.289621429730468,
"learning_rate": 0.0001233270017533606,
"loss": 0.21,
"step": 8590
},
{
"epoch": 2.42,
"grad_norm": 6.438699785103895,
"learning_rate": 0.00012310783167738162,
"loss": 0.1212,
"step": 8600
},
{
"epoch": 2.43,
"grad_norm": 0.4291084022368479,
"learning_rate": 0.00012288866160140268,
"loss": 0.1315,
"step": 8610
},
{
"epoch": 2.43,
"grad_norm": 3.090543654415638,
"learning_rate": 0.00012266949152542372,
"loss": 0.0689,
"step": 8620
},
{
"epoch": 2.43,
"grad_norm": 0.47917246377381595,
"learning_rate": 0.00012245032144944475,
"loss": 0.1119,
"step": 8630
},
{
"epoch": 2.44,
"grad_norm": 0.7069329797066186,
"learning_rate": 0.0001222311513734658,
"loss": 0.0562,
"step": 8640
},
{
"epoch": 2.44,
"grad_norm": 18.96685701324762,
"learning_rate": 0.00012201198129748684,
"loss": 0.1177,
"step": 8650
},
{
"epoch": 2.44,
"grad_norm": 0.00279294620786177,
"learning_rate": 0.00012179281122150788,
"loss": 0.0998,
"step": 8660
},
{
"epoch": 2.44,
"grad_norm": 6.199666064354547,
"learning_rate": 0.00012157364114552893,
"loss": 0.083,
"step": 8670
},
{
"epoch": 2.45,
"grad_norm": 0.639057585376392,
"learning_rate": 0.00012135447106954996,
"loss": 0.0941,
"step": 8680
},
{
"epoch": 2.45,
"grad_norm": 0.15447864197247607,
"learning_rate": 0.0001211572180011689,
"loss": 0.1944,
"step": 8690
},
{
"epoch": 2.45,
"grad_norm": 0.8173758905341566,
"learning_rate": 0.00012093804792518993,
"loss": 0.1019,
"step": 8700
},
{
"epoch": 2.46,
"grad_norm": 3.243981396595975,
"learning_rate": 0.00012071887784921097,
"loss": 0.3167,
"step": 8710
},
{
"epoch": 2.46,
"grad_norm": 3.4464993538031634,
"learning_rate": 0.00012049970777323202,
"loss": 0.1482,
"step": 8720
},
{
"epoch": 2.46,
"grad_norm": 23.425856884504597,
"learning_rate": 0.00012028053769725305,
"loss": 0.2159,
"step": 8730
},
{
"epoch": 2.46,
"grad_norm": 18.894344998479365,
"learning_rate": 0.00012006136762127411,
"loss": 0.3282,
"step": 8740
},
{
"epoch": 2.47,
"grad_norm": 4.015044349522744,
"learning_rate": 0.00011984219754529513,
"loss": 0.1495,
"step": 8750
},
{
"epoch": 2.47,
"grad_norm": 0.456213488330113,
"learning_rate": 0.00011962302746931619,
"loss": 0.1598,
"step": 8760
},
{
"epoch": 2.47,
"grad_norm": 13.704116606800925,
"learning_rate": 0.00011940385739333723,
"loss": 0.1294,
"step": 8770
},
{
"epoch": 2.48,
"grad_norm": 7.3368094990394175,
"learning_rate": 0.00011918468731735826,
"loss": 0.1551,
"step": 8780
},
{
"epoch": 2.48,
"grad_norm": 1.0015347738020366,
"learning_rate": 0.00011896551724137931,
"loss": 0.1037,
"step": 8790
},
{
"epoch": 2.48,
"grad_norm": 4.798234813041826,
"learning_rate": 0.00011874634716540034,
"loss": 0.0762,
"step": 8800
},
{
"epoch": 2.48,
"grad_norm": 0.46732247838464797,
"learning_rate": 0.00011852717708942139,
"loss": 0.2361,
"step": 8810
},
{
"epoch": 2.49,
"grad_norm": 0.78510526101886,
"learning_rate": 0.00011830800701344242,
"loss": 0.1172,
"step": 8820
},
{
"epoch": 2.49,
"grad_norm": 14.754170828495456,
"learning_rate": 0.00011808883693746346,
"loss": 0.1337,
"step": 8830
},
{
"epoch": 2.49,
"grad_norm": 5.457626503330071,
"learning_rate": 0.00011786966686148451,
"loss": 0.1238,
"step": 8840
},
{
"epoch": 2.5,
"grad_norm": 24.359633554477504,
"learning_rate": 0.00011765049678550554,
"loss": 0.159,
"step": 8850
},
{
"epoch": 2.5,
"grad_norm": 4.052116414721034,
"learning_rate": 0.00011743132670952659,
"loss": 0.1946,
"step": 8860
},
{
"epoch": 2.5,
"grad_norm": 10.286136518184552,
"learning_rate": 0.00011721215663354762,
"loss": 0.0675,
"step": 8870
},
{
"epoch": 2.5,
"grad_norm": 0.7798842992321797,
"learning_rate": 0.00011699298655756866,
"loss": 0.0518,
"step": 8880
},
{
"epoch": 2.51,
"grad_norm": 3.1590348601862037,
"learning_rate": 0.00011677381648158971,
"loss": 0.237,
"step": 8890
},
{
"epoch": 2.51,
"grad_norm": 9.309421761709203,
"learning_rate": 0.00011655464640561074,
"loss": 0.1237,
"step": 8900
},
{
"epoch": 2.51,
"grad_norm": 7.223449459613724,
"learning_rate": 0.00011633547632963179,
"loss": 0.1144,
"step": 8910
},
{
"epoch": 2.51,
"grad_norm": 2.293633045983554,
"learning_rate": 0.00011611630625365282,
"loss": 0.1469,
"step": 8920
},
{
"epoch": 2.52,
"grad_norm": 11.619319474508913,
"learning_rate": 0.00011589713617767387,
"loss": 0.1324,
"step": 8930
},
{
"epoch": 2.52,
"grad_norm": 8.237900621376555,
"learning_rate": 0.00011567796610169491,
"loss": 0.1019,
"step": 8940
},
{
"epoch": 2.52,
"grad_norm": 3.4844703517603746,
"learning_rate": 0.00011545879602571594,
"loss": 0.1582,
"step": 8950
},
{
"epoch": 2.53,
"grad_norm": 48.74139317560625,
"learning_rate": 0.00011523962594973699,
"loss": 0.243,
"step": 8960
},
{
"epoch": 2.53,
"grad_norm": 32.91098913412278,
"learning_rate": 0.00011502045587375802,
"loss": 0.153,
"step": 8970
},
{
"epoch": 2.53,
"grad_norm": 5.659700047857308,
"learning_rate": 0.00011480128579777907,
"loss": 0.0843,
"step": 8980
},
{
"epoch": 2.53,
"grad_norm": 22.35388198625644,
"learning_rate": 0.0001145821157218001,
"loss": 0.1841,
"step": 8990
},
{
"epoch": 2.54,
"grad_norm": 5.24175893236962,
"learning_rate": 0.00011436294564582114,
"loss": 0.1452,
"step": 9000
},
{
"epoch": 2.54,
"grad_norm": 5.865583240157655,
"learning_rate": 0.00011414377556984219,
"loss": 0.1757,
"step": 9010
},
{
"epoch": 2.54,
"grad_norm": 16.96991984978489,
"learning_rate": 0.00011392460549386322,
"loss": 0.2905,
"step": 9020
},
{
"epoch": 2.55,
"grad_norm": 1.4459460915714275,
"learning_rate": 0.00011370543541788427,
"loss": 0.0953,
"step": 9030
},
{
"epoch": 2.55,
"grad_norm": 0.27775375444037353,
"learning_rate": 0.0001134862653419053,
"loss": 0.0792,
"step": 9040
},
{
"epoch": 2.55,
"grad_norm": 1.1397011386751719,
"learning_rate": 0.00011326709526592634,
"loss": 0.0971,
"step": 9050
},
{
"epoch": 2.55,
"grad_norm": 32.740624235968234,
"learning_rate": 0.0001130479251899474,
"loss": 0.152,
"step": 9060
},
{
"epoch": 2.56,
"grad_norm": 15.666578971132482,
"learning_rate": 0.00011282875511396842,
"loss": 0.2745,
"step": 9070
},
{
"epoch": 2.56,
"grad_norm": 0.6312408815420002,
"learning_rate": 0.00011260958503798948,
"loss": 0.0736,
"step": 9080
},
{
"epoch": 2.56,
"grad_norm": 0.9882525535102352,
"learning_rate": 0.0001123904149620105,
"loss": 0.1197,
"step": 9090
},
{
"epoch": 2.57,
"grad_norm": 22.45196464336915,
"learning_rate": 0.00011217124488603156,
"loss": 0.0746,
"step": 9100
},
{
"epoch": 2.57,
"grad_norm": 0.4466669958671361,
"learning_rate": 0.00011195207481005258,
"loss": 0.026,
"step": 9110
},
{
"epoch": 2.57,
"grad_norm": 18.22674384805627,
"learning_rate": 0.00011173290473407364,
"loss": 0.1488,
"step": 9120
},
{
"epoch": 2.57,
"grad_norm": 3.2075642222454324,
"learning_rate": 0.00011151373465809468,
"loss": 0.271,
"step": 9130
},
{
"epoch": 2.58,
"grad_norm": 8.45175655576022,
"learning_rate": 0.00011129456458211571,
"loss": 0.0749,
"step": 9140
},
{
"epoch": 2.58,
"grad_norm": 13.874834426706034,
"learning_rate": 0.00011107539450613676,
"loss": 0.0782,
"step": 9150
},
{
"epoch": 2.58,
"grad_norm": 0.9676566458873671,
"learning_rate": 0.00011085622443015779,
"loss": 0.0905,
"step": 9160
},
{
"epoch": 2.59,
"grad_norm": 5.621065616371578,
"learning_rate": 0.00011063705435417884,
"loss": 0.0798,
"step": 9170
},
{
"epoch": 2.59,
"grad_norm": 7.042618733836522,
"learning_rate": 0.00011041788427819988,
"loss": 0.2499,
"step": 9180
},
{
"epoch": 2.59,
"grad_norm": 2.4221807383973646,
"learning_rate": 0.00011019871420222091,
"loss": 0.0507,
"step": 9190
},
{
"epoch": 2.59,
"grad_norm": 24.467155852219083,
"learning_rate": 0.00010997954412624196,
"loss": 0.1741,
"step": 9200
},
{
"epoch": 2.6,
"grad_norm": 4.428974769529165,
"learning_rate": 0.00010976037405026299,
"loss": 0.1104,
"step": 9210
},
{
"epoch": 2.6,
"grad_norm": 20.92148868212674,
"learning_rate": 0.00010954120397428404,
"loss": 0.0914,
"step": 9220
},
{
"epoch": 2.6,
"grad_norm": 0.30038070281462703,
"learning_rate": 0.00010932203389830507,
"loss": 0.1936,
"step": 9230
},
{
"epoch": 2.61,
"grad_norm": 48.36362550140161,
"learning_rate": 0.00010910286382232611,
"loss": 0.3639,
"step": 9240
},
{
"epoch": 2.61,
"grad_norm": 8.15759688958997,
"learning_rate": 0.00010888369374634716,
"loss": 0.1991,
"step": 9250
},
{
"epoch": 2.61,
"grad_norm": 12.841408835810743,
"learning_rate": 0.00010866452367036819,
"loss": 0.1441,
"step": 9260
},
{
"epoch": 2.61,
"grad_norm": 13.483453911295381,
"learning_rate": 0.00010844535359438924,
"loss": 0.0981,
"step": 9270
},
{
"epoch": 2.62,
"grad_norm": 0.5843792074816087,
"learning_rate": 0.00010822618351841027,
"loss": 0.2757,
"step": 9280
},
{
"epoch": 2.62,
"grad_norm": 7.822943624957112,
"learning_rate": 0.00010800701344243132,
"loss": 0.1102,
"step": 9290
},
{
"epoch": 2.62,
"grad_norm": 35.655682175617585,
"learning_rate": 0.00010778784336645236,
"loss": 0.2879,
"step": 9300
},
{
"epoch": 2.62,
"grad_norm": 1.429017224224025,
"learning_rate": 0.0001075686732904734,
"loss": 0.0986,
"step": 9310
},
{
"epoch": 2.63,
"grad_norm": 8.077860057159654,
"learning_rate": 0.00010734950321449444,
"loss": 0.2654,
"step": 9320
},
{
"epoch": 2.63,
"grad_norm": 1.2534241595837954,
"learning_rate": 0.00010713033313851547,
"loss": 0.0941,
"step": 9330
},
{
"epoch": 2.63,
"grad_norm": 0.9501360823975038,
"learning_rate": 0.00010691116306253652,
"loss": 0.1358,
"step": 9340
},
{
"epoch": 2.64,
"grad_norm": 1.3120476171581812,
"learning_rate": 0.00010669199298655756,
"loss": 0.1927,
"step": 9350
},
{
"epoch": 2.64,
"grad_norm": 10.128511370932692,
"learning_rate": 0.0001064728229105786,
"loss": 0.2176,
"step": 9360
},
{
"epoch": 2.64,
"grad_norm": 3.376491562592107,
"learning_rate": 0.00010625365283459964,
"loss": 0.0673,
"step": 9370
},
{
"epoch": 2.64,
"grad_norm": 0.011877575586390247,
"learning_rate": 0.00010603448275862067,
"loss": 0.1272,
"step": 9380
},
{
"epoch": 2.65,
"grad_norm": 15.244920750217991,
"learning_rate": 0.00010581531268264172,
"loss": 0.1012,
"step": 9390
},
{
"epoch": 2.65,
"grad_norm": 0.2705443274155431,
"learning_rate": 0.00010559614260666275,
"loss": 0.1024,
"step": 9400
},
{
"epoch": 2.65,
"grad_norm": 12.05359833618471,
"learning_rate": 0.0001053769725306838,
"loss": 0.1826,
"step": 9410
},
{
"epoch": 2.66,
"grad_norm": 9.360989137584955,
"learning_rate": 0.00010515780245470485,
"loss": 0.205,
"step": 9420
},
{
"epoch": 2.66,
"grad_norm": 6.831707184981156,
"learning_rate": 0.00010493863237872587,
"loss": 0.2364,
"step": 9430
},
{
"epoch": 2.66,
"grad_norm": 12.68075831146527,
"learning_rate": 0.00010471946230274693,
"loss": 0.1878,
"step": 9440
},
{
"epoch": 2.66,
"grad_norm": 9.461914667245052,
"learning_rate": 0.00010450029222676796,
"loss": 0.1061,
"step": 9450
},
{
"epoch": 2.67,
"grad_norm": 33.7830548827646,
"learning_rate": 0.00010428112215078901,
"loss": 0.0955,
"step": 9460
},
{
"epoch": 2.67,
"grad_norm": 3.41262405773915,
"learning_rate": 0.00010406195207481005,
"loss": 0.0893,
"step": 9470
},
{
"epoch": 2.67,
"grad_norm": 16.661587161769187,
"learning_rate": 0.00010384278199883109,
"loss": 0.1281,
"step": 9480
},
{
"epoch": 2.68,
"grad_norm": 19.501609655955452,
"learning_rate": 0.00010362361192285213,
"loss": 0.3944,
"step": 9490
},
{
"epoch": 2.68,
"grad_norm": 4.601907463118784,
"learning_rate": 0.00010340444184687316,
"loss": 0.1158,
"step": 9500
},
{
"epoch": 2.68,
"grad_norm": 24.10143095564842,
"learning_rate": 0.00010318527177089421,
"loss": 0.2357,
"step": 9510
},
{
"epoch": 2.68,
"grad_norm": 4.970899462803766,
"learning_rate": 0.00010296610169491524,
"loss": 0.1134,
"step": 9520
},
{
"epoch": 2.69,
"grad_norm": 0.04492151846400819,
"learning_rate": 0.00010274693161893629,
"loss": 0.1146,
"step": 9530
},
{
"epoch": 2.69,
"grad_norm": 7.2274322872988055,
"learning_rate": 0.00010252776154295733,
"loss": 0.1354,
"step": 9540
},
{
"epoch": 2.69,
"grad_norm": 6.048047676599459,
"learning_rate": 0.00010230859146697836,
"loss": 0.2284,
"step": 9550
},
{
"epoch": 2.7,
"grad_norm": 0.9638985947560608,
"learning_rate": 0.00010208942139099941,
"loss": 0.0955,
"step": 9560
},
{
"epoch": 2.7,
"grad_norm": 6.19171074222296,
"learning_rate": 0.00010187025131502044,
"loss": 0.0909,
"step": 9570
},
{
"epoch": 2.7,
"grad_norm": 22.167114710888278,
"learning_rate": 0.00010165108123904149,
"loss": 0.1367,
"step": 9580
},
{
"epoch": 2.7,
"grad_norm": 0.3557238646240087,
"learning_rate": 0.00010143191116306253,
"loss": 0.0825,
"step": 9590
},
{
"epoch": 2.71,
"grad_norm": 23.067425155746133,
"learning_rate": 0.00010121274108708357,
"loss": 0.0815,
"step": 9600
},
{
"epoch": 2.71,
"grad_norm": 1.3212362814667589,
"learning_rate": 0.00010099357101110461,
"loss": 0.1454,
"step": 9610
},
{
"epoch": 2.71,
"grad_norm": 9.03334133310524,
"learning_rate": 0.00010077440093512564,
"loss": 0.1943,
"step": 9620
},
{
"epoch": 2.71,
"grad_norm": 2.1696899004758556,
"learning_rate": 0.00010055523085914669,
"loss": 0.1105,
"step": 9630
},
{
"epoch": 2.72,
"grad_norm": 3.8419745918801067,
"learning_rate": 0.00010033606078316773,
"loss": 0.3075,
"step": 9640
},
{
"epoch": 2.72,
"grad_norm": 0.9683867410845369,
"learning_rate": 0.00010011689070718877,
"loss": 0.1233,
"step": 9650
},
{
"epoch": 2.72,
"grad_norm": 13.53465449736677,
"learning_rate": 9.989772063120981e-05,
"loss": 0.2396,
"step": 9660
},
{
"epoch": 2.73,
"grad_norm": 1.3390366123084314,
"learning_rate": 9.967855055523084e-05,
"loss": 0.117,
"step": 9670
},
{
"epoch": 2.73,
"grad_norm": 0.7238054927151057,
"learning_rate": 9.945938047925189e-05,
"loss": 0.0825,
"step": 9680
},
{
"epoch": 2.73,
"grad_norm": 1.343299376332715,
"learning_rate": 9.924021040327292e-05,
"loss": 0.1852,
"step": 9690
},
{
"epoch": 2.73,
"grad_norm": 2.7174665471907296,
"learning_rate": 9.902104032729397e-05,
"loss": 0.185,
"step": 9700
},
{
"epoch": 2.74,
"grad_norm": 19.67625891707943,
"learning_rate": 9.880187025131501e-05,
"loss": 0.1037,
"step": 9710
},
{
"epoch": 2.74,
"grad_norm": 1.7362888479778698,
"learning_rate": 9.858270017533604e-05,
"loss": 0.0993,
"step": 9720
},
{
"epoch": 2.74,
"grad_norm": 24.97466231442174,
"learning_rate": 9.836353009935709e-05,
"loss": 0.3,
"step": 9730
},
{
"epoch": 2.75,
"grad_norm": 0.19223391446010424,
"learning_rate": 9.814436002337812e-05,
"loss": 0.139,
"step": 9740
},
{
"epoch": 2.75,
"grad_norm": 1.0232214471272263,
"learning_rate": 9.792518994739917e-05,
"loss": 0.0779,
"step": 9750
},
{
"epoch": 2.75,
"grad_norm": 2.4406875106467685,
"learning_rate": 9.770601987142023e-05,
"loss": 0.2194,
"step": 9760
},
{
"epoch": 2.75,
"grad_norm": 29.07247504638446,
"learning_rate": 9.748684979544126e-05,
"loss": 0.0975,
"step": 9770
},
{
"epoch": 2.76,
"grad_norm": 0.7785847782304731,
"learning_rate": 9.72676797194623e-05,
"loss": 0.1824,
"step": 9780
},
{
"epoch": 2.76,
"grad_norm": 15.90756717391926,
"learning_rate": 9.704850964348334e-05,
"loss": 0.0997,
"step": 9790
},
{
"epoch": 2.76,
"grad_norm": 0.5434568527038021,
"learning_rate": 9.682933956750438e-05,
"loss": 0.0639,
"step": 9800
},
{
"epoch": 2.77,
"grad_norm": 0.8181189757985562,
"learning_rate": 9.661016949152541e-05,
"loss": 0.1401,
"step": 9810
},
{
"epoch": 2.77,
"grad_norm": 0.7002007823686216,
"learning_rate": 9.639099941554646e-05,
"loss": 0.1317,
"step": 9820
},
{
"epoch": 2.77,
"grad_norm": 10.646847543416506,
"learning_rate": 9.61718293395675e-05,
"loss": 0.0834,
"step": 9830
},
{
"epoch": 2.77,
"grad_norm": 2.854877487641081,
"learning_rate": 9.595265926358854e-05,
"loss": 0.094,
"step": 9840
},
{
"epoch": 2.78,
"grad_norm": 21.086749067232077,
"learning_rate": 9.573348918760958e-05,
"loss": 0.1658,
"step": 9850
},
{
"epoch": 2.78,
"grad_norm": 6.599220856291059,
"learning_rate": 9.551431911163061e-05,
"loss": 0.1013,
"step": 9860
},
{
"epoch": 2.78,
"grad_norm": 10.557135159098205,
"learning_rate": 9.529514903565166e-05,
"loss": 0.1954,
"step": 9870
},
{
"epoch": 2.79,
"grad_norm": 9.42143848698557,
"learning_rate": 9.50759789596727e-05,
"loss": 0.2229,
"step": 9880
},
{
"epoch": 2.79,
"grad_norm": 36.05803797726123,
"learning_rate": 9.485680888369374e-05,
"loss": 0.1102,
"step": 9890
},
{
"epoch": 2.79,
"grad_norm": 19.750284280798315,
"learning_rate": 9.463763880771478e-05,
"loss": 0.1737,
"step": 9900
},
{
"epoch": 2.79,
"grad_norm": 20.670824131237584,
"learning_rate": 9.441846873173582e-05,
"loss": 0.1087,
"step": 9910
},
{
"epoch": 2.8,
"grad_norm": 3.3595758310158126,
"learning_rate": 9.419929865575686e-05,
"loss": 0.0856,
"step": 9920
},
{
"epoch": 2.8,
"grad_norm": 0.6471151094333392,
"learning_rate": 9.398012857977789e-05,
"loss": 0.1957,
"step": 9930
},
{
"epoch": 2.8,
"grad_norm": 26.791209319259156,
"learning_rate": 9.376095850379894e-05,
"loss": 0.1552,
"step": 9940
},
{
"epoch": 2.81,
"grad_norm": 2.0994878103860124,
"learning_rate": 9.354178842781998e-05,
"loss": 0.0714,
"step": 9950
},
{
"epoch": 2.81,
"grad_norm": 3.939232559831004,
"learning_rate": 9.332261835184102e-05,
"loss": 0.1463,
"step": 9960
},
{
"epoch": 2.81,
"grad_norm": 1.93177447901502,
"learning_rate": 9.310344827586206e-05,
"loss": 0.3218,
"step": 9970
},
{
"epoch": 2.81,
"grad_norm": 11.674268724271638,
"learning_rate": 9.28842781998831e-05,
"loss": 0.121,
"step": 9980
},
{
"epoch": 2.82,
"grad_norm": 29.77892155165882,
"learning_rate": 9.266510812390414e-05,
"loss": 0.2259,
"step": 9990
},
{
"epoch": 2.82,
"grad_norm": 3.1793474536477495,
"learning_rate": 9.244593804792518e-05,
"loss": 0.1147,
"step": 10000
},
{
"epoch": 2.82,
"grad_norm": 29.467125692383135,
"learning_rate": 9.222676797194622e-05,
"loss": 0.1296,
"step": 10010
},
{
"epoch": 2.82,
"grad_norm": 12.829429531674693,
"learning_rate": 9.202951490356516e-05,
"loss": 0.343,
"step": 10020
},
{
"epoch": 2.83,
"grad_norm": 4.842170823899554,
"learning_rate": 9.18103448275862e-05,
"loss": 0.1342,
"step": 10030
},
{
"epoch": 2.83,
"grad_norm": 2.8036087435052117,
"learning_rate": 9.159117475160724e-05,
"loss": 0.1981,
"step": 10040
},
{
"epoch": 2.83,
"grad_norm": 1.2567121612024053,
"learning_rate": 9.137200467562828e-05,
"loss": 0.1156,
"step": 10050
},
{
"epoch": 2.84,
"grad_norm": 39.13892743280264,
"learning_rate": 9.115283459964932e-05,
"loss": 0.217,
"step": 10060
},
{
"epoch": 2.84,
"grad_norm": 4.018926730147514,
"learning_rate": 9.093366452367036e-05,
"loss": 0.1098,
"step": 10070
},
{
"epoch": 2.84,
"grad_norm": 20.370189526716274,
"learning_rate": 9.07144944476914e-05,
"loss": 0.1131,
"step": 10080
},
{
"epoch": 2.84,
"grad_norm": 12.513349385435367,
"learning_rate": 9.049532437171244e-05,
"loss": 0.0835,
"step": 10090
},
{
"epoch": 2.85,
"grad_norm": 3.0078713954695693,
"learning_rate": 9.027615429573349e-05,
"loss": 0.0935,
"step": 10100
},
{
"epoch": 2.85,
"grad_norm": 0.35428391785347213,
"learning_rate": 9.005698421975452e-05,
"loss": 0.1368,
"step": 10110
},
{
"epoch": 2.85,
"grad_norm": 2.8765836604387487,
"learning_rate": 8.983781414377556e-05,
"loss": 0.0582,
"step": 10120
},
{
"epoch": 2.86,
"grad_norm": 0.27300803686564074,
"learning_rate": 8.96186440677966e-05,
"loss": 0.1443,
"step": 10130
},
{
"epoch": 2.86,
"grad_norm": 1.0632557332902792,
"learning_rate": 8.939947399181764e-05,
"loss": 0.2713,
"step": 10140
},
{
"epoch": 2.86,
"grad_norm": 1.3919283720717666,
"learning_rate": 8.918030391583869e-05,
"loss": 0.1426,
"step": 10150
},
{
"epoch": 2.86,
"grad_norm": 0.2975889790690355,
"learning_rate": 8.896113383985972e-05,
"loss": 0.1255,
"step": 10160
},
{
"epoch": 2.87,
"grad_norm": 0.6331908778097588,
"learning_rate": 8.874196376388076e-05,
"loss": 0.1088,
"step": 10170
},
{
"epoch": 2.87,
"grad_norm": 0.49968429973464584,
"learning_rate": 8.85227936879018e-05,
"loss": 0.0462,
"step": 10180
},
{
"epoch": 2.87,
"grad_norm": 0.3998283883981307,
"learning_rate": 8.830362361192284e-05,
"loss": 0.1456,
"step": 10190
},
{
"epoch": 2.88,
"grad_norm": 0.6762860335305919,
"learning_rate": 8.808445353594387e-05,
"loss": 0.1,
"step": 10200
},
{
"epoch": 2.88,
"grad_norm": 1.5918195856520565,
"learning_rate": 8.786528345996492e-05,
"loss": 0.0775,
"step": 10210
},
{
"epoch": 2.88,
"grad_norm": 0.47908599501430993,
"learning_rate": 8.764611338398598e-05,
"loss": 0.1299,
"step": 10220
},
{
"epoch": 2.88,
"grad_norm": 0.02533220752832844,
"learning_rate": 8.7426943308007e-05,
"loss": 0.1705,
"step": 10230
},
{
"epoch": 2.89,
"grad_norm": 1.6916417760501654,
"learning_rate": 8.720777323202806e-05,
"loss": 0.3096,
"step": 10240
},
{
"epoch": 2.89,
"grad_norm": 5.6132755915793116,
"learning_rate": 8.698860315604907e-05,
"loss": 0.14,
"step": 10250
},
{
"epoch": 2.89,
"grad_norm": 3.6946000211637866,
"learning_rate": 8.676943308007013e-05,
"loss": 0.1283,
"step": 10260
},
{
"epoch": 2.9,
"grad_norm": 1.5612252460560485,
"learning_rate": 8.655026300409118e-05,
"loss": 0.2084,
"step": 10270
},
{
"epoch": 2.9,
"grad_norm": 17.442735530139714,
"learning_rate": 8.633109292811221e-05,
"loss": 0.0978,
"step": 10280
},
{
"epoch": 2.9,
"grad_norm": 0.7188172450254493,
"learning_rate": 8.611192285213326e-05,
"loss": 0.1475,
"step": 10290
},
{
"epoch": 2.9,
"grad_norm": 0.23472035328250088,
"learning_rate": 8.589275277615429e-05,
"loss": 0.084,
"step": 10300
},
{
"epoch": 2.91,
"grad_norm": 3.847787592250939,
"learning_rate": 8.567358270017533e-05,
"loss": 0.0532,
"step": 10310
},
{
"epoch": 2.91,
"grad_norm": 8.67255157030904,
"learning_rate": 8.545441262419637e-05,
"loss": 0.2527,
"step": 10320
},
{
"epoch": 2.91,
"grad_norm": 0.6329813929319419,
"learning_rate": 8.523524254821741e-05,
"loss": 0.1688,
"step": 10330
},
{
"epoch": 2.92,
"grad_norm": 38.31309176901214,
"learning_rate": 8.501607247223846e-05,
"loss": 0.1513,
"step": 10340
},
{
"epoch": 2.92,
"grad_norm": 0.9873950760760376,
"learning_rate": 8.479690239625949e-05,
"loss": 0.1007,
"step": 10350
},
{
"epoch": 2.92,
"grad_norm": 12.444359816230664,
"learning_rate": 8.457773232028053e-05,
"loss": 0.0936,
"step": 10360
},
{
"epoch": 2.92,
"grad_norm": 2.1983295007186743,
"learning_rate": 8.435856224430157e-05,
"loss": 0.1726,
"step": 10370
},
{
"epoch": 2.93,
"grad_norm": 25.402786909188887,
"learning_rate": 8.413939216832261e-05,
"loss": 0.2034,
"step": 10380
},
{
"epoch": 2.93,
"grad_norm": 10.271503407485543,
"learning_rate": 8.392022209234366e-05,
"loss": 0.0983,
"step": 10390
},
{
"epoch": 2.93,
"grad_norm": 11.41372086097229,
"learning_rate": 8.370105201636469e-05,
"loss": 0.1763,
"step": 10400
},
{
"epoch": 2.93,
"grad_norm": 1.0021380382604943,
"learning_rate": 8.348188194038573e-05,
"loss": 0.1691,
"step": 10410
},
{
"epoch": 2.94,
"grad_norm": 1.4011630497758993,
"learning_rate": 8.326271186440677e-05,
"loss": 0.0795,
"step": 10420
},
{
"epoch": 2.94,
"grad_norm": 0.5995590743117784,
"learning_rate": 8.304354178842781e-05,
"loss": 0.155,
"step": 10430
},
{
"epoch": 2.94,
"grad_norm": 13.891285620986729,
"learning_rate": 8.282437171244884e-05,
"loss": 0.152,
"step": 10440
},
{
"epoch": 2.95,
"grad_norm": 4.322816995983127,
"learning_rate": 8.260520163646989e-05,
"loss": 0.1174,
"step": 10450
},
{
"epoch": 2.95,
"grad_norm": 4.199701886882987,
"learning_rate": 8.238603156049094e-05,
"loss": 0.1112,
"step": 10460
},
{
"epoch": 2.95,
"grad_norm": 1.0003512662431095,
"learning_rate": 8.216686148451197e-05,
"loss": 0.1023,
"step": 10470
},
{
"epoch": 2.95,
"grad_norm": 2.7506464116256915,
"learning_rate": 8.194769140853301e-05,
"loss": 0.125,
"step": 10480
},
{
"epoch": 2.96,
"grad_norm": 0.267916746880681,
"learning_rate": 8.172852133255405e-05,
"loss": 0.1657,
"step": 10490
},
{
"epoch": 2.96,
"grad_norm": 1.1331681687447057,
"learning_rate": 8.150935125657509e-05,
"loss": 0.1416,
"step": 10500
},
{
"epoch": 2.96,
"grad_norm": 1.8446952462251813,
"learning_rate": 8.129018118059614e-05,
"loss": 0.2599,
"step": 10510
},
{
"epoch": 2.97,
"grad_norm": 0.51816310859789,
"learning_rate": 8.107101110461717e-05,
"loss": 0.0686,
"step": 10520
},
{
"epoch": 2.97,
"grad_norm": 1.2233390386002774,
"learning_rate": 8.085184102863821e-05,
"loss": 0.1502,
"step": 10530
},
{
"epoch": 2.97,
"grad_norm": 83.49175206154096,
"learning_rate": 8.063267095265925e-05,
"loss": 0.1806,
"step": 10540
},
{
"epoch": 2.97,
"grad_norm": 4.967238991092641,
"learning_rate": 8.041350087668029e-05,
"loss": 0.1847,
"step": 10550
},
{
"epoch": 2.98,
"grad_norm": 15.102812775710126,
"learning_rate": 8.019433080070135e-05,
"loss": 0.1602,
"step": 10560
},
{
"epoch": 2.98,
"grad_norm": 26.52721557816532,
"learning_rate": 7.997516072472237e-05,
"loss": 0.1863,
"step": 10570
},
{
"epoch": 2.98,
"grad_norm": 1.232502179558844,
"learning_rate": 7.975599064874343e-05,
"loss": 0.1091,
"step": 10580
},
{
"epoch": 2.99,
"grad_norm": 0.802916184416696,
"learning_rate": 7.953682057276446e-05,
"loss": 0.1202,
"step": 10590
},
{
"epoch": 2.99,
"grad_norm": 18.591440679290102,
"learning_rate": 7.93176504967855e-05,
"loss": 0.1693,
"step": 10600
},
{
"epoch": 2.99,
"grad_norm": 1.0156253505497566,
"learning_rate": 7.909848042080654e-05,
"loss": 0.1564,
"step": 10610
},
{
"epoch": 2.99,
"grad_norm": 10.068403109613666,
"learning_rate": 7.887931034482758e-05,
"loss": 0.2361,
"step": 10620
},
{
"epoch": 3.0,
"grad_norm": 1.6740912563898531,
"learning_rate": 7.866014026884863e-05,
"loss": 0.0799,
"step": 10630
},
{
"epoch": 3.0,
"grad_norm": 2.461614313039469,
"learning_rate": 7.844097019286966e-05,
"loss": 0.0537,
"step": 10640
},
{
"epoch": 3.0,
"eval_0_f1": 0.7738325801592424,
"eval_0_precision": 0.7824194952132288,
"eval_0_recall": 0.7654320987654321,
"eval_1_f1": 0.9226921662375874,
"eval_1_precision": 0.9192437344276712,
"eval_1_recall": 0.926166568222091,
"eval_accuracy": 0.8847714066440083,
"eval_loss": 0.39013671875,
"eval_runtime": 544.8422,
"eval_samples_per_second": 16.741,
"eval_steps_per_second": 2.792,
"step": 10641
}
],
"logging_steps": 10,
"max_steps": 14188,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"total_flos": 2.0001268071346995e+17,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}