DietAgent / trainer_state.json
yixin1121's picture
Upload folder using huggingface_hub
131319a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1552,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032232070910556,
"grad_norm": 70.69935607910156,
"learning_rate": 1.6077170418006432e-07,
"loss": 0.9483,
"step": 5
},
{
"epoch": 0.0064464141821112,
"grad_norm": 41.78139877319336,
"learning_rate": 3.2154340836012864e-07,
"loss": 0.8601,
"step": 10
},
{
"epoch": 0.009669621273166801,
"grad_norm": 39.28358840942383,
"learning_rate": 4.823151125401929e-07,
"loss": 0.7662,
"step": 15
},
{
"epoch": 0.0128928283642224,
"grad_norm": 26.412134170532227,
"learning_rate": 6.430868167202573e-07,
"loss": 0.5686,
"step": 20
},
{
"epoch": 0.016116035455278,
"grad_norm": 12.356087684631348,
"learning_rate": 8.038585209003216e-07,
"loss": 0.4011,
"step": 25
},
{
"epoch": 0.019339242546333603,
"grad_norm": 7.162217617034912,
"learning_rate": 9.646302250803859e-07,
"loss": 0.3031,
"step": 30
},
{
"epoch": 0.022562449637389202,
"grad_norm": 5.508477687835693,
"learning_rate": 1.1254019292604503e-06,
"loss": 0.245,
"step": 35
},
{
"epoch": 0.0257856567284448,
"grad_norm": 6.486464500427246,
"learning_rate": 1.2861736334405146e-06,
"loss": 0.2266,
"step": 40
},
{
"epoch": 0.029008863819500404,
"grad_norm": 5.0619378089904785,
"learning_rate": 1.4469453376205788e-06,
"loss": 0.1895,
"step": 45
},
{
"epoch": 0.032232070910556,
"grad_norm": 3.8913064002990723,
"learning_rate": 1.6077170418006432e-06,
"loss": 0.1845,
"step": 50
},
{
"epoch": 0.035455278001611606,
"grad_norm": 3.2244060039520264,
"learning_rate": 1.7684887459807077e-06,
"loss": 0.1619,
"step": 55
},
{
"epoch": 0.038678485092667206,
"grad_norm": 4.426245212554932,
"learning_rate": 1.9292604501607717e-06,
"loss": 0.1638,
"step": 60
},
{
"epoch": 0.041901692183722805,
"grad_norm": 4.924641132354736,
"learning_rate": 2.090032154340836e-06,
"loss": 0.1535,
"step": 65
},
{
"epoch": 0.045124899274778404,
"grad_norm": 4.1020894050598145,
"learning_rate": 2.2508038585209006e-06,
"loss": 0.1514,
"step": 70
},
{
"epoch": 0.048348106365834004,
"grad_norm": 3.604417562484741,
"learning_rate": 2.411575562700965e-06,
"loss": 0.136,
"step": 75
},
{
"epoch": 0.0515713134568896,
"grad_norm": 3.021495819091797,
"learning_rate": 2.572347266881029e-06,
"loss": 0.1329,
"step": 80
},
{
"epoch": 0.0547945205479452,
"grad_norm": 3.2783167362213135,
"learning_rate": 2.7331189710610936e-06,
"loss": 0.142,
"step": 85
},
{
"epoch": 0.05801772763900081,
"grad_norm": 2.818120241165161,
"learning_rate": 2.8938906752411576e-06,
"loss": 0.1305,
"step": 90
},
{
"epoch": 0.06124093473005641,
"grad_norm": 3.0875892639160156,
"learning_rate": 3.054662379421222e-06,
"loss": 0.123,
"step": 95
},
{
"epoch": 0.064464141821112,
"grad_norm": 2.726335048675537,
"learning_rate": 3.2154340836012865e-06,
"loss": 0.139,
"step": 100
},
{
"epoch": 0.06768734891216761,
"grad_norm": 2.3112688064575195,
"learning_rate": 3.376205787781351e-06,
"loss": 0.1174,
"step": 105
},
{
"epoch": 0.07091055600322321,
"grad_norm": 2.9780499935150146,
"learning_rate": 3.5369774919614154e-06,
"loss": 0.1342,
"step": 110
},
{
"epoch": 0.07413376309427881,
"grad_norm": 3.2706351280212402,
"learning_rate": 3.69774919614148e-06,
"loss": 0.1225,
"step": 115
},
{
"epoch": 0.07735697018533441,
"grad_norm": 2.3604736328125,
"learning_rate": 3.8585209003215434e-06,
"loss": 0.1298,
"step": 120
},
{
"epoch": 0.08058017727639001,
"grad_norm": 2.7637319564819336,
"learning_rate": 4.0192926045016075e-06,
"loss": 0.1237,
"step": 125
},
{
"epoch": 0.08380338436744561,
"grad_norm": 2.42730975151062,
"learning_rate": 4.180064308681672e-06,
"loss": 0.1321,
"step": 130
},
{
"epoch": 0.08702659145850121,
"grad_norm": 2.5871758460998535,
"learning_rate": 4.340836012861736e-06,
"loss": 0.13,
"step": 135
},
{
"epoch": 0.09024979854955681,
"grad_norm": 2.662217855453491,
"learning_rate": 4.501607717041801e-06,
"loss": 0.1308,
"step": 140
},
{
"epoch": 0.09347300564061241,
"grad_norm": 2.97275447845459,
"learning_rate": 4.662379421221865e-06,
"loss": 0.1152,
"step": 145
},
{
"epoch": 0.09669621273166801,
"grad_norm": 2.990713119506836,
"learning_rate": 4.82315112540193e-06,
"loss": 0.1241,
"step": 150
},
{
"epoch": 0.0999194198227236,
"grad_norm": 3.011382818222046,
"learning_rate": 4.983922829581994e-06,
"loss": 0.1318,
"step": 155
},
{
"epoch": 0.1031426269137792,
"grad_norm": 2.340388774871826,
"learning_rate": 5.144694533762058e-06,
"loss": 0.1217,
"step": 160
},
{
"epoch": 0.1063658340048348,
"grad_norm": 2.1309595108032227,
"learning_rate": 5.305466237942123e-06,
"loss": 0.1139,
"step": 165
},
{
"epoch": 0.1095890410958904,
"grad_norm": 2.023085355758667,
"learning_rate": 5.466237942122187e-06,
"loss": 0.117,
"step": 170
},
{
"epoch": 0.11281224818694602,
"grad_norm": 2.348984718322754,
"learning_rate": 5.627009646302252e-06,
"loss": 0.1141,
"step": 175
},
{
"epoch": 0.11603545527800162,
"grad_norm": 2.375622272491455,
"learning_rate": 5.787781350482315e-06,
"loss": 0.128,
"step": 180
},
{
"epoch": 0.11925866236905722,
"grad_norm": 2.196361780166626,
"learning_rate": 5.94855305466238e-06,
"loss": 0.1196,
"step": 185
},
{
"epoch": 0.12248186946011282,
"grad_norm": 2.2878026962280273,
"learning_rate": 6.109324758842444e-06,
"loss": 0.1244,
"step": 190
},
{
"epoch": 0.12570507655116842,
"grad_norm": 2.213945150375366,
"learning_rate": 6.270096463022508e-06,
"loss": 0.1247,
"step": 195
},
{
"epoch": 0.128928283642224,
"grad_norm": 2.1751949787139893,
"learning_rate": 6.430868167202573e-06,
"loss": 0.1218,
"step": 200
},
{
"epoch": 0.13215149073327961,
"grad_norm": 3.6750779151916504,
"learning_rate": 6.591639871382637e-06,
"loss": 0.1245,
"step": 205
},
{
"epoch": 0.13537469782433523,
"grad_norm": 2.0573132038116455,
"learning_rate": 6.752411575562702e-06,
"loss": 0.1222,
"step": 210
},
{
"epoch": 0.1385979049153908,
"grad_norm": 2.161153554916382,
"learning_rate": 6.913183279742766e-06,
"loss": 0.1144,
"step": 215
},
{
"epoch": 0.14182111200644643,
"grad_norm": 2.023874282836914,
"learning_rate": 7.073954983922831e-06,
"loss": 0.1196,
"step": 220
},
{
"epoch": 0.145044319097502,
"grad_norm": 2.5854148864746094,
"learning_rate": 7.234726688102894e-06,
"loss": 0.1226,
"step": 225
},
{
"epoch": 0.14826752618855762,
"grad_norm": 2.1432173252105713,
"learning_rate": 7.39549839228296e-06,
"loss": 0.1141,
"step": 230
},
{
"epoch": 0.1514907332796132,
"grad_norm": 1.9795219898223877,
"learning_rate": 7.556270096463023e-06,
"loss": 0.1214,
"step": 235
},
{
"epoch": 0.15471394037066882,
"grad_norm": 2.685298204421997,
"learning_rate": 7.717041800643087e-06,
"loss": 0.1166,
"step": 240
},
{
"epoch": 0.1579371474617244,
"grad_norm": 2.327439069747925,
"learning_rate": 7.877813504823153e-06,
"loss": 0.1147,
"step": 245
},
{
"epoch": 0.16116035455278002,
"grad_norm": 2.873242139816284,
"learning_rate": 8.038585209003215e-06,
"loss": 0.1214,
"step": 250
},
{
"epoch": 0.1643835616438356,
"grad_norm": 1.947021484375,
"learning_rate": 8.19935691318328e-06,
"loss": 0.1181,
"step": 255
},
{
"epoch": 0.16760676873489122,
"grad_norm": 2.10578989982605,
"learning_rate": 8.360128617363345e-06,
"loss": 0.1166,
"step": 260
},
{
"epoch": 0.1708299758259468,
"grad_norm": 3.1242830753326416,
"learning_rate": 8.520900321543409e-06,
"loss": 0.1223,
"step": 265
},
{
"epoch": 0.17405318291700242,
"grad_norm": 2.3070671558380127,
"learning_rate": 8.681672025723473e-06,
"loss": 0.1199,
"step": 270
},
{
"epoch": 0.177276390008058,
"grad_norm": 4.113283157348633,
"learning_rate": 8.842443729903538e-06,
"loss": 0.1359,
"step": 275
},
{
"epoch": 0.18049959709911362,
"grad_norm": 2.406996488571167,
"learning_rate": 9.003215434083602e-06,
"loss": 0.1247,
"step": 280
},
{
"epoch": 0.18372280419016923,
"grad_norm": 1.8795942068099976,
"learning_rate": 9.163987138263667e-06,
"loss": 0.1184,
"step": 285
},
{
"epoch": 0.18694601128122482,
"grad_norm": 1.873802900314331,
"learning_rate": 9.32475884244373e-06,
"loss": 0.1147,
"step": 290
},
{
"epoch": 0.19016921837228043,
"grad_norm": 2.1450552940368652,
"learning_rate": 9.485530546623795e-06,
"loss": 0.1297,
"step": 295
},
{
"epoch": 0.19339242546333602,
"grad_norm": 2.236771583557129,
"learning_rate": 9.64630225080386e-06,
"loss": 0.1144,
"step": 300
},
{
"epoch": 0.19661563255439163,
"grad_norm": 2.0439035892486572,
"learning_rate": 9.807073954983923e-06,
"loss": 0.1153,
"step": 305
},
{
"epoch": 0.1998388396454472,
"grad_norm": 2.3851237297058105,
"learning_rate": 9.967845659163988e-06,
"loss": 0.1185,
"step": 310
},
{
"epoch": 0.20306204673650283,
"grad_norm": 1.9924075603485107,
"learning_rate": 9.99994931968214e-06,
"loss": 0.1094,
"step": 315
},
{
"epoch": 0.2062852538275584,
"grad_norm": 2.0796937942504883,
"learning_rate": 9.999743432651652e-06,
"loss": 0.1211,
"step": 320
},
{
"epoch": 0.20950846091861403,
"grad_norm": 2.6404528617858887,
"learning_rate": 9.999379177905158e-06,
"loss": 0.1225,
"step": 325
},
{
"epoch": 0.2127316680096696,
"grad_norm": 1.8474249839782715,
"learning_rate": 9.998856566980493e-06,
"loss": 0.1195,
"step": 330
},
{
"epoch": 0.21595487510072522,
"grad_norm": 2.2194368839263916,
"learning_rate": 9.998175616431443e-06,
"loss": 0.1234,
"step": 335
},
{
"epoch": 0.2191780821917808,
"grad_norm": 2.3846206665039062,
"learning_rate": 9.99733634782723e-06,
"loss": 0.1183,
"step": 340
},
{
"epoch": 0.22240128928283642,
"grad_norm": 2.1238505840301514,
"learning_rate": 9.996338787751834e-06,
"loss": 0.1133,
"step": 345
},
{
"epoch": 0.22562449637389204,
"grad_norm": 1.9952363967895508,
"learning_rate": 9.995182967803131e-06,
"loss": 0.1171,
"step": 350
},
{
"epoch": 0.22884770346494762,
"grad_norm": 1.8000890016555786,
"learning_rate": 9.99386892459192e-06,
"loss": 0.1271,
"step": 355
},
{
"epoch": 0.23207091055600323,
"grad_norm": 1.680237889289856,
"learning_rate": 9.992396699740738e-06,
"loss": 0.1233,
"step": 360
},
{
"epoch": 0.23529411764705882,
"grad_norm": 1.9006551504135132,
"learning_rate": 9.990766339882554e-06,
"loss": 0.117,
"step": 365
},
{
"epoch": 0.23851732473811443,
"grad_norm": 2.417584180831909,
"learning_rate": 9.988977896659294e-06,
"loss": 0.1202,
"step": 370
},
{
"epoch": 0.24174053182917002,
"grad_norm": 1.7894331216812134,
"learning_rate": 9.987031426720195e-06,
"loss": 0.1131,
"step": 375
},
{
"epoch": 0.24496373892022563,
"grad_norm": 2.3755767345428467,
"learning_rate": 9.984926991720025e-06,
"loss": 0.1199,
"step": 380
},
{
"epoch": 0.24818694601128122,
"grad_norm": 1.6282379627227783,
"learning_rate": 9.982664658317115e-06,
"loss": 0.1139,
"step": 385
},
{
"epoch": 0.25141015310233683,
"grad_norm": 2.3639254570007324,
"learning_rate": 9.980244498171256e-06,
"loss": 0.1064,
"step": 390
},
{
"epoch": 0.25463336019339244,
"grad_norm": 1.6792004108428955,
"learning_rate": 9.97766658794143e-06,
"loss": 0.111,
"step": 395
},
{
"epoch": 0.257856567284448,
"grad_norm": 2.633594036102295,
"learning_rate": 9.974931009283378e-06,
"loss": 0.1083,
"step": 400
},
{
"epoch": 0.2610797743755036,
"grad_norm": 1.7790218591690063,
"learning_rate": 9.972037848847014e-06,
"loss": 0.1197,
"step": 405
},
{
"epoch": 0.26430298146655923,
"grad_norm": 2.443464756011963,
"learning_rate": 9.968987198273682e-06,
"loss": 0.1153,
"step": 410
},
{
"epoch": 0.26752618855761484,
"grad_norm": 1.6622118949890137,
"learning_rate": 9.965779154193256e-06,
"loss": 0.1214,
"step": 415
},
{
"epoch": 0.27074939564867045,
"grad_norm": 1.398253321647644,
"learning_rate": 9.962413818221071e-06,
"loss": 0.1053,
"step": 420
},
{
"epoch": 0.273972602739726,
"grad_norm": 2.2201757431030273,
"learning_rate": 9.95889129695471e-06,
"loss": 0.1206,
"step": 425
},
{
"epoch": 0.2771958098307816,
"grad_norm": 2.262932062149048,
"learning_rate": 9.955211701970631e-06,
"loss": 0.1152,
"step": 430
},
{
"epoch": 0.28041901692183724,
"grad_norm": 1.72652268409729,
"learning_rate": 9.951375149820624e-06,
"loss": 0.114,
"step": 435
},
{
"epoch": 0.28364222401289285,
"grad_norm": 2.2013440132141113,
"learning_rate": 9.947381762028124e-06,
"loss": 0.1172,
"step": 440
},
{
"epoch": 0.2868654311039484,
"grad_norm": 1.7692981958389282,
"learning_rate": 9.943231665084363e-06,
"loss": 0.108,
"step": 445
},
{
"epoch": 0.290088638195004,
"grad_norm": 2.222728729248047,
"learning_rate": 9.938924990444363e-06,
"loss": 0.1074,
"step": 450
},
{
"epoch": 0.29331184528605964,
"grad_norm": 1.67208993434906,
"learning_rate": 9.934461874522767e-06,
"loss": 0.1176,
"step": 455
},
{
"epoch": 0.29653505237711525,
"grad_norm": 1.7946748733520508,
"learning_rate": 9.929842458689524e-06,
"loss": 0.111,
"step": 460
},
{
"epoch": 0.2997582594681708,
"grad_norm": 1.9234955310821533,
"learning_rate": 9.925066889265412e-06,
"loss": 0.1182,
"step": 465
},
{
"epoch": 0.3029814665592264,
"grad_norm": 2.4538662433624268,
"learning_rate": 9.920135317517393e-06,
"loss": 0.1227,
"step": 470
},
{
"epoch": 0.30620467365028203,
"grad_norm": 1.8412810564041138,
"learning_rate": 9.915047899653838e-06,
"loss": 0.1128,
"step": 475
},
{
"epoch": 0.30942788074133765,
"grad_norm": 1.9068711996078491,
"learning_rate": 9.909804796819562e-06,
"loss": 0.1142,
"step": 480
},
{
"epoch": 0.3126510878323932,
"grad_norm": 1.9735403060913086,
"learning_rate": 9.904406175090732e-06,
"loss": 0.1066,
"step": 485
},
{
"epoch": 0.3158742949234488,
"grad_norm": 2.349578619003296,
"learning_rate": 9.898852205469603e-06,
"loss": 0.1187,
"step": 490
},
{
"epoch": 0.31909750201450443,
"grad_norm": 1.504022479057312,
"learning_rate": 9.893143063879098e-06,
"loss": 0.1051,
"step": 495
},
{
"epoch": 0.32232070910556004,
"grad_norm": 1.5406743288040161,
"learning_rate": 9.887278931157237e-06,
"loss": 0.1123,
"step": 500
},
{
"epoch": 0.32554391619661566,
"grad_norm": 1.8361977338790894,
"learning_rate": 9.881259993051415e-06,
"loss": 0.1225,
"step": 505
},
{
"epoch": 0.3287671232876712,
"grad_norm": 1.7701557874679565,
"learning_rate": 9.875086440212511e-06,
"loss": 0.1027,
"step": 510
},
{
"epoch": 0.33199033037872683,
"grad_norm": 1.6567251682281494,
"learning_rate": 9.86875846818885e-06,
"loss": 0.1206,
"step": 515
},
{
"epoch": 0.33521353746978244,
"grad_norm": 1.641015887260437,
"learning_rate": 9.862276277420016e-06,
"loss": 0.1183,
"step": 520
},
{
"epoch": 0.33843674456083805,
"grad_norm": 1.2547988891601562,
"learning_rate": 9.85564007323049e-06,
"loss": 0.1098,
"step": 525
},
{
"epoch": 0.3416599516518936,
"grad_norm": 1.8411493301391602,
"learning_rate": 9.848850065823159e-06,
"loss": 0.1052,
"step": 530
},
{
"epoch": 0.3448831587429492,
"grad_norm": 1.6275116205215454,
"learning_rate": 9.841906470272655e-06,
"loss": 0.1224,
"step": 535
},
{
"epoch": 0.34810636583400484,
"grad_norm": 2.855224847793579,
"learning_rate": 9.834809506518537e-06,
"loss": 0.1082,
"step": 540
},
{
"epoch": 0.35132957292506045,
"grad_norm": 2.3182358741760254,
"learning_rate": 9.827559399358327e-06,
"loss": 0.1224,
"step": 545
},
{
"epoch": 0.354552780016116,
"grad_norm": 2.1670444011688232,
"learning_rate": 9.82015637844039e-06,
"loss": 0.1101,
"step": 550
},
{
"epoch": 0.3577759871071716,
"grad_norm": 1.629321813583374,
"learning_rate": 9.812600678256664e-06,
"loss": 0.1054,
"step": 555
},
{
"epoch": 0.36099919419822724,
"grad_norm": 1.6911214590072632,
"learning_rate": 9.804892538135225e-06,
"loss": 0.1028,
"step": 560
},
{
"epoch": 0.36422240128928285,
"grad_norm": 2.162142038345337,
"learning_rate": 9.797032202232708e-06,
"loss": 0.1052,
"step": 565
},
{
"epoch": 0.36744560838033846,
"grad_norm": 1.6863000392913818,
"learning_rate": 9.789019919526583e-06,
"loss": 0.1078,
"step": 570
},
{
"epoch": 0.370668815471394,
"grad_norm": 2.0217230319976807,
"learning_rate": 9.780855943807253e-06,
"loss": 0.1152,
"step": 575
},
{
"epoch": 0.37389202256244963,
"grad_norm": 1.3627716302871704,
"learning_rate": 9.772540533670023e-06,
"loss": 0.1055,
"step": 580
},
{
"epoch": 0.37711522965350525,
"grad_norm": 1.4767628908157349,
"learning_rate": 9.764073952506913e-06,
"loss": 0.1126,
"step": 585
},
{
"epoch": 0.38033843674456086,
"grad_norm": 1.769196629524231,
"learning_rate": 9.755456468498307e-06,
"loss": 0.1062,
"step": 590
},
{
"epoch": 0.3835616438356164,
"grad_norm": 1.9426604509353638,
"learning_rate": 9.746688354604467e-06,
"loss": 0.1128,
"step": 595
},
{
"epoch": 0.38678485092667203,
"grad_norm": 1.6949142217636108,
"learning_rate": 9.737769888556874e-06,
"loss": 0.1058,
"step": 600
},
{
"epoch": 0.39000805801772764,
"grad_norm": 1.6036336421966553,
"learning_rate": 9.728701352849445e-06,
"loss": 0.1214,
"step": 605
},
{
"epoch": 0.39323126510878326,
"grad_norm": 1.3234189748764038,
"learning_rate": 9.71948303472958e-06,
"loss": 0.1095,
"step": 610
},
{
"epoch": 0.3964544721998388,
"grad_norm": 1.5805995464324951,
"learning_rate": 9.710115226189054e-06,
"loss": 0.1179,
"step": 615
},
{
"epoch": 0.3996776792908944,
"grad_norm": 1.5236024856567383,
"learning_rate": 9.700598223954787e-06,
"loss": 0.1065,
"step": 620
},
{
"epoch": 0.40290088638195004,
"grad_norm": 2.2143023014068604,
"learning_rate": 9.690932329479425e-06,
"loss": 0.1118,
"step": 625
},
{
"epoch": 0.40612409347300565,
"grad_norm": 2.0677402019500732,
"learning_rate": 9.681117848931806e-06,
"loss": 0.1015,
"step": 630
},
{
"epoch": 0.40934730056406127,
"grad_norm": 1.786145567893982,
"learning_rate": 9.671155093187256e-06,
"loss": 0.1072,
"step": 635
},
{
"epoch": 0.4125705076551168,
"grad_norm": 1.661035418510437,
"learning_rate": 9.661044377817745e-06,
"loss": 0.1165,
"step": 640
},
{
"epoch": 0.41579371474617244,
"grad_norm": 1.7452033758163452,
"learning_rate": 9.650786023081882e-06,
"loss": 0.1107,
"step": 645
},
{
"epoch": 0.41901692183722805,
"grad_norm": 1.5127534866333008,
"learning_rate": 9.640380353914784e-06,
"loss": 0.1205,
"step": 650
},
{
"epoch": 0.42224012892828366,
"grad_norm": 1.7860335111618042,
"learning_rate": 9.629827699917777e-06,
"loss": 0.1099,
"step": 655
},
{
"epoch": 0.4254633360193392,
"grad_norm": 1.8366566896438599,
"learning_rate": 9.619128395347957e-06,
"loss": 0.0995,
"step": 660
},
{
"epoch": 0.42868654311039484,
"grad_norm": 1.7406480312347412,
"learning_rate": 9.608282779107596e-06,
"loss": 0.1093,
"step": 665
},
{
"epoch": 0.43190975020145045,
"grad_norm": 1.5550240278244019,
"learning_rate": 9.597291194733417e-06,
"loss": 0.1081,
"step": 670
},
{
"epoch": 0.43513295729250606,
"grad_norm": 1.8106791973114014,
"learning_rate": 9.58615399038571e-06,
"loss": 0.1092,
"step": 675
},
{
"epoch": 0.4383561643835616,
"grad_norm": 2.0512306690216064,
"learning_rate": 9.574871518837299e-06,
"loss": 0.11,
"step": 680
},
{
"epoch": 0.44157937147461723,
"grad_norm": 1.536855697631836,
"learning_rate": 9.563444137462373e-06,
"loss": 0.1092,
"step": 685
},
{
"epoch": 0.44480257856567285,
"grad_norm": 1.4719635248184204,
"learning_rate": 9.55187220822516e-06,
"loss": 0.1081,
"step": 690
},
{
"epoch": 0.44802578565672846,
"grad_norm": 1.7767363786697388,
"learning_rate": 9.54015609766847e-06,
"loss": 0.1099,
"step": 695
},
{
"epoch": 0.4512489927477841,
"grad_norm": 1.453895092010498,
"learning_rate": 9.528296176902085e-06,
"loss": 0.1083,
"step": 700
},
{
"epoch": 0.45447219983883963,
"grad_norm": 1.516648292541504,
"learning_rate": 9.51629282159099e-06,
"loss": 0.1174,
"step": 705
},
{
"epoch": 0.45769540692989524,
"grad_norm": 1.5289475917816162,
"learning_rate": 9.504146411943488e-06,
"loss": 0.1119,
"step": 710
},
{
"epoch": 0.46091861402095086,
"grad_norm": 1.7268835306167603,
"learning_rate": 9.491857332699153e-06,
"loss": 0.1067,
"step": 715
},
{
"epoch": 0.46414182111200647,
"grad_norm": 1.424131989479065,
"learning_rate": 9.47942597311664e-06,
"loss": 0.1096,
"step": 720
},
{
"epoch": 0.467365028203062,
"grad_norm": 2.6142001152038574,
"learning_rate": 9.466852726961363e-06,
"loss": 0.1132,
"step": 725
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.7743583917617798,
"learning_rate": 9.454137992493008e-06,
"loss": 0.1095,
"step": 730
},
{
"epoch": 0.47381144238517325,
"grad_norm": 1.3648674488067627,
"learning_rate": 9.441282172452935e-06,
"loss": 0.1016,
"step": 735
},
{
"epoch": 0.47703464947622887,
"grad_norm": 1.202217698097229,
"learning_rate": 9.428285674051413e-06,
"loss": 0.1014,
"step": 740
},
{
"epoch": 0.4802578565672844,
"grad_norm": 1.2294992208480835,
"learning_rate": 9.415148908954717e-06,
"loss": 0.0958,
"step": 745
},
{
"epoch": 0.48348106365834004,
"grad_norm": 1.3715941905975342,
"learning_rate": 9.401872293272096e-06,
"loss": 0.1032,
"step": 750
},
{
"epoch": 0.48670427074939565,
"grad_norm": 1.2639530897140503,
"learning_rate": 9.38845624754259e-06,
"loss": 0.1047,
"step": 755
},
{
"epoch": 0.48992747784045126,
"grad_norm": 1.389994502067566,
"learning_rate": 9.37490119672171e-06,
"loss": 0.1072,
"step": 760
},
{
"epoch": 0.4931506849315068,
"grad_norm": 1.6051830053329468,
"learning_rate": 9.361207570167974e-06,
"loss": 0.1021,
"step": 765
},
{
"epoch": 0.49637389202256244,
"grad_norm": 2.006974458694458,
"learning_rate": 9.347375801629313e-06,
"loss": 0.1038,
"step": 770
},
{
"epoch": 0.49959709911361805,
"grad_norm": 1.40548837184906,
"learning_rate": 9.333406329229326e-06,
"loss": 0.1064,
"step": 775
},
{
"epoch": 0.5028203062046737,
"grad_norm": 1.4568746089935303,
"learning_rate": 9.319299595453404e-06,
"loss": 0.1109,
"step": 780
},
{
"epoch": 0.5060435132957293,
"grad_norm": 1.7389963865280151,
"learning_rate": 9.305056047134722e-06,
"loss": 0.1082,
"step": 785
},
{
"epoch": 0.5092667203867849,
"grad_norm": 1.2637214660644531,
"learning_rate": 9.29067613544007e-06,
"loss": 0.1019,
"step": 790
},
{
"epoch": 0.5124899274778405,
"grad_norm": 1.7853842973709106,
"learning_rate": 9.276160315855576e-06,
"loss": 0.101,
"step": 795
},
{
"epoch": 0.515713134568896,
"grad_norm": 1.365021824836731,
"learning_rate": 9.261509048172272e-06,
"loss": 0.0903,
"step": 800
},
{
"epoch": 0.5189363416599516,
"grad_norm": 1.0780879259109497,
"learning_rate": 9.246722796471534e-06,
"loss": 0.1003,
"step": 805
},
{
"epoch": 0.5221595487510072,
"grad_norm": 1.3499747514724731,
"learning_rate": 9.231802029110373e-06,
"loss": 0.108,
"step": 810
},
{
"epoch": 0.5253827558420628,
"grad_norm": 1.2328459024429321,
"learning_rate": 9.216747218706612e-06,
"loss": 0.1086,
"step": 815
},
{
"epoch": 0.5286059629331185,
"grad_norm": 1.665556788444519,
"learning_rate": 9.20155884212391e-06,
"loss": 0.0989,
"step": 820
},
{
"epoch": 0.5318291700241741,
"grad_norm": 1.262510061264038,
"learning_rate": 9.186237380456652e-06,
"loss": 0.1087,
"step": 825
},
{
"epoch": 0.5350523771152297,
"grad_norm": 1.2561684846878052,
"learning_rate": 9.170783319014723e-06,
"loss": 0.1011,
"step": 830
},
{
"epoch": 0.5382755842062853,
"grad_norm": 1.4100691080093384,
"learning_rate": 9.155197147308118e-06,
"loss": 0.1025,
"step": 835
},
{
"epoch": 0.5414987912973409,
"grad_norm": 1.4246737957000732,
"learning_rate": 9.13947935903146e-06,
"loss": 0.1043,
"step": 840
},
{
"epoch": 0.5447219983883964,
"grad_norm": 1.5978039503097534,
"learning_rate": 9.12363045204834e-06,
"loss": 0.1137,
"step": 845
},
{
"epoch": 0.547945205479452,
"grad_norm": 1.3265248537063599,
"learning_rate": 9.107650928375555e-06,
"loss": 0.1066,
"step": 850
},
{
"epoch": 0.5511684125705076,
"grad_norm": 1.3155473470687866,
"learning_rate": 9.091541294167214e-06,
"loss": 0.0958,
"step": 855
},
{
"epoch": 0.5543916196615633,
"grad_norm": 1.4708001613616943,
"learning_rate": 9.075302059698696e-06,
"loss": 0.1063,
"step": 860
},
{
"epoch": 0.5576148267526189,
"grad_norm": 1.3062944412231445,
"learning_rate": 9.05893373935049e-06,
"loss": 0.1009,
"step": 865
},
{
"epoch": 0.5608380338436745,
"grad_norm": 1.3801825046539307,
"learning_rate": 9.0424368515919e-06,
"loss": 0.1042,
"step": 870
},
{
"epoch": 0.5640612409347301,
"grad_norm": 1.2434556484222412,
"learning_rate": 9.02581191896463e-06,
"loss": 0.1027,
"step": 875
},
{
"epoch": 0.5672844480257857,
"grad_norm": 1.4129581451416016,
"learning_rate": 9.00905946806622e-06,
"loss": 0.1028,
"step": 880
},
{
"epoch": 0.5705076551168412,
"grad_norm": 1.7679858207702637,
"learning_rate": 8.992180029533378e-06,
"loss": 0.1044,
"step": 885
},
{
"epoch": 0.5737308622078968,
"grad_norm": 1.4008570909500122,
"learning_rate": 8.975174138025165e-06,
"loss": 0.0988,
"step": 890
},
{
"epoch": 0.5769540692989524,
"grad_norm": 1.1371465921401978,
"learning_rate": 8.958042332206059e-06,
"loss": 0.0977,
"step": 895
},
{
"epoch": 0.580177276390008,
"grad_norm": 1.4827523231506348,
"learning_rate": 8.940785154728899e-06,
"loss": 0.097,
"step": 900
},
{
"epoch": 0.5834004834810637,
"grad_norm": 1.5484329462051392,
"learning_rate": 8.92340315221769e-06,
"loss": 0.1049,
"step": 905
},
{
"epoch": 0.5866236905721193,
"grad_norm": 1.267395257949829,
"learning_rate": 8.905896875250291e-06,
"loss": 0.0943,
"step": 910
},
{
"epoch": 0.5898468976631749,
"grad_norm": 2.0672824382781982,
"learning_rate": 8.888266878340979e-06,
"loss": 0.0984,
"step": 915
},
{
"epoch": 0.5930701047542305,
"grad_norm": 1.3846348524093628,
"learning_rate": 8.870513719922873e-06,
"loss": 0.1047,
"step": 920
},
{
"epoch": 0.5962933118452861,
"grad_norm": 1.162631630897522,
"learning_rate": 8.85263796233026e-06,
"loss": 0.1067,
"step": 925
},
{
"epoch": 0.5995165189363416,
"grad_norm": 1.2754584550857544,
"learning_rate": 8.834640171780777e-06,
"loss": 0.0959,
"step": 930
},
{
"epoch": 0.6027397260273972,
"grad_norm": 1.383272409439087,
"learning_rate": 8.816520918357473e-06,
"loss": 0.1063,
"step": 935
},
{
"epoch": 0.6059629331184528,
"grad_norm": 1.6662758588790894,
"learning_rate": 8.798280775990751e-06,
"loss": 0.1024,
"step": 940
},
{
"epoch": 0.6091861402095085,
"grad_norm": 1.9114925861358643,
"learning_rate": 8.7799203224402e-06,
"loss": 0.1054,
"step": 945
},
{
"epoch": 0.6124093473005641,
"grad_norm": 1.4166219234466553,
"learning_rate": 8.761440139276279e-06,
"loss": 0.1077,
"step": 950
},
{
"epoch": 0.6156325543916197,
"grad_norm": 1.9428759813308716,
"learning_rate": 8.742840811861901e-06,
"loss": 0.1044,
"step": 955
},
{
"epoch": 0.6188557614826753,
"grad_norm": 1.5213385820388794,
"learning_rate": 8.724122929333904e-06,
"loss": 0.1128,
"step": 960
},
{
"epoch": 0.6220789685737309,
"grad_norm": 1.2538460493087769,
"learning_rate": 8.705287084584369e-06,
"loss": 0.0963,
"step": 965
},
{
"epoch": 0.6253021756647864,
"grad_norm": 0.9515339136123657,
"learning_rate": 8.68633387424185e-06,
"loss": 0.104,
"step": 970
},
{
"epoch": 0.628525382755842,
"grad_norm": 1.6763734817504883,
"learning_rate": 8.667263898652485e-06,
"loss": 0.0975,
"step": 975
},
{
"epoch": 0.6317485898468976,
"grad_norm": 1.9303100109100342,
"learning_rate": 8.648077761860962e-06,
"loss": 0.0936,
"step": 980
},
{
"epoch": 0.6349717969379532,
"grad_norm": 1.3960262537002563,
"learning_rate": 8.6287760715914e-06,
"loss": 0.1018,
"step": 985
},
{
"epoch": 0.6381950040290089,
"grad_norm": 1.3512085676193237,
"learning_rate": 8.609359439228092e-06,
"loss": 0.1051,
"step": 990
},
{
"epoch": 0.6414182111200645,
"grad_norm": 1.3363274335861206,
"learning_rate": 8.589828479796138e-06,
"loss": 0.1026,
"step": 995
},
{
"epoch": 0.6446414182111201,
"grad_norm": 1.2143882513046265,
"learning_rate": 8.570183811941973e-06,
"loss": 0.0997,
"step": 1000
},
{
"epoch": 0.6478646253021757,
"grad_norm": 1.5231553316116333,
"learning_rate": 8.550426057913758e-06,
"loss": 0.0971,
"step": 1005
},
{
"epoch": 0.6510878323932313,
"grad_norm": 1.5069528818130493,
"learning_rate": 8.53055584354169e-06,
"loss": 0.0968,
"step": 1010
},
{
"epoch": 0.6543110394842868,
"grad_norm": 1.9453926086425781,
"learning_rate": 8.510573798218153e-06,
"loss": 0.1056,
"step": 1015
},
{
"epoch": 0.6575342465753424,
"grad_norm": 1.6074435710906982,
"learning_rate": 8.490480554877804e-06,
"loss": 0.1005,
"step": 1020
},
{
"epoch": 0.660757453666398,
"grad_norm": 1.4784128665924072,
"learning_rate": 8.47027674997751e-06,
"loss": 0.091,
"step": 1025
},
{
"epoch": 0.6639806607574537,
"grad_norm": 1.3281731605529785,
"learning_rate": 8.449963023476198e-06,
"loss": 0.1007,
"step": 1030
},
{
"epoch": 0.6672038678485093,
"grad_norm": 1.3868046998977661,
"learning_rate": 8.429540018814581e-06,
"loss": 0.1023,
"step": 1035
},
{
"epoch": 0.6704270749395649,
"grad_norm": 1.4011777639389038,
"learning_rate": 8.409008382894771e-06,
"loss": 0.0972,
"step": 1040
},
{
"epoch": 0.6736502820306205,
"grad_norm": 1.2864456176757812,
"learning_rate": 8.388368766059798e-06,
"loss": 0.1024,
"step": 1045
},
{
"epoch": 0.6768734891216761,
"grad_norm": 1.8163318634033203,
"learning_rate": 8.367621822073004e-06,
"loss": 0.0942,
"step": 1050
},
{
"epoch": 0.6800966962127317,
"grad_norm": 1.1266424655914307,
"learning_rate": 8.346768208097339e-06,
"loss": 0.0997,
"step": 1055
},
{
"epoch": 0.6833199033037872,
"grad_norm": 1.268912672996521,
"learning_rate": 8.325808584674539e-06,
"loss": 0.0954,
"step": 1060
},
{
"epoch": 0.6865431103948428,
"grad_norm": 1.9696354866027832,
"learning_rate": 8.304743615704207e-06,
"loss": 0.1056,
"step": 1065
},
{
"epoch": 0.6897663174858985,
"grad_norm": 1.47492253780365,
"learning_rate": 8.283573968422792e-06,
"loss": 0.103,
"step": 1070
},
{
"epoch": 0.6929895245769541,
"grad_norm": 1.654740810394287,
"learning_rate": 8.262300313382431e-06,
"loss": 0.0951,
"step": 1075
},
{
"epoch": 0.6962127316680097,
"grad_norm": 1.3860782384872437,
"learning_rate": 8.240923324429742e-06,
"loss": 0.1013,
"step": 1080
},
{
"epoch": 0.6994359387590653,
"grad_norm": 1.9896957874298096,
"learning_rate": 8.219443678684448e-06,
"loss": 0.095,
"step": 1085
},
{
"epoch": 0.7026591458501209,
"grad_norm": 1.5903962850570679,
"learning_rate": 8.197862056517954e-06,
"loss": 0.1025,
"step": 1090
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.1547088623046875,
"learning_rate": 8.176179141531774e-06,
"loss": 0.1011,
"step": 1095
},
{
"epoch": 0.709105560032232,
"grad_norm": 1.4623602628707886,
"learning_rate": 8.154395620535899e-06,
"loss": 0.1015,
"step": 1100
},
{
"epoch": 0.7123287671232876,
"grad_norm": 1.5208735466003418,
"learning_rate": 8.132512183527027e-06,
"loss": 0.1018,
"step": 1105
},
{
"epoch": 0.7155519742143432,
"grad_norm": 1.1935063600540161,
"learning_rate": 8.110529523666712e-06,
"loss": 0.1022,
"step": 1110
},
{
"epoch": 0.7187751813053989,
"grad_norm": 1.2939246892929077,
"learning_rate": 8.088448337259416e-06,
"loss": 0.1049,
"step": 1115
},
{
"epoch": 0.7219983883964545,
"grad_norm": 1.3576562404632568,
"learning_rate": 8.066269323730435e-06,
"loss": 0.0964,
"step": 1120
},
{
"epoch": 0.7252215954875101,
"grad_norm": 1.2397035360336304,
"learning_rate": 8.043993185603764e-06,
"loss": 0.0949,
"step": 1125
},
{
"epoch": 0.7284448025785657,
"grad_norm": 1.6794919967651367,
"learning_rate": 8.021620628479833e-06,
"loss": 0.0941,
"step": 1130
},
{
"epoch": 0.7316680096696213,
"grad_norm": 1.9329454898834229,
"learning_rate": 7.99915236101316e-06,
"loss": 0.0929,
"step": 1135
},
{
"epoch": 0.7348912167606769,
"grad_norm": 1.2772644758224487,
"learning_rate": 7.976589094889903e-06,
"loss": 0.1004,
"step": 1140
},
{
"epoch": 0.7381144238517324,
"grad_norm": 1.1697113513946533,
"learning_rate": 7.953931544805324e-06,
"loss": 0.0905,
"step": 1145
},
{
"epoch": 0.741337630942788,
"grad_norm": 1.7702858448028564,
"learning_rate": 7.931180428441135e-06,
"loss": 0.1052,
"step": 1150
},
{
"epoch": 0.7445608380338437,
"grad_norm": 1.3432146310806274,
"learning_rate": 7.908336466442786e-06,
"loss": 0.0919,
"step": 1155
},
{
"epoch": 0.7477840451248993,
"grad_norm": 1.2473376989364624,
"learning_rate": 7.885400382396621e-06,
"loss": 0.0961,
"step": 1160
},
{
"epoch": 0.7510072522159549,
"grad_norm": 2.3682289123535156,
"learning_rate": 7.862372902806971e-06,
"loss": 0.1042,
"step": 1165
},
{
"epoch": 0.7542304593070105,
"grad_norm": 1.86495041847229,
"learning_rate": 7.839254757073133e-06,
"loss": 0.1009,
"step": 1170
},
{
"epoch": 0.7574536663980661,
"grad_norm": 1.7069085836410522,
"learning_rate": 7.816046677466269e-06,
"loss": 0.1007,
"step": 1175
},
{
"epoch": 0.7606768734891217,
"grad_norm": 1.8137654066085815,
"learning_rate": 7.792749399106214e-06,
"loss": 0.0927,
"step": 1180
},
{
"epoch": 0.7639000805801772,
"grad_norm": 1.1234313249588013,
"learning_rate": 7.769363659938186e-06,
"loss": 0.0931,
"step": 1185
},
{
"epoch": 0.7671232876712328,
"grad_norm": 1.3791865110397339,
"learning_rate": 7.745890200709416e-06,
"loss": 0.0973,
"step": 1190
},
{
"epoch": 0.7703464947622884,
"grad_norm": 1.2439701557159424,
"learning_rate": 7.722329764945682e-06,
"loss": 0.1004,
"step": 1195
},
{
"epoch": 0.7735697018533441,
"grad_norm": 1.2246594429016113,
"learning_rate": 7.698683098927756e-06,
"loss": 0.1078,
"step": 1200
},
{
"epoch": 0.7767929089443997,
"grad_norm": 1.09011709690094,
"learning_rate": 7.674950951667773e-06,
"loss": 0.0939,
"step": 1205
},
{
"epoch": 0.7800161160354553,
"grad_norm": 1.0550169944763184,
"learning_rate": 7.651134074885495e-06,
"loss": 0.0982,
"step": 1210
},
{
"epoch": 0.7832393231265109,
"grad_norm": 1.935786247253418,
"learning_rate": 7.627233222984514e-06,
"loss": 0.0973,
"step": 1215
},
{
"epoch": 0.7864625302175665,
"grad_norm": 1.7208002805709839,
"learning_rate": 7.603249153028335e-06,
"loss": 0.098,
"step": 1220
},
{
"epoch": 0.7896857373086221,
"grad_norm": 1.3723320960998535,
"learning_rate": 7.579182624716422e-06,
"loss": 0.1035,
"step": 1225
},
{
"epoch": 0.7929089443996776,
"grad_norm": 1.1093083620071411,
"learning_rate": 7.555034400360115e-06,
"loss": 0.0906,
"step": 1230
},
{
"epoch": 0.7961321514907332,
"grad_norm": 1.0704550743103027,
"learning_rate": 7.530805244858492e-06,
"loss": 0.0937,
"step": 1235
},
{
"epoch": 0.7993553585817889,
"grad_norm": 1.111206293106079,
"learning_rate": 7.506495925674135e-06,
"loss": 0.11,
"step": 1240
},
{
"epoch": 0.8025785656728445,
"grad_norm": 1.0680865049362183,
"learning_rate": 7.482107212808829e-06,
"loss": 0.0978,
"step": 1245
},
{
"epoch": 0.8058017727639001,
"grad_norm": 1.2233189344406128,
"learning_rate": 7.457639878779164e-06,
"loss": 0.0901,
"step": 1250
},
{
"epoch": 0.8090249798549557,
"grad_norm": 1.1432982683181763,
"learning_rate": 7.433094698592069e-06,
"loss": 0.1055,
"step": 1255
},
{
"epoch": 0.8122481869460113,
"grad_norm": 1.0968226194381714,
"learning_rate": 7.4084724497202675e-06,
"loss": 0.0893,
"step": 1260
},
{
"epoch": 0.8154713940370669,
"grad_norm": 1.416164755821228,
"learning_rate": 7.383773912077639e-06,
"loss": 0.1048,
"step": 1265
},
{
"epoch": 0.8186946011281225,
"grad_norm": 1.4907201528549194,
"learning_rate": 7.3589998679945274e-06,
"loss": 0.0957,
"step": 1270
},
{
"epoch": 0.821917808219178,
"grad_norm": 1.1113173961639404,
"learning_rate": 7.3341511021929565e-06,
"loss": 0.0891,
"step": 1275
},
{
"epoch": 0.8251410153102336,
"grad_norm": 1.5791008472442627,
"learning_rate": 7.30922840176177e-06,
"loss": 0.0938,
"step": 1280
},
{
"epoch": 0.8283642224012893,
"grad_norm": 1.1519147157669067,
"learning_rate": 7.2842325561317064e-06,
"loss": 0.0937,
"step": 1285
},
{
"epoch": 0.8315874294923449,
"grad_norm": 1.0236375331878662,
"learning_rate": 7.259164357050389e-06,
"loss": 0.0859,
"step": 1290
},
{
"epoch": 0.8348106365834005,
"grad_norm": 1.629459023475647,
"learning_rate": 7.234024598557248e-06,
"loss": 0.0902,
"step": 1295
},
{
"epoch": 0.8380338436744561,
"grad_norm": 1.171321988105774,
"learning_rate": 7.208814076958374e-06,
"loss": 0.0887,
"step": 1300
},
{
"epoch": 0.8412570507655117,
"grad_norm": 1.2966508865356445,
"learning_rate": 7.183533590801286e-06,
"loss": 0.0958,
"step": 1305
},
{
"epoch": 0.8444802578565673,
"grad_norm": 1.7354567050933838,
"learning_rate": 7.158183940849644e-06,
"loss": 0.0967,
"step": 1310
},
{
"epoch": 0.8477034649476228,
"grad_norm": 2.0169782638549805,
"learning_rate": 7.132765930057886e-06,
"loss": 0.0972,
"step": 1315
},
{
"epoch": 0.8509266720386784,
"grad_norm": 1.147071123123169,
"learning_rate": 7.107280363545785e-06,
"loss": 0.0976,
"step": 1320
},
{
"epoch": 0.8541498791297341,
"grad_norm": 1.391392707824707,
"learning_rate": 7.08172804857296e-06,
"loss": 0.0899,
"step": 1325
},
{
"epoch": 0.8573730862207897,
"grad_norm": 1.1355257034301758,
"learning_rate": 7.056109794513292e-06,
"loss": 0.1036,
"step": 1330
},
{
"epoch": 0.8605962933118453,
"grad_norm": 0.9366469979286194,
"learning_rate": 7.030426412829296e-06,
"loss": 0.088,
"step": 1335
},
{
"epoch": 0.8638195004029009,
"grad_norm": 1.1631442308425903,
"learning_rate": 7.004678717046419e-06,
"loss": 0.0891,
"step": 1340
},
{
"epoch": 0.8670427074939565,
"grad_norm": 1.429606318473816,
"learning_rate": 6.978867522727264e-06,
"loss": 0.1039,
"step": 1345
},
{
"epoch": 0.8702659145850121,
"grad_norm": 1.1730060577392578,
"learning_rate": 6.952993647445762e-06,
"loss": 0.0931,
"step": 1350
},
{
"epoch": 0.8734891216760677,
"grad_norm": 1.1138707399368286,
"learning_rate": 6.927057910761273e-06,
"loss": 0.0982,
"step": 1355
},
{
"epoch": 0.8767123287671232,
"grad_norm": 1.2846705913543701,
"learning_rate": 6.9010611341926286e-06,
"loss": 0.0937,
"step": 1360
},
{
"epoch": 0.8799355358581789,
"grad_norm": 1.1566565036773682,
"learning_rate": 6.875004141192108e-06,
"loss": 0.092,
"step": 1365
},
{
"epoch": 0.8831587429492345,
"grad_norm": 1.17435622215271,
"learning_rate": 6.848887757119358e-06,
"loss": 0.0996,
"step": 1370
},
{
"epoch": 0.8863819500402901,
"grad_norm": 1.250361442565918,
"learning_rate": 6.822712809215247e-06,
"loss": 0.099,
"step": 1375
},
{
"epoch": 0.8896051571313457,
"grad_norm": 1.0037554502487183,
"learning_rate": 6.7964801265756616e-06,
"loss": 0.0873,
"step": 1380
},
{
"epoch": 0.8928283642224013,
"grad_norm": 1.2579954862594604,
"learning_rate": 6.770190540125246e-06,
"loss": 0.0898,
"step": 1385
},
{
"epoch": 0.8960515713134569,
"grad_norm": 1.5266188383102417,
"learning_rate": 6.74384488259108e-06,
"loss": 0.094,
"step": 1390
},
{
"epoch": 0.8992747784045125,
"grad_norm": 1.5171687602996826,
"learning_rate": 6.71744398847631e-06,
"loss": 0.0924,
"step": 1395
},
{
"epoch": 0.9024979854955681,
"grad_norm": 1.0802000761032104,
"learning_rate": 6.690988694033707e-06,
"loss": 0.0941,
"step": 1400
},
{
"epoch": 0.9057211925866236,
"grad_norm": 1.210917353630066,
"learning_rate": 6.664479837239182e-06,
"loss": 0.0885,
"step": 1405
},
{
"epoch": 0.9089443996776793,
"grad_norm": 0.9679683446884155,
"learning_rate": 6.63791825776524e-06,
"loss": 0.0929,
"step": 1410
},
{
"epoch": 0.9121676067687349,
"grad_norm": 1.1072417497634888,
"learning_rate": 6.611304796954391e-06,
"loss": 0.0907,
"step": 1415
},
{
"epoch": 0.9153908138597905,
"grad_norm": 1.1007254123687744,
"learning_rate": 6.58464029779249e-06,
"loss": 0.0834,
"step": 1420
},
{
"epoch": 0.9186140209508461,
"grad_norm": 1.3668423891067505,
"learning_rate": 6.557925604882045e-06,
"loss": 0.0996,
"step": 1425
},
{
"epoch": 0.9218372280419017,
"grad_norm": 1.0063837766647339,
"learning_rate": 6.531161564415455e-06,
"loss": 0.0967,
"step": 1430
},
{
"epoch": 0.9250604351329573,
"grad_norm": 2.1310322284698486,
"learning_rate": 6.504349024148215e-06,
"loss": 0.0891,
"step": 1435
},
{
"epoch": 0.9282836422240129,
"grad_norm": 1.7086719274520874,
"learning_rate": 6.4774888333720565e-06,
"loss": 0.091,
"step": 1440
},
{
"epoch": 0.9315068493150684,
"grad_norm": 1.2807854413986206,
"learning_rate": 6.450581842888051e-06,
"loss": 0.0945,
"step": 1445
},
{
"epoch": 0.934730056406124,
"grad_norm": 1.2100774049758911,
"learning_rate": 6.423628904979655e-06,
"loss": 0.0927,
"step": 1450
},
{
"epoch": 0.9379532634971797,
"grad_norm": 1.079419732093811,
"learning_rate": 6.396630873385723e-06,
"loss": 0.0928,
"step": 1455
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.2331191301345825,
"learning_rate": 6.369588603273453e-06,
"loss": 0.0902,
"step": 1460
},
{
"epoch": 0.9443996776792909,
"grad_norm": 1.521287441253662,
"learning_rate": 6.342502951211314e-06,
"loss": 0.0906,
"step": 1465
},
{
"epoch": 0.9476228847703465,
"grad_norm": 1.2149112224578857,
"learning_rate": 6.315374775141897e-06,
"loss": 0.088,
"step": 1470
},
{
"epoch": 0.9508460918614021,
"grad_norm": 1.041596531867981,
"learning_rate": 6.288204934354753e-06,
"loss": 0.0903,
"step": 1475
},
{
"epoch": 0.9540692989524577,
"grad_norm": 1.4294019937515259,
"learning_rate": 6.26099428945917e-06,
"loss": 0.0827,
"step": 1480
},
{
"epoch": 0.9572925060435133,
"grad_norm": 1.353190302848816,
"learning_rate": 6.2337437023569105e-06,
"loss": 0.0892,
"step": 1485
},
{
"epoch": 0.9605157131345688,
"grad_norm": 1.0870195627212524,
"learning_rate": 6.206454036214914e-06,
"loss": 0.1028,
"step": 1490
},
{
"epoch": 0.9637389202256245,
"grad_norm": 0.9204868674278259,
"learning_rate": 6.179126155437957e-06,
"loss": 0.0929,
"step": 1495
},
{
"epoch": 0.9669621273166801,
"grad_norm": 1.2136569023132324,
"learning_rate": 6.151760925641268e-06,
"loss": 0.0871,
"step": 1500
},
{
"epoch": 0.9701853344077357,
"grad_norm": 1.2971879243850708,
"learning_rate": 6.124359213623114e-06,
"loss": 0.0979,
"step": 1505
},
{
"epoch": 0.9734085414987913,
"grad_norm": 1.1342357397079468,
"learning_rate": 6.096921887337342e-06,
"loss": 0.0821,
"step": 1510
},
{
"epoch": 0.9766317485898469,
"grad_norm": 1.1401841640472412,
"learning_rate": 6.0694498158658886e-06,
"loss": 0.0853,
"step": 1515
},
{
"epoch": 0.9798549556809025,
"grad_norm": 1.0659375190734863,
"learning_rate": 6.041943869391248e-06,
"loss": 0.092,
"step": 1520
},
{
"epoch": 0.9830781627719581,
"grad_norm": 1.2414722442626953,
"learning_rate": 6.0144049191689095e-06,
"loss": 0.0943,
"step": 1525
},
{
"epoch": 0.9863013698630136,
"grad_norm": 1.3391859531402588,
"learning_rate": 5.9868338374997645e-06,
"loss": 0.0846,
"step": 1530
},
{
"epoch": 0.9895245769540693,
"grad_norm": 1.275088906288147,
"learning_rate": 5.959231497702473e-06,
"loss": 0.0976,
"step": 1535
},
{
"epoch": 0.9927477840451249,
"grad_norm": 1.732020616531372,
"learning_rate": 5.9315987740857995e-06,
"loss": 0.0906,
"step": 1540
},
{
"epoch": 0.9959709911361805,
"grad_norm": 1.2046090364456177,
"learning_rate": 5.903936541920924e-06,
"loss": 0.092,
"step": 1545
},
{
"epoch": 0.9991941982272361,
"grad_norm": 0.9802199006080627,
"learning_rate": 5.876245677413712e-06,
"loss": 0.0815,
"step": 1550
}
],
"logging_steps": 5,
"max_steps": 3102,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 776,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.912061943183573e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}