{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032232070910556, "grad_norm": 70.69935607910156, "learning_rate": 1.6077170418006432e-07, "loss": 0.9483, "step": 5 }, { "epoch": 0.0064464141821112, "grad_norm": 41.78139877319336, "learning_rate": 3.2154340836012864e-07, "loss": 0.8601, "step": 10 }, { "epoch": 0.009669621273166801, "grad_norm": 39.28358840942383, "learning_rate": 4.823151125401929e-07, "loss": 0.7662, "step": 15 }, { "epoch": 0.0128928283642224, "grad_norm": 26.412134170532227, "learning_rate": 6.430868167202573e-07, "loss": 0.5686, "step": 20 }, { "epoch": 0.016116035455278, "grad_norm": 12.356087684631348, "learning_rate": 8.038585209003216e-07, "loss": 0.4011, "step": 25 }, { "epoch": 0.019339242546333603, "grad_norm": 7.162217617034912, "learning_rate": 9.646302250803859e-07, "loss": 0.3031, "step": 30 }, { "epoch": 0.022562449637389202, "grad_norm": 5.508477687835693, "learning_rate": 1.1254019292604503e-06, "loss": 0.245, "step": 35 }, { "epoch": 0.0257856567284448, "grad_norm": 6.486464500427246, "learning_rate": 1.2861736334405146e-06, "loss": 0.2266, "step": 40 }, { "epoch": 0.029008863819500404, "grad_norm": 5.0619378089904785, "learning_rate": 1.4469453376205788e-06, "loss": 0.1895, "step": 45 }, { "epoch": 0.032232070910556, "grad_norm": 3.8913064002990723, "learning_rate": 1.6077170418006432e-06, "loss": 0.1845, "step": 50 }, { "epoch": 0.035455278001611606, "grad_norm": 3.2244060039520264, "learning_rate": 1.7684887459807077e-06, "loss": 0.1619, "step": 55 }, { "epoch": 0.038678485092667206, "grad_norm": 4.426245212554932, "learning_rate": 1.9292604501607717e-06, "loss": 0.1638, "step": 60 }, { "epoch": 0.041901692183722805, "grad_norm": 4.924641132354736, "learning_rate": 2.090032154340836e-06, "loss": 0.1535, "step": 65 }, { "epoch": 0.045124899274778404, "grad_norm": 4.1020894050598145, "learning_rate": 2.2508038585209006e-06, "loss": 0.1514, "step": 70 }, { "epoch": 0.048348106365834004, "grad_norm": 3.604417562484741, "learning_rate": 2.411575562700965e-06, "loss": 0.136, "step": 75 }, { "epoch": 0.0515713134568896, "grad_norm": 3.021495819091797, "learning_rate": 2.572347266881029e-06, "loss": 0.1329, "step": 80 }, { "epoch": 0.0547945205479452, "grad_norm": 3.2783167362213135, "learning_rate": 2.7331189710610936e-06, "loss": 0.142, "step": 85 }, { "epoch": 0.05801772763900081, "grad_norm": 2.818120241165161, "learning_rate": 2.8938906752411576e-06, "loss": 0.1305, "step": 90 }, { "epoch": 0.06124093473005641, "grad_norm": 3.0875892639160156, "learning_rate": 3.054662379421222e-06, "loss": 0.123, "step": 95 }, { "epoch": 0.064464141821112, "grad_norm": 2.726335048675537, "learning_rate": 3.2154340836012865e-06, "loss": 0.139, "step": 100 }, { "epoch": 0.06768734891216761, "grad_norm": 2.3112688064575195, "learning_rate": 3.376205787781351e-06, "loss": 0.1174, "step": 105 }, { "epoch": 0.07091055600322321, "grad_norm": 2.9780499935150146, "learning_rate": 3.5369774919614154e-06, "loss": 0.1342, "step": 110 }, { "epoch": 0.07413376309427881, "grad_norm": 3.2706351280212402, "learning_rate": 3.69774919614148e-06, "loss": 0.1225, "step": 115 }, { "epoch": 0.07735697018533441, "grad_norm": 2.3604736328125, "learning_rate": 3.8585209003215434e-06, "loss": 0.1298, "step": 120 }, { "epoch": 0.08058017727639001, "grad_norm": 2.7637319564819336, "learning_rate": 4.0192926045016075e-06, "loss": 0.1237, "step": 125 }, { "epoch": 0.08380338436744561, "grad_norm": 2.42730975151062, "learning_rate": 4.180064308681672e-06, "loss": 0.1321, "step": 130 }, { "epoch": 0.08702659145850121, "grad_norm": 2.5871758460998535, "learning_rate": 4.340836012861736e-06, "loss": 0.13, "step": 135 }, { "epoch": 0.09024979854955681, "grad_norm": 2.662217855453491, "learning_rate": 4.501607717041801e-06, "loss": 0.1308, "step": 140 }, { "epoch": 0.09347300564061241, "grad_norm": 2.97275447845459, "learning_rate": 4.662379421221865e-06, "loss": 0.1152, "step": 145 }, { "epoch": 0.09669621273166801, "grad_norm": 2.990713119506836, "learning_rate": 4.82315112540193e-06, "loss": 0.1241, "step": 150 }, { "epoch": 0.0999194198227236, "grad_norm": 3.011382818222046, "learning_rate": 4.983922829581994e-06, "loss": 0.1318, "step": 155 }, { "epoch": 0.1031426269137792, "grad_norm": 2.340388774871826, "learning_rate": 5.144694533762058e-06, "loss": 0.1217, "step": 160 }, { "epoch": 0.1063658340048348, "grad_norm": 2.1309595108032227, "learning_rate": 5.305466237942123e-06, "loss": 0.1139, "step": 165 }, { "epoch": 0.1095890410958904, "grad_norm": 2.023085355758667, "learning_rate": 5.466237942122187e-06, "loss": 0.117, "step": 170 }, { "epoch": 0.11281224818694602, "grad_norm": 2.348984718322754, "learning_rate": 5.627009646302252e-06, "loss": 0.1141, "step": 175 }, { "epoch": 0.11603545527800162, "grad_norm": 2.375622272491455, "learning_rate": 5.787781350482315e-06, "loss": 0.128, "step": 180 }, { "epoch": 0.11925866236905722, "grad_norm": 2.196361780166626, "learning_rate": 5.94855305466238e-06, "loss": 0.1196, "step": 185 }, { "epoch": 0.12248186946011282, "grad_norm": 2.2878026962280273, "learning_rate": 6.109324758842444e-06, "loss": 0.1244, "step": 190 }, { "epoch": 0.12570507655116842, "grad_norm": 2.213945150375366, "learning_rate": 6.270096463022508e-06, "loss": 0.1247, "step": 195 }, { "epoch": 0.128928283642224, "grad_norm": 2.1751949787139893, "learning_rate": 6.430868167202573e-06, "loss": 0.1218, "step": 200 }, { "epoch": 0.13215149073327961, "grad_norm": 3.6750779151916504, "learning_rate": 6.591639871382637e-06, "loss": 0.1245, "step": 205 }, { "epoch": 0.13537469782433523, "grad_norm": 2.0573132038116455, "learning_rate": 6.752411575562702e-06, "loss": 0.1222, "step": 210 }, { "epoch": 0.1385979049153908, "grad_norm": 2.161153554916382, "learning_rate": 6.913183279742766e-06, "loss": 0.1144, "step": 215 }, { "epoch": 0.14182111200644643, "grad_norm": 2.023874282836914, "learning_rate": 7.073954983922831e-06, "loss": 0.1196, "step": 220 }, { "epoch": 0.145044319097502, "grad_norm": 2.5854148864746094, "learning_rate": 7.234726688102894e-06, "loss": 0.1226, "step": 225 }, { "epoch": 0.14826752618855762, "grad_norm": 2.1432173252105713, "learning_rate": 7.39549839228296e-06, "loss": 0.1141, "step": 230 }, { "epoch": 0.1514907332796132, "grad_norm": 1.9795219898223877, "learning_rate": 7.556270096463023e-06, "loss": 0.1214, "step": 235 }, { "epoch": 0.15471394037066882, "grad_norm": 2.685298204421997, "learning_rate": 7.717041800643087e-06, "loss": 0.1166, "step": 240 }, { "epoch": 0.1579371474617244, "grad_norm": 2.327439069747925, "learning_rate": 7.877813504823153e-06, "loss": 0.1147, "step": 245 }, { "epoch": 0.16116035455278002, "grad_norm": 2.873242139816284, "learning_rate": 8.038585209003215e-06, "loss": 0.1214, "step": 250 }, { "epoch": 0.1643835616438356, "grad_norm": 1.947021484375, "learning_rate": 8.19935691318328e-06, "loss": 0.1181, "step": 255 }, { "epoch": 0.16760676873489122, "grad_norm": 2.10578989982605, "learning_rate": 8.360128617363345e-06, "loss": 0.1166, "step": 260 }, { "epoch": 0.1708299758259468, "grad_norm": 3.1242830753326416, "learning_rate": 8.520900321543409e-06, "loss": 0.1223, "step": 265 }, { "epoch": 0.17405318291700242, "grad_norm": 2.3070671558380127, "learning_rate": 8.681672025723473e-06, "loss": 0.1199, "step": 270 }, { "epoch": 0.177276390008058, "grad_norm": 4.113283157348633, "learning_rate": 8.842443729903538e-06, "loss": 0.1359, "step": 275 }, { "epoch": 0.18049959709911362, "grad_norm": 2.406996488571167, "learning_rate": 9.003215434083602e-06, "loss": 0.1247, "step": 280 }, { "epoch": 0.18372280419016923, "grad_norm": 1.8795942068099976, "learning_rate": 9.163987138263667e-06, "loss": 0.1184, "step": 285 }, { "epoch": 0.18694601128122482, "grad_norm": 1.873802900314331, "learning_rate": 9.32475884244373e-06, "loss": 0.1147, "step": 290 }, { "epoch": 0.19016921837228043, "grad_norm": 2.1450552940368652, "learning_rate": 9.485530546623795e-06, "loss": 0.1297, "step": 295 }, { "epoch": 0.19339242546333602, "grad_norm": 2.236771583557129, "learning_rate": 9.64630225080386e-06, "loss": 0.1144, "step": 300 }, { "epoch": 0.19661563255439163, "grad_norm": 2.0439035892486572, "learning_rate": 9.807073954983923e-06, "loss": 0.1153, "step": 305 }, { "epoch": 0.1998388396454472, "grad_norm": 2.3851237297058105, "learning_rate": 9.967845659163988e-06, "loss": 0.1185, "step": 310 }, { "epoch": 0.20306204673650283, "grad_norm": 1.9924075603485107, "learning_rate": 9.99994931968214e-06, "loss": 0.1094, "step": 315 }, { "epoch": 0.2062852538275584, "grad_norm": 2.0796937942504883, "learning_rate": 9.999743432651652e-06, "loss": 0.1211, "step": 320 }, { "epoch": 0.20950846091861403, "grad_norm": 2.6404528617858887, "learning_rate": 9.999379177905158e-06, "loss": 0.1225, "step": 325 }, { "epoch": 0.2127316680096696, "grad_norm": 1.8474249839782715, "learning_rate": 9.998856566980493e-06, "loss": 0.1195, "step": 330 }, { "epoch": 0.21595487510072522, "grad_norm": 2.2194368839263916, "learning_rate": 9.998175616431443e-06, "loss": 0.1234, "step": 335 }, { "epoch": 0.2191780821917808, "grad_norm": 2.3846206665039062, "learning_rate": 9.99733634782723e-06, "loss": 0.1183, "step": 340 }, { "epoch": 0.22240128928283642, "grad_norm": 2.1238505840301514, "learning_rate": 9.996338787751834e-06, "loss": 0.1133, "step": 345 }, { "epoch": 0.22562449637389204, "grad_norm": 1.9952363967895508, "learning_rate": 9.995182967803131e-06, "loss": 0.1171, "step": 350 }, { "epoch": 0.22884770346494762, "grad_norm": 1.8000890016555786, "learning_rate": 9.99386892459192e-06, "loss": 0.1271, "step": 355 }, { "epoch": 0.23207091055600323, "grad_norm": 1.680237889289856, "learning_rate": 9.992396699740738e-06, "loss": 0.1233, "step": 360 }, { "epoch": 0.23529411764705882, "grad_norm": 1.9006551504135132, "learning_rate": 9.990766339882554e-06, "loss": 0.117, "step": 365 }, { "epoch": 0.23851732473811443, "grad_norm": 2.417584180831909, "learning_rate": 9.988977896659294e-06, "loss": 0.1202, "step": 370 }, { "epoch": 0.24174053182917002, "grad_norm": 1.7894331216812134, "learning_rate": 9.987031426720195e-06, "loss": 0.1131, "step": 375 }, { "epoch": 0.24496373892022563, "grad_norm": 2.3755767345428467, "learning_rate": 9.984926991720025e-06, "loss": 0.1199, "step": 380 }, { "epoch": 0.24818694601128122, "grad_norm": 1.6282379627227783, "learning_rate": 9.982664658317115e-06, "loss": 0.1139, "step": 385 }, { "epoch": 0.25141015310233683, "grad_norm": 2.3639254570007324, "learning_rate": 9.980244498171256e-06, "loss": 0.1064, "step": 390 }, { "epoch": 0.25463336019339244, "grad_norm": 1.6792004108428955, "learning_rate": 9.97766658794143e-06, "loss": 0.111, "step": 395 }, { "epoch": 0.257856567284448, "grad_norm": 2.633594036102295, "learning_rate": 9.974931009283378e-06, "loss": 0.1083, "step": 400 }, { "epoch": 0.2610797743755036, "grad_norm": 1.7790218591690063, "learning_rate": 9.972037848847014e-06, "loss": 0.1197, "step": 405 }, { "epoch": 0.26430298146655923, "grad_norm": 2.443464756011963, "learning_rate": 9.968987198273682e-06, "loss": 0.1153, "step": 410 }, { "epoch": 0.26752618855761484, "grad_norm": 1.6622118949890137, "learning_rate": 9.965779154193256e-06, "loss": 0.1214, "step": 415 }, { "epoch": 0.27074939564867045, "grad_norm": 1.398253321647644, "learning_rate": 9.962413818221071e-06, "loss": 0.1053, "step": 420 }, { "epoch": 0.273972602739726, "grad_norm": 2.2201757431030273, "learning_rate": 9.95889129695471e-06, "loss": 0.1206, "step": 425 }, { "epoch": 0.2771958098307816, "grad_norm": 2.262932062149048, "learning_rate": 9.955211701970631e-06, "loss": 0.1152, "step": 430 }, { "epoch": 0.28041901692183724, "grad_norm": 1.72652268409729, "learning_rate": 9.951375149820624e-06, "loss": 0.114, "step": 435 }, { "epoch": 0.28364222401289285, "grad_norm": 2.2013440132141113, "learning_rate": 9.947381762028124e-06, "loss": 0.1172, "step": 440 }, { "epoch": 0.2868654311039484, "grad_norm": 1.7692981958389282, "learning_rate": 9.943231665084363e-06, "loss": 0.108, "step": 445 }, { "epoch": 0.290088638195004, "grad_norm": 2.222728729248047, "learning_rate": 9.938924990444363e-06, "loss": 0.1074, "step": 450 }, { "epoch": 0.29331184528605964, "grad_norm": 1.67208993434906, "learning_rate": 9.934461874522767e-06, "loss": 0.1176, "step": 455 }, { "epoch": 0.29653505237711525, "grad_norm": 1.7946748733520508, "learning_rate": 9.929842458689524e-06, "loss": 0.111, "step": 460 }, { "epoch": 0.2997582594681708, "grad_norm": 1.9234955310821533, "learning_rate": 9.925066889265412e-06, "loss": 0.1182, "step": 465 }, { "epoch": 0.3029814665592264, "grad_norm": 2.4538662433624268, "learning_rate": 9.920135317517393e-06, "loss": 0.1227, "step": 470 }, { "epoch": 0.30620467365028203, "grad_norm": 1.8412810564041138, "learning_rate": 9.915047899653838e-06, "loss": 0.1128, "step": 475 }, { "epoch": 0.30942788074133765, "grad_norm": 1.9068711996078491, "learning_rate": 9.909804796819562e-06, "loss": 0.1142, "step": 480 }, { "epoch": 0.3126510878323932, "grad_norm": 1.9735403060913086, "learning_rate": 9.904406175090732e-06, "loss": 0.1066, "step": 485 }, { "epoch": 0.3158742949234488, "grad_norm": 2.349578619003296, "learning_rate": 9.898852205469603e-06, "loss": 0.1187, "step": 490 }, { "epoch": 0.31909750201450443, "grad_norm": 1.504022479057312, "learning_rate": 9.893143063879098e-06, "loss": 0.1051, "step": 495 }, { "epoch": 0.32232070910556004, "grad_norm": 1.5406743288040161, "learning_rate": 9.887278931157237e-06, "loss": 0.1123, "step": 500 }, { "epoch": 0.32554391619661566, "grad_norm": 1.8361977338790894, "learning_rate": 9.881259993051415e-06, "loss": 0.1225, "step": 505 }, { "epoch": 0.3287671232876712, "grad_norm": 1.7701557874679565, "learning_rate": 9.875086440212511e-06, "loss": 0.1027, "step": 510 }, { "epoch": 0.33199033037872683, "grad_norm": 1.6567251682281494, "learning_rate": 9.86875846818885e-06, "loss": 0.1206, "step": 515 }, { "epoch": 0.33521353746978244, "grad_norm": 1.641015887260437, "learning_rate": 9.862276277420016e-06, "loss": 0.1183, "step": 520 }, { "epoch": 0.33843674456083805, "grad_norm": 1.2547988891601562, "learning_rate": 9.85564007323049e-06, "loss": 0.1098, "step": 525 }, { "epoch": 0.3416599516518936, "grad_norm": 1.8411493301391602, "learning_rate": 9.848850065823159e-06, "loss": 0.1052, "step": 530 }, { "epoch": 0.3448831587429492, "grad_norm": 1.6275116205215454, "learning_rate": 9.841906470272655e-06, "loss": 0.1224, "step": 535 }, { "epoch": 0.34810636583400484, "grad_norm": 2.855224847793579, "learning_rate": 9.834809506518537e-06, "loss": 0.1082, "step": 540 }, { "epoch": 0.35132957292506045, "grad_norm": 2.3182358741760254, "learning_rate": 9.827559399358327e-06, "loss": 0.1224, "step": 545 }, { "epoch": 0.354552780016116, "grad_norm": 2.1670444011688232, "learning_rate": 9.82015637844039e-06, "loss": 0.1101, "step": 550 }, { "epoch": 0.3577759871071716, "grad_norm": 1.629321813583374, "learning_rate": 9.812600678256664e-06, "loss": 0.1054, "step": 555 }, { "epoch": 0.36099919419822724, "grad_norm": 1.6911214590072632, "learning_rate": 9.804892538135225e-06, "loss": 0.1028, "step": 560 }, { "epoch": 0.36422240128928285, "grad_norm": 2.162142038345337, "learning_rate": 9.797032202232708e-06, "loss": 0.1052, "step": 565 }, { "epoch": 0.36744560838033846, "grad_norm": 1.6863000392913818, "learning_rate": 9.789019919526583e-06, "loss": 0.1078, "step": 570 }, { "epoch": 0.370668815471394, "grad_norm": 2.0217230319976807, "learning_rate": 9.780855943807253e-06, "loss": 0.1152, "step": 575 }, { "epoch": 0.37389202256244963, "grad_norm": 1.3627716302871704, "learning_rate": 9.772540533670023e-06, "loss": 0.1055, "step": 580 }, { "epoch": 0.37711522965350525, "grad_norm": 1.4767628908157349, "learning_rate": 9.764073952506913e-06, "loss": 0.1126, "step": 585 }, { "epoch": 0.38033843674456086, "grad_norm": 1.769196629524231, "learning_rate": 9.755456468498307e-06, "loss": 0.1062, "step": 590 }, { "epoch": 0.3835616438356164, "grad_norm": 1.9426604509353638, "learning_rate": 9.746688354604467e-06, "loss": 0.1128, "step": 595 }, { "epoch": 0.38678485092667203, "grad_norm": 1.6949142217636108, "learning_rate": 9.737769888556874e-06, "loss": 0.1058, "step": 600 }, { "epoch": 0.39000805801772764, "grad_norm": 1.6036336421966553, "learning_rate": 9.728701352849445e-06, "loss": 0.1214, "step": 605 }, { "epoch": 0.39323126510878326, "grad_norm": 1.3234189748764038, "learning_rate": 9.71948303472958e-06, "loss": 0.1095, "step": 610 }, { "epoch": 0.3964544721998388, "grad_norm": 1.5805995464324951, "learning_rate": 9.710115226189054e-06, "loss": 0.1179, "step": 615 }, { "epoch": 0.3996776792908944, "grad_norm": 1.5236024856567383, "learning_rate": 9.700598223954787e-06, "loss": 0.1065, "step": 620 }, { "epoch": 0.40290088638195004, "grad_norm": 2.2143023014068604, "learning_rate": 9.690932329479425e-06, "loss": 0.1118, "step": 625 }, { "epoch": 0.40612409347300565, "grad_norm": 2.0677402019500732, "learning_rate": 9.681117848931806e-06, "loss": 0.1015, "step": 630 }, { "epoch": 0.40934730056406127, "grad_norm": 1.786145567893982, "learning_rate": 9.671155093187256e-06, "loss": 0.1072, "step": 635 }, { "epoch": 0.4125705076551168, "grad_norm": 1.661035418510437, "learning_rate": 9.661044377817745e-06, "loss": 0.1165, "step": 640 }, { "epoch": 0.41579371474617244, "grad_norm": 1.7452033758163452, "learning_rate": 9.650786023081882e-06, "loss": 0.1107, "step": 645 }, { "epoch": 0.41901692183722805, "grad_norm": 1.5127534866333008, "learning_rate": 9.640380353914784e-06, "loss": 0.1205, "step": 650 }, { "epoch": 0.42224012892828366, "grad_norm": 1.7860335111618042, "learning_rate": 9.629827699917777e-06, "loss": 0.1099, "step": 655 }, { "epoch": 0.4254633360193392, "grad_norm": 1.8366566896438599, "learning_rate": 9.619128395347957e-06, "loss": 0.0995, "step": 660 }, { "epoch": 0.42868654311039484, "grad_norm": 1.7406480312347412, "learning_rate": 9.608282779107596e-06, "loss": 0.1093, "step": 665 }, { "epoch": 0.43190975020145045, "grad_norm": 1.5550240278244019, "learning_rate": 9.597291194733417e-06, "loss": 0.1081, "step": 670 }, { "epoch": 0.43513295729250606, "grad_norm": 1.8106791973114014, "learning_rate": 9.58615399038571e-06, "loss": 0.1092, "step": 675 }, { "epoch": 0.4383561643835616, "grad_norm": 2.0512306690216064, "learning_rate": 9.574871518837299e-06, "loss": 0.11, "step": 680 }, { "epoch": 0.44157937147461723, "grad_norm": 1.536855697631836, "learning_rate": 9.563444137462373e-06, "loss": 0.1092, "step": 685 }, { "epoch": 0.44480257856567285, "grad_norm": 1.4719635248184204, "learning_rate": 9.55187220822516e-06, "loss": 0.1081, "step": 690 }, { "epoch": 0.44802578565672846, "grad_norm": 1.7767363786697388, "learning_rate": 9.54015609766847e-06, "loss": 0.1099, "step": 695 }, { "epoch": 0.4512489927477841, "grad_norm": 1.453895092010498, "learning_rate": 9.528296176902085e-06, "loss": 0.1083, "step": 700 }, { "epoch": 0.45447219983883963, "grad_norm": 1.516648292541504, "learning_rate": 9.51629282159099e-06, "loss": 0.1174, "step": 705 }, { "epoch": 0.45769540692989524, "grad_norm": 1.5289475917816162, "learning_rate": 9.504146411943488e-06, "loss": 0.1119, "step": 710 }, { "epoch": 0.46091861402095086, "grad_norm": 1.7268835306167603, "learning_rate": 9.491857332699153e-06, "loss": 0.1067, "step": 715 }, { "epoch": 0.46414182111200647, "grad_norm": 1.424131989479065, "learning_rate": 9.47942597311664e-06, "loss": 0.1096, "step": 720 }, { "epoch": 0.467365028203062, "grad_norm": 2.6142001152038574, "learning_rate": 9.466852726961363e-06, "loss": 0.1132, "step": 725 }, { "epoch": 0.47058823529411764, "grad_norm": 1.7743583917617798, "learning_rate": 9.454137992493008e-06, "loss": 0.1095, "step": 730 }, { "epoch": 0.47381144238517325, "grad_norm": 1.3648674488067627, "learning_rate": 9.441282172452935e-06, "loss": 0.1016, "step": 735 }, { "epoch": 0.47703464947622887, "grad_norm": 1.202217698097229, "learning_rate": 9.428285674051413e-06, "loss": 0.1014, "step": 740 }, { "epoch": 0.4802578565672844, "grad_norm": 1.2294992208480835, "learning_rate": 9.415148908954717e-06, "loss": 0.0958, "step": 745 }, { "epoch": 0.48348106365834004, "grad_norm": 1.3715941905975342, "learning_rate": 9.401872293272096e-06, "loss": 0.1032, "step": 750 }, { "epoch": 0.48670427074939565, "grad_norm": 1.2639530897140503, "learning_rate": 9.38845624754259e-06, "loss": 0.1047, "step": 755 }, { "epoch": 0.48992747784045126, "grad_norm": 1.389994502067566, "learning_rate": 9.37490119672171e-06, "loss": 0.1072, "step": 760 }, { "epoch": 0.4931506849315068, "grad_norm": 1.6051830053329468, "learning_rate": 9.361207570167974e-06, "loss": 0.1021, "step": 765 }, { "epoch": 0.49637389202256244, "grad_norm": 2.006974458694458, "learning_rate": 9.347375801629313e-06, "loss": 0.1038, "step": 770 }, { "epoch": 0.49959709911361805, "grad_norm": 1.40548837184906, "learning_rate": 9.333406329229326e-06, "loss": 0.1064, "step": 775 }, { "epoch": 0.5028203062046737, "grad_norm": 1.4568746089935303, "learning_rate": 9.319299595453404e-06, "loss": 0.1109, "step": 780 }, { "epoch": 0.5060435132957293, "grad_norm": 1.7389963865280151, "learning_rate": 9.305056047134722e-06, "loss": 0.1082, "step": 785 }, { "epoch": 0.5092667203867849, "grad_norm": 1.2637214660644531, "learning_rate": 9.29067613544007e-06, "loss": 0.1019, "step": 790 }, { "epoch": 0.5124899274778405, "grad_norm": 1.7853842973709106, "learning_rate": 9.276160315855576e-06, "loss": 0.101, "step": 795 }, { "epoch": 0.515713134568896, "grad_norm": 1.365021824836731, "learning_rate": 9.261509048172272e-06, "loss": 0.0903, "step": 800 }, { "epoch": 0.5189363416599516, "grad_norm": 1.0780879259109497, "learning_rate": 9.246722796471534e-06, "loss": 0.1003, "step": 805 }, { "epoch": 0.5221595487510072, "grad_norm": 1.3499747514724731, "learning_rate": 9.231802029110373e-06, "loss": 0.108, "step": 810 }, { "epoch": 0.5253827558420628, "grad_norm": 1.2328459024429321, "learning_rate": 9.216747218706612e-06, "loss": 0.1086, "step": 815 }, { "epoch": 0.5286059629331185, "grad_norm": 1.665556788444519, "learning_rate": 9.20155884212391e-06, "loss": 0.0989, "step": 820 }, { "epoch": 0.5318291700241741, "grad_norm": 1.262510061264038, "learning_rate": 9.186237380456652e-06, "loss": 0.1087, "step": 825 }, { "epoch": 0.5350523771152297, "grad_norm": 1.2561684846878052, "learning_rate": 9.170783319014723e-06, "loss": 0.1011, "step": 830 }, { "epoch": 0.5382755842062853, "grad_norm": 1.4100691080093384, "learning_rate": 9.155197147308118e-06, "loss": 0.1025, "step": 835 }, { "epoch": 0.5414987912973409, "grad_norm": 1.4246737957000732, "learning_rate": 9.13947935903146e-06, "loss": 0.1043, "step": 840 }, { "epoch": 0.5447219983883964, "grad_norm": 1.5978039503097534, "learning_rate": 9.12363045204834e-06, "loss": 0.1137, "step": 845 }, { "epoch": 0.547945205479452, "grad_norm": 1.3265248537063599, "learning_rate": 9.107650928375555e-06, "loss": 0.1066, "step": 850 }, { "epoch": 0.5511684125705076, "grad_norm": 1.3155473470687866, "learning_rate": 9.091541294167214e-06, "loss": 0.0958, "step": 855 }, { "epoch": 0.5543916196615633, "grad_norm": 1.4708001613616943, "learning_rate": 9.075302059698696e-06, "loss": 0.1063, "step": 860 }, { "epoch": 0.5576148267526189, "grad_norm": 1.3062944412231445, "learning_rate": 9.05893373935049e-06, "loss": 0.1009, "step": 865 }, { "epoch": 0.5608380338436745, "grad_norm": 1.3801825046539307, "learning_rate": 9.0424368515919e-06, "loss": 0.1042, "step": 870 }, { "epoch": 0.5640612409347301, "grad_norm": 1.2434556484222412, "learning_rate": 9.02581191896463e-06, "loss": 0.1027, "step": 875 }, { "epoch": 0.5672844480257857, "grad_norm": 1.4129581451416016, "learning_rate": 9.00905946806622e-06, "loss": 0.1028, "step": 880 }, { "epoch": 0.5705076551168412, "grad_norm": 1.7679858207702637, "learning_rate": 8.992180029533378e-06, "loss": 0.1044, "step": 885 }, { "epoch": 0.5737308622078968, "grad_norm": 1.4008570909500122, "learning_rate": 8.975174138025165e-06, "loss": 0.0988, "step": 890 }, { "epoch": 0.5769540692989524, "grad_norm": 1.1371465921401978, "learning_rate": 8.958042332206059e-06, "loss": 0.0977, "step": 895 }, { "epoch": 0.580177276390008, "grad_norm": 1.4827523231506348, "learning_rate": 8.940785154728899e-06, "loss": 0.097, "step": 900 }, { "epoch": 0.5834004834810637, "grad_norm": 1.5484329462051392, "learning_rate": 8.92340315221769e-06, "loss": 0.1049, "step": 905 }, { "epoch": 0.5866236905721193, "grad_norm": 1.267395257949829, "learning_rate": 8.905896875250291e-06, "loss": 0.0943, "step": 910 }, { "epoch": 0.5898468976631749, "grad_norm": 2.0672824382781982, "learning_rate": 8.888266878340979e-06, "loss": 0.0984, "step": 915 }, { "epoch": 0.5930701047542305, "grad_norm": 1.3846348524093628, "learning_rate": 8.870513719922873e-06, "loss": 0.1047, "step": 920 }, { "epoch": 0.5962933118452861, "grad_norm": 1.162631630897522, "learning_rate": 8.85263796233026e-06, "loss": 0.1067, "step": 925 }, { "epoch": 0.5995165189363416, "grad_norm": 1.2754584550857544, "learning_rate": 8.834640171780777e-06, "loss": 0.0959, "step": 930 }, { "epoch": 0.6027397260273972, "grad_norm": 1.383272409439087, "learning_rate": 8.816520918357473e-06, "loss": 0.1063, "step": 935 }, { "epoch": 0.6059629331184528, "grad_norm": 1.6662758588790894, "learning_rate": 8.798280775990751e-06, "loss": 0.1024, "step": 940 }, { "epoch": 0.6091861402095085, "grad_norm": 1.9114925861358643, "learning_rate": 8.7799203224402e-06, "loss": 0.1054, "step": 945 }, { "epoch": 0.6124093473005641, "grad_norm": 1.4166219234466553, "learning_rate": 8.761440139276279e-06, "loss": 0.1077, "step": 950 }, { "epoch": 0.6156325543916197, "grad_norm": 1.9428759813308716, "learning_rate": 8.742840811861901e-06, "loss": 0.1044, "step": 955 }, { "epoch": 0.6188557614826753, "grad_norm": 1.5213385820388794, "learning_rate": 8.724122929333904e-06, "loss": 0.1128, "step": 960 }, { "epoch": 0.6220789685737309, "grad_norm": 1.2538460493087769, "learning_rate": 8.705287084584369e-06, "loss": 0.0963, "step": 965 }, { "epoch": 0.6253021756647864, "grad_norm": 0.9515339136123657, "learning_rate": 8.68633387424185e-06, "loss": 0.104, "step": 970 }, { "epoch": 0.628525382755842, "grad_norm": 1.6763734817504883, "learning_rate": 8.667263898652485e-06, "loss": 0.0975, "step": 975 }, { "epoch": 0.6317485898468976, "grad_norm": 1.9303100109100342, "learning_rate": 8.648077761860962e-06, "loss": 0.0936, "step": 980 }, { "epoch": 0.6349717969379532, "grad_norm": 1.3960262537002563, "learning_rate": 8.6287760715914e-06, "loss": 0.1018, "step": 985 }, { "epoch": 0.6381950040290089, "grad_norm": 1.3512085676193237, "learning_rate": 8.609359439228092e-06, "loss": 0.1051, "step": 990 }, { "epoch": 0.6414182111200645, "grad_norm": 1.3363274335861206, "learning_rate": 8.589828479796138e-06, "loss": 0.1026, "step": 995 }, { "epoch": 0.6446414182111201, "grad_norm": 1.2143882513046265, "learning_rate": 8.570183811941973e-06, "loss": 0.0997, "step": 1000 }, { "epoch": 0.6478646253021757, "grad_norm": 1.5231553316116333, "learning_rate": 8.550426057913758e-06, "loss": 0.0971, "step": 1005 }, { "epoch": 0.6510878323932313, "grad_norm": 1.5069528818130493, "learning_rate": 8.53055584354169e-06, "loss": 0.0968, "step": 1010 }, { "epoch": 0.6543110394842868, "grad_norm": 1.9453926086425781, "learning_rate": 8.510573798218153e-06, "loss": 0.1056, "step": 1015 }, { "epoch": 0.6575342465753424, "grad_norm": 1.6074435710906982, "learning_rate": 8.490480554877804e-06, "loss": 0.1005, "step": 1020 }, { "epoch": 0.660757453666398, "grad_norm": 1.4784128665924072, "learning_rate": 8.47027674997751e-06, "loss": 0.091, "step": 1025 }, { "epoch": 0.6639806607574537, "grad_norm": 1.3281731605529785, "learning_rate": 8.449963023476198e-06, "loss": 0.1007, "step": 1030 }, { "epoch": 0.6672038678485093, "grad_norm": 1.3868046998977661, "learning_rate": 8.429540018814581e-06, "loss": 0.1023, "step": 1035 }, { "epoch": 0.6704270749395649, "grad_norm": 1.4011777639389038, "learning_rate": 8.409008382894771e-06, "loss": 0.0972, "step": 1040 }, { "epoch": 0.6736502820306205, "grad_norm": 1.2864456176757812, "learning_rate": 8.388368766059798e-06, "loss": 0.1024, "step": 1045 }, { "epoch": 0.6768734891216761, "grad_norm": 1.8163318634033203, "learning_rate": 8.367621822073004e-06, "loss": 0.0942, "step": 1050 }, { "epoch": 0.6800966962127317, "grad_norm": 1.1266424655914307, "learning_rate": 8.346768208097339e-06, "loss": 0.0997, "step": 1055 }, { "epoch": 0.6833199033037872, "grad_norm": 1.268912672996521, "learning_rate": 8.325808584674539e-06, "loss": 0.0954, "step": 1060 }, { "epoch": 0.6865431103948428, "grad_norm": 1.9696354866027832, "learning_rate": 8.304743615704207e-06, "loss": 0.1056, "step": 1065 }, { "epoch": 0.6897663174858985, "grad_norm": 1.47492253780365, "learning_rate": 8.283573968422792e-06, "loss": 0.103, "step": 1070 }, { "epoch": 0.6929895245769541, "grad_norm": 1.654740810394287, "learning_rate": 8.262300313382431e-06, "loss": 0.0951, "step": 1075 }, { "epoch": 0.6962127316680097, "grad_norm": 1.3860782384872437, "learning_rate": 8.240923324429742e-06, "loss": 0.1013, "step": 1080 }, { "epoch": 0.6994359387590653, "grad_norm": 1.9896957874298096, "learning_rate": 8.219443678684448e-06, "loss": 0.095, "step": 1085 }, { "epoch": 0.7026591458501209, "grad_norm": 1.5903962850570679, "learning_rate": 8.197862056517954e-06, "loss": 0.1025, "step": 1090 }, { "epoch": 0.7058823529411765, "grad_norm": 1.1547088623046875, "learning_rate": 8.176179141531774e-06, "loss": 0.1011, "step": 1095 }, { "epoch": 0.709105560032232, "grad_norm": 1.4623602628707886, "learning_rate": 8.154395620535899e-06, "loss": 0.1015, "step": 1100 }, { "epoch": 0.7123287671232876, "grad_norm": 1.5208735466003418, "learning_rate": 8.132512183527027e-06, "loss": 0.1018, "step": 1105 }, { "epoch": 0.7155519742143432, "grad_norm": 1.1935063600540161, "learning_rate": 8.110529523666712e-06, "loss": 0.1022, "step": 1110 }, { "epoch": 0.7187751813053989, "grad_norm": 1.2939246892929077, "learning_rate": 8.088448337259416e-06, "loss": 0.1049, "step": 1115 }, { "epoch": 0.7219983883964545, "grad_norm": 1.3576562404632568, "learning_rate": 8.066269323730435e-06, "loss": 0.0964, "step": 1120 }, { "epoch": 0.7252215954875101, "grad_norm": 1.2397035360336304, "learning_rate": 8.043993185603764e-06, "loss": 0.0949, "step": 1125 }, { "epoch": 0.7284448025785657, "grad_norm": 1.6794919967651367, "learning_rate": 8.021620628479833e-06, "loss": 0.0941, "step": 1130 }, { "epoch": 0.7316680096696213, "grad_norm": 1.9329454898834229, "learning_rate": 7.99915236101316e-06, "loss": 0.0929, "step": 1135 }, { "epoch": 0.7348912167606769, "grad_norm": 1.2772644758224487, "learning_rate": 7.976589094889903e-06, "loss": 0.1004, "step": 1140 }, { "epoch": 0.7381144238517324, "grad_norm": 1.1697113513946533, "learning_rate": 7.953931544805324e-06, "loss": 0.0905, "step": 1145 }, { "epoch": 0.741337630942788, "grad_norm": 1.7702858448028564, "learning_rate": 7.931180428441135e-06, "loss": 0.1052, "step": 1150 }, { "epoch": 0.7445608380338437, "grad_norm": 1.3432146310806274, "learning_rate": 7.908336466442786e-06, "loss": 0.0919, "step": 1155 }, { "epoch": 0.7477840451248993, "grad_norm": 1.2473376989364624, "learning_rate": 7.885400382396621e-06, "loss": 0.0961, "step": 1160 }, { "epoch": 0.7510072522159549, "grad_norm": 2.3682289123535156, "learning_rate": 7.862372902806971e-06, "loss": 0.1042, "step": 1165 }, { "epoch": 0.7542304593070105, "grad_norm": 1.86495041847229, "learning_rate": 7.839254757073133e-06, "loss": 0.1009, "step": 1170 }, { "epoch": 0.7574536663980661, "grad_norm": 1.7069085836410522, "learning_rate": 7.816046677466269e-06, "loss": 0.1007, "step": 1175 }, { "epoch": 0.7606768734891217, "grad_norm": 1.8137654066085815, "learning_rate": 7.792749399106214e-06, "loss": 0.0927, "step": 1180 }, { "epoch": 0.7639000805801772, "grad_norm": 1.1234313249588013, "learning_rate": 7.769363659938186e-06, "loss": 0.0931, "step": 1185 }, { "epoch": 0.7671232876712328, "grad_norm": 1.3791865110397339, "learning_rate": 7.745890200709416e-06, "loss": 0.0973, "step": 1190 }, { "epoch": 0.7703464947622884, "grad_norm": 1.2439701557159424, "learning_rate": 7.722329764945682e-06, "loss": 0.1004, "step": 1195 }, { "epoch": 0.7735697018533441, "grad_norm": 1.2246594429016113, "learning_rate": 7.698683098927756e-06, "loss": 0.1078, "step": 1200 }, { "epoch": 0.7767929089443997, "grad_norm": 1.09011709690094, "learning_rate": 7.674950951667773e-06, "loss": 0.0939, "step": 1205 }, { "epoch": 0.7800161160354553, "grad_norm": 1.0550169944763184, "learning_rate": 7.651134074885495e-06, "loss": 0.0982, "step": 1210 }, { "epoch": 0.7832393231265109, "grad_norm": 1.935786247253418, "learning_rate": 7.627233222984514e-06, "loss": 0.0973, "step": 1215 }, { "epoch": 0.7864625302175665, "grad_norm": 1.7208002805709839, "learning_rate": 7.603249153028335e-06, "loss": 0.098, "step": 1220 }, { "epoch": 0.7896857373086221, "grad_norm": 1.3723320960998535, "learning_rate": 7.579182624716422e-06, "loss": 0.1035, "step": 1225 }, { "epoch": 0.7929089443996776, "grad_norm": 1.1093083620071411, "learning_rate": 7.555034400360115e-06, "loss": 0.0906, "step": 1230 }, { "epoch": 0.7961321514907332, "grad_norm": 1.0704550743103027, "learning_rate": 7.530805244858492e-06, "loss": 0.0937, "step": 1235 }, { "epoch": 0.7993553585817889, "grad_norm": 1.111206293106079, "learning_rate": 7.506495925674135e-06, "loss": 0.11, "step": 1240 }, { "epoch": 0.8025785656728445, "grad_norm": 1.0680865049362183, "learning_rate": 7.482107212808829e-06, "loss": 0.0978, "step": 1245 }, { "epoch": 0.8058017727639001, "grad_norm": 1.2233189344406128, "learning_rate": 7.457639878779164e-06, "loss": 0.0901, "step": 1250 }, { "epoch": 0.8090249798549557, "grad_norm": 1.1432982683181763, "learning_rate": 7.433094698592069e-06, "loss": 0.1055, "step": 1255 }, { "epoch": 0.8122481869460113, "grad_norm": 1.0968226194381714, "learning_rate": 7.4084724497202675e-06, "loss": 0.0893, "step": 1260 }, { "epoch": 0.8154713940370669, "grad_norm": 1.416164755821228, "learning_rate": 7.383773912077639e-06, "loss": 0.1048, "step": 1265 }, { "epoch": 0.8186946011281225, "grad_norm": 1.4907201528549194, "learning_rate": 7.3589998679945274e-06, "loss": 0.0957, "step": 1270 }, { "epoch": 0.821917808219178, "grad_norm": 1.1113173961639404, "learning_rate": 7.3341511021929565e-06, "loss": 0.0891, "step": 1275 }, { "epoch": 0.8251410153102336, "grad_norm": 1.5791008472442627, "learning_rate": 7.30922840176177e-06, "loss": 0.0938, "step": 1280 }, { "epoch": 0.8283642224012893, "grad_norm": 1.1519147157669067, "learning_rate": 7.2842325561317064e-06, "loss": 0.0937, "step": 1285 }, { "epoch": 0.8315874294923449, "grad_norm": 1.0236375331878662, "learning_rate": 7.259164357050389e-06, "loss": 0.0859, "step": 1290 }, { "epoch": 0.8348106365834005, "grad_norm": 1.629459023475647, "learning_rate": 7.234024598557248e-06, "loss": 0.0902, "step": 1295 }, { "epoch": 0.8380338436744561, "grad_norm": 1.171321988105774, "learning_rate": 7.208814076958374e-06, "loss": 0.0887, "step": 1300 }, { "epoch": 0.8412570507655117, "grad_norm": 1.2966508865356445, "learning_rate": 7.183533590801286e-06, "loss": 0.0958, "step": 1305 }, { "epoch": 0.8444802578565673, "grad_norm": 1.7354567050933838, "learning_rate": 7.158183940849644e-06, "loss": 0.0967, "step": 1310 }, { "epoch": 0.8477034649476228, "grad_norm": 2.0169782638549805, "learning_rate": 7.132765930057886e-06, "loss": 0.0972, "step": 1315 }, { "epoch": 0.8509266720386784, "grad_norm": 1.147071123123169, "learning_rate": 7.107280363545785e-06, "loss": 0.0976, "step": 1320 }, { "epoch": 0.8541498791297341, "grad_norm": 1.391392707824707, "learning_rate": 7.08172804857296e-06, "loss": 0.0899, "step": 1325 }, { "epoch": 0.8573730862207897, "grad_norm": 1.1355257034301758, "learning_rate": 7.056109794513292e-06, "loss": 0.1036, "step": 1330 }, { "epoch": 0.8605962933118453, "grad_norm": 0.9366469979286194, "learning_rate": 7.030426412829296e-06, "loss": 0.088, "step": 1335 }, { "epoch": 0.8638195004029009, "grad_norm": 1.1631442308425903, "learning_rate": 7.004678717046419e-06, "loss": 0.0891, "step": 1340 }, { "epoch": 0.8670427074939565, "grad_norm": 1.429606318473816, "learning_rate": 6.978867522727264e-06, "loss": 0.1039, "step": 1345 }, { "epoch": 0.8702659145850121, "grad_norm": 1.1730060577392578, "learning_rate": 6.952993647445762e-06, "loss": 0.0931, "step": 1350 }, { "epoch": 0.8734891216760677, "grad_norm": 1.1138707399368286, "learning_rate": 6.927057910761273e-06, "loss": 0.0982, "step": 1355 }, { "epoch": 0.8767123287671232, "grad_norm": 1.2846705913543701, "learning_rate": 6.9010611341926286e-06, "loss": 0.0937, "step": 1360 }, { "epoch": 0.8799355358581789, "grad_norm": 1.1566565036773682, "learning_rate": 6.875004141192108e-06, "loss": 0.092, "step": 1365 }, { "epoch": 0.8831587429492345, "grad_norm": 1.17435622215271, "learning_rate": 6.848887757119358e-06, "loss": 0.0996, "step": 1370 }, { "epoch": 0.8863819500402901, "grad_norm": 1.250361442565918, "learning_rate": 6.822712809215247e-06, "loss": 0.099, "step": 1375 }, { "epoch": 0.8896051571313457, "grad_norm": 1.0037554502487183, "learning_rate": 6.7964801265756616e-06, "loss": 0.0873, "step": 1380 }, { "epoch": 0.8928283642224013, "grad_norm": 1.2579954862594604, "learning_rate": 6.770190540125246e-06, "loss": 0.0898, "step": 1385 }, { "epoch": 0.8960515713134569, "grad_norm": 1.5266188383102417, "learning_rate": 6.74384488259108e-06, "loss": 0.094, "step": 1390 }, { "epoch": 0.8992747784045125, "grad_norm": 1.5171687602996826, "learning_rate": 6.71744398847631e-06, "loss": 0.0924, "step": 1395 }, { "epoch": 0.9024979854955681, "grad_norm": 1.0802000761032104, "learning_rate": 6.690988694033707e-06, "loss": 0.0941, "step": 1400 }, { "epoch": 0.9057211925866236, "grad_norm": 1.210917353630066, "learning_rate": 6.664479837239182e-06, "loss": 0.0885, "step": 1405 }, { "epoch": 0.9089443996776793, "grad_norm": 0.9679683446884155, "learning_rate": 6.63791825776524e-06, "loss": 0.0929, "step": 1410 }, { "epoch": 0.9121676067687349, "grad_norm": 1.1072417497634888, "learning_rate": 6.611304796954391e-06, "loss": 0.0907, "step": 1415 }, { "epoch": 0.9153908138597905, "grad_norm": 1.1007254123687744, "learning_rate": 6.58464029779249e-06, "loss": 0.0834, "step": 1420 }, { "epoch": 0.9186140209508461, "grad_norm": 1.3668423891067505, "learning_rate": 6.557925604882045e-06, "loss": 0.0996, "step": 1425 }, { "epoch": 0.9218372280419017, "grad_norm": 1.0063837766647339, "learning_rate": 6.531161564415455e-06, "loss": 0.0967, "step": 1430 }, { "epoch": 0.9250604351329573, "grad_norm": 2.1310322284698486, "learning_rate": 6.504349024148215e-06, "loss": 0.0891, "step": 1435 }, { "epoch": 0.9282836422240129, "grad_norm": 1.7086719274520874, "learning_rate": 6.4774888333720565e-06, "loss": 0.091, "step": 1440 }, { "epoch": 0.9315068493150684, "grad_norm": 1.2807854413986206, "learning_rate": 6.450581842888051e-06, "loss": 0.0945, "step": 1445 }, { "epoch": 0.934730056406124, "grad_norm": 1.2100774049758911, "learning_rate": 6.423628904979655e-06, "loss": 0.0927, "step": 1450 }, { "epoch": 0.9379532634971797, "grad_norm": 1.079419732093811, "learning_rate": 6.396630873385723e-06, "loss": 0.0928, "step": 1455 }, { "epoch": 0.9411764705882353, "grad_norm": 1.2331191301345825, "learning_rate": 6.369588603273453e-06, "loss": 0.0902, "step": 1460 }, { "epoch": 0.9443996776792909, "grad_norm": 1.521287441253662, "learning_rate": 6.342502951211314e-06, "loss": 0.0906, "step": 1465 }, { "epoch": 0.9476228847703465, "grad_norm": 1.2149112224578857, "learning_rate": 6.315374775141897e-06, "loss": 0.088, "step": 1470 }, { "epoch": 0.9508460918614021, "grad_norm": 1.041596531867981, "learning_rate": 6.288204934354753e-06, "loss": 0.0903, "step": 1475 }, { "epoch": 0.9540692989524577, "grad_norm": 1.4294019937515259, "learning_rate": 6.26099428945917e-06, "loss": 0.0827, "step": 1480 }, { "epoch": 0.9572925060435133, "grad_norm": 1.353190302848816, "learning_rate": 6.2337437023569105e-06, "loss": 0.0892, "step": 1485 }, { "epoch": 0.9605157131345688, "grad_norm": 1.0870195627212524, "learning_rate": 6.206454036214914e-06, "loss": 0.1028, "step": 1490 }, { "epoch": 0.9637389202256245, "grad_norm": 0.9204868674278259, "learning_rate": 6.179126155437957e-06, "loss": 0.0929, "step": 1495 }, { "epoch": 0.9669621273166801, "grad_norm": 1.2136569023132324, "learning_rate": 6.151760925641268e-06, "loss": 0.0871, "step": 1500 }, { "epoch": 0.9701853344077357, "grad_norm": 1.2971879243850708, "learning_rate": 6.124359213623114e-06, "loss": 0.0979, "step": 1505 }, { "epoch": 0.9734085414987913, "grad_norm": 1.1342357397079468, "learning_rate": 6.096921887337342e-06, "loss": 0.0821, "step": 1510 }, { "epoch": 0.9766317485898469, "grad_norm": 1.1401841640472412, "learning_rate": 6.0694498158658886e-06, "loss": 0.0853, "step": 1515 }, { "epoch": 0.9798549556809025, "grad_norm": 1.0659375190734863, "learning_rate": 6.041943869391248e-06, "loss": 0.092, "step": 1520 }, { "epoch": 0.9830781627719581, "grad_norm": 1.2414722442626953, "learning_rate": 6.0144049191689095e-06, "loss": 0.0943, "step": 1525 }, { "epoch": 0.9863013698630136, "grad_norm": 1.3391859531402588, "learning_rate": 5.9868338374997645e-06, "loss": 0.0846, "step": 1530 }, { "epoch": 0.9895245769540693, "grad_norm": 1.275088906288147, "learning_rate": 5.959231497702473e-06, "loss": 0.0976, "step": 1535 }, { "epoch": 0.9927477840451249, "grad_norm": 1.732020616531372, "learning_rate": 5.9315987740857995e-06, "loss": 0.0906, "step": 1540 }, { "epoch": 0.9959709911361805, "grad_norm": 1.2046090364456177, "learning_rate": 5.903936541920924e-06, "loss": 0.092, "step": 1545 }, { "epoch": 0.9991941982272361, "grad_norm": 0.9802199006080627, "learning_rate": 5.876245677413712e-06, "loss": 0.0815, "step": 1550 } ], "logging_steps": 5, "max_steps": 3102, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 776, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.912061943183573e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }