| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.8458242699887943, |
| "eval_steps": 500, |
| "global_step": 7000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001318304660206974, |
| "grad_norm": 4.59375, |
| "learning_rate": 0.0002, |
| "loss": 1.9624, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.002636609320413948, |
| "grad_norm": 1.7421875, |
| "learning_rate": 0.00019986805647183008, |
| "loss": 0.6513, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.003954913980620921, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019973611294366012, |
| "loss": 0.1146, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.005273218640827896, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.0001996041694154902, |
| "loss": 0.0529, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.006591523301034869, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.00019947222588732023, |
| "loss": 0.1214, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.007909827961241843, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.0001993402823591503, |
| "loss": 0.0919, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.009228132621448816, |
| "grad_norm": 0.06201171875, |
| "learning_rate": 0.00019920833883098034, |
| "loss": 0.09, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.010546437281655791, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.0001990763953028104, |
| "loss": 0.1945, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.011864741941862765, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.00019894445177464048, |
| "loss": 0.1259, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.013183046602069738, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.00019881250824647052, |
| "loss": 0.027, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.014501351262276712, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.00019868056471830057, |
| "loss": 0.1068, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.015819655922483685, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.00019854862119013064, |
| "loss": 0.0542, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.01713796058269066, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 0.00019841667766196068, |
| "loss": 0.0901, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.018456265242897632, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 0.00019828473413379075, |
| "loss": 0.0091, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.019774569903104607, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 0.0001981527906056208, |
| "loss": 0.0744, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.021092874563311582, |
| "grad_norm": 0.65234375, |
| "learning_rate": 0.00019802084707745086, |
| "loss": 0.1108, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.022411179223518554, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0001978889035492809, |
| "loss": 0.0446, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.02372948388372553, |
| "grad_norm": 0.1787109375, |
| "learning_rate": 0.00019775696002111097, |
| "loss": 0.0982, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0250477885439325, |
| "grad_norm": 0.490234375, |
| "learning_rate": 0.00019762501649294104, |
| "loss": 0.1035, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.026366093204139476, |
| "grad_norm": 0.12158203125, |
| "learning_rate": 0.00019749307296477108, |
| "loss": 0.0401, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02768439786434645, |
| "grad_norm": 0.16015625, |
| "learning_rate": 0.00019736112943660115, |
| "loss": 0.0309, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.029002702524553423, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.0001972291859084312, |
| "loss": 0.1032, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0303210071847604, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.00019709724238026126, |
| "loss": 0.0811, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.03163931184496737, |
| "grad_norm": 0.177734375, |
| "learning_rate": 0.00019696529885209133, |
| "loss": 0.0258, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03295761650517435, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.00019683335532392137, |
| "loss": 0.0437, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.03427592116538132, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00019670141179575144, |
| "loss": 0.0967, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.03559422582558829, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00019656946826758148, |
| "loss": 0.0132, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.036912530485795264, |
| "grad_norm": 0.66015625, |
| "learning_rate": 0.00019643752473941155, |
| "loss": 0.0396, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03823083514600224, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0001963055812112416, |
| "loss": 0.0449, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.039549139806209214, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.00019617363768307166, |
| "loss": 0.1196, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.040867444466416186, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.0001960416941549017, |
| "loss": 0.0588, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.042185749126623165, |
| "grad_norm": 0.06005859375, |
| "learning_rate": 0.00019590975062673175, |
| "loss": 0.0234, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.04350405378683014, |
| "grad_norm": 0.4921875, |
| "learning_rate": 0.00019577780709856182, |
| "loss": 0.0916, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.04482235844703711, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.0001956458635703919, |
| "loss": 0.0271, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.04614066310724409, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.00019551392004222193, |
| "loss": 0.0175, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.04745896776745106, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 0.000195381976514052, |
| "loss": 0.0356, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.04877727242765803, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 0.00019525003298588204, |
| "loss": 0.0057, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.050095577087865, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.0001951180894577121, |
| "loss": 0.0082, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.05141388174807198, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 0.00019498614592954215, |
| "loss": 0.0178, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.05273218640827895, |
| "grad_norm": 0.0390625, |
| "learning_rate": 0.00019485420240137222, |
| "loss": 0.0789, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.054050491068485924, |
| "grad_norm": 0.5625, |
| "learning_rate": 0.0001947222588732023, |
| "loss": 0.0645, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.0553687957286929, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.00019459031534503233, |
| "loss": 0.116, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.056687100388899875, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.0001944583718168624, |
| "loss": 0.0516, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.058005405049106847, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.00019432642828869244, |
| "loss": 0.1019, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.059323709709313825, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 0.0001941944847605225, |
| "loss": 0.0529, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.0606420143695208, |
| "grad_norm": 0.4921875, |
| "learning_rate": 0.00019406254123235256, |
| "loss": 0.0368, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.06196031902972777, |
| "grad_norm": 0.054443359375, |
| "learning_rate": 0.00019393059770418262, |
| "loss": 0.037, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.06327862368993474, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 0.0001937986541760127, |
| "loss": 0.0324, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.06459692835014172, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00019366671064784274, |
| "loss": 0.0334, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.0659152330103487, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.0001935347671196728, |
| "loss": 0.0671, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.06723353767055566, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00019340282359150285, |
| "loss": 0.1559, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.06855184233076264, |
| "grad_norm": 0.7734375, |
| "learning_rate": 0.0001932708800633329, |
| "loss": 0.0198, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.06987014699096962, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.00019313893653516296, |
| "loss": 0.0151, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.07118845165117658, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 0.000193006993006993, |
| "loss": 0.0269, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.07250675631138356, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00019287504947882307, |
| "loss": 0.0565, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.07382506097159053, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.0001927431059506531, |
| "loss": 0.0942, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0751433656317975, |
| "grad_norm": 0.392578125, |
| "learning_rate": 0.00019261116242248318, |
| "loss": 0.0061, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.07646167029200449, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019247921889431325, |
| "loss": 0.0497, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.07777997495221145, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.0001923472753661433, |
| "loss": 0.0573, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.07909827961241843, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00019221533183797336, |
| "loss": 0.0528, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.08041658427262541, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.0001920833883098034, |
| "loss": 0.0506, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.08173488893283237, |
| "grad_norm": 0.08203125, |
| "learning_rate": 0.00019195144478163347, |
| "loss": 0.0307, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.08305319359303935, |
| "grad_norm": 0.111328125, |
| "learning_rate": 0.00019181950125346354, |
| "loss": 0.0365, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.08437149825324633, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00019168755772529358, |
| "loss": 0.0447, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.0856898029134533, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.00019155561419712365, |
| "loss": 0.0605, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.08700810757366027, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.0001914236706689537, |
| "loss": 0.0846, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.08832641223386725, |
| "grad_norm": 0.1494140625, |
| "learning_rate": 0.00019129172714078376, |
| "loss": 0.0713, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.08964471689407422, |
| "grad_norm": 0.1669921875, |
| "learning_rate": 0.0001911597836126138, |
| "loss": 0.0826, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.0909630215542812, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00019102784008444388, |
| "loss": 0.0441, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.09228132621448817, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00019089589655627395, |
| "loss": 0.1378, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.09359963087469514, |
| "grad_norm": 3.0625, |
| "learning_rate": 0.00019076395302810396, |
| "loss": 0.1552, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.09491793553490212, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.00019063200949993403, |
| "loss": 0.0458, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.0962362401951091, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.0001905000659717641, |
| "loss": 0.0312, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.09755454485531606, |
| "grad_norm": 0.0218505859375, |
| "learning_rate": 0.00019036812244359414, |
| "loss": 0.0247, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.09887284951552304, |
| "grad_norm": 0.064453125, |
| "learning_rate": 0.0001902361789154242, |
| "loss": 0.054, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.10019115417573, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 0.00019010423538725425, |
| "loss": 0.0023, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.10150945883593698, |
| "grad_norm": 0.0361328125, |
| "learning_rate": 0.00018997229185908432, |
| "loss": 0.0884, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.10282776349614396, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018984034833091436, |
| "loss": 0.0506, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.10414606815635093, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.00018970840480274443, |
| "loss": 0.1123, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.1054643728165579, |
| "grad_norm": 0.6953125, |
| "learning_rate": 0.0001895764612745745, |
| "loss": 0.0597, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.10678267747676488, |
| "grad_norm": 0.18359375, |
| "learning_rate": 0.00018944451774640454, |
| "loss": 0.0138, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.10810098213697185, |
| "grad_norm": 0.0272216796875, |
| "learning_rate": 0.0001893125742182346, |
| "loss": 0.0249, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.10941928679717883, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 0.00018918063069006466, |
| "loss": 0.0084, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.1107375914573858, |
| "grad_norm": 0.54296875, |
| "learning_rate": 0.00018904868716189472, |
| "loss": 0.0541, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.11205589611759277, |
| "grad_norm": 0.74609375, |
| "learning_rate": 0.00018891674363372477, |
| "loss": 0.007, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.11337420077779975, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 0.00018878480010555484, |
| "loss": 0.0875, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.11469250543800673, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.0001886528565773849, |
| "loss": 0.1207, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.11601081009821369, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00018852091304921495, |
| "loss": 0.1143, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.11732911475842067, |
| "grad_norm": 0.6484375, |
| "learning_rate": 0.00018838896952104502, |
| "loss": 0.0393, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.11864741941862765, |
| "grad_norm": 0.1552734375, |
| "learning_rate": 0.00018825702599287506, |
| "loss": 0.02, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.11996572407883462, |
| "grad_norm": 0.486328125, |
| "learning_rate": 0.0001881250824647051, |
| "loss": 0.0891, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.1212840287390416, |
| "grad_norm": 1.0, |
| "learning_rate": 0.00018799313893653517, |
| "loss": 0.0469, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.12260233339924857, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.0001878611954083652, |
| "loss": 0.019, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.12392063805945554, |
| "grad_norm": 0.03857421875, |
| "learning_rate": 0.00018772925188019528, |
| "loss": 0.007, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.12523894271966252, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 0.00018759730835202532, |
| "loss": 0.0039, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.12655724737986948, |
| "grad_norm": 0.014404296875, |
| "learning_rate": 0.0001874653648238554, |
| "loss": 0.0043, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.12787555204007647, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.00018733342129568546, |
| "loss": 0.1326, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.12919385670028344, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.0001872014777675155, |
| "loss": 0.0369, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.1305121613604904, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00018706953423934557, |
| "loss": 0.0395, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.1318304660206974, |
| "grad_norm": 0.083984375, |
| "learning_rate": 0.00018693759071117561, |
| "loss": 0.0284, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1318304660206974, |
| "eval_loss": 0.04542969539761543, |
| "eval_model_preparation_time": 0.0076, |
| "eval_runtime": 457.5293, |
| "eval_samples_per_second": 7.37, |
| "eval_steps_per_second": 3.685, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.13314877068090436, |
| "grad_norm": 0.0291748046875, |
| "learning_rate": 0.00018680564718300568, |
| "loss": 0.0533, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.13446707534111133, |
| "grad_norm": 0.71484375, |
| "learning_rate": 0.00018667370365483575, |
| "loss": 0.0183, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.13578538000131832, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 0.0001865417601266658, |
| "loss": 0.0473, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.13710368466152528, |
| "grad_norm": 0.388671875, |
| "learning_rate": 0.00018640981659849586, |
| "loss": 0.0562, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.13842198932173225, |
| "grad_norm": 0.77734375, |
| "learning_rate": 0.0001862778730703259, |
| "loss": 0.0755, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.13974029398193924, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.00018614592954215598, |
| "loss": 0.0422, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1410585986421462, |
| "grad_norm": 0.48828125, |
| "learning_rate": 0.00018601398601398602, |
| "loss": 0.0882, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.14237690330235317, |
| "grad_norm": 0.16015625, |
| "learning_rate": 0.0001858820424858161, |
| "loss": 0.0131, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.14369520796256013, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.00018575009895764616, |
| "loss": 0.03, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.14501351262276713, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 0.0001856181554294762, |
| "loss": 0.0425, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1463318172829741, |
| "grad_norm": 0.390625, |
| "learning_rate": 0.00018548621190130624, |
| "loss": 0.011, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.14765012194318106, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.0001853542683731363, |
| "loss": 0.0807, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.14896842660338805, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.00018522232484496635, |
| "loss": 0.0278, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.150286731263595, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.00018509038131679642, |
| "loss": 0.0484, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.15160503592380198, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.00018495843778862646, |
| "loss": 0.1277, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.15292334058400897, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.00018482649426045653, |
| "loss": 0.058, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.15424164524421594, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.00018469455073228657, |
| "loss": 0.0259, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.1555599499044229, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00018456260720411664, |
| "loss": 0.113, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.1568782545646299, |
| "grad_norm": 0.12451171875, |
| "learning_rate": 0.0001844306636759467, |
| "loss": 0.0312, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.15819655922483686, |
| "grad_norm": 0.0322265625, |
| "learning_rate": 0.00018429872014777676, |
| "loss": 0.0476, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.15951486388504382, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 0.00018416677661960682, |
| "loss": 0.0232, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.16083316854525082, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.00018403483309143687, |
| "loss": 0.1287, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.16215147320545778, |
| "grad_norm": 0.765625, |
| "learning_rate": 0.00018390288956326694, |
| "loss": 0.0991, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.16346977786566474, |
| "grad_norm": 0.3125, |
| "learning_rate": 0.00018377094603509698, |
| "loss": 0.0247, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.16478808252587174, |
| "grad_norm": 0.37890625, |
| "learning_rate": 0.00018363900250692705, |
| "loss": 0.0632, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.1661063871860787, |
| "grad_norm": 0.1494140625, |
| "learning_rate": 0.00018350705897875712, |
| "loss": 0.0314, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.16742469184628567, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.00018337511545058716, |
| "loss": 0.0425, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.16874299650649266, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.00018324317192241723, |
| "loss": 0.0613, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.17006130116669962, |
| "grad_norm": 0.057373046875, |
| "learning_rate": 0.00018311122839424727, |
| "loss": 0.0569, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.1713796058269066, |
| "grad_norm": 0.001373291015625, |
| "learning_rate": 0.00018297928486607734, |
| "loss": 0.007, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.17269791048711358, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00018284734133790738, |
| "loss": 0.0189, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.17401621514732055, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.00018271539780973742, |
| "loss": 0.0601, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1753345198075275, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.0001825834542815675, |
| "loss": 0.0211, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.1766528244677345, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.00018245151075339753, |
| "loss": 0.0713, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.17797112912794147, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001823195672252276, |
| "loss": 0.0522, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.17928943378814843, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 0.00018218762369705767, |
| "loss": 0.0242, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.18060773844835543, |
| "grad_norm": 0.048095703125, |
| "learning_rate": 0.00018205568016888772, |
| "loss": 0.0129, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.1819260431085624, |
| "grad_norm": 0.04541015625, |
| "learning_rate": 0.00018192373664071778, |
| "loss": 0.0142, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.18324434776876936, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 0.00018179179311254783, |
| "loss": 0.0121, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.18456265242897635, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0001816598495843779, |
| "loss": 0.0163, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.1858809570891833, |
| "grad_norm": 0.185546875, |
| "learning_rate": 0.00018152790605620796, |
| "loss": 0.0203, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.18719926174939028, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.000181395962528038, |
| "loss": 0.1548, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.18851756640959727, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 0.00018126401899986808, |
| "loss": 0.0543, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.18983587106980424, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.00018113207547169812, |
| "loss": 0.0346, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.1911541757300112, |
| "grad_norm": 0.1318359375, |
| "learning_rate": 0.0001810001319435282, |
| "loss": 0.03, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.1924724803902182, |
| "grad_norm": 0.1455078125, |
| "learning_rate": 0.00018086818841535823, |
| "loss": 0.0796, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.19379078505042516, |
| "grad_norm": 0.09814453125, |
| "learning_rate": 0.0001807362448871883, |
| "loss": 0.0662, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.19510908971063212, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.00018060430135901837, |
| "loss": 0.0675, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.19642739437083911, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.0001804723578308484, |
| "loss": 0.0377, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.19774569903104608, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.00018034041430267848, |
| "loss": 0.0174, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.19906400369125304, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00018020847077450852, |
| "loss": 0.0278, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.20038230835146, |
| "grad_norm": 0.8515625, |
| "learning_rate": 0.00018007652724633856, |
| "loss": 0.0113, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.201700613011667, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 0.00017994458371816863, |
| "loss": 0.0589, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.20301891767187397, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 0.00017981264018999867, |
| "loss": 0.0203, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.20433722233208093, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 0.00017968069666182874, |
| "loss": 0.0494, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.20565552699228792, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.00017954875313365879, |
| "loss": 0.0394, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.2069738316524949, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 0.00017941680960548886, |
| "loss": 0.0848, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.20829213631270185, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.00017928486607731892, |
| "loss": 0.0464, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.20961044097290885, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00017915292254914897, |
| "loss": 0.0222, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.2109287456331158, |
| "grad_norm": 0.5390625, |
| "learning_rate": 0.00017902097902097904, |
| "loss": 0.0434, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.21224705029332278, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00017888903549280908, |
| "loss": 0.0222, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.21356535495352977, |
| "grad_norm": 0.0272216796875, |
| "learning_rate": 0.00017875709196463915, |
| "loss": 0.0099, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.21488365961373673, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.0001786251484364692, |
| "loss": 0.0086, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.2162019642739437, |
| "grad_norm": 0.06396484375, |
| "learning_rate": 0.00017849320490829926, |
| "loss": 0.0715, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.2175202689341507, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.00017836126138012933, |
| "loss": 0.0642, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.21883857359435765, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 0.00017822931785195937, |
| "loss": 0.0111, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.22015687825456462, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00017809737432378944, |
| "loss": 0.0518, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.2214751829147716, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 0.00017796543079561948, |
| "loss": 0.0384, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.22279348757497858, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.00017783348726744955, |
| "loss": 0.0204, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.22411179223518554, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.00017770154373927962, |
| "loss": 0.0075, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.22543009689539253, |
| "grad_norm": 0.033203125, |
| "learning_rate": 0.00017756960021110963, |
| "loss": 0.0895, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.2267484015555995, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.0001774376566829397, |
| "loss": 0.1039, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.22806670621580646, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.00017730571315476975, |
| "loss": 0.0125, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.22938501087601346, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.00017717376962659982, |
| "loss": 0.0381, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.23070331553622042, |
| "grad_norm": 0.029052734375, |
| "learning_rate": 0.00017704182609842988, |
| "loss": 0.0434, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.23202162019642739, |
| "grad_norm": 0.43359375, |
| "learning_rate": 0.00017690988257025993, |
| "loss": 0.0799, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.23333992485663438, |
| "grad_norm": 0.04150390625, |
| "learning_rate": 0.00017677793904209, |
| "loss": 0.0692, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.23465822951684134, |
| "grad_norm": 0.435546875, |
| "learning_rate": 0.00017664599551392004, |
| "loss": 0.0544, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2359765341770483, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001765140519857501, |
| "loss": 0.0619, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.2372948388372553, |
| "grad_norm": 0.01263427734375, |
| "learning_rate": 0.00017638210845758018, |
| "loss": 0.0418, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.23861314349746227, |
| "grad_norm": 0.017578125, |
| "learning_rate": 0.00017625016492941022, |
| "loss": 0.0195, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.23993144815766923, |
| "grad_norm": 0.6171875, |
| "learning_rate": 0.0001761182214012403, |
| "loss": 0.067, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.24124975281787622, |
| "grad_norm": 0.59765625, |
| "learning_rate": 0.00017598627787307033, |
| "loss": 0.049, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.2425680574780832, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001758543343449004, |
| "loss": 0.0539, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.24388636213829015, |
| "grad_norm": 0.10302734375, |
| "learning_rate": 0.00017572239081673044, |
| "loss": 0.0725, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.24520466679849715, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.0001755904472885605, |
| "loss": 0.064, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.2465229714587041, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.00017545850376039058, |
| "loss": 0.0271, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.24784127611891107, |
| "grad_norm": 0.01470947265625, |
| "learning_rate": 0.00017532656023222062, |
| "loss": 0.0247, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.24915958077911807, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 0.0001751946167040507, |
| "loss": 0.017, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.25047788543932503, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.00017506267317588073, |
| "loss": 0.0254, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.251796190099532, |
| "grad_norm": 0.412109375, |
| "learning_rate": 0.00017493072964771078, |
| "loss": 0.0186, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.25311449475973896, |
| "grad_norm": 0.66796875, |
| "learning_rate": 0.00017479878611954084, |
| "loss": 0.0617, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.25443279941994595, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.00017466684259137089, |
| "loss": 0.0173, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.25575110408015295, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.00017453489906320096, |
| "loss": 0.0512, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.2570694087403599, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.000174402955535031, |
| "loss": 0.0361, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.2583877134005669, |
| "grad_norm": 0.423828125, |
| "learning_rate": 0.00017427101200686107, |
| "loss": 0.0175, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.25970601806077387, |
| "grad_norm": 0.77734375, |
| "learning_rate": 0.00017413906847869114, |
| "loss": 0.0139, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.2610243227209808, |
| "grad_norm": 0.515625, |
| "learning_rate": 0.00017400712495052118, |
| "loss": 0.0948, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.2623426273811878, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00017387518142235125, |
| "loss": 0.0406, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.2636609320413948, |
| "grad_norm": 0.058837890625, |
| "learning_rate": 0.0001737432378941813, |
| "loss": 0.1011, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2636609320413948, |
| "eval_loss": 0.045552924275398254, |
| "eval_model_preparation_time": 0.0076, |
| "eval_runtime": 457.6113, |
| "eval_samples_per_second": 7.369, |
| "eval_steps_per_second": 3.684, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.26497923670160173, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.00017361129436601136, |
| "loss": 0.0711, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.2662975413618087, |
| "grad_norm": 0.0208740234375, |
| "learning_rate": 0.00017347935083784143, |
| "loss": 0.0218, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.2676158460220157, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.00017334740730967147, |
| "loss": 0.0301, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.26893415068222265, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00017321546378150154, |
| "loss": 0.0721, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.27025245534242964, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.00017308352025333158, |
| "loss": 0.0363, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.27157076000263664, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.00017295157672516165, |
| "loss": 0.0313, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.2728890646628436, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 0.0001728196331969917, |
| "loss": 0.0385, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.27420736932305056, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 0.00017268768966882176, |
| "loss": 0.0405, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.27552567398325756, |
| "grad_norm": 0.484375, |
| "learning_rate": 0.00017255574614065183, |
| "loss": 0.0616, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.2768439786434645, |
| "grad_norm": 0.0908203125, |
| "learning_rate": 0.00017242380261248185, |
| "loss": 0.0057, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.2781622833036715, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 0.00017229185908431192, |
| "loss": 0.0417, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.2794805879638785, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.00017215991555614196, |
| "loss": 0.0346, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.2807988926240854, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 0.00017202797202797203, |
| "loss": 0.0295, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.2821171972842924, |
| "grad_norm": 0.490234375, |
| "learning_rate": 0.0001718960284998021, |
| "loss": 0.0448, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.28343550194449935, |
| "grad_norm": 0.004241943359375, |
| "learning_rate": 0.00017176408497163214, |
| "loss": 0.0051, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.28475380660470634, |
| "grad_norm": 0.01904296875, |
| "learning_rate": 0.0001716321414434622, |
| "loss": 0.0894, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.28607211126491333, |
| "grad_norm": 0.83984375, |
| "learning_rate": 0.00017150019791529225, |
| "loss": 0.0288, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.28739041592512027, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.00017136825438712232, |
| "loss": 0.0222, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.28870872058532726, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.0001712363108589524, |
| "loss": 0.0444, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.29002702524553425, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.00017110436733078243, |
| "loss": 0.0828, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2913453299057412, |
| "grad_norm": 0.04052734375, |
| "learning_rate": 0.0001709724238026125, |
| "loss": 0.0725, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.2926636345659482, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.00017084048027444254, |
| "loss": 0.0204, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.2939819392261552, |
| "grad_norm": 0.67578125, |
| "learning_rate": 0.0001707085367462726, |
| "loss": 0.0503, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.2953002438863621, |
| "grad_norm": 0.0059814453125, |
| "learning_rate": 0.00017057659321810265, |
| "loss": 0.0144, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.2966185485465691, |
| "grad_norm": 0.0269775390625, |
| "learning_rate": 0.00017044464968993272, |
| "loss": 0.0044, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.2979368532067761, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 0.0001703127061617628, |
| "loss": 0.013, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.29925515786698303, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.00017018076263359283, |
| "loss": 0.0245, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.30057346252719, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.0001700488191054229, |
| "loss": 0.0247, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.301891767187397, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.00016991687557725294, |
| "loss": 0.0402, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.30321007184760396, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.000169784932049083, |
| "loss": 0.0071, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.30452837650781095, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.00016965298852091306, |
| "loss": 0.0177, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.30584668116801794, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 0.0001695210449927431, |
| "loss": 0.0029, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.3071649858282249, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.00016938910146457317, |
| "loss": 0.0262, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.30848329048843187, |
| "grad_norm": 0.002655029296875, |
| "learning_rate": 0.0001692571579364032, |
| "loss": 0.0346, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.30980159514863886, |
| "grad_norm": 0.1748046875, |
| "learning_rate": 0.00016912521440823328, |
| "loss": 0.0494, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.3111198998088458, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016899327088006335, |
| "loss": 0.0603, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3124382044690528, |
| "grad_norm": 0.1572265625, |
| "learning_rate": 0.0001688613273518934, |
| "loss": 0.0366, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.3137565091292598, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 0.00016872938382372346, |
| "loss": 0.0678, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.3150748137894667, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.0001685974402955535, |
| "loss": 0.0359, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.3163931184496737, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.00016846549676738357, |
| "loss": 0.1099, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.3177114231098807, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.00016833355323921364, |
| "loss": 0.0343, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.31902972777008765, |
| "grad_norm": 0.0302734375, |
| "learning_rate": 0.00016820160971104368, |
| "loss": 0.0138, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.32034803243029464, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 0.00016806966618287375, |
| "loss": 0.0202, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.32166633709050163, |
| "grad_norm": 0.1474609375, |
| "learning_rate": 0.0001679377226547038, |
| "loss": 0.0442, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.32298464175070857, |
| "grad_norm": 0.049072265625, |
| "learning_rate": 0.00016780577912653386, |
| "loss": 0.0375, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.32430294641091556, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 0.0001676738355983639, |
| "loss": 0.01, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.32562125107112255, |
| "grad_norm": 0.02197265625, |
| "learning_rate": 0.00016754189207019397, |
| "loss": 0.0139, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.3269395557313295, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 0.00016740994854202404, |
| "loss": 0.014, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.3282578603915365, |
| "grad_norm": 0.47265625, |
| "learning_rate": 0.00016727800501385408, |
| "loss": 0.1546, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.3295761650517435, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 0.00016714606148568413, |
| "loss": 0.0803, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3308944697119504, |
| "grad_norm": 0.185546875, |
| "learning_rate": 0.00016701411795751417, |
| "loss": 0.0376, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.3322127743721574, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 0.00016688217442934424, |
| "loss": 0.0375, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.3335310790323644, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0001667502309011743, |
| "loss": 0.0442, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.33484938369257133, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 0.00016661828737300435, |
| "loss": 0.0261, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.3361676883527783, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.00016648634384483442, |
| "loss": 0.0553, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.3374859930129853, |
| "grad_norm": 0.1328125, |
| "learning_rate": 0.00016635440031666446, |
| "loss": 0.0065, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.33880429767319226, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.00016622245678849453, |
| "loss": 0.0527, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.34012260233339925, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.0001660905132603246, |
| "loss": 0.0297, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.34144090699360624, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.00016595856973215464, |
| "loss": 0.0477, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.3427592116538132, |
| "grad_norm": 0.08154296875, |
| "learning_rate": 0.0001658266262039847, |
| "loss": 0.0298, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.34407751631402017, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.00016569468267581475, |
| "loss": 0.0481, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.34539582097422716, |
| "grad_norm": 0.06640625, |
| "learning_rate": 0.00016556273914764482, |
| "loss": 0.0153, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.3467141256344341, |
| "grad_norm": 0.00592041015625, |
| "learning_rate": 0.00016543079561947486, |
| "loss": 0.0111, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.3480324302946411, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.00016529885209130493, |
| "loss": 0.0309, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.3493507349548481, |
| "grad_norm": 0.0198974609375, |
| "learning_rate": 0.000165166908563135, |
| "loss": 0.0579, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.350669039615055, |
| "grad_norm": 0.10107421875, |
| "learning_rate": 0.00016503496503496504, |
| "loss": 0.0055, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.351987344275262, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.00016490302150679511, |
| "loss": 0.0299, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.353305648935469, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 0.00016477107797862516, |
| "loss": 0.0943, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.35462395359567594, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.00016463913445045523, |
| "loss": 0.0216, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.35594225825588294, |
| "grad_norm": 0.02392578125, |
| "learning_rate": 0.00016450719092228527, |
| "loss": 0.0265, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.35726056291608993, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.0001643752473941153, |
| "loss": 0.0539, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.35857886757629687, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 0.00016424330386594538, |
| "loss": 0.0139, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.35989717223650386, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.00016411136033777542, |
| "loss": 0.0428, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.36121547689671085, |
| "grad_norm": 0.052734375, |
| "learning_rate": 0.0001639794168096055, |
| "loss": 0.0346, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.3625337815569178, |
| "grad_norm": 0.12158203125, |
| "learning_rate": 0.00016384747328143556, |
| "loss": 0.0095, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.3638520862171248, |
| "grad_norm": 0.0240478515625, |
| "learning_rate": 0.0001637155297532656, |
| "loss": 0.0224, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3651703908773318, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 0.00016358358622509567, |
| "loss": 0.0316, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.3664886955375387, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 0.0001634516426969257, |
| "loss": 0.0051, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3678070001977457, |
| "grad_norm": 0.00396728515625, |
| "learning_rate": 0.00016331969916875578, |
| "loss": 0.038, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.3691253048579527, |
| "grad_norm": 0.375, |
| "learning_rate": 0.00016318775564058585, |
| "loss": 0.029, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.37044360951815963, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.0001630558121124159, |
| "loss": 0.0072, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.3717619141783666, |
| "grad_norm": 0.00127410888671875, |
| "learning_rate": 0.00016292386858424596, |
| "loss": 0.0381, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3730802188385736, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.000162791925056076, |
| "loss": 0.0573, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.37439852349878056, |
| "grad_norm": 0.0244140625, |
| "learning_rate": 0.00016265998152790607, |
| "loss": 0.051, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.37571682815898755, |
| "grad_norm": 0.0015106201171875, |
| "learning_rate": 0.00016252803799973612, |
| "loss": 0.0239, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.37703513281919454, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.00016239609447156618, |
| "loss": 0.0165, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.3783534374794015, |
| "grad_norm": 0.006134033203125, |
| "learning_rate": 0.00016226415094339625, |
| "loss": 0.0071, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.37967174213960847, |
| "grad_norm": 2.828125, |
| "learning_rate": 0.0001621322074152263, |
| "loss": 0.0272, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.38099004679981546, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.00016200026388705637, |
| "loss": 0.0647, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.3823083514600224, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 0.00016186832035888638, |
| "loss": 0.0262, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.3836266561202294, |
| "grad_norm": 0.041015625, |
| "learning_rate": 0.00016173637683071645, |
| "loss": 0.0576, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.3849449607804364, |
| "grad_norm": 0.033935546875, |
| "learning_rate": 0.00016160443330254652, |
| "loss": 0.0142, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.3862632654406433, |
| "grad_norm": 0.09130859375, |
| "learning_rate": 0.00016147248977437656, |
| "loss": 0.0348, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.3875815701008503, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00016134054624620663, |
| "loss": 0.0672, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.3888998747610573, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.00016120860271803667, |
| "loss": 0.0121, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.39021817942126424, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.00016107665918986674, |
| "loss": 0.0114, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.39153648408147124, |
| "grad_norm": 0.85546875, |
| "learning_rate": 0.0001609447156616968, |
| "loss": 0.0968, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.39285478874167823, |
| "grad_norm": 0.703125, |
| "learning_rate": 0.00016081277213352685, |
| "loss": 0.0349, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.39417309340188517, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 0.00016068082860535692, |
| "loss": 0.0106, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.39549139806209216, |
| "grad_norm": 0.7265625, |
| "learning_rate": 0.00016054888507718696, |
| "loss": 0.0225, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.39549139806209216, |
| "eval_loss": 0.03515048325061798, |
| "eval_model_preparation_time": 0.0076, |
| "eval_runtime": 457.3497, |
| "eval_samples_per_second": 7.373, |
| "eval_steps_per_second": 3.686, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3968097027222991, |
| "grad_norm": 0.016519820317626, |
| "learning_rate": 0.00016041694154901703, |
| "loss": 0.0202, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.3981280073825061, |
| "grad_norm": 0.8505942225456238, |
| "learning_rate": 0.00016028499802084708, |
| "loss": 0.0541, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.3994463120427131, |
| "grad_norm": 0.04163295030593872, |
| "learning_rate": 0.00016015305449267714, |
| "loss": 0.0037, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.40076461670292, |
| "grad_norm": 0.011332935653626919, |
| "learning_rate": 0.00016002111096450721, |
| "loss": 0.0459, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.402082921363127, |
| "grad_norm": 0.9360129833221436, |
| "learning_rate": 0.00015988916743633726, |
| "loss": 0.013, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.403401226023334, |
| "grad_norm": 0.11991436779499054, |
| "learning_rate": 0.00015975722390816733, |
| "loss": 0.0079, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.40471953068354094, |
| "grad_norm": 0.36911076307296753, |
| "learning_rate": 0.00015962528037999737, |
| "loss": 0.0638, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.40603783534374793, |
| "grad_norm": 0.020278634503483772, |
| "learning_rate": 0.00015949333685182744, |
| "loss": 0.0217, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.4073561400039549, |
| "grad_norm": 0.14263059198856354, |
| "learning_rate": 0.0001593613933236575, |
| "loss": 0.0495, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.40867444466416186, |
| "grad_norm": 0.09494803845882416, |
| "learning_rate": 0.00015922944979548752, |
| "loss": 0.0248, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.40999274932436885, |
| "grad_norm": 0.23064319789409637, |
| "learning_rate": 0.0001590975062673176, |
| "loss": 0.0285, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.41131105398457585, |
| "grad_norm": 0.32220256328582764, |
| "learning_rate": 0.00015896556273914763, |
| "loss": 0.0537, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.4126293586447828, |
| "grad_norm": 0.41208815574645996, |
| "learning_rate": 0.0001588336192109777, |
| "loss": 0.0453, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.4139476633049898, |
| "grad_norm": 0.03775424137711525, |
| "learning_rate": 0.00015870167568280777, |
| "loss": 0.0134, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.41526596796519677, |
| "grad_norm": 0.6526333093643188, |
| "learning_rate": 0.0001585697321546378, |
| "loss": 0.0329, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.4165842726254037, |
| "grad_norm": 1.001305103302002, |
| "learning_rate": 0.00015843778862646788, |
| "loss": 0.0912, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.4179025772856107, |
| "grad_norm": 0.4055219888687134, |
| "learning_rate": 0.00015830584509829792, |
| "loss": 0.0519, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.4192208819458177, |
| "grad_norm": 0.035015616565942764, |
| "learning_rate": 0.000158173901570128, |
| "loss": 0.0191, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.42053918660602463, |
| "grad_norm": 0.09326844662427902, |
| "learning_rate": 0.00015804195804195806, |
| "loss": 0.0106, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.4218574912662316, |
| "grad_norm": 0.06223440542817116, |
| "learning_rate": 0.0001579100145137881, |
| "loss": 0.0113, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4231757959264386, |
| "grad_norm": 0.0625135526061058, |
| "learning_rate": 0.00015777807098561817, |
| "loss": 0.0191, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.42449410058664555, |
| "grad_norm": 0.2645983099937439, |
| "learning_rate": 0.00015764612745744822, |
| "loss": 0.0829, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.42581240524685254, |
| "grad_norm": 0.009632415138185024, |
| "learning_rate": 0.00015751418392927829, |
| "loss": 0.0542, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.42713070990705954, |
| "grad_norm": 0.01979319378733635, |
| "learning_rate": 0.00015738224040110833, |
| "loss": 0.0517, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.4284490145672665, |
| "grad_norm": 0.3065454065799713, |
| "learning_rate": 0.0001572502968729384, |
| "loss": 0.0738, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.42976731922747347, |
| "grad_norm": 0.09581473469734192, |
| "learning_rate": 0.00015711835334476847, |
| "loss": 0.0571, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.43108562388768046, |
| "grad_norm": 0.23746591806411743, |
| "learning_rate": 0.0001569864098165985, |
| "loss": 0.0128, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.4324039285478874, |
| "grad_norm": 0.936278760433197, |
| "learning_rate": 0.00015685446628842858, |
| "loss": 0.0665, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.4337222332080944, |
| "grad_norm": 0.18487441539764404, |
| "learning_rate": 0.00015672252276025862, |
| "loss": 0.0527, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.4350405378683014, |
| "grad_norm": 0.6980624794960022, |
| "learning_rate": 0.00015659057923208866, |
| "loss": 0.0613, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.4363588425285083, |
| "grad_norm": 0.4696301221847534, |
| "learning_rate": 0.00015645863570391873, |
| "loss": 0.0569, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.4376771471887153, |
| "grad_norm": 0.15083105862140656, |
| "learning_rate": 0.00015632669217574877, |
| "loss": 0.0394, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.4389954518489223, |
| "grad_norm": 0.44701239466667175, |
| "learning_rate": 0.00015619474864757884, |
| "loss": 0.0494, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.44031375650912924, |
| "grad_norm": 0.07418403029441833, |
| "learning_rate": 0.00015606280511940888, |
| "loss": 0.0291, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.44163206116933623, |
| "grad_norm": 0.02311861515045166, |
| "learning_rate": 0.00015593086159123895, |
| "loss": 0.0304, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.4429503658295432, |
| "grad_norm": 0.4416038990020752, |
| "learning_rate": 0.00015579891806306902, |
| "loss": 0.0176, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.44426867048975016, |
| "grad_norm": 0.5124915242195129, |
| "learning_rate": 0.00015566697453489906, |
| "loss": 0.0454, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.44558697514995715, |
| "grad_norm": 0.3159286081790924, |
| "learning_rate": 0.00015553503100672913, |
| "loss": 0.047, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.44690527981016415, |
| "grad_norm": 0.032126396894454956, |
| "learning_rate": 0.00015540308747855918, |
| "loss": 0.0151, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.4482235844703711, |
| "grad_norm": 0.04663548618555069, |
| "learning_rate": 0.00015527114395038924, |
| "loss": 0.0375, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.4495418891305781, |
| "grad_norm": 0.013753900304436684, |
| "learning_rate": 0.0001551392004222193, |
| "loss": 0.0485, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.45086019379078507, |
| "grad_norm": 1.9952393770217896, |
| "learning_rate": 0.00015500725689404936, |
| "loss": 0.0625, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.452178498450992, |
| "grad_norm": 0.014283270575106144, |
| "learning_rate": 0.00015487531336587943, |
| "loss": 0.0037, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.453496803111199, |
| "grad_norm": 0.3897913098335266, |
| "learning_rate": 0.00015474336983770947, |
| "loss": 0.0304, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.454815107771406, |
| "grad_norm": 0.3730885684490204, |
| "learning_rate": 0.00015461142630953954, |
| "loss": 0.0115, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.45613341243161293, |
| "grad_norm": 0.035858724266290665, |
| "learning_rate": 0.00015447948278136958, |
| "loss": 0.0021, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.4574517170918199, |
| "grad_norm": 0.20589517056941986, |
| "learning_rate": 0.00015434753925319965, |
| "loss": 0.0132, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.4587700217520269, |
| "grad_norm": 0.004939342383295298, |
| "learning_rate": 0.00015421559572502972, |
| "loss": 0.0471, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.46008832641223385, |
| "grad_norm": 0.03493283689022064, |
| "learning_rate": 0.00015408365219685976, |
| "loss": 0.0062, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.46140663107244084, |
| "grad_norm": 0.045927103608846664, |
| "learning_rate": 0.0001539517086686898, |
| "loss": 0.0283, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.46272493573264784, |
| "grad_norm": 0.012629454955458641, |
| "learning_rate": 0.00015381976514051984, |
| "loss": 0.0133, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.46404324039285477, |
| "grad_norm": 0.8001697659492493, |
| "learning_rate": 0.0001536878216123499, |
| "loss": 0.0224, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.46536154505306176, |
| "grad_norm": 0.002036362886428833, |
| "learning_rate": 0.00015355587808417998, |
| "loss": 0.0066, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.46667984971326876, |
| "grad_norm": 1.0261330604553223, |
| "learning_rate": 0.00015342393455601002, |
| "loss": 0.191, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.4679981543734757, |
| "grad_norm": 0.3033429682254791, |
| "learning_rate": 0.0001532919910278401, |
| "loss": 0.0222, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.4693164590336827, |
| "grad_norm": 0.36911338567733765, |
| "learning_rate": 0.00015316004749967014, |
| "loss": 0.0363, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.4706347636938897, |
| "grad_norm": 0.0406811460852623, |
| "learning_rate": 0.0001530281039715002, |
| "loss": 0.0283, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.4719530683540966, |
| "grad_norm": 0.23334211111068726, |
| "learning_rate": 0.00015289616044333027, |
| "loss": 0.0274, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.4732713730143036, |
| "grad_norm": 0.013081169687211514, |
| "learning_rate": 0.00015276421691516032, |
| "loss": 0.0221, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.4745896776745106, |
| "grad_norm": 0.2480790615081787, |
| "learning_rate": 0.00015263227338699039, |
| "loss": 0.019, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.47590798233471754, |
| "grad_norm": 0.0373196005821228, |
| "learning_rate": 0.00015250032985882043, |
| "loss": 0.0292, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.47722628699492453, |
| "grad_norm": 0.004609994124621153, |
| "learning_rate": 0.0001523683863306505, |
| "loss": 0.0918, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4785445916551315, |
| "grad_norm": 0.02370987832546234, |
| "learning_rate": 0.00015223644280248054, |
| "loss": 0.0462, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.47986289631533846, |
| "grad_norm": 0.05842221528291702, |
| "learning_rate": 0.0001521044992743106, |
| "loss": 0.0595, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.48118120097554545, |
| "grad_norm": 0.009685276076197624, |
| "learning_rate": 0.00015197255574614068, |
| "loss": 0.0074, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.48249950563575245, |
| "grad_norm": 0.8933250308036804, |
| "learning_rate": 0.00015184061221797072, |
| "loss": 0.0757, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.4838178102959594, |
| "grad_norm": 0.07075401395559311, |
| "learning_rate": 0.0001517086686898008, |
| "loss": 0.0226, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.4851361149561664, |
| "grad_norm": 0.732706606388092, |
| "learning_rate": 0.00015157672516163083, |
| "loss": 0.0161, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.48645441961637337, |
| "grad_norm": 1.1897023916244507, |
| "learning_rate": 0.0001514447816334609, |
| "loss": 0.0265, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.4877727242765803, |
| "grad_norm": 0.052572328597307205, |
| "learning_rate": 0.00015131283810529094, |
| "loss": 0.0094, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4890910289367873, |
| "grad_norm": 0.08263898640871048, |
| "learning_rate": 0.00015118089457712098, |
| "loss": 0.0631, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.4904093335969943, |
| "grad_norm": 0.03225664421916008, |
| "learning_rate": 0.00015104895104895105, |
| "loss": 0.023, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.4917276382572012, |
| "grad_norm": 0.007935039699077606, |
| "learning_rate": 0.0001509170075207811, |
| "loss": 0.0039, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.4930459429174082, |
| "grad_norm": 0.00830796267837286, |
| "learning_rate": 0.00015078506399261116, |
| "loss": 0.007, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.4943642475776152, |
| "grad_norm": 0.08042234182357788, |
| "learning_rate": 0.00015065312046444123, |
| "loss": 0.0366, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.49568255223782215, |
| "grad_norm": 0.009092851541936398, |
| "learning_rate": 0.00015052117693627128, |
| "loss": 0.0107, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.49700085689802914, |
| "grad_norm": 0.2674141824245453, |
| "learning_rate": 0.00015038923340810135, |
| "loss": 0.0076, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.49831916155823613, |
| "grad_norm": 0.07694366574287415, |
| "learning_rate": 0.0001502572898799314, |
| "loss": 0.0252, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.49963746621844307, |
| "grad_norm": 0.5699467062950134, |
| "learning_rate": 0.00015012534635176146, |
| "loss": 0.0487, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.5009557708786501, |
| "grad_norm": 0.18800878524780273, |
| "learning_rate": 0.0001499934028235915, |
| "loss": 0.0183, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5022740755388571, |
| "grad_norm": 0.019469989463686943, |
| "learning_rate": 0.00014986145929542157, |
| "loss": 0.0268, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.503592380199064, |
| "grad_norm": 0.01890506222844124, |
| "learning_rate": 0.00014972951576725164, |
| "loss": 0.0449, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.5049106848592709, |
| "grad_norm": 0.0006314461352303624, |
| "learning_rate": 0.00014959757223908168, |
| "loss": 0.0056, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.5062289895194779, |
| "grad_norm": 0.32654041051864624, |
| "learning_rate": 0.00014946562871091175, |
| "loss": 0.0256, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.5075472941796849, |
| "grad_norm": 0.7803483605384827, |
| "learning_rate": 0.0001493336851827418, |
| "loss": 0.0374, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.5088655988398919, |
| "grad_norm": 0.028441445901989937, |
| "learning_rate": 0.00014920174165457186, |
| "loss": 0.0161, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.5101839035000989, |
| "grad_norm": 0.028379200026392937, |
| "learning_rate": 0.00014906979812640193, |
| "loss": 0.0151, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.5115022081603059, |
| "grad_norm": 0.021159596741199493, |
| "learning_rate": 0.00014893785459823197, |
| "loss": 0.0303, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 0.24903325736522675, |
| "learning_rate": 0.000148805911070062, |
| "loss": 0.0076, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.5141388174807198, |
| "grad_norm": 0.007065301761031151, |
| "learning_rate": 0.00014867396754189206, |
| "loss": 0.022, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5154571221409268, |
| "grad_norm": 0.004032329190522432, |
| "learning_rate": 0.00014854202401372212, |
| "loss": 0.0083, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.5167754268011338, |
| "grad_norm": 0.3045775592327118, |
| "learning_rate": 0.0001484100804855522, |
| "loss": 0.0113, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.5180937314613407, |
| "grad_norm": 0.36974939703941345, |
| "learning_rate": 0.00014827813695738224, |
| "loss": 0.0267, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.5194120361215477, |
| "grad_norm": 0.009729950688779354, |
| "learning_rate": 0.0001481461934292123, |
| "loss": 0.027, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.5207303407817546, |
| "grad_norm": 0.0013097926275804639, |
| "learning_rate": 0.00014801424990104235, |
| "loss": 0.003, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.5220486454419616, |
| "grad_norm": 0.0706263929605484, |
| "learning_rate": 0.00014788230637287242, |
| "loss": 0.0193, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.5233669501021686, |
| "grad_norm": 1.435702919960022, |
| "learning_rate": 0.00014775036284470249, |
| "loss": 0.0647, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.5246852547623756, |
| "grad_norm": 0.00661757867783308, |
| "learning_rate": 0.00014761841931653253, |
| "loss": 0.0373, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.5260035594225826, |
| "grad_norm": 0.12014541029930115, |
| "learning_rate": 0.0001474864757883626, |
| "loss": 0.0178, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.5273218640827896, |
| "grad_norm": 1.0549248456954956, |
| "learning_rate": 0.00014735453226019264, |
| "loss": 0.0191, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5273218640827896, |
| "eval_loss": 0.037292081862688065, |
| "eval_runtime": 454.3033, |
| "eval_samples_per_second": 7.422, |
| "eval_steps_per_second": 3.711, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5286401687429965, |
| "grad_norm": 0.47634151577949524, |
| "learning_rate": 0.0001472225887320227, |
| "loss": 0.0404, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.5299584734032035, |
| "grad_norm": 0.006752463988959789, |
| "learning_rate": 0.00014709064520385275, |
| "loss": 0.034, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.5312767780634104, |
| "grad_norm": 0.20780125260353088, |
| "learning_rate": 0.00014695870167568282, |
| "loss": 0.0421, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.5325950827236174, |
| "grad_norm": 0.010941066779196262, |
| "learning_rate": 0.0001468267581475129, |
| "loss": 0.0086, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.5339133873838244, |
| "grad_norm": 0.3439581096172333, |
| "learning_rate": 0.00014669481461934293, |
| "loss": 0.0187, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.5352316920440314, |
| "grad_norm": 0.14961636066436768, |
| "learning_rate": 0.000146562871091173, |
| "loss": 0.0504, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.5365499967042383, |
| "grad_norm": 0.0044641937129199505, |
| "learning_rate": 0.00014643092756300304, |
| "loss": 0.0134, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.5378683013644453, |
| "grad_norm": 0.14088386297225952, |
| "learning_rate": 0.0001462989840348331, |
| "loss": 0.0096, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.5391866060246523, |
| "grad_norm": 0.48116979002952576, |
| "learning_rate": 0.00014616704050666315, |
| "loss": 0.0124, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.5405049106848593, |
| "grad_norm": 0.3688766360282898, |
| "learning_rate": 0.0001460350969784932, |
| "loss": 0.0226, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.5418232153450663, |
| "grad_norm": 0.002938181860372424, |
| "learning_rate": 0.00014590315345032326, |
| "loss": 0.0267, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.5431415200052733, |
| "grad_norm": 0.3335214853286743, |
| "learning_rate": 0.0001457712099221533, |
| "loss": 0.0367, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.5444598246654802, |
| "grad_norm": 0.004644686821848154, |
| "learning_rate": 0.00014563926639398338, |
| "loss": 0.0121, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.5457781293256871, |
| "grad_norm": 0.19505545496940613, |
| "learning_rate": 0.00014550732286581345, |
| "loss": 0.0591, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.5470964339858941, |
| "grad_norm": 0.018028756603598595, |
| "learning_rate": 0.0001453753793376435, |
| "loss": 0.0131, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.5484147386461011, |
| "grad_norm": 0.045639291405677795, |
| "learning_rate": 0.00014524343580947356, |
| "loss": 0.0443, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5497330433063081, |
| "grad_norm": 0.727981686592102, |
| "learning_rate": 0.0001451114922813036, |
| "loss": 0.0205, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.5510513479665151, |
| "grad_norm": 0.03766491636633873, |
| "learning_rate": 0.00014497954875313367, |
| "loss": 0.0067, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.552369652626722, |
| "grad_norm": 0.1911504715681076, |
| "learning_rate": 0.0001448476052249637, |
| "loss": 0.0397, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.553687957286929, |
| "grad_norm": 0.08238353580236435, |
| "learning_rate": 0.00014471566169679378, |
| "loss": 0.0513, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.555006261947136, |
| "grad_norm": 0.06317206472158432, |
| "learning_rate": 0.00014458371816862385, |
| "loss": 0.0178, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.556324566607343, |
| "grad_norm": 0.0652734637260437, |
| "learning_rate": 0.0001444517746404539, |
| "loss": 0.0184, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.55764287126755, |
| "grad_norm": 0.05471858009696007, |
| "learning_rate": 0.00014431983111228396, |
| "loss": 0.0089, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.558961175927757, |
| "grad_norm": 0.005062670446932316, |
| "learning_rate": 0.000144187887584114, |
| "loss": 0.0052, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5602794805879638, |
| "grad_norm": 0.06337414681911469, |
| "learning_rate": 0.00014405594405594407, |
| "loss": 0.053, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.5615977852481708, |
| "grad_norm": 0.33745357394218445, |
| "learning_rate": 0.00014392400052777414, |
| "loss": 0.0166, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5629160899083778, |
| "grad_norm": 0.7382741570472717, |
| "learning_rate": 0.00014379205699960418, |
| "loss": 0.0191, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.5642343945685848, |
| "grad_norm": 0.007551972754299641, |
| "learning_rate": 0.00014366011347143425, |
| "loss": 0.0022, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.5655526992287918, |
| "grad_norm": 0.6260896921157837, |
| "learning_rate": 0.00014352816994326427, |
| "loss": 0.0095, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.5668710038889987, |
| "grad_norm": 0.11619322001934052, |
| "learning_rate": 0.00014339622641509434, |
| "loss": 0.015, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5681893085492057, |
| "grad_norm": 1.1440670490264893, |
| "learning_rate": 0.0001432642828869244, |
| "loss": 0.1343, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.5695076132094127, |
| "grad_norm": 1.1793878078460693, |
| "learning_rate": 0.00014313233935875445, |
| "loss": 0.0968, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.5708259178696197, |
| "grad_norm": 0.6865736842155457, |
| "learning_rate": 0.00014300039583058452, |
| "loss": 0.0195, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.5721442225298267, |
| "grad_norm": 0.140816792845726, |
| "learning_rate": 0.00014286845230241456, |
| "loss": 0.0761, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5734625271900337, |
| "grad_norm": 0.04071786254644394, |
| "learning_rate": 0.00014273650877424463, |
| "loss": 0.0193, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.5747808318502405, |
| "grad_norm": 0.044617727398872375, |
| "learning_rate": 0.0001426045652460747, |
| "loss": 0.0112, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.5760991365104475, |
| "grad_norm": 0.11001799255609512, |
| "learning_rate": 0.00014247262171790474, |
| "loss": 0.0039, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.5774174411706545, |
| "grad_norm": 0.0036315324250608683, |
| "learning_rate": 0.0001423406781897348, |
| "loss": 0.0038, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5787357458308615, |
| "grad_norm": 0.9866570830345154, |
| "learning_rate": 0.00014220873466156485, |
| "loss": 0.025, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.5800540504910685, |
| "grad_norm": 0.023570384830236435, |
| "learning_rate": 0.00014207679113339492, |
| "loss": 0.0468, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5813723551512755, |
| "grad_norm": 0.20010559260845184, |
| "learning_rate": 0.00014194484760522496, |
| "loss": 0.0198, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.5826906598114824, |
| "grad_norm": 0.06153270602226257, |
| "learning_rate": 0.00014181290407705503, |
| "loss": 0.0764, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.5840089644716894, |
| "grad_norm": 0.033162448555231094, |
| "learning_rate": 0.0001416809605488851, |
| "loss": 0.028, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.5853272691318964, |
| "grad_norm": 0.428382933139801, |
| "learning_rate": 0.00014154901702071514, |
| "loss": 0.0652, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.5866455737921034, |
| "grad_norm": 0.25004762411117554, |
| "learning_rate": 0.0001414170734925452, |
| "loss": 0.0411, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.5879638784523104, |
| "grad_norm": 0.22649863362312317, |
| "learning_rate": 0.00014128512996437525, |
| "loss": 0.0517, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.5892821831125173, |
| "grad_norm": 0.035932112485170364, |
| "learning_rate": 0.00014115318643620532, |
| "loss": 0.015, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.5906004877727242, |
| "grad_norm": 0.3800172507762909, |
| "learning_rate": 0.00014102124290803536, |
| "loss": 0.0324, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.5919187924329312, |
| "grad_norm": 0.6974118947982788, |
| "learning_rate": 0.0001408892993798654, |
| "loss": 0.0216, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.5932370970931382, |
| "grad_norm": 0.15472032129764557, |
| "learning_rate": 0.00014075735585169548, |
| "loss": 0.0164, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5945554017533452, |
| "grad_norm": 0.015000814571976662, |
| "learning_rate": 0.00014062541232352552, |
| "loss": 0.0395, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.5958737064135522, |
| "grad_norm": 0.052086081355810165, |
| "learning_rate": 0.0001404934687953556, |
| "loss": 0.0032, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.5971920110737592, |
| "grad_norm": 0.004600350745022297, |
| "learning_rate": 0.00014036152526718566, |
| "loss": 0.0056, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.5985103157339661, |
| "grad_norm": 0.4940958321094513, |
| "learning_rate": 0.0001402295817390157, |
| "loss": 0.0206, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.5998286203941731, |
| "grad_norm": 0.09658394008874893, |
| "learning_rate": 0.00014009763821084577, |
| "loss": 0.0052, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.60114692505438, |
| "grad_norm": 0.00020539117394946516, |
| "learning_rate": 0.0001399656946826758, |
| "loss": 0.087, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.602465229714587, |
| "grad_norm": 0.1871018409729004, |
| "learning_rate": 0.00013983375115450588, |
| "loss": 0.0812, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.603783534374794, |
| "grad_norm": 0.02583954855799675, |
| "learning_rate": 0.00013970180762633592, |
| "loss": 0.0232, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.605101839035001, |
| "grad_norm": 1.2103784084320068, |
| "learning_rate": 0.000139569864098166, |
| "loss": 0.0151, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.6064201436952079, |
| "grad_norm": 0.023514943197369576, |
| "learning_rate": 0.00013943792056999606, |
| "loss": 0.0193, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6077384483554149, |
| "grad_norm": 0.0076395305804908276, |
| "learning_rate": 0.0001393059770418261, |
| "loss": 0.0379, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.6090567530156219, |
| "grad_norm": 0.12412039190530777, |
| "learning_rate": 0.00013917403351365617, |
| "loss": 0.0095, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.6103750576758289, |
| "grad_norm": 0.021904783323407173, |
| "learning_rate": 0.0001390420899854862, |
| "loss": 0.0166, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.6116933623360359, |
| "grad_norm": 0.004012851510196924, |
| "learning_rate": 0.00013891014645731628, |
| "loss": 0.0103, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.6130116669962429, |
| "grad_norm": 0.007267913781106472, |
| "learning_rate": 0.00013877820292914635, |
| "loss": 0.0708, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.6143299716564498, |
| "grad_norm": 0.10363642126321793, |
| "learning_rate": 0.0001386462594009764, |
| "loss": 0.0473, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.6156482763166568, |
| "grad_norm": 0.04899830371141434, |
| "learning_rate": 0.00013851431587280646, |
| "loss": 0.0283, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.6169665809768637, |
| "grad_norm": 0.39460498094558716, |
| "learning_rate": 0.0001383823723446365, |
| "loss": 0.0597, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.6182848856370707, |
| "grad_norm": 0.04092290997505188, |
| "learning_rate": 0.00013825042881646655, |
| "loss": 0.0167, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.6196031902972777, |
| "grad_norm": 0.2781132161617279, |
| "learning_rate": 0.00013811848528829662, |
| "loss": 0.0097, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6209214949574847, |
| "grad_norm": 0.041443537920713425, |
| "learning_rate": 0.00013798654176012666, |
| "loss": 0.0226, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.6222397996176916, |
| "grad_norm": 0.1242462694644928, |
| "learning_rate": 0.00013785459823195673, |
| "loss": 0.0055, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.6235581042778986, |
| "grad_norm": 0.4440467357635498, |
| "learning_rate": 0.00013772265470378677, |
| "loss": 0.049, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.6248764089381056, |
| "grad_norm": 0.014354427345097065, |
| "learning_rate": 0.00013759071117561684, |
| "loss": 0.0327, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.6261947135983126, |
| "grad_norm": 0.011539973318576813, |
| "learning_rate": 0.0001374587676474469, |
| "loss": 0.0222, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.6275130182585196, |
| "grad_norm": 0.23539051413536072, |
| "learning_rate": 0.00013732682411927695, |
| "loss": 0.0816, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.6288313229187266, |
| "grad_norm": 0.26793941855430603, |
| "learning_rate": 0.00013719488059110702, |
| "loss": 0.0325, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.6301496275789334, |
| "grad_norm": 0.01662217453122139, |
| "learning_rate": 0.00013706293706293706, |
| "loss": 0.0221, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.6314679322391404, |
| "grad_norm": 0.30669671297073364, |
| "learning_rate": 0.00013693099353476713, |
| "loss": 0.026, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.6327862368993474, |
| "grad_norm": 0.03350894898176193, |
| "learning_rate": 0.00013679905000659717, |
| "loss": 0.0072, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.6341045415595544, |
| "grad_norm": 0.014983875676989555, |
| "learning_rate": 0.00013666710647842724, |
| "loss": 0.049, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.6354228462197614, |
| "grad_norm": 1.8989384174346924, |
| "learning_rate": 0.0001365351629502573, |
| "loss": 0.0335, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.6367411508799684, |
| "grad_norm": 0.030135562643408775, |
| "learning_rate": 0.00013640321942208735, |
| "loss": 0.0051, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.6380594555401753, |
| "grad_norm": 0.02079075388610363, |
| "learning_rate": 0.00013627127589391742, |
| "loss": 0.0138, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.6393777602003823, |
| "grad_norm": 0.06065403297543526, |
| "learning_rate": 0.00013613933236574746, |
| "loss": 0.0357, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.6406960648605893, |
| "grad_norm": 0.2980937659740448, |
| "learning_rate": 0.00013600738883757753, |
| "loss": 0.0138, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.6420143695207963, |
| "grad_norm": 0.4820438623428345, |
| "learning_rate": 0.00013587544530940758, |
| "loss": 0.01, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.6433326741810033, |
| "grad_norm": 0.005618259310722351, |
| "learning_rate": 0.00013574350178123765, |
| "loss": 0.0052, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.6446509788412103, |
| "grad_norm": 0.7173821926116943, |
| "learning_rate": 0.0001356115582530677, |
| "loss": 0.0133, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.6459692835014171, |
| "grad_norm": 0.0053142281249165535, |
| "learning_rate": 0.00013547961472489773, |
| "loss": 0.0045, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.6472875881616241, |
| "grad_norm": 0.06118829548358917, |
| "learning_rate": 0.0001353476711967278, |
| "loss": 0.056, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.6486058928218311, |
| "grad_norm": 3.5878078937530518, |
| "learning_rate": 0.00013521572766855787, |
| "loss": 0.0232, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.6499241974820381, |
| "grad_norm": 0.004911276511847973, |
| "learning_rate": 0.0001350837841403879, |
| "loss": 0.0074, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.6512425021422451, |
| "grad_norm": 0.0028026222717016935, |
| "learning_rate": 0.00013495184061221798, |
| "loss": 0.0782, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.6525608068024521, |
| "grad_norm": 0.7317615747451782, |
| "learning_rate": 0.00013481989708404802, |
| "loss": 0.0222, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.653879111462659, |
| "grad_norm": 0.01835751160979271, |
| "learning_rate": 0.0001346879535558781, |
| "loss": 0.0661, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.655197416122866, |
| "grad_norm": 0.03598962351679802, |
| "learning_rate": 0.00013455601002770813, |
| "loss": 0.0395, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.656515720783073, |
| "grad_norm": 0.013886351138353348, |
| "learning_rate": 0.0001344240664995382, |
| "loss": 0.0156, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.65783402544328, |
| "grad_norm": 5.741530895233154, |
| "learning_rate": 0.00013429212297136827, |
| "loss": 0.0317, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.659152330103487, |
| "grad_norm": 0.20793496072292328, |
| "learning_rate": 0.0001341601794431983, |
| "loss": 0.0072, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.659152330103487, |
| "eval_loss": 0.0300898440182209, |
| "eval_runtime": 453.0554, |
| "eval_samples_per_second": 7.443, |
| "eval_steps_per_second": 3.721, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6604706347636939, |
| "grad_norm": 0.03460961952805519, |
| "learning_rate": 0.00013402823591502838, |
| "loss": 0.0097, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.6617889394239008, |
| "grad_norm": 0.31785696744918823, |
| "learning_rate": 0.00013389629238685842, |
| "loss": 0.0303, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.6631072440841078, |
| "grad_norm": 0.4273851215839386, |
| "learning_rate": 0.0001337643488586885, |
| "loss": 0.0499, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.6644255487443148, |
| "grad_norm": 0.02236153744161129, |
| "learning_rate": 0.00013363240533051856, |
| "loss": 0.0069, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.6657438534045218, |
| "grad_norm": 0.1592864990234375, |
| "learning_rate": 0.0001335004618023486, |
| "loss": 0.0326, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.6670621580647288, |
| "grad_norm": 0.029961545020341873, |
| "learning_rate": 0.00013336851827417867, |
| "loss": 0.0178, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.6683804627249358, |
| "grad_norm": 0.03120764158666134, |
| "learning_rate": 0.00013323657474600872, |
| "loss": 0.115, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.6696987673851427, |
| "grad_norm": 0.01060028001666069, |
| "learning_rate": 0.00013310463121783879, |
| "loss": 0.0036, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.6710170720453497, |
| "grad_norm": 0.053470809012651443, |
| "learning_rate": 0.00013297268768966883, |
| "loss": 0.0079, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.6723353767055567, |
| "grad_norm": 0.022777097299695015, |
| "learning_rate": 0.00013284074416149887, |
| "loss": 0.0078, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6736536813657636, |
| "grad_norm": 0.0548521913588047, |
| "learning_rate": 0.00013270880063332894, |
| "loss": 0.0503, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.6749719860259706, |
| "grad_norm": 0.02028457075357437, |
| "learning_rate": 0.00013257685710515898, |
| "loss": 0.0096, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.6762902906861776, |
| "grad_norm": 0.01569107361137867, |
| "learning_rate": 0.00013244491357698905, |
| "loss": 0.008, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.6776085953463845, |
| "grad_norm": 0.00743742985650897, |
| "learning_rate": 0.00013231297004881912, |
| "loss": 0.005, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.6789269000065915, |
| "grad_norm": 0.025164416059851646, |
| "learning_rate": 0.00013218102652064916, |
| "loss": 0.018, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.6802452046667985, |
| "grad_norm": 0.3653188645839691, |
| "learning_rate": 0.00013204908299247923, |
| "loss": 0.0295, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.6815635093270055, |
| "grad_norm": 0.685422956943512, |
| "learning_rate": 0.00013191713946430927, |
| "loss": 0.0335, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.6828818139872125, |
| "grad_norm": 0.675740122795105, |
| "learning_rate": 0.00013178519593613934, |
| "loss": 0.0592, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.6842001186474194, |
| "grad_norm": 0.10513252764940262, |
| "learning_rate": 0.00013165325240796938, |
| "loss": 0.0353, |
| "step": 2595 |
| }, |
| { |
| "epoch": 0.6855184233076264, |
| "grad_norm": 0.43512973189353943, |
| "learning_rate": 0.00013152130887979945, |
| "loss": 0.0142, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.6868367279678333, |
| "grad_norm": 0.029436839744448662, |
| "learning_rate": 0.00013138936535162952, |
| "loss": 0.0042, |
| "step": 2605 |
| }, |
| { |
| "epoch": 0.6881550326280403, |
| "grad_norm": 0.5607122778892517, |
| "learning_rate": 0.00013125742182345957, |
| "loss": 0.0184, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.6894733372882473, |
| "grad_norm": 0.11365406215190887, |
| "learning_rate": 0.00013112547829528963, |
| "loss": 0.006, |
| "step": 2615 |
| }, |
| { |
| "epoch": 0.6907916419484543, |
| "grad_norm": 0.047227244824171066, |
| "learning_rate": 0.00013099353476711968, |
| "loss": 0.008, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.6921099466086612, |
| "grad_norm": 0.0005877618095837533, |
| "learning_rate": 0.00013086159123894975, |
| "loss": 0.0286, |
| "step": 2625 |
| }, |
| { |
| "epoch": 0.6934282512688682, |
| "grad_norm": 0.010759112425148487, |
| "learning_rate": 0.0001307296477107798, |
| "loss": 0.0062, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.6947465559290752, |
| "grad_norm": 0.07117745280265808, |
| "learning_rate": 0.00013059770418260986, |
| "loss": 0.0891, |
| "step": 2635 |
| }, |
| { |
| "epoch": 0.6960648605892822, |
| "grad_norm": 0.0639057606458664, |
| "learning_rate": 0.00013046576065443993, |
| "loss": 0.0072, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.6973831652494892, |
| "grad_norm": 0.027350090444087982, |
| "learning_rate": 0.00013033381712626994, |
| "loss": 0.0103, |
| "step": 2645 |
| }, |
| { |
| "epoch": 0.6987014699096962, |
| "grad_norm": 0.015336195938289165, |
| "learning_rate": 0.0001302018735981, |
| "loss": 0.0041, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.700019774569903, |
| "grad_norm": 1.0650830268859863, |
| "learning_rate": 0.00013006993006993008, |
| "loss": 0.0443, |
| "step": 2655 |
| }, |
| { |
| "epoch": 0.70133807923011, |
| "grad_norm": 0.019073212519288063, |
| "learning_rate": 0.00012993798654176012, |
| "loss": 0.0331, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.702656383890317, |
| "grad_norm": 0.10109209269285202, |
| "learning_rate": 0.0001298060430135902, |
| "loss": 0.0054, |
| "step": 2665 |
| }, |
| { |
| "epoch": 0.703974688550524, |
| "grad_norm": 0.03528957813978195, |
| "learning_rate": 0.00012967409948542023, |
| "loss": 0.0427, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.705292993210731, |
| "grad_norm": 0.03577788919210434, |
| "learning_rate": 0.0001295421559572503, |
| "loss": 0.023, |
| "step": 2675 |
| }, |
| { |
| "epoch": 0.706611297870938, |
| "grad_norm": 0.5576180815696716, |
| "learning_rate": 0.00012941021242908034, |
| "loss": 0.0416, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.7079296025311449, |
| "grad_norm": 0.017131298780441284, |
| "learning_rate": 0.0001292782689009104, |
| "loss": 0.0235, |
| "step": 2685 |
| }, |
| { |
| "epoch": 0.7092479071913519, |
| "grad_norm": 0.8517888784408569, |
| "learning_rate": 0.00012914632537274048, |
| "loss": 0.0168, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.7105662118515589, |
| "grad_norm": 0.23812156915664673, |
| "learning_rate": 0.00012901438184457052, |
| "loss": 0.0483, |
| "step": 2695 |
| }, |
| { |
| "epoch": 0.7118845165117659, |
| "grad_norm": 0.11746613681316376, |
| "learning_rate": 0.0001288824383164006, |
| "loss": 0.0255, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.7132028211719729, |
| "grad_norm": 0.20089928805828094, |
| "learning_rate": 0.00012875049478823064, |
| "loss": 0.0267, |
| "step": 2705 |
| }, |
| { |
| "epoch": 0.7145211258321799, |
| "grad_norm": 0.8301129937171936, |
| "learning_rate": 0.0001286185512600607, |
| "loss": 0.016, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.7158394304923867, |
| "grad_norm": 0.01838674768805504, |
| "learning_rate": 0.00012848660773189077, |
| "loss": 0.0229, |
| "step": 2715 |
| }, |
| { |
| "epoch": 0.7171577351525937, |
| "grad_norm": 0.03670337051153183, |
| "learning_rate": 0.00012835466420372082, |
| "loss": 0.038, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.7184760398128007, |
| "grad_norm": 0.0452633760869503, |
| "learning_rate": 0.00012822272067555089, |
| "loss": 0.0622, |
| "step": 2725 |
| }, |
| { |
| "epoch": 0.7197943444730077, |
| "grad_norm": 0.09503110498189926, |
| "learning_rate": 0.00012809077714738093, |
| "loss": 0.0209, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.7211126491332147, |
| "grad_norm": 1.0327308177947998, |
| "learning_rate": 0.000127958833619211, |
| "loss": 0.0361, |
| "step": 2735 |
| }, |
| { |
| "epoch": 0.7224309537934217, |
| "grad_norm": 1.0049290657043457, |
| "learning_rate": 0.00012782689009104104, |
| "loss": 0.0365, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.7237492584536286, |
| "grad_norm": 0.029774073511362076, |
| "learning_rate": 0.00012769494656287108, |
| "loss": 0.0257, |
| "step": 2745 |
| }, |
| { |
| "epoch": 0.7250675631138356, |
| "grad_norm": 0.20974040031433105, |
| "learning_rate": 0.00012756300303470115, |
| "loss": 0.0542, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.7263858677740426, |
| "grad_norm": 0.8153854608535767, |
| "learning_rate": 0.0001274310595065312, |
| "loss": 0.0216, |
| "step": 2755 |
| }, |
| { |
| "epoch": 0.7277041724342496, |
| "grad_norm": 0.4393698573112488, |
| "learning_rate": 0.00012729911597836126, |
| "loss": 0.0451, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.7290224770944566, |
| "grad_norm": 0.06990349292755127, |
| "learning_rate": 0.00012716717245019133, |
| "loss": 0.03, |
| "step": 2765 |
| }, |
| { |
| "epoch": 0.7303407817546635, |
| "grad_norm": 0.32689470052719116, |
| "learning_rate": 0.00012703522892202137, |
| "loss": 0.0263, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.7316590864148704, |
| "grad_norm": 0.026600876823067665, |
| "learning_rate": 0.00012690328539385144, |
| "loss": 0.0404, |
| "step": 2775 |
| }, |
| { |
| "epoch": 0.7329773910750774, |
| "grad_norm": 0.11228257417678833, |
| "learning_rate": 0.00012677134186568148, |
| "loss": 0.0224, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.7342956957352844, |
| "grad_norm": 0.6469443440437317, |
| "learning_rate": 0.00012663939833751155, |
| "loss": 0.0178, |
| "step": 2785 |
| }, |
| { |
| "epoch": 0.7356140003954914, |
| "grad_norm": 0.020773250609636307, |
| "learning_rate": 0.0001265074548093416, |
| "loss": 0.011, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.7369323050556984, |
| "grad_norm": 0.7378728985786438, |
| "learning_rate": 0.00012637551128117167, |
| "loss": 0.0227, |
| "step": 2795 |
| }, |
| { |
| "epoch": 0.7382506097159054, |
| "grad_norm": 0.008189595304429531, |
| "learning_rate": 0.00012624356775300173, |
| "loss": 0.0892, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.7395689143761123, |
| "grad_norm": 0.031633853912353516, |
| "learning_rate": 0.00012611162422483178, |
| "loss": 0.0093, |
| "step": 2805 |
| }, |
| { |
| "epoch": 0.7408872190363193, |
| "grad_norm": 0.5078475475311279, |
| "learning_rate": 0.00012597968069666185, |
| "loss": 0.0567, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.7422055236965263, |
| "grad_norm": 0.21766887605190277, |
| "learning_rate": 0.0001258477371684919, |
| "loss": 0.0485, |
| "step": 2815 |
| }, |
| { |
| "epoch": 0.7435238283567333, |
| "grad_norm": 0.3029612898826599, |
| "learning_rate": 0.00012571579364032196, |
| "loss": 0.032, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.7448421330169402, |
| "grad_norm": 1.2135159969329834, |
| "learning_rate": 0.00012558385011215203, |
| "loss": 0.0139, |
| "step": 2825 |
| }, |
| { |
| "epoch": 0.7461604376771472, |
| "grad_norm": 0.016875172033905983, |
| "learning_rate": 0.00012545190658398207, |
| "loss": 0.0323, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.7474787423373541, |
| "grad_norm": 0.08923230320215225, |
| "learning_rate": 0.00012531996305581214, |
| "loss": 0.0343, |
| "step": 2835 |
| }, |
| { |
| "epoch": 0.7487970469975611, |
| "grad_norm": 0.2958766520023346, |
| "learning_rate": 0.00012518801952764215, |
| "loss": 0.0431, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.7501153516577681, |
| "grad_norm": 0.7344386577606201, |
| "learning_rate": 0.00012505607599947222, |
| "loss": 0.0389, |
| "step": 2845 |
| }, |
| { |
| "epoch": 0.7514336563179751, |
| "grad_norm": 0.03681635856628418, |
| "learning_rate": 0.0001249241324713023, |
| "loss": 0.0258, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.7527519609781821, |
| "grad_norm": 0.22866861522197723, |
| "learning_rate": 0.00012479218894313233, |
| "loss": 0.0223, |
| "step": 2855 |
| }, |
| { |
| "epoch": 0.7540702656383891, |
| "grad_norm": 0.029770435765385628, |
| "learning_rate": 0.0001246602454149624, |
| "loss": 0.0205, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.755388570298596, |
| "grad_norm": 0.011845707893371582, |
| "learning_rate": 0.00012452830188679244, |
| "loss": 0.0252, |
| "step": 2865 |
| }, |
| { |
| "epoch": 0.756706874958803, |
| "grad_norm": 0.06696149706840515, |
| "learning_rate": 0.00012439635835862251, |
| "loss": 0.0166, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.75802517961901, |
| "grad_norm": 0.01653144136071205, |
| "learning_rate": 0.00012426441483045256, |
| "loss": 0.0487, |
| "step": 2875 |
| }, |
| { |
| "epoch": 0.7593434842792169, |
| "grad_norm": 0.031312476843595505, |
| "learning_rate": 0.00012413247130228263, |
| "loss": 0.0155, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.7606617889394239, |
| "grad_norm": 0.011625733226537704, |
| "learning_rate": 0.0001240005277741127, |
| "loss": 0.0333, |
| "step": 2885 |
| }, |
| { |
| "epoch": 0.7619800935996309, |
| "grad_norm": 0.012089414522051811, |
| "learning_rate": 0.00012386858424594274, |
| "loss": 0.003, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.7632983982598378, |
| "grad_norm": 0.3012307584285736, |
| "learning_rate": 0.0001237366407177728, |
| "loss": 0.0172, |
| "step": 2895 |
| }, |
| { |
| "epoch": 0.7646167029200448, |
| "grad_norm": 0.31575000286102295, |
| "learning_rate": 0.00012360469718960285, |
| "loss": 0.0409, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.7659350075802518, |
| "grad_norm": 0.009794364683330059, |
| "learning_rate": 0.00012347275366143292, |
| "loss": 0.0214, |
| "step": 2905 |
| }, |
| { |
| "epoch": 0.7672533122404588, |
| "grad_norm": 0.5973085165023804, |
| "learning_rate": 0.00012334081013326299, |
| "loss": 0.0245, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.7685716169006658, |
| "grad_norm": 0.019750040024518967, |
| "learning_rate": 0.00012320886660509303, |
| "loss": 0.0063, |
| "step": 2915 |
| }, |
| { |
| "epoch": 0.7698899215608728, |
| "grad_norm": 0.06402858346700668, |
| "learning_rate": 0.0001230769230769231, |
| "loss": 0.0444, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.7712082262210797, |
| "grad_norm": 0.02876671403646469, |
| "learning_rate": 0.00012294497954875314, |
| "loss": 0.0103, |
| "step": 2925 |
| }, |
| { |
| "epoch": 0.7725265308812866, |
| "grad_norm": 0.6962207555770874, |
| "learning_rate": 0.0001228130360205832, |
| "loss": 0.0318, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.7738448355414936, |
| "grad_norm": 0.006536522414535284, |
| "learning_rate": 0.00012268109249241325, |
| "loss": 0.0096, |
| "step": 2935 |
| }, |
| { |
| "epoch": 0.7751631402017006, |
| "grad_norm": 0.07097168266773224, |
| "learning_rate": 0.0001225491489642433, |
| "loss": 0.0174, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.7764814448619076, |
| "grad_norm": 0.042360126972198486, |
| "learning_rate": 0.00012241720543607336, |
| "loss": 0.0158, |
| "step": 2945 |
| }, |
| { |
| "epoch": 0.7777997495221146, |
| "grad_norm": 0.01159572321921587, |
| "learning_rate": 0.0001222852619079034, |
| "loss": 0.0265, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.7791180541823215, |
| "grad_norm": 0.38408163189888, |
| "learning_rate": 0.00012215331837973347, |
| "loss": 0.0233, |
| "step": 2955 |
| }, |
| { |
| "epoch": 0.7804363588425285, |
| "grad_norm": 0.15588605403900146, |
| "learning_rate": 0.00012202137485156353, |
| "loss": 0.0041, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.7817546635027355, |
| "grad_norm": 0.006892362609505653, |
| "learning_rate": 0.00012188943132339358, |
| "loss": 0.0026, |
| "step": 2965 |
| }, |
| { |
| "epoch": 0.7830729681629425, |
| "grad_norm": 0.030915727838873863, |
| "learning_rate": 0.00012175748779522364, |
| "loss": 0.0028, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.7843912728231495, |
| "grad_norm": 0.8151025772094727, |
| "learning_rate": 0.00012162554426705371, |
| "loss": 0.0429, |
| "step": 2975 |
| }, |
| { |
| "epoch": 0.7857095774833565, |
| "grad_norm": 0.6765475273132324, |
| "learning_rate": 0.00012149360073888377, |
| "loss": 0.0319, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.7870278821435633, |
| "grad_norm": 0.054469238966703415, |
| "learning_rate": 0.00012136165721071382, |
| "loss": 0.0413, |
| "step": 2985 |
| }, |
| { |
| "epoch": 0.7883461868037703, |
| "grad_norm": 0.045610666275024414, |
| "learning_rate": 0.00012122971368254388, |
| "loss": 0.0521, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.7896644914639773, |
| "grad_norm": 0.4222470223903656, |
| "learning_rate": 0.00012109777015437393, |
| "loss": 0.0846, |
| "step": 2995 |
| }, |
| { |
| "epoch": 0.7909827961241843, |
| "grad_norm": 0.0272397268563509, |
| "learning_rate": 0.00012096582662620399, |
| "loss": 0.0364, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7909827961241843, |
| "eval_loss": 0.033312585204839706, |
| "eval_runtime": 452.2552, |
| "eval_samples_per_second": 7.456, |
| "eval_steps_per_second": 3.728, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7923011007843913, |
| "grad_norm": 0.08674059063196182, |
| "learning_rate": 0.00012083388309803406, |
| "loss": 0.0081, |
| "step": 3005 |
| }, |
| { |
| "epoch": 0.7936194054445982, |
| "grad_norm": 0.21960832178592682, |
| "learning_rate": 0.00012070193956986411, |
| "loss": 0.0468, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.7949377101048052, |
| "grad_norm": 0.11259289085865021, |
| "learning_rate": 0.00012056999604169417, |
| "loss": 0.0124, |
| "step": 3015 |
| }, |
| { |
| "epoch": 0.7962560147650122, |
| "grad_norm": 0.02945362776517868, |
| "learning_rate": 0.00012043805251352422, |
| "loss": 0.0298, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.7975743194252192, |
| "grad_norm": 0.27889615297317505, |
| "learning_rate": 0.00012030610898535428, |
| "loss": 0.0251, |
| "step": 3025 |
| }, |
| { |
| "epoch": 0.7988926240854262, |
| "grad_norm": 0.05873241275548935, |
| "learning_rate": 0.00012017416545718434, |
| "loss": 0.0132, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.8002109287456332, |
| "grad_norm": 0.1570046991109848, |
| "learning_rate": 0.00012004222192901439, |
| "loss": 0.0228, |
| "step": 3035 |
| }, |
| { |
| "epoch": 0.80152923340584, |
| "grad_norm": 0.12575332820415497, |
| "learning_rate": 0.00011991027840084443, |
| "loss": 0.0049, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.802847538066047, |
| "grad_norm": 0.8416435122489929, |
| "learning_rate": 0.00011977833487267449, |
| "loss": 0.0542, |
| "step": 3045 |
| }, |
| { |
| "epoch": 0.804165842726254, |
| "grad_norm": 0.2605098485946655, |
| "learning_rate": 0.00011964639134450454, |
| "loss": 0.0084, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.805484147386461, |
| "grad_norm": 0.8996294736862183, |
| "learning_rate": 0.00011951444781633461, |
| "loss": 0.0442, |
| "step": 3055 |
| }, |
| { |
| "epoch": 0.806802452046668, |
| "grad_norm": 2.7525105476379395, |
| "learning_rate": 0.00011938250428816467, |
| "loss": 0.0642, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.808120756706875, |
| "grad_norm": 0.14955930411815643, |
| "learning_rate": 0.00011925056075999473, |
| "loss": 0.0384, |
| "step": 3065 |
| }, |
| { |
| "epoch": 0.8094390613670819, |
| "grad_norm": 0.018756115809082985, |
| "learning_rate": 0.00011911861723182478, |
| "loss": 0.0154, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.8107573660272889, |
| "grad_norm": 0.23998615145683289, |
| "learning_rate": 0.00011898667370365484, |
| "loss": 0.0413, |
| "step": 3075 |
| }, |
| { |
| "epoch": 0.8120756706874959, |
| "grad_norm": 0.27253249287605286, |
| "learning_rate": 0.00011885473017548489, |
| "loss": 0.0081, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.8133939753477029, |
| "grad_norm": 0.2925993502140045, |
| "learning_rate": 0.00011872278664731495, |
| "loss": 0.0332, |
| "step": 3085 |
| }, |
| { |
| "epoch": 0.8147122800079099, |
| "grad_norm": 0.5364832878112793, |
| "learning_rate": 0.00011859084311914502, |
| "loss": 0.0143, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.8160305846681168, |
| "grad_norm": 0.32104921340942383, |
| "learning_rate": 0.00011845889959097507, |
| "loss": 0.0216, |
| "step": 3095 |
| }, |
| { |
| "epoch": 0.8173488893283237, |
| "grad_norm": 0.0205856766551733, |
| "learning_rate": 0.00011832695606280513, |
| "loss": 0.0346, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.8186671939885307, |
| "grad_norm": 0.2541547417640686, |
| "learning_rate": 0.00011819501253463518, |
| "loss": 0.0793, |
| "step": 3105 |
| }, |
| { |
| "epoch": 0.8199854986487377, |
| "grad_norm": 0.08333491533994675, |
| "learning_rate": 0.00011806306900646524, |
| "loss": 0.0049, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.8213038033089447, |
| "grad_norm": 0.0355968177318573, |
| "learning_rate": 0.0001179311254782953, |
| "loss": 0.0051, |
| "step": 3115 |
| }, |
| { |
| "epoch": 0.8226221079691517, |
| "grad_norm": 0.06948401033878326, |
| "learning_rate": 0.00011779918195012536, |
| "loss": 0.013, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.8239404126293587, |
| "grad_norm": 0.03328891843557358, |
| "learning_rate": 0.00011766723842195542, |
| "loss": 0.0122, |
| "step": 3125 |
| }, |
| { |
| "epoch": 0.8252587172895656, |
| "grad_norm": 0.013782350346446037, |
| "learning_rate": 0.00011753529489378548, |
| "loss": 0.0073, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.8265770219497726, |
| "grad_norm": 0.024390392005443573, |
| "learning_rate": 0.00011740335136561553, |
| "loss": 0.0143, |
| "step": 3135 |
| }, |
| { |
| "epoch": 0.8278953266099796, |
| "grad_norm": 0.002548128366470337, |
| "learning_rate": 0.00011727140783744557, |
| "loss": 0.0027, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.8292136312701865, |
| "grad_norm": 0.11674848943948746, |
| "learning_rate": 0.00011713946430927563, |
| "loss": 0.0253, |
| "step": 3145 |
| }, |
| { |
| "epoch": 0.8305319359303935, |
| "grad_norm": 0.005774884019047022, |
| "learning_rate": 0.00011700752078110568, |
| "loss": 0.0018, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.8318502405906005, |
| "grad_norm": 0.5763069987297058, |
| "learning_rate": 0.00011687557725293574, |
| "loss": 0.0119, |
| "step": 3155 |
| }, |
| { |
| "epoch": 0.8331685452508074, |
| "grad_norm": 0.0027607593219727278, |
| "learning_rate": 0.0001167436337247658, |
| "loss": 0.0279, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.8344868499110144, |
| "grad_norm": 1.859642505645752, |
| "learning_rate": 0.00011661169019659585, |
| "loss": 0.0228, |
| "step": 3165 |
| }, |
| { |
| "epoch": 0.8358051545712214, |
| "grad_norm": 0.16597022116184235, |
| "learning_rate": 0.00011647974666842592, |
| "loss": 0.1228, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.8371234592314284, |
| "grad_norm": 0.33833742141723633, |
| "learning_rate": 0.00011634780314025598, |
| "loss": 0.073, |
| "step": 3175 |
| }, |
| { |
| "epoch": 0.8384417638916354, |
| "grad_norm": 0.024682912975549698, |
| "learning_rate": 0.00011621585961208603, |
| "loss": 0.0042, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.8397600685518424, |
| "grad_norm": 0.05926942452788353, |
| "learning_rate": 0.00011608391608391609, |
| "loss": 0.0066, |
| "step": 3185 |
| }, |
| { |
| "epoch": 0.8410783732120493, |
| "grad_norm": 0.1414029747247696, |
| "learning_rate": 0.00011595197255574614, |
| "loss": 0.0603, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.8423966778722562, |
| "grad_norm": 0.37928736209869385, |
| "learning_rate": 0.0001158200290275762, |
| "loss": 0.0266, |
| "step": 3195 |
| }, |
| { |
| "epoch": 0.8437149825324632, |
| "grad_norm": 0.018329354003071785, |
| "learning_rate": 0.00011568808549940627, |
| "loss": 0.0047, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.8450332871926702, |
| "grad_norm": 0.2993735373020172, |
| "learning_rate": 0.00011555614197123632, |
| "loss": 0.0218, |
| "step": 3205 |
| }, |
| { |
| "epoch": 0.8463515918528772, |
| "grad_norm": 0.1767728328704834, |
| "learning_rate": 0.00011542419844306638, |
| "loss": 0.0363, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.8476698965130842, |
| "grad_norm": 0.39774414896965027, |
| "learning_rate": 0.00011529225491489644, |
| "loss": 0.0506, |
| "step": 3215 |
| }, |
| { |
| "epoch": 0.8489882011732911, |
| "grad_norm": 0.021896762773394585, |
| "learning_rate": 0.00011516031138672649, |
| "loss": 0.0081, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.8503065058334981, |
| "grad_norm": 0.358372300863266, |
| "learning_rate": 0.00011502836785855655, |
| "loss": 0.0224, |
| "step": 3225 |
| }, |
| { |
| "epoch": 0.8516248104937051, |
| "grad_norm": 0.01605542004108429, |
| "learning_rate": 0.00011489642433038662, |
| "loss": 0.0215, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.8529431151539121, |
| "grad_norm": 0.021189266815781593, |
| "learning_rate": 0.00011476448080221667, |
| "loss": 0.0051, |
| "step": 3235 |
| }, |
| { |
| "epoch": 0.8542614198141191, |
| "grad_norm": 0.013394076377153397, |
| "learning_rate": 0.0001146325372740467, |
| "loss": 0.021, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.8555797244743261, |
| "grad_norm": 0.19848507642745972, |
| "learning_rate": 0.00011450059374587676, |
| "loss": 0.0285, |
| "step": 3245 |
| }, |
| { |
| "epoch": 0.856898029134533, |
| "grad_norm": 0.2463046759366989, |
| "learning_rate": 0.00011436865021770683, |
| "loss": 0.0384, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.8582163337947399, |
| "grad_norm": 0.37432390451431274, |
| "learning_rate": 0.00011423670668953688, |
| "loss": 0.0098, |
| "step": 3255 |
| }, |
| { |
| "epoch": 0.8595346384549469, |
| "grad_norm": 0.060943394899368286, |
| "learning_rate": 0.00011410476316136694, |
| "loss": 0.0087, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.8608529431151539, |
| "grad_norm": 0.2846696674823761, |
| "learning_rate": 0.00011397281963319699, |
| "loss": 0.0148, |
| "step": 3265 |
| }, |
| { |
| "epoch": 0.8621712477753609, |
| "grad_norm": 0.009311323054134846, |
| "learning_rate": 0.00011384087610502705, |
| "loss": 0.0024, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.8634895524355679, |
| "grad_norm": 0.046277035027742386, |
| "learning_rate": 0.0001137089325768571, |
| "loss": 0.0274, |
| "step": 3275 |
| }, |
| { |
| "epoch": 0.8648078570957748, |
| "grad_norm": 0.006024620030075312, |
| "learning_rate": 0.00011357698904868716, |
| "loss": 0.0286, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.8661261617559818, |
| "grad_norm": 0.033578380942344666, |
| "learning_rate": 0.00011344504552051723, |
| "loss": 0.0153, |
| "step": 3285 |
| }, |
| { |
| "epoch": 0.8674444664161888, |
| "grad_norm": 0.8537917137145996, |
| "learning_rate": 0.00011331310199234728, |
| "loss": 0.0304, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.8687627710763958, |
| "grad_norm": 0.013933337293565273, |
| "learning_rate": 0.00011318115846417734, |
| "loss": 0.0112, |
| "step": 3295 |
| }, |
| { |
| "epoch": 0.8700810757366028, |
| "grad_norm": 0.35437721014022827, |
| "learning_rate": 0.0001130492149360074, |
| "loss": 0.0228, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.8713993803968098, |
| "grad_norm": 1.3024121522903442, |
| "learning_rate": 0.00011291727140783745, |
| "loss": 0.0203, |
| "step": 3305 |
| }, |
| { |
| "epoch": 0.8727176850570166, |
| "grad_norm": 0.5131255984306335, |
| "learning_rate": 0.00011278532787966751, |
| "loss": 0.0181, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.8740359897172236, |
| "grad_norm": 0.039366886019706726, |
| "learning_rate": 0.00011265338435149758, |
| "loss": 0.0192, |
| "step": 3315 |
| }, |
| { |
| "epoch": 0.8753542943774306, |
| "grad_norm": 0.13679669797420502, |
| "learning_rate": 0.00011252144082332763, |
| "loss": 0.004, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.8766725990376376, |
| "grad_norm": 0.003076886525377631, |
| "learning_rate": 0.00011238949729515769, |
| "loss": 0.0405, |
| "step": 3325 |
| }, |
| { |
| "epoch": 0.8779909036978446, |
| "grad_norm": 0.019953785464167595, |
| "learning_rate": 0.00011225755376698774, |
| "loss": 0.0241, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.8793092083580516, |
| "grad_norm": 0.007980377413332462, |
| "learning_rate": 0.0001121256102388178, |
| "loss": 0.0064, |
| "step": 3335 |
| }, |
| { |
| "epoch": 0.8806275130182585, |
| "grad_norm": 0.018761295825242996, |
| "learning_rate": 0.00011199366671064784, |
| "loss": 0.0032, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.8819458176784655, |
| "grad_norm": 0.022511709481477737, |
| "learning_rate": 0.0001118617231824779, |
| "loss": 0.0055, |
| "step": 3345 |
| }, |
| { |
| "epoch": 0.8832641223386725, |
| "grad_norm": 0.021270718425512314, |
| "learning_rate": 0.00011172977965430795, |
| "loss": 0.033, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.8845824269988795, |
| "grad_norm": 0.02710561640560627, |
| "learning_rate": 0.00011159783612613801, |
| "loss": 0.0094, |
| "step": 3355 |
| }, |
| { |
| "epoch": 0.8859007316590864, |
| "grad_norm": 0.4353378117084503, |
| "learning_rate": 0.00011146589259796806, |
| "loss": 0.0089, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.8872190363192934, |
| "grad_norm": 0.0257766991853714, |
| "learning_rate": 0.00011133394906979813, |
| "loss": 0.0059, |
| "step": 3365 |
| }, |
| { |
| "epoch": 0.8885373409795003, |
| "grad_norm": 0.80838942527771, |
| "learning_rate": 0.00011120200554162819, |
| "loss": 0.0263, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.8898556456397073, |
| "grad_norm": 0.007799761835485697, |
| "learning_rate": 0.00011107006201345824, |
| "loss": 0.0028, |
| "step": 3375 |
| }, |
| { |
| "epoch": 0.8911739502999143, |
| "grad_norm": 0.007315775845199823, |
| "learning_rate": 0.0001109381184852883, |
| "loss": 0.0127, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.8924922549601213, |
| "grad_norm": 1.4861233234405518, |
| "learning_rate": 0.00011080617495711836, |
| "loss": 0.0562, |
| "step": 3385 |
| }, |
| { |
| "epoch": 0.8938105596203283, |
| "grad_norm": 0.010219530202448368, |
| "learning_rate": 0.00011067423142894841, |
| "loss": 0.0438, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.8951288642805353, |
| "grad_norm": 1.0191857814788818, |
| "learning_rate": 0.00011054228790077848, |
| "loss": 0.0493, |
| "step": 3395 |
| }, |
| { |
| "epoch": 0.8964471689407422, |
| "grad_norm": 0.01459536887705326, |
| "learning_rate": 0.00011041034437260854, |
| "loss": 0.0117, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.8977654736009492, |
| "grad_norm": 0.008682495914399624, |
| "learning_rate": 0.00011027840084443859, |
| "loss": 0.02, |
| "step": 3405 |
| }, |
| { |
| "epoch": 0.8990837782611562, |
| "grad_norm": 0.02197263017296791, |
| "learning_rate": 0.00011014645731626865, |
| "loss": 0.0454, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.9004020829213631, |
| "grad_norm": 0.01436714269220829, |
| "learning_rate": 0.0001100145137880987, |
| "loss": 0.0283, |
| "step": 3415 |
| }, |
| { |
| "epoch": 0.9017203875815701, |
| "grad_norm": 0.14327946305274963, |
| "learning_rate": 0.00010988257025992876, |
| "loss": 0.0461, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.9030386922417771, |
| "grad_norm": 1.671773910522461, |
| "learning_rate": 0.00010975062673175883, |
| "loss": 0.054, |
| "step": 3425 |
| }, |
| { |
| "epoch": 0.904356996901984, |
| "grad_norm": 0.009926804341375828, |
| "learning_rate": 0.00010961868320358888, |
| "loss": 0.0429, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.905675301562191, |
| "grad_norm": 0.554020881652832, |
| "learning_rate": 0.00010948673967541894, |
| "loss": 0.0618, |
| "step": 3435 |
| }, |
| { |
| "epoch": 0.906993606222398, |
| "grad_norm": 0.1399248093366623, |
| "learning_rate": 0.00010935479614724897, |
| "loss": 0.0229, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.908311910882605, |
| "grad_norm": 0.02739197015762329, |
| "learning_rate": 0.00010922285261907904, |
| "loss": 0.0082, |
| "step": 3445 |
| }, |
| { |
| "epoch": 0.909630215542812, |
| "grad_norm": 0.33394527435302734, |
| "learning_rate": 0.00010909090909090909, |
| "loss": 0.0403, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.9109485202030189, |
| "grad_norm": 0.08083894103765488, |
| "learning_rate": 0.00010895896556273915, |
| "loss": 0.0406, |
| "step": 3455 |
| }, |
| { |
| "epoch": 0.9122668248632259, |
| "grad_norm": 0.39336663484573364, |
| "learning_rate": 0.0001088270220345692, |
| "loss": 0.02, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.9135851295234328, |
| "grad_norm": 0.20481553673744202, |
| "learning_rate": 0.00010869507850639926, |
| "loss": 0.0221, |
| "step": 3465 |
| }, |
| { |
| "epoch": 0.9149034341836398, |
| "grad_norm": 1.4507408142089844, |
| "learning_rate": 0.00010856313497822932, |
| "loss": 0.0357, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.9162217388438468, |
| "grad_norm": 0.2678806483745575, |
| "learning_rate": 0.00010843119145005937, |
| "loss": 0.0181, |
| "step": 3475 |
| }, |
| { |
| "epoch": 0.9175400435040538, |
| "grad_norm": 0.007361674215644598, |
| "learning_rate": 0.00010829924792188944, |
| "loss": 0.0978, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.9188583481642607, |
| "grad_norm": 0.773695707321167, |
| "learning_rate": 0.0001081673043937195, |
| "loss": 0.0401, |
| "step": 3485 |
| }, |
| { |
| "epoch": 0.9201766528244677, |
| "grad_norm": 0.0010772625682875514, |
| "learning_rate": 0.00010803536086554955, |
| "loss": 0.0233, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.9214949574846747, |
| "grad_norm": 0.08971104770898819, |
| "learning_rate": 0.00010790341733737961, |
| "loss": 0.0319, |
| "step": 3495 |
| }, |
| { |
| "epoch": 0.9228132621448817, |
| "grad_norm": 0.21372731029987335, |
| "learning_rate": 0.00010777147380920966, |
| "loss": 0.0315, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.9228132621448817, |
| "eval_loss": 0.02952708676457405, |
| "eval_runtime": 451.5837, |
| "eval_samples_per_second": 7.467, |
| "eval_steps_per_second": 3.734, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.9241315668050887, |
| "grad_norm": 0.016639264300465584, |
| "learning_rate": 0.00010763953028103972, |
| "loss": 0.0125, |
| "step": 3505 |
| }, |
| { |
| "epoch": 0.9254498714652957, |
| "grad_norm": 0.46340492367744446, |
| "learning_rate": 0.00010750758675286979, |
| "loss": 0.0186, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.9267681761255026, |
| "grad_norm": 0.01847526989877224, |
| "learning_rate": 0.00010737564322469984, |
| "loss": 0.0026, |
| "step": 3515 |
| }, |
| { |
| "epoch": 0.9280864807857095, |
| "grad_norm": 0.5947860479354858, |
| "learning_rate": 0.0001072436996965299, |
| "loss": 0.0259, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.9294047854459165, |
| "grad_norm": 0.06145291402935982, |
| "learning_rate": 0.00010711175616835995, |
| "loss": 0.0057, |
| "step": 3525 |
| }, |
| { |
| "epoch": 0.9307230901061235, |
| "grad_norm": 0.0143959429115057, |
| "learning_rate": 0.00010697981264019001, |
| "loss": 0.0145, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.9320413947663305, |
| "grad_norm": 0.21143831312656403, |
| "learning_rate": 0.00010684786911202007, |
| "loss": 0.0459, |
| "step": 3535 |
| }, |
| { |
| "epoch": 0.9333596994265375, |
| "grad_norm": 0.02548077143728733, |
| "learning_rate": 0.00010671592558385011, |
| "loss": 0.0051, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.9346780040867444, |
| "grad_norm": 0.008077048696577549, |
| "learning_rate": 0.00010658398205568016, |
| "loss": 0.0306, |
| "step": 3545 |
| }, |
| { |
| "epoch": 0.9359963087469514, |
| "grad_norm": 0.0030760422814637423, |
| "learning_rate": 0.00010645203852751022, |
| "loss": 0.0575, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.9373146134071584, |
| "grad_norm": 0.18114158511161804, |
| "learning_rate": 0.00010632009499934027, |
| "loss": 0.0885, |
| "step": 3555 |
| }, |
| { |
| "epoch": 0.9386329180673654, |
| "grad_norm": 0.02450549602508545, |
| "learning_rate": 0.00010618815147117034, |
| "loss": 0.0045, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.9399512227275724, |
| "grad_norm": 0.1238626018166542, |
| "learning_rate": 0.0001060562079430004, |
| "loss": 0.0166, |
| "step": 3565 |
| }, |
| { |
| "epoch": 0.9412695273877794, |
| "grad_norm": 0.1879919469356537, |
| "learning_rate": 0.00010592426441483046, |
| "loss": 0.0077, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.9425878320479862, |
| "grad_norm": 0.11323565989732742, |
| "learning_rate": 0.00010579232088666051, |
| "loss": 0.0213, |
| "step": 3575 |
| }, |
| { |
| "epoch": 0.9439061367081932, |
| "grad_norm": 0.35575854778289795, |
| "learning_rate": 0.00010566037735849057, |
| "loss": 0.0336, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.9452244413684002, |
| "grad_norm": 0.14052227139472961, |
| "learning_rate": 0.00010552843383032062, |
| "loss": 0.0325, |
| "step": 3585 |
| }, |
| { |
| "epoch": 0.9465427460286072, |
| "grad_norm": 0.2643798887729645, |
| "learning_rate": 0.00010539649030215069, |
| "loss": 0.0192, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.9478610506888142, |
| "grad_norm": 0.3207031190395355, |
| "learning_rate": 0.00010526454677398075, |
| "loss": 0.0221, |
| "step": 3595 |
| }, |
| { |
| "epoch": 0.9491793553490212, |
| "grad_norm": 0.022803861647844315, |
| "learning_rate": 0.0001051326032458108, |
| "loss": 0.029, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.9504976600092281, |
| "grad_norm": 0.02511664852499962, |
| "learning_rate": 0.00010500065971764086, |
| "loss": 0.0422, |
| "step": 3605 |
| }, |
| { |
| "epoch": 0.9518159646694351, |
| "grad_norm": 0.06505445390939713, |
| "learning_rate": 0.00010486871618947091, |
| "loss": 0.0092, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.9531342693296421, |
| "grad_norm": 0.09998584538698196, |
| "learning_rate": 0.00010473677266130097, |
| "loss": 0.0242, |
| "step": 3615 |
| }, |
| { |
| "epoch": 0.9544525739898491, |
| "grad_norm": 0.9645698666572571, |
| "learning_rate": 0.00010460482913313104, |
| "loss": 0.0124, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.955770878650056, |
| "grad_norm": 0.2389964610338211, |
| "learning_rate": 0.0001044728856049611, |
| "loss": 0.0169, |
| "step": 3625 |
| }, |
| { |
| "epoch": 0.957089183310263, |
| "grad_norm": 2.030608654022217, |
| "learning_rate": 0.00010434094207679115, |
| "loss": 0.0518, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.9584074879704699, |
| "grad_norm": 0.05979987606406212, |
| "learning_rate": 0.0001042089985486212, |
| "loss": 0.0081, |
| "step": 3635 |
| }, |
| { |
| "epoch": 0.9597257926306769, |
| "grad_norm": 0.15761719644069672, |
| "learning_rate": 0.00010407705502045125, |
| "loss": 0.0061, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.9610440972908839, |
| "grad_norm": 0.6534290909767151, |
| "learning_rate": 0.0001039451114922813, |
| "loss": 0.0104, |
| "step": 3645 |
| }, |
| { |
| "epoch": 0.9623624019510909, |
| "grad_norm": 1.0324147939682007, |
| "learning_rate": 0.00010381316796411136, |
| "loss": 0.0381, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.9636807066112979, |
| "grad_norm": 0.002968872431665659, |
| "learning_rate": 0.00010368122443594142, |
| "loss": 0.0343, |
| "step": 3655 |
| }, |
| { |
| "epoch": 0.9649990112715049, |
| "grad_norm": 0.011243184097111225, |
| "learning_rate": 0.00010354928090777147, |
| "loss": 0.019, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.9663173159317118, |
| "grad_norm": 0.17663739621639252, |
| "learning_rate": 0.00010341733737960153, |
| "loss": 0.0452, |
| "step": 3665 |
| }, |
| { |
| "epoch": 0.9676356205919188, |
| "grad_norm": 1.2647719383239746, |
| "learning_rate": 0.00010328539385143158, |
| "loss": 0.0154, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.9689539252521258, |
| "grad_norm": 0.3691752552986145, |
| "learning_rate": 0.00010315345032326165, |
| "loss": 0.028, |
| "step": 3675 |
| }, |
| { |
| "epoch": 0.9702722299123328, |
| "grad_norm": 0.0015879774000495672, |
| "learning_rate": 0.00010302150679509171, |
| "loss": 0.0202, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.9715905345725397, |
| "grad_norm": 0.1441984623670578, |
| "learning_rate": 0.00010288956326692176, |
| "loss": 0.0221, |
| "step": 3685 |
| }, |
| { |
| "epoch": 0.9729088392327467, |
| "grad_norm": 0.20431455969810486, |
| "learning_rate": 0.00010275761973875182, |
| "loss": 0.0072, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.9742271438929536, |
| "grad_norm": 0.861625611782074, |
| "learning_rate": 0.00010262567621058187, |
| "loss": 0.0523, |
| "step": 3695 |
| }, |
| { |
| "epoch": 0.9755454485531606, |
| "grad_norm": 0.005049478262662888, |
| "learning_rate": 0.00010249373268241193, |
| "loss": 0.0051, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.9768637532133676, |
| "grad_norm": 0.49685510993003845, |
| "learning_rate": 0.000102361789154242, |
| "loss": 0.023, |
| "step": 3705 |
| }, |
| { |
| "epoch": 0.9781820578735746, |
| "grad_norm": 0.08789395540952682, |
| "learning_rate": 0.00010222984562607205, |
| "loss": 0.0159, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.9795003625337816, |
| "grad_norm": 0.027168691158294678, |
| "learning_rate": 0.00010209790209790211, |
| "loss": 0.0083, |
| "step": 3715 |
| }, |
| { |
| "epoch": 0.9808186671939886, |
| "grad_norm": 0.0006773864733986557, |
| "learning_rate": 0.00010196595856973217, |
| "loss": 0.0048, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.9821369718541955, |
| "grad_norm": 0.01636457070708275, |
| "learning_rate": 0.00010183401504156222, |
| "loss": 0.0159, |
| "step": 3725 |
| }, |
| { |
| "epoch": 0.9834552765144025, |
| "grad_norm": 0.10160859674215317, |
| "learning_rate": 0.00010170207151339228, |
| "loss": 0.0047, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.9847735811746094, |
| "grad_norm": 0.14173269271850586, |
| "learning_rate": 0.00010157012798522232, |
| "loss": 0.006, |
| "step": 3735 |
| }, |
| { |
| "epoch": 0.9860918858348164, |
| "grad_norm": 0.003458512481302023, |
| "learning_rate": 0.00010143818445705238, |
| "loss": 0.0193, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.9874101904950234, |
| "grad_norm": 0.005163820460438728, |
| "learning_rate": 0.00010130624092888243, |
| "loss": 0.0039, |
| "step": 3745 |
| }, |
| { |
| "epoch": 0.9887284951552304, |
| "grad_norm": 0.005913791712373495, |
| "learning_rate": 0.00010117429740071249, |
| "loss": 0.0119, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.9900467998154373, |
| "grad_norm": 0.00800853967666626, |
| "learning_rate": 0.00010104235387254256, |
| "loss": 0.044, |
| "step": 3755 |
| }, |
| { |
| "epoch": 0.9913651044756443, |
| "grad_norm": 0.18146778643131256, |
| "learning_rate": 0.00010091041034437261, |
| "loss": 0.0048, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.9926834091358513, |
| "grad_norm": 0.01235104724764824, |
| "learning_rate": 0.00010077846681620267, |
| "loss": 0.0017, |
| "step": 3765 |
| }, |
| { |
| "epoch": 0.9940017137960583, |
| "grad_norm": 0.17677897214889526, |
| "learning_rate": 0.00010064652328803272, |
| "loss": 0.0339, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.9953200184562653, |
| "grad_norm": 0.0017472271574661136, |
| "learning_rate": 0.00010051457975986278, |
| "loss": 0.0494, |
| "step": 3775 |
| }, |
| { |
| "epoch": 0.9966383231164723, |
| "grad_norm": 0.10814860463142395, |
| "learning_rate": 0.00010038263623169283, |
| "loss": 0.0741, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.9979566277766792, |
| "grad_norm": 0.11329760402441025, |
| "learning_rate": 0.0001002506927035229, |
| "loss": 0.0182, |
| "step": 3785 |
| }, |
| { |
| "epoch": 0.9992749324368861, |
| "grad_norm": 0.11573276668787003, |
| "learning_rate": 0.00010011874917535296, |
| "loss": 0.0068, |
| "step": 3790 |
| }, |
| { |
| "epoch": 1.000790982796124, |
| "grad_norm": 0.08449886739253998, |
| "learning_rate": 9.998680564718301e-05, |
| "loss": 0.0141, |
| "step": 3795 |
| }, |
| { |
| "epoch": 1.002109287456331, |
| "grad_norm": 0.05035184696316719, |
| "learning_rate": 9.985486211901307e-05, |
| "loss": 0.0293, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.003427592116538, |
| "grad_norm": 0.0255444198846817, |
| "learning_rate": 9.972291859084313e-05, |
| "loss": 0.0054, |
| "step": 3805 |
| }, |
| { |
| "epoch": 1.004745896776745, |
| "grad_norm": 0.0033677336759865284, |
| "learning_rate": 9.959097506267318e-05, |
| "loss": 0.0567, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.006064201436952, |
| "grad_norm": 0.09453682601451874, |
| "learning_rate": 9.945903153450324e-05, |
| "loss": 0.0589, |
| "step": 3815 |
| }, |
| { |
| "epoch": 1.007382506097159, |
| "grad_norm": 0.01592979207634926, |
| "learning_rate": 9.932708800633329e-05, |
| "loss": 0.0043, |
| "step": 3820 |
| }, |
| { |
| "epoch": 1.008700810757366, |
| "grad_norm": 0.002263693604618311, |
| "learning_rate": 9.919514447816335e-05, |
| "loss": 0.0195, |
| "step": 3825 |
| }, |
| { |
| "epoch": 1.010019115417573, |
| "grad_norm": 0.013390793465077877, |
| "learning_rate": 9.90632009499934e-05, |
| "loss": 0.0152, |
| "step": 3830 |
| }, |
| { |
| "epoch": 1.01133742007778, |
| "grad_norm": 0.10473847389221191, |
| "learning_rate": 9.893125742182346e-05, |
| "loss": 0.0606, |
| "step": 3835 |
| }, |
| { |
| "epoch": 1.012655724737987, |
| "grad_norm": 0.05837221071124077, |
| "learning_rate": 9.879931389365353e-05, |
| "loss": 0.0121, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.013974029398194, |
| "grad_norm": 0.3803791105747223, |
| "learning_rate": 9.866737036548358e-05, |
| "loss": 0.0386, |
| "step": 3845 |
| }, |
| { |
| "epoch": 1.0152923340584008, |
| "grad_norm": 0.4067519009113312, |
| "learning_rate": 9.853542683731364e-05, |
| "loss": 0.0115, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.0166106387186078, |
| "grad_norm": 0.02585229091346264, |
| "learning_rate": 9.84034833091437e-05, |
| "loss": 0.0214, |
| "step": 3855 |
| }, |
| { |
| "epoch": 1.0179289433788148, |
| "grad_norm": 0.03670825809240341, |
| "learning_rate": 9.827153978097374e-05, |
| "loss": 0.0059, |
| "step": 3860 |
| }, |
| { |
| "epoch": 1.0192472480390218, |
| "grad_norm": 0.014171554706990719, |
| "learning_rate": 9.81395962528038e-05, |
| "loss": 0.0145, |
| "step": 3865 |
| }, |
| { |
| "epoch": 1.0205655526992288, |
| "grad_norm": 0.027376385405659676, |
| "learning_rate": 9.800765272463386e-05, |
| "loss": 0.0089, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.0218838573594358, |
| "grad_norm": 0.03168405964970589, |
| "learning_rate": 9.787570919646392e-05, |
| "loss": 0.0132, |
| "step": 3875 |
| }, |
| { |
| "epoch": 1.0232021620196428, |
| "grad_norm": 0.03346199914813042, |
| "learning_rate": 9.774376566829397e-05, |
| "loss": 0.0246, |
| "step": 3880 |
| }, |
| { |
| "epoch": 1.0245204666798498, |
| "grad_norm": 0.00894144270569086, |
| "learning_rate": 9.761182214012403e-05, |
| "loss": 0.0105, |
| "step": 3885 |
| }, |
| { |
| "epoch": 1.0258387713400567, |
| "grad_norm": 0.3172806203365326, |
| "learning_rate": 9.747987861195409e-05, |
| "loss": 0.0103, |
| "step": 3890 |
| }, |
| { |
| "epoch": 1.0271570760002637, |
| "grad_norm": 0.009055040776729584, |
| "learning_rate": 9.734793508378414e-05, |
| "loss": 0.0103, |
| "step": 3895 |
| }, |
| { |
| "epoch": 1.0284753806604707, |
| "grad_norm": 0.014140011742711067, |
| "learning_rate": 9.721599155561421e-05, |
| "loss": 0.0037, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.0297936853206777, |
| "grad_norm": 0.008317383006215096, |
| "learning_rate": 9.708404802744427e-05, |
| "loss": 0.002, |
| "step": 3905 |
| }, |
| { |
| "epoch": 1.0311119899808845, |
| "grad_norm": 0.005038558971136808, |
| "learning_rate": 9.695210449927431e-05, |
| "loss": 0.0017, |
| "step": 3910 |
| }, |
| { |
| "epoch": 1.0324302946410915, |
| "grad_norm": 0.40058520436286926, |
| "learning_rate": 9.682016097110436e-05, |
| "loss": 0.0065, |
| "step": 3915 |
| }, |
| { |
| "epoch": 1.0337485993012985, |
| "grad_norm": 0.005197151098400354, |
| "learning_rate": 9.668821744293442e-05, |
| "loss": 0.0031, |
| "step": 3920 |
| }, |
| { |
| "epoch": 1.0350669039615055, |
| "grad_norm": 0.014353781007230282, |
| "learning_rate": 9.655627391476449e-05, |
| "loss": 0.0009, |
| "step": 3925 |
| }, |
| { |
| "epoch": 1.0363852086217125, |
| "grad_norm": 0.13260559737682343, |
| "learning_rate": 9.642433038659454e-05, |
| "loss": 0.0323, |
| "step": 3930 |
| }, |
| { |
| "epoch": 1.0377035132819195, |
| "grad_norm": 0.006795065477490425, |
| "learning_rate": 9.62923868584246e-05, |
| "loss": 0.0022, |
| "step": 3935 |
| }, |
| { |
| "epoch": 1.0390218179421264, |
| "grad_norm": 0.2276086062192917, |
| "learning_rate": 9.616044333025466e-05, |
| "loss": 0.0221, |
| "step": 3940 |
| }, |
| { |
| "epoch": 1.0403401226023334, |
| "grad_norm": 0.06121920794248581, |
| "learning_rate": 9.602849980208471e-05, |
| "loss": 0.0037, |
| "step": 3945 |
| }, |
| { |
| "epoch": 1.0416584272625404, |
| "grad_norm": 0.9180755019187927, |
| "learning_rate": 9.589655627391477e-05, |
| "loss": 0.0589, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.0429767319227474, |
| "grad_norm": 0.07515591382980347, |
| "learning_rate": 9.576461274574484e-05, |
| "loss": 0.0653, |
| "step": 3955 |
| }, |
| { |
| "epoch": 1.0442950365829544, |
| "grad_norm": 0.018060607835650444, |
| "learning_rate": 9.563266921757488e-05, |
| "loss": 0.0178, |
| "step": 3960 |
| }, |
| { |
| "epoch": 1.0456133412431612, |
| "grad_norm": 0.02751368284225464, |
| "learning_rate": 9.550072568940493e-05, |
| "loss": 0.0076, |
| "step": 3965 |
| }, |
| { |
| "epoch": 1.0469316459033682, |
| "grad_norm": 0.653998613357544, |
| "learning_rate": 9.536878216123499e-05, |
| "loss": 0.0066, |
| "step": 3970 |
| }, |
| { |
| "epoch": 1.0482499505635752, |
| "grad_norm": 0.3117768168449402, |
| "learning_rate": 9.523683863306505e-05, |
| "loss": 0.0087, |
| "step": 3975 |
| }, |
| { |
| "epoch": 1.0495682552237822, |
| "grad_norm": 0.013952831737697124, |
| "learning_rate": 9.510489510489511e-05, |
| "loss": 0.0037, |
| "step": 3980 |
| }, |
| { |
| "epoch": 1.0508865598839892, |
| "grad_norm": 0.01806250400841236, |
| "learning_rate": 9.497295157672517e-05, |
| "loss": 0.0028, |
| "step": 3985 |
| }, |
| { |
| "epoch": 1.0522048645441962, |
| "grad_norm": 0.13678006827831268, |
| "learning_rate": 9.484100804855523e-05, |
| "loss": 0.0533, |
| "step": 3990 |
| }, |
| { |
| "epoch": 1.0535231692044031, |
| "grad_norm": 0.14869382977485657, |
| "learning_rate": 9.470906452038528e-05, |
| "loss": 0.009, |
| "step": 3995 |
| }, |
| { |
| "epoch": 1.0548414738646101, |
| "grad_norm": 0.33614659309387207, |
| "learning_rate": 9.457712099221534e-05, |
| "loss": 0.0555, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.0548414738646101, |
| "eval_loss": 0.026165226474404335, |
| "eval_runtime": 452.2482, |
| "eval_samples_per_second": 7.456, |
| "eval_steps_per_second": 3.728, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.0561597785248171, |
| "grad_norm": 0.007546027656644583, |
| "learning_rate": 9.444517746404539e-05, |
| "loss": 0.0029, |
| "step": 4005 |
| }, |
| { |
| "epoch": 1.0574780831850241, |
| "grad_norm": 0.3720332384109497, |
| "learning_rate": 9.431323393587545e-05, |
| "loss": 0.0353, |
| "step": 4010 |
| }, |
| { |
| "epoch": 1.0587963878452311, |
| "grad_norm": 1.1335264444351196, |
| "learning_rate": 9.41812904077055e-05, |
| "loss": 0.0142, |
| "step": 4015 |
| }, |
| { |
| "epoch": 1.060114692505438, |
| "grad_norm": 0.024723488837480545, |
| "learning_rate": 9.404934687953556e-05, |
| "loss": 0.006, |
| "step": 4020 |
| }, |
| { |
| "epoch": 1.0614329971656449, |
| "grad_norm": 0.040354058146476746, |
| "learning_rate": 9.391740335136562e-05, |
| "loss": 0.0107, |
| "step": 4025 |
| }, |
| { |
| "epoch": 1.0627513018258519, |
| "grad_norm": 0.222810298204422, |
| "learning_rate": 9.378545982319567e-05, |
| "loss": 0.0273, |
| "step": 4030 |
| }, |
| { |
| "epoch": 1.0640696064860589, |
| "grad_norm": 0.025684095919132233, |
| "learning_rate": 9.365351629502574e-05, |
| "loss": 0.0033, |
| "step": 4035 |
| }, |
| { |
| "epoch": 1.0653879111462659, |
| "grad_norm": 0.05338352546095848, |
| "learning_rate": 9.35215727668558e-05, |
| "loss": 0.0052, |
| "step": 4040 |
| }, |
| { |
| "epoch": 1.0667062158064728, |
| "grad_norm": 0.06182330474257469, |
| "learning_rate": 9.338962923868585e-05, |
| "loss": 0.0038, |
| "step": 4045 |
| }, |
| { |
| "epoch": 1.0680245204666798, |
| "grad_norm": 0.012170832604169846, |
| "learning_rate": 9.325768571051591e-05, |
| "loss": 0.0018, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.0693428251268868, |
| "grad_norm": 0.5424306392669678, |
| "learning_rate": 9.312574218234596e-05, |
| "loss": 0.0445, |
| "step": 4055 |
| }, |
| { |
| "epoch": 1.0706611297870938, |
| "grad_norm": 0.017939254641532898, |
| "learning_rate": 9.299379865417602e-05, |
| "loss": 0.0389, |
| "step": 4060 |
| }, |
| { |
| "epoch": 1.0719794344473008, |
| "grad_norm": 0.0060431682504713535, |
| "learning_rate": 9.286185512600607e-05, |
| "loss": 0.0025, |
| "step": 4065 |
| }, |
| { |
| "epoch": 1.0732977391075078, |
| "grad_norm": 0.0071444883942604065, |
| "learning_rate": 9.272991159783613e-05, |
| "loss": 0.0333, |
| "step": 4070 |
| }, |
| { |
| "epoch": 1.0746160437677148, |
| "grad_norm": 0.29632750153541565, |
| "learning_rate": 9.259796806966619e-05, |
| "loss": 0.0151, |
| "step": 4075 |
| }, |
| { |
| "epoch": 1.0759343484279218, |
| "grad_norm": 0.004526323173195124, |
| "learning_rate": 9.246602454149624e-05, |
| "loss": 0.006, |
| "step": 4080 |
| }, |
| { |
| "epoch": 1.0772526530881286, |
| "grad_norm": 0.023945212364196777, |
| "learning_rate": 9.23340810133263e-05, |
| "loss": 0.004, |
| "step": 4085 |
| }, |
| { |
| "epoch": 1.0785709577483356, |
| "grad_norm": 0.13235126435756683, |
| "learning_rate": 9.220213748515635e-05, |
| "loss": 0.0059, |
| "step": 4090 |
| }, |
| { |
| "epoch": 1.0798892624085425, |
| "grad_norm": 0.17592330276966095, |
| "learning_rate": 9.207019395698642e-05, |
| "loss": 0.0302, |
| "step": 4095 |
| }, |
| { |
| "epoch": 1.0812075670687495, |
| "grad_norm": 0.004582866560667753, |
| "learning_rate": 9.193825042881648e-05, |
| "loss": 0.009, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.0825258717289565, |
| "grad_norm": 0.15214525163173676, |
| "learning_rate": 9.180630690064653e-05, |
| "loss": 0.0062, |
| "step": 4105 |
| }, |
| { |
| "epoch": 1.0838441763891635, |
| "grad_norm": 0.16535983979701996, |
| "learning_rate": 9.167436337247658e-05, |
| "loss": 0.0926, |
| "step": 4110 |
| }, |
| { |
| "epoch": 1.0851624810493705, |
| "grad_norm": 0.013285227119922638, |
| "learning_rate": 9.154241984430663e-05, |
| "loss": 0.0043, |
| "step": 4115 |
| }, |
| { |
| "epoch": 1.0864807857095775, |
| "grad_norm": 0.012116984464228153, |
| "learning_rate": 9.14104763161367e-05, |
| "loss": 0.0037, |
| "step": 4120 |
| }, |
| { |
| "epoch": 1.0877990903697845, |
| "grad_norm": 0.0373845212161541, |
| "learning_rate": 9.127853278796676e-05, |
| "loss": 0.0081, |
| "step": 4125 |
| }, |
| { |
| "epoch": 1.0891173950299915, |
| "grad_norm": 0.09324615448713303, |
| "learning_rate": 9.114658925979681e-05, |
| "loss": 0.0534, |
| "step": 4130 |
| }, |
| { |
| "epoch": 1.0904356996901985, |
| "grad_norm": 0.010992968454957008, |
| "learning_rate": 9.101464573162687e-05, |
| "loss": 0.0025, |
| "step": 4135 |
| }, |
| { |
| "epoch": 1.0917540043504055, |
| "grad_norm": 0.13710318505764008, |
| "learning_rate": 9.088270220345692e-05, |
| "loss": 0.0555, |
| "step": 4140 |
| }, |
| { |
| "epoch": 1.0930723090106123, |
| "grad_norm": 0.010403074324131012, |
| "learning_rate": 9.075075867528698e-05, |
| "loss": 0.0042, |
| "step": 4145 |
| }, |
| { |
| "epoch": 1.0943906136708192, |
| "grad_norm": 0.21544460952281952, |
| "learning_rate": 9.061881514711705e-05, |
| "loss": 0.0144, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.0957089183310262, |
| "grad_norm": 0.04194799065589905, |
| "learning_rate": 9.04868716189471e-05, |
| "loss": 0.0106, |
| "step": 4155 |
| }, |
| { |
| "epoch": 1.0970272229912332, |
| "grad_norm": 0.029204202815890312, |
| "learning_rate": 9.035492809077715e-05, |
| "loss": 0.0085, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.0983455276514402, |
| "grad_norm": 0.006751026958227158, |
| "learning_rate": 9.02229845626072e-05, |
| "loss": 0.0049, |
| "step": 4165 |
| }, |
| { |
| "epoch": 1.0996638323116472, |
| "grad_norm": 0.008232722990214825, |
| "learning_rate": 9.009104103443726e-05, |
| "loss": 0.0172, |
| "step": 4170 |
| }, |
| { |
| "epoch": 1.1009821369718542, |
| "grad_norm": 0.05630079656839371, |
| "learning_rate": 8.995909750626733e-05, |
| "loss": 0.0112, |
| "step": 4175 |
| }, |
| { |
| "epoch": 1.1023004416320612, |
| "grad_norm": 0.0011601662263274193, |
| "learning_rate": 8.982715397809738e-05, |
| "loss": 0.0317, |
| "step": 4180 |
| }, |
| { |
| "epoch": 1.1036187462922682, |
| "grad_norm": 0.006554402410984039, |
| "learning_rate": 8.969521044992744e-05, |
| "loss": 0.0035, |
| "step": 4185 |
| }, |
| { |
| "epoch": 1.1049370509524752, |
| "grad_norm": 0.34513652324676514, |
| "learning_rate": 8.956326692175749e-05, |
| "loss": 0.0036, |
| "step": 4190 |
| }, |
| { |
| "epoch": 1.1062553556126822, |
| "grad_norm": 0.283669650554657, |
| "learning_rate": 8.943132339358755e-05, |
| "loss": 0.0182, |
| "step": 4195 |
| }, |
| { |
| "epoch": 1.1075736602728892, |
| "grad_norm": 0.5376952290534973, |
| "learning_rate": 8.92993798654176e-05, |
| "loss": 0.0293, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.108891964933096, |
| "grad_norm": 0.01689724065363407, |
| "learning_rate": 8.916743633724767e-05, |
| "loss": 0.0206, |
| "step": 4205 |
| }, |
| { |
| "epoch": 1.110210269593303, |
| "grad_norm": 0.026538770645856857, |
| "learning_rate": 8.903549280907772e-05, |
| "loss": 0.0181, |
| "step": 4210 |
| }, |
| { |
| "epoch": 1.11152857425351, |
| "grad_norm": 0.6372873783111572, |
| "learning_rate": 8.890354928090777e-05, |
| "loss": 0.021, |
| "step": 4215 |
| }, |
| { |
| "epoch": 1.112846878913717, |
| "grad_norm": 0.06177428737282753, |
| "learning_rate": 8.877160575273783e-05, |
| "loss": 0.0033, |
| "step": 4220 |
| }, |
| { |
| "epoch": 1.114165183573924, |
| "grad_norm": 0.3712109923362732, |
| "learning_rate": 8.863966222456788e-05, |
| "loss": 0.0075, |
| "step": 4225 |
| }, |
| { |
| "epoch": 1.115483488234131, |
| "grad_norm": 0.030514653772115707, |
| "learning_rate": 8.850771869639795e-05, |
| "loss": 0.0183, |
| "step": 4230 |
| }, |
| { |
| "epoch": 1.116801792894338, |
| "grad_norm": 0.012861707247793674, |
| "learning_rate": 8.837577516822801e-05, |
| "loss": 0.0032, |
| "step": 4235 |
| }, |
| { |
| "epoch": 1.118120097554545, |
| "grad_norm": 0.3278522789478302, |
| "learning_rate": 8.824383164005806e-05, |
| "loss": 0.0058, |
| "step": 4240 |
| }, |
| { |
| "epoch": 1.1194384022147519, |
| "grad_norm": 0.580259382724762, |
| "learning_rate": 8.811188811188812e-05, |
| "loss": 0.0068, |
| "step": 4245 |
| }, |
| { |
| "epoch": 1.1207567068749589, |
| "grad_norm": 0.007002575788646936, |
| "learning_rate": 8.797994458371817e-05, |
| "loss": 0.0063, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.1220750115351659, |
| "grad_norm": 0.22484643757343292, |
| "learning_rate": 8.784800105554823e-05, |
| "loss": 0.0167, |
| "step": 4255 |
| }, |
| { |
| "epoch": 1.1233933161953726, |
| "grad_norm": 0.004122686106711626, |
| "learning_rate": 8.771605752737829e-05, |
| "loss": 0.002, |
| "step": 4260 |
| }, |
| { |
| "epoch": 1.1247116208555796, |
| "grad_norm": 0.009832561016082764, |
| "learning_rate": 8.758411399920834e-05, |
| "loss": 0.0029, |
| "step": 4265 |
| }, |
| { |
| "epoch": 1.1260299255157866, |
| "grad_norm": 0.04854527860879898, |
| "learning_rate": 8.74521704710384e-05, |
| "loss": 0.0068, |
| "step": 4270 |
| }, |
| { |
| "epoch": 1.1273482301759936, |
| "grad_norm": 0.12221235036849976, |
| "learning_rate": 8.732022694286845e-05, |
| "loss": 0.003, |
| "step": 4275 |
| }, |
| { |
| "epoch": 1.1286665348362006, |
| "grad_norm": 0.005857539363205433, |
| "learning_rate": 8.718828341469851e-05, |
| "loss": 0.0022, |
| "step": 4280 |
| }, |
| { |
| "epoch": 1.1299848394964076, |
| "grad_norm": 0.10582758486270905, |
| "learning_rate": 8.705633988652856e-05, |
| "loss": 0.002, |
| "step": 4285 |
| }, |
| { |
| "epoch": 1.1313031441566146, |
| "grad_norm": 0.006190940272063017, |
| "learning_rate": 8.692439635835863e-05, |
| "loss": 0.0022, |
| "step": 4290 |
| }, |
| { |
| "epoch": 1.1326214488168216, |
| "grad_norm": 0.00221514655277133, |
| "learning_rate": 8.679245283018869e-05, |
| "loss": 0.0314, |
| "step": 4295 |
| }, |
| { |
| "epoch": 1.1339397534770286, |
| "grad_norm": 0.0796755850315094, |
| "learning_rate": 8.666050930201874e-05, |
| "loss": 0.0347, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.1352580581372356, |
| "grad_norm": 0.20088806748390198, |
| "learning_rate": 8.65285657738488e-05, |
| "loss": 0.0048, |
| "step": 4305 |
| }, |
| { |
| "epoch": 1.1365763627974426, |
| "grad_norm": 0.4018377363681793, |
| "learning_rate": 8.639662224567884e-05, |
| "loss": 0.0234, |
| "step": 4310 |
| }, |
| { |
| "epoch": 1.1378946674576496, |
| "grad_norm": 0.014961684122681618, |
| "learning_rate": 8.626467871750891e-05, |
| "loss": 0.0033, |
| "step": 4315 |
| }, |
| { |
| "epoch": 1.1392129721178565, |
| "grad_norm": 0.004534922540187836, |
| "learning_rate": 8.613273518933897e-05, |
| "loss": 0.0021, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.1405312767780633, |
| "grad_norm": 0.06340984255075455, |
| "learning_rate": 8.600079166116902e-05, |
| "loss": 0.0538, |
| "step": 4325 |
| }, |
| { |
| "epoch": 1.1418495814382703, |
| "grad_norm": 0.007374623324722052, |
| "learning_rate": 8.586884813299908e-05, |
| "loss": 0.0157, |
| "step": 4330 |
| }, |
| { |
| "epoch": 1.1431678860984773, |
| "grad_norm": 0.02313193492591381, |
| "learning_rate": 8.573690460482913e-05, |
| "loss": 0.0307, |
| "step": 4335 |
| }, |
| { |
| "epoch": 1.1444861907586843, |
| "grad_norm": 0.014071634039282799, |
| "learning_rate": 8.560496107665919e-05, |
| "loss": 0.0058, |
| "step": 4340 |
| }, |
| { |
| "epoch": 1.1458044954188913, |
| "grad_norm": 1.4664901494979858, |
| "learning_rate": 8.547301754848926e-05, |
| "loss": 0.0566, |
| "step": 4345 |
| }, |
| { |
| "epoch": 1.1471228000790983, |
| "grad_norm": 0.023680074140429497, |
| "learning_rate": 8.534107402031931e-05, |
| "loss": 0.0048, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.1484411047393053, |
| "grad_norm": 0.012555698864161968, |
| "learning_rate": 8.520913049214937e-05, |
| "loss": 0.0076, |
| "step": 4355 |
| }, |
| { |
| "epoch": 1.1497594093995123, |
| "grad_norm": 0.013624129816889763, |
| "learning_rate": 8.507718696397941e-05, |
| "loss": 0.0373, |
| "step": 4360 |
| }, |
| { |
| "epoch": 1.1510777140597193, |
| "grad_norm": 0.015372387133538723, |
| "learning_rate": 8.494524343580947e-05, |
| "loss": 0.0147, |
| "step": 4365 |
| }, |
| { |
| "epoch": 1.1523960187199263, |
| "grad_norm": 0.3312993347644806, |
| "learning_rate": 8.481329990763954e-05, |
| "loss": 0.0299, |
| "step": 4370 |
| }, |
| { |
| "epoch": 1.1537143233801332, |
| "grad_norm": 0.023838184773921967, |
| "learning_rate": 8.468135637946959e-05, |
| "loss": 0.0226, |
| "step": 4375 |
| }, |
| { |
| "epoch": 1.15503262804034, |
| "grad_norm": 0.42516952753067017, |
| "learning_rate": 8.454941285129965e-05, |
| "loss": 0.0088, |
| "step": 4380 |
| }, |
| { |
| "epoch": 1.156350932700547, |
| "grad_norm": 0.6900278925895691, |
| "learning_rate": 8.44174693231297e-05, |
| "loss": 0.0245, |
| "step": 4385 |
| }, |
| { |
| "epoch": 1.157669237360754, |
| "grad_norm": 0.2932703197002411, |
| "learning_rate": 8.428552579495976e-05, |
| "loss": 0.0207, |
| "step": 4390 |
| }, |
| { |
| "epoch": 1.158987542020961, |
| "grad_norm": 0.12942780554294586, |
| "learning_rate": 8.415358226678982e-05, |
| "loss": 0.0037, |
| "step": 4395 |
| }, |
| { |
| "epoch": 1.160305846681168, |
| "grad_norm": 0.9499046802520752, |
| "learning_rate": 8.402163873861989e-05, |
| "loss": 0.0246, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.161624151341375, |
| "grad_norm": 0.008869118988513947, |
| "learning_rate": 8.388969521044994e-05, |
| "loss": 0.0171, |
| "step": 4405 |
| }, |
| { |
| "epoch": 1.162942456001582, |
| "grad_norm": 1.7409231662750244, |
| "learning_rate": 8.375775168227998e-05, |
| "loss": 0.017, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.164260760661789, |
| "grad_norm": 0.0020101398695260286, |
| "learning_rate": 8.362580815411004e-05, |
| "loss": 0.0027, |
| "step": 4415 |
| }, |
| { |
| "epoch": 1.165579065321996, |
| "grad_norm": 0.0785067081451416, |
| "learning_rate": 8.34938646259401e-05, |
| "loss": 0.0043, |
| "step": 4420 |
| }, |
| { |
| "epoch": 1.166897369982203, |
| "grad_norm": 0.0029506285209208727, |
| "learning_rate": 8.336192109777016e-05, |
| "loss": 0.0109, |
| "step": 4425 |
| }, |
| { |
| "epoch": 1.16821567464241, |
| "grad_norm": 0.02216683328151703, |
| "learning_rate": 8.322997756960022e-05, |
| "loss": 0.0026, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.1695339793026167, |
| "grad_norm": 0.02216639369726181, |
| "learning_rate": 8.309803404143027e-05, |
| "loss": 0.0045, |
| "step": 4435 |
| }, |
| { |
| "epoch": 1.170852283962824, |
| "grad_norm": 0.0, |
| "learning_rate": 8.296609051326033e-05, |
| "loss": 0.006, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.1721705886230307, |
| "grad_norm": 0.0019736960530281067, |
| "learning_rate": 8.283414698509039e-05, |
| "loss": 0.0078, |
| "step": 4445 |
| }, |
| { |
| "epoch": 1.1734888932832377, |
| "grad_norm": 0.012957746163010597, |
| "learning_rate": 8.270220345692044e-05, |
| "loss": 0.002, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.1748071979434447, |
| "grad_norm": 0.010877869091928005, |
| "learning_rate": 8.25702599287505e-05, |
| "loss": 0.0237, |
| "step": 4455 |
| }, |
| { |
| "epoch": 1.1761255026036517, |
| "grad_norm": 0.005947659723460674, |
| "learning_rate": 8.243831640058055e-05, |
| "loss": 0.0341, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.1774438072638587, |
| "grad_norm": 0.0005026470171287656, |
| "learning_rate": 8.230637287241061e-05, |
| "loss": 0.0033, |
| "step": 4465 |
| }, |
| { |
| "epoch": 1.1787621119240657, |
| "grad_norm": 0.022054588422179222, |
| "learning_rate": 8.217442934424066e-05, |
| "loss": 0.0042, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.1800804165842727, |
| "grad_norm": 0.7929030656814575, |
| "learning_rate": 8.204248581607072e-05, |
| "loss": 0.0076, |
| "step": 4475 |
| }, |
| { |
| "epoch": 1.1813987212444796, |
| "grad_norm": 0.39052629470825195, |
| "learning_rate": 8.191054228790078e-05, |
| "loss": 0.0228, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.1827170259046866, |
| "grad_norm": 0.007177622988820076, |
| "learning_rate": 8.177859875973084e-05, |
| "loss": 0.01, |
| "step": 4485 |
| }, |
| { |
| "epoch": 1.1840353305648936, |
| "grad_norm": 0.006175135262310505, |
| "learning_rate": 8.16466552315609e-05, |
| "loss": 0.0037, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.1853536352251006, |
| "grad_norm": 0.0356481671333313, |
| "learning_rate": 8.151471170339096e-05, |
| "loss": 0.0024, |
| "step": 4495 |
| }, |
| { |
| "epoch": 1.1866719398853074, |
| "grad_norm": 0.19069480895996094, |
| "learning_rate": 8.138276817522101e-05, |
| "loss": 0.0048, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.1866719398853074, |
| "eval_loss": 0.026386437937617302, |
| "eval_runtime": 452.2896, |
| "eval_samples_per_second": 7.455, |
| "eval_steps_per_second": 3.728, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.1879902445455144, |
| "grad_norm": 0.002254961524158716, |
| "learning_rate": 8.125082464705107e-05, |
| "loss": 0.0014, |
| "step": 4505 |
| }, |
| { |
| "epoch": 1.1893085492057214, |
| "grad_norm": 0.8026870489120483, |
| "learning_rate": 8.111888111888112e-05, |
| "loss": 0.0411, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.1906268538659284, |
| "grad_norm": 0.47328072786331177, |
| "learning_rate": 8.098693759071118e-05, |
| "loss": 0.0271, |
| "step": 4515 |
| }, |
| { |
| "epoch": 1.1919451585261354, |
| "grad_norm": 0.4888288676738739, |
| "learning_rate": 8.085499406254123e-05, |
| "loss": 0.039, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.1932634631863424, |
| "grad_norm": 0.000925812462810427, |
| "learning_rate": 8.072305053437129e-05, |
| "loss": 0.0461, |
| "step": 4525 |
| }, |
| { |
| "epoch": 1.1945817678465493, |
| "grad_norm": 0.12472371757030487, |
| "learning_rate": 8.059110700620135e-05, |
| "loss": 0.0037, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.1959000725067563, |
| "grad_norm": 0.002875336678698659, |
| "learning_rate": 8.04591634780314e-05, |
| "loss": 0.0425, |
| "step": 4535 |
| }, |
| { |
| "epoch": 1.1972183771669633, |
| "grad_norm": 0.042056187987327576, |
| "learning_rate": 8.032721994986147e-05, |
| "loss": 0.0068, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.1985366818271703, |
| "grad_norm": 0.157605841755867, |
| "learning_rate": 8.019527642169153e-05, |
| "loss": 0.0179, |
| "step": 4545 |
| }, |
| { |
| "epoch": 1.1998549864873773, |
| "grad_norm": 0.005153563339263201, |
| "learning_rate": 8.006333289352158e-05, |
| "loss": 0.0045, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.201173291147584, |
| "grad_norm": 0.02541598491370678, |
| "learning_rate": 7.993138936535164e-05, |
| "loss": 0.0041, |
| "step": 4555 |
| }, |
| { |
| "epoch": 1.2024915958077913, |
| "grad_norm": 0.04266195371747017, |
| "learning_rate": 7.979944583718168e-05, |
| "loss": 0.0121, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.203809900467998, |
| "grad_norm": 0.36108532547950745, |
| "learning_rate": 7.966750230901175e-05, |
| "loss": 0.0147, |
| "step": 4565 |
| }, |
| { |
| "epoch": 1.205128205128205, |
| "grad_norm": 0.40405452251434326, |
| "learning_rate": 7.95355587808418e-05, |
| "loss": 0.0056, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.206446509788412, |
| "grad_norm": 0.030422702431678772, |
| "learning_rate": 7.940361525267186e-05, |
| "loss": 0.0055, |
| "step": 4575 |
| }, |
| { |
| "epoch": 1.207764814448619, |
| "grad_norm": 0.014555396512150764, |
| "learning_rate": 7.927167172450192e-05, |
| "loss": 0.0029, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.209083119108826, |
| "grad_norm": 0.33962950110435486, |
| "learning_rate": 7.913972819633197e-05, |
| "loss": 0.0191, |
| "step": 4585 |
| }, |
| { |
| "epoch": 1.210401423769033, |
| "grad_norm": 0.040150560438632965, |
| "learning_rate": 7.900778466816203e-05, |
| "loss": 0.0096, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.21171972842924, |
| "grad_norm": 0.2968510091304779, |
| "learning_rate": 7.88758411399921e-05, |
| "loss": 0.0311, |
| "step": 4595 |
| }, |
| { |
| "epoch": 1.213038033089447, |
| "grad_norm": 0.04709814116358757, |
| "learning_rate": 7.874389761182215e-05, |
| "loss": 0.0175, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.214356337749654, |
| "grad_norm": 0.1379537284374237, |
| "learning_rate": 7.861195408365221e-05, |
| "loss": 0.02, |
| "step": 4605 |
| }, |
| { |
| "epoch": 1.215674642409861, |
| "grad_norm": 0.018291711807250977, |
| "learning_rate": 7.848001055548225e-05, |
| "loss": 0.003, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.216992947070068, |
| "grad_norm": 0.041676126420497894, |
| "learning_rate": 7.83480670273123e-05, |
| "loss": 0.0054, |
| "step": 4615 |
| }, |
| { |
| "epoch": 1.2183112517302748, |
| "grad_norm": 0.0013747498160228133, |
| "learning_rate": 7.821612349914237e-05, |
| "loss": 0.0132, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.2196295563904818, |
| "grad_norm": 0.0050489697605371475, |
| "learning_rate": 7.808417997097243e-05, |
| "loss": 0.0272, |
| "step": 4625 |
| }, |
| { |
| "epoch": 1.2209478610506888, |
| "grad_norm": 0.017974581569433212, |
| "learning_rate": 7.795223644280249e-05, |
| "loss": 0.0037, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.2222661657108957, |
| "grad_norm": 0.001916698063723743, |
| "learning_rate": 7.782029291463254e-05, |
| "loss": 0.002, |
| "step": 4635 |
| }, |
| { |
| "epoch": 1.2235844703711027, |
| "grad_norm": 0.05344574153423309, |
| "learning_rate": 7.76883493864626e-05, |
| "loss": 0.0114, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.2249027750313097, |
| "grad_norm": 0.22823786735534668, |
| "learning_rate": 7.755640585829265e-05, |
| "loss": 0.0296, |
| "step": 4645 |
| }, |
| { |
| "epoch": 1.2262210796915167, |
| "grad_norm": 0.02051074244081974, |
| "learning_rate": 7.742446233012272e-05, |
| "loss": 0.0037, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.2275393843517237, |
| "grad_norm": 0.9797061681747437, |
| "learning_rate": 7.729251880195276e-05, |
| "loss": 0.011, |
| "step": 4655 |
| }, |
| { |
| "epoch": 1.2288576890119307, |
| "grad_norm": 0.0017285927897319198, |
| "learning_rate": 7.716057527378282e-05, |
| "loss": 0.0224, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.2301759936721377, |
| "grad_norm": 0.021783018484711647, |
| "learning_rate": 7.702863174561288e-05, |
| "loss": 0.0174, |
| "step": 4665 |
| }, |
| { |
| "epoch": 1.2314942983323447, |
| "grad_norm": 0.00763307698071003, |
| "learning_rate": 7.689668821744293e-05, |
| "loss": 0.0516, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.2328126029925515, |
| "grad_norm": 0.32605209946632385, |
| "learning_rate": 7.676474468927299e-05, |
| "loss": 0.0301, |
| "step": 4675 |
| }, |
| { |
| "epoch": 1.2341309076527585, |
| "grad_norm": 1.2027722597122192, |
| "learning_rate": 7.663280116110306e-05, |
| "loss": 0.0474, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.2354492123129655, |
| "grad_norm": 0.10201717168092728, |
| "learning_rate": 7.650085763293311e-05, |
| "loss": 0.0144, |
| "step": 4685 |
| }, |
| { |
| "epoch": 1.2367675169731724, |
| "grad_norm": 0.013835664838552475, |
| "learning_rate": 7.636891410476317e-05, |
| "loss": 0.0024, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.2380858216333794, |
| "grad_norm": 0.005699916277080774, |
| "learning_rate": 7.623697057659322e-05, |
| "loss": 0.0089, |
| "step": 4695 |
| }, |
| { |
| "epoch": 1.2394041262935864, |
| "grad_norm": 0.16583332419395447, |
| "learning_rate": 7.610502704842328e-05, |
| "loss": 0.019, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.2407224309537934, |
| "grad_norm": 0.2734023332595825, |
| "learning_rate": 7.597308352025333e-05, |
| "loss": 0.0041, |
| "step": 4705 |
| }, |
| { |
| "epoch": 1.2420407356140004, |
| "grad_norm": 0.04209504276514053, |
| "learning_rate": 7.584113999208339e-05, |
| "loss": 0.0292, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.2433590402742074, |
| "grad_norm": 0.0303195733577013, |
| "learning_rate": 7.570919646391345e-05, |
| "loss": 0.0019, |
| "step": 4715 |
| }, |
| { |
| "epoch": 1.2446773449344144, |
| "grad_norm": 0.014011899940669537, |
| "learning_rate": 7.55772529357435e-05, |
| "loss": 0.0236, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.2459956495946214, |
| "grad_norm": 0.37838876247406006, |
| "learning_rate": 7.544530940757356e-05, |
| "loss": 0.0081, |
| "step": 4725 |
| }, |
| { |
| "epoch": 1.2473139542548284, |
| "grad_norm": 0.003717717481777072, |
| "learning_rate": 7.531336587940361e-05, |
| "loss": 0.0036, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.2486322589150354, |
| "grad_norm": 1.2284752130508423, |
| "learning_rate": 7.518142235123368e-05, |
| "loss": 0.0089, |
| "step": 4735 |
| }, |
| { |
| "epoch": 1.2499505635752421, |
| "grad_norm": 0.015356095507740974, |
| "learning_rate": 7.504947882306374e-05, |
| "loss": 0.0074, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.2512688682354491, |
| "grad_norm": 0.0020383282098919153, |
| "learning_rate": 7.49175352948938e-05, |
| "loss": 0.0444, |
| "step": 4745 |
| }, |
| { |
| "epoch": 1.2525871728956561, |
| "grad_norm": 0.006680132355540991, |
| "learning_rate": 7.478559176672385e-05, |
| "loss": 0.009, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.2539054775558631, |
| "grad_norm": 0.01650019735097885, |
| "learning_rate": 7.465364823855389e-05, |
| "loss": 0.0022, |
| "step": 4755 |
| }, |
| { |
| "epoch": 1.2552237822160701, |
| "grad_norm": 0.009536102414131165, |
| "learning_rate": 7.452170471038396e-05, |
| "loss": 0.0026, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.256542086876277, |
| "grad_norm": 0.04677430912852287, |
| "learning_rate": 7.438976118221402e-05, |
| "loss": 0.004, |
| "step": 4765 |
| }, |
| { |
| "epoch": 1.257860391536484, |
| "grad_norm": 0.007777783088386059, |
| "learning_rate": 7.425781765404407e-05, |
| "loss": 0.0112, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.259178696196691, |
| "grad_norm": 0.03724197298288345, |
| "learning_rate": 7.412587412587413e-05, |
| "loss": 0.0065, |
| "step": 4775 |
| }, |
| { |
| "epoch": 1.260497000856898, |
| "grad_norm": 0.0023958412930369377, |
| "learning_rate": 7.399393059770418e-05, |
| "loss": 0.0238, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.261815305517105, |
| "grad_norm": 0.0036889975890517235, |
| "learning_rate": 7.386198706953424e-05, |
| "loss": 0.0012, |
| "step": 4785 |
| }, |
| { |
| "epoch": 1.263133610177312, |
| "grad_norm": 0.0009220903157256544, |
| "learning_rate": 7.373004354136431e-05, |
| "loss": 0.0017, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.2644519148375188, |
| "grad_norm": 0.0033395602367818356, |
| "learning_rate": 7.359810001319436e-05, |
| "loss": 0.0474, |
| "step": 4795 |
| }, |
| { |
| "epoch": 1.265770219497726, |
| "grad_norm": 0.004093261435627937, |
| "learning_rate": 7.346615648502442e-05, |
| "loss": 0.0025, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.2670885241579328, |
| "grad_norm": 0.004395488649606705, |
| "learning_rate": 7.333421295685446e-05, |
| "loss": 0.0011, |
| "step": 4805 |
| }, |
| { |
| "epoch": 1.2684068288181398, |
| "grad_norm": 0.024034051224589348, |
| "learning_rate": 7.320226942868452e-05, |
| "loss": 0.0027, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.2697251334783468, |
| "grad_norm": 0.9501499533653259, |
| "learning_rate": 7.307032590051459e-05, |
| "loss": 0.0279, |
| "step": 4815 |
| }, |
| { |
| "epoch": 1.2710434381385538, |
| "grad_norm": 0.008805549703538418, |
| "learning_rate": 7.293838237234464e-05, |
| "loss": 0.0403, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.2723617427987608, |
| "grad_norm": 0.01750873774290085, |
| "learning_rate": 7.28064388441747e-05, |
| "loss": 0.0571, |
| "step": 4825 |
| }, |
| { |
| "epoch": 1.2736800474589678, |
| "grad_norm": 0.004490260500460863, |
| "learning_rate": 7.267449531600475e-05, |
| "loss": 0.0269, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.2749983521191748, |
| "grad_norm": 0.07510064542293549, |
| "learning_rate": 7.254255178783481e-05, |
| "loss": 0.0123, |
| "step": 4835 |
| }, |
| { |
| "epoch": 1.2763166567793818, |
| "grad_norm": 0.039783038198947906, |
| "learning_rate": 7.241060825966486e-05, |
| "loss": 0.0137, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.2776349614395888, |
| "grad_norm": 0.019004900008440018, |
| "learning_rate": 7.227866473149493e-05, |
| "loss": 0.0047, |
| "step": 4845 |
| }, |
| { |
| "epoch": 1.2789532660997955, |
| "grad_norm": 0.04813052713871002, |
| "learning_rate": 7.214672120332499e-05, |
| "loss": 0.0021, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.2802715707600028, |
| "grad_norm": 0.00835048221051693, |
| "learning_rate": 7.201477767515503e-05, |
| "loss": 0.0014, |
| "step": 4855 |
| }, |
| { |
| "epoch": 1.2815898754202095, |
| "grad_norm": 0.008609198965132236, |
| "learning_rate": 7.188283414698509e-05, |
| "loss": 0.0219, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.2829081800804165, |
| "grad_norm": 0.007337458431720734, |
| "learning_rate": 7.175089061881514e-05, |
| "loss": 0.0014, |
| "step": 4865 |
| }, |
| { |
| "epoch": 1.2842264847406235, |
| "grad_norm": 0.0032645913306623697, |
| "learning_rate": 7.161894709064521e-05, |
| "loss": 0.0026, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.2855447894008305, |
| "grad_norm": 0.27384671568870544, |
| "learning_rate": 7.148700356247527e-05, |
| "loss": 0.0227, |
| "step": 4875 |
| }, |
| { |
| "epoch": 1.2868630940610375, |
| "grad_norm": 0.03584875538945198, |
| "learning_rate": 7.135506003430532e-05, |
| "loss": 0.0299, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.2881813987212445, |
| "grad_norm": 0.03482440486550331, |
| "learning_rate": 7.122311650613538e-05, |
| "loss": 0.0125, |
| "step": 4885 |
| }, |
| { |
| "epoch": 1.2894997033814515, |
| "grad_norm": 0.005974395200610161, |
| "learning_rate": 7.109117297796543e-05, |
| "loss": 0.0029, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.2908180080416585, |
| "grad_norm": 0.01820153370499611, |
| "learning_rate": 7.095922944979549e-05, |
| "loss": 0.0254, |
| "step": 4895 |
| }, |
| { |
| "epoch": 1.2921363127018655, |
| "grad_norm": 0.1733965277671814, |
| "learning_rate": 7.082728592162555e-05, |
| "loss": 0.028, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.2934546173620725, |
| "grad_norm": 1.3017303943634033, |
| "learning_rate": 7.06953423934556e-05, |
| "loss": 0.0213, |
| "step": 4905 |
| }, |
| { |
| "epoch": 1.2947729220222794, |
| "grad_norm": 0.01360877975821495, |
| "learning_rate": 7.056339886528566e-05, |
| "loss": 0.0039, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.2960912266824862, |
| "grad_norm": 0.01503999624401331, |
| "learning_rate": 7.043145533711571e-05, |
| "loss": 0.0102, |
| "step": 4915 |
| }, |
| { |
| "epoch": 1.2974095313426934, |
| "grad_norm": 0.2200804352760315, |
| "learning_rate": 7.029951180894577e-05, |
| "loss": 0.0461, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.2987278360029002, |
| "grad_norm": 0.08512946963310242, |
| "learning_rate": 7.016756828077582e-05, |
| "loss": 0.0066, |
| "step": 4925 |
| }, |
| { |
| "epoch": 1.3000461406631072, |
| "grad_norm": 0.08296570926904678, |
| "learning_rate": 7.00356247526059e-05, |
| "loss": 0.0223, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.3013644453233142, |
| "grad_norm": 0.008866079151630402, |
| "learning_rate": 6.990368122443595e-05, |
| "loss": 0.0032, |
| "step": 4935 |
| }, |
| { |
| "epoch": 1.3026827499835212, |
| "grad_norm": 0.024493014439940453, |
| "learning_rate": 6.9771737696266e-05, |
| "loss": 0.0128, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.3040010546437282, |
| "grad_norm": 0.08965341746807098, |
| "learning_rate": 6.963979416809606e-05, |
| "loss": 0.028, |
| "step": 4945 |
| }, |
| { |
| "epoch": 1.3053193593039352, |
| "grad_norm": 0.023156631737947464, |
| "learning_rate": 6.950785063992612e-05, |
| "loss": 0.0187, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.3066376639641422, |
| "grad_norm": 0.18552155792713165, |
| "learning_rate": 6.937590711175617e-05, |
| "loss": 0.0424, |
| "step": 4955 |
| }, |
| { |
| "epoch": 1.3079559686243492, |
| "grad_norm": 0.02200198918581009, |
| "learning_rate": 6.924396358358623e-05, |
| "loss": 0.0148, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.3092742732845561, |
| "grad_norm": 0.00568364467471838, |
| "learning_rate": 6.911202005541628e-05, |
| "loss": 0.0199, |
| "step": 4965 |
| }, |
| { |
| "epoch": 1.310592577944763, |
| "grad_norm": 0.021591177210211754, |
| "learning_rate": 6.898007652724634e-05, |
| "loss": 0.0092, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.3119108826049701, |
| "grad_norm": 0.327177494764328, |
| "learning_rate": 6.88481329990764e-05, |
| "loss": 0.0047, |
| "step": 4975 |
| }, |
| { |
| "epoch": 1.313229187265177, |
| "grad_norm": 0.024512887001037598, |
| "learning_rate": 6.871618947090645e-05, |
| "loss": 0.0046, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.314547491925384, |
| "grad_norm": 0.05725006014108658, |
| "learning_rate": 6.858424594273652e-05, |
| "loss": 0.0227, |
| "step": 4985 |
| }, |
| { |
| "epoch": 1.3158657965855909, |
| "grad_norm": 0.011280277743935585, |
| "learning_rate": 6.845230241456658e-05, |
| "loss": 0.0056, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.3171841012457979, |
| "grad_norm": 0.022504402324557304, |
| "learning_rate": 6.832035888639663e-05, |
| "loss": 0.0029, |
| "step": 4995 |
| }, |
| { |
| "epoch": 1.3185024059060049, |
| "grad_norm": 0.02168826013803482, |
| "learning_rate": 6.818841535822669e-05, |
| "loss": 0.0198, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.3185024059060049, |
| "eval_loss": 0.025039294734597206, |
| "eval_runtime": 452.1097, |
| "eval_samples_per_second": 7.458, |
| "eval_steps_per_second": 3.729, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.3198207105662119, |
| "grad_norm": 0.0064329709857702255, |
| "learning_rate": 6.805647183005673e-05, |
| "loss": 0.0299, |
| "step": 5005 |
| }, |
| { |
| "epoch": 1.3211390152264189, |
| "grad_norm": 0.00267885928042233, |
| "learning_rate": 6.79245283018868e-05, |
| "loss": 0.0065, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.3224573198866258, |
| "grad_norm": 0.6842889189720154, |
| "learning_rate": 6.779258477371685e-05, |
| "loss": 0.008, |
| "step": 5015 |
| }, |
| { |
| "epoch": 1.3237756245468328, |
| "grad_norm": 0.002985635306686163, |
| "learning_rate": 6.766064124554691e-05, |
| "loss": 0.0119, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.3250939292070396, |
| "grad_norm": 0.019304940477013588, |
| "learning_rate": 6.752869771737696e-05, |
| "loss": 0.0041, |
| "step": 5025 |
| }, |
| { |
| "epoch": 1.3264122338672468, |
| "grad_norm": 0.011305035091936588, |
| "learning_rate": 6.739675418920702e-05, |
| "loss": 0.0031, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.3277305385274536, |
| "grad_norm": 0.006184784695506096, |
| "learning_rate": 6.726481066103708e-05, |
| "loss": 0.0081, |
| "step": 5035 |
| }, |
| { |
| "epoch": 1.3290488431876606, |
| "grad_norm": 0.0073184361681342125, |
| "learning_rate": 6.713286713286715e-05, |
| "loss": 0.0202, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.3303671478478676, |
| "grad_norm": 0.006566181313246489, |
| "learning_rate": 6.70009236046972e-05, |
| "loss": 0.0052, |
| "step": 5045 |
| }, |
| { |
| "epoch": 1.3316854525080746, |
| "grad_norm": 0.31427526473999023, |
| "learning_rate": 6.686898007652726e-05, |
| "loss": 0.017, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.3330037571682816, |
| "grad_norm": 0.005085447803139687, |
| "learning_rate": 6.67370365483573e-05, |
| "loss": 0.009, |
| "step": 5055 |
| }, |
| { |
| "epoch": 1.3343220618284886, |
| "grad_norm": 0.2745366096496582, |
| "learning_rate": 6.660509302018735e-05, |
| "loss": 0.0119, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.3356403664886956, |
| "grad_norm": 0.2871796786785126, |
| "learning_rate": 6.647314949201742e-05, |
| "loss": 0.0158, |
| "step": 5065 |
| }, |
| { |
| "epoch": 1.3369586711489025, |
| "grad_norm": 0.2774186134338379, |
| "learning_rate": 6.634120596384748e-05, |
| "loss": 0.0084, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.3382769758091095, |
| "grad_norm": 0.013278775848448277, |
| "learning_rate": 6.620926243567753e-05, |
| "loss": 0.0111, |
| "step": 5075 |
| }, |
| { |
| "epoch": 1.3395952804693165, |
| "grad_norm": 0.01614517532289028, |
| "learning_rate": 6.607731890750759e-05, |
| "loss": 0.0066, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.3409135851295235, |
| "grad_norm": 0.0037789656780660152, |
| "learning_rate": 6.594537537933765e-05, |
| "loss": 0.0142, |
| "step": 5085 |
| }, |
| { |
| "epoch": 1.3422318897897303, |
| "grad_norm": 0.03221861273050308, |
| "learning_rate": 6.58134318511677e-05, |
| "loss": 0.0155, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.3435501944499375, |
| "grad_norm": 0.005637989845126867, |
| "learning_rate": 6.568148832299776e-05, |
| "loss": 0.0022, |
| "step": 5095 |
| }, |
| { |
| "epoch": 1.3448684991101443, |
| "grad_norm": 0.0017844432732090354, |
| "learning_rate": 6.554954479482783e-05, |
| "loss": 0.0217, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.3461868037703513, |
| "grad_norm": 0.08099021762609482, |
| "learning_rate": 6.541760126665787e-05, |
| "loss": 0.0222, |
| "step": 5105 |
| }, |
| { |
| "epoch": 1.3475051084305583, |
| "grad_norm": 0.011909045279026031, |
| "learning_rate": 6.528565773848792e-05, |
| "loss": 0.0058, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.3488234130907653, |
| "grad_norm": 0.7332578301429749, |
| "learning_rate": 6.515371421031798e-05, |
| "loss": 0.0286, |
| "step": 5115 |
| }, |
| { |
| "epoch": 1.3501417177509722, |
| "grad_norm": 0.3415885865688324, |
| "learning_rate": 6.502177068214804e-05, |
| "loss": 0.1191, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.3514600224111792, |
| "grad_norm": 0.00904211588203907, |
| "learning_rate": 6.48898271539781e-05, |
| "loss": 0.0043, |
| "step": 5125 |
| }, |
| { |
| "epoch": 1.3527783270713862, |
| "grad_norm": 0.1978830248117447, |
| "learning_rate": 6.475788362580816e-05, |
| "loss": 0.0316, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.3540966317315932, |
| "grad_norm": 0.10229042172431946, |
| "learning_rate": 6.462594009763822e-05, |
| "loss": 0.0194, |
| "step": 5135 |
| }, |
| { |
| "epoch": 1.3554149363918002, |
| "grad_norm": 0.4457210600376129, |
| "learning_rate": 6.449399656946827e-05, |
| "loss": 0.0276, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.356733241052007, |
| "grad_norm": 0.023706572130322456, |
| "learning_rate": 6.436205304129833e-05, |
| "loss": 0.0163, |
| "step": 5145 |
| }, |
| { |
| "epoch": 1.3580515457122142, |
| "grad_norm": 1.166896939277649, |
| "learning_rate": 6.423010951312838e-05, |
| "loss": 0.0189, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.359369850372421, |
| "grad_norm": 0.0016115796752274036, |
| "learning_rate": 6.409816598495844e-05, |
| "loss": 0.0191, |
| "step": 5155 |
| }, |
| { |
| "epoch": 1.360688155032628, |
| "grad_norm": 0.00786682777106762, |
| "learning_rate": 6.39662224567885e-05, |
| "loss": 0.0119, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.362006459692835, |
| "grad_norm": 1.042732834815979, |
| "learning_rate": 6.383427892861855e-05, |
| "loss": 0.0497, |
| "step": 5165 |
| }, |
| { |
| "epoch": 1.363324764353042, |
| "grad_norm": 0.007983304560184479, |
| "learning_rate": 6.37023354004486e-05, |
| "loss": 0.044, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.364643069013249, |
| "grad_norm": 0.009767642244696617, |
| "learning_rate": 6.357039187227866e-05, |
| "loss": 0.0405, |
| "step": 5175 |
| }, |
| { |
| "epoch": 1.365961373673456, |
| "grad_norm": 0.03164628520607948, |
| "learning_rate": 6.343844834410873e-05, |
| "loss": 0.0138, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.367279678333663, |
| "grad_norm": 0.004159921780228615, |
| "learning_rate": 6.330650481593879e-05, |
| "loss": 0.0045, |
| "step": 5185 |
| }, |
| { |
| "epoch": 1.36859798299387, |
| "grad_norm": 0.004395391326397657, |
| "learning_rate": 6.317456128776884e-05, |
| "loss": 0.0046, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.369916287654077, |
| "grad_norm": 0.011886746622622013, |
| "learning_rate": 6.30426177595989e-05, |
| "loss": 0.0064, |
| "step": 5195 |
| }, |
| { |
| "epoch": 1.371234592314284, |
| "grad_norm": 0.2259266972541809, |
| "learning_rate": 6.291067423142895e-05, |
| "loss": 0.0076, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.372552896974491, |
| "grad_norm": 0.01407301053404808, |
| "learning_rate": 6.277873070325901e-05, |
| "loss": 0.0201, |
| "step": 5205 |
| }, |
| { |
| "epoch": 1.3738712016346977, |
| "grad_norm": 0.00911578256636858, |
| "learning_rate": 6.264678717508906e-05, |
| "loss": 0.0164, |
| "step": 5210 |
| }, |
| { |
| "epoch": 1.3751895062949049, |
| "grad_norm": 0.20968014001846313, |
| "learning_rate": 6.251484364691912e-05, |
| "loss": 0.0075, |
| "step": 5215 |
| }, |
| { |
| "epoch": 1.3765078109551117, |
| "grad_norm": 0.008801166899502277, |
| "learning_rate": 6.238290011874918e-05, |
| "loss": 0.0068, |
| "step": 5220 |
| }, |
| { |
| "epoch": 1.3778261156153186, |
| "grad_norm": 0.007181806955486536, |
| "learning_rate": 6.225095659057923e-05, |
| "loss": 0.0136, |
| "step": 5225 |
| }, |
| { |
| "epoch": 1.3791444202755256, |
| "grad_norm": 0.7527109980583191, |
| "learning_rate": 6.211901306240929e-05, |
| "loss": 0.0287, |
| "step": 5230 |
| }, |
| { |
| "epoch": 1.3804627249357326, |
| "grad_norm": 0.039015207439661026, |
| "learning_rate": 6.198706953423936e-05, |
| "loss": 0.0326, |
| "step": 5235 |
| }, |
| { |
| "epoch": 1.3817810295959396, |
| "grad_norm": 0.021076606586575508, |
| "learning_rate": 6.185512600606941e-05, |
| "loss": 0.0191, |
| "step": 5240 |
| }, |
| { |
| "epoch": 1.3830993342561466, |
| "grad_norm": 0.016630731523036957, |
| "learning_rate": 6.172318247789947e-05, |
| "loss": 0.0131, |
| "step": 5245 |
| }, |
| { |
| "epoch": 1.3844176389163536, |
| "grad_norm": 0.011133644729852676, |
| "learning_rate": 6.159123894972952e-05, |
| "loss": 0.0029, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.3857359435765606, |
| "grad_norm": 0.6434677243232727, |
| "learning_rate": 6.145929542155957e-05, |
| "loss": 0.0091, |
| "step": 5255 |
| }, |
| { |
| "epoch": 1.3870542482367676, |
| "grad_norm": 0.051020298153162, |
| "learning_rate": 6.132735189338964e-05, |
| "loss": 0.0086, |
| "step": 5260 |
| }, |
| { |
| "epoch": 1.3883725528969744, |
| "grad_norm": 0.016413932666182518, |
| "learning_rate": 6.119540836521969e-05, |
| "loss": 0.0061, |
| "step": 5265 |
| }, |
| { |
| "epoch": 1.3896908575571816, |
| "grad_norm": 0.005769540090113878, |
| "learning_rate": 6.106346483704975e-05, |
| "loss": 0.0027, |
| "step": 5270 |
| }, |
| { |
| "epoch": 1.3910091622173884, |
| "grad_norm": 0.06687796860933304, |
| "learning_rate": 6.09315213088798e-05, |
| "loss": 0.0423, |
| "step": 5275 |
| }, |
| { |
| "epoch": 1.3923274668775953, |
| "grad_norm": 0.005641553085297346, |
| "learning_rate": 6.079957778070986e-05, |
| "loss": 0.0353, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.3936457715378023, |
| "grad_norm": 0.04460568353533745, |
| "learning_rate": 6.066763425253992e-05, |
| "loss": 0.0041, |
| "step": 5285 |
| }, |
| { |
| "epoch": 1.3949640761980093, |
| "grad_norm": 0.0387534461915493, |
| "learning_rate": 6.0535690724369976e-05, |
| "loss": 0.006, |
| "step": 5290 |
| }, |
| { |
| "epoch": 1.3962823808582163, |
| "grad_norm": 0.010292598977684975, |
| "learning_rate": 6.040374719620003e-05, |
| "loss": 0.0038, |
| "step": 5295 |
| }, |
| { |
| "epoch": 1.3976006855184233, |
| "grad_norm": 0.3646155297756195, |
| "learning_rate": 6.0271803668030094e-05, |
| "loss": 0.0111, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.3989189901786303, |
| "grad_norm": 0.022035539150238037, |
| "learning_rate": 6.0139860139860136e-05, |
| "loss": 0.0507, |
| "step": 5305 |
| }, |
| { |
| "epoch": 1.4002372948388373, |
| "grad_norm": 0.003314939560368657, |
| "learning_rate": 6.00079166116902e-05, |
| "loss": 0.0132, |
| "step": 5310 |
| }, |
| { |
| "epoch": 1.4015555994990443, |
| "grad_norm": 0.0838267058134079, |
| "learning_rate": 5.9875973083520254e-05, |
| "loss": 0.0105, |
| "step": 5315 |
| }, |
| { |
| "epoch": 1.4028739041592513, |
| "grad_norm": 0.009368584491312504, |
| "learning_rate": 5.974402955535031e-05, |
| "loss": 0.0026, |
| "step": 5320 |
| }, |
| { |
| "epoch": 1.4041922088194583, |
| "grad_norm": 0.031248098239302635, |
| "learning_rate": 5.961208602718037e-05, |
| "loss": 0.0151, |
| "step": 5325 |
| }, |
| { |
| "epoch": 1.405510513479665, |
| "grad_norm": 0.06447605788707733, |
| "learning_rate": 5.948014249901043e-05, |
| "loss": 0.0219, |
| "step": 5330 |
| }, |
| { |
| "epoch": 1.4068288181398723, |
| "grad_norm": 0.010814374312758446, |
| "learning_rate": 5.9348198970840484e-05, |
| "loss": 0.0038, |
| "step": 5335 |
| }, |
| { |
| "epoch": 1.408147122800079, |
| "grad_norm": 0.6235967874526978, |
| "learning_rate": 5.9216255442670546e-05, |
| "loss": 0.0354, |
| "step": 5340 |
| }, |
| { |
| "epoch": 1.409465427460286, |
| "grad_norm": 0.026741521432995796, |
| "learning_rate": 5.90843119145006e-05, |
| "loss": 0.0032, |
| "step": 5345 |
| }, |
| { |
| "epoch": 1.410783732120493, |
| "grad_norm": 0.019413433969020844, |
| "learning_rate": 5.895236838633066e-05, |
| "loss": 0.0216, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.4121020367807, |
| "grad_norm": 0.0735543966293335, |
| "learning_rate": 5.8820424858160706e-05, |
| "loss": 0.0033, |
| "step": 5355 |
| }, |
| { |
| "epoch": 1.413420341440907, |
| "grad_norm": 0.005189546383917332, |
| "learning_rate": 5.868848132999076e-05, |
| "loss": 0.021, |
| "step": 5360 |
| }, |
| { |
| "epoch": 1.414738646101114, |
| "grad_norm": 0.21240335702896118, |
| "learning_rate": 5.8556537801820824e-05, |
| "loss": 0.0294, |
| "step": 5365 |
| }, |
| { |
| "epoch": 1.416056950761321, |
| "grad_norm": 0.010165920481085777, |
| "learning_rate": 5.842459427365088e-05, |
| "loss": 0.0021, |
| "step": 5370 |
| }, |
| { |
| "epoch": 1.417375255421528, |
| "grad_norm": 0.026774069294333458, |
| "learning_rate": 5.8292650745480936e-05, |
| "loss": 0.0299, |
| "step": 5375 |
| }, |
| { |
| "epoch": 1.418693560081735, |
| "grad_norm": 0.0019810455851256847, |
| "learning_rate": 5.816070721731099e-05, |
| "loss": 0.0029, |
| "step": 5380 |
| }, |
| { |
| "epoch": 1.4200118647419417, |
| "grad_norm": 0.038888879120349884, |
| "learning_rate": 5.8028763689141054e-05, |
| "loss": 0.0069, |
| "step": 5385 |
| }, |
| { |
| "epoch": 1.421330169402149, |
| "grad_norm": 0.016180936247110367, |
| "learning_rate": 5.789682016097111e-05, |
| "loss": 0.0032, |
| "step": 5390 |
| }, |
| { |
| "epoch": 1.4226484740623557, |
| "grad_norm": 0.01119404286146164, |
| "learning_rate": 5.7764876632801165e-05, |
| "loss": 0.0024, |
| "step": 5395 |
| }, |
| { |
| "epoch": 1.4239667787225627, |
| "grad_norm": 0.010486694052815437, |
| "learning_rate": 5.763293310463123e-05, |
| "loss": 0.0324, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.4252850833827697, |
| "grad_norm": 0.005453066434711218, |
| "learning_rate": 5.750098957646127e-05, |
| "loss": 0.0038, |
| "step": 5405 |
| }, |
| { |
| "epoch": 1.4266033880429767, |
| "grad_norm": 0.17556461691856384, |
| "learning_rate": 5.736904604829133e-05, |
| "loss": 0.0305, |
| "step": 5410 |
| }, |
| { |
| "epoch": 1.4279216927031837, |
| "grad_norm": 0.03074715845286846, |
| "learning_rate": 5.723710252012139e-05, |
| "loss": 0.003, |
| "step": 5415 |
| }, |
| { |
| "epoch": 1.4292399973633907, |
| "grad_norm": 1.7238941192626953, |
| "learning_rate": 5.710515899195144e-05, |
| "loss": 0.0254, |
| "step": 5420 |
| }, |
| { |
| "epoch": 1.4305583020235977, |
| "grad_norm": 0.012462320737540722, |
| "learning_rate": 5.6973215463781506e-05, |
| "loss": 0.0018, |
| "step": 5425 |
| }, |
| { |
| "epoch": 1.4318766066838047, |
| "grad_norm": 0.021576853469014168, |
| "learning_rate": 5.684127193561156e-05, |
| "loss": 0.0472, |
| "step": 5430 |
| }, |
| { |
| "epoch": 1.4331949113440117, |
| "grad_norm": 0.2862134575843811, |
| "learning_rate": 5.670932840744162e-05, |
| "loss": 0.0258, |
| "step": 5435 |
| }, |
| { |
| "epoch": 1.4345132160042184, |
| "grad_norm": 0.28419312834739685, |
| "learning_rate": 5.657738487927168e-05, |
| "loss": 0.0053, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.4358315206644257, |
| "grad_norm": 0.013650139793753624, |
| "learning_rate": 5.6445441351101735e-05, |
| "loss": 0.0126, |
| "step": 5445 |
| }, |
| { |
| "epoch": 1.4371498253246324, |
| "grad_norm": 0.01203097216784954, |
| "learning_rate": 5.631349782293179e-05, |
| "loss": 0.0076, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.4384681299848394, |
| "grad_norm": 0.0881054624915123, |
| "learning_rate": 5.618155429476184e-05, |
| "loss": 0.0178, |
| "step": 5455 |
| }, |
| { |
| "epoch": 1.4397864346450464, |
| "grad_norm": 0.5258516669273376, |
| "learning_rate": 5.6049610766591895e-05, |
| "loss": 0.0112, |
| "step": 5460 |
| }, |
| { |
| "epoch": 1.4411047393052534, |
| "grad_norm": 0.001202153041958809, |
| "learning_rate": 5.591766723842196e-05, |
| "loss": 0.0089, |
| "step": 5465 |
| }, |
| { |
| "epoch": 1.4424230439654604, |
| "grad_norm": 0.4498993456363678, |
| "learning_rate": 5.5785723710252014e-05, |
| "loss": 0.0252, |
| "step": 5470 |
| }, |
| { |
| "epoch": 1.4437413486256674, |
| "grad_norm": 0.17477644979953766, |
| "learning_rate": 5.565378018208207e-05, |
| "loss": 0.0169, |
| "step": 5475 |
| }, |
| { |
| "epoch": 1.4450596532858744, |
| "grad_norm": 0.019443338736891747, |
| "learning_rate": 5.552183665391213e-05, |
| "loss": 0.0019, |
| "step": 5480 |
| }, |
| { |
| "epoch": 1.4463779579460814, |
| "grad_norm": 0.005653039086610079, |
| "learning_rate": 5.538989312574219e-05, |
| "loss": 0.0231, |
| "step": 5485 |
| }, |
| { |
| "epoch": 1.4476962626062884, |
| "grad_norm": 0.01554112322628498, |
| "learning_rate": 5.525794959757224e-05, |
| "loss": 0.0167, |
| "step": 5490 |
| }, |
| { |
| "epoch": 1.4490145672664954, |
| "grad_norm": 0.044272180646657944, |
| "learning_rate": 5.5126006069402305e-05, |
| "loss": 0.007, |
| "step": 5495 |
| }, |
| { |
| "epoch": 1.4503328719267023, |
| "grad_norm": 0.014857172966003418, |
| "learning_rate": 5.499406254123236e-05, |
| "loss": 0.0045, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.4503328719267023, |
| "eval_loss": 0.02392147295176983, |
| "eval_runtime": 452.468, |
| "eval_samples_per_second": 7.452, |
| "eval_steps_per_second": 3.726, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.4516511765869091, |
| "grad_norm": 0.007390835788100958, |
| "learning_rate": 5.486211901306241e-05, |
| "loss": 0.0171, |
| "step": 5505 |
| }, |
| { |
| "epoch": 1.4529694812471163, |
| "grad_norm": 0.0050474610179662704, |
| "learning_rate": 5.4730175484892466e-05, |
| "loss": 0.004, |
| "step": 5510 |
| }, |
| { |
| "epoch": 1.454287785907323, |
| "grad_norm": 0.08066163957118988, |
| "learning_rate": 5.459823195672252e-05, |
| "loss": 0.0103, |
| "step": 5515 |
| }, |
| { |
| "epoch": 1.45560609056753, |
| "grad_norm": 0.0062376330606639385, |
| "learning_rate": 5.4466288428552584e-05, |
| "loss": 0.0066, |
| "step": 5520 |
| }, |
| { |
| "epoch": 1.456924395227737, |
| "grad_norm": 0.00711809890344739, |
| "learning_rate": 5.433434490038264e-05, |
| "loss": 0.003, |
| "step": 5525 |
| }, |
| { |
| "epoch": 1.458242699887944, |
| "grad_norm": 0.004010149277746677, |
| "learning_rate": 5.4202401372212695e-05, |
| "loss": 0.0231, |
| "step": 5530 |
| }, |
| { |
| "epoch": 1.459561004548151, |
| "grad_norm": 0.4791967272758484, |
| "learning_rate": 5.407045784404276e-05, |
| "loss": 0.0277, |
| "step": 5535 |
| }, |
| { |
| "epoch": 1.460879309208358, |
| "grad_norm": 0.03979189693927765, |
| "learning_rate": 5.393851431587281e-05, |
| "loss": 0.0033, |
| "step": 5540 |
| }, |
| { |
| "epoch": 1.462197613868565, |
| "grad_norm": 0.03331119939684868, |
| "learning_rate": 5.380657078770287e-05, |
| "loss": 0.0187, |
| "step": 5545 |
| }, |
| { |
| "epoch": 1.463515918528772, |
| "grad_norm": 0.0042802803218364716, |
| "learning_rate": 5.367462725953293e-05, |
| "loss": 0.0032, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.464834223188979, |
| "grad_norm": 0.05439918115735054, |
| "learning_rate": 5.354268373136297e-05, |
| "loss": 0.0043, |
| "step": 5555 |
| }, |
| { |
| "epoch": 1.4661525278491858, |
| "grad_norm": 0.042643506079912186, |
| "learning_rate": 5.3410740203193036e-05, |
| "loss": 0.0059, |
| "step": 5560 |
| }, |
| { |
| "epoch": 1.467470832509393, |
| "grad_norm": 0.023453116416931152, |
| "learning_rate": 5.327879667502309e-05, |
| "loss": 0.0043, |
| "step": 5565 |
| }, |
| { |
| "epoch": 1.4687891371695998, |
| "grad_norm": 0.037712760269641876, |
| "learning_rate": 5.314685314685315e-05, |
| "loss": 0.0033, |
| "step": 5570 |
| }, |
| { |
| "epoch": 1.4701074418298068, |
| "grad_norm": 1.0485608577728271, |
| "learning_rate": 5.301490961868321e-05, |
| "loss": 0.0489, |
| "step": 5575 |
| }, |
| { |
| "epoch": 1.4714257464900138, |
| "grad_norm": 0.004728829488158226, |
| "learning_rate": 5.2882966090513265e-05, |
| "loss": 0.0067, |
| "step": 5580 |
| }, |
| { |
| "epoch": 1.4727440511502208, |
| "grad_norm": 0.027893677353858948, |
| "learning_rate": 5.275102256234332e-05, |
| "loss": 0.0208, |
| "step": 5585 |
| }, |
| { |
| "epoch": 1.4740623558104278, |
| "grad_norm": 0.02256879396736622, |
| "learning_rate": 5.2619079034173377e-05, |
| "loss": 0.0036, |
| "step": 5590 |
| }, |
| { |
| "epoch": 1.4753806604706348, |
| "grad_norm": 0.12636558711528778, |
| "learning_rate": 5.248713550600344e-05, |
| "loss": 0.0046, |
| "step": 5595 |
| }, |
| { |
| "epoch": 1.4766989651308418, |
| "grad_norm": 0.000997041119262576, |
| "learning_rate": 5.235519197783348e-05, |
| "loss": 0.0101, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.4780172697910487, |
| "grad_norm": 0.023494020104408264, |
| "learning_rate": 5.2223248449663543e-05, |
| "loss": 0.0039, |
| "step": 5605 |
| }, |
| { |
| "epoch": 1.4793355744512557, |
| "grad_norm": 0.01525307446718216, |
| "learning_rate": 5.20913049214936e-05, |
| "loss": 0.021, |
| "step": 5610 |
| }, |
| { |
| "epoch": 1.4806538791114627, |
| "grad_norm": 0.0024215306621044874, |
| "learning_rate": 5.1959361393323655e-05, |
| "loss": 0.0017, |
| "step": 5615 |
| }, |
| { |
| "epoch": 1.4819721837716697, |
| "grad_norm": 1.4708061218261719, |
| "learning_rate": 5.182741786515372e-05, |
| "loss": 0.04, |
| "step": 5620 |
| }, |
| { |
| "epoch": 1.4832904884318765, |
| "grad_norm": 0.015033531002700329, |
| "learning_rate": 5.169547433698377e-05, |
| "loss": 0.0042, |
| "step": 5625 |
| }, |
| { |
| "epoch": 1.4846087930920837, |
| "grad_norm": 0.0035444959066808224, |
| "learning_rate": 5.156353080881383e-05, |
| "loss": 0.0087, |
| "step": 5630 |
| }, |
| { |
| "epoch": 1.4859270977522905, |
| "grad_norm": 0.010087919421494007, |
| "learning_rate": 5.143158728064389e-05, |
| "loss": 0.0158, |
| "step": 5635 |
| }, |
| { |
| "epoch": 1.4872454024124975, |
| "grad_norm": 0.05779251083731651, |
| "learning_rate": 5.129964375247395e-05, |
| "loss": 0.0157, |
| "step": 5640 |
| }, |
| { |
| "epoch": 1.4885637070727045, |
| "grad_norm": 0.14927980303764343, |
| "learning_rate": 5.1167700224304e-05, |
| "loss": 0.0257, |
| "step": 5645 |
| }, |
| { |
| "epoch": 1.4898820117329115, |
| "grad_norm": 0.004252352751791477, |
| "learning_rate": 5.103575669613405e-05, |
| "loss": 0.0198, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.4912003163931185, |
| "grad_norm": 0.0029206848703324795, |
| "learning_rate": 5.090381316796411e-05, |
| "loss": 0.0016, |
| "step": 5655 |
| }, |
| { |
| "epoch": 1.4925186210533254, |
| "grad_norm": 0.005047530401498079, |
| "learning_rate": 5.077186963979417e-05, |
| "loss": 0.0023, |
| "step": 5660 |
| }, |
| { |
| "epoch": 1.4938369257135324, |
| "grad_norm": 0.003732564626261592, |
| "learning_rate": 5.0639926111624225e-05, |
| "loss": 0.0336, |
| "step": 5665 |
| }, |
| { |
| "epoch": 1.4951552303737394, |
| "grad_norm": 0.3832889497280121, |
| "learning_rate": 5.050798258345428e-05, |
| "loss": 0.0476, |
| "step": 5670 |
| }, |
| { |
| "epoch": 1.4964735350339464, |
| "grad_norm": 0.06733009219169617, |
| "learning_rate": 5.037603905528434e-05, |
| "loss": 0.0044, |
| "step": 5675 |
| }, |
| { |
| "epoch": 1.4977918396941532, |
| "grad_norm": 0.008067069575190544, |
| "learning_rate": 5.02440955271144e-05, |
| "loss": 0.0035, |
| "step": 5680 |
| }, |
| { |
| "epoch": 1.4991101443543604, |
| "grad_norm": 0.01706300489604473, |
| "learning_rate": 5.0112151998944454e-05, |
| "loss": 0.0031, |
| "step": 5685 |
| }, |
| { |
| "epoch": 1.5004284490145672, |
| "grad_norm": 0.009932024404406548, |
| "learning_rate": 4.998020847077451e-05, |
| "loss": 0.0587, |
| "step": 5690 |
| }, |
| { |
| "epoch": 1.5017467536747744, |
| "grad_norm": 0.006488936021924019, |
| "learning_rate": 4.9848264942604566e-05, |
| "loss": 0.002, |
| "step": 5695 |
| }, |
| { |
| "epoch": 1.5030650583349812, |
| "grad_norm": 0.17488756775856018, |
| "learning_rate": 4.971632141443462e-05, |
| "loss": 0.0245, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.5043833629951882, |
| "grad_norm": 0.3327178359031677, |
| "learning_rate": 4.9584377886264684e-05, |
| "loss": 0.0404, |
| "step": 5705 |
| }, |
| { |
| "epoch": 1.5057016676553951, |
| "grad_norm": 0.18467263877391815, |
| "learning_rate": 4.945243435809474e-05, |
| "loss": 0.0248, |
| "step": 5710 |
| }, |
| { |
| "epoch": 1.5070199723156021, |
| "grad_norm": 0.020061776041984558, |
| "learning_rate": 4.9320490829924795e-05, |
| "loss": 0.0034, |
| "step": 5715 |
| }, |
| { |
| "epoch": 1.5083382769758091, |
| "grad_norm": 0.0005288647953420877, |
| "learning_rate": 4.918854730175485e-05, |
| "loss": 0.0076, |
| "step": 5720 |
| }, |
| { |
| "epoch": 1.5096565816360161, |
| "grad_norm": 0.007515576668083668, |
| "learning_rate": 4.9056603773584906e-05, |
| "loss": 0.004, |
| "step": 5725 |
| }, |
| { |
| "epoch": 1.5109748862962231, |
| "grad_norm": 0.05365758761763573, |
| "learning_rate": 4.892466024541497e-05, |
| "loss": 0.0222, |
| "step": 5730 |
| }, |
| { |
| "epoch": 1.51229319095643, |
| "grad_norm": 0.00572391040623188, |
| "learning_rate": 4.8792716717245025e-05, |
| "loss": 0.0132, |
| "step": 5735 |
| }, |
| { |
| "epoch": 1.513611495616637, |
| "grad_norm": 0.21178627014160156, |
| "learning_rate": 4.8660773189075073e-05, |
| "loss": 0.0417, |
| "step": 5740 |
| }, |
| { |
| "epoch": 1.5149298002768439, |
| "grad_norm": 0.0641486868262291, |
| "learning_rate": 4.8528829660905136e-05, |
| "loss": 0.011, |
| "step": 5745 |
| }, |
| { |
| "epoch": 1.516248104937051, |
| "grad_norm": 0.04451924189925194, |
| "learning_rate": 4.839688613273519e-05, |
| "loss": 0.012, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.5175664095972579, |
| "grad_norm": 0.019951259717345238, |
| "learning_rate": 4.826494260456525e-05, |
| "loss": 0.009, |
| "step": 5755 |
| }, |
| { |
| "epoch": 1.5188847142574649, |
| "grad_norm": 0.021919893100857735, |
| "learning_rate": 4.813299907639531e-05, |
| "loss": 0.0081, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.5202030189176718, |
| "grad_norm": 0.5730367302894592, |
| "learning_rate": 4.800105554822536e-05, |
| "loss": 0.0254, |
| "step": 5765 |
| }, |
| { |
| "epoch": 1.5215213235778788, |
| "grad_norm": 0.02501523122191429, |
| "learning_rate": 4.786911202005542e-05, |
| "loss": 0.0045, |
| "step": 5770 |
| }, |
| { |
| "epoch": 1.5228396282380858, |
| "grad_norm": 0.01574208028614521, |
| "learning_rate": 4.773716849188548e-05, |
| "loss": 0.0081, |
| "step": 5775 |
| }, |
| { |
| "epoch": 1.5241579328982928, |
| "grad_norm": 0.009626791812479496, |
| "learning_rate": 4.760522496371553e-05, |
| "loss": 0.0037, |
| "step": 5780 |
| }, |
| { |
| "epoch": 1.5254762375584998, |
| "grad_norm": 0.535539448261261, |
| "learning_rate": 4.747328143554559e-05, |
| "loss": 0.0149, |
| "step": 5785 |
| }, |
| { |
| "epoch": 1.5267945422187066, |
| "grad_norm": 0.004934845492243767, |
| "learning_rate": 4.7341337907375644e-05, |
| "loss": 0.0048, |
| "step": 5790 |
| }, |
| { |
| "epoch": 1.5281128468789138, |
| "grad_norm": 0.009070080704987049, |
| "learning_rate": 4.72093943792057e-05, |
| "loss": 0.0028, |
| "step": 5795 |
| }, |
| { |
| "epoch": 1.5294311515391206, |
| "grad_norm": 0.0040720063261687756, |
| "learning_rate": 4.707745085103576e-05, |
| "loss": 0.0016, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.5307494561993278, |
| "grad_norm": 0.45212000608444214, |
| "learning_rate": 4.694550732286582e-05, |
| "loss": 0.0111, |
| "step": 5805 |
| }, |
| { |
| "epoch": 1.5320677608595346, |
| "grad_norm": 0.024048497900366783, |
| "learning_rate": 4.681356379469587e-05, |
| "loss": 0.0149, |
| "step": 5810 |
| }, |
| { |
| "epoch": 1.5333860655197418, |
| "grad_norm": 0.11899136006832123, |
| "learning_rate": 4.668162026652593e-05, |
| "loss": 0.0034, |
| "step": 5815 |
| }, |
| { |
| "epoch": 1.5347043701799485, |
| "grad_norm": 0.011249657720327377, |
| "learning_rate": 4.6549676738355984e-05, |
| "loss": 0.0052, |
| "step": 5820 |
| }, |
| { |
| "epoch": 1.5360226748401555, |
| "grad_norm": 0.051634710282087326, |
| "learning_rate": 4.641773321018604e-05, |
| "loss": 0.0031, |
| "step": 5825 |
| }, |
| { |
| "epoch": 1.5373409795003625, |
| "grad_norm": 0.3726826012134552, |
| "learning_rate": 4.62857896820161e-05, |
| "loss": 0.0582, |
| "step": 5830 |
| }, |
| { |
| "epoch": 1.5386592841605695, |
| "grad_norm": 0.5827310681343079, |
| "learning_rate": 4.615384615384616e-05, |
| "loss": 0.0652, |
| "step": 5835 |
| }, |
| { |
| "epoch": 1.5399775888207765, |
| "grad_norm": 0.006390869617462158, |
| "learning_rate": 4.6021902625676214e-05, |
| "loss": 0.0022, |
| "step": 5840 |
| }, |
| { |
| "epoch": 1.5412958934809835, |
| "grad_norm": 0.022760871797800064, |
| "learning_rate": 4.588995909750627e-05, |
| "loss": 0.0311, |
| "step": 5845 |
| }, |
| { |
| "epoch": 1.5426141981411905, |
| "grad_norm": 0.22773241996765137, |
| "learning_rate": 4.5758015569336325e-05, |
| "loss": 0.0051, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.5439325028013973, |
| "grad_norm": 0.015375247225165367, |
| "learning_rate": 4.562607204116639e-05, |
| "loss": 0.0023, |
| "step": 5855 |
| }, |
| { |
| "epoch": 1.5452508074616045, |
| "grad_norm": 0.007347101345658302, |
| "learning_rate": 4.549412851299644e-05, |
| "loss": 0.0437, |
| "step": 5860 |
| }, |
| { |
| "epoch": 1.5465691121218113, |
| "grad_norm": 0.012344900518655777, |
| "learning_rate": 4.536218498482649e-05, |
| "loss": 0.004, |
| "step": 5865 |
| }, |
| { |
| "epoch": 1.5478874167820185, |
| "grad_norm": 0.27038896083831787, |
| "learning_rate": 4.5230241456656555e-05, |
| "loss": 0.0047, |
| "step": 5870 |
| }, |
| { |
| "epoch": 1.5492057214422252, |
| "grad_norm": 0.016395213082432747, |
| "learning_rate": 4.509829792848661e-05, |
| "loss": 0.0026, |
| "step": 5875 |
| }, |
| { |
| "epoch": 1.5505240261024322, |
| "grad_norm": 0.4217267632484436, |
| "learning_rate": 4.4966354400316666e-05, |
| "loss": 0.0364, |
| "step": 5880 |
| }, |
| { |
| "epoch": 1.5518423307626392, |
| "grad_norm": 0.20046105980873108, |
| "learning_rate": 4.483441087214673e-05, |
| "loss": 0.0243, |
| "step": 5885 |
| }, |
| { |
| "epoch": 1.5531606354228462, |
| "grad_norm": 0.004307698458433151, |
| "learning_rate": 4.470246734397678e-05, |
| "loss": 0.0064, |
| "step": 5890 |
| }, |
| { |
| "epoch": 1.5544789400830532, |
| "grad_norm": 0.46102187037467957, |
| "learning_rate": 4.457052381580683e-05, |
| "loss": 0.0115, |
| "step": 5895 |
| }, |
| { |
| "epoch": 1.5557972447432602, |
| "grad_norm": 0.0689118504524231, |
| "learning_rate": 4.4438580287636895e-05, |
| "loss": 0.0334, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.5571155494034672, |
| "grad_norm": 0.003091114340350032, |
| "learning_rate": 4.430663675946695e-05, |
| "loss": 0.0246, |
| "step": 5905 |
| }, |
| { |
| "epoch": 1.558433854063674, |
| "grad_norm": 0.003877349430695176, |
| "learning_rate": 4.417469323129701e-05, |
| "loss": 0.0032, |
| "step": 5910 |
| }, |
| { |
| "epoch": 1.5597521587238812, |
| "grad_norm": 0.30713143944740295, |
| "learning_rate": 4.404274970312706e-05, |
| "loss": 0.0229, |
| "step": 5915 |
| }, |
| { |
| "epoch": 1.561070463384088, |
| "grad_norm": 0.07344445586204529, |
| "learning_rate": 4.391080617495712e-05, |
| "loss": 0.0078, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.5623887680442952, |
| "grad_norm": 0.01774723082780838, |
| "learning_rate": 4.377886264678718e-05, |
| "loss": 0.0034, |
| "step": 5925 |
| }, |
| { |
| "epoch": 1.563707072704502, |
| "grad_norm": 0.476324200630188, |
| "learning_rate": 4.3646919118617236e-05, |
| "loss": 0.0071, |
| "step": 5930 |
| }, |
| { |
| "epoch": 1.5650253773647091, |
| "grad_norm": 0.11624465882778168, |
| "learning_rate": 4.351497559044729e-05, |
| "loss": 0.0236, |
| "step": 5935 |
| }, |
| { |
| "epoch": 1.566343682024916, |
| "grad_norm": 0.190691277384758, |
| "learning_rate": 4.338303206227735e-05, |
| "loss": 0.006, |
| "step": 5940 |
| }, |
| { |
| "epoch": 1.567661986685123, |
| "grad_norm": 0.20517045259475708, |
| "learning_rate": 4.32510885341074e-05, |
| "loss": 0.009, |
| "step": 5945 |
| }, |
| { |
| "epoch": 1.56898029134533, |
| "grad_norm": 0.008122317492961884, |
| "learning_rate": 4.311914500593746e-05, |
| "loss": 0.0041, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.570298596005537, |
| "grad_norm": 0.01982291042804718, |
| "learning_rate": 4.298720147776752e-05, |
| "loss": 0.0258, |
| "step": 5955 |
| }, |
| { |
| "epoch": 1.5716169006657439, |
| "grad_norm": 0.000996922142803669, |
| "learning_rate": 4.285525794959758e-05, |
| "loss": 0.0233, |
| "step": 5960 |
| }, |
| { |
| "epoch": 1.5729352053259509, |
| "grad_norm": 0.09725592285394669, |
| "learning_rate": 4.272331442142763e-05, |
| "loss": 0.0218, |
| "step": 5965 |
| }, |
| { |
| "epoch": 1.5742535099861579, |
| "grad_norm": 0.0672350749373436, |
| "learning_rate": 4.259137089325769e-05, |
| "loss": 0.0194, |
| "step": 5970 |
| }, |
| { |
| "epoch": 1.5755718146463646, |
| "grad_norm": 0.014844833873212337, |
| "learning_rate": 4.2459427365087744e-05, |
| "loss": 0.0298, |
| "step": 5975 |
| }, |
| { |
| "epoch": 1.5768901193065719, |
| "grad_norm": 0.030519040301442146, |
| "learning_rate": 4.2327483836917806e-05, |
| "loss": 0.0178, |
| "step": 5980 |
| }, |
| { |
| "epoch": 1.5782084239667786, |
| "grad_norm": 0.018561460077762604, |
| "learning_rate": 4.219554030874786e-05, |
| "loss": 0.0154, |
| "step": 5985 |
| }, |
| { |
| "epoch": 1.5795267286269858, |
| "grad_norm": 0.02470085583627224, |
| "learning_rate": 4.206359678057791e-05, |
| "loss": 0.0361, |
| "step": 5990 |
| }, |
| { |
| "epoch": 1.5808450332871926, |
| "grad_norm": 0.055412422865629196, |
| "learning_rate": 4.193165325240797e-05, |
| "loss": 0.0162, |
| "step": 5995 |
| }, |
| { |
| "epoch": 1.5821633379473996, |
| "grad_norm": 0.0034158769994974136, |
| "learning_rate": 4.179970972423803e-05, |
| "loss": 0.0068, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.5821633379473996, |
| "eval_loss": 0.024797894060611725, |
| "eval_runtime": 452.1611, |
| "eval_samples_per_second": 7.458, |
| "eval_steps_per_second": 3.729, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.5834816426076066, |
| "grad_norm": 0.01284120511263609, |
| "learning_rate": 4.1667766196068085e-05, |
| "loss": 0.0036, |
| "step": 6005 |
| }, |
| { |
| "epoch": 1.5847999472678136, |
| "grad_norm": 0.01274865586310625, |
| "learning_rate": 4.153582266789815e-05, |
| "loss": 0.0447, |
| "step": 6010 |
| }, |
| { |
| "epoch": 1.5861182519280206, |
| "grad_norm": 0.03555435314774513, |
| "learning_rate": 4.1403879139728196e-05, |
| "loss": 0.0078, |
| "step": 6015 |
| }, |
| { |
| "epoch": 1.5874365565882276, |
| "grad_norm": 0.0011938117677345872, |
| "learning_rate": 4.127193561155825e-05, |
| "loss": 0.0136, |
| "step": 6020 |
| }, |
| { |
| "epoch": 1.5887548612484346, |
| "grad_norm": 0.9741255640983582, |
| "learning_rate": 4.1139992083388314e-05, |
| "loss": 0.0153, |
| "step": 6025 |
| }, |
| { |
| "epoch": 1.5900731659086413, |
| "grad_norm": 0.011220674030482769, |
| "learning_rate": 4.100804855521837e-05, |
| "loss": 0.0262, |
| "step": 6030 |
| }, |
| { |
| "epoch": 1.5913914705688486, |
| "grad_norm": 0.021556466817855835, |
| "learning_rate": 4.0876105027048425e-05, |
| "loss": 0.0044, |
| "step": 6035 |
| }, |
| { |
| "epoch": 1.5927097752290553, |
| "grad_norm": 0.2725502848625183, |
| "learning_rate": 4.074416149887848e-05, |
| "loss": 0.0558, |
| "step": 6040 |
| }, |
| { |
| "epoch": 1.5940280798892625, |
| "grad_norm": 0.6407182216644287, |
| "learning_rate": 4.0612217970708537e-05, |
| "loss": 0.0261, |
| "step": 6045 |
| }, |
| { |
| "epoch": 1.5953463845494693, |
| "grad_norm": 0.0024960115551948547, |
| "learning_rate": 4.04802744425386e-05, |
| "loss": 0.0128, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.5966646892096763, |
| "grad_norm": 0.11380109190940857, |
| "learning_rate": 4.0348330914368655e-05, |
| "loss": 0.0199, |
| "step": 6055 |
| }, |
| { |
| "epoch": 1.5979829938698833, |
| "grad_norm": 0.18358005583286285, |
| "learning_rate": 4.0216387386198704e-05, |
| "loss": 0.0083, |
| "step": 6060 |
| }, |
| { |
| "epoch": 1.5993012985300903, |
| "grad_norm": 0.06412303447723389, |
| "learning_rate": 4.0084443858028766e-05, |
| "loss": 0.0548, |
| "step": 6065 |
| }, |
| { |
| "epoch": 1.6006196031902973, |
| "grad_norm": 0.6999421119689941, |
| "learning_rate": 3.995250032985882e-05, |
| "loss": 0.0074, |
| "step": 6070 |
| }, |
| { |
| "epoch": 1.6019379078505043, |
| "grad_norm": 0.18698133528232574, |
| "learning_rate": 3.982055680168888e-05, |
| "loss": 0.0542, |
| "step": 6075 |
| }, |
| { |
| "epoch": 1.6032562125107113, |
| "grad_norm": 0.014717207290232182, |
| "learning_rate": 3.968861327351894e-05, |
| "loss": 0.0071, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.604574517170918, |
| "grad_norm": 0.0765385851264, |
| "learning_rate": 3.955666974534899e-05, |
| "loss": 0.0063, |
| "step": 6085 |
| }, |
| { |
| "epoch": 1.6058928218311253, |
| "grad_norm": 0.4332450330257416, |
| "learning_rate": 3.9424726217179044e-05, |
| "loss": 0.0071, |
| "step": 6090 |
| }, |
| { |
| "epoch": 1.607211126491332, |
| "grad_norm": 0.003700035158544779, |
| "learning_rate": 3.929278268900911e-05, |
| "loss": 0.0052, |
| "step": 6095 |
| }, |
| { |
| "epoch": 1.6085294311515392, |
| "grad_norm": 0.02500278130173683, |
| "learning_rate": 3.916083916083916e-05, |
| "loss": 0.0387, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.609847735811746, |
| "grad_norm": 0.023568281903862953, |
| "learning_rate": 3.902889563266922e-05, |
| "loss": 0.0594, |
| "step": 6105 |
| }, |
| { |
| "epoch": 1.6111660404719532, |
| "grad_norm": 0.02687825821340084, |
| "learning_rate": 3.8896952104499274e-05, |
| "loss": 0.0229, |
| "step": 6110 |
| }, |
| { |
| "epoch": 1.61248434513216, |
| "grad_norm": 0.005178579594939947, |
| "learning_rate": 3.876500857632933e-05, |
| "loss": 0.0293, |
| "step": 6115 |
| }, |
| { |
| "epoch": 1.613802649792367, |
| "grad_norm": 0.3987988531589508, |
| "learning_rate": 3.863306504815939e-05, |
| "loss": 0.015, |
| "step": 6120 |
| }, |
| { |
| "epoch": 1.615120954452574, |
| "grad_norm": 0.18915466964244843, |
| "learning_rate": 3.850112151998945e-05, |
| "loss": 0.023, |
| "step": 6125 |
| }, |
| { |
| "epoch": 1.616439259112781, |
| "grad_norm": 0.015252528712153435, |
| "learning_rate": 3.83691779918195e-05, |
| "loss": 0.0185, |
| "step": 6130 |
| }, |
| { |
| "epoch": 1.617757563772988, |
| "grad_norm": 0.04947187379002571, |
| "learning_rate": 3.823723446364956e-05, |
| "loss": 0.0131, |
| "step": 6135 |
| }, |
| { |
| "epoch": 1.619075868433195, |
| "grad_norm": 0.017095958814024925, |
| "learning_rate": 3.8105290935479615e-05, |
| "loss": 0.0071, |
| "step": 6140 |
| }, |
| { |
| "epoch": 1.620394173093402, |
| "grad_norm": 0.013050337322056293, |
| "learning_rate": 3.797334740730967e-05, |
| "loss": 0.0038, |
| "step": 6145 |
| }, |
| { |
| "epoch": 1.6217124777536087, |
| "grad_norm": 0.08132806420326233, |
| "learning_rate": 3.784140387913973e-05, |
| "loss": 0.0043, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.623030782413816, |
| "grad_norm": 0.020741304382681847, |
| "learning_rate": 3.770946035096979e-05, |
| "loss": 0.006, |
| "step": 6155 |
| }, |
| { |
| "epoch": 1.6243490870740227, |
| "grad_norm": 0.0576217919588089, |
| "learning_rate": 3.7577516822799844e-05, |
| "loss": 0.0033, |
| "step": 6160 |
| }, |
| { |
| "epoch": 1.62566739173423, |
| "grad_norm": 0.03032900020480156, |
| "learning_rate": 3.74455732946299e-05, |
| "loss": 0.0318, |
| "step": 6165 |
| }, |
| { |
| "epoch": 1.6269856963944367, |
| "grad_norm": 0.8868799209594727, |
| "learning_rate": 3.7313629766459955e-05, |
| "loss": 0.0304, |
| "step": 6170 |
| }, |
| { |
| "epoch": 1.6283040010546437, |
| "grad_norm": 0.003816834883764386, |
| "learning_rate": 3.718168623829002e-05, |
| "loss": 0.003, |
| "step": 6175 |
| }, |
| { |
| "epoch": 1.6296223057148507, |
| "grad_norm": 0.05368296429514885, |
| "learning_rate": 3.704974271012007e-05, |
| "loss": 0.0064, |
| "step": 6180 |
| }, |
| { |
| "epoch": 1.6309406103750577, |
| "grad_norm": 0.09963366389274597, |
| "learning_rate": 3.691779918195012e-05, |
| "loss": 0.0097, |
| "step": 6185 |
| }, |
| { |
| "epoch": 1.6322589150352647, |
| "grad_norm": 0.006273225415498018, |
| "learning_rate": 3.6785855653780185e-05, |
| "loss": 0.0071, |
| "step": 6190 |
| }, |
| { |
| "epoch": 1.6335772196954716, |
| "grad_norm": 0.15079188346862793, |
| "learning_rate": 3.665391212561024e-05, |
| "loss": 0.0058, |
| "step": 6195 |
| }, |
| { |
| "epoch": 1.6348955243556786, |
| "grad_norm": 0.004980973433703184, |
| "learning_rate": 3.6521968597440296e-05, |
| "loss": 0.0051, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.6362138290158854, |
| "grad_norm": 0.004235363099724054, |
| "learning_rate": 3.639002506927036e-05, |
| "loss": 0.0028, |
| "step": 6205 |
| }, |
| { |
| "epoch": 1.6375321336760926, |
| "grad_norm": 0.003829963505268097, |
| "learning_rate": 3.625808154110041e-05, |
| "loss": 0.0347, |
| "step": 6210 |
| }, |
| { |
| "epoch": 1.6388504383362994, |
| "grad_norm": 0.021650686860084534, |
| "learning_rate": 3.612613801293046e-05, |
| "loss": 0.0036, |
| "step": 6215 |
| }, |
| { |
| "epoch": 1.6401687429965066, |
| "grad_norm": 0.06326934695243835, |
| "learning_rate": 3.5994194484760525e-05, |
| "loss": 0.0228, |
| "step": 6220 |
| }, |
| { |
| "epoch": 1.6414870476567134, |
| "grad_norm": 0.017276322469115257, |
| "learning_rate": 3.586225095659058e-05, |
| "loss": 0.0025, |
| "step": 6225 |
| }, |
| { |
| "epoch": 1.6428053523169206, |
| "grad_norm": 0.005066063720732927, |
| "learning_rate": 3.573030742842064e-05, |
| "loss": 0.0047, |
| "step": 6230 |
| }, |
| { |
| "epoch": 1.6441236569771274, |
| "grad_norm": 0.003512267954647541, |
| "learning_rate": 3.559836390025069e-05, |
| "loss": 0.0018, |
| "step": 6235 |
| }, |
| { |
| "epoch": 1.6454419616373344, |
| "grad_norm": 0.004347699694335461, |
| "learning_rate": 3.546642037208075e-05, |
| "loss": 0.0045, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.6467602662975414, |
| "grad_norm": 0.008277533575892448, |
| "learning_rate": 3.533447684391081e-05, |
| "loss": 0.0456, |
| "step": 6245 |
| }, |
| { |
| "epoch": 1.6480785709577483, |
| "grad_norm": 0.00973033718764782, |
| "learning_rate": 3.5202533315740866e-05, |
| "loss": 0.0215, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.6493968756179553, |
| "grad_norm": 1.9432978630065918, |
| "learning_rate": 3.507058978757092e-05, |
| "loss": 0.0132, |
| "step": 6255 |
| }, |
| { |
| "epoch": 1.6507151802781623, |
| "grad_norm": 0.2693535387516022, |
| "learning_rate": 3.493864625940098e-05, |
| "loss": 0.0037, |
| "step": 6260 |
| }, |
| { |
| "epoch": 1.6520334849383693, |
| "grad_norm": 0.02107766456902027, |
| "learning_rate": 3.480670273123103e-05, |
| "loss": 0.0031, |
| "step": 6265 |
| }, |
| { |
| "epoch": 1.653351789598576, |
| "grad_norm": 0.07168436795473099, |
| "learning_rate": 3.467475920306109e-05, |
| "loss": 0.0101, |
| "step": 6270 |
| }, |
| { |
| "epoch": 1.6546700942587833, |
| "grad_norm": 0.06479799002408981, |
| "learning_rate": 3.454281567489115e-05, |
| "loss": 0.0032, |
| "step": 6275 |
| }, |
| { |
| "epoch": 1.65598839891899, |
| "grad_norm": 0.0013557536294683814, |
| "learning_rate": 3.441087214672121e-05, |
| "loss": 0.0037, |
| "step": 6280 |
| }, |
| { |
| "epoch": 1.6573067035791973, |
| "grad_norm": 0.07330150157213211, |
| "learning_rate": 3.427892861855126e-05, |
| "loss": 0.0031, |
| "step": 6285 |
| }, |
| { |
| "epoch": 1.658625008239404, |
| "grad_norm": 0.08246012777090073, |
| "learning_rate": 3.414698509038132e-05, |
| "loss": 0.0028, |
| "step": 6290 |
| }, |
| { |
| "epoch": 1.659943312899611, |
| "grad_norm": 0.6232367157936096, |
| "learning_rate": 3.4015041562211374e-05, |
| "loss": 0.0042, |
| "step": 6295 |
| }, |
| { |
| "epoch": 1.661261617559818, |
| "grad_norm": 0.007676729932427406, |
| "learning_rate": 3.388309803404143e-05, |
| "loss": 0.0501, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.662579922220025, |
| "grad_norm": 0.02081216312944889, |
| "learning_rate": 3.375115450587149e-05, |
| "loss": 0.0047, |
| "step": 6305 |
| }, |
| { |
| "epoch": 1.663898226880232, |
| "grad_norm": 0.008829087018966675, |
| "learning_rate": 3.361921097770154e-05, |
| "loss": 0.0298, |
| "step": 6310 |
| }, |
| { |
| "epoch": 1.665216531540439, |
| "grad_norm": 0.4426127076148987, |
| "learning_rate": 3.34872674495316e-05, |
| "loss": 0.0045, |
| "step": 6315 |
| }, |
| { |
| "epoch": 1.666534836200646, |
| "grad_norm": 0.025818035006523132, |
| "learning_rate": 3.335532392136166e-05, |
| "loss": 0.0028, |
| "step": 6320 |
| }, |
| { |
| "epoch": 1.6678531408608528, |
| "grad_norm": 0.6068133115768433, |
| "learning_rate": 3.3223380393191715e-05, |
| "loss": 0.0202, |
| "step": 6325 |
| }, |
| { |
| "epoch": 1.66917144552106, |
| "grad_norm": 0.02740122564136982, |
| "learning_rate": 3.309143686502178e-05, |
| "loss": 0.0025, |
| "step": 6330 |
| }, |
| { |
| "epoch": 1.6704897501812668, |
| "grad_norm": 0.15878735482692719, |
| "learning_rate": 3.2959493336851826e-05, |
| "loss": 0.004, |
| "step": 6335 |
| }, |
| { |
| "epoch": 1.671808054841474, |
| "grad_norm": 0.006827466655522585, |
| "learning_rate": 3.282754980868188e-05, |
| "loss": 0.0048, |
| "step": 6340 |
| }, |
| { |
| "epoch": 1.6731263595016808, |
| "grad_norm": 0.19508551061153412, |
| "learning_rate": 3.2695606280511944e-05, |
| "loss": 0.0025, |
| "step": 6345 |
| }, |
| { |
| "epoch": 1.674444664161888, |
| "grad_norm": 0.8176754713058472, |
| "learning_rate": 3.2563662752342e-05, |
| "loss": 0.0151, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.6757629688220947, |
| "grad_norm": 0.011672024615108967, |
| "learning_rate": 3.2431719224172055e-05, |
| "loss": 0.0452, |
| "step": 6355 |
| }, |
| { |
| "epoch": 1.6770812734823017, |
| "grad_norm": 0.015824951231479645, |
| "learning_rate": 3.229977569600211e-05, |
| "loss": 0.0236, |
| "step": 6360 |
| }, |
| { |
| "epoch": 1.6783995781425087, |
| "grad_norm": 0.1358737051486969, |
| "learning_rate": 3.216783216783217e-05, |
| "loss": 0.0078, |
| "step": 6365 |
| }, |
| { |
| "epoch": 1.6797178828027157, |
| "grad_norm": 0.004896901547908783, |
| "learning_rate": 3.203588863966223e-05, |
| "loss": 0.0042, |
| "step": 6370 |
| }, |
| { |
| "epoch": 1.6810361874629227, |
| "grad_norm": 0.22593103349208832, |
| "learning_rate": 3.1903945111492285e-05, |
| "loss": 0.0053, |
| "step": 6375 |
| }, |
| { |
| "epoch": 1.6823544921231297, |
| "grad_norm": 0.0073196059092879295, |
| "learning_rate": 3.177200158332234e-05, |
| "loss": 0.0287, |
| "step": 6380 |
| }, |
| { |
| "epoch": 1.6836727967833367, |
| "grad_norm": 0.018524926155805588, |
| "learning_rate": 3.1640058055152396e-05, |
| "loss": 0.0122, |
| "step": 6385 |
| }, |
| { |
| "epoch": 1.6849911014435435, |
| "grad_norm": 0.7453815937042236, |
| "learning_rate": 3.150811452698245e-05, |
| "loss": 0.0378, |
| "step": 6390 |
| }, |
| { |
| "epoch": 1.6863094061037507, |
| "grad_norm": 0.22409795224666595, |
| "learning_rate": 3.137617099881251e-05, |
| "loss": 0.0282, |
| "step": 6395 |
| }, |
| { |
| "epoch": 1.6876277107639575, |
| "grad_norm": 0.005432693753391504, |
| "learning_rate": 3.124422747064257e-05, |
| "loss": 0.0162, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.6889460154241647, |
| "grad_norm": 0.1493055820465088, |
| "learning_rate": 3.1112283942472626e-05, |
| "loss": 0.0123, |
| "step": 6405 |
| }, |
| { |
| "epoch": 1.6902643200843714, |
| "grad_norm": 0.1638440042734146, |
| "learning_rate": 3.0980340414302674e-05, |
| "loss": 0.0058, |
| "step": 6410 |
| }, |
| { |
| "epoch": 1.6915826247445784, |
| "grad_norm": 0.015779908746480942, |
| "learning_rate": 3.084839688613274e-05, |
| "loss": 0.0157, |
| "step": 6415 |
| }, |
| { |
| "epoch": 1.6929009294047854, |
| "grad_norm": 0.0012348912423476577, |
| "learning_rate": 3.071645335796279e-05, |
| "loss": 0.0016, |
| "step": 6420 |
| }, |
| { |
| "epoch": 1.6942192340649924, |
| "grad_norm": 0.05294624716043472, |
| "learning_rate": 3.058450982979285e-05, |
| "loss": 0.0037, |
| "step": 6425 |
| }, |
| { |
| "epoch": 1.6955375387251994, |
| "grad_norm": 0.01926981844007969, |
| "learning_rate": 3.045256630162291e-05, |
| "loss": 0.0053, |
| "step": 6430 |
| }, |
| { |
| "epoch": 1.6968558433854064, |
| "grad_norm": 0.005958891473710537, |
| "learning_rate": 3.0320622773452963e-05, |
| "loss": 0.0025, |
| "step": 6435 |
| }, |
| { |
| "epoch": 1.6981741480456134, |
| "grad_norm": 0.001902201445773244, |
| "learning_rate": 3.018867924528302e-05, |
| "loss": 0.0027, |
| "step": 6440 |
| }, |
| { |
| "epoch": 1.6994924527058202, |
| "grad_norm": 0.036614127457141876, |
| "learning_rate": 3.0056735717113078e-05, |
| "loss": 0.0026, |
| "step": 6445 |
| }, |
| { |
| "epoch": 1.7008107573660274, |
| "grad_norm": 0.07294526696205139, |
| "learning_rate": 2.9924792188943133e-05, |
| "loss": 0.0042, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.7021290620262342, |
| "grad_norm": 0.42822372913360596, |
| "learning_rate": 2.9792848660773192e-05, |
| "loss": 0.013, |
| "step": 6455 |
| }, |
| { |
| "epoch": 1.7034473666864414, |
| "grad_norm": 0.036622967571020126, |
| "learning_rate": 2.9660905132603245e-05, |
| "loss": 0.0029, |
| "step": 6460 |
| }, |
| { |
| "epoch": 1.7047656713466481, |
| "grad_norm": 0.08314034342765808, |
| "learning_rate": 2.9528961604433304e-05, |
| "loss": 0.0043, |
| "step": 6465 |
| }, |
| { |
| "epoch": 1.7060839760068551, |
| "grad_norm": 0.0005654952838085592, |
| "learning_rate": 2.939701807626336e-05, |
| "loss": 0.0595, |
| "step": 6470 |
| }, |
| { |
| "epoch": 1.7074022806670621, |
| "grad_norm": 0.004545385017991066, |
| "learning_rate": 2.926507454809342e-05, |
| "loss": 0.0044, |
| "step": 6475 |
| }, |
| { |
| "epoch": 1.7087205853272691, |
| "grad_norm": 0.00033831383916549385, |
| "learning_rate": 2.9133131019923477e-05, |
| "loss": 0.0046, |
| "step": 6480 |
| }, |
| { |
| "epoch": 1.710038889987476, |
| "grad_norm": 0.0019903562497347593, |
| "learning_rate": 2.900118749175353e-05, |
| "loss": 0.0026, |
| "step": 6485 |
| }, |
| { |
| "epoch": 1.711357194647683, |
| "grad_norm": 0.10188104957342148, |
| "learning_rate": 2.8869243963583585e-05, |
| "loss": 0.0069, |
| "step": 6490 |
| }, |
| { |
| "epoch": 1.71267549930789, |
| "grad_norm": 0.2123432606458664, |
| "learning_rate": 2.8737300435413644e-05, |
| "loss": 0.0199, |
| "step": 6495 |
| }, |
| { |
| "epoch": 1.7139938039680969, |
| "grad_norm": 0.43209517002105713, |
| "learning_rate": 2.8605356907243703e-05, |
| "loss": 0.0099, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.7139938039680969, |
| "eval_loss": 0.024327505379915237, |
| "eval_runtime": 452.0052, |
| "eval_samples_per_second": 7.46, |
| "eval_steps_per_second": 3.73, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.715312108628304, |
| "grad_norm": 0.009868285618722439, |
| "learning_rate": 2.847341337907376e-05, |
| "loss": 0.0025, |
| "step": 6505 |
| }, |
| { |
| "epoch": 1.7166304132885108, |
| "grad_norm": 0.00778606254607439, |
| "learning_rate": 2.834146985090381e-05, |
| "loss": 0.0028, |
| "step": 6510 |
| }, |
| { |
| "epoch": 1.717948717948718, |
| "grad_norm": 0.02987460047006607, |
| "learning_rate": 2.820952632273387e-05, |
| "loss": 0.0068, |
| "step": 6515 |
| }, |
| { |
| "epoch": 1.7192670226089248, |
| "grad_norm": 0.04475142061710358, |
| "learning_rate": 2.807758279456393e-05, |
| "loss": 0.0022, |
| "step": 6520 |
| }, |
| { |
| "epoch": 1.720585327269132, |
| "grad_norm": 0.12720516324043274, |
| "learning_rate": 2.7945639266393985e-05, |
| "loss": 0.0488, |
| "step": 6525 |
| }, |
| { |
| "epoch": 1.7219036319293388, |
| "grad_norm": 0.0011463731061667204, |
| "learning_rate": 2.7813695738224044e-05, |
| "loss": 0.0023, |
| "step": 6530 |
| }, |
| { |
| "epoch": 1.7232219365895458, |
| "grad_norm": 0.008907752111554146, |
| "learning_rate": 2.7681752210054096e-05, |
| "loss": 0.0039, |
| "step": 6535 |
| }, |
| { |
| "epoch": 1.7245402412497528, |
| "grad_norm": 0.008416680619120598, |
| "learning_rate": 2.7549808681884156e-05, |
| "loss": 0.0055, |
| "step": 6540 |
| }, |
| { |
| "epoch": 1.7258585459099598, |
| "grad_norm": 0.26278871297836304, |
| "learning_rate": 2.741786515371421e-05, |
| "loss": 0.0386, |
| "step": 6545 |
| }, |
| { |
| "epoch": 1.7271768505701668, |
| "grad_norm": 0.01750275492668152, |
| "learning_rate": 2.728592162554427e-05, |
| "loss": 0.0048, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.7284951552303738, |
| "grad_norm": 0.009483959525823593, |
| "learning_rate": 2.7153978097374326e-05, |
| "loss": 0.0061, |
| "step": 6555 |
| }, |
| { |
| "epoch": 1.7298134598905808, |
| "grad_norm": 0.016591722145676613, |
| "learning_rate": 2.7022034569204378e-05, |
| "loss": 0.0058, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.7311317645507875, |
| "grad_norm": 0.5120682716369629, |
| "learning_rate": 2.6890091041034437e-05, |
| "loss": 0.0229, |
| "step": 6565 |
| }, |
| { |
| "epoch": 1.7324500692109948, |
| "grad_norm": 0.03748248517513275, |
| "learning_rate": 2.6758147512864496e-05, |
| "loss": 0.0026, |
| "step": 6570 |
| }, |
| { |
| "epoch": 1.7337683738712015, |
| "grad_norm": 0.08328749984502792, |
| "learning_rate": 2.6626203984694552e-05, |
| "loss": 0.0052, |
| "step": 6575 |
| }, |
| { |
| "epoch": 1.7350866785314087, |
| "grad_norm": 0.012284482829272747, |
| "learning_rate": 2.649426045652461e-05, |
| "loss": 0.0353, |
| "step": 6580 |
| }, |
| { |
| "epoch": 1.7364049831916155, |
| "grad_norm": 0.06362583488225937, |
| "learning_rate": 2.6362316928354663e-05, |
| "loss": 0.0309, |
| "step": 6585 |
| }, |
| { |
| "epoch": 1.7377232878518225, |
| "grad_norm": 0.01475360058248043, |
| "learning_rate": 2.6230373400184722e-05, |
| "loss": 0.0034, |
| "step": 6590 |
| }, |
| { |
| "epoch": 1.7390415925120295, |
| "grad_norm": 0.002241638721898198, |
| "learning_rate": 2.6098429872014778e-05, |
| "loss": 0.0365, |
| "step": 6595 |
| }, |
| { |
| "epoch": 1.7403598971722365, |
| "grad_norm": 0.11375941336154938, |
| "learning_rate": 2.5966486343844837e-05, |
| "loss": 0.0241, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.7416782018324435, |
| "grad_norm": 0.009631779976189137, |
| "learning_rate": 2.5834542815674896e-05, |
| "loss": 0.0026, |
| "step": 6605 |
| }, |
| { |
| "epoch": 1.7429965064926505, |
| "grad_norm": 0.12113262712955475, |
| "learning_rate": 2.570259928750495e-05, |
| "loss": 0.0207, |
| "step": 6610 |
| }, |
| { |
| "epoch": 1.7443148111528575, |
| "grad_norm": 0.006536155007779598, |
| "learning_rate": 2.5570655759335004e-05, |
| "loss": 0.0022, |
| "step": 6615 |
| }, |
| { |
| "epoch": 1.7456331158130642, |
| "grad_norm": 0.043030887842178345, |
| "learning_rate": 2.5438712231165063e-05, |
| "loss": 0.003, |
| "step": 6620 |
| }, |
| { |
| "epoch": 1.7469514204732715, |
| "grad_norm": 0.00860620103776455, |
| "learning_rate": 2.5306768702995122e-05, |
| "loss": 0.027, |
| "step": 6625 |
| }, |
| { |
| "epoch": 1.7482697251334782, |
| "grad_norm": 0.014589210972189903, |
| "learning_rate": 2.5174825174825178e-05, |
| "loss": 0.0224, |
| "step": 6630 |
| }, |
| { |
| "epoch": 1.7495880297936854, |
| "grad_norm": 0.01215316355228424, |
| "learning_rate": 2.504288164665523e-05, |
| "loss": 0.011, |
| "step": 6635 |
| }, |
| { |
| "epoch": 1.7509063344538922, |
| "grad_norm": 0.10951556265354156, |
| "learning_rate": 2.491093811848529e-05, |
| "loss": 0.0384, |
| "step": 6640 |
| }, |
| { |
| "epoch": 1.7522246391140994, |
| "grad_norm": 0.30859875679016113, |
| "learning_rate": 2.4778994590315345e-05, |
| "loss": 0.0031, |
| "step": 6645 |
| }, |
| { |
| "epoch": 1.7535429437743062, |
| "grad_norm": 0.025427229702472687, |
| "learning_rate": 2.4647051062145404e-05, |
| "loss": 0.0171, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.7548612484345132, |
| "grad_norm": 0.03334197774529457, |
| "learning_rate": 2.451510753397546e-05, |
| "loss": 0.0473, |
| "step": 6655 |
| }, |
| { |
| "epoch": 1.7561795530947202, |
| "grad_norm": 0.013445639982819557, |
| "learning_rate": 2.438316400580552e-05, |
| "loss": 0.0056, |
| "step": 6660 |
| }, |
| { |
| "epoch": 1.7574978577549272, |
| "grad_norm": 0.008306960575282574, |
| "learning_rate": 2.425122047763557e-05, |
| "loss": 0.0104, |
| "step": 6665 |
| }, |
| { |
| "epoch": 1.7588161624151342, |
| "grad_norm": 0.012615012936294079, |
| "learning_rate": 2.411927694946563e-05, |
| "loss": 0.0097, |
| "step": 6670 |
| }, |
| { |
| "epoch": 1.7601344670753412, |
| "grad_norm": 0.006827410310506821, |
| "learning_rate": 2.398733342129569e-05, |
| "loss": 0.0057, |
| "step": 6675 |
| }, |
| { |
| "epoch": 1.7614527717355482, |
| "grad_norm": 0.017035294324159622, |
| "learning_rate": 2.3855389893125745e-05, |
| "loss": 0.0035, |
| "step": 6680 |
| }, |
| { |
| "epoch": 1.762771076395755, |
| "grad_norm": 0.036102693527936935, |
| "learning_rate": 2.37234463649558e-05, |
| "loss": 0.0031, |
| "step": 6685 |
| }, |
| { |
| "epoch": 1.7640893810559621, |
| "grad_norm": 0.5004498958587646, |
| "learning_rate": 2.3591502836785856e-05, |
| "loss": 0.0217, |
| "step": 6690 |
| }, |
| { |
| "epoch": 1.765407685716169, |
| "grad_norm": 0.017726672813296318, |
| "learning_rate": 2.3459559308615915e-05, |
| "loss": 0.0112, |
| "step": 6695 |
| }, |
| { |
| "epoch": 1.7667259903763761, |
| "grad_norm": 0.00940331444144249, |
| "learning_rate": 2.332761578044597e-05, |
| "loss": 0.0107, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.768044295036583, |
| "grad_norm": 0.007495497819036245, |
| "learning_rate": 2.3195672252276026e-05, |
| "loss": 0.0032, |
| "step": 6705 |
| }, |
| { |
| "epoch": 1.7693625996967899, |
| "grad_norm": 0.6863199472427368, |
| "learning_rate": 2.3063728724106085e-05, |
| "loss": 0.034, |
| "step": 6710 |
| }, |
| { |
| "epoch": 1.7706809043569969, |
| "grad_norm": 0.004587489180266857, |
| "learning_rate": 2.293178519593614e-05, |
| "loss": 0.0032, |
| "step": 6715 |
| }, |
| { |
| "epoch": 1.7719992090172039, |
| "grad_norm": 0.017706016078591347, |
| "learning_rate": 2.2799841667766197e-05, |
| "loss": 0.0036, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.7733175136774109, |
| "grad_norm": 0.012740216217935085, |
| "learning_rate": 2.2667898139596252e-05, |
| "loss": 0.0147, |
| "step": 6725 |
| }, |
| { |
| "epoch": 1.7746358183376179, |
| "grad_norm": 0.010391579940915108, |
| "learning_rate": 2.253595461142631e-05, |
| "loss": 0.0041, |
| "step": 6730 |
| }, |
| { |
| "epoch": 1.7759541229978248, |
| "grad_norm": 0.021570540964603424, |
| "learning_rate": 2.2404011083256367e-05, |
| "loss": 0.0363, |
| "step": 6735 |
| }, |
| { |
| "epoch": 1.7772724276580316, |
| "grad_norm": 0.005778402555733919, |
| "learning_rate": 2.2272067555086423e-05, |
| "loss": 0.002, |
| "step": 6740 |
| }, |
| { |
| "epoch": 1.7785907323182388, |
| "grad_norm": 0.0, |
| "learning_rate": 2.2140124026916482e-05, |
| "loss": 0.0058, |
| "step": 6745 |
| }, |
| { |
| "epoch": 1.7799090369784456, |
| "grad_norm": 0.010869967751204967, |
| "learning_rate": 2.2008180498746537e-05, |
| "loss": 0.0036, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.7812273416386528, |
| "grad_norm": 0.04336518794298172, |
| "learning_rate": 2.1876236970576593e-05, |
| "loss": 0.0074, |
| "step": 6755 |
| }, |
| { |
| "epoch": 1.7825456462988596, |
| "grad_norm": 0.008664094842970371, |
| "learning_rate": 2.1744293442406652e-05, |
| "loss": 0.0027, |
| "step": 6760 |
| }, |
| { |
| "epoch": 1.7838639509590668, |
| "grad_norm": 0.9408183097839355, |
| "learning_rate": 2.1612349914236708e-05, |
| "loss": 0.0371, |
| "step": 6765 |
| }, |
| { |
| "epoch": 1.7851822556192736, |
| "grad_norm": 0.016822539269924164, |
| "learning_rate": 2.1480406386066763e-05, |
| "loss": 0.0137, |
| "step": 6770 |
| }, |
| { |
| "epoch": 1.7865005602794806, |
| "grad_norm": 0.00829544197767973, |
| "learning_rate": 2.134846285789682e-05, |
| "loss": 0.0134, |
| "step": 6775 |
| }, |
| { |
| "epoch": 1.7878188649396876, |
| "grad_norm": 0.0035508016590029, |
| "learning_rate": 2.1216519329726878e-05, |
| "loss": 0.0231, |
| "step": 6780 |
| }, |
| { |
| "epoch": 1.7891371695998946, |
| "grad_norm": 0.13871321082115173, |
| "learning_rate": 2.1084575801556937e-05, |
| "loss": 0.0296, |
| "step": 6785 |
| }, |
| { |
| "epoch": 1.7904554742601015, |
| "grad_norm": 0.002578354673460126, |
| "learning_rate": 2.095263227338699e-05, |
| "loss": 0.0178, |
| "step": 6790 |
| }, |
| { |
| "epoch": 1.7917737789203085, |
| "grad_norm": 0.5279458165168762, |
| "learning_rate": 2.082068874521705e-05, |
| "loss": 0.0336, |
| "step": 6795 |
| }, |
| { |
| "epoch": 1.7930920835805155, |
| "grad_norm": 0.0017439400544390082, |
| "learning_rate": 2.0688745217047104e-05, |
| "loss": 0.0031, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.7944103882407223, |
| "grad_norm": 0.007989778183400631, |
| "learning_rate": 2.055680168887716e-05, |
| "loss": 0.0081, |
| "step": 6805 |
| }, |
| { |
| "epoch": 1.7957286929009295, |
| "grad_norm": 0.015163813717663288, |
| "learning_rate": 2.042485816070722e-05, |
| "loss": 0.0234, |
| "step": 6810 |
| }, |
| { |
| "epoch": 1.7970469975611363, |
| "grad_norm": 0.10615389794111252, |
| "learning_rate": 2.0292914632537275e-05, |
| "loss": 0.0144, |
| "step": 6815 |
| }, |
| { |
| "epoch": 1.7983653022213435, |
| "grad_norm": 0.03466172143816948, |
| "learning_rate": 2.0160971104367334e-05, |
| "loss": 0.0036, |
| "step": 6820 |
| }, |
| { |
| "epoch": 1.7996836068815503, |
| "grad_norm": 0.047511328011751175, |
| "learning_rate": 2.0029027576197386e-05, |
| "loss": 0.002, |
| "step": 6825 |
| }, |
| { |
| "epoch": 1.8010019115417573, |
| "grad_norm": 0.019772246479988098, |
| "learning_rate": 1.9897084048027445e-05, |
| "loss": 0.0049, |
| "step": 6830 |
| }, |
| { |
| "epoch": 1.8023202162019643, |
| "grad_norm": 0.1156701073050499, |
| "learning_rate": 1.9765140519857504e-05, |
| "loss": 0.0033, |
| "step": 6835 |
| }, |
| { |
| "epoch": 1.8036385208621712, |
| "grad_norm": 0.010991690680384636, |
| "learning_rate": 1.963319699168756e-05, |
| "loss": 0.0036, |
| "step": 6840 |
| }, |
| { |
| "epoch": 1.8049568255223782, |
| "grad_norm": 0.29658815264701843, |
| "learning_rate": 1.9501253463517615e-05, |
| "loss": 0.0042, |
| "step": 6845 |
| }, |
| { |
| "epoch": 1.8062751301825852, |
| "grad_norm": 0.056147243827581406, |
| "learning_rate": 1.936930993534767e-05, |
| "loss": 0.0052, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.8075934348427922, |
| "grad_norm": 0.010382590815424919, |
| "learning_rate": 1.923736640717773e-05, |
| "loss": 0.0033, |
| "step": 6855 |
| }, |
| { |
| "epoch": 1.808911739502999, |
| "grad_norm": 1.1247020959854126, |
| "learning_rate": 1.9105422879007786e-05, |
| "loss": 0.0112, |
| "step": 6860 |
| }, |
| { |
| "epoch": 1.8102300441632062, |
| "grad_norm": 1.4515737295150757, |
| "learning_rate": 1.897347935083784e-05, |
| "loss": 0.0202, |
| "step": 6865 |
| }, |
| { |
| "epoch": 1.811548348823413, |
| "grad_norm": 0.016307830810546875, |
| "learning_rate": 1.88415358226679e-05, |
| "loss": 0.0148, |
| "step": 6870 |
| }, |
| { |
| "epoch": 1.8128666534836202, |
| "grad_norm": 0.0745878592133522, |
| "learning_rate": 1.8709592294497956e-05, |
| "loss": 0.0062, |
| "step": 6875 |
| }, |
| { |
| "epoch": 1.814184958143827, |
| "grad_norm": 0.02554013952612877, |
| "learning_rate": 1.8577648766328012e-05, |
| "loss": 0.003, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.815503262804034, |
| "grad_norm": 0.45748665928840637, |
| "learning_rate": 1.844570523815807e-05, |
| "loss": 0.0386, |
| "step": 6885 |
| }, |
| { |
| "epoch": 1.816821567464241, |
| "grad_norm": 0.013801589608192444, |
| "learning_rate": 1.8313761709988126e-05, |
| "loss": 0.0342, |
| "step": 6890 |
| }, |
| { |
| "epoch": 1.818139872124448, |
| "grad_norm": 0.6251696944236755, |
| "learning_rate": 1.8181818181818182e-05, |
| "loss": 0.0101, |
| "step": 6895 |
| }, |
| { |
| "epoch": 1.819458176784655, |
| "grad_norm": 0.28203102946281433, |
| "learning_rate": 1.8049874653648238e-05, |
| "loss": 0.0032, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.820776481444862, |
| "grad_norm": 0.28511062264442444, |
| "learning_rate": 1.7917931125478297e-05, |
| "loss": 0.0343, |
| "step": 6905 |
| }, |
| { |
| "epoch": 1.822094786105069, |
| "grad_norm": 0.004940215498209, |
| "learning_rate": 1.7785987597308352e-05, |
| "loss": 0.0265, |
| "step": 6910 |
| }, |
| { |
| "epoch": 1.8234130907652757, |
| "grad_norm": 0.002903093583881855, |
| "learning_rate": 1.7654044069138408e-05, |
| "loss": 0.0025, |
| "step": 6915 |
| }, |
| { |
| "epoch": 1.824731395425483, |
| "grad_norm": 0.008801674470305443, |
| "learning_rate": 1.7522100540968467e-05, |
| "loss": 0.0246, |
| "step": 6920 |
| }, |
| { |
| "epoch": 1.8260497000856897, |
| "grad_norm": 0.13823826611042023, |
| "learning_rate": 1.7390157012798523e-05, |
| "loss": 0.0058, |
| "step": 6925 |
| }, |
| { |
| "epoch": 1.827368004745897, |
| "grad_norm": 0.020868878811597824, |
| "learning_rate": 1.725821348462858e-05, |
| "loss": 0.0014, |
| "step": 6930 |
| }, |
| { |
| "epoch": 1.8286863094061037, |
| "grad_norm": 0.0027356524951756, |
| "learning_rate": 1.7126269956458638e-05, |
| "loss": 0.0035, |
| "step": 6935 |
| }, |
| { |
| "epoch": 1.8300046140663109, |
| "grad_norm": 0.06023023650050163, |
| "learning_rate": 1.6994326428288693e-05, |
| "loss": 0.0212, |
| "step": 6940 |
| }, |
| { |
| "epoch": 1.8313229187265176, |
| "grad_norm": 0.0009826788445934653, |
| "learning_rate": 1.686238290011875e-05, |
| "loss": 0.0034, |
| "step": 6945 |
| }, |
| { |
| "epoch": 1.8326412233867246, |
| "grad_norm": 0.2867647707462311, |
| "learning_rate": 1.6730439371948805e-05, |
| "loss": 0.0146, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.8339595280469316, |
| "grad_norm": 0.004501632414758205, |
| "learning_rate": 1.6598495843778864e-05, |
| "loss": 0.0026, |
| "step": 6955 |
| }, |
| { |
| "epoch": 1.8352778327071386, |
| "grad_norm": 0.01251616608351469, |
| "learning_rate": 1.6466552315608923e-05, |
| "loss": 0.0107, |
| "step": 6960 |
| }, |
| { |
| "epoch": 1.8365961373673456, |
| "grad_norm": 0.054781850427389145, |
| "learning_rate": 1.6334608787438975e-05, |
| "loss": 0.0044, |
| "step": 6965 |
| }, |
| { |
| "epoch": 1.8379144420275526, |
| "grad_norm": 0.1120501235127449, |
| "learning_rate": 1.6202665259269034e-05, |
| "loss": 0.0284, |
| "step": 6970 |
| }, |
| { |
| "epoch": 1.8392327466877596, |
| "grad_norm": 0.001668553682975471, |
| "learning_rate": 1.607072173109909e-05, |
| "loss": 0.0169, |
| "step": 6975 |
| }, |
| { |
| "epoch": 1.8405510513479664, |
| "grad_norm": 1.6374458074569702, |
| "learning_rate": 1.593877820292915e-05, |
| "loss": 0.031, |
| "step": 6980 |
| }, |
| { |
| "epoch": 1.8418693560081736, |
| "grad_norm": 0.012474550865590572, |
| "learning_rate": 1.5806834674759204e-05, |
| "loss": 0.0037, |
| "step": 6985 |
| }, |
| { |
| "epoch": 1.8431876606683804, |
| "grad_norm": 0.014898869208991528, |
| "learning_rate": 1.567489114658926e-05, |
| "loss": 0.003, |
| "step": 6990 |
| }, |
| { |
| "epoch": 1.8445059653285876, |
| "grad_norm": 0.035570453852415085, |
| "learning_rate": 1.554294761841932e-05, |
| "loss": 0.0038, |
| "step": 6995 |
| }, |
| { |
| "epoch": 1.8458242699887943, |
| "grad_norm": 0.9279152750968933, |
| "learning_rate": 1.541100409024937e-05, |
| "loss": 0.0235, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.8458242699887943, |
| "eval_loss": 0.022339830175042152, |
| "eval_runtime": 451.9068, |
| "eval_samples_per_second": 7.462, |
| "eval_steps_per_second": 3.731, |
| "step": 7000 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 7584, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.6496806486741606e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|