| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.659152330103487, |
| "eval_steps": 500, |
| "global_step": 2500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001318304660206974, |
| "grad_norm": 4.59375, |
| "learning_rate": 0.0002, |
| "loss": 1.9624, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.002636609320413948, |
| "grad_norm": 1.7421875, |
| "learning_rate": 0.00019986805647183008, |
| "loss": 0.6513, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.003954913980620921, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019973611294366012, |
| "loss": 0.1146, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.005273218640827896, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.0001996041694154902, |
| "loss": 0.0529, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.006591523301034869, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.00019947222588732023, |
| "loss": 0.1214, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.007909827961241843, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.0001993402823591503, |
| "loss": 0.0919, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.009228132621448816, |
| "grad_norm": 0.06201171875, |
| "learning_rate": 0.00019920833883098034, |
| "loss": 0.09, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.010546437281655791, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.0001990763953028104, |
| "loss": 0.1945, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.011864741941862765, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.00019894445177464048, |
| "loss": 0.1259, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.013183046602069738, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.00019881250824647052, |
| "loss": 0.027, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.014501351262276712, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.00019868056471830057, |
| "loss": 0.1068, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.015819655922483685, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.00019854862119013064, |
| "loss": 0.0542, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.01713796058269066, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 0.00019841667766196068, |
| "loss": 0.0901, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.018456265242897632, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 0.00019828473413379075, |
| "loss": 0.0091, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.019774569903104607, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 0.0001981527906056208, |
| "loss": 0.0744, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.021092874563311582, |
| "grad_norm": 0.65234375, |
| "learning_rate": 0.00019802084707745086, |
| "loss": 0.1108, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.022411179223518554, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0001978889035492809, |
| "loss": 0.0446, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.02372948388372553, |
| "grad_norm": 0.1787109375, |
| "learning_rate": 0.00019775696002111097, |
| "loss": 0.0982, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0250477885439325, |
| "grad_norm": 0.490234375, |
| "learning_rate": 0.00019762501649294104, |
| "loss": 0.1035, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.026366093204139476, |
| "grad_norm": 0.12158203125, |
| "learning_rate": 0.00019749307296477108, |
| "loss": 0.0401, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02768439786434645, |
| "grad_norm": 0.16015625, |
| "learning_rate": 0.00019736112943660115, |
| "loss": 0.0309, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.029002702524553423, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.0001972291859084312, |
| "loss": 0.1032, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0303210071847604, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.00019709724238026126, |
| "loss": 0.0811, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.03163931184496737, |
| "grad_norm": 0.177734375, |
| "learning_rate": 0.00019696529885209133, |
| "loss": 0.0258, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03295761650517435, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.00019683335532392137, |
| "loss": 0.0437, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.03427592116538132, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00019670141179575144, |
| "loss": 0.0967, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.03559422582558829, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00019656946826758148, |
| "loss": 0.0132, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.036912530485795264, |
| "grad_norm": 0.66015625, |
| "learning_rate": 0.00019643752473941155, |
| "loss": 0.0396, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03823083514600224, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0001963055812112416, |
| "loss": 0.0449, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.039549139806209214, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.00019617363768307166, |
| "loss": 0.1196, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.040867444466416186, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.0001960416941549017, |
| "loss": 0.0588, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.042185749126623165, |
| "grad_norm": 0.06005859375, |
| "learning_rate": 0.00019590975062673175, |
| "loss": 0.0234, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.04350405378683014, |
| "grad_norm": 0.4921875, |
| "learning_rate": 0.00019577780709856182, |
| "loss": 0.0916, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.04482235844703711, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.0001956458635703919, |
| "loss": 0.0271, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.04614066310724409, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.00019551392004222193, |
| "loss": 0.0175, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.04745896776745106, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 0.000195381976514052, |
| "loss": 0.0356, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.04877727242765803, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 0.00019525003298588204, |
| "loss": 0.0057, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.050095577087865, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.0001951180894577121, |
| "loss": 0.0082, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.05141388174807198, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 0.00019498614592954215, |
| "loss": 0.0178, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.05273218640827895, |
| "grad_norm": 0.0390625, |
| "learning_rate": 0.00019485420240137222, |
| "loss": 0.0789, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.054050491068485924, |
| "grad_norm": 0.5625, |
| "learning_rate": 0.0001947222588732023, |
| "loss": 0.0645, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.0553687957286929, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.00019459031534503233, |
| "loss": 0.116, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.056687100388899875, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.0001944583718168624, |
| "loss": 0.0516, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.058005405049106847, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.00019432642828869244, |
| "loss": 0.1019, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.059323709709313825, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 0.0001941944847605225, |
| "loss": 0.0529, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.0606420143695208, |
| "grad_norm": 0.4921875, |
| "learning_rate": 0.00019406254123235256, |
| "loss": 0.0368, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.06196031902972777, |
| "grad_norm": 0.054443359375, |
| "learning_rate": 0.00019393059770418262, |
| "loss": 0.037, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.06327862368993474, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 0.0001937986541760127, |
| "loss": 0.0324, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.06459692835014172, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00019366671064784274, |
| "loss": 0.0334, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.0659152330103487, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.0001935347671196728, |
| "loss": 0.0671, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.06723353767055566, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00019340282359150285, |
| "loss": 0.1559, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.06855184233076264, |
| "grad_norm": 0.7734375, |
| "learning_rate": 0.0001932708800633329, |
| "loss": 0.0198, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.06987014699096962, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.00019313893653516296, |
| "loss": 0.0151, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.07118845165117658, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 0.000193006993006993, |
| "loss": 0.0269, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.07250675631138356, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00019287504947882307, |
| "loss": 0.0565, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.07382506097159053, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.0001927431059506531, |
| "loss": 0.0942, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0751433656317975, |
| "grad_norm": 0.392578125, |
| "learning_rate": 0.00019261116242248318, |
| "loss": 0.0061, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.07646167029200449, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019247921889431325, |
| "loss": 0.0497, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.07777997495221145, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.0001923472753661433, |
| "loss": 0.0573, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.07909827961241843, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00019221533183797336, |
| "loss": 0.0528, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.08041658427262541, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.0001920833883098034, |
| "loss": 0.0506, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.08173488893283237, |
| "grad_norm": 0.08203125, |
| "learning_rate": 0.00019195144478163347, |
| "loss": 0.0307, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.08305319359303935, |
| "grad_norm": 0.111328125, |
| "learning_rate": 0.00019181950125346354, |
| "loss": 0.0365, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.08437149825324633, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00019168755772529358, |
| "loss": 0.0447, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.0856898029134533, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.00019155561419712365, |
| "loss": 0.0605, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.08700810757366027, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.0001914236706689537, |
| "loss": 0.0846, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.08832641223386725, |
| "grad_norm": 0.1494140625, |
| "learning_rate": 0.00019129172714078376, |
| "loss": 0.0713, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.08964471689407422, |
| "grad_norm": 0.1669921875, |
| "learning_rate": 0.0001911597836126138, |
| "loss": 0.0826, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.0909630215542812, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00019102784008444388, |
| "loss": 0.0441, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.09228132621448817, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00019089589655627395, |
| "loss": 0.1378, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.09359963087469514, |
| "grad_norm": 3.0625, |
| "learning_rate": 0.00019076395302810396, |
| "loss": 0.1552, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.09491793553490212, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.00019063200949993403, |
| "loss": 0.0458, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.0962362401951091, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.0001905000659717641, |
| "loss": 0.0312, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.09755454485531606, |
| "grad_norm": 0.0218505859375, |
| "learning_rate": 0.00019036812244359414, |
| "loss": 0.0247, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.09887284951552304, |
| "grad_norm": 0.064453125, |
| "learning_rate": 0.0001902361789154242, |
| "loss": 0.054, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.10019115417573, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 0.00019010423538725425, |
| "loss": 0.0023, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.10150945883593698, |
| "grad_norm": 0.0361328125, |
| "learning_rate": 0.00018997229185908432, |
| "loss": 0.0884, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.10282776349614396, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018984034833091436, |
| "loss": 0.0506, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.10414606815635093, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.00018970840480274443, |
| "loss": 0.1123, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.1054643728165579, |
| "grad_norm": 0.6953125, |
| "learning_rate": 0.0001895764612745745, |
| "loss": 0.0597, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.10678267747676488, |
| "grad_norm": 0.18359375, |
| "learning_rate": 0.00018944451774640454, |
| "loss": 0.0138, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.10810098213697185, |
| "grad_norm": 0.0272216796875, |
| "learning_rate": 0.0001893125742182346, |
| "loss": 0.0249, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.10941928679717883, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 0.00018918063069006466, |
| "loss": 0.0084, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.1107375914573858, |
| "grad_norm": 0.54296875, |
| "learning_rate": 0.00018904868716189472, |
| "loss": 0.0541, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.11205589611759277, |
| "grad_norm": 0.74609375, |
| "learning_rate": 0.00018891674363372477, |
| "loss": 0.007, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.11337420077779975, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 0.00018878480010555484, |
| "loss": 0.0875, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.11469250543800673, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.0001886528565773849, |
| "loss": 0.1207, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.11601081009821369, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00018852091304921495, |
| "loss": 0.1143, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.11732911475842067, |
| "grad_norm": 0.6484375, |
| "learning_rate": 0.00018838896952104502, |
| "loss": 0.0393, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.11864741941862765, |
| "grad_norm": 0.1552734375, |
| "learning_rate": 0.00018825702599287506, |
| "loss": 0.02, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.11996572407883462, |
| "grad_norm": 0.486328125, |
| "learning_rate": 0.0001881250824647051, |
| "loss": 0.0891, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.1212840287390416, |
| "grad_norm": 1.0, |
| "learning_rate": 0.00018799313893653517, |
| "loss": 0.0469, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.12260233339924857, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.0001878611954083652, |
| "loss": 0.019, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.12392063805945554, |
| "grad_norm": 0.03857421875, |
| "learning_rate": 0.00018772925188019528, |
| "loss": 0.007, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.12523894271966252, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 0.00018759730835202532, |
| "loss": 0.0039, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.12655724737986948, |
| "grad_norm": 0.014404296875, |
| "learning_rate": 0.0001874653648238554, |
| "loss": 0.0043, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.12787555204007647, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.00018733342129568546, |
| "loss": 0.1326, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.12919385670028344, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.0001872014777675155, |
| "loss": 0.0369, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.1305121613604904, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00018706953423934557, |
| "loss": 0.0395, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.1318304660206974, |
| "grad_norm": 0.083984375, |
| "learning_rate": 0.00018693759071117561, |
| "loss": 0.0284, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1318304660206974, |
| "eval_loss": 0.04542969539761543, |
| "eval_model_preparation_time": 0.0076, |
| "eval_runtime": 457.5293, |
| "eval_samples_per_second": 7.37, |
| "eval_steps_per_second": 3.685, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.13314877068090436, |
| "grad_norm": 0.0291748046875, |
| "learning_rate": 0.00018680564718300568, |
| "loss": 0.0533, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.13446707534111133, |
| "grad_norm": 0.71484375, |
| "learning_rate": 0.00018667370365483575, |
| "loss": 0.0183, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.13578538000131832, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 0.0001865417601266658, |
| "loss": 0.0473, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.13710368466152528, |
| "grad_norm": 0.388671875, |
| "learning_rate": 0.00018640981659849586, |
| "loss": 0.0562, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.13842198932173225, |
| "grad_norm": 0.77734375, |
| "learning_rate": 0.0001862778730703259, |
| "loss": 0.0755, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.13974029398193924, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.00018614592954215598, |
| "loss": 0.0422, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1410585986421462, |
| "grad_norm": 0.48828125, |
| "learning_rate": 0.00018601398601398602, |
| "loss": 0.0882, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.14237690330235317, |
| "grad_norm": 0.16015625, |
| "learning_rate": 0.0001858820424858161, |
| "loss": 0.0131, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.14369520796256013, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.00018575009895764616, |
| "loss": 0.03, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.14501351262276713, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 0.0001856181554294762, |
| "loss": 0.0425, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1463318172829741, |
| "grad_norm": 0.390625, |
| "learning_rate": 0.00018548621190130624, |
| "loss": 0.011, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.14765012194318106, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.0001853542683731363, |
| "loss": 0.0807, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.14896842660338805, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.00018522232484496635, |
| "loss": 0.0278, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.150286731263595, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.00018509038131679642, |
| "loss": 0.0484, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.15160503592380198, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.00018495843778862646, |
| "loss": 0.1277, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.15292334058400897, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.00018482649426045653, |
| "loss": 0.058, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.15424164524421594, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.00018469455073228657, |
| "loss": 0.0259, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.1555599499044229, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00018456260720411664, |
| "loss": 0.113, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.1568782545646299, |
| "grad_norm": 0.12451171875, |
| "learning_rate": 0.0001844306636759467, |
| "loss": 0.0312, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.15819655922483686, |
| "grad_norm": 0.0322265625, |
| "learning_rate": 0.00018429872014777676, |
| "loss": 0.0476, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.15951486388504382, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 0.00018416677661960682, |
| "loss": 0.0232, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.16083316854525082, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.00018403483309143687, |
| "loss": 0.1287, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.16215147320545778, |
| "grad_norm": 0.765625, |
| "learning_rate": 0.00018390288956326694, |
| "loss": 0.0991, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.16346977786566474, |
| "grad_norm": 0.3125, |
| "learning_rate": 0.00018377094603509698, |
| "loss": 0.0247, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.16478808252587174, |
| "grad_norm": 0.37890625, |
| "learning_rate": 0.00018363900250692705, |
| "loss": 0.0632, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.1661063871860787, |
| "grad_norm": 0.1494140625, |
| "learning_rate": 0.00018350705897875712, |
| "loss": 0.0314, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.16742469184628567, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.00018337511545058716, |
| "loss": 0.0425, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.16874299650649266, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.00018324317192241723, |
| "loss": 0.0613, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.17006130116669962, |
| "grad_norm": 0.057373046875, |
| "learning_rate": 0.00018311122839424727, |
| "loss": 0.0569, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.1713796058269066, |
| "grad_norm": 0.001373291015625, |
| "learning_rate": 0.00018297928486607734, |
| "loss": 0.007, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.17269791048711358, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00018284734133790738, |
| "loss": 0.0189, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.17401621514732055, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.00018271539780973742, |
| "loss": 0.0601, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1753345198075275, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.0001825834542815675, |
| "loss": 0.0211, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.1766528244677345, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.00018245151075339753, |
| "loss": 0.0713, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.17797112912794147, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001823195672252276, |
| "loss": 0.0522, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.17928943378814843, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 0.00018218762369705767, |
| "loss": 0.0242, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.18060773844835543, |
| "grad_norm": 0.048095703125, |
| "learning_rate": 0.00018205568016888772, |
| "loss": 0.0129, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.1819260431085624, |
| "grad_norm": 0.04541015625, |
| "learning_rate": 0.00018192373664071778, |
| "loss": 0.0142, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.18324434776876936, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 0.00018179179311254783, |
| "loss": 0.0121, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.18456265242897635, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0001816598495843779, |
| "loss": 0.0163, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.1858809570891833, |
| "grad_norm": 0.185546875, |
| "learning_rate": 0.00018152790605620796, |
| "loss": 0.0203, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.18719926174939028, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.000181395962528038, |
| "loss": 0.1548, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.18851756640959727, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 0.00018126401899986808, |
| "loss": 0.0543, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.18983587106980424, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.00018113207547169812, |
| "loss": 0.0346, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.1911541757300112, |
| "grad_norm": 0.1318359375, |
| "learning_rate": 0.0001810001319435282, |
| "loss": 0.03, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.1924724803902182, |
| "grad_norm": 0.1455078125, |
| "learning_rate": 0.00018086818841535823, |
| "loss": 0.0796, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.19379078505042516, |
| "grad_norm": 0.09814453125, |
| "learning_rate": 0.0001807362448871883, |
| "loss": 0.0662, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.19510908971063212, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.00018060430135901837, |
| "loss": 0.0675, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.19642739437083911, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.0001804723578308484, |
| "loss": 0.0377, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.19774569903104608, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.00018034041430267848, |
| "loss": 0.0174, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.19906400369125304, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00018020847077450852, |
| "loss": 0.0278, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.20038230835146, |
| "grad_norm": 0.8515625, |
| "learning_rate": 0.00018007652724633856, |
| "loss": 0.0113, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.201700613011667, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 0.00017994458371816863, |
| "loss": 0.0589, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.20301891767187397, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 0.00017981264018999867, |
| "loss": 0.0203, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.20433722233208093, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 0.00017968069666182874, |
| "loss": 0.0494, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.20565552699228792, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.00017954875313365879, |
| "loss": 0.0394, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.2069738316524949, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 0.00017941680960548886, |
| "loss": 0.0848, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.20829213631270185, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.00017928486607731892, |
| "loss": 0.0464, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.20961044097290885, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00017915292254914897, |
| "loss": 0.0222, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.2109287456331158, |
| "grad_norm": 0.5390625, |
| "learning_rate": 0.00017902097902097904, |
| "loss": 0.0434, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.21224705029332278, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00017888903549280908, |
| "loss": 0.0222, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.21356535495352977, |
| "grad_norm": 0.0272216796875, |
| "learning_rate": 0.00017875709196463915, |
| "loss": 0.0099, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.21488365961373673, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.0001786251484364692, |
| "loss": 0.0086, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.2162019642739437, |
| "grad_norm": 0.06396484375, |
| "learning_rate": 0.00017849320490829926, |
| "loss": 0.0715, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.2175202689341507, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.00017836126138012933, |
| "loss": 0.0642, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.21883857359435765, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 0.00017822931785195937, |
| "loss": 0.0111, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.22015687825456462, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00017809737432378944, |
| "loss": 0.0518, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.2214751829147716, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 0.00017796543079561948, |
| "loss": 0.0384, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.22279348757497858, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.00017783348726744955, |
| "loss": 0.0204, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.22411179223518554, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.00017770154373927962, |
| "loss": 0.0075, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.22543009689539253, |
| "grad_norm": 0.033203125, |
| "learning_rate": 0.00017756960021110963, |
| "loss": 0.0895, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.2267484015555995, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.0001774376566829397, |
| "loss": 0.1039, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.22806670621580646, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.00017730571315476975, |
| "loss": 0.0125, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.22938501087601346, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.00017717376962659982, |
| "loss": 0.0381, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.23070331553622042, |
| "grad_norm": 0.029052734375, |
| "learning_rate": 0.00017704182609842988, |
| "loss": 0.0434, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.23202162019642739, |
| "grad_norm": 0.43359375, |
| "learning_rate": 0.00017690988257025993, |
| "loss": 0.0799, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.23333992485663438, |
| "grad_norm": 0.04150390625, |
| "learning_rate": 0.00017677793904209, |
| "loss": 0.0692, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.23465822951684134, |
| "grad_norm": 0.435546875, |
| "learning_rate": 0.00017664599551392004, |
| "loss": 0.0544, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2359765341770483, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001765140519857501, |
| "loss": 0.0619, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.2372948388372553, |
| "grad_norm": 0.01263427734375, |
| "learning_rate": 0.00017638210845758018, |
| "loss": 0.0418, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.23861314349746227, |
| "grad_norm": 0.017578125, |
| "learning_rate": 0.00017625016492941022, |
| "loss": 0.0195, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.23993144815766923, |
| "grad_norm": 0.6171875, |
| "learning_rate": 0.0001761182214012403, |
| "loss": 0.067, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.24124975281787622, |
| "grad_norm": 0.59765625, |
| "learning_rate": 0.00017598627787307033, |
| "loss": 0.049, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.2425680574780832, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001758543343449004, |
| "loss": 0.0539, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.24388636213829015, |
| "grad_norm": 0.10302734375, |
| "learning_rate": 0.00017572239081673044, |
| "loss": 0.0725, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.24520466679849715, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.0001755904472885605, |
| "loss": 0.064, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.2465229714587041, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.00017545850376039058, |
| "loss": 0.0271, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.24784127611891107, |
| "grad_norm": 0.01470947265625, |
| "learning_rate": 0.00017532656023222062, |
| "loss": 0.0247, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.24915958077911807, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 0.0001751946167040507, |
| "loss": 0.017, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.25047788543932503, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.00017506267317588073, |
| "loss": 0.0254, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.251796190099532, |
| "grad_norm": 0.412109375, |
| "learning_rate": 0.00017493072964771078, |
| "loss": 0.0186, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.25311449475973896, |
| "grad_norm": 0.66796875, |
| "learning_rate": 0.00017479878611954084, |
| "loss": 0.0617, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.25443279941994595, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.00017466684259137089, |
| "loss": 0.0173, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.25575110408015295, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.00017453489906320096, |
| "loss": 0.0512, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.2570694087403599, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.000174402955535031, |
| "loss": 0.0361, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.2583877134005669, |
| "grad_norm": 0.423828125, |
| "learning_rate": 0.00017427101200686107, |
| "loss": 0.0175, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.25970601806077387, |
| "grad_norm": 0.77734375, |
| "learning_rate": 0.00017413906847869114, |
| "loss": 0.0139, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.2610243227209808, |
| "grad_norm": 0.515625, |
| "learning_rate": 0.00017400712495052118, |
| "loss": 0.0948, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.2623426273811878, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00017387518142235125, |
| "loss": 0.0406, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.2636609320413948, |
| "grad_norm": 0.058837890625, |
| "learning_rate": 0.0001737432378941813, |
| "loss": 0.1011, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2636609320413948, |
| "eval_loss": 0.045552924275398254, |
| "eval_model_preparation_time": 0.0076, |
| "eval_runtime": 457.6113, |
| "eval_samples_per_second": 7.369, |
| "eval_steps_per_second": 3.684, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.26497923670160173, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.00017361129436601136, |
| "loss": 0.0711, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.2662975413618087, |
| "grad_norm": 0.0208740234375, |
| "learning_rate": 0.00017347935083784143, |
| "loss": 0.0218, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.2676158460220157, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.00017334740730967147, |
| "loss": 0.0301, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.26893415068222265, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.00017321546378150154, |
| "loss": 0.0721, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.27025245534242964, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.00017308352025333158, |
| "loss": 0.0363, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.27157076000263664, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.00017295157672516165, |
| "loss": 0.0313, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.2728890646628436, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 0.0001728196331969917, |
| "loss": 0.0385, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.27420736932305056, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 0.00017268768966882176, |
| "loss": 0.0405, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.27552567398325756, |
| "grad_norm": 0.484375, |
| "learning_rate": 0.00017255574614065183, |
| "loss": 0.0616, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.2768439786434645, |
| "grad_norm": 0.0908203125, |
| "learning_rate": 0.00017242380261248185, |
| "loss": 0.0057, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.2781622833036715, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 0.00017229185908431192, |
| "loss": 0.0417, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.2794805879638785, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.00017215991555614196, |
| "loss": 0.0346, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.2807988926240854, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 0.00017202797202797203, |
| "loss": 0.0295, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.2821171972842924, |
| "grad_norm": 0.490234375, |
| "learning_rate": 0.0001718960284998021, |
| "loss": 0.0448, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.28343550194449935, |
| "grad_norm": 0.004241943359375, |
| "learning_rate": 0.00017176408497163214, |
| "loss": 0.0051, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.28475380660470634, |
| "grad_norm": 0.01904296875, |
| "learning_rate": 0.0001716321414434622, |
| "loss": 0.0894, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.28607211126491333, |
| "grad_norm": 0.83984375, |
| "learning_rate": 0.00017150019791529225, |
| "loss": 0.0288, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.28739041592512027, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.00017136825438712232, |
| "loss": 0.0222, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.28870872058532726, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.0001712363108589524, |
| "loss": 0.0444, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.29002702524553425, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.00017110436733078243, |
| "loss": 0.0828, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2913453299057412, |
| "grad_norm": 0.04052734375, |
| "learning_rate": 0.0001709724238026125, |
| "loss": 0.0725, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.2926636345659482, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.00017084048027444254, |
| "loss": 0.0204, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.2939819392261552, |
| "grad_norm": 0.67578125, |
| "learning_rate": 0.0001707085367462726, |
| "loss": 0.0503, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.2953002438863621, |
| "grad_norm": 0.0059814453125, |
| "learning_rate": 0.00017057659321810265, |
| "loss": 0.0144, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.2966185485465691, |
| "grad_norm": 0.0269775390625, |
| "learning_rate": 0.00017044464968993272, |
| "loss": 0.0044, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.2979368532067761, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 0.0001703127061617628, |
| "loss": 0.013, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.29925515786698303, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.00017018076263359283, |
| "loss": 0.0245, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.30057346252719, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.0001700488191054229, |
| "loss": 0.0247, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.301891767187397, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.00016991687557725294, |
| "loss": 0.0402, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.30321007184760396, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.000169784932049083, |
| "loss": 0.0071, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.30452837650781095, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.00016965298852091306, |
| "loss": 0.0177, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.30584668116801794, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 0.0001695210449927431, |
| "loss": 0.0029, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.3071649858282249, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.00016938910146457317, |
| "loss": 0.0262, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.30848329048843187, |
| "grad_norm": 0.002655029296875, |
| "learning_rate": 0.0001692571579364032, |
| "loss": 0.0346, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.30980159514863886, |
| "grad_norm": 0.1748046875, |
| "learning_rate": 0.00016912521440823328, |
| "loss": 0.0494, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.3111198998088458, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016899327088006335, |
| "loss": 0.0603, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3124382044690528, |
| "grad_norm": 0.1572265625, |
| "learning_rate": 0.0001688613273518934, |
| "loss": 0.0366, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.3137565091292598, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 0.00016872938382372346, |
| "loss": 0.0678, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.3150748137894667, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.0001685974402955535, |
| "loss": 0.0359, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.3163931184496737, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.00016846549676738357, |
| "loss": 0.1099, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.3177114231098807, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.00016833355323921364, |
| "loss": 0.0343, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.31902972777008765, |
| "grad_norm": 0.0302734375, |
| "learning_rate": 0.00016820160971104368, |
| "loss": 0.0138, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.32034803243029464, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 0.00016806966618287375, |
| "loss": 0.0202, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.32166633709050163, |
| "grad_norm": 0.1474609375, |
| "learning_rate": 0.0001679377226547038, |
| "loss": 0.0442, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.32298464175070857, |
| "grad_norm": 0.049072265625, |
| "learning_rate": 0.00016780577912653386, |
| "loss": 0.0375, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.32430294641091556, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 0.0001676738355983639, |
| "loss": 0.01, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.32562125107112255, |
| "grad_norm": 0.02197265625, |
| "learning_rate": 0.00016754189207019397, |
| "loss": 0.0139, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.3269395557313295, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 0.00016740994854202404, |
| "loss": 0.014, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.3282578603915365, |
| "grad_norm": 0.47265625, |
| "learning_rate": 0.00016727800501385408, |
| "loss": 0.1546, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.3295761650517435, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 0.00016714606148568413, |
| "loss": 0.0803, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3308944697119504, |
| "grad_norm": 0.185546875, |
| "learning_rate": 0.00016701411795751417, |
| "loss": 0.0376, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.3322127743721574, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 0.00016688217442934424, |
| "loss": 0.0375, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.3335310790323644, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0001667502309011743, |
| "loss": 0.0442, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.33484938369257133, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 0.00016661828737300435, |
| "loss": 0.0261, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.3361676883527783, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.00016648634384483442, |
| "loss": 0.0553, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.3374859930129853, |
| "grad_norm": 0.1328125, |
| "learning_rate": 0.00016635440031666446, |
| "loss": 0.0065, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.33880429767319226, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.00016622245678849453, |
| "loss": 0.0527, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.34012260233339925, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.0001660905132603246, |
| "loss": 0.0297, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.34144090699360624, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 0.00016595856973215464, |
| "loss": 0.0477, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.3427592116538132, |
| "grad_norm": 0.08154296875, |
| "learning_rate": 0.0001658266262039847, |
| "loss": 0.0298, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.34407751631402017, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.00016569468267581475, |
| "loss": 0.0481, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.34539582097422716, |
| "grad_norm": 0.06640625, |
| "learning_rate": 0.00016556273914764482, |
| "loss": 0.0153, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.3467141256344341, |
| "grad_norm": 0.00592041015625, |
| "learning_rate": 0.00016543079561947486, |
| "loss": 0.0111, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.3480324302946411, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.00016529885209130493, |
| "loss": 0.0309, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.3493507349548481, |
| "grad_norm": 0.0198974609375, |
| "learning_rate": 0.000165166908563135, |
| "loss": 0.0579, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.350669039615055, |
| "grad_norm": 0.10107421875, |
| "learning_rate": 0.00016503496503496504, |
| "loss": 0.0055, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.351987344275262, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.00016490302150679511, |
| "loss": 0.0299, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.353305648935469, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 0.00016477107797862516, |
| "loss": 0.0943, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.35462395359567594, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.00016463913445045523, |
| "loss": 0.0216, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.35594225825588294, |
| "grad_norm": 0.02392578125, |
| "learning_rate": 0.00016450719092228527, |
| "loss": 0.0265, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.35726056291608993, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.0001643752473941153, |
| "loss": 0.0539, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.35857886757629687, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 0.00016424330386594538, |
| "loss": 0.0139, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.35989717223650386, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.00016411136033777542, |
| "loss": 0.0428, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.36121547689671085, |
| "grad_norm": 0.052734375, |
| "learning_rate": 0.0001639794168096055, |
| "loss": 0.0346, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.3625337815569178, |
| "grad_norm": 0.12158203125, |
| "learning_rate": 0.00016384747328143556, |
| "loss": 0.0095, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.3638520862171248, |
| "grad_norm": 0.0240478515625, |
| "learning_rate": 0.0001637155297532656, |
| "loss": 0.0224, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3651703908773318, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 0.00016358358622509567, |
| "loss": 0.0316, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.3664886955375387, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 0.0001634516426969257, |
| "loss": 0.0051, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3678070001977457, |
| "grad_norm": 0.00396728515625, |
| "learning_rate": 0.00016331969916875578, |
| "loss": 0.038, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.3691253048579527, |
| "grad_norm": 0.375, |
| "learning_rate": 0.00016318775564058585, |
| "loss": 0.029, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.37044360951815963, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.0001630558121124159, |
| "loss": 0.0072, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.3717619141783666, |
| "grad_norm": 0.00127410888671875, |
| "learning_rate": 0.00016292386858424596, |
| "loss": 0.0381, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3730802188385736, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.000162791925056076, |
| "loss": 0.0573, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.37439852349878056, |
| "grad_norm": 0.0244140625, |
| "learning_rate": 0.00016265998152790607, |
| "loss": 0.051, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.37571682815898755, |
| "grad_norm": 0.0015106201171875, |
| "learning_rate": 0.00016252803799973612, |
| "loss": 0.0239, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.37703513281919454, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.00016239609447156618, |
| "loss": 0.0165, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.3783534374794015, |
| "grad_norm": 0.006134033203125, |
| "learning_rate": 0.00016226415094339625, |
| "loss": 0.0071, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.37967174213960847, |
| "grad_norm": 2.828125, |
| "learning_rate": 0.0001621322074152263, |
| "loss": 0.0272, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.38099004679981546, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.00016200026388705637, |
| "loss": 0.0647, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.3823083514600224, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 0.00016186832035888638, |
| "loss": 0.0262, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.3836266561202294, |
| "grad_norm": 0.041015625, |
| "learning_rate": 0.00016173637683071645, |
| "loss": 0.0576, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.3849449607804364, |
| "grad_norm": 0.033935546875, |
| "learning_rate": 0.00016160443330254652, |
| "loss": 0.0142, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.3862632654406433, |
| "grad_norm": 0.09130859375, |
| "learning_rate": 0.00016147248977437656, |
| "loss": 0.0348, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.3875815701008503, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00016134054624620663, |
| "loss": 0.0672, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.3888998747610573, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.00016120860271803667, |
| "loss": 0.0121, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.39021817942126424, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.00016107665918986674, |
| "loss": 0.0114, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.39153648408147124, |
| "grad_norm": 0.85546875, |
| "learning_rate": 0.0001609447156616968, |
| "loss": 0.0968, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.39285478874167823, |
| "grad_norm": 0.703125, |
| "learning_rate": 0.00016081277213352685, |
| "loss": 0.0349, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.39417309340188517, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 0.00016068082860535692, |
| "loss": 0.0106, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.39549139806209216, |
| "grad_norm": 0.7265625, |
| "learning_rate": 0.00016054888507718696, |
| "loss": 0.0225, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.39549139806209216, |
| "eval_loss": 0.03515048325061798, |
| "eval_model_preparation_time": 0.0076, |
| "eval_runtime": 457.3497, |
| "eval_samples_per_second": 7.373, |
| "eval_steps_per_second": 3.686, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3968097027222991, |
| "grad_norm": 0.016519820317626, |
| "learning_rate": 0.00016041694154901703, |
| "loss": 0.0202, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.3981280073825061, |
| "grad_norm": 0.8505942225456238, |
| "learning_rate": 0.00016028499802084708, |
| "loss": 0.0541, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.3994463120427131, |
| "grad_norm": 0.04163295030593872, |
| "learning_rate": 0.00016015305449267714, |
| "loss": 0.0037, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.40076461670292, |
| "grad_norm": 0.011332935653626919, |
| "learning_rate": 0.00016002111096450721, |
| "loss": 0.0459, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.402082921363127, |
| "grad_norm": 0.9360129833221436, |
| "learning_rate": 0.00015988916743633726, |
| "loss": 0.013, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.403401226023334, |
| "grad_norm": 0.11991436779499054, |
| "learning_rate": 0.00015975722390816733, |
| "loss": 0.0079, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.40471953068354094, |
| "grad_norm": 0.36911076307296753, |
| "learning_rate": 0.00015962528037999737, |
| "loss": 0.0638, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.40603783534374793, |
| "grad_norm": 0.020278634503483772, |
| "learning_rate": 0.00015949333685182744, |
| "loss": 0.0217, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.4073561400039549, |
| "grad_norm": 0.14263059198856354, |
| "learning_rate": 0.0001593613933236575, |
| "loss": 0.0495, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.40867444466416186, |
| "grad_norm": 0.09494803845882416, |
| "learning_rate": 0.00015922944979548752, |
| "loss": 0.0248, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.40999274932436885, |
| "grad_norm": 0.23064319789409637, |
| "learning_rate": 0.0001590975062673176, |
| "loss": 0.0285, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.41131105398457585, |
| "grad_norm": 0.32220256328582764, |
| "learning_rate": 0.00015896556273914763, |
| "loss": 0.0537, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.4126293586447828, |
| "grad_norm": 0.41208815574645996, |
| "learning_rate": 0.0001588336192109777, |
| "loss": 0.0453, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.4139476633049898, |
| "grad_norm": 0.03775424137711525, |
| "learning_rate": 0.00015870167568280777, |
| "loss": 0.0134, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.41526596796519677, |
| "grad_norm": 0.6526333093643188, |
| "learning_rate": 0.0001585697321546378, |
| "loss": 0.0329, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.4165842726254037, |
| "grad_norm": 1.001305103302002, |
| "learning_rate": 0.00015843778862646788, |
| "loss": 0.0912, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.4179025772856107, |
| "grad_norm": 0.4055219888687134, |
| "learning_rate": 0.00015830584509829792, |
| "loss": 0.0519, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.4192208819458177, |
| "grad_norm": 0.035015616565942764, |
| "learning_rate": 0.000158173901570128, |
| "loss": 0.0191, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.42053918660602463, |
| "grad_norm": 0.09326844662427902, |
| "learning_rate": 0.00015804195804195806, |
| "loss": 0.0106, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.4218574912662316, |
| "grad_norm": 0.06223440542817116, |
| "learning_rate": 0.0001579100145137881, |
| "loss": 0.0113, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4231757959264386, |
| "grad_norm": 0.0625135526061058, |
| "learning_rate": 0.00015777807098561817, |
| "loss": 0.0191, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.42449410058664555, |
| "grad_norm": 0.2645983099937439, |
| "learning_rate": 0.00015764612745744822, |
| "loss": 0.0829, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.42581240524685254, |
| "grad_norm": 0.009632415138185024, |
| "learning_rate": 0.00015751418392927829, |
| "loss": 0.0542, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.42713070990705954, |
| "grad_norm": 0.01979319378733635, |
| "learning_rate": 0.00015738224040110833, |
| "loss": 0.0517, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.4284490145672665, |
| "grad_norm": 0.3065454065799713, |
| "learning_rate": 0.0001572502968729384, |
| "loss": 0.0738, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.42976731922747347, |
| "grad_norm": 0.09581473469734192, |
| "learning_rate": 0.00015711835334476847, |
| "loss": 0.0571, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.43108562388768046, |
| "grad_norm": 0.23746591806411743, |
| "learning_rate": 0.0001569864098165985, |
| "loss": 0.0128, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.4324039285478874, |
| "grad_norm": 0.936278760433197, |
| "learning_rate": 0.00015685446628842858, |
| "loss": 0.0665, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.4337222332080944, |
| "grad_norm": 0.18487441539764404, |
| "learning_rate": 0.00015672252276025862, |
| "loss": 0.0527, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.4350405378683014, |
| "grad_norm": 0.6980624794960022, |
| "learning_rate": 0.00015659057923208866, |
| "loss": 0.0613, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.4363588425285083, |
| "grad_norm": 0.4696301221847534, |
| "learning_rate": 0.00015645863570391873, |
| "loss": 0.0569, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.4376771471887153, |
| "grad_norm": 0.15083105862140656, |
| "learning_rate": 0.00015632669217574877, |
| "loss": 0.0394, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.4389954518489223, |
| "grad_norm": 0.44701239466667175, |
| "learning_rate": 0.00015619474864757884, |
| "loss": 0.0494, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.44031375650912924, |
| "grad_norm": 0.07418403029441833, |
| "learning_rate": 0.00015606280511940888, |
| "loss": 0.0291, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.44163206116933623, |
| "grad_norm": 0.02311861515045166, |
| "learning_rate": 0.00015593086159123895, |
| "loss": 0.0304, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.4429503658295432, |
| "grad_norm": 0.4416038990020752, |
| "learning_rate": 0.00015579891806306902, |
| "loss": 0.0176, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.44426867048975016, |
| "grad_norm": 0.5124915242195129, |
| "learning_rate": 0.00015566697453489906, |
| "loss": 0.0454, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.44558697514995715, |
| "grad_norm": 0.3159286081790924, |
| "learning_rate": 0.00015553503100672913, |
| "loss": 0.047, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.44690527981016415, |
| "grad_norm": 0.032126396894454956, |
| "learning_rate": 0.00015540308747855918, |
| "loss": 0.0151, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.4482235844703711, |
| "grad_norm": 0.04663548618555069, |
| "learning_rate": 0.00015527114395038924, |
| "loss": 0.0375, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.4495418891305781, |
| "grad_norm": 0.013753900304436684, |
| "learning_rate": 0.0001551392004222193, |
| "loss": 0.0485, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.45086019379078507, |
| "grad_norm": 1.9952393770217896, |
| "learning_rate": 0.00015500725689404936, |
| "loss": 0.0625, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.452178498450992, |
| "grad_norm": 0.014283270575106144, |
| "learning_rate": 0.00015487531336587943, |
| "loss": 0.0037, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.453496803111199, |
| "grad_norm": 0.3897913098335266, |
| "learning_rate": 0.00015474336983770947, |
| "loss": 0.0304, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.454815107771406, |
| "grad_norm": 0.3730885684490204, |
| "learning_rate": 0.00015461142630953954, |
| "loss": 0.0115, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.45613341243161293, |
| "grad_norm": 0.035858724266290665, |
| "learning_rate": 0.00015447948278136958, |
| "loss": 0.0021, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.4574517170918199, |
| "grad_norm": 0.20589517056941986, |
| "learning_rate": 0.00015434753925319965, |
| "loss": 0.0132, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.4587700217520269, |
| "grad_norm": 0.004939342383295298, |
| "learning_rate": 0.00015421559572502972, |
| "loss": 0.0471, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.46008832641223385, |
| "grad_norm": 0.03493283689022064, |
| "learning_rate": 0.00015408365219685976, |
| "loss": 0.0062, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.46140663107244084, |
| "grad_norm": 0.045927103608846664, |
| "learning_rate": 0.0001539517086686898, |
| "loss": 0.0283, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.46272493573264784, |
| "grad_norm": 0.012629454955458641, |
| "learning_rate": 0.00015381976514051984, |
| "loss": 0.0133, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.46404324039285477, |
| "grad_norm": 0.8001697659492493, |
| "learning_rate": 0.0001536878216123499, |
| "loss": 0.0224, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.46536154505306176, |
| "grad_norm": 0.002036362886428833, |
| "learning_rate": 0.00015355587808417998, |
| "loss": 0.0066, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.46667984971326876, |
| "grad_norm": 1.0261330604553223, |
| "learning_rate": 0.00015342393455601002, |
| "loss": 0.191, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.4679981543734757, |
| "grad_norm": 0.3033429682254791, |
| "learning_rate": 0.0001532919910278401, |
| "loss": 0.0222, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.4693164590336827, |
| "grad_norm": 0.36911338567733765, |
| "learning_rate": 0.00015316004749967014, |
| "loss": 0.0363, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.4706347636938897, |
| "grad_norm": 0.0406811460852623, |
| "learning_rate": 0.0001530281039715002, |
| "loss": 0.0283, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.4719530683540966, |
| "grad_norm": 0.23334211111068726, |
| "learning_rate": 0.00015289616044333027, |
| "loss": 0.0274, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.4732713730143036, |
| "grad_norm": 0.013081169687211514, |
| "learning_rate": 0.00015276421691516032, |
| "loss": 0.0221, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.4745896776745106, |
| "grad_norm": 0.2480790615081787, |
| "learning_rate": 0.00015263227338699039, |
| "loss": 0.019, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.47590798233471754, |
| "grad_norm": 0.0373196005821228, |
| "learning_rate": 0.00015250032985882043, |
| "loss": 0.0292, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.47722628699492453, |
| "grad_norm": 0.004609994124621153, |
| "learning_rate": 0.0001523683863306505, |
| "loss": 0.0918, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4785445916551315, |
| "grad_norm": 0.02370987832546234, |
| "learning_rate": 0.00015223644280248054, |
| "loss": 0.0462, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.47986289631533846, |
| "grad_norm": 0.05842221528291702, |
| "learning_rate": 0.0001521044992743106, |
| "loss": 0.0595, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.48118120097554545, |
| "grad_norm": 0.009685276076197624, |
| "learning_rate": 0.00015197255574614068, |
| "loss": 0.0074, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.48249950563575245, |
| "grad_norm": 0.8933250308036804, |
| "learning_rate": 0.00015184061221797072, |
| "loss": 0.0757, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.4838178102959594, |
| "grad_norm": 0.07075401395559311, |
| "learning_rate": 0.0001517086686898008, |
| "loss": 0.0226, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.4851361149561664, |
| "grad_norm": 0.732706606388092, |
| "learning_rate": 0.00015157672516163083, |
| "loss": 0.0161, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.48645441961637337, |
| "grad_norm": 1.1897023916244507, |
| "learning_rate": 0.0001514447816334609, |
| "loss": 0.0265, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.4877727242765803, |
| "grad_norm": 0.052572328597307205, |
| "learning_rate": 0.00015131283810529094, |
| "loss": 0.0094, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4890910289367873, |
| "grad_norm": 0.08263898640871048, |
| "learning_rate": 0.00015118089457712098, |
| "loss": 0.0631, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.4904093335969943, |
| "grad_norm": 0.03225664421916008, |
| "learning_rate": 0.00015104895104895105, |
| "loss": 0.023, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.4917276382572012, |
| "grad_norm": 0.007935039699077606, |
| "learning_rate": 0.0001509170075207811, |
| "loss": 0.0039, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.4930459429174082, |
| "grad_norm": 0.00830796267837286, |
| "learning_rate": 0.00015078506399261116, |
| "loss": 0.007, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.4943642475776152, |
| "grad_norm": 0.08042234182357788, |
| "learning_rate": 0.00015065312046444123, |
| "loss": 0.0366, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.49568255223782215, |
| "grad_norm": 0.009092851541936398, |
| "learning_rate": 0.00015052117693627128, |
| "loss": 0.0107, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.49700085689802914, |
| "grad_norm": 0.2674141824245453, |
| "learning_rate": 0.00015038923340810135, |
| "loss": 0.0076, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.49831916155823613, |
| "grad_norm": 0.07694366574287415, |
| "learning_rate": 0.0001502572898799314, |
| "loss": 0.0252, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.49963746621844307, |
| "grad_norm": 0.5699467062950134, |
| "learning_rate": 0.00015012534635176146, |
| "loss": 0.0487, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.5009557708786501, |
| "grad_norm": 0.18800878524780273, |
| "learning_rate": 0.0001499934028235915, |
| "loss": 0.0183, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5022740755388571, |
| "grad_norm": 0.019469989463686943, |
| "learning_rate": 0.00014986145929542157, |
| "loss": 0.0268, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.503592380199064, |
| "grad_norm": 0.01890506222844124, |
| "learning_rate": 0.00014972951576725164, |
| "loss": 0.0449, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.5049106848592709, |
| "grad_norm": 0.0006314461352303624, |
| "learning_rate": 0.00014959757223908168, |
| "loss": 0.0056, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.5062289895194779, |
| "grad_norm": 0.32654041051864624, |
| "learning_rate": 0.00014946562871091175, |
| "loss": 0.0256, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.5075472941796849, |
| "grad_norm": 0.7803483605384827, |
| "learning_rate": 0.0001493336851827418, |
| "loss": 0.0374, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.5088655988398919, |
| "grad_norm": 0.028441445901989937, |
| "learning_rate": 0.00014920174165457186, |
| "loss": 0.0161, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.5101839035000989, |
| "grad_norm": 0.028379200026392937, |
| "learning_rate": 0.00014906979812640193, |
| "loss": 0.0151, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.5115022081603059, |
| "grad_norm": 0.021159596741199493, |
| "learning_rate": 0.00014893785459823197, |
| "loss": 0.0303, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 0.24903325736522675, |
| "learning_rate": 0.000148805911070062, |
| "loss": 0.0076, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.5141388174807198, |
| "grad_norm": 0.007065301761031151, |
| "learning_rate": 0.00014867396754189206, |
| "loss": 0.022, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5154571221409268, |
| "grad_norm": 0.004032329190522432, |
| "learning_rate": 0.00014854202401372212, |
| "loss": 0.0083, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.5167754268011338, |
| "grad_norm": 0.3045775592327118, |
| "learning_rate": 0.0001484100804855522, |
| "loss": 0.0113, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.5180937314613407, |
| "grad_norm": 0.36974939703941345, |
| "learning_rate": 0.00014827813695738224, |
| "loss": 0.0267, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.5194120361215477, |
| "grad_norm": 0.009729950688779354, |
| "learning_rate": 0.0001481461934292123, |
| "loss": 0.027, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.5207303407817546, |
| "grad_norm": 0.0013097926275804639, |
| "learning_rate": 0.00014801424990104235, |
| "loss": 0.003, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.5220486454419616, |
| "grad_norm": 0.0706263929605484, |
| "learning_rate": 0.00014788230637287242, |
| "loss": 0.0193, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.5233669501021686, |
| "grad_norm": 1.435702919960022, |
| "learning_rate": 0.00014775036284470249, |
| "loss": 0.0647, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.5246852547623756, |
| "grad_norm": 0.00661757867783308, |
| "learning_rate": 0.00014761841931653253, |
| "loss": 0.0373, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.5260035594225826, |
| "grad_norm": 0.12014541029930115, |
| "learning_rate": 0.0001474864757883626, |
| "loss": 0.0178, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.5273218640827896, |
| "grad_norm": 1.0549248456954956, |
| "learning_rate": 0.00014735453226019264, |
| "loss": 0.0191, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5273218640827896, |
| "eval_loss": 0.037292081862688065, |
| "eval_runtime": 454.3033, |
| "eval_samples_per_second": 7.422, |
| "eval_steps_per_second": 3.711, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5286401687429965, |
| "grad_norm": 0.47634151577949524, |
| "learning_rate": 0.0001472225887320227, |
| "loss": 0.0404, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.5299584734032035, |
| "grad_norm": 0.006752463988959789, |
| "learning_rate": 0.00014709064520385275, |
| "loss": 0.034, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.5312767780634104, |
| "grad_norm": 0.20780125260353088, |
| "learning_rate": 0.00014695870167568282, |
| "loss": 0.0421, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.5325950827236174, |
| "grad_norm": 0.010941066779196262, |
| "learning_rate": 0.0001468267581475129, |
| "loss": 0.0086, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.5339133873838244, |
| "grad_norm": 0.3439581096172333, |
| "learning_rate": 0.00014669481461934293, |
| "loss": 0.0187, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.5352316920440314, |
| "grad_norm": 0.14961636066436768, |
| "learning_rate": 0.000146562871091173, |
| "loss": 0.0504, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.5365499967042383, |
| "grad_norm": 0.0044641937129199505, |
| "learning_rate": 0.00014643092756300304, |
| "loss": 0.0134, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.5378683013644453, |
| "grad_norm": 0.14088386297225952, |
| "learning_rate": 0.0001462989840348331, |
| "loss": 0.0096, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.5391866060246523, |
| "grad_norm": 0.48116979002952576, |
| "learning_rate": 0.00014616704050666315, |
| "loss": 0.0124, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.5405049106848593, |
| "grad_norm": 0.3688766360282898, |
| "learning_rate": 0.0001460350969784932, |
| "loss": 0.0226, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.5418232153450663, |
| "grad_norm": 0.002938181860372424, |
| "learning_rate": 0.00014590315345032326, |
| "loss": 0.0267, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.5431415200052733, |
| "grad_norm": 0.3335214853286743, |
| "learning_rate": 0.0001457712099221533, |
| "loss": 0.0367, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.5444598246654802, |
| "grad_norm": 0.004644686821848154, |
| "learning_rate": 0.00014563926639398338, |
| "loss": 0.0121, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.5457781293256871, |
| "grad_norm": 0.19505545496940613, |
| "learning_rate": 0.00014550732286581345, |
| "loss": 0.0591, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.5470964339858941, |
| "grad_norm": 0.018028756603598595, |
| "learning_rate": 0.0001453753793376435, |
| "loss": 0.0131, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.5484147386461011, |
| "grad_norm": 0.045639291405677795, |
| "learning_rate": 0.00014524343580947356, |
| "loss": 0.0443, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5497330433063081, |
| "grad_norm": 0.727981686592102, |
| "learning_rate": 0.0001451114922813036, |
| "loss": 0.0205, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.5510513479665151, |
| "grad_norm": 0.03766491636633873, |
| "learning_rate": 0.00014497954875313367, |
| "loss": 0.0067, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.552369652626722, |
| "grad_norm": 0.1911504715681076, |
| "learning_rate": 0.0001448476052249637, |
| "loss": 0.0397, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.553687957286929, |
| "grad_norm": 0.08238353580236435, |
| "learning_rate": 0.00014471566169679378, |
| "loss": 0.0513, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.555006261947136, |
| "grad_norm": 0.06317206472158432, |
| "learning_rate": 0.00014458371816862385, |
| "loss": 0.0178, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.556324566607343, |
| "grad_norm": 0.0652734637260437, |
| "learning_rate": 0.0001444517746404539, |
| "loss": 0.0184, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.55764287126755, |
| "grad_norm": 0.05471858009696007, |
| "learning_rate": 0.00014431983111228396, |
| "loss": 0.0089, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.558961175927757, |
| "grad_norm": 0.005062670446932316, |
| "learning_rate": 0.000144187887584114, |
| "loss": 0.0052, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5602794805879638, |
| "grad_norm": 0.06337414681911469, |
| "learning_rate": 0.00014405594405594407, |
| "loss": 0.053, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.5615977852481708, |
| "grad_norm": 0.33745357394218445, |
| "learning_rate": 0.00014392400052777414, |
| "loss": 0.0166, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5629160899083778, |
| "grad_norm": 0.7382741570472717, |
| "learning_rate": 0.00014379205699960418, |
| "loss": 0.0191, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.5642343945685848, |
| "grad_norm": 0.007551972754299641, |
| "learning_rate": 0.00014366011347143425, |
| "loss": 0.0022, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.5655526992287918, |
| "grad_norm": 0.6260896921157837, |
| "learning_rate": 0.00014352816994326427, |
| "loss": 0.0095, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.5668710038889987, |
| "grad_norm": 0.11619322001934052, |
| "learning_rate": 0.00014339622641509434, |
| "loss": 0.015, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5681893085492057, |
| "grad_norm": 1.1440670490264893, |
| "learning_rate": 0.0001432642828869244, |
| "loss": 0.1343, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.5695076132094127, |
| "grad_norm": 1.1793878078460693, |
| "learning_rate": 0.00014313233935875445, |
| "loss": 0.0968, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.5708259178696197, |
| "grad_norm": 0.6865736842155457, |
| "learning_rate": 0.00014300039583058452, |
| "loss": 0.0195, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.5721442225298267, |
| "grad_norm": 0.140816792845726, |
| "learning_rate": 0.00014286845230241456, |
| "loss": 0.0761, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5734625271900337, |
| "grad_norm": 0.04071786254644394, |
| "learning_rate": 0.00014273650877424463, |
| "loss": 0.0193, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.5747808318502405, |
| "grad_norm": 0.044617727398872375, |
| "learning_rate": 0.0001426045652460747, |
| "loss": 0.0112, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.5760991365104475, |
| "grad_norm": 0.11001799255609512, |
| "learning_rate": 0.00014247262171790474, |
| "loss": 0.0039, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.5774174411706545, |
| "grad_norm": 0.0036315324250608683, |
| "learning_rate": 0.0001423406781897348, |
| "loss": 0.0038, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5787357458308615, |
| "grad_norm": 0.9866570830345154, |
| "learning_rate": 0.00014220873466156485, |
| "loss": 0.025, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.5800540504910685, |
| "grad_norm": 0.023570384830236435, |
| "learning_rate": 0.00014207679113339492, |
| "loss": 0.0468, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5813723551512755, |
| "grad_norm": 0.20010559260845184, |
| "learning_rate": 0.00014194484760522496, |
| "loss": 0.0198, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.5826906598114824, |
| "grad_norm": 0.06153270602226257, |
| "learning_rate": 0.00014181290407705503, |
| "loss": 0.0764, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.5840089644716894, |
| "grad_norm": 0.033162448555231094, |
| "learning_rate": 0.0001416809605488851, |
| "loss": 0.028, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.5853272691318964, |
| "grad_norm": 0.428382933139801, |
| "learning_rate": 0.00014154901702071514, |
| "loss": 0.0652, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.5866455737921034, |
| "grad_norm": 0.25004762411117554, |
| "learning_rate": 0.0001414170734925452, |
| "loss": 0.0411, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.5879638784523104, |
| "grad_norm": 0.22649863362312317, |
| "learning_rate": 0.00014128512996437525, |
| "loss": 0.0517, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.5892821831125173, |
| "grad_norm": 0.035932112485170364, |
| "learning_rate": 0.00014115318643620532, |
| "loss": 0.015, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.5906004877727242, |
| "grad_norm": 0.3800172507762909, |
| "learning_rate": 0.00014102124290803536, |
| "loss": 0.0324, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.5919187924329312, |
| "grad_norm": 0.6974118947982788, |
| "learning_rate": 0.0001408892993798654, |
| "loss": 0.0216, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.5932370970931382, |
| "grad_norm": 0.15472032129764557, |
| "learning_rate": 0.00014075735585169548, |
| "loss": 0.0164, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5945554017533452, |
| "grad_norm": 0.015000814571976662, |
| "learning_rate": 0.00014062541232352552, |
| "loss": 0.0395, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.5958737064135522, |
| "grad_norm": 0.052086081355810165, |
| "learning_rate": 0.0001404934687953556, |
| "loss": 0.0032, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.5971920110737592, |
| "grad_norm": 0.004600350745022297, |
| "learning_rate": 0.00014036152526718566, |
| "loss": 0.0056, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.5985103157339661, |
| "grad_norm": 0.4940958321094513, |
| "learning_rate": 0.0001402295817390157, |
| "loss": 0.0206, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.5998286203941731, |
| "grad_norm": 0.09658394008874893, |
| "learning_rate": 0.00014009763821084577, |
| "loss": 0.0052, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.60114692505438, |
| "grad_norm": 0.00020539117394946516, |
| "learning_rate": 0.0001399656946826758, |
| "loss": 0.087, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.602465229714587, |
| "grad_norm": 0.1871018409729004, |
| "learning_rate": 0.00013983375115450588, |
| "loss": 0.0812, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.603783534374794, |
| "grad_norm": 0.02583954855799675, |
| "learning_rate": 0.00013970180762633592, |
| "loss": 0.0232, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.605101839035001, |
| "grad_norm": 1.2103784084320068, |
| "learning_rate": 0.000139569864098166, |
| "loss": 0.0151, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.6064201436952079, |
| "grad_norm": 0.023514943197369576, |
| "learning_rate": 0.00013943792056999606, |
| "loss": 0.0193, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6077384483554149, |
| "grad_norm": 0.0076395305804908276, |
| "learning_rate": 0.0001393059770418261, |
| "loss": 0.0379, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.6090567530156219, |
| "grad_norm": 0.12412039190530777, |
| "learning_rate": 0.00013917403351365617, |
| "loss": 0.0095, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.6103750576758289, |
| "grad_norm": 0.021904783323407173, |
| "learning_rate": 0.0001390420899854862, |
| "loss": 0.0166, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.6116933623360359, |
| "grad_norm": 0.004012851510196924, |
| "learning_rate": 0.00013891014645731628, |
| "loss": 0.0103, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.6130116669962429, |
| "grad_norm": 0.007267913781106472, |
| "learning_rate": 0.00013877820292914635, |
| "loss": 0.0708, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.6143299716564498, |
| "grad_norm": 0.10363642126321793, |
| "learning_rate": 0.0001386462594009764, |
| "loss": 0.0473, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.6156482763166568, |
| "grad_norm": 0.04899830371141434, |
| "learning_rate": 0.00013851431587280646, |
| "loss": 0.0283, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.6169665809768637, |
| "grad_norm": 0.39460498094558716, |
| "learning_rate": 0.0001383823723446365, |
| "loss": 0.0597, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.6182848856370707, |
| "grad_norm": 0.04092290997505188, |
| "learning_rate": 0.00013825042881646655, |
| "loss": 0.0167, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.6196031902972777, |
| "grad_norm": 0.2781132161617279, |
| "learning_rate": 0.00013811848528829662, |
| "loss": 0.0097, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6209214949574847, |
| "grad_norm": 0.041443537920713425, |
| "learning_rate": 0.00013798654176012666, |
| "loss": 0.0226, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.6222397996176916, |
| "grad_norm": 0.1242462694644928, |
| "learning_rate": 0.00013785459823195673, |
| "loss": 0.0055, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.6235581042778986, |
| "grad_norm": 0.4440467357635498, |
| "learning_rate": 0.00013772265470378677, |
| "loss": 0.049, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.6248764089381056, |
| "grad_norm": 0.014354427345097065, |
| "learning_rate": 0.00013759071117561684, |
| "loss": 0.0327, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.6261947135983126, |
| "grad_norm": 0.011539973318576813, |
| "learning_rate": 0.0001374587676474469, |
| "loss": 0.0222, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.6275130182585196, |
| "grad_norm": 0.23539051413536072, |
| "learning_rate": 0.00013732682411927695, |
| "loss": 0.0816, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.6288313229187266, |
| "grad_norm": 0.26793941855430603, |
| "learning_rate": 0.00013719488059110702, |
| "loss": 0.0325, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.6301496275789334, |
| "grad_norm": 0.01662217453122139, |
| "learning_rate": 0.00013706293706293706, |
| "loss": 0.0221, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.6314679322391404, |
| "grad_norm": 0.30669671297073364, |
| "learning_rate": 0.00013693099353476713, |
| "loss": 0.026, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.6327862368993474, |
| "grad_norm": 0.03350894898176193, |
| "learning_rate": 0.00013679905000659717, |
| "loss": 0.0072, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.6341045415595544, |
| "grad_norm": 0.014983875676989555, |
| "learning_rate": 0.00013666710647842724, |
| "loss": 0.049, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.6354228462197614, |
| "grad_norm": 1.8989384174346924, |
| "learning_rate": 0.0001365351629502573, |
| "loss": 0.0335, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.6367411508799684, |
| "grad_norm": 0.030135562643408775, |
| "learning_rate": 0.00013640321942208735, |
| "loss": 0.0051, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.6380594555401753, |
| "grad_norm": 0.02079075388610363, |
| "learning_rate": 0.00013627127589391742, |
| "loss": 0.0138, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.6393777602003823, |
| "grad_norm": 0.06065403297543526, |
| "learning_rate": 0.00013613933236574746, |
| "loss": 0.0357, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.6406960648605893, |
| "grad_norm": 0.2980937659740448, |
| "learning_rate": 0.00013600738883757753, |
| "loss": 0.0138, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.6420143695207963, |
| "grad_norm": 0.4820438623428345, |
| "learning_rate": 0.00013587544530940758, |
| "loss": 0.01, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.6433326741810033, |
| "grad_norm": 0.005618259310722351, |
| "learning_rate": 0.00013574350178123765, |
| "loss": 0.0052, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.6446509788412103, |
| "grad_norm": 0.7173821926116943, |
| "learning_rate": 0.0001356115582530677, |
| "loss": 0.0133, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.6459692835014171, |
| "grad_norm": 0.0053142281249165535, |
| "learning_rate": 0.00013547961472489773, |
| "loss": 0.0045, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.6472875881616241, |
| "grad_norm": 0.06118829548358917, |
| "learning_rate": 0.0001353476711967278, |
| "loss": 0.056, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.6486058928218311, |
| "grad_norm": 3.5878078937530518, |
| "learning_rate": 0.00013521572766855787, |
| "loss": 0.0232, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.6499241974820381, |
| "grad_norm": 0.004911276511847973, |
| "learning_rate": 0.0001350837841403879, |
| "loss": 0.0074, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.6512425021422451, |
| "grad_norm": 0.0028026222717016935, |
| "learning_rate": 0.00013495184061221798, |
| "loss": 0.0782, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.6525608068024521, |
| "grad_norm": 0.7317615747451782, |
| "learning_rate": 0.00013481989708404802, |
| "loss": 0.0222, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.653879111462659, |
| "grad_norm": 0.01835751160979271, |
| "learning_rate": 0.0001346879535558781, |
| "loss": 0.0661, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.655197416122866, |
| "grad_norm": 0.03598962351679802, |
| "learning_rate": 0.00013455601002770813, |
| "loss": 0.0395, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.656515720783073, |
| "grad_norm": 0.013886351138353348, |
| "learning_rate": 0.0001344240664995382, |
| "loss": 0.0156, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.65783402544328, |
| "grad_norm": 5.741530895233154, |
| "learning_rate": 0.00013429212297136827, |
| "loss": 0.0317, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.659152330103487, |
| "grad_norm": 0.20793496072292328, |
| "learning_rate": 0.0001341601794431983, |
| "loss": 0.0072, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.659152330103487, |
| "eval_loss": 0.0300898440182209, |
| "eval_runtime": 453.0554, |
| "eval_samples_per_second": 7.443, |
| "eval_steps_per_second": 3.721, |
| "step": 2500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 7584, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.0176108255414272e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|