diff --git a/.gitattributes b/.gitattributes index ea5b1bf9e56d910c6e4bd976cd70234ee414ed53..41e795851187d5327d964335f86e473fb7422699 100644 --- a/.gitattributes +++ b/.gitattributes @@ -38,3 +38,4 @@ unselective_run_1_restarted/wandb/run-20250213_073721-bd672vnz/run-bd672vnz.wand unselective_run_1_restarted_with_o_rescaled/wandb/run-20250213_072314-jkzpfmgd/run-jkzpfmgd.wandb filter=lfs diff=lfs merge=lfs -text self_to_selective_run_1_restarted_with_ko_zero/wandb/run-20250213_073407-e0qpxozd/run-e0qpxozd.wandb filter=lfs diff=lfs merge=lfs -text unselective_run_1_restarted_with_o_zero/wandb/run-20250213_072621-844lvv02/run-844lvv02.wandb filter=lfs diff=lfs merge=lfs -text +self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/run-xzmoqbdx.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/args.json b/self_to_selective_run_1_restarted_with_memory_penalty/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e22950c8ea7c6df02461e776c058c6723e4d869f --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/args.json @@ -0,0 +1 @@ +{"hellaswag": true, "attention_kind": "selective_with_memory_penalty", "log_dir": "self_to_selective_run_1_restarted_with_memory_penalty", "resume_checkpoint": "hf://andrew-healey/context-compression/unselective_run_0/model_07500.pt", "resume_optimizer": false, "add_a_head": true, "add_head_to_start": true, "new_head_init": "ko_zero", "protect_bos_token": true, "max_steps": 2500, "group": "selective_surgery_3", "use_wandb": true, "kill_self_after_run": false} \ No newline at end of file diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/dataloader_02499.pt b/self_to_selective_run_1_restarted_with_memory_penalty/dataloader_02499.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cba0b4eadb1ff3494ac72786c853d422de2f975 --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/dataloader_02499.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cefbc0ab907c6a30382d71f16b47450e3533ee64ae273fcfa46862dc62ec8a7 +size 964 diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/log2.txt b/self_to_selective_run_1_restarted_with_memory_penalty/log2.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e852ae3ba965101cff4946d50a1afd7515165d8 --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/log2.txt @@ -0,0 +1,2703 @@ +max_steps: 2500 +0 val loss 4.4279 +0 val perplexity 83.7545 +0 train 4.393443 (lr=8.3916e-07) (hash(x)=45139629) +1 train 4.408799 (lr=1.6783e-06) (hash(x)=42751064) +2 train 4.448466 (lr=2.5175e-06) (hash(x)=41229078) +3 train 4.339198 (lr=3.3566e-06) (hash(x)=34548191) +4 train 4.191264 (lr=4.1958e-06) (hash(x)=39959848) +5 train 4.139318 (lr=5.0350e-06) (hash(x)=39240761) +6 train 4.075793 (lr=5.8741e-06) (hash(x)=38148412) +7 train 3.998695 (lr=6.7133e-06) (hash(x)=46254249) +8 train 4.040766 (lr=7.5524e-06) (hash(x)=39878968) +9 train 4.005822 (lr=8.3916e-06) (hash(x)=48214364) +10 train 4.006109 (lr=9.2308e-06) (hash(x)=40388916) +11 train 4.010857 (lr=1.0070e-05) (hash(x)=36449929) +12 train 3.959165 (lr=1.0909e-05) (hash(x)=39410877) +13 train 3.935296 (lr=1.1748e-05) (hash(x)=45951428) +14 train 3.915572 (lr=1.2587e-05) (hash(x)=39171052) +15 train 3.976100 (lr=1.3427e-05) (hash(x)=37736146) +16 train 3.886858 (lr=1.4266e-05) (hash(x)=40669979) +17 train 3.988770 (lr=1.5105e-05) (hash(x)=25678061) +18 train 3.887189 (lr=1.5944e-05) (hash(x)=50273793) +19 train 3.918703 (lr=1.6783e-05) (hash(x)=39337479) +20 train 3.932514 (lr=1.7622e-05) (hash(x)=39376722) +21 train 3.943015 (lr=1.8462e-05) (hash(x)=38776207) +22 train 3.942523 (lr=1.9301e-05) (hash(x)=34982567) +23 train 3.893634 (lr=2.0140e-05) (hash(x)=42647996) +24 train 3.907353 (lr=2.0979e-05) (hash(x)=34546256) +25 val loss 3.8895 +25 val perplexity 48.8860 +25 train 3.950865 (lr=2.1818e-05) (hash(x)=41562172) +26 train 3.921378 (lr=2.2657e-05) (hash(x)=43573190) +27 train 3.916573 (lr=2.3497e-05) (hash(x)=39618284) +28 train 3.921322 (lr=2.4336e-05) (hash(x)=41845339) +29 train 3.901302 (lr=2.5175e-05) (hash(x)=40130466) +30 train 3.853675 (lr=2.6014e-05) (hash(x)=41299376) +31 train 3.851100 (lr=2.6853e-05) (hash(x)=41574570) +32 train 3.801766 (lr=2.7692e-05) (hash(x)=40806162) +33 train 3.815352 (lr=2.8531e-05) (hash(x)=38975248) +34 train 3.862724 (lr=2.9371e-05) (hash(x)=36691007) +35 train 3.818541 (lr=3.0210e-05) (hash(x)=38159293) +36 train 3.844186 (lr=3.1049e-05) (hash(x)=46581740) +37 train 3.805704 (lr=3.1888e-05) (hash(x)=40284490) +38 train 3.785592 (lr=3.2727e-05) (hash(x)=33383282) +39 train 3.912579 (lr=3.3566e-05) (hash(x)=40554543) +40 train 3.832278 (lr=3.4406e-05) (hash(x)=40397535) +41 train 3.874651 (lr=3.5245e-05) (hash(x)=45561912) +42 train 3.906337 (lr=3.6084e-05) (hash(x)=42671174) +43 train 3.876131 (lr=3.6923e-05) (hash(x)=38750076) +44 train 3.835607 (lr=3.7762e-05) (hash(x)=35947128) +45 train 3.927090 (lr=3.8601e-05) (hash(x)=41888346) +46 train 3.891225 (lr=3.9441e-05) (hash(x)=40366739) +47 train 3.901740 (lr=4.0280e-05) (hash(x)=41932257) +48 train 3.906932 (lr=4.1119e-05) (hash(x)=42401906) +49 train 3.907823 (lr=4.1958e-05) (hash(x)=40109308) +50 val loss 3.8596 +50 val perplexity 47.4442 +50 train 3.873801 (lr=4.2797e-05) (hash(x)=35634789) +51 train 3.822859 (lr=4.3636e-05) (hash(x)=39031980) +52 train 3.868409 (lr=4.4476e-05) (hash(x)=37985908) +53 train 3.825042 (lr=4.5315e-05) (hash(x)=40471332) +54 train 3.866691 (lr=4.6154e-05) (hash(x)=39051270) +55 train 3.836372 (lr=4.6993e-05) (hash(x)=39609850) +56 train 3.862881 (lr=4.7832e-05) (hash(x)=44495960) +57 train 3.847629 (lr=4.8671e-05) (hash(x)=35992556) +58 train 3.963893 (lr=4.9510e-05) (hash(x)=42497266) +59 train 3.936149 (lr=5.0350e-05) (hash(x)=36478415) +60 train 3.883785 (lr=5.1189e-05) (hash(x)=39626286) +61 train 3.876284 (lr=5.2028e-05) (hash(x)=39215075) +62 train 3.884097 (lr=5.2867e-05) (hash(x)=38138224) +63 train 3.868871 (lr=5.3706e-05) (hash(x)=39018360) +64 train 3.847831 (lr=5.4545e-05) (hash(x)=34873695) +65 train 3.803256 (lr=5.5385e-05) (hash(x)=44883258) +66 train 3.814696 (lr=5.6224e-05) (hash(x)=34297192) +67 train 3.859408 (lr=5.7063e-05) (hash(x)=43895937) +68 train 3.862029 (lr=5.7902e-05) (hash(x)=35055514) +69 train 3.826919 (lr=5.8741e-05) (hash(x)=36471351) +70 train 3.816696 (lr=5.9580e-05) (hash(x)=36173326) +71 train 3.804653 (lr=6.0420e-05) (hash(x)=47689862) +72 train 3.834329 (lr=6.1259e-05) (hash(x)=41340098) +73 train 3.846651 (lr=6.2098e-05) (hash(x)=42210254) +74 train 3.811414 (lr=6.2937e-05) (hash(x)=42472212) +75 val loss 3.8538 +75 val perplexity 47.1712 +75 train 3.908384 (lr=6.3776e-05) (hash(x)=38577865) +76 train 3.832153 (lr=6.4615e-05) (hash(x)=38745918) +77 train 3.867815 (lr=6.5455e-05) (hash(x)=42642137) +78 train 3.873968 (lr=6.6294e-05) (hash(x)=41679554) +79 train 3.899307 (lr=6.7133e-05) (hash(x)=38524043) +80 train 3.868505 (lr=6.7972e-05) (hash(x)=42115063) +81 train 3.875647 (lr=6.8811e-05) (hash(x)=45202065) +82 train 3.925524 (lr=6.9650e-05) (hash(x)=35452902) +83 train 3.862810 (lr=7.0490e-05) (hash(x)=41623515) +84 train 3.839538 (lr=7.1329e-05) (hash(x)=44144786) +85 train 3.921173 (lr=7.2168e-05) (hash(x)=49639761) +86 train 3.859308 (lr=7.3007e-05) (hash(x)=66028996) +87 train 3.854143 (lr=7.3846e-05) (hash(x)=43667778) +88 train 3.782822 (lr=7.4685e-05) (hash(x)=36746080) +89 train 3.901006 (lr=7.5524e-05) (hash(x)=42307415) +90 train 3.871667 (lr=7.6364e-05) (hash(x)=39047906) +91 train 3.815237 (lr=7.7203e-05) (hash(x)=43108713) +92 train 3.826456 (lr=7.8042e-05) (hash(x)=38603082) +93 train 3.847907 (lr=7.8881e-05) (hash(x)=37170764) +94 train 3.865739 (lr=7.9720e-05) (hash(x)=38420391) +95 train 3.799418 (lr=8.0559e-05) (hash(x)=34663661) +96 train 3.827882 (lr=8.1399e-05) (hash(x)=36923391) +97 train 3.885617 (lr=8.2238e-05) (hash(x)=37752378) +98 train 3.861015 (lr=8.3077e-05) (hash(x)=41208499) +99 train 3.834940 (lr=8.3916e-05) (hash(x)=33289881) +100 val loss 3.8509 +100 val perplexity 47.0346 +100 train 3.852343 (lr=8.4755e-05) (hash(x)=38474341) +101 train 3.824832 (lr=8.5594e-05) (hash(x)=53109574) +102 train 3.850434 (lr=8.6434e-05) (hash(x)=38902818) +103 train 3.815179 (lr=8.7273e-05) (hash(x)=35142963) +104 train 3.826632 (lr=8.8112e-05) (hash(x)=44079319) +105 train 3.822171 (lr=8.8951e-05) (hash(x)=39111210) +106 train 3.863076 (lr=8.9790e-05) (hash(x)=41698107) +107 train 3.807552 (lr=9.0629e-05) (hash(x)=40884369) +108 train 3.829730 (lr=9.1469e-05) (hash(x)=41815593) +109 train 3.828184 (lr=9.2308e-05) (hash(x)=33956891) +110 train 3.880381 (lr=9.3147e-05) (hash(x)=39453656) +111 train 3.834862 (lr=9.3986e-05) (hash(x)=40579926) +112 train 3.845744 (lr=9.4825e-05) (hash(x)=38194672) +113 train 3.898971 (lr=9.5664e-05) (hash(x)=40405541) +114 train 3.884350 (lr=9.6503e-05) (hash(x)=42131297) +115 train 3.823194 (lr=9.7343e-05) (hash(x)=42547839) +116 train 3.888769 (lr=9.8182e-05) (hash(x)=43524778) +117 train 3.849446 (lr=9.9021e-05) (hash(x)=46604978) +118 train 3.854818 (lr=9.9860e-05) (hash(x)=44906941) +119 train 3.871190 (lr=1.0070e-04) (hash(x)=38419253) +120 train 3.901891 (lr=1.0154e-04) (hash(x)=38704177) +121 train 3.806935 (lr=1.0238e-04) (hash(x)=36268398) +122 train 3.861631 (lr=1.0322e-04) (hash(x)=38492575) +123 train 3.857289 (lr=1.0406e-04) (hash(x)=38364400) +124 train 3.859062 (lr=1.0490e-04) (hash(x)=38329297) +125 val loss 3.8509 +125 val perplexity 47.0376 +125 train 3.888362 (lr=1.0573e-04) (hash(x)=37468773) +126 train 3.913844 (lr=1.0657e-04) (hash(x)=36270168) +127 train 3.857267 (lr=1.0741e-04) (hash(x)=40390191) +128 train 3.842176 (lr=1.0825e-04) (hash(x)=39945965) +129 train 3.847849 (lr=1.0909e-04) (hash(x)=41463203) +130 train 3.882656 (lr=1.0993e-04) (hash(x)=35491760) +131 train 3.851478 (lr=1.1077e-04) (hash(x)=37812998) +132 train 3.828274 (lr=1.1161e-04) (hash(x)=35451641) +133 train 3.819535 (lr=1.1245e-04) (hash(x)=40122268) +134 train 3.847534 (lr=1.1329e-04) (hash(x)=39386364) +135 train 3.787405 (lr=1.1413e-04) (hash(x)=40325536) +136 train 3.770731 (lr=1.1497e-04) (hash(x)=38580053) +137 train 3.803704 (lr=1.1580e-04) (hash(x)=38485356) +138 train 3.833760 (lr=1.1664e-04) (hash(x)=39280930) +139 train 3.780044 (lr=1.1748e-04) (hash(x)=38959760) +140 train 3.786521 (lr=1.1832e-04) (hash(x)=43745568) +141 train 3.737349 (lr=1.1916e-04) (hash(x)=38097947) +142 train 3.800108 (lr=1.2000e-04) (hash(x)=31662213) +143 train 3.837610 (lr=1.2084e-04) (hash(x)=38465986) +144 train 3.797274 (lr=1.2168e-04) (hash(x)=38728740) +145 train 3.897761 (lr=1.2252e-04) (hash(x)=39929394) +146 train 3.881305 (lr=1.2336e-04) (hash(x)=45634379) +147 train 3.855614 (lr=1.2420e-04) (hash(x)=32659153) +148 train 3.849693 (lr=1.2503e-04) (hash(x)=41041296) +149 train 3.896503 (lr=1.2587e-04) (hash(x)=37921676) +150 val loss 3.8497 +150 val perplexity 46.9791 +150 train 3.874132 (lr=1.2671e-04) (hash(x)=42768795) +151 train 3.864563 (lr=1.2755e-04) (hash(x)=43927456) +152 train 3.816126 (lr=1.2839e-04) (hash(x)=31355062) +153 train 3.887863 (lr=1.2923e-04) (hash(x)=45419365) +154 train 3.907737 (lr=1.3007e-04) (hash(x)=43985748) +155 train 3.883861 (lr=1.3091e-04) (hash(x)=37801483) +156 train 3.893161 (lr=1.3175e-04) (hash(x)=35492280) +157 train 3.900731 (lr=1.3259e-04) (hash(x)=38315827) +158 train 3.809634 (lr=1.3343e-04) (hash(x)=37853276) +159 train 3.863439 (lr=1.3427e-04) (hash(x)=37107003) +160 train 3.890220 (lr=1.3510e-04) (hash(x)=37739525) +161 train 3.877751 (lr=1.3594e-04) (hash(x)=42612526) +162 train 3.845996 (lr=1.3678e-04) (hash(x)=38568399) +163 train 3.987020 (lr=1.3762e-04) (hash(x)=44287645) +164 train 3.829700 (lr=1.3846e-04) (hash(x)=36548378) +165 train 3.896605 (lr=1.3930e-04) (hash(x)=40191618) +166 train 3.907640 (lr=1.4014e-04) (hash(x)=36169522) +167 train 3.822870 (lr=1.4098e-04) (hash(x)=37948453) +168 train 3.836327 (lr=1.4182e-04) (hash(x)=41158818) +169 train 3.820416 (lr=1.4266e-04) (hash(x)=36949322) +170 train 3.793713 (lr=1.4350e-04) (hash(x)=36923221) +171 train 3.795236 (lr=1.4434e-04) (hash(x)=42716003) +172 train 3.815834 (lr=1.4517e-04) (hash(x)=39719566) +173 train 3.830936 (lr=1.4601e-04) (hash(x)=36436934) +174 train 3.861740 (lr=1.4685e-04) (hash(x)=42413518) +175 val loss 3.8515 +175 val perplexity 47.0657 +175 train 3.784555 (lr=1.4769e-04) (hash(x)=38319181) +176 train 3.795583 (lr=1.4853e-04) (hash(x)=32597072) +177 train 3.793663 (lr=1.4937e-04) (hash(x)=46025045) +178 train 3.827638 (lr=1.5021e-04) (hash(x)=39872736) +179 train 3.910553 (lr=1.5105e-04) (hash(x)=43154319) +180 train 3.975653 (lr=1.5189e-04) (hash(x)=46553051) +181 train 3.919280 (lr=1.5273e-04) (hash(x)=38763245) +182 train 3.864950 (lr=1.5357e-04) (hash(x)=44504491) +183 train 3.902390 (lr=1.5441e-04) (hash(x)=41453677) +184 train 3.896419 (lr=1.5524e-04) (hash(x)=39760171) +185 train 3.826394 (lr=1.5608e-04) (hash(x)=39637076) +186 train 3.899972 (lr=1.5692e-04) (hash(x)=35224783) +187 train 3.898394 (lr=1.5776e-04) (hash(x)=38849639) +188 train 3.861941 (lr=1.5860e-04) (hash(x)=39644563) +189 train 3.880408 (lr=1.5944e-04) (hash(x)=41907057) +190 train 3.919226 (lr=1.6028e-04) (hash(x)=41033336) +191 train 3.872868 (lr=1.6112e-04) (hash(x)=42108725) +192 train 3.861025 (lr=1.6196e-04) (hash(x)=36949624) +193 train 3.832063 (lr=1.6280e-04) (hash(x)=39890567) +194 train 3.864435 (lr=1.6364e-04) (hash(x)=40912591) +195 train 3.857391 (lr=1.6448e-04) (hash(x)=37282886) +196 train 3.905225 (lr=1.6531e-04) (hash(x)=45467295) +197 train 3.836240 (lr=1.6615e-04) (hash(x)=44025707) +198 train 3.945784 (lr=1.6699e-04) (hash(x)=40488606) +199 train 3.879972 (lr=1.6783e-04) (hash(x)=38872847) +200 val loss 3.8505 +200 val perplexity 47.0177 +200 train 3.868594 (lr=1.6867e-04) (hash(x)=41048411) +201 train 3.867278 (lr=1.6951e-04) (hash(x)=36678702) +202 train 3.852775 (lr=1.7035e-04) (hash(x)=35302959) +203 train 3.797747 (lr=1.7119e-04) (hash(x)=40274132) +204 train 3.823229 (lr=1.7203e-04) (hash(x)=37451650) +205 train 3.839741 (lr=1.7287e-04) (hash(x)=36633509) +206 train 3.857895 (lr=1.7371e-04) (hash(x)=34527713) +207 train 3.820510 (lr=1.7455e-04) (hash(x)=41039417) +208 train 3.787636 (lr=1.7538e-04) (hash(x)=43732277) +209 train 3.861665 (lr=1.7622e-04) (hash(x)=43632162) +210 train 3.800168 (lr=1.7706e-04) (hash(x)=38091146) +211 train 3.809201 (lr=1.7790e-04) (hash(x)=39693297) +212 train 3.834052 (lr=1.7874e-04) (hash(x)=50071749) +213 train 3.903516 (lr=1.7958e-04) (hash(x)=35549972) +214 train 3.918202 (lr=1.8042e-04) (hash(x)=42450342) +215 train 3.875239 (lr=1.8126e-04) (hash(x)=42038021) +216 train 3.870703 (lr=1.8210e-04) (hash(x)=38262804) +217 train 3.881535 (lr=1.8294e-04) (hash(x)=40349870) +218 train 3.875517 (lr=1.8378e-04) (hash(x)=41785546) +219 train 3.882008 (lr=1.8462e-04) (hash(x)=38986867) +220 train 3.878149 (lr=1.8545e-04) (hash(x)=35461811) +221 train 4.047620 (lr=1.8629e-04) (hash(x)=39715546) +222 train 3.818120 (lr=1.8713e-04) (hash(x)=49306666) +223 train 3.877470 (lr=1.8797e-04) (hash(x)=44998474) +224 train 3.883572 (lr=1.8881e-04) (hash(x)=36273519) +225 val loss 3.8557 +225 val perplexity 47.2612 +225 train 3.874290 (lr=1.8965e-04) (hash(x)=39398250) +226 train 3.887338 (lr=1.9049e-04) (hash(x)=51116221) +227 train 3.825192 (lr=1.9133e-04) (hash(x)=46189992) +228 train 3.884968 (lr=1.9217e-04) (hash(x)=37886638) +229 train 3.786342 (lr=1.9301e-04) (hash(x)=43211712) +230 train 3.929681 (lr=1.9385e-04) (hash(x)=49341124) +231 train 3.861465 (lr=1.9469e-04) (hash(x)=40060211) +232 train 3.870763 (lr=1.9552e-04) (hash(x)=40748465) +233 train 3.900997 (lr=1.9636e-04) (hash(x)=37050427) +234 train 3.884142 (lr=1.9720e-04) (hash(x)=35601639) +235 train 3.836524 (lr=1.9804e-04) (hash(x)=41459440) +236 train 3.820418 (lr=1.9888e-04) (hash(x)=40217124) +237 train 3.831916 (lr=1.9972e-04) (hash(x)=40406981) +238 train 3.832983 (lr=2.0056e-04) (hash(x)=35736560) +239 train 3.778226 (lr=2.0140e-04) (hash(x)=38813300) +240 train 3.797660 (lr=2.0224e-04) (hash(x)=35313660) +241 train 3.790141 (lr=2.0308e-04) (hash(x)=42477718) +242 train 3.861814 (lr=2.0392e-04) (hash(x)=39056217) +243 train 3.860725 (lr=2.0476e-04) (hash(x)=36939765) +244 train 3.831398 (lr=2.0559e-04) (hash(x)=41440144) +245 train 3.801565 (lr=2.0643e-04) (hash(x)=39097109) +246 train 3.890126 (lr=2.0727e-04) (hash(x)=41068379) +247 train 3.768040 (lr=2.0811e-04) (hash(x)=42406014) +248 train 3.780514 (lr=2.0895e-04) (hash(x)=40272802) +249 train 3.828695 (lr=2.0979e-04) (hash(x)=43014164) +250 val loss 3.8570 +250 val perplexity 47.3224 +250 train 3.792278 (lr=2.1063e-04) (hash(x)=38005826) +251 train 3.825191 (lr=2.1147e-04) (hash(x)=42829745) +252 train 3.829246 (lr=2.1231e-04) (hash(x)=37082851) +253 train 3.842735 (lr=2.1315e-04) (hash(x)=39244397) +254 train 3.849941 (lr=2.1399e-04) (hash(x)=42028615) +255 train 3.838879 (lr=2.1483e-04) (hash(x)=36408086) +256 train 3.841702 (lr=2.1566e-04) (hash(x)=41501548) +257 train 3.835712 (lr=2.1650e-04) (hash(x)=40442084) +258 train 3.821564 (lr=2.1734e-04) (hash(x)=39167439) +259 train 3.910155 (lr=2.1818e-04) (hash(x)=39830026) +260 train 3.893105 (lr=2.1902e-04) (hash(x)=40903606) +261 train 3.838136 (lr=2.1986e-04) (hash(x)=38439451) +262 train 3.865999 (lr=2.2070e-04) (hash(x)=40535148) +263 train 3.888954 (lr=2.2154e-04) (hash(x)=38989189) +264 train 3.869772 (lr=2.2238e-04) (hash(x)=40241202) +265 train 3.846615 (lr=2.2322e-04) (hash(x)=36985811) +266 train 3.927274 (lr=2.2406e-04) (hash(x)=38420661) +267 train 3.869853 (lr=2.2490e-04) (hash(x)=42376349) +268 train 3.810919 (lr=2.2573e-04) (hash(x)=46898906) +269 train 3.865035 (lr=2.2657e-04) (hash(x)=44713446) +270 train 3.863976 (lr=2.2741e-04) (hash(x)=46801531) +271 train 3.827195 (lr=2.2825e-04) (hash(x)=40660722) +272 train 3.874818 (lr=2.2909e-04) (hash(x)=41753296) +273 train 3.816881 (lr=2.2993e-04) (hash(x)=45527356) +274 train 3.825661 (lr=2.3077e-04) (hash(x)=43506937) +275 val loss 3.8587 +275 val perplexity 47.4020 +275 train 3.822490 (lr=2.3161e-04) (hash(x)=34336749) +276 train 3.856504 (lr=2.3245e-04) (hash(x)=30601400) +277 train 3.785060 (lr=2.3329e-04) (hash(x)=34696963) +278 train 3.811465 (lr=2.3413e-04) (hash(x)=32569045) +279 train 3.817144 (lr=2.3497e-04) (hash(x)=44118681) +280 train 3.809293 (lr=2.3580e-04) (hash(x)=39963624) +281 train 3.849562 (lr=2.3664e-04) (hash(x)=41482046) +282 train 3.858880 (lr=2.3748e-04) (hash(x)=34128007) +283 train 3.861196 (lr=2.3832e-04) (hash(x)=45453384) +284 train 3.844221 (lr=2.3916e-04) (hash(x)=44277936) +285 train 3.920742 (lr=2.4000e-04) (hash(x)=36871782) +286 train 3.865961 (lr=2.4084e-04) (hash(x)=40205601) +287 train 3.826860 (lr=2.4168e-04) (hash(x)=38381640) +288 train 3.800892 (lr=2.4252e-04) (hash(x)=40234000) +289 train 3.825801 (lr=2.4336e-04) (hash(x)=43934207) +290 train 3.867836 (lr=2.4420e-04) (hash(x)=33803765) +291 train 3.881078 (lr=2.4503e-04) (hash(x)=42983811) +292 train 3.911033 (lr=2.4587e-04) (hash(x)=28552721) +293 train 3.911941 (lr=2.4671e-04) (hash(x)=47951640) +294 train 3.890288 (lr=2.4755e-04) (hash(x)=51340827) +295 train 3.880983 (lr=2.4839e-04) (hash(x)=42163594) +296 train 3.911482 (lr=2.4923e-04) (hash(x)=38847687) +297 train 3.868113 (lr=2.5007e-04) (hash(x)=37068840) +298 train 3.871959 (lr=2.5091e-04) (hash(x)=43006298) +299 train 3.882006 (lr=2.5175e-04) (hash(x)=41635105) +300 val loss 3.8630 +300 val perplexity 47.6056 +300 train 3.893557 (lr=2.5259e-04) (hash(x)=39191006) +301 train 3.883399 (lr=2.5343e-04) (hash(x)=35095440) +302 train 3.862748 (lr=2.5427e-04) (hash(x)=43613543) +303 train 3.880363 (lr=2.5510e-04) (hash(x)=43803605) +304 train 3.897355 (lr=2.5594e-04) (hash(x)=37617896) +305 train 3.862287 (lr=2.5678e-04) (hash(x)=44278837) +306 train 3.840580 (lr=2.5762e-04) (hash(x)=39452850) +307 train 3.851703 (lr=2.5846e-04) (hash(x)=45959456) +308 train 3.816612 (lr=2.5930e-04) (hash(x)=40579873) +309 train 3.860766 (lr=2.6014e-04) (hash(x)=35794292) +310 train 3.875046 (lr=2.6098e-04) (hash(x)=42960300) +311 train 3.890170 (lr=2.6182e-04) (hash(x)=40003942) +312 train 3.850728 (lr=2.6266e-04) (hash(x)=35591620) +313 train 3.878728 (lr=2.6350e-04) (hash(x)=38162048) +314 train 3.871975 (lr=2.6434e-04) (hash(x)=37227144) +315 train 3.832888 (lr=2.6517e-04) (hash(x)=40189251) +316 train 3.811469 (lr=2.6601e-04) (hash(x)=41434711) +317 train 3.879610 (lr=2.6685e-04) (hash(x)=37731173) +318 train 3.875114 (lr=2.6769e-04) (hash(x)=33013739) +319 train 3.826522 (lr=2.6853e-04) (hash(x)=39907080) +320 train 3.817621 (lr=2.6937e-04) (hash(x)=41357431) +321 train 3.903267 (lr=2.7021e-04) (hash(x)=45393909) +322 train 3.833714 (lr=2.7105e-04) (hash(x)=45680086) +323 train 3.889701 (lr=2.7189e-04) (hash(x)=40282412) +324 train 3.840719 (lr=2.7273e-04) (hash(x)=41908482) +325 val loss 3.8618 +325 val perplexity 47.5491 +325 train 3.903890 (lr=2.7357e-04) (hash(x)=38108809) +326 train 3.841767 (lr=2.7441e-04) (hash(x)=41267943) +327 train 3.871916 (lr=2.7524e-04) (hash(x)=40425172) +328 train 3.892068 (lr=2.7608e-04) (hash(x)=41215123) +329 train 3.905273 (lr=2.7692e-04) (hash(x)=39711205) +330 train 3.891838 (lr=2.7776e-04) (hash(x)=42950510) +331 train 3.866570 (lr=2.7860e-04) (hash(x)=38040911) +332 train 3.883849 (lr=2.7944e-04) (hash(x)=38643209) +333 train 3.897305 (lr=2.8028e-04) (hash(x)=37133859) +334 train 3.911971 (lr=2.8112e-04) (hash(x)=42830391) +335 train 3.859587 (lr=2.8196e-04) (hash(x)=36665178) +336 train 3.872805 (lr=2.8280e-04) (hash(x)=41186875) +337 train 3.905930 (lr=2.8364e-04) (hash(x)=42238832) +338 train 3.862665 (lr=2.8448e-04) (hash(x)=37254380) +339 train 3.853812 (lr=2.8531e-04) (hash(x)=36367215) +340 train 3.850474 (lr=2.8615e-04) (hash(x)=47825234) +341 train 3.813458 (lr=2.8699e-04) (hash(x)=34694756) +342 train 3.885455 (lr=2.8783e-04) (hash(x)=39299110) +343 train 3.915621 (lr=2.8867e-04) (hash(x)=40101833) +344 train 3.889045 (lr=2.8951e-04) (hash(x)=40007668) +345 train 3.847039 (lr=2.9035e-04) (hash(x)=50419294) +346 train 3.823452 (lr=2.9119e-04) (hash(x)=36009682) +347 train 3.842559 (lr=2.9203e-04) (hash(x)=37722138) +348 train 3.817088 (lr=2.9287e-04) (hash(x)=40120501) +349 train 3.836729 (lr=2.9371e-04) (hash(x)=37202243) +350 val loss 3.8716 +350 val perplexity 48.0187 +350 train 3.822455 (lr=2.9455e-04) (hash(x)=41471753) +351 train 3.899278 (lr=2.9538e-04) (hash(x)=44306996) +352 train 3.876629 (lr=2.9622e-04) (hash(x)=38882927) +353 train 3.828707 (lr=2.9706e-04) (hash(x)=38233276) +354 train 3.929975 (lr=2.9790e-04) (hash(x)=37681810) +355 train 3.899930 (lr=2.9874e-04) (hash(x)=41051898) +356 train 3.805047 (lr=2.9958e-04) (hash(x)=40344290) +357 train 3.854972 (lr=3.0042e-04) (hash(x)=37395740) +358 train 3.849973 (lr=3.0126e-04) (hash(x)=37539108) +359 train 3.819481 (lr=3.0210e-04) (hash(x)=43409791) +360 train 3.875143 (lr=3.0294e-04) (hash(x)=40049316) +361 train 3.864595 (lr=3.0378e-04) (hash(x)=44609997) +362 train 3.917947 (lr=3.0462e-04) (hash(x)=30969715) +363 train 3.838974 (lr=3.0545e-04) (hash(x)=39297808) +364 train 3.853747 (lr=3.0629e-04) (hash(x)=39507177) +365 train 3.864021 (lr=3.0713e-04) (hash(x)=41700739) +366 train 3.923968 (lr=3.0797e-04) (hash(x)=41020635) +367 train 3.831028 (lr=3.0881e-04) (hash(x)=41342289) +368 train 3.938215 (lr=3.0965e-04) (hash(x)=38263858) +369 train 3.899338 (lr=3.1049e-04) (hash(x)=34402058) +370 train 3.890531 (lr=3.1133e-04) (hash(x)=37626767) +371 train 3.867266 (lr=3.1217e-04) (hash(x)=40715997) +372 train 3.862134 (lr=3.1301e-04) (hash(x)=44724777) +373 train 3.863185 (lr=3.1385e-04) (hash(x)=35886098) +374 train 3.892324 (lr=3.1469e-04) (hash(x)=43156817) +375 val loss 3.8699 +375 val perplexity 47.9364 +375 train 3.841361 (lr=3.1552e-04) (hash(x)=40215585) +376 train 3.843825 (lr=3.1636e-04) (hash(x)=46637169) +377 train 3.839301 (lr=3.1720e-04) (hash(x)=40512467) +378 train 3.832651 (lr=3.1804e-04) (hash(x)=47062141) +379 train 3.876358 (lr=3.1888e-04) (hash(x)=38460928) +380 train 3.811151 (lr=3.1972e-04) (hash(x)=33900747) +381 train 3.824117 (lr=3.2056e-04) (hash(x)=46452722) +382 train 3.847764 (lr=3.2140e-04) (hash(x)=39039801) +383 train 3.866390 (lr=3.2224e-04) (hash(x)=39847188) +384 train 3.875432 (lr=3.2308e-04) (hash(x)=41290351) +385 train 3.875997 (lr=3.2392e-04) (hash(x)=37994917) +386 train 3.835991 (lr=3.2476e-04) (hash(x)=41411286) +387 train 3.853321 (lr=3.2559e-04) (hash(x)=38253907) +388 train 3.857584 (lr=3.2643e-04) (hash(x)=39389558) +389 train 3.835227 (lr=3.2727e-04) (hash(x)=41833533) +390 train 3.905764 (lr=3.2811e-04) (hash(x)=36090003) +391 train 3.750039 (lr=3.2895e-04) (hash(x)=37464282) +392 train 3.828994 (lr=3.2979e-04) (hash(x)=39535733) +393 train 3.847142 (lr=3.3063e-04) (hash(x)=42597408) +394 train 3.850677 (lr=3.3147e-04) (hash(x)=41815351) +395 train 3.904500 (lr=3.3231e-04) (hash(x)=35416682) +396 train 3.876894 (lr=3.3315e-04) (hash(x)=44708062) +397 train 3.850883 (lr=3.3399e-04) (hash(x)=42858542) +398 train 3.882925 (lr=3.3483e-04) (hash(x)=39793794) +399 train 3.859722 (lr=3.3566e-04) (hash(x)=37764149) +400 val loss 3.8784 +400 val perplexity 48.3465 +400 train 3.897000 (lr=3.3650e-04) (hash(x)=37217038) +401 train 3.914107 (lr=3.3734e-04) (hash(x)=36977201) +402 train 3.905820 (lr=3.3818e-04) (hash(x)=42908543) +403 train 3.955850 (lr=3.3902e-04) (hash(x)=41542299) +404 train 4.096984 (lr=3.3986e-04) (hash(x)=40346138) +405 train 3.863445 (lr=3.4070e-04) (hash(x)=51782438) +406 train 3.896952 (lr=3.4154e-04) (hash(x)=38898364) +407 train 3.879852 (lr=3.4238e-04) (hash(x)=44276053) +408 train 3.988398 (lr=3.4322e-04) (hash(x)=36183405) +409 train 3.844318 (lr=3.4406e-04) (hash(x)=45397385) +410 train 3.848653 (lr=3.4490e-04) (hash(x)=46724445) +411 train 3.906629 (lr=3.4573e-04) (hash(x)=38601597) +412 train 3.876091 (lr=3.4657e-04) (hash(x)=40773938) +413 train 3.877480 (lr=3.4741e-04) (hash(x)=39051026) +414 train 3.894830 (lr=3.4825e-04) (hash(x)=41625650) +415 train 3.886320 (lr=3.4909e-04) (hash(x)=33704341) +416 train 3.855072 (lr=3.4993e-04) (hash(x)=38010873) +417 train 3.838010 (lr=3.5077e-04) (hash(x)=42938488) +418 train 3.877621 (lr=3.5161e-04) (hash(x)=40261140) +419 train 3.914494 (lr=3.5245e-04) (hash(x)=65481505) +420 train 3.859554 (lr=3.5329e-04) (hash(x)=36936602) +421 train 3.895493 (lr=3.5413e-04) (hash(x)=39966635) +422 train 3.986085 (lr=3.5497e-04) (hash(x)=41834481) +423 train 3.858933 (lr=3.5580e-04) (hash(x)=42795863) +424 train 3.903056 (lr=3.5664e-04) (hash(x)=39799088) +425 val loss 3.8820 +425 val perplexity 48.5230 +425 train 3.932259 (lr=3.5748e-04) (hash(x)=41625444) +426 train 3.839967 (lr=3.5832e-04) (hash(x)=37722679) +427 train 3.878925 (lr=3.5916e-04) (hash(x)=42249547) +428 train 3.823243 (lr=3.6000e-04) (hash(x)=42217048) +429 train 3.908056 (lr=3.6084e-04) (hash(x)=40733947) +430 train 3.939759 (lr=3.6168e-04) (hash(x)=41668868) +431 train 3.856617 (lr=3.6252e-04) (hash(x)=37888442) +432 train 3.850848 (lr=3.6336e-04) (hash(x)=38679759) +433 train 3.869457 (lr=3.6420e-04) (hash(x)=40762754) +434 train 3.893175 (lr=3.6503e-04) (hash(x)=38710765) +435 train 3.920468 (lr=3.6587e-04) (hash(x)=35652983) +436 train 3.906605 (lr=3.6671e-04) (hash(x)=43623054) +437 train 3.826689 (lr=3.6755e-04) (hash(x)=39990930) +438 train 3.881460 (lr=3.6839e-04) (hash(x)=39295330) +439 train 3.919364 (lr=3.6923e-04) (hash(x)=40580162) +440 train 3.866326 (lr=3.7007e-04) (hash(x)=39042873) +441 train 3.857991 (lr=3.7091e-04) (hash(x)=46206217) +442 train 3.864721 (lr=3.7175e-04) (hash(x)=39076645) +443 train 3.945112 (lr=3.7259e-04) (hash(x)=36423161) +444 train 3.878020 (lr=3.7343e-04) (hash(x)=50627742) +445 train 3.843171 (lr=3.7427e-04) (hash(x)=35801894) +446 train 3.834538 (lr=3.7510e-04) (hash(x)=41843008) +447 train 3.866690 (lr=3.7594e-04) (hash(x)=44116937) +448 train 3.836999 (lr=3.7678e-04) (hash(x)=41603691) +449 train 3.863618 (lr=3.7762e-04) (hash(x)=30259039) +450 val loss 3.8833 +450 val perplexity 48.5824 +450 train 3.862782 (lr=3.7846e-04) (hash(x)=39633083) +451 train 3.849272 (lr=3.7930e-04) (hash(x)=40697339) +452 train 3.861321 (lr=3.8014e-04) (hash(x)=38107484) +453 train 3.919208 (lr=3.8098e-04) (hash(x)=39550562) +454 train 3.866850 (lr=3.8182e-04) (hash(x)=41347536) +455 train 3.835210 (lr=3.8266e-04) (hash(x)=37216447) +456 train 3.894035 (lr=3.8350e-04) (hash(x)=38902664) +457 train 3.895190 (lr=3.8434e-04) (hash(x)=46248854) +458 train 3.873477 (lr=3.8517e-04) (hash(x)=37479491) +459 train 3.896287 (lr=3.8601e-04) (hash(x)=47794102) +460 train 3.897126 (lr=3.8685e-04) (hash(x)=41170176) +461 train 3.904165 (lr=3.8769e-04) (hash(x)=43515078) +462 train 3.895084 (lr=3.8853e-04) (hash(x)=39550642) +463 train 3.934421 (lr=3.8937e-04) (hash(x)=39272253) +464 train 3.824460 (lr=3.9021e-04) (hash(x)=37812444) +465 train 3.916705 (lr=3.9105e-04) (hash(x)=38611088) +466 train 3.880251 (lr=3.9189e-04) (hash(x)=34492797) +467 train 3.893742 (lr=3.9273e-04) (hash(x)=37977801) +468 train 3.927913 (lr=3.9357e-04) (hash(x)=36740660) +469 train 3.902443 (lr=3.9441e-04) (hash(x)=42896829) +470 train 3.885204 (lr=3.9524e-04) (hash(x)=41878547) +471 train 3.855738 (lr=3.9608e-04) (hash(x)=42468059) +472 train 3.893852 (lr=3.9692e-04) (hash(x)=39976325) +473 train 3.949296 (lr=3.9776e-04) (hash(x)=41591581) +474 train 3.892916 (lr=3.9860e-04) (hash(x)=40716572) +475 val loss 3.8851 +475 val perplexity 48.6702 +475 train 3.852768 (lr=3.9944e-04) (hash(x)=38961558) +476 train 3.918780 (lr=4.0028e-04) (hash(x)=38289527) +477 train 3.941182 (lr=4.0112e-04) (hash(x)=40058482) +478 train 3.838184 (lr=4.0196e-04) (hash(x)=39753474) +479 train 3.859168 (lr=4.0280e-04) (hash(x)=42219460) +480 train 3.812346 (lr=4.0364e-04) (hash(x)=39599070) +481 train 3.876783 (lr=4.0448e-04) (hash(x)=44391935) +482 train 3.879762 (lr=4.0531e-04) (hash(x)=38878843) +483 train 3.967027 (lr=4.0615e-04) (hash(x)=39967305) +484 train 3.897903 (lr=4.0699e-04) (hash(x)=44978480) +485 train 3.917948 (lr=4.0783e-04) (hash(x)=39773362) +486 train 3.901612 (lr=4.0867e-04) (hash(x)=42989604) +487 train 3.791342 (lr=4.0951e-04) (hash(x)=37505924) +488 train 3.907545 (lr=4.1035e-04) (hash(x)=40902663) +489 train 3.900328 (lr=4.1119e-04) (hash(x)=48151941) +490 train 3.940559 (lr=4.1203e-04) (hash(x)=37175190) +491 train 3.980025 (lr=4.1287e-04) (hash(x)=40471176) +492 train 3.931572 (lr=4.1371e-04) (hash(x)=41831592) +493 train 3.914078 (lr=4.1455e-04) (hash(x)=34614525) +494 train 3.914722 (lr=4.1538e-04) (hash(x)=38763153) +495 train 3.848957 (lr=4.1622e-04) (hash(x)=42649226) +496 train 3.877984 (lr=4.1706e-04) (hash(x)=44317832) +497 train 3.919805 (lr=4.1790e-04) (hash(x)=40794953) +498 train 3.886177 (lr=4.1874e-04) (hash(x)=40457710) +499 train 3.879502 (lr=4.1958e-04) (hash(x)=37976256) +500 val loss 3.8868 +500 val perplexity 48.7542 +500 train 3.908135 (lr=4.2042e-04) (hash(x)=52109233) +501 train 3.947453 (lr=4.2126e-04) (hash(x)=40556756) +502 train 3.925533 (lr=4.2210e-04) (hash(x)=40559045) +503 train 3.894030 (lr=4.2294e-04) (hash(x)=38114697) +504 train 3.911389 (lr=4.2378e-04) (hash(x)=38815138) +505 train 3.946396 (lr=4.2462e-04) (hash(x)=43981989) +506 train 3.889690 (lr=4.2545e-04) (hash(x)=35096273) +507 train 3.893897 (lr=4.2629e-04) (hash(x)=45919037) +508 train 3.922432 (lr=4.2713e-04) (hash(x)=39734632) +509 train 3.862441 (lr=4.2797e-04) (hash(x)=35712904) +510 train 3.889961 (lr=4.2881e-04) (hash(x)=36803160) +511 train 3.982841 (lr=4.2965e-04) (hash(x)=30412353) +512 train 3.920351 (lr=4.3049e-04) (hash(x)=42742870) +513 train 3.888626 (lr=4.3133e-04) (hash(x)=43561454) +514 train 3.870001 (lr=4.3217e-04) (hash(x)=42403106) +515 train 3.907346 (lr=4.3301e-04) (hash(x)=40065717) +516 train 3.893583 (lr=4.3385e-04) (hash(x)=44676443) +517 train 3.857188 (lr=4.3469e-04) (hash(x)=37599943) +518 train 3.897652 (lr=4.3552e-04) (hash(x)=36619196) +519 train 3.832333 (lr=4.3636e-04) (hash(x)=42450033) +520 train 3.908115 (lr=4.3720e-04) (hash(x)=37985678) +521 train 3.886316 (lr=4.3804e-04) (hash(x)=35298533) +522 train 3.865573 (lr=4.3888e-04) (hash(x)=44111018) +523 train 3.832541 (lr=4.3972e-04) (hash(x)=38180941) +524 train 3.885832 (lr=4.4056e-04) (hash(x)=41960282) +525 val loss 3.9033 +525 val perplexity 49.5650 +525 train 3.889136 (lr=4.4140e-04) (hash(x)=35776213) +526 train 3.924963 (lr=4.4224e-04) (hash(x)=40242082) +527 train 3.965729 (lr=4.4308e-04) (hash(x)=41815447) +528 train 3.928971 (lr=4.4392e-04) (hash(x)=40903887) +529 train 3.875183 (lr=4.4476e-04) (hash(x)=40754156) +530 train 3.894653 (lr=4.4559e-04) (hash(x)=38581536) +531 train 3.909822 (lr=4.4643e-04) (hash(x)=41130354) +532 train 3.923635 (lr=4.4727e-04) (hash(x)=40776870) +533 train 3.906196 (lr=4.4811e-04) (hash(x)=36808358) +534 train 3.963665 (lr=4.4895e-04) (hash(x)=35149764) +535 train 3.882747 (lr=4.4979e-04) (hash(x)=41727249) +536 train 3.996640 (lr=4.5063e-04) (hash(x)=43103404) +537 train 3.922885 (lr=4.5147e-04) (hash(x)=43481237) +538 train 3.887696 (lr=4.5231e-04) (hash(x)=36582175) +539 train 3.996899 (lr=4.5315e-04) (hash(x)=37877055) +540 train 3.926992 (lr=4.5399e-04) (hash(x)=41426398) +541 train 3.898449 (lr=4.5483e-04) (hash(x)=42714440) +542 train 3.905726 (lr=4.5566e-04) (hash(x)=43293053) +543 train 3.879348 (lr=4.5650e-04) (hash(x)=39819813) +544 train 3.993848 (lr=4.5734e-04) (hash(x)=32553776) +545 train 3.922774 (lr=4.5818e-04) (hash(x)=37789342) +546 train 3.945741 (lr=4.5902e-04) (hash(x)=42795409) +547 train 3.905503 (lr=4.5986e-04) (hash(x)=38113329) +548 train 3.901222 (lr=4.6070e-04) (hash(x)=40291389) +549 train 3.879651 (lr=4.6154e-04) (hash(x)=36506782) +550 val loss 3.9043 +550 val perplexity 49.6159 +550 train 3.951739 (lr=4.6238e-04) (hash(x)=43957171) +551 train 3.869262 (lr=4.6322e-04) (hash(x)=42702809) +552 train 3.849909 (lr=4.6406e-04) (hash(x)=39042350) +553 train 3.959766 (lr=4.6490e-04) (hash(x)=41492067) +554 train 3.859243 (lr=4.6573e-04) (hash(x)=36110013) +555 train 3.847322 (lr=4.6657e-04) (hash(x)=40130005) +556 train 3.869086 (lr=4.6741e-04) (hash(x)=45049726) +557 train 3.931903 (lr=4.6825e-04) (hash(x)=49365195) +558 train 3.913176 (lr=4.6909e-04) (hash(x)=41567669) +559 train 3.918575 (lr=4.6993e-04) (hash(x)=46207492) +560 train 3.841910 (lr=4.7077e-04) (hash(x)=42130197) +561 train 3.965280 (lr=4.7161e-04) (hash(x)=34265021) +562 train 3.917118 (lr=4.7245e-04) (hash(x)=34773583) +563 train 3.880439 (lr=4.7329e-04) (hash(x)=43184012) +564 train 3.914160 (lr=4.7413e-04) (hash(x)=37145259) +565 train 3.887158 (lr=4.7497e-04) (hash(x)=38381221) +566 train 3.868138 (lr=4.7580e-04) (hash(x)=41428261) +567 train 3.950212 (lr=4.7664e-04) (hash(x)=37091783) +568 train 3.874136 (lr=4.7748e-04) (hash(x)=45685846) +569 train 3.924161 (lr=4.7832e-04) (hash(x)=43343294) +570 train 3.880471 (lr=4.7916e-04) (hash(x)=43254582) +571 train 3.952397 (lr=4.8000e-04) (hash(x)=39849483) +572 train 3.894690 (lr=4.8084e-04) (hash(x)=41264380) +573 train 4.015523 (lr=4.8168e-04) (hash(x)=42283924) +574 train 3.987999 (lr=4.8252e-04) (hash(x)=39655953) +575 val loss 3.9099 +575 val perplexity 49.8921 +575 train 3.887321 (lr=4.8336e-04) (hash(x)=42086418) +576 train 3.852061 (lr=4.8420e-04) (hash(x)=40948295) +577 train 3.945314 (lr=4.8503e-04) (hash(x)=35687541) +578 train 3.921410 (lr=4.8587e-04) (hash(x)=39936458) +579 train 3.908061 (lr=4.8671e-04) (hash(x)=35117344) +580 train 3.986453 (lr=4.8755e-04) (hash(x)=34395538) +581 train 3.939448 (lr=4.8839e-04) (hash(x)=35892076) +582 train 3.877398 (lr=4.8923e-04) (hash(x)=41800741) +583 train 3.922646 (lr=4.9007e-04) (hash(x)=43445467) +584 train 3.912651 (lr=4.9091e-04) (hash(x)=38303204) +585 train 3.876784 (lr=4.9175e-04) (hash(x)=48720465) +586 train 3.862882 (lr=4.9259e-04) (hash(x)=38533369) +587 train 3.868021 (lr=4.9343e-04) (hash(x)=37231081) +588 train 3.918005 (lr=4.9427e-04) (hash(x)=39889787) +589 train 3.849256 (lr=4.9510e-04) (hash(x)=39893421) +590 train 3.881243 (lr=4.9594e-04) (hash(x)=39390655) +591 train 3.942911 (lr=4.9678e-04) (hash(x)=45778037) +592 train 3.875890 (lr=4.9762e-04) (hash(x)=41795445) +593 train 3.876249 (lr=4.9846e-04) (hash(x)=49549707) +594 train 3.880204 (lr=4.9930e-04) (hash(x)=39607090) +595 train 3.959775 (lr=5.0014e-04) (hash(x)=36263694) +596 train 3.928124 (lr=5.0098e-04) (hash(x)=41510997) +597 train 3.925704 (lr=5.0182e-04) (hash(x)=34956326) +598 train 3.916266 (lr=5.0266e-04) (hash(x)=38418391) +599 train 3.983778 (lr=5.0350e-04) (hash(x)=36742756) +600 val loss 3.9115 +600 val perplexity 49.9729 +600 train 3.995497 (lr=5.0434e-04) (hash(x)=39672359) +601 train 3.958368 (lr=5.0517e-04) (hash(x)=39820713) +602 train 3.820040 (lr=5.0601e-04) (hash(x)=49143281) +603 train 3.929526 (lr=5.0685e-04) (hash(x)=41304465) +604 train 3.921060 (lr=5.0769e-04) (hash(x)=41001879) +605 train 3.918296 (lr=5.0853e-04) (hash(x)=43482056) +606 train 3.905796 (lr=5.0937e-04) (hash(x)=45670652) +607 train 3.928124 (lr=5.1021e-04) (hash(x)=33970928) +608 train 3.923735 (lr=5.1105e-04) (hash(x)=38045843) +609 train 3.924061 (lr=5.1189e-04) (hash(x)=33350103) +610 train 3.996737 (lr=5.1273e-04) (hash(x)=41589216) +611 train 3.945261 (lr=5.1357e-04) (hash(x)=34059789) +612 train 3.909083 (lr=5.1441e-04) (hash(x)=41112671) +613 train 3.988202 (lr=5.1524e-04) (hash(x)=40437666) +614 train 3.899864 (lr=5.1608e-04) (hash(x)=38571118) +615 train 3.920360 (lr=5.1692e-04) (hash(x)=44240071) +616 train 3.970065 (lr=5.1776e-04) (hash(x)=41989373) +617 train 3.938045 (lr=5.1860e-04) (hash(x)=39759801) +618 train 3.881736 (lr=5.1944e-04) (hash(x)=45447578) +619 train 3.880761 (lr=5.2028e-04) (hash(x)=40754763) +620 train 3.860386 (lr=5.2112e-04) (hash(x)=35234483) +621 train 3.875401 (lr=5.2196e-04) (hash(x)=42819964) +622 train 3.931608 (lr=5.2280e-04) (hash(x)=39270768) +623 train 3.898251 (lr=5.2364e-04) (hash(x)=37793696) +624 train 3.906253 (lr=5.2448e-04) (hash(x)=43314996) +625 val loss 3.9158 +625 val perplexity 50.1870 +625 train 3.950860 (lr=5.2531e-04) (hash(x)=43151202) +626 train 3.874368 (lr=5.2615e-04) (hash(x)=35628651) +627 train 3.984074 (lr=5.2699e-04) (hash(x)=35743866) +628 train 3.956202 (lr=5.2783e-04) (hash(x)=44585024) +629 train 3.968324 (lr=5.2867e-04) (hash(x)=35346290) +630 train 3.969553 (lr=5.2951e-04) (hash(x)=46920242) +631 train 3.956055 (lr=5.3035e-04) (hash(x)=47673827) +632 train 3.906036 (lr=5.3119e-04) (hash(x)=41445806) +633 train 3.921305 (lr=5.3203e-04) (hash(x)=43754932) +634 train 4.010479 (lr=5.3287e-04) (hash(x)=40292952) +635 train 3.988150 (lr=5.3371e-04) (hash(x)=38812569) +636 train 3.895024 (lr=5.3455e-04) (hash(x)=39579162) +637 train 3.910872 (lr=5.3538e-04) (hash(x)=39930364) +638 train 3.910724 (lr=5.3622e-04) (hash(x)=40851945) +639 train 3.896214 (lr=5.3706e-04) (hash(x)=51691619) +640 train 3.918957 (lr=5.3790e-04) (hash(x)=40556269) +641 train 3.961314 (lr=5.3874e-04) (hash(x)=36311728) +642 train 3.970993 (lr=5.3958e-04) (hash(x)=37571264) +643 train 3.967100 (lr=5.4042e-04) (hash(x)=38957683) +644 train 3.922627 (lr=5.4126e-04) (hash(x)=41255317) +645 train 3.929216 (lr=5.4210e-04) (hash(x)=37052064) +646 train 3.880909 (lr=5.4294e-04) (hash(x)=40994240) +647 train 3.968946 (lr=5.4378e-04) (hash(x)=33207295) +648 train 3.935027 (lr=5.4462e-04) (hash(x)=41024214) +649 train 3.995211 (lr=5.4545e-04) (hash(x)=34240417) +650 val loss 3.9158 +650 val perplexity 50.1895 +650 train 3.929068 (lr=5.4629e-04) (hash(x)=39783693) +651 train 3.891405 (lr=5.4713e-04) (hash(x)=38296799) +652 train 3.919350 (lr=5.4797e-04) (hash(x)=38829129) +653 train 3.901128 (lr=5.4881e-04) (hash(x)=39551481) +654 train 3.941038 (lr=5.4965e-04) (hash(x)=36579423) +655 train 3.902657 (lr=5.5049e-04) (hash(x)=38278786) +656 train 3.943504 (lr=5.5133e-04) (hash(x)=40225050) +657 train 3.904582 (lr=5.5217e-04) (hash(x)=39240007) +658 train 3.840514 (lr=5.5301e-04) (hash(x)=40561343) +659 train 3.889364 (lr=5.5385e-04) (hash(x)=37724001) +660 train 3.926849 (lr=5.5469e-04) (hash(x)=39744474) +661 train 3.933321 (lr=5.5552e-04) (hash(x)=49442189) +662 train 3.921920 (lr=5.5636e-04) (hash(x)=43741733) +663 train 3.871307 (lr=5.5720e-04) (hash(x)=41220583) +664 train 3.941711 (lr=5.5804e-04) (hash(x)=39822002) +665 train 3.916007 (lr=5.5888e-04) (hash(x)=46692739) +666 train 3.908528 (lr=5.5972e-04) (hash(x)=39824606) +667 train 3.943529 (lr=5.6056e-04) (hash(x)=39715723) +668 train 3.905954 (lr=5.6140e-04) (hash(x)=46413749) +669 train 3.884871 (lr=5.6224e-04) (hash(x)=40849450) +670 train 3.968729 (lr=5.6308e-04) (hash(x)=52400522) +671 train 4.021111 (lr=5.6392e-04) (hash(x)=43236859) +672 train 4.037140 (lr=5.6476e-04) (hash(x)=47198014) +673 train 3.956490 (lr=5.6559e-04) (hash(x)=42750457) +674 train 3.983686 (lr=5.6643e-04) (hash(x)=38676842) +675 val loss 3.9239 +675 val perplexity 50.5975 +675 train 3.980011 (lr=5.6727e-04) (hash(x)=42517622) +676 train 3.904492 (lr=5.6811e-04) (hash(x)=46387198) +677 train 3.931018 (lr=5.6895e-04) (hash(x)=37474737) +678 train 3.924693 (lr=5.6979e-04) (hash(x)=40119687) +679 train 3.942250 (lr=5.7063e-04) (hash(x)=35078420) +680 train 3.873550 (lr=5.7147e-04) (hash(x)=40956046) +681 train 4.044616 (lr=5.7231e-04) (hash(x)=39815653) +682 train 3.952139 (lr=5.7315e-04) (hash(x)=41616566) +683 train 3.975157 (lr=5.7399e-04) (hash(x)=38111701) +684 train 3.917153 (lr=5.7483e-04) (hash(x)=42644878) +685 train 3.999467 (lr=5.7566e-04) (hash(x)=27096192) +686 train 3.943334 (lr=5.7650e-04) (hash(x)=39777738) +687 train 3.885042 (lr=5.7734e-04) (hash(x)=38992627) +688 train 3.941897 (lr=5.7818e-04) (hash(x)=38626326) +689 train 3.922449 (lr=5.7902e-04) (hash(x)=36039839) +690 train 3.919404 (lr=5.7986e-04) (hash(x)=34422707) +691 train 4.039830 (lr=5.8070e-04) (hash(x)=46158211) +692 train 3.907504 (lr=5.8154e-04) (hash(x)=40083122) +693 train 3.881465 (lr=5.8238e-04) (hash(x)=40493600) +694 train 3.921789 (lr=5.8322e-04) (hash(x)=39041798) +695 train 3.927493 (lr=5.8406e-04) (hash(x)=38914095) +696 train 3.886823 (lr=5.8490e-04) (hash(x)=37039873) +697 train 4.027889 (lr=5.8573e-04) (hash(x)=46638102) +698 train 3.896973 (lr=5.8657e-04) (hash(x)=32822910) +699 train 3.894875 (lr=5.8741e-04) (hash(x)=43957881) +700 val loss 3.9326 +700 val perplexity 51.0400 +700 train 3.881636 (lr=5.8825e-04) (hash(x)=39080251) +701 train 3.894817 (lr=5.8909e-04) (hash(x)=37363642) +702 train 3.903635 (lr=5.8993e-04) (hash(x)=42982683) +703 train 3.914458 (lr=5.9077e-04) (hash(x)=39483629) +704 train 3.874629 (lr=5.9161e-04) (hash(x)=42630341) +705 train 3.882901 (lr=5.9245e-04) (hash(x)=44056950) +706 train 3.922693 (lr=5.9329e-04) (hash(x)=43817641) +707 train 3.978151 (lr=5.9413e-04) (hash(x)=44960773) +708 train 3.906362 (lr=5.9497e-04) (hash(x)=42891837) +709 train 3.909367 (lr=5.9580e-04) (hash(x)=38869574) +710 train 3.992977 (lr=5.9664e-04) (hash(x)=37447381) +711 train 3.971006 (lr=5.9748e-04) (hash(x)=40348031) +712 train 3.944895 (lr=5.9832e-04) (hash(x)=38271489) +713 train 4.016311 (lr=5.9916e-04) (hash(x)=37067529) +714 train 3.963895 (lr=6.0000e-04) (hash(x)=38997415) +715 train 3.964758 (lr=6.0000e-04) (hash(x)=37716278) +716 train 3.980748 (lr=6.0000e-04) (hash(x)=37470292) +717 train 4.051491 (lr=6.0000e-04) (hash(x)=32951006) +718 train 3.962289 (lr=6.0000e-04) (hash(x)=38403623) +719 train 3.869091 (lr=5.9999e-04) (hash(x)=39843697) +720 train 3.884332 (lr=5.9999e-04) (hash(x)=35378506) +721 train 3.937107 (lr=5.9998e-04) (hash(x)=40062039) +722 train 3.931679 (lr=5.9998e-04) (hash(x)=37782242) +723 train 4.019020 (lr=5.9997e-04) (hash(x)=41154387) +724 train 3.920989 (lr=5.9997e-04) (hash(x)=44678854) +725 val loss 3.9444 +725 val perplexity 51.6456 +725 train 4.000432 (lr=5.9996e-04) (hash(x)=34961660) +726 train 3.915324 (lr=5.9995e-04) (hash(x)=35021763) +727 train 3.890289 (lr=5.9994e-04) (hash(x)=46217208) +728 train 3.994755 (lr=5.9993e-04) (hash(x)=42952749) +729 train 3.966922 (lr=5.9992e-04) (hash(x)=45499581) +730 train 3.918770 (lr=5.9991e-04) (hash(x)=36523080) +731 train 3.878277 (lr=5.9989e-04) (hash(x)=43229796) +732 train 3.957591 (lr=5.9988e-04) (hash(x)=41719874) +733 train 3.932230 (lr=5.9986e-04) (hash(x)=35761807) +734 train 3.947205 (lr=5.9985e-04) (hash(x)=42546372) +735 train 3.917304 (lr=5.9983e-04) (hash(x)=37986269) +736 train 3.976538 (lr=5.9982e-04) (hash(x)=38055422) +737 train 3.922392 (lr=5.9980e-04) (hash(x)=41093788) +738 train 3.954836 (lr=5.9978e-04) (hash(x)=40997225) +739 train 3.924319 (lr=5.9976e-04) (hash(x)=41735081) +740 train 3.910630 (lr=5.9974e-04) (hash(x)=44933186) +741 train 3.924643 (lr=5.9972e-04) (hash(x)=44152799) +742 train 3.951087 (lr=5.9970e-04) (hash(x)=37414189) +743 train 3.973892 (lr=5.9967e-04) (hash(x)=35429703) +744 train 3.958550 (lr=5.9965e-04) (hash(x)=40175187) +745 train 3.980742 (lr=5.9962e-04) (hash(x)=40122934) +746 train 3.878482 (lr=5.9960e-04) (hash(x)=38277599) +747 train 3.920609 (lr=5.9957e-04) (hash(x)=38243786) +748 train 3.971339 (lr=5.9954e-04) (hash(x)=55171118) +749 train 3.886545 (lr=5.9952e-04) (hash(x)=42119817) +750 val loss 3.9362 +750 val perplexity 51.2249 +750 train 3.967291 (lr=5.9949e-04) (hash(x)=37528047) +751 train 3.950522 (lr=5.9946e-04) (hash(x)=45983138) +752 train 3.954453 (lr=5.9943e-04) (hash(x)=42856523) +753 train 3.952019 (lr=5.9940e-04) (hash(x)=39190898) +754 train 3.884312 (lr=5.9936e-04) (hash(x)=38057524) +755 train 3.935417 (lr=5.9933e-04) (hash(x)=38419763) +756 train 3.976043 (lr=5.9930e-04) (hash(x)=42173888) +757 train 3.916650 (lr=5.9926e-04) (hash(x)=41243641) +758 train 3.890563 (lr=5.9923e-04) (hash(x)=43657272) +759 train 3.912485 (lr=5.9919e-04) (hash(x)=41817875) +760 train 3.907616 (lr=5.9915e-04) (hash(x)=40838176) +761 train 3.928781 (lr=5.9912e-04) (hash(x)=40879958) +762 train 3.927971 (lr=5.9908e-04) (hash(x)=38389305) +763 train 3.948019 (lr=5.9904e-04) (hash(x)=40607210) +764 train 3.859493 (lr=5.9900e-04) (hash(x)=37254101) +765 train 3.986724 (lr=5.9896e-04) (hash(x)=41046639) +766 train 3.964188 (lr=5.9891e-04) (hash(x)=39783224) +767 train 3.900281 (lr=5.9887e-04) (hash(x)=43577291) +768 train 3.910514 (lr=5.9883e-04) (hash(x)=43585715) +769 train 3.889319 (lr=5.9878e-04) (hash(x)=43351915) +770 train 3.913004 (lr=5.9874e-04) (hash(x)=28949932) +771 train 3.928488 (lr=5.9869e-04) (hash(x)=40361419) +772 train 3.970248 (lr=5.9864e-04) (hash(x)=36061970) +773 train 3.913821 (lr=5.9859e-04) (hash(x)=39055994) +774 train 3.977076 (lr=5.9855e-04) (hash(x)=39858114) +775 val loss 3.9406 +775 val perplexity 51.4519 +775 train 3.948256 (lr=5.9850e-04) (hash(x)=45228853) +776 train 3.922258 (lr=5.9845e-04) (hash(x)=41007316) +777 train 3.965634 (lr=5.9839e-04) (hash(x)=37337726) +778 train 3.931684 (lr=5.9834e-04) (hash(x)=38455317) +779 train 3.965815 (lr=5.9829e-04) (hash(x)=36750700) +780 train 3.913942 (lr=5.9824e-04) (hash(x)=38957738) +781 train 3.942941 (lr=5.9818e-04) (hash(x)=43340500) +782 train 3.927706 (lr=5.9812e-04) (hash(x)=39867257) +783 train 3.957249 (lr=5.9807e-04) (hash(x)=39025934) +784 train 3.999262 (lr=5.9801e-04) (hash(x)=35483698) +785 train 3.892890 (lr=5.9795e-04) (hash(x)=36119240) +786 train 3.991087 (lr=5.9789e-04) (hash(x)=40166674) +787 train 3.910872 (lr=5.9784e-04) (hash(x)=46415032) +788 train 3.931558 (lr=5.9777e-04) (hash(x)=52572443) +789 train 3.927254 (lr=5.9771e-04) (hash(x)=51933482) +790 train 3.941753 (lr=5.9765e-04) (hash(x)=35393448) +791 train 3.904380 (lr=5.9759e-04) (hash(x)=43623650) +792 train 3.983484 (lr=5.9752e-04) (hash(x)=37872530) +793 train 3.965390 (lr=5.9746e-04) (hash(x)=40239724) +794 train 3.894158 (lr=5.9739e-04) (hash(x)=41447309) +795 train 3.874352 (lr=5.9733e-04) (hash(x)=36128048) +796 train 3.895689 (lr=5.9726e-04) (hash(x)=35685568) +797 train 3.891604 (lr=5.9719e-04) (hash(x)=42451389) +798 train 3.861352 (lr=5.9712e-04) (hash(x)=40089891) +799 train 3.932593 (lr=5.9705e-04) (hash(x)=40219363) +800 val loss 3.9318 +800 val perplexity 50.9970 +800 train 3.994884 (lr=5.9698e-04) (hash(x)=42474755) +801 train 3.873833 (lr=5.9691e-04) (hash(x)=38453966) +802 train 3.970432 (lr=5.9684e-04) (hash(x)=43863292) +803 train 3.938663 (lr=5.9677e-04) (hash(x)=40324052) +804 train 3.925446 (lr=5.9669e-04) (hash(x)=43548037) +805 train 3.920943 (lr=5.9662e-04) (hash(x)=38516664) +806 train 3.947798 (lr=5.9654e-04) (hash(x)=38702958) +807 train 3.905314 (lr=5.9647e-04) (hash(x)=43531951) +808 train 3.913078 (lr=5.9639e-04) (hash(x)=42134336) +809 train 3.934447 (lr=5.9631e-04) (hash(x)=38879514) +810 train 3.997724 (lr=5.9623e-04) (hash(x)=32943913) +811 train 3.928231 (lr=5.9616e-04) (hash(x)=41912647) +812 train 3.973211 (lr=5.9607e-04) (hash(x)=37153225) +813 train 4.031930 (lr=5.9599e-04) (hash(x)=34506202) +814 train 3.965290 (lr=5.9591e-04) (hash(x)=40983583) +815 train 3.944231 (lr=5.9583e-04) (hash(x)=42668764) +816 train 4.024728 (lr=5.9575e-04) (hash(x)=38453524) +817 train 3.939555 (lr=5.9566e-04) (hash(x)=40100484) +818 train 3.950910 (lr=5.9558e-04) (hash(x)=38519967) +819 train 3.973635 (lr=5.9549e-04) (hash(x)=40239120) +820 train 4.026714 (lr=5.9540e-04) (hash(x)=40761828) +821 train 3.929379 (lr=5.9532e-04) (hash(x)=52429592) +822 train 3.963857 (lr=5.9523e-04) (hash(x)=43521843) +823 train 3.910863 (lr=5.9514e-04) (hash(x)=40395216) +824 train 3.851832 (lr=5.9505e-04) (hash(x)=51713752) +825 val loss 3.9327 +825 val perplexity 51.0427 +825 train 3.938242 (lr=5.9496e-04) (hash(x)=36807070) +826 train 3.931685 (lr=5.9486e-04) (hash(x)=44794843) +827 train 3.970970 (lr=5.9477e-04) (hash(x)=49100903) +828 train 3.931909 (lr=5.9468e-04) (hash(x)=35351558) +829 train 3.903415 (lr=5.9458e-04) (hash(x)=42881689) +830 train 3.904668 (lr=5.9449e-04) (hash(x)=39640932) +831 train 3.983983 (lr=5.9439e-04) (hash(x)=38393419) +832 train 3.905729 (lr=5.9430e-04) (hash(x)=42600434) +833 train 3.956767 (lr=5.9420e-04) (hash(x)=43194327) +834 train 3.920690 (lr=5.9410e-04) (hash(x)=36987143) +835 train 3.941207 (lr=5.9400e-04) (hash(x)=38851912) +836 train 3.882663 (lr=5.9390e-04) (hash(x)=32250931) +837 train 3.928998 (lr=5.9380e-04) (hash(x)=37020079) +838 train 4.007948 (lr=5.9370e-04) (hash(x)=43176619) +839 train 3.961152 (lr=5.9360e-04) (hash(x)=41785073) +840 train 3.853265 (lr=5.9349e-04) (hash(x)=41645417) +841 train 3.940165 (lr=5.9339e-04) (hash(x)=41073484) +842 train 3.941086 (lr=5.9328e-04) (hash(x)=39974880) +843 train 3.932655 (lr=5.9318e-04) (hash(x)=42288363) +844 train 3.900338 (lr=5.9307e-04) (hash(x)=41498532) +845 train 3.939884 (lr=5.9296e-04) (hash(x)=39632055) +846 train 3.974152 (lr=5.9286e-04) (hash(x)=39383942) +847 train 3.959493 (lr=5.9275e-04) (hash(x)=33125847) +848 train 3.956165 (lr=5.9264e-04) (hash(x)=35062147) +849 train 4.019079 (lr=5.9253e-04) (hash(x)=36773864) +850 val loss 3.9294 +850 val perplexity 50.8758 +850 train 3.945986 (lr=5.9241e-04) (hash(x)=38698600) +851 train 4.072665 (lr=5.9230e-04) (hash(x)=43920779) +852 train 3.937315 (lr=5.9219e-04) (hash(x)=44229059) +853 train 3.967475 (lr=5.9208e-04) (hash(x)=34179834) +854 train 4.024686 (lr=5.9196e-04) (hash(x)=38238897) +855 train 3.966960 (lr=5.9185e-04) (hash(x)=43339112) +856 train 3.935577 (lr=5.9173e-04) (hash(x)=40788159) +857 train 3.919321 (lr=5.9161e-04) (hash(x)=39351731) +858 train 3.901103 (lr=5.9149e-04) (hash(x)=34385405) +859 train 3.925479 (lr=5.9138e-04) (hash(x)=37449150) +860 train 3.923503 (lr=5.9126e-04) (hash(x)=38982888) +861 train 3.973137 (lr=5.9114e-04) (hash(x)=37715930) +862 train 3.894149 (lr=5.9101e-04) (hash(x)=36524609) +863 train 3.911928 (lr=5.9089e-04) (hash(x)=33546685) +864 train 3.930174 (lr=5.9077e-04) (hash(x)=38940538) +865 train 3.920861 (lr=5.9065e-04) (hash(x)=45161966) +866 train 3.875911 (lr=5.9052e-04) (hash(x)=45791594) +867 train 3.893018 (lr=5.9040e-04) (hash(x)=42029741) +868 train 3.949128 (lr=5.9027e-04) (hash(x)=38760636) +869 train 3.921783 (lr=5.9014e-04) (hash(x)=43755346) +870 train 3.982633 (lr=5.9002e-04) (hash(x)=40589839) +871 train 3.980992 (lr=5.8989e-04) (hash(x)=42226954) +872 train 3.923978 (lr=5.8976e-04) (hash(x)=42310416) +873 train 3.927274 (lr=5.8963e-04) (hash(x)=40662853) +874 train 3.939073 (lr=5.8950e-04) (hash(x)=36549362) +875 val loss 3.9321 +875 val perplexity 51.0162 +875 train 3.935910 (lr=5.8937e-04) (hash(x)=33609060) +876 train 3.954491 (lr=5.8923e-04) (hash(x)=42490606) +877 train 3.938570 (lr=5.8910e-04) (hash(x)=41831163) +878 train 3.964153 (lr=5.8897e-04) (hash(x)=41323453) +879 train 3.969203 (lr=5.8883e-04) (hash(x)=39338497) +880 train 3.888915 (lr=5.8869e-04) (hash(x)=39977991) +881 train 3.904629 (lr=5.8856e-04) (hash(x)=41549358) +882 train 3.978836 (lr=5.8842e-04) (hash(x)=39679120) +883 train 3.966287 (lr=5.8828e-04) (hash(x)=36113474) +884 train 3.970418 (lr=5.8814e-04) (hash(x)=44708569) +885 train 3.955987 (lr=5.8800e-04) (hash(x)=43626341) +886 train 3.988407 (lr=5.8786e-04) (hash(x)=36523548) +887 train 3.983734 (lr=5.8772e-04) (hash(x)=40566411) +888 train 4.132557 (lr=5.8758e-04) (hash(x)=42741772) +889 train 3.998026 (lr=5.8744e-04) (hash(x)=38529367) +890 train 3.981635 (lr=5.8729e-04) (hash(x)=43910066) +891 train 3.947320 (lr=5.8715e-04) (hash(x)=40876842) +892 train 3.910776 (lr=5.8700e-04) (hash(x)=41234048) +893 train 3.909419 (lr=5.8686e-04) (hash(x)=42074913) +894 train 3.904206 (lr=5.8671e-04) (hash(x)=43115949) +895 train 3.862434 (lr=5.8656e-04) (hash(x)=39141688) +896 train 3.941557 (lr=5.8642e-04) (hash(x)=34546857) +897 train 3.901917 (lr=5.8627e-04) (hash(x)=35466141) +898 train 3.902506 (lr=5.8612e-04) (hash(x)=39645146) +899 train 3.855386 (lr=5.8597e-04) (hash(x)=40285127) +900 val loss 3.9290 +900 val perplexity 50.8552 +900 train 3.918122 (lr=5.8581e-04) (hash(x)=34730751) +901 train 3.908261 (lr=5.8566e-04) (hash(x)=41924509) +902 train 3.946739 (lr=5.8551e-04) (hash(x)=43619335) +903 train 3.941875 (lr=5.8535e-04) (hash(x)=43349580) +904 train 3.999565 (lr=5.8520e-04) (hash(x)=41833238) +905 train 3.933658 (lr=5.8504e-04) (hash(x)=47692928) +906 train 3.938309 (lr=5.8489e-04) (hash(x)=40952478) +907 train 3.981853 (lr=5.8473e-04) (hash(x)=40378928) +908 train 4.038857 (lr=5.8457e-04) (hash(x)=40783680) +909 train 3.881715 (lr=5.8441e-04) (hash(x)=42913549) +910 train 3.917917 (lr=5.8425e-04) (hash(x)=40692727) +911 train 3.894319 (lr=5.8409e-04) (hash(x)=37726499) +912 train 3.933734 (lr=5.8393e-04) (hash(x)=41661185) +913 train 3.911181 (lr=5.8377e-04) (hash(x)=36023729) +914 train 3.987229 (lr=5.8361e-04) (hash(x)=40984699) +915 train 3.984171 (lr=5.8345e-04) (hash(x)=39519671) +916 train 3.918325 (lr=5.8328e-04) (hash(x)=42111263) +917 train 3.965194 (lr=5.8312e-04) (hash(x)=39598069) +918 train 3.948618 (lr=5.8295e-04) (hash(x)=38877825) +919 train 3.960583 (lr=5.8278e-04) (hash(x)=37363438) +920 train 3.984382 (lr=5.8262e-04) (hash(x)=37035517) +921 train 3.952421 (lr=5.8245e-04) (hash(x)=38422361) +922 train 3.969114 (lr=5.8228e-04) (hash(x)=42869663) +923 train 4.013616 (lr=5.8211e-04) (hash(x)=42918418) +924 train 3.971840 (lr=5.8194e-04) (hash(x)=36565282) +925 val loss 3.9316 +925 val perplexity 50.9906 +925 train 3.903394 (lr=5.8177e-04) (hash(x)=42787331) +926 train 3.948708 (lr=5.8160e-04) (hash(x)=46714204) +927 train 3.892556 (lr=5.8142e-04) (hash(x)=37204102) +928 train 3.878578 (lr=5.8125e-04) (hash(x)=37916393) +929 train 3.896698 (lr=5.8107e-04) (hash(x)=42877917) +930 train 3.958746 (lr=5.8090e-04) (hash(x)=34900663) +931 train 3.922946 (lr=5.8072e-04) (hash(x)=38429251) +932 train 3.917778 (lr=5.8055e-04) (hash(x)=39540528) +933 train 3.931687 (lr=5.8037e-04) (hash(x)=41798890) +934 train 3.952096 (lr=5.8019e-04) (hash(x)=38240055) +935 train 3.923293 (lr=5.8001e-04) (hash(x)=38523525) +936 train 3.945485 (lr=5.7983e-04) (hash(x)=40024596) +937 train 3.970985 (lr=5.7965e-04) (hash(x)=46774489) +938 train 3.932669 (lr=5.7947e-04) (hash(x)=42047965) +939 train 3.925340 (lr=5.7929e-04) (hash(x)=41732787) +940 train 3.921493 (lr=5.7911e-04) (hash(x)=38710791) +941 train 3.975593 (lr=5.7892e-04) (hash(x)=40899862) +942 train 3.954704 (lr=5.7874e-04) (hash(x)=37236959) +943 train 3.892841 (lr=5.7855e-04) (hash(x)=40037401) +944 train 3.928921 (lr=5.7837e-04) (hash(x)=40785041) +945 train 3.934536 (lr=5.7818e-04) (hash(x)=43936646) +946 train 3.951128 (lr=5.7799e-04) (hash(x)=43354359) +947 train 3.944148 (lr=5.7780e-04) (hash(x)=37820575) +948 train 3.949564 (lr=5.7761e-04) (hash(x)=47822263) +949 train 4.031469 (lr=5.7742e-04) (hash(x)=43910466) +950 val loss 3.9303 +950 val perplexity 50.9244 +950 train 4.001417 (lr=5.7723e-04) (hash(x)=41274997) +951 train 3.929778 (lr=5.7704e-04) (hash(x)=33540500) +952 train 3.988221 (lr=5.7685e-04) (hash(x)=41870707) +953 train 3.935193 (lr=5.7666e-04) (hash(x)=38713267) +954 train 4.045477 (lr=5.7646e-04) (hash(x)=40757257) +955 train 3.987377 (lr=5.7627e-04) (hash(x)=35663269) +956 train 3.948662 (lr=5.7607e-04) (hash(x)=39681102) +957 train 3.920448 (lr=5.7588e-04) (hash(x)=37482320) +958 train 3.981793 (lr=5.7568e-04) (hash(x)=44716444) +959 train 3.939209 (lr=5.7548e-04) (hash(x)=42117099) +960 train 3.928361 (lr=5.7529e-04) (hash(x)=40926653) +961 train 3.893247 (lr=5.7509e-04) (hash(x)=39800938) +962 train 3.950392 (lr=5.7489e-04) (hash(x)=35294116) +963 train 3.963662 (lr=5.7469e-04) (hash(x)=42226445) +964 train 3.885383 (lr=5.7449e-04) (hash(x)=42391820) +965 train 3.958201 (lr=5.7428e-04) (hash(x)=42438566) +966 train 3.928366 (lr=5.7408e-04) (hash(x)=49363080) +967 train 3.921200 (lr=5.7388e-04) (hash(x)=44113436) +968 train 3.873161 (lr=5.7367e-04) (hash(x)=37497543) +969 train 3.849571 (lr=5.7347e-04) (hash(x)=38632288) +970 train 3.892869 (lr=5.7326e-04) (hash(x)=39325736) +971 train 3.883222 (lr=5.7306e-04) (hash(x)=37310950) +972 train 3.948573 (lr=5.7285e-04) (hash(x)=37907006) +973 train 3.965697 (lr=5.7264e-04) (hash(x)=43449153) +974 train 3.927560 (lr=5.7243e-04) (hash(x)=43580813) +975 val loss 3.9230 +975 val perplexity 50.5537 +975 train 3.937810 (lr=5.7222e-04) (hash(x)=45229538) +976 train 3.953691 (lr=5.7201e-04) (hash(x)=35764199) +977 train 3.910310 (lr=5.7180e-04) (hash(x)=42171751) +978 train 3.885774 (lr=5.7159e-04) (hash(x)=41360538) +979 train 3.973917 (lr=5.7138e-04) (hash(x)=41131783) +980 train 3.966391 (lr=5.7116e-04) (hash(x)=39689081) +981 train 3.967647 (lr=5.7095e-04) (hash(x)=35865506) +982 train 3.903361 (lr=5.7073e-04) (hash(x)=38340724) +983 train 3.942240 (lr=5.7052e-04) (hash(x)=39875677) +984 train 3.945445 (lr=5.7030e-04) (hash(x)=42747809) +985 train 3.993714 (lr=5.7008e-04) (hash(x)=41968338) +986 train 3.939103 (lr=5.6987e-04) (hash(x)=32006007) +987 train 3.974712 (lr=5.6965e-04) (hash(x)=42243964) +988 train 3.952278 (lr=5.6943e-04) (hash(x)=36069330) +989 train 3.933239 (lr=5.6921e-04) (hash(x)=41613796) +990 train 3.961873 (lr=5.6899e-04) (hash(x)=38357484) +991 train 4.007088 (lr=5.6877e-04) (hash(x)=38633949) +992 train 4.002340 (lr=5.6854e-04) (hash(x)=34070204) +993 train 3.950693 (lr=5.6832e-04) (hash(x)=41583007) +994 train 3.947638 (lr=5.6810e-04) (hash(x)=38043738) +995 train 3.916758 (lr=5.6787e-04) (hash(x)=36824424) +996 train 3.897637 (lr=5.6765e-04) (hash(x)=36348681) +997 train 3.907023 (lr=5.6742e-04) (hash(x)=44512817) +998 train 3.911179 (lr=5.6720e-04) (hash(x)=40862648) +999 train 3.915909 (lr=5.6697e-04) (hash(x)=41448922) +1000 val loss 3.9223 +1000 val perplexity 50.5146 +1000 train 3.965521 (lr=5.6674e-04) (hash(x)=34272182) +1001 train 3.944131 (lr=5.6651e-04) (hash(x)=40951954) +1002 train 3.909610 (lr=5.6628e-04) (hash(x)=47487562) +1003 train 3.860866 (lr=5.6605e-04) (hash(x)=38790513) +1004 train 3.886588 (lr=5.6582e-04) (hash(x)=37141506) +1005 train 3.864774 (lr=5.6559e-04) (hash(x)=34227600) +1006 train 3.831444 (lr=5.6536e-04) (hash(x)=36876899) +1007 train 3.946797 (lr=5.6512e-04) (hash(x)=44864425) +1008 train 3.956946 (lr=5.6489e-04) (hash(x)=44048754) +1009 train 3.913130 (lr=5.6465e-04) (hash(x)=36017025) +1010 train 3.857386 (lr=5.6442e-04) (hash(x)=45133559) +1011 train 3.986201 (lr=5.6418e-04) (hash(x)=39306278) +1012 train 3.902658 (lr=5.6395e-04) (hash(x)=40533714) +1013 train 3.924166 (lr=5.6371e-04) (hash(x)=42560040) +1014 train 3.947214 (lr=5.6347e-04) (hash(x)=33695913) +1015 train 4.004626 (lr=5.6323e-04) (hash(x)=41357599) +1016 train 3.939928 (lr=5.6299e-04) (hash(x)=43394408) +1017 train 3.959078 (lr=5.6275e-04) (hash(x)=38756270) +1018 train 3.936986 (lr=5.6251e-04) (hash(x)=34419574) +1019 train 3.920416 (lr=5.6227e-04) (hash(x)=34196599) +1020 train 3.928544 (lr=5.6202e-04) (hash(x)=38879982) +1021 train 3.980324 (lr=5.6178e-04) (hash(x)=37736438) +1022 train 4.016722 (lr=5.6154e-04) (hash(x)=42847771) +1023 train 3.968665 (lr=5.6129e-04) (hash(x)=32689610) +1024 train 3.963441 (lr=5.6105e-04) (hash(x)=42555118) +1025 val loss 3.9223 +1025 val perplexity 50.5158 +1025 train 3.961635 (lr=5.6080e-04) (hash(x)=42075074) +1026 train 3.939561 (lr=5.6055e-04) (hash(x)=38328921) +1027 train 3.901815 (lr=5.6031e-04) (hash(x)=37282910) +1028 train 3.923902 (lr=5.6006e-04) (hash(x)=47414779) +1029 train 3.959723 (lr=5.5981e-04) (hash(x)=37797506) +1030 train 3.908282 (lr=5.5956e-04) (hash(x)=39693499) +1031 train 3.927383 (lr=5.5931e-04) (hash(x)=44165621) +1032 train 3.920769 (lr=5.5906e-04) (hash(x)=44137718) +1033 train 3.895254 (lr=5.5880e-04) (hash(x)=37552609) +1034 train 3.914616 (lr=5.5855e-04) (hash(x)=39477202) +1035 train 3.892040 (lr=5.5830e-04) (hash(x)=39135773) +1036 train 3.860164 (lr=5.5804e-04) (hash(x)=44206068) +1037 train 3.859274 (lr=5.5779e-04) (hash(x)=39207820) +1038 train 3.884290 (lr=5.5753e-04) (hash(x)=43271155) +1039 train 3.831087 (lr=5.5728e-04) (hash(x)=32523181) +1040 train 3.922941 (lr=5.5702e-04) (hash(x)=42083156) +1041 train 3.915904 (lr=5.5676e-04) (hash(x)=39199670) +1042 train 3.928778 (lr=5.5651e-04) (hash(x)=34196519) +1043 train 3.914561 (lr=5.5625e-04) (hash(x)=41694768) +1044 train 3.948164 (lr=5.5599e-04) (hash(x)=42866277) +1045 train 3.894003 (lr=5.5573e-04) (hash(x)=38919517) +1046 train 3.910673 (lr=5.5547e-04) (hash(x)=39986006) +1047 train 3.878724 (lr=5.5520e-04) (hash(x)=37800529) +1048 train 3.902539 (lr=5.5494e-04) (hash(x)=40327984) +1049 train 3.890513 (lr=5.5468e-04) (hash(x)=40607060) +1050 val loss 3.9166 +1050 val perplexity 50.2294 +1050 train 3.903963 (lr=5.5441e-04) (hash(x)=38824975) +1051 train 3.948347 (lr=5.5415e-04) (hash(x)=35544208) +1052 train 3.952345 (lr=5.5388e-04) (hash(x)=43475460) +1053 train 3.941226 (lr=5.5362e-04) (hash(x)=42801534) +1054 train 3.911246 (lr=5.5335e-04) (hash(x)=50227217) +1055 train 4.029135 (lr=5.5308e-04) (hash(x)=45474740) +1056 train 3.953032 (lr=5.5282e-04) (hash(x)=44473629) +1057 train 3.951145 (lr=5.5255e-04) (hash(x)=42494749) +1058 train 3.962261 (lr=5.5228e-04) (hash(x)=45152618) +1059 train 3.956590 (lr=5.5201e-04) (hash(x)=49452418) +1060 train 3.990839 (lr=5.5174e-04) (hash(x)=38711051) +1061 train 3.911048 (lr=5.5147e-04) (hash(x)=36136230) +1062 train 3.942118 (lr=5.5119e-04) (hash(x)=36839800) +1063 train 3.954549 (lr=5.5092e-04) (hash(x)=41508425) +1064 train 3.905490 (lr=5.5065e-04) (hash(x)=42297102) +1065 train 3.907057 (lr=5.5037e-04) (hash(x)=41744192) +1066 train 3.836963 (lr=5.5010e-04) (hash(x)=39750459) +1067 train 3.887221 (lr=5.4982e-04) (hash(x)=43490786) +1068 train 3.901114 (lr=5.4955e-04) (hash(x)=35309842) +1069 train 3.872139 (lr=5.4927e-04) (hash(x)=49017843) +1070 train 3.910966 (lr=5.4899e-04) (hash(x)=33027215) +1071 train 3.922523 (lr=5.4871e-04) (hash(x)=47815749) +1072 train 3.920423 (lr=5.4843e-04) (hash(x)=37351497) +1073 train 3.852818 (lr=5.4815e-04) (hash(x)=39073277) +1074 train 3.860718 (lr=5.4787e-04) (hash(x)=37198639) +1075 val loss 3.9185 +1075 val perplexity 50.3231 +1075 train 3.894468 (lr=5.4759e-04) (hash(x)=42404192) +1076 train 3.932950 (lr=5.4731e-04) (hash(x)=40085297) +1077 train 3.910384 (lr=5.4703e-04) (hash(x)=37711996) +1078 train 3.939596 (lr=5.4675e-04) (hash(x)=37781715) +1079 train 3.952103 (lr=5.4646e-04) (hash(x)=41656937) +1080 train 3.972744 (lr=5.4618e-04) (hash(x)=34720431) +1081 train 3.935417 (lr=5.4589e-04) (hash(x)=44085371) +1082 train 4.014768 (lr=5.4561e-04) (hash(x)=43230748) +1083 train 3.932413 (lr=5.4532e-04) (hash(x)=38651134) +1084 train 3.916990 (lr=5.4503e-04) (hash(x)=44237369) +1085 train 4.083557 (lr=5.4475e-04) (hash(x)=44830119) +1086 train 3.954165 (lr=5.4446e-04) (hash(x)=42059403) +1087 train 3.959549 (lr=5.4417e-04) (hash(x)=42438880) +1088 train 3.861588 (lr=5.4388e-04) (hash(x)=41465539) +1089 train 4.028632 (lr=5.4359e-04) (hash(x)=43262926) +1090 train 3.964155 (lr=5.4330e-04) (hash(x)=37845028) +1091 train 3.949335 (lr=5.4301e-04) (hash(x)=38314905) +1092 train 3.961001 (lr=5.4271e-04) (hash(x)=36881427) +1093 train 3.918009 (lr=5.4242e-04) (hash(x)=39356652) +1094 train 3.961380 (lr=5.4213e-04) (hash(x)=33455755) +1095 train 3.912809 (lr=5.4183e-04) (hash(x)=37962850) +1096 train 3.939011 (lr=5.4154e-04) (hash(x)=34763555) +1097 train 3.949151 (lr=5.4124e-04) (hash(x)=39652134) +1098 train 3.891271 (lr=5.4095e-04) (hash(x)=42909474) +1099 train 3.869338 (lr=5.4065e-04) (hash(x)=41341019) +1100 val loss 3.9148 +1100 val perplexity 50.1371 +1100 train 3.916449 (lr=5.4035e-04) (hash(x)=40560805) +1101 train 3.918781 (lr=5.4005e-04) (hash(x)=36855993) +1102 train 3.880409 (lr=5.3975e-04) (hash(x)=41973757) +1103 train 3.924272 (lr=5.3946e-04) (hash(x)=41199463) +1104 train 3.884873 (lr=5.3915e-04) (hash(x)=35553657) +1105 train 3.933090 (lr=5.3885e-04) (hash(x)=34971650) +1106 train 3.894217 (lr=5.3855e-04) (hash(x)=32184090) +1107 train 3.881969 (lr=5.3825e-04) (hash(x)=49098979) +1108 train 3.911630 (lr=5.3795e-04) (hash(x)=35147316) +1109 train 3.867978 (lr=5.3764e-04) (hash(x)=36965560) +1110 train 3.880740 (lr=5.3734e-04) (hash(x)=39035477) +1111 train 3.993434 (lr=5.3704e-04) (hash(x)=41260704) +1112 train 3.934479 (lr=5.3673e-04) (hash(x)=44955748) +1113 train 3.951018 (lr=5.3642e-04) (hash(x)=37043067) +1114 train 3.920901 (lr=5.3612e-04) (hash(x)=43852802) +1115 train 3.983172 (lr=5.3581e-04) (hash(x)=37622092) +1116 train 3.978838 (lr=5.3550e-04) (hash(x)=37293705) +1117 train 3.886415 (lr=5.3519e-04) (hash(x)=40808945) +1118 train 3.923116 (lr=5.3488e-04) (hash(x)=47564987) +1119 train 3.910658 (lr=5.3457e-04) (hash(x)=28341513) +1120 train 3.970764 (lr=5.3426e-04) (hash(x)=41011337) +1121 train 3.914766 (lr=5.3395e-04) (hash(x)=40264632) +1122 train 3.985860 (lr=5.3364e-04) (hash(x)=42252745) +1123 train 3.915807 (lr=5.3333e-04) (hash(x)=38064968) +1124 train 3.937629 (lr=5.3302e-04) (hash(x)=41137282) +1125 val loss 3.9172 +1125 val perplexity 50.2588 +1125 train 3.953011 (lr=5.3270e-04) (hash(x)=33340392) +1126 train 3.975278 (lr=5.3239e-04) (hash(x)=34365654) +1127 train 3.945652 (lr=5.3207e-04) (hash(x)=39420935) +1128 train 3.937716 (lr=5.3176e-04) (hash(x)=36764430) +1129 train 3.954950 (lr=5.3144e-04) (hash(x)=30368244) +1130 train 3.937807 (lr=5.3113e-04) (hash(x)=36709109) +1131 train 3.937109 (lr=5.3081e-04) (hash(x)=40882907) +1132 train 3.959936 (lr=5.3049e-04) (hash(x)=35178548) +1133 train 3.917560 (lr=5.3017e-04) (hash(x)=39602696) +1134 train 3.876654 (lr=5.2985e-04) (hash(x)=41634000) +1135 train 3.877423 (lr=5.2953e-04) (hash(x)=37528804) +1136 train 3.916847 (lr=5.2921e-04) (hash(x)=46221150) +1137 train 3.863974 (lr=5.2889e-04) (hash(x)=31367667) +1138 train 3.849113 (lr=5.2857e-04) (hash(x)=39299672) +1139 train 3.867191 (lr=5.2825e-04) (hash(x)=36586333) +1140 train 3.878786 (lr=5.2792e-04) (hash(x)=50843873) +1141 train 3.928541 (lr=5.2760e-04) (hash(x)=42668057) +1142 train 3.885930 (lr=5.2728e-04) (hash(x)=41979243) +1143 train 3.898152 (lr=5.2695e-04) (hash(x)=38802957) +1144 train 3.906897 (lr=5.2663e-04) (hash(x)=41299617) +1145 train 3.916712 (lr=5.2630e-04) (hash(x)=38788675) +1146 train 3.920725 (lr=5.2597e-04) (hash(x)=43275448) +1147 train 3.999310 (lr=5.2565e-04) (hash(x)=42320176) +1148 train 3.853028 (lr=5.2532e-04) (hash(x)=36971094) +1149 train 4.000334 (lr=5.2499e-04) (hash(x)=37598284) +1150 val loss 3.9098 +1150 val perplexity 49.8878 +1150 train 3.930282 (lr=5.2466e-04) (hash(x)=40241935) +1151 train 3.946131 (lr=5.2433e-04) (hash(x)=41845756) +1152 train 3.894723 (lr=5.2400e-04) (hash(x)=42705446) +1153 train 3.835078 (lr=5.2367e-04) (hash(x)=43396513) +1154 train 3.760438 (lr=5.2334e-04) (hash(x)=37977061) +1155 train 3.919055 (lr=5.2301e-04) (hash(x)=36796923) +1156 train 3.963028 (lr=5.2267e-04) (hash(x)=38477953) +1157 train 3.925420 (lr=5.2234e-04) (hash(x)=32610098) +1158 train 3.955029 (lr=5.2201e-04) (hash(x)=41713283) +1159 train 3.948960 (lr=5.2167e-04) (hash(x)=37400246) +1160 train 3.989330 (lr=5.2134e-04) (hash(x)=38934591) +1161 train 4.012727 (lr=5.2100e-04) (hash(x)=43372128) +1162 train 3.951899 (lr=5.2067e-04) (hash(x)=38696389) +1163 train 3.906939 (lr=5.2033e-04) (hash(x)=36956777) +1164 train 3.975944 (lr=5.1999e-04) (hash(x)=42740600) +1165 train 3.911454 (lr=5.1965e-04) (hash(x)=34101104) +1166 train 4.036081 (lr=5.1932e-04) (hash(x)=38653784) +1167 train 4.023912 (lr=5.1898e-04) (hash(x)=42044550) +1168 train 3.872003 (lr=5.1864e-04) (hash(x)=35655933) +1169 train 3.877356 (lr=5.1830e-04) (hash(x)=34975746) +1170 train 3.893447 (lr=5.1796e-04) (hash(x)=41292799) +1171 train 3.893014 (lr=5.1761e-04) (hash(x)=39653081) +1172 train 3.866527 (lr=5.1727e-04) (hash(x)=40839697) +1173 train 3.856145 (lr=5.1693e-04) (hash(x)=40174053) +1174 train 3.841046 (lr=5.1659e-04) (hash(x)=41620921) +1175 val loss 3.9135 +1175 val perplexity 50.0734 +1175 train 3.916482 (lr=5.1624e-04) (hash(x)=38648993) +1176 train 3.836964 (lr=5.1590e-04) (hash(x)=36532874) +1177 train 3.899209 (lr=5.1555e-04) (hash(x)=39843841) +1178 train 3.864567 (lr=5.1521e-04) (hash(x)=40107816) +1179 train 3.904245 (lr=5.1486e-04) (hash(x)=37723569) +1180 train 3.962282 (lr=5.1452e-04) (hash(x)=40675612) +1181 train 3.934659 (lr=5.1417e-04) (hash(x)=42947281) +1182 train 3.908488 (lr=5.1382e-04) (hash(x)=39764186) +1183 train 3.899031 (lr=5.1347e-04) (hash(x)=53036852) +1184 train 3.929236 (lr=5.1312e-04) (hash(x)=39087616) +1185 train 3.999620 (lr=5.1277e-04) (hash(x)=39227231) +1186 train 3.939773 (lr=5.1242e-04) (hash(x)=41132107) +1187 train 3.945382 (lr=5.1207e-04) (hash(x)=36632968) +1188 train 3.946747 (lr=5.1172e-04) (hash(x)=42392726) +1189 train 3.970589 (lr=5.1137e-04) (hash(x)=34929056) +1190 train 3.936333 (lr=5.1102e-04) (hash(x)=33876766) +1191 train 3.974137 (lr=5.1067e-04) (hash(x)=43424315) +1192 train 3.927297 (lr=5.1031e-04) (hash(x)=33392589) +1193 train 3.998593 (lr=5.0996e-04) (hash(x)=33958793) +1194 train 3.904512 (lr=5.0960e-04) (hash(x)=37811842) +1195 train 3.955267 (lr=5.0925e-04) (hash(x)=42436326) +1196 train 3.927253 (lr=5.0889e-04) (hash(x)=40437157) +1197 train 3.931869 (lr=5.0854e-04) (hash(x)=37903917) +1198 train 3.938747 (lr=5.0818e-04) (hash(x)=39915644) +1199 train 4.050231 (lr=5.0782e-04) (hash(x)=37479096) +1200 val loss 3.9097 +1200 val perplexity 49.8836 +1200 train 3.948855 (lr=5.0746e-04) (hash(x)=36033678) +1201 train 3.959306 (lr=5.0711e-04) (hash(x)=38402994) +1202 train 3.870855 (lr=5.0675e-04) (hash(x)=34247547) +1203 train 3.905335 (lr=5.0639e-04) (hash(x)=35963132) +1204 train 3.945591 (lr=5.0603e-04) (hash(x)=38794096) +1205 train 3.899985 (lr=5.0567e-04) (hash(x)=44539609) +1206 train 3.901940 (lr=5.0531e-04) (hash(x)=33334302) +1207 train 3.896625 (lr=5.0494e-04) (hash(x)=42784771) +1208 train 3.948766 (lr=5.0458e-04) (hash(x)=39712686) +1209 train 3.882130 (lr=5.0422e-04) (hash(x)=37541003) +1210 train 3.901161 (lr=5.0386e-04) (hash(x)=34465839) +1211 train 3.870388 (lr=5.0349e-04) (hash(x)=40142419) +1212 train 3.874183 (lr=5.0313e-04) (hash(x)=35662505) +1213 train 3.952955 (lr=5.0276e-04) (hash(x)=40049244) +1214 train 3.860060 (lr=5.0240e-04) (hash(x)=44884942) +1215 train 3.823457 (lr=5.0203e-04) (hash(x)=40649963) +1216 train 3.903499 (lr=5.0166e-04) (hash(x)=40311978) +1217 train 3.891204 (lr=5.0130e-04) (hash(x)=40015429) +1218 train 3.921021 (lr=5.0093e-04) (hash(x)=42279741) +1219 train 3.883350 (lr=5.0056e-04) (hash(x)=45004652) +1220 train 3.851842 (lr=5.0019e-04) (hash(x)=39667504) +1221 train 3.858281 (lr=4.9982e-04) (hash(x)=37079643) +1222 train 3.933222 (lr=4.9945e-04) (hash(x)=38700014) +1223 train 3.856160 (lr=4.9908e-04) (hash(x)=44849759) +1224 train 3.832405 (lr=4.9871e-04) (hash(x)=40219873) +1225 val loss 3.9053 +1225 val perplexity 49.6648 +1225 train 3.879165 (lr=4.9834e-04) (hash(x)=41728126) +1226 train 3.914027 (lr=4.9797e-04) (hash(x)=40669585) +1227 train 3.979441 (lr=4.9760e-04) (hash(x)=34813246) +1228 train 3.957683 (lr=4.9723e-04) (hash(x)=39751412) +1229 train 3.928717 (lr=4.9685e-04) (hash(x)=41059352) +1230 train 3.938504 (lr=4.9648e-04) (hash(x)=40747888) +1231 train 3.993985 (lr=4.9610e-04) (hash(x)=37402403) +1232 train 3.951302 (lr=4.9573e-04) (hash(x)=35128296) +1233 train 3.851964 (lr=4.9535e-04) (hash(x)=45346348) +1234 train 3.943573 (lr=4.9498e-04) (hash(x)=29711085) +1235 train 3.902007 (lr=4.9460e-04) (hash(x)=36024564) +1236 train 3.901617 (lr=4.9422e-04) (hash(x)=44708877) +1237 train 3.937624 (lr=4.9385e-04) (hash(x)=44872444) +1238 train 3.866732 (lr=4.9347e-04) (hash(x)=34147161) +1239 train 3.898949 (lr=4.9309e-04) (hash(x)=37133519) +1240 train 3.898762 (lr=4.9271e-04) (hash(x)=37827214) +1241 train 3.863565 (lr=4.9233e-04) (hash(x)=40360117) +1242 train 3.883421 (lr=4.9195e-04) (hash(x)=33583679) +1243 train 3.863061 (lr=4.9157e-04) (hash(x)=48955821) +1244 train 3.855265 (lr=4.9119e-04) (hash(x)=36796713) +1245 train 3.909855 (lr=4.9081e-04) (hash(x)=49969571) +1246 train 3.827265 (lr=4.9043e-04) (hash(x)=38414404) +1247 train 3.913578 (lr=4.9004e-04) (hash(x)=38923250) +1248 train 3.865974 (lr=4.8966e-04) (hash(x)=37694115) +1249 train 3.896384 (lr=4.8928e-04) (hash(x)=46715357) +1250 val loss 3.9031 +1250 val perplexity 49.5543 +1250 train 3.900003 (lr=4.8889e-04) (hash(x)=40663145) +1251 train 3.848935 (lr=4.8851e-04) (hash(x)=32741584) +1252 train 3.851664 (lr=4.8812e-04) (hash(x)=42678438) +1253 train 3.892893 (lr=4.8774e-04) (hash(x)=38541454) +1254 train 3.872412 (lr=4.8735e-04) (hash(x)=38332251) +1255 train 3.884717 (lr=4.8697e-04) (hash(x)=33815485) +1256 train 3.855556 (lr=4.8658e-04) (hash(x)=38997342) +1257 train 3.875366 (lr=4.8619e-04) (hash(x)=38707421) +1258 train 3.901960 (lr=4.8580e-04) (hash(x)=42210912) +1259 train 3.855776 (lr=4.8542e-04) (hash(x)=39362727) +1260 train 3.883826 (lr=4.8503e-04) (hash(x)=43174937) +1261 train 3.908904 (lr=4.8464e-04) (hash(x)=42865597) +1262 train 3.966709 (lr=4.8425e-04) (hash(x)=42891556) +1263 train 3.936115 (lr=4.8386e-04) (hash(x)=39268233) +1264 train 3.981720 (lr=4.8347e-04) (hash(x)=46093971) +1265 train 3.900629 (lr=4.8308e-04) (hash(x)=38333163) +1266 train 3.926666 (lr=4.8268e-04) (hash(x)=41140734) +1267 train 3.904927 (lr=4.8229e-04) (hash(x)=35512210) +1268 train 3.900600 (lr=4.8190e-04) (hash(x)=38161434) +1269 train 3.905323 (lr=4.8151e-04) (hash(x)=41280683) +1270 train 3.906125 (lr=4.8111e-04) (hash(x)=38630422) +1271 train 3.968302 (lr=4.8072e-04) (hash(x)=44321046) +1272 train 3.937365 (lr=4.8032e-04) (hash(x)=43488673) +1273 train 3.966146 (lr=4.7993e-04) (hash(x)=40268424) +1274 train 3.865010 (lr=4.7953e-04) (hash(x)=38443566) +1275 val loss 3.8988 +1275 val perplexity 49.3412 +1275 train 3.927174 (lr=4.7914e-04) (hash(x)=42334345) +1276 train 3.921507 (lr=4.7874e-04) (hash(x)=35030628) +1277 train 3.944926 (lr=4.7835e-04) (hash(x)=43580445) +1278 train 3.864147 (lr=4.7795e-04) (hash(x)=36679759) +1279 train 3.851919 (lr=4.7755e-04) (hash(x)=39311813) +1280 train 3.859385 (lr=4.7715e-04) (hash(x)=43154744) +1281 train 3.791480 (lr=4.7675e-04) (hash(x)=42799678) +1282 train 3.850129 (lr=4.7635e-04) (hash(x)=41149340) +1283 train 3.897139 (lr=4.7595e-04) (hash(x)=37060133) +1284 train 3.873930 (lr=4.7555e-04) (hash(x)=38684176) +1285 train 3.891213 (lr=4.7515e-04) (hash(x)=48170271) +1286 train 3.873615 (lr=4.7475e-04) (hash(x)=36876077) +1287 train 3.892973 (lr=4.7435e-04) (hash(x)=38848305) +1288 train 3.892890 (lr=4.7395e-04) (hash(x)=44085705) +1289 train 3.942321 (lr=4.7355e-04) (hash(x)=38623132) +1290 train 3.883278 (lr=4.7315e-04) (hash(x)=38056717) +1291 train 3.901766 (lr=4.7274e-04) (hash(x)=39178256) +1292 train 3.902114 (lr=4.7234e-04) (hash(x)=33708196) +1293 train 3.899998 (lr=4.7193e-04) (hash(x)=44207417) +1294 train 3.873982 (lr=4.7153e-04) (hash(x)=40197190) +1295 train 3.847621 (lr=4.7113e-04) (hash(x)=36680387) +1296 train 3.938995 (lr=4.7072e-04) (hash(x)=40047699) +1297 train 3.986392 (lr=4.7031e-04) (hash(x)=37313189) +1298 train 3.918041 (lr=4.6991e-04) (hash(x)=37766051) +1299 train 3.959065 (lr=4.6950e-04) (hash(x)=38284242) +1300 val loss 3.8928 +1300 val perplexity 49.0470 +1300 train 3.891171 (lr=4.6909e-04) (hash(x)=39641933) +1301 train 4.001854 (lr=4.6869e-04) (hash(x)=43938109) +1302 train 3.897438 (lr=4.6828e-04) (hash(x)=39849275) +1303 train 3.936044 (lr=4.6787e-04) (hash(x)=41655148) +1304 train 3.929419 (lr=4.6746e-04) (hash(x)=41033746) +1305 train 3.931777 (lr=4.6705e-04) (hash(x)=39674763) +1306 train 3.941201 (lr=4.6664e-04) (hash(x)=42068709) +1307 train 3.912880 (lr=4.6623e-04) (hash(x)=44556374) +1308 train 3.858922 (lr=4.6582e-04) (hash(x)=38496641) +1309 train 3.844950 (lr=4.6541e-04) (hash(x)=40040694) +1310 train 3.923794 (lr=4.6500e-04) (hash(x)=38659362) +1311 train 3.866172 (lr=4.6459e-04) (hash(x)=41179013) +1312 train 3.856927 (lr=4.6418e-04) (hash(x)=36863007) +1313 train 3.822062 (lr=4.6376e-04) (hash(x)=42430839) +1314 train 3.824650 (lr=4.6335e-04) (hash(x)=35396514) +1315 train 3.871274 (lr=4.6294e-04) (hash(x)=39090715) +1316 train 3.912968 (lr=4.6252e-04) (hash(x)=40270889) +1317 train 3.936815 (lr=4.6211e-04) (hash(x)=31528194) +1318 train 3.862797 (lr=4.6169e-04) (hash(x)=33281151) +1319 train 3.903363 (lr=4.6128e-04) (hash(x)=42398183) +1320 train 3.897626 (lr=4.6086e-04) (hash(x)=40378595) +1321 train 3.865008 (lr=4.6045e-04) (hash(x)=41383332) +1322 train 4.013382 (lr=4.6003e-04) (hash(x)=43095626) +1323 train 3.857874 (lr=4.5962e-04) (hash(x)=43149742) +1324 train 3.974930 (lr=4.5920e-04) (hash(x)=41370699) +1325 val loss 3.8954 +1325 val perplexity 49.1778 +1325 train 3.883108 (lr=4.5878e-04) (hash(x)=40933099) +1326 train 3.953423 (lr=4.5836e-04) (hash(x)=44865841) +1327 train 3.934145 (lr=4.5794e-04) (hash(x)=42879866) +1328 train 3.883750 (lr=4.5753e-04) (hash(x)=40356088) +1329 train 3.918973 (lr=4.5711e-04) (hash(x)=49481354) +1330 train 3.916904 (lr=4.5669e-04) (hash(x)=40612238) +1331 train 3.939579 (lr=4.5627e-04) (hash(x)=43436017) +1332 train 3.893400 (lr=4.5585e-04) (hash(x)=42086838) +1333 train 3.955024 (lr=4.5543e-04) (hash(x)=39906046) +1334 train 3.945915 (lr=4.5501e-04) (hash(x)=43884680) +1335 train 3.943354 (lr=4.5458e-04) (hash(x)=41650569) +1336 train 3.928137 (lr=4.5416e-04) (hash(x)=33509399) +1337 train 3.929173 (lr=4.5374e-04) (hash(x)=36797403) +1338 train 4.003134 (lr=4.5332e-04) (hash(x)=39433285) +1339 train 3.932567 (lr=4.5289e-04) (hash(x)=34748819) +1340 train 3.919448 (lr=4.5247e-04) (hash(x)=38164430) +1341 train 3.920207 (lr=4.5205e-04) (hash(x)=41155318) +1342 train 3.905877 (lr=4.5162e-04) (hash(x)=39828444) +1343 train 3.993318 (lr=4.5120e-04) (hash(x)=40665148) +1344 train 3.918101 (lr=4.5077e-04) (hash(x)=37246672) +1345 train 3.881290 (lr=4.5035e-04) (hash(x)=38025956) +1346 train 3.847164 (lr=4.4992e-04) (hash(x)=39452682) +1347 train 3.892815 (lr=4.4950e-04) (hash(x)=37170327) +1348 train 3.867805 (lr=4.4907e-04) (hash(x)=39862974) +1349 train 3.889949 (lr=4.4864e-04) (hash(x)=39017499) +1350 val loss 3.8905 +1350 val perplexity 48.9352 +1350 train 3.917588 (lr=4.4822e-04) (hash(x)=36939510) +1351 train 3.886204 (lr=4.4779e-04) (hash(x)=41912309) +1352 train 3.816982 (lr=4.4736e-04) (hash(x)=45976341) +1353 train 3.871327 (lr=4.4693e-04) (hash(x)=41333377) +1354 train 3.808421 (lr=4.4651e-04) (hash(x)=47027403) +1355 train 3.828454 (lr=4.4608e-04) (hash(x)=34976148) +1356 train 3.873611 (lr=4.4565e-04) (hash(x)=38966078) +1357 train 3.874091 (lr=4.4522e-04) (hash(x)=37203373) +1358 train 3.883724 (lr=4.4479e-04) (hash(x)=41995895) +1359 train 3.911620 (lr=4.4436e-04) (hash(x)=39480082) +1360 train 3.861668 (lr=4.4393e-04) (hash(x)=41997670) +1361 train 3.883086 (lr=4.4350e-04) (hash(x)=41347986) +1362 train 3.951373 (lr=4.4307e-04) (hash(x)=42771500) +1363 train 3.863725 (lr=4.4263e-04) (hash(x)=46129276) +1364 train 3.871984 (lr=4.4220e-04) (hash(x)=44382069) +1365 train 3.915846 (lr=4.4177e-04) (hash(x)=39119520) +1366 train 3.908914 (lr=4.4134e-04) (hash(x)=38499319) +1367 train 3.933050 (lr=4.4090e-04) (hash(x)=41281820) +1368 train 3.955879 (lr=4.4047e-04) (hash(x)=45937223) +1369 train 3.957287 (lr=4.4004e-04) (hash(x)=35028762) +1370 train 3.946336 (lr=4.3960e-04) (hash(x)=35366176) +1371 train 3.934758 (lr=4.3917e-04) (hash(x)=45630174) +1372 train 3.936662 (lr=4.3873e-04) (hash(x)=41910070) +1373 train 3.947383 (lr=4.3830e-04) (hash(x)=36125406) +1374 train 3.931239 (lr=4.3786e-04) (hash(x)=38415926) +1375 val loss 3.8893 +1375 val perplexity 48.8791 +1375 train 3.890212 (lr=4.3743e-04) (hash(x)=42819817) +1376 train 3.899067 (lr=4.3699e-04) (hash(x)=33779400) +1377 train 3.916103 (lr=4.3655e-04) (hash(x)=38488666) +1378 train 3.928525 (lr=4.3612e-04) (hash(x)=47385754) +1379 train 3.867427 (lr=4.3568e-04) (hash(x)=36993972) +1380 train 3.853868 (lr=4.3524e-04) (hash(x)=41899836) +1381 train 3.854429 (lr=4.3480e-04) (hash(x)=44167005) +1382 train 3.858767 (lr=4.3437e-04) (hash(x)=39065683) +1383 train 3.864419 (lr=4.3393e-04) (hash(x)=46103629) +1384 train 3.943118 (lr=4.3349e-04) (hash(x)=38345113) +1385 train 3.910729 (lr=4.3305e-04) (hash(x)=39768527) +1386 train 3.899460 (lr=4.3261e-04) (hash(x)=52101109) +1387 train 3.862000 (lr=4.3217e-04) (hash(x)=45796944) +1388 train 3.924715 (lr=4.3173e-04) (hash(x)=35243409) +1389 train 3.896152 (lr=4.3129e-04) (hash(x)=36391771) +1390 train 3.888381 (lr=4.3085e-04) (hash(x)=38160032) +1391 train 3.992512 (lr=4.3041e-04) (hash(x)=39523477) +1392 train 3.884790 (lr=4.2997e-04) (hash(x)=45342108) +1393 train 3.854874 (lr=4.2953e-04) (hash(x)=40575068) +1394 train 3.856329 (lr=4.2908e-04) (hash(x)=42087419) +1395 train 3.855596 (lr=4.2864e-04) (hash(x)=44533589) +1396 train 3.846492 (lr=4.2820e-04) (hash(x)=43679812) +1397 train 3.875684 (lr=4.2776e-04) (hash(x)=38864512) +1398 train 3.879281 (lr=4.2731e-04) (hash(x)=39414505) +1399 train 3.906952 (lr=4.2687e-04) (hash(x)=46340933) +1400 val loss 3.8828 +1400 val perplexity 48.5611 +1400 train 3.871735 (lr=4.2643e-04) (hash(x)=42490666) +1401 train 3.886252 (lr=4.2598e-04) (hash(x)=36427066) +1402 train 3.888371 (lr=4.2554e-04) (hash(x)=40963468) +1403 train 3.899438 (lr=4.2509e-04) (hash(x)=41855128) +1404 train 3.892738 (lr=4.2465e-04) (hash(x)=39233377) +1405 train 3.952327 (lr=4.2420e-04) (hash(x)=38486535) +1406 train 3.957063 (lr=4.2376e-04) (hash(x)=38441645) +1407 train 3.910203 (lr=4.2331e-04) (hash(x)=28516958) +1408 train 3.966781 (lr=4.2287e-04) (hash(x)=37805186) +1409 train 3.918784 (lr=4.2242e-04) (hash(x)=37481550) +1410 train 3.942316 (lr=4.2197e-04) (hash(x)=38733218) +1411 train 3.894591 (lr=4.2153e-04) (hash(x)=36045102) +1412 train 3.877861 (lr=4.2108e-04) (hash(x)=35341840) +1413 train 3.860137 (lr=4.2063e-04) (hash(x)=45848409) +1414 train 3.821226 (lr=4.2018e-04) (hash(x)=38980386) +1415 train 3.898959 (lr=4.1974e-04) (hash(x)=41124656) +1416 train 3.869988 (lr=4.1929e-04) (hash(x)=35720386) +1417 train 3.884906 (lr=4.1884e-04) (hash(x)=33990585) +1418 train 3.858839 (lr=4.1839e-04) (hash(x)=43737398) +1419 train 3.870673 (lr=4.1794e-04) (hash(x)=40120991) +1420 train 3.906587 (lr=4.1749e-04) (hash(x)=38250768) +1421 train 3.894596 (lr=4.1704e-04) (hash(x)=39064500) +1422 train 3.893702 (lr=4.1659e-04) (hash(x)=40888030) +1423 train 3.863409 (lr=4.1614e-04) (hash(x)=35731950) +1424 train 3.877488 (lr=4.1569e-04) (hash(x)=35678673) +1425 val loss 3.8818 +1425 val perplexity 48.5126 +1425 train 3.832700 (lr=4.1524e-04) (hash(x)=40805713) +1426 train 3.874631 (lr=4.1479e-04) (hash(x)=37262458) +1427 train 3.850188 (lr=4.1434e-04) (hash(x)=41996436) +1428 train 3.793015 (lr=4.1389e-04) (hash(x)=41958406) +1429 train 3.930429 (lr=4.1343e-04) (hash(x)=44139550) +1430 train 3.819598 (lr=4.1298e-04) (hash(x)=38902952) +1431 train 3.812226 (lr=4.1253e-04) (hash(x)=42176120) +1432 train 3.915162 (lr=4.1208e-04) (hash(x)=35883317) +1433 train 3.898729 (lr=4.1162e-04) (hash(x)=41266349) +1434 train 3.917945 (lr=4.1117e-04) (hash(x)=41545933) +1435 train 3.894384 (lr=4.1072e-04) (hash(x)=41995010) +1436 train 3.918571 (lr=4.1026e-04) (hash(x)=39029102) +1437 train 3.896502 (lr=4.0981e-04) (hash(x)=39899963) +1438 train 3.927071 (lr=4.0936e-04) (hash(x)=35541954) +1439 train 3.890802 (lr=4.0890e-04) (hash(x)=38559197) +1440 train 3.884275 (lr=4.0845e-04) (hash(x)=31780670) +1441 train 3.871756 (lr=4.0799e-04) (hash(x)=40745173) +1442 train 3.891528 (lr=4.0754e-04) (hash(x)=41625044) +1443 train 3.883567 (lr=4.0708e-04) (hash(x)=38420174) +1444 train 3.908018 (lr=4.0663e-04) (hash(x)=37306086) +1445 train 3.909577 (lr=4.0617e-04) (hash(x)=39308452) +1446 train 3.874475 (lr=4.0572e-04) (hash(x)=39564328) +1447 train 3.927028 (lr=4.0526e-04) (hash(x)=38669834) +1448 train 3.897792 (lr=4.0480e-04) (hash(x)=40587610) +1449 train 3.843796 (lr=4.0435e-04) (hash(x)=42932738) +1450 val loss 3.8762 +1450 val perplexity 48.2385 +1450 train 3.919767 (lr=4.0389e-04) (hash(x)=39829747) +1451 train 3.873205 (lr=4.0343e-04) (hash(x)=35022444) +1452 train 3.878064 (lr=4.0297e-04) (hash(x)=40356505) +1453 train 3.888861 (lr=4.0252e-04) (hash(x)=38322690) +1454 train 3.876286 (lr=4.0206e-04) (hash(x)=38909054) +1455 train 3.915097 (lr=4.0160e-04) (hash(x)=38898519) +1456 train 3.869443 (lr=4.0114e-04) (hash(x)=42483473) +1457 train 3.909100 (lr=4.0068e-04) (hash(x)=40096531) +1458 train 3.825988 (lr=4.0023e-04) (hash(x)=38178843) +1459 train 3.830176 (lr=3.9977e-04) (hash(x)=35327076) +1460 train 3.905799 (lr=3.9931e-04) (hash(x)=42637631) +1461 train 3.923129 (lr=3.9885e-04) (hash(x)=40006574) +1462 train 3.851009 (lr=3.9839e-04) (hash(x)=32921213) +1463 train 3.950773 (lr=3.9793e-04) (hash(x)=43303382) +1464 train 3.910339 (lr=3.9747e-04) (hash(x)=41192032) +1465 train 3.847375 (lr=3.9701e-04) (hash(x)=42715664) +1466 train 3.856455 (lr=3.9655e-04) (hash(x)=51717645) +1467 train 3.866651 (lr=3.9609e-04) (hash(x)=41820470) +1468 train 3.898773 (lr=3.9563e-04) (hash(x)=41340250) +1469 train 3.830728 (lr=3.9517e-04) (hash(x)=43655991) +1470 train 3.881809 (lr=3.9470e-04) (hash(x)=40170889) +1471 train 3.821984 (lr=3.9424e-04) (hash(x)=41619893) +1472 train 3.893860 (lr=3.9378e-04) (hash(x)=39702074) +1473 train 3.910213 (lr=3.9332e-04) (hash(x)=38910234) +1474 train 3.836158 (lr=3.9286e-04) (hash(x)=49975030) +1475 val loss 3.8715 +1475 val perplexity 48.0121 +1475 train 3.928776 (lr=3.9239e-04) (hash(x)=47353128) +1476 train 3.812323 (lr=3.9193e-04) (hash(x)=38194519) +1477 train 3.921005 (lr=3.9147e-04) (hash(x)=35775283) +1478 train 3.940937 (lr=3.9101e-04) (hash(x)=41470356) +1479 train 3.871354 (lr=3.9054e-04) (hash(x)=35970147) +1480 train 3.886744 (lr=3.9008e-04) (hash(x)=37706493) +1481 train 3.924195 (lr=3.8962e-04) (hash(x)=39510796) +1482 train 3.857725 (lr=3.8915e-04) (hash(x)=38845313) +1483 train 3.902533 (lr=3.8869e-04) (hash(x)=38073194) +1484 train 3.999000 (lr=3.8823e-04) (hash(x)=43141114) +1485 train 3.820047 (lr=3.8776e-04) (hash(x)=42131726) +1486 train 3.870992 (lr=3.8730e-04) (hash(x)=40583721) +1487 train 3.885769 (lr=3.8683e-04) (hash(x)=38210009) +1488 train 3.880045 (lr=3.8637e-04) (hash(x)=37680893) +1489 train 3.816723 (lr=3.8590e-04) (hash(x)=40627895) +1490 train 3.862901 (lr=3.8544e-04) (hash(x)=42139258) +1491 train 3.835845 (lr=3.8497e-04) (hash(x)=56025275) +1492 train 3.834021 (lr=3.8451e-04) (hash(x)=38884979) +1493 train 3.889003 (lr=3.8404e-04) (hash(x)=41192020) +1494 train 3.818891 (lr=3.8358e-04) (hash(x)=42477272) +1495 train 3.842494 (lr=3.8311e-04) (hash(x)=40362105) +1496 train 3.877550 (lr=3.8265e-04) (hash(x)=37539165) +1497 train 3.842907 (lr=3.8218e-04) (hash(x)=39280299) +1498 train 3.877005 (lr=3.8171e-04) (hash(x)=49236439) +1499 train 3.892327 (lr=3.8125e-04) (hash(x)=43983358) +1500 val loss 3.8731 +1500 val perplexity 48.0936 +1500 train 3.827914 (lr=3.8078e-04) (hash(x)=38499685) +1501 train 3.804693 (lr=3.8031e-04) (hash(x)=39546412) +1502 train 3.880463 (lr=3.7985e-04) (hash(x)=38764240) +1503 train 3.880233 (lr=3.7938e-04) (hash(x)=41653611) +1504 train 3.855524 (lr=3.7891e-04) (hash(x)=41019698) +1505 train 3.859645 (lr=3.7844e-04) (hash(x)=34894451) +1506 train 3.863978 (lr=3.7798e-04) (hash(x)=43712054) +1507 train 3.917581 (lr=3.7751e-04) (hash(x)=38624098) +1508 train 3.925220 (lr=3.7704e-04) (hash(x)=42257471) +1509 train 3.872885 (lr=3.7657e-04) (hash(x)=37808084) +1510 train 3.947422 (lr=3.7610e-04) (hash(x)=36499863) +1511 train 3.856168 (lr=3.7564e-04) (hash(x)=43829068) +1512 train 3.868588 (lr=3.7517e-04) (hash(x)=43024760) +1513 train 3.844611 (lr=3.7470e-04) (hash(x)=44507409) +1514 train 3.913291 (lr=3.7423e-04) (hash(x)=41419528) +1515 train 3.843402 (lr=3.7376e-04) (hash(x)=42041209) +1516 train 3.890829 (lr=3.7329e-04) (hash(x)=40177064) +1517 train 3.845786 (lr=3.7282e-04) (hash(x)=42080840) +1518 train 3.929813 (lr=3.7235e-04) (hash(x)=35347305) +1519 train 3.868508 (lr=3.7189e-04) (hash(x)=40251544) +1520 train 3.940339 (lr=3.7142e-04) (hash(x)=43871898) +1521 train 3.870209 (lr=3.7095e-04) (hash(x)=34306528) +1522 train 3.858093 (lr=3.7048e-04) (hash(x)=39448215) +1523 train 3.839057 (lr=3.7001e-04) (hash(x)=36733155) +1524 train 3.879194 (lr=3.6954e-04) (hash(x)=40497980) +1525 val loss 3.8671 +1525 val perplexity 47.8026 +1525 train 3.812669 (lr=3.6907e-04) (hash(x)=39583973) +1526 train 3.850997 (lr=3.6860e-04) (hash(x)=37291571) +1527 train 3.847092 (lr=3.6813e-04) (hash(x)=29080418) +1528 train 3.871413 (lr=3.6766e-04) (hash(x)=36297525) +1529 train 3.874473 (lr=3.6718e-04) (hash(x)=37548335) +1530 train 3.890857 (lr=3.6671e-04) (hash(x)=43398302) +1531 train 3.905516 (lr=3.6624e-04) (hash(x)=43825118) +1532 train 3.885553 (lr=3.6577e-04) (hash(x)=45074521) +1533 train 3.897552 (lr=3.6530e-04) (hash(x)=50978938) +1534 train 3.881003 (lr=3.6483e-04) (hash(x)=44382031) +1535 train 3.878498 (lr=3.6436e-04) (hash(x)=40087369) +1536 train 3.860620 (lr=3.6389e-04) (hash(x)=37442306) +1537 train 3.918733 (lr=3.6342e-04) (hash(x)=40189321) +1538 train 3.839806 (lr=3.6294e-04) (hash(x)=37529547) +1539 train 3.865065 (lr=3.6247e-04) (hash(x)=44350120) +1540 train 3.897463 (lr=3.6200e-04) (hash(x)=46435845) +1541 train 3.902369 (lr=3.6153e-04) (hash(x)=39783339) +1542 train 3.886777 (lr=3.6106e-04) (hash(x)=37884323) +1543 train 3.905871 (lr=3.6058e-04) (hash(x)=38889127) +1544 train 3.887436 (lr=3.6011e-04) (hash(x)=42509778) +1545 train 3.867934 (lr=3.5964e-04) (hash(x)=38712008) +1546 train 3.873189 (lr=3.5917e-04) (hash(x)=39641961) +1547 train 3.856560 (lr=3.5870e-04) (hash(x)=31241373) +1548 train 3.876107 (lr=3.5822e-04) (hash(x)=39398479) +1549 train 3.880839 (lr=3.5775e-04) (hash(x)=33129389) +1550 val loss 3.8619 +1550 val perplexity 47.5536 +1550 train 3.818285 (lr=3.5728e-04) (hash(x)=36189187) +1551 train 3.852146 (lr=3.5680e-04) (hash(x)=33832069) +1552 train 3.867922 (lr=3.5633e-04) (hash(x)=38297634) +1553 train 3.840419 (lr=3.5586e-04) (hash(x)=45428704) +1554 train 3.890976 (lr=3.5539e-04) (hash(x)=40854512) +1555 train 3.823045 (lr=3.5491e-04) (hash(x)=38265838) +1556 train 3.867264 (lr=3.5444e-04) (hash(x)=43818850) +1557 train 3.809199 (lr=3.5397e-04) (hash(x)=40098656) +1558 train 3.846154 (lr=3.5349e-04) (hash(x)=46373542) +1559 train 3.867592 (lr=3.5302e-04) (hash(x)=36543425) +1560 train 3.892067 (lr=3.5255e-04) (hash(x)=36910053) +1561 train 3.845462 (lr=3.5207e-04) (hash(x)=38320944) +1562 train 3.835179 (lr=3.5160e-04) (hash(x)=35943710) +1563 train 3.945080 (lr=3.5112e-04) (hash(x)=44335791) +1564 train 3.835270 (lr=3.5065e-04) (hash(x)=42888942) +1565 train 3.852586 (lr=3.5018e-04) (hash(x)=38058078) +1566 train 3.872939 (lr=3.4970e-04) (hash(x)=37606649) +1567 train 3.880657 (lr=3.4923e-04) (hash(x)=50448491) +1568 train 3.903907 (lr=3.4876e-04) (hash(x)=42495491) +1569 train 4.082086 (lr=3.4828e-04) (hash(x)=43265286) +1570 train 3.914077 (lr=3.4781e-04) (hash(x)=42718822) +1571 train 3.856668 (lr=3.4733e-04) (hash(x)=37565358) +1572 train 4.084485 (lr=3.4686e-04) (hash(x)=31657663) +1573 train 3.877611 (lr=3.4638e-04) (hash(x)=40998655) +1574 train 3.854797 (lr=3.4591e-04) (hash(x)=42668571) +1575 val loss 3.8676 +1575 val perplexity 47.8256 +1575 train 3.863677 (lr=3.4544e-04) (hash(x)=37458041) +1576 train 3.883717 (lr=3.4496e-04) (hash(x)=39403623) +1577 train 3.871484 (lr=3.4449e-04) (hash(x)=40124262) +1578 train 3.888700 (lr=3.4401e-04) (hash(x)=36811898) +1579 train 3.880329 (lr=3.4354e-04) (hash(x)=40574452) +1580 train 3.813730 (lr=3.4306e-04) (hash(x)=38422374) +1581 train 3.867652 (lr=3.4259e-04) (hash(x)=40288658) +1582 train 3.910725 (lr=3.4211e-04) (hash(x)=42960832) +1583 train 3.869384 (lr=3.4164e-04) (hash(x)=41443532) +1584 train 3.903682 (lr=3.4116e-04) (hash(x)=39826458) +1585 train 3.876907 (lr=3.4069e-04) (hash(x)=44383030) +1586 train 3.869014 (lr=3.4021e-04) (hash(x)=35677640) +1587 train 3.848519 (lr=3.3974e-04) (hash(x)=37112056) +1588 train 3.958912 (lr=3.3926e-04) (hash(x)=43210880) +1589 train 3.851819 (lr=3.3879e-04) (hash(x)=39712403) +1590 train 3.829383 (lr=3.3831e-04) (hash(x)=34604741) +1591 train 3.768692 (lr=3.3784e-04) (hash(x)=42317807) +1592 train 3.829514 (lr=3.3736e-04) (hash(x)=41647114) +1593 train 3.920655 (lr=3.3689e-04) (hash(x)=37354029) +1594 train 3.836524 (lr=3.3641e-04) (hash(x)=44344701) +1595 train 3.809465 (lr=3.3594e-04) (hash(x)=32628808) +1596 train 3.892855 (lr=3.3546e-04) (hash(x)=43492816) +1597 train 3.832104 (lr=3.3499e-04) (hash(x)=40588313) +1598 train 3.782779 (lr=3.3451e-04) (hash(x)=40958901) +1599 train 3.825098 (lr=3.3404e-04) (hash(x)=36723026) +1600 val loss 3.8600 +1600 val perplexity 47.4650 +1600 train 3.888232 (lr=3.3356e-04) (hash(x)=43858561) +1601 train 3.823087 (lr=3.3309e-04) (hash(x)=40990465) +1602 train 3.883584 (lr=3.3261e-04) (hash(x)=45309986) +1603 train 3.847507 (lr=3.3214e-04) (hash(x)=39208148) +1604 train 3.854513 (lr=3.3166e-04) (hash(x)=39799483) +1605 train 3.889433 (lr=3.3119e-04) (hash(x)=35248836) +1606 train 3.862376 (lr=3.3071e-04) (hash(x)=39574852) +1607 train 3.920967 (lr=3.3024e-04) (hash(x)=38114119) +1608 train 3.880018 (lr=3.2976e-04) (hash(x)=37487266) +1609 train 3.896006 (lr=3.2929e-04) (hash(x)=42603597) +1610 train 3.915513 (lr=3.2881e-04) (hash(x)=39084381) +1611 train 3.774094 (lr=3.2834e-04) (hash(x)=41680118) +1612 train 3.834783 (lr=3.2786e-04) (hash(x)=38670034) +1613 train 3.896645 (lr=3.2739e-04) (hash(x)=37809180) +1614 train 3.847695 (lr=3.2691e-04) (hash(x)=36772092) +1615 train 3.883897 (lr=3.2644e-04) (hash(x)=39534264) +1616 train 3.910768 (lr=3.2596e-04) (hash(x)=38263925) +1617 train 3.928158 (lr=3.2549e-04) (hash(x)=35393511) +1618 train 3.873620 (lr=3.2501e-04) (hash(x)=41960844) +1619 train 3.869750 (lr=3.2454e-04) (hash(x)=40501602) +1620 train 3.867470 (lr=3.2406e-04) (hash(x)=41362662) +1621 train 3.862906 (lr=3.2359e-04) (hash(x)=47155709) +1622 train 3.908209 (lr=3.2311e-04) (hash(x)=40330277) +1623 train 3.794762 (lr=3.2264e-04) (hash(x)=34879410) +1624 train 3.872967 (lr=3.2216e-04) (hash(x)=36509272) +1625 val loss 3.8549 +1625 val perplexity 47.2257 +1625 train 3.880852 (lr=3.2169e-04) (hash(x)=41171081) +1626 train 3.813070 (lr=3.2121e-04) (hash(x)=47354161) +1627 train 3.856266 (lr=3.2074e-04) (hash(x)=38123189) +1628 train 3.835179 (lr=3.2026e-04) (hash(x)=37839393) +1629 train 3.831539 (lr=3.1979e-04) (hash(x)=37826804) +1630 train 3.900082 (lr=3.1931e-04) (hash(x)=43972375) +1631 train 3.802429 (lr=3.1884e-04) (hash(x)=37166779) +1632 train 3.796980 (lr=3.1836e-04) (hash(x)=53134469) +1633 train 3.874418 (lr=3.1789e-04) (hash(x)=36142455) +1634 train 3.818348 (lr=3.1741e-04) (hash(x)=42360633) +1635 train 3.861767 (lr=3.1694e-04) (hash(x)=39382040) +1636 train 3.835673 (lr=3.1646e-04) (hash(x)=34590612) +1637 train 3.817015 (lr=3.1599e-04) (hash(x)=41616974) +1638 train 3.885756 (lr=3.1551e-04) (hash(x)=41175541) +1639 train 3.862134 (lr=3.1504e-04) (hash(x)=39562881) +1640 train 3.849824 (lr=3.1456e-04) (hash(x)=39600488) +1641 train 3.906787 (lr=3.1409e-04) (hash(x)=44082117) +1642 train 3.871882 (lr=3.1362e-04) (hash(x)=42294565) +1643 train 4.002625 (lr=3.1314e-04) (hash(x)=39936849) +1644 train 3.896419 (lr=3.1267e-04) (hash(x)=38120459) +1645 train 3.871373 (lr=3.1219e-04) (hash(x)=36443920) +1646 train 3.818641 (lr=3.1172e-04) (hash(x)=47850516) +1647 train 3.910496 (lr=3.1124e-04) (hash(x)=44211350) +1648 train 3.905368 (lr=3.1077e-04) (hash(x)=37474622) +1649 train 3.884148 (lr=3.1030e-04) (hash(x)=37310967) +1650 val loss 3.8500 +1650 val perplexity 46.9946 +1650 train 3.873799 (lr=3.0982e-04) (hash(x)=39518082) +1651 train 3.895843 (lr=3.0935e-04) (hash(x)=42932182) +1652 train 3.905673 (lr=3.0888e-04) (hash(x)=35760521) +1653 train 3.844843 (lr=3.0840e-04) (hash(x)=39365374) +1654 train 3.839769 (lr=3.0793e-04) (hash(x)=38439459) +1655 train 3.877319 (lr=3.0745e-04) (hash(x)=77972236) +1656 train 3.919524 (lr=3.0698e-04) (hash(x)=41001514) +1657 train 3.904485 (lr=3.0651e-04) (hash(x)=38284959) +1658 train 3.793278 (lr=3.0603e-04) (hash(x)=38377545) +1659 train 3.852717 (lr=3.0556e-04) (hash(x)=43162586) +1660 train 3.906315 (lr=3.0509e-04) (hash(x)=36855725) +1661 train 3.848639 (lr=3.0461e-04) (hash(x)=37285015) +1662 train 3.877108 (lr=3.0414e-04) (hash(x)=37678459) +1663 train 3.799121 (lr=3.0367e-04) (hash(x)=49173105) +1664 train 3.867347 (lr=3.0320e-04) (hash(x)=39987719) +1665 train 3.820493 (lr=3.0272e-04) (hash(x)=43636987) +1666 train 3.803518 (lr=3.0225e-04) (hash(x)=35264813) +1667 train 3.844409 (lr=3.0178e-04) (hash(x)=36696211) +1668 train 3.805916 (lr=3.0130e-04) (hash(x)=34491551) +1669 train 3.830439 (lr=3.0083e-04) (hash(x)=36173228) +1670 train 3.887572 (lr=3.0036e-04) (hash(x)=36503429) +1671 train 3.885050 (lr=2.9989e-04) (hash(x)=36974561) +1672 train 3.828027 (lr=2.9942e-04) (hash(x)=41835906) +1673 train 3.865273 (lr=2.9894e-04) (hash(x)=43484411) +1674 train 3.823120 (lr=2.9847e-04) (hash(x)=47417208) +1675 val loss 3.8468 +1675 val perplexity 46.8422 +1675 train 3.842909 (lr=2.9800e-04) (hash(x)=39892285) +1676 train 3.849490 (lr=2.9753e-04) (hash(x)=36499269) +1677 train 3.840965 (lr=2.9706e-04) (hash(x)=38652055) +1678 train 3.868639 (lr=2.9658e-04) (hash(x)=41815549) +1679 train 3.813670 (lr=2.9611e-04) (hash(x)=40287024) +1680 train 3.845132 (lr=2.9564e-04) (hash(x)=43277376) +1681 train 3.822594 (lr=2.9517e-04) (hash(x)=35662401) +1682 train 3.896034 (lr=2.9470e-04) (hash(x)=44634164) +1683 train 3.832432 (lr=2.9423e-04) (hash(x)=40494723) +1684 train 3.843688 (lr=2.9376e-04) (hash(x)=44611712) +1685 train 3.864978 (lr=2.9329e-04) (hash(x)=40770255) +1686 train 3.889225 (lr=2.9282e-04) (hash(x)=39957542) +1687 train 3.873170 (lr=2.9234e-04) (hash(x)=40877048) +1688 train 3.905135 (lr=2.9187e-04) (hash(x)=40271384) +1689 train 3.877549 (lr=2.9140e-04) (hash(x)=39419856) +1690 train 3.839575 (lr=2.9093e-04) (hash(x)=44443615) +1691 train 3.838000 (lr=2.9046e-04) (hash(x)=42988386) +1692 train 3.861683 (lr=2.8999e-04) (hash(x)=36264062) +1693 train 3.822584 (lr=2.8952e-04) (hash(x)=42750108) +1694 train 3.840883 (lr=2.8905e-04) (hash(x)=39735523) +1695 train 3.861099 (lr=2.8858e-04) (hash(x)=36141886) +1696 train 3.809892 (lr=2.8811e-04) (hash(x)=38721781) +1697 train 3.822805 (lr=2.8765e-04) (hash(x)=41676135) +1698 train 3.854577 (lr=2.8718e-04) (hash(x)=41723876) +1699 train 3.831947 (lr=2.8671e-04) (hash(x)=38769907) +1700 val loss 3.8436 +1700 val perplexity 46.6947 +1700 train 3.850524 (lr=2.8624e-04) (hash(x)=29275525) +1701 train 3.782125 (lr=2.8577e-04) (hash(x)=37513379) +1702 train 3.821759 (lr=2.8530e-04) (hash(x)=39654231) +1703 train 3.833602 (lr=2.8483e-04) (hash(x)=62058763) +1704 train 3.847374 (lr=2.8436e-04) (hash(x)=35257603) +1705 train 3.931981 (lr=2.8390e-04) (hash(x)=43798600) +1706 train 3.859510 (lr=2.8343e-04) (hash(x)=38920480) +1707 train 3.800257 (lr=2.8296e-04) (hash(x)=39171314) +1708 train 3.839341 (lr=2.8249e-04) (hash(x)=45081537) +1709 train 3.771966 (lr=2.8202e-04) (hash(x)=42761484) +1710 train 3.814546 (lr=2.8156e-04) (hash(x)=35361203) +1711 train 3.815383 (lr=2.8109e-04) (hash(x)=38113098) +1712 train 3.799691 (lr=2.8062e-04) (hash(x)=42573049) +1713 train 3.806999 (lr=2.8015e-04) (hash(x)=37087852) +1714 train 3.834282 (lr=2.7969e-04) (hash(x)=39678133) +1715 train 3.776262 (lr=2.7922e-04) (hash(x)=45437529) +1716 train 3.799962 (lr=2.7875e-04) (hash(x)=35861584) +1717 train 3.805037 (lr=2.7829e-04) (hash(x)=43970506) +1718 train 3.859638 (lr=2.7782e-04) (hash(x)=37676197) +1719 train 3.850246 (lr=2.7735e-04) (hash(x)=38125550) +1720 train 3.859085 (lr=2.7689e-04) (hash(x)=40605084) +1721 train 3.841967 (lr=2.7642e-04) (hash(x)=36271350) +1722 train 3.834100 (lr=2.7596e-04) (hash(x)=30912651) +1723 train 3.822785 (lr=2.7549e-04) (hash(x)=43385840) +1724 train 3.865575 (lr=2.7503e-04) (hash(x)=47072319) +1725 val loss 3.8426 +1725 val perplexity 46.6479 +1725 train 3.877557 (lr=2.7456e-04) (hash(x)=41573100) +1726 train 3.854742 (lr=2.7410e-04) (hash(x)=36054934) +1727 train 3.907625 (lr=2.7363e-04) (hash(x)=39710862) +1728 train 3.918640 (lr=2.7317e-04) (hash(x)=37749858) +1729 train 3.841780 (lr=2.7270e-04) (hash(x)=39689453) +1730 train 3.876993 (lr=2.7224e-04) (hash(x)=38104879) +1731 train 3.807574 (lr=2.7177e-04) (hash(x)=36436634) +1732 train 3.911800 (lr=2.7131e-04) (hash(x)=39369709) +1733 train 3.846801 (lr=2.7085e-04) (hash(x)=42647375) +1734 train 3.803742 (lr=2.7038e-04) (hash(x)=38975126) +1735 train 3.857607 (lr=2.6992e-04) (hash(x)=38240089) +1736 train 3.838449 (lr=2.6946e-04) (hash(x)=36774904) +1737 train 3.830353 (lr=2.6899e-04) (hash(x)=43940787) +1738 train 3.800393 (lr=2.6853e-04) (hash(x)=36966932) +1739 train 3.857622 (lr=2.6807e-04) (hash(x)=49661046) +1740 train 3.818552 (lr=2.6761e-04) (hash(x)=39266199) +1741 train 3.808908 (lr=2.6714e-04) (hash(x)=37014280) +1742 train 3.773061 (lr=2.6668e-04) (hash(x)=39971005) +1743 train 3.808598 (lr=2.6622e-04) (hash(x)=42682622) +1744 train 3.815270 (lr=2.6576e-04) (hash(x)=41821509) +1745 train 3.882179 (lr=2.6530e-04) (hash(x)=44516678) +1746 train 3.762290 (lr=2.6483e-04) (hash(x)=44456840) +1747 train 3.835375 (lr=2.6437e-04) (hash(x)=41395712) +1748 train 3.854746 (lr=2.6391e-04) (hash(x)=39871829) +1749 train 3.858202 (lr=2.6345e-04) (hash(x)=44870587) +1750 val loss 3.8371 +1750 val perplexity 46.3910 +1750 train 3.824224 (lr=2.6299e-04) (hash(x)=40590843) +1751 train 3.809885 (lr=2.6253e-04) (hash(x)=42732277) +1752 train 3.839734 (lr=2.6207e-04) (hash(x)=36504852) +1753 train 3.775031 (lr=2.6161e-04) (hash(x)=37370626) +1754 train 3.890182 (lr=2.6115e-04) (hash(x)=42620878) +1755 train 3.847951 (lr=2.6069e-04) (hash(x)=41254218) +1756 train 3.942807 (lr=2.6023e-04) (hash(x)=47746093) +1757 train 3.846874 (lr=2.5977e-04) (hash(x)=45536967) +1758 train 3.838305 (lr=2.5932e-04) (hash(x)=40241534) +1759 train 3.831285 (lr=2.5886e-04) (hash(x)=41386093) +1760 train 3.867056 (lr=2.5840e-04) (hash(x)=36037199) +1761 train 3.841832 (lr=2.5794e-04) (hash(x)=39419151) +1762 train 3.807040 (lr=2.5748e-04) (hash(x)=38680759) +1763 train 3.883086 (lr=2.5703e-04) (hash(x)=39174296) +1764 train 3.851336 (lr=2.5657e-04) (hash(x)=34742648) +1765 train 3.828597 (lr=2.5611e-04) (hash(x)=61951368) +1766 train 3.814972 (lr=2.5565e-04) (hash(x)=32883697) +1767 train 3.771002 (lr=2.5520e-04) (hash(x)=40745127) +1768 train 3.869025 (lr=2.5474e-04) (hash(x)=35974691) +1769 train 3.802565 (lr=2.5428e-04) (hash(x)=49624867) +1770 train 3.846778 (lr=2.5383e-04) (hash(x)=36025381) +1771 train 3.839330 (lr=2.5337e-04) (hash(x)=37301382) +1772 train 3.789363 (lr=2.5292e-04) (hash(x)=39688677) +1773 train 3.771961 (lr=2.5246e-04) (hash(x)=41294105) +1774 train 3.860578 (lr=2.5201e-04) (hash(x)=35511310) +1775 val loss 3.8345 +1775 val perplexity 46.2720 +1775 train 3.843336 (lr=2.5155e-04) (hash(x)=40040013) +1776 train 3.786002 (lr=2.5110e-04) (hash(x)=40869591) +1777 train 3.815645 (lr=2.5064e-04) (hash(x)=34461159) +1778 train 3.820933 (lr=2.5019e-04) (hash(x)=39742222) +1779 train 3.819179 (lr=2.4974e-04) (hash(x)=36692431) +1780 train 3.790527 (lr=2.4928e-04) (hash(x)=41866268) +1781 train 3.920253 (lr=2.4883e-04) (hash(x)=40127350) +1782 train 3.832412 (lr=2.4838e-04) (hash(x)=39200190) +1783 train 3.819323 (lr=2.4792e-04) (hash(x)=39526632) +1784 train 3.861099 (lr=2.4747e-04) (hash(x)=47091452) +1785 train 3.803337 (lr=2.4702e-04) (hash(x)=44092469) +1786 train 3.818172 (lr=2.4657e-04) (hash(x)=38288897) +1787 train 3.827141 (lr=2.4611e-04) (hash(x)=42177588) +1788 train 3.864768 (lr=2.4566e-04) (hash(x)=39809912) +1789 train 3.859549 (lr=2.4521e-04) (hash(x)=50020563) +1790 train 3.875974 (lr=2.4476e-04) (hash(x)=37469065) +1791 train 3.991437 (lr=2.4431e-04) (hash(x)=34654449) +1792 train 3.905731 (lr=2.4386e-04) (hash(x)=38620695) +1793 train 3.881969 (lr=2.4341e-04) (hash(x)=42229671) +1794 train 3.915488 (lr=2.4296e-04) (hash(x)=42829009) +1795 train 3.886563 (lr=2.4251e-04) (hash(x)=41214899) +1796 train 3.823389 (lr=2.4206e-04) (hash(x)=41629631) +1797 train 3.841432 (lr=2.4161e-04) (hash(x)=43209160) +1798 train 3.823833 (lr=2.4116e-04) (hash(x)=41555786) +1799 train 3.823383 (lr=2.4071e-04) (hash(x)=38778839) +1800 val loss 3.8299 +1800 val perplexity 46.0571 +1800 train 3.834434 (lr=2.4026e-04) (hash(x)=38200080) +1801 train 3.892835 (lr=2.3982e-04) (hash(x)=35409577) +1802 train 3.839844 (lr=2.3937e-04) (hash(x)=39612782) +1803 train 3.793620 (lr=2.3892e-04) (hash(x)=41413875) +1804 train 3.764670 (lr=2.3847e-04) (hash(x)=41431605) +1805 train 3.794467 (lr=2.3803e-04) (hash(x)=38678388) +1806 train 3.827284 (lr=2.3758e-04) (hash(x)=37386469) +1807 train 3.774142 (lr=2.3713e-04) (hash(x)=33281867) +1808 train 3.797175 (lr=2.3669e-04) (hash(x)=40909550) +1809 train 3.785535 (lr=2.3624e-04) (hash(x)=36522721) +1810 train 3.846878 (lr=2.3580e-04) (hash(x)=42762393) +1811 train 3.846099 (lr=2.3535e-04) (hash(x)=36776039) +1812 train 3.818627 (lr=2.3491e-04) (hash(x)=46179524) +1813 train 3.849162 (lr=2.3446e-04) (hash(x)=42484439) +1814 train 3.877408 (lr=2.3402e-04) (hash(x)=42461122) +1815 train 3.822993 (lr=2.3357e-04) (hash(x)=38732269) +1816 train 3.814668 (lr=2.3313e-04) (hash(x)=47737901) +1817 train 3.899853 (lr=2.3269e-04) (hash(x)=43658139) +1818 train 3.822036 (lr=2.3224e-04) (hash(x)=46188381) +1819 train 3.831884 (lr=2.3180e-04) (hash(x)=45124011) +1820 train 3.823700 (lr=2.3136e-04) (hash(x)=39352126) +1821 train 3.791554 (lr=2.3092e-04) (hash(x)=40994942) +1822 train 3.834232 (lr=2.3047e-04) (hash(x)=42139354) +1823 train 3.804581 (lr=2.3003e-04) (hash(x)=54684876) +1824 train 3.919766 (lr=2.2959e-04) (hash(x)=39271906) +1825 val loss 3.8265 +1825 val perplexity 45.9010 +1825 train 3.833828 (lr=2.2915e-04) (hash(x)=35965735) +1826 train 3.833893 (lr=2.2871e-04) (hash(x)=41682676) +1827 train 3.897145 (lr=2.2827e-04) (hash(x)=39083252) +1828 train 3.839289 (lr=2.2783e-04) (hash(x)=41096280) +1829 train 3.863064 (lr=2.2739e-04) (hash(x)=38969267) +1830 train 3.856108 (lr=2.2695e-04) (hash(x)=38161393) +1831 train 3.804722 (lr=2.2651e-04) (hash(x)=40892454) +1832 train 3.814510 (lr=2.2607e-04) (hash(x)=38002178) +1833 train 3.848171 (lr=2.2563e-04) (hash(x)=39653873) +1834 train 3.858899 (lr=2.2520e-04) (hash(x)=42262233) +1835 train 3.816870 (lr=2.2476e-04) (hash(x)=41502665) +1836 train 3.819019 (lr=2.2432e-04) (hash(x)=40158556) +1837 train 3.813051 (lr=2.2388e-04) (hash(x)=40654424) +1838 train 3.812537 (lr=2.2345e-04) (hash(x)=35758315) +1839 train 3.758736 (lr=2.2301e-04) (hash(x)=38335913) +1840 train 3.796674 (lr=2.2257e-04) (hash(x)=30660524) +1841 train 3.804323 (lr=2.2214e-04) (hash(x)=32835489) +1842 train 3.804183 (lr=2.2170e-04) (hash(x)=39158228) +1843 train 3.858470 (lr=2.2127e-04) (hash(x)=39266947) +1844 train 3.774223 (lr=2.2083e-04) (hash(x)=37197807) +1845 train 3.761775 (lr=2.2040e-04) (hash(x)=42826529) +1846 train 3.836716 (lr=2.1996e-04) (hash(x)=36245990) +1847 train 3.831937 (lr=2.1953e-04) (hash(x)=37609374) +1848 train 3.879344 (lr=2.1910e-04) (hash(x)=42171879) +1849 train 3.851722 (lr=2.1866e-04) (hash(x)=42148370) +1850 val loss 3.8246 +1850 val perplexity 45.8154 +1850 train 3.764681 (lr=2.1823e-04) (hash(x)=32107049) +1851 train 3.804662 (lr=2.1780e-04) (hash(x)=36916505) +1852 train 3.780368 (lr=2.1737e-04) (hash(x)=35629026) +1853 train 3.849319 (lr=2.1693e-04) (hash(x)=40752762) +1854 train 3.731425 (lr=2.1650e-04) (hash(x)=39019077) +1855 train 3.828892 (lr=2.1607e-04) (hash(x)=41728873) +1856 train 3.882594 (lr=2.1564e-04) (hash(x)=39089010) +1857 train 3.884936 (lr=2.1521e-04) (hash(x)=41994509) +1858 train 3.847682 (lr=2.1478e-04) (hash(x)=42750780) +1859 train 3.896034 (lr=2.1435e-04) (hash(x)=45648421) +1860 train 3.822355 (lr=2.1392e-04) (hash(x)=37289509) +1861 train 3.894389 (lr=2.1349e-04) (hash(x)=40409012) +1862 train 3.838651 (lr=2.1307e-04) (hash(x)=45660285) +1863 train 3.843348 (lr=2.1264e-04) (hash(x)=37435451) +1864 train 3.819902 (lr=2.1221e-04) (hash(x)=36765247) +1865 train 3.890391 (lr=2.1178e-04) (hash(x)=37300169) +1866 train 3.857492 (lr=2.1136e-04) (hash(x)=36704348) +1867 train 3.835767 (lr=2.1093e-04) (hash(x)=39590646) +1868 train 3.828768 (lr=2.1050e-04) (hash(x)=41336036) +1869 train 3.895271 (lr=2.1008e-04) (hash(x)=35367952) +1870 train 3.863339 (lr=2.0965e-04) (hash(x)=40845049) +1871 train 3.865729 (lr=2.0923e-04) (hash(x)=36387083) +1872 train 3.899481 (lr=2.0880e-04) (hash(x)=40035810) +1873 train 3.839805 (lr=2.0838e-04) (hash(x)=35958668) +1874 train 3.772813 (lr=2.0795e-04) (hash(x)=36690996) +1875 val loss 3.8221 +1875 val perplexity 45.6981 +1875 train 3.869104 (lr=2.0753e-04) (hash(x)=37587288) +1876 train 3.809723 (lr=2.0711e-04) (hash(x)=37274622) +1877 train 3.805526 (lr=2.0668e-04) (hash(x)=36618613) +1878 train 3.857047 (lr=2.0626e-04) (hash(x)=44527278) +1879 train 3.761404 (lr=2.0584e-04) (hash(x)=46351445) +1880 train 3.832197 (lr=2.0542e-04) (hash(x)=33648095) +1881 train 3.808442 (lr=2.0499e-04) (hash(x)=34401874) +1882 train 3.843303 (lr=2.0457e-04) (hash(x)=43476004) +1883 train 3.783145 (lr=2.0415e-04) (hash(x)=40902317) +1884 train 3.811525 (lr=2.0373e-04) (hash(x)=40730568) +1885 train 3.837322 (lr=2.0331e-04) (hash(x)=40788376) +1886 train 3.793513 (lr=2.0289e-04) (hash(x)=39736083) +1887 train 3.806123 (lr=2.0247e-04) (hash(x)=44526572) +1888 train 3.817802 (lr=2.0206e-04) (hash(x)=39148831) +1889 train 3.884900 (lr=2.0164e-04) (hash(x)=38513702) +1890 train 3.872460 (lr=2.0122e-04) (hash(x)=46166493) +1891 train 3.812523 (lr=2.0080e-04) (hash(x)=44878199) +1892 train 3.764773 (lr=2.0038e-04) (hash(x)=43503332) +1893 train 3.780904 (lr=1.9997e-04) (hash(x)=39929632) +1894 train 3.907701 (lr=1.9955e-04) (hash(x)=38975158) +1895 train 3.804700 (lr=1.9914e-04) (hash(x)=34851965) +1896 train 3.860210 (lr=1.9872e-04) (hash(x)=41230221) +1897 train 3.906293 (lr=1.9831e-04) (hash(x)=37795018) +1898 train 3.897805 (lr=1.9789e-04) (hash(x)=40761568) +1899 train 3.811027 (lr=1.9748e-04) (hash(x)=37543143) +1900 val loss 3.8180 +1900 val perplexity 45.5132 +1900 train 3.824185 (lr=1.9706e-04) (hash(x)=39826689) +1901 train 3.795538 (lr=1.9665e-04) (hash(x)=39028665) +1902 train 3.940017 (lr=1.9624e-04) (hash(x)=42997360) +1903 train 3.830631 (lr=1.9582e-04) (hash(x)=42178757) +1904 train 3.816025 (lr=1.9541e-04) (hash(x)=39529755) +1905 train 3.936213 (lr=1.9500e-04) (hash(x)=41950641) +1906 train 3.840107 (lr=1.9459e-04) (hash(x)=36713323) +1907 train 3.827512 (lr=1.9418e-04) (hash(x)=34928323) +1908 train 3.798495 (lr=1.9377e-04) (hash(x)=37252587) +1909 train 3.810443 (lr=1.9336e-04) (hash(x)=40054816) +1910 train 3.778972 (lr=1.9295e-04) (hash(x)=40767783) +1911 train 3.812544 (lr=1.9254e-04) (hash(x)=35916602) +1912 train 3.806827 (lr=1.9213e-04) (hash(x)=36494535) +1913 train 3.832786 (lr=1.9172e-04) (hash(x)=32303977) +1914 train 3.848043 (lr=1.9131e-04) (hash(x)=39741574) +1915 train 3.766192 (lr=1.9091e-04) (hash(x)=40534050) +1916 train 3.820299 (lr=1.9050e-04) (hash(x)=38311064) +1917 train 3.795261 (lr=1.9009e-04) (hash(x)=38581241) +1918 train 3.773481 (lr=1.8969e-04) (hash(x)=41608706) +1919 train 3.821310 (lr=1.8928e-04) (hash(x)=55097105) +1920 train 3.832447 (lr=1.8887e-04) (hash(x)=35176632) +1921 train 3.838760 (lr=1.8847e-04) (hash(x)=40199408) +1922 train 3.827824 (lr=1.8807e-04) (hash(x)=42799673) +1923 train 3.829744 (lr=1.8766e-04) (hash(x)=45066557) +1924 train 3.816090 (lr=1.8726e-04) (hash(x)=43415179) +1925 val loss 3.8165 +1925 val perplexity 45.4435 +1925 train 3.847556 (lr=1.8685e-04) (hash(x)=33808762) +1926 train 3.764707 (lr=1.8645e-04) (hash(x)=48264375) +1927 train 3.836978 (lr=1.8605e-04) (hash(x)=40243559) +1928 train 3.797976 (lr=1.8565e-04) (hash(x)=39717492) +1929 train 3.845414 (lr=1.8525e-04) (hash(x)=44116416) +1930 train 3.799386 (lr=1.8485e-04) (hash(x)=42813431) +1931 train 3.983575 (lr=1.8445e-04) (hash(x)=42022886) +1932 train 3.833104 (lr=1.8405e-04) (hash(x)=40554385) +1933 train 3.962076 (lr=1.8365e-04) (hash(x)=37778008) +1934 train 3.839904 (lr=1.8325e-04) (hash(x)=43437375) +1935 train 3.868418 (lr=1.8285e-04) (hash(x)=35163754) +1936 train 3.807105 (lr=1.8245e-04) (hash(x)=38992398) +1937 train 3.812029 (lr=1.8205e-04) (hash(x)=39865796) +1938 train 3.842635 (lr=1.8165e-04) (hash(x)=44844234) +1939 train 3.856123 (lr=1.8126e-04) (hash(x)=46413652) +1940 train 3.878730 (lr=1.8086e-04) (hash(x)=40424362) +1941 train 3.857882 (lr=1.8047e-04) (hash(x)=41424944) +1942 train 3.831998 (lr=1.8007e-04) (hash(x)=39281922) +1943 train 3.816370 (lr=1.7968e-04) (hash(x)=48958882) +1944 train 3.758696 (lr=1.7928e-04) (hash(x)=40230705) +1945 train 3.793845 (lr=1.7889e-04) (hash(x)=32313908) +1946 train 3.774516 (lr=1.7849e-04) (hash(x)=35170284) +1947 train 3.825593 (lr=1.7810e-04) (hash(x)=39810309) +1948 train 3.815992 (lr=1.7771e-04) (hash(x)=39247719) +1949 train 3.849712 (lr=1.7732e-04) (hash(x)=36813127) +1950 val loss 3.8149 +1950 val perplexity 45.3717 +1950 train 3.816709 (lr=1.7692e-04) (hash(x)=34246215) +1951 train 3.847483 (lr=1.7653e-04) (hash(x)=39921499) +1952 train 3.773797 (lr=1.7614e-04) (hash(x)=46538677) +1953 train 3.863127 (lr=1.7575e-04) (hash(x)=44467521) +1954 train 3.779722 (lr=1.7536e-04) (hash(x)=41601267) +1955 train 3.764195 (lr=1.7497e-04) (hash(x)=38730194) +1956 train 3.781445 (lr=1.7458e-04) (hash(x)=48300517) +1957 train 3.809877 (lr=1.7420e-04) (hash(x)=36834534) +1958 train 3.768805 (lr=1.7381e-04) (hash(x)=43459710) +1959 train 3.791228 (lr=1.7342e-04) (hash(x)=46318490) +1960 train 3.809012 (lr=1.7303e-04) (hash(x)=45090896) +1961 train 3.837402 (lr=1.7265e-04) (hash(x)=37377118) +1962 train 3.835444 (lr=1.7226e-04) (hash(x)=44229634) +1963 train 3.855143 (lr=1.7188e-04) (hash(x)=40751441) +1964 train 3.842542 (lr=1.7149e-04) (hash(x)=31672869) +1965 train 3.785819 (lr=1.7111e-04) (hash(x)=41705617) +1966 train 3.833086 (lr=1.7072e-04) (hash(x)=37304616) +1967 train 3.876192 (lr=1.7034e-04) (hash(x)=39376829) +1968 train 3.859894 (lr=1.6996e-04) (hash(x)=41086274) +1969 train 3.789145 (lr=1.6957e-04) (hash(x)=40280995) +1970 train 3.856083 (lr=1.6919e-04) (hash(x)=37054769) +1971 train 3.764302 (lr=1.6881e-04) (hash(x)=43265811) +1972 train 3.866452 (lr=1.6843e-04) (hash(x)=36557607) +1973 train 3.809985 (lr=1.6805e-04) (hash(x)=46134974) +1974 train 3.838418 (lr=1.6767e-04) (hash(x)=43273862) +1975 val loss 3.8102 +1975 val perplexity 45.1616 +1975 train 3.777533 (lr=1.6729e-04) (hash(x)=41698603) +1976 train 3.803784 (lr=1.6691e-04) (hash(x)=34786816) +1977 train 3.827159 (lr=1.6653e-04) (hash(x)=39812812) +1978 train 3.805415 (lr=1.6615e-04) (hash(x)=39798339) +1979 train 3.841802 (lr=1.6578e-04) (hash(x)=40060670) +1980 train 3.800605 (lr=1.6540e-04) (hash(x)=39321118) +1981 train 3.763860 (lr=1.6502e-04) (hash(x)=41392984) +1982 train 3.741947 (lr=1.6465e-04) (hash(x)=36797245) +1983 train 3.774350 (lr=1.6427e-04) (hash(x)=36646059) +1984 train 3.856619 (lr=1.6390e-04) (hash(x)=41432184) +1985 train 3.794824 (lr=1.6352e-04) (hash(x)=40916523) +1986 train 3.771401 (lr=1.6315e-04) (hash(x)=49661111) +1987 train 3.870786 (lr=1.6277e-04) (hash(x)=42861934) +1988 train 3.798083 (lr=1.6240e-04) (hash(x)=46730077) +1989 train 3.841662 (lr=1.6203e-04) (hash(x)=39230832) +1990 train 3.856618 (lr=1.6166e-04) (hash(x)=37657974) +1991 train 3.815775 (lr=1.6129e-04) (hash(x)=33039762) +1992 train 3.815669 (lr=1.6092e-04) (hash(x)=44629778) +1993 train 3.853765 (lr=1.6055e-04) (hash(x)=50592766) +1994 train 3.876486 (lr=1.6018e-04) (hash(x)=40786862) +1995 train 3.838751 (lr=1.5981e-04) (hash(x)=36955579) +1996 train 3.779909 (lr=1.5944e-04) (hash(x)=40599980) +1997 train 3.824533 (lr=1.5907e-04) (hash(x)=41884827) +1998 train 3.826291 (lr=1.5870e-04) (hash(x)=37635630) +1999 train 3.844870 (lr=1.5834e-04) (hash(x)=35997828) +2000 val loss 3.8065 +2000 val perplexity 44.9932 +2000 train 3.876450 (lr=1.5797e-04) (hash(x)=42958416) +2001 train 3.866910 (lr=1.5760e-04) (hash(x)=42522319) +2002 train 3.817987 (lr=1.5724e-04) (hash(x)=35365834) +2003 train 3.761965 (lr=1.5687e-04) (hash(x)=51644528) +2004 train 3.805471 (lr=1.5651e-04) (hash(x)=40975307) +2005 train 3.792804 (lr=1.5614e-04) (hash(x)=45514956) +2006 train 3.849188 (lr=1.5578e-04) (hash(x)=42139280) +2007 train 3.860116 (lr=1.5542e-04) (hash(x)=37674155) +2008 train 3.802810 (lr=1.5506e-04) (hash(x)=44326800) +2009 train 3.804202 (lr=1.5469e-04) (hash(x)=37105233) +2010 train 3.824251 (lr=1.5433e-04) (hash(x)=41236693) +2011 train 3.775552 (lr=1.5397e-04) (hash(x)=42188301) +2012 train 3.753550 (lr=1.5361e-04) (hash(x)=37017168) +2013 train 3.798352 (lr=1.5325e-04) (hash(x)=41905194) +2014 train 3.830823 (lr=1.5289e-04) (hash(x)=43579902) +2015 train 3.778418 (lr=1.5254e-04) (hash(x)=40771506) +2016 train 3.753958 (lr=1.5218e-04) (hash(x)=41418352) +2017 train 3.785126 (lr=1.5182e-04) (hash(x)=41713652) +2018 train 3.794756 (lr=1.5146e-04) (hash(x)=41080890) +2019 train 3.789227 (lr=1.5111e-04) (hash(x)=41806527) +2020 train 3.778317 (lr=1.5075e-04) (hash(x)=37617313) +2021 train 3.804776 (lr=1.5040e-04) (hash(x)=40840425) +2022 train 3.791013 (lr=1.5004e-04) (hash(x)=40402645) +2023 train 3.809565 (lr=1.4969e-04) (hash(x)=42816063) +2024 train 3.790082 (lr=1.4933e-04) (hash(x)=24571097) +2025 val loss 3.8058 +2025 val perplexity 44.9614 +2025 train 3.799218 (lr=1.4898e-04) (hash(x)=38251188) +2026 train 3.818273 (lr=1.4863e-04) (hash(x)=33254158) +2027 train 3.850888 (lr=1.4828e-04) (hash(x)=38390600) +2028 train 3.846525 (lr=1.4793e-04) (hash(x)=37506589) +2029 train 3.792452 (lr=1.4758e-04) (hash(x)=43529661) +2030 train 3.793483 (lr=1.4723e-04) (hash(x)=41476041) +2031 train 3.848888 (lr=1.4688e-04) (hash(x)=33243936) +2032 train 3.905939 (lr=1.4653e-04) (hash(x)=42580498) +2033 train 3.788701 (lr=1.4618e-04) (hash(x)=37019513) +2034 train 3.852795 (lr=1.4583e-04) (hash(x)=40900086) +2035 train 3.827810 (lr=1.4548e-04) (hash(x)=39311163) +2036 train 3.809172 (lr=1.4514e-04) (hash(x)=39013873) +2037 train 3.883066 (lr=1.4479e-04) (hash(x)=38720787) +2038 train 3.864274 (lr=1.4445e-04) (hash(x)=38141522) +2039 train 3.816583 (lr=1.4410e-04) (hash(x)=36427219) +2040 train 3.881159 (lr=1.4376e-04) (hash(x)=36063666) +2041 train 3.764436 (lr=1.4341e-04) (hash(x)=50159536) +2042 train 3.786880 (lr=1.4307e-04) (hash(x)=24378942) +2043 train 3.795573 (lr=1.4273e-04) (hash(x)=34720780) +2044 train 3.829225 (lr=1.4239e-04) (hash(x)=40128717) +2045 train 3.866665 (lr=1.4204e-04) (hash(x)=42057440) +2046 train 3.800903 (lr=1.4170e-04) (hash(x)=36870306) +2047 train 3.799510 (lr=1.4136e-04) (hash(x)=48437395) +2048 train 3.826766 (lr=1.4102e-04) (hash(x)=37868059) +2049 train 3.782743 (lr=1.4068e-04) (hash(x)=46854471) +2050 val loss 3.8039 +2050 val perplexity 44.8770 +2050 train 3.769061 (lr=1.4035e-04) (hash(x)=40269727) +2051 train 3.907624 (lr=1.4001e-04) (hash(x)=36812043) +2052 train 3.911467 (lr=1.3967e-04) (hash(x)=38712036) +2053 train 3.866334 (lr=1.3933e-04) (hash(x)=39339777) +2054 train 3.743102 (lr=1.3900e-04) (hash(x)=38906340) +2055 train 3.782958 (lr=1.3866e-04) (hash(x)=40564113) +2056 train 3.788559 (lr=1.3833e-04) (hash(x)=42952742) +2057 train 3.776254 (lr=1.3799e-04) (hash(x)=37953712) +2058 train 3.768028 (lr=1.3766e-04) (hash(x)=38413375) +2059 train 3.794600 (lr=1.3733e-04) (hash(x)=39456492) +2060 train 3.851300 (lr=1.3699e-04) (hash(x)=38236133) +2061 train 3.816942 (lr=1.3666e-04) (hash(x)=40125398) +2062 train 3.797189 (lr=1.3633e-04) (hash(x)=39063473) +2063 train 3.786829 (lr=1.3600e-04) (hash(x)=41242203) +2064 train 3.860206 (lr=1.3567e-04) (hash(x)=38031306) +2065 train 3.865570 (lr=1.3534e-04) (hash(x)=38906079) +2066 train 3.846935 (lr=1.3501e-04) (hash(x)=42856670) +2067 train 3.782894 (lr=1.3468e-04) (hash(x)=44411159) +2068 train 3.810308 (lr=1.3435e-04) (hash(x)=36900383) +2069 train 3.825292 (lr=1.3403e-04) (hash(x)=36234489) +2070 train 3.810300 (lr=1.3370e-04) (hash(x)=39074966) +2071 train 3.813627 (lr=1.3337e-04) (hash(x)=37476037) +2072 train 3.925297 (lr=1.3305e-04) (hash(x)=37483929) +2073 train 3.809913 (lr=1.3272e-04) (hash(x)=38106134) +2074 train 3.805075 (lr=1.3240e-04) (hash(x)=39062591) +2075 val loss 3.8007 +2075 val perplexity 44.7322 +2075 train 3.825446 (lr=1.3208e-04) (hash(x)=37748919) +2076 train 3.861112 (lr=1.3175e-04) (hash(x)=38405303) +2077 train 3.840151 (lr=1.3143e-04) (hash(x)=32375696) +2078 train 3.822907 (lr=1.3111e-04) (hash(x)=42784711) +2079 train 3.780080 (lr=1.3079e-04) (hash(x)=39472287) +2080 train 3.849760 (lr=1.3047e-04) (hash(x)=40139046) +2081 train 3.775448 (lr=1.3015e-04) (hash(x)=38660533) +2082 train 3.790434 (lr=1.2983e-04) (hash(x)=41116393) +2083 train 3.841049 (lr=1.2951e-04) (hash(x)=40299749) +2084 train 3.799471 (lr=1.2919e-04) (hash(x)=34787631) +2085 train 3.781245 (lr=1.2887e-04) (hash(x)=37625583) +2086 train 3.761601 (lr=1.2856e-04) (hash(x)=40472669) +2087 train 3.778239 (lr=1.2824e-04) (hash(x)=40819139) +2088 train 3.744459 (lr=1.2793e-04) (hash(x)=37291191) +2089 train 3.811768 (lr=1.2761e-04) (hash(x)=40151703) +2090 train 3.811486 (lr=1.2730e-04) (hash(x)=36475581) +2091 train 3.806534 (lr=1.2698e-04) (hash(x)=39622844) +2092 train 3.768148 (lr=1.2667e-04) (hash(x)=41501822) +2093 train 3.818435 (lr=1.2636e-04) (hash(x)=39849540) +2094 train 3.779541 (lr=1.2605e-04) (hash(x)=44504710) +2095 train 3.831657 (lr=1.2574e-04) (hash(x)=41382026) +2096 train 3.745245 (lr=1.2543e-04) (hash(x)=40750493) +2097 train 3.825768 (lr=1.2512e-04) (hash(x)=39555127) +2098 train 3.831925 (lr=1.2481e-04) (hash(x)=43878979) +2099 train 3.917769 (lr=1.2450e-04) (hash(x)=44021900) +2100 val loss 3.7986 +2100 val perplexity 44.6375 +2100 train 3.834013 (lr=1.2419e-04) (hash(x)=41533411) +2101 train 3.764498 (lr=1.2388e-04) (hash(x)=41596098) +2102 train 3.788270 (lr=1.2358e-04) (hash(x)=41669704) +2103 train 3.848491 (lr=1.2327e-04) (hash(x)=40207093) +2104 train 3.817942 (lr=1.2296e-04) (hash(x)=39121820) +2105 train 3.779584 (lr=1.2266e-04) (hash(x)=36337084) +2106 train 3.832448 (lr=1.2236e-04) (hash(x)=40509907) +2107 train 3.882251 (lr=1.2205e-04) (hash(x)=44772279) +2108 train 3.808353 (lr=1.2175e-04) (hash(x)=34949758) +2109 train 3.783624 (lr=1.2145e-04) (hash(x)=41418019) +2110 train 3.743673 (lr=1.2115e-04) (hash(x)=37663643) +2111 train 3.807868 (lr=1.2085e-04) (hash(x)=38972650) +2112 train 3.816249 (lr=1.2054e-04) (hash(x)=41898998) +2113 train 3.813588 (lr=1.2025e-04) (hash(x)=38141774) +2114 train 3.810738 (lr=1.1995e-04) (hash(x)=44227138) +2115 train 3.818663 (lr=1.1965e-04) (hash(x)=43078299) +2116 train 3.839705 (lr=1.1935e-04) (hash(x)=42749021) +2117 train 3.797634 (lr=1.1905e-04) (hash(x)=43868550) +2118 train 3.801780 (lr=1.1876e-04) (hash(x)=37944223) +2119 train 3.832431 (lr=1.1846e-04) (hash(x)=41531635) +2120 train 3.781923 (lr=1.1817e-04) (hash(x)=36401250) +2121 train 3.835061 (lr=1.1787e-04) (hash(x)=38108174) +2122 train 3.836420 (lr=1.1758e-04) (hash(x)=39271678) +2123 train 3.786269 (lr=1.1729e-04) (hash(x)=46066448) +2124 train 3.899399 (lr=1.1699e-04) (hash(x)=42330731) +2125 val loss 3.7978 +2125 val perplexity 44.6034 +2125 train 3.829367 (lr=1.1670e-04) (hash(x)=37889396) +2126 train 3.867986 (lr=1.1641e-04) (hash(x)=42272912) +2127 train 3.810795 (lr=1.1612e-04) (hash(x)=39661006) +2128 train 3.789040 (lr=1.1583e-04) (hash(x)=35620800) +2129 train 3.784962 (lr=1.1554e-04) (hash(x)=35506692) +2130 train 3.804531 (lr=1.1525e-04) (hash(x)=38628204) +2131 train 3.829532 (lr=1.1497e-04) (hash(x)=42108031) +2132 train 3.734655 (lr=1.1468e-04) (hash(x)=40930168) +2133 train 3.748956 (lr=1.1439e-04) (hash(x)=36904609) +2134 train 3.743320 (lr=1.1411e-04) (hash(x)=39321620) +2135 train 3.684787 (lr=1.1382e-04) (hash(x)=41971798) +2136 train 3.734105 (lr=1.1354e-04) (hash(x)=49781249) +2137 train 3.734500 (lr=1.1325e-04) (hash(x)=36996371) +2138 train 3.743794 (lr=1.1297e-04) (hash(x)=42007924) +2139 train 3.857929 (lr=1.1269e-04) (hash(x)=37904767) +2140 train 3.832487 (lr=1.1241e-04) (hash(x)=43401743) +2141 train 3.766171 (lr=1.1213e-04) (hash(x)=44928853) +2142 train 3.768771 (lr=1.1185e-04) (hash(x)=39393661) +2143 train 3.766866 (lr=1.1157e-04) (hash(x)=38631638) +2144 train 3.765569 (lr=1.1129e-04) (hash(x)=38012274) +2145 train 3.804099 (lr=1.1101e-04) (hash(x)=40835560) +2146 train 3.813960 (lr=1.1073e-04) (hash(x)=43092534) +2147 train 3.811257 (lr=1.1045e-04) (hash(x)=35197150) +2148 train 3.850454 (lr=1.1018e-04) (hash(x)=42553720) +2149 train 3.788929 (lr=1.0990e-04) (hash(x)=34085881) +2150 val loss 3.7945 +2150 val perplexity 44.4559 +2150 train 3.839499 (lr=1.0963e-04) (hash(x)=35224500) +2151 train 3.842136 (lr=1.0935e-04) (hash(x)=46255889) +2152 train 3.749490 (lr=1.0908e-04) (hash(x)=36950847) +2153 train 3.797845 (lr=1.0881e-04) (hash(x)=42139072) +2154 train 3.816730 (lr=1.0853e-04) (hash(x)=42675870) +2155 train 3.803625 (lr=1.0826e-04) (hash(x)=47566022) +2156 train 3.747784 (lr=1.0799e-04) (hash(x)=40979857) +2157 train 3.742785 (lr=1.0772e-04) (hash(x)=41365290) +2158 train 3.782106 (lr=1.0745e-04) (hash(x)=44831367) +2159 train 3.814404 (lr=1.0718e-04) (hash(x)=40747018) +2160 train 3.774070 (lr=1.0692e-04) (hash(x)=40650554) +2161 train 3.756911 (lr=1.0665e-04) (hash(x)=49755242) +2162 train 3.814522 (lr=1.0638e-04) (hash(x)=38363949) +2163 train 3.803551 (lr=1.0612e-04) (hash(x)=33604327) +2164 train 3.828099 (lr=1.0585e-04) (hash(x)=43634716) +2165 train 3.806095 (lr=1.0559e-04) (hash(x)=38991800) +2166 train 3.809601 (lr=1.0532e-04) (hash(x)=42068001) +2167 train 3.775532 (lr=1.0506e-04) (hash(x)=41872488) +2168 train 3.792693 (lr=1.0480e-04) (hash(x)=40377147) +2169 train 3.805320 (lr=1.0453e-04) (hash(x)=44291998) +2170 train 3.775573 (lr=1.0427e-04) (hash(x)=42734023) +2171 train 3.734008 (lr=1.0401e-04) (hash(x)=45214790) +2172 train 3.790123 (lr=1.0375e-04) (hash(x)=40288990) +2173 train 3.735047 (lr=1.0349e-04) (hash(x)=37144489) +2174 train 3.763420 (lr=1.0324e-04) (hash(x)=36440952) +2175 val loss 3.7913 +2175 val perplexity 44.3151 +2175 train 3.729948 (lr=1.0298e-04) (hash(x)=43580350) +2176 train 3.791343 (lr=1.0272e-04) (hash(x)=39150853) +2177 train 3.796644 (lr=1.0247e-04) (hash(x)=40730401) +2178 train 3.838979 (lr=1.0221e-04) (hash(x)=37884561) +2179 train 3.748332 (lr=1.0196e-04) (hash(x)=38297541) +2180 train 3.809679 (lr=1.0170e-04) (hash(x)=40865795) +2181 train 3.803254 (lr=1.0145e-04) (hash(x)=38519708) +2182 train 3.793998 (lr=1.0120e-04) (hash(x)=39984561) +2183 train 3.782753 (lr=1.0094e-04) (hash(x)=37853050) +2184 train 3.791171 (lr=1.0069e-04) (hash(x)=40867999) +2185 train 3.786931 (lr=1.0044e-04) (hash(x)=38622093) +2186 train 3.842192 (lr=1.0019e-04) (hash(x)=37691514) +2187 train 3.798338 (lr=9.9942e-05) (hash(x)=43224284) +2188 train 3.801321 (lr=9.9694e-05) (hash(x)=40280977) +2189 train 3.872357 (lr=9.9446e-05) (hash(x)=34169818) +2190 train 3.758863 (lr=9.9199e-05) (hash(x)=36217544) +2191 train 3.839323 (lr=9.8953e-05) (hash(x)=43863529) +2192 train 3.795908 (lr=9.8708e-05) (hash(x)=42861683) +2193 train 3.773411 (lr=9.8463e-05) (hash(x)=38669699) +2194 train 3.786652 (lr=9.8219e-05) (hash(x)=41143457) +2195 train 3.974071 (lr=9.7975e-05) (hash(x)=40971031) +2196 train 3.808803 (lr=9.7733e-05) (hash(x)=39204899) +2197 train 3.776363 (lr=9.7491e-05) (hash(x)=38410420) +2198 train 3.774312 (lr=9.7250e-05) (hash(x)=38837215) +2199 train 3.781172 (lr=9.7009e-05) (hash(x)=38634035) +2200 val loss 3.7910 +2200 val perplexity 44.2997 +2200 train 3.752847 (lr=9.6769e-05) (hash(x)=41984003) +2201 train 3.745008 (lr=9.6530e-05) (hash(x)=42937984) +2202 train 3.777761 (lr=9.6292e-05) (hash(x)=40482054) +2203 train 3.804918 (lr=9.6054e-05) (hash(x)=41396009) +2204 train 3.811574 (lr=9.5818e-05) (hash(x)=38927364) +2205 train 3.753078 (lr=9.5581e-05) (hash(x)=36950043) +2206 train 3.809823 (lr=9.5346e-05) (hash(x)=40344078) +2207 train 3.725853 (lr=9.5111e-05) (hash(x)=40171443) +2208 train 3.780766 (lr=9.4877e-05) (hash(x)=47098079) +2209 train 3.753957 (lr=9.4644e-05) (hash(x)=48539416) +2210 train 3.823893 (lr=9.4412e-05) (hash(x)=44910428) +2211 train 3.791901 (lr=9.4180e-05) (hash(x)=40144468) +2212 train 3.746784 (lr=9.3949e-05) (hash(x)=43802908) +2213 train 3.809205 (lr=9.3718e-05) (hash(x)=40245003) +2214 train 3.835301 (lr=9.3489e-05) (hash(x)=39156495) +2215 train 3.794668 (lr=9.3260e-05) (hash(x)=37221583) +2216 train 3.810029 (lr=9.3032e-05) (hash(x)=38639981) +2217 train 3.771368 (lr=9.2804e-05) (hash(x)=43083541) +2218 train 3.765256 (lr=9.2578e-05) (hash(x)=43349665) +2219 train 3.787565 (lr=9.2352e-05) (hash(x)=41877177) +2220 train 3.780956 (lr=9.2127e-05) (hash(x)=36883776) +2221 train 3.775681 (lr=9.1902e-05) (hash(x)=42776720) +2222 train 3.756567 (lr=9.1679e-05) (hash(x)=38107696) +2223 train 3.803008 (lr=9.1456e-05) (hash(x)=38976337) +2224 train 3.800628 (lr=9.1233e-05) (hash(x)=39316931) +2225 val loss 3.7883 +2225 val perplexity 44.1818 +2225 train 3.889447 (lr=9.1012e-05) (hash(x)=41949348) +2226 train 3.812604 (lr=9.0791e-05) (hash(x)=41335177) +2227 train 3.771749 (lr=9.0571e-05) (hash(x)=50065538) +2228 train 3.746773 (lr=9.0352e-05) (hash(x)=47679048) +2229 train 3.777399 (lr=9.0133e-05) (hash(x)=37926332) +2230 train 3.866730 (lr=8.9916e-05) (hash(x)=40212343) +2231 train 3.787912 (lr=8.9698e-05) (hash(x)=41011968) +2232 train 3.819930 (lr=8.9482e-05) (hash(x)=42133693) +2233 train 3.797124 (lr=8.9267e-05) (hash(x)=37794211) +2234 train 3.795646 (lr=8.9052e-05) (hash(x)=34176395) +2235 train 3.699573 (lr=8.8838e-05) (hash(x)=38226697) +2236 train 3.777795 (lr=8.8624e-05) (hash(x)=40166049) +2237 train 3.740031 (lr=8.8412e-05) (hash(x)=44381770) +2238 train 3.768285 (lr=8.8200e-05) (hash(x)=35001697) +2239 train 3.757761 (lr=8.7989e-05) (hash(x)=40788823) +2240 train 3.794073 (lr=8.7779e-05) (hash(x)=39648969) +2241 train 3.785238 (lr=8.7569e-05) (hash(x)=42598191) +2242 train 3.702050 (lr=8.7360e-05) (hash(x)=36371769) +2243 train 3.801972 (lr=8.7152e-05) (hash(x)=39837713) +2244 train 3.800313 (lr=8.6945e-05) (hash(x)=38771833) +2245 train 3.789619 (lr=8.6738e-05) (hash(x)=36942426) +2246 train 3.788414 (lr=8.6533e-05) (hash(x)=44875672) +2247 train 3.868347 (lr=8.6328e-05) (hash(x)=34491202) +2248 train 3.758946 (lr=8.6123e-05) (hash(x)=40141543) +2249 train 3.806669 (lr=8.5920e-05) (hash(x)=41451740) +2250 val loss 3.7862 +2250 val perplexity 44.0904 +2250 train 3.764990 (lr=8.5717e-05) (hash(x)=39445216) +2251 train 3.771467 (lr=8.5515e-05) (hash(x)=38483448) +2252 train 3.779455 (lr=8.5314e-05) (hash(x)=35441096) +2253 train 3.822761 (lr=8.5113e-05) (hash(x)=48358855) +2254 train 3.802416 (lr=8.4913e-05) (hash(x)=39586200) +2255 train 3.755500 (lr=8.4714e-05) (hash(x)=43329025) +2256 train 3.739339 (lr=8.4516e-05) (hash(x)=41812624) +2257 train 3.776545 (lr=8.4319e-05) (hash(x)=37008184) +2258 train 3.778099 (lr=8.4122e-05) (hash(x)=40146890) +2259 train 3.808932 (lr=8.3926e-05) (hash(x)=40226765) +2260 train 3.773490 (lr=8.3731e-05) (hash(x)=42919560) +2261 train 3.811203 (lr=8.3536e-05) (hash(x)=37565454) +2262 train 3.782438 (lr=8.3343e-05) (hash(x)=37184854) +2263 train 3.791943 (lr=8.3150e-05) (hash(x)=37781767) +2264 train 3.777011 (lr=8.2958e-05) (hash(x)=37980189) +2265 train 3.797772 (lr=8.2766e-05) (hash(x)=38457368) +2266 train 3.764022 (lr=8.2576e-05) (hash(x)=39651530) +2267 train 3.777492 (lr=8.2386e-05) (hash(x)=44559286) +2268 train 3.765334 (lr=8.2197e-05) (hash(x)=40222481) +2269 train 3.770276 (lr=8.2009e-05) (hash(x)=44433657) +2270 train 3.743004 (lr=8.1821e-05) (hash(x)=40059512) +2271 train 3.815142 (lr=8.1634e-05) (hash(x)=41231011) +2272 train 3.732862 (lr=8.1448e-05) (hash(x)=41771600) +2273 train 3.730464 (lr=8.1263e-05) (hash(x)=40889707) +2274 train 3.803696 (lr=8.1079e-05) (hash(x)=39351430) +2275 val loss 3.7857 +2275 val perplexity 44.0673 +2275 train 3.770863 (lr=8.0895e-05) (hash(x)=40301291) +2276 train 3.728756 (lr=8.0712e-05) (hash(x)=37928094) +2277 train 3.794904 (lr=8.0530e-05) (hash(x)=42034244) +2278 train 3.826924 (lr=8.0348e-05) (hash(x)=38547982) +2279 train 3.789232 (lr=8.0168e-05) (hash(x)=40565532) +2280 train 3.774305 (lr=7.9988e-05) (hash(x)=35255406) +2281 train 3.786320 (lr=7.9809e-05) (hash(x)=37587906) +2282 train 3.797111 (lr=7.9631e-05) (hash(x)=37746290) +2283 train 3.784977 (lr=7.9453e-05) (hash(x)=38894878) +2284 train 3.779605 (lr=7.9276e-05) (hash(x)=39220146) +2285 train 3.825650 (lr=7.9101e-05) (hash(x)=42360719) +2286 train 3.768232 (lr=7.8925e-05) (hash(x)=37109232) +2287 train 3.947362 (lr=7.8751e-05) (hash(x)=48261560) +2288 train 3.785933 (lr=7.8577e-05) (hash(x)=41249581) +2289 train 3.821084 (lr=7.8405e-05) (hash(x)=39670452) +2290 train 3.769676 (lr=7.8232e-05) (hash(x)=47781193) +2291 train 3.852083 (lr=7.8061e-05) (hash(x)=42629380) +2292 train 3.773879 (lr=7.7891e-05) (hash(x)=38535581) +2293 train 3.839136 (lr=7.7721e-05) (hash(x)=47746191) +2294 train 3.838897 (lr=7.7552e-05) (hash(x)=40259902) +2295 train 3.816273 (lr=7.7384e-05) (hash(x)=39537448) +2296 train 3.822213 (lr=7.7217e-05) (hash(x)=37777315) +2297 train 3.787373 (lr=7.7050e-05) (hash(x)=40681229) +2298 train 3.777983 (lr=7.6884e-05) (hash(x)=35113270) +2299 train 3.767390 (lr=7.6719e-05) (hash(x)=37606330) +2300 val loss 3.7840 +2300 val perplexity 43.9900 +2300 train 3.770551 (lr=7.6555e-05) (hash(x)=34040447) +2301 train 3.810054 (lr=7.6392e-05) (hash(x)=40970735) +2302 train 3.712697 (lr=7.6229e-05) (hash(x)=35693537) +2303 train 3.727605 (lr=7.6067e-05) (hash(x)=41366962) +2304 train 3.798949 (lr=7.5906e-05) (hash(x)=35918945) +2305 train 3.800185 (lr=7.5746e-05) (hash(x)=38352147) +2306 train 3.751740 (lr=7.5586e-05) (hash(x)=39116071) +2307 train 3.742714 (lr=7.5427e-05) (hash(x)=44846442) +2308 train 3.746845 (lr=7.5269e-05) (hash(x)=39479190) +2309 train 3.768311 (lr=7.5112e-05) (hash(x)=38554415) +2310 train 3.811019 (lr=7.4956e-05) (hash(x)=37521926) +2311 train 3.803897 (lr=7.4800e-05) (hash(x)=39322712) +2312 train 3.784559 (lr=7.4646e-05) (hash(x)=38914190) +2313 train 3.847715 (lr=7.4492e-05) (hash(x)=42369738) +2314 train 3.811624 (lr=7.4338e-05) (hash(x)=38306662) +2315 train 3.735158 (lr=7.4186e-05) (hash(x)=41587549) +2316 train 3.777252 (lr=7.4034e-05) (hash(x)=41109277) +2317 train 3.784209 (lr=7.3884e-05) (hash(x)=44848245) +2318 train 3.815646 (lr=7.3734e-05) (hash(x)=42285511) +2319 train 3.797773 (lr=7.3584e-05) (hash(x)=42929188) +2320 train 3.759415 (lr=7.3436e-05) (hash(x)=42628372) +2321 train 3.703044 (lr=7.3288e-05) (hash(x)=41858484) +2322 train 3.794204 (lr=7.3141e-05) (hash(x)=38660349) +2323 train 3.812684 (lr=7.2995e-05) (hash(x)=36412462) +2324 train 3.790573 (lr=7.2850e-05) (hash(x)=38847051) +2325 val loss 3.7816 +2325 val perplexity 43.8845 +2325 train 3.838848 (lr=7.2706e-05) (hash(x)=37981570) +2326 train 3.838377 (lr=7.2562e-05) (hash(x)=37938173) +2327 train 3.851518 (lr=7.2419e-05) (hash(x)=38878355) +2328 train 3.834009 (lr=7.2277e-05) (hash(x)=35203344) +2329 train 3.884616 (lr=7.2136e-05) (hash(x)=40670930) +2330 train 3.802889 (lr=7.1995e-05) (hash(x)=36126074) +2331 train 3.821551 (lr=7.1856e-05) (hash(x)=46417802) +2332 train 3.830666 (lr=7.1717e-05) (hash(x)=41992748) +2333 train 3.817981 (lr=7.1579e-05) (hash(x)=36068607) +2334 train 3.760583 (lr=7.1441e-05) (hash(x)=35410081) +2335 train 3.786437 (lr=7.1305e-05) (hash(x)=37418926) +2336 train 3.777277 (lr=7.1169e-05) (hash(x)=42737127) +2337 train 3.762483 (lr=7.1034e-05) (hash(x)=35539618) +2338 train 3.780051 (lr=7.0900e-05) (hash(x)=37547816) +2339 train 3.702289 (lr=7.0767e-05) (hash(x)=38335445) +2340 train 3.772983 (lr=7.0635e-05) (hash(x)=44165715) +2341 train 3.782083 (lr=7.0503e-05) (hash(x)=39489311) +2342 train 3.764787 (lr=7.0372e-05) (hash(x)=43843813) +2343 train 3.776157 (lr=7.0242e-05) (hash(x)=39468187) +2344 train 3.828331 (lr=7.0113e-05) (hash(x)=41353414) +2345 train 3.775594 (lr=6.9984e-05) (hash(x)=43177896) +2346 train 3.797185 (lr=6.9857e-05) (hash(x)=43914915) +2347 train 3.776797 (lr=6.9730e-05) (hash(x)=40836646) +2348 train 3.789246 (lr=6.9604e-05) (hash(x)=37812362) +2349 train 3.784328 (lr=6.9479e-05) (hash(x)=38888940) +2350 val loss 3.7808 +2350 val perplexity 43.8530 +2350 train 3.830515 (lr=6.9354e-05) (hash(x)=45708099) +2351 train 3.735385 (lr=6.9231e-05) (hash(x)=43688863) +2352 train 3.828375 (lr=6.9108e-05) (hash(x)=35419353) +2353 train 3.753122 (lr=6.8986e-05) (hash(x)=36220541) +2354 train 3.754469 (lr=6.8865e-05) (hash(x)=39330276) +2355 train 3.773690 (lr=6.8744e-05) (hash(x)=40298228) +2356 train 3.840383 (lr=6.8625e-05) (hash(x)=39622122) +2357 train 3.821913 (lr=6.8506e-05) (hash(x)=41162445) +2358 train 3.789783 (lr=6.8388e-05) (hash(x)=42147375) +2359 train 3.800432 (lr=6.8271e-05) (hash(x)=39700487) +2360 train 3.933139 (lr=6.8155e-05) (hash(x)=45666720) +2361 train 3.826661 (lr=6.8039e-05) (hash(x)=37873456) +2362 train 3.804511 (lr=6.7925e-05) (hash(x)=43630915) +2363 train 3.787045 (lr=6.7811e-05) (hash(x)=39689679) +2364 train 3.756790 (lr=6.7698e-05) (hash(x)=42150154) +2365 train 3.788090 (lr=6.7585e-05) (hash(x)=41723186) +2366 train 3.770043 (lr=6.7474e-05) (hash(x)=38677848) +2367 train 3.799750 (lr=6.7363e-05) (hash(x)=35931669) +2368 train 3.810210 (lr=6.7254e-05) (hash(x)=37214582) +2369 train 3.756329 (lr=6.7145e-05) (hash(x)=37068356) +2370 train 3.751044 (lr=6.7036e-05) (hash(x)=40764767) +2371 train 3.833463 (lr=6.6929e-05) (hash(x)=33413674) +2372 train 3.796353 (lr=6.6822e-05) (hash(x)=41871393) +2373 train 3.768484 (lr=6.6717e-05) (hash(x)=33828528) +2374 train 3.766057 (lr=6.6612e-05) (hash(x)=43674823) +2375 val loss 3.7811 +2375 val perplexity 43.8654 +2375 train 3.808656 (lr=6.6508e-05) (hash(x)=42820633) +2376 train 3.897954 (lr=6.6404e-05) (hash(x)=39008244) +2377 train 3.777468 (lr=6.6302e-05) (hash(x)=36494374) +2378 train 3.830704 (lr=6.6200e-05) (hash(x)=41624854) +2379 train 3.778040 (lr=6.6099e-05) (hash(x)=40219047) +2380 train 3.842057 (lr=6.5999e-05) (hash(x)=54123076) +2381 train 3.750406 (lr=6.5900e-05) (hash(x)=43966942) +2382 train 3.781203 (lr=6.5802e-05) (hash(x)=45325944) +2383 train 3.791243 (lr=6.5704e-05) (hash(x)=40703059) +2384 train 3.759757 (lr=6.5607e-05) (hash(x)=40264880) +2385 train 3.811032 (lr=6.5511e-05) (hash(x)=49915908) +2386 train 3.795084 (lr=6.5416e-05) (hash(x)=42872803) +2387 train 3.850036 (lr=6.5322e-05) (hash(x)=42166253) +2388 train 3.814170 (lr=6.5229e-05) (hash(x)=38122933) +2389 train 3.740597 (lr=6.5136e-05) (hash(x)=41476118) +2390 train 3.755968 (lr=6.5044e-05) (hash(x)=45518852) +2391 train 3.789952 (lr=6.4953e-05) (hash(x)=41143990) +2392 train 3.813070 (lr=6.4863e-05) (hash(x)=36558269) +2393 train 3.764839 (lr=6.4774e-05) (hash(x)=41891218) +2394 train 3.784952 (lr=6.4685e-05) (hash(x)=44368637) +2395 train 3.786369 (lr=6.4597e-05) (hash(x)=42116707) +2396 train 3.756527 (lr=6.4510e-05) (hash(x)=36192602) +2397 train 3.822416 (lr=6.4424e-05) (hash(x)=43562548) +2398 train 3.810754 (lr=6.4339e-05) (hash(x)=32521419) +2399 train 3.777400 (lr=6.4255e-05) (hash(x)=41933861) +2400 val loss 3.7791 +2400 val perplexity 43.7752 +2400 train 3.767545 (lr=6.4171e-05) (hash(x)=44481467) +2401 train 3.793176 (lr=6.4088e-05) (hash(x)=39350473) +2402 train 3.830543 (lr=6.4006e-05) (hash(x)=43638791) +2403 train 3.773710 (lr=6.3925e-05) (hash(x)=41458794) +2404 train 3.843570 (lr=6.3845e-05) (hash(x)=35064103) +2405 train 3.798037 (lr=6.3765e-05) (hash(x)=42336591) +2406 train 3.724931 (lr=6.3687e-05) (hash(x)=37726126) +2407 train 3.756518 (lr=6.3609e-05) (hash(x)=43581379) +2408 train 3.776959 (lr=6.3532e-05) (hash(x)=40929071) +2409 train 3.776130 (lr=6.3456e-05) (hash(x)=43768688) +2410 train 3.755063 (lr=6.3380e-05) (hash(x)=46186293) +2411 train 3.755984 (lr=6.3306e-05) (hash(x)=37445746) +2412 train 3.731514 (lr=6.3232e-05) (hash(x)=39253300) +2413 train 3.796452 (lr=6.3159e-05) (hash(x)=39960699) +2414 train 3.737607 (lr=6.3087e-05) (hash(x)=37198589) +2415 train 3.785713 (lr=6.3016e-05) (hash(x)=36679461) +2416 train 3.736984 (lr=6.2945e-05) (hash(x)=45627160) +2417 train 3.841328 (lr=6.2876e-05) (hash(x)=41254159) +2418 train 3.809939 (lr=6.2807e-05) (hash(x)=34858445) +2419 train 3.792015 (lr=6.2739e-05) (hash(x)=40325069) +2420 train 3.811259 (lr=6.2672e-05) (hash(x)=48357935) +2421 train 3.738984 (lr=6.2606e-05) (hash(x)=44823044) +2422 train 3.786507 (lr=6.2540e-05) (hash(x)=40241134) +2423 train 3.760263 (lr=6.2476e-05) (hash(x)=38967952) +2424 train 3.751554 (lr=6.2412e-05) (hash(x)=40832508) +2425 val loss 3.7772 +2425 val perplexity 43.6942 +2425 train 3.744011 (lr=6.2349e-05) (hash(x)=35706307) +2426 train 3.831644 (lr=6.2287e-05) (hash(x)=39292456) +2427 train 3.794119 (lr=6.2225e-05) (hash(x)=42857686) +2428 train 3.825758 (lr=6.2165e-05) (hash(x)=39364380) +2429 train 3.746689 (lr=6.2105e-05) (hash(x)=39674279) +2430 train 3.817559 (lr=6.2046e-05) (hash(x)=42485354) +2431 train 3.793001 (lr=6.1988e-05) (hash(x)=35054889) +2432 train 3.813324 (lr=6.1931e-05) (hash(x)=37672480) +2433 train 3.809974 (lr=6.1875e-05) (hash(x)=44834481) +2434 train 3.811116 (lr=6.1820e-05) (hash(x)=41914793) +2435 train 3.792212 (lr=6.1765e-05) (hash(x)=37613338) +2436 train 3.777019 (lr=6.1711e-05) (hash(x)=40441749) +2437 train 3.788423 (lr=6.1658e-05) (hash(x)=44364671) +2438 train 3.810727 (lr=6.1606e-05) (hash(x)=42100261) +2439 train 3.820361 (lr=6.1555e-05) (hash(x)=40248905) +2440 train 3.805113 (lr=6.1504e-05) (hash(x)=37515629) +2441 train 3.795601 (lr=6.1454e-05) (hash(x)=38862361) +2442 train 3.784097 (lr=6.1406e-05) (hash(x)=37239618) +2443 train 3.740152 (lr=6.1358e-05) (hash(x)=39546071) +2444 train 3.828813 (lr=6.1310e-05) (hash(x)=35899286) +2445 train 3.797647 (lr=6.1264e-05) (hash(x)=37596139) +2446 train 3.742680 (lr=6.1218e-05) (hash(x)=37030027) +2447 train 3.777825 (lr=6.1174e-05) (hash(x)=38386486) +2448 train 3.820624 (lr=6.1130e-05) (hash(x)=37373875) +2449 train 3.803969 (lr=6.1087e-05) (hash(x)=40742492) +2450 val loss 3.7777 +2450 val perplexity 43.7172 +2450 train 3.763089 (lr=6.1045e-05) (hash(x)=41216868) +2451 train 3.765787 (lr=6.1003e-05) (hash(x)=39135201) +2452 train 3.742214 (lr=6.0963e-05) (hash(x)=37679684) +2453 train 3.755149 (lr=6.0923e-05) (hash(x)=37321495) +2454 train 3.815914 (lr=6.0884e-05) (hash(x)=41587536) +2455 train 3.715799 (lr=6.0846e-05) (hash(x)=48685042) +2456 train 3.752185 (lr=6.0809e-05) (hash(x)=39615396) +2457 train 3.766255 (lr=6.0773e-05) (hash(x)=37062876) +2458 train 3.785966 (lr=6.0737e-05) (hash(x)=46686625) +2459 train 3.748286 (lr=6.0703e-05) (hash(x)=40455762) +2460 train 3.764948 (lr=6.0669e-05) (hash(x)=37681309) +2461 train 3.791641 (lr=6.0636e-05) (hash(x)=43356300) +2462 train 3.724435 (lr=6.0604e-05) (hash(x)=39515329) +2463 train 3.845265 (lr=6.0572e-05) (hash(x)=41007182) +2464 train 3.781408 (lr=6.0542e-05) (hash(x)=43000164) +2465 train 3.786151 (lr=6.0512e-05) (hash(x)=38653004) +2466 train 3.800663 (lr=6.0483e-05) (hash(x)=40598079) +2467 train 3.814377 (lr=6.0455e-05) (hash(x)=41026238) +2468 train 3.787941 (lr=6.0428e-05) (hash(x)=35958594) +2469 train 3.796092 (lr=6.0402e-05) (hash(x)=42114985) +2470 train 3.803588 (lr=6.0376e-05) (hash(x)=46186157) +2471 train 3.795648 (lr=6.0352e-05) (hash(x)=36399528) +2472 train 3.785455 (lr=6.0328e-05) (hash(x)=38265887) +2473 train 3.823287 (lr=6.0305e-05) (hash(x)=40769984) +2474 train 3.800354 (lr=6.0283e-05) (hash(x)=43133488) +2475 val loss 3.7761 +2475 val perplexity 43.6450 +2475 train 3.805627 (lr=6.0261e-05) (hash(x)=42545052) +2476 train 3.726729 (lr=6.0241e-05) (hash(x)=40478528) +2477 train 3.783800 (lr=6.0221e-05) (hash(x)=39840853) +2478 train 3.804862 (lr=6.0202e-05) (hash(x)=37602465) +2479 train 3.786509 (lr=6.0184e-05) (hash(x)=41293883) +2480 train 3.733209 (lr=6.0167e-05) (hash(x)=39398840) +2481 train 3.772811 (lr=6.0151e-05) (hash(x)=35356617) +2482 train 3.712527 (lr=6.0135e-05) (hash(x)=35816064) +2483 train 3.747961 (lr=6.0121e-05) (hash(x)=48591339) +2484 train 3.731586 (lr=6.0107e-05) (hash(x)=50929146) +2485 train 3.790689 (lr=6.0094e-05) (hash(x)=37022238) +2486 train 3.759753 (lr=6.0082e-05) (hash(x)=46737415) +2487 train 3.739746 (lr=6.0071e-05) (hash(x)=45060130) +2488 train 3.751466 (lr=6.0060e-05) (hash(x)=41278189) +2489 train 3.873823 (lr=6.0051e-05) (hash(x)=37159304) +2490 train 3.787562 (lr=6.0042e-05) (hash(x)=42405877) +2491 train 3.782930 (lr=6.0034e-05) (hash(x)=39434693) +2492 train 3.890589 (lr=6.0027e-05) (hash(x)=41431319) +2493 train 3.762928 (lr=6.0020e-05) (hash(x)=52430946) +2494 train 3.788917 (lr=6.0015e-05) (hash(x)=40211356) +2495 train 3.807387 (lr=6.0010e-05) (hash(x)=39696767) +2496 train 3.750281 (lr=6.0007e-05) (hash(x)=38921623) +2497 train 3.793504 (lr=6.0004e-05) (hash(x)=41010258) +2498 train 3.770416 (lr=6.0002e-05) (hash(x)=37389979) +2499 val loss 3.7746 +2499 val perplexity 43.5804 +2499 train 3.770425 (lr=6.0000e-05) (hash(x)=35819836) diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/model_02499.pt b/self_to_selective_run_1_restarted_with_memory_penalty/model_02499.pt new file mode 100644 index 0000000000000000000000000000000000000000..1137002cada0cb52573a40dc93d2c0cef1c7761a --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/model_02499.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e7cdeb024a3633d9328113f28bbb83b56fba97ff0433fc5e224b3b823482f89 +size 557740354 diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/optimizer_02499.pt b/self_to_selective_run_1_restarted_with_memory_penalty/optimizer_02499.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b84d1dd7b67654de2eae8fc4b22057f1d9d721b --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/optimizer_02499.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f566c276bcd33c8802361684c32a9d1399158949c5467534c1d7b266e845c1be +size 1014834310 diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/debug-internal.log b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b6da6d19448f7ab20f243a6f16f096830f8b46ce --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-02-13T08:52:20.82612412Z","level":"INFO","msg":"stream: starting","core version":"0.19.6","symlink path":"self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-core.log"} +{"time":"2025-02-13T08:52:21.03727827Z","level":"INFO","msg":"created new stream","id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037339038Z","level":"INFO","msg":"stream: started","id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037414046Z","level":"INFO","msg":"writer: Do: started","stream_id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037478925Z","level":"INFO","msg":"sender: started","stream_id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037515744Z","level":"INFO","msg":"handler: started","stream_id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.267231208Z","level":"INFO","msg":"Starting system monitor"} diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/debug.log b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8d71523eeef2967584a6b2d3533f5ce2061c4a6a --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/debug.log @@ -0,0 +1,22 @@ +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Current SDK version is 0.19.6 +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Configure stats pid to 6274 +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Loading settings from /root/.config/wandb/settings +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Loading settings from /workspace/context-compression/wandb/settings +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Loading settings from environment variables +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:setup_run_log_directory():637] Logging user logs to self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug.log +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:setup_run_log_directory():638] Logging internal logs to self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-internal.log +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:init():756] calling init triggers +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:init():761] wandb.init called with sweep_config: {} +config: {'hellaswag': True, 'attention_kind': , 'log_dir': 'self_to_selective_run_1_restarted_with_memory_penalty', 'resume_checkpoint': 'hf://andrew-healey/context-compression/unselective_run_0/model_07500.pt', 'resume_optimizer': False, 'add_a_head': True, 'add_head_to_start': True, 'new_head_init': , 'protect_bos_token': True, 'max_steps': 2500, 'group': 'selective_surgery_3', 'use_wandb': True, 'kill_self_after_run': False, '_wandb': {}} +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:init():789] starting backend +2025-02-13 08:52:20,815 INFO MainThread:6274 [wandb_init.py:init():793] sending inform_init request +2025-02-13 08:52:20,822 INFO MainThread:6274 [backend.py:_multiprocessing_setup():97] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-02-13 08:52:20,822 INFO MainThread:6274 [wandb_init.py:init():808] backend started and connected +2025-02-13 08:52:20,824 INFO MainThread:6274 [wandb_init.py:init():901] updated telemetry +2025-02-13 08:52:20,833 INFO MainThread:6274 [wandb_init.py:init():936] communicating run to backend with 90.0 second timeout +2025-02-13 08:52:21,264 INFO MainThread:6274 [wandb_init.py:init():994] starting run threads in backend +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_console_start():2385] atexit reg +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_redirect():2235] redirect: wrap_raw +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_redirect():2300] Wrapping output streams. +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_redirect():2325] Redirects installed. +2025-02-13 08:52:21,388 INFO MainThread:6274 [wandb_init.py:init():1036] run started, returning control to user process diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/output.log b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1fad921e4a1822eee81532233415458762dba461 --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/output.log @@ -0,0 +1,2716 @@ +Resuming from hf://andrew-healey/context-compression/unselective_run_0/model_07500.pt +Downloading from andrew-healey/context-compression to unselective_run_0/model_07500.pt +/workspace/context-compression/context_compression/train.py:199: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + checkpoint = torch.load(hf_hub_download(repo_id, rel_path)) +Resuming dataloader state from /root/.cache/huggingface/hub/models--andrew-healey--context-compression/snapshots/89ea27355ce44590ccc5e5988eb923f79db77430/unselective_run_0/dataloader_07500.pt +/workspace/context-compression/context_compression/train.py:232: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + dataloader_state = torch.load(dataloader_path) +ADDING A HEAD +validation loss: 4.4279 +validation perplexity: 83.7545 +step 0 | loss: 4.393443 | lr 8.3916e-07 | norm: 0.2554 | dt: 42689.31ms | tok/sec: 12281.48 +step 1 | loss: 4.408799 | lr 1.6783e-06 | norm: 0.2436 | dt: 1239.29ms | tok/sec: 423056.60 +step 2 | loss: 4.448466 | lr 2.5175e-06 | norm: 278.0988 | dt: 1233.99ms | tok/sec: 424872.01 +step 3 | loss: 4.339198 | lr 3.3566e-06 | norm: 308.1567 | dt: 1233.23ms | tok/sec: 425134.20 +step 4 | loss: 4.191264 | lr 4.1958e-06 | norm: 268.8410 | dt: 1232.23ms | tok/sec: 425478.37 +step 5 | loss: 4.139318 | lr 5.0350e-06 | norm: 147.2222 | dt: 1234.01ms | tok/sec: 424865.86 +step 6 | loss: 4.075793 | lr 5.8741e-06 | norm: 87.1055 | dt: 1234.54ms | tok/sec: 424682.23 +step 7 | loss: 3.998695 | lr 6.7133e-06 | norm: 57.9147 | dt: 1231.99ms | tok/sec: 425560.87 +step 8 | loss: 4.040766 | lr 7.5524e-06 | norm: 40.7478 | dt: 1231.50ms | tok/sec: 425730.10 +step 9 | loss: 4.005822 | lr 8.3916e-06 | norm: 26.1085 | dt: 1229.19ms | tok/sec: 426532.66 +step 10 | loss: 4.006109 | lr 9.2308e-06 | norm: 16.5820 | dt: 1231.90ms | tok/sec: 425592.42 +step 11 | loss: 4.010857 | lr 1.0070e-05 | norm: 11.5869 | dt: 1232.60ms | tok/sec: 425351.38 +step 12 | loss: 3.959165 | lr 1.0909e-05 | norm: 8.3072 | dt: 1232.70ms | tok/sec: 425315.35 +step 13 | loss: 3.935296 | lr 1.1748e-05 | norm: 6.2389 | dt: 1230.22ms | tok/sec: 426174.32 +step 14 | loss: 3.915572 | lr 1.2587e-05 | norm: 6.2901 | dt: 1233.02ms | tok/sec: 425205.64 +step 15 | loss: 3.976100 | lr 1.3427e-05 | norm: 6.0623 | dt: 1235.00ms | tok/sec: 424524.24 +step 16 | loss: 3.886858 | lr 1.4266e-05 | norm: 5.4331 | dt: 1233.85ms | tok/sec: 424919.88 +step 17 | loss: 3.988770 | lr 1.5105e-05 | norm: 4.8398 | dt: 1231.73ms | tok/sec: 425651.81 +step 18 | loss: 3.887189 | lr 1.5944e-05 | norm: 4.4333 | dt: 1232.76ms | tok/sec: 425295.77 +step 19 | loss: 3.918703 | lr 1.6783e-05 | norm: 3.1994 | dt: 1235.04ms | tok/sec: 424509.90 +step 20 | loss: 3.932514 | lr 1.7622e-05 | norm: 2.1958 | dt: 1233.60ms | tok/sec: 425005.20 +step 21 | loss: 3.943015 | lr 1.8462e-05 | norm: 3.6968 | dt: 1234.31ms | tok/sec: 424763.11 +step 22 | loss: 3.942523 | lr 1.9301e-05 | norm: 2.8428 | dt: 1233.47ms | tok/sec: 425050.80 +step 23 | loss: 3.893634 | lr 2.0140e-05 | norm: 1.8610 | dt: 1230.98ms | tok/sec: 425910.43 +step 24 | loss: 3.907353 | lr 2.0979e-05 | norm: 2.3915 | dt: 1941.43ms | tok/sec: 270052.84 +validation loss: 3.8895 +validation perplexity: 48.8860 +step 25 | loss: 3.950865 | lr 2.1818e-05 | norm: 1.8844 | dt: 2686.29ms | tok/sec: 195171.71 +step 26 | loss: 3.921378 | lr 2.2657e-05 | norm: 2.4515 | dt: 1232.70ms | tok/sec: 425315.76 +step 27 | loss: 3.916573 | lr 2.3497e-05 | norm: 2.4385 | dt: 1231.44ms | tok/sec: 425753.34 +step 28 | loss: 3.921322 | lr 2.4336e-05 | norm: 1.6360 | dt: 1233.41ms | tok/sec: 425071.58 +step 29 | loss: 3.901302 | lr 2.5175e-05 | norm: 2.5501 | dt: 1234.70ms | tok/sec: 424628.27 +step 30 | loss: 3.853675 | lr 2.6014e-05 | norm: 2.7331 | dt: 1228.59ms | tok/sec: 426738.02 +step 31 | loss: 3.851100 | lr 2.6853e-05 | norm: 1.8070 | dt: 1233.64ms | tok/sec: 424994.20 +step 32 | loss: 3.801766 | lr 2.7692e-05 | norm: 2.3376 | dt: 1232.39ms | tok/sec: 425422.97 +step 33 | loss: 3.815352 | lr 2.8531e-05 | norm: 2.1432 | dt: 1234.80ms | tok/sec: 424594.49 +step 34 | loss: 3.862724 | lr 2.9371e-05 | norm: 4.3786 | dt: 1233.15ms | tok/sec: 425160.10 +step 35 | loss: 3.818541 | lr 3.0210e-05 | norm: 2.1269 | dt: 1233.61ms | tok/sec: 425002.25 +step 36 | loss: 3.844186 | lr 3.1049e-05 | norm: 3.6683 | dt: 1233.45ms | tok/sec: 425058.11 +step 37 | loss: 3.805704 | lr 3.1888e-05 | norm: 3.3498 | dt: 1232.11ms | tok/sec: 425518.96 +step 38 | loss: 3.785592 | lr 3.2727e-05 | norm: 2.2713 | dt: 1231.57ms | tok/sec: 425705.87 +step 39 | loss: 3.912579 | lr 3.3566e-05 | norm: 2.8840 | dt: 1234.50ms | tok/sec: 424696.01 +step 40 | loss: 3.832278 | lr 3.4406e-05 | norm: 2.5318 | dt: 1235.41ms | tok/sec: 424384.96 +step 41 | loss: 3.874651 | lr 3.5245e-05 | norm: 1.4082 | dt: 1232.77ms | tok/sec: 425292.48 +step 42 | loss: 3.906337 | lr 3.6084e-05 | norm: 1.9372 | dt: 1235.47ms | tok/sec: 424364.16 +step 43 | loss: 3.876131 | lr 3.6923e-05 | norm: 1.3725 | dt: 1234.06ms | tok/sec: 424847.96 +step 44 | loss: 3.835607 | lr 3.7762e-05 | norm: 1.7211 | dt: 1230.95ms | tok/sec: 425922.23 +step 45 | loss: 3.927090 | lr 3.8601e-05 | norm: 2.3528 | dt: 1233.35ms | tok/sec: 425091.80 +step 46 | loss: 3.891225 | lr 3.9441e-05 | norm: 1.7672 | dt: 1234.32ms | tok/sec: 424759.91 +step 47 | loss: 3.901740 | lr 4.0280e-05 | norm: 2.0284 | dt: 1236.18ms | tok/sec: 424118.30 +step 48 | loss: 3.906932 | lr 4.1119e-05 | norm: 1.7785 | dt: 1230.43ms | tok/sec: 426101.40 +step 49 | loss: 3.907823 | lr 4.1958e-05 | norm: 2.1312 | dt: 1233.88ms | tok/sec: 424908.88 +validation loss: 3.8596 +validation perplexity: 47.4442 +step 50 | loss: 3.873801 | lr 4.2797e-05 | norm: 1.8386 | dt: 2672.76ms | tok/sec: 196160.11 +step 51 | loss: 3.822859 | lr 4.3636e-05 | norm: 1.5526 | dt: 1233.93ms | tok/sec: 424894.51 +step 52 | loss: 3.868409 | lr 4.4476e-05 | norm: 1.4906 | dt: 1236.49ms | tok/sec: 424012.31 +step 53 | loss: 3.825042 | lr 4.5315e-05 | norm: 1.4569 | dt: 1232.59ms | tok/sec: 425356.15 +step 54 | loss: 3.866691 | lr 4.6154e-05 | norm: 1.9206 | dt: 1235.13ms | tok/sec: 424481.71 +step 55 | loss: 3.836372 | lr 4.6993e-05 | norm: 2.3948 | dt: 1236.81ms | tok/sec: 423904.83 +step 56 | loss: 3.862881 | lr 4.7832e-05 | norm: 1.5177 | dt: 1237.01ms | tok/sec: 423834.89 +step 57 | loss: 3.847629 | lr 4.8671e-05 | norm: 1.6162 | dt: 1233.86ms | tok/sec: 424917.41 +step 58 | loss: 3.963893 | lr 4.9510e-05 | norm: 2.7591 | dt: 1234.06ms | tok/sec: 424847.88 +step 59 | loss: 3.936149 | lr 5.0350e-05 | norm: 2.2900 | dt: 1235.65ms | tok/sec: 424301.44 +step 60 | loss: 3.883785 | lr 5.1189e-05 | norm: 2.2715 | dt: 1232.40ms | tok/sec: 425421.24 +step 61 | loss: 3.876284 | lr 5.2028e-05 | norm: 1.1955 | dt: 1234.93ms | tok/sec: 424549.57 +step 62 | loss: 3.884097 | lr 5.2867e-05 | norm: 1.7030 | dt: 1235.48ms | tok/sec: 424360.56 +step 63 | loss: 3.868871 | lr 5.3706e-05 | norm: 2.3770 | dt: 1233.27ms | tok/sec: 425120.23 +step 64 | loss: 3.847831 | lr 5.4545e-05 | norm: 1.5083 | dt: 1234.64ms | tok/sec: 424648.36 +step 65 | loss: 3.803256 | lr 5.5385e-05 | norm: 1.4376 | dt: 1232.35ms | tok/sec: 425437.54 +step 66 | loss: 3.814696 | lr 5.6224e-05 | norm: 1.7790 | dt: 1233.61ms | tok/sec: 425004.71 +step 67 | loss: 3.859408 | lr 5.7063e-05 | norm: 1.8586 | dt: 1231.69ms | tok/sec: 425665.24 +step 68 | loss: 3.862029 | lr 5.7902e-05 | norm: 1.5564 | dt: 1234.25ms | tok/sec: 424783.29 +step 69 | loss: 3.826919 | lr 5.8741e-05 | norm: 6.7956 | dt: 1232.68ms | tok/sec: 425323.33 +step 70 | loss: 3.816696 | lr 5.9580e-05 | norm: 4.3052 | dt: 1231.25ms | tok/sec: 425817.98 +step 71 | loss: 3.804653 | lr 6.0420e-05 | norm: 3.4156 | dt: 1230.40ms | tok/sec: 426112.38 +step 72 | loss: 3.834329 | lr 6.1259e-05 | norm: 3.3209 | dt: 1234.98ms | tok/sec: 424531.04 +step 73 | loss: 3.846651 | lr 6.2098e-05 | norm: 2.5245 | dt: 1234.55ms | tok/sec: 424680.50 +step 74 | loss: 3.811414 | lr 6.2937e-05 | norm: 2.4236 | dt: 1234.42ms | tok/sec: 424723.98 +validation loss: 3.8538 +validation perplexity: 47.1712 +step 75 | loss: 3.908384 | lr 6.3776e-05 | norm: 2.3487 | dt: 2677.40ms | tok/sec: 195819.66 +step 76 | loss: 3.832153 | lr 6.4615e-05 | norm: 1.5755 | dt: 1236.93ms | tok/sec: 423863.57 +step 77 | loss: 3.867815 | lr 6.5455e-05 | norm: 2.1410 | dt: 1234.46ms | tok/sec: 424709.05 +step 78 | loss: 3.873968 | lr 6.6294e-05 | norm: 1.6188 | dt: 1234.78ms | tok/sec: 424599.49 +step 79 | loss: 3.899307 | lr 6.7133e-05 | norm: 2.2035 | dt: 1235.83ms | tok/sec: 424238.16 +step 80 | loss: 3.868505 | lr 6.7972e-05 | norm: 1.9428 | dt: 1234.91ms | tok/sec: 424554.89 +step 81 | loss: 3.875647 | lr 6.8811e-05 | norm: 1.9564 | dt: 1234.08ms | tok/sec: 424842.71 +step 82 | loss: 3.925524 | lr 6.9650e-05 | norm: 1.5443 | dt: 1233.23ms | tok/sec: 425135.19 +step 83 | loss: 3.862810 | lr 7.0490e-05 | norm: 1.2501 | dt: 1232.21ms | tok/sec: 425487.26 +step 84 | loss: 3.839538 | lr 7.1329e-05 | norm: 1.2301 | dt: 1234.80ms | tok/sec: 424593.09 +step 85 | loss: 3.921173 | lr 7.2168e-05 | norm: 1.6478 | dt: 1231.57ms | tok/sec: 425708.09 +step 86 | loss: 3.859308 | lr 7.3007e-05 | norm: 1.7749 | dt: 1237.09ms | tok/sec: 423806.87 +step 87 | loss: 3.854143 | lr 7.3846e-05 | norm: 1.8833 | dt: 1232.27ms | tok/sec: 425464.05 +step 88 | loss: 3.782822 | lr 7.4685e-05 | norm: 1.3826 | dt: 1232.08ms | tok/sec: 425529.09 +step 89 | loss: 3.901006 | lr 7.5524e-05 | norm: 1.5733 | dt: 1238.61ms | tok/sec: 423286.57 +step 90 | loss: 3.871667 | lr 7.6364e-05 | norm: 1.1559 | dt: 1235.22ms | tok/sec: 424448.53 +step 91 | loss: 3.815237 | lr 7.7203e-05 | norm: 2.1100 | dt: 1235.03ms | tok/sec: 424512.85 +step 92 | loss: 3.826456 | lr 7.8042e-05 | norm: 1.8071 | dt: 1233.70ms | tok/sec: 424970.79 +step 93 | loss: 3.847907 | lr 7.8881e-05 | norm: 2.4650 | dt: 1231.43ms | tok/sec: 425755.49 +step 94 | loss: 3.865739 | lr 7.9720e-05 | norm: 1.8051 | dt: 1234.33ms | tok/sec: 424756.46 +step 95 | loss: 3.799418 | lr 8.0559e-05 | norm: 1.0060 | dt: 1233.94ms | tok/sec: 424888.43 +step 96 | loss: 3.827882 | lr 8.1399e-05 | norm: 3.0398 | dt: 1233.99ms | tok/sec: 424871.93 +step 97 | loss: 3.885617 | lr 8.2238e-05 | norm: 2.3695 | dt: 1235.38ms | tok/sec: 424393.89 +step 98 | loss: 3.861015 | lr 8.3077e-05 | norm: 1.7551 | dt: 1234.86ms | tok/sec: 424571.94 +step 99 | loss: 3.834940 | lr 8.3916e-05 | norm: 3.2244 | dt: 1232.08ms | tok/sec: 425529.83 +validation loss: 3.8509 +validation perplexity: 47.0346 +step 100 | loss: 3.852343 | lr 8.4755e-05 | norm: 1.9768 | dt: 2677.59ms | tok/sec: 195805.92 +step 101 | loss: 3.824832 | lr 8.5594e-05 | norm: 1.1934 | dt: 1236.89ms | tok/sec: 423877.05 +step 102 | loss: 3.850434 | lr 8.6434e-05 | norm: 1.1745 | dt: 1234.09ms | tok/sec: 424839.35 +step 103 | loss: 3.815179 | lr 8.7273e-05 | norm: 1.4941 | dt: 1231.78ms | tok/sec: 425634.68 +step 104 | loss: 3.826632 | lr 8.8112e-05 | norm: 1.4018 | dt: 1234.94ms | tok/sec: 424544.65 +step 105 | loss: 3.822171 | lr 8.8951e-05 | norm: 1.2587 | dt: 1235.93ms | tok/sec: 424205.18 +step 106 | loss: 3.863076 | lr 8.9790e-05 | norm: 1.8532 | dt: 1232.52ms | tok/sec: 425380.51 +step 107 | loss: 3.807552 | lr 9.0629e-05 | norm: 1.8815 | dt: 1233.40ms | tok/sec: 425074.13 +step 108 | loss: 3.829730 | lr 9.1469e-05 | norm: 1.1887 | dt: 1234.69ms | tok/sec: 424630.81 +step 109 | loss: 3.828184 | lr 9.2308e-05 | norm: 1.7355 | dt: 1234.93ms | tok/sec: 424550.14 +step 110 | loss: 3.880381 | lr 9.3147e-05 | norm: 2.2054 | dt: 1234.68ms | tok/sec: 424636.06 +step 111 | loss: 3.834862 | lr 9.3986e-05 | norm: 1.6807 | dt: 1233.95ms | tok/sec: 424886.63 +step 112 | loss: 3.845744 | lr 9.4825e-05 | norm: 1.0931 | dt: 1236.77ms | tok/sec: 423915.78 +step 113 | loss: 3.898971 | lr 9.5664e-05 | norm: 1.1179 | dt: 1234.82ms | tok/sec: 424585.31 +step 114 | loss: 3.884350 | lr 9.6503e-05 | norm: 1.8070 | dt: 1235.34ms | tok/sec: 424409.21 +step 115 | loss: 3.823194 | lr 9.7343e-05 | norm: 1.6716 | dt: 1231.78ms | tok/sec: 425634.59 +step 116 | loss: 3.888769 | lr 9.8182e-05 | norm: 1.9516 | dt: 1236.05ms | tok/sec: 424162.88 +step 117 | loss: 3.849446 | lr 9.9021e-05 | norm: 1.5300 | dt: 1232.58ms | tok/sec: 425357.31 +step 118 | loss: 3.854818 | lr 9.9860e-05 | norm: 2.0332 | dt: 1231.49ms | tok/sec: 425735.29 +step 119 | loss: 3.871190 | lr 1.0070e-04 | norm: 1.6712 | dt: 1234.35ms | tok/sec: 424749.33 +step 120 | loss: 3.901891 | lr 1.0154e-04 | norm: 2.4602 | dt: 1233.55ms | tok/sec: 425025.08 +step 121 | loss: 3.806935 | lr 1.0238e-04 | norm: 1.9759 | dt: 1234.02ms | tok/sec: 424861.67 +step 122 | loss: 3.861631 | lr 1.0322e-04 | norm: 1.3980 | dt: 1238.62ms | tok/sec: 423282.82 +step 123 | loss: 3.857289 | lr 1.0406e-04 | norm: 1.4087 | dt: 1237.66ms | tok/sec: 423611.67 +step 124 | loss: 3.859062 | lr 1.0490e-04 | norm: 2.5444 | dt: 1237.10ms | tok/sec: 423805.16 +validation loss: 3.8509 +validation perplexity: 47.0376 +step 125 | loss: 3.888362 | lr 1.0573e-04 | norm: 1.9239 | dt: 2680.85ms | tok/sec: 195567.98 +step 126 | loss: 3.913844 | lr 1.0657e-04 | norm: 1.2123 | dt: 1234.18ms | tok/sec: 424807.17 +step 127 | loss: 3.857267 | lr 1.0741e-04 | norm: 2.3743 | dt: 1233.18ms | tok/sec: 425149.58 +step 128 | loss: 3.842176 | lr 1.0825e-04 | norm: 1.5458 | dt: 1234.22ms | tok/sec: 424792.57 +step 129 | loss: 3.847849 | lr 1.0909e-04 | norm: 2.9894 | dt: 1233.24ms | tok/sec: 425129.77 +step 130 | loss: 3.882656 | lr 1.0993e-04 | norm: 2.6714 | dt: 1234.01ms | tok/sec: 424864.46 +step 131 | loss: 3.851478 | lr 1.1077e-04 | norm: 2.1820 | dt: 1237.21ms | tok/sec: 423766.20 +step 132 | loss: 3.828274 | lr 1.1161e-04 | norm: 1.3009 | dt: 1234.55ms | tok/sec: 424680.09 +step 133 | loss: 3.819535 | lr 1.1245e-04 | norm: 1.7121 | dt: 1233.67ms | tok/sec: 424983.77 +step 134 | loss: 3.847534 | lr 1.1329e-04 | norm: 1.5083 | dt: 1235.58ms | tok/sec: 424324.53 +step 135 | loss: 3.787405 | lr 1.1413e-04 | norm: 1.2176 | dt: 1234.96ms | tok/sec: 424539.32 +step 136 | loss: 3.770731 | lr 1.1497e-04 | norm: 0.9294 | dt: 1233.32ms | tok/sec: 425102.73 +step 137 | loss: 3.803704 | lr 1.1580e-04 | norm: 0.6989 | dt: 1234.09ms | tok/sec: 424838.93 +step 138 | loss: 3.833760 | lr 1.1664e-04 | norm: 0.9453 | dt: 1232.77ms | tok/sec: 425292.56 +step 139 | loss: 3.780044 | lr 1.1748e-04 | norm: 1.5922 | dt: 1233.99ms | tok/sec: 424872.10 +step 140 | loss: 3.786521 | lr 1.1832e-04 | norm: 1.3220 | dt: 1235.04ms | tok/sec: 424511.62 +step 141 | loss: 3.737349 | lr 1.1916e-04 | norm: 1.1074 | dt: 1237.36ms | tok/sec: 423715.33 +step 142 | loss: 3.800108 | lr 1.2000e-04 | norm: 1.4640 | dt: 1236.38ms | tok/sec: 424049.27 +step 143 | loss: 3.837610 | lr 1.2084e-04 | norm: 0.8305 | dt: 1234.02ms | tok/sec: 424860.60 +step 144 | loss: 3.797274 | lr 1.2168e-04 | norm: 3.0373 | dt: 1236.01ms | tok/sec: 424178.02 +step 145 | loss: 3.897761 | lr 1.2252e-04 | norm: 1.9030 | dt: 1229.36ms | tok/sec: 426470.70 +step 146 | loss: 3.881305 | lr 1.2336e-04 | norm: 1.5838 | dt: 1233.95ms | tok/sec: 424885.72 +step 147 | loss: 3.855614 | lr 1.2420e-04 | norm: 1.5106 | dt: 1232.70ms | tok/sec: 425318.47 +step 148 | loss: 3.849693 | lr 1.2503e-04 | norm: 2.2857 | dt: 1234.42ms | tok/sec: 424723.57 +step 149 | loss: 3.896503 | lr 1.2587e-04 | norm: 1.3323 | dt: 1236.61ms | tok/sec: 423973.64 +validation loss: 3.8497 +validation perplexity: 46.9791 +step 150 | loss: 3.874132 | lr 1.2671e-04 | norm: 2.0086 | dt: 2678.81ms | tok/sec: 195716.87 +step 151 | loss: 3.864563 | lr 1.2755e-04 | norm: 1.7713 | dt: 1237.28ms | tok/sec: 423740.72 +step 152 | loss: 3.816126 | lr 1.2839e-04 | norm: 1.6570 | dt: 1234.97ms | tok/sec: 424533.58 +step 153 | loss: 3.887863 | lr 1.2923e-04 | norm: 2.2824 | dt: 1234.51ms | tok/sec: 424692.97 +step 154 | loss: 3.907737 | lr 1.3007e-04 | norm: 1.5919 | dt: 1233.09ms | tok/sec: 425182.54 +step 155 | loss: 3.883861 | lr 1.3091e-04 | norm: 1.2807 | dt: 1237.34ms | tok/sec: 423722.76 +step 156 | loss: 3.893161 | lr 1.3175e-04 | norm: 2.0193 | dt: 1235.99ms | tok/sec: 424184.81 +step 157 | loss: 3.900731 | lr 1.3259e-04 | norm: 1.4424 | dt: 1233.20ms | tok/sec: 425143.90 +step 158 | loss: 3.809634 | lr 1.3343e-04 | norm: 0.8734 | dt: 1234.53ms | tok/sec: 424686.57 +step 159 | loss: 3.863439 | lr 1.3427e-04 | norm: 1.1913 | dt: 1234.08ms | tok/sec: 424842.14 +step 160 | loss: 3.890220 | lr 1.3510e-04 | norm: 1.9055 | dt: 1234.51ms | tok/sec: 424694.12 +step 161 | loss: 3.877751 | lr 1.3594e-04 | norm: 1.2572 | dt: 1236.04ms | tok/sec: 424169.18 +step 162 | loss: 3.845996 | lr 1.3678e-04 | norm: 1.8734 | dt: 1234.98ms | tok/sec: 424530.31 +step 163 | loss: 3.987020 | lr 1.3762e-04 | norm: 1.3074 | dt: 1234.32ms | tok/sec: 424759.42 +step 164 | loss: 3.829700 | lr 1.3846e-04 | norm: 1.2276 | dt: 1233.96ms | tok/sec: 424882.85 +step 165 | loss: 3.896605 | lr 1.3930e-04 | norm: 1.0675 | dt: 1234.08ms | tok/sec: 424840.58 +step 166 | loss: 3.907640 | lr 1.4014e-04 | norm: 1.6763 | dt: 1234.89ms | tok/sec: 424562.84 +step 167 | loss: 3.822870 | lr 1.4098e-04 | norm: 0.9914 | dt: 1235.56ms | tok/sec: 424331.00 +step 168 | loss: 3.836327 | lr 1.4182e-04 | norm: 0.9326 | dt: 1235.22ms | tok/sec: 424450.41 +step 169 | loss: 3.820416 | lr 1.4266e-04 | norm: 1.1749 | dt: 1234.48ms | tok/sec: 424704.95 +step 170 | loss: 3.793713 | lr 1.4350e-04 | norm: 1.1071 | dt: 1235.34ms | tok/sec: 424407.16 +step 171 | loss: 3.795236 | lr 1.4434e-04 | norm: 1.9549 | dt: 1234.67ms | tok/sec: 424636.71 +step 172 | loss: 3.815834 | lr 1.4517e-04 | norm: 0.9831 | dt: 1236.34ms | tok/sec: 424064.40 +step 173 | loss: 3.830936 | lr 1.4601e-04 | norm: 1.4651 | dt: 1234.12ms | tok/sec: 424825.72 +step 174 | loss: 3.861740 | lr 1.4685e-04 | norm: 1.6104 | dt: 1235.94ms | tok/sec: 424201.99 +validation loss: 3.8515 +validation perplexity: 47.0657 +step 175 | loss: 3.784555 | lr 1.4769e-04 | norm: 0.8513 | dt: 2689.44ms | tok/sec: 194943.34 +step 176 | loss: 3.795583 | lr 1.4853e-04 | norm: 0.9919 | dt: 1235.95ms | tok/sec: 424198.64 +step 177 | loss: 3.793663 | lr 1.4937e-04 | norm: 1.9432 | dt: 1232.28ms | tok/sec: 425460.34 +step 178 | loss: 3.827638 | lr 1.5021e-04 | norm: 0.9028 | dt: 1234.82ms | tok/sec: 424585.06 +step 179 | loss: 3.910553 | lr 1.5105e-04 | norm: 1.8409 | dt: 1232.02ms | tok/sec: 425552.80 +step 180 | loss: 3.975653 | lr 1.5189e-04 | norm: 1.9099 | dt: 1233.27ms | tok/sec: 425121.38 +step 181 | loss: 3.919280 | lr 1.5273e-04 | norm: 1.2679 | dt: 1232.53ms | tok/sec: 425374.75 +step 182 | loss: 3.864950 | lr 1.5357e-04 | norm: 1.4841 | dt: 1234.43ms | tok/sec: 424720.45 +step 183 | loss: 3.902390 | lr 1.5441e-04 | norm: 0.9498 | dt: 1237.33ms | tok/sec: 423725.37 +step 184 | loss: 3.896419 | lr 1.5524e-04 | norm: 3.9403 | dt: 1235.69ms | tok/sec: 424289.00 +step 185 | loss: 3.826394 | lr 1.5608e-04 | norm: 2.0869 | dt: 1233.15ms | tok/sec: 425162.07 +step 186 | loss: 3.899972 | lr 1.5692e-04 | norm: 1.9496 | dt: 1231.76ms | tok/sec: 425640.94 +step 187 | loss: 3.898394 | lr 1.5776e-04 | norm: 1.5858 | dt: 1234.53ms | tok/sec: 424685.59 +step 188 | loss: 3.861941 | lr 1.5860e-04 | norm: 0.9479 | dt: 1233.53ms | tok/sec: 425031.66 +step 189 | loss: 3.880408 | lr 1.5944e-04 | norm: 0.8313 | dt: 1231.39ms | tok/sec: 425767.77 +step 190 | loss: 3.919226 | lr 1.6028e-04 | norm: 1.6218 | dt: 1233.48ms | tok/sec: 425047.43 +step 191 | loss: 3.872868 | lr 1.6112e-04 | norm: 1.0185 | dt: 1236.16ms | tok/sec: 424126.48 +step 192 | loss: 3.861025 | lr 1.6196e-04 | norm: 0.9203 | dt: 1233.49ms | tok/sec: 425044.14 +step 193 | loss: 3.832063 | lr 1.6280e-04 | norm: 1.1226 | dt: 1235.85ms | tok/sec: 424233.25 +step 194 | loss: 3.864435 | lr 1.6364e-04 | norm: 1.0548 | dt: 1233.92ms | tok/sec: 424895.99 +step 195 | loss: 3.857391 | lr 1.6448e-04 | norm: 0.7949 | dt: 1236.57ms | tok/sec: 423987.21 +step 196 | loss: 3.905225 | lr 1.6531e-04 | norm: 1.1663 | dt: 1233.75ms | tok/sec: 424955.52 +step 197 | loss: 3.836240 | lr 1.6615e-04 | norm: 1.3042 | dt: 1233.46ms | tok/sec: 425053.92 +step 198 | loss: 3.945784 | lr 1.6699e-04 | norm: 0.6285 | dt: 1235.07ms | tok/sec: 424500.97 +step 199 | loss: 3.879972 | lr 1.6783e-04 | norm: 0.8472 | dt: 1232.08ms | tok/sec: 425530.82 +validation loss: 3.8505 +validation perplexity: 47.0177 +step 200 | loss: 3.868594 | lr 1.6867e-04 | norm: 0.6268 | dt: 2680.17ms | tok/sec: 195617.65 +step 201 | loss: 3.867278 | lr 1.6951e-04 | norm: 1.0767 | dt: 1233.80ms | tok/sec: 424937.78 +step 202 | loss: 3.852775 | lr 1.7035e-04 | norm: 1.0341 | dt: 1235.59ms | tok/sec: 424322.97 +step 203 | loss: 3.797747 | lr 1.7119e-04 | norm: 1.0950 | dt: 1232.21ms | tok/sec: 425486.19 +step 204 | loss: 3.823229 | lr 1.7203e-04 | norm: 1.5141 | dt: 1234.06ms | tok/sec: 424848.21 +step 205 | loss: 3.839741 | lr 1.7287e-04 | norm: 0.7216 | dt: 1234.96ms | tok/sec: 424536.78 +step 206 | loss: 3.857895 | lr 1.7371e-04 | norm: 2.9471 | dt: 1235.32ms | tok/sec: 424415.51 +step 207 | loss: 3.820510 | lr 1.7455e-04 | norm: 1.3140 | dt: 1236.95ms | tok/sec: 423855.97 +step 208 | loss: 3.787636 | lr 1.7538e-04 | norm: 1.2792 | dt: 1238.21ms | tok/sec: 423425.78 +step 209 | loss: 3.861665 | lr 1.7622e-04 | norm: 0.8113 | dt: 1235.06ms | tok/sec: 424503.59 +step 210 | loss: 3.800168 | lr 1.7706e-04 | norm: 1.3618 | dt: 1231.87ms | tok/sec: 425602.22 +step 211 | loss: 3.809201 | lr 1.7790e-04 | norm: 1.0504 | dt: 1237.70ms | tok/sec: 423598.45 +step 212 | loss: 3.834052 | lr 1.7874e-04 | norm: 1.4234 | dt: 1235.24ms | tok/sec: 424441.24 +step 213 | loss: 3.903516 | lr 1.7958e-04 | norm: 0.9114 | dt: 1237.51ms | tok/sec: 423664.39 +step 214 | loss: 3.918202 | lr 1.8042e-04 | norm: 1.2098 | dt: 1857.06ms | tok/sec: 282321.63 +step 215 | loss: 3.875239 | lr 1.8126e-04 | norm: 1.2449 | dt: 1236.04ms | tok/sec: 424166.32 +step 216 | loss: 3.870703 | lr 1.8210e-04 | norm: 1.1339 | dt: 1237.88ms | tok/sec: 423535.71 +step 217 | loss: 3.881535 | lr 1.8294e-04 | norm: 1.3249 | dt: 1235.24ms | tok/sec: 424443.45 +step 218 | loss: 3.875517 | lr 1.8378e-04 | norm: 0.7886 | dt: 1233.28ms | tok/sec: 425115.88 +step 219 | loss: 3.882008 | lr 1.8462e-04 | norm: 1.4395 | dt: 1239.16ms | tok/sec: 423099.42 +step 220 | loss: 3.878149 | lr 1.8545e-04 | norm: 1.1221 | dt: 1236.93ms | tok/sec: 423863.40 +step 221 | loss: 4.047620 | lr 1.8629e-04 | norm: 1.1636 | dt: 1236.46ms | tok/sec: 424023.02 +step 222 | loss: 3.818120 | lr 1.8713e-04 | norm: 1.4563 | dt: 1234.89ms | tok/sec: 424560.80 +step 223 | loss: 3.877470 | lr 1.8797e-04 | norm: 1.0025 | dt: 1235.67ms | tok/sec: 424295.55 +step 224 | loss: 3.883572 | lr 1.8881e-04 | norm: 0.8035 | dt: 1234.74ms | tok/sec: 424615.56 +validation loss: 3.8557 +validation perplexity: 47.2612 +step 225 | loss: 3.874290 | lr 1.8965e-04 | norm: 0.9137 | dt: 2679.23ms | tok/sec: 195685.88 +step 226 | loss: 3.887338 | lr 1.9049e-04 | norm: 0.7619 | dt: 1235.38ms | tok/sec: 424395.53 +step 227 | loss: 3.825192 | lr 1.9133e-04 | norm: 0.8134 | dt: 1230.82ms | tok/sec: 425965.29 +step 228 | loss: 3.884968 | lr 1.9217e-04 | norm: 1.2265 | dt: 1232.98ms | tok/sec: 425221.18 +step 229 | loss: 3.786342 | lr 1.9301e-04 | norm: 0.8097 | dt: 1236.64ms | tok/sec: 423960.98 +step 230 | loss: 3.929681 | lr 1.9385e-04 | norm: 0.7790 | dt: 1237.34ms | tok/sec: 423721.62 +step 231 | loss: 3.861465 | lr 1.9469e-04 | norm: 1.6385 | dt: 1236.49ms | tok/sec: 424012.80 +step 232 | loss: 3.870763 | lr 1.9552e-04 | norm: 0.7153 | dt: 1233.25ms | tok/sec: 425126.89 +step 233 | loss: 3.900997 | lr 1.9636e-04 | norm: 0.8982 | dt: 1233.47ms | tok/sec: 425050.47 +step 234 | loss: 3.884142 | lr 1.9720e-04 | norm: 1.2199 | dt: 1237.44ms | tok/sec: 423688.80 +step 235 | loss: 3.836524 | lr 1.9804e-04 | norm: 0.5961 | dt: 1232.99ms | tok/sec: 425216.41 +step 236 | loss: 3.820418 | lr 1.9888e-04 | norm: 0.6918 | dt: 1231.17ms | tok/sec: 425843.95 +step 237 | loss: 3.831916 | lr 1.9972e-04 | norm: 0.6208 | dt: 1236.73ms | tok/sec: 423932.29 +step 238 | loss: 3.832983 | lr 2.0056e-04 | norm: 0.7900 | dt: 1235.13ms | tok/sec: 424479.66 +step 239 | loss: 3.778226 | lr 2.0140e-04 | norm: 0.6390 | dt: 1232.42ms | tok/sec: 425413.10 +step 240 | loss: 3.797660 | lr 2.0224e-04 | norm: 0.7801 | dt: 1238.23ms | tok/sec: 423418.20 +step 241 | loss: 3.790141 | lr 2.0308e-04 | norm: 0.9957 | dt: 1234.66ms | tok/sec: 424643.27 +step 242 | loss: 3.861814 | lr 2.0392e-04 | norm: 0.9059 | dt: 1234.67ms | tok/sec: 424638.84 +step 243 | loss: 3.860725 | lr 2.0476e-04 | norm: 0.6820 | dt: 1234.98ms | tok/sec: 424533.01 +step 244 | loss: 3.831398 | lr 2.0559e-04 | norm: 0.9146 | dt: 1233.35ms | tok/sec: 425092.54 +step 245 | loss: 3.801565 | lr 2.0643e-04 | norm: 0.8827 | dt: 1234.99ms | tok/sec: 424527.03 +step 246 | loss: 3.890126 | lr 2.0727e-04 | norm: 0.7492 | dt: 1236.77ms | tok/sec: 423917.74 +step 247 | loss: 3.768040 | lr 2.0811e-04 | norm: 0.6327 | dt: 1234.23ms | tok/sec: 424788.87 +step 248 | loss: 3.780514 | lr 2.0895e-04 | norm: 0.6844 | dt: 1233.24ms | tok/sec: 425131.49 +step 249 | loss: 3.828695 | lr 2.0979e-04 | norm: 0.8321 | dt: 1231.59ms | tok/sec: 425699.03 +validation loss: 3.8570 +validation perplexity: 47.3224 +step 250 | loss: 3.792278 | lr 2.1063e-04 | norm: 0.8007 | dt: 2695.67ms | tok/sec: 194492.74 +step 251 | loss: 3.825191 | lr 2.1147e-04 | norm: 0.7866 | dt: 1231.83ms | tok/sec: 425618.53 +step 252 | loss: 3.829246 | lr 2.1231e-04 | norm: 0.9481 | dt: 1232.24ms | tok/sec: 425474.34 +step 253 | loss: 3.842735 | lr 2.1315e-04 | norm: 1.0353 | dt: 1234.69ms | tok/sec: 424630.40 +step 254 | loss: 3.849941 | lr 2.1399e-04 | norm: 0.6707 | dt: 1237.89ms | tok/sec: 423533.18 +step 255 | loss: 3.838879 | lr 2.1483e-04 | norm: 1.3520 | dt: 1239.18ms | tok/sec: 423091.76 +step 256 | loss: 3.841702 | lr 2.1566e-04 | norm: 1.2001 | dt: 1232.15ms | tok/sec: 425505.95 +step 257 | loss: 3.835712 | lr 2.1650e-04 | norm: 0.7778 | dt: 1232.90ms | tok/sec: 425246.34 +step 258 | loss: 3.821564 | lr 2.1734e-04 | norm: 1.4967 | dt: 1235.99ms | tok/sec: 424185.22 +step 259 | loss: 3.910155 | lr 2.1818e-04 | norm: 0.9179 | dt: 1238.52ms | tok/sec: 423317.37 +step 260 | loss: 3.893105 | lr 2.1902e-04 | norm: 1.3549 | dt: 1235.29ms | tok/sec: 424424.85 +step 261 | loss: 3.838136 | lr 2.1986e-04 | norm: 0.9879 | dt: 1234.74ms | tok/sec: 424614.74 +step 262 | loss: 3.865999 | lr 2.2070e-04 | norm: 0.9909 | dt: 1236.81ms | tok/sec: 423902.87 +step 263 | loss: 3.888954 | lr 2.2154e-04 | norm: 0.6456 | dt: 1235.50ms | tok/sec: 424351.30 +step 264 | loss: 3.869772 | lr 2.2238e-04 | norm: 0.7087 | dt: 1233.83ms | tok/sec: 424928.34 +step 265 | loss: 3.846615 | lr 2.2322e-04 | norm: 0.6286 | dt: 1234.32ms | tok/sec: 424759.58 +step 266 | loss: 3.927274 | lr 2.2406e-04 | norm: 0.5494 | dt: 1233.97ms | tok/sec: 424877.76 +step 267 | loss: 3.869853 | lr 2.2490e-04 | norm: 0.5027 | dt: 1233.94ms | tok/sec: 424888.68 +step 268 | loss: 3.810919 | lr 2.2573e-04 | norm: 0.6456 | dt: 1233.01ms | tok/sec: 425210.74 +step 269 | loss: 3.865035 | lr 2.2657e-04 | norm: 0.6363 | dt: 1231.58ms | tok/sec: 425703.64 +step 270 | loss: 3.863976 | lr 2.2741e-04 | norm: 0.5467 | dt: 1234.32ms | tok/sec: 424759.58 +step 271 | loss: 3.827195 | lr 2.2825e-04 | norm: 0.5322 | dt: 1235.29ms | tok/sec: 424425.43 +step 272 | loss: 3.874818 | lr 2.2909e-04 | norm: 0.6023 | dt: 1233.23ms | tok/sec: 425132.48 +step 273 | loss: 3.816881 | lr 2.2993e-04 | norm: 0.5405 | dt: 1234.17ms | tok/sec: 424810.87 +step 274 | loss: 3.825661 | lr 2.3077e-04 | norm: 0.9022 | dt: 1235.46ms | tok/sec: 424368.17 +validation loss: 3.8587 +validation perplexity: 47.4020 +step 275 | loss: 3.822490 | lr 2.3161e-04 | norm: 0.9032 | dt: 2687.36ms | tok/sec: 195093.91 +step 276 | loss: 3.856504 | lr 2.3245e-04 | norm: 0.5528 | dt: 1232.81ms | tok/sec: 425280.39 +step 277 | loss: 3.785060 | lr 2.3329e-04 | norm: 0.7724 | dt: 1233.91ms | tok/sec: 424899.84 +step 278 | loss: 3.811465 | lr 2.3413e-04 | norm: 0.7694 | dt: 1238.82ms | tok/sec: 423216.43 +step 279 | loss: 3.817144 | lr 2.3497e-04 | norm: 0.9190 | dt: 1235.08ms | tok/sec: 424495.80 +step 280 | loss: 3.809293 | lr 2.3580e-04 | norm: 0.6143 | dt: 1232.89ms | tok/sec: 425252.26 +step 281 | loss: 3.849562 | lr 2.3664e-04 | norm: 1.1236 | dt: 1233.78ms | tok/sec: 424943.20 +step 282 | loss: 3.858880 | lr 2.3748e-04 | norm: 1.1207 | dt: 1237.68ms | tok/sec: 423603.84 +step 283 | loss: 3.861196 | lr 2.3832e-04 | norm: 0.8954 | dt: 1236.08ms | tok/sec: 424154.86 +step 284 | loss: 3.844221 | lr 2.3916e-04 | norm: 0.8083 | dt: 1237.92ms | tok/sec: 423522.17 +step 285 | loss: 3.920742 | lr 2.4000e-04 | norm: 1.1782 | dt: 1237.13ms | tok/sec: 423793.15 +step 286 | loss: 3.865961 | lr 2.4084e-04 | norm: 0.9669 | dt: 1234.88ms | tok/sec: 424564.98 +step 287 | loss: 3.826860 | lr 2.4168e-04 | norm: 0.8459 | dt: 1237.74ms | tok/sec: 423584.17 +step 288 | loss: 3.800892 | lr 2.4252e-04 | norm: 0.8758 | dt: 1238.41ms | tok/sec: 423357.14 +step 289 | loss: 3.825801 | lr 2.4336e-04 | norm: 1.0647 | dt: 1237.56ms | tok/sec: 423645.95 +step 290 | loss: 3.867836 | lr 2.4420e-04 | norm: 0.8545 | dt: 1235.37ms | tok/sec: 424396.59 +step 291 | loss: 3.881078 | lr 2.4503e-04 | norm: 0.9358 | dt: 1234.77ms | tok/sec: 424605.06 +step 292 | loss: 3.911033 | lr 2.4587e-04 | norm: 0.9566 | dt: 1236.92ms | tok/sec: 423865.45 +step 293 | loss: 3.911941 | lr 2.4671e-04 | norm: 0.6499 | dt: 1237.74ms | tok/sec: 423583.52 +step 294 | loss: 3.890288 | lr 2.4755e-04 | norm: 0.8685 | dt: 1233.45ms | tok/sec: 425056.47 +step 295 | loss: 3.880983 | lr 2.4839e-04 | norm: 0.8243 | dt: 1232.93ms | tok/sec: 425238.53 +step 296 | loss: 3.911482 | lr 2.4923e-04 | norm: 0.5837 | dt: 1238.56ms | tok/sec: 423305.80 +step 297 | loss: 3.868113 | lr 2.5007e-04 | norm: 0.5606 | dt: 1236.72ms | tok/sec: 423935.39 +step 298 | loss: 3.871959 | lr 2.5091e-04 | norm: 0.6235 | dt: 1234.91ms | tok/sec: 424556.53 +step 299 | loss: 3.882006 | lr 2.5175e-04 | norm: 0.5317 | dt: 1234.46ms | tok/sec: 424709.21 +validation loss: 3.8630 +validation perplexity: 47.6056 +step 300 | loss: 3.893557 | lr 2.5259e-04 | norm: 0.5305 | dt: 2686.35ms | tok/sec: 195167.12 +step 301 | loss: 3.883399 | lr 2.5343e-04 | norm: 0.4215 | dt: 1235.49ms | tok/sec: 424355.89 +step 302 | loss: 3.862748 | lr 2.5427e-04 | norm: 0.7957 | dt: 1239.57ms | tok/sec: 422960.50 +step 303 | loss: 3.880363 | lr 2.5510e-04 | norm: 0.9801 | dt: 1236.91ms | tok/sec: 423870.51 +step 304 | loss: 3.897355 | lr 2.5594e-04 | norm: 0.5556 | dt: 1236.88ms | tok/sec: 423879.50 +step 305 | loss: 3.862287 | lr 2.5678e-04 | norm: 0.8879 | dt: 1234.85ms | tok/sec: 424575.06 +step 306 | loss: 3.840580 | lr 2.5762e-04 | norm: 0.7868 | dt: 1234.54ms | tok/sec: 424681.57 +step 307 | loss: 3.851703 | lr 2.5846e-04 | norm: 0.9599 | dt: 1233.57ms | tok/sec: 425015.55 +step 308 | loss: 3.816612 | lr 2.5930e-04 | norm: 0.8124 | dt: 1236.45ms | tok/sec: 424025.72 +step 309 | loss: 3.860766 | lr 2.6014e-04 | norm: 0.5127 | dt: 1234.63ms | tok/sec: 424652.21 +step 310 | loss: 3.875046 | lr 2.6098e-04 | norm: 0.5698 | dt: 1234.65ms | tok/sec: 424645.16 +step 311 | loss: 3.890170 | lr 2.6182e-04 | norm: 0.7508 | dt: 1236.35ms | tok/sec: 424060.72 +step 312 | loss: 3.850728 | lr 2.6266e-04 | norm: 0.7453 | dt: 1238.41ms | tok/sec: 423354.61 +step 313 | loss: 3.878728 | lr 2.6350e-04 | norm: 0.8014 | dt: 1233.95ms | tok/sec: 424887.37 +step 314 | loss: 3.871975 | lr 2.6434e-04 | norm: 0.7745 | dt: 1235.45ms | tok/sec: 424369.98 +step 315 | loss: 3.832888 | lr 2.6517e-04 | norm: 0.6028 | dt: 1236.54ms | tok/sec: 423997.43 +step 316 | loss: 3.811469 | lr 2.6601e-04 | norm: 0.6553 | dt: 1236.04ms | tok/sec: 424167.87 +step 317 | loss: 3.879610 | lr 2.6685e-04 | norm: 0.6342 | dt: 1234.93ms | tok/sec: 424548.09 +step 318 | loss: 3.875114 | lr 2.6769e-04 | norm: 0.5219 | dt: 1236.15ms | tok/sec: 424131.47 +step 319 | loss: 3.826522 | lr 2.6853e-04 | norm: 0.8505 | dt: 1235.52ms | tok/sec: 424346.64 +step 320 | loss: 3.817621 | lr 2.6937e-04 | norm: 0.9727 | dt: 1235.49ms | tok/sec: 424357.69 +step 321 | loss: 3.903267 | lr 2.7021e-04 | norm: 0.7418 | dt: 1236.67ms | tok/sec: 423952.96 +step 322 | loss: 3.833714 | lr 2.7105e-04 | norm: 0.7799 | dt: 1238.73ms | tok/sec: 423245.35 +step 323 | loss: 3.889701 | lr 2.7189e-04 | norm: 0.6876 | dt: 1235.14ms | tok/sec: 424475.32 +step 324 | loss: 3.840719 | lr 2.7273e-04 | norm: 0.6018 | dt: 1238.33ms | tok/sec: 423381.43 +validation loss: 3.8618 +validation perplexity: 47.5491 +step 325 | loss: 3.903890 | lr 2.7357e-04 | norm: 0.8312 | dt: 2685.09ms | tok/sec: 195258.91 +step 326 | loss: 3.841767 | lr 2.7441e-04 | norm: 0.8579 | dt: 1236.86ms | tok/sec: 423887.10 +step 327 | loss: 3.871916 | lr 2.7524e-04 | norm: 0.5181 | dt: 1235.35ms | tok/sec: 424406.09 +step 328 | loss: 3.892068 | lr 2.7608e-04 | norm: 0.8443 | dt: 1235.36ms | tok/sec: 424400.03 +step 329 | loss: 3.905273 | lr 2.7692e-04 | norm: 0.8861 | dt: 1236.18ms | tok/sec: 424119.77 +step 330 | loss: 3.891838 | lr 2.7776e-04 | norm: 0.5484 | dt: 1231.48ms | tok/sec: 425736.53 +step 331 | loss: 3.866570 | lr 2.7860e-04 | norm: 0.8328 | dt: 1233.12ms | tok/sec: 425170.45 +step 332 | loss: 3.883849 | lr 2.7944e-04 | norm: 0.6299 | dt: 1238.18ms | tok/sec: 423433.12 +step 333 | loss: 3.897305 | lr 2.8028e-04 | norm: 0.5996 | dt: 1234.08ms | tok/sec: 424840.99 +step 334 | loss: 3.911971 | lr 2.8112e-04 | norm: 0.5612 | dt: 1238.80ms | tok/sec: 423221.15 +step 335 | loss: 3.859587 | lr 2.8196e-04 | norm: 0.5567 | dt: 1235.95ms | tok/sec: 424196.67 +step 336 | loss: 3.872805 | lr 2.8280e-04 | norm: 0.6833 | dt: 1236.48ms | tok/sec: 424015.50 +step 337 | loss: 3.905930 | lr 2.8364e-04 | norm: 1.4612 | dt: 1233.73ms | tok/sec: 424961.67 +step 338 | loss: 3.862665 | lr 2.8448e-04 | norm: 1.0222 | dt: 1236.27ms | tok/sec: 424088.69 +step 339 | loss: 3.853812 | lr 2.8531e-04 | norm: 0.5693 | dt: 1235.63ms | tok/sec: 424309.14 +step 340 | loss: 3.850474 | lr 2.8615e-04 | norm: 0.8691 | dt: 1236.19ms | tok/sec: 424116.25 +step 341 | loss: 3.813458 | lr 2.8699e-04 | norm: 0.8708 | dt: 1237.39ms | tok/sec: 423704.39 +step 342 | loss: 3.885455 | lr 2.8783e-04 | norm: 0.7087 | dt: 1233.99ms | tok/sec: 424871.28 +step 343 | loss: 3.915621 | lr 2.8867e-04 | norm: 0.7638 | dt: 1233.23ms | tok/sec: 425135.19 +step 344 | loss: 3.889045 | lr 2.8951e-04 | norm: 0.6571 | dt: 1236.43ms | tok/sec: 424033.24 +step 345 | loss: 3.847039 | lr 2.9035e-04 | norm: 0.5387 | dt: 1235.00ms | tok/sec: 424525.39 +step 346 | loss: 3.823452 | lr 2.9119e-04 | norm: 0.5274 | dt: 1234.98ms | tok/sec: 424533.26 +step 347 | loss: 3.842559 | lr 2.9203e-04 | norm: 0.5663 | dt: 1235.82ms | tok/sec: 424242.18 +step 348 | loss: 3.817088 | lr 2.9287e-04 | norm: 0.4431 | dt: 1233.66ms | tok/sec: 424984.59 +step 349 | loss: 3.836729 | lr 2.9371e-04 | norm: 0.6483 | dt: 1235.35ms | tok/sec: 424405.68 +validation loss: 3.8716 +validation perplexity: 48.0187 +step 350 | loss: 3.822455 | lr 2.9455e-04 | norm: 0.5674 | dt: 2690.30ms | tok/sec: 194880.78 +step 351 | loss: 3.899278 | lr 2.9538e-04 | norm: 0.6933 | dt: 1236.90ms | tok/sec: 423873.45 +step 352 | loss: 3.876629 | lr 2.9622e-04 | norm: 0.7094 | dt: 1234.84ms | tok/sec: 424580.55 +step 353 | loss: 3.828707 | lr 2.9706e-04 | norm: 0.6506 | dt: 1234.91ms | tok/sec: 424554.65 +step 354 | loss: 3.929975 | lr 2.9790e-04 | norm: 0.5531 | dt: 1236.30ms | tok/sec: 424077.89 +step 355 | loss: 3.899930 | lr 2.9874e-04 | norm: 0.7876 | dt: 1239.66ms | tok/sec: 422927.88 +step 356 | loss: 3.805047 | lr 2.9958e-04 | norm: 0.7037 | dt: 1235.86ms | tok/sec: 424229.90 +step 357 | loss: 3.854972 | lr 3.0042e-04 | norm: 0.6614 | dt: 1233.47ms | tok/sec: 425050.06 +step 358 | loss: 3.849973 | lr 3.0126e-04 | norm: 0.8351 | dt: 1236.25ms | tok/sec: 424094.49 +step 359 | loss: 3.819481 | lr 3.0210e-04 | norm: 0.7053 | dt: 1235.37ms | tok/sec: 424397.00 +step 360 | loss: 3.875143 | lr 3.0294e-04 | norm: 1.0810 | dt: 1237.32ms | tok/sec: 423727.99 +step 361 | loss: 3.864595 | lr 3.0378e-04 | norm: 0.7525 | dt: 1236.97ms | tok/sec: 423847.31 +step 362 | loss: 3.917947 | lr 3.0462e-04 | norm: 0.7779 | dt: 1237.36ms | tok/sec: 423714.19 +step 363 | loss: 3.838974 | lr 3.0545e-04 | norm: 0.5342 | dt: 1236.15ms | tok/sec: 424129.99 +step 364 | loss: 3.853747 | lr 3.0629e-04 | norm: 0.5509 | dt: 1237.37ms | tok/sec: 423711.25 +step 365 | loss: 3.864021 | lr 3.0713e-04 | norm: 0.6230 | dt: 1236.52ms | tok/sec: 424003.32 +step 366 | loss: 3.923968 | lr 3.0797e-04 | norm: 0.5617 | dt: 1234.11ms | tok/sec: 424829.74 +step 367 | loss: 3.831028 | lr 3.0881e-04 | norm: 0.4768 | dt: 1236.32ms | tok/sec: 424071.92 +step 368 | loss: 3.938215 | lr 3.0965e-04 | norm: 0.5463 | dt: 1235.78ms | tok/sec: 424257.81 +step 369 | loss: 3.899338 | lr 3.1049e-04 | norm: 0.5588 | dt: 1234.84ms | tok/sec: 424580.22 +step 370 | loss: 3.890531 | lr 3.1133e-04 | norm: 0.5481 | dt: 1239.17ms | tok/sec: 423097.46 +step 371 | loss: 3.867266 | lr 3.1217e-04 | norm: 0.5788 | dt: 1235.18ms | tok/sec: 424463.44 +step 372 | loss: 3.862134 | lr 3.1301e-04 | norm: 0.5957 | dt: 1238.27ms | tok/sec: 423405.23 +step 373 | loss: 3.863185 | lr 3.1385e-04 | norm: 0.7860 | dt: 1234.83ms | tok/sec: 424581.70 +step 374 | loss: 3.892324 | lr 3.1469e-04 | norm: 0.8075 | dt: 1235.43ms | tok/sec: 424375.30 +validation loss: 3.8699 +validation perplexity: 47.9364 +step 375 | loss: 3.841361 | lr 3.1552e-04 | norm: 0.6371 | dt: 2684.88ms | tok/sec: 195274.64 +step 376 | loss: 3.843825 | lr 3.1636e-04 | norm: 0.8432 | dt: 1235.17ms | tok/sec: 424467.45 +step 377 | loss: 3.839301 | lr 3.1720e-04 | norm: 0.4297 | dt: 1235.84ms | tok/sec: 424235.63 +step 378 | loss: 3.832651 | lr 3.1804e-04 | norm: 0.7015 | dt: 1237.67ms | tok/sec: 423610.20 +step 379 | loss: 3.876358 | lr 3.1888e-04 | norm: 0.4649 | dt: 1237.15ms | tok/sec: 423785.72 +step 380 | loss: 3.811151 | lr 3.1972e-04 | norm: 0.7187 | dt: 1234.33ms | tok/sec: 424755.73 +step 381 | loss: 3.824117 | lr 3.2056e-04 | norm: 0.7806 | dt: 1236.59ms | tok/sec: 423980.27 +step 382 | loss: 3.847764 | lr 3.2140e-04 | norm: 0.8236 | dt: 1234.91ms | tok/sec: 424555.96 +step 383 | loss: 3.866390 | lr 3.2224e-04 | norm: 0.8375 | dt: 1236.52ms | tok/sec: 424003.73 +step 384 | loss: 3.875432 | lr 3.2308e-04 | norm: 0.7033 | dt: 1238.27ms | tok/sec: 423404.91 +step 385 | loss: 3.875997 | lr 3.2392e-04 | norm: 0.5556 | dt: 1235.18ms | tok/sec: 424462.13 +step 386 | loss: 3.835991 | lr 3.2476e-04 | norm: 0.5025 | dt: 1233.94ms | tok/sec: 424888.51 +step 387 | loss: 3.853321 | lr 3.2559e-04 | norm: 0.5594 | dt: 1238.21ms | tok/sec: 423422.93 +step 388 | loss: 3.857584 | lr 3.2643e-04 | norm: 0.4732 | dt: 1237.50ms | tok/sec: 423666.35 +step 389 | loss: 3.835227 | lr 3.2727e-04 | norm: 0.4612 | dt: 1236.51ms | tok/sec: 424006.18 +step 390 | loss: 3.905764 | lr 3.2811e-04 | norm: 0.5129 | dt: 1234.50ms | tok/sec: 424697.97 +step 391 | loss: 3.750039 | lr 3.2895e-04 | norm: 0.6344 | dt: 1233.63ms | tok/sec: 424996.58 +step 392 | loss: 3.828994 | lr 3.2979e-04 | norm: 0.5108 | dt: 1236.05ms | tok/sec: 424164.76 +step 393 | loss: 3.847142 | lr 3.3063e-04 | norm: 0.5510 | dt: 1232.65ms | tok/sec: 425333.78 +step 394 | loss: 3.850677 | lr 3.3147e-04 | norm: 0.4957 | dt: 1235.73ms | tok/sec: 424274.43 +step 395 | loss: 3.904500 | lr 3.3231e-04 | norm: 0.6945 | dt: 1232.49ms | tok/sec: 425389.64 +step 396 | loss: 3.876894 | lr 3.3315e-04 | norm: 0.6474 | dt: 1235.49ms | tok/sec: 424354.74 +step 397 | loss: 3.850883 | lr 3.3399e-04 | norm: 0.6189 | dt: 1235.95ms | tok/sec: 424199.21 +step 398 | loss: 3.882925 | lr 3.3483e-04 | norm: 0.5423 | dt: 1235.41ms | tok/sec: 424383.90 +step 399 | loss: 3.859722 | lr 3.3566e-04 | norm: 0.4456 | dt: 1231.99ms | tok/sec: 425562.19 +validation loss: 3.8784 +validation perplexity: 48.3465 +step 400 | loss: 3.897000 | lr 3.3650e-04 | norm: 0.4605 | dt: 2691.88ms | tok/sec: 194766.14 +step 401 | loss: 3.914107 | lr 3.3734e-04 | norm: 0.5091 | dt: 1238.40ms | tok/sec: 423359.42 +step 402 | loss: 3.905820 | lr 3.3818e-04 | norm: 0.7185 | dt: 1236.78ms | tok/sec: 423912.02 +step 403 | loss: 3.955850 | lr 3.3902e-04 | norm: 0.7016 | dt: 1235.20ms | tok/sec: 424456.97 +step 404 | loss: 4.096984 | lr 3.3986e-04 | norm: 0.7221 | dt: 1235.61ms | tok/sec: 424315.03 +step 405 | loss: 3.863445 | lr 3.4070e-04 | norm: 0.6261 | dt: 1867.74ms | tok/sec: 280706.85 +step 406 | loss: 3.896952 | lr 3.4154e-04 | norm: 0.6071 | dt: 1232.10ms | tok/sec: 425525.38 +step 407 | loss: 3.879852 | lr 3.4238e-04 | norm: 0.5195 | dt: 1236.59ms | tok/sec: 423979.20 +step 408 | loss: 3.988398 | lr 3.4322e-04 | norm: 0.4731 | dt: 1234.86ms | tok/sec: 424572.76 +step 409 | loss: 3.844318 | lr 3.4406e-04 | norm: 0.6419 | dt: 1234.88ms | tok/sec: 424565.39 +step 410 | loss: 3.848653 | lr 3.4490e-04 | norm: 0.6326 | dt: 1234.51ms | tok/sec: 424693.38 +step 411 | loss: 3.906629 | lr 3.4573e-04 | norm: 0.5685 | dt: 1233.14ms | tok/sec: 425166.10 +step 412 | loss: 3.876091 | lr 3.4657e-04 | norm: 0.8058 | dt: 1233.24ms | tok/sec: 425132.15 +step 413 | loss: 3.877480 | lr 3.4741e-04 | norm: 0.8335 | dt: 1229.95ms | tok/sec: 426266.02 +step 414 | loss: 3.894830 | lr 3.4825e-04 | norm: 0.7425 | dt: 1233.74ms | tok/sec: 424959.62 +step 415 | loss: 3.886320 | lr 3.4909e-04 | norm: 0.5519 | dt: 1237.17ms | tok/sec: 423780.82 +step 416 | loss: 3.855072 | lr 3.4993e-04 | norm: 0.6651 | dt: 1237.96ms | tok/sec: 423508.96 +step 417 | loss: 3.838010 | lr 3.5077e-04 | norm: 0.5396 | dt: 1236.13ms | tok/sec: 424137.77 +step 418 | loss: 3.877621 | lr 3.5161e-04 | norm: 0.5059 | dt: 1234.97ms | tok/sec: 424535.96 +step 419 | loss: 3.914494 | lr 3.5245e-04 | norm: 0.6707 | dt: 1235.31ms | tok/sec: 424417.23 +step 420 | loss: 3.859554 | lr 3.5329e-04 | norm: 0.8709 | dt: 1234.93ms | tok/sec: 424548.09 +step 421 | loss: 3.895493 | lr 3.5413e-04 | norm: 0.5526 | dt: 1234.97ms | tok/sec: 424533.99 +step 422 | loss: 3.986085 | lr 3.5497e-04 | norm: 0.7289 | dt: 1235.03ms | tok/sec: 424516.05 +step 423 | loss: 3.858933 | lr 3.5580e-04 | norm: 0.5616 | dt: 1236.25ms | tok/sec: 424094.82 +step 424 | loss: 3.903056 | lr 3.5664e-04 | norm: 0.5525 | dt: 1237.28ms | tok/sec: 423742.19 +validation loss: 3.8820 +validation perplexity: 48.5230 +step 425 | loss: 3.932259 | lr 3.5748e-04 | norm: 0.5663 | dt: 2690.04ms | tok/sec: 194899.51 +step 426 | loss: 3.839967 | lr 3.5832e-04 | norm: 0.6008 | dt: 1234.07ms | tok/sec: 424845.83 +step 427 | loss: 3.878925 | lr 3.5916e-04 | norm: 0.5164 | dt: 1235.84ms | tok/sec: 424234.56 +step 428 | loss: 3.823243 | lr 3.6000e-04 | norm: 0.5304 | dt: 1234.42ms | tok/sec: 424725.13 +step 429 | loss: 3.908056 | lr 3.6084e-04 | norm: 0.5071 | dt: 1233.29ms | tok/sec: 425112.10 +step 430 | loss: 3.939759 | lr 3.6168e-04 | norm: 0.5356 | dt: 1237.47ms | tok/sec: 423678.68 +step 431 | loss: 3.856617 | lr 3.6252e-04 | norm: 0.4582 | dt: 1236.22ms | tok/sec: 424105.13 +step 432 | loss: 3.850848 | lr 3.6336e-04 | norm: 0.5731 | dt: 1236.35ms | tok/sec: 424059.49 +step 433 | loss: 3.869457 | lr 3.6420e-04 | norm: 0.6073 | dt: 1232.63ms | tok/sec: 425341.67 +step 434 | loss: 3.893175 | lr 3.6503e-04 | norm: 0.5460 | dt: 1235.12ms | tok/sec: 424483.51 +step 435 | loss: 3.920468 | lr 3.6587e-04 | norm: 0.4940 | dt: 1238.89ms | tok/sec: 423190.20 +step 436 | loss: 3.906605 | lr 3.6671e-04 | norm: 0.4559 | dt: 1235.95ms | tok/sec: 424199.29 +step 437 | loss: 3.826689 | lr 3.6755e-04 | norm: 0.4724 | dt: 1235.87ms | tok/sec: 424226.87 +step 438 | loss: 3.881460 | lr 3.6839e-04 | norm: 0.4338 | dt: 1233.82ms | tok/sec: 424932.03 +step 439 | loss: 3.919364 | lr 3.6923e-04 | norm: 0.5306 | dt: 1232.96ms | tok/sec: 425228.25 +step 440 | loss: 3.866326 | lr 3.7007e-04 | norm: 0.5345 | dt: 1234.89ms | tok/sec: 424564.07 +step 441 | loss: 3.857991 | lr 3.7091e-04 | norm: 0.6163 | dt: 1236.54ms | tok/sec: 423996.21 +step 442 | loss: 3.864721 | lr 3.7175e-04 | norm: 0.6852 | dt: 1234.66ms | tok/sec: 424641.88 +step 443 | loss: 3.945112 | lr 3.7259e-04 | norm: 0.5086 | dt: 1236.69ms | tok/sec: 423946.18 +step 444 | loss: 3.878020 | lr 3.7343e-04 | norm: 0.5178 | dt: 1235.37ms | tok/sec: 424395.86 +step 445 | loss: 3.843171 | lr 3.7427e-04 | norm: 0.4634 | dt: 1237.02ms | tok/sec: 423831.87 +step 446 | loss: 3.834538 | lr 3.7510e-04 | norm: 0.4677 | dt: 1233.98ms | tok/sec: 424876.69 +step 447 | loss: 3.866690 | lr 3.7594e-04 | norm: 0.4790 | dt: 1236.60ms | tok/sec: 423974.63 +step 448 | loss: 3.836999 | lr 3.7678e-04 | norm: 0.4205 | dt: 1234.38ms | tok/sec: 424738.66 +step 449 | loss: 3.863618 | lr 3.7762e-04 | norm: 0.5088 | dt: 1235.03ms | tok/sec: 424512.93 +validation loss: 3.8833 +validation perplexity: 48.5824 +step 450 | loss: 3.862782 | lr 3.7846e-04 | norm: 0.4469 | dt: 2678.81ms | tok/sec: 195716.87 +step 451 | loss: 3.849272 | lr 3.7930e-04 | norm: 0.4841 | dt: 1234.95ms | tok/sec: 424541.21 +step 452 | loss: 3.861321 | lr 3.8014e-04 | norm: 0.4122 | dt: 1233.54ms | tok/sec: 425028.21 +step 453 | loss: 3.919208 | lr 3.8098e-04 | norm: 0.4654 | dt: 1234.94ms | tok/sec: 424545.88 +step 454 | loss: 3.866850 | lr 3.8182e-04 | norm: 0.4315 | dt: 1236.68ms | tok/sec: 423946.59 +step 455 | loss: 3.835210 | lr 3.8266e-04 | norm: 0.5345 | dt: 1234.64ms | tok/sec: 424647.04 +step 456 | loss: 3.894035 | lr 3.8350e-04 | norm: 0.5200 | dt: 1237.64ms | tok/sec: 423618.44 +step 457 | loss: 3.895190 | lr 3.8434e-04 | norm: 0.4664 | dt: 1236.63ms | tok/sec: 423966.53 +step 458 | loss: 3.873477 | lr 3.8517e-04 | norm: 0.5528 | dt: 1236.01ms | tok/sec: 424177.45 +step 459 | loss: 3.896287 | lr 3.8601e-04 | norm: 0.5290 | dt: 1235.87ms | tok/sec: 424225.32 +step 460 | loss: 3.897126 | lr 3.8685e-04 | norm: 0.6585 | dt: 1237.10ms | tok/sec: 423804.34 +step 461 | loss: 3.904165 | lr 3.8769e-04 | norm: 0.7429 | dt: 1234.95ms | tok/sec: 424543.50 +step 462 | loss: 3.895084 | lr 3.8853e-04 | norm: 0.7839 | dt: 1233.91ms | tok/sec: 424901.16 +step 463 | loss: 3.934421 | lr 3.8937e-04 | norm: 0.7380 | dt: 1234.26ms | tok/sec: 424780.67 +step 464 | loss: 3.824460 | lr 3.9021e-04 | norm: 0.5852 | dt: 1234.25ms | tok/sec: 424783.29 +step 465 | loss: 3.916705 | lr 3.9105e-04 | norm: 0.6376 | dt: 1233.15ms | tok/sec: 425161.49 +step 466 | loss: 3.880251 | lr 3.9189e-04 | norm: 0.8173 | dt: 1236.39ms | tok/sec: 424047.47 +step 467 | loss: 3.893742 | lr 3.9273e-04 | norm: 0.7927 | dt: 1235.41ms | tok/sec: 424385.45 +step 468 | loss: 3.927913 | lr 3.9357e-04 | norm: 0.5426 | dt: 1233.41ms | tok/sec: 425072.82 +step 469 | loss: 3.902443 | lr 3.9441e-04 | norm: 0.5595 | dt: 1233.73ms | tok/sec: 424963.24 +step 470 | loss: 3.885204 | lr 3.9524e-04 | norm: 0.6254 | dt: 1235.13ms | tok/sec: 424481.22 +step 471 | loss: 3.855738 | lr 3.9608e-04 | norm: 0.5206 | dt: 1234.37ms | tok/sec: 424741.29 +step 472 | loss: 3.893852 | lr 3.9692e-04 | norm: 0.5594 | dt: 1232.97ms | tok/sec: 425222.82 +step 473 | loss: 3.949296 | lr 3.9776e-04 | norm: 0.5414 | dt: 1231.72ms | tok/sec: 425656.34 +step 474 | loss: 3.892916 | lr 3.9860e-04 | norm: 0.4094 | dt: 1235.14ms | tok/sec: 424475.40 +validation loss: 3.8851 +validation perplexity: 48.6702 +step 475 | loss: 3.852768 | lr 3.9944e-04 | norm: 0.4327 | dt: 2678.94ms | tok/sec: 195707.27 +step 476 | loss: 3.918780 | lr 4.0028e-04 | norm: 0.5123 | dt: 1234.61ms | tok/sec: 424659.18 +step 477 | loss: 3.941182 | lr 4.0112e-04 | norm: 0.9253 | dt: 1231.07ms | tok/sec: 425880.41 +step 478 | loss: 3.838184 | lr 4.0196e-04 | norm: 0.7163 | dt: 1234.04ms | tok/sec: 424854.45 +step 479 | loss: 3.859168 | lr 4.0280e-04 | norm: 0.5863 | dt: 1237.42ms | tok/sec: 423693.37 +step 480 | loss: 3.812346 | lr 4.0364e-04 | norm: 0.5941 | dt: 1236.58ms | tok/sec: 423982.15 +step 481 | loss: 3.876783 | lr 4.0448e-04 | norm: 0.8219 | dt: 1236.47ms | tok/sec: 424019.59 +step 482 | loss: 3.879762 | lr 4.0531e-04 | norm: 1.1983 | dt: 1237.47ms | tok/sec: 423678.43 +step 483 | loss: 3.967027 | lr 4.0615e-04 | norm: 0.6898 | dt: 1232.79ms | tok/sec: 425285.82 +step 484 | loss: 3.897903 | lr 4.0699e-04 | norm: 0.5310 | dt: 1235.15ms | tok/sec: 424472.62 +step 485 | loss: 3.917948 | lr 4.0783e-04 | norm: 0.6557 | dt: 1235.78ms | tok/sec: 424257.15 +step 486 | loss: 3.901612 | lr 4.0867e-04 | norm: 0.5332 | dt: 1235.20ms | tok/sec: 424456.64 +step 487 | loss: 3.791342 | lr 4.0951e-04 | norm: 0.4907 | dt: 1234.53ms | tok/sec: 424686.41 +step 488 | loss: 3.907545 | lr 4.1035e-04 | norm: 0.5124 | dt: 1232.06ms | tok/sec: 425539.13 +step 489 | loss: 3.900328 | lr 4.1119e-04 | norm: 0.4304 | dt: 1232.75ms | tok/sec: 425298.07 +step 490 | loss: 3.940559 | lr 4.1203e-04 | norm: 0.4709 | dt: 1234.50ms | tok/sec: 424698.14 +step 491 | loss: 3.980025 | lr 4.1287e-04 | norm: 0.4322 | dt: 1236.28ms | tok/sec: 424085.50 +step 492 | loss: 3.931572 | lr 4.1371e-04 | norm: 0.4833 | dt: 1235.12ms | tok/sec: 424483.68 +step 493 | loss: 3.914078 | lr 4.1455e-04 | norm: 0.4746 | dt: 1235.00ms | tok/sec: 424524.90 +step 494 | loss: 3.914722 | lr 4.1538e-04 | norm: 0.4841 | dt: 1233.77ms | tok/sec: 424949.52 +step 495 | loss: 3.848957 | lr 4.1622e-04 | norm: 0.4227 | dt: 1237.35ms | tok/sec: 423717.54 +step 496 | loss: 3.877984 | lr 4.1706e-04 | norm: 0.3836 | dt: 1234.12ms | tok/sec: 424826.95 +step 497 | loss: 3.919805 | lr 4.1790e-04 | norm: 0.4822 | dt: 1232.64ms | tok/sec: 425337.56 +step 498 | loss: 3.886177 | lr 4.1874e-04 | norm: 0.5283 | dt: 1234.38ms | tok/sec: 424738.91 +step 499 | loss: 3.879502 | lr 4.1958e-04 | norm: 0.5598 | dt: 1236.23ms | tok/sec: 424100.63 +validation loss: 3.8868 +validation perplexity: 48.7542 +step 500 | loss: 3.908135 | lr 4.2042e-04 | norm: 0.6010 | dt: 2683.30ms | tok/sec: 195389.31 +step 501 | loss: 3.947453 | lr 4.2126e-04 | norm: 0.5630 | dt: 1232.64ms | tok/sec: 425336.65 +step 502 | loss: 3.925533 | lr 4.2210e-04 | norm: 0.5579 | dt: 1237.77ms | tok/sec: 423574.79 +step 503 | loss: 3.894030 | lr 4.2294e-04 | norm: 0.5023 | dt: 1236.06ms | tok/sec: 424159.12 +step 504 | loss: 3.911389 | lr 4.2378e-04 | norm: 0.4695 | dt: 1236.59ms | tok/sec: 423977.32 +step 505 | loss: 3.946396 | lr 4.2462e-04 | norm: 0.4626 | dt: 1237.02ms | tok/sec: 423830.40 +step 506 | loss: 3.889690 | lr 4.2545e-04 | norm: 0.4753 | dt: 1234.92ms | tok/sec: 424551.29 +step 507 | loss: 3.893897 | lr 4.2629e-04 | norm: 0.4340 | dt: 1239.32ms | tok/sec: 423043.74 +step 508 | loss: 3.922432 | lr 4.2713e-04 | norm: 0.4492 | dt: 1235.50ms | tok/sec: 424352.61 +step 509 | loss: 3.862441 | lr 4.2797e-04 | norm: 0.4772 | dt: 1234.87ms | tok/sec: 424571.04 +step 510 | loss: 3.889961 | lr 4.2881e-04 | norm: 0.5192 | dt: 1239.81ms | tok/sec: 422876.89 +step 511 | loss: 3.982841 | lr 4.2965e-04 | norm: 0.6071 | dt: 1233.13ms | tok/sec: 425168.97 +step 512 | loss: 3.920351 | lr 4.3049e-04 | norm: 0.6718 | dt: 1235.47ms | tok/sec: 424363.26 +step 513 | loss: 3.888626 | lr 4.3133e-04 | norm: 0.7532 | dt: 1234.29ms | tok/sec: 424769.26 +step 514 | loss: 3.870001 | lr 4.3217e-04 | norm: 0.5585 | dt: 1237.40ms | tok/sec: 423702.11 +step 515 | loss: 3.907346 | lr 4.3301e-04 | norm: 0.5046 | dt: 1233.60ms | tok/sec: 425005.62 +step 516 | loss: 3.893583 | lr 4.3385e-04 | norm: 0.4783 | dt: 1236.32ms | tok/sec: 424072.99 +step 517 | loss: 3.857188 | lr 4.3469e-04 | norm: 0.5481 | dt: 1235.57ms | tok/sec: 424328.29 +step 518 | loss: 3.897652 | lr 4.3552e-04 | norm: 0.5859 | dt: 1234.44ms | tok/sec: 424718.89 +step 519 | loss: 3.832333 | lr 4.3636e-04 | norm: 0.4956 | dt: 1235.86ms | tok/sec: 424230.14 +step 520 | loss: 3.908115 | lr 4.3720e-04 | norm: 0.4389 | dt: 1234.41ms | tok/sec: 424729.14 +step 521 | loss: 3.886316 | lr 4.3804e-04 | norm: 0.5127 | dt: 1238.40ms | tok/sec: 423358.45 +step 522 | loss: 3.865573 | lr 4.3888e-04 | norm: 0.5608 | dt: 1237.20ms | tok/sec: 423769.47 +step 523 | loss: 3.832541 | lr 4.3972e-04 | norm: 0.6823 | dt: 1235.24ms | tok/sec: 424442.14 +step 524 | loss: 3.885832 | lr 4.4056e-04 | norm: 0.9834 | dt: 1233.96ms | tok/sec: 424880.96 +validation loss: 3.9033 +validation perplexity: 49.5650 +step 525 | loss: 3.889136 | lr 4.4140e-04 | norm: 0.6576 | dt: 2693.37ms | tok/sec: 194659.07 +step 526 | loss: 3.924963 | lr 4.4224e-04 | norm: 0.5786 | dt: 1235.05ms | tok/sec: 424506.87 +step 527 | loss: 3.965729 | lr 4.4308e-04 | norm: 0.5611 | dt: 1235.08ms | tok/sec: 424495.72 +step 528 | loss: 3.928971 | lr 4.4392e-04 | norm: 0.5348 | dt: 1235.66ms | tok/sec: 424298.41 +step 529 | loss: 3.875183 | lr 4.4476e-04 | norm: 0.4576 | dt: 1234.18ms | tok/sec: 424805.53 +step 530 | loss: 3.894653 | lr 4.4559e-04 | norm: 0.4265 | dt: 1235.23ms | tok/sec: 424446.40 +step 531 | loss: 3.909822 | lr 4.4643e-04 | norm: 0.4194 | dt: 1236.92ms | tok/sec: 423866.59 +step 532 | loss: 3.923635 | lr 4.4727e-04 | norm: 0.4375 | dt: 1237.33ms | tok/sec: 423724.88 +step 533 | loss: 3.906196 | lr 4.4811e-04 | norm: 0.5875 | dt: 1236.84ms | tok/sec: 423892.90 +step 534 | loss: 3.963665 | lr 4.4895e-04 | norm: 0.6242 | dt: 1234.65ms | tok/sec: 424643.93 +step 535 | loss: 3.882747 | lr 4.4979e-04 | norm: 0.5055 | dt: 1234.11ms | tok/sec: 424832.12 +step 536 | loss: 3.996640 | lr 4.5063e-04 | norm: 0.5520 | dt: 1235.48ms | tok/sec: 424359.90 +step 537 | loss: 3.922885 | lr 4.5147e-04 | norm: 0.5840 | dt: 1235.47ms | tok/sec: 424364.08 +step 538 | loss: 3.887696 | lr 4.5231e-04 | norm: 0.4570 | dt: 1234.11ms | tok/sec: 424829.74 +step 539 | loss: 3.996899 | lr 4.5315e-04 | norm: 0.5055 | dt: 1235.68ms | tok/sec: 424291.21 +step 540 | loss: 3.926992 | lr 4.5399e-04 | norm: 0.4793 | dt: 1235.81ms | tok/sec: 424245.86 +step 541 | loss: 3.898449 | lr 4.5483e-04 | norm: 0.4601 | dt: 1233.60ms | tok/sec: 425007.34 +step 542 | loss: 3.905726 | lr 4.5566e-04 | norm: 0.4877 | dt: 1236.43ms | tok/sec: 424032.83 +step 543 | loss: 3.879348 | lr 4.5650e-04 | norm: 0.4054 | dt: 1236.85ms | tok/sec: 423890.77 +step 544 | loss: 3.993848 | lr 4.5734e-04 | norm: 0.4834 | dt: 1236.17ms | tok/sec: 424123.53 +step 545 | loss: 3.922774 | lr 4.5818e-04 | norm: 0.6018 | dt: 1236.91ms | tok/sec: 423870.84 +step 546 | loss: 3.945741 | lr 4.5902e-04 | norm: 0.6442 | dt: 1236.47ms | tok/sec: 424019.34 +step 547 | loss: 3.905503 | lr 4.5986e-04 | norm: 0.5481 | dt: 1237.10ms | tok/sec: 423802.38 +step 548 | loss: 3.901222 | lr 4.6070e-04 | norm: 0.6032 | dt: 1234.73ms | tok/sec: 424616.87 +step 549 | loss: 3.879651 | lr 4.6154e-04 | norm: 0.4784 | dt: 1238.02ms | tok/sec: 423487.43 +validation loss: 3.9043 +validation perplexity: 49.6159 +step 550 | loss: 3.951739 | lr 4.6238e-04 | norm: 0.4485 | dt: 2678.03ms | tok/sec: 195774.04 +step 551 | loss: 3.869262 | lr 4.6322e-04 | norm: 0.4727 | dt: 1236.89ms | tok/sec: 423877.05 +step 552 | loss: 3.849909 | lr 4.6406e-04 | norm: 0.4287 | dt: 1234.71ms | tok/sec: 424623.02 +step 553 | loss: 3.959766 | lr 4.6490e-04 | norm: 0.5644 | dt: 1236.06ms | tok/sec: 424160.75 +step 554 | loss: 3.859243 | lr 4.6573e-04 | norm: 0.5623 | dt: 1234.89ms | tok/sec: 424561.12 +step 555 | loss: 3.847322 | lr 4.6657e-04 | norm: 0.4785 | dt: 1236.84ms | tok/sec: 423892.82 +step 556 | loss: 3.869086 | lr 4.6741e-04 | norm: 0.4774 | dt: 1235.42ms | tok/sec: 424381.77 +step 557 | loss: 3.931903 | lr 4.6825e-04 | norm: 0.4149 | dt: 1235.36ms | tok/sec: 424402.16 +step 558 | loss: 3.913176 | lr 4.6909e-04 | norm: 0.4640 | dt: 1236.03ms | tok/sec: 424169.59 +step 559 | loss: 3.918575 | lr 4.6993e-04 | norm: 0.4585 | dt: 1236.67ms | tok/sec: 423951.58 +step 560 | loss: 3.841910 | lr 4.7077e-04 | norm: 0.4829 | dt: 1235.21ms | tok/sec: 424452.05 +step 561 | loss: 3.965280 | lr 4.7161e-04 | norm: 0.5927 | dt: 1234.92ms | tok/sec: 424552.68 +step 562 | loss: 3.917118 | lr 4.7245e-04 | norm: 0.6516 | dt: 1236.08ms | tok/sec: 424154.70 +step 563 | loss: 3.880439 | lr 4.7329e-04 | norm: 0.4880 | dt: 1233.27ms | tok/sec: 425120.97 +step 564 | loss: 3.914160 | lr 4.7413e-04 | norm: 0.4942 | dt: 1237.14ms | tok/sec: 423788.91 +step 565 | loss: 3.887158 | lr 4.7497e-04 | norm: 0.4314 | dt: 1237.08ms | tok/sec: 423811.37 +step 566 | loss: 3.868138 | lr 4.7580e-04 | norm: 0.4370 | dt: 1233.56ms | tok/sec: 425020.07 +step 567 | loss: 3.950212 | lr 4.7664e-04 | norm: 0.5437 | dt: 1234.70ms | tok/sec: 424627.94 +step 568 | loss: 3.874136 | lr 4.7748e-04 | norm: 0.5520 | dt: 1236.50ms | tok/sec: 424009.53 +step 569 | loss: 3.924161 | lr 4.7832e-04 | norm: 0.4591 | dt: 1234.52ms | tok/sec: 424688.71 +step 570 | loss: 3.880471 | lr 4.7916e-04 | norm: 0.4078 | dt: 1232.59ms | tok/sec: 425355.91 +step 571 | loss: 3.952397 | lr 4.8000e-04 | norm: 0.4861 | dt: 1238.24ms | tok/sec: 423415.02 +step 572 | loss: 3.894690 | lr 4.8084e-04 | norm: 0.5032 | dt: 1235.87ms | tok/sec: 424225.81 +step 573 | loss: 4.015523 | lr 4.8168e-04 | norm: 0.4654 | dt: 1236.69ms | tok/sec: 423945.94 +step 574 | loss: 3.987999 | lr 4.8252e-04 | norm: 0.4498 | dt: 1237.58ms | tok/sec: 423641.05 +validation loss: 3.9099 +validation perplexity: 49.8921 +step 575 | loss: 3.887321 | lr 4.8336e-04 | norm: 0.4782 | dt: 2688.56ms | tok/sec: 195006.70 +step 576 | loss: 3.852061 | lr 4.8420e-04 | norm: 0.5486 | dt: 1236.93ms | tok/sec: 423861.85 +step 577 | loss: 3.945314 | lr 4.8503e-04 | norm: 0.5535 | dt: 1234.42ms | tok/sec: 424723.81 +step 578 | loss: 3.921410 | lr 4.8587e-04 | norm: 0.4415 | dt: 1235.25ms | tok/sec: 424440.17 +step 579 | loss: 3.908061 | lr 4.8671e-04 | norm: 0.4277 | dt: 1236.65ms | tok/sec: 423959.67 +step 580 | loss: 3.986453 | lr 4.8755e-04 | norm: 0.4682 | dt: 1231.94ms | tok/sec: 425579.73 +step 581 | loss: 3.939448 | lr 4.8839e-04 | norm: 0.4538 | dt: 1235.61ms | tok/sec: 424316.59 +step 582 | loss: 3.877398 | lr 4.8923e-04 | norm: 0.3970 | dt: 1236.84ms | tok/sec: 423892.16 +step 583 | loss: 3.922646 | lr 4.9007e-04 | norm: 0.4420 | dt: 1234.19ms | tok/sec: 424802.33 +step 584 | loss: 3.912651 | lr 4.9091e-04 | norm: 0.4495 | dt: 1232.38ms | tok/sec: 425425.77 +step 585 | loss: 3.876784 | lr 4.9175e-04 | norm: 0.4056 | dt: 1230.90ms | tok/sec: 425938.97 +step 586 | loss: 3.862882 | lr 4.9259e-04 | norm: 0.4397 | dt: 1232.97ms | tok/sec: 425222.41 +step 587 | loss: 3.868021 | lr 4.9343e-04 | norm: 0.4369 | dt: 1236.20ms | tok/sec: 424110.93 +step 588 | loss: 3.918005 | lr 4.9427e-04 | norm: 0.4447 | dt: 1236.74ms | tok/sec: 423925.99 +step 589 | loss: 3.849256 | lr 4.9510e-04 | norm: 0.4878 | dt: 1232.18ms | tok/sec: 425496.15 +step 590 | loss: 3.881243 | lr 4.9594e-04 | norm: 0.4522 | dt: 1234.70ms | tok/sec: 424628.27 +step 591 | loss: 3.942911 | lr 4.9678e-04 | norm: 0.5346 | dt: 1233.82ms | tok/sec: 424931.04 +step 592 | loss: 3.875890 | lr 4.9762e-04 | norm: 0.4725 | dt: 1234.63ms | tok/sec: 424651.80 +step 593 | loss: 3.876249 | lr 4.9846e-04 | norm: 0.5009 | dt: 1235.23ms | tok/sec: 424446.56 +step 594 | loss: 3.880204 | lr 4.9930e-04 | norm: 0.5215 | dt: 1233.83ms | tok/sec: 424926.86 +step 595 | loss: 3.959775 | lr 5.0014e-04 | norm: 0.4584 | dt: 1859.64ms | tok/sec: 281929.78 +step 596 | loss: 3.928124 | lr 5.0098e-04 | norm: 0.4718 | dt: 1234.47ms | tok/sec: 424708.47 +step 597 | loss: 3.925704 | lr 5.0182e-04 | norm: 0.4755 | dt: 1233.62ms | tok/sec: 425000.85 +step 598 | loss: 3.916266 | lr 5.0266e-04 | norm: 0.4720 | dt: 1235.26ms | tok/sec: 424433.70 +step 599 | loss: 3.983778 | lr 5.0350e-04 | norm: 0.5318 | dt: 1234.02ms | tok/sec: 424860.69 +validation loss: 3.9115 +validation perplexity: 49.9729 +step 600 | loss: 3.995497 | lr 5.0434e-04 | norm: 0.7555 | dt: 2678.72ms | tok/sec: 195723.09 +step 601 | loss: 3.958368 | lr 5.0517e-04 | norm: 0.7041 | dt: 1235.03ms | tok/sec: 424515.64 +step 602 | loss: 3.820040 | lr 5.0601e-04 | norm: 0.5337 | dt: 1240.39ms | tok/sec: 422678.40 +step 603 | loss: 3.929526 | lr 5.0685e-04 | norm: 0.4805 | dt: 1231.81ms | tok/sec: 425624.96 +step 604 | loss: 3.921060 | lr 5.0769e-04 | norm: 0.4762 | dt: 1236.43ms | tok/sec: 424033.24 +step 605 | loss: 3.918296 | lr 5.0853e-04 | norm: 0.4800 | dt: 1240.42ms | tok/sec: 422670.11 +step 606 | loss: 3.905796 | lr 5.0937e-04 | norm: 0.4705 | dt: 1236.60ms | tok/sec: 423975.93 +step 607 | loss: 3.928124 | lr 5.1021e-04 | norm: 0.5293 | dt: 1234.13ms | tok/sec: 424823.59 +step 608 | loss: 3.923735 | lr 5.1105e-04 | norm: 0.5580 | dt: 1236.60ms | tok/sec: 423973.81 +step 609 | loss: 3.924061 | lr 5.1189e-04 | norm: 0.5105 | dt: 1236.15ms | tok/sec: 424131.30 +step 610 | loss: 3.996737 | lr 5.1273e-04 | norm: 0.4816 | dt: 1237.78ms | tok/sec: 423569.89 +step 611 | loss: 3.945261 | lr 5.1357e-04 | norm: 0.5482 | dt: 1235.96ms | tok/sec: 424195.53 +step 612 | loss: 3.909083 | lr 5.1441e-04 | norm: 0.5220 | dt: 1237.29ms | tok/sec: 423740.24 +step 613 | loss: 3.988202 | lr 5.1524e-04 | norm: 0.4437 | dt: 1237.01ms | tok/sec: 423834.24 +step 614 | loss: 3.899864 | lr 5.1608e-04 | norm: 0.4658 | dt: 1239.95ms | tok/sec: 422830.62 +step 615 | loss: 3.920360 | lr 5.1692e-04 | norm: 0.4068 | dt: 1237.60ms | tok/sec: 423632.64 +step 616 | loss: 3.970065 | lr 5.1776e-04 | norm: 0.4492 | dt: 1235.46ms | tok/sec: 424365.80 +step 617 | loss: 3.938045 | lr 5.1860e-04 | norm: 0.4436 | dt: 1234.64ms | tok/sec: 424649.83 +step 618 | loss: 3.881736 | lr 5.1944e-04 | norm: 0.4090 | dt: 1235.98ms | tok/sec: 424186.77 +step 619 | loss: 3.880761 | lr 5.2028e-04 | norm: 0.4361 | dt: 1235.63ms | tok/sec: 424309.46 +step 620 | loss: 3.860386 | lr 5.2112e-04 | norm: 0.5157 | dt: 1238.02ms | tok/sec: 423489.22 +step 621 | loss: 3.875401 | lr 5.2196e-04 | norm: 0.5606 | dt: 1233.63ms | tok/sec: 424996.74 +step 622 | loss: 3.931608 | lr 5.2280e-04 | norm: 0.5434 | dt: 1231.14ms | tok/sec: 425856.16 +step 623 | loss: 3.898251 | lr 5.2364e-04 | norm: 0.5106 | dt: 1236.79ms | tok/sec: 423910.22 +step 624 | loss: 3.906253 | lr 5.2448e-04 | norm: 0.5342 | dt: 1236.40ms | tok/sec: 424045.59 +validation loss: 3.9158 +validation perplexity: 50.1870 +step 625 | loss: 3.950860 | lr 5.2531e-04 | norm: 0.4762 | dt: 2688.91ms | tok/sec: 194981.63 +step 626 | loss: 3.874368 | lr 5.2615e-04 | norm: 0.4193 | dt: 1234.93ms | tok/sec: 424549.57 +step 627 | loss: 3.984074 | lr 5.2699e-04 | norm: 0.4603 | dt: 1236.04ms | tok/sec: 424168.85 +step 628 | loss: 3.956202 | lr 5.2783e-04 | norm: 0.4849 | dt: 1233.26ms | tok/sec: 425125.00 +step 629 | loss: 3.968324 | lr 5.2867e-04 | norm: 0.4811 | dt: 1231.10ms | tok/sec: 425870.43 +step 630 | loss: 3.969553 | lr 5.2951e-04 | norm: 0.5246 | dt: 1231.46ms | tok/sec: 425743.37 +step 631 | loss: 3.956055 | lr 5.3035e-04 | norm: 0.4707 | dt: 1232.27ms | tok/sec: 425466.43 +step 632 | loss: 3.906036 | lr 5.3119e-04 | norm: 0.5733 | dt: 1234.24ms | tok/sec: 424785.18 +step 633 | loss: 3.921305 | lr 5.3203e-04 | norm: 0.7208 | dt: 1234.17ms | tok/sec: 424809.80 +step 634 | loss: 4.010479 | lr 5.3287e-04 | norm: 0.9122 | dt: 1232.18ms | tok/sec: 425495.16 +step 635 | loss: 3.988150 | lr 5.3371e-04 | norm: 0.6647 | dt: 1236.77ms | tok/sec: 423916.76 +step 636 | loss: 3.895024 | lr 5.3455e-04 | norm: 0.5145 | dt: 1235.86ms | tok/sec: 424229.90 +step 637 | loss: 3.910872 | lr 5.3538e-04 | norm: 0.4302 | dt: 1237.66ms | tok/sec: 423611.67 +step 638 | loss: 3.910724 | lr 5.3622e-04 | norm: 0.3835 | dt: 1232.61ms | tok/sec: 425348.09 +step 639 | loss: 3.896214 | lr 5.3706e-04 | norm: 0.4414 | dt: 1234.08ms | tok/sec: 424842.46 +step 640 | loss: 3.918957 | lr 5.3790e-04 | norm: 0.4065 | dt: 1235.27ms | tok/sec: 424432.88 +step 641 | loss: 3.961314 | lr 5.3874e-04 | norm: 0.4217 | dt: 1236.57ms | tok/sec: 423984.84 +step 642 | loss: 3.970993 | lr 5.3958e-04 | norm: 0.3847 | dt: 1233.59ms | tok/sec: 425010.22 +step 643 | loss: 3.967100 | lr 5.4042e-04 | norm: 0.4268 | dt: 1237.81ms | tok/sec: 423560.43 +step 644 | loss: 3.922627 | lr 5.4126e-04 | norm: 0.4012 | dt: 1234.16ms | tok/sec: 424812.26 +step 645 | loss: 3.929216 | lr 5.4210e-04 | norm: 0.3930 | dt: 1234.45ms | tok/sec: 424713.97 +step 646 | loss: 3.880909 | lr 5.4294e-04 | norm: 0.3605 | dt: 1236.78ms | tok/sec: 423913.49 +step 647 | loss: 3.968946 | lr 5.4378e-04 | norm: 0.3864 | dt: 1235.04ms | tok/sec: 424511.95 +step 648 | loss: 3.935027 | lr 5.4462e-04 | norm: 0.3834 | dt: 1233.24ms | tok/sec: 425131.90 +step 649 | loss: 3.995211 | lr 5.4545e-04 | norm: 0.4065 | dt: 1234.79ms | tok/sec: 424596.29 +validation loss: 3.9158 +validation perplexity: 50.1895 +step 650 | loss: 3.929068 | lr 5.4629e-04 | norm: 0.3907 | dt: 2680.48ms | tok/sec: 195595.08 +step 651 | loss: 3.891405 | lr 5.4713e-04 | norm: 0.4540 | dt: 1232.58ms | tok/sec: 425357.31 +step 652 | loss: 3.919350 | lr 5.4797e-04 | norm: 0.6584 | dt: 1234.91ms | tok/sec: 424557.35 +step 653 | loss: 3.901128 | lr 5.4881e-04 | norm: 0.8470 | dt: 1233.96ms | tok/sec: 424881.54 +step 654 | loss: 3.941038 | lr 5.4965e-04 | norm: 0.6859 | dt: 1234.83ms | tok/sec: 424583.50 +step 655 | loss: 3.902657 | lr 5.5049e-04 | norm: 0.5460 | dt: 1236.79ms | tok/sec: 423909.90 +step 656 | loss: 3.943504 | lr 5.5133e-04 | norm: 0.5530 | dt: 1233.15ms | tok/sec: 425160.92 +step 657 | loss: 3.904582 | lr 5.5217e-04 | norm: 0.5543 | dt: 1235.49ms | tok/sec: 424357.86 +step 658 | loss: 3.840514 | lr 5.5301e-04 | norm: 0.4653 | dt: 1238.47ms | tok/sec: 423336.20 +step 659 | loss: 3.889364 | lr 5.5385e-04 | norm: 0.4216 | dt: 1235.72ms | tok/sec: 424276.80 +step 660 | loss: 3.926849 | lr 5.5469e-04 | norm: 0.4363 | dt: 1239.18ms | tok/sec: 423092.66 +step 661 | loss: 3.933321 | lr 5.5552e-04 | norm: 0.4567 | dt: 1239.19ms | tok/sec: 423088.02 +step 662 | loss: 3.921920 | lr 5.5636e-04 | norm: 0.5107 | dt: 1234.71ms | tok/sec: 424624.58 +step 663 | loss: 3.871307 | lr 5.5720e-04 | norm: 0.4484 | dt: 1236.84ms | tok/sec: 423893.23 +step 664 | loss: 3.941711 | lr 5.5804e-04 | norm: 0.4472 | dt: 1231.30ms | tok/sec: 425800.58 +step 665 | loss: 3.916007 | lr 5.5888e-04 | norm: 0.4706 | dt: 1230.96ms | tok/sec: 425918.18 +step 666 | loss: 3.908528 | lr 5.5972e-04 | norm: 0.4084 | dt: 1232.84ms | tok/sec: 425267.56 +step 667 | loss: 3.943529 | lr 5.6056e-04 | norm: 0.4321 | dt: 1234.23ms | tok/sec: 424789.04 +step 668 | loss: 3.905954 | lr 5.6140e-04 | norm: 0.4597 | dt: 1238.03ms | tok/sec: 423486.45 +step 669 | loss: 3.884871 | lr 5.6224e-04 | norm: 0.5472 | dt: 1234.03ms | tok/sec: 424859.87 +step 670 | loss: 3.968729 | lr 5.6308e-04 | norm: 0.5344 | dt: 1234.37ms | tok/sec: 424742.76 +step 671 | loss: 4.021111 | lr 5.6392e-04 | norm: 0.4583 | dt: 1231.08ms | tok/sec: 425876.20 +step 672 | loss: 4.037140 | lr 5.6476e-04 | norm: 0.4467 | dt: 1234.20ms | tok/sec: 424801.51 +step 673 | loss: 3.956490 | lr 5.6559e-04 | norm: 0.4565 | dt: 1236.46ms | tok/sec: 424023.92 +step 674 | loss: 3.983686 | lr 5.6643e-04 | norm: 0.4165 | dt: 1237.60ms | tok/sec: 423632.81 +validation loss: 3.9239 +validation perplexity: 50.5975 +step 675 | loss: 3.980011 | lr 5.6727e-04 | norm: 0.4471 | dt: 2682.91ms | tok/sec: 195417.82 +step 676 | loss: 3.904492 | lr 5.6811e-04 | norm: 0.4836 | dt: 1234.02ms | tok/sec: 424863.15 +step 677 | loss: 3.931018 | lr 5.6895e-04 | norm: 0.4830 | dt: 1234.50ms | tok/sec: 424697.81 +step 678 | loss: 3.924693 | lr 5.6979e-04 | norm: 0.4346 | dt: 1229.51ms | tok/sec: 426421.08 +step 679 | loss: 3.942250 | lr 5.7063e-04 | norm: 0.4504 | dt: 1232.73ms | tok/sec: 425305.15 +step 680 | loss: 3.873550 | lr 5.7147e-04 | norm: 0.4475 | dt: 1236.94ms | tok/sec: 423857.85 +step 681 | loss: 4.044616 | lr 5.7231e-04 | norm: 1.0471 | dt: 1234.69ms | tok/sec: 424630.56 +step 682 | loss: 3.952139 | lr 5.7315e-04 | norm: 0.6015 | dt: 1235.22ms | tok/sec: 424449.18 +step 683 | loss: 3.975157 | lr 5.7399e-04 | norm: 0.6692 | dt: 1238.46ms | tok/sec: 423338.48 +step 684 | loss: 3.917153 | lr 5.7483e-04 | norm: 0.5734 | dt: 1234.98ms | tok/sec: 424532.52 +step 685 | loss: 3.999467 | lr 5.7566e-04 | norm: 0.5546 | dt: 1234.22ms | tok/sec: 424792.98 +step 686 | loss: 3.943334 | lr 5.7650e-04 | norm: 0.7045 | dt: 1234.65ms | tok/sec: 424643.68 +step 687 | loss: 3.885042 | lr 5.7734e-04 | norm: 0.7009 | dt: 1232.65ms | tok/sec: 425335.17 +step 688 | loss: 3.941897 | lr 5.7818e-04 | norm: 0.5516 | dt: 1236.35ms | tok/sec: 424060.23 +step 689 | loss: 3.922449 | lr 5.7902e-04 | norm: 0.5149 | dt: 1235.54ms | tok/sec: 424338.86 +step 690 | loss: 3.919404 | lr 5.7986e-04 | norm: 0.5741 | dt: 1233.55ms | tok/sec: 425025.00 +step 691 | loss: 4.039830 | lr 5.8070e-04 | norm: 0.4711 | dt: 1234.53ms | tok/sec: 424687.72 +step 692 | loss: 3.907504 | lr 5.8154e-04 | norm: 0.4935 | dt: 1233.85ms | tok/sec: 424921.85 +step 693 | loss: 3.881465 | lr 5.8238e-04 | norm: 0.4604 | dt: 1232.33ms | tok/sec: 425445.94 +step 694 | loss: 3.921789 | lr 5.8322e-04 | norm: 0.5114 | dt: 1236.81ms | tok/sec: 423902.05 +step 695 | loss: 3.927493 | lr 5.8406e-04 | norm: 0.4450 | dt: 1233.66ms | tok/sec: 424986.64 +step 696 | loss: 3.886823 | lr 5.8490e-04 | norm: 0.3864 | dt: 1232.93ms | tok/sec: 425238.78 +step 697 | loss: 4.027889 | lr 5.8573e-04 | norm: 0.4355 | dt: 1233.73ms | tok/sec: 424961.35 +step 698 | loss: 3.896973 | lr 5.8657e-04 | norm: 0.4159 | dt: 1230.73ms | tok/sec: 425998.14 +step 699 | loss: 3.894875 | lr 5.8741e-04 | norm: 0.4064 | dt: 1237.92ms | tok/sec: 423524.78 +validation loss: 3.9326 +validation perplexity: 51.0400 +step 700 | loss: 3.881636 | lr 5.8825e-04 | norm: 0.4291 | dt: 2686.08ms | tok/sec: 195186.76 +step 701 | loss: 3.894817 | lr 5.8909e-04 | norm: 0.4762 | dt: 1236.22ms | tok/sec: 424104.39 +step 702 | loss: 3.903635 | lr 5.8993e-04 | norm: 0.4273 | dt: 1234.34ms | tok/sec: 424751.38 +step 703 | loss: 3.914458 | lr 5.9077e-04 | norm: 0.3919 | dt: 1234.50ms | tok/sec: 424697.56 +step 704 | loss: 3.874629 | lr 5.9161e-04 | norm: 0.4269 | dt: 1236.09ms | tok/sec: 424149.71 +step 705 | loss: 3.882901 | lr 5.9245e-04 | norm: 0.4585 | dt: 1232.84ms | tok/sec: 425267.56 +step 706 | loss: 3.922693 | lr 5.9329e-04 | norm: 0.5333 | dt: 1237.59ms | tok/sec: 423635.83 +step 707 | loss: 3.978151 | lr 5.9413e-04 | norm: 0.6643 | dt: 1233.22ms | tok/sec: 425139.14 +step 708 | loss: 3.906362 | lr 5.9497e-04 | norm: 0.6347 | dt: 1235.79ms | tok/sec: 424253.72 +step 709 | loss: 3.909367 | lr 5.9580e-04 | norm: 0.4203 | dt: 1237.03ms | tok/sec: 423826.97 +step 710 | loss: 3.992977 | lr 5.9664e-04 | norm: 0.4777 | dt: 1235.72ms | tok/sec: 424278.44 +step 711 | loss: 3.971006 | lr 5.9748e-04 | norm: 0.4371 | dt: 1237.48ms | tok/sec: 423673.94 +step 712 | loss: 3.944895 | lr 5.9832e-04 | norm: 0.4543 | dt: 1238.31ms | tok/sec: 423388.85 +step 713 | loss: 4.016311 | lr 5.9916e-04 | norm: 0.4202 | dt: 1235.75ms | tok/sec: 424267.71 +step 714 | loss: 3.963895 | lr 6.0000e-04 | norm: 0.4537 | dt: 1235.13ms | tok/sec: 424479.33 +step 715 | loss: 3.964758 | lr 6.0000e-04 | norm: 0.4134 | dt: 1236.61ms | tok/sec: 423970.29 +step 716 | loss: 3.980748 | lr 6.0000e-04 | norm: 0.4160 | dt: 1236.60ms | tok/sec: 423976.59 +step 717 | loss: 4.051491 | lr 6.0000e-04 | norm: 0.4653 | dt: 1235.82ms | tok/sec: 424242.01 +step 718 | loss: 3.962289 | lr 6.0000e-04 | norm: 0.4444 | dt: 1234.57ms | tok/sec: 424671.48 +step 719 | loss: 3.869091 | lr 5.9999e-04 | norm: 0.4284 | dt: 1234.74ms | tok/sec: 424612.85 +step 720 | loss: 3.884332 | lr 5.9999e-04 | norm: 0.4549 | dt: 1236.10ms | tok/sec: 424147.09 +step 721 | loss: 3.937107 | lr 5.9998e-04 | norm: 0.5166 | dt: 1234.74ms | tok/sec: 424613.18 +step 722 | loss: 3.931679 | lr 5.9998e-04 | norm: 0.4877 | dt: 1240.94ms | tok/sec: 422491.13 +step 723 | loss: 4.019020 | lr 5.9997e-04 | norm: 0.5330 | dt: 1234.10ms | tok/sec: 424832.62 +step 724 | loss: 3.920989 | lr 5.9997e-04 | norm: 0.6130 | dt: 1235.21ms | tok/sec: 424453.28 +validation loss: 3.9444 +validation perplexity: 51.6456 +step 725 | loss: 4.000432 | lr 5.9996e-04 | norm: 0.5363 | dt: 2682.24ms | tok/sec: 195466.44 +step 726 | loss: 3.915324 | lr 5.9995e-04 | norm: 0.5131 | dt: 1232.04ms | tok/sec: 425544.65 +step 727 | loss: 3.890289 | lr 5.9994e-04 | norm: 0.4702 | dt: 1237.08ms | tok/sec: 423809.65 +step 728 | loss: 3.994755 | lr 5.9993e-04 | norm: 0.4862 | dt: 1231.81ms | tok/sec: 425624.38 +step 729 | loss: 3.966922 | lr 5.9992e-04 | norm: 0.4314 | dt: 1231.14ms | tok/sec: 425854.76 +step 730 | loss: 3.918770 | lr 5.9991e-04 | norm: 0.3994 | dt: 1239.76ms | tok/sec: 422894.29 +step 731 | loss: 3.878277 | lr 5.9989e-04 | norm: 0.4251 | dt: 1231.68ms | tok/sec: 425667.80 +step 732 | loss: 3.957591 | lr 5.9988e-04 | norm: 0.4108 | dt: 1236.73ms | tok/sec: 423929.34 +step 733 | loss: 3.932230 | lr 5.9986e-04 | norm: 0.3865 | dt: 1233.77ms | tok/sec: 424946.89 +step 734 | loss: 3.947205 | lr 5.9985e-04 | norm: 0.4528 | dt: 1234.73ms | tok/sec: 424617.94 +step 735 | loss: 3.917304 | lr 5.9983e-04 | norm: 0.3622 | dt: 1234.26ms | tok/sec: 424777.55 +step 736 | loss: 3.976538 | lr 5.9982e-04 | norm: 0.4181 | dt: 1235.94ms | tok/sec: 424200.36 +step 737 | loss: 3.922392 | lr 5.9980e-04 | norm: 0.4378 | dt: 1236.90ms | tok/sec: 423871.41 +step 738 | loss: 3.954836 | lr 5.9978e-04 | norm: 0.4294 | dt: 1237.55ms | tok/sec: 423651.25 +step 739 | loss: 3.924319 | lr 5.9976e-04 | norm: 0.5153 | dt: 1235.92ms | tok/sec: 424209.19 +step 740 | loss: 3.910630 | lr 5.9974e-04 | norm: 0.5038 | dt: 1236.49ms | tok/sec: 424013.21 +step 741 | loss: 3.924643 | lr 5.9972e-04 | norm: 0.5303 | dt: 1236.51ms | tok/sec: 424006.18 +step 742 | loss: 3.951087 | lr 5.9970e-04 | norm: 0.4962 | dt: 1232.98ms | tok/sec: 425219.12 +step 743 | loss: 3.973892 | lr 5.9967e-04 | norm: 0.4737 | dt: 1235.20ms | tok/sec: 424456.47 +step 744 | loss: 3.958550 | lr 5.9965e-04 | norm: 0.4060 | dt: 1236.70ms | tok/sec: 423940.30 +step 745 | loss: 3.980742 | lr 5.9962e-04 | norm: 0.4539 | dt: 1235.88ms | tok/sec: 424222.37 +step 746 | loss: 3.878482 | lr 5.9960e-04 | norm: 0.4057 | dt: 1237.66ms | tok/sec: 423610.69 +step 747 | loss: 3.920609 | lr 5.9957e-04 | norm: 0.3985 | dt: 1231.92ms | tok/sec: 425587.39 +step 748 | loss: 3.971339 | lr 5.9954e-04 | norm: 0.4036 | dt: 1235.35ms | tok/sec: 424403.64 +step 749 | loss: 3.886545 | lr 5.9952e-04 | norm: 0.5063 | dt: 1232.05ms | tok/sec: 425540.29 +validation loss: 3.9362 +validation perplexity: 51.2249 +step 750 | loss: 3.967291 | lr 5.9949e-04 | norm: 0.5833 | dt: 2676.92ms | tok/sec: 195854.89 +step 751 | loss: 3.950522 | lr 5.9946e-04 | norm: 0.6213 | dt: 1236.77ms | tok/sec: 423917.00 +step 752 | loss: 3.954453 | lr 5.9943e-04 | norm: 0.5220 | dt: 1236.41ms | tok/sec: 424040.28 +step 753 | loss: 3.952019 | lr 5.9940e-04 | norm: 0.3963 | dt: 1237.40ms | tok/sec: 423700.88 +step 754 | loss: 3.884312 | lr 5.9936e-04 | norm: 0.5578 | dt: 1240.15ms | tok/sec: 422760.15 +step 755 | loss: 3.935417 | lr 5.9933e-04 | norm: 0.4677 | dt: 1236.78ms | tok/sec: 423912.18 +step 756 | loss: 3.976043 | lr 5.9930e-04 | norm: 0.5030 | dt: 1234.87ms | tok/sec: 424568.34 +step 757 | loss: 3.916650 | lr 5.9926e-04 | norm: 0.4456 | dt: 1234.83ms | tok/sec: 424584.24 +step 758 | loss: 3.890563 | lr 5.9923e-04 | norm: 0.4027 | dt: 1234.02ms | tok/sec: 424861.34 +step 759 | loss: 3.912485 | lr 5.9919e-04 | norm: 0.3965 | dt: 1232.70ms | tok/sec: 425316.09 +step 760 | loss: 3.907616 | lr 5.9915e-04 | norm: 0.3970 | dt: 1232.92ms | tok/sec: 425240.01 +step 761 | loss: 3.928781 | lr 5.9912e-04 | norm: 0.4043 | dt: 1236.95ms | tok/sec: 423853.93 +step 762 | loss: 3.927971 | lr 5.9908e-04 | norm: 0.3846 | dt: 1235.51ms | tok/sec: 424350.81 +step 763 | loss: 3.948019 | lr 5.9904e-04 | norm: 0.3582 | dt: 1234.39ms | tok/sec: 424734.40 +step 764 | loss: 3.859493 | lr 5.9900e-04 | norm: 0.3716 | dt: 1236.14ms | tok/sec: 424134.74 +step 765 | loss: 3.986724 | lr 5.9896e-04 | norm: 0.3844 | dt: 1236.57ms | tok/sec: 423986.64 +step 766 | loss: 3.964188 | lr 5.9891e-04 | norm: 0.3836 | dt: 1234.18ms | tok/sec: 424805.70 +step 767 | loss: 3.900281 | lr 5.9887e-04 | norm: 0.4036 | dt: 1230.07ms | tok/sec: 426225.70 +step 768 | loss: 3.910514 | lr 5.9883e-04 | norm: 0.4166 | dt: 1236.58ms | tok/sec: 423981.00 +step 769 | loss: 3.889319 | lr 5.9878e-04 | norm: 0.3438 | dt: 1234.62ms | tok/sec: 424654.59 +step 770 | loss: 3.913004 | lr 5.9874e-04 | norm: 0.4007 | dt: 1235.92ms | tok/sec: 424209.19 +step 771 | loss: 3.928488 | lr 5.9869e-04 | norm: 0.6350 | dt: 1236.52ms | tok/sec: 424002.50 +step 772 | loss: 3.970248 | lr 5.9864e-04 | norm: 0.7232 | dt: 1232.32ms | tok/sec: 425448.32 +step 773 | loss: 3.913821 | lr 5.9859e-04 | norm: 0.5029 | dt: 1230.62ms | tok/sec: 426035.85 +step 774 | loss: 3.977076 | lr 5.9855e-04 | norm: 0.5604 | dt: 1234.38ms | tok/sec: 424736.36 +validation loss: 3.9406 +validation perplexity: 51.4519 +step 775 | loss: 3.948256 | lr 5.9850e-04 | norm: 0.6299 | dt: 2694.83ms | tok/sec: 194553.45 +step 776 | loss: 3.922258 | lr 5.9845e-04 | norm: 0.6689 | dt: 1235.26ms | tok/sec: 424435.83 +step 777 | loss: 3.965634 | lr 5.9839e-04 | norm: 0.6289 | dt: 1232.69ms | tok/sec: 425321.44 +step 778 | loss: 3.931684 | lr 5.9834e-04 | norm: 0.6886 | dt: 1233.04ms | tok/sec: 425198.65 +step 779 | loss: 3.965815 | lr 5.9829e-04 | norm: 0.4756 | dt: 1234.89ms | tok/sec: 424562.03 +step 780 | loss: 3.913942 | lr 5.9824e-04 | norm: 0.4666 | dt: 1234.94ms | tok/sec: 424544.57 +step 781 | loss: 3.942941 | lr 5.9818e-04 | norm: 0.4324 | dt: 1236.37ms | tok/sec: 424055.65 +step 782 | loss: 3.927706 | lr 5.9812e-04 | norm: 0.3827 | dt: 1235.18ms | tok/sec: 424464.01 +step 783 | loss: 3.957249 | lr 5.9807e-04 | norm: 0.3596 | dt: 1235.10ms | tok/sec: 424491.38 +step 784 | loss: 3.999262 | lr 5.9801e-04 | norm: 0.4145 | dt: 1235.64ms | tok/sec: 424305.37 +step 785 | loss: 3.892890 | lr 5.9795e-04 | norm: 0.4325 | dt: 1233.16ms | tok/sec: 425157.55 +step 786 | loss: 3.991087 | lr 5.9789e-04 | norm: 0.4321 | dt: 1862.11ms | tok/sec: 281555.81 +step 787 | loss: 3.910872 | lr 5.9784e-04 | norm: 0.3698 | dt: 1237.03ms | tok/sec: 423829.50 +step 788 | loss: 3.931558 | lr 5.9777e-04 | norm: 0.3895 | dt: 1237.74ms | tok/sec: 423584.91 +step 789 | loss: 3.927254 | lr 5.9771e-04 | norm: 0.4659 | dt: 1231.77ms | tok/sec: 425637.97 +step 790 | loss: 3.941753 | lr 5.9765e-04 | norm: 0.4194 | dt: 1235.27ms | tok/sec: 424431.41 +step 791 | loss: 3.904380 | lr 5.9759e-04 | norm: 0.3789 | dt: 1235.11ms | tok/sec: 424487.36 +step 792 | loss: 3.983484 | lr 5.9752e-04 | norm: 0.4000 | dt: 1235.01ms | tok/sec: 424522.68 +step 793 | loss: 3.965390 | lr 5.9746e-04 | norm: 0.4258 | dt: 1237.90ms | tok/sec: 423531.47 +step 794 | loss: 3.894158 | lr 5.9739e-04 | norm: 0.3969 | dt: 1239.36ms | tok/sec: 423030.97 +step 795 | loss: 3.874352 | lr 5.9733e-04 | norm: 0.3566 | dt: 1237.16ms | tok/sec: 423782.29 +step 796 | loss: 3.895689 | lr 5.9726e-04 | norm: 0.3597 | dt: 1232.68ms | tok/sec: 425324.81 +step 797 | loss: 3.891604 | lr 5.9719e-04 | norm: 0.3641 | dt: 1233.38ms | tok/sec: 425080.95 +step 798 | loss: 3.861352 | lr 5.9712e-04 | norm: 0.3664 | dt: 1235.48ms | tok/sec: 424358.10 +step 799 | loss: 3.932593 | lr 5.9705e-04 | norm: 0.3459 | dt: 1236.13ms | tok/sec: 424136.46 +validation loss: 3.9318 +validation perplexity: 50.9970 +step 800 | loss: 3.994884 | lr 5.9698e-04 | norm: 0.4564 | dt: 2673.80ms | tok/sec: 196083.41 +step 801 | loss: 3.873833 | lr 5.9691e-04 | norm: 0.5934 | dt: 1236.68ms | tok/sec: 423948.88 +step 802 | loss: 3.970432 | lr 5.9684e-04 | norm: 0.4872 | dt: 1234.15ms | tok/sec: 424817.02 +step 803 | loss: 3.938663 | lr 5.9677e-04 | norm: 0.4737 | dt: 1237.57ms | tok/sec: 423643.09 +step 804 | loss: 3.925446 | lr 5.9669e-04 | norm: 0.5139 | dt: 1236.62ms | tok/sec: 423967.43 +step 805 | loss: 3.920943 | lr 5.9662e-04 | norm: 0.4982 | dt: 1237.04ms | tok/sec: 423825.17 +step 806 | loss: 3.947798 | lr 5.9654e-04 | norm: 0.4421 | dt: 1236.97ms | tok/sec: 423849.51 +step 807 | loss: 3.905314 | lr 5.9647e-04 | norm: 0.3853 | dt: 1240.20ms | tok/sec: 422743.97 +step 808 | loss: 3.913078 | lr 5.9639e-04 | norm: 0.4755 | dt: 1237.08ms | tok/sec: 423810.96 +step 809 | loss: 3.934447 | lr 5.9631e-04 | norm: 0.4723 | dt: 1237.26ms | tok/sec: 423748.07 +step 810 | loss: 3.997724 | lr 5.9623e-04 | norm: 0.4065 | dt: 1237.91ms | tok/sec: 423526.90 +step 811 | loss: 3.928231 | lr 5.9616e-04 | norm: 0.4852 | dt: 1239.58ms | tok/sec: 422955.30 +step 812 | loss: 3.973211 | lr 5.9607e-04 | norm: 0.4064 | dt: 1236.29ms | tok/sec: 424082.96 +step 813 | loss: 4.031930 | lr 5.9599e-04 | norm: 0.4173 | dt: 1233.96ms | tok/sec: 424881.62 +step 814 | loss: 3.965290 | lr 5.9591e-04 | norm: 0.4821 | dt: 1237.03ms | tok/sec: 423827.87 +step 815 | loss: 3.944231 | lr 5.9583e-04 | norm: 0.4708 | dt: 1236.64ms | tok/sec: 423961.06 +step 816 | loss: 4.024728 | lr 5.9575e-04 | norm: 0.4296 | dt: 1237.13ms | tok/sec: 423792.91 +step 817 | loss: 3.939555 | lr 5.9566e-04 | norm: 0.4285 | dt: 1235.72ms | tok/sec: 424277.78 +step 818 | loss: 3.950910 | lr 5.9558e-04 | norm: 0.3972 | dt: 1235.07ms | tok/sec: 424501.95 +step 819 | loss: 3.973635 | lr 5.9549e-04 | norm: 0.4157 | dt: 1235.15ms | tok/sec: 424474.09 +step 820 | loss: 4.026714 | lr 5.9540e-04 | norm: 0.4428 | dt: 1231.87ms | tok/sec: 425603.62 +step 821 | loss: 3.929379 | lr 5.9532e-04 | norm: 0.3495 | dt: 1233.30ms | tok/sec: 425108.15 +step 822 | loss: 3.963857 | lr 5.9523e-04 | norm: 0.3852 | dt: 1233.18ms | tok/sec: 425149.90 +step 823 | loss: 3.910863 | lr 5.9514e-04 | norm: 0.4363 | dt: 1235.91ms | tok/sec: 424213.12 +step 824 | loss: 3.851832 | lr 5.9505e-04 | norm: 0.4310 | dt: 1237.08ms | tok/sec: 423812.10 +validation loss: 3.9327 +validation perplexity: 51.0427 +step 825 | loss: 3.938242 | lr 5.9496e-04 | norm: 0.4203 | dt: 2680.15ms | tok/sec: 195619.20 +step 826 | loss: 3.931685 | lr 5.9486e-04 | norm: 0.4176 | dt: 1237.03ms | tok/sec: 423827.05 +step 827 | loss: 3.970970 | lr 5.9477e-04 | norm: 0.3564 | dt: 1233.43ms | tok/sec: 425066.57 +step 828 | loss: 3.931909 | lr 5.9468e-04 | norm: 0.4107 | dt: 1234.09ms | tok/sec: 424836.14 +step 829 | loss: 3.903415 | lr 5.9458e-04 | norm: 0.3796 | dt: 1233.45ms | tok/sec: 425057.12 +step 830 | loss: 3.904668 | lr 5.9449e-04 | norm: 0.3817 | dt: 1235.85ms | tok/sec: 424231.45 +step 831 | loss: 3.983983 | lr 5.9439e-04 | norm: 0.3739 | dt: 1233.51ms | tok/sec: 425036.42 +step 832 | loss: 3.905729 | lr 5.9430e-04 | norm: 0.3729 | dt: 1234.54ms | tok/sec: 424683.78 +step 833 | loss: 3.956767 | lr 5.9420e-04 | norm: 0.4076 | dt: 1234.58ms | tok/sec: 424669.76 +step 834 | loss: 3.920690 | lr 5.9410e-04 | norm: 0.4651 | dt: 1234.04ms | tok/sec: 424853.63 +step 835 | loss: 3.941207 | lr 5.9400e-04 | norm: 0.5125 | dt: 1236.40ms | tok/sec: 424044.61 +step 836 | loss: 3.882663 | lr 5.9390e-04 | norm: 0.4921 | dt: 1232.63ms | tok/sec: 425340.85 +step 837 | loss: 3.928998 | lr 5.9380e-04 | norm: 0.4339 | dt: 1238.25ms | tok/sec: 423408.90 +step 838 | loss: 4.007948 | lr 5.9370e-04 | norm: 0.3979 | dt: 1233.12ms | tok/sec: 425171.77 +step 839 | loss: 3.961152 | lr 5.9360e-04 | norm: 0.4864 | dt: 1235.82ms | tok/sec: 424243.40 +step 840 | loss: 3.853265 | lr 5.9349e-04 | norm: 0.3976 | dt: 1231.78ms | tok/sec: 425635.99 +step 841 | loss: 3.940165 | lr 5.9339e-04 | norm: 0.4203 | dt: 1235.27ms | tok/sec: 424432.14 +step 842 | loss: 3.941086 | lr 5.9328e-04 | norm: 0.4700 | dt: 1235.34ms | tok/sec: 424408.88 +step 843 | loss: 3.932655 | lr 5.9318e-04 | norm: 0.3788 | dt: 1233.77ms | tok/sec: 424946.89 +step 844 | loss: 3.900338 | lr 5.9307e-04 | norm: 0.3736 | dt: 1232.72ms | tok/sec: 425310.82 +step 845 | loss: 3.939884 | lr 5.9296e-04 | norm: 0.3517 | dt: 1235.27ms | tok/sec: 424433.62 +step 846 | loss: 3.974152 | lr 5.9286e-04 | norm: 0.3670 | dt: 1237.68ms | tok/sec: 423607.02 +step 847 | loss: 3.959493 | lr 5.9275e-04 | norm: 0.3811 | dt: 1233.80ms | tok/sec: 424937.70 +step 848 | loss: 3.956165 | lr 5.9264e-04 | norm: 0.3841 | dt: 1235.29ms | tok/sec: 424424.85 +step 849 | loss: 4.019079 | lr 5.9253e-04 | norm: 0.3537 | dt: 1234.67ms | tok/sec: 424636.47 +validation loss: 3.9294 +validation perplexity: 50.8758 +step 850 | loss: 3.945986 | lr 5.9241e-04 | norm: 0.4302 | dt: 2685.14ms | tok/sec: 195255.34 +step 851 | loss: 4.072665 | lr 5.9230e-04 | norm: 0.5593 | dt: 1239.48ms | tok/sec: 422991.66 +step 852 | loss: 3.937315 | lr 5.9219e-04 | norm: 0.5236 | dt: 1233.20ms | tok/sec: 425143.33 +step 853 | loss: 3.967475 | lr 5.9208e-04 | norm: 0.4992 | dt: 1233.53ms | tok/sec: 425030.01 +step 854 | loss: 4.024686 | lr 5.9196e-04 | norm: 0.5254 | dt: 1233.47ms | tok/sec: 425050.30 +step 855 | loss: 3.966960 | lr 5.9185e-04 | norm: 0.5045 | dt: 1233.17ms | tok/sec: 425153.27 +step 856 | loss: 3.935577 | lr 5.9173e-04 | norm: 0.4518 | dt: 1234.87ms | tok/sec: 424570.47 +step 857 | loss: 3.919321 | lr 5.9161e-04 | norm: 0.4130 | dt: 1235.36ms | tok/sec: 424401.34 +step 858 | loss: 3.901103 | lr 5.9149e-04 | norm: 0.3764 | dt: 1234.96ms | tok/sec: 424538.42 +step 859 | loss: 3.925479 | lr 5.9138e-04 | norm: 0.3612 | dt: 1236.16ms | tok/sec: 424125.25 +step 860 | loss: 3.923503 | lr 5.9126e-04 | norm: 0.3777 | dt: 1237.17ms | tok/sec: 423781.72 +step 861 | loss: 3.973137 | lr 5.9114e-04 | norm: 0.3615 | dt: 1233.67ms | tok/sec: 424982.04 +step 862 | loss: 3.894149 | lr 5.9101e-04 | norm: 0.4352 | dt: 1234.24ms | tok/sec: 424785.76 +step 863 | loss: 3.911928 | lr 5.9089e-04 | norm: 0.4411 | dt: 1234.38ms | tok/sec: 424738.25 +step 864 | loss: 3.930174 | lr 5.9077e-04 | norm: 0.4145 | dt: 1236.06ms | tok/sec: 424160.51 +step 865 | loss: 3.920861 | lr 5.9065e-04 | norm: 0.4384 | dt: 1236.97ms | tok/sec: 423849.43 +step 866 | loss: 3.875911 | lr 5.9052e-04 | norm: 0.3487 | dt: 1238.91ms | tok/sec: 423184.75 +step 867 | loss: 3.893018 | lr 5.9040e-04 | norm: 0.3718 | dt: 1234.80ms | tok/sec: 424594.65 +step 868 | loss: 3.949128 | lr 5.9027e-04 | norm: 0.5205 | dt: 1236.09ms | tok/sec: 424151.18 +step 869 | loss: 3.921783 | lr 5.9014e-04 | norm: 0.5111 | dt: 1238.15ms | tok/sec: 423443.64 +step 870 | loss: 3.982633 | lr 5.9002e-04 | norm: 0.5216 | dt: 1235.53ms | tok/sec: 424342.38 +step 871 | loss: 3.980992 | lr 5.8989e-04 | norm: 0.5651 | dt: 1234.51ms | tok/sec: 424692.72 +step 872 | loss: 3.923978 | lr 5.8976e-04 | norm: 0.5231 | dt: 1236.44ms | tok/sec: 424031.85 +step 873 | loss: 3.927274 | lr 5.8963e-04 | norm: 0.4149 | dt: 1236.19ms | tok/sec: 424115.92 +step 874 | loss: 3.939073 | lr 5.8950e-04 | norm: 0.3963 | dt: 1236.19ms | tok/sec: 424117.23 +validation loss: 3.9321 +validation perplexity: 51.0162 +step 875 | loss: 3.935910 | lr 5.8937e-04 | norm: 0.3829 | dt: 2685.81ms | tok/sec: 195206.41 +step 876 | loss: 3.954491 | lr 5.8923e-04 | norm: 0.4948 | dt: 1233.48ms | tok/sec: 425046.53 +step 877 | loss: 3.938570 | lr 5.8910e-04 | norm: 0.4652 | dt: 1234.46ms | tok/sec: 424712.08 +step 878 | loss: 3.964153 | lr 5.8897e-04 | norm: 0.3951 | dt: 1234.31ms | tok/sec: 424761.55 +step 879 | loss: 3.969203 | lr 5.8883e-04 | norm: 0.4049 | dt: 1235.10ms | tok/sec: 424490.64 +step 880 | loss: 3.888915 | lr 5.8869e-04 | norm: 0.4228 | dt: 1234.21ms | tok/sec: 424795.27 +step 881 | loss: 3.904629 | lr 5.8856e-04 | norm: 0.4333 | dt: 1233.08ms | tok/sec: 425187.22 +step 882 | loss: 3.978836 | lr 5.8842e-04 | norm: 0.4648 | dt: 1236.74ms | tok/sec: 423927.96 +step 883 | loss: 3.966287 | lr 5.8828e-04 | norm: 0.3980 | dt: 1235.43ms | tok/sec: 424378.57 +step 884 | loss: 3.970418 | lr 5.8814e-04 | norm: 0.3675 | dt: 1234.66ms | tok/sec: 424641.47 +step 885 | loss: 3.955987 | lr 5.8800e-04 | norm: 0.3607 | dt: 1236.16ms | tok/sec: 424124.92 +step 886 | loss: 3.988407 | lr 5.8786e-04 | norm: 0.3519 | dt: 1237.66ms | tok/sec: 423611.83 +step 887 | loss: 3.983734 | lr 5.8772e-04 | norm: 0.4042 | dt: 1234.46ms | tok/sec: 424709.21 +step 888 | loss: 4.132557 | lr 5.8758e-04 | norm: 0.4444 | dt: 1236.69ms | tok/sec: 423944.79 +step 889 | loss: 3.998026 | lr 5.8744e-04 | norm: 0.4551 | dt: 1233.60ms | tok/sec: 425007.83 +step 890 | loss: 3.981635 | lr 5.8729e-04 | norm: 0.3728 | dt: 1236.25ms | tok/sec: 424096.46 +step 891 | loss: 3.947320 | lr 5.8715e-04 | norm: 0.4321 | dt: 1233.29ms | tok/sec: 425113.90 +step 892 | loss: 3.910776 | lr 5.8700e-04 | norm: 0.4635 | dt: 1236.87ms | tok/sec: 423881.30 +step 893 | loss: 3.909419 | lr 5.8686e-04 | norm: 0.4425 | dt: 1239.74ms | tok/sec: 422902.91 +step 894 | loss: 3.904206 | lr 5.8671e-04 | norm: 0.3605 | dt: 1235.15ms | tok/sec: 424474.09 +step 895 | loss: 3.862434 | lr 5.8656e-04 | norm: 0.4395 | dt: 1237.89ms | tok/sec: 423534.16 +step 896 | loss: 3.941557 | lr 5.8642e-04 | norm: 0.3617 | dt: 1238.15ms | tok/sec: 423445.10 +step 897 | loss: 3.901917 | lr 5.8627e-04 | norm: 0.3614 | dt: 1237.22ms | tok/sec: 423763.92 +step 898 | loss: 3.902506 | lr 5.8612e-04 | norm: 0.3854 | dt: 1236.16ms | tok/sec: 424128.03 +step 899 | loss: 3.855386 | lr 5.8597e-04 | norm: 0.3792 | dt: 1234.52ms | tok/sec: 424689.94 +validation loss: 3.9290 +validation perplexity: 50.8552 +step 900 | loss: 3.918122 | lr 5.8581e-04 | norm: 0.3413 | dt: 2680.04ms | tok/sec: 195627.20 +step 901 | loss: 3.908261 | lr 5.8566e-04 | norm: 0.3484 | dt: 1239.09ms | tok/sec: 423123.03 +step 902 | loss: 3.946739 | lr 5.8551e-04 | norm: 0.3387 | dt: 1236.21ms | tok/sec: 424107.66 +step 903 | loss: 3.941875 | lr 5.8535e-04 | norm: 0.3950 | dt: 1235.74ms | tok/sec: 424271.40 +step 904 | loss: 3.999565 | lr 5.8520e-04 | norm: 0.4495 | dt: 1236.38ms | tok/sec: 424049.35 +step 905 | loss: 3.933658 | lr 5.8504e-04 | norm: 0.4824 | dt: 1236.04ms | tok/sec: 424167.46 +step 906 | loss: 3.938309 | lr 5.8489e-04 | norm: 0.4713 | dt: 1237.41ms | tok/sec: 423698.92 +step 907 | loss: 3.981853 | lr 5.8473e-04 | norm: 0.4195 | dt: 1235.90ms | tok/sec: 424216.89 +step 908 | loss: 4.038857 | lr 5.8457e-04 | norm: 0.5182 | dt: 1236.78ms | tok/sec: 423912.67 +step 909 | loss: 3.881715 | lr 5.8441e-04 | norm: 0.5009 | dt: 1239.06ms | tok/sec: 423134.42 +step 910 | loss: 3.917917 | lr 5.8425e-04 | norm: 0.4263 | dt: 1236.14ms | tok/sec: 424132.12 +step 911 | loss: 3.894319 | lr 5.8409e-04 | norm: 0.4364 | dt: 1235.52ms | tok/sec: 424344.43 +step 912 | loss: 3.933734 | lr 5.8393e-04 | norm: 0.3803 | dt: 1236.36ms | tok/sec: 424058.27 +step 913 | loss: 3.911181 | lr 5.8377e-04 | norm: 0.3925 | dt: 1234.87ms | tok/sec: 424569.81 +step 914 | loss: 3.987229 | lr 5.8361e-04 | norm: 0.3917 | dt: 1237.80ms | tok/sec: 423564.84 +step 915 | loss: 3.984171 | lr 5.8345e-04 | norm: 0.4304 | dt: 1233.54ms | tok/sec: 425028.37 +step 916 | loss: 3.918325 | lr 5.8328e-04 | norm: 0.3924 | dt: 1234.83ms | tok/sec: 424582.27 +step 917 | loss: 3.965194 | lr 5.8312e-04 | norm: 0.5693 | dt: 1231.95ms | tok/sec: 425577.43 +step 918 | loss: 3.948618 | lr 5.8295e-04 | norm: 0.3835 | dt: 1235.55ms | tok/sec: 424334.35 +step 919 | loss: 3.960583 | lr 5.8278e-04 | norm: 0.5529 | dt: 1234.94ms | tok/sec: 424545.14 +step 920 | loss: 3.984382 | lr 5.8262e-04 | norm: 0.5484 | dt: 1232.89ms | tok/sec: 425251.11 +step 921 | loss: 3.952421 | lr 5.8245e-04 | norm: 0.3760 | dt: 1237.12ms | tok/sec: 423795.85 +step 922 | loss: 3.969114 | lr 5.8228e-04 | norm: 0.3894 | dt: 1234.45ms | tok/sec: 424715.36 +step 923 | loss: 4.013616 | lr 5.8211e-04 | norm: 0.3582 | dt: 1235.25ms | tok/sec: 424439.11 +step 924 | loss: 3.971840 | lr 5.8194e-04 | norm: 0.3723 | dt: 1236.09ms | tok/sec: 424151.18 +validation loss: 3.9316 +validation perplexity: 50.9906 +step 925 | loss: 3.903394 | lr 5.8177e-04 | norm: 0.4272 | dt: 2684.03ms | tok/sec: 195335.91 +step 926 | loss: 3.948708 | lr 5.8160e-04 | norm: 0.4163 | dt: 1233.85ms | tok/sec: 424919.55 +step 927 | loss: 3.892556 | lr 5.8142e-04 | norm: 0.3801 | dt: 1235.04ms | tok/sec: 424511.95 +step 928 | loss: 3.878578 | lr 5.8125e-04 | norm: 0.3533 | dt: 1234.30ms | tok/sec: 424765.74 +step 929 | loss: 3.896698 | lr 5.8107e-04 | norm: 0.5396 | dt: 1238.35ms | tok/sec: 423376.30 +step 930 | loss: 3.958746 | lr 5.8090e-04 | norm: 0.7025 | dt: 1237.78ms | tok/sec: 423571.85 +step 931 | loss: 3.922946 | lr 5.8072e-04 | norm: 0.7092 | dt: 1238.87ms | tok/sec: 423199.16 +step 932 | loss: 3.917778 | lr 5.8055e-04 | norm: 0.5337 | dt: 1235.67ms | tok/sec: 424294.32 +step 933 | loss: 3.931687 | lr 5.8037e-04 | norm: 0.4121 | dt: 1232.28ms | tok/sec: 425461.58 +step 934 | loss: 3.952096 | lr 5.8019e-04 | norm: 0.4212 | dt: 1235.23ms | tok/sec: 424446.40 +step 935 | loss: 3.923293 | lr 5.8001e-04 | norm: 0.4192 | dt: 1232.41ms | tok/sec: 425418.12 +step 936 | loss: 3.945485 | lr 5.7983e-04 | norm: 0.4678 | dt: 1231.40ms | tok/sec: 425765.79 +step 937 | loss: 3.970985 | lr 5.7965e-04 | norm: 0.4443 | dt: 1237.02ms | tok/sec: 423832.28 +step 938 | loss: 3.932669 | lr 5.7947e-04 | norm: 0.4751 | dt: 1238.50ms | tok/sec: 423323.32 +step 939 | loss: 3.925340 | lr 5.7929e-04 | norm: 0.4866 | dt: 1232.96ms | tok/sec: 425228.01 +step 940 | loss: 3.921493 | lr 5.7911e-04 | norm: 0.4340 | dt: 1237.55ms | tok/sec: 423649.05 +step 941 | loss: 3.975593 | lr 5.7892e-04 | norm: 0.4050 | dt: 1237.92ms | tok/sec: 423523.80 +step 942 | loss: 3.954704 | lr 5.7874e-04 | norm: 0.3889 | dt: 1233.75ms | tok/sec: 424953.71 +step 943 | loss: 3.892841 | lr 5.7855e-04 | norm: 0.3604 | dt: 1232.64ms | tok/sec: 425336.82 +step 944 | loss: 3.928921 | lr 5.7837e-04 | norm: 0.4222 | dt: 1236.45ms | tok/sec: 424025.15 +step 945 | loss: 3.934536 | lr 5.7818e-04 | norm: 0.3804 | dt: 1234.95ms | tok/sec: 424541.04 +step 946 | loss: 3.951128 | lr 5.7799e-04 | norm: 0.4507 | dt: 1235.53ms | tok/sec: 424343.28 +step 947 | loss: 3.944148 | lr 5.7780e-04 | norm: 0.4257 | dt: 1232.84ms | tok/sec: 425268.55 +step 948 | loss: 3.949564 | lr 5.7761e-04 | norm: 0.3528 | dt: 1236.41ms | tok/sec: 424040.11 +step 949 | loss: 4.031469 | lr 5.7742e-04 | norm: 0.3843 | dt: 1232.38ms | tok/sec: 425425.69 +validation loss: 3.9303 +validation perplexity: 50.9244 +step 950 | loss: 4.001417 | lr 5.7723e-04 | norm: 0.3910 | dt: 2686.12ms | tok/sec: 195183.78 +step 951 | loss: 3.929778 | lr 5.7704e-04 | norm: 0.3651 | dt: 1234.65ms | tok/sec: 424643.68 +step 952 | loss: 3.988221 | lr 5.7685e-04 | norm: 0.3481 | dt: 1236.41ms | tok/sec: 424040.03 +step 953 | loss: 3.935193 | lr 5.7666e-04 | norm: 0.3618 | dt: 1230.98ms | tok/sec: 425912.16 +step 954 | loss: 4.045477 | lr 5.7646e-04 | norm: 0.3517 | dt: 1235.05ms | tok/sec: 424507.69 +step 955 | loss: 3.987377 | lr 5.7627e-04 | norm: 0.3727 | dt: 1237.34ms | tok/sec: 423720.97 +step 956 | loss: 3.948662 | lr 5.7607e-04 | norm: 0.3369 | dt: 1235.29ms | tok/sec: 424425.51 +step 957 | loss: 3.920448 | lr 5.7588e-04 | norm: 0.3642 | dt: 1236.78ms | tok/sec: 423913.90 +step 958 | loss: 3.981793 | lr 5.7568e-04 | norm: 0.3890 | dt: 1238.78ms | tok/sec: 423229.46 +step 959 | loss: 3.939209 | lr 5.7548e-04 | norm: 0.3354 | dt: 1232.56ms | tok/sec: 425365.45 +step 960 | loss: 3.928361 | lr 5.7529e-04 | norm: 0.4019 | dt: 1235.79ms | tok/sec: 424253.72 +step 961 | loss: 3.893247 | lr 5.7509e-04 | norm: 0.4146 | dt: 1237.44ms | tok/sec: 423689.21 +step 962 | loss: 3.950392 | lr 5.7489e-04 | norm: 0.4509 | dt: 1237.97ms | tok/sec: 423504.72 +step 963 | loss: 3.963662 | lr 5.7469e-04 | norm: 0.3871 | dt: 1235.55ms | tok/sec: 424334.76 +step 964 | loss: 3.885383 | lr 5.7449e-04 | norm: 0.3491 | dt: 1233.00ms | tok/sec: 425213.12 +step 965 | loss: 3.958201 | lr 5.7428e-04 | norm: 0.3584 | dt: 1236.72ms | tok/sec: 423932.86 +step 966 | loss: 3.928366 | lr 5.7408e-04 | norm: 0.3953 | dt: 1233.83ms | tok/sec: 424927.60 +step 967 | loss: 3.921200 | lr 5.7388e-04 | norm: 0.4757 | dt: 1237.90ms | tok/sec: 423531.31 +step 968 | loss: 3.873161 | lr 5.7367e-04 | norm: 0.5140 | dt: 1237.62ms | tok/sec: 423626.20 +step 969 | loss: 3.849571 | lr 5.7347e-04 | norm: 0.4876 | dt: 1236.57ms | tok/sec: 423986.64 +step 970 | loss: 3.892869 | lr 5.7326e-04 | norm: 0.3604 | dt: 1236.70ms | tok/sec: 423940.87 +step 971 | loss: 3.883222 | lr 5.7306e-04 | norm: 0.4992 | dt: 1234.40ms | tok/sec: 424731.28 +step 972 | loss: 3.948573 | lr 5.7285e-04 | norm: 0.4113 | dt: 1235.65ms | tok/sec: 424302.10 +step 973 | loss: 3.965697 | lr 5.7264e-04 | norm: 0.3626 | dt: 1240.83ms | tok/sec: 422530.51 +step 974 | loss: 3.927560 | lr 5.7243e-04 | norm: 0.4007 | dt: 1236.39ms | tok/sec: 424045.92 +validation loss: 3.9230 +validation perplexity: 50.5537 +step 975 | loss: 3.937810 | lr 5.7222e-04 | norm: 0.3856 | dt: 2677.20ms | tok/sec: 195834.61 +step 976 | loss: 3.953691 | lr 5.7201e-04 | norm: 0.4894 | dt: 1235.54ms | tok/sec: 424338.69 +step 977 | loss: 3.910310 | lr 5.7180e-04 | norm: 0.4087 | dt: 1842.83ms | tok/sec: 284501.64 +step 978 | loss: 3.885774 | lr 5.7159e-04 | norm: 0.4183 | dt: 1235.45ms | tok/sec: 424370.63 +step 979 | loss: 3.973917 | lr 5.7138e-04 | norm: 0.3588 | dt: 1233.66ms | tok/sec: 424986.56 +step 980 | loss: 3.966391 | lr 5.7116e-04 | norm: 0.3644 | dt: 1235.01ms | tok/sec: 424521.62 +step 981 | loss: 3.967647 | lr 5.7095e-04 | norm: 0.3593 | dt: 1234.62ms | tok/sec: 424655.57 +step 982 | loss: 3.903361 | lr 5.7073e-04 | norm: 0.3248 | dt: 1236.34ms | tok/sec: 424065.87 +step 983 | loss: 3.942240 | lr 5.7052e-04 | norm: 0.3395 | dt: 1237.71ms | tok/sec: 423595.03 +step 984 | loss: 3.945445 | lr 5.7030e-04 | norm: 0.3220 | dt: 1236.49ms | tok/sec: 424012.72 +step 985 | loss: 3.993714 | lr 5.7008e-04 | norm: 0.3541 | dt: 1234.87ms | tok/sec: 424567.68 +step 986 | loss: 3.939103 | lr 5.6987e-04 | norm: 0.3529 | dt: 1234.18ms | tok/sec: 424808.40 +step 987 | loss: 3.974712 | lr 5.6965e-04 | norm: 0.3693 | dt: 1235.14ms | tok/sec: 424476.30 +step 988 | loss: 3.952278 | lr 5.6943e-04 | norm: 0.4039 | dt: 1235.29ms | tok/sec: 424426.16 +step 989 | loss: 3.933239 | lr 5.6921e-04 | norm: 0.4141 | dt: 1236.35ms | tok/sec: 424062.84 +step 990 | loss: 3.961873 | lr 5.6899e-04 | norm: 0.4012 | dt: 1236.56ms | tok/sec: 423990.40 +step 991 | loss: 4.007088 | lr 5.6877e-04 | norm: 0.3953 | dt: 1233.86ms | tok/sec: 424917.99 +step 992 | loss: 4.002340 | lr 5.6854e-04 | norm: 0.3814 | dt: 1234.94ms | tok/sec: 424543.66 +step 993 | loss: 3.950693 | lr 5.6832e-04 | norm: 0.4116 | dt: 1235.02ms | tok/sec: 424518.26 +step 994 | loss: 3.947638 | lr 5.6810e-04 | norm: 0.4028 | dt: 1234.67ms | tok/sec: 424636.88 +step 995 | loss: 3.916758 | lr 5.6787e-04 | norm: 0.4133 | dt: 1234.09ms | tok/sec: 424838.36 +step 996 | loss: 3.897637 | lr 5.6765e-04 | norm: 0.4602 | dt: 1236.83ms | tok/sec: 423896.82 +step 997 | loss: 3.907023 | lr 5.6742e-04 | norm: 0.4161 | dt: 1235.56ms | tok/sec: 424333.94 +step 998 | loss: 3.911179 | lr 5.6720e-04 | norm: 0.4065 | dt: 1234.26ms | tok/sec: 424778.78 +step 999 | loss: 3.915909 | lr 5.6697e-04 | norm: 0.3460 | dt: 1234.07ms | tok/sec: 424844.60 +validation loss: 3.9223 +validation perplexity: 50.5146 +step 1000 | loss: 3.965521 | lr 5.6674e-04 | norm: 0.3607 | dt: 2672.98ms | tok/sec: 196143.40 +step 1001 | loss: 3.944131 | lr 5.6651e-04 | norm: 0.3897 | dt: 1236.01ms | tok/sec: 424178.10 +step 1002 | loss: 3.909610 | lr 5.6628e-04 | norm: 0.3913 | dt: 1237.01ms | tok/sec: 423835.95 +step 1003 | loss: 3.860866 | lr 5.6605e-04 | norm: 0.3570 | dt: 1233.52ms | tok/sec: 425034.86 +step 1004 | loss: 3.886588 | lr 5.6582e-04 | norm: 0.3682 | dt: 1234.91ms | tok/sec: 424556.29 +step 1005 | loss: 3.864774 | lr 5.6559e-04 | norm: 0.4981 | dt: 1234.31ms | tok/sec: 424762.29 +step 1006 | loss: 3.831444 | lr 5.6536e-04 | norm: 0.4486 | dt: 1237.34ms | tok/sec: 423723.33 +step 1007 | loss: 3.946797 | lr 5.6512e-04 | norm: 0.3951 | dt: 1235.28ms | tok/sec: 424429.77 +step 1008 | loss: 3.956946 | lr 5.6489e-04 | norm: 0.3800 | dt: 1234.68ms | tok/sec: 424634.50 +step 1009 | loss: 3.913130 | lr 5.6465e-04 | norm: 0.3752 | dt: 1233.59ms | tok/sec: 425009.39 +step 1010 | loss: 3.857386 | lr 5.6442e-04 | norm: 0.4025 | dt: 1234.72ms | tok/sec: 424622.04 +step 1011 | loss: 3.986201 | lr 5.6418e-04 | norm: 0.3824 | dt: 1234.75ms | tok/sec: 424609.41 +step 1012 | loss: 3.902658 | lr 5.6395e-04 | norm: 0.4619 | dt: 1237.10ms | tok/sec: 423802.79 +step 1013 | loss: 3.924166 | lr 5.6371e-04 | norm: 0.4778 | dt: 1235.65ms | tok/sec: 424300.05 +step 1014 | loss: 3.947214 | lr 5.6347e-04 | norm: 0.4257 | dt: 1232.97ms | tok/sec: 425224.88 +step 1015 | loss: 4.004626 | lr 5.6323e-04 | norm: 0.4347 | dt: 1235.07ms | tok/sec: 424499.41 +step 1016 | loss: 3.939928 | lr 5.6299e-04 | norm: 0.5614 | dt: 1236.62ms | tok/sec: 423969.48 +step 1017 | loss: 3.959078 | lr 5.6275e-04 | norm: 0.4155 | dt: 1236.46ms | tok/sec: 424023.35 +step 1018 | loss: 3.936986 | lr 5.6251e-04 | norm: 0.4436 | dt: 1234.05ms | tok/sec: 424850.43 +step 1019 | loss: 3.920416 | lr 5.6227e-04 | norm: 0.4289 | dt: 1235.25ms | tok/sec: 424437.63 +step 1020 | loss: 3.928544 | lr 5.6202e-04 | norm: 0.3967 | dt: 1235.39ms | tok/sec: 424391.43 +step 1021 | loss: 3.980324 | lr 5.6178e-04 | norm: 0.4086 | dt: 1238.16ms | tok/sec: 423439.80 +step 1022 | loss: 4.016722 | lr 5.6154e-04 | norm: 0.3354 | dt: 1232.23ms | tok/sec: 425480.59 +step 1023 | loss: 3.968665 | lr 5.6129e-04 | norm: 0.3588 | dt: 1234.09ms | tok/sec: 424836.14 +step 1024 | loss: 3.963441 | lr 5.6105e-04 | norm: 0.3853 | dt: 1233.31ms | tok/sec: 425106.67 +validation loss: 3.9223 +validation perplexity: 50.5158 +step 1025 | loss: 3.961635 | lr 5.6080e-04 | norm: 0.3410 | dt: 2680.83ms | tok/sec: 195569.56 +step 1026 | loss: 3.939561 | lr 5.6055e-04 | norm: 0.3502 | dt: 1233.31ms | tok/sec: 425104.95 +step 1027 | loss: 3.901815 | lr 5.6031e-04 | norm: 0.3373 | dt: 1232.55ms | tok/sec: 425369.40 +step 1028 | loss: 3.923902 | lr 5.6006e-04 | norm: 0.4021 | dt: 1232.13ms | tok/sec: 425513.03 +step 1029 | loss: 3.959723 | lr 5.5981e-04 | norm: 0.3602 | dt: 1232.54ms | tok/sec: 425373.19 +step 1030 | loss: 3.908282 | lr 5.5956e-04 | norm: 0.3507 | dt: 1233.57ms | tok/sec: 425016.05 +step 1031 | loss: 3.927383 | lr 5.5931e-04 | norm: 0.3884 | dt: 1234.11ms | tok/sec: 424829.25 +step 1032 | loss: 3.920769 | lr 5.5906e-04 | norm: 0.3742 | dt: 1234.37ms | tok/sec: 424741.04 +step 1033 | loss: 3.895254 | lr 5.5880e-04 | norm: 0.4103 | dt: 1232.59ms | tok/sec: 425353.93 +step 1034 | loss: 3.914616 | lr 5.5855e-04 | norm: 0.3609 | dt: 1234.70ms | tok/sec: 424626.30 +step 1035 | loss: 3.892040 | lr 5.5830e-04 | norm: 0.3812 | dt: 1234.34ms | tok/sec: 424752.53 +step 1036 | loss: 3.860164 | lr 5.5804e-04 | norm: 0.3630 | dt: 1236.41ms | tok/sec: 424040.60 +step 1037 | loss: 3.859274 | lr 5.5779e-04 | norm: 0.3964 | dt: 1233.60ms | tok/sec: 425006.93 +step 1038 | loss: 3.884290 | lr 5.5753e-04 | norm: 0.3949 | dt: 1232.52ms | tok/sec: 425379.69 +step 1039 | loss: 3.831087 | lr 5.5728e-04 | norm: 0.4192 | dt: 1232.33ms | tok/sec: 425445.85 +step 1040 | loss: 3.922941 | lr 5.5702e-04 | norm: 0.3414 | dt: 1232.89ms | tok/sec: 425250.13 +step 1041 | loss: 3.915904 | lr 5.5676e-04 | norm: 0.3488 | dt: 1236.42ms | tok/sec: 424037.74 +step 1042 | loss: 3.928778 | lr 5.5651e-04 | norm: 0.2867 | dt: 1231.99ms | tok/sec: 425561.94 +step 1043 | loss: 3.914561 | lr 5.5625e-04 | norm: 0.3540 | dt: 1236.27ms | tok/sec: 424086.89 +step 1044 | loss: 3.948164 | lr 5.5599e-04 | norm: 0.3651 | dt: 1232.65ms | tok/sec: 425335.01 +step 1045 | loss: 3.894003 | lr 5.5573e-04 | norm: 0.3647 | dt: 1237.42ms | tok/sec: 423695.90 +step 1046 | loss: 3.910673 | lr 5.5547e-04 | norm: 0.4465 | dt: 1237.25ms | tok/sec: 423751.26 +step 1047 | loss: 3.878724 | lr 5.5520e-04 | norm: 0.5402 | dt: 1231.61ms | tok/sec: 425691.86 +step 1048 | loss: 3.902539 | lr 5.5494e-04 | norm: 0.4742 | dt: 1234.88ms | tok/sec: 424565.39 +step 1049 | loss: 3.890513 | lr 5.5468e-04 | norm: 0.3675 | dt: 1235.43ms | tok/sec: 424375.54 +validation loss: 3.9166 +validation perplexity: 50.2294 +step 1050 | loss: 3.903963 | lr 5.5441e-04 | norm: 0.3718 | dt: 2681.79ms | tok/sec: 195499.25 +step 1051 | loss: 3.948347 | lr 5.5415e-04 | norm: 0.3477 | dt: 1233.41ms | tok/sec: 425070.43 +step 1052 | loss: 3.952345 | lr 5.5388e-04 | norm: 0.4130 | dt: 1235.12ms | tok/sec: 424482.78 +step 1053 | loss: 3.941226 | lr 5.5362e-04 | norm: 0.3767 | dt: 1233.30ms | tok/sec: 425109.14 +step 1054 | loss: 3.911246 | lr 5.5335e-04 | norm: 0.3751 | dt: 1233.56ms | tok/sec: 425019.58 +step 1055 | loss: 4.029135 | lr 5.5308e-04 | norm: 0.3777 | dt: 1238.35ms | tok/sec: 423376.21 +step 1056 | loss: 3.953032 | lr 5.5282e-04 | norm: 0.3912 | dt: 1232.88ms | tok/sec: 425254.15 +step 1057 | loss: 3.951145 | lr 5.5255e-04 | norm: 0.4370 | dt: 1233.19ms | tok/sec: 425149.49 +step 1058 | loss: 3.962261 | lr 5.5228e-04 | norm: 0.3999 | dt: 1232.42ms | tok/sec: 425414.08 +step 1059 | loss: 3.956590 | lr 5.5201e-04 | norm: 0.3678 | dt: 1235.70ms | tok/sec: 424285.31 +step 1060 | loss: 3.990839 | lr 5.5174e-04 | norm: 0.3418 | dt: 1234.83ms | tok/sec: 424583.09 +step 1061 | loss: 3.911048 | lr 5.5147e-04 | norm: 0.4102 | dt: 1231.85ms | tok/sec: 425611.53 +step 1062 | loss: 3.942118 | lr 5.5119e-04 | norm: 0.4456 | dt: 1236.67ms | tok/sec: 423951.00 +step 1063 | loss: 3.954549 | lr 5.5092e-04 | norm: 0.4113 | dt: 1232.85ms | tok/sec: 425265.83 +step 1064 | loss: 3.905490 | lr 5.5065e-04 | norm: 0.3557 | dt: 1234.24ms | tok/sec: 424784.69 +step 1065 | loss: 3.907057 | lr 5.5037e-04 | norm: 0.3823 | dt: 1236.46ms | tok/sec: 424022.61 +step 1066 | loss: 3.836963 | lr 5.5010e-04 | norm: 0.3801 | dt: 1234.56ms | tok/sec: 424677.63 +step 1067 | loss: 3.887221 | lr 5.4982e-04 | norm: 0.3124 | dt: 1230.70ms | tok/sec: 426007.38 +step 1068 | loss: 3.901114 | lr 5.4955e-04 | norm: 0.4389 | dt: 1234.21ms | tok/sec: 424798.15 +step 1069 | loss: 3.872139 | lr 5.4927e-04 | norm: 0.4105 | dt: 1236.56ms | tok/sec: 423989.01 +step 1070 | loss: 3.910966 | lr 5.4899e-04 | norm: 0.3611 | dt: 1234.56ms | tok/sec: 424677.14 +step 1071 | loss: 3.922523 | lr 5.4871e-04 | norm: 0.4109 | dt: 1233.53ms | tok/sec: 425029.27 +step 1072 | loss: 3.920423 | lr 5.4843e-04 | norm: 0.3728 | dt: 1232.48ms | tok/sec: 425393.43 +step 1073 | loss: 3.852818 | lr 5.4815e-04 | norm: 0.3360 | dt: 1233.27ms | tok/sec: 425119.90 +step 1074 | loss: 3.860718 | lr 5.4787e-04 | norm: 0.3442 | dt: 1236.33ms | tok/sec: 424067.10 +validation loss: 3.9185 +validation perplexity: 50.3231 +step 1075 | loss: 3.894468 | lr 5.4759e-04 | norm: 0.3645 | dt: 2681.02ms | tok/sec: 195555.23 +step 1076 | loss: 3.932950 | lr 5.4731e-04 | norm: 0.3650 | dt: 1235.66ms | tok/sec: 424297.43 +step 1077 | loss: 3.910384 | lr 5.4703e-04 | norm: 0.3836 | dt: 1233.44ms | tok/sec: 425062.22 +step 1078 | loss: 3.939596 | lr 5.4675e-04 | norm: 0.4147 | dt: 1233.68ms | tok/sec: 424978.59 +step 1079 | loss: 3.952103 | lr 5.4646e-04 | norm: 0.4202 | dt: 1234.73ms | tok/sec: 424617.28 +step 1080 | loss: 3.972744 | lr 5.4618e-04 | norm: 0.3840 | dt: 1237.55ms | tok/sec: 423651.25 +step 1081 | loss: 3.935417 | lr 5.4589e-04 | norm: 0.3820 | dt: 1236.79ms | tok/sec: 423911.77 +step 1082 | loss: 4.014768 | lr 5.4561e-04 | norm: 0.4193 | dt: 1235.13ms | tok/sec: 424478.84 +step 1083 | loss: 3.932413 | lr 5.4532e-04 | norm: 0.5383 | dt: 1236.05ms | tok/sec: 424162.88 +step 1084 | loss: 3.916990 | lr 5.4503e-04 | norm: 0.4731 | dt: 1237.90ms | tok/sec: 423529.35 +step 1085 | loss: 4.083557 | lr 5.4475e-04 | norm: 0.5306 | dt: 1234.19ms | tok/sec: 424804.63 +step 1086 | loss: 3.954165 | lr 5.4446e-04 | norm: 0.4542 | dt: 1238.07ms | tok/sec: 423471.93 +step 1087 | loss: 3.959549 | lr 5.4417e-04 | norm: 0.4383 | dt: 1234.96ms | tok/sec: 424538.83 +step 1088 | loss: 3.861588 | lr 5.4388e-04 | norm: 0.4071 | dt: 1235.47ms | tok/sec: 424364.08 +step 1089 | loss: 4.028632 | lr 5.4359e-04 | norm: 0.4451 | dt: 1234.25ms | tok/sec: 424781.08 +step 1090 | loss: 3.964155 | lr 5.4330e-04 | norm: 0.3865 | dt: 1234.22ms | tok/sec: 424794.54 +step 1091 | loss: 3.949335 | lr 5.4301e-04 | norm: 0.3709 | dt: 1234.78ms | tok/sec: 424598.75 +step 1092 | loss: 3.961001 | lr 5.4271e-04 | norm: 0.3891 | dt: 1239.34ms | tok/sec: 423039.19 +step 1093 | loss: 3.918009 | lr 5.4242e-04 | norm: 0.3776 | dt: 1236.45ms | tok/sec: 424025.48 +step 1094 | loss: 3.961380 | lr 5.4213e-04 | norm: 0.3370 | dt: 1236.64ms | tok/sec: 423961.06 +step 1095 | loss: 3.912809 | lr 5.4183e-04 | norm: 0.3541 | dt: 1236.50ms | tok/sec: 424008.80 +step 1096 | loss: 3.939011 | lr 5.4154e-04 | norm: 0.3533 | dt: 1235.01ms | tok/sec: 424520.55 +step 1097 | loss: 3.949151 | lr 5.4124e-04 | norm: 0.3190 | dt: 1234.68ms | tok/sec: 424634.01 +step 1098 | loss: 3.891271 | lr 5.4095e-04 | norm: 0.3225 | dt: 1234.31ms | tok/sec: 424761.55 +step 1099 | loss: 3.869338 | lr 5.4065e-04 | norm: 0.3545 | dt: 1237.53ms | tok/sec: 423655.66 +validation loss: 3.9148 +validation perplexity: 50.1371 +step 1100 | loss: 3.916449 | lr 5.4035e-04 | norm: 0.3761 | dt: 2676.45ms | tok/sec: 195889.47 +step 1101 | loss: 3.918781 | lr 5.4005e-04 | norm: 0.3568 | dt: 1233.37ms | tok/sec: 425085.80 +step 1102 | loss: 3.880409 | lr 5.3975e-04 | norm: 0.4216 | dt: 1235.14ms | tok/sec: 424475.65 +step 1103 | loss: 3.924272 | lr 5.3946e-04 | norm: 0.3479 | dt: 1235.31ms | tok/sec: 424419.04 +step 1104 | loss: 3.884873 | lr 5.3915e-04 | norm: 0.3705 | dt: 1234.33ms | tok/sec: 424755.07 +step 1105 | loss: 3.933090 | lr 5.3885e-04 | norm: 0.3950 | dt: 1236.54ms | tok/sec: 423995.96 +step 1106 | loss: 3.894217 | lr 5.3855e-04 | norm: 0.4454 | dt: 1238.35ms | tok/sec: 423375.32 +step 1107 | loss: 3.881969 | lr 5.3825e-04 | norm: 0.4065 | dt: 1235.70ms | tok/sec: 424284.00 +step 1108 | loss: 3.911630 | lr 5.3795e-04 | norm: 0.3469 | dt: 1235.36ms | tok/sec: 424402.49 +step 1109 | loss: 3.867978 | lr 5.3764e-04 | norm: 0.3480 | dt: 1234.93ms | tok/sec: 424547.93 +step 1110 | loss: 3.880740 | lr 5.3734e-04 | norm: 0.3254 | dt: 1236.16ms | tok/sec: 424127.54 +step 1111 | loss: 3.993434 | lr 5.3704e-04 | norm: 0.3345 | dt: 1232.61ms | tok/sec: 425346.94 +step 1112 | loss: 3.934479 | lr 5.3673e-04 | norm: 0.3653 | dt: 1234.78ms | tok/sec: 424599.57 +step 1113 | loss: 3.951018 | lr 5.3642e-04 | norm: 0.3287 | dt: 1237.64ms | tok/sec: 423618.77 +step 1114 | loss: 3.920901 | lr 5.3612e-04 | norm: 0.4128 | dt: 1233.67ms | tok/sec: 424981.22 +step 1115 | loss: 3.983172 | lr 5.3581e-04 | norm: 0.4879 | dt: 1234.29ms | tok/sec: 424769.67 +step 1116 | loss: 3.978838 | lr 5.3550e-04 | norm: 0.4499 | dt: 1235.24ms | tok/sec: 424442.30 +step 1117 | loss: 3.886415 | lr 5.3519e-04 | norm: 0.4494 | dt: 1233.59ms | tok/sec: 425009.15 +step 1118 | loss: 3.923116 | lr 5.3488e-04 | norm: 0.4316 | dt: 1235.50ms | tok/sec: 424354.33 +step 1119 | loss: 3.910658 | lr 5.3457e-04 | norm: 0.3930 | dt: 1235.48ms | tok/sec: 424360.89 +step 1120 | loss: 3.970764 | lr 5.3426e-04 | norm: 0.3767 | dt: 1235.78ms | tok/sec: 424258.22 +step 1121 | loss: 3.914766 | lr 5.3395e-04 | norm: 0.3621 | dt: 1235.58ms | tok/sec: 424324.94 +step 1122 | loss: 3.985860 | lr 5.3364e-04 | norm: 0.4888 | dt: 1234.71ms | tok/sec: 424623.27 +step 1123 | loss: 3.915807 | lr 5.3333e-04 | norm: 0.3881 | dt: 1234.85ms | tok/sec: 424574.81 +step 1124 | loss: 3.937629 | lr 5.3302e-04 | norm: 0.3666 | dt: 1236.69ms | tok/sec: 423943.65 +validation loss: 3.9172 +validation perplexity: 50.2588 +step 1125 | loss: 3.953011 | lr 5.3270e-04 | norm: 0.3940 | dt: 2679.98ms | tok/sec: 195631.38 +step 1126 | loss: 3.975278 | lr 5.3239e-04 | norm: 0.3882 | dt: 1232.45ms | tok/sec: 425402.40 +step 1127 | loss: 3.945652 | lr 5.3207e-04 | norm: 0.3675 | dt: 1237.00ms | tok/sec: 423837.91 +step 1128 | loss: 3.937716 | lr 5.3176e-04 | norm: 0.3908 | dt: 1233.65ms | tok/sec: 424989.76 +step 1129 | loss: 3.954950 | lr 5.3144e-04 | norm: 0.3277 | dt: 1236.43ms | tok/sec: 424034.31 +step 1130 | loss: 3.937807 | lr 5.3113e-04 | norm: 0.3257 | dt: 1235.81ms | tok/sec: 424245.12 +step 1131 | loss: 3.937109 | lr 5.3081e-04 | norm: 0.3540 | dt: 1236.47ms | tok/sec: 424019.02 +step 1132 | loss: 3.959936 | lr 5.3049e-04 | norm: 0.4029 | dt: 1235.04ms | tok/sec: 424510.88 +step 1133 | loss: 3.917560 | lr 5.3017e-04 | norm: 0.4302 | dt: 1233.16ms | tok/sec: 425158.21 +step 1134 | loss: 3.876654 | lr 5.2985e-04 | norm: 0.3793 | dt: 1232.28ms | tok/sec: 425462.73 +step 1135 | loss: 3.877423 | lr 5.2953e-04 | norm: 0.4021 | dt: 1234.69ms | tok/sec: 424631.14 +step 1136 | loss: 3.916847 | lr 5.2921e-04 | norm: 0.3926 | dt: 1233.98ms | tok/sec: 424875.22 +step 1137 | loss: 3.863974 | lr 5.2889e-04 | norm: 0.3673 | dt: 1235.36ms | tok/sec: 424401.84 +step 1138 | loss: 3.849113 | lr 5.2857e-04 | norm: 0.3813 | dt: 1236.22ms | tok/sec: 424105.62 +step 1139 | loss: 3.867191 | lr 5.2825e-04 | norm: 0.3693 | dt: 1234.50ms | tok/sec: 424695.02 +step 1140 | loss: 3.878786 | lr 5.2792e-04 | norm: 0.3898 | dt: 1235.96ms | tok/sec: 424195.61 +step 1141 | loss: 3.928541 | lr 5.2760e-04 | norm: 0.3963 | dt: 1236.62ms | tok/sec: 423967.11 +step 1142 | loss: 3.885930 | lr 5.2728e-04 | norm: 0.3221 | dt: 1233.88ms | tok/sec: 424910.93 +step 1143 | loss: 3.898152 | lr 5.2695e-04 | norm: 0.3954 | dt: 1236.47ms | tok/sec: 424018.28 +step 1144 | loss: 3.906897 | lr 5.2663e-04 | norm: 0.4110 | dt: 1234.82ms | tok/sec: 424585.96 +step 1145 | loss: 3.916712 | lr 5.2630e-04 | norm: 0.3804 | dt: 1230.95ms | tok/sec: 425921.98 +step 1146 | loss: 3.920725 | lr 5.2597e-04 | norm: 0.3516 | dt: 1238.80ms | tok/sec: 423223.19 +step 1147 | loss: 3.999310 | lr 5.2565e-04 | norm: 0.3601 | dt: 1235.37ms | tok/sec: 424396.68 +step 1148 | loss: 3.853028 | lr 5.2532e-04 | norm: 0.3643 | dt: 1232.40ms | tok/sec: 425420.34 +step 1149 | loss: 4.000334 | lr 5.2499e-04 | norm: 0.3772 | dt: 1234.12ms | tok/sec: 424825.72 +validation loss: 3.9098 +validation perplexity: 49.8878 +step 1150 | loss: 3.930282 | lr 5.2466e-04 | norm: 0.4264 | dt: 2694.81ms | tok/sec: 194555.03 +step 1151 | loss: 3.946131 | lr 5.2433e-04 | norm: 0.4292 | dt: 1232.52ms | tok/sec: 425378.70 +step 1152 | loss: 3.894723 | lr 5.2400e-04 | norm: 0.3679 | dt: 1236.88ms | tok/sec: 423878.93 +step 1153 | loss: 3.835078 | lr 5.2367e-04 | norm: 0.3257 | dt: 1236.22ms | tok/sec: 424106.44 +step 1154 | loss: 3.760438 | lr 5.2334e-04 | norm: 0.6688 | dt: 1234.41ms | tok/sec: 424728.57 +step 1155 | loss: 3.919055 | lr 5.2301e-04 | norm: 0.5070 | dt: 1235.24ms | tok/sec: 424442.30 +step 1156 | loss: 3.963028 | lr 5.2267e-04 | norm: 0.3961 | dt: 1233.04ms | tok/sec: 425200.30 +step 1157 | loss: 3.925420 | lr 5.2234e-04 | norm: 0.4175 | dt: 1235.49ms | tok/sec: 424355.23 +step 1158 | loss: 3.955029 | lr 5.2201e-04 | norm: 0.4074 | dt: 1234.57ms | tok/sec: 424673.37 +step 1159 | loss: 3.948960 | lr 5.2167e-04 | norm: 0.3848 | dt: 1235.14ms | tok/sec: 424476.71 +step 1160 | loss: 3.989330 | lr 5.2134e-04 | norm: 0.4331 | dt: 1235.36ms | tok/sec: 424400.36 +step 1161 | loss: 4.012727 | lr 5.2100e-04 | norm: 0.4264 | dt: 1240.46ms | tok/sec: 422656.71 +step 1162 | loss: 3.951899 | lr 5.2067e-04 | norm: 0.4014 | dt: 1239.46ms | tok/sec: 422996.22 +step 1163 | loss: 3.906939 | lr 5.2033e-04 | norm: 0.4083 | dt: 1234.54ms | tok/sec: 424683.87 +step 1164 | loss: 3.975944 | lr 5.1999e-04 | norm: 0.3322 | dt: 1235.85ms | tok/sec: 424232.35 +step 1165 | loss: 3.911454 | lr 5.1965e-04 | norm: 0.3997 | dt: 1239.08ms | tok/sec: 423125.88 +step 1166 | loss: 4.036081 | lr 5.1932e-04 | norm: 0.4334 | dt: 1239.66ms | tok/sec: 422927.23 +step 1167 | loss: 4.023912 | lr 5.1898e-04 | norm: 0.3799 | dt: 1856.89ms | tok/sec: 282347.92 +step 1168 | loss: 3.872003 | lr 5.1864e-04 | norm: 0.3524 | dt: 1239.78ms | tok/sec: 422886.32 +step 1169 | loss: 3.877356 | lr 5.1830e-04 | norm: 0.3673 | dt: 1236.68ms | tok/sec: 423946.59 +step 1170 | loss: 3.893447 | lr 5.1796e-04 | norm: 0.3188 | dt: 1239.08ms | tok/sec: 423126.77 +step 1171 | loss: 3.893014 | lr 5.1761e-04 | norm: 0.3470 | dt: 1236.39ms | tok/sec: 424047.88 +step 1172 | loss: 3.866527 | lr 5.1727e-04 | norm: 0.3208 | dt: 1237.71ms | tok/sec: 423594.70 +step 1173 | loss: 3.856145 | lr 5.1693e-04 | norm: 0.3310 | dt: 1233.14ms | tok/sec: 425166.67 +step 1174 | loss: 3.841046 | lr 5.1659e-04 | norm: 0.3407 | dt: 1236.21ms | tok/sec: 424108.81 +validation loss: 3.9135 +validation perplexity: 50.0734 +step 1175 | loss: 3.916482 | lr 5.1624e-04 | norm: 0.3031 | dt: 2689.44ms | tok/sec: 194943.48 +step 1176 | loss: 3.836964 | lr 5.1590e-04 | norm: 0.3447 | dt: 1238.26ms | tok/sec: 423406.38 +step 1177 | loss: 3.899209 | lr 5.1555e-04 | norm: 0.3876 | dt: 1239.00ms | tok/sec: 423155.51 +step 1178 | loss: 3.864567 | lr 5.1521e-04 | norm: 0.3447 | dt: 1235.19ms | tok/sec: 424458.20 +step 1179 | loss: 3.904245 | lr 5.1486e-04 | norm: 0.3275 | dt: 1232.13ms | tok/sec: 425512.70 +step 1180 | loss: 3.962282 | lr 5.1452e-04 | norm: 0.3535 | dt: 1233.38ms | tok/sec: 425081.03 +step 1181 | loss: 3.934659 | lr 5.1417e-04 | norm: 0.3696 | dt: 1234.47ms | tok/sec: 424705.68 +step 1182 | loss: 3.908488 | lr 5.1382e-04 | norm: 0.3521 | dt: 1236.63ms | tok/sec: 423964.00 +step 1183 | loss: 3.899031 | lr 5.1347e-04 | norm: 0.3762 | dt: 1233.54ms | tok/sec: 425028.29 +step 1184 | loss: 3.929236 | lr 5.1312e-04 | norm: 0.3730 | dt: 1234.89ms | tok/sec: 424563.25 +step 1185 | loss: 3.999620 | lr 5.1277e-04 | norm: 0.3948 | dt: 1237.63ms | tok/sec: 423623.59 +step 1186 | loss: 3.939773 | lr 5.1242e-04 | norm: 0.3819 | dt: 1237.33ms | tok/sec: 423725.86 +step 1187 | loss: 3.945382 | lr 5.1207e-04 | norm: 0.3887 | dt: 1237.47ms | tok/sec: 423678.68 +step 1188 | loss: 3.946747 | lr 5.1172e-04 | norm: 0.4198 | dt: 1238.24ms | tok/sec: 423413.96 +step 1189 | loss: 3.970589 | lr 5.1137e-04 | norm: 0.4123 | dt: 1233.86ms | tok/sec: 424915.53 +step 1190 | loss: 3.936333 | lr 5.1102e-04 | norm: 0.3651 | dt: 1236.31ms | tok/sec: 424074.95 +step 1191 | loss: 3.974137 | lr 5.1067e-04 | norm: 0.4299 | dt: 1234.83ms | tok/sec: 424584.24 +step 1192 | loss: 3.927297 | lr 5.1031e-04 | norm: 0.3902 | dt: 1235.83ms | tok/sec: 424240.70 +step 1193 | loss: 3.998593 | lr 5.0996e-04 | norm: 0.4125 | dt: 1238.46ms | tok/sec: 423337.09 +step 1194 | loss: 3.904512 | lr 5.0960e-04 | norm: 0.3913 | dt: 1236.79ms | tok/sec: 423909.81 +step 1195 | loss: 3.955267 | lr 5.0925e-04 | norm: 0.4129 | dt: 1237.92ms | tok/sec: 423523.31 +step 1196 | loss: 3.927253 | lr 5.0889e-04 | norm: 0.3565 | dt: 1233.28ms | tok/sec: 425116.70 +step 1197 | loss: 3.931869 | lr 5.0854e-04 | norm: 0.3789 | dt: 1234.57ms | tok/sec: 424672.47 +step 1198 | loss: 3.938747 | lr 5.0818e-04 | norm: 0.3730 | dt: 1237.76ms | tok/sec: 423576.42 +step 1199 | loss: 4.050231 | lr 5.0782e-04 | norm: 0.3363 | dt: 1238.76ms | tok/sec: 423236.06 +validation loss: 3.9097 +validation perplexity: 49.8836 +step 1200 | loss: 3.948855 | lr 5.0746e-04 | norm: 0.3577 | dt: 2683.18ms | tok/sec: 195397.78 +step 1201 | loss: 3.959306 | lr 5.0711e-04 | norm: 0.4013 | dt: 1234.09ms | tok/sec: 424836.72 +step 1202 | loss: 3.870855 | lr 5.0675e-04 | norm: 0.3462 | dt: 1234.62ms | tok/sec: 424655.90 +step 1203 | loss: 3.905335 | lr 5.0639e-04 | norm: 0.3862 | dt: 1236.61ms | tok/sec: 423971.44 +step 1204 | loss: 3.945591 | lr 5.0603e-04 | norm: 0.3678 | dt: 1235.64ms | tok/sec: 424304.80 +step 1205 | loss: 3.899985 | lr 5.0567e-04 | norm: 0.3682 | dt: 1240.26ms | tok/sec: 422725.20 +step 1206 | loss: 3.901940 | lr 5.0531e-04 | norm: 0.3202 | dt: 1240.73ms | tok/sec: 422563.96 +step 1207 | loss: 3.896625 | lr 5.0494e-04 | norm: 0.4253 | dt: 1240.43ms | tok/sec: 422667.11 +step 1208 | loss: 3.948766 | lr 5.0458e-04 | norm: 0.3953 | dt: 1237.87ms | tok/sec: 423540.77 +step 1209 | loss: 3.882130 | lr 5.0422e-04 | norm: 0.3408 | dt: 1236.06ms | tok/sec: 424160.92 +step 1210 | loss: 3.901161 | lr 5.0386e-04 | norm: 0.3368 | dt: 1236.93ms | tok/sec: 423863.89 +step 1211 | loss: 3.870388 | lr 5.0349e-04 | norm: 0.3604 | dt: 1240.38ms | tok/sec: 422684.17 +step 1212 | loss: 3.874183 | lr 5.0313e-04 | norm: 0.3606 | dt: 1234.07ms | tok/sec: 424843.45 +step 1213 | loss: 3.952955 | lr 5.0276e-04 | norm: 0.3843 | dt: 1239.64ms | tok/sec: 422936.02 +step 1214 | loss: 3.860060 | lr 5.0240e-04 | norm: 0.3869 | dt: 1236.09ms | tok/sec: 424151.75 +step 1215 | loss: 3.823457 | lr 5.0203e-04 | norm: 0.4001 | dt: 1233.27ms | tok/sec: 425120.40 +step 1216 | loss: 3.903499 | lr 5.0166e-04 | norm: 0.4774 | dt: 1234.55ms | tok/sec: 424678.62 +step 1217 | loss: 3.891204 | lr 5.0130e-04 | norm: 0.3478 | dt: 1235.98ms | tok/sec: 424186.85 +step 1218 | loss: 3.921021 | lr 5.0093e-04 | norm: 0.3500 | dt: 1235.49ms | tok/sec: 424357.28 +step 1219 | loss: 3.883350 | lr 5.0056e-04 | norm: 0.3502 | dt: 1234.57ms | tok/sec: 424672.88 +step 1220 | loss: 3.851842 | lr 5.0019e-04 | norm: 0.3382 | dt: 1233.47ms | tok/sec: 425052.44 +step 1221 | loss: 3.858281 | lr 4.9982e-04 | norm: 0.3616 | dt: 1238.32ms | tok/sec: 423387.14 +step 1222 | loss: 3.933222 | lr 4.9945e-04 | norm: 0.3601 | dt: 1234.25ms | tok/sec: 424784.11 +step 1223 | loss: 3.856160 | lr 4.9908e-04 | norm: 0.3849 | dt: 1234.35ms | tok/sec: 424747.69 +step 1224 | loss: 3.832405 | lr 4.9871e-04 | norm: 0.4043 | dt: 1237.12ms | tok/sec: 423795.77 +validation loss: 3.9053 +validation perplexity: 49.6648 +step 1225 | loss: 3.879165 | lr 4.9834e-04 | norm: 0.3635 | dt: 2686.72ms | tok/sec: 195140.62 +step 1226 | loss: 3.914027 | lr 4.9797e-04 | norm: 0.3508 | dt: 1233.53ms | tok/sec: 425030.67 +step 1227 | loss: 3.979441 | lr 4.9760e-04 | norm: 0.3546 | dt: 1232.95ms | tok/sec: 425229.65 +step 1228 | loss: 3.957683 | lr 4.9723e-04 | norm: 0.4309 | dt: 1234.36ms | tok/sec: 424745.39 +step 1229 | loss: 3.928717 | lr 4.9685e-04 | norm: 0.4220 | dt: 1236.13ms | tok/sec: 424137.36 +step 1230 | loss: 3.938504 | lr 4.9648e-04 | norm: 0.3474 | dt: 1236.32ms | tok/sec: 424070.61 +step 1231 | loss: 3.993985 | lr 4.9610e-04 | norm: 0.3749 | dt: 1234.08ms | tok/sec: 424840.66 +step 1232 | loss: 3.951302 | lr 4.9573e-04 | norm: 0.4278 | dt: 1236.74ms | tok/sec: 423928.85 +step 1233 | loss: 3.851964 | lr 4.9535e-04 | norm: 0.3835 | dt: 1236.90ms | tok/sec: 423874.11 +step 1234 | loss: 3.943573 | lr 4.9498e-04 | norm: 0.3604 | dt: 1237.68ms | tok/sec: 423606.20 +step 1235 | loss: 3.902007 | lr 4.9460e-04 | norm: 0.4037 | dt: 1235.30ms | tok/sec: 424421.49 +step 1236 | loss: 3.901617 | lr 4.9422e-04 | norm: 0.3581 | dt: 1237.40ms | tok/sec: 423700.31 +step 1237 | loss: 3.937624 | lr 4.9385e-04 | norm: 0.4079 | dt: 1233.78ms | tok/sec: 424945.83 +step 1238 | loss: 3.866732 | lr 4.9347e-04 | norm: 0.3795 | dt: 1236.58ms | tok/sec: 423983.13 +step 1239 | loss: 3.898949 | lr 4.9309e-04 | norm: 0.3726 | dt: 1237.62ms | tok/sec: 423624.73 +step 1240 | loss: 3.898762 | lr 4.9271e-04 | norm: 0.4122 | dt: 1236.64ms | tok/sec: 423961.30 +step 1241 | loss: 3.863565 | lr 4.9233e-04 | norm: 0.3950 | dt: 1238.40ms | tok/sec: 423358.28 +step 1242 | loss: 3.883421 | lr 4.9195e-04 | norm: 0.3493 | dt: 1237.15ms | tok/sec: 423788.58 +step 1243 | loss: 3.863061 | lr 4.9157e-04 | norm: 0.3987 | dt: 1234.75ms | tok/sec: 424610.72 +step 1244 | loss: 3.855265 | lr 4.9119e-04 | norm: 0.3396 | dt: 1233.98ms | tok/sec: 424876.69 +step 1245 | loss: 3.909855 | lr 4.9081e-04 | norm: 0.3883 | dt: 1237.03ms | tok/sec: 423827.13 +step 1246 | loss: 3.827265 | lr 4.9043e-04 | norm: 0.3841 | dt: 1237.58ms | tok/sec: 423639.42 +step 1247 | loss: 3.913578 | lr 4.9004e-04 | norm: 0.3757 | dt: 1237.44ms | tok/sec: 423686.19 +step 1248 | loss: 3.865974 | lr 4.8966e-04 | norm: 0.3655 | dt: 1237.77ms | tok/sec: 423576.34 +step 1249 | loss: 3.896384 | lr 4.8928e-04 | norm: 0.3627 | dt: 1236.23ms | tok/sec: 424103.49 +validation loss: 3.9031 +validation perplexity: 49.5543 +step 1250 | loss: 3.900003 | lr 4.8889e-04 | norm: 0.3176 | dt: 2682.80ms | tok/sec: 195425.50 +step 1251 | loss: 3.848935 | lr 4.8851e-04 | norm: 0.3270 | dt: 1233.17ms | tok/sec: 425153.19 +step 1252 | loss: 3.851664 | lr 4.8812e-04 | norm: 0.3311 | dt: 1235.98ms | tok/sec: 424188.98 +step 1253 | loss: 3.892893 | lr 4.8774e-04 | norm: 0.3234 | dt: 1232.64ms | tok/sec: 425337.97 +step 1254 | loss: 3.872412 | lr 4.8735e-04 | norm: 0.3050 | dt: 1235.49ms | tok/sec: 424355.40 +step 1255 | loss: 3.884717 | lr 4.8697e-04 | norm: 0.3129 | dt: 1235.23ms | tok/sec: 424445.09 +step 1256 | loss: 3.855556 | lr 4.8658e-04 | norm: 0.3893 | dt: 1237.46ms | tok/sec: 423680.31 +step 1257 | loss: 3.875366 | lr 4.8619e-04 | norm: 0.4121 | dt: 1234.08ms | tok/sec: 424840.82 +step 1258 | loss: 3.901960 | lr 4.8580e-04 | norm: 0.3567 | dt: 1235.34ms | tok/sec: 424407.16 +step 1259 | loss: 3.855776 | lr 4.8542e-04 | norm: 0.3594 | dt: 1235.39ms | tok/sec: 424391.92 +step 1260 | loss: 3.883826 | lr 4.8503e-04 | norm: 0.3640 | dt: 1235.28ms | tok/sec: 424429.28 +step 1261 | loss: 3.908904 | lr 4.8464e-04 | norm: 0.3850 | dt: 1238.21ms | tok/sec: 423423.58 +step 1262 | loss: 3.966709 | lr 4.8425e-04 | norm: 0.3352 | dt: 1237.16ms | tok/sec: 423783.92 +step 1263 | loss: 3.936115 | lr 4.8386e-04 | norm: 0.3866 | dt: 1233.14ms | tok/sec: 425163.71 +step 1264 | loss: 3.981720 | lr 4.8347e-04 | norm: 0.3553 | dt: 1234.88ms | tok/sec: 424564.24 +step 1265 | loss: 3.900629 | lr 4.8308e-04 | norm: 0.3725 | dt: 1236.54ms | tok/sec: 423994.90 +step 1266 | loss: 3.926666 | lr 4.8268e-04 | norm: 0.3808 | dt: 1236.20ms | tok/sec: 424111.51 +step 1267 | loss: 3.904927 | lr 4.8229e-04 | norm: 0.3544 | dt: 1233.39ms | tok/sec: 425079.64 +step 1268 | loss: 3.900600 | lr 4.8190e-04 | norm: 0.3254 | dt: 1237.28ms | tok/sec: 423742.03 +step 1269 | loss: 3.905323 | lr 4.8151e-04 | norm: 0.3317 | dt: 1234.42ms | tok/sec: 424724.55 +step 1270 | loss: 3.906125 | lr 4.8111e-04 | norm: 0.2871 | dt: 1238.70ms | tok/sec: 423257.57 +step 1271 | loss: 3.968302 | lr 4.8072e-04 | norm: 0.3355 | dt: 1235.54ms | tok/sec: 424340.33 +step 1272 | loss: 3.937365 | lr 4.8032e-04 | norm: 0.3634 | dt: 1234.76ms | tok/sec: 424608.67 +step 1273 | loss: 3.966146 | lr 4.7993e-04 | norm: 0.4049 | dt: 1236.45ms | tok/sec: 424027.36 +step 1274 | loss: 3.865010 | lr 4.7953e-04 | norm: 0.3673 | dt: 1234.39ms | tok/sec: 424736.04 +validation loss: 3.8988 +validation perplexity: 49.3412 +step 1275 | loss: 3.927174 | lr 4.7914e-04 | norm: 0.3143 | dt: 2690.85ms | tok/sec: 194840.69 +step 1276 | loss: 3.921507 | lr 4.7874e-04 | norm: 0.3395 | dt: 1231.97ms | tok/sec: 425569.52 +step 1277 | loss: 3.944926 | lr 4.7835e-04 | norm: 0.3503 | dt: 1234.87ms | tok/sec: 424570.22 +step 1278 | loss: 3.864147 | lr 4.7795e-04 | norm: 0.2947 | dt: 1236.42ms | tok/sec: 424037.17 +step 1279 | loss: 3.851919 | lr 4.7755e-04 | norm: 0.3452 | dt: 1233.33ms | tok/sec: 425098.37 +step 1280 | loss: 3.859385 | lr 4.7715e-04 | norm: 0.3610 | dt: 1236.72ms | tok/sec: 423934.09 +step 1281 | loss: 3.791480 | lr 4.7675e-04 | norm: 0.3116 | dt: 1231.46ms | tok/sec: 425745.92 +step 1282 | loss: 3.850129 | lr 4.7635e-04 | norm: 0.3612 | dt: 1234.27ms | tok/sec: 424774.51 +step 1283 | loss: 3.897139 | lr 4.7595e-04 | norm: 0.3338 | dt: 1237.39ms | tok/sec: 423704.88 +step 1284 | loss: 3.873930 | lr 4.7555e-04 | norm: 0.3637 | dt: 1237.08ms | tok/sec: 423810.22 +step 1285 | loss: 3.891213 | lr 4.7515e-04 | norm: 0.3400 | dt: 1231.83ms | tok/sec: 425618.20 +step 1286 | loss: 3.873615 | lr 4.7475e-04 | norm: 0.3748 | dt: 1231.25ms | tok/sec: 425816.66 +step 1287 | loss: 3.892973 | lr 4.7435e-04 | norm: 0.3445 | dt: 1238.87ms | tok/sec: 423199.81 +step 1288 | loss: 3.892890 | lr 4.7395e-04 | norm: 0.3415 | dt: 1235.66ms | tok/sec: 424296.86 +step 1289 | loss: 3.942321 | lr 4.7355e-04 | norm: 0.3348 | dt: 1235.18ms | tok/sec: 424462.54 +step 1290 | loss: 3.883278 | lr 4.7315e-04 | norm: 0.3810 | dt: 1236.44ms | tok/sec: 424029.16 +step 1291 | loss: 3.901766 | lr 4.7274e-04 | norm: 0.3324 | dt: 1232.35ms | tok/sec: 425438.61 +step 1292 | loss: 3.902114 | lr 4.7234e-04 | norm: 0.3535 | dt: 1237.49ms | tok/sec: 423669.70 +step 1293 | loss: 3.899998 | lr 4.7193e-04 | norm: 0.3358 | dt: 1233.64ms | tok/sec: 424993.21 +step 1294 | loss: 3.873982 | lr 4.7153e-04 | norm: 0.3447 | dt: 1234.24ms | tok/sec: 424787.23 +step 1295 | loss: 3.847621 | lr 4.7113e-04 | norm: 0.3900 | dt: 1235.75ms | tok/sec: 424265.67 +step 1296 | loss: 3.938995 | lr 4.7072e-04 | norm: 0.4391 | dt: 1233.48ms | tok/sec: 425048.42 +step 1297 | loss: 3.986392 | lr 4.7031e-04 | norm: 0.3982 | dt: 1235.42ms | tok/sec: 424379.97 +step 1298 | loss: 3.918041 | lr 4.6991e-04 | norm: 0.3149 | dt: 1234.62ms | tok/sec: 424656.23 +step 1299 | loss: 3.959065 | lr 4.6950e-04 | norm: 0.3733 | dt: 1234.14ms | tok/sec: 424819.98 +validation loss: 3.8928 +validation perplexity: 49.0470 +step 1300 | loss: 3.891171 | lr 4.6909e-04 | norm: 0.3000 | dt: 2687.83ms | tok/sec: 195059.89 +step 1301 | loss: 4.001854 | lr 4.6869e-04 | norm: 0.3404 | dt: 1235.01ms | tok/sec: 424522.60 +step 1302 | loss: 3.897438 | lr 4.6828e-04 | norm: 0.3513 | dt: 1238.18ms | tok/sec: 423435.65 +step 1303 | loss: 3.936044 | lr 4.6787e-04 | norm: 0.3507 | dt: 1234.13ms | tok/sec: 424824.49 +step 1304 | loss: 3.929419 | lr 4.6746e-04 | norm: 0.4032 | dt: 1232.53ms | tok/sec: 425376.64 +step 1305 | loss: 3.931777 | lr 4.6705e-04 | norm: 0.4033 | dt: 1231.57ms | tok/sec: 425707.10 +step 1306 | loss: 3.941201 | lr 4.6664e-04 | norm: 0.3812 | dt: 1235.91ms | tok/sec: 424212.22 +step 1307 | loss: 3.912880 | lr 4.6623e-04 | norm: 0.3925 | dt: 1237.61ms | tok/sec: 423630.77 +step 1308 | loss: 3.858922 | lr 4.6582e-04 | norm: 0.4225 | dt: 1234.77ms | tok/sec: 424604.33 +step 1309 | loss: 3.844950 | lr 4.6541e-04 | norm: 0.3816 | dt: 1235.43ms | tok/sec: 424377.35 +step 1310 | loss: 3.923794 | lr 4.6500e-04 | norm: 0.3701 | dt: 1235.61ms | tok/sec: 424315.28 +step 1311 | loss: 3.866172 | lr 4.6459e-04 | norm: 0.3661 | dt: 1234.34ms | tok/sec: 424753.18 +step 1312 | loss: 3.856927 | lr 4.6418e-04 | norm: 0.3156 | dt: 1235.27ms | tok/sec: 424430.50 +step 1313 | loss: 3.822062 | lr 4.6376e-04 | norm: 0.4264 | dt: 1234.87ms | tok/sec: 424569.81 +step 1314 | loss: 3.824650 | lr 4.6335e-04 | norm: 0.3935 | dt: 1229.99ms | tok/sec: 426252.47 +step 1315 | loss: 3.871274 | lr 4.6294e-04 | norm: 0.3576 | dt: 1233.99ms | tok/sec: 424873.66 +step 1316 | loss: 3.912968 | lr 4.6252e-04 | norm: 0.3820 | dt: 1233.78ms | tok/sec: 424943.44 +step 1317 | loss: 3.936815 | lr 4.6211e-04 | norm: 0.3405 | dt: 1235.63ms | tok/sec: 424309.30 +step 1318 | loss: 3.862797 | lr 4.6169e-04 | norm: 0.3720 | dt: 1232.58ms | tok/sec: 425358.13 +step 1319 | loss: 3.903363 | lr 4.6128e-04 | norm: 0.3294 | dt: 1236.60ms | tok/sec: 423974.87 +step 1320 | loss: 3.897626 | lr 4.6086e-04 | norm: 0.3782 | dt: 1235.56ms | tok/sec: 424330.92 +step 1321 | loss: 3.865008 | lr 4.6045e-04 | norm: 0.3154 | dt: 1233.32ms | tok/sec: 425101.99 +step 1322 | loss: 4.013382 | lr 4.6003e-04 | norm: 0.3743 | dt: 1231.76ms | tok/sec: 425640.20 +step 1323 | loss: 3.857874 | lr 4.5962e-04 | norm: 0.5162 | dt: 1234.46ms | tok/sec: 424711.02 +step 1324 | loss: 3.974930 | lr 4.5920e-04 | norm: 0.4940 | dt: 1234.94ms | tok/sec: 424546.53 +validation loss: 3.8954 +validation perplexity: 49.1778 +step 1325 | loss: 3.883108 | lr 4.5878e-04 | norm: 0.4110 | dt: 2684.21ms | tok/sec: 195323.00 +step 1326 | loss: 3.953423 | lr 4.5836e-04 | norm: 0.5133 | dt: 1234.55ms | tok/sec: 424679.52 +step 1327 | loss: 3.934145 | lr 4.5794e-04 | norm: 0.4309 | dt: 1233.65ms | tok/sec: 424988.61 +step 1328 | loss: 3.883750 | lr 4.5753e-04 | norm: 0.4419 | dt: 1240.03ms | tok/sec: 422803.31 +step 1329 | loss: 3.918973 | lr 4.5711e-04 | norm: 0.3497 | dt: 1231.07ms | tok/sec: 425879.42 +step 1330 | loss: 3.916904 | lr 4.5669e-04 | norm: 0.3847 | dt: 1234.84ms | tok/sec: 424578.17 +step 1331 | loss: 3.939579 | lr 4.5627e-04 | norm: 0.3451 | dt: 1237.00ms | tok/sec: 423836.77 +step 1332 | loss: 3.893400 | lr 4.5585e-04 | norm: 0.3695 | dt: 1234.00ms | tok/sec: 424870.13 +step 1333 | loss: 3.955024 | lr 4.5543e-04 | norm: 0.4187 | dt: 1236.08ms | tok/sec: 424155.03 +step 1334 | loss: 3.945915 | lr 4.5501e-04 | norm: 0.3549 | dt: 1236.34ms | tok/sec: 424066.12 +step 1335 | loss: 3.943354 | lr 4.5458e-04 | norm: 0.3768 | dt: 1234.93ms | tok/sec: 424549.81 +step 1336 | loss: 3.928137 | lr 4.5416e-04 | norm: 0.3863 | dt: 1233.48ms | tok/sec: 425046.44 +step 1337 | loss: 3.929173 | lr 4.5374e-04 | norm: 0.3239 | dt: 1237.67ms | tok/sec: 423607.67 +step 1338 | loss: 4.003134 | lr 4.5332e-04 | norm: 0.3614 | dt: 1234.24ms | tok/sec: 424786.99 +step 1339 | loss: 3.932567 | lr 4.5289e-04 | norm: 0.3681 | dt: 1236.12ms | tok/sec: 424141.45 +step 1340 | loss: 3.919448 | lr 4.5247e-04 | norm: 0.3332 | dt: 1235.53ms | tok/sec: 424342.46 +step 1341 | loss: 3.920207 | lr 4.5205e-04 | norm: 0.3171 | dt: 1235.76ms | tok/sec: 424261.90 +step 1342 | loss: 3.905877 | lr 4.5162e-04 | norm: 0.3441 | dt: 1237.72ms | tok/sec: 423590.62 +step 1343 | loss: 3.993318 | lr 4.5120e-04 | norm: 0.3587 | dt: 1237.91ms | tok/sec: 423525.11 +step 1344 | loss: 3.918101 | lr 4.5077e-04 | norm: 0.3169 | dt: 1236.49ms | tok/sec: 424014.03 +step 1345 | loss: 3.881290 | lr 4.5035e-04 | norm: 0.3548 | dt: 1236.94ms | tok/sec: 423858.17 +step 1346 | loss: 3.847164 | lr 4.4992e-04 | norm: 0.3019 | dt: 1238.05ms | tok/sec: 423479.03 +step 1347 | loss: 3.892815 | lr 4.4950e-04 | norm: 0.3230 | dt: 1236.38ms | tok/sec: 424050.82 +step 1348 | loss: 3.867805 | lr 4.4907e-04 | norm: 0.3197 | dt: 1232.69ms | tok/sec: 425320.12 +step 1349 | loss: 3.889949 | lr 4.4864e-04 | norm: 0.3231 | dt: 1233.23ms | tok/sec: 425133.96 +validation loss: 3.8905 +validation perplexity: 48.9352 +step 1350 | loss: 3.917588 | lr 4.4822e-04 | norm: 0.2993 | dt: 2684.25ms | tok/sec: 195319.93 +step 1351 | loss: 3.886204 | lr 4.4779e-04 | norm: 0.3134 | dt: 1230.95ms | tok/sec: 425922.89 +step 1352 | loss: 3.816982 | lr 4.4736e-04 | norm: 0.3176 | dt: 1235.90ms | tok/sec: 424216.89 +step 1353 | loss: 3.871327 | lr 4.4693e-04 | norm: 0.3137 | dt: 1236.91ms | tok/sec: 423870.84 +step 1354 | loss: 3.808421 | lr 4.4651e-04 | norm: 0.3747 | dt: 1233.44ms | tok/sec: 425060.74 +step 1355 | loss: 3.828454 | lr 4.4608e-04 | norm: 0.3550 | dt: 1234.53ms | tok/sec: 424684.61 +step 1356 | loss: 3.873611 | lr 4.4565e-04 | norm: 0.2926 | dt: 1235.98ms | tok/sec: 424186.45 +step 1357 | loss: 3.874091 | lr 4.4522e-04 | norm: 0.3560 | dt: 1236.03ms | tok/sec: 424170.33 +step 1358 | loss: 3.883724 | lr 4.4479e-04 | norm: 0.3635 | dt: 1859.82ms | tok/sec: 281903.04 +step 1359 | loss: 3.911620 | lr 4.4436e-04 | norm: 0.3454 | dt: 1233.00ms | tok/sec: 425213.70 +step 1360 | loss: 3.861668 | lr 4.4393e-04 | norm: 0.3759 | dt: 1237.77ms | tok/sec: 423573.73 +step 1361 | loss: 3.883086 | lr 4.4350e-04 | norm: 0.3878 | dt: 1238.37ms | tok/sec: 423368.96 +step 1362 | loss: 3.951373 | lr 4.4307e-04 | norm: 0.3649 | dt: 1236.05ms | tok/sec: 424164.60 +step 1363 | loss: 3.863725 | lr 4.4263e-04 | norm: 0.3805 | dt: 1237.79ms | tok/sec: 423569.24 +step 1364 | loss: 3.871984 | lr 4.4220e-04 | norm: 0.3360 | dt: 1235.82ms | tok/sec: 424243.32 +step 1365 | loss: 3.915846 | lr 4.4177e-04 | norm: 0.3546 | dt: 1235.64ms | tok/sec: 424303.73 +step 1366 | loss: 3.908914 | lr 4.4134e-04 | norm: 0.3848 | dt: 1234.13ms | tok/sec: 424822.27 +step 1367 | loss: 3.933050 | lr 4.4090e-04 | norm: 0.4369 | dt: 1236.41ms | tok/sec: 424040.19 +step 1368 | loss: 3.955879 | lr 4.4047e-04 | norm: 0.3420 | dt: 1236.52ms | tok/sec: 424003.56 +step 1369 | loss: 3.957287 | lr 4.4004e-04 | norm: 0.3730 | dt: 1235.16ms | tok/sec: 424469.91 +step 1370 | loss: 3.946336 | lr 4.3960e-04 | norm: 0.3427 | dt: 1237.41ms | tok/sec: 423697.05 +step 1371 | loss: 3.934758 | lr 4.3917e-04 | norm: 0.3663 | dt: 1236.12ms | tok/sec: 424138.42 +step 1372 | loss: 3.936662 | lr 4.3873e-04 | norm: 0.3264 | dt: 1237.62ms | tok/sec: 423626.28 +step 1373 | loss: 3.947383 | lr 4.3830e-04 | norm: 0.3149 | dt: 1237.40ms | tok/sec: 423701.94 +step 1374 | loss: 3.931239 | lr 4.3786e-04 | norm: 0.3128 | dt: 1236.87ms | tok/sec: 423882.28 +validation loss: 3.8893 +validation perplexity: 48.8791 +step 1375 | loss: 3.890212 | lr 4.3743e-04 | norm: 0.3095 | dt: 2694.03ms | tok/sec: 194611.11 +step 1376 | loss: 3.899067 | lr 4.3699e-04 | norm: 0.2907 | dt: 1234.06ms | tok/sec: 424848.29 +step 1377 | loss: 3.916103 | lr 4.3655e-04 | norm: 0.3578 | dt: 1234.58ms | tok/sec: 424668.12 +step 1378 | loss: 3.928525 | lr 4.3612e-04 | norm: 0.3394 | dt: 1232.07ms | tok/sec: 425533.04 +step 1379 | loss: 3.867427 | lr 4.3568e-04 | norm: 0.3168 | dt: 1232.38ms | tok/sec: 425428.24 +step 1380 | loss: 3.853868 | lr 4.3524e-04 | norm: 0.3053 | dt: 1233.73ms | tok/sec: 424962.58 +step 1381 | loss: 3.854429 | lr 4.3480e-04 | norm: 0.3299 | dt: 1233.55ms | tok/sec: 425025.41 +step 1382 | loss: 3.858767 | lr 4.3437e-04 | norm: 0.3738 | dt: 1234.19ms | tok/sec: 424802.25 +step 1383 | loss: 3.864419 | lr 4.3393e-04 | norm: 0.3502 | dt: 1236.53ms | tok/sec: 424000.54 +step 1384 | loss: 3.943118 | lr 4.3349e-04 | norm: 0.3511 | dt: 1236.75ms | tok/sec: 423923.30 +step 1385 | loss: 3.910729 | lr 4.3305e-04 | norm: 0.3796 | dt: 1232.84ms | tok/sec: 425270.11 +step 1386 | loss: 3.899460 | lr 4.3261e-04 | norm: 0.3161 | dt: 1235.36ms | tok/sec: 424402.08 +step 1387 | loss: 3.862000 | lr 4.3217e-04 | norm: 0.3766 | dt: 1236.80ms | tok/sec: 423908.02 +step 1388 | loss: 3.924715 | lr 4.3173e-04 | norm: 0.3519 | dt: 1233.13ms | tok/sec: 425166.84 +step 1389 | loss: 3.896152 | lr 4.3129e-04 | norm: 0.3557 | dt: 1232.88ms | tok/sec: 425254.15 +step 1390 | loss: 3.888381 | lr 4.3085e-04 | norm: 0.3189 | dt: 1235.83ms | tok/sec: 424239.47 +step 1391 | loss: 3.992512 | lr 4.3041e-04 | norm: 0.4269 | dt: 1233.27ms | tok/sec: 425119.99 +step 1392 | loss: 3.884790 | lr 4.2997e-04 | norm: 0.4302 | dt: 1232.52ms | tok/sec: 425380.34 +step 1393 | loss: 3.854874 | lr 4.2953e-04 | norm: 0.3886 | dt: 1232.73ms | tok/sec: 425305.23 +step 1394 | loss: 3.856329 | lr 4.2908e-04 | norm: 0.3367 | dt: 1233.06ms | tok/sec: 425191.42 +step 1395 | loss: 3.855596 | lr 4.2864e-04 | norm: 0.3613 | dt: 1235.80ms | tok/sec: 424248.56 +step 1396 | loss: 3.846492 | lr 4.2820e-04 | norm: 0.3145 | dt: 1233.60ms | tok/sec: 425004.79 +step 1397 | loss: 3.875684 | lr 4.2776e-04 | norm: 0.3706 | dt: 1233.14ms | tok/sec: 425163.71 +step 1398 | loss: 3.879281 | lr 4.2731e-04 | norm: 0.3221 | dt: 1236.13ms | tok/sec: 424137.93 +step 1399 | loss: 3.906952 | lr 4.2687e-04 | norm: 0.3140 | dt: 1235.53ms | tok/sec: 424342.87 +validation loss: 3.8828 +validation perplexity: 48.5611 +step 1400 | loss: 3.871735 | lr 4.2643e-04 | norm: 0.3459 | dt: 2689.91ms | tok/sec: 194908.82 +step 1401 | loss: 3.886252 | lr 4.2598e-04 | norm: 0.3283 | dt: 1232.38ms | tok/sec: 425428.24 +step 1402 | loss: 3.888371 | lr 4.2554e-04 | norm: 0.3375 | dt: 1238.33ms | tok/sec: 423383.71 +step 1403 | loss: 3.899438 | lr 4.2509e-04 | norm: 0.3343 | dt: 1231.85ms | tok/sec: 425609.96 +step 1404 | loss: 3.892738 | lr 4.2465e-04 | norm: 0.3434 | dt: 1236.22ms | tok/sec: 424104.06 +step 1405 | loss: 3.952327 | lr 4.2420e-04 | norm: 0.3137 | dt: 1234.75ms | tok/sec: 424611.21 +step 1406 | loss: 3.957063 | lr 4.2376e-04 | norm: 0.3402 | dt: 1236.50ms | tok/sec: 424011.33 +step 1407 | loss: 3.910203 | lr 4.2331e-04 | norm: 0.3795 | dt: 1233.70ms | tok/sec: 424973.09 +step 1408 | loss: 3.966781 | lr 4.2287e-04 | norm: 0.3854 | dt: 1232.45ms | tok/sec: 425403.88 +step 1409 | loss: 3.918784 | lr 4.2242e-04 | norm: 0.3256 | dt: 1238.18ms | tok/sec: 423432.71 +step 1410 | loss: 3.942316 | lr 4.2197e-04 | norm: 0.3020 | dt: 1237.92ms | tok/sec: 423523.72 +step 1411 | loss: 3.894591 | lr 4.2153e-04 | norm: 0.3231 | dt: 1234.01ms | tok/sec: 424863.81 +step 1412 | loss: 3.877861 | lr 4.2108e-04 | norm: 0.3295 | dt: 1232.34ms | tok/sec: 425442.64 +step 1413 | loss: 3.860137 | lr 4.2063e-04 | norm: 0.3037 | dt: 1235.34ms | tok/sec: 424409.29 +step 1414 | loss: 3.821226 | lr 4.2018e-04 | norm: 0.2695 | dt: 1235.24ms | tok/sec: 424442.63 +step 1415 | loss: 3.898959 | lr 4.1974e-04 | norm: 0.3147 | dt: 1233.64ms | tok/sec: 424991.16 +step 1416 | loss: 3.869988 | lr 4.1929e-04 | norm: 0.2870 | dt: 1232.60ms | tok/sec: 425350.89 +step 1417 | loss: 3.884906 | lr 4.1884e-04 | norm: 0.2975 | dt: 1234.09ms | tok/sec: 424837.54 +step 1418 | loss: 3.858839 | lr 4.1839e-04 | norm: 0.3001 | dt: 1234.76ms | tok/sec: 424607.69 +step 1419 | loss: 3.870673 | lr 4.1794e-04 | norm: 0.3417 | dt: 1235.25ms | tok/sec: 424439.68 +step 1420 | loss: 3.906587 | lr 4.1749e-04 | norm: 0.4020 | dt: 1232.26ms | tok/sec: 425468.24 +step 1421 | loss: 3.894596 | lr 4.1704e-04 | norm: 0.4597 | dt: 1234.54ms | tok/sec: 424681.90 +step 1422 | loss: 3.893702 | lr 4.1659e-04 | norm: 0.4914 | dt: 1234.77ms | tok/sec: 424603.34 +step 1423 | loss: 3.863409 | lr 4.1614e-04 | norm: 0.3185 | dt: 1234.37ms | tok/sec: 424740.79 +step 1424 | loss: 3.877488 | lr 4.1569e-04 | norm: 0.3473 | dt: 1234.57ms | tok/sec: 424671.97 +validation loss: 3.8818 +validation perplexity: 48.5126 +step 1425 | loss: 3.832700 | lr 4.1524e-04 | norm: 0.3978 | dt: 2685.82ms | tok/sec: 195206.13 +step 1426 | loss: 3.874631 | lr 4.1479e-04 | norm: 0.3720 | dt: 1234.31ms | tok/sec: 424761.80 +step 1427 | loss: 3.850188 | lr 4.1434e-04 | norm: 0.3513 | dt: 1234.86ms | tok/sec: 424572.52 +step 1428 | loss: 3.793015 | lr 4.1389e-04 | norm: 0.3516 | dt: 1234.58ms | tok/sec: 424669.84 +step 1429 | loss: 3.930429 | lr 4.1343e-04 | norm: 0.3218 | dt: 1234.18ms | tok/sec: 424808.24 +step 1430 | loss: 3.819598 | lr 4.1298e-04 | norm: 0.3462 | dt: 1236.58ms | tok/sec: 423981.33 +step 1431 | loss: 3.812226 | lr 4.1253e-04 | norm: 0.3225 | dt: 1236.03ms | tok/sec: 424172.37 +step 1432 | loss: 3.915162 | lr 4.1208e-04 | norm: 0.3270 | dt: 1230.73ms | tok/sec: 425996.32 +step 1433 | loss: 3.898729 | lr 4.1162e-04 | norm: 0.3768 | dt: 1237.01ms | tok/sec: 423835.22 +step 1434 | loss: 3.917945 | lr 4.1117e-04 | norm: 0.3371 | dt: 1233.31ms | tok/sec: 425107.25 +step 1435 | loss: 3.894384 | lr 4.1072e-04 | norm: 0.3487 | dt: 1236.76ms | tok/sec: 423920.27 +step 1436 | loss: 3.918571 | lr 4.1026e-04 | norm: 0.3731 | dt: 1240.16ms | tok/sec: 422758.36 +step 1437 | loss: 3.896502 | lr 4.0981e-04 | norm: 0.3236 | dt: 1233.25ms | tok/sec: 425127.05 +step 1438 | loss: 3.927071 | lr 4.0936e-04 | norm: 0.3749 | dt: 1238.26ms | tok/sec: 423408.01 +step 1439 | loss: 3.890802 | lr 4.0890e-04 | norm: 0.3215 | dt: 1234.59ms | tok/sec: 424664.59 +step 1440 | loss: 3.884275 | lr 4.0845e-04 | norm: 0.3175 | dt: 1236.44ms | tok/sec: 424030.63 +step 1441 | loss: 3.871756 | lr 4.0799e-04 | norm: 0.3065 | dt: 1235.02ms | tok/sec: 424517.52 +step 1442 | loss: 3.891528 | lr 4.0754e-04 | norm: 0.3365 | dt: 1233.44ms | tok/sec: 425060.00 +step 1443 | loss: 3.883567 | lr 4.0708e-04 | norm: 0.3454 | dt: 1237.82ms | tok/sec: 423557.74 +step 1444 | loss: 3.908018 | lr 4.0663e-04 | norm: 0.3285 | dt: 1235.44ms | tok/sec: 424373.17 +step 1445 | loss: 3.909577 | lr 4.0617e-04 | norm: 0.3345 | dt: 1234.83ms | tok/sec: 424581.53 +step 1446 | loss: 3.874475 | lr 4.0572e-04 | norm: 0.3269 | dt: 1239.01ms | tok/sec: 423150.71 +step 1447 | loss: 3.927028 | lr 4.0526e-04 | norm: 0.3143 | dt: 1234.79ms | tok/sec: 424595.31 +step 1448 | loss: 3.897792 | lr 4.0480e-04 | norm: 0.3213 | dt: 1235.23ms | tok/sec: 424444.35 +step 1449 | loss: 3.843796 | lr 4.0435e-04 | norm: 0.3237 | dt: 1235.69ms | tok/sec: 424286.30 +validation loss: 3.8762 +validation perplexity: 48.2385 +step 1450 | loss: 3.919767 | lr 4.0389e-04 | norm: 0.3183 | dt: 2671.37ms | tok/sec: 196261.77 +step 1451 | loss: 3.873205 | lr 4.0343e-04 | norm: 0.2716 | dt: 1238.10ms | tok/sec: 423461.33 +step 1452 | loss: 3.878064 | lr 4.0297e-04 | norm: 0.3521 | dt: 1237.95ms | tok/sec: 423512.06 +step 1453 | loss: 3.888861 | lr 4.0252e-04 | norm: 0.2678 | dt: 1234.86ms | tok/sec: 424571.78 +step 1454 | loss: 3.876286 | lr 4.0206e-04 | norm: 0.3122 | dt: 1238.11ms | tok/sec: 423457.74 +step 1455 | loss: 3.915097 | lr 4.0160e-04 | norm: 0.2830 | dt: 1234.84ms | tok/sec: 424579.90 +step 1456 | loss: 3.869443 | lr 4.0114e-04 | norm: 0.3480 | dt: 1234.37ms | tok/sec: 424742.27 +step 1457 | loss: 3.909100 | lr 4.0068e-04 | norm: 0.2767 | dt: 1233.17ms | tok/sec: 425154.51 +step 1458 | loss: 3.825988 | lr 4.0023e-04 | norm: 0.3518 | dt: 1234.80ms | tok/sec: 424594.98 +step 1459 | loss: 3.830176 | lr 3.9977e-04 | norm: 0.3200 | dt: 1233.93ms | tok/sec: 424894.02 +step 1460 | loss: 3.905799 | lr 3.9931e-04 | norm: 0.3351 | dt: 1235.09ms | tok/sec: 424495.31 +step 1461 | loss: 3.923129 | lr 3.9885e-04 | norm: 0.3310 | dt: 1234.10ms | tok/sec: 424834.01 +step 1462 | loss: 3.851009 | lr 3.9839e-04 | norm: 0.2791 | dt: 1236.48ms | tok/sec: 424015.26 +step 1463 | loss: 3.950773 | lr 3.9793e-04 | norm: 0.3351 | dt: 1234.17ms | tok/sec: 424808.98 +step 1464 | loss: 3.910339 | lr 3.9747e-04 | norm: 0.3561 | dt: 1235.80ms | tok/sec: 424250.36 +step 1465 | loss: 3.847375 | lr 3.9701e-04 | norm: 0.3156 | dt: 1234.66ms | tok/sec: 424641.80 +step 1466 | loss: 3.856455 | lr 3.9655e-04 | norm: 0.3526 | dt: 1234.15ms | tok/sec: 424816.20 +step 1467 | loss: 3.866651 | lr 3.9609e-04 | norm: 0.3543 | dt: 1233.81ms | tok/sec: 424935.48 +step 1468 | loss: 3.898773 | lr 3.9563e-04 | norm: 0.3418 | dt: 1234.81ms | tok/sec: 424591.04 +step 1469 | loss: 3.830728 | lr 3.9517e-04 | norm: 0.3125 | dt: 1235.55ms | tok/sec: 424334.03 +step 1470 | loss: 3.881809 | lr 3.9470e-04 | norm: 0.4023 | dt: 1236.05ms | tok/sec: 424164.35 +step 1471 | loss: 3.821984 | lr 3.9424e-04 | norm: 0.3018 | dt: 1234.67ms | tok/sec: 424638.43 +step 1472 | loss: 3.893860 | lr 3.9378e-04 | norm: 0.3132 | dt: 1239.55ms | tok/sec: 422964.81 +step 1473 | loss: 3.910213 | lr 3.9332e-04 | norm: 0.3756 | dt: 1233.96ms | tok/sec: 424883.92 +step 1474 | loss: 3.836158 | lr 3.9286e-04 | norm: 0.3175 | dt: 1233.68ms | tok/sec: 424978.02 +validation loss: 3.8715 +validation perplexity: 48.0121 +step 1475 | loss: 3.928776 | lr 3.9239e-04 | norm: 0.3480 | dt: 2691.16ms | tok/sec: 194818.77 +step 1476 | loss: 3.812323 | lr 3.9193e-04 | norm: 0.3123 | dt: 1234.48ms | tok/sec: 424703.14 +step 1477 | loss: 3.921005 | lr 3.9147e-04 | norm: 0.2998 | dt: 1236.53ms | tok/sec: 423999.80 +step 1478 | loss: 3.940937 | lr 3.9101e-04 | norm: 0.3407 | dt: 1234.58ms | tok/sec: 424669.76 +step 1479 | loss: 3.871354 | lr 3.9054e-04 | norm: 0.3197 | dt: 1236.41ms | tok/sec: 424042.16 +step 1480 | loss: 3.886744 | lr 3.9008e-04 | norm: 0.3045 | dt: 1231.83ms | tok/sec: 425618.69 +step 1481 | loss: 3.924195 | lr 3.8962e-04 | norm: 0.3304 | dt: 1237.12ms | tok/sec: 423796.26 +step 1482 | loss: 3.857725 | lr 3.8915e-04 | norm: 0.3071 | dt: 1238.82ms | tok/sec: 423214.31 +step 1483 | loss: 3.902533 | lr 3.8869e-04 | norm: 0.3604 | dt: 1232.26ms | tok/sec: 425470.30 +step 1484 | loss: 3.999000 | lr 3.8823e-04 | norm: 0.3350 | dt: 1233.12ms | tok/sec: 425172.02 +step 1485 | loss: 3.820047 | lr 3.8776e-04 | norm: 0.3737 | dt: 1233.57ms | tok/sec: 425016.87 +step 1486 | loss: 3.870992 | lr 3.8730e-04 | norm: 0.3751 | dt: 1239.48ms | tok/sec: 422991.09 +step 1487 | loss: 3.885769 | lr 3.8683e-04 | norm: 0.3031 | dt: 1233.77ms | tok/sec: 424947.06 +step 1488 | loss: 3.880045 | lr 3.8637e-04 | norm: 0.3312 | dt: 1230.95ms | tok/sec: 425922.72 +step 1489 | loss: 3.816723 | lr 3.8590e-04 | norm: 0.3052 | dt: 1232.96ms | tok/sec: 425227.51 +step 1490 | loss: 3.862901 | lr 3.8544e-04 | norm: 0.3321 | dt: 1231.58ms | tok/sec: 425702.00 +step 1491 | loss: 3.835845 | lr 3.8497e-04 | norm: 0.3433 | dt: 1232.21ms | tok/sec: 425485.37 +step 1492 | loss: 3.834021 | lr 3.8451e-04 | norm: 0.3564 | dt: 1232.04ms | tok/sec: 425544.73 +step 1493 | loss: 3.889003 | lr 3.8404e-04 | norm: 0.3600 | dt: 1237.75ms | tok/sec: 423582.54 +step 1494 | loss: 3.818891 | lr 3.8358e-04 | norm: 0.3657 | dt: 1235.50ms | tok/sec: 424354.42 +step 1495 | loss: 3.842494 | lr 3.8311e-04 | norm: 0.3252 | dt: 1237.53ms | tok/sec: 423655.42 +step 1496 | loss: 3.877550 | lr 3.8265e-04 | norm: 0.3407 | dt: 1234.69ms | tok/sec: 424632.78 +step 1497 | loss: 3.842907 | lr 3.8218e-04 | norm: 0.4222 | dt: 1234.78ms | tok/sec: 424598.75 +step 1498 | loss: 3.877005 | lr 3.8171e-04 | norm: 0.3728 | dt: 1235.72ms | tok/sec: 424276.80 +step 1499 | loss: 3.892327 | lr 3.8125e-04 | norm: 0.4109 | dt: 1233.37ms | tok/sec: 425084.32 +validation loss: 3.8731 +validation perplexity: 48.0936 +step 1500 | loss: 3.827914 | lr 3.8078e-04 | norm: 0.3331 | dt: 2674.01ms | tok/sec: 196068.18 +step 1501 | loss: 3.804693 | lr 3.8031e-04 | norm: 0.3154 | dt: 1236.32ms | tok/sec: 424069.80 +step 1502 | loss: 3.880463 | lr 3.7985e-04 | norm: 0.3878 | dt: 1235.37ms | tok/sec: 424398.89 +step 1503 | loss: 3.880233 | lr 3.7938e-04 | norm: 0.3271 | dt: 1236.55ms | tok/sec: 423991.79 +step 1504 | loss: 3.855524 | lr 3.7891e-04 | norm: 0.3320 | dt: 1235.06ms | tok/sec: 424505.31 +step 1505 | loss: 3.859645 | lr 3.7844e-04 | norm: 0.3303 | dt: 1235.72ms | tok/sec: 424277.70 +step 1506 | loss: 3.863978 | lr 3.7798e-04 | norm: 0.3405 | dt: 1232.86ms | tok/sec: 425259.99 +step 1507 | loss: 3.917581 | lr 3.7751e-04 | norm: 0.2949 | dt: 1236.23ms | tok/sec: 424103.90 +step 1508 | loss: 3.925220 | lr 3.7704e-04 | norm: 0.3315 | dt: 1234.21ms | tok/sec: 424794.86 +step 1509 | loss: 3.872885 | lr 3.7657e-04 | norm: 0.3684 | dt: 1236.41ms | tok/sec: 424041.67 +step 1510 | loss: 3.947422 | lr 3.7610e-04 | norm: 0.3459 | dt: 1237.77ms | tok/sec: 423574.46 +step 1511 | loss: 3.856168 | lr 3.7564e-04 | norm: 0.2993 | dt: 1236.47ms | tok/sec: 424019.26 +step 1512 | loss: 3.868588 | lr 3.7517e-04 | norm: 0.3096 | dt: 1234.88ms | tok/sec: 424565.30 +step 1513 | loss: 3.844611 | lr 3.7470e-04 | norm: 0.3325 | dt: 1236.96ms | tok/sec: 423851.31 +step 1514 | loss: 3.913291 | lr 3.7423e-04 | norm: 0.3405 | dt: 1235.38ms | tok/sec: 424394.55 +step 1515 | loss: 3.843402 | lr 3.7376e-04 | norm: 0.2918 | dt: 1235.52ms | tok/sec: 424344.34 +step 1516 | loss: 3.890829 | lr 3.7329e-04 | norm: 0.3100 | dt: 1233.57ms | tok/sec: 425015.31 +step 1517 | loss: 3.845786 | lr 3.7282e-04 | norm: 0.2889 | dt: 1235.44ms | tok/sec: 424374.40 +step 1518 | loss: 3.929813 | lr 3.7235e-04 | norm: 0.3064 | dt: 1234.25ms | tok/sec: 424782.15 +step 1519 | loss: 3.868508 | lr 3.7189e-04 | norm: 0.3510 | dt: 1235.42ms | tok/sec: 424379.48 +step 1520 | loss: 3.940339 | lr 3.7142e-04 | norm: 0.4126 | dt: 1237.43ms | tok/sec: 423692.07 +step 1521 | loss: 3.870209 | lr 3.7095e-04 | norm: 0.3360 | dt: 1234.63ms | tok/sec: 424652.87 +step 1522 | loss: 3.858093 | lr 3.7048e-04 | norm: 0.3087 | dt: 1232.34ms | tok/sec: 425441.41 +step 1523 | loss: 3.839057 | lr 3.7001e-04 | norm: 0.3510 | dt: 1235.84ms | tok/sec: 424235.96 +step 1524 | loss: 3.879194 | lr 3.6954e-04 | norm: 0.2831 | dt: 1236.46ms | tok/sec: 424022.70 +validation loss: 3.8671 +validation perplexity: 47.8026 +step 1525 | loss: 3.812669 | lr 3.6907e-04 | norm: 0.2780 | dt: 2695.44ms | tok/sec: 194509.57 +step 1526 | loss: 3.850997 | lr 3.6860e-04 | norm: 0.3125 | dt: 1233.07ms | tok/sec: 425188.05 +step 1527 | loss: 3.847092 | lr 3.6813e-04 | norm: 0.2926 | dt: 1237.30ms | tok/sec: 423735.01 +step 1528 | loss: 3.871413 | lr 3.6766e-04 | norm: 0.2922 | dt: 1235.53ms | tok/sec: 424342.95 +step 1529 | loss: 3.874473 | lr 3.6718e-04 | norm: 0.2904 | dt: 1234.76ms | tok/sec: 424605.80 +step 1530 | loss: 3.890857 | lr 3.6671e-04 | norm: 0.3046 | dt: 1231.77ms | tok/sec: 425637.31 +step 1531 | loss: 3.905516 | lr 3.6624e-04 | norm: 0.3431 | dt: 1235.46ms | tok/sec: 424365.88 +step 1532 | loss: 3.885553 | lr 3.6577e-04 | norm: 0.3208 | dt: 1233.58ms | tok/sec: 425013.50 +step 1533 | loss: 3.897552 | lr 3.6530e-04 | norm: 0.3415 | dt: 1235.54ms | tok/sec: 424337.47 +step 1534 | loss: 3.881003 | lr 3.6483e-04 | norm: 0.3446 | dt: 1237.71ms | tok/sec: 423595.35 +step 1535 | loss: 3.878498 | lr 3.6436e-04 | norm: 0.3233 | dt: 1233.88ms | tok/sec: 424908.79 +step 1536 | loss: 3.860620 | lr 3.6389e-04 | norm: 0.3077 | dt: 1238.16ms | tok/sec: 423441.68 +step 1537 | loss: 3.918733 | lr 3.6342e-04 | norm: 0.3204 | dt: 1234.37ms | tok/sec: 424741.45 +step 1538 | loss: 3.839806 | lr 3.6294e-04 | norm: 0.3039 | dt: 1236.92ms | tok/sec: 423866.34 +step 1539 | loss: 3.865065 | lr 3.6247e-04 | norm: 0.3528 | dt: 1235.65ms | tok/sec: 424301.36 +step 1540 | loss: 3.897463 | lr 3.6200e-04 | norm: 0.3045 | dt: 1237.92ms | tok/sec: 423524.78 +step 1541 | loss: 3.902369 | lr 3.6153e-04 | norm: 0.3068 | dt: 1234.64ms | tok/sec: 424647.54 +step 1542 | loss: 3.886777 | lr 3.6106e-04 | norm: 0.3225 | dt: 1235.65ms | tok/sec: 424302.67 +step 1543 | loss: 3.905871 | lr 3.6058e-04 | norm: 0.3378 | dt: 1236.32ms | tok/sec: 424072.41 +step 1544 | loss: 3.887436 | lr 3.6011e-04 | norm: 0.3138 | dt: 1233.11ms | tok/sec: 425174.32 +step 1545 | loss: 3.867934 | lr 3.5964e-04 | norm: 0.2825 | dt: 1234.81ms | tok/sec: 424589.24 +step 1546 | loss: 3.873189 | lr 3.5917e-04 | norm: 0.3000 | dt: 1236.02ms | tok/sec: 424173.84 +step 1547 | loss: 3.856560 | lr 3.5870e-04 | norm: 0.2980 | dt: 1236.76ms | tok/sec: 423921.99 +step 1548 | loss: 3.876107 | lr 3.5822e-04 | norm: 0.3074 | dt: 1236.89ms | tok/sec: 423875.17 +step 1549 | loss: 3.880839 | lr 3.5775e-04 | norm: 0.2820 | dt: 1960.72ms | tok/sec: 267396.06 +validation loss: 3.8619 +validation perplexity: 47.5536 +step 1550 | loss: 3.818285 | lr 3.5728e-04 | norm: 0.3416 | dt: 2670.86ms | tok/sec: 196299.60 +step 1551 | loss: 3.852146 | lr 3.5680e-04 | norm: 0.3636 | dt: 1236.85ms | tok/sec: 423891.18 +step 1552 | loss: 3.867922 | lr 3.5633e-04 | norm: 0.3027 | dt: 1236.33ms | tok/sec: 424068.98 +step 1553 | loss: 3.840419 | lr 3.5586e-04 | norm: 0.3625 | dt: 1233.68ms | tok/sec: 424978.76 +step 1554 | loss: 3.890976 | lr 3.5539e-04 | norm: 0.3042 | dt: 1235.88ms | tok/sec: 424222.21 +step 1555 | loss: 3.823045 | lr 3.5491e-04 | norm: 0.3672 | dt: 1234.29ms | tok/sec: 424770.41 +step 1556 | loss: 3.867264 | lr 3.5444e-04 | norm: 0.2867 | dt: 1237.94ms | tok/sec: 423514.99 +step 1557 | loss: 3.809199 | lr 3.5397e-04 | norm: 0.3450 | dt: 1237.81ms | tok/sec: 423560.02 +step 1558 | loss: 3.846154 | lr 3.5349e-04 | norm: 0.3078 | dt: 1238.29ms | tok/sec: 423396.51 +step 1559 | loss: 3.867592 | lr 3.5302e-04 | norm: 0.3297 | dt: 1236.31ms | tok/sec: 424073.88 +step 1560 | loss: 3.892067 | lr 3.5255e-04 | norm: 0.3429 | dt: 1235.54ms | tok/sec: 424339.68 +step 1561 | loss: 3.845462 | lr 3.5207e-04 | norm: 0.3377 | dt: 1235.18ms | tok/sec: 424464.50 +step 1562 | loss: 3.835179 | lr 3.5160e-04 | norm: 0.3207 | dt: 1235.19ms | tok/sec: 424457.95 +step 1563 | loss: 3.945080 | lr 3.5112e-04 | norm: 0.3953 | dt: 1238.51ms | tok/sec: 423321.93 +step 1564 | loss: 3.835270 | lr 3.5065e-04 | norm: 0.3724 | dt: 1232.87ms | tok/sec: 425256.87 +step 1565 | loss: 3.852586 | lr 3.5018e-04 | norm: 0.3570 | dt: 1235.77ms | tok/sec: 424258.87 +step 1566 | loss: 3.872939 | lr 3.4970e-04 | norm: 0.3643 | dt: 1238.71ms | tok/sec: 423253.66 +step 1567 | loss: 3.880657 | lr 3.4923e-04 | norm: 0.3671 | dt: 1238.79ms | tok/sec: 423224.33 +step 1568 | loss: 3.903907 | lr 3.4876e-04 | norm: 0.3687 | dt: 1239.03ms | tok/sec: 423143.22 +step 1569 | loss: 4.082086 | lr 3.4828e-04 | norm: 0.4629 | dt: 1236.16ms | tok/sec: 424125.74 +step 1570 | loss: 3.914077 | lr 3.4781e-04 | norm: 0.4564 | dt: 1237.49ms | tok/sec: 423670.43 +step 1571 | loss: 3.856668 | lr 3.4733e-04 | norm: 0.3973 | dt: 1232.68ms | tok/sec: 425325.06 +step 1572 | loss: 4.084485 | lr 3.4686e-04 | norm: 0.4285 | dt: 1234.33ms | tok/sec: 424754.49 +step 1573 | loss: 3.877611 | lr 3.4638e-04 | norm: 0.4775 | dt: 1235.33ms | tok/sec: 424410.27 +step 1574 | loss: 3.854797 | lr 3.4591e-04 | norm: 0.3771 | dt: 1236.17ms | tok/sec: 424123.78 +validation loss: 3.8676 +validation perplexity: 47.8256 +step 1575 | loss: 3.863677 | lr 3.4544e-04 | norm: 0.3498 | dt: 2694.17ms | tok/sec: 194601.24 +step 1576 | loss: 3.883717 | lr 3.4496e-04 | norm: 0.3867 | dt: 1236.88ms | tok/sec: 423877.95 +step 1577 | loss: 3.871484 | lr 3.4449e-04 | norm: 0.3201 | dt: 1234.29ms | tok/sec: 424768.36 +step 1578 | loss: 3.888700 | lr 3.4401e-04 | norm: 0.3194 | dt: 1236.18ms | tok/sec: 424121.16 +step 1579 | loss: 3.880329 | lr 3.4354e-04 | norm: 0.3042 | dt: 1233.07ms | tok/sec: 425190.76 +step 1580 | loss: 3.813730 | lr 3.4306e-04 | norm: 0.3474 | dt: 1235.57ms | tok/sec: 424330.01 +step 1581 | loss: 3.867652 | lr 3.4259e-04 | norm: 0.3252 | dt: 1233.29ms | tok/sec: 425111.60 +step 1582 | loss: 3.910725 | lr 3.4211e-04 | norm: 0.3314 | dt: 1234.34ms | tok/sec: 424750.23 +step 1583 | loss: 3.869384 | lr 3.4164e-04 | norm: 0.3116 | dt: 1236.63ms | tok/sec: 423964.24 +step 1584 | loss: 3.903682 | lr 3.4116e-04 | norm: 0.3155 | dt: 1234.84ms | tok/sec: 424579.73 +step 1585 | loss: 3.876907 | lr 3.4069e-04 | norm: 0.3099 | dt: 1232.40ms | tok/sec: 425421.24 +step 1586 | loss: 3.869014 | lr 3.4021e-04 | norm: 0.3363 | dt: 1235.69ms | tok/sec: 424288.75 +step 1587 | loss: 3.848519 | lr 3.3974e-04 | norm: 0.3222 | dt: 1232.12ms | tok/sec: 425518.63 +step 1588 | loss: 3.958912 | lr 3.3926e-04 | norm: 0.3095 | dt: 1233.91ms | tok/sec: 424900.01 +step 1589 | loss: 3.851819 | lr 3.3879e-04 | norm: 0.3129 | dt: 1235.25ms | tok/sec: 424440.25 +step 1590 | loss: 3.829383 | lr 3.3831e-04 | norm: 0.2998 | dt: 1231.06ms | tok/sec: 425884.45 +step 1591 | loss: 3.768692 | lr 3.3784e-04 | norm: 0.3552 | dt: 1235.26ms | tok/sec: 424433.78 +step 1592 | loss: 3.829514 | lr 3.3736e-04 | norm: 0.3283 | dt: 1235.98ms | tok/sec: 424186.61 +step 1593 | loss: 3.920655 | lr 3.3689e-04 | norm: 0.3115 | dt: 1237.81ms | tok/sec: 423561.74 +step 1594 | loss: 3.836524 | lr 3.3641e-04 | norm: 0.2849 | dt: 1236.05ms | tok/sec: 424163.37 +step 1595 | loss: 3.809465 | lr 3.3594e-04 | norm: 0.3240 | dt: 1234.70ms | tok/sec: 424629.33 +step 1596 | loss: 3.892855 | lr 3.3546e-04 | norm: 0.2750 | dt: 1237.83ms | tok/sec: 423553.90 +step 1597 | loss: 3.832104 | lr 3.3499e-04 | norm: 0.3336 | dt: 1235.46ms | tok/sec: 424367.19 +step 1598 | loss: 3.782779 | lr 3.3451e-04 | norm: 0.2794 | dt: 1235.27ms | tok/sec: 424432.80 +step 1599 | loss: 3.825098 | lr 3.3404e-04 | norm: 0.3020 | dt: 1232.22ms | tok/sec: 425481.50 +validation loss: 3.8600 +validation perplexity: 47.4650 +step 1600 | loss: 3.888232 | lr 3.3356e-04 | norm: 0.2950 | dt: 2680.12ms | tok/sec: 195621.41 +step 1601 | loss: 3.823087 | lr 3.3309e-04 | norm: 0.3472 | dt: 1237.42ms | tok/sec: 423694.60 +step 1602 | loss: 3.883584 | lr 3.3261e-04 | norm: 0.3164 | dt: 1235.66ms | tok/sec: 424299.56 +step 1603 | loss: 3.847507 | lr 3.3214e-04 | norm: 0.3161 | dt: 1237.94ms | tok/sec: 423517.20 +step 1604 | loss: 3.854513 | lr 3.3166e-04 | norm: 0.3181 | dt: 1233.72ms | tok/sec: 424966.36 +step 1605 | loss: 3.889433 | lr 3.3119e-04 | norm: 0.3058 | dt: 1234.00ms | tok/sec: 424868.81 +step 1606 | loss: 3.862376 | lr 3.3071e-04 | norm: 0.2804 | dt: 1235.24ms | tok/sec: 424441.81 +step 1607 | loss: 3.920967 | lr 3.3024e-04 | norm: 0.3266 | dt: 1233.36ms | tok/sec: 425087.85 +step 1608 | loss: 3.880018 | lr 3.2976e-04 | norm: 0.3047 | dt: 1237.15ms | tok/sec: 423788.09 +step 1609 | loss: 3.896006 | lr 3.2929e-04 | norm: 0.2797 | dt: 1237.31ms | tok/sec: 423731.09 +step 1610 | loss: 3.915513 | lr 3.2881e-04 | norm: 0.3379 | dt: 1236.94ms | tok/sec: 423858.83 +step 1611 | loss: 3.774094 | lr 3.2834e-04 | norm: 0.3160 | dt: 1235.32ms | tok/sec: 424413.30 +step 1612 | loss: 3.834783 | lr 3.2786e-04 | norm: 0.3041 | dt: 1235.92ms | tok/sec: 424208.87 +step 1613 | loss: 3.896645 | lr 3.2739e-04 | norm: 0.3418 | dt: 1236.14ms | tok/sec: 424132.61 +step 1614 | loss: 3.847695 | lr 3.2691e-04 | norm: 0.3013 | dt: 1236.98ms | tok/sec: 423844.53 +step 1615 | loss: 3.883897 | lr 3.2644e-04 | norm: 0.2950 | dt: 1233.58ms | tok/sec: 425012.35 +step 1616 | loss: 3.910768 | lr 3.2596e-04 | norm: 0.3224 | dt: 1236.17ms | tok/sec: 424123.86 +step 1617 | loss: 3.928158 | lr 3.2549e-04 | norm: 0.3040 | dt: 1238.69ms | tok/sec: 423260.99 +step 1618 | loss: 3.873620 | lr 3.2501e-04 | norm: 0.3204 | dt: 1235.68ms | tok/sec: 424290.88 +step 1619 | loss: 3.869750 | lr 3.2454e-04 | norm: 0.3034 | dt: 1233.41ms | tok/sec: 425071.01 +step 1620 | loss: 3.867470 | lr 3.2406e-04 | norm: 0.3016 | dt: 1236.91ms | tok/sec: 423868.63 +step 1621 | loss: 3.862906 | lr 3.2359e-04 | norm: 0.2996 | dt: 1235.63ms | tok/sec: 424309.79 +step 1622 | loss: 3.908209 | lr 3.2311e-04 | norm: 0.3212 | dt: 1233.68ms | tok/sec: 424978.51 +step 1623 | loss: 3.794762 | lr 3.2264e-04 | norm: 0.3358 | dt: 1238.26ms | tok/sec: 423408.17 +step 1624 | loss: 3.872967 | lr 3.2216e-04 | norm: 0.3039 | dt: 1236.66ms | tok/sec: 423954.84 +validation loss: 3.8549 +validation perplexity: 47.2257 +step 1625 | loss: 3.880852 | lr 3.2169e-04 | norm: 0.3638 | dt: 2678.07ms | tok/sec: 195771.18 +step 1626 | loss: 3.813070 | lr 3.2121e-04 | norm: 0.3302 | dt: 1235.59ms | tok/sec: 424322.40 +step 1627 | loss: 3.856266 | lr 3.2074e-04 | norm: 0.3534 | dt: 1233.18ms | tok/sec: 425151.22 +step 1628 | loss: 3.835179 | lr 3.2026e-04 | norm: 0.3865 | dt: 1233.07ms | tok/sec: 425187.88 +step 1629 | loss: 3.831539 | lr 3.1979e-04 | norm: 0.3552 | dt: 1234.96ms | tok/sec: 424539.65 +step 1630 | loss: 3.900082 | lr 3.1931e-04 | norm: 0.3468 | dt: 1233.81ms | tok/sec: 424933.18 +step 1631 | loss: 3.802429 | lr 3.1884e-04 | norm: 0.3856 | dt: 1233.03ms | tok/sec: 425202.35 +step 1632 | loss: 3.796980 | lr 3.1836e-04 | norm: 0.3051 | dt: 1234.60ms | tok/sec: 424661.81 +step 1633 | loss: 3.874418 | lr 3.1789e-04 | norm: 0.3505 | dt: 1235.29ms | tok/sec: 424426.49 +step 1634 | loss: 3.818348 | lr 3.1741e-04 | norm: 0.3300 | dt: 1235.90ms | tok/sec: 424215.50 +step 1635 | loss: 3.861767 | lr 3.1694e-04 | norm: 0.3666 | dt: 1236.63ms | tok/sec: 423964.24 +step 1636 | loss: 3.835673 | lr 3.1646e-04 | norm: 0.3148 | dt: 1235.48ms | tok/sec: 424361.46 +step 1637 | loss: 3.817015 | lr 3.1599e-04 | norm: 0.3671 | dt: 1235.80ms | tok/sec: 424250.93 +step 1638 | loss: 3.885756 | lr 3.1551e-04 | norm: 0.3862 | dt: 1237.26ms | tok/sec: 423750.44 +step 1639 | loss: 3.862134 | lr 3.1504e-04 | norm: 0.4250 | dt: 1235.17ms | tok/sec: 424466.14 +step 1640 | loss: 3.849824 | lr 3.1456e-04 | norm: 0.3155 | dt: 1238.25ms | tok/sec: 423409.72 +step 1641 | loss: 3.906787 | lr 3.1409e-04 | norm: 0.3599 | dt: 1234.36ms | tok/sec: 424745.14 +step 1642 | loss: 3.871882 | lr 3.1362e-04 | norm: 0.3365 | dt: 1236.59ms | tok/sec: 423978.55 +step 1643 | loss: 4.002625 | lr 3.1314e-04 | norm: 0.3422 | dt: 1235.03ms | tok/sec: 424513.83 +step 1644 | loss: 3.896419 | lr 3.1267e-04 | norm: 0.3695 | dt: 1240.20ms | tok/sec: 422743.16 +step 1645 | loss: 3.871373 | lr 3.1219e-04 | norm: 0.3547 | dt: 1233.48ms | tok/sec: 425046.85 +step 1646 | loss: 3.818641 | lr 3.1172e-04 | norm: 0.3014 | dt: 1237.16ms | tok/sec: 423783.68 +step 1647 | loss: 3.910496 | lr 3.1124e-04 | norm: 0.3503 | dt: 1232.30ms | tok/sec: 425453.34 +step 1648 | loss: 3.905368 | lr 3.1077e-04 | norm: 0.3497 | dt: 1234.36ms | tok/sec: 424744.57 +step 1649 | loss: 3.884148 | lr 3.1030e-04 | norm: 0.3168 | dt: 1238.13ms | tok/sec: 423450.57 +validation loss: 3.8500 +validation perplexity: 46.9946 +step 1650 | loss: 3.873799 | lr 3.0982e-04 | norm: 0.3964 | dt: 2681.44ms | tok/sec: 195524.59 +step 1651 | loss: 3.895843 | lr 3.0935e-04 | norm: 0.2937 | dt: 1234.90ms | tok/sec: 424557.76 +step 1652 | loss: 3.905673 | lr 3.0888e-04 | norm: 0.3036 | dt: 1239.05ms | tok/sec: 423138.66 +step 1653 | loss: 3.844843 | lr 3.0840e-04 | norm: 0.3153 | dt: 1233.47ms | tok/sec: 425052.36 +step 1654 | loss: 3.839769 | lr 3.0793e-04 | norm: 0.3070 | dt: 1236.16ms | tok/sec: 424125.25 +step 1655 | loss: 3.877319 | lr 3.0745e-04 | norm: 0.2956 | dt: 1237.21ms | tok/sec: 423766.77 +step 1656 | loss: 3.919524 | lr 3.0698e-04 | norm: 0.3284 | dt: 1238.31ms | tok/sec: 423390.07 +step 1657 | loss: 3.904485 | lr 3.0651e-04 | norm: 0.3416 | dt: 1239.22ms | tok/sec: 423079.72 +step 1658 | loss: 3.793278 | lr 3.0603e-04 | norm: 0.2900 | dt: 1237.41ms | tok/sec: 423699.09 +step 1659 | loss: 3.852717 | lr 3.0556e-04 | norm: 0.2856 | dt: 1235.22ms | tok/sec: 424448.12 +step 1660 | loss: 3.906315 | lr 3.0509e-04 | norm: 0.2780 | dt: 1236.80ms | tok/sec: 423905.65 +step 1661 | loss: 3.848639 | lr 3.0461e-04 | norm: 0.3055 | dt: 1235.39ms | tok/sec: 424389.71 +step 1662 | loss: 3.877108 | lr 3.0414e-04 | norm: 0.2885 | dt: 1235.68ms | tok/sec: 424290.31 +step 1663 | loss: 3.799121 | lr 3.0367e-04 | norm: 0.2863 | dt: 1238.22ms | tok/sec: 423422.11 +step 1664 | loss: 3.867347 | lr 3.0320e-04 | norm: 0.3338 | dt: 1238.51ms | tok/sec: 423321.12 +step 1665 | loss: 3.820493 | lr 3.0272e-04 | norm: 0.2671 | dt: 1235.38ms | tok/sec: 424392.58 +step 1666 | loss: 3.803518 | lr 3.0225e-04 | norm: 0.3149 | dt: 1235.56ms | tok/sec: 424331.32 +step 1667 | loss: 3.844409 | lr 3.0178e-04 | norm: 0.2935 | dt: 1236.15ms | tok/sec: 424129.99 +step 1668 | loss: 3.805916 | lr 3.0130e-04 | norm: 0.2597 | dt: 1233.85ms | tok/sec: 424920.37 +step 1669 | loss: 3.830439 | lr 3.0083e-04 | norm: 0.2919 | dt: 1236.61ms | tok/sec: 423973.07 +step 1670 | loss: 3.887572 | lr 3.0036e-04 | norm: 0.2793 | dt: 1239.21ms | tok/sec: 423084.11 +step 1671 | loss: 3.885050 | lr 2.9989e-04 | norm: 0.2994 | dt: 1238.43ms | tok/sec: 423349.24 +step 1672 | loss: 3.828027 | lr 2.9942e-04 | norm: 0.2744 | dt: 1235.24ms | tok/sec: 424443.37 +step 1673 | loss: 3.865273 | lr 2.9894e-04 | norm: 0.3019 | dt: 1238.80ms | tok/sec: 423221.56 +step 1674 | loss: 3.823120 | lr 2.9847e-04 | norm: 0.3343 | dt: 1237.02ms | tok/sec: 423829.99 +validation loss: 3.8468 +validation perplexity: 46.8422 +step 1675 | loss: 3.842909 | lr 2.9800e-04 | norm: 0.2967 | dt: 2680.51ms | tok/sec: 195592.38 +step 1676 | loss: 3.849490 | lr 2.9753e-04 | norm: 0.2906 | dt: 1235.89ms | tok/sec: 424217.87 +step 1677 | loss: 3.840965 | lr 2.9706e-04 | norm: 0.3044 | dt: 1236.73ms | tok/sec: 423932.04 +step 1678 | loss: 3.868639 | lr 2.9658e-04 | norm: 0.2915 | dt: 1234.12ms | tok/sec: 424828.76 +step 1679 | loss: 3.813670 | lr 2.9611e-04 | norm: 0.2856 | dt: 1237.39ms | tok/sec: 423706.11 +step 1680 | loss: 3.845132 | lr 2.9564e-04 | norm: 0.2928 | dt: 1235.87ms | tok/sec: 424227.12 +step 1681 | loss: 3.822594 | lr 2.9517e-04 | norm: 0.3281 | dt: 1235.86ms | tok/sec: 424230.72 +step 1682 | loss: 3.896034 | lr 2.9470e-04 | norm: 0.3126 | dt: 1235.51ms | tok/sec: 424350.89 +step 1683 | loss: 3.832432 | lr 2.9423e-04 | norm: 0.3571 | dt: 1234.90ms | tok/sec: 424557.84 +step 1684 | loss: 3.843688 | lr 2.9376e-04 | norm: 0.2978 | dt: 1236.85ms | tok/sec: 423889.47 +step 1685 | loss: 3.864978 | lr 2.9329e-04 | norm: 0.3657 | dt: 1237.02ms | tok/sec: 423830.07 +step 1686 | loss: 3.889225 | lr 2.9282e-04 | norm: 0.3102 | dt: 1233.30ms | tok/sec: 425108.32 +step 1687 | loss: 3.873170 | lr 2.9234e-04 | norm: 0.3180 | dt: 1232.90ms | tok/sec: 425248.65 +step 1688 | loss: 3.905135 | lr 2.9187e-04 | norm: 0.3203 | dt: 1232.78ms | tok/sec: 425289.68 +step 1689 | loss: 3.877549 | lr 2.9140e-04 | norm: 0.3077 | dt: 1237.84ms | tok/sec: 423550.31 +step 1690 | loss: 3.839575 | lr 2.9093e-04 | norm: 0.3087 | dt: 1234.19ms | tok/sec: 424804.30 +step 1691 | loss: 3.838000 | lr 2.9046e-04 | norm: 0.3143 | dt: 1238.81ms | tok/sec: 423219.04 +step 1692 | loss: 3.861683 | lr 2.8999e-04 | norm: 0.3207 | dt: 1234.08ms | tok/sec: 424840.49 +step 1693 | loss: 3.822584 | lr 2.8952e-04 | norm: 0.3048 | dt: 1232.24ms | tok/sec: 425474.01 +step 1694 | loss: 3.840883 | lr 2.8905e-04 | norm: 0.3029 | dt: 1234.53ms | tok/sec: 424684.61 +step 1695 | loss: 3.861099 | lr 2.8858e-04 | norm: 0.3085 | dt: 1234.47ms | tok/sec: 424707.65 +step 1696 | loss: 3.809892 | lr 2.8811e-04 | norm: 0.3222 | dt: 1235.86ms | tok/sec: 424227.85 +step 1697 | loss: 3.822805 | lr 2.8765e-04 | norm: 0.3208 | dt: 1233.88ms | tok/sec: 424908.88 +step 1698 | loss: 3.854577 | lr 2.8718e-04 | norm: 0.2683 | dt: 1232.22ms | tok/sec: 425481.50 +step 1699 | loss: 3.831947 | lr 2.8671e-04 | norm: 0.3141 | dt: 1236.12ms | tok/sec: 424139.65 +validation loss: 3.8436 +validation perplexity: 46.6947 +step 1700 | loss: 3.850524 | lr 2.8624e-04 | norm: 0.2897 | dt: 2691.17ms | tok/sec: 194817.97 +step 1701 | loss: 3.782125 | lr 2.8577e-04 | norm: 0.3100 | dt: 1233.96ms | tok/sec: 424883.92 +step 1702 | loss: 3.821759 | lr 2.8530e-04 | norm: 0.2847 | dt: 1235.80ms | tok/sec: 424250.36 +step 1703 | loss: 3.833602 | lr 2.8483e-04 | norm: 0.2798 | dt: 1237.00ms | tok/sec: 423837.42 +step 1704 | loss: 3.847374 | lr 2.8436e-04 | norm: 0.2962 | dt: 1236.77ms | tok/sec: 423916.84 +step 1705 | loss: 3.931981 | lr 2.8390e-04 | norm: 0.3318 | dt: 1233.52ms | tok/sec: 425032.48 +step 1706 | loss: 3.859510 | lr 2.8343e-04 | norm: 0.3499 | dt: 1235.34ms | tok/sec: 424407.41 +step 1707 | loss: 3.800257 | lr 2.8296e-04 | norm: 0.3311 | dt: 1239.28ms | tok/sec: 423059.69 +step 1708 | loss: 3.839341 | lr 2.8249e-04 | norm: 0.3326 | dt: 1237.61ms | tok/sec: 423630.36 +step 1709 | loss: 3.771966 | lr 2.8202e-04 | norm: 0.3140 | dt: 1233.56ms | tok/sec: 425019.99 +step 1710 | loss: 3.814546 | lr 2.8156e-04 | norm: 0.3285 | dt: 1235.42ms | tok/sec: 424380.21 +step 1711 | loss: 3.815383 | lr 2.8109e-04 | norm: 0.3198 | dt: 1236.22ms | tok/sec: 424105.37 +step 1712 | loss: 3.799691 | lr 2.8062e-04 | norm: 0.3612 | dt: 1233.57ms | tok/sec: 425017.77 +step 1713 | loss: 3.806999 | lr 2.8015e-04 | norm: 0.3444 | dt: 1238.44ms | tok/sec: 423346.63 +step 1714 | loss: 3.834282 | lr 2.7969e-04 | norm: 0.2952 | dt: 1235.42ms | tok/sec: 424381.44 +step 1715 | loss: 3.776262 | lr 2.7922e-04 | norm: 0.3302 | dt: 1236.65ms | tok/sec: 423957.05 +step 1716 | loss: 3.799962 | lr 2.7875e-04 | norm: 0.3157 | dt: 1237.94ms | tok/sec: 423517.93 +step 1717 | loss: 3.805037 | lr 2.7829e-04 | norm: 0.3225 | dt: 1235.94ms | tok/sec: 424201.67 +step 1718 | loss: 3.859638 | lr 2.7782e-04 | norm: 0.3339 | dt: 1238.27ms | tok/sec: 423403.60 +step 1719 | loss: 3.850246 | lr 2.7735e-04 | norm: 0.3336 | dt: 1234.37ms | tok/sec: 424741.20 +step 1720 | loss: 3.859085 | lr 2.7689e-04 | norm: 0.3162 | dt: 1233.89ms | tok/sec: 424907.64 +step 1721 | loss: 3.841967 | lr 2.7642e-04 | norm: 0.3006 | dt: 1236.72ms | tok/sec: 423934.49 +step 1722 | loss: 3.834100 | lr 2.7596e-04 | norm: 0.3046 | dt: 1232.68ms | tok/sec: 425323.57 +step 1723 | loss: 3.822785 | lr 2.7549e-04 | norm: 0.3837 | dt: 1237.75ms | tok/sec: 423581.81 +step 1724 | loss: 3.865575 | lr 2.7503e-04 | norm: 0.3500 | dt: 1236.18ms | tok/sec: 424118.38 +validation loss: 3.8426 +validation perplexity: 46.6479 +step 1725 | loss: 3.877557 | lr 2.7456e-04 | norm: 0.3601 | dt: 2692.26ms | tok/sec: 194738.71 +step 1726 | loss: 3.854742 | lr 2.7410e-04 | norm: 0.3308 | dt: 1235.12ms | tok/sec: 424483.59 +step 1727 | loss: 3.907625 | lr 2.7363e-04 | norm: 0.3354 | dt: 1232.73ms | tok/sec: 425304.90 +step 1728 | loss: 3.918640 | lr 2.7317e-04 | norm: 0.3198 | dt: 1236.43ms | tok/sec: 424035.04 +step 1729 | loss: 3.841780 | lr 2.7270e-04 | norm: 0.2942 | dt: 1238.77ms | tok/sec: 423232.72 +step 1730 | loss: 3.876993 | lr 2.7224e-04 | norm: 0.3438 | dt: 1233.80ms | tok/sec: 424937.70 +step 1731 | loss: 3.807574 | lr 2.7177e-04 | norm: 0.3816 | dt: 1233.40ms | tok/sec: 425076.84 +step 1732 | loss: 3.911800 | lr 2.7131e-04 | norm: 0.3030 | dt: 1234.62ms | tok/sec: 424656.31 +step 1733 | loss: 3.846801 | lr 2.7085e-04 | norm: 0.3154 | dt: 1236.43ms | tok/sec: 424035.04 +step 1734 | loss: 3.803742 | lr 2.7038e-04 | norm: 0.2910 | dt: 1234.67ms | tok/sec: 424637.12 +step 1735 | loss: 3.857607 | lr 2.6992e-04 | norm: 0.2923 | dt: 1235.13ms | tok/sec: 424479.83 +step 1736 | loss: 3.838449 | lr 2.6946e-04 | norm: 0.2650 | dt: 1236.41ms | tok/sec: 424042.16 +step 1737 | loss: 3.830353 | lr 2.6899e-04 | norm: 0.2978 | dt: 1236.52ms | tok/sec: 424003.56 +step 1738 | loss: 3.800393 | lr 2.6853e-04 | norm: 0.2925 | dt: 1234.70ms | tok/sec: 424627.45 +step 1739 | loss: 3.857622 | lr 2.6807e-04 | norm: 0.2794 | dt: 1862.36ms | tok/sec: 281518.69 +step 1740 | loss: 3.818552 | lr 2.6761e-04 | norm: 0.3220 | dt: 1230.22ms | tok/sec: 426173.57 +step 1741 | loss: 3.808908 | lr 2.6714e-04 | norm: 0.2625 | dt: 1236.29ms | tok/sec: 424081.82 +step 1742 | loss: 3.773061 | lr 2.6668e-04 | norm: 0.3408 | dt: 1237.05ms | tok/sec: 423821.66 +step 1743 | loss: 3.808598 | lr 2.6622e-04 | norm: 0.2991 | dt: 1230.99ms | tok/sec: 425906.22 +step 1744 | loss: 3.815270 | lr 2.6576e-04 | norm: 0.2995 | dt: 1232.32ms | tok/sec: 425449.23 +step 1745 | loss: 3.882179 | lr 2.6530e-04 | norm: 0.3197 | dt: 1235.17ms | tok/sec: 424467.95 +step 1746 | loss: 3.762290 | lr 2.6483e-04 | norm: 0.3026 | dt: 1235.88ms | tok/sec: 424222.29 +step 1747 | loss: 3.835375 | lr 2.6437e-04 | norm: 0.3126 | dt: 1234.70ms | tok/sec: 424629.33 +step 1748 | loss: 3.854746 | lr 2.6391e-04 | norm: 0.2998 | dt: 1234.41ms | tok/sec: 424727.50 +step 1749 | loss: 3.858202 | lr 2.6345e-04 | norm: 0.3489 | dt: 1236.29ms | tok/sec: 424080.26 +validation loss: 3.8371 +validation perplexity: 46.3910 +step 1750 | loss: 3.824224 | lr 2.6299e-04 | norm: 0.3177 | dt: 2688.24ms | tok/sec: 195029.89 +step 1751 | loss: 3.809885 | lr 2.6253e-04 | norm: 0.3179 | dt: 1234.74ms | tok/sec: 424613.67 +step 1752 | loss: 3.839734 | lr 2.6207e-04 | norm: 0.2912 | dt: 1234.63ms | tok/sec: 424650.32 +step 1753 | loss: 3.775031 | lr 2.6161e-04 | norm: 0.3075 | dt: 1236.54ms | tok/sec: 423996.86 +step 1754 | loss: 3.890182 | lr 2.6115e-04 | norm: 0.2895 | dt: 1236.24ms | tok/sec: 424100.30 +step 1755 | loss: 3.847951 | lr 2.6069e-04 | norm: 0.2941 | dt: 1235.76ms | tok/sec: 424264.36 +step 1756 | loss: 3.942807 | lr 2.6023e-04 | norm: 0.2998 | dt: 1237.31ms | tok/sec: 423731.66 +step 1757 | loss: 3.846874 | lr 2.5977e-04 | norm: 0.3053 | dt: 1233.15ms | tok/sec: 425161.66 +step 1758 | loss: 3.838305 | lr 2.5932e-04 | norm: 0.2700 | dt: 1234.06ms | tok/sec: 424847.72 +step 1759 | loss: 3.831285 | lr 2.5886e-04 | norm: 0.2968 | dt: 1233.40ms | tok/sec: 425075.86 +step 1760 | loss: 3.867056 | lr 2.5840e-04 | norm: 0.3485 | dt: 1234.32ms | tok/sec: 424758.93 +step 1761 | loss: 3.841832 | lr 2.5794e-04 | norm: 0.2709 | dt: 1234.73ms | tok/sec: 424616.21 +step 1762 | loss: 3.807040 | lr 2.5748e-04 | norm: 0.3260 | dt: 1236.26ms | tok/sec: 424091.71 +step 1763 | loss: 3.883086 | lr 2.5703e-04 | norm: 0.3052 | dt: 1238.09ms | tok/sec: 423466.22 +step 1764 | loss: 3.851336 | lr 2.5657e-04 | norm: 0.3068 | dt: 1235.70ms | tok/sec: 424284.74 +step 1765 | loss: 3.828597 | lr 2.5611e-04 | norm: 0.2970 | dt: 1237.40ms | tok/sec: 423702.35 +step 1766 | loss: 3.814972 | lr 2.5565e-04 | norm: 0.2623 | dt: 1234.32ms | tok/sec: 424759.58 +step 1767 | loss: 3.771002 | lr 2.5520e-04 | norm: 0.2752 | dt: 1239.04ms | tok/sec: 423139.39 +step 1768 | loss: 3.869025 | lr 2.5474e-04 | norm: 0.2980 | dt: 1234.06ms | tok/sec: 424848.78 +step 1769 | loss: 3.802565 | lr 2.5428e-04 | norm: 0.2884 | dt: 1238.54ms | tok/sec: 423310.28 +step 1770 | loss: 3.846778 | lr 2.5383e-04 | norm: 0.2962 | dt: 1234.17ms | tok/sec: 424809.47 +step 1771 | loss: 3.839330 | lr 2.5337e-04 | norm: 0.2531 | dt: 1235.10ms | tok/sec: 424491.79 +step 1772 | loss: 3.789363 | lr 2.5292e-04 | norm: 0.3025 | dt: 1236.84ms | tok/sec: 423894.45 +step 1773 | loss: 3.771961 | lr 2.5246e-04 | norm: 0.2779 | dt: 1234.37ms | tok/sec: 424740.38 +step 1774 | loss: 3.860578 | lr 2.5201e-04 | norm: 0.3234 | dt: 1237.10ms | tok/sec: 423804.75 +validation loss: 3.8345 +validation perplexity: 46.2720 +step 1775 | loss: 3.843336 | lr 2.5155e-04 | norm: 0.2824 | dt: 2689.72ms | tok/sec: 194923.12 +step 1776 | loss: 3.786002 | lr 2.5110e-04 | norm: 0.3027 | dt: 1236.14ms | tok/sec: 424133.92 +step 1777 | loss: 3.815645 | lr 2.5064e-04 | norm: 0.2903 | dt: 1236.76ms | tok/sec: 423920.11 +step 1778 | loss: 3.820933 | lr 2.5019e-04 | norm: 0.3033 | dt: 1236.15ms | tok/sec: 424128.28 +step 1779 | loss: 3.819179 | lr 2.4974e-04 | norm: 0.2906 | dt: 1238.55ms | tok/sec: 423307.27 +step 1780 | loss: 3.790527 | lr 2.4928e-04 | norm: 0.3208 | dt: 1235.35ms | tok/sec: 424404.78 +step 1781 | loss: 3.920253 | lr 2.4883e-04 | norm: 0.3009 | dt: 1232.92ms | tok/sec: 425240.18 +step 1782 | loss: 3.832412 | lr 2.4838e-04 | norm: 0.3172 | dt: 1232.40ms | tok/sec: 425419.35 +step 1783 | loss: 3.819323 | lr 2.4792e-04 | norm: 0.2729 | dt: 1230.63ms | tok/sec: 426031.64 +step 1784 | loss: 3.861099 | lr 2.4747e-04 | norm: 0.3529 | dt: 1235.06ms | tok/sec: 424504.24 +step 1785 | loss: 3.803337 | lr 2.4702e-04 | norm: 0.2798 | dt: 1235.79ms | tok/sec: 424252.65 +step 1786 | loss: 3.818172 | lr 2.4657e-04 | norm: 0.3292 | dt: 1233.86ms | tok/sec: 424917.25 +step 1787 | loss: 3.827141 | lr 2.4611e-04 | norm: 0.2853 | dt: 1236.46ms | tok/sec: 424022.29 +step 1788 | loss: 3.864768 | lr 2.4566e-04 | norm: 0.3462 | dt: 1232.79ms | tok/sec: 425284.50 +step 1789 | loss: 3.859549 | lr 2.4521e-04 | norm: 0.3023 | dt: 1231.08ms | tok/sec: 425876.86 +step 1790 | loss: 3.875974 | lr 2.4476e-04 | norm: 0.3306 | dt: 1236.57ms | tok/sec: 423984.27 +step 1791 | loss: 3.991437 | lr 2.4431e-04 | norm: 0.3105 | dt: 1235.66ms | tok/sec: 424298.33 +step 1792 | loss: 3.905731 | lr 2.4386e-04 | norm: 0.3060 | dt: 1234.28ms | tok/sec: 424773.20 +step 1793 | loss: 3.881969 | lr 2.4341e-04 | norm: 0.3014 | dt: 1237.19ms | tok/sec: 423772.82 +step 1794 | loss: 3.915488 | lr 2.4296e-04 | norm: 0.3198 | dt: 1238.76ms | tok/sec: 423234.51 +step 1795 | loss: 3.886563 | lr 2.4251e-04 | norm: 0.3303 | dt: 1236.77ms | tok/sec: 423916.02 +step 1796 | loss: 3.823389 | lr 2.4206e-04 | norm: 0.2761 | dt: 1234.76ms | tok/sec: 424608.51 +step 1797 | loss: 3.841432 | lr 2.4161e-04 | norm: 0.2956 | dt: 1236.70ms | tok/sec: 423942.67 +step 1798 | loss: 3.823833 | lr 2.4116e-04 | norm: 0.2913 | dt: 1236.55ms | tok/sec: 423992.04 +step 1799 | loss: 3.823383 | lr 2.4071e-04 | norm: 0.2897 | dt: 1235.04ms | tok/sec: 424510.14 +validation loss: 3.8299 +validation perplexity: 46.0571 +step 1800 | loss: 3.834434 | lr 2.4026e-04 | norm: 0.2773 | dt: 2682.10ms | tok/sec: 195476.34 +step 1801 | loss: 3.892835 | lr 2.3982e-04 | norm: 0.3062 | dt: 1231.62ms | tok/sec: 425689.30 +step 1802 | loss: 3.839844 | lr 2.3937e-04 | norm: 0.2952 | dt: 1233.98ms | tok/sec: 424876.45 +step 1803 | loss: 3.793620 | lr 2.3892e-04 | norm: 0.3249 | dt: 1233.10ms | tok/sec: 425178.59 +step 1804 | loss: 3.764670 | lr 2.3847e-04 | norm: 0.2975 | dt: 1239.17ms | tok/sec: 423095.18 +step 1805 | loss: 3.794467 | lr 2.3803e-04 | norm: 0.3046 | dt: 1235.78ms | tok/sec: 424255.27 +step 1806 | loss: 3.827284 | lr 2.3758e-04 | norm: 0.2897 | dt: 1233.27ms | tok/sec: 425119.33 +step 1807 | loss: 3.774142 | lr 2.3713e-04 | norm: 0.3021 | dt: 1234.38ms | tok/sec: 424736.69 +step 1808 | loss: 3.797175 | lr 2.3669e-04 | norm: 0.2835 | dt: 1233.71ms | tok/sec: 424968.00 +step 1809 | loss: 3.785535 | lr 2.3624e-04 | norm: 0.2943 | dt: 1235.01ms | tok/sec: 424519.82 +step 1810 | loss: 3.846878 | lr 2.3580e-04 | norm: 0.2895 | dt: 1234.65ms | tok/sec: 424644.01 +step 1811 | loss: 3.846099 | lr 2.3535e-04 | norm: 0.2993 | dt: 1235.34ms | tok/sec: 424406.18 +step 1812 | loss: 3.818627 | lr 2.3491e-04 | norm: 0.2914 | dt: 1235.13ms | tok/sec: 424481.05 +step 1813 | loss: 3.849162 | lr 2.3446e-04 | norm: 0.3099 | dt: 1233.96ms | tok/sec: 424881.78 +step 1814 | loss: 3.877408 | lr 2.3402e-04 | norm: 0.3037 | dt: 1233.84ms | tok/sec: 424924.80 +step 1815 | loss: 3.822993 | lr 2.3357e-04 | norm: 0.3166 | dt: 1235.86ms | tok/sec: 424230.88 +step 1816 | loss: 3.814668 | lr 2.3313e-04 | norm: 0.3262 | dt: 1235.56ms | tok/sec: 424331.65 +step 1817 | loss: 3.899853 | lr 2.3269e-04 | norm: 0.3015 | dt: 1237.25ms | tok/sec: 423751.75 +step 1818 | loss: 3.822036 | lr 2.3224e-04 | norm: 0.2726 | dt: 1234.26ms | tok/sec: 424780.09 +step 1819 | loss: 3.831884 | lr 2.3180e-04 | norm: 0.2869 | dt: 1235.01ms | tok/sec: 424520.31 +step 1820 | loss: 3.823700 | lr 2.3136e-04 | norm: 0.3142 | dt: 1234.22ms | tok/sec: 424791.83 +step 1821 | loss: 3.791554 | lr 2.3092e-04 | norm: 0.2822 | dt: 1236.62ms | tok/sec: 423968.50 +step 1822 | loss: 3.834232 | lr 2.3047e-04 | norm: 0.2904 | dt: 1238.66ms | tok/sec: 423270.36 +step 1823 | loss: 3.804581 | lr 2.3003e-04 | norm: 0.3058 | dt: 1232.63ms | tok/sec: 425340.19 +step 1824 | loss: 3.919766 | lr 2.2959e-04 | norm: 0.2954 | dt: 1235.26ms | tok/sec: 424434.76 +validation loss: 3.8265 +validation perplexity: 45.9010 +step 1825 | loss: 3.833828 | lr 2.2915e-04 | norm: 0.2866 | dt: 2683.38ms | tok/sec: 195383.32 +step 1826 | loss: 3.833893 | lr 2.2871e-04 | norm: 0.2922 | dt: 1231.47ms | tok/sec: 425743.29 +step 1827 | loss: 3.897145 | lr 2.2827e-04 | norm: 0.3109 | dt: 1233.05ms | tok/sec: 425196.35 +step 1828 | loss: 3.839289 | lr 2.2783e-04 | norm: 0.2600 | dt: 1232.80ms | tok/sec: 425282.20 +step 1829 | loss: 3.863064 | lr 2.2739e-04 | norm: 0.2982 | dt: 1235.56ms | tok/sec: 424330.83 +step 1830 | loss: 3.856108 | lr 2.2695e-04 | norm: 0.2747 | dt: 1233.16ms | tok/sec: 425157.22 +step 1831 | loss: 3.804722 | lr 2.2651e-04 | norm: 0.2969 | dt: 1234.13ms | tok/sec: 424823.34 +step 1832 | loss: 3.814510 | lr 2.2607e-04 | norm: 0.2633 | dt: 1235.71ms | tok/sec: 424280.56 +step 1833 | loss: 3.848171 | lr 2.2563e-04 | norm: 0.3201 | dt: 1233.68ms | tok/sec: 424978.76 +step 1834 | loss: 3.858899 | lr 2.2520e-04 | norm: 0.3106 | dt: 1235.57ms | tok/sec: 424328.95 +step 1835 | loss: 3.816870 | lr 2.2476e-04 | norm: 0.3602 | dt: 1232.70ms | tok/sec: 425315.18 +step 1836 | loss: 3.819019 | lr 2.2432e-04 | norm: 0.2803 | dt: 1230.95ms | tok/sec: 425921.81 +step 1837 | loss: 3.813051 | lr 2.2388e-04 | norm: 0.3327 | dt: 1233.53ms | tok/sec: 425032.07 +step 1838 | loss: 3.812537 | lr 2.2345e-04 | norm: 0.3071 | dt: 1233.09ms | tok/sec: 425181.55 +step 1839 | loss: 3.758736 | lr 2.2301e-04 | norm: 0.3089 | dt: 1235.91ms | tok/sec: 424211.32 +step 1840 | loss: 3.796674 | lr 2.2257e-04 | norm: 0.2992 | dt: 1233.12ms | tok/sec: 425171.52 +step 1841 | loss: 3.804323 | lr 2.2214e-04 | norm: 0.2979 | dt: 1238.00ms | tok/sec: 423496.56 +step 1842 | loss: 3.804183 | lr 2.2170e-04 | norm: 0.2964 | dt: 1238.61ms | tok/sec: 423287.06 +step 1843 | loss: 3.858470 | lr 2.2127e-04 | norm: 0.2880 | dt: 1232.44ms | tok/sec: 425406.76 +step 1844 | loss: 3.774223 | lr 2.2083e-04 | norm: 0.2814 | dt: 1236.39ms | tok/sec: 424049.02 +step 1845 | loss: 3.761775 | lr 2.2040e-04 | norm: 0.2691 | dt: 1234.43ms | tok/sec: 424719.79 +step 1846 | loss: 3.836716 | lr 2.1996e-04 | norm: 0.2735 | dt: 1232.92ms | tok/sec: 425241.98 +step 1847 | loss: 3.831937 | lr 2.1953e-04 | norm: 0.2524 | dt: 1235.41ms | tok/sec: 424383.65 +step 1848 | loss: 3.879344 | lr 2.1910e-04 | norm: 0.2994 | dt: 1231.53ms | tok/sec: 425720.54 +step 1849 | loss: 3.851722 | lr 2.1866e-04 | norm: 0.2708 | dt: 1233.72ms | tok/sec: 424965.37 +validation loss: 3.8246 +validation perplexity: 45.8154 +step 1850 | loss: 3.764681 | lr 2.1823e-04 | norm: 0.2779 | dt: 2683.45ms | tok/sec: 195378.06 +step 1851 | loss: 3.804662 | lr 2.1780e-04 | norm: 0.2948 | dt: 1239.05ms | tok/sec: 423135.81 +step 1852 | loss: 3.780368 | lr 2.1737e-04 | norm: 0.2645 | dt: 1237.89ms | tok/sec: 423534.16 +step 1853 | loss: 3.849319 | lr 2.1693e-04 | norm: 0.2764 | dt: 1233.64ms | tok/sec: 424991.08 +step 1854 | loss: 3.731425 | lr 2.1650e-04 | norm: 0.2671 | dt: 1236.16ms | tok/sec: 424127.95 +step 1855 | loss: 3.828892 | lr 2.1607e-04 | norm: 0.2740 | dt: 1235.44ms | tok/sec: 424373.58 +step 1856 | loss: 3.882594 | lr 2.1564e-04 | norm: 0.2730 | dt: 1234.93ms | tok/sec: 424547.19 +step 1857 | loss: 3.884936 | lr 2.1521e-04 | norm: 0.3170 | dt: 1235.68ms | tok/sec: 424291.94 +step 1858 | loss: 3.847682 | lr 2.1478e-04 | norm: 0.2593 | dt: 1234.06ms | tok/sec: 424848.05 +step 1859 | loss: 3.896034 | lr 2.1435e-04 | norm: 0.3665 | dt: 1233.63ms | tok/sec: 424995.92 +step 1860 | loss: 3.822355 | lr 2.1392e-04 | norm: 0.3450 | dt: 1234.87ms | tok/sec: 424567.93 +step 1861 | loss: 3.894389 | lr 2.1349e-04 | norm: 0.3293 | dt: 1235.27ms | tok/sec: 424432.96 +step 1862 | loss: 3.838651 | lr 2.1307e-04 | norm: 0.3384 | dt: 1233.97ms | tok/sec: 424879.40 +step 1863 | loss: 3.843348 | lr 2.1264e-04 | norm: 0.4540 | dt: 1235.76ms | tok/sec: 424262.39 +step 1864 | loss: 3.819902 | lr 2.1221e-04 | norm: 0.3310 | dt: 1233.43ms | tok/sec: 425065.59 +step 1865 | loss: 3.890391 | lr 2.1178e-04 | norm: 0.4051 | dt: 1236.02ms | tok/sec: 424172.78 +step 1866 | loss: 3.857492 | lr 2.1136e-04 | norm: 0.2941 | dt: 1235.63ms | tok/sec: 424309.22 +step 1867 | loss: 3.835767 | lr 2.1093e-04 | norm: 0.3667 | dt: 1233.83ms | tok/sec: 424926.36 +step 1868 | loss: 3.828768 | lr 2.1050e-04 | norm: 0.3212 | dt: 1238.20ms | tok/sec: 423428.14 +step 1869 | loss: 3.895271 | lr 2.1008e-04 | norm: 0.3344 | dt: 1233.05ms | tok/sec: 425195.45 +step 1870 | loss: 3.863339 | lr 2.0965e-04 | norm: 0.3223 | dt: 1233.74ms | tok/sec: 424956.75 +step 1871 | loss: 3.865729 | lr 2.0923e-04 | norm: 0.3055 | dt: 1234.89ms | tok/sec: 424562.03 +step 1872 | loss: 3.899481 | lr 2.0880e-04 | norm: 0.3309 | dt: 1234.59ms | tok/sec: 424666.32 +step 1873 | loss: 3.839805 | lr 2.0838e-04 | norm: 0.3037 | dt: 1234.94ms | tok/sec: 424545.63 +step 1874 | loss: 3.772813 | lr 2.0795e-04 | norm: 0.3031 | dt: 1234.71ms | tok/sec: 424623.27 +validation loss: 3.8221 +validation perplexity: 45.6981 +step 1875 | loss: 3.869104 | lr 2.0753e-04 | norm: 0.3108 | dt: 2691.44ms | tok/sec: 194798.59 +step 1876 | loss: 3.809723 | lr 2.0711e-04 | norm: 0.3158 | dt: 1235.63ms | tok/sec: 424309.79 +step 1877 | loss: 3.805526 | lr 2.0668e-04 | norm: 0.3083 | dt: 1235.12ms | tok/sec: 424482.69 +step 1878 | loss: 3.857047 | lr 2.0626e-04 | norm: 0.2995 | dt: 1234.71ms | tok/sec: 424625.56 +step 1879 | loss: 3.761404 | lr 2.0584e-04 | norm: 0.2724 | dt: 1234.66ms | tok/sec: 424640.89 +step 1880 | loss: 3.832197 | lr 2.0542e-04 | norm: 0.2908 | dt: 1236.10ms | tok/sec: 424147.58 +step 1881 | loss: 3.808442 | lr 2.0499e-04 | norm: 0.2912 | dt: 1235.85ms | tok/sec: 424232.27 +step 1882 | loss: 3.843303 | lr 2.0457e-04 | norm: 0.3014 | dt: 1234.40ms | tok/sec: 424730.38 +step 1883 | loss: 3.783145 | lr 2.0415e-04 | norm: 0.3360 | dt: 1232.50ms | tok/sec: 425385.12 +step 1884 | loss: 3.811525 | lr 2.0373e-04 | norm: 0.2767 | dt: 1235.07ms | tok/sec: 424499.00 +step 1885 | loss: 3.837322 | lr 2.0331e-04 | norm: 0.3244 | dt: 1234.17ms | tok/sec: 424810.29 +step 1886 | loss: 3.793513 | lr 2.0289e-04 | norm: 0.3077 | dt: 1233.47ms | tok/sec: 425050.80 +step 1887 | loss: 3.806123 | lr 2.0247e-04 | norm: 0.2945 | dt: 1233.46ms | tok/sec: 425054.25 +step 1888 | loss: 3.817802 | lr 2.0206e-04 | norm: 0.3159 | dt: 1237.05ms | tok/sec: 423821.98 +step 1889 | loss: 3.884900 | lr 2.0164e-04 | norm: 0.3115 | dt: 1233.15ms | tok/sec: 425161.91 +step 1890 | loss: 3.872460 | lr 2.0122e-04 | norm: 0.3309 | dt: 1235.69ms | tok/sec: 424288.51 +step 1891 | loss: 3.812523 | lr 2.0080e-04 | norm: 0.2871 | dt: 1233.94ms | tok/sec: 424889.83 +step 1892 | loss: 3.764773 | lr 2.0038e-04 | norm: 0.2917 | dt: 1236.34ms | tok/sec: 424064.40 +step 1893 | loss: 3.780904 | lr 1.9997e-04 | norm: 0.2887 | dt: 1234.21ms | tok/sec: 424796.18 +step 1894 | loss: 3.907701 | lr 1.9955e-04 | norm: 0.2767 | dt: 1235.22ms | tok/sec: 424448.20 +step 1895 | loss: 3.804700 | lr 1.9914e-04 | norm: 0.3281 | dt: 1235.37ms | tok/sec: 424399.13 +step 1896 | loss: 3.860210 | lr 1.9872e-04 | norm: 0.3051 | dt: 1238.18ms | tok/sec: 423433.04 +step 1897 | loss: 3.906293 | lr 1.9831e-04 | norm: 0.2613 | dt: 1236.04ms | tok/sec: 424166.64 +step 1898 | loss: 3.897805 | lr 1.9789e-04 | norm: 0.3236 | dt: 1236.67ms | tok/sec: 423949.78 +step 1899 | loss: 3.811027 | lr 1.9748e-04 | norm: 0.2778 | dt: 1234.80ms | tok/sec: 424592.19 +validation loss: 3.8180 +validation perplexity: 45.5132 +step 1900 | loss: 3.824185 | lr 1.9706e-04 | norm: 0.2642 | dt: 2675.28ms | tok/sec: 195975.05 +step 1901 | loss: 3.795538 | lr 1.9665e-04 | norm: 0.2981 | dt: 1235.93ms | tok/sec: 424206.41 +step 1902 | loss: 3.940017 | lr 1.9624e-04 | norm: 0.2878 | dt: 1235.42ms | tok/sec: 424381.61 +step 1903 | loss: 3.830631 | lr 1.9582e-04 | norm: 0.3026 | dt: 1232.00ms | tok/sec: 425557.50 +step 1904 | loss: 3.816025 | lr 1.9541e-04 | norm: 0.2635 | dt: 1232.75ms | tok/sec: 425300.05 +step 1905 | loss: 3.936213 | lr 1.9500e-04 | norm: 0.3320 | dt: 1235.30ms | tok/sec: 424420.18 +step 1906 | loss: 3.840107 | lr 1.9459e-04 | norm: 0.3004 | dt: 1235.91ms | tok/sec: 424212.96 +step 1907 | loss: 3.827512 | lr 1.9418e-04 | norm: 0.2842 | dt: 1235.72ms | tok/sec: 424278.85 +step 1908 | loss: 3.798495 | lr 1.9377e-04 | norm: 0.3108 | dt: 1238.64ms | tok/sec: 423276.30 +step 1909 | loss: 3.810443 | lr 1.9336e-04 | norm: 0.2885 | dt: 1233.78ms | tok/sec: 424943.44 +step 1910 | loss: 3.778972 | lr 1.9295e-04 | norm: 0.2694 | dt: 1238.23ms | tok/sec: 423418.61 +step 1911 | loss: 3.812544 | lr 1.9254e-04 | norm: 0.2796 | dt: 1236.10ms | tok/sec: 424145.21 +step 1912 | loss: 3.806827 | lr 1.9213e-04 | norm: 0.2772 | dt: 1235.04ms | tok/sec: 424510.14 +step 1913 | loss: 3.832786 | lr 1.9172e-04 | norm: 0.2947 | dt: 1234.98ms | tok/sec: 424532.93 +step 1914 | loss: 3.848043 | lr 1.9131e-04 | norm: 0.2885 | dt: 1231.79ms | tok/sec: 425629.40 +step 1915 | loss: 3.766192 | lr 1.9091e-04 | norm: 0.2783 | dt: 1232.18ms | tok/sec: 425495.74 +step 1916 | loss: 3.820299 | lr 1.9050e-04 | norm: 0.2803 | dt: 1233.85ms | tok/sec: 424922.01 +step 1917 | loss: 3.795261 | lr 1.9009e-04 | norm: 0.3427 | dt: 1232.21ms | tok/sec: 425486.68 +step 1918 | loss: 3.773481 | lr 1.8969e-04 | norm: 0.2667 | dt: 1234.49ms | tok/sec: 424701.17 +step 1919 | loss: 3.821310 | lr 1.8928e-04 | norm: 0.3075 | dt: 1235.66ms | tok/sec: 424296.53 +step 1920 | loss: 3.832447 | lr 1.8887e-04 | norm: 0.3042 | dt: 1233.65ms | tok/sec: 424990.75 +step 1921 | loss: 3.838760 | lr 1.8847e-04 | norm: 0.2879 | dt: 1232.21ms | tok/sec: 425487.18 +step 1922 | loss: 3.827824 | lr 1.8807e-04 | norm: 0.3263 | dt: 1235.58ms | tok/sec: 424326.90 +step 1923 | loss: 3.829744 | lr 1.8766e-04 | norm: 0.3314 | dt: 1233.16ms | tok/sec: 425157.22 +step 1924 | loss: 3.816090 | lr 1.8726e-04 | norm: 0.2982 | dt: 1236.28ms | tok/sec: 424086.64 +validation loss: 3.8165 +validation perplexity: 45.4435 +step 1925 | loss: 3.847556 | lr 1.8685e-04 | norm: 0.3302 | dt: 2686.21ms | tok/sec: 195177.56 +step 1926 | loss: 3.764707 | lr 1.8645e-04 | norm: 0.2963 | dt: 1236.65ms | tok/sec: 423957.62 +step 1927 | loss: 3.836978 | lr 1.8605e-04 | norm: 0.3309 | dt: 1236.70ms | tok/sec: 423940.38 +step 1928 | loss: 3.797976 | lr 1.8565e-04 | norm: 0.2836 | dt: 1233.81ms | tok/sec: 424932.52 +step 1929 | loss: 3.845414 | lr 1.8525e-04 | norm: 0.3102 | dt: 1234.61ms | tok/sec: 424658.77 +step 1930 | loss: 3.799386 | lr 1.8485e-04 | norm: 0.2877 | dt: 1869.24ms | tok/sec: 280482.62 +step 1931 | loss: 3.983575 | lr 1.8445e-04 | norm: 0.2946 | dt: 1237.05ms | tok/sec: 423820.76 +step 1932 | loss: 3.833104 | lr 1.8405e-04 | norm: 0.2904 | dt: 1237.05ms | tok/sec: 423822.23 +step 1933 | loss: 3.962076 | lr 1.8365e-04 | norm: 0.3529 | dt: 1238.85ms | tok/sec: 423204.78 +step 1934 | loss: 3.839904 | lr 1.8325e-04 | norm: 0.2971 | dt: 1236.79ms | tok/sec: 423911.12 +step 1935 | loss: 3.868418 | lr 1.8285e-04 | norm: 0.2694 | dt: 1233.81ms | tok/sec: 424932.77 +step 1936 | loss: 3.807105 | lr 1.8245e-04 | norm: 0.2853 | dt: 1237.45ms | tok/sec: 423682.76 +step 1937 | loss: 3.812029 | lr 1.8205e-04 | norm: 0.2990 | dt: 1233.40ms | tok/sec: 425075.78 +step 1938 | loss: 3.842635 | lr 1.8165e-04 | norm: 0.3176 | dt: 1236.00ms | tok/sec: 424182.68 +step 1939 | loss: 3.856123 | lr 1.8126e-04 | norm: 0.3324 | dt: 1237.25ms | tok/sec: 423753.54 +step 1940 | loss: 3.878730 | lr 1.8086e-04 | norm: 0.2740 | dt: 1237.23ms | tok/sec: 423760.65 +step 1941 | loss: 3.857882 | lr 1.8047e-04 | norm: 0.2865 | dt: 1235.82ms | tok/sec: 424244.39 +step 1942 | loss: 3.831998 | lr 1.8007e-04 | norm: 0.2602 | dt: 1237.79ms | tok/sec: 423569.00 +step 1943 | loss: 3.816370 | lr 1.7968e-04 | norm: 0.2868 | dt: 1237.95ms | tok/sec: 423511.73 +step 1944 | loss: 3.758696 | lr 1.7928e-04 | norm: 0.2392 | dt: 1234.59ms | tok/sec: 424665.58 +step 1945 | loss: 3.793845 | lr 1.7889e-04 | norm: 0.3060 | dt: 1237.77ms | tok/sec: 423575.20 +step 1946 | loss: 3.774516 | lr 1.7849e-04 | norm: 0.2576 | dt: 1237.60ms | tok/sec: 423632.89 +step 1947 | loss: 3.825593 | lr 1.7810e-04 | norm: 0.2831 | dt: 1238.79ms | tok/sec: 423227.26 +step 1948 | loss: 3.815992 | lr 1.7771e-04 | norm: 0.2520 | dt: 1235.67ms | tok/sec: 424296.04 +step 1949 | loss: 3.849712 | lr 1.7732e-04 | norm: 0.3019 | dt: 1237.18ms | tok/sec: 423778.13 +validation loss: 3.8149 +validation perplexity: 45.3717 +step 1950 | loss: 3.816709 | lr 1.7692e-04 | norm: 0.3264 | dt: 2684.53ms | tok/sec: 195299.60 +step 1951 | loss: 3.847483 | lr 1.7653e-04 | norm: 0.4586 | dt: 1236.94ms | tok/sec: 423857.36 +step 1952 | loss: 3.773797 | lr 1.7614e-04 | norm: 0.2684 | dt: 1235.90ms | tok/sec: 424217.13 +step 1953 | loss: 3.863127 | lr 1.7575e-04 | norm: 0.3319 | dt: 1235.87ms | tok/sec: 424224.17 +step 1954 | loss: 3.779722 | lr 1.7536e-04 | norm: 0.3038 | dt: 1236.72ms | tok/sec: 423932.70 +step 1955 | loss: 3.764195 | lr 1.7497e-04 | norm: 0.3259 | dt: 1236.09ms | tok/sec: 424148.73 +step 1956 | loss: 3.781445 | lr 1.7458e-04 | norm: 0.3269 | dt: 1234.95ms | tok/sec: 424543.34 +step 1957 | loss: 3.809877 | lr 1.7420e-04 | norm: 0.2876 | dt: 1236.31ms | tok/sec: 424074.21 +step 1958 | loss: 3.768805 | lr 1.7381e-04 | norm: 0.2841 | dt: 1232.06ms | tok/sec: 425539.05 +step 1959 | loss: 3.791228 | lr 1.7342e-04 | norm: 0.3062 | dt: 1238.67ms | tok/sec: 423266.36 +step 1960 | loss: 3.809012 | lr 1.7303e-04 | norm: 0.3057 | dt: 1233.67ms | tok/sec: 424983.11 +step 1961 | loss: 3.837402 | lr 1.7265e-04 | norm: 0.2944 | dt: 1235.04ms | tok/sec: 424509.41 +step 1962 | loss: 3.835444 | lr 1.7226e-04 | norm: 0.3034 | dt: 1234.57ms | tok/sec: 424671.32 +step 1963 | loss: 3.855143 | lr 1.7188e-04 | norm: 0.3093 | dt: 1236.66ms | tok/sec: 423953.54 +step 1964 | loss: 3.842542 | lr 1.7149e-04 | norm: 0.2712 | dt: 1236.31ms | tok/sec: 424074.46 +step 1965 | loss: 3.785819 | lr 1.7111e-04 | norm: 0.2851 | dt: 1234.82ms | tok/sec: 424586.29 +step 1966 | loss: 3.833086 | lr 1.7072e-04 | norm: 0.2828 | dt: 1234.68ms | tok/sec: 424634.01 +step 1967 | loss: 3.876192 | lr 1.7034e-04 | norm: 0.2578 | dt: 1235.06ms | tok/sec: 424503.83 +step 1968 | loss: 3.859894 | lr 1.6996e-04 | norm: 0.3015 | dt: 1234.65ms | tok/sec: 424643.93 +step 1969 | loss: 3.789145 | lr 1.6957e-04 | norm: 0.2643 | dt: 1238.79ms | tok/sec: 423226.86 +step 1970 | loss: 3.856083 | lr 1.6919e-04 | norm: 0.2820 | dt: 1237.03ms | tok/sec: 423826.89 +step 1971 | loss: 3.764302 | lr 1.6881e-04 | norm: 0.2841 | dt: 1238.22ms | tok/sec: 423421.21 +step 1972 | loss: 3.866452 | lr 1.6843e-04 | norm: 0.2869 | dt: 1236.90ms | tok/sec: 423871.16 +step 1973 | loss: 3.809985 | lr 1.6805e-04 | norm: 0.2910 | dt: 1231.99ms | tok/sec: 425561.61 +step 1974 | loss: 3.838418 | lr 1.6767e-04 | norm: 0.2710 | dt: 1239.17ms | tok/sec: 423097.06 +validation loss: 3.8102 +validation perplexity: 45.1616 +step 1975 | loss: 3.777533 | lr 1.6729e-04 | norm: 0.3181 | dt: 2684.39ms | tok/sec: 195309.95 +step 1976 | loss: 3.803784 | lr 1.6691e-04 | norm: 0.2761 | dt: 1234.88ms | tok/sec: 424565.63 +step 1977 | loss: 3.827159 | lr 1.6653e-04 | norm: 0.2942 | dt: 1235.95ms | tok/sec: 424197.33 +step 1978 | loss: 3.805415 | lr 1.6615e-04 | norm: 0.2726 | dt: 1236.99ms | tok/sec: 423840.45 +step 1979 | loss: 3.841802 | lr 1.6578e-04 | norm: 0.2980 | dt: 1234.54ms | tok/sec: 424684.11 +step 1980 | loss: 3.800605 | lr 1.6540e-04 | norm: 0.2590 | dt: 1236.44ms | tok/sec: 424029.32 +step 1981 | loss: 3.763860 | lr 1.6502e-04 | norm: 0.2863 | dt: 1235.46ms | tok/sec: 424366.54 +step 1982 | loss: 3.741947 | lr 1.6465e-04 | norm: 0.2597 | dt: 1235.19ms | tok/sec: 424457.95 +step 1983 | loss: 3.774350 | lr 1.6427e-04 | norm: 0.3015 | dt: 1235.96ms | tok/sec: 424194.06 +step 1984 | loss: 3.856619 | lr 1.6390e-04 | norm: 0.2930 | dt: 1233.74ms | tok/sec: 424957.40 +step 1985 | loss: 3.794824 | lr 1.6352e-04 | norm: 0.2923 | dt: 1236.25ms | tok/sec: 424095.89 +step 1986 | loss: 3.771401 | lr 1.6315e-04 | norm: 0.3150 | dt: 1233.46ms | tok/sec: 425056.14 +step 1987 | loss: 3.870786 | lr 1.6277e-04 | norm: 0.2818 | dt: 1236.05ms | tok/sec: 424163.45 +step 1988 | loss: 3.798083 | lr 1.6240e-04 | norm: 0.2879 | dt: 1236.86ms | tok/sec: 423888.00 +step 1989 | loss: 3.841662 | lr 1.6203e-04 | norm: 0.2864 | dt: 1236.88ms | tok/sec: 423878.68 +step 1990 | loss: 3.856618 | lr 1.6166e-04 | norm: 0.2844 | dt: 1236.15ms | tok/sec: 424128.69 +step 1991 | loss: 3.815775 | lr 1.6129e-04 | norm: 0.2849 | dt: 1232.19ms | tok/sec: 425492.94 +step 1992 | loss: 3.815669 | lr 1.6092e-04 | norm: 0.3462 | dt: 1237.51ms | tok/sec: 423665.21 +step 1993 | loss: 3.853765 | lr 1.6055e-04 | norm: 0.2838 | dt: 1237.80ms | tok/sec: 423565.08 +step 1994 | loss: 3.876486 | lr 1.6018e-04 | norm: 0.2872 | dt: 1232.83ms | tok/sec: 425270.44 +step 1995 | loss: 3.838751 | lr 1.5981e-04 | norm: 0.2937 | dt: 1237.20ms | tok/sec: 423768.90 +step 1996 | loss: 3.779909 | lr 1.5944e-04 | norm: 0.2743 | dt: 1234.45ms | tok/sec: 424712.25 +step 1997 | loss: 3.824533 | lr 1.5907e-04 | norm: 0.2946 | dt: 1237.33ms | tok/sec: 423726.84 +step 1998 | loss: 3.826291 | lr 1.5870e-04 | norm: 0.2577 | dt: 1234.99ms | tok/sec: 424528.99 +step 1999 | loss: 3.844870 | lr 1.5834e-04 | norm: 0.2676 | dt: 1234.79ms | tok/sec: 424597.19 +validation loss: 3.8065 +validation perplexity: 44.9932 +step 2000 | loss: 3.876450 | lr 1.5797e-04 | norm: 0.3445 | dt: 2675.40ms | tok/sec: 195966.00 +step 2001 | loss: 3.866910 | lr 1.5760e-04 | norm: 0.3152 | dt: 1232.95ms | tok/sec: 425232.12 +step 2002 | loss: 3.817987 | lr 1.5724e-04 | norm: 0.3100 | dt: 1234.97ms | tok/sec: 424533.91 +step 2003 | loss: 3.761965 | lr 1.5687e-04 | norm: 0.2868 | dt: 1235.81ms | tok/sec: 424247.00 +step 2004 | loss: 3.805471 | lr 1.5651e-04 | norm: 0.2857 | dt: 1232.48ms | tok/sec: 425391.54 +step 2005 | loss: 3.792804 | lr 1.5614e-04 | norm: 0.2633 | dt: 1234.04ms | tok/sec: 424856.17 +step 2006 | loss: 3.849188 | lr 1.5578e-04 | norm: 0.2766 | dt: 1235.26ms | tok/sec: 424435.91 +step 2007 | loss: 3.860116 | lr 1.5542e-04 | norm: 0.2602 | dt: 1233.67ms | tok/sec: 424981.06 +step 2008 | loss: 3.802810 | lr 1.5506e-04 | norm: 0.2828 | dt: 1236.61ms | tok/sec: 423972.34 +step 2009 | loss: 3.804202 | lr 1.5469e-04 | norm: 0.2953 | dt: 1235.93ms | tok/sec: 424205.92 +step 2010 | loss: 3.824251 | lr 1.5433e-04 | norm: 0.2904 | dt: 1234.37ms | tok/sec: 424742.19 +step 2011 | loss: 3.775552 | lr 1.5397e-04 | norm: 0.2756 | dt: 1234.74ms | tok/sec: 424614.57 +step 2012 | loss: 3.753550 | lr 1.5361e-04 | norm: 0.3302 | dt: 1237.00ms | tok/sec: 423838.00 +step 2013 | loss: 3.798352 | lr 1.5325e-04 | norm: 0.2696 | dt: 1236.10ms | tok/sec: 424147.09 +step 2014 | loss: 3.830823 | lr 1.5289e-04 | norm: 0.3272 | dt: 1235.77ms | tok/sec: 424258.79 +step 2015 | loss: 3.778418 | lr 1.5254e-04 | norm: 0.2726 | dt: 1236.83ms | tok/sec: 423896.49 +step 2016 | loss: 3.753958 | lr 1.5218e-04 | norm: 0.2634 | dt: 1236.35ms | tok/sec: 424062.27 +step 2017 | loss: 3.785126 | lr 1.5182e-04 | norm: 0.2789 | dt: 1233.73ms | tok/sec: 424962.74 +step 2018 | loss: 3.794756 | lr 1.5146e-04 | norm: 0.2578 | dt: 1235.70ms | tok/sec: 424285.64 +step 2019 | loss: 3.789227 | lr 1.5111e-04 | norm: 0.2494 | dt: 1235.68ms | tok/sec: 424292.52 +step 2020 | loss: 3.778317 | lr 1.5075e-04 | norm: 0.2525 | dt: 1234.61ms | tok/sec: 424660.25 +step 2021 | loss: 3.804776 | lr 1.5040e-04 | norm: 0.2465 | dt: 1235.64ms | tok/sec: 424303.98 +step 2022 | loss: 3.791013 | lr 1.5004e-04 | norm: 0.2398 | dt: 1237.14ms | tok/sec: 423791.03 +step 2023 | loss: 3.809565 | lr 1.4969e-04 | norm: 0.2439 | dt: 1236.36ms | tok/sec: 424056.71 +step 2024 | loss: 3.790082 | lr 1.4933e-04 | norm: 0.2675 | dt: 1236.82ms | tok/sec: 423900.82 +validation loss: 3.8058 +validation perplexity: 44.9614 +step 2025 | loss: 3.799218 | lr 1.4898e-04 | norm: 0.2713 | dt: 2686.91ms | tok/sec: 195126.39 +step 2026 | loss: 3.818273 | lr 1.4863e-04 | norm: 0.2568 | dt: 1232.66ms | tok/sec: 425332.13 +step 2027 | loss: 3.850888 | lr 1.4828e-04 | norm: 0.2733 | dt: 1234.94ms | tok/sec: 424544.57 +step 2028 | loss: 3.846525 | lr 1.4793e-04 | norm: 0.2936 | dt: 1230.64ms | tok/sec: 426030.32 +step 2029 | loss: 3.792452 | lr 1.4758e-04 | norm: 0.2618 | dt: 1236.79ms | tok/sec: 423909.90 +step 2030 | loss: 3.793483 | lr 1.4723e-04 | norm: 0.2632 | dt: 1235.95ms | tok/sec: 424199.46 +step 2031 | loss: 3.848888 | lr 1.4688e-04 | norm: 0.2529 | dt: 1234.41ms | tok/sec: 424726.93 +step 2032 | loss: 3.905939 | lr 1.4653e-04 | norm: 0.3006 | dt: 1233.12ms | tok/sec: 425173.50 +step 2033 | loss: 3.788701 | lr 1.4618e-04 | norm: 0.2762 | dt: 1235.73ms | tok/sec: 424272.46 +step 2034 | loss: 3.852795 | lr 1.4583e-04 | norm: 0.2678 | dt: 1234.86ms | tok/sec: 424574.08 +step 2035 | loss: 3.827810 | lr 1.4548e-04 | norm: 0.2884 | dt: 1234.33ms | tok/sec: 424754.66 +step 2036 | loss: 3.809172 | lr 1.4514e-04 | norm: 0.2830 | dt: 1232.37ms | tok/sec: 425432.27 +step 2037 | loss: 3.883066 | lr 1.4479e-04 | norm: 0.2842 | dt: 1234.06ms | tok/sec: 424848.62 +step 2038 | loss: 3.864274 | lr 1.4445e-04 | norm: 0.2787 | dt: 1235.17ms | tok/sec: 424465.16 +step 2039 | loss: 3.816583 | lr 1.4410e-04 | norm: 0.3013 | dt: 1232.64ms | tok/sec: 425337.15 +step 2040 | loss: 3.881159 | lr 1.4376e-04 | norm: 0.2716 | dt: 1235.36ms | tok/sec: 424402.65 +step 2041 | loss: 3.764436 | lr 1.4341e-04 | norm: 0.2716 | dt: 1235.77ms | tok/sec: 424259.04 +step 2042 | loss: 3.786880 | lr 1.4307e-04 | norm: 0.2904 | dt: 1238.68ms | tok/sec: 423262.78 +step 2043 | loss: 3.795573 | lr 1.4273e-04 | norm: 0.2705 | dt: 1232.72ms | tok/sec: 425310.33 +step 2044 | loss: 3.829225 | lr 1.4239e-04 | norm: 0.2688 | dt: 1233.50ms | tok/sec: 425041.60 +step 2045 | loss: 3.866665 | lr 1.4204e-04 | norm: 0.2781 | dt: 1231.70ms | tok/sec: 425660.38 +step 2046 | loss: 3.800903 | lr 1.4170e-04 | norm: 0.2952 | dt: 1235.13ms | tok/sec: 424478.92 +step 2047 | loss: 3.799510 | lr 1.4136e-04 | norm: 0.2682 | dt: 1235.44ms | tok/sec: 424374.48 +step 2048 | loss: 3.826766 | lr 1.4102e-04 | norm: 0.3248 | dt: 1235.50ms | tok/sec: 424352.86 +step 2049 | loss: 3.782743 | lr 1.4068e-04 | norm: 0.2761 | dt: 1239.89ms | tok/sec: 422851.36 +validation loss: 3.8039 +validation perplexity: 44.8770 +step 2050 | loss: 3.769061 | lr 1.4035e-04 | norm: 0.2982 | dt: 2675.82ms | tok/sec: 195935.62 +step 2051 | loss: 3.907624 | lr 1.4001e-04 | norm: 0.3120 | dt: 1237.97ms | tok/sec: 423504.72 +step 2052 | loss: 3.911467 | lr 1.3967e-04 | norm: 0.2539 | dt: 1234.91ms | tok/sec: 424555.06 +step 2053 | loss: 3.866334 | lr 1.3933e-04 | norm: 0.2666 | dt: 1233.84ms | tok/sec: 424924.72 +step 2054 | loss: 3.743102 | lr 1.3900e-04 | norm: 0.2997 | dt: 1232.75ms | tok/sec: 425298.81 +step 2055 | loss: 3.782958 | lr 1.3866e-04 | norm: 0.2459 | dt: 1236.07ms | tok/sec: 424158.14 +step 2056 | loss: 3.788559 | lr 1.3833e-04 | norm: 0.2566 | dt: 1235.83ms | tok/sec: 424240.13 +step 2057 | loss: 3.776254 | lr 1.3799e-04 | norm: 0.2582 | dt: 1234.80ms | tok/sec: 424594.82 +step 2058 | loss: 3.768028 | lr 1.3766e-04 | norm: 0.2488 | dt: 1235.52ms | tok/sec: 424347.70 +step 2059 | loss: 3.794600 | lr 1.3733e-04 | norm: 0.2580 | dt: 1235.41ms | tok/sec: 424383.49 +step 2060 | loss: 3.851300 | lr 1.3699e-04 | norm: 0.2588 | dt: 1234.52ms | tok/sec: 424689.36 +step 2061 | loss: 3.816942 | lr 1.3666e-04 | norm: 0.2672 | dt: 1237.12ms | tok/sec: 423798.13 +step 2062 | loss: 3.797189 | lr 1.3633e-04 | norm: 0.2823 | dt: 1231.53ms | tok/sec: 425719.80 +step 2063 | loss: 3.786829 | lr 1.3600e-04 | norm: 0.2936 | dt: 1236.08ms | tok/sec: 424154.05 +step 2064 | loss: 3.860206 | lr 1.3567e-04 | norm: 0.2553 | dt: 1234.76ms | tok/sec: 424607.03 +step 2065 | loss: 3.865570 | lr 1.3534e-04 | norm: 0.2916 | dt: 1234.49ms | tok/sec: 424701.09 +step 2066 | loss: 3.846935 | lr 1.3501e-04 | norm: 0.2796 | dt: 1234.31ms | tok/sec: 424762.62 +step 2067 | loss: 3.782894 | lr 1.3468e-04 | norm: 0.3013 | dt: 1236.82ms | tok/sec: 423898.86 +step 2068 | loss: 3.810308 | lr 1.3435e-04 | norm: 0.3011 | dt: 1237.43ms | tok/sec: 423691.17 +step 2069 | loss: 3.825292 | lr 1.3403e-04 | norm: 0.2816 | dt: 1235.78ms | tok/sec: 424256.25 +step 2070 | loss: 3.810300 | lr 1.3370e-04 | norm: 0.3027 | dt: 1233.09ms | tok/sec: 425183.20 +step 2071 | loss: 3.813627 | lr 1.3337e-04 | norm: 0.2936 | dt: 1233.54ms | tok/sec: 425026.81 +step 2072 | loss: 3.925297 | lr 1.3305e-04 | norm: 0.3279 | dt: 1236.88ms | tok/sec: 423879.91 +step 2073 | loss: 3.809913 | lr 1.3272e-04 | norm: 0.3087 | dt: 1237.06ms | tok/sec: 423816.68 +step 2074 | loss: 3.805075 | lr 1.3240e-04 | norm: 0.2980 | dt: 1238.97ms | tok/sec: 423163.49 +validation loss: 3.8007 +validation perplexity: 44.7322 +step 2075 | loss: 3.825446 | lr 1.3208e-04 | norm: 0.2971 | dt: 2690.45ms | tok/sec: 194869.95 +step 2076 | loss: 3.861112 | lr 1.3175e-04 | norm: 0.2904 | dt: 1235.32ms | tok/sec: 424414.70 +step 2077 | loss: 3.840151 | lr 1.3143e-04 | norm: 0.3046 | dt: 1236.93ms | tok/sec: 423863.08 +step 2078 | loss: 3.822907 | lr 1.3111e-04 | norm: 0.2933 | dt: 1236.22ms | tok/sec: 424105.21 +step 2079 | loss: 3.780080 | lr 1.3079e-04 | norm: 0.2931 | dt: 1236.90ms | tok/sec: 423871.57 +step 2080 | loss: 3.849760 | lr 1.3047e-04 | norm: 0.2933 | dt: 1231.97ms | tok/sec: 425567.54 +step 2081 | loss: 3.775448 | lr 1.3015e-04 | norm: 0.3008 | dt: 1236.24ms | tok/sec: 424098.50 +step 2082 | loss: 3.790434 | lr 1.2983e-04 | norm: 0.2641 | dt: 1237.91ms | tok/sec: 423526.41 +step 2083 | loss: 3.841049 | lr 1.2951e-04 | norm: 0.3307 | dt: 1235.94ms | tok/sec: 424202.57 +step 2084 | loss: 3.799471 | lr 1.2919e-04 | norm: 0.2733 | dt: 1234.97ms | tok/sec: 424536.29 +step 2085 | loss: 3.781245 | lr 1.2887e-04 | norm: 0.2532 | dt: 1235.90ms | tok/sec: 424215.66 +step 2086 | loss: 3.761601 | lr 1.2856e-04 | norm: 0.2602 | dt: 1234.93ms | tok/sec: 424547.85 +step 2087 | loss: 3.778239 | lr 1.2824e-04 | norm: 0.2973 | dt: 1236.41ms | tok/sec: 424040.77 +step 2088 | loss: 3.744459 | lr 1.2793e-04 | norm: 0.2816 | dt: 1233.67ms | tok/sec: 424982.04 +step 2089 | loss: 3.811768 | lr 1.2761e-04 | norm: 0.2547 | dt: 1235.55ms | tok/sec: 424335.75 +step 2090 | loss: 3.811486 | lr 1.2730e-04 | norm: 0.2610 | dt: 1233.60ms | tok/sec: 425007.18 +step 2091 | loss: 3.806534 | lr 1.2698e-04 | norm: 0.2973 | dt: 1241.16ms | tok/sec: 422418.01 +step 2092 | loss: 3.768148 | lr 1.2667e-04 | norm: 0.2792 | dt: 1233.34ms | tok/sec: 425095.25 +step 2093 | loss: 3.818435 | lr 1.2636e-04 | norm: 0.2549 | dt: 1232.71ms | tok/sec: 425313.87 +step 2094 | loss: 3.779541 | lr 1.2605e-04 | norm: 0.2604 | dt: 1237.71ms | tok/sec: 423594.05 +step 2095 | loss: 3.831657 | lr 1.2574e-04 | norm: 0.2780 | dt: 1232.92ms | tok/sec: 425241.66 +step 2096 | loss: 3.745245 | lr 1.2543e-04 | norm: 0.2701 | dt: 1236.09ms | tok/sec: 424151.67 +step 2097 | loss: 3.825768 | lr 1.2512e-04 | norm: 0.2628 | dt: 1234.11ms | tok/sec: 424832.45 +step 2098 | loss: 3.831925 | lr 1.2481e-04 | norm: 0.2721 | dt: 1236.12ms | tok/sec: 424139.40 +step 2099 | loss: 3.917769 | lr 1.2450e-04 | norm: 0.2778 | dt: 1233.92ms | tok/sec: 424894.75 +validation loss: 3.7986 +validation perplexity: 44.6375 +step 2100 | loss: 3.834013 | lr 1.2419e-04 | norm: 0.2857 | dt: 2688.40ms | tok/sec: 195018.35 +step 2101 | loss: 3.764498 | lr 1.2388e-04 | norm: 0.2601 | dt: 1235.10ms | tok/sec: 424489.90 +step 2102 | loss: 3.788270 | lr 1.2358e-04 | norm: 0.2575 | dt: 1233.48ms | tok/sec: 425049.24 +step 2103 | loss: 3.848491 | lr 1.2327e-04 | norm: 0.2862 | dt: 1234.80ms | tok/sec: 424592.11 +step 2104 | loss: 3.817942 | lr 1.2296e-04 | norm: 0.3059 | dt: 1233.89ms | tok/sec: 424906.91 +step 2105 | loss: 3.779584 | lr 1.2266e-04 | norm: 0.3334 | dt: 1237.75ms | tok/sec: 423580.75 +step 2106 | loss: 3.832448 | lr 1.2236e-04 | norm: 0.2958 | dt: 1236.85ms | tok/sec: 423888.16 +step 2107 | loss: 3.882251 | lr 1.2205e-04 | norm: 0.3060 | dt: 1234.75ms | tok/sec: 424609.82 +step 2108 | loss: 3.808353 | lr 1.2175e-04 | norm: 0.2809 | dt: 1234.72ms | tok/sec: 424619.33 +step 2109 | loss: 3.783624 | lr 1.2145e-04 | norm: 0.3037 | dt: 1231.98ms | tok/sec: 425565.90 +step 2110 | loss: 3.743673 | lr 1.2115e-04 | norm: 0.2517 | dt: 1235.10ms | tok/sec: 424491.13 +step 2111 | loss: 3.807868 | lr 1.2085e-04 | norm: 0.2946 | dt: 1237.31ms | tok/sec: 423730.44 +step 2112 | loss: 3.816249 | lr 1.2054e-04 | norm: 0.2751 | dt: 1236.81ms | tok/sec: 423902.13 +step 2113 | loss: 3.813588 | lr 1.2025e-04 | norm: 0.2804 | dt: 1234.38ms | tok/sec: 424738.09 +step 2114 | loss: 3.810738 | lr 1.1995e-04 | norm: 0.2667 | dt: 1237.51ms | tok/sec: 423663.66 +step 2115 | loss: 3.818663 | lr 1.1965e-04 | norm: 0.2745 | dt: 1238.19ms | tok/sec: 423432.38 +step 2116 | loss: 3.839705 | lr 1.1935e-04 | norm: 0.2643 | dt: 1236.35ms | tok/sec: 424060.06 +step 2117 | loss: 3.797634 | lr 1.1905e-04 | norm: 0.2496 | dt: 1236.24ms | tok/sec: 424099.48 +step 2118 | loss: 3.801780 | lr 1.1876e-04 | norm: 0.2498 | dt: 1236.67ms | tok/sec: 423951.25 +step 2119 | loss: 3.832431 | lr 1.1846e-04 | norm: 0.2402 | dt: 1233.52ms | tok/sec: 425033.79 +step 2120 | loss: 3.781923 | lr 1.1817e-04 | norm: 0.2716 | dt: 1871.98ms | tok/sec: 280071.48 +step 2121 | loss: 3.835061 | lr 1.1787e-04 | norm: 0.2592 | dt: 1238.66ms | tok/sec: 423269.95 +step 2122 | loss: 3.836420 | lr 1.1758e-04 | norm: 0.2464 | dt: 1237.37ms | tok/sec: 423710.43 +step 2123 | loss: 3.786269 | lr 1.1729e-04 | norm: 0.2936 | dt: 1238.16ms | tok/sec: 423440.46 +step 2124 | loss: 3.899399 | lr 1.1699e-04 | norm: 0.2508 | dt: 1235.81ms | tok/sec: 424247.74 +validation loss: 3.7978 +validation perplexity: 44.6034 +step 2125 | loss: 3.829367 | lr 1.1670e-04 | norm: 0.2852 | dt: 2680.11ms | tok/sec: 195621.54 +step 2126 | loss: 3.867986 | lr 1.1641e-04 | norm: 0.3107 | dt: 1234.74ms | tok/sec: 424615.15 +step 2127 | loss: 3.810795 | lr 1.1612e-04 | norm: 0.2703 | dt: 1232.70ms | tok/sec: 425316.66 +step 2128 | loss: 3.789040 | lr 1.1583e-04 | norm: 0.3092 | dt: 1234.73ms | tok/sec: 424617.03 +step 2129 | loss: 3.784962 | lr 1.1554e-04 | norm: 0.2743 | dt: 1234.75ms | tok/sec: 424609.90 +step 2130 | loss: 3.804531 | lr 1.1525e-04 | norm: 0.2738 | dt: 1238.08ms | tok/sec: 423468.42 +step 2131 | loss: 3.829532 | lr 1.1497e-04 | norm: 0.2882 | dt: 1235.77ms | tok/sec: 424260.02 +step 2132 | loss: 3.734655 | lr 1.1468e-04 | norm: 0.3073 | dt: 1235.28ms | tok/sec: 424430.10 +step 2133 | loss: 3.748956 | lr 1.1439e-04 | norm: 0.2500 | dt: 1236.89ms | tok/sec: 423875.49 +step 2134 | loss: 3.743320 | lr 1.1411e-04 | norm: 0.2903 | dt: 1236.14ms | tok/sec: 424133.76 +step 2135 | loss: 3.684787 | lr 1.1382e-04 | norm: 0.2676 | dt: 1234.62ms | tok/sec: 424655.33 +step 2136 | loss: 3.734105 | lr 1.1354e-04 | norm: 0.2649 | dt: 1233.88ms | tok/sec: 424909.78 +step 2137 | loss: 3.734500 | lr 1.1325e-04 | norm: 0.3473 | dt: 1238.71ms | tok/sec: 423254.63 +step 2138 | loss: 3.743794 | lr 1.1297e-04 | norm: 0.2633 | dt: 1237.75ms | tok/sec: 423580.58 +step 2139 | loss: 3.857929 | lr 1.1269e-04 | norm: 0.2713 | dt: 1234.90ms | tok/sec: 424558.17 +step 2140 | loss: 3.832487 | lr 1.1241e-04 | norm: 0.2679 | dt: 1234.28ms | tok/sec: 424770.66 +step 2141 | loss: 3.766171 | lr 1.1213e-04 | norm: 0.2562 | dt: 1233.68ms | tok/sec: 424979.99 +step 2142 | loss: 3.768771 | lr 1.1185e-04 | norm: 0.2994 | dt: 1235.29ms | tok/sec: 424425.10 +step 2143 | loss: 3.766866 | lr 1.1157e-04 | norm: 0.2682 | dt: 1234.92ms | tok/sec: 424553.75 +step 2144 | loss: 3.765569 | lr 1.1129e-04 | norm: 0.2934 | dt: 1232.83ms | tok/sec: 425273.65 +step 2145 | loss: 3.804099 | lr 1.1101e-04 | norm: 0.2669 | dt: 1236.14ms | tok/sec: 424132.53 +step 2146 | loss: 3.813960 | lr 1.1073e-04 | norm: 0.2595 | dt: 1234.88ms | tok/sec: 424565.39 +step 2147 | loss: 3.811257 | lr 1.1045e-04 | norm: 0.2651 | dt: 1234.40ms | tok/sec: 424731.61 +step 2148 | loss: 3.850454 | lr 1.1018e-04 | norm: 0.3201 | dt: 1235.26ms | tok/sec: 424435.17 +step 2149 | loss: 3.788929 | lr 1.0990e-04 | norm: 0.2646 | dt: 1232.76ms | tok/sec: 425294.37 +validation loss: 3.7945 +validation perplexity: 44.4559 +step 2150 | loss: 3.839499 | lr 1.0963e-04 | norm: 0.3000 | dt: 2674.96ms | tok/sec: 195998.35 +step 2151 | loss: 3.842136 | lr 1.0935e-04 | norm: 0.2672 | dt: 1233.92ms | tok/sec: 424895.00 +step 2152 | loss: 3.749490 | lr 1.0908e-04 | norm: 0.3437 | dt: 1234.05ms | tok/sec: 424852.23 +step 2153 | loss: 3.797845 | lr 1.0881e-04 | norm: 0.3258 | dt: 1239.74ms | tok/sec: 422903.24 +step 2154 | loss: 3.816730 | lr 1.0853e-04 | norm: 0.3223 | dt: 1236.07ms | tok/sec: 424158.05 +step 2155 | loss: 3.803625 | lr 1.0826e-04 | norm: 0.2924 | dt: 1237.07ms | tok/sec: 423813.74 +step 2156 | loss: 3.747784 | lr 1.0799e-04 | norm: 0.3001 | dt: 1236.60ms | tok/sec: 423973.89 +step 2157 | loss: 3.742785 | lr 1.0772e-04 | norm: 0.3265 | dt: 1240.14ms | tok/sec: 422766.73 +step 2158 | loss: 3.782106 | lr 1.0745e-04 | norm: 0.2825 | dt: 1235.49ms | tok/sec: 424356.87 +step 2159 | loss: 3.814404 | lr 1.0718e-04 | norm: 0.2848 | dt: 1236.54ms | tok/sec: 423994.49 +step 2160 | loss: 3.774070 | lr 1.0692e-04 | norm: 0.2889 | dt: 1234.76ms | tok/sec: 424608.84 +step 2161 | loss: 3.756911 | lr 1.0665e-04 | norm: 0.2691 | dt: 1231.92ms | tok/sec: 425585.25 +step 2162 | loss: 3.814522 | lr 1.0638e-04 | norm: 0.2836 | dt: 1236.85ms | tok/sec: 423889.71 +step 2163 | loss: 3.803551 | lr 1.0612e-04 | norm: 0.2758 | dt: 1238.03ms | tok/sec: 423487.18 +step 2164 | loss: 3.828099 | lr 1.0585e-04 | norm: 0.2752 | dt: 1234.64ms | tok/sec: 424649.26 +step 2165 | loss: 3.806095 | lr 1.0559e-04 | norm: 0.2814 | dt: 1236.27ms | tok/sec: 424087.79 +step 2166 | loss: 3.809601 | lr 1.0532e-04 | norm: 0.2699 | dt: 1240.56ms | tok/sec: 422621.29 +step 2167 | loss: 3.775532 | lr 1.0506e-04 | norm: 0.2879 | dt: 1235.33ms | tok/sec: 424410.03 +step 2168 | loss: 3.792693 | lr 1.0480e-04 | norm: 0.3299 | dt: 1236.10ms | tok/sec: 424148.32 +step 2169 | loss: 3.805320 | lr 1.0453e-04 | norm: 0.3176 | dt: 1232.31ms | tok/sec: 425453.01 +step 2170 | loss: 3.775573 | lr 1.0427e-04 | norm: 0.2603 | dt: 1237.06ms | tok/sec: 423819.29 +step 2171 | loss: 3.734008 | lr 1.0401e-04 | norm: 0.3047 | dt: 1234.80ms | tok/sec: 424594.49 +step 2172 | loss: 3.790123 | lr 1.0375e-04 | norm: 0.2785 | dt: 1235.23ms | tok/sec: 424445.09 +step 2173 | loss: 3.735047 | lr 1.0349e-04 | norm: 0.2510 | dt: 1234.92ms | tok/sec: 424553.91 +step 2174 | loss: 3.763420 | lr 1.0324e-04 | norm: 0.2546 | dt: 1235.00ms | tok/sec: 424525.55 +validation loss: 3.7913 +validation perplexity: 44.3151 +step 2175 | loss: 3.729948 | lr 1.0298e-04 | norm: 0.2701 | dt: 2679.71ms | tok/sec: 195651.34 +step 2176 | loss: 3.791343 | lr 1.0272e-04 | norm: 0.2652 | dt: 1236.54ms | tok/sec: 423995.06 +step 2177 | loss: 3.796644 | lr 1.0247e-04 | norm: 0.2883 | dt: 1235.69ms | tok/sec: 424289.00 +step 2178 | loss: 3.838979 | lr 1.0221e-04 | norm: 0.2712 | dt: 1234.88ms | tok/sec: 424567.60 +step 2179 | loss: 3.748332 | lr 1.0196e-04 | norm: 0.2521 | dt: 1233.44ms | tok/sec: 425062.22 +step 2180 | loss: 3.809679 | lr 1.0170e-04 | norm: 0.2916 | dt: 1235.61ms | tok/sec: 424314.21 +step 2181 | loss: 3.803254 | lr 1.0145e-04 | norm: 0.2906 | dt: 1239.12ms | tok/sec: 423114.07 +step 2182 | loss: 3.793998 | lr 1.0120e-04 | norm: 0.2586 | dt: 1234.44ms | tok/sec: 424716.59 +step 2183 | loss: 3.782753 | lr 1.0094e-04 | norm: 0.2748 | dt: 1236.27ms | tok/sec: 424088.28 +step 2184 | loss: 3.791171 | lr 1.0069e-04 | norm: 0.2902 | dt: 1234.18ms | tok/sec: 424806.27 +step 2185 | loss: 3.786931 | lr 1.0044e-04 | norm: 0.2678 | dt: 1234.41ms | tok/sec: 424727.83 +step 2186 | loss: 3.842192 | lr 1.0019e-04 | norm: 0.2597 | dt: 1237.23ms | tok/sec: 423760.24 +step 2187 | loss: 3.798338 | lr 9.9942e-05 | norm: 0.2519 | dt: 1234.20ms | tok/sec: 424800.77 +step 2188 | loss: 3.801321 | lr 9.9694e-05 | norm: 0.2586 | dt: 1233.59ms | tok/sec: 425009.07 +step 2189 | loss: 3.872357 | lr 9.9446e-05 | norm: 0.2560 | dt: 1236.45ms | tok/sec: 424027.77 +step 2190 | loss: 3.758863 | lr 9.9199e-05 | norm: 0.2629 | dt: 1235.65ms | tok/sec: 424302.34 +step 2191 | loss: 3.839323 | lr 9.8953e-05 | norm: 0.2540 | dt: 1235.88ms | tok/sec: 424221.55 +step 2192 | loss: 3.795908 | lr 9.8708e-05 | norm: 0.2662 | dt: 1236.90ms | tok/sec: 423874.27 +step 2193 | loss: 3.773411 | lr 9.8463e-05 | norm: 0.2455 | dt: 1233.92ms | tok/sec: 424895.58 +step 2194 | loss: 3.786652 | lr 9.8219e-05 | norm: 0.2547 | dt: 1234.13ms | tok/sec: 424823.42 +step 2195 | loss: 3.974071 | lr 9.7975e-05 | norm: 0.3222 | dt: 1236.56ms | tok/sec: 423989.75 +step 2196 | loss: 3.808803 | lr 9.7733e-05 | norm: 0.2685 | dt: 1234.72ms | tok/sec: 424619.25 +step 2197 | loss: 3.776363 | lr 9.7491e-05 | norm: 0.2617 | dt: 1235.35ms | tok/sec: 424404.29 +step 2198 | loss: 3.774312 | lr 9.7250e-05 | norm: 0.2511 | dt: 1232.54ms | tok/sec: 425373.19 +step 2199 | loss: 3.781172 | lr 9.7009e-05 | norm: 0.2630 | dt: 1237.15ms | tok/sec: 423786.70 +validation loss: 3.7910 +validation perplexity: 44.2997 +step 2200 | loss: 3.752847 | lr 9.6769e-05 | norm: 0.2719 | dt: 2689.13ms | tok/sec: 194965.48 +step 2201 | loss: 3.745008 | lr 9.6530e-05 | norm: 0.2739 | dt: 1235.58ms | tok/sec: 424325.18 +step 2202 | loss: 3.777761 | lr 9.6292e-05 | norm: 0.2436 | dt: 1238.93ms | tok/sec: 423176.68 +step 2203 | loss: 3.804918 | lr 9.6054e-05 | norm: 0.2512 | dt: 1234.54ms | tok/sec: 424682.80 +step 2204 | loss: 3.811574 | lr 9.5818e-05 | norm: 0.2565 | dt: 1240.02ms | tok/sec: 422805.66 +step 2205 | loss: 3.753078 | lr 9.5581e-05 | norm: 0.2754 | dt: 1237.77ms | tok/sec: 423573.65 +step 2206 | loss: 3.809823 | lr 9.5346e-05 | norm: 0.2797 | dt: 1233.53ms | tok/sec: 425030.83 +step 2207 | loss: 3.725853 | lr 9.5111e-05 | norm: 0.2394 | dt: 1235.45ms | tok/sec: 424370.79 +step 2208 | loss: 3.780766 | lr 9.4877e-05 | norm: 0.2618 | dt: 1236.40ms | tok/sec: 424043.96 +step 2209 | loss: 3.753957 | lr 9.4644e-05 | norm: 0.2468 | dt: 1235.57ms | tok/sec: 424327.97 +step 2210 | loss: 3.823893 | lr 9.4412e-05 | norm: 0.2536 | dt: 1235.56ms | tok/sec: 424332.96 +step 2211 | loss: 3.791901 | lr 9.4180e-05 | norm: 0.2395 | dt: 1235.67ms | tok/sec: 424293.91 +step 2212 | loss: 3.746784 | lr 9.3949e-05 | norm: 0.2546 | dt: 1236.80ms | tok/sec: 423907.44 +step 2213 | loss: 3.809205 | lr 9.3718e-05 | norm: 0.2561 | dt: 1237.66ms | tok/sec: 423611.59 +step 2214 | loss: 3.835301 | lr 9.3489e-05 | norm: 0.2427 | dt: 1237.59ms | tok/sec: 423637.38 +step 2215 | loss: 3.794668 | lr 9.3260e-05 | norm: 0.2361 | dt: 1239.00ms | tok/sec: 423155.68 +step 2216 | loss: 3.810029 | lr 9.3032e-05 | norm: 0.2667 | dt: 1235.09ms | tok/sec: 424495.48 +step 2217 | loss: 3.771368 | lr 9.2804e-05 | norm: 0.2420 | dt: 1235.28ms | tok/sec: 424429.69 +step 2218 | loss: 3.765256 | lr 9.2578e-05 | norm: 0.2642 | dt: 1235.36ms | tok/sec: 424402.57 +step 2219 | loss: 3.787565 | lr 9.2352e-05 | norm: 0.2355 | dt: 1234.52ms | tok/sec: 424690.84 +step 2220 | loss: 3.780956 | lr 9.2127e-05 | norm: 0.2374 | dt: 1236.63ms | tok/sec: 423965.96 +step 2221 | loss: 3.775681 | lr 9.1902e-05 | norm: 0.2630 | dt: 1234.97ms | tok/sec: 424533.42 +step 2222 | loss: 3.756567 | lr 9.1679e-05 | norm: 0.2562 | dt: 1236.49ms | tok/sec: 424012.48 +step 2223 | loss: 3.803008 | lr 9.1456e-05 | norm: 0.2426 | dt: 1238.71ms | tok/sec: 423252.27 +step 2224 | loss: 3.800628 | lr 9.1233e-05 | norm: 0.2622 | dt: 1234.28ms | tok/sec: 424772.22 +validation loss: 3.7883 +validation perplexity: 44.1818 +step 2225 | loss: 3.889447 | lr 9.1012e-05 | norm: 0.2602 | dt: 2693.29ms | tok/sec: 194664.45 +step 2226 | loss: 3.812604 | lr 9.0791e-05 | norm: 0.2370 | dt: 1231.95ms | tok/sec: 425576.60 +step 2227 | loss: 3.771749 | lr 9.0571e-05 | norm: 0.2921 | dt: 1235.58ms | tok/sec: 424325.59 +step 2228 | loss: 3.746773 | lr 9.0352e-05 | norm: 0.2232 | dt: 1236.13ms | tok/sec: 424136.70 +step 2229 | loss: 3.777399 | lr 9.0133e-05 | norm: 0.2450 | dt: 1234.98ms | tok/sec: 424532.44 +step 2230 | loss: 3.866730 | lr 8.9916e-05 | norm: 0.3433 | dt: 1234.89ms | tok/sec: 424563.66 +step 2231 | loss: 3.787912 | lr 8.9698e-05 | norm: 0.2873 | dt: 1232.16ms | tok/sec: 425501.83 +step 2232 | loss: 3.819930 | lr 8.9482e-05 | norm: 0.2586 | dt: 1236.19ms | tok/sec: 424116.50 +step 2233 | loss: 3.797124 | lr 8.9267e-05 | norm: 0.2681 | dt: 1234.83ms | tok/sec: 424581.45 +step 2234 | loss: 3.795646 | lr 8.9052e-05 | norm: 0.2658 | dt: 1235.37ms | tok/sec: 424396.92 +step 2235 | loss: 3.699573 | lr 8.8838e-05 | norm: 0.2330 | dt: 1237.34ms | tok/sec: 423721.21 +step 2236 | loss: 3.777795 | lr 8.8624e-05 | norm: 0.2830 | dt: 1233.32ms | tok/sec: 425103.06 +step 2237 | loss: 3.740031 | lr 8.8412e-05 | norm: 0.2572 | dt: 1237.59ms | tok/sec: 423636.32 +step 2238 | loss: 3.768285 | lr 8.8200e-05 | norm: 0.2425 | dt: 1233.94ms | tok/sec: 424889.99 +step 2239 | loss: 3.757761 | lr 8.7989e-05 | norm: 0.2631 | dt: 1232.39ms | tok/sec: 425422.31 +step 2240 | loss: 3.794073 | lr 8.7779e-05 | norm: 0.3118 | dt: 1237.59ms | tok/sec: 423635.91 +step 2241 | loss: 3.785238 | lr 8.7569e-05 | norm: 0.2739 | dt: 1237.12ms | tok/sec: 423796.99 +step 2242 | loss: 3.702050 | lr 8.7360e-05 | norm: 0.2712 | dt: 1235.38ms | tok/sec: 424395.12 +step 2243 | loss: 3.801972 | lr 8.7152e-05 | norm: 0.2775 | dt: 1235.06ms | tok/sec: 424502.93 +step 2244 | loss: 3.800313 | lr 8.6945e-05 | norm: 0.2742 | dt: 1236.29ms | tok/sec: 424082.39 +step 2245 | loss: 3.789619 | lr 8.6738e-05 | norm: 0.2822 | dt: 1232.18ms | tok/sec: 425496.81 +step 2246 | loss: 3.788414 | lr 8.6533e-05 | norm: 0.2608 | dt: 1232.66ms | tok/sec: 425330.73 +step 2247 | loss: 3.868347 | lr 8.6328e-05 | norm: 0.3177 | dt: 1232.56ms | tok/sec: 425365.20 +step 2248 | loss: 3.758946 | lr 8.6123e-05 | norm: 0.2912 | dt: 1231.13ms | tok/sec: 425860.36 +step 2249 | loss: 3.806669 | lr 8.5920e-05 | norm: 0.3544 | dt: 1230.79ms | tok/sec: 425977.01 +validation loss: 3.7862 +validation perplexity: 44.0904 +step 2250 | loss: 3.764990 | lr 8.5717e-05 | norm: 0.2684 | dt: 2697.65ms | tok/sec: 194349.68 +step 2251 | loss: 3.771467 | lr 8.5515e-05 | norm: 0.3014 | dt: 1230.82ms | tok/sec: 425968.02 +step 2252 | loss: 3.779455 | lr 8.5314e-05 | norm: 0.2988 | dt: 1238.15ms | tok/sec: 423444.21 +step 2253 | loss: 3.822761 | lr 8.5113e-05 | norm: 0.3028 | dt: 1235.11ms | tok/sec: 424486.46 +step 2254 | loss: 3.802416 | lr 8.4913e-05 | norm: 0.2816 | dt: 1236.88ms | tok/sec: 423880.32 +step 2255 | loss: 3.755500 | lr 8.4714e-05 | norm: 0.2860 | dt: 1237.20ms | tok/sec: 423768.98 +step 2256 | loss: 3.739339 | lr 8.4516e-05 | norm: 0.2834 | dt: 1233.43ms | tok/sec: 425066.41 +step 2257 | loss: 3.776545 | lr 8.4319e-05 | norm: 0.2919 | dt: 1235.11ms | tok/sec: 424488.18 +step 2258 | loss: 3.778099 | lr 8.4122e-05 | norm: 0.2967 | dt: 1235.74ms | tok/sec: 424269.84 +step 2259 | loss: 3.808932 | lr 8.3926e-05 | norm: 0.2586 | dt: 1238.93ms | tok/sec: 423178.07 +step 2260 | loss: 3.773490 | lr 8.3731e-05 | norm: 0.2816 | dt: 1236.94ms | tok/sec: 423860.30 +step 2261 | loss: 3.811203 | lr 8.3536e-05 | norm: 0.2440 | dt: 1236.35ms | tok/sec: 424062.03 +step 2262 | loss: 3.782438 | lr 8.3343e-05 | norm: 0.2669 | dt: 1235.99ms | tok/sec: 424183.99 +step 2263 | loss: 3.791943 | lr 8.3150e-05 | norm: 0.2580 | dt: 1234.62ms | tok/sec: 424656.64 +step 2264 | loss: 3.777011 | lr 8.2958e-05 | norm: 0.2442 | dt: 1234.70ms | tok/sec: 424627.45 +step 2265 | loss: 3.797772 | lr 8.2766e-05 | norm: 0.2505 | dt: 1239.20ms | tok/sec: 423086.39 +step 2266 | loss: 3.764022 | lr 8.2576e-05 | norm: 0.2431 | dt: 1238.32ms | tok/sec: 423387.06 +step 2267 | loss: 3.777492 | lr 8.2386e-05 | norm: 0.2420 | dt: 1234.78ms | tok/sec: 424600.06 +step 2268 | loss: 3.765334 | lr 8.2197e-05 | norm: 0.2234 | dt: 1233.35ms | tok/sec: 425090.98 +step 2269 | loss: 3.770276 | lr 8.2009e-05 | norm: 0.2371 | dt: 1237.43ms | tok/sec: 423690.92 +step 2270 | loss: 3.743004 | lr 8.1821e-05 | norm: 0.2280 | dt: 1235.49ms | tok/sec: 424354.99 +step 2271 | loss: 3.815142 | lr 8.1634e-05 | norm: 0.2471 | dt: 1234.94ms | tok/sec: 424546.21 +step 2272 | loss: 3.732862 | lr 8.1448e-05 | norm: 0.2713 | dt: 1233.64ms | tok/sec: 424993.62 +step 2273 | loss: 3.730464 | lr 8.1263e-05 | norm: 0.2488 | dt: 1233.79ms | tok/sec: 424942.29 +step 2274 | loss: 3.803696 | lr 8.1079e-05 | norm: 0.2552 | dt: 1234.61ms | tok/sec: 424660.00 +validation loss: 3.7857 +validation perplexity: 44.0673 +step 2275 | loss: 3.770863 | lr 8.0895e-05 | norm: 0.2524 | dt: 2687.99ms | tok/sec: 195048.26 +step 2276 | loss: 3.728756 | lr 8.0712e-05 | norm: 0.2607 | dt: 1233.27ms | tok/sec: 425121.71 +step 2277 | loss: 3.794904 | lr 8.0530e-05 | norm: 0.2415 | dt: 1233.54ms | tok/sec: 425028.62 +step 2278 | loss: 3.826924 | lr 8.0348e-05 | norm: 0.2545 | dt: 1234.10ms | tok/sec: 424834.75 +step 2279 | loss: 3.789232 | lr 8.0168e-05 | norm: 0.2432 | dt: 1232.33ms | tok/sec: 425445.52 +step 2280 | loss: 3.774305 | lr 7.9988e-05 | norm: 0.2493 | dt: 1237.99ms | tok/sec: 423498.84 +step 2281 | loss: 3.786320 | lr 7.9809e-05 | norm: 0.2604 | dt: 1235.95ms | tok/sec: 424196.92 +step 2282 | loss: 3.797111 | lr 7.9631e-05 | norm: 0.2469 | dt: 1236.39ms | tok/sec: 424047.31 +step 2283 | loss: 3.784977 | lr 7.9453e-05 | norm: 0.2622 | dt: 1236.12ms | tok/sec: 424139.07 +step 2284 | loss: 3.779605 | lr 7.9276e-05 | norm: 0.2297 | dt: 1235.56ms | tok/sec: 424332.06 +step 2285 | loss: 3.825650 | lr 7.9101e-05 | norm: 0.2903 | dt: 1239.02ms | tok/sec: 423147.86 +step 2286 | loss: 3.768232 | lr 7.8925e-05 | norm: 0.2735 | dt: 1240.07ms | tok/sec: 422788.27 +step 2287 | loss: 3.947362 | lr 7.8751e-05 | norm: 0.3627 | dt: 1234.54ms | tok/sec: 424684.44 +step 2288 | loss: 3.785933 | lr 7.8577e-05 | norm: 0.2822 | dt: 1232.46ms | tok/sec: 425400.59 +step 2289 | loss: 3.821084 | lr 7.8405e-05 | norm: 0.2640 | dt: 1232.24ms | tok/sec: 425476.39 +step 2290 | loss: 3.769676 | lr 7.8232e-05 | norm: 0.3323 | dt: 1235.77ms | tok/sec: 424261.00 +step 2291 | loss: 3.852083 | lr 7.8061e-05 | norm: 0.2692 | dt: 1238.08ms | tok/sec: 423467.28 +step 2292 | loss: 3.773879 | lr 7.7891e-05 | norm: 0.2574 | dt: 1233.75ms | tok/sec: 424955.02 +step 2293 | loss: 3.839136 | lr 7.7721e-05 | norm: 0.2828 | dt: 1234.93ms | tok/sec: 424550.14 +step 2294 | loss: 3.838897 | lr 7.7552e-05 | norm: 0.2578 | dt: 1236.40ms | tok/sec: 424042.65 +step 2295 | loss: 3.816273 | lr 7.7384e-05 | norm: 0.2445 | dt: 1236.31ms | tok/sec: 424073.80 +step 2296 | loss: 3.822213 | lr 7.7217e-05 | norm: 0.2778 | dt: 1233.96ms | tok/sec: 424883.26 +step 2297 | loss: 3.787373 | lr 7.7050e-05 | norm: 0.2403 | dt: 1232.94ms | tok/sec: 425232.28 +step 2298 | loss: 3.777983 | lr 7.6884e-05 | norm: 0.2370 | dt: 1238.28ms | tok/sec: 423398.88 +step 2299 | loss: 3.767390 | lr 7.6719e-05 | norm: 0.2621 | dt: 1234.81ms | tok/sec: 424590.80 +validation loss: 3.7840 +validation perplexity: 43.9900 +step 2300 | loss: 3.770551 | lr 7.6555e-05 | norm: 0.2530 | dt: 2689.67ms | tok/sec: 194926.41 +step 2301 | loss: 3.810054 | lr 7.6392e-05 | norm: 0.2467 | dt: 1236.59ms | tok/sec: 423980.10 +step 2302 | loss: 3.712697 | lr 7.6229e-05 | norm: 0.2732 | dt: 1234.87ms | tok/sec: 424569.32 +step 2303 | loss: 3.727605 | lr 7.6067e-05 | norm: 0.2397 | dt: 1239.02ms | tok/sec: 423147.29 +step 2304 | loss: 3.798949 | lr 7.5906e-05 | norm: 0.2474 | dt: 1233.20ms | tok/sec: 425143.99 +step 2305 | loss: 3.800185 | lr 7.5746e-05 | norm: 0.2520 | dt: 1234.86ms | tok/sec: 424573.34 +step 2306 | loss: 3.751740 | lr 7.5586e-05 | norm: 0.2308 | dt: 1236.51ms | tok/sec: 424006.10 +step 2307 | loss: 3.742714 | lr 7.5427e-05 | norm: 0.2286 | dt: 1235.99ms | tok/sec: 424183.66 +step 2308 | loss: 3.746845 | lr 7.5269e-05 | norm: 0.2293 | dt: 1238.09ms | tok/sec: 423464.76 +step 2309 | loss: 3.768311 | lr 7.5112e-05 | norm: 0.2206 | dt: 1234.58ms | tok/sec: 424668.69 +step 2310 | loss: 3.811019 | lr 7.4956e-05 | norm: 0.2447 | dt: 1235.72ms | tok/sec: 424277.45 +step 2311 | loss: 3.803897 | lr 7.4800e-05 | norm: 0.2574 | dt: 1856.91ms | tok/sec: 282344.29 +step 2312 | loss: 3.784559 | lr 7.4646e-05 | norm: 0.2422 | dt: 1237.52ms | tok/sec: 423660.15 +step 2313 | loss: 3.847715 | lr 7.4492e-05 | norm: 0.2530 | dt: 1237.55ms | tok/sec: 423649.54 +step 2314 | loss: 3.811624 | lr 7.4338e-05 | norm: 0.2488 | dt: 1238.35ms | tok/sec: 423377.68 +step 2315 | loss: 3.735158 | lr 7.4186e-05 | norm: 0.2431 | dt: 1236.41ms | tok/sec: 424040.19 +step 2316 | loss: 3.777252 | lr 7.4034e-05 | norm: 0.2477 | dt: 1236.21ms | tok/sec: 424109.95 +step 2317 | loss: 3.784209 | lr 7.3884e-05 | norm: 0.2475 | dt: 1235.30ms | tok/sec: 424422.89 +step 2318 | loss: 3.815646 | lr 7.3734e-05 | norm: 0.2542 | dt: 1233.49ms | tok/sec: 425044.23 +step 2319 | loss: 3.797773 | lr 7.3584e-05 | norm: 0.2529 | dt: 1235.95ms | tok/sec: 424198.97 +step 2320 | loss: 3.759415 | lr 7.3436e-05 | norm: 0.2378 | dt: 1236.47ms | tok/sec: 424021.31 +step 2321 | loss: 3.703044 | lr 7.3288e-05 | norm: 0.2372 | dt: 1236.52ms | tok/sec: 424003.81 +step 2322 | loss: 3.794204 | lr 7.3141e-05 | norm: 0.2765 | dt: 1236.74ms | tok/sec: 423926.57 +step 2323 | loss: 3.812684 | lr 7.2995e-05 | norm: 0.2499 | dt: 1235.52ms | tok/sec: 424345.49 +step 2324 | loss: 3.790573 | lr 7.2850e-05 | norm: 0.2415 | dt: 1239.82ms | tok/sec: 422873.96 +validation loss: 3.7816 +validation perplexity: 43.8845 +step 2325 | loss: 3.838848 | lr 7.2706e-05 | norm: 0.2519 | dt: 2692.86ms | tok/sec: 194695.54 +step 2326 | loss: 3.838377 | lr 7.2562e-05 | norm: 0.2473 | dt: 1237.17ms | tok/sec: 423780.98 +step 2327 | loss: 3.851518 | lr 7.2419e-05 | norm: 0.2774 | dt: 1232.02ms | tok/sec: 425552.23 +step 2328 | loss: 3.834009 | lr 7.2277e-05 | norm: 0.2532 | dt: 1234.11ms | tok/sec: 424831.71 +step 2329 | loss: 3.884616 | lr 7.2136e-05 | norm: 0.2611 | dt: 1235.47ms | tok/sec: 424362.93 +step 2330 | loss: 3.802889 | lr 7.1995e-05 | norm: 0.2694 | dt: 1235.64ms | tok/sec: 424305.86 +step 2331 | loss: 3.821551 | lr 7.1856e-05 | norm: 0.2409 | dt: 1239.93ms | tok/sec: 422838.10 +step 2332 | loss: 3.830666 | lr 7.1717e-05 | norm: 0.2573 | dt: 1234.05ms | tok/sec: 424852.64 +step 2333 | loss: 3.817981 | lr 7.1579e-05 | norm: 0.2589 | dt: 1237.69ms | tok/sec: 423602.70 +step 2334 | loss: 3.760583 | lr 7.1441e-05 | norm: 0.2572 | dt: 1233.43ms | tok/sec: 425065.18 +step 2335 | loss: 3.786437 | lr 7.1305e-05 | norm: 0.2565 | dt: 1234.23ms | tok/sec: 424790.52 +step 2336 | loss: 3.777277 | lr 7.1169e-05 | norm: 0.2501 | dt: 1236.04ms | tok/sec: 424166.15 +step 2337 | loss: 3.762483 | lr 7.1034e-05 | norm: 0.2571 | dt: 1240.06ms | tok/sec: 422793.96 +step 2338 | loss: 3.780051 | lr 7.0900e-05 | norm: 0.2141 | dt: 1237.99ms | tok/sec: 423497.70 +step 2339 | loss: 3.702289 | lr 7.0767e-05 | norm: 0.2436 | dt: 1235.17ms | tok/sec: 424465.65 +step 2340 | loss: 3.772983 | lr 7.0635e-05 | norm: 0.2918 | dt: 1236.11ms | tok/sec: 424144.72 +step 2341 | loss: 3.782083 | lr 7.0503e-05 | norm: 0.2490 | dt: 1236.19ms | tok/sec: 424115.19 +step 2342 | loss: 3.764787 | lr 7.0372e-05 | norm: 0.2329 | dt: 1235.04ms | tok/sec: 424510.06 +step 2343 | loss: 3.776157 | lr 7.0242e-05 | norm: 0.2392 | dt: 1232.21ms | tok/sec: 425487.34 +step 2344 | loss: 3.828331 | lr 7.0113e-05 | norm: 0.2756 | dt: 1237.69ms | tok/sec: 423602.94 +step 2345 | loss: 3.775594 | lr 6.9984e-05 | norm: 0.2473 | dt: 1234.84ms | tok/sec: 424579.08 +step 2346 | loss: 3.797185 | lr 6.9857e-05 | norm: 0.2709 | dt: 1236.87ms | tok/sec: 423883.91 +step 2347 | loss: 3.776797 | lr 6.9730e-05 | norm: 0.2673 | dt: 1237.13ms | tok/sec: 423794.46 +step 2348 | loss: 3.789246 | lr 6.9604e-05 | norm: 0.2711 | dt: 1240.15ms | tok/sec: 422760.39 +step 2349 | loss: 3.784328 | lr 6.9479e-05 | norm: 0.2545 | dt: 1235.36ms | tok/sec: 424401.43 +validation loss: 3.7808 +validation perplexity: 43.8530 +step 2350 | loss: 3.830515 | lr 6.9354e-05 | norm: 0.2786 | dt: 2689.67ms | tok/sec: 194926.28 +step 2351 | loss: 3.735385 | lr 6.9231e-05 | norm: 0.2527 | dt: 1235.56ms | tok/sec: 424333.86 +step 2352 | loss: 3.828375 | lr 6.9108e-05 | norm: 0.2501 | dt: 1233.24ms | tok/sec: 425128.86 +step 2353 | loss: 3.753122 | lr 6.8986e-05 | norm: 0.2461 | dt: 1235.32ms | tok/sec: 424414.12 +step 2354 | loss: 3.754469 | lr 6.8865e-05 | norm: 0.2358 | dt: 1235.02ms | tok/sec: 424518.34 +step 2355 | loss: 3.773690 | lr 6.8744e-05 | norm: 0.2542 | dt: 1236.73ms | tok/sec: 423929.43 +step 2356 | loss: 3.840383 | lr 6.8625e-05 | norm: 0.2719 | dt: 1235.89ms | tok/sec: 424218.85 +step 2357 | loss: 3.821913 | lr 6.8506e-05 | norm: 0.2500 | dt: 1235.68ms | tok/sec: 424291.78 +step 2358 | loss: 3.789783 | lr 6.8388e-05 | norm: 0.2779 | dt: 1233.93ms | tok/sec: 424893.60 +step 2359 | loss: 3.800432 | lr 6.8271e-05 | norm: 0.2399 | dt: 1234.36ms | tok/sec: 424746.13 +step 2360 | loss: 3.933139 | lr 6.8155e-05 | norm: 0.3341 | dt: 1237.11ms | tok/sec: 423799.85 +step 2361 | loss: 3.826661 | lr 6.8039e-05 | norm: 0.2540 | dt: 1232.91ms | tok/sec: 425244.70 +step 2362 | loss: 3.804511 | lr 6.7925e-05 | norm: 0.2837 | dt: 1237.20ms | tok/sec: 423770.12 +step 2363 | loss: 3.787045 | lr 6.7811e-05 | norm: 0.2403 | dt: 1235.25ms | tok/sec: 424438.70 +step 2364 | loss: 3.756790 | lr 6.7698e-05 | norm: 0.2794 | dt: 1235.72ms | tok/sec: 424275.65 +step 2365 | loss: 3.788090 | lr 6.7585e-05 | norm: 0.2646 | dt: 1234.14ms | tok/sec: 424819.24 +step 2366 | loss: 3.770043 | lr 6.7474e-05 | norm: 0.2734 | dt: 1236.56ms | tok/sec: 423989.75 +step 2367 | loss: 3.799750 | lr 6.7363e-05 | norm: 0.2453 | dt: 1237.58ms | tok/sec: 423641.05 +step 2368 | loss: 3.810210 | lr 6.7254e-05 | norm: 0.2638 | dt: 1236.54ms | tok/sec: 423997.02 +step 2369 | loss: 3.756329 | lr 6.7145e-05 | norm: 0.2749 | dt: 1233.94ms | tok/sec: 424889.17 +step 2370 | loss: 3.751044 | lr 6.7036e-05 | norm: 0.2710 | dt: 1233.08ms | tok/sec: 425184.26 +step 2371 | loss: 3.833463 | lr 6.6929e-05 | norm: 0.3284 | dt: 1234.68ms | tok/sec: 424633.43 +step 2372 | loss: 3.796353 | lr 6.6822e-05 | norm: 0.2822 | dt: 1237.19ms | tok/sec: 423771.59 +step 2373 | loss: 3.768484 | lr 6.6717e-05 | norm: 0.3015 | dt: 1235.61ms | tok/sec: 424315.11 +step 2374 | loss: 3.766057 | lr 6.6612e-05 | norm: 0.2674 | dt: 1234.26ms | tok/sec: 424778.21 +validation loss: 3.7811 +validation perplexity: 43.8654 +step 2375 | loss: 3.808656 | lr 6.6508e-05 | norm: 0.2867 | dt: 2682.39ms | tok/sec: 195455.41 +step 2376 | loss: 3.897954 | lr 6.6404e-05 | norm: 0.2936 | dt: 1237.54ms | tok/sec: 423652.15 +step 2377 | loss: 3.777468 | lr 6.6302e-05 | norm: 0.2537 | dt: 1232.80ms | tok/sec: 425282.12 +step 2378 | loss: 3.830704 | lr 6.6200e-05 | norm: 0.2626 | dt: 1236.34ms | tok/sec: 424065.54 +step 2379 | loss: 3.778040 | lr 6.6099e-05 | norm: 0.2743 | dt: 1235.56ms | tok/sec: 424333.04 +step 2380 | loss: 3.842057 | lr 6.5999e-05 | norm: 0.2399 | dt: 1235.46ms | tok/sec: 424367.19 +step 2381 | loss: 3.750406 | lr 6.5900e-05 | norm: 0.2518 | dt: 1235.40ms | tok/sec: 424388.65 +step 2382 | loss: 3.781203 | lr 6.5802e-05 | norm: 0.2845 | dt: 1233.31ms | tok/sec: 425106.84 +step 2383 | loss: 3.791243 | lr 6.5704e-05 | norm: 0.2707 | dt: 1233.93ms | tok/sec: 424894.34 +step 2384 | loss: 3.759757 | lr 6.5607e-05 | norm: 0.2521 | dt: 1234.89ms | tok/sec: 424561.78 +step 2385 | loss: 3.811032 | lr 6.5511e-05 | norm: 0.2471 | dt: 1232.57ms | tok/sec: 425360.76 +step 2386 | loss: 3.795084 | lr 6.5416e-05 | norm: 0.3167 | dt: 1234.71ms | tok/sec: 424622.86 +step 2387 | loss: 3.850036 | lr 6.5322e-05 | norm: 0.3392 | dt: 1233.84ms | tok/sec: 424922.09 +step 2388 | loss: 3.814170 | lr 6.5229e-05 | norm: 0.2751 | dt: 1234.62ms | tok/sec: 424654.26 +step 2389 | loss: 3.740597 | lr 6.5136e-05 | norm: 0.2497 | dt: 1233.90ms | tok/sec: 424904.28 +step 2390 | loss: 3.755968 | lr 6.5044e-05 | norm: 0.2692 | dt: 1232.10ms | tok/sec: 425522.58 +step 2391 | loss: 3.789952 | lr 6.4953e-05 | norm: 0.3118 | dt: 1235.09ms | tok/sec: 424494.74 +step 2392 | loss: 3.813070 | lr 6.4863e-05 | norm: 0.2719 | dt: 1236.45ms | tok/sec: 424026.05 +step 2393 | loss: 3.764839 | lr 6.4774e-05 | norm: 0.2913 | dt: 1233.89ms | tok/sec: 424905.92 +step 2394 | loss: 3.784952 | lr 6.4685e-05 | norm: 0.3145 | dt: 1234.34ms | tok/sec: 424751.79 +step 2395 | loss: 3.786369 | lr 6.4597e-05 | norm: 0.2481 | dt: 1232.91ms | tok/sec: 425245.11 +step 2396 | loss: 3.756527 | lr 6.4510e-05 | norm: 0.2519 | dt: 1233.24ms | tok/sec: 425130.59 +step 2397 | loss: 3.822416 | lr 6.4424e-05 | norm: 0.2832 | dt: 1238.61ms | tok/sec: 423288.28 +step 2398 | loss: 3.810754 | lr 6.4339e-05 | norm: 0.2537 | dt: 1237.69ms | tok/sec: 423602.70 +step 2399 | loss: 3.777400 | lr 6.4255e-05 | norm: 0.2342 | dt: 1230.44ms | tok/sec: 426098.51 +validation loss: 3.7791 +validation perplexity: 43.7752 +step 2400 | loss: 3.767545 | lr 6.4171e-05 | norm: 0.2452 | dt: 2680.40ms | tok/sec: 195600.68 +step 2401 | loss: 3.793176 | lr 6.4088e-05 | norm: 0.2554 | dt: 1234.71ms | tok/sec: 424625.15 +step 2402 | loss: 3.830543 | lr 6.4006e-05 | norm: 0.2442 | dt: 1232.81ms | tok/sec: 425277.18 +step 2403 | loss: 3.773710 | lr 6.3925e-05 | norm: 0.2312 | dt: 1233.49ms | tok/sec: 425044.72 +step 2404 | loss: 3.843570 | lr 6.3845e-05 | norm: 0.2379 | dt: 1232.00ms | tok/sec: 425557.41 +step 2405 | loss: 3.798037 | lr 6.3765e-05 | norm: 0.2530 | dt: 1239.79ms | tok/sec: 422885.10 +step 2406 | loss: 3.724931 | lr 6.3687e-05 | norm: 0.2334 | dt: 1237.49ms | tok/sec: 423670.19 +step 2407 | loss: 3.756518 | lr 6.3609e-05 | norm: 0.2248 | dt: 1235.46ms | tok/sec: 424367.35 +step 2408 | loss: 3.776959 | lr 6.3532e-05 | norm: 0.2751 | dt: 1236.22ms | tok/sec: 424107.25 +step 2409 | loss: 3.776130 | lr 6.3456e-05 | norm: 0.2345 | dt: 1230.97ms | tok/sec: 425913.57 +step 2410 | loss: 3.755063 | lr 6.3380e-05 | norm: 0.2308 | dt: 1234.84ms | tok/sec: 424580.06 +step 2411 | loss: 3.755984 | lr 6.3306e-05 | norm: 0.2279 | dt: 1233.29ms | tok/sec: 425114.64 +step 2412 | loss: 3.731514 | lr 6.3232e-05 | norm: 0.2244 | dt: 1234.82ms | tok/sec: 424588.09 +step 2413 | loss: 3.796452 | lr 6.3159e-05 | norm: 0.2288 | dt: 1236.29ms | tok/sec: 424081.16 +step 2414 | loss: 3.737607 | lr 6.3087e-05 | norm: 0.2247 | dt: 1233.32ms | tok/sec: 425101.33 +step 2415 | loss: 3.785713 | lr 6.3016e-05 | norm: 0.2180 | dt: 1232.34ms | tok/sec: 425440.59 +step 2416 | loss: 3.736984 | lr 6.2945e-05 | norm: 0.2225 | dt: 1231.82ms | tok/sec: 425620.01 +step 2417 | loss: 3.841328 | lr 6.2876e-05 | norm: 0.2550 | dt: 1233.39ms | tok/sec: 425078.57 +step 2418 | loss: 3.809939 | lr 6.2807e-05 | norm: 0.2547 | dt: 1237.58ms | tok/sec: 423639.42 +step 2419 | loss: 3.792015 | lr 6.2739e-05 | norm: 0.2500 | dt: 1236.30ms | tok/sec: 424079.36 +step 2420 | loss: 3.811259 | lr 6.2672e-05 | norm: 0.2453 | dt: 1235.46ms | tok/sec: 424366.78 +step 2421 | loss: 3.738984 | lr 6.2606e-05 | norm: 0.2548 | dt: 1231.46ms | tok/sec: 425744.61 +step 2422 | loss: 3.786507 | lr 6.2540e-05 | norm: 0.2542 | dt: 1234.47ms | tok/sec: 424708.64 +step 2423 | loss: 3.760263 | lr 6.2476e-05 | norm: 0.2438 | dt: 1235.50ms | tok/sec: 424352.53 +step 2424 | loss: 3.751554 | lr 6.2412e-05 | norm: 0.2414 | dt: 1235.30ms | tok/sec: 424420.27 +validation loss: 3.7772 +validation perplexity: 43.6942 +step 2425 | loss: 3.744011 | lr 6.2349e-05 | norm: 0.2315 | dt: 2683.24ms | tok/sec: 195393.70 +step 2426 | loss: 3.831644 | lr 6.2287e-05 | norm: 0.2535 | dt: 1232.47ms | tok/sec: 425397.13 +step 2427 | loss: 3.794119 | lr 6.2225e-05 | norm: 0.2288 | dt: 1234.95ms | tok/sec: 424541.70 +step 2428 | loss: 3.825758 | lr 6.2165e-05 | norm: 0.2476 | dt: 1233.54ms | tok/sec: 425028.37 +step 2429 | loss: 3.746689 | lr 6.2105e-05 | norm: 0.2463 | dt: 1233.87ms | tok/sec: 424912.24 +step 2430 | loss: 3.817559 | lr 6.2046e-05 | norm: 0.2677 | dt: 1233.00ms | tok/sec: 425213.53 +step 2431 | loss: 3.793001 | lr 6.1988e-05 | norm: 0.2528 | dt: 1236.73ms | tok/sec: 423931.31 +step 2432 | loss: 3.813324 | lr 6.1931e-05 | norm: 0.2258 | dt: 1237.41ms | tok/sec: 423698.11 +step 2433 | loss: 3.809974 | lr 6.1875e-05 | norm: 0.2370 | dt: 1234.89ms | tok/sec: 424562.35 +step 2434 | loss: 3.811116 | lr 6.1820e-05 | norm: 0.2522 | dt: 1235.59ms | tok/sec: 424321.25 +step 2435 | loss: 3.792212 | lr 6.1765e-05 | norm: 0.2551 | dt: 1234.51ms | tok/sec: 424694.20 +step 2436 | loss: 3.777019 | lr 6.1711e-05 | norm: 0.2404 | dt: 1236.04ms | tok/sec: 424167.22 +step 2437 | loss: 3.788423 | lr 6.1658e-05 | norm: 0.2548 | dt: 1239.13ms | tok/sec: 423110.90 +step 2438 | loss: 3.810727 | lr 6.1606e-05 | norm: 0.2567 | dt: 1234.26ms | tok/sec: 424778.86 +step 2439 | loss: 3.820361 | lr 6.1555e-05 | norm: 0.2419 | dt: 1232.49ms | tok/sec: 425388.98 +step 2440 | loss: 3.805113 | lr 6.1504e-05 | norm: 0.2404 | dt: 1234.98ms | tok/sec: 424532.60 +step 2441 | loss: 3.795601 | lr 6.1454e-05 | norm: 0.2582 | dt: 1234.52ms | tok/sec: 424689.85 +step 2442 | loss: 3.784097 | lr 6.1406e-05 | norm: 0.2331 | dt: 1234.74ms | tok/sec: 424613.43 +step 2443 | loss: 3.740152 | lr 6.1358e-05 | norm: 0.2278 | dt: 1237.11ms | tok/sec: 423800.99 +step 2444 | loss: 3.828813 | lr 6.1310e-05 | norm: 0.2494 | dt: 1233.48ms | tok/sec: 425048.74 +step 2445 | loss: 3.797647 | lr 6.1264e-05 | norm: 0.2460 | dt: 1235.04ms | tok/sec: 424509.24 +step 2446 | loss: 3.742680 | lr 6.1218e-05 | norm: 0.2346 | dt: 1230.44ms | tok/sec: 426096.78 +step 2447 | loss: 3.777825 | lr 6.1174e-05 | norm: 0.2368 | dt: 1233.77ms | tok/sec: 424948.62 +step 2448 | loss: 3.820624 | lr 6.1130e-05 | norm: 0.2352 | dt: 1234.68ms | tok/sec: 424634.09 +step 2449 | loss: 3.803969 | lr 6.1087e-05 | norm: 0.2433 | dt: 1233.84ms | tok/sec: 424924.07 +validation loss: 3.7777 +validation perplexity: 43.7172 +step 2450 | loss: 3.763089 | lr 6.1045e-05 | norm: 0.2347 | dt: 2679.49ms | tok/sec: 195667.34 +step 2451 | loss: 3.765787 | lr 6.1003e-05 | norm: 0.2456 | dt: 1238.23ms | tok/sec: 423418.77 +step 2452 | loss: 3.742214 | lr 6.0963e-05 | norm: 0.2398 | dt: 1236.55ms | tok/sec: 423991.47 +step 2453 | loss: 3.755149 | lr 6.0923e-05 | norm: 0.2379 | dt: 1233.26ms | tok/sec: 425124.51 +step 2454 | loss: 3.815914 | lr 6.0884e-05 | norm: 0.2607 | dt: 1233.72ms | tok/sec: 424964.88 +step 2455 | loss: 3.715799 | lr 6.0846e-05 | norm: 0.2378 | dt: 1238.55ms | tok/sec: 423307.67 +step 2456 | loss: 3.752185 | lr 6.0809e-05 | norm: 0.2549 | dt: 1234.71ms | tok/sec: 424622.69 +step 2457 | loss: 3.766255 | lr 6.0773e-05 | norm: 0.2363 | dt: 1235.49ms | tok/sec: 424354.83 +step 2458 | loss: 3.785966 | lr 6.0737e-05 | norm: 0.2426 | dt: 1232.97ms | tok/sec: 425223.81 +step 2459 | loss: 3.748286 | lr 6.0703e-05 | norm: 0.2369 | dt: 1235.79ms | tok/sec: 424252.65 +step 2460 | loss: 3.764948 | lr 6.0669e-05 | norm: 0.2387 | dt: 1238.24ms | tok/sec: 423414.94 +step 2461 | loss: 3.791641 | lr 6.0636e-05 | norm: 0.2493 | dt: 1234.50ms | tok/sec: 424695.02 +step 2462 | loss: 3.724435 | lr 6.0604e-05 | norm: 0.2392 | dt: 1236.62ms | tok/sec: 423967.92 +step 2463 | loss: 3.845265 | lr 6.0572e-05 | norm: 0.2738 | dt: 1231.80ms | tok/sec: 425626.36 +step 2464 | loss: 3.781408 | lr 6.0542e-05 | norm: 0.2420 | dt: 1234.98ms | tok/sec: 424530.39 +step 2465 | loss: 3.786151 | lr 6.0512e-05 | norm: 0.2582 | dt: 1233.64ms | tok/sec: 424993.38 +step 2466 | loss: 3.800663 | lr 6.0483e-05 | norm: 0.2512 | dt: 1232.06ms | tok/sec: 425537.90 +step 2467 | loss: 3.814377 | lr 6.0455e-05 | norm: 0.2339 | dt: 1240.18ms | tok/sec: 422750.80 +step 2468 | loss: 3.787941 | lr 6.0428e-05 | norm: 0.2462 | dt: 1234.40ms | tok/sec: 424731.69 +step 2469 | loss: 3.796092 | lr 6.0402e-05 | norm: 0.2530 | dt: 1237.66ms | tok/sec: 423612.90 +step 2470 | loss: 3.803588 | lr 6.0376e-05 | norm: 0.2377 | dt: 1234.56ms | tok/sec: 424675.91 +step 2471 | loss: 3.795648 | lr 6.0352e-05 | norm: 0.2423 | dt: 1235.88ms | tok/sec: 424220.73 +step 2472 | loss: 3.785455 | lr 6.0328e-05 | norm: 0.2362 | dt: 1235.39ms | tok/sec: 424392.01 +step 2473 | loss: 3.823287 | lr 6.0305e-05 | norm: 0.2293 | dt: 1233.22ms | tok/sec: 425137.57 +step 2474 | loss: 3.800354 | lr 6.0283e-05 | norm: 0.2435 | dt: 1236.31ms | tok/sec: 424075.52 +validation loss: 3.7761 +validation perplexity: 43.6450 +step 2475 | loss: 3.805627 | lr 6.0261e-05 | norm: 0.2534 | dt: 2692.39ms | tok/sec: 194729.57 +step 2476 | loss: 3.726729 | lr 6.0241e-05 | norm: 0.2350 | dt: 1234.25ms | tok/sec: 424782.06 +step 2477 | loss: 3.783800 | lr 6.0221e-05 | norm: 0.2517 | dt: 1238.08ms | tok/sec: 423467.45 +step 2478 | loss: 3.804862 | lr 6.0202e-05 | norm: 0.2469 | dt: 1235.32ms | tok/sec: 424414.12 +step 2479 | loss: 3.786509 | lr 6.0184e-05 | norm: 0.2276 | dt: 1236.60ms | tok/sec: 423976.51 +step 2480 | loss: 3.733209 | lr 6.0167e-05 | norm: 0.2286 | dt: 1235.35ms | tok/sec: 424405.85 +step 2481 | loss: 3.772811 | lr 6.0151e-05 | norm: 0.2394 | dt: 1237.71ms | tok/sec: 423594.37 +step 2482 | loss: 3.712527 | lr 6.0135e-05 | norm: 0.2505 | dt: 1234.77ms | tok/sec: 424603.51 +step 2483 | loss: 3.747961 | lr 6.0121e-05 | norm: 0.2359 | dt: 1233.70ms | tok/sec: 424970.79 +step 2484 | loss: 3.731586 | lr 6.0107e-05 | norm: 0.2327 | dt: 1232.94ms | tok/sec: 425235.41 +step 2485 | loss: 3.790689 | lr 6.0094e-05 | norm: 0.2322 | dt: 1233.79ms | tok/sec: 424941.23 +step 2486 | loss: 3.759753 | lr 6.0082e-05 | norm: 0.2424 | dt: 1233.36ms | tok/sec: 425088.92 +step 2487 | loss: 3.739746 | lr 6.0071e-05 | norm: 0.2670 | dt: 1234.59ms | tok/sec: 424666.73 +step 2488 | loss: 3.751466 | lr 6.0060e-05 | norm: 0.2749 | dt: 1235.67ms | tok/sec: 424294.07 +step 2489 | loss: 3.873823 | lr 6.0051e-05 | norm: 0.2938 | dt: 1236.00ms | tok/sec: 424181.70 +step 2490 | loss: 3.787562 | lr 6.0042e-05 | norm: 0.2680 | dt: 1234.66ms | tok/sec: 424642.04 +step 2491 | loss: 3.782930 | lr 6.0034e-05 | norm: 0.2670 | dt: 1237.85ms | tok/sec: 423547.62 +step 2492 | loss: 3.890589 | lr 6.0027e-05 | norm: 0.2855 | dt: 1235.63ms | tok/sec: 424307.83 +step 2493 | loss: 3.762928 | lr 6.0020e-05 | norm: 0.2525 | dt: 1234.92ms | tok/sec: 424551.21 +step 2494 | loss: 3.788917 | lr 6.0015e-05 | norm: 0.2495 | dt: 1238.61ms | tok/sec: 423285.84 +step 2495 | loss: 3.807387 | lr 6.0010e-05 | norm: 0.2982 | dt: 1235.92ms | tok/sec: 424207.88 +step 2496 | loss: 3.750281 | lr 6.0007e-05 | norm: 0.2638 | dt: 1235.88ms | tok/sec: 424222.04 +step 2497 | loss: 3.793504 | lr 6.0004e-05 | norm: 0.2417 | dt: 1235.26ms | tok/sec: 424436.57 +step 2498 | loss: 3.770416 | lr 6.0002e-05 | norm: 0.2720 | dt: 1235.11ms | tok/sec: 424485.23 +validation loss: 3.7746 +validation perplexity: 43.5804 +step 2499 | loss: 3.770425 | lr 6.0000e-05 | norm: 0.2775 | dt: 4505.48ms | tok/sec: 116366.80 +run-xzmoqbdx.wandb: 100%|██████████| 3.77M/3.77M [00:00<00:00, 5.43MB/s] +optimizer_02499.pt: 100%|██████████| 1.01G/1.01G [00:44<00:00, 22.7MB/s] +model_02499.pt: 100%|██████████| 558M/558M [01:04<00:00, 8.71MB/s]3MB/s] +run-xzmoqbdx.wandb: 94%|█████████▍| 3.54M/3.77M [00:00<00:00, 17.0MB/s] + +Upload 3 LFS files: 33%|███▎ | 1/3 [01:04<02:08, 64.46s/it] diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/requirements.txt b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fbe36c45bf94d514b4968f839b686e3d5101fd0 --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/requirements.txt @@ -0,0 +1,217 @@ +Brotli==1.1.0 +MarkupSafe==3.0.2 +PySocks==1.7.1 +PyYAML==6.0.2 +archspec==0.2.3 +asttokens==2.4.1 +astunparse==1.6.3 +attrs==24.2.0 +beautifulsoup4==4.12.3 +boltons==24.0.0 +certifi==2024.8.30 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.0 +click==8.1.7 +colorama==0.4.6 +conda==24.9.2 +conda-build==24.9.0 +conda_index==0.5.0 +conda-libmamba-solver==24.9.0 +conda-package-handling==2.4.0 +conda_package_streaming==0.11.0 +decorator==5.1.1 +distro==1.9.0 +dnspython==2.7.0 +exceptiongroup==1.2.2 +executing==2.1.0 +expecttest==0.2.1 +filelock==3.16.1 +frozendict==2.4.6 +h2==4.1.0 +hpack==4.0.0 +hyperframe==6.0.1 +hypothesis==6.115.5 +idna==3.10 +importlib_resources==6.4.5 +ipython==8.29.0 +jedi==0.19.1 +Jinja2==3.1.4 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +libarchive-c==5.1 +libmambapy==1.5.10 +lief==0.14.1 +lintrunner==0.12.5 +mamba==1.5.10 +matplotlib-inline==0.1.7 +menuinst==2.1.2 +more-itertools==10.5.0 +mpmath==1.3.0 +networkx==3.4.2 +ninja==1.11.1.1 +numpy==2.1.2 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +optree==0.13.0 +packaging==24.1 +parso==0.8.4 +pexpect==4.9.0 +pickleshare==0.7.5 +pillow==10.2.0 +pip==24.2 +pkginfo==1.11.2 +pkgutil_resolve_name==1.3.10 +platformdirs==4.3.6 +pluggy==1.5.0 +prompt_toolkit==3.0.48 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycosat==0.6.6 +pycparser==2.22 +Pygments==2.18.0 +python-etcd==0.4.5 +pytz==2024.2 +referencing==0.35.1 +requests==2.32.3 +rpds-py==0.20.0 +ruamel.yaml==0.18.6 +ruamel.yaml.clib==0.2.8 +setuptools==72.1.0 +six==1.16.0 +sortedcontainers==2.4.0 +soupsieve==2.5 +stack-data==0.6.2 +sympy==1.13.1 +torch==2.5.1+cu124 +torchaudio==2.5.1+cu124 +torchelastic==0.2.2 +torchvision==0.20.1+cu124 +tqdm==4.66.5 +traitlets==5.14.3 +triton==3.1.0 +truststore==0.9.2 +types-dataclasses==0.6.6 +typing_extensions==4.12.2 +urllib3==2.2.3 +wcwidth==0.2.13 +wheel==0.44.0 +zipp==3.20.2 +zstandard==0.23.0 +smmap==5.0.2 +setproctitle==1.3.4 +sentry-sdk==2.21.0 +pydantic_core==2.27.2 +protobuf==5.29.3 +docker-pycreds==0.4.0 +annotated-types==0.7.0 +pydantic==2.10.6 +gitdb==4.0.12 +GitPython==3.1.44 +wandb==0.19.6 +iniconfig==2.0.0 +pytest==8.3.4 +jupyter-archive==3.4.0 +nbzip==0.1.0 +webencodings==0.5.1 +filetype==1.2.0 +fastjsonschema==2.21.1 +widgetsnbextension==4.0.13 +websocket-client==1.8.0 +webcolors==24.11.1 +uri-template==1.3.0 +types-python-dateutil==2.9.0.20241206 +tornado==6.4.2 +tinycss2==1.4.0 +sniffio==1.3.1 +Send2Trash==1.8.3 +rfc3986-validator==0.1.1 +rfc3339-validator==0.1.4 +pyzmq==26.2.1 +python-json-logger==3.2.1 +prometheus_client==0.21.1 +pandocfilters==1.5.1 +overrides==7.7.0 +nest-asyncio==1.6.0 +mistune==3.1.1 +jupyterlab_widgets==3.0.13 +jupyterlab_pygments==0.3.0 +jupyter_core==5.7.2 +json5==0.10.0 +h11==0.14.0 +fqdn==1.5.1 +defusedxml==0.7.1 +debugpy==1.8.12 +comm==0.2.2 +bleach==6.2.0 +babel==2.17.0 +async-lru==2.0.4 +terminado==0.18.1 +jupyter_client==8.6.3 +httpcore==1.0.7 +arrow==1.3.0 +argon2-cffi-bindings==21.2.0 +anyio==4.8.0 +jupyter_server_terminals==0.5.3 +isoduration==20.11.0 +httpx==0.28.1 +argon2-cffi==23.1.0 +nbformat==5.10.4 +ipywidgets==8.1.5 +ipykernel==6.29.5 +nbclient==0.10.2 +jupyter-events==0.12.0 +jupyter-console==6.6.3 +bash_kernel==0.10.0 +nbconvert==7.16.6 +jupyter_server==2.15.0 +notebook_shim==0.2.4 +jupyterlab_server==2.27.3 +jupyter-lsp==2.2.5 +jupyterlab==4.3.5 +notebook==7.3.2 +jupyter-http-over-ws==0.0.8 +jupyter==1.1.1 +aiohappyeyeballs==2.4.6 +aiohttp==3.11.12 +aiosignal==1.3.2 +contourpy==1.3.1 +cycler==0.12.1 +datasets==3.2.0 +dill==0.3.8 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +huggingface-hub==0.28.1 +kiwisolver==1.4.8 +matplotlib==3.10.0 +multidict==6.1.0 +multiprocess==0.70.16 +pandas==2.2.3 +propcache==0.2.1 +pyarrow==19.0.0 +pyparsing==3.2.1 +python-dateutil==2.9.0.post0 +regex==2024.11.6 +safetensors==0.5.2 +sentencepiece==0.2.0 +tiktoken==0.8.0 +tokenizers==0.21.0 +transformers==4.48.3 +tzdata==2025.1 +vastai==0.2.8 +xxhash==3.5.0 +yarl==1.18.3 diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/wandb-metadata.json b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ee277adf9a77b6bda217e8ca8ae2597d09ab6390 --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.15.0-130-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2025-02-13T08:52:20.823062Z", + "args": [ + "--group", + "selective_surgery_3", + "--resume_checkpoint", + "hf://andrew-healey/context-compression/unselective_run_0/model_07500.pt", + "--max_steps", + "2500", + "--attention_kind", + "selective_with_memory_penalty", + "--log_dir", + "self_to_selective_run_1_restarted_with_memory_penalty", + "--add_a_head", + "--add_head_to_start", + "--new_head_init", + "ko_zero" + ], + "program": "-m context_compression.train", + "git": { + "remote": "https://github.com/andrew-healey/context-compression", + "commit": "1e8a433484c55130ed7ccbff384bcffb39d31ea1" + }, + "email": "doolie.healey@gmail.com", + "root": "self_to_selective_run_1_restarted_with_memory_penalty", + "host": "2ed8c74725ed", + "executable": "/opt/conda/bin/python3.11", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 8, + "disk": { + "/": { + "total": "222264557568", + "used": "2543665152" + } + }, + "memory": { + "total": "540670349312" + }, + "cpu": { + "count": 64, + "countLogical": 128 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + } + ], + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-core.log b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..aae91483069cc8df20cc587a568925fe3887501c --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-core.log @@ -0,0 +1,6 @@ +{"time":"2025-02-13T08:52:20.638106606Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmptkkkwtct/port-6274.txt","pid":6274,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-02-13T08:52:20.639962844Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":6274} +{"time":"2025-02-13T08:52:20.639948524Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44215,"Zone":""}} +{"time":"2025-02-13T08:52:20.815341994Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:60728"} +{"time":"2025-02-13T08:52:20.825843336Z","level":"INFO","msg":"handleInformInit: received","streamId":"xzmoqbdx","id":"127.0.0.1:60728"} +{"time":"2025-02-13T08:52:21.037351358Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"xzmoqbdx","id":"127.0.0.1:60728"} diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-internal.log b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b6da6d19448f7ab20f243a6f16f096830f8b46ce --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-02-13T08:52:20.82612412Z","level":"INFO","msg":"stream: starting","core version":"0.19.6","symlink path":"self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-core.log"} +{"time":"2025-02-13T08:52:21.03727827Z","level":"INFO","msg":"created new stream","id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037339038Z","level":"INFO","msg":"stream: started","id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037414046Z","level":"INFO","msg":"writer: Do: started","stream_id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037478925Z","level":"INFO","msg":"sender: started","stream_id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.037515744Z","level":"INFO","msg":"handler: started","stream_id":"xzmoqbdx"} +{"time":"2025-02-13T08:52:21.267231208Z","level":"INFO","msg":"Starting system monitor"} diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug.log b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8d71523eeef2967584a6b2d3533f5ce2061c4a6a --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug.log @@ -0,0 +1,22 @@ +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Current SDK version is 0.19.6 +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Configure stats pid to 6274 +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Loading settings from /root/.config/wandb/settings +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Loading settings from /workspace/context-compression/wandb/settings +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_setup.py:_flush():68] Loading settings from environment variables +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:setup_run_log_directory():637] Logging user logs to self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug.log +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:setup_run_log_directory():638] Logging internal logs to self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/logs/debug-internal.log +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:init():756] calling init triggers +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:init():761] wandb.init called with sweep_config: {} +config: {'hellaswag': True, 'attention_kind': , 'log_dir': 'self_to_selective_run_1_restarted_with_memory_penalty', 'resume_checkpoint': 'hf://andrew-healey/context-compression/unselective_run_0/model_07500.pt', 'resume_optimizer': False, 'add_a_head': True, 'add_head_to_start': True, 'new_head_init': , 'protect_bos_token': True, 'max_steps': 2500, 'group': 'selective_surgery_3', 'use_wandb': True, 'kill_self_after_run': False, '_wandb': {}} +2025-02-13 08:52:20,605 INFO MainThread:6274 [wandb_init.py:init():789] starting backend +2025-02-13 08:52:20,815 INFO MainThread:6274 [wandb_init.py:init():793] sending inform_init request +2025-02-13 08:52:20,822 INFO MainThread:6274 [backend.py:_multiprocessing_setup():97] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-02-13 08:52:20,822 INFO MainThread:6274 [wandb_init.py:init():808] backend started and connected +2025-02-13 08:52:20,824 INFO MainThread:6274 [wandb_init.py:init():901] updated telemetry +2025-02-13 08:52:20,833 INFO MainThread:6274 [wandb_init.py:init():936] communicating run to backend with 90.0 second timeout +2025-02-13 08:52:21,264 INFO MainThread:6274 [wandb_init.py:init():994] starting run threads in backend +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_console_start():2385] atexit reg +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_redirect():2235] redirect: wrap_raw +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_redirect():2300] Wrapping output streams. +2025-02-13 08:52:21,386 INFO MainThread:6274 [wandb_run.py:_redirect():2325] Redirects installed. +2025-02-13 08:52:21,388 INFO MainThread:6274 [wandb_init.py:init():1036] run started, returning control to user process diff --git a/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/run-xzmoqbdx.wandb b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/run-xzmoqbdx.wandb new file mode 100644 index 0000000000000000000000000000000000000000..14e746ce0f216e7a54bce0bdeb699d4dabf568bd --- /dev/null +++ b/self_to_selective_run_1_restarted_with_memory_penalty/wandb/run-20250213_085220-xzmoqbdx/run-xzmoqbdx.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a6159d53c19c6a23c6a71aa09a62446bec2a0d7e77a1d6627ba6fe904d86bf +size 3768320