diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/date_utc.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/date_utc.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5abc32c5653cdce065ab11bb9af9605c079803 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/date_utc.txt @@ -0,0 +1 @@ +2026-03-09 21:57:08 UTC diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/df_workspace.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/df_workspace.txt new file mode 100644 index 0000000000000000000000000000000000000000..7deaf90cd11d96328673604203b7c7d4464d931b --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/df_workspace.txt @@ -0,0 +1,2 @@ +Filesystem Size Used Avail Use% Mounted on +mfs#us-mo-1.runpod.net:9421 154T 127T 27T 83% /workspace diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/env_selected.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/env_selected.txt new file mode 100644 index 0000000000000000000000000000000000000000..0962895a29dee120429cea69bc0da8f7621240c7 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/env_selected.txt @@ -0,0 +1,9 @@ +HF_HOME= +HF_HUB_CACHE= +HF_DATASETS_CACHE= +HUGGINGFACE_HUB_CACHE= +XDG_CACHE_HOME= +OPENPI_LEROBOT_HOME= +PYTORCH_CUDA_ALLOC_CONF= +OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS= +TOKENIZERS_PARALLELISM= diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/nvidia_smi.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/nvidia_smi.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa35694ec83a8b0c8a4beb84b2429c89e79c6bfc --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/nvidia_smi.txt @@ -0,0 +1,32 @@ +Mon Mar 9 21:57:08 2026 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:3A:00.0 Off | 0 | +| N/A 26C P0 71W / 700W | 0MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 25C P0 72W / 700W | 0MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:9A:00.0 Off | 0 | +| N/A 25C P0 72W / 700W | 0MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 25C P0 70W / 700W | 0MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/nvidia_smi_topo.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/nvidia_smi_topo.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c4404d0ce8049ec42f1bbc77481ad6d9a633e5a --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/nvidia_smi_topo.txt @@ -0,0 +1,23 @@ + GPU0 GPU1 GPU2 GPU3 NIC0 NIC1 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X NV18 NV18 NV18 NODE NODE 0-51,104-155 0 N/A +GPU1 NV18 X NV18 NV18 NODE NODE 0-51,104-155 0 N/A +GPU2 NV18 NV18 X NV18 SYS SYS 52-103,156-207 1 N/A +GPU3 NV18 NV18 NV18 X SYS SYS 52-103,156-207 1 N/A +NIC0 NODE NODE SYS SYS X PIX +NIC1 NODE NODE SYS SYS PIX X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks + +NIC Legend: + + NIC0: mlx5_3 + NIC1: mlx5_4 + diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/pip_freeze.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/pip_freeze.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c4745e7a03a08c39908e832bbfb724ec84c5bbf --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/pip_freeze.txt @@ -0,0 +1,145 @@ +anyio==4.6.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +blinker==1.4 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +cryptography==3.4.8 +dbus-python==1.2.18 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +distro==1.7.0 +entrypoints==0.4 +executing==2.1.0 +fastjsonschema==2.20.0 +filelock==3.13.1 +fqdn==1.5.1 +fsspec==2024.2.0 +h11==0.14.0 +httpcore==1.0.5 +httplib2==0.20.2 +httpx==0.27.2 +idna==3.10 +importlib-metadata==4.6.4 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +jeepney==0.7.1 +Jinja2==3.1.3 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_client==7.4.9 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +lxml==5.3.0 +MarkupSafe==2.1.5 +matplotlib-inline==0.1.7 +mistune==3.0.2 +more-itertools==8.10.0 +mpmath==1.3.0 +nbclassic==1.1.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.2.1 +notebook==6.5.5 +notebook_shim==0.2.4 +numpy==1.26.3 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +oauthlib==3.2.0 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +pillow==10.2.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +pyparsing==2.4.7 +python-apt==2.4.0+ubuntu4 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +PyYAML==6.0.2 +pyzmq==24.0.1 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +SecretStorage==3.3.1 +Send2Trash==1.8.3 +six==1.16.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +sympy==1.12 +terminado==0.18.1 +tinycss2==1.3.0 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +tornado==6.4.1 +traitlets==5.14.3 +triton==3.0.0 +types-python-dateutil==2.9.0.20240906 +typing_extensions==4.9.0 +uri-template==1.3.0 +urllib3==2.2.3 +wadllib==1.3.6 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +zipp==1.0.0 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/python_version.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/python_version.txt new file mode 100644 index 0000000000000000000000000000000000000000..a01bd32d42c556ee855a5f16308dc56abea19f20 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/python_version.txt @@ -0,0 +1 @@ +Python 3.11.10 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/torch_env.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/torch_env.txt new file mode 100644 index 0000000000000000000000000000000000000000..e690e607d85077310ed3bbf576408875bd53b60c --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/torch_env.txt @@ -0,0 +1,10 @@ +python=3.11.10 +torch=2.7.1+cu126 +cuda=12.6 +cudnn=90501 +cuda_available=True +device_count=4 +device_0=NVIDIA H100 80GB HBM3 +device_1=NVIDIA H100 80GB HBM3 +device_2=NVIDIA H100 80GB HBM3 +device_3=NVIDIA H100 80GB HBM3 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/uname.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/uname.txt new file mode 100644 index 0000000000000000000000000000000000000000..789765c4cf679facafd530674b88349df3a6cb76 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/uname.txt @@ -0,0 +1 @@ +Linux 9a96de7d560b 6.8.0-90-generic #91-Ubuntu SMP PREEMPT_DYNAMIC Tue Nov 18 14:14:30 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/workspace_usage.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/workspace_usage.txt new file mode 100644 index 0000000000000000000000000000000000000000..e65ff81f3424e055eb15f53f1f550e61d6286b06 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/environment/workspace_usage.txt @@ -0,0 +1,5 @@ +391G /workspace/pi05tests-openpi-multiarm/openpi/checkpoints +26G /workspace/pi05tests-openpi-multiarm/artifacts +9.5G /workspace/checkpoints +23G /workspace/.hf +11G /workspace/lerobot diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/baseline_train_full.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/baseline_train_full.csv new file mode 100644 index 0000000000000000000000000000000000000000..d67be02b89afcbfda76ae3f95500c0bdbfa08052 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/baseline_train_full.csv @@ -0,0 +1,1001 @@ +model,ts,step,loss,smoothed,lr,grad_norm,step_time,data_time,its,eta,mem,grad_action_in_proj,grad_action_out_proj,grad_shared_expert +baseline,16:06:14.817,10,0.4808,1.0718,2.74e-07,10.6219,0.6644,0.1706,1.192,8378.4,35.23GB,0.0848,1.4254,3.0073 +baseline,16:06:21.116,20,0.8768,0.9957,7.73e-07,10.4006,0.5287,0.1012,1.588,6286.0,35.23GB,0.2188,1.6293,6.4112 +baseline,16:06:28.338,30,0.7311,0.8939,1.27e-06,5.5822,0.5907,0.1315,1.385,7198.9,35.23GB,0.1041,1.4849,3.7323 +baseline,16:06:35.416,40,0.6537,0.8119,1.77e-06,8.3512,0.5816,0.1263,1.413,7049.8,35.23GB,0.2733,1.6984,9.1250 +baseline,16:06:42.002,50,0.7034,0.8221,2.27e-06,6.1077,0.5544,0.1042,1.519,6551.9,35.23GB,0.1516,1.6697,5.3395 +baseline,16:06:48.550,60,0.5901,0.7609,2.77e-06,7.2780,0.5467,0.1082,1.527,6507.8,35.23GB,0.1455,1.8716,4.2327 +baseline,16:06:55.641,70,0.7597,0.7218,3.27e-06,7.8971,0.5780,0.1313,1.411,7039.9,35.23GB,0.2053,1.3199,7.5503 +baseline,16:07:02.003,80,1.1390,0.7339,3.77e-06,8.9468,0.5611,0.0749,1.572,6310.3,35.23GB,0.0902,1.4414,3.2057 +baseline,16:07:08.197,90,0.6515,0.6546,4.27e-06,4.7710,0.5247,0.0947,1.615,6137.8,35.23GB,0.0868,1.1201,3.5197 +baseline,16:07:15.048,100,0.5053,0.5609,4.77e-06,4.1018,0.5867,0.0985,1.460,6781.8,35.23GB,0.0844,0.9514,3.5740 +baseline,16:07:21.536,110,0.5214,0.5121,5.26e-06,3.2719,0.5534,0.0953,1.542,6415.5,35.23GB,0.0693,1.0125,3.3461 +baseline,16:07:28.906,120,0.4198,0.4590,5.76e-06,2.4553,0.5873,0.1496,1.357,7280.2,35.23GB,0.0542,0.8286,2.0604 +baseline,16:07:36.099,130,0.4511,0.3916,6.26e-06,2.0881,0.6059,0.1134,1.390,7098.6,35.23GB,0.0457,0.7509,1.8086 +baseline,16:07:42.229,140,0.2912,0.3669,6.76e-06,1.9907,0.5186,0.0944,1.632,6043.3,35.23GB,0.0398,0.6527,1.8515 +baseline,16:07:48.308,150,0.2916,0.3454,7.26e-06,1.9457,0.5315,0.0763,1.645,5986.8,35.23GB,0.0324,0.5896,1.2902 +baseline,16:07:54.640,160,0.4199,0.3232,7.76e-06,1.7847,0.5300,0.1033,1.579,6230.3,35.23GB,0.0430,0.5179,1.9623 +baseline,16:08:00.430,170,0.2704,0.3027,8.26e-06,1.5822,0.4903,0.0887,1.728,5690.2,35.23GB,0.0318,0.4689,1.2017 +baseline,16:08:06.120,180,0.1862,0.2797,8.76e-06,1.3466,0.5032,0.0658,1.758,5586.7,35.23GB,0.0238,0.4400,1.0950 +baseline,16:08:11.549,190,0.2303,0.2453,9.26e-06,1.4419,0.4812,0.0618,1.842,5325.1,35.23GB,0.0253,0.6144,1.3234 +baseline,16:08:17.145,200,0.1750,0.2360,9.76e-06,1.2892,0.5001,0.0597,1.787,5483.8,35.23GB,0.0209,0.4311,0.8700 +baseline,16:08:22.929,210,0.1886,0.2172,1.03e-05,1.3653,0.5213,0.0569,1.729,5660.9,35.23GB,0.0302,0.5334,1.1750 +baseline,16:08:28.576,220,0.1590,0.2260,1.08e-05,1.3652,0.5011,0.0638,1.771,5523.0,35.23GB,0.0250,0.4995,1.0182 +baseline,16:08:34.594,230,0.1846,0.2048,1.13e-05,1.2955,0.5341,0.0676,1.662,5878.6,35.23GB,0.0253,0.5097,1.5234 +baseline,16:08:40.277,240,0.1782,0.2099,1.18e-05,1.2624,0.5010,0.0673,1.760,5545.1,35.23GB,0.0196,0.3863,1.0330 +baseline,16:08:46.307,250,0.2441,0.1909,1.23e-05,1.1041,0.5325,0.0705,1.658,5878.9,35.23GB,0.0188,0.3542,0.9294 +baseline,16:08:52.126,260,0.2490,0.1881,1.27e-05,1.1599,0.5201,0.0618,1.719,5667.2,35.23GB,0.0170,0.3928,1.0919 +baseline,16:08:58.342,270,0.1128,0.1820,1.32e-05,1.2641,0.5438,0.0777,1.609,6046.6,35.23GB,0.0155,0.3229,1.3050 +baseline,16:09:05.051,280,0.1527,0.1818,1.37e-05,1.3426,0.5769,0.0940,1.491,6520.5,35.23GB,0.0182,0.3689,1.3518 +baseline,16:09:10.867,290,0.1310,0.1680,1.42e-05,1.2411,0.5185,0.0633,1.719,5647.1,35.23GB,0.0151,0.3208,0.8663 +baseline,16:09:16.658,300,0.0954,0.1365,1.47e-05,1.1318,0.5187,0.0605,1.727,5615.9,35.23GB,0.0156,0.3482,0.9836 +baseline,16:09:22.413,310,0.2194,0.1399,1.52e-05,1.1643,0.5161,0.0592,1.738,5575.9,35.23GB,0.0147,0.3349,1.0313 +baseline,16:09:27.785,320,0.1155,0.1331,1.57e-05,1.2539,0.4735,0.0637,1.862,5199.3,35.23GB,0.0167,0.3374,1.0258 +baseline,16:09:33.658,330,0.2135,0.1465,1.62e-05,1.1879,0.5259,0.0614,1.703,5678.0,35.23GB,0.0150,0.2842,1.1083 +baseline,16:09:39.550,340,0.0691,0.1319,1.67e-05,1.1252,0.5256,0.0635,1.698,5690.3,35.23GB,0.0180,0.3313,1.1754 +baseline,16:09:45.233,350,0.1980,0.1500,1.72e-05,1.5633,0.5111,0.0572,1.760,5483.6,35.23GB,0.0213,0.3695,1.1322 +baseline,16:09:51.336,360,0.1167,0.1251,1.77e-05,1.1671,0.5352,0.0751,1.639,5882.3,35.23GB,0.0176,0.2958,1.3792 +baseline,16:09:57.059,370,0.1412,0.1292,1.82e-05,1.2046,0.5016,0.0707,1.747,5511.1,35.23GB,0.0153,0.3944,1.2694 +baseline,16:10:03.229,380,0.2202,0.1342,1.87e-05,1.1815,0.5471,0.0698,1.621,5934.2,35.23GB,0.0134,0.3072,1.3295 +baseline,16:10:09.025,390,0.1092,0.1205,1.92e-05,1.0922,0.5165,0.0631,1.726,5568.9,35.23GB,0.0151,0.3023,0.9811 +baseline,16:10:15.444,400,0.1159,0.1174,1.97e-05,1.2524,0.5536,0.0883,1.558,6161.7,35.23GB,0.0135,0.3770,1.0751 +baseline,16:10:22.003,410,0.1224,0.1249,2.02e-05,1.0419,0.5681,0.0877,1.525,6289.2,35.23GB,0.0147,0.2860,0.7431 +baseline,16:10:28.060,420,0.0385,0.1015,2.07e-05,1.1034,0.5158,0.0899,1.651,5801.5,35.23GB,0.0152,0.2975,1.3564 +baseline,16:10:34.563,430,0.1755,0.1108,2.12e-05,1.0136,0.5632,0.0870,1.538,6221.9,35.23GB,0.0181,0.3146,0.8567 +baseline,16:10:40.408,440,0.0818,0.1022,2.17e-05,1.1021,0.5095,0.0751,1.711,5587.6,35.23GB,0.0111,0.2313,1.1863 +baseline,16:10:46.809,450,0.1229,0.1119,2.22e-05,1.2202,0.5486,0.0915,1.562,6112.2,35.23GB,0.0166,0.2964,1.0902 +baseline,16:10:53.308,460,0.0837,0.1069,2.27e-05,1.1519,0.5620,0.0879,1.539,6199.1,35.23GB,0.0181,0.3274,1.0420 +baseline,16:11:01.212,470,0.0904,0.0919,2.32e-05,1.1392,0.6335,0.1569,1.265,7531.2,35.23GB,0.0157,0.3160,0.9591 +baseline,16:11:09.273,480,0.0656,0.0852,2.37e-05,1.0968,0.6633,0.1429,1.241,7673.7,35.23GB,0.0177,0.3299,0.7443 +baseline,16:11:17.345,490,0.0466,0.0803,2.42e-05,1.0838,0.6374,0.1698,1.239,7675.4,35.23GB,0.0154,0.2872,1.0878 +baseline,16:11:25.243,500,0.1036,0.0796,2.47e-05,1.0678,0.6327,0.1571,1.266,7501.8,35.23GB,0.0165,0.4493,1.3475 +baseline,16:11:32.463,510,0.0863,0.0862,2.50e-05,1.1711,0.6012,0.1208,1.385,6850.5,35.23GB,0.0202,0.2762,1.2567 +baseline,16:11:38.633,520,0.0451,0.0860,2.50e-05,1.1597,0.5344,0.0826,1.621,5848.4,35.23GB,0.0160,0.2758,1.1836 +baseline,16:11:46.256,530,0.1078,0.0868,2.50e-05,1.1219,0.6456,0.1168,1.312,7218.0,35.23GB,0.0187,0.3030,1.1897 +baseline,16:11:53.274,540,0.1120,0.0907,2.50e-05,1.2555,0.5836,0.1181,1.425,6638.2,35.23GB,0.0310,0.3340,2.3064 +baseline,16:11:59.542,550,0.0546,0.0875,2.50e-05,1.0612,0.5611,0.0657,1.596,5922.0,35.23GB,0.0142,0.2215,0.8339 +baseline,16:12:05.689,560,0.0652,0.0924,2.50e-05,1.0336,0.5466,0.0680,1.627,5801.8,35.23GB,0.0142,0.2752,0.8258 +baseline,16:12:11.757,570,0.0686,0.0866,2.50e-05,1.0636,0.5340,0.0729,1.648,5721.4,35.23GB,0.0177,0.2981,1.0437 +baseline,16:12:18.893,580,0.0598,0.0789,2.50e-05,1.0377,0.6284,0.0851,1.402,6719.7,35.23GB,0.0164,0.3583,1.3082 +baseline,16:12:25.526,590,0.0335,0.0548,2.50e-05,1.0635,0.5751,0.0882,1.508,6241.3,35.23GB,0.0148,0.2413,0.7849 +baseline,16:12:31.894,600,0.0537,0.0693,2.50e-05,0.9775,0.5682,0.0686,1.571,5984.4,35.23GB,0.0140,0.2613,0.9165 +baseline,16:12:38.094,610,0.0481,0.0603,2.50e-05,0.8942,0.5496,0.0704,1.613,5821.0,35.23GB,0.0105,0.2159,0.8682 +baseline,16:12:44.187,620,0.0576,0.0635,2.50e-05,1.0371,0.5321,0.0773,1.641,5714.9,35.23GB,0.0173,0.2697,1.1307 +baseline,16:12:50.263,630,0.0884,0.0700,2.50e-05,1.0996,0.5479,0.0597,1.646,5691.9,35.23GB,0.0176,0.3307,1.0859 +baseline,16:12:56.369,640,0.0797,0.0722,2.50e-05,0.9442,0.5414,0.0692,1.638,5713.7,35.23GB,0.0170,0.2909,0.8321 +baseline,16:13:02.526,650,0.0199,0.0679,2.50e-05,0.9991,0.5500,0.0659,1.624,5756.5,35.23GB,0.0150,0.2647,0.9162 +baseline,16:13:08.622,660,0.0812,0.0757,2.50e-05,0.9777,0.5454,0.0639,1.641,5691.6,35.23GB,0.0191,0.2898,0.8168 +baseline,16:13:14.475,670,0.1366,0.0814,2.50e-05,0.8875,0.5173,0.0681,1.709,5459.7,35.23GB,0.0190,0.3491,1.1842 +baseline,16:13:20.743,680,0.0689,0.0740,2.50e-05,1.0066,0.5635,0.0636,1.596,5840.9,35.23GB,0.0217,0.2554,0.8847 +baseline,16:13:26.720,690,0.0596,0.0710,2.50e-05,0.9029,0.5358,0.0615,1.673,5563.6,35.23GB,0.0266,0.2651,0.6948 +baseline,16:13:32.808,700,0.0519,0.0741,2.50e-05,0.9250,0.5461,0.0627,1.643,5660.6,35.23GB,0.0156,0.2744,1.0106 +baseline,16:13:39.060,710,0.1381,0.0800,2.50e-05,0.9378,0.5469,0.0784,1.600,5807.4,35.23GB,0.0254,0.2963,0.7856 +baseline,16:13:45.989,720,0.1142,0.0876,2.50e-05,0.9654,0.5665,0.1263,1.444,6428.6,35.23GB,0.0246,0.2881,0.9950 +baseline,16:13:52.493,730,0.0594,0.0714,2.50e-05,0.9204,0.5852,0.0653,1.538,6029.0,35.23GB,0.0171,0.2624,0.6622 +baseline,16:13:58.885,740,0.0720,0.0790,2.50e-05,0.8624,0.5686,0.0705,1.565,5918.0,35.23GB,0.0164,0.2295,0.8288 +baseline,16:14:06.417,750,0.0666,0.0682,2.50e-05,1.0076,0.6363,0.1168,1.328,6965.4,35.23GB,0.0147,0.2541,0.7040 +baseline,16:14:12.451,760,0.0397,0.0635,2.50e-05,0.9689,0.5365,0.0670,1.658,5573.5,35.23GB,0.0161,0.3302,1.2185 +baseline,16:14:18.167,770,0.0711,0.0554,2.50e-05,0.7926,0.5093,0.0622,1.750,5274.8,35.23GB,0.0173,0.2588,0.8737 +baseline,16:14:24.131,780,0.1004,0.0557,2.50e-05,0.8729,0.5348,0.0617,1.677,5497.8,35.23GB,0.0220,0.2586,0.6249 +baseline,16:14:30.259,790,0.0554,0.0512,2.50e-05,0.7537,0.5468,0.0660,1.632,5642.2,35.23GB,0.0103,0.1754,0.6660 +baseline,16:14:36.471,800,0.0621,0.0499,2.49e-05,0.7899,0.5568,0.0643,1.610,5714.0,35.23GB,0.0204,0.2695,1.0071 +baseline,16:14:43.002,810,0.0486,0.0653,2.49e-05,0.9053,0.5776,0.0755,1.531,6000.7,35.23GB,0.0129,0.2052,1.1300 +baseline,16:14:49.498,820,0.0313,0.0618,2.49e-05,0.9123,0.5562,0.0934,1.540,5962.0,35.23GB,0.0110,0.1895,0.7138 +baseline,16:14:55.530,830,0.0387,0.0726,2.49e-05,1.1425,0.5483,0.0551,1.658,5531.0,35.23GB,0.0192,0.3457,1.8067 +baseline,16:15:01.586,840,0.0496,0.0697,2.49e-05,1.0460,0.5412,0.0644,1.652,5546.4,35.23GB,0.0211,0.2550,0.6379 +baseline,16:15:07.659,850,0.0416,0.0645,2.49e-05,0.8963,0.5510,0.0563,1.647,5555.4,35.23GB,0.0144,0.2312,0.7478 +baseline,16:15:13.590,860,0.0491,0.0626,2.49e-05,0.9694,0.5356,0.0576,1.686,5419.9,35.23GB,0.0113,0.2030,0.8971 +baseline,16:15:19.560,870,0.0493,0.0681,2.49e-05,0.8766,0.5297,0.0673,1.675,5449.8,35.23GB,0.0143,0.2057,0.9855 +baseline,16:15:26.063,880,0.0400,0.0534,2.49e-05,0.7844,0.5778,0.0725,1.538,5929.5,35.23GB,0.0118,0.2111,0.7729 +baseline,16:15:32.206,890,0.1179,0.0592,2.49e-05,0.8378,0.5487,0.0656,1.628,5595.7,35.23GB,0.0122,0.2095,1.0001 +baseline,16:15:38.669,900,0.0400,0.0536,2.49e-05,0.9191,0.5722,0.0741,1.548,5880.4,35.23GB,0.0155,0.2955,1.0742 +baseline,16:15:44.905,910,0.0323,0.0573,2.49e-05,0.8914,0.5672,0.0563,1.604,5667.7,35.23GB,0.0142,0.2381,0.8228 +baseline,16:15:50.997,920,0.0568,0.0545,2.49e-05,0.8295,0.5510,0.0583,1.642,5531.2,35.23GB,0.0141,0.1752,0.9500 +baseline,16:15:57.992,930,0.0156,0.0446,2.49e-05,0.7817,0.6001,0.0994,1.430,6343.6,35.23GB,0.0222,0.2680,0.6104 +baseline,16:16:04.552,940,0.0481,0.0470,2.49e-05,0.7171,0.5906,0.0654,1.525,5942.7,35.23GB,0.0152,0.2409,0.6225 +baseline,16:16:11.056,950,0.0185,0.0376,2.49e-05,0.7373,0.5799,0.0705,1.538,5885.4,35.23GB,0.0184,0.2401,0.5348 +baseline,16:16:17.306,960,0.0398,0.0460,2.49e-05,0.9076,0.5451,0.0799,1.600,5649.2,35.23GB,0.0171,0.2691,0.9090 +baseline,16:16:23.869,970,0.0776,0.0520,2.49e-05,0.7294,0.5808,0.0755,1.524,5924.7,35.23GB,0.0126,0.2392,1.0457 +baseline,16:16:30.061,980,0.0307,0.0591,2.49e-05,0.6984,0.5582,0.0610,1.615,5584.3,35.23GB,0.0160,0.1864,0.5864 +baseline,16:16:36.229,990,0.0372,0.0490,2.49e-05,0.7236,0.5526,0.0642,1.621,5557.1,35.23GB,0.0114,0.2184,0.7408 +baseline,16:16:42.668,1000,0.0228,0.0476,2.48e-05,0.9699,0.5638,0.0801,1.553,5793.6,35.23GB,0.0109,0.1595,0.4924 +baseline,16:18:09.187,1010,0.0297,0.0419,2.48e-05,0.8719,0.6066,8.0454,0.116,77780.4,35.23GB,0.0160,0.2040,0.7087 +baseline,16:18:17.066,1020,0.0364,0.0415,2.48e-05,0.8359,0.6508,0.1370,1.269,7073.8,35.23GB,0.0090,0.1792,0.5416 +baseline,16:18:24.297,1030,0.0188,0.0466,2.48e-05,0.8780,0.5944,0.1287,1.383,6484.9,35.23GB,0.0176,0.1984,0.7845 +baseline,16:18:31.049,1040,0.0821,0.0451,2.48e-05,0.8972,0.5810,0.0943,1.481,6049.5,35.23GB,0.0131,0.2065,0.9209 +baseline,16:18:38.481,1050,0.0323,0.0383,2.48e-05,0.9110,0.6087,0.1345,1.346,6651.0,35.23GB,0.0130,0.2217,0.7126 +baseline,16:18:45.644,1060,0.0459,0.0448,2.48e-05,0.8361,0.5833,0.1330,1.396,6402.7,35.23GB,0.0146,0.2213,0.5282 +baseline,16:18:53.016,1070,0.0454,0.0455,2.48e-05,0.8166,0.5998,0.1374,1.357,6582.2,35.23GB,0.0149,0.2567,0.8801 +baseline,16:19:00.106,1080,0.0218,0.0325,2.48e-05,0.7621,0.5909,0.1182,1.411,6323.3,35.23GB,0.0156,0.2086,0.7398 +baseline,16:19:07.160,1090,0.0449,0.0417,2.48e-05,0.6918,0.5825,0.1229,1.418,6283.5,35.23GB,0.0228,0.1775,0.7770 +baseline,16:19:13.394,1100,0.0450,0.0417,2.48e-05,0.7677,0.5418,0.0816,1.604,5547.0,35.23GB,0.0097,0.1641,0.6385 +baseline,16:19:19.537,1110,0.0918,0.0528,2.48e-05,0.7288,0.5146,0.0997,1.628,5460.8,35.23GB,0.0193,0.2358,0.5349 +baseline,16:19:26.575,1120,0.0450,0.0424,2.48e-05,0.7824,0.5915,0.1123,1.421,6247.7,35.23GB,0.0150,0.2222,0.8045 +baseline,16:19:33.193,1130,0.0836,0.0429,2.48e-05,0.7360,0.5573,0.1045,1.511,5869.6,35.23GB,0.0144,0.2458,0.7617 +baseline,16:19:39.294,1140,0.0124,0.0313,2.48e-05,0.7389,0.5296,0.0806,1.640,5401.8,35.23GB,0.0110,0.1850,0.5464 +baseline,16:19:45.245,1150,0.0292,0.0337,2.47e-05,0.7077,0.5148,0.0802,1.681,5265.5,35.23GB,0.0140,0.2150,0.6158 +baseline,16:19:51.533,1160,0.0240,0.0374,2.47e-05,0.7827,0.5393,0.0895,1.591,5557.6,35.23GB,0.0132,0.1702,0.6963 +baseline,16:19:58.080,1170,0.0497,0.0410,2.47e-05,0.8083,0.5590,0.0958,1.527,5781.0,35.23GB,0.0101,0.1826,0.7973 +baseline,16:20:04.433,1180,0.0602,0.0541,2.47e-05,0.8179,0.5410,0.0943,1.574,5602.6,35.23GB,0.0122,0.2021,0.7957 +baseline,16:20:10.895,1190,0.0343,0.0595,2.47e-05,0.7686,0.5398,0.1063,1.548,5691.9,35.23GB,0.0210,0.2469,0.7164 +baseline,16:20:16.924,1200,0.0536,0.0440,2.47e-05,0.6380,0.5123,0.0906,1.659,5304.9,35.23GB,0.0156,0.2205,0.6024 +baseline,16:20:23.004,1210,0.0364,0.0532,2.47e-05,0.7159,0.5353,0.0726,1.645,5343.1,35.23GB,0.0149,0.2303,0.5270 +baseline,16:20:29.204,1220,0.0361,0.0410,2.47e-05,0.8058,0.5336,0.0864,1.613,5443.0,35.23GB,0.0186,0.2065,0.9981 +baseline,16:20:35.258,1230,0.0343,0.0410,2.47e-05,0.6798,0.5194,0.0859,1.652,5308.2,35.23GB,0.0136,0.2022,0.5185 +baseline,16:20:42.055,1240,0.0381,0.0397,2.47e-05,0.7036,0.5932,0.0865,1.471,5953.4,35.23GB,0.0129,0.1882,0.6250 +baseline,16:20:47.841,1250,0.0173,0.0390,2.47e-05,0.6647,0.5026,0.0761,1.728,5062.3,35.23GB,0.0103,0.1947,0.6465 +baseline,16:20:54.652,1260,0.0573,0.0434,2.47e-05,0.8029,0.5770,0.1040,1.468,5951.8,35.23GB,0.0115,0.2011,0.8072 +baseline,16:21:01.094,1270,0.0207,0.0406,2.46e-05,0.7687,0.5552,0.0892,1.552,5623.6,35.23GB,0.0143,0.2229,0.6953 +baseline,16:21:07.304,1280,0.0219,0.0381,2.46e-05,0.7128,0.5305,0.0905,1.611,5414.0,35.23GB,0.0204,0.2451,0.8229 +baseline,16:21:13.478,1290,0.0385,0.0454,2.46e-05,0.7291,0.5353,0.0820,1.620,5376.2,35.23GB,0.0110,0.1752,0.9046 +baseline,16:21:19.381,1300,0.0320,0.0435,2.46e-05,0.7427,0.5123,0.0780,1.694,5135.0,35.23GB,0.0149,0.1617,0.6222 +baseline,16:21:25.545,1310,0.0187,0.0341,2.46e-05,0.7027,0.5380,0.0784,1.623,5355.9,35.23GB,0.0166,0.2039,0.5801 +baseline,16:21:31.650,1320,0.1497,0.0508,2.46e-05,0.7574,0.5319,0.0786,1.638,5297.9,35.23GB,0.0265,0.2295,0.6644 +baseline,16:21:37.812,1330,0.0274,0.0535,2.46e-05,0.8089,0.5283,0.0879,1.623,5341.7,35.23GB,0.0103,0.1869,0.5570 +baseline,16:21:44.372,1340,0.0306,0.0469,2.46e-05,0.7625,0.5783,0.0778,1.524,5680.6,35.23GB,0.0116,0.1770,0.4213 +baseline,16:21:50.667,1350,0.0636,0.0471,2.46e-05,0.6394,0.5456,0.0838,1.589,5443.9,35.23GB,0.0132,0.2263,0.6218 +baseline,16:21:56.909,1360,0.0195,0.0355,2.46e-05,0.5836,0.5552,0.0690,1.602,5392.2,35.23GB,0.0103,0.1242,0.3889 +baseline,16:22:03.045,1370,0.0195,0.0288,2.45e-05,0.6400,0.5399,0.0737,1.630,5295.2,35.23GB,0.0164,0.1847,0.7229 +baseline,16:22:09.382,1380,0.0270,0.0330,2.45e-05,0.7124,0.5375,0.0962,1.578,5461.9,35.23GB,0.0121,0.1801,0.6860 +baseline,16:22:16.203,1390,0.0581,0.0364,2.45e-05,0.6877,0.5879,0.0941,1.466,5871.3,35.23GB,0.0095,0.1398,0.6125 +baseline,16:22:22.138,1400,0.0524,0.0371,2.45e-05,0.6551,0.5131,0.0805,1.685,5104.2,35.23GB,0.0245,0.2522,0.6559 +baseline,16:22:28.282,1410,0.0410,0.0358,2.45e-05,0.8354,0.5369,0.0774,1.628,5276.6,35.23GB,0.0124,0.2053,0.5687 +baseline,16:22:34.867,1420,0.0296,0.0315,2.45e-05,0.6967,0.5584,0.1001,1.519,5649.2,35.23GB,0.0113,0.1410,0.5234 +baseline,16:22:41.104,1430,0.0625,0.0458,2.45e-05,0.6918,0.5359,0.0878,1.604,5344.4,35.23GB,0.0153,0.2071,0.7021 +baseline,16:22:48.408,1440,0.0267,0.0393,2.45e-05,0.7153,0.6194,0.1111,1.369,6251.4,35.23GB,0.0113,0.1862,0.5675 +baseline,16:22:56.465,1450,0.0626,0.0440,2.45e-05,0.7544,0.6601,0.1456,1.241,6887.1,35.23GB,0.0148,0.2101,0.6986 +baseline,16:23:03.533,1460,0.0844,0.0461,2.44e-05,0.8047,0.5963,0.1106,1.415,6035.9,35.23GB,0.0148,0.2578,0.5675 +baseline,16:23:10.785,1470,0.0360,0.0403,2.44e-05,0.6131,0.6015,0.1237,1.379,6185.0,35.23GB,0.0182,0.2074,0.6429 +baseline,16:23:17.043,1480,0.0219,0.0328,2.44e-05,0.7932,0.5362,0.0896,1.598,5330.9,35.23GB,0.0193,0.2153,0.5726 +baseline,16:23:23.836,1490,0.0246,0.0305,2.44e-05,0.8101,0.5658,0.1135,1.472,5780.1,35.23GB,0.0116,0.1624,0.6501 +baseline,16:23:30.579,1500,0.0143,0.0385,2.44e-05,0.7207,0.5624,0.1119,1.483,5731.1,35.23GB,0.0170,0.1871,0.5396 +baseline,16:23:37.593,1510,0.0106,0.0375,2.44e-05,0.8018,0.5643,0.1370,1.426,5953.9,35.23GB,0.0131,0.1507,0.4706 +baseline,16:23:44.337,1520,0.0591,0.0355,2.44e-05,0.6378,0.5698,0.1046,1.483,5718.1,35.23GB,0.0243,0.1991,0.5808 +baseline,16:23:50.322,1530,0.0429,0.0373,2.44e-05,0.6415,0.5148,0.0837,1.671,5069.1,35.23GB,0.0103,0.1670,0.7843 +baseline,16:23:56.372,1540,0.0303,0.0323,2.43e-05,0.6002,0.5297,0.0752,1.653,5116.7,35.23GB,0.0090,0.1322,0.7165 +baseline,16:24:02.554,1550,0.0445,0.0502,2.43e-05,0.7770,0.5266,0.0916,1.618,5223.3,35.23GB,0.0151,0.2314,0.6467 +baseline,16:24:08.423,1560,0.0273,0.0412,2.43e-05,0.6909,0.5162,0.0708,1.704,4952.9,35.23GB,0.0170,0.1870,0.6174 +baseline,16:24:14.488,1570,0.0126,0.0406,2.43e-05,0.7748,0.5314,0.0750,1.649,5112.1,35.23GB,0.0221,0.2688,1.0956 +baseline,16:24:20.193,1580,0.0174,0.0342,2.43e-05,0.6772,0.4927,0.0778,1.753,4802.5,35.23GB,0.0098,0.1499,0.5048 +baseline,16:24:26.519,1590,0.0244,0.0286,2.43e-05,0.6487,0.5330,0.0995,1.581,5318.8,35.23GB,0.0104,0.2018,0.4577 +baseline,16:24:32.575,1600,0.0275,0.0294,2.43e-05,0.6713,0.5257,0.0799,1.651,5086.7,35.23GB,0.0123,0.1598,0.6776 +baseline,16:24:38.231,1610,0.0087,0.0399,2.43e-05,0.7639,0.4945,0.0711,1.768,4744.7,35.23GB,0.0127,0.2163,0.5902 +baseline,16:24:44.435,1620,0.0264,0.0374,2.42e-05,0.7240,0.5285,0.0919,1.612,5198.0,35.23GB,0.0146,0.2483,1.5752 +baseline,16:24:50.894,1630,0.0280,0.0363,2.42e-05,0.6313,0.5592,0.0866,1.549,5404.8,35.23GB,0.0158,0.1951,0.6722 +baseline,16:24:57.267,1640,0.0336,0.0259,2.42e-05,0.5898,0.5390,0.0983,1.569,5326.9,35.23GB,0.0203,0.2349,0.6458 +baseline,16:25:03.394,1650,0.0536,0.0332,2.42e-05,0.6069,0.5298,0.0830,1.632,5116.0,35.23GB,0.0151,0.2142,0.6898 +baseline,16:25:09.496,1660,0.0216,0.0352,2.42e-05,0.5964,0.5111,0.0991,1.639,5088.3,35.23GB,0.0157,0.1364,0.5841 +baseline,16:25:15.239,1670,0.0279,0.0371,2.42e-05,0.7418,0.4997,0.0746,1.741,4783.7,35.23GB,0.0132,0.2254,0.6020 +baseline,16:25:20.608,1680,0.0206,0.0373,2.42e-05,0.6708,0.4728,0.0641,1.863,4466.5,35.23GB,0.0123,0.1688,0.5183 +baseline,16:25:26.161,1690,0.0532,0.0344,2.41e-05,0.6546,0.4876,0.0676,1.801,4613.5,35.23GB,0.0136,0.2538,0.6819 +baseline,16:25:32.453,1700,0.0437,0.0416,2.41e-05,0.8400,0.5487,0.0806,1.590,5221.7,35.23GB,0.0212,0.2886,0.6268 +baseline,16:25:38.660,1710,0.0196,0.0293,2.41e-05,0.6984,0.5092,0.1114,1.612,5144.2,35.23GB,0.0126,0.1827,0.8368 +baseline,16:25:44.713,1720,0.0327,0.0403,2.41e-05,0.6773,0.5434,0.0619,1.652,5011.4,35.23GB,0.0134,0.1808,0.6724 +baseline,16:25:50.395,1730,0.0086,0.0302,2.41e-05,0.8331,0.4986,0.0696,1.760,4698.8,35.23GB,0.0126,0.1850,0.7819 +baseline,16:25:55.813,1740,0.0087,0.0320,2.41e-05,0.6490,0.4821,0.0596,1.846,4474.2,35.23GB,0.0186,0.2260,0.5631 +baseline,16:26:01.756,1750,0.0492,0.0352,2.41e-05,0.6197,0.5298,0.0645,1.683,4902.3,35.23GB,0.0171,0.1983,0.6600 +baseline,16:26:07.086,1760,0.0113,0.0476,2.40e-05,0.6425,0.4731,0.0599,1.877,4390.7,35.23GB,0.0186,0.1739,0.5580 +baseline,16:26:12.705,1770,0.0286,0.0356,2.40e-05,0.6660,0.4984,0.0636,1.780,4623.8,35.23GB,0.0138,0.1902,0.7131 +baseline,16:26:18.194,1780,0.0226,0.0319,2.40e-05,0.7177,0.4863,0.0626,1.822,4511.0,35.23GB,0.0129,0.1646,0.4020 +baseline,16:26:23.614,1790,0.0226,0.0331,2.40e-05,0.6867,0.4763,0.0658,1.845,4449.3,35.23GB,0.0107,0.1743,0.6437 +baseline,16:26:29.191,1800,0.1000,0.0358,2.40e-05,0.7812,0.4933,0.0645,1.793,4572.6,35.23GB,0.0208,0.2676,0.6731 +baseline,16:26:34.309,1810,0.0135,0.0267,2.40e-05,0.6283,0.4506,0.0611,1.955,4190.3,35.23GB,0.0107,0.1516,0.5996 +baseline,16:26:39.506,1820,0.0142,0.0298,2.40e-05,0.8532,0.4563,0.0635,1.924,4250.6,35.23GB,0.0130,0.2040,0.7000 +baseline,16:26:45.072,1830,0.0153,0.0255,2.39e-05,0.5665,0.4953,0.0614,1.797,4546.8,35.23GB,0.0138,0.1544,0.4261 +baseline,16:26:50.469,1840,0.0245,0.0242,2.39e-05,0.6464,0.4785,0.0611,1.853,4402.6,35.23GB,0.0100,0.1735,0.6895 +baseline,16:26:56.314,1850,0.0130,0.0232,2.39e-05,0.5991,0.5192,0.0653,1.711,4763.0,35.23GB,0.0111,0.1403,0.4448 +baseline,16:27:02.103,1860,0.0165,0.0314,2.39e-05,0.6975,0.5092,0.0698,1.728,4711.8,35.23GB,0.0169,0.2051,0.5574 +baseline,16:27:08.400,1870,0.0397,0.0312,2.39e-05,0.6255,0.5446,0.0851,1.588,5118.9,35.23GB,0.0143,0.1599,0.7227 +baseline,16:27:15.037,1880,0.0204,0.0347,2.39e-05,0.6849,0.5496,0.1141,1.507,5388.7,35.23GB,0.0126,0.1931,1.1871 +baseline,16:27:21.320,1890,0.0129,0.0268,2.38e-05,0.6174,0.5175,0.1107,1.592,5093.9,35.23GB,0.0165,0.1710,0.3862 +baseline,16:27:27.671,1900,0.0123,0.0267,2.38e-05,0.6858,0.5429,0.0922,1.575,5143.5,35.23GB,0.0113,0.1903,0.4321 +baseline,16:27:33.865,1910,0.0805,0.0345,2.38e-05,0.6155,0.5242,0.0952,1.615,5010.7,35.23GB,0.0338,0.2451,0.7462 +baseline,16:27:39.681,1920,0.0452,0.0298,2.38e-05,0.5698,0.5087,0.0728,1.720,4698.3,35.23GB,0.0182,0.1483,0.4606 +baseline,16:27:46.486,1930,0.0677,0.0351,2.38e-05,0.6437,0.5742,0.1063,1.470,5491.5,35.23GB,0.0234,0.1778,0.5921 +baseline,16:27:53.397,1940,0.0871,0.0370,2.38e-05,0.6687,0.5716,0.1194,1.447,5569.2,35.23GB,0.0168,0.1969,0.6801 +baseline,16:27:59.340,1950,0.0205,0.0313,2.37e-05,0.5386,0.5173,0.0770,1.683,4783.4,35.23GB,0.0149,0.1714,0.5170 +baseline,16:28:06.638,1960,0.0229,0.0421,2.37e-05,0.6891,0.6299,0.0998,1.371,5866.4,35.23GB,0.0199,0.1706,0.6406 +baseline,16:28:13.103,1970,0.0224,0.0344,2.37e-05,0.6636,0.5480,0.0985,1.547,5190.2,35.23GB,0.0106,0.1630,0.5683 +baseline,16:28:19.423,1980,0.0106,0.0290,2.37e-05,0.5852,0.5521,0.0799,1.583,5067.6,35.23GB,0.0089,0.1635,0.3906 +baseline,16:28:25.268,1990,0.0185,0.0303,2.37e-05,0.6477,0.5018,0.0827,1.711,4681.3,35.23GB,0.0136,0.1832,0.6515 +baseline,16:28:30.872,2000,0.0492,0.0284,2.37e-05,0.6437,0.4982,0.0622,1.785,4482.7,35.23GB,0.0184,0.2195,0.8358 +baseline,16:30:18.252,2010,0.0382,0.0304,2.36e-05,0.5679,0.5806,10.1574,0.093,85796.2,35.23GB,0.0220,0.2020,0.5490 +baseline,16:30:25.896,2020,0.0093,0.0257,2.36e-05,0.6440,0.5986,0.1658,1.309,6098.4,35.23GB,0.0172,0.1451,0.6648 +baseline,16:30:32.873,2030,0.0254,0.0308,2.36e-05,0.5494,0.5826,0.1151,1.433,5560.0,35.23GB,0.0202,0.2381,0.5373 +baseline,16:30:40.372,2040,0.0590,0.0337,2.36e-05,0.5996,0.6019,0.1480,1.334,5968.2,35.23GB,0.0196,0.2096,0.6978 +baseline,16:30:47.263,2050,0.0173,0.0290,2.36e-05,0.6206,0.5851,0.1040,1.451,5477.8,35.23GB,0.0139,0.1825,0.6677 +baseline,16:30:53.592,2060,0.0315,0.0338,2.35e-05,0.5436,0.5440,0.0889,1.580,5025.1,35.23GB,0.0182,0.2143,0.5517 +baseline,16:31:00.610,2070,0.0130,0.0309,2.35e-05,0.5679,0.5527,0.1491,1.425,5564.3,35.23GB,0.0184,0.1608,0.4683 +baseline,16:31:08.444,2080,0.0113,0.0208,2.35e-05,0.5327,0.6364,0.1470,1.277,6203.9,35.23GB,0.0105,0.1735,0.4850 +baseline,16:31:15.901,2090,0.0342,0.0288,2.35e-05,0.5941,0.5940,0.1516,1.341,5897.5,35.23GB,0.0145,0.1825,0.6360 +baseline,16:31:22.923,2100,0.0282,0.0319,2.35e-05,0.6509,0.5850,0.1171,1.424,5546.2,35.23GB,0.0195,0.2288,0.6354 +baseline,16:31:28.994,2110,0.0206,0.0306,2.35e-05,0.6718,0.5275,0.0797,1.647,4790.0,35.23GB,0.0121,0.1336,0.5533 +baseline,16:31:36.068,2120,0.0675,0.0326,2.34e-05,0.6110,0.5851,0.1223,1.414,5573.5,35.23GB,0.0172,0.1887,0.4966 +baseline,16:31:43.253,2130,0.0158,0.0250,2.34e-05,0.6806,0.5729,0.1456,1.392,5654.2,35.23GB,0.0147,0.1759,0.5056 +baseline,16:31:49.491,2140,0.0224,0.0228,2.34e-05,0.5375,0.5378,0.0860,1.603,4902.0,35.23GB,0.0107,0.1622,0.4928 +baseline,16:31:55.557,2150,0.0173,0.0235,2.34e-05,0.5407,0.5376,0.0691,1.649,4761.8,35.23GB,0.0202,0.1988,0.5171 +baseline,16:32:02.712,2160,0.0173,0.0323,2.34e-05,0.5438,0.5716,0.1438,1.398,5608.2,35.23GB,0.0189,0.1829,0.3900 +baseline,16:32:08.490,2170,0.0216,0.0272,2.33e-05,0.5226,0.4996,0.0782,1.731,4522.9,35.23GB,0.0149,0.1407,0.5241 +baseline,16:32:15.432,2180,0.0112,0.0243,2.33e-05,0.6155,0.5774,0.1168,1.442,5424.6,35.23GB,0.0148,0.1772,0.6362 +baseline,16:32:22.527,2190,0.0068,0.0305,2.33e-05,0.6114,0.5740,0.1355,1.410,5540.8,35.23GB,0.0055,0.0809,0.3209 +baseline,16:32:30.082,2200,0.0282,0.0259,2.33e-05,0.5947,0.5769,0.1785,1.324,5892.2,35.23GB,0.0244,0.2706,0.8170 +baseline,16:32:35.910,2210,0.0044,0.0262,2.33e-05,0.5296,0.4911,0.0917,1.716,4539.4,35.23GB,0.0155,0.1674,0.4876 +baseline,16:32:42.089,2220,0.0331,0.0302,2.32e-05,0.5944,0.5252,0.0928,1.618,4807.0,35.23GB,0.0131,0.1986,0.5090 +baseline,16:32:48.075,2230,0.0254,0.0287,2.32e-05,0.5703,0.5175,0.0811,1.671,4650.1,35.23GB,0.0131,0.2021,0.5261 +baseline,16:32:54.310,2240,0.0303,0.0243,2.32e-05,0.5342,0.5340,0.0895,1.604,4837.5,35.23GB,0.0125,0.1539,0.4450 +baseline,16:33:00.429,2250,0.0457,0.0269,2.32e-05,0.5327,0.5319,0.0801,1.635,4741.4,35.23GB,0.0111,0.1482,0.4727 +baseline,16:33:05.907,2260,0.0218,0.0263,2.32e-05,0.6736,0.4891,0.0587,1.826,4239.4,35.23GB,0.0190,0.1815,0.6866 +baseline,16:33:11.208,2270,0.0152,0.0272,2.31e-05,0.5726,0.4721,0.0580,1.887,4097.3,35.23GB,0.0101,0.1315,0.4517 +baseline,16:33:16.669,2280,0.0517,0.0259,2.31e-05,0.5919,0.4891,0.0570,1.832,4214.9,35.23GB,0.0225,0.2166,0.6041 +baseline,16:33:22.155,2290,0.0248,0.0273,2.31e-05,0.6117,0.4909,0.0577,1.823,4228.6,35.23GB,0.0242,0.2079,0.5113 +baseline,16:33:28.513,2300,0.0302,0.0254,2.31e-05,0.5780,0.5245,0.1112,1.573,4894.6,35.23GB,0.0104,0.1754,0.6121 +baseline,16:33:33.834,2310,0.0822,0.0370,2.31e-05,0.5795,0.4744,0.0577,1.879,4091.8,35.23GB,0.0236,0.2406,0.7053 +baseline,16:33:39.407,2320,0.0998,0.0330,2.30e-05,0.5542,0.4974,0.0598,1.795,4279.1,35.23GB,0.0252,0.2004,0.6067 +baseline,16:33:44.999,2330,0.0233,0.0307,2.30e-05,0.5300,0.4984,0.0609,1.789,4288.5,35.23GB,0.0106,0.1237,0.5202 +baseline,16:33:50.891,2340,0.0198,0.0219,2.30e-05,0.6776,0.5150,0.0742,1.698,4512.5,35.23GB,0.0139,0.1571,0.4755 +baseline,16:33:56.319,2350,0.0316,0.0228,2.30e-05,0.5796,0.4824,0.0603,1.843,4151.3,35.23GB,0.0209,0.2057,0.5080 +baseline,16:34:02.039,2360,0.0130,0.0265,2.29e-05,0.5718,0.5120,0.0600,1.749,4369.1,35.23GB,0.0121,0.1297,0.3217 +baseline,16:34:07.616,2370,0.0187,0.0249,2.29e-05,0.5280,0.4968,0.0610,1.793,4255.3,35.23GB,0.0063,0.1078,0.3970 +baseline,16:34:13.313,2380,0.0462,0.0241,2.29e-05,0.5576,0.5072,0.0625,1.756,4340.1,35.23GB,0.0244,0.2201,0.5108 +baseline,16:34:18.975,2390,0.0073,0.0277,2.29e-05,0.5559,0.5030,0.0632,1.767,4307.6,35.23GB,0.0073,0.1367,0.4099 +baseline,16:34:24.349,2400,0.0151,0.0210,2.29e-05,0.5863,0.4792,0.0582,1.861,4083.4,35.23GB,0.0107,0.1300,0.5702 +baseline,16:34:30.009,2410,0.0275,0.0206,2.28e-05,0.4985,0.5056,0.0603,1.767,4294.6,35.23GB,0.0115,0.1225,0.5719 +baseline,16:34:35.563,2420,0.0191,0.0429,2.28e-05,0.6245,0.4956,0.0599,1.801,4209.5,35.23GB,0.0201,0.2094,0.5389 +baseline,16:34:41.390,2430,0.0618,0.0333,2.28e-05,0.5696,0.5213,0.0614,1.717,4410.1,35.23GB,0.0146,0.2176,0.6587 +baseline,16:34:47.100,2440,0.0095,0.0295,2.28e-05,0.5069,0.5089,0.0622,1.752,4316.1,35.23GB,0.0121,0.1697,0.3905 +baseline,16:34:52.516,2450,0.0234,0.0261,2.28e-05,0.4804,0.4833,0.0582,1.847,4087.8,35.23GB,0.0094,0.1257,0.3570 +baseline,16:34:58.319,2460,0.0302,0.0213,2.27e-05,0.4930,0.5161,0.0642,1.724,4374.6,35.23GB,0.0123,0.1654,0.4354 +baseline,16:35:04.047,2470,0.0407,0.0262,2.27e-05,0.5534,0.5045,0.0683,1.746,4312.6,35.23GB,0.0196,0.2349,0.6634 +baseline,16:35:09.696,2480,0.0264,0.0208,2.27e-05,0.5709,0.5070,0.0580,1.770,4247.6,35.23GB,0.0156,0.2059,0.4758 +baseline,16:35:15.382,2490,0.0231,0.0283,2.27e-05,0.5675,0.5043,0.0642,1.759,4269.1,35.23GB,0.0173,0.1941,0.5536 +baseline,16:35:20.863,2500,0.0777,0.0288,2.26e-05,0.6105,0.4875,0.0606,1.824,4110.7,35.23GB,0.0173,0.1869,0.6288 +baseline,16:35:26.765,2510,0.0120,0.0219,2.26e-05,0.5026,0.5298,0.0603,1.695,4419.4,35.23GB,0.0237,0.1590,0.3946 +baseline,16:35:32.332,2520,0.0109,0.0255,2.26e-05,0.5781,0.4984,0.0583,1.796,4164.2,35.23GB,0.0251,0.2695,0.4935 +baseline,16:35:37.871,2530,0.0209,0.0262,2.26e-05,0.5721,0.4925,0.0614,1.806,4136.7,35.23GB,0.0143,0.1408,0.4911 +baseline,16:35:43.948,2540,0.0464,0.0239,2.25e-05,0.5275,0.5332,0.0746,1.646,4532.8,35.23GB,0.0132,0.1611,0.5731 +baseline,16:35:49.703,2550,0.0276,0.0213,2.25e-05,0.5695,0.4925,0.0830,1.738,4286.7,35.23GB,0.0115,0.1557,0.7928 +baseline,16:35:56.293,2560,0.0294,0.0223,2.25e-05,0.5370,0.5637,0.0953,1.518,4901.8,35.23GB,0.0086,0.1216,0.5153 +baseline,16:36:02.351,2570,0.0508,0.0289,2.25e-05,0.5722,0.5271,0.0787,1.651,4500.0,35.23GB,0.0116,0.1794,0.4411 +baseline,16:36:08.514,2580,0.0256,0.0246,2.25e-05,0.5100,0.5370,0.0792,1.623,4571.8,35.23GB,0.0118,0.1462,0.5523 +baseline,16:36:14.765,2590,0.0428,0.0262,2.24e-05,0.6074,0.5199,0.1052,1.600,4630.9,35.23GB,0.0189,0.1913,1.1513 +baseline,16:36:21.681,2600,0.0102,0.0252,2.24e-05,0.7109,0.5754,0.1161,1.446,5117.0,35.23GB,0.0217,0.1689,0.4626 +baseline,16:36:27.463,2610,0.0382,0.0257,2.24e-05,0.6115,0.5079,0.0703,1.730,4272.1,35.23GB,0.0112,0.1661,0.4309 +baseline,16:36:33.105,2620,0.0092,0.0226,2.24e-05,0.5609,0.5001,0.0641,1.773,4163.6,35.23GB,0.0114,0.1190,0.5058 +baseline,16:36:38.955,2630,0.0131,0.0169,2.23e-05,0.4910,0.5183,0.0667,1.710,4311.0,35.23GB,0.0206,0.1610,0.4560 +baseline,16:36:44.287,2640,0.0192,0.0159,2.23e-05,0.5257,0.4750,0.0581,1.876,3923.7,35.23GB,0.0196,0.1794,0.5032 +baseline,16:36:50.373,2650,0.0341,0.0187,2.23e-05,0.4867,0.5310,0.0776,1.643,4472.2,35.23GB,0.0163,0.1209,0.4166 +baseline,16:36:56.498,2660,0.0266,0.0194,2.23e-05,0.5262,0.5410,0.0716,1.633,4495.3,35.23GB,0.0088,0.1431,0.3939 +baseline,16:37:02.327,2670,0.0107,0.0173,2.22e-05,0.4868,0.5185,0.0644,1.716,4271.5,35.23GB,0.0099,0.1181,0.3696 +baseline,16:37:08.402,2680,0.0094,0.0175,2.22e-05,0.5388,0.5486,0.0591,1.646,4446.4,35.23GB,0.0064,0.1100,0.3651 +baseline,16:37:14.448,2690,0.0193,0.0206,2.22e-05,0.5150,0.5419,0.0626,1.654,4419.2,35.23GB,0.0192,0.2071,0.4762 +baseline,16:37:20.546,2700,0.0053,0.0157,2.22e-05,0.5554,0.5334,0.0764,1.640,4450.8,35.23GB,0.0105,0.1161,0.3045 +baseline,16:37:26.679,2710,0.0303,0.0220,2.21e-05,0.5683,0.5352,0.0781,1.631,4470.8,35.23GB,0.0179,0.2134,0.5443 +baseline,16:37:32.235,2720,0.0294,0.0215,2.21e-05,0.4891,0.4907,0.0649,1.800,4043.5,35.23GB,0.0169,0.1773,0.4425 +baseline,16:37:40.011,2730,0.0115,0.0193,2.21e-05,0.6100,0.6002,0.1775,1.286,5652.1,35.23GB,0.0109,0.1345,0.4631 +baseline,16:37:46.641,2740,0.0553,0.0224,2.21e-05,0.5274,0.5359,0.1272,1.512,4801.2,35.23GB,0.0197,0.2128,0.5877 +baseline,16:37:52.556,2750,0.0144,0.0194,2.20e-05,0.4811,0.5033,0.0881,1.691,4287.3,35.23GB,0.0106,0.1709,0.4834 +baseline,16:37:59.169,2760,0.0232,0.0167,2.20e-05,0.4760,0.5568,0.1048,1.512,4786.8,35.23GB,0.0081,0.1109,0.3870 +baseline,16:38:05.459,2770,0.0105,0.0214,2.20e-05,0.5209,0.5419,0.0867,1.590,4546.5,35.23GB,0.0057,0.0944,0.2768 +baseline,16:38:11.508,2780,0.0192,0.0229,2.20e-05,0.5243,0.5217,0.0832,1.653,4366.8,35.23GB,0.0066,0.1139,0.3818 +baseline,16:38:18.350,2790,0.0077,0.0244,2.19e-05,0.5967,0.5812,0.1030,1.462,4932.4,35.23GB,0.0118,0.1530,0.4462 +baseline,16:38:24.749,2800,0.0525,0.0282,2.19e-05,0.4782,0.5490,0.0909,1.563,4605.8,35.23GB,0.0233,0.1703,0.5472 +baseline,16:38:31.623,2810,0.0100,0.0229,2.19e-05,0.5532,0.5788,0.1086,1.455,4941.7,35.23GB,0.0115,0.1537,0.6362 +baseline,16:38:38.227,2820,0.0524,0.0210,2.19e-05,0.5461,0.5318,0.1286,1.514,4741.3,35.23GB,0.0191,0.1982,0.7516 +baseline,16:38:44.894,2830,0.0054,0.0131,2.18e-05,0.4631,0.5608,0.1059,1.500,4779.2,35.23GB,0.0076,0.0837,0.4084 +baseline,16:38:52.374,2840,0.0161,0.0181,2.18e-05,0.4850,0.5894,0.1586,1.337,5355.1,35.23GB,0.0172,0.1635,0.4637 +baseline,16:38:58.562,2850,0.0161,0.0199,2.18e-05,0.5100,0.5435,0.0753,1.616,4424.3,35.23GB,0.0090,0.1323,0.4949 +baseline,16:39:04.964,2860,0.0264,0.0217,2.18e-05,0.4811,0.5625,0.0776,1.563,4569.6,35.23GB,0.0142,0.1226,0.3633 +baseline,16:39:11.842,2870,0.0154,0.0193,2.17e-05,0.5109,0.5701,0.1178,1.454,4903.9,35.23GB,0.0121,0.1803,0.4640 +baseline,16:39:18.493,2880,0.0103,0.0234,2.17e-05,0.5169,0.5698,0.0952,1.504,4734.6,35.23GB,0.0108,0.2020,0.6552 +baseline,16:39:25.016,2890,0.0173,0.0192,2.17e-05,0.5454,0.5583,0.0941,1.534,4635.1,35.23GB,0.0067,0.0907,0.3739 +baseline,16:39:31.152,2900,0.0156,0.0212,2.17e-05,0.5642,0.5299,0.0837,1.630,4355.4,35.23GB,0.0066,0.0881,0.2990 +baseline,16:39:38.321,2910,0.0128,0.0223,2.16e-05,0.5587,0.5887,0.1282,1.395,5081.9,35.23GB,0.0155,0.2325,0.5371 +baseline,16:39:44.208,2920,0.0066,0.0195,2.16e-05,0.4992,0.5051,0.0837,1.699,4167.9,35.23GB,0.0267,0.1862,0.5075 +baseline,16:39:49.994,2930,0.0262,0.0292,2.16e-05,0.5589,0.5031,0.0754,1.729,4089.7,35.23GB,0.0073,0.0810,0.3075 +baseline,16:39:56.633,2940,0.0139,0.0246,2.15e-05,0.4749,0.5510,0.1130,1.506,4686.8,35.23GB,0.0098,0.1133,0.5210 +baseline,16:40:02.433,2950,0.0182,0.0199,2.15e-05,0.5483,0.4970,0.0829,1.725,4088.0,35.23GB,0.0247,0.1669,0.5289 +baseline,16:40:10.186,2960,0.0072,0.0169,2.15e-05,0.4488,0.6263,0.1490,1.290,5457.7,35.23GB,0.0263,0.2042,0.6754 +baseline,16:40:19.122,2970,0.0163,0.0175,2.15e-05,0.6842,0.6145,0.2790,1.120,6279.5,35.23GB,0.0081,0.1291,0.3258 +baseline,16:40:26.620,2980,0.0250,0.0161,2.14e-05,0.4309,0.6103,0.1397,1.334,5261.6,35.23GB,0.0101,0.1260,0.4060 +baseline,16:40:36.931,2990,0.0593,0.0273,2.14e-05,0.5464,0.7881,0.2429,0.970,7224.2,35.23GB,0.0119,0.2029,0.4441 +baseline,16:40:46.677,3000,0.0208,0.0253,2.14e-05,0.5537,0.6869,0.2876,1.047,6683.9,35.23GB,0.0203,0.2077,0.6590 +baseline,16:42:36.127,3010,0.0067,0.0209,2.14e-05,0.6044,0.5740,10.3710,0.091,76440.7,35.23GB,0.0283,0.1709,0.4894 +baseline,16:42:42.972,3020,0.0130,0.0186,2.13e-05,0.5422,0.5827,0.1017,1.461,4777.0,35.23GB,0.0101,0.1146,0.4202 +baseline,16:42:49.857,3030,0.0088,0.0310,2.13e-05,0.5285,0.5798,0.1087,1.453,4797.8,35.23GB,0.0351,0.2588,0.8347 +baseline,16:42:57.588,3040,0.0178,0.0221,2.13e-05,0.5096,0.6376,0.1355,1.294,5379.1,35.23GB,0.0087,0.1172,0.4159 +baseline,16:43:07.829,3050,0.0128,0.0203,2.12e-05,0.6803,0.7186,0.3054,0.977,7116.2,35.23GB,0.0081,0.1240,0.4570 +baseline,16:43:16.233,3060,0.0264,0.0201,2.12e-05,0.4947,0.6510,0.1894,1.190,5830.9,35.23GB,0.0304,0.2489,0.6174 +baseline,16:43:24.867,3070,0.0130,0.0194,2.12e-05,0.5982,0.6471,0.2163,1.159,5980.7,35.23GB,0.0150,0.1483,0.6999 +baseline,16:43:33.093,3080,0.0605,0.0266,2.12e-05,0.4756,0.6535,0.1692,1.216,5692.3,35.23GB,0.0256,0.2000,0.4459 +baseline,16:43:39.880,3090,0.0150,0.0263,2.11e-05,0.6384,0.5832,0.0955,1.474,4688.9,35.23GB,0.0160,0.2071,0.5276 +baseline,16:43:45.854,3100,0.1251,0.0434,2.11e-05,0.5078,0.5244,0.0730,1.674,4121.0,35.23GB,0.0244,0.1917,0.4759 +baseline,16:43:52.040,3110,0.0038,0.0266,2.11e-05,0.5391,0.5535,0.0650,1.617,4261.2,35.23GB,0.0114,0.1633,0.4352 +baseline,16:43:58.845,3120,0.0207,0.0230,2.11e-05,0.4628,0.5641,0.1164,1.470,4681.4,35.23GB,0.0176,0.1760,0.3513 +baseline,16:44:06.206,3130,0.0275,0.0187,2.10e-05,0.5038,0.5967,0.1394,1.359,5056.1,35.23GB,0.0146,0.1701,0.4549 +baseline,16:44:12.887,3140,0.0225,0.0186,2.10e-05,0.5544,0.5882,0.0799,1.497,4582.7,35.23GB,0.0181,0.1184,0.4127 +baseline,16:44:18.566,3150,0.0299,0.0193,2.10e-05,0.4682,0.5013,0.0668,1.761,3889.3,35.23GB,0.0214,0.1637,0.5886 +baseline,16:44:24.860,3160,0.0347,0.0246,2.09e-05,0.5502,0.5536,0.0755,1.589,4303.9,35.23GB,0.0189,0.2165,0.5210 +baseline,16:44:30.894,3170,0.0062,0.0222,2.09e-05,0.5566,0.5376,0.0659,1.658,4120.6,35.23GB,0.0127,0.1419,0.4160 +baseline,16:44:36.838,3180,0.0140,0.0156,2.09e-05,0.5315,0.5221,0.0724,1.683,4053.3,35.23GB,0.0303,0.2562,0.6225 +baseline,16:44:43.355,3190,0.0156,0.0173,2.09e-05,0.4920,0.5577,0.0940,1.535,4437.9,35.23GB,0.0263,0.2695,0.5885 +baseline,16:44:49.271,3200,0.0660,0.0269,2.08e-05,0.4831,0.5225,0.0690,1.691,4021.9,35.23GB,0.0252,0.1548,0.4556 +baseline,16:44:55.284,3210,0.0168,0.0199,2.08e-05,0.4998,0.5341,0.0673,1.663,4082.6,35.23GB,0.0226,0.1785,0.4673 +baseline,16:45:01.509,3220,0.0413,0.0241,2.08e-05,0.5000,0.5452,0.0771,1.607,4219.5,35.23GB,0.0114,0.1395,0.4105 +baseline,16:45:07.571,3230,0.0223,0.0217,2.07e-05,0.4949,0.5185,0.0879,1.650,4103.1,35.23GB,0.0124,0.1245,0.4084 +baseline,16:45:14.429,3240,0.0241,0.0203,2.07e-05,0.5156,0.5973,0.0884,1.458,4635.4,35.23GB,0.0102,0.1331,0.6035 +baseline,16:45:20.528,3250,0.0067,0.0191,2.07e-05,0.5569,0.5340,0.0761,1.640,4116.3,35.23GB,0.0079,0.1042,0.2815 +baseline,16:45:26.348,3260,0.0312,0.0394,2.06e-05,0.5607,0.5189,0.0630,1.718,3922.7,35.23GB,0.0113,0.1350,0.4775 +baseline,16:45:33.033,3270,0.0193,0.0357,2.06e-05,0.6796,0.5929,0.0756,1.496,4498.2,35.23GB,0.0088,0.1312,0.3530 +baseline,16:45:39.111,3280,0.0249,0.0266,2.06e-05,0.5088,0.5355,0.0722,1.646,4083.2,35.23GB,0.0144,0.1657,0.4276 +baseline,16:45:45.504,3290,0.0421,0.0238,2.06e-05,0.5828,0.5588,0.0804,1.564,4289.5,35.23GB,0.0146,0.1628,0.6979 +baseline,16:45:52.806,3300,0.0102,0.0230,2.05e-05,0.4996,0.6248,0.1053,1.370,4889.7,35.23GB,0.0466,0.2025,0.6179 +baseline,16:45:59.479,3310,0.0118,0.0197,2.05e-05,0.5313,0.5615,0.1059,1.499,4463.6,35.23GB,0.0289,0.2221,0.5235 +baseline,16:46:07.829,3320,0.0216,0.0206,2.05e-05,0.5285,0.6788,0.1562,1.198,5577.0,35.23GB,0.0177,0.2167,0.5344 +baseline,16:46:15.046,3330,0.0063,0.0188,2.04e-05,0.4614,0.5972,0.1248,1.386,4812.6,35.23GB,0.0209,0.1670,0.5066 +baseline,16:46:24.709,3340,0.0447,0.0205,2.04e-05,0.5612,0.7193,0.2468,1.035,6434.2,35.23GB,0.0193,0.2252,1.4583 +baseline,16:46:32.557,3350,0.0114,0.0181,2.04e-05,0.4661,0.6558,0.1291,1.275,5214.1,35.23GB,0.0073,0.1070,0.2427 +baseline,16:46:39.823,3360,0.0116,0.0151,2.03e-05,0.5133,0.5776,0.1489,1.377,4822.5,35.23GB,0.0140,0.1466,0.3393 +baseline,16:46:48.510,3370,0.0289,0.0231,2.03e-05,0.6024,0.6778,0.1910,1.158,5726.3,35.23GB,0.0126,0.1367,0.4388 +baseline,16:46:56.362,3380,0.0460,0.0223,2.03e-05,0.4573,0.6349,0.1504,1.275,5191.9,35.23GB,0.0194,0.1869,0.3596 +baseline,16:47:03.980,3390,0.0091,0.0173,2.03e-05,0.5114,0.6088,0.1529,1.313,5034.7,35.23GB,0.0187,0.2386,0.6892 +baseline,16:47:11.012,3400,0.0133,0.0167,2.02e-05,0.4845,0.5894,0.1138,1.422,4640.8,35.23GB,0.0068,0.1142,0.3280 +baseline,16:47:17.734,3410,0.0240,0.0218,2.02e-05,0.5621,0.5631,0.1092,1.488,4429.0,35.23GB,0.0099,0.1184,0.4905 +baseline,16:47:25.296,3420,0.0063,0.0181,2.02e-05,0.6100,0.6316,0.1245,1.323,4973.6,35.23GB,0.0152,0.1780,0.4147 +baseline,16:47:32.103,3430,0.0104,0.0203,2.01e-05,0.6999,0.5621,0.1185,1.469,4471.3,35.23GB,0.0068,0.0995,0.5372 +baseline,16:47:40.159,3440,0.0125,0.0214,2.01e-05,0.4962,0.5902,0.2156,1.243,5278.8,35.23GB,0.0146,0.1327,0.4882 +baseline,16:47:47.587,3450,0.0100,0.0185,2.01e-05,0.4390,0.6176,0.1251,1.347,4861.0,35.23GB,0.0062,0.0883,0.2840 +baseline,16:47:54.937,3460,0.0186,0.0221,2.00e-05,0.4510,0.6014,0.1337,1.362,4802.0,35.23GB,0.0217,0.1994,0.5554 +baseline,16:48:03.160,3470,0.0167,0.0210,2.00e-05,0.4657,0.6317,0.1904,1.217,5363.8,35.23GB,0.0208,0.1649,0.4015 +baseline,16:48:09.849,3480,0.0579,0.0277,2.00e-05,0.5368,0.5562,0.1127,1.495,4360.5,35.23GB,0.0257,0.1783,0.4480 +baseline,16:48:15.974,3490,0.0246,0.0245,2.00e-05,0.5454,0.5466,0.0658,1.633,3986.4,35.23GB,0.0168,0.1449,0.4627 +baseline,16:48:21.934,3500,0.1942,0.0394,1.99e-05,0.5239,0.5314,0.0649,1.678,3873.5,35.23GB,0.0406,0.2119,0.5283 +baseline,16:48:27.962,3510,0.0301,0.0248,1.99e-05,0.5263,0.5280,0.0745,1.659,3911.3,35.23GB,0.0162,0.1592,0.5344 +baseline,16:48:34.957,3520,0.0093,0.0261,1.99e-05,0.4626,0.6004,0.0992,1.430,4532.2,35.23GB,0.0180,0.1487,0.3542 +baseline,16:48:40.614,3530,0.0129,0.0214,1.98e-05,0.4782,0.5002,0.0654,1.768,3659.2,35.23GB,0.0079,0.1058,0.3506 +baseline,16:48:46.394,3540,0.0152,0.0251,1.98e-05,0.4920,0.5038,0.0742,1.730,3733.2,35.23GB,0.0136,0.1850,0.3501 +baseline,16:48:53.092,3550,0.0349,0.0211,1.98e-05,0.4028,0.5894,0.0805,1.493,4319.9,35.23GB,0.0094,0.1460,0.4132 +baseline,16:48:58.762,3560,0.0189,0.0166,1.97e-05,0.4672,0.5044,0.0626,1.764,3650.8,35.23GB,0.0204,0.1418,0.4007 +baseline,16:49:04.647,3570,0.0176,0.0226,1.97e-05,0.5151,0.5294,0.0591,1.699,3783.7,35.23GB,0.0113,0.1299,0.3202 +baseline,16:49:10.425,3580,0.0320,0.0260,1.97e-05,0.4762,0.5191,0.0586,1.731,3708.1,35.23GB,0.0140,0.1551,0.4475 +baseline,16:49:16.579,3590,0.0199,0.0247,1.96e-05,0.5838,0.5386,0.0768,1.625,3943.9,35.23GB,0.0216,0.1849,0.5093 +baseline,16:49:22.262,3600,0.0127,0.0249,1.96e-05,0.5787,0.5036,0.0648,1.760,3636.8,35.23GB,0.0267,0.2140,0.4898 +baseline,16:49:27.999,3610,0.0080,0.0187,1.96e-05,0.4226,0.4998,0.0738,1.744,3664.6,35.23GB,0.0189,0.1861,0.4951 +baseline,16:49:33.688,3620,0.0191,0.0165,1.95e-05,0.5734,0.5115,0.0575,1.758,3629.0,35.23GB,0.0103,0.1130,0.3712 +baseline,16:49:39.582,3630,0.0045,0.0190,1.95e-05,0.5389,0.5293,0.0601,1.697,3753.9,35.23GB,0.0119,0.1351,0.6462 +baseline,16:49:45.356,3640,0.0149,0.0178,1.95e-05,0.5040,0.5171,0.0602,1.732,3671.2,35.23GB,0.0095,0.1081,0.3726 +baseline,16:49:51.215,3650,0.0180,0.0176,1.94e-05,0.4441,0.5195,0.0664,1.707,3720.1,35.23GB,0.0117,0.1289,0.4233 +baseline,16:49:57.109,3660,0.0470,0.0211,1.94e-05,0.4885,0.5241,0.0653,1.697,3736.0,35.23GB,0.0210,0.1962,0.5141 +baseline,16:50:02.815,3670,0.0056,0.0130,1.94e-05,0.4706,0.5089,0.0617,1.753,3611.7,35.23GB,0.0078,0.0939,0.2430 +baseline,16:50:08.528,3680,0.0163,0.0132,1.93e-05,0.4576,0.5087,0.0627,1.751,3609.7,35.23GB,0.0129,0.1191,0.4310 +baseline,16:50:14.203,3690,0.0145,0.0136,1.93e-05,0.4979,0.4995,0.0678,1.762,3580.2,35.23GB,0.0268,0.1618,0.4657 +baseline,16:50:19.908,3700,0.0136,0.0217,1.93e-05,0.5549,0.5094,0.0612,1.753,3594.0,35.23GB,0.0171,0.1528,0.5167 +baseline,16:50:25.650,3710,0.0062,0.0232,1.93e-05,0.5460,0.5146,0.0596,1.742,3611.3,35.23GB,0.0219,0.1685,0.5087 +baseline,16:50:31.598,3720,0.0129,0.0188,1.92e-05,0.6689,0.5307,0.0640,1.682,3734.1,35.23GB,0.0143,0.1452,0.3997 +baseline,16:50:37.114,3730,0.0057,0.0237,1.92e-05,0.5181,0.4902,0.0614,1.813,3458.1,35.23GB,0.0092,0.1309,0.3718 +baseline,16:50:42.921,3740,0.0181,0.0190,1.92e-05,0.4972,0.5151,0.0657,1.722,3635.0,35.23GB,0.0091,0.1054,0.3119 +baseline,16:50:48.817,3750,0.0257,0.0243,1.91e-05,0.5066,0.5219,0.0677,1.696,3684.1,35.23GB,0.0136,0.1234,0.3862 +baseline,16:50:54.887,3760,0.0058,0.0197,1.91e-05,0.5254,0.5426,0.0645,1.648,3787.4,35.23GB,0.0160,0.1221,0.4212 +baseline,16:51:00.814,3770,0.0022,0.0168,1.91e-05,0.4609,0.5289,0.0637,1.688,3691.3,35.23GB,0.0110,0.0983,0.3845 +baseline,16:51:06.820,3780,0.0173,0.0134,1.90e-05,0.4356,0.5322,0.0685,1.665,3735.2,35.23GB,0.0148,0.1469,0.5851 +baseline,16:51:13.091,3790,0.0188,0.0177,1.90e-05,0.5349,0.5570,0.0701,1.595,3893.5,35.23GB,0.0166,0.1429,0.7577 +baseline,16:51:19.040,3800,0.0225,0.0134,1.90e-05,0.4719,0.5386,0.0564,1.681,3687.7,35.23GB,0.0155,0.1384,0.7200 +baseline,16:51:24.902,3810,0.0070,0.0142,1.89e-05,0.4861,0.5251,0.0609,1.706,3627.6,35.23GB,0.0102,0.1107,0.3280 +baseline,16:51:31.251,3820,0.0197,0.0180,1.89e-05,0.4835,0.5700,0.0650,1.575,3923.1,35.23GB,0.0103,0.1204,0.3338 +baseline,16:51:36.902,3830,0.0439,0.0200,1.89e-05,0.5366,0.5038,0.0612,1.770,3486.0,35.23GB,0.0281,0.1879,0.7901 +baseline,16:51:43.326,3840,0.0140,0.0231,1.88e-05,0.4947,0.5523,0.0902,1.557,3956.6,35.23GB,0.0102,0.1123,0.3865 +baseline,16:51:49.600,3850,0.0076,0.0145,1.88e-05,0.5156,0.5429,0.0845,1.594,3858.0,35.23GB,0.0164,0.1627,0.3896 +baseline,16:51:55.518,3860,0.0178,0.0231,1.88e-05,0.5162,0.5345,0.0574,1.690,3633.4,35.23GB,0.0186,0.1199,0.3737 +baseline,16:52:01.438,3870,0.0224,0.0184,1.87e-05,0.4885,0.5286,0.0635,1.690,3628.2,35.23GB,0.0145,0.1035,0.4142 +baseline,16:52:07.189,3880,0.0621,0.0222,1.87e-05,0.4910,0.5145,0.0605,1.739,3518.8,35.23GB,0.0168,0.2335,0.5950 +baseline,16:52:13.101,3890,0.0143,0.0161,1.87e-05,0.4820,0.5336,0.0576,1.692,3611.9,35.23GB,0.0107,0.1052,0.6482 +baseline,16:52:18.905,3900,0.0090,0.0187,1.86e-05,0.4838,0.5235,0.0569,1.723,3539.9,35.23GB,0.0065,0.1116,0.3575 +baseline,16:52:24.408,3910,0.0116,0.0177,1.86e-05,0.5216,0.4942,0.0561,1.818,3350.5,35.23GB,0.0140,0.1627,0.5274 +baseline,16:52:30.550,3920,0.0191,0.0234,1.86e-05,0.4976,0.5542,0.0601,1.628,3733.6,35.23GB,0.0228,0.1679,0.4859 +baseline,16:52:36.199,3930,0.0201,0.0185,1.85e-05,0.5100,0.4994,0.0655,1.770,3428.5,35.23GB,0.0209,0.1964,0.5157 +baseline,16:52:42.404,3940,0.0131,0.0157,1.85e-05,0.4497,0.5515,0.0689,1.612,3758.9,35.23GB,0.0144,0.1538,0.4372 +baseline,16:52:48.328,3950,0.0090,0.0165,1.85e-05,0.5626,0.5303,0.0623,1.688,3583.9,35.23GB,0.0291,0.2017,0.5222 +baseline,16:52:54.107,3960,0.0250,0.0176,1.84e-05,0.4527,0.5090,0.0688,1.731,3489.8,35.23GB,0.0059,0.0917,0.4870 +baseline,16:53:00.357,3970,0.0135,0.0173,1.84e-05,0.5222,0.5556,0.0695,1.600,3768.3,35.23GB,0.0114,0.1270,0.4996 +baseline,16:53:05.907,3980,0.0058,0.0145,1.84e-05,0.5200,0.4964,0.0585,1.802,3340.4,35.23GB,0.0286,0.1615,0.3913 +baseline,16:53:11.659,3990,0.0348,0.0156,1.83e-05,0.4220,0.5159,0.0594,1.739,3456.5,35.23GB,0.0188,0.1712,0.6352 +baseline,16:53:17.993,4000,0.0270,0.0171,1.83e-05,0.4555,0.5572,0.0760,1.579,3799.0,35.23GB,0.0164,0.1647,0.3553 +baseline,16:54:41.236,4010,0.0063,0.0220,1.83e-05,0.5309,0.5112,7.8133,0.120,49862.4,35.23GB,0.0146,0.1802,0.4481 +baseline,16:54:47.218,4020,0.0132,0.0174,1.82e-05,0.5327,0.5362,0.0620,1.672,3576.6,35.23GB,0.0274,0.1844,0.5666 +baseline,16:54:52.809,4030,0.0189,0.0255,1.82e-05,0.4959,0.4953,0.0638,1.789,3337.1,35.23GB,0.0101,0.0989,0.3341 +baseline,16:54:58.644,4040,0.0219,0.0195,1.82e-05,0.4686,0.5198,0.0639,1.714,3477.3,35.23GB,0.0126,0.1487,0.6114 +baseline,16:55:04.641,4050,0.0220,0.0170,1.81e-05,0.4535,0.5289,0.0708,1.668,3567.2,35.23GB,0.0066,0.0955,0.4201 +baseline,16:55:10.502,4060,0.0161,0.0158,1.81e-05,0.3984,0.5245,0.0614,1.707,3480.5,35.23GB,0.0092,0.1256,0.4174 +baseline,16:55:16.014,4070,0.0117,0.0161,1.80e-05,0.4927,0.4887,0.0626,1.814,3268.5,35.23GB,0.0184,0.1479,0.4105 +baseline,16:55:21.798,4080,0.0202,0.0172,1.80e-05,0.5147,0.5147,0.0636,1.730,3422.8,35.23GB,0.0046,0.0681,0.2773 +baseline,16:55:27.519,4090,0.0331,0.0149,1.80e-05,0.5254,0.5123,0.0598,1.748,3380.9,35.23GB,0.0097,0.1330,0.6185 +baseline,16:55:33.220,4100,0.0086,0.0219,1.79e-05,0.4804,0.5074,0.0627,1.754,3363.0,35.23GB,0.0116,0.1214,0.4463 +baseline,16:55:38.795,4110,0.0120,0.0179,1.79e-05,0.5039,0.4937,0.0638,1.794,3283.1,35.23GB,0.0129,0.1512,0.3741 +baseline,16:55:44.491,4120,0.0096,0.0224,1.79e-05,0.5036,0.5105,0.0591,1.756,3348.7,35.23GB,0.0088,0.1200,0.3243 +baseline,16:55:50.191,4130,0.0146,0.0167,1.78e-05,0.4535,0.5108,0.0592,1.755,3345.1,35.23GB,0.0138,0.1239,0.4210 +baseline,16:55:55.855,4140,0.0249,0.0165,1.78e-05,0.4533,0.5048,0.0617,1.766,3318.8,35.23GB,0.0290,0.2231,0.5815 +baseline,16:56:01.735,4150,0.0209,0.0142,1.78e-05,0.4245,0.5218,0.0663,1.701,3438.9,35.23GB,0.0212,0.1055,0.3459 +baseline,16:56:07.313,4160,0.0182,0.0232,1.77e-05,0.4779,0.4963,0.0614,1.793,3256.8,35.23GB,0.0123,0.1398,0.5803 +baseline,16:56:13.095,4170,0.0293,0.0257,1.77e-05,0.5204,0.5151,0.0631,1.730,3370.6,35.23GB,0.0189,0.1730,0.4758 +baseline,16:56:18.802,4180,0.0087,0.0206,1.77e-05,0.5061,0.5115,0.0592,1.753,3320.9,35.23GB,0.0112,0.1514,0.5149 +baseline,16:56:24.713,4190,0.0340,0.0174,1.76e-05,0.4803,0.5312,0.0599,1.692,3434.1,35.23GB,0.0097,0.0988,0.3710 +baseline,16:56:30.663,4200,0.0044,0.0144,1.76e-05,0.4731,0.5338,0.0613,1.681,3450.2,35.23GB,0.0197,0.1710,0.5710 +baseline,16:56:36.256,4210,0.0154,0.0161,1.76e-05,0.4837,0.4967,0.0626,1.788,3237.7,35.23GB,0.0085,0.0967,0.4065 +baseline,16:56:42.050,4220,0.0069,0.0151,1.75e-05,0.4797,0.5086,0.0708,1.727,3347.4,35.23GB,0.0138,0.1518,0.3755 +baseline,16:56:47.894,4230,0.0120,0.0159,1.75e-05,0.4357,0.5204,0.0639,1.711,3371.3,35.23GB,0.0179,0.1696,0.3652 +baseline,16:56:54.603,4240,0.0189,0.0163,1.75e-05,0.4293,0.5389,0.1320,1.491,3863.7,35.23GB,0.0288,0.1946,0.5475 +baseline,16:57:01.293,4250,0.0182,0.0143,1.74e-05,0.4589,0.5863,0.0828,1.495,3846.3,35.23GB,0.0074,0.1181,0.4152 +baseline,16:57:07.232,4260,0.0135,0.0144,1.74e-05,0.4982,0.5141,0.0798,1.684,3408.3,35.23GB,0.0093,0.0757,0.4201 +baseline,16:57:14.656,4270,0.0134,0.0147,1.74e-05,0.4879,0.6132,0.1292,1.347,4253.2,35.23GB,0.0148,0.1341,0.3541 +baseline,16:57:21.143,4280,0.0067,0.0133,1.73e-05,0.4074,0.5539,0.0948,1.542,3709.8,35.23GB,0.0281,0.1625,0.4628 +baseline,16:57:28.029,4290,0.0114,0.0131,1.73e-05,0.5118,0.5705,0.1180,1.453,3930.9,35.23GB,0.0068,0.1009,0.3843 +baseline,16:57:34.030,4300,0.1237,0.0291,1.72e-05,0.4939,0.5411,0.0590,1.667,3420.1,35.23GB,0.0337,0.2473,0.6097 +baseline,16:57:39.601,4310,0.0017,0.0173,1.72e-05,0.5050,0.4930,0.0641,1.795,3169.1,35.23GB,0.0127,0.1307,0.3242 +baseline,16:57:45.743,4320,0.0407,0.0188,1.72e-05,0.4528,0.5122,0.1021,1.628,3488.2,35.23GB,0.0233,0.1836,0.5688 +baseline,16:57:51.872,4330,0.0158,0.0202,1.71e-05,0.4666,0.5411,0.0718,1.632,3474.6,35.23GB,0.0087,0.0910,0.2449 +baseline,16:57:57.378,4340,0.0086,0.0126,1.71e-05,0.4676,0.4912,0.0593,1.817,3115.2,35.23GB,0.0229,0.1381,0.4161 +baseline,16:58:03.432,4350,0.0134,0.0133,1.71e-05,0.4282,0.5414,0.0640,1.652,3419.9,35.23GB,0.0087,0.1183,0.4713 +baseline,16:58:08.905,4360,0.0126,0.0182,1.70e-05,0.5105,0.4843,0.0630,1.827,3086.4,35.23GB,0.0119,0.1459,0.5443 +baseline,16:58:14.505,4370,0.0248,0.0143,1.70e-05,0.5039,0.4994,0.0606,1.786,3152.4,35.23GB,0.0258,0.2271,0.6223 +baseline,16:58:20.216,4380,0.0160,0.0188,1.70e-05,0.5110,0.5117,0.0595,1.751,3209.4,35.23GB,0.0086,0.0907,0.3660 +baseline,16:58:26.139,4390,0.0234,0.0166,1.69e-05,0.4135,0.5138,0.0785,1.689,3322.1,35.23GB,0.0072,0.0897,0.3624 +baseline,16:58:32.637,4400,0.0092,0.0147,1.69e-05,0.4798,0.5619,0.0878,1.539,3637.8,35.23GB,0.0157,0.1058,0.3642 +baseline,16:58:38.778,4410,0.0021,0.0116,1.69e-05,0.4617,0.5223,0.0918,1.629,3432.3,35.23GB,0.0126,0.1495,0.3952 +baseline,16:58:44.599,4420,0.0095,0.0149,1.68e-05,0.4566,0.5066,0.0755,1.718,3247.6,35.23GB,0.0106,0.1328,0.3901 +baseline,16:58:50.591,4430,0.0088,0.0127,1.68e-05,0.4455,0.5344,0.0651,1.669,3337.2,35.23GB,0.0080,0.0948,0.3573 +baseline,16:58:56.107,4440,0.0100,0.0131,1.67e-05,0.4180,0.4939,0.0575,1.813,3066.4,35.23GB,0.0059,0.0891,0.3431 +baseline,16:59:01.919,4450,0.0075,0.0140,1.67e-05,0.4288,0.5169,0.0643,1.721,3224.7,35.23GB,0.0075,0.0733,0.3633 +baseline,16:59:07.385,4460,0.0056,0.0104,1.67e-05,0.3608,0.4893,0.0576,1.830,3027.5,35.23GB,0.0061,0.0691,0.2755 +baseline,16:59:12.672,4470,0.0164,0.0132,1.66e-05,0.5093,0.4705,0.0579,1.892,2923.4,35.23GB,0.0284,0.1837,0.4914 +baseline,16:59:18.102,4480,0.0120,0.0163,1.66e-05,0.5707,0.4828,0.0602,1.842,2997.0,35.23GB,0.0086,0.1205,0.3816 +baseline,16:59:23.358,4490,0.0251,0.0186,1.66e-05,0.5481,0.4638,0.0617,1.903,2895.3,35.23GB,0.0063,0.0822,0.3769 +baseline,16:59:28.692,4500,0.0159,0.0161,1.65e-05,0.4226,0.4755,0.0579,1.875,2933.2,35.23GB,0.0103,0.1015,0.3055 +baseline,16:59:34.495,4510,0.0339,0.0221,1.65e-05,0.4554,0.5215,0.0587,1.724,3185.3,35.23GB,0.0102,0.1272,0.3673 +baseline,16:59:39.927,4520,0.0252,0.0175,1.65e-05,0.5057,0.4810,0.0622,1.841,2976.2,35.23GB,0.0132,0.1817,0.4705 +baseline,16:59:45.517,4530,0.0144,0.0166,1.64e-05,0.4726,0.5001,0.0588,1.789,3057.1,35.23GB,0.0140,0.1415,0.4230 +baseline,16:59:50.724,4540,0.0050,0.0132,1.64e-05,0.4739,0.4640,0.0569,1.921,2842.8,35.23GB,0.0089,0.0880,0.2342 +baseline,16:59:56.418,4550,0.0150,0.0156,1.64e-05,0.4618,0.5132,0.0562,1.756,3102.9,35.23GB,0.0073,0.0884,0.3918 +baseline,17:00:02.353,4560,0.0439,0.0144,1.63e-05,0.3797,0.5281,0.0654,1.685,3228.2,35.23GB,0.0182,0.1668,0.4854 +baseline,17:00:07.808,4570,0.0041,0.0110,1.63e-05,0.4593,0.4878,0.0576,1.834,2961.3,35.23GB,0.0085,0.1109,0.2934 +baseline,17:00:13.614,4580,0.0328,0.0116,1.62e-05,0.4783,0.5241,0.0566,1.723,3146.6,35.23GB,0.0156,0.1648,0.4354 +baseline,17:00:19.011,4590,0.0116,0.0141,1.62e-05,0.4232,0.4814,0.0583,1.853,2919.2,35.23GB,0.0157,0.1275,0.4577 +baseline,17:00:24.707,4600,0.0089,0.0132,1.62e-05,0.4902,0.5098,0.0598,1.756,3075.5,35.23GB,0.0115,0.1014,0.3484 +baseline,17:00:30.985,4610,0.0111,0.0132,1.61e-05,0.4381,0.5351,0.0927,1.593,3383.3,35.23GB,0.0262,0.1617,0.3987 +baseline,17:00:36.407,4620,0.0188,0.0218,1.61e-05,0.4860,0.4763,0.0658,1.845,2916.4,35.23GB,0.0215,0.1691,0.4296 +baseline,17:00:42.792,4630,0.0208,0.0149,1.61e-05,0.4650,0.5607,0.0777,1.566,3428.1,35.23GB,0.0155,0.1197,0.4260 +baseline,17:00:48.356,4640,0.0016,0.0133,1.60e-05,0.4992,0.4878,0.0686,1.797,2982.1,35.23GB,0.0151,0.1041,0.5554 +baseline,17:00:54.229,4650,0.0101,0.0143,1.60e-05,0.4148,0.5161,0.0712,1.703,3141.3,35.23GB,0.0118,0.1078,0.3885 +baseline,17:01:00.243,4660,0.0089,0.0223,1.60e-05,0.5015,0.5297,0.0717,1.663,3211.3,35.23GB,0.0137,0.1936,0.4067 +baseline,17:01:06.025,4670,0.0126,0.0173,1.59e-05,0.5271,0.5127,0.0654,1.730,3081.3,35.23GB,0.0071,0.0812,0.5162 +baseline,17:01:11.831,4680,0.0091,0.0154,1.59e-05,0.4238,0.5247,0.0560,1.723,3088.5,35.23GB,0.0092,0.0965,0.3090 +baseline,17:01:17.450,4690,0.0057,0.0148,1.58e-05,0.5179,0.5003,0.0616,1.780,2982.9,35.23GB,0.0057,0.0682,0.2384 +baseline,17:01:22.854,4700,0.0147,0.0146,1.58e-05,0.4729,0.4848,0.0557,1.851,2863.8,35.23GB,0.0150,0.1557,0.3552 +baseline,17:01:28.869,4710,0.0045,0.0207,1.58e-05,0.4874,0.5394,0.0620,1.663,3180.9,35.23GB,0.0061,0.0754,0.2627 +baseline,17:01:34.489,4720,0.0221,0.0152,1.57e-05,0.4275,0.4982,0.0638,1.780,2966.9,35.23GB,0.0083,0.1270,0.3130 +baseline,17:01:41.018,4730,0.0195,0.0128,1.57e-05,0.4714,0.5475,0.1054,1.532,3440.4,35.23GB,0.0138,0.1224,0.8202 +baseline,17:01:47.268,4740,0.0130,0.0121,1.57e-05,0.4226,0.5361,0.0888,1.600,3287.0,35.23GB,0.0069,0.0931,0.2788 +baseline,17:01:53.469,4750,0.0114,0.0169,1.56e-05,0.4754,0.5169,0.1033,1.613,3255.4,35.23GB,0.0253,0.2191,0.4973 +baseline,17:02:00.695,4760,0.0208,0.0168,1.56e-05,0.4632,0.6158,0.1068,1.384,3785.5,35.23GB,0.0075,0.0996,0.5140 +baseline,17:02:06.329,4770,0.0063,0.0199,1.55e-05,0.5236,0.5051,0.0584,1.775,2946.0,35.23GB,0.0196,0.1511,0.6628 +baseline,17:02:12.140,4780,0.0073,0.0154,1.55e-05,0.4943,0.5248,0.0563,1.721,3032.9,35.23GB,0.0143,0.1379,0.3674 +baseline,17:02:18.148,4790,0.0246,0.0155,1.55e-05,0.5103,0.5317,0.0692,1.665,3129.9,35.23GB,0.0281,0.2413,0.6740 +baseline,17:02:23.981,4800,0.0075,0.0152,1.54e-05,0.4318,0.5193,0.0640,1.715,3032.5,35.23GB,0.0227,0.1411,0.4632 +baseline,17:02:30.101,4810,0.0355,0.0155,1.54e-05,0.4602,0.5512,0.0607,1.634,3175.7,35.23GB,0.0282,0.1751,0.5740 +baseline,17:02:36.699,4820,0.0098,0.0147,1.54e-05,0.4653,0.5624,0.0974,1.516,3417.1,35.23GB,0.0261,0.1549,0.5145 +baseline,17:02:43.075,4830,0.0056,0.0118,1.53e-05,0.4098,0.5515,0.0862,1.569,3296.1,35.23GB,0.0064,0.1109,0.4317 +baseline,17:02:49.347,4840,0.0114,0.0128,1.53e-05,0.4673,0.5507,0.0765,1.595,3235.5,35.23GB,0.0159,0.1317,0.6183 +baseline,17:02:55.032,4850,0.0223,0.0178,1.53e-05,0.4573,0.5106,0.0578,1.759,2927.2,35.23GB,0.0208,0.1891,0.5177 +baseline,17:03:00.958,4860,0.0162,0.0153,1.52e-05,0.4722,0.5343,0.0583,1.688,3045.6,35.23GB,0.0289,0.2505,0.5745 +baseline,17:03:06.775,4870,0.0219,0.0157,1.52e-05,0.4723,0.5213,0.0604,1.719,2983.7,35.23GB,0.0068,0.0796,0.3795 +baseline,17:03:12.606,4880,0.0096,0.0126,1.51e-05,0.4962,0.5245,0.0585,1.715,2984.9,35.23GB,0.0198,0.1920,0.5521 +baseline,17:03:18.470,4890,0.0056,0.0147,1.51e-05,0.4496,0.5200,0.0664,1.705,2996.3,35.23GB,0.0180,0.1429,0.3655 +baseline,17:03:24.211,4900,0.0121,0.0136,1.51e-05,0.4593,0.5032,0.0709,1.742,2927.7,35.23GB,0.0083,0.1069,0.3265 +baseline,17:03:29.940,4910,0.0180,0.0145,1.50e-05,0.4714,0.5150,0.0579,1.746,2915.6,35.23GB,0.0240,0.2115,0.5912 +baseline,17:03:35.674,4920,0.0165,0.0137,1.50e-05,0.5552,0.5159,0.0576,1.744,2912.5,35.23GB,0.0200,0.1625,0.5360 +baseline,17:03:41.250,4930,0.0111,0.0126,1.50e-05,0.4791,0.4976,0.0599,1.794,2826.2,35.23GB,0.0236,0.1274,0.4371 +baseline,17:03:47.173,4940,0.0062,0.0173,1.49e-05,0.4575,0.5333,0.0590,1.689,2996.7,35.23GB,0.0131,0.1203,0.3897 +baseline,17:03:53.047,4950,0.0095,0.0157,1.49e-05,0.4565,0.5291,0.0586,1.703,2965.7,35.23GB,0.0340,0.2413,0.6007 +baseline,17:03:58.779,4960,0.0482,0.0160,1.48e-05,0.4822,0.5139,0.0592,1.745,2888.7,35.23GB,0.0222,0.1400,0.3642 +baseline,17:04:04.497,4970,0.0120,0.0132,1.48e-05,0.3964,0.5115,0.0602,1.749,2875.1,35.23GB,0.0057,0.0960,0.2964 +baseline,17:04:10.019,4980,0.0257,0.0141,1.48e-05,0.4361,0.4939,0.0583,1.811,2771.7,35.23GB,0.0151,0.1783,0.5273 +baseline,17:04:16.046,4990,0.0027,0.0216,1.47e-05,0.5428,0.5430,0.0598,1.659,3019.4,35.23GB,0.0159,0.1287,0.3766 +baseline,17:04:21.626,5000,0.0038,0.0165,1.47e-05,0.5112,0.4974,0.0606,1.792,2789.7,35.23GB,0.0101,0.1353,1.1505 +baseline,17:05:43.400,5010,0.0162,0.0141,1.47e-05,0.4896,0.5870,7.5904,0.122,40804.6,35.23GB,0.0176,0.1201,0.3928 +baseline,17:05:50.160,5020,0.0063,0.0100,1.46e-05,0.4216,0.5711,0.1049,1.480,3365.9,35.23GB,0.0142,0.1367,0.3499 +baseline,17:05:56.657,5030,0.0039,0.0108,1.46e-05,0.4535,0.5667,0.0833,1.539,3228.9,35.23GB,0.0134,0.1110,0.3926 +baseline,17:06:03.541,5040,0.0062,0.0107,1.46e-05,0.7330,0.5944,0.0939,1.453,3413.8,35.23GB,0.0343,0.2555,0.5839 +baseline,17:06:09.950,5050,0.0134,0.0121,1.45e-05,0.5093,0.5622,0.0786,1.561,3171.9,35.23GB,0.0148,0.1548,0.4825 +baseline,17:06:17.497,5060,0.0138,0.0117,1.45e-05,0.4829,0.6313,0.1235,1.325,3728.2,35.23GB,0.0065,0.0761,0.4058 +baseline,17:06:25.458,5070,0.0059,0.0172,1.44e-05,0.6996,0.6565,0.1395,1.256,3924.1,35.23GB,0.0080,0.0788,0.3142 +baseline,17:06:33.408,5080,0.0138,0.0171,1.44e-05,0.4443,0.6266,0.1684,1.258,3909.6,35.23GB,0.0186,0.1404,0.4497 +baseline,17:06:46.168,5090,0.0099,0.0121,1.44e-05,0.5322,0.8661,0.4099,0.784,6259.2,35.23GB,0.0150,0.1513,0.4700 +baseline,17:06:57.703,5100,0.0020,0.0111,1.43e-05,0.4363,0.7983,0.3553,0.876,5590.5,35.23GB,0.0082,0.0916,0.3207 +baseline,17:07:06.193,5110,0.0082,0.0152,1.43e-05,0.4658,0.6827,0.1662,1.179,4149.1,35.23GB,0.0062,0.0900,0.3438 +baseline,17:07:14.264,5120,0.0028,0.0146,1.43e-05,0.4548,0.6341,0.1730,1.240,3934.6,35.23GB,0.0105,0.1045,0.2748 +baseline,17:07:22.187,5130,0.0190,0.0134,1.42e-05,0.4397,0.6244,0.1679,1.263,3856.6,35.23GB,0.0112,0.1165,0.3541 +baseline,17:07:30.212,5140,0.0159,0.0140,1.42e-05,0.3989,0.6384,0.1642,1.247,3898.6,35.23GB,0.0130,0.1171,0.4698 +baseline,17:07:39.186,5150,0.0261,0.0171,1.41e-05,0.4707,0.6758,0.2217,1.118,4336.4,35.23GB,0.0083,0.1082,0.3038 +baseline,17:07:46.627,5160,0.0092,0.0161,1.41e-05,0.4882,0.6077,0.1364,1.344,3600.9,35.23GB,0.0184,0.1879,0.6751 +baseline,17:07:54.954,5170,0.0104,0.0143,1.41e-05,0.4586,0.6881,0.1445,1.201,4020.8,35.23GB,0.0296,0.1789,0.5862 +baseline,17:08:03.331,5180,0.0075,0.0133,1.40e-05,0.4661,0.6530,0.1847,1.194,4036.6,35.23GB,0.0074,0.0744,0.4550 +baseline,17:08:11.367,5190,0.0060,0.0101,1.40e-05,0.4854,0.6462,0.1573,1.245,3863.3,35.23GB,0.0107,0.1095,0.5800 +baseline,17:08:20.268,5200,0.0122,0.0134,1.40e-05,0.4432,0.6885,0.2016,1.125,4268.5,35.23GB,0.0075,0.0883,0.6119 +baseline,17:08:29.196,5210,0.0243,0.0124,1.39e-05,0.5010,0.6809,0.2119,1.121,4274.9,35.23GB,0.0326,0.1776,0.5024 +baseline,17:08:38.412,5220,0.0127,0.0206,1.39e-05,0.4287,0.6975,0.2241,1.086,4400.8,35.23GB,0.0346,0.1743,0.6020 +baseline,17:08:47.088,5230,0.0280,0.0154,1.38e-05,0.4378,0.7042,0.1634,1.153,4136.2,35.23GB,0.0147,0.1656,0.3753 +baseline,17:08:55.020,5240,0.0084,0.0111,1.38e-05,0.4223,0.6469,0.1463,1.262,3772.2,35.23GB,0.0118,0.1256,0.6506 +baseline,17:09:02.181,5250,0.0144,0.0210,1.38e-05,0.4996,0.6173,0.0987,1.397,3399.0,35.23GB,0.0162,0.1721,0.5346 +baseline,17:09:09.676,5260,0.0010,0.0182,1.37e-05,0.4383,0.5853,0.1642,1.335,3550.6,35.23GB,0.0248,0.1257,0.3621 +baseline,17:09:16.825,5270,0.0144,0.0181,1.37e-05,0.4541,0.5952,0.1197,1.435,3295.1,35.23GB,0.0101,0.1010,0.4696 +baseline,17:09:23.591,5280,0.0251,0.0140,1.37e-05,0.4173,0.5841,0.0925,1.479,3192.1,35.23GB,0.0224,0.2121,0.5972 +baseline,17:09:31.273,5290,0.0027,0.0102,1.36e-05,0.4877,0.6232,0.1450,1.302,3617.8,35.23GB,0.0084,0.1034,0.3231 +baseline,17:09:38.455,5300,0.0082,0.0097,1.36e-05,0.4710,0.5903,0.1279,1.393,3374.3,35.23GB,0.0216,0.1614,0.5357 +baseline,17:09:46.347,5310,0.0041,0.0083,1.35e-05,0.4483,0.6430,0.1462,1.269,3696.2,35.23GB,0.0090,0.0837,0.3836 +baseline,17:09:53.974,5320,0.0150,0.0138,1.35e-05,0.6120,0.6233,0.1394,1.311,3568.4,35.23GB,0.0147,0.1268,0.6698 +baseline,17:10:02.365,5330,0.0033,0.0109,1.35e-05,0.4476,0.6181,0.2211,1.193,3914.6,35.23GB,0.0076,0.0906,0.3266 +baseline,17:10:10.586,5340,0.0151,0.0194,1.34e-05,0.4695,0.6499,0.1722,1.244,3745.3,35.23GB,0.0089,0.1019,0.3330 +baseline,17:10:19.733,5350,0.0048,0.0147,1.34e-05,0.4926,0.6993,0.2154,1.094,4249.4,35.23GB,0.0101,0.0968,0.4269 +baseline,17:10:27.410,5360,0.0039,0.0159,1.34e-05,0.4148,0.6009,0.1668,1.305,3555.6,35.23GB,0.0143,0.1218,0.3033 +baseline,17:10:34.365,5370,0.0021,0.0105,1.33e-05,0.3858,0.5901,0.1054,1.438,3219.2,35.23GB,0.0057,0.0955,0.3254 +baseline,17:10:41.251,5380,0.0122,0.0161,1.33e-05,0.4974,0.5704,0.1182,1.452,3181.1,35.23GB,0.0249,0.1835,0.4782 +baseline,17:10:48.316,5390,0.0058,0.0154,1.32e-05,0.4186,0.5884,0.1181,1.416,3255.2,35.23GB,0.0085,0.0922,0.4013 +baseline,17:10:54.978,5400,0.0120,0.0130,1.32e-05,0.4956,0.5493,0.1169,1.501,3063.8,35.23GB,0.0093,0.1023,0.3381 +baseline,17:11:02.859,5410,0.0078,0.0106,1.32e-05,0.4391,0.6278,0.1604,1.269,3617.0,35.23GB,0.0033,0.0518,0.1927 +baseline,17:11:09.695,5420,0.0120,0.0115,1.31e-05,0.4032,0.5833,0.1003,1.463,3130.1,35.23GB,0.0065,0.0969,0.3918 +baseline,17:11:16.535,5430,0.0193,0.0116,1.31e-05,0.4870,0.5692,0.1147,1.462,3125.4,35.23GB,0.0180,0.1441,0.4606 +baseline,17:11:23.232,5440,0.0037,0.0129,1.31e-05,0.4771,0.5700,0.0998,1.493,3054.0,35.23GB,0.0090,0.0890,0.3745 +baseline,17:11:29.977,5450,0.0048,0.0102,1.30e-05,0.5300,0.5850,0.0895,1.483,3068.2,35.23GB,0.0131,0.1533,0.5846 +baseline,17:11:37.360,5460,0.0022,0.0134,1.30e-05,0.4434,0.6233,0.1151,1.354,3351.8,35.23GB,0.0173,0.1675,0.4123 +baseline,17:11:45.144,5470,0.0200,0.0139,1.30e-05,0.5195,0.6288,0.1495,1.285,3525.2,35.23GB,0.0443,0.2602,0.6829 +baseline,17:11:52.943,5480,0.0055,0.0159,1.29e-05,0.5021,0.6263,0.1536,1.282,3524.6,35.23GB,0.0123,0.1062,0.4616 +baseline,17:12:01.116,5490,0.0036,0.0137,1.29e-05,0.4232,0.6420,0.1753,1.224,3685.9,35.23GB,0.0042,0.0565,0.1587 +baseline,17:12:08.297,5500,0.0052,0.0113,1.28e-05,0.3931,0.5929,0.1252,1.393,3230.8,35.23GB,0.0068,0.0903,0.4283 +baseline,17:12:16.435,5510,0.0278,0.0118,1.28e-05,0.4977,0.6372,0.1766,1.229,3653.7,35.23GB,0.0076,0.0845,0.6132 +baseline,17:12:24.235,5520,0.0080,0.0102,1.28e-05,0.4525,0.6264,0.1536,1.282,3493.8,35.23GB,0.0085,0.0745,0.2461 +baseline,17:12:31.842,5530,0.0197,0.0115,1.27e-05,0.6100,0.6356,0.1251,1.315,3399.7,35.23GB,0.0146,0.1717,0.6394 +baseline,17:12:40.009,5540,0.0031,0.0148,1.27e-05,0.5606,0.6520,0.1647,1.225,3642.1,35.23GB,0.0132,0.0935,0.3691 +baseline,17:12:46.761,5550,0.0017,0.0106,1.27e-05,0.4402,0.5819,0.0933,1.481,3003.8,35.23GB,0.0043,0.0563,0.1659 +baseline,17:12:54.302,5560,0.0014,0.0119,1.26e-05,0.4621,0.6177,0.1363,1.326,3347.8,35.23GB,0.0122,0.0897,0.3359 +baseline,17:13:01.486,5570,0.0031,0.0103,1.26e-05,0.4773,0.5979,0.1206,1.392,3182.2,35.23GB,0.0068,0.0647,0.4252 +baseline,17:13:09.820,5580,0.0048,0.0116,1.25e-05,0.4614,0.6672,0.1661,1.200,3682.8,35.23GB,0.0173,0.1370,0.4225 +baseline,17:13:17.637,5590,0.0144,0.0121,1.25e-05,0.5193,0.6259,0.1560,1.279,3447.0,35.23GB,0.0085,0.0939,0.2815 +baseline,17:13:24.900,5600,0.0149,0.0111,1.25e-05,0.5253,0.5927,0.1336,1.377,3195.1,35.23GB,0.0348,0.2154,0.6566 +baseline,17:13:32.158,5610,0.0091,0.0120,1.24e-05,0.4239,0.5839,0.1419,1.378,3185.9,35.23GB,0.0072,0.0716,0.2527 +baseline,17:13:40.885,5620,0.0135,0.0155,1.24e-05,0.4608,0.6838,0.1888,1.146,3821.6,35.23GB,0.0080,0.0770,0.3385 +baseline,17:13:48.912,5630,0.0124,0.0134,1.24e-05,0.4228,0.6467,0.1560,1.246,3507.1,35.23GB,0.0139,0.1470,0.3907 +baseline,17:13:57.165,5640,0.0041,0.0146,1.23e-05,0.3785,0.6680,0.1573,1.212,3597.8,35.23GB,0.0079,0.0988,0.3654 +baseline,17:14:05.517,5650,0.0048,0.0130,1.23e-05,0.4111,0.6482,0.1870,1.197,3632.9,35.23GB,0.0082,0.0892,0.3030 +baseline,17:14:14.198,5660,0.0112,0.0142,1.22e-05,0.4477,0.6892,0.1789,1.152,3766.9,35.23GB,0.0053,0.0604,0.2915 +baseline,17:14:23.607,5670,0.0145,0.0124,1.22e-05,0.4765,0.7165,0.2244,1.064,4070.3,35.23GB,0.0212,0.1736,0.6702 +baseline,17:14:31.625,5680,0.0098,0.0148,1.22e-05,0.4953,0.6398,0.1619,1.247,3463.0,35.23GB,0.0076,0.0801,0.4385 +baseline,17:14:40.710,5690,0.0069,0.0144,1.21e-05,0.4525,0.7173,0.1912,1.101,3914.8,35.23GB,0.0095,0.0900,0.5734 +baseline,17:14:49.137,5700,0.0714,0.0170,1.21e-05,0.5679,0.6561,0.1867,1.187,3623.2,35.23GB,0.0242,0.1511,1.4243 +baseline,17:14:58.971,5710,0.0420,0.0146,1.21e-05,0.4702,0.7319,0.2516,1.017,4218.0,35.23GB,0.0290,0.1758,0.6973 +baseline,17:15:07.941,5720,0.0140,0.0141,1.20e-05,0.5149,0.6918,0.2052,1.115,3838.9,35.23GB,0.0201,0.1572,0.6217 +baseline,17:15:15.219,5730,0.0297,0.0233,1.20e-05,0.4969,0.5996,0.1281,1.374,3107.4,35.23GB,0.0254,0.2105,0.6594 +baseline,17:15:23.564,5740,0.0147,0.0150,1.20e-05,0.4206,0.6741,0.1607,1.199,3552.5,35.23GB,0.0120,0.1070,0.5630 +baseline,17:15:31.074,5750,0.0047,0.0098,1.19e-05,0.4146,0.6152,0.1356,1.332,3191.3,35.23GB,0.0204,0.1174,0.6485 +baseline,17:15:38.607,5760,0.0105,0.0104,1.19e-05,0.3792,0.5963,0.1570,1.328,3193.4,35.23GB,0.0100,0.1265,0.5221 +baseline,17:15:47.445,5770,0.1741,0.0273,1.18e-05,0.4321,0.7013,0.1826,1.132,3737.8,35.23GB,0.0303,0.2007,0.5700 +baseline,17:15:55.137,5780,0.0161,0.0210,1.18e-05,0.5330,0.6171,0.1522,1.300,3245.5,35.23GB,0.0358,0.1539,0.5869 +baseline,17:16:03.130,5790,0.0019,0.0137,1.18e-05,0.4051,0.6354,0.1638,1.251,3364.7,35.23GB,0.0099,0.1046,0.5195 +baseline,17:16:11.541,5800,0.0052,0.0091,1.17e-05,0.4118,0.6809,0.1604,1.189,3532.0,35.23GB,0.0087,0.1271,0.3924 +baseline,17:16:18.727,5810,0.0186,0.0134,1.17e-05,0.4328,0.5782,0.1402,1.392,3010.2,35.23GB,0.0104,0.1039,0.3722 +baseline,17:16:26.398,5820,0.0068,0.0112,1.17e-05,0.4745,0.6461,0.1209,1.304,3206.5,35.23GB,0.0123,0.1072,0.3135 +baseline,17:16:33.667,5830,0.0254,0.0138,1.16e-05,0.4160,0.5977,0.1292,1.376,3030.5,35.23GB,0.0135,0.1487,0.4489 +baseline,17:16:41.448,5840,0.0168,0.0127,1.16e-05,0.4468,0.6183,0.1598,1.285,3236.4,35.23GB,0.0172,0.1455,0.5311 +baseline,17:16:49.664,5850,0.0036,0.0108,1.16e-05,0.4227,0.6865,0.1352,1.217,3409.2,35.23GB,0.0146,0.1332,0.4709 +baseline,17:16:56.863,5860,0.0165,0.0099,1.15e-05,0.4929,0.5890,0.1308,1.389,2979.6,35.23GB,0.0260,0.1905,0.9047 +baseline,17:17:06.478,5870,0.0123,0.0115,1.15e-05,0.4297,0.7284,0.2331,1.040,3970.8,35.23GB,0.0225,0.1613,0.3878 +baseline,17:17:14.775,5880,0.0174,0.0174,1.14e-05,0.4768,0.6592,0.1705,1.206,3417.6,35.23GB,0.0105,0.1079,0.6793 +baseline,17:17:22.704,5890,0.0035,0.0109,1.14e-05,0.4367,0.6255,0.1675,1.261,3258.3,35.23GB,0.0137,0.1166,0.3636 +baseline,17:17:29.863,5900,0.0104,0.0121,1.14e-05,0.5247,0.5957,0.1201,1.397,2935.0,35.23GB,0.0123,0.1175,0.3426 +baseline,17:17:36.955,5910,0.0132,0.0142,1.13e-05,0.4737,0.5821,0.1271,1.410,2900.1,35.23GB,0.0128,0.1119,0.3898 +baseline,17:17:44.003,5920,0.0222,0.0168,1.13e-05,0.4966,0.5773,0.1275,1.419,2875.0,35.23GB,0.0115,0.1370,0.6597 +baseline,17:17:50.996,5930,0.0213,0.0142,1.13e-05,0.3928,0.5873,0.1119,1.430,2845.5,35.23GB,0.0085,0.0963,0.2581 +baseline,17:17:59.030,5940,0.0115,0.0092,1.12e-05,0.4284,0.6353,0.1681,1.245,3261.1,35.23GB,0.0079,0.0813,0.3872 +baseline,17:18:06.883,5950,0.0023,0.0101,1.12e-05,0.4422,0.6283,0.1572,1.273,3180.3,35.23GB,0.0058,0.0691,0.2216 +baseline,17:18:15.086,5960,0.0182,0.0125,1.12e-05,0.4360,0.6267,0.1937,1.219,3313.6,35.23GB,0.0079,0.0906,0.2852 +baseline,17:18:22.530,5970,0.0151,0.0123,1.11e-05,0.3923,0.6108,0.1333,1.344,2999.5,35.23GB,0.0203,0.1370,0.3385 +baseline,17:18:30.802,5980,0.0057,0.0129,1.11e-05,0.4138,0.6560,0.1713,1.209,3324.9,35.23GB,0.0046,0.0484,0.2279 +baseline,17:18:38.744,5990,0.0974,0.0216,1.10e-05,0.4690,0.6386,0.1555,1.259,3184.0,35.23GB,0.0383,0.2206,0.6704 +baseline,17:18:46.268,6000,0.0108,0.0166,1.10e-05,0.4662,0.6373,0.1151,1.329,3009.3,35.23GB,0.0103,0.1015,0.5939 +baseline,17:20:03.248,6010,0.0032,0.0138,1.10e-05,0.4452,0.5486,7.1493,0.130,30714.4,35.23GB,0.0078,0.0742,0.2742 +baseline,17:20:09.653,6020,0.0064,0.0097,1.09e-05,0.4565,0.5474,0.0932,1.561,2549.2,35.23GB,0.0202,0.1297,0.5551 +baseline,17:20:16.677,6030,0.0079,0.0116,1.09e-05,0.4675,0.5829,0.1194,1.424,2787.7,35.23GB,0.0093,0.0848,0.4347 +baseline,17:20:23.335,6040,0.0074,0.0118,1.09e-05,0.4754,0.5518,0.1141,1.502,2636.4,35.23GB,0.0122,0.1159,0.3456 +baseline,17:20:29.458,6050,0.0070,0.0078,1.08e-05,0.3696,0.5377,0.0745,1.634,2417.9,35.23GB,0.0075,0.0871,0.2742 +baseline,17:20:35.764,6060,0.0091,0.0090,1.08e-05,0.3595,0.5430,0.0876,1.586,2484.4,35.23GB,0.0114,0.0852,0.2392 +baseline,17:20:42.199,6070,0.0097,0.0084,1.08e-05,0.3908,0.5493,0.0942,1.554,2528.6,35.23GB,0.0072,0.0836,0.2805 +baseline,17:20:48.546,6080,0.0028,0.0137,1.07e-05,0.4124,0.5404,0.0943,1.576,2487.5,35.23GB,0.0047,0.0688,0.1931 +baseline,17:20:55.268,6090,0.0025,0.0105,1.07e-05,0.4347,0.5532,0.1190,1.488,2627.8,35.23GB,0.0220,0.2006,0.4370 +baseline,17:21:01.829,6100,0.0096,0.0150,1.06e-05,0.5231,0.5492,0.1069,1.524,2558.4,35.23GB,0.0094,0.0951,0.4486 +baseline,17:21:08.443,6110,0.0070,0.0135,1.06e-05,0.4706,0.5484,0.1130,1.512,2572.5,35.23GB,0.0154,0.1844,0.4852 +baseline,17:21:15.349,6120,0.0153,0.0132,1.06e-05,0.4617,0.5556,0.1350,1.448,2679.1,35.23GB,0.0096,0.1312,0.4227 +baseline,17:21:22.607,6130,0.0065,0.0113,1.05e-05,0.4387,0.5954,0.1304,1.378,2808.4,35.23GB,0.0086,0.1411,0.4460 +baseline,17:21:29.253,6140,0.0045,0.0157,1.05e-05,0.4318,0.5392,0.1254,1.505,2565.1,35.23GB,0.0044,0.0606,0.1837 +baseline,17:21:36.673,6150,0.0048,0.0105,1.05e-05,0.4372,0.6048,0.1373,1.348,2856.5,35.23GB,0.0052,0.0655,0.2953 +baseline,17:21:43.183,6160,0.0154,0.0127,1.04e-05,0.4479,0.5438,0.1072,1.536,2499.5,35.23GB,0.0073,0.0895,0.3389 +baseline,17:21:49.611,6170,0.0055,0.0134,1.04e-05,0.4963,0.5382,0.1045,1.556,2461.5,35.23GB,0.0363,0.1370,0.6139 +baseline,17:21:55.882,6180,0.0080,0.0118,1.04e-05,0.3926,0.5325,0.0946,1.595,2395.4,35.23GB,0.0047,0.0517,0.1398 +baseline,17:22:02.081,6190,0.0040,0.0073,1.03e-05,0.4432,0.5261,0.0938,1.613,2361.5,35.23GB,0.0086,0.1084,0.9480 +baseline,17:22:08.489,6200,0.0081,0.0104,1.03e-05,0.5147,0.5399,0.1008,1.561,2434.3,35.23GB,0.0375,0.2099,0.5923 +baseline,17:22:15.297,6210,0.0139,0.0093,1.03e-05,0.4109,0.5599,0.1209,1.469,2579.8,35.23GB,0.0117,0.0929,0.2549 +baseline,17:22:21.969,6220,0.0346,0.0152,1.02e-05,0.5165,0.5631,0.1042,1.499,2521.7,35.23GB,0.0215,0.1220,0.3865 +baseline,17:22:28.706,6230,0.0093,0.0107,1.02e-05,0.4855,0.5617,0.1121,1.485,2539.4,35.23GB,0.0103,0.0829,0.3281 +baseline,17:22:35.729,6240,0.0349,0.0152,1.02e-05,0.4794,0.5726,0.1296,1.424,2640.2,35.23GB,0.0355,0.1813,0.4371 +baseline,17:22:42.344,6250,0.0092,0.0147,1.01e-05,0.3989,0.5681,0.0934,1.512,2480.3,35.23GB,0.0089,0.1104,0.3645 +baseline,17:22:49.268,6260,0.0033,0.0119,1.01e-05,0.4083,0.5592,0.1332,1.445,2589.1,35.23GB,0.0091,0.0866,0.2380 +baseline,17:22:56.021,6270,0.0029,0.0080,1.00e-05,0.3941,0.5570,0.1183,1.481,2518.2,35.23GB,0.0060,0.0728,0.4052 +baseline,17:23:03.604,6280,0.0131,0.0112,1.00e-05,0.3984,0.6331,0.1252,1.319,2820.6,35.23GB,0.0047,0.0637,0.2495 +baseline,17:23:11.371,6290,0.0010,0.0078,9.98e-06,0.4164,0.6119,0.1648,1.288,2881.2,35.23GB,0.0068,0.0616,0.3634 +baseline,17:23:18.405,6300,0.0091,0.0092,9.94e-06,0.4439,0.5702,0.1333,1.422,2602.1,35.23GB,0.0070,0.0699,0.2761 +baseline,17:23:24.833,6310,0.0078,0.0087,9.91e-06,0.4614,0.5421,0.1007,1.556,2371.6,35.23GB,0.0250,0.1716,0.5629 +baseline,17:23:31.311,6320,0.0240,0.0085,9.87e-06,0.4590,0.5383,0.1095,1.544,2383.5,35.23GB,0.0084,0.0829,0.4106 +baseline,17:23:39.032,6330,0.0027,0.0101,9.84e-06,0.4704,0.6246,0.1476,1.295,2833.2,35.23GB,0.0067,0.0711,0.4407 +baseline,17:23:45.582,6340,0.0130,0.0089,9.80e-06,0.3953,0.5525,0.1025,1.527,2396.9,35.23GB,0.0064,0.0704,0.2309 +baseline,17:23:53.273,6350,0.0146,0.0113,9.77e-06,0.4166,0.6297,0.1394,1.300,2806.9,35.23GB,0.0168,0.1507,0.4342 +baseline,17:24:00.907,6360,0.0017,0.0165,9.73e-06,0.4751,0.6191,0.1443,1.310,2778.2,35.23GB,0.0153,0.1510,0.4497 +baseline,17:24:07.613,6370,0.0109,0.0109,9.70e-06,0.3674,0.5563,0.1144,1.491,2433.9,35.23GB,0.0074,0.0891,0.3788 +baseline,17:24:16.161,6380,0.0039,0.0123,9.66e-06,0.4795,0.6489,0.2058,1.170,3093.8,35.23GB,0.0061,0.0712,0.2121 +baseline,17:24:23.482,6390,0.0237,0.0133,9.63e-06,0.4370,0.5917,0.1404,1.366,2642.6,35.23GB,0.0133,0.1140,0.5281 +baseline,17:24:30.529,6400,0.0191,0.0141,9.59e-06,0.4858,0.5791,0.1257,1.419,2536.7,35.23GB,0.0233,0.1597,0.5572 +baseline,17:24:37.658,6410,0.0042,0.0105,9.56e-06,0.3693,0.5790,0.1340,1.403,2559.0,35.23GB,0.0064,0.0775,0.2957 +baseline,17:24:44.489,6420,0.0066,0.0091,9.52e-06,0.4044,0.5613,0.1217,1.464,2444.7,35.23GB,0.0086,0.0814,0.2885 +baseline,17:24:51.318,6430,0.0032,0.0120,9.49e-06,0.4763,0.5801,0.1029,1.464,2438.0,35.23GB,0.0204,0.1297,0.4928 +baseline,17:24:59.768,6440,0.0111,0.0099,9.46e-06,0.4240,0.6527,0.1923,1.184,3007.7,35.23GB,0.0058,0.0718,0.2664 +baseline,17:25:08.023,6450,0.0138,0.0126,9.42e-06,0.4940,0.6408,0.1846,1.212,2929.9,35.23GB,0.0051,0.0624,0.2539 +baseline,17:25:15.460,6460,0.0248,0.0119,9.39e-06,0.4657,0.6013,0.1425,1.345,2632.6,35.23GB,0.0093,0.1185,0.4307 +baseline,17:25:22.801,6470,0.0031,0.0111,9.35e-06,0.4221,0.5835,0.1505,1.363,2590.8,35.23GB,0.0080,0.0880,0.2845 +baseline,17:25:31.742,6480,0.0031,0.0084,9.32e-06,0.4472,0.6779,0.2162,1.119,3146.7,35.23GB,0.0040,0.0577,0.1578 +baseline,17:25:40.368,6490,0.0038,0.0101,9.28e-06,0.3873,0.6611,0.2015,1.159,3027.6,35.23GB,0.0036,0.0545,0.2783 +baseline,17:25:48.061,6500,0.0154,0.0140,9.25e-06,0.4666,0.6104,0.1590,1.300,2692.1,35.23GB,0.0109,0.0772,0.5765 +baseline,17:25:57.165,6510,0.0061,0.0109,9.22e-06,0.4959,0.6846,0.2258,1.099,3176.9,35.23GB,0.0168,0.1677,0.4779 +baseline,17:26:04.399,6520,0.0084,0.0081,9.18e-06,0.4543,0.5943,0.1291,1.383,2517.1,35.23GB,0.0089,0.1025,0.4122 +baseline,17:26:11.221,6530,0.0130,0.0119,9.15e-06,0.6308,0.5700,0.1122,1.466,2366.8,35.23GB,0.0167,0.1528,2.1178 +baseline,17:26:19.277,6540,0.0092,0.0098,9.11e-06,0.4523,0.6225,0.1830,1.242,2786.7,35.23GB,0.0112,0.0734,0.4396 +baseline,17:26:27.163,6550,0.0110,0.0097,9.08e-06,0.5361,0.6214,0.1672,1.268,2720.3,35.23GB,0.0104,0.1157,0.3954 +baseline,17:26:35.444,6560,0.0027,0.0103,9.05e-06,0.4760,0.6505,0.1776,1.208,2848.3,35.23GB,0.0044,0.0548,0.2042 +baseline,17:26:44.717,6570,0.0386,0.0241,9.01e-06,0.4878,0.6882,0.2391,1.079,3180.3,35.23GB,0.0237,0.1588,0.5311 +baseline,17:26:54.074,6580,0.0061,0.0166,8.98e-06,0.5750,0.7137,0.2220,1.069,3199.5,35.23GB,0.0082,0.1024,0.3656 +baseline,17:27:01.366,6590,0.0177,0.0106,8.94e-06,0.4455,0.6019,0.1273,1.371,2486.3,35.23GB,0.0132,0.1402,0.5067 +baseline,17:27:08.736,6600,0.0060,0.0194,8.91e-06,0.4360,0.6098,0.1272,1.357,2505.5,35.23GB,0.0045,0.0606,0.2424 +baseline,17:27:17.308,6610,0.0075,0.0129,8.88e-06,0.4718,0.6649,0.1923,1.167,2905.4,35.23GB,0.0077,0.0890,0.2418 +baseline,17:27:25.954,6620,0.0025,0.0078,8.84e-06,0.4202,0.6860,0.1786,1.157,2922.3,35.23GB,0.0055,0.0666,0.2066 +baseline,17:27:33.834,6630,0.0144,0.0094,8.81e-06,0.4697,0.6322,0.1558,1.269,2654.9,35.23GB,0.0243,0.1874,0.6131 +baseline,17:27:41.573,6640,0.0097,0.0090,8.78e-06,0.4584,0.6155,0.1584,1.292,2600.1,35.23GB,0.0144,0.1077,0.4142 +baseline,17:27:50.661,6650,0.0033,0.0104,8.74e-06,0.4646,0.6783,0.2306,1.100,3044.1,35.23GB,0.0180,0.1083,0.3408 +baseline,17:28:00.793,6660,0.0026,0.0092,8.71e-06,0.4279,0.7464,0.2667,0.987,3383.5,35.23GB,0.0200,0.1622,0.4295 +baseline,17:28:08.956,6670,0.0121,0.0078,8.68e-06,0.4548,0.6525,0.1638,1.225,2718.1,35.23GB,0.0070,0.0688,0.4250 +baseline,17:28:16.964,6680,0.0054,0.0080,8.64e-06,0.5099,0.6410,0.1598,1.249,2658.4,35.23GB,0.0215,0.1683,0.8534 +baseline,17:28:25.540,6690,0.0060,0.0117,8.61e-06,0.4862,0.6285,0.2292,1.166,2838.2,35.23GB,0.0102,0.1206,0.6703 +baseline,17:28:34.213,6700,0.0135,0.0088,8.58e-06,0.4922,0.6611,0.2062,1.153,2861.7,35.23GB,0.0393,0.2061,0.5453 +baseline,17:28:43.747,6710,0.0152,0.0094,8.55e-06,0.4412,0.6997,0.2537,1.049,3136.2,35.23GB,0.0090,0.1132,0.4726 +baseline,17:28:52.257,6720,0.0022,0.0109,8.51e-06,0.4401,0.6620,0.1890,1.175,2790.9,35.23GB,0.0081,0.0728,0.5237 +baseline,17:29:00.174,6730,0.0057,0.0098,8.48e-06,0.3820,0.6268,0.1649,1.263,2588.7,35.23GB,0.0092,0.0783,0.3421 +baseline,17:29:06.665,6740,0.0152,0.0140,8.45e-06,0.4281,0.5434,0.1057,1.541,2115.9,35.23GB,0.0211,0.0950,0.3499 +baseline,17:29:13.869,6750,0.0017,0.0106,8.41e-06,0.7323,0.5916,0.1287,1.388,2340.7,35.23GB,0.0174,0.1450,0.3623 +baseline,17:29:20.595,6760,0.0078,0.0091,8.38e-06,0.4340,0.5738,0.0988,1.487,2178.9,35.23GB,0.0099,0.1696,0.4910 +baseline,17:29:27.660,6770,0.0279,0.0154,8.35e-06,0.5581,0.5687,0.1378,1.416,2281.7,35.23GB,0.0092,0.1205,0.7929 +baseline,17:29:35.476,6780,0.0501,0.0158,8.32e-06,0.4557,0.6255,0.1561,1.280,2516.6,35.23GB,0.0248,0.1819,0.4957 +baseline,17:29:43.720,6790,0.0186,0.0154,8.28e-06,0.5012,0.6459,0.1785,1.213,2646.0,35.23GB,0.0138,0.0986,0.4404 +baseline,17:29:51.398,6800,0.0092,0.0101,8.25e-06,0.4692,0.6171,0.1507,1.303,2456.2,35.23GB,0.0690,0.2389,0.7537 +baseline,17:30:00.012,6810,0.0085,0.0097,8.22e-06,0.4539,0.6811,0.1803,1.161,2747.5,35.23GB,0.0198,0.1296,0.6356 +baseline,17:30:08.735,6820,0.0103,0.0095,8.19e-06,0.3872,0.6497,0.2226,1.147,2773.6,35.23GB,0.0052,0.0716,0.3297 +baseline,17:30:16.772,6830,0.0032,0.0105,8.15e-06,0.4768,0.6339,0.1698,1.244,2547.3,35.23GB,0.0052,0.0803,0.2802 +baseline,17:30:23.368,6840,0.0180,0.0098,8.12e-06,0.4342,0.5453,0.1144,1.516,2084.1,35.23GB,0.0043,0.0582,0.4101 +baseline,17:30:30.269,6850,0.0095,0.0090,8.09e-06,0.4118,0.5733,0.1168,1.449,2173.4,35.23GB,0.0149,0.1398,0.4590 +baseline,17:30:37.031,6860,0.0100,0.0105,8.06e-06,0.4583,0.5775,0.0987,1.479,2123.1,35.23GB,0.0044,0.0603,0.2797 +baseline,17:30:45.698,6870,0.0052,0.0081,8.02e-06,0.3871,0.6651,0.2016,1.154,2712.4,35.23GB,0.0205,0.1127,0.3264 +baseline,17:30:53.689,6880,0.0059,0.0145,7.99e-06,0.4923,0.6473,0.1518,1.252,2492.7,35.23GB,0.0192,0.1207,0.5083 +baseline,17:31:02.621,6890,0.0433,0.0123,7.96e-06,0.4442,0.6525,0.2406,1.120,2777.5,35.23GB,0.0135,0.1162,0.5382 +baseline,17:31:10.336,6900,0.0174,0.0093,7.93e-06,0.4496,0.6218,0.1497,1.296,2391.5,35.23GB,0.0109,0.1300,0.6347 +baseline,17:31:18.898,6910,0.0018,0.0184,7.90e-06,0.5174,0.6664,0.1897,1.168,2645.1,35.23GB,0.0094,0.0799,0.5407 +baseline,17:31:25.770,6920,0.0016,0.0154,7.87e-06,0.4622,0.5673,0.1199,1.455,2116.3,35.23GB,0.0228,0.1140,0.3810 +baseline,17:31:33.435,6930,0.0035,0.0168,7.83e-06,0.5384,0.6173,0.1492,1.305,2352.9,35.23GB,0.0174,0.1377,0.4515 +baseline,17:31:41.211,6940,0.0095,0.0154,7.80e-06,0.5030,0.6172,0.1604,1.286,2379.0,35.23GB,0.0131,0.1010,0.3813 +baseline,17:31:49.313,6950,0.0061,0.0124,7.77e-06,0.5494,0.6446,0.1656,1.234,2470.8,35.23GB,0.0228,0.1314,0.4266 +baseline,17:31:55.699,6960,0.0448,0.0195,7.74e-06,0.4732,0.5529,0.0857,1.566,1941.0,35.23GB,0.0372,0.1561,0.6129 +baseline,17:32:06.742,6970,0.0426,0.0164,7.71e-06,0.5222,0.7599,0.3445,0.906,3345.8,35.23GB,0.0503,0.2581,0.9567 +baseline,17:32:17.604,6980,0.0070,0.0125,7.68e-06,0.4690,0.7888,0.2972,0.921,3278.1,35.23GB,0.0263,0.1108,0.4049 +baseline,17:32:26.500,6990,0.0014,0.0092,7.65e-06,0.4258,0.6751,0.2145,1.124,2677.5,35.23GB,0.0097,0.0833,0.3147 +baseline,17:32:35.179,7000,0.0146,0.0099,7.61e-06,0.4394,0.6604,0.2075,1.152,2603.3,35.23GB,0.0115,0.1022,0.4005 +baseline,17:34:46.163,7010,0.0076,0.0086,7.58e-06,0.3813,0.7704,12.3280,0.076,39164.0,35.23GB,0.0058,0.0676,0.2587 +baseline,17:34:53.353,7020,0.0054,0.0081,7.55e-06,0.4347,0.5740,0.1450,1.391,2142.3,35.23GB,0.0101,0.1129,0.4213 +baseline,17:35:00.659,7030,0.0038,0.0083,7.52e-06,0.3900,0.5976,0.1329,1.369,2169.5,35.23GB,0.0058,0.0712,0.3955 +baseline,17:35:07.941,7040,0.0019,0.0061,7.49e-06,0.4080,0.5820,0.1462,1.374,2155.0,35.23GB,0.0160,0.1288,0.4859 +baseline,17:35:14.667,7050,0.0664,0.0131,7.46e-06,0.4820,0.5352,0.1374,1.487,1983.7,35.23GB,0.0252,0.1369,0.5362 +baseline,17:35:22.351,7060,0.0027,0.0088,7.43e-06,0.3797,0.6346,0.1339,1.301,2259.0,35.23GB,0.0239,0.1960,0.5914 +baseline,17:35:29.705,7070,0.0050,0.0109,7.40e-06,0.5365,0.5752,0.1602,1.360,2154.4,35.23GB,0.0268,0.1706,0.6827 +baseline,17:35:37.780,7080,0.0154,0.0100,7.37e-06,0.3962,0.6492,0.1583,1.239,2357.6,35.23GB,0.0097,0.1159,0.3217 +baseline,17:35:45.325,7090,0.0019,0.0108,7.34e-06,0.4023,0.6040,0.1505,1.326,2195.0,35.23GB,0.0120,0.1154,0.4222 +baseline,17:35:53.625,7100,0.0106,0.0107,7.31e-06,0.3653,0.6445,0.1855,1.205,2406.9,35.23GB,0.0143,0.0950,0.3708 +baseline,17:36:01.753,7110,0.0144,0.0095,7.28e-06,0.3954,0.6415,0.1713,1.231,2348.6,35.23GB,0.0300,0.1790,0.6459 +baseline,17:36:09.534,7120,0.0032,0.0098,7.24e-06,0.4306,0.6208,0.1573,1.285,2240.6,35.23GB,0.0319,0.1830,0.5940 +baseline,17:36:16.842,7130,0.0051,0.0090,7.21e-06,0.5466,0.6041,0.1267,1.369,2096.9,35.23GB,0.0098,0.0911,0.3407 +baseline,17:36:23.692,7140,0.0035,0.0085,7.18e-06,0.3823,0.5697,0.1154,1.460,1959.0,35.23GB,0.0084,0.0832,0.3281 +baseline,17:36:30.549,7150,0.0848,0.0206,7.15e-06,0.4636,0.5651,0.1206,1.459,1953.9,35.23GB,0.0409,0.2027,0.5652 +baseline,17:36:38.443,7160,0.0156,0.0192,7.12e-06,0.4121,0.6401,0.1493,1.267,2241.6,35.23GB,0.0317,0.1752,0.5001 +baseline,17:36:44.591,7170,0.0055,0.0118,7.09e-06,0.3978,0.5222,0.0926,1.627,1739.5,35.23GB,0.0128,0.0908,0.3423 +baseline,17:36:52.693,7180,0.0215,0.0096,7.06e-06,0.5249,0.6436,0.1667,1.234,2284.5,35.23GB,0.0358,0.1447,0.6794 +baseline,17:37:01.877,7190,0.0084,0.0107,7.03e-06,0.4431,0.7177,0.2006,1.089,2580.1,35.23GB,0.0089,0.0795,0.5247 +baseline,17:37:09.119,7200,0.0158,0.0106,7.00e-06,0.4388,0.5786,0.1457,1.381,2027.7,35.23GB,0.0050,0.0873,0.2515 +baseline,17:37:17.149,7210,0.0173,0.0111,6.97e-06,0.4576,0.6400,0.1630,1.245,2240.1,35.23GB,0.0143,0.1131,0.3729 +baseline,17:37:24.973,7220,0.0108,0.0111,6.94e-06,0.4390,0.6236,0.1589,1.278,2174.8,35.23GB,0.0068,0.0865,0.3517 +baseline,17:37:32.631,7230,0.0114,0.0080,6.92e-06,0.3283,0.6062,0.1594,1.306,2120.7,35.23GB,0.0125,0.0881,0.2713 +baseline,17:37:39.601,7240,0.0045,0.0079,6.89e-06,0.4444,0.5856,0.1114,1.435,1923.5,35.23GB,0.0077,0.0653,0.2503 +baseline,17:37:47.344,7250,0.0131,0.0110,6.86e-06,0.4561,0.6163,0.1581,1.292,2129.1,35.23GB,0.0293,0.1409,0.5115 +baseline,17:37:55.742,7260,0.0017,0.0070,6.83e-06,0.3825,0.6681,0.1716,1.191,2300.5,35.23GB,0.0089,0.0917,0.4045 +baseline,17:38:03.721,7270,0.0131,0.0081,6.80e-06,0.3836,0.6351,0.1628,1.253,2178.0,35.23GB,0.0095,0.0844,0.3436 +baseline,17:38:10.497,7280,0.0073,0.0104,6.77e-06,0.4571,0.5534,0.1242,1.476,1842.7,35.23GB,0.0131,0.0955,0.3079 +baseline,17:38:17.544,7290,0.0055,0.0085,6.74e-06,0.4757,0.5731,0.1316,1.419,1909.4,35.23GB,0.0056,0.0666,0.2657 +baseline,17:38:24.383,7300,0.0241,0.0145,6.71e-06,0.4384,0.5601,0.1239,1.462,1846.3,35.23GB,0.0219,0.1210,0.4632 +baseline,17:38:31.370,7310,0.0029,0.0102,6.68e-06,0.4136,0.5997,0.0990,1.431,1879.3,35.23GB,0.0251,0.1251,0.5817 +baseline,17:38:37.590,7320,0.0011,0.0103,6.65e-06,0.4112,0.5315,0.0904,1.608,1666.5,35.23GB,0.0139,0.1262,0.5130 +baseline,17:38:44.099,7330,0.0136,0.0097,6.62e-06,0.4671,0.5480,0.1030,1.537,1737.7,35.23GB,0.0100,0.0966,0.4268 +baseline,17:38:51.170,7340,0.0109,0.0129,6.59e-06,0.4169,0.5897,0.1174,1.415,1880.5,35.23GB,0.0089,0.0804,0.3922 +baseline,17:38:58.156,7350,0.0022,0.0226,6.57e-06,0.4163,0.5675,0.1311,1.432,1850.8,35.23GB,0.0087,0.0921,0.2930 +baseline,17:39:06.402,7360,0.0387,0.0234,6.54e-06,0.4596,0.6489,0.1757,1.213,2176.7,35.23GB,0.0204,0.1682,0.4860 +baseline,17:39:13.794,7370,0.0018,0.0186,6.51e-06,0.4727,0.5890,0.1502,1.353,1943.9,35.23GB,0.0092,0.0943,0.5563 +baseline,17:39:20.815,7380,0.0063,0.0101,6.48e-06,0.4123,0.5711,0.1311,1.424,1839.3,35.23GB,0.0066,0.0833,0.4583 +baseline,17:39:28.422,7390,0.0073,0.0079,6.45e-06,0.3525,0.6370,0.1236,1.315,1985.1,35.23GB,0.0129,0.1230,0.5558 +baseline,17:39:34.871,7400,0.0049,0.0146,6.42e-06,0.4585,0.5356,0.1093,1.551,1676.4,35.23GB,0.0066,0.0672,0.2197 +baseline,17:39:41.839,7410,0.0018,0.0231,6.40e-06,0.4757,0.5868,0.1099,1.436,1804.2,35.23GB,0.0121,0.1029,0.3246 +baseline,17:39:49.269,7420,0.0030,0.0114,6.37e-06,0.4472,0.5830,0.1600,1.346,1916.7,35.23GB,0.0280,0.1580,0.4659 +baseline,17:39:56.389,7430,0.0128,0.0103,6.34e-06,0.5062,0.5979,0.1141,1.405,1829.4,35.23GB,0.0183,0.1687,0.4916 +baseline,17:40:03.740,7440,0.0039,0.0101,6.31e-06,0.3664,0.5874,0.1478,1.360,1881.7,35.23GB,0.0269,0.1746,0.4594 +baseline,17:40:11.197,7450,0.0047,0.0076,6.28e-06,0.4342,0.5904,0.1552,1.341,1901.1,35.23GB,0.0122,0.1079,0.2842 +baseline,17:40:17.793,7460,0.0035,0.0087,6.26e-06,0.4218,0.5560,0.1036,1.516,1675.3,35.23GB,0.0075,0.0872,0.2216 +baseline,17:40:24.448,7470,0.0329,0.0104,6.23e-06,0.4206,0.5639,0.1015,1.503,1683.3,35.23GB,0.0142,0.1799,0.5257 +baseline,17:40:30.544,7480,0.0072,0.0211,6.20e-06,0.5161,0.5143,0.0954,1.641,1535.9,35.23GB,0.0162,0.1370,0.4836 +baseline,17:40:36.741,7490,0.0699,0.0183,6.17e-06,0.4143,0.5414,0.0783,1.614,1555.3,35.23GB,0.0374,0.1572,0.5161 +baseline,17:40:42.760,7500,0.0051,0.0135,6.15e-06,0.4014,0.5153,0.0866,1.662,1504.5,35.23GB,0.0160,0.1234,0.5458 +baseline,17:40:48.744,7510,0.0038,0.0086,6.12e-06,0.3868,0.5160,0.0824,1.671,1489.8,35.23GB,0.0148,0.0973,0.3685 +baseline,17:40:54.702,7520,0.0198,0.0118,6.09e-06,0.4516,0.5246,0.0712,1.679,1477.3,35.23GB,0.0141,0.1082,0.5338 +baseline,17:41:00.532,7530,0.0128,0.0097,6.06e-06,0.4565,0.5003,0.0826,1.716,1439.7,35.23GB,0.0144,0.1283,0.3876 +baseline,17:41:06.637,7540,0.0246,0.0087,6.04e-06,0.4375,0.5338,0.0767,1.638,1501.6,35.23GB,0.0264,0.2225,0.7478 +baseline,17:41:12.401,7550,0.0036,0.0108,6.01e-06,0.4701,0.5068,0.0696,1.735,1411.9,35.23GB,0.0084,0.0893,0.2518 +baseline,17:41:19.466,7560,0.0029,0.0080,5.98e-06,0.4119,0.5683,0.1382,1.416,1723.8,35.23GB,0.0186,0.0959,0.3808 +baseline,17:41:26.773,7570,0.0041,0.0108,5.96e-06,0.4601,0.5578,0.1728,1.369,1775.2,35.23GB,0.0192,0.1351,0.4243 +baseline,17:41:32.659,7580,0.0029,0.0174,5.93e-06,0.4063,0.5032,0.0854,1.699,1424.2,35.23GB,0.0067,0.0689,0.2665 +baseline,17:41:39.493,7590,0.0078,0.0114,5.90e-06,0.3959,0.5809,0.1025,1.464,1646.7,35.23GB,0.0262,0.1231,0.5746 +baseline,17:41:45.971,7600,0.0060,0.0088,5.88e-06,0.4206,0.5338,0.1139,1.544,1554.4,35.23GB,0.0058,0.0678,0.3898 +baseline,17:41:51.893,7610,0.0034,0.0076,5.85e-06,0.3867,0.5038,0.0883,1.689,1414.9,35.23GB,0.0114,0.0820,0.2910 +baseline,17:41:57.944,7620,0.0022,0.0083,5.82e-06,0.4177,0.5324,0.0727,1.653,1440.1,35.23GB,0.0283,0.1576,0.4428 +baseline,17:42:03.860,7630,0.0146,0.0079,5.80e-06,0.4106,0.5076,0.0840,1.691,1401.8,35.23GB,0.0071,0.0808,0.4166 +baseline,17:42:10.459,7640,0.0035,0.0088,5.77e-06,0.4251,0.5437,0.1162,1.516,1557.2,35.23GB,0.0180,0.1541,0.4093 +baseline,17:42:16.783,7650,0.0019,0.0059,5.74e-06,0.3544,0.5240,0.1085,1.582,1485.9,35.23GB,0.0044,0.0591,0.3781 +baseline,17:42:23.488,7660,0.0045,0.0074,5.72e-06,0.4114,0.5496,0.1208,1.492,1568.5,35.23GB,0.0079,0.0729,0.3384 +baseline,17:42:31.441,7670,0.0128,0.0074,5.69e-06,0.4406,0.6453,0.1500,1.258,1852.9,35.23GB,0.0111,0.0838,0.3621 +baseline,17:42:38.704,7680,0.0088,0.0125,5.67e-06,0.4383,0.5897,0.1365,1.377,1684.8,35.23GB,0.0096,0.0796,0.4381 +baseline,17:42:47.796,7690,0.0072,0.0099,5.64e-06,0.4035,0.7228,0.1863,1.100,2099.8,35.23GB,0.0150,0.1208,0.4197 +baseline,17:42:56.362,7700,0.0126,0.0109,5.61e-06,0.4054,0.6452,0.2114,1.168,1969.9,35.23GB,0.0049,0.0674,0.2376 +baseline,17:43:03.031,7710,0.0073,0.0083,5.59e-06,0.4464,0.5577,0.1092,1.500,1527.0,35.23GB,0.0046,0.0558,0.1890 +baseline,17:43:10.273,7720,0.0137,0.0099,5.56e-06,0.4010,0.6138,0.1104,1.381,1650.9,35.23GB,0.0124,0.0784,0.3504 +baseline,17:43:17.453,7730,0.0065,0.0121,5.54e-06,0.3954,0.5910,0.1268,1.393,1629.6,35.23GB,0.0071,0.0858,0.3243 +baseline,17:43:24.282,7740,0.0033,0.0091,5.51e-06,0.3501,0.5794,0.1035,1.465,1543.2,35.23GB,0.0066,0.0796,0.3764 +baseline,17:43:30.982,7750,0.0059,0.0083,5.49e-06,0.4090,0.5737,0.0962,1.493,1507.3,35.23GB,0.0062,0.0662,0.2217 +baseline,17:43:37.256,7760,0.0164,0.0119,5.46e-06,0.4655,0.5316,0.0958,1.594,1405.1,35.23GB,0.0260,0.1267,0.7992 +baseline,17:43:44.057,7770,0.0041,0.0087,5.44e-06,0.3538,0.5717,0.1085,1.470,1516.6,35.23GB,0.0054,0.0696,0.2679 +baseline,17:43:50.356,7780,0.0058,0.0082,5.41e-06,0.4690,0.5333,0.0965,1.588,1398.0,35.23GB,0.0031,0.0490,0.2257 +baseline,17:43:57.419,7790,0.0103,0.0070,5.39e-06,0.4923,0.5980,0.1084,1.416,1560.8,35.23GB,0.0249,0.1574,0.5988 +baseline,17:44:03.732,7800,0.0054,0.0067,5.36e-06,0.3543,0.5453,0.0859,1.584,1388.6,35.23GB,0.0044,0.0574,0.2591 +baseline,17:44:10.247,7810,0.0142,0.0132,5.34e-06,0.4625,0.5375,0.1140,1.535,1426.5,35.23GB,0.0157,0.1044,0.4074 +baseline,17:44:16.483,7820,0.0034,0.0127,5.31e-06,0.3906,0.5421,0.0815,1.604,1359.3,35.23GB,0.0037,0.0425,0.2025 +baseline,17:44:22.576,7830,0.0103,0.0110,5.29e-06,0.4765,0.5214,0.0879,1.641,1322.0,35.23GB,0.0314,0.1867,0.6395 +baseline,17:44:28.466,7840,0.0077,0.0078,5.26e-06,0.3303,0.5090,0.0800,1.698,1272.0,35.23GB,0.0077,0.0676,0.2029 +baseline,17:44:34.853,7850,0.0090,0.0086,5.24e-06,0.5022,0.5524,0.0864,1.566,1373.0,35.23GB,0.0454,0.1846,0.5776 +baseline,17:44:40.832,7860,0.0070,0.0089,5.21e-06,0.4499,0.5072,0.0907,1.673,1279.4,35.23GB,0.0189,0.1294,0.4682 +baseline,17:44:47.251,7870,0.0013,0.0087,5.19e-06,0.4694,0.5533,0.0887,1.558,1367.1,35.23GB,0.0093,0.1312,0.3361 +baseline,17:44:53.407,7880,0.0014,0.0066,5.17e-06,0.4225,0.5287,0.0868,1.625,1304.8,35.23GB,0.0048,0.0659,0.3177 +baseline,17:45:00.123,7890,0.0059,0.0098,5.14e-06,0.4845,0.5629,0.1086,1.489,1416.7,35.23GB,0.0096,0.1090,0.5183 +baseline,17:45:06.692,7900,0.0023,0.0078,5.12e-06,0.3455,0.5614,0.0955,1.522,1379.4,35.23GB,0.0071,0.0548,0.1953 +baseline,17:45:13.926,7910,0.0040,0.0109,5.09e-06,0.4321,0.5875,0.1359,1.383,1511.7,35.23GB,0.0036,0.0516,0.2738 +baseline,17:45:21.976,7920,0.0395,0.0123,5.07e-06,0.5088,0.6407,0.1642,1.243,1674.0,35.23GB,0.0171,0.1194,0.4875 +baseline,17:45:29.861,7930,0.0041,0.0155,5.05e-06,0.4066,0.6040,0.1845,1.268,1632.1,35.23GB,0.0141,0.0852,0.2464 +baseline,17:45:37.636,7940,0.0188,0.0161,5.02e-06,0.4506,0.6039,0.1736,1.286,1601.4,35.23GB,0.0214,0.1318,0.4975 +baseline,17:45:45.788,7950,0.0071,0.0100,5.00e-06,0.4308,0.6438,0.1714,1.227,1670.9,35.23GB,0.0139,0.1245,0.3845 +baseline,17:45:53.606,7960,0.0033,0.0113,4.98e-06,0.4106,0.6288,0.1531,1.279,1594.8,35.23GB,0.0465,0.1614,0.5500 +baseline,17:46:00.593,7970,0.0136,0.0165,4.95e-06,0.4334,0.5888,0.1098,1.432,1418.0,35.23GB,0.0147,0.0870,0.2455 +baseline,17:46:09.269,7980,0.0055,0.0141,4.93e-06,0.5184,0.6674,0.2002,1.153,1752.3,35.23GB,0.0148,0.0837,0.9289 +baseline,17:46:16.299,7990,0.0026,0.0083,4.91e-06,0.3703,0.5817,0.1213,1.423,1412.9,35.23GB,0.0056,0.0477,0.1387 +baseline,17:46:24.193,8000,0.0079,0.0100,4.88e-06,0.3952,0.6392,0.1502,1.267,1578.6,35.23GB,0.0050,0.0713,0.3033 +baseline,17:48:27.357,8010,0.0102,0.0121,4.86e-06,0.4165,0.6027,11.7137,0.081,24509.4,35.23GB,0.0136,0.1238,0.4087 +baseline,17:48:35.151,8020,0.0125,0.0097,4.84e-06,0.4313,0.6099,0.1695,1.283,1543.1,35.23GB,0.0097,0.0986,0.4674 +baseline,17:48:43.407,8030,0.0029,0.0104,4.82e-06,0.4488,0.6394,0.1863,1.211,1626.3,35.23GB,0.0067,0.0793,0.2458 +baseline,17:48:50.942,8040,0.0034,0.0103,4.79e-06,0.3759,0.6014,0.1520,1.327,1476.6,35.23GB,0.0066,0.0817,0.2259 +baseline,17:48:58.510,8050,0.0031,0.0082,4.77e-06,0.4363,0.6268,0.1300,1.322,1475.5,35.23GB,0.0040,0.0435,0.1971 +baseline,17:49:05.816,8060,0.0083,0.0085,4.75e-06,0.3887,0.5945,0.1360,1.369,1417.1,35.23GB,0.0070,0.0606,0.2871 +baseline,17:49:13.945,8070,0.0221,0.0100,4.73e-06,0.4972,0.6541,0.1589,1.230,1568.8,35.23GB,0.0142,0.1225,0.4643 +baseline,17:49:22.395,8080,0.0072,0.0098,4.70e-06,0.4631,0.6634,0.1816,1.184,1622.0,35.23GB,0.0193,0.1459,0.4024 +baseline,17:49:29.853,8090,0.0145,0.0088,4.68e-06,0.4512,0.5967,0.1492,1.341,1424.4,35.23GB,0.0090,0.0759,0.7688 +baseline,17:49:38.032,8100,0.0082,0.0068,4.66e-06,0.3823,0.6563,0.1616,1.223,1553.8,35.23GB,0.0255,0.1928,0.4736 +baseline,17:49:46.427,8110,0.0069,0.0072,4.64e-06,0.4429,0.6421,0.1973,1.191,1586.3,35.23GB,0.0080,0.0674,0.4659 +baseline,17:49:55.321,8120,0.0031,0.0098,4.62e-06,0.4118,0.7092,0.1801,1.125,1671.8,35.23GB,0.0055,0.0588,0.2575 +baseline,17:50:04.640,8130,0.0085,0.0153,4.60e-06,0.4983,0.7012,0.2308,1.073,1742.5,35.23GB,0.0470,0.2262,0.7000 +baseline,17:50:14.084,8140,0.0016,0.0135,4.57e-06,0.4627,0.7157,0.2286,1.059,1756.3,35.23GB,0.0033,0.0450,0.1971 +baseline,17:50:23.650,8150,0.0112,0.0096,4.55e-06,0.3240,0.7278,0.2288,1.045,1769.5,35.23GB,0.0057,0.0612,0.2329 +baseline,17:50:33.032,8160,0.0059,0.0068,4.53e-06,0.3886,0.7095,0.2287,1.066,1726.1,35.23GB,0.0211,0.1866,0.4990 +baseline,17:50:41.441,8170,0.0058,0.0076,4.51e-06,0.4063,0.6732,0.1677,1.189,1538.7,35.23GB,0.0037,0.0468,0.1919 +baseline,17:50:48.916,8180,0.0235,0.0123,4.49e-06,0.5527,0.5990,0.1485,1.338,1360.3,35.23GB,0.0256,0.1369,0.7313 +baseline,17:50:56.989,8190,0.0036,0.0079,4.47e-06,0.4633,0.6362,0.1711,1.239,1461.0,35.23GB,0.0051,0.0728,0.2923 +baseline,17:51:05.998,8200,0.0108,0.0134,4.45e-06,0.4201,0.7156,0.1853,1.110,1621.5,35.23GB,0.0070,0.0781,0.3648 +baseline,17:51:14.667,8210,0.0038,0.0093,4.43e-06,0.3941,0.6610,0.2059,1.154,1551.6,35.23GB,0.0063,0.0798,0.2780 +baseline,17:51:22.863,8220,0.0155,0.0086,4.40e-06,0.4138,0.6614,0.1581,1.220,1458.7,35.23GB,0.0182,0.1700,0.7446 +baseline,17:51:30.367,8230,0.0029,0.0082,4.38e-06,0.4715,0.5809,0.1695,1.334,1326.9,35.23GB,0.0078,0.0821,0.2039 +baseline,17:51:38.743,8240,0.0034,0.0080,4.36e-06,0.3656,0.6493,0.1883,1.194,1474.1,35.23GB,0.0085,0.0792,0.2827 +baseline,17:51:45.745,8250,0.0059,0.0172,4.34e-06,0.3798,0.5769,0.1233,1.428,1225.2,35.23GB,0.0138,0.1039,0.4070 +baseline,17:51:53.050,8260,0.0035,0.0105,4.32e-06,0.4020,0.5813,0.1491,1.369,1270.8,35.23GB,0.0106,0.1488,0.3450 +baseline,17:52:00.525,8270,0.0042,0.0126,4.30e-06,0.4376,0.6079,0.1396,1.339,1291.7,35.23GB,0.0152,0.1014,0.5045 +baseline,17:52:07.216,8280,0.0022,0.0074,4.28e-06,0.3715,0.5385,0.1306,1.495,1150.6,35.23GB,0.0038,0.0493,0.2099 +baseline,17:52:15.375,8290,0.0017,0.0048,4.26e-06,0.3493,0.6632,0.1527,1.226,1395.0,35.23GB,0.0067,0.0534,0.1975 +baseline,17:52:23.201,8300,0.0602,0.0148,4.24e-06,0.5339,0.6066,0.1760,1.278,1330.3,35.23GB,0.0290,0.1829,0.5369 +baseline,17:52:30.808,8310,0.0075,0.0147,4.22e-06,0.3895,0.6169,0.1438,1.316,1284.5,35.23GB,0.0064,0.0709,0.2301 +baseline,17:52:37.669,8320,0.0215,0.0111,4.20e-06,0.3588,0.5643,0.1218,1.458,1152.5,35.23GB,0.0117,0.0924,0.5836 +baseline,17:52:45.045,8330,0.0061,0.0092,4.18e-06,0.4515,0.5811,0.1565,1.356,1231.5,35.23GB,0.0051,0.0656,0.2476 +baseline,17:52:52.526,8340,0.0078,0.0091,4.16e-06,0.4806,0.6043,0.1438,1.337,1241.7,35.23GB,0.0117,0.1604,0.4367 +baseline,17:53:00.376,8350,0.0101,0.0107,4.14e-06,0.3855,0.6244,0.1606,1.274,1295.1,35.23GB,0.0075,0.0876,0.2790 +baseline,17:53:06.983,8360,0.0068,0.0096,4.12e-06,0.4441,0.5497,0.1110,1.514,1083.3,35.23GB,0.0054,0.0704,0.2088 +baseline,17:53:14.490,8370,0.0049,0.0114,4.11e-06,0.3999,0.5911,0.1596,1.332,1223.5,35.23GB,0.0085,0.0859,0.3712 +baseline,17:53:21.645,8380,0.0053,0.0101,4.09e-06,0.3796,0.5895,0.1260,1.398,1158.9,35.23GB,0.0134,0.1030,0.2899 +baseline,17:53:29.286,8390,0.0012,0.0136,4.07e-06,0.5205,0.6023,0.1620,1.309,1230.2,35.23GB,0.0213,0.1226,0.6062 +baseline,17:53:36.832,8400,0.0021,0.0093,4.05e-06,0.4245,0.6133,0.1412,1.326,1207.1,35.23GB,0.0065,0.0832,0.4161 +baseline,17:53:44.691,8410,0.0029,0.0094,4.03e-06,0.4027,0.6320,0.1539,1.273,1249.4,35.23GB,0.0135,0.1045,0.3604 +baseline,17:53:53.592,8420,0.0062,0.0106,4.01e-06,0.4336,0.6853,0.2049,1.124,1406.2,35.23GB,0.0059,0.0606,0.3500 +baseline,17:54:02.019,8430,0.0547,0.0158,3.99e-06,0.4875,0.6620,0.1807,1.187,1322.9,35.23GB,0.0278,0.1624,0.4405 +baseline,17:54:08.873,8440,0.0040,0.0117,3.97e-06,0.3852,0.5500,0.1354,1.459,1069.1,35.23GB,0.0055,0.0394,0.4089 +baseline,17:54:16.773,8450,0.0268,0.0137,3.96e-06,0.4530,0.6235,0.1665,1.266,1224.3,35.23GB,0.0169,0.1931,0.4687 +baseline,17:54:24.331,8460,0.0141,0.0122,3.94e-06,0.3932,0.5777,0.1781,1.323,1163.8,35.23GB,0.0191,0.1244,0.5898 +baseline,17:54:32.432,8470,0.0029,0.0105,3.92e-06,0.4605,0.6310,0.1791,1.235,1239.2,35.23GB,0.0075,0.0854,0.3956 +baseline,17:54:41.021,8480,0.0025,0.0082,3.90e-06,0.4560,0.6600,0.1989,1.164,1305.4,35.23GB,0.0095,0.0887,0.4670 +baseline,17:54:49.487,8490,0.0017,0.0082,3.88e-06,0.4296,0.6420,0.2045,1.181,1278.1,35.23GB,0.0052,0.0706,0.3212 +baseline,17:54:55.393,8500,0.0049,0.0079,3.87e-06,0.3953,0.5193,0.0714,1.693,885.9,35.23GB,0.0169,0.1439,0.4403 +baseline,17:55:01.380,8510,0.0021,0.0119,3.85e-06,0.4711,0.5129,0.0858,1.670,892.0,35.23GB,0.0167,0.1106,0.3536 +baseline,17:55:08.247,8520,0.0063,0.0108,3.83e-06,0.4256,0.5444,0.1424,1.456,1016.1,35.23GB,0.0072,0.0639,0.1994 +baseline,17:55:15.228,8530,0.0017,0.0150,3.81e-06,0.4462,0.5567,0.1414,1.433,1026.1,35.23GB,0.0119,0.1093,0.2995 +baseline,17:55:21.919,8540,0.0045,0.0102,3.80e-06,0.3981,0.5402,0.1289,1.495,976.8,35.23GB,0.0447,0.1545,0.4847 +baseline,17:55:29.315,8550,0.0120,0.0073,3.78e-06,0.3784,0.6051,0.1345,1.352,1072.2,35.23GB,0.0095,0.1076,0.4817 +baseline,17:55:37.036,8560,0.0076,0.0083,3.76e-06,0.3981,0.6000,0.1721,1.295,1111.6,35.23GB,0.0152,0.1174,0.3488 +baseline,17:55:46.351,8570,0.0014,0.0094,3.74e-06,0.3828,0.6894,0.2421,1.074,1331.9,35.23GB,0.0054,0.0747,0.3021 +baseline,17:55:52.917,8580,0.0049,0.0061,3.73e-06,0.4402,0.5595,0.0971,1.523,932.2,35.23GB,0.0071,0.1070,0.4450 +baseline,17:56:00.454,8590,0.0017,0.0087,3.71e-06,0.3867,0.5841,0.1696,1.327,1062.6,35.23GB,0.0068,0.0884,0.2983 +baseline,17:56:07.476,8600,0.0036,0.0083,3.69e-06,0.4321,0.5791,0.1232,1.424,983.0,35.23GB,0.0070,0.0848,0.3857 +baseline,17:56:15.224,8610,0.0020,0.0079,3.68e-06,0.4750,0.6075,0.1673,1.291,1076.8,35.23GB,0.0133,0.0996,0.4741 +baseline,17:56:22.697,8620,0.0052,0.0085,3.66e-06,0.4850,0.5985,0.1488,1.338,1031.1,35.23GB,0.0242,0.1433,0.5158 +baseline,17:56:31.336,8630,0.0059,0.0066,3.64e-06,0.3594,0.6718,0.1921,1.158,1183.4,35.23GB,0.0091,0.0884,0.2790 +baseline,17:56:39.542,8640,0.0038,0.0091,3.63e-06,0.4332,0.6347,0.1859,1.219,1115.9,35.23GB,0.0181,0.1359,0.3749 +baseline,17:56:47.615,8650,0.0101,0.0141,3.61e-06,0.4733,0.6579,0.1495,1.239,1089.7,35.23GB,0.0211,0.1012,0.3194 +baseline,17:56:55.926,8660,0.0103,0.0119,3.60e-06,0.4133,0.6238,0.2073,1.203,1113.5,35.23GB,0.0104,0.0855,0.3258 +baseline,17:57:03.363,8670,0.0063,0.0092,3.58e-06,0.4443,0.5854,0.1583,1.345,989.0,35.23GB,0.0174,0.1103,0.4008 +baseline,17:57:11.646,8680,0.0016,0.0076,3.56e-06,0.4742,0.6599,0.1685,1.207,1093.2,35.23GB,0.0079,0.1418,0.5365 +baseline,17:57:21.635,8690,0.0073,0.0089,3.55e-06,0.4296,0.7507,0.2482,1.001,1308.4,35.23GB,0.0148,0.1430,0.4491 +baseline,17:57:29.664,8700,0.0639,0.0145,3.53e-06,0.4556,0.6220,0.1809,1.246,1043.7,35.23GB,0.0321,0.1592,0.5489 +baseline,17:57:37.123,8710,0.0047,0.0157,3.52e-06,0.4417,0.5886,0.1573,1.341,962.1,35.23GB,0.0045,0.0479,0.2445 +baseline,17:57:43.770,8720,0.0038,0.0100,3.50e-06,0.5380,0.5422,0.1226,1.505,850.7,35.23GB,0.0046,0.0711,0.1363 +baseline,17:57:50.678,8730,0.0027,0.0089,3.49e-06,0.4761,0.5671,0.1238,1.448,877.2,35.23GB,0.0207,0.1785,0.4409 +baseline,17:57:57.627,8740,0.0130,0.0088,3.47e-06,0.3790,0.5630,0.1318,1.439,875.3,35.23GB,0.0121,0.0943,0.3007 +baseline,17:58:05.419,8750,0.0051,0.0117,3.46e-06,0.4530,0.6174,0.1618,1.284,973.8,35.23GB,0.0109,0.0898,0.3252 +baseline,17:58:12.498,8760,0.0076,0.0106,3.44e-06,0.4014,0.5741,0.1337,1.413,877.6,35.23GB,0.0103,0.0824,0.5847 +baseline,17:58:19.538,8770,0.0029,0.0096,3.43e-06,0.4070,0.5581,0.1460,1.421,865.8,35.23GB,0.0098,0.0813,0.5001 +baseline,17:58:27.434,8780,0.0077,0.0106,3.41e-06,0.4683,0.6129,0.1767,1.267,963.2,35.23GB,0.0050,0.0646,0.3111 +baseline,17:58:34.371,8790,0.0134,0.0171,3.40e-06,0.6204,0.5645,0.1292,1.442,839.2,35.23GB,0.0095,0.1090,0.3260 +baseline,17:58:40.094,8800,0.0067,0.0148,3.38e-06,0.4798,0.4991,0.0733,1.748,686.6,35.23GB,0.0071,0.1206,0.8084 +baseline,17:58:47.143,8810,0.0156,0.0122,3.37e-06,0.4121,0.5857,0.1192,1.419,838.8,35.23GB,0.0198,0.1863,0.5994 +baseline,17:58:53.864,8820,0.0215,0.0144,3.35e-06,0.4744,0.5532,0.1188,1.488,792.9,35.23GB,0.0400,0.2137,0.6823 +baseline,17:59:01.377,8830,0.0046,0.0080,3.34e-06,0.3453,0.6115,0.1399,1.331,879.0,35.23GB,0.0050,0.0538,0.3280 +baseline,17:59:08.339,8840,0.0043,0.0092,3.33e-06,0.4168,0.5558,0.1405,1.437,807.5,35.23GB,0.0087,0.0841,0.2843 +baseline,17:59:15.781,8850,0.0074,0.0081,3.31e-06,0.4923,0.5989,0.1453,1.344,855.7,35.23GB,0.0097,0.0854,0.4180 +baseline,17:59:23.419,8860,0.0021,0.0062,3.30e-06,0.3872,0.6135,0.1503,1.309,870.6,35.23GB,0.0136,0.1091,0.4307 +baseline,17:59:30.288,8870,0.0081,0.0079,3.28e-06,0.3217,0.5625,0.1244,1.456,776.1,35.23GB,0.0065,0.0683,0.2354 +baseline,17:59:37.231,8880,0.0036,0.0095,3.27e-06,0.3516,0.5630,0.1313,1.441,777.4,35.23GB,0.0046,0.0875,0.3212 +baseline,17:59:43.798,8890,0.0093,0.0084,3.26e-06,0.4889,0.5499,0.1068,1.523,728.8,35.23GB,0.0054,0.0658,0.3863 +baseline,17:59:50.764,8900,0.0050,0.0088,3.24e-06,0.4508,0.5779,0.1187,1.436,766.1,35.23GB,0.0370,0.1892,0.6850 +baseline,17:59:58.434,8910,0.0094,0.0083,3.23e-06,0.4252,0.6182,0.1487,1.304,835.9,35.23GB,0.0150,0.1243,0.3996 +baseline,18:00:05.634,8920,0.0019,0.0049,3.22e-06,0.3391,0.5696,0.1504,1.389,777.5,35.23GB,0.0138,0.0863,0.2722 +baseline,18:00:14.039,8930,0.0072,0.0079,3.20e-06,0.3912,0.6597,0.1809,1.190,899.3,35.23GB,0.0093,0.0873,0.3618 +baseline,18:00:20.908,8940,0.0023,0.0058,3.19e-06,0.3753,0.5661,0.1208,1.456,727.9,35.23GB,0.0054,0.0537,0.2648 +baseline,18:00:29.465,8950,0.0120,0.0077,3.18e-06,0.7654,0.6619,0.1938,1.169,898.4,35.23GB,0.0363,0.2035,0.6761 +baseline,18:00:37.295,8960,0.0046,0.0080,3.17e-06,0.4122,0.6201,0.1629,1.278,813.8,35.23GB,0.0054,0.0645,0.4155 +baseline,18:00:45.585,8970,0.0017,0.0066,3.15e-06,0.4100,0.6464,0.1827,1.206,853.9,35.23GB,0.0032,0.0408,0.1715 +baseline,18:00:53.434,8980,0.0012,0.0068,3.14e-06,0.4652,0.6313,0.1535,1.274,800.4,35.23GB,0.0120,0.0925,0.8640 +baseline,18:01:00.588,8990,0.0038,0.0062,3.13e-06,0.3891,0.5801,0.1353,1.398,722.5,35.23GB,0.0102,0.0719,0.7492 +baseline,18:01:07.182,9000,0.0034,0.0100,3.12e-06,0.3807,0.5450,0.1145,1.517,659.4,35.23GB,0.0204,0.1373,0.3867 +baseline,18:03:07.655,9010,0.0081,0.0081,3.10e-06,0.3965,0.6696,11.3777,0.083,11926.8,35.23GB,0.0046,0.0550,0.4289 +baseline,18:03:15.319,9020,0.0030,0.0122,3.09e-06,0.4253,0.6061,0.1603,1.305,750.9,35.23GB,0.0035,0.0455,0.1945 +baseline,18:03:22.701,9030,0.0043,0.0110,3.08e-06,0.3637,0.5897,0.1485,1.355,715.9,35.23GB,0.0411,0.1424,0.5603 +baseline,18:03:30.451,9040,0.0073,0.0077,3.07e-06,0.3098,0.6047,0.1702,1.290,743.9,35.23GB,0.0059,0.0636,0.2207 +baseline,18:03:37.749,9050,0.0052,0.0076,3.06e-06,0.3544,0.5741,0.1557,1.370,693.3,35.23GB,0.0036,0.0610,0.2522 +baseline,18:03:45.785,9060,0.0214,0.0121,3.05e-06,0.4264,0.6547,0.1489,1.245,755.3,35.23GB,0.0127,0.0886,0.3299 +baseline,18:03:54.128,9070,0.0120,0.0108,3.03e-06,0.4611,0.6718,0.1624,1.199,775.7,35.23GB,0.0105,0.0733,0.3870 +baseline,18:04:01.938,9080,0.0036,0.0136,3.02e-06,0.3998,0.6327,0.1483,1.280,718.5,35.23GB,0.0041,0.0655,0.2298 +baseline,18:04:09.361,9090,0.0036,0.0097,3.01e-06,0.3718,0.6110,0.1314,1.347,675.4,35.23GB,0.0099,0.0983,0.3056 +baseline,18:04:17.893,9100,0.0089,0.0066,3.00e-06,0.3828,0.6590,0.1941,1.172,767.7,35.23GB,0.0508,0.1685,0.5391 +baseline,18:04:26.000,9110,0.0044,0.0043,2.99e-06,0.3458,0.6602,0.1505,1.234,721.4,35.23GB,0.0096,0.0955,0.4496 +baseline,18:04:33.121,9120,0.0012,0.0153,2.98e-06,0.4674,0.5860,0.1260,1.405,626.5,35.23GB,0.0263,0.1273,0.4751 +baseline,18:04:41.964,9130,0.0153,0.0160,2.97e-06,0.5309,0.6556,0.2288,1.131,769.3,35.23GB,0.0184,0.0936,1.3522 +baseline,18:04:51.323,9140,0.0159,0.0120,2.96e-06,0.4003,0.6878,0.2481,1.069,804.8,35.23GB,0.0255,0.1681,0.5326 +baseline,18:04:59.011,9150,0.0033,0.0092,2.95e-06,0.3861,0.6180,0.1509,1.301,653.4,35.23GB,0.0078,0.0626,0.1623 +baseline,18:05:07.222,9160,0.0074,0.0083,2.94e-06,0.4134,0.6674,0.1537,1.218,689.6,35.23GB,0.0035,0.0428,0.2705 +baseline,18:05:14.842,9170,0.0064,0.0083,2.93e-06,0.4466,0.6171,0.1448,1.313,632.3,35.23GB,0.0068,0.0756,0.2059 +baseline,18:05:22.858,9180,0.0103,0.0098,2.92e-06,0.4200,0.6198,0.1818,1.248,657.2,35.23GB,0.0106,0.0857,0.4316 +baseline,18:05:30.545,9190,0.0128,0.0090,2.91e-06,0.3965,0.6213,0.1474,1.301,622.6,35.23GB,0.0117,0.0810,0.3363 +baseline,18:05:38.864,9200,0.0019,0.0061,2.90e-06,0.3647,0.6438,0.1881,1.202,665.4,35.23GB,0.0056,0.0578,0.2340 +baseline,18:05:47.023,9210,0.0070,0.0051,2.89e-06,0.4857,0.6336,0.1823,1.226,644.5,35.23GB,0.0094,0.0908,0.3534 +baseline,18:05:55.096,9220,0.0048,0.0073,2.88e-06,0.4120,0.6303,0.1770,1.239,629.6,35.23GB,0.0030,0.0481,0.1851 +baseline,18:06:03.554,9230,0.0054,0.0133,2.87e-06,0.3829,0.6192,0.2266,1.182,651.2,35.23GB,0.0095,0.0789,0.3080 +baseline,18:06:11.241,9240,0.0057,0.0093,2.86e-06,0.3647,0.6375,0.1312,1.301,584.1,35.23GB,0.0086,0.0648,0.3614 +baseline,18:06:18.440,9250,0.0038,0.0083,2.85e-06,0.3417,0.5988,0.1211,1.389,539.9,35.23GB,0.0045,0.0506,0.2800 +baseline,18:06:26.550,9260,0.0058,0.0112,2.84e-06,0.4050,0.6352,0.1758,1.233,600.1,35.23GB,0.0103,0.0901,0.6081 +baseline,18:06:33.490,9270,0.0086,0.0094,2.83e-06,0.4630,0.5735,0.1205,1.441,506.5,35.23GB,0.0059,0.0560,0.2782 +baseline,18:06:40.104,9280,0.0065,0.0081,2.82e-06,0.4231,0.5601,0.1013,1.512,476.1,35.23GB,0.0143,0.1096,0.4456 +baseline,18:06:47.627,9290,0.0028,0.0070,2.81e-06,0.3680,0.6147,0.1376,1.330,534.0,35.23GB,0.0037,0.0430,0.1828 +baseline,18:06:54.211,9300,0.0079,0.0132,2.80e-06,0.4053,0.5453,0.1131,1.519,460.8,35.23GB,0.0069,0.0858,0.3299 +baseline,18:07:00.951,9310,0.0088,0.0099,2.80e-06,0.5043,0.5613,0.1127,1.484,465.0,35.23GB,0.0224,0.1585,0.4269 +baseline,18:07:08.798,9320,0.0029,0.0112,2.79e-06,0.4975,0.6176,0.1671,1.274,533.6,35.23GB,0.0231,0.1905,0.5606 +baseline,18:07:16.493,9330,0.0086,0.0129,2.78e-06,0.4142,0.5877,0.1817,1.300,515.4,35.23GB,0.0320,0.1540,0.5304 +baseline,18:07:23.826,9340,0.0049,0.0087,2.77e-06,0.4020,0.5972,0.1362,1.364,484.0,35.23GB,0.0053,0.0616,0.2974 +baseline,18:07:30.342,9350,0.0103,0.0075,2.76e-06,0.3554,0.5443,0.1073,1.535,423.5,35.23GB,0.0092,0.0857,0.4649 +baseline,18:07:37.144,9360,0.0076,0.0078,2.76e-06,0.3848,0.5562,0.1240,1.470,435.2,35.23GB,0.0091,0.0708,0.2960 +baseline,18:07:44.442,9370,0.0060,0.0072,2.75e-06,0.3792,0.5878,0.1420,1.371,459.7,35.23GB,0.0040,0.0553,0.2013 +baseline,18:07:51.129,9380,0.0016,0.0056,2.74e-06,0.4211,0.5598,0.1090,1.496,414.5,35.23GB,0.0036,0.0479,0.1158 +baseline,18:07:58.830,9390,0.0237,0.0079,2.73e-06,0.4958,0.6069,0.1632,1.299,469.7,35.23GB,0.0201,0.1336,0.7281 +baseline,18:08:06.062,9400,0.0047,0.0112,2.72e-06,0.3745,0.5850,0.1382,1.383,433.9,35.23GB,0.0057,0.0685,0.2892 +baseline,18:08:13.319,9410,0.0108,0.0082,2.72e-06,0.3748,0.5859,0.1398,1.378,428.1,35.23GB,0.0221,0.1288,0.4823 +baseline,18:08:20.327,9420,0.0054,0.0065,2.71e-06,0.4498,0.5733,0.1276,1.427,406.4,35.23GB,0.0046,0.0624,0.3575 +baseline,18:08:27.542,9430,0.0095,0.0079,2.70e-06,0.4312,0.5818,0.1397,1.386,411.2,35.23GB,0.0192,0.1227,0.4525 +baseline,18:08:34.971,9440,0.0043,0.0093,2.70e-06,0.4106,0.6077,0.1352,1.346,416.0,35.23GB,0.0050,0.0594,0.1800 +baseline,18:08:42.038,9450,0.0023,0.0077,2.69e-06,0.3921,0.5749,0.1317,1.415,388.6,35.23GB,0.0058,0.0665,0.2440 +baseline,18:08:48.776,9460,0.0206,0.0109,2.68e-06,0.4454,0.5558,0.1180,1.484,363.8,35.23GB,0.0140,0.0911,0.4005 +baseline,18:08:55.497,9470,0.0086,0.0078,2.68e-06,0.4208,0.5695,0.1026,1.488,356.2,35.23GB,0.0082,0.0785,0.2694 +baseline,18:09:02.485,9480,0.0198,0.0084,2.67e-06,0.4853,0.5675,0.1314,1.431,363.3,35.23GB,0.0173,0.1112,0.5434 +baseline,18:09:09.136,9490,0.0238,0.0098,2.66e-06,0.4431,0.5574,0.1076,1.504,339.0,35.23GB,0.0075,0.1141,0.7838 +baseline,18:09:15.746,9500,0.0029,0.0085,2.66e-06,0.4148,0.5516,0.1095,1.513,330.5,35.23GB,0.0036,0.0456,0.2314 +baseline,18:09:21.556,9510,0.0327,0.0117,2.65e-06,0.5235,0.5139,0.0671,1.722,284.6,35.23GB,0.0241,0.1341,0.4542 +baseline,18:09:27.342,9520,0.0064,0.0093,2.64e-06,0.3688,0.5155,0.0631,1.728,277.7,35.23GB,0.0046,0.0572,0.2390 +baseline,18:09:33.275,9530,0.0039,0.0078,2.64e-06,0.3187,0.5103,0.0830,1.686,278.8,35.23GB,0.0035,0.0433,0.1850 +baseline,18:09:39.500,9540,0.0010,0.0105,2.63e-06,0.4398,0.5464,0.0761,1.607,286.3,35.23GB,0.0176,0.1120,0.5587 +baseline,18:09:46.051,9550,0.0036,0.0086,2.63e-06,0.4349,0.5653,0.0898,1.527,294.7,35.23GB,0.0087,0.0939,0.3151 +baseline,18:09:52.294,9560,0.0082,0.0093,2.62e-06,0.4303,0.5515,0.0727,1.602,274.6,35.23GB,0.0284,0.2408,0.6269 +baseline,18:09:59.154,9570,0.0028,0.0086,2.62e-06,0.4051,0.5735,0.1125,1.458,294.9,35.23GB,0.0182,0.1235,0.4193 +baseline,18:10:05.574,9580,0.0023,0.0069,2.61e-06,0.3893,0.5437,0.0983,1.558,269.6,35.23GB,0.0314,0.1238,0.4530 +baseline,18:10:12.172,9590,0.0056,0.0068,2.61e-06,0.4466,0.5513,0.1084,1.516,270.5,35.23GB,0.0192,0.1177,0.5837 +baseline,18:10:18.977,9600,0.0088,0.0079,2.60e-06,0.3636,0.5445,0.1361,1.470,272.2,35.23GB,0.0092,0.0855,0.3968 +baseline,18:10:25.338,9610,0.0068,0.0066,2.60e-06,0.3476,0.5318,0.1043,1.572,248.0,35.23GB,0.0058,0.0597,0.2740 +baseline,18:10:32.408,9620,0.0081,0.0073,2.59e-06,0.3827,0.5735,0.1334,1.415,268.6,35.23GB,0.0155,0.0990,0.4694 +baseline,18:10:39.075,9630,0.0014,0.0073,2.59e-06,0.3641,0.5474,0.1193,1.500,246.6,35.23GB,0.0030,0.0404,0.2206 +baseline,18:10:45.402,9640,0.0053,0.0094,2.58e-06,0.4816,0.5446,0.0881,1.581,227.7,35.23GB,0.0226,0.1272,0.4744 +baseline,18:10:51.842,9650,0.0054,0.0078,2.58e-06,0.4123,0.5401,0.1040,1.553,225.4,35.23GB,0.0050,0.0687,0.2560 +baseline,18:10:58.647,9660,0.0179,0.0083,2.57e-06,0.4544,0.5704,0.1100,1.470,231.3,35.23GB,0.0322,0.1862,0.6386 +baseline,18:11:05.705,9670,0.0109,0.0151,2.57e-06,0.4994,0.5935,0.1123,1.417,232.9,35.23GB,0.0105,0.0779,0.3020 +baseline,18:11:11.696,9680,0.0014,0.0085,2.57e-06,0.3112,0.5183,0.0808,1.670,191.7,35.23GB,0.0049,0.0693,0.1947 +baseline,18:11:18.111,9690,0.0036,0.0191,2.56e-06,0.4620,0.5468,0.0947,1.559,198.8,35.23GB,0.0062,0.0685,0.4864 +baseline,18:11:26.341,9700,0.0035,0.0105,2.56e-06,0.3947,0.6285,0.1945,1.215,246.9,35.23GB,0.0074,0.0845,0.3729 +baseline,18:11:33.058,9710,0.0047,0.0103,2.55e-06,0.4748,0.5552,0.1165,1.489,194.8,35.23GB,0.0153,0.1502,0.3428 +baseline,18:11:40.314,9720,0.0023,0.0070,2.55e-06,0.3785,0.5964,0.1292,1.378,203.1,35.23GB,0.0476,0.1391,0.5027 +baseline,18:11:47.404,9730,0.0006,0.0077,2.55e-06,0.4150,0.5756,0.1334,1.411,191.4,35.23GB,0.0214,0.1021,0.4479 +baseline,18:11:55.258,9740,0.0020,0.0056,2.54e-06,0.4227,0.6422,0.1431,1.274,204.2,35.23GB,0.0182,0.0791,0.3250 +baseline,18:12:03.373,9750,0.0016,0.0056,2.54e-06,0.3245,0.6189,0.1926,1.232,202.8,35.23GB,0.0079,0.0670,0.1692 +baseline,18:12:10.421,9760,0.0016,0.0076,2.54e-06,0.4855,0.5864,0.1184,1.419,169.1,35.23GB,0.0372,0.1271,0.4865 +baseline,18:12:18.874,9770,0.0148,0.0131,2.53e-06,0.4726,0.6676,0.1777,1.183,194.4,35.23GB,0.0056,0.0886,0.2900 +baseline,18:12:26.810,9780,0.0067,0.0092,2.53e-06,0.4212,0.6204,0.1733,1.260,174.6,35.23GB,0.0240,0.1408,0.4093 +baseline,18:12:34.225,9790,0.0050,0.0085,2.53e-06,0.4916,0.6133,0.1282,1.349,155.7,35.23GB,0.0046,0.0619,0.2566 +baseline,18:12:40.711,9800,0.0011,0.0178,2.53e-06,0.3528,0.5512,0.0974,1.542,129.7,35.23GB,0.0072,0.0677,0.3213 +baseline,18:12:47.144,9810,0.0300,0.0134,2.52e-06,0.4502,0.5384,0.1049,1.555,122.2,35.23GB,0.0157,0.1134,0.6301 +baseline,18:12:54.235,9820,0.0150,0.0108,2.52e-06,0.4588,0.5785,0.1306,1.410,127.6,35.23GB,0.0065,0.0829,0.3710 +baseline,18:13:00.857,9830,0.0011,0.0100,2.52e-06,0.3867,0.5599,0.1024,1.510,112.6,35.23GB,0.0050,0.0431,0.1756 +baseline,18:13:08.348,9840,0.0068,0.0083,2.52e-06,0.4328,0.6042,0.1448,1.335,119.8,35.23GB,0.0117,0.1196,0.5125 +baseline,18:13:15.543,9850,0.0082,0.0093,2.51e-06,0.4160,0.5749,0.1447,1.390,107.9,35.23GB,0.0124,0.0980,0.3461 +baseline,18:13:24.221,9860,0.0117,0.0088,2.51e-06,0.3767,0.6528,0.2150,1.153,121.5,35.23GB,0.0128,0.0820,0.3272 +baseline,18:13:31.111,9870,0.0059,0.0070,2.51e-06,0.5194,0.5583,0.1308,1.451,89.6,35.23GB,0.0191,0.1788,0.6869 +baseline,18:13:38.970,9880,0.0041,0.0064,2.51e-06,0.4778,0.6064,0.1795,1.273,94.3,35.23GB,0.0447,0.2242,0.6269 +baseline,18:13:47.092,9890,0.0060,0.0082,2.51e-06,0.4531,0.6583,0.1540,1.231,89.3,35.23GB,0.0177,0.1205,0.4349 +baseline,18:13:54.553,9900,0.0202,0.0110,2.51e-06,0.4201,0.5837,0.1623,1.341,74.6,35.23GB,0.0083,0.0658,0.3654 +baseline,18:14:01.975,9910,0.0915,0.0169,2.51e-06,0.3870,0.5930,0.1493,1.347,66.8,35.23GB,0.0384,0.1925,0.4918 +baseline,18:14:07.801,9920,0.0012,0.0121,2.50e-06,0.4127,0.5111,0.0715,1.717,46.6,35.23GB,0.0183,0.1225,0.3161 +baseline,18:14:14.535,9930,0.0096,0.0115,2.50e-06,0.4252,0.5457,0.1277,1.485,47.1,35.23GB,0.0073,0.0860,0.3860 +baseline,18:14:21.422,9940,0.0050,0.0094,2.50e-06,0.5446,0.5607,0.1280,1.452,41.3,35.23GB,0.0175,0.1005,0.4694 +baseline,18:14:27.783,9950,0.0039,0.0118,2.50e-06,0.5098,0.5326,0.1035,1.572,31.8,35.23GB,0.0097,0.1392,0.9805 +baseline,18:14:33.566,9960,0.0366,0.0164,2.50e-06,0.4185,0.4790,0.0993,1.730,23.1,35.23GB,0.0357,0.1786,0.9393 +baseline,18:14:40.626,9970,0.0123,0.0115,2.50e-06,0.4304,0.5929,0.1131,1.417,21.2,35.23GB,0.0112,0.1085,0.3551 +baseline,18:14:47.083,9980,0.0029,0.0095,2.50e-06,0.3902,0.5540,0.0917,1.549,12.9,35.23GB,0.0022,0.0268,0.1216 +baseline,18:14:54.208,9990,0.0028,0.0070,2.50e-06,0.4925,0.5849,0.1276,1.404,7.1,35.23GB,0.0106,0.0706,0.6721 +baseline,18:15:00.659,10000,0.0141,0.0172,2.50e-06,0.4377,0.5241,0.1210,1.550,0.0,35.23GB,0.0125,0.1342,0.4184 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/comparison_2k_vs_10k.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/comparison_2k_vs_10k.csv new file mode 100644 index 0000000000000000000000000000000000000000..8f33d5a8b8bc620fed0f133d1df57105d7a804d4 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/comparison_2k_vs_10k.csv @@ -0,0 +1,5 @@ +model,run,val_1000_mean,val_2000_mean,val_5000_mean,val_10000_mean,runtime,peak_vram +baseline,2k,0.052885,0.035776,,,33:27,35.23GB +baseline,10k,0.06113,0.041595,0.027324,0.022345,2:13:40,35.23GB +parallel,2k,0.051214,0.03568,,,30:38,35.27GB +parallel,10k,0.059715,0.039947,0.02734,0.022168,2:20:51,35.27GB diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/parallel_train_full.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/parallel_train_full.csv new file mode 100644 index 0000000000000000000000000000000000000000..180560fdf05c52c92d724ab00095f58eac540dff --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/parallel_train_full.csv @@ -0,0 +1,1001 @@ +model,ts,step,loss,smoothed,lr,grad_norm,step_time,data_time,its,eta,mem,grad_action_in_proj_arms,grad_action_out_proj_arms,grad_arm_token_fuse,grad_shared_expert +parallel,18:44:40.107,10,0.4805,1.0720,2.74e-07,10.7935,0.6888,0.1360,1.207,8276.0,35.27GB,0.1025,1.4253,0.4146,3.0129 +parallel,18:44:45.893,20,0.8767,0.9940,7.73e-07,10.4887,0.5044,0.0741,1.729,5773.3,35.27GB,0.2246,1.6214,1.0997,5.8087 +parallel,18:44:51.768,30,0.7307,0.8917,1.27e-06,5.6603,0.5238,0.0637,1.702,5856.9,35.27GB,0.1250,1.4797,0.5097,3.7552 +parallel,18:44:57.779,40,0.6519,0.8042,1.77e-06,8.2050,0.5233,0.0779,1.664,5986.4,35.27GB,0.2982,1.6587,1.2377,8.1574 +parallel,18:45:03.714,50,0.6922,0.8089,2.27e-06,6.4187,0.5290,0.0644,1.685,5903.5,35.27GB,0.1692,1.6308,0.6495,4.8296 +parallel,18:45:09.116,60,0.5895,0.7402,2.77e-06,7.1647,0.4779,0.0623,1.851,5369.1,35.27GB,0.2519,1.8146,1.1504,6.6906 +parallel,18:45:14.851,70,0.7428,0.6959,3.27e-06,7.2884,0.5002,0.0733,1.744,5693.6,35.27GB,0.2240,1.2726,0.8724,7.0172 +parallel,18:45:21.005,80,1.1128,0.7049,3.77e-06,7.6562,0.5457,0.0697,1.625,6104.0,35.27GB,0.1138,1.4133,0.4264,3.4744 +parallel,18:45:26.626,90,0.6287,0.6233,4.27e-06,5.1438,0.4981,0.0640,1.779,5569.7,35.27GB,0.1093,1.0830,0.4441,3.9635 +parallel,18:45:32.656,100,0.4877,0.5308,4.77e-06,3.3763,0.5275,0.0755,1.659,5969.0,35.27GB,0.0819,0.8839,0.3777,2.9338 +parallel,18:45:38.639,110,0.5073,0.4906,5.26e-06,2.8861,0.5146,0.0837,1.672,5915.7,35.27GB,0.0793,0.9792,0.2841,3.0901 +parallel,18:45:44.466,120,0.4122,0.4434,5.76e-06,2.3112,0.5056,0.0771,1.717,5755.8,35.27GB,0.0551,0.8063,0.2794,1.7754 +parallel,18:45:50.645,130,0.4445,0.3811,6.26e-06,1.9656,0.5319,0.0861,1.619,6097.9,35.27GB,0.0510,0.7291,0.2641,1.7942 +parallel,18:45:56.988,140,0.2827,0.3585,6.76e-06,1.9102,0.5481,0.0862,1.577,6253.2,35.27GB,0.0445,0.6335,0.2075,1.8199 +parallel,18:46:03.405,150,0.2804,0.3386,7.26e-06,1.8551,0.5707,0.0710,1.559,6319.5,35.27GB,0.0352,0.5799,0.1715,1.2377 +parallel,18:46:10.632,160,0.4199,0.3182,7.76e-06,1.7483,0.5865,0.1362,1.384,7110.2,35.27GB,0.0501,0.5089,0.2482,1.9337 +parallel,18:46:17.134,170,0.2694,0.2991,8.26e-06,1.5418,0.5479,0.1023,1.538,6390.5,35.27GB,0.0355,0.4538,0.1818,1.1564 +parallel,18:46:24.982,180,0.1809,0.2764,8.76e-06,1.3395,0.6295,0.1553,1.274,7705.8,35.27GB,0.0264,0.4302,0.1363,1.0912 +parallel,18:46:31.983,190,0.2283,0.2421,9.26e-06,1.4411,0.5925,0.1076,1.429,6866.3,35.27GB,0.0290,0.6081,0.1490,1.3434 +parallel,18:46:38.936,200,0.1697,0.2323,9.76e-06,1.2837,0.5975,0.0978,1.438,6813.4,35.27GB,0.0230,0.4177,0.1172,0.8447 +parallel,18:46:45.589,210,0.1883,0.2153,1.03e-05,1.3707,0.5471,0.1182,1.503,6512.9,35.27GB,0.0327,0.5188,0.1715,1.1668 +parallel,18:46:54.137,220,0.1577,0.2246,1.08e-05,1.3574,0.6271,0.2276,1.170,8358.7,35.27GB,0.0265,0.4854,0.1391,0.9456 +parallel,18:47:01.825,230,0.1815,0.2034,1.13e-05,1.2783,0.6191,0.1497,1.301,7510.0,35.27GB,0.0292,0.4956,0.1434,1.5443 +parallel,18:47:09.026,240,0.1777,0.2079,1.18e-05,1.2456,0.5848,0.1354,1.389,7027.7,35.27GB,0.0219,0.3814,0.1237,1.0377 +parallel,18:47:14.933,250,0.2443,0.1889,1.23e-05,1.0934,0.5221,0.0686,1.693,5758.7,35.27GB,0.0215,0.3534,0.1080,0.9568 +parallel,18:47:21.550,260,0.2483,0.1864,1.27e-05,1.1566,0.5593,0.1023,1.512,6443.5,35.27GB,0.0188,0.3914,0.1008,1.0994 +parallel,18:47:27.889,270,0.1109,0.1804,1.32e-05,1.2577,0.5481,0.0858,1.578,6167.2,35.27GB,0.0167,0.3203,0.0888,1.2707 +parallel,18:47:35.112,280,0.1536,0.1808,1.37e-05,1.3578,0.6142,0.1081,1.385,7019.5,35.27GB,0.0205,0.3638,0.1106,1.3396 +parallel,18:47:42.242,290,0.1274,0.1666,1.42e-05,1.2477,0.5673,0.1457,1.403,6922.5,35.27GB,0.0164,0.3189,0.0898,0.8273 +parallel,18:47:48.418,300,0.0948,0.1357,1.47e-05,1.1277,0.5392,0.0784,1.619,5989.8,35.27GB,0.0171,0.3476,0.0950,0.9899 +parallel,18:47:54.818,310,0.2165,0.1388,1.52e-05,1.1620,0.5505,0.0894,1.563,6200.9,35.27GB,0.0161,0.3307,0.0834,1.0476 +parallel,18:48:01.298,320,0.1138,0.1313,1.57e-05,1.2267,0.5509,0.0971,1.544,6271.3,35.27GB,0.0178,0.3389,0.0957,0.9882 +parallel,18:48:08.365,330,0.2123,0.1445,1.62e-05,1.1404,0.5907,0.1160,1.415,6833.0,35.27GB,0.0166,0.2836,0.0893,1.0975 +parallel,18:48:16.888,340,0.0683,0.1303,1.67e-05,1.1280,0.6911,0.1613,1.173,8232.8,35.27GB,0.0206,0.3245,0.1104,1.1328 +parallel,18:48:23.844,350,0.1962,0.1477,1.72e-05,1.5232,0.6032,0.0924,1.438,6711.1,35.27GB,0.0222,0.3657,0.1264,1.1811 +parallel,18:48:29.670,360,0.1185,0.1241,1.77e-05,1.1892,0.5008,0.0817,1.717,5614.9,35.27GB,0.0195,0.2918,0.1083,1.4594 +parallel,18:48:36.039,370,0.1405,0.1273,1.82e-05,1.1998,0.5505,0.0864,1.570,6132.2,35.27GB,0.0169,0.3829,0.0851,1.3228 +parallel,18:48:43.972,380,0.2119,0.1320,1.87e-05,1.1990,0.6557,0.1377,1.261,7630.6,35.27GB,0.0150,0.3032,0.0806,1.3026 +parallel,18:48:50.076,390,0.1075,0.1189,1.92e-05,1.1378,0.5407,0.0697,1.639,5864.0,35.27GB,0.0169,0.3006,0.0849,0.9713 +parallel,18:48:57.716,400,0.1143,0.1171,1.97e-05,1.2517,0.6198,0.1442,1.309,7333.9,35.27GB,0.0150,0.3771,0.0825,1.0869 +parallel,18:49:03.986,410,0.1203,0.1242,2.02e-05,1.0436,0.5433,0.0837,1.595,6011.2,35.27GB,0.0157,0.2903,0.0875,0.7431 +parallel,18:49:10.575,420,0.0388,0.1004,2.07e-05,1.1345,0.5743,0.0846,1.518,6312.1,35.27GB,0.0169,0.2823,0.0947,1.2779 +parallel,18:49:17.441,430,0.1754,0.1102,2.12e-05,1.0300,0.5746,0.1122,1.457,6569.3,35.27GB,0.0217,0.3177,0.1126,0.8830 +parallel,18:49:25.449,440,0.0781,0.1014,2.17e-05,1.1011,0.6171,0.1835,1.249,7655.4,35.27GB,0.0126,0.2326,0.0664,1.0168 +parallel,18:49:33.513,450,0.1251,0.1115,2.22e-05,1.1712,0.6351,0.1713,1.240,7699.7,35.27GB,0.0188,0.3011,0.1039,1.0584 +parallel,18:49:41.420,460,0.0857,0.1071,2.27e-05,1.1753,0.6414,0.1493,1.265,7542.3,35.27GB,0.0202,0.3305,0.1039,1.0056 +parallel,18:49:47.933,470,0.0898,0.0918,2.32e-05,1.1283,0.5469,0.1044,1.536,6205.7,35.27GB,0.0182,0.3150,0.0922,0.9507 +parallel,18:49:55.108,480,0.0639,0.0836,2.37e-05,1.1015,0.5898,0.1278,1.394,6830.1,35.27GB,0.0203,0.3255,0.1000,0.7534 +parallel,18:50:02.411,490,0.0475,0.0803,2.42e-05,1.0980,0.5952,0.1351,1.370,6943.7,35.27GB,0.0177,0.2912,0.0925,1.1162 +parallel,18:50:09.532,500,0.1168,0.0801,2.47e-05,1.1037,0.6051,0.1071,1.405,6763.9,35.27GB,0.0187,0.4583,0.0979,1.6708 +parallel,18:50:16.953,510,0.0936,0.0873,2.50e-05,1.3092,0.5895,0.1526,1.348,7041.5,35.27GB,0.0212,0.2765,0.1144,1.3883 +parallel,18:50:23.308,520,0.0454,0.0859,2.50e-05,1.1606,0.5363,0.0992,1.574,6023.2,35.27GB,0.0203,0.2738,0.0996,1.1404 +parallel,18:50:30.425,530,0.1048,0.0868,2.50e-05,1.1428,0.6039,0.1078,1.405,6738.7,35.27GB,0.0199,0.2926,0.1091,1.1748 +parallel,18:50:36.321,540,0.1107,0.0893,2.50e-05,1.1189,0.5099,0.0798,1.696,5576.9,35.27GB,0.0273,0.3237,0.1389,1.4933 +parallel,18:50:43.371,550,0.0566,0.0868,2.50e-05,1.0232,0.5834,0.1216,1.419,6660.3,35.27GB,0.0165,0.2159,0.0871,0.8416 +parallel,18:50:49.704,560,0.0768,0.0923,2.50e-05,1.0516,0.5589,0.0744,1.579,5978.2,35.27GB,0.0163,0.2848,0.0872,0.8277 +parallel,18:50:55.331,570,0.0652,0.0858,2.50e-05,1.0102,0.5012,0.0614,1.778,5304.8,35.27GB,0.0206,0.2905,0.1115,1.0680 +parallel,18:51:01.404,580,0.0584,0.0786,2.50e-05,1.0292,0.5458,0.0615,1.647,5720.2,35.27GB,0.0165,0.3337,0.0902,1.3013 +parallel,18:51:08.266,590,0.0334,0.0551,2.50e-05,1.0571,0.5828,0.1034,1.457,6456.8,35.27GB,0.0169,0.2357,0.0893,0.7928 +parallel,18:51:15.405,600,0.0551,0.0691,2.50e-05,0.9723,0.6044,0.1095,1.401,6709.0,35.27GB,0.0162,0.2741,0.0897,0.8946 +parallel,18:51:22.384,610,0.0545,0.0601,2.50e-05,0.9028,0.5669,0.1311,1.433,6553.3,35.27GB,0.0115,0.2172,0.0593,0.9289 +parallel,18:51:29.679,620,0.0589,0.0636,2.50e-05,1.0178,0.5924,0.1371,1.371,6841.0,35.27GB,0.0192,0.2725,0.0997,1.0034 +parallel,18:51:37.753,630,0.0887,0.0714,2.50e-05,1.1195,0.6425,0.1650,1.239,7565.1,35.27GB,0.0189,0.3428,0.0963,1.2237 +parallel,18:51:44.592,640,0.0758,0.0734,2.50e-05,0.9464,0.5603,0.1236,1.463,6400.0,35.27GB,0.0184,0.2731,0.1008,0.7672 +parallel,18:51:51.968,650,0.0206,0.0676,2.50e-05,0.9359,0.6126,0.1250,1.356,6895.5,35.27GB,0.0165,0.2718,0.0861,0.9008 +parallel,18:51:58.804,660,0.0774,0.0749,2.50e-05,0.9756,0.5540,0.1296,1.463,6383.4,35.27GB,0.0213,0.2986,0.1215,0.9099 +parallel,18:52:06.580,670,0.1145,0.0802,2.50e-05,0.8724,0.6147,0.1629,1.286,7253.9,35.27GB,0.0228,0.3388,0.1217,1.0995 +parallel,18:52:15.901,680,0.0703,0.0735,2.50e-05,0.9855,0.7029,0.2293,1.073,8686.8,35.27GB,0.0231,0.2570,0.1321,1.0097 +parallel,18:52:25.089,690,0.0590,0.0723,2.50e-05,0.9168,0.6261,0.2927,1.089,8552.3,35.27GB,0.0290,0.2648,0.1577,0.6608 +parallel,18:52:32.739,700,0.0585,0.0748,2.50e-05,0.9291,0.6136,0.1514,1.307,7113.6,35.27GB,0.0164,0.2722,0.0916,0.9737 +parallel,18:52:40.627,710,0.1334,0.0787,2.50e-05,0.9285,0.6320,0.1567,1.268,7326.6,35.27GB,0.0278,0.2951,0.1474,0.7773 +parallel,18:52:48.506,720,0.1134,0.0875,2.50e-05,0.9376,0.6322,0.1557,1.269,7310.7,35.27GB,0.0272,0.2917,0.1501,0.8700 +parallel,18:52:55.415,730,0.0561,0.0708,2.50e-05,0.9052,0.5879,0.1030,1.448,6403.7,35.27GB,0.0182,0.2653,0.0967,0.6294 +parallel,18:53:02.374,740,0.0704,0.0788,2.50e-05,0.8533,0.5675,0.1284,1.437,6442.5,35.27GB,0.0175,0.2185,0.0945,0.6714 +parallel,18:53:10.085,750,0.0697,0.0677,2.50e-05,0.9536,0.6403,0.1308,1.297,7131.9,35.27GB,0.0170,0.2462,0.0917,0.6951 +parallel,18:53:18.853,760,0.0364,0.0646,2.50e-05,1.0298,0.6715,0.2053,1.141,8101.1,35.27GB,0.0171,0.3331,0.0946,1.1827 +parallel,18:53:26.125,770,0.0715,0.0567,2.50e-05,0.7826,0.5835,0.1436,1.375,6710.3,35.27GB,0.0177,0.2556,0.1019,0.7722 +parallel,18:53:33.443,780,0.0987,0.0559,2.50e-05,0.8506,0.5859,0.1459,1.367,6746.1,35.27GB,0.0235,0.2588,0.1237,0.6221 +parallel,18:53:41.105,790,0.0584,0.0519,2.50e-05,0.7316,0.6405,0.1257,1.305,7056.0,35.27GB,0.0113,0.1770,0.0559,0.6216 +parallel,18:53:49.028,800,0.0637,0.0502,2.49e-05,0.7659,0.6345,0.1578,1.262,7287.9,35.27GB,0.0224,0.2679,0.1218,0.9365 +parallel,18:53:57.752,810,0.0483,0.0634,2.49e-05,0.8540,0.6955,0.1770,1.146,8017.3,35.27GB,0.0133,0.1937,0.0660,1.0226 +parallel,18:54:05.608,820,0.0291,0.0621,2.49e-05,0.9391,0.6336,0.1519,1.273,7210.3,35.27GB,0.0135,0.1963,0.0655,0.7438 +parallel,18:54:14.080,830,0.0435,0.0740,2.49e-05,1.1114,0.6500,0.1973,1.180,7768.6,35.27GB,0.0212,0.3255,0.1131,1.8116 +parallel,18:54:21.923,840,0.0463,0.0690,2.49e-05,1.0292,0.6361,0.1482,1.275,7182.8,35.27GB,0.0225,0.2541,0.1206,0.6276 +parallel,18:54:28.890,850,0.0413,0.0655,2.49e-05,0.9186,0.5805,0.1162,1.436,6374.0,35.27GB,0.0165,0.2197,0.0827,0.7288 +parallel,18:54:37.252,860,0.0494,0.0656,2.49e-05,0.9884,0.6294,0.2069,1.196,7642.2,35.27GB,0.0130,0.2082,0.0677,0.9169 +parallel,18:54:45.797,870,0.0518,0.0702,2.49e-05,0.8424,0.6665,0.1880,1.170,7800.4,35.27GB,0.0150,0.2021,0.0803,0.9604 +parallel,18:54:54.160,880,0.0435,0.0549,2.49e-05,0.7868,0.6454,0.1908,1.196,7625.9,35.27GB,0.0135,0.1851,0.0597,0.7494 +parallel,18:55:01.938,890,0.1084,0.0599,2.49e-05,0.8285,0.6226,0.1552,1.286,7085.4,35.27GB,0.0135,0.1972,0.0728,1.0050 +parallel,18:55:09.889,900,0.0367,0.0537,2.49e-05,0.9003,0.6415,0.1535,1.258,7233.8,35.27GB,0.0170,0.2956,0.0914,1.0526 +parallel,18:55:17.579,910,0.0307,0.0583,2.49e-05,0.8706,0.5950,0.1741,1.300,6990.2,35.27GB,0.0152,0.2244,0.0787,0.7663 +parallel,18:55:25.250,920,0.0570,0.0544,2.49e-05,0.7684,0.6395,0.1276,1.304,6964.3,35.27GB,0.0150,0.1698,0.0817,0.7826 +parallel,18:55:32.507,930,0.0183,0.0451,2.49e-05,0.8152,0.5896,0.1360,1.378,6580.2,35.27GB,0.0248,0.2740,0.1393,0.5679 +parallel,18:55:39.884,940,0.0438,0.0469,2.49e-05,0.7024,0.6014,0.1363,1.356,6682.5,35.27GB,0.0160,0.2408,0.0887,0.5729 +parallel,18:55:47.309,950,0.0177,0.0386,2.49e-05,0.7109,0.6063,0.1362,1.347,6718.6,35.27GB,0.0212,0.2418,0.1093,0.5122 +parallel,18:55:53.953,960,0.0382,0.0446,2.49e-05,0.8889,0.5530,0.1114,1.505,6005.0,35.27GB,0.0176,0.2549,0.0934,0.8385 +parallel,18:56:01.011,970,0.0669,0.0498,2.49e-05,0.7250,0.5653,0.1403,1.417,6372.5,35.27GB,0.0136,0.2300,0.0712,0.9975 +parallel,18:56:08.827,980,0.0342,0.0570,2.49e-05,0.7291,0.6416,0.1400,1.280,7049.5,35.27GB,0.0173,0.1977,0.0949,0.6240 +parallel,18:56:15.925,990,0.0413,0.0488,2.49e-05,0.7397,0.5838,0.1261,1.409,6394.4,35.27GB,0.0130,0.2360,0.0705,0.7484 +parallel,18:56:22.847,1000,0.0246,0.0492,2.48e-05,0.9470,0.5836,0.1086,1.445,6229.0,35.27GB,0.0139,0.1631,0.0704,0.5049 +parallel,18:58:20.550,1010,0.0290,0.0429,2.48e-05,0.9173,0.5516,11.2188,0.085,105814.4,35.27GB,0.0176,0.1944,0.0948,0.7120 +parallel,18:58:27.458,1020,0.0388,0.0400,2.48e-05,0.8194,0.5832,0.1075,1.448,6201.9,35.27GB,0.0109,0.1844,0.0480,0.5752 +parallel,18:58:34.140,1030,0.0220,0.0446,2.48e-05,0.8237,0.5601,0.1080,1.497,5992.2,35.27GB,0.0175,0.1958,0.0912,0.7368 +parallel,18:58:40.298,1040,0.0793,0.0437,2.48e-05,0.8432,0.5297,0.0861,1.624,5516.7,35.27GB,0.0139,0.1995,0.0709,0.8590 +parallel,18:58:47.509,1050,0.0305,0.0378,2.48e-05,0.9045,0.5945,0.1267,1.387,6453.4,35.27GB,0.0147,0.2151,0.0802,0.6648 +parallel,18:58:54.711,1060,0.0473,0.0419,2.48e-05,0.8403,0.5853,0.1348,1.389,6437.7,35.27GB,0.0191,0.2407,0.1053,0.5989 +parallel,18:59:01.842,1070,0.0365,0.0435,2.48e-05,0.8376,0.5854,0.1277,1.402,6367.4,35.27GB,0.0181,0.2402,0.0942,0.8575 +parallel,18:59:08.646,1080,0.0235,0.0320,2.48e-05,0.7849,0.5758,0.1046,1.470,6068.4,35.27GB,0.0144,0.1972,0.0827,0.7827 +parallel,18:59:15.029,1090,0.0463,0.0414,2.48e-05,0.6990,0.5175,0.1208,1.567,5686.6,35.27GB,0.0234,0.1777,0.1277,0.6976 +parallel,18:59:20.999,1100,0.0429,0.0402,2.48e-05,0.7795,0.5211,0.0759,1.676,5311.8,35.27GB,0.0133,0.1808,0.0749,0.8600 +parallel,18:59:27.433,1110,0.0889,0.0548,2.48e-05,0.7852,0.5443,0.0992,1.554,5719.0,35.27GB,0.0200,0.2290,0.1159,0.5363 +parallel,18:59:33.731,1120,0.0394,0.0430,2.48e-05,0.7397,0.5412,0.0887,1.588,5592.0,35.27GB,0.0163,0.2267,0.0870,0.7686 +parallel,18:59:39.981,1130,0.0812,0.0425,2.48e-05,0.7304,0.5314,0.0935,1.600,5542.2,35.27GB,0.0150,0.2368,0.0737,0.6937 +parallel,18:59:46.516,1140,0.0141,0.0312,2.48e-05,0.7545,0.5588,0.0948,1.530,5789.4,35.27GB,0.0127,0.1838,0.0589,0.5282 +parallel,18:59:53.600,1150,0.0276,0.0333,2.47e-05,0.7081,0.6260,0.0825,1.412,6268.4,35.27GB,0.0155,0.2196,0.0810,0.6355 +parallel,19:00:00.038,1160,0.0206,0.0395,2.47e-05,0.8436,0.5523,0.0913,1.553,5690.6,35.27GB,0.0157,0.1694,0.0852,0.6914 +parallel,19:00:06.702,1170,0.0506,0.0417,2.47e-05,0.8244,0.5708,0.0956,1.501,5882.6,35.27GB,0.0104,0.1753,0.0582,0.8377 +parallel,19:00:14.039,1180,0.0720,0.0547,2.47e-05,0.8426,0.5980,0.1358,1.363,6470.6,35.27GB,0.0134,0.1964,0.0708,0.8491 +parallel,19:00:21.288,1190,0.0325,0.0587,2.47e-05,0.7397,0.5863,0.1386,1.380,6385.0,35.27GB,0.0234,0.2577,0.1170,0.6850 +parallel,19:00:28.919,1200,0.0531,0.0432,2.47e-05,0.6303,0.6518,0.1113,1.311,6714.7,35.27GB,0.0170,0.2143,0.0872,0.5745 +parallel,19:00:35.340,1210,0.0379,0.0524,2.47e-05,0.7050,0.5518,0.0904,1.557,5643.9,35.27GB,0.0176,0.2233,0.0893,0.5515 +parallel,19:00:42.907,1220,0.0397,0.0422,2.47e-05,0.8563,0.6312,0.1255,1.322,6643.1,35.27GB,0.0201,0.2020,0.1090,1.0032 +parallel,19:00:50.476,1230,0.0297,0.0399,2.47e-05,0.6583,0.5995,0.1574,1.321,6636.7,35.27GB,0.0141,0.1900,0.0732,0.4631 +parallel,19:00:57.511,1240,0.0375,0.0385,2.47e-05,0.7127,0.5590,0.1446,1.422,6162.1,35.27GB,0.0110,0.1633,0.0621,0.6229 +parallel,19:01:05.611,1250,0.0163,0.0379,2.47e-05,0.6020,0.6444,0.1655,1.235,7086.3,35.27GB,0.0099,0.1710,0.0537,0.5359 +parallel,19:01:12.778,1260,0.0591,0.0426,2.47e-05,0.8187,0.5784,0.1384,1.395,6263.7,35.27GB,0.0127,0.2164,0.0681,0.9012 +parallel,19:01:20.831,1270,0.0220,0.0425,2.46e-05,0.7673,0.6177,0.1876,1.242,7029.1,35.27GB,0.0157,0.2193,0.0860,0.6339 +parallel,19:01:28.484,1280,0.0189,0.0383,2.46e-05,0.7068,0.6262,0.1391,1.307,6672.2,35.27GB,0.0173,0.2374,0.0971,0.7644 +parallel,19:01:36.126,1290,0.0352,0.0456,2.46e-05,0.7469,0.6109,0.1533,1.309,6655.5,35.27GB,0.0109,0.1733,0.0547,0.8393 +parallel,19:01:45.134,1300,0.0292,0.0448,2.46e-05,0.7324,0.7408,0.1601,1.110,7836.2,35.27GB,0.0165,0.1689,0.0835,0.6470 +parallel,19:01:52.279,1310,0.0180,0.0348,2.46e-05,0.6781,0.5695,0.1450,1.400,6207.9,35.27GB,0.0185,0.2019,0.0854,0.5628 +parallel,19:02:00.173,1320,0.1465,0.0500,2.46e-05,0.7532,0.6279,0.1615,1.267,6851.5,35.27GB,0.0270,0.2238,0.1594,0.6368 +parallel,19:02:07.537,1330,0.0269,0.0527,2.46e-05,0.7676,0.6015,0.1349,1.358,6383.9,35.27GB,0.0114,0.1866,0.0571,0.4999 +parallel,19:02:13.726,1340,0.0332,0.0456,2.46e-05,0.7554,0.5265,0.0924,1.616,5358.2,35.27GB,0.0121,0.1731,0.0629,0.4757 +parallel,19:02:20.761,1350,0.0650,0.0469,2.46e-05,0.6417,0.5840,0.1195,1.422,6084.1,35.27GB,0.0152,0.2265,0.0719,0.6469 +parallel,19:02:26.906,1360,0.0202,0.0359,2.46e-05,0.5651,0.5246,0.0899,1.628,5308.7,35.27GB,0.0114,0.1162,0.0588,0.3670 +parallel,19:02:32.931,1370,0.0197,0.0288,2.45e-05,0.6260,0.5151,0.0874,1.660,5198.7,35.27GB,0.0159,0.1756,0.0854,0.6493 +parallel,19:02:39.146,1380,0.0372,0.0328,2.45e-05,0.7184,0.5480,0.0735,1.609,5356.4,35.27GB,0.0138,0.1706,0.0791,0.7708 +parallel,19:02:46.114,1390,0.0569,0.0359,2.45e-05,0.7087,0.5831,0.1137,1.435,5998.9,35.27GB,0.0107,0.1289,0.0540,0.4897 +parallel,19:02:53.093,1400,0.0556,0.0371,2.45e-05,0.7077,0.5830,0.1150,1.433,6001.6,35.27GB,0.0268,0.2455,0.1355,0.6964 +parallel,19:02:59.806,1410,0.0419,0.0366,2.45e-05,0.7837,0.5452,0.1260,1.490,5765.1,35.27GB,0.0142,0.2158,0.0786,0.5569 +parallel,19:03:06.493,1420,0.0396,0.0334,2.45e-05,0.7347,0.5458,0.1229,1.496,5736.9,35.27GB,0.0153,0.1414,0.0792,0.6606 +parallel,19:03:14.492,1430,0.0645,0.0463,2.45e-05,0.6815,0.6696,0.1302,1.250,6853.8,35.27GB,0.0162,0.2143,0.0927,0.7901 +parallel,19:03:21.718,1440,0.0349,0.0384,2.45e-05,0.7119,0.5910,0.1316,1.384,6184.6,35.27GB,0.0127,0.1841,0.0661,0.6052 +parallel,19:03:28.674,1450,0.0644,0.0447,2.45e-05,0.7383,0.5951,0.1005,1.438,5946.6,35.27GB,0.0160,0.1920,0.0822,0.6591 +parallel,19:03:34.323,1460,0.0863,0.0459,2.44e-05,0.7713,0.4978,0.0671,1.770,4823.6,35.27GB,0.0155,0.2512,0.0861,0.5724 +parallel,19:03:39.811,1470,0.0362,0.0399,2.44e-05,0.6318,0.4822,0.0666,1.823,4680.2,35.27GB,0.0221,0.2091,0.1217,0.6148 +parallel,19:03:45.942,1480,0.0184,0.0314,2.44e-05,0.7564,0.5324,0.0808,1.631,5223.2,35.27GB,0.0198,0.2176,0.1064,0.5787 +parallel,19:03:51.748,1490,0.0250,0.0288,2.44e-05,0.7436,0.5105,0.0701,1.723,4939.4,35.27GB,0.0133,0.1729,0.0625,0.5518 +parallel,19:03:57.586,1500,0.0165,0.0389,2.44e-05,0.6866,0.5182,0.0656,1.713,4961.9,35.27GB,0.0195,0.1744,0.1046,0.4480 +parallel,19:04:03.328,1510,0.0099,0.0385,2.44e-05,0.8025,0.4953,0.0788,1.742,4874.0,35.27GB,0.0139,0.1467,0.0771,0.4531 +parallel,19:04:09.583,1520,0.0559,0.0342,2.44e-05,0.6501,0.5343,0.0912,1.599,5303.7,35.27GB,0.0249,0.1968,0.1366,0.5752 +parallel,19:04:15.682,1530,0.0720,0.0395,2.44e-05,0.6473,0.5293,0.0806,1.640,5165.5,35.27GB,0.0157,0.1754,0.0826,0.8320 +parallel,19:04:21.730,1540,0.0334,0.0339,2.43e-05,0.6346,0.5308,0.0740,1.654,5115.6,35.27GB,0.0099,0.1289,0.0494,0.6417 +parallel,19:04:27.651,1550,0.0462,0.0515,2.43e-05,0.7521,0.5258,0.0663,1.689,5002.4,35.27GB,0.0156,0.2116,0.0860,0.6348 +parallel,19:04:34.255,1560,0.0233,0.0418,2.43e-05,0.6680,0.5681,0.0923,1.514,5573.2,35.27GB,0.0187,0.1946,0.0960,0.5892 +parallel,19:04:40.938,1570,0.0142,0.0431,2.43e-05,0.7641,0.5636,0.1047,1.497,5632.8,35.27GB,0.0266,0.2700,0.1253,1.0730 +parallel,19:04:46.869,1580,0.0182,0.0351,2.43e-05,0.6919,0.5182,0.0749,1.686,4993.2,35.27GB,0.0107,0.1494,0.0505,0.4146 +parallel,19:04:53.743,1590,0.0239,0.0295,2.43e-05,0.6951,0.5838,0.1036,1.455,5780.1,35.27GB,0.0107,0.1989,0.0573,0.4585 +parallel,19:05:00.681,1600,0.0258,0.0306,2.43e-05,0.6823,0.5834,0.1104,1.442,5826.7,35.27GB,0.0155,0.1631,0.0739,0.7295 +parallel,19:05:08.042,1610,0.0131,0.0403,2.43e-05,0.7426,0.5999,0.1363,1.359,6175.2,35.27GB,0.0155,0.2195,0.0717,0.6116 +parallel,19:05:15.001,1620,0.0275,0.0378,2.42e-05,0.6838,0.5800,0.1160,1.437,5830.8,35.27GB,0.0161,0.2408,0.0862,1.3863 +parallel,19:05:21.182,1630,0.0339,0.0369,2.42e-05,0.6173,0.5372,0.0808,1.618,5172.0,35.27GB,0.0170,0.1967,0.0901,0.6284 +parallel,19:05:26.788,1640,0.0373,0.0270,2.42e-05,0.5958,0.4973,0.0634,1.784,4686.2,35.27GB,0.0216,0.2435,0.1195,0.6856 +parallel,19:05:32.675,1650,0.0484,0.0326,2.42e-05,0.6035,0.5151,0.0736,1.699,4914.3,35.27GB,0.0168,0.2092,0.0898,0.6384 +parallel,19:05:38.280,1660,0.0238,0.0360,2.42e-05,0.5806,0.4954,0.0651,1.784,4673.9,35.27GB,0.0152,0.1329,0.0775,0.5350 +parallel,19:05:43.762,1670,0.0346,0.0377,2.42e-05,0.7035,0.4822,0.0660,1.825,4565.1,35.27GB,0.0143,0.2296,0.0685,0.6212 +parallel,19:05:49.682,1680,0.0194,0.0367,2.42e-05,0.6466,0.5265,0.0656,1.689,4925.0,35.27GB,0.0130,0.1691,0.0662,0.5645 +parallel,19:05:55.398,1690,0.0532,0.0336,2.41e-05,0.6479,0.5072,0.0643,1.750,4749.3,35.27GB,0.0142,0.2488,0.0735,0.7097 +parallel,19:06:01.380,1700,0.0486,0.0406,2.41e-05,0.7110,0.5236,0.0746,1.672,4964.5,35.27GB,0.0248,0.3108,0.1241,0.7008 +parallel,19:06:07.260,1710,0.0144,0.0283,2.41e-05,0.6042,0.5027,0.0854,1.701,4873.8,35.27GB,0.0136,0.1782,0.0617,0.5886 +parallel,19:06:14.457,1720,0.0355,0.0401,2.41e-05,0.6471,0.5811,0.1386,1.390,5958.5,35.27GB,0.0133,0.1716,0.0730,0.6572 +parallel,19:06:22.516,1730,0.0104,0.0302,2.41e-05,0.7345,0.6594,0.1465,1.241,6663.5,35.27GB,0.0137,0.1846,0.0665,0.7704 +parallel,19:06:30.267,1740,0.0080,0.0311,2.41e-05,0.7110,0.6175,0.1576,1.290,6401.5,35.27GB,0.0215,0.2194,0.1054,0.5143 +parallel,19:06:38.204,1750,0.0478,0.0341,2.41e-05,0.6419,0.6228,0.1709,1.260,6546.7,35.27GB,0.0199,0.2154,0.0988,0.8680 +parallel,19:06:45.693,1760,0.0128,0.0486,2.40e-05,0.6482,0.5831,0.1658,1.335,6170.5,35.27GB,0.0203,0.1805,0.1082,0.4590 +parallel,19:06:52.447,1770,0.0305,0.0378,2.40e-05,0.7349,0.5515,0.1239,1.481,5557.3,35.27GB,0.0159,0.1906,0.0753,0.6005 +parallel,19:07:00.758,1780,0.0222,0.0330,2.40e-05,0.5879,0.6742,0.1569,1.203,6830.7,35.27GB,0.0155,0.1657,0.0681,0.4276 +parallel,19:07:08.186,1790,0.0215,0.0333,2.40e-05,0.6438,0.6041,0.1387,1.346,6097.7,35.27GB,0.0098,0.1645,0.0546,0.5018 +parallel,19:07:15.810,1800,0.0999,0.0343,2.40e-05,0.7122,0.6266,0.1359,1.312,6251.1,35.27GB,0.0232,0.2762,0.1284,0.7935 +parallel,19:07:24.262,1810,0.0138,0.0268,2.40e-05,0.5938,0.6335,0.2117,1.183,6921.0,35.27GB,0.0101,0.1326,0.0551,0.5075 +parallel,19:07:32.552,1820,0.0140,0.0299,2.40e-05,0.6707,0.6612,0.1678,1.207,6779.9,35.27GB,0.0149,0.2032,0.0697,0.6875 +parallel,19:07:40.550,1830,0.0175,0.0259,2.39e-05,0.5750,0.6480,0.1518,1.251,6533.3,35.27GB,0.0139,0.1595,0.0696,0.4718 +parallel,19:07:48.650,1840,0.0256,0.0256,2.39e-05,0.6718,0.6488,0.1612,1.235,6609.1,35.27GB,0.0101,0.1563,0.0527,0.5841 +parallel,19:07:57.103,1850,0.0144,0.0238,2.39e-05,0.6204,0.6841,0.1612,1.183,6887.8,35.27GB,0.0118,0.1351,0.0592,0.4311 +parallel,19:08:04.414,1860,0.0175,0.0321,2.39e-05,0.7131,0.5917,0.1394,1.368,5950.3,35.27GB,0.0178,0.2110,0.1045,0.5490 +parallel,19:08:11.658,1870,0.0386,0.0315,2.39e-05,0.6103,0.6045,0.1199,1.381,5888.3,35.27GB,0.0133,0.1448,0.0750,0.6229 +parallel,19:08:20.767,1880,0.0224,0.0343,2.39e-05,0.6823,0.6999,0.2111,1.098,7396.2,35.27GB,0.0111,0.1645,0.0607,1.2589 +parallel,19:08:28.390,1890,0.0142,0.0256,2.38e-05,0.5904,0.6097,0.1527,1.312,6179.6,35.27GB,0.0176,0.1699,0.1005,0.3683 +parallel,19:08:36.915,1900,0.0122,0.0270,2.38e-05,0.6805,0.6622,0.1902,1.173,6904.0,35.27GB,0.0126,0.1887,0.0605,0.4870 +parallel,19:08:43.548,1910,0.0766,0.0340,2.38e-05,0.6078,0.5408,0.1225,1.508,5365.2,35.27GB,0.0341,0.2469,0.1871,0.7141 +parallel,19:08:51.009,1920,0.0517,0.0289,2.38e-05,0.5853,0.5847,0.1614,1.340,6027.9,35.27GB,0.0223,0.1658,0.1152,0.4647 +parallel,19:08:59.915,1930,0.0736,0.0354,2.38e-05,0.6598,0.7107,0.1798,1.123,7184.5,35.27GB,0.0259,0.1859,0.1342,0.6000 +parallel,19:09:07.130,1940,0.0893,0.0357,2.38e-05,0.6141,0.5679,0.1536,1.386,5813.9,35.27GB,0.0180,0.1976,0.0957,0.7395 +parallel,19:09:14.639,1950,0.0195,0.0317,2.37e-05,0.5467,0.6137,0.1372,1.332,6044.1,35.27GB,0.0171,0.1826,0.0853,0.5761 +parallel,19:09:22.575,1960,0.0267,0.0435,2.37e-05,0.7081,0.6319,0.1617,1.260,6380.2,35.27GB,0.0267,0.1927,0.1302,0.7528 +parallel,19:09:29.834,1970,0.0325,0.0364,2.37e-05,0.6703,0.6026,0.1233,1.378,5828.0,35.27GB,0.0132,0.1819,0.0613,0.7383 +parallel,19:09:37.528,1980,0.0104,0.0306,2.37e-05,0.5608,0.6290,0.1404,1.300,6169.2,35.27GB,0.0091,0.1574,0.0483,0.3474 +parallel,19:09:44.861,1990,0.0317,0.0321,2.37e-05,0.6389,0.6116,0.1218,1.364,5872.7,35.27GB,0.0142,0.1736,0.0778,0.6810 +parallel,19:09:53.627,2000,0.0280,0.0267,2.37e-05,0.6051,0.7138,0.1628,1.141,7012.2,35.27GB,0.0180,0.1784,0.0955,0.5627 +parallel,19:12:13.900,2010,0.0441,0.0309,2.36e-05,0.5684,0.5752,13.4521,0.071,112077.5,35.27GB,0.0282,0.2208,0.1501,0.5833 +parallel,19:12:21.382,2020,0.0095,0.0268,2.36e-05,0.6474,0.6031,0.1451,1.337,5969.2,35.27GB,0.0170,0.1521,0.0906,0.6150 +parallel,19:12:29.593,2030,0.0230,0.0320,2.36e-05,0.5594,0.6550,0.1662,1.218,6543.6,35.27GB,0.0221,0.2304,0.1217,0.5267 +parallel,19:12:38.028,2040,0.0598,0.0351,2.36e-05,0.5957,0.6497,0.1937,1.186,6712.9,35.27GB,0.0229,0.2128,0.1253,0.6465 +parallel,19:12:45.867,2050,0.0154,0.0289,2.36e-05,0.6164,0.6304,0.1535,1.276,6231.6,35.27GB,0.0144,0.1924,0.0750,0.7113 +parallel,19:12:53.345,2060,0.0314,0.0350,2.35e-05,0.5571,0.6012,0.1466,1.337,5936.6,35.27GB,0.0183,0.2079,0.0979,0.4581 +parallel,19:13:01.369,2070,0.0184,0.0317,2.35e-05,0.5616,0.6225,0.1800,1.246,6362.3,35.27GB,0.0201,0.1684,0.1146,0.4988 +parallel,19:13:07.715,2080,0.0170,0.0216,2.35e-05,0.5138,0.5438,0.0907,1.576,5025.1,35.27GB,0.0107,0.1591,0.0524,0.4468 +parallel,19:13:15.358,2090,0.0328,0.0290,2.35e-05,0.6198,0.6042,0.1601,1.309,6044.4,35.27GB,0.0158,0.1785,0.0805,0.5652 +parallel,19:13:22.627,2100,0.0271,0.0308,2.35e-05,0.6641,0.5892,0.1377,1.376,5742.4,35.27GB,0.0244,0.2344,0.1299,0.6544 +parallel,19:13:29.842,2110,0.0220,0.0307,2.35e-05,0.6519,0.5709,0.1506,1.386,5691.5,35.27GB,0.0140,0.1431,0.0708,0.5445 +parallel,19:13:36.050,2120,0.0657,0.0315,2.34e-05,0.6137,0.5220,0.0989,1.611,4891.3,35.27GB,0.0197,0.1916,0.1080,0.5001 +parallel,19:13:43.841,2130,0.0125,0.0240,2.34e-05,0.6547,0.6337,0.1454,1.284,6131.1,35.27GB,0.0162,0.1790,0.0867,0.5061 +parallel,19:13:51.389,2140,0.0206,0.0216,2.34e-05,0.5490,0.5957,0.1591,1.325,5931.2,35.27GB,0.0114,0.1737,0.0640,0.5218 +parallel,19:13:58.905,2150,0.0179,0.0230,2.34e-05,0.5342,0.6072,0.1444,1.331,5898.9,35.27GB,0.0214,0.1827,0.1233,0.4606 +parallel,19:14:07.233,2160,0.0183,0.0318,2.34e-05,0.5424,0.6463,0.1865,1.201,6528.4,35.27GB,0.0192,0.1966,0.1026,0.4306 +parallel,19:14:14.105,2170,0.0220,0.0262,2.33e-05,0.5086,0.5624,0.1247,1.455,5379.8,35.27GB,0.0156,0.1405,0.0788,0.4417 +parallel,19:14:22.139,2180,0.0104,0.0231,2.33e-05,0.5984,0.6254,0.1781,1.245,6282.2,35.27GB,0.0143,0.1549,0.0723,0.5079 +parallel,19:14:28.777,2190,0.0074,0.0295,2.33e-05,0.5972,0.5512,0.1125,1.507,5182.8,35.27GB,0.0062,0.0821,0.0302,0.2958 +parallel,19:14:35.689,2200,0.0297,0.0257,2.33e-05,0.6252,0.5716,0.1196,1.447,5390.6,35.27GB,0.0269,0.2761,0.1414,0.8229 +parallel,19:14:43.530,2210,0.0048,0.0264,2.33e-05,0.5325,0.6079,0.1761,1.276,6107.0,35.27GB,0.0171,0.1671,0.0940,0.5297 +parallel,19:14:51.154,2220,0.0342,0.0291,2.32e-05,0.5975,0.5874,0.1751,1.312,5930.5,35.27GB,0.0143,0.1959,0.0758,0.5077 +parallel,19:14:59.383,2230,0.0299,0.0282,2.32e-05,0.6065,0.6168,0.2061,1.215,6393.4,35.27GB,0.0141,0.2058,0.0755,0.6015 +parallel,19:15:07.033,2240,0.0333,0.0247,2.32e-05,0.5207,0.5933,0.1717,1.307,5935.8,35.27GB,0.0121,0.1505,0.0622,0.4722 +parallel,19:15:13.470,2250,0.0477,0.0268,2.32e-05,0.5262,0.5401,0.1036,1.554,4987.5,35.27GB,0.0132,0.1527,0.0598,0.4806 +parallel,19:15:20.157,2260,0.0200,0.0259,2.32e-05,0.6786,0.5509,0.1178,1.496,5175.2,35.27GB,0.0205,0.1665,0.1068,0.6506 +parallel,19:15:26.815,2270,0.0175,0.0272,2.31e-05,0.5522,0.5569,0.1089,1.502,5145.8,35.27GB,0.0106,0.1364,0.0511,0.4607 +parallel,19:15:33.882,2280,0.0543,0.0264,2.31e-05,0.5947,0.5889,0.1178,1.415,5454.6,35.27GB,0.0250,0.2159,0.1293,0.5894 +parallel,19:15:40.076,2290,0.0280,0.0280,2.31e-05,0.6412,0.5292,0.0903,1.615,4775.2,35.27GB,0.0253,0.2154,0.1336,0.5023 +parallel,19:15:46.732,2300,0.0320,0.0263,2.31e-05,0.6155,0.5725,0.0931,1.503,5124.1,35.27GB,0.0116,0.1684,0.0544,0.5484 +parallel,19:15:52.466,2310,0.0801,0.0375,2.31e-05,0.5936,0.5047,0.0687,1.744,4408.4,35.27GB,0.0233,0.2358,0.1348,0.6627 +parallel,19:15:58.105,2320,0.1007,0.0347,2.30e-05,0.5551,0.4933,0.0705,1.774,4330.1,35.27GB,0.0256,0.1877,0.1308,0.5658 +parallel,19:16:04.107,2330,0.0246,0.0312,2.30e-05,0.5141,0.5221,0.0782,1.666,4603.5,35.27GB,0.0118,0.1303,0.0595,0.4893 +parallel,19:16:10.056,2340,0.0185,0.0212,2.30e-05,0.6848,0.5186,0.0763,1.681,4556.0,35.27GB,0.0152,0.1679,0.0736,0.4759 +parallel,19:16:16.269,2350,0.0324,0.0226,2.30e-05,0.5726,0.5418,0.0795,1.610,4751.3,35.27GB,0.0216,0.2015,0.1126,0.5222 +parallel,19:16:22.370,2360,0.0137,0.0264,2.29e-05,0.5920,0.5114,0.0987,1.639,4660.3,35.27GB,0.0121,0.1273,0.0657,0.3158 +parallel,19:16:28.266,2370,0.0200,0.0257,2.29e-05,0.5386,0.4944,0.0952,1.696,4497.6,35.27GB,0.0071,0.1174,0.0379,0.3762 +parallel,19:16:33.699,2380,0.0445,0.0251,2.29e-05,0.5655,0.4805,0.0628,1.841,4139.3,35.27GB,0.0247,0.2160,0.1330,0.4830 +parallel,19:16:39.359,2390,0.0074,0.0282,2.29e-05,0.5667,0.4889,0.0771,1.767,4306.4,35.27GB,0.0076,0.1352,0.0392,0.3903 +parallel,19:16:45.394,2400,0.0151,0.0219,2.29e-05,0.5844,0.5267,0.0768,1.657,4585.8,35.27GB,0.0132,0.1367,0.0676,0.6095 +parallel,19:16:51.381,2410,0.0245,0.0203,2.28e-05,0.4979,0.5122,0.0865,1.671,4543.4,35.27GB,0.0120,0.1144,0.0559,0.5339 +parallel,19:16:58.073,2420,0.0189,0.0419,2.28e-05,0.6382,0.5588,0.1104,1.495,5071.6,35.27GB,0.0202,0.2083,0.1181,0.6415 +parallel,19:17:04.973,2430,0.0581,0.0321,2.28e-05,0.6099,0.5746,0.1154,1.449,5222.9,35.27GB,0.0158,0.2241,0.0869,0.6354 +parallel,19:17:10.810,2440,0.0095,0.0284,2.28e-05,0.4721,0.5055,0.0782,1.714,4411.4,35.27GB,0.0128,0.1534,0.0697,0.3680 +parallel,19:17:17.655,2450,0.0219,0.0251,2.28e-05,0.5551,0.5750,0.1096,1.461,5167.3,35.27GB,0.0084,0.1106,0.0464,0.3042 +parallel,19:17:23.860,2460,0.0318,0.0212,2.27e-05,0.4913,0.5176,0.1029,1.612,4677.6,35.27GB,0.0147,0.1711,0.0738,0.4441 +parallel,19:17:31.022,2470,0.0362,0.0270,2.27e-05,0.5364,0.5596,0.1566,1.396,5392.3,35.27GB,0.0199,0.2280,0.0987,0.6443 +parallel,19:17:37.197,2480,0.0256,0.0219,2.27e-05,0.5620,0.5223,0.0952,1.620,4642.7,35.27GB,0.0144,0.1921,0.0746,0.4260 +parallel,19:17:43.401,2490,0.0235,0.0283,2.27e-05,0.5414,0.5222,0.0982,1.612,4658.0,35.27GB,0.0180,0.2060,0.0932,0.5434 +parallel,19:17:50.121,2500,0.0684,0.0282,2.26e-05,0.5831,0.5741,0.0979,1.488,5038.9,35.27GB,0.0176,0.1804,0.0842,0.5649 +parallel,19:17:56.944,2510,0.0091,0.0226,2.26e-05,0.5188,0.5641,0.1182,1.466,5109.8,35.27GB,0.0224,0.1571,0.1174,0.3305 +parallel,19:18:02.608,2520,0.0104,0.0256,2.26e-05,0.5732,0.5008,0.0656,1.766,4236.3,35.27GB,0.0300,0.2717,0.1593,0.5126 +parallel,19:18:08.687,2530,0.0218,0.0261,2.26e-05,0.5306,0.5206,0.0872,1.645,4539.8,35.27GB,0.0173,0.1509,0.0851,0.4745 +parallel,19:18:14.530,2540,0.0418,0.0237,2.25e-05,0.5316,0.5014,0.0829,1.712,4357.9,35.27GB,0.0133,0.1445,0.0672,0.6081 +parallel,19:18:20.379,2550,0.0617,0.0246,2.25e-05,0.5513,0.5096,0.0753,1.710,4356.6,35.27GB,0.0117,0.1585,0.0617,0.5779 +parallel,19:18:26.129,2560,0.0302,0.0241,2.25e-05,0.5529,0.5048,0.0702,1.739,4278.0,35.27GB,0.0088,0.1159,0.0467,0.5075 +parallel,19:18:31.713,2570,0.0495,0.0292,2.25e-05,0.5717,0.4877,0.0707,1.791,4148.1,35.27GB,0.0122,0.1832,0.0664,0.4524 +parallel,19:18:37.822,2580,0.0321,0.0246,2.25e-05,0.5234,0.5319,0.0790,1.637,4532.2,35.27GB,0.0145,0.1533,0.0721,0.5820 +parallel,19:18:44.226,2590,0.1227,0.0336,2.24e-05,0.6507,0.5320,0.1083,1.562,4744.6,35.27GB,0.0221,0.2433,0.1118,1.4099 +parallel,19:18:49.763,2600,0.0155,0.0295,2.24e-05,0.6342,0.4879,0.0658,1.806,4096.7,35.27GB,0.0216,0.1688,0.1134,0.4951 +parallel,19:18:55.343,2610,0.0367,0.0283,2.24e-05,0.6245,0.4812,0.0768,1.792,4122.8,35.27GB,0.0125,0.1629,0.0595,0.4429 +parallel,19:19:00.933,2620,0.0086,0.0232,2.24e-05,0.5396,0.4897,0.0693,1.789,4124.8,35.27GB,0.0128,0.1221,0.0681,0.5209 +parallel,19:19:07.129,2630,0.0124,0.0177,2.23e-05,0.4944,0.5364,0.0832,1.614,4566.0,35.27GB,0.0235,0.1675,0.1191,0.4886 +parallel,19:19:12.783,2640,0.0281,0.0166,2.23e-05,0.5114,0.4898,0.0756,1.769,4160.7,35.27GB,0.0220,0.1929,0.1177,0.5384 +parallel,19:19:18.856,2650,0.0356,0.0190,2.23e-05,0.5005,0.5332,0.0741,1.647,4462.8,35.27GB,0.0167,0.1246,0.0874,0.4393 +parallel,19:19:24.839,2660,0.0278,0.0196,2.23e-05,0.5207,0.5006,0.0977,1.672,4390.6,35.27GB,0.0096,0.1465,0.0455,0.3820 +parallel,19:19:34.557,2670,0.0115,0.0176,2.22e-05,0.4742,0.6650,0.3068,1.029,7122.6,35.27GB,0.0112,0.1141,0.0547,0.3952 +parallel,19:19:47.012,2680,0.0107,0.0177,2.22e-05,0.4972,0.8740,0.3716,0.804,9107.9,35.27GB,0.0075,0.1118,0.0334,0.3007 +parallel,19:19:57.909,2690,0.0176,0.0202,2.22e-05,0.4937,0.7200,0.3696,0.918,7961.7,35.27GB,0.0201,0.2239,0.1098,0.5080 +parallel,19:20:08.860,2700,0.0058,0.0161,2.22e-05,0.5484,0.8121,0.2830,0.913,7991.6,35.27GB,0.0111,0.1219,0.0559,0.3449 +parallel,19:20:18.341,2710,0.0357,0.0230,2.21e-05,0.5690,0.7255,0.2226,1.056,6905.4,35.27GB,0.0206,0.2175,0.1063,0.5956 +parallel,19:20:26.000,2720,0.0307,0.0221,2.21e-05,0.4975,0.6115,0.1544,1.307,5570.7,35.27GB,0.0176,0.1846,0.0943,0.4927 +parallel,19:20:34.331,2730,0.0089,0.0196,2.21e-05,0.5381,0.6826,0.1506,1.201,6054.6,35.27GB,0.0118,0.1252,0.0578,0.3948 +parallel,19:20:42.376,2740,0.0556,0.0223,2.21e-05,0.5324,0.6120,0.1924,1.243,5838.8,35.27GB,0.0219,0.2113,0.1184,0.6011 +parallel,19:20:50.434,2750,0.0157,0.0200,2.20e-05,0.4970,0.6140,0.1917,1.241,5839.7,35.27GB,0.0112,0.1694,0.0606,0.5226 +parallel,19:21:00.088,2760,0.0272,0.0171,2.20e-05,0.4829,0.6599,0.3055,1.036,6989.0,35.27GB,0.0097,0.1085,0.0484,0.4126 +parallel,19:21:07.956,2770,0.0139,0.0226,2.20e-05,0.5479,0.5768,0.2100,1.271,5687.6,35.27GB,0.0067,0.0963,0.0329,0.4171 +parallel,19:21:15.540,2780,0.0164,0.0236,2.20e-05,0.4880,0.6330,0.1254,1.319,5472.1,35.27GB,0.0067,0.1034,0.0353,0.3744 +parallel,19:21:22.526,2790,0.0062,0.0241,2.19e-05,0.5330,0.5654,0.1332,1.432,5033.7,35.27GB,0.0121,0.1443,0.0672,0.4396 +parallel,19:21:30.175,2800,0.0506,0.0292,2.19e-05,0.4862,0.5926,0.1724,1.307,5507.0,35.27GB,0.0235,0.1602,0.1168,0.5073 +parallel,19:21:40.460,2810,0.0084,0.0232,2.19e-05,0.5652,0.6948,0.3336,0.972,7393.9,35.27GB,0.0123,0.1484,0.0595,0.6626 +parallel,19:21:49.873,2820,0.0361,0.0190,2.19e-05,0.5122,0.6476,0.2936,1.063,6757.2,35.27GB,0.0190,0.1762,0.1021,0.7007 +parallel,19:21:57.583,2830,0.0058,0.0127,2.18e-05,0.4606,0.6122,0.1587,1.297,5526.7,35.27GB,0.0065,0.0915,0.0353,0.3276 +parallel,19:22:03.389,2840,0.0141,0.0190,2.18e-05,0.4922,0.5019,0.0787,1.723,4156.7,35.27GB,0.0169,0.1700,0.0903,0.4407 +parallel,19:22:10.208,2850,0.0155,0.0201,2.18e-05,0.5022,0.5609,0.1210,1.467,4875.1,35.27GB,0.0097,0.1380,0.0491,0.5397 +parallel,19:22:16.357,2860,0.0266,0.0225,2.18e-05,0.4872,0.5351,0.0798,1.626,4390.5,35.27GB,0.0140,0.1237,0.0742,0.3875 +parallel,19:22:22.067,2870,0.0137,0.0202,2.17e-05,0.5385,0.4984,0.0726,1.752,4070.1,35.27GB,0.0142,0.2027,0.0678,0.4955 +parallel,19:22:28.926,2880,0.0116,0.0235,2.17e-05,0.4965,0.5923,0.0937,1.458,4883.9,35.27GB,0.0118,0.2041,0.0580,0.6080 +parallel,19:22:35.602,2890,0.0142,0.0190,2.17e-05,0.5482,0.5458,0.1217,1.498,4746.0,35.27GB,0.0077,0.0950,0.0351,0.3394 +parallel,19:22:41.522,2900,0.0158,0.0216,2.17e-05,0.5148,0.5183,0.0737,1.689,4202.8,35.27GB,0.0072,0.0933,0.0377,0.3448 +parallel,19:22:47.407,2910,0.0281,0.0248,2.16e-05,0.6104,0.5084,0.0801,1.700,4171.8,35.27GB,0.0174,0.2403,0.0966,0.5880 +parallel,19:22:54.056,2920,0.0071,0.0206,2.16e-05,0.5159,0.5332,0.1316,1.504,4706.8,35.27GB,0.0250,0.1814,0.1286,0.4983 +parallel,19:23:00.272,2930,0.0223,0.0291,2.16e-05,0.5616,0.5402,0.0815,1.609,4394.3,35.27GB,0.0070,0.0773,0.0360,0.2959 +parallel,19:23:06.651,2940,0.0171,0.0246,2.15e-05,0.4731,0.5398,0.0981,1.568,4502.9,35.27GB,0.0107,0.1142,0.0539,0.5367 +parallel,19:23:12.697,2950,0.0137,0.0203,2.15e-05,0.5309,0.5264,0.0782,1.654,4262.3,35.27GB,0.0237,0.1629,0.1215,0.4928 +parallel,19:23:18.373,2960,0.0066,0.0169,2.15e-05,0.4517,0.4981,0.0694,1.762,3994.7,35.27GB,0.0245,0.2016,0.1343,0.6550 +parallel,19:23:24.126,2970,0.0162,0.0176,2.15e-05,0.4830,0.5022,0.0731,1.738,4044.4,35.27GB,0.0079,0.1203,0.0411,0.3011 +parallel,19:23:30.080,2980,0.0236,0.0156,2.14e-05,0.4386,0.5170,0.0785,1.680,4179.2,35.27GB,0.0103,0.1275,0.0552,0.4303 +parallel,19:23:36.461,2990,0.0595,0.0269,2.14e-05,0.5241,0.5357,0.1023,1.567,4472.1,35.27GB,0.0119,0.2005,0.0614,0.4129 +parallel,19:23:42.798,3000,0.0201,0.0249,2.14e-05,0.5353,0.5412,0.0925,1.578,4435.5,35.27GB,0.0200,0.1988,0.1050,0.6035 +parallel,19:26:08.608,3010,0.0070,0.0205,2.14e-05,0.6298,0.6527,13.9283,0.069,101920.7,35.27GB,0.0318,0.1761,0.1618,0.6405 +parallel,19:26:15.728,3020,0.0138,0.0196,2.13e-05,0.5346,0.5938,0.1182,1.405,4969.3,35.27GB,0.0095,0.1090,0.0510,0.4272 +parallel,19:26:23.584,3030,0.0079,0.0319,2.13e-05,0.5188,0.6205,0.1651,1.273,5475.1,35.27GB,0.0311,0.2589,0.1845,0.8296 +parallel,19:26:30.239,3040,0.0177,0.0227,2.13e-05,0.5146,0.5518,0.1137,1.503,4631.6,35.27GB,0.0101,0.1197,0.0467,0.3931 +parallel,19:26:37.431,3050,0.0061,0.0207,2.12e-05,0.6467,0.6057,0.1134,1.391,4995.4,35.27GB,0.0078,0.1142,0.0356,0.3554 +parallel,19:26:44.770,3060,0.0243,0.0204,2.12e-05,0.4928,0.5739,0.1600,1.363,5092.6,35.27GB,0.0261,0.2474,0.1311,0.5398 +parallel,19:26:51.444,3070,0.0125,0.0196,2.12e-05,0.5857,0.5545,0.1129,1.499,4624.6,35.27GB,0.0144,0.1532,0.0690,0.7375 +parallel,19:26:58.878,3080,0.0596,0.0269,2.12e-05,0.4780,0.6166,0.1268,1.345,5143.7,35.27GB,0.0272,0.1916,0.1461,0.4235 +parallel,19:27:05.523,3090,0.0165,0.0233,2.11e-05,0.5428,0.5431,0.1214,1.505,4591.4,35.27GB,0.0221,0.2302,0.1118,0.5303 +parallel,19:27:13.120,3100,0.1228,0.0417,2.11e-05,0.5249,0.6132,0.1465,1.316,5241.5,35.27GB,0.0252,0.1981,0.1527,0.4690 +parallel,19:27:21.093,3110,0.0037,0.0262,2.11e-05,0.5473,0.6052,0.1921,1.254,5492.7,35.27GB,0.0149,0.1677,0.0763,0.4268 +parallel,19:27:28.212,3120,0.0204,0.0233,2.11e-05,0.4770,0.5573,0.1546,1.405,4897.2,35.27GB,0.0190,0.1919,0.1026,0.3531 +parallel,19:27:35.325,3130,0.0260,0.0194,2.10e-05,0.5252,0.5630,0.1483,1.406,4886.4,35.27GB,0.0151,0.1677,0.0795,0.4111 +parallel,19:27:42.029,3140,0.0194,0.0173,2.10e-05,0.5092,0.5545,0.1159,1.492,4598.3,35.27GB,0.0195,0.1228,0.1051,0.4232 +parallel,19:27:49.117,3150,0.0288,0.0186,2.10e-05,0.4789,0.5887,0.1202,1.411,4855.2,35.27GB,0.0203,0.1593,0.1181,0.5429 +parallel,19:27:56.287,3160,0.0388,0.0262,2.09e-05,0.5851,0.6019,0.1151,1.395,4903.7,35.27GB,0.0206,0.2228,0.1079,0.5647 +parallel,19:28:04.019,3170,0.0060,0.0236,2.09e-05,0.5881,0.5981,0.1751,1.294,5280.2,35.27GB,0.0135,0.1455,0.0641,0.3745 +parallel,19:28:10.762,3180,0.0179,0.0164,2.09e-05,0.4781,0.5720,0.1023,1.483,4598.1,35.27GB,0.0286,0.2471,0.1538,0.6004 +parallel,19:28:18.431,3190,0.0169,0.0178,2.09e-05,0.4987,0.6148,0.1521,1.304,5222.3,35.27GB,0.0306,0.2713,0.1608,0.5994 +parallel,19:28:26.506,3200,0.0673,0.0274,2.08e-05,0.5034,0.6530,0.1545,1.238,5490.7,35.27GB,0.0276,0.1715,0.1366,0.4667 +parallel,19:28:33.241,3210,0.0236,0.0208,2.08e-05,0.5143,0.5561,0.1174,1.485,4572.0,35.27GB,0.0207,0.1880,0.1142,0.4799 +parallel,19:28:39.897,3220,0.0414,0.0247,2.08e-05,0.5397,0.5503,0.1153,1.503,4512.5,35.27GB,0.0132,0.1605,0.0735,0.4325 +parallel,19:28:48.034,3230,0.0233,0.0225,2.07e-05,0.5073,0.6744,0.1393,1.229,5508.3,35.27GB,0.0132,0.1314,0.0692,0.4714 +parallel,19:28:54.516,3240,0.0256,0.0212,2.07e-05,0.5305,0.5383,0.1099,1.543,4381.2,35.27GB,0.0117,0.1490,0.0652,0.4982 +parallel,19:29:01.997,3250,0.0069,0.0187,2.07e-05,0.5243,0.6011,0.1469,1.337,5049.0,35.27GB,0.0090,0.0995,0.0427,0.2295 +parallel,19:29:10.223,3260,0.0312,0.0412,2.06e-05,0.5695,0.6439,0.1787,1.216,5542.4,35.27GB,0.0142,0.1480,0.0712,0.4993 +parallel,19:29:17.292,3270,0.0179,0.0404,2.06e-05,0.6858,0.5668,0.1401,1.415,4757.4,35.27GB,0.0087,0.1235,0.0483,0.3326 +parallel,19:29:24.754,3280,0.0286,0.0312,2.06e-05,0.5614,0.6223,0.1239,1.340,5013.4,35.27GB,0.0168,0.1680,0.0861,0.4310 +parallel,19:29:31.591,3290,0.0337,0.0250,2.06e-05,0.6012,0.5643,0.1194,1.463,4586.9,35.27GB,0.0141,0.1565,0.0712,0.4981 +parallel,19:29:39.359,3300,0.0088,0.0227,2.05e-05,0.5085,0.6271,0.1497,1.287,5203.9,35.27GB,0.0365,0.1838,0.2137,0.5782 +parallel,19:29:46.872,3310,0.0122,0.0202,2.05e-05,0.5440,0.5938,0.1575,1.331,5025.6,35.27GB,0.0303,0.2099,0.1675,0.4893 +parallel,19:29:54.693,3320,0.0216,0.0204,2.05e-05,0.5164,0.6196,0.1625,1.279,5224.0,35.27GB,0.0181,0.2122,0.0972,0.4893 +parallel,19:30:02.140,3330,0.0078,0.0188,2.04e-05,0.4431,0.5974,0.1473,1.343,4966.5,35.27GB,0.0199,0.1742,0.1107,0.4401 +parallel,19:30:08.677,3340,0.0422,0.0200,2.04e-05,0.4846,0.5369,0.1168,1.530,4352.7,35.27GB,0.0211,0.2363,0.1080,0.7666 +parallel,19:30:16.249,3350,0.0121,0.0183,2.04e-05,0.4958,0.6054,0.1519,1.321,5035.1,35.27GB,0.0077,0.1172,0.0404,0.3190 +parallel,19:30:23.708,3360,0.0112,0.0159,2.03e-05,0.5111,0.5631,0.1827,1.341,4951.8,35.27GB,0.0157,0.1601,0.0801,0.3499 +parallel,19:30:31.403,3370,0.0288,0.0254,2.03e-05,0.7487,0.5902,0.1793,1.300,5100.6,35.27GB,0.0122,0.1435,0.0631,0.4325 +parallel,19:30:38.256,3380,0.0460,0.0235,2.03e-05,0.4451,0.5721,0.1132,1.459,4536.0,35.27GB,0.0221,0.1872,0.1155,0.3657 +parallel,19:30:45.832,3390,0.0110,0.0182,2.03e-05,0.5251,0.5987,0.1589,1.320,5007.2,35.27GB,0.0213,0.2380,0.1001,0.6710 +parallel,19:30:53.354,3400,0.0151,0.0179,2.02e-05,0.4966,0.5991,0.1532,1.330,4964.2,35.27GB,0.0068,0.1097,0.0309,0.3332 +parallel,19:31:00.789,3410,0.0228,0.0214,2.02e-05,0.5274,0.5870,0.1563,1.345,4898.5,35.27GB,0.0093,0.1150,0.0425,0.4133 +parallel,19:31:08.389,3420,0.0074,0.0183,2.02e-05,0.6094,0.5854,0.1746,1.316,5000.4,35.27GB,0.0160,0.1763,0.0853,0.4239 +parallel,19:31:16.205,3430,0.0100,0.0198,2.01e-05,0.7561,0.6430,0.1385,1.280,5134.0,35.27GB,0.0067,0.0866,0.0367,0.4135 +parallel,19:31:23.219,3440,0.0124,0.0215,2.01e-05,0.5023,0.5715,0.1300,1.426,4600.5,35.27GB,0.0136,0.1376,0.0658,0.4925 +parallel,19:31:30.554,3450,0.0130,0.0187,2.01e-05,0.4574,0.5919,0.1416,1.363,4803.9,35.27GB,0.0073,0.1047,0.0386,0.3397 +parallel,19:31:37.277,3460,0.0207,0.0228,2.00e-05,0.4663,0.5481,0.1241,1.488,4395.7,35.27GB,0.0219,0.2086,0.1137,0.5451 +parallel,19:31:44.134,3470,0.0181,0.0218,2.00e-05,0.4805,0.5744,0.1113,1.459,4477.2,35.27GB,0.0202,0.1644,0.1095,0.3851 +parallel,19:31:51.647,3480,0.0564,0.0282,2.00e-05,0.5129,0.6308,0.1204,1.331,4897.6,35.27GB,0.0262,0.1830,0.1330,0.4122 +parallel,19:31:58.906,3490,0.0275,0.0251,2.00e-05,0.5635,0.5778,0.1481,1.378,4725.2,35.27GB,0.0161,0.1524,0.0887,0.5594 +parallel,19:32:05.704,3500,0.1977,0.0405,1.99e-05,0.4995,0.5724,0.1073,1.471,4417.8,35.27GB,0.0306,0.2074,0.1875,0.4800 +parallel,19:32:12.583,3510,0.0296,0.0249,1.99e-05,0.5740,0.5610,0.1270,1.454,4464.3,35.27GB,0.0156,0.1407,0.0840,0.4714 +parallel,19:32:19.051,3520,0.0063,0.0259,1.99e-05,0.4855,0.5396,0.1072,1.546,4190.8,35.27GB,0.0176,0.1421,0.0936,0.3783 +parallel,19:32:26.165,3530,0.0142,0.0218,1.98e-05,0.4946,0.5977,0.1137,1.406,4602.2,35.27GB,0.0093,0.1257,0.0467,0.4021 +parallel,19:32:33.073,3540,0.0161,0.0252,1.98e-05,0.4838,0.5635,0.1273,1.448,4461.7,35.27GB,0.0141,0.1912,0.0712,0.3321 +parallel,19:32:40.018,3550,0.0330,0.0205,1.98e-05,0.3967,0.5749,0.1196,1.440,4478.7,35.27GB,0.0086,0.1317,0.0489,0.3646 +parallel,19:32:46.452,3560,0.0231,0.0169,1.97e-05,0.4770,0.5413,0.1021,1.555,4142.7,35.27GB,0.0216,0.1530,0.1088,0.4076 +parallel,19:32:53.625,3570,0.0169,0.0237,1.97e-05,0.5681,0.5913,0.1260,1.394,4611.8,35.27GB,0.0109,0.1232,0.0598,0.2965 +parallel,19:33:01.211,3580,0.0296,0.0261,1.97e-05,0.4917,0.6215,0.1370,1.319,4868.9,35.27GB,0.0157,0.1541,0.0735,0.4130 +parallel,19:33:08.659,3590,0.0198,0.0256,1.96e-05,0.5846,0.5885,0.1563,1.343,4773.7,35.27GB,0.0214,0.1859,0.1112,0.4917 +parallel,19:33:15.094,3600,0.0129,0.0251,1.96e-05,0.5740,0.5494,0.0940,1.554,4117.5,35.27GB,0.0282,0.2098,0.1501,0.4786 +parallel,19:33:21.507,3610,0.0077,0.0190,1.96e-05,0.4358,0.5295,0.1119,1.559,4098.2,35.27GB,0.0180,0.1762,0.0953,0.4759 +parallel,19:33:27.708,3620,0.0167,0.0163,1.95e-05,0.5419,0.5425,0.0775,1.613,3955.4,35.27GB,0.0106,0.1153,0.0532,0.3426 +parallel,19:33:35.286,3630,0.0066,0.0182,1.95e-05,0.5573,0.6485,0.1094,1.320,4827.0,35.27GB,0.0166,0.1491,0.0762,0.9022 +parallel,19:33:42.257,3640,0.0159,0.0176,1.95e-05,0.5065,0.5534,0.1436,1.436,4430.2,35.27GB,0.0105,0.1109,0.0551,0.3667 +parallel,19:33:49.080,3650,0.0184,0.0172,1.94e-05,0.4466,0.5745,0.1079,1.466,4332.0,35.27GB,0.0139,0.1293,0.0714,0.4178 +parallel,19:33:55.274,3660,0.0498,0.0209,1.94e-05,0.5216,0.5204,0.0991,1.615,3926.6,35.27GB,0.0246,0.1917,0.1282,0.5223 +parallel,19:34:01.259,3670,0.0059,0.0132,1.94e-05,0.4846,0.5179,0.0805,1.671,3787.8,35.27GB,0.0102,0.1072,0.0471,0.2792 +parallel,19:34:07.589,3680,0.0155,0.0153,1.93e-05,0.5026,0.5542,0.0788,1.580,4000.0,35.27GB,0.0150,0.1275,0.0771,0.4339 +parallel,19:34:13.379,3690,0.0150,0.0153,1.93e-05,0.5147,0.5008,0.0781,1.728,3652.6,35.27GB,0.0269,0.1595,0.1438,0.4751 +parallel,19:34:20.409,3700,0.0120,0.0219,1.93e-05,0.5938,0.5708,0.1323,1.423,4428.6,35.27GB,0.0174,0.1510,0.0887,0.5320 +parallel,19:34:29.723,3710,0.0077,0.0245,1.93e-05,0.5707,0.7266,0.2047,1.074,5857.2,35.27GB,0.0258,0.1860,0.1269,0.5295 +parallel,19:34:47.575,3720,0.0140,0.0187,1.92e-05,0.6655,0.9904,0.7949,0.563,11154.4,35.27GB,0.0155,0.1457,0.0793,0.4085 +parallel,19:35:01.007,3730,0.0051,0.0240,1.92e-05,0.5209,0.9184,0.4248,0.745,8412.3,35.27GB,0.0096,0.1329,0.0500,0.3340 +parallel,19:35:14.740,3740,0.0133,0.0182,1.92e-05,0.4999,0.9420,0.4312,0.729,8582.7,35.27GB,0.0089,0.0976,0.0445,0.3919 +parallel,19:35:27.509,3750,0.0246,0.0236,1.91e-05,0.4955,0.8424,0.4345,0.784,7968.4,35.27GB,0.0140,0.1117,0.0723,0.3719 +parallel,19:35:39.675,3760,0.0065,0.0198,1.91e-05,0.5645,0.7735,0.4432,0.824,7571.7,35.27GB,0.0260,0.1407,0.1211,0.7603 +parallel,19:35:50.159,3770,0.0026,0.0164,1.91e-05,0.4734,0.7571,0.2912,0.955,6523.4,35.27GB,0.0108,0.1022,0.0559,0.4228 +parallel,19:36:04.911,3780,0.0169,0.0134,1.90e-05,0.4545,0.9552,0.5201,0.679,9161.8,35.27GB,0.0162,0.1345,0.0840,0.5242 +parallel,19:36:13.057,3790,0.0185,0.0168,1.90e-05,0.5383,0.6096,0.2050,1.230,5050.0,35.27GB,0.0147,0.1406,0.0770,0.6834 +parallel,19:36:19.469,3800,0.0266,0.0140,1.90e-05,0.4478,0.5606,0.0806,1.560,3975.0,35.27GB,0.0173,0.1533,0.0868,0.7889 +parallel,19:36:24.999,3810,0.0065,0.0140,1.89e-05,0.4836,0.4808,0.0721,1.809,3421.8,35.27GB,0.0120,0.1148,0.0525,0.3556 +parallel,19:36:30.784,3820,0.0201,0.0184,1.89e-05,0.5090,0.5108,0.0677,1.729,3574.6,35.27GB,0.0107,0.1181,0.0585,0.3203 +parallel,19:36:37.891,3830,0.0415,0.0193,1.89e-05,0.5542,0.5902,0.1205,1.408,4381.5,35.27GB,0.0289,0.1853,0.1542,0.7543 +parallel,19:36:44.179,3840,0.0143,0.0235,1.88e-05,0.4907,0.5287,0.1002,1.591,3873.0,35.27GB,0.0109,0.1110,0.0551,0.3751 +parallel,19:36:50.635,3850,0.0075,0.0143,1.88e-05,0.5058,0.5435,0.1022,1.551,3965.3,35.27GB,0.0164,0.1500,0.0782,0.4467 +parallel,19:36:57.843,3860,0.0179,0.0226,1.88e-05,0.5094,0.5709,0.1499,1.388,4424.1,35.27GB,0.0184,0.1243,0.0951,0.4017 +parallel,19:37:04.685,3870,0.0205,0.0184,1.87e-05,0.4762,0.5770,0.1072,1.462,4193.3,35.27GB,0.0130,0.0978,0.0673,0.4341 +parallel,19:37:11.768,3880,0.0656,0.0226,1.87e-05,0.4720,0.6031,0.1052,1.413,4329.8,35.27GB,0.0182,0.2368,0.0961,0.6105 +parallel,19:37:18.377,3890,0.0123,0.0157,1.87e-05,0.4578,0.5626,0.0984,1.514,4036.2,35.27GB,0.0101,0.0944,0.0525,0.3850 +parallel,19:37:24.467,3900,0.0089,0.0186,1.86e-05,0.4887,0.5243,0.0846,1.642,3714.1,35.27GB,0.0065,0.1095,0.0345,0.3134 +parallel,19:37:30.740,3910,0.0130,0.0174,1.86e-05,0.5049,0.5261,0.1012,1.596,3816.0,35.27GB,0.0144,0.1621,0.0800,0.5360 +parallel,19:37:37.028,3920,0.0196,0.0225,1.86e-05,0.4606,0.5492,0.0796,1.591,3822.4,35.27GB,0.0217,0.1577,0.1088,0.4484 +parallel,19:37:43.655,3930,0.0203,0.0172,1.85e-05,0.4946,0.5778,0.0849,1.509,4022.1,35.27GB,0.0208,0.2035,0.1147,0.5402 +parallel,19:37:49.555,3940,0.0120,0.0150,1.85e-05,0.4519,0.5137,0.0763,1.695,3574.6,35.27GB,0.0142,0.1541,0.0739,0.4194 +parallel,19:37:56.327,3950,0.0072,0.0157,1.85e-05,0.5591,0.5765,0.1008,1.477,4096.9,35.27GB,0.0282,0.2086,0.1647,0.4795 +parallel,19:38:02.456,3960,0.0182,0.0169,1.84e-05,0.4802,0.5210,0.0918,1.632,3701.3,35.27GB,0.0064,0.0838,0.0353,0.6214 +parallel,19:38:08.739,3970,0.0139,0.0179,1.84e-05,0.5350,0.5366,0.0918,1.592,3788.8,35.27GB,0.0114,0.1256,0.0584,0.5397 +parallel,19:38:15.338,3980,0.0068,0.0154,1.84e-05,0.5242,0.5558,0.1040,1.516,3971.3,35.27GB,0.0275,0.1650,0.1419,0.4527 +parallel,19:38:20.898,3990,0.0364,0.0162,1.83e-05,0.4366,0.4750,0.0810,1.799,3341.5,35.27GB,0.0192,0.1610,0.0964,0.6069 +parallel,19:38:27.254,4000,0.0278,0.0177,1.83e-05,0.4481,0.5487,0.0869,1.574,3812.9,35.27GB,0.0157,0.1730,0.0796,0.3729 +parallel,19:39:58.978,4010,0.0053,0.0222,1.83e-05,0.5485,0.5233,8.6491,0.109,54941.8,35.27GB,0.0155,0.1828,0.0853,0.4442 +parallel,19:40:05.005,4020,0.0154,0.0179,1.82e-05,0.5379,0.5242,0.0785,1.660,3603.3,35.27GB,0.0294,0.1911,0.1562,0.5652 +parallel,19:40:11.221,4030,0.0201,0.0261,1.82e-05,0.4950,0.5520,0.0696,1.609,3710.3,35.27GB,0.0126,0.1067,0.0576,0.3247 +parallel,19:40:16.796,4040,0.0180,0.0192,1.82e-05,0.4443,0.4838,0.0738,1.794,3322.2,35.27GB,0.0113,0.1386,0.0592,0.4986 +parallel,19:40:22.525,4050,0.0242,0.0168,1.81e-05,0.4103,0.5116,0.0613,1.746,3408.2,35.27GB,0.0068,0.0900,0.0371,0.4040 +parallel,19:40:27.869,4060,0.0168,0.0158,1.81e-05,0.3952,0.4687,0.0658,1.871,3174.3,35.27GB,0.0100,0.1242,0.0502,0.4295 +parallel,19:40:33.759,4070,0.0108,0.0168,1.80e-05,0.4887,0.5069,0.0821,1.698,3492.2,35.27GB,0.0165,0.1459,0.0859,0.3483 +parallel,19:40:39.758,4080,0.0215,0.0176,1.80e-05,0.5152,0.5356,0.0643,1.667,3551.0,35.27GB,0.0050,0.0684,0.0265,0.2937 +parallel,19:40:45.194,4090,0.0304,0.0150,1.80e-05,0.5281,0.4770,0.0666,1.840,3211.9,35.27GB,0.0103,0.1361,0.0517,0.5351 +parallel,19:40:51.018,4100,0.0085,0.0224,1.79e-05,0.4852,0.5070,0.0754,1.717,3435.7,35.27GB,0.0129,0.1208,0.0643,0.4796 +parallel,19:40:56.992,4110,0.0145,0.0173,1.79e-05,0.4305,0.5255,0.0719,1.674,3518.2,35.27GB,0.0128,0.1644,0.0694,0.3924 +parallel,19:41:02.445,4120,0.0097,0.0222,1.79e-05,0.5813,0.4833,0.0620,1.834,3205.6,35.27GB,0.0101,0.1176,0.0468,0.2835 +parallel,19:41:08.658,4130,0.0168,0.0171,1.78e-05,0.4504,0.5585,0.0628,1.610,3646.3,35.27GB,0.0136,0.1207,0.0652,0.3941 +parallel,19:41:14.040,4140,0.0357,0.0178,1.78e-05,0.4664,0.4752,0.0630,1.858,3153.6,35.27GB,0.0280,0.2227,0.1651,0.5636 +parallel,19:41:19.608,4150,0.0228,0.0148,1.78e-05,0.4062,0.4935,0.0633,1.796,3256.8,35.27GB,0.0189,0.1047,0.0989,0.3410 +parallel,19:41:25.214,4160,0.0183,0.0240,1.77e-05,0.4834,0.4909,0.0697,1.784,3273.3,35.27GB,0.0121,0.1403,0.0638,0.5958 +parallel,19:41:30.994,4170,0.0342,0.0261,1.77e-05,0.5859,0.5042,0.0738,1.730,3369.1,35.27GB,0.0183,0.1746,0.0911,0.5511 +parallel,19:41:37.416,4180,0.0115,0.0210,1.77e-05,0.5103,0.5544,0.0878,1.557,3736.8,35.27GB,0.0118,0.1494,0.0556,0.5573 +parallel,19:41:43.473,4190,0.0238,0.0168,1.76e-05,0.4847,0.5208,0.0849,1.651,3518.3,35.27GB,0.0110,0.0975,0.0548,0.4618 +parallel,19:41:50.302,4200,0.0041,0.0145,1.76e-05,0.4777,0.5769,0.1060,1.465,3960.2,35.27GB,0.0213,0.1693,0.1079,0.5339 +parallel,19:41:56.960,4210,0.0181,0.0164,1.76e-05,0.4864,0.5632,0.1026,1.502,3854.4,35.27GB,0.0096,0.1011,0.0492,0.4510 +parallel,19:42:03.383,4220,0.0067,0.0161,1.75e-05,0.4645,0.5478,0.0946,1.557,3712.2,35.27GB,0.0169,0.1524,0.0883,0.4469 +parallel,19:42:09.447,4230,0.0128,0.0162,1.75e-05,0.4183,0.5403,0.0660,1.649,3498.1,35.27GB,0.0164,0.1538,0.0890,0.3284 +parallel,19:42:14.847,4240,0.0149,0.0159,1.75e-05,0.4279,0.4737,0.0662,1.852,3109.6,35.27GB,0.0303,0.1964,0.1568,0.4887 +parallel,19:42:21.060,4250,0.0192,0.0144,1.74e-05,0.4727,0.5377,0.0836,1.610,3572.1,35.27GB,0.0080,0.1208,0.0395,0.4261 +parallel,19:42:26.815,4260,0.0119,0.0144,1.74e-05,0.4884,0.4945,0.0811,1.738,3303.0,35.27GB,0.0088,0.0754,0.0440,0.3808 +parallel,19:42:32.849,4270,0.0155,0.0146,1.74e-05,0.4907,0.5250,0.0785,1.658,3456.9,35.27GB,0.0187,0.1499,0.0909,0.3667 +parallel,19:42:38.888,4280,0.0078,0.0137,1.73e-05,0.4088,0.5346,0.0693,1.656,3453.2,35.27GB,0.0306,0.1631,0.1796,0.4660 +parallel,19:42:45.433,4290,0.0107,0.0123,1.73e-05,0.5055,0.5341,0.1204,1.528,3736.8,35.27GB,0.0074,0.1107,0.0394,0.4207 +parallel,19:42:52.117,4300,0.1242,0.0296,1.72e-05,0.4954,0.5589,0.1096,1.496,3809.7,35.27GB,0.0306,0.2490,0.1709,0.5197 +parallel,19:42:59.042,4310,0.0022,0.0178,1.72e-05,0.5244,0.5480,0.1446,1.444,3939.8,35.27GB,0.0132,0.1330,0.0691,0.3427 +parallel,19:43:05.694,4320,0.0401,0.0196,1.72e-05,0.4836,0.5565,0.1086,1.504,3777.3,35.27GB,0.0253,0.1902,0.1262,0.5798 +parallel,19:43:12.398,4330,0.0189,0.0213,1.71e-05,0.4846,0.5771,0.0934,1.492,3801.1,35.27GB,0.0112,0.1011,0.0570,0.4831 +parallel,19:43:19.338,4340,0.0095,0.0132,1.71e-05,0.4636,0.5753,0.1188,1.441,3927.5,35.27GB,0.0182,0.1352,0.0990,0.3477 +parallel,19:43:27.501,4350,0.0123,0.0139,1.71e-05,0.4402,0.6485,0.1678,1.225,4611.8,35.27GB,0.0078,0.1110,0.0444,0.4434 +parallel,19:43:34.645,4360,0.0112,0.0182,1.70e-05,0.4808,0.5749,0.1394,1.400,4028.3,35.27GB,0.0121,0.1457,0.0603,0.5140 +parallel,19:43:41.453,4370,0.0211,0.0139,1.70e-05,0.4925,0.5740,0.1068,1.469,3832.0,35.27GB,0.0241,0.2179,0.1301,0.5978 +parallel,19:43:48.967,4380,0.0120,0.0181,1.70e-05,0.5055,0.6183,0.1331,1.331,4222.7,35.27GB,0.0104,0.0921,0.0532,0.3694 +parallel,19:43:55.523,4390,0.0307,0.0168,1.69e-05,0.4103,0.5370,0.1187,1.525,3677.5,35.27GB,0.0081,0.0903,0.0430,0.4015 +parallel,19:44:04.308,4400,0.0095,0.0140,1.69e-05,0.4892,0.6604,0.2181,1.139,4918.6,35.27GB,0.0148,0.1079,0.0727,0.3535 +parallel,19:44:12.854,4410,0.0026,0.0117,1.69e-05,0.4319,0.6559,0.1987,1.170,4776.9,35.27GB,0.0132,0.1559,0.0682,0.3881 +parallel,19:44:19.776,4420,0.0073,0.0146,1.68e-05,0.4410,0.5564,0.1358,1.445,3861.7,35.27GB,0.0101,0.1191,0.0473,0.3218 +parallel,19:44:27.285,4430,0.0091,0.0126,1.68e-05,0.4072,0.6030,0.1479,1.332,4182.1,35.27GB,0.0086,0.0949,0.0451,0.3705 +parallel,19:44:35.082,4440,0.0103,0.0129,1.67e-05,0.4057,0.6150,0.1647,1.283,4334.3,35.27GB,0.0065,0.0924,0.0296,0.3068 +parallel,19:44:43.367,4450,0.0063,0.0134,1.67e-05,0.4093,0.6369,0.1916,1.207,4597.5,35.27GB,0.0087,0.0826,0.0428,0.2996 +parallel,19:44:51.635,4460,0.0058,0.0108,1.67e-05,0.3703,0.6409,0.1859,1.210,4580.2,35.27GB,0.0054,0.0672,0.0276,0.2277 +parallel,19:44:59.668,4470,0.0149,0.0133,1.66e-05,0.5143,0.6361,0.1672,1.245,4441.7,35.27GB,0.0278,0.1789,0.1329,0.4627 +parallel,19:45:08.055,4480,0.0118,0.0169,1.66e-05,0.5111,0.6752,0.1635,1.193,4628.6,35.27GB,0.0085,0.1233,0.0427,0.4176 +parallel,19:45:15.122,4490,0.0291,0.0196,1.66e-05,0.4771,0.5722,0.1345,1.415,3893.1,35.27GB,0.0070,0.0863,0.0368,0.4087 +parallel,19:45:21.680,4500,0.0090,0.0158,1.65e-05,0.4425,0.5544,0.1014,1.525,3606.0,35.27GB,0.0117,0.1076,0.0621,0.4395 +parallel,19:45:27.999,4510,0.0337,0.0220,1.65e-05,0.4676,0.5202,0.1118,1.583,3468.6,35.27GB,0.0101,0.1262,0.0539,0.4451 +parallel,19:45:34.104,4520,0.0262,0.0174,1.65e-05,0.5135,0.5261,0.0844,1.638,3344.9,35.27GB,0.0127,0.1878,0.0736,0.4307 +parallel,19:45:40.802,4530,0.0139,0.0172,1.64e-05,0.4543,0.5793,0.0905,1.493,3663.4,35.27GB,0.0141,0.1312,0.0729,0.3869 +parallel,19:45:46.778,4540,0.0052,0.0140,1.64e-05,0.4322,0.5115,0.0861,1.674,3262.0,35.27GB,0.0106,0.0954,0.0508,0.2833 +parallel,19:45:52.879,4550,0.0142,0.0159,1.64e-05,0.4521,0.5265,0.0836,1.639,3324.4,35.27GB,0.0072,0.0890,0.0408,0.3583 +parallel,19:45:59.635,4560,0.0417,0.0144,1.63e-05,0.3850,0.5497,0.1260,1.480,3675.0,35.27GB,0.0180,0.1682,0.0958,0.4570 +parallel,19:46:06.852,4570,0.0033,0.0111,1.63e-05,0.4749,0.5923,0.1294,1.386,3918.4,35.27GB,0.0085,0.1081,0.0419,0.3158 +parallel,19:46:13.729,4580,0.0328,0.0117,1.62e-05,0.4758,0.5911,0.0965,1.454,3726.5,35.27GB,0.0139,0.1615,0.0734,0.4145 +parallel,19:46:20.041,4590,0.0079,0.0135,1.62e-05,0.4231,0.5345,0.0967,1.585,3414.0,35.27GB,0.0181,0.1292,0.0850,0.4281 +parallel,19:46:26.475,4600,0.0079,0.0129,1.62e-05,0.4875,0.5486,0.0948,1.555,3473.7,35.27GB,0.0114,0.1062,0.0612,0.3775 +parallel,19:46:32.691,4610,0.0120,0.0131,1.61e-05,0.4203,0.5248,0.0968,1.609,3349.9,35.27GB,0.0228,0.1572,0.1235,0.3692 +parallel,19:46:39.051,4620,0.0191,0.0221,1.61e-05,0.4678,0.5415,0.0946,1.572,3421.5,35.27GB,0.0197,0.1640,0.1064,0.4498 +parallel,19:46:46.064,4630,0.0245,0.0161,1.61e-05,0.4969,0.5822,0.1191,1.426,3765.1,35.27GB,0.0218,0.1371,0.1184,0.4929 +parallel,19:46:53.044,4640,0.0016,0.0138,1.60e-05,0.4928,0.5628,0.1352,1.433,3740.9,35.27GB,0.0173,0.1133,0.0875,0.7222 +parallel,19:47:00.306,4650,0.0113,0.0147,1.60e-05,0.3868,0.5944,0.1317,1.377,3884.3,35.27GB,0.0142,0.1123,0.0739,0.3747 +parallel,19:47:07.474,4660,0.0067,0.0218,1.60e-05,0.5171,0.5824,0.1344,1.395,3827.3,35.27GB,0.0117,0.1906,0.0621,0.3738 +parallel,19:47:15.383,4670,0.0106,0.0166,1.59e-05,0.5357,0.6164,0.1745,1.264,4215.2,35.27GB,0.0070,0.0659,0.0355,0.4724 +parallel,19:47:22.693,4680,0.0082,0.0153,1.59e-05,0.4497,0.6077,0.1233,1.368,3888.3,35.27GB,0.0098,0.0902,0.0472,0.2997 +parallel,19:47:28.819,4690,0.0060,0.0139,1.58e-05,0.4438,0.5166,0.0960,1.633,3252.2,35.27GB,0.0068,0.0683,0.0308,0.2135 +parallel,19:47:34.486,4700,0.0126,0.0144,1.58e-05,0.5223,0.4992,0.0675,1.765,3003.0,35.27GB,0.0164,0.1650,0.0811,0.3610 +parallel,19:47:40.014,4710,0.0052,0.0212,1.58e-05,0.5550,0.4760,0.0768,1.810,2923.4,35.27GB,0.0071,0.0873,0.0363,0.3036 +parallel,19:47:45.866,4720,0.0247,0.0157,1.57e-05,0.4606,0.5071,0.0781,1.709,3089.4,35.27GB,0.0088,0.1457,0.0474,0.3627 +parallel,19:47:53.080,4730,0.0238,0.0139,1.57e-05,0.4622,0.6043,0.1172,1.386,3801.4,35.27GB,0.0127,0.1151,0.0655,0.6848 +parallel,19:48:00.765,4740,0.0125,0.0126,1.57e-05,0.4057,0.6381,0.1304,1.301,4041.7,35.27GB,0.0082,0.0977,0.0379,0.2711 +parallel,19:48:07.793,4750,0.0127,0.0174,1.56e-05,0.4630,0.5892,0.1135,1.423,3688.8,35.27GB,0.0266,0.2214,0.1425,0.4773 +parallel,19:48:14.360,4760,0.0215,0.0173,1.56e-05,0.4783,0.5472,0.1096,1.523,3440.9,35.27GB,0.0076,0.1012,0.0414,0.4656 +parallel,19:48:21.900,4770,0.0080,0.0197,1.55e-05,0.5290,0.5885,0.1655,1.326,3942.9,35.27GB,0.0216,0.1561,0.1177,0.7397 +parallel,19:48:29.415,4780,0.0061,0.0153,1.55e-05,0.4777,0.6065,0.1451,1.331,3922.6,35.27GB,0.0147,0.1383,0.0729,0.3592 +parallel,19:48:35.891,4790,0.0243,0.0158,1.55e-05,0.5400,0.5345,0.1130,1.545,3372.8,35.27GB,0.0278,0.2647,0.1527,0.7109 +parallel,19:48:42.755,4800,0.0063,0.0149,1.54e-05,0.4466,0.5558,0.1306,1.457,3568.5,35.27GB,0.0218,0.1456,0.1179,0.4484 +parallel,19:48:50.066,4810,0.0395,0.0163,1.54e-05,0.4655,0.5990,0.1322,1.368,3794.0,35.27GB,0.0333,0.1727,0.1735,0.6395 +parallel,19:48:57.447,4820,0.0099,0.0154,1.54e-05,0.4734,0.5890,0.1491,1.355,3822.7,35.27GB,0.0238,0.1482,0.1325,0.4909 +parallel,19:49:05.212,4830,0.0056,0.0125,1.53e-05,0.3925,0.6288,0.1477,1.288,4013.8,35.27GB,0.0069,0.1049,0.0337,0.3722 +parallel,19:49:13.024,4840,0.0113,0.0132,1.53e-05,0.4493,0.6096,0.1717,1.280,4030.8,35.27GB,0.0164,0.1163,0.0759,0.4806 +parallel,19:49:20.565,4850,0.0252,0.0176,1.53e-05,0.4631,0.6103,0.1437,1.326,3882.7,35.27GB,0.0234,0.1871,0.1261,0.5358 +parallel,19:49:29.635,4860,0.0185,0.0161,1.52e-05,0.4854,0.6539,0.2531,1.103,4661.3,35.27GB,0.0307,0.2332,0.1569,0.5534 +parallel,19:49:36.315,4870,0.0235,0.0155,1.52e-05,0.4740,0.5460,0.1219,1.497,3425.9,35.27GB,0.0073,0.0901,0.0367,0.3930 +parallel,19:49:42.882,4880,0.0089,0.0124,1.51e-05,0.4783,0.5645,0.0922,1.523,3361.9,35.27GB,0.0205,0.1801,0.1023,0.5084 +parallel,19:49:49.690,4890,0.0052,0.0148,1.51e-05,0.4450,0.5534,0.1273,1.469,3478.6,35.27GB,0.0188,0.1354,0.0971,0.3906 +parallel,19:49:56.343,4900,0.0120,0.0139,1.51e-05,0.4514,0.5582,0.1070,1.503,3392.4,35.27GB,0.0081,0.0948,0.0431,0.3239 +parallel,19:50:01.813,4910,0.0152,0.0147,1.50e-05,0.4941,0.4765,0.0705,1.828,2783.9,35.27GB,0.0220,0.2131,0.1207,0.5698 +parallel,19:50:07.839,4920,0.0155,0.0135,1.50e-05,0.5434,0.5088,0.0937,1.660,3060.5,35.27GB,0.0174,0.1656,0.0991,0.5532 +parallel,19:50:14.524,4930,0.0122,0.0129,1.50e-05,0.4820,0.5506,0.1179,1.496,3389.1,35.27GB,0.0235,0.1188,0.1222,0.4422 +parallel,19:50:20.489,4940,0.0063,0.0171,1.49e-05,0.4619,0.5009,0.0956,1.677,3017.8,35.27GB,0.0158,0.1212,0.0808,0.3971 +parallel,19:50:26.255,4950,0.0098,0.0156,1.49e-05,0.4522,0.5022,0.0744,1.735,2911.3,35.27GB,0.0340,0.2454,0.1684,0.6480 +parallel,19:50:31.963,4960,0.0479,0.0162,1.48e-05,0.4660,0.4979,0.0729,1.752,2876.6,35.27GB,0.0227,0.1413,0.1231,0.3280 +parallel,19:50:37.652,4970,0.0122,0.0129,1.48e-05,0.3873,0.4916,0.0773,1.758,2860.9,35.27GB,0.0064,0.1030,0.0336,0.2945 +parallel,19:50:44.320,4980,0.0261,0.0142,1.48e-05,0.4363,0.5798,0.0869,1.500,3346.6,35.27GB,0.0158,0.1824,0.0826,0.5360 +parallel,19:50:49.974,4990,0.0033,0.0211,1.47e-05,0.5321,0.4798,0.0856,1.769,2832.2,35.27GB,0.0146,0.1254,0.0779,0.3586 +parallel,19:50:55.815,5000,0.0043,0.0159,1.47e-05,0.4850,0.5183,0.0658,1.712,2920.0,35.27GB,0.0105,0.1454,0.0568,1.0533 +parallel,19:52:17.774,5010,0.0166,0.0141,1.47e-05,0.4803,0.5275,7.6684,0.122,40896.9,35.27GB,0.0176,0.1185,0.0891,0.3338 +parallel,19:52:23.323,5020,0.0053,0.0101,1.46e-05,0.4017,0.4926,0.0623,1.803,2762.8,35.27GB,0.0133,0.1345,0.0705,0.3269 +parallel,19:52:29.218,5030,0.0050,0.0110,1.46e-05,0.4569,0.5277,0.0619,1.696,2929.8,35.27GB,0.0119,0.1080,0.0629,0.3817 +parallel,19:52:34.743,5040,0.0065,0.0109,1.46e-05,0.6825,0.4892,0.0633,1.810,2739.8,35.27GB,0.0291,0.2425,0.1705,0.5690 +parallel,19:52:40.524,5050,0.0128,0.0121,1.45e-05,0.4927,0.5085,0.0696,1.730,2861.0,35.27GB,0.0146,0.1488,0.0782,0.4246 +parallel,19:52:46.048,5060,0.0138,0.0117,1.45e-05,0.4766,0.4785,0.0741,1.810,2728.6,35.27GB,0.0066,0.0789,0.0337,0.2994 +parallel,19:52:52.012,5070,0.0062,0.0169,1.44e-05,0.5921,0.5274,0.0688,1.677,2939.7,35.27GB,0.0085,0.0814,0.0421,0.3189 +parallel,19:52:58.218,5080,0.0157,0.0169,1.44e-05,0.4459,0.5587,0.0619,1.612,3053.0,35.27GB,0.0177,0.1439,0.0962,0.4195 +parallel,19:53:03.931,5090,0.0103,0.0121,1.44e-05,0.4865,0.4947,0.0766,1.751,2804.5,35.27GB,0.0135,0.1443,0.0716,0.4298 +parallel,19:53:10.824,5100,0.0017,0.0113,1.43e-05,0.4242,0.5944,0.0950,1.451,3377.5,35.27GB,0.0088,0.0899,0.0428,0.3084 +parallel,19:53:17.098,5110,0.0085,0.0155,1.43e-05,0.4784,0.5318,0.0956,1.594,3067.1,35.27GB,0.0069,0.0916,0.0350,0.3694 +parallel,19:53:23.190,5120,0.0029,0.0147,1.43e-05,0.4556,0.5305,0.0788,1.642,2972.6,35.27GB,0.0119,0.1078,0.0616,0.2550 +parallel,19:53:29.263,5130,0.0173,0.0132,1.42e-05,0.4076,0.5487,0.0586,1.647,2956.7,35.27GB,0.0112,0.1184,0.0615,0.3449 +parallel,19:53:34.733,5140,0.0151,0.0135,1.42e-05,0.3875,0.4872,0.0599,1.829,2657.9,35.27GB,0.0122,0.1056,0.0633,0.4365 +parallel,19:53:40.712,5150,0.0239,0.0168,1.41e-05,0.4609,0.5341,0.0637,1.673,2899.4,35.27GB,0.0082,0.1034,0.0451,0.2669 +parallel,19:53:46.181,5160,0.0079,0.0159,1.41e-05,0.5088,0.4840,0.0629,1.829,2646.6,35.27GB,0.0186,0.1910,0.1007,0.7121 +parallel,19:53:52.085,5170,0.0094,0.0144,1.41e-05,0.4576,0.5287,0.0617,1.694,2851.1,35.27GB,0.0326,0.1736,0.1671,0.4935 +parallel,19:53:58.286,5180,0.0067,0.0130,1.40e-05,0.4678,0.5520,0.0681,1.613,2988.2,35.27GB,0.0076,0.0843,0.0390,0.3995 +parallel,19:54:03.664,5190,0.0055,0.0099,1.40e-05,0.4773,0.4759,0.0619,1.860,2586.6,35.27GB,0.0119,0.1129,0.0587,0.5580 +parallel,19:54:09.604,5200,0.0138,0.0138,1.40e-05,0.4301,0.5225,0.0715,1.684,2850.6,35.27GB,0.0069,0.0856,0.0348,0.5473 +parallel,19:54:15.042,5210,0.0247,0.0125,1.39e-05,0.4910,0.4777,0.0661,1.839,2604.3,35.27GB,0.0329,0.1662,0.1724,0.4763 +parallel,19:54:20.864,5220,0.0125,0.0198,1.39e-05,0.4233,0.5180,0.0642,1.718,2782.2,35.27GB,0.0347,0.1666,0.1804,0.5635 +parallel,19:54:26.958,5230,0.0303,0.0154,1.38e-05,0.4487,0.5466,0.0628,1.641,2906.3,35.27GB,0.0161,0.1727,0.0858,0.3647 +parallel,19:54:32.352,5240,0.0099,0.0113,1.38e-05,0.4021,0.4772,0.0623,1.854,2567.1,35.27GB,0.0118,0.1208,0.0591,0.6181 +parallel,19:54:38.258,5250,0.0137,0.0218,1.38e-05,0.5252,0.5215,0.0690,1.694,2804.8,35.27GB,0.0155,0.1670,0.0788,0.5076 +parallel,19:54:43.732,5260,0.0012,0.0183,1.37e-05,0.4492,0.4774,0.0700,1.827,2594.3,35.27GB,0.0257,0.1336,0.1339,0.3515 +parallel,19:54:49.323,5270,0.0155,0.0181,1.37e-05,0.4671,0.4938,0.0653,1.789,2644.2,35.27GB,0.0098,0.0970,0.0519,0.4813 +parallel,19:54:55.226,5280,0.0281,0.0141,1.37e-05,0.4226,0.5228,0.0675,1.695,2785.3,35.27GB,0.0252,0.2255,0.1299,0.6235 +parallel,19:55:00.745,5290,0.0032,0.0107,1.36e-05,0.4777,0.4884,0.0637,1.812,2598.7,35.27GB,0.0086,0.1072,0.0478,0.3140 +parallel,19:55:06.414,5300,0.0084,0.0100,1.36e-05,0.4797,0.5001,0.0666,1.764,2664.4,35.27GB,0.0234,0.1639,0.1178,0.5322 +parallel,19:55:12.171,5310,0.0046,0.0089,1.35e-05,0.4482,0.5100,0.0658,1.738,2699.1,35.27GB,0.0100,0.0906,0.0507,0.3776 +parallel,19:55:17.623,5320,0.0167,0.0138,1.35e-05,0.5608,0.4762,0.0690,1.834,2551.5,35.27GB,0.0163,0.1290,0.0860,0.6403 +parallel,19:55:23.835,5330,0.0027,0.0109,1.35e-05,0.4413,0.5360,0.0852,1.610,2900.4,35.27GB,0.0079,0.0816,0.0414,0.2829 +parallel,19:55:29.288,5340,0.0152,0.0198,1.34e-05,0.4887,0.4792,0.0662,1.834,2540.7,35.27GB,0.0099,0.1084,0.0485,0.3125 +parallel,19:55:34.861,5350,0.0049,0.0152,1.34e-05,0.4995,0.4919,0.0655,1.795,2590.9,35.27GB,0.0119,0.1081,0.0584,0.4060 +parallel,19:55:41.037,5360,0.0036,0.0156,1.34e-05,0.4299,0.5448,0.0727,1.619,2865.1,35.27GB,0.0130,0.1130,0.0718,0.2874 +parallel,19:55:46.862,5370,0.0020,0.0105,1.33e-05,0.4003,0.5167,0.0660,1.717,2696.9,35.27GB,0.0061,0.0981,0.0294,0.3222 +parallel,19:55:52.949,5380,0.0115,0.0159,1.33e-05,0.4653,0.5413,0.0673,1.643,2811.7,35.27GB,0.0240,0.1844,0.1313,0.4737 +parallel,19:55:58.665,5390,0.0050,0.0157,1.32e-05,0.4412,0.5061,0.0654,1.750,2634.4,35.27GB,0.0092,0.0853,0.0444,0.3891 +parallel,19:56:04.443,5400,0.0133,0.0129,1.32e-05,0.4885,0.5135,0.0644,1.731,2657.6,35.27GB,0.0114,0.1068,0.0540,0.3084 +parallel,19:56:10.043,5410,0.0074,0.0109,1.32e-05,0.4449,0.4940,0.0661,1.786,2570.3,35.27GB,0.0036,0.0527,0.0183,0.1962 +parallel,19:56:15.655,5420,0.0143,0.0117,1.31e-05,0.4037,0.4978,0.0634,1.782,2569.9,35.27GB,0.0063,0.0941,0.0325,0.3491 +parallel,19:56:21.718,5430,0.0182,0.0112,1.31e-05,0.4656,0.5411,0.0652,1.650,2770.3,35.27GB,0.0168,0.1452,0.0865,0.4526 +parallel,19:56:27.399,5440,0.0044,0.0129,1.31e-05,0.4721,0.5040,0.0641,1.761,2589.9,35.27GB,0.0088,0.0937,0.0493,0.4096 +parallel,19:56:33.122,5450,0.0052,0.0102,1.30e-05,0.5410,0.5057,0.0667,1.747,2603.9,35.27GB,0.0148,0.1554,0.0758,0.5638 +parallel,19:56:38.822,5460,0.0025,0.0138,1.30e-05,0.4311,0.4919,0.0781,1.755,2587.2,35.27GB,0.0182,0.1774,0.0914,0.4294 +parallel,19:56:44.558,5470,0.0198,0.0135,1.30e-05,0.4966,0.5103,0.0633,1.744,2598.0,35.27GB,0.0384,0.2529,0.2322,0.6447 +parallel,19:56:50.891,5480,0.0053,0.0161,1.29e-05,0.4989,0.5449,0.0884,1.579,2861.8,35.27GB,0.0128,0.1097,0.0636,0.3542 +parallel,19:56:57.660,5490,0.0042,0.0138,1.29e-05,0.4332,0.5552,0.1218,1.477,3052.7,35.27GB,0.0052,0.0604,0.0233,0.1753 +parallel,19:57:04.821,5500,0.0041,0.0113,1.28e-05,0.3669,0.5770,0.1391,1.397,3221.6,35.27GB,0.0074,0.0906,0.0362,0.3890 +parallel,19:57:12.213,5510,0.0355,0.0126,1.28e-05,0.5534,0.6090,0.1303,1.353,3318.5,35.27GB,0.0082,0.0872,0.0416,0.6273 +parallel,19:57:19.638,5520,0.0075,0.0109,1.28e-05,0.4629,0.6131,0.1295,1.347,3326.1,35.27GB,0.0089,0.0793,0.0436,0.2225 +parallel,19:57:27.467,5530,0.0194,0.0111,1.27e-05,0.6269,0.6332,0.1497,1.278,3498.9,35.27GB,0.0153,0.1737,0.0775,0.6493 +parallel,19:57:33.920,5540,0.0029,0.0168,1.27e-05,0.5353,0.5269,0.1183,1.550,2877.2,35.27GB,0.0147,0.0979,0.0712,0.3811 +parallel,19:57:42.149,5550,0.0018,0.0112,1.27e-05,0.4497,0.6437,0.1793,1.215,3661.7,35.27GB,0.0047,0.0611,0.0247,0.1649 +parallel,19:57:48.847,5560,0.0017,0.0123,1.26e-05,0.4623,0.5327,0.1371,1.493,2973.3,35.27GB,0.0126,0.0992,0.0641,0.3364 +parallel,19:57:56.266,5570,0.0034,0.0105,1.26e-05,0.4741,0.5857,0.1562,1.348,3286.2,35.27GB,0.0067,0.0621,0.0344,0.4023 +parallel,19:58:04.699,5580,0.0041,0.0112,1.25e-05,0.4585,0.6579,0.1855,1.186,3727.0,35.27GB,0.0188,0.1398,0.0979,0.4290 +parallel,19:58:10.990,5590,0.0189,0.0123,1.25e-05,0.5136,0.5263,0.1028,1.590,2773.6,35.27GB,0.0107,0.1079,0.0498,0.3615 +parallel,19:58:17.595,5600,0.0155,0.0114,1.25e-05,0.5031,0.5583,0.1023,1.514,2906.1,35.27GB,0.0356,0.2242,0.1878,0.5690 +parallel,19:58:24.810,5610,0.0092,0.0121,1.24e-05,0.4149,0.5675,0.1541,1.386,3167.0,35.27GB,0.0074,0.0729,0.0358,0.2608 +parallel,19:58:31.749,5620,0.0128,0.0152,1.24e-05,0.4405,0.5538,0.1401,1.441,3038.6,35.27GB,0.0079,0.0828,0.0401,0.2887 +parallel,19:58:39.580,5630,0.0149,0.0136,1.24e-05,0.4043,0.6099,0.1732,1.277,3421.6,35.27GB,0.0153,0.1507,0.0819,0.4267 +parallel,19:58:46.329,5640,0.0050,0.0149,1.23e-05,0.3911,0.5360,0.1389,1.482,2942.2,35.27GB,0.0081,0.0960,0.0425,0.4451 +parallel,19:58:54.255,5650,0.0061,0.0128,1.23e-05,0.4018,0.6389,0.1536,1.263,3445.4,35.27GB,0.0088,0.0953,0.0430,0.2912 +parallel,19:59:01.486,5660,0.0090,0.0136,1.22e-05,0.4749,0.5707,0.1524,1.383,3138.0,35.27GB,0.0062,0.0649,0.0321,0.3317 +parallel,19:59:09.468,5670,0.0117,0.0118,1.22e-05,0.4513,0.6223,0.1759,1.253,3455.6,35.27GB,0.0208,0.1788,0.0981,0.5621 +parallel,19:59:16.511,5680,0.0074,0.0138,1.22e-05,0.4887,0.5735,0.1309,1.420,3042.6,35.27GB,0.0081,0.0804,0.0375,0.3263 +parallel,19:59:23.243,5690,0.0075,0.0142,1.21e-05,0.4556,0.5406,0.1326,1.486,2900.7,35.27GB,0.0099,0.0944,0.0533,0.5153 +parallel,19:59:30.155,5700,0.1304,0.0222,1.21e-05,0.6165,0.5633,0.1280,1.447,2972.0,35.27GB,0.0317,0.1917,0.1618,1.8588 +parallel,19:59:36.802,5710,0.0382,0.0163,1.21e-05,0.4590,0.5485,0.1161,1.505,2851.0,35.27GB,0.0281,0.1654,0.1418,0.6204 +parallel,19:59:42.649,5720,0.0139,0.0158,1.20e-05,0.5532,0.5001,0.0846,1.711,2502.0,35.27GB,0.0201,0.1538,0.1062,0.6302 +parallel,19:59:49.295,5730,0.0330,0.0242,1.20e-05,0.4848,0.5634,0.1011,1.505,2837.2,35.27GB,0.0250,0.2162,0.1318,0.6630 +parallel,19:59:55.515,5740,0.0147,0.0157,1.20e-05,0.4321,0.5268,0.0952,1.608,2649.5,35.27GB,0.0125,0.1042,0.0620,0.5631 +parallel,20:00:01.909,5750,0.0046,0.0100,1.19e-05,0.3840,0.5256,0.1137,1.564,2716.7,35.27GB,0.0203,0.1109,0.1026,0.6585 +parallel,20:00:07.858,5760,0.0127,0.0108,1.19e-05,0.3529,0.5067,0.0881,1.681,2522.0,35.27GB,0.0102,0.1216,0.0536,0.5413 +parallel,20:00:14.041,5770,0.1780,0.0277,1.18e-05,0.4514,0.5171,0.1012,1.618,2615.0,35.27GB,0.0301,0.2025,0.1837,0.5882 +parallel,20:00:20.472,5780,0.0150,0.0224,1.18e-05,0.5074,0.5575,0.0857,1.555,2713.6,35.27GB,0.0335,0.1449,0.1759,0.5257 +parallel,20:00:26.235,5790,0.0018,0.0144,1.18e-05,0.3740,0.4997,0.0767,1.735,2426.0,35.27GB,0.0104,0.1089,0.0526,0.5261 +parallel,20:00:32.639,5800,0.0054,0.0101,1.17e-05,0.4210,0.5296,0.1107,1.562,2688.8,35.27GB,0.0089,0.1173,0.0450,0.3473 +parallel,20:00:38.403,5810,0.0172,0.0133,1.17e-05,0.4278,0.5007,0.0757,1.735,2415.2,35.27GB,0.0103,0.1039,0.0514,0.3280 +parallel,20:00:44.052,5820,0.0079,0.0109,1.17e-05,0.4509,0.4990,0.0658,1.771,2360.7,35.27GB,0.0133,0.1119,0.0658,0.2962 +parallel,20:00:50.170,5830,0.0235,0.0136,1.16e-05,0.3992,0.5262,0.0857,1.635,2551.1,35.27GB,0.0146,0.1526,0.0739,0.4438 +parallel,20:00:55.958,5840,0.0153,0.0128,1.16e-05,0.4492,0.4988,0.0800,1.728,2407.2,35.27GB,0.0160,0.1334,0.0821,0.4558 +parallel,20:01:02.924,5850,0.0042,0.0110,1.16e-05,0.4350,0.5634,0.1331,1.436,2890.1,35.27GB,0.0152,0.1256,0.0766,0.4253 +parallel,20:01:08.616,5860,0.0168,0.0100,1.15e-05,0.4619,0.4850,0.0842,1.757,2356.3,35.27GB,0.0247,0.1798,0.1276,0.6864 +parallel,20:01:15.050,5870,0.0122,0.0113,1.15e-05,0.4414,0.5485,0.0949,1.554,2657.1,35.27GB,0.0212,0.1664,0.1207,0.3676 +parallel,20:01:21.647,5880,0.0250,0.0178,1.14e-05,0.4537,0.5678,0.0919,1.516,2717.6,35.27GB,0.0106,0.1127,0.0564,0.6658 +parallel,20:01:27.552,5890,0.0038,0.0110,1.14e-05,0.4482,0.5097,0.0808,1.694,2426.6,35.27GB,0.0139,0.1127,0.0703,0.3670 +parallel,20:01:33.189,5900,0.0112,0.0123,1.14e-05,0.5233,0.4995,0.0641,1.775,2310.5,35.27GB,0.0131,0.1165,0.0642,0.3475 +parallel,20:01:38.689,5910,0.0152,0.0143,1.13e-05,0.4740,0.4807,0.0693,1.819,2249.1,35.27GB,0.0151,0.1285,0.0792,0.4103 +parallel,20:01:44.530,5920,0.0197,0.0168,1.13e-05,0.4990,0.5064,0.0777,1.712,2382.8,35.27GB,0.0110,0.1259,0.0592,0.6252 +parallel,20:01:50.779,5930,0.0199,0.0138,1.13e-05,0.3877,0.5413,0.0836,1.600,2543.1,35.27GB,0.0095,0.0965,0.0475,0.2654 +parallel,20:01:56.531,5940,0.0100,0.0090,1.12e-05,0.4160,0.4961,0.0790,1.739,2334.7,35.27GB,0.0074,0.0835,0.0394,0.3308 +parallel,20:02:02.878,5950,0.0024,0.0102,1.12e-05,0.4281,0.5411,0.0936,1.576,2570.2,35.27GB,0.0057,0.0651,0.0283,0.1940 +parallel,20:02:09.842,5960,0.0186,0.0124,1.12e-05,0.4240,0.5644,0.1320,1.436,2813.1,35.27GB,0.0082,0.0978,0.0412,0.3067 +parallel,20:02:17.498,5970,0.0148,0.0125,1.11e-05,0.4010,0.6142,0.1514,1.306,3085.2,35.27GB,0.0232,0.1366,0.1227,0.3203 +parallel,20:02:24.376,5980,0.0061,0.0134,1.11e-05,0.4154,0.5680,0.1198,1.454,2764.6,35.27GB,0.0046,0.0505,0.0240,0.2336 +parallel,20:02:30.201,5990,0.0933,0.0220,1.10e-05,0.4634,0.4884,0.0941,1.717,2335.3,35.27GB,0.0333,0.2104,0.1785,0.6199 +parallel,20:02:37.515,6000,0.0115,0.0167,1.10e-05,0.4519,0.5962,0.1353,1.367,2925.3,35.27GB,0.0126,0.1073,0.0742,0.5833 +parallel,20:05:04.812,6010,0.0034,0.0137,1.10e-05,0.4221,0.6030,14.1266,0.068,58770.9,35.27GB,0.0086,0.0746,0.0424,0.2604 +parallel,20:05:12.720,6020,0.0076,0.0097,1.09e-05,0.4440,0.6410,0.1499,1.265,3147.1,35.27GB,0.0187,0.1258,0.0933,0.5497 +parallel,20:05:20.834,6030,0.0083,0.0120,1.09e-05,0.4946,0.6617,0.1496,1.233,3220.7,35.27GB,0.0079,0.0851,0.0423,0.3548 +parallel,20:05:29.766,6040,0.0083,0.0119,1.09e-05,0.4886,0.6053,0.2879,1.120,3536.5,35.27GB,0.0123,0.1147,0.0647,0.3145 +parallel,20:05:37.739,6050,0.0069,0.0080,1.08e-05,0.3597,0.6473,0.1501,1.254,3149.0,35.27GB,0.0079,0.0835,0.0360,0.2448 +parallel,20:05:45.500,6060,0.0101,0.0091,1.08e-05,0.3581,0.5958,0.1803,1.289,3057.5,35.27GB,0.0113,0.0942,0.0577,0.2608 +parallel,20:05:54.383,6070,0.0069,0.0086,1.08e-05,0.4202,0.6454,0.2429,1.126,3490.4,35.27GB,0.0071,0.0877,0.0350,0.2397 +parallel,20:06:02.689,6080,0.0028,0.0141,1.07e-05,0.4315,0.6760,0.1547,1.204,3255.6,35.27GB,0.0059,0.0689,0.0282,0.3349 +parallel,20:06:11.117,6090,0.0026,0.0102,1.07e-05,0.4171,0.6782,0.1646,1.187,3295.1,35.27GB,0.0198,0.1935,0.1065,0.3992 +parallel,20:06:19.507,6100,0.0123,0.0150,1.06e-05,0.5128,0.6603,0.1787,1.192,3271.6,35.27GB,0.0106,0.0953,0.0509,0.5014 +parallel,20:06:28.112,6110,0.0078,0.0139,1.06e-05,0.4594,0.6473,0.2132,1.162,3347.0,35.27GB,0.0144,0.1862,0.0773,0.4667 +parallel,20:06:36.310,6120,0.0162,0.0135,1.06e-05,0.4634,0.6322,0.1876,1.221,3178.8,35.27GB,0.0116,0.1383,0.0542,0.4328 +parallel,20:06:45.004,6130,0.0063,0.0119,1.05e-05,0.4572,0.6861,0.1833,1.150,3364.0,35.27GB,0.0081,0.1308,0.0411,0.4792 +parallel,20:06:52.954,6140,0.0042,0.0164,1.05e-05,0.4438,0.6080,0.1870,1.258,3068.4,35.27GB,0.0043,0.0620,0.0228,0.1757 +parallel,20:07:00.974,6150,0.0050,0.0107,1.05e-05,0.4536,0.6234,0.1786,1.247,3087.4,35.27GB,0.0056,0.0658,0.0253,0.2677 +parallel,20:07:09.894,6160,0.0149,0.0123,1.04e-05,0.4325,0.6517,0.2405,1.121,3424.9,35.27GB,0.0072,0.0897,0.0373,0.3066 +parallel,20:07:17.563,6170,0.0053,0.0132,1.04e-05,0.4932,0.6245,0.1422,1.304,2936.7,35.27GB,0.0414,0.1419,0.2162,0.6256 +parallel,20:07:26.338,6180,0.0088,0.0116,1.04e-05,0.3828,0.7048,0.1727,1.140,3351.6,35.27GB,0.0050,0.0492,0.0231,0.1350 +parallel,20:07:33.109,6190,0.0039,0.0072,1.03e-05,0.4207,0.5696,0.1075,1.477,2579.4,35.27GB,0.0088,0.1085,0.0442,0.8573 +parallel,20:07:41.075,6200,0.0097,0.0105,1.03e-05,0.5051,0.6386,0.1579,1.256,3026.4,35.27GB,0.0347,0.2070,0.2116,0.5694 +parallel,20:07:48.444,6210,0.0135,0.0095,1.03e-05,0.4292,0.5849,0.1520,1.357,2792.7,35.27GB,0.0117,0.0869,0.0581,0.2236 +parallel,20:07:55.599,6220,0.0352,0.0157,1.02e-05,0.5178,0.5701,0.1454,1.398,2704.0,35.27GB,0.0268,0.1337,0.1434,0.4050 +parallel,20:08:02.355,6230,0.0078,0.0101,1.02e-05,0.4147,0.5781,0.0976,1.480,2546.6,35.27GB,0.0112,0.0850,0.0580,0.3223 +parallel,20:08:09.103,6240,0.0328,0.0147,1.02e-05,0.4697,0.5465,0.1282,1.482,2536.8,35.27GB,0.0303,0.1780,0.1669,0.3687 +parallel,20:08:17.067,6250,0.0081,0.0146,1.01e-05,0.3919,0.6320,0.1644,1.256,2985.9,35.27GB,0.0096,0.1090,0.0504,0.3675 +parallel,20:08:25.384,6260,0.0040,0.0120,1.01e-05,0.4130,0.6326,0.1991,1.202,3110.2,35.27GB,0.0108,0.1005,0.0598,0.3259 +parallel,20:08:31.714,6270,0.0028,0.0083,1.00e-05,0.4151,0.5251,0.1079,1.580,2360.7,35.27GB,0.0061,0.0715,0.0315,0.4373 +parallel,20:08:38.503,6280,0.0132,0.0113,1.00e-05,0.4040,0.5683,0.1106,1.473,2525.1,35.27GB,0.0063,0.0614,0.0330,0.4125 +parallel,20:08:44.458,6290,0.0012,0.0082,9.98e-06,0.4017,0.5232,0.0723,1.680,2208.9,35.27GB,0.0065,0.0618,0.0326,0.2433 +parallel,20:08:50.184,6300,0.0074,0.0093,9.94e-06,0.4376,0.4945,0.0781,1.747,2118.3,35.27GB,0.0065,0.0711,0.0314,0.2545 +parallel,20:08:57.132,6310,0.0078,0.0089,9.91e-06,0.4378,0.5590,0.1357,1.439,2563.4,35.27GB,0.0221,0.1591,0.1283,0.4825 +parallel,20:09:03.238,6320,0.0254,0.0090,9.87e-06,0.4458,0.5420,0.0687,1.638,2246.6,35.27GB,0.0077,0.0773,0.0406,0.3770 +parallel,20:09:08.978,6330,0.0032,0.0105,9.84e-06,0.4618,0.5098,0.0642,1.742,2106.3,35.27GB,0.0068,0.0700,0.0319,0.3514 +parallel,20:09:14.162,6340,0.0142,0.0096,9.80e-06,0.3913,0.4584,0.0601,1.929,1897.1,35.27GB,0.0069,0.0714,0.0316,0.2555 +parallel,20:09:19.546,6350,0.0119,0.0113,9.77e-06,0.4018,0.4808,0.0576,1.858,1964.7,35.27GB,0.0163,0.1423,0.0898,0.4514 +parallel,20:09:25.195,6360,0.0021,0.0163,9.73e-06,0.4763,0.4792,0.0856,1.771,2055.8,35.27GB,0.0160,0.1566,0.0788,0.5127 +parallel,20:09:31.114,6370,0.0111,0.0108,9.70e-06,0.3767,0.4968,0.0951,1.690,2148.0,35.27GB,0.0078,0.0889,0.0368,0.3595 +parallel,20:09:37.311,6380,0.0047,0.0124,9.66e-06,0.4795,0.5321,0.0876,1.614,2243.0,35.27GB,0.0062,0.0746,0.0321,0.2276 +parallel,20:09:42.762,6390,0.0268,0.0136,9.63e-06,0.4383,0.4745,0.0706,1.835,1967.6,35.27GB,0.0143,0.1155,0.0662,0.5400 +parallel,20:09:48.516,6400,0.0170,0.0143,9.59e-06,0.4978,0.5097,0.0657,1.738,2071.0,35.27GB,0.0206,0.1446,0.1067,0.4757 +parallel,20:09:54.133,6410,0.0036,0.0106,9.56e-06,0.3714,0.4813,0.0805,1.780,2016.3,35.27GB,0.0076,0.0889,0.0371,0.3190 +parallel,20:10:00.014,6420,0.0079,0.0096,9.52e-06,0.4137,0.5167,0.0713,1.701,2104.9,35.27GB,0.0086,0.0754,0.0438,0.3232 +parallel,20:10:06.106,6430,0.0041,0.0120,9.49e-06,0.4617,0.5332,0.0760,1.642,2174.3,35.27GB,0.0214,0.1324,0.1118,0.4792 +parallel,20:10:11.477,6440,0.0100,0.0100,9.46e-06,0.4199,0.4718,0.0654,1.862,1911.9,35.27GB,0.0049,0.0653,0.0266,0.2180 +parallel,20:10:17.654,6450,0.0131,0.0123,9.42e-06,0.4149,0.5321,0.0855,1.619,2192.2,35.27GB,0.0058,0.0642,0.0313,0.2583 +parallel,20:10:23.691,6460,0.0311,0.0130,9.39e-06,0.4941,0.5171,0.0866,1.657,2136.8,35.27GB,0.0100,0.1230,0.0493,0.4672 +parallel,20:10:30.753,6470,0.0024,0.0118,9.35e-06,0.4423,0.5723,0.1340,1.416,2492.7,35.27GB,0.0091,0.0911,0.0433,0.3209 +parallel,20:10:36.818,6480,0.0047,0.0093,9.32e-06,0.4464,0.5190,0.0875,1.649,2134.2,35.27GB,0.0040,0.0549,0.0201,0.1935 +parallel,20:10:42.661,6490,0.0037,0.0103,9.28e-06,0.4053,0.4879,0.0964,1.712,2050.7,35.27GB,0.0035,0.0491,0.0177,0.2140 +parallel,20:10:48.919,6500,0.0156,0.0144,9.25e-06,0.4664,0.5340,0.0918,1.598,2189.9,35.27GB,0.0097,0.0778,0.0523,0.4955 +parallel,20:10:55.474,6510,0.0066,0.0111,9.22e-06,0.5015,0.5421,0.1133,1.526,2287.0,35.27GB,0.0185,0.1837,0.0939,0.5098 +parallel,20:11:01.936,6520,0.0089,0.0081,9.18e-06,0.4426,0.5357,0.1105,1.548,2248.4,35.27GB,0.0091,0.1048,0.0473,0.3970 +parallel,20:11:08.842,6530,0.0132,0.0125,9.15e-06,0.6256,0.5870,0.1036,1.448,2396.2,35.27GB,0.0161,0.1543,0.0819,2.1628 +parallel,20:11:15.236,6540,0.0084,0.0101,9.11e-06,0.4234,0.5302,0.1092,1.564,2211.9,35.27GB,0.0081,0.0711,0.0462,0.3335 +parallel,20:11:21.710,6550,0.0098,0.0096,9.08e-06,0.5171,0.5408,0.1066,1.545,2233.1,35.27GB,0.0103,0.1193,0.0552,0.3320 +parallel,20:11:28.166,6560,0.0026,0.0108,9.05e-06,0.4602,0.5437,0.1019,1.549,2220.5,35.27GB,0.0049,0.0561,0.0235,0.1841 +parallel,20:11:34.718,6570,0.0405,0.0243,9.01e-06,0.4781,0.5455,0.1097,1.526,2247.1,35.27GB,0.0251,0.1592,0.1305,0.5412 +parallel,20:11:40.840,6580,0.0051,0.0158,8.98e-06,0.5445,0.5240,0.0882,1.634,2093.4,35.27GB,0.0077,0.1002,0.0396,0.3478 +parallel,20:11:46.588,6590,0.0184,0.0106,8.94e-06,0.4355,0.4862,0.0885,1.740,1959.5,35.27GB,0.0134,0.1382,0.0681,0.4859 +parallel,20:11:52.637,6600,0.0070,0.0196,8.91e-06,0.4444,0.5230,0.0819,1.653,2056.5,35.27GB,0.0048,0.0583,0.0231,0.2568 +parallel,20:11:58.246,6610,0.0084,0.0130,8.88e-06,0.4561,0.4788,0.0820,1.783,1900.8,35.27GB,0.0074,0.0905,0.0366,0.2707 +parallel,20:12:04.039,6620,0.0022,0.0082,8.84e-06,0.3966,0.4982,0.0812,1.726,1958.0,35.27GB,0.0056,0.0676,0.0260,0.2056 +parallel,20:12:10.460,6630,0.0168,0.0099,8.81e-06,0.4624,0.5508,0.0913,1.558,2163.4,35.27GB,0.0246,0.1857,0.1242,0.6058 +parallel,20:12:16.398,6640,0.0097,0.0089,8.78e-06,0.4746,0.5042,0.0895,1.684,1994.7,35.27GB,0.0130,0.0988,0.0740,0.3972 +parallel,20:12:23.173,6650,0.0038,0.0103,8.74e-06,0.4623,0.5481,0.1294,1.476,2269.6,35.27GB,0.0169,0.1104,0.0909,0.3287 +parallel,20:12:29.218,6660,0.0028,0.0097,8.71e-06,0.4349,0.5074,0.0971,1.655,2018.6,35.27GB,0.0198,0.1529,0.1042,0.4178 +parallel,20:12:35.731,6670,0.0131,0.0083,8.68e-06,0.4574,0.5405,0.1107,1.536,2168.3,35.27GB,0.0072,0.0777,0.0385,0.5123 +parallel,20:12:42.030,6680,0.0063,0.0081,8.64e-06,0.4710,0.5460,0.0839,1.588,2091.0,35.27GB,0.0215,0.1658,0.1085,0.7250 +parallel,20:12:47.894,6690,0.0058,0.0115,8.61e-06,0.4814,0.5053,0.0812,1.706,1940.6,35.27GB,0.0142,0.1204,0.0686,0.8130 +parallel,20:12:53.874,6700,0.0127,0.0088,8.58e-06,0.4774,0.5232,0.0748,1.673,1972.9,35.27GB,0.0363,0.2019,0.1818,0.4763 +parallel,20:12:59.497,6710,0.0159,0.0097,8.55e-06,0.4082,0.4970,0.0654,1.778,1850.0,35.27GB,0.0088,0.1088,0.0436,0.4644 +parallel,20:13:05.269,6720,0.0021,0.0111,8.51e-06,0.4422,0.5081,0.0690,1.733,1892.6,35.27GB,0.0085,0.0778,0.0444,0.5251 +parallel,20:13:11.593,6730,0.0052,0.0098,8.48e-06,0.4007,0.5648,0.0676,1.581,2067.9,35.27GB,0.0080,0.0705,0.0414,0.2591 +parallel,20:13:17.528,6740,0.0159,0.0150,8.45e-06,0.5282,0.5116,0.0819,1.685,1934.4,35.27GB,0.0251,0.1130,0.1272,0.3545 +parallel,20:13:23.503,6750,0.0017,0.0113,8.41e-06,0.6342,0.5149,0.0826,1.674,1941.5,35.27GB,0.0166,0.1336,0.0851,0.3396 +parallel,20:13:29.309,6760,0.0088,0.0094,8.38e-06,0.4380,0.5037,0.0770,1.723,1881.0,35.27GB,0.0094,0.1591,0.0479,0.4687 +parallel,20:13:35.261,6770,0.0253,0.0153,8.35e-06,0.5013,0.5140,0.0812,1.680,1922.1,35.27GB,0.0076,0.1115,0.0432,0.6409 +parallel,20:13:42.026,6780,0.0434,0.0150,8.32e-06,0.4335,0.5855,0.0910,1.478,2178.2,35.27GB,0.0210,0.1611,0.1125,0.4475 +parallel,20:13:47.937,6790,0.0137,0.0146,8.28e-06,0.5102,0.5030,0.0880,1.692,1897.1,35.27GB,0.0150,0.0933,0.0783,0.4340 +parallel,20:13:53.845,6800,0.0082,0.0099,8.25e-06,0.4941,0.5102,0.0806,1.693,1890.1,35.27GB,0.0534,0.2405,0.2738,0.6699 +parallel,20:14:00.187,6810,0.0090,0.0096,8.22e-06,0.4745,0.5325,0.1017,1.577,2022.8,35.27GB,0.0208,0.1308,0.1049,0.6356 +parallel,20:14:06.530,6820,0.0107,0.0094,8.19e-06,0.3717,0.5454,0.0889,1.577,2016.7,35.27GB,0.0057,0.0748,0.0286,0.3363 +parallel,20:14:13.339,6830,0.0030,0.0109,8.15e-06,0.4488,0.5859,0.0950,1.469,2158.2,35.27GB,0.0053,0.0732,0.0280,0.2529 +parallel,20:14:19.144,6840,0.0166,0.0101,8.12e-06,0.4312,0.5043,0.0762,1.723,1834.2,35.27GB,0.0045,0.0580,0.0229,0.3524 +parallel,20:14:24.834,6850,0.0101,0.0091,8.09e-06,0.4283,0.4966,0.0724,1.758,1791.8,35.27GB,0.0154,0.1467,0.0828,0.4866 +parallel,20:14:30.646,6860,0.0105,0.0107,8.06e-06,0.4437,0.5071,0.0741,1.721,1824.7,35.27GB,0.0046,0.0619,0.0226,0.2648 +parallel,20:14:36.614,6870,0.0056,0.0088,8.02e-06,0.4032,0.5158,0.0811,1.676,1867.6,35.27GB,0.0234,0.1154,0.1245,0.3313 +parallel,20:14:43.225,6880,0.0059,0.0148,7.99e-06,0.4708,0.5569,0.1043,1.513,2062.4,35.27GB,0.0206,0.1255,0.1029,0.4564 +parallel,20:14:49.090,6890,0.0431,0.0127,7.96e-06,0.4821,0.5094,0.0770,1.706,1823.5,35.27GB,0.0154,0.1155,0.0793,0.6058 +parallel,20:14:54.922,6900,0.0183,0.0096,7.93e-06,0.4476,0.5133,0.0699,1.715,1807.7,35.27GB,0.0109,0.1270,0.0549,0.5886 +parallel,20:15:00.632,6910,0.0016,0.0188,7.90e-06,0.5331,0.4968,0.0742,1.752,1763.8,35.27GB,0.0103,0.0828,0.0517,0.5199 +parallel,20:15:06.623,6920,0.0016,0.0153,7.87e-06,0.4676,0.5267,0.0724,1.669,1845.0,35.27GB,0.0195,0.1111,0.0988,0.3730 +parallel,20:15:13.147,6930,0.0040,0.0168,7.83e-06,0.5224,0.5647,0.0877,1.533,2002.5,35.27GB,0.0175,0.1445,0.0895,0.4690 +parallel,20:15:19.297,6940,0.0109,0.0151,7.80e-06,0.4974,0.5233,0.0917,1.626,1881.7,35.27GB,0.0134,0.1054,0.0703,0.3930 +parallel,20:15:25.226,6950,0.0069,0.0124,7.77e-06,0.5440,0.5172,0.0757,1.687,1808.2,35.27GB,0.0264,0.1443,0.1318,0.4525 +parallel,20:15:31.076,6960,0.0409,0.0192,7.74e-06,0.4605,0.5042,0.0807,1.710,1777.8,35.27GB,0.0347,0.1589,0.1817,0.5989 +parallel,20:15:37.141,6970,0.0420,0.0152,7.71e-06,0.5004,0.5206,0.0860,1.649,1837.6,35.27GB,0.0530,0.2616,0.2855,0.9944 +parallel,20:15:43.495,6980,0.0082,0.0123,7.68e-06,0.4760,0.5588,0.0766,1.574,1918.7,35.27GB,0.0280,0.1177,0.1466,0.4120 +parallel,20:15:49.750,6990,0.0014,0.0092,7.65e-06,0.4371,0.5372,0.0883,1.599,1882.3,35.27GB,0.0099,0.0877,0.0541,0.3356 +parallel,20:15:56.182,7000,0.0142,0.0098,7.61e-06,0.4622,0.5458,0.0974,1.555,1929.4,35.27GB,0.0114,0.1070,0.0664,0.4286 +parallel,20:17:32.902,7010,0.0063,0.0086,7.58e-06,0.3949,0.6039,9.0682,0.103,28919.1,35.27GB,0.0056,0.0608,0.0283,0.2103 +parallel,20:17:39.997,7020,0.0061,0.0082,7.55e-06,0.4196,0.5728,0.1367,1.410,2113.9,35.27GB,0.0105,0.1118,0.0505,0.3837 +parallel,20:17:47.804,7030,0.0038,0.0084,7.52e-06,0.3948,0.6301,0.1506,1.281,2318.2,35.27GB,0.0054,0.0653,0.0293,0.3595 +parallel,20:17:54.664,7040,0.0020,0.0059,7.49e-06,0.3695,0.5557,0.1304,1.458,2030.3,35.27GB,0.0154,0.1333,0.0829,0.5313 +parallel,20:18:01.762,7050,0.0637,0.0131,7.46e-06,0.4684,0.5875,0.1223,1.409,2093.5,35.27GB,0.0252,0.1434,0.1287,0.5189 +parallel,20:18:08.918,7060,0.0026,0.0086,7.43e-06,0.3632,0.5698,0.1458,1.398,2103.5,35.27GB,0.0250,0.1997,0.1358,0.5619 +parallel,20:18:16.975,7070,0.0064,0.0113,7.40e-06,0.5412,0.5962,0.2096,1.241,2360.6,35.27GB,0.0279,0.1808,0.1435,0.7242 +parallel,20:18:25.304,7080,0.0175,0.0100,7.37e-06,0.4154,0.6616,0.1712,1.201,2431.6,35.27GB,0.0097,0.1240,0.0535,0.3029 +parallel,20:18:32.066,7090,0.0019,0.0111,7.34e-06,0.4050,0.5543,0.1219,1.479,1967.4,35.27GB,0.0122,0.1227,0.0696,0.4745 +parallel,20:18:39.218,7100,0.0116,0.0110,7.31e-06,0.3731,0.5826,0.1326,1.398,2073.8,35.27GB,0.0128,0.0874,0.0658,0.3774 +parallel,20:18:45.763,7110,0.0151,0.0099,7.28e-06,0.4004,0.5489,0.1056,1.528,1891.0,35.27GB,0.0318,0.1877,0.1709,0.6247 +parallel,20:18:52.200,7120,0.0037,0.0104,7.24e-06,0.4313,0.5364,0.1073,1.554,1853.7,35.27GB,0.0335,0.1835,0.1777,0.5702 +parallel,20:18:59.261,7130,0.0069,0.0094,7.21e-06,0.5973,0.5837,0.1224,1.416,2026.2,35.27GB,0.0104,0.0944,0.0533,0.3636 +parallel,20:19:05.708,7140,0.0031,0.0082,7.18e-06,0.3615,0.5166,0.1281,1.551,1843.6,35.27GB,0.0079,0.0777,0.0429,0.2910 +parallel,20:19:13.395,7150,0.0829,0.0201,7.15e-06,0.4455,0.6005,0.1682,1.301,2190.4,35.27GB,0.0357,0.1952,0.2012,0.5366 +parallel,20:19:20.438,7160,0.0131,0.0188,7.12e-06,0.4185,0.5715,0.1329,1.420,2000.0,35.27GB,0.0289,0.1697,0.1589,0.4294 +parallel,20:19:27.737,7170,0.0050,0.0117,7.09e-06,0.3943,0.5851,0.1448,1.370,2065.4,35.27GB,0.0104,0.0809,0.0520,0.3102 +parallel,20:19:35.030,7180,0.0237,0.0100,7.06e-06,0.5284,0.5998,0.1294,1.372,2056.1,35.27GB,0.0396,0.1527,0.2096,0.6734 +parallel,20:19:43.128,7190,0.0086,0.0110,7.03e-06,0.4407,0.6289,0.1810,1.235,2275.3,35.27GB,0.0097,0.0806,0.0529,0.5773 +parallel,20:19:50.828,7200,0.0157,0.0112,7.00e-06,0.4446,0.5932,0.1769,1.299,2155.7,35.27GB,0.0056,0.0946,0.0285,0.3147 +parallel,20:19:58.737,7210,0.0157,0.0110,6.97e-06,0.4660,0.6213,0.1696,1.265,2206.3,35.27GB,0.0132,0.1027,0.0668,0.4112 +parallel,20:20:06.311,7220,0.0130,0.0113,6.94e-06,0.4250,0.5948,0.1625,1.321,2105.0,35.27GB,0.0062,0.0751,0.0317,0.3314 +parallel,20:20:16.862,7230,0.0115,0.0080,6.92e-06,0.3271,0.7635,0.2916,0.948,2922.4,35.27GB,0.0119,0.0913,0.0663,0.2771 +parallel,20:20:24.838,7240,0.0049,0.0077,6.89e-06,0.4429,0.6192,0.1784,1.254,2201.1,35.27GB,0.0085,0.0676,0.0417,0.2513 +parallel,20:20:33.334,7250,0.0131,0.0109,6.86e-06,0.4481,0.6401,0.2094,1.177,2336.1,35.27GB,0.0320,0.1535,0.1651,0.5751 +parallel,20:20:41.660,7260,0.0017,0.0070,6.83e-06,0.3867,0.6432,0.1895,1.201,2281.3,35.27GB,0.0100,0.0938,0.0510,0.4615 +parallel,20:20:48.627,7270,0.0140,0.0082,6.80e-06,0.3797,0.5651,0.1315,1.436,1901.5,35.27GB,0.0094,0.0773,0.0478,0.3073 +parallel,20:20:57.280,7280,0.0070,0.0099,6.77e-06,0.4625,0.6662,0.1991,1.156,2353.5,35.27GB,0.0131,0.0982,0.0605,0.3183 +parallel,20:21:04.427,7290,0.0055,0.0083,6.74e-06,0.4991,0.5777,0.1371,1.400,1935.6,35.27GB,0.0053,0.0647,0.0275,0.2602 +parallel,20:21:11.458,7300,0.0259,0.0150,6.71e-06,0.4550,0.5600,0.1431,1.422,1898.1,35.27GB,0.0225,0.1237,0.1142,0.4718 +parallel,20:21:18.766,7310,0.0030,0.0098,6.68e-06,0.4271,0.5679,0.1629,1.369,1965.4,35.27GB,0.0273,0.1288,0.1379,0.5333 +parallel,20:21:26.160,7320,0.0009,0.0100,6.65e-06,0.4181,0.5968,0.1426,1.353,1981.3,35.27GB,0.0153,0.1244,0.0750,0.6287 +parallel,20:21:34.469,7330,0.0138,0.0091,6.62e-06,0.4209,0.6454,0.1856,1.204,2218.4,35.27GB,0.0109,0.1012,0.0558,0.4207 +parallel,20:21:42.200,7340,0.0108,0.0123,6.59e-06,0.4342,0.5973,0.1758,1.294,2056.1,35.27GB,0.0080,0.0749,0.0395,0.2715 +parallel,20:21:48.925,7350,0.0024,0.0224,6.57e-06,0.3917,0.5562,0.1163,1.487,1781.9,35.27GB,0.0087,0.0888,0.0439,0.3069 +parallel,20:21:57.290,7360,0.0392,0.0233,6.54e-06,0.4593,0.6456,0.1908,1.196,2207.9,35.27GB,0.0189,0.1721,0.0981,0.5334 +parallel,20:22:04.912,7370,0.0020,0.0190,6.51e-06,0.4492,0.6224,0.1399,1.312,2004.6,35.27GB,0.0069,0.0923,0.0372,0.3734 +parallel,20:22:12.764,7380,0.0050,0.0101,6.48e-06,0.4219,0.6277,0.1575,1.274,2056.9,35.27GB,0.0064,0.0827,0.0334,0.4587 +parallel,20:22:20.521,7390,0.0069,0.0079,6.45e-06,0.3549,0.6033,0.1723,1.289,2024.2,35.27GB,0.0115,0.1096,0.0585,0.5279 +parallel,20:22:27.795,7400,0.0059,0.0154,6.42e-06,0.4728,0.5854,0.1421,1.375,1891.0,35.27GB,0.0070,0.0712,0.0394,0.2541 +parallel,20:22:35.093,7410,0.0019,0.0234,6.40e-06,0.4582,0.5950,0.1348,1.371,1889.8,35.27GB,0.0114,0.0992,0.0583,0.3412 +parallel,20:22:42.110,7420,0.0034,0.0115,6.37e-06,0.4320,0.5809,0.1209,1.425,1810.2,35.27GB,0.0271,0.1533,0.1356,0.4476 +parallel,20:22:48.964,7430,0.0136,0.0105,6.34e-06,0.4958,0.5702,0.1152,1.459,1761.0,35.27GB,0.0172,0.1616,0.0875,0.4601 +parallel,20:22:56.208,7440,0.0056,0.0100,6.31e-06,0.3417,0.5842,0.1402,1.381,1854.1,35.27GB,0.0294,0.1781,0.1643,0.4684 +parallel,20:23:02.837,7450,0.0033,0.0068,6.28e-06,0.3444,0.5566,0.1063,1.509,1690.2,35.27GB,0.0122,0.1168,0.0657,0.3007 +parallel,20:23:09.743,7460,0.0034,0.0083,6.26e-06,0.4190,0.5603,0.1303,1.448,1753.9,35.27GB,0.0073,0.0854,0.0359,0.2317 +parallel,20:23:16.272,7470,0.0334,0.0102,6.23e-06,0.4182,0.5417,0.1112,1.532,1651.5,35.27GB,0.0128,0.1768,0.0708,0.4446 +parallel,20:23:22.651,7480,0.0068,0.0214,6.20e-06,0.5114,0.5402,0.0977,1.568,1607.2,35.27GB,0.0152,0.1382,0.0839,0.5422 +parallel,20:23:28.795,7490,0.0688,0.0182,6.17e-06,0.4321,0.5063,0.1081,1.628,1542.0,35.27GB,0.0378,0.1594,0.2008,0.4793 +parallel,20:23:36.321,7500,0.0058,0.0141,6.15e-06,0.4019,0.5971,0.1554,1.329,1881.0,35.27GB,0.0144,0.1131,0.0751,0.4870 +parallel,20:23:43.763,7510,0.0033,0.0089,6.12e-06,0.4053,0.5975,0.1467,1.344,1852.8,35.27GB,0.0153,0.0994,0.0832,0.3729 +parallel,20:23:51.769,7520,0.0193,0.0123,6.09e-06,0.4642,0.6284,0.1722,1.249,1985.3,35.27GB,0.0152,0.1007,0.0752,0.4445 +parallel,20:24:00.426,7530,0.0113,0.0097,6.06e-06,0.4712,0.6707,0.1950,1.155,2138.1,35.27GB,0.0117,0.1063,0.0629,0.3580 +parallel,20:24:08.685,7540,0.0226,0.0087,6.04e-06,0.4313,0.6214,0.2044,1.211,2031.2,35.27GB,0.0243,0.1999,0.1317,0.6251 +parallel,20:24:15.235,7550,0.0060,0.0110,6.01e-06,0.4780,0.5320,0.1231,1.527,1604.7,35.27GB,0.0094,0.0834,0.0504,0.3842 +parallel,20:24:23.731,7560,0.0035,0.0080,5.98e-06,0.4119,0.6609,0.1887,1.177,2072.9,35.27GB,0.0173,0.0987,0.0870,0.4064 +parallel,20:24:31.538,7570,0.0046,0.0109,5.96e-06,0.4475,0.5943,0.1864,1.281,1896.8,35.27GB,0.0173,0.1255,0.0967,0.4002 +parallel,20:24:38.293,7580,0.0029,0.0170,5.93e-06,0.4139,0.5607,0.1148,1.481,1634.4,35.27GB,0.0071,0.0704,0.0373,0.3026 +parallel,20:24:46.024,7590,0.0079,0.0113,5.90e-06,0.3748,0.6098,0.1633,1.294,1862.9,35.27GB,0.0214,0.1132,0.1137,0.4573 +parallel,20:24:53.923,7600,0.0057,0.0093,5.88e-06,0.4293,0.6245,0.1651,1.267,1894.8,35.27GB,0.0067,0.0741,0.0340,0.4352 +parallel,20:25:03.285,7610,0.0043,0.0079,5.85e-06,0.3868,0.7012,0.2353,1.069,2235.8,35.27GB,0.0130,0.0915,0.0612,0.3344 +parallel,20:25:11.146,7620,0.0022,0.0081,5.82e-06,0.4257,0.6128,0.1733,1.272,1870.7,35.27GB,0.0271,0.1534,0.1356,0.4250 +parallel,20:25:18.910,7630,0.0141,0.0077,5.80e-06,0.4156,0.6261,0.1503,1.288,1839.9,35.27GB,0.0064,0.0757,0.0325,0.3771 +parallel,20:25:27.605,7640,0.0033,0.0086,5.77e-06,0.4250,0.6880,0.1815,1.150,2051.9,35.27GB,0.0198,0.1756,0.1094,0.4095 +parallel,20:25:36.114,7650,0.0021,0.0057,5.74e-06,0.3593,0.6504,0.2005,1.175,1999.3,35.27GB,0.0045,0.0594,0.0251,0.3975 +parallel,20:25:45.369,7660,0.0055,0.0079,5.72e-06,0.4248,0.6936,0.2319,1.081,2165.6,35.27GB,0.0101,0.0782,0.0524,0.3407 +parallel,20:25:51.841,7670,0.0139,0.0076,5.69e-06,0.4551,0.5400,0.1072,1.546,1507.6,35.27GB,0.0115,0.0818,0.0525,0.3441 +parallel,20:26:00.224,7680,0.0129,0.0132,5.67e-06,0.4243,0.6685,0.1699,1.193,1944.8,35.27GB,0.0078,0.0832,0.0423,0.3415 +parallel,20:26:08.489,7690,0.0071,0.0102,5.64e-06,0.4304,0.6388,0.1876,1.210,1908.9,35.27GB,0.0125,0.1145,0.0676,0.3931 +parallel,20:26:16.951,7700,0.0136,0.0113,5.61e-06,0.4241,0.6454,0.2008,1.182,1946.0,35.27GB,0.0050,0.0644,0.0253,0.2272 +parallel,20:26:25.111,7710,0.0077,0.0085,5.59e-06,0.4466,0.6203,0.1957,1.226,1868.4,35.27GB,0.0049,0.0584,0.0264,0.2051 +parallel,20:26:32.848,7720,0.0131,0.0096,5.56e-06,0.3919,0.6065,0.1672,1.293,1763.7,35.27GB,0.0110,0.0723,0.0556,0.3421 +parallel,20:26:40.651,7730,0.0089,0.0125,5.54e-06,0.4142,0.6223,0.1579,1.282,1771.0,35.27GB,0.0087,0.1004,0.0423,0.4344 +parallel,20:26:48.799,7740,0.0034,0.0093,5.51e-06,0.3496,0.6353,0.1795,1.227,1841.2,35.27GB,0.0063,0.0765,0.0333,0.3235 +parallel,20:26:57.036,7750,0.0055,0.0081,5.49e-06,0.4049,0.6338,0.1899,1.214,1853.0,35.27GB,0.0061,0.0699,0.0329,0.2103 +parallel,20:27:05.091,7760,0.0169,0.0114,5.46e-06,0.4765,0.6283,0.1773,1.241,1804.3,35.27GB,0.0280,0.1291,0.1416,0.8995 +parallel,20:27:13.882,7770,0.0040,0.0088,5.44e-06,0.3585,0.6794,0.1997,1.138,1960.0,35.27GB,0.0056,0.0688,0.0295,0.2648 +parallel,20:27:23.951,7780,0.0076,0.0086,5.41e-06,0.4667,0.7647,0.2423,0.993,2235.2,35.27GB,0.0029,0.0509,0.0155,0.2157 +parallel,20:27:31.483,7790,0.0082,0.0069,5.39e-06,0.4828,0.5863,0.1668,1.328,1664.3,35.27GB,0.0256,0.1607,0.1252,0.5766 +parallel,20:27:41.106,7800,0.0056,0.0066,5.36e-06,0.3537,0.7004,0.2619,1.039,2116.9,35.27GB,0.0046,0.0529,0.0222,0.2623 +parallel,20:27:49.082,7810,0.0145,0.0131,5.34e-06,0.4647,0.5981,0.1995,1.254,1746.5,35.27GB,0.0168,0.1091,0.0866,0.4066 +parallel,20:27:58.326,7820,0.0027,0.0124,5.31e-06,0.3984,0.6907,0.2337,1.082,2015.0,35.27GB,0.0038,0.0455,0.0205,0.1912 +parallel,20:28:07.176,7830,0.0131,0.0111,5.29e-06,0.4729,0.6735,0.2115,1.130,1920.3,35.27GB,0.0297,0.1837,0.1531,0.6832 +parallel,20:28:16.564,7840,0.0083,0.0080,5.26e-06,0.3267,0.7026,0.2362,1.065,2027.5,35.27GB,0.0079,0.0695,0.0404,0.2247 +parallel,20:28:27.493,7850,0.0077,0.0084,5.24e-06,0.5040,0.7961,0.2968,0.915,2349.4,35.27GB,0.0405,0.1945,0.2398,0.5375 +parallel,20:28:38.258,7860,0.0075,0.0086,5.21e-06,0.4110,0.7543,0.3222,0.929,2303.5,35.27GB,0.0188,0.1275,0.1004,0.4666 +parallel,20:28:46.975,7870,0.0012,0.0087,5.19e-06,0.4836,0.6510,0.2207,1.147,1856.5,35.27GB,0.0101,0.1373,0.0543,0.3504 +parallel,20:28:57.521,7880,0.0013,0.0069,5.17e-06,0.4102,0.7554,0.2993,0.948,2235.6,35.27GB,0.0049,0.0669,0.0250,0.2975 +parallel,20:29:08.310,7890,0.0063,0.0097,5.14e-06,0.4723,0.7343,0.3446,0.927,2276.2,35.27GB,0.0111,0.1083,0.0542,0.5491 +parallel,20:29:16.848,7900,0.0025,0.0080,5.12e-06,0.3879,0.6730,0.1807,1.171,1792.6,35.27GB,0.0074,0.0571,0.0394,0.3674 +parallel,20:29:24.860,7910,0.0045,0.0112,5.09e-06,0.4573,0.6136,0.1876,1.248,1674.3,35.27GB,0.0035,0.0523,0.0174,0.2571 +parallel,20:29:32.541,7920,0.0370,0.0129,5.07e-06,0.5175,0.6015,0.1666,1.302,1597.4,35.27GB,0.0155,0.1070,0.0784,0.4697 +parallel,20:29:41.099,7930,0.0046,0.0157,5.05e-06,0.4057,0.6474,0.2084,1.169,1771.4,35.27GB,0.0135,0.0887,0.0689,0.2632 +parallel,20:29:52.420,7940,0.0187,0.0156,5.02e-06,0.4310,0.7447,0.3874,0.883,2331.9,35.27GB,0.0191,0.1329,0.0995,0.4691 +parallel,20:30:00.634,7950,0.0073,0.0096,5.00e-06,0.4117,0.6250,0.1964,1.218,1683.5,35.27GB,0.0143,0.1271,0.0781,0.4073 +parallel,20:30:10.236,7960,0.0030,0.0109,4.98e-06,0.4042,0.6897,0.2705,1.042,1958.7,35.27GB,0.0411,0.1569,0.2184,0.4863 +parallel,20:30:18.147,7970,0.0137,0.0167,4.95e-06,0.4116,0.5999,0.1912,1.264,1605.7,35.27GB,0.0142,0.0858,0.0756,0.2382 +parallel,20:30:27.643,7980,0.0054,0.0141,4.93e-06,0.4779,0.6953,0.2542,1.053,1918.0,35.27GB,0.0120,0.0844,0.0606,0.5605 +parallel,20:30:36.879,7990,0.0029,0.0084,4.91e-06,0.3800,0.6731,0.2505,1.083,1856.2,35.27GB,0.0063,0.0563,0.0314,0.1535 +parallel,20:30:44.380,8000,0.0058,0.0097,4.88e-06,0.3771,0.5868,0.1633,1.333,1500.1,35.27GB,0.0052,0.0690,0.0264,0.2813 +parallel,20:33:54.229,8010,0.0105,0.0118,4.86e-06,0.4309,0.6118,18.3731,0.053,37779.7,35.27GB,0.0138,0.1298,0.0779,0.4023 +parallel,20:34:02.216,8020,0.0112,0.0097,4.84e-06,0.4453,0.6059,0.1927,1.252,1581.0,35.27GB,0.0106,0.0987,0.0554,0.5058 +parallel,20:34:10.745,8030,0.0028,0.0101,4.82e-06,0.4434,0.6413,0.2116,1.173,1679.6,35.27GB,0.0069,0.0806,0.0349,0.2460 +parallel,20:34:18.383,8040,0.0031,0.0100,4.79e-06,0.3729,0.5693,0.1945,1.309,1496.8,35.27GB,0.0064,0.0767,0.0295,0.2016 +parallel,20:34:27.469,8050,0.0035,0.0085,4.77e-06,0.4757,0.6693,0.2393,1.102,1769.8,35.27GB,0.0045,0.0447,0.0249,0.1938 +parallel,20:34:34.117,8060,0.0079,0.0087,4.75e-06,0.3577,0.5352,0.1297,1.504,1289.6,35.27GB,0.0070,0.0586,0.0365,0.2759 +parallel,20:34:40.440,8070,0.0223,0.0100,4.73e-06,0.4949,0.5198,0.1125,1.582,1220.1,35.27GB,0.0141,0.1248,0.0722,0.4706 +parallel,20:34:47.753,8080,0.0068,0.0095,4.70e-06,0.4542,0.5934,0.1379,1.368,1403.8,35.27GB,0.0192,0.1483,0.1042,0.3982 +parallel,20:34:53.850,8090,0.0153,0.0086,4.68e-06,0.4830,0.5100,0.0997,1.640,1164.4,35.27GB,0.0078,0.0718,0.0419,0.6816 +parallel,20:35:00.463,8100,0.0089,0.0067,4.66e-06,0.3695,0.5479,0.1133,1.512,1256.2,35.27GB,0.0247,0.1946,0.1382,0.4772 +parallel,20:35:07.413,8110,0.0082,0.0074,4.64e-06,0.4415,0.5579,0.1371,1.439,1313.3,35.27GB,0.0076,0.0627,0.0448,0.4333 +parallel,20:35:13.365,8120,0.0037,0.0100,4.62e-06,0.3897,0.5042,0.0910,1.680,1118.9,35.27GB,0.0049,0.0543,0.0235,0.1683 +parallel,20:35:20.709,8130,0.0086,0.0158,4.60e-06,0.5038,0.6149,0.1195,1.362,1373.1,35.27GB,0.0452,0.2309,0.2599,0.6653 +parallel,20:35:28.076,8140,0.0017,0.0130,4.57e-06,0.4172,0.5681,0.1685,1.358,1370.0,35.27GB,0.0037,0.0467,0.0199,0.2264 +parallel,20:35:35.574,8150,0.0096,0.0092,4.55e-06,0.3270,0.5835,0.1663,1.334,1387.0,35.27GB,0.0056,0.0577,0.0294,0.2588 +parallel,20:35:42.580,8160,0.0070,0.0069,4.53e-06,0.3692,0.5591,0.1415,1.428,1288.8,35.27GB,0.0206,0.1774,0.1090,0.5145 +parallel,20:35:49.046,8170,0.0061,0.0073,4.51e-06,0.4206,0.5309,0.1157,1.547,1183.3,35.27GB,0.0041,0.0513,0.0215,0.2099 +parallel,20:35:57.154,8180,0.0195,0.0118,4.49e-06,0.5478,0.6453,0.1655,1.233,1475.5,35.27GB,0.0216,0.1245,0.1140,0.6774 +parallel,20:36:03.874,8190,0.0042,0.0081,4.47e-06,0.4320,0.5572,0.1147,1.489,1215.9,35.27GB,0.0051,0.0694,0.0269,0.2673 +parallel,20:36:09.973,8200,0.0104,0.0137,4.45e-06,0.4402,0.5244,0.0856,1.640,1097.8,35.27GB,0.0070,0.0731,0.0365,0.3645 +parallel,20:36:15.943,8210,0.0040,0.0091,4.43e-06,0.4173,0.5171,0.0799,1.675,1068.5,35.27GB,0.0081,0.0907,0.0408,0.3357 +parallel,20:36:21.711,8220,0.0194,0.0087,4.40e-06,0.4105,0.5061,0.0707,1.734,1026.4,35.27GB,0.0154,0.1703,0.0804,0.7226 +parallel,20:36:27.821,8230,0.0038,0.0086,4.38e-06,0.4568,0.5395,0.0715,1.637,1081.3,35.27GB,0.0075,0.0812,0.0402,0.2119 +parallel,20:36:33.250,8240,0.0031,0.0083,4.36e-06,0.3896,0.4752,0.0677,1.842,955.3,35.27GB,0.0081,0.0751,0.0420,0.2647 +parallel,20:36:38.917,8250,0.0035,0.0167,4.34e-06,0.3837,0.5065,0.0602,1.765,991.6,35.27GB,0.0146,0.1121,0.0776,0.3619 +parallel,20:36:44.441,8260,0.0041,0.0104,4.32e-06,0.4015,0.4800,0.0723,1.811,961.0,35.27GB,0.0095,0.1489,0.0540,0.3344 +parallel,20:36:50.204,8270,0.0047,0.0129,4.30e-06,0.4335,0.5055,0.0709,1.735,997.0,35.27GB,0.0136,0.0965,0.0720,0.4575 +parallel,20:36:56.076,8280,0.0021,0.0076,4.28e-06,0.3813,0.5261,0.0611,1.703,1009.8,35.27GB,0.0044,0.0521,0.0196,0.2249 +parallel,20:37:01.482,8290,0.0016,0.0047,4.26e-06,0.3508,0.4813,0.0593,1.850,924.2,35.27GB,0.0059,0.0544,0.0281,0.1998 +parallel,20:37:07.393,8300,0.0565,0.0142,4.24e-06,0.4685,0.5199,0.0712,1.692,1004.7,35.27GB,0.0258,0.1741,0.1327,0.4978 +parallel,20:37:13.460,8310,0.0073,0.0144,4.22e-06,0.3912,0.5258,0.0809,1.649,1025.0,35.27GB,0.0074,0.0760,0.0358,0.2565 +parallel,20:37:19.855,8320,0.0251,0.0112,4.20e-06,0.3516,0.5447,0.0948,1.564,1074.1,35.27GB,0.0123,0.1011,0.0624,0.5526 +parallel,20:37:26.363,8330,0.0062,0.0096,4.18e-06,0.4463,0.5561,0.0948,1.537,1086.9,35.27GB,0.0049,0.0628,0.0274,0.2531 +parallel,20:37:32.418,8340,0.0083,0.0096,4.16e-06,0.4656,0.5118,0.0937,1.652,1004.9,35.27GB,0.0106,0.1455,0.0572,0.4118 +parallel,20:37:39.052,8350,0.0097,0.0109,4.14e-06,0.3877,0.5689,0.0946,1.507,1094.5,35.27GB,0.0072,0.0848,0.0375,0.2870 +parallel,20:37:44.659,8360,0.0055,0.0103,4.12e-06,0.4472,0.4880,0.0726,1.784,919.2,35.27GB,0.0052,0.0642,0.0239,0.2153 +parallel,20:37:51.092,8370,0.0051,0.0116,4.11e-06,0.3898,0.5578,0.0856,1.555,1048.5,35.27GB,0.0086,0.0860,0.0445,0.3610 +parallel,20:37:57.726,8380,0.0053,0.0100,4.09e-06,0.3867,0.5682,0.0952,1.508,1074.6,35.27GB,0.0150,0.1103,0.0783,0.2994 +parallel,20:38:04.262,8390,0.0015,0.0136,4.07e-06,0.5097,0.5429,0.1107,1.530,1052.1,35.27GB,0.0215,0.1188,0.1087,0.5643 +parallel,20:38:10.843,8400,0.0026,0.0095,4.05e-06,0.4477,0.5492,0.1089,1.520,1052.8,35.27GB,0.0064,0.0824,0.0343,0.4340 +parallel,20:38:18.667,8410,0.0022,0.0093,4.03e-06,0.4153,0.5938,0.1885,1.278,1243.8,35.27GB,0.0135,0.0984,0.0666,0.3394 +parallel,20:38:27.410,8420,0.0066,0.0103,4.01e-06,0.4226,0.6661,0.2083,1.144,1380.8,35.27GB,0.0057,0.0593,0.0309,0.3302 +parallel,20:38:36.361,8430,0.0521,0.0154,3.99e-06,0.4794,0.6747,0.2204,1.117,1405.0,35.27GB,0.0274,0.1644,0.1468,0.4235 +parallel,20:38:43.109,8440,0.0037,0.0114,3.97e-06,0.3781,0.5587,0.1162,1.482,1052.5,35.27GB,0.0044,0.0368,0.0254,0.3554 +parallel,20:38:49.026,8450,0.0242,0.0137,3.96e-06,0.4392,0.5122,0.0795,1.690,916.9,35.27GB,0.0135,0.1761,0.0712,0.4269 +parallel,20:38:55.167,8460,0.0138,0.0121,3.94e-06,0.3935,0.5257,0.0884,1.629,945.5,35.27GB,0.0197,0.1198,0.1045,0.5744 +parallel,20:39:01.283,8470,0.0033,0.0107,3.92e-06,0.5111,0.5314,0.0803,1.635,935.7,35.27GB,0.0075,0.0953,0.0400,0.4109 +parallel,20:39:08.132,8480,0.0021,0.0083,3.90e-06,0.4472,0.5828,0.1021,1.460,1041.0,35.27GB,0.0088,0.0798,0.0439,0.4132 +parallel,20:39:14.398,8490,0.0019,0.0086,3.88e-06,0.4533,0.5290,0.0976,1.596,945.9,35.27GB,0.0053,0.0720,0.0289,0.3330 +parallel,20:39:20.634,8500,0.0052,0.0079,3.87e-06,0.4030,0.5178,0.1058,1.604,935.2,35.27GB,0.0140,0.1440,0.0765,0.4453 +parallel,20:39:26.057,8510,0.0022,0.0120,3.85e-06,0.4505,0.4760,0.0663,1.844,807.9,35.27GB,0.0157,0.1039,0.0817,0.3303 +parallel,20:39:31.861,8520,0.0064,0.0105,3.83e-06,0.4188,0.5022,0.0782,1.723,858.7,35.27GB,0.0070,0.0649,0.0361,0.2106 +parallel,20:39:38.045,8530,0.0020,0.0152,3.81e-06,0.4499,0.5422,0.0762,1.617,908.9,35.27GB,0.0119,0.1132,0.0590,0.3243 +parallel,20:39:43.635,8540,0.0036,0.0106,3.80e-06,0.4003,0.4758,0.0832,1.789,816.1,35.27GB,0.0410,0.1498,0.2198,0.4217 +parallel,20:39:49.345,8550,0.0125,0.0077,3.78e-06,0.3812,0.4914,0.0797,1.751,827.9,35.27GB,0.0082,0.1025,0.0439,0.4480 +parallel,20:39:55.550,8560,0.0057,0.0084,3.76e-06,0.4088,0.5161,0.1043,1.612,893.4,35.27GB,0.0125,0.1048,0.0693,0.3397 +parallel,20:40:02.277,8570,0.0016,0.0096,3.74e-06,0.4043,0.5483,0.1244,1.487,961.7,35.27GB,0.0052,0.0737,0.0253,0.2957 +parallel,20:40:09.068,8580,0.0048,0.0063,3.73e-06,0.4443,0.5625,0.1166,1.473,964.2,35.27GB,0.0074,0.1049,0.0386,0.4959 +parallel,20:40:14.965,8590,0.0015,0.0088,3.71e-06,0.3763,0.4868,0.1029,1.696,831.4,35.27GB,0.0068,0.0807,0.0348,0.2434 +parallel,20:40:20.984,8600,0.0059,0.0087,3.69e-06,0.4555,0.5116,0.0904,1.662,842.6,35.27GB,0.0064,0.0811,0.0367,0.4083 +parallel,20:40:27.194,8610,0.0020,0.0086,3.68e-06,0.4803,0.5169,0.1041,1.610,863.1,35.27GB,0.0132,0.0974,0.0674,0.4539 +parallel,20:40:32.832,8620,0.0034,0.0083,3.66e-06,0.4575,0.4861,0.0776,1.774,777.8,35.27GB,0.0198,0.1280,0.1091,0.3989 +parallel,20:40:39.025,8630,0.0058,0.0066,3.64e-06,0.3701,0.5327,0.0866,1.615,848.4,35.27GB,0.0097,0.0873,0.0506,0.2788 +parallel,20:40:45.655,8640,0.0040,0.0090,3.63e-06,0.4240,0.5264,0.1366,1.509,901.5,35.27GB,0.0170,0.1252,0.0899,0.3343 +parallel,20:40:52.080,8650,0.0107,0.0136,3.61e-06,0.4537,0.5368,0.1057,1.557,867.3,35.27GB,0.0205,0.1004,0.1048,0.3096 +parallel,20:40:58.829,8660,0.0098,0.0112,3.60e-06,0.3961,0.5362,0.1387,1.482,904.3,35.27GB,0.0104,0.0859,0.0555,0.3343 +parallel,20:41:07.392,8670,0.0062,0.0094,3.58e-06,0.4859,0.6668,0.1895,1.168,1138.7,35.27GB,0.0165,0.1002,0.0851,0.4366 +parallel,20:41:15.514,8680,0.0016,0.0077,3.56e-06,0.4620,0.6494,0.1628,1.231,1071.9,35.27GB,0.0074,0.1212,0.0394,0.4374 +parallel,20:41:22.963,8690,0.0078,0.0089,3.55e-06,0.4131,0.5907,0.1542,1.343,975.6,35.27GB,0.0149,0.1476,0.0826,0.4484 +parallel,20:41:30.711,8700,0.0658,0.0146,3.53e-06,0.4370,0.6110,0.1637,1.291,1007.1,35.27GB,0.0302,0.1568,0.1587,0.5166 +parallel,20:41:36.763,8710,0.0051,0.0158,3.52e-06,0.4729,0.4935,0.1117,1.653,780.6,35.27GB,0.0046,0.0517,0.0227,0.2867 +parallel,20:41:43.436,8720,0.0039,0.0098,3.50e-06,0.5053,0.5281,0.1392,1.499,854.0,35.27GB,0.0050,0.0737,0.0230,0.1347 +parallel,20:41:49.501,8730,0.0027,0.0089,3.49e-06,0.4889,0.5220,0.0846,1.649,770.2,35.27GB,0.0197,0.1773,0.1080,0.4171 +parallel,20:41:55.909,8740,0.0138,0.0090,3.47e-06,0.3856,0.5292,0.1117,1.561,807.4,35.27GB,0.0113,0.0892,0.0591,0.3401 +parallel,20:42:03.022,8750,0.0066,0.0117,3.46e-06,0.4564,0.5976,0.1136,1.406,889.0,35.27GB,0.0109,0.0921,0.0553,0.4222 +parallel,20:42:08.971,8760,0.0076,0.0109,3.44e-06,0.3987,0.4992,0.0957,1.681,737.5,35.27GB,0.0092,0.0794,0.0464,0.5099 +parallel,20:42:15.257,8770,0.0046,0.0101,3.43e-06,0.4024,0.5240,0.1046,1.591,773.1,35.27GB,0.0091,0.0825,0.0499,0.5755 +parallel,20:42:22.529,8780,0.0078,0.0106,3.41e-06,0.4681,0.6108,0.1163,1.375,887.0,35.27GB,0.0054,0.0637,0.0261,0.3466 +parallel,20:42:28.754,8790,0.0142,0.0170,3.40e-06,0.5597,0.5163,0.1062,1.607,753.1,35.27GB,0.0098,0.1108,0.0529,0.3256 +parallel,20:42:35.348,8800,0.0067,0.0148,3.38e-06,0.4740,0.5462,0.1132,1.517,791.1,35.27GB,0.0066,0.1152,0.0340,0.7290 +parallel,20:42:40.969,8810,0.0156,0.0122,3.37e-06,0.4279,0.4755,0.0867,1.779,668.8,35.27GB,0.0178,0.1708,0.0944,0.5746 +parallel,20:42:47.108,8820,0.0225,0.0144,3.35e-06,0.4679,0.5073,0.1066,1.629,724.3,35.27GB,0.0376,0.2192,0.2105,0.6616 +parallel,20:42:52.784,8830,0.0039,0.0080,3.34e-06,0.3591,0.5026,0.0650,1.762,663.9,35.27GB,0.0040,0.0502,0.0217,0.2223 +parallel,20:42:59.054,8840,0.0048,0.0090,3.33e-06,0.4082,0.5235,0.1035,1.595,727.1,35.27GB,0.0086,0.0873,0.0459,0.3215 +parallel,20:43:05.360,8850,0.0072,0.0082,3.31e-06,0.4621,0.5117,0.1189,1.586,725.0,35.27GB,0.0097,0.0918,0.0494,0.3891 +parallel,20:43:12.223,8860,0.0018,0.0063,3.30e-06,0.3892,0.5351,0.1512,1.457,782.3,35.27GB,0.0141,0.1198,0.0769,0.4590 +parallel,20:43:19.462,8870,0.0081,0.0079,3.28e-06,0.3240,0.5695,0.1544,1.381,818.0,35.27GB,0.0055,0.0626,0.0301,0.2093 +parallel,20:43:26.935,8880,0.0030,0.0096,3.27e-06,0.3553,0.6109,0.1364,1.338,836.8,35.27GB,0.0043,0.0794,0.0219,0.3047 +parallel,20:43:33.487,8890,0.0092,0.0086,3.26e-06,0.4404,0.5336,0.1217,1.526,727.2,35.27GB,0.0057,0.0634,0.0271,0.3575 +parallel,20:43:40.128,8900,0.0044,0.0085,3.24e-06,0.4341,0.5441,0.1199,1.506,730.4,35.27GB,0.0368,0.1866,0.1844,0.6679 +parallel,20:43:46.702,8910,0.0096,0.0088,3.23e-06,0.4798,0.5414,0.1160,1.521,716.5,35.27GB,0.0140,0.1215,0.0741,0.4035 +parallel,20:43:53.807,8920,0.0018,0.0052,3.22e-06,0.3492,0.5811,0.1293,1.408,767.1,35.27GB,0.0144,0.0852,0.0743,0.2822 +parallel,20:44:01.284,8930,0.0066,0.0079,3.20e-06,0.3814,0.6142,0.1335,1.339,799.4,35.27GB,0.0089,0.0832,0.0460,0.3588 +parallel,20:44:07.850,8940,0.0025,0.0058,3.19e-06,0.3685,0.5539,0.1027,1.523,695.8,35.27GB,0.0049,0.0467,0.0248,0.2722 +parallel,20:44:14.272,8950,0.0124,0.0078,3.18e-06,0.6050,0.5542,0.0880,1.557,674.2,35.27GB,0.0375,0.2056,0.1992,0.6311 +parallel,20:44:22.046,8960,0.0044,0.0079,3.17e-06,0.3734,0.5999,0.1775,1.286,808.4,35.27GB,0.0054,0.0677,0.0282,0.3855 +parallel,20:44:30.330,8970,0.0021,0.0063,3.15e-06,0.4070,0.6426,0.1858,1.207,853.1,35.27GB,0.0035,0.0434,0.0176,0.1717 +parallel,20:44:37.365,8980,0.0010,0.0066,3.14e-06,0.4150,0.5838,0.1198,1.422,717.5,35.27GB,0.0077,0.0876,0.0423,0.5295 +parallel,20:44:43.174,8990,0.0036,0.0063,3.13e-06,0.4070,0.4999,0.0810,1.722,586.6,35.27GB,0.0093,0.0644,0.0495,0.6687 +parallel,20:44:49.237,9000,0.0037,0.0099,3.12e-06,0.3671,0.5221,0.0841,1.650,606.2,35.27GB,0.0202,0.1327,0.1122,0.3523 +parallel,20:46:11.674,9010,0.0071,0.0079,3.10e-06,0.3771,0.4996,7.7441,0.121,8161.1,35.27GB,0.0045,0.0531,0.0239,0.3796 +parallel,20:46:17.733,9020,0.0027,0.0117,3.09e-06,0.4117,0.5231,0.0828,1.651,593.7,35.27GB,0.0037,0.0446,0.0189,0.1786 +parallel,20:46:23.715,9030,0.0037,0.0108,3.08e-06,0.3428,0.5318,0.0665,1.672,580.2,35.27GB,0.0375,0.1373,0.1893,0.4952 +parallel,20:46:28.998,9040,0.0077,0.0077,3.07e-06,0.3249,0.4639,0.0643,1.894,507.0,35.27GB,0.0063,0.0635,0.0327,0.2980 +parallel,20:46:34.665,9050,0.0054,0.0078,3.06e-06,0.3706,0.5022,0.0645,1.765,538.3,35.27GB,0.0056,0.0554,0.0333,0.3025 +parallel,20:46:40.232,9060,0.0212,0.0123,3.05e-06,0.4736,0.4848,0.0720,1.796,523.3,35.27GB,0.0121,0.0912,0.0663,0.3810 +parallel,20:46:46.595,9070,0.0095,0.0102,3.03e-06,0.4281,0.5376,0.0988,1.572,591.7,35.27GB,0.0106,0.0723,0.0548,0.4284 +parallel,20:46:53.589,9080,0.0031,0.0133,3.02e-06,0.3872,0.5790,0.1204,1.430,643.4,35.27GB,0.0040,0.0649,0.0205,0.2206 +parallel,20:46:59.469,9090,0.0040,0.0094,3.01e-06,0.3644,0.4954,0.0926,1.701,534.9,35.27GB,0.0095,0.1003,0.0489,0.3482 +parallel,20:47:05.766,9100,0.0095,0.0066,3.00e-06,0.3812,0.5208,0.1089,1.588,566.6,35.27GB,0.0425,0.1630,0.2288,0.4628 +parallel,20:47:12.241,9110,0.0044,0.0042,2.99e-06,0.3391,0.5376,0.1100,1.545,576.2,35.27GB,0.0086,0.0946,0.0444,0.4766 +parallel,20:47:17.929,9120,0.0019,0.0160,2.98e-06,0.4887,0.4833,0.0855,1.758,500.4,35.27GB,0.0251,0.1270,0.1328,0.4558 +parallel,20:47:23.768,9130,0.0147,0.0167,2.97e-06,0.4619,0.5086,0.0752,1.713,507.8,35.27GB,0.0138,0.0887,0.0736,0.7799 +parallel,20:47:29.689,9140,0.0137,0.0121,2.96e-06,0.4123,0.4959,0.0962,1.689,509.1,35.27GB,0.0244,0.1624,0.1315,0.4914 +parallel,20:47:35.981,9150,0.0031,0.0091,2.95e-06,0.3587,0.5291,0.1001,1.590,534.7,35.27GB,0.0068,0.0576,0.0352,0.1439 +parallel,20:47:41.491,9160,0.0087,0.0084,2.94e-06,0.4298,0.4616,0.0895,1.815,462.8,35.27GB,0.0051,0.0480,0.0294,0.4031 +parallel,20:47:47.715,9170,0.0068,0.0085,2.93e-06,0.4503,0.5115,0.1108,1.607,516.5,35.27GB,0.0067,0.0711,0.0332,0.2030 +parallel,20:47:55.435,9180,0.0092,0.0101,2.92e-06,0.4310,0.5902,0.1818,1.295,633.0,35.27GB,0.0099,0.0855,0.0554,0.3096 +parallel,20:48:02.264,9190,0.0115,0.0090,2.91e-06,0.3980,0.5593,0.1237,1.465,553.0,35.27GB,0.0101,0.0758,0.0537,0.3365 +parallel,20:48:09.249,9200,0.0018,0.0059,2.90e-06,0.3674,0.5812,0.1173,1.432,558.7,35.27GB,0.0055,0.0539,0.0331,0.2594 +parallel,20:48:15.448,9210,0.0065,0.0049,2.89e-06,0.4866,0.5180,0.1019,1.614,489.6,35.27GB,0.0090,0.0862,0.0447,0.3653 +parallel,20:48:21.362,9220,0.0045,0.0068,2.88e-06,0.3791,0.5084,0.0830,1.691,461.2,35.27GB,0.0030,0.0445,0.0142,0.1731 +parallel,20:48:28.499,9230,0.0045,0.0132,2.87e-06,0.3952,0.5755,0.1382,1.401,549.4,35.27GB,0.0102,0.0763,0.0495,0.2698 +parallel,20:48:35.000,9240,0.0038,0.0090,2.86e-06,0.3649,0.5200,0.1301,1.538,494.0,35.27GB,0.0080,0.0609,0.0391,0.3390 +parallel,20:48:40.876,9250,0.0053,0.0082,2.85e-06,0.3301,0.4974,0.0902,1.702,440.6,35.27GB,0.0047,0.0521,0.0251,0.3289 +parallel,20:48:46.797,9260,0.0035,0.0112,2.84e-06,0.4104,0.5088,0.0833,1.689,438.1,35.27GB,0.0104,0.0817,0.0532,0.6380 +parallel,20:48:53.700,9270,0.0064,0.0090,2.83e-06,0.4049,0.5568,0.1335,1.449,503.8,35.27GB,0.0072,0.0615,0.0357,0.2373 +parallel,20:49:01.139,9280,0.0057,0.0084,2.82e-06,0.4341,0.6150,0.1290,1.344,535.6,35.27GB,0.0130,0.0981,0.0670,0.4208 +parallel,20:49:07.465,9290,0.0028,0.0069,2.81e-06,0.3524,0.5226,0.1099,1.581,449.0,35.27GB,0.0036,0.0414,0.0185,0.2138 +parallel,20:49:13.164,9300,0.0082,0.0131,2.80e-06,0.4153,0.5028,0.0671,1.755,398.8,35.27GB,0.0062,0.0780,0.0318,0.2874 +parallel,20:49:20.254,9310,0.0072,0.0095,2.80e-06,0.4813,0.5649,0.1442,1.411,489.2,35.27GB,0.0222,0.1556,0.1264,0.4084 +parallel,20:49:26.093,9320,0.0034,0.0114,2.79e-06,0.5052,0.5061,0.0778,1.713,397.0,35.27GB,0.0245,0.1907,0.1287,0.5633 +parallel,20:49:34.168,9330,0.0096,0.0134,2.78e-06,0.4352,0.6530,0.1545,1.238,541.0,35.27GB,0.0292,0.1575,0.1561,0.5135 +parallel,20:49:42.642,9340,0.0042,0.0091,2.77e-06,0.4074,0.6193,0.2281,1.180,559.2,35.27GB,0.0055,0.0589,0.0276,0.3097 +parallel,20:49:52.236,9350,0.0107,0.0076,2.76e-06,0.3512,0.6679,0.2915,1.044,622.7,35.27GB,0.0089,0.0800,0.0477,0.4681 +parallel,20:50:01.674,9360,0.0073,0.0078,2.76e-06,0.4062,0.6462,0.2976,1.060,604.0,35.27GB,0.0097,0.0784,0.0509,0.3262 +parallel,20:50:11.809,9370,0.0054,0.0073,2.75e-06,0.3689,0.7135,0.3001,0.988,637.5,35.27GB,0.0041,0.0506,0.0215,0.1985 +parallel,20:50:20.829,9380,0.0017,0.0057,2.74e-06,0.4152,0.6749,0.2270,1.109,559.2,35.27GB,0.0035,0.0470,0.0159,0.1556 +parallel,20:50:30.216,9390,0.0191,0.0078,2.73e-06,0.5139,0.5566,0.3821,1.067,571.8,35.27GB,0.0197,0.1280,0.1040,0.4927 +parallel,20:50:39.588,9400,0.0057,0.0113,2.72e-06,0.3825,0.6667,0.2705,1.071,560.3,35.27GB,0.0062,0.0718,0.0302,0.3045 +parallel,20:50:51.735,9410,0.0126,0.0085,2.72e-06,0.3883,0.7344,0.4803,0.824,715.7,35.27GB,0.0247,0.1466,0.1244,0.5192 +parallel,20:51:01.595,9420,0.0063,0.0065,2.71e-06,0.4394,0.7633,0.2227,1.016,570.8,35.27GB,0.0044,0.0612,0.0234,0.3507 +parallel,20:51:14.685,9430,0.0111,0.0080,2.70e-06,0.4254,0.8938,0.4152,0.764,746.1,35.27GB,0.0207,0.1248,0.1081,0.4637 +parallel,20:51:23.842,9440,0.0041,0.0092,2.70e-06,0.4012,0.7034,0.2122,1.095,511.5,35.27GB,0.0047,0.0604,0.0259,0.1900 +parallel,20:51:30.579,9450,0.0020,0.0075,2.69e-06,0.4015,0.5494,0.1243,1.484,370.5,35.27GB,0.0056,0.0641,0.0278,0.2417 +parallel,20:51:37.928,9460,0.0218,0.0106,2.68e-06,0.4419,0.5730,0.1619,1.361,396.8,35.27GB,0.0134,0.0915,0.0700,0.4239 +parallel,20:51:45.970,9470,0.0082,0.0076,2.68e-06,0.4498,0.6410,0.1632,1.244,426.2,35.27GB,0.0087,0.0794,0.0430,0.2719 +parallel,20:51:54.368,9480,0.0181,0.0083,2.67e-06,0.4671,0.6686,0.1712,1.193,435.8,35.27GB,0.0167,0.1036,0.0846,0.5053 +parallel,20:52:03.172,9490,0.0317,0.0104,2.66e-06,0.4732,0.6925,0.1879,1.136,448.7,35.27GB,0.0095,0.1255,0.0457,0.9856 +parallel,20:52:11.339,9500,0.0037,0.0093,2.66e-06,0.4200,0.6087,0.2080,1.226,407.7,35.27GB,0.0032,0.0491,0.0176,0.2581 +parallel,20:52:19.546,9510,0.0344,0.0119,2.65e-06,0.5275,0.6152,0.2054,1.221,401.3,35.27GB,0.0236,0.1308,0.1239,0.4343 +parallel,20:52:26.738,9520,0.0064,0.0093,2.64e-06,0.3722,0.6031,0.1161,1.391,345.1,35.27GB,0.0047,0.0516,0.0265,0.2911 +parallel,20:52:35.061,9530,0.0044,0.0080,2.64e-06,0.3223,0.6835,0.1488,1.202,391.1,35.27GB,0.0032,0.0453,0.0175,0.1596 +parallel,20:52:42.054,9540,0.0006,0.0106,2.63e-06,0.4292,0.5674,0.1319,1.432,321.3,35.27GB,0.0178,0.1134,0.0911,0.5885 +parallel,20:52:50.641,9550,0.0036,0.0084,2.63e-06,0.4260,0.6543,0.2044,1.168,385.3,35.27GB,0.0072,0.0948,0.0383,0.2960 +parallel,20:52:57.109,9560,0.0075,0.0091,2.62e-06,0.4058,0.5468,0.1000,1.546,284.6,35.27GB,0.0259,0.2377,0.1414,0.5979 +parallel,20:53:03.581,9570,0.0028,0.0085,2.62e-06,0.3999,0.5431,0.1040,1.545,278.2,35.27GB,0.0181,0.1213,0.0920,0.4166 +parallel,20:53:10.553,9580,0.0023,0.0069,2.61e-06,0.3991,0.5929,0.1043,1.435,292.8,35.27GB,0.0330,0.1326,0.1745,0.4861 +parallel,20:53:17.176,9590,0.0062,0.0065,2.61e-06,0.4512,0.5469,0.1154,1.510,271.5,35.27GB,0.0176,0.1116,0.0979,0.5571 +parallel,20:53:23.228,9600,0.0082,0.0077,2.60e-06,0.3632,0.5268,0.0784,1.653,242.0,35.27GB,0.0084,0.0841,0.0438,0.3796 +parallel,20:53:29.143,9610,0.0072,0.0065,2.60e-06,0.3360,0.5053,0.0863,1.691,230.7,35.27GB,0.0051,0.0544,0.0262,0.2211 +parallel,20:53:36.133,9620,0.0081,0.0076,2.59e-06,0.3703,0.5863,0.1127,1.431,265.6,35.27GB,0.0152,0.1007,0.0794,0.4483 +parallel,20:53:43.900,9630,0.0011,0.0075,2.59e-06,0.3769,0.6311,0.1455,1.288,287.3,35.27GB,0.0029,0.0367,0.0156,0.2092 +parallel,20:53:50.541,9640,0.0043,0.0094,2.58e-06,0.5049,0.5545,0.1096,1.506,239.0,35.27GB,0.0212,0.1238,0.1185,0.4487 +parallel,20:53:57.571,9650,0.0058,0.0075,2.58e-06,0.3808,0.5825,0.1204,1.423,246.0,35.27GB,0.0052,0.0695,0.0270,0.2840 +parallel,20:54:04.713,9660,0.0168,0.0081,2.57e-06,0.4655,0.5711,0.1432,1.400,242.8,35.27GB,0.0306,0.1874,0.1749,0.6150 +parallel,20:54:13.468,9670,0.0110,0.0152,2.57e-06,0.4949,0.6812,0.1942,1.142,288.9,35.27GB,0.0107,0.0753,0.0563,0.2774 +parallel,20:54:21.278,9680,0.0014,0.0086,2.57e-06,0.2976,0.6411,0.1399,1.280,249.9,35.27GB,0.0045,0.0598,0.0240,0.1726 +parallel,20:54:29.300,9690,0.0030,0.0189,2.56e-06,0.4810,0.6373,0.1648,1.247,248.6,35.27GB,0.0055,0.0607,0.0290,0.4035 +parallel,20:54:36.541,9700,0.0032,0.0102,2.56e-06,0.3934,0.5738,0.1503,1.381,217.2,35.27GB,0.0073,0.0822,0.0371,0.3415 +parallel,20:54:43.096,9710,0.0044,0.0103,2.55e-06,0.4788,0.5345,0.1210,1.526,190.1,35.27GB,0.0137,0.1480,0.0762,0.3811 +parallel,20:54:50.386,9720,0.0027,0.0070,2.55e-06,0.3853,0.5807,0.1482,1.372,204.0,35.27GB,0.0416,0.1354,0.2256,0.4320 +parallel,20:54:58.642,9730,0.0006,0.0077,2.55e-06,0.4474,0.6766,0.1491,1.211,222.9,35.27GB,0.0219,0.1055,0.1161,0.5476 +parallel,20:55:05.937,9740,0.0017,0.0056,2.54e-06,0.4352,0.5722,0.1573,1.371,189.6,35.27GB,0.0175,0.0902,0.0909,0.3516 +parallel,20:55:13.729,9750,0.0016,0.0054,2.54e-06,0.3373,0.6085,0.1707,1.284,194.8,35.27GB,0.0087,0.0727,0.0456,0.1877 +parallel,20:55:20.801,9760,0.0024,0.0073,2.54e-06,0.4394,0.5525,0.1548,1.415,169.6,35.27GB,0.0338,0.1306,0.1797,0.4530 +parallel,20:55:28.817,9770,0.0143,0.0132,2.53e-06,0.4731,0.6265,0.1750,1.248,184.3,35.27GB,0.0055,0.0887,0.0298,0.2931 +parallel,20:55:36.056,9780,0.0069,0.0093,2.53e-06,0.3906,0.5995,0.1245,1.381,159.3,35.27GB,0.0220,0.1355,0.1190,0.3965 +parallel,20:55:42.836,9790,0.0055,0.0086,2.53e-06,0.4834,0.5613,0.1167,1.475,142.4,35.27GB,0.0047,0.0611,0.0239,0.2492 +parallel,20:55:50.619,9800,0.0010,0.0174,2.53e-06,0.3383,0.6231,0.1551,1.285,155.6,35.27GB,0.0072,0.0659,0.0369,0.3727 +parallel,20:55:58.054,9810,0.0280,0.0131,2.52e-06,0.4536,0.6109,0.1326,1.345,141.3,35.27GB,0.0152,0.1061,0.0787,0.6149 +parallel,20:56:06.072,9820,0.0159,0.0107,2.52e-06,0.4589,0.6155,0.1863,1.247,144.3,35.27GB,0.0066,0.0850,0.0349,0.3753 +parallel,20:56:15.284,9830,0.0012,0.0100,2.52e-06,0.3746,0.7067,0.2145,1.086,156.6,35.27GB,0.0060,0.0483,0.0323,0.1940 +parallel,20:56:22.620,9840,0.0058,0.0081,2.52e-06,0.4139,0.5723,0.1612,1.363,117.4,35.27GB,0.0111,0.1267,0.0570,0.3986 +parallel,20:56:30.336,9850,0.0074,0.0088,2.51e-06,0.4130,0.6382,0.1334,1.296,115.7,35.27GB,0.0111,0.0930,0.0578,0.3258 +parallel,20:56:37.828,9860,0.0123,0.0086,2.51e-06,0.3696,0.6094,0.1398,1.335,104.9,35.27GB,0.0114,0.0773,0.0634,0.3325 +parallel,20:56:45.445,9870,0.0050,0.0069,2.51e-06,0.5057,0.6025,0.1592,1.313,99.0,35.27GB,0.0189,0.1746,0.1006,0.6312 +parallel,20:56:52.650,9880,0.0041,0.0062,2.51e-06,0.4778,0.6105,0.1100,1.388,86.5,35.27GB,0.0380,0.2231,0.2347,0.6008 +parallel,20:57:00.692,9890,0.0051,0.0080,2.51e-06,0.4368,0.6293,0.1749,1.244,88.5,35.27GB,0.0176,0.1279,0.0924,0.4272 +parallel,20:57:08.112,9900,0.0275,0.0116,2.51e-06,0.4404,0.6005,0.1415,1.348,74.2,35.27GB,0.0117,0.0780,0.0662,0.5700 +parallel,20:57:14.792,9910,0.0895,0.0169,2.51e-06,0.3966,0.5523,0.1157,1.497,60.1,35.27GB,0.0340,0.1855,0.1967,0.4599 +parallel,20:57:22.734,9920,0.0010,0.0121,2.50e-06,0.4060,0.6385,0.1557,1.291,62.0,35.27GB,0.0168,0.1197,0.0913,0.2734 +parallel,20:57:30.526,9930,0.0087,0.0116,2.50e-06,0.3961,0.6442,0.1350,1.284,54.5,35.27GB,0.0067,0.0760,0.0369,0.2789 +parallel,20:57:37.261,9940,0.0048,0.0093,2.50e-06,0.5094,0.5468,0.1266,1.485,40.4,35.27GB,0.0169,0.0987,0.0887,0.4413 +parallel,20:57:44.136,9950,0.0041,0.0119,2.50e-06,0.5186,0.5790,0.1085,1.455,34.4,35.27GB,0.0113,0.1369,0.0550,0.9936 +parallel,20:57:51.812,9960,0.0336,0.0161,2.50e-06,0.4095,0.6125,0.1551,1.303,30.7,35.27GB,0.0322,0.1669,0.1656,0.8340 +parallel,20:57:59.813,9970,0.0142,0.0113,2.50e-06,0.4276,0.6209,0.1792,1.250,24.0,35.27GB,0.0120,0.1224,0.0636,0.4228 +parallel,20:58:07.396,9980,0.0031,0.0095,2.50e-06,0.3769,0.6279,0.1305,1.319,15.2,35.27GB,0.0020,0.0264,0.0107,0.1136 +parallel,20:58:14.664,9990,0.0030,0.0066,2.50e-06,0.4428,0.6006,0.1262,1.376,7.3,35.27GB,0.0084,0.0666,0.0445,0.4760 +parallel,20:58:23.797,10000,0.0140,0.0169,2.50e-06,0.4269,0.6919,0.2213,1.095,0.0,35.27GB,0.0121,0.1277,0.0634,0.4071 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/runtime_table.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/runtime_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..514d0ad652b3172ad7972eb76b296fdcc96a9f33 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/runtime_table.csv @@ -0,0 +1,12 @@ +stage,start_utc,end_utc,duration_seconds,duration_hms +baseline_train,2026-03-09 16:03:23 UTC,2026-03-09 18:17:03 UTC,8020,2:13:40 +baseline_eval_1000,2026-03-09 18:17:03 UTC,2026-03-09 18:23:42 UTC,399,0:06:39 +baseline_eval_2000,2026-03-09 18:23:42 UTC,2026-03-09 18:28:54 UTC,312,0:05:12 +baseline_eval_5000,2026-03-09 18:28:54 UTC,2026-03-09 18:33:53 UTC,299,0:04:59 +baseline_eval_10000,2026-03-09 18:33:53 UTC,2026-03-09 18:41:07 UTC,434,0:07:14 +parallel_train,2026-03-09 18:41:07 UTC,2026-03-09 21:01:58 UTC,8451,2:20:51 +parallel_eval_1000,2026-03-09 21:01:58 UTC,2026-03-09 21:14:35 UTC,757,0:12:37 +parallel_eval_2000,2026-03-09 21:14:35 UTC,2026-03-09 21:22:39 UTC,484,0:08:04 +parallel_eval_5000,2026-03-09 21:22:40 UTC,2026-03-09 21:35:26 UTC,766,0:12:46 +parallel_eval_10000,2026-03-09 21:35:26 UTC,2026-03-09 21:45:53 UTC,627,0:10:27 +full_pipeline,2026-03-09 15:57:20 UTC,2026-03-09 21:45:53 UTC,20913,5:48:33 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/sample_eval_table.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/sample_eval_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..4f5e33f50377be2dadbc68b1a59e2d223f2ba422 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/sample_eval_table.csv @@ -0,0 +1,17 @@ +model,checkpoint_step,num_steps,num_batches,mean_masked_mae,std_masked_mae,mean_left_arm_mae,std_left_arm_mae,mean_right_arm_mae,std_right_arm_mae,mean_left_joint_mae,std_left_joint_mae,mean_left_gripper_mae,std_left_gripper_mae,mean_right_joint_mae,std_right_joint_mae,mean_right_gripper_mae,std_right_gripper_mae,mean_left_right_imbalance_mae,std_left_right_imbalance_mae,per_batch_timing_seconds +baseline,1000,4,16,0.090938,0.02224,0.120414,0.046606,0.061461,0.058026,0.130966,0.054578,0.046552,0.06792,0.063945,0.062779,0.044077,0.053987,0.095076,0.059464,mean=0.3131 std=0.0370 min=0.2649 max=0.3781 +baseline,1000,10,16,0.100992,0.023502,0.132369,0.047803,0.069615,0.063335,0.143677,0.056155,0.053215,0.074232,0.072165,0.068555,0.051764,0.054067,0.101649,0.063159,mean=0.3640 std=0.0430 min=0.3333 max=0.4572 +baseline,2000,4,16,0.060253,0.017936,0.078725,0.032786,0.041781,0.04091,0.083688,0.036089,0.043985,0.072901,0.042767,0.041669,0.034874,0.058769,0.063418,0.039412,mean=0.3006 std=0.0345 min=0.2674 max=0.3753 +baseline,2000,10,16,0.065765,0.016923,0.086375,0.032761,0.045154,0.041131,0.092111,0.036788,0.046224,0.076043,0.046163,0.042138,0.038093,0.056179,0.066659,0.040501,mean=0.3586 std=0.0248 min=0.3396 max=0.4220 +baseline,5000,4,16,0.03972,0.014654,0.049239,0.019869,0.030201,0.034473,0.052215,0.023235,0.028408,0.028427,0.031159,0.037572,0.02349,0.024208,0.04196,0.030152,mean=0.2920 std=0.0342 min=0.2585 max=0.3528 +baseline,5000,10,16,0.043346,0.013818,0.053788,0.020493,0.032904,0.034889,0.057689,0.024439,0.026486,0.029864,0.0337,0.038002,0.027331,0.027093,0.044562,0.030999,mean=0.3951 std=0.0357 min=0.3463 max=0.4774 +baseline,10000,4,16,0.029935,0.0082,0.041062,0.019621,0.018807,0.018117,0.04444,0.02295,0.017416,0.016394,0.0195,0.019305,0.013963,0.019504,0.033733,0.022691,mean=0.2793 std=0.0247 min=0.2625 max=0.3469 +baseline,10000,10,16,0.030294,0.007277,0.041307,0.019181,0.019282,0.019077,0.045179,0.022508,0.014207,0.016425,0.020231,0.020465,0.01264,0.018571,0.034582,0.023261,mean=0.3823 std=0.0398 min=0.3432 max=0.4686 +parallel,1000,4,16,0.09253,0.020956,0.122108,0.04378,0.062952,0.056483,0.133062,0.052111,0.045431,0.055952,0.065476,0.060695,0.04528,0.053039,0.093392,0.056874,mean=0.3110 std=0.0430 min=0.2654 max=0.3864 +parallel,1000,10,16,0.102452,0.022208,0.13361,0.044796,0.071295,0.061523,0.145474,0.053589,0.05056,0.060317,0.073909,0.066406,0.053,0.051143,0.099213,0.060422,mean=0.4143 std=0.0560 min=0.3405 max=0.5017 +parallel,2000,4,16,0.05986,0.012924,0.080984,0.031604,0.038736,0.031293,0.086197,0.035912,0.04449,0.062755,0.039304,0.030982,0.034761,0.051397,0.061196,0.036442,mean=0.3702 std=0.1017 min=0.2793 max=0.7256 +parallel,2000,10,16,0.065897,0.012628,0.088735,0.03201,0.043059,0.032823,0.094654,0.036668,0.047298,0.06466,0.043769,0.032862,0.038089,0.049635,0.064491,0.038643,mean=0.4575 std=0.0902 min=0.3373 max=0.6590 +parallel,5000,4,16,0.040712,0.013646,0.050681,0.020624,0.030742,0.03279,0.053976,0.024153,0.027611,0.02458,0.032227,0.03635,0.020349,0.017496,0.042435,0.029207,mean=0.3861 std=0.0848 min=0.2719 max=0.5485 +parallel,5000,10,16,0.044799,0.012807,0.055016,0.021278,0.034583,0.032757,0.059296,0.025068,0.025058,0.027173,0.035777,0.036454,0.026224,0.01689,0.043614,0.030178,mean=0.4549 std=0.0835 min=0.3373 max=0.6280 +parallel,10000,4,16,0.029277,0.007579,0.040375,0.01919,0.018178,0.015856,0.043636,0.022278,0.017546,0.013485,0.018908,0.017028,0.013066,0.016678,0.031629,0.022404,mean=0.3241 std=0.0551 min=0.2600 max=0.4241 +parallel,10000,10,16,0.030241,0.00674,0.041072,0.018866,0.01941,0.017031,0.044817,0.022046,0.014857,0.014376,0.020279,0.018425,0.013323,0.014475,0.032456,0.022935,mean=0.4058 std=0.0569 min=0.3332 max=0.5100 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/startup_summaries.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/startup_summaries.txt new file mode 100644 index 0000000000000000000000000000000000000000..be1ebdc489ff26cbbb74b446b1b9768f7696c005 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/startup_summaries.txt @@ -0,0 +1,72 @@ +[baseline] +weight_missing_count: 0 (10775:train_pytorch.py:629) +weight_missing_keys: set() (10775:train_pytorch.py:630) +weight_unexpected_count: 0 (10775:train_pytorch.py:631) +weight_unexpected_keys: [] (10775:train_pytorch.py:632) +config_name: pi05_twin_handover_256_packed_baseline_pytorch_10k (10775:train_pytorch.py:280) +dataset_repo_id: lsnu/twin_handover_256_train (10775:train_pytorch.py:281) +norm_stats_file: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (10775:train_pytorch.py:282) +norm_stats_summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (10775:train_pytorch.py:283) +checkpoint_source: /workspace/checkpoints/pi05_base_single_pytorch (10775:train_pytorch.py:284) +model_type: baseline (10775:train_pytorch.py:285) +packed_transforms: True (10775:train_pytorch.py:286) +world_size: 4 (10775:train_pytorch.py:287) +batch_size: local=4, global=16 (10775:train_pytorch.py:288) +num_workers: 8 (10775:train_pytorch.py:289) +precision: bfloat16 (10775:train_pytorch.py:290) +lr_schedule: warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (10775:train_pytorch.py:291) +save_log_intervals: save_interval=1000, log_interval=10 (10775:train_pytorch.py:298) +action_loss_mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (10775:train_pytorch.py:299) +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (10775:train_pytorch.py:300) +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (10775:train_pytorch.py:301) +gradient_buckets: action_in_proj, action_out_proj, shared_expert (10775:train_pytorch.py:694) + +16:06:09.367 [I] debug_step=1 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (10775:train_pytorch.py:799) +16:06:09.368 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (10775:train_pytorch.py:803) +16:06:09.368 [I] debug_step=1 prompt_token_lengths=[74, 72, 76, 78] (10775:train_pytorch.py:806) +16:06:09.368 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0715 std=0.4362 (10775:train_pytorch.py:807) +16:06:09.369 [I] debug_step=1 action_stats min=-1.0000 max=1.0947 mean=0.0331 std=0.4134 (10775:train_pytorch.py:810) +16:06:09.369 [I] debug_step=1 state_nonzero_counts_8d_blocks=[32, 0, 32, 0] action_nonzero_counts_8d_blocks=[512, 0, 512, 0] (10775:train_pytorch.py:813) +16:06:09.390 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=64 actions=1024 (10775:train_pytorch.py:817) +16:06:09.390 [I] debug_step=1 lr=4.99e-08 grad_norm=15.9656 data_time=0.9485s step_time=1.9454s gpu_mem_allocated=28.49GB gpu_mem_reserved=35.24GB gpu_mem_max_allocated=35.23GB gpu_mem_max_reserved=35.24GB (10775:train_pytorch.py:822) +16:06:09.390 [I] debug_step=1 grad_shared_expert=15.5493 grad_action_in_proj=0.4919 grad_action_out_proj=2.1574 (10775:train_pytorch.py:830) +Training: 0%| | 1/10000 [00:02<8:12:34, 2.96s/it, loss=1.4673, lr=4.99e-08, step=1]16:06:10.034 [I] debug_step=2 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (10775:train_pytorch.py:799) +16:06:10.035 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (10775:train_pytorch.py:803) +16:06:10.035 [I] debug_step=2 prompt_token_lengths=[79, 76, 69, 69] (10775:train_pytorch.py:806) + +[parallel] +weight_missing_count: 0 (18633:train_pytorch.py:629) +weight_missing_keys: set() (18633:train_pytorch.py:630) +weight_unexpected_count: 0 (18633:train_pytorch.py:631) +weight_unexpected_keys: [] (18633:train_pytorch.py:632) +config_name: pi05_twin_handover_256_packed_parallel_pytorch_10k (18633:train_pytorch.py:280) +dataset_repo_id: lsnu/twin_handover_256_train (18633:train_pytorch.py:281) +norm_stats_file: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (18633:train_pytorch.py:282) +norm_stats_summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (18633:train_pytorch.py:283) +checkpoint_source: /workspace/checkpoints/pi05_base_parallel_packed_from_single (18633:train_pytorch.py:284) +model_type: parallel (18633:train_pytorch.py:285) +packed_transforms: True (18633:train_pytorch.py:286) +world_size: 4 (18633:train_pytorch.py:287) +batch_size: local=4, global=16 (18633:train_pytorch.py:288) +num_workers: 8 (18633:train_pytorch.py:289) +precision: bfloat16 (18633:train_pytorch.py:290) +lr_schedule: warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (18633:train_pytorch.py:291) +save_log_intervals: save_interval=1000, log_interval=10 (18633:train_pytorch.py:298) +action_loss_mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (18633:train_pytorch.py:299) +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (18633:train_pytorch.py:300) +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (18633:train_pytorch.py:301) +gradient_buckets: action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert (18633:train_pytorch.py:694) + +18:44:34.768 [I] debug_step=1 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (18633:train_pytorch.py:799) +18:44:34.769 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (18633:train_pytorch.py:803) +18:44:34.769 [I] debug_step=1 prompt_token_lengths=[74, 72, 76, 78] (18633:train_pytorch.py:806) +18:44:34.769 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0715 std=0.4362 (18633:train_pytorch.py:807) +18:44:34.770 [I] debug_step=1 action_stats min=-1.0000 max=1.0947 mean=0.0331 std=0.4134 (18633:train_pytorch.py:810) +18:44:34.770 [I] debug_step=1 state_nonzero_counts_8d_blocks=[32, 0, 32, 0] action_nonzero_counts_8d_blocks=[512, 0, 512, 0] (18633:train_pytorch.py:813) +18:44:34.791 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=64 actions=1024 (18633:train_pytorch.py:817) +18:44:34.792 [I] debug_step=1 lr=4.99e-08 grad_norm=16.1250 data_time=0.7232s step_time=2.1776s gpu_mem_allocated=28.53GB gpu_mem_reserved=35.28GB gpu_mem_max_allocated=35.27GB gpu_mem_max_reserved=35.28GB (18633:train_pytorch.py:822) +18:44:34.792 [I] debug_step=1 grad_shared_expert=15.5090 grad_action_in_proj_arms=0.5665 grad_arm_token_fuse=2.6833 grad_action_out_proj_arms=2.1581 (18633:train_pytorch.py:830) +Training: 0%| | 1/10000 [00:02<8:13:44, 2.96s/it, loss=1.4675, lr=4.99e-08, step=1]18:44:35.388 [I] debug_step=2 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (18633:train_pytorch.py:799) +18:44:35.389 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (18633:train_pytorch.py:803) +18:44:35.389 [I] debug_step=2 prompt_token_lengths=[79, 76, 69, 69] (18633:train_pytorch.py:806) + diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/summary.json b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..61fe0107314a34163bf365b347453e2bde145d72 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/summary.json @@ -0,0 +1,1018 @@ +{ + "reference_2k_summary_path": "/workspace/pi05tests-openpi-multiarm/artifacts/twin_handover_packed_parallelization_20260309/metrics/summary.json", + "train": { + "baseline": { + "startup": { + "weight_missing_count": "0 (10775:train_pytorch.py:629)", + "weight_missing_keys": "set() (10775:train_pytorch.py:630)", + "weight_unexpected_count": "0 (10775:train_pytorch.py:631)", + "weight_unexpected_keys": "[] (10775:train_pytorch.py:632)", + "config_name": "pi05_twin_handover_256_packed_baseline_pytorch_10k (10775:train_pytorch.py:280)", + "dataset_repo_id": "lsnu/twin_handover_256_train (10775:train_pytorch.py:281)", + "norm_stats_file": "/workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (10775:train_pytorch.py:282)", + "norm_stats_summary": "{'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (10775:train_pytorch.py:283)", + "checkpoint_source": "/workspace/checkpoints/pi05_base_single_pytorch (10775:train_pytorch.py:284)", + "model_type": "baseline (10775:train_pytorch.py:285)", + "packed_transforms": "True (10775:train_pytorch.py:286)", + "world_size": "4 (10775:train_pytorch.py:287)", + "batch_size": "local=4, global=16 (10775:train_pytorch.py:288)", + "num_workers": "8 (10775:train_pytorch.py:289)", + "precision": "bfloat16 (10775:train_pytorch.py:290)", + "lr_schedule": "warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (10775:train_pytorch.py:291)", + "save_log_intervals": "save_interval=1000, log_interval=10 (10775:train_pytorch.py:298)", + "action_loss_mask": "(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (10775:train_pytorch.py:299)", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (10775:train_pytorch.py:300)", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (10775:train_pytorch.py:301)", + "gradient_buckets": "action_in_proj, action_out_proj, shared_expert (10775:train_pytorch.py:694)" + }, + "steps": { + "1000": { + "ts": "16:16:42.668", + "step": "1000", + "loss": "0.0228", + "smoothed": "0.0476", + "lr": "2.48e-05", + "grad_norm": "0.9699", + "step_time": "0.5638", + "data_time": "0.0801", + "its": "1.553", + "eta": "5793.6", + "mem": "35.23GB", + "grad_action_in_proj": "0.0109", + "grad_action_out_proj": "0.1595", + "grad_shared_expert": "0.4924" + }, + "2000": { + "ts": "16:28:30.872", + "step": "2000", + "loss": "0.0492", + "smoothed": "0.0284", + "lr": "2.37e-05", + "grad_norm": "0.6437", + "step_time": "0.4982", + "data_time": "0.0622", + "its": "1.785", + "eta": "4482.7", + "mem": "35.23GB", + "grad_action_in_proj": "0.0184", + "grad_action_out_proj": "0.2195", + "grad_shared_expert": "0.8358" + }, + "5000": { + "ts": "17:04:21.626", + "step": "5000", + "loss": "0.0038", + "smoothed": "0.0165", + "lr": "1.47e-05", + "grad_norm": "0.5112", + "step_time": "0.4974", + "data_time": "0.0606", + "its": "1.792", + "eta": "2789.7", + "mem": "35.23GB", + "grad_action_in_proj": "0.0101", + "grad_action_out_proj": "0.1353", + "grad_shared_expert": "1.1505" + }, + "10000": { + "ts": "18:15:00.659", + "step": "10000", + "loss": "0.0141", + "smoothed": "0.0172", + "lr": "2.50e-06", + "grad_norm": "0.4377", + "step_time": "0.5241", + "data_time": "0.1210", + "its": "1.550", + "eta": "0.0", + "mem": "35.23GB", + "grad_action_in_proj": "0.0125", + "grad_action_out_proj": "0.1342", + "grad_shared_expert": "0.4184" + } + }, + "saves": { + "1000": { + "timestamp": "16:18:02.120", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000" + }, + "2000": { + "timestamp": "16:30:11.326", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000" + }, + "3000": { + "timestamp": "16:42:29.626", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/3000" + }, + "4000": { + "timestamp": "16:54:35.424", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/4000" + }, + "5000": { + "timestamp": "17:05:36.535", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000" + }, + "6000": { + "timestamp": "17:19:56.648", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/6000" + }, + "7000": { + "timestamp": "17:34:35.906", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/7000" + }, + "8000": { + "timestamp": "17:48:19.855", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/8000" + }, + "9000": { + "timestamp": "18:02:59.063", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/9000" + }, + "10000": { + "timestamp": "18:16:58.135", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000" + } + }, + "debug_lines": [ + "16:06:09.367 [I] debug_step=1 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (10775:train_pytorch.py:799)", + "16:06:09.368 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (10775:train_pytorch.py:803)", + "16:06:09.368 [I] debug_step=1 prompt_token_lengths=[74, 72, 76, 78] (10775:train_pytorch.py:806)", + "16:06:09.368 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0715 std=0.4362 (10775:train_pytorch.py:807)", + "16:06:09.369 [I] debug_step=1 action_stats min=-1.0000 max=1.0947 mean=0.0331 std=0.4134 (10775:train_pytorch.py:810)", + "16:06:09.369 [I] debug_step=1 state_nonzero_counts_8d_blocks=[32, 0, 32, 0] action_nonzero_counts_8d_blocks=[512, 0, 512, 0] (10775:train_pytorch.py:813)", + "16:06:09.390 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=64 actions=1024 (10775:train_pytorch.py:817)", + "16:06:09.390 [I] debug_step=1 lr=4.99e-08 grad_norm=15.9656 data_time=0.9485s step_time=1.9454s gpu_mem_allocated=28.49GB gpu_mem_reserved=35.24GB gpu_mem_max_allocated=35.23GB gpu_mem_max_reserved=35.24GB (10775:train_pytorch.py:822)", + "16:06:09.390 [I] debug_step=1 grad_shared_expert=15.5493 grad_action_in_proj=0.4919 grad_action_out_proj=2.1574 (10775:train_pytorch.py:830)", + "Training: 0%| | 1/10000 [00:02<8:12:34, 2.96s/it, loss=1.4673, lr=4.99e-08, step=1]16:06:10.034 [I] debug_step=2 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (10775:train_pytorch.py:799)", + "16:06:10.035 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (10775:train_pytorch.py:803)", + "16:06:10.035 [I] debug_step=2 prompt_token_lengths=[79, 76, 69, 69] (10775:train_pytorch.py:806)" + ], + "runtime": "2:13:40" + }, + "parallel": { + "startup": { + "weight_missing_count": "0 (18633:train_pytorch.py:629)", + "weight_missing_keys": "set() (18633:train_pytorch.py:630)", + "weight_unexpected_count": "0 (18633:train_pytorch.py:631)", + "weight_unexpected_keys": "[] (18633:train_pytorch.py:632)", + "config_name": "pi05_twin_handover_256_packed_parallel_pytorch_10k (18633:train_pytorch.py:280)", + "dataset_repo_id": "lsnu/twin_handover_256_train (18633:train_pytorch.py:281)", + "norm_stats_file": "/workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (18633:train_pytorch.py:282)", + "norm_stats_summary": "{'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (18633:train_pytorch.py:283)", + "checkpoint_source": "/workspace/checkpoints/pi05_base_parallel_packed_from_single (18633:train_pytorch.py:284)", + "model_type": "parallel (18633:train_pytorch.py:285)", + "packed_transforms": "True (18633:train_pytorch.py:286)", + "world_size": "4 (18633:train_pytorch.py:287)", + "batch_size": "local=4, global=16 (18633:train_pytorch.py:288)", + "num_workers": "8 (18633:train_pytorch.py:289)", + "precision": "bfloat16 (18633:train_pytorch.py:290)", + "lr_schedule": "warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (18633:train_pytorch.py:291)", + "save_log_intervals": "save_interval=1000, log_interval=10 (18633:train_pytorch.py:298)", + "action_loss_mask": "(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (18633:train_pytorch.py:299)", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (18633:train_pytorch.py:300)", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (18633:train_pytorch.py:301)", + "gradient_buckets": "action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert (18633:train_pytorch.py:694)" + }, + "steps": { + "1000": { + "ts": "18:56:22.847", + "step": "1000", + "loss": "0.0246", + "smoothed": "0.0492", + "lr": "2.48e-05", + "grad_norm": "0.9470", + "step_time": "0.5836", + "data_time": "0.1086", + "its": "1.445", + "eta": "6229.0", + "mem": "35.27GB", + "grad_action_in_proj_arms": "0.0139", + "grad_action_out_proj_arms": "0.1631", + "grad_arm_token_fuse": "0.0704", + "grad_shared_expert": "0.5049" + }, + "2000": { + "ts": "19:09:53.627", + "step": "2000", + "loss": "0.0280", + "smoothed": "0.0267", + "lr": "2.37e-05", + "grad_norm": "0.6051", + "step_time": "0.7138", + "data_time": "0.1628", + "its": "1.141", + "eta": "7012.2", + "mem": "35.27GB", + "grad_action_in_proj_arms": "0.0180", + "grad_action_out_proj_arms": "0.1784", + "grad_arm_token_fuse": "0.0955", + "grad_shared_expert": "0.5627" + }, + "5000": { + "ts": "19:50:55.815", + "step": "5000", + "loss": "0.0043", + "smoothed": "0.0159", + "lr": "1.47e-05", + "grad_norm": "0.4850", + "step_time": "0.5183", + "data_time": "0.0658", + "its": "1.712", + "eta": "2920.0", + "mem": "35.27GB", + "grad_action_in_proj_arms": "0.0105", + "grad_action_out_proj_arms": "0.1454", + "grad_arm_token_fuse": "0.0568", + "grad_shared_expert": "1.0533" + }, + "10000": { + "ts": "20:58:23.797", + "step": "10000", + "loss": "0.0140", + "smoothed": "0.0169", + "lr": "2.50e-06", + "grad_norm": "0.4269", + "step_time": "0.6919", + "data_time": "0.2213", + "its": "1.095", + "eta": "0.0", + "mem": "35.27GB", + "grad_action_in_proj_arms": "0.0121", + "grad_action_out_proj_arms": "0.1277", + "grad_arm_token_fuse": "0.0634", + "grad_shared_expert": "0.4071" + } + }, + "saves": { + "1000": { + "timestamp": "18:58:14.131", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000" + }, + "2000": { + "timestamp": "19:12:06.795", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000" + }, + "3000": { + "timestamp": "19:25:59.695", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/3000" + }, + "4000": { + "timestamp": "19:39:53.065", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/4000" + }, + "5000": { + "timestamp": "19:52:11.616", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000" + }, + "6000": { + "timestamp": "20:04:56.835", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/6000" + }, + "7000": { + "timestamp": "20:17:25.392", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/7000" + }, + "8000": { + "timestamp": "20:33:46.138", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/8000" + }, + "9000": { + "timestamp": "20:46:05.807", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/9000" + }, + "10000": { + "timestamp": "21:01:52.032", + "path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000" + } + }, + "debug_lines": [ + "18:44:34.768 [I] debug_step=1 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (18633:train_pytorch.py:799)", + "18:44:34.769 [I] debug_step=1 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (18633:train_pytorch.py:803)", + "18:44:34.769 [I] debug_step=1 prompt_token_lengths=[74, 72, 76, 78] (18633:train_pytorch.py:806)", + "18:44:34.769 [I] debug_step=1 state_stats min=-1.0000 max=1.0004 mean=0.0715 std=0.4362 (18633:train_pytorch.py:807)", + "18:44:34.770 [I] debug_step=1 action_stats min=-1.0000 max=1.0947 mean=0.0331 std=0.4134 (18633:train_pytorch.py:810)", + "18:44:34.770 [I] debug_step=1 state_nonzero_counts_8d_blocks=[32, 0, 32, 0] action_nonzero_counts_8d_blocks=[512, 0, 512, 0] (18633:train_pytorch.py:813)", + "18:44:34.791 [I] debug_step=1 masked_dims=[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] active_dims=[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] masked_zero_counts state=64 actions=1024 (18633:train_pytorch.py:817)", + "18:44:34.792 [I] debug_step=1 lr=4.99e-08 grad_norm=16.1250 data_time=0.7232s step_time=2.1776s gpu_mem_allocated=28.53GB gpu_mem_reserved=35.28GB gpu_mem_max_allocated=35.27GB gpu_mem_max_reserved=35.28GB (18633:train_pytorch.py:822)", + "18:44:34.792 [I] debug_step=1 grad_shared_expert=15.5090 grad_action_in_proj_arms=0.5665 grad_arm_token_fuse=2.6833 grad_action_out_proj_arms=2.1581 (18633:train_pytorch.py:830)", + "Training: 0%| | 1/10000 [00:02<8:13:44, 2.96s/it, loss=1.4675, lr=4.99e-08, step=1]18:44:35.388 [I] debug_step=2 observation.state shape=(4, 32) dtype=torch.float64 actions shape=(4, 16, 32) dtype=torch.float32 (18633:train_pytorch.py:799)", + "18:44:35.389 [I] debug_step=2 image_keys=['base_0_rgb', 'left_wrist_0_rgb', 'right_wrist_0_rgb'] image_shapes={'base_0_rgb': (4, 3, 224, 224), 'left_wrist_0_rgb': (4, 3, 224, 224), 'right_wrist_0_rgb': (4, 3, 224, 224)} (18633:train_pytorch.py:803)", + "18:44:35.389 [I] debug_step=2 prompt_token_lengths=[79, 76, 69, 69] (18633:train_pytorch.py:806)" + ], + "runtime": "2:20:51" + } + }, + "val_teacher_forced": { + "baseline_1000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 50, + "mean_val_loss": 0.06113, + "std_val_loss": 0.043921, + "mean_left_arm_loss": 0.077421, + "std_left_arm_loss": 0.059309, + "mean_right_arm_loss": 0.04484, + "std_right_arm_loss": 0.080634, + "mean_left_joint_loss": 0.082092, + "std_left_joint_loss": 0.06674, + "mean_left_gripper_loss": 0.04472, + "std_left_gripper_loss": 0.088365, + "mean_right_joint_loss": 0.046274, + "std_right_joint_loss": 0.087919, + "mean_right_gripper_loss": 0.034807, + "std_right_gripper_loss": 0.076825, + "mean_left_right_imbalance": 0.08012, + "std_left_right_imbalance": 0.083456, + "per_batch_timing_seconds": "mean=0.3040 std=0.1266 min=0.2246 max=0.8837", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "baseline", + "checkpoint_step": 1000 + }, + "baseline_2000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 50, + "mean_val_loss": 0.041595, + "std_val_loss": 0.030015, + "mean_left_arm_loss": 0.049919, + "std_left_arm_loss": 0.033208, + "mean_right_arm_loss": 0.033271, + "std_right_arm_loss": 0.059873, + "mean_left_joint_loss": 0.051501, + "std_left_joint_loss": 0.035502, + "mean_left_gripper_loss": 0.038846, + "std_left_gripper_loss": 0.082622, + "mean_right_joint_loss": 0.034159, + "std_right_joint_loss": 0.066139, + "mean_right_gripper_loss": 0.027055, + "std_right_gripper_loss": 0.06654, + "mean_left_right_imbalance": 0.05474, + "std_left_right_imbalance": 0.055247, + "per_batch_timing_seconds": "mean=0.2487 std=0.0844 min=0.2239 max=0.8257", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "baseline", + "checkpoint_step": 2000 + }, + "baseline_5000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 50, + "mean_val_loss": 0.027324, + "std_val_loss": 0.020404, + "mean_left_arm_loss": 0.039118, + "std_left_arm_loss": 0.037404, + "mean_right_arm_loss": 0.015529, + "std_right_arm_loss": 0.023314, + "mean_left_joint_loss": 0.042035, + "std_left_joint_loss": 0.041763, + "mean_left_gripper_loss": 0.018705, + "std_left_gripper_loss": 0.031815, + "mean_right_joint_loss": 0.015711, + "std_right_joint_loss": 0.023929, + "mean_right_gripper_loss": 0.014261, + "std_right_gripper_loss": 0.030013, + "mean_left_right_imbalance": 0.038961, + "std_left_right_imbalance": 0.035474, + "per_batch_timing_seconds": "mean=0.2601 std=0.0801 min=0.2212 max=0.7730", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "baseline", + "checkpoint_step": 5000 + }, + "baseline_10000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 100, + "mean_val_loss": 0.022345, + "std_val_loss": 0.024337, + "mean_left_arm_loss": 0.029659, + "std_left_arm_loss": 0.039896, + "mean_right_arm_loss": 0.015031, + "std_right_arm_loss": 0.032929, + "mean_left_joint_loss": 0.031507, + "std_left_joint_loss": 0.044637, + "mean_left_gripper_loss": 0.016725, + "std_left_gripper_loss": 0.040894, + "mean_right_joint_loss": 0.015776, + "std_right_joint_loss": 0.036308, + "mean_right_gripper_loss": 0.009818, + "std_right_gripper_loss": 0.028543, + "mean_left_right_imbalance": 0.034067, + "std_left_right_imbalance": 0.045126, + "per_batch_timing_seconds": "mean=0.2524 std=0.0719 min=0.2263 max=0.8903", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "baseline", + "checkpoint_step": 10000 + }, + "parallel_1000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 50, + "mean_val_loss": 0.059715, + "std_val_loss": 0.042962, + "mean_left_arm_loss": 0.073681, + "std_left_arm_loss": 0.049928, + "mean_right_arm_loss": 0.045749, + "std_right_arm_loss": 0.082818, + "mean_left_joint_loss": 0.078129, + "std_left_joint_loss": 0.055212, + "mean_left_gripper_loss": 0.042541, + "std_left_gripper_loss": 0.08491, + "mean_right_joint_loss": 0.047261, + "std_right_joint_loss": 0.090299, + "mean_right_gripper_loss": 0.035161, + "std_right_gripper_loss": 0.079674, + "mean_left_right_imbalance": 0.075806, + "std_left_right_imbalance": 0.079713, + "per_batch_timing_seconds": "mean=0.3663 std=0.6150 min=0.2224 max=4.6353", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "parallel", + "checkpoint_step": 1000 + }, + "parallel_2000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 50, + "mean_val_loss": 0.039947, + "std_val_loss": 0.025053, + "mean_left_arm_loss": 0.050148, + "std_left_arm_loss": 0.033233, + "mean_right_arm_loss": 0.029745, + "std_right_arm_loss": 0.04786, + "mean_left_joint_loss": 0.051925, + "std_left_joint_loss": 0.036277, + "mean_left_gripper_loss": 0.037711, + "std_left_gripper_loss": 0.077017, + "mean_right_joint_loss": 0.030139, + "std_right_joint_loss": 0.051862, + "mean_right_gripper_loss": 0.026984, + "std_right_gripper_loss": 0.065713, + "mean_left_right_imbalance": 0.051938, + "std_left_right_imbalance": 0.044701, + "per_batch_timing_seconds": "mean=0.3708 std=0.1690 min=0.2327 max=1.3050", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "parallel", + "checkpoint_step": 2000 + }, + "parallel_5000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 50, + "mean_val_loss": 0.02734, + "std_val_loss": 0.020897, + "mean_left_arm_loss": 0.039155, + "std_left_arm_loss": 0.038641, + "mean_right_arm_loss": 0.015526, + "std_right_arm_loss": 0.023413, + "mean_left_joint_loss": 0.042035, + "std_left_joint_loss": 0.043377, + "mean_left_gripper_loss": 0.018994, + "std_left_gripper_loss": 0.032843, + "mean_right_joint_loss": 0.015753, + "std_right_joint_loss": 0.024564, + "mean_right_gripper_loss": 0.013938, + "std_right_gripper_loss": 0.029304, + "mean_left_right_imbalance": 0.038635, + "std_left_right_imbalance": 0.037436, + "per_batch_timing_seconds": "mean=0.3717 std=0.2172 min=0.2283 max=1.7875", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "parallel", + "checkpoint_step": 5000 + }, + "parallel_10000": { + "teacher_forced_eval_seed": 123, + "config_name": "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "checkpoint_path": "/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000", + "repo_id_used": "lsnu/twin_handover_256_val", + "num_batches": 100, + "mean_val_loss": 0.022168, + "std_val_loss": 0.024902, + "mean_left_arm_loss": 0.030184, + "std_left_arm_loss": 0.043653, + "mean_right_arm_loss": 0.014151, + "std_right_arm_loss": 0.029382, + "mean_left_joint_loss": 0.032356, + "std_left_joint_loss": 0.048977, + "mean_left_gripper_loss": 0.014984, + "std_left_gripper_loss": 0.037395, + "mean_right_joint_loss": 0.014888, + "std_right_joint_loss": 0.032582, + "mean_right_gripper_loss": 0.008996, + "std_right_gripper_loss": 0.025757, + "mean_left_right_imbalance": 0.033825, + "std_left_right_imbalance": 0.046586, + "per_batch_timing_seconds": "mean=0.3248 std=0.0893 min=0.2203 max=0.7969", + "active_mask_dims": "[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]", + "masked_dims": "[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]", + "weight_loading_missing_keys": "[]", + "weight_loading_unexpected_keys": "[]", + "model": "parallel", + "checkpoint_step": 10000 + } + }, + "val_sample": [ + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.090938, + "std_masked_mae": 0.02224, + "mean_left_arm_mae": 0.120414, + "std_left_arm_mae": 0.046606, + "mean_right_arm_mae": 0.061461, + "std_right_arm_mae": 0.058026, + "mean_left_joint_mae": 0.130966, + "std_left_joint_mae": 0.054578, + "mean_left_gripper_mae": 0.046552, + "std_left_gripper_mae": 0.06792, + "mean_right_joint_mae": 0.063945, + "std_right_joint_mae": 0.062779, + "mean_right_gripper_mae": 0.044077, + "std_right_gripper_mae": 0.053987, + "mean_left_right_imbalance_mae": 0.095076, + "std_left_right_imbalance_mae": 0.059464, + "per_batch_timing_seconds": "mean=0.3131 std=0.0370 min=0.2649 max=0.3781", + "model": "baseline", + "checkpoint_step": 1000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.100992, + "std_masked_mae": 0.023502, + "mean_left_arm_mae": 0.132369, + "std_left_arm_mae": 0.047803, + "mean_right_arm_mae": 0.069615, + "std_right_arm_mae": 0.063335, + "mean_left_joint_mae": 0.143677, + "std_left_joint_mae": 0.056155, + "mean_left_gripper_mae": 0.053215, + "std_left_gripper_mae": 0.074232, + "mean_right_joint_mae": 0.072165, + "std_right_joint_mae": 0.068555, + "mean_right_gripper_mae": 0.051764, + "std_right_gripper_mae": 0.054067, + "mean_left_right_imbalance_mae": 0.101649, + "std_left_right_imbalance_mae": 0.063159, + "per_batch_timing_seconds": "mean=0.3640 std=0.0430 min=0.3333 max=0.4572", + "model": "baseline", + "checkpoint_step": 1000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.060253, + "std_masked_mae": 0.017936, + "mean_left_arm_mae": 0.078725, + "std_left_arm_mae": 0.032786, + "mean_right_arm_mae": 0.041781, + "std_right_arm_mae": 0.04091, + "mean_left_joint_mae": 0.083688, + "std_left_joint_mae": 0.036089, + "mean_left_gripper_mae": 0.043985, + "std_left_gripper_mae": 0.072901, + "mean_right_joint_mae": 0.042767, + "std_right_joint_mae": 0.041669, + "mean_right_gripper_mae": 0.034874, + "std_right_gripper_mae": 0.058769, + "mean_left_right_imbalance_mae": 0.063418, + "std_left_right_imbalance_mae": 0.039412, + "per_batch_timing_seconds": "mean=0.3006 std=0.0345 min=0.2674 max=0.3753", + "model": "baseline", + "checkpoint_step": 2000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.065765, + "std_masked_mae": 0.016923, + "mean_left_arm_mae": 0.086375, + "std_left_arm_mae": 0.032761, + "mean_right_arm_mae": 0.045154, + "std_right_arm_mae": 0.041131, + "mean_left_joint_mae": 0.092111, + "std_left_joint_mae": 0.036788, + "mean_left_gripper_mae": 0.046224, + "std_left_gripper_mae": 0.076043, + "mean_right_joint_mae": 0.046163, + "std_right_joint_mae": 0.042138, + "mean_right_gripper_mae": 0.038093, + "std_right_gripper_mae": 0.056179, + "mean_left_right_imbalance_mae": 0.066659, + "std_left_right_imbalance_mae": 0.040501, + "per_batch_timing_seconds": "mean=0.3586 std=0.0248 min=0.3396 max=0.4220", + "model": "baseline", + "checkpoint_step": 2000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.03972, + "std_masked_mae": 0.014654, + "mean_left_arm_mae": 0.049239, + "std_left_arm_mae": 0.019869, + "mean_right_arm_mae": 0.030201, + "std_right_arm_mae": 0.034473, + "mean_left_joint_mae": 0.052215, + "std_left_joint_mae": 0.023235, + "mean_left_gripper_mae": 0.028408, + "std_left_gripper_mae": 0.028427, + "mean_right_joint_mae": 0.031159, + "std_right_joint_mae": 0.037572, + "mean_right_gripper_mae": 0.02349, + "std_right_gripper_mae": 0.024208, + "mean_left_right_imbalance_mae": 0.04196, + "std_left_right_imbalance_mae": 0.030152, + "per_batch_timing_seconds": "mean=0.2920 std=0.0342 min=0.2585 max=0.3528", + "model": "baseline", + "checkpoint_step": 5000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.043346, + "std_masked_mae": 0.013818, + "mean_left_arm_mae": 0.053788, + "std_left_arm_mae": 0.020493, + "mean_right_arm_mae": 0.032904, + "std_right_arm_mae": 0.034889, + "mean_left_joint_mae": 0.057689, + "std_left_joint_mae": 0.024439, + "mean_left_gripper_mae": 0.026486, + "std_left_gripper_mae": 0.029864, + "mean_right_joint_mae": 0.0337, + "std_right_joint_mae": 0.038002, + "mean_right_gripper_mae": 0.027331, + "std_right_gripper_mae": 0.027093, + "mean_left_right_imbalance_mae": 0.044562, + "std_left_right_imbalance_mae": 0.030999, + "per_batch_timing_seconds": "mean=0.3951 std=0.0357 min=0.3463 max=0.4774", + "model": "baseline", + "checkpoint_step": 5000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.029935, + "std_masked_mae": 0.0082, + "mean_left_arm_mae": 0.041062, + "std_left_arm_mae": 0.019621, + "mean_right_arm_mae": 0.018807, + "std_right_arm_mae": 0.018117, + "mean_left_joint_mae": 0.04444, + "std_left_joint_mae": 0.02295, + "mean_left_gripper_mae": 0.017416, + "std_left_gripper_mae": 0.016394, + "mean_right_joint_mae": 0.0195, + "std_right_joint_mae": 0.019305, + "mean_right_gripper_mae": 0.013963, + "std_right_gripper_mae": 0.019504, + "mean_left_right_imbalance_mae": 0.033733, + "std_left_right_imbalance_mae": 0.022691, + "per_batch_timing_seconds": "mean=0.2793 std=0.0247 min=0.2625 max=0.3469", + "model": "baseline", + "checkpoint_step": 10000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.030294, + "std_masked_mae": 0.007277, + "mean_left_arm_mae": 0.041307, + "std_left_arm_mae": 0.019181, + "mean_right_arm_mae": 0.019282, + "std_right_arm_mae": 0.019077, + "mean_left_joint_mae": 0.045179, + "std_left_joint_mae": 0.022508, + "mean_left_gripper_mae": 0.014207, + "std_left_gripper_mae": 0.016425, + "mean_right_joint_mae": 0.020231, + "std_right_joint_mae": 0.020465, + "mean_right_gripper_mae": 0.01264, + "std_right_gripper_mae": 0.018571, + "mean_left_right_imbalance_mae": 0.034582, + "std_left_right_imbalance_mae": 0.023261, + "per_batch_timing_seconds": "mean=0.3823 std=0.0398 min=0.3432 max=0.4686", + "model": "baseline", + "checkpoint_step": 10000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.09253, + "std_masked_mae": 0.020956, + "mean_left_arm_mae": 0.122108, + "std_left_arm_mae": 0.04378, + "mean_right_arm_mae": 0.062952, + "std_right_arm_mae": 0.056483, + "mean_left_joint_mae": 0.133062, + "std_left_joint_mae": 0.052111, + "mean_left_gripper_mae": 0.045431, + "std_left_gripper_mae": 0.055952, + "mean_right_joint_mae": 0.065476, + "std_right_joint_mae": 0.060695, + "mean_right_gripper_mae": 0.04528, + "std_right_gripper_mae": 0.053039, + "mean_left_right_imbalance_mae": 0.093392, + "std_left_right_imbalance_mae": 0.056874, + "per_batch_timing_seconds": "mean=0.3110 std=0.0430 min=0.2654 max=0.3864", + "model": "parallel", + "checkpoint_step": 1000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.102452, + "std_masked_mae": 0.022208, + "mean_left_arm_mae": 0.13361, + "std_left_arm_mae": 0.044796, + "mean_right_arm_mae": 0.071295, + "std_right_arm_mae": 0.061523, + "mean_left_joint_mae": 0.145474, + "std_left_joint_mae": 0.053589, + "mean_left_gripper_mae": 0.05056, + "std_left_gripper_mae": 0.060317, + "mean_right_joint_mae": 0.073909, + "std_right_joint_mae": 0.066406, + "mean_right_gripper_mae": 0.053, + "std_right_gripper_mae": 0.051143, + "mean_left_right_imbalance_mae": 0.099213, + "std_left_right_imbalance_mae": 0.060422, + "per_batch_timing_seconds": "mean=0.4143 std=0.0560 min=0.3405 max=0.5017", + "model": "parallel", + "checkpoint_step": 1000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.05986, + "std_masked_mae": 0.012924, + "mean_left_arm_mae": 0.080984, + "std_left_arm_mae": 0.031604, + "mean_right_arm_mae": 0.038736, + "std_right_arm_mae": 0.031293, + "mean_left_joint_mae": 0.086197, + "std_left_joint_mae": 0.035912, + "mean_left_gripper_mae": 0.04449, + "std_left_gripper_mae": 0.062755, + "mean_right_joint_mae": 0.039304, + "std_right_joint_mae": 0.030982, + "mean_right_gripper_mae": 0.034761, + "std_right_gripper_mae": 0.051397, + "mean_left_right_imbalance_mae": 0.061196, + "std_left_right_imbalance_mae": 0.036442, + "per_batch_timing_seconds": "mean=0.3702 std=0.1017 min=0.2793 max=0.7256", + "model": "parallel", + "checkpoint_step": 2000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.065897, + "std_masked_mae": 0.012628, + "mean_left_arm_mae": 0.088735, + "std_left_arm_mae": 0.03201, + "mean_right_arm_mae": 0.043059, + "std_right_arm_mae": 0.032823, + "mean_left_joint_mae": 0.094654, + "std_left_joint_mae": 0.036668, + "mean_left_gripper_mae": 0.047298, + "std_left_gripper_mae": 0.06466, + "mean_right_joint_mae": 0.043769, + "std_right_joint_mae": 0.032862, + "mean_right_gripper_mae": 0.038089, + "std_right_gripper_mae": 0.049635, + "mean_left_right_imbalance_mae": 0.064491, + "std_left_right_imbalance_mae": 0.038643, + "per_batch_timing_seconds": "mean=0.4575 std=0.0902 min=0.3373 max=0.6590", + "model": "parallel", + "checkpoint_step": 2000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.040712, + "std_masked_mae": 0.013646, + "mean_left_arm_mae": 0.050681, + "std_left_arm_mae": 0.020624, + "mean_right_arm_mae": 0.030742, + "std_right_arm_mae": 0.03279, + "mean_left_joint_mae": 0.053976, + "std_left_joint_mae": 0.024153, + "mean_left_gripper_mae": 0.027611, + "std_left_gripper_mae": 0.02458, + "mean_right_joint_mae": 0.032227, + "std_right_joint_mae": 0.03635, + "mean_right_gripper_mae": 0.020349, + "std_right_gripper_mae": 0.017496, + "mean_left_right_imbalance_mae": 0.042435, + "std_left_right_imbalance_mae": 0.029207, + "per_batch_timing_seconds": "mean=0.3861 std=0.0848 min=0.2719 max=0.5485", + "model": "parallel", + "checkpoint_step": 5000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.044799, + "std_masked_mae": 0.012807, + "mean_left_arm_mae": 0.055016, + "std_left_arm_mae": 0.021278, + "mean_right_arm_mae": 0.034583, + "std_right_arm_mae": 0.032757, + "mean_left_joint_mae": 0.059296, + "std_left_joint_mae": 0.025068, + "mean_left_gripper_mae": 0.025058, + "std_left_gripper_mae": 0.027173, + "mean_right_joint_mae": 0.035777, + "std_right_joint_mae": 0.036454, + "mean_right_gripper_mae": 0.026224, + "std_right_gripper_mae": 0.01689, + "mean_left_right_imbalance_mae": 0.043614, + "std_left_right_imbalance_mae": 0.030178, + "per_batch_timing_seconds": "mean=0.4549 std=0.0835 min=0.3373 max=0.6280", + "model": "parallel", + "checkpoint_step": 5000 + }, + { + "num_steps": 4, + "num_batches": 16, + "mean_masked_mae": 0.029277, + "std_masked_mae": 0.007579, + "mean_left_arm_mae": 0.040375, + "std_left_arm_mae": 0.01919, + "mean_right_arm_mae": 0.018178, + "std_right_arm_mae": 0.015856, + "mean_left_joint_mae": 0.043636, + "std_left_joint_mae": 0.022278, + "mean_left_gripper_mae": 0.017546, + "std_left_gripper_mae": 0.013485, + "mean_right_joint_mae": 0.018908, + "std_right_joint_mae": 0.017028, + "mean_right_gripper_mae": 0.013066, + "std_right_gripper_mae": 0.016678, + "mean_left_right_imbalance_mae": 0.031629, + "std_left_right_imbalance_mae": 0.022404, + "per_batch_timing_seconds": "mean=0.3241 std=0.0551 min=0.2600 max=0.4241", + "model": "parallel", + "checkpoint_step": 10000 + }, + { + "num_steps": 10, + "num_batches": 16, + "mean_masked_mae": 0.030241, + "std_masked_mae": 0.00674, + "mean_left_arm_mae": 0.041072, + "std_left_arm_mae": 0.018866, + "mean_right_arm_mae": 0.01941, + "std_right_arm_mae": 0.017031, + "mean_left_joint_mae": 0.044817, + "std_left_joint_mae": 0.022046, + "mean_left_gripper_mae": 0.014857, + "std_left_gripper_mae": 0.014376, + "mean_right_joint_mae": 0.020279, + "std_right_joint_mae": 0.018425, + "mean_right_gripper_mae": 0.013323, + "std_right_gripper_mae": 0.014475, + "mean_left_right_imbalance_mae": 0.032456, + "std_left_right_imbalance_mae": 0.022935, + "per_batch_timing_seconds": "mean=0.4058 std=0.0569 min=0.3332 max=0.5100", + "model": "parallel", + "checkpoint_step": 10000 + } + ], + "runtime": [ + { + "stage": "baseline_train", + "start_utc": "2026-03-09 16:03:23 UTC", + "end_utc": "2026-03-09 18:17:03 UTC", + "duration_seconds": 8020, + "duration_hms": "2:13:40" + }, + { + "stage": "baseline_eval_1000", + "start_utc": "2026-03-09 18:17:03 UTC", + "end_utc": "2026-03-09 18:23:42 UTC", + "duration_seconds": 399, + "duration_hms": "0:06:39" + }, + { + "stage": "baseline_eval_2000", + "start_utc": "2026-03-09 18:23:42 UTC", + "end_utc": "2026-03-09 18:28:54 UTC", + "duration_seconds": 312, + "duration_hms": "0:05:12" + }, + { + "stage": "baseline_eval_5000", + "start_utc": "2026-03-09 18:28:54 UTC", + "end_utc": "2026-03-09 18:33:53 UTC", + "duration_seconds": 299, + "duration_hms": "0:04:59" + }, + { + "stage": "baseline_eval_10000", + "start_utc": "2026-03-09 18:33:53 UTC", + "end_utc": "2026-03-09 18:41:07 UTC", + "duration_seconds": 434, + "duration_hms": "0:07:14" + }, + { + "stage": "parallel_train", + "start_utc": "2026-03-09 18:41:07 UTC", + "end_utc": "2026-03-09 21:01:58 UTC", + "duration_seconds": 8451, + "duration_hms": "2:20:51" + }, + { + "stage": "parallel_eval_1000", + "start_utc": "2026-03-09 21:01:58 UTC", + "end_utc": "2026-03-09 21:14:35 UTC", + "duration_seconds": 757, + "duration_hms": "0:12:37" + }, + { + "stage": "parallel_eval_2000", + "start_utc": "2026-03-09 21:14:35 UTC", + "end_utc": "2026-03-09 21:22:39 UTC", + "duration_seconds": 484, + "duration_hms": "0:08:04" + }, + { + "stage": "parallel_eval_5000", + "start_utc": "2026-03-09 21:22:40 UTC", + "end_utc": "2026-03-09 21:35:26 UTC", + "duration_seconds": 766, + "duration_hms": "0:12:46" + }, + { + "stage": "parallel_eval_10000", + "start_utc": "2026-03-09 21:35:26 UTC", + "end_utc": "2026-03-09 21:45:53 UTC", + "duration_seconds": 627, + "duration_hms": "0:10:27" + }, + { + "stage": "full_pipeline", + "start_utc": "2026-03-09 15:57:20 UTC", + "end_utc": "2026-03-09 21:45:53 UTC", + "duration_seconds": 20913, + "duration_hms": "5:48:33" + } + ], + "warmstart_equivalence": { + "baseline_config_name": "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "parallel_config_name": "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "repo_id_used": "lsnu/twin_handover_256_train", + "baseline_ckpt": "/workspace/checkpoints/pi05_base_single_pytorch", + "parallel_ckpt": "/workspace/checkpoints/pi05_base_parallel_packed_from_single", + "batch_size": 4, + "eval_seed": 777, + "tolerance": "1e-06", + "baseline_missing_keys": "[]", + "baseline_unexpected_keys": "[]", + "parallel_missing_keys": "[]", + "parallel_unexpected_keys": "[]", + "input_projection_max_abs_diff": 0.00122881, + "input_projection_mean_abs_diff": 0.00015435, + "loss_max_abs_diff": 0.90186501, + "loss_mean_abs_diff": 0.04585753, + "baseline_masked_loss": 1.00531137, + "parallel_masked_loss": 1.00929189, + "masked_loss_abs_diff": 0.00398052, + "warmstart_equivalent": false + } +} \ No newline at end of file diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/teacher_forced_eval_table.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/teacher_forced_eval_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..0b91b0504b67a6ab0ea6debee85305536bd4d856 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/teacher_forced_eval_table.csv @@ -0,0 +1,9 @@ +model,checkpoint_step,teacher_forced_eval_seed,config_name,checkpoint_path,repo_id_used,num_batches,mean_val_loss,std_val_loss,mean_left_arm_loss,std_left_arm_loss,mean_right_arm_loss,std_right_arm_loss,mean_left_joint_loss,std_left_joint_loss,mean_left_gripper_loss,std_left_gripper_loss,mean_right_joint_loss,std_right_joint_loss,mean_right_gripper_loss,std_right_gripper_loss,mean_left_right_imbalance,std_left_right_imbalance,per_batch_timing_seconds,active_mask_dims,masked_dims,weight_loading_missing_keys,weight_loading_unexpected_keys +baseline,1000,123,pi05_twin_handover_256_packed_baseline_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000,lsnu/twin_handover_256_val,50,0.06113,0.043921,0.077421,0.059309,0.04484,0.080634,0.082092,0.06674,0.04472,0.088365,0.046274,0.087919,0.034807,0.076825,0.08012,0.083456,mean=0.3040 std=0.1266 min=0.2246 max=0.8837,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +baseline,2000,123,pi05_twin_handover_256_packed_baseline_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000,lsnu/twin_handover_256_val,50,0.041595,0.030015,0.049919,0.033208,0.033271,0.059873,0.051501,0.035502,0.038846,0.082622,0.034159,0.066139,0.027055,0.06654,0.05474,0.055247,mean=0.2487 std=0.0844 min=0.2239 max=0.8257,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +baseline,5000,123,pi05_twin_handover_256_packed_baseline_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000,lsnu/twin_handover_256_val,50,0.027324,0.020404,0.039118,0.037404,0.015529,0.023314,0.042035,0.041763,0.018705,0.031815,0.015711,0.023929,0.014261,0.030013,0.038961,0.035474,mean=0.2601 std=0.0801 min=0.2212 max=0.7730,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +baseline,10000,123,pi05_twin_handover_256_packed_baseline_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000,lsnu/twin_handover_256_val,100,0.022345,0.024337,0.029659,0.039896,0.015031,0.032929,0.031507,0.044637,0.016725,0.040894,0.015776,0.036308,0.009818,0.028543,0.034067,0.045126,mean=0.2524 std=0.0719 min=0.2263 max=0.8903,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +parallel,1000,123,pi05_twin_handover_256_packed_parallel_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000,lsnu/twin_handover_256_val,50,0.059715,0.042962,0.073681,0.049928,0.045749,0.082818,0.078129,0.055212,0.042541,0.08491,0.047261,0.090299,0.035161,0.079674,0.075806,0.079713,mean=0.3663 std=0.6150 min=0.2224 max=4.6353,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +parallel,2000,123,pi05_twin_handover_256_packed_parallel_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000,lsnu/twin_handover_256_val,50,0.039947,0.025053,0.050148,0.033233,0.029745,0.04786,0.051925,0.036277,0.037711,0.077017,0.030139,0.051862,0.026984,0.065713,0.051938,0.044701,mean=0.3708 std=0.1690 min=0.2327 max=1.3050,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +parallel,5000,123,pi05_twin_handover_256_packed_parallel_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000,lsnu/twin_handover_256_val,50,0.02734,0.020897,0.039155,0.038641,0.015526,0.023413,0.042035,0.043377,0.018994,0.032843,0.015753,0.024564,0.013938,0.029304,0.038635,0.037436,mean=0.3717 std=0.2172 min=0.2283 max=1.7875,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] +parallel,10000,123,pi05_twin_handover_256_packed_parallel_pytorch_10k,/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000,lsnu/twin_handover_256_val,100,0.022168,0.024902,0.030184,0.043653,0.014151,0.029382,0.032356,0.048977,0.014984,0.037395,0.014888,0.032582,0.008996,0.025757,0.033825,0.046586,mean=0.3248 std=0.0893 min=0.2203 max=0.7969,"[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]","[8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31]",[],[] diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/train_loss_table.csv b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/train_loss_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..cfc37cc3331a370e445183989ff5038f5eb33fc7 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/metrics/train_loss_table.csv @@ -0,0 +1,9 @@ +model,step,ts,loss,smoothed,lr,grad_norm,step_time,data_time,its,eta,mem,grad_action_in_proj,grad_action_out_proj,grad_shared_expert,grad_action_in_proj_arms,grad_action_out_proj_arms,grad_arm_token_fuse +baseline,1000,16:16:42.668,0.0228,0.0476,2.48e-05,0.9699,0.5638,0.0801,1.553,5793.6,35.23GB,0.0109,0.1595,0.4924,,, +baseline,2000,16:28:30.872,0.0492,0.0284,2.37e-05,0.6437,0.4982,0.0622,1.785,4482.7,35.23GB,0.0184,0.2195,0.8358,,, +baseline,5000,17:04:21.626,0.0038,0.0165,1.47e-05,0.5112,0.4974,0.0606,1.792,2789.7,35.23GB,0.0101,0.1353,1.1505,,, +baseline,10000,18:15:00.659,0.0141,0.0172,2.50e-06,0.4377,0.5241,0.1210,1.550,0.0,35.23GB,0.0125,0.1342,0.4184,,, +parallel,1000,18:56:22.847,0.0246,0.0492,2.48e-05,0.9470,0.5836,0.1086,1.445,6229.0,35.27GB,,,0.5049,0.0139,0.1631,0.0704 +parallel,2000,19:09:53.627,0.0280,0.0267,2.37e-05,0.6051,0.7138,0.1628,1.141,7012.2,35.27GB,,,0.5627,0.0180,0.1784,0.0955 +parallel,5000,19:50:55.815,0.0043,0.0159,1.47e-05,0.4850,0.5183,0.0658,1.712,2920.0,35.27GB,,,1.0533,0.0105,0.1454,0.0568 +parallel,10000,20:58:23.797,0.0140,0.0169,2.50e-06,0.4269,0.6919,0.2213,1.095,0.0,35.27GB,,,0.4071,0.0121,0.1277,0.0634 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/__pycache__/upload_to_hf.cpython-311.pyc b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/__pycache__/upload_to_hf.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4a069ccfd8597fe2151c92cf77de82b866f4348 Binary files /dev/null and b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/__pycache__/upload_to_hf.cpython-311.pyc differ diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/__pycache__/upload_to_hf_incremental.cpython-311.pyc b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/__pycache__/upload_to_hf_incremental.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74eaa918d3f82422d77ce163b7a740e09f6bc36d Binary files /dev/null and b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/__pycache__/upload_to_hf_incremental.cpython-311.pyc differ diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/changed_files.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/changed_files.txt new file mode 100644 index 0000000000000000000000000000000000000000..84a57ca3c6a2d9861fb437614e5faa1a8ff01436 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/changed_files.txt @@ -0,0 +1,41 @@ +Phase 1 initial study file list: +see artifacts/twin_handover_packed_parallelization_20260309/repro/changed_files.txt + +Phase 2 10K follow-up additions and updates: + +openpi/src/openpi/training/config.py + added pi05_twin_handover_256_packed_baseline_pytorch_10k + added pi05_twin_handover_256_packed_parallel_pytorch_10k + added 10K packed norm-stats asset paths + +openpi/scripts/train_pytorch.py + added periodic per-module gradient bucket norms for baseline and parallel models + baseline buckets: action_in_proj, action_out_proj, shared_expert + parallel buckets: action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert + +openpi/scripts/eval_twin_val_loss_pytorch.py + added left/right arm teacher-forced losses + added joint vs gripper teacher-forced losses + added left/right imbalance + added deterministic sample_actions eval on a fixed subset for num_steps=4,10 + +openpi/scripts/check_parallel_warmstart_equivalence.py + added explicit step-0 numerical comparison between the packed single-head bootstrap and packed parallel warm-start + +openpi/scripts/run_twin_handover_packed_10k.sh + added detached 10K baseline->eval sweep->parallel->eval sweep runner + +openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json + copied existing public handover-train norm stats for the 10K baseline config + +openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json + copied existing public handover-train norm stats for the 10K parallel config + +README.md + updated repo landing page to cover both the 2K initial study and the 10K follow-up + +REPORT.md + updated full report to include methodology, changed files, runtimes, warm-start check, and final 10K metrics + +artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf.py + added reproducible Hub uploader for the final 10K bundle, docs, code, assets, and checkpoints diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/checkpoint_locations.txt b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/checkpoint_locations.txt new file mode 100644 index 0000000000000000000000000000000000000000..48582d0236438c8cb183fba026ca9b806a484d7d --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/checkpoint_locations.txt @@ -0,0 +1,4 @@ +/workspace/checkpoints/pi05_base_single_pytorch +/workspace/checkpoints/pi05_base_parallel_packed_from_single +/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k +/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/commands_reproduce.sh b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/commands_reproduce.sh new file mode 100644 index 0000000000000000000000000000000000000000..cb000e99d1009a35d909566dbcf6e9db50a5e94d --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/commands_reproduce.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd /workspace/pi05tests-openpi-multiarm/openpi +source .venv/bin/activate + +export HF_HOME=/workspace/.hf +export HF_HUB_CACHE=/workspace/.hf/hub +export HF_DATASETS_CACHE=/workspace/.hf/datasets +export HUGGINGFACE_HUB_CACHE=/workspace/.hf/hub +export XDG_CACHE_HOME=/workspace/.cache +export OPENPI_LEROBOT_HOME=/workspace/lerobot +export OPENPI_TORCH_COMPILE_SAMPLE_ACTIONS=0 +export TOKENIZERS_PARALLELISM=false +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# Warm-start numerical check. +python scripts/check_parallel_warmstart_equivalence.py + +# Optional smoke tests. +torchrun --standalone --nproc_per_node=4 scripts/train_pytorch.py \ + pi05_twin_handover_256_packed_baseline_pytorch_10k \ + --exp_name smoke_baseline_10k_diag \ + --overwrite + +torchrun --standalone --nproc_per_node=4 scripts/train_pytorch.py \ + pi05_twin_handover_256_packed_parallel_pytorch_10k \ + --exp_name smoke_parallel_10k_diag \ + --overwrite + +# Batch inspection. +python scripts/inspect_twin_packed_batch.py \ + --config_name pi05_twin_handover_256_packed_baseline_pytorch_2k \ + --repo_id lsnu/twin_handover_256_train + +# Detached full 10K chain. +setsid bash -lc 'cd /workspace/pi05tests-openpi-multiarm/openpi && exec bash ./scripts/run_twin_handover_packed_10k.sh >> /workspace/run_logs/handover_packed_10k_followup.log 2>&1' >/dev/null 2>&1 < /dev/null & + +# Direct full 10K chain, if detach is not needed. +bash ./scripts/run_twin_handover_packed_10k.sh + +# Push the final bundle to the Hugging Face repo after the run finishes. +python /workspace/pi05tests-openpi-multiarm/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf.py + +# Individual evals, if re-running manually after training. +python scripts/eval_twin_val_loss_pytorch.py \ + --config_name pi05_twin_handover_256_packed_baseline_pytorch_10k \ + --checkpoint_dir /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000 \ + --repo_id lsnu/twin_handover_256_val \ + --num_batches 100 \ + --num_workers 0 \ + --sample_num_batches 16 \ + --sample_num_steps 4,10 + +# The uploader expects HF_TOKEN in the environment. +# Example: +# export HF_TOKEN=... +# python /workspace/pi05tests-openpi-multiarm/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf.py + +python scripts/eval_twin_val_loss_pytorch.py \ + --config_name pi05_twin_handover_256_packed_parallel_pytorch_10k \ + --checkpoint_dir /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000 \ + --repo_id lsnu/twin_handover_256_val \ + --num_batches 100 \ + --num_workers 0 \ + --sample_num_batches 16 \ + --sample_num_steps 4,10 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf.py b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..506df3bed5f4d681de58ba5ab420b19ca283a7c8 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import os +from pathlib import Path + +from huggingface_hub import HfApi + + +REPO_ID = "lsnu/pi05tests-openpi-multiarm" +REPO_TYPE = "model" + + +def main() -> None: + token = os.environ.get("HF_TOKEN") + token_file = os.environ.get("HF_TOKEN_FILE") + if not token and token_file: + token_path = Path(token_file) + if token_path.exists(): + token = token_path.read_text().strip() + if os.environ.get("HF_TOKEN_FILE_DELETE_AFTER_READ") == "1": + token_path.unlink(missing_ok=True) + if not token: + raise RuntimeError("HF_TOKEN is required in the environment") + + repo_root = Path(__file__).resolve().parents[3] + allow_patterns = [ + "README.md", + "REPORT.md", + "artifacts/twin_handover_packed_parallelization_10k_20260309/**", + "openpi/README.md", + "openpi/pyproject.toml", + "openpi/uv.lock", + "openpi/examples/convert_jax_model_to_pytorch.py", + "openpi/scripts/**", + "openpi/src/openpi/**", + "openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_2k/**", + "openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_2k/**", + "openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/**", + "openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/**", + "openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/**", + "openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/**", + ] + + print(f"uploading repo_root={repo_root}", flush=True) + print(f"repo_id={REPO_ID}", flush=True) + print(f"allow_patterns={allow_patterns}", flush=True) + + HfApi(token=token).upload_large_folder( + repo_id=REPO_ID, + folder_path=repo_root, + repo_type=REPO_TYPE, + allow_patterns=allow_patterns, + num_workers=8, + print_report=True, + print_report_every=30, + ) + + +if __name__ == "__main__": + main() diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf_incremental.py b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf_incremental.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1541129050884dd93497b222021dfb009cc097 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/repro/upload_to_hf_incremental.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import os +import shutil +import tempfile +from pathlib import Path + +from huggingface_hub import HfApi + + +REPO_ID = "lsnu/pi05tests-openpi-multiarm" +REPO_TYPE = "model" + + +def _read_token() -> str: + token = os.environ.get("HF_TOKEN") + token_file = os.environ.get("HF_TOKEN_FILE") + if not token and token_file: + token_path = Path(token_file) + if token_path.exists(): + token = token_path.read_text().strip() + if os.environ.get("HF_TOKEN_FILE_DELETE_AFTER_READ") == "1": + token_path.unlink(missing_ok=True) + if not token: + raise RuntimeError("HF_TOKEN is required in the environment") + return token + + +def _verify_path(api: HfApi, path_in_repo: str) -> None: + info = api.get_paths_info(repo_id=REPO_ID, paths=[path_in_repo], repo_type=REPO_TYPE) + if not info or info[0] is None: + raise RuntimeError(f"remote path missing after upload: {path_in_repo}") + print(f"verified remote path: {path_in_repo}", flush=True) + + +def _upload_folder( + api: HfApi, + folder_path: Path, + path_in_repo: str, + commit_message: str, + allow_patterns: list[str] | None = None, + verify_path: str | None = None, +) -> None: + print( + f"upload_folder start folder_path={folder_path} path_in_repo={path_in_repo} " + f"allow_patterns={allow_patterns}", + flush=True, + ) + api.upload_folder( + repo_id=REPO_ID, + repo_type=REPO_TYPE, + folder_path=folder_path, + path_in_repo=path_in_repo, + allow_patterns=allow_patterns, + commit_message=commit_message, + ) + path_to_verify = verify_path or path_in_repo or (allow_patterns or [""])[0].rstrip("/**") + _verify_path(api, path_to_verify) + + +def _stage_small_files(base_dir: Path, files: list[str]) -> Path: + stage_root = Path(tempfile.mkdtemp(prefix="hf_stage_small_", dir="/workspace")) + for rel_path in files: + src_path = base_dir / rel_path + dst_path = stage_root / rel_path + dst_path.parent.mkdir(parents=True, exist_ok=True) + os.link(src_path, dst_path) + return stage_root + + +def _upload_sparse_files( + api: HfApi, + base_dir: Path, + files: list[str], + path_in_repo: str, + commit_message: str, + verify_path: str, +) -> None: + stage_root = _stage_small_files(base_dir, files) + try: + _upload_folder(api, stage_root, path_in_repo, commit_message, verify_path=verify_path) + finally: + shutil.rmtree(stage_root, ignore_errors=True) + + +def _stage_large_tree(src_dir: Path, repo_subdir: str) -> Path: + stage_root = Path(tempfile.mkdtemp(prefix="hf_stage_", dir="/workspace")) + dst_dir = stage_root / repo_subdir + dst_dir.parent.mkdir(parents=True, exist_ok=True) + print(f"hardlink staging src={src_dir} dst={dst_dir}", flush=True) + shutil.copytree(src_dir, dst_dir, copy_function=os.link) + return stage_root + + +def _upload_large_tree(api: HfApi, src_dir: Path, repo_subdir: str) -> None: + stage_root = _stage_large_tree(src_dir, repo_subdir) + try: + print(f"upload_large_folder start repo_subdir={repo_subdir} stage_root={stage_root}", flush=True) + api.upload_large_folder( + repo_id=REPO_ID, + repo_type=REPO_TYPE, + folder_path=stage_root, + allow_patterns=[f"{repo_subdir}/**"], + num_workers=8, + print_report=True, + print_report_every=30, + ) + _verify_path(api, repo_subdir) + finally: + print(f"removing stage_root={stage_root}", flush=True) + shutil.rmtree(stage_root, ignore_errors=True) + + +def main() -> None: + token = _read_token() + api = HfApi(token=token) + repo_root = Path(__file__).resolve().parents[3] + openpi_root = repo_root / "openpi" + + print(f"repo_root={repo_root}", flush=True) + print(f"repo_id={REPO_ID}", flush=True) + + _upload_sparse_files( + api, + repo_root, + ["README.md", "REPORT.md"], + "", + "Upload 10k report docs", + "README.md", + ) + _upload_sparse_files( + api, + openpi_root, + ["README.md", "pyproject.toml", "uv.lock", "examples/convert_jax_model_to_pytorch.py"], + "openpi", + "Upload reproducibility metadata", + "openpi/pyproject.toml", + ) + _upload_folder( + api, + openpi_root / "scripts", + "openpi/scripts", + "Upload training and eval scripts", + ) + _upload_folder( + api, + openpi_root / "src" / "openpi", + "openpi/src/openpi", + "Upload training source tree", + ) + _upload_folder( + api, + openpi_root / "assets" / "pi05_twin_handover_256_packed_baseline_pytorch_2k", + "openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_2k", + "Upload 2k baseline norm stats", + ) + _upload_folder( + api, + openpi_root / "assets" / "pi05_twin_handover_256_packed_parallel_pytorch_2k", + "openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_2k", + "Upload 2k parallel norm stats", + ) + _upload_folder( + api, + openpi_root / "assets" / "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k", + "Upload 10k baseline norm stats", + ) + _upload_folder( + api, + openpi_root / "assets" / "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k", + "Upload 10k parallel norm stats", + ) + _upload_folder( + api, + repo_root / "artifacts" / "twin_handover_packed_parallelization_10k_20260309", + "artifacts/twin_handover_packed_parallelization_10k_20260309", + "Upload 10k metrics and environment snapshot", + ) + _upload_large_tree( + api, + openpi_root / "checkpoints" / "pi05_twin_handover_256_packed_baseline_pytorch_10k", + "openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k", + ) + _upload_large_tree( + api, + openpi_root / "checkpoints" / "pi05_twin_handover_256_packed_parallel_pytorch_10k", + "openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k", + ) + + print("incremental upload complete", flush=True) + + +if __name__ == "__main__": + main() diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_10k_followup.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_10k_followup.log new file mode 100644 index 0000000000000000000000000000000000000000..e3d9c26e81680f2a0c31d2ba8248aee222ebc2db --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_10k_followup.log @@ -0,0 +1,23 @@ +[2026-03-09 15:57:20 UTC] packed 10k runner started +[2026-03-09 16:03:23 UTC] warm-start equivalence check logged to /workspace/run_logs/warmstart_equivalence_10k.log +[2026-03-09 16:03:23 UTC] train start config=pi05_twin_handover_256_packed_baseline_pytorch_10k exp=handover_packed_baseline_10k +[2026-03-09 18:17:03 UTC] train done config=pi05_twin_handover_256_packed_baseline_pytorch_10k exp=handover_packed_baseline_10k +[2026-03-09 18:17:03 UTC] eval start config=pi05_twin_handover_256_packed_baseline_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000 batches=50 +[2026-03-09 18:23:42 UTC] eval done log=/workspace/run_logs/handover_packed_baseline_10k_val_1000.log +[2026-03-09 18:23:42 UTC] eval start config=pi05_twin_handover_256_packed_baseline_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000 batches=50 +[2026-03-09 18:28:54 UTC] eval done log=/workspace/run_logs/handover_packed_baseline_10k_val_2000.log +[2026-03-09 18:28:54 UTC] eval start config=pi05_twin_handover_256_packed_baseline_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000 batches=50 +[2026-03-09 18:33:53 UTC] eval done log=/workspace/run_logs/handover_packed_baseline_10k_val_5000.log +[2026-03-09 18:33:53 UTC] eval start config=pi05_twin_handover_256_packed_baseline_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000 batches=100 +[2026-03-09 18:41:07 UTC] eval done log=/workspace/run_logs/handover_packed_baseline_10k_val_10000.log +[2026-03-09 18:41:07 UTC] train start config=pi05_twin_handover_256_packed_parallel_pytorch_10k exp=handover_packed_parallel_10k +[2026-03-09 21:01:58 UTC] train done config=pi05_twin_handover_256_packed_parallel_pytorch_10k exp=handover_packed_parallel_10k +[2026-03-09 21:01:58 UTC] eval start config=pi05_twin_handover_256_packed_parallel_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000 batches=50 +[2026-03-09 21:14:35 UTC] eval done log=/workspace/run_logs/handover_packed_parallel_10k_val_1000.log +[2026-03-09 21:14:35 UTC] eval start config=pi05_twin_handover_256_packed_parallel_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000 batches=50 +[2026-03-09 21:22:39 UTC] eval done log=/workspace/run_logs/handover_packed_parallel_10k_val_2000.log +[2026-03-09 21:22:40 UTC] eval start config=pi05_twin_handover_256_packed_parallel_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000 batches=50 +[2026-03-09 21:35:26 UTC] eval done log=/workspace/run_logs/handover_packed_parallel_10k_val_5000.log +[2026-03-09 21:35:26 UTC] eval start config=pi05_twin_handover_256_packed_parallel_pytorch_10k ckpt=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000 batches=100 +[2026-03-09 21:45:53 UTC] eval done log=/workspace/run_logs/handover_packed_parallel_10k_val_10000.log +[2026-03-09 21:45:53 UTC] packed 10k runner finished diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k.log new file mode 100644 index 0000000000000000000000000000000000000000..6564bfc7aa56c386c20e3546f5f7f495d7669e9d --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k.log @@ -0,0 +1,1154 @@ +W0309 16:03:36.739000 10707 torch/distributed/run.py:766] +W0309 16:03:36.739000 10707 torch/distributed/run.py:766] ***************************************** +W0309 16:03:36.739000 10707 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0309 16:03:36.739000 10707 torch/distributed/run.py:766] ***************************************** +16:04:31.444 [I] Created experiment checkpoint directory: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k (10775:train_pytorch.py:505) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank0]:[W309 16:04:31.837050337 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank2]:[W309 16:04:32.396579299 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank3]:[W309 16:04:33.308967856 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank1]:[W309 16:04:33.086805020 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +16:04:35.265 [I] Using batch size per GPU: 4 (total batch size across 4 GPUs: 16) (10775:train_pytorch.py:524) +16:04:35.402 [I] Loaded norm stats from /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train (10775:config.py:234) +16:04:35.403 [I] data_config: DataConfig(repo_id='lsnu/twin_handover_256_train', asset_id='lsnu/twin_handover_256_train', norm_stats={'state': NormStats(mean=array([ 0.40321857, 0.17899239, -0.07588876, -2.06326795, -0.46418607, + 1.79356563, 0.70229131, 0.48194093, 0.93952829, 0.86693275, + -1.03168762, -1.9056077 , -0.53421056, 1.87584054, 2.36738205, + 0.91249251]), std=array([0.73344636, 0.47653052, 0.72710407, 0.42399687, 0.63613892, + 0.61144608, 1.11724186, 0.49967375, 0.86981195, 0.75071597, + 0.90787333, 0.35008711, 0.51183224, 0.36600712, 0.56947577, + 0.28257725]), q01=array([-1.52408956, -1.32446341, -1.91092197, -2.89885788, -1.66315554, + 0.59010215, -2.27611645, 0. , -1.77352981, -1.62131719, + -1.77092851, -2.19172778, -2.03159353, 0.55409113, 0.79255736, + 0. ]), q99=array([ 2.16638614, 1.38857444, 1.93436338, -0.88548369, 1.39976143, + 2.99162304, 2.8194857 , 0.9998 , 1.46557211, 1.74660106, + 1.58644652, -0.87876934, 2.25910752, 2.54628449, 2.89347284, + 0.9998 ])), 'actions': NormStats(mean=array([ 0.05879939, -0.00704042, -0.02719213, -0.07685276, -0.07520971, + -0.00498583, 0.03577602, 0.48164892, 0.06564316, 0.06023132, + -0.10068271, -0.09547432, -0.0526481 , 0.08205888, 0.13954687, + 0.88333535]), std=array([0.18337056, 0.28128958, 0.18525195, 0.29767084, 0.22944973, + 0.40312037, 0.3896611 , 0.49966311, 0.21938531, 0.16883859, + 0.20206179, 0.14864719, 0.12629333, 0.15546791, 0.23423795, + 0.32102022]), q01=array([-0.34140511, -0.71597991, -0.55301429, -0.8233152 , -0.68097536, + -0.87723451, -0.86000918, 0. , -0.53261366, -0.49289397, + -0.48524564, -0.35752607, -0.42426748, -0.18230745, -0.09212705, + 0. ]), q99=array([0.55444025, 0.69361174, 0.44115428, 0.550829 , 0.49707318, + 0.68353445, 0.82907713, 0.9998 , 0.42654409, 0.44255511, + 0.4114292 , 0.01550327, 0.38038206, 0.71452535, 0.62808441, + 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (10775:data_loader.py:283) +16:04:35.407 [I] Using existing local LeRobot dataset mirror for lsnu/twin_handover_256_train: /workspace/lerobot/lsnu/twin_handover_256_train (10775:data_loader.py:149) +16:04:43.317 [I] local_batch_size: 4 (10775:data_loader.py:364) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +16:05:59.729 [I] Enabled gradient checkpointing for PI0Pytorch model (10775:pi0_pytorch.py:150) +16:05:59.733 [I] Enabled gradient checkpointing for memory optimization (10775:train_pytorch.py:596) +16:05:59.735 [I] Step 0 (after_model_creation): GPU memory - allocated: 7.47GB, reserved: 7.48GB, free: 0.01GB, peak_allocated: 7.47GB, peak_reserved: 7.48GB | DDP: rank=0, world_size=4 (10775:train_pytorch.py:465) +16:06:04.131 [I] Loading weights from: /workspace/checkpoints/pi05_base_single_pytorch (10775:train_pytorch.py:625) +16:06:06.424 [I] Weight loading missing key count: 0 (10775:train_pytorch.py:629) +16:06:06.424 [I] Weight loading missing keys: set() (10775:train_pytorch.py:630) +16:06:06.424 [I] Weight loading unexpected key count: 0 (10775:train_pytorch.py:631) +16:06:06.424 [I] Weight loading unexpected keys: [] (10775:train_pytorch.py:632) +16:06:06.424 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_single_pytorch (10775:train_pytorch.py:633) +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +16:06:06.430 [I] Running on: 9a96de7d560b | world_size=4 (10775:train_pytorch.py:673) +16:06:06.430 [I] Training config: batch_size=16, effective_batch_size=4, num_train_steps=10000 (10775:train_pytorch.py:674) +16:06:06.430 [I] Memory optimizations: gradient_checkpointing=True (10775:train_pytorch.py:677) +16:06:06.430 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (10775:train_pytorch.py:678) +16:06:06.430 [I] LR schedule: warmup=500, peak_lr=2.50e-05, decay_steps=10000, end_lr=2.50e-06 (10775:train_pytorch.py:679) +16:06:06.430 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (10775:train_pytorch.py:682) +16:06:06.431 [I] EMA is not supported for PyTorch training (10775:train_pytorch.py:685) +16:06:06.431 [I] Training precision: bfloat16 (10775:train_pytorch.py:686) +16:06:06.432 [I] Resolved config name: pi05_twin_handover_256_packed_baseline_pytorch_10k (10775:train_pytorch.py:280) +16:06:06.432 [I] Dataset repo_id: lsnu/twin_handover_256_train (10775:train_pytorch.py:281) +16:06:06.432 [I] Norm-stats file path: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (10775:train_pytorch.py:282) +16:06:06.432 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (10775:train_pytorch.py:283) +16:06:06.433 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_single_pytorch (10775:train_pytorch.py:284) +16:06:06.433 [I] Model type: baseline (10775:train_pytorch.py:285) +16:06:06.433 [I] Packed transforms active: True (10775:train_pytorch.py:286) +16:06:06.433 [I] World size: 4 (10775:train_pytorch.py:287) +16:06:06.433 [I] Batch size: local=4, global=16 (10775:train_pytorch.py:288) +16:06:06.433 [I] num_workers: 8 (10775:train_pytorch.py:289) +16:06:06.433 [I] Precision: bfloat16 (10775:train_pytorch.py:290) +16:06:06.433 [I] LR schedule summary: warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (10775:train_pytorch.py:291) +16:06:06.433 [I] Save/log intervals: save_interval=1000, log_interval=10 (10775:train_pytorch.py:298) +16:06:06.433 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (10775:train_pytorch.py:299) +16:06:06.434 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (10775:train_pytorch.py:300) +16:06:06.434 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (10775:train_pytorch.py:301) +16:06:06.434 [I] Gradient bucket diagnostics: action_in_proj, action_out_proj, shared_expert (10775:train_pytorch.py:694) + Training: 0%| | 0/10000 [00:00 /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000 (10775:train_pytorch.py:350) + Training: 10%|█ | 1000/10000 [11:55<61:22:18, 24.55s/it, loss=0.0439, lr=2.48e-05, step=999] Training: 10%|█ | 1000/10000 [11:55<61:22:18, 24.55s/it, loss=0.0228, lr=2.48e-05, step=1000] Training: 10%|█ | 1001/10000 [11:56<43:23:24, 17.36s/it, loss=0.0228, lr=2.48e-05, step=1000] Training: 10%|█ | 1001/10000 [11:56<43:23:24, 17.36s/it, loss=0.0320, lr=2.48e-05, step=1001] Training: 10%|█ | 1002/10000 [11:57<30:56:08, 12.38s/it, loss=0.0320, lr=2.48e-05, step=1001] Training: 10%|█ | 1002/10000 [11:57<30:56:08, 12.38s/it, loss=0.0308, lr=2.48e-05, step=1002] Training: 10%|█ | 1003/10000 [11:57<22:15:04, 8.90s/it, loss=0.0308, lr=2.48e-05, step=1002] Training: 10%|█ | 1003/10000 [11:57<22:15:04, 8.90s/it, loss=0.0607, lr=2.48e-05, step=1003] Training: 10%|█ | 1004/10000 [11:58<15:58:47, 6.39s/it, loss=0.0607, lr=2.48e-05, step=1003] Training: 10%|█ | 1004/10000 [11:58<15:58:47, 6.39s/it, loss=0.1043, lr=2.48e-05, step=1004] Training: 10%|█ | 1005/10000 [11:58<11:33:11, 4.62s/it, loss=0.1043, lr=2.48e-05, step=1004] Training: 10%|█ | 1005/10000 [11:58<11:33:11, 4.62s/it, loss=0.0199, lr=2.48e-05, step=1005] Training: 10%|█ | 1006/10000 [11:59<8:38:09, 3.46s/it, loss=0.0199, lr=2.48e-05, step=1005] Training: 10%|█ | 1006/10000 [11:59<8:38:09, 3.46s/it, loss=0.0312, lr=2.48e-05, step=1006] Training: 10%|█ | 1007/10000 [12:00<6:45:28, 2.71s/it, loss=0.0312, lr=2.48e-05, step=1006] Training: 10%|█ | 1007/10000 [12:00<6:45:28, 2.71s/it, loss=0.0426, lr=2.48e-05, step=1007] Training: 10%|█ | 1008/10000 [12:01<5:14:49, 2.10s/it, loss=0.0426, lr=2.48e-05, step=1007] Training: 10%|█ | 1008/10000 [12:01<5:14:49, 2.10s/it, loss=0.0342, lr=2.48e-05, step=1008] Training: 10%|█ | 1009/10000 [12:01<4:02:00, 1.62s/it, loss=0.0342, lr=2.48e-05, step=1008] Training: 10%|█ | 1009/10000 [12:01<4:02:00, 1.62s/it, loss=0.0244, lr=2.48e-05, step=1009]16:18:09.187 [I] step=1010 loss=0.0297 smoothed_loss=0.0419 lr=2.48e-05 grad_norm=0.8719 step_time=0.6066s data_time=8.0454s it/s=0.116 eta_to_10000=77780.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0160 grad_action_out_proj=0.2040 grad_shared_expert=0.7087 (10775:train_pytorch.py:850) + Training: 10%|█ | 1010/10000 [12:02<3:36:19, 1.44s/it, loss=0.0244, lr=2.48e-05, step=1009] Training: 10%|█ | 1010/10000 [12:02<3:36:19, 1.44s/it, loss=0.0297, lr=2.48e-05, step=1010] Training: 10%|█ | 1011/10000 [12:03<3:00:42, 1.21s/it, loss=0.0297, lr=2.48e-05, step=1010] Training: 10%|█ | 1011/10000 [12:03<3:00:42, 1.21s/it, loss=0.0413, lr=2.48e-05, step=1011] Training: 10%|█ | 1012/10000 [12:04<2:44:53, 1.10s/it, loss=0.0413, lr=2.48e-05, step=1011] Training: 10%|█ | 1012/10000 [12:04<2:44:53, 1.10s/it, loss=0.0393, lr=2.48e-05, step=1012] Training: 10%|█ | 1013/10000 [12:04<2:17:25, 1.09it/s, loss=0.0393, lr=2.48e-05, step=1012] Training: 10%|█ | 1013/10000 [12:04<2:17:25, 1.09it/s, loss=0.0678, lr=2.48e-05, step=1013] Training: 10%|█ | 1014/10000 [12:05<2:23:03, 1.05it/s, loss=0.0678, lr=2.48e-05, step=1013] Training: 10%|█ | 1014/10000 [12:05<2:23:03, 1.05it/s, loss=0.0337, lr=2.48e-05, step=1014] Training: 10%|█ | 1015/10000 [12:06<2:14:28, 1.11it/s, loss=0.0337, lr=2.48e-05, step=1014] Training: 10%|█ | 1015/10000 [12:06<2:14:28, 1.11it/s, loss=0.0357, lr=2.48e-05, step=1015] Training: 10%|█ | 1016/10000 [12:07<2:04:50, 1.20it/s, loss=0.0357, lr=2.48e-05, step=1015] Training: 10%|█ | 1016/10000 [12:07<2:04:50, 1.20it/s, loss=0.0296, lr=2.48e-05, step=1016] Training: 10%|█ | 1017/10000 [12:07<1:51:41, 1.34it/s, loss=0.0296, lr=2.48e-05, step=1016] Training: 10%|█ | 1017/10000 [12:07<1:51:41, 1.34it/s, loss=0.0292, lr=2.48e-05, step=1017] Training: 10%|█ | 1018/10000 [12:09<2:12:52, 1.13it/s, loss=0.0292, lr=2.48e-05, step=1017] Training: 10%|█ | 1018/10000 [12:09<2:12:52, 1.13it/s, loss=0.0372, lr=2.48e-05, step=1018] Training: 10%|█ | 1019/10000 [12:09<2:11:27, 1.14it/s, loss=0.0372, lr=2.48e-05, step=1018] Training: 10%|█ | 1019/10000 [12:09<2:11:27, 1.14it/s, loss=0.0639, lr=2.48e-05, step=1019]16:18:17.066 [I] step=1020 loss=0.0364 smoothed_loss=0.0415 lr=2.48e-05 grad_norm=0.8359 step_time=0.6508s data_time=0.1370s it/s=1.269 eta_to_10000=7073.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.1792 grad_shared_expert=0.5416 (10775:train_pytorch.py:850) + Training: 10%|█ | 1020/10000 [12:10<2:06:48, 1.18it/s, loss=0.0639, lr=2.48e-05, step=1019] Training: 10%|█ | 1020/10000 [12:10<2:06:48, 1.18it/s, loss=0.0364, lr=2.48e-05, step=1020] Training: 10%|█ | 1021/10000 [12:11<2:05:23, 1.19it/s, loss=0.0364, lr=2.48e-05, step=1020] Training: 10%|█ | 1021/10000 [12:11<2:05:23, 1.19it/s, loss=0.0431, lr=2.48e-05, step=1021] Training: 10%|█ | 1022/10000 [12:12<2:01:05, 1.24it/s, loss=0.0431, lr=2.48e-05, step=1021] Training: 10%|█ | 1022/10000 [12:12<2:01:05, 1.24it/s, loss=0.0558, lr=2.48e-05, step=1022] Training: 10%|█ | 1023/10000 [12:12<1:58:38, 1.26it/s, loss=0.0558, lr=2.48e-05, step=1022] Training: 10%|█ | 1023/10000 [12:12<1:58:38, 1.26it/s, loss=0.0209, lr=2.48e-05, step=1023] Training: 10%|█ | 1024/10000 [12:13<1:50:21, 1.36it/s, loss=0.0209, lr=2.48e-05, step=1023] Training: 10%|█ | 1024/10000 [12:13<1:50:21, 1.36it/s, loss=0.0480, lr=2.48e-05, step=1024] Training: 10%|█ | 1025/10000 [12:14<1:54:21, 1.31it/s, loss=0.0480, lr=2.48e-05, step=1024] Training: 10%|█ | 1025/10000 [12:14<1:54:21, 1.31it/s, loss=0.0471, lr=2.48e-05, step=1025] Training: 10%|█ | 1026/10000 [12:15<1:51:36, 1.34it/s, loss=0.0471, lr=2.48e-05, step=1025] Training: 10%|█ | 1026/10000 [12:15<1:51:36, 1.34it/s, loss=0.0596, lr=2.48e-05, step=1026] Training: 10%|█ | 1027/10000 [12:15<1:50:33, 1.35it/s, loss=0.0596, lr=2.48e-05, step=1026] Training: 10%|█ | 1027/10000 [12:15<1:50:33, 1.35it/s, loss=0.1184, lr=2.48e-05, step=1027] Training: 10%|█ | 1028/10000 [12:16<1:48:58, 1.37it/s, loss=0.1184, lr=2.48e-05, step=1027] Training: 10%|█ | 1028/10000 [12:16<1:48:58, 1.37it/s, loss=0.0416, lr=2.48e-05, step=1028] Training: 10%|█ | 1029/10000 [12:17<1:38:10, 1.52it/s, loss=0.0416, lr=2.48e-05, step=1028] Training: 10%|█ | 1029/10000 [12:17<1:38:10, 1.52it/s, loss=0.0439, lr=2.48e-05, step=1029]16:18:24.297 [I] step=1030 loss=0.0188 smoothed_loss=0.0466 lr=2.48e-05 grad_norm=0.8780 step_time=0.5944s data_time=0.1287s it/s=1.383 eta_to_10000=6484.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0176 grad_action_out_proj=0.1984 grad_shared_expert=0.7845 (10775:train_pytorch.py:850) + Training: 10%|█ | 1030/10000 [12:17<1:47:21, 1.39it/s, loss=0.0439, lr=2.48e-05, step=1029] Training: 10%|█ | 1030/10000 [12:17<1:47:21, 1.39it/s, loss=0.0188, lr=2.48e-05, step=1030] Training: 10%|█ | 1031/10000 [12:18<1:44:46, 1.43it/s, loss=0.0188, lr=2.48e-05, step=1030] Training: 10%|█ | 1031/10000 [12:18<1:44:46, 1.43it/s, loss=0.0456, lr=2.48e-05, step=1031] Training: 10%|█ | 1032/10000 [12:19<1:46:35, 1.40it/s, loss=0.0456, lr=2.48e-05, step=1031] Training: 10%|█ | 1032/10000 [12:19<1:46:35, 1.40it/s, loss=0.0443, lr=2.48e-05, step=1032] Training: 10%|█ | 1033/10000 [12:19<1:37:25, 1.53it/s, loss=0.0443, lr=2.48e-05, step=1032] Training: 10%|█ | 1033/10000 [12:19<1:37:25, 1.53it/s, loss=0.0482, lr=2.48e-05, step=1033] Training: 10%|█ | 1034/10000 [12:20<1:43:49, 1.44it/s, loss=0.0482, lr=2.48e-05, step=1033] Training: 10%|█ | 1034/10000 [12:20<1:43:49, 1.44it/s, loss=0.0140, lr=2.48e-05, step=1034] Training: 10%|█ | 1035/10000 [12:21<1:44:02, 1.44it/s, loss=0.0140, lr=2.48e-05, step=1034] Training: 10%|█ | 1035/10000 [12:21<1:44:02, 1.44it/s, loss=0.0404, lr=2.48e-05, step=1035] Training: 10%|█ | 1036/10000 [12:22<1:48:00, 1.38it/s, loss=0.0404, lr=2.48e-05, step=1035] Training: 10%|█ | 1036/10000 [12:22<1:48:00, 1.38it/s, loss=0.0290, lr=2.48e-05, step=1036] Training: 10%|█ | 1037/10000 [12:22<1:37:34, 1.53it/s, loss=0.0290, lr=2.48e-05, step=1036] Training: 10%|█ | 1037/10000 [12:22<1:37:34, 1.53it/s, loss=0.0205, lr=2.48e-05, step=1037] Training: 10%|█ | 1038/10000 [12:23<1:30:12, 1.66it/s, loss=0.0205, lr=2.48e-05, step=1037] Training: 10%|█ | 1038/10000 [12:23<1:30:12, 1.66it/s, loss=0.0640, lr=2.48e-05, step=1038] Training: 10%|█ | 1039/10000 [12:23<1:38:56, 1.51it/s, loss=0.0640, lr=2.48e-05, step=1038] Training: 10%|█ | 1039/10000 [12:23<1:38:56, 1.51it/s, loss=0.0327, lr=2.48e-05, step=1039]16:18:31.049 [I] step=1040 loss=0.0821 smoothed_loss=0.0451 lr=2.48e-05 grad_norm=0.8972 step_time=0.5810s data_time=0.0943s it/s=1.481 eta_to_10000=6049.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.2065 grad_shared_expert=0.9209 (10775:train_pytorch.py:850) + Training: 10%|█ | 1040/10000 [12:24<1:44:18, 1.43it/s, loss=0.0327, lr=2.48e-05, step=1039] Training: 10%|█ | 1040/10000 [12:24<1:44:18, 1.43it/s, loss=0.0821, lr=2.48e-05, step=1040] Training: 10%|█ | 1041/10000 [12:25<1:35:35, 1.56it/s, loss=0.0821, lr=2.48e-05, step=1040] Training: 10%|█ | 1041/10000 [12:25<1:35:35, 1.56it/s, loss=0.0168, lr=2.48e-05, step=1041] Training: 10%|█ | 1042/10000 [12:25<1:45:15, 1.42it/s, loss=0.0168, lr=2.48e-05, step=1041] Training: 10%|█ | 1042/10000 [12:25<1:45:15, 1.42it/s, loss=0.0234, lr=2.48e-05, step=1042] Training: 10%|█ | 1043/10000 [12:26<1:42:11, 1.46it/s, loss=0.0234, lr=2.48e-05, step=1042] Training: 10%|█ | 1043/10000 [12:26<1:42:11, 1.46it/s, loss=0.0433, lr=2.48e-05, step=1043] Training: 10%|█ | 1044/10000 [12:27<1:48:11, 1.38it/s, loss=0.0433, lr=2.48e-05, step=1043] Training: 10%|█ | 1044/10000 [12:27<1:48:11, 1.38it/s, loss=0.0248, lr=2.48e-05, step=1044] Training: 10%|█ | 1045/10000 [12:28<1:50:42, 1.35it/s, loss=0.0248, lr=2.48e-05, step=1044] Training: 10%|█ | 1045/10000 [12:28<1:50:42, 1.35it/s, loss=0.0339, lr=2.48e-05, step=1045] Training: 10%|█ | 1046/10000 [12:29<2:06:21, 1.18it/s, loss=0.0339, lr=2.48e-05, step=1045] Training: 10%|█ | 1046/10000 [12:29<2:06:21, 1.18it/s, loss=0.0317, lr=2.48e-05, step=1046] Training: 10%|█ | 1047/10000 [12:29<1:59:13, 1.25it/s, loss=0.0317, lr=2.48e-05, step=1046] Training: 10%|█ | 1047/10000 [12:29<1:59:13, 1.25it/s, loss=0.0424, lr=2.48e-05, step=1047] Training: 10%|█ | 1048/10000 [12:30<1:51:33, 1.34it/s, loss=0.0424, lr=2.48e-05, step=1047] Training: 10%|█ | 1048/10000 [12:30<1:51:33, 1.34it/s, loss=0.0521, lr=2.48e-05, step=1048] Training: 10%|█ | 1049/10000 [12:31<1:46:09, 1.41it/s, loss=0.0521, lr=2.48e-05, step=1048] Training: 10%|█ | 1049/10000 [12:31<1:46:09, 1.41it/s, loss=0.0323, lr=2.48e-05, step=1049]16:18:38.481 [I] step=1050 loss=0.0323 smoothed_loss=0.0383 lr=2.48e-05 grad_norm=0.9110 step_time=0.6087s data_time=0.1345s it/s=1.346 eta_to_10000=6651.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0130 grad_action_out_proj=0.2217 grad_shared_expert=0.7126 (10775:train_pytorch.py:850) + Training: 10%|█ | 1050/10000 [12:32<1:50:08, 1.35it/s, loss=0.0323, lr=2.48e-05, step=1049] Training: 10%|█ | 1050/10000 [12:32<1:50:08, 1.35it/s, loss=0.0323, lr=2.48e-05, step=1050] Training: 11%|█ | 1051/10000 [12:32<1:39:38, 1.50it/s, loss=0.0323, lr=2.48e-05, step=1050] Training: 11%|█ | 1051/10000 [12:32<1:39:38, 1.50it/s, loss=0.0436, lr=2.48e-05, step=1051] Training: 11%|█ | 1052/10000 [12:33<1:35:43, 1.56it/s, loss=0.0436, lr=2.48e-05, step=1051] Training: 11%|█ | 1052/10000 [12:33<1:35:43, 1.56it/s, loss=0.0270, lr=2.48e-05, step=1052] Training: 11%|█ | 1053/10000 [12:34<1:57:22, 1.27it/s, loss=0.0270, lr=2.48e-05, step=1052] Training: 11%|█ | 1053/10000 [12:34<1:57:22, 1.27it/s, loss=0.0536, lr=2.48e-05, step=1053] Training: 11%|█ | 1054/10000 [12:34<1:44:13, 1.43it/s, loss=0.0536, lr=2.48e-05, step=1053] Training: 11%|█ | 1054/10000 [12:34<1:44:13, 1.43it/s, loss=0.0562, lr=2.48e-05, step=1054] Training: 11%|█ | 1055/10000 [12:35<1:42:09, 1.46it/s, loss=0.0562, lr=2.48e-05, step=1054] Training: 11%|█ | 1055/10000 [12:35<1:42:09, 1.46it/s, loss=0.0444, lr=2.48e-05, step=1055] Training: 11%|█ | 1056/10000 [12:36<1:46:26, 1.40it/s, loss=0.0444, lr=2.48e-05, step=1055] Training: 11%|█ | 1056/10000 [12:36<1:46:26, 1.40it/s, loss=0.0245, lr=2.48e-05, step=1056] Training: 11%|█ | 1057/10000 [12:36<1:47:22, 1.39it/s, loss=0.0245, lr=2.48e-05, step=1056] Training: 11%|█ | 1057/10000 [12:36<1:47:22, 1.39it/s, loss=0.0779, lr=2.48e-05, step=1057] Training: 11%|█ | 1058/10000 [12:37<1:53:45, 1.31it/s, loss=0.0779, lr=2.48e-05, step=1057] Training: 11%|█ | 1058/10000 [12:37<1:53:45, 1.31it/s, loss=0.0555, lr=2.48e-05, step=1058] Training: 11%|█ | 1059/10000 [12:38<1:45:02, 1.42it/s, loss=0.0555, lr=2.48e-05, step=1058] Training: 11%|█ | 1059/10000 [12:38<1:45:02, 1.42it/s, loss=0.0452, lr=2.48e-05, step=1059]16:18:45.644 [I] step=1060 loss=0.0459 smoothed_loss=0.0448 lr=2.48e-05 grad_norm=0.8361 step_time=0.5833s data_time=0.1330s it/s=1.396 eta_to_10000=6402.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.2213 grad_shared_expert=0.5282 (10775:train_pytorch.py:850) + Training: 11%|█ | 1060/10000 [12:39<1:51:49, 1.33it/s, loss=0.0452, lr=2.48e-05, step=1059] Training: 11%|█ | 1060/10000 [12:39<1:51:49, 1.33it/s, loss=0.0459, lr=2.48e-05, step=1060] Training: 11%|█ | 1061/10000 [12:40<2:02:44, 1.21it/s, loss=0.0459, lr=2.48e-05, step=1060] Training: 11%|█ | 1061/10000 [12:40<2:02:44, 1.21it/s, loss=0.0449, lr=2.48e-05, step=1061] Training: 11%|█ | 1062/10000 [12:41<2:06:26, 1.18it/s, loss=0.0449, lr=2.48e-05, step=1061] Training: 11%|█ | 1062/10000 [12:41<2:06:26, 1.18it/s, loss=0.0247, lr=2.48e-05, step=1062] Training: 11%|█ | 1063/10000 [12:41<2:02:17, 1.22it/s, loss=0.0247, lr=2.48e-05, step=1062] Training: 11%|█ | 1063/10000 [12:41<2:02:17, 1.22it/s, loss=0.0806, lr=2.48e-05, step=1063] Training: 11%|█ | 1064/10000 [12:42<1:47:46, 1.38it/s, loss=0.0806, lr=2.48e-05, step=1063] Training: 11%|█ | 1064/10000 [12:42<1:47:46, 1.38it/s, loss=0.0455, lr=2.48e-05, step=1064] Training: 11%|█ | 1065/10000 [12:43<1:53:59, 1.31it/s, loss=0.0455, lr=2.48e-05, step=1064] Training: 11%|█ | 1065/10000 [12:43<1:53:59, 1.31it/s, loss=0.0337, lr=2.48e-05, step=1065] Training: 11%|█ | 1066/10000 [12:43<1:42:15, 1.46it/s, loss=0.0337, lr=2.48e-05, step=1065] Training: 11%|█ | 1066/10000 [12:43<1:42:15, 1.46it/s, loss=0.0928, lr=2.48e-05, step=1066] Training: 11%|█ | 1067/10000 [12:44<1:46:20, 1.40it/s, loss=0.0928, lr=2.48e-05, step=1066] Training: 11%|█ | 1067/10000 [12:44<1:46:20, 1.40it/s, loss=0.0436, lr=2.48e-05, step=1067] Training: 11%|█ | 1068/10000 [12:45<1:59:07, 1.25it/s, loss=0.0436, lr=2.48e-05, step=1067] Training: 11%|█ | 1068/10000 [12:45<1:59:07, 1.25it/s, loss=0.0336, lr=2.48e-05, step=1068] Training: 11%|█ | 1069/10000 [12:46<1:46:58, 1.39it/s, loss=0.0336, lr=2.48e-05, step=1068] Training: 11%|█ | 1069/10000 [12:46<1:46:58, 1.39it/s, loss=0.0256, lr=2.48e-05, step=1069]16:18:53.016 [I] step=1070 loss=0.0454 smoothed_loss=0.0455 lr=2.48e-05 grad_norm=0.8166 step_time=0.5998s data_time=0.1374s it/s=1.357 eta_to_10000=6582.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0149 grad_action_out_proj=0.2567 grad_shared_expert=0.8801 (10775:train_pytorch.py:850) + Training: 11%|█ | 1070/10000 [12:46<1:39:09, 1.50it/s, loss=0.0256, lr=2.48e-05, step=1069] Training: 11%|█ | 1070/10000 [12:46<1:39:09, 1.50it/s, loss=0.0454, lr=2.48e-05, step=1070] Training: 11%|█ | 1071/10000 [12:47<1:43:26, 1.44it/s, loss=0.0454, lr=2.48e-05, step=1070] Training: 11%|█ | 1071/10000 [12:47<1:43:26, 1.44it/s, loss=0.0341, lr=2.48e-05, step=1071] Training: 11%|█ | 1072/10000 [12:48<1:47:37, 1.38it/s, loss=0.0341, lr=2.48e-05, step=1071] Training: 11%|█ | 1072/10000 [12:48<1:47:37, 1.38it/s, loss=0.0308, lr=2.48e-05, step=1072] Training: 11%|█ | 1073/10000 [12:48<1:49:26, 1.36it/s, loss=0.0308, lr=2.48e-05, step=1072] Training: 11%|█ | 1073/10000 [12:48<1:49:26, 1.36it/s, loss=0.0347, lr=2.48e-05, step=1073] Training: 11%|█ | 1074/10000 [12:49<1:51:59, 1.33it/s, loss=0.0347, lr=2.48e-05, step=1073] Training: 11%|█ | 1074/10000 [12:49<1:51:59, 1.33it/s, loss=0.0287, lr=2.48e-05, step=1074] Training: 11%|█ | 1075/10000 [12:50<2:02:22, 1.22it/s, loss=0.0287, lr=2.48e-05, step=1074] Training: 11%|█ | 1075/10000 [12:50<2:02:22, 1.22it/s, loss=0.0246, lr=2.48e-05, step=1075] Training: 11%|█ | 1076/10000 [12:51<1:58:02, 1.26it/s, loss=0.0246, lr=2.48e-05, step=1075] Training: 11%|█ | 1076/10000 [12:51<1:58:02, 1.26it/s, loss=0.0227, lr=2.48e-05, step=1076] Training: 11%|█ | 1077/10000 [12:51<1:44:26, 1.42it/s, loss=0.0227, lr=2.48e-05, step=1076] Training: 11%|█ | 1077/10000 [12:51<1:44:26, 1.42it/s, loss=0.0217, lr=2.48e-05, step=1077] Training: 11%|█ | 1078/10000 [12:52<1:35:41, 1.55it/s, loss=0.0217, lr=2.48e-05, step=1077] Training: 11%|█ | 1078/10000 [12:52<1:35:41, 1.55it/s, loss=0.0279, lr=2.48e-05, step=1078] Training: 11%|█ | 1079/10000 [12:53<1:39:36, 1.49it/s, loss=0.0279, lr=2.48e-05, step=1078] Training: 11%|█ | 1079/10000 [12:53<1:39:36, 1.49it/s, loss=0.0198, lr=2.48e-05, step=1079]16:19:00.106 [I] step=1080 loss=0.0218 smoothed_loss=0.0325 lr=2.48e-05 grad_norm=0.7621 step_time=0.5909s data_time=0.1182s it/s=1.411 eta_to_10000=6323.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0156 grad_action_out_proj=0.2086 grad_shared_expert=0.7398 (10775:train_pytorch.py:850) + Training: 11%|█ | 1080/10000 [12:53<1:33:55, 1.58it/s, loss=0.0198, lr=2.48e-05, step=1079] Training: 11%|█ | 1080/10000 [12:53<1:33:55, 1.58it/s, loss=0.0218, lr=2.48e-05, step=1080] Training: 11%|█ | 1081/10000 [12:54<1:32:43, 1.60it/s, loss=0.0218, lr=2.48e-05, step=1080] Training: 11%|█ | 1081/10000 [12:54<1:32:43, 1.60it/s, loss=0.0754, lr=2.48e-05, step=1081] Training: 11%|█ | 1082/10000 [12:55<1:42:44, 1.45it/s, loss=0.0754, lr=2.48e-05, step=1081] Training: 11%|█ | 1082/10000 [12:55<1:42:44, 1.45it/s, loss=0.0172, lr=2.48e-05, step=1082] Training: 11%|█ | 1083/10000 [12:55<1:43:07, 1.44it/s, loss=0.0172, lr=2.48e-05, step=1082] Training: 11%|█ | 1083/10000 [12:55<1:43:07, 1.44it/s, loss=0.0919, lr=2.48e-05, step=1083] Training: 11%|█ | 1084/10000 [12:56<1:34:22, 1.57it/s, loss=0.0919, lr=2.48e-05, step=1083] Training: 11%|█ | 1084/10000 [12:56<1:34:22, 1.57it/s, loss=0.1133, lr=2.48e-05, step=1084] Training: 11%|█ | 1085/10000 [12:56<1:29:09, 1.67it/s, loss=0.1133, lr=2.48e-05, step=1084] Training: 11%|█ | 1085/10000 [12:56<1:29:09, 1.67it/s, loss=0.0381, lr=2.48e-05, step=1085] Training: 11%|█ | 1086/10000 [12:57<1:51:44, 1.33it/s, loss=0.0381, lr=2.48e-05, step=1085] Training: 11%|█ | 1086/10000 [12:57<1:51:44, 1.33it/s, loss=0.0471, lr=2.48e-05, step=1086] Training: 11%|█ | 1087/10000 [12:58<1:54:35, 1.30it/s, loss=0.0471, lr=2.48e-05, step=1086] Training: 11%|█ | 1087/10000 [12:58<1:54:35, 1.30it/s, loss=0.0350, lr=2.48e-05, step=1087] Training: 11%|█ | 1088/10000 [12:59<1:41:53, 1.46it/s, loss=0.0350, lr=2.48e-05, step=1087] Training: 11%|█ | 1088/10000 [12:59<1:41:53, 1.46it/s, loss=0.0282, lr=2.48e-05, step=1088] Training: 11%|█ | 1089/10000 [12:59<1:43:00, 1.44it/s, loss=0.0282, lr=2.48e-05, step=1088] Training: 11%|█ | 1089/10000 [12:59<1:43:00, 1.44it/s, loss=0.0178, lr=2.48e-05, step=1089]16:19:07.160 [I] step=1090 loss=0.0449 smoothed_loss=0.0417 lr=2.48e-05 grad_norm=0.6918 step_time=0.5825s data_time=0.1229s it/s=1.418 eta_to_10000=6283.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0228 grad_action_out_proj=0.1775 grad_shared_expert=0.7770 (10775:train_pytorch.py:850) + Training: 11%|█ | 1090/10000 [13:00<1:46:02, 1.40it/s, loss=0.0178, lr=2.48e-05, step=1089] Training: 11%|█ | 1090/10000 [13:00<1:46:02, 1.40it/s, loss=0.0449, lr=2.48e-05, step=1090] Training: 11%|█ | 1091/10000 [13:01<1:36:19, 1.54it/s, loss=0.0449, lr=2.48e-05, step=1090] Training: 11%|█ | 1091/10000 [13:01<1:36:19, 1.54it/s, loss=0.0236, lr=2.48e-05, step=1091] Training: 11%|█ | 1092/10000 [13:01<1:29:49, 1.65it/s, loss=0.0236, lr=2.48e-05, step=1091] Training: 11%|█ | 1092/10000 [13:01<1:29:49, 1.65it/s, loss=0.0175, lr=2.48e-05, step=1092] Training: 11%|█ | 1093/10000 [13:02<1:37:40, 1.52it/s, loss=0.0175, lr=2.48e-05, step=1092] Training: 11%|█ | 1093/10000 [13:02<1:37:40, 1.52it/s, loss=0.0259, lr=2.48e-05, step=1093] Training: 11%|█ | 1094/10000 [13:02<1:30:13, 1.65it/s, loss=0.0259, lr=2.48e-05, step=1093] Training: 11%|█ | 1094/10000 [13:02<1:30:13, 1.65it/s, loss=0.0440, lr=2.48e-05, step=1094] Training: 11%|█ | 1095/10000 [13:03<1:25:42, 1.73it/s, loss=0.0440, lr=2.48e-05, step=1094] Training: 11%|█ | 1095/10000 [13:03<1:25:42, 1.73it/s, loss=0.0528, lr=2.48e-05, step=1095] Training: 11%|█ | 1096/10000 [13:04<1:36:27, 1.54it/s, loss=0.0528, lr=2.48e-05, step=1095] Training: 11%|█ | 1096/10000 [13:04<1:36:27, 1.54it/s, loss=0.0367, lr=2.48e-05, step=1096] Training: 11%|█ | 1097/10000 [13:05<1:42:48, 1.44it/s, loss=0.0367, lr=2.48e-05, step=1096] Training: 11%|█ | 1097/10000 [13:05<1:42:48, 1.44it/s, loss=0.0427, lr=2.48e-05, step=1097] Training: 11%|█ | 1098/10000 [13:05<1:33:59, 1.58it/s, loss=0.0427, lr=2.48e-05, step=1097] Training: 11%|█ | 1098/10000 [13:05<1:33:59, 1.58it/s, loss=0.0546, lr=2.48e-05, step=1098] Training: 11%|█ | 1099/10000 [13:06<1:31:11, 1.63it/s, loss=0.0546, lr=2.48e-05, step=1098] Training: 11%|█ | 1099/10000 [13:06<1:31:11, 1.63it/s, loss=0.0483, lr=2.48e-05, step=1099]16:19:13.394 [I] step=1100 loss=0.0450 smoothed_loss=0.0417 lr=2.48e-05 grad_norm=0.7677 step_time=0.5418s data_time=0.0816s it/s=1.604 eta_to_10000=5547.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.1641 grad_shared_expert=0.6385 (10775:train_pytorch.py:850) + Training: 11%|█ | 1100/10000 [13:06<1:38:25, 1.51it/s, loss=0.0483, lr=2.48e-05, step=1099] Training: 11%|█ | 1100/10000 [13:06<1:38:25, 1.51it/s, loss=0.0450, lr=2.48e-05, step=1100] Training: 11%|█ | 1101/10000 [13:07<1:46:27, 1.39it/s, loss=0.0450, lr=2.48e-05, step=1100] Training: 11%|█ | 1101/10000 [13:07<1:46:27, 1.39it/s, loss=0.2338, lr=2.48e-05, step=1101] Training: 11%|█ | 1102/10000 [13:08<1:36:23, 1.54it/s, loss=0.2338, lr=2.48e-05, step=1101] Training: 11%|█ | 1102/10000 [13:08<1:36:23, 1.54it/s, loss=0.0289, lr=2.48e-05, step=1102] Training: 11%|█ | 1103/10000 [13:08<1:33:45, 1.58it/s, loss=0.0289, lr=2.48e-05, step=1102] Training: 11%|█ | 1103/10000 [13:08<1:33:45, 1.58it/s, loss=0.0742, lr=2.48e-05, step=1103] Training: 11%|█ | 1104/10000 [13:09<1:43:21, 1.43it/s, loss=0.0742, lr=2.48e-05, step=1103] Training: 11%|█ | 1104/10000 [13:09<1:43:21, 1.43it/s, loss=0.0208, lr=2.48e-05, step=1104] Training: 11%|█ | 1105/10000 [13:10<1:41:26, 1.46it/s, loss=0.0208, lr=2.48e-05, step=1104] Training: 11%|█ | 1105/10000 [13:10<1:41:26, 1.46it/s, loss=0.0260, lr=2.48e-05, step=1105] Training: 11%|█ | 1106/10000 [13:10<1:33:34, 1.58it/s, loss=0.0260, lr=2.48e-05, step=1105] Training: 11%|█ | 1106/10000 [13:10<1:33:34, 1.58it/s, loss=0.0380, lr=2.48e-05, step=1106] Training: 11%|█ | 1107/10000 [13:11<1:36:52, 1.53it/s, loss=0.0380, lr=2.48e-05, step=1106] Training: 11%|█ | 1107/10000 [13:11<1:36:52, 1.53it/s, loss=0.0327, lr=2.48e-05, step=1107] Training: 11%|█ | 1108/10000 [13:12<1:29:11, 1.66it/s, loss=0.0327, lr=2.48e-05, step=1107] Training: 11%|█ | 1108/10000 [13:12<1:29:11, 1.66it/s, loss=0.0455, lr=2.48e-05, step=1108] Training: 11%|█ | 1109/10000 [13:12<1:24:40, 1.75it/s, loss=0.0455, lr=2.48e-05, step=1108] Training: 11%|█ | 1109/10000 [13:12<1:24:40, 1.75it/s, loss=0.0442, lr=2.48e-05, step=1109]16:19:19.537 [I] step=1110 loss=0.0918 smoothed_loss=0.0528 lr=2.48e-05 grad_norm=0.7288 step_time=0.5146s data_time=0.0997s it/s=1.628 eta_to_10000=5460.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0193 grad_action_out_proj=0.2358 grad_shared_expert=0.5349 (10775:train_pytorch.py:850) + Training: 11%|█ | 1110/10000 [13:13<1:22:21, 1.80it/s, loss=0.0442, lr=2.48e-05, step=1109] Training: 11%|█ | 1110/10000 [13:13<1:22:21, 1.80it/s, loss=0.0918, lr=2.48e-05, step=1110] Training: 11%|█ | 1111/10000 [13:14<1:42:29, 1.45it/s, loss=0.0918, lr=2.48e-05, step=1110] Training: 11%|█ | 1111/10000 [13:14<1:42:29, 1.45it/s, loss=0.0431, lr=2.48e-05, step=1111] Training: 11%|█ | 1112/10000 [13:14<1:36:25, 1.54it/s, loss=0.0431, lr=2.48e-05, step=1111] Training: 11%|█ | 1112/10000 [13:14<1:36:25, 1.54it/s, loss=0.0184, lr=2.48e-05, step=1112] Training: 11%|█ | 1113/10000 [13:15<1:28:48, 1.67it/s, loss=0.0184, lr=2.48e-05, step=1112] Training: 11%|█ | 1113/10000 [13:15<1:28:48, 1.67it/s, loss=0.0190, lr=2.48e-05, step=1113] Training: 11%|█ | 1114/10000 [13:15<1:37:29, 1.52it/s, loss=0.0190, lr=2.48e-05, step=1113] Training: 11%|█ | 1114/10000 [13:15<1:37:29, 1.52it/s, loss=0.0562, lr=2.48e-05, step=1114] Training: 11%|█ | 1115/10000 [13:16<1:41:18, 1.46it/s, loss=0.0562, lr=2.48e-05, step=1114] Training: 11%|█ | 1115/10000 [13:16<1:41:18, 1.46it/s, loss=0.0338, lr=2.48e-05, step=1115] Training: 11%|█ | 1116/10000 [13:17<1:41:05, 1.46it/s, loss=0.0338, lr=2.48e-05, step=1115] Training: 11%|█ | 1116/10000 [13:17<1:41:05, 1.46it/s, loss=0.0139, lr=2.48e-05, step=1116] Training: 11%|█ | 1117/10000 [13:17<1:34:19, 1.57it/s, loss=0.0139, lr=2.48e-05, step=1116] Training: 11%|█ | 1117/10000 [13:17<1:34:19, 1.57it/s, loss=0.0565, lr=2.48e-05, step=1117] Training: 11%|█ | 1118/10000 [13:18<1:42:42, 1.44it/s, loss=0.0565, lr=2.48e-05, step=1117] Training: 11%|█ | 1118/10000 [13:18<1:42:42, 1.44it/s, loss=0.0259, lr=2.48e-05, step=1118] Training: 11%|█ | 1119/10000 [13:19<1:50:35, 1.34it/s, loss=0.0259, lr=2.48e-05, step=1118] Training: 11%|█ | 1119/10000 [13:19<1:50:35, 1.34it/s, loss=0.0452, lr=2.48e-05, step=1119]16:19:26.575 [I] step=1120 loss=0.0450 smoothed_loss=0.0424 lr=2.48e-05 grad_norm=0.7824 step_time=0.5915s data_time=0.1123s it/s=1.421 eta_to_10000=6247.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0150 grad_action_out_proj=0.2222 grad_shared_expert=0.8045 (10775:train_pytorch.py:850) + Training: 11%|█ | 1120/10000 [13:20<1:41:35, 1.46it/s, loss=0.0452, lr=2.48e-05, step=1119] Training: 11%|█ | 1120/10000 [13:20<1:41:35, 1.46it/s, loss=0.0450, lr=2.48e-05, step=1120] Training: 11%|█ | 1121/10000 [13:20<1:42:20, 1.45it/s, loss=0.0450, lr=2.48e-05, step=1120] Training: 11%|█ | 1121/10000 [13:20<1:42:20, 1.45it/s, loss=0.0206, lr=2.48e-05, step=1121] Training: 11%|█ | 1122/10000 [13:21<1:45:01, 1.41it/s, loss=0.0206, lr=2.48e-05, step=1121] Training: 11%|█ | 1122/10000 [13:21<1:45:01, 1.41it/s, loss=0.0399, lr=2.48e-05, step=1122] Training: 11%|█ | 1123/10000 [13:22<1:46:31, 1.39it/s, loss=0.0399, lr=2.48e-05, step=1122] Training: 11%|█ | 1123/10000 [13:22<1:46:31, 1.39it/s, loss=0.0376, lr=2.48e-05, step=1123] Training: 11%|█ | 1124/10000 [13:22<1:35:54, 1.54it/s, loss=0.0376, lr=2.48e-05, step=1123] Training: 11%|█ | 1124/10000 [13:22<1:35:54, 1.54it/s, loss=0.0285, lr=2.48e-05, step=1124] Training: 11%|█▏ | 1125/10000 [13:23<1:29:20, 1.66it/s, loss=0.0285, lr=2.48e-05, step=1124] Training: 11%|█▏ | 1125/10000 [13:23<1:29:20, 1.66it/s, loss=0.0417, lr=2.48e-05, step=1125] Training: 11%|█▏ | 1126/10000 [13:24<1:36:13, 1.54it/s, loss=0.0417, lr=2.48e-05, step=1125] Training: 11%|█▏ | 1126/10000 [13:24<1:36:13, 1.54it/s, loss=0.0207, lr=2.48e-05, step=1126] Training: 11%|█▏ | 1127/10000 [13:24<1:41:33, 1.46it/s, loss=0.0207, lr=2.48e-05, step=1126] Training: 11%|█▏ | 1127/10000 [13:24<1:41:33, 1.46it/s, loss=0.0239, lr=2.48e-05, step=1127] Training: 11%|█▏ | 1128/10000 [13:25<1:32:25, 1.60it/s, loss=0.0239, lr=2.48e-05, step=1127] Training: 11%|█▏ | 1128/10000 [13:25<1:32:25, 1.60it/s, loss=0.0479, lr=2.48e-05, step=1128] Training: 11%|█▏ | 1129/10000 [13:25<1:32:57, 1.59it/s, loss=0.0479, lr=2.48e-05, step=1128] Training: 11%|█▏ | 1129/10000 [13:25<1:32:57, 1.59it/s, loss=0.0501, lr=2.48e-05, step=1129]16:19:33.193 [I] step=1130 loss=0.0836 smoothed_loss=0.0429 lr=2.48e-05 grad_norm=0.7360 step_time=0.5573s data_time=0.1045s it/s=1.511 eta_to_10000=5869.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0144 grad_action_out_proj=0.2458 grad_shared_expert=0.7617 (10775:train_pytorch.py:850) + Training: 11%|█▏ | 1130/10000 [13:26<1:40:09, 1.48it/s, loss=0.0501, lr=2.48e-05, step=1129] Training: 11%|█▏ | 1130/10000 [13:26<1:40:09, 1.48it/s, loss=0.0836, lr=2.48e-05, step=1130] Training: 11%|█▏ | 1131/10000 [13:27<1:36:08, 1.54it/s, loss=0.0836, lr=2.48e-05, step=1130] Training: 11%|█▏ | 1131/10000 [13:27<1:36:08, 1.54it/s, loss=0.0681, lr=2.48e-05, step=1131] Training: 11%|█▏ | 1132/10000 [13:27<1:29:18, 1.66it/s, loss=0.0681, lr=2.48e-05, step=1131] Training: 11%|█▏ | 1132/10000 [13:27<1:29:18, 1.66it/s, loss=0.0159, lr=2.48e-05, step=1132] Training: 11%|█▏ | 1133/10000 [13:28<1:36:15, 1.54it/s, loss=0.0159, lr=2.48e-05, step=1132] Training: 11%|█▏ | 1133/10000 [13:28<1:36:15, 1.54it/s, loss=0.0123, lr=2.48e-05, step=1133] Training: 11%|█▏ | 1134/10000 [13:29<1:28:37, 1.67it/s, loss=0.0123, lr=2.48e-05, step=1133] Training: 11%|█▏ | 1134/10000 [13:29<1:28:37, 1.67it/s, loss=0.0173, lr=2.48e-05, step=1134] Training: 11%|█▏ | 1135/10000 [13:29<1:23:15, 1.77it/s, loss=0.0173, lr=2.48e-05, step=1134] Training: 11%|█▏ | 1135/10000 [13:29<1:23:15, 1.77it/s, loss=0.0285, lr=2.48e-05, step=1135] Training: 11%|█▏ | 1136/10000 [13:30<1:19:35, 1.86it/s, loss=0.0285, lr=2.48e-05, step=1135] Training: 11%|█▏ | 1136/10000 [13:30<1:19:35, 1.86it/s, loss=0.0246, lr=2.48e-05, step=1136] Training: 11%|█▏ | 1137/10000 [13:30<1:27:53, 1.68it/s, loss=0.0246, lr=2.48e-05, step=1136] Training: 11%|█▏ | 1137/10000 [13:30<1:27:53, 1.68it/s, loss=0.0185, lr=2.48e-05, step=1137] Training: 11%|█▏ | 1138/10000 [13:31<1:34:51, 1.56it/s, loss=0.0185, lr=2.48e-05, step=1137] Training: 11%|█▏ | 1138/10000 [13:31<1:34:51, 1.56it/s, loss=0.0561, lr=2.48e-05, step=1138] Training: 11%|█▏ | 1139/10000 [13:32<1:38:30, 1.50it/s, loss=0.0561, lr=2.48e-05, step=1138] Training: 11%|█▏ | 1139/10000 [13:32<1:38:30, 1.50it/s, loss=0.0119, lr=2.48e-05, step=1139]16:19:39.294 [I] step=1140 loss=0.0124 smoothed_loss=0.0313 lr=2.48e-05 grad_norm=0.7389 step_time=0.5296s data_time=0.0806s it/s=1.640 eta_to_10000=5401.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0110 grad_action_out_proj=0.1850 grad_shared_expert=0.5464 (10775:train_pytorch.py:850) + Training: 11%|█▏ | 1140/10000 [13:32<1:35:58, 1.54it/s, loss=0.0119, lr=2.48e-05, step=1139] Training: 11%|█▏ | 1140/10000 [13:32<1:35:58, 1.54it/s, loss=0.0124, lr=2.47e-05, step=1140] Training: 11%|█▏ | 1141/10000 [13:33<1:30:53, 1.62it/s, loss=0.0124, lr=2.47e-05, step=1140] Training: 11%|█▏ | 1141/10000 [13:33<1:30:53, 1.62it/s, loss=0.0465, lr=2.47e-05, step=1141] Training: 11%|█▏ | 1142/10000 [13:33<1:25:42, 1.72it/s, loss=0.0465, lr=2.47e-05, step=1141] Training: 11%|█▏ | 1142/10000 [13:33<1:25:42, 1.72it/s, loss=0.0474, lr=2.47e-05, step=1142] Training: 11%|█▏ | 1143/10000 [13:34<1:21:55, 1.80it/s, loss=0.0474, lr=2.47e-05, step=1142] Training: 11%|█▏ | 1143/10000 [13:34<1:21:55, 1.80it/s, loss=0.0378, lr=2.47e-05, step=1143] Training: 11%|█▏ | 1144/10000 [13:34<1:21:16, 1.82it/s, loss=0.0378, lr=2.47e-05, step=1143] Training: 11%|█▏ | 1144/10000 [13:34<1:21:16, 1.82it/s, loss=0.0359, lr=2.47e-05, step=1144] Training: 11%|█▏ | 1145/10000 [13:35<1:31:54, 1.61it/s, loss=0.0359, lr=2.47e-05, step=1144] Training: 11%|█▏ | 1145/10000 [13:35<1:31:54, 1.61it/s, loss=0.0254, lr=2.47e-05, step=1145] Training: 11%|█▏ | 1146/10000 [13:36<1:39:18, 1.49it/s, loss=0.0254, lr=2.47e-05, step=1145] Training: 11%|█▏ | 1146/10000 [13:36<1:39:18, 1.49it/s, loss=0.0101, lr=2.47e-05, step=1146] Training: 11%|█▏ | 1147/10000 [13:37<1:44:02, 1.42it/s, loss=0.0101, lr=2.47e-05, step=1146] Training: 11%|█▏ | 1147/10000 [13:37<1:44:02, 1.42it/s, loss=0.0491, lr=2.47e-05, step=1147] Training: 11%|█▏ | 1148/10000 [13:37<1:35:01, 1.55it/s, loss=0.0491, lr=2.47e-05, step=1147] Training: 11%|█▏ | 1148/10000 [13:37<1:35:01, 1.55it/s, loss=0.0402, lr=2.47e-05, step=1148] Training: 11%|█▏ | 1149/10000 [13:38<1:27:55, 1.68it/s, loss=0.0402, lr=2.47e-05, step=1148] Training: 11%|█▏ | 1149/10000 [13:38<1:27:55, 1.68it/s, loss=0.0364, lr=2.47e-05, step=1149]16:19:45.245 [I] step=1150 loss=0.0292 smoothed_loss=0.0337 lr=2.47e-05 grad_norm=0.7077 step_time=0.5148s data_time=0.0802s it/s=1.681 eta_to_10000=5265.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0140 grad_action_out_proj=0.2150 grad_shared_expert=0.6158 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1150/10000 [13:38<1:25:10, 1.73it/s, loss=0.0364, lr=2.47e-05, step=1149] Training: 12%|█▏ | 1150/10000 [13:38<1:25:10, 1.73it/s, loss=0.0292, lr=2.47e-05, step=1150] Training: 12%|█▏ | 1151/10000 [13:39<1:22:05, 1.80it/s, loss=0.0292, lr=2.47e-05, step=1150] Training: 12%|█▏ | 1151/10000 [13:39<1:22:05, 1.80it/s, loss=0.0405, lr=2.47e-05, step=1151] Training: 12%|█▏ | 1152/10000 [13:40<1:29:11, 1.65it/s, loss=0.0405, lr=2.47e-05, step=1151] Training: 12%|█▏ | 1152/10000 [13:40<1:29:11, 1.65it/s, loss=0.0386, lr=2.47e-05, step=1152] Training: 12%|█▏ | 1153/10000 [13:40<1:34:59, 1.55it/s, loss=0.0386, lr=2.47e-05, step=1152] Training: 12%|█▏ | 1153/10000 [13:40<1:34:59, 1.55it/s, loss=0.0542, lr=2.47e-05, step=1153] Training: 12%|█▏ | 1154/10000 [13:41<1:40:28, 1.47it/s, loss=0.0542, lr=2.47e-05, step=1153] Training: 12%|█▏ | 1154/10000 [13:41<1:40:28, 1.47it/s, loss=0.0308, lr=2.47e-05, step=1154] Training: 12%|█▏ | 1155/10000 [13:42<1:33:15, 1.58it/s, loss=0.0308, lr=2.47e-05, step=1154] Training: 12%|█▏ | 1155/10000 [13:42<1:33:15, 1.58it/s, loss=0.0700, lr=2.47e-05, step=1155] Training: 12%|█▏ | 1156/10000 [13:42<1:32:24, 1.60it/s, loss=0.0700, lr=2.47e-05, step=1155] Training: 12%|█▏ | 1156/10000 [13:42<1:32:24, 1.60it/s, loss=0.0231, lr=2.47e-05, step=1156] Training: 12%|█▏ | 1157/10000 [13:43<1:26:17, 1.71it/s, loss=0.0231, lr=2.47e-05, step=1156] Training: 12%|█▏ | 1157/10000 [13:43<1:26:17, 1.71it/s, loss=0.0200, lr=2.47e-05, step=1157] Training: 12%|█▏ | 1158/10000 [13:43<1:29:32, 1.65it/s, loss=0.0200, lr=2.47e-05, step=1157] Training: 12%|█▏ | 1158/10000 [13:43<1:29:32, 1.65it/s, loss=0.0575, lr=2.47e-05, step=1158] Training: 12%|█▏ | 1159/10000 [13:44<1:35:58, 1.54it/s, loss=0.0575, lr=2.47e-05, step=1158] Training: 12%|█▏ | 1159/10000 [13:44<1:35:58, 1.54it/s, loss=0.0451, lr=2.47e-05, step=1159]16:19:51.533 [I] step=1160 loss=0.0240 smoothed_loss=0.0374 lr=2.47e-05 grad_norm=0.7827 step_time=0.5393s data_time=0.0895s it/s=1.591 eta_to_10000=5557.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.1702 grad_shared_expert=0.6963 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1160/10000 [13:45<1:30:20, 1.63it/s, loss=0.0451, lr=2.47e-05, step=1159] Training: 12%|█▏ | 1160/10000 [13:45<1:30:20, 1.63it/s, loss=0.0240, lr=2.47e-05, step=1160] Training: 12%|█▏ | 1161/10000 [13:45<1:37:43, 1.51it/s, loss=0.0240, lr=2.47e-05, step=1160] Training: 12%|█▏ | 1161/10000 [13:45<1:37:43, 1.51it/s, loss=0.0653, lr=2.47e-05, step=1161] Training: 12%|█▏ | 1162/10000 [13:46<1:30:38, 1.63it/s, loss=0.0653, lr=2.47e-05, step=1161] Training: 12%|█▏ | 1162/10000 [13:46<1:30:38, 1.63it/s, loss=0.0311, lr=2.47e-05, step=1162] Training: 12%|█▏ | 1163/10000 [13:46<1:26:00, 1.71it/s, loss=0.0311, lr=2.47e-05, step=1162] Training: 12%|█▏ | 1163/10000 [13:46<1:26:00, 1.71it/s, loss=0.0347, lr=2.47e-05, step=1163] Training: 12%|█▏ | 1164/10000 [13:47<1:23:00, 1.77it/s, loss=0.0347, lr=2.47e-05, step=1163] Training: 12%|█▏ | 1164/10000 [13:47<1:23:00, 1.77it/s, loss=0.0261, lr=2.47e-05, step=1164] Training: 12%|█▏ | 1165/10000 [13:48<1:29:48, 1.64it/s, loss=0.0261, lr=2.47e-05, step=1164] Training: 12%|█▏ | 1165/10000 [13:48<1:29:48, 1.64it/s, loss=0.0301, lr=2.47e-05, step=1165] Training: 12%|█▏ | 1166/10000 [13:48<1:36:12, 1.53it/s, loss=0.0301, lr=2.47e-05, step=1165] Training: 12%|█▏ | 1166/10000 [13:48<1:36:12, 1.53it/s, loss=0.0620, lr=2.47e-05, step=1166] Training: 12%|█▏ | 1167/10000 [13:49<1:28:41, 1.66it/s, loss=0.0620, lr=2.47e-05, step=1166] Training: 12%|█▏ | 1167/10000 [13:49<1:28:41, 1.66it/s, loss=0.0289, lr=2.47e-05, step=1167] Training: 12%|█▏ | 1168/10000 [13:50<1:36:34, 1.52it/s, loss=0.0289, lr=2.47e-05, step=1167] Training: 12%|█▏ | 1168/10000 [13:50<1:36:34, 1.52it/s, loss=0.0196, lr=2.47e-05, step=1168] Training: 12%|█▏ | 1169/10000 [13:50<1:39:16, 1.48it/s, loss=0.0196, lr=2.47e-05, step=1168] Training: 12%|█▏ | 1169/10000 [13:50<1:39:16, 1.48it/s, loss=0.0720, lr=2.47e-05, step=1169]16:19:58.080 [I] step=1170 loss=0.0497 smoothed_loss=0.0410 lr=2.47e-05 grad_norm=0.8083 step_time=0.5590s data_time=0.0958s it/s=1.527 eta_to_10000=5781.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1826 grad_shared_expert=0.7973 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1170/10000 [13:51<1:43:59, 1.42it/s, loss=0.0720, lr=2.47e-05, step=1169] Training: 12%|█▏ | 1170/10000 [13:51<1:43:59, 1.42it/s, loss=0.0497, lr=2.47e-05, step=1170] Training: 12%|█▏ | 1171/10000 [13:52<1:34:21, 1.56it/s, loss=0.0497, lr=2.47e-05, step=1170] Training: 12%|█▏ | 1171/10000 [13:52<1:34:21, 1.56it/s, loss=0.0885, lr=2.47e-05, step=1171] Training: 12%|█▏ | 1172/10000 [13:52<1:27:58, 1.67it/s, loss=0.0885, lr=2.47e-05, step=1171] Training: 12%|█▏ | 1172/10000 [13:52<1:27:58, 1.67it/s, loss=0.0924, lr=2.47e-05, step=1172] Training: 12%|█▏ | 1173/10000 [13:53<1:38:36, 1.49it/s, loss=0.0924, lr=2.47e-05, step=1172] Training: 12%|█▏ | 1173/10000 [13:53<1:38:36, 1.49it/s, loss=0.0299, lr=2.47e-05, step=1173] Training: 12%|█▏ | 1174/10000 [13:54<1:39:28, 1.48it/s, loss=0.0299, lr=2.47e-05, step=1173] Training: 12%|█▏ | 1174/10000 [13:54<1:39:28, 1.48it/s, loss=0.0512, lr=2.47e-05, step=1174] Training: 12%|█▏ | 1175/10000 [13:54<1:45:13, 1.40it/s, loss=0.0512, lr=2.47e-05, step=1174] Training: 12%|█▏ | 1175/10000 [13:54<1:45:13, 1.40it/s, loss=0.0447, lr=2.47e-05, step=1175] Training: 12%|█▏ | 1176/10000 [13:55<1:35:32, 1.54it/s, loss=0.0447, lr=2.47e-05, step=1175] Training: 12%|█▏ | 1176/10000 [13:55<1:35:32, 1.54it/s, loss=0.0154, lr=2.47e-05, step=1176] Training: 12%|█▏ | 1177/10000 [13:56<1:34:30, 1.56it/s, loss=0.0154, lr=2.47e-05, step=1176] Training: 12%|█▏ | 1177/10000 [13:56<1:34:30, 1.56it/s, loss=0.0735, lr=2.47e-05, step=1177] Training: 12%|█▏ | 1178/10000 [13:56<1:28:07, 1.67it/s, loss=0.0735, lr=2.47e-05, step=1177] Training: 12%|█▏ | 1178/10000 [13:56<1:28:07, 1.67it/s, loss=0.0544, lr=2.47e-05, step=1178] Training: 12%|█▏ | 1179/10000 [13:57<1:39:07, 1.48it/s, loss=0.0544, lr=2.47e-05, step=1178] Training: 12%|█▏ | 1179/10000 [13:57<1:39:07, 1.48it/s, loss=0.0979, lr=2.47e-05, step=1179]16:20:04.433 [I] step=1180 loss=0.0602 smoothed_loss=0.0541 lr=2.47e-05 grad_norm=0.8179 step_time=0.5410s data_time=0.0943s it/s=1.574 eta_to_10000=5602.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0122 grad_action_out_proj=0.2021 grad_shared_expert=0.7957 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1180/10000 [13:57<1:34:08, 1.56it/s, loss=0.0979, lr=2.47e-05, step=1179] Training: 12%|█▏ | 1180/10000 [13:57<1:34:08, 1.56it/s, loss=0.0602, lr=2.47e-05, step=1180] Training: 12%|█▏ | 1181/10000 [13:58<1:29:44, 1.64it/s, loss=0.0602, lr=2.47e-05, step=1180] Training: 12%|█▏ | 1181/10000 [13:58<1:29:44, 1.64it/s, loss=0.0271, lr=2.47e-05, step=1181] Training: 12%|█▏ | 1182/10000 [13:59<1:34:16, 1.56it/s, loss=0.0271, lr=2.47e-05, step=1181] Training: 12%|█▏ | 1182/10000 [13:59<1:34:16, 1.56it/s, loss=0.0527, lr=2.47e-05, step=1182] Training: 12%|█▏ | 1183/10000 [13:59<1:28:02, 1.67it/s, loss=0.0527, lr=2.47e-05, step=1182] Training: 12%|█▏ | 1183/10000 [13:59<1:28:02, 1.67it/s, loss=0.0173, lr=2.47e-05, step=1183] Training: 12%|█▏ | 1184/10000 [14:00<1:35:24, 1.54it/s, loss=0.0173, lr=2.47e-05, step=1183] Training: 12%|█▏ | 1184/10000 [14:00<1:35:24, 1.54it/s, loss=0.0321, lr=2.47e-05, step=1184] Training: 12%|█▏ | 1185/10000 [14:01<1:28:56, 1.65it/s, loss=0.0321, lr=2.47e-05, step=1184] Training: 12%|█▏ | 1185/10000 [14:01<1:28:56, 1.65it/s, loss=0.0344, lr=2.47e-05, step=1185] Training: 12%|█▏ | 1186/10000 [14:01<1:40:51, 1.46it/s, loss=0.0344, lr=2.47e-05, step=1185] Training: 12%|█▏ | 1186/10000 [14:01<1:40:51, 1.46it/s, loss=0.1377, lr=2.47e-05, step=1186] Training: 12%|█▏ | 1187/10000 [14:02<1:31:50, 1.60it/s, loss=0.1377, lr=2.47e-05, step=1186] Training: 12%|█▏ | 1187/10000 [14:02<1:31:50, 1.60it/s, loss=0.0565, lr=2.47e-05, step=1187] Training: 12%|█▏ | 1188/10000 [14:03<1:32:19, 1.59it/s, loss=0.0565, lr=2.47e-05, step=1187] Training: 12%|█▏ | 1188/10000 [14:03<1:32:19, 1.59it/s, loss=0.0984, lr=2.47e-05, step=1188] Training: 12%|█▏ | 1189/10000 [14:03<1:26:06, 1.71it/s, loss=0.0984, lr=2.47e-05, step=1188] Training: 12%|█▏ | 1189/10000 [14:03<1:26:06, 1.71it/s, loss=0.0911, lr=2.47e-05, step=1189]16:20:10.895 [I] step=1190 loss=0.0343 smoothed_loss=0.0595 lr=2.47e-05 grad_norm=0.7686 step_time=0.5398s data_time=0.1063s it/s=1.548 eta_to_10000=5691.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0210 grad_action_out_proj=0.2469 grad_shared_expert=0.7164 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1190/10000 [14:04<1:42:20, 1.43it/s, loss=0.0911, lr=2.47e-05, step=1189] Training: 12%|█▏ | 1190/10000 [14:04<1:42:20, 1.43it/s, loss=0.0343, lr=2.47e-05, step=1190] Training: 12%|█▏ | 1191/10000 [14:04<1:33:16, 1.57it/s, loss=0.0343, lr=2.47e-05, step=1190] Training: 12%|█▏ | 1191/10000 [14:04<1:33:16, 1.57it/s, loss=0.0491, lr=2.47e-05, step=1191] Training: 12%|█▏ | 1192/10000 [14:05<1:27:02, 1.69it/s, loss=0.0491, lr=2.47e-05, step=1191] Training: 12%|█▏ | 1192/10000 [14:05<1:27:02, 1.69it/s, loss=0.0204, lr=2.47e-05, step=1192] Training: 12%|█▏ | 1193/10000 [14:06<1:33:40, 1.57it/s, loss=0.0204, lr=2.47e-05, step=1192] Training: 12%|█▏ | 1193/10000 [14:06<1:33:40, 1.57it/s, loss=0.0340, lr=2.47e-05, step=1193] Training: 12%|█▏ | 1194/10000 [14:07<1:49:57, 1.33it/s, loss=0.0340, lr=2.47e-05, step=1193] Training: 12%|█▏ | 1194/10000 [14:07<1:49:57, 1.33it/s, loss=0.0223, lr=2.47e-05, step=1194] Training: 12%|█▏ | 1195/10000 [14:07<1:38:23, 1.49it/s, loss=0.0223, lr=2.47e-05, step=1194] Training: 12%|█▏ | 1195/10000 [14:07<1:38:23, 1.49it/s, loss=0.0494, lr=2.47e-05, step=1195] Training: 12%|█▏ | 1196/10000 [14:08<1:31:17, 1.61it/s, loss=0.0494, lr=2.47e-05, step=1195] Training: 12%|█▏ | 1196/10000 [14:08<1:31:17, 1.61it/s, loss=0.0429, lr=2.47e-05, step=1196] Training: 12%|█▏ | 1197/10000 [14:08<1:37:49, 1.50it/s, loss=0.0429, lr=2.47e-05, step=1196] Training: 12%|█▏ | 1197/10000 [14:08<1:37:49, 1.50it/s, loss=0.0223, lr=2.47e-05, step=1197] Training: 12%|█▏ | 1198/10000 [14:09<1:30:12, 1.63it/s, loss=0.0223, lr=2.47e-05, step=1197] Training: 12%|█▏ | 1198/10000 [14:09<1:30:12, 1.63it/s, loss=0.0464, lr=2.47e-05, step=1198] Training: 12%|█▏ | 1199/10000 [14:09<1:24:48, 1.73it/s, loss=0.0464, lr=2.47e-05, step=1198] Training: 12%|█▏ | 1199/10000 [14:09<1:24:48, 1.73it/s, loss=0.0137, lr=2.47e-05, step=1199]16:20:16.924 [I] step=1200 loss=0.0536 smoothed_loss=0.0440 lr=2.47e-05 grad_norm=0.6380 step_time=0.5123s data_time=0.0906s it/s=1.659 eta_to_10000=5304.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0156 grad_action_out_proj=0.2205 grad_shared_expert=0.6024 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1200/10000 [14:10<1:23:03, 1.77it/s, loss=0.0137, lr=2.47e-05, step=1199] Training: 12%|█▏ | 1200/10000 [14:10<1:23:03, 1.77it/s, loss=0.0536, lr=2.47e-05, step=1200] Training: 12%|█▏ | 1201/10000 [14:11<1:32:57, 1.58it/s, loss=0.0536, lr=2.47e-05, step=1200] Training: 12%|█▏ | 1201/10000 [14:11<1:32:57, 1.58it/s, loss=0.0731, lr=2.47e-05, step=1201] Training: 12%|█▏ | 1202/10000 [14:11<1:27:42, 1.67it/s, loss=0.0731, lr=2.47e-05, step=1201] Training: 12%|█▏ | 1202/10000 [14:11<1:27:42, 1.67it/s, loss=0.0479, lr=2.47e-05, step=1202] Training: 12%|█▏ | 1203/10000 [14:12<1:22:56, 1.77it/s, loss=0.0479, lr=2.47e-05, step=1202] Training: 12%|█▏ | 1203/10000 [14:12<1:22:56, 1.77it/s, loss=0.1513, lr=2.47e-05, step=1203] Training: 12%|█▏ | 1204/10000 [14:13<1:39:16, 1.48it/s, loss=0.1513, lr=2.47e-05, step=1203] Training: 12%|█▏ | 1204/10000 [14:13<1:39:16, 1.48it/s, loss=0.0363, lr=2.47e-05, step=1204] Training: 12%|█▏ | 1205/10000 [14:13<1:30:53, 1.61it/s, loss=0.0363, lr=2.47e-05, step=1204] Training: 12%|█▏ | 1205/10000 [14:13<1:30:53, 1.61it/s, loss=0.0308, lr=2.47e-05, step=1205] Training: 12%|█▏ | 1206/10000 [14:14<1:29:51, 1.63it/s, loss=0.0308, lr=2.47e-05, step=1205] Training: 12%|█▏ | 1206/10000 [14:14<1:29:51, 1.63it/s, loss=0.0822, lr=2.47e-05, step=1206] Training: 12%|█▏ | 1207/10000 [14:14<1:25:02, 1.72it/s, loss=0.0822, lr=2.47e-05, step=1206] Training: 12%|█▏ | 1207/10000 [14:14<1:25:02, 1.72it/s, loss=0.0236, lr=2.47e-05, step=1207] Training: 12%|█▏ | 1208/10000 [14:15<1:20:48, 1.81it/s, loss=0.0236, lr=2.47e-05, step=1207] Training: 12%|█▏ | 1208/10000 [14:15<1:20:48, 1.81it/s, loss=0.0878, lr=2.47e-05, step=1208] Training: 12%|█▏ | 1209/10000 [14:16<1:28:59, 1.65it/s, loss=0.0878, lr=2.47e-05, step=1208] Training: 12%|█▏ | 1209/10000 [14:16<1:28:59, 1.65it/s, loss=0.0459, lr=2.47e-05, step=1209]16:20:23.004 [I] step=1210 loss=0.0364 smoothed_loss=0.0532 lr=2.47e-05 grad_norm=0.7159 step_time=0.5353s data_time=0.0726s it/s=1.645 eta_to_10000=5343.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0149 grad_action_out_proj=0.2303 grad_shared_expert=0.5270 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1210/10000 [14:16<1:25:51, 1.71it/s, loss=0.0459, lr=2.47e-05, step=1209] Training: 12%|█▏ | 1210/10000 [14:16<1:25:51, 1.71it/s, loss=0.0364, lr=2.47e-05, step=1210] Training: 12%|█▏ | 1211/10000 [14:17<1:31:48, 1.60it/s, loss=0.0364, lr=2.47e-05, step=1210] Training: 12%|█▏ | 1211/10000 [14:17<1:31:48, 1.60it/s, loss=0.0268, lr=2.47e-05, step=1211] Training: 12%|█▏ | 1212/10000 [14:17<1:26:57, 1.68it/s, loss=0.0268, lr=2.47e-05, step=1211] Training: 12%|█▏ | 1212/10000 [14:17<1:26:57, 1.68it/s, loss=0.0440, lr=2.47e-05, step=1212] Training: 12%|█▏ | 1213/10000 [14:18<1:22:47, 1.77it/s, loss=0.0440, lr=2.47e-05, step=1212] Training: 12%|█▏ | 1213/10000 [14:18<1:22:47, 1.77it/s, loss=0.0228, lr=2.47e-05, step=1213] Training: 12%|█▏ | 1214/10000 [14:18<1:19:52, 1.83it/s, loss=0.0228, lr=2.47e-05, step=1213] Training: 12%|█▏ | 1214/10000 [14:18<1:19:52, 1.83it/s, loss=0.0409, lr=2.47e-05, step=1214] Training: 12%|█▏ | 1215/10000 [14:19<1:22:55, 1.77it/s, loss=0.0409, lr=2.47e-05, step=1214] Training: 12%|█▏ | 1215/10000 [14:19<1:22:55, 1.77it/s, loss=0.0178, lr=2.47e-05, step=1215] Training: 12%|█▏ | 1216/10000 [14:20<1:31:32, 1.60it/s, loss=0.0178, lr=2.47e-05, step=1215] Training: 12%|█▏ | 1216/10000 [14:20<1:31:32, 1.60it/s, loss=0.0335, lr=2.47e-05, step=1216] Training: 12%|█▏ | 1217/10000 [14:20<1:27:21, 1.68it/s, loss=0.0335, lr=2.47e-05, step=1216] Training: 12%|█▏ | 1217/10000 [14:20<1:27:21, 1.68it/s, loss=0.0524, lr=2.47e-05, step=1217] Training: 12%|█▏ | 1218/10000 [14:21<1:46:45, 1.37it/s, loss=0.0524, lr=2.47e-05, step=1217] Training: 12%|█▏ | 1218/10000 [14:21<1:46:45, 1.37it/s, loss=0.0381, lr=2.47e-05, step=1218] Training: 12%|█▏ | 1219/10000 [14:22<1:36:19, 1.52it/s, loss=0.0381, lr=2.47e-05, step=1218] Training: 12%|█▏ | 1219/10000 [14:22<1:36:19, 1.52it/s, loss=0.0280, lr=2.47e-05, step=1219]16:20:29.204 [I] step=1220 loss=0.0361 smoothed_loss=0.0410 lr=2.47e-05 grad_norm=0.8058 step_time=0.5336s data_time=0.0864s it/s=1.613 eta_to_10000=5443.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0186 grad_action_out_proj=0.2065 grad_shared_expert=0.9981 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1220/10000 [14:22<1:30:27, 1.62it/s, loss=0.0280, lr=2.47e-05, step=1219] Training: 12%|█▏ | 1220/10000 [14:22<1:30:27, 1.62it/s, loss=0.0361, lr=2.47e-05, step=1220] Training: 12%|█▏ | 1221/10000 [14:23<1:24:53, 1.72it/s, loss=0.0361, lr=2.47e-05, step=1220] Training: 12%|█▏ | 1221/10000 [14:23<1:24:53, 1.72it/s, loss=0.0447, lr=2.47e-05, step=1221] Training: 12%|█▏ | 1222/10000 [14:23<1:27:09, 1.68it/s, loss=0.0447, lr=2.47e-05, step=1221] Training: 12%|█▏ | 1222/10000 [14:23<1:27:09, 1.68it/s, loss=0.0102, lr=2.47e-05, step=1222] Training: 12%|█▏ | 1223/10000 [14:24<1:33:40, 1.56it/s, loss=0.0102, lr=2.47e-05, step=1222] Training: 12%|█▏ | 1223/10000 [14:24<1:33:40, 1.56it/s, loss=0.0400, lr=2.47e-05, step=1223] Training: 12%|█▏ | 1224/10000 [14:25<1:29:05, 1.64it/s, loss=0.0400, lr=2.47e-05, step=1223] Training: 12%|█▏ | 1224/10000 [14:25<1:29:05, 1.64it/s, loss=0.0296, lr=2.47e-05, step=1224] Training: 12%|█▏ | 1225/10000 [14:25<1:33:07, 1.57it/s, loss=0.0296, lr=2.47e-05, step=1224] Training: 12%|█▏ | 1225/10000 [14:25<1:33:07, 1.57it/s, loss=0.0947, lr=2.47e-05, step=1225] Training: 12%|█▏ | 1226/10000 [14:26<1:37:37, 1.50it/s, loss=0.0947, lr=2.47e-05, step=1225] Training: 12%|█▏ | 1226/10000 [14:26<1:37:37, 1.50it/s, loss=0.0464, lr=2.47e-05, step=1226] Training: 12%|█▏ | 1227/10000 [14:27<1:33:52, 1.56it/s, loss=0.0464, lr=2.47e-05, step=1226] Training: 12%|█▏ | 1227/10000 [14:27<1:33:52, 1.56it/s, loss=0.0245, lr=2.47e-05, step=1227] Training: 12%|█▏ | 1228/10000 [14:27<1:28:23, 1.65it/s, loss=0.0245, lr=2.47e-05, step=1227] Training: 12%|█▏ | 1228/10000 [14:27<1:28:23, 1.65it/s, loss=0.0222, lr=2.47e-05, step=1228] Training: 12%|█▏ | 1229/10000 [14:28<1:26:54, 1.68it/s, loss=0.0222, lr=2.47e-05, step=1228] Training: 12%|█▏ | 1229/10000 [14:28<1:26:54, 1.68it/s, loss=0.0598, lr=2.47e-05, step=1229]16:20:35.258 [I] step=1230 loss=0.0343 smoothed_loss=0.0410 lr=2.47e-05 grad_norm=0.6798 step_time=0.5194s data_time=0.0859s it/s=1.652 eta_to_10000=5308.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.2022 grad_shared_expert=0.5185 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1230/10000 [14:28<1:24:25, 1.73it/s, loss=0.0598, lr=2.47e-05, step=1229] Training: 12%|█▏ | 1230/10000 [14:28<1:24:25, 1.73it/s, loss=0.0343, lr=2.47e-05, step=1230] Training: 12%|█▏ | 1231/10000 [14:29<1:42:14, 1.43it/s, loss=0.0343, lr=2.47e-05, step=1230] Training: 12%|█▏ | 1231/10000 [14:29<1:42:14, 1.43it/s, loss=0.0354, lr=2.47e-05, step=1231] Training: 12%|█▏ | 1232/10000 [14:30<1:33:06, 1.57it/s, loss=0.0354, lr=2.47e-05, step=1231] Training: 12%|█▏ | 1232/10000 [14:30<1:33:06, 1.57it/s, loss=0.0466, lr=2.47e-05, step=1232] Training: 12%|█▏ | 1233/10000 [14:31<1:41:26, 1.44it/s, loss=0.0466, lr=2.47e-05, step=1232] Training: 12%|█▏ | 1233/10000 [14:31<1:41:26, 1.44it/s, loss=0.0761, lr=2.47e-05, step=1233] Training: 12%|█▏ | 1234/10000 [14:31<1:33:25, 1.56it/s, loss=0.0761, lr=2.47e-05, step=1233] Training: 12%|█▏ | 1234/10000 [14:31<1:33:25, 1.56it/s, loss=0.0086, lr=2.47e-05, step=1234] Training: 12%|█▏ | 1235/10000 [14:32<1:37:53, 1.49it/s, loss=0.0086, lr=2.47e-05, step=1234] Training: 12%|█▏ | 1235/10000 [14:32<1:37:53, 1.49it/s, loss=0.0157, lr=2.47e-05, step=1235] Training: 12%|█▏ | 1236/10000 [14:32<1:30:38, 1.61it/s, loss=0.0157, lr=2.47e-05, step=1235] Training: 12%|█▏ | 1236/10000 [14:32<1:30:38, 1.61it/s, loss=0.0155, lr=2.47e-05, step=1236] Training: 12%|█▏ | 1237/10000 [14:33<1:26:07, 1.70it/s, loss=0.0155, lr=2.47e-05, step=1236] Training: 12%|█▏ | 1237/10000 [14:33<1:26:07, 1.70it/s, loss=0.0719, lr=2.47e-05, step=1237] Training: 12%|█▏ | 1238/10000 [14:34<1:37:03, 1.50it/s, loss=0.0719, lr=2.47e-05, step=1237] Training: 12%|█▏ | 1238/10000 [14:34<1:37:03, 1.50it/s, loss=0.0275, lr=2.47e-05, step=1238] Training: 12%|█▏ | 1239/10000 [14:34<1:35:23, 1.53it/s, loss=0.0275, lr=2.47e-05, step=1238] Training: 12%|█▏ | 1239/10000 [14:34<1:35:23, 1.53it/s, loss=0.0522, lr=2.47e-05, step=1239]16:20:42.055 [I] step=1240 loss=0.0381 smoothed_loss=0.0397 lr=2.47e-05 grad_norm=0.7036 step_time=0.5932s data_time=0.0865s it/s=1.471 eta_to_10000=5953.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0129 grad_action_out_proj=0.1882 grad_shared_expert=0.6250 (10775:train_pytorch.py:850) + Training: 12%|█▏ | 1240/10000 [14:35<1:39:40, 1.46it/s, loss=0.0522, lr=2.47e-05, step=1239] Training: 12%|█▏ | 1240/10000 [14:35<1:39:40, 1.46it/s, loss=0.0381, lr=2.47e-05, step=1240] Training: 12%|█▏ | 1241/10000 [14:36<1:32:26, 1.58it/s, loss=0.0381, lr=2.47e-05, step=1240] Training: 12%|█▏ | 1241/10000 [14:36<1:32:26, 1.58it/s, loss=0.0579, lr=2.47e-05, step=1241] Training: 12%|█▏ | 1242/10000 [14:36<1:33:22, 1.56it/s, loss=0.0579, lr=2.47e-05, step=1241] Training: 12%|█▏ | 1242/10000 [14:36<1:33:22, 1.56it/s, loss=0.0194, lr=2.47e-05, step=1242] Training: 12%|█▏ | 1243/10000 [14:37<1:27:43, 1.66it/s, loss=0.0194, lr=2.47e-05, step=1242] Training: 12%|█▏ | 1243/10000 [14:37<1:27:43, 1.66it/s, loss=0.0214, lr=2.47e-05, step=1243] Training: 12%|█▏ | 1244/10000 [14:37<1:22:55, 1.76it/s, loss=0.0214, lr=2.47e-05, step=1243] Training: 12%|█▏ | 1244/10000 [14:37<1:22:55, 1.76it/s, loss=0.0581, lr=2.47e-05, step=1244] Training: 12%|█▏ | 1245/10000 [14:38<1:29:09, 1.64it/s, loss=0.0581, lr=2.47e-05, step=1244] Training: 12%|█▏ | 1245/10000 [14:38<1:29:09, 1.64it/s, loss=0.0559, lr=2.47e-05, step=1245] Training: 12%|█▏ | 1246/10000 [14:38<1:23:34, 1.75it/s, loss=0.0559, lr=2.47e-05, step=1245] Training: 12%|█▏ | 1246/10000 [14:38<1:23:34, 1.75it/s, loss=0.0404, lr=2.47e-05, step=1246] Training: 12%|█▏ | 1247/10000 [14:39<1:29:25, 1.63it/s, loss=0.0404, lr=2.47e-05, step=1246] Training: 12%|█▏ | 1247/10000 [14:39<1:29:25, 1.63it/s, loss=0.0804, lr=2.47e-05, step=1247] Training: 12%|█▏ | 1248/10000 [14:40<1:25:03, 1.72it/s, loss=0.0804, lr=2.47e-05, step=1247] Training: 12%|█▏ | 1248/10000 [14:40<1:25:03, 1.72it/s, loss=0.0290, lr=2.47e-05, step=1248] Training: 12%|█▏ | 1249/10000 [14:40<1:21:22, 1.79it/s, loss=0.0290, lr=2.47e-05, step=1248] Training: 12%|█▏ | 1249/10000 [14:40<1:21:22, 1.79it/s, loss=0.0230, lr=2.47e-05, step=1249]16:20:47.841 [I] step=1250 loss=0.0173 smoothed_loss=0.0390 lr=2.47e-05 grad_norm=0.6647 step_time=0.5026s data_time=0.0761s it/s=1.728 eta_to_10000=5062.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1947 grad_shared_expert=0.6465 (10775:train_pytorch.py:850) + Training: 12%|█▎ | 1250/10000 [14:41<1:27:28, 1.67it/s, loss=0.0230, lr=2.47e-05, step=1249] Training: 12%|█▎ | 1250/10000 [14:41<1:27:28, 1.67it/s, loss=0.0173, lr=2.47e-05, step=1250] Training: 13%|█▎ | 1251/10000 [14:42<1:27:42, 1.66it/s, loss=0.0173, lr=2.47e-05, step=1250] Training: 13%|█▎ | 1251/10000 [14:42<1:27:42, 1.66it/s, loss=0.0337, lr=2.47e-05, step=1251] Training: 13%|█▎ | 1252/10000 [14:42<1:35:46, 1.52it/s, loss=0.0337, lr=2.47e-05, step=1251] Training: 13%|█▎ | 1252/10000 [14:42<1:35:46, 1.52it/s, loss=0.0549, lr=2.47e-05, step=1252] Training: 13%|█▎ | 1253/10000 [14:43<1:38:39, 1.48it/s, loss=0.0549, lr=2.47e-05, step=1252] Training: 13%|█▎ | 1253/10000 [14:43<1:38:39, 1.48it/s, loss=0.0370, lr=2.47e-05, step=1253] Training: 13%|█▎ | 1254/10000 [14:44<1:51:28, 1.31it/s, loss=0.0370, lr=2.47e-05, step=1253] Training: 13%|█▎ | 1254/10000 [14:44<1:51:28, 1.31it/s, loss=0.0640, lr=2.47e-05, step=1254] Training: 13%|█▎ | 1255/10000 [14:45<1:40:35, 1.45it/s, loss=0.0640, lr=2.47e-05, step=1254] Training: 13%|█▎ | 1255/10000 [14:45<1:40:35, 1.45it/s, loss=0.0182, lr=2.47e-05, step=1255] Training: 13%|█▎ | 1256/10000 [14:45<1:37:36, 1.49it/s, loss=0.0182, lr=2.47e-05, step=1255] Training: 13%|█▎ | 1256/10000 [14:45<1:37:36, 1.49it/s, loss=0.0888, lr=2.47e-05, step=1256] Training: 13%|█▎ | 1257/10000 [14:46<1:41:37, 1.43it/s, loss=0.0888, lr=2.47e-05, step=1256] Training: 13%|█▎ | 1257/10000 [14:46<1:41:37, 1.43it/s, loss=0.0344, lr=2.47e-05, step=1257] Training: 13%|█▎ | 1258/10000 [14:47<1:47:00, 1.36it/s, loss=0.0344, lr=2.47e-05, step=1257] Training: 13%|█▎ | 1258/10000 [14:47<1:47:00, 1.36it/s, loss=0.0198, lr=2.46e-05, step=1258] Training: 13%|█▎ | 1259/10000 [14:47<1:36:08, 1.52it/s, loss=0.0198, lr=2.46e-05, step=1258] Training: 13%|█▎ | 1259/10000 [14:47<1:36:08, 1.52it/s, loss=0.0469, lr=2.46e-05, step=1259]16:20:54.652 [I] step=1260 loss=0.0573 smoothed_loss=0.0434 lr=2.47e-05 grad_norm=0.8029 step_time=0.5770s data_time=0.1040s it/s=1.468 eta_to_10000=5951.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.2011 grad_shared_expert=0.8072 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1260/10000 [14:48<1:29:57, 1.62it/s, loss=0.0469, lr=2.46e-05, step=1259] Training: 13%|█▎ | 1260/10000 [14:48<1:29:57, 1.62it/s, loss=0.0573, lr=2.46e-05, step=1260] Training: 13%|█▎ | 1261/10000 [14:49<1:47:00, 1.36it/s, loss=0.0573, lr=2.46e-05, step=1260] Training: 13%|█▎ | 1261/10000 [14:49<1:47:00, 1.36it/s, loss=0.0126, lr=2.46e-05, step=1261] Training: 13%|█▎ | 1262/10000 [14:49<1:40:08, 1.45it/s, loss=0.0126, lr=2.46e-05, step=1261] Training: 13%|█▎ | 1262/10000 [14:49<1:40:08, 1.45it/s, loss=0.0245, lr=2.46e-05, step=1262] Training: 13%|█▎ | 1263/10000 [14:50<1:31:26, 1.59it/s, loss=0.0245, lr=2.46e-05, step=1262] Training: 13%|█▎ | 1263/10000 [14:50<1:31:26, 1.59it/s, loss=0.0416, lr=2.46e-05, step=1263] Training: 13%|█▎ | 1264/10000 [14:50<1:26:16, 1.69it/s, loss=0.0416, lr=2.46e-05, step=1263] Training: 13%|█▎ | 1264/10000 [14:50<1:26:16, 1.69it/s, loss=0.0706, lr=2.46e-05, step=1264] Training: 13%|█▎ | 1265/10000 [14:51<1:35:00, 1.53it/s, loss=0.0706, lr=2.46e-05, step=1264] Training: 13%|█▎ | 1265/10000 [14:51<1:35:00, 1.53it/s, loss=0.0357, lr=2.46e-05, step=1265] Training: 13%|█▎ | 1266/10000 [14:52<1:39:20, 1.47it/s, loss=0.0357, lr=2.46e-05, step=1265] Training: 13%|█▎ | 1266/10000 [14:52<1:39:20, 1.47it/s, loss=0.0758, lr=2.46e-05, step=1266] Training: 13%|█▎ | 1267/10000 [14:52<1:31:14, 1.60it/s, loss=0.0758, lr=2.46e-05, step=1266] Training: 13%|█▎ | 1267/10000 [14:52<1:31:14, 1.60it/s, loss=0.0356, lr=2.46e-05, step=1267] Training: 13%|█▎ | 1268/10000 [14:53<1:36:11, 1.51it/s, loss=0.0356, lr=2.46e-05, step=1267] Training: 13%|█▎ | 1268/10000 [14:53<1:36:11, 1.51it/s, loss=0.0203, lr=2.46e-05, step=1268] Training: 13%|█▎ | 1269/10000 [14:54<1:28:38, 1.64it/s, loss=0.0203, lr=2.46e-05, step=1268] Training: 13%|█▎ | 1269/10000 [14:54<1:28:38, 1.64it/s, loss=0.0528, lr=2.46e-05, step=1269]16:21:01.094 [I] step=1270 loss=0.0207 smoothed_loss=0.0406 lr=2.46e-05 grad_norm=0.7687 step_time=0.5552s data_time=0.0892s it/s=1.552 eta_to_10000=5623.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.2229 grad_shared_expert=0.6953 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1270/10000 [14:54<1:27:41, 1.66it/s, loss=0.0528, lr=2.46e-05, step=1269] Training: 13%|█▎ | 1270/10000 [14:54<1:27:41, 1.66it/s, loss=0.0207, lr=2.46e-05, step=1270] Training: 13%|█▎ | 1271/10000 [14:55<1:26:12, 1.69it/s, loss=0.0207, lr=2.46e-05, step=1270] Training: 13%|█▎ | 1271/10000 [14:55<1:26:12, 1.69it/s, loss=0.0203, lr=2.46e-05, step=1271] Training: 13%|█▎ | 1272/10000 [14:55<1:26:10, 1.69it/s, loss=0.0203, lr=2.46e-05, step=1271] Training: 13%|█▎ | 1272/10000 [14:55<1:26:10, 1.69it/s, loss=0.0296, lr=2.46e-05, step=1272] Training: 13%|█▎ | 1273/10000 [14:56<1:34:17, 1.54it/s, loss=0.0296, lr=2.46e-05, step=1272] Training: 13%|█▎ | 1273/10000 [14:56<1:34:17, 1.54it/s, loss=0.0312, lr=2.46e-05, step=1273] Training: 13%|█▎ | 1274/10000 [14:57<1:32:17, 1.58it/s, loss=0.0312, lr=2.46e-05, step=1273] Training: 13%|█▎ | 1274/10000 [14:57<1:32:17, 1.58it/s, loss=0.0436, lr=2.46e-05, step=1274] Training: 13%|█▎ | 1275/10000 [14:57<1:27:32, 1.66it/s, loss=0.0436, lr=2.46e-05, step=1274] Training: 13%|█▎ | 1275/10000 [14:57<1:27:32, 1.66it/s, loss=0.0585, lr=2.46e-05, step=1275] Training: 13%|█▎ | 1276/10000 [14:58<1:32:20, 1.57it/s, loss=0.0585, lr=2.46e-05, step=1275] Training: 13%|█▎ | 1276/10000 [14:58<1:32:20, 1.57it/s, loss=0.0326, lr=2.46e-05, step=1276] Training: 13%|█▎ | 1277/10000 [14:58<1:25:57, 1.69it/s, loss=0.0326, lr=2.46e-05, step=1276] Training: 13%|█▎ | 1277/10000 [14:58<1:25:57, 1.69it/s, loss=0.0166, lr=2.46e-05, step=1277] Training: 13%|█▎ | 1278/10000 [14:59<1:28:19, 1.65it/s, loss=0.0166, lr=2.46e-05, step=1277] Training: 13%|█▎ | 1278/10000 [14:59<1:28:19, 1.65it/s, loss=0.0135, lr=2.46e-05, step=1278] Training: 13%|█▎ | 1279/10000 [15:00<1:23:34, 1.74it/s, loss=0.0135, lr=2.46e-05, step=1278] Training: 13%|█▎ | 1279/10000 [15:00<1:23:34, 1.74it/s, loss=0.0893, lr=2.46e-05, step=1279]16:21:07.304 [I] step=1280 loss=0.0219 smoothed_loss=0.0381 lr=2.46e-05 grad_norm=0.7128 step_time=0.5305s data_time=0.0905s it/s=1.611 eta_to_10000=5414.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0204 grad_action_out_proj=0.2451 grad_shared_expert=0.8229 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1280/10000 [15:00<1:33:10, 1.56it/s, loss=0.0893, lr=2.46e-05, step=1279] Training: 13%|█▎ | 1280/10000 [15:00<1:33:10, 1.56it/s, loss=0.0219, lr=2.46e-05, step=1280] Training: 13%|█▎ | 1281/10000 [15:01<1:27:50, 1.65it/s, loss=0.0219, lr=2.46e-05, step=1280] Training: 13%|█▎ | 1281/10000 [15:01<1:27:50, 1.65it/s, loss=0.0223, lr=2.46e-05, step=1281] Training: 13%|█▎ | 1282/10000 [15:01<1:22:55, 1.75it/s, loss=0.0223, lr=2.46e-05, step=1281] Training: 13%|█▎ | 1282/10000 [15:01<1:22:55, 1.75it/s, loss=0.0283, lr=2.46e-05, step=1282] Training: 13%|█▎ | 1283/10000 [15:02<1:38:33, 1.47it/s, loss=0.0283, lr=2.46e-05, step=1282] Training: 13%|█▎ | 1283/10000 [15:02<1:38:33, 1.47it/s, loss=0.1409, lr=2.46e-05, step=1283] Training: 13%|█▎ | 1284/10000 [15:03<1:30:11, 1.61it/s, loss=0.1409, lr=2.46e-05, step=1283] Training: 13%|█▎ | 1284/10000 [15:03<1:30:11, 1.61it/s, loss=0.0112, lr=2.46e-05, step=1284] Training: 13%|█▎ | 1285/10000 [15:03<1:23:48, 1.73it/s, loss=0.0112, lr=2.46e-05, step=1284] Training: 13%|█▎ | 1285/10000 [15:03<1:23:48, 1.73it/s, loss=0.0815, lr=2.46e-05, step=1285] Training: 13%|█▎ | 1286/10000 [15:04<1:31:16, 1.59it/s, loss=0.0815, lr=2.46e-05, step=1285] Training: 13%|█▎ | 1286/10000 [15:04<1:31:16, 1.59it/s, loss=0.0121, lr=2.46e-05, step=1286] Training: 13%|█▎ | 1287/10000 [15:04<1:24:38, 1.72it/s, loss=0.0121, lr=2.46e-05, step=1286] Training: 13%|█▎ | 1287/10000 [15:04<1:24:38, 1.72it/s, loss=0.0854, lr=2.46e-05, step=1287] Training: 13%|█▎ | 1288/10000 [15:05<1:32:04, 1.58it/s, loss=0.0854, lr=2.46e-05, step=1287] Training: 13%|█▎ | 1288/10000 [15:05<1:32:04, 1.58it/s, loss=0.0451, lr=2.46e-05, step=1288] Training: 13%|█▎ | 1289/10000 [15:06<1:27:04, 1.67it/s, loss=0.0451, lr=2.46e-05, step=1288] Training: 13%|█▎ | 1289/10000 [15:06<1:27:04, 1.67it/s, loss=0.0370, lr=2.46e-05, step=1289]16:21:13.478 [I] step=1290 loss=0.0385 smoothed_loss=0.0454 lr=2.46e-05 grad_norm=0.7291 step_time=0.5353s data_time=0.0820s it/s=1.620 eta_to_10000=5376.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0110 grad_action_out_proj=0.1752 grad_shared_expert=0.9046 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1290/10000 [15:07<1:34:36, 1.53it/s, loss=0.0370, lr=2.46e-05, step=1289] Training: 13%|█▎ | 1290/10000 [15:07<1:34:36, 1.53it/s, loss=0.0385, lr=2.46e-05, step=1290] Training: 13%|█▎ | 1291/10000 [15:07<1:27:30, 1.66it/s, loss=0.0385, lr=2.46e-05, step=1290] Training: 13%|█▎ | 1291/10000 [15:07<1:27:30, 1.66it/s, loss=0.0431, lr=2.46e-05, step=1291] Training: 13%|█▎ | 1292/10000 [15:08<1:33:22, 1.55it/s, loss=0.0431, lr=2.46e-05, step=1291] Training: 13%|█▎ | 1292/10000 [15:08<1:33:22, 1.55it/s, loss=0.0367, lr=2.46e-05, step=1292] Training: 13%|█▎ | 1293/10000 [15:08<1:26:36, 1.68it/s, loss=0.0367, lr=2.46e-05, step=1292] Training: 13%|█▎ | 1293/10000 [15:08<1:26:36, 1.68it/s, loss=0.0684, lr=2.46e-05, step=1293] Training: 13%|█▎ | 1294/10000 [15:09<1:21:57, 1.77it/s, loss=0.0684, lr=2.46e-05, step=1293] Training: 13%|█▎ | 1294/10000 [15:09<1:21:57, 1.77it/s, loss=0.1019, lr=2.46e-05, step=1294] Training: 13%|█▎ | 1295/10000 [15:09<1:18:44, 1.84it/s, loss=0.1019, lr=2.46e-05, step=1294] Training: 13%|█▎ | 1295/10000 [15:09<1:18:44, 1.84it/s, loss=0.0271, lr=2.46e-05, step=1295] Training: 13%|█▎ | 1296/10000 [15:10<1:28:28, 1.64it/s, loss=0.0271, lr=2.46e-05, step=1295] Training: 13%|█▎ | 1296/10000 [15:10<1:28:28, 1.64it/s, loss=0.0300, lr=2.46e-05, step=1296] Training: 13%|█▎ | 1297/10000 [15:10<1:23:16, 1.74it/s, loss=0.0300, lr=2.46e-05, step=1296] Training: 13%|█▎ | 1297/10000 [15:10<1:23:16, 1.74it/s, loss=0.0366, lr=2.46e-05, step=1297] Training: 13%|█▎ | 1298/10000 [15:11<1:30:09, 1.61it/s, loss=0.0366, lr=2.46e-05, step=1297] Training: 13%|█▎ | 1298/10000 [15:11<1:30:09, 1.61it/s, loss=0.0426, lr=2.46e-05, step=1298] Training: 13%|█▎ | 1299/10000 [15:12<1:32:56, 1.56it/s, loss=0.0426, lr=2.46e-05, step=1298] Training: 13%|█▎ | 1299/10000 [15:12<1:32:56, 1.56it/s, loss=0.0316, lr=2.46e-05, step=1299]16:21:19.381 [I] step=1300 loss=0.0320 smoothed_loss=0.0435 lr=2.46e-05 grad_norm=0.7427 step_time=0.5123s data_time=0.0780s it/s=1.694 eta_to_10000=5135.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0149 grad_action_out_proj=0.1617 grad_shared_expert=0.6222 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1300/10000 [15:12<1:28:10, 1.64it/s, loss=0.0316, lr=2.46e-05, step=1299] Training: 13%|█▎ | 1300/10000 [15:12<1:28:10, 1.64it/s, loss=0.0320, lr=2.46e-05, step=1300] Training: 13%|█▎ | 1301/10000 [15:13<1:24:17, 1.72it/s, loss=0.0320, lr=2.46e-05, step=1300] Training: 13%|█▎ | 1301/10000 [15:13<1:24:17, 1.72it/s, loss=0.0314, lr=2.46e-05, step=1301] Training: 13%|█▎ | 1302/10000 [15:13<1:20:09, 1.81it/s, loss=0.0314, lr=2.46e-05, step=1301] Training: 13%|█▎ | 1302/10000 [15:13<1:20:09, 1.81it/s, loss=0.0480, lr=2.46e-05, step=1302] Training: 13%|█▎ | 1303/10000 [15:14<1:29:02, 1.63it/s, loss=0.0480, lr=2.46e-05, step=1302] Training: 13%|█▎ | 1303/10000 [15:14<1:29:02, 1.63it/s, loss=0.0203, lr=2.46e-05, step=1303] Training: 13%|█▎ | 1304/10000 [15:15<1:23:17, 1.74it/s, loss=0.0203, lr=2.46e-05, step=1303] Training: 13%|█▎ | 1304/10000 [15:15<1:23:17, 1.74it/s, loss=0.0337, lr=2.46e-05, step=1304] Training: 13%|█▎ | 1305/10000 [15:16<1:39:30, 1.46it/s, loss=0.0337, lr=2.46e-05, step=1304] Training: 13%|█▎ | 1305/10000 [15:16<1:39:30, 1.46it/s, loss=0.0526, lr=2.46e-05, step=1305] Training: 13%|█▎ | 1306/10000 [15:16<1:40:15, 1.45it/s, loss=0.0526, lr=2.46e-05, step=1305] Training: 13%|█▎ | 1306/10000 [15:16<1:40:15, 1.45it/s, loss=0.0255, lr=2.46e-05, step=1306] Training: 13%|█▎ | 1307/10000 [15:17<1:31:07, 1.59it/s, loss=0.0255, lr=2.46e-05, step=1306] Training: 13%|█▎ | 1307/10000 [15:17<1:31:07, 1.59it/s, loss=0.0202, lr=2.46e-05, step=1307] Training: 13%|█▎ | 1308/10000 [15:17<1:25:01, 1.70it/s, loss=0.0202, lr=2.46e-05, step=1307] Training: 13%|█▎ | 1308/10000 [15:17<1:25:01, 1.70it/s, loss=0.0182, lr=2.46e-05, step=1308] Training: 13%|█▎ | 1309/10000 [15:18<1:20:37, 1.80it/s, loss=0.0182, lr=2.46e-05, step=1308] Training: 13%|█▎ | 1309/10000 [15:18<1:20:37, 1.80it/s, loss=0.0366, lr=2.46e-05, step=1309]16:21:25.545 [I] step=1310 loss=0.0187 smoothed_loss=0.0341 lr=2.46e-05 grad_norm=0.7027 step_time=0.5380s data_time=0.0784s it/s=1.623 eta_to_10000=5355.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0166 grad_action_out_proj=0.2039 grad_shared_expert=0.5801 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1310/10000 [15:19<1:31:37, 1.58it/s, loss=0.0366, lr=2.46e-05, step=1309] Training: 13%|█▎ | 1310/10000 [15:19<1:31:37, 1.58it/s, loss=0.0187, lr=2.46e-05, step=1310] Training: 13%|█▎ | 1311/10000 [15:19<1:36:08, 1.51it/s, loss=0.0187, lr=2.46e-05, step=1310] Training: 13%|█▎ | 1311/10000 [15:19<1:36:08, 1.51it/s, loss=0.0762, lr=2.46e-05, step=1311] Training: 13%|█▎ | 1312/10000 [15:20<1:29:19, 1.62it/s, loss=0.0762, lr=2.46e-05, step=1311] Training: 13%|█▎ | 1312/10000 [15:20<1:29:19, 1.62it/s, loss=0.0622, lr=2.46e-05, step=1312] Training: 13%|█▎ | 1313/10000 [15:21<1:34:08, 1.54it/s, loss=0.0622, lr=2.46e-05, step=1312] Training: 13%|█▎ | 1313/10000 [15:21<1:34:08, 1.54it/s, loss=0.0577, lr=2.46e-05, step=1313] Training: 13%|█▎ | 1314/10000 [15:21<1:27:58, 1.65it/s, loss=0.0577, lr=2.46e-05, step=1313] Training: 13%|█▎ | 1314/10000 [15:21<1:27:58, 1.65it/s, loss=0.0485, lr=2.46e-05, step=1314] Training: 13%|█▎ | 1315/10000 [15:22<1:23:36, 1.73it/s, loss=0.0485, lr=2.46e-05, step=1314] Training: 13%|█▎ | 1315/10000 [15:22<1:23:36, 1.73it/s, loss=0.0609, lr=2.46e-05, step=1315] Training: 13%|█▎ | 1316/10000 [15:22<1:20:17, 1.80it/s, loss=0.0609, lr=2.46e-05, step=1315] Training: 13%|█▎ | 1316/10000 [15:22<1:20:17, 1.80it/s, loss=0.0202, lr=2.46e-05, step=1316] Training: 13%|█▎ | 1317/10000 [15:23<1:34:01, 1.54it/s, loss=0.0202, lr=2.46e-05, step=1316] Training: 13%|█▎ | 1317/10000 [15:23<1:34:01, 1.54it/s, loss=0.0459, lr=2.46e-05, step=1317] Training: 13%|█▎ | 1318/10000 [15:23<1:27:37, 1.65it/s, loss=0.0459, lr=2.46e-05, step=1317] Training: 13%|█▎ | 1318/10000 [15:23<1:27:37, 1.65it/s, loss=0.0168, lr=2.46e-05, step=1318] Training: 13%|█▎ | 1319/10000 [15:24<1:32:12, 1.57it/s, loss=0.0168, lr=2.46e-05, step=1318] Training: 13%|█▎ | 1319/10000 [15:24<1:32:12, 1.57it/s, loss=0.0374, lr=2.46e-05, step=1319]16:21:31.650 [I] step=1320 loss=0.1497 smoothed_loss=0.0508 lr=2.46e-05 grad_norm=0.7574 step_time=0.5319s data_time=0.0786s it/s=1.638 eta_to_10000=5297.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0265 grad_action_out_proj=0.2295 grad_shared_expert=0.6644 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1320/10000 [15:25<1:27:31, 1.65it/s, loss=0.0374, lr=2.46e-05, step=1319] Training: 13%|█▎ | 1320/10000 [15:25<1:27:31, 1.65it/s, loss=0.1497, lr=2.46e-05, step=1320] Training: 13%|█▎ | 1321/10000 [15:25<1:24:32, 1.71it/s, loss=0.1497, lr=2.46e-05, step=1320] Training: 13%|█▎ | 1321/10000 [15:25<1:24:32, 1.71it/s, loss=0.0864, lr=2.46e-05, step=1321] Training: 13%|█▎ | 1322/10000 [15:26<1:29:51, 1.61it/s, loss=0.0864, lr=2.46e-05, step=1321] Training: 13%|█▎ | 1322/10000 [15:26<1:29:51, 1.61it/s, loss=0.0696, lr=2.46e-05, step=1322] Training: 13%|█▎ | 1323/10000 [15:27<1:27:15, 1.66it/s, loss=0.0696, lr=2.46e-05, step=1322] Training: 13%|█▎ | 1323/10000 [15:27<1:27:15, 1.66it/s, loss=0.0208, lr=2.46e-05, step=1323] Training: 13%|█▎ | 1324/10000 [15:27<1:35:38, 1.51it/s, loss=0.0208, lr=2.46e-05, step=1323] Training: 13%|█▎ | 1324/10000 [15:27<1:35:38, 1.51it/s, loss=0.1043, lr=2.46e-05, step=1324] Training: 13%|█▎ | 1325/10000 [15:28<1:28:34, 1.63it/s, loss=0.1043, lr=2.46e-05, step=1324] Training: 13%|█▎ | 1325/10000 [15:28<1:28:34, 1.63it/s, loss=0.0398, lr=2.46e-05, step=1325] Training: 13%|█▎ | 1326/10000 [15:29<1:32:02, 1.57it/s, loss=0.0398, lr=2.46e-05, step=1325] Training: 13%|█▎ | 1326/10000 [15:29<1:32:02, 1.57it/s, loss=0.0349, lr=2.46e-05, step=1326] Training: 13%|█▎ | 1327/10000 [15:29<1:28:25, 1.63it/s, loss=0.0349, lr=2.46e-05, step=1326] Training: 13%|█▎ | 1327/10000 [15:29<1:28:25, 1.63it/s, loss=0.0626, lr=2.46e-05, step=1327] Training: 13%|█▎ | 1328/10000 [15:30<1:25:54, 1.68it/s, loss=0.0626, lr=2.46e-05, step=1327] Training: 13%|█▎ | 1328/10000 [15:30<1:25:54, 1.68it/s, loss=0.0769, lr=2.46e-05, step=1328] Training: 13%|█▎ | 1329/10000 [15:30<1:22:31, 1.75it/s, loss=0.0769, lr=2.46e-05, step=1328] Training: 13%|█▎ | 1329/10000 [15:30<1:22:31, 1.75it/s, loss=0.0525, lr=2.46e-05, step=1329]16:21:37.812 [I] step=1330 loss=0.0274 smoothed_loss=0.0535 lr=2.46e-05 grad_norm=0.8089 step_time=0.5283s data_time=0.0879s it/s=1.623 eta_to_10000=5341.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1869 grad_shared_expert=0.5570 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1330/10000 [15:31<1:30:00, 1.61it/s, loss=0.0525, lr=2.46e-05, step=1329] Training: 13%|█▎ | 1330/10000 [15:31<1:30:00, 1.61it/s, loss=0.0274, lr=2.46e-05, step=1330] Training: 13%|█▎ | 1331/10000 [15:32<1:43:15, 1.40it/s, loss=0.0274, lr=2.46e-05, step=1330] Training: 13%|█▎ | 1331/10000 [15:32<1:43:15, 1.40it/s, loss=0.0521, lr=2.46e-05, step=1331] Training: 13%|█▎ | 1332/10000 [15:33<1:42:35, 1.41it/s, loss=0.0521, lr=2.46e-05, step=1331] Training: 13%|█▎ | 1332/10000 [15:33<1:42:35, 1.41it/s, loss=0.0453, lr=2.46e-05, step=1332] Training: 13%|█▎ | 1333/10000 [15:33<1:45:51, 1.36it/s, loss=0.0453, lr=2.46e-05, step=1332] Training: 13%|█▎ | 1333/10000 [15:33<1:45:51, 1.36it/s, loss=0.0181, lr=2.46e-05, step=1333] Training: 13%|█▎ | 1334/10000 [15:34<1:36:34, 1.50it/s, loss=0.0181, lr=2.46e-05, step=1333] Training: 13%|█▎ | 1334/10000 [15:34<1:36:34, 1.50it/s, loss=0.1060, lr=2.46e-05, step=1334] Training: 13%|█▎ | 1335/10000 [15:34<1:29:20, 1.62it/s, loss=0.1060, lr=2.46e-05, step=1334] Training: 13%|█▎ | 1335/10000 [15:34<1:29:20, 1.62it/s, loss=0.0586, lr=2.46e-05, step=1335] Training: 13%|█▎ | 1336/10000 [15:35<1:28:35, 1.63it/s, loss=0.0586, lr=2.46e-05, step=1335] Training: 13%|█▎ | 1336/10000 [15:35<1:28:35, 1.63it/s, loss=0.0265, lr=2.46e-05, step=1336] Training: 13%|█▎ | 1337/10000 [15:35<1:23:11, 1.74it/s, loss=0.0265, lr=2.46e-05, step=1336] Training: 13%|█▎ | 1337/10000 [15:35<1:23:11, 1.74it/s, loss=0.0446, lr=2.46e-05, step=1337] Training: 13%|█▎ | 1338/10000 [15:36<1:31:24, 1.58it/s, loss=0.0446, lr=2.46e-05, step=1337] Training: 13%|█▎ | 1338/10000 [15:36<1:31:24, 1.58it/s, loss=0.0396, lr=2.46e-05, step=1338] Training: 13%|█▎ | 1339/10000 [15:37<1:25:27, 1.69it/s, loss=0.0396, lr=2.46e-05, step=1338] Training: 13%|█▎ | 1339/10000 [15:37<1:25:27, 1.69it/s, loss=0.0338, lr=2.46e-05, step=1339]16:21:44.372 [I] step=1340 loss=0.0306 smoothed_loss=0.0469 lr=2.46e-05 grad_norm=0.7625 step_time=0.5783s data_time=0.0778s it/s=1.524 eta_to_10000=5680.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0116 grad_action_out_proj=0.1770 grad_shared_expert=0.4213 (10775:train_pytorch.py:850) + Training: 13%|█▎ | 1340/10000 [15:37<1:33:15, 1.55it/s, loss=0.0338, lr=2.46e-05, step=1339] Training: 13%|█▎ | 1340/10000 [15:37<1:33:15, 1.55it/s, loss=0.0306, lr=2.46e-05, step=1340] Training: 13%|█▎ | 1341/10000 [15:38<1:28:27, 1.63it/s, loss=0.0306, lr=2.46e-05, step=1340] Training: 13%|█▎ | 1341/10000 [15:38<1:28:27, 1.63it/s, loss=0.0148, lr=2.46e-05, step=1341] Training: 13%|█▎ | 1342/10000 [15:38<1:23:22, 1.73it/s, loss=0.0148, lr=2.46e-05, step=1341] Training: 13%|█▎ | 1342/10000 [15:38<1:23:22, 1.73it/s, loss=0.0421, lr=2.46e-05, step=1342] Training: 13%|█▎ | 1343/10000 [15:39<1:19:33, 1.81it/s, loss=0.0421, lr=2.46e-05, step=1342] Training: 13%|█▎ | 1343/10000 [15:39<1:19:33, 1.81it/s, loss=0.0204, lr=2.46e-05, step=1343] Training: 13%|█▎ | 1344/10000 [15:40<1:26:39, 1.66it/s, loss=0.0204, lr=2.46e-05, step=1343] Training: 13%|█▎ | 1344/10000 [15:40<1:26:39, 1.66it/s, loss=0.0448, lr=2.46e-05, step=1344] Training: 13%|█▎ | 1345/10000 [15:40<1:34:29, 1.53it/s, loss=0.0448, lr=2.46e-05, step=1344] Training: 13%|█▎ | 1345/10000 [15:40<1:34:29, 1.53it/s, loss=0.0244, lr=2.46e-05, step=1345] Training: 13%|█▎ | 1346/10000 [15:41<1:30:02, 1.60it/s, loss=0.0244, lr=2.46e-05, step=1345] Training: 13%|█▎ | 1346/10000 [15:41<1:30:02, 1.60it/s, loss=0.0565, lr=2.46e-05, step=1346] Training: 13%|█▎ | 1347/10000 [15:42<1:35:35, 1.51it/s, loss=0.0565, lr=2.46e-05, step=1346] Training: 13%|█▎ | 1347/10000 [15:42<1:35:35, 1.51it/s, loss=0.0786, lr=2.46e-05, step=1347] Training: 13%|█▎ | 1348/10000 [15:42<1:28:47, 1.62it/s, loss=0.0786, lr=2.46e-05, step=1347] Training: 13%|█▎ | 1348/10000 [15:42<1:28:47, 1.62it/s, loss=0.0510, lr=2.46e-05, step=1348] Training: 13%|█▎ | 1349/10000 [15:43<1:33:28, 1.54it/s, loss=0.0510, lr=2.46e-05, step=1348] Training: 13%|█▎ | 1349/10000 [15:43<1:33:28, 1.54it/s, loss=0.0407, lr=2.46e-05, step=1349]16:21:50.667 [I] step=1350 loss=0.0636 smoothed_loss=0.0471 lr=2.46e-05 grad_norm=0.6394 step_time=0.5456s data_time=0.0838s it/s=1.589 eta_to_10000=5443.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.2263 grad_shared_expert=0.6218 (10775:train_pytorch.py:850) + Training: 14%|█▎ | 1350/10000 [15:44<1:37:27, 1.48it/s, loss=0.0407, lr=2.46e-05, step=1349] Training: 14%|█▎ | 1350/10000 [15:44<1:37:27, 1.48it/s, loss=0.0636, lr=2.46e-05, step=1350] Training: 14%|█▎ | 1351/10000 [15:45<1:43:34, 1.39it/s, loss=0.0636, lr=2.46e-05, step=1350] Training: 14%|█▎ | 1351/10000 [15:45<1:43:34, 1.39it/s, loss=0.0264, lr=2.46e-05, step=1351] Training: 14%|█▎ | 1352/10000 [15:45<1:44:31, 1.38it/s, loss=0.0264, lr=2.46e-05, step=1351] Training: 14%|█▎ | 1352/10000 [15:45<1:44:31, 1.38it/s, loss=0.0167, lr=2.46e-05, step=1352] Training: 14%|█▎ | 1353/10000 [15:46<1:34:04, 1.53it/s, loss=0.0167, lr=2.46e-05, step=1352] Training: 14%|█▎ | 1353/10000 [15:46<1:34:04, 1.53it/s, loss=0.0326, lr=2.46e-05, step=1353] Training: 14%|█▎ | 1354/10000 [15:46<1:37:09, 1.48it/s, loss=0.0326, lr=2.46e-05, step=1353] Training: 14%|█▎ | 1354/10000 [15:46<1:37:09, 1.48it/s, loss=0.0616, lr=2.46e-05, step=1354] Training: 14%|█▎ | 1355/10000 [15:47<1:30:38, 1.59it/s, loss=0.0616, lr=2.46e-05, step=1354] Training: 14%|█▎ | 1355/10000 [15:47<1:30:38, 1.59it/s, loss=0.0496, lr=2.46e-05, step=1355] Training: 14%|█▎ | 1356/10000 [15:48<1:26:27, 1.67it/s, loss=0.0496, lr=2.46e-05, step=1355] Training: 14%|█▎ | 1356/10000 [15:48<1:26:27, 1.67it/s, loss=0.0208, lr=2.46e-05, step=1356] Training: 14%|█▎ | 1357/10000 [15:48<1:26:29, 1.67it/s, loss=0.0208, lr=2.46e-05, step=1356] Training: 14%|█▎ | 1357/10000 [15:48<1:26:29, 1.67it/s, loss=0.0237, lr=2.46e-05, step=1357] Training: 14%|█▎ | 1358/10000 [15:49<1:22:47, 1.74it/s, loss=0.0237, lr=2.46e-05, step=1357] Training: 14%|█▎ | 1358/10000 [15:49<1:22:47, 1.74it/s, loss=0.0170, lr=2.46e-05, step=1358] Training: 14%|█▎ | 1359/10000 [15:49<1:19:12, 1.82it/s, loss=0.0170, lr=2.46e-05, step=1358] Training: 14%|█▎ | 1359/10000 [15:49<1:19:12, 1.82it/s, loss=0.0355, lr=2.46e-05, step=1359]16:21:56.909 [I] step=1360 loss=0.0195 smoothed_loss=0.0355 lr=2.46e-05 grad_norm=0.5836 step_time=0.5552s data_time=0.0690s it/s=1.602 eta_to_10000=5392.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1242 grad_shared_expert=0.3889 (10775:train_pytorch.py:850) + Training: 14%|█▎ | 1360/10000 [15:50<1:30:26, 1.59it/s, loss=0.0355, lr=2.46e-05, step=1359] Training: 14%|█▎ | 1360/10000 [15:50<1:30:26, 1.59it/s, loss=0.0195, lr=2.45e-05, step=1360] Training: 14%|█▎ | 1361/10000 [15:50<1:24:34, 1.70it/s, loss=0.0195, lr=2.45e-05, step=1360] Training: 14%|█▎ | 1361/10000 [15:50<1:24:34, 1.70it/s, loss=0.0261, lr=2.45e-05, step=1361] Training: 14%|█▎ | 1362/10000 [15:51<1:31:14, 1.58it/s, loss=0.0261, lr=2.45e-05, step=1361] Training: 14%|█▎ | 1362/10000 [15:51<1:31:14, 1.58it/s, loss=0.0380, lr=2.45e-05, step=1362] Training: 14%|█▎ | 1363/10000 [15:52<1:25:30, 1.68it/s, loss=0.0380, lr=2.45e-05, step=1362] Training: 14%|█▎ | 1363/10000 [15:52<1:25:30, 1.68it/s, loss=0.0333, lr=2.45e-05, step=1363] Training: 14%|█▎ | 1364/10000 [15:52<1:21:20, 1.77it/s, loss=0.0333, lr=2.45e-05, step=1363] Training: 14%|█▎ | 1364/10000 [15:52<1:21:20, 1.77it/s, loss=0.0214, lr=2.45e-05, step=1364] Training: 14%|█▎ | 1365/10000 [15:53<1:18:50, 1.83it/s, loss=0.0214, lr=2.45e-05, step=1364] Training: 14%|█▎ | 1365/10000 [15:53<1:18:50, 1.83it/s, loss=0.0285, lr=2.45e-05, step=1365] Training: 14%|█▎ | 1366/10000 [15:53<1:16:19, 1.89it/s, loss=0.0285, lr=2.45e-05, step=1365] Training: 14%|█▎ | 1366/10000 [15:53<1:16:19, 1.89it/s, loss=0.0236, lr=2.45e-05, step=1366] Training: 14%|█▎ | 1367/10000 [15:54<1:29:16, 1.61it/s, loss=0.0236, lr=2.45e-05, step=1366] Training: 14%|█▎ | 1367/10000 [15:54<1:29:16, 1.61it/s, loss=0.0291, lr=2.45e-05, step=1367] Training: 14%|█▎ | 1368/10000 [15:55<1:33:14, 1.54it/s, loss=0.0291, lr=2.45e-05, step=1367] Training: 14%|█▎ | 1368/10000 [15:55<1:33:14, 1.54it/s, loss=0.0119, lr=2.45e-05, step=1368] Training: 14%|█▎ | 1369/10000 [15:55<1:36:50, 1.49it/s, loss=0.0119, lr=2.45e-05, step=1368] Training: 14%|█▎ | 1369/10000 [15:55<1:36:50, 1.49it/s, loss=0.0302, lr=2.45e-05, step=1369]16:22:03.045 [I] step=1370 loss=0.0195 smoothed_loss=0.0288 lr=2.45e-05 grad_norm=0.6400 step_time=0.5399s data_time=0.0737s it/s=1.630 eta_to_10000=5295.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0164 grad_action_out_proj=0.1847 grad_shared_expert=0.7229 (10775:train_pytorch.py:850) + Training: 14%|█▎ | 1370/10000 [15:56<1:35:01, 1.51it/s, loss=0.0302, lr=2.45e-05, step=1369] Training: 14%|█▎ | 1370/10000 [15:56<1:35:01, 1.51it/s, loss=0.0195, lr=2.45e-05, step=1370] Training: 14%|█▎ | 1371/10000 [15:57<1:27:59, 1.63it/s, loss=0.0195, lr=2.45e-05, step=1370] Training: 14%|█▎ | 1371/10000 [15:57<1:27:59, 1.63it/s, loss=0.0207, lr=2.45e-05, step=1371] Training: 14%|█▎ | 1372/10000 [15:57<1:32:22, 1.56it/s, loss=0.0207, lr=2.45e-05, step=1371] Training: 14%|█▎ | 1372/10000 [15:57<1:32:22, 1.56it/s, loss=0.0678, lr=2.45e-05, step=1372] Training: 14%|█▎ | 1373/10000 [15:58<1:26:06, 1.67it/s, loss=0.0678, lr=2.45e-05, step=1372] Training: 14%|█▎ | 1373/10000 [15:58<1:26:06, 1.67it/s, loss=0.0278, lr=2.45e-05, step=1373] Training: 14%|█▎ | 1374/10000 [15:59<1:32:07, 1.56it/s, loss=0.0278, lr=2.45e-05, step=1373] Training: 14%|█▎ | 1374/10000 [15:59<1:32:07, 1.56it/s, loss=0.0431, lr=2.45e-05, step=1374] Training: 14%|█▍ | 1375/10000 [15:59<1:37:31, 1.47it/s, loss=0.0431, lr=2.45e-05, step=1374] Training: 14%|█▍ | 1375/10000 [15:59<1:37:31, 1.47it/s, loss=0.0405, lr=2.45e-05, step=1375] Training: 14%|█▍ | 1376/10000 [16:00<1:40:14, 1.43it/s, loss=0.0405, lr=2.45e-05, step=1375] Training: 14%|█▍ | 1376/10000 [16:00<1:40:14, 1.43it/s, loss=0.0527, lr=2.45e-05, step=1376] Training: 14%|█▍ | 1377/10000 [16:01<1:37:34, 1.47it/s, loss=0.0527, lr=2.45e-05, step=1376] Training: 14%|█▍ | 1377/10000 [16:01<1:37:34, 1.47it/s, loss=0.0200, lr=2.45e-05, step=1377] Training: 14%|█▍ | 1378/10000 [16:01<1:30:04, 1.60it/s, loss=0.0200, lr=2.45e-05, step=1377] Training: 14%|█▍ | 1378/10000 [16:01<1:30:04, 1.60it/s, loss=0.0347, lr=2.45e-05, step=1378] Training: 14%|█▍ | 1379/10000 [16:02<1:24:44, 1.70it/s, loss=0.0347, lr=2.45e-05, step=1378] Training: 14%|█▍ | 1379/10000 [16:02<1:24:44, 1.70it/s, loss=0.0307, lr=2.45e-05, step=1379]16:22:09.382 [I] step=1380 loss=0.0270 smoothed_loss=0.0330 lr=2.45e-05 grad_norm=0.7124 step_time=0.5375s data_time=0.0962s it/s=1.578 eta_to_10000=5461.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.1801 grad_shared_expert=0.6860 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1380/10000 [16:02<1:31:05, 1.58it/s, loss=0.0307, lr=2.45e-05, step=1379] Training: 14%|█▍ | 1380/10000 [16:02<1:31:05, 1.58it/s, loss=0.0270, lr=2.45e-05, step=1380] Training: 14%|█▍ | 1381/10000 [16:03<1:32:13, 1.56it/s, loss=0.0270, lr=2.45e-05, step=1380] Training: 14%|█▍ | 1381/10000 [16:03<1:32:13, 1.56it/s, loss=0.0236, lr=2.45e-05, step=1381] Training: 14%|█▍ | 1382/10000 [16:04<1:37:14, 1.48it/s, loss=0.0236, lr=2.45e-05, step=1381] Training: 14%|█▍ | 1382/10000 [16:04<1:37:14, 1.48it/s, loss=0.0153, lr=2.45e-05, step=1382] Training: 14%|█▍ | 1383/10000 [16:05<1:48:55, 1.32it/s, loss=0.0153, lr=2.45e-05, step=1382] Training: 14%|█▍ | 1383/10000 [16:05<1:48:55, 1.32it/s, loss=0.0384, lr=2.45e-05, step=1383] Training: 14%|█▍ | 1384/10000 [16:05<1:37:39, 1.47it/s, loss=0.0384, lr=2.45e-05, step=1383] Training: 14%|█▍ | 1384/10000 [16:05<1:37:39, 1.47it/s, loss=0.0224, lr=2.45e-05, step=1384] Training: 14%|█▍ | 1385/10000 [16:06<1:29:22, 1.61it/s, loss=0.0224, lr=2.45e-05, step=1384] Training: 14%|█▍ | 1385/10000 [16:06<1:29:22, 1.61it/s, loss=0.0414, lr=2.45e-05, step=1385] Training: 14%|█▍ | 1386/10000 [16:07<1:33:13, 1.54it/s, loss=0.0414, lr=2.45e-05, step=1385] Training: 14%|█▍ | 1386/10000 [16:07<1:33:13, 1.54it/s, loss=0.0230, lr=2.45e-05, step=1386] Training: 14%|█▍ | 1387/10000 [16:07<1:30:48, 1.58it/s, loss=0.0230, lr=2.45e-05, step=1386] Training: 14%|█▍ | 1387/10000 [16:07<1:30:48, 1.58it/s, loss=0.0198, lr=2.45e-05, step=1387] Training: 14%|█▍ | 1388/10000 [16:08<1:32:16, 1.56it/s, loss=0.0198, lr=2.45e-05, step=1387] Training: 14%|█▍ | 1388/10000 [16:08<1:32:16, 1.56it/s, loss=0.0797, lr=2.45e-05, step=1388] Training: 14%|█▍ | 1389/10000 [16:09<1:36:55, 1.48it/s, loss=0.0797, lr=2.45e-05, step=1388] Training: 14%|█▍ | 1389/10000 [16:09<1:36:55, 1.48it/s, loss=0.0292, lr=2.45e-05, step=1389]16:22:16.203 [I] step=1390 loss=0.0581 smoothed_loss=0.0364 lr=2.45e-05 grad_norm=0.6877 step_time=0.5879s data_time=0.0941s it/s=1.466 eta_to_10000=5871.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.1398 grad_shared_expert=0.6125 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1390/10000 [16:09<1:39:49, 1.44it/s, loss=0.0292, lr=2.45e-05, step=1389] Training: 14%|█▍ | 1390/10000 [16:09<1:39:49, 1.44it/s, loss=0.0581, lr=2.45e-05, step=1390] Training: 14%|█▍ | 1391/10000 [16:10<1:31:12, 1.57it/s, loss=0.0581, lr=2.45e-05, step=1390] Training: 14%|█▍ | 1391/10000 [16:10<1:31:12, 1.57it/s, loss=0.0369, lr=2.45e-05, step=1391] Training: 14%|█▍ | 1392/10000 [16:10<1:34:31, 1.52it/s, loss=0.0369, lr=2.45e-05, step=1391] Training: 14%|█▍ | 1392/10000 [16:10<1:34:31, 1.52it/s, loss=0.0196, lr=2.45e-05, step=1392] Training: 14%|█▍ | 1393/10000 [16:11<1:28:37, 1.62it/s, loss=0.0196, lr=2.45e-05, step=1392] Training: 14%|█▍ | 1393/10000 [16:11<1:28:37, 1.62it/s, loss=0.0349, lr=2.45e-05, step=1393] Training: 14%|█▍ | 1394/10000 [16:11<1:23:10, 1.72it/s, loss=0.0349, lr=2.45e-05, step=1393] Training: 14%|█▍ | 1394/10000 [16:11<1:23:10, 1.72it/s, loss=0.0171, lr=2.45e-05, step=1394] Training: 14%|█▍ | 1395/10000 [16:12<1:19:02, 1.81it/s, loss=0.0171, lr=2.45e-05, step=1394] Training: 14%|█▍ | 1395/10000 [16:12<1:19:02, 1.81it/s, loss=0.0266, lr=2.45e-05, step=1395] Training: 14%|█▍ | 1396/10000 [16:13<1:26:46, 1.65it/s, loss=0.0266, lr=2.45e-05, step=1395] Training: 14%|█▍ | 1396/10000 [16:13<1:26:46, 1.65it/s, loss=0.0438, lr=2.45e-05, step=1396] Training: 14%|█▍ | 1397/10000 [16:14<1:37:17, 1.47it/s, loss=0.0438, lr=2.45e-05, step=1396] Training: 14%|█▍ | 1397/10000 [16:14<1:37:17, 1.47it/s, loss=0.0678, lr=2.45e-05, step=1397] Training: 14%|█▍ | 1398/10000 [16:14<1:29:49, 1.60it/s, loss=0.0678, lr=2.45e-05, step=1397] Training: 14%|█▍ | 1398/10000 [16:14<1:29:49, 1.60it/s, loss=0.0247, lr=2.45e-05, step=1398] Training: 14%|█▍ | 1399/10000 [16:15<1:26:31, 1.66it/s, loss=0.0247, lr=2.45e-05, step=1398] Training: 14%|█▍ | 1399/10000 [16:15<1:26:31, 1.66it/s, loss=0.0328, lr=2.45e-05, step=1399]16:22:22.138 [I] step=1400 loss=0.0524 smoothed_loss=0.0371 lr=2.45e-05 grad_norm=0.6551 step_time=0.5131s data_time=0.0805s it/s=1.685 eta_to_10000=5104.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0245 grad_action_out_proj=0.2522 grad_shared_expert=0.6559 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1400/10000 [16:15<1:26:05, 1.66it/s, loss=0.0328, lr=2.45e-05, step=1399] Training: 14%|█▍ | 1400/10000 [16:15<1:26:05, 1.66it/s, loss=0.0524, lr=2.45e-05, step=1400] Training: 14%|█▍ | 1401/10000 [16:16<1:23:58, 1.71it/s, loss=0.0524, lr=2.45e-05, step=1400] Training: 14%|█▍ | 1401/10000 [16:16<1:23:58, 1.71it/s, loss=0.0133, lr=2.45e-05, step=1401] Training: 14%|█▍ | 1402/10000 [16:16<1:20:46, 1.77it/s, loss=0.0133, lr=2.45e-05, step=1401] Training: 14%|█▍ | 1402/10000 [16:16<1:20:46, 1.77it/s, loss=0.0233, lr=2.45e-05, step=1402] Training: 14%|█▍ | 1403/10000 [16:17<1:38:51, 1.45it/s, loss=0.0233, lr=2.45e-05, step=1402] Training: 14%|█▍ | 1403/10000 [16:17<1:38:51, 1.45it/s, loss=0.0339, lr=2.45e-05, step=1403] Training: 14%|█▍ | 1404/10000 [16:18<1:31:52, 1.56it/s, loss=0.0339, lr=2.45e-05, step=1403] Training: 14%|█▍ | 1404/10000 [16:18<1:31:52, 1.56it/s, loss=0.0796, lr=2.45e-05, step=1404] Training: 14%|█▍ | 1405/10000 [16:19<1:38:31, 1.45it/s, loss=0.0796, lr=2.45e-05, step=1404] Training: 14%|█▍ | 1405/10000 [16:19<1:38:31, 1.45it/s, loss=0.0355, lr=2.45e-05, step=1405] Training: 14%|█▍ | 1406/10000 [16:19<1:30:19, 1.59it/s, loss=0.0355, lr=2.45e-05, step=1405] Training: 14%|█▍ | 1406/10000 [16:19<1:30:19, 1.59it/s, loss=0.0334, lr=2.45e-05, step=1406] Training: 14%|█▍ | 1407/10000 [16:20<1:24:15, 1.70it/s, loss=0.0334, lr=2.45e-05, step=1406] Training: 14%|█▍ | 1407/10000 [16:20<1:24:15, 1.70it/s, loss=0.0388, lr=2.45e-05, step=1407] Training: 14%|█▍ | 1408/10000 [16:20<1:20:54, 1.77it/s, loss=0.0388, lr=2.45e-05, step=1407] Training: 14%|█▍ | 1408/10000 [16:20<1:20:54, 1.77it/s, loss=0.0177, lr=2.45e-05, step=1408] Training: 14%|█▍ | 1409/10000 [16:21<1:28:12, 1.62it/s, loss=0.0177, lr=2.45e-05, step=1408] Training: 14%|█▍ | 1409/10000 [16:21<1:28:12, 1.62it/s, loss=0.0321, lr=2.45e-05, step=1409]16:22:28.282 [I] step=1410 loss=0.0410 smoothed_loss=0.0358 lr=2.45e-05 grad_norm=0.8354 step_time=0.5369s data_time=0.0774s it/s=1.628 eta_to_10000=5276.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0124 grad_action_out_proj=0.2053 grad_shared_expert=0.5687 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1410/10000 [16:21<1:24:53, 1.69it/s, loss=0.0321, lr=2.45e-05, step=1409] Training: 14%|█▍ | 1410/10000 [16:21<1:24:53, 1.69it/s, loss=0.0410, lr=2.45e-05, step=1410] Training: 14%|█▍ | 1411/10000 [16:22<1:27:01, 1.64it/s, loss=0.0410, lr=2.45e-05, step=1410] Training: 14%|█▍ | 1411/10000 [16:22<1:27:01, 1.64it/s, loss=0.0134, lr=2.45e-05, step=1411] Training: 14%|█▍ | 1412/10000 [16:23<1:32:02, 1.56it/s, loss=0.0134, lr=2.45e-05, step=1411] Training: 14%|█▍ | 1412/10000 [16:23<1:32:02, 1.56it/s, loss=0.0393, lr=2.45e-05, step=1412] Training: 14%|█▍ | 1413/10000 [16:23<1:29:30, 1.60it/s, loss=0.0393, lr=2.45e-05, step=1412] Training: 14%|█▍ | 1413/10000 [16:23<1:29:30, 1.60it/s, loss=0.0212, lr=2.45e-05, step=1413] Training: 14%|█▍ | 1414/10000 [16:24<1:33:18, 1.53it/s, loss=0.0212, lr=2.45e-05, step=1413] Training: 14%|█▍ | 1414/10000 [16:24<1:33:18, 1.53it/s, loss=0.0364, lr=2.45e-05, step=1414] Training: 14%|█▍ | 1415/10000 [16:25<1:34:52, 1.51it/s, loss=0.0364, lr=2.45e-05, step=1414] Training: 14%|█▍ | 1415/10000 [16:25<1:34:52, 1.51it/s, loss=0.0412, lr=2.45e-05, step=1415] Training: 14%|█▍ | 1416/10000 [16:26<1:46:32, 1.34it/s, loss=0.0412, lr=2.45e-05, step=1415] Training: 14%|█▍ | 1416/10000 [16:26<1:46:32, 1.34it/s, loss=0.0187, lr=2.45e-05, step=1416] Training: 14%|█▍ | 1417/10000 [16:26<1:35:44, 1.49it/s, loss=0.0187, lr=2.45e-05, step=1416] Training: 14%|█▍ | 1417/10000 [16:26<1:35:44, 1.49it/s, loss=0.0254, lr=2.45e-05, step=1417] Training: 14%|█▍ | 1418/10000 [16:27<1:28:49, 1.61it/s, loss=0.0254, lr=2.45e-05, step=1417] Training: 14%|█▍ | 1418/10000 [16:27<1:28:49, 1.61it/s, loss=0.0384, lr=2.45e-05, step=1418] Training: 14%|█▍ | 1419/10000 [16:27<1:34:22, 1.52it/s, loss=0.0384, lr=2.45e-05, step=1418] Training: 14%|█▍ | 1419/10000 [16:27<1:34:22, 1.52it/s, loss=0.0250, lr=2.45e-05, step=1419]16:22:34.867 [I] step=1420 loss=0.0296 smoothed_loss=0.0315 lr=2.45e-05 grad_norm=0.6967 step_time=0.5584s data_time=0.1001s it/s=1.519 eta_to_10000=5649.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0113 grad_action_out_proj=0.1410 grad_shared_expert=0.5234 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1420/10000 [16:28<1:29:19, 1.60it/s, loss=0.0250, lr=2.45e-05, step=1419] Training: 14%|█▍ | 1420/10000 [16:28<1:29:19, 1.60it/s, loss=0.0296, lr=2.45e-05, step=1420] Training: 14%|█▍ | 1421/10000 [16:28<1:23:40, 1.71it/s, loss=0.0296, lr=2.45e-05, step=1420] Training: 14%|█▍ | 1421/10000 [16:28<1:23:40, 1.71it/s, loss=0.0263, lr=2.45e-05, step=1421] Training: 14%|█▍ | 1422/10000 [16:29<1:24:44, 1.69it/s, loss=0.0263, lr=2.45e-05, step=1421] Training: 14%|█▍ | 1422/10000 [16:29<1:24:44, 1.69it/s, loss=0.0239, lr=2.45e-05, step=1422] Training: 14%|█▍ | 1423/10000 [16:30<1:21:18, 1.76it/s, loss=0.0239, lr=2.45e-05, step=1422] Training: 14%|█▍ | 1423/10000 [16:30<1:21:18, 1.76it/s, loss=0.0491, lr=2.45e-05, step=1423] Training: 14%|█▍ | 1424/10000 [16:31<1:40:35, 1.42it/s, loss=0.0491, lr=2.45e-05, step=1423] Training: 14%|█▍ | 1424/10000 [16:31<1:40:35, 1.42it/s, loss=0.0500, lr=2.45e-05, step=1424] Training: 14%|█▍ | 1425/10000 [16:31<1:31:55, 1.55it/s, loss=0.0500, lr=2.45e-05, step=1424] Training: 14%|█▍ | 1425/10000 [16:31<1:31:55, 1.55it/s, loss=0.0166, lr=2.45e-05, step=1425] Training: 14%|█▍ | 1426/10000 [16:32<1:35:12, 1.50it/s, loss=0.0166, lr=2.45e-05, step=1425] Training: 14%|█▍ | 1426/10000 [16:32<1:35:12, 1.50it/s, loss=0.0331, lr=2.45e-05, step=1426] Training: 14%|█▍ | 1427/10000 [16:32<1:27:51, 1.63it/s, loss=0.0331, lr=2.45e-05, step=1426] Training: 14%|█▍ | 1427/10000 [16:32<1:27:51, 1.63it/s, loss=0.0485, lr=2.45e-05, step=1427] Training: 14%|█▍ | 1428/10000 [16:33<1:28:04, 1.62it/s, loss=0.0485, lr=2.45e-05, step=1427] Training: 14%|█▍ | 1428/10000 [16:33<1:28:04, 1.62it/s, loss=0.1360, lr=2.45e-05, step=1428] Training: 14%|█▍ | 1429/10000 [16:34<1:28:25, 1.62it/s, loss=0.1360, lr=2.45e-05, step=1428] Training: 14%|█▍ | 1429/10000 [16:34<1:28:25, 1.62it/s, loss=0.0428, lr=2.45e-05, step=1429]16:22:41.104 [I] step=1430 loss=0.0625 smoothed_loss=0.0458 lr=2.45e-05 grad_norm=0.6918 step_time=0.5359s data_time=0.0878s it/s=1.604 eta_to_10000=5344.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0153 grad_action_out_proj=0.2071 grad_shared_expert=0.7021 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1430/10000 [16:34<1:29:19, 1.60it/s, loss=0.0428, lr=2.45e-05, step=1429] Training: 14%|█▍ | 1430/10000 [16:34<1:29:19, 1.60it/s, loss=0.0625, lr=2.45e-05, step=1430] Training: 14%|█▍ | 1431/10000 [16:35<1:36:14, 1.48it/s, loss=0.0625, lr=2.45e-05, step=1430] Training: 14%|█▍ | 1431/10000 [16:35<1:36:14, 1.48it/s, loss=0.0240, lr=2.45e-05, step=1431] Training: 14%|█▍ | 1432/10000 [16:35<1:28:57, 1.61it/s, loss=0.0240, lr=2.45e-05, step=1431] Training: 14%|█▍ | 1432/10000 [16:35<1:28:57, 1.61it/s, loss=0.0495, lr=2.45e-05, step=1432] Training: 14%|█▍ | 1433/10000 [16:36<1:35:26, 1.50it/s, loss=0.0495, lr=2.45e-05, step=1432] Training: 14%|█▍ | 1433/10000 [16:36<1:35:26, 1.50it/s, loss=0.0585, lr=2.45e-05, step=1433] Training: 14%|█▍ | 1434/10000 [16:37<1:29:47, 1.59it/s, loss=0.0585, lr=2.45e-05, step=1433] Training: 14%|█▍ | 1434/10000 [16:37<1:29:47, 1.59it/s, loss=0.0222, lr=2.45e-05, step=1434] Training: 14%|█▍ | 1435/10000 [16:38<1:36:49, 1.47it/s, loss=0.0222, lr=2.45e-05, step=1434] Training: 14%|█▍ | 1435/10000 [16:38<1:36:49, 1.47it/s, loss=0.0732, lr=2.45e-05, step=1435] Training: 14%|█▍ | 1436/10000 [16:38<1:36:36, 1.48it/s, loss=0.0732, lr=2.45e-05, step=1435] Training: 14%|█▍ | 1436/10000 [16:38<1:36:36, 1.48it/s, loss=0.0207, lr=2.45e-05, step=1436] Training: 14%|█▍ | 1437/10000 [16:39<1:34:51, 1.50it/s, loss=0.0207, lr=2.45e-05, step=1436] Training: 14%|█▍ | 1437/10000 [16:39<1:34:51, 1.50it/s, loss=0.0403, lr=2.45e-05, step=1437] Training: 14%|█▍ | 1438/10000 [16:40<1:42:45, 1.39it/s, loss=0.0403, lr=2.45e-05, step=1437] Training: 14%|█▍ | 1438/10000 [16:40<1:42:45, 1.39it/s, loss=0.0182, lr=2.45e-05, step=1438] Training: 14%|█▍ | 1439/10000 [16:41<1:53:39, 1.26it/s, loss=0.0182, lr=2.45e-05, step=1438] Training: 14%|█▍ | 1439/10000 [16:41<1:53:39, 1.26it/s, loss=0.0387, lr=2.45e-05, step=1439]16:22:48.408 [I] step=1440 loss=0.0267 smoothed_loss=0.0393 lr=2.45e-05 grad_norm=0.7153 step_time=0.6194s data_time=0.1111s it/s=1.369 eta_to_10000=6251.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0113 grad_action_out_proj=0.1862 grad_shared_expert=0.5675 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1440/10000 [16:41<1:52:41, 1.27it/s, loss=0.0387, lr=2.45e-05, step=1439] Training: 14%|█▍ | 1440/10000 [16:41<1:52:41, 1.27it/s, loss=0.0267, lr=2.45e-05, step=1440] Training: 14%|█▍ | 1441/10000 [16:42<1:46:48, 1.34it/s, loss=0.0267, lr=2.45e-05, step=1440] Training: 14%|█▍ | 1441/10000 [16:42<1:46:48, 1.34it/s, loss=0.0186, lr=2.45e-05, step=1441] Training: 14%|█▍ | 1442/10000 [16:43<1:44:31, 1.36it/s, loss=0.0186, lr=2.45e-05, step=1441] Training: 14%|█▍ | 1442/10000 [16:43<1:44:31, 1.36it/s, loss=0.0445, lr=2.45e-05, step=1442] Training: 14%|█▍ | 1443/10000 [16:43<1:36:17, 1.48it/s, loss=0.0445, lr=2.45e-05, step=1442] Training: 14%|█▍ | 1443/10000 [16:43<1:36:17, 1.48it/s, loss=0.0242, lr=2.45e-05, step=1443] Training: 14%|█▍ | 1444/10000 [16:44<1:40:51, 1.41it/s, loss=0.0242, lr=2.45e-05, step=1443] Training: 14%|█▍ | 1444/10000 [16:44<1:40:51, 1.41it/s, loss=0.0518, lr=2.45e-05, step=1444] Training: 14%|█▍ | 1445/10000 [16:45<1:35:18, 1.50it/s, loss=0.0518, lr=2.45e-05, step=1444] Training: 14%|█▍ | 1445/10000 [16:45<1:35:18, 1.50it/s, loss=0.1445, lr=2.45e-05, step=1445] Training: 14%|█▍ | 1446/10000 [16:46<1:50:41, 1.29it/s, loss=0.1445, lr=2.45e-05, step=1445] Training: 14%|█▍ | 1446/10000 [16:46<1:50:41, 1.29it/s, loss=0.0553, lr=2.45e-05, step=1446] Training: 14%|█▍ | 1447/10000 [16:47<1:54:51, 1.24it/s, loss=0.0553, lr=2.45e-05, step=1446] Training: 14%|█▍ | 1447/10000 [16:47<1:54:51, 1.24it/s, loss=0.0252, lr=2.45e-05, step=1447] Training: 14%|█▍ | 1448/10000 [16:48<2:28:34, 1.04s/it, loss=0.0252, lr=2.45e-05, step=1447] Training: 14%|█▍ | 1448/10000 [16:48<2:28:34, 1.04s/it, loss=0.0277, lr=2.45e-05, step=1448] Training: 14%|█▍ | 1449/10000 [16:49<2:10:17, 1.09it/s, loss=0.0277, lr=2.45e-05, step=1448] Training: 14%|█▍ | 1449/10000 [16:49<2:10:17, 1.09it/s, loss=0.0137, lr=2.45e-05, step=1449]16:22:56.465 [I] step=1450 loss=0.0626 smoothed_loss=0.0440 lr=2.45e-05 grad_norm=0.7544 step_time=0.6601s data_time=0.1456s it/s=1.241 eta_to_10000=6887.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.2101 grad_shared_expert=0.6986 (10775:train_pytorch.py:850) + Training: 14%|█▍ | 1450/10000 [16:50<2:00:54, 1.18it/s, loss=0.0137, lr=2.45e-05, step=1449] Training: 14%|█▍ | 1450/10000 [16:50<2:00:54, 1.18it/s, loss=0.0626, lr=2.45e-05, step=1450] Training: 15%|█▍ | 1451/10000 [16:50<1:50:39, 1.29it/s, loss=0.0626, lr=2.45e-05, step=1450] Training: 15%|█▍ | 1451/10000 [16:50<1:50:39, 1.29it/s, loss=0.0185, lr=2.44e-05, step=1451] Training: 15%|█▍ | 1452/10000 [16:51<1:50:37, 1.29it/s, loss=0.0185, lr=2.44e-05, step=1451] Training: 15%|█▍ | 1452/10000 [16:51<1:50:37, 1.29it/s, loss=0.0134, lr=2.44e-05, step=1452] Training: 15%|█▍ | 1453/10000 [16:52<1:49:32, 1.30it/s, loss=0.0134, lr=2.44e-05, step=1452] Training: 15%|█▍ | 1453/10000 [16:52<1:49:32, 1.30it/s, loss=0.0365, lr=2.44e-05, step=1453] Training: 15%|█▍ | 1454/10000 [16:52<1:43:38, 1.37it/s, loss=0.0365, lr=2.44e-05, step=1453] Training: 15%|█▍ | 1454/10000 [16:52<1:43:38, 1.37it/s, loss=0.0492, lr=2.44e-05, step=1454] Training: 15%|█▍ | 1455/10000 [16:53<1:43:18, 1.38it/s, loss=0.0492, lr=2.44e-05, step=1454] Training: 15%|█▍ | 1455/10000 [16:53<1:43:18, 1.38it/s, loss=0.1438, lr=2.44e-05, step=1455] Training: 15%|█▍ | 1456/10000 [16:54<1:40:58, 1.41it/s, loss=0.1438, lr=2.44e-05, step=1455] Training: 15%|█▍ | 1456/10000 [16:54<1:40:58, 1.41it/s, loss=0.0206, lr=2.44e-05, step=1456] Training: 15%|█▍ | 1457/10000 [16:54<1:43:24, 1.38it/s, loss=0.0206, lr=2.44e-05, step=1456] Training: 15%|█▍ | 1457/10000 [16:54<1:43:24, 1.38it/s, loss=0.0254, lr=2.44e-05, step=1457] Training: 15%|█▍ | 1458/10000 [16:55<1:35:05, 1.50it/s, loss=0.0254, lr=2.44e-05, step=1457] Training: 15%|█▍ | 1458/10000 [16:55<1:35:05, 1.50it/s, loss=0.0335, lr=2.44e-05, step=1458] Training: 15%|█▍ | 1459/10000 [16:56<1:37:42, 1.46it/s, loss=0.0335, lr=2.44e-05, step=1458] Training: 15%|█▍ | 1459/10000 [16:56<1:37:42, 1.46it/s, loss=0.0248, lr=2.44e-05, step=1459]16:23:03.533 [I] step=1460 loss=0.0844 smoothed_loss=0.0461 lr=2.44e-05 grad_norm=0.8047 step_time=0.5963s data_time=0.1106s it/s=1.415 eta_to_10000=6035.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.2578 grad_shared_expert=0.5675 (10775:train_pytorch.py:850) + Training: 15%|█▍ | 1460/10000 [16:57<1:46:02, 1.34it/s, loss=0.0248, lr=2.44e-05, step=1459] Training: 15%|█▍ | 1460/10000 [16:57<1:46:02, 1.34it/s, loss=0.0844, lr=2.44e-05, step=1460] Training: 15%|█▍ | 1461/10000 [16:57<1:35:16, 1.49it/s, loss=0.0844, lr=2.44e-05, step=1460] Training: 15%|█▍ | 1461/10000 [16:57<1:35:16, 1.49it/s, loss=0.0213, lr=2.44e-05, step=1461] Training: 15%|█▍ | 1462/10000 [16:58<1:52:09, 1.27it/s, loss=0.0213, lr=2.44e-05, step=1461] Training: 15%|█▍ | 1462/10000 [16:58<1:52:09, 1.27it/s, loss=0.0518, lr=2.44e-05, step=1462] Training: 15%|█▍ | 1463/10000 [16:59<1:54:29, 1.24it/s, loss=0.0518, lr=2.44e-05, step=1462] Training: 15%|█▍ | 1463/10000 [16:59<1:54:29, 1.24it/s, loss=0.0311, lr=2.44e-05, step=1463] Training: 15%|█▍ | 1464/10000 [17:00<1:46:30, 1.34it/s, loss=0.0311, lr=2.44e-05, step=1463] Training: 15%|█▍ | 1464/10000 [17:00<1:46:30, 1.34it/s, loss=0.0322, lr=2.44e-05, step=1464] Training: 15%|█▍ | 1465/10000 [17:00<1:43:28, 1.37it/s, loss=0.0322, lr=2.44e-05, step=1464] Training: 15%|█▍ | 1465/10000 [17:00<1:43:28, 1.37it/s, loss=0.0907, lr=2.44e-05, step=1465] Training: 15%|█▍ | 1466/10000 [17:01<1:36:34, 1.47it/s, loss=0.0907, lr=2.44e-05, step=1465] Training: 15%|█▍ | 1466/10000 [17:01<1:36:34, 1.47it/s, loss=0.0317, lr=2.44e-05, step=1466] Training: 15%|█▍ | 1467/10000 [17:02<1:42:38, 1.39it/s, loss=0.0317, lr=2.44e-05, step=1466] Training: 15%|█▍ | 1467/10000 [17:02<1:42:38, 1.39it/s, loss=0.0424, lr=2.44e-05, step=1467] Training: 15%|█▍ | 1468/10000 [17:02<1:42:39, 1.39it/s, loss=0.0424, lr=2.44e-05, step=1467] Training: 15%|█▍ | 1468/10000 [17:02<1:42:39, 1.39it/s, loss=0.0303, lr=2.44e-05, step=1468] Training: 15%|█▍ | 1469/10000 [17:03<1:33:21, 1.52it/s, loss=0.0303, lr=2.44e-05, step=1468] Training: 15%|█▍ | 1469/10000 [17:03<1:33:21, 1.52it/s, loss=0.0157, lr=2.44e-05, step=1469]16:23:10.785 [I] step=1470 loss=0.0360 smoothed_loss=0.0403 lr=2.44e-05 grad_norm=0.6131 step_time=0.6015s data_time=0.1237s it/s=1.379 eta_to_10000=6185.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.2074 grad_shared_expert=0.6429 (10775:train_pytorch.py:850) + Training: 15%|█▍ | 1470/10000 [17:04<1:45:27, 1.35it/s, loss=0.0157, lr=2.44e-05, step=1469] Training: 15%|█▍ | 1470/10000 [17:04<1:45:27, 1.35it/s, loss=0.0360, lr=2.44e-05, step=1470] Training: 15%|█▍ | 1471/10000 [17:05<1:44:10, 1.36it/s, loss=0.0360, lr=2.44e-05, step=1470] Training: 15%|█▍ | 1471/10000 [17:05<1:44:10, 1.36it/s, loss=0.0934, lr=2.44e-05, step=1471] Training: 15%|█▍ | 1472/10000 [17:05<1:34:25, 1.51it/s, loss=0.0934, lr=2.44e-05, step=1471] Training: 15%|█▍ | 1472/10000 [17:05<1:34:25, 1.51it/s, loss=0.0209, lr=2.44e-05, step=1472] Training: 15%|█▍ | 1473/10000 [17:06<1:33:29, 1.52it/s, loss=0.0209, lr=2.44e-05, step=1472] Training: 15%|█▍ | 1473/10000 [17:06<1:33:29, 1.52it/s, loss=0.0263, lr=2.44e-05, step=1473] Training: 15%|█▍ | 1474/10000 [17:06<1:38:47, 1.44it/s, loss=0.0263, lr=2.44e-05, step=1473] Training: 15%|█▍ | 1474/10000 [17:06<1:38:47, 1.44it/s, loss=0.0216, lr=2.44e-05, step=1474] Training: 15%|█▍ | 1475/10000 [17:07<1:30:53, 1.56it/s, loss=0.0216, lr=2.44e-05, step=1474] Training: 15%|█▍ | 1475/10000 [17:07<1:30:53, 1.56it/s, loss=0.0520, lr=2.44e-05, step=1475] Training: 15%|█▍ | 1476/10000 [17:08<1:28:11, 1.61it/s, loss=0.0520, lr=2.44e-05, step=1475] Training: 15%|█▍ | 1476/10000 [17:08<1:28:11, 1.61it/s, loss=0.0422, lr=2.44e-05, step=1476] Training: 15%|█▍ | 1477/10000 [17:09<1:41:42, 1.40it/s, loss=0.0422, lr=2.44e-05, step=1476] Training: 15%|█▍ | 1477/10000 [17:09<1:41:42, 1.40it/s, loss=0.0170, lr=2.44e-05, step=1477] Training: 15%|█▍ | 1478/10000 [17:09<1:32:17, 1.54it/s, loss=0.0170, lr=2.44e-05, step=1477] Training: 15%|█▍ | 1478/10000 [17:09<1:32:17, 1.54it/s, loss=0.0100, lr=2.44e-05, step=1478] Training: 15%|█▍ | 1479/10000 [17:10<1:25:23, 1.66it/s, loss=0.0100, lr=2.44e-05, step=1478] Training: 15%|█▍ | 1479/10000 [17:10<1:25:23, 1.66it/s, loss=0.0196, lr=2.44e-05, step=1479]16:23:17.043 [I] step=1480 loss=0.0219 smoothed_loss=0.0328 lr=2.44e-05 grad_norm=0.7932 step_time=0.5362s data_time=0.0896s it/s=1.598 eta_to_10000=5330.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0193 grad_action_out_proj=0.2153 grad_shared_expert=0.5726 (10775:train_pytorch.py:850) + Training: 15%|█▍ | 1480/10000 [17:10<1:25:40, 1.66it/s, loss=0.0196, lr=2.44e-05, step=1479] Training: 15%|█▍ | 1480/10000 [17:10<1:25:40, 1.66it/s, loss=0.0219, lr=2.44e-05, step=1480] Training: 15%|█▍ | 1481/10000 [17:11<1:40:09, 1.42it/s, loss=0.0219, lr=2.44e-05, step=1480] Training: 15%|█▍ | 1481/10000 [17:11<1:40:09, 1.42it/s, loss=0.0437, lr=2.44e-05, step=1481] Training: 15%|█▍ | 1482/10000 [17:12<1:39:11, 1.43it/s, loss=0.0437, lr=2.44e-05, step=1481] Training: 15%|█▍ | 1482/10000 [17:12<1:39:11, 1.43it/s, loss=0.0347, lr=2.44e-05, step=1482] Training: 15%|█▍ | 1483/10000 [17:13<1:53:05, 1.26it/s, loss=0.0347, lr=2.44e-05, step=1482] Training: 15%|█▍ | 1483/10000 [17:13<1:53:05, 1.26it/s, loss=0.0919, lr=2.44e-05, step=1483] Training: 15%|█▍ | 1484/10000 [17:13<1:41:06, 1.40it/s, loss=0.0919, lr=2.44e-05, step=1483] Training: 15%|█▍ | 1484/10000 [17:13<1:41:06, 1.40it/s, loss=0.0179, lr=2.44e-05, step=1484] Training: 15%|█▍ | 1485/10000 [17:14<1:35:26, 1.49it/s, loss=0.0179, lr=2.44e-05, step=1484] Training: 15%|█▍ | 1485/10000 [17:14<1:35:26, 1.49it/s, loss=0.0480, lr=2.44e-05, step=1485] Training: 15%|█▍ | 1486/10000 [17:14<1:28:13, 1.61it/s, loss=0.0480, lr=2.44e-05, step=1485] Training: 15%|█▍ | 1486/10000 [17:14<1:28:13, 1.61it/s, loss=0.0231, lr=2.44e-05, step=1486] Training: 15%|█▍ | 1487/10000 [17:15<1:27:58, 1.61it/s, loss=0.0231, lr=2.44e-05, step=1486] Training: 15%|█▍ | 1487/10000 [17:15<1:27:58, 1.61it/s, loss=0.0110, lr=2.44e-05, step=1487] Training: 15%|█▍ | 1488/10000 [17:16<1:31:48, 1.55it/s, loss=0.0110, lr=2.44e-05, step=1487] Training: 15%|█▍ | 1488/10000 [17:16<1:31:48, 1.55it/s, loss=0.0160, lr=2.44e-05, step=1488] Training: 15%|█▍ | 1489/10000 [17:16<1:25:53, 1.65it/s, loss=0.0160, lr=2.44e-05, step=1488] Training: 15%|█▍ | 1489/10000 [17:16<1:25:53, 1.65it/s, loss=0.0175, lr=2.44e-05, step=1489]16:23:23.836 [I] step=1490 loss=0.0246 smoothed_loss=0.0305 lr=2.44e-05 grad_norm=0.8101 step_time=0.5658s data_time=0.1135s it/s=1.472 eta_to_10000=5780.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0116 grad_action_out_proj=0.1624 grad_shared_expert=0.6501 (10775:train_pytorch.py:850) + Training: 15%|█▍ | 1490/10000 [17:17<1:30:13, 1.57it/s, loss=0.0175, lr=2.44e-05, step=1489] Training: 15%|█▍ | 1490/10000 [17:17<1:30:13, 1.57it/s, loss=0.0246, lr=2.44e-05, step=1490] Training: 15%|█▍ | 1491/10000 [17:18<1:35:05, 1.49it/s, loss=0.0246, lr=2.44e-05, step=1490] Training: 15%|█▍ | 1491/10000 [17:18<1:35:05, 1.49it/s, loss=0.0117, lr=2.44e-05, step=1491] Training: 15%|█▍ | 1492/10000 [17:18<1:35:03, 1.49it/s, loss=0.0117, lr=2.44e-05, step=1491] Training: 15%|█▍ | 1492/10000 [17:18<1:35:03, 1.49it/s, loss=0.0398, lr=2.44e-05, step=1492] Training: 15%|█▍ | 1493/10000 [17:19<1:28:09, 1.61it/s, loss=0.0398, lr=2.44e-05, step=1492] Training: 15%|█▍ | 1493/10000 [17:19<1:28:09, 1.61it/s, loss=0.0380, lr=2.44e-05, step=1493] Training: 15%|█▍ | 1494/10000 [17:19<1:23:25, 1.70it/s, loss=0.0380, lr=2.44e-05, step=1493] Training: 15%|█▍ | 1494/10000 [17:19<1:23:25, 1.70it/s, loss=0.0753, lr=2.44e-05, step=1494] Training: 15%|█▍ | 1495/10000 [17:20<1:38:53, 1.43it/s, loss=0.0753, lr=2.44e-05, step=1494] Training: 15%|█▍ | 1495/10000 [17:20<1:38:53, 1.43it/s, loss=0.0366, lr=2.44e-05, step=1495] Training: 15%|█▍ | 1496/10000 [17:21<1:38:47, 1.43it/s, loss=0.0366, lr=2.44e-05, step=1495] Training: 15%|█▍ | 1496/10000 [17:21<1:38:47, 1.43it/s, loss=0.1211, lr=2.44e-05, step=1496] Training: 15%|█▍ | 1497/10000 [17:21<1:30:23, 1.57it/s, loss=0.1211, lr=2.44e-05, step=1496] Training: 15%|█▍ | 1497/10000 [17:21<1:30:23, 1.57it/s, loss=0.0257, lr=2.44e-05, step=1497] Training: 15%|█▍ | 1498/10000 [17:22<1:40:40, 1.41it/s, loss=0.0257, lr=2.44e-05, step=1497] Training: 15%|█▍ | 1498/10000 [17:22<1:40:40, 1.41it/s, loss=0.0447, lr=2.44e-05, step=1498] Training: 15%|█▍ | 1499/10000 [17:23<1:32:13, 1.54it/s, loss=0.0447, lr=2.44e-05, step=1498] Training: 15%|█▍ | 1499/10000 [17:23<1:32:13, 1.54it/s, loss=0.0319, lr=2.44e-05, step=1499]16:23:30.579 [I] step=1500 loss=0.0143 smoothed_loss=0.0385 lr=2.44e-05 grad_norm=0.7207 step_time=0.5624s data_time=0.1119s it/s=1.483 eta_to_10000=5731.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0170 grad_action_out_proj=0.1871 grad_shared_expert=0.5396 (10775:train_pytorch.py:850) + Training: 15%|█▌ | 1500/10000 [17:24<1:36:59, 1.46it/s, loss=0.0319, lr=2.44e-05, step=1499] Training: 15%|█▌ | 1500/10000 [17:24<1:36:59, 1.46it/s, loss=0.0143, lr=2.44e-05, step=1500] Training: 15%|█▌ | 1501/10000 [17:24<1:30:17, 1.57it/s, loss=0.0143, lr=2.44e-05, step=1500] Training: 15%|█▌ | 1501/10000 [17:24<1:30:17, 1.57it/s, loss=0.0134, lr=2.44e-05, step=1501] Training: 15%|█▌ | 1502/10000 [17:25<1:25:14, 1.66it/s, loss=0.0134, lr=2.44e-05, step=1501] Training: 15%|█▌ | 1502/10000 [17:25<1:25:14, 1.66it/s, loss=0.0089, lr=2.44e-05, step=1502] Training: 15%|█▌ | 1503/10000 [17:26<1:35:19, 1.49it/s, loss=0.0089, lr=2.44e-05, step=1502] Training: 15%|█▌ | 1503/10000 [17:26<1:35:19, 1.49it/s, loss=0.0280, lr=2.44e-05, step=1503] Training: 15%|█▌ | 1504/10000 [17:26<1:37:43, 1.45it/s, loss=0.0280, lr=2.44e-05, step=1503] Training: 15%|█▌ | 1504/10000 [17:26<1:37:43, 1.45it/s, loss=0.0402, lr=2.44e-05, step=1504] Training: 15%|█▌ | 1505/10000 [17:27<1:42:57, 1.38it/s, loss=0.0402, lr=2.44e-05, step=1504] Training: 15%|█▌ | 1505/10000 [17:27<1:42:57, 1.38it/s, loss=0.0275, lr=2.44e-05, step=1505] Training: 15%|█▌ | 1506/10000 [17:28<1:40:41, 1.41it/s, loss=0.0275, lr=2.44e-05, step=1505] Training: 15%|█▌ | 1506/10000 [17:28<1:40:41, 1.41it/s, loss=0.0294, lr=2.44e-05, step=1506] Training: 15%|█▌ | 1507/10000 [17:28<1:32:16, 1.53it/s, loss=0.0294, lr=2.44e-05, step=1506] Training: 15%|█▌ | 1507/10000 [17:28<1:32:16, 1.53it/s, loss=0.0541, lr=2.44e-05, step=1507] Training: 15%|█▌ | 1508/10000 [17:29<1:42:45, 1.38it/s, loss=0.0541, lr=2.44e-05, step=1507] Training: 15%|█▌ | 1508/10000 [17:29<1:42:45, 1.38it/s, loss=0.0712, lr=2.44e-05, step=1508] Training: 15%|█▌ | 1509/10000 [17:30<1:33:06, 1.52it/s, loss=0.0712, lr=2.44e-05, step=1508] Training: 15%|█▌ | 1509/10000 [17:30<1:33:06, 1.52it/s, loss=0.0595, lr=2.44e-05, step=1509]16:23:37.593 [I] step=1510 loss=0.0106 smoothed_loss=0.0375 lr=2.44e-05 grad_norm=0.8018 step_time=0.5643s data_time=0.1370s it/s=1.426 eta_to_10000=5953.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.1507 grad_shared_expert=0.4706 (10775:train_pytorch.py:850) + Training: 15%|█▌ | 1510/10000 [17:31<1:47:36, 1.32it/s, loss=0.0595, lr=2.44e-05, step=1509] Training: 15%|█▌ | 1510/10000 [17:31<1:47:36, 1.32it/s, loss=0.0106, lr=2.44e-05, step=1510] Training: 15%|█▌ | 1511/10000 [17:31<1:43:19, 1.37it/s, loss=0.0106, lr=2.44e-05, step=1510] Training: 15%|█▌ | 1511/10000 [17:31<1:43:19, 1.37it/s, loss=0.0456, lr=2.44e-05, step=1511] Training: 15%|█▌ | 1512/10000 [17:32<1:51:10, 1.27it/s, loss=0.0456, lr=2.44e-05, step=1511] Training: 15%|█▌ | 1512/10000 [17:32<1:51:10, 1.27it/s, loss=0.0585, lr=2.44e-05, step=1512] Training: 15%|█▌ | 1513/10000 [17:33<1:49:30, 1.29it/s, loss=0.0585, lr=2.44e-05, step=1512] Training: 15%|█▌ | 1513/10000 [17:33<1:49:30, 1.29it/s, loss=0.0536, lr=2.44e-05, step=1513] Training: 15%|█▌ | 1514/10000 [17:34<1:49:36, 1.29it/s, loss=0.0536, lr=2.44e-05, step=1513] Training: 15%|█▌ | 1514/10000 [17:34<1:49:36, 1.29it/s, loss=0.0147, lr=2.44e-05, step=1514] Training: 15%|█▌ | 1515/10000 [17:34<1:39:55, 1.42it/s, loss=0.0147, lr=2.44e-05, step=1514] Training: 15%|█▌ | 1515/10000 [17:34<1:39:55, 1.42it/s, loss=0.0331, lr=2.44e-05, step=1515] Training: 15%|█▌ | 1516/10000 [17:35<1:31:51, 1.54it/s, loss=0.0331, lr=2.44e-05, step=1515] Training: 15%|█▌ | 1516/10000 [17:35<1:31:51, 1.54it/s, loss=0.0241, lr=2.44e-05, step=1516] Training: 15%|█▌ | 1517/10000 [17:35<1:28:26, 1.60it/s, loss=0.0241, lr=2.44e-05, step=1516] Training: 15%|█▌ | 1517/10000 [17:35<1:28:26, 1.60it/s, loss=0.0242, lr=2.44e-05, step=1517] Training: 15%|█▌ | 1518/10000 [17:36<1:32:00, 1.54it/s, loss=0.0242, lr=2.44e-05, step=1517] Training: 15%|█▌ | 1518/10000 [17:36<1:32:00, 1.54it/s, loss=0.0330, lr=2.44e-05, step=1518] Training: 15%|█▌ | 1519/10000 [17:37<1:36:42, 1.46it/s, loss=0.0330, lr=2.44e-05, step=1518] Training: 15%|█▌ | 1519/10000 [17:37<1:36:42, 1.46it/s, loss=0.0098, lr=2.44e-05, step=1519]16:23:44.337 [I] step=1520 loss=0.0591 smoothed_loss=0.0355 lr=2.44e-05 grad_norm=0.6378 step_time=0.5698s data_time=0.1046s it/s=1.483 eta_to_10000=5718.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0243 grad_action_out_proj=0.1991 grad_shared_expert=0.5808 (10775:train_pytorch.py:850) + Training: 15%|█▌ | 1520/10000 [17:37<1:30:33, 1.56it/s, loss=0.0098, lr=2.44e-05, step=1519] Training: 15%|█▌ | 1520/10000 [17:37<1:30:33, 1.56it/s, loss=0.0591, lr=2.44e-05, step=1520] Training: 15%|█▌ | 1521/10000 [17:38<1:24:20, 1.68it/s, loss=0.0591, lr=2.44e-05, step=1520] Training: 15%|█▌ | 1521/10000 [17:38<1:24:20, 1.68it/s, loss=0.0541, lr=2.44e-05, step=1521] Training: 15%|█▌ | 1522/10000 [17:38<1:19:59, 1.77it/s, loss=0.0541, lr=2.44e-05, step=1521] Training: 15%|█▌ | 1522/10000 [17:38<1:19:59, 1.77it/s, loss=0.0441, lr=2.44e-05, step=1522] Training: 15%|█▌ | 1523/10000 [17:39<1:22:36, 1.71it/s, loss=0.0441, lr=2.44e-05, step=1522] Training: 15%|█▌ | 1523/10000 [17:39<1:22:36, 1.71it/s, loss=0.0247, lr=2.44e-05, step=1523] Training: 15%|█▌ | 1524/10000 [17:40<1:19:19, 1.78it/s, loss=0.0247, lr=2.44e-05, step=1523] Training: 15%|█▌ | 1524/10000 [17:40<1:19:19, 1.78it/s, loss=0.0159, lr=2.44e-05, step=1524] Training: 15%|█▌ | 1525/10000 [17:40<1:33:03, 1.52it/s, loss=0.0159, lr=2.44e-05, step=1524] Training: 15%|█▌ | 1525/10000 [17:40<1:33:03, 1.52it/s, loss=0.0391, lr=2.44e-05, step=1525] Training: 15%|█▌ | 1526/10000 [17:41<1:37:11, 1.45it/s, loss=0.0391, lr=2.44e-05, step=1525] Training: 15%|█▌ | 1526/10000 [17:41<1:37:11, 1.45it/s, loss=0.0216, lr=2.44e-05, step=1526] Training: 15%|█▌ | 1527/10000 [17:42<1:32:20, 1.53it/s, loss=0.0216, lr=2.44e-05, step=1526] Training: 15%|█▌ | 1527/10000 [17:42<1:32:20, 1.53it/s, loss=0.0447, lr=2.44e-05, step=1527] Training: 15%|█▌ | 1528/10000 [17:42<1:29:15, 1.58it/s, loss=0.0447, lr=2.44e-05, step=1527] Training: 15%|█▌ | 1528/10000 [17:42<1:29:15, 1.58it/s, loss=0.0740, lr=2.44e-05, step=1528] Training: 15%|█▌ | 1529/10000 [17:43<1:24:01, 1.68it/s, loss=0.0740, lr=2.44e-05, step=1528] Training: 15%|█▌ | 1529/10000 [17:43<1:24:01, 1.68it/s, loss=0.0178, lr=2.44e-05, step=1529]16:23:50.322 [I] step=1530 loss=0.0429 smoothed_loss=0.0373 lr=2.44e-05 grad_norm=0.6415 step_time=0.5148s data_time=0.0837s it/s=1.671 eta_to_10000=5069.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1670 grad_shared_expert=0.7843 (10775:train_pytorch.py:850) + Training: 15%|█▌ | 1530/10000 [17:43<1:22:18, 1.71it/s, loss=0.0178, lr=2.44e-05, step=1529] Training: 15%|█▌ | 1530/10000 [17:43<1:22:18, 1.71it/s, loss=0.0429, lr=2.44e-05, step=1530] Training: 15%|█▌ | 1531/10000 [17:44<1:27:13, 1.62it/s, loss=0.0429, lr=2.44e-05, step=1530] Training: 15%|█▌ | 1531/10000 [17:44<1:27:13, 1.62it/s, loss=0.0099, lr=2.44e-05, step=1531] Training: 15%|█▌ | 1532/10000 [17:45<1:22:27, 1.71it/s, loss=0.0099, lr=2.44e-05, step=1531] Training: 15%|█▌ | 1532/10000 [17:45<1:22:27, 1.71it/s, loss=0.0176, lr=2.44e-05, step=1532] Training: 15%|█▌ | 1533/10000 [17:45<1:33:54, 1.50it/s, loss=0.0176, lr=2.44e-05, step=1532] Training: 15%|█▌ | 1533/10000 [17:45<1:33:54, 1.50it/s, loss=0.0220, lr=2.44e-05, step=1533] Training: 15%|█▌ | 1534/10000 [17:46<1:37:10, 1.45it/s, loss=0.0220, lr=2.44e-05, step=1533] Training: 15%|█▌ | 1534/10000 [17:46<1:37:10, 1.45it/s, loss=0.0219, lr=2.43e-05, step=1534] Training: 15%|█▌ | 1535/10000 [17:47<1:29:36, 1.57it/s, loss=0.0219, lr=2.43e-05, step=1534] Training: 15%|█▌ | 1535/10000 [17:47<1:29:36, 1.57it/s, loss=0.0388, lr=2.43e-05, step=1535] Training: 15%|█▌ | 1536/10000 [17:47<1:24:01, 1.68it/s, loss=0.0388, lr=2.43e-05, step=1535] Training: 15%|█▌ | 1536/10000 [17:47<1:24:01, 1.68it/s, loss=0.0346, lr=2.43e-05, step=1536] Training: 15%|█▌ | 1537/10000 [17:48<1:19:36, 1.77it/s, loss=0.0346, lr=2.43e-05, step=1536] Training: 15%|█▌ | 1537/10000 [17:48<1:19:36, 1.77it/s, loss=0.0243, lr=2.43e-05, step=1537] Training: 15%|█▌ | 1538/10000 [17:48<1:16:58, 1.83it/s, loss=0.0243, lr=2.43e-05, step=1537] Training: 15%|█▌ | 1538/10000 [17:48<1:16:58, 1.83it/s, loss=0.0633, lr=2.43e-05, step=1538] Training: 15%|█▌ | 1539/10000 [17:49<1:15:54, 1.86it/s, loss=0.0633, lr=2.43e-05, step=1538] Training: 15%|█▌ | 1539/10000 [17:49<1:15:54, 1.86it/s, loss=0.0161, lr=2.43e-05, step=1539]16:23:56.372 [I] step=1540 loss=0.0303 smoothed_loss=0.0323 lr=2.43e-05 grad_norm=0.6002 step_time=0.5297s data_time=0.0752s it/s=1.653 eta_to_10000=5116.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.1322 grad_shared_expert=0.7165 (10775:train_pytorch.py:850) + Training: 15%|█▌ | 1540/10000 [17:49<1:23:29, 1.69it/s, loss=0.0161, lr=2.43e-05, step=1539] Training: 15%|█▌ | 1540/10000 [17:49<1:23:29, 1.69it/s, loss=0.0303, lr=2.43e-05, step=1540] Training: 15%|█▌ | 1541/10000 [17:50<1:34:10, 1.50it/s, loss=0.0303, lr=2.43e-05, step=1540] Training: 15%|█▌ | 1541/10000 [17:50<1:34:10, 1.50it/s, loss=0.0078, lr=2.43e-05, step=1541] Training: 15%|█▌ | 1542/10000 [17:51<1:26:58, 1.62it/s, loss=0.0078, lr=2.43e-05, step=1541] Training: 15%|█▌ | 1542/10000 [17:51<1:26:58, 1.62it/s, loss=0.0265, lr=2.43e-05, step=1542] Training: 15%|█▌ | 1543/10000 [17:51<1:27:40, 1.61it/s, loss=0.0265, lr=2.43e-05, step=1542] Training: 15%|█▌ | 1543/10000 [17:51<1:27:40, 1.61it/s, loss=0.1133, lr=2.43e-05, step=1543] Training: 15%|█▌ | 1544/10000 [17:52<1:22:46, 1.70it/s, loss=0.1133, lr=2.43e-05, step=1543] Training: 15%|█▌ | 1544/10000 [17:52<1:22:46, 1.70it/s, loss=0.0870, lr=2.43e-05, step=1544] Training: 15%|█▌ | 1545/10000 [17:52<1:19:23, 1.78it/s, loss=0.0870, lr=2.43e-05, step=1544] Training: 15%|█▌ | 1545/10000 [17:52<1:19:23, 1.78it/s, loss=0.0823, lr=2.43e-05, step=1545] Training: 15%|█▌ | 1546/10000 [17:53<1:17:40, 1.81it/s, loss=0.0823, lr=2.43e-05, step=1545] Training: 15%|█▌ | 1546/10000 [17:53<1:17:40, 1.81it/s, loss=0.0668, lr=2.43e-05, step=1546] Training: 15%|█▌ | 1547/10000 [17:54<1:23:31, 1.69it/s, loss=0.0668, lr=2.43e-05, step=1546] Training: 15%|█▌ | 1547/10000 [17:54<1:23:31, 1.69it/s, loss=0.1009, lr=2.43e-05, step=1547] Training: 15%|█▌ | 1548/10000 [17:55<1:36:59, 1.45it/s, loss=0.1009, lr=2.43e-05, step=1547] Training: 15%|█▌ | 1548/10000 [17:55<1:36:59, 1.45it/s, loss=0.0532, lr=2.43e-05, step=1548] Training: 15%|█▌ | 1549/10000 [17:55<1:30:24, 1.56it/s, loss=0.0532, lr=2.43e-05, step=1548] Training: 15%|█▌ | 1549/10000 [17:55<1:30:24, 1.56it/s, loss=0.0230, lr=2.43e-05, step=1549]16:24:02.554 [I] step=1550 loss=0.0445 smoothed_loss=0.0502 lr=2.43e-05 grad_norm=0.7770 step_time=0.5266s data_time=0.0916s it/s=1.618 eta_to_10000=5223.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0151 grad_action_out_proj=0.2314 grad_shared_expert=0.6467 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1550/10000 [17:56<1:25:50, 1.64it/s, loss=0.0230, lr=2.43e-05, step=1549] Training: 16%|█▌ | 1550/10000 [17:56<1:25:50, 1.64it/s, loss=0.0445, lr=2.43e-05, step=1550] Training: 16%|█▌ | 1551/10000 [17:56<1:21:26, 1.73it/s, loss=0.0445, lr=2.43e-05, step=1550] Training: 16%|█▌ | 1551/10000 [17:56<1:21:26, 1.73it/s, loss=0.0501, lr=2.43e-05, step=1551] Training: 16%|█▌ | 1552/10000 [17:57<1:19:12, 1.78it/s, loss=0.0501, lr=2.43e-05, step=1551] Training: 16%|█▌ | 1552/10000 [17:57<1:19:12, 1.78it/s, loss=0.0305, lr=2.43e-05, step=1552] Training: 16%|█▌ | 1553/10000 [17:57<1:16:21, 1.84it/s, loss=0.0305, lr=2.43e-05, step=1552] Training: 16%|█▌ | 1553/10000 [17:57<1:16:21, 1.84it/s, loss=0.0145, lr=2.43e-05, step=1553] Training: 16%|█▌ | 1554/10000 [17:58<1:27:28, 1.61it/s, loss=0.0145, lr=2.43e-05, step=1553] Training: 16%|█▌ | 1554/10000 [17:58<1:27:28, 1.61it/s, loss=0.0236, lr=2.43e-05, step=1554] Training: 16%|█▌ | 1555/10000 [17:59<1:33:00, 1.51it/s, loss=0.0236, lr=2.43e-05, step=1554] Training: 16%|█▌ | 1555/10000 [17:59<1:33:00, 1.51it/s, loss=0.0329, lr=2.43e-05, step=1555] Training: 16%|█▌ | 1556/10000 [17:59<1:25:48, 1.64it/s, loss=0.0329, lr=2.43e-05, step=1555] Training: 16%|█▌ | 1556/10000 [17:59<1:25:48, 1.64it/s, loss=0.0947, lr=2.43e-05, step=1556] Training: 16%|█▌ | 1557/10000 [18:00<1:20:33, 1.75it/s, loss=0.0947, lr=2.43e-05, step=1556] Training: 16%|█▌ | 1557/10000 [18:00<1:20:33, 1.75it/s, loss=0.0248, lr=2.43e-05, step=1557] Training: 16%|█▌ | 1558/10000 [18:00<1:17:15, 1.82it/s, loss=0.0248, lr=2.43e-05, step=1557] Training: 16%|█▌ | 1558/10000 [18:00<1:17:15, 1.82it/s, loss=0.0230, lr=2.43e-05, step=1558] Training: 16%|█▌ | 1559/10000 [18:01<1:14:38, 1.88it/s, loss=0.0230, lr=2.43e-05, step=1558] Training: 16%|█▌ | 1559/10000 [18:01<1:14:38, 1.88it/s, loss=0.0437, lr=2.43e-05, step=1559]16:24:08.423 [I] step=1560 loss=0.0273 smoothed_loss=0.0412 lr=2.43e-05 grad_norm=0.6909 step_time=0.5162s data_time=0.0708s it/s=1.704 eta_to_10000=4952.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0170 grad_action_out_proj=0.1870 grad_shared_expert=0.6174 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1560/10000 [18:01<1:27:05, 1.62it/s, loss=0.0437, lr=2.43e-05, step=1559] Training: 16%|█▌ | 1560/10000 [18:01<1:27:05, 1.62it/s, loss=0.0273, lr=2.43e-05, step=1560] Training: 16%|█▌ | 1561/10000 [18:02<1:22:07, 1.71it/s, loss=0.0273, lr=2.43e-05, step=1560] Training: 16%|█▌ | 1561/10000 [18:02<1:22:07, 1.71it/s, loss=0.0141, lr=2.43e-05, step=1561] Training: 16%|█▌ | 1562/10000 [18:03<1:31:58, 1.53it/s, loss=0.0141, lr=2.43e-05, step=1561] Training: 16%|█▌ | 1562/10000 [18:03<1:31:58, 1.53it/s, loss=0.0327, lr=2.43e-05, step=1562] Training: 16%|█▌ | 1563/10000 [18:03<1:26:03, 1.63it/s, loss=0.0327, lr=2.43e-05, step=1562] Training: 16%|█▌ | 1563/10000 [18:03<1:26:03, 1.63it/s, loss=0.1365, lr=2.43e-05, step=1563] Training: 16%|█▌ | 1564/10000 [18:04<1:24:36, 1.66it/s, loss=0.1365, lr=2.43e-05, step=1563] Training: 16%|█▌ | 1564/10000 [18:04<1:24:36, 1.66it/s, loss=0.0623, lr=2.43e-05, step=1564] Training: 16%|█▌ | 1565/10000 [18:04<1:20:47, 1.74it/s, loss=0.0623, lr=2.43e-05, step=1564] Training: 16%|█▌ | 1565/10000 [18:04<1:20:47, 1.74it/s, loss=0.0449, lr=2.43e-05, step=1565] Training: 16%|█▌ | 1566/10000 [18:05<1:17:04, 1.82it/s, loss=0.0449, lr=2.43e-05, step=1565] Training: 16%|█▌ | 1566/10000 [18:05<1:17:04, 1.82it/s, loss=0.0527, lr=2.43e-05, step=1566] Training: 16%|█▌ | 1567/10000 [18:06<1:23:35, 1.68it/s, loss=0.0527, lr=2.43e-05, step=1566] Training: 16%|█▌ | 1567/10000 [18:06<1:23:35, 1.68it/s, loss=0.0448, lr=2.43e-05, step=1567] Training: 16%|█▌ | 1568/10000 [18:06<1:19:55, 1.76it/s, loss=0.0448, lr=2.43e-05, step=1567] Training: 16%|█▌ | 1568/10000 [18:06<1:19:55, 1.76it/s, loss=0.0182, lr=2.43e-05, step=1568] Training: 16%|█▌ | 1569/10000 [18:07<1:34:17, 1.49it/s, loss=0.0182, lr=2.43e-05, step=1568] Training: 16%|█▌ | 1569/10000 [18:07<1:34:17, 1.49it/s, loss=0.0259, lr=2.43e-05, step=1569]16:24:14.488 [I] step=1570 loss=0.0126 smoothed_loss=0.0406 lr=2.43e-05 grad_norm=0.7748 step_time=0.5314s data_time=0.0750s it/s=1.649 eta_to_10000=5112.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0221 grad_action_out_proj=0.2688 grad_shared_expert=1.0956 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1570/10000 [18:08<1:28:33, 1.59it/s, loss=0.0259, lr=2.43e-05, step=1569] Training: 16%|█▌ | 1570/10000 [18:08<1:28:33, 1.59it/s, loss=0.0126, lr=2.43e-05, step=1570] Training: 16%|█▌ | 1571/10000 [18:08<1:22:51, 1.70it/s, loss=0.0126, lr=2.43e-05, step=1570] Training: 16%|█▌ | 1571/10000 [18:08<1:22:51, 1.70it/s, loss=0.0392, lr=2.43e-05, step=1571] Training: 16%|█▌ | 1572/10000 [18:09<1:26:57, 1.62it/s, loss=0.0392, lr=2.43e-05, step=1571] Training: 16%|█▌ | 1572/10000 [18:09<1:26:57, 1.62it/s, loss=0.0288, lr=2.43e-05, step=1572] Training: 16%|█▌ | 1573/10000 [18:09<1:22:13, 1.71it/s, loss=0.0288, lr=2.43e-05, step=1572] Training: 16%|█▌ | 1573/10000 [18:09<1:22:13, 1.71it/s, loss=0.0405, lr=2.43e-05, step=1573] Training: 16%|█▌ | 1574/10000 [18:10<1:27:30, 1.60it/s, loss=0.0405, lr=2.43e-05, step=1573] Training: 16%|█▌ | 1574/10000 [18:10<1:27:30, 1.60it/s, loss=0.0715, lr=2.43e-05, step=1574] Training: 16%|█▌ | 1575/10000 [18:10<1:23:56, 1.67it/s, loss=0.0715, lr=2.43e-05, step=1574] Training: 16%|█▌ | 1575/10000 [18:10<1:23:56, 1.67it/s, loss=0.0261, lr=2.43e-05, step=1575] Training: 16%|█▌ | 1576/10000 [18:11<1:20:20, 1.75it/s, loss=0.0261, lr=2.43e-05, step=1575] Training: 16%|█▌ | 1576/10000 [18:11<1:20:20, 1.75it/s, loss=0.0414, lr=2.43e-05, step=1576] Training: 16%|█▌ | 1577/10000 [18:12<1:26:26, 1.62it/s, loss=0.0414, lr=2.43e-05, step=1576] Training: 16%|█▌ | 1577/10000 [18:12<1:26:26, 1.62it/s, loss=0.0330, lr=2.43e-05, step=1577] Training: 16%|█▌ | 1578/10000 [18:12<1:21:55, 1.71it/s, loss=0.0330, lr=2.43e-05, step=1577] Training: 16%|█▌ | 1578/10000 [18:12<1:21:55, 1.71it/s, loss=0.0066, lr=2.43e-05, step=1578] Training: 16%|█▌ | 1579/10000 [18:13<1:18:28, 1.79it/s, loss=0.0066, lr=2.43e-05, step=1578] Training: 16%|█▌ | 1579/10000 [18:13<1:18:28, 1.79it/s, loss=0.0286, lr=2.43e-05, step=1579]16:24:20.193 [I] step=1580 loss=0.0174 smoothed_loss=0.0342 lr=2.43e-05 grad_norm=0.6772 step_time=0.4927s data_time=0.0778s it/s=1.753 eta_to_10000=4802.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0098 grad_action_out_proj=0.1499 grad_shared_expert=0.5048 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1580/10000 [18:13<1:17:01, 1.82it/s, loss=0.0286, lr=2.43e-05, step=1579] Training: 16%|█▌ | 1580/10000 [18:13<1:17:01, 1.82it/s, loss=0.0174, lr=2.43e-05, step=1580] Training: 16%|█▌ | 1581/10000 [18:14<1:17:02, 1.82it/s, loss=0.0174, lr=2.43e-05, step=1580] Training: 16%|█▌ | 1581/10000 [18:14<1:17:02, 1.82it/s, loss=0.0975, lr=2.43e-05, step=1581] Training: 16%|█▌ | 1582/10000 [18:15<1:23:29, 1.68it/s, loss=0.0975, lr=2.43e-05, step=1581] Training: 16%|█▌ | 1582/10000 [18:15<1:23:29, 1.68it/s, loss=0.0263, lr=2.43e-05, step=1582] Training: 16%|█▌ | 1583/10000 [18:15<1:29:49, 1.56it/s, loss=0.0263, lr=2.43e-05, step=1582] Training: 16%|█▌ | 1583/10000 [18:15<1:29:49, 1.56it/s, loss=0.0229, lr=2.43e-05, step=1583] Training: 16%|█▌ | 1584/10000 [18:16<1:37:19, 1.44it/s, loss=0.0229, lr=2.43e-05, step=1583] Training: 16%|█▌ | 1584/10000 [18:16<1:37:19, 1.44it/s, loss=0.0284, lr=2.43e-05, step=1584] Training: 16%|█▌ | 1585/10000 [18:17<1:39:57, 1.40it/s, loss=0.0284, lr=2.43e-05, step=1584] Training: 16%|█▌ | 1585/10000 [18:17<1:39:57, 1.40it/s, loss=0.0090, lr=2.43e-05, step=1585] Training: 16%|█▌ | 1586/10000 [18:17<1:32:55, 1.51it/s, loss=0.0090, lr=2.43e-05, step=1585] Training: 16%|█▌ | 1586/10000 [18:17<1:32:55, 1.51it/s, loss=0.0375, lr=2.43e-05, step=1586] Training: 16%|█▌ | 1587/10000 [18:18<1:25:38, 1.64it/s, loss=0.0375, lr=2.43e-05, step=1586] Training: 16%|█▌ | 1587/10000 [18:18<1:25:38, 1.64it/s, loss=0.0252, lr=2.43e-05, step=1587] Training: 16%|█▌ | 1588/10000 [18:18<1:20:47, 1.74it/s, loss=0.0252, lr=2.43e-05, step=1587] Training: 16%|█▌ | 1588/10000 [18:18<1:20:47, 1.74it/s, loss=0.0139, lr=2.43e-05, step=1588] Training: 16%|█▌ | 1589/10000 [18:19<1:17:04, 1.82it/s, loss=0.0139, lr=2.43e-05, step=1588] Training: 16%|█▌ | 1589/10000 [18:19<1:17:04, 1.82it/s, loss=0.0083, lr=2.43e-05, step=1589]16:24:26.519 [I] step=1590 loss=0.0244 smoothed_loss=0.0286 lr=2.43e-05 grad_norm=0.6487 step_time=0.5330s data_time=0.0995s it/s=1.581 eta_to_10000=5318.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0104 grad_action_out_proj=0.2018 grad_shared_expert=0.4577 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1590/10000 [18:20<1:24:44, 1.65it/s, loss=0.0083, lr=2.43e-05, step=1589] Training: 16%|█▌ | 1590/10000 [18:20<1:24:44, 1.65it/s, loss=0.0244, lr=2.43e-05, step=1590] Training: 16%|█▌ | 1591/10000 [18:20<1:30:01, 1.56it/s, loss=0.0244, lr=2.43e-05, step=1590] Training: 16%|█▌ | 1591/10000 [18:20<1:30:01, 1.56it/s, loss=0.0600, lr=2.43e-05, step=1591] Training: 16%|█▌ | 1592/10000 [18:21<1:23:30, 1.68it/s, loss=0.0600, lr=2.43e-05, step=1591] Training: 16%|█▌ | 1592/10000 [18:21<1:23:30, 1.68it/s, loss=0.0098, lr=2.43e-05, step=1592] Training: 16%|█▌ | 1593/10000 [18:21<1:19:37, 1.76it/s, loss=0.0098, lr=2.43e-05, step=1592] Training: 16%|█▌ | 1593/10000 [18:21<1:19:37, 1.76it/s, loss=0.0762, lr=2.43e-05, step=1593] Training: 16%|█▌ | 1594/10000 [18:22<1:17:51, 1.80it/s, loss=0.0762, lr=2.43e-05, step=1593] Training: 16%|█▌ | 1594/10000 [18:22<1:17:51, 1.80it/s, loss=0.0168, lr=2.43e-05, step=1594] Training: 16%|█▌ | 1595/10000 [18:23<1:23:39, 1.67it/s, loss=0.0168, lr=2.43e-05, step=1594] Training: 16%|█▌ | 1595/10000 [18:23<1:23:39, 1.67it/s, loss=0.0361, lr=2.43e-05, step=1595] Training: 16%|█▌ | 1596/10000 [18:23<1:21:09, 1.73it/s, loss=0.0361, lr=2.43e-05, step=1595] Training: 16%|█▌ | 1596/10000 [18:23<1:21:09, 1.73it/s, loss=0.0138, lr=2.43e-05, step=1596] Training: 16%|█▌ | 1597/10000 [18:24<1:26:24, 1.62it/s, loss=0.0138, lr=2.43e-05, step=1596] Training: 16%|█▌ | 1597/10000 [18:24<1:26:24, 1.62it/s, loss=0.0304, lr=2.43e-05, step=1597] Training: 16%|█▌ | 1598/10000 [18:25<1:32:22, 1.52it/s, loss=0.0304, lr=2.43e-05, step=1597] Training: 16%|█▌ | 1598/10000 [18:25<1:32:22, 1.52it/s, loss=0.0124, lr=2.43e-05, step=1598] Training: 16%|█▌ | 1599/10000 [18:25<1:26:05, 1.63it/s, loss=0.0124, lr=2.43e-05, step=1598] Training: 16%|█▌ | 1599/10000 [18:25<1:26:05, 1.63it/s, loss=0.0354, lr=2.43e-05, step=1599]16:24:32.575 [I] step=1600 loss=0.0275 smoothed_loss=0.0294 lr=2.43e-05 grad_norm=0.6713 step_time=0.5257s data_time=0.0799s it/s=1.651 eta_to_10000=5086.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1598 grad_shared_expert=0.6776 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1600/10000 [18:26<1:25:33, 1.64it/s, loss=0.0354, lr=2.43e-05, step=1599] Training: 16%|█▌ | 1600/10000 [18:26<1:25:33, 1.64it/s, loss=0.0275, lr=2.43e-05, step=1600] Training: 16%|█▌ | 1601/10000 [18:26<1:21:07, 1.73it/s, loss=0.0275, lr=2.43e-05, step=1600] Training: 16%|█▌ | 1601/10000 [18:26<1:21:07, 1.73it/s, loss=0.0271, lr=2.43e-05, step=1601] Training: 16%|█▌ | 1602/10000 [18:27<1:17:57, 1.80it/s, loss=0.0271, lr=2.43e-05, step=1601] Training: 16%|█▌ | 1602/10000 [18:27<1:17:57, 1.80it/s, loss=0.0358, lr=2.43e-05, step=1602] Training: 16%|█▌ | 1603/10000 [18:27<1:15:45, 1.85it/s, loss=0.0358, lr=2.43e-05, step=1602] Training: 16%|█▌ | 1603/10000 [18:27<1:15:45, 1.85it/s, loss=0.0393, lr=2.43e-05, step=1603] Training: 16%|█▌ | 1604/10000 [18:28<1:23:08, 1.68it/s, loss=0.0393, lr=2.43e-05, step=1603] Training: 16%|█▌ | 1604/10000 [18:28<1:23:08, 1.68it/s, loss=0.0313, lr=2.43e-05, step=1604] Training: 16%|█▌ | 1605/10000 [18:29<1:30:13, 1.55it/s, loss=0.0313, lr=2.43e-05, step=1604] Training: 16%|█▌ | 1605/10000 [18:29<1:30:13, 1.55it/s, loss=0.0134, lr=2.43e-05, step=1605] Training: 16%|█▌ | 1606/10000 [18:29<1:23:55, 1.67it/s, loss=0.0134, lr=2.43e-05, step=1605] Training: 16%|█▌ | 1606/10000 [18:29<1:23:55, 1.67it/s, loss=0.2486, lr=2.43e-05, step=1606] Training: 16%|█▌ | 1607/10000 [18:30<1:19:43, 1.75it/s, loss=0.2486, lr=2.43e-05, step=1606] Training: 16%|█▌ | 1607/10000 [18:30<1:19:43, 1.75it/s, loss=0.0250, lr=2.43e-05, step=1607] Training: 16%|█▌ | 1608/10000 [18:30<1:16:59, 1.82it/s, loss=0.0250, lr=2.43e-05, step=1607] Training: 16%|█▌ | 1608/10000 [18:30<1:16:59, 1.82it/s, loss=0.0329, lr=2.43e-05, step=1608] Training: 16%|█▌ | 1609/10000 [18:31<1:15:27, 1.85it/s, loss=0.0329, lr=2.43e-05, step=1608] Training: 16%|█▌ | 1609/10000 [18:31<1:15:27, 1.85it/s, loss=0.0117, lr=2.43e-05, step=1609]16:24:38.231 [I] step=1610 loss=0.0087 smoothed_loss=0.0399 lr=2.43e-05 grad_norm=0.7639 step_time=0.4945s data_time=0.0711s it/s=1.768 eta_to_10000=4744.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0127 grad_action_out_proj=0.2163 grad_shared_expert=0.5902 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1610/10000 [18:31<1:19:56, 1.75it/s, loss=0.0117, lr=2.43e-05, step=1609] Training: 16%|█▌ | 1610/10000 [18:31<1:19:56, 1.75it/s, loss=0.0087, lr=2.43e-05, step=1610] Training: 16%|█▌ | 1611/10000 [18:32<1:16:56, 1.82it/s, loss=0.0087, lr=2.43e-05, step=1610] Training: 16%|█▌ | 1611/10000 [18:32<1:16:56, 1.82it/s, loss=0.0278, lr=2.43e-05, step=1611] Training: 16%|█▌ | 1612/10000 [18:33<1:26:58, 1.61it/s, loss=0.0278, lr=2.43e-05, step=1611] Training: 16%|█▌ | 1612/10000 [18:33<1:26:58, 1.61it/s, loss=0.0515, lr=2.42e-05, step=1612] Training: 16%|█▌ | 1613/10000 [18:33<1:21:46, 1.71it/s, loss=0.0515, lr=2.42e-05, step=1612] Training: 16%|█▌ | 1613/10000 [18:33<1:21:46, 1.71it/s, loss=0.0242, lr=2.42e-05, step=1613] Training: 16%|█▌ | 1614/10000 [18:34<1:24:12, 1.66it/s, loss=0.0242, lr=2.42e-05, step=1613] Training: 16%|█▌ | 1614/10000 [18:34<1:24:12, 1.66it/s, loss=0.0096, lr=2.42e-05, step=1614] Training: 16%|█▌ | 1615/10000 [18:35<1:32:41, 1.51it/s, loss=0.0096, lr=2.42e-05, step=1614] Training: 16%|█▌ | 1615/10000 [18:35<1:32:41, 1.51it/s, loss=0.0151, lr=2.42e-05, step=1615] Training: 16%|█▌ | 1616/10000 [18:35<1:27:43, 1.59it/s, loss=0.0151, lr=2.42e-05, step=1615] Training: 16%|█▌ | 1616/10000 [18:35<1:27:43, 1.59it/s, loss=0.0383, lr=2.42e-05, step=1616] Training: 16%|█▌ | 1617/10000 [18:36<1:22:24, 1.70it/s, loss=0.0383, lr=2.42e-05, step=1616] Training: 16%|█▌ | 1617/10000 [18:36<1:22:24, 1.70it/s, loss=0.0816, lr=2.42e-05, step=1617] Training: 16%|█▌ | 1618/10000 [18:36<1:18:02, 1.79it/s, loss=0.0816, lr=2.42e-05, step=1617] Training: 16%|█▌ | 1618/10000 [18:36<1:18:02, 1.79it/s, loss=0.0576, lr=2.42e-05, step=1618] Training: 16%|█▌ | 1619/10000 [18:37<1:23:00, 1.68it/s, loss=0.0576, lr=2.42e-05, step=1618] Training: 16%|█▌ | 1619/10000 [18:37<1:23:00, 1.68it/s, loss=0.0204, lr=2.42e-05, step=1619]16:24:44.435 [I] step=1620 loss=0.0264 smoothed_loss=0.0374 lr=2.42e-05 grad_norm=0.7240 step_time=0.5285s data_time=0.0919s it/s=1.612 eta_to_10000=5198.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.2483 grad_shared_expert=1.5752 (10775:train_pytorch.py:850) + Training: 16%|█▌ | 1620/10000 [18:38<1:29:49, 1.55it/s, loss=0.0204, lr=2.42e-05, step=1619] Training: 16%|█▌ | 1620/10000 [18:38<1:29:49, 1.55it/s, loss=0.0264, lr=2.42e-05, step=1620] Training: 16%|█▌ | 1621/10000 [18:38<1:24:05, 1.66it/s, loss=0.0264, lr=2.42e-05, step=1620] Training: 16%|█▌ | 1621/10000 [18:38<1:24:05, 1.66it/s, loss=0.0439, lr=2.42e-05, step=1621] Training: 16%|█▌ | 1622/10000 [18:39<1:19:57, 1.75it/s, loss=0.0439, lr=2.42e-05, step=1621] Training: 16%|█▌ | 1622/10000 [18:39<1:19:57, 1.75it/s, loss=0.0108, lr=2.42e-05, step=1622] Training: 16%|█▌ | 1623/10000 [18:39<1:20:19, 1.74it/s, loss=0.0108, lr=2.42e-05, step=1622] Training: 16%|█▌ | 1623/10000 [18:39<1:20:19, 1.74it/s, loss=0.0292, lr=2.42e-05, step=1623] Training: 16%|█▌ | 1624/10000 [18:40<1:21:18, 1.72it/s, loss=0.0292, lr=2.42e-05, step=1623] Training: 16%|█▌ | 1624/10000 [18:40<1:21:18, 1.72it/s, loss=0.0384, lr=2.42e-05, step=1624] Training: 16%|█▋ | 1625/10000 [18:40<1:19:57, 1.75it/s, loss=0.0384, lr=2.42e-05, step=1624] Training: 16%|█▋ | 1625/10000 [18:40<1:19:57, 1.75it/s, loss=0.0534, lr=2.42e-05, step=1625] Training: 16%|█▋ | 1626/10000 [18:41<1:29:29, 1.56it/s, loss=0.0534, lr=2.42e-05, step=1625] Training: 16%|█▋ | 1626/10000 [18:41<1:29:29, 1.56it/s, loss=0.0310, lr=2.42e-05, step=1626] Training: 16%|█▋ | 1627/10000 [18:42<1:40:32, 1.39it/s, loss=0.0310, lr=2.42e-05, step=1626] Training: 16%|█▋ | 1627/10000 [18:42<1:40:32, 1.39it/s, loss=0.0352, lr=2.42e-05, step=1627] Training: 16%|█▋ | 1628/10000 [18:43<1:35:39, 1.46it/s, loss=0.0352, lr=2.42e-05, step=1627] Training: 16%|█▋ | 1628/10000 [18:43<1:35:39, 1.46it/s, loss=0.0313, lr=2.42e-05, step=1628] Training: 16%|█▋ | 1629/10000 [18:43<1:43:46, 1.34it/s, loss=0.0313, lr=2.42e-05, step=1628] Training: 16%|█▋ | 1629/10000 [18:43<1:43:46, 1.34it/s, loss=0.0508, lr=2.42e-05, step=1629]16:24:50.894 [I] step=1630 loss=0.0280 smoothed_loss=0.0363 lr=2.42e-05 grad_norm=0.6313 step_time=0.5592s data_time=0.0866s it/s=1.549 eta_to_10000=5404.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0158 grad_action_out_proj=0.1951 grad_shared_expert=0.6722 (10775:train_pytorch.py:850) + Training: 16%|█▋ | 1630/10000 [18:44<1:34:41, 1.47it/s, loss=0.0508, lr=2.42e-05, step=1629] Training: 16%|█▋ | 1630/10000 [18:44<1:34:41, 1.47it/s, loss=0.0280, lr=2.42e-05, step=1630] Training: 16%|█▋ | 1631/10000 [18:44<1:26:33, 1.61it/s, loss=0.0280, lr=2.42e-05, step=1630] Training: 16%|█▋ | 1631/10000 [18:44<1:26:33, 1.61it/s, loss=0.0190, lr=2.42e-05, step=1631] Training: 16%|█▋ | 1632/10000 [18:45<1:21:54, 1.70it/s, loss=0.0190, lr=2.42e-05, step=1631] Training: 16%|█▋ | 1632/10000 [18:45<1:21:54, 1.70it/s, loss=0.0183, lr=2.42e-05, step=1632] Training: 16%|█▋ | 1633/10000 [18:46<1:31:16, 1.53it/s, loss=0.0183, lr=2.42e-05, step=1632] Training: 16%|█▋ | 1633/10000 [18:46<1:31:16, 1.53it/s, loss=0.0352, lr=2.42e-05, step=1633] Training: 16%|█▋ | 1634/10000 [18:46<1:33:08, 1.50it/s, loss=0.0352, lr=2.42e-05, step=1633] Training: 16%|█▋ | 1634/10000 [18:46<1:33:08, 1.50it/s, loss=0.0168, lr=2.42e-05, step=1634] Training: 16%|█▋ | 1635/10000 [18:47<1:25:31, 1.63it/s, loss=0.0168, lr=2.42e-05, step=1634] Training: 16%|█▋ | 1635/10000 [18:47<1:25:31, 1.63it/s, loss=0.0083, lr=2.42e-05, step=1635] Training: 16%|█▋ | 1636/10000 [18:48<1:25:37, 1.63it/s, loss=0.0083, lr=2.42e-05, step=1635] Training: 16%|█▋ | 1636/10000 [18:48<1:25:37, 1.63it/s, loss=0.0100, lr=2.42e-05, step=1636] Training: 16%|█▋ | 1637/10000 [18:48<1:26:33, 1.61it/s, loss=0.0100, lr=2.42e-05, step=1636] Training: 16%|█▋ | 1637/10000 [18:48<1:26:33, 1.61it/s, loss=0.0314, lr=2.42e-05, step=1637] Training: 16%|█▋ | 1638/10000 [18:49<1:27:06, 1.60it/s, loss=0.0314, lr=2.42e-05, step=1637] Training: 16%|█▋ | 1638/10000 [18:49<1:27:06, 1.60it/s, loss=0.0188, lr=2.42e-05, step=1638] Training: 16%|█▋ | 1639/10000 [18:49<1:28:03, 1.58it/s, loss=0.0188, lr=2.42e-05, step=1638] Training: 16%|█▋ | 1639/10000 [18:49<1:28:03, 1.58it/s, loss=0.0092, lr=2.42e-05, step=1639]16:24:57.267 [I] step=1640 loss=0.0336 smoothed_loss=0.0259 lr=2.42e-05 grad_norm=0.5898 step_time=0.5390s data_time=0.0983s it/s=1.569 eta_to_10000=5326.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0203 grad_action_out_proj=0.2349 grad_shared_expert=0.6458 (10775:train_pytorch.py:850) + Training: 16%|█▋ | 1640/10000 [18:50<1:36:59, 1.44it/s, loss=0.0092, lr=2.42e-05, step=1639] Training: 16%|█▋ | 1640/10000 [18:50<1:36:59, 1.44it/s, loss=0.0336, lr=2.42e-05, step=1640] Training: 16%|█▋ | 1641/10000 [18:51<1:29:00, 1.57it/s, loss=0.0336, lr=2.42e-05, step=1640] Training: 16%|█▋ | 1641/10000 [18:51<1:29:00, 1.57it/s, loss=0.0070, lr=2.42e-05, step=1641] Training: 16%|█▋ | 1642/10000 [18:52<1:32:34, 1.50it/s, loss=0.0070, lr=2.42e-05, step=1641] Training: 16%|█▋ | 1642/10000 [18:52<1:32:34, 1.50it/s, loss=0.0094, lr=2.42e-05, step=1642] Training: 16%|█▋ | 1643/10000 [18:52<1:26:15, 1.61it/s, loss=0.0094, lr=2.42e-05, step=1642] Training: 16%|█▋ | 1643/10000 [18:52<1:26:15, 1.61it/s, loss=0.0415, lr=2.42e-05, step=1643] Training: 16%|█▋ | 1644/10000 [18:53<1:30:22, 1.54it/s, loss=0.0415, lr=2.42e-05, step=1643] Training: 16%|█▋ | 1644/10000 [18:53<1:30:22, 1.54it/s, loss=0.0346, lr=2.42e-05, step=1644] Training: 16%|█▋ | 1645/10000 [18:53<1:25:13, 1.63it/s, loss=0.0346, lr=2.42e-05, step=1644] Training: 16%|█▋ | 1645/10000 [18:53<1:25:13, 1.63it/s, loss=0.0534, lr=2.42e-05, step=1645] Training: 16%|█▋ | 1646/10000 [18:54<1:22:13, 1.69it/s, loss=0.0534, lr=2.42e-05, step=1645] Training: 16%|█▋ | 1646/10000 [18:54<1:22:13, 1.69it/s, loss=0.0700, lr=2.42e-05, step=1646] Training: 16%|█▋ | 1647/10000 [18:55<1:32:32, 1.50it/s, loss=0.0700, lr=2.42e-05, step=1646] Training: 16%|█▋ | 1647/10000 [18:55<1:32:32, 1.50it/s, loss=0.0338, lr=2.42e-05, step=1647] Training: 16%|█▋ | 1648/10000 [18:55<1:25:06, 1.64it/s, loss=0.0338, lr=2.42e-05, step=1647] Training: 16%|█▋ | 1648/10000 [18:55<1:25:06, 1.64it/s, loss=0.0318, lr=2.42e-05, step=1648] Training: 16%|█▋ | 1649/10000 [18:56<1:30:42, 1.53it/s, loss=0.0318, lr=2.42e-05, step=1648] Training: 16%|█▋ | 1649/10000 [18:56<1:30:42, 1.53it/s, loss=0.0165, lr=2.42e-05, step=1649]16:25:03.394 [I] step=1650 loss=0.0536 smoothed_loss=0.0332 lr=2.42e-05 grad_norm=0.6069 step_time=0.5298s data_time=0.0830s it/s=1.632 eta_to_10000=5116.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0151 grad_action_out_proj=0.2142 grad_shared_expert=0.6898 (10775:train_pytorch.py:850) + Training: 16%|█▋ | 1650/10000 [18:56<1:25:34, 1.63it/s, loss=0.0165, lr=2.42e-05, step=1649] Training: 16%|█▋ | 1650/10000 [18:56<1:25:34, 1.63it/s, loss=0.0536, lr=2.42e-05, step=1650] Training: 17%|█▋ | 1651/10000 [18:57<1:21:12, 1.71it/s, loss=0.0536, lr=2.42e-05, step=1650] Training: 17%|█▋ | 1651/10000 [18:57<1:21:12, 1.71it/s, loss=0.0267, lr=2.42e-05, step=1651] Training: 17%|█▋ | 1652/10000 [18:58<1:22:37, 1.68it/s, loss=0.0267, lr=2.42e-05, step=1651] Training: 17%|█▋ | 1652/10000 [18:58<1:22:37, 1.68it/s, loss=0.0137, lr=2.42e-05, step=1652] Training: 17%|█▋ | 1653/10000 [18:58<1:33:59, 1.48it/s, loss=0.0137, lr=2.42e-05, step=1652] Training: 17%|█▋ | 1653/10000 [18:58<1:33:59, 1.48it/s, loss=0.0127, lr=2.42e-05, step=1653] Training: 17%|█▋ | 1654/10000 [18:59<1:38:11, 1.42it/s, loss=0.0127, lr=2.42e-05, step=1653] Training: 17%|█▋ | 1654/10000 [18:59<1:38:11, 1.42it/s, loss=0.0249, lr=2.42e-05, step=1654] Training: 17%|█▋ | 1655/10000 [19:00<1:41:52, 1.37it/s, loss=0.0249, lr=2.42e-05, step=1654] Training: 17%|█▋ | 1655/10000 [19:00<1:41:52, 1.37it/s, loss=0.0202, lr=2.42e-05, step=1655] Training: 17%|█▋ | 1656/10000 [19:01<1:32:25, 1.50it/s, loss=0.0202, lr=2.42e-05, step=1655] Training: 17%|█▋ | 1656/10000 [19:01<1:32:25, 1.50it/s, loss=0.0192, lr=2.42e-05, step=1656] Training: 17%|█▋ | 1657/10000 [19:01<1:25:13, 1.63it/s, loss=0.0192, lr=2.42e-05, step=1656] Training: 17%|█▋ | 1657/10000 [19:01<1:25:13, 1.63it/s, loss=0.0254, lr=2.42e-05, step=1657] Training: 17%|█▋ | 1658/10000 [19:02<1:20:26, 1.73it/s, loss=0.0254, lr=2.42e-05, step=1657] Training: 17%|█▋ | 1658/10000 [19:02<1:20:26, 1.73it/s, loss=0.0354, lr=2.42e-05, step=1658] Training: 17%|█▋ | 1659/10000 [19:02<1:16:56, 1.81it/s, loss=0.0354, lr=2.42e-05, step=1658] Training: 17%|█▋ | 1659/10000 [19:02<1:16:56, 1.81it/s, loss=0.1192, lr=2.42e-05, step=1659]16:25:09.496 [I] step=1660 loss=0.0216 smoothed_loss=0.0352 lr=2.42e-05 grad_norm=0.5964 step_time=0.5111s data_time=0.0991s it/s=1.639 eta_to_10000=5088.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0157 grad_action_out_proj=0.1364 grad_shared_expert=0.5841 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1660/10000 [19:03<1:16:32, 1.82it/s, loss=0.1192, lr=2.42e-05, step=1659] Training: 17%|█▋ | 1660/10000 [19:03<1:16:32, 1.82it/s, loss=0.0216, lr=2.42e-05, step=1660] Training: 17%|█▋ | 1661/10000 [19:03<1:14:20, 1.87it/s, loss=0.0216, lr=2.42e-05, step=1660] Training: 17%|█▋ | 1661/10000 [19:03<1:14:20, 1.87it/s, loss=0.0112, lr=2.42e-05, step=1661] Training: 17%|█▋ | 1662/10000 [19:04<1:19:35, 1.75it/s, loss=0.0112, lr=2.42e-05, step=1661] Training: 17%|█▋ | 1662/10000 [19:04<1:19:35, 1.75it/s, loss=0.0132, lr=2.42e-05, step=1662] Training: 17%|█▋ | 1663/10000 [19:04<1:24:25, 1.65it/s, loss=0.0132, lr=2.42e-05, step=1662] Training: 17%|█▋ | 1663/10000 [19:04<1:24:25, 1.65it/s, loss=0.0455, lr=2.42e-05, step=1663] Training: 17%|█▋ | 1664/10000 [19:05<1:19:38, 1.74it/s, loss=0.0455, lr=2.42e-05, step=1663] Training: 17%|█▋ | 1664/10000 [19:05<1:19:38, 1.74it/s, loss=0.0212, lr=2.42e-05, step=1664] Training: 17%|█▋ | 1665/10000 [19:05<1:15:37, 1.84it/s, loss=0.0212, lr=2.42e-05, step=1664] Training: 17%|█▋ | 1665/10000 [19:05<1:15:37, 1.84it/s, loss=0.0724, lr=2.42e-05, step=1665] Training: 17%|█▋ | 1666/10000 [19:06<1:13:05, 1.90it/s, loss=0.0724, lr=2.42e-05, step=1665] Training: 17%|█▋ | 1666/10000 [19:06<1:13:05, 1.90it/s, loss=0.0222, lr=2.42e-05, step=1666] Training: 17%|█▋ | 1667/10000 [19:06<1:11:41, 1.94it/s, loss=0.0222, lr=2.42e-05, step=1666] Training: 17%|█▋ | 1667/10000 [19:06<1:11:41, 1.94it/s, loss=0.0082, lr=2.42e-05, step=1667] Training: 17%|█▋ | 1668/10000 [19:07<1:20:05, 1.73it/s, loss=0.0082, lr=2.42e-05, step=1667] Training: 17%|█▋ | 1668/10000 [19:07<1:20:05, 1.73it/s, loss=0.0892, lr=2.42e-05, step=1668] Training: 17%|█▋ | 1669/10000 [19:08<1:16:21, 1.82it/s, loss=0.0892, lr=2.42e-05, step=1668] Training: 17%|█▋ | 1669/10000 [19:08<1:16:21, 1.82it/s, loss=0.0463, lr=2.42e-05, step=1669]16:25:15.239 [I] step=1670 loss=0.0279 smoothed_loss=0.0371 lr=2.42e-05 grad_norm=0.7418 step_time=0.4997s data_time=0.0746s it/s=1.741 eta_to_10000=4783.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.2254 grad_shared_expert=0.6020 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1670/10000 [19:08<1:24:23, 1.65it/s, loss=0.0463, lr=2.42e-05, step=1669] Training: 17%|█▋ | 1670/10000 [19:08<1:24:23, 1.65it/s, loss=0.0279, lr=2.42e-05, step=1670] Training: 17%|█▋ | 1671/10000 [19:09<1:19:48, 1.74it/s, loss=0.0279, lr=2.42e-05, step=1670] Training: 17%|█▋ | 1671/10000 [19:09<1:19:48, 1.74it/s, loss=0.1323, lr=2.42e-05, step=1671] Training: 17%|█▋ | 1672/10000 [19:09<1:16:16, 1.82it/s, loss=0.1323, lr=2.42e-05, step=1671] Training: 17%|█▋ | 1672/10000 [19:09<1:16:16, 1.82it/s, loss=0.0222, lr=2.42e-05, step=1672] Training: 17%|█▋ | 1673/10000 [19:10<1:13:45, 1.88it/s, loss=0.0222, lr=2.42e-05, step=1672] Training: 17%|█▋ | 1673/10000 [19:10<1:13:45, 1.88it/s, loss=0.0374, lr=2.42e-05, step=1673] Training: 17%|█▋ | 1674/10000 [19:10<1:12:35, 1.91it/s, loss=0.0374, lr=2.42e-05, step=1673] Training: 17%|█▋ | 1674/10000 [19:10<1:12:35, 1.91it/s, loss=0.0440, lr=2.42e-05, step=1674] Training: 17%|█▋ | 1675/10000 [19:11<1:12:37, 1.91it/s, loss=0.0440, lr=2.42e-05, step=1674] Training: 17%|█▋ | 1675/10000 [19:11<1:12:37, 1.91it/s, loss=0.0236, lr=2.42e-05, step=1675] Training: 17%|█▋ | 1676/10000 [19:11<1:13:37, 1.88it/s, loss=0.0236, lr=2.42e-05, step=1675] Training: 17%|█▋ | 1676/10000 [19:11<1:13:37, 1.88it/s, loss=0.0390, lr=2.42e-05, step=1676] Training: 17%|█▋ | 1677/10000 [19:12<1:25:07, 1.63it/s, loss=0.0390, lr=2.42e-05, step=1676] Training: 17%|█▋ | 1677/10000 [19:12<1:25:07, 1.63it/s, loss=0.0450, lr=2.42e-05, step=1677] Training: 17%|█▋ | 1678/10000 [19:13<1:20:03, 1.73it/s, loss=0.0450, lr=2.42e-05, step=1677] Training: 17%|█▋ | 1678/10000 [19:13<1:20:03, 1.73it/s, loss=0.0230, lr=2.42e-05, step=1678] Training: 17%|█▋ | 1679/10000 [19:13<1:16:39, 1.81it/s, loss=0.0230, lr=2.42e-05, step=1678] Training: 17%|█▋ | 1679/10000 [19:13<1:16:39, 1.81it/s, loss=0.0329, lr=2.42e-05, step=1679]16:25:20.608 [I] step=1680 loss=0.0206 smoothed_loss=0.0373 lr=2.42e-05 grad_norm=0.6708 step_time=0.4728s data_time=0.0641s it/s=1.863 eta_to_10000=4466.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1688 grad_shared_expert=0.5183 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1680/10000 [19:14<1:15:20, 1.84it/s, loss=0.0329, lr=2.42e-05, step=1679] Training: 17%|█▋ | 1680/10000 [19:14<1:15:20, 1.84it/s, loss=0.0206, lr=2.42e-05, step=1680] Training: 17%|█▋ | 1681/10000 [19:14<1:12:56, 1.90it/s, loss=0.0206, lr=2.42e-05, step=1680] Training: 17%|█▋ | 1681/10000 [19:14<1:12:56, 1.90it/s, loss=0.0266, lr=2.42e-05, step=1681] Training: 17%|█▋ | 1682/10000 [19:15<1:11:09, 1.95it/s, loss=0.0266, lr=2.42e-05, step=1681] Training: 17%|█▋ | 1682/10000 [19:15<1:11:09, 1.95it/s, loss=0.0413, lr=2.42e-05, step=1682] Training: 17%|█▋ | 1683/10000 [19:15<1:10:50, 1.96it/s, loss=0.0413, lr=2.42e-05, step=1682] Training: 17%|█▋ | 1683/10000 [19:15<1:10:50, 1.96it/s, loss=0.0224, lr=2.42e-05, step=1683] Training: 17%|█▋ | 1684/10000 [19:16<1:22:05, 1.69it/s, loss=0.0224, lr=2.42e-05, step=1683] Training: 17%|█▋ | 1684/10000 [19:16<1:22:05, 1.69it/s, loss=0.0252, lr=2.42e-05, step=1684] Training: 17%|█▋ | 1685/10000 [19:17<1:27:20, 1.59it/s, loss=0.0252, lr=2.42e-05, step=1684] Training: 17%|█▋ | 1685/10000 [19:17<1:27:20, 1.59it/s, loss=0.0242, lr=2.41e-05, step=1685] Training: 17%|█▋ | 1686/10000 [19:17<1:22:07, 1.69it/s, loss=0.0242, lr=2.41e-05, step=1685] Training: 17%|█▋ | 1686/10000 [19:17<1:22:07, 1.69it/s, loss=0.0266, lr=2.41e-05, step=1686] Training: 17%|█▋ | 1687/10000 [19:18<1:17:36, 1.79it/s, loss=0.0266, lr=2.41e-05, step=1686] Training: 17%|█▋ | 1687/10000 [19:18<1:17:36, 1.79it/s, loss=0.0524, lr=2.41e-05, step=1687] Training: 17%|█▋ | 1688/10000 [19:18<1:14:25, 1.86it/s, loss=0.0524, lr=2.41e-05, step=1687] Training: 17%|█▋ | 1688/10000 [19:18<1:14:25, 1.86it/s, loss=0.0244, lr=2.41e-05, step=1688] Training: 17%|█▋ | 1689/10000 [19:19<1:16:39, 1.81it/s, loss=0.0244, lr=2.41e-05, step=1688] Training: 17%|█▋ | 1689/10000 [19:19<1:16:39, 1.81it/s, loss=0.0216, lr=2.41e-05, step=1689]16:25:26.161 [I] step=1690 loss=0.0532 smoothed_loss=0.0344 lr=2.41e-05 grad_norm=0.6546 step_time=0.4876s data_time=0.0676s it/s=1.801 eta_to_10000=4613.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.2538 grad_shared_expert=0.6819 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1690/10000 [19:19<1:14:58, 1.85it/s, loss=0.0216, lr=2.41e-05, step=1689] Training: 17%|█▋ | 1690/10000 [19:19<1:14:58, 1.85it/s, loss=0.0532, lr=2.41e-05, step=1690] Training: 17%|█▋ | 1691/10000 [19:20<1:32:31, 1.50it/s, loss=0.0532, lr=2.41e-05, step=1690] Training: 17%|█▋ | 1691/10000 [19:20<1:32:31, 1.50it/s, loss=0.0225, lr=2.41e-05, step=1691] Training: 17%|█▋ | 1692/10000 [19:21<1:34:48, 1.46it/s, loss=0.0225, lr=2.41e-05, step=1691] Training: 17%|█▋ | 1692/10000 [19:21<1:34:48, 1.46it/s, loss=0.0119, lr=2.41e-05, step=1692] Training: 17%|█▋ | 1693/10000 [19:22<1:37:23, 1.42it/s, loss=0.0119, lr=2.41e-05, step=1692] Training: 17%|█▋ | 1693/10000 [19:22<1:37:23, 1.42it/s, loss=0.0550, lr=2.41e-05, step=1693] Training: 17%|█▋ | 1694/10000 [19:22<1:29:11, 1.55it/s, loss=0.0550, lr=2.41e-05, step=1693] Training: 17%|█▋ | 1694/10000 [19:22<1:29:11, 1.55it/s, loss=0.0181, lr=2.41e-05, step=1694] Training: 17%|█▋ | 1695/10000 [19:23<1:22:43, 1.67it/s, loss=0.0181, lr=2.41e-05, step=1694] Training: 17%|█▋ | 1695/10000 [19:23<1:22:43, 1.67it/s, loss=0.0612, lr=2.41e-05, step=1695] Training: 17%|█▋ | 1696/10000 [19:23<1:18:10, 1.77it/s, loss=0.0612, lr=2.41e-05, step=1695] Training: 17%|█▋ | 1696/10000 [19:23<1:18:10, 1.77it/s, loss=0.0910, lr=2.41e-05, step=1696] Training: 17%|█▋ | 1697/10000 [19:24<1:14:14, 1.86it/s, loss=0.0910, lr=2.41e-05, step=1696] Training: 17%|█▋ | 1697/10000 [19:24<1:14:14, 1.86it/s, loss=0.0864, lr=2.41e-05, step=1697] Training: 17%|█▋ | 1698/10000 [19:24<1:20:23, 1.72it/s, loss=0.0864, lr=2.41e-05, step=1697] Training: 17%|█▋ | 1698/10000 [19:24<1:20:23, 1.72it/s, loss=0.0374, lr=2.41e-05, step=1698] Training: 17%|█▋ | 1699/10000 [19:25<1:24:46, 1.63it/s, loss=0.0374, lr=2.41e-05, step=1698] Training: 17%|█▋ | 1699/10000 [19:25<1:24:46, 1.63it/s, loss=0.0149, lr=2.41e-05, step=1699]16:25:32.453 [I] step=1700 loss=0.0437 smoothed_loss=0.0416 lr=2.41e-05 grad_norm=0.8400 step_time=0.5487s data_time=0.0806s it/s=1.590 eta_to_10000=5221.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0212 grad_action_out_proj=0.2886 grad_shared_expert=0.6268 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1700/10000 [19:26<1:21:24, 1.70it/s, loss=0.0149, lr=2.41e-05, step=1699] Training: 17%|█▋ | 1700/10000 [19:26<1:21:24, 1.70it/s, loss=0.0437, lr=2.41e-05, step=1700] Training: 17%|█▋ | 1701/10000 [19:26<1:17:16, 1.79it/s, loss=0.0437, lr=2.41e-05, step=1700] Training: 17%|█▋ | 1701/10000 [19:26<1:17:16, 1.79it/s, loss=0.0122, lr=2.41e-05, step=1701] Training: 17%|█▋ | 1702/10000 [19:27<1:22:10, 1.68it/s, loss=0.0122, lr=2.41e-05, step=1701] Training: 17%|█▋ | 1702/10000 [19:27<1:22:10, 1.68it/s, loss=0.0362, lr=2.41e-05, step=1702] Training: 17%|█▋ | 1703/10000 [19:27<1:25:12, 1.62it/s, loss=0.0362, lr=2.41e-05, step=1702] Training: 17%|█▋ | 1703/10000 [19:27<1:25:12, 1.62it/s, loss=0.0075, lr=2.41e-05, step=1703] Training: 17%|█▋ | 1704/10000 [19:28<1:20:20, 1.72it/s, loss=0.0075, lr=2.41e-05, step=1703] Training: 17%|█▋ | 1704/10000 [19:28<1:20:20, 1.72it/s, loss=0.0693, lr=2.41e-05, step=1704] Training: 17%|█▋ | 1705/10000 [19:29<1:28:03, 1.57it/s, loss=0.0693, lr=2.41e-05, step=1704] Training: 17%|█▋ | 1705/10000 [19:29<1:28:03, 1.57it/s, loss=0.0238, lr=2.41e-05, step=1705] Training: 17%|█▋ | 1706/10000 [19:30<1:42:56, 1.34it/s, loss=0.0238, lr=2.41e-05, step=1705] Training: 17%|█▋ | 1706/10000 [19:30<1:42:56, 1.34it/s, loss=0.0207, lr=2.41e-05, step=1706] Training: 17%|█▋ | 1707/10000 [19:30<1:33:11, 1.48it/s, loss=0.0207, lr=2.41e-05, step=1706] Training: 17%|█▋ | 1707/10000 [19:30<1:33:11, 1.48it/s, loss=0.0207, lr=2.41e-05, step=1707] Training: 17%|█▋ | 1708/10000 [19:31<1:26:30, 1.60it/s, loss=0.0207, lr=2.41e-05, step=1707] Training: 17%|█▋ | 1708/10000 [19:31<1:26:30, 1.60it/s, loss=0.0196, lr=2.41e-05, step=1708] Training: 17%|█▋ | 1709/10000 [19:31<1:23:33, 1.65it/s, loss=0.0196, lr=2.41e-05, step=1708] Training: 17%|█▋ | 1709/10000 [19:31<1:23:33, 1.65it/s, loss=0.0104, lr=2.41e-05, step=1709]16:25:38.660 [I] step=1710 loss=0.0196 smoothed_loss=0.0293 lr=2.41e-05 grad_norm=0.6984 step_time=0.5092s data_time=0.1114s it/s=1.612 eta_to_10000=5144.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0126 grad_action_out_proj=0.1827 grad_shared_expert=0.8368 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1710/10000 [19:32<1:20:30, 1.72it/s, loss=0.0104, lr=2.41e-05, step=1709] Training: 17%|█▋ | 1710/10000 [19:32<1:20:30, 1.72it/s, loss=0.0196, lr=2.41e-05, step=1710] Training: 17%|█▋ | 1711/10000 [19:32<1:17:31, 1.78it/s, loss=0.0196, lr=2.41e-05, step=1710] Training: 17%|█▋ | 1711/10000 [19:32<1:17:31, 1.78it/s, loss=0.0659, lr=2.41e-05, step=1711] Training: 17%|█▋ | 1712/10000 [19:33<1:15:53, 1.82it/s, loss=0.0659, lr=2.41e-05, step=1711] Training: 17%|█▋ | 1712/10000 [19:33<1:15:53, 1.82it/s, loss=0.0371, lr=2.41e-05, step=1712] Training: 17%|█▋ | 1713/10000 [19:34<1:26:59, 1.59it/s, loss=0.0371, lr=2.41e-05, step=1712] Training: 17%|█▋ | 1713/10000 [19:34<1:26:59, 1.59it/s, loss=0.0172, lr=2.41e-05, step=1713] Training: 17%|█▋ | 1714/10000 [19:34<1:21:16, 1.70it/s, loss=0.0172, lr=2.41e-05, step=1713] Training: 17%|█▋ | 1714/10000 [19:34<1:21:16, 1.70it/s, loss=0.0560, lr=2.41e-05, step=1714] Training: 17%|█▋ | 1715/10000 [19:35<1:19:41, 1.73it/s, loss=0.0560, lr=2.41e-05, step=1714] Training: 17%|█▋ | 1715/10000 [19:35<1:19:41, 1.73it/s, loss=0.0289, lr=2.41e-05, step=1715] Training: 17%|█▋ | 1716/10000 [19:35<1:16:01, 1.82it/s, loss=0.0289, lr=2.41e-05, step=1715] Training: 17%|█▋ | 1716/10000 [19:35<1:16:01, 1.82it/s, loss=0.0247, lr=2.41e-05, step=1716] Training: 17%|█▋ | 1717/10000 [19:36<1:13:04, 1.89it/s, loss=0.0247, lr=2.41e-05, step=1716] Training: 17%|█▋ | 1717/10000 [19:36<1:13:04, 1.89it/s, loss=0.0174, lr=2.41e-05, step=1717] Training: 17%|█▋ | 1718/10000 [19:36<1:23:47, 1.65it/s, loss=0.0174, lr=2.41e-05, step=1717] Training: 17%|█▋ | 1718/10000 [19:36<1:23:47, 1.65it/s, loss=0.0265, lr=2.41e-05, step=1718] Training: 17%|█▋ | 1719/10000 [19:37<1:21:18, 1.70it/s, loss=0.0265, lr=2.41e-05, step=1718] Training: 17%|█▋ | 1719/10000 [19:37<1:21:18, 1.70it/s, loss=0.1344, lr=2.41e-05, step=1719]16:25:44.713 [I] step=1720 loss=0.0327 smoothed_loss=0.0403 lr=2.41e-05 grad_norm=0.6773 step_time=0.5434s data_time=0.0619s it/s=1.652 eta_to_10000=5011.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0134 grad_action_out_proj=0.1808 grad_shared_expert=0.6724 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1720/10000 [19:38<1:32:21, 1.49it/s, loss=0.1344, lr=2.41e-05, step=1719] Training: 17%|█▋ | 1720/10000 [19:38<1:32:21, 1.49it/s, loss=0.0327, lr=2.41e-05, step=1720] Training: 17%|█▋ | 1721/10000 [19:38<1:27:44, 1.57it/s, loss=0.0327, lr=2.41e-05, step=1720] Training: 17%|█▋ | 1721/10000 [19:38<1:27:44, 1.57it/s, loss=0.0377, lr=2.41e-05, step=1721] Training: 17%|█▋ | 1722/10000 [19:39<1:31:07, 1.51it/s, loss=0.0377, lr=2.41e-05, step=1721] Training: 17%|█▋ | 1722/10000 [19:39<1:31:07, 1.51it/s, loss=0.0257, lr=2.41e-05, step=1722] Training: 17%|█▋ | 1723/10000 [19:40<1:24:16, 1.64it/s, loss=0.0257, lr=2.41e-05, step=1722] Training: 17%|█▋ | 1723/10000 [19:40<1:24:16, 1.64it/s, loss=0.0274, lr=2.41e-05, step=1723] Training: 17%|█▋ | 1724/10000 [19:40<1:19:34, 1.73it/s, loss=0.0274, lr=2.41e-05, step=1723] Training: 17%|█▋ | 1724/10000 [19:40<1:19:34, 1.73it/s, loss=0.0207, lr=2.41e-05, step=1724] Training: 17%|█▋ | 1725/10000 [19:41<1:16:31, 1.80it/s, loss=0.0207, lr=2.41e-05, step=1724] Training: 17%|█▋ | 1725/10000 [19:41<1:16:31, 1.80it/s, loss=0.0070, lr=2.41e-05, step=1725] Training: 17%|█▋ | 1726/10000 [19:41<1:13:36, 1.87it/s, loss=0.0070, lr=2.41e-05, step=1725] Training: 17%|█▋ | 1726/10000 [19:41<1:13:36, 1.87it/s, loss=0.0647, lr=2.41e-05, step=1726] Training: 17%|█▋ | 1727/10000 [19:42<1:30:29, 1.52it/s, loss=0.0647, lr=2.41e-05, step=1726] Training: 17%|█▋ | 1727/10000 [19:42<1:30:29, 1.52it/s, loss=0.0153, lr=2.41e-05, step=1727] Training: 17%|█▋ | 1728/10000 [19:42<1:23:40, 1.65it/s, loss=0.0153, lr=2.41e-05, step=1727] Training: 17%|█▋ | 1728/10000 [19:42<1:23:40, 1.65it/s, loss=0.0354, lr=2.41e-05, step=1728] Training: 17%|█▋ | 1729/10000 [19:43<1:18:21, 1.76it/s, loss=0.0354, lr=2.41e-05, step=1728] Training: 17%|█▋ | 1729/10000 [19:43<1:18:21, 1.76it/s, loss=0.0183, lr=2.41e-05, step=1729]16:25:50.395 [I] step=1730 loss=0.0086 smoothed_loss=0.0302 lr=2.41e-05 grad_norm=0.8331 step_time=0.4986s data_time=0.0696s it/s=1.760 eta_to_10000=4698.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0126 grad_action_out_proj=0.1850 grad_shared_expert=0.7819 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1730/10000 [19:43<1:16:04, 1.81it/s, loss=0.0183, lr=2.41e-05, step=1729] Training: 17%|█▋ | 1730/10000 [19:43<1:16:04, 1.81it/s, loss=0.0086, lr=2.41e-05, step=1730] Training: 17%|█▋ | 1731/10000 [19:44<1:13:39, 1.87it/s, loss=0.0086, lr=2.41e-05, step=1730] Training: 17%|█▋ | 1731/10000 [19:44<1:13:39, 1.87it/s, loss=0.0321, lr=2.41e-05, step=1731] Training: 17%|█▋ | 1732/10000 [19:44<1:13:27, 1.88it/s, loss=0.0321, lr=2.41e-05, step=1731] Training: 17%|█▋ | 1732/10000 [19:44<1:13:27, 1.88it/s, loss=0.0657, lr=2.41e-05, step=1732] Training: 17%|█▋ | 1733/10000 [19:45<1:11:35, 1.92it/s, loss=0.0657, lr=2.41e-05, step=1732] Training: 17%|█▋ | 1733/10000 [19:45<1:11:35, 1.92it/s, loss=0.0358, lr=2.41e-05, step=1733] Training: 17%|█▋ | 1734/10000 [19:46<1:20:22, 1.71it/s, loss=0.0358, lr=2.41e-05, step=1733] Training: 17%|█▋ | 1734/10000 [19:46<1:20:22, 1.71it/s, loss=0.0891, lr=2.41e-05, step=1734] Training: 17%|█▋ | 1735/10000 [19:46<1:24:02, 1.64it/s, loss=0.0891, lr=2.41e-05, step=1734] Training: 17%|█▋ | 1735/10000 [19:46<1:24:02, 1.64it/s, loss=0.0464, lr=2.41e-05, step=1735] Training: 17%|█▋ | 1736/10000 [19:47<1:18:37, 1.75it/s, loss=0.0464, lr=2.41e-05, step=1735] Training: 17%|█▋ | 1736/10000 [19:47<1:18:37, 1.75it/s, loss=0.0191, lr=2.41e-05, step=1736] Training: 17%|█▋ | 1737/10000 [19:47<1:15:39, 1.82it/s, loss=0.0191, lr=2.41e-05, step=1736] Training: 17%|█▋ | 1737/10000 [19:47<1:15:39, 1.82it/s, loss=0.0268, lr=2.41e-05, step=1737] Training: 17%|█▋ | 1738/10000 [19:48<1:13:23, 1.88it/s, loss=0.0268, lr=2.41e-05, step=1737] Training: 17%|█▋ | 1738/10000 [19:48<1:13:23, 1.88it/s, loss=0.0164, lr=2.41e-05, step=1738] Training: 17%|█▋ | 1739/10000 [19:48<1:11:50, 1.92it/s, loss=0.0164, lr=2.41e-05, step=1738] Training: 17%|█▋ | 1739/10000 [19:48<1:11:50, 1.92it/s, loss=0.0311, lr=2.41e-05, step=1739]16:25:55.813 [I] step=1740 loss=0.0087 smoothed_loss=0.0320 lr=2.41e-05 grad_norm=0.6490 step_time=0.4821s data_time=0.0596s it/s=1.846 eta_to_10000=4474.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0186 grad_action_out_proj=0.2260 grad_shared_expert=0.5631 (10775:train_pytorch.py:850) + Training: 17%|█▋ | 1740/10000 [19:49<1:12:16, 1.90it/s, loss=0.0311, lr=2.41e-05, step=1739] Training: 17%|█▋ | 1740/10000 [19:49<1:12:16, 1.90it/s, loss=0.0087, lr=2.41e-05, step=1740] Training: 17%|█▋ | 1741/10000 [19:50<1:25:10, 1.62it/s, loss=0.0087, lr=2.41e-05, step=1740] Training: 17%|█▋ | 1741/10000 [19:50<1:25:10, 1.62it/s, loss=0.0106, lr=2.41e-05, step=1741] Training: 17%|█▋ | 1742/10000 [19:50<1:27:15, 1.58it/s, loss=0.0106, lr=2.41e-05, step=1741] Training: 17%|█▋ | 1742/10000 [19:50<1:27:15, 1.58it/s, loss=0.0235, lr=2.41e-05, step=1742] Training: 17%|█▋ | 1743/10000 [19:51<1:22:30, 1.67it/s, loss=0.0235, lr=2.41e-05, step=1742] Training: 17%|█▋ | 1743/10000 [19:51<1:22:30, 1.67it/s, loss=0.0352, lr=2.41e-05, step=1743] Training: 17%|█▋ | 1744/10000 [19:51<1:17:27, 1.78it/s, loss=0.0352, lr=2.41e-05, step=1743] Training: 17%|█▋ | 1744/10000 [19:51<1:17:27, 1.78it/s, loss=0.0722, lr=2.41e-05, step=1744] Training: 17%|█▋ | 1745/10000 [19:52<1:14:17, 1.85it/s, loss=0.0722, lr=2.41e-05, step=1744] Training: 17%|█▋ | 1745/10000 [19:52<1:14:17, 1.85it/s, loss=0.0103, lr=2.41e-05, step=1745] Training: 17%|█▋ | 1746/10000 [19:52<1:12:26, 1.90it/s, loss=0.0103, lr=2.41e-05, step=1745] Training: 17%|█▋ | 1746/10000 [19:52<1:12:26, 1.90it/s, loss=0.0170, lr=2.41e-05, step=1746] Training: 17%|█▋ | 1747/10000 [19:53<1:10:53, 1.94it/s, loss=0.0170, lr=2.41e-05, step=1746] Training: 17%|█▋ | 1747/10000 [19:53<1:10:53, 1.94it/s, loss=0.0427, lr=2.41e-05, step=1747] Training: 17%|█▋ | 1748/10000 [19:53<1:09:56, 1.97it/s, loss=0.0427, lr=2.41e-05, step=1747] Training: 17%|█▋ | 1748/10000 [19:53<1:09:56, 1.97it/s, loss=0.0371, lr=2.41e-05, step=1748] Training: 17%|█▋ | 1749/10000 [19:54<1:20:46, 1.70it/s, loss=0.0371, lr=2.41e-05, step=1748] Training: 17%|█▋ | 1749/10000 [19:54<1:20:46, 1.70it/s, loss=0.0478, lr=2.41e-05, step=1749]16:26:01.756 [I] step=1750 loss=0.0492 smoothed_loss=0.0352 lr=2.41e-05 grad_norm=0.6197 step_time=0.5298s data_time=0.0645s it/s=1.683 eta_to_10000=4902.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0171 grad_action_out_proj=0.1983 grad_shared_expert=0.6600 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1750/10000 [19:55<1:25:35, 1.61it/s, loss=0.0478, lr=2.41e-05, step=1749] Training: 18%|█▊ | 1750/10000 [19:55<1:25:35, 1.61it/s, loss=0.0492, lr=2.41e-05, step=1750] Training: 18%|█▊ | 1751/10000 [19:55<1:19:46, 1.72it/s, loss=0.0492, lr=2.41e-05, step=1750] Training: 18%|█▊ | 1751/10000 [19:55<1:19:46, 1.72it/s, loss=0.0333, lr=2.41e-05, step=1751] Training: 18%|█▊ | 1752/10000 [19:56<1:15:48, 1.81it/s, loss=0.0333, lr=2.41e-05, step=1751] Training: 18%|█▊ | 1752/10000 [19:56<1:15:48, 1.81it/s, loss=0.0535, lr=2.41e-05, step=1752] Training: 18%|█▊ | 1753/10000 [19:56<1:13:02, 1.88it/s, loss=0.0535, lr=2.41e-05, step=1752] Training: 18%|█▊ | 1753/10000 [19:56<1:13:02, 1.88it/s, loss=0.0265, lr=2.40e-05, step=1753] Training: 18%|█▊ | 1754/10000 [19:57<1:10:33, 1.95it/s, loss=0.0265, lr=2.40e-05, step=1753] Training: 18%|█▊ | 1754/10000 [19:57<1:10:33, 1.95it/s, loss=0.1937, lr=2.40e-05, step=1754] Training: 18%|█▊ | 1755/10000 [19:57<1:09:26, 1.98it/s, loss=0.1937, lr=2.40e-05, step=1754] Training: 18%|█▊ | 1755/10000 [19:57<1:09:26, 1.98it/s, loss=0.0277, lr=2.40e-05, step=1755] Training: 18%|█▊ | 1756/10000 [19:58<1:18:19, 1.75it/s, loss=0.0277, lr=2.40e-05, step=1755] Training: 18%|█▊ | 1756/10000 [19:58<1:18:19, 1.75it/s, loss=0.0482, lr=2.40e-05, step=1756] Training: 18%|█▊ | 1757/10000 [19:59<1:23:23, 1.65it/s, loss=0.0482, lr=2.40e-05, step=1756] Training: 18%|█▊ | 1757/10000 [19:59<1:23:23, 1.65it/s, loss=0.0225, lr=2.40e-05, step=1757] Training: 18%|█▊ | 1758/10000 [19:59<1:18:20, 1.75it/s, loss=0.0225, lr=2.40e-05, step=1757] Training: 18%|█▊ | 1758/10000 [19:59<1:18:20, 1.75it/s, loss=0.0759, lr=2.40e-05, step=1758] Training: 18%|█▊ | 1759/10000 [20:00<1:15:40, 1.81it/s, loss=0.0759, lr=2.40e-05, step=1758] Training: 18%|█▊ | 1759/10000 [20:00<1:15:40, 1.81it/s, loss=0.0721, lr=2.40e-05, step=1759]16:26:07.086 [I] step=1760 loss=0.0113 smoothed_loss=0.0476 lr=2.40e-05 grad_norm=0.6425 step_time=0.4731s data_time=0.0599s it/s=1.877 eta_to_10000=4390.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0186 grad_action_out_proj=0.1739 grad_shared_expert=0.5580 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1760/10000 [20:00<1:14:14, 1.85it/s, loss=0.0721, lr=2.40e-05, step=1759] Training: 18%|█▊ | 1760/10000 [20:00<1:14:14, 1.85it/s, loss=0.0113, lr=2.40e-05, step=1760] Training: 18%|█▊ | 1761/10000 [20:01<1:12:05, 1.90it/s, loss=0.0113, lr=2.40e-05, step=1760] Training: 18%|█▊ | 1761/10000 [20:01<1:12:05, 1.90it/s, loss=0.0275, lr=2.40e-05, step=1761] Training: 18%|█▊ | 1762/10000 [20:01<1:10:32, 1.95it/s, loss=0.0275, lr=2.40e-05, step=1761] Training: 18%|█▊ | 1762/10000 [20:01<1:10:32, 1.95it/s, loss=0.0484, lr=2.40e-05, step=1762] Training: 18%|█▊ | 1763/10000 [20:02<1:19:12, 1.73it/s, loss=0.0484, lr=2.40e-05, step=1762] Training: 18%|█▊ | 1763/10000 [20:02<1:19:12, 1.73it/s, loss=0.0242, lr=2.40e-05, step=1763] Training: 18%|█▊ | 1764/10000 [20:02<1:16:32, 1.79it/s, loss=0.0242, lr=2.40e-05, step=1763] Training: 18%|█▊ | 1764/10000 [20:02<1:16:32, 1.79it/s, loss=0.0207, lr=2.40e-05, step=1764] Training: 18%|█▊ | 1765/10000 [20:03<1:20:28, 1.71it/s, loss=0.0207, lr=2.40e-05, step=1764] Training: 18%|█▊ | 1765/10000 [20:03<1:20:28, 1.71it/s, loss=0.0055, lr=2.40e-05, step=1765] Training: 18%|█▊ | 1766/10000 [20:04<1:16:48, 1.79it/s, loss=0.0055, lr=2.40e-05, step=1765] Training: 18%|█▊ | 1766/10000 [20:04<1:16:48, 1.79it/s, loss=0.0276, lr=2.40e-05, step=1766] Training: 18%|█▊ | 1767/10000 [20:04<1:14:54, 1.83it/s, loss=0.0276, lr=2.40e-05, step=1766] Training: 18%|█▊ | 1767/10000 [20:04<1:14:54, 1.83it/s, loss=0.0389, lr=2.40e-05, step=1767] Training: 18%|█▊ | 1768/10000 [20:05<1:11:55, 1.91it/s, loss=0.0389, lr=2.40e-05, step=1767] Training: 18%|█▊ | 1768/10000 [20:05<1:11:55, 1.91it/s, loss=0.0439, lr=2.40e-05, step=1768] Training: 18%|█▊ | 1769/10000 [20:05<1:10:55, 1.93it/s, loss=0.0439, lr=2.40e-05, step=1768] Training: 18%|█▊ | 1769/10000 [20:05<1:10:55, 1.93it/s, loss=0.0244, lr=2.40e-05, step=1769]16:26:12.705 [I] step=1770 loss=0.0286 smoothed_loss=0.0356 lr=2.40e-05 grad_norm=0.6660 step_time=0.4984s data_time=0.0636s it/s=1.780 eta_to_10000=4623.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.1902 grad_shared_expert=0.7131 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1770/10000 [20:06<1:21:13, 1.69it/s, loss=0.0244, lr=2.40e-05, step=1769] Training: 18%|█▊ | 1770/10000 [20:06<1:21:13, 1.69it/s, loss=0.0286, lr=2.40e-05, step=1770] Training: 18%|█▊ | 1771/10000 [20:06<1:17:10, 1.78it/s, loss=0.0286, lr=2.40e-05, step=1770] Training: 18%|█▊ | 1771/10000 [20:06<1:17:10, 1.78it/s, loss=0.0419, lr=2.40e-05, step=1771] Training: 18%|█▊ | 1772/10000 [20:07<1:21:33, 1.68it/s, loss=0.0419, lr=2.40e-05, step=1771] Training: 18%|█▊ | 1772/10000 [20:07<1:21:33, 1.68it/s, loss=0.0272, lr=2.40e-05, step=1772] Training: 18%|█▊ | 1773/10000 [20:07<1:17:20, 1.77it/s, loss=0.0272, lr=2.40e-05, step=1772] Training: 18%|█▊ | 1773/10000 [20:07<1:17:20, 1.77it/s, loss=0.0396, lr=2.40e-05, step=1773] Training: 18%|█▊ | 1774/10000 [20:08<1:14:28, 1.84it/s, loss=0.0396, lr=2.40e-05, step=1773] Training: 18%|█▊ | 1774/10000 [20:08<1:14:28, 1.84it/s, loss=0.0285, lr=2.40e-05, step=1774] Training: 18%|█▊ | 1775/10000 [20:08<1:11:53, 1.91it/s, loss=0.0285, lr=2.40e-05, step=1774] Training: 18%|█▊ | 1775/10000 [20:08<1:11:53, 1.91it/s, loss=0.0300, lr=2.40e-05, step=1775] Training: 18%|█▊ | 1776/10000 [20:09<1:10:47, 1.94it/s, loss=0.0300, lr=2.40e-05, step=1775] Training: 18%|█▊ | 1776/10000 [20:09<1:10:47, 1.94it/s, loss=0.0167, lr=2.40e-05, step=1776] Training: 18%|█▊ | 1777/10000 [20:10<1:17:45, 1.76it/s, loss=0.0167, lr=2.40e-05, step=1776] Training: 18%|█▊ | 1777/10000 [20:10<1:17:45, 1.76it/s, loss=0.0319, lr=2.40e-05, step=1777] Training: 18%|█▊ | 1778/10000 [20:10<1:14:31, 1.84it/s, loss=0.0319, lr=2.40e-05, step=1777] Training: 18%|█▊ | 1778/10000 [20:10<1:14:31, 1.84it/s, loss=0.0295, lr=2.40e-05, step=1778] Training: 18%|█▊ | 1779/10000 [20:11<1:19:08, 1.73it/s, loss=0.0295, lr=2.40e-05, step=1778] Training: 18%|█▊ | 1779/10000 [20:11<1:19:08, 1.73it/s, loss=0.0382, lr=2.40e-05, step=1779]16:26:18.194 [I] step=1780 loss=0.0226 smoothed_loss=0.0319 lr=2.40e-05 grad_norm=0.7177 step_time=0.4863s data_time=0.0626s it/s=1.822 eta_to_10000=4511.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0129 grad_action_out_proj=0.1646 grad_shared_expert=0.4020 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1780/10000 [20:11<1:17:06, 1.78it/s, loss=0.0382, lr=2.40e-05, step=1779] Training: 18%|█▊ | 1780/10000 [20:11<1:17:06, 1.78it/s, loss=0.0226, lr=2.40e-05, step=1780] Training: 18%|█▊ | 1781/10000 [20:12<1:14:32, 1.84it/s, loss=0.0226, lr=2.40e-05, step=1780] Training: 18%|█▊ | 1781/10000 [20:12<1:14:32, 1.84it/s, loss=0.0118, lr=2.40e-05, step=1781] Training: 18%|█▊ | 1782/10000 [20:12<1:12:24, 1.89it/s, loss=0.0118, lr=2.40e-05, step=1781] Training: 18%|█▊ | 1782/10000 [20:12<1:12:24, 1.89it/s, loss=0.0367, lr=2.40e-05, step=1782] Training: 18%|█▊ | 1783/10000 [20:13<1:11:33, 1.91it/s, loss=0.0367, lr=2.40e-05, step=1782] Training: 18%|█▊ | 1783/10000 [20:13<1:11:33, 1.91it/s, loss=0.0852, lr=2.40e-05, step=1783] Training: 18%|█▊ | 1784/10000 [20:13<1:19:16, 1.73it/s, loss=0.0852, lr=2.40e-05, step=1783] Training: 18%|█▊ | 1784/10000 [20:13<1:19:16, 1.73it/s, loss=0.0459, lr=2.40e-05, step=1784] Training: 18%|█▊ | 1785/10000 [20:14<1:17:36, 1.76it/s, loss=0.0459, lr=2.40e-05, step=1784] Training: 18%|█▊ | 1785/10000 [20:14<1:17:36, 1.76it/s, loss=0.0256, lr=2.40e-05, step=1785] Training: 18%|█▊ | 1786/10000 [20:15<1:21:19, 1.68it/s, loss=0.0256, lr=2.40e-05, step=1785] Training: 18%|█▊ | 1786/10000 [20:15<1:21:19, 1.68it/s, loss=0.0070, lr=2.40e-05, step=1786] Training: 18%|█▊ | 1787/10000 [20:15<1:18:25, 1.75it/s, loss=0.0070, lr=2.40e-05, step=1786] Training: 18%|█▊ | 1787/10000 [20:15<1:18:25, 1.75it/s, loss=0.0249, lr=2.40e-05, step=1787] Training: 18%|█▊ | 1788/10000 [20:16<1:15:17, 1.82it/s, loss=0.0249, lr=2.40e-05, step=1787] Training: 18%|█▊ | 1788/10000 [20:16<1:15:17, 1.82it/s, loss=0.0561, lr=2.40e-05, step=1788] Training: 18%|█▊ | 1789/10000 [20:16<1:12:27, 1.89it/s, loss=0.0561, lr=2.40e-05, step=1788] Training: 18%|█▊ | 1789/10000 [20:16<1:12:27, 1.89it/s, loss=0.0320, lr=2.40e-05, step=1789]16:26:23.614 [I] step=1790 loss=0.0226 smoothed_loss=0.0331 lr=2.40e-05 grad_norm=0.6867 step_time=0.4763s data_time=0.0658s it/s=1.845 eta_to_10000=4449.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0107 grad_action_out_proj=0.1743 grad_shared_expert=0.6437 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1790/10000 [20:17<1:11:40, 1.91it/s, loss=0.0320, lr=2.40e-05, step=1789] Training: 18%|█▊ | 1790/10000 [20:17<1:11:40, 1.91it/s, loss=0.0226, lr=2.40e-05, step=1790] Training: 18%|█▊ | 1791/10000 [20:17<1:09:46, 1.96it/s, loss=0.0226, lr=2.40e-05, step=1790] Training: 18%|█▊ | 1791/10000 [20:17<1:09:46, 1.96it/s, loss=0.0300, lr=2.40e-05, step=1791] Training: 18%|█▊ | 1792/10000 [20:18<1:18:37, 1.74it/s, loss=0.0300, lr=2.40e-05, step=1791] Training: 18%|█▊ | 1792/10000 [20:18<1:18:37, 1.74it/s, loss=0.0237, lr=2.40e-05, step=1792] Training: 18%|█▊ | 1793/10000 [20:19<1:21:56, 1.67it/s, loss=0.0237, lr=2.40e-05, step=1792] Training: 18%|█▊ | 1793/10000 [20:19<1:21:56, 1.67it/s, loss=0.0487, lr=2.40e-05, step=1793] Training: 18%|█▊ | 1794/10000 [20:19<1:18:05, 1.75it/s, loss=0.0487, lr=2.40e-05, step=1793] Training: 18%|█▊ | 1794/10000 [20:19<1:18:05, 1.75it/s, loss=0.0116, lr=2.40e-05, step=1794] Training: 18%|█▊ | 1795/10000 [20:20<1:15:08, 1.82it/s, loss=0.0116, lr=2.40e-05, step=1794] Training: 18%|█▊ | 1795/10000 [20:20<1:15:08, 1.82it/s, loss=0.0284, lr=2.40e-05, step=1795] Training: 18%|█▊ | 1796/10000 [20:20<1:12:55, 1.88it/s, loss=0.0284, lr=2.40e-05, step=1795] Training: 18%|█▊ | 1796/10000 [20:20<1:12:55, 1.88it/s, loss=0.0253, lr=2.40e-05, step=1796] Training: 18%|█▊ | 1797/10000 [20:21<1:11:03, 1.92it/s, loss=0.0253, lr=2.40e-05, step=1796] Training: 18%|█▊ | 1797/10000 [20:21<1:11:03, 1.92it/s, loss=0.0393, lr=2.40e-05, step=1797] Training: 18%|█▊ | 1798/10000 [20:21<1:11:07, 1.92it/s, loss=0.0393, lr=2.40e-05, step=1797] Training: 18%|█▊ | 1798/10000 [20:21<1:11:07, 1.92it/s, loss=0.0213, lr=2.40e-05, step=1798] Training: 18%|█▊ | 1799/10000 [20:22<1:18:08, 1.75it/s, loss=0.0213, lr=2.40e-05, step=1798] Training: 18%|█▊ | 1799/10000 [20:22<1:18:08, 1.75it/s, loss=0.0135, lr=2.40e-05, step=1799]16:26:29.191 [I] step=1800 loss=0.1000 smoothed_loss=0.0358 lr=2.40e-05 grad_norm=0.7812 step_time=0.4933s data_time=0.0645s it/s=1.793 eta_to_10000=4572.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0208 grad_action_out_proj=0.2676 grad_shared_expert=0.6731 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1800/10000 [20:22<1:15:51, 1.80it/s, loss=0.0135, lr=2.40e-05, step=1799] Training: 18%|█▊ | 1800/10000 [20:22<1:15:51, 1.80it/s, loss=0.1000, lr=2.40e-05, step=1800] Training: 18%|█▊ | 1801/10000 [20:23<1:13:00, 1.87it/s, loss=0.1000, lr=2.40e-05, step=1800] Training: 18%|█▊ | 1801/10000 [20:23<1:13:00, 1.87it/s, loss=0.0202, lr=2.40e-05, step=1801] Training: 18%|█▊ | 1802/10000 [20:23<1:10:51, 1.93it/s, loss=0.0202, lr=2.40e-05, step=1801] Training: 18%|█▊ | 1802/10000 [20:23<1:10:51, 1.93it/s, loss=0.0066, lr=2.40e-05, step=1802] Training: 18%|█▊ | 1803/10000 [20:24<1:09:43, 1.96it/s, loss=0.0066, lr=2.40e-05, step=1802] Training: 18%|█▊ | 1803/10000 [20:24<1:09:43, 1.96it/s, loss=0.0118, lr=2.40e-05, step=1803] Training: 18%|█▊ | 1804/10000 [20:24<1:08:41, 1.99it/s, loss=0.0118, lr=2.40e-05, step=1803] Training: 18%|█▊ | 1804/10000 [20:24<1:08:41, 1.99it/s, loss=0.0277, lr=2.40e-05, step=1804] Training: 18%|█▊ | 1805/10000 [20:25<1:08:02, 2.01it/s, loss=0.0277, lr=2.40e-05, step=1804] Training: 18%|█▊ | 1805/10000 [20:25<1:08:02, 2.01it/s, loss=0.0170, lr=2.40e-05, step=1805] Training: 18%|█▊ | 1806/10000 [20:25<1:15:59, 1.80it/s, loss=0.0170, lr=2.40e-05, step=1805] Training: 18%|█▊ | 1806/10000 [20:25<1:15:59, 1.80it/s, loss=0.0190, lr=2.40e-05, step=1806] Training: 18%|█▊ | 1807/10000 [20:26<1:13:21, 1.86it/s, loss=0.0190, lr=2.40e-05, step=1806] Training: 18%|█▊ | 1807/10000 [20:26<1:13:21, 1.86it/s, loss=0.0172, lr=2.40e-05, step=1807] Training: 18%|█▊ | 1808/10000 [20:26<1:12:08, 1.89it/s, loss=0.0172, lr=2.40e-05, step=1807] Training: 18%|█▊ | 1808/10000 [20:26<1:12:08, 1.89it/s, loss=0.0294, lr=2.40e-05, step=1808] Training: 18%|█▊ | 1809/10000 [20:27<1:10:28, 1.94it/s, loss=0.0294, lr=2.40e-05, step=1808] Training: 18%|█▊ | 1809/10000 [20:27<1:10:28, 1.94it/s, loss=0.0426, lr=2.40e-05, step=1809]16:26:34.309 [I] step=1810 loss=0.0135 smoothed_loss=0.0267 lr=2.40e-05 grad_norm=0.6283 step_time=0.4506s data_time=0.0611s it/s=1.955 eta_to_10000=4190.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0107 grad_action_out_proj=0.1516 grad_shared_expert=0.5996 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1810/10000 [20:27<1:10:01, 1.95it/s, loss=0.0426, lr=2.40e-05, step=1809] Training: 18%|█▊ | 1810/10000 [20:27<1:10:01, 1.95it/s, loss=0.0135, lr=2.40e-05, step=1810] Training: 18%|█▊ | 1811/10000 [20:28<1:08:42, 1.99it/s, loss=0.0135, lr=2.40e-05, step=1810] Training: 18%|█▊ | 1811/10000 [20:28<1:08:42, 1.99it/s, loss=0.0283, lr=2.40e-05, step=1811] Training: 18%|█▊ | 1812/10000 [20:28<1:07:50, 2.01it/s, loss=0.0283, lr=2.40e-05, step=1811] Training: 18%|█▊ | 1812/10000 [20:28<1:07:50, 2.01it/s, loss=0.0144, lr=2.40e-05, step=1812] Training: 18%|█▊ | 1813/10000 [20:29<1:07:18, 2.03it/s, loss=0.0144, lr=2.40e-05, step=1812] Training: 18%|█▊ | 1813/10000 [20:29<1:07:18, 2.03it/s, loss=0.0137, lr=2.40e-05, step=1813] Training: 18%|█▊ | 1814/10000 [20:29<1:14:44, 1.83it/s, loss=0.0137, lr=2.40e-05, step=1813] Training: 18%|█▊ | 1814/10000 [20:29<1:14:44, 1.83it/s, loss=0.0184, lr=2.40e-05, step=1814] Training: 18%|█▊ | 1815/10000 [20:30<1:13:17, 1.86it/s, loss=0.0184, lr=2.40e-05, step=1814] Training: 18%|█▊ | 1815/10000 [20:30<1:13:17, 1.86it/s, loss=0.0456, lr=2.40e-05, step=1815] Training: 18%|█▊ | 1816/10000 [20:31<1:11:26, 1.91it/s, loss=0.0456, lr=2.40e-05, step=1815] Training: 18%|█▊ | 1816/10000 [20:31<1:11:26, 1.91it/s, loss=0.0451, lr=2.40e-05, step=1816] Training: 18%|█▊ | 1817/10000 [20:31<1:09:34, 1.96it/s, loss=0.0451, lr=2.40e-05, step=1816] Training: 18%|█▊ | 1817/10000 [20:31<1:09:34, 1.96it/s, loss=0.0436, lr=2.40e-05, step=1817] Training: 18%|█▊ | 1818/10000 [20:32<1:12:32, 1.88it/s, loss=0.0436, lr=2.40e-05, step=1817] Training: 18%|█▊ | 1818/10000 [20:32<1:12:32, 1.88it/s, loss=0.0401, lr=2.39e-05, step=1818] Training: 18%|█▊ | 1819/10000 [20:32<1:10:40, 1.93it/s, loss=0.0401, lr=2.39e-05, step=1818] Training: 18%|█▊ | 1819/10000 [20:32<1:10:40, 1.93it/s, loss=0.0403, lr=2.39e-05, step=1819]16:26:39.506 [I] step=1820 loss=0.0142 smoothed_loss=0.0298 lr=2.40e-05 grad_norm=0.8532 step_time=0.4563s data_time=0.0635s it/s=1.924 eta_to_10000=4250.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0130 grad_action_out_proj=0.2040 grad_shared_expert=0.7000 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1820/10000 [20:33<1:10:48, 1.93it/s, loss=0.0403, lr=2.39e-05, step=1819] Training: 18%|█▊ | 1820/10000 [20:33<1:10:48, 1.93it/s, loss=0.0142, lr=2.39e-05, step=1820] Training: 18%|█▊ | 1821/10000 [20:33<1:18:58, 1.73it/s, loss=0.0142, lr=2.39e-05, step=1820] Training: 18%|█▊ | 1821/10000 [20:33<1:18:58, 1.73it/s, loss=0.0162, lr=2.39e-05, step=1821] Training: 18%|█▊ | 1822/10000 [20:34<1:14:51, 1.82it/s, loss=0.0162, lr=2.39e-05, step=1821] Training: 18%|█▊ | 1822/10000 [20:34<1:14:51, 1.82it/s, loss=0.0534, lr=2.39e-05, step=1822] Training: 18%|█▊ | 1823/10000 [20:34<1:12:06, 1.89it/s, loss=0.0534, lr=2.39e-05, step=1822] Training: 18%|█▊ | 1823/10000 [20:34<1:12:06, 1.89it/s, loss=0.0067, lr=2.39e-05, step=1823] Training: 18%|█▊ | 1824/10000 [20:35<1:10:22, 1.94it/s, loss=0.0067, lr=2.39e-05, step=1823] Training: 18%|█▊ | 1824/10000 [20:35<1:10:22, 1.94it/s, loss=0.0387, lr=2.39e-05, step=1824] Training: 18%|█▊ | 1825/10000 [20:35<1:09:09, 1.97it/s, loss=0.0387, lr=2.39e-05, step=1824] Training: 18%|█▊ | 1825/10000 [20:35<1:09:09, 1.97it/s, loss=0.0160, lr=2.39e-05, step=1825] Training: 18%|█▊ | 1826/10000 [20:36<1:08:16, 2.00it/s, loss=0.0160, lr=2.39e-05, step=1825] Training: 18%|█▊ | 1826/10000 [20:36<1:08:16, 2.00it/s, loss=0.0092, lr=2.39e-05, step=1826] Training: 18%|█▊ | 1827/10000 [20:36<1:16:49, 1.77it/s, loss=0.0092, lr=2.39e-05, step=1826] Training: 18%|█▊ | 1827/10000 [20:36<1:16:49, 1.77it/s, loss=0.0118, lr=2.39e-05, step=1827] Training: 18%|█▊ | 1828/10000 [20:37<1:14:09, 1.84it/s, loss=0.0118, lr=2.39e-05, step=1827] Training: 18%|█▊ | 1828/10000 [20:37<1:14:09, 1.84it/s, loss=0.0244, lr=2.39e-05, step=1828] Training: 18%|█▊ | 1829/10000 [20:38<1:19:26, 1.71it/s, loss=0.0244, lr=2.39e-05, step=1828] Training: 18%|█▊ | 1829/10000 [20:38<1:19:26, 1.71it/s, loss=0.0438, lr=2.39e-05, step=1829]16:26:45.072 [I] step=1830 loss=0.0153 smoothed_loss=0.0255 lr=2.39e-05 grad_norm=0.5665 step_time=0.4953s data_time=0.0614s it/s=1.797 eta_to_10000=4546.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.1544 grad_shared_expert=0.4261 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1830/10000 [20:38<1:17:45, 1.75it/s, loss=0.0438, lr=2.39e-05, step=1829] Training: 18%|█▊ | 1830/10000 [20:38<1:17:45, 1.75it/s, loss=0.0153, lr=2.39e-05, step=1830] Training: 18%|█▊ | 1831/10000 [20:39<1:14:11, 1.84it/s, loss=0.0153, lr=2.39e-05, step=1830] Training: 18%|█▊ | 1831/10000 [20:39<1:14:11, 1.84it/s, loss=0.0327, lr=2.39e-05, step=1831] Training: 18%|█▊ | 1832/10000 [20:39<1:12:35, 1.88it/s, loss=0.0327, lr=2.39e-05, step=1831] Training: 18%|█▊ | 1832/10000 [20:39<1:12:35, 1.88it/s, loss=0.0217, lr=2.39e-05, step=1832] Training: 18%|█▊ | 1833/10000 [20:40<1:11:02, 1.92it/s, loss=0.0217, lr=2.39e-05, step=1832] Training: 18%|█▊ | 1833/10000 [20:40<1:11:02, 1.92it/s, loss=0.0200, lr=2.39e-05, step=1833] Training: 18%|█▊ | 1834/10000 [20:40<1:09:41, 1.95it/s, loss=0.0200, lr=2.39e-05, step=1833] Training: 18%|█▊ | 1834/10000 [20:40<1:09:41, 1.95it/s, loss=0.0320, lr=2.39e-05, step=1834] Training: 18%|█▊ | 1835/10000 [20:41<1:18:03, 1.74it/s, loss=0.0320, lr=2.39e-05, step=1834] Training: 18%|█▊ | 1835/10000 [20:41<1:18:03, 1.74it/s, loss=0.0048, lr=2.39e-05, step=1835] Training: 18%|█▊ | 1836/10000 [20:42<1:22:22, 1.65it/s, loss=0.0048, lr=2.39e-05, step=1835] Training: 18%|█▊ | 1836/10000 [20:42<1:22:22, 1.65it/s, loss=0.0554, lr=2.39e-05, step=1836] Training: 18%|█▊ | 1837/10000 [20:42<1:17:49, 1.75it/s, loss=0.0554, lr=2.39e-05, step=1836] Training: 18%|█▊ | 1837/10000 [20:42<1:17:49, 1.75it/s, loss=0.0227, lr=2.39e-05, step=1837] Training: 18%|█▊ | 1838/10000 [20:43<1:14:55, 1.82it/s, loss=0.0227, lr=2.39e-05, step=1837] Training: 18%|█▊ | 1838/10000 [20:43<1:14:55, 1.82it/s, loss=0.0157, lr=2.39e-05, step=1838] Training: 18%|█▊ | 1839/10000 [20:43<1:13:16, 1.86it/s, loss=0.0157, lr=2.39e-05, step=1838] Training: 18%|█▊ | 1839/10000 [20:43<1:13:16, 1.86it/s, loss=0.0131, lr=2.39e-05, step=1839]16:26:50.469 [I] step=1840 loss=0.0245 smoothed_loss=0.0242 lr=2.39e-05 grad_norm=0.6464 step_time=0.4785s data_time=0.0611s it/s=1.853 eta_to_10000=4402.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0100 grad_action_out_proj=0.1735 grad_shared_expert=0.6895 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1840/10000 [20:44<1:12:27, 1.88it/s, loss=0.0131, lr=2.39e-05, step=1839] Training: 18%|█▊ | 1840/10000 [20:44<1:12:27, 1.88it/s, loss=0.0245, lr=2.39e-05, step=1840] Training: 18%|█▊ | 1841/10000 [20:44<1:10:38, 1.92it/s, loss=0.0245, lr=2.39e-05, step=1840] Training: 18%|█▊ | 1841/10000 [20:44<1:10:38, 1.92it/s, loss=0.0101, lr=2.39e-05, step=1841] Training: 18%|█▊ | 1842/10000 [20:45<1:19:16, 1.72it/s, loss=0.0101, lr=2.39e-05, step=1841] Training: 18%|█▊ | 1842/10000 [20:45<1:19:16, 1.72it/s, loss=0.0194, lr=2.39e-05, step=1842] Training: 18%|█▊ | 1843/10000 [20:45<1:24:16, 1.61it/s, loss=0.0194, lr=2.39e-05, step=1842] Training: 18%|█▊ | 1843/10000 [20:45<1:24:16, 1.61it/s, loss=0.0242, lr=2.39e-05, step=1843] Training: 18%|█▊ | 1844/10000 [20:46<1:18:58, 1.72it/s, loss=0.0242, lr=2.39e-05, step=1843] Training: 18%|█▊ | 1844/10000 [20:46<1:18:58, 1.72it/s, loss=0.0297, lr=2.39e-05, step=1844] Training: 18%|█▊ | 1845/10000 [20:46<1:15:09, 1.81it/s, loss=0.0297, lr=2.39e-05, step=1844] Training: 18%|█▊ | 1845/10000 [20:46<1:15:09, 1.81it/s, loss=0.0194, lr=2.39e-05, step=1845] Training: 18%|█▊ | 1846/10000 [20:47<1:12:13, 1.88it/s, loss=0.0194, lr=2.39e-05, step=1845] Training: 18%|█▊ | 1846/10000 [20:47<1:12:13, 1.88it/s, loss=0.0251, lr=2.39e-05, step=1846] Training: 18%|█▊ | 1847/10000 [20:47<1:11:49, 1.89it/s, loss=0.0251, lr=2.39e-05, step=1846] Training: 18%|█▊ | 1847/10000 [20:47<1:11:49, 1.89it/s, loss=0.0362, lr=2.39e-05, step=1847] Training: 18%|█▊ | 1848/10000 [20:48<1:12:06, 1.88it/s, loss=0.0362, lr=2.39e-05, step=1847] Training: 18%|█▊ | 1848/10000 [20:48<1:12:06, 1.88it/s, loss=0.0364, lr=2.39e-05, step=1848] Training: 18%|█▊ | 1849/10000 [20:49<1:25:35, 1.59it/s, loss=0.0364, lr=2.39e-05, step=1848] Training: 18%|█▊ | 1849/10000 [20:49<1:25:35, 1.59it/s, loss=0.0121, lr=2.39e-05, step=1849]16:26:56.314 [I] step=1850 loss=0.0130 smoothed_loss=0.0232 lr=2.39e-05 grad_norm=0.5991 step_time=0.5192s data_time=0.0653s it/s=1.711 eta_to_10000=4763.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0111 grad_action_out_proj=0.1403 grad_shared_expert=0.4448 (10775:train_pytorch.py:850) + Training: 18%|█▊ | 1850/10000 [20:49<1:21:56, 1.66it/s, loss=0.0121, lr=2.39e-05, step=1849] Training: 18%|█▊ | 1850/10000 [20:49<1:21:56, 1.66it/s, loss=0.0130, lr=2.39e-05, step=1850] Training: 19%|█▊ | 1851/10000 [20:50<1:24:45, 1.60it/s, loss=0.0130, lr=2.39e-05, step=1850] Training: 19%|█▊ | 1851/10000 [20:50<1:24:45, 1.60it/s, loss=0.0069, lr=2.39e-05, step=1851] Training: 19%|█▊ | 1852/10000 [20:51<1:20:21, 1.69it/s, loss=0.0069, lr=2.39e-05, step=1851] Training: 19%|█▊ | 1852/10000 [20:51<1:20:21, 1.69it/s, loss=0.0676, lr=2.39e-05, step=1852] Training: 19%|█▊ | 1853/10000 [20:51<1:16:21, 1.78it/s, loss=0.0676, lr=2.39e-05, step=1852] Training: 19%|█▊ | 1853/10000 [20:51<1:16:21, 1.78it/s, loss=0.0189, lr=2.39e-05, step=1853] Training: 19%|█▊ | 1854/10000 [20:52<1:13:44, 1.84it/s, loss=0.0189, lr=2.39e-05, step=1853] Training: 19%|█▊ | 1854/10000 [20:52<1:13:44, 1.84it/s, loss=0.0152, lr=2.39e-05, step=1854] Training: 19%|█▊ | 1855/10000 [20:52<1:12:48, 1.86it/s, loss=0.0152, lr=2.39e-05, step=1854] Training: 19%|█▊ | 1855/10000 [20:52<1:12:48, 1.86it/s, loss=0.0546, lr=2.39e-05, step=1855] Training: 19%|█▊ | 1856/10000 [20:53<1:21:09, 1.67it/s, loss=0.0546, lr=2.39e-05, step=1855] Training: 19%|█▊ | 1856/10000 [20:53<1:21:09, 1.67it/s, loss=0.0282, lr=2.39e-05, step=1856] Training: 19%|█▊ | 1857/10000 [20:53<1:17:12, 1.76it/s, loss=0.0282, lr=2.39e-05, step=1856] Training: 19%|█▊ | 1857/10000 [20:53<1:17:12, 1.76it/s, loss=0.0783, lr=2.39e-05, step=1857] Training: 19%|█▊ | 1858/10000 [20:54<1:21:17, 1.67it/s, loss=0.0783, lr=2.39e-05, step=1857] Training: 19%|█▊ | 1858/10000 [20:54<1:21:17, 1.67it/s, loss=0.0304, lr=2.39e-05, step=1858] Training: 19%|█▊ | 1859/10000 [20:55<1:19:48, 1.70it/s, loss=0.0304, lr=2.39e-05, step=1858] Training: 19%|█▊ | 1859/10000 [20:55<1:19:48, 1.70it/s, loss=0.0390, lr=2.39e-05, step=1859]16:27:02.103 [I] step=1860 loss=0.0165 smoothed_loss=0.0314 lr=2.39e-05 grad_norm=0.6975 step_time=0.5092s data_time=0.0698s it/s=1.728 eta_to_10000=4711.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0169 grad_action_out_proj=0.2051 grad_shared_expert=0.5574 (10775:train_pytorch.py:850) + Training: 19%|█▊ | 1860/10000 [20:55<1:20:48, 1.68it/s, loss=0.0390, lr=2.39e-05, step=1859] Training: 19%|█▊ | 1860/10000 [20:55<1:20:48, 1.68it/s, loss=0.0165, lr=2.39e-05, step=1860] Training: 19%|█▊ | 1861/10000 [20:56<1:16:54, 1.76it/s, loss=0.0165, lr=2.39e-05, step=1860] Training: 19%|█▊ | 1861/10000 [20:56<1:16:54, 1.76it/s, loss=0.0325, lr=2.39e-05, step=1861] Training: 19%|█▊ | 1862/10000 [20:56<1:14:14, 1.83it/s, loss=0.0325, lr=2.39e-05, step=1861] Training: 19%|█▊ | 1862/10000 [20:56<1:14:14, 1.83it/s, loss=0.0476, lr=2.39e-05, step=1862] Training: 19%|█▊ | 1863/10000 [20:57<1:23:28, 1.62it/s, loss=0.0476, lr=2.39e-05, step=1862] Training: 19%|█▊ | 1863/10000 [20:57<1:23:28, 1.62it/s, loss=0.0216, lr=2.39e-05, step=1863] Training: 19%|█▊ | 1864/10000 [20:57<1:20:12, 1.69it/s, loss=0.0216, lr=2.39e-05, step=1863] Training: 19%|█▊ | 1864/10000 [20:57<1:20:12, 1.69it/s, loss=0.0397, lr=2.39e-05, step=1864] Training: 19%|█▊ | 1865/10000 [20:58<1:31:42, 1.48it/s, loss=0.0397, lr=2.39e-05, step=1864] Training: 19%|█▊ | 1865/10000 [20:58<1:31:42, 1.48it/s, loss=0.0233, lr=2.39e-05, step=1865] Training: 19%|█▊ | 1866/10000 [20:59<1:29:44, 1.51it/s, loss=0.0233, lr=2.39e-05, step=1865] Training: 19%|█▊ | 1866/10000 [20:59<1:29:44, 1.51it/s, loss=0.0284, lr=2.39e-05, step=1866] Training: 19%|█▊ | 1867/10000 [20:59<1:22:57, 1.63it/s, loss=0.0284, lr=2.39e-05, step=1866] Training: 19%|█▊ | 1867/10000 [20:59<1:22:57, 1.63it/s, loss=0.0274, lr=2.39e-05, step=1867] Training: 19%|█▊ | 1868/10000 [21:00<1:19:16, 1.71it/s, loss=0.0274, lr=2.39e-05, step=1867] Training: 19%|█▊ | 1868/10000 [21:00<1:19:16, 1.71it/s, loss=0.0370, lr=2.39e-05, step=1868] Training: 19%|█▊ | 1869/10000 [21:00<1:15:40, 1.79it/s, loss=0.0370, lr=2.39e-05, step=1868] Training: 19%|█▊ | 1869/10000 [21:00<1:15:40, 1.79it/s, loss=0.0183, lr=2.39e-05, step=1869]16:27:08.400 [I] step=1870 loss=0.0397 smoothed_loss=0.0312 lr=2.39e-05 grad_norm=0.6255 step_time=0.5446s data_time=0.0851s it/s=1.588 eta_to_10000=5118.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1599 grad_shared_expert=0.7227 (10775:train_pytorch.py:850) + Training: 19%|█▊ | 1870/10000 [21:01<1:32:20, 1.47it/s, loss=0.0183, lr=2.39e-05, step=1869] Training: 19%|█▊ | 1870/10000 [21:01<1:32:20, 1.47it/s, loss=0.0397, lr=2.39e-05, step=1870] Training: 19%|█▊ | 1871/10000 [21:02<1:27:06, 1.56it/s, loss=0.0397, lr=2.39e-05, step=1870] Training: 19%|█▊ | 1871/10000 [21:02<1:27:06, 1.56it/s, loss=0.0542, lr=2.39e-05, step=1871] Training: 19%|█▊ | 1872/10000 [21:03<1:32:19, 1.47it/s, loss=0.0542, lr=2.39e-05, step=1871] Training: 19%|█▊ | 1872/10000 [21:03<1:32:19, 1.47it/s, loss=0.0532, lr=2.39e-05, step=1872] Training: 19%|█▊ | 1873/10000 [21:03<1:27:51, 1.54it/s, loss=0.0532, lr=2.39e-05, step=1872] Training: 19%|█▊ | 1873/10000 [21:03<1:27:51, 1.54it/s, loss=0.0383, lr=2.39e-05, step=1873] Training: 19%|█▊ | 1874/10000 [21:04<1:22:31, 1.64it/s, loss=0.0383, lr=2.39e-05, step=1873] Training: 19%|█▊ | 1874/10000 [21:04<1:22:31, 1.64it/s, loss=0.0365, lr=2.39e-05, step=1874] Training: 19%|█▉ | 1875/10000 [21:04<1:22:07, 1.65it/s, loss=0.0365, lr=2.39e-05, step=1874] Training: 19%|█▉ | 1875/10000 [21:04<1:22:07, 1.65it/s, loss=0.0104, lr=2.39e-05, step=1875] Training: 19%|█▉ | 1876/10000 [21:05<1:22:57, 1.63it/s, loss=0.0104, lr=2.39e-05, step=1875] Training: 19%|█▉ | 1876/10000 [21:05<1:22:57, 1.63it/s, loss=0.0541, lr=2.39e-05, step=1876] Training: 19%|█▉ | 1877/10000 [21:06<1:21:47, 1.66it/s, loss=0.0541, lr=2.39e-05, step=1876] Training: 19%|█▉ | 1877/10000 [21:06<1:21:47, 1.66it/s, loss=0.0291, lr=2.39e-05, step=1877] Training: 19%|█▉ | 1878/10000 [21:06<1:29:38, 1.51it/s, loss=0.0291, lr=2.39e-05, step=1877] Training: 19%|█▉ | 1878/10000 [21:06<1:29:38, 1.51it/s, loss=0.0559, lr=2.39e-05, step=1878] Training: 19%|█▉ | 1879/10000 [21:07<1:35:26, 1.42it/s, loss=0.0559, lr=2.39e-05, step=1878] Training: 19%|█▉ | 1879/10000 [21:07<1:35:26, 1.42it/s, loss=0.0315, lr=2.39e-05, step=1879]16:27:15.037 [I] step=1880 loss=0.0204 smoothed_loss=0.0347 lr=2.39e-05 grad_norm=0.6849 step_time=0.5496s data_time=0.1141s it/s=1.507 eta_to_10000=5388.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0126 grad_action_out_proj=0.1931 grad_shared_expert=1.1871 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1880/10000 [21:08<1:39:39, 1.36it/s, loss=0.0315, lr=2.39e-05, step=1879] Training: 19%|█▉ | 1880/10000 [21:08<1:39:39, 1.36it/s, loss=0.0204, lr=2.39e-05, step=1880] Training: 19%|█▉ | 1881/10000 [21:09<1:29:18, 1.52it/s, loss=0.0204, lr=2.39e-05, step=1880] Training: 19%|█▉ | 1881/10000 [21:09<1:29:18, 1.52it/s, loss=0.0145, lr=2.38e-05, step=1881] Training: 19%|█▉ | 1882/10000 [21:09<1:26:09, 1.57it/s, loss=0.0145, lr=2.38e-05, step=1881] Training: 19%|█▉ | 1882/10000 [21:09<1:26:09, 1.57it/s, loss=0.0133, lr=2.38e-05, step=1882] Training: 19%|█▉ | 1883/10000 [21:10<1:30:45, 1.49it/s, loss=0.0133, lr=2.38e-05, step=1882] Training: 19%|█▉ | 1883/10000 [21:10<1:30:45, 1.49it/s, loss=0.0107, lr=2.38e-05, step=1883] Training: 19%|█▉ | 1884/10000 [21:11<1:28:34, 1.53it/s, loss=0.0107, lr=2.38e-05, step=1883] Training: 19%|█▉ | 1884/10000 [21:11<1:28:34, 1.53it/s, loss=0.0239, lr=2.38e-05, step=1884] Training: 19%|█▉ | 1885/10000 [21:11<1:38:14, 1.38it/s, loss=0.0239, lr=2.38e-05, step=1884] Training: 19%|█▉ | 1885/10000 [21:11<1:38:14, 1.38it/s, loss=0.0248, lr=2.38e-05, step=1885] Training: 19%|█▉ | 1886/10000 [21:12<1:34:46, 1.43it/s, loss=0.0248, lr=2.38e-05, step=1885] Training: 19%|█▉ | 1886/10000 [21:12<1:34:46, 1.43it/s, loss=0.0177, lr=2.38e-05, step=1886] Training: 19%|█▉ | 1887/10000 [21:13<1:32:33, 1.46it/s, loss=0.0177, lr=2.38e-05, step=1886] Training: 19%|█▉ | 1887/10000 [21:13<1:32:33, 1.46it/s, loss=0.0622, lr=2.38e-05, step=1887] Training: 19%|█▉ | 1888/10000 [21:13<1:25:03, 1.59it/s, loss=0.0622, lr=2.38e-05, step=1887] Training: 19%|█▉ | 1888/10000 [21:13<1:25:03, 1.59it/s, loss=0.0202, lr=2.38e-05, step=1888] Training: 19%|█▉ | 1889/10000 [21:14<1:25:24, 1.58it/s, loss=0.0202, lr=2.38e-05, step=1888] Training: 19%|█▉ | 1889/10000 [21:14<1:25:24, 1.58it/s, loss=0.0191, lr=2.38e-05, step=1889]16:27:21.320 [I] step=1890 loss=0.0129 smoothed_loss=0.0268 lr=2.38e-05 grad_norm=0.6174 step_time=0.5175s data_time=0.1107s it/s=1.592 eta_to_10000=5093.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0165 grad_action_out_proj=0.1710 grad_shared_expert=0.3862 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1890/10000 [21:14<1:21:19, 1.66it/s, loss=0.0191, lr=2.38e-05, step=1889] Training: 19%|█▉ | 1890/10000 [21:14<1:21:19, 1.66it/s, loss=0.0129, lr=2.38e-05, step=1890] Training: 19%|█▉ | 1891/10000 [21:15<1:20:17, 1.68it/s, loss=0.0129, lr=2.38e-05, step=1890] Training: 19%|█▉ | 1891/10000 [21:15<1:20:17, 1.68it/s, loss=0.0341, lr=2.38e-05, step=1891] Training: 19%|█▉ | 1892/10000 [21:16<1:26:54, 1.55it/s, loss=0.0341, lr=2.38e-05, step=1891] Training: 19%|█▉ | 1892/10000 [21:16<1:26:54, 1.55it/s, loss=0.0221, lr=2.38e-05, step=1892] Training: 19%|█▉ | 1893/10000 [21:16<1:28:44, 1.52it/s, loss=0.0221, lr=2.38e-05, step=1892] Training: 19%|█▉ | 1893/10000 [21:16<1:28:44, 1.52it/s, loss=0.0163, lr=2.38e-05, step=1893] Training: 19%|█▉ | 1894/10000 [21:17<1:27:43, 1.54it/s, loss=0.0163, lr=2.38e-05, step=1893] Training: 19%|█▉ | 1894/10000 [21:17<1:27:43, 1.54it/s, loss=0.0179, lr=2.38e-05, step=1894] Training: 19%|█▉ | 1895/10000 [21:18<1:24:38, 1.60it/s, loss=0.0179, lr=2.38e-05, step=1894] Training: 19%|█▉ | 1895/10000 [21:18<1:24:38, 1.60it/s, loss=0.0242, lr=2.38e-05, step=1895] Training: 19%|█▉ | 1896/10000 [21:18<1:20:55, 1.67it/s, loss=0.0242, lr=2.38e-05, step=1895] Training: 19%|█▉ | 1896/10000 [21:18<1:20:55, 1.67it/s, loss=0.0705, lr=2.38e-05, step=1896] Training: 19%|█▉ | 1897/10000 [21:19<1:18:52, 1.71it/s, loss=0.0705, lr=2.38e-05, step=1896] Training: 19%|█▉ | 1897/10000 [21:19<1:18:52, 1.71it/s, loss=0.0263, lr=2.38e-05, step=1897] Training: 19%|█▉ | 1898/10000 [21:19<1:15:17, 1.79it/s, loss=0.0263, lr=2.38e-05, step=1897] Training: 19%|█▉ | 1898/10000 [21:19<1:15:17, 1.79it/s, loss=0.0094, lr=2.38e-05, step=1898] Training: 19%|█▉ | 1899/10000 [21:20<1:22:34, 1.64it/s, loss=0.0094, lr=2.38e-05, step=1898] Training: 19%|█▉ | 1899/10000 [21:20<1:22:34, 1.64it/s, loss=0.0370, lr=2.38e-05, step=1899]16:27:27.671 [I] step=1900 loss=0.0123 smoothed_loss=0.0267 lr=2.38e-05 grad_norm=0.6858 step_time=0.5429s data_time=0.0922s it/s=1.575 eta_to_10000=5143.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0113 grad_action_out_proj=0.1903 grad_shared_expert=0.4321 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1900/10000 [21:21<1:30:24, 1.49it/s, loss=0.0370, lr=2.38e-05, step=1899] Training: 19%|█▉ | 1900/10000 [21:21<1:30:24, 1.49it/s, loss=0.0123, lr=2.38e-05, step=1900] Training: 19%|█▉ | 1901/10000 [21:21<1:24:22, 1.60it/s, loss=0.0123, lr=2.38e-05, step=1900] Training: 19%|█▉ | 1901/10000 [21:21<1:24:22, 1.60it/s, loss=0.0474, lr=2.38e-05, step=1901] Training: 19%|█▉ | 1902/10000 [21:22<1:24:57, 1.59it/s, loss=0.0474, lr=2.38e-05, step=1901] Training: 19%|█▉ | 1902/10000 [21:22<1:24:57, 1.59it/s, loss=0.0262, lr=2.38e-05, step=1902] Training: 19%|█▉ | 1903/10000 [21:22<1:20:32, 1.68it/s, loss=0.0262, lr=2.38e-05, step=1902] Training: 19%|█▉ | 1903/10000 [21:22<1:20:32, 1.68it/s, loss=0.0085, lr=2.38e-05, step=1903] Training: 19%|█▉ | 1904/10000 [21:23<1:22:25, 1.64it/s, loss=0.0085, lr=2.38e-05, step=1903] Training: 19%|█▉ | 1904/10000 [21:23<1:22:25, 1.64it/s, loss=0.0120, lr=2.38e-05, step=1904] Training: 19%|█▉ | 1905/10000 [21:24<1:22:37, 1.63it/s, loss=0.0120, lr=2.38e-05, step=1904] Training: 19%|█▉ | 1905/10000 [21:24<1:22:37, 1.63it/s, loss=0.0099, lr=2.38e-05, step=1905] Training: 19%|█▉ | 1906/10000 [21:24<1:29:51, 1.50it/s, loss=0.0099, lr=2.38e-05, step=1905] Training: 19%|█▉ | 1906/10000 [21:24<1:29:51, 1.50it/s, loss=0.1018, lr=2.38e-05, step=1906] Training: 19%|█▉ | 1907/10000 [21:25<1:26:52, 1.55it/s, loss=0.1018, lr=2.38e-05, step=1906] Training: 19%|█▉ | 1907/10000 [21:25<1:26:52, 1.55it/s, loss=0.0566, lr=2.38e-05, step=1907] Training: 19%|█▉ | 1908/10000 [21:26<1:28:51, 1.52it/s, loss=0.0566, lr=2.38e-05, step=1907] Training: 19%|█▉ | 1908/10000 [21:26<1:28:51, 1.52it/s, loss=0.0132, lr=2.38e-05, step=1908] Training: 19%|█▉ | 1909/10000 [21:26<1:27:17, 1.54it/s, loss=0.0132, lr=2.38e-05, step=1908] Training: 19%|█▉ | 1909/10000 [21:26<1:27:17, 1.54it/s, loss=0.0075, lr=2.38e-05, step=1909]16:27:33.865 [I] step=1910 loss=0.0805 smoothed_loss=0.0345 lr=2.38e-05 grad_norm=0.6155 step_time=0.5242s data_time=0.0952s it/s=1.615 eta_to_10000=5010.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0338 grad_action_out_proj=0.2451 grad_shared_expert=0.7462 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1910/10000 [21:27<1:23:33, 1.61it/s, loss=0.0075, lr=2.38e-05, step=1909] Training: 19%|█▉ | 1910/10000 [21:27<1:23:33, 1.61it/s, loss=0.0805, lr=2.38e-05, step=1910] Training: 19%|█▉ | 1911/10000 [21:27<1:19:09, 1.70it/s, loss=0.0805, lr=2.38e-05, step=1910] Training: 19%|█▉ | 1911/10000 [21:27<1:19:09, 1.70it/s, loss=0.0174, lr=2.38e-05, step=1911] Training: 19%|█▉ | 1912/10000 [21:28<1:15:29, 1.79it/s, loss=0.0174, lr=2.38e-05, step=1911] Training: 19%|█▉ | 1912/10000 [21:28<1:15:29, 1.79it/s, loss=0.0063, lr=2.38e-05, step=1912] Training: 19%|█▉ | 1913/10000 [21:29<1:23:38, 1.61it/s, loss=0.0063, lr=2.38e-05, step=1912] Training: 19%|█▉ | 1913/10000 [21:29<1:23:38, 1.61it/s, loss=0.0163, lr=2.38e-05, step=1913] Training: 19%|█▉ | 1914/10000 [21:29<1:19:32, 1.69it/s, loss=0.0163, lr=2.38e-05, step=1913] Training: 19%|█▉ | 1914/10000 [21:29<1:19:32, 1.69it/s, loss=0.0395, lr=2.38e-05, step=1914] Training: 19%|█▉ | 1915/10000 [21:30<1:15:26, 1.79it/s, loss=0.0395, lr=2.38e-05, step=1914] Training: 19%|█▉ | 1915/10000 [21:30<1:15:26, 1.79it/s, loss=0.0312, lr=2.38e-05, step=1915] Training: 19%|█▉ | 1916/10000 [21:30<1:19:32, 1.69it/s, loss=0.0312, lr=2.38e-05, step=1915] Training: 19%|█▉ | 1916/10000 [21:30<1:19:32, 1.69it/s, loss=0.0249, lr=2.38e-05, step=1916] Training: 19%|█▉ | 1917/10000 [21:31<1:15:05, 1.79it/s, loss=0.0249, lr=2.38e-05, step=1916] Training: 19%|█▉ | 1917/10000 [21:31<1:15:05, 1.79it/s, loss=0.0027, lr=2.38e-05, step=1917] Training: 19%|█▉ | 1918/10000 [21:32<1:19:23, 1.70it/s, loss=0.0027, lr=2.38e-05, step=1917] Training: 19%|█▉ | 1918/10000 [21:32<1:19:23, 1.70it/s, loss=0.0432, lr=2.38e-05, step=1918] Training: 19%|█▉ | 1919/10000 [21:32<1:23:03, 1.62it/s, loss=0.0432, lr=2.38e-05, step=1918] Training: 19%|█▉ | 1919/10000 [21:32<1:23:03, 1.62it/s, loss=0.0251, lr=2.38e-05, step=1919]16:27:39.681 [I] step=1920 loss=0.0452 smoothed_loss=0.0298 lr=2.38e-05 grad_norm=0.5698 step_time=0.5087s data_time=0.0728s it/s=1.720 eta_to_10000=4698.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.1483 grad_shared_expert=0.4606 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1920/10000 [21:33<1:20:23, 1.68it/s, loss=0.0251, lr=2.38e-05, step=1919] Training: 19%|█▉ | 1920/10000 [21:33<1:20:23, 1.68it/s, loss=0.0452, lr=2.38e-05, step=1920] Training: 19%|█▉ | 1921/10000 [21:34<1:34:20, 1.43it/s, loss=0.0452, lr=2.38e-05, step=1920] Training: 19%|█▉ | 1921/10000 [21:34<1:34:20, 1.43it/s, loss=0.0561, lr=2.38e-05, step=1921] Training: 19%|█▉ | 1922/10000 [21:34<1:26:38, 1.55it/s, loss=0.0561, lr=2.38e-05, step=1921] Training: 19%|█▉ | 1922/10000 [21:34<1:26:38, 1.55it/s, loss=0.0079, lr=2.38e-05, step=1922] Training: 19%|█▉ | 1923/10000 [21:35<1:28:54, 1.51it/s, loss=0.0079, lr=2.38e-05, step=1922] Training: 19%|█▉ | 1923/10000 [21:35<1:28:54, 1.51it/s, loss=0.0204, lr=2.38e-05, step=1923] Training: 19%|█▉ | 1924/10000 [21:35<1:25:21, 1.58it/s, loss=0.0204, lr=2.38e-05, step=1923] Training: 19%|█▉ | 1924/10000 [21:35<1:25:21, 1.58it/s, loss=0.0594, lr=2.38e-05, step=1924] Training: 19%|█▉ | 1925/10000 [21:36<1:20:49, 1.67it/s, loss=0.0594, lr=2.38e-05, step=1924] Training: 19%|█▉ | 1925/10000 [21:36<1:20:49, 1.67it/s, loss=0.0181, lr=2.38e-05, step=1925] Training: 19%|█▉ | 1926/10000 [21:37<1:28:25, 1.52it/s, loss=0.0181, lr=2.38e-05, step=1925] Training: 19%|█▉ | 1926/10000 [21:37<1:28:25, 1.52it/s, loss=0.0305, lr=2.38e-05, step=1926] Training: 19%|█▉ | 1927/10000 [21:37<1:22:03, 1.64it/s, loss=0.0305, lr=2.38e-05, step=1926] Training: 19%|█▉ | 1927/10000 [21:37<1:22:03, 1.64it/s, loss=0.0248, lr=2.38e-05, step=1927] Training: 19%|█▉ | 1928/10000 [21:38<1:30:05, 1.49it/s, loss=0.0248, lr=2.38e-05, step=1927] Training: 19%|█▉ | 1928/10000 [21:38<1:30:05, 1.49it/s, loss=0.0496, lr=2.38e-05, step=1928] Training: 19%|█▉ | 1929/10000 [21:39<1:22:42, 1.63it/s, loss=0.0496, lr=2.38e-05, step=1928] Training: 19%|█▉ | 1929/10000 [21:39<1:22:42, 1.63it/s, loss=0.0267, lr=2.38e-05, step=1929]16:27:46.486 [I] step=1930 loss=0.0677 smoothed_loss=0.0351 lr=2.38e-05 grad_norm=0.6437 step_time=0.5742s data_time=0.1063s it/s=1.470 eta_to_10000=5491.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0234 grad_action_out_proj=0.1778 grad_shared_expert=0.5921 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1930/10000 [21:40<1:37:09, 1.38it/s, loss=0.0267, lr=2.38e-05, step=1929] Training: 19%|█▉ | 1930/10000 [21:40<1:37:09, 1.38it/s, loss=0.0677, lr=2.38e-05, step=1930] Training: 19%|█▉ | 1931/10000 [21:40<1:44:21, 1.29it/s, loss=0.0677, lr=2.38e-05, step=1930] Training: 19%|█▉ | 1931/10000 [21:40<1:44:21, 1.29it/s, loss=0.0810, lr=2.38e-05, step=1931] Training: 19%|█▉ | 1932/10000 [21:41<1:37:25, 1.38it/s, loss=0.0810, lr=2.38e-05, step=1931] Training: 19%|█▉ | 1932/10000 [21:41<1:37:25, 1.38it/s, loss=0.0102, lr=2.38e-05, step=1932] Training: 19%|█▉ | 1933/10000 [21:42<1:30:44, 1.48it/s, loss=0.0102, lr=2.38e-05, step=1932] Training: 19%|█▉ | 1933/10000 [21:42<1:30:44, 1.48it/s, loss=0.0070, lr=2.38e-05, step=1933] Training: 19%|█▉ | 1934/10000 [21:42<1:31:49, 1.46it/s, loss=0.0070, lr=2.38e-05, step=1933] Training: 19%|█▉ | 1934/10000 [21:42<1:31:49, 1.46it/s, loss=0.0604, lr=2.38e-05, step=1934] Training: 19%|█▉ | 1935/10000 [21:43<1:40:58, 1.33it/s, loss=0.0604, lr=2.38e-05, step=1934] Training: 19%|█▉ | 1935/10000 [21:43<1:40:58, 1.33it/s, loss=0.0309, lr=2.38e-05, step=1935] Training: 19%|█▉ | 1936/10000 [21:44<1:41:51, 1.32it/s, loss=0.0309, lr=2.38e-05, step=1935] Training: 19%|█▉ | 1936/10000 [21:44<1:41:51, 1.32it/s, loss=0.0254, lr=2.38e-05, step=1936] Training: 19%|█▉ | 1937/10000 [21:45<1:41:52, 1.32it/s, loss=0.0254, lr=2.38e-05, step=1936] Training: 19%|█▉ | 1937/10000 [21:45<1:41:52, 1.32it/s, loss=0.0159, lr=2.38e-05, step=1937] Training: 19%|█▉ | 1938/10000 [21:45<1:33:55, 1.43it/s, loss=0.0159, lr=2.38e-05, step=1937] Training: 19%|█▉ | 1938/10000 [21:45<1:33:55, 1.43it/s, loss=0.0385, lr=2.38e-05, step=1938] Training: 19%|█▉ | 1939/10000 [21:46<1:29:33, 1.50it/s, loss=0.0385, lr=2.38e-05, step=1938] Training: 19%|█▉ | 1939/10000 [21:46<1:29:33, 1.50it/s, loss=0.0123, lr=2.38e-05, step=1939]16:27:53.397 [I] step=1940 loss=0.0871 smoothed_loss=0.0370 lr=2.38e-05 grad_norm=0.6687 step_time=0.5716s data_time=0.1194s it/s=1.447 eta_to_10000=5569.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0168 grad_action_out_proj=0.1969 grad_shared_expert=0.6801 (10775:train_pytorch.py:850) + Training: 19%|█▉ | 1940/10000 [21:46<1:24:50, 1.58it/s, loss=0.0123, lr=2.38e-05, step=1939] Training: 19%|█▉ | 1940/10000 [21:46<1:24:50, 1.58it/s, loss=0.0871, lr=2.38e-05, step=1940] Training: 19%|█▉ | 1941/10000 [21:47<1:19:48, 1.68it/s, loss=0.0871, lr=2.38e-05, step=1940] Training: 19%|█▉ | 1941/10000 [21:47<1:19:48, 1.68it/s, loss=0.0427, lr=2.37e-05, step=1941] Training: 19%|█▉ | 1942/10000 [21:48<1:27:00, 1.54it/s, loss=0.0427, lr=2.37e-05, step=1941] Training: 19%|█▉ | 1942/10000 [21:48<1:27:00, 1.54it/s, loss=0.0468, lr=2.37e-05, step=1942] Training: 19%|█▉ | 1943/10000 [21:48<1:20:50, 1.66it/s, loss=0.0468, lr=2.37e-05, step=1942] Training: 19%|█▉ | 1943/10000 [21:48<1:20:50, 1.66it/s, loss=0.0361, lr=2.37e-05, step=1943] Training: 19%|█▉ | 1944/10000 [21:49<1:29:28, 1.50it/s, loss=0.0361, lr=2.37e-05, step=1943] Training: 19%|█▉ | 1944/10000 [21:49<1:29:28, 1.50it/s, loss=0.0733, lr=2.37e-05, step=1944] Training: 19%|█▉ | 1945/10000 [21:50<1:22:10, 1.63it/s, loss=0.0733, lr=2.37e-05, step=1944] Training: 19%|█▉ | 1945/10000 [21:50<1:22:10, 1.63it/s, loss=0.0236, lr=2.37e-05, step=1945] Training: 19%|█▉ | 1946/10000 [21:50<1:18:04, 1.72it/s, loss=0.0236, lr=2.37e-05, step=1945] Training: 19%|█▉ | 1946/10000 [21:50<1:18:04, 1.72it/s, loss=0.0158, lr=2.37e-05, step=1946] Training: 19%|█▉ | 1947/10000 [21:51<1:15:01, 1.79it/s, loss=0.0158, lr=2.37e-05, step=1946] Training: 19%|█▉ | 1947/10000 [21:51<1:15:01, 1.79it/s, loss=0.0317, lr=2.37e-05, step=1947] Training: 19%|█▉ | 1948/10000 [21:51<1:13:26, 1.83it/s, loss=0.0317, lr=2.37e-05, step=1947] Training: 19%|█▉ | 1948/10000 [21:51<1:13:26, 1.83it/s, loss=0.0154, lr=2.37e-05, step=1948] Training: 19%|█▉ | 1949/10000 [21:52<1:19:58, 1.68it/s, loss=0.0154, lr=2.37e-05, step=1948] Training: 19%|█▉ | 1949/10000 [21:52<1:19:58, 1.68it/s, loss=0.0120, lr=2.37e-05, step=1949]16:27:59.340 [I] step=1950 loss=0.0205 smoothed_loss=0.0313 lr=2.37e-05 grad_norm=0.5386 step_time=0.5173s data_time=0.0770s it/s=1.683 eta_to_10000=4783.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0149 grad_action_out_proj=0.1714 grad_shared_expert=0.5170 (10775:train_pytorch.py:850) + Training: 20%|█▉ | 1950/10000 [21:52<1:20:55, 1.66it/s, loss=0.0120, lr=2.37e-05, step=1949] Training: 20%|█▉ | 1950/10000 [21:52<1:20:55, 1.66it/s, loss=0.0205, lr=2.37e-05, step=1950] Training: 20%|█▉ | 1951/10000 [21:53<1:28:45, 1.51it/s, loss=0.0205, lr=2.37e-05, step=1950] Training: 20%|█▉ | 1951/10000 [21:53<1:28:45, 1.51it/s, loss=0.0175, lr=2.37e-05, step=1951] Training: 20%|█▉ | 1952/10000 [21:54<1:28:26, 1.52it/s, loss=0.0175, lr=2.37e-05, step=1951] Training: 20%|█▉ | 1952/10000 [21:54<1:28:26, 1.52it/s, loss=0.0669, lr=2.37e-05, step=1952] Training: 20%|█▉ | 1953/10000 [21:55<1:52:11, 1.20it/s, loss=0.0669, lr=2.37e-05, step=1952] Training: 20%|█▉ | 1953/10000 [21:55<1:52:11, 1.20it/s, loss=0.0738, lr=2.37e-05, step=1953] Training: 20%|█▉ | 1954/10000 [21:56<1:48:39, 1.23it/s, loss=0.0738, lr=2.37e-05, step=1953] Training: 20%|█▉ | 1954/10000 [21:56<1:48:39, 1.23it/s, loss=0.0326, lr=2.37e-05, step=1954] Training: 20%|█▉ | 1955/10000 [21:56<1:36:30, 1.39it/s, loss=0.0326, lr=2.37e-05, step=1954] Training: 20%|█▉ | 1955/10000 [21:56<1:36:30, 1.39it/s, loss=0.0223, lr=2.37e-05, step=1955] Training: 20%|█▉ | 1956/10000 [21:57<1:35:43, 1.40it/s, loss=0.0223, lr=2.37e-05, step=1955] Training: 20%|█▉ | 1956/10000 [21:57<1:35:43, 1.40it/s, loss=0.0555, lr=2.37e-05, step=1956] Training: 20%|█▉ | 1957/10000 [21:58<1:27:47, 1.53it/s, loss=0.0555, lr=2.37e-05, step=1956] Training: 20%|█▉ | 1957/10000 [21:58<1:27:47, 1.53it/s, loss=0.0805, lr=2.37e-05, step=1957] Training: 20%|█▉ | 1958/10000 [21:58<1:32:57, 1.44it/s, loss=0.0805, lr=2.37e-05, step=1957] Training: 20%|█▉ | 1958/10000 [21:58<1:32:57, 1.44it/s, loss=0.0689, lr=2.37e-05, step=1958] Training: 20%|█▉ | 1959/10000 [21:59<1:24:27, 1.59it/s, loss=0.0689, lr=2.37e-05, step=1958] Training: 20%|█▉ | 1959/10000 [21:59<1:24:27, 1.59it/s, loss=0.0411, lr=2.37e-05, step=1959]16:28:06.638 [I] step=1960 loss=0.0229 smoothed_loss=0.0421 lr=2.37e-05 grad_norm=0.6891 step_time=0.6299s data_time=0.0998s it/s=1.371 eta_to_10000=5866.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0199 grad_action_out_proj=0.1706 grad_shared_expert=0.6406 (10775:train_pytorch.py:850) + Training: 20%|█▉ | 1960/10000 [22:00<1:33:28, 1.43it/s, loss=0.0411, lr=2.37e-05, step=1959] Training: 20%|█▉ | 1960/10000 [22:00<1:33:28, 1.43it/s, loss=0.0229, lr=2.37e-05, step=1960] Training: 20%|█▉ | 1961/10000 [22:00<1:26:10, 1.55it/s, loss=0.0229, lr=2.37e-05, step=1960] Training: 20%|█▉ | 1961/10000 [22:00<1:26:10, 1.55it/s, loss=0.0092, lr=2.37e-05, step=1961] Training: 20%|█▉ | 1962/10000 [22:01<1:25:14, 1.57it/s, loss=0.0092, lr=2.37e-05, step=1961] Training: 20%|█▉ | 1962/10000 [22:01<1:25:14, 1.57it/s, loss=0.0467, lr=2.37e-05, step=1962] Training: 20%|█▉ | 1963/10000 [22:01<1:24:41, 1.58it/s, loss=0.0467, lr=2.37e-05, step=1962] Training: 20%|█▉ | 1963/10000 [22:01<1:24:41, 1.58it/s, loss=0.0114, lr=2.37e-05, step=1963] Training: 20%|█▉ | 1964/10000 [22:02<1:30:53, 1.47it/s, loss=0.0114, lr=2.37e-05, step=1963] Training: 20%|█▉ | 1964/10000 [22:02<1:30:53, 1.47it/s, loss=0.0109, lr=2.37e-05, step=1964] Training: 20%|█▉ | 1965/10000 [22:03<1:38:51, 1.35it/s, loss=0.0109, lr=2.37e-05, step=1964] Training: 20%|█▉ | 1965/10000 [22:03<1:38:51, 1.35it/s, loss=0.0151, lr=2.37e-05, step=1965] Training: 20%|█▉ | 1966/10000 [22:04<1:33:48, 1.43it/s, loss=0.0151, lr=2.37e-05, step=1965] Training: 20%|█▉ | 1966/10000 [22:04<1:33:48, 1.43it/s, loss=0.0138, lr=2.37e-05, step=1966] Training: 20%|█▉ | 1967/10000 [22:04<1:29:53, 1.49it/s, loss=0.0138, lr=2.37e-05, step=1966] Training: 20%|█▉ | 1967/10000 [22:04<1:29:53, 1.49it/s, loss=0.0337, lr=2.37e-05, step=1967] Training: 20%|█▉ | 1968/10000 [22:05<1:28:58, 1.50it/s, loss=0.0337, lr=2.37e-05, step=1967] Training: 20%|█▉ | 1968/10000 [22:05<1:28:58, 1.50it/s, loss=0.0173, lr=2.37e-05, step=1968] Training: 20%|█▉ | 1969/10000 [22:06<1:24:00, 1.59it/s, loss=0.0173, lr=2.37e-05, step=1968] Training: 20%|█▉ | 1969/10000 [22:06<1:24:00, 1.59it/s, loss=0.0921, lr=2.37e-05, step=1969]16:28:13.103 [I] step=1970 loss=0.0224 smoothed_loss=0.0344 lr=2.37e-05 grad_norm=0.6636 step_time=0.5480s data_time=0.0985s it/s=1.547 eta_to_10000=5190.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.1630 grad_shared_expert=0.5683 (10775:train_pytorch.py:850) + Training: 20%|█▉ | 1970/10000 [22:06<1:24:17, 1.59it/s, loss=0.0921, lr=2.37e-05, step=1969] Training: 20%|█▉ | 1970/10000 [22:06<1:24:17, 1.59it/s, loss=0.0224, lr=2.37e-05, step=1970] Training: 20%|█▉ | 1971/10000 [22:07<1:29:10, 1.50it/s, loss=0.0224, lr=2.37e-05, step=1970] Training: 20%|█▉ | 1971/10000 [22:07<1:29:10, 1.50it/s, loss=0.0093, lr=2.37e-05, step=1971] Training: 20%|█▉ | 1972/10000 [22:08<1:30:41, 1.48it/s, loss=0.0093, lr=2.37e-05, step=1971] Training: 20%|█▉ | 1972/10000 [22:08<1:30:41, 1.48it/s, loss=0.0279, lr=2.37e-05, step=1972] Training: 20%|█▉ | 1973/10000 [22:08<1:28:30, 1.51it/s, loss=0.0279, lr=2.37e-05, step=1972] Training: 20%|█▉ | 1973/10000 [22:08<1:28:30, 1.51it/s, loss=0.0321, lr=2.37e-05, step=1973] Training: 20%|█▉ | 1974/10000 [22:09<1:22:02, 1.63it/s, loss=0.0321, lr=2.37e-05, step=1973] Training: 20%|█▉ | 1974/10000 [22:09<1:22:02, 1.63it/s, loss=0.0378, lr=2.37e-05, step=1974] Training: 20%|█▉ | 1975/10000 [22:09<1:21:58, 1.63it/s, loss=0.0378, lr=2.37e-05, step=1974] Training: 20%|█▉ | 1975/10000 [22:09<1:21:58, 1.63it/s, loss=0.0344, lr=2.37e-05, step=1975] Training: 20%|█▉ | 1976/10000 [22:10<1:17:34, 1.72it/s, loss=0.0344, lr=2.37e-05, step=1975] Training: 20%|█▉ | 1976/10000 [22:10<1:17:34, 1.72it/s, loss=0.0266, lr=2.37e-05, step=1976] Training: 20%|█▉ | 1977/10000 [22:10<1:19:40, 1.68it/s, loss=0.0266, lr=2.37e-05, step=1976] Training: 20%|█▉ | 1977/10000 [22:10<1:19:40, 1.68it/s, loss=0.0255, lr=2.37e-05, step=1977] Training: 20%|█▉ | 1978/10000 [22:11<1:25:17, 1.57it/s, loss=0.0255, lr=2.37e-05, step=1977] Training: 20%|█▉ | 1978/10000 [22:11<1:25:17, 1.57it/s, loss=0.0165, lr=2.37e-05, step=1978] Training: 20%|█▉ | 1979/10000 [22:12<1:19:22, 1.68it/s, loss=0.0165, lr=2.37e-05, step=1978] Training: 20%|█▉ | 1979/10000 [22:12<1:19:22, 1.68it/s, loss=0.0432, lr=2.37e-05, step=1979]16:28:19.423 [I] step=1980 loss=0.0106 smoothed_loss=0.0290 lr=2.37e-05 grad_norm=0.5852 step_time=0.5521s data_time=0.0799s it/s=1.583 eta_to_10000=5067.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.1635 grad_shared_expert=0.3906 (10775:train_pytorch.py:850) + Training: 20%|█▉ | 1980/10000 [22:12<1:26:11, 1.55it/s, loss=0.0432, lr=2.37e-05, step=1979] Training: 20%|█▉ | 1980/10000 [22:12<1:26:11, 1.55it/s, loss=0.0106, lr=2.37e-05, step=1980] Training: 20%|█▉ | 1981/10000 [22:13<1:20:34, 1.66it/s, loss=0.0106, lr=2.37e-05, step=1980] Training: 20%|█▉ | 1981/10000 [22:13<1:20:34, 1.66it/s, loss=0.0338, lr=2.37e-05, step=1981] Training: 20%|█▉ | 1982/10000 [22:14<1:21:11, 1.65it/s, loss=0.0338, lr=2.37e-05, step=1981] Training: 20%|█▉ | 1982/10000 [22:14<1:21:11, 1.65it/s, loss=0.0383, lr=2.37e-05, step=1982] Training: 20%|█▉ | 1983/10000 [22:14<1:16:09, 1.75it/s, loss=0.0383, lr=2.37e-05, step=1982] Training: 20%|█▉ | 1983/10000 [22:14<1:16:09, 1.75it/s, loss=0.0267, lr=2.37e-05, step=1983] Training: 20%|█▉ | 1984/10000 [22:15<1:12:39, 1.84it/s, loss=0.0267, lr=2.37e-05, step=1983] Training: 20%|█▉ | 1984/10000 [22:15<1:12:39, 1.84it/s, loss=0.0832, lr=2.37e-05, step=1984] Training: 20%|█▉ | 1985/10000 [22:15<1:11:53, 1.86it/s, loss=0.0832, lr=2.37e-05, step=1984] Training: 20%|█▉ | 1985/10000 [22:15<1:11:53, 1.86it/s, loss=0.0178, lr=2.37e-05, step=1985] Training: 20%|█▉ | 1986/10000 [22:16<1:20:44, 1.65it/s, loss=0.0178, lr=2.37e-05, step=1985] Training: 20%|█▉ | 1986/10000 [22:16<1:20:44, 1.65it/s, loss=0.0250, lr=2.37e-05, step=1986] Training: 20%|█▉ | 1987/10000 [22:17<1:26:12, 1.55it/s, loss=0.0250, lr=2.37e-05, step=1986] Training: 20%|█▉ | 1987/10000 [22:17<1:26:12, 1.55it/s, loss=0.0288, lr=2.37e-05, step=1987] Training: 20%|█▉ | 1988/10000 [22:17<1:25:57, 1.55it/s, loss=0.0288, lr=2.37e-05, step=1987] Training: 20%|█▉ | 1988/10000 [22:17<1:25:57, 1.55it/s, loss=0.0477, lr=2.37e-05, step=1988] Training: 20%|█▉ | 1989/10000 [22:18<1:20:15, 1.66it/s, loss=0.0477, lr=2.37e-05, step=1988] Training: 20%|█▉ | 1989/10000 [22:18<1:20:15, 1.66it/s, loss=0.0117, lr=2.37e-05, step=1989]16:28:25.268 [I] step=1990 loss=0.0185 smoothed_loss=0.0303 lr=2.37e-05 grad_norm=0.6477 step_time=0.5018s data_time=0.0827s it/s=1.711 eta_to_10000=4681.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.1832 grad_shared_expert=0.6515 (10775:train_pytorch.py:850) + Training: 20%|█▉ | 1990/10000 [22:18<1:19:47, 1.67it/s, loss=0.0117, lr=2.37e-05, step=1989] Training: 20%|█▉ | 1990/10000 [22:18<1:19:47, 1.67it/s, loss=0.0185, lr=2.37e-05, step=1990] Training: 20%|█▉ | 1991/10000 [22:19<1:15:35, 1.77it/s, loss=0.0185, lr=2.37e-05, step=1990] Training: 20%|█▉ | 1991/10000 [22:19<1:15:35, 1.77it/s, loss=0.0253, lr=2.37e-05, step=1991] Training: 20%|█▉ | 1992/10000 [22:19<1:12:29, 1.84it/s, loss=0.0253, lr=2.37e-05, step=1991] Training: 20%|█▉ | 1992/10000 [22:19<1:12:29, 1.84it/s, loss=0.0273, lr=2.37e-05, step=1992] Training: 20%|█▉ | 1993/10000 [22:20<1:19:22, 1.68it/s, loss=0.0273, lr=2.37e-05, step=1992] Training: 20%|█▉ | 1993/10000 [22:20<1:19:22, 1.68it/s, loss=0.0178, lr=2.37e-05, step=1993] Training: 20%|█▉ | 1994/10000 [22:21<1:15:50, 1.76it/s, loss=0.0178, lr=2.37e-05, step=1993] Training: 20%|█▉ | 1994/10000 [22:21<1:15:50, 1.76it/s, loss=0.0399, lr=2.37e-05, step=1994] Training: 20%|█▉ | 1995/10000 [22:21<1:20:27, 1.66it/s, loss=0.0399, lr=2.37e-05, step=1994] Training: 20%|█▉ | 1995/10000 [22:21<1:20:27, 1.66it/s, loss=0.0098, lr=2.37e-05, step=1995] Training: 20%|█▉ | 1996/10000 [22:22<1:16:07, 1.75it/s, loss=0.0098, lr=2.37e-05, step=1995] Training: 20%|█▉ | 1996/10000 [22:22<1:16:07, 1.75it/s, loss=0.0283, lr=2.37e-05, step=1996] Training: 20%|█▉ | 1997/10000 [22:22<1:12:59, 1.83it/s, loss=0.0283, lr=2.37e-05, step=1996] Training: 20%|█▉ | 1997/10000 [22:22<1:12:59, 1.83it/s, loss=0.0242, lr=2.37e-05, step=1997] Training: 20%|█▉ | 1998/10000 [22:23<1:10:36, 1.89it/s, loss=0.0242, lr=2.37e-05, step=1997] Training: 20%|█▉ | 1998/10000 [22:23<1:10:36, 1.89it/s, loss=0.0243, lr=2.36e-05, step=1998] Training: 20%|█▉ | 1999/10000 [22:23<1:18:13, 1.70it/s, loss=0.0243, lr=2.36e-05, step=1998] Training: 20%|█▉ | 1999/10000 [22:23<1:18:13, 1.70it/s, loss=0.0176, lr=2.36e-05, step=1999]16:28:30.872 [I] step=2000 loss=0.0492 smoothed_loss=0.0284 lr=2.37e-05 grad_norm=0.6437 step_time=0.4982s data_time=0.0622s it/s=1.785 eta_to_10000=4482.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0184 grad_action_out_proj=0.2195 grad_shared_expert=0.8358 (10775:train_pytorch.py:850) +16:30:11.326 [I] Saved checkpoint at step 2000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000 (10775:train_pytorch.py:350) + Training: 20%|██ | 2000/10000 [24:04<68:13:58, 30.70s/it, loss=0.0176, lr=2.36e-05, step=1999] Training: 20%|██ | 2000/10000 [24:04<68:13:58, 30.70s/it, loss=0.0492, lr=2.36e-05, step=2000] Training: 20%|██ | 2001/10000 [24:05<48:27:03, 21.81s/it, loss=0.0492, lr=2.36e-05, step=2000] Training: 20%|██ | 2001/10000 [24:05<48:27:03, 21.81s/it, loss=0.0105, lr=2.36e-05, step=2001] Training: 20%|██ | 2002/10000 [24:06<34:15:55, 15.42s/it, loss=0.0105, lr=2.36e-05, step=2001] Training: 20%|██ | 2002/10000 [24:06<34:15:55, 15.42s/it, loss=0.0492, lr=2.36e-05, step=2002] Training: 20%|██ | 2003/10000 [24:07<24:32:39, 11.05s/it, loss=0.0492, lr=2.36e-05, step=2002] Training: 20%|██ | 2003/10000 [24:07<24:32:39, 11.05s/it, loss=0.0485, lr=2.36e-05, step=2003] Training: 20%|██ | 2004/10000 [24:07<17:33:37, 7.91s/it, loss=0.0485, lr=2.36e-05, step=2003] Training: 20%|██ | 2004/10000 [24:07<17:33:37, 7.91s/it, loss=0.0490, lr=2.36e-05, step=2004] Training: 20%|██ | 2005/10000 [24:08<12:40:10, 5.70s/it, loss=0.0490, lr=2.36e-05, step=2004] Training: 20%|██ | 2005/10000 [24:08<12:40:10, 5.70s/it, loss=0.0047, lr=2.36e-05, step=2005] Training: 20%|██ | 2006/10000 [24:09<9:16:16, 4.18s/it, loss=0.0047, lr=2.36e-05, step=2005] Training: 20%|██ | 2006/10000 [24:09<9:16:16, 4.18s/it, loss=0.0110, lr=2.36e-05, step=2006] Training: 20%|██ | 2007/10000 [24:09<7:02:00, 3.17s/it, loss=0.0110, lr=2.36e-05, step=2006] Training: 20%|██ | 2007/10000 [24:09<7:02:00, 3.17s/it, loss=0.0318, lr=2.36e-05, step=2007] Training: 20%|██ | 2008/10000 [24:10<5:24:17, 2.43s/it, loss=0.0318, lr=2.36e-05, step=2007] Training: 20%|██ | 2008/10000 [24:10<5:24:17, 2.43s/it, loss=0.0122, lr=2.36e-05, step=2008] Training: 20%|██ | 2009/10000 [24:11<4:06:41, 1.85s/it, loss=0.0122, lr=2.36e-05, step=2008] Training: 20%|██ | 2009/10000 [24:11<4:06:41, 1.85s/it, loss=0.0543, lr=2.36e-05, step=2009]16:30:18.252 [I] step=2010 loss=0.0382 smoothed_loss=0.0304 lr=2.36e-05 grad_norm=0.5679 step_time=0.5806s data_time=10.1574s it/s=0.093 eta_to_10000=85796.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0220 grad_action_out_proj=0.2020 grad_shared_expert=0.5490 (10775:train_pytorch.py:850) + Training: 20%|██ | 2010/10000 [24:11<3:21:36, 1.51s/it, loss=0.0543, lr=2.36e-05, step=2009] Training: 20%|██ | 2010/10000 [24:11<3:21:36, 1.51s/it, loss=0.0382, lr=2.36e-05, step=2010] Training: 20%|██ | 2011/10000 [24:12<2:43:17, 1.23s/it, loss=0.0382, lr=2.36e-05, step=2010] Training: 20%|██ | 2011/10000 [24:12<2:43:17, 1.23s/it, loss=0.0323, lr=2.36e-05, step=2011] Training: 20%|██ | 2012/10000 [24:13<2:25:21, 1.09s/it, loss=0.0323, lr=2.36e-05, step=2011] Training: 20%|██ | 2012/10000 [24:13<2:25:21, 1.09s/it, loss=0.0508, lr=2.36e-05, step=2012] Training: 20%|██ | 2013/10000 [24:14<2:21:56, 1.07s/it, loss=0.0508, lr=2.36e-05, step=2012] Training: 20%|██ | 2013/10000 [24:14<2:21:56, 1.07s/it, loss=0.0237, lr=2.36e-05, step=2013] Training: 20%|██ | 2014/10000 [24:15<2:16:05, 1.02s/it, loss=0.0237, lr=2.36e-05, step=2013] Training: 20%|██ | 2014/10000 [24:15<2:16:05, 1.02s/it, loss=0.0114, lr=2.36e-05, step=2014] Training: 20%|██ | 2015/10000 [24:16<2:13:18, 1.00s/it, loss=0.0114, lr=2.36e-05, step=2014] Training: 20%|██ | 2015/10000 [24:16<2:13:18, 1.00s/it, loss=0.0173, lr=2.36e-05, step=2015] Training: 20%|██ | 2016/10000 [24:16<1:57:07, 1.14it/s, loss=0.0173, lr=2.36e-05, step=2015] Training: 20%|██ | 2016/10000 [24:16<1:57:07, 1.14it/s, loss=0.0240, lr=2.36e-05, step=2016] Training: 20%|██ | 2017/10000 [24:17<1:57:46, 1.13it/s, loss=0.0240, lr=2.36e-05, step=2016] Training: 20%|██ | 2017/10000 [24:17<1:57:46, 1.13it/s, loss=0.0263, lr=2.36e-05, step=2017] Training: 20%|██ | 2018/10000 [24:18<1:43:46, 1.28it/s, loss=0.0263, lr=2.36e-05, step=2017] Training: 20%|██ | 2018/10000 [24:18<1:43:46, 1.28it/s, loss=0.0200, lr=2.36e-05, step=2018] Training: 20%|██ | 2019/10000 [24:18<1:32:54, 1.43it/s, loss=0.0200, lr=2.36e-05, step=2018] Training: 20%|██ | 2019/10000 [24:18<1:32:54, 1.43it/s, loss=0.0315, lr=2.36e-05, step=2019]16:30:25.896 [I] step=2020 loss=0.0093 smoothed_loss=0.0257 lr=2.36e-05 grad_norm=0.6440 step_time=0.5986s data_time=0.1658s it/s=1.309 eta_to_10000=6098.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0172 grad_action_out_proj=0.1451 grad_shared_expert=0.6648 (10775:train_pytorch.py:850) + Training: 20%|██ | 2020/10000 [24:19<1:40:40, 1.32it/s, loss=0.0315, lr=2.36e-05, step=2019] Training: 20%|██ | 2020/10000 [24:19<1:40:40, 1.32it/s, loss=0.0093, lr=2.36e-05, step=2020] Training: 20%|██ | 2021/10000 [24:20<1:51:16, 1.20it/s, loss=0.0093, lr=2.36e-05, step=2020] Training: 20%|██ | 2021/10000 [24:20<1:51:16, 1.20it/s, loss=0.0353, lr=2.36e-05, step=2021] Training: 20%|██ | 2022/10000 [24:21<1:53:25, 1.17it/s, loss=0.0353, lr=2.36e-05, step=2021] Training: 20%|██ | 2022/10000 [24:21<1:53:25, 1.17it/s, loss=0.0367, lr=2.36e-05, step=2022] Training: 20%|██ | 2023/10000 [24:22<1:46:25, 1.25it/s, loss=0.0367, lr=2.36e-05, step=2022] Training: 20%|██ | 2023/10000 [24:22<1:46:25, 1.25it/s, loss=0.0363, lr=2.36e-05, step=2023] Training: 20%|██ | 2024/10000 [24:22<1:41:00, 1.32it/s, loss=0.0363, lr=2.36e-05, step=2023] Training: 20%|██ | 2024/10000 [24:22<1:41:00, 1.32it/s, loss=0.0325, lr=2.36e-05, step=2024] Training: 20%|██ | 2025/10000 [24:23<1:35:38, 1.39it/s, loss=0.0325, lr=2.36e-05, step=2024] Training: 20%|██ | 2025/10000 [24:23<1:35:38, 1.39it/s, loss=0.0240, lr=2.36e-05, step=2025] Training: 20%|██ | 2026/10000 [24:24<1:35:21, 1.39it/s, loss=0.0240, lr=2.36e-05, step=2025] Training: 20%|██ | 2026/10000 [24:24<1:35:21, 1.39it/s, loss=0.0097, lr=2.36e-05, step=2026] Training: 20%|██ | 2027/10000 [24:24<1:26:26, 1.54it/s, loss=0.0097, lr=2.36e-05, step=2026] Training: 20%|██ | 2027/10000 [24:24<1:26:26, 1.54it/s, loss=0.0658, lr=2.36e-05, step=2027] Training: 20%|██ | 2028/10000 [24:25<1:31:45, 1.45it/s, loss=0.0658, lr=2.36e-05, step=2027] Training: 20%|██ | 2028/10000 [24:25<1:31:45, 1.45it/s, loss=0.0441, lr=2.36e-05, step=2028] Training: 20%|██ | 2029/10000 [24:25<1:23:44, 1.59it/s, loss=0.0441, lr=2.36e-05, step=2028] Training: 20%|██ | 2029/10000 [24:25<1:23:44, 1.59it/s, loss=0.0274, lr=2.36e-05, step=2029]16:30:32.873 [I] step=2030 loss=0.0254 smoothed_loss=0.0308 lr=2.36e-05 grad_norm=0.5494 step_time=0.5826s data_time=0.1151s it/s=1.433 eta_to_10000=5560.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0202 grad_action_out_proj=0.2381 grad_shared_expert=0.5373 (10775:train_pytorch.py:850) + Training: 20%|██ | 2030/10000 [24:26<1:23:04, 1.60it/s, loss=0.0274, lr=2.36e-05, step=2029] Training: 20%|██ | 2030/10000 [24:26<1:23:04, 1.60it/s, loss=0.0254, lr=2.36e-05, step=2030] Training: 20%|██ | 2031/10000 [24:27<1:39:16, 1.34it/s, loss=0.0254, lr=2.36e-05, step=2030] Training: 20%|██ | 2031/10000 [24:27<1:39:16, 1.34it/s, loss=0.0294, lr=2.36e-05, step=2031] Training: 20%|██ | 2032/10000 [24:28<1:33:50, 1.42it/s, loss=0.0294, lr=2.36e-05, step=2031] Training: 20%|██ | 2032/10000 [24:28<1:33:50, 1.42it/s, loss=0.0092, lr=2.36e-05, step=2032] Training: 20%|██ | 2033/10000 [24:28<1:33:55, 1.41it/s, loss=0.0092, lr=2.36e-05, step=2032] Training: 20%|██ | 2033/10000 [24:28<1:33:55, 1.41it/s, loss=0.0212, lr=2.36e-05, step=2033] Training: 20%|██ | 2034/10000 [24:29<1:36:38, 1.37it/s, loss=0.0212, lr=2.36e-05, step=2033] Training: 20%|██ | 2034/10000 [24:29<1:36:38, 1.37it/s, loss=0.0450, lr=2.36e-05, step=2034] Training: 20%|██ | 2035/10000 [24:30<1:40:12, 1.32it/s, loss=0.0450, lr=2.36e-05, step=2034] Training: 20%|██ | 2035/10000 [24:30<1:40:12, 1.32it/s, loss=0.0109, lr=2.36e-05, step=2035] Training: 20%|██ | 2036/10000 [24:31<1:43:42, 1.28it/s, loss=0.0109, lr=2.36e-05, step=2035] Training: 20%|██ | 2036/10000 [24:31<1:43:42, 1.28it/s, loss=0.0230, lr=2.36e-05, step=2036] Training: 20%|██ | 2037/10000 [24:31<1:36:24, 1.38it/s, loss=0.0230, lr=2.36e-05, step=2036] Training: 20%|██ | 2037/10000 [24:31<1:36:24, 1.38it/s, loss=0.0110, lr=2.36e-05, step=2037] Training: 20%|██ | 2038/10000 [24:32<1:44:38, 1.27it/s, loss=0.0110, lr=2.36e-05, step=2037] Training: 20%|██ | 2038/10000 [24:32<1:44:38, 1.27it/s, loss=0.0405, lr=2.36e-05, step=2038] Training: 20%|██ | 2039/10000 [24:33<1:38:03, 1.35it/s, loss=0.0405, lr=2.36e-05, step=2038] Training: 20%|██ | 2039/10000 [24:33<1:38:03, 1.35it/s, loss=0.0649, lr=2.36e-05, step=2039]16:30:40.372 [I] step=2040 loss=0.0590 smoothed_loss=0.0337 lr=2.36e-05 grad_norm=0.5996 step_time=0.6019s data_time=0.1480s it/s=1.334 eta_to_10000=5968.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0196 grad_action_out_proj=0.2096 grad_shared_expert=0.6978 (10775:train_pytorch.py:850) + Training: 20%|██ | 2040/10000 [24:33<1:30:40, 1.46it/s, loss=0.0649, lr=2.36e-05, step=2039] Training: 20%|██ | 2040/10000 [24:33<1:30:40, 1.46it/s, loss=0.0590, lr=2.36e-05, step=2040] Training: 20%|██ | 2041/10000 [24:34<1:23:46, 1.58it/s, loss=0.0590, lr=2.36e-05, step=2040] Training: 20%|██ | 2041/10000 [24:34<1:23:46, 1.58it/s, loss=0.0339, lr=2.36e-05, step=2041] Training: 20%|██ | 2042/10000 [24:35<1:37:17, 1.36it/s, loss=0.0339, lr=2.36e-05, step=2041] Training: 20%|██ | 2042/10000 [24:35<1:37:17, 1.36it/s, loss=0.0220, lr=2.36e-05, step=2042] Training: 20%|██ | 2043/10000 [24:36<1:35:57, 1.38it/s, loss=0.0220, lr=2.36e-05, step=2042] Training: 20%|██ | 2043/10000 [24:36<1:35:57, 1.38it/s, loss=0.0174, lr=2.36e-05, step=2043] Training: 20%|██ | 2044/10000 [24:36<1:31:18, 1.45it/s, loss=0.0174, lr=2.36e-05, step=2043] Training: 20%|██ | 2044/10000 [24:36<1:31:18, 1.45it/s, loss=0.0378, lr=2.36e-05, step=2044] Training: 20%|██ | 2045/10000 [24:37<1:33:03, 1.42it/s, loss=0.0378, lr=2.36e-05, step=2044] Training: 20%|██ | 2045/10000 [24:37<1:33:03, 1.42it/s, loss=0.0304, lr=2.36e-05, step=2045] Training: 20%|██ | 2046/10000 [24:38<1:35:13, 1.39it/s, loss=0.0304, lr=2.36e-05, step=2045] Training: 20%|██ | 2046/10000 [24:38<1:35:13, 1.39it/s, loss=0.0323, lr=2.36e-05, step=2046] Training: 20%|██ | 2047/10000 [24:38<1:27:30, 1.51it/s, loss=0.0323, lr=2.36e-05, step=2046] Training: 20%|██ | 2047/10000 [24:38<1:27:30, 1.51it/s, loss=0.0275, lr=2.36e-05, step=2047] Training: 20%|██ | 2048/10000 [24:39<1:29:21, 1.48it/s, loss=0.0275, lr=2.36e-05, step=2047] Training: 20%|██ | 2048/10000 [24:39<1:29:21, 1.48it/s, loss=0.0153, lr=2.36e-05, step=2048] Training: 20%|██ | 2049/10000 [24:39<1:23:09, 1.59it/s, loss=0.0153, lr=2.36e-05, step=2048] Training: 20%|██ | 2049/10000 [24:39<1:23:09, 1.59it/s, loss=0.0367, lr=2.36e-05, step=2049]16:30:47.263 [I] step=2050 loss=0.0173 smoothed_loss=0.0290 lr=2.36e-05 grad_norm=0.6206 step_time=0.5851s data_time=0.1040s it/s=1.451 eta_to_10000=5477.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0139 grad_action_out_proj=0.1825 grad_shared_expert=0.6677 (10775:train_pytorch.py:850) + Training: 20%|██ | 2050/10000 [24:40<1:32:27, 1.43it/s, loss=0.0367, lr=2.36e-05, step=2049] Training: 20%|██ | 2050/10000 [24:40<1:32:27, 1.43it/s, loss=0.0173, lr=2.36e-05, step=2050] Training: 21%|██ | 2051/10000 [24:41<1:26:15, 1.54it/s, loss=0.0173, lr=2.36e-05, step=2050] Training: 21%|██ | 2051/10000 [24:41<1:26:15, 1.54it/s, loss=0.0071, lr=2.36e-05, step=2051] Training: 21%|██ | 2052/10000 [24:42<1:27:25, 1.52it/s, loss=0.0071, lr=2.36e-05, step=2051] Training: 21%|██ | 2052/10000 [24:42<1:27:25, 1.52it/s, loss=0.0129, lr=2.36e-05, step=2052] Training: 21%|██ | 2053/10000 [24:42<1:28:18, 1.50it/s, loss=0.0129, lr=2.36e-05, step=2052] Training: 21%|██ | 2053/10000 [24:42<1:28:18, 1.50it/s, loss=0.0285, lr=2.36e-05, step=2053] Training: 21%|██ | 2054/10000 [24:43<1:26:03, 1.54it/s, loss=0.0285, lr=2.36e-05, step=2053] Training: 21%|██ | 2054/10000 [24:43<1:26:03, 1.54it/s, loss=0.0311, lr=2.35e-05, step=2054] Training: 21%|██ | 2055/10000 [24:43<1:20:34, 1.64it/s, loss=0.0311, lr=2.35e-05, step=2054] Training: 21%|██ | 2055/10000 [24:43<1:20:34, 1.64it/s, loss=0.1809, lr=2.35e-05, step=2055] Training: 21%|██ | 2056/10000 [24:44<1:16:00, 1.74it/s, loss=0.1809, lr=2.35e-05, step=2055] Training: 21%|██ | 2056/10000 [24:44<1:16:00, 1.74it/s, loss=0.0435, lr=2.35e-05, step=2056] Training: 21%|██ | 2057/10000 [24:45<1:21:47, 1.62it/s, loss=0.0435, lr=2.35e-05, step=2056] Training: 21%|██ | 2057/10000 [24:45<1:21:47, 1.62it/s, loss=0.0058, lr=2.35e-05, step=2057] Training: 21%|██ | 2058/10000 [24:45<1:17:21, 1.71it/s, loss=0.0058, lr=2.35e-05, step=2057] Training: 21%|██ | 2058/10000 [24:45<1:17:21, 1.71it/s, loss=0.0152, lr=2.35e-05, step=2058] Training: 21%|██ | 2059/10000 [24:46<1:18:31, 1.69it/s, loss=0.0152, lr=2.35e-05, step=2058] Training: 21%|██ | 2059/10000 [24:46<1:18:31, 1.69it/s, loss=0.0167, lr=2.35e-05, step=2059]16:30:53.592 [I] step=2060 loss=0.0315 smoothed_loss=0.0338 lr=2.35e-05 grad_norm=0.5436 step_time=0.5440s data_time=0.0889s it/s=1.580 eta_to_10000=5025.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.2143 grad_shared_expert=0.5517 (10775:train_pytorch.py:850) + Training: 21%|██ | 2060/10000 [24:47<1:33:23, 1.42it/s, loss=0.0167, lr=2.35e-05, step=2059] Training: 21%|██ | 2060/10000 [24:47<1:33:23, 1.42it/s, loss=0.0315, lr=2.35e-05, step=2060] Training: 21%|██ | 2061/10000 [24:47<1:28:12, 1.50it/s, loss=0.0315, lr=2.35e-05, step=2060] Training: 21%|██ | 2061/10000 [24:47<1:28:12, 1.50it/s, loss=0.0178, lr=2.35e-05, step=2061] Training: 21%|██ | 2062/10000 [24:48<1:32:19, 1.43it/s, loss=0.0178, lr=2.35e-05, step=2061] Training: 21%|██ | 2062/10000 [24:48<1:32:19, 1.43it/s, loss=0.0213, lr=2.35e-05, step=2062] Training: 21%|██ | 2063/10000 [24:48<1:23:46, 1.58it/s, loss=0.0213, lr=2.35e-05, step=2062] Training: 21%|██ | 2063/10000 [24:48<1:23:46, 1.58it/s, loss=0.0439, lr=2.35e-05, step=2063] Training: 21%|██ | 2064/10000 [24:49<1:35:04, 1.39it/s, loss=0.0439, lr=2.35e-05, step=2063] Training: 21%|██ | 2064/10000 [24:49<1:35:04, 1.39it/s, loss=0.0360, lr=2.35e-05, step=2064] Training: 21%|██ | 2065/10000 [24:50<1:29:09, 1.48it/s, loss=0.0360, lr=2.35e-05, step=2064] Training: 21%|██ | 2065/10000 [24:50<1:29:09, 1.48it/s, loss=0.0289, lr=2.35e-05, step=2065] Training: 21%|██ | 2066/10000 [24:51<1:25:18, 1.55it/s, loss=0.0289, lr=2.35e-05, step=2065] Training: 21%|██ | 2066/10000 [24:51<1:25:18, 1.55it/s, loss=0.0286, lr=2.35e-05, step=2066] Training: 21%|██ | 2067/10000 [24:51<1:32:46, 1.43it/s, loss=0.0286, lr=2.35e-05, step=2066] Training: 21%|██ | 2067/10000 [24:51<1:32:46, 1.43it/s, loss=0.0295, lr=2.35e-05, step=2067] Training: 21%|██ | 2068/10000 [24:52<1:35:14, 1.39it/s, loss=0.0295, lr=2.35e-05, step=2067] Training: 21%|██ | 2068/10000 [24:52<1:35:14, 1.39it/s, loss=0.0385, lr=2.35e-05, step=2068] Training: 21%|██ | 2069/10000 [24:53<1:38:34, 1.34it/s, loss=0.0385, lr=2.35e-05, step=2068] Training: 21%|██ | 2069/10000 [24:53<1:38:34, 1.34it/s, loss=0.0367, lr=2.35e-05, step=2069]16:31:00.610 [I] step=2070 loss=0.0130 smoothed_loss=0.0309 lr=2.35e-05 grad_norm=0.5679 step_time=0.5527s data_time=0.1491s it/s=1.425 eta_to_10000=5564.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0184 grad_action_out_proj=0.1608 grad_shared_expert=0.4683 (10775:train_pytorch.py:850) + Training: 21%|██ | 2070/10000 [24:54<1:37:33, 1.35it/s, loss=0.0367, lr=2.35e-05, step=2069] Training: 21%|██ | 2070/10000 [24:54<1:37:33, 1.35it/s, loss=0.0130, lr=2.35e-05, step=2070] Training: 21%|██ | 2071/10000 [24:55<1:45:08, 1.26it/s, loss=0.0130, lr=2.35e-05, step=2070] Training: 21%|██ | 2071/10000 [24:55<1:45:08, 1.26it/s, loss=0.0142, lr=2.35e-05, step=2071] Training: 21%|██ | 2072/10000 [24:55<1:45:55, 1.25it/s, loss=0.0142, lr=2.35e-05, step=2071] Training: 21%|██ | 2072/10000 [24:55<1:45:55, 1.25it/s, loss=0.0154, lr=2.35e-05, step=2072] Training: 21%|██ | 2073/10000 [24:56<1:34:29, 1.40it/s, loss=0.0154, lr=2.35e-05, step=2072] Training: 21%|██ | 2073/10000 [24:56<1:34:29, 1.40it/s, loss=0.0171, lr=2.35e-05, step=2073] Training: 21%|██ | 2074/10000 [24:57<1:32:02, 1.44it/s, loss=0.0171, lr=2.35e-05, step=2073] Training: 21%|██ | 2074/10000 [24:57<1:32:02, 1.44it/s, loss=0.0082, lr=2.35e-05, step=2074] Training: 21%|██ | 2075/10000 [24:57<1:31:46, 1.44it/s, loss=0.0082, lr=2.35e-05, step=2074] Training: 21%|██ | 2075/10000 [24:57<1:31:46, 1.44it/s, loss=0.0182, lr=2.35e-05, step=2075] Training: 21%|██ | 2076/10000 [24:58<1:34:52, 1.39it/s, loss=0.0182, lr=2.35e-05, step=2075] Training: 21%|██ | 2076/10000 [24:58<1:34:52, 1.39it/s, loss=0.0141, lr=2.35e-05, step=2076] Training: 21%|██ | 2077/10000 [24:59<1:37:18, 1.36it/s, loss=0.0141, lr=2.35e-05, step=2076] Training: 21%|██ | 2077/10000 [24:59<1:37:18, 1.36it/s, loss=0.0084, lr=2.35e-05, step=2077] Training: 21%|██ | 2078/10000 [25:00<1:47:46, 1.23it/s, loss=0.0084, lr=2.35e-05, step=2077] Training: 21%|██ | 2078/10000 [25:00<1:47:46, 1.23it/s, loss=0.0316, lr=2.35e-05, step=2078] Training: 21%|██ | 2079/10000 [25:00<1:40:46, 1.31it/s, loss=0.0316, lr=2.35e-05, step=2078] Training: 21%|██ | 2079/10000 [25:00<1:40:46, 1.31it/s, loss=0.0143, lr=2.35e-05, step=2079]16:31:08.444 [I] step=2080 loss=0.0113 smoothed_loss=0.0208 lr=2.35e-05 grad_norm=0.5327 step_time=0.6364s data_time=0.1470s it/s=1.277 eta_to_10000=6203.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0105 grad_action_out_proj=0.1735 grad_shared_expert=0.4850 (10775:train_pytorch.py:850) + Training: 21%|██ | 2080/10000 [25:02<1:51:34, 1.18it/s, loss=0.0143, lr=2.35e-05, step=2079] Training: 21%|██ | 2080/10000 [25:02<1:51:34, 1.18it/s, loss=0.0113, lr=2.35e-05, step=2080] Training: 21%|██ | 2081/10000 [25:02<1:44:11, 1.27it/s, loss=0.0113, lr=2.35e-05, step=2080] Training: 21%|██ | 2081/10000 [25:02<1:44:11, 1.27it/s, loss=0.1364, lr=2.35e-05, step=2081] Training: 21%|██ | 2082/10000 [25:03<1:32:28, 1.43it/s, loss=0.1364, lr=2.35e-05, step=2081] Training: 21%|██ | 2082/10000 [25:03<1:32:28, 1.43it/s, loss=0.0427, lr=2.35e-05, step=2082] Training: 21%|██ | 2083/10000 [25:03<1:29:56, 1.47it/s, loss=0.0427, lr=2.35e-05, step=2082] Training: 21%|██ | 2083/10000 [25:03<1:29:56, 1.47it/s, loss=0.0056, lr=2.35e-05, step=2083] Training: 21%|██ | 2084/10000 [25:04<1:32:31, 1.43it/s, loss=0.0056, lr=2.35e-05, step=2083] Training: 21%|██ | 2084/10000 [25:04<1:32:31, 1.43it/s, loss=0.0466, lr=2.35e-05, step=2084] Training: 21%|██ | 2085/10000 [25:05<1:42:50, 1.28it/s, loss=0.0466, lr=2.35e-05, step=2084] Training: 21%|██ | 2085/10000 [25:05<1:42:50, 1.28it/s, loss=0.0381, lr=2.35e-05, step=2085] Training: 21%|██ | 2086/10000 [25:05<1:31:13, 1.45it/s, loss=0.0381, lr=2.35e-05, step=2085] Training: 21%|██ | 2086/10000 [25:05<1:31:13, 1.45it/s, loss=0.0194, lr=2.35e-05, step=2086] Training: 21%|██ | 2087/10000 [25:06<1:30:57, 1.45it/s, loss=0.0194, lr=2.35e-05, step=2086] Training: 21%|██ | 2087/10000 [25:06<1:30:57, 1.45it/s, loss=0.0093, lr=2.35e-05, step=2087] Training: 21%|██ | 2088/10000 [25:07<1:48:22, 1.22it/s, loss=0.0093, lr=2.35e-05, step=2087] Training: 21%|██ | 2088/10000 [25:07<1:48:22, 1.22it/s, loss=0.0334, lr=2.35e-05, step=2088] Training: 21%|██ | 2089/10000 [25:08<1:39:28, 1.33it/s, loss=0.0334, lr=2.35e-05, step=2088] Training: 21%|██ | 2089/10000 [25:08<1:39:28, 1.33it/s, loss=0.0147, lr=2.35e-05, step=2089]16:31:15.901 [I] step=2090 loss=0.0342 smoothed_loss=0.0288 lr=2.35e-05 grad_norm=0.5941 step_time=0.5940s data_time=0.1516s it/s=1.341 eta_to_10000=5897.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0145 grad_action_out_proj=0.1825 grad_shared_expert=0.6360 (10775:train_pytorch.py:850) + Training: 21%|██ | 2090/10000 [25:09<1:51:29, 1.18it/s, loss=0.0147, lr=2.35e-05, step=2089] Training: 21%|██ | 2090/10000 [25:09<1:51:29, 1.18it/s, loss=0.0342, lr=2.35e-05, step=2090] Training: 21%|██ | 2091/10000 [25:10<1:42:51, 1.28it/s, loss=0.0342, lr=2.35e-05, step=2090] Training: 21%|██ | 2091/10000 [25:10<1:42:51, 1.28it/s, loss=0.0138, lr=2.35e-05, step=2091] Training: 21%|██ | 2092/10000 [25:10<1:32:14, 1.43it/s, loss=0.0138, lr=2.35e-05, step=2091] Training: 21%|██ | 2092/10000 [25:10<1:32:14, 1.43it/s, loss=0.0483, lr=2.35e-05, step=2092] Training: 21%|██ | 2093/10000 [25:11<1:34:13, 1.40it/s, loss=0.0483, lr=2.35e-05, step=2092] Training: 21%|██ | 2093/10000 [25:11<1:34:13, 1.40it/s, loss=0.0396, lr=2.35e-05, step=2093] Training: 21%|██ | 2094/10000 [25:12<1:35:05, 1.39it/s, loss=0.0396, lr=2.35e-05, step=2093] Training: 21%|██ | 2094/10000 [25:12<1:35:05, 1.39it/s, loss=0.0318, lr=2.35e-05, step=2094] Training: 21%|██ | 2095/10000 [25:12<1:29:20, 1.47it/s, loss=0.0318, lr=2.35e-05, step=2094] Training: 21%|██ | 2095/10000 [25:12<1:29:20, 1.47it/s, loss=0.0078, lr=2.35e-05, step=2095] Training: 21%|██ | 2096/10000 [25:13<1:28:26, 1.49it/s, loss=0.0078, lr=2.35e-05, step=2095] Training: 21%|██ | 2096/10000 [25:13<1:28:26, 1.49it/s, loss=0.0562, lr=2.35e-05, step=2096] Training: 21%|██ | 2097/10000 [25:14<1:33:19, 1.41it/s, loss=0.0562, lr=2.35e-05, step=2096] Training: 21%|██ | 2097/10000 [25:14<1:33:19, 1.41it/s, loss=0.0319, lr=2.35e-05, step=2097] Training: 21%|██ | 2098/10000 [25:14<1:38:36, 1.34it/s, loss=0.0319, lr=2.35e-05, step=2097] Training: 21%|██ | 2098/10000 [25:14<1:38:36, 1.34it/s, loss=0.0103, lr=2.35e-05, step=2098] Training: 21%|██ | 2099/10000 [25:15<1:33:26, 1.41it/s, loss=0.0103, lr=2.35e-05, step=2098] Training: 21%|██ | 2099/10000 [25:15<1:33:26, 1.41it/s, loss=0.0618, lr=2.35e-05, step=2099]16:31:22.923 [I] step=2100 loss=0.0282 smoothed_loss=0.0319 lr=2.35e-05 grad_norm=0.6509 step_time=0.5850s data_time=0.1171s it/s=1.424 eta_to_10000=5546.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0195 grad_action_out_proj=0.2288 grad_shared_expert=0.6354 (10775:train_pytorch.py:850) + Training: 21%|██ | 2100/10000 [25:16<1:41:09, 1.30it/s, loss=0.0618, lr=2.35e-05, step=2099] Training: 21%|██ | 2100/10000 [25:16<1:41:09, 1.30it/s, loss=0.0282, lr=2.35e-05, step=2100] Training: 21%|██ | 2101/10000 [25:17<1:32:09, 1.43it/s, loss=0.0282, lr=2.35e-05, step=2100] Training: 21%|██ | 2101/10000 [25:17<1:32:09, 1.43it/s, loss=0.0374, lr=2.35e-05, step=2101] Training: 21%|██ | 2102/10000 [25:17<1:35:25, 1.38it/s, loss=0.0374, lr=2.35e-05, step=2101] Training: 21%|██ | 2102/10000 [25:17<1:35:25, 1.38it/s, loss=0.0286, lr=2.35e-05, step=2102] Training: 21%|██ | 2103/10000 [25:18<1:26:16, 1.53it/s, loss=0.0286, lr=2.35e-05, step=2102] Training: 21%|██ | 2103/10000 [25:18<1:26:16, 1.53it/s, loss=0.0687, lr=2.35e-05, step=2103] Training: 21%|██ | 2104/10000 [25:18<1:19:30, 1.66it/s, loss=0.0687, lr=2.35e-05, step=2103] Training: 21%|██ | 2104/10000 [25:18<1:19:30, 1.66it/s, loss=0.0160, lr=2.35e-05, step=2104] Training: 21%|██ | 2105/10000 [25:19<1:16:04, 1.73it/s, loss=0.0160, lr=2.35e-05, step=2104] Training: 21%|██ | 2105/10000 [25:19<1:16:04, 1.73it/s, loss=0.0143, lr=2.35e-05, step=2105] Training: 21%|██ | 2106/10000 [25:19<1:13:01, 1.80it/s, loss=0.0143, lr=2.35e-05, step=2105] Training: 21%|██ | 2106/10000 [25:19<1:13:01, 1.80it/s, loss=0.0216, lr=2.35e-05, step=2106] Training: 21%|██ | 2107/10000 [25:20<1:32:18, 1.42it/s, loss=0.0216, lr=2.35e-05, step=2106] Training: 21%|██ | 2107/10000 [25:20<1:32:18, 1.42it/s, loss=0.0167, lr=2.35e-05, step=2107] Training: 21%|██ | 2108/10000 [25:21<1:24:32, 1.56it/s, loss=0.0167, lr=2.35e-05, step=2107] Training: 21%|██ | 2108/10000 [25:21<1:24:32, 1.56it/s, loss=0.0645, lr=2.34e-05, step=2108] Training: 21%|██ | 2109/10000 [25:22<1:26:21, 1.52it/s, loss=0.0645, lr=2.34e-05, step=2108] Training: 21%|██ | 2109/10000 [25:22<1:26:21, 1.52it/s, loss=0.0213, lr=2.34e-05, step=2109]16:31:28.994 [I] step=2110 loss=0.0206 smoothed_loss=0.0306 lr=2.35e-05 grad_norm=0.6718 step_time=0.5275s data_time=0.0797s it/s=1.647 eta_to_10000=4790.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.1336 grad_shared_expert=0.5533 (10775:train_pytorch.py:850) + Training: 21%|██ | 2110/10000 [25:22<1:20:41, 1.63it/s, loss=0.0213, lr=2.34e-05, step=2109] Training: 21%|██ | 2110/10000 [25:22<1:20:41, 1.63it/s, loss=0.0206, lr=2.34e-05, step=2110] Training: 21%|██ | 2111/10000 [25:23<1:21:52, 1.61it/s, loss=0.0206, lr=2.34e-05, step=2110] Training: 21%|██ | 2111/10000 [25:23<1:21:52, 1.61it/s, loss=0.0233, lr=2.34e-05, step=2111] Training: 21%|██ | 2112/10000 [25:23<1:16:26, 1.72it/s, loss=0.0233, lr=2.34e-05, step=2111] Training: 21%|██ | 2112/10000 [25:23<1:16:26, 1.72it/s, loss=0.0247, lr=2.34e-05, step=2112] Training: 21%|██ | 2113/10000 [25:24<1:12:35, 1.81it/s, loss=0.0247, lr=2.34e-05, step=2112] Training: 21%|██ | 2113/10000 [25:24<1:12:35, 1.81it/s, loss=0.0201, lr=2.34e-05, step=2113] Training: 21%|██ | 2114/10000 [25:24<1:20:16, 1.64it/s, loss=0.0201, lr=2.34e-05, step=2113] Training: 21%|██ | 2114/10000 [25:24<1:20:16, 1.64it/s, loss=0.0190, lr=2.34e-05, step=2114] Training: 21%|██ | 2115/10000 [25:25<1:16:04, 1.73it/s, loss=0.0190, lr=2.34e-05, step=2114] Training: 21%|██ | 2115/10000 [25:25<1:16:04, 1.73it/s, loss=0.0368, lr=2.34e-05, step=2115] Training: 21%|██ | 2116/10000 [25:26<1:21:36, 1.61it/s, loss=0.0368, lr=2.34e-05, step=2115] Training: 21%|██ | 2116/10000 [25:26<1:21:36, 1.61it/s, loss=0.0416, lr=2.34e-05, step=2116] Training: 21%|██ | 2117/10000 [25:26<1:29:52, 1.46it/s, loss=0.0416, lr=2.34e-05, step=2116] Training: 21%|██ | 2117/10000 [25:26<1:29:52, 1.46it/s, loss=0.0139, lr=2.34e-05, step=2117] Training: 21%|██ | 2118/10000 [25:27<1:39:42, 1.32it/s, loss=0.0139, lr=2.34e-05, step=2117] Training: 21%|██ | 2118/10000 [25:27<1:39:42, 1.32it/s, loss=0.0216, lr=2.34e-05, step=2118] Training: 21%|██ | 2119/10000 [25:28<1:51:59, 1.17it/s, loss=0.0216, lr=2.34e-05, step=2118] Training: 21%|██ | 2119/10000 [25:28<1:51:59, 1.17it/s, loss=0.0401, lr=2.34e-05, step=2119]16:31:36.068 [I] step=2120 loss=0.0675 smoothed_loss=0.0326 lr=2.34e-05 grad_norm=0.6110 step_time=0.5851s data_time=0.1223s it/s=1.414 eta_to_10000=5573.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0172 grad_action_out_proj=0.1887 grad_shared_expert=0.4966 (10775:train_pytorch.py:850) + Training: 21%|██ | 2120/10000 [25:29<1:44:08, 1.26it/s, loss=0.0401, lr=2.34e-05, step=2119] Training: 21%|██ | 2120/10000 [25:29<1:44:08, 1.26it/s, loss=0.0675, lr=2.34e-05, step=2120] Training: 21%|██ | 2121/10000 [25:30<1:45:36, 1.24it/s, loss=0.0675, lr=2.34e-05, step=2120] Training: 21%|██ | 2121/10000 [25:30<1:45:36, 1.24it/s, loss=0.0247, lr=2.34e-05, step=2121] Training: 21%|██ | 2122/10000 [25:31<1:42:54, 1.28it/s, loss=0.0247, lr=2.34e-05, step=2121] Training: 21%|██ | 2122/10000 [25:31<1:42:54, 1.28it/s, loss=0.0321, lr=2.34e-05, step=2122] Training: 21%|██ | 2123/10000 [25:31<1:31:59, 1.43it/s, loss=0.0321, lr=2.34e-05, step=2122] Training: 21%|██ | 2123/10000 [25:31<1:31:59, 1.43it/s, loss=0.0330, lr=2.34e-05, step=2123] Training: 21%|██ | 2124/10000 [25:32<1:36:44, 1.36it/s, loss=0.0330, lr=2.34e-05, step=2123] Training: 21%|██ | 2124/10000 [25:32<1:36:44, 1.36it/s, loss=0.0342, lr=2.34e-05, step=2124] Training: 21%|██▏ | 2125/10000 [25:33<1:31:29, 1.43it/s, loss=0.0342, lr=2.34e-05, step=2124] Training: 21%|██▏ | 2125/10000 [25:33<1:31:29, 1.43it/s, loss=0.0156, lr=2.34e-05, step=2125] Training: 21%|██▏ | 2126/10000 [25:33<1:29:46, 1.46it/s, loss=0.0156, lr=2.34e-05, step=2125] Training: 21%|██▏ | 2126/10000 [25:33<1:29:46, 1.46it/s, loss=0.0215, lr=2.34e-05, step=2126] Training: 21%|██▏ | 2127/10000 [25:34<1:22:33, 1.59it/s, loss=0.0215, lr=2.34e-05, step=2126] Training: 21%|██▏ | 2127/10000 [25:34<1:22:33, 1.59it/s, loss=0.0165, lr=2.34e-05, step=2127] Training: 21%|██▏ | 2128/10000 [25:35<1:32:04, 1.42it/s, loss=0.0165, lr=2.34e-05, step=2127] Training: 21%|██▏ | 2128/10000 [25:35<1:32:04, 1.42it/s, loss=0.0130, lr=2.34e-05, step=2128] Training: 21%|██▏ | 2129/10000 [25:35<1:24:20, 1.56it/s, loss=0.0130, lr=2.34e-05, step=2128] Training: 21%|██▏ | 2129/10000 [25:35<1:24:20, 1.56it/s, loss=0.0191, lr=2.34e-05, step=2129]16:31:43.253 [I] step=2130 loss=0.0158 smoothed_loss=0.0250 lr=2.34e-05 grad_norm=0.6806 step_time=0.5729s data_time=0.1456s it/s=1.392 eta_to_10000=5654.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0147 grad_action_out_proj=0.1759 grad_shared_expert=0.5056 (10775:train_pytorch.py:850) + Training: 21%|██▏ | 2130/10000 [25:36<1:44:28, 1.26it/s, loss=0.0191, lr=2.34e-05, step=2129] Training: 21%|██▏ | 2130/10000 [25:36<1:44:28, 1.26it/s, loss=0.0158, lr=2.34e-05, step=2130] Training: 21%|██▏ | 2131/10000 [25:37<1:33:42, 1.40it/s, loss=0.0158, lr=2.34e-05, step=2130] Training: 21%|██▏ | 2131/10000 [25:37<1:33:42, 1.40it/s, loss=0.0618, lr=2.34e-05, step=2131] Training: 21%|██▏ | 2132/10000 [25:38<1:36:37, 1.36it/s, loss=0.0618, lr=2.34e-05, step=2131] Training: 21%|██▏ | 2132/10000 [25:38<1:36:37, 1.36it/s, loss=0.0169, lr=2.34e-05, step=2132] Training: 21%|██▏ | 2133/10000 [25:38<1:30:11, 1.45it/s, loss=0.0169, lr=2.34e-05, step=2132] Training: 21%|██▏ | 2133/10000 [25:38<1:30:11, 1.45it/s, loss=0.0224, lr=2.34e-05, step=2133] Training: 21%|██▏ | 2134/10000 [25:39<1:24:48, 1.55it/s, loss=0.0224, lr=2.34e-05, step=2133] Training: 21%|██▏ | 2134/10000 [25:39<1:24:48, 1.55it/s, loss=0.0190, lr=2.34e-05, step=2134] Training: 21%|██▏ | 2135/10000 [25:39<1:22:43, 1.58it/s, loss=0.0190, lr=2.34e-05, step=2134] Training: 21%|██▏ | 2135/10000 [25:39<1:22:43, 1.58it/s, loss=0.0265, lr=2.34e-05, step=2135] Training: 21%|██▏ | 2136/10000 [25:40<1:30:36, 1.45it/s, loss=0.0265, lr=2.34e-05, step=2135] Training: 21%|██▏ | 2136/10000 [25:40<1:30:36, 1.45it/s, loss=0.0106, lr=2.34e-05, step=2136] Training: 21%|██▏ | 2137/10000 [25:41<1:29:28, 1.46it/s, loss=0.0106, lr=2.34e-05, step=2136] Training: 21%|██▏ | 2137/10000 [25:41<1:29:28, 1.46it/s, loss=0.0093, lr=2.34e-05, step=2137] Training: 21%|██▏ | 2138/10000 [25:41<1:22:10, 1.59it/s, loss=0.0093, lr=2.34e-05, step=2137] Training: 21%|██▏ | 2138/10000 [25:41<1:22:10, 1.59it/s, loss=0.0109, lr=2.34e-05, step=2138] Training: 21%|██▏ | 2139/10000 [25:42<1:24:09, 1.56it/s, loss=0.0109, lr=2.34e-05, step=2138] Training: 21%|██▏ | 2139/10000 [25:42<1:24:09, 1.56it/s, loss=0.0317, lr=2.34e-05, step=2139]16:31:49.491 [I] step=2140 loss=0.0224 smoothed_loss=0.0228 lr=2.34e-05 grad_norm=0.5375 step_time=0.5378s data_time=0.0860s it/s=1.603 eta_to_10000=4902.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0107 grad_action_out_proj=0.1622 grad_shared_expert=0.4928 (10775:train_pytorch.py:850) + Training: 21%|██▏ | 2140/10000 [25:43<1:19:58, 1.64it/s, loss=0.0317, lr=2.34e-05, step=2139] Training: 21%|██▏ | 2140/10000 [25:43<1:19:58, 1.64it/s, loss=0.0224, lr=2.34e-05, step=2140] Training: 21%|██▏ | 2141/10000 [25:43<1:15:05, 1.74it/s, loss=0.0224, lr=2.34e-05, step=2140] Training: 21%|██▏ | 2141/10000 [25:43<1:15:05, 1.74it/s, loss=0.0276, lr=2.34e-05, step=2141] Training: 21%|██▏ | 2142/10000 [25:44<1:15:06, 1.74it/s, loss=0.0276, lr=2.34e-05, step=2141] Training: 21%|██▏ | 2142/10000 [25:44<1:15:06, 1.74it/s, loss=0.0131, lr=2.34e-05, step=2142] Training: 21%|██▏ | 2143/10000 [25:44<1:23:28, 1.57it/s, loss=0.0131, lr=2.34e-05, step=2142] Training: 21%|██▏ | 2143/10000 [25:44<1:23:28, 1.57it/s, loss=0.0262, lr=2.34e-05, step=2143] Training: 21%|██▏ | 2144/10000 [25:45<1:18:36, 1.67it/s, loss=0.0262, lr=2.34e-05, step=2143] Training: 21%|██▏ | 2144/10000 [25:45<1:18:36, 1.67it/s, loss=0.0133, lr=2.34e-05, step=2144] Training: 21%|██▏ | 2145/10000 [25:45<1:14:53, 1.75it/s, loss=0.0133, lr=2.34e-05, step=2144] Training: 21%|██▏ | 2145/10000 [25:45<1:14:53, 1.75it/s, loss=0.0358, lr=2.34e-05, step=2145] Training: 21%|██▏ | 2146/10000 [25:46<1:23:56, 1.56it/s, loss=0.0358, lr=2.34e-05, step=2145] Training: 21%|██▏ | 2146/10000 [25:46<1:23:56, 1.56it/s, loss=0.0322, lr=2.34e-05, step=2146] Training: 21%|██▏ | 2147/10000 [25:47<1:19:33, 1.65it/s, loss=0.0322, lr=2.34e-05, step=2146] Training: 21%|██▏ | 2147/10000 [25:47<1:19:33, 1.65it/s, loss=0.0230, lr=2.34e-05, step=2147] Training: 21%|██▏ | 2148/10000 [25:47<1:15:42, 1.73it/s, loss=0.0230, lr=2.34e-05, step=2147] Training: 21%|██▏ | 2148/10000 [25:47<1:15:42, 1.73it/s, loss=0.0079, lr=2.34e-05, step=2148] Training: 21%|██▏ | 2149/10000 [25:48<1:12:13, 1.81it/s, loss=0.0079, lr=2.34e-05, step=2148] Training: 21%|██▏ | 2149/10000 [25:48<1:12:13, 1.81it/s, loss=0.0411, lr=2.34e-05, step=2149]16:31:55.557 [I] step=2150 loss=0.0173 smoothed_loss=0.0235 lr=2.34e-05 grad_norm=0.5407 step_time=0.5376s data_time=0.0691s it/s=1.649 eta_to_10000=4761.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0202 grad_action_out_proj=0.1988 grad_shared_expert=0.5171 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2150/10000 [25:49<1:24:35, 1.55it/s, loss=0.0411, lr=2.34e-05, step=2149] Training: 22%|██▏ | 2150/10000 [25:49<1:24:35, 1.55it/s, loss=0.0173, lr=2.34e-05, step=2150] Training: 22%|██▏ | 2151/10000 [25:49<1:25:13, 1.54it/s, loss=0.0173, lr=2.34e-05, step=2150] Training: 22%|██▏ | 2151/10000 [25:49<1:25:13, 1.54it/s, loss=0.0273, lr=2.34e-05, step=2151] Training: 22%|██▏ | 2152/10000 [25:50<1:19:40, 1.64it/s, loss=0.0273, lr=2.34e-05, step=2151] Training: 22%|██▏ | 2152/10000 [25:50<1:19:40, 1.64it/s, loss=0.0190, lr=2.34e-05, step=2152] Training: 22%|██▏ | 2153/10000 [25:51<1:34:12, 1.39it/s, loss=0.0190, lr=2.34e-05, step=2152] Training: 22%|██▏ | 2153/10000 [25:51<1:34:12, 1.39it/s, loss=0.0275, lr=2.34e-05, step=2153] Training: 22%|██▏ | 2154/10000 [25:51<1:31:43, 1.43it/s, loss=0.0275, lr=2.34e-05, step=2153] Training: 22%|██▏ | 2154/10000 [25:51<1:31:43, 1.43it/s, loss=0.0171, lr=2.34e-05, step=2154] Training: 22%|██▏ | 2155/10000 [25:52<1:26:01, 1.52it/s, loss=0.0171, lr=2.34e-05, step=2154] Training: 22%|██▏ | 2155/10000 [25:52<1:26:01, 1.52it/s, loss=0.0956, lr=2.34e-05, step=2155] Training: 22%|██▏ | 2156/10000 [25:53<1:27:07, 1.50it/s, loss=0.0956, lr=2.34e-05, step=2155] Training: 22%|██▏ | 2156/10000 [25:53<1:27:07, 1.50it/s, loss=0.0423, lr=2.34e-05, step=2156] Training: 22%|██▏ | 2157/10000 [25:53<1:24:11, 1.55it/s, loss=0.0423, lr=2.34e-05, step=2156] Training: 22%|██▏ | 2157/10000 [25:53<1:24:11, 1.55it/s, loss=0.0183, lr=2.34e-05, step=2157] Training: 22%|██▏ | 2158/10000 [25:54<1:38:58, 1.32it/s, loss=0.0183, lr=2.34e-05, step=2157] Training: 22%|██▏ | 2158/10000 [25:54<1:38:58, 1.32it/s, loss=0.0274, lr=2.34e-05, step=2158] Training: 22%|██▏ | 2159/10000 [25:55<1:47:35, 1.21it/s, loss=0.0274, lr=2.34e-05, step=2158] Training: 22%|██▏ | 2159/10000 [25:55<1:47:35, 1.21it/s, loss=0.0704, lr=2.34e-05, step=2159]16:32:02.712 [I] step=2160 loss=0.0173 smoothed_loss=0.0323 lr=2.34e-05 grad_norm=0.5438 step_time=0.5716s data_time=0.1438s it/s=1.398 eta_to_10000=5608.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0189 grad_action_out_proj=0.1829 grad_shared_expert=0.3900 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2160/10000 [25:56<1:35:19, 1.37it/s, loss=0.0704, lr=2.34e-05, step=2159] Training: 22%|██▏ | 2160/10000 [25:56<1:35:19, 1.37it/s, loss=0.0173, lr=2.33e-05, step=2160] Training: 22%|██▏ | 2161/10000 [25:56<1:29:18, 1.46it/s, loss=0.0173, lr=2.33e-05, step=2160] Training: 22%|██▏ | 2161/10000 [25:56<1:29:18, 1.46it/s, loss=0.0950, lr=2.33e-05, step=2161] Training: 22%|██▏ | 2162/10000 [25:57<1:21:41, 1.60it/s, loss=0.0950, lr=2.33e-05, step=2161] Training: 22%|██▏ | 2162/10000 [25:57<1:21:41, 1.60it/s, loss=0.0112, lr=2.33e-05, step=2162] Training: 22%|██▏ | 2163/10000 [25:58<1:24:24, 1.55it/s, loss=0.0112, lr=2.33e-05, step=2162] Training: 22%|██▏ | 2163/10000 [25:58<1:24:24, 1.55it/s, loss=0.0325, lr=2.33e-05, step=2163] Training: 22%|██▏ | 2164/10000 [25:58<1:18:27, 1.66it/s, loss=0.0325, lr=2.33e-05, step=2163] Training: 22%|██▏ | 2164/10000 [25:58<1:18:27, 1.66it/s, loss=0.0379, lr=2.33e-05, step=2164] Training: 22%|██▏ | 2165/10000 [25:59<1:22:48, 1.58it/s, loss=0.0379, lr=2.33e-05, step=2164] Training: 22%|██▏ | 2165/10000 [25:59<1:22:48, 1.58it/s, loss=0.0134, lr=2.33e-05, step=2165] Training: 22%|██▏ | 2166/10000 [25:59<1:25:40, 1.52it/s, loss=0.0134, lr=2.33e-05, step=2165] Training: 22%|██▏ | 2166/10000 [25:59<1:25:40, 1.52it/s, loss=0.0218, lr=2.33e-05, step=2166] Training: 22%|██▏ | 2167/10000 [26:00<1:18:44, 1.66it/s, loss=0.0218, lr=2.33e-05, step=2166] Training: 22%|██▏ | 2167/10000 [26:00<1:18:44, 1.66it/s, loss=0.0113, lr=2.33e-05, step=2167] Training: 22%|██▏ | 2168/10000 [26:00<1:14:16, 1.76it/s, loss=0.0113, lr=2.33e-05, step=2167] Training: 22%|██▏ | 2168/10000 [26:00<1:14:16, 1.76it/s, loss=0.0150, lr=2.33e-05, step=2168] Training: 22%|██▏ | 2169/10000 [26:01<1:12:34, 1.80it/s, loss=0.0150, lr=2.33e-05, step=2168] Training: 22%|██▏ | 2169/10000 [26:01<1:12:34, 1.80it/s, loss=0.0193, lr=2.33e-05, step=2169]16:32:08.490 [I] step=2170 loss=0.0216 smoothed_loss=0.0272 lr=2.33e-05 grad_norm=0.5226 step_time=0.4996s data_time=0.0782s it/s=1.731 eta_to_10000=4522.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0149 grad_action_out_proj=0.1407 grad_shared_expert=0.5241 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2170/10000 [26:02<1:14:45, 1.75it/s, loss=0.0193, lr=2.33e-05, step=2169] Training: 22%|██▏ | 2170/10000 [26:02<1:14:45, 1.75it/s, loss=0.0216, lr=2.33e-05, step=2170] Training: 22%|██▏ | 2171/10000 [26:03<1:29:21, 1.46it/s, loss=0.0216, lr=2.33e-05, step=2170] Training: 22%|██▏ | 2171/10000 [26:03<1:29:21, 1.46it/s, loss=0.0179, lr=2.33e-05, step=2171] Training: 22%|██▏ | 2172/10000 [26:03<1:29:42, 1.45it/s, loss=0.0179, lr=2.33e-05, step=2171] Training: 22%|██▏ | 2172/10000 [26:03<1:29:42, 1.45it/s, loss=0.0088, lr=2.33e-05, step=2172] Training: 22%|██▏ | 2173/10000 [26:04<1:33:05, 1.40it/s, loss=0.0088, lr=2.33e-05, step=2172] Training: 22%|██▏ | 2173/10000 [26:04<1:33:05, 1.40it/s, loss=0.0574, lr=2.33e-05, step=2173] Training: 22%|██▏ | 2174/10000 [26:05<1:42:57, 1.27it/s, loss=0.0574, lr=2.33e-05, step=2173] Training: 22%|██▏ | 2174/10000 [26:05<1:42:57, 1.27it/s, loss=0.0304, lr=2.33e-05, step=2174] Training: 22%|██▏ | 2175/10000 [26:06<1:43:09, 1.26it/s, loss=0.0304, lr=2.33e-05, step=2174] Training: 22%|██▏ | 2175/10000 [26:06<1:43:09, 1.26it/s, loss=0.0290, lr=2.33e-05, step=2175] Training: 22%|██▏ | 2176/10000 [26:06<1:32:39, 1.41it/s, loss=0.0290, lr=2.33e-05, step=2175] Training: 22%|██▏ | 2176/10000 [26:06<1:32:39, 1.41it/s, loss=0.0113, lr=2.33e-05, step=2176] Training: 22%|██▏ | 2177/10000 [26:07<1:24:08, 1.55it/s, loss=0.0113, lr=2.33e-05, step=2176] Training: 22%|██▏ | 2177/10000 [26:07<1:24:08, 1.55it/s, loss=0.0242, lr=2.33e-05, step=2177] Training: 22%|██▏ | 2178/10000 [26:07<1:18:09, 1.67it/s, loss=0.0242, lr=2.33e-05, step=2177] Training: 22%|██▏ | 2178/10000 [26:07<1:18:09, 1.67it/s, loss=0.0227, lr=2.33e-05, step=2178] Training: 22%|██▏ | 2179/10000 [26:08<1:22:52, 1.57it/s, loss=0.0227, lr=2.33e-05, step=2178] Training: 22%|██▏ | 2179/10000 [26:08<1:22:52, 1.57it/s, loss=0.0244, lr=2.33e-05, step=2179]16:32:15.432 [I] step=2180 loss=0.0112 smoothed_loss=0.0243 lr=2.33e-05 grad_norm=0.6155 step_time=0.5774s data_time=0.1168s it/s=1.442 eta_to_10000=5424.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.1772 grad_shared_expert=0.6362 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2180/10000 [26:08<1:18:51, 1.65it/s, loss=0.0244, lr=2.33e-05, step=2179] Training: 22%|██▏ | 2180/10000 [26:08<1:18:51, 1.65it/s, loss=0.0112, lr=2.33e-05, step=2180] Training: 22%|██▏ | 2181/10000 [26:09<1:29:26, 1.46it/s, loss=0.0112, lr=2.33e-05, step=2180] Training: 22%|██▏ | 2181/10000 [26:09<1:29:26, 1.46it/s, loss=0.0235, lr=2.33e-05, step=2181] Training: 22%|██▏ | 2182/10000 [26:10<1:21:22, 1.60it/s, loss=0.0235, lr=2.33e-05, step=2181] Training: 22%|██▏ | 2182/10000 [26:10<1:21:22, 1.60it/s, loss=0.0220, lr=2.33e-05, step=2182] Training: 22%|██▏ | 2183/10000 [26:10<1:15:45, 1.72it/s, loss=0.0220, lr=2.33e-05, step=2182] Training: 22%|██▏ | 2183/10000 [26:10<1:15:45, 1.72it/s, loss=0.0279, lr=2.33e-05, step=2183] Training: 22%|██▏ | 2184/10000 [26:11<1:21:35, 1.60it/s, loss=0.0279, lr=2.33e-05, step=2183] Training: 22%|██▏ | 2184/10000 [26:11<1:21:35, 1.60it/s, loss=0.0144, lr=2.33e-05, step=2184] Training: 22%|██▏ | 2185/10000 [26:12<1:32:53, 1.40it/s, loss=0.0144, lr=2.33e-05, step=2184] Training: 22%|██▏ | 2185/10000 [26:12<1:32:53, 1.40it/s, loss=0.0097, lr=2.33e-05, step=2185] Training: 22%|██▏ | 2186/10000 [26:13<1:39:21, 1.31it/s, loss=0.0097, lr=2.33e-05, step=2185] Training: 22%|██▏ | 2186/10000 [26:13<1:39:21, 1.31it/s, loss=0.0192, lr=2.33e-05, step=2186] Training: 22%|██▏ | 2187/10000 [26:14<1:35:27, 1.36it/s, loss=0.0192, lr=2.33e-05, step=2186] Training: 22%|██▏ | 2187/10000 [26:14<1:35:27, 1.36it/s, loss=0.1206, lr=2.33e-05, step=2187] Training: 22%|██▏ | 2188/10000 [26:14<1:35:50, 1.36it/s, loss=0.1206, lr=2.33e-05, step=2187] Training: 22%|██▏ | 2188/10000 [26:14<1:35:50, 1.36it/s, loss=0.0251, lr=2.33e-05, step=2188] Training: 22%|██▏ | 2189/10000 [26:15<1:33:46, 1.39it/s, loss=0.0251, lr=2.33e-05, step=2188] Training: 22%|██▏ | 2189/10000 [26:15<1:33:46, 1.39it/s, loss=0.0525, lr=2.33e-05, step=2189]16:32:22.527 [I] step=2190 loss=0.0068 smoothed_loss=0.0305 lr=2.33e-05 grad_norm=0.6114 step_time=0.5740s data_time=0.1355s it/s=1.410 eta_to_10000=5540.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0055 grad_action_out_proj=0.0809 grad_shared_expert=0.3209 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2190/10000 [26:16<1:30:41, 1.44it/s, loss=0.0525, lr=2.33e-05, step=2189] Training: 22%|██▏ | 2190/10000 [26:16<1:30:41, 1.44it/s, loss=0.0068, lr=2.33e-05, step=2190] Training: 22%|██▏ | 2191/10000 [26:16<1:22:43, 1.57it/s, loss=0.0068, lr=2.33e-05, step=2190] Training: 22%|██▏ | 2191/10000 [26:16<1:22:43, 1.57it/s, loss=0.0725, lr=2.33e-05, step=2191] Training: 22%|██▏ | 2192/10000 [26:17<1:17:14, 1.68it/s, loss=0.0725, lr=2.33e-05, step=2191] Training: 22%|██▏ | 2192/10000 [26:17<1:17:14, 1.68it/s, loss=0.0277, lr=2.33e-05, step=2192] Training: 22%|██▏ | 2193/10000 [26:17<1:21:31, 1.60it/s, loss=0.0277, lr=2.33e-05, step=2192] Training: 22%|██▏ | 2193/10000 [26:17<1:21:31, 1.60it/s, loss=0.0396, lr=2.33e-05, step=2193] Training: 22%|██▏ | 2194/10000 [26:18<1:25:06, 1.53it/s, loss=0.0396, lr=2.33e-05, step=2193] Training: 22%|██▏ | 2194/10000 [26:18<1:25:06, 1.53it/s, loss=0.0081, lr=2.33e-05, step=2194] Training: 22%|██▏ | 2195/10000 [26:19<1:31:02, 1.43it/s, loss=0.0081, lr=2.33e-05, step=2194] Training: 22%|██▏ | 2195/10000 [26:19<1:31:02, 1.43it/s, loss=0.0175, lr=2.33e-05, step=2195] Training: 22%|██▏ | 2196/10000 [26:20<1:40:30, 1.29it/s, loss=0.0175, lr=2.33e-05, step=2195] Training: 22%|██▏ | 2196/10000 [26:20<1:40:30, 1.29it/s, loss=0.0264, lr=2.33e-05, step=2196] Training: 22%|██▏ | 2197/10000 [26:21<1:45:57, 1.23it/s, loss=0.0264, lr=2.33e-05, step=2196] Training: 22%|██▏ | 2197/10000 [26:21<1:45:57, 1.23it/s, loss=0.0119, lr=2.33e-05, step=2197] Training: 22%|██▏ | 2198/10000 [26:22<1:47:37, 1.21it/s, loss=0.0119, lr=2.33e-05, step=2197] Training: 22%|██▏ | 2198/10000 [26:22<1:47:37, 1.21it/s, loss=0.0214, lr=2.33e-05, step=2198] Training: 22%|██▏ | 2199/10000 [26:22<1:46:41, 1.22it/s, loss=0.0214, lr=2.33e-05, step=2198] Training: 22%|██▏ | 2199/10000 [26:22<1:46:41, 1.22it/s, loss=0.0090, lr=2.33e-05, step=2199]16:32:30.082 [I] step=2200 loss=0.0282 smoothed_loss=0.0259 lr=2.33e-05 grad_norm=0.5947 step_time=0.5769s data_time=0.1785s it/s=1.324 eta_to_10000=5892.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0244 grad_action_out_proj=0.2706 grad_shared_expert=0.8170 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2200/10000 [26:23<1:46:39, 1.22it/s, loss=0.0090, lr=2.33e-05, step=2199] Training: 22%|██▏ | 2200/10000 [26:23<1:46:39, 1.22it/s, loss=0.0282, lr=2.33e-05, step=2200] Training: 22%|██▏ | 2201/10000 [26:24<1:34:09, 1.38it/s, loss=0.0282, lr=2.33e-05, step=2200] Training: 22%|██▏ | 2201/10000 [26:24<1:34:09, 1.38it/s, loss=0.0103, lr=2.33e-05, step=2201] Training: 22%|██▏ | 2202/10000 [26:24<1:25:34, 1.52it/s, loss=0.0103, lr=2.33e-05, step=2201] Training: 22%|██▏ | 2202/10000 [26:24<1:25:34, 1.52it/s, loss=0.0119, lr=2.33e-05, step=2202] Training: 22%|██▏ | 2203/10000 [26:25<1:26:24, 1.50it/s, loss=0.0119, lr=2.33e-05, step=2202] Training: 22%|██▏ | 2203/10000 [26:25<1:26:24, 1.50it/s, loss=0.0534, lr=2.33e-05, step=2203] Training: 22%|██▏ | 2204/10000 [26:25<1:22:50, 1.57it/s, loss=0.0534, lr=2.33e-05, step=2203] Training: 22%|██▏ | 2204/10000 [26:25<1:22:50, 1.57it/s, loss=0.0423, lr=2.33e-05, step=2204] Training: 22%|██▏ | 2205/10000 [26:26<1:26:15, 1.51it/s, loss=0.0423, lr=2.33e-05, step=2204] Training: 22%|██▏ | 2205/10000 [26:26<1:26:15, 1.51it/s, loss=0.0348, lr=2.33e-05, step=2205] Training: 22%|██▏ | 2206/10000 [26:27<1:22:51, 1.57it/s, loss=0.0348, lr=2.33e-05, step=2205] Training: 22%|██▏ | 2206/10000 [26:27<1:22:51, 1.57it/s, loss=0.0488, lr=2.33e-05, step=2206] Training: 22%|██▏ | 2207/10000 [26:27<1:27:42, 1.48it/s, loss=0.0488, lr=2.33e-05, step=2206] Training: 22%|██▏ | 2207/10000 [26:27<1:27:42, 1.48it/s, loss=0.0320, lr=2.33e-05, step=2207] Training: 22%|██▏ | 2208/10000 [26:28<1:20:22, 1.62it/s, loss=0.0320, lr=2.33e-05, step=2207] Training: 22%|██▏ | 2208/10000 [26:28<1:20:22, 1.62it/s, loss=0.0039, lr=2.33e-05, step=2208] Training: 22%|██▏ | 2209/10000 [26:28<1:15:28, 1.72it/s, loss=0.0039, lr=2.33e-05, step=2208] Training: 22%|██▏ | 2209/10000 [26:28<1:15:28, 1.72it/s, loss=0.0341, lr=2.33e-05, step=2209]16:32:35.910 [I] step=2210 loss=0.0044 smoothed_loss=0.0262 lr=2.33e-05 grad_norm=0.5296 step_time=0.4911s data_time=0.0917s it/s=1.716 eta_to_10000=4539.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0155 grad_action_out_proj=0.1674 grad_shared_expert=0.4876 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2210/10000 [26:29<1:13:14, 1.77it/s, loss=0.0341, lr=2.33e-05, step=2209] Training: 22%|██▏ | 2210/10000 [26:29<1:13:14, 1.77it/s, loss=0.0044, lr=2.33e-05, step=2210] Training: 22%|██▏ | 2211/10000 [26:30<1:18:43, 1.65it/s, loss=0.0044, lr=2.33e-05, step=2210] Training: 22%|██▏ | 2211/10000 [26:30<1:18:43, 1.65it/s, loss=0.0145, lr=2.32e-05, step=2211] Training: 22%|██▏ | 2212/10000 [26:30<1:22:08, 1.58it/s, loss=0.0145, lr=2.32e-05, step=2211] Training: 22%|██▏ | 2212/10000 [26:30<1:22:08, 1.58it/s, loss=0.0068, lr=2.32e-05, step=2212] Training: 22%|██▏ | 2213/10000 [26:31<1:16:31, 1.70it/s, loss=0.0068, lr=2.32e-05, step=2212] Training: 22%|██▏ | 2213/10000 [26:31<1:16:31, 1.70it/s, loss=0.0416, lr=2.32e-05, step=2213] Training: 22%|██▏ | 2214/10000 [26:32<1:35:00, 1.37it/s, loss=0.0416, lr=2.32e-05, step=2213] Training: 22%|██▏ | 2214/10000 [26:32<1:35:00, 1.37it/s, loss=0.0223, lr=2.32e-05, step=2214] Training: 22%|██▏ | 2215/10000 [26:32<1:25:50, 1.51it/s, loss=0.0223, lr=2.32e-05, step=2214] Training: 22%|██▏ | 2215/10000 [26:32<1:25:50, 1.51it/s, loss=0.0244, lr=2.32e-05, step=2215] Training: 22%|██▏ | 2216/10000 [26:33<1:18:42, 1.65it/s, loss=0.0244, lr=2.32e-05, step=2215] Training: 22%|██▏ | 2216/10000 [26:33<1:18:42, 1.65it/s, loss=0.0235, lr=2.32e-05, step=2216] Training: 22%|██▏ | 2217/10000 [26:33<1:16:10, 1.70it/s, loss=0.0235, lr=2.32e-05, step=2216] Training: 22%|██▏ | 2217/10000 [26:33<1:16:10, 1.70it/s, loss=0.0780, lr=2.32e-05, step=2217] Training: 22%|██▏ | 2218/10000 [26:34<1:19:35, 1.63it/s, loss=0.0780, lr=2.32e-05, step=2217] Training: 22%|██▏ | 2218/10000 [26:34<1:19:35, 1.63it/s, loss=0.0281, lr=2.32e-05, step=2218] Training: 22%|██▏ | 2219/10000 [26:35<1:14:55, 1.73it/s, loss=0.0281, lr=2.32e-05, step=2218] Training: 22%|██▏ | 2219/10000 [26:35<1:14:55, 1.73it/s, loss=0.0309, lr=2.32e-05, step=2219]16:32:42.089 [I] step=2220 loss=0.0331 smoothed_loss=0.0302 lr=2.32e-05 grad_norm=0.5944 step_time=0.5252s data_time=0.0928s it/s=1.618 eta_to_10000=4807.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.1986 grad_shared_expert=0.5090 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2220/10000 [26:35<1:13:26, 1.77it/s, loss=0.0309, lr=2.32e-05, step=2219] Training: 22%|██▏ | 2220/10000 [26:35<1:13:26, 1.77it/s, loss=0.0331, lr=2.32e-05, step=2220] Training: 22%|██▏ | 2221/10000 [26:36<1:11:06, 1.82it/s, loss=0.0331, lr=2.32e-05, step=2220] Training: 22%|██▏ | 2221/10000 [26:36<1:11:06, 1.82it/s, loss=0.0116, lr=2.32e-05, step=2221] Training: 22%|██▏ | 2222/10000 [26:36<1:20:07, 1.62it/s, loss=0.0116, lr=2.32e-05, step=2221] Training: 22%|██▏ | 2222/10000 [26:36<1:20:07, 1.62it/s, loss=0.0484, lr=2.32e-05, step=2222] Training: 22%|██▏ | 2223/10000 [26:37<1:15:41, 1.71it/s, loss=0.0484, lr=2.32e-05, step=2222] Training: 22%|██▏ | 2223/10000 [26:37<1:15:41, 1.71it/s, loss=0.0125, lr=2.32e-05, step=2223] Training: 22%|██▏ | 2224/10000 [26:37<1:12:29, 1.79it/s, loss=0.0125, lr=2.32e-05, step=2223] Training: 22%|██▏ | 2224/10000 [26:37<1:12:29, 1.79it/s, loss=0.0215, lr=2.32e-05, step=2224] Training: 22%|██▏ | 2225/10000 [26:38<1:16:45, 1.69it/s, loss=0.0215, lr=2.32e-05, step=2224] Training: 22%|██▏ | 2225/10000 [26:38<1:16:45, 1.69it/s, loss=0.0981, lr=2.32e-05, step=2225] Training: 22%|██▏ | 2226/10000 [26:39<1:16:12, 1.70it/s, loss=0.0981, lr=2.32e-05, step=2225] Training: 22%|██▏ | 2226/10000 [26:39<1:16:12, 1.70it/s, loss=0.0312, lr=2.32e-05, step=2226] Training: 22%|██▏ | 2227/10000 [26:39<1:12:22, 1.79it/s, loss=0.0312, lr=2.32e-05, step=2226] Training: 22%|██▏ | 2227/10000 [26:39<1:12:22, 1.79it/s, loss=0.0184, lr=2.32e-05, step=2227] Training: 22%|██▏ | 2228/10000 [26:40<1:18:59, 1.64it/s, loss=0.0184, lr=2.32e-05, step=2227] Training: 22%|██▏ | 2228/10000 [26:40<1:18:59, 1.64it/s, loss=0.0142, lr=2.32e-05, step=2228] Training: 22%|██▏ | 2229/10000 [26:41<1:22:52, 1.56it/s, loss=0.0142, lr=2.32e-05, step=2228] Training: 22%|██▏ | 2229/10000 [26:41<1:22:52, 1.56it/s, loss=0.0111, lr=2.32e-05, step=2229]16:32:48.075 [I] step=2230 loss=0.0254 smoothed_loss=0.0287 lr=2.32e-05 grad_norm=0.5703 step_time=0.5175s data_time=0.0811s it/s=1.671 eta_to_10000=4650.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.2021 grad_shared_expert=0.5261 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2230/10000 [26:41<1:18:00, 1.66it/s, loss=0.0111, lr=2.32e-05, step=2229] Training: 22%|██▏ | 2230/10000 [26:41<1:18:00, 1.66it/s, loss=0.0254, lr=2.32e-05, step=2230] Training: 22%|██▏ | 2231/10000 [26:42<1:13:38, 1.76it/s, loss=0.0254, lr=2.32e-05, step=2230] Training: 22%|██▏ | 2231/10000 [26:42<1:13:38, 1.76it/s, loss=0.0245, lr=2.32e-05, step=2231] Training: 22%|██▏ | 2232/10000 [26:43<1:29:51, 1.44it/s, loss=0.0245, lr=2.32e-05, step=2231] Training: 22%|██▏ | 2232/10000 [26:43<1:29:51, 1.44it/s, loss=0.0333, lr=2.32e-05, step=2232] Training: 22%|██▏ | 2233/10000 [26:43<1:21:30, 1.59it/s, loss=0.0333, lr=2.32e-05, step=2232] Training: 22%|██▏ | 2233/10000 [26:43<1:21:30, 1.59it/s, loss=0.0138, lr=2.32e-05, step=2233] Training: 22%|██▏ | 2234/10000 [26:44<1:16:46, 1.69it/s, loss=0.0138, lr=2.32e-05, step=2233] Training: 22%|██▏ | 2234/10000 [26:44<1:16:46, 1.69it/s, loss=0.0128, lr=2.32e-05, step=2234] Training: 22%|██▏ | 2235/10000 [26:44<1:19:40, 1.62it/s, loss=0.0128, lr=2.32e-05, step=2234] Training: 22%|██▏ | 2235/10000 [26:44<1:19:40, 1.62it/s, loss=0.0259, lr=2.32e-05, step=2235] Training: 22%|██▏ | 2236/10000 [26:45<1:31:33, 1.41it/s, loss=0.0259, lr=2.32e-05, step=2235] Training: 22%|██▏ | 2236/10000 [26:45<1:31:33, 1.41it/s, loss=0.0488, lr=2.32e-05, step=2236] Training: 22%|██▏ | 2237/10000 [26:46<1:22:49, 1.56it/s, loss=0.0488, lr=2.32e-05, step=2236] Training: 22%|██▏ | 2237/10000 [26:46<1:22:49, 1.56it/s, loss=0.0080, lr=2.32e-05, step=2237] Training: 22%|██▏ | 2238/10000 [26:46<1:24:02, 1.54it/s, loss=0.0080, lr=2.32e-05, step=2237] Training: 22%|██▏ | 2238/10000 [26:46<1:24:02, 1.54it/s, loss=0.0197, lr=2.32e-05, step=2238] Training: 22%|██▏ | 2239/10000 [26:47<1:17:41, 1.66it/s, loss=0.0197, lr=2.32e-05, step=2238] Training: 22%|██▏ | 2239/10000 [26:47<1:17:41, 1.66it/s, loss=0.0073, lr=2.32e-05, step=2239]16:32:54.310 [I] step=2240 loss=0.0303 smoothed_loss=0.0243 lr=2.32e-05 grad_norm=0.5342 step_time=0.5340s data_time=0.0895s it/s=1.604 eta_to_10000=4837.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0125 grad_action_out_proj=0.1539 grad_shared_expert=0.4450 (10775:train_pytorch.py:850) + Training: 22%|██▏ | 2240/10000 [26:47<1:15:19, 1.72it/s, loss=0.0073, lr=2.32e-05, step=2239] Training: 22%|██▏ | 2240/10000 [26:47<1:15:19, 1.72it/s, loss=0.0303, lr=2.32e-05, step=2240] Training: 22%|██▏ | 2241/10000 [26:48<1:11:58, 1.80it/s, loss=0.0303, lr=2.32e-05, step=2240] Training: 22%|██▏ | 2241/10000 [26:48<1:11:58, 1.80it/s, loss=0.0608, lr=2.32e-05, step=2241] Training: 22%|██▏ | 2242/10000 [26:48<1:09:18, 1.87it/s, loss=0.0608, lr=2.32e-05, step=2241] Training: 22%|██▏ | 2242/10000 [26:48<1:09:18, 1.87it/s, loss=0.0186, lr=2.32e-05, step=2242] Training: 22%|██▏ | 2243/10000 [26:49<1:23:43, 1.54it/s, loss=0.0186, lr=2.32e-05, step=2242] Training: 22%|██▏ | 2243/10000 [26:49<1:23:43, 1.54it/s, loss=0.0216, lr=2.32e-05, step=2243] Training: 22%|██▏ | 2244/10000 [26:50<1:25:11, 1.52it/s, loss=0.0216, lr=2.32e-05, step=2243] Training: 22%|██▏ | 2244/10000 [26:50<1:25:11, 1.52it/s, loss=0.0208, lr=2.32e-05, step=2244] Training: 22%|██▏ | 2245/10000 [26:51<1:24:57, 1.52it/s, loss=0.0208, lr=2.32e-05, step=2244] Training: 22%|██▏ | 2245/10000 [26:51<1:24:57, 1.52it/s, loss=0.0176, lr=2.32e-05, step=2245] Training: 22%|██▏ | 2246/10000 [26:51<1:17:56, 1.66it/s, loss=0.0176, lr=2.32e-05, step=2245] Training: 22%|██▏ | 2246/10000 [26:51<1:17:56, 1.66it/s, loss=0.0188, lr=2.32e-05, step=2246] Training: 22%|██▏ | 2247/10000 [26:52<1:19:55, 1.62it/s, loss=0.0188, lr=2.32e-05, step=2246] Training: 22%|██▏ | 2247/10000 [26:52<1:19:55, 1.62it/s, loss=0.0329, lr=2.32e-05, step=2247] Training: 22%|██▏ | 2248/10000 [26:52<1:15:21, 1.71it/s, loss=0.0329, lr=2.32e-05, step=2247] Training: 22%|██▏ | 2248/10000 [26:52<1:15:21, 1.71it/s, loss=0.0313, lr=2.32e-05, step=2248] Training: 22%|██▏ | 2249/10000 [26:53<1:11:16, 1.81it/s, loss=0.0313, lr=2.32e-05, step=2248] Training: 22%|██▏ | 2249/10000 [26:53<1:11:16, 1.81it/s, loss=0.0152, lr=2.32e-05, step=2249]16:33:00.429 [I] step=2250 loss=0.0457 smoothed_loss=0.0269 lr=2.32e-05 grad_norm=0.5327 step_time=0.5319s data_time=0.0801s it/s=1.635 eta_to_10000=4741.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0111 grad_action_out_proj=0.1482 grad_shared_expert=0.4727 (10775:train_pytorch.py:850) + Training: 22%|██▎ | 2250/10000 [26:53<1:20:01, 1.61it/s, loss=0.0152, lr=2.32e-05, step=2249] Training: 22%|██▎ | 2250/10000 [26:53<1:20:01, 1.61it/s, loss=0.0457, lr=2.32e-05, step=2250] Training: 23%|██▎ | 2251/10000 [26:54<1:15:19, 1.71it/s, loss=0.0457, lr=2.32e-05, step=2250] Training: 23%|██▎ | 2251/10000 [26:54<1:15:19, 1.71it/s, loss=0.0399, lr=2.32e-05, step=2251] Training: 23%|██▎ | 2252/10000 [26:54<1:12:13, 1.79it/s, loss=0.0399, lr=2.32e-05, step=2251] Training: 23%|██▎ | 2252/10000 [26:54<1:12:13, 1.79it/s, loss=0.0181, lr=2.32e-05, step=2252] Training: 23%|██▎ | 2253/10000 [26:55<1:16:22, 1.69it/s, loss=0.0181, lr=2.32e-05, step=2252] Training: 23%|██▎ | 2253/10000 [26:55<1:16:22, 1.69it/s, loss=0.0028, lr=2.32e-05, step=2253] Training: 23%|██▎ | 2254/10000 [26:56<1:11:41, 1.80it/s, loss=0.0028, lr=2.32e-05, step=2253] Training: 23%|██▎ | 2254/10000 [26:56<1:11:41, 1.80it/s, loss=0.0697, lr=2.32e-05, step=2254] Training: 23%|██▎ | 2255/10000 [26:56<1:08:27, 1.89it/s, loss=0.0697, lr=2.32e-05, step=2254] Training: 23%|██▎ | 2255/10000 [26:56<1:08:27, 1.89it/s, loss=0.0098, lr=2.32e-05, step=2255] Training: 23%|██▎ | 2256/10000 [26:57<1:06:29, 1.94it/s, loss=0.0098, lr=2.32e-05, step=2255] Training: 23%|██▎ | 2256/10000 [26:57<1:06:29, 1.94it/s, loss=0.0028, lr=2.32e-05, step=2256] Training: 23%|██▎ | 2257/10000 [26:57<1:14:01, 1.74it/s, loss=0.0028, lr=2.32e-05, step=2256] Training: 23%|██▎ | 2257/10000 [26:57<1:14:01, 1.74it/s, loss=0.0159, lr=2.32e-05, step=2257] Training: 23%|██▎ | 2258/10000 [26:58<1:10:52, 1.82it/s, loss=0.0159, lr=2.32e-05, step=2257] Training: 23%|██▎ | 2258/10000 [26:58<1:10:52, 1.82it/s, loss=0.0425, lr=2.32e-05, step=2258] Training: 23%|██▎ | 2259/10000 [26:58<1:08:12, 1.89it/s, loss=0.0425, lr=2.32e-05, step=2258] Training: 23%|██▎ | 2259/10000 [26:58<1:08:12, 1.89it/s, loss=0.0355, lr=2.32e-05, step=2259]16:33:05.907 [I] step=2260 loss=0.0218 smoothed_loss=0.0263 lr=2.32e-05 grad_norm=0.6736 step_time=0.4891s data_time=0.0587s it/s=1.826 eta_to_10000=4239.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0190 grad_action_out_proj=0.1815 grad_shared_expert=0.6866 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2260/10000 [26:59<1:14:58, 1.72it/s, loss=0.0355, lr=2.32e-05, step=2259] Training: 23%|██▎ | 2260/10000 [26:59<1:14:58, 1.72it/s, loss=0.0218, lr=2.31e-05, step=2260] Training: 23%|██▎ | 2261/10000 [26:59<1:11:49, 1.80it/s, loss=0.0218, lr=2.31e-05, step=2260] Training: 23%|██▎ | 2261/10000 [26:59<1:11:49, 1.80it/s, loss=0.0140, lr=2.31e-05, step=2261] Training: 23%|██▎ | 2262/10000 [27:00<1:09:12, 1.86it/s, loss=0.0140, lr=2.31e-05, step=2261] Training: 23%|██▎ | 2262/10000 [27:00<1:09:12, 1.86it/s, loss=0.0353, lr=2.31e-05, step=2262] Training: 23%|██▎ | 2263/10000 [27:00<1:07:23, 1.91it/s, loss=0.0353, lr=2.31e-05, step=2262] Training: 23%|██▎ | 2263/10000 [27:00<1:07:23, 1.91it/s, loss=0.0629, lr=2.31e-05, step=2263] Training: 23%|██▎ | 2264/10000 [27:01<1:05:34, 1.97it/s, loss=0.0629, lr=2.31e-05, step=2263] Training: 23%|██▎ | 2264/10000 [27:01<1:05:34, 1.97it/s, loss=0.0166, lr=2.31e-05, step=2264] Training: 23%|██▎ | 2265/10000 [27:02<1:14:09, 1.74it/s, loss=0.0166, lr=2.31e-05, step=2264] Training: 23%|██▎ | 2265/10000 [27:02<1:14:09, 1.74it/s, loss=0.0217, lr=2.31e-05, step=2265] Training: 23%|██▎ | 2266/10000 [27:02<1:10:44, 1.82it/s, loss=0.0217, lr=2.31e-05, step=2265] Training: 23%|██▎ | 2266/10000 [27:02<1:10:44, 1.82it/s, loss=0.0182, lr=2.31e-05, step=2266] Training: 23%|██▎ | 2267/10000 [27:03<1:08:02, 1.89it/s, loss=0.0182, lr=2.31e-05, step=2266] Training: 23%|██▎ | 2267/10000 [27:03<1:08:02, 1.89it/s, loss=0.0195, lr=2.31e-05, step=2267] Training: 23%|██▎ | 2268/10000 [27:03<1:12:23, 1.78it/s, loss=0.0195, lr=2.31e-05, step=2267] Training: 23%|██▎ | 2268/10000 [27:03<1:12:23, 1.78it/s, loss=0.0464, lr=2.31e-05, step=2268] Training: 23%|██▎ | 2269/10000 [27:04<1:09:17, 1.86it/s, loss=0.0464, lr=2.31e-05, step=2268] Training: 23%|██▎ | 2269/10000 [27:04<1:09:17, 1.86it/s, loss=0.0321, lr=2.31e-05, step=2269]16:33:11.208 [I] step=2270 loss=0.0152 smoothed_loss=0.0272 lr=2.31e-05 grad_norm=0.5726 step_time=0.4721s data_time=0.0580s it/s=1.887 eta_to_10000=4097.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1315 grad_shared_expert=0.4517 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2270/10000 [27:04<1:08:50, 1.87it/s, loss=0.0321, lr=2.31e-05, step=2269] Training: 23%|██▎ | 2270/10000 [27:04<1:08:50, 1.87it/s, loss=0.0152, lr=2.31e-05, step=2270] Training: 23%|██▎ | 2271/10000 [27:05<1:06:53, 1.93it/s, loss=0.0152, lr=2.31e-05, step=2270] Training: 23%|██▎ | 2271/10000 [27:05<1:06:53, 1.93it/s, loss=0.0280, lr=2.31e-05, step=2271] Training: 23%|██▎ | 2272/10000 [27:05<1:14:11, 1.74it/s, loss=0.0280, lr=2.31e-05, step=2271] Training: 23%|██▎ | 2272/10000 [27:05<1:14:11, 1.74it/s, loss=0.0093, lr=2.31e-05, step=2272] Training: 23%|██▎ | 2273/10000 [27:06<1:10:17, 1.83it/s, loss=0.0093, lr=2.31e-05, step=2272] Training: 23%|██▎ | 2273/10000 [27:06<1:10:17, 1.83it/s, loss=0.0299, lr=2.31e-05, step=2273] Training: 23%|██▎ | 2274/10000 [27:06<1:07:54, 1.90it/s, loss=0.0299, lr=2.31e-05, step=2273] Training: 23%|██▎ | 2274/10000 [27:06<1:07:54, 1.90it/s, loss=0.0106, lr=2.31e-05, step=2274] Training: 23%|██▎ | 2275/10000 [27:07<1:05:50, 1.96it/s, loss=0.0106, lr=2.31e-05, step=2274] Training: 23%|██▎ | 2275/10000 [27:07<1:05:50, 1.96it/s, loss=0.0130, lr=2.31e-05, step=2275] Training: 23%|██▎ | 2276/10000 [27:08<1:11:13, 1.81it/s, loss=0.0130, lr=2.31e-05, step=2275] Training: 23%|██▎ | 2276/10000 [27:08<1:11:13, 1.81it/s, loss=0.0196, lr=2.31e-05, step=2276] Training: 23%|██▎ | 2277/10000 [27:08<1:08:16, 1.89it/s, loss=0.0196, lr=2.31e-05, step=2276] Training: 23%|██▎ | 2277/10000 [27:08<1:08:16, 1.89it/s, loss=0.0155, lr=2.31e-05, step=2277] Training: 23%|██▎ | 2278/10000 [27:09<1:06:10, 1.94it/s, loss=0.0155, lr=2.31e-05, step=2277] Training: 23%|██▎ | 2278/10000 [27:09<1:06:10, 1.94it/s, loss=0.0293, lr=2.31e-05, step=2278] Training: 23%|██▎ | 2279/10000 [27:09<1:13:46, 1.74it/s, loss=0.0293, lr=2.31e-05, step=2278] Training: 23%|██▎ | 2279/10000 [27:09<1:13:46, 1.74it/s, loss=0.0242, lr=2.31e-05, step=2279]16:33:16.669 [I] step=2280 loss=0.0517 smoothed_loss=0.0259 lr=2.31e-05 grad_norm=0.5919 step_time=0.4891s data_time=0.0570s it/s=1.832 eta_to_10000=4214.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0225 grad_action_out_proj=0.2166 grad_shared_expert=0.6041 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2280/10000 [27:10<1:11:40, 1.80it/s, loss=0.0242, lr=2.31e-05, step=2279] Training: 23%|██▎ | 2280/10000 [27:10<1:11:40, 1.80it/s, loss=0.0517, lr=2.31e-05, step=2280] Training: 23%|██▎ | 2281/10000 [27:10<1:08:23, 1.88it/s, loss=0.0517, lr=2.31e-05, step=2280] Training: 23%|██▎ | 2281/10000 [27:10<1:08:23, 1.88it/s, loss=0.0235, lr=2.31e-05, step=2281] Training: 23%|██▎ | 2282/10000 [27:11<1:06:33, 1.93it/s, loss=0.0235, lr=2.31e-05, step=2281] Training: 23%|██▎ | 2282/10000 [27:11<1:06:33, 1.93it/s, loss=0.0232, lr=2.31e-05, step=2282] Training: 23%|██▎ | 2283/10000 [27:11<1:12:27, 1.78it/s, loss=0.0232, lr=2.31e-05, step=2282] Training: 23%|██▎ | 2283/10000 [27:11<1:12:27, 1.78it/s, loss=0.0223, lr=2.31e-05, step=2283] Training: 23%|██▎ | 2284/10000 [27:12<1:10:22, 1.83it/s, loss=0.0223, lr=2.31e-05, step=2283] Training: 23%|██▎ | 2284/10000 [27:12<1:10:22, 1.83it/s, loss=0.0111, lr=2.31e-05, step=2284] Training: 23%|██▎ | 2285/10000 [27:12<1:08:19, 1.88it/s, loss=0.0111, lr=2.31e-05, step=2284] Training: 23%|██▎ | 2285/10000 [27:12<1:08:19, 1.88it/s, loss=0.0165, lr=2.31e-05, step=2285] Training: 23%|██▎ | 2286/10000 [27:13<1:15:37, 1.70it/s, loss=0.0165, lr=2.31e-05, step=2285] Training: 23%|██▎ | 2286/10000 [27:13<1:15:37, 1.70it/s, loss=0.0625, lr=2.31e-05, step=2286] Training: 23%|██▎ | 2287/10000 [27:14<1:11:19, 1.80it/s, loss=0.0625, lr=2.31e-05, step=2286] Training: 23%|██▎ | 2287/10000 [27:14<1:11:19, 1.80it/s, loss=0.0456, lr=2.31e-05, step=2287] Training: 23%|██▎ | 2288/10000 [27:14<1:08:35, 1.87it/s, loss=0.0456, lr=2.31e-05, step=2287] Training: 23%|██▎ | 2288/10000 [27:14<1:08:35, 1.87it/s, loss=0.0401, lr=2.31e-05, step=2288] Training: 23%|██▎ | 2289/10000 [27:15<1:06:52, 1.92it/s, loss=0.0401, lr=2.31e-05, step=2288] Training: 23%|██▎ | 2289/10000 [27:15<1:06:52, 1.92it/s, loss=0.0066, lr=2.31e-05, step=2289]16:33:22.155 [I] step=2290 loss=0.0248 smoothed_loss=0.0273 lr=2.31e-05 grad_norm=0.6117 step_time=0.4909s data_time=0.0577s it/s=1.823 eta_to_10000=4228.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0242 grad_action_out_proj=0.2079 grad_shared_expert=0.5113 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2290/10000 [27:15<1:13:08, 1.76it/s, loss=0.0066, lr=2.31e-05, step=2289] Training: 23%|██▎ | 2290/10000 [27:15<1:13:08, 1.76it/s, loss=0.0248, lr=2.31e-05, step=2290] Training: 23%|██▎ | 2291/10000 [27:16<1:10:08, 1.83it/s, loss=0.0248, lr=2.31e-05, step=2290] Training: 23%|██▎ | 2291/10000 [27:16<1:10:08, 1.83it/s, loss=0.0058, lr=2.31e-05, step=2291] Training: 23%|██▎ | 2292/10000 [27:17<1:25:43, 1.50it/s, loss=0.0058, lr=2.31e-05, step=2291] Training: 23%|██▎ | 2292/10000 [27:17<1:25:43, 1.50it/s, loss=0.0312, lr=2.31e-05, step=2292] Training: 23%|██▎ | 2293/10000 [27:18<1:33:02, 1.38it/s, loss=0.0312, lr=2.31e-05, step=2292] Training: 23%|██▎ | 2293/10000 [27:18<1:33:02, 1.38it/s, loss=0.0156, lr=2.31e-05, step=2293] Training: 23%|██▎ | 2294/10000 [27:18<1:27:54, 1.46it/s, loss=0.0156, lr=2.31e-05, step=2293] Training: 23%|██▎ | 2294/10000 [27:18<1:27:54, 1.46it/s, loss=0.0271, lr=2.31e-05, step=2294] Training: 23%|██▎ | 2295/10000 [27:19<1:25:45, 1.50it/s, loss=0.0271, lr=2.31e-05, step=2294] Training: 23%|██▎ | 2295/10000 [27:19<1:25:45, 1.50it/s, loss=0.0055, lr=2.31e-05, step=2295] Training: 23%|██▎ | 2296/10000 [27:19<1:18:39, 1.63it/s, loss=0.0055, lr=2.31e-05, step=2295] Training: 23%|██▎ | 2296/10000 [27:19<1:18:39, 1.63it/s, loss=0.0300, lr=2.31e-05, step=2296] Training: 23%|██▎ | 2297/10000 [27:20<1:20:02, 1.60it/s, loss=0.0300, lr=2.31e-05, step=2296] Training: 23%|██▎ | 2297/10000 [27:20<1:20:02, 1.60it/s, loss=0.0347, lr=2.31e-05, step=2297] Training: 23%|██▎ | 2298/10000 [27:20<1:14:30, 1.72it/s, loss=0.0347, lr=2.31e-05, step=2297] Training: 23%|██▎ | 2298/10000 [27:20<1:14:30, 1.72it/s, loss=0.0388, lr=2.31e-05, step=2298] Training: 23%|██▎ | 2299/10000 [27:21<1:10:23, 1.82it/s, loss=0.0388, lr=2.31e-05, step=2298] Training: 23%|██▎ | 2299/10000 [27:21<1:10:23, 1.82it/s, loss=0.0123, lr=2.31e-05, step=2299]16:33:28.513 [I] step=2300 loss=0.0302 smoothed_loss=0.0254 lr=2.31e-05 grad_norm=0.5780 step_time=0.5245s data_time=0.1112s it/s=1.573 eta_to_10000=4894.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0104 grad_action_out_proj=0.1754 grad_shared_expert=0.6121 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2300/10000 [27:22<1:18:11, 1.64it/s, loss=0.0123, lr=2.31e-05, step=2299] Training: 23%|██▎ | 2300/10000 [27:22<1:18:11, 1.64it/s, loss=0.0302, lr=2.31e-05, step=2300] Training: 23%|██▎ | 2301/10000 [27:22<1:13:39, 1.74it/s, loss=0.0302, lr=2.31e-05, step=2300] Training: 23%|██▎ | 2301/10000 [27:22<1:13:39, 1.74it/s, loss=0.0154, lr=2.31e-05, step=2301] Training: 23%|██▎ | 2302/10000 [27:23<1:10:06, 1.83it/s, loss=0.0154, lr=2.31e-05, step=2301] Training: 23%|██▎ | 2302/10000 [27:23<1:10:06, 1.83it/s, loss=0.0147, lr=2.31e-05, step=2302] Training: 23%|██▎ | 2303/10000 [27:23<1:07:48, 1.89it/s, loss=0.0147, lr=2.31e-05, step=2302] Training: 23%|██▎ | 2303/10000 [27:23<1:07:48, 1.89it/s, loss=0.0228, lr=2.31e-05, step=2303] Training: 23%|██▎ | 2304/10000 [27:24<1:12:29, 1.77it/s, loss=0.0228, lr=2.31e-05, step=2303] Training: 23%|██▎ | 2304/10000 [27:24<1:12:29, 1.77it/s, loss=0.0356, lr=2.31e-05, step=2304] Training: 23%|██▎ | 2305/10000 [27:24<1:09:34, 1.84it/s, loss=0.0356, lr=2.31e-05, step=2304] Training: 23%|██▎ | 2305/10000 [27:24<1:09:34, 1.84it/s, loss=0.1118, lr=2.31e-05, step=2305] Training: 23%|██▎ | 2306/10000 [27:25<1:07:08, 1.91it/s, loss=0.1118, lr=2.31e-05, step=2305] Training: 23%|██▎ | 2306/10000 [27:25<1:07:08, 1.91it/s, loss=0.0275, lr=2.31e-05, step=2306] Training: 23%|██▎ | 2307/10000 [27:25<1:05:34, 1.96it/s, loss=0.0275, lr=2.31e-05, step=2306] Training: 23%|██▎ | 2307/10000 [27:25<1:05:34, 1.96it/s, loss=0.0351, lr=2.31e-05, step=2307] Training: 23%|██▎ | 2308/10000 [27:26<1:13:58, 1.73it/s, loss=0.0351, lr=2.31e-05, step=2307] Training: 23%|██▎ | 2308/10000 [27:26<1:13:58, 1.73it/s, loss=0.0179, lr=2.31e-05, step=2308] Training: 23%|██▎ | 2309/10000 [27:26<1:10:42, 1.81it/s, loss=0.0179, lr=2.31e-05, step=2308] Training: 23%|██▎ | 2309/10000 [27:26<1:10:42, 1.81it/s, loss=0.0371, lr=2.30e-05, step=2309]16:33:33.834 [I] step=2310 loss=0.0822 smoothed_loss=0.0370 lr=2.31e-05 grad_norm=0.5795 step_time=0.4744s data_time=0.0577s it/s=1.879 eta_to_10000=4091.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0236 grad_action_out_proj=0.2406 grad_shared_expert=0.7053 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2310/10000 [27:27<1:10:06, 1.83it/s, loss=0.0371, lr=2.30e-05, step=2309] Training: 23%|██▎ | 2310/10000 [27:27<1:10:06, 1.83it/s, loss=0.0822, lr=2.30e-05, step=2310] Training: 23%|██▎ | 2311/10000 [27:28<1:15:03, 1.71it/s, loss=0.0822, lr=2.30e-05, step=2310] Training: 23%|██▎ | 2311/10000 [27:28<1:15:03, 1.71it/s, loss=0.0088, lr=2.30e-05, step=2311] Training: 23%|██▎ | 2312/10000 [27:28<1:11:10, 1.80it/s, loss=0.0088, lr=2.30e-05, step=2311] Training: 23%|██▎ | 2312/10000 [27:28<1:11:10, 1.80it/s, loss=0.0172, lr=2.30e-05, step=2312] Training: 23%|██▎ | 2313/10000 [27:29<1:08:39, 1.87it/s, loss=0.0172, lr=2.30e-05, step=2312] Training: 23%|██▎ | 2313/10000 [27:29<1:08:39, 1.87it/s, loss=0.0240, lr=2.30e-05, step=2313] Training: 23%|██▎ | 2314/10000 [27:29<1:07:07, 1.91it/s, loss=0.0240, lr=2.30e-05, step=2313] Training: 23%|██▎ | 2314/10000 [27:29<1:07:07, 1.91it/s, loss=0.0154, lr=2.30e-05, step=2314] Training: 23%|██▎ | 2315/10000 [27:30<1:14:19, 1.72it/s, loss=0.0154, lr=2.30e-05, step=2314] Training: 23%|██▎ | 2315/10000 [27:30<1:14:19, 1.72it/s, loss=0.0191, lr=2.30e-05, step=2315] Training: 23%|██▎ | 2316/10000 [27:30<1:11:48, 1.78it/s, loss=0.0191, lr=2.30e-05, step=2315] Training: 23%|██▎ | 2316/10000 [27:30<1:11:48, 1.78it/s, loss=0.0248, lr=2.30e-05, step=2316] Training: 23%|██▎ | 2317/10000 [27:31<1:09:17, 1.85it/s, loss=0.0248, lr=2.30e-05, step=2316] Training: 23%|██▎ | 2317/10000 [27:31<1:09:17, 1.85it/s, loss=0.0164, lr=2.30e-05, step=2317] Training: 23%|██▎ | 2318/10000 [27:31<1:14:44, 1.71it/s, loss=0.0164, lr=2.30e-05, step=2317] Training: 23%|██▎ | 2318/10000 [27:31<1:14:44, 1.71it/s, loss=0.0177, lr=2.30e-05, step=2318] Training: 23%|██▎ | 2319/10000 [27:32<1:11:29, 1.79it/s, loss=0.0177, lr=2.30e-05, step=2318] Training: 23%|██▎ | 2319/10000 [27:32<1:11:29, 1.79it/s, loss=0.0182, lr=2.30e-05, step=2319]16:33:39.407 [I] step=2320 loss=0.0998 smoothed_loss=0.0330 lr=2.30e-05 grad_norm=0.5542 step_time=0.4974s data_time=0.0598s it/s=1.795 eta_to_10000=4279.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0252 grad_action_out_proj=0.2004 grad_shared_expert=0.6067 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2320/10000 [27:32<1:10:01, 1.83it/s, loss=0.0182, lr=2.30e-05, step=2319] Training: 23%|██▎ | 2320/10000 [27:32<1:10:01, 1.83it/s, loss=0.0998, lr=2.30e-05, step=2320] Training: 23%|██▎ | 2321/10000 [27:33<1:07:48, 1.89it/s, loss=0.0998, lr=2.30e-05, step=2320] Training: 23%|██▎ | 2321/10000 [27:33<1:07:48, 1.89it/s, loss=0.0363, lr=2.30e-05, step=2321] Training: 23%|██▎ | 2322/10000 [27:34<1:15:21, 1.70it/s, loss=0.0363, lr=2.30e-05, step=2321] Training: 23%|██▎ | 2322/10000 [27:34<1:15:21, 1.70it/s, loss=0.0435, lr=2.30e-05, step=2322] Training: 23%|██▎ | 2323/10000 [27:34<1:11:08, 1.80it/s, loss=0.0435, lr=2.30e-05, step=2322] Training: 23%|██▎ | 2323/10000 [27:34<1:11:08, 1.80it/s, loss=0.0222, lr=2.30e-05, step=2323] Training: 23%|██▎ | 2324/10000 [27:35<1:08:51, 1.86it/s, loss=0.0222, lr=2.30e-05, step=2323] Training: 23%|██▎ | 2324/10000 [27:35<1:08:51, 1.86it/s, loss=0.0300, lr=2.30e-05, step=2324] Training: 23%|██▎ | 2325/10000 [27:35<1:14:21, 1.72it/s, loss=0.0300, lr=2.30e-05, step=2324] Training: 23%|██▎ | 2325/10000 [27:35<1:14:21, 1.72it/s, loss=0.0165, lr=2.30e-05, step=2325] Training: 23%|██▎ | 2326/10000 [27:36<1:10:59, 1.80it/s, loss=0.0165, lr=2.30e-05, step=2325] Training: 23%|██▎ | 2326/10000 [27:36<1:10:59, 1.80it/s, loss=0.0101, lr=2.30e-05, step=2326] Training: 23%|██▎ | 2327/10000 [27:36<1:08:44, 1.86it/s, loss=0.0101, lr=2.30e-05, step=2326] Training: 23%|██▎ | 2327/10000 [27:36<1:08:44, 1.86it/s, loss=0.0543, lr=2.30e-05, step=2327] Training: 23%|██▎ | 2328/10000 [27:37<1:06:39, 1.92it/s, loss=0.0543, lr=2.30e-05, step=2327] Training: 23%|██▎ | 2328/10000 [27:37<1:06:39, 1.92it/s, loss=0.0398, lr=2.30e-05, step=2328] Training: 23%|██▎ | 2329/10000 [27:37<1:05:11, 1.96it/s, loss=0.0398, lr=2.30e-05, step=2328] Training: 23%|██▎ | 2329/10000 [27:37<1:05:11, 1.96it/s, loss=0.0238, lr=2.30e-05, step=2329]16:33:44.999 [I] step=2330 loss=0.0233 smoothed_loss=0.0307 lr=2.30e-05 grad_norm=0.5300 step_time=0.4984s data_time=0.0609s it/s=1.789 eta_to_10000=4288.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.1237 grad_shared_expert=0.5202 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2330/10000 [27:38<1:14:47, 1.71it/s, loss=0.0238, lr=2.30e-05, step=2329] Training: 23%|██▎ | 2330/10000 [27:38<1:14:47, 1.71it/s, loss=0.0233, lr=2.30e-05, step=2330] Training: 23%|██▎ | 2331/10000 [27:39<1:11:06, 1.80it/s, loss=0.0233, lr=2.30e-05, step=2330] Training: 23%|██▎ | 2331/10000 [27:39<1:11:06, 1.80it/s, loss=0.0169, lr=2.30e-05, step=2331] Training: 23%|██▎ | 2332/10000 [27:39<1:16:56, 1.66it/s, loss=0.0169, lr=2.30e-05, step=2331] Training: 23%|██▎ | 2332/10000 [27:39<1:16:56, 1.66it/s, loss=0.0351, lr=2.30e-05, step=2332] Training: 23%|██▎ | 2333/10000 [27:40<1:23:06, 1.54it/s, loss=0.0351, lr=2.30e-05, step=2332] Training: 23%|██▎ | 2333/10000 [27:40<1:23:06, 1.54it/s, loss=0.0440, lr=2.30e-05, step=2333] Training: 23%|██▎ | 2334/10000 [27:41<1:17:22, 1.65it/s, loss=0.0440, lr=2.30e-05, step=2333] Training: 23%|██▎ | 2334/10000 [27:41<1:17:22, 1.65it/s, loss=0.0192, lr=2.30e-05, step=2334] Training: 23%|██▎ | 2335/10000 [27:41<1:13:16, 1.74it/s, loss=0.0192, lr=2.30e-05, step=2334] Training: 23%|██▎ | 2335/10000 [27:41<1:13:16, 1.74it/s, loss=0.0092, lr=2.30e-05, step=2335] Training: 23%|██▎ | 2336/10000 [27:42<1:09:58, 1.83it/s, loss=0.0092, lr=2.30e-05, step=2335] Training: 23%|██▎ | 2336/10000 [27:42<1:09:58, 1.83it/s, loss=0.0111, lr=2.30e-05, step=2336] Training: 23%|██▎ | 2337/10000 [27:42<1:17:49, 1.64it/s, loss=0.0111, lr=2.30e-05, step=2336] Training: 23%|██▎ | 2337/10000 [27:42<1:17:49, 1.64it/s, loss=0.0150, lr=2.30e-05, step=2337] Training: 23%|██▎ | 2338/10000 [27:43<1:13:23, 1.74it/s, loss=0.0150, lr=2.30e-05, step=2337] Training: 23%|██▎ | 2338/10000 [27:43<1:13:23, 1.74it/s, loss=0.0115, lr=2.30e-05, step=2338] Training: 23%|██▎ | 2339/10000 [27:43<1:10:05, 1.82it/s, loss=0.0115, lr=2.30e-05, step=2338] Training: 23%|██▎ | 2339/10000 [27:43<1:10:05, 1.82it/s, loss=0.0073, lr=2.30e-05, step=2339]16:33:50.891 [I] step=2340 loss=0.0198 smoothed_loss=0.0219 lr=2.30e-05 grad_norm=0.6776 step_time=0.5150s data_time=0.0742s it/s=1.698 eta_to_10000=4512.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0139 grad_action_out_proj=0.1571 grad_shared_expert=0.4755 (10775:train_pytorch.py:850) + Training: 23%|██▎ | 2340/10000 [27:44<1:16:09, 1.68it/s, loss=0.0073, lr=2.30e-05, step=2339] Training: 23%|██▎ | 2340/10000 [27:44<1:16:09, 1.68it/s, loss=0.0198, lr=2.30e-05, step=2340] Training: 23%|██▎ | 2341/10000 [27:44<1:11:38, 1.78it/s, loss=0.0198, lr=2.30e-05, step=2340] Training: 23%|██▎ | 2341/10000 [27:44<1:11:38, 1.78it/s, loss=0.0163, lr=2.30e-05, step=2341] Training: 23%|██▎ | 2342/10000 [27:45<1:09:09, 1.85it/s, loss=0.0163, lr=2.30e-05, step=2341] Training: 23%|██▎ | 2342/10000 [27:45<1:09:09, 1.85it/s, loss=0.0439, lr=2.30e-05, step=2342] Training: 23%|██▎ | 2343/10000 [27:46<1:16:35, 1.67it/s, loss=0.0439, lr=2.30e-05, step=2342] Training: 23%|██▎ | 2343/10000 [27:46<1:16:35, 1.67it/s, loss=0.0299, lr=2.30e-05, step=2343] Training: 23%|██▎ | 2344/10000 [27:46<1:12:13, 1.77it/s, loss=0.0299, lr=2.30e-05, step=2343] Training: 23%|██▎ | 2344/10000 [27:46<1:12:13, 1.77it/s, loss=0.0355, lr=2.30e-05, step=2344] Training: 23%|██▎ | 2345/10000 [27:47<1:09:22, 1.84it/s, loss=0.0355, lr=2.30e-05, step=2344] Training: 23%|██▎ | 2345/10000 [27:47<1:09:22, 1.84it/s, loss=0.0130, lr=2.30e-05, step=2345] Training: 23%|██▎ | 2346/10000 [27:47<1:07:50, 1.88it/s, loss=0.0130, lr=2.30e-05, step=2345] Training: 23%|██▎ | 2346/10000 [27:47<1:07:50, 1.88it/s, loss=0.0125, lr=2.30e-05, step=2346] Training: 23%|██▎ | 2347/10000 [27:48<1:14:04, 1.72it/s, loss=0.0125, lr=2.30e-05, step=2346] Training: 23%|██▎ | 2347/10000 [27:48<1:14:04, 1.72it/s, loss=0.0259, lr=2.30e-05, step=2347] Training: 23%|██▎ | 2348/10000 [27:48<1:11:15, 1.79it/s, loss=0.0259, lr=2.30e-05, step=2347] Training: 23%|██▎ | 2348/10000 [27:48<1:11:15, 1.79it/s, loss=0.0162, lr=2.30e-05, step=2348] Training: 23%|██▎ | 2349/10000 [27:49<1:08:58, 1.85it/s, loss=0.0162, lr=2.30e-05, step=2348] Training: 23%|██▎ | 2349/10000 [27:49<1:08:58, 1.85it/s, loss=0.0156, lr=2.30e-05, step=2349]16:33:56.319 [I] step=2350 loss=0.0316 smoothed_loss=0.0228 lr=2.30e-05 grad_norm=0.5796 step_time=0.4824s data_time=0.0603s it/s=1.843 eta_to_10000=4151.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0209 grad_action_out_proj=0.2057 grad_shared_expert=0.5080 (10775:train_pytorch.py:850) + Training: 24%|██▎ | 2350/10000 [27:49<1:08:37, 1.86it/s, loss=0.0156, lr=2.30e-05, step=2349] Training: 24%|██▎ | 2350/10000 [27:49<1:08:37, 1.86it/s, loss=0.0316, lr=2.30e-05, step=2350] Training: 24%|██▎ | 2351/10000 [27:50<1:16:14, 1.67it/s, loss=0.0316, lr=2.30e-05, step=2350] Training: 24%|██▎ | 2351/10000 [27:50<1:16:14, 1.67it/s, loss=0.0162, lr=2.30e-05, step=2351] Training: 24%|██▎ | 2352/10000 [27:51<1:12:50, 1.75it/s, loss=0.0162, lr=2.30e-05, step=2351] Training: 24%|██▎ | 2352/10000 [27:51<1:12:50, 1.75it/s, loss=0.0559, lr=2.30e-05, step=2352] Training: 24%|██▎ | 2353/10000 [27:51<1:10:04, 1.82it/s, loss=0.0559, lr=2.30e-05, step=2352] Training: 24%|██▎ | 2353/10000 [27:51<1:10:04, 1.82it/s, loss=0.0331, lr=2.30e-05, step=2353] Training: 24%|██▎ | 2354/10000 [27:52<1:08:07, 1.87it/s, loss=0.0331, lr=2.30e-05, step=2353] Training: 24%|██▎ | 2354/10000 [27:52<1:08:07, 1.87it/s, loss=0.0195, lr=2.30e-05, step=2354] Training: 24%|██▎ | 2355/10000 [27:52<1:13:05, 1.74it/s, loss=0.0195, lr=2.30e-05, step=2354] Training: 24%|██▎ | 2355/10000 [27:52<1:13:05, 1.74it/s, loss=0.0453, lr=2.30e-05, step=2355] Training: 24%|██▎ | 2356/10000 [27:53<1:09:54, 1.82it/s, loss=0.0453, lr=2.30e-05, step=2355] Training: 24%|██▎ | 2356/10000 [27:53<1:09:54, 1.82it/s, loss=0.0233, lr=2.29e-05, step=2356] Training: 24%|██▎ | 2357/10000 [27:53<1:07:55, 1.88it/s, loss=0.0233, lr=2.29e-05, step=2356] Training: 24%|██▎ | 2357/10000 [27:53<1:07:55, 1.88it/s, loss=0.0229, lr=2.29e-05, step=2357] Training: 24%|██▎ | 2358/10000 [27:54<1:18:31, 1.62it/s, loss=0.0229, lr=2.29e-05, step=2357] Training: 24%|██▎ | 2358/10000 [27:54<1:18:31, 1.62it/s, loss=0.0487, lr=2.29e-05, step=2358] Training: 24%|██▎ | 2359/10000 [27:55<1:13:42, 1.73it/s, loss=0.0487, lr=2.29e-05, step=2358] Training: 24%|██▎ | 2359/10000 [27:55<1:13:42, 1.73it/s, loss=0.0192, lr=2.29e-05, step=2359]16:34:02.039 [I] step=2360 loss=0.0130 smoothed_loss=0.0265 lr=2.29e-05 grad_norm=0.5718 step_time=0.5120s data_time=0.0600s it/s=1.749 eta_to_10000=4369.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.1297 grad_shared_expert=0.3217 (10775:train_pytorch.py:850) + Training: 24%|██▎ | 2360/10000 [27:55<1:11:27, 1.78it/s, loss=0.0192, lr=2.29e-05, step=2359] Training: 24%|██▎ | 2360/10000 [27:55<1:11:27, 1.78it/s, loss=0.0130, lr=2.29e-05, step=2360] Training: 24%|██▎ | 2361/10000 [27:56<1:09:02, 1.84it/s, loss=0.0130, lr=2.29e-05, step=2360] Training: 24%|██▎ | 2361/10000 [27:56<1:09:02, 1.84it/s, loss=0.0064, lr=2.29e-05, step=2361] Training: 24%|██▎ | 2362/10000 [27:56<1:14:27, 1.71it/s, loss=0.0064, lr=2.29e-05, step=2361] Training: 24%|██▎ | 2362/10000 [27:56<1:14:27, 1.71it/s, loss=0.0146, lr=2.29e-05, step=2362] Training: 24%|██▎ | 2363/10000 [27:57<1:11:09, 1.79it/s, loss=0.0146, lr=2.29e-05, step=2362] Training: 24%|██▎ | 2363/10000 [27:57<1:11:09, 1.79it/s, loss=0.0090, lr=2.29e-05, step=2363] Training: 24%|██▎ | 2364/10000 [27:57<1:08:42, 1.85it/s, loss=0.0090, lr=2.29e-05, step=2363] Training: 24%|██▎ | 2364/10000 [27:57<1:08:42, 1.85it/s, loss=0.0397, lr=2.29e-05, step=2364] Training: 24%|██▎ | 2365/10000 [27:58<1:14:28, 1.71it/s, loss=0.0397, lr=2.29e-05, step=2364] Training: 24%|██▎ | 2365/10000 [27:58<1:14:28, 1.71it/s, loss=0.0174, lr=2.29e-05, step=2365] Training: 24%|██▎ | 2366/10000 [27:58<1:11:58, 1.77it/s, loss=0.0174, lr=2.29e-05, step=2365] Training: 24%|██▎ | 2366/10000 [27:58<1:11:58, 1.77it/s, loss=0.0380, lr=2.29e-05, step=2366] Training: 24%|██▎ | 2367/10000 [27:59<1:09:13, 1.84it/s, loss=0.0380, lr=2.29e-05, step=2366] Training: 24%|██▎ | 2367/10000 [27:59<1:09:13, 1.84it/s, loss=0.0170, lr=2.29e-05, step=2367] Training: 24%|██▎ | 2368/10000 [27:59<1:07:10, 1.89it/s, loss=0.0170, lr=2.29e-05, step=2367] Training: 24%|██▎ | 2368/10000 [27:59<1:07:10, 1.89it/s, loss=0.0432, lr=2.29e-05, step=2368] Training: 24%|██▎ | 2369/10000 [28:00<1:12:50, 1.75it/s, loss=0.0432, lr=2.29e-05, step=2368] Training: 24%|██▎ | 2369/10000 [28:00<1:12:50, 1.75it/s, loss=0.0234, lr=2.29e-05, step=2369]16:34:07.616 [I] step=2370 loss=0.0187 smoothed_loss=0.0249 lr=2.29e-05 grad_norm=0.5280 step_time=0.4968s data_time=0.0610s it/s=1.793 eta_to_10000=4255.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0063 grad_action_out_proj=0.1078 grad_shared_expert=0.3970 (10775:train_pytorch.py:850) + Training: 24%|██▎ | 2370/10000 [28:01<1:11:10, 1.79it/s, loss=0.0234, lr=2.29e-05, step=2369] Training: 24%|██▎ | 2370/10000 [28:01<1:11:10, 1.79it/s, loss=0.0187, lr=2.29e-05, step=2370] Training: 24%|██▎ | 2371/10000 [28:01<1:08:40, 1.85it/s, loss=0.0187, lr=2.29e-05, step=2370] Training: 24%|██▎ | 2371/10000 [28:01<1:08:40, 1.85it/s, loss=0.0071, lr=2.29e-05, step=2371] Training: 24%|██▎ | 2372/10000 [28:02<1:17:30, 1.64it/s, loss=0.0071, lr=2.29e-05, step=2371] Training: 24%|██▎ | 2372/10000 [28:02<1:17:30, 1.64it/s, loss=0.0194, lr=2.29e-05, step=2372] Training: 24%|██▎ | 2373/10000 [28:02<1:13:03, 1.74it/s, loss=0.0194, lr=2.29e-05, step=2372] Training: 24%|██▎ | 2373/10000 [28:02<1:13:03, 1.74it/s, loss=0.0367, lr=2.29e-05, step=2373] Training: 24%|██▎ | 2374/10000 [28:03<1:09:42, 1.82it/s, loss=0.0367, lr=2.29e-05, step=2373] Training: 24%|██▎ | 2374/10000 [28:03<1:09:42, 1.82it/s, loss=0.0257, lr=2.29e-05, step=2374] Training: 24%|██▍ | 2375/10000 [28:03<1:08:12, 1.86it/s, loss=0.0257, lr=2.29e-05, step=2374] Training: 24%|██▍ | 2375/10000 [28:03<1:08:12, 1.86it/s, loss=0.0198, lr=2.29e-05, step=2375] Training: 24%|██▍ | 2376/10000 [28:04<1:13:35, 1.73it/s, loss=0.0198, lr=2.29e-05, step=2375] Training: 24%|██▍ | 2376/10000 [28:04<1:13:35, 1.73it/s, loss=0.0136, lr=2.29e-05, step=2376] Training: 24%|██▍ | 2377/10000 [28:05<1:10:32, 1.80it/s, loss=0.0136, lr=2.29e-05, step=2376] Training: 24%|██▍ | 2377/10000 [28:05<1:10:32, 1.80it/s, loss=0.0306, lr=2.29e-05, step=2377] Training: 24%|██▍ | 2378/10000 [28:05<1:08:28, 1.86it/s, loss=0.0306, lr=2.29e-05, step=2377] Training: 24%|██▍ | 2378/10000 [28:05<1:08:28, 1.86it/s, loss=0.0235, lr=2.29e-05, step=2378] Training: 24%|██▍ | 2379/10000 [28:06<1:15:18, 1.69it/s, loss=0.0235, lr=2.29e-05, step=2378] Training: 24%|██▍ | 2379/10000 [28:06<1:15:18, 1.69it/s, loss=0.0037, lr=2.29e-05, step=2379]16:34:13.313 [I] step=2380 loss=0.0462 smoothed_loss=0.0241 lr=2.29e-05 grad_norm=0.5576 step_time=0.5072s data_time=0.0625s it/s=1.756 eta_to_10000=4340.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0244 grad_action_out_proj=0.2201 grad_shared_expert=0.5108 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2380/10000 [28:06<1:13:25, 1.73it/s, loss=0.0037, lr=2.29e-05, step=2379] Training: 24%|██▍ | 2380/10000 [28:06<1:13:25, 1.73it/s, loss=0.0462, lr=2.29e-05, step=2380] Training: 24%|██▍ | 2381/10000 [28:07<1:10:15, 1.81it/s, loss=0.0462, lr=2.29e-05, step=2380] Training: 24%|██▍ | 2381/10000 [28:07<1:10:15, 1.81it/s, loss=0.0558, lr=2.29e-05, step=2381] Training: 24%|██▍ | 2382/10000 [28:07<1:07:55, 1.87it/s, loss=0.0558, lr=2.29e-05, step=2381] Training: 24%|██▍ | 2382/10000 [28:07<1:07:55, 1.87it/s, loss=0.0235, lr=2.29e-05, step=2382] Training: 24%|██▍ | 2383/10000 [28:08<1:13:23, 1.73it/s, loss=0.0235, lr=2.29e-05, step=2382] Training: 24%|██▍ | 2383/10000 [28:08<1:13:23, 1.73it/s, loss=0.0131, lr=2.29e-05, step=2383] Training: 24%|██▍ | 2384/10000 [28:09<1:10:33, 1.80it/s, loss=0.0131, lr=2.29e-05, step=2383] Training: 24%|██▍ | 2384/10000 [28:09<1:10:33, 1.80it/s, loss=0.0197, lr=2.29e-05, step=2384] Training: 24%|██▍ | 2385/10000 [28:09<1:08:15, 1.86it/s, loss=0.0197, lr=2.29e-05, step=2384] Training: 24%|██▍ | 2385/10000 [28:09<1:08:15, 1.86it/s, loss=0.0270, lr=2.29e-05, step=2385] Training: 24%|██▍ | 2386/10000 [28:10<1:15:32, 1.68it/s, loss=0.0270, lr=2.29e-05, step=2385] Training: 24%|██▍ | 2386/10000 [28:10<1:15:32, 1.68it/s, loss=0.0457, lr=2.29e-05, step=2386] Training: 24%|██▍ | 2387/10000 [28:10<1:11:40, 1.77it/s, loss=0.0457, lr=2.29e-05, step=2386] Training: 24%|██▍ | 2387/10000 [28:10<1:11:40, 1.77it/s, loss=0.0515, lr=2.29e-05, step=2387] Training: 24%|██▍ | 2388/10000 [28:11<1:10:39, 1.80it/s, loss=0.0515, lr=2.29e-05, step=2387] Training: 24%|██▍ | 2388/10000 [28:11<1:10:39, 1.80it/s, loss=0.0306, lr=2.29e-05, step=2388] Training: 24%|██▍ | 2389/10000 [28:11<1:15:44, 1.67it/s, loss=0.0306, lr=2.29e-05, step=2388] Training: 24%|██▍ | 2389/10000 [28:11<1:15:44, 1.67it/s, loss=0.0328, lr=2.29e-05, step=2389]16:34:18.975 [I] step=2390 loss=0.0073 smoothed_loss=0.0277 lr=2.29e-05 grad_norm=0.5559 step_time=0.5030s data_time=0.0632s it/s=1.767 eta_to_10000=4307.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0073 grad_action_out_proj=0.1367 grad_shared_expert=0.4099 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2390/10000 [28:12<1:13:40, 1.72it/s, loss=0.0328, lr=2.29e-05, step=2389] Training: 24%|██▍ | 2390/10000 [28:12<1:13:40, 1.72it/s, loss=0.0073, lr=2.29e-05, step=2390] Training: 24%|██▍ | 2391/10000 [28:13<1:09:41, 1.82it/s, loss=0.0073, lr=2.29e-05, step=2390] Training: 24%|██▍ | 2391/10000 [28:13<1:09:41, 1.82it/s, loss=0.0225, lr=2.29e-05, step=2391] Training: 24%|██▍ | 2392/10000 [28:13<1:10:06, 1.81it/s, loss=0.0225, lr=2.29e-05, step=2391] Training: 24%|██▍ | 2392/10000 [28:13<1:10:06, 1.81it/s, loss=0.0154, lr=2.29e-05, step=2392] Training: 24%|██▍ | 2393/10000 [28:14<1:07:23, 1.88it/s, loss=0.0154, lr=2.29e-05, step=2392] Training: 24%|██▍ | 2393/10000 [28:14<1:07:23, 1.88it/s, loss=0.0155, lr=2.29e-05, step=2393] Training: 24%|██▍ | 2394/10000 [28:14<1:14:50, 1.69it/s, loss=0.0155, lr=2.29e-05, step=2393] Training: 24%|██▍ | 2394/10000 [28:14<1:14:50, 1.69it/s, loss=0.0133, lr=2.29e-05, step=2394] Training: 24%|██▍ | 2395/10000 [28:15<1:10:45, 1.79it/s, loss=0.0133, lr=2.29e-05, step=2394] Training: 24%|██▍ | 2395/10000 [28:15<1:10:45, 1.79it/s, loss=0.0203, lr=2.29e-05, step=2395] Training: 24%|██▍ | 2396/10000 [28:15<1:15:41, 1.67it/s, loss=0.0203, lr=2.29e-05, step=2395] Training: 24%|██▍ | 2396/10000 [28:15<1:15:41, 1.67it/s, loss=0.0269, lr=2.29e-05, step=2396] Training: 24%|██▍ | 2397/10000 [28:16<1:11:14, 1.78it/s, loss=0.0269, lr=2.29e-05, step=2396] Training: 24%|██▍ | 2397/10000 [28:16<1:11:14, 1.78it/s, loss=0.0160, lr=2.29e-05, step=2397] Training: 24%|██▍ | 2398/10000 [28:16<1:08:04, 1.86it/s, loss=0.0160, lr=2.29e-05, step=2397] Training: 24%|██▍ | 2398/10000 [28:16<1:08:04, 1.86it/s, loss=0.0214, lr=2.29e-05, step=2398] Training: 24%|██▍ | 2399/10000 [28:17<1:05:45, 1.93it/s, loss=0.0214, lr=2.29e-05, step=2398] Training: 24%|██▍ | 2399/10000 [28:17<1:05:45, 1.93it/s, loss=0.0104, lr=2.29e-05, step=2399]16:34:24.349 [I] step=2400 loss=0.0151 smoothed_loss=0.0210 lr=2.29e-05 grad_norm=0.5863 step_time=0.4792s data_time=0.0582s it/s=1.861 eta_to_10000=4083.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0107 grad_action_out_proj=0.1300 grad_shared_expert=0.5702 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2400/10000 [28:17<1:05:46, 1.93it/s, loss=0.0104, lr=2.29e-05, step=2399] Training: 24%|██▍ | 2400/10000 [28:17<1:05:46, 1.93it/s, loss=0.0151, lr=2.29e-05, step=2400] Training: 24%|██▍ | 2401/10000 [28:18<1:15:25, 1.68it/s, loss=0.0151, lr=2.29e-05, step=2400] Training: 24%|██▍ | 2401/10000 [28:18<1:15:25, 1.68it/s, loss=0.0180, lr=2.29e-05, step=2401] Training: 24%|██▍ | 2402/10000 [28:19<1:11:24, 1.77it/s, loss=0.0180, lr=2.29e-05, step=2401] Training: 24%|██▍ | 2402/10000 [28:19<1:11:24, 1.77it/s, loss=0.0312, lr=2.28e-05, step=2402] Training: 24%|██▍ | 2403/10000 [28:19<1:08:09, 1.86it/s, loss=0.0312, lr=2.28e-05, step=2402] Training: 24%|██▍ | 2403/10000 [28:19<1:08:09, 1.86it/s, loss=0.0437, lr=2.28e-05, step=2403] Training: 24%|██▍ | 2404/10000 [28:20<1:13:17, 1.73it/s, loss=0.0437, lr=2.28e-05, step=2403] Training: 24%|██▍ | 2404/10000 [28:20<1:13:17, 1.73it/s, loss=0.0279, lr=2.28e-05, step=2404] Training: 24%|██▍ | 2405/10000 [28:20<1:09:50, 1.81it/s, loss=0.0279, lr=2.28e-05, step=2404] Training: 24%|██▍ | 2405/10000 [28:20<1:09:50, 1.81it/s, loss=0.0082, lr=2.28e-05, step=2405] Training: 24%|██▍ | 2406/10000 [28:21<1:07:08, 1.89it/s, loss=0.0082, lr=2.28e-05, step=2405] Training: 24%|██▍ | 2406/10000 [28:21<1:07:08, 1.89it/s, loss=0.0154, lr=2.28e-05, step=2406] Training: 24%|██▍ | 2407/10000 [28:21<1:08:05, 1.86it/s, loss=0.0154, lr=2.28e-05, step=2406] Training: 24%|██▍ | 2407/10000 [28:21<1:08:05, 1.86it/s, loss=0.0118, lr=2.28e-05, step=2407] Training: 24%|██▍ | 2408/10000 [28:22<1:14:12, 1.71it/s, loss=0.0118, lr=2.28e-05, step=2407] Training: 24%|██▍ | 2408/10000 [28:22<1:14:12, 1.71it/s, loss=0.0257, lr=2.28e-05, step=2408] Training: 24%|██▍ | 2409/10000 [28:23<1:10:53, 1.78it/s, loss=0.0257, lr=2.28e-05, step=2408] Training: 24%|██▍ | 2409/10000 [28:23<1:10:53, 1.78it/s, loss=0.0058, lr=2.28e-05, step=2409]16:34:30.009 [I] step=2410 loss=0.0275 smoothed_loss=0.0206 lr=2.28e-05 grad_norm=0.4985 step_time=0.5056s data_time=0.0603s it/s=1.767 eta_to_10000=4294.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.1225 grad_shared_expert=0.5719 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2410/10000 [28:23<1:09:20, 1.82it/s, loss=0.0058, lr=2.28e-05, step=2409] Training: 24%|██▍ | 2410/10000 [28:23<1:09:20, 1.82it/s, loss=0.0275, lr=2.28e-05, step=2410] Training: 24%|██▍ | 2411/10000 [28:24<1:14:22, 1.70it/s, loss=0.0275, lr=2.28e-05, step=2410] Training: 24%|██▍ | 2411/10000 [28:24<1:14:22, 1.70it/s, loss=0.0931, lr=2.28e-05, step=2411] Training: 24%|██▍ | 2412/10000 [28:24<1:10:57, 1.78it/s, loss=0.0931, lr=2.28e-05, step=2411] Training: 24%|██▍ | 2412/10000 [28:24<1:10:57, 1.78it/s, loss=0.0190, lr=2.28e-05, step=2412] Training: 24%|██▍ | 2413/10000 [28:25<1:08:19, 1.85it/s, loss=0.0190, lr=2.28e-05, step=2412] Training: 24%|██▍ | 2413/10000 [28:25<1:08:19, 1.85it/s, loss=0.0689, lr=2.28e-05, step=2413] Training: 24%|██▍ | 2414/10000 [28:25<1:06:40, 1.90it/s, loss=0.0689, lr=2.28e-05, step=2413] Training: 24%|██▍ | 2414/10000 [28:25<1:06:40, 1.90it/s, loss=0.0634, lr=2.28e-05, step=2414] Training: 24%|██▍ | 2415/10000 [28:26<1:14:28, 1.70it/s, loss=0.0634, lr=2.28e-05, step=2414] Training: 24%|██▍ | 2415/10000 [28:26<1:14:28, 1.70it/s, loss=0.0224, lr=2.28e-05, step=2415] Training: 24%|██▍ | 2416/10000 [28:26<1:10:26, 1.79it/s, loss=0.0224, lr=2.28e-05, step=2415] Training: 24%|██▍ | 2416/10000 [28:26<1:10:26, 1.79it/s, loss=0.0119, lr=2.28e-05, step=2416] Training: 24%|██▍ | 2417/10000 [28:27<1:07:28, 1.87it/s, loss=0.0119, lr=2.28e-05, step=2416] Training: 24%|██▍ | 2417/10000 [28:27<1:07:28, 1.87it/s, loss=0.0276, lr=2.28e-05, step=2417] Training: 24%|██▍ | 2418/10000 [28:27<1:05:35, 1.93it/s, loss=0.0276, lr=2.28e-05, step=2417] Training: 24%|██▍ | 2418/10000 [28:27<1:05:35, 1.93it/s, loss=0.1923, lr=2.28e-05, step=2418] Training: 24%|██▍ | 2419/10000 [28:28<1:11:18, 1.77it/s, loss=0.1923, lr=2.28e-05, step=2418] Training: 24%|██▍ | 2419/10000 [28:28<1:11:18, 1.77it/s, loss=0.0337, lr=2.28e-05, step=2419]16:34:35.563 [I] step=2420 loss=0.0191 smoothed_loss=0.0429 lr=2.28e-05 grad_norm=0.6245 step_time=0.4956s data_time=0.0599s it/s=1.801 eta_to_10000=4209.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0201 grad_action_out_proj=0.2094 grad_shared_expert=0.5389 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2420/10000 [28:29<1:10:13, 1.80it/s, loss=0.0337, lr=2.28e-05, step=2419] Training: 24%|██▍ | 2420/10000 [28:29<1:10:13, 1.80it/s, loss=0.0191, lr=2.28e-05, step=2420] Training: 24%|██▍ | 2421/10000 [28:29<1:07:24, 1.87it/s, loss=0.0191, lr=2.28e-05, step=2420] Training: 24%|██▍ | 2421/10000 [28:29<1:07:24, 1.87it/s, loss=0.0576, lr=2.28e-05, step=2421] Training: 24%|██▍ | 2422/10000 [28:30<1:14:12, 1.70it/s, loss=0.0576, lr=2.28e-05, step=2421] Training: 24%|██▍ | 2422/10000 [28:30<1:14:12, 1.70it/s, loss=0.0225, lr=2.28e-05, step=2422] Training: 24%|██▍ | 2423/10000 [28:30<1:10:13, 1.80it/s, loss=0.0225, lr=2.28e-05, step=2422] Training: 24%|██▍ | 2423/10000 [28:30<1:10:13, 1.80it/s, loss=0.0066, lr=2.28e-05, step=2423] Training: 24%|██▍ | 2424/10000 [28:31<1:07:30, 1.87it/s, loss=0.0066, lr=2.28e-05, step=2423] Training: 24%|██▍ | 2424/10000 [28:31<1:07:30, 1.87it/s, loss=0.0170, lr=2.28e-05, step=2424] Training: 24%|██▍ | 2425/10000 [28:31<1:10:56, 1.78it/s, loss=0.0170, lr=2.28e-05, step=2424] Training: 24%|██▍ | 2425/10000 [28:31<1:10:56, 1.78it/s, loss=0.0316, lr=2.28e-05, step=2425] Training: 24%|██▍ | 2426/10000 [28:32<1:16:57, 1.64it/s, loss=0.0316, lr=2.28e-05, step=2425] Training: 24%|██▍ | 2426/10000 [28:32<1:16:57, 1.64it/s, loss=0.0140, lr=2.28e-05, step=2426] Training: 24%|██▍ | 2427/10000 [28:33<1:11:58, 1.75it/s, loss=0.0140, lr=2.28e-05, step=2426] Training: 24%|██▍ | 2427/10000 [28:33<1:11:58, 1.75it/s, loss=0.0344, lr=2.28e-05, step=2427] Training: 24%|██▍ | 2428/10000 [28:33<1:09:44, 1.81it/s, loss=0.0344, lr=2.28e-05, step=2427] Training: 24%|██▍ | 2428/10000 [28:33<1:09:44, 1.81it/s, loss=0.0083, lr=2.28e-05, step=2428] Training: 24%|██▍ | 2429/10000 [28:34<1:18:52, 1.60it/s, loss=0.0083, lr=2.28e-05, step=2428] Training: 24%|██▍ | 2429/10000 [28:34<1:18:52, 1.60it/s, loss=0.0192, lr=2.28e-05, step=2429]16:34:41.390 [I] step=2430 loss=0.0618 smoothed_loss=0.0333 lr=2.28e-05 grad_norm=0.5696 step_time=0.5213s data_time=0.0614s it/s=1.717 eta_to_10000=4410.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.2176 grad_shared_expert=0.6587 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2430/10000 [28:34<1:15:25, 1.67it/s, loss=0.0192, lr=2.28e-05, step=2429] Training: 24%|██▍ | 2430/10000 [28:34<1:15:25, 1.67it/s, loss=0.0618, lr=2.28e-05, step=2430] Training: 24%|██▍ | 2431/10000 [28:35<1:11:25, 1.77it/s, loss=0.0618, lr=2.28e-05, step=2430] Training: 24%|██▍ | 2431/10000 [28:35<1:11:25, 1.77it/s, loss=0.0205, lr=2.28e-05, step=2431] Training: 24%|██▍ | 2432/10000 [28:35<1:08:53, 1.83it/s, loss=0.0205, lr=2.28e-05, step=2431] Training: 24%|██▍ | 2432/10000 [28:35<1:08:53, 1.83it/s, loss=0.0259, lr=2.28e-05, step=2432] Training: 24%|██▍ | 2433/10000 [28:36<1:14:39, 1.69it/s, loss=0.0259, lr=2.28e-05, step=2432] Training: 24%|██▍ | 2433/10000 [28:36<1:14:39, 1.69it/s, loss=0.0709, lr=2.28e-05, step=2433] Training: 24%|██▍ | 2434/10000 [28:37<1:10:51, 1.78it/s, loss=0.0709, lr=2.28e-05, step=2433] Training: 24%|██▍ | 2434/10000 [28:37<1:10:51, 1.78it/s, loss=0.0128, lr=2.28e-05, step=2434] Training: 24%|██▍ | 2435/10000 [28:37<1:07:42, 1.86it/s, loss=0.0128, lr=2.28e-05, step=2434] Training: 24%|██▍ | 2435/10000 [28:37<1:07:42, 1.86it/s, loss=0.0308, lr=2.28e-05, step=2435] Training: 24%|██▍ | 2436/10000 [28:38<1:06:23, 1.90it/s, loss=0.0308, lr=2.28e-05, step=2435] Training: 24%|██▍ | 2436/10000 [28:38<1:06:23, 1.90it/s, loss=0.0330, lr=2.28e-05, step=2436] Training: 24%|██▍ | 2437/10000 [28:38<1:17:07, 1.63it/s, loss=0.0330, lr=2.28e-05, step=2436] Training: 24%|██▍ | 2437/10000 [28:38<1:17:07, 1.63it/s, loss=0.0138, lr=2.28e-05, step=2437] Training: 24%|██▍ | 2438/10000 [28:39<1:13:26, 1.72it/s, loss=0.0138, lr=2.28e-05, step=2437] Training: 24%|██▍ | 2438/10000 [28:39<1:13:26, 1.72it/s, loss=0.0620, lr=2.28e-05, step=2438] Training: 24%|██▍ | 2439/10000 [28:39<1:10:10, 1.80it/s, loss=0.0620, lr=2.28e-05, step=2438] Training: 24%|██▍ | 2439/10000 [28:39<1:10:10, 1.80it/s, loss=0.0111, lr=2.28e-05, step=2439]16:34:47.100 [I] step=2440 loss=0.0095 smoothed_loss=0.0295 lr=2.28e-05 grad_norm=0.5069 step_time=0.5089s data_time=0.0622s it/s=1.752 eta_to_10000=4316.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.1697 grad_shared_expert=0.3905 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2440/10000 [28:40<1:16:29, 1.65it/s, loss=0.0111, lr=2.28e-05, step=2439] Training: 24%|██▍ | 2440/10000 [28:40<1:16:29, 1.65it/s, loss=0.0095, lr=2.28e-05, step=2440] Training: 24%|██▍ | 2441/10000 [28:41<1:12:41, 1.73it/s, loss=0.0095, lr=2.28e-05, step=2440] Training: 24%|██▍ | 2441/10000 [28:41<1:12:41, 1.73it/s, loss=0.0584, lr=2.28e-05, step=2441] Training: 24%|██▍ | 2442/10000 [28:41<1:09:16, 1.82it/s, loss=0.0584, lr=2.28e-05, step=2441] Training: 24%|██▍ | 2442/10000 [28:41<1:09:16, 1.82it/s, loss=0.0035, lr=2.28e-05, step=2442] Training: 24%|██▍ | 2443/10000 [28:42<1:07:05, 1.88it/s, loss=0.0035, lr=2.28e-05, step=2442] Training: 24%|██▍ | 2443/10000 [28:42<1:07:05, 1.88it/s, loss=0.0097, lr=2.28e-05, step=2443] Training: 24%|██▍ | 2444/10000 [28:42<1:13:37, 1.71it/s, loss=0.0097, lr=2.28e-05, step=2443] Training: 24%|██▍ | 2444/10000 [28:42<1:13:37, 1.71it/s, loss=0.0065, lr=2.28e-05, step=2444] Training: 24%|██▍ | 2445/10000 [28:43<1:10:04, 1.80it/s, loss=0.0065, lr=2.28e-05, step=2444] Training: 24%|██▍ | 2445/10000 [28:43<1:10:04, 1.80it/s, loss=0.0176, lr=2.28e-05, step=2445] Training: 24%|██▍ | 2446/10000 [28:43<1:08:09, 1.85it/s, loss=0.0176, lr=2.28e-05, step=2445] Training: 24%|██▍ | 2446/10000 [28:43<1:08:09, 1.85it/s, loss=0.0424, lr=2.28e-05, step=2446] Training: 24%|██▍ | 2447/10000 [28:44<1:14:57, 1.68it/s, loss=0.0424, lr=2.28e-05, step=2446] Training: 24%|██▍ | 2447/10000 [28:44<1:14:57, 1.68it/s, loss=0.0170, lr=2.27e-05, step=2447] Training: 24%|██▍ | 2448/10000 [28:45<1:10:39, 1.78it/s, loss=0.0170, lr=2.27e-05, step=2447] Training: 24%|██▍ | 2448/10000 [28:45<1:10:39, 1.78it/s, loss=0.0227, lr=2.27e-05, step=2448] Training: 24%|██▍ | 2449/10000 [28:45<1:07:31, 1.86it/s, loss=0.0227, lr=2.27e-05, step=2448] Training: 24%|██▍ | 2449/10000 [28:45<1:07:31, 1.86it/s, loss=0.0372, lr=2.27e-05, step=2449]16:34:52.516 [I] step=2450 loss=0.0234 smoothed_loss=0.0261 lr=2.28e-05 grad_norm=0.4804 step_time=0.4833s data_time=0.0582s it/s=1.847 eta_to_10000=4087.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0094 grad_action_out_proj=0.1257 grad_shared_expert=0.3570 (10775:train_pytorch.py:850) + Training: 24%|██▍ | 2450/10000 [28:46<1:07:49, 1.86it/s, loss=0.0372, lr=2.27e-05, step=2449] Training: 24%|██▍ | 2450/10000 [28:46<1:07:49, 1.86it/s, loss=0.0234, lr=2.27e-05, step=2450] Training: 25%|██▍ | 2451/10000 [28:46<1:17:39, 1.62it/s, loss=0.0234, lr=2.27e-05, step=2450] Training: 25%|██▍ | 2451/10000 [28:46<1:17:39, 1.62it/s, loss=0.0166, lr=2.27e-05, step=2451] Training: 25%|██▍ | 2452/10000 [28:47<1:13:02, 1.72it/s, loss=0.0166, lr=2.27e-05, step=2451] Training: 25%|██▍ | 2452/10000 [28:47<1:13:02, 1.72it/s, loss=0.0207, lr=2.27e-05, step=2452] Training: 25%|██▍ | 2453/10000 [28:47<1:10:09, 1.79it/s, loss=0.0207, lr=2.27e-05, step=2452] Training: 25%|██▍ | 2453/10000 [28:47<1:10:09, 1.79it/s, loss=0.0079, lr=2.27e-05, step=2453] Training: 25%|██▍ | 2454/10000 [28:48<1:14:41, 1.68it/s, loss=0.0079, lr=2.27e-05, step=2453] Training: 25%|██▍ | 2454/10000 [28:48<1:14:41, 1.68it/s, loss=0.0307, lr=2.27e-05, step=2454] Training: 25%|██▍ | 2455/10000 [28:49<1:10:49, 1.78it/s, loss=0.0307, lr=2.27e-05, step=2454] Training: 25%|██▍ | 2455/10000 [28:49<1:10:49, 1.78it/s, loss=0.0131, lr=2.27e-05, step=2455] Training: 25%|██▍ | 2456/10000 [28:49<1:07:58, 1.85it/s, loss=0.0131, lr=2.27e-05, step=2455] Training: 25%|██▍ | 2456/10000 [28:49<1:07:58, 1.85it/s, loss=0.0271, lr=2.27e-05, step=2456] Training: 25%|██▍ | 2457/10000 [28:50<1:06:12, 1.90it/s, loss=0.0271, lr=2.27e-05, step=2456] Training: 25%|██▍ | 2457/10000 [28:50<1:06:12, 1.90it/s, loss=0.0171, lr=2.27e-05, step=2457] Training: 25%|██▍ | 2458/10000 [28:50<1:18:02, 1.61it/s, loss=0.0171, lr=2.27e-05, step=2457] Training: 25%|██▍ | 2458/10000 [28:50<1:18:02, 1.61it/s, loss=0.0063, lr=2.27e-05, step=2458] Training: 25%|██▍ | 2459/10000 [28:51<1:12:44, 1.73it/s, loss=0.0063, lr=2.27e-05, step=2458] Training: 25%|██▍ | 2459/10000 [28:51<1:12:44, 1.73it/s, loss=0.0152, lr=2.27e-05, step=2459]16:34:58.319 [I] step=2460 loss=0.0302 smoothed_loss=0.0213 lr=2.27e-05 grad_norm=0.4930 step_time=0.5161s data_time=0.0642s it/s=1.724 eta_to_10000=4374.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1654 grad_shared_expert=0.4354 (10775:train_pytorch.py:850) + Training: 25%|██▍ | 2460/10000 [28:51<1:10:54, 1.77it/s, loss=0.0152, lr=2.27e-05, step=2459] Training: 25%|██▍ | 2460/10000 [28:51<1:10:54, 1.77it/s, loss=0.0302, lr=2.27e-05, step=2460] Training: 25%|██▍ | 2461/10000 [28:52<1:15:49, 1.66it/s, loss=0.0302, lr=2.27e-05, step=2460] Training: 25%|██▍ | 2461/10000 [28:52<1:15:49, 1.66it/s, loss=0.0113, lr=2.27e-05, step=2461] Training: 25%|██▍ | 2462/10000 [28:53<1:11:27, 1.76it/s, loss=0.0113, lr=2.27e-05, step=2461] Training: 25%|██▍ | 2462/10000 [28:53<1:11:27, 1.76it/s, loss=0.0146, lr=2.27e-05, step=2462] Training: 25%|██▍ | 2463/10000 [28:53<1:08:27, 1.83it/s, loss=0.0146, lr=2.27e-05, step=2462] Training: 25%|██▍ | 2463/10000 [28:53<1:08:27, 1.83it/s, loss=0.0234, lr=2.27e-05, step=2463] Training: 25%|██▍ | 2464/10000 [28:54<1:06:03, 1.90it/s, loss=0.0234, lr=2.27e-05, step=2463] Training: 25%|██▍ | 2464/10000 [28:54<1:06:03, 1.90it/s, loss=0.0204, lr=2.27e-05, step=2464] Training: 25%|██▍ | 2465/10000 [28:54<1:14:47, 1.68it/s, loss=0.0204, lr=2.27e-05, step=2464] Training: 25%|██▍ | 2465/10000 [28:54<1:14:47, 1.68it/s, loss=0.0234, lr=2.27e-05, step=2465] Training: 25%|██▍ | 2466/10000 [28:55<1:11:47, 1.75it/s, loss=0.0234, lr=2.27e-05, step=2465] Training: 25%|██▍ | 2466/10000 [28:55<1:11:47, 1.75it/s, loss=0.0420, lr=2.27e-05, step=2466] Training: 25%|██▍ | 2467/10000 [28:55<1:08:41, 1.83it/s, loss=0.0420, lr=2.27e-05, step=2466] Training: 25%|██▍ | 2467/10000 [28:55<1:08:41, 1.83it/s, loss=0.0606, lr=2.27e-05, step=2467] Training: 25%|██▍ | 2468/10000 [28:56<1:17:46, 1.61it/s, loss=0.0606, lr=2.27e-05, step=2467] Training: 25%|██▍ | 2468/10000 [28:56<1:17:46, 1.61it/s, loss=0.0243, lr=2.27e-05, step=2468] Training: 25%|██▍ | 2469/10000 [28:57<1:12:51, 1.72it/s, loss=0.0243, lr=2.27e-05, step=2468] Training: 25%|██▍ | 2469/10000 [28:57<1:12:51, 1.72it/s, loss=0.0099, lr=2.27e-05, step=2469]16:35:04.047 [I] step=2470 loss=0.0407 smoothed_loss=0.0262 lr=2.27e-05 grad_norm=0.5534 step_time=0.5045s data_time=0.0683s it/s=1.746 eta_to_10000=4312.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0196 grad_action_out_proj=0.2349 grad_shared_expert=0.6634 (10775:train_pytorch.py:850) + Training: 25%|██▍ | 2470/10000 [28:57<1:11:02, 1.77it/s, loss=0.0099, lr=2.27e-05, step=2469] Training: 25%|██▍ | 2470/10000 [28:57<1:11:02, 1.77it/s, loss=0.0407, lr=2.27e-05, step=2470] Training: 25%|██▍ | 2471/10000 [28:58<1:07:42, 1.85it/s, loss=0.0407, lr=2.27e-05, step=2470] Training: 25%|██▍ | 2471/10000 [28:58<1:07:42, 1.85it/s, loss=0.0192, lr=2.27e-05, step=2471] Training: 25%|██▍ | 2472/10000 [28:58<1:14:41, 1.68it/s, loss=0.0192, lr=2.27e-05, step=2471] Training: 25%|██▍ | 2472/10000 [28:58<1:14:41, 1.68it/s, loss=0.0198, lr=2.27e-05, step=2472] Training: 25%|██▍ | 2473/10000 [28:59<1:10:27, 1.78it/s, loss=0.0198, lr=2.27e-05, step=2472] Training: 25%|██▍ | 2473/10000 [28:59<1:10:27, 1.78it/s, loss=0.0119, lr=2.27e-05, step=2473] Training: 25%|██▍ | 2474/10000 [28:59<1:08:30, 1.83it/s, loss=0.0119, lr=2.27e-05, step=2473] Training: 25%|██▍ | 2474/10000 [28:59<1:08:30, 1.83it/s, loss=0.0263, lr=2.27e-05, step=2474] Training: 25%|██▍ | 2475/10000 [29:00<1:13:16, 1.71it/s, loss=0.0263, lr=2.27e-05, step=2474] Training: 25%|██▍ | 2475/10000 [29:00<1:13:16, 1.71it/s, loss=0.0135, lr=2.27e-05, step=2475] Training: 25%|██▍ | 2476/10000 [29:00<1:10:02, 1.79it/s, loss=0.0135, lr=2.27e-05, step=2475] Training: 25%|██▍ | 2476/10000 [29:00<1:10:02, 1.79it/s, loss=0.0065, lr=2.27e-05, step=2476] Training: 25%|██▍ | 2477/10000 [29:01<1:07:21, 1.86it/s, loss=0.0065, lr=2.27e-05, step=2476] Training: 25%|██▍ | 2477/10000 [29:01<1:07:21, 1.86it/s, loss=0.0161, lr=2.27e-05, step=2477] Training: 25%|██▍ | 2478/10000 [29:01<1:05:10, 1.92it/s, loss=0.0161, lr=2.27e-05, step=2477] Training: 25%|██▍ | 2478/10000 [29:01<1:05:10, 1.92it/s, loss=0.0115, lr=2.27e-05, step=2478] Training: 25%|██▍ | 2479/10000 [29:02<1:04:02, 1.96it/s, loss=0.0115, lr=2.27e-05, step=2478] Training: 25%|██▍ | 2479/10000 [29:02<1:04:02, 1.96it/s, loss=0.0240, lr=2.27e-05, step=2479]16:35:09.696 [I] step=2480 loss=0.0264 smoothed_loss=0.0208 lr=2.27e-05 grad_norm=0.5709 step_time=0.5070s data_time=0.0580s it/s=1.770 eta_to_10000=4247.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0156 grad_action_out_proj=0.2059 grad_shared_expert=0.4758 (10775:train_pytorch.py:850) + Training: 25%|██▍ | 2480/10000 [29:03<1:15:50, 1.65it/s, loss=0.0240, lr=2.27e-05, step=2479] Training: 25%|██▍ | 2480/10000 [29:03<1:15:50, 1.65it/s, loss=0.0264, lr=2.27e-05, step=2480] Training: 25%|██▍ | 2481/10000 [29:03<1:11:39, 1.75it/s, loss=0.0264, lr=2.27e-05, step=2480] Training: 25%|██▍ | 2481/10000 [29:03<1:11:39, 1.75it/s, loss=0.0292, lr=2.27e-05, step=2481] Training: 25%|██▍ | 2482/10000 [29:04<1:13:16, 1.71it/s, loss=0.0292, lr=2.27e-05, step=2481] Training: 25%|██▍ | 2482/10000 [29:04<1:13:16, 1.71it/s, loss=0.0901, lr=2.27e-05, step=2482] Training: 25%|██▍ | 2483/10000 [29:05<1:16:37, 1.64it/s, loss=0.0901, lr=2.27e-05, step=2482] Training: 25%|██▍ | 2483/10000 [29:05<1:16:37, 1.64it/s, loss=0.0155, lr=2.27e-05, step=2483] Training: 25%|██▍ | 2484/10000 [29:05<1:12:36, 1.73it/s, loss=0.0155, lr=2.27e-05, step=2483] Training: 25%|██▍ | 2484/10000 [29:05<1:12:36, 1.73it/s, loss=0.0285, lr=2.27e-05, step=2484] Training: 25%|██▍ | 2485/10000 [29:06<1:09:33, 1.80it/s, loss=0.0285, lr=2.27e-05, step=2484] Training: 25%|██▍ | 2485/10000 [29:06<1:09:33, 1.80it/s, loss=0.0398, lr=2.27e-05, step=2485] Training: 25%|██▍ | 2486/10000 [29:06<1:07:12, 1.86it/s, loss=0.0398, lr=2.27e-05, step=2485] Training: 25%|██▍ | 2486/10000 [29:06<1:07:12, 1.86it/s, loss=0.0110, lr=2.27e-05, step=2486] Training: 25%|██▍ | 2487/10000 [29:07<1:14:40, 1.68it/s, loss=0.0110, lr=2.27e-05, step=2486] Training: 25%|██▍ | 2487/10000 [29:07<1:14:40, 1.68it/s, loss=0.0465, lr=2.27e-05, step=2487] Training: 25%|██▍ | 2488/10000 [29:07<1:10:31, 1.78it/s, loss=0.0465, lr=2.27e-05, step=2487] Training: 25%|██▍ | 2488/10000 [29:07<1:10:31, 1.78it/s, loss=0.0055, lr=2.27e-05, step=2488] Training: 25%|██▍ | 2489/10000 [29:08<1:07:32, 1.85it/s, loss=0.0055, lr=2.27e-05, step=2488] Training: 25%|██▍ | 2489/10000 [29:08<1:07:32, 1.85it/s, loss=0.0508, lr=2.27e-05, step=2489]16:35:15.382 [I] step=2490 loss=0.0231 smoothed_loss=0.0283 lr=2.27e-05 grad_norm=0.5675 step_time=0.5043s data_time=0.0642s it/s=1.759 eta_to_10000=4269.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0173 grad_action_out_proj=0.1941 grad_shared_expert=0.5536 (10775:train_pytorch.py:850) + Training: 25%|██▍ | 2490/10000 [29:08<1:13:31, 1.70it/s, loss=0.0508, lr=2.27e-05, step=2489] Training: 25%|██▍ | 2490/10000 [29:08<1:13:31, 1.70it/s, loss=0.0231, lr=2.27e-05, step=2490] Training: 25%|██▍ | 2491/10000 [29:09<1:09:58, 1.79it/s, loss=0.0231, lr=2.27e-05, step=2490] Training: 25%|██▍ | 2491/10000 [29:09<1:09:58, 1.79it/s, loss=0.0244, lr=2.27e-05, step=2491] Training: 25%|██▍ | 2492/10000 [29:09<1:07:25, 1.86it/s, loss=0.0244, lr=2.27e-05, step=2491] Training: 25%|██▍ | 2492/10000 [29:09<1:07:25, 1.86it/s, loss=0.0218, lr=2.26e-05, step=2492] Training: 25%|██▍ | 2493/10000 [29:10<1:06:58, 1.87it/s, loss=0.0218, lr=2.26e-05, step=2492] Training: 25%|██▍ | 2493/10000 [29:10<1:06:58, 1.87it/s, loss=0.0102, lr=2.26e-05, step=2493] Training: 25%|██▍ | 2494/10000 [29:11<1:16:16, 1.64it/s, loss=0.0102, lr=2.26e-05, step=2493] Training: 25%|██▍ | 2494/10000 [29:11<1:16:16, 1.64it/s, loss=0.0132, lr=2.26e-05, step=2494] Training: 25%|██▍ | 2495/10000 [29:11<1:11:21, 1.75it/s, loss=0.0132, lr=2.26e-05, step=2494] Training: 25%|██▍ | 2495/10000 [29:11<1:11:21, 1.75it/s, loss=0.0142, lr=2.26e-05, step=2495] Training: 25%|██▍ | 2496/10000 [29:12<1:07:54, 1.84it/s, loss=0.0142, lr=2.26e-05, step=2495] Training: 25%|██▍ | 2496/10000 [29:12<1:07:54, 1.84it/s, loss=0.0478, lr=2.26e-05, step=2496] Training: 25%|██▍ | 2497/10000 [29:12<1:05:54, 1.90it/s, loss=0.0478, lr=2.26e-05, step=2496] Training: 25%|██▍ | 2497/10000 [29:12<1:05:54, 1.90it/s, loss=0.0074, lr=2.26e-05, step=2497] Training: 25%|██▍ | 2498/10000 [29:13<1:12:34, 1.72it/s, loss=0.0074, lr=2.26e-05, step=2497] Training: 25%|██▍ | 2498/10000 [29:13<1:12:34, 1.72it/s, loss=0.0254, lr=2.26e-05, step=2498] Training: 25%|██▍ | 2499/10000 [29:13<1:09:15, 1.81it/s, loss=0.0254, lr=2.26e-05, step=2498] Training: 25%|██▍ | 2499/10000 [29:13<1:09:15, 1.81it/s, loss=0.0173, lr=2.26e-05, step=2499]16:35:20.863 [I] step=2500 loss=0.0777 smoothed_loss=0.0288 lr=2.26e-05 grad_norm=0.6105 step_time=0.4875s data_time=0.0606s it/s=1.824 eta_to_10000=4110.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0173 grad_action_out_proj=0.1869 grad_shared_expert=0.6288 (10775:train_pytorch.py:850) + Training: 25%|██▌ | 2500/10000 [29:14<1:08:47, 1.82it/s, loss=0.0173, lr=2.26e-05, step=2499] Training: 25%|██▌ | 2500/10000 [29:14<1:08:47, 1.82it/s, loss=0.0777, lr=2.26e-05, step=2500] Training: 25%|██▌ | 2501/10000 [29:14<1:06:45, 1.87it/s, loss=0.0777, lr=2.26e-05, step=2500] Training: 25%|██▌ | 2501/10000 [29:14<1:06:45, 1.87it/s, loss=0.0123, lr=2.26e-05, step=2501] Training: 25%|██▌ | 2502/10000 [29:15<1:18:02, 1.60it/s, loss=0.0123, lr=2.26e-05, step=2501] Training: 25%|██▌ | 2502/10000 [29:15<1:18:02, 1.60it/s, loss=0.0132, lr=2.26e-05, step=2502] Training: 25%|██▌ | 2503/10000 [29:16<1:13:11, 1.71it/s, loss=0.0132, lr=2.26e-05, step=2502] Training: 25%|██▌ | 2503/10000 [29:16<1:13:11, 1.71it/s, loss=0.0153, lr=2.26e-05, step=2503] Training: 25%|██▌ | 2504/10000 [29:16<1:10:41, 1.77it/s, loss=0.0153, lr=2.26e-05, step=2503] Training: 25%|██▌ | 2504/10000 [29:16<1:10:41, 1.77it/s, loss=0.0194, lr=2.26e-05, step=2504] Training: 25%|██▌ | 2505/10000 [29:17<1:15:59, 1.64it/s, loss=0.0194, lr=2.26e-05, step=2504] Training: 25%|██▌ | 2505/10000 [29:17<1:15:59, 1.64it/s, loss=0.0137, lr=2.26e-05, step=2505] Training: 25%|██▌ | 2506/10000 [29:17<1:12:31, 1.72it/s, loss=0.0137, lr=2.26e-05, step=2505] Training: 25%|██▌ | 2506/10000 [29:17<1:12:31, 1.72it/s, loss=0.0163, lr=2.26e-05, step=2506] Training: 25%|██▌ | 2507/10000 [29:18<1:08:55, 1.81it/s, loss=0.0163, lr=2.26e-05, step=2506] Training: 25%|██▌ | 2507/10000 [29:18<1:08:55, 1.81it/s, loss=0.0065, lr=2.26e-05, step=2507] Training: 25%|██▌ | 2508/10000 [29:18<1:07:14, 1.86it/s, loss=0.0065, lr=2.26e-05, step=2507] Training: 25%|██▌ | 2508/10000 [29:18<1:07:14, 1.86it/s, loss=0.0265, lr=2.26e-05, step=2508] Training: 25%|██▌ | 2509/10000 [29:19<1:17:44, 1.61it/s, loss=0.0265, lr=2.26e-05, step=2508] Training: 25%|██▌ | 2509/10000 [29:19<1:17:44, 1.61it/s, loss=0.0373, lr=2.26e-05, step=2509]16:35:26.765 [I] step=2510 loss=0.0120 smoothed_loss=0.0219 lr=2.26e-05 grad_norm=0.5026 step_time=0.5298s data_time=0.0603s it/s=1.695 eta_to_10000=4419.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0237 grad_action_out_proj=0.1590 grad_shared_expert=0.3946 (10775:train_pytorch.py:850) + Training: 25%|██▌ | 2510/10000 [29:20<1:13:52, 1.69it/s, loss=0.0373, lr=2.26e-05, step=2509] Training: 25%|██▌ | 2510/10000 [29:20<1:13:52, 1.69it/s, loss=0.0120, lr=2.26e-05, step=2510] Training: 25%|██▌ | 2511/10000 [29:20<1:10:16, 1.78it/s, loss=0.0120, lr=2.26e-05, step=2510] Training: 25%|██▌ | 2511/10000 [29:20<1:10:16, 1.78it/s, loss=0.0143, lr=2.26e-05, step=2511] Training: 25%|██▌ | 2512/10000 [29:21<1:07:04, 1.86it/s, loss=0.0143, lr=2.26e-05, step=2511] Training: 25%|██▌ | 2512/10000 [29:21<1:07:04, 1.86it/s, loss=0.0326, lr=2.26e-05, step=2512] Training: 25%|██▌ | 2513/10000 [29:21<1:13:00, 1.71it/s, loss=0.0326, lr=2.26e-05, step=2512] Training: 25%|██▌ | 2513/10000 [29:22<1:13:00, 1.71it/s, loss=0.0205, lr=2.26e-05, step=2513] Training: 25%|██▌ | 2514/10000 [29:22<1:09:29, 1.80it/s, loss=0.0205, lr=2.26e-05, step=2513] Training: 25%|██▌ | 2514/10000 [29:22<1:09:29, 1.80it/s, loss=0.0402, lr=2.26e-05, step=2514] Training: 25%|██▌ | 2515/10000 [29:23<1:16:16, 1.64it/s, loss=0.0402, lr=2.26e-05, step=2514] Training: 25%|██▌ | 2515/10000 [29:23<1:16:16, 1.64it/s, loss=0.0209, lr=2.26e-05, step=2515] Training: 25%|██▌ | 2516/10000 [29:23<1:11:46, 1.74it/s, loss=0.0209, lr=2.26e-05, step=2515] Training: 25%|██▌ | 2516/10000 [29:23<1:11:46, 1.74it/s, loss=0.0039, lr=2.26e-05, step=2516] Training: 25%|██▌ | 2517/10000 [29:24<1:09:08, 1.80it/s, loss=0.0039, lr=2.26e-05, step=2516] Training: 25%|██▌ | 2517/10000 [29:24<1:09:08, 1.80it/s, loss=0.0299, lr=2.26e-05, step=2517] Training: 25%|██▌ | 2518/10000 [29:24<1:06:12, 1.88it/s, loss=0.0299, lr=2.26e-05, step=2517] Training: 25%|██▌ | 2518/10000 [29:24<1:06:12, 1.88it/s, loss=0.0866, lr=2.26e-05, step=2518] Training: 25%|██▌ | 2519/10000 [29:25<1:05:30, 1.90it/s, loss=0.0866, lr=2.26e-05, step=2518] Training: 25%|██▌ | 2519/10000 [29:25<1:05:30, 1.90it/s, loss=0.0107, lr=2.26e-05, step=2519]16:35:32.332 [I] step=2520 loss=0.0109 smoothed_loss=0.0255 lr=2.26e-05 grad_norm=0.5781 step_time=0.4984s data_time=0.0583s it/s=1.796 eta_to_10000=4164.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0251 grad_action_out_proj=0.2695 grad_shared_expert=0.4935 (10775:train_pytorch.py:850) + Training: 25%|██▌ | 2520/10000 [29:25<1:11:23, 1.75it/s, loss=0.0107, lr=2.26e-05, step=2519] Training: 25%|██▌ | 2520/10000 [29:25<1:11:23, 1.75it/s, loss=0.0109, lr=2.26e-05, step=2520] Training: 25%|██▌ | 2521/10000 [29:26<1:08:00, 1.83it/s, loss=0.0109, lr=2.26e-05, step=2520] Training: 25%|██▌ | 2521/10000 [29:26<1:08:00, 1.83it/s, loss=0.0126, lr=2.26e-05, step=2521] Training: 25%|██▌ | 2522/10000 [29:26<1:05:31, 1.90it/s, loss=0.0126, lr=2.26e-05, step=2521] Training: 25%|██▌ | 2522/10000 [29:26<1:05:31, 1.90it/s, loss=0.0153, lr=2.26e-05, step=2522] Training: 25%|██▌ | 2523/10000 [29:27<1:11:37, 1.74it/s, loss=0.0153, lr=2.26e-05, step=2522] Training: 25%|██▌ | 2523/10000 [29:27<1:11:37, 1.74it/s, loss=0.0131, lr=2.26e-05, step=2523] Training: 25%|██▌ | 2524/10000 [29:28<1:09:16, 1.80it/s, loss=0.0131, lr=2.26e-05, step=2523] Training: 25%|██▌ | 2524/10000 [29:28<1:09:16, 1.80it/s, loss=0.0581, lr=2.26e-05, step=2524] Training: 25%|██▌ | 2525/10000 [29:28<1:06:32, 1.87it/s, loss=0.0581, lr=2.26e-05, step=2524] Training: 25%|██▌ | 2525/10000 [29:28<1:06:32, 1.87it/s, loss=0.0078, lr=2.26e-05, step=2525] Training: 25%|██▌ | 2526/10000 [29:29<1:05:03, 1.91it/s, loss=0.0078, lr=2.26e-05, step=2525] Training: 25%|██▌ | 2526/10000 [29:29<1:05:03, 1.91it/s, loss=0.0368, lr=2.26e-05, step=2526] Training: 25%|██▌ | 2527/10000 [29:29<1:11:30, 1.74it/s, loss=0.0368, lr=2.26e-05, step=2526] Training: 25%|██▌ | 2527/10000 [29:29<1:11:30, 1.74it/s, loss=0.0367, lr=2.26e-05, step=2527] Training: 25%|██▌ | 2528/10000 [29:30<1:08:36, 1.82it/s, loss=0.0367, lr=2.26e-05, step=2527] Training: 25%|██▌ | 2528/10000 [29:30<1:08:36, 1.82it/s, loss=0.0124, lr=2.26e-05, step=2528] Training: 25%|██▌ | 2529/10000 [29:30<1:06:26, 1.87it/s, loss=0.0124, lr=2.26e-05, step=2528] Training: 25%|██▌ | 2529/10000 [29:30<1:06:26, 1.87it/s, loss=0.0421, lr=2.26e-05, step=2529]16:35:37.871 [I] step=2530 loss=0.0209 smoothed_loss=0.0262 lr=2.26e-05 grad_norm=0.5721 step_time=0.4925s data_time=0.0614s it/s=1.806 eta_to_10000=4136.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1408 grad_shared_expert=0.4911 (10775:train_pytorch.py:850) + Training: 25%|██▌ | 2530/10000 [29:31<1:13:08, 1.70it/s, loss=0.0421, lr=2.26e-05, step=2529] Training: 25%|██▌ | 2530/10000 [29:31<1:13:08, 1.70it/s, loss=0.0209, lr=2.26e-05, step=2530] Training: 25%|██▌ | 2531/10000 [29:31<1:09:30, 1.79it/s, loss=0.0209, lr=2.26e-05, step=2530] Training: 25%|██▌ | 2531/10000 [29:31<1:09:30, 1.79it/s, loss=0.0093, lr=2.26e-05, step=2531] Training: 25%|██▌ | 2532/10000 [29:32<1:10:51, 1.76it/s, loss=0.0093, lr=2.26e-05, step=2531] Training: 25%|██▌ | 2532/10000 [29:32<1:10:51, 1.76it/s, loss=0.0090, lr=2.26e-05, step=2532] Training: 25%|██▌ | 2533/10000 [29:33<1:08:51, 1.81it/s, loss=0.0090, lr=2.26e-05, step=2532] Training: 25%|██▌ | 2533/10000 [29:33<1:08:51, 1.81it/s, loss=0.0130, lr=2.26e-05, step=2533] Training: 25%|██▌ | 2534/10000 [29:33<1:19:06, 1.57it/s, loss=0.0130, lr=2.26e-05, step=2533] Training: 25%|██▌ | 2534/10000 [29:33<1:19:06, 1.57it/s, loss=0.0226, lr=2.26e-05, step=2534] Training: 25%|██▌ | 2535/10000 [29:34<1:13:28, 1.69it/s, loss=0.0226, lr=2.26e-05, step=2534] Training: 25%|██▌ | 2535/10000 [29:34<1:13:28, 1.69it/s, loss=0.0124, lr=2.25e-05, step=2535] Training: 25%|██▌ | 2536/10000 [29:34<1:10:05, 1.77it/s, loss=0.0124, lr=2.25e-05, step=2535] Training: 25%|██▌ | 2536/10000 [29:34<1:10:05, 1.77it/s, loss=0.0235, lr=2.25e-05, step=2536] Training: 25%|██▌ | 2537/10000 [29:35<1:19:07, 1.57it/s, loss=0.0235, lr=2.25e-05, step=2536] Training: 25%|██▌ | 2537/10000 [29:35<1:19:07, 1.57it/s, loss=0.0374, lr=2.25e-05, step=2537] Training: 25%|██▌ | 2538/10000 [29:36<1:20:42, 1.54it/s, loss=0.0374, lr=2.25e-05, step=2537] Training: 25%|██▌ | 2538/10000 [29:36<1:20:42, 1.54it/s, loss=0.0116, lr=2.25e-05, step=2538] Training: 25%|██▌ | 2539/10000 [29:36<1:14:40, 1.67it/s, loss=0.0116, lr=2.25e-05, step=2538] Training: 25%|██▌ | 2539/10000 [29:36<1:14:40, 1.67it/s, loss=0.0186, lr=2.25e-05, step=2539]16:35:43.948 [I] step=2540 loss=0.0464 smoothed_loss=0.0239 lr=2.25e-05 grad_norm=0.5275 step_time=0.5332s data_time=0.0746s it/s=1.646 eta_to_10000=4532.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.1611 grad_shared_expert=0.5731 (10775:train_pytorch.py:850) + Training: 25%|██▌ | 2540/10000 [29:37<1:18:01, 1.59it/s, loss=0.0186, lr=2.25e-05, step=2539] Training: 25%|██▌ | 2540/10000 [29:37<1:18:01, 1.59it/s, loss=0.0464, lr=2.25e-05, step=2540] Training: 25%|██▌ | 2541/10000 [29:37<1:12:42, 1.71it/s, loss=0.0464, lr=2.25e-05, step=2540] Training: 25%|██▌ | 2541/10000 [29:37<1:12:42, 1.71it/s, loss=0.0126, lr=2.25e-05, step=2541] Training: 25%|██▌ | 2542/10000 [29:38<1:08:48, 1.81it/s, loss=0.0126, lr=2.25e-05, step=2541] Training: 25%|██▌ | 2542/10000 [29:38<1:08:48, 1.81it/s, loss=0.0188, lr=2.25e-05, step=2542] Training: 25%|██▌ | 2543/10000 [29:38<1:05:58, 1.88it/s, loss=0.0188, lr=2.25e-05, step=2542] Training: 25%|██▌ | 2543/10000 [29:38<1:05:58, 1.88it/s, loss=0.0079, lr=2.25e-05, step=2543] Training: 25%|██▌ | 2544/10000 [29:39<1:13:24, 1.69it/s, loss=0.0079, lr=2.25e-05, step=2543] Training: 25%|██▌ | 2544/10000 [29:39<1:13:24, 1.69it/s, loss=0.0068, lr=2.25e-05, step=2544] Training: 25%|██▌ | 2545/10000 [29:40<1:11:28, 1.74it/s, loss=0.0068, lr=2.25e-05, step=2544] Training: 25%|██▌ | 2545/10000 [29:40<1:11:28, 1.74it/s, loss=0.0254, lr=2.25e-05, step=2545] Training: 25%|██▌ | 2546/10000 [29:40<1:08:47, 1.81it/s, loss=0.0254, lr=2.25e-05, step=2545] Training: 25%|██▌ | 2546/10000 [29:40<1:08:47, 1.81it/s, loss=0.0321, lr=2.25e-05, step=2546] Training: 25%|██▌ | 2547/10000 [29:41<1:19:48, 1.56it/s, loss=0.0321, lr=2.25e-05, step=2546] Training: 25%|██▌ | 2547/10000 [29:41<1:19:48, 1.56it/s, loss=0.0152, lr=2.25e-05, step=2547] Training: 25%|██▌ | 2548/10000 [29:42<1:19:52, 1.55it/s, loss=0.0152, lr=2.25e-05, step=2547] Training: 25%|██▌ | 2548/10000 [29:42<1:19:52, 1.55it/s, loss=0.0256, lr=2.25e-05, step=2548] Training: 25%|██▌ | 2549/10000 [29:42<1:15:11, 1.65it/s, loss=0.0256, lr=2.25e-05, step=2548] Training: 25%|██▌ | 2549/10000 [29:42<1:15:11, 1.65it/s, loss=0.0154, lr=2.25e-05, step=2549]16:35:49.703 [I] step=2550 loss=0.0276 smoothed_loss=0.0213 lr=2.25e-05 grad_norm=0.5695 step_time=0.4925s data_time=0.0830s it/s=1.738 eta_to_10000=4286.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.1557 grad_shared_expert=0.7928 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2550/10000 [29:43<1:12:13, 1.72it/s, loss=0.0154, lr=2.25e-05, step=2549] Training: 26%|██▌ | 2550/10000 [29:43<1:12:13, 1.72it/s, loss=0.0276, lr=2.25e-05, step=2550] Training: 26%|██▌ | 2551/10000 [29:44<1:19:03, 1.57it/s, loss=0.0276, lr=2.25e-05, step=2550] Training: 26%|██▌ | 2551/10000 [29:44<1:19:03, 1.57it/s, loss=0.0164, lr=2.25e-05, step=2551] Training: 26%|██▌ | 2552/10000 [29:44<1:13:31, 1.69it/s, loss=0.0164, lr=2.25e-05, step=2551] Training: 26%|██▌ | 2552/10000 [29:44<1:13:31, 1.69it/s, loss=0.0191, lr=2.25e-05, step=2552] Training: 26%|██▌ | 2553/10000 [29:45<1:10:25, 1.76it/s, loss=0.0191, lr=2.25e-05, step=2552] Training: 26%|██▌ | 2553/10000 [29:45<1:10:25, 1.76it/s, loss=0.0326, lr=2.25e-05, step=2553] Training: 26%|██▌ | 2554/10000 [29:46<1:28:01, 1.41it/s, loss=0.0326, lr=2.25e-05, step=2553] Training: 26%|██▌ | 2554/10000 [29:46<1:28:01, 1.41it/s, loss=0.0180, lr=2.25e-05, step=2554] Training: 26%|██▌ | 2555/10000 [29:46<1:28:41, 1.40it/s, loss=0.0180, lr=2.25e-05, step=2554] Training: 26%|██▌ | 2555/10000 [29:46<1:28:41, 1.40it/s, loss=0.0225, lr=2.25e-05, step=2555] Training: 26%|██▌ | 2556/10000 [29:47<1:21:41, 1.52it/s, loss=0.0225, lr=2.25e-05, step=2555] Training: 26%|██▌ | 2556/10000 [29:47<1:21:41, 1.52it/s, loss=0.0075, lr=2.25e-05, step=2556] Training: 26%|██▌ | 2557/10000 [29:47<1:16:12, 1.63it/s, loss=0.0075, lr=2.25e-05, step=2556] Training: 26%|██▌ | 2557/10000 [29:47<1:16:12, 1.63it/s, loss=0.0193, lr=2.25e-05, step=2557] Training: 26%|██▌ | 2558/10000 [29:48<1:22:22, 1.51it/s, loss=0.0193, lr=2.25e-05, step=2557] Training: 26%|██▌ | 2558/10000 [29:48<1:22:22, 1.51it/s, loss=0.0436, lr=2.25e-05, step=2558] Training: 26%|██▌ | 2559/10000 [29:49<1:22:37, 1.50it/s, loss=0.0436, lr=2.25e-05, step=2558] Training: 26%|██▌ | 2559/10000 [29:49<1:22:37, 1.50it/s, loss=0.0127, lr=2.25e-05, step=2559]16:35:56.293 [I] step=2560 loss=0.0294 smoothed_loss=0.0223 lr=2.25e-05 grad_norm=0.5370 step_time=0.5637s data_time=0.0953s it/s=1.518 eta_to_10000=4901.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.1216 grad_shared_expert=0.5153 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2560/10000 [29:49<1:18:59, 1.57it/s, loss=0.0127, lr=2.25e-05, step=2559] Training: 26%|██▌ | 2560/10000 [29:49<1:18:59, 1.57it/s, loss=0.0294, lr=2.25e-05, step=2560] Training: 26%|██▌ | 2561/10000 [29:50<1:13:18, 1.69it/s, loss=0.0294, lr=2.25e-05, step=2560] Training: 26%|██▌ | 2561/10000 [29:50<1:13:18, 1.69it/s, loss=0.0443, lr=2.25e-05, step=2561] Training: 26%|██▌ | 2562/10000 [29:51<1:16:20, 1.62it/s, loss=0.0443, lr=2.25e-05, step=2561] Training: 26%|██▌ | 2562/10000 [29:51<1:16:20, 1.62it/s, loss=0.0361, lr=2.25e-05, step=2562] Training: 26%|██▌ | 2563/10000 [29:51<1:11:06, 1.74it/s, loss=0.0361, lr=2.25e-05, step=2562] Training: 26%|██▌ | 2563/10000 [29:51<1:11:06, 1.74it/s, loss=0.0463, lr=2.25e-05, step=2563] Training: 26%|██▌ | 2564/10000 [29:51<1:08:13, 1.82it/s, loss=0.0463, lr=2.25e-05, step=2563] Training: 26%|██▌ | 2564/10000 [29:51<1:08:13, 1.82it/s, loss=0.0151, lr=2.25e-05, step=2564] Training: 26%|██▌ | 2565/10000 [29:52<1:06:39, 1.86it/s, loss=0.0151, lr=2.25e-05, step=2564] Training: 26%|██▌ | 2565/10000 [29:52<1:06:39, 1.86it/s, loss=0.0154, lr=2.25e-05, step=2565] Training: 26%|██▌ | 2566/10000 [29:53<1:19:40, 1.55it/s, loss=0.0154, lr=2.25e-05, step=2565] Training: 26%|██▌ | 2566/10000 [29:53<1:19:40, 1.55it/s, loss=0.0217, lr=2.25e-05, step=2566] Training: 26%|██▌ | 2567/10000 [29:54<1:22:23, 1.50it/s, loss=0.0217, lr=2.25e-05, step=2566] Training: 26%|██▌ | 2567/10000 [29:54<1:22:23, 1.50it/s, loss=0.0249, lr=2.25e-05, step=2567] Training: 26%|██▌ | 2568/10000 [29:54<1:21:04, 1.53it/s, loss=0.0249, lr=2.25e-05, step=2567] Training: 26%|██▌ | 2568/10000 [29:54<1:21:04, 1.53it/s, loss=0.0069, lr=2.25e-05, step=2568] Training: 26%|██▌ | 2569/10000 [29:55<1:15:03, 1.65it/s, loss=0.0069, lr=2.25e-05, step=2568] Training: 26%|██▌ | 2569/10000 [29:55<1:15:03, 1.65it/s, loss=0.0558, lr=2.25e-05, step=2569]16:36:02.351 [I] step=2570 loss=0.0508 smoothed_loss=0.0289 lr=2.25e-05 grad_norm=0.5722 step_time=0.5271s data_time=0.0787s it/s=1.651 eta_to_10000=4500.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0116 grad_action_out_proj=0.1794 grad_shared_expert=0.4411 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2570/10000 [29:55<1:18:14, 1.58it/s, loss=0.0558, lr=2.25e-05, step=2569] Training: 26%|██▌ | 2570/10000 [29:55<1:18:14, 1.58it/s, loss=0.0508, lr=2.25e-05, step=2570] Training: 26%|██▌ | 2571/10000 [29:56<1:15:35, 1.64it/s, loss=0.0508, lr=2.25e-05, step=2570] Training: 26%|██▌ | 2571/10000 [29:56<1:15:35, 1.64it/s, loss=0.0095, lr=2.25e-05, step=2571] Training: 26%|██▌ | 2572/10000 [29:57<1:15:46, 1.63it/s, loss=0.0095, lr=2.25e-05, step=2571] Training: 26%|██▌ | 2572/10000 [29:57<1:15:46, 1.63it/s, loss=0.0310, lr=2.25e-05, step=2572] Training: 26%|██▌ | 2573/10000 [29:57<1:22:52, 1.49it/s, loss=0.0310, lr=2.25e-05, step=2572] Training: 26%|██▌ | 2573/10000 [29:57<1:22:52, 1.49it/s, loss=0.0132, lr=2.25e-05, step=2573] Training: 26%|██▌ | 2574/10000 [29:58<1:16:34, 1.62it/s, loss=0.0132, lr=2.25e-05, step=2573] Training: 26%|██▌ | 2574/10000 [29:58<1:16:34, 1.62it/s, loss=0.0111, lr=2.25e-05, step=2574] Training: 26%|██▌ | 2575/10000 [29:58<1:11:43, 1.73it/s, loss=0.0111, lr=2.25e-05, step=2574] Training: 26%|██▌ | 2575/10000 [29:58<1:11:43, 1.73it/s, loss=0.0141, lr=2.25e-05, step=2575] Training: 26%|██▌ | 2576/10000 [29:59<1:17:26, 1.60it/s, loss=0.0141, lr=2.25e-05, step=2575] Training: 26%|██▌ | 2576/10000 [29:59<1:17:26, 1.60it/s, loss=0.0290, lr=2.25e-05, step=2576] Training: 26%|██▌ | 2577/10000 [30:00<1:18:34, 1.57it/s, loss=0.0290, lr=2.25e-05, step=2576] Training: 26%|██▌ | 2577/10000 [30:00<1:18:34, 1.57it/s, loss=0.0104, lr=2.25e-05, step=2577] Training: 26%|██▌ | 2578/10000 [30:00<1:12:50, 1.70it/s, loss=0.0104, lr=2.25e-05, step=2577] Training: 26%|██▌ | 2578/10000 [30:00<1:12:50, 1.70it/s, loss=0.0453, lr=2.24e-05, step=2578] Training: 26%|██▌ | 2579/10000 [30:01<1:09:01, 1.79it/s, loss=0.0453, lr=2.24e-05, step=2578] Training: 26%|██▌ | 2579/10000 [30:01<1:09:01, 1.79it/s, loss=0.0207, lr=2.24e-05, step=2579]16:36:08.514 [I] step=2580 loss=0.0256 smoothed_loss=0.0246 lr=2.25e-05 grad_norm=0.5100 step_time=0.5370s data_time=0.0792s it/s=1.623 eta_to_10000=4571.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0118 grad_action_out_proj=0.1462 grad_shared_expert=0.5523 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2580/10000 [30:02<1:19:22, 1.56it/s, loss=0.0207, lr=2.24e-05, step=2579] Training: 26%|██▌ | 2580/10000 [30:02<1:19:22, 1.56it/s, loss=0.0256, lr=2.24e-05, step=2580] Training: 26%|██▌ | 2581/10000 [30:02<1:22:35, 1.50it/s, loss=0.0256, lr=2.24e-05, step=2580] Training: 26%|██▌ | 2581/10000 [30:02<1:22:35, 1.50it/s, loss=0.0141, lr=2.24e-05, step=2581] Training: 26%|██▌ | 2582/10000 [30:03<1:16:08, 1.62it/s, loss=0.0141, lr=2.24e-05, step=2581] Training: 26%|██▌ | 2582/10000 [30:03<1:16:08, 1.62it/s, loss=0.0190, lr=2.24e-05, step=2582] Training: 26%|██▌ | 2583/10000 [30:03<1:11:35, 1.73it/s, loss=0.0190, lr=2.24e-05, step=2582] Training: 26%|██▌ | 2583/10000 [30:03<1:11:35, 1.73it/s, loss=0.0562, lr=2.24e-05, step=2583] Training: 26%|██▌ | 2584/10000 [30:04<1:23:06, 1.49it/s, loss=0.0562, lr=2.24e-05, step=2583] Training: 26%|██▌ | 2584/10000 [30:04<1:23:06, 1.49it/s, loss=0.0468, lr=2.24e-05, step=2584] Training: 26%|██▌ | 2585/10000 [30:05<1:16:52, 1.61it/s, loss=0.0468, lr=2.24e-05, step=2584] Training: 26%|██▌ | 2585/10000 [30:05<1:16:52, 1.61it/s, loss=0.0123, lr=2.24e-05, step=2585] Training: 26%|██▌ | 2586/10000 [30:05<1:20:26, 1.54it/s, loss=0.0123, lr=2.24e-05, step=2585] Training: 26%|██▌ | 2586/10000 [30:05<1:20:26, 1.54it/s, loss=0.0147, lr=2.24e-05, step=2586] Training: 26%|██▌ | 2587/10000 [30:06<1:28:09, 1.40it/s, loss=0.0147, lr=2.24e-05, step=2586] Training: 26%|██▌ | 2587/10000 [30:06<1:28:09, 1.40it/s, loss=0.0384, lr=2.24e-05, step=2587] Training: 26%|██▌ | 2588/10000 [30:07<1:21:30, 1.52it/s, loss=0.0384, lr=2.24e-05, step=2587] Training: 26%|██▌ | 2588/10000 [30:07<1:21:30, 1.52it/s, loss=0.0178, lr=2.24e-05, step=2588] Training: 26%|██▌ | 2589/10000 [30:07<1:15:43, 1.63it/s, loss=0.0178, lr=2.24e-05, step=2588] Training: 26%|██▌ | 2589/10000 [30:07<1:15:43, 1.63it/s, loss=0.0101, lr=2.24e-05, step=2589]16:36:14.765 [I] step=2590 loss=0.0428 smoothed_loss=0.0262 lr=2.24e-05 grad_norm=0.6074 step_time=0.5199s data_time=0.1052s it/s=1.600 eta_to_10000=4630.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0189 grad_action_out_proj=0.1913 grad_shared_expert=1.1513 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2590/10000 [30:08<1:12:21, 1.71it/s, loss=0.0101, lr=2.24e-05, step=2589] Training: 26%|██▌ | 2590/10000 [30:08<1:12:21, 1.71it/s, loss=0.0428, lr=2.24e-05, step=2590] Training: 26%|██▌ | 2591/10000 [30:09<1:18:49, 1.57it/s, loss=0.0428, lr=2.24e-05, step=2590] Training: 26%|██▌ | 2591/10000 [30:09<1:18:49, 1.57it/s, loss=0.0088, lr=2.24e-05, step=2591] Training: 26%|██▌ | 2592/10000 [30:09<1:28:20, 1.40it/s, loss=0.0088, lr=2.24e-05, step=2591] Training: 26%|██▌ | 2592/10000 [30:09<1:28:20, 1.40it/s, loss=0.0475, lr=2.24e-05, step=2592] Training: 26%|██▌ | 2593/10000 [30:10<1:21:10, 1.52it/s, loss=0.0475, lr=2.24e-05, step=2592] Training: 26%|██▌ | 2593/10000 [30:10<1:21:10, 1.52it/s, loss=0.0280, lr=2.24e-05, step=2593] Training: 26%|██▌ | 2594/10000 [30:11<1:35:00, 1.30it/s, loss=0.0280, lr=2.24e-05, step=2593] Training: 26%|██▌ | 2594/10000 [30:11<1:35:00, 1.30it/s, loss=0.0306, lr=2.24e-05, step=2594] Training: 26%|██▌ | 2595/10000 [30:12<1:25:12, 1.45it/s, loss=0.0306, lr=2.24e-05, step=2594] Training: 26%|██▌ | 2595/10000 [30:12<1:25:12, 1.45it/s, loss=0.0500, lr=2.24e-05, step=2595] Training: 26%|██▌ | 2596/10000 [30:12<1:19:39, 1.55it/s, loss=0.0500, lr=2.24e-05, step=2595] Training: 26%|██▌ | 2596/10000 [30:12<1:19:39, 1.55it/s, loss=0.0126, lr=2.24e-05, step=2596] Training: 26%|██▌ | 2597/10000 [30:13<1:26:35, 1.42it/s, loss=0.0126, lr=2.24e-05, step=2596] Training: 26%|██▌ | 2597/10000 [30:13<1:26:35, 1.42it/s, loss=0.0385, lr=2.24e-05, step=2597] Training: 26%|██▌ | 2598/10000 [30:13<1:19:45, 1.55it/s, loss=0.0385, lr=2.24e-05, step=2597] Training: 26%|██▌ | 2598/10000 [30:13<1:19:45, 1.55it/s, loss=0.0242, lr=2.24e-05, step=2598] Training: 26%|██▌ | 2599/10000 [30:14<1:24:29, 1.46it/s, loss=0.0242, lr=2.24e-05, step=2598] Training: 26%|██▌ | 2599/10000 [30:14<1:24:29, 1.46it/s, loss=0.0126, lr=2.24e-05, step=2599]16:36:21.681 [I] step=2600 loss=0.0102 smoothed_loss=0.0252 lr=2.24e-05 grad_norm=0.7109 step_time=0.5754s data_time=0.1161s it/s=1.446 eta_to_10000=5117.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0217 grad_action_out_proj=0.1689 grad_shared_expert=0.4626 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2600/10000 [30:15<1:18:54, 1.56it/s, loss=0.0126, lr=2.24e-05, step=2599] Training: 26%|██▌ | 2600/10000 [30:15<1:18:54, 1.56it/s, loss=0.0102, lr=2.24e-05, step=2600] Training: 26%|██▌ | 2601/10000 [30:16<1:31:05, 1.35it/s, loss=0.0102, lr=2.24e-05, step=2600] Training: 26%|██▌ | 2601/10000 [30:16<1:31:05, 1.35it/s, loss=0.0822, lr=2.24e-05, step=2601] Training: 26%|██▌ | 2602/10000 [30:16<1:21:53, 1.51it/s, loss=0.0822, lr=2.24e-05, step=2601] Training: 26%|██▌ | 2602/10000 [30:16<1:21:53, 1.51it/s, loss=0.0090, lr=2.24e-05, step=2602] Training: 26%|██▌ | 2603/10000 [30:17<1:15:38, 1.63it/s, loss=0.0090, lr=2.24e-05, step=2602] Training: 26%|██▌ | 2603/10000 [30:17<1:15:38, 1.63it/s, loss=0.0199, lr=2.24e-05, step=2603] Training: 26%|██▌ | 2604/10000 [30:17<1:11:35, 1.72it/s, loss=0.0199, lr=2.24e-05, step=2603] Training: 26%|██▌ | 2604/10000 [30:17<1:11:35, 1.72it/s, loss=0.0122, lr=2.24e-05, step=2604] Training: 26%|██▌ | 2605/10000 [30:18<1:08:01, 1.81it/s, loss=0.0122, lr=2.24e-05, step=2604] Training: 26%|██▌ | 2605/10000 [30:18<1:08:01, 1.81it/s, loss=0.0134, lr=2.24e-05, step=2605] Training: 26%|██▌ | 2606/10000 [30:18<1:12:28, 1.70it/s, loss=0.0134, lr=2.24e-05, step=2605] Training: 26%|██▌ | 2606/10000 [30:18<1:12:28, 1.70it/s, loss=0.0041, lr=2.24e-05, step=2606] Training: 26%|██▌ | 2607/10000 [30:19<1:08:38, 1.80it/s, loss=0.0041, lr=2.24e-05, step=2606] Training: 26%|██▌ | 2607/10000 [30:19<1:08:38, 1.80it/s, loss=0.0619, lr=2.24e-05, step=2607] Training: 26%|██▌ | 2608/10000 [30:19<1:05:40, 1.88it/s, loss=0.0619, lr=2.24e-05, step=2607] Training: 26%|██▌ | 2608/10000 [30:19<1:05:40, 1.88it/s, loss=0.0150, lr=2.24e-05, step=2608] Training: 26%|██▌ | 2609/10000 [30:20<1:10:43, 1.74it/s, loss=0.0150, lr=2.24e-05, step=2608] Training: 26%|██▌ | 2609/10000 [30:20<1:10:43, 1.74it/s, loss=0.0128, lr=2.24e-05, step=2609]16:36:27.463 [I] step=2610 loss=0.0382 smoothed_loss=0.0257 lr=2.24e-05 grad_norm=0.6115 step_time=0.5079s data_time=0.0703s it/s=1.730 eta_to_10000=4272.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0112 grad_action_out_proj=0.1661 grad_shared_expert=0.4309 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2610/10000 [30:21<1:09:11, 1.78it/s, loss=0.0128, lr=2.24e-05, step=2609] Training: 26%|██▌ | 2610/10000 [30:21<1:09:11, 1.78it/s, loss=0.0382, lr=2.24e-05, step=2610] Training: 26%|██▌ | 2611/10000 [30:21<1:06:43, 1.85it/s, loss=0.0382, lr=2.24e-05, step=2610] Training: 26%|██▌ | 2611/10000 [30:21<1:06:43, 1.85it/s, loss=0.0158, lr=2.24e-05, step=2611] Training: 26%|██▌ | 2612/10000 [30:22<1:08:30, 1.80it/s, loss=0.0158, lr=2.24e-05, step=2611] Training: 26%|██▌ | 2612/10000 [30:22<1:08:30, 1.80it/s, loss=0.0343, lr=2.24e-05, step=2612] Training: 26%|██▌ | 2613/10000 [30:22<1:12:46, 1.69it/s, loss=0.0343, lr=2.24e-05, step=2612] Training: 26%|██▌ | 2613/10000 [30:22<1:12:46, 1.69it/s, loss=0.0206, lr=2.24e-05, step=2613] Training: 26%|██▌ | 2614/10000 [30:23<1:08:42, 1.79it/s, loss=0.0206, lr=2.24e-05, step=2613] Training: 26%|██▌ | 2614/10000 [30:23<1:08:42, 1.79it/s, loss=0.0184, lr=2.24e-05, step=2614] Training: 26%|██▌ | 2615/10000 [30:23<1:06:07, 1.86it/s, loss=0.0184, lr=2.24e-05, step=2614] Training: 26%|██▌ | 2615/10000 [30:23<1:06:07, 1.86it/s, loss=0.0276, lr=2.24e-05, step=2615] Training: 26%|██▌ | 2616/10000 [30:24<1:11:55, 1.71it/s, loss=0.0276, lr=2.24e-05, step=2615] Training: 26%|██▌ | 2616/10000 [30:24<1:11:55, 1.71it/s, loss=0.0349, lr=2.24e-05, step=2616] Training: 26%|██▌ | 2617/10000 [30:24<1:08:27, 1.80it/s, loss=0.0349, lr=2.24e-05, step=2616] Training: 26%|██▌ | 2617/10000 [30:24<1:08:27, 1.80it/s, loss=0.0305, lr=2.24e-05, step=2617] Training: 26%|██▌ | 2618/10000 [30:25<1:05:46, 1.87it/s, loss=0.0305, lr=2.24e-05, step=2617] Training: 26%|██▌ | 2618/10000 [30:25<1:05:46, 1.87it/s, loss=0.0194, lr=2.24e-05, step=2618] Training: 26%|██▌ | 2619/10000 [30:25<1:03:46, 1.93it/s, loss=0.0194, lr=2.24e-05, step=2618] Training: 26%|██▌ | 2619/10000 [30:25<1:03:46, 1.93it/s, loss=0.0104, lr=2.24e-05, step=2619]16:36:33.105 [I] step=2620 loss=0.0092 smoothed_loss=0.0226 lr=2.24e-05 grad_norm=0.5609 step_time=0.5001s data_time=0.0641s it/s=1.773 eta_to_10000=4163.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0114 grad_action_out_proj=0.1190 grad_shared_expert=0.5058 (10775:train_pytorch.py:850) + Training: 26%|██▌ | 2620/10000 [30:26<1:12:52, 1.69it/s, loss=0.0104, lr=2.24e-05, step=2619] Training: 26%|██▌ | 2620/10000 [30:26<1:12:52, 1.69it/s, loss=0.0092, lr=2.23e-05, step=2620] Training: 26%|██▌ | 2621/10000 [30:27<1:08:49, 1.79it/s, loss=0.0092, lr=2.23e-05, step=2620] Training: 26%|██▌ | 2621/10000 [30:27<1:08:49, 1.79it/s, loss=0.0232, lr=2.23e-05, step=2621] Training: 26%|██▌ | 2622/10000 [30:27<1:06:03, 1.86it/s, loss=0.0232, lr=2.23e-05, step=2621] Training: 26%|██▌ | 2622/10000 [30:27<1:06:03, 1.86it/s, loss=0.0243, lr=2.23e-05, step=2622] Training: 26%|██▌ | 2623/10000 [30:28<1:13:06, 1.68it/s, loss=0.0243, lr=2.23e-05, step=2622] Training: 26%|██▌ | 2623/10000 [30:28<1:13:06, 1.68it/s, loss=0.0143, lr=2.23e-05, step=2623] Training: 26%|██▌ | 2624/10000 [30:28<1:09:07, 1.78it/s, loss=0.0143, lr=2.23e-05, step=2623] Training: 26%|██▌ | 2624/10000 [30:28<1:09:07, 1.78it/s, loss=0.0115, lr=2.23e-05, step=2624] Training: 26%|██▋ | 2625/10000 [30:29<1:07:20, 1.83it/s, loss=0.0115, lr=2.23e-05, step=2624] Training: 26%|██▋ | 2625/10000 [30:29<1:07:20, 1.83it/s, loss=0.0067, lr=2.23e-05, step=2625] Training: 26%|██▋ | 2626/10000 [30:29<1:07:42, 1.82it/s, loss=0.0067, lr=2.23e-05, step=2625] Training: 26%|██▋ | 2626/10000 [30:29<1:07:42, 1.82it/s, loss=0.0235, lr=2.23e-05, step=2626] Training: 26%|██▋ | 2627/10000 [30:30<1:11:37, 1.72it/s, loss=0.0235, lr=2.23e-05, step=2626] Training: 26%|██▋ | 2627/10000 [30:30<1:11:37, 1.72it/s, loss=0.0126, lr=2.23e-05, step=2627] Training: 26%|██▋ | 2628/10000 [30:31<1:09:33, 1.77it/s, loss=0.0126, lr=2.23e-05, step=2627] Training: 26%|██▋ | 2628/10000 [30:31<1:09:33, 1.77it/s, loss=0.0059, lr=2.23e-05, step=2628] Training: 26%|██▋ | 2629/10000 [30:31<1:06:53, 1.84it/s, loss=0.0059, lr=2.23e-05, step=2628] Training: 26%|██▋ | 2629/10000 [30:31<1:06:53, 1.84it/s, loss=0.0127, lr=2.23e-05, step=2629]16:36:38.955 [I] step=2630 loss=0.0131 smoothed_loss=0.0169 lr=2.23e-05 grad_norm=0.4910 step_time=0.5183s data_time=0.0667s it/s=1.710 eta_to_10000=4311.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0206 grad_action_out_proj=0.1610 grad_shared_expert=0.4560 (10775:train_pytorch.py:850) + Training: 26%|██▋ | 2630/10000 [30:32<1:20:34, 1.52it/s, loss=0.0127, lr=2.23e-05, step=2629] Training: 26%|██▋ | 2630/10000 [30:32<1:20:34, 1.52it/s, loss=0.0131, lr=2.23e-05, step=2630] Training: 26%|██▋ | 2631/10000 [30:32<1:14:00, 1.66it/s, loss=0.0131, lr=2.23e-05, step=2630] Training: 26%|██▋ | 2631/10000 [30:32<1:14:00, 1.66it/s, loss=0.0069, lr=2.23e-05, step=2631] Training: 26%|██▋ | 2632/10000 [30:33<1:09:32, 1.77it/s, loss=0.0069, lr=2.23e-05, step=2631] Training: 26%|██▋ | 2632/10000 [30:33<1:09:32, 1.77it/s, loss=0.0290, lr=2.23e-05, step=2632] Training: 26%|██▋ | 2633/10000 [30:33<1:06:29, 1.85it/s, loss=0.0290, lr=2.23e-05, step=2632] Training: 26%|██▋ | 2633/10000 [30:33<1:06:29, 1.85it/s, loss=0.0174, lr=2.23e-05, step=2633] Training: 26%|██▋ | 2634/10000 [30:34<1:10:22, 1.74it/s, loss=0.0174, lr=2.23e-05, step=2633] Training: 26%|██▋ | 2634/10000 [30:34<1:10:22, 1.74it/s, loss=0.0392, lr=2.23e-05, step=2634] Training: 26%|██▋ | 2635/10000 [30:35<1:06:49, 1.84it/s, loss=0.0392, lr=2.23e-05, step=2634] Training: 26%|██▋ | 2635/10000 [30:35<1:06:49, 1.84it/s, loss=0.0107, lr=2.23e-05, step=2635] Training: 26%|██▋ | 2636/10000 [30:35<1:05:44, 1.87it/s, loss=0.0107, lr=2.23e-05, step=2635] Training: 26%|██▋ | 2636/10000 [30:35<1:05:44, 1.87it/s, loss=0.0080, lr=2.23e-05, step=2636] Training: 26%|██▋ | 2637/10000 [30:36<1:12:35, 1.69it/s, loss=0.0080, lr=2.23e-05, step=2636] Training: 26%|██▋ | 2637/10000 [30:36<1:12:35, 1.69it/s, loss=0.0095, lr=2.23e-05, step=2637] Training: 26%|██▋ | 2638/10000 [30:36<1:09:09, 1.77it/s, loss=0.0095, lr=2.23e-05, step=2637] Training: 26%|██▋ | 2638/10000 [30:36<1:09:09, 1.77it/s, loss=0.0099, lr=2.23e-05, step=2638] Training: 26%|██▋ | 2639/10000 [30:37<1:06:25, 1.85it/s, loss=0.0099, lr=2.23e-05, step=2638] Training: 26%|██▋ | 2639/10000 [30:37<1:06:25, 1.85it/s, loss=0.0110, lr=2.23e-05, step=2639]16:36:44.287 [I] step=2640 loss=0.0192 smoothed_loss=0.0159 lr=2.23e-05 grad_norm=0.5257 step_time=0.4750s data_time=0.0581s it/s=1.876 eta_to_10000=3923.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0196 grad_action_out_proj=0.1794 grad_shared_expert=0.5032 (10775:train_pytorch.py:850) + Training: 26%|██▋ | 2640/10000 [30:37<1:06:18, 1.85it/s, loss=0.0110, lr=2.23e-05, step=2639] Training: 26%|██▋ | 2640/10000 [30:37<1:06:18, 1.85it/s, loss=0.0192, lr=2.23e-05, step=2640] Training: 26%|██▋ | 2641/10000 [30:38<1:06:03, 1.86it/s, loss=0.0192, lr=2.23e-05, step=2640] Training: 26%|██▋ | 2641/10000 [30:38<1:06:03, 1.86it/s, loss=0.0180, lr=2.23e-05, step=2641] Training: 26%|██▋ | 2642/10000 [30:39<1:10:01, 1.75it/s, loss=0.0180, lr=2.23e-05, step=2641] Training: 26%|██▋ | 2642/10000 [30:39<1:10:01, 1.75it/s, loss=0.0125, lr=2.23e-05, step=2642] Training: 26%|██▋ | 2643/10000 [30:39<1:07:13, 1.82it/s, loss=0.0125, lr=2.23e-05, step=2642] Training: 26%|██▋ | 2643/10000 [30:39<1:07:13, 1.82it/s, loss=0.0231, lr=2.23e-05, step=2643] Training: 26%|██▋ | 2644/10000 [30:40<1:22:15, 1.49it/s, loss=0.0231, lr=2.23e-05, step=2643] Training: 26%|██▋ | 2644/10000 [30:40<1:22:15, 1.49it/s, loss=0.0042, lr=2.23e-05, step=2644] Training: 26%|██▋ | 2645/10000 [30:40<1:16:04, 1.61it/s, loss=0.0042, lr=2.23e-05, step=2644] Training: 26%|██▋ | 2645/10000 [30:40<1:16:04, 1.61it/s, loss=0.0295, lr=2.23e-05, step=2645] Training: 26%|██▋ | 2646/10000 [30:41<1:19:47, 1.54it/s, loss=0.0295, lr=2.23e-05, step=2645] Training: 26%|██▋ | 2646/10000 [30:41<1:19:47, 1.54it/s, loss=0.0078, lr=2.23e-05, step=2646] Training: 26%|██▋ | 2647/10000 [30:42<1:14:42, 1.64it/s, loss=0.0078, lr=2.23e-05, step=2646] Training: 26%|██▋ | 2647/10000 [30:42<1:14:42, 1.64it/s, loss=0.0314, lr=2.23e-05, step=2647] Training: 26%|██▋ | 2648/10000 [30:42<1:11:32, 1.71it/s, loss=0.0314, lr=2.23e-05, step=2647] Training: 26%|██▋ | 2648/10000 [30:42<1:11:32, 1.71it/s, loss=0.0149, lr=2.23e-05, step=2648] Training: 26%|██▋ | 2649/10000 [30:43<1:08:01, 1.80it/s, loss=0.0149, lr=2.23e-05, step=2648] Training: 26%|██▋ | 2649/10000 [30:43<1:08:01, 1.80it/s, loss=0.0157, lr=2.23e-05, step=2649]16:36:50.373 [I] step=2650 loss=0.0341 smoothed_loss=0.0187 lr=2.23e-05 grad_norm=0.4867 step_time=0.5310s data_time=0.0776s it/s=1.643 eta_to_10000=4472.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0163 grad_action_out_proj=0.1209 grad_shared_expert=0.4166 (10775:train_pytorch.py:850) + Training: 26%|██▋ | 2650/10000 [30:43<1:13:25, 1.67it/s, loss=0.0157, lr=2.23e-05, step=2649] Training: 26%|██▋ | 2650/10000 [30:43<1:13:25, 1.67it/s, loss=0.0341, lr=2.23e-05, step=2650] Training: 27%|██▋ | 2651/10000 [30:44<1:16:53, 1.59it/s, loss=0.0341, lr=2.23e-05, step=2650] Training: 27%|██▋ | 2651/10000 [30:44<1:16:53, 1.59it/s, loss=0.0088, lr=2.23e-05, step=2651] Training: 27%|██▋ | 2652/10000 [30:45<1:23:06, 1.47it/s, loss=0.0088, lr=2.23e-05, step=2651] Training: 27%|██▋ | 2652/10000 [30:45<1:23:06, 1.47it/s, loss=0.0275, lr=2.23e-05, step=2652] Training: 27%|██▋ | 2653/10000 [30:45<1:17:24, 1.58it/s, loss=0.0275, lr=2.23e-05, step=2652] Training: 27%|██▋ | 2653/10000 [30:45<1:17:24, 1.58it/s, loss=0.0147, lr=2.23e-05, step=2653] Training: 27%|██▋ | 2654/10000 [30:46<1:11:57, 1.70it/s, loss=0.0147, lr=2.23e-05, step=2653] Training: 27%|██▋ | 2654/10000 [30:46<1:11:57, 1.70it/s, loss=0.0429, lr=2.23e-05, step=2654] Training: 27%|██▋ | 2655/10000 [30:46<1:10:20, 1.74it/s, loss=0.0429, lr=2.23e-05, step=2654] Training: 27%|██▋ | 2655/10000 [30:46<1:10:20, 1.74it/s, loss=0.0198, lr=2.23e-05, step=2655] Training: 27%|██▋ | 2656/10000 [30:47<1:06:54, 1.83it/s, loss=0.0198, lr=2.23e-05, step=2655] Training: 27%|██▋ | 2656/10000 [30:47<1:06:54, 1.83it/s, loss=0.0134, lr=2.23e-05, step=2656] Training: 27%|██▋ | 2657/10000 [30:48<1:12:34, 1.69it/s, loss=0.0134, lr=2.23e-05, step=2656] Training: 27%|██▋ | 2657/10000 [30:48<1:12:34, 1.69it/s, loss=0.0053, lr=2.23e-05, step=2657] Training: 27%|██▋ | 2658/10000 [30:48<1:11:07, 1.72it/s, loss=0.0053, lr=2.23e-05, step=2657] Training: 27%|██▋ | 2658/10000 [30:48<1:11:07, 1.72it/s, loss=0.0138, lr=2.23e-05, step=2658] Training: 27%|██▋ | 2659/10000 [30:49<1:18:56, 1.55it/s, loss=0.0138, lr=2.23e-05, step=2658] Training: 27%|██▋ | 2659/10000 [30:49<1:18:56, 1.55it/s, loss=0.0236, lr=2.23e-05, step=2659]16:36:56.498 [I] step=2660 loss=0.0266 smoothed_loss=0.0194 lr=2.23e-05 grad_norm=0.5262 step_time=0.5410s data_time=0.0716s it/s=1.633 eta_to_10000=4495.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0088 grad_action_out_proj=0.1431 grad_shared_expert=0.3939 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2660/10000 [30:50<1:15:29, 1.62it/s, loss=0.0236, lr=2.23e-05, step=2659] Training: 27%|██▋ | 2660/10000 [30:50<1:15:29, 1.62it/s, loss=0.0266, lr=2.23e-05, step=2660] Training: 27%|██▋ | 2661/10000 [30:50<1:15:04, 1.63it/s, loss=0.0266, lr=2.23e-05, step=2660] Training: 27%|██▋ | 2661/10000 [30:50<1:15:04, 1.63it/s, loss=0.0325, lr=2.22e-05, step=2661] Training: 27%|██▋ | 2662/10000 [30:51<1:11:06, 1.72it/s, loss=0.0325, lr=2.22e-05, step=2661] Training: 27%|██▋ | 2662/10000 [30:51<1:11:06, 1.72it/s, loss=0.0059, lr=2.22e-05, step=2662] Training: 27%|██▋ | 2663/10000 [30:51<1:09:57, 1.75it/s, loss=0.0059, lr=2.22e-05, step=2662] Training: 27%|██▋ | 2663/10000 [30:51<1:09:57, 1.75it/s, loss=0.0123, lr=2.22e-05, step=2663] Training: 27%|██▋ | 2664/10000 [30:52<1:07:07, 1.82it/s, loss=0.0123, lr=2.22e-05, step=2663] Training: 27%|██▋ | 2664/10000 [30:52<1:07:07, 1.82it/s, loss=0.0094, lr=2.22e-05, step=2664] Training: 27%|██▋ | 2665/10000 [30:52<1:12:31, 1.69it/s, loss=0.0094, lr=2.22e-05, step=2664] Training: 27%|██▋ | 2665/10000 [30:52<1:12:31, 1.69it/s, loss=0.0045, lr=2.22e-05, step=2665] Training: 27%|██▋ | 2666/10000 [30:53<1:23:42, 1.46it/s, loss=0.0045, lr=2.22e-05, step=2665] Training: 27%|██▋ | 2666/10000 [30:53<1:23:42, 1.46it/s, loss=0.0202, lr=2.22e-05, step=2666] Training: 27%|██▋ | 2667/10000 [30:54<1:17:04, 1.59it/s, loss=0.0202, lr=2.22e-05, step=2666] Training: 27%|██▋ | 2667/10000 [30:54<1:17:04, 1.59it/s, loss=0.0123, lr=2.22e-05, step=2667] Training: 27%|██▋ | 2668/10000 [30:54<1:12:54, 1.68it/s, loss=0.0123, lr=2.22e-05, step=2667] Training: 27%|██▋ | 2668/10000 [30:54<1:12:54, 1.68it/s, loss=0.0354, lr=2.22e-05, step=2668] Training: 27%|██▋ | 2669/10000 [30:55<1:09:37, 1.75it/s, loss=0.0354, lr=2.22e-05, step=2668] Training: 27%|██▋ | 2669/10000 [30:55<1:09:37, 1.75it/s, loss=0.0174, lr=2.22e-05, step=2669]16:37:02.327 [I] step=2670 loss=0.0107 smoothed_loss=0.0173 lr=2.22e-05 grad_norm=0.4868 step_time=0.5185s data_time=0.0644s it/s=1.716 eta_to_10000=4271.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0099 grad_action_out_proj=0.1181 grad_shared_expert=0.3696 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2670/10000 [30:55<1:08:46, 1.78it/s, loss=0.0174, lr=2.22e-05, step=2669] Training: 27%|██▋ | 2670/10000 [30:55<1:08:46, 1.78it/s, loss=0.0107, lr=2.22e-05, step=2670] Training: 27%|██▋ | 2671/10000 [30:56<1:06:20, 1.84it/s, loss=0.0107, lr=2.22e-05, step=2670] Training: 27%|██▋ | 2671/10000 [30:56<1:06:20, 1.84it/s, loss=0.0090, lr=2.22e-05, step=2671] Training: 27%|██▋ | 2672/10000 [30:57<1:13:14, 1.67it/s, loss=0.0090, lr=2.22e-05, step=2671] Training: 27%|██▋ | 2672/10000 [30:57<1:13:14, 1.67it/s, loss=0.0168, lr=2.22e-05, step=2672] Training: 27%|██▋ | 2673/10000 [30:57<1:09:46, 1.75it/s, loss=0.0168, lr=2.22e-05, step=2672] Training: 27%|██▋ | 2673/10000 [30:57<1:09:46, 1.75it/s, loss=0.0105, lr=2.22e-05, step=2673] Training: 27%|██▋ | 2674/10000 [30:58<1:21:14, 1.50it/s, loss=0.0105, lr=2.22e-05, step=2673] Training: 27%|██▋ | 2674/10000 [30:58<1:21:14, 1.50it/s, loss=0.0296, lr=2.22e-05, step=2674] Training: 27%|██▋ | 2675/10000 [30:59<1:18:00, 1.57it/s, loss=0.0296, lr=2.22e-05, step=2674] Training: 27%|██▋ | 2675/10000 [30:59<1:18:00, 1.57it/s, loss=0.0309, lr=2.22e-05, step=2675] Training: 27%|██▋ | 2676/10000 [30:59<1:17:02, 1.58it/s, loss=0.0309, lr=2.22e-05, step=2675] Training: 27%|██▋ | 2676/10000 [30:59<1:17:02, 1.58it/s, loss=0.0081, lr=2.22e-05, step=2676] Training: 27%|██▋ | 2677/10000 [31:00<1:12:38, 1.68it/s, loss=0.0081, lr=2.22e-05, step=2676] Training: 27%|██▋ | 2677/10000 [31:00<1:12:38, 1.68it/s, loss=0.0289, lr=2.22e-05, step=2677] Training: 27%|██▋ | 2678/10000 [31:00<1:08:48, 1.77it/s, loss=0.0289, lr=2.22e-05, step=2677] Training: 27%|██▋ | 2678/10000 [31:00<1:08:48, 1.77it/s, loss=0.0162, lr=2.22e-05, step=2678] Training: 27%|██▋ | 2679/10000 [31:01<1:14:43, 1.63it/s, loss=0.0162, lr=2.22e-05, step=2678] Training: 27%|██▋ | 2679/10000 [31:01<1:14:43, 1.63it/s, loss=0.0174, lr=2.22e-05, step=2679]16:37:08.402 [I] step=2680 loss=0.0094 smoothed_loss=0.0175 lr=2.22e-05 grad_norm=0.5388 step_time=0.5486s data_time=0.0591s it/s=1.646 eta_to_10000=4446.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0064 grad_action_out_proj=0.1100 grad_shared_expert=0.3651 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2680/10000 [31:01<1:12:00, 1.69it/s, loss=0.0174, lr=2.22e-05, step=2679] Training: 27%|██▋ | 2680/10000 [31:01<1:12:00, 1.69it/s, loss=0.0094, lr=2.22e-05, step=2680] Training: 27%|██▋ | 2681/10000 [31:02<1:20:33, 1.51it/s, loss=0.0094, lr=2.22e-05, step=2680] Training: 27%|██▋ | 2681/10000 [31:02<1:20:33, 1.51it/s, loss=0.0103, lr=2.22e-05, step=2681] Training: 27%|██▋ | 2682/10000 [31:03<1:14:59, 1.63it/s, loss=0.0103, lr=2.22e-05, step=2681] Training: 27%|██▋ | 2682/10000 [31:03<1:14:59, 1.63it/s, loss=0.0209, lr=2.22e-05, step=2682] Training: 27%|██▋ | 2683/10000 [31:03<1:11:26, 1.71it/s, loss=0.0209, lr=2.22e-05, step=2682] Training: 27%|██▋ | 2683/10000 [31:03<1:11:26, 1.71it/s, loss=0.0275, lr=2.22e-05, step=2683] Training: 27%|██▋ | 2684/10000 [31:04<1:08:26, 1.78it/s, loss=0.0275, lr=2.22e-05, step=2683] Training: 27%|██▋ | 2684/10000 [31:04<1:08:26, 1.78it/s, loss=0.0388, lr=2.22e-05, step=2684] Training: 27%|██▋ | 2685/10000 [31:04<1:07:54, 1.80it/s, loss=0.0388, lr=2.22e-05, step=2684] Training: 27%|██▋ | 2685/10000 [31:04<1:07:54, 1.80it/s, loss=0.0181, lr=2.22e-05, step=2685] Training: 27%|██▋ | 2686/10000 [31:05<1:14:33, 1.63it/s, loss=0.0181, lr=2.22e-05, step=2685] Training: 27%|██▋ | 2686/10000 [31:05<1:14:33, 1.63it/s, loss=0.0097, lr=2.22e-05, step=2686] Training: 27%|██▋ | 2687/10000 [31:06<1:23:07, 1.47it/s, loss=0.0097, lr=2.22e-05, step=2686] Training: 27%|██▋ | 2687/10000 [31:06<1:23:07, 1.47it/s, loss=0.0296, lr=2.22e-05, step=2687] Training: 27%|██▋ | 2688/10000 [31:06<1:16:49, 1.59it/s, loss=0.0296, lr=2.22e-05, step=2687] Training: 27%|██▋ | 2688/10000 [31:06<1:16:49, 1.59it/s, loss=0.0176, lr=2.22e-05, step=2688] Training: 27%|██▋ | 2689/10000 [31:07<1:11:48, 1.70it/s, loss=0.0176, lr=2.22e-05, step=2688] Training: 27%|██▋ | 2689/10000 [31:07<1:11:48, 1.70it/s, loss=0.0294, lr=2.22e-05, step=2689]16:37:14.448 [I] step=2690 loss=0.0193 smoothed_loss=0.0206 lr=2.22e-05 grad_norm=0.5150 step_time=0.5419s data_time=0.0626s it/s=1.654 eta_to_10000=4419.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0192 grad_action_out_proj=0.2071 grad_shared_expert=0.4762 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2690/10000 [31:08<1:10:34, 1.73it/s, loss=0.0294, lr=2.22e-05, step=2689] Training: 27%|██▋ | 2690/10000 [31:08<1:10:34, 1.73it/s, loss=0.0193, lr=2.22e-05, step=2690] Training: 27%|██▋ | 2691/10000 [31:08<1:14:59, 1.62it/s, loss=0.0193, lr=2.22e-05, step=2690] Training: 27%|██▋ | 2691/10000 [31:08<1:14:59, 1.62it/s, loss=0.0122, lr=2.22e-05, step=2691] Training: 27%|██▋ | 2692/10000 [31:09<1:17:23, 1.57it/s, loss=0.0122, lr=2.22e-05, step=2691] Training: 27%|██▋ | 2692/10000 [31:09<1:17:23, 1.57it/s, loss=0.0223, lr=2.22e-05, step=2692] Training: 27%|██▋ | 2693/10000 [31:10<1:18:10, 1.56it/s, loss=0.0223, lr=2.22e-05, step=2692] Training: 27%|██▋ | 2693/10000 [31:10<1:18:10, 1.56it/s, loss=0.0128, lr=2.22e-05, step=2693] Training: 27%|██▋ | 2694/10000 [31:10<1:12:52, 1.67it/s, loss=0.0128, lr=2.22e-05, step=2693] Training: 27%|██▋ | 2694/10000 [31:10<1:12:52, 1.67it/s, loss=0.0264, lr=2.22e-05, step=2694] Training: 27%|██▋ | 2695/10000 [31:11<1:19:25, 1.53it/s, loss=0.0264, lr=2.22e-05, step=2694] Training: 27%|██▋ | 2695/10000 [31:11<1:19:25, 1.53it/s, loss=0.0080, lr=2.22e-05, step=2695] Training: 27%|██▋ | 2696/10000 [31:11<1:13:51, 1.65it/s, loss=0.0080, lr=2.22e-05, step=2695] Training: 27%|██▋ | 2696/10000 [31:11<1:13:51, 1.65it/s, loss=0.0185, lr=2.22e-05, step=2696] Training: 27%|██▋ | 2697/10000 [31:12<1:11:14, 1.71it/s, loss=0.0185, lr=2.22e-05, step=2696] Training: 27%|██▋ | 2697/10000 [31:12<1:11:14, 1.71it/s, loss=0.0198, lr=2.22e-05, step=2697] Training: 27%|██▋ | 2698/10000 [31:12<1:09:44, 1.74it/s, loss=0.0198, lr=2.22e-05, step=2697] Training: 27%|██▋ | 2698/10000 [31:12<1:09:44, 1.74it/s, loss=0.0052, lr=2.22e-05, step=2698] Training: 27%|██▋ | 2699/10000 [31:13<1:07:32, 1.80it/s, loss=0.0052, lr=2.22e-05, step=2698] Training: 27%|██▋ | 2699/10000 [31:13<1:07:32, 1.80it/s, loss=0.0104, lr=2.22e-05, step=2699]16:37:20.546 [I] step=2700 loss=0.0053 smoothed_loss=0.0157 lr=2.22e-05 grad_norm=0.5554 step_time=0.5334s data_time=0.0764s it/s=1.640 eta_to_10000=4450.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0105 grad_action_out_proj=0.1161 grad_shared_expert=0.3045 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2700/10000 [31:14<1:12:28, 1.68it/s, loss=0.0104, lr=2.22e-05, step=2699] Training: 27%|██▋ | 2700/10000 [31:14<1:12:28, 1.68it/s, loss=0.0053, lr=2.22e-05, step=2700] Training: 27%|██▋ | 2701/10000 [31:14<1:08:45, 1.77it/s, loss=0.0053, lr=2.22e-05, step=2700] Training: 27%|██▋ | 2701/10000 [31:14<1:08:45, 1.77it/s, loss=0.0370, lr=2.22e-05, step=2701] Training: 27%|██▋ | 2702/10000 [31:15<1:17:32, 1.57it/s, loss=0.0370, lr=2.22e-05, step=2701] Training: 27%|██▋ | 2702/10000 [31:15<1:17:32, 1.57it/s, loss=0.0045, lr=2.21e-05, step=2702] Training: 27%|██▋ | 2703/10000 [31:15<1:14:13, 1.64it/s, loss=0.0045, lr=2.21e-05, step=2702] Training: 27%|██▋ | 2703/10000 [31:15<1:14:13, 1.64it/s, loss=0.1030, lr=2.21e-05, step=2703] Training: 27%|██▋ | 2704/10000 [31:16<1:09:42, 1.74it/s, loss=0.1030, lr=2.21e-05, step=2703] Training: 27%|██▋ | 2704/10000 [31:16<1:09:42, 1.74it/s, loss=0.0282, lr=2.21e-05, step=2704] Training: 27%|██▋ | 2705/10000 [31:16<1:06:37, 1.83it/s, loss=0.0282, lr=2.21e-05, step=2704] Training: 27%|██▋ | 2705/10000 [31:16<1:06:37, 1.83it/s, loss=0.0162, lr=2.21e-05, step=2705] Training: 27%|██▋ | 2706/10000 [31:17<1:04:57, 1.87it/s, loss=0.0162, lr=2.21e-05, step=2705] Training: 27%|██▋ | 2706/10000 [31:17<1:04:57, 1.87it/s, loss=0.0177, lr=2.21e-05, step=2706] Training: 27%|██▋ | 2707/10000 [31:18<1:14:22, 1.63it/s, loss=0.0177, lr=2.21e-05, step=2706] Training: 27%|██▋ | 2707/10000 [31:18<1:14:22, 1.63it/s, loss=0.0205, lr=2.21e-05, step=2707] Training: 27%|██▋ | 2708/10000 [31:18<1:11:23, 1.70it/s, loss=0.0205, lr=2.21e-05, step=2707] Training: 27%|██▋ | 2708/10000 [31:18<1:11:23, 1.70it/s, loss=0.0100, lr=2.21e-05, step=2708] Training: 27%|██▋ | 2709/10000 [31:19<1:17:49, 1.56it/s, loss=0.0100, lr=2.21e-05, step=2708] Training: 27%|██▋ | 2709/10000 [31:19<1:17:49, 1.56it/s, loss=0.0110, lr=2.21e-05, step=2709]16:37:26.679 [I] step=2710 loss=0.0303 smoothed_loss=0.0220 lr=2.21e-05 grad_norm=0.5683 step_time=0.5352s data_time=0.0781s it/s=1.631 eta_to_10000=4470.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0179 grad_action_out_proj=0.2134 grad_shared_expert=0.5443 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2710/10000 [31:20<1:20:45, 1.50it/s, loss=0.0110, lr=2.21e-05, step=2709] Training: 27%|██▋ | 2710/10000 [31:20<1:20:45, 1.50it/s, loss=0.0303, lr=2.21e-05, step=2710] Training: 27%|██▋ | 2711/10000 [31:20<1:17:25, 1.57it/s, loss=0.0303, lr=2.21e-05, step=2710] Training: 27%|██▋ | 2711/10000 [31:20<1:17:25, 1.57it/s, loss=0.0350, lr=2.21e-05, step=2711] Training: 27%|██▋ | 2712/10000 [31:21<1:16:53, 1.58it/s, loss=0.0350, lr=2.21e-05, step=2711] Training: 27%|██▋ | 2712/10000 [31:21<1:16:53, 1.58it/s, loss=0.0150, lr=2.21e-05, step=2712] Training: 27%|██▋ | 2713/10000 [31:21<1:11:21, 1.70it/s, loss=0.0150, lr=2.21e-05, step=2712] Training: 27%|██▋ | 2713/10000 [31:21<1:11:21, 1.70it/s, loss=0.0165, lr=2.21e-05, step=2713] Training: 27%|██▋ | 2714/10000 [31:22<1:07:28, 1.80it/s, loss=0.0165, lr=2.21e-05, step=2713] Training: 27%|██▋ | 2714/10000 [31:22<1:07:28, 1.80it/s, loss=0.0088, lr=2.21e-05, step=2714] Training: 27%|██▋ | 2715/10000 [31:23<1:11:22, 1.70it/s, loss=0.0088, lr=2.21e-05, step=2714] Training: 27%|██▋ | 2715/10000 [31:23<1:11:22, 1.70it/s, loss=0.0150, lr=2.21e-05, step=2715] Training: 27%|██▋ | 2716/10000 [31:23<1:18:07, 1.55it/s, loss=0.0150, lr=2.21e-05, step=2715] Training: 27%|██▋ | 2716/10000 [31:23<1:18:07, 1.55it/s, loss=0.0148, lr=2.21e-05, step=2716] Training: 27%|██▋ | 2717/10000 [31:24<1:12:08, 1.68it/s, loss=0.0148, lr=2.21e-05, step=2716] Training: 27%|██▋ | 2717/10000 [31:24<1:12:08, 1.68it/s, loss=0.0369, lr=2.21e-05, step=2717] Training: 27%|██▋ | 2718/10000 [31:24<1:08:09, 1.78it/s, loss=0.0369, lr=2.21e-05, step=2717] Training: 27%|██▋ | 2718/10000 [31:24<1:08:09, 1.78it/s, loss=0.0188, lr=2.21e-05, step=2718] Training: 27%|██▋ | 2719/10000 [31:25<1:05:14, 1.86it/s, loss=0.0188, lr=2.21e-05, step=2718] Training: 27%|██▋ | 2719/10000 [31:25<1:05:14, 1.86it/s, loss=0.0179, lr=2.21e-05, step=2719]16:37:32.235 [I] step=2720 loss=0.0294 smoothed_loss=0.0215 lr=2.21e-05 grad_norm=0.4891 step_time=0.4907s data_time=0.0649s it/s=1.800 eta_to_10000=4043.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0169 grad_action_out_proj=0.1773 grad_shared_expert=0.4425 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2720/10000 [31:25<1:04:21, 1.89it/s, loss=0.0179, lr=2.21e-05, step=2719] Training: 27%|██▋ | 2720/10000 [31:25<1:04:21, 1.89it/s, loss=0.0294, lr=2.21e-05, step=2720] Training: 27%|██▋ | 2721/10000 [31:26<1:03:21, 1.91it/s, loss=0.0294, lr=2.21e-05, step=2720] Training: 27%|██▋ | 2721/10000 [31:26<1:03:21, 1.91it/s, loss=0.0056, lr=2.21e-05, step=2721] Training: 27%|██▋ | 2722/10000 [31:27<1:13:02, 1.66it/s, loss=0.0056, lr=2.21e-05, step=2721] Training: 27%|██▋ | 2722/10000 [31:27<1:13:02, 1.66it/s, loss=0.0718, lr=2.21e-05, step=2722] Training: 27%|██▋ | 2723/10000 [31:27<1:21:02, 1.50it/s, loss=0.0718, lr=2.21e-05, step=2722] Training: 27%|██▋ | 2723/10000 [31:27<1:21:02, 1.50it/s, loss=0.0344, lr=2.21e-05, step=2723] Training: 27%|██▋ | 2724/10000 [31:28<1:25:16, 1.42it/s, loss=0.0344, lr=2.21e-05, step=2723] Training: 27%|██▋ | 2724/10000 [31:28<1:25:16, 1.42it/s, loss=0.0294, lr=2.21e-05, step=2724] Training: 27%|██▋ | 2725/10000 [31:29<1:19:42, 1.52it/s, loss=0.0294, lr=2.21e-05, step=2724] Training: 27%|██▋ | 2725/10000 [31:29<1:19:42, 1.52it/s, loss=0.0183, lr=2.21e-05, step=2725] Training: 27%|██▋ | 2726/10000 [31:29<1:17:04, 1.57it/s, loss=0.0183, lr=2.21e-05, step=2725] Training: 27%|██▋ | 2726/10000 [31:29<1:17:04, 1.57it/s, loss=0.0148, lr=2.21e-05, step=2726] Training: 27%|██▋ | 2727/10000 [31:30<1:23:05, 1.46it/s, loss=0.0148, lr=2.21e-05, step=2726] Training: 27%|██▋ | 2727/10000 [31:30<1:23:05, 1.46it/s, loss=0.0052, lr=2.21e-05, step=2727] Training: 27%|██▋ | 2728/10000 [31:31<1:22:40, 1.47it/s, loss=0.0052, lr=2.21e-05, step=2727] Training: 27%|██▋ | 2728/10000 [31:31<1:22:40, 1.47it/s, loss=0.0050, lr=2.21e-05, step=2728] Training: 27%|██▋ | 2729/10000 [31:32<1:48:14, 1.12it/s, loss=0.0050, lr=2.21e-05, step=2728] Training: 27%|██▋ | 2729/10000 [31:32<1:48:14, 1.12it/s, loss=0.0145, lr=2.21e-05, step=2729]16:37:40.011 [I] step=2730 loss=0.0115 smoothed_loss=0.0193 lr=2.21e-05 grad_norm=0.6100 step_time=0.6002s data_time=0.1775s it/s=1.286 eta_to_10000=5652.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0109 grad_action_out_proj=0.1345 grad_shared_expert=0.4631 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2730/10000 [31:33<1:48:18, 1.12it/s, loss=0.0145, lr=2.21e-05, step=2729] Training: 27%|██▋ | 2730/10000 [31:33<1:48:18, 1.12it/s, loss=0.0115, lr=2.21e-05, step=2730] Training: 27%|██▋ | 2731/10000 [31:34<1:52:37, 1.08it/s, loss=0.0115, lr=2.21e-05, step=2730] Training: 27%|██▋ | 2731/10000 [31:34<1:52:37, 1.08it/s, loss=0.0134, lr=2.21e-05, step=2731] Training: 27%|██▋ | 2732/10000 [31:35<1:42:09, 1.19it/s, loss=0.0134, lr=2.21e-05, step=2731] Training: 27%|██▋ | 2732/10000 [31:35<1:42:09, 1.19it/s, loss=0.0186, lr=2.21e-05, step=2732] Training: 27%|██▋ | 2733/10000 [31:35<1:36:28, 1.26it/s, loss=0.0186, lr=2.21e-05, step=2732] Training: 27%|██▋ | 2733/10000 [31:35<1:36:28, 1.26it/s, loss=0.0076, lr=2.21e-05, step=2733] Training: 27%|██▋ | 2734/10000 [31:36<1:26:21, 1.40it/s, loss=0.0076, lr=2.21e-05, step=2733] Training: 27%|██▋ | 2734/10000 [31:36<1:26:21, 1.40it/s, loss=0.0070, lr=2.21e-05, step=2734] Training: 27%|██▋ | 2735/10000 [31:37<1:23:45, 1.45it/s, loss=0.0070, lr=2.21e-05, step=2734] Training: 27%|██▋ | 2735/10000 [31:37<1:23:45, 1.45it/s, loss=0.0394, lr=2.21e-05, step=2735] Training: 27%|██▋ | 2736/10000 [31:37<1:21:08, 1.49it/s, loss=0.0394, lr=2.21e-05, step=2735] Training: 27%|██▋ | 2736/10000 [31:37<1:21:08, 1.49it/s, loss=0.0315, lr=2.21e-05, step=2736] Training: 27%|██▋ | 2737/10000 [31:38<1:23:00, 1.46it/s, loss=0.0315, lr=2.21e-05, step=2736] Training: 27%|██▋ | 2737/10000 [31:38<1:23:00, 1.46it/s, loss=0.0147, lr=2.21e-05, step=2737] Training: 27%|██▋ | 2738/10000 [31:39<1:24:06, 1.44it/s, loss=0.0147, lr=2.21e-05, step=2737] Training: 27%|██▋ | 2738/10000 [31:39<1:24:06, 1.44it/s, loss=0.0174, lr=2.21e-05, step=2738] Training: 27%|██▋ | 2739/10000 [31:39<1:16:51, 1.57it/s, loss=0.0174, lr=2.21e-05, step=2738] Training: 27%|██▋ | 2739/10000 [31:39<1:16:51, 1.57it/s, loss=0.0131, lr=2.21e-05, step=2739]16:37:46.641 [I] step=2740 loss=0.0553 smoothed_loss=0.0224 lr=2.21e-05 grad_norm=0.5274 step_time=0.5359s data_time=0.1272s it/s=1.512 eta_to_10000=4801.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0197 grad_action_out_proj=0.2128 grad_shared_expert=0.5877 (10775:train_pytorch.py:850) + Training: 27%|██▋ | 2740/10000 [31:40<1:14:02, 1.63it/s, loss=0.0131, lr=2.21e-05, step=2739] Training: 27%|██▋ | 2740/10000 [31:40<1:14:02, 1.63it/s, loss=0.0553, lr=2.21e-05, step=2740] Training: 27%|██▋ | 2741/10000 [31:40<1:09:58, 1.73it/s, loss=0.0553, lr=2.21e-05, step=2740] Training: 27%|██▋ | 2741/10000 [31:40<1:09:58, 1.73it/s, loss=0.0139, lr=2.21e-05, step=2741] Training: 27%|██▋ | 2742/10000 [31:41<1:07:40, 1.79it/s, loss=0.0139, lr=2.21e-05, step=2741] Training: 27%|██▋ | 2742/10000 [31:41<1:07:40, 1.79it/s, loss=0.0164, lr=2.20e-05, step=2742] Training: 27%|██▋ | 2743/10000 [31:41<1:09:00, 1.75it/s, loss=0.0164, lr=2.20e-05, step=2742] Training: 27%|██▋ | 2743/10000 [31:41<1:09:00, 1.75it/s, loss=0.0312, lr=2.20e-05, step=2743] Training: 27%|██▋ | 2744/10000 [31:42<1:09:22, 1.74it/s, loss=0.0312, lr=2.20e-05, step=2743] Training: 27%|██▋ | 2744/10000 [31:42<1:09:22, 1.74it/s, loss=0.0079, lr=2.20e-05, step=2744] Training: 27%|██▋ | 2745/10000 [31:43<1:27:33, 1.38it/s, loss=0.0079, lr=2.20e-05, step=2744] Training: 27%|██▋ | 2745/10000 [31:43<1:27:33, 1.38it/s, loss=0.0106, lr=2.20e-05, step=2745] Training: 27%|██▋ | 2746/10000 [31:44<1:21:34, 1.48it/s, loss=0.0106, lr=2.20e-05, step=2745] Training: 27%|██▋ | 2746/10000 [31:44<1:21:34, 1.48it/s, loss=0.0239, lr=2.20e-05, step=2746] Training: 27%|██▋ | 2747/10000 [31:44<1:16:00, 1.59it/s, loss=0.0239, lr=2.20e-05, step=2746] Training: 27%|██▋ | 2747/10000 [31:44<1:16:00, 1.59it/s, loss=0.0191, lr=2.20e-05, step=2747] Training: 27%|██▋ | 2748/10000 [31:45<1:11:32, 1.69it/s, loss=0.0191, lr=2.20e-05, step=2747] Training: 27%|██▋ | 2748/10000 [31:45<1:11:32, 1.69it/s, loss=0.0037, lr=2.20e-05, step=2748] Training: 27%|██▋ | 2749/10000 [31:45<1:08:54, 1.75it/s, loss=0.0037, lr=2.20e-05, step=2748] Training: 27%|██▋ | 2749/10000 [31:45<1:08:54, 1.75it/s, loss=0.0349, lr=2.20e-05, step=2749]16:37:52.556 [I] step=2750 loss=0.0144 smoothed_loss=0.0194 lr=2.20e-05 grad_norm=0.4811 step_time=0.5033s data_time=0.0881s it/s=1.691 eta_to_10000=4287.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.1709 grad_shared_expert=0.4834 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2750/10000 [31:46<1:07:49, 1.78it/s, loss=0.0349, lr=2.20e-05, step=2749] Training: 28%|██▊ | 2750/10000 [31:46<1:07:49, 1.78it/s, loss=0.0144, lr=2.20e-05, step=2750] Training: 28%|██▊ | 2751/10000 [31:46<1:06:06, 1.83it/s, loss=0.0144, lr=2.20e-05, step=2750] Training: 28%|██▊ | 2751/10000 [31:46<1:06:06, 1.83it/s, loss=0.0122, lr=2.20e-05, step=2751] Training: 28%|██▊ | 2752/10000 [31:47<1:22:33, 1.46it/s, loss=0.0122, lr=2.20e-05, step=2751] Training: 28%|██▊ | 2752/10000 [31:47<1:22:33, 1.46it/s, loss=0.0158, lr=2.20e-05, step=2752] Training: 28%|██▊ | 2753/10000 [31:48<1:21:21, 1.48it/s, loss=0.0158, lr=2.20e-05, step=2752] Training: 28%|██▊ | 2753/10000 [31:48<1:21:21, 1.48it/s, loss=0.0114, lr=2.20e-05, step=2753] Training: 28%|██▊ | 2754/10000 [31:48<1:22:25, 1.47it/s, loss=0.0114, lr=2.20e-05, step=2753] Training: 28%|██▊ | 2754/10000 [31:48<1:22:25, 1.47it/s, loss=0.0161, lr=2.20e-05, step=2754] Training: 28%|██▊ | 2755/10000 [31:49<1:20:08, 1.51it/s, loss=0.0161, lr=2.20e-05, step=2754] Training: 28%|██▊ | 2755/10000 [31:49<1:20:08, 1.51it/s, loss=0.0167, lr=2.20e-05, step=2755] Training: 28%|██▊ | 2756/10000 [31:50<1:18:18, 1.54it/s, loss=0.0167, lr=2.20e-05, step=2755] Training: 28%|██▊ | 2756/10000 [31:50<1:18:18, 1.54it/s, loss=0.0209, lr=2.20e-05, step=2756] Training: 28%|██▊ | 2757/10000 [31:50<1:12:43, 1.66it/s, loss=0.0209, lr=2.20e-05, step=2756] Training: 28%|██▊ | 2757/10000 [31:50<1:12:43, 1.66it/s, loss=0.0067, lr=2.20e-05, step=2757] Training: 28%|██▊ | 2758/10000 [31:51<1:08:51, 1.75it/s, loss=0.0067, lr=2.20e-05, step=2757] Training: 28%|██▊ | 2758/10000 [31:51<1:08:51, 1.75it/s, loss=0.0061, lr=2.20e-05, step=2758] Training: 28%|██▊ | 2759/10000 [31:52<1:23:12, 1.45it/s, loss=0.0061, lr=2.20e-05, step=2758] Training: 28%|██▊ | 2759/10000 [31:52<1:23:12, 1.45it/s, loss=0.0187, lr=2.20e-05, step=2759]16:37:59.169 [I] step=2760 loss=0.0232 smoothed_loss=0.0167 lr=2.20e-05 grad_norm=0.4760 step_time=0.5568s data_time=0.1048s it/s=1.512 eta_to_10000=4786.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0081 grad_action_out_proj=0.1109 grad_shared_expert=0.3870 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2760/10000 [31:52<1:18:14, 1.54it/s, loss=0.0187, lr=2.20e-05, step=2759] Training: 28%|██▊ | 2760/10000 [31:52<1:18:14, 1.54it/s, loss=0.0232, lr=2.20e-05, step=2760] Training: 28%|██▊ | 2761/10000 [31:53<1:21:07, 1.49it/s, loss=0.0232, lr=2.20e-05, step=2760] Training: 28%|██▊ | 2761/10000 [31:53<1:21:07, 1.49it/s, loss=0.0041, lr=2.20e-05, step=2761] Training: 28%|██▊ | 2762/10000 [31:54<1:17:11, 1.56it/s, loss=0.0041, lr=2.20e-05, step=2761] Training: 28%|██▊ | 2762/10000 [31:54<1:17:11, 1.56it/s, loss=0.0281, lr=2.20e-05, step=2762] Training: 28%|██▊ | 2763/10000 [31:54<1:17:27, 1.56it/s, loss=0.0281, lr=2.20e-05, step=2762] Training: 28%|██▊ | 2763/10000 [31:54<1:17:27, 1.56it/s, loss=0.0684, lr=2.20e-05, step=2763] Training: 28%|██▊ | 2764/10000 [31:55<1:25:51, 1.40it/s, loss=0.0684, lr=2.20e-05, step=2763] Training: 28%|██▊ | 2764/10000 [31:55<1:25:51, 1.40it/s, loss=0.0259, lr=2.20e-05, step=2764] Training: 28%|██▊ | 2765/10000 [31:56<1:20:34, 1.50it/s, loss=0.0259, lr=2.20e-05, step=2764] Training: 28%|██▊ | 2765/10000 [31:56<1:20:34, 1.50it/s, loss=0.0318, lr=2.20e-05, step=2765] Training: 28%|██▊ | 2766/10000 [31:56<1:24:48, 1.42it/s, loss=0.0318, lr=2.20e-05, step=2765] Training: 28%|██▊ | 2766/10000 [31:56<1:24:48, 1.42it/s, loss=0.0441, lr=2.20e-05, step=2766] Training: 28%|██▊ | 2767/10000 [31:57<1:17:43, 1.55it/s, loss=0.0441, lr=2.20e-05, step=2766] Training: 28%|██▊ | 2767/10000 [31:57<1:17:43, 1.55it/s, loss=0.0141, lr=2.20e-05, step=2767] Training: 28%|██▊ | 2768/10000 [31:57<1:12:50, 1.65it/s, loss=0.0141, lr=2.20e-05, step=2767] Training: 28%|██▊ | 2768/10000 [31:57<1:12:50, 1.65it/s, loss=0.0169, lr=2.20e-05, step=2768] Training: 28%|██▊ | 2769/10000 [31:58<1:11:01, 1.70it/s, loss=0.0169, lr=2.20e-05, step=2768] Training: 28%|██▊ | 2769/10000 [31:58<1:11:01, 1.70it/s, loss=0.0153, lr=2.20e-05, step=2769]16:38:05.459 [I] step=2770 loss=0.0105 smoothed_loss=0.0214 lr=2.20e-05 grad_norm=0.5209 step_time=0.5419s data_time=0.0867s it/s=1.590 eta_to_10000=4546.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0057 grad_action_out_proj=0.0944 grad_shared_expert=0.2768 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2770/10000 [31:59<1:09:37, 1.73it/s, loss=0.0153, lr=2.20e-05, step=2769] Training: 28%|██▊ | 2770/10000 [31:59<1:09:37, 1.73it/s, loss=0.0105, lr=2.20e-05, step=2770] Training: 28%|██▊ | 2771/10000 [31:59<1:06:46, 1.80it/s, loss=0.0105, lr=2.20e-05, step=2770] Training: 28%|██▊ | 2771/10000 [31:59<1:06:46, 1.80it/s, loss=0.0070, lr=2.20e-05, step=2771] Training: 28%|██▊ | 2772/10000 [32:00<1:05:03, 1.85it/s, loss=0.0070, lr=2.20e-05, step=2771] Training: 28%|██▊ | 2772/10000 [32:00<1:05:03, 1.85it/s, loss=0.0496, lr=2.20e-05, step=2772] Training: 28%|██▊ | 2773/10000 [32:00<1:17:11, 1.56it/s, loss=0.0496, lr=2.20e-05, step=2772] Training: 28%|██▊ | 2773/10000 [32:00<1:17:11, 1.56it/s, loss=0.0439, lr=2.20e-05, step=2773] Training: 28%|██▊ | 2774/10000 [32:01<1:12:17, 1.67it/s, loss=0.0439, lr=2.20e-05, step=2773] Training: 28%|██▊ | 2774/10000 [32:01<1:12:17, 1.67it/s, loss=0.0226, lr=2.20e-05, step=2774] Training: 28%|██▊ | 2775/10000 [32:01<1:08:39, 1.75it/s, loss=0.0226, lr=2.20e-05, step=2774] Training: 28%|██▊ | 2775/10000 [32:01<1:08:39, 1.75it/s, loss=0.0130, lr=2.20e-05, step=2775] Training: 28%|██▊ | 2776/10000 [32:02<1:07:49, 1.78it/s, loss=0.0130, lr=2.20e-05, step=2775] Training: 28%|██▊ | 2776/10000 [32:02<1:07:49, 1.78it/s, loss=0.0089, lr=2.20e-05, step=2776] Training: 28%|██▊ | 2777/10000 [32:03<1:10:31, 1.71it/s, loss=0.0089, lr=2.20e-05, step=2776] Training: 28%|██▊ | 2777/10000 [32:03<1:10:31, 1.71it/s, loss=0.0429, lr=2.20e-05, step=2777] Training: 28%|██▊ | 2778/10000 [32:03<1:09:18, 1.74it/s, loss=0.0429, lr=2.20e-05, step=2777] Training: 28%|██▊ | 2778/10000 [32:03<1:09:18, 1.74it/s, loss=0.0320, lr=2.20e-05, step=2778] Training: 28%|██▊ | 2779/10000 [32:04<1:14:32, 1.61it/s, loss=0.0320, lr=2.20e-05, step=2778] Training: 28%|██▊ | 2779/10000 [32:04<1:14:32, 1.61it/s, loss=0.0079, lr=2.20e-05, step=2779]16:38:11.508 [I] step=2780 loss=0.0192 smoothed_loss=0.0229 lr=2.20e-05 grad_norm=0.5243 step_time=0.5217s data_time=0.0832s it/s=1.653 eta_to_10000=4366.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.1139 grad_shared_expert=0.3818 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2780/10000 [32:05<1:17:31, 1.55it/s, loss=0.0079, lr=2.20e-05, step=2779] Training: 28%|██▊ | 2780/10000 [32:05<1:17:31, 1.55it/s, loss=0.0192, lr=2.20e-05, step=2780] Training: 28%|██▊ | 2781/10000 [32:05<1:20:22, 1.50it/s, loss=0.0192, lr=2.20e-05, step=2780] Training: 28%|██▊ | 2781/10000 [32:05<1:20:22, 1.50it/s, loss=0.0261, lr=2.20e-05, step=2781] Training: 28%|██▊ | 2782/10000 [32:06<1:16:30, 1.57it/s, loss=0.0261, lr=2.20e-05, step=2781] Training: 28%|██▊ | 2782/10000 [32:06<1:16:30, 1.57it/s, loss=0.0292, lr=2.19e-05, step=2782] Training: 28%|██▊ | 2783/10000 [32:06<1:10:54, 1.70it/s, loss=0.0292, lr=2.19e-05, step=2782] Training: 28%|██▊ | 2783/10000 [32:06<1:10:54, 1.70it/s, loss=0.0617, lr=2.19e-05, step=2783] Training: 28%|██▊ | 2784/10000 [32:07<1:12:15, 1.66it/s, loss=0.0617, lr=2.19e-05, step=2783] Training: 28%|██▊ | 2784/10000 [32:07<1:12:15, 1.66it/s, loss=0.0586, lr=2.19e-05, step=2784] Training: 28%|██▊ | 2785/10000 [32:08<1:12:35, 1.66it/s, loss=0.0586, lr=2.19e-05, step=2784] Training: 28%|██▊ | 2785/10000 [32:08<1:12:35, 1.66it/s, loss=0.0340, lr=2.19e-05, step=2785] Training: 28%|██▊ | 2786/10000 [32:08<1:20:06, 1.50it/s, loss=0.0340, lr=2.19e-05, step=2785] Training: 28%|██▊ | 2786/10000 [32:08<1:20:06, 1.50it/s, loss=0.0126, lr=2.19e-05, step=2786] Training: 28%|██▊ | 2787/10000 [32:09<1:21:24, 1.48it/s, loss=0.0126, lr=2.19e-05, step=2786] Training: 28%|██▊ | 2787/10000 [32:09<1:21:24, 1.48it/s, loss=0.0151, lr=2.19e-05, step=2787] Training: 28%|██▊ | 2788/10000 [32:10<1:32:54, 1.29it/s, loss=0.0151, lr=2.19e-05, step=2787] Training: 28%|██▊ | 2788/10000 [32:10<1:32:54, 1.29it/s, loss=0.0252, lr=2.19e-05, step=2788] Training: 28%|██▊ | 2789/10000 [32:11<1:25:07, 1.41it/s, loss=0.0252, lr=2.19e-05, step=2788] Training: 28%|██▊ | 2789/10000 [32:11<1:25:07, 1.41it/s, loss=0.0148, lr=2.19e-05, step=2789]16:38:18.350 [I] step=2790 loss=0.0077 smoothed_loss=0.0244 lr=2.19e-05 grad_norm=0.5967 step_time=0.5812s data_time=0.1030s it/s=1.462 eta_to_10000=4932.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0118 grad_action_out_proj=0.1530 grad_shared_expert=0.4462 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2790/10000 [32:11<1:27:21, 1.38it/s, loss=0.0148, lr=2.19e-05, step=2789] Training: 28%|██▊ | 2790/10000 [32:11<1:27:21, 1.38it/s, loss=0.0077, lr=2.19e-05, step=2790] Training: 28%|██▊ | 2791/10000 [32:12<1:23:18, 1.44it/s, loss=0.0077, lr=2.19e-05, step=2790] Training: 28%|██▊ | 2791/10000 [32:12<1:23:18, 1.44it/s, loss=0.0145, lr=2.19e-05, step=2791] Training: 28%|██▊ | 2792/10000 [32:13<1:20:24, 1.49it/s, loss=0.0145, lr=2.19e-05, step=2791] Training: 28%|██▊ | 2792/10000 [32:13<1:20:24, 1.49it/s, loss=0.0064, lr=2.19e-05, step=2792] Training: 28%|██▊ | 2793/10000 [32:13<1:17:51, 1.54it/s, loss=0.0064, lr=2.19e-05, step=2792] Training: 28%|██▊ | 2793/10000 [32:13<1:17:51, 1.54it/s, loss=0.0097, lr=2.19e-05, step=2793] Training: 28%|██▊ | 2794/10000 [32:14<1:24:38, 1.42it/s, loss=0.0097, lr=2.19e-05, step=2793] Training: 28%|██▊ | 2794/10000 [32:14<1:24:38, 1.42it/s, loss=0.0240, lr=2.19e-05, step=2794] Training: 28%|██▊ | 2795/10000 [32:15<1:25:42, 1.40it/s, loss=0.0240, lr=2.19e-05, step=2794] Training: 28%|██▊ | 2795/10000 [32:15<1:25:42, 1.40it/s, loss=0.0564, lr=2.19e-05, step=2795] Training: 28%|██▊ | 2796/10000 [32:15<1:20:20, 1.49it/s, loss=0.0564, lr=2.19e-05, step=2795] Training: 28%|██▊ | 2796/10000 [32:15<1:20:20, 1.49it/s, loss=0.0389, lr=2.19e-05, step=2796] Training: 28%|██▊ | 2797/10000 [32:16<1:15:07, 1.60it/s, loss=0.0389, lr=2.19e-05, step=2796] Training: 28%|██▊ | 2797/10000 [32:16<1:15:07, 1.60it/s, loss=0.0508, lr=2.19e-05, step=2797] Training: 28%|██▊ | 2798/10000 [32:17<1:14:38, 1.61it/s, loss=0.0508, lr=2.19e-05, step=2797] Training: 28%|██▊ | 2798/10000 [32:17<1:14:38, 1.61it/s, loss=0.0133, lr=2.19e-05, step=2798] Training: 28%|██▊ | 2799/10000 [32:17<1:13:38, 1.63it/s, loss=0.0133, lr=2.19e-05, step=2798] Training: 28%|██▊ | 2799/10000 [32:17<1:13:38, 1.63it/s, loss=0.0135, lr=2.19e-05, step=2799]16:38:24.749 [I] step=2800 loss=0.0525 smoothed_loss=0.0282 lr=2.19e-05 grad_norm=0.4782 step_time=0.5490s data_time=0.0909s it/s=1.563 eta_to_10000=4605.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0233 grad_action_out_proj=0.1703 grad_shared_expert=0.5472 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2800/10000 [32:18<1:16:50, 1.56it/s, loss=0.0135, lr=2.19e-05, step=2799] Training: 28%|██▊ | 2800/10000 [32:18<1:16:50, 1.56it/s, loss=0.0525, lr=2.19e-05, step=2800] Training: 28%|██▊ | 2801/10000 [32:19<1:19:46, 1.50it/s, loss=0.0525, lr=2.19e-05, step=2800] Training: 28%|██▊ | 2801/10000 [32:19<1:19:46, 1.50it/s, loss=0.0441, lr=2.19e-05, step=2801] Training: 28%|██▊ | 2802/10000 [32:19<1:27:14, 1.38it/s, loss=0.0441, lr=2.19e-05, step=2801] Training: 28%|██▊ | 2802/10000 [32:19<1:27:14, 1.38it/s, loss=0.0296, lr=2.19e-05, step=2802] Training: 28%|██▊ | 2803/10000 [32:20<1:27:30, 1.37it/s, loss=0.0296, lr=2.19e-05, step=2802] Training: 28%|██▊ | 2803/10000 [32:20<1:27:30, 1.37it/s, loss=0.0151, lr=2.19e-05, step=2803] Training: 28%|██▊ | 2804/10000 [32:21<1:22:39, 1.45it/s, loss=0.0151, lr=2.19e-05, step=2803] Training: 28%|██▊ | 2804/10000 [32:21<1:22:39, 1.45it/s, loss=0.0269, lr=2.19e-05, step=2804] Training: 28%|██▊ | 2805/10000 [32:22<1:28:15, 1.36it/s, loss=0.0269, lr=2.19e-05, step=2804] Training: 28%|██▊ | 2805/10000 [32:22<1:28:15, 1.36it/s, loss=0.0203, lr=2.19e-05, step=2805] Training: 28%|██▊ | 2806/10000 [32:22<1:21:55, 1.46it/s, loss=0.0203, lr=2.19e-05, step=2805] Training: 28%|██▊ | 2806/10000 [32:22<1:21:55, 1.46it/s, loss=0.0104, lr=2.19e-05, step=2806] Training: 28%|██▊ | 2807/10000 [32:23<1:15:23, 1.59it/s, loss=0.0104, lr=2.19e-05, step=2806] Training: 28%|██▊ | 2807/10000 [32:23<1:15:23, 1.59it/s, loss=0.0067, lr=2.19e-05, step=2807] Training: 28%|██▊ | 2808/10000 [32:23<1:10:07, 1.71it/s, loss=0.0067, lr=2.19e-05, step=2807] Training: 28%|██▊ | 2808/10000 [32:23<1:10:07, 1.71it/s, loss=0.0174, lr=2.19e-05, step=2808] Training: 28%|██▊ | 2809/10000 [32:24<1:21:33, 1.47it/s, loss=0.0174, lr=2.19e-05, step=2808] Training: 28%|██▊ | 2809/10000 [32:24<1:21:33, 1.47it/s, loss=0.0354, lr=2.19e-05, step=2809]16:38:31.623 [I] step=2810 loss=0.0100 smoothed_loss=0.0229 lr=2.19e-05 grad_norm=0.5532 step_time=0.5788s data_time=0.1086s it/s=1.455 eta_to_10000=4941.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.1537 grad_shared_expert=0.6362 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2810/10000 [32:25<1:20:39, 1.49it/s, loss=0.0354, lr=2.19e-05, step=2809] Training: 28%|██▊ | 2810/10000 [32:25<1:20:39, 1.49it/s, loss=0.0100, lr=2.19e-05, step=2810] Training: 28%|██▊ | 2811/10000 [32:25<1:17:10, 1.55it/s, loss=0.0100, lr=2.19e-05, step=2810] Training: 28%|██▊ | 2811/10000 [32:25<1:17:10, 1.55it/s, loss=0.0092, lr=2.19e-05, step=2811] Training: 28%|██▊ | 2812/10000 [32:26<1:17:24, 1.55it/s, loss=0.0092, lr=2.19e-05, step=2811] Training: 28%|██▊ | 2812/10000 [32:26<1:17:24, 1.55it/s, loss=0.0283, lr=2.19e-05, step=2812] Training: 28%|██▊ | 2813/10000 [32:26<1:12:20, 1.66it/s, loss=0.0283, lr=2.19e-05, step=2812] Training: 28%|██▊ | 2813/10000 [32:26<1:12:20, 1.66it/s, loss=0.0227, lr=2.19e-05, step=2813] Training: 28%|██▊ | 2814/10000 [32:27<1:11:45, 1.67it/s, loss=0.0227, lr=2.19e-05, step=2813] Training: 28%|██▊ | 2814/10000 [32:27<1:11:45, 1.67it/s, loss=0.0125, lr=2.19e-05, step=2814] Training: 28%|██▊ | 2815/10000 [32:28<1:10:02, 1.71it/s, loss=0.0125, lr=2.19e-05, step=2814] Training: 28%|██▊ | 2815/10000 [32:28<1:10:02, 1.71it/s, loss=0.0060, lr=2.19e-05, step=2815] Training: 28%|██▊ | 2816/10000 [32:29<1:32:34, 1.29it/s, loss=0.0060, lr=2.19e-05, step=2815] Training: 28%|██▊ | 2816/10000 [32:29<1:32:34, 1.29it/s, loss=0.0131, lr=2.19e-05, step=2816] Training: 28%|██▊ | 2817/10000 [32:29<1:23:54, 1.43it/s, loss=0.0131, lr=2.19e-05, step=2816] Training: 28%|██▊ | 2817/10000 [32:29<1:23:54, 1.43it/s, loss=0.0198, lr=2.19e-05, step=2817] Training: 28%|██▊ | 2818/10000 [32:30<1:18:18, 1.53it/s, loss=0.0198, lr=2.19e-05, step=2817] Training: 28%|██▊ | 2818/10000 [32:30<1:18:18, 1.53it/s, loss=0.0030, lr=2.19e-05, step=2818] Training: 28%|██▊ | 2819/10000 [32:30<1:12:56, 1.64it/s, loss=0.0030, lr=2.19e-05, step=2818] Training: 28%|██▊ | 2819/10000 [32:30<1:12:56, 1.64it/s, loss=0.0176, lr=2.19e-05, step=2819]16:38:38.227 [I] step=2820 loss=0.0524 smoothed_loss=0.0210 lr=2.19e-05 grad_norm=0.5461 step_time=0.5318s data_time=0.1286s it/s=1.514 eta_to_10000=4741.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0191 grad_action_out_proj=0.1982 grad_shared_expert=0.7516 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2820/10000 [32:31<1:24:41, 1.41it/s, loss=0.0176, lr=2.19e-05, step=2819] Training: 28%|██▊ | 2820/10000 [32:31<1:24:41, 1.41it/s, loss=0.0524, lr=2.19e-05, step=2820] Training: 28%|██▊ | 2821/10000 [32:32<1:25:24, 1.40it/s, loss=0.0524, lr=2.19e-05, step=2820] Training: 28%|██▊ | 2821/10000 [32:32<1:25:24, 1.40it/s, loss=0.0053, lr=2.18e-05, step=2821] Training: 28%|██▊ | 2822/10000 [32:33<1:27:38, 1.37it/s, loss=0.0053, lr=2.18e-05, step=2821] Training: 28%|██▊ | 2822/10000 [32:33<1:27:38, 1.37it/s, loss=0.0055, lr=2.18e-05, step=2822] Training: 28%|██▊ | 2823/10000 [32:34<1:27:22, 1.37it/s, loss=0.0055, lr=2.18e-05, step=2822] Training: 28%|██▊ | 2823/10000 [32:34<1:27:22, 1.37it/s, loss=0.0060, lr=2.18e-05, step=2823] Training: 28%|██▊ | 2824/10000 [32:34<1:35:54, 1.25it/s, loss=0.0060, lr=2.18e-05, step=2823] Training: 28%|██▊ | 2824/10000 [32:34<1:35:54, 1.25it/s, loss=0.0064, lr=2.18e-05, step=2824] Training: 28%|██▊ | 2825/10000 [32:35<1:26:16, 1.39it/s, loss=0.0064, lr=2.18e-05, step=2824] Training: 28%|██▊ | 2825/10000 [32:35<1:26:16, 1.39it/s, loss=0.0093, lr=2.18e-05, step=2825] Training: 28%|██▊ | 2826/10000 [32:36<1:18:36, 1.52it/s, loss=0.0093, lr=2.18e-05, step=2825] Training: 28%|██▊ | 2826/10000 [32:36<1:18:36, 1.52it/s, loss=0.0124, lr=2.18e-05, step=2826] Training: 28%|██▊ | 2827/10000 [32:36<1:21:09, 1.47it/s, loss=0.0124, lr=2.18e-05, step=2826] Training: 28%|██▊ | 2827/10000 [32:36<1:21:09, 1.47it/s, loss=0.0115, lr=2.18e-05, step=2827] Training: 28%|██▊ | 2828/10000 [32:37<1:15:18, 1.59it/s, loss=0.0115, lr=2.18e-05, step=2827] Training: 28%|██▊ | 2828/10000 [32:37<1:15:18, 1.59it/s, loss=0.0028, lr=2.18e-05, step=2828] Training: 28%|██▊ | 2829/10000 [32:37<1:10:31, 1.69it/s, loss=0.0028, lr=2.18e-05, step=2828] Training: 28%|██▊ | 2829/10000 [32:37<1:10:31, 1.69it/s, loss=0.0194, lr=2.18e-05, step=2829]16:38:44.894 [I] step=2830 loss=0.0054 smoothed_loss=0.0131 lr=2.18e-05 grad_norm=0.4631 step_time=0.5608s data_time=0.1059s it/s=1.500 eta_to_10000=4779.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0076 grad_action_out_proj=0.0837 grad_shared_expert=0.4084 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2830/10000 [32:38<1:13:53, 1.62it/s, loss=0.0194, lr=2.18e-05, step=2829] Training: 28%|██▊ | 2830/10000 [32:38<1:13:53, 1.62it/s, loss=0.0054, lr=2.18e-05, step=2830] Training: 28%|██▊ | 2831/10000 [32:39<1:25:17, 1.40it/s, loss=0.0054, lr=2.18e-05, step=2830] Training: 28%|██▊ | 2831/10000 [32:39<1:25:17, 1.40it/s, loss=0.0364, lr=2.18e-05, step=2831] Training: 28%|██▊ | 2832/10000 [32:40<1:22:53, 1.44it/s, loss=0.0364, lr=2.18e-05, step=2831] Training: 28%|██▊ | 2832/10000 [32:40<1:22:53, 1.44it/s, loss=0.0069, lr=2.18e-05, step=2832] Training: 28%|██▊ | 2833/10000 [32:41<1:32:21, 1.29it/s, loss=0.0069, lr=2.18e-05, step=2832] Training: 28%|██▊ | 2833/10000 [32:41<1:32:21, 1.29it/s, loss=0.0083, lr=2.18e-05, step=2833] Training: 28%|██▊ | 2834/10000 [32:41<1:37:54, 1.22it/s, loss=0.0083, lr=2.18e-05, step=2833] Training: 28%|██▊ | 2834/10000 [32:41<1:37:54, 1.22it/s, loss=0.0185, lr=2.18e-05, step=2834] Training: 28%|██▊ | 2835/10000 [32:42<1:31:37, 1.30it/s, loss=0.0185, lr=2.18e-05, step=2834] Training: 28%|██▊ | 2835/10000 [32:42<1:31:37, 1.30it/s, loss=0.0102, lr=2.18e-05, step=2835] Training: 28%|██▊ | 2836/10000 [32:43<1:31:00, 1.31it/s, loss=0.0102, lr=2.18e-05, step=2835] Training: 28%|██▊ | 2836/10000 [32:43<1:31:00, 1.31it/s, loss=0.0164, lr=2.18e-05, step=2836] Training: 28%|██▊ | 2837/10000 [32:44<1:29:24, 1.34it/s, loss=0.0164, lr=2.18e-05, step=2836] Training: 28%|██▊ | 2837/10000 [32:44<1:29:24, 1.34it/s, loss=0.0195, lr=2.18e-05, step=2837] Training: 28%|██▊ | 2838/10000 [32:44<1:32:29, 1.29it/s, loss=0.0195, lr=2.18e-05, step=2837] Training: 28%|██▊ | 2838/10000 [32:44<1:32:29, 1.29it/s, loss=0.0373, lr=2.18e-05, step=2838] Training: 28%|██▊ | 2839/10000 [32:45<1:22:00, 1.46it/s, loss=0.0373, lr=2.18e-05, step=2838] Training: 28%|██▊ | 2839/10000 [32:45<1:22:00, 1.46it/s, loss=0.0299, lr=2.18e-05, step=2839]16:38:52.374 [I] step=2840 loss=0.0161 smoothed_loss=0.0181 lr=2.18e-05 grad_norm=0.4850 step_time=0.5894s data_time=0.1586s it/s=1.337 eta_to_10000=5355.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0172 grad_action_out_proj=0.1635 grad_shared_expert=0.4637 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2840/10000 [32:45<1:18:05, 1.53it/s, loss=0.0299, lr=2.18e-05, step=2839] Training: 28%|██▊ | 2840/10000 [32:45<1:18:05, 1.53it/s, loss=0.0161, lr=2.18e-05, step=2840] Training: 28%|██▊ | 2841/10000 [32:46<1:12:42, 1.64it/s, loss=0.0161, lr=2.18e-05, step=2840] Training: 28%|██▊ | 2841/10000 [32:46<1:12:42, 1.64it/s, loss=0.0364, lr=2.18e-05, step=2841] Training: 28%|██▊ | 2842/10000 [32:47<1:11:03, 1.68it/s, loss=0.0364, lr=2.18e-05, step=2841] Training: 28%|██▊ | 2842/10000 [32:47<1:11:03, 1.68it/s, loss=0.0146, lr=2.18e-05, step=2842] Training: 28%|██▊ | 2843/10000 [32:47<1:08:29, 1.74it/s, loss=0.0146, lr=2.18e-05, step=2842] Training: 28%|██▊ | 2843/10000 [32:47<1:08:29, 1.74it/s, loss=0.0257, lr=2.18e-05, step=2843] Training: 28%|██▊ | 2844/10000 [32:48<1:06:11, 1.80it/s, loss=0.0257, lr=2.18e-05, step=2843] Training: 28%|██▊ | 2844/10000 [32:48<1:06:11, 1.80it/s, loss=0.0141, lr=2.18e-05, step=2844] Training: 28%|██▊ | 2845/10000 [32:48<1:12:38, 1.64it/s, loss=0.0141, lr=2.18e-05, step=2844] Training: 28%|██▊ | 2845/10000 [32:48<1:12:38, 1.64it/s, loss=0.0234, lr=2.18e-05, step=2845] Training: 28%|██▊ | 2846/10000 [32:49<1:19:12, 1.51it/s, loss=0.0234, lr=2.18e-05, step=2845] Training: 28%|██▊ | 2846/10000 [32:49<1:19:12, 1.51it/s, loss=0.0154, lr=2.18e-05, step=2846] Training: 28%|██▊ | 2847/10000 [32:50<1:13:24, 1.62it/s, loss=0.0154, lr=2.18e-05, step=2846] Training: 28%|██▊ | 2847/10000 [32:50<1:13:24, 1.62it/s, loss=0.0238, lr=2.18e-05, step=2847] Training: 28%|██▊ | 2848/10000 [32:50<1:13:38, 1.62it/s, loss=0.0238, lr=2.18e-05, step=2847] Training: 28%|██▊ | 2848/10000 [32:50<1:13:38, 1.62it/s, loss=0.0396, lr=2.18e-05, step=2848] Training: 28%|██▊ | 2849/10000 [32:51<1:18:54, 1.51it/s, loss=0.0396, lr=2.18e-05, step=2848] Training: 28%|██▊ | 2849/10000 [32:51<1:18:54, 1.51it/s, loss=0.0070, lr=2.18e-05, step=2849]16:38:58.562 [I] step=2850 loss=0.0161 smoothed_loss=0.0199 lr=2.18e-05 grad_norm=0.5100 step_time=0.5435s data_time=0.0753s it/s=1.616 eta_to_10000=4424.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.1323 grad_shared_expert=0.4949 (10775:train_pytorch.py:850) + Training: 28%|██▊ | 2850/10000 [32:52<1:19:06, 1.51it/s, loss=0.0070, lr=2.18e-05, step=2849] Training: 28%|██▊ | 2850/10000 [32:52<1:19:06, 1.51it/s, loss=0.0161, lr=2.18e-05, step=2850] Training: 29%|██▊ | 2851/10000 [32:52<1:18:41, 1.51it/s, loss=0.0161, lr=2.18e-05, step=2850] Training: 29%|██▊ | 2851/10000 [32:52<1:18:41, 1.51it/s, loss=0.0518, lr=2.18e-05, step=2851] Training: 29%|██▊ | 2852/10000 [32:53<1:21:07, 1.47it/s, loss=0.0518, lr=2.18e-05, step=2851] Training: 29%|██▊ | 2852/10000 [32:53<1:21:07, 1.47it/s, loss=0.0109, lr=2.18e-05, step=2852] Training: 29%|██▊ | 2853/10000 [32:54<1:24:29, 1.41it/s, loss=0.0109, lr=2.18e-05, step=2852] Training: 29%|██▊ | 2853/10000 [32:54<1:24:29, 1.41it/s, loss=0.0078, lr=2.18e-05, step=2853] Training: 29%|██▊ | 2854/10000 [32:54<1:17:49, 1.53it/s, loss=0.0078, lr=2.18e-05, step=2853] Training: 29%|██▊ | 2854/10000 [32:54<1:17:49, 1.53it/s, loss=0.0289, lr=2.18e-05, step=2854] Training: 29%|██▊ | 2855/10000 [32:55<1:12:01, 1.65it/s, loss=0.0289, lr=2.18e-05, step=2854] Training: 29%|██▊ | 2855/10000 [32:55<1:12:01, 1.65it/s, loss=0.0175, lr=2.18e-05, step=2855] Training: 29%|██▊ | 2856/10000 [32:55<1:08:38, 1.73it/s, loss=0.0175, lr=2.18e-05, step=2855] Training: 29%|██▊ | 2856/10000 [32:55<1:08:38, 1.73it/s, loss=0.0106, lr=2.18e-05, step=2856] Training: 29%|██▊ | 2857/10000 [32:56<1:07:50, 1.75it/s, loss=0.0106, lr=2.18e-05, step=2856] Training: 29%|██▊ | 2857/10000 [32:56<1:07:50, 1.75it/s, loss=0.0279, lr=2.18e-05, step=2857] Training: 29%|██▊ | 2858/10000 [32:57<1:13:46, 1.61it/s, loss=0.0279, lr=2.18e-05, step=2857] Training: 29%|██▊ | 2858/10000 [32:57<1:13:46, 1.61it/s, loss=0.0239, lr=2.18e-05, step=2858] Training: 29%|██▊ | 2859/10000 [32:57<1:20:28, 1.48it/s, loss=0.0239, lr=2.18e-05, step=2858] Training: 29%|██▊ | 2859/10000 [32:57<1:20:28, 1.48it/s, loss=0.0231, lr=2.17e-05, step=2859]16:39:04.964 [I] step=2860 loss=0.0264 smoothed_loss=0.0217 lr=2.18e-05 grad_norm=0.4811 step_time=0.5625s data_time=0.0776s it/s=1.563 eta_to_10000=4569.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0142 grad_action_out_proj=0.1226 grad_shared_expert=0.3633 (10775:train_pytorch.py:850) + Training: 29%|██▊ | 2860/10000 [32:58<1:18:30, 1.52it/s, loss=0.0231, lr=2.17e-05, step=2859] Training: 29%|██▊ | 2860/10000 [32:58<1:18:30, 1.52it/s, loss=0.0264, lr=2.17e-05, step=2860] Training: 29%|██▊ | 2861/10000 [32:59<1:18:25, 1.52it/s, loss=0.0264, lr=2.17e-05, step=2860] Training: 29%|██▊ | 2861/10000 [32:59<1:18:25, 1.52it/s, loss=0.0186, lr=2.17e-05, step=2861] Training: 29%|██▊ | 2862/10000 [33:00<1:26:09, 1.38it/s, loss=0.0186, lr=2.17e-05, step=2861] Training: 29%|██▊ | 2862/10000 [33:00<1:26:09, 1.38it/s, loss=0.0092, lr=2.17e-05, step=2862] Training: 29%|██▊ | 2863/10000 [33:00<1:20:52, 1.47it/s, loss=0.0092, lr=2.17e-05, step=2862] Training: 29%|██▊ | 2863/10000 [33:00<1:20:52, 1.47it/s, loss=0.0206, lr=2.17e-05, step=2863] Training: 29%|██▊ | 2864/10000 [33:01<1:13:59, 1.61it/s, loss=0.0206, lr=2.17e-05, step=2863] Training: 29%|██▊ | 2864/10000 [33:01<1:13:59, 1.61it/s, loss=0.0139, lr=2.17e-05, step=2864] Training: 29%|██▊ | 2865/10000 [33:01<1:17:52, 1.53it/s, loss=0.0139, lr=2.17e-05, step=2864] Training: 29%|██▊ | 2865/10000 [33:01<1:17:52, 1.53it/s, loss=0.0348, lr=2.17e-05, step=2865] Training: 29%|██▊ | 2866/10000 [33:02<1:15:08, 1.58it/s, loss=0.0348, lr=2.17e-05, step=2865] Training: 29%|██▊ | 2866/10000 [33:02<1:15:08, 1.58it/s, loss=0.0213, lr=2.17e-05, step=2866] Training: 29%|██▊ | 2867/10000 [33:03<1:18:30, 1.51it/s, loss=0.0213, lr=2.17e-05, step=2866] Training: 29%|██▊ | 2867/10000 [33:03<1:18:30, 1.51it/s, loss=0.0221, lr=2.17e-05, step=2867] Training: 29%|██▊ | 2868/10000 [33:03<1:19:17, 1.50it/s, loss=0.0221, lr=2.17e-05, step=2867] Training: 29%|██▊ | 2868/10000 [33:03<1:19:17, 1.50it/s, loss=0.0144, lr=2.17e-05, step=2868] Training: 29%|██▊ | 2869/10000 [33:04<1:22:34, 1.44it/s, loss=0.0144, lr=2.17e-05, step=2868] Training: 29%|██▊ | 2869/10000 [33:04<1:22:34, 1.44it/s, loss=0.0127, lr=2.17e-05, step=2869]16:39:11.842 [I] step=2870 loss=0.0154 smoothed_loss=0.0193 lr=2.17e-05 grad_norm=0.5109 step_time=0.5701s data_time=0.1178s it/s=1.454 eta_to_10000=4903.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.1803 grad_shared_expert=0.4640 (10775:train_pytorch.py:850) + Training: 29%|██▊ | 2870/10000 [33:05<1:26:23, 1.38it/s, loss=0.0127, lr=2.17e-05, step=2869] Training: 29%|██▊ | 2870/10000 [33:05<1:26:23, 1.38it/s, loss=0.0154, lr=2.17e-05, step=2870] Training: 29%|██▊ | 2871/10000 [33:06<1:22:09, 1.45it/s, loss=0.0154, lr=2.17e-05, step=2870] Training: 29%|██▊ | 2871/10000 [33:06<1:22:09, 1.45it/s, loss=0.0276, lr=2.17e-05, step=2871] Training: 29%|██▊ | 2872/10000 [33:06<1:17:06, 1.54it/s, loss=0.0276, lr=2.17e-05, step=2871] Training: 29%|██▊ | 2872/10000 [33:06<1:17:06, 1.54it/s, loss=0.0581, lr=2.17e-05, step=2872] Training: 29%|██▊ | 2873/10000 [33:07<1:18:51, 1.51it/s, loss=0.0581, lr=2.17e-05, step=2872] Training: 29%|██▊ | 2873/10000 [33:07<1:18:51, 1.51it/s, loss=0.0336, lr=2.17e-05, step=2873] Training: 29%|██▊ | 2874/10000 [33:08<1:22:38, 1.44it/s, loss=0.0336, lr=2.17e-05, step=2873] Training: 29%|██▊ | 2874/10000 [33:08<1:22:38, 1.44it/s, loss=0.0118, lr=2.17e-05, step=2874] Training: 29%|██▉ | 2875/10000 [33:08<1:21:42, 1.45it/s, loss=0.0118, lr=2.17e-05, step=2874] Training: 29%|██▉ | 2875/10000 [33:08<1:21:42, 1.45it/s, loss=0.0186, lr=2.17e-05, step=2875] Training: 29%|██▉ | 2876/10000 [33:09<1:15:21, 1.58it/s, loss=0.0186, lr=2.17e-05, step=2875] Training: 29%|██▉ | 2876/10000 [33:09<1:15:21, 1.58it/s, loss=0.0182, lr=2.17e-05, step=2876] Training: 29%|██▉ | 2877/10000 [33:09<1:15:42, 1.57it/s, loss=0.0182, lr=2.17e-05, step=2876] Training: 29%|██▉ | 2877/10000 [33:09<1:15:42, 1.57it/s, loss=0.0487, lr=2.17e-05, step=2877] Training: 29%|██▉ | 2878/10000 [33:10<1:19:25, 1.49it/s, loss=0.0487, lr=2.17e-05, step=2877] Training: 29%|██▉ | 2878/10000 [33:10<1:19:25, 1.49it/s, loss=0.0281, lr=2.17e-05, step=2878] Training: 29%|██▉ | 2879/10000 [33:11<1:20:40, 1.47it/s, loss=0.0281, lr=2.17e-05, step=2878] Training: 29%|██▉ | 2879/10000 [33:11<1:20:40, 1.47it/s, loss=0.0194, lr=2.17e-05, step=2879]16:39:18.493 [I] step=2880 loss=0.0103 smoothed_loss=0.0234 lr=2.17e-05 grad_norm=0.5169 step_time=0.5698s data_time=0.0952s it/s=1.504 eta_to_10000=4734.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0108 grad_action_out_proj=0.2020 grad_shared_expert=0.6552 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2880/10000 [33:12<1:23:19, 1.42it/s, loss=0.0194, lr=2.17e-05, step=2879] Training: 29%|██▉ | 2880/10000 [33:12<1:23:19, 1.42it/s, loss=0.0103, lr=2.17e-05, step=2880] Training: 29%|██▉ | 2881/10000 [33:12<1:25:19, 1.39it/s, loss=0.0103, lr=2.17e-05, step=2880] Training: 29%|██▉ | 2881/10000 [33:12<1:25:19, 1.39it/s, loss=0.0474, lr=2.17e-05, step=2881] Training: 29%|██▉ | 2882/10000 [33:13<1:22:43, 1.43it/s, loss=0.0474, lr=2.17e-05, step=2881] Training: 29%|██▉ | 2882/10000 [33:13<1:22:43, 1.43it/s, loss=0.0192, lr=2.17e-05, step=2882] Training: 29%|██▉ | 2883/10000 [33:14<1:21:13, 1.46it/s, loss=0.0192, lr=2.17e-05, step=2882] Training: 29%|██▉ | 2883/10000 [33:14<1:21:13, 1.46it/s, loss=0.0237, lr=2.17e-05, step=2883] Training: 29%|██▉ | 2884/10000 [33:14<1:19:37, 1.49it/s, loss=0.0237, lr=2.17e-05, step=2883] Training: 29%|██▉ | 2884/10000 [33:14<1:19:37, 1.49it/s, loss=0.0234, lr=2.17e-05, step=2884] Training: 29%|██▉ | 2885/10000 [33:15<1:15:17, 1.58it/s, loss=0.0234, lr=2.17e-05, step=2884] Training: 29%|██▉ | 2885/10000 [33:15<1:15:17, 1.58it/s, loss=0.0065, lr=2.17e-05, step=2885] Training: 29%|██▉ | 2886/10000 [33:15<1:16:03, 1.56it/s, loss=0.0065, lr=2.17e-05, step=2885] Training: 29%|██▉ | 2886/10000 [33:15<1:16:03, 1.56it/s, loss=0.0077, lr=2.17e-05, step=2886] Training: 29%|██▉ | 2887/10000 [33:16<1:10:52, 1.67it/s, loss=0.0077, lr=2.17e-05, step=2886] Training: 29%|██▉ | 2887/10000 [33:16<1:10:52, 1.67it/s, loss=0.0200, lr=2.17e-05, step=2887] Training: 29%|██▉ | 2888/10000 [33:17<1:25:46, 1.38it/s, loss=0.0200, lr=2.17e-05, step=2887] Training: 29%|██▉ | 2888/10000 [33:17<1:25:46, 1.38it/s, loss=0.0170, lr=2.17e-05, step=2888] Training: 29%|██▉ | 2889/10000 [33:18<1:18:35, 1.51it/s, loss=0.0170, lr=2.17e-05, step=2888] Training: 29%|██▉ | 2889/10000 [33:18<1:18:35, 1.51it/s, loss=0.0062, lr=2.17e-05, step=2889]16:39:25.016 [I] step=2890 loss=0.0173 smoothed_loss=0.0192 lr=2.17e-05 grad_norm=0.5454 step_time=0.5583s data_time=0.0941s it/s=1.534 eta_to_10000=4635.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0067 grad_action_out_proj=0.0907 grad_shared_expert=0.3739 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2890/10000 [33:18<1:15:32, 1.57it/s, loss=0.0062, lr=2.17e-05, step=2889] Training: 29%|██▉ | 2890/10000 [33:18<1:15:32, 1.57it/s, loss=0.0173, lr=2.17e-05, step=2890] Training: 29%|██▉ | 2891/10000 [33:19<1:10:38, 1.68it/s, loss=0.0173, lr=2.17e-05, step=2890] Training: 29%|██▉ | 2891/10000 [33:19<1:10:38, 1.68it/s, loss=0.0139, lr=2.17e-05, step=2891] Training: 29%|██▉ | 2892/10000 [33:19<1:07:14, 1.76it/s, loss=0.0139, lr=2.17e-05, step=2891] Training: 29%|██▉ | 2892/10000 [33:19<1:07:14, 1.76it/s, loss=0.0140, lr=2.17e-05, step=2892] Training: 29%|██▉ | 2893/10000 [33:20<1:04:42, 1.83it/s, loss=0.0140, lr=2.17e-05, step=2892] Training: 29%|██▉ | 2893/10000 [33:20<1:04:42, 1.83it/s, loss=0.0054, lr=2.17e-05, step=2893] Training: 29%|██▉ | 2894/10000 [33:20<1:02:46, 1.89it/s, loss=0.0054, lr=2.17e-05, step=2893] Training: 29%|██▉ | 2894/10000 [33:20<1:02:46, 1.89it/s, loss=0.0056, lr=2.17e-05, step=2894] Training: 29%|██▉ | 2895/10000 [33:21<1:16:30, 1.55it/s, loss=0.0056, lr=2.17e-05, step=2894] Training: 29%|██▉ | 2895/10000 [33:21<1:16:30, 1.55it/s, loss=0.0441, lr=2.17e-05, step=2895] Training: 29%|██▉ | 2896/10000 [33:22<1:22:56, 1.43it/s, loss=0.0441, lr=2.17e-05, step=2895] Training: 29%|██▉ | 2896/10000 [33:22<1:22:56, 1.43it/s, loss=0.0406, lr=2.17e-05, step=2896] Training: 29%|██▉ | 2897/10000 [33:22<1:20:06, 1.48it/s, loss=0.0406, lr=2.17e-05, step=2896] Training: 29%|██▉ | 2897/10000 [33:22<1:20:06, 1.48it/s, loss=0.0183, lr=2.16e-05, step=2897] Training: 29%|██▉ | 2898/10000 [33:23<1:15:00, 1.58it/s, loss=0.0183, lr=2.16e-05, step=2897] Training: 29%|██▉ | 2898/10000 [33:23<1:15:00, 1.58it/s, loss=0.0358, lr=2.16e-05, step=2898] Training: 29%|██▉ | 2899/10000 [33:24<1:16:49, 1.54it/s, loss=0.0358, lr=2.16e-05, step=2898] Training: 29%|██▉ | 2899/10000 [33:24<1:16:49, 1.54it/s, loss=0.0190, lr=2.16e-05, step=2899]16:39:31.152 [I] step=2900 loss=0.0156 smoothed_loss=0.0212 lr=2.17e-05 grad_norm=0.5642 step_time=0.5299s data_time=0.0837s it/s=1.630 eta_to_10000=4355.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.0881 grad_shared_expert=0.2990 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2900/10000 [33:24<1:13:43, 1.61it/s, loss=0.0190, lr=2.16e-05, step=2899] Training: 29%|██▉ | 2900/10000 [33:24<1:13:43, 1.61it/s, loss=0.0156, lr=2.16e-05, step=2900] Training: 29%|██▉ | 2901/10000 [33:25<1:09:11, 1.71it/s, loss=0.0156, lr=2.16e-05, step=2900] Training: 29%|██▉ | 2901/10000 [33:25<1:09:11, 1.71it/s, loss=0.0070, lr=2.16e-05, step=2901] Training: 29%|██▉ | 2902/10000 [33:26<1:17:29, 1.53it/s, loss=0.0070, lr=2.16e-05, step=2901] Training: 29%|██▉ | 2902/10000 [33:26<1:17:29, 1.53it/s, loss=0.0237, lr=2.16e-05, step=2902] Training: 29%|██▉ | 2903/10000 [33:26<1:19:50, 1.48it/s, loss=0.0237, lr=2.16e-05, step=2902] Training: 29%|██▉ | 2903/10000 [33:26<1:19:50, 1.48it/s, loss=0.0172, lr=2.16e-05, step=2903] Training: 29%|██▉ | 2904/10000 [33:27<1:13:17, 1.61it/s, loss=0.0172, lr=2.16e-05, step=2903] Training: 29%|██▉ | 2904/10000 [33:27<1:13:17, 1.61it/s, loss=0.0215, lr=2.16e-05, step=2904] Training: 29%|██▉ | 2905/10000 [33:28<1:20:11, 1.47it/s, loss=0.0215, lr=2.16e-05, step=2904] Training: 29%|██▉ | 2905/10000 [33:28<1:20:11, 1.47it/s, loss=0.0299, lr=2.16e-05, step=2905] Training: 29%|██▉ | 2906/10000 [33:28<1:25:08, 1.39it/s, loss=0.0299, lr=2.16e-05, step=2905] Training: 29%|██▉ | 2906/10000 [33:28<1:25:08, 1.39it/s, loss=0.0608, lr=2.16e-05, step=2906] Training: 29%|██▉ | 2907/10000 [33:29<1:27:51, 1.35it/s, loss=0.0608, lr=2.16e-05, step=2906] Training: 29%|██▉ | 2907/10000 [33:29<1:27:51, 1.35it/s, loss=0.0169, lr=2.16e-05, step=2907] Training: 29%|██▉ | 2908/10000 [33:30<1:27:38, 1.35it/s, loss=0.0169, lr=2.16e-05, step=2907] Training: 29%|██▉ | 2908/10000 [33:30<1:27:38, 1.35it/s, loss=0.0093, lr=2.16e-05, step=2908] Training: 29%|██▉ | 2909/10000 [33:31<1:22:38, 1.43it/s, loss=0.0093, lr=2.16e-05, step=2908] Training: 29%|██▉ | 2909/10000 [33:31<1:22:38, 1.43it/s, loss=0.0288, lr=2.16e-05, step=2909]16:39:38.321 [I] step=2910 loss=0.0128 smoothed_loss=0.0223 lr=2.16e-05 grad_norm=0.5587 step_time=0.5887s data_time=0.1282s it/s=1.395 eta_to_10000=5081.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0155 grad_action_out_proj=0.2325 grad_shared_expert=0.5371 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2910/10000 [33:31<1:28:48, 1.33it/s, loss=0.0288, lr=2.16e-05, step=2909] Training: 29%|██▉ | 2910/10000 [33:31<1:28:48, 1.33it/s, loss=0.0128, lr=2.16e-05, step=2910] Training: 29%|██▉ | 2911/10000 [33:32<1:19:16, 1.49it/s, loss=0.0128, lr=2.16e-05, step=2910] Training: 29%|██▉ | 2911/10000 [33:32<1:19:16, 1.49it/s, loss=0.0065, lr=2.16e-05, step=2911] Training: 29%|██▉ | 2912/10000 [33:32<1:13:29, 1.61it/s, loss=0.0065, lr=2.16e-05, step=2911] Training: 29%|██▉ | 2912/10000 [33:32<1:13:29, 1.61it/s, loss=0.0086, lr=2.16e-05, step=2912] Training: 29%|██▉ | 2913/10000 [33:33<1:12:54, 1.62it/s, loss=0.0086, lr=2.16e-05, step=2912] Training: 29%|██▉ | 2913/10000 [33:33<1:12:54, 1.62it/s, loss=0.0085, lr=2.16e-05, step=2913] Training: 29%|██▉ | 2914/10000 [33:33<1:09:07, 1.71it/s, loss=0.0085, lr=2.16e-05, step=2913] Training: 29%|██▉ | 2914/10000 [33:33<1:09:07, 1.71it/s, loss=0.0106, lr=2.16e-05, step=2914] Training: 29%|██▉ | 2915/10000 [33:34<1:11:24, 1.65it/s, loss=0.0106, lr=2.16e-05, step=2914] Training: 29%|██▉ | 2915/10000 [33:34<1:11:24, 1.65it/s, loss=0.0270, lr=2.16e-05, step=2915] Training: 29%|██▉ | 2916/10000 [33:35<1:11:52, 1.64it/s, loss=0.0270, lr=2.16e-05, step=2915] Training: 29%|██▉ | 2916/10000 [33:35<1:11:52, 1.64it/s, loss=0.0494, lr=2.16e-05, step=2916] Training: 29%|██▉ | 2917/10000 [33:36<1:18:20, 1.51it/s, loss=0.0494, lr=2.16e-05, step=2916] Training: 29%|██▉ | 2917/10000 [33:36<1:18:20, 1.51it/s, loss=0.0124, lr=2.16e-05, step=2917] Training: 29%|██▉ | 2918/10000 [33:36<1:13:06, 1.61it/s, loss=0.0124, lr=2.16e-05, step=2917] Training: 29%|██▉ | 2918/10000 [33:36<1:13:06, 1.61it/s, loss=0.0190, lr=2.16e-05, step=2918] Training: 29%|██▉ | 2919/10000 [33:37<1:10:54, 1.66it/s, loss=0.0190, lr=2.16e-05, step=2918] Training: 29%|██▉ | 2919/10000 [33:37<1:10:54, 1.66it/s, loss=0.0250, lr=2.16e-05, step=2919]16:39:44.208 [I] step=2920 loss=0.0066 smoothed_loss=0.0195 lr=2.16e-05 grad_norm=0.4992 step_time=0.5051s data_time=0.0837s it/s=1.699 eta_to_10000=4167.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0267 grad_action_out_proj=0.1862 grad_shared_expert=0.5075 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2920/10000 [33:37<1:12:30, 1.63it/s, loss=0.0250, lr=2.16e-05, step=2919] Training: 29%|██▉ | 2920/10000 [33:37<1:12:30, 1.63it/s, loss=0.0066, lr=2.16e-05, step=2920] Training: 29%|██▉ | 2921/10000 [33:38<1:09:44, 1.69it/s, loss=0.0066, lr=2.16e-05, step=2920] Training: 29%|██▉ | 2921/10000 [33:38<1:09:44, 1.69it/s, loss=0.0168, lr=2.16e-05, step=2921] Training: 29%|██▉ | 2922/10000 [33:38<1:12:04, 1.64it/s, loss=0.0168, lr=2.16e-05, step=2921] Training: 29%|██▉ | 2922/10000 [33:38<1:12:04, 1.64it/s, loss=0.0075, lr=2.16e-05, step=2922] Training: 29%|██▉ | 2923/10000 [33:39<1:11:04, 1.66it/s, loss=0.0075, lr=2.16e-05, step=2922] Training: 29%|██▉ | 2923/10000 [33:39<1:11:04, 1.66it/s, loss=0.0203, lr=2.16e-05, step=2923] Training: 29%|██▉ | 2924/10000 [33:40<1:19:15, 1.49it/s, loss=0.0203, lr=2.16e-05, step=2923] Training: 29%|██▉ | 2924/10000 [33:40<1:19:15, 1.49it/s, loss=0.0143, lr=2.16e-05, step=2924] Training: 29%|██▉ | 2925/10000 [33:40<1:12:47, 1.62it/s, loss=0.0143, lr=2.16e-05, step=2924] Training: 29%|██▉ | 2925/10000 [33:40<1:12:47, 1.62it/s, loss=0.0388, lr=2.16e-05, step=2925] Training: 29%|██▉ | 2926/10000 [33:41<1:08:32, 1.72it/s, loss=0.0388, lr=2.16e-05, step=2925] Training: 29%|██▉ | 2926/10000 [33:41<1:08:32, 1.72it/s, loss=0.0375, lr=2.16e-05, step=2926] Training: 29%|██▉ | 2927/10000 [33:41<1:05:02, 1.81it/s, loss=0.0375, lr=2.16e-05, step=2926] Training: 29%|██▉ | 2927/10000 [33:41<1:05:02, 1.81it/s, loss=0.0331, lr=2.16e-05, step=2927] Training: 29%|██▉ | 2928/10000 [33:42<1:04:22, 1.83it/s, loss=0.0331, lr=2.16e-05, step=2927] Training: 29%|██▉ | 2928/10000 [33:42<1:04:22, 1.83it/s, loss=0.0179, lr=2.16e-05, step=2928] Training: 29%|██▉ | 2929/10000 [33:42<1:04:00, 1.84it/s, loss=0.0179, lr=2.16e-05, step=2928] Training: 29%|██▉ | 2929/10000 [33:42<1:04:00, 1.84it/s, loss=0.0940, lr=2.16e-05, step=2929]16:39:49.994 [I] step=2930 loss=0.0262 smoothed_loss=0.0292 lr=2.16e-05 grad_norm=0.5589 step_time=0.5031s data_time=0.0754s it/s=1.729 eta_to_10000=4089.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0073 grad_action_out_proj=0.0810 grad_shared_expert=0.3075 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2930/10000 [33:43<1:07:16, 1.75it/s, loss=0.0940, lr=2.16e-05, step=2929] Training: 29%|██▉ | 2930/10000 [33:43<1:07:16, 1.75it/s, loss=0.0262, lr=2.16e-05, step=2930] Training: 29%|██▉ | 2931/10000 [33:44<1:19:39, 1.48it/s, loss=0.0262, lr=2.16e-05, step=2930] Training: 29%|██▉ | 2931/10000 [33:44<1:19:39, 1.48it/s, loss=0.0151, lr=2.16e-05, step=2931] Training: 29%|██▉ | 2932/10000 [33:45<1:25:11, 1.38it/s, loss=0.0151, lr=2.16e-05, step=2931] Training: 29%|██▉ | 2932/10000 [33:45<1:25:11, 1.38it/s, loss=0.0276, lr=2.16e-05, step=2932] Training: 29%|██▉ | 2933/10000 [33:45<1:21:21, 1.45it/s, loss=0.0276, lr=2.16e-05, step=2932] Training: 29%|██▉ | 2933/10000 [33:45<1:21:21, 1.45it/s, loss=0.0416, lr=2.16e-05, step=2933] Training: 29%|██▉ | 2934/10000 [33:46<1:17:21, 1.52it/s, loss=0.0416, lr=2.16e-05, step=2933] Training: 29%|██▉ | 2934/10000 [33:46<1:17:21, 1.52it/s, loss=0.0216, lr=2.16e-05, step=2934] Training: 29%|██▉ | 2935/10000 [33:47<1:13:20, 1.61it/s, loss=0.0216, lr=2.16e-05, step=2934] Training: 29%|██▉ | 2935/10000 [33:47<1:13:20, 1.61it/s, loss=0.0128, lr=2.15e-05, step=2935] Training: 29%|██▉ | 2936/10000 [33:47<1:15:48, 1.55it/s, loss=0.0128, lr=2.15e-05, step=2935] Training: 29%|██▉ | 2936/10000 [33:47<1:15:48, 1.55it/s, loss=0.0269, lr=2.15e-05, step=2936] Training: 29%|██▉ | 2937/10000 [33:48<1:11:57, 1.64it/s, loss=0.0269, lr=2.15e-05, step=2936] Training: 29%|██▉ | 2937/10000 [33:48<1:11:57, 1.64it/s, loss=0.0058, lr=2.15e-05, step=2937] Training: 29%|██▉ | 2938/10000 [33:49<1:22:09, 1.43it/s, loss=0.0058, lr=2.15e-05, step=2937] Training: 29%|██▉ | 2938/10000 [33:49<1:22:09, 1.43it/s, loss=0.0277, lr=2.15e-05, step=2938] Training: 29%|██▉ | 2939/10000 [33:49<1:14:29, 1.58it/s, loss=0.0277, lr=2.15e-05, step=2938] Training: 29%|██▉ | 2939/10000 [33:49<1:14:29, 1.58it/s, loss=0.0320, lr=2.15e-05, step=2939]16:39:56.633 [I] step=2940 loss=0.0139 smoothed_loss=0.0246 lr=2.15e-05 grad_norm=0.4749 step_time=0.5510s data_time=0.1130s it/s=1.506 eta_to_10000=4686.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0098 grad_action_out_proj=0.1133 grad_shared_expert=0.5210 (10775:train_pytorch.py:850) + Training: 29%|██▉ | 2940/10000 [33:50<1:11:08, 1.65it/s, loss=0.0320, lr=2.15e-05, step=2939] Training: 29%|██▉ | 2940/10000 [33:50<1:11:08, 1.65it/s, loss=0.0139, lr=2.15e-05, step=2940] Training: 29%|██▉ | 2941/10000 [33:50<1:06:45, 1.76it/s, loss=0.0139, lr=2.15e-05, step=2940] Training: 29%|██▉ | 2941/10000 [33:50<1:06:45, 1.76it/s, loss=0.0370, lr=2.15e-05, step=2941] Training: 29%|██▉ | 2942/10000 [33:51<1:04:43, 1.82it/s, loss=0.0370, lr=2.15e-05, step=2941] Training: 29%|██▉ | 2942/10000 [33:51<1:04:43, 1.82it/s, loss=0.0097, lr=2.15e-05, step=2942] Training: 29%|██▉ | 2943/10000 [33:51<1:08:02, 1.73it/s, loss=0.0097, lr=2.15e-05, step=2942] Training: 29%|██▉ | 2943/10000 [33:51<1:08:02, 1.73it/s, loss=0.0325, lr=2.15e-05, step=2943] Training: 29%|██▉ | 2944/10000 [33:52<1:05:36, 1.79it/s, loss=0.0325, lr=2.15e-05, step=2943] Training: 29%|██▉ | 2944/10000 [33:52<1:05:36, 1.79it/s, loss=0.0402, lr=2.15e-05, step=2944] Training: 29%|██▉ | 2945/10000 [33:53<1:16:41, 1.53it/s, loss=0.0402, lr=2.15e-05, step=2944] Training: 29%|██▉ | 2945/10000 [33:53<1:16:41, 1.53it/s, loss=0.0168, lr=2.15e-05, step=2945] Training: 29%|██▉ | 2946/10000 [33:53<1:11:33, 1.64it/s, loss=0.0168, lr=2.15e-05, step=2945] Training: 29%|██▉ | 2946/10000 [33:53<1:11:33, 1.64it/s, loss=0.0063, lr=2.15e-05, step=2946] Training: 29%|██▉ | 2947/10000 [33:54<1:07:54, 1.73it/s, loss=0.0063, lr=2.15e-05, step=2946] Training: 29%|██▉ | 2947/10000 [33:54<1:07:54, 1.73it/s, loss=0.0168, lr=2.15e-05, step=2947] Training: 29%|██▉ | 2948/10000 [33:54<1:08:30, 1.72it/s, loss=0.0168, lr=2.15e-05, step=2947] Training: 29%|██▉ | 2948/10000 [33:54<1:08:30, 1.72it/s, loss=0.0087, lr=2.15e-05, step=2948] Training: 29%|██▉ | 2949/10000 [33:55<1:09:15, 1.70it/s, loss=0.0087, lr=2.15e-05, step=2948] Training: 29%|██▉ | 2949/10000 [33:55<1:09:15, 1.70it/s, loss=0.0075, lr=2.15e-05, step=2949]16:40:02.433 [I] step=2950 loss=0.0182 smoothed_loss=0.0199 lr=2.15e-05 grad_norm=0.5483 step_time=0.4970s data_time=0.0829s it/s=1.725 eta_to_10000=4088.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0247 grad_action_out_proj=0.1669 grad_shared_expert=0.5289 (10775:train_pytorch.py:850) + Training: 30%|██▉ | 2950/10000 [33:55<1:08:33, 1.71it/s, loss=0.0075, lr=2.15e-05, step=2949] Training: 30%|██▉ | 2950/10000 [33:56<1:08:33, 1.71it/s, loss=0.0182, lr=2.15e-05, step=2950] Training: 30%|██▉ | 2951/10000 [33:56<1:06:43, 1.76it/s, loss=0.0182, lr=2.15e-05, step=2950] Training: 30%|██▉ | 2951/10000 [33:56<1:06:43, 1.76it/s, loss=0.0164, lr=2.15e-05, step=2951] Training: 30%|██▉ | 2952/10000 [33:57<1:17:49, 1.51it/s, loss=0.0164, lr=2.15e-05, step=2951] Training: 30%|██▉ | 2952/10000 [33:57<1:17:49, 1.51it/s, loss=0.0060, lr=2.15e-05, step=2952] Training: 30%|██▉ | 2953/10000 [33:58<1:28:56, 1.32it/s, loss=0.0060, lr=2.15e-05, step=2952] Training: 30%|██▉ | 2953/10000 [33:58<1:28:56, 1.32it/s, loss=0.0210, lr=2.15e-05, step=2953] Training: 30%|██▉ | 2954/10000 [33:59<1:24:26, 1.39it/s, loss=0.0210, lr=2.15e-05, step=2953] Training: 30%|██▉ | 2954/10000 [33:59<1:24:26, 1.39it/s, loss=0.0155, lr=2.15e-05, step=2954] Training: 30%|██▉ | 2955/10000 [33:59<1:21:37, 1.44it/s, loss=0.0155, lr=2.15e-05, step=2954] Training: 30%|██▉ | 2955/10000 [33:59<1:21:37, 1.44it/s, loss=0.0602, lr=2.15e-05, step=2955] Training: 30%|██▉ | 2956/10000 [34:00<1:22:27, 1.42it/s, loss=0.0602, lr=2.15e-05, step=2955] Training: 30%|██▉ | 2956/10000 [34:00<1:22:27, 1.42it/s, loss=0.0112, lr=2.15e-05, step=2956] Training: 30%|██▉ | 2957/10000 [34:01<1:25:19, 1.38it/s, loss=0.0112, lr=2.15e-05, step=2956] Training: 30%|██▉ | 2957/10000 [34:01<1:25:19, 1.38it/s, loss=0.0066, lr=2.15e-05, step=2957] Training: 30%|██▉ | 2958/10000 [34:01<1:26:40, 1.35it/s, loss=0.0066, lr=2.15e-05, step=2957] Training: 30%|██▉ | 2958/10000 [34:01<1:26:40, 1.35it/s, loss=0.0115, lr=2.15e-05, step=2958] Training: 30%|██▉ | 2959/10000 [34:02<1:24:12, 1.39it/s, loss=0.0115, lr=2.15e-05, step=2958] Training: 30%|██▉ | 2959/10000 [34:02<1:24:12, 1.39it/s, loss=0.0084, lr=2.15e-05, step=2959]16:40:10.186 [I] step=2960 loss=0.0072 smoothed_loss=0.0169 lr=2.15e-05 grad_norm=0.4488 step_time=0.6263s data_time=0.1490s it/s=1.290 eta_to_10000=5457.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0263 grad_action_out_proj=0.2042 grad_shared_expert=0.6754 (10775:train_pytorch.py:850) + Training: 30%|██▉ | 2960/10000 [34:03<1:39:37, 1.18it/s, loss=0.0084, lr=2.15e-05, step=2959] Training: 30%|██▉ | 2960/10000 [34:03<1:39:37, 1.18it/s, loss=0.0072, lr=2.15e-05, step=2960] Training: 30%|██▉ | 2961/10000 [34:05<1:54:48, 1.02it/s, loss=0.0072, lr=2.15e-05, step=2960] Training: 30%|██▉ | 2961/10000 [34:05<1:54:48, 1.02it/s, loss=0.0204, lr=2.15e-05, step=2961] Training: 30%|██▉ | 2962/10000 [34:06<1:55:58, 1.01it/s, loss=0.0204, lr=2.15e-05, step=2961] Training: 30%|██▉ | 2962/10000 [34:06<1:55:58, 1.01it/s, loss=0.0165, lr=2.15e-05, step=2962] Training: 30%|██▉ | 2963/10000 [34:06<1:47:13, 1.09it/s, loss=0.0165, lr=2.15e-05, step=2962] Training: 30%|██▉ | 2963/10000 [34:06<1:47:13, 1.09it/s, loss=0.0126, lr=2.15e-05, step=2963] Training: 30%|██▉ | 2964/10000 [34:07<1:44:56, 1.12it/s, loss=0.0126, lr=2.15e-05, step=2963] Training: 30%|██▉ | 2964/10000 [34:07<1:44:56, 1.12it/s, loss=0.0225, lr=2.15e-05, step=2964] Training: 30%|██▉ | 2965/10000 [34:08<1:39:37, 1.18it/s, loss=0.0225, lr=2.15e-05, step=2964] Training: 30%|██▉ | 2965/10000 [34:08<1:39:37, 1.18it/s, loss=0.0054, lr=2.15e-05, step=2965] Training: 30%|██▉ | 2966/10000 [34:09<1:44:16, 1.12it/s, loss=0.0054, lr=2.15e-05, step=2965] Training: 30%|██▉ | 2966/10000 [34:09<1:44:16, 1.12it/s, loss=0.0417, lr=2.15e-05, step=2966] Training: 30%|██▉ | 2967/10000 [34:10<1:42:38, 1.14it/s, loss=0.0417, lr=2.15e-05, step=2966] Training: 30%|██▉ | 2967/10000 [34:10<1:42:38, 1.14it/s, loss=0.0209, lr=2.15e-05, step=2967] Training: 30%|██▉ | 2968/10000 [34:10<1:34:33, 1.24it/s, loss=0.0209, lr=2.15e-05, step=2967] Training: 30%|██▉ | 2968/10000 [34:10<1:34:33, 1.24it/s, loss=0.0128, lr=2.15e-05, step=2968] Training: 30%|██▉ | 2969/10000 [34:12<1:46:56, 1.10it/s, loss=0.0128, lr=2.15e-05, step=2968] Training: 30%|██▉ | 2969/10000 [34:12<1:46:56, 1.10it/s, loss=0.0121, lr=2.15e-05, step=2969]16:40:19.122 [I] step=2970 loss=0.0163 smoothed_loss=0.0175 lr=2.15e-05 grad_norm=0.6842 step_time=0.6145s data_time=0.2790s it/s=1.120 eta_to_10000=6279.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0081 grad_action_out_proj=0.1291 grad_shared_expert=0.3258 (10775:train_pytorch.py:850) + Training: 30%|██▉ | 2970/10000 [34:12<1:38:38, 1.19it/s, loss=0.0121, lr=2.15e-05, step=2969] Training: 30%|██▉ | 2970/10000 [34:12<1:38:38, 1.19it/s, loss=0.0163, lr=2.15e-05, step=2970] Training: 30%|██▉ | 2971/10000 [34:13<1:32:55, 1.26it/s, loss=0.0163, lr=2.15e-05, step=2970] Training: 30%|██▉ | 2971/10000 [34:13<1:32:55, 1.26it/s, loss=0.0070, lr=2.15e-05, step=2971] Training: 30%|██▉ | 2972/10000 [34:13<1:27:11, 1.34it/s, loss=0.0070, lr=2.15e-05, step=2971] Training: 30%|██▉ | 2972/10000 [34:14<1:27:11, 1.34it/s, loss=0.0104, lr=2.14e-05, step=2972] Training: 30%|██▉ | 2973/10000 [34:14<1:24:58, 1.38it/s, loss=0.0104, lr=2.14e-05, step=2972] Training: 30%|██▉ | 2973/10000 [34:14<1:24:58, 1.38it/s, loss=0.0221, lr=2.14e-05, step=2973] Training: 30%|██▉ | 2974/10000 [34:15<1:32:50, 1.26it/s, loss=0.0221, lr=2.14e-05, step=2973] Training: 30%|██▉ | 2974/10000 [34:15<1:32:50, 1.26it/s, loss=0.0231, lr=2.14e-05, step=2974] Training: 30%|██▉ | 2975/10000 [34:16<1:32:49, 1.26it/s, loss=0.0231, lr=2.14e-05, step=2974] Training: 30%|██▉ | 2975/10000 [34:16<1:32:49, 1.26it/s, loss=0.0075, lr=2.14e-05, step=2975] Training: 30%|██▉ | 2976/10000 [34:16<1:25:09, 1.37it/s, loss=0.0075, lr=2.14e-05, step=2975] Training: 30%|██▉ | 2976/10000 [34:17<1:25:09, 1.37it/s, loss=0.0154, lr=2.14e-05, step=2976] Training: 30%|██▉ | 2977/10000 [34:17<1:19:07, 1.48it/s, loss=0.0154, lr=2.14e-05, step=2976] Training: 30%|██▉ | 2977/10000 [34:17<1:19:07, 1.48it/s, loss=0.0175, lr=2.14e-05, step=2977] Training: 30%|██▉ | 2978/10000 [34:18<1:17:33, 1.51it/s, loss=0.0175, lr=2.14e-05, step=2977] Training: 30%|██▉ | 2978/10000 [34:18<1:17:33, 1.51it/s, loss=0.0079, lr=2.14e-05, step=2978] Training: 30%|██▉ | 2979/10000 [34:19<1:35:13, 1.23it/s, loss=0.0079, lr=2.14e-05, step=2978] Training: 30%|██▉ | 2979/10000 [34:19<1:35:13, 1.23it/s, loss=0.0122, lr=2.14e-05, step=2979]16:40:26.620 [I] step=2980 loss=0.0250 smoothed_loss=0.0161 lr=2.14e-05 grad_norm=0.4309 step_time=0.6103s data_time=0.1397s it/s=1.334 eta_to_10000=5261.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1260 grad_shared_expert=0.4060 (10775:train_pytorch.py:850) + Training: 30%|██▉ | 2980/10000 [34:20<1:36:01, 1.22it/s, loss=0.0122, lr=2.14e-05, step=2979] Training: 30%|██▉ | 2980/10000 [34:20<1:36:01, 1.22it/s, loss=0.0250, lr=2.14e-05, step=2980] Training: 30%|██▉ | 2981/10000 [34:21<1:44:27, 1.12it/s, loss=0.0250, lr=2.14e-05, step=2980] Training: 30%|██▉ | 2981/10000 [34:21<1:44:27, 1.12it/s, loss=0.0108, lr=2.14e-05, step=2981] Training: 30%|██▉ | 2982/10000 [34:22<1:48:40, 1.08it/s, loss=0.0108, lr=2.14e-05, step=2981] Training: 30%|██▉ | 2982/10000 [34:22<1:48:40, 1.08it/s, loss=0.0070, lr=2.14e-05, step=2982] Training: 30%|██▉ | 2983/10000 [34:22<1:39:27, 1.18it/s, loss=0.0070, lr=2.14e-05, step=2982] Training: 30%|██▉ | 2983/10000 [34:22<1:39:27, 1.18it/s, loss=0.0329, lr=2.14e-05, step=2983] Training: 30%|██▉ | 2984/10000 [34:23<1:45:26, 1.11it/s, loss=0.0329, lr=2.14e-05, step=2983] Training: 30%|██▉ | 2984/10000 [34:24<1:45:26, 1.11it/s, loss=0.0267, lr=2.14e-05, step=2984] Training: 30%|██▉ | 2985/10000 [34:25<1:53:34, 1.03it/s, loss=0.0267, lr=2.14e-05, step=2984] Training: 30%|██▉ | 2985/10000 [34:25<1:53:34, 1.03it/s, loss=0.0383, lr=2.14e-05, step=2985] Training: 30%|██▉ | 2986/10000 [34:25<1:47:41, 1.09it/s, loss=0.0383, lr=2.14e-05, step=2985] Training: 30%|██▉ | 2986/10000 [34:25<1:47:41, 1.09it/s, loss=0.0413, lr=2.14e-05, step=2986] Training: 30%|██▉ | 2987/10000 [34:27<2:07:02, 1.09s/it, loss=0.0413, lr=2.14e-05, step=2986] Training: 30%|██▉ | 2987/10000 [34:27<2:07:02, 1.09s/it, loss=0.0261, lr=2.14e-05, step=2987] Training: 30%|██▉ | 2988/10000 [34:28<2:05:17, 1.07s/it, loss=0.0261, lr=2.14e-05, step=2987] Training: 30%|██▉ | 2988/10000 [34:28<2:05:17, 1.07s/it, loss=0.0216, lr=2.14e-05, step=2988] Training: 30%|██▉ | 2989/10000 [34:29<2:07:51, 1.09s/it, loss=0.0216, lr=2.14e-05, step=2988] Training: 30%|██▉ | 2989/10000 [34:29<2:07:51, 1.09s/it, loss=0.0377, lr=2.14e-05, step=2989]16:40:36.931 [I] step=2990 loss=0.0593 smoothed_loss=0.0273 lr=2.14e-05 grad_norm=0.5464 step_time=0.7881s data_time=0.2429s it/s=0.970 eta_to_10000=7224.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0119 grad_action_out_proj=0.2029 grad_shared_expert=0.4441 (10775:train_pytorch.py:850) + Training: 30%|██▉ | 2990/10000 [34:30<2:09:38, 1.11s/it, loss=0.0377, lr=2.14e-05, step=2989] Training: 30%|██▉ | 2990/10000 [34:30<2:09:38, 1.11s/it, loss=0.0593, lr=2.14e-05, step=2990] Training: 30%|██▉ | 2991/10000 [34:31<2:08:02, 1.10s/it, loss=0.0593, lr=2.14e-05, step=2990] Training: 30%|██▉ | 2991/10000 [34:31<2:08:02, 1.10s/it, loss=0.0370, lr=2.14e-05, step=2991] Training: 30%|██▉ | 2992/10000 [34:33<2:32:02, 1.30s/it, loss=0.0370, lr=2.14e-05, step=2991] Training: 30%|██▉ | 2992/10000 [34:33<2:32:02, 1.30s/it, loss=0.0339, lr=2.14e-05, step=2992] Training: 30%|██▉ | 2993/10000 [34:34<2:17:42, 1.18s/it, loss=0.0339, lr=2.14e-05, step=2992] Training: 30%|██▉ | 2993/10000 [34:34<2:17:42, 1.18s/it, loss=0.0046, lr=2.14e-05, step=2993] Training: 30%|██▉ | 2994/10000 [34:35<2:02:01, 1.04s/it, loss=0.0046, lr=2.14e-05, step=2993] Training: 30%|██▉ | 2994/10000 [34:35<2:02:01, 1.04s/it, loss=0.0220, lr=2.14e-05, step=2994] Training: 30%|██▉ | 2995/10000 [34:35<1:54:36, 1.02it/s, loss=0.0220, lr=2.14e-05, step=2994] Training: 30%|██▉ | 2995/10000 [34:36<1:54:36, 1.02it/s, loss=0.0341, lr=2.14e-05, step=2995] Training: 30%|██▉ | 2996/10000 [34:37<1:59:00, 1.02s/it, loss=0.0341, lr=2.14e-05, step=2995] Training: 30%|██▉ | 2996/10000 [34:37<1:59:00, 1.02s/it, loss=0.0105, lr=2.14e-05, step=2996] Training: 30%|██▉ | 2997/10000 [34:37<1:52:26, 1.04it/s, loss=0.0105, lr=2.14e-05, step=2996] Training: 30%|██▉ | 2997/10000 [34:37<1:52:26, 1.04it/s, loss=0.0118, lr=2.14e-05, step=2997] Training: 30%|██▉ | 2998/10000 [34:38<1:51:32, 1.05it/s, loss=0.0118, lr=2.14e-05, step=2997] Training: 30%|██▉ | 2998/10000 [34:38<1:51:32, 1.05it/s, loss=0.0373, lr=2.14e-05, step=2998] Training: 30%|██▉ | 2999/10000 [34:39<1:44:06, 1.12it/s, loss=0.0373, lr=2.14e-05, step=2998] Training: 30%|██▉ | 2999/10000 [34:39<1:44:06, 1.12it/s, loss=0.0315, lr=2.14e-05, step=2999]16:40:46.677 [I] step=3000 loss=0.0208 smoothed_loss=0.0253 lr=2.14e-05 grad_norm=0.5537 step_time=0.6869s data_time=0.2876s it/s=1.047 eta_to_10000=6683.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0203 grad_action_out_proj=0.2077 grad_shared_expert=0.6590 (10775:train_pytorch.py:850) +16:42:29.626 [I] Saved checkpoint at step 3000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/3000 (10775:train_pytorch.py:350) + Training: 30%|███ | 3000/10000 [36:23<61:37:52, 31.70s/it, loss=0.0315, lr=2.14e-05, step=2999] Training: 30%|███ | 3000/10000 [36:23<61:37:52, 31.70s/it, loss=0.0208, lr=2.14e-05, step=3000] Training: 30%|███ | 3001/10000 [36:23<43:28:36, 22.36s/it, loss=0.0208, lr=2.14e-05, step=3000] Training: 30%|███ | 3001/10000 [36:23<43:28:36, 22.36s/it, loss=0.0039, lr=2.14e-05, step=3001] Training: 30%|███ | 3002/10000 [36:24<30:52:06, 15.88s/it, loss=0.0039, lr=2.14e-05, step=3001] Training: 30%|███ | 3002/10000 [36:24<30:52:06, 15.88s/it, loss=0.0039, lr=2.14e-05, step=3002] Training: 30%|███ | 3003/10000 [36:25<22:07:43, 11.39s/it, loss=0.0039, lr=2.14e-05, step=3002] Training: 30%|███ | 3003/10000 [36:25<22:07:43, 11.39s/it, loss=0.0221, lr=2.14e-05, step=3003] Training: 30%|███ | 3004/10000 [36:25<15:49:14, 8.14s/it, loss=0.0221, lr=2.14e-05, step=3003] Training: 30%|███ | 3004/10000 [36:25<15:49:14, 8.14s/it, loss=0.0156, lr=2.14e-05, step=3004] Training: 30%|███ | 3005/10000 [36:26<11:23:33, 5.86s/it, loss=0.0156, lr=2.14e-05, step=3004] Training: 30%|███ | 3005/10000 [36:26<11:23:33, 5.86s/it, loss=0.0153, lr=2.14e-05, step=3005] Training: 30%|███ | 3006/10000 [36:27<8:18:08, 4.27s/it, loss=0.0153, lr=2.14e-05, step=3005] Training: 30%|███ | 3006/10000 [36:27<8:18:08, 4.27s/it, loss=0.0193, lr=2.14e-05, step=3006] Training: 30%|███ | 3007/10000 [36:27<6:10:19, 3.18s/it, loss=0.0193, lr=2.14e-05, step=3006] Training: 30%|███ | 3007/10000 [36:27<6:10:19, 3.18s/it, loss=0.0247, lr=2.14e-05, step=3007] Training: 30%|███ | 3008/10000 [36:28<4:39:00, 2.39s/it, loss=0.0247, lr=2.14e-05, step=3007] Training: 30%|███ | 3008/10000 [36:28<4:39:00, 2.39s/it, loss=0.0388, lr=2.14e-05, step=3008] Training: 30%|███ | 3009/10000 [36:28<3:34:45, 1.84s/it, loss=0.0388, lr=2.14e-05, step=3008] Training: 30%|███ | 3009/10000 [36:28<3:34:45, 1.84s/it, loss=0.0235, lr=2.13e-05, step=3009]16:42:36.127 [I] step=3010 loss=0.0067 smoothed_loss=0.0209 lr=2.14e-05 grad_norm=0.6044 step_time=0.5740s data_time=10.3710s it/s=0.091 eta_to_10000=76440.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0283 grad_action_out_proj=0.1709 grad_shared_expert=0.4894 (10775:train_pytorch.py:850) + Training: 30%|███ | 3010/10000 [36:29<2:59:33, 1.54s/it, loss=0.0235, lr=2.13e-05, step=3009] Training: 30%|███ | 3010/10000 [36:29<2:59:33, 1.54s/it, loss=0.0067, lr=2.13e-05, step=3010] Training: 30%|███ | 3011/10000 [36:30<2:23:22, 1.23s/it, loss=0.0067, lr=2.13e-05, step=3010] Training: 30%|███ | 3011/10000 [36:30<2:23:22, 1.23s/it, loss=0.0135, lr=2.13e-05, step=3011] Training: 30%|███ | 3012/10000 [36:30<1:59:46, 1.03s/it, loss=0.0135, lr=2.13e-05, step=3011] Training: 30%|███ | 3012/10000 [36:30<1:59:46, 1.03s/it, loss=0.0521, lr=2.13e-05, step=3012] Training: 30%|███ | 3013/10000 [36:31<1:48:46, 1.07it/s, loss=0.0521, lr=2.13e-05, step=3012] Training: 30%|███ | 3013/10000 [36:31<1:48:46, 1.07it/s, loss=0.0077, lr=2.13e-05, step=3013] Training: 30%|███ | 3014/10000 [36:32<1:35:37, 1.22it/s, loss=0.0077, lr=2.13e-05, step=3013] Training: 30%|███ | 3014/10000 [36:32<1:35:37, 1.22it/s, loss=0.0163, lr=2.13e-05, step=3014] Training: 30%|███ | 3015/10000 [36:32<1:26:19, 1.35it/s, loss=0.0163, lr=2.13e-05, step=3014] Training: 30%|███ | 3015/10000 [36:32<1:26:19, 1.35it/s, loss=0.0042, lr=2.13e-05, step=3015] Training: 30%|███ | 3016/10000 [36:33<1:19:12, 1.47it/s, loss=0.0042, lr=2.13e-05, step=3015] Training: 30%|███ | 3016/10000 [36:33<1:19:12, 1.47it/s, loss=0.0158, lr=2.13e-05, step=3016] Training: 30%|███ | 3017/10000 [36:34<1:28:46, 1.31it/s, loss=0.0158, lr=2.13e-05, step=3016] Training: 30%|███ | 3017/10000 [36:34<1:28:46, 1.31it/s, loss=0.0166, lr=2.13e-05, step=3017] Training: 30%|███ | 3018/10000 [36:35<1:39:25, 1.17it/s, loss=0.0166, lr=2.13e-05, step=3017] Training: 30%|███ | 3018/10000 [36:35<1:39:25, 1.17it/s, loss=0.0324, lr=2.13e-05, step=3018] Training: 30%|███ | 3019/10000 [36:35<1:36:53, 1.20it/s, loss=0.0324, lr=2.13e-05, step=3018] Training: 30%|███ | 3019/10000 [36:35<1:36:53, 1.20it/s, loss=0.0093, lr=2.13e-05, step=3019]16:42:42.972 [I] step=3020 loss=0.0130 smoothed_loss=0.0186 lr=2.13e-05 grad_norm=0.5422 step_time=0.5827s data_time=0.1017s it/s=1.461 eta_to_10000=4777.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1146 grad_shared_expert=0.4202 (10775:train_pytorch.py:850) + Training: 30%|███ | 3020/10000 [36:36<1:29:10, 1.30it/s, loss=0.0093, lr=2.13e-05, step=3019] Training: 30%|███ | 3020/10000 [36:36<1:29:10, 1.30it/s, loss=0.0130, lr=2.13e-05, step=3020] Training: 30%|███ | 3021/10000 [36:37<1:26:24, 1.35it/s, loss=0.0130, lr=2.13e-05, step=3020] Training: 30%|███ | 3021/10000 [36:37<1:26:24, 1.35it/s, loss=0.0449, lr=2.13e-05, step=3021] Training: 30%|███ | 3022/10000 [36:37<1:18:55, 1.47it/s, loss=0.0449, lr=2.13e-05, step=3021] Training: 30%|███ | 3022/10000 [36:37<1:18:55, 1.47it/s, loss=0.0317, lr=2.13e-05, step=3022] Training: 30%|███ | 3023/10000 [36:38<1:14:14, 1.57it/s, loss=0.0317, lr=2.13e-05, step=3022] Training: 30%|███ | 3023/10000 [36:38<1:14:14, 1.57it/s, loss=0.0202, lr=2.13e-05, step=3023] Training: 30%|███ | 3024/10000 [36:38<1:10:07, 1.66it/s, loss=0.0202, lr=2.13e-05, step=3023] Training: 30%|███ | 3024/10000 [36:38<1:10:07, 1.66it/s, loss=0.0060, lr=2.13e-05, step=3024] Training: 30%|███ | 3025/10000 [36:40<1:30:25, 1.29it/s, loss=0.0060, lr=2.13e-05, step=3024] Training: 30%|███ | 3025/10000 [36:40<1:30:25, 1.29it/s, loss=0.0091, lr=2.13e-05, step=3025] Training: 30%|███ | 3026/10000 [36:40<1:22:41, 1.41it/s, loss=0.0091, lr=2.13e-05, step=3025] Training: 30%|███ | 3026/10000 [36:40<1:22:41, 1.41it/s, loss=0.0212, lr=2.13e-05, step=3026] Training: 30%|███ | 3027/10000 [36:41<1:18:32, 1.48it/s, loss=0.0212, lr=2.13e-05, step=3026] Training: 30%|███ | 3027/10000 [36:41<1:18:32, 1.48it/s, loss=0.0183, lr=2.13e-05, step=3027] Training: 30%|███ | 3028/10000 [36:41<1:19:26, 1.46it/s, loss=0.0183, lr=2.13e-05, step=3027] Training: 30%|███ | 3028/10000 [36:41<1:19:26, 1.46it/s, loss=0.1034, lr=2.13e-05, step=3028] Training: 30%|███ | 3029/10000 [36:42<1:19:36, 1.46it/s, loss=0.1034, lr=2.13e-05, step=3028] Training: 30%|███ | 3029/10000 [36:42<1:19:36, 1.46it/s, loss=0.0845, lr=2.13e-05, step=3029]16:42:49.857 [I] step=3030 loss=0.0088 smoothed_loss=0.0310 lr=2.13e-05 grad_norm=0.5285 step_time=0.5798s data_time=0.1087s it/s=1.453 eta_to_10000=4797.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0351 grad_action_out_proj=0.2588 grad_shared_expert=0.8347 (10775:train_pytorch.py:850) + Training: 30%|███ | 3030/10000 [36:43<1:26:20, 1.35it/s, loss=0.0845, lr=2.13e-05, step=3029] Training: 30%|███ | 3030/10000 [36:43<1:26:20, 1.35it/s, loss=0.0088, lr=2.13e-05, step=3030] Training: 30%|███ | 3031/10000 [36:44<1:38:09, 1.18it/s, loss=0.0088, lr=2.13e-05, step=3030] Training: 30%|███ | 3031/10000 [36:44<1:38:09, 1.18it/s, loss=0.0064, lr=2.13e-05, step=3031] Training: 30%|███ | 3032/10000 [36:45<1:41:03, 1.15it/s, loss=0.0064, lr=2.13e-05, step=3031] Training: 30%|███ | 3032/10000 [36:45<1:41:03, 1.15it/s, loss=0.0116, lr=2.13e-05, step=3032] Training: 30%|███ | 3033/10000 [36:45<1:29:45, 1.29it/s, loss=0.0116, lr=2.13e-05, step=3032] Training: 30%|███ | 3033/10000 [36:45<1:29:45, 1.29it/s, loss=0.0271, lr=2.13e-05, step=3033] Training: 30%|███ | 3034/10000 [36:46<1:22:57, 1.40it/s, loss=0.0271, lr=2.13e-05, step=3033] Training: 30%|███ | 3034/10000 [36:46<1:22:57, 1.40it/s, loss=0.0370, lr=2.13e-05, step=3034] Training: 30%|███ | 3035/10000 [36:47<1:15:36, 1.54it/s, loss=0.0370, lr=2.13e-05, step=3034] Training: 30%|███ | 3035/10000 [36:47<1:15:36, 1.54it/s, loss=0.0072, lr=2.13e-05, step=3035] Training: 30%|███ | 3036/10000 [36:47<1:16:10, 1.52it/s, loss=0.0072, lr=2.13e-05, step=3035] Training: 30%|███ | 3036/10000 [36:47<1:16:10, 1.52it/s, loss=0.0212, lr=2.13e-05, step=3036] Training: 30%|███ | 3037/10000 [36:48<1:21:36, 1.42it/s, loss=0.0212, lr=2.13e-05, step=3036] Training: 30%|███ | 3037/10000 [36:48<1:21:36, 1.42it/s, loss=0.0034, lr=2.13e-05, step=3037] Training: 30%|███ | 3038/10000 [36:49<1:29:29, 1.30it/s, loss=0.0034, lr=2.13e-05, step=3037] Training: 30%|███ | 3038/10000 [36:49<1:29:29, 1.30it/s, loss=0.0174, lr=2.13e-05, step=3038] Training: 30%|███ | 3039/10000 [36:50<1:39:39, 1.16it/s, loss=0.0174, lr=2.13e-05, step=3038] Training: 30%|███ | 3039/10000 [36:50<1:39:39, 1.16it/s, loss=0.0224, lr=2.13e-05, step=3039]16:42:57.588 [I] step=3040 loss=0.0178 smoothed_loss=0.0221 lr=2.13e-05 grad_norm=0.5096 step_time=0.6376s data_time=0.1355s it/s=1.294 eta_to_10000=5379.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.1172 grad_shared_expert=0.4159 (10775:train_pytorch.py:850) + Training: 30%|███ | 3040/10000 [36:51<1:31:11, 1.27it/s, loss=0.0224, lr=2.13e-05, step=3039] Training: 30%|███ | 3040/10000 [36:51<1:31:11, 1.27it/s, loss=0.0178, lr=2.13e-05, step=3040] Training: 30%|███ | 3041/10000 [36:51<1:29:19, 1.30it/s, loss=0.0178, lr=2.13e-05, step=3040] Training: 30%|███ | 3041/10000 [36:51<1:29:19, 1.30it/s, loss=0.0165, lr=2.13e-05, step=3041] Training: 30%|███ | 3042/10000 [36:53<2:04:57, 1.08s/it, loss=0.0165, lr=2.13e-05, step=3041] Training: 30%|███ | 3042/10000 [36:53<2:04:57, 1.08s/it, loss=0.0322, lr=2.13e-05, step=3042] Training: 30%|███ | 3043/10000 [36:55<2:21:18, 1.22s/it, loss=0.0322, lr=2.13e-05, step=3042] Training: 30%|███ | 3043/10000 [36:55<2:21:18, 1.22s/it, loss=0.0269, lr=2.13e-05, step=3043] Training: 30%|███ | 3044/10000 [36:56<2:12:59, 1.15s/it, loss=0.0269, lr=2.13e-05, step=3043] Training: 30%|███ | 3044/10000 [36:56<2:12:59, 1.15s/it, loss=0.0285, lr=2.13e-05, step=3044] Training: 30%|███ | 3045/10000 [36:57<2:10:30, 1.13s/it, loss=0.0285, lr=2.13e-05, step=3044] Training: 30%|███ | 3045/10000 [36:57<2:10:30, 1.13s/it, loss=0.0415, lr=2.12e-05, step=3045] Training: 30%|███ | 3046/10000 [36:58<2:07:26, 1.10s/it, loss=0.0415, lr=2.12e-05, step=3045] Training: 30%|███ | 3046/10000 [36:58<2:07:26, 1.10s/it, loss=0.0022, lr=2.12e-05, step=3046] Training: 30%|███ | 3047/10000 [36:59<1:54:17, 1.01it/s, loss=0.0022, lr=2.12e-05, step=3046] Training: 30%|███ | 3047/10000 [36:59<1:54:17, 1.01it/s, loss=0.0183, lr=2.12e-05, step=3047] Training: 30%|███ | 3048/10000 [36:59<1:39:32, 1.16it/s, loss=0.0183, lr=2.12e-05, step=3047] Training: 30%|███ | 3048/10000 [36:59<1:39:32, 1.16it/s, loss=0.0167, lr=2.12e-05, step=3048] Training: 30%|███ | 3049/10000 [37:00<1:51:15, 1.04it/s, loss=0.0167, lr=2.12e-05, step=3048] Training: 30%|███ | 3049/10000 [37:00<1:51:15, 1.04it/s, loss=0.0139, lr=2.12e-05, step=3049]16:43:07.829 [I] step=3050 loss=0.0128 smoothed_loss=0.0203 lr=2.12e-05 grad_norm=0.6803 step_time=0.7186s data_time=0.3054s it/s=0.977 eta_to_10000=7116.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0081 grad_action_out_proj=0.1240 grad_shared_expert=0.4570 (10775:train_pytorch.py:850) + Training: 30%|███ | 3050/10000 [37:01<1:38:22, 1.18it/s, loss=0.0139, lr=2.12e-05, step=3049] Training: 30%|███ | 3050/10000 [37:01<1:38:22, 1.18it/s, loss=0.0128, lr=2.12e-05, step=3050] Training: 31%|███ | 3051/10000 [37:02<1:34:20, 1.23it/s, loss=0.0128, lr=2.12e-05, step=3050] Training: 31%|███ | 3051/10000 [37:02<1:34:20, 1.23it/s, loss=0.0087, lr=2.12e-05, step=3051] Training: 31%|███ | 3052/10000 [37:02<1:27:14, 1.33it/s, loss=0.0087, lr=2.12e-05, step=3051] Training: 31%|███ | 3052/10000 [37:02<1:27:14, 1.33it/s, loss=0.0136, lr=2.12e-05, step=3052] Training: 31%|███ | 3053/10000 [37:03<1:36:46, 1.20it/s, loss=0.0136, lr=2.12e-05, step=3052] Training: 31%|███ | 3053/10000 [37:03<1:36:46, 1.20it/s, loss=0.0075, lr=2.12e-05, step=3053] Training: 31%|███ | 3054/10000 [37:04<1:25:48, 1.35it/s, loss=0.0075, lr=2.12e-05, step=3053] Training: 31%|███ | 3054/10000 [37:04<1:25:48, 1.35it/s, loss=0.0248, lr=2.12e-05, step=3054] Training: 31%|███ | 3055/10000 [37:05<1:25:20, 1.36it/s, loss=0.0248, lr=2.12e-05, step=3054] Training: 31%|███ | 3055/10000 [37:05<1:25:20, 1.36it/s, loss=0.0144, lr=2.12e-05, step=3055] Training: 31%|███ | 3056/10000 [37:05<1:27:03, 1.33it/s, loss=0.0144, lr=2.12e-05, step=3055] Training: 31%|███ | 3056/10000 [37:05<1:27:03, 1.33it/s, loss=0.0209, lr=2.12e-05, step=3056] Training: 31%|███ | 3057/10000 [37:06<1:28:13, 1.31it/s, loss=0.0209, lr=2.12e-05, step=3056] Training: 31%|███ | 3057/10000 [37:06<1:28:13, 1.31it/s, loss=0.0129, lr=2.12e-05, step=3057] Training: 31%|███ | 3058/10000 [37:07<1:32:17, 1.25it/s, loss=0.0129, lr=2.12e-05, step=3057] Training: 31%|███ | 3058/10000 [37:07<1:32:17, 1.25it/s, loss=0.0069, lr=2.12e-05, step=3058] Training: 31%|███ | 3059/10000 [37:08<1:37:34, 1.19it/s, loss=0.0069, lr=2.12e-05, step=3058] Training: 31%|███ | 3059/10000 [37:08<1:37:34, 1.19it/s, loss=0.0448, lr=2.12e-05, step=3059]16:43:16.233 [I] step=3060 loss=0.0264 smoothed_loss=0.0201 lr=2.12e-05 grad_norm=0.4947 step_time=0.6510s data_time=0.1894s it/s=1.190 eta_to_10000=5830.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0304 grad_action_out_proj=0.2489 grad_shared_expert=0.6174 (10775:train_pytorch.py:850) + Training: 31%|███ | 3060/10000 [37:09<1:56:12, 1.00s/it, loss=0.0448, lr=2.12e-05, step=3059] Training: 31%|███ | 3060/10000 [37:09<1:56:12, 1.00s/it, loss=0.0264, lr=2.12e-05, step=3060] Training: 31%|███ | 3061/10000 [37:10<1:50:48, 1.04it/s, loss=0.0264, lr=2.12e-05, step=3060] Training: 31%|███ | 3061/10000 [37:10<1:50:48, 1.04it/s, loss=0.0537, lr=2.12e-05, step=3061] Training: 31%|███ | 3062/10000 [37:11<1:37:57, 1.18it/s, loss=0.0537, lr=2.12e-05, step=3061] Training: 31%|███ | 3062/10000 [37:11<1:37:57, 1.18it/s, loss=0.0102, lr=2.12e-05, step=3062] Training: 31%|███ | 3063/10000 [37:12<1:43:01, 1.12it/s, loss=0.0102, lr=2.12e-05, step=3062] Training: 31%|███ | 3063/10000 [37:12<1:43:01, 1.12it/s, loss=0.0155, lr=2.12e-05, step=3063] Training: 31%|███ | 3064/10000 [37:12<1:36:40, 1.20it/s, loss=0.0155, lr=2.12e-05, step=3063] Training: 31%|███ | 3064/10000 [37:12<1:36:40, 1.20it/s, loss=0.0289, lr=2.12e-05, step=3064] Training: 31%|███ | 3065/10000 [37:13<1:41:08, 1.14it/s, loss=0.0289, lr=2.12e-05, step=3064] Training: 31%|███ | 3065/10000 [37:13<1:41:08, 1.14it/s, loss=0.0158, lr=2.12e-05, step=3065] Training: 31%|███ | 3066/10000 [37:15<1:51:25, 1.04it/s, loss=0.0158, lr=2.12e-05, step=3065] Training: 31%|███ | 3066/10000 [37:15<1:51:25, 1.04it/s, loss=0.0061, lr=2.12e-05, step=3066] Training: 31%|███ | 3067/10000 [37:16<1:56:07, 1.01s/it, loss=0.0061, lr=2.12e-05, step=3066] Training: 31%|███ | 3067/10000 [37:16<1:56:07, 1.01s/it, loss=0.0087, lr=2.12e-05, step=3067] Training: 31%|███ | 3068/10000 [37:16<1:41:23, 1.14it/s, loss=0.0087, lr=2.12e-05, step=3067] Training: 31%|███ | 3068/10000 [37:16<1:41:23, 1.14it/s, loss=0.0128, lr=2.12e-05, step=3068] Training: 31%|███ | 3069/10000 [37:17<1:43:22, 1.12it/s, loss=0.0128, lr=2.12e-05, step=3068] Training: 31%|███ | 3069/10000 [37:17<1:43:22, 1.12it/s, loss=0.0361, lr=2.12e-05, step=3069]16:43:24.867 [I] step=3070 loss=0.0130 smoothed_loss=0.0194 lr=2.12e-05 grad_norm=0.5982 step_time=0.6471s data_time=0.2163s it/s=1.159 eta_to_10000=5980.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0150 grad_action_out_proj=0.1483 grad_shared_expert=0.6999 (10775:train_pytorch.py:850) + Training: 31%|███ | 3070/10000 [37:18<1:37:55, 1.18it/s, loss=0.0361, lr=2.12e-05, step=3069] Training: 31%|███ | 3070/10000 [37:18<1:37:55, 1.18it/s, loss=0.0130, lr=2.12e-05, step=3070] Training: 31%|███ | 3071/10000 [37:18<1:25:53, 1.34it/s, loss=0.0130, lr=2.12e-05, step=3070] Training: 31%|███ | 3071/10000 [37:18<1:25:53, 1.34it/s, loss=0.0085, lr=2.12e-05, step=3071] Training: 31%|███ | 3072/10000 [37:19<1:32:15, 1.25it/s, loss=0.0085, lr=2.12e-05, step=3071] Training: 31%|███ | 3072/10000 [37:19<1:32:15, 1.25it/s, loss=0.0346, lr=2.12e-05, step=3072] Training: 31%|███ | 3073/10000 [37:20<1:35:09, 1.21it/s, loss=0.0346, lr=2.12e-05, step=3072] Training: 31%|███ | 3073/10000 [37:20<1:35:09, 1.21it/s, loss=0.0273, lr=2.12e-05, step=3073] Training: 31%|███ | 3074/10000 [37:21<1:47:24, 1.07it/s, loss=0.0273, lr=2.12e-05, step=3073] Training: 31%|███ | 3074/10000 [37:21<1:47:24, 1.07it/s, loss=0.0112, lr=2.12e-05, step=3074] Training: 31%|███ | 3075/10000 [37:22<1:41:44, 1.13it/s, loss=0.0112, lr=2.12e-05, step=3074] Training: 31%|███ | 3075/10000 [37:22<1:41:44, 1.13it/s, loss=0.0265, lr=2.12e-05, step=3075] Training: 31%|███ | 3076/10000 [37:23<1:45:49, 1.09it/s, loss=0.0265, lr=2.12e-05, step=3075] Training: 31%|███ | 3076/10000 [37:23<1:45:49, 1.09it/s, loss=0.0183, lr=2.12e-05, step=3076] Training: 31%|███ | 3077/10000 [37:24<1:43:15, 1.12it/s, loss=0.0183, lr=2.12e-05, step=3076] Training: 31%|███ | 3077/10000 [37:24<1:43:15, 1.12it/s, loss=0.0175, lr=2.12e-05, step=3077] Training: 31%|███ | 3078/10000 [37:25<1:35:03, 1.21it/s, loss=0.0175, lr=2.12e-05, step=3077] Training: 31%|███ | 3078/10000 [37:25<1:35:03, 1.21it/s, loss=0.0100, lr=2.12e-05, step=3078] Training: 31%|███ | 3079/10000 [37:25<1:27:56, 1.31it/s, loss=0.0100, lr=2.12e-05, step=3078] Training: 31%|███ | 3079/10000 [37:25<1:27:56, 1.31it/s, loss=0.0586, lr=2.12e-05, step=3079]16:43:33.093 [I] step=3080 loss=0.0605 smoothed_loss=0.0266 lr=2.12e-05 grad_norm=0.4756 step_time=0.6535s data_time=0.1692s it/s=1.216 eta_to_10000=5692.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0256 grad_action_out_proj=0.2000 grad_shared_expert=0.4459 (10775:train_pytorch.py:850) + Training: 31%|███ | 3080/10000 [37:26<1:30:55, 1.27it/s, loss=0.0586, lr=2.12e-05, step=3079] Training: 31%|███ | 3080/10000 [37:26<1:30:55, 1.27it/s, loss=0.0605, lr=2.12e-05, step=3080] Training: 31%|███ | 3081/10000 [37:27<1:23:55, 1.37it/s, loss=0.0605, lr=2.12e-05, step=3080] Training: 31%|███ | 3081/10000 [37:27<1:23:55, 1.37it/s, loss=0.0035, lr=2.11e-05, step=3081] Training: 31%|███ | 3082/10000 [37:28<1:37:07, 1.19it/s, loss=0.0035, lr=2.11e-05, step=3081] Training: 31%|███ | 3082/10000 [37:28<1:37:07, 1.19it/s, loss=0.0094, lr=2.11e-05, step=3082] Training: 31%|███ | 3083/10000 [37:28<1:25:21, 1.35it/s, loss=0.0094, lr=2.11e-05, step=3082] Training: 31%|███ | 3083/10000 [37:28<1:25:21, 1.35it/s, loss=0.1450, lr=2.11e-05, step=3083] Training: 31%|███ | 3084/10000 [37:29<1:17:23, 1.49it/s, loss=0.1450, lr=2.11e-05, step=3083] Training: 31%|███ | 3084/10000 [37:29<1:17:23, 1.49it/s, loss=0.0212, lr=2.11e-05, step=3084] Training: 31%|███ | 3085/10000 [37:29<1:12:14, 1.60it/s, loss=0.0212, lr=2.11e-05, step=3084] Training: 31%|███ | 3085/10000 [37:29<1:12:14, 1.60it/s, loss=0.0258, lr=2.11e-05, step=3085] Training: 31%|███ | 3086/10000 [37:30<1:11:36, 1.61it/s, loss=0.0258, lr=2.11e-05, step=3085] Training: 31%|███ | 3086/10000 [37:30<1:11:36, 1.61it/s, loss=0.0052, lr=2.11e-05, step=3086] Training: 31%|███ | 3087/10000 [37:31<1:10:21, 1.64it/s, loss=0.0052, lr=2.11e-05, step=3086] Training: 31%|███ | 3087/10000 [37:31<1:10:21, 1.64it/s, loss=0.0425, lr=2.11e-05, step=3087] Training: 31%|███ | 3088/10000 [37:31<1:06:47, 1.72it/s, loss=0.0425, lr=2.11e-05, step=3087] Training: 31%|███ | 3088/10000 [37:31<1:06:47, 1.72it/s, loss=0.0088, lr=2.11e-05, step=3088] Training: 31%|███ | 3089/10000 [37:32<1:14:16, 1.55it/s, loss=0.0088, lr=2.11e-05, step=3088] Training: 31%|███ | 3089/10000 [37:32<1:14:16, 1.55it/s, loss=0.0136, lr=2.11e-05, step=3089]16:43:39.880 [I] step=3090 loss=0.0150 smoothed_loss=0.0263 lr=2.11e-05 grad_norm=0.6384 step_time=0.5832s data_time=0.0955s it/s=1.474 eta_to_10000=4688.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0160 grad_action_out_proj=0.2071 grad_shared_expert=0.5276 (10775:train_pytorch.py:850) + Training: 31%|███ | 3090/10000 [37:33<1:28:28, 1.30it/s, loss=0.0136, lr=2.11e-05, step=3089] Training: 31%|███ | 3090/10000 [37:33<1:28:28, 1.30it/s, loss=0.0150, lr=2.11e-05, step=3090] Training: 31%|███ | 3091/10000 [37:33<1:19:53, 1.44it/s, loss=0.0150, lr=2.11e-05, step=3090] Training: 31%|███ | 3091/10000 [37:33<1:19:53, 1.44it/s, loss=0.0072, lr=2.11e-05, step=3091] Training: 31%|███ | 3092/10000 [37:34<1:13:55, 1.56it/s, loss=0.0072, lr=2.11e-05, step=3091] Training: 31%|███ | 3092/10000 [37:34<1:13:55, 1.56it/s, loss=0.0707, lr=2.11e-05, step=3092] Training: 31%|███ | 3093/10000 [37:35<1:09:53, 1.65it/s, loss=0.0707, lr=2.11e-05, step=3092] Training: 31%|███ | 3093/10000 [37:35<1:09:53, 1.65it/s, loss=0.0176, lr=2.11e-05, step=3093] Training: 31%|███ | 3094/10000 [37:35<1:07:53, 1.70it/s, loss=0.0176, lr=2.11e-05, step=3093] Training: 31%|███ | 3094/10000 [37:35<1:07:53, 1.70it/s, loss=0.0253, lr=2.11e-05, step=3094] Training: 31%|███ | 3095/10000 [37:36<1:05:05, 1.77it/s, loss=0.0253, lr=2.11e-05, step=3094] Training: 31%|███ | 3095/10000 [37:36<1:05:05, 1.77it/s, loss=0.1430, lr=2.11e-05, step=3095] Training: 31%|███ | 3096/10000 [37:36<1:15:20, 1.53it/s, loss=0.1430, lr=2.11e-05, step=3095] Training: 31%|███ | 3096/10000 [37:36<1:15:20, 1.53it/s, loss=0.0041, lr=2.11e-05, step=3096] Training: 31%|███ | 3097/10000 [37:37<1:17:56, 1.48it/s, loss=0.0041, lr=2.11e-05, step=3096] Training: 31%|███ | 3097/10000 [37:37<1:17:56, 1.48it/s, loss=0.0069, lr=2.11e-05, step=3097] Training: 31%|███ | 3098/10000 [37:38<1:12:37, 1.58it/s, loss=0.0069, lr=2.11e-05, step=3097] Training: 31%|███ | 3098/10000 [37:38<1:12:37, 1.58it/s, loss=0.0532, lr=2.11e-05, step=3098] Training: 31%|███ | 3099/10000 [37:38<1:13:03, 1.57it/s, loss=0.0532, lr=2.11e-05, step=3098] Training: 31%|███ | 3099/10000 [37:38<1:13:03, 1.57it/s, loss=0.0296, lr=2.11e-05, step=3099]16:43:45.854 [I] step=3100 loss=0.1251 smoothed_loss=0.0434 lr=2.11e-05 grad_norm=0.5078 step_time=0.5244s data_time=0.0730s it/s=1.674 eta_to_10000=4121.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0244 grad_action_out_proj=0.1917 grad_shared_expert=0.4759 (10775:train_pytorch.py:850) + Training: 31%|███ | 3100/10000 [37:39<1:11:22, 1.61it/s, loss=0.0296, lr=2.11e-05, step=3099] Training: 31%|███ | 3100/10000 [37:39<1:11:22, 1.61it/s, loss=0.1251, lr=2.11e-05, step=3100] Training: 31%|███ | 3101/10000 [37:39<1:08:04, 1.69it/s, loss=0.1251, lr=2.11e-05, step=3100] Training: 31%|███ | 3101/10000 [37:39<1:08:04, 1.69it/s, loss=0.0466, lr=2.11e-05, step=3101] Training: 31%|███ | 3102/10000 [37:40<1:05:28, 1.76it/s, loss=0.0466, lr=2.11e-05, step=3101] Training: 31%|███ | 3102/10000 [37:40<1:05:28, 1.76it/s, loss=0.0085, lr=2.11e-05, step=3102] Training: 31%|███ | 3103/10000 [37:41<1:15:13, 1.53it/s, loss=0.0085, lr=2.11e-05, step=3102] Training: 31%|███ | 3103/10000 [37:41<1:15:13, 1.53it/s, loss=0.0222, lr=2.11e-05, step=3103] Training: 31%|███ | 3104/10000 [37:41<1:11:27, 1.61it/s, loss=0.0222, lr=2.11e-05, step=3103] Training: 31%|███ | 3104/10000 [37:41<1:11:27, 1.61it/s, loss=0.0049, lr=2.11e-05, step=3104] Training: 31%|███ | 3105/10000 [37:42<1:15:24, 1.52it/s, loss=0.0049, lr=2.11e-05, step=3104] Training: 31%|███ | 3105/10000 [37:42<1:15:24, 1.52it/s, loss=0.0267, lr=2.11e-05, step=3105] Training: 31%|███ | 3106/10000 [37:43<1:11:13, 1.61it/s, loss=0.0267, lr=2.11e-05, step=3105] Training: 31%|███ | 3106/10000 [37:43<1:11:13, 1.61it/s, loss=0.0183, lr=2.11e-05, step=3106] Training: 31%|███ | 3107/10000 [37:43<1:07:33, 1.70it/s, loss=0.0183, lr=2.11e-05, step=3106] Training: 31%|███ | 3107/10000 [37:43<1:07:33, 1.70it/s, loss=0.0189, lr=2.11e-05, step=3107] Training: 31%|███ | 3108/10000 [37:44<1:05:41, 1.75it/s, loss=0.0189, lr=2.11e-05, step=3107] Training: 31%|███ | 3108/10000 [37:44<1:05:41, 1.75it/s, loss=0.0156, lr=2.11e-05, step=3108] Training: 31%|███ | 3109/10000 [37:44<1:03:38, 1.80it/s, loss=0.0156, lr=2.11e-05, step=3108] Training: 31%|███ | 3109/10000 [37:44<1:03:38, 1.80it/s, loss=0.0246, lr=2.11e-05, step=3109]16:43:52.040 [I] step=3110 loss=0.0038 smoothed_loss=0.0266 lr=2.11e-05 grad_norm=0.5391 step_time=0.5535s data_time=0.0650s it/s=1.617 eta_to_10000=4261.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0114 grad_action_out_proj=0.1633 grad_shared_expert=0.4352 (10775:train_pytorch.py:850) + Training: 31%|███ | 3110/10000 [37:45<1:15:58, 1.51it/s, loss=0.0246, lr=2.11e-05, step=3109] Training: 31%|███ | 3110/10000 [37:45<1:15:58, 1.51it/s, loss=0.0038, lr=2.11e-05, step=3110] Training: 31%|███ | 3111/10000 [37:46<1:11:55, 1.60it/s, loss=0.0038, lr=2.11e-05, step=3110] Training: 31%|███ | 3111/10000 [37:46<1:11:55, 1.60it/s, loss=0.0150, lr=2.11e-05, step=3111] Training: 31%|███ | 3112/10000 [37:46<1:16:11, 1.51it/s, loss=0.0150, lr=2.11e-05, step=3111] Training: 31%|███ | 3112/10000 [37:46<1:16:11, 1.51it/s, loss=0.0098, lr=2.11e-05, step=3112] Training: 31%|███ | 3113/10000 [37:47<1:15:20, 1.52it/s, loss=0.0098, lr=2.11e-05, step=3112] Training: 31%|███ | 3113/10000 [37:47<1:15:20, 1.52it/s, loss=0.0153, lr=2.11e-05, step=3113] Training: 31%|███ | 3114/10000 [37:48<1:21:17, 1.41it/s, loss=0.0153, lr=2.11e-05, step=3113] Training: 31%|███ | 3114/10000 [37:48<1:21:17, 1.41it/s, loss=0.0401, lr=2.11e-05, step=3114] Training: 31%|███ | 3115/10000 [37:48<1:14:58, 1.53it/s, loss=0.0401, lr=2.11e-05, step=3114] Training: 31%|███ | 3115/10000 [37:48<1:14:58, 1.53it/s, loss=0.0333, lr=2.11e-05, step=3115] Training: 31%|███ | 3116/10000 [37:49<1:10:01, 1.64it/s, loss=0.0333, lr=2.11e-05, step=3115] Training: 31%|███ | 3116/10000 [37:49<1:10:01, 1.64it/s, loss=0.0100, lr=2.10e-05, step=3116] Training: 31%|███ | 3117/10000 [37:50<1:18:48, 1.46it/s, loss=0.0100, lr=2.10e-05, step=3116] Training: 31%|███ | 3117/10000 [37:50<1:18:48, 1.46it/s, loss=0.0227, lr=2.10e-05, step=3117] Training: 31%|███ | 3118/10000 [37:50<1:12:46, 1.58it/s, loss=0.0227, lr=2.10e-05, step=3117] Training: 31%|███ | 3118/10000 [37:50<1:12:46, 1.58it/s, loss=0.0174, lr=2.10e-05, step=3118] Training: 31%|███ | 3119/10000 [37:51<1:23:16, 1.38it/s, loss=0.0174, lr=2.10e-05, step=3118] Training: 31%|███ | 3119/10000 [37:51<1:23:16, 1.38it/s, loss=0.0232, lr=2.10e-05, step=3119]16:43:58.845 [I] step=3120 loss=0.0207 smoothed_loss=0.0230 lr=2.11e-05 grad_norm=0.4628 step_time=0.5641s data_time=0.1164s it/s=1.470 eta_to_10000=4681.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0176 grad_action_out_proj=0.1760 grad_shared_expert=0.3513 (10775:train_pytorch.py:850) + Training: 31%|███ | 3120/10000 [37:52<1:21:59, 1.40it/s, loss=0.0232, lr=2.10e-05, step=3119] Training: 31%|███ | 3120/10000 [37:52<1:21:59, 1.40it/s, loss=0.0207, lr=2.10e-05, step=3120] Training: 31%|███ | 3121/10000 [37:53<1:17:40, 1.48it/s, loss=0.0207, lr=2.10e-05, step=3120] Training: 31%|███ | 3121/10000 [37:53<1:17:40, 1.48it/s, loss=0.0130, lr=2.10e-05, step=3121] Training: 31%|███ | 3122/10000 [37:53<1:14:48, 1.53it/s, loss=0.0130, lr=2.10e-05, step=3121] Training: 31%|███ | 3122/10000 [37:53<1:14:48, 1.53it/s, loss=0.0174, lr=2.10e-05, step=3122] Training: 31%|███ | 3123/10000 [37:54<1:29:02, 1.29it/s, loss=0.0174, lr=2.10e-05, step=3122] Training: 31%|███ | 3123/10000 [37:54<1:29:02, 1.29it/s, loss=0.0141, lr=2.10e-05, step=3123] Training: 31%|███ | 3124/10000 [37:55<1:24:47, 1.35it/s, loss=0.0141, lr=2.10e-05, step=3123] Training: 31%|███ | 3124/10000 [37:55<1:24:47, 1.35it/s, loss=0.0307, lr=2.10e-05, step=3124] Training: 31%|███▏ | 3125/10000 [37:56<1:38:22, 1.16it/s, loss=0.0307, lr=2.10e-05, step=3124] Training: 31%|███▏ | 3125/10000 [37:56<1:38:22, 1.16it/s, loss=0.0029, lr=2.10e-05, step=3125] Training: 31%|███▏ | 3126/10000 [37:57<1:34:16, 1.22it/s, loss=0.0029, lr=2.10e-05, step=3125] Training: 31%|███▏ | 3126/10000 [37:57<1:34:16, 1.22it/s, loss=0.0173, lr=2.10e-05, step=3126] Training: 31%|███▏ | 3127/10000 [37:57<1:25:03, 1.35it/s, loss=0.0173, lr=2.10e-05, step=3126] Training: 31%|███▏ | 3127/10000 [37:57<1:25:03, 1.35it/s, loss=0.0200, lr=2.10e-05, step=3127] Training: 31%|███▏ | 3128/10000 [37:58<1:27:35, 1.31it/s, loss=0.0200, lr=2.10e-05, step=3127] Training: 31%|███▏ | 3128/10000 [37:58<1:27:35, 1.31it/s, loss=0.0128, lr=2.10e-05, step=3128] Training: 31%|███▏ | 3129/10000 [37:59<1:22:01, 1.40it/s, loss=0.0128, lr=2.10e-05, step=3128] Training: 31%|███▏ | 3129/10000 [37:59<1:22:01, 1.40it/s, loss=0.0065, lr=2.10e-05, step=3129]16:44:06.206 [I] step=3130 loss=0.0275 smoothed_loss=0.0187 lr=2.10e-05 grad_norm=0.5038 step_time=0.5967s data_time=0.1394s it/s=1.359 eta_to_10000=5056.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.1701 grad_shared_expert=0.4549 (10775:train_pytorch.py:850) + Training: 31%|███▏ | 3130/10000 [37:59<1:18:14, 1.46it/s, loss=0.0065, lr=2.10e-05, step=3129] Training: 31%|███▏ | 3130/10000 [37:59<1:18:14, 1.46it/s, loss=0.0275, lr=2.10e-05, step=3130] Training: 31%|███▏ | 3131/10000 [38:00<1:11:44, 1.60it/s, loss=0.0275, lr=2.10e-05, step=3130] Training: 31%|███▏ | 3131/10000 [38:00<1:11:44, 1.60it/s, loss=0.0143, lr=2.10e-05, step=3131] Training: 31%|███▏ | 3132/10000 [38:01<1:22:36, 1.39it/s, loss=0.0143, lr=2.10e-05, step=3131] Training: 31%|███▏ | 3132/10000 [38:01<1:22:36, 1.39it/s, loss=0.0627, lr=2.10e-05, step=3132] Training: 31%|███▏ | 3133/10000 [38:01<1:21:57, 1.40it/s, loss=0.0627, lr=2.10e-05, step=3132] Training: 31%|███▏ | 3133/10000 [38:01<1:21:57, 1.40it/s, loss=0.0076, lr=2.10e-05, step=3133] Training: 31%|███▏ | 3134/10000 [38:02<1:20:22, 1.42it/s, loss=0.0076, lr=2.10e-05, step=3133] Training: 31%|███▏ | 3134/10000 [38:02<1:20:22, 1.42it/s, loss=0.0075, lr=2.10e-05, step=3134] Training: 31%|███▏ | 3135/10000 [38:03<1:18:31, 1.46it/s, loss=0.0075, lr=2.10e-05, step=3134] Training: 31%|███▏ | 3135/10000 [38:03<1:18:31, 1.46it/s, loss=0.0026, lr=2.10e-05, step=3135] Training: 31%|███▏ | 3136/10000 [38:03<1:16:57, 1.49it/s, loss=0.0026, lr=2.10e-05, step=3135] Training: 31%|███▏ | 3136/10000 [38:03<1:16:57, 1.49it/s, loss=0.0161, lr=2.10e-05, step=3136] Training: 31%|███▏ | 3137/10000 [38:04<1:13:46, 1.55it/s, loss=0.0161, lr=2.10e-05, step=3136] Training: 31%|███▏ | 3137/10000 [38:04<1:13:46, 1.55it/s, loss=0.0166, lr=2.10e-05, step=3137] Training: 31%|███▏ | 3138/10000 [38:05<1:10:29, 1.62it/s, loss=0.0166, lr=2.10e-05, step=3137] Training: 31%|███▏ | 3138/10000 [38:05<1:10:29, 1.62it/s, loss=0.0331, lr=2.10e-05, step=3138] Training: 31%|███▏ | 3139/10000 [38:05<1:20:10, 1.43it/s, loss=0.0331, lr=2.10e-05, step=3138] Training: 31%|███▏ | 3139/10000 [38:05<1:20:10, 1.43it/s, loss=0.0076, lr=2.10e-05, step=3139]16:44:12.887 [I] step=3140 loss=0.0225 smoothed_loss=0.0186 lr=2.10e-05 grad_norm=0.5544 step_time=0.5882s data_time=0.0799s it/s=1.497 eta_to_10000=4582.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0181 grad_action_out_proj=0.1184 grad_shared_expert=0.4127 (10775:train_pytorch.py:850) + Training: 31%|███▏ | 3140/10000 [38:06<1:15:01, 1.52it/s, loss=0.0076, lr=2.10e-05, step=3139] Training: 31%|███▏ | 3140/10000 [38:06<1:15:01, 1.52it/s, loss=0.0225, lr=2.10e-05, step=3140] Training: 31%|███▏ | 3141/10000 [38:06<1:09:52, 1.64it/s, loss=0.0225, lr=2.10e-05, step=3140] Training: 31%|███▏ | 3141/10000 [38:06<1:09:52, 1.64it/s, loss=0.0114, lr=2.10e-05, step=3141] Training: 31%|███▏ | 3142/10000 [38:07<1:06:59, 1.71it/s, loss=0.0114, lr=2.10e-05, step=3141] Training: 31%|███▏ | 3142/10000 [38:07<1:06:59, 1.71it/s, loss=0.0114, lr=2.10e-05, step=3142] Training: 31%|███▏ | 3143/10000 [38:08<1:05:40, 1.74it/s, loss=0.0114, lr=2.10e-05, step=3142] Training: 31%|███▏ | 3143/10000 [38:08<1:05:40, 1.74it/s, loss=0.0094, lr=2.10e-05, step=3143] Training: 31%|███▏ | 3144/10000 [38:08<1:03:34, 1.80it/s, loss=0.0094, lr=2.10e-05, step=3143] Training: 31%|███▏ | 3144/10000 [38:08<1:03:34, 1.80it/s, loss=0.0217, lr=2.10e-05, step=3144] Training: 31%|███▏ | 3145/10000 [38:09<1:02:18, 1.83it/s, loss=0.0217, lr=2.10e-05, step=3144] Training: 31%|███▏ | 3145/10000 [38:09<1:02:18, 1.83it/s, loss=0.0191, lr=2.10e-05, step=3145] Training: 31%|███▏ | 3146/10000 [38:09<1:12:47, 1.57it/s, loss=0.0191, lr=2.10e-05, step=3145] Training: 31%|███▏ | 3146/10000 [38:09<1:12:47, 1.57it/s, loss=0.0668, lr=2.10e-05, step=3146] Training: 31%|███▏ | 3147/10000 [38:10<1:12:04, 1.58it/s, loss=0.0668, lr=2.10e-05, step=3146] Training: 31%|███▏ | 3147/10000 [38:10<1:12:04, 1.58it/s, loss=0.0110, lr=2.10e-05, step=3147] Training: 31%|███▏ | 3148/10000 [38:11<1:08:17, 1.67it/s, loss=0.0110, lr=2.10e-05, step=3147] Training: 31%|███▏ | 3148/10000 [38:11<1:08:17, 1.67it/s, loss=0.0062, lr=2.10e-05, step=3148] Training: 31%|███▏ | 3149/10000 [38:11<1:05:19, 1.75it/s, loss=0.0062, lr=2.10e-05, step=3148] Training: 31%|███▏ | 3149/10000 [38:11<1:05:19, 1.75it/s, loss=0.0050, lr=2.10e-05, step=3149]16:44:18.566 [I] step=3150 loss=0.0299 smoothed_loss=0.0193 lr=2.10e-05 grad_norm=0.4682 step_time=0.5013s data_time=0.0668s it/s=1.761 eta_to_10000=3889.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0214 grad_action_out_proj=0.1637 grad_shared_expert=0.5886 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3150/10000 [38:12<1:04:59, 1.76it/s, loss=0.0050, lr=2.10e-05, step=3149] Training: 32%|███▏ | 3150/10000 [38:12<1:04:59, 1.76it/s, loss=0.0299, lr=2.10e-05, step=3150] Training: 32%|███▏ | 3151/10000 [38:12<1:03:25, 1.80it/s, loss=0.0299, lr=2.10e-05, step=3150] Training: 32%|███▏ | 3151/10000 [38:12<1:03:25, 1.80it/s, loss=0.0093, lr=2.09e-05, step=3151] Training: 32%|███▏ | 3152/10000 [38:13<1:03:00, 1.81it/s, loss=0.0093, lr=2.09e-05, step=3151] Training: 32%|███▏ | 3152/10000 [38:13<1:03:00, 1.81it/s, loss=0.0121, lr=2.09e-05, step=3152] Training: 32%|███▏ | 3153/10000 [38:14<1:11:53, 1.59it/s, loss=0.0121, lr=2.09e-05, step=3152] Training: 32%|███▏ | 3153/10000 [38:14<1:11:53, 1.59it/s, loss=0.0200, lr=2.09e-05, step=3153] Training: 32%|███▏ | 3154/10000 [38:14<1:16:31, 1.49it/s, loss=0.0200, lr=2.09e-05, step=3153] Training: 32%|███▏ | 3154/10000 [38:14<1:16:31, 1.49it/s, loss=0.0146, lr=2.09e-05, step=3154] Training: 32%|███▏ | 3155/10000 [38:15<1:17:29, 1.47it/s, loss=0.0146, lr=2.09e-05, step=3154] Training: 32%|███▏ | 3155/10000 [38:15<1:17:29, 1.47it/s, loss=0.0063, lr=2.09e-05, step=3155] Training: 32%|███▏ | 3156/10000 [38:15<1:11:33, 1.59it/s, loss=0.0063, lr=2.09e-05, step=3155] Training: 32%|███▏ | 3156/10000 [38:15<1:11:33, 1.59it/s, loss=0.0163, lr=2.09e-05, step=3156] Training: 32%|███▏ | 3157/10000 [38:16<1:08:07, 1.67it/s, loss=0.0163, lr=2.09e-05, step=3156] Training: 32%|███▏ | 3157/10000 [38:16<1:08:07, 1.67it/s, loss=0.0228, lr=2.09e-05, step=3157] Training: 32%|███▏ | 3158/10000 [38:17<1:05:26, 1.74it/s, loss=0.0228, lr=2.09e-05, step=3157] Training: 32%|███▏ | 3158/10000 [38:17<1:05:26, 1.74it/s, loss=0.0669, lr=2.09e-05, step=3158] Training: 32%|███▏ | 3159/10000 [38:17<1:04:31, 1.77it/s, loss=0.0669, lr=2.09e-05, step=3158] Training: 32%|███▏ | 3159/10000 [38:17<1:04:31, 1.77it/s, loss=0.0368, lr=2.09e-05, step=3159]16:44:24.860 [I] step=3160 loss=0.0347 smoothed_loss=0.0246 lr=2.09e-05 grad_norm=0.5502 step_time=0.5536s data_time=0.0755s it/s=1.589 eta_to_10000=4303.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0189 grad_action_out_proj=0.2165 grad_shared_expert=0.5210 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3160/10000 [38:18<1:14:12, 1.54it/s, loss=0.0368, lr=2.09e-05, step=3159] Training: 32%|███▏ | 3160/10000 [38:18<1:14:12, 1.54it/s, loss=0.0347, lr=2.09e-05, step=3160] Training: 32%|███▏ | 3161/10000 [38:19<1:21:16, 1.40it/s, loss=0.0347, lr=2.09e-05, step=3160] Training: 32%|███▏ | 3161/10000 [38:19<1:21:16, 1.40it/s, loss=0.0366, lr=2.09e-05, step=3161] Training: 32%|███▏ | 3162/10000 [38:19<1:15:04, 1.52it/s, loss=0.0366, lr=2.09e-05, step=3161] Training: 32%|███▏ | 3162/10000 [38:19<1:15:04, 1.52it/s, loss=0.0073, lr=2.09e-05, step=3162] Training: 32%|███▏ | 3163/10000 [38:20<1:10:04, 1.63it/s, loss=0.0073, lr=2.09e-05, step=3162] Training: 32%|███▏ | 3163/10000 [38:20<1:10:04, 1.63it/s, loss=0.0092, lr=2.09e-05, step=3163] Training: 32%|███▏ | 3164/10000 [38:20<1:06:58, 1.70it/s, loss=0.0092, lr=2.09e-05, step=3163] Training: 32%|███▏ | 3164/10000 [38:20<1:06:58, 1.70it/s, loss=0.0085, lr=2.09e-05, step=3164] Training: 32%|███▏ | 3165/10000 [38:21<1:04:32, 1.76it/s, loss=0.0085, lr=2.09e-05, step=3164] Training: 32%|███▏ | 3165/10000 [38:21<1:04:32, 1.76it/s, loss=0.0142, lr=2.09e-05, step=3165] Training: 32%|███▏ | 3166/10000 [38:21<1:02:00, 1.84it/s, loss=0.0142, lr=2.09e-05, step=3165] Training: 32%|███▏ | 3166/10000 [38:21<1:02:00, 1.84it/s, loss=0.0457, lr=2.09e-05, step=3166] Training: 32%|███▏ | 3167/10000 [38:22<1:00:51, 1.87it/s, loss=0.0457, lr=2.09e-05, step=3166] Training: 32%|███▏ | 3167/10000 [38:22<1:00:51, 1.87it/s, loss=0.0166, lr=2.09e-05, step=3167] Training: 32%|███▏ | 3168/10000 [38:23<1:08:58, 1.65it/s, loss=0.0166, lr=2.09e-05, step=3167] Training: 32%|███▏ | 3168/10000 [38:23<1:08:58, 1.65it/s, loss=0.0505, lr=2.09e-05, step=3168] Training: 32%|███▏ | 3169/10000 [38:23<1:13:50, 1.54it/s, loss=0.0505, lr=2.09e-05, step=3168] Training: 32%|███▏ | 3169/10000 [38:23<1:13:50, 1.54it/s, loss=0.0132, lr=2.09e-05, step=3169]16:44:30.894 [I] step=3170 loss=0.0062 smoothed_loss=0.0222 lr=2.09e-05 grad_norm=0.5566 step_time=0.5376s data_time=0.0659s it/s=1.658 eta_to_10000=4120.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0127 grad_action_out_proj=0.1419 grad_shared_expert=0.4160 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3170/10000 [38:24<1:10:58, 1.60it/s, loss=0.0132, lr=2.09e-05, step=3169] Training: 32%|███▏ | 3170/10000 [38:24<1:10:58, 1.60it/s, loss=0.0062, lr=2.09e-05, step=3170] Training: 32%|███▏ | 3171/10000 [38:24<1:06:58, 1.70it/s, loss=0.0062, lr=2.09e-05, step=3170] Training: 32%|███▏ | 3171/10000 [38:24<1:06:58, 1.70it/s, loss=0.0065, lr=2.09e-05, step=3171] Training: 32%|███▏ | 3172/10000 [38:25<1:03:58, 1.78it/s, loss=0.0065, lr=2.09e-05, step=3171] Training: 32%|███▏ | 3172/10000 [38:25<1:03:58, 1.78it/s, loss=0.0045, lr=2.09e-05, step=3172] Training: 32%|███▏ | 3173/10000 [38:26<1:03:42, 1.79it/s, loss=0.0045, lr=2.09e-05, step=3172] Training: 32%|███▏ | 3173/10000 [38:26<1:03:42, 1.79it/s, loss=0.0070, lr=2.09e-05, step=3173] Training: 32%|███▏ | 3174/10000 [38:26<1:03:00, 1.81it/s, loss=0.0070, lr=2.09e-05, step=3173] Training: 32%|███▏ | 3174/10000 [38:26<1:03:00, 1.81it/s, loss=0.0031, lr=2.09e-05, step=3174] Training: 32%|███▏ | 3175/10000 [38:27<1:14:24, 1.53it/s, loss=0.0031, lr=2.09e-05, step=3174] Training: 32%|███▏ | 3175/10000 [38:27<1:14:24, 1.53it/s, loss=0.0142, lr=2.09e-05, step=3175] Training: 32%|███▏ | 3176/10000 [38:28<1:18:46, 1.44it/s, loss=0.0142, lr=2.09e-05, step=3175] Training: 32%|███▏ | 3176/10000 [38:28<1:18:46, 1.44it/s, loss=0.0241, lr=2.09e-05, step=3176] Training: 32%|███▏ | 3177/10000 [38:28<1:12:50, 1.56it/s, loss=0.0241, lr=2.09e-05, step=3176] Training: 32%|███▏ | 3177/10000 [38:28<1:12:50, 1.56it/s, loss=0.0204, lr=2.09e-05, step=3177] Training: 32%|███▏ | 3178/10000 [38:29<1:10:42, 1.61it/s, loss=0.0204, lr=2.09e-05, step=3177] Training: 32%|███▏ | 3178/10000 [38:29<1:10:42, 1.61it/s, loss=0.0155, lr=2.09e-05, step=3178] Training: 32%|███▏ | 3179/10000 [38:29<1:06:52, 1.70it/s, loss=0.0155, lr=2.09e-05, step=3178] Training: 32%|███▏ | 3179/10000 [38:29<1:06:52, 1.70it/s, loss=0.0044, lr=2.09e-05, step=3179]16:44:36.838 [I] step=3180 loss=0.0140 smoothed_loss=0.0156 lr=2.09e-05 grad_norm=0.5315 step_time=0.5221s data_time=0.0724s it/s=1.683 eta_to_10000=4053.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0303 grad_action_out_proj=0.2562 grad_shared_expert=0.6225 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3180/10000 [38:30<1:06:03, 1.72it/s, loss=0.0044, lr=2.09e-05, step=3179] Training: 32%|███▏ | 3180/10000 [38:30<1:06:03, 1.72it/s, loss=0.0140, lr=2.09e-05, step=3180] Training: 32%|███▏ | 3181/10000 [38:30<1:04:06, 1.77it/s, loss=0.0140, lr=2.09e-05, step=3180] Training: 32%|███▏ | 3181/10000 [38:30<1:04:06, 1.77it/s, loss=0.0073, lr=2.09e-05, step=3181] Training: 32%|███▏ | 3182/10000 [38:31<1:14:18, 1.53it/s, loss=0.0073, lr=2.09e-05, step=3181] Training: 32%|███▏ | 3182/10000 [38:31<1:14:18, 1.53it/s, loss=0.0250, lr=2.09e-05, step=3182] Training: 32%|███▏ | 3183/10000 [38:32<1:18:01, 1.46it/s, loss=0.0250, lr=2.09e-05, step=3182] Training: 32%|███▏ | 3183/10000 [38:32<1:18:01, 1.46it/s, loss=0.0103, lr=2.09e-05, step=3183] Training: 32%|███▏ | 3184/10000 [38:33<1:12:13, 1.57it/s, loss=0.0103, lr=2.09e-05, step=3183] Training: 32%|███▏ | 3184/10000 [38:33<1:12:13, 1.57it/s, loss=0.0336, lr=2.09e-05, step=3184] Training: 32%|███▏ | 3185/10000 [38:33<1:18:46, 1.44it/s, loss=0.0336, lr=2.09e-05, step=3184] Training: 32%|███▏ | 3185/10000 [38:33<1:18:46, 1.44it/s, loss=0.0214, lr=2.09e-05, step=3185] Training: 32%|███▏ | 3186/10000 [38:34<1:12:50, 1.56it/s, loss=0.0214, lr=2.09e-05, step=3185] Training: 32%|███▏ | 3186/10000 [38:34<1:12:50, 1.56it/s, loss=0.0045, lr=2.08e-05, step=3186] Training: 32%|███▏ | 3187/10000 [38:34<1:08:47, 1.65it/s, loss=0.0045, lr=2.08e-05, step=3186] Training: 32%|███▏ | 3187/10000 [38:34<1:08:47, 1.65it/s, loss=0.0380, lr=2.08e-05, step=3187] Training: 32%|███▏ | 3188/10000 [38:35<1:06:36, 1.70it/s, loss=0.0380, lr=2.08e-05, step=3187] Training: 32%|███▏ | 3188/10000 [38:35<1:06:36, 1.70it/s, loss=0.0108, lr=2.08e-05, step=3188] Training: 32%|███▏ | 3189/10000 [38:36<1:06:53, 1.70it/s, loss=0.0108, lr=2.08e-05, step=3188] Training: 32%|███▏ | 3189/10000 [38:36<1:06:53, 1.70it/s, loss=0.0159, lr=2.08e-05, step=3189]16:44:43.355 [I] step=3190 loss=0.0156 smoothed_loss=0.0173 lr=2.09e-05 grad_norm=0.4920 step_time=0.5577s data_time=0.0940s it/s=1.535 eta_to_10000=4437.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0263 grad_action_out_proj=0.2695 grad_shared_expert=0.5885 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3190/10000 [38:36<1:15:27, 1.50it/s, loss=0.0159, lr=2.08e-05, step=3189] Training: 32%|███▏ | 3190/10000 [38:36<1:15:27, 1.50it/s, loss=0.0156, lr=2.08e-05, step=3190] Training: 32%|███▏ | 3191/10000 [38:37<1:17:59, 1.46it/s, loss=0.0156, lr=2.08e-05, step=3190] Training: 32%|███▏ | 3191/10000 [38:37<1:17:59, 1.46it/s, loss=0.0228, lr=2.08e-05, step=3191] Training: 32%|███▏ | 3192/10000 [38:38<1:11:17, 1.59it/s, loss=0.0228, lr=2.08e-05, step=3191] Training: 32%|███▏ | 3192/10000 [38:38<1:11:17, 1.59it/s, loss=0.0070, lr=2.08e-05, step=3192] Training: 32%|███▏ | 3193/10000 [38:38<1:10:48, 1.60it/s, loss=0.0070, lr=2.08e-05, step=3192] Training: 32%|███▏ | 3193/10000 [38:38<1:10:48, 1.60it/s, loss=0.0128, lr=2.08e-05, step=3193] Training: 32%|███▏ | 3194/10000 [38:39<1:06:34, 1.70it/s, loss=0.0128, lr=2.08e-05, step=3193] Training: 32%|███▏ | 3194/10000 [38:39<1:06:34, 1.70it/s, loss=0.0341, lr=2.08e-05, step=3194] Training: 32%|███▏ | 3195/10000 [38:39<1:03:14, 1.79it/s, loss=0.0341, lr=2.08e-05, step=3194] Training: 32%|███▏ | 3195/10000 [38:39<1:03:14, 1.79it/s, loss=0.0301, lr=2.08e-05, step=3195] Training: 32%|███▏ | 3196/10000 [38:40<1:01:17, 1.85it/s, loss=0.0301, lr=2.08e-05, step=3195] Training: 32%|███▏ | 3196/10000 [38:40<1:01:17, 1.85it/s, loss=0.0911, lr=2.08e-05, step=3196] Training: 32%|███▏ | 3197/10000 [38:41<1:09:19, 1.64it/s, loss=0.0911, lr=2.08e-05, step=3196] Training: 32%|███▏ | 3197/10000 [38:41<1:09:19, 1.64it/s, loss=0.0120, lr=2.08e-05, step=3197] Training: 32%|███▏ | 3198/10000 [38:41<1:13:43, 1.54it/s, loss=0.0120, lr=2.08e-05, step=3197] Training: 32%|███▏ | 3198/10000 [38:41<1:13:43, 1.54it/s, loss=0.0226, lr=2.08e-05, step=3198] Training: 32%|███▏ | 3199/10000 [38:42<1:08:13, 1.66it/s, loss=0.0226, lr=2.08e-05, step=3198] Training: 32%|███▏ | 3199/10000 [38:42<1:08:13, 1.66it/s, loss=0.0024, lr=2.08e-05, step=3199]16:44:49.271 [I] step=3200 loss=0.0660 smoothed_loss=0.0269 lr=2.08e-05 grad_norm=0.4831 step_time=0.5225s data_time=0.0690s it/s=1.691 eta_to_10000=4021.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0252 grad_action_out_proj=0.1548 grad_shared_expert=0.4556 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3200/10000 [38:42<1:07:16, 1.68it/s, loss=0.0024, lr=2.08e-05, step=3199] Training: 32%|███▏ | 3200/10000 [38:42<1:07:16, 1.68it/s, loss=0.0660, lr=2.08e-05, step=3200] Training: 32%|███▏ | 3201/10000 [38:43<1:04:15, 1.76it/s, loss=0.0660, lr=2.08e-05, step=3200] Training: 32%|███▏ | 3201/10000 [38:43<1:04:15, 1.76it/s, loss=0.0057, lr=2.08e-05, step=3201] Training: 32%|███▏ | 3202/10000 [38:43<1:02:11, 1.82it/s, loss=0.0057, lr=2.08e-05, step=3201] Training: 32%|███▏ | 3202/10000 [38:43<1:02:11, 1.82it/s, loss=0.0108, lr=2.08e-05, step=3202] Training: 32%|███▏ | 3203/10000 [38:44<1:10:43, 1.60it/s, loss=0.0108, lr=2.08e-05, step=3202] Training: 32%|███▏ | 3203/10000 [38:44<1:10:43, 1.60it/s, loss=0.0135, lr=2.08e-05, step=3203] Training: 32%|███▏ | 3204/10000 [38:45<1:06:40, 1.70it/s, loss=0.0135, lr=2.08e-05, step=3203] Training: 32%|███▏ | 3204/10000 [38:45<1:06:40, 1.70it/s, loss=0.0063, lr=2.08e-05, step=3204] Training: 32%|███▏ | 3205/10000 [38:45<1:13:46, 1.54it/s, loss=0.0063, lr=2.08e-05, step=3204] Training: 32%|███▏ | 3205/10000 [38:45<1:13:46, 1.54it/s, loss=0.0113, lr=2.08e-05, step=3205] Training: 32%|███▏ | 3206/10000 [38:46<1:08:34, 1.65it/s, loss=0.0113, lr=2.08e-05, step=3205] Training: 32%|███▏ | 3206/10000 [38:46<1:08:34, 1.65it/s, loss=0.0101, lr=2.08e-05, step=3206] Training: 32%|███▏ | 3207/10000 [38:46<1:05:19, 1.73it/s, loss=0.0101, lr=2.08e-05, step=3206] Training: 32%|███▏ | 3207/10000 [38:46<1:05:19, 1.73it/s, loss=0.0053, lr=2.08e-05, step=3207] Training: 32%|███▏ | 3208/10000 [38:47<1:08:39, 1.65it/s, loss=0.0053, lr=2.08e-05, step=3207] Training: 32%|███▏ | 3208/10000 [38:47<1:08:39, 1.65it/s, loss=0.0563, lr=2.08e-05, step=3208] Training: 32%|███▏ | 3209/10000 [38:48<1:08:19, 1.66it/s, loss=0.0563, lr=2.08e-05, step=3208] Training: 32%|███▏ | 3209/10000 [38:48<1:08:19, 1.66it/s, loss=0.0103, lr=2.08e-05, step=3209]16:44:55.284 [I] step=3210 loss=0.0168 smoothed_loss=0.0199 lr=2.08e-05 grad_norm=0.4998 step_time=0.5341s data_time=0.0673s it/s=1.663 eta_to_10000=4082.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0226 grad_action_out_proj=0.1785 grad_shared_expert=0.4673 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3210/10000 [38:48<1:08:45, 1.65it/s, loss=0.0103, lr=2.08e-05, step=3209] Training: 32%|███▏ | 3210/10000 [38:48<1:08:45, 1.65it/s, loss=0.0168, lr=2.08e-05, step=3210] Training: 32%|███▏ | 3211/10000 [38:49<1:14:41, 1.51it/s, loss=0.0168, lr=2.08e-05, step=3210] Training: 32%|███▏ | 3211/10000 [38:49<1:14:41, 1.51it/s, loss=0.0247, lr=2.08e-05, step=3211] Training: 32%|███▏ | 3212/10000 [38:50<1:17:29, 1.46it/s, loss=0.0247, lr=2.08e-05, step=3211] Training: 32%|███▏ | 3212/10000 [38:50<1:17:29, 1.46it/s, loss=0.0490, lr=2.08e-05, step=3212] Training: 32%|███▏ | 3213/10000 [38:50<1:13:18, 1.54it/s, loss=0.0490, lr=2.08e-05, step=3212] Training: 32%|███▏ | 3213/10000 [38:50<1:13:18, 1.54it/s, loss=0.0251, lr=2.08e-05, step=3213] Training: 32%|███▏ | 3214/10000 [38:51<1:09:25, 1.63it/s, loss=0.0251, lr=2.08e-05, step=3213] Training: 32%|███▏ | 3214/10000 [38:51<1:09:25, 1.63it/s, loss=0.0316, lr=2.08e-05, step=3214] Training: 32%|███▏ | 3215/10000 [38:52<1:10:35, 1.60it/s, loss=0.0316, lr=2.08e-05, step=3214] Training: 32%|███▏ | 3215/10000 [38:52<1:10:35, 1.60it/s, loss=0.0412, lr=2.08e-05, step=3215] Training: 32%|███▏ | 3216/10000 [38:52<1:08:21, 1.65it/s, loss=0.0412, lr=2.08e-05, step=3215] Training: 32%|███▏ | 3216/10000 [38:52<1:08:21, 1.65it/s, loss=0.0135, lr=2.08e-05, step=3216] Training: 32%|███▏ | 3217/10000 [38:53<1:05:05, 1.74it/s, loss=0.0135, lr=2.08e-05, step=3216] Training: 32%|███▏ | 3217/10000 [38:53<1:05:05, 1.74it/s, loss=0.0302, lr=2.08e-05, step=3217] Training: 32%|███▏ | 3218/10000 [38:53<1:12:37, 1.56it/s, loss=0.0302, lr=2.08e-05, step=3217] Training: 32%|███▏ | 3218/10000 [38:53<1:12:37, 1.56it/s, loss=0.0084, lr=2.08e-05, step=3218] Training: 32%|███▏ | 3219/10000 [38:54<1:08:53, 1.64it/s, loss=0.0084, lr=2.08e-05, step=3218] Training: 32%|███▏ | 3219/10000 [38:54<1:08:53, 1.64it/s, loss=0.0094, lr=2.08e-05, step=3219]16:45:01.509 [I] step=3220 loss=0.0413 smoothed_loss=0.0241 lr=2.08e-05 grad_norm=0.5000 step_time=0.5452s data_time=0.0771s it/s=1.607 eta_to_10000=4219.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0114 grad_action_out_proj=0.1395 grad_shared_expert=0.4105 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3220/10000 [38:55<1:07:04, 1.68it/s, loss=0.0094, lr=2.08e-05, step=3219] Training: 32%|███▏ | 3220/10000 [38:55<1:07:04, 1.68it/s, loss=0.0413, lr=2.08e-05, step=3220] Training: 32%|███▏ | 3221/10000 [38:55<1:06:17, 1.70it/s, loss=0.0413, lr=2.08e-05, step=3220] Training: 32%|███▏ | 3221/10000 [38:55<1:06:17, 1.70it/s, loss=0.0173, lr=2.07e-05, step=3221] Training: 32%|███▏ | 3222/10000 [38:56<1:04:44, 1.74it/s, loss=0.0173, lr=2.07e-05, step=3221] Training: 32%|███▏ | 3222/10000 [38:56<1:04:44, 1.74it/s, loss=0.0211, lr=2.07e-05, step=3222] Training: 32%|███▏ | 3223/10000 [38:56<1:03:21, 1.78it/s, loss=0.0211, lr=2.07e-05, step=3222] Training: 32%|███▏ | 3223/10000 [38:56<1:03:21, 1.78it/s, loss=0.0965, lr=2.07e-05, step=3223] Training: 32%|███▏ | 3224/10000 [38:57<1:03:50, 1.77it/s, loss=0.0965, lr=2.07e-05, step=3223] Training: 32%|███▏ | 3224/10000 [38:57<1:03:50, 1.77it/s, loss=0.0121, lr=2.07e-05, step=3224] Training: 32%|███▏ | 3225/10000 [38:58<1:15:41, 1.49it/s, loss=0.0121, lr=2.07e-05, step=3224] Training: 32%|███▏ | 3225/10000 [38:58<1:15:41, 1.49it/s, loss=0.0045, lr=2.07e-05, step=3225] Training: 32%|███▏ | 3226/10000 [38:58<1:11:54, 1.57it/s, loss=0.0045, lr=2.07e-05, step=3225] Training: 32%|███▏ | 3226/10000 [38:58<1:11:54, 1.57it/s, loss=0.0091, lr=2.07e-05, step=3226] Training: 32%|███▏ | 3227/10000 [38:59<1:08:06, 1.66it/s, loss=0.0091, lr=2.07e-05, step=3226] Training: 32%|███▏ | 3227/10000 [38:59<1:08:06, 1.66it/s, loss=0.0167, lr=2.07e-05, step=3227] Training: 32%|███▏ | 3228/10000 [38:59<1:06:51, 1.69it/s, loss=0.0167, lr=2.07e-05, step=3227] Training: 32%|███▏ | 3228/10000 [38:59<1:06:51, 1.69it/s, loss=0.0183, lr=2.07e-05, step=3228] Training: 32%|███▏ | 3229/10000 [39:00<1:08:13, 1.65it/s, loss=0.0183, lr=2.07e-05, step=3228] Training: 32%|███▏ | 3229/10000 [39:00<1:08:13, 1.65it/s, loss=0.0070, lr=2.07e-05, step=3229]16:45:07.571 [I] step=3230 loss=0.0223 smoothed_loss=0.0217 lr=2.07e-05 grad_norm=0.4949 step_time=0.5185s data_time=0.0879s it/s=1.650 eta_to_10000=4103.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0124 grad_action_out_proj=0.1245 grad_shared_expert=0.4084 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3230/10000 [39:01<1:09:30, 1.62it/s, loss=0.0070, lr=2.07e-05, step=3229] Training: 32%|███▏ | 3230/10000 [39:01<1:09:30, 1.62it/s, loss=0.0223, lr=2.07e-05, step=3230] Training: 32%|███▏ | 3231/10000 [39:01<1:11:07, 1.59it/s, loss=0.0223, lr=2.07e-05, step=3230] Training: 32%|███▏ | 3231/10000 [39:01<1:11:07, 1.59it/s, loss=0.0297, lr=2.07e-05, step=3231] Training: 32%|███▏ | 3232/10000 [39:02<1:20:01, 1.41it/s, loss=0.0297, lr=2.07e-05, step=3231] Training: 32%|███▏ | 3232/10000 [39:02<1:20:01, 1.41it/s, loss=0.0188, lr=2.07e-05, step=3232] Training: 32%|███▏ | 3233/10000 [39:03<1:26:48, 1.30it/s, loss=0.0188, lr=2.07e-05, step=3232] Training: 32%|███▏ | 3233/10000 [39:03<1:26:48, 1.30it/s, loss=0.0155, lr=2.07e-05, step=3233] Training: 32%|███▏ | 3234/10000 [39:04<1:20:41, 1.40it/s, loss=0.0155, lr=2.07e-05, step=3233] Training: 32%|███▏ | 3234/10000 [39:04<1:20:41, 1.40it/s, loss=0.0171, lr=2.07e-05, step=3234] Training: 32%|███▏ | 3235/10000 [39:04<1:14:47, 1.51it/s, loss=0.0171, lr=2.07e-05, step=3234] Training: 32%|███▏ | 3235/10000 [39:04<1:14:47, 1.51it/s, loss=0.0208, lr=2.07e-05, step=3235] Training: 32%|███▏ | 3236/10000 [39:05<1:10:39, 1.60it/s, loss=0.0208, lr=2.07e-05, step=3235] Training: 32%|███▏ | 3236/10000 [39:05<1:10:39, 1.60it/s, loss=0.0088, lr=2.07e-05, step=3236] Training: 32%|███▏ | 3237/10000 [39:05<1:08:22, 1.65it/s, loss=0.0088, lr=2.07e-05, step=3236] Training: 32%|███▏ | 3237/10000 [39:05<1:08:22, 1.65it/s, loss=0.0148, lr=2.07e-05, step=3237] Training: 32%|███▏ | 3238/10000 [39:06<1:06:10, 1.70it/s, loss=0.0148, lr=2.07e-05, step=3237] Training: 32%|███▏ | 3238/10000 [39:06<1:06:10, 1.70it/s, loss=0.0355, lr=2.07e-05, step=3238] Training: 32%|███▏ | 3239/10000 [39:07<1:15:26, 1.49it/s, loss=0.0355, lr=2.07e-05, step=3238] Training: 32%|███▏ | 3239/10000 [39:07<1:15:26, 1.49it/s, loss=0.0109, lr=2.07e-05, step=3239]16:45:14.429 [I] step=3240 loss=0.0241 smoothed_loss=0.0203 lr=2.07e-05 grad_norm=0.5156 step_time=0.5973s data_time=0.0884s it/s=1.458 eta_to_10000=4635.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0102 grad_action_out_proj=0.1331 grad_shared_expert=0.6035 (10775:train_pytorch.py:850) + Training: 32%|███▏ | 3240/10000 [39:07<1:18:17, 1.44it/s, loss=0.0109, lr=2.07e-05, step=3239] Training: 32%|███▏ | 3240/10000 [39:07<1:18:17, 1.44it/s, loss=0.0241, lr=2.07e-05, step=3240] Training: 32%|███▏ | 3241/10000 [39:08<1:15:16, 1.50it/s, loss=0.0241, lr=2.07e-05, step=3240] Training: 32%|███▏ | 3241/10000 [39:08<1:15:16, 1.50it/s, loss=0.0167, lr=2.07e-05, step=3241] Training: 32%|███▏ | 3242/10000 [39:09<1:11:27, 1.58it/s, loss=0.0167, lr=2.07e-05, step=3241] Training: 32%|███▏ | 3242/10000 [39:09<1:11:27, 1.58it/s, loss=0.0174, lr=2.07e-05, step=3242] Training: 32%|███▏ | 3243/10000 [39:09<1:07:56, 1.66it/s, loss=0.0174, lr=2.07e-05, step=3242] Training: 32%|███▏ | 3243/10000 [39:09<1:07:56, 1.66it/s, loss=0.0253, lr=2.07e-05, step=3243] Training: 32%|███▏ | 3244/10000 [39:10<1:06:15, 1.70it/s, loss=0.0253, lr=2.07e-05, step=3243] Training: 32%|███▏ | 3244/10000 [39:10<1:06:15, 1.70it/s, loss=0.0337, lr=2.07e-05, step=3244] Training: 32%|███▏ | 3245/10000 [39:10<1:06:21, 1.70it/s, loss=0.0337, lr=2.07e-05, step=3244] Training: 32%|███▏ | 3245/10000 [39:10<1:06:21, 1.70it/s, loss=0.0184, lr=2.07e-05, step=3245] Training: 32%|███▏ | 3246/10000 [39:11<1:18:13, 1.44it/s, loss=0.0184, lr=2.07e-05, step=3245] Training: 32%|███▏ | 3246/10000 [39:11<1:18:13, 1.44it/s, loss=0.0112, lr=2.07e-05, step=3246] Training: 32%|███▏ | 3247/10000 [39:12<1:12:22, 1.56it/s, loss=0.0112, lr=2.07e-05, step=3246] Training: 32%|███▏ | 3247/10000 [39:12<1:12:22, 1.56it/s, loss=0.0090, lr=2.07e-05, step=3247] Training: 32%|███▏ | 3248/10000 [39:13<1:14:57, 1.50it/s, loss=0.0090, lr=2.07e-05, step=3247] Training: 32%|███▏ | 3248/10000 [39:13<1:14:57, 1.50it/s, loss=0.0296, lr=2.07e-05, step=3248] Training: 32%|███▏ | 3249/10000 [39:13<1:09:56, 1.61it/s, loss=0.0296, lr=2.07e-05, step=3248] Training: 32%|███▏ | 3249/10000 [39:13<1:09:56, 1.61it/s, loss=0.0231, lr=2.07e-05, step=3249]16:45:20.528 [I] step=3250 loss=0.0067 smoothed_loss=0.0191 lr=2.07e-05 grad_norm=0.5569 step_time=0.5340s data_time=0.0761s it/s=1.640 eta_to_10000=4116.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.1042 grad_shared_expert=0.2815 (10775:train_pytorch.py:850) + Training: 32%|███▎ | 3250/10000 [39:14<1:07:52, 1.66it/s, loss=0.0231, lr=2.07e-05, step=3249] Training: 32%|███▎ | 3250/10000 [39:14<1:07:52, 1.66it/s, loss=0.0067, lr=2.07e-05, step=3250] Training: 33%|███▎ | 3251/10000 [39:14<1:05:57, 1.71it/s, loss=0.0067, lr=2.07e-05, step=3250] Training: 33%|███▎ | 3251/10000 [39:14<1:05:57, 1.71it/s, loss=0.0165, lr=2.07e-05, step=3251] Training: 33%|███▎ | 3252/10000 [39:15<1:03:04, 1.78it/s, loss=0.0165, lr=2.07e-05, step=3251] Training: 33%|███▎ | 3252/10000 [39:15<1:03:04, 1.78it/s, loss=0.0278, lr=2.07e-05, step=3252] Training: 33%|███▎ | 3253/10000 [39:15<1:01:35, 1.83it/s, loss=0.0278, lr=2.07e-05, step=3252] Training: 33%|███▎ | 3253/10000 [39:15<1:01:35, 1.83it/s, loss=0.0229, lr=2.07e-05, step=3253] Training: 33%|███▎ | 3254/10000 [39:16<1:10:02, 1.61it/s, loss=0.0229, lr=2.07e-05, step=3253] Training: 33%|███▎ | 3254/10000 [39:16<1:10:02, 1.61it/s, loss=0.1010, lr=2.07e-05, step=3254] Training: 33%|███▎ | 3255/10000 [39:16<1:06:07, 1.70it/s, loss=0.1010, lr=2.07e-05, step=3254] Training: 33%|███▎ | 3255/10000 [39:16<1:06:07, 1.70it/s, loss=0.0218, lr=2.06e-05, step=3255] Training: 33%|███▎ | 3256/10000 [39:17<1:13:26, 1.53it/s, loss=0.0218, lr=2.06e-05, step=3255] Training: 33%|███▎ | 3256/10000 [39:17<1:13:26, 1.53it/s, loss=0.0122, lr=2.06e-05, step=3256] Training: 33%|███▎ | 3257/10000 [39:18<1:10:59, 1.58it/s, loss=0.0122, lr=2.06e-05, step=3256] Training: 33%|███▎ | 3257/10000 [39:18<1:10:59, 1.58it/s, loss=0.0158, lr=2.06e-05, step=3257] Training: 33%|███▎ | 3258/10000 [39:18<1:07:12, 1.67it/s, loss=0.0158, lr=2.06e-05, step=3257] Training: 33%|███▎ | 3258/10000 [39:18<1:07:12, 1.67it/s, loss=0.0097, lr=2.06e-05, step=3258] Training: 33%|███▎ | 3259/10000 [39:19<1:04:05, 1.75it/s, loss=0.0097, lr=2.06e-05, step=3258] Training: 33%|███▎ | 3259/10000 [39:19<1:04:05, 1.75it/s, loss=0.1920, lr=2.06e-05, step=3259]16:45:26.348 [I] step=3260 loss=0.0312 smoothed_loss=0.0394 lr=2.06e-05 grad_norm=0.5607 step_time=0.5189s data_time=0.0630s it/s=1.718 eta_to_10000=3922.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0113 grad_action_out_proj=0.1350 grad_shared_expert=0.4775 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3260/10000 [39:19<1:02:59, 1.78it/s, loss=0.1920, lr=2.06e-05, step=3259] Training: 33%|███▎ | 3260/10000 [39:19<1:02:59, 1.78it/s, loss=0.0312, lr=2.06e-05, step=3260] Training: 33%|███▎ | 3261/10000 [39:20<1:09:58, 1.61it/s, loss=0.0312, lr=2.06e-05, step=3260] Training: 33%|███▎ | 3261/10000 [39:20<1:09:58, 1.61it/s, loss=0.0126, lr=2.06e-05, step=3261] Training: 33%|███▎ | 3262/10000 [39:21<1:07:57, 1.65it/s, loss=0.0126, lr=2.06e-05, step=3261] Training: 33%|███▎ | 3262/10000 [39:21<1:07:57, 1.65it/s, loss=0.0376, lr=2.06e-05, step=3262] Training: 33%|███▎ | 3263/10000 [39:21<1:12:15, 1.55it/s, loss=0.0376, lr=2.06e-05, step=3262] Training: 33%|███▎ | 3263/10000 [39:21<1:12:15, 1.55it/s, loss=0.0078, lr=2.06e-05, step=3263] Training: 33%|███▎ | 3264/10000 [39:22<1:09:38, 1.61it/s, loss=0.0078, lr=2.06e-05, step=3263] Training: 33%|███▎ | 3264/10000 [39:22<1:09:38, 1.61it/s, loss=0.0241, lr=2.06e-05, step=3264] Training: 33%|███▎ | 3265/10000 [39:23<1:16:48, 1.46it/s, loss=0.0241, lr=2.06e-05, step=3264] Training: 33%|███▎ | 3265/10000 [39:23<1:16:48, 1.46it/s, loss=0.1143, lr=2.06e-05, step=3265] Training: 33%|███▎ | 3266/10000 [39:23<1:12:21, 1.55it/s, loss=0.1143, lr=2.06e-05, step=3265] Training: 33%|███▎ | 3266/10000 [39:23<1:12:21, 1.55it/s, loss=0.0265, lr=2.06e-05, step=3266] Training: 33%|███▎ | 3267/10000 [39:24<1:09:45, 1.61it/s, loss=0.0265, lr=2.06e-05, step=3266] Training: 33%|███▎ | 3267/10000 [39:24<1:09:45, 1.61it/s, loss=0.0148, lr=2.06e-05, step=3267] Training: 33%|███▎ | 3268/10000 [39:25<1:16:31, 1.47it/s, loss=0.0148, lr=2.06e-05, step=3267] Training: 33%|███▎ | 3268/10000 [39:25<1:16:31, 1.47it/s, loss=0.0107, lr=2.06e-05, step=3268] Training: 33%|███▎ | 3269/10000 [39:25<1:11:40, 1.57it/s, loss=0.0107, lr=2.06e-05, step=3268] Training: 33%|███▎ | 3269/10000 [39:25<1:11:40, 1.57it/s, loss=0.0651, lr=2.06e-05, step=3269]16:45:33.033 [I] step=3270 loss=0.0193 smoothed_loss=0.0357 lr=2.06e-05 grad_norm=0.6796 step_time=0.5929s data_time=0.0756s it/s=1.496 eta_to_10000=4498.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0088 grad_action_out_proj=0.1312 grad_shared_expert=0.3530 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3270/10000 [39:26<1:15:01, 1.49it/s, loss=0.0651, lr=2.06e-05, step=3269] Training: 33%|███▎ | 3270/10000 [39:26<1:15:01, 1.49it/s, loss=0.0193, lr=2.06e-05, step=3270] Training: 33%|███▎ | 3271/10000 [39:27<1:10:39, 1.59it/s, loss=0.0193, lr=2.06e-05, step=3270] Training: 33%|███▎ | 3271/10000 [39:27<1:10:39, 1.59it/s, loss=0.0116, lr=2.06e-05, step=3271] Training: 33%|███▎ | 3272/10000 [39:27<1:08:52, 1.63it/s, loss=0.0116, lr=2.06e-05, step=3271] Training: 33%|███▎ | 3272/10000 [39:27<1:08:52, 1.63it/s, loss=0.0347, lr=2.06e-05, step=3272] Training: 33%|███▎ | 3273/10000 [39:28<1:05:50, 1.70it/s, loss=0.0347, lr=2.06e-05, step=3272] Training: 33%|███▎ | 3273/10000 [39:28<1:05:50, 1.70it/s, loss=0.0172, lr=2.06e-05, step=3273] Training: 33%|███▎ | 3274/10000 [39:28<1:03:19, 1.77it/s, loss=0.0172, lr=2.06e-05, step=3273] Training: 33%|███▎ | 3274/10000 [39:28<1:03:19, 1.77it/s, loss=0.0085, lr=2.06e-05, step=3274] Training: 33%|███▎ | 3275/10000 [39:29<1:12:31, 1.55it/s, loss=0.0085, lr=2.06e-05, step=3274] Training: 33%|███▎ | 3275/10000 [39:29<1:12:31, 1.55it/s, loss=0.0194, lr=2.06e-05, step=3275] Training: 33%|███▎ | 3276/10000 [39:30<1:08:53, 1.63it/s, loss=0.0194, lr=2.06e-05, step=3275] Training: 33%|███▎ | 3276/10000 [39:30<1:08:53, 1.63it/s, loss=0.0271, lr=2.06e-05, step=3276] Training: 33%|███▎ | 3277/10000 [39:31<1:17:48, 1.44it/s, loss=0.0271, lr=2.06e-05, step=3276] Training: 33%|███▎ | 3277/10000 [39:31<1:17:48, 1.44it/s, loss=0.0216, lr=2.06e-05, step=3277] Training: 33%|███▎ | 3278/10000 [39:31<1:12:23, 1.55it/s, loss=0.0216, lr=2.06e-05, step=3277] Training: 33%|███▎ | 3278/10000 [39:31<1:12:23, 1.55it/s, loss=0.0093, lr=2.06e-05, step=3278] Training: 33%|███▎ | 3279/10000 [39:32<1:08:17, 1.64it/s, loss=0.0093, lr=2.06e-05, step=3278] Training: 33%|███▎ | 3279/10000 [39:32<1:08:17, 1.64it/s, loss=0.0356, lr=2.06e-05, step=3279]16:45:39.111 [I] step=3280 loss=0.0249 smoothed_loss=0.0266 lr=2.06e-05 grad_norm=0.5088 step_time=0.5355s data_time=0.0722s it/s=1.646 eta_to_10000=4083.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0144 grad_action_out_proj=0.1657 grad_shared_expert=0.4276 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3280/10000 [39:32<1:08:11, 1.64it/s, loss=0.0356, lr=2.06e-05, step=3279] Training: 33%|███▎ | 3280/10000 [39:32<1:08:11, 1.64it/s, loss=0.0249, lr=2.06e-05, step=3280] Training: 33%|███▎ | 3281/10000 [39:33<1:05:11, 1.72it/s, loss=0.0249, lr=2.06e-05, step=3280] Training: 33%|███▎ | 3281/10000 [39:33<1:05:11, 1.72it/s, loss=0.0144, lr=2.06e-05, step=3281] Training: 33%|███▎ | 3282/10000 [39:34<1:18:12, 1.43it/s, loss=0.0144, lr=2.06e-05, step=3281] Training: 33%|███▎ | 3282/10000 [39:34<1:18:12, 1.43it/s, loss=0.0199, lr=2.06e-05, step=3282] Training: 33%|███▎ | 3283/10000 [39:34<1:12:10, 1.55it/s, loss=0.0199, lr=2.06e-05, step=3282] Training: 33%|███▎ | 3283/10000 [39:34<1:12:10, 1.55it/s, loss=0.0134, lr=2.06e-05, step=3283] Training: 33%|███▎ | 3284/10000 [39:35<1:14:39, 1.50it/s, loss=0.0134, lr=2.06e-05, step=3283] Training: 33%|███▎ | 3284/10000 [39:35<1:14:39, 1.50it/s, loss=0.0126, lr=2.06e-05, step=3284] Training: 33%|███▎ | 3285/10000 [39:35<1:09:49, 1.60it/s, loss=0.0126, lr=2.06e-05, step=3284] Training: 33%|███▎ | 3285/10000 [39:35<1:09:49, 1.60it/s, loss=0.0118, lr=2.06e-05, step=3285] Training: 33%|███▎ | 3286/10000 [39:36<1:06:12, 1.69it/s, loss=0.0118, lr=2.06e-05, step=3285] Training: 33%|███▎ | 3286/10000 [39:36<1:06:12, 1.69it/s, loss=0.0108, lr=2.06e-05, step=3286] Training: 33%|███▎ | 3287/10000 [39:37<1:07:11, 1.67it/s, loss=0.0108, lr=2.06e-05, step=3286] Training: 33%|███▎ | 3287/10000 [39:37<1:07:11, 1.67it/s, loss=0.0044, lr=2.06e-05, step=3287] Training: 33%|███▎ | 3288/10000 [39:37<1:03:57, 1.75it/s, loss=0.0044, lr=2.06e-05, step=3287] Training: 33%|███▎ | 3288/10000 [39:37<1:03:57, 1.75it/s, loss=0.0480, lr=2.06e-05, step=3288] Training: 33%|███▎ | 3289/10000 [39:38<1:10:14, 1.59it/s, loss=0.0480, lr=2.06e-05, step=3288] Training: 33%|███▎ | 3289/10000 [39:38<1:10:14, 1.59it/s, loss=0.0221, lr=2.05e-05, step=3289]16:45:45.504 [I] step=3290 loss=0.0421 smoothed_loss=0.0238 lr=2.06e-05 grad_norm=0.5828 step_time=0.5588s data_time=0.0804s it/s=1.564 eta_to_10000=4289.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.1628 grad_shared_expert=0.6979 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3290/10000 [39:39<1:14:05, 1.51it/s, loss=0.0221, lr=2.05e-05, step=3289] Training: 33%|███▎ | 3290/10000 [39:39<1:14:05, 1.51it/s, loss=0.0421, lr=2.05e-05, step=3290] Training: 33%|███▎ | 3291/10000 [39:40<1:28:09, 1.27it/s, loss=0.0421, lr=2.05e-05, step=3290] Training: 33%|███▎ | 3291/10000 [39:40<1:28:09, 1.27it/s, loss=0.0360, lr=2.05e-05, step=3291] Training: 33%|███▎ | 3292/10000 [39:40<1:29:09, 1.25it/s, loss=0.0360, lr=2.05e-05, step=3291] Training: 33%|███▎ | 3292/10000 [39:40<1:29:09, 1.25it/s, loss=0.0131, lr=2.05e-05, step=3292] Training: 33%|███▎ | 3293/10000 [39:41<1:23:00, 1.35it/s, loss=0.0131, lr=2.05e-05, step=3292] Training: 33%|███▎ | 3293/10000 [39:41<1:23:00, 1.35it/s, loss=0.0163, lr=2.05e-05, step=3293] Training: 33%|███▎ | 3294/10000 [39:42<1:16:28, 1.46it/s, loss=0.0163, lr=2.05e-05, step=3293] Training: 33%|███▎ | 3294/10000 [39:42<1:16:28, 1.46it/s, loss=0.0074, lr=2.05e-05, step=3294] Training: 33%|███▎ | 3295/10000 [39:42<1:10:44, 1.58it/s, loss=0.0074, lr=2.05e-05, step=3294] Training: 33%|███▎ | 3295/10000 [39:42<1:10:44, 1.58it/s, loss=0.0029, lr=2.05e-05, step=3295] Training: 33%|███▎ | 3296/10000 [39:43<1:07:51, 1.65it/s, loss=0.0029, lr=2.05e-05, step=3295] Training: 33%|███▎ | 3296/10000 [39:43<1:07:51, 1.65it/s, loss=0.1156, lr=2.05e-05, step=3296] Training: 33%|███▎ | 3297/10000 [39:44<1:15:25, 1.48it/s, loss=0.1156, lr=2.05e-05, step=3296] Training: 33%|███▎ | 3297/10000 [39:44<1:15:25, 1.48it/s, loss=0.0042, lr=2.05e-05, step=3297] Training: 33%|███▎ | 3298/10000 [39:44<1:21:34, 1.37it/s, loss=0.0042, lr=2.05e-05, step=3297] Training: 33%|███▎ | 3298/10000 [39:44<1:21:34, 1.37it/s, loss=0.0243, lr=2.05e-05, step=3298] Training: 33%|███▎ | 3299/10000 [39:45<1:18:38, 1.42it/s, loss=0.0243, lr=2.05e-05, step=3298] Training: 33%|███▎ | 3299/10000 [39:45<1:18:38, 1.42it/s, loss=0.0052, lr=2.05e-05, step=3299]16:45:52.806 [I] step=3300 loss=0.0102 smoothed_loss=0.0230 lr=2.05e-05 grad_norm=0.4996 step_time=0.6248s data_time=0.1053s it/s=1.370 eta_to_10000=4889.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0466 grad_action_out_proj=0.2025 grad_shared_expert=0.6179 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3300/10000 [39:46<1:23:11, 1.34it/s, loss=0.0052, lr=2.05e-05, step=3299] Training: 33%|███▎ | 3300/10000 [39:46<1:23:11, 1.34it/s, loss=0.0102, lr=2.05e-05, step=3300] Training: 33%|███▎ | 3301/10000 [39:46<1:16:53, 1.45it/s, loss=0.0102, lr=2.05e-05, step=3300] Training: 33%|███▎ | 3301/10000 [39:46<1:16:53, 1.45it/s, loss=0.0203, lr=2.05e-05, step=3301] Training: 33%|███▎ | 3302/10000 [39:47<1:12:13, 1.55it/s, loss=0.0203, lr=2.05e-05, step=3301] Training: 33%|███▎ | 3302/10000 [39:47<1:12:13, 1.55it/s, loss=0.0343, lr=2.05e-05, step=3302] Training: 33%|███▎ | 3303/10000 [39:47<1:07:38, 1.65it/s, loss=0.0343, lr=2.05e-05, step=3302] Training: 33%|███▎ | 3303/10000 [39:47<1:07:38, 1.65it/s, loss=0.0065, lr=2.05e-05, step=3303] Training: 33%|███▎ | 3304/10000 [39:48<1:14:29, 1.50it/s, loss=0.0065, lr=2.05e-05, step=3303] Training: 33%|███▎ | 3304/10000 [39:48<1:14:29, 1.50it/s, loss=0.0276, lr=2.05e-05, step=3304] Training: 33%|███▎ | 3305/10000 [39:49<1:16:33, 1.46it/s, loss=0.0276, lr=2.05e-05, step=3304] Training: 33%|███▎ | 3305/10000 [39:49<1:16:33, 1.46it/s, loss=0.0189, lr=2.05e-05, step=3305] Training: 33%|███▎ | 3306/10000 [39:50<1:23:55, 1.33it/s, loss=0.0189, lr=2.05e-05, step=3305] Training: 33%|███▎ | 3306/10000 [39:50<1:23:55, 1.33it/s, loss=0.0091, lr=2.05e-05, step=3306] Training: 33%|███▎ | 3307/10000 [39:50<1:16:13, 1.46it/s, loss=0.0091, lr=2.05e-05, step=3306] Training: 33%|███▎ | 3307/10000 [39:50<1:16:13, 1.46it/s, loss=0.0162, lr=2.05e-05, step=3307] Training: 33%|███▎ | 3308/10000 [39:51<1:12:44, 1.53it/s, loss=0.0162, lr=2.05e-05, step=3307] Training: 33%|███▎ | 3308/10000 [39:51<1:12:44, 1.53it/s, loss=0.0235, lr=2.05e-05, step=3308] Training: 33%|███▎ | 3309/10000 [39:52<1:15:33, 1.48it/s, loss=0.0235, lr=2.05e-05, step=3308] Training: 33%|███▎ | 3309/10000 [39:52<1:15:33, 1.48it/s, loss=0.0183, lr=2.05e-05, step=3309]16:45:59.479 [I] step=3310 loss=0.0118 smoothed_loss=0.0197 lr=2.05e-05 grad_norm=0.5313 step_time=0.5615s data_time=0.1059s it/s=1.499 eta_to_10000=4463.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0289 grad_action_out_proj=0.2221 grad_shared_expert=0.5235 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3310/10000 [39:53<1:18:36, 1.42it/s, loss=0.0183, lr=2.05e-05, step=3309] Training: 33%|███▎ | 3310/10000 [39:53<1:18:36, 1.42it/s, loss=0.0118, lr=2.05e-05, step=3310] Training: 33%|███▎ | 3311/10000 [39:53<1:21:24, 1.37it/s, loss=0.0118, lr=2.05e-05, step=3310] Training: 33%|███▎ | 3311/10000 [39:53<1:21:24, 1.37it/s, loss=0.0036, lr=2.05e-05, step=3311] Training: 33%|███▎ | 3312/10000 [39:54<1:29:22, 1.25it/s, loss=0.0036, lr=2.05e-05, step=3311] Training: 33%|███▎ | 3312/10000 [39:54<1:29:22, 1.25it/s, loss=0.0644, lr=2.05e-05, step=3312] Training: 33%|███▎ | 3313/10000 [39:55<1:30:24, 1.23it/s, loss=0.0644, lr=2.05e-05, step=3312] Training: 33%|███▎ | 3313/10000 [39:55<1:30:24, 1.23it/s, loss=0.0328, lr=2.05e-05, step=3313] Training: 33%|███▎ | 3314/10000 [39:56<1:30:58, 1.22it/s, loss=0.0328, lr=2.05e-05, step=3313] Training: 33%|███▎ | 3314/10000 [39:56<1:30:58, 1.22it/s, loss=0.0161, lr=2.05e-05, step=3314] Training: 33%|███▎ | 3315/10000 [39:57<1:28:02, 1.27it/s, loss=0.0161, lr=2.05e-05, step=3314] Training: 33%|███▎ | 3315/10000 [39:57<1:28:02, 1.27it/s, loss=0.0118, lr=2.05e-05, step=3315] Training: 33%|███▎ | 3316/10000 [39:58<1:29:24, 1.25it/s, loss=0.0118, lr=2.05e-05, step=3315] Training: 33%|███▎ | 3316/10000 [39:58<1:29:24, 1.25it/s, loss=0.0152, lr=2.05e-05, step=3316] Training: 33%|███▎ | 3317/10000 [39:58<1:25:37, 1.30it/s, loss=0.0152, lr=2.05e-05, step=3316] Training: 33%|███▎ | 3317/10000 [39:58<1:25:37, 1.30it/s, loss=0.0374, lr=2.05e-05, step=3317] Training: 33%|███▎ | 3318/10000 [39:59<1:30:16, 1.23it/s, loss=0.0374, lr=2.05e-05, step=3317] Training: 33%|███▎ | 3318/10000 [39:59<1:30:16, 1.23it/s, loss=0.0188, lr=2.05e-05, step=3318] Training: 33%|███▎ | 3319/10000 [40:00<1:35:33, 1.17it/s, loss=0.0188, lr=2.05e-05, step=3318] Training: 33%|███▎ | 3319/10000 [40:00<1:35:33, 1.17it/s, loss=0.0029, lr=2.05e-05, step=3319]16:46:07.829 [I] step=3320 loss=0.0216 smoothed_loss=0.0206 lr=2.05e-05 grad_norm=0.5285 step_time=0.6788s data_time=0.1562s it/s=1.198 eta_to_10000=5577.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0177 grad_action_out_proj=0.2167 grad_shared_expert=0.5344 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3320/10000 [40:01<1:33:43, 1.19it/s, loss=0.0029, lr=2.05e-05, step=3319] Training: 33%|███▎ | 3320/10000 [40:01<1:33:43, 1.19it/s, loss=0.0216, lr=2.05e-05, step=3320] Training: 33%|███▎ | 3321/10000 [40:02<1:31:54, 1.21it/s, loss=0.0216, lr=2.05e-05, step=3320] Training: 33%|███▎ | 3321/10000 [40:02<1:31:54, 1.21it/s, loss=0.0157, lr=2.05e-05, step=3321] Training: 33%|███▎ | 3322/10000 [40:02<1:23:04, 1.34it/s, loss=0.0157, lr=2.05e-05, step=3321] Training: 33%|███▎ | 3322/10000 [40:02<1:23:04, 1.34it/s, loss=0.0146, lr=2.04e-05, step=3322] Training: 33%|███▎ | 3323/10000 [40:03<1:17:06, 1.44it/s, loss=0.0146, lr=2.04e-05, step=3322] Training: 33%|███▎ | 3323/10000 [40:03<1:17:06, 1.44it/s, loss=0.0097, lr=2.04e-05, step=3323] Training: 33%|███▎ | 3324/10000 [40:04<1:19:09, 1.41it/s, loss=0.0097, lr=2.04e-05, step=3323] Training: 33%|███▎ | 3324/10000 [40:04<1:19:09, 1.41it/s, loss=0.0081, lr=2.04e-05, step=3324] Training: 33%|███▎ | 3325/10000 [40:05<1:27:07, 1.28it/s, loss=0.0081, lr=2.04e-05, step=3324] Training: 33%|███▎ | 3325/10000 [40:05<1:27:07, 1.28it/s, loss=0.0501, lr=2.04e-05, step=3325] Training: 33%|███▎ | 3326/10000 [40:05<1:25:40, 1.30it/s, loss=0.0501, lr=2.04e-05, step=3325] Training: 33%|███▎ | 3326/10000 [40:05<1:25:40, 1.30it/s, loss=0.0181, lr=2.04e-05, step=3326] Training: 33%|███▎ | 3327/10000 [40:06<1:25:36, 1.30it/s, loss=0.0181, lr=2.04e-05, step=3326] Training: 33%|███▎ | 3327/10000 [40:06<1:25:36, 1.30it/s, loss=0.0217, lr=2.04e-05, step=3327] Training: 33%|███▎ | 3328/10000 [40:07<1:20:03, 1.39it/s, loss=0.0217, lr=2.04e-05, step=3327] Training: 33%|███▎ | 3328/10000 [40:07<1:20:03, 1.39it/s, loss=0.0160, lr=2.04e-05, step=3328] Training: 33%|███▎ | 3329/10000 [40:07<1:22:30, 1.35it/s, loss=0.0160, lr=2.04e-05, step=3328] Training: 33%|███▎ | 3329/10000 [40:07<1:22:30, 1.35it/s, loss=0.0207, lr=2.04e-05, step=3329]16:46:15.046 [I] step=3330 loss=0.0063 smoothed_loss=0.0188 lr=2.04e-05 grad_norm=0.4614 step_time=0.5972s data_time=0.1248s it/s=1.386 eta_to_10000=4812.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0209 grad_action_out_proj=0.1670 grad_shared_expert=0.5066 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3330/10000 [40:08<1:20:42, 1.38it/s, loss=0.0207, lr=2.04e-05, step=3329] Training: 33%|███▎ | 3330/10000 [40:08<1:20:42, 1.38it/s, loss=0.0063, lr=2.04e-05, step=3330] Training: 33%|███▎ | 3331/10000 [40:09<1:17:45, 1.43it/s, loss=0.0063, lr=2.04e-05, step=3330] Training: 33%|███▎ | 3331/10000 [40:09<1:17:45, 1.43it/s, loss=0.0315, lr=2.04e-05, step=3331] Training: 33%|███▎ | 3332/10000 [40:10<1:42:23, 1.09it/s, loss=0.0315, lr=2.04e-05, step=3331] Training: 33%|███▎ | 3332/10000 [40:10<1:42:23, 1.09it/s, loss=0.0150, lr=2.04e-05, step=3332] Training: 33%|███▎ | 3333/10000 [40:11<1:34:25, 1.18it/s, loss=0.0150, lr=2.04e-05, step=3332] Training: 33%|███▎ | 3333/10000 [40:11<1:34:25, 1.18it/s, loss=0.0126, lr=2.04e-05, step=3333] Training: 33%|███▎ | 3334/10000 [40:12<1:35:18, 1.17it/s, loss=0.0126, lr=2.04e-05, step=3333] Training: 33%|███▎ | 3334/10000 [40:12<1:35:18, 1.17it/s, loss=0.0074, lr=2.04e-05, step=3334] Training: 33%|███▎ | 3335/10000 [40:13<1:33:38, 1.19it/s, loss=0.0074, lr=2.04e-05, step=3334] Training: 33%|███▎ | 3335/10000 [40:13<1:33:38, 1.19it/s, loss=0.0100, lr=2.04e-05, step=3335] Training: 33%|███▎ | 3336/10000 [40:14<1:41:39, 1.09it/s, loss=0.0100, lr=2.04e-05, step=3335] Training: 33%|███▎ | 3336/10000 [40:14<1:41:39, 1.09it/s, loss=0.0201, lr=2.04e-05, step=3336] Training: 33%|███▎ | 3337/10000 [40:14<1:38:03, 1.13it/s, loss=0.0201, lr=2.04e-05, step=3336] Training: 33%|███▎ | 3337/10000 [40:14<1:38:03, 1.13it/s, loss=0.0350, lr=2.04e-05, step=3337] Training: 33%|███▎ | 3338/10000 [40:15<1:31:09, 1.22it/s, loss=0.0350, lr=2.04e-05, step=3337] Training: 33%|███▎ | 3338/10000 [40:15<1:31:09, 1.22it/s, loss=0.0134, lr=2.04e-05, step=3338] Training: 33%|███▎ | 3339/10000 [40:17<1:50:32, 1.00it/s, loss=0.0134, lr=2.04e-05, step=3338] Training: 33%|███▎ | 3339/10000 [40:17<1:50:32, 1.00it/s, loss=0.0122, lr=2.04e-05, step=3339]16:46:24.709 [I] step=3340 loss=0.0447 smoothed_loss=0.0205 lr=2.04e-05 grad_norm=0.5612 step_time=0.7193s data_time=0.2468s it/s=1.035 eta_to_10000=6434.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0193 grad_action_out_proj=0.2252 grad_shared_expert=1.4583 (10775:train_pytorch.py:850) + Training: 33%|███▎ | 3340/10000 [40:18<1:59:04, 1.07s/it, loss=0.0122, lr=2.04e-05, step=3339] Training: 33%|███▎ | 3340/10000 [40:18<1:59:04, 1.07s/it, loss=0.0447, lr=2.04e-05, step=3340] Training: 33%|███▎ | 3341/10000 [40:18<1:45:28, 1.05it/s, loss=0.0447, lr=2.04e-05, step=3340] Training: 33%|███▎ | 3341/10000 [40:18<1:45:28, 1.05it/s, loss=0.0174, lr=2.04e-05, step=3341] Training: 33%|███▎ | 3342/10000 [40:19<1:43:29, 1.07it/s, loss=0.0174, lr=2.04e-05, step=3341] Training: 33%|███▎ | 3342/10000 [40:19<1:43:29, 1.07it/s, loss=0.0136, lr=2.04e-05, step=3342] Training: 33%|███▎ | 3343/10000 [40:20<1:40:15, 1.11it/s, loss=0.0136, lr=2.04e-05, step=3342] Training: 33%|███▎ | 3343/10000 [40:20<1:40:15, 1.11it/s, loss=0.0297, lr=2.04e-05, step=3343] Training: 33%|███▎ | 3344/10000 [40:21<1:31:10, 1.22it/s, loss=0.0297, lr=2.04e-05, step=3343] Training: 33%|███▎ | 3344/10000 [40:21<1:31:10, 1.22it/s, loss=0.0082, lr=2.04e-05, step=3344] Training: 33%|███▎ | 3345/10000 [40:21<1:22:54, 1.34it/s, loss=0.0082, lr=2.04e-05, step=3344] Training: 33%|███▎ | 3345/10000 [40:21<1:22:54, 1.34it/s, loss=0.0271, lr=2.04e-05, step=3345] Training: 33%|███▎ | 3346/10000 [40:22<1:21:52, 1.35it/s, loss=0.0271, lr=2.04e-05, step=3345] Training: 33%|███▎ | 3346/10000 [40:22<1:21:52, 1.35it/s, loss=0.0286, lr=2.04e-05, step=3346] Training: 33%|███▎ | 3347/10000 [40:23<1:30:48, 1.22it/s, loss=0.0286, lr=2.04e-05, step=3346] Training: 33%|███▎ | 3347/10000 [40:23<1:30:48, 1.22it/s, loss=0.0103, lr=2.04e-05, step=3347] Training: 33%|███▎ | 3348/10000 [40:24<1:30:15, 1.23it/s, loss=0.0103, lr=2.04e-05, step=3347] Training: 33%|███▎ | 3348/10000 [40:24<1:30:15, 1.23it/s, loss=0.0179, lr=2.04e-05, step=3348] Training: 33%|███▎ | 3349/10000 [40:25<1:33:05, 1.19it/s, loss=0.0179, lr=2.04e-05, step=3348] Training: 33%|███▎ | 3349/10000 [40:25<1:33:05, 1.19it/s, loss=0.0111, lr=2.04e-05, step=3349]16:46:32.557 [I] step=3350 loss=0.0114 smoothed_loss=0.0181 lr=2.04e-05 grad_norm=0.4661 step_time=0.6558s data_time=0.1291s it/s=1.275 eta_to_10000=5214.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0073 grad_action_out_proj=0.1070 grad_shared_expert=0.2427 (10775:train_pytorch.py:850) + Training: 34%|███▎ | 3350/10000 [40:26<1:32:24, 1.20it/s, loss=0.0111, lr=2.04e-05, step=3349] Training: 34%|███▎ | 3350/10000 [40:26<1:32:24, 1.20it/s, loss=0.0114, lr=2.04e-05, step=3350] Training: 34%|███▎ | 3351/10000 [40:26<1:23:57, 1.32it/s, loss=0.0114, lr=2.04e-05, step=3350] Training: 34%|███▎ | 3351/10000 [40:26<1:23:57, 1.32it/s, loss=0.0144, lr=2.04e-05, step=3351] Training: 34%|███▎ | 3352/10000 [40:27<1:27:20, 1.27it/s, loss=0.0144, lr=2.04e-05, step=3351] Training: 34%|███▎ | 3352/10000 [40:27<1:27:20, 1.27it/s, loss=0.0140, lr=2.04e-05, step=3352] Training: 34%|███▎ | 3353/10000 [40:28<1:25:35, 1.29it/s, loss=0.0140, lr=2.04e-05, step=3352] Training: 34%|███▎ | 3353/10000 [40:28<1:25:35, 1.29it/s, loss=0.0077, lr=2.04e-05, step=3353] Training: 34%|███▎ | 3354/10000 [40:29<1:38:46, 1.12it/s, loss=0.0077, lr=2.04e-05, step=3353] Training: 34%|███▎ | 3354/10000 [40:29<1:38:46, 1.12it/s, loss=0.0114, lr=2.04e-05, step=3354] Training: 34%|███▎ | 3355/10000 [40:30<1:28:43, 1.25it/s, loss=0.0114, lr=2.04e-05, step=3354] Training: 34%|███▎ | 3355/10000 [40:30<1:28:43, 1.25it/s, loss=0.0341, lr=2.04e-05, step=3355] Training: 34%|███▎ | 3356/10000 [40:30<1:31:13, 1.21it/s, loss=0.0341, lr=2.04e-05, step=3355] Training: 34%|███▎ | 3356/10000 [40:30<1:31:13, 1.21it/s, loss=0.0194, lr=2.03e-05, step=3356] Training: 34%|███▎ | 3357/10000 [40:31<1:21:07, 1.36it/s, loss=0.0194, lr=2.03e-05, step=3356] Training: 34%|███▎ | 3357/10000 [40:31<1:21:07, 1.36it/s, loss=0.0057, lr=2.03e-05, step=3357] Training: 34%|███▎ | 3358/10000 [40:32<1:19:09, 1.40it/s, loss=0.0057, lr=2.03e-05, step=3357] Training: 34%|███▎ | 3358/10000 [40:32<1:19:09, 1.40it/s, loss=0.0129, lr=2.03e-05, step=3358] Training: 34%|███▎ | 3359/10000 [40:32<1:16:12, 1.45it/s, loss=0.0129, lr=2.03e-05, step=3358] Training: 34%|███▎ | 3359/10000 [40:32<1:16:12, 1.45it/s, loss=0.0085, lr=2.03e-05, step=3359]16:46:39.823 [I] step=3360 loss=0.0116 smoothed_loss=0.0151 lr=2.03e-05 grad_norm=0.5133 step_time=0.5776s data_time=0.1489s it/s=1.377 eta_to_10000=4822.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0140 grad_action_out_proj=0.1466 grad_shared_expert=0.3393 (10775:train_pytorch.py:850) + Training: 34%|███▎ | 3360/10000 [40:33<1:15:55, 1.46it/s, loss=0.0085, lr=2.03e-05, step=3359] Training: 34%|███▎ | 3360/10000 [40:33<1:15:55, 1.46it/s, loss=0.0116, lr=2.03e-05, step=3360] Training: 34%|███▎ | 3361/10000 [40:34<1:12:40, 1.52it/s, loss=0.0116, lr=2.03e-05, step=3360] Training: 34%|███▎ | 3361/10000 [40:34<1:12:40, 1.52it/s, loss=0.0153, lr=2.03e-05, step=3361] Training: 34%|███▎ | 3362/10000 [40:34<1:21:37, 1.36it/s, loss=0.0153, lr=2.03e-05, step=3361] Training: 34%|███▎ | 3362/10000 [40:34<1:21:37, 1.36it/s, loss=0.0172, lr=2.03e-05, step=3362] Training: 34%|███▎ | 3363/10000 [40:36<1:45:00, 1.05it/s, loss=0.0172, lr=2.03e-05, step=3362] Training: 34%|███▎ | 3363/10000 [40:36<1:45:00, 1.05it/s, loss=0.0222, lr=2.03e-05, step=3363] Training: 34%|███▎ | 3364/10000 [40:37<1:36:44, 1.14it/s, loss=0.0222, lr=2.03e-05, step=3363] Training: 34%|███▎ | 3364/10000 [40:37<1:36:44, 1.14it/s, loss=0.0072, lr=2.03e-05, step=3364] Training: 34%|███▎ | 3365/10000 [40:37<1:29:33, 1.23it/s, loss=0.0072, lr=2.03e-05, step=3364] Training: 34%|███▎ | 3365/10000 [40:37<1:29:33, 1.23it/s, loss=0.0195, lr=2.03e-05, step=3365] Training: 34%|███▎ | 3366/10000 [40:38<1:43:54, 1.06it/s, loss=0.0195, lr=2.03e-05, step=3365] Training: 34%|███▎ | 3366/10000 [40:39<1:43:54, 1.06it/s, loss=0.0284, lr=2.03e-05, step=3366] Training: 34%|███▎ | 3367/10000 [40:39<1:37:47, 1.13it/s, loss=0.0284, lr=2.03e-05, step=3366] Training: 34%|███▎ | 3367/10000 [40:39<1:37:47, 1.13it/s, loss=0.0428, lr=2.03e-05, step=3367] Training: 34%|███▎ | 3368/10000 [40:40<1:35:35, 1.16it/s, loss=0.0428, lr=2.03e-05, step=3367] Training: 34%|███▎ | 3368/10000 [40:40<1:35:35, 1.16it/s, loss=0.0625, lr=2.03e-05, step=3368] Training: 34%|███▎ | 3369/10000 [40:41<1:35:34, 1.16it/s, loss=0.0625, lr=2.03e-05, step=3368] Training: 34%|███▎ | 3369/10000 [40:41<1:35:34, 1.16it/s, loss=0.0103, lr=2.03e-05, step=3369]16:46:48.510 [I] step=3370 loss=0.0289 smoothed_loss=0.0231 lr=2.03e-05 grad_norm=0.6024 step_time=0.6778s data_time=0.1910s it/s=1.158 eta_to_10000=5726.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0126 grad_action_out_proj=0.1367 grad_shared_expert=0.4388 (10775:train_pytorch.py:850) + Training: 34%|███▎ | 3370/10000 [40:42<1:28:20, 1.25it/s, loss=0.0103, lr=2.03e-05, step=3369] Training: 34%|███▎ | 3370/10000 [40:42<1:28:20, 1.25it/s, loss=0.0289, lr=2.03e-05, step=3370] Training: 34%|███▎ | 3371/10000 [40:42<1:23:43, 1.32it/s, loss=0.0289, lr=2.03e-05, step=3370] Training: 34%|███▎ | 3371/10000 [40:42<1:23:43, 1.32it/s, loss=0.0168, lr=2.03e-05, step=3371] Training: 34%|███▎ | 3372/10000 [40:43<1:18:03, 1.42it/s, loss=0.0168, lr=2.03e-05, step=3371] Training: 34%|███▎ | 3372/10000 [40:43<1:18:03, 1.42it/s, loss=0.0207, lr=2.03e-05, step=3372] Training: 34%|███▎ | 3373/10000 [40:44<1:17:52, 1.42it/s, loss=0.0207, lr=2.03e-05, step=3372] Training: 34%|███▎ | 3373/10000 [40:44<1:17:52, 1.42it/s, loss=0.0112, lr=2.03e-05, step=3373] Training: 34%|███▎ | 3374/10000 [40:44<1:12:56, 1.51it/s, loss=0.0112, lr=2.03e-05, step=3373] Training: 34%|███▎ | 3374/10000 [40:44<1:12:56, 1.51it/s, loss=0.0049, lr=2.03e-05, step=3374] Training: 34%|███▍ | 3375/10000 [40:45<1:30:02, 1.23it/s, loss=0.0049, lr=2.03e-05, step=3374] Training: 34%|███▍ | 3375/10000 [40:45<1:30:02, 1.23it/s, loss=0.0186, lr=2.03e-05, step=3375] Training: 34%|███▍ | 3376/10000 [40:46<1:34:34, 1.17it/s, loss=0.0186, lr=2.03e-05, step=3375] Training: 34%|███▍ | 3376/10000 [40:46<1:34:34, 1.17it/s, loss=0.0242, lr=2.03e-05, step=3376] Training: 34%|███▍ | 3377/10000 [40:47<1:33:00, 1.19it/s, loss=0.0242, lr=2.03e-05, step=3376] Training: 34%|███▍ | 3377/10000 [40:47<1:33:00, 1.19it/s, loss=0.0309, lr=2.03e-05, step=3377] Training: 34%|███▍ | 3378/10000 [40:48<1:38:14, 1.12it/s, loss=0.0309, lr=2.03e-05, step=3377] Training: 34%|███▍ | 3378/10000 [40:48<1:38:14, 1.12it/s, loss=0.0178, lr=2.03e-05, step=3378] Training: 34%|███▍ | 3379/10000 [40:49<1:30:13, 1.22it/s, loss=0.0178, lr=2.03e-05, step=3378] Training: 34%|███▍ | 3379/10000 [40:49<1:30:13, 1.22it/s, loss=0.0101, lr=2.03e-05, step=3379]16:46:56.362 [I] step=3380 loss=0.0460 smoothed_loss=0.0223 lr=2.03e-05 grad_norm=0.4573 step_time=0.6349s data_time=0.1504s it/s=1.275 eta_to_10000=5191.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0194 grad_action_out_proj=0.1869 grad_shared_expert=0.3596 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3380/10000 [40:49<1:27:55, 1.25it/s, loss=0.0101, lr=2.03e-05, step=3379] Training: 34%|███▍ | 3380/10000 [40:49<1:27:55, 1.25it/s, loss=0.0460, lr=2.03e-05, step=3380] Training: 34%|███▍ | 3381/10000 [40:50<1:21:29, 1.35it/s, loss=0.0460, lr=2.03e-05, step=3380] Training: 34%|███▍ | 3381/10000 [40:50<1:21:29, 1.35it/s, loss=0.0102, lr=2.03e-05, step=3381] Training: 34%|███▍ | 3382/10000 [40:51<1:20:04, 1.38it/s, loss=0.0102, lr=2.03e-05, step=3381] Training: 34%|███▍ | 3382/10000 [40:51<1:20:04, 1.38it/s, loss=0.0161, lr=2.03e-05, step=3382] Training: 34%|███▍ | 3383/10000 [40:52<1:22:29, 1.34it/s, loss=0.0161, lr=2.03e-05, step=3382] Training: 34%|███▍ | 3383/10000 [40:52<1:22:29, 1.34it/s, loss=0.0277, lr=2.03e-05, step=3383] Training: 34%|███▍ | 3384/10000 [40:52<1:24:14, 1.31it/s, loss=0.0277, lr=2.03e-05, step=3383] Training: 34%|███▍ | 3384/10000 [40:52<1:24:14, 1.31it/s, loss=0.0093, lr=2.03e-05, step=3384] Training: 34%|███▍ | 3385/10000 [40:53<1:15:51, 1.45it/s, loss=0.0093, lr=2.03e-05, step=3384] Training: 34%|███▍ | 3385/10000 [40:53<1:15:51, 1.45it/s, loss=0.0137, lr=2.03e-05, step=3385] Training: 34%|███▍ | 3386/10000 [40:53<1:12:32, 1.52it/s, loss=0.0137, lr=2.03e-05, step=3385] Training: 34%|███▍ | 3386/10000 [40:53<1:12:32, 1.52it/s, loss=0.0088, lr=2.03e-05, step=3386] Training: 34%|███▍ | 3387/10000 [40:54<1:23:14, 1.32it/s, loss=0.0088, lr=2.03e-05, step=3386] Training: 34%|███▍ | 3387/10000 [40:54<1:23:14, 1.32it/s, loss=0.0281, lr=2.03e-05, step=3387] Training: 34%|███▍ | 3388/10000 [40:55<1:22:19, 1.34it/s, loss=0.0281, lr=2.03e-05, step=3387] Training: 34%|███▍ | 3388/10000 [40:55<1:22:19, 1.34it/s, loss=0.0170, lr=2.03e-05, step=3388] Training: 34%|███▍ | 3389/10000 [40:56<1:23:25, 1.32it/s, loss=0.0170, lr=2.03e-05, step=3388] Training: 34%|███▍ | 3389/10000 [40:56<1:23:25, 1.32it/s, loss=0.0098, lr=2.02e-05, step=3389]16:47:03.980 [I] step=3390 loss=0.0091 smoothed_loss=0.0173 lr=2.03e-05 grad_norm=0.5114 step_time=0.6088s data_time=0.1529s it/s=1.313 eta_to_10000=5034.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0187 grad_action_out_proj=0.2386 grad_shared_expert=0.6892 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3390/10000 [40:57<1:35:40, 1.15it/s, loss=0.0098, lr=2.02e-05, step=3389] Training: 34%|███▍ | 3390/10000 [40:57<1:35:40, 1.15it/s, loss=0.0091, lr=2.02e-05, step=3390] Training: 34%|███▍ | 3391/10000 [40:58<1:33:32, 1.18it/s, loss=0.0091, lr=2.02e-05, step=3390] Training: 34%|███▍ | 3391/10000 [40:58<1:33:32, 1.18it/s, loss=0.0212, lr=2.02e-05, step=3391] Training: 34%|███▍ | 3392/10000 [40:58<1:23:08, 1.32it/s, loss=0.0212, lr=2.02e-05, step=3391] Training: 34%|███▍ | 3392/10000 [40:58<1:23:08, 1.32it/s, loss=0.0104, lr=2.02e-05, step=3392] Training: 34%|███▍ | 3393/10000 [40:59<1:16:41, 1.44it/s, loss=0.0104, lr=2.02e-05, step=3392] Training: 34%|███▍ | 3393/10000 [40:59<1:16:41, 1.44it/s, loss=0.0045, lr=2.02e-05, step=3393] Training: 34%|███▍ | 3394/10000 [41:00<1:12:12, 1.52it/s, loss=0.0045, lr=2.02e-05, step=3393] Training: 34%|███▍ | 3394/10000 [41:00<1:12:12, 1.52it/s, loss=0.0443, lr=2.02e-05, step=3394] Training: 34%|███▍ | 3395/10000 [41:00<1:11:24, 1.54it/s, loss=0.0443, lr=2.02e-05, step=3394] Training: 34%|███▍ | 3395/10000 [41:00<1:11:24, 1.54it/s, loss=0.0078, lr=2.02e-05, step=3395] Training: 34%|███▍ | 3396/10000 [41:01<1:14:20, 1.48it/s, loss=0.0078, lr=2.02e-05, step=3395] Training: 34%|███▍ | 3396/10000 [41:01<1:14:20, 1.48it/s, loss=0.0152, lr=2.02e-05, step=3396] Training: 34%|███▍ | 3397/10000 [41:02<1:20:17, 1.37it/s, loss=0.0152, lr=2.02e-05, step=3396] Training: 34%|███▍ | 3397/10000 [41:02<1:20:17, 1.37it/s, loss=0.0051, lr=2.02e-05, step=3397] Training: 34%|███▍ | 3398/10000 [41:02<1:16:46, 1.43it/s, loss=0.0051, lr=2.02e-05, step=3397] Training: 34%|███▍ | 3398/10000 [41:02<1:16:46, 1.43it/s, loss=0.0370, lr=2.02e-05, step=3398] Training: 34%|███▍ | 3399/10000 [41:03<1:26:28, 1.27it/s, loss=0.0370, lr=2.02e-05, step=3398] Training: 34%|███▍ | 3399/10000 [41:03<1:26:28, 1.27it/s, loss=0.0074, lr=2.02e-05, step=3399]16:47:11.012 [I] step=3400 loss=0.0133 smoothed_loss=0.0167 lr=2.02e-05 grad_norm=0.4845 step_time=0.5894s data_time=0.1138s it/s=1.422 eta_to_10000=4640.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.1142 grad_shared_expert=0.3280 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3400/10000 [41:04<1:24:39, 1.30it/s, loss=0.0074, lr=2.02e-05, step=3399] Training: 34%|███▍ | 3400/10000 [41:04<1:24:39, 1.30it/s, loss=0.0133, lr=2.02e-05, step=3400] Training: 34%|███▍ | 3401/10000 [41:05<1:18:24, 1.40it/s, loss=0.0133, lr=2.02e-05, step=3400] Training: 34%|███▍ | 3401/10000 [41:05<1:18:24, 1.40it/s, loss=0.0292, lr=2.02e-05, step=3401] Training: 34%|███▍ | 3402/10000 [41:05<1:18:47, 1.40it/s, loss=0.0292, lr=2.02e-05, step=3401] Training: 34%|███▍ | 3402/10000 [41:05<1:18:47, 1.40it/s, loss=0.1248, lr=2.02e-05, step=3402] Training: 34%|███▍ | 3403/10000 [41:06<1:17:52, 1.41it/s, loss=0.1248, lr=2.02e-05, step=3402] Training: 34%|███▍ | 3403/10000 [41:06<1:17:52, 1.41it/s, loss=0.0321, lr=2.02e-05, step=3403] Training: 34%|███▍ | 3404/10000 [41:07<1:22:56, 1.33it/s, loss=0.0321, lr=2.02e-05, step=3403] Training: 34%|███▍ | 3404/10000 [41:07<1:22:56, 1.33it/s, loss=0.0116, lr=2.02e-05, step=3404] Training: 34%|███▍ | 3405/10000 [41:08<1:16:43, 1.43it/s, loss=0.0116, lr=2.02e-05, step=3404] Training: 34%|███▍ | 3405/10000 [41:08<1:16:43, 1.43it/s, loss=0.0125, lr=2.02e-05, step=3405] Training: 34%|███▍ | 3406/10000 [41:08<1:20:15, 1.37it/s, loss=0.0125, lr=2.02e-05, step=3405] Training: 34%|███▍ | 3406/10000 [41:08<1:20:15, 1.37it/s, loss=0.0063, lr=2.02e-05, step=3406] Training: 34%|███▍ | 3407/10000 [41:09<1:18:28, 1.40it/s, loss=0.0063, lr=2.02e-05, step=3406] Training: 34%|███▍ | 3407/10000 [41:09<1:18:28, 1.40it/s, loss=0.0147, lr=2.02e-05, step=3407] Training: 34%|███▍ | 3408/10000 [41:10<1:19:16, 1.39it/s, loss=0.0147, lr=2.02e-05, step=3407] Training: 34%|███▍ | 3408/10000 [41:10<1:19:16, 1.39it/s, loss=0.0112, lr=2.02e-05, step=3408] Training: 34%|███▍ | 3409/10000 [41:10<1:13:21, 1.50it/s, loss=0.0112, lr=2.02e-05, step=3408] Training: 34%|███▍ | 3409/10000 [41:10<1:13:21, 1.50it/s, loss=0.0195, lr=2.02e-05, step=3409]16:47:17.734 [I] step=3410 loss=0.0240 smoothed_loss=0.0218 lr=2.02e-05 grad_norm=0.5621 step_time=0.5631s data_time=0.1092s it/s=1.488 eta_to_10000=4429.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0099 grad_action_out_proj=0.1184 grad_shared_expert=0.4905 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3410/10000 [41:11<1:09:04, 1.59it/s, loss=0.0195, lr=2.02e-05, step=3409] Training: 34%|███▍ | 3410/10000 [41:11<1:09:04, 1.59it/s, loss=0.0240, lr=2.02e-05, step=3410] Training: 34%|███▍ | 3411/10000 [41:12<1:18:48, 1.39it/s, loss=0.0240, lr=2.02e-05, step=3410] Training: 34%|███▍ | 3411/10000 [41:12<1:18:48, 1.39it/s, loss=0.0022, lr=2.02e-05, step=3411] Training: 34%|███▍ | 3412/10000 [41:12<1:13:25, 1.50it/s, loss=0.0022, lr=2.02e-05, step=3411] Training: 34%|███▍ | 3412/10000 [41:12<1:13:25, 1.50it/s, loss=0.0148, lr=2.02e-05, step=3412] Training: 34%|███▍ | 3413/10000 [41:13<1:19:53, 1.37it/s, loss=0.0148, lr=2.02e-05, step=3412] Training: 34%|███▍ | 3413/10000 [41:13<1:19:53, 1.37it/s, loss=0.0304, lr=2.02e-05, step=3413] Training: 34%|███▍ | 3414/10000 [41:14<1:14:54, 1.47it/s, loss=0.0304, lr=2.02e-05, step=3413] Training: 34%|███▍ | 3414/10000 [41:14<1:14:54, 1.47it/s, loss=0.0074, lr=2.02e-05, step=3414] Training: 34%|███▍ | 3415/10000 [41:14<1:09:42, 1.57it/s, loss=0.0074, lr=2.02e-05, step=3414] Training: 34%|███▍ | 3415/10000 [41:14<1:09:42, 1.57it/s, loss=0.0056, lr=2.02e-05, step=3415] Training: 34%|███▍ | 3416/10000 [41:15<1:08:02, 1.61it/s, loss=0.0056, lr=2.02e-05, step=3415] Training: 34%|███▍ | 3416/10000 [41:15<1:08:02, 1.61it/s, loss=0.0194, lr=2.02e-05, step=3416] Training: 34%|███▍ | 3417/10000 [41:16<1:10:28, 1.56it/s, loss=0.0194, lr=2.02e-05, step=3416] Training: 34%|███▍ | 3417/10000 [41:16<1:10:28, 1.56it/s, loss=0.0304, lr=2.02e-05, step=3417] Training: 34%|███▍ | 3418/10000 [41:16<1:15:18, 1.46it/s, loss=0.0304, lr=2.02e-05, step=3417] Training: 34%|███▍ | 3418/10000 [41:16<1:15:18, 1.46it/s, loss=0.0212, lr=2.02e-05, step=3418] Training: 34%|███▍ | 3419/10000 [41:18<1:31:40, 1.20it/s, loss=0.0212, lr=2.02e-05, step=3418] Training: 34%|███▍ | 3419/10000 [41:18<1:31:40, 1.20it/s, loss=0.0195, lr=2.02e-05, step=3419]16:47:25.296 [I] step=3420 loss=0.0063 smoothed_loss=0.0181 lr=2.02e-05 grad_norm=0.6100 step_time=0.6316s data_time=0.1245s it/s=1.323 eta_to_10000=4973.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0152 grad_action_out_proj=0.1780 grad_shared_expert=0.4147 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3420/10000 [41:18<1:32:27, 1.19it/s, loss=0.0195, lr=2.02e-05, step=3419] Training: 34%|███▍ | 3420/10000 [41:18<1:32:27, 1.19it/s, loss=0.0063, lr=2.02e-05, step=3420] Training: 34%|███▍ | 3421/10000 [41:19<1:22:11, 1.33it/s, loss=0.0063, lr=2.02e-05, step=3420] Training: 34%|███▍ | 3421/10000 [41:19<1:22:11, 1.33it/s, loss=0.0127, lr=2.02e-05, step=3421] Training: 34%|███▍ | 3422/10000 [41:20<1:20:49, 1.36it/s, loss=0.0127, lr=2.02e-05, step=3421] Training: 34%|███▍ | 3422/10000 [41:20<1:20:49, 1.36it/s, loss=0.0149, lr=2.01e-05, step=3422] Training: 34%|███▍ | 3423/10000 [41:20<1:19:02, 1.39it/s, loss=0.0149, lr=2.01e-05, step=3422] Training: 34%|███▍ | 3423/10000 [41:20<1:19:02, 1.39it/s, loss=0.0103, lr=2.01e-05, step=3423] Training: 34%|███▍ | 3424/10000 [41:21<1:14:37, 1.47it/s, loss=0.0103, lr=2.01e-05, step=3423] Training: 34%|███▍ | 3424/10000 [41:21<1:14:37, 1.47it/s, loss=0.0155, lr=2.01e-05, step=3424] Training: 34%|███▍ | 3425/10000 [41:21<1:12:25, 1.51it/s, loss=0.0155, lr=2.01e-05, step=3424] Training: 34%|███▍ | 3425/10000 [41:21<1:12:25, 1.51it/s, loss=0.0460, lr=2.01e-05, step=3425] Training: 34%|███▍ | 3426/10000 [41:22<1:15:47, 1.45it/s, loss=0.0460, lr=2.01e-05, step=3425] Training: 34%|███▍ | 3426/10000 [41:22<1:15:47, 1.45it/s, loss=0.0363, lr=2.01e-05, step=3426] Training: 34%|███▍ | 3427/10000 [41:23<1:19:29, 1.38it/s, loss=0.0363, lr=2.01e-05, step=3426] Training: 34%|███▍ | 3427/10000 [41:23<1:19:29, 1.38it/s, loss=0.0214, lr=2.01e-05, step=3427] Training: 34%|███▍ | 3428/10000 [41:24<1:14:52, 1.46it/s, loss=0.0214, lr=2.01e-05, step=3427] Training: 34%|███▍ | 3428/10000 [41:24<1:14:52, 1.46it/s, loss=0.0456, lr=2.01e-05, step=3428] Training: 34%|███▍ | 3429/10000 [41:25<1:21:33, 1.34it/s, loss=0.0456, lr=2.01e-05, step=3428] Training: 34%|███▍ | 3429/10000 [41:25<1:21:33, 1.34it/s, loss=0.0019, lr=2.01e-05, step=3429]16:47:32.103 [I] step=3430 loss=0.0104 smoothed_loss=0.0203 lr=2.01e-05 grad_norm=0.6999 step_time=0.5621s data_time=0.1185s it/s=1.469 eta_to_10000=4471.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0995 grad_shared_expert=0.5372 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3430/10000 [41:25<1:18:24, 1.40it/s, loss=0.0019, lr=2.01e-05, step=3429] Training: 34%|███▍ | 3430/10000 [41:25<1:18:24, 1.40it/s, loss=0.0104, lr=2.01e-05, step=3430] Training: 34%|███▍ | 3431/10000 [41:26<1:14:26, 1.47it/s, loss=0.0104, lr=2.01e-05, step=3430] Training: 34%|███▍ | 3431/10000 [41:26<1:14:26, 1.47it/s, loss=0.0184, lr=2.01e-05, step=3431] Training: 34%|███▍ | 3432/10000 [41:26<1:10:12, 1.56it/s, loss=0.0184, lr=2.01e-05, step=3431] Training: 34%|███▍ | 3432/10000 [41:26<1:10:12, 1.56it/s, loss=0.0064, lr=2.01e-05, step=3432] Training: 34%|███▍ | 3433/10000 [41:27<1:17:20, 1.42it/s, loss=0.0064, lr=2.01e-05, step=3432] Training: 34%|███▍ | 3433/10000 [41:27<1:17:20, 1.42it/s, loss=0.0190, lr=2.01e-05, step=3433] Training: 34%|███▍ | 3434/10000 [41:28<1:21:06, 1.35it/s, loss=0.0190, lr=2.01e-05, step=3433] Training: 34%|███▍ | 3434/10000 [41:28<1:21:06, 1.35it/s, loss=0.0338, lr=2.01e-05, step=3434] Training: 34%|███▍ | 3435/10000 [41:29<1:23:03, 1.32it/s, loss=0.0338, lr=2.01e-05, step=3434] Training: 34%|███▍ | 3435/10000 [41:29<1:23:03, 1.32it/s, loss=0.0299, lr=2.01e-05, step=3435] Training: 34%|███▍ | 3436/10000 [41:30<1:23:17, 1.31it/s, loss=0.0299, lr=2.01e-05, step=3435] Training: 34%|███▍ | 3436/10000 [41:30<1:23:17, 1.31it/s, loss=0.0032, lr=2.01e-05, step=3436] Training: 34%|███▍ | 3437/10000 [41:30<1:19:09, 1.38it/s, loss=0.0032, lr=2.01e-05, step=3436] Training: 34%|███▍ | 3437/10000 [41:30<1:19:09, 1.38it/s, loss=0.0438, lr=2.01e-05, step=3437] Training: 34%|███▍ | 3438/10000 [41:31<1:14:36, 1.47it/s, loss=0.0438, lr=2.01e-05, step=3437] Training: 34%|███▍ | 3438/10000 [41:31<1:14:36, 1.47it/s, loss=0.0211, lr=2.01e-05, step=3438] Training: 34%|███▍ | 3439/10000 [41:31<1:11:22, 1.53it/s, loss=0.0211, lr=2.01e-05, step=3438] Training: 34%|███▍ | 3439/10000 [41:31<1:11:22, 1.53it/s, loss=0.0280, lr=2.01e-05, step=3439]16:47:40.159 [I] step=3440 loss=0.0125 smoothed_loss=0.0214 lr=2.01e-05 grad_norm=0.4962 step_time=0.5902s data_time=0.2156s it/s=1.243 eta_to_10000=5278.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.1327 grad_shared_expert=0.4882 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3440/10000 [41:33<1:50:44, 1.01s/it, loss=0.0280, lr=2.01e-05, step=3439] Training: 34%|███▍ | 3440/10000 [41:33<1:50:44, 1.01s/it, loss=0.0125, lr=2.01e-05, step=3440] Training: 34%|███▍ | 3441/10000 [41:34<1:43:31, 1.06it/s, loss=0.0125, lr=2.01e-05, step=3440] Training: 34%|███▍ | 3441/10000 [41:34<1:43:31, 1.06it/s, loss=0.0995, lr=2.01e-05, step=3441] Training: 34%|███▍ | 3442/10000 [41:35<1:32:27, 1.18it/s, loss=0.0995, lr=2.01e-05, step=3441] Training: 34%|███▍ | 3442/10000 [41:35<1:32:27, 1.18it/s, loss=0.0153, lr=2.01e-05, step=3442] Training: 34%|███▍ | 3443/10000 [41:35<1:23:18, 1.31it/s, loss=0.0153, lr=2.01e-05, step=3442] Training: 34%|███▍ | 3443/10000 [41:35<1:23:18, 1.31it/s, loss=0.0044, lr=2.01e-05, step=3443] Training: 34%|███▍ | 3444/10000 [41:36<1:20:56, 1.35it/s, loss=0.0044, lr=2.01e-05, step=3443] Training: 34%|███▍ | 3444/10000 [41:36<1:20:56, 1.35it/s, loss=0.0221, lr=2.01e-05, step=3444] Training: 34%|███▍ | 3445/10000 [41:37<1:25:21, 1.28it/s, loss=0.0221, lr=2.01e-05, step=3444] Training: 34%|███▍ | 3445/10000 [41:37<1:25:21, 1.28it/s, loss=0.0078, lr=2.01e-05, step=3445] Training: 34%|███▍ | 3446/10000 [41:38<1:26:13, 1.27it/s, loss=0.0078, lr=2.01e-05, step=3445] Training: 34%|███▍ | 3446/10000 [41:38<1:26:13, 1.27it/s, loss=0.0228, lr=2.01e-05, step=3446] Training: 34%|███▍ | 3447/10000 [41:38<1:27:27, 1.25it/s, loss=0.0228, lr=2.01e-05, step=3446] Training: 34%|███▍ | 3447/10000 [41:38<1:27:27, 1.25it/s, loss=0.0100, lr=2.01e-05, step=3447] Training: 34%|███▍ | 3448/10000 [41:39<1:30:53, 1.20it/s, loss=0.0100, lr=2.01e-05, step=3447] Training: 34%|███▍ | 3448/10000 [41:39<1:30:53, 1.20it/s, loss=0.0100, lr=2.01e-05, step=3448] Training: 34%|███▍ | 3449/10000 [41:40<1:22:56, 1.32it/s, loss=0.0100, lr=2.01e-05, step=3448] Training: 34%|███▍ | 3449/10000 [41:40<1:22:56, 1.32it/s, loss=0.0067, lr=2.01e-05, step=3449]16:47:47.587 [I] step=3450 loss=0.0100 smoothed_loss=0.0185 lr=2.01e-05 grad_norm=0.4390 step_time=0.6176s data_time=0.1251s it/s=1.347 eta_to_10000=4861.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0062 grad_action_out_proj=0.0883 grad_shared_expert=0.2840 (10775:train_pytorch.py:850) + Training: 34%|███▍ | 3450/10000 [41:41<1:22:56, 1.32it/s, loss=0.0067, lr=2.01e-05, step=3449] Training: 34%|███▍ | 3450/10000 [41:41<1:22:56, 1.32it/s, loss=0.0100, lr=2.01e-05, step=3450] Training: 35%|███▍ | 3451/10000 [41:41<1:16:30, 1.43it/s, loss=0.0100, lr=2.01e-05, step=3450] Training: 35%|███▍ | 3451/10000 [41:41<1:16:30, 1.43it/s, loss=0.0151, lr=2.01e-05, step=3451] Training: 35%|███▍ | 3452/10000 [41:42<1:18:33, 1.39it/s, loss=0.0151, lr=2.01e-05, step=3451] Training: 35%|███▍ | 3452/10000 [41:42<1:18:33, 1.39it/s, loss=0.0187, lr=2.01e-05, step=3452] Training: 35%|███▍ | 3453/10000 [41:43<1:22:41, 1.32it/s, loss=0.0187, lr=2.01e-05, step=3452] Training: 35%|███▍ | 3453/10000 [41:43<1:22:41, 1.32it/s, loss=0.0645, lr=2.01e-05, step=3453] Training: 35%|███▍ | 3454/10000 [41:44<1:28:44, 1.23it/s, loss=0.0645, lr=2.01e-05, step=3453] Training: 35%|███▍ | 3454/10000 [41:44<1:28:44, 1.23it/s, loss=0.0032, lr=2.00e-05, step=3454] Training: 35%|███▍ | 3455/10000 [41:45<1:29:07, 1.22it/s, loss=0.0032, lr=2.00e-05, step=3454] Training: 35%|███▍ | 3455/10000 [41:45<1:29:07, 1.22it/s, loss=0.0071, lr=2.00e-05, step=3455] Training: 35%|███▍ | 3456/10000 [41:45<1:23:32, 1.31it/s, loss=0.0071, lr=2.00e-05, step=3455] Training: 35%|███▍ | 3456/10000 [41:45<1:23:32, 1.31it/s, loss=0.0902, lr=2.00e-05, step=3456] Training: 35%|███▍ | 3457/10000 [41:46<1:22:09, 1.33it/s, loss=0.0902, lr=2.00e-05, step=3456] Training: 35%|███▍ | 3457/10000 [41:46<1:22:09, 1.33it/s, loss=0.0255, lr=2.00e-05, step=3457] Training: 35%|███▍ | 3458/10000 [41:47<1:19:33, 1.37it/s, loss=0.0255, lr=2.00e-05, step=3457] Training: 35%|███▍ | 3458/10000 [41:47<1:19:33, 1.37it/s, loss=0.0059, lr=2.00e-05, step=3458] Training: 35%|███▍ | 3459/10000 [41:47<1:17:35, 1.40it/s, loss=0.0059, lr=2.00e-05, step=3458] Training: 35%|███▍ | 3459/10000 [41:47<1:17:35, 1.40it/s, loss=0.0053, lr=2.00e-05, step=3459]16:47:54.937 [I] step=3460 loss=0.0186 smoothed_loss=0.0221 lr=2.00e-05 grad_norm=0.4510 step_time=0.6014s data_time=0.1337s it/s=1.362 eta_to_10000=4802.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0217 grad_action_out_proj=0.1994 grad_shared_expert=0.5554 (10775:train_pytorch.py:850) + Training: 35%|███▍ | 3460/10000 [41:48<1:16:58, 1.42it/s, loss=0.0053, lr=2.00e-05, step=3459] Training: 35%|███▍ | 3460/10000 [41:48<1:16:58, 1.42it/s, loss=0.0186, lr=2.00e-05, step=3460] Training: 35%|███▍ | 3461/10000 [41:49<1:19:55, 1.36it/s, loss=0.0186, lr=2.00e-05, step=3460] Training: 35%|███▍ | 3461/10000 [41:49<1:19:55, 1.36it/s, loss=0.0291, lr=2.00e-05, step=3461] Training: 35%|███▍ | 3462/10000 [41:49<1:16:50, 1.42it/s, loss=0.0291, lr=2.00e-05, step=3461] Training: 35%|███▍ | 3462/10000 [41:49<1:16:50, 1.42it/s, loss=0.0125, lr=2.00e-05, step=3462] Training: 35%|███▍ | 3463/10000 [41:50<1:22:00, 1.33it/s, loss=0.0125, lr=2.00e-05, step=3462] Training: 35%|███▍ | 3463/10000 [41:50<1:22:00, 1.33it/s, loss=0.0122, lr=2.00e-05, step=3463] Training: 35%|███▍ | 3464/10000 [41:51<1:30:18, 1.21it/s, loss=0.0122, lr=2.00e-05, step=3463] Training: 35%|███▍ | 3464/10000 [41:51<1:30:18, 1.21it/s, loss=0.0204, lr=2.00e-05, step=3464] Training: 35%|███▍ | 3465/10000 [41:52<1:25:28, 1.27it/s, loss=0.0204, lr=2.00e-05, step=3464] Training: 35%|███▍ | 3465/10000 [41:52<1:25:28, 1.27it/s, loss=0.0380, lr=2.00e-05, step=3465] Training: 35%|███▍ | 3466/10000 [41:53<1:26:39, 1.26it/s, loss=0.0380, lr=2.00e-05, step=3465] Training: 35%|███▍ | 3466/10000 [41:53<1:26:39, 1.26it/s, loss=0.0376, lr=2.00e-05, step=3466] Training: 35%|███▍ | 3467/10000 [41:53<1:20:44, 1.35it/s, loss=0.0376, lr=2.00e-05, step=3466] Training: 35%|███▍ | 3467/10000 [41:53<1:20:44, 1.35it/s, loss=0.0330, lr=2.00e-05, step=3467] Training: 35%|███▍ | 3468/10000 [41:54<1:15:49, 1.44it/s, loss=0.0330, lr=2.00e-05, step=3467] Training: 35%|███▍ | 3468/10000 [41:54<1:15:49, 1.44it/s, loss=0.0108, lr=2.00e-05, step=3468] Training: 35%|███▍ | 3469/10000 [41:55<1:23:29, 1.30it/s, loss=0.0108, lr=2.00e-05, step=3468] Training: 35%|███▍ | 3469/10000 [41:55<1:23:29, 1.30it/s, loss=0.0036, lr=2.00e-05, step=3469]16:48:03.160 [I] step=3470 loss=0.0167 smoothed_loss=0.0210 lr=2.00e-05 grad_norm=0.4657 step_time=0.6317s data_time=0.1904s it/s=1.217 eta_to_10000=5363.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0208 grad_action_out_proj=0.1649 grad_shared_expert=0.4015 (10775:train_pytorch.py:850) + Training: 35%|███▍ | 3470/10000 [41:56<1:39:50, 1.09it/s, loss=0.0036, lr=2.00e-05, step=3469] Training: 35%|███▍ | 3470/10000 [41:56<1:39:50, 1.09it/s, loss=0.0167, lr=2.00e-05, step=3470] Training: 35%|███▍ | 3471/10000 [41:57<1:27:45, 1.24it/s, loss=0.0167, lr=2.00e-05, step=3470] Training: 35%|███▍ | 3471/10000 [41:57<1:27:45, 1.24it/s, loss=0.0203, lr=2.00e-05, step=3471] Training: 35%|███▍ | 3472/10000 [41:58<1:26:00, 1.27it/s, loss=0.0203, lr=2.00e-05, step=3471] Training: 35%|███▍ | 3472/10000 [41:58<1:26:00, 1.27it/s, loss=0.0293, lr=2.00e-05, step=3472] Training: 35%|███▍ | 3473/10000 [41:58<1:21:44, 1.33it/s, loss=0.0293, lr=2.00e-05, step=3472] Training: 35%|███▍ | 3473/10000 [41:58<1:21:44, 1.33it/s, loss=0.0152, lr=2.00e-05, step=3473] Training: 35%|███▍ | 3474/10000 [41:59<1:23:21, 1.30it/s, loss=0.0152, lr=2.00e-05, step=3473] Training: 35%|███▍ | 3474/10000 [41:59<1:23:21, 1.30it/s, loss=0.0653, lr=2.00e-05, step=3474] Training: 35%|███▍ | 3475/10000 [42:00<1:16:57, 1.41it/s, loss=0.0653, lr=2.00e-05, step=3474] Training: 35%|███▍ | 3475/10000 [42:00<1:16:57, 1.41it/s, loss=0.0077, lr=2.00e-05, step=3475] Training: 35%|███▍ | 3476/10000 [42:00<1:23:12, 1.31it/s, loss=0.0077, lr=2.00e-05, step=3475] Training: 35%|███▍ | 3476/10000 [42:00<1:23:12, 1.31it/s, loss=0.0247, lr=2.00e-05, step=3476] Training: 35%|███▍ | 3477/10000 [42:01<1:17:37, 1.40it/s, loss=0.0247, lr=2.00e-05, step=3476] Training: 35%|███▍ | 3477/10000 [42:01<1:17:37, 1.40it/s, loss=0.0095, lr=2.00e-05, step=3477] Training: 35%|███▍ | 3478/10000 [42:02<1:20:16, 1.35it/s, loss=0.0095, lr=2.00e-05, step=3477] Training: 35%|███▍ | 3478/10000 [42:02<1:20:16, 1.35it/s, loss=0.0059, lr=2.00e-05, step=3478] Training: 35%|███▍ | 3479/10000 [42:02<1:12:55, 1.49it/s, loss=0.0059, lr=2.00e-05, step=3478] Training: 35%|███▍ | 3479/10000 [42:02<1:12:55, 1.49it/s, loss=0.0568, lr=2.00e-05, step=3479]16:48:09.849 [I] step=3480 loss=0.0579 smoothed_loss=0.0277 lr=2.00e-05 grad_norm=0.5368 step_time=0.5562s data_time=0.1127s it/s=1.495 eta_to_10000=4360.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0257 grad_action_out_proj=0.1783 grad_shared_expert=0.4480 (10775:train_pytorch.py:850) + Training: 35%|███▍ | 3480/10000 [42:03<1:09:03, 1.57it/s, loss=0.0568, lr=2.00e-05, step=3479] Training: 35%|███▍ | 3480/10000 [42:03<1:09:03, 1.57it/s, loss=0.0579, lr=2.00e-05, step=3480] Training: 35%|███▍ | 3481/10000 [42:03<1:05:33, 1.66it/s, loss=0.0579, lr=2.00e-05, step=3480] Training: 35%|███▍ | 3481/10000 [42:03<1:05:33, 1.66it/s, loss=0.0301, lr=2.00e-05, step=3481] Training: 35%|███▍ | 3482/10000 [42:04<1:03:07, 1.72it/s, loss=0.0301, lr=2.00e-05, step=3481] Training: 35%|███▍ | 3482/10000 [42:04<1:03:07, 1.72it/s, loss=0.0790, lr=2.00e-05, step=3482] Training: 35%|███▍ | 3483/10000 [42:05<1:13:37, 1.48it/s, loss=0.0790, lr=2.00e-05, step=3482] Training: 35%|███▍ | 3483/10000 [42:05<1:13:37, 1.48it/s, loss=0.0068, lr=2.00e-05, step=3483] Training: 35%|███▍ | 3484/10000 [42:05<1:09:20, 1.57it/s, loss=0.0068, lr=2.00e-05, step=3483] Training: 35%|███▍ | 3484/10000 [42:05<1:09:20, 1.57it/s, loss=0.0044, lr=2.00e-05, step=3484] Training: 35%|███▍ | 3485/10000 [42:06<1:12:42, 1.49it/s, loss=0.0044, lr=2.00e-05, step=3484] Training: 35%|███▍ | 3485/10000 [42:06<1:12:42, 1.49it/s, loss=0.0126, lr=2.00e-05, step=3485] Training: 35%|███▍ | 3486/10000 [42:07<1:08:49, 1.58it/s, loss=0.0126, lr=2.00e-05, step=3485] Training: 35%|███▍ | 3486/10000 [42:07<1:08:49, 1.58it/s, loss=0.0287, lr=1.99e-05, step=3486] Training: 35%|███▍ | 3487/10000 [42:07<1:05:10, 1.67it/s, loss=0.0287, lr=1.99e-05, step=3486] Training: 35%|███▍ | 3487/10000 [42:07<1:05:10, 1.67it/s, loss=0.0381, lr=1.99e-05, step=3487] Training: 35%|███▍ | 3488/10000 [42:08<1:02:19, 1.74it/s, loss=0.0381, lr=1.99e-05, step=3487] Training: 35%|███▍ | 3488/10000 [42:08<1:02:19, 1.74it/s, loss=0.0188, lr=1.99e-05, step=3488] Training: 35%|███▍ | 3489/10000 [42:08<1:00:09, 1.80it/s, loss=0.0188, lr=1.99e-05, step=3488] Training: 35%|███▍ | 3489/10000 [42:08<1:00:09, 1.80it/s, loss=0.0036, lr=1.99e-05, step=3489]16:48:15.974 [I] step=3490 loss=0.0246 smoothed_loss=0.0245 lr=2.00e-05 grad_norm=0.5454 step_time=0.5466s data_time=0.0658s it/s=1.633 eta_to_10000=3986.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0168 grad_action_out_proj=0.1449 grad_shared_expert=0.4627 (10775:train_pytorch.py:850) + Training: 35%|███▍ | 3490/10000 [42:09<1:07:30, 1.61it/s, loss=0.0036, lr=1.99e-05, step=3489] Training: 35%|███▍ | 3490/10000 [42:09<1:07:30, 1.61it/s, loss=0.0246, lr=1.99e-05, step=3490] Training: 35%|███▍ | 3491/10000 [42:10<1:03:49, 1.70it/s, loss=0.0246, lr=1.99e-05, step=3490] Training: 35%|███▍ | 3491/10000 [42:10<1:03:49, 1.70it/s, loss=0.0090, lr=1.99e-05, step=3491] Training: 35%|███▍ | 3492/10000 [42:10<1:01:08, 1.77it/s, loss=0.0090, lr=1.99e-05, step=3491] Training: 35%|███▍ | 3492/10000 [42:10<1:01:08, 1.77it/s, loss=0.0306, lr=1.99e-05, step=3492] Training: 35%|███▍ | 3493/10000 [42:11<1:05:50, 1.65it/s, loss=0.0306, lr=1.99e-05, step=3492] Training: 35%|███▍ | 3493/10000 [42:11<1:05:50, 1.65it/s, loss=0.0161, lr=1.99e-05, step=3493] Training: 35%|███▍ | 3494/10000 [42:11<1:05:34, 1.65it/s, loss=0.0161, lr=1.99e-05, step=3493] Training: 35%|███▍ | 3494/10000 [42:11<1:05:34, 1.65it/s, loss=0.0472, lr=1.99e-05, step=3494] Training: 35%|███▍ | 3495/10000 [42:12<1:02:09, 1.74it/s, loss=0.0472, lr=1.99e-05, step=3494] Training: 35%|███▍ | 3495/10000 [42:12<1:02:09, 1.74it/s, loss=0.0329, lr=1.99e-05, step=3495] Training: 35%|███▍ | 3496/10000 [42:12<59:36, 1.82it/s, loss=0.0329, lr=1.99e-05, step=3495] Training: 35%|███▍ | 3496/10000 [42:12<59:36, 1.82it/s, loss=0.0130, lr=1.99e-05, step=3496] Training: 35%|███▍ | 3497/10000 [42:13<1:05:44, 1.65it/s, loss=0.0130, lr=1.99e-05, step=3496] Training: 35%|███▍ | 3497/10000 [42:13<1:05:44, 1.65it/s, loss=0.0096, lr=1.99e-05, step=3497] Training: 35%|███▍ | 3498/10000 [42:14<1:03:09, 1.72it/s, loss=0.0096, lr=1.99e-05, step=3497] Training: 35%|███▍ | 3498/10000 [42:14<1:03:09, 1.72it/s, loss=0.0257, lr=1.99e-05, step=3498] Training: 35%|███▍ | 3499/10000 [42:14<1:02:31, 1.73it/s, loss=0.0257, lr=1.99e-05, step=3498] Training: 35%|███▍ | 3499/10000 [42:14<1:02:31, 1.73it/s, loss=0.0099, lr=1.99e-05, step=3499]16:48:21.934 [I] step=3500 loss=0.1942 smoothed_loss=0.0394 lr=1.99e-05 grad_norm=0.5239 step_time=0.5314s data_time=0.0649s it/s=1.678 eta_to_10000=3873.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0406 grad_action_out_proj=0.2119 grad_shared_expert=0.5283 (10775:train_pytorch.py:850) + Training: 35%|███▌ | 3500/10000 [42:15<1:10:08, 1.54it/s, loss=0.0099, lr=1.99e-05, step=3499] Training: 35%|███▌ | 3500/10000 [42:15<1:10:08, 1.54it/s, loss=0.1942, lr=1.99e-05, step=3500] Training: 35%|███▌ | 3501/10000 [42:16<1:06:13, 1.64it/s, loss=0.1942, lr=1.99e-05, step=3500] Training: 35%|███▌ | 3501/10000 [42:16<1:06:13, 1.64it/s, loss=0.0193, lr=1.99e-05, step=3501] Training: 35%|███▌ | 3502/10000 [42:16<1:03:03, 1.72it/s, loss=0.0193, lr=1.99e-05, step=3501] Training: 35%|███▌ | 3502/10000 [42:16<1:03:03, 1.72it/s, loss=0.0163, lr=1.99e-05, step=3502] Training: 35%|███▌ | 3503/10000 [42:17<1:03:03, 1.72it/s, loss=0.0163, lr=1.99e-05, step=3502] Training: 35%|███▌ | 3503/10000 [42:17<1:03:03, 1.72it/s, loss=0.0088, lr=1.99e-05, step=3503] Training: 35%|███▌ | 3504/10000 [42:17<1:09:56, 1.55it/s, loss=0.0088, lr=1.99e-05, step=3503] Training: 35%|███▌ | 3504/10000 [42:17<1:09:56, 1.55it/s, loss=0.0267, lr=1.99e-05, step=3504] Training: 35%|███▌ | 3505/10000 [42:18<1:07:15, 1.61it/s, loss=0.0267, lr=1.99e-05, step=3504] Training: 35%|███▌ | 3505/10000 [42:18<1:07:15, 1.61it/s, loss=0.0028, lr=1.99e-05, step=3505] Training: 35%|███▌ | 3506/10000 [42:19<1:04:45, 1.67it/s, loss=0.0028, lr=1.99e-05, step=3505] Training: 35%|███▌ | 3506/10000 [42:19<1:04:45, 1.67it/s, loss=0.0051, lr=1.99e-05, step=3506] Training: 35%|███▌ | 3507/10000 [42:19<1:08:57, 1.57it/s, loss=0.0051, lr=1.99e-05, step=3506] Training: 35%|███▌ | 3507/10000 [42:19<1:08:57, 1.57it/s, loss=0.0364, lr=1.99e-05, step=3507] Training: 35%|███▌ | 3508/10000 [42:20<1:06:33, 1.63it/s, loss=0.0364, lr=1.99e-05, step=3507] Training: 35%|███▌ | 3508/10000 [42:20<1:06:33, 1.63it/s, loss=0.0082, lr=1.99e-05, step=3508] Training: 35%|███▌ | 3509/10000 [42:20<1:06:01, 1.64it/s, loss=0.0082, lr=1.99e-05, step=3508] Training: 35%|███▌ | 3509/10000 [42:20<1:06:01, 1.64it/s, loss=0.0101, lr=1.99e-05, step=3509]16:48:27.962 [I] step=3510 loss=0.0301 smoothed_loss=0.0248 lr=1.99e-05 grad_norm=0.5263 step_time=0.5280s data_time=0.0745s it/s=1.659 eta_to_10000=3911.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0162 grad_action_out_proj=0.1592 grad_shared_expert=0.5344 (10775:train_pytorch.py:850) + Training: 35%|███▌ | 3510/10000 [42:21<1:06:00, 1.64it/s, loss=0.0101, lr=1.99e-05, step=3509] Training: 35%|███▌ | 3510/10000 [42:21<1:06:00, 1.64it/s, loss=0.0301, lr=1.99e-05, step=3510] Training: 35%|███▌ | 3511/10000 [42:22<1:04:37, 1.67it/s, loss=0.0301, lr=1.99e-05, step=3510] Training: 35%|███▌ | 3511/10000 [42:22<1:04:37, 1.67it/s, loss=0.0034, lr=1.99e-05, step=3511] Training: 35%|███▌ | 3512/10000 [42:23<1:15:41, 1.43it/s, loss=0.0034, lr=1.99e-05, step=3511] Training: 35%|███▌ | 3512/10000 [42:23<1:15:41, 1.43it/s, loss=0.0183, lr=1.99e-05, step=3512] Training: 35%|███▌ | 3513/10000 [42:23<1:12:03, 1.50it/s, loss=0.0183, lr=1.99e-05, step=3512] Training: 35%|███▌ | 3513/10000 [42:23<1:12:03, 1.50it/s, loss=0.0243, lr=1.99e-05, step=3513] Training: 35%|███▌ | 3514/10000 [42:24<1:14:19, 1.45it/s, loss=0.0243, lr=1.99e-05, step=3513] Training: 35%|███▌ | 3514/10000 [42:24<1:14:19, 1.45it/s, loss=0.0504, lr=1.99e-05, step=3514] Training: 35%|███▌ | 3515/10000 [42:24<1:12:32, 1.49it/s, loss=0.0504, lr=1.99e-05, step=3514] Training: 35%|███▌ | 3515/10000 [42:24<1:12:32, 1.49it/s, loss=0.0099, lr=1.99e-05, step=3515] Training: 35%|███▌ | 3516/10000 [42:25<1:10:07, 1.54it/s, loss=0.0099, lr=1.99e-05, step=3515] Training: 35%|███▌ | 3516/10000 [42:25<1:10:07, 1.54it/s, loss=0.0081, lr=1.99e-05, step=3516] Training: 35%|███▌ | 3517/10000 [42:26<1:09:38, 1.55it/s, loss=0.0081, lr=1.99e-05, step=3516] Training: 35%|███▌ | 3517/10000 [42:26<1:09:38, 1.55it/s, loss=0.0164, lr=1.99e-05, step=3517] Training: 35%|███▌ | 3518/10000 [42:26<1:12:14, 1.50it/s, loss=0.0164, lr=1.99e-05, step=3517] Training: 35%|███▌ | 3518/10000 [42:26<1:12:14, 1.50it/s, loss=0.0116, lr=1.99e-05, step=3518] Training: 35%|███▌ | 3519/10000 [42:27<1:14:42, 1.45it/s, loss=0.0116, lr=1.99e-05, step=3518] Training: 35%|███▌ | 3519/10000 [42:27<1:14:42, 1.45it/s, loss=0.0943, lr=1.98e-05, step=3519]16:48:34.957 [I] step=3520 loss=0.0093 smoothed_loss=0.0261 lr=1.99e-05 grad_norm=0.4626 step_time=0.6004s data_time=0.0992s it/s=1.430 eta_to_10000=4532.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0180 grad_action_out_proj=0.1487 grad_shared_expert=0.3542 (10775:train_pytorch.py:850) + Training: 35%|███▌ | 3520/10000 [42:28<1:19:08, 1.36it/s, loss=0.0943, lr=1.98e-05, step=3519] Training: 35%|███▌ | 3520/10000 [42:28<1:19:08, 1.36it/s, loss=0.0093, lr=1.98e-05, step=3520] Training: 35%|███▌ | 3521/10000 [42:29<1:12:19, 1.49it/s, loss=0.0093, lr=1.98e-05, step=3520] Training: 35%|███▌ | 3521/10000 [42:29<1:12:19, 1.49it/s, loss=0.0063, lr=1.98e-05, step=3521] Training: 35%|███▌ | 3522/10000 [42:29<1:07:02, 1.61it/s, loss=0.0063, lr=1.98e-05, step=3521] Training: 35%|███▌ | 3522/10000 [42:29<1:07:02, 1.61it/s, loss=0.0311, lr=1.98e-05, step=3522] Training: 35%|███▌ | 3523/10000 [42:30<1:02:57, 1.71it/s, loss=0.0311, lr=1.98e-05, step=3522] Training: 35%|███▌ | 3523/10000 [42:30<1:02:57, 1.71it/s, loss=0.0282, lr=1.98e-05, step=3523] Training: 35%|███▌ | 3524/10000 [42:30<1:00:00, 1.80it/s, loss=0.0282, lr=1.98e-05, step=3523] Training: 35%|███▌ | 3524/10000 [42:30<1:00:00, 1.80it/s, loss=0.0123, lr=1.98e-05, step=3524] Training: 35%|███▌ | 3525/10000 [42:31<58:17, 1.85it/s, loss=0.0123, lr=1.98e-05, step=3524] Training: 35%|███▌ | 3525/10000 [42:31<58:17, 1.85it/s, loss=0.0554, lr=1.98e-05, step=3525] Training: 35%|███▌ | 3526/10000 [42:31<1:07:12, 1.61it/s, loss=0.0554, lr=1.98e-05, step=3525] Training: 35%|███▌ | 3526/10000 [42:31<1:07:12, 1.61it/s, loss=0.0059, lr=1.98e-05, step=3526] Training: 35%|███▌ | 3527/10000 [42:32<1:12:07, 1.50it/s, loss=0.0059, lr=1.98e-05, step=3526] Training: 35%|███▌ | 3527/10000 [42:32<1:12:07, 1.50it/s, loss=0.0038, lr=1.98e-05, step=3527] Training: 35%|███▌ | 3528/10000 [42:33<1:06:46, 1.62it/s, loss=0.0038, lr=1.98e-05, step=3527] Training: 35%|███▌ | 3528/10000 [42:33<1:06:46, 1.62it/s, loss=0.0247, lr=1.98e-05, step=3528] Training: 35%|███▌ | 3529/10000 [42:33<1:02:47, 1.72it/s, loss=0.0247, lr=1.98e-05, step=3528] Training: 35%|███▌ | 3529/10000 [42:33<1:02:47, 1.72it/s, loss=0.0166, lr=1.98e-05, step=3529]16:48:40.614 [I] step=3530 loss=0.0129 smoothed_loss=0.0214 lr=1.98e-05 grad_norm=0.4782 step_time=0.5002s data_time=0.0654s it/s=1.768 eta_to_10000=3659.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.1058 grad_shared_expert=0.3506 (10775:train_pytorch.py:850) + Training: 35%|███▌ | 3530/10000 [42:34<1:01:34, 1.75it/s, loss=0.0166, lr=1.98e-05, step=3529] Training: 35%|███▌ | 3530/10000 [42:34<1:01:34, 1.75it/s, loss=0.0129, lr=1.98e-05, step=3530] Training: 35%|███▌ | 3531/10000 [42:34<1:01:31, 1.75it/s, loss=0.0129, lr=1.98e-05, step=3530] Training: 35%|███▌ | 3531/10000 [42:34<1:01:31, 1.75it/s, loss=0.0179, lr=1.98e-05, step=3531] Training: 35%|███▌ | 3532/10000 [42:35<59:20, 1.82it/s, loss=0.0179, lr=1.98e-05, step=3531] Training: 35%|███▌ | 3532/10000 [42:35<59:20, 1.82it/s, loss=0.0089, lr=1.98e-05, step=3532] Training: 35%|███▌ | 3533/10000 [42:35<58:14, 1.85it/s, loss=0.0089, lr=1.98e-05, step=3532] Training: 35%|███▌ | 3533/10000 [42:35<58:14, 1.85it/s, loss=0.0173, lr=1.98e-05, step=3533] Training: 35%|███▌ | 3534/10000 [42:36<1:07:23, 1.60it/s, loss=0.0173, lr=1.98e-05, step=3533] Training: 35%|███▌ | 3534/10000 [42:36<1:07:23, 1.60it/s, loss=0.0306, lr=1.98e-05, step=3534] Training: 35%|███▌ | 3535/10000 [42:37<1:04:12, 1.68it/s, loss=0.0306, lr=1.98e-05, step=3534] Training: 35%|███▌ | 3535/10000 [42:37<1:04:12, 1.68it/s, loss=0.0274, lr=1.98e-05, step=3535] Training: 35%|███▌ | 3536/10000 [42:37<1:02:35, 1.72it/s, loss=0.0274, lr=1.98e-05, step=3535] Training: 35%|███▌ | 3536/10000 [42:37<1:02:35, 1.72it/s, loss=0.0278, lr=1.98e-05, step=3536] Training: 35%|███▌ | 3537/10000 [42:38<1:01:54, 1.74it/s, loss=0.0278, lr=1.98e-05, step=3536] Training: 35%|███▌ | 3537/10000 [42:38<1:01:54, 1.74it/s, loss=0.0407, lr=1.98e-05, step=3537] Training: 35%|███▌ | 3538/10000 [42:38<1:01:10, 1.76it/s, loss=0.0407, lr=1.98e-05, step=3537] Training: 35%|███▌ | 3538/10000 [42:38<1:01:10, 1.76it/s, loss=0.0671, lr=1.98e-05, step=3538] Training: 35%|███▌ | 3539/10000 [42:39<1:01:22, 1.75it/s, loss=0.0671, lr=1.98e-05, step=3538] Training: 35%|███▌ | 3539/10000 [42:39<1:01:22, 1.75it/s, loss=0.0078, lr=1.98e-05, step=3539]16:48:46.394 [I] step=3540 loss=0.0152 smoothed_loss=0.0251 lr=1.98e-05 grad_norm=0.4920 step_time=0.5038s data_time=0.0742s it/s=1.730 eta_to_10000=3733.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.1850 grad_shared_expert=0.3501 (10775:train_pytorch.py:850) + Training: 35%|███▌ | 3540/10000 [42:39<1:02:31, 1.72it/s, loss=0.0078, lr=1.98e-05, step=3539] Training: 35%|███▌ | 3540/10000 [42:39<1:02:31, 1.72it/s, loss=0.0152, lr=1.98e-05, step=3540] Training: 35%|███▌ | 3541/10000 [42:40<1:08:58, 1.56it/s, loss=0.0152, lr=1.98e-05, step=3540] Training: 35%|███▌ | 3541/10000 [42:40<1:08:58, 1.56it/s, loss=0.0164, lr=1.98e-05, step=3541] Training: 35%|███▌ | 3542/10000 [42:41<1:16:53, 1.40it/s, loss=0.0164, lr=1.98e-05, step=3541] Training: 35%|███▌ | 3542/10000 [42:41<1:16:53, 1.40it/s, loss=0.0126, lr=1.98e-05, step=3542] Training: 35%|███▌ | 3543/10000 [42:42<1:13:50, 1.46it/s, loss=0.0126, lr=1.98e-05, step=3542] Training: 35%|███▌ | 3543/10000 [42:42<1:13:50, 1.46it/s, loss=0.0275, lr=1.98e-05, step=3543] Training: 35%|███▌ | 3544/10000 [42:42<1:07:45, 1.59it/s, loss=0.0275, lr=1.98e-05, step=3543] Training: 35%|███▌ | 3544/10000 [42:42<1:07:45, 1.59it/s, loss=0.0060, lr=1.98e-05, step=3544] Training: 35%|███▌ | 3545/10000 [42:43<1:04:04, 1.68it/s, loss=0.0060, lr=1.98e-05, step=3544] Training: 35%|███▌ | 3545/10000 [42:43<1:04:04, 1.68it/s, loss=0.0075, lr=1.98e-05, step=3545] Training: 35%|███▌ | 3546/10000 [42:43<1:03:39, 1.69it/s, loss=0.0075, lr=1.98e-05, step=3545] Training: 35%|███▌ | 3546/10000 [42:43<1:03:39, 1.69it/s, loss=0.0076, lr=1.98e-05, step=3546] Training: 35%|███▌ | 3547/10000 [42:44<1:13:15, 1.47it/s, loss=0.0076, lr=1.98e-05, step=3546] Training: 35%|███▌ | 3547/10000 [42:44<1:13:15, 1.47it/s, loss=0.0232, lr=1.98e-05, step=3547] Training: 35%|███▌ | 3548/10000 [42:45<1:08:18, 1.57it/s, loss=0.0232, lr=1.98e-05, step=3547] Training: 35%|███▌ | 3548/10000 [42:45<1:08:18, 1.57it/s, loss=0.0180, lr=1.98e-05, step=3548] Training: 35%|███▌ | 3549/10000 [42:45<1:05:49, 1.63it/s, loss=0.0180, lr=1.98e-05, step=3548] Training: 35%|███▌ | 3549/10000 [42:45<1:05:49, 1.63it/s, loss=0.0216, lr=1.98e-05, step=3549]16:48:53.092 [I] step=3550 loss=0.0349 smoothed_loss=0.0211 lr=1.98e-05 grad_norm=0.4028 step_time=0.5894s data_time=0.0805s it/s=1.493 eta_to_10000=4319.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0094 grad_action_out_proj=0.1460 grad_shared_expert=0.4132 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3550/10000 [42:46<1:13:05, 1.47it/s, loss=0.0216, lr=1.98e-05, step=3549] Training: 36%|███▌ | 3550/10000 [42:46<1:13:05, 1.47it/s, loss=0.0349, lr=1.97e-05, step=3550] Training: 36%|███▌ | 3551/10000 [42:47<1:09:56, 1.54it/s, loss=0.0349, lr=1.97e-05, step=3550] Training: 36%|███▌ | 3551/10000 [42:47<1:09:56, 1.54it/s, loss=0.0141, lr=1.97e-05, step=3551] Training: 36%|███▌ | 3552/10000 [42:47<1:04:58, 1.65it/s, loss=0.0141, lr=1.97e-05, step=3551] Training: 36%|███▌ | 3552/10000 [42:47<1:04:58, 1.65it/s, loss=0.0160, lr=1.97e-05, step=3552] Training: 36%|███▌ | 3553/10000 [42:48<1:01:55, 1.74it/s, loss=0.0160, lr=1.97e-05, step=3552] Training: 36%|███▌ | 3553/10000 [42:48<1:01:55, 1.74it/s, loss=0.0240, lr=1.97e-05, step=3553] Training: 36%|███▌ | 3554/10000 [42:48<59:28, 1.81it/s, loss=0.0240, lr=1.97e-05, step=3553] Training: 36%|███▌ | 3554/10000 [42:48<59:28, 1.81it/s, loss=0.0106, lr=1.97e-05, step=3554] Training: 36%|███▌ | 3555/10000 [42:49<1:06:02, 1.63it/s, loss=0.0106, lr=1.97e-05, step=3554] Training: 36%|███▌ | 3555/10000 [42:49<1:06:02, 1.63it/s, loss=0.0126, lr=1.97e-05, step=3555] Training: 36%|███▌ | 3556/10000 [42:50<1:02:44, 1.71it/s, loss=0.0126, lr=1.97e-05, step=3555] Training: 36%|███▌ | 3556/10000 [42:50<1:02:44, 1.71it/s, loss=0.0191, lr=1.97e-05, step=3556] Training: 36%|███▌ | 3557/10000 [42:50<1:07:52, 1.58it/s, loss=0.0191, lr=1.97e-05, step=3556] Training: 36%|███▌ | 3557/10000 [42:50<1:07:52, 1.58it/s, loss=0.0061, lr=1.97e-05, step=3557] Training: 36%|███▌ | 3558/10000 [42:51<1:03:45, 1.68it/s, loss=0.0061, lr=1.97e-05, step=3557] Training: 36%|███▌ | 3558/10000 [42:51<1:03:45, 1.68it/s, loss=0.0145, lr=1.97e-05, step=3558] Training: 36%|███▌ | 3559/10000 [42:51<1:00:16, 1.78it/s, loss=0.0145, lr=1.97e-05, step=3558] Training: 36%|███▌ | 3559/10000 [42:51<1:00:16, 1.78it/s, loss=0.0084, lr=1.97e-05, step=3559]16:48:58.762 [I] step=3560 loss=0.0189 smoothed_loss=0.0166 lr=1.97e-05 grad_norm=0.4672 step_time=0.5044s data_time=0.0626s it/s=1.764 eta_to_10000=3650.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0204 grad_action_out_proj=0.1418 grad_shared_expert=0.4007 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3560/10000 [42:52<1:00:40, 1.77it/s, loss=0.0084, lr=1.97e-05, step=3559] Training: 36%|███▌ | 3560/10000 [42:52<1:00:40, 1.77it/s, loss=0.0189, lr=1.97e-05, step=3560] Training: 36%|███▌ | 3561/10000 [42:52<58:45, 1.83it/s, loss=0.0189, lr=1.97e-05, step=3560] Training: 36%|███▌ | 3561/10000 [42:52<58:45, 1.83it/s, loss=0.0149, lr=1.97e-05, step=3561] Training: 36%|███▌ | 3562/10000 [42:53<1:04:34, 1.66it/s, loss=0.0149, lr=1.97e-05, step=3561] Training: 36%|███▌ | 3562/10000 [42:53<1:04:34, 1.66it/s, loss=0.0079, lr=1.97e-05, step=3562] Training: 36%|███▌ | 3563/10000 [42:54<1:01:26, 1.75it/s, loss=0.0079, lr=1.97e-05, step=3562] Training: 36%|███▌ | 3563/10000 [42:54<1:01:26, 1.75it/s, loss=0.1224, lr=1.97e-05, step=3563] Training: 36%|███▌ | 3564/10000 [42:54<1:06:12, 1.62it/s, loss=0.1224, lr=1.97e-05, step=3563] Training: 36%|███▌ | 3564/10000 [42:54<1:06:12, 1.62it/s, loss=0.0296, lr=1.97e-05, step=3564] Training: 36%|███▌ | 3565/10000 [42:55<1:02:40, 1.71it/s, loss=0.0296, lr=1.97e-05, step=3564] Training: 36%|███▌ | 3565/10000 [42:55<1:02:40, 1.71it/s, loss=0.0118, lr=1.97e-05, step=3565] Training: 36%|███▌ | 3566/10000 [42:55<1:00:52, 1.76it/s, loss=0.0118, lr=1.97e-05, step=3565] Training: 36%|███▌ | 3566/10000 [42:55<1:00:52, 1.76it/s, loss=0.0090, lr=1.97e-05, step=3566] Training: 36%|███▌ | 3567/10000 [42:56<58:59, 1.82it/s, loss=0.0090, lr=1.97e-05, step=3566] Training: 36%|███▌ | 3567/10000 [42:56<58:59, 1.82it/s, loss=0.0360, lr=1.97e-05, step=3567] Training: 36%|███▌ | 3568/10000 [42:56<57:51, 1.85it/s, loss=0.0360, lr=1.97e-05, step=3567] Training: 36%|███▌ | 3568/10000 [42:56<57:51, 1.85it/s, loss=0.0211, lr=1.97e-05, step=3568] Training: 36%|███▌ | 3569/10000 [42:57<1:07:12, 1.59it/s, loss=0.0211, lr=1.97e-05, step=3568] Training: 36%|███▌ | 3569/10000 [42:57<1:07:12, 1.59it/s, loss=0.0127, lr=1.97e-05, step=3569]16:49:04.647 [I] step=3570 loss=0.0176 smoothed_loss=0.0226 lr=1.97e-05 grad_norm=0.5151 step_time=0.5294s data_time=0.0591s it/s=1.699 eta_to_10000=3783.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0113 grad_action_out_proj=0.1299 grad_shared_expert=0.3202 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3570/10000 [42:58<1:04:12, 1.67it/s, loss=0.0127, lr=1.97e-05, step=3569] Training: 36%|███▌ | 3570/10000 [42:58<1:04:12, 1.67it/s, loss=0.0176, lr=1.97e-05, step=3570] Training: 36%|███▌ | 3571/10000 [42:58<1:01:49, 1.73it/s, loss=0.0176, lr=1.97e-05, step=3570] Training: 36%|███▌ | 3571/10000 [42:58<1:01:49, 1.73it/s, loss=0.0041, lr=1.97e-05, step=3571] Training: 36%|███▌ | 3572/10000 [42:59<1:06:50, 1.60it/s, loss=0.0041, lr=1.97e-05, step=3571] Training: 36%|███▌ | 3572/10000 [42:59<1:06:50, 1.60it/s, loss=0.0096, lr=1.97e-05, step=3572] Training: 36%|███▌ | 3573/10000 [42:59<1:02:47, 1.71it/s, loss=0.0096, lr=1.97e-05, step=3572] Training: 36%|███▌ | 3573/10000 [42:59<1:02:47, 1.71it/s, loss=0.0243, lr=1.97e-05, step=3573] Training: 36%|███▌ | 3574/10000 [43:00<59:54, 1.79it/s, loss=0.0243, lr=1.97e-05, step=3573] Training: 36%|███▌ | 3574/10000 [43:00<59:54, 1.79it/s, loss=0.0207, lr=1.97e-05, step=3574] Training: 36%|███▌ | 3575/10000 [43:00<59:02, 1.81it/s, loss=0.0207, lr=1.97e-05, step=3574] Training: 36%|███▌ | 3575/10000 [43:01<59:02, 1.81it/s, loss=0.0164, lr=1.97e-05, step=3575] Training: 36%|███▌ | 3576/10000 [43:01<1:04:34, 1.66it/s, loss=0.0164, lr=1.97e-05, step=3575] Training: 36%|███▌ | 3576/10000 [43:01<1:04:34, 1.66it/s, loss=0.0485, lr=1.97e-05, step=3576] Training: 36%|███▌ | 3577/10000 [43:02<1:00:49, 1.76it/s, loss=0.0485, lr=1.97e-05, step=3576] Training: 36%|███▌ | 3577/10000 [43:02<1:00:49, 1.76it/s, loss=0.0252, lr=1.97e-05, step=3577] Training: 36%|███▌ | 3578/10000 [43:02<59:28, 1.80it/s, loss=0.0252, lr=1.97e-05, step=3577] Training: 36%|███▌ | 3578/10000 [43:02<59:28, 1.80it/s, loss=0.0474, lr=1.97e-05, step=3578] Training: 36%|███▌ | 3579/10000 [43:03<1:04:42, 1.65it/s, loss=0.0474, lr=1.97e-05, step=3578] Training: 36%|███▌ | 3579/10000 [43:03<1:04:42, 1.65it/s, loss=0.0247, lr=1.97e-05, step=3579]16:49:10.425 [I] step=3580 loss=0.0320 smoothed_loss=0.0260 lr=1.97e-05 grad_norm=0.4762 step_time=0.5191s data_time=0.0586s it/s=1.731 eta_to_10000=3708.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0140 grad_action_out_proj=0.1551 grad_shared_expert=0.4475 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3580/10000 [43:03<1:02:27, 1.71it/s, loss=0.0247, lr=1.97e-05, step=3579] Training: 36%|███▌ | 3580/10000 [43:03<1:02:27, 1.71it/s, loss=0.0320, lr=1.97e-05, step=3580] Training: 36%|███▌ | 3581/10000 [43:04<1:03:18, 1.69it/s, loss=0.0320, lr=1.97e-05, step=3580] Training: 36%|███▌ | 3581/10000 [43:04<1:03:18, 1.69it/s, loss=0.0180, lr=1.97e-05, step=3581] Training: 36%|███▌ | 3582/10000 [43:05<1:02:28, 1.71it/s, loss=0.0180, lr=1.97e-05, step=3581] Training: 36%|███▌ | 3582/10000 [43:05<1:02:28, 1.71it/s, loss=0.0257, lr=1.96e-05, step=3582] Training: 36%|███▌ | 3583/10000 [43:05<1:08:30, 1.56it/s, loss=0.0257, lr=1.96e-05, step=3582] Training: 36%|███▌ | 3583/10000 [43:05<1:08:30, 1.56it/s, loss=0.0053, lr=1.96e-05, step=3583] Training: 36%|███▌ | 3584/10000 [43:06<1:04:01, 1.67it/s, loss=0.0053, lr=1.96e-05, step=3583] Training: 36%|███▌ | 3584/10000 [43:06<1:04:01, 1.67it/s, loss=0.0641, lr=1.96e-05, step=3584] Training: 36%|███▌ | 3585/10000 [43:06<1:00:14, 1.77it/s, loss=0.0641, lr=1.96e-05, step=3584] Training: 36%|███▌ | 3585/10000 [43:06<1:00:14, 1.77it/s, loss=0.0262, lr=1.96e-05, step=3585] Training: 36%|███▌ | 3586/10000 [43:07<1:07:39, 1.58it/s, loss=0.0262, lr=1.96e-05, step=3585] Training: 36%|███▌ | 3586/10000 [43:07<1:07:39, 1.58it/s, loss=0.0204, lr=1.96e-05, step=3586] Training: 36%|███▌ | 3587/10000 [43:08<1:02:58, 1.70it/s, loss=0.0204, lr=1.96e-05, step=3586] Training: 36%|███▌ | 3587/10000 [43:08<1:02:58, 1.70it/s, loss=0.0243, lr=1.96e-05, step=3587] Training: 36%|███▌ | 3588/10000 [43:08<1:02:39, 1.71it/s, loss=0.0243, lr=1.96e-05, step=3587] Training: 36%|███▌ | 3588/10000 [43:08<1:02:39, 1.71it/s, loss=0.0234, lr=1.96e-05, step=3588] Training: 36%|███▌ | 3589/10000 [43:09<1:00:35, 1.76it/s, loss=0.0234, lr=1.96e-05, step=3588] Training: 36%|███▌ | 3589/10000 [43:09<1:00:35, 1.76it/s, loss=0.0184, lr=1.96e-05, step=3589]16:49:16.579 [I] step=3590 loss=0.0199 smoothed_loss=0.0247 lr=1.96e-05 grad_norm=0.5838 step_time=0.5386s data_time=0.0768s it/s=1.625 eta_to_10000=3943.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0216 grad_action_out_proj=0.1849 grad_shared_expert=0.5093 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3590/10000 [43:10<1:09:17, 1.54it/s, loss=0.0184, lr=1.96e-05, step=3589] Training: 36%|███▌ | 3590/10000 [43:10<1:09:17, 1.54it/s, loss=0.0199, lr=1.96e-05, step=3590] Training: 36%|███▌ | 3591/10000 [43:10<1:04:30, 1.66it/s, loss=0.0199, lr=1.96e-05, step=3590] Training: 36%|███▌ | 3591/10000 [43:10<1:04:30, 1.66it/s, loss=0.0195, lr=1.96e-05, step=3591] Training: 36%|███▌ | 3592/10000 [43:11<1:02:24, 1.71it/s, loss=0.0195, lr=1.96e-05, step=3591] Training: 36%|███▌ | 3592/10000 [43:11<1:02:24, 1.71it/s, loss=0.0374, lr=1.96e-05, step=3592] Training: 36%|███▌ | 3593/10000 [43:11<1:06:14, 1.61it/s, loss=0.0374, lr=1.96e-05, step=3592] Training: 36%|███▌ | 3593/10000 [43:11<1:06:14, 1.61it/s, loss=0.0837, lr=1.96e-05, step=3593] Training: 36%|███▌ | 3594/10000 [43:12<1:02:12, 1.72it/s, loss=0.0837, lr=1.96e-05, step=3593] Training: 36%|███▌ | 3594/10000 [43:12<1:02:12, 1.72it/s, loss=0.0268, lr=1.96e-05, step=3594] Training: 36%|███▌ | 3595/10000 [43:12<1:01:16, 1.74it/s, loss=0.0268, lr=1.96e-05, step=3594] Training: 36%|███▌ | 3595/10000 [43:12<1:01:16, 1.74it/s, loss=0.0142, lr=1.96e-05, step=3595] Training: 36%|███▌ | 3596/10000 [43:13<58:40, 1.82it/s, loss=0.0142, lr=1.96e-05, step=3595] Training: 36%|███▌ | 3596/10000 [43:13<58:40, 1.82it/s, loss=0.0165, lr=1.96e-05, step=3596] Training: 36%|███▌ | 3597/10000 [43:13<56:57, 1.87it/s, loss=0.0165, lr=1.96e-05, step=3596] Training: 36%|███▌ | 3597/10000 [43:13<56:57, 1.87it/s, loss=0.0186, lr=1.96e-05, step=3597] Training: 36%|███▌ | 3598/10000 [43:14<1:02:51, 1.70it/s, loss=0.0186, lr=1.96e-05, step=3597] Training: 36%|███▌ | 3598/10000 [43:14<1:02:51, 1.70it/s, loss=0.0162, lr=1.96e-05, step=3598] Training: 36%|███▌ | 3599/10000 [43:15<59:57, 1.78it/s, loss=0.0162, lr=1.96e-05, step=3598] Training: 36%|███▌ | 3599/10000 [43:15<59:57, 1.78it/s, loss=0.0292, lr=1.96e-05, step=3599]16:49:22.262 [I] step=3600 loss=0.0127 smoothed_loss=0.0249 lr=1.96e-05 grad_norm=0.5787 step_time=0.5036s data_time=0.0648s it/s=1.760 eta_to_10000=3636.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0267 grad_action_out_proj=0.2140 grad_shared_expert=0.4898 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3600/10000 [43:15<1:03:54, 1.67it/s, loss=0.0292, lr=1.96e-05, step=3599] Training: 36%|███▌ | 3600/10000 [43:15<1:03:54, 1.67it/s, loss=0.0127, lr=1.96e-05, step=3600] Training: 36%|███▌ | 3601/10000 [43:16<1:03:44, 1.67it/s, loss=0.0127, lr=1.96e-05, step=3600] Training: 36%|███▌ | 3601/10000 [43:16<1:03:44, 1.67it/s, loss=0.0222, lr=1.96e-05, step=3601] Training: 36%|███▌ | 3602/10000 [43:16<1:00:34, 1.76it/s, loss=0.0222, lr=1.96e-05, step=3601] Training: 36%|███▌ | 3602/10000 [43:16<1:00:34, 1.76it/s, loss=0.0142, lr=1.96e-05, step=3602] Training: 36%|███▌ | 3603/10000 [43:17<58:30, 1.82it/s, loss=0.0142, lr=1.96e-05, step=3602] Training: 36%|███▌ | 3603/10000 [43:17<58:30, 1.82it/s, loss=0.0057, lr=1.96e-05, step=3603] Training: 36%|███▌ | 3604/10000 [43:17<57:15, 1.86it/s, loss=0.0057, lr=1.96e-05, step=3603] Training: 36%|███▌ | 3604/10000 [43:17<57:15, 1.86it/s, loss=0.0161, lr=1.96e-05, step=3604] Training: 36%|███▌ | 3605/10000 [43:18<1:03:42, 1.67it/s, loss=0.0161, lr=1.96e-05, step=3604] Training: 36%|███▌ | 3605/10000 [43:18<1:03:42, 1.67it/s, loss=0.0084, lr=1.96e-05, step=3605] Training: 36%|███▌ | 3606/10000 [43:19<1:00:13, 1.77it/s, loss=0.0084, lr=1.96e-05, step=3605] Training: 36%|███▌ | 3606/10000 [43:19<1:00:13, 1.77it/s, loss=0.0333, lr=1.96e-05, step=3606] Training: 36%|███▌ | 3607/10000 [43:19<1:03:43, 1.67it/s, loss=0.0333, lr=1.96e-05, step=3606] Training: 36%|███▌ | 3607/10000 [43:19<1:03:43, 1.67it/s, loss=0.0082, lr=1.96e-05, step=3607] Training: 36%|███▌ | 3608/10000 [43:20<1:03:30, 1.68it/s, loss=0.0082, lr=1.96e-05, step=3607] Training: 36%|███▌ | 3608/10000 [43:20<1:03:30, 1.68it/s, loss=0.0114, lr=1.96e-05, step=3608] Training: 36%|███▌ | 3609/10000 [43:20<1:00:55, 1.75it/s, loss=0.0114, lr=1.96e-05, step=3608] Training: 36%|███▌ | 3609/10000 [43:20<1:00:55, 1.75it/s, loss=0.0266, lr=1.96e-05, step=3609]16:49:27.999 [I] step=3610 loss=0.0080 smoothed_loss=0.0187 lr=1.96e-05 grad_norm=0.4226 step_time=0.4998s data_time=0.0738s it/s=1.744 eta_to_10000=3664.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0189 grad_action_out_proj=0.1861 grad_shared_expert=0.4951 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3610/10000 [43:21<1:02:25, 1.71it/s, loss=0.0266, lr=1.96e-05, step=3609] Training: 36%|███▌ | 3610/10000 [43:21<1:02:25, 1.71it/s, loss=0.0080, lr=1.96e-05, step=3610] Training: 36%|███▌ | 3611/10000 [43:22<59:38, 1.79it/s, loss=0.0080, lr=1.96e-05, step=3610] Training: 36%|███▌ | 3611/10000 [43:22<59:38, 1.79it/s, loss=0.0111, lr=1.96e-05, step=3611] Training: 36%|███▌ | 3612/10000 [43:22<1:04:13, 1.66it/s, loss=0.0111, lr=1.96e-05, step=3611] Training: 36%|███▌ | 3612/10000 [43:22<1:04:13, 1.66it/s, loss=0.0150, lr=1.96e-05, step=3612] Training: 36%|███▌ | 3613/10000 [43:23<1:01:00, 1.74it/s, loss=0.0150, lr=1.96e-05, step=3612] Training: 36%|███▌ | 3613/10000 [43:23<1:01:00, 1.74it/s, loss=0.0105, lr=1.96e-05, step=3613] Training: 36%|███▌ | 3614/10000 [43:23<1:05:37, 1.62it/s, loss=0.0105, lr=1.96e-05, step=3613] Training: 36%|███▌ | 3614/10000 [43:23<1:05:37, 1.62it/s, loss=0.0122, lr=1.95e-05, step=3614] Training: 36%|███▌ | 3615/10000 [43:24<1:01:28, 1.73it/s, loss=0.0122, lr=1.95e-05, step=3614] Training: 36%|███▌ | 3615/10000 [43:24<1:01:28, 1.73it/s, loss=0.0123, lr=1.95e-05, step=3615] Training: 36%|███▌ | 3616/10000 [43:24<59:15, 1.80it/s, loss=0.0123, lr=1.95e-05, step=3615] Training: 36%|███▌ | 3616/10000 [43:24<59:15, 1.80it/s, loss=0.0211, lr=1.95e-05, step=3616] Training: 36%|███▌ | 3617/10000 [43:25<57:20, 1.86it/s, loss=0.0211, lr=1.95e-05, step=3616] Training: 36%|███▌ | 3617/10000 [43:25<57:20, 1.86it/s, loss=0.0227, lr=1.95e-05, step=3617] Training: 36%|███▌ | 3618/10000 [43:25<56:15, 1.89it/s, loss=0.0227, lr=1.95e-05, step=3617] Training: 36%|███▌ | 3618/10000 [43:25<56:15, 1.89it/s, loss=0.0057, lr=1.95e-05, step=3618] Training: 36%|███▌ | 3619/10000 [43:26<1:01:54, 1.72it/s, loss=0.0057, lr=1.95e-05, step=3618] Training: 36%|███▌ | 3619/10000 [43:26<1:01:54, 1.72it/s, loss=0.0179, lr=1.95e-05, step=3619]16:49:33.688 [I] step=3620 loss=0.0191 smoothed_loss=0.0165 lr=1.95e-05 grad_norm=0.5734 step_time=0.5115s data_time=0.0575s it/s=1.758 eta_to_10000=3629.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1130 grad_shared_expert=0.3712 (10775:train_pytorch.py:850) + Training: 36%|███▌ | 3620/10000 [43:27<1:01:13, 1.74it/s, loss=0.0179, lr=1.95e-05, step=3619] Training: 36%|███▌ | 3620/10000 [43:27<1:01:13, 1.74it/s, loss=0.0191, lr=1.95e-05, step=3620] Training: 36%|███▌ | 3621/10000 [43:27<1:00:19, 1.76it/s, loss=0.0191, lr=1.95e-05, step=3620] Training: 36%|███▌ | 3621/10000 [43:27<1:00:19, 1.76it/s, loss=0.0404, lr=1.95e-05, step=3621] Training: 36%|███▌ | 3622/10000 [43:28<1:06:40, 1.59it/s, loss=0.0404, lr=1.95e-05, step=3621] Training: 36%|███▌ | 3622/10000 [43:28<1:06:40, 1.59it/s, loss=0.0076, lr=1.95e-05, step=3622] Training: 36%|███▌ | 3623/10000 [43:29<1:03:04, 1.68it/s, loss=0.0076, lr=1.95e-05, step=3622] Training: 36%|███▌ | 3623/10000 [43:29<1:03:04, 1.68it/s, loss=0.1035, lr=1.95e-05, step=3623] Training: 36%|███▌ | 3624/10000 [43:29<1:00:43, 1.75it/s, loss=0.1035, lr=1.95e-05, step=3623] Training: 36%|███▌ | 3624/10000 [43:29<1:00:43, 1.75it/s, loss=0.0202, lr=1.95e-05, step=3624] Training: 36%|███▋ | 3625/10000 [43:30<58:11, 1.83it/s, loss=0.0202, lr=1.95e-05, step=3624] Training: 36%|███▋ | 3625/10000 [43:30<58:11, 1.83it/s, loss=0.0107, lr=1.95e-05, step=3625] Training: 36%|███▋ | 3626/10000 [43:30<1:05:12, 1.63it/s, loss=0.0107, lr=1.95e-05, step=3625] Training: 36%|███▋ | 3626/10000 [43:30<1:05:12, 1.63it/s, loss=0.0072, lr=1.95e-05, step=3626] Training: 36%|███▋ | 3627/10000 [43:31<1:01:32, 1.73it/s, loss=0.0072, lr=1.95e-05, step=3626] Training: 36%|███▋ | 3627/10000 [43:31<1:01:32, 1.73it/s, loss=0.0297, lr=1.95e-05, step=3627] Training: 36%|███▋ | 3628/10000 [43:31<59:10, 1.79it/s, loss=0.0297, lr=1.95e-05, step=3627] Training: 36%|███▋ | 3628/10000 [43:31<59:10, 1.79it/s, loss=0.0059, lr=1.95e-05, step=3628] Training: 36%|███▋ | 3629/10000 [43:32<56:58, 1.86it/s, loss=0.0059, lr=1.95e-05, step=3628] Training: 36%|███▋ | 3629/10000 [43:32<56:58, 1.86it/s, loss=0.0130, lr=1.95e-05, step=3629]16:49:39.582 [I] step=3630 loss=0.0045 smoothed_loss=0.0190 lr=1.95e-05 grad_norm=0.5389 step_time=0.5293s data_time=0.0601s it/s=1.697 eta_to_10000=3753.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0119 grad_action_out_proj=0.1351 grad_shared_expert=0.6462 (10775:train_pytorch.py:850) + Training: 36%|███▋ | 3630/10000 [43:33<1:05:05, 1.63it/s, loss=0.0130, lr=1.95e-05, step=3629] Training: 36%|███▋ | 3630/10000 [43:33<1:05:05, 1.63it/s, loss=0.0045, lr=1.95e-05, step=3630] Training: 36%|███▋ | 3631/10000 [43:33<1:01:10, 1.74it/s, loss=0.0045, lr=1.95e-05, step=3630] Training: 36%|███▋ | 3631/10000 [43:33<1:01:10, 1.74it/s, loss=0.0118, lr=1.95e-05, step=3631] Training: 36%|███▋ | 3632/10000 [43:34<59:03, 1.80it/s, loss=0.0118, lr=1.95e-05, step=3631] Training: 36%|███▋ | 3632/10000 [43:34<59:03, 1.80it/s, loss=0.0231, lr=1.95e-05, step=3632] Training: 36%|███▋ | 3633/10000 [43:34<1:08:17, 1.55it/s, loss=0.0231, lr=1.95e-05, step=3632] Training: 36%|███▋ | 3633/10000 [43:34<1:08:17, 1.55it/s, loss=0.0196, lr=1.95e-05, step=3633] Training: 36%|███▋ | 3634/10000 [43:35<1:04:32, 1.64it/s, loss=0.0196, lr=1.95e-05, step=3633] Training: 36%|███▋ | 3634/10000 [43:35<1:04:32, 1.64it/s, loss=0.0118, lr=1.95e-05, step=3634] Training: 36%|███▋ | 3635/10000 [43:36<1:01:27, 1.73it/s, loss=0.0118, lr=1.95e-05, step=3634] Training: 36%|███▋ | 3635/10000 [43:36<1:01:27, 1.73it/s, loss=0.0229, lr=1.95e-05, step=3635] Training: 36%|███▋ | 3636/10000 [43:36<58:33, 1.81it/s, loss=0.0229, lr=1.95e-05, step=3635] Training: 36%|███▋ | 3636/10000 [43:36<58:33, 1.81it/s, loss=0.0149, lr=1.95e-05, step=3636] Training: 36%|███▋ | 3637/10000 [43:37<1:04:25, 1.65it/s, loss=0.0149, lr=1.95e-05, step=3636] Training: 36%|███▋ | 3637/10000 [43:37<1:04:25, 1.65it/s, loss=0.0057, lr=1.95e-05, step=3637] Training: 36%|███▋ | 3638/10000 [43:37<1:01:47, 1.72it/s, loss=0.0057, lr=1.95e-05, step=3637] Training: 36%|███▋ | 3638/10000 [43:37<1:01:47, 1.72it/s, loss=0.0214, lr=1.95e-05, step=3638] Training: 36%|███▋ | 3639/10000 [43:38<1:01:37, 1.72it/s, loss=0.0214, lr=1.95e-05, step=3638] Training: 36%|███▋ | 3639/10000 [43:38<1:01:37, 1.72it/s, loss=0.0237, lr=1.95e-05, step=3639]16:49:45.356 [I] step=3640 loss=0.0149 smoothed_loss=0.0178 lr=1.95e-05 grad_norm=0.5040 step_time=0.5171s data_time=0.0602s it/s=1.732 eta_to_10000=3671.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.1081 grad_shared_expert=0.3726 (10775:train_pytorch.py:850) + Training: 36%|███▋ | 3640/10000 [43:38<1:00:56, 1.74it/s, loss=0.0237, lr=1.95e-05, step=3639] Training: 36%|███▋ | 3640/10000 [43:38<1:00:56, 1.74it/s, loss=0.0149, lr=1.95e-05, step=3640] Training: 36%|███▋ | 3641/10000 [43:39<1:05:26, 1.62it/s, loss=0.0149, lr=1.95e-05, step=3640] Training: 36%|███▋ | 3641/10000 [43:39<1:05:26, 1.62it/s, loss=0.0262, lr=1.95e-05, step=3641] Training: 36%|███▋ | 3642/10000 [43:40<1:02:09, 1.70it/s, loss=0.0262, lr=1.95e-05, step=3641] Training: 36%|███▋ | 3642/10000 [43:40<1:02:09, 1.70it/s, loss=0.0234, lr=1.95e-05, step=3642] Training: 36%|███▋ | 3643/10000 [43:40<58:45, 1.80it/s, loss=0.0234, lr=1.95e-05, step=3642] Training: 36%|███▋ | 3643/10000 [43:40<58:45, 1.80it/s, loss=0.0048, lr=1.95e-05, step=3643] Training: 36%|███▋ | 3644/10000 [43:41<56:58, 1.86it/s, loss=0.0048, lr=1.95e-05, step=3643] Training: 36%|███▋ | 3644/10000 [43:41<56:58, 1.86it/s, loss=0.0057, lr=1.95e-05, step=3644] Training: 36%|███▋ | 3645/10000 [43:41<1:02:47, 1.69it/s, loss=0.0057, lr=1.95e-05, step=3644] Training: 36%|███▋ | 3645/10000 [43:41<1:02:47, 1.69it/s, loss=0.0043, lr=1.94e-05, step=3645] Training: 36%|███▋ | 3646/10000 [43:42<59:59, 1.77it/s, loss=0.0043, lr=1.94e-05, step=3645] Training: 36%|███▋ | 3646/10000 [43:42<59:59, 1.77it/s, loss=0.0061, lr=1.94e-05, step=3646] Training: 36%|███▋ | 3647/10000 [43:42<1:01:47, 1.71it/s, loss=0.0061, lr=1.94e-05, step=3646] Training: 36%|███▋ | 3647/10000 [43:42<1:01:47, 1.71it/s, loss=0.0515, lr=1.94e-05, step=3647] Training: 36%|███▋ | 3648/10000 [43:43<1:06:45, 1.59it/s, loss=0.0515, lr=1.94e-05, step=3647] Training: 36%|███▋ | 3648/10000 [43:43<1:06:45, 1.59it/s, loss=0.0093, lr=1.94e-05, step=3648] Training: 36%|███▋ | 3649/10000 [43:44<1:03:38, 1.66it/s, loss=0.0093, lr=1.94e-05, step=3648] Training: 36%|███▋ | 3649/10000 [43:44<1:03:38, 1.66it/s, loss=0.0215, lr=1.94e-05, step=3649]16:49:51.215 [I] step=3650 loss=0.0180 smoothed_loss=0.0176 lr=1.94e-05 grad_norm=0.4441 step_time=0.5195s data_time=0.0664s it/s=1.707 eta_to_10000=3720.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0117 grad_action_out_proj=0.1289 grad_shared_expert=0.4233 (10775:train_pytorch.py:850) + Training: 36%|███▋ | 3650/10000 [43:44<1:01:15, 1.73it/s, loss=0.0215, lr=1.94e-05, step=3649] Training: 36%|███▋ | 3650/10000 [43:44<1:01:15, 1.73it/s, loss=0.0180, lr=1.94e-05, step=3650] Training: 37%|███▋ | 3651/10000 [43:45<1:00:16, 1.76it/s, loss=0.0180, lr=1.94e-05, step=3650] Training: 37%|███▋ | 3651/10000 [43:45<1:00:16, 1.76it/s, loss=0.0093, lr=1.94e-05, step=3651] Training: 37%|███▋ | 3652/10000 [43:46<1:05:27, 1.62it/s, loss=0.0093, lr=1.94e-05, step=3651] Training: 37%|███▋ | 3652/10000 [43:46<1:05:27, 1.62it/s, loss=0.0084, lr=1.94e-05, step=3652] Training: 37%|███▋ | 3653/10000 [43:46<1:04:47, 1.63it/s, loss=0.0084, lr=1.94e-05, step=3652] Training: 37%|███▋ | 3653/10000 [43:46<1:04:47, 1.63it/s, loss=0.0216, lr=1.94e-05, step=3653] Training: 37%|███▋ | 3654/10000 [43:47<1:01:28, 1.72it/s, loss=0.0216, lr=1.94e-05, step=3653] Training: 37%|███▋ | 3654/10000 [43:47<1:01:28, 1.72it/s, loss=0.0324, lr=1.94e-05, step=3654] Training: 37%|███▋ | 3655/10000 [43:47<1:05:30, 1.61it/s, loss=0.0324, lr=1.94e-05, step=3654] Training: 37%|███▋ | 3655/10000 [43:47<1:05:30, 1.61it/s, loss=0.0279, lr=1.94e-05, step=3655] Training: 37%|███▋ | 3656/10000 [43:48<1:02:41, 1.69it/s, loss=0.0279, lr=1.94e-05, step=3655] Training: 37%|███▋ | 3656/10000 [43:48<1:02:41, 1.69it/s, loss=0.0132, lr=1.94e-05, step=3656] Training: 37%|███▋ | 3657/10000 [43:48<59:57, 1.76it/s, loss=0.0132, lr=1.94e-05, step=3656] Training: 37%|███▋ | 3657/10000 [43:48<59:57, 1.76it/s, loss=0.0058, lr=1.94e-05, step=3657] Training: 37%|███▋ | 3658/10000 [43:49<58:15, 1.81it/s, loss=0.0058, lr=1.94e-05, step=3657] Training: 37%|███▋ | 3658/10000 [43:49<58:15, 1.81it/s, loss=0.0131, lr=1.94e-05, step=3658] Training: 37%|███▋ | 3659/10000 [43:50<1:03:18, 1.67it/s, loss=0.0131, lr=1.94e-05, step=3658] Training: 37%|███▋ | 3659/10000 [43:50<1:03:18, 1.67it/s, loss=0.0306, lr=1.94e-05, step=3659]16:49:57.109 [I] step=3660 loss=0.0470 smoothed_loss=0.0211 lr=1.94e-05 grad_norm=0.4885 step_time=0.5241s data_time=0.0653s it/s=1.697 eta_to_10000=3736.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0210 grad_action_out_proj=0.1962 grad_shared_expert=0.5141 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3660/10000 [43:50<1:01:14, 1.73it/s, loss=0.0306, lr=1.94e-05, step=3659] Training: 37%|███▋ | 3660/10000 [43:50<1:01:14, 1.73it/s, loss=0.0470, lr=1.94e-05, step=3660] Training: 37%|███▋ | 3661/10000 [43:51<1:00:27, 1.75it/s, loss=0.0470, lr=1.94e-05, step=3660] Training: 37%|███▋ | 3661/10000 [43:51<1:00:27, 1.75it/s, loss=0.0073, lr=1.94e-05, step=3661] Training: 37%|███▋ | 3662/10000 [43:51<1:05:35, 1.61it/s, loss=0.0073, lr=1.94e-05, step=3661] Training: 37%|███▋ | 3662/10000 [43:51<1:05:35, 1.61it/s, loss=0.0115, lr=1.94e-05, step=3662] Training: 37%|███▋ | 3663/10000 [43:52<1:01:17, 1.72it/s, loss=0.0115, lr=1.94e-05, step=3662] Training: 37%|███▋ | 3663/10000 [43:52<1:01:17, 1.72it/s, loss=0.0275, lr=1.94e-05, step=3663] Training: 37%|███▋ | 3664/10000 [43:52<58:39, 1.80it/s, loss=0.0275, lr=1.94e-05, step=3663] Training: 37%|███▋ | 3664/10000 [43:52<58:39, 1.80it/s, loss=0.0072, lr=1.94e-05, step=3664] Training: 37%|███▋ | 3665/10000 [43:53<56:26, 1.87it/s, loss=0.0072, lr=1.94e-05, step=3664] Training: 37%|███▋ | 3665/10000 [43:53<56:26, 1.87it/s, loss=0.0080, lr=1.94e-05, step=3665] Training: 37%|███▋ | 3666/10000 [43:54<1:02:34, 1.69it/s, loss=0.0080, lr=1.94e-05, step=3665] Training: 37%|███▋ | 3666/10000 [43:54<1:02:34, 1.69it/s, loss=0.0028, lr=1.94e-05, step=3666] Training: 37%|███▋ | 3667/10000 [43:54<59:19, 1.78it/s, loss=0.0028, lr=1.94e-05, step=3666] Training: 37%|███▋ | 3667/10000 [43:54<59:19, 1.78it/s, loss=0.0184, lr=1.94e-05, step=3667] Training: 37%|███▋ | 3668/10000 [43:55<57:20, 1.84it/s, loss=0.0184, lr=1.94e-05, step=3667] Training: 37%|███▋ | 3668/10000 [43:55<57:20, 1.84it/s, loss=0.0032, lr=1.94e-05, step=3668] Training: 37%|███▋ | 3669/10000 [43:55<1:02:17, 1.69it/s, loss=0.0032, lr=1.94e-05, step=3668] Training: 37%|███▋ | 3669/10000 [43:55<1:02:17, 1.69it/s, loss=0.0039, lr=1.94e-05, step=3669]16:50:02.815 [I] step=3670 loss=0.0056 smoothed_loss=0.0130 lr=1.94e-05 grad_norm=0.4706 step_time=0.5089s data_time=0.0617s it/s=1.753 eta_to_10000=3611.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0078 grad_action_out_proj=0.0939 grad_shared_expert=0.2430 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3670/10000 [43:56<1:00:21, 1.75it/s, loss=0.0039, lr=1.94e-05, step=3669] Training: 37%|███▋ | 3670/10000 [43:56<1:00:21, 1.75it/s, loss=0.0056, lr=1.94e-05, step=3670] Training: 37%|███▋ | 3671/10000 [43:56<59:17, 1.78it/s, loss=0.0056, lr=1.94e-05, step=3670] Training: 37%|███▋ | 3671/10000 [43:56<59:17, 1.78it/s, loss=0.0051, lr=1.94e-05, step=3671] Training: 37%|███▋ | 3672/10000 [43:57<57:38, 1.83it/s, loss=0.0051, lr=1.94e-05, step=3671] Training: 37%|███▋ | 3672/10000 [43:57<57:38, 1.83it/s, loss=0.0218, lr=1.94e-05, step=3672] Training: 37%|███▋ | 3673/10000 [43:58<1:03:10, 1.67it/s, loss=0.0218, lr=1.94e-05, step=3672] Training: 37%|███▋ | 3673/10000 [43:58<1:03:10, 1.67it/s, loss=0.0086, lr=1.94e-05, step=3673] Training: 37%|███▋ | 3674/10000 [43:58<59:48, 1.76it/s, loss=0.0086, lr=1.94e-05, step=3673] Training: 37%|███▋ | 3674/10000 [43:58<59:48, 1.76it/s, loss=0.0077, lr=1.94e-05, step=3674] Training: 37%|███▋ | 3675/10000 [43:59<57:15, 1.84it/s, loss=0.0077, lr=1.94e-05, step=3674] Training: 37%|███▋ | 3675/10000 [43:59<57:15, 1.84it/s, loss=0.0233, lr=1.94e-05, step=3675] Training: 37%|███▋ | 3676/10000 [43:59<1:02:25, 1.69it/s, loss=0.0233, lr=1.94e-05, step=3675] Training: 37%|███▋ | 3676/10000 [43:59<1:02:25, 1.69it/s, loss=0.0228, lr=1.93e-05, step=3676] Training: 37%|███▋ | 3677/10000 [44:00<59:50, 1.76it/s, loss=0.0228, lr=1.93e-05, step=3676] Training: 37%|███▋ | 3677/10000 [44:00<59:50, 1.76it/s, loss=0.0066, lr=1.93e-05, step=3677] Training: 37%|███▋ | 3678/10000 [44:00<57:57, 1.82it/s, loss=0.0066, lr=1.93e-05, step=3677] Training: 37%|███▋ | 3678/10000 [44:00<57:57, 1.82it/s, loss=0.0058, lr=1.93e-05, step=3678] Training: 37%|███▋ | 3679/10000 [44:01<56:02, 1.88it/s, loss=0.0058, lr=1.93e-05, step=3678] Training: 37%|███▋ | 3679/10000 [44:01<56:02, 1.88it/s, loss=0.0138, lr=1.93e-05, step=3679]16:50:08.528 [I] step=3680 loss=0.0163 smoothed_loss=0.0132 lr=1.93e-05 grad_norm=0.4576 step_time=0.5087s data_time=0.0627s it/s=1.751 eta_to_10000=3609.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0129 grad_action_out_proj=0.1191 grad_shared_expert=0.4310 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3680/10000 [44:02<1:02:47, 1.68it/s, loss=0.0138, lr=1.93e-05, step=3679] Training: 37%|███▋ | 3680/10000 [44:02<1:02:47, 1.68it/s, loss=0.0163, lr=1.93e-05, step=3680] Training: 37%|███▋ | 3681/10000 [44:02<1:01:05, 1.72it/s, loss=0.0163, lr=1.93e-05, step=3680] Training: 37%|███▋ | 3681/10000 [44:02<1:01:05, 1.72it/s, loss=0.0049, lr=1.93e-05, step=3681] Training: 37%|███▋ | 3682/10000 [44:03<58:48, 1.79it/s, loss=0.0049, lr=1.93e-05, step=3681] Training: 37%|███▋ | 3682/10000 [44:03<58:48, 1.79it/s, loss=0.0072, lr=1.93e-05, step=3682] Training: 37%|███▋ | 3683/10000 [44:03<58:49, 1.79it/s, loss=0.0072, lr=1.93e-05, step=3682] Training: 37%|███▋ | 3683/10000 [44:03<58:49, 1.79it/s, loss=0.0399, lr=1.93e-05, step=3683] Training: 37%|███▋ | 3684/10000 [44:04<1:04:21, 1.64it/s, loss=0.0399, lr=1.93e-05, step=3683] Training: 37%|███▋ | 3684/10000 [44:04<1:04:21, 1.64it/s, loss=0.0234, lr=1.93e-05, step=3684] Training: 37%|███▋ | 3685/10000 [44:04<1:01:19, 1.72it/s, loss=0.0234, lr=1.93e-05, step=3684] Training: 37%|███▋ | 3685/10000 [44:04<1:01:19, 1.72it/s, loss=0.0106, lr=1.93e-05, step=3685] Training: 37%|███▋ | 3686/10000 [44:05<59:48, 1.76it/s, loss=0.0106, lr=1.93e-05, step=3685] Training: 37%|███▋ | 3686/10000 [44:05<59:48, 1.76it/s, loss=0.0101, lr=1.93e-05, step=3686] Training: 37%|███▋ | 3687/10000 [44:06<1:05:07, 1.62it/s, loss=0.0101, lr=1.93e-05, step=3686] Training: 37%|███▋ | 3687/10000 [44:06<1:05:07, 1.62it/s, loss=0.0140, lr=1.93e-05, step=3687] Training: 37%|███▋ | 3688/10000 [44:06<1:01:36, 1.71it/s, loss=0.0140, lr=1.93e-05, step=3687] Training: 37%|███▋ | 3688/10000 [44:06<1:01:36, 1.71it/s, loss=0.0102, lr=1.93e-05, step=3688] Training: 37%|███▋ | 3689/10000 [44:07<58:43, 1.79it/s, loss=0.0102, lr=1.93e-05, step=3688] Training: 37%|███▋ | 3689/10000 [44:07<58:43, 1.79it/s, loss=0.0090, lr=1.93e-05, step=3689]16:50:14.203 [I] step=3690 loss=0.0145 smoothed_loss=0.0136 lr=1.93e-05 grad_norm=0.4979 step_time=0.4995s data_time=0.0678s it/s=1.762 eta_to_10000=3580.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0268 grad_action_out_proj=0.1618 grad_shared_expert=0.4657 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3690/10000 [44:07<58:09, 1.81it/s, loss=0.0090, lr=1.93e-05, step=3689] Training: 37%|███▋ | 3690/10000 [44:07<58:09, 1.81it/s, loss=0.0145, lr=1.93e-05, step=3690] Training: 37%|███▋ | 3691/10000 [44:08<1:03:23, 1.66it/s, loss=0.0145, lr=1.93e-05, step=3690] Training: 37%|███▋ | 3691/10000 [44:08<1:03:23, 1.66it/s, loss=0.0442, lr=1.93e-05, step=3691] Training: 37%|███▋ | 3692/10000 [44:08<1:00:28, 1.74it/s, loss=0.0442, lr=1.93e-05, step=3691] Training: 37%|███▋ | 3692/10000 [44:08<1:00:28, 1.74it/s, loss=0.0196, lr=1.93e-05, step=3692] Training: 37%|███▋ | 3693/10000 [44:09<58:38, 1.79it/s, loss=0.0196, lr=1.93e-05, step=3692] Training: 37%|███▋ | 3693/10000 [44:09<58:38, 1.79it/s, loss=0.0294, lr=1.93e-05, step=3693] Training: 37%|███▋ | 3694/10000 [44:10<57:53, 1.82it/s, loss=0.0294, lr=1.93e-05, step=3693] Training: 37%|███▋ | 3694/10000 [44:10<57:53, 1.82it/s, loss=0.0192, lr=1.93e-05, step=3694] Training: 37%|███▋ | 3695/10000 [44:10<1:01:43, 1.70it/s, loss=0.0192, lr=1.93e-05, step=3694] Training: 37%|███▋ | 3695/10000 [44:10<1:01:43, 1.70it/s, loss=0.0518, lr=1.93e-05, step=3695] Training: 37%|███▋ | 3696/10000 [44:11<58:55, 1.78it/s, loss=0.0518, lr=1.93e-05, step=3695] Training: 37%|███▋ | 3696/10000 [44:11<58:55, 1.78it/s, loss=0.0147, lr=1.93e-05, step=3696] Training: 37%|███▋ | 3697/10000 [44:11<56:27, 1.86it/s, loss=0.0147, lr=1.93e-05, step=3696] Training: 37%|███▋ | 3697/10000 [44:11<56:27, 1.86it/s, loss=0.0549, lr=1.93e-05, step=3697] Training: 37%|███▋ | 3698/10000 [44:12<1:03:11, 1.66it/s, loss=0.0549, lr=1.93e-05, step=3697] Training: 37%|███▋ | 3698/10000 [44:12<1:03:11, 1.66it/s, loss=0.0174, lr=1.93e-05, step=3698] Training: 37%|███▋ | 3699/10000 [44:12<59:19, 1.77it/s, loss=0.0174, lr=1.93e-05, step=3698] Training: 37%|███▋ | 3699/10000 [44:12<59:19, 1.77it/s, loss=0.0131, lr=1.93e-05, step=3699]16:50:19.908 [I] step=3700 loss=0.0136 smoothed_loss=0.0217 lr=1.93e-05 grad_norm=0.5549 step_time=0.5094s data_time=0.0612s it/s=1.753 eta_to_10000=3594.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0171 grad_action_out_proj=0.1528 grad_shared_expert=0.5167 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3700/10000 [44:13<58:29, 1.80it/s, loss=0.0131, lr=1.93e-05, step=3699] Training: 37%|███▋ | 3700/10000 [44:13<58:29, 1.80it/s, loss=0.0136, lr=1.93e-05, step=3700] Training: 37%|███▋ | 3701/10000 [44:13<57:27, 1.83it/s, loss=0.0136, lr=1.93e-05, step=3700] Training: 37%|███▋ | 3701/10000 [44:13<57:27, 1.83it/s, loss=0.0387, lr=1.93e-05, step=3701] Training: 37%|███▋ | 3702/10000 [44:14<1:03:29, 1.65it/s, loss=0.0387, lr=1.93e-05, step=3701] Training: 37%|███▋ | 3702/10000 [44:14<1:03:29, 1.65it/s, loss=0.0807, lr=1.93e-05, step=3702] Training: 37%|███▋ | 3703/10000 [44:15<1:00:27, 1.74it/s, loss=0.0807, lr=1.93e-05, step=3702] Training: 37%|███▋ | 3703/10000 [44:15<1:00:27, 1.74it/s, loss=0.0131, lr=1.93e-05, step=3703] Training: 37%|███▋ | 3704/10000 [44:15<58:22, 1.80it/s, loss=0.0131, lr=1.93e-05, step=3703] Training: 37%|███▋ | 3704/10000 [44:15<58:22, 1.80it/s, loss=0.0256, lr=1.93e-05, step=3704] Training: 37%|███▋ | 3705/10000 [44:16<56:25, 1.86it/s, loss=0.0256, lr=1.93e-05, step=3704] Training: 37%|███▋ | 3705/10000 [44:16<56:25, 1.86it/s, loss=0.0691, lr=1.93e-05, step=3705] Training: 37%|███▋ | 3706/10000 [44:16<1:01:57, 1.69it/s, loss=0.0691, lr=1.93e-05, step=3705] Training: 37%|███▋ | 3706/10000 [44:16<1:01:57, 1.69it/s, loss=0.0095, lr=1.93e-05, step=3706] Training: 37%|███▋ | 3707/10000 [44:17<58:21, 1.80it/s, loss=0.0095, lr=1.93e-05, step=3706] Training: 37%|███▋ | 3707/10000 [44:17<58:21, 1.80it/s, loss=0.0162, lr=1.92e-05, step=3707] Training: 37%|███▋ | 3708/10000 [44:17<56:46, 1.85it/s, loss=0.0162, lr=1.92e-05, step=3707] Training: 37%|███▋ | 3708/10000 [44:17<56:46, 1.85it/s, loss=0.0205, lr=1.92e-05, step=3708] Training: 37%|███▋ | 3709/10000 [44:18<54:59, 1.91it/s, loss=0.0205, lr=1.92e-05, step=3708] Training: 37%|███▋ | 3709/10000 [44:18<54:59, 1.91it/s, loss=0.0059, lr=1.92e-05, step=3709]16:50:25.650 [I] step=3710 loss=0.0062 smoothed_loss=0.0232 lr=1.93e-05 grad_norm=0.5460 step_time=0.5146s data_time=0.0596s it/s=1.742 eta_to_10000=3611.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0219 grad_action_out_proj=0.1685 grad_shared_expert=0.5087 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3710/10000 [44:19<1:03:06, 1.66it/s, loss=0.0059, lr=1.92e-05, step=3709] Training: 37%|███▋ | 3710/10000 [44:19<1:03:06, 1.66it/s, loss=0.0062, lr=1.92e-05, step=3710] Training: 37%|███▋ | 3711/10000 [44:19<1:00:36, 1.73it/s, loss=0.0062, lr=1.92e-05, step=3710] Training: 37%|███▋ | 3711/10000 [44:19<1:00:36, 1.73it/s, loss=0.0224, lr=1.92e-05, step=3711] Training: 37%|███▋ | 3712/10000 [44:20<58:35, 1.79it/s, loss=0.0224, lr=1.92e-05, step=3711] Training: 37%|███▋ | 3712/10000 [44:20<58:35, 1.79it/s, loss=0.0308, lr=1.92e-05, step=3712] Training: 37%|███▋ | 3713/10000 [44:21<1:05:00, 1.61it/s, loss=0.0308, lr=1.92e-05, step=3712] Training: 37%|███▋ | 3713/10000 [44:21<1:05:00, 1.61it/s, loss=0.0161, lr=1.92e-05, step=3713] Training: 37%|███▋ | 3714/10000 [44:21<1:03:25, 1.65it/s, loss=0.0161, lr=1.92e-05, step=3713] Training: 37%|███▋ | 3714/10000 [44:21<1:03:25, 1.65it/s, loss=0.0032, lr=1.92e-05, step=3714] Training: 37%|███▋ | 3715/10000 [44:22<1:01:28, 1.70it/s, loss=0.0032, lr=1.92e-05, step=3714] Training: 37%|███▋ | 3715/10000 [44:22<1:01:28, 1.70it/s, loss=0.0194, lr=1.92e-05, step=3715] Training: 37%|███▋ | 3716/10000 [44:22<59:25, 1.76it/s, loss=0.0194, lr=1.92e-05, step=3715] Training: 37%|███▋ | 3716/10000 [44:22<59:25, 1.76it/s, loss=0.0066, lr=1.92e-05, step=3716] Training: 37%|███▋ | 3717/10000 [44:23<1:04:04, 1.63it/s, loss=0.0066, lr=1.92e-05, step=3716] Training: 37%|███▋ | 3717/10000 [44:23<1:04:04, 1.63it/s, loss=0.0365, lr=1.92e-05, step=3717] Training: 37%|███▋ | 3718/10000 [44:23<1:00:37, 1.73it/s, loss=0.0365, lr=1.92e-05, step=3717] Training: 37%|███▋ | 3718/10000 [44:23<1:00:37, 1.73it/s, loss=0.0107, lr=1.92e-05, step=3718] Training: 37%|███▋ | 3719/10000 [44:24<1:05:11, 1.61it/s, loss=0.0107, lr=1.92e-05, step=3718] Training: 37%|███▋ | 3719/10000 [44:24<1:05:11, 1.61it/s, loss=0.0128, lr=1.92e-05, step=3719]16:50:31.598 [I] step=3720 loss=0.0129 smoothed_loss=0.0188 lr=1.92e-05 grad_norm=0.6689 step_time=0.5307s data_time=0.0640s it/s=1.682 eta_to_10000=3734.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1452 grad_shared_expert=0.3997 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3720/10000 [44:25<1:03:26, 1.65it/s, loss=0.0128, lr=1.92e-05, step=3719] Training: 37%|███▋ | 3720/10000 [44:25<1:03:26, 1.65it/s, loss=0.0129, lr=1.92e-05, step=3720] Training: 37%|███▋ | 3721/10000 [44:25<1:00:01, 1.74it/s, loss=0.0129, lr=1.92e-05, step=3720] Training: 37%|███▋ | 3721/10000 [44:25<1:00:01, 1.74it/s, loss=0.0102, lr=1.92e-05, step=3721] Training: 37%|███▋ | 3722/10000 [44:26<57:26, 1.82it/s, loss=0.0102, lr=1.92e-05, step=3721] Training: 37%|███▋ | 3722/10000 [44:26<57:26, 1.82it/s, loss=0.0058, lr=1.92e-05, step=3722] Training: 37%|███▋ | 3723/10000 [44:26<55:40, 1.88it/s, loss=0.0058, lr=1.92e-05, step=3722] Training: 37%|███▋ | 3723/10000 [44:26<55:40, 1.88it/s, loss=0.0269, lr=1.92e-05, step=3723] Training: 37%|███▋ | 3724/10000 [44:27<54:11, 1.93it/s, loss=0.0269, lr=1.92e-05, step=3723] Training: 37%|███▋ | 3724/10000 [44:27<54:11, 1.93it/s, loss=0.0188, lr=1.92e-05, step=3724] Training: 37%|███▋ | 3725/10000 [44:27<1:00:20, 1.73it/s, loss=0.0188, lr=1.92e-05, step=3724] Training: 37%|███▋ | 3725/10000 [44:27<1:00:20, 1.73it/s, loss=0.0242, lr=1.92e-05, step=3725] Training: 37%|███▋ | 3726/10000 [44:28<58:12, 1.80it/s, loss=0.0242, lr=1.92e-05, step=3725] Training: 37%|███▋ | 3726/10000 [44:28<58:12, 1.80it/s, loss=0.0104, lr=1.92e-05, step=3726] Training: 37%|███▋ | 3727/10000 [44:29<1:03:35, 1.64it/s, loss=0.0104, lr=1.92e-05, step=3726] Training: 37%|███▋ | 3727/10000 [44:29<1:03:35, 1.64it/s, loss=0.0091, lr=1.92e-05, step=3727] Training: 37%|███▋ | 3728/10000 [44:29<1:01:24, 1.70it/s, loss=0.0091, lr=1.92e-05, step=3727] Training: 37%|███▋ | 3728/10000 [44:29<1:01:24, 1.70it/s, loss=0.0059, lr=1.92e-05, step=3728] Training: 37%|███▋ | 3729/10000 [44:30<59:02, 1.77it/s, loss=0.0059, lr=1.92e-05, step=3728] Training: 37%|███▋ | 3729/10000 [44:30<59:02, 1.77it/s, loss=0.1149, lr=1.92e-05, step=3729]16:50:37.114 [I] step=3730 loss=0.0057 smoothed_loss=0.0237 lr=1.92e-05 grad_norm=0.5181 step_time=0.4902s data_time=0.0614s it/s=1.813 eta_to_10000=3458.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.1309 grad_shared_expert=0.3718 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3730/10000 [44:30<58:24, 1.79it/s, loss=0.1149, lr=1.92e-05, step=3729] Training: 37%|███▋ | 3730/10000 [44:30<58:24, 1.79it/s, loss=0.0057, lr=1.92e-05, step=3730] Training: 37%|███▋ | 3731/10000 [44:31<57:11, 1.83it/s, loss=0.0057, lr=1.92e-05, step=3730] Training: 37%|███▋ | 3731/10000 [44:31<57:11, 1.83it/s, loss=0.0100, lr=1.92e-05, step=3731] Training: 37%|███▋ | 3732/10000 [44:31<1:03:04, 1.66it/s, loss=0.0100, lr=1.92e-05, step=3731] Training: 37%|███▋ | 3732/10000 [44:31<1:03:04, 1.66it/s, loss=0.0237, lr=1.92e-05, step=3732] Training: 37%|███▋ | 3733/10000 [44:32<59:50, 1.75it/s, loss=0.0237, lr=1.92e-05, step=3732] Training: 37%|███▋ | 3733/10000 [44:32<59:50, 1.75it/s, loss=0.0162, lr=1.92e-05, step=3733] Training: 37%|███▋ | 3734/10000 [44:33<1:04:50, 1.61it/s, loss=0.0162, lr=1.92e-05, step=3733] Training: 37%|███▋ | 3734/10000 [44:33<1:04:50, 1.61it/s, loss=0.0120, lr=1.92e-05, step=3734] Training: 37%|███▋ | 3735/10000 [44:33<1:01:45, 1.69it/s, loss=0.0120, lr=1.92e-05, step=3734] Training: 37%|███▋ | 3735/10000 [44:33<1:01:45, 1.69it/s, loss=0.0207, lr=1.92e-05, step=3735] Training: 37%|███▋ | 3736/10000 [44:34<59:36, 1.75it/s, loss=0.0207, lr=1.92e-05, step=3735] Training: 37%|███▋ | 3736/10000 [44:34<59:36, 1.75it/s, loss=0.0105, lr=1.92e-05, step=3736] Training: 37%|███▋ | 3737/10000 [44:34<57:59, 1.80it/s, loss=0.0105, lr=1.92e-05, step=3736] Training: 37%|███▋ | 3737/10000 [44:34<57:59, 1.80it/s, loss=0.0157, lr=1.92e-05, step=3737] Training: 37%|███▋ | 3738/10000 [44:35<55:46, 1.87it/s, loss=0.0157, lr=1.92e-05, step=3737] Training: 37%|███▋ | 3738/10000 [44:35<55:46, 1.87it/s, loss=0.0156, lr=1.91e-05, step=3738] Training: 37%|███▋ | 3739/10000 [44:35<1:02:22, 1.67it/s, loss=0.0156, lr=1.91e-05, step=3738] Training: 37%|███▋ | 3739/10000 [44:35<1:02:22, 1.67it/s, loss=0.0198, lr=1.91e-05, step=3739]16:50:42.921 [I] step=3740 loss=0.0181 smoothed_loss=0.0190 lr=1.92e-05 grad_norm=0.4972 step_time=0.5151s data_time=0.0657s it/s=1.722 eta_to_10000=3635.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0091 grad_action_out_proj=0.1054 grad_shared_expert=0.3119 (10775:train_pytorch.py:850) + Training: 37%|███▋ | 3740/10000 [44:36<1:00:00, 1.74it/s, loss=0.0198, lr=1.91e-05, step=3739] Training: 37%|███▋ | 3740/10000 [44:36<1:00:00, 1.74it/s, loss=0.0181, lr=1.91e-05, step=3740] Training: 37%|███▋ | 3741/10000 [44:37<1:04:56, 1.61it/s, loss=0.0181, lr=1.91e-05, step=3740] Training: 37%|███▋ | 3741/10000 [44:37<1:04:56, 1.61it/s, loss=0.0286, lr=1.91e-05, step=3741] Training: 37%|███▋ | 3742/10000 [44:37<1:02:43, 1.66it/s, loss=0.0286, lr=1.91e-05, step=3741] Training: 37%|███▋ | 3742/10000 [44:37<1:02:43, 1.66it/s, loss=0.0112, lr=1.91e-05, step=3742] Training: 37%|███▋ | 3743/10000 [44:38<59:34, 1.75it/s, loss=0.0112, lr=1.91e-05, step=3742] Training: 37%|███▋ | 3743/10000 [44:38<59:34, 1.75it/s, loss=0.0389, lr=1.91e-05, step=3743] Training: 37%|███▋ | 3744/10000 [44:38<1:00:40, 1.72it/s, loss=0.0389, lr=1.91e-05, step=3743] Training: 37%|███▋ | 3744/10000 [44:38<1:00:40, 1.72it/s, loss=0.0119, lr=1.91e-05, step=3744] Training: 37%|███▋ | 3745/10000 [44:39<58:13, 1.79it/s, loss=0.0119, lr=1.91e-05, step=3744] Training: 37%|███▋ | 3745/10000 [44:39<58:13, 1.79it/s, loss=0.1072, lr=1.91e-05, step=3745] Training: 37%|███▋ | 3746/10000 [44:40<1:02:56, 1.66it/s, loss=0.1072, lr=1.91e-05, step=3745] Training: 37%|███▋ | 3746/10000 [44:40<1:02:56, 1.66it/s, loss=0.0088, lr=1.91e-05, step=3746] Training: 37%|███▋ | 3747/10000 [44:40<1:00:05, 1.73it/s, loss=0.0088, lr=1.91e-05, step=3746] Training: 37%|███▋ | 3747/10000 [44:40<1:00:05, 1.73it/s, loss=0.0172, lr=1.91e-05, step=3747] Training: 37%|███▋ | 3748/10000 [44:41<1:05:02, 1.60it/s, loss=0.0172, lr=1.91e-05, step=3747] Training: 37%|███▋ | 3748/10000 [44:41<1:05:02, 1.60it/s, loss=0.0055, lr=1.91e-05, step=3748] Training: 37%|███▋ | 3749/10000 [44:41<1:00:54, 1.71it/s, loss=0.0055, lr=1.91e-05, step=3748] Training: 37%|███▋ | 3749/10000 [44:41<1:00:54, 1.71it/s, loss=0.0273, lr=1.91e-05, step=3749]16:50:48.817 [I] step=3750 loss=0.0257 smoothed_loss=0.0243 lr=1.91e-05 grad_norm=0.5066 step_time=0.5219s data_time=0.0677s it/s=1.696 eta_to_10000=3684.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.1234 grad_shared_expert=0.3862 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3750/10000 [44:42<59:45, 1.74it/s, loss=0.0273, lr=1.91e-05, step=3749] Training: 38%|███▊ | 3750/10000 [44:42<59:45, 1.74it/s, loss=0.0257, lr=1.91e-05, step=3750] Training: 38%|███▊ | 3751/10000 [44:42<58:14, 1.79it/s, loss=0.0257, lr=1.91e-05, step=3750] Training: 38%|███▊ | 3751/10000 [44:42<58:14, 1.79it/s, loss=0.0137, lr=1.91e-05, step=3751] Training: 38%|███▊ | 3752/10000 [44:43<57:08, 1.82it/s, loss=0.0137, lr=1.91e-05, step=3751] Training: 38%|███▊ | 3752/10000 [44:43<57:08, 1.82it/s, loss=0.0158, lr=1.91e-05, step=3752] Training: 38%|███▊ | 3753/10000 [44:44<1:07:13, 1.55it/s, loss=0.0158, lr=1.91e-05, step=3752] Training: 38%|███▊ | 3753/10000 [44:44<1:07:13, 1.55it/s, loss=0.0632, lr=1.91e-05, step=3753] Training: 38%|███▊ | 3754/10000 [44:44<1:03:12, 1.65it/s, loss=0.0632, lr=1.91e-05, step=3753] Training: 38%|███▊ | 3754/10000 [44:44<1:03:12, 1.65it/s, loss=0.0039, lr=1.91e-05, step=3754] Training: 38%|███▊ | 3755/10000 [44:45<1:06:31, 1.56it/s, loss=0.0039, lr=1.91e-05, step=3754] Training: 38%|███▊ | 3755/10000 [44:45<1:06:31, 1.56it/s, loss=0.0308, lr=1.91e-05, step=3755] Training: 38%|███▊ | 3756/10000 [44:46<1:02:15, 1.67it/s, loss=0.0308, lr=1.91e-05, step=3755] Training: 38%|███▊ | 3756/10000 [44:46<1:02:15, 1.67it/s, loss=0.0057, lr=1.91e-05, step=3756] Training: 38%|███▊ | 3757/10000 [44:46<59:38, 1.74it/s, loss=0.0057, lr=1.91e-05, step=3756] Training: 38%|███▊ | 3757/10000 [44:46<59:38, 1.74it/s, loss=0.0418, lr=1.91e-05, step=3757] Training: 38%|███▊ | 3758/10000 [44:47<58:38, 1.77it/s, loss=0.0418, lr=1.91e-05, step=3757] Training: 38%|███▊ | 3758/10000 [44:47<58:38, 1.77it/s, loss=0.0052, lr=1.91e-05, step=3758] Training: 38%|███▊ | 3759/10000 [44:47<1:05:35, 1.59it/s, loss=0.0052, lr=1.91e-05, step=3758] Training: 38%|███▊ | 3759/10000 [44:47<1:05:35, 1.59it/s, loss=0.0058, lr=1.91e-05, step=3759]16:50:54.887 [I] step=3760 loss=0.0058 smoothed_loss=0.0197 lr=1.91e-05 grad_norm=0.5254 step_time=0.5426s data_time=0.0645s it/s=1.648 eta_to_10000=3787.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0160 grad_action_out_proj=0.1221 grad_shared_expert=0.4212 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3760/10000 [44:48<1:03:45, 1.63it/s, loss=0.0058, lr=1.91e-05, step=3759] Training: 38%|███▊ | 3760/10000 [44:48<1:03:45, 1.63it/s, loss=0.0058, lr=1.91e-05, step=3760] Training: 38%|███▊ | 3761/10000 [44:48<1:00:46, 1.71it/s, loss=0.0058, lr=1.91e-05, step=3760] Training: 38%|███▊ | 3761/10000 [44:48<1:00:46, 1.71it/s, loss=0.0125, lr=1.91e-05, step=3761] Training: 38%|███▊ | 3762/10000 [44:49<1:05:28, 1.59it/s, loss=0.0125, lr=1.91e-05, step=3761] Training: 38%|███▊ | 3762/10000 [44:49<1:05:28, 1.59it/s, loss=0.0094, lr=1.91e-05, step=3762] Training: 38%|███▊ | 3763/10000 [44:50<1:01:56, 1.68it/s, loss=0.0094, lr=1.91e-05, step=3762] Training: 38%|███▊ | 3763/10000 [44:50<1:01:56, 1.68it/s, loss=0.0193, lr=1.91e-05, step=3763] Training: 38%|███▊ | 3764/10000 [44:50<59:09, 1.76it/s, loss=0.0193, lr=1.91e-05, step=3763] Training: 38%|███▊ | 3764/10000 [44:50<59:09, 1.76it/s, loss=0.0105, lr=1.91e-05, step=3764] Training: 38%|███▊ | 3765/10000 [44:51<59:55, 1.73it/s, loss=0.0105, lr=1.91e-05, step=3764] Training: 38%|███▊ | 3765/10000 [44:51<59:55, 1.73it/s, loss=0.0065, lr=1.91e-05, step=3765] Training: 38%|███▊ | 3766/10000 [44:52<1:05:22, 1.59it/s, loss=0.0065, lr=1.91e-05, step=3765] Training: 38%|███▊ | 3766/10000 [44:52<1:05:22, 1.59it/s, loss=0.0170, lr=1.91e-05, step=3766] Training: 38%|███▊ | 3767/10000 [44:52<1:02:06, 1.67it/s, loss=0.0170, lr=1.91e-05, step=3766] Training: 38%|███▊ | 3767/10000 [44:52<1:02:06, 1.67it/s, loss=0.0156, lr=1.91e-05, step=3767] Training: 38%|███▊ | 3768/10000 [44:53<59:45, 1.74it/s, loss=0.0156, lr=1.91e-05, step=3767] Training: 38%|███▊ | 3768/10000 [44:53<59:45, 1.74it/s, loss=0.0420, lr=1.90e-05, step=3768] Training: 38%|███▊ | 3769/10000 [44:53<57:59, 1.79it/s, loss=0.0420, lr=1.90e-05, step=3768] Training: 38%|███▊ | 3769/10000 [44:53<57:59, 1.79it/s, loss=0.0147, lr=1.90e-05, step=3769]16:51:00.814 [I] step=3770 loss=0.0022 smoothed_loss=0.0168 lr=1.91e-05 grad_norm=0.4609 step_time=0.5289s data_time=0.0637s it/s=1.688 eta_to_10000=3691.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0110 grad_action_out_proj=0.0983 grad_shared_expert=0.3845 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3770/10000 [44:54<1:03:32, 1.63it/s, loss=0.0147, lr=1.90e-05, step=3769] Training: 38%|███▊ | 3770/10000 [44:54<1:03:32, 1.63it/s, loss=0.0022, lr=1.90e-05, step=3770] Training: 38%|███▊ | 3771/10000 [44:54<1:00:31, 1.72it/s, loss=0.0022, lr=1.90e-05, step=3770] Training: 38%|███▊ | 3771/10000 [44:54<1:00:31, 1.72it/s, loss=0.0178, lr=1.90e-05, step=3771] Training: 38%|███▊ | 3772/10000 [44:55<1:03:21, 1.64it/s, loss=0.0178, lr=1.90e-05, step=3771] Training: 38%|███▊ | 3772/10000 [44:55<1:03:21, 1.64it/s, loss=0.0090, lr=1.90e-05, step=3772] Training: 38%|███▊ | 3773/10000 [44:56<59:53, 1.73it/s, loss=0.0090, lr=1.90e-05, step=3772] Training: 38%|███▊ | 3773/10000 [44:56<59:53, 1.73it/s, loss=0.0136, lr=1.90e-05, step=3773] Training: 38%|███▊ | 3774/10000 [44:56<1:08:02, 1.53it/s, loss=0.0136, lr=1.90e-05, step=3773] Training: 38%|███▊ | 3774/10000 [44:56<1:08:02, 1.53it/s, loss=0.0073, lr=1.90e-05, step=3774] Training: 38%|███▊ | 3775/10000 [44:57<1:03:49, 1.63it/s, loss=0.0073, lr=1.90e-05, step=3774] Training: 38%|███▊ | 3775/10000 [44:57<1:03:49, 1.63it/s, loss=0.0031, lr=1.90e-05, step=3775] Training: 38%|███▊ | 3776/10000 [44:58<1:02:52, 1.65it/s, loss=0.0031, lr=1.90e-05, step=3775] Training: 38%|███▊ | 3776/10000 [44:58<1:02:52, 1.65it/s, loss=0.0062, lr=1.90e-05, step=3776] Training: 38%|███▊ | 3777/10000 [44:58<1:06:14, 1.57it/s, loss=0.0062, lr=1.90e-05, step=3776] Training: 38%|███▊ | 3777/10000 [44:58<1:06:14, 1.57it/s, loss=0.0055, lr=1.90e-05, step=3777] Training: 38%|███▊ | 3778/10000 [44:59<1:05:40, 1.58it/s, loss=0.0055, lr=1.90e-05, step=3777] Training: 38%|███▊ | 3778/10000 [44:59<1:05:40, 1.58it/s, loss=0.0142, lr=1.90e-05, step=3778] Training: 38%|███▊ | 3779/10000 [44:59<1:01:32, 1.68it/s, loss=0.0142, lr=1.90e-05, step=3778] Training: 38%|███▊ | 3779/10000 [44:59<1:01:32, 1.68it/s, loss=0.0176, lr=1.90e-05, step=3779]16:51:06.820 [I] step=3780 loss=0.0173 smoothed_loss=0.0134 lr=1.90e-05 grad_norm=0.4356 step_time=0.5322s data_time=0.0685s it/s=1.665 eta_to_10000=3735.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.1469 grad_shared_expert=0.5851 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3780/10000 [45:00<59:45, 1.73it/s, loss=0.0176, lr=1.90e-05, step=3779] Training: 38%|███▊ | 3780/10000 [45:00<59:45, 1.73it/s, loss=0.0173, lr=1.90e-05, step=3780] Training: 38%|███▊ | 3781/10000 [45:01<1:08:52, 1.50it/s, loss=0.0173, lr=1.90e-05, step=3780] Training: 38%|███▊ | 3781/10000 [45:01<1:08:52, 1.50it/s, loss=0.0736, lr=1.90e-05, step=3781] Training: 38%|███▊ | 3782/10000 [45:01<1:07:36, 1.53it/s, loss=0.0736, lr=1.90e-05, step=3781] Training: 38%|███▊ | 3782/10000 [45:01<1:07:36, 1.53it/s, loss=0.0183, lr=1.90e-05, step=3782] Training: 38%|███▊ | 3783/10000 [45:02<1:05:11, 1.59it/s, loss=0.0183, lr=1.90e-05, step=3782] Training: 38%|███▊ | 3783/10000 [45:02<1:05:11, 1.59it/s, loss=0.0547, lr=1.90e-05, step=3783] Training: 38%|███▊ | 3784/10000 [45:03<1:08:07, 1.52it/s, loss=0.0547, lr=1.90e-05, step=3783] Training: 38%|███▊ | 3784/10000 [45:03<1:08:07, 1.52it/s, loss=0.0028, lr=1.90e-05, step=3784] Training: 38%|███▊ | 3785/10000 [45:03<1:04:55, 1.60it/s, loss=0.0028, lr=1.90e-05, step=3784] Training: 38%|███▊ | 3785/10000 [45:03<1:04:55, 1.60it/s, loss=0.0093, lr=1.90e-05, step=3785] Training: 38%|███▊ | 3786/10000 [45:04<1:01:36, 1.68it/s, loss=0.0093, lr=1.90e-05, step=3785] Training: 38%|███▊ | 3786/10000 [45:04<1:01:36, 1.68it/s, loss=0.0188, lr=1.90e-05, step=3786] Training: 38%|███▊ | 3787/10000 [45:04<58:31, 1.77it/s, loss=0.0188, lr=1.90e-05, step=3786] Training: 38%|███▊ | 3787/10000 [45:04<58:31, 1.77it/s, loss=0.0132, lr=1.90e-05, step=3787] Training: 38%|███▊ | 3788/10000 [45:05<56:32, 1.83it/s, loss=0.0132, lr=1.90e-05, step=3787] Training: 38%|███▊ | 3788/10000 [45:05<56:32, 1.83it/s, loss=0.0039, lr=1.90e-05, step=3788] Training: 38%|███▊ | 3789/10000 [45:06<1:05:15, 1.59it/s, loss=0.0039, lr=1.90e-05, step=3788] Training: 38%|███▊ | 3789/10000 [45:06<1:05:15, 1.59it/s, loss=0.0182, lr=1.90e-05, step=3789]16:51:13.091 [I] step=3790 loss=0.0188 smoothed_loss=0.0177 lr=1.90e-05 grad_norm=0.5349 step_time=0.5570s data_time=0.0701s it/s=1.595 eta_to_10000=3893.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0166 grad_action_out_proj=0.1429 grad_shared_expert=0.7577 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3790/10000 [45:06<1:03:36, 1.63it/s, loss=0.0182, lr=1.90e-05, step=3789] Training: 38%|███▊ | 3790/10000 [45:06<1:03:36, 1.63it/s, loss=0.0188, lr=1.90e-05, step=3790] Training: 38%|███▊ | 3791/10000 [45:07<1:06:35, 1.55it/s, loss=0.0188, lr=1.90e-05, step=3790] Training: 38%|███▊ | 3791/10000 [45:07<1:06:35, 1.55it/s, loss=0.0094, lr=1.90e-05, step=3791] Training: 38%|███▊ | 3792/10000 [45:07<1:02:40, 1.65it/s, loss=0.0094, lr=1.90e-05, step=3791] Training: 38%|███▊ | 3792/10000 [45:07<1:02:40, 1.65it/s, loss=0.0087, lr=1.90e-05, step=3792] Training: 38%|███▊ | 3793/10000 [45:08<59:38, 1.73it/s, loss=0.0087, lr=1.90e-05, step=3792] Training: 38%|███▊ | 3793/10000 [45:08<59:38, 1.73it/s, loss=0.0062, lr=1.90e-05, step=3793] Training: 38%|███▊ | 3794/10000 [45:08<58:17, 1.77it/s, loss=0.0062, lr=1.90e-05, step=3793] Training: 38%|███▊ | 3794/10000 [45:08<58:17, 1.77it/s, loss=0.0072, lr=1.90e-05, step=3794] Training: 38%|███▊ | 3795/10000 [45:09<57:22, 1.80it/s, loss=0.0072, lr=1.90e-05, step=3794] Training: 38%|███▊ | 3795/10000 [45:09<57:22, 1.80it/s, loss=0.0080, lr=1.90e-05, step=3795] Training: 38%|███▊ | 3796/10000 [45:10<1:05:28, 1.58it/s, loss=0.0080, lr=1.90e-05, step=3795] Training: 38%|███▊ | 3796/10000 [45:10<1:05:28, 1.58it/s, loss=0.0055, lr=1.90e-05, step=3796] Training: 38%|███▊ | 3797/10000 [45:10<1:01:04, 1.69it/s, loss=0.0055, lr=1.90e-05, step=3796] Training: 38%|███▊ | 3797/10000 [45:10<1:01:04, 1.69it/s, loss=0.0072, lr=1.90e-05, step=3797] Training: 38%|███▊ | 3798/10000 [45:11<1:04:47, 1.60it/s, loss=0.0072, lr=1.90e-05, step=3797] Training: 38%|███▊ | 3798/10000 [45:11<1:04:47, 1.60it/s, loss=0.0112, lr=1.89e-05, step=3798] Training: 38%|███▊ | 3799/10000 [45:12<1:02:57, 1.64it/s, loss=0.0112, lr=1.89e-05, step=3798] Training: 38%|███▊ | 3799/10000 [45:12<1:02:57, 1.64it/s, loss=0.0146, lr=1.89e-05, step=3799]16:51:19.040 [I] step=3800 loss=0.0225 smoothed_loss=0.0134 lr=1.90e-05 grad_norm=0.4719 step_time=0.5386s data_time=0.0564s it/s=1.681 eta_to_10000=3687.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0155 grad_action_out_proj=0.1384 grad_shared_expert=0.7200 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3800/10000 [45:12<1:01:20, 1.68it/s, loss=0.0146, lr=1.89e-05, step=3799] Training: 38%|███▊ | 3800/10000 [45:12<1:01:20, 1.68it/s, loss=0.0225, lr=1.89e-05, step=3800] Training: 38%|███▊ | 3801/10000 [45:13<59:19, 1.74it/s, loss=0.0225, lr=1.89e-05, step=3800] Training: 38%|███▊ | 3801/10000 [45:13<59:19, 1.74it/s, loss=0.0161, lr=1.89e-05, step=3801] Training: 38%|███▊ | 3802/10000 [45:13<57:39, 1.79it/s, loss=0.0161, lr=1.89e-05, step=3801] Training: 38%|███▊ | 3802/10000 [45:13<57:39, 1.79it/s, loss=0.0017, lr=1.89e-05, step=3802] Training: 38%|███▊ | 3803/10000 [45:14<1:05:44, 1.57it/s, loss=0.0017, lr=1.89e-05, step=3802] Training: 38%|███▊ | 3803/10000 [45:14<1:05:44, 1.57it/s, loss=0.0165, lr=1.89e-05, step=3803] Training: 38%|███▊ | 3804/10000 [45:14<1:02:02, 1.66it/s, loss=0.0165, lr=1.89e-05, step=3803] Training: 38%|███▊ | 3804/10000 [45:14<1:02:02, 1.66it/s, loss=0.0225, lr=1.89e-05, step=3804] Training: 38%|███▊ | 3805/10000 [45:15<1:05:04, 1.59it/s, loss=0.0225, lr=1.89e-05, step=3804] Training: 38%|███▊ | 3805/10000 [45:15<1:05:04, 1.59it/s, loss=0.0225, lr=1.89e-05, step=3805] Training: 38%|███▊ | 3806/10000 [45:16<1:02:51, 1.64it/s, loss=0.0225, lr=1.89e-05, step=3805] Training: 38%|███▊ | 3806/10000 [45:16<1:02:51, 1.64it/s, loss=0.0100, lr=1.89e-05, step=3806] Training: 38%|███▊ | 3807/10000 [45:16<1:00:06, 1.72it/s, loss=0.0100, lr=1.89e-05, step=3806] Training: 38%|███▊ | 3807/10000 [45:16<1:00:06, 1.72it/s, loss=0.0325, lr=1.89e-05, step=3807] Training: 38%|███▊ | 3808/10000 [45:17<58:41, 1.76it/s, loss=0.0325, lr=1.89e-05, step=3807] Training: 38%|███▊ | 3808/10000 [45:17<58:41, 1.76it/s, loss=0.0074, lr=1.89e-05, step=3808] Training: 38%|███▊ | 3809/10000 [45:17<56:54, 1.81it/s, loss=0.0074, lr=1.89e-05, step=3808] Training: 38%|███▊ | 3809/10000 [45:17<56:54, 1.81it/s, loss=0.0134, lr=1.89e-05, step=3809]16:51:24.902 [I] step=3810 loss=0.0070 smoothed_loss=0.0142 lr=1.89e-05 grad_norm=0.4861 step_time=0.5251s data_time=0.0609s it/s=1.706 eta_to_10000=3627.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0102 grad_action_out_proj=0.1107 grad_shared_expert=0.3280 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3810/10000 [45:18<59:53, 1.72it/s, loss=0.0134, lr=1.89e-05, step=3809] Training: 38%|███▊ | 3810/10000 [45:18<59:53, 1.72it/s, loss=0.0070, lr=1.89e-05, step=3810] Training: 38%|███▊ | 3811/10000 [45:19<1:08:16, 1.51it/s, loss=0.0070, lr=1.89e-05, step=3810] Training: 38%|███▊ | 3811/10000 [45:19<1:08:16, 1.51it/s, loss=0.0132, lr=1.89e-05, step=3811] Training: 38%|███▊ | 3812/10000 [45:19<1:03:38, 1.62it/s, loss=0.0132, lr=1.89e-05, step=3811] Training: 38%|███▊ | 3812/10000 [45:19<1:03:38, 1.62it/s, loss=0.0287, lr=1.89e-05, step=3812] Training: 38%|███▊ | 3813/10000 [45:20<1:06:20, 1.55it/s, loss=0.0287, lr=1.89e-05, step=3812] Training: 38%|███▊ | 3813/10000 [45:20<1:06:20, 1.55it/s, loss=0.0347, lr=1.89e-05, step=3813] Training: 38%|███▊ | 3814/10000 [45:21<1:02:01, 1.66it/s, loss=0.0347, lr=1.89e-05, step=3813] Training: 38%|███▊ | 3814/10000 [45:21<1:02:01, 1.66it/s, loss=0.0211, lr=1.89e-05, step=3814] Training: 38%|███▊ | 3815/10000 [45:21<1:01:19, 1.68it/s, loss=0.0211, lr=1.89e-05, step=3814] Training: 38%|███▊ | 3815/10000 [45:21<1:01:19, 1.68it/s, loss=0.0046, lr=1.89e-05, step=3815] Training: 38%|███▊ | 3816/10000 [45:22<59:12, 1.74it/s, loss=0.0046, lr=1.89e-05, step=3815] Training: 38%|███▊ | 3816/10000 [45:22<59:12, 1.74it/s, loss=0.0290, lr=1.89e-05, step=3816] Training: 38%|███▊ | 3817/10000 [45:22<56:39, 1.82it/s, loss=0.0290, lr=1.89e-05, step=3816] Training: 38%|███▊ | 3817/10000 [45:22<56:39, 1.82it/s, loss=0.0103, lr=1.89e-05, step=3817] Training: 38%|███▊ | 3818/10000 [45:23<1:04:19, 1.60it/s, loss=0.0103, lr=1.89e-05, step=3817] Training: 38%|███▊ | 3818/10000 [45:23<1:04:19, 1.60it/s, loss=0.0259, lr=1.89e-05, step=3818] Training: 38%|███▊ | 3819/10000 [45:23<1:00:23, 1.71it/s, loss=0.0259, lr=1.89e-05, step=3818] Training: 38%|███▊ | 3819/10000 [45:23<1:00:23, 1.71it/s, loss=0.0170, lr=1.89e-05, step=3819]16:51:31.251 [I] step=3820 loss=0.0197 smoothed_loss=0.0180 lr=1.89e-05 grad_norm=0.4835 step_time=0.5700s data_time=0.0650s it/s=1.575 eta_to_10000=3923.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1204 grad_shared_expert=0.3338 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3820/10000 [45:24<1:09:32, 1.48it/s, loss=0.0170, lr=1.89e-05, step=3819] Training: 38%|███▊ | 3820/10000 [45:24<1:09:32, 1.48it/s, loss=0.0197, lr=1.89e-05, step=3820] Training: 38%|███▊ | 3821/10000 [45:25<1:04:28, 1.60it/s, loss=0.0197, lr=1.89e-05, step=3820] Training: 38%|███▊ | 3821/10000 [45:25<1:04:28, 1.60it/s, loss=0.0055, lr=1.89e-05, step=3821] Training: 38%|███▊ | 3822/10000 [45:25<1:00:55, 1.69it/s, loss=0.0055, lr=1.89e-05, step=3821] Training: 38%|███▊ | 3822/10000 [45:25<1:00:55, 1.69it/s, loss=0.0387, lr=1.89e-05, step=3822] Training: 38%|███▊ | 3823/10000 [45:26<58:16, 1.77it/s, loss=0.0387, lr=1.89e-05, step=3822] Training: 38%|███▊ | 3823/10000 [45:26<58:16, 1.77it/s, loss=0.0125, lr=1.89e-05, step=3823] Training: 38%|███▊ | 3824/10000 [45:26<56:24, 1.82it/s, loss=0.0125, lr=1.89e-05, step=3823] Training: 38%|███▊ | 3824/10000 [45:26<56:24, 1.82it/s, loss=0.0459, lr=1.89e-05, step=3824] Training: 38%|███▊ | 3825/10000 [45:27<1:04:07, 1.61it/s, loss=0.0459, lr=1.89e-05, step=3824] Training: 38%|███▊ | 3825/10000 [45:27<1:04:07, 1.61it/s, loss=0.0166, lr=1.89e-05, step=3825] Training: 38%|███▊ | 3826/10000 [45:28<1:00:53, 1.69it/s, loss=0.0166, lr=1.89e-05, step=3825] Training: 38%|███▊ | 3826/10000 [45:28<1:00:53, 1.69it/s, loss=0.0244, lr=1.89e-05, step=3826] Training: 38%|███▊ | 3827/10000 [45:28<1:04:47, 1.59it/s, loss=0.0244, lr=1.89e-05, step=3826] Training: 38%|███▊ | 3827/10000 [45:28<1:04:47, 1.59it/s, loss=0.0125, lr=1.89e-05, step=3827] Training: 38%|███▊ | 3828/10000 [45:29<1:01:07, 1.68it/s, loss=0.0125, lr=1.89e-05, step=3827] Training: 38%|███▊ | 3828/10000 [45:29<1:01:07, 1.68it/s, loss=0.0039, lr=1.89e-05, step=3828] Training: 38%|███▊ | 3829/10000 [45:29<58:36, 1.76it/s, loss=0.0039, lr=1.89e-05, step=3828] Training: 38%|███▊ | 3829/10000 [45:29<58:36, 1.76it/s, loss=0.0068, lr=1.88e-05, step=3829]16:51:36.902 [I] step=3830 loss=0.0439 smoothed_loss=0.0200 lr=1.89e-05 grad_norm=0.5366 step_time=0.5038s data_time=0.0612s it/s=1.770 eta_to_10000=3486.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0281 grad_action_out_proj=0.1879 grad_shared_expert=0.7901 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3830/10000 [45:30<58:11, 1.77it/s, loss=0.0068, lr=1.88e-05, step=3829] Training: 38%|███▊ | 3830/10000 [45:30<58:11, 1.77it/s, loss=0.0439, lr=1.88e-05, step=3830] Training: 38%|███▊ | 3831/10000 [45:31<57:32, 1.79it/s, loss=0.0439, lr=1.88e-05, step=3830] Training: 38%|███▊ | 3831/10000 [45:31<57:32, 1.79it/s, loss=0.0093, lr=1.88e-05, step=3831] Training: 38%|███▊ | 3832/10000 [45:31<1:04:51, 1.59it/s, loss=0.0093, lr=1.88e-05, step=3831] Training: 38%|███▊ | 3832/10000 [45:31<1:04:51, 1.59it/s, loss=0.0255, lr=1.88e-05, step=3832] Training: 38%|███▊ | 3833/10000 [45:32<1:01:27, 1.67it/s, loss=0.0255, lr=1.88e-05, step=3832] Training: 38%|███▊ | 3833/10000 [45:32<1:01:27, 1.67it/s, loss=0.0124, lr=1.88e-05, step=3833] Training: 38%|███▊ | 3834/10000 [45:33<1:05:19, 1.57it/s, loss=0.0124, lr=1.88e-05, step=3833] Training: 38%|███▊ | 3834/10000 [45:33<1:05:19, 1.57it/s, loss=0.0460, lr=1.88e-05, step=3834] Training: 38%|███▊ | 3835/10000 [45:33<1:00:44, 1.69it/s, loss=0.0460, lr=1.88e-05, step=3834] Training: 38%|███▊ | 3835/10000 [45:33<1:00:44, 1.69it/s, loss=0.0021, lr=1.88e-05, step=3835] Training: 38%|███▊ | 3836/10000 [45:34<57:51, 1.78it/s, loss=0.0021, lr=1.88e-05, step=3835] Training: 38%|███▊ | 3836/10000 [45:34<57:51, 1.78it/s, loss=0.0167, lr=1.88e-05, step=3836] Training: 38%|███▊ | 3837/10000 [45:34<58:46, 1.75it/s, loss=0.0167, lr=1.88e-05, step=3836] Training: 38%|███▊ | 3837/10000 [45:34<58:46, 1.75it/s, loss=0.0703, lr=1.88e-05, step=3837] Training: 38%|███▊ | 3838/10000 [45:35<1:16:38, 1.34it/s, loss=0.0703, lr=1.88e-05, step=3837] Training: 38%|███▊ | 3838/10000 [45:35<1:16:38, 1.34it/s, loss=0.0118, lr=1.88e-05, step=3838] Training: 38%|███▊ | 3839/10000 [45:36<1:10:51, 1.45it/s, loss=0.0118, lr=1.88e-05, step=3838] Training: 38%|███▊ | 3839/10000 [45:36<1:10:51, 1.45it/s, loss=0.0320, lr=1.88e-05, step=3839]16:51:43.326 [I] step=3840 loss=0.0140 smoothed_loss=0.0231 lr=1.88e-05 grad_norm=0.4947 step_time=0.5523s data_time=0.0902s it/s=1.557 eta_to_10000=3956.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0102 grad_action_out_proj=0.1123 grad_shared_expert=0.3865 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3840/10000 [45:36<1:06:26, 1.55it/s, loss=0.0320, lr=1.88e-05, step=3839] Training: 38%|███▊ | 3840/10000 [45:36<1:06:26, 1.55it/s, loss=0.0140, lr=1.88e-05, step=3840] Training: 38%|███▊ | 3841/10000 [45:37<1:09:10, 1.48it/s, loss=0.0140, lr=1.88e-05, step=3840] Training: 38%|███▊ | 3841/10000 [45:37<1:09:10, 1.48it/s, loss=0.0065, lr=1.88e-05, step=3841] Training: 38%|███▊ | 3842/10000 [45:38<1:04:33, 1.59it/s, loss=0.0065, lr=1.88e-05, step=3841] Training: 38%|███▊ | 3842/10000 [45:38<1:04:33, 1.59it/s, loss=0.0119, lr=1.88e-05, step=3842] Training: 38%|███▊ | 3843/10000 [45:38<1:05:18, 1.57it/s, loss=0.0119, lr=1.88e-05, step=3842] Training: 38%|███▊ | 3843/10000 [45:38<1:05:18, 1.57it/s, loss=0.0060, lr=1.88e-05, step=3843] Training: 38%|███▊ | 3844/10000 [45:39<1:07:40, 1.52it/s, loss=0.0060, lr=1.88e-05, step=3843] Training: 38%|███▊ | 3844/10000 [45:39<1:07:40, 1.52it/s, loss=0.0122, lr=1.88e-05, step=3844] Training: 38%|███▊ | 3845/10000 [45:40<1:11:58, 1.43it/s, loss=0.0122, lr=1.88e-05, step=3844] Training: 38%|███▊ | 3845/10000 [45:40<1:11:58, 1.43it/s, loss=0.0314, lr=1.88e-05, step=3845] Training: 38%|███▊ | 3846/10000 [45:40<1:07:00, 1.53it/s, loss=0.0314, lr=1.88e-05, step=3845] Training: 38%|███▊ | 3846/10000 [45:40<1:07:00, 1.53it/s, loss=0.0041, lr=1.88e-05, step=3846] Training: 38%|███▊ | 3847/10000 [45:41<1:03:10, 1.62it/s, loss=0.0041, lr=1.88e-05, step=3846] Training: 38%|███▊ | 3847/10000 [45:41<1:03:10, 1.62it/s, loss=0.0030, lr=1.88e-05, step=3847] Training: 38%|███▊ | 3848/10000 [45:42<1:07:10, 1.53it/s, loss=0.0030, lr=1.88e-05, step=3847] Training: 38%|███▊ | 3848/10000 [45:42<1:07:10, 1.53it/s, loss=0.0119, lr=1.88e-05, step=3848] Training: 38%|███▊ | 3849/10000 [45:42<1:02:15, 1.65it/s, loss=0.0119, lr=1.88e-05, step=3848] Training: 38%|███▊ | 3849/10000 [45:42<1:02:15, 1.65it/s, loss=0.0073, lr=1.88e-05, step=3849]16:51:49.600 [I] step=3850 loss=0.0076 smoothed_loss=0.0145 lr=1.88e-05 grad_norm=0.5156 step_time=0.5429s data_time=0.0845s it/s=1.594 eta_to_10000=3858.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0164 grad_action_out_proj=0.1627 grad_shared_expert=0.3896 (10775:train_pytorch.py:850) + Training: 38%|███▊ | 3850/10000 [45:43<1:00:00, 1.71it/s, loss=0.0073, lr=1.88e-05, step=3849] Training: 38%|███▊ | 3850/10000 [45:43<1:00:00, 1.71it/s, loss=0.0076, lr=1.88e-05, step=3850] Training: 39%|███▊ | 3851/10000 [45:43<57:00, 1.80it/s, loss=0.0076, lr=1.88e-05, step=3850] Training: 39%|███▊ | 3851/10000 [45:43<57:00, 1.80it/s, loss=0.0189, lr=1.88e-05, step=3851] Training: 39%|███▊ | 3852/10000 [45:44<55:10, 1.86it/s, loss=0.0189, lr=1.88e-05, step=3851] Training: 39%|███▊ | 3852/10000 [45:44<55:10, 1.86it/s, loss=0.0169, lr=1.88e-05, step=3852] Training: 39%|███▊ | 3853/10000 [45:44<1:02:49, 1.63it/s, loss=0.0169, lr=1.88e-05, step=3852] Training: 39%|███▊ | 3853/10000 [45:44<1:02:49, 1.63it/s, loss=0.0362, lr=1.88e-05, step=3853] Training: 39%|███▊ | 3854/10000 [45:45<59:51, 1.71it/s, loss=0.0362, lr=1.88e-05, step=3853] Training: 39%|███▊ | 3854/10000 [45:45<59:51, 1.71it/s, loss=0.0291, lr=1.88e-05, step=3854] Training: 39%|███▊ | 3855/10000 [45:45<57:12, 1.79it/s, loss=0.0291, lr=1.88e-05, step=3854] Training: 39%|███▊ | 3855/10000 [45:45<57:12, 1.79it/s, loss=0.0098, lr=1.88e-05, step=3855] Training: 39%|███▊ | 3856/10000 [45:46<1:01:51, 1.66it/s, loss=0.0098, lr=1.88e-05, step=3855] Training: 39%|███▊ | 3856/10000 [45:46<1:01:51, 1.66it/s, loss=0.0393, lr=1.88e-05, step=3856] Training: 39%|███▊ | 3857/10000 [45:47<1:00:47, 1.68it/s, loss=0.0393, lr=1.88e-05, step=3856] Training: 39%|███▊ | 3857/10000 [45:47<1:00:47, 1.68it/s, loss=0.0215, lr=1.88e-05, step=3857] Training: 39%|███▊ | 3858/10000 [45:47<58:06, 1.76it/s, loss=0.0215, lr=1.88e-05, step=3857] Training: 39%|███▊ | 3858/10000 [45:47<58:06, 1.76it/s, loss=0.0731, lr=1.88e-05, step=3858] Training: 39%|███▊ | 3859/10000 [45:48<56:09, 1.82it/s, loss=0.0731, lr=1.88e-05, step=3858] Training: 39%|███▊ | 3859/10000 [45:48<56:09, 1.82it/s, loss=0.0099, lr=1.87e-05, step=3859]16:51:55.518 [I] step=3860 loss=0.0178 smoothed_loss=0.0231 lr=1.88e-05 grad_norm=0.5162 step_time=0.5345s data_time=0.0574s it/s=1.690 eta_to_10000=3633.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0186 grad_action_out_proj=0.1199 grad_shared_expert=0.3737 (10775:train_pytorch.py:850) + Training: 39%|███▊ | 3860/10000 [45:49<1:05:04, 1.57it/s, loss=0.0099, lr=1.87e-05, step=3859] Training: 39%|███▊ | 3860/10000 [45:49<1:05:04, 1.57it/s, loss=0.0178, lr=1.87e-05, step=3860] Training: 39%|███▊ | 3861/10000 [45:49<1:01:04, 1.68it/s, loss=0.0178, lr=1.87e-05, step=3860] Training: 39%|███▊ | 3861/10000 [45:49<1:01:04, 1.68it/s, loss=0.0077, lr=1.87e-05, step=3861] Training: 39%|███▊ | 3862/10000 [45:50<58:51, 1.74it/s, loss=0.0077, lr=1.87e-05, step=3861] Training: 39%|███▊ | 3862/10000 [45:50<58:51, 1.74it/s, loss=0.0412, lr=1.87e-05, step=3862] Training: 39%|███▊ | 3863/10000 [45:50<1:04:28, 1.59it/s, loss=0.0412, lr=1.87e-05, step=3862] Training: 39%|███▊ | 3863/10000 [45:50<1:04:28, 1.59it/s, loss=0.0090, lr=1.87e-05, step=3863] Training: 39%|███▊ | 3864/10000 [45:51<1:01:13, 1.67it/s, loss=0.0090, lr=1.87e-05, step=3863] Training: 39%|███▊ | 3864/10000 [45:51<1:01:13, 1.67it/s, loss=0.0124, lr=1.87e-05, step=3864] Training: 39%|███▊ | 3865/10000 [45:51<59:11, 1.73it/s, loss=0.0124, lr=1.87e-05, step=3864] Training: 39%|███▊ | 3865/10000 [45:51<59:11, 1.73it/s, loss=0.0162, lr=1.87e-05, step=3865] Training: 39%|███▊ | 3866/10000 [45:52<56:57, 1.80it/s, loss=0.0162, lr=1.87e-05, step=3865] Training: 39%|███▊ | 3866/10000 [45:52<56:57, 1.80it/s, loss=0.0067, lr=1.87e-05, step=3866] Training: 39%|███▊ | 3867/10000 [45:52<55:48, 1.83it/s, loss=0.0067, lr=1.87e-05, step=3866] Training: 39%|███▊ | 3867/10000 [45:52<55:48, 1.83it/s, loss=0.0211, lr=1.87e-05, step=3867] Training: 39%|███▊ | 3868/10000 [45:53<1:02:59, 1.62it/s, loss=0.0211, lr=1.87e-05, step=3867] Training: 39%|███▊ | 3868/10000 [45:53<1:02:59, 1.62it/s, loss=0.0103, lr=1.87e-05, step=3868] Training: 39%|███▊ | 3869/10000 [45:54<59:26, 1.72it/s, loss=0.0103, lr=1.87e-05, step=3868] Training: 39%|███▊ | 3869/10000 [45:54<59:26, 1.72it/s, loss=0.0130, lr=1.87e-05, step=3869]16:52:01.438 [I] step=3870 loss=0.0224 smoothed_loss=0.0184 lr=1.87e-05 grad_norm=0.4885 step_time=0.5286s data_time=0.0635s it/s=1.690 eta_to_10000=3628.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0145 grad_action_out_proj=0.1035 grad_shared_expert=0.4142 (10775:train_pytorch.py:850) + Training: 39%|███▊ | 3870/10000 [45:55<1:05:02, 1.57it/s, loss=0.0130, lr=1.87e-05, step=3869] Training: 39%|███▊ | 3870/10000 [45:55<1:05:02, 1.57it/s, loss=0.0224, lr=1.87e-05, step=3870] Training: 39%|███▊ | 3871/10000 [45:55<1:01:35, 1.66it/s, loss=0.0224, lr=1.87e-05, step=3870] Training: 39%|███▊ | 3871/10000 [45:55<1:01:35, 1.66it/s, loss=0.0129, lr=1.87e-05, step=3871] Training: 39%|███▊ | 3872/10000 [45:56<58:38, 1.74it/s, loss=0.0129, lr=1.87e-05, step=3871] Training: 39%|███▊ | 3872/10000 [45:56<58:38, 1.74it/s, loss=0.0061, lr=1.87e-05, step=3872] Training: 39%|███▊ | 3873/10000 [45:56<58:31, 1.74it/s, loss=0.0061, lr=1.87e-05, step=3872] Training: 39%|███▊ | 3873/10000 [45:56<58:31, 1.74it/s, loss=0.0330, lr=1.87e-05, step=3873] Training: 39%|███▊ | 3874/10000 [45:57<58:01, 1.76it/s, loss=0.0330, lr=1.87e-05, step=3873] Training: 39%|███▊ | 3874/10000 [45:57<58:01, 1.76it/s, loss=0.0102, lr=1.87e-05, step=3874] Training: 39%|███▉ | 3875/10000 [45:57<55:32, 1.84it/s, loss=0.0102, lr=1.87e-05, step=3874] Training: 39%|███▉ | 3875/10000 [45:57<55:32, 1.84it/s, loss=0.0074, lr=1.87e-05, step=3875] Training: 39%|███▉ | 3876/10000 [45:58<1:02:47, 1.63it/s, loss=0.0074, lr=1.87e-05, step=3875] Training: 39%|███▉ | 3876/10000 [45:58<1:02:47, 1.63it/s, loss=0.0459, lr=1.87e-05, step=3876] Training: 39%|███▉ | 3877/10000 [45:58<59:25, 1.72it/s, loss=0.0459, lr=1.87e-05, step=3876] Training: 39%|███▉ | 3877/10000 [45:58<59:25, 1.72it/s, loss=0.0071, lr=1.87e-05, step=3877] Training: 39%|███▉ | 3878/10000 [45:59<1:04:18, 1.59it/s, loss=0.0071, lr=1.87e-05, step=3877] Training: 39%|███▉ | 3878/10000 [45:59<1:04:18, 1.59it/s, loss=0.0227, lr=1.87e-05, step=3878] Training: 39%|███▉ | 3879/10000 [46:00<1:00:28, 1.69it/s, loss=0.0227, lr=1.87e-05, step=3878] Training: 39%|███▉ | 3879/10000 [46:00<1:00:28, 1.69it/s, loss=0.0101, lr=1.87e-05, step=3879]16:52:07.189 [I] step=3880 loss=0.0621 smoothed_loss=0.0222 lr=1.87e-05 grad_norm=0.4910 step_time=0.5145s data_time=0.0605s it/s=1.739 eta_to_10000=3518.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0168 grad_action_out_proj=0.2335 grad_shared_expert=0.5950 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3880/10000 [46:00<59:45, 1.71it/s, loss=0.0101, lr=1.87e-05, step=3879] Training: 39%|███▉ | 3880/10000 [46:00<59:45, 1.71it/s, loss=0.0621, lr=1.87e-05, step=3880] Training: 39%|███▉ | 3881/10000 [46:01<56:41, 1.80it/s, loss=0.0621, lr=1.87e-05, step=3880] Training: 39%|███▉ | 3881/10000 [46:01<56:41, 1.80it/s, loss=0.0262, lr=1.87e-05, step=3881] Training: 39%|███▉ | 3882/10000 [46:01<55:00, 1.85it/s, loss=0.0262, lr=1.87e-05, step=3881] Training: 39%|███▉ | 3882/10000 [46:01<55:00, 1.85it/s, loss=0.0253, lr=1.87e-05, step=3882] Training: 39%|███▉ | 3883/10000 [46:02<1:02:59, 1.62it/s, loss=0.0253, lr=1.87e-05, step=3882] Training: 39%|███▉ | 3883/10000 [46:02<1:02:59, 1.62it/s, loss=0.0068, lr=1.87e-05, step=3883] Training: 39%|███▉ | 3884/10000 [46:03<59:13, 1.72it/s, loss=0.0068, lr=1.87e-05, step=3883] Training: 39%|███▉ | 3884/10000 [46:03<59:13, 1.72it/s, loss=0.0139, lr=1.87e-05, step=3884] Training: 39%|███▉ | 3885/10000 [46:03<1:03:41, 1.60it/s, loss=0.0139, lr=1.87e-05, step=3884] Training: 39%|███▉ | 3885/10000 [46:03<1:03:41, 1.60it/s, loss=0.0112, lr=1.87e-05, step=3885] Training: 39%|███▉ | 3886/10000 [46:04<59:50, 1.70it/s, loss=0.0112, lr=1.87e-05, step=3885] Training: 39%|███▉ | 3886/10000 [46:04<59:50, 1.70it/s, loss=0.0105, lr=1.87e-05, step=3886] Training: 39%|███▉ | 3887/10000 [46:04<56:56, 1.79it/s, loss=0.0105, lr=1.87e-05, step=3886] Training: 39%|███▉ | 3887/10000 [46:04<56:56, 1.79it/s, loss=0.0085, lr=1.87e-05, step=3887] Training: 39%|███▉ | 3888/10000 [46:05<57:20, 1.78it/s, loss=0.0085, lr=1.87e-05, step=3887] Training: 39%|███▉ | 3888/10000 [46:05<57:20, 1.78it/s, loss=0.0183, lr=1.87e-05, step=3888] Training: 39%|███▉ | 3889/10000 [46:05<55:42, 1.83it/s, loss=0.0183, lr=1.87e-05, step=3888] Training: 39%|███▉ | 3889/10000 [46:05<55:42, 1.83it/s, loss=0.0033, lr=1.86e-05, step=3889]16:52:13.101 [I] step=3890 loss=0.0143 smoothed_loss=0.0161 lr=1.87e-05 grad_norm=0.4820 step_time=0.5336s data_time=0.0576s it/s=1.692 eta_to_10000=3611.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0107 grad_action_out_proj=0.1052 grad_shared_expert=0.6482 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3890/10000 [46:06<1:04:17, 1.58it/s, loss=0.0033, lr=1.86e-05, step=3889] Training: 39%|███▉ | 3890/10000 [46:06<1:04:17, 1.58it/s, loss=0.0143, lr=1.86e-05, step=3890] Training: 39%|███▉ | 3891/10000 [46:07<1:09:17, 1.47it/s, loss=0.0143, lr=1.86e-05, step=3890] Training: 39%|███▉ | 3891/10000 [46:07<1:09:17, 1.47it/s, loss=0.0234, lr=1.86e-05, step=3891] Training: 39%|███▉ | 3892/10000 [46:07<1:03:39, 1.60it/s, loss=0.0234, lr=1.86e-05, step=3891] Training: 39%|███▉ | 3892/10000 [46:07<1:03:39, 1.60it/s, loss=0.0188, lr=1.86e-05, step=3892] Training: 39%|███▉ | 3893/10000 [46:08<59:26, 1.71it/s, loss=0.0188, lr=1.86e-05, step=3892] Training: 39%|███▉ | 3893/10000 [46:08<59:26, 1.71it/s, loss=0.0156, lr=1.86e-05, step=3893] Training: 39%|███▉ | 3894/10000 [46:08<56:58, 1.79it/s, loss=0.0156, lr=1.86e-05, step=3893] Training: 39%|███▉ | 3894/10000 [46:08<56:58, 1.79it/s, loss=0.0367, lr=1.86e-05, step=3894] Training: 39%|███▉ | 3895/10000 [46:09<55:30, 1.83it/s, loss=0.0367, lr=1.86e-05, step=3894] Training: 39%|███▉ | 3895/10000 [46:09<55:30, 1.83it/s, loss=0.0052, lr=1.86e-05, step=3895] Training: 39%|███▉ | 3896/10000 [46:09<54:23, 1.87it/s, loss=0.0052, lr=1.86e-05, step=3895] Training: 39%|███▉ | 3896/10000 [46:09<54:23, 1.87it/s, loss=0.0462, lr=1.86e-05, step=3896] Training: 39%|███▉ | 3897/10000 [46:10<1:02:18, 1.63it/s, loss=0.0462, lr=1.86e-05, step=3896] Training: 39%|███▉ | 3897/10000 [46:10<1:02:18, 1.63it/s, loss=0.0254, lr=1.86e-05, step=3897] Training: 39%|███▉ | 3898/10000 [46:11<58:11, 1.75it/s, loss=0.0254, lr=1.86e-05, step=3897] Training: 39%|███▉ | 3898/10000 [46:11<58:11, 1.75it/s, loss=0.0222, lr=1.86e-05, step=3898] Training: 39%|███▉ | 3899/10000 [46:11<1:02:02, 1.64it/s, loss=0.0222, lr=1.86e-05, step=3898] Training: 39%|███▉ | 3899/10000 [46:11<1:02:02, 1.64it/s, loss=0.0083, lr=1.86e-05, step=3899]16:52:18.905 [I] step=3900 loss=0.0090 smoothed_loss=0.0187 lr=1.86e-05 grad_norm=0.4838 step_time=0.5235s data_time=0.0569s it/s=1.723 eta_to_10000=3539.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0065 grad_action_out_proj=0.1116 grad_shared_expert=0.3575 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3900/10000 [46:12<59:32, 1.71it/s, loss=0.0083, lr=1.86e-05, step=3899] Training: 39%|███▉ | 3900/10000 [46:12<59:32, 1.71it/s, loss=0.0090, lr=1.86e-05, step=3900] Training: 39%|███▉ | 3901/10000 [46:12<56:43, 1.79it/s, loss=0.0090, lr=1.86e-05, step=3900] Training: 39%|███▉ | 3901/10000 [46:12<56:43, 1.79it/s, loss=0.0158, lr=1.86e-05, step=3901] Training: 39%|███▉ | 3902/10000 [46:13<55:11, 1.84it/s, loss=0.0158, lr=1.86e-05, step=3901] Training: 39%|███▉ | 3902/10000 [46:13<55:11, 1.84it/s, loss=0.0041, lr=1.86e-05, step=3902] Training: 39%|███▉ | 3903/10000 [46:13<53:40, 1.89it/s, loss=0.0041, lr=1.86e-05, step=3902] Training: 39%|███▉ | 3903/10000 [46:13<53:40, 1.89it/s, loss=0.0404, lr=1.86e-05, step=3903] Training: 39%|███▉ | 3904/10000 [46:14<1:00:55, 1.67it/s, loss=0.0404, lr=1.86e-05, step=3903] Training: 39%|███▉ | 3904/10000 [46:14<1:00:55, 1.67it/s, loss=0.0215, lr=1.86e-05, step=3904] Training: 39%|███▉ | 3905/10000 [46:15<57:46, 1.76it/s, loss=0.0215, lr=1.86e-05, step=3904] Training: 39%|███▉ | 3905/10000 [46:15<57:46, 1.76it/s, loss=0.0038, lr=1.86e-05, step=3905] Training: 39%|███▉ | 3906/10000 [46:15<1:03:02, 1.61it/s, loss=0.0038, lr=1.86e-05, step=3905] Training: 39%|███▉ | 3906/10000 [46:15<1:03:02, 1.61it/s, loss=0.0076, lr=1.86e-05, step=3906] Training: 39%|███▉ | 3907/10000 [46:16<59:06, 1.72it/s, loss=0.0076, lr=1.86e-05, step=3906] Training: 39%|███▉ | 3907/10000 [46:16<59:06, 1.72it/s, loss=0.0158, lr=1.86e-05, step=3907] Training: 39%|███▉ | 3908/10000 [46:16<56:04, 1.81it/s, loss=0.0158, lr=1.86e-05, step=3907] Training: 39%|███▉ | 3908/10000 [46:16<56:04, 1.81it/s, loss=0.0391, lr=1.86e-05, step=3908] Training: 39%|███▉ | 3909/10000 [46:17<54:02, 1.88it/s, loss=0.0391, lr=1.86e-05, step=3908] Training: 39%|███▉ | 3909/10000 [46:17<54:02, 1.88it/s, loss=0.0128, lr=1.86e-05, step=3909]16:52:24.408 [I] step=3910 loss=0.0116 smoothed_loss=0.0177 lr=1.86e-05 grad_norm=0.5216 step_time=0.4942s data_time=0.0561s it/s=1.818 eta_to_10000=3350.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0140 grad_action_out_proj=0.1627 grad_shared_expert=0.5274 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3910/10000 [46:17<54:20, 1.87it/s, loss=0.0128, lr=1.86e-05, step=3909] Training: 39%|███▉ | 3910/10000 [46:17<54:20, 1.87it/s, loss=0.0116, lr=1.86e-05, step=3910] Training: 39%|███▉ | 3911/10000 [46:18<1:03:09, 1.61it/s, loss=0.0116, lr=1.86e-05, step=3910] Training: 39%|███▉ | 3911/10000 [46:18<1:03:09, 1.61it/s, loss=0.0118, lr=1.86e-05, step=3911] Training: 39%|███▉ | 3912/10000 [46:19<1:00:13, 1.68it/s, loss=0.0118, lr=1.86e-05, step=3911] Training: 39%|███▉ | 3912/10000 [46:19<1:00:13, 1.68it/s, loss=0.0359, lr=1.86e-05, step=3912] Training: 39%|███▉ | 3913/10000 [46:20<1:04:28, 1.57it/s, loss=0.0359, lr=1.86e-05, step=3912] Training: 39%|███▉ | 3913/10000 [46:20<1:04:28, 1.57it/s, loss=0.0069, lr=1.86e-05, step=3913] Training: 39%|███▉ | 3914/10000 [46:20<1:00:14, 1.68it/s, loss=0.0069, lr=1.86e-05, step=3913] Training: 39%|███▉ | 3914/10000 [46:20<1:00:14, 1.68it/s, loss=0.0320, lr=1.86e-05, step=3914] Training: 39%|███▉ | 3915/10000 [46:21<57:20, 1.77it/s, loss=0.0320, lr=1.86e-05, step=3914] Training: 39%|███▉ | 3915/10000 [46:21<57:20, 1.77it/s, loss=0.0767, lr=1.86e-05, step=3915] Training: 39%|███▉ | 3916/10000 [46:21<54:49, 1.85it/s, loss=0.0767, lr=1.86e-05, step=3915] Training: 39%|███▉ | 3916/10000 [46:21<54:49, 1.85it/s, loss=0.0390, lr=1.86e-05, step=3916] Training: 39%|███▉ | 3917/10000 [46:22<53:43, 1.89it/s, loss=0.0390, lr=1.86e-05, step=3916] Training: 39%|███▉ | 3917/10000 [46:22<53:43, 1.89it/s, loss=0.0286, lr=1.86e-05, step=3917] Training: 39%|███▉ | 3918/10000 [46:22<1:01:38, 1.64it/s, loss=0.0286, lr=1.86e-05, step=3917] Training: 39%|███▉ | 3918/10000 [46:22<1:01:38, 1.64it/s, loss=0.0143, lr=1.86e-05, step=3918] Training: 39%|███▉ | 3919/10000 [46:23<58:47, 1.72it/s, loss=0.0143, lr=1.86e-05, step=3918] Training: 39%|███▉ | 3919/10000 [46:23<58:47, 1.72it/s, loss=0.0106, lr=1.85e-05, step=3919]16:52:30.550 [I] step=3920 loss=0.0191 smoothed_loss=0.0234 lr=1.86e-05 grad_norm=0.4976 step_time=0.5542s data_time=0.0601s it/s=1.628 eta_to_10000=3733.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0228 grad_action_out_proj=0.1679 grad_shared_expert=0.4859 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3920/10000 [46:24<1:04:29, 1.57it/s, loss=0.0106, lr=1.85e-05, step=3919] Training: 39%|███▉ | 3920/10000 [46:24<1:04:29, 1.57it/s, loss=0.0191, lr=1.85e-05, step=3920] Training: 39%|███▉ | 3921/10000 [46:24<1:00:41, 1.67it/s, loss=0.0191, lr=1.85e-05, step=3920] Training: 39%|███▉ | 3921/10000 [46:24<1:00:41, 1.67it/s, loss=0.0164, lr=1.85e-05, step=3921] Training: 39%|███▉ | 3922/10000 [46:25<57:45, 1.75it/s, loss=0.0164, lr=1.85e-05, step=3921] Training: 39%|███▉ | 3922/10000 [46:25<57:45, 1.75it/s, loss=0.0100, lr=1.85e-05, step=3922] Training: 39%|███▉ | 3923/10000 [46:25<55:29, 1.83it/s, loss=0.0100, lr=1.85e-05, step=3922] Training: 39%|███▉ | 3923/10000 [46:25<55:29, 1.83it/s, loss=0.0116, lr=1.85e-05, step=3923] Training: 39%|███▉ | 3924/10000 [46:26<54:17, 1.87it/s, loss=0.0116, lr=1.85e-05, step=3923] Training: 39%|███▉ | 3924/10000 [46:26<54:17, 1.87it/s, loss=0.0155, lr=1.85e-05, step=3924] Training: 39%|███▉ | 3925/10000 [46:26<1:02:34, 1.62it/s, loss=0.0155, lr=1.85e-05, step=3924] Training: 39%|███▉ | 3925/10000 [46:26<1:02:34, 1.62it/s, loss=0.0181, lr=1.85e-05, step=3925] Training: 39%|███▉ | 3926/10000 [46:27<1:00:18, 1.68it/s, loss=0.0181, lr=1.85e-05, step=3925] Training: 39%|███▉ | 3926/10000 [46:27<1:00:18, 1.68it/s, loss=0.0204, lr=1.85e-05, step=3926] Training: 39%|███▉ | 3927/10000 [46:28<1:04:24, 1.57it/s, loss=0.0204, lr=1.85e-05, step=3926] Training: 39%|███▉ | 3927/10000 [46:28<1:04:24, 1.57it/s, loss=0.0300, lr=1.85e-05, step=3927] Training: 39%|███▉ | 3928/10000 [46:28<1:00:57, 1.66it/s, loss=0.0300, lr=1.85e-05, step=3927] Training: 39%|███▉ | 3928/10000 [46:28<1:00:57, 1.66it/s, loss=0.0101, lr=1.85e-05, step=3928] Training: 39%|███▉ | 3929/10000 [46:29<57:31, 1.76it/s, loss=0.0101, lr=1.85e-05, step=3928] Training: 39%|███▉ | 3929/10000 [46:29<57:31, 1.76it/s, loss=0.0056, lr=1.85e-05, step=3929]16:52:36.199 [I] step=3930 loss=0.0201 smoothed_loss=0.0185 lr=1.85e-05 grad_norm=0.5100 step_time=0.4994s data_time=0.0655s it/s=1.770 eta_to_10000=3428.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0209 grad_action_out_proj=0.1964 grad_shared_expert=0.5157 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3930/10000 [46:29<56:27, 1.79it/s, loss=0.0056, lr=1.85e-05, step=3929] Training: 39%|███▉ | 3930/10000 [46:29<56:27, 1.79it/s, loss=0.0201, lr=1.85e-05, step=3930] Training: 39%|███▉ | 3931/10000 [46:30<54:53, 1.84it/s, loss=0.0201, lr=1.85e-05, step=3930] Training: 39%|███▉ | 3931/10000 [46:30<54:53, 1.84it/s, loss=0.0147, lr=1.85e-05, step=3931] Training: 39%|███▉ | 3932/10000 [46:31<1:01:50, 1.64it/s, loss=0.0147, lr=1.85e-05, step=3931] Training: 39%|███▉ | 3932/10000 [46:31<1:01:50, 1.64it/s, loss=0.0064, lr=1.85e-05, step=3932] Training: 39%|███▉ | 3933/10000 [46:31<1:00:05, 1.68it/s, loss=0.0064, lr=1.85e-05, step=3932] Training: 39%|███▉ | 3933/10000 [46:31<1:00:05, 1.68it/s, loss=0.0121, lr=1.85e-05, step=3933] Training: 39%|███▉ | 3934/10000 [46:32<1:08:10, 1.48it/s, loss=0.0121, lr=1.85e-05, step=3933] Training: 39%|███▉ | 3934/10000 [46:32<1:08:10, 1.48it/s, loss=0.0090, lr=1.85e-05, step=3934] Training: 39%|███▉ | 3935/10000 [46:32<1:02:18, 1.62it/s, loss=0.0090, lr=1.85e-05, step=3934] Training: 39%|███▉ | 3935/10000 [46:32<1:02:18, 1.62it/s, loss=0.0300, lr=1.85e-05, step=3935] Training: 39%|███▉ | 3936/10000 [46:33<1:03:59, 1.58it/s, loss=0.0300, lr=1.85e-05, step=3935] Training: 39%|███▉ | 3936/10000 [46:33<1:03:59, 1.58it/s, loss=0.0347, lr=1.85e-05, step=3936] Training: 39%|███▉ | 3937/10000 [46:34<1:00:22, 1.67it/s, loss=0.0347, lr=1.85e-05, step=3936] Training: 39%|███▉ | 3937/10000 [46:34<1:00:22, 1.67it/s, loss=0.0122, lr=1.85e-05, step=3937] Training: 39%|███▉ | 3938/10000 [46:34<57:38, 1.75it/s, loss=0.0122, lr=1.85e-05, step=3937] Training: 39%|███▉ | 3938/10000 [46:34<57:38, 1.75it/s, loss=0.0083, lr=1.85e-05, step=3938] Training: 39%|███▉ | 3939/10000 [46:35<55:52, 1.81it/s, loss=0.0083, lr=1.85e-05, step=3938] Training: 39%|███▉ | 3939/10000 [46:35<55:52, 1.81it/s, loss=0.0048, lr=1.85e-05, step=3939]16:52:42.404 [I] step=3940 loss=0.0131 smoothed_loss=0.0157 lr=1.85e-05 grad_norm=0.4497 step_time=0.5515s data_time=0.0689s it/s=1.612 eta_to_10000=3758.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0144 grad_action_out_proj=0.1538 grad_shared_expert=0.4372 (10775:train_pytorch.py:850) + Training: 39%|███▉ | 3940/10000 [46:35<1:04:00, 1.58it/s, loss=0.0048, lr=1.85e-05, step=3939] Training: 39%|███▉ | 3940/10000 [46:35<1:04:00, 1.58it/s, loss=0.0131, lr=1.85e-05, step=3940] Training: 39%|███▉ | 3941/10000 [46:36<1:02:33, 1.61it/s, loss=0.0131, lr=1.85e-05, step=3940] Training: 39%|███▉ | 3941/10000 [46:36<1:02:33, 1.61it/s, loss=0.0087, lr=1.85e-05, step=3941] Training: 39%|███▉ | 3942/10000 [46:37<1:05:38, 1.54it/s, loss=0.0087, lr=1.85e-05, step=3941] Training: 39%|███▉ | 3942/10000 [46:37<1:05:38, 1.54it/s, loss=0.0090, lr=1.85e-05, step=3942] Training: 39%|███▉ | 3943/10000 [46:37<1:01:21, 1.65it/s, loss=0.0090, lr=1.85e-05, step=3942] Training: 39%|███▉ | 3943/10000 [46:37<1:01:21, 1.65it/s, loss=0.0397, lr=1.85e-05, step=3943] Training: 39%|███▉ | 3944/10000 [46:38<58:48, 1.72it/s, loss=0.0397, lr=1.85e-05, step=3943] Training: 39%|███▉ | 3944/10000 [46:38<58:48, 1.72it/s, loss=0.0177, lr=1.85e-05, step=3944] Training: 39%|███▉ | 3945/10000 [46:38<56:40, 1.78it/s, loss=0.0177, lr=1.85e-05, step=3944] Training: 39%|███▉ | 3945/10000 [46:38<56:40, 1.78it/s, loss=0.0074, lr=1.85e-05, step=3945] Training: 39%|███▉ | 3946/10000 [46:39<55:26, 1.82it/s, loss=0.0074, lr=1.85e-05, step=3945] Training: 39%|███▉ | 3946/10000 [46:39<55:26, 1.82it/s, loss=0.0059, lr=1.85e-05, step=3946] Training: 39%|███▉ | 3947/10000 [46:40<1:01:50, 1.63it/s, loss=0.0059, lr=1.85e-05, step=3946] Training: 39%|███▉ | 3947/10000 [46:40<1:01:50, 1.63it/s, loss=0.0223, lr=1.85e-05, step=3947] Training: 39%|███▉ | 3948/10000 [46:40<59:09, 1.70it/s, loss=0.0223, lr=1.85e-05, step=3947] Training: 39%|███▉ | 3948/10000 [46:40<59:09, 1.70it/s, loss=0.0148, lr=1.84e-05, step=3948] Training: 39%|███▉ | 3949/10000 [46:41<1:03:31, 1.59it/s, loss=0.0148, lr=1.84e-05, step=3948] Training: 39%|███▉ | 3949/10000 [46:41<1:03:31, 1.59it/s, loss=0.0323, lr=1.84e-05, step=3949]16:52:48.328 [I] step=3950 loss=0.0090 smoothed_loss=0.0165 lr=1.85e-05 grad_norm=0.5626 step_time=0.5303s data_time=0.0623s it/s=1.688 eta_to_10000=3583.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0291 grad_action_out_proj=0.2017 grad_shared_expert=0.5222 (10775:train_pytorch.py:850) + Training: 40%|███▉ | 3950/10000 [46:41<1:00:34, 1.66it/s, loss=0.0323, lr=1.84e-05, step=3949] Training: 40%|███▉ | 3950/10000 [46:41<1:00:34, 1.66it/s, loss=0.0090, lr=1.84e-05, step=3950] Training: 40%|███▉ | 3951/10000 [46:42<59:20, 1.70it/s, loss=0.0090, lr=1.84e-05, step=3950] Training: 40%|███▉ | 3951/10000 [46:42<59:20, 1.70it/s, loss=0.0167, lr=1.84e-05, step=3951] Training: 40%|███▉ | 3952/10000 [46:43<1:00:54, 1.65it/s, loss=0.0167, lr=1.84e-05, step=3951] Training: 40%|███▉ | 3952/10000 [46:43<1:00:54, 1.65it/s, loss=0.0099, lr=1.84e-05, step=3952] Training: 40%|███▉ | 3953/10000 [46:43<58:26, 1.72it/s, loss=0.0099, lr=1.84e-05, step=3952] Training: 40%|███▉ | 3953/10000 [46:43<58:26, 1.72it/s, loss=0.0120, lr=1.84e-05, step=3953] Training: 40%|███▉ | 3954/10000 [46:44<56:22, 1.79it/s, loss=0.0120, lr=1.84e-05, step=3953] Training: 40%|███▉ | 3954/10000 [46:44<56:22, 1.79it/s, loss=0.0065, lr=1.84e-05, step=3954] Training: 40%|███▉ | 3955/10000 [46:44<1:03:04, 1.60it/s, loss=0.0065, lr=1.84e-05, step=3954] Training: 40%|███▉ | 3955/10000 [46:44<1:03:04, 1.60it/s, loss=0.0039, lr=1.84e-05, step=3955] Training: 40%|███▉ | 3956/10000 [46:45<1:05:45, 1.53it/s, loss=0.0039, lr=1.84e-05, step=3955] Training: 40%|███▉ | 3956/10000 [46:45<1:05:45, 1.53it/s, loss=0.0095, lr=1.84e-05, step=3956] Training: 40%|███▉ | 3957/10000 [46:46<1:01:21, 1.64it/s, loss=0.0095, lr=1.84e-05, step=3956] Training: 40%|███▉ | 3957/10000 [46:46<1:01:21, 1.64it/s, loss=0.0060, lr=1.84e-05, step=3957] Training: 40%|███▉ | 3958/10000 [46:46<58:07, 1.73it/s, loss=0.0060, lr=1.84e-05, step=3957] Training: 40%|███▉ | 3958/10000 [46:46<58:07, 1.73it/s, loss=0.0184, lr=1.84e-05, step=3958] Training: 40%|███▉ | 3959/10000 [46:47<55:39, 1.81it/s, loss=0.0184, lr=1.84e-05, step=3958] Training: 40%|███▉ | 3959/10000 [46:47<55:39, 1.81it/s, loss=0.0505, lr=1.84e-05, step=3959]16:52:54.107 [I] step=3960 loss=0.0250 smoothed_loss=0.0176 lr=1.84e-05 grad_norm=0.4527 step_time=0.5090s data_time=0.0688s it/s=1.731 eta_to_10000=3489.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0059 grad_action_out_proj=0.0917 grad_shared_expert=0.4870 (10775:train_pytorch.py:850) + Training: 40%|███▉ | 3960/10000 [46:47<55:16, 1.82it/s, loss=0.0505, lr=1.84e-05, step=3959] Training: 40%|███▉ | 3960/10000 [46:47<55:16, 1.82it/s, loss=0.0250, lr=1.84e-05, step=3960] Training: 40%|███▉ | 3961/10000 [46:48<53:18, 1.89it/s, loss=0.0250, lr=1.84e-05, step=3960] Training: 40%|███▉ | 3961/10000 [46:48<53:18, 1.89it/s, loss=0.0073, lr=1.84e-05, step=3961] Training: 40%|███▉ | 3962/10000 [46:48<59:02, 1.70it/s, loss=0.0073, lr=1.84e-05, step=3961] Training: 40%|███▉ | 3962/10000 [46:48<59:02, 1.70it/s, loss=0.0212, lr=1.84e-05, step=3962] Training: 40%|███▉ | 3963/10000 [46:49<1:03:00, 1.60it/s, loss=0.0212, lr=1.84e-05, step=3962] Training: 40%|███▉ | 3963/10000 [46:49<1:03:00, 1.60it/s, loss=0.0102, lr=1.84e-05, step=3963] Training: 40%|███▉ | 3964/10000 [46:50<59:50, 1.68it/s, loss=0.0102, lr=1.84e-05, step=3963] Training: 40%|███▉ | 3964/10000 [46:50<59:50, 1.68it/s, loss=0.0079, lr=1.84e-05, step=3964] Training: 40%|███▉ | 3965/10000 [46:50<58:41, 1.71it/s, loss=0.0079, lr=1.84e-05, step=3964] Training: 40%|███▉ | 3965/10000 [46:50<58:41, 1.71it/s, loss=0.0299, lr=1.84e-05, step=3965] Training: 40%|███▉ | 3966/10000 [46:51<59:01, 1.70it/s, loss=0.0299, lr=1.84e-05, step=3965] Training: 40%|███▉ | 3966/10000 [46:51<59:01, 1.70it/s, loss=0.0219, lr=1.84e-05, step=3966] Training: 40%|███▉ | 3967/10000 [46:51<56:27, 1.78it/s, loss=0.0219, lr=1.84e-05, step=3966] Training: 40%|███▉ | 3967/10000 [46:51<56:27, 1.78it/s, loss=0.0081, lr=1.84e-05, step=3967] Training: 40%|███▉ | 3968/10000 [46:52<56:11, 1.79it/s, loss=0.0081, lr=1.84e-05, step=3967] Training: 40%|███▉ | 3968/10000 [46:52<56:11, 1.79it/s, loss=0.0394, lr=1.84e-05, step=3968] Training: 40%|███▉ | 3969/10000 [46:53<1:04:57, 1.55it/s, loss=0.0394, lr=1.84e-05, step=3968] Training: 40%|███▉ | 3969/10000 [46:53<1:04:57, 1.55it/s, loss=0.0081, lr=1.84e-05, step=3969]16:53:00.357 [I] step=3970 loss=0.0135 smoothed_loss=0.0173 lr=1.84e-05 grad_norm=0.5222 step_time=0.5556s data_time=0.0695s it/s=1.600 eta_to_10000=3768.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0114 grad_action_out_proj=0.1270 grad_shared_expert=0.4996 (10775:train_pytorch.py:850) + Training: 40%|███▉ | 3970/10000 [46:53<1:08:03, 1.48it/s, loss=0.0081, lr=1.84e-05, step=3969] Training: 40%|███▉ | 3970/10000 [46:53<1:08:03, 1.48it/s, loss=0.0135, lr=1.84e-05, step=3970] Training: 40%|███▉ | 3971/10000 [46:54<1:03:27, 1.58it/s, loss=0.0135, lr=1.84e-05, step=3970] Training: 40%|███▉ | 3971/10000 [46:54<1:03:27, 1.58it/s, loss=0.0293, lr=1.84e-05, step=3971] Training: 40%|███▉ | 3972/10000 [46:54<59:25, 1.69it/s, loss=0.0293, lr=1.84e-05, step=3971] Training: 40%|███▉ | 3972/10000 [46:54<59:25, 1.69it/s, loss=0.0260, lr=1.84e-05, step=3972] Training: 40%|███▉ | 3973/10000 [46:55<56:24, 1.78it/s, loss=0.0260, lr=1.84e-05, step=3972] Training: 40%|███▉ | 3973/10000 [46:55<56:24, 1.78it/s, loss=0.0046, lr=1.84e-05, step=3973] Training: 40%|███▉ | 3974/10000 [46:55<54:59, 1.83it/s, loss=0.0046, lr=1.84e-05, step=3973] Training: 40%|███▉ | 3974/10000 [46:55<54:59, 1.83it/s, loss=0.0122, lr=1.84e-05, step=3974] Training: 40%|███▉ | 3975/10000 [46:56<53:45, 1.87it/s, loss=0.0122, lr=1.84e-05, step=3974] Training: 40%|███▉ | 3975/10000 [46:56<53:45, 1.87it/s, loss=0.0241, lr=1.84e-05, step=3975] Training: 40%|███▉ | 3976/10000 [46:57<59:35, 1.68it/s, loss=0.0241, lr=1.84e-05, step=3975] Training: 40%|███▉ | 3976/10000 [46:57<59:35, 1.68it/s, loss=0.0090, lr=1.84e-05, step=3976] Training: 40%|███▉ | 3977/10000 [46:57<1:03:36, 1.58it/s, loss=0.0090, lr=1.84e-05, step=3976] Training: 40%|███▉ | 3977/10000 [46:57<1:03:36, 1.58it/s, loss=0.0154, lr=1.84e-05, step=3977] Training: 40%|███▉ | 3978/10000 [46:58<1:00:00, 1.67it/s, loss=0.0154, lr=1.84e-05, step=3977] Training: 40%|███▉ | 3978/10000 [46:58<1:00:00, 1.67it/s, loss=0.0062, lr=1.83e-05, step=3978] Training: 40%|███▉ | 3979/10000 [46:58<57:14, 1.75it/s, loss=0.0062, lr=1.83e-05, step=3978] Training: 40%|███▉ | 3979/10000 [46:58<57:14, 1.75it/s, loss=0.0121, lr=1.83e-05, step=3979]16:53:05.907 [I] step=3980 loss=0.0058 smoothed_loss=0.0145 lr=1.84e-05 grad_norm=0.5200 step_time=0.4964s data_time=0.0585s it/s=1.802 eta_to_10000=3340.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0286 grad_action_out_proj=0.1615 grad_shared_expert=0.3913 (10775:train_pytorch.py:850) + Training: 40%|███▉ | 3980/10000 [46:59<56:11, 1.79it/s, loss=0.0121, lr=1.83e-05, step=3979] Training: 40%|███▉ | 3980/10000 [46:59<56:11, 1.79it/s, loss=0.0058, lr=1.83e-05, step=3980] Training: 40%|███▉ | 3981/10000 [46:59<54:25, 1.84it/s, loss=0.0058, lr=1.83e-05, step=3980] Training: 40%|███▉ | 3981/10000 [46:59<54:25, 1.84it/s, loss=0.0184, lr=1.83e-05, step=3981] Training: 40%|███▉ | 3982/10000 [47:00<53:51, 1.86it/s, loss=0.0184, lr=1.83e-05, step=3981] Training: 40%|███▉ | 3982/10000 [47:00<53:51, 1.86it/s, loss=0.0222, lr=1.83e-05, step=3982] Training: 40%|███▉ | 3983/10000 [47:01<1:00:03, 1.67it/s, loss=0.0222, lr=1.83e-05, step=3982] Training: 40%|███▉ | 3983/10000 [47:01<1:00:03, 1.67it/s, loss=0.0399, lr=1.83e-05, step=3983] Training: 40%|███▉ | 3984/10000 [47:01<56:56, 1.76it/s, loss=0.0399, lr=1.83e-05, step=3983] Training: 40%|███▉ | 3984/10000 [47:01<56:56, 1.76it/s, loss=0.0050, lr=1.83e-05, step=3984] Training: 40%|███▉ | 3985/10000 [47:02<1:01:29, 1.63it/s, loss=0.0050, lr=1.83e-05, step=3984] Training: 40%|███▉ | 3985/10000 [47:02<1:01:29, 1.63it/s, loss=0.0062, lr=1.83e-05, step=3985] Training: 40%|███▉ | 3986/10000 [47:02<57:40, 1.74it/s, loss=0.0062, lr=1.83e-05, step=3985] Training: 40%|███▉ | 3986/10000 [47:02<57:40, 1.74it/s, loss=0.0053, lr=1.83e-05, step=3986] Training: 40%|███▉ | 3987/10000 [47:03<55:01, 1.82it/s, loss=0.0053, lr=1.83e-05, step=3986] Training: 40%|███▉ | 3987/10000 [47:03<55:01, 1.82it/s, loss=0.0065, lr=1.83e-05, step=3987] Training: 40%|███▉ | 3988/10000 [47:03<53:30, 1.87it/s, loss=0.0065, lr=1.83e-05, step=3987] Training: 40%|███▉ | 3988/10000 [47:03<53:30, 1.87it/s, loss=0.0086, lr=1.83e-05, step=3988] Training: 40%|███▉ | 3989/10000 [47:04<59:51, 1.67it/s, loss=0.0086, lr=1.83e-05, step=3988] Training: 40%|███▉ | 3989/10000 [47:04<59:51, 1.67it/s, loss=0.0153, lr=1.83e-05, step=3989]16:53:11.659 [I] step=3990 loss=0.0348 smoothed_loss=0.0156 lr=1.83e-05 grad_norm=0.4220 step_time=0.5159s data_time=0.0594s it/s=1.739 eta_to_10000=3456.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0188 grad_action_out_proj=0.1712 grad_shared_expert=0.6352 (10775:train_pytorch.py:850) + Training: 40%|███▉ | 3990/10000 [47:05<58:24, 1.71it/s, loss=0.0153, lr=1.83e-05, step=3989] Training: 40%|███▉ | 3990/10000 [47:05<58:24, 1.71it/s, loss=0.0348, lr=1.83e-05, step=3990] Training: 40%|███▉ | 3991/10000 [47:05<55:56, 1.79it/s, loss=0.0348, lr=1.83e-05, step=3990] Training: 40%|███▉ | 3991/10000 [47:05<55:56, 1.79it/s, loss=0.0177, lr=1.83e-05, step=3991] Training: 40%|███▉ | 3992/10000 [47:06<1:00:22, 1.66it/s, loss=0.0177, lr=1.83e-05, step=3991] Training: 40%|███▉ | 3992/10000 [47:06<1:00:22, 1.66it/s, loss=0.0042, lr=1.83e-05, step=3992] Training: 40%|███▉ | 3993/10000 [47:06<57:12, 1.75it/s, loss=0.0042, lr=1.83e-05, step=3992] Training: 40%|███▉ | 3993/10000 [47:06<57:12, 1.75it/s, loss=0.0241, lr=1.83e-05, step=3993] Training: 40%|███▉ | 3994/10000 [47:07<55:10, 1.81it/s, loss=0.0241, lr=1.83e-05, step=3993] Training: 40%|███▉ | 3994/10000 [47:07<55:10, 1.81it/s, loss=0.0506, lr=1.83e-05, step=3994] Training: 40%|███▉ | 3995/10000 [47:08<57:44, 1.73it/s, loss=0.0506, lr=1.83e-05, step=3994] Training: 40%|███▉ | 3995/10000 [47:08<57:44, 1.73it/s, loss=0.0084, lr=1.83e-05, step=3995] Training: 40%|███▉ | 3996/10000 [47:08<1:05:14, 1.53it/s, loss=0.0084, lr=1.83e-05, step=3995] Training: 40%|███▉ | 3996/10000 [47:08<1:05:14, 1.53it/s, loss=0.0127, lr=1.83e-05, step=3996] Training: 40%|███▉ | 3997/10000 [47:09<1:02:33, 1.60it/s, loss=0.0127, lr=1.83e-05, step=3996] Training: 40%|███▉ | 3997/10000 [47:09<1:02:33, 1.60it/s, loss=0.0188, lr=1.83e-05, step=3997] Training: 40%|███▉ | 3998/10000 [47:10<1:03:30, 1.58it/s, loss=0.0188, lr=1.83e-05, step=3997] Training: 40%|███▉ | 3998/10000 [47:10<1:03:30, 1.58it/s, loss=0.0134, lr=1.83e-05, step=3998] Training: 40%|███▉ | 3999/10000 [47:10<1:08:29, 1.46it/s, loss=0.0134, lr=1.83e-05, step=3998] Training: 40%|███▉ | 3999/10000 [47:10<1:08:29, 1.46it/s, loss=0.0052, lr=1.83e-05, step=3999]16:53:17.993 [I] step=4000 loss=0.0270 smoothed_loss=0.0171 lr=1.83e-05 grad_norm=0.4555 step_time=0.5572s data_time=0.0760s it/s=1.579 eta_to_10000=3799.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0164 grad_action_out_proj=0.1647 grad_shared_expert=0.3553 (10775:train_pytorch.py:850) +16:54:35.424 [I] Saved checkpoint at step 4000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/4000 (10775:train_pytorch.py:350) + Training: 40%|████ | 4000/10000 [48:28<39:50:03, 23.90s/it, loss=0.0052, lr=1.83e-05, step=3999] Training: 40%|████ | 4000/10000 [48:28<39:50:03, 23.90s/it, loss=0.0270, lr=1.83e-05, step=4000] Training: 40%|████ | 4001/10000 [48:29<28:11:02, 16.91s/it, loss=0.0270, lr=1.83e-05, step=4000] Training: 40%|████ | 4001/10000 [48:29<28:11:02, 16.91s/it, loss=0.0191, lr=1.83e-05, step=4001] Training: 40%|████ | 4002/10000 [48:30<19:59:12, 12.00s/it, loss=0.0191, lr=1.83e-05, step=4001] Training: 40%|████ | 4002/10000 [48:30<19:59:12, 12.00s/it, loss=0.0062, lr=1.83e-05, step=4002] Training: 40%|████ | 4003/10000 [48:30<14:14:32, 8.55s/it, loss=0.0062, lr=1.83e-05, step=4002] Training: 40%|████ | 4003/10000 [48:30<14:14:32, 8.55s/it, loss=0.0083, lr=1.83e-05, step=4003] Training: 40%|████ | 4004/10000 [48:31<10:21:09, 6.22s/it, loss=0.0083, lr=1.83e-05, step=4003] Training: 40%|████ | 4004/10000 [48:31<10:21:09, 6.22s/it, loss=0.0159, lr=1.83e-05, step=4004] Training: 40%|████ | 4005/10000 [48:31<7:30:24, 4.51s/it, loss=0.0159, lr=1.83e-05, step=4004] Training: 40%|████ | 4005/10000 [48:31<7:30:24, 4.51s/it, loss=0.0748, lr=1.83e-05, step=4005] Training: 40%|████ | 4006/10000 [48:32<5:38:04, 3.38s/it, loss=0.0748, lr=1.83e-05, step=4005] Training: 40%|████ | 4006/10000 [48:32<5:38:04, 3.38s/it, loss=0.0062, lr=1.83e-05, step=4006] Training: 40%|████ | 4007/10000 [48:33<4:12:27, 2.53s/it, loss=0.0062, lr=1.83e-05, step=4006] Training: 40%|████ | 4007/10000 [48:33<4:12:27, 2.53s/it, loss=0.0149, lr=1.82e-05, step=4007] Training: 40%|████ | 4008/10000 [48:33<3:11:44, 1.92s/it, loss=0.0149, lr=1.82e-05, step=4007] Training: 40%|████ | 4008/10000 [48:33<3:11:44, 1.92s/it, loss=0.0199, lr=1.82e-05, step=4008] Training: 40%|████ | 4009/10000 [48:34<2:30:59, 1.51s/it, loss=0.0199, lr=1.82e-05, step=4008] Training: 40%|████ | 4009/10000 [48:34<2:30:59, 1.51s/it, loss=0.0631, lr=1.82e-05, step=4009]16:54:41.236 [I] step=4010 loss=0.0063 smoothed_loss=0.0220 lr=1.83e-05 grad_norm=0.5309 step_time=0.5112s data_time=7.8133s it/s=0.120 eta_to_10000=49862.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.1802 grad_shared_expert=0.4481 (10775:train_pytorch.py:850) + Training: 40%|████ | 4010/10000 [48:34<2:01:22, 1.22s/it, loss=0.0631, lr=1.82e-05, step=4009] Training: 40%|████ | 4010/10000 [48:34<2:01:22, 1.22s/it, loss=0.0063, lr=1.82e-05, step=4010] Training: 40%|████ | 4011/10000 [48:35<1:47:51, 1.08s/it, loss=0.0063, lr=1.82e-05, step=4010] Training: 40%|████ | 4011/10000 [48:35<1:47:51, 1.08s/it, loss=0.0102, lr=1.82e-05, step=4011] Training: 40%|████ | 4012/10000 [48:36<1:29:48, 1.11it/s, loss=0.0102, lr=1.82e-05, step=4011] Training: 40%|████ | 4012/10000 [48:36<1:29:48, 1.11it/s, loss=0.0452, lr=1.82e-05, step=4012] Training: 40%|████ | 4013/10000 [48:36<1:25:27, 1.17it/s, loss=0.0452, lr=1.82e-05, step=4012] Training: 40%|████ | 4013/10000 [48:36<1:25:27, 1.17it/s, loss=0.0040, lr=1.82e-05, step=4013] Training: 40%|████ | 4014/10000 [48:37<1:15:40, 1.32it/s, loss=0.0040, lr=1.82e-05, step=4013] Training: 40%|████ | 4014/10000 [48:37<1:15:40, 1.32it/s, loss=0.0132, lr=1.82e-05, step=4014] Training: 40%|████ | 4015/10000 [48:37<1:08:02, 1.47it/s, loss=0.0132, lr=1.82e-05, step=4014] Training: 40%|████ | 4015/10000 [48:37<1:08:02, 1.47it/s, loss=0.0334, lr=1.82e-05, step=4015] Training: 40%|████ | 4016/10000 [48:38<1:02:20, 1.60it/s, loss=0.0334, lr=1.82e-05, step=4015] Training: 40%|████ | 4016/10000 [48:38<1:02:20, 1.60it/s, loss=0.0175, lr=1.82e-05, step=4016] Training: 40%|████ | 4017/10000 [48:38<58:35, 1.70it/s, loss=0.0175, lr=1.82e-05, step=4016] Training: 40%|████ | 4017/10000 [48:38<58:35, 1.70it/s, loss=0.0140, lr=1.82e-05, step=4017] Training: 40%|████ | 4018/10000 [48:39<1:03:00, 1.58it/s, loss=0.0140, lr=1.82e-05, step=4017] Training: 40%|████ | 4018/10000 [48:39<1:03:00, 1.58it/s, loss=0.0071, lr=1.82e-05, step=4018] Training: 40%|████ | 4019/10000 [48:40<58:42, 1.70it/s, loss=0.0071, lr=1.82e-05, step=4018] Training: 40%|████ | 4019/10000 [48:40<58:42, 1.70it/s, loss=0.0053, lr=1.82e-05, step=4019]16:54:47.218 [I] step=4020 loss=0.0132 smoothed_loss=0.0174 lr=1.82e-05 grad_norm=0.5327 step_time=0.5362s data_time=0.0620s it/s=1.672 eta_to_10000=3576.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0274 grad_action_out_proj=0.1844 grad_shared_expert=0.5666 (10775:train_pytorch.py:850) + Training: 40%|████ | 4020/10000 [48:40<1:03:01, 1.58it/s, loss=0.0053, lr=1.82e-05, step=4019] Training: 40%|████ | 4020/10000 [48:40<1:03:01, 1.58it/s, loss=0.0132, lr=1.82e-05, step=4020] Training: 40%|████ | 4021/10000 [48:41<59:06, 1.69it/s, loss=0.0132, lr=1.82e-05, step=4020] Training: 40%|████ | 4021/10000 [48:41<59:06, 1.69it/s, loss=0.0193, lr=1.82e-05, step=4021] Training: 40%|████ | 4022/10000 [48:41<56:34, 1.76it/s, loss=0.0193, lr=1.82e-05, step=4021] Training: 40%|████ | 4022/10000 [48:41<56:34, 1.76it/s, loss=0.0073, lr=1.82e-05, step=4022] Training: 40%|████ | 4023/10000 [48:42<55:15, 1.80it/s, loss=0.0073, lr=1.82e-05, step=4022] Training: 40%|████ | 4023/10000 [48:42<55:15, 1.80it/s, loss=0.0109, lr=1.82e-05, step=4023] Training: 40%|████ | 4024/10000 [48:42<53:40, 1.86it/s, loss=0.0109, lr=1.82e-05, step=4023] Training: 40%|████ | 4024/10000 [48:42<53:40, 1.86it/s, loss=0.1873, lr=1.82e-05, step=4024] Training: 40%|████ | 4025/10000 [48:43<59:55, 1.66it/s, loss=0.1873, lr=1.82e-05, step=4024] Training: 40%|████ | 4025/10000 [48:43<59:55, 1.66it/s, loss=0.0060, lr=1.82e-05, step=4025] Training: 40%|████ | 4026/10000 [48:44<55:55, 1.78it/s, loss=0.0060, lr=1.82e-05, step=4025] Training: 40%|████ | 4026/10000 [48:44<55:55, 1.78it/s, loss=0.0290, lr=1.82e-05, step=4026] Training: 40%|████ | 4027/10000 [48:44<55:31, 1.79it/s, loss=0.0290, lr=1.82e-05, step=4026] Training: 40%|████ | 4027/10000 [48:44<55:31, 1.79it/s, loss=0.0211, lr=1.82e-05, step=4027] Training: 40%|████ | 4028/10000 [48:45<1:00:47, 1.64it/s, loss=0.0211, lr=1.82e-05, step=4027] Training: 40%|████ | 4028/10000 [48:45<1:00:47, 1.64it/s, loss=0.0233, lr=1.82e-05, step=4028] Training: 40%|████ | 4029/10000 [48:45<57:34, 1.73it/s, loss=0.0233, lr=1.82e-05, step=4028] Training: 40%|████ | 4029/10000 [48:45<57:34, 1.73it/s, loss=0.0033, lr=1.82e-05, step=4029]16:54:52.809 [I] step=4030 loss=0.0189 smoothed_loss=0.0255 lr=1.82e-05 grad_norm=0.4959 step_time=0.4953s data_time=0.0638s it/s=1.789 eta_to_10000=3337.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.0989 grad_shared_expert=0.3341 (10775:train_pytorch.py:850) + Training: 40%|████ | 4030/10000 [48:46<56:44, 1.75it/s, loss=0.0033, lr=1.82e-05, step=4029] Training: 40%|████ | 4030/10000 [48:46<56:44, 1.75it/s, loss=0.0189, lr=1.82e-05, step=4030] Training: 40%|████ | 4031/10000 [48:46<55:50, 1.78it/s, loss=0.0189, lr=1.82e-05, step=4030] Training: 40%|████ | 4031/10000 [48:46<55:50, 1.78it/s, loss=0.0149, lr=1.82e-05, step=4031] Training: 40%|████ | 4032/10000 [48:47<1:00:21, 1.65it/s, loss=0.0149, lr=1.82e-05, step=4031] Training: 40%|████ | 4032/10000 [48:47<1:00:21, 1.65it/s, loss=0.0092, lr=1.82e-05, step=4032] Training: 40%|████ | 4033/10000 [48:48<57:05, 1.74it/s, loss=0.0092, lr=1.82e-05, step=4032] Training: 40%|████ | 4033/10000 [48:48<57:05, 1.74it/s, loss=0.0072, lr=1.82e-05, step=4033] Training: 40%|████ | 4034/10000 [48:48<56:04, 1.77it/s, loss=0.0072, lr=1.82e-05, step=4033] Training: 40%|████ | 4034/10000 [48:48<56:04, 1.77it/s, loss=0.0256, lr=1.82e-05, step=4034] Training: 40%|████ | 4035/10000 [48:49<1:00:39, 1.64it/s, loss=0.0256, lr=1.82e-05, step=4034] Training: 40%|████ | 4035/10000 [48:49<1:00:39, 1.64it/s, loss=0.0088, lr=1.82e-05, step=4035] Training: 40%|████ | 4036/10000 [48:49<57:06, 1.74it/s, loss=0.0088, lr=1.82e-05, step=4035] Training: 40%|████ | 4036/10000 [48:49<57:06, 1.74it/s, loss=0.0028, lr=1.81e-05, step=4036] Training: 40%|████ | 4037/10000 [48:50<54:33, 1.82it/s, loss=0.0028, lr=1.81e-05, step=4036] Training: 40%|████ | 4037/10000 [48:50<54:33, 1.82it/s, loss=0.0050, lr=1.81e-05, step=4037] Training: 40%|████ | 4038/10000 [48:51<1:00:19, 1.65it/s, loss=0.0050, lr=1.81e-05, step=4037] Training: 40%|████ | 4038/10000 [48:51<1:00:19, 1.65it/s, loss=0.0337, lr=1.81e-05, step=4038] Training: 40%|████ | 4039/10000 [48:51<58:54, 1.69it/s, loss=0.0337, lr=1.81e-05, step=4038] Training: 40%|████ | 4039/10000 [48:51<58:54, 1.69it/s, loss=0.0211, lr=1.81e-05, step=4039]16:54:58.644 [I] step=4040 loss=0.0219 smoothed_loss=0.0195 lr=1.82e-05 grad_norm=0.4686 step_time=0.5198s data_time=0.0639s it/s=1.714 eta_to_10000=3477.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0126 grad_action_out_proj=0.1487 grad_shared_expert=0.6114 (10775:train_pytorch.py:850) + Training: 40%|████ | 4040/10000 [48:52<57:24, 1.73it/s, loss=0.0211, lr=1.81e-05, step=4039] Training: 40%|████ | 4040/10000 [48:52<57:24, 1.73it/s, loss=0.0219, lr=1.81e-05, step=4040] Training: 40%|████ | 4041/10000 [48:52<55:35, 1.79it/s, loss=0.0219, lr=1.81e-05, step=4040] Training: 40%|████ | 4041/10000 [48:52<55:35, 1.79it/s, loss=0.0110, lr=1.81e-05, step=4041] Training: 40%|████ | 4042/10000 [48:53<1:00:06, 1.65it/s, loss=0.0110, lr=1.81e-05, step=4041] Training: 40%|████ | 4042/10000 [48:53<1:00:06, 1.65it/s, loss=0.0167, lr=1.81e-05, step=4042] Training: 40%|████ | 4043/10000 [48:54<1:01:18, 1.62it/s, loss=0.0167, lr=1.81e-05, step=4042] Training: 40%|████ | 4043/10000 [48:54<1:01:18, 1.62it/s, loss=0.0381, lr=1.81e-05, step=4043] Training: 40%|████ | 4044/10000 [48:54<57:27, 1.73it/s, loss=0.0381, lr=1.81e-05, step=4043] Training: 40%|████ | 4044/10000 [48:54<57:27, 1.73it/s, loss=0.0121, lr=1.81e-05, step=4044] Training: 40%|████ | 4045/10000 [48:55<1:01:49, 1.61it/s, loss=0.0121, lr=1.81e-05, step=4044] Training: 40%|████ | 4045/10000 [48:55<1:01:49, 1.61it/s, loss=0.0167, lr=1.81e-05, step=4045] Training: 40%|████ | 4046/10000 [48:55<59:10, 1.68it/s, loss=0.0167, lr=1.81e-05, step=4045] Training: 40%|████ | 4046/10000 [48:55<59:10, 1.68it/s, loss=0.0141, lr=1.81e-05, step=4046] Training: 40%|████ | 4047/10000 [48:56<58:46, 1.69it/s, loss=0.0141, lr=1.81e-05, step=4046] Training: 40%|████ | 4047/10000 [48:56<58:46, 1.69it/s, loss=0.0018, lr=1.81e-05, step=4047] Training: 40%|████ | 4048/10000 [48:56<56:17, 1.76it/s, loss=0.0018, lr=1.81e-05, step=4047] Training: 40%|████ | 4048/10000 [48:56<56:17, 1.76it/s, loss=0.0159, lr=1.81e-05, step=4048] Training: 40%|████ | 4049/10000 [48:57<54:28, 1.82it/s, loss=0.0159, lr=1.81e-05, step=4048] Training: 40%|████ | 4049/10000 [48:57<54:28, 1.82it/s, loss=0.0114, lr=1.81e-05, step=4049]16:55:04.641 [I] step=4050 loss=0.0220 smoothed_loss=0.0170 lr=1.81e-05 grad_norm=0.4535 step_time=0.5289s data_time=0.0708s it/s=1.668 eta_to_10000=3567.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.0955 grad_shared_expert=0.4201 (10775:train_pytorch.py:850) + Training: 40%|████ | 4050/10000 [48:58<1:01:08, 1.62it/s, loss=0.0114, lr=1.81e-05, step=4049] Training: 40%|████ | 4050/10000 [48:58<1:01:08, 1.62it/s, loss=0.0220, lr=1.81e-05, step=4050] Training: 41%|████ | 4051/10000 [48:58<58:10, 1.70it/s, loss=0.0220, lr=1.81e-05, step=4050] Training: 41%|████ | 4051/10000 [48:58<58:10, 1.70it/s, loss=0.0173, lr=1.81e-05, step=4051] Training: 41%|████ | 4052/10000 [48:59<55:42, 1.78it/s, loss=0.0173, lr=1.81e-05, step=4051] Training: 41%|████ | 4052/10000 [48:59<55:42, 1.78it/s, loss=0.0162, lr=1.81e-05, step=4052] Training: 41%|████ | 4053/10000 [49:00<1:03:04, 1.57it/s, loss=0.0162, lr=1.81e-05, step=4052] Training: 41%|████ | 4053/10000 [49:00<1:03:04, 1.57it/s, loss=0.0183, lr=1.81e-05, step=4053] Training: 41%|████ | 4054/10000 [49:00<58:55, 1.68it/s, loss=0.0183, lr=1.81e-05, step=4053] Training: 41%|████ | 4054/10000 [49:00<58:55, 1.68it/s, loss=0.0119, lr=1.81e-05, step=4054] Training: 41%|████ | 4055/10000 [49:01<56:12, 1.76it/s, loss=0.0119, lr=1.81e-05, step=4054] Training: 41%|████ | 4055/10000 [49:01<56:12, 1.76it/s, loss=0.0117, lr=1.81e-05, step=4055] Training: 41%|████ | 4056/10000 [49:01<53:58, 1.84it/s, loss=0.0117, lr=1.81e-05, step=4055] Training: 41%|████ | 4056/10000 [49:01<53:58, 1.84it/s, loss=0.0123, lr=1.81e-05, step=4056] Training: 41%|████ | 4057/10000 [49:02<59:16, 1.67it/s, loss=0.0123, lr=1.81e-05, step=4056] Training: 41%|████ | 4057/10000 [49:02<59:16, 1.67it/s, loss=0.0108, lr=1.81e-05, step=4057] Training: 41%|████ | 4058/10000 [49:02<58:15, 1.70it/s, loss=0.0108, lr=1.81e-05, step=4057] Training: 41%|████ | 4058/10000 [49:02<58:15, 1.70it/s, loss=0.0059, lr=1.81e-05, step=4058] Training: 41%|████ | 4059/10000 [49:03<55:51, 1.77it/s, loss=0.0059, lr=1.81e-05, step=4058] Training: 41%|████ | 4059/10000 [49:03<55:51, 1.77it/s, loss=0.0290, lr=1.81e-05, step=4059]16:55:10.502 [I] step=4060 loss=0.0161 smoothed_loss=0.0158 lr=1.81e-05 grad_norm=0.3984 step_time=0.5245s data_time=0.0614s it/s=1.707 eta_to_10000=3480.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.1256 grad_shared_expert=0.4174 (10775:train_pytorch.py:850) + Training: 41%|████ | 4060/10000 [49:04<1:01:04, 1.62it/s, loss=0.0290, lr=1.81e-05, step=4059] Training: 41%|████ | 4060/10000 [49:04<1:01:04, 1.62it/s, loss=0.0161, lr=1.81e-05, step=4060] Training: 41%|████ | 4061/10000 [49:04<56:59, 1.74it/s, loss=0.0161, lr=1.81e-05, step=4060] Training: 41%|████ | 4061/10000 [49:04<56:59, 1.74it/s, loss=0.0058, lr=1.81e-05, step=4061] Training: 41%|████ | 4062/10000 [49:05<54:30, 1.82it/s, loss=0.0058, lr=1.81e-05, step=4061] Training: 41%|████ | 4062/10000 [49:05<54:30, 1.82it/s, loss=0.0394, lr=1.81e-05, step=4062] Training: 41%|████ | 4063/10000 [49:05<1:00:15, 1.64it/s, loss=0.0394, lr=1.81e-05, step=4062] Training: 41%|████ | 4063/10000 [49:05<1:00:15, 1.64it/s, loss=0.0129, lr=1.81e-05, step=4063] Training: 41%|████ | 4064/10000 [49:06<57:31, 1.72it/s, loss=0.0129, lr=1.81e-05, step=4063] Training: 41%|████ | 4064/10000 [49:06<57:31, 1.72it/s, loss=0.0142, lr=1.81e-05, step=4064] Training: 41%|████ | 4065/10000 [49:06<55:45, 1.77it/s, loss=0.0142, lr=1.81e-05, step=4064] Training: 41%|████ | 4065/10000 [49:06<55:45, 1.77it/s, loss=0.0191, lr=1.81e-05, step=4065] Training: 41%|████ | 4066/10000 [49:07<53:48, 1.84it/s, loss=0.0191, lr=1.81e-05, step=4065] Training: 41%|████ | 4066/10000 [49:07<53:48, 1.84it/s, loss=0.0297, lr=1.80e-05, step=4066] Training: 41%|████ | 4067/10000 [49:07<52:31, 1.88it/s, loss=0.0297, lr=1.80e-05, step=4066] Training: 41%|████ | 4067/10000 [49:07<52:31, 1.88it/s, loss=0.0046, lr=1.80e-05, step=4067] Training: 41%|████ | 4068/10000 [49:08<58:29, 1.69it/s, loss=0.0046, lr=1.80e-05, step=4067] Training: 41%|████ | 4068/10000 [49:08<58:29, 1.69it/s, loss=0.0259, lr=1.80e-05, step=4068] Training: 41%|████ | 4069/10000 [49:09<55:47, 1.77it/s, loss=0.0259, lr=1.80e-05, step=4068] Training: 41%|████ | 4069/10000 [49:09<55:47, 1.77it/s, loss=0.0073, lr=1.80e-05, step=4069]16:55:16.014 [I] step=4070 loss=0.0117 smoothed_loss=0.0161 lr=1.80e-05 grad_norm=0.4927 step_time=0.4887s data_time=0.0626s it/s=1.814 eta_to_10000=3268.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0184 grad_action_out_proj=0.1479 grad_shared_expert=0.4105 (10775:train_pytorch.py:850) + Training: 41%|████ | 4070/10000 [49:09<54:34, 1.81it/s, loss=0.0073, lr=1.80e-05, step=4069] Training: 41%|████ | 4070/10000 [49:09<54:34, 1.81it/s, loss=0.0117, lr=1.80e-05, step=4070] Training: 41%|████ | 4071/10000 [49:10<1:00:43, 1.63it/s, loss=0.0117, lr=1.80e-05, step=4070] Training: 41%|████ | 4071/10000 [49:10<1:00:43, 1.63it/s, loss=0.0067, lr=1.80e-05, step=4071] Training: 41%|████ | 4072/10000 [49:10<57:34, 1.72it/s, loss=0.0067, lr=1.80e-05, step=4071] Training: 41%|████ | 4072/10000 [49:10<57:34, 1.72it/s, loss=0.0231, lr=1.80e-05, step=4072] Training: 41%|████ | 4073/10000 [49:11<54:38, 1.81it/s, loss=0.0231, lr=1.80e-05, step=4072] Training: 41%|████ | 4073/10000 [49:11<54:38, 1.81it/s, loss=0.0148, lr=1.80e-05, step=4073] Training: 41%|████ | 4074/10000 [49:11<53:20, 1.85it/s, loss=0.0148, lr=1.80e-05, step=4073] Training: 41%|████ | 4074/10000 [49:11<53:20, 1.85it/s, loss=0.0201, lr=1.80e-05, step=4074] Training: 41%|████ | 4075/10000 [49:12<58:42, 1.68it/s, loss=0.0201, lr=1.80e-05, step=4074] Training: 41%|████ | 4075/10000 [49:12<58:42, 1.68it/s, loss=0.0070, lr=1.80e-05, step=4075] Training: 41%|████ | 4076/10000 [49:13<55:23, 1.78it/s, loss=0.0070, lr=1.80e-05, step=4075] Training: 41%|████ | 4076/10000 [49:13<55:23, 1.78it/s, loss=0.0347, lr=1.80e-05, step=4076] Training: 41%|████ | 4077/10000 [49:13<53:40, 1.84it/s, loss=0.0347, lr=1.80e-05, step=4076] Training: 41%|████ | 4077/10000 [49:13<53:40, 1.84it/s, loss=0.0116, lr=1.80e-05, step=4077] Training: 41%|████ | 4078/10000 [49:14<59:53, 1.65it/s, loss=0.0116, lr=1.80e-05, step=4077] Training: 41%|████ | 4078/10000 [49:14<59:53, 1.65it/s, loss=0.0226, lr=1.80e-05, step=4078] Training: 41%|████ | 4079/10000 [49:14<57:24, 1.72it/s, loss=0.0226, lr=1.80e-05, step=4078] Training: 41%|████ | 4079/10000 [49:14<57:24, 1.72it/s, loss=0.0129, lr=1.80e-05, step=4079]16:55:21.798 [I] step=4080 loss=0.0202 smoothed_loss=0.0172 lr=1.80e-05 grad_norm=0.5147 step_time=0.5147s data_time=0.0636s it/s=1.730 eta_to_10000=3422.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0681 grad_shared_expert=0.2773 (10775:train_pytorch.py:850) + Training: 41%|████ | 4080/10000 [49:15<56:02, 1.76it/s, loss=0.0129, lr=1.80e-05, step=4079] Training: 41%|████ | 4080/10000 [49:15<56:02, 1.76it/s, loss=0.0202, lr=1.80e-05, step=4080] Training: 41%|████ | 4081/10000 [49:15<54:12, 1.82it/s, loss=0.0202, lr=1.80e-05, step=4080] Training: 41%|████ | 4081/10000 [49:15<54:12, 1.82it/s, loss=0.0084, lr=1.80e-05, step=4081] Training: 41%|████ | 4082/10000 [49:16<52:15, 1.89it/s, loss=0.0084, lr=1.80e-05, step=4081] Training: 41%|████ | 4082/10000 [49:16<52:15, 1.89it/s, loss=0.0090, lr=1.80e-05, step=4082] Training: 41%|████ | 4083/10000 [49:17<58:08, 1.70it/s, loss=0.0090, lr=1.80e-05, step=4082] Training: 41%|████ | 4083/10000 [49:17<58:08, 1.70it/s, loss=0.0100, lr=1.80e-05, step=4083] Training: 41%|████ | 4084/10000 [49:17<54:57, 1.79it/s, loss=0.0100, lr=1.80e-05, step=4083] Training: 41%|████ | 4084/10000 [49:17<54:57, 1.79it/s, loss=0.0065, lr=1.80e-05, step=4084] Training: 41%|████ | 4085/10000 [49:18<59:18, 1.66it/s, loss=0.0065, lr=1.80e-05, step=4084] Training: 41%|████ | 4085/10000 [49:18<59:18, 1.66it/s, loss=0.0114, lr=1.80e-05, step=4085] Training: 41%|████ | 4086/10000 [49:18<56:27, 1.75it/s, loss=0.0114, lr=1.80e-05, step=4085] Training: 41%|████ | 4086/10000 [49:18<56:27, 1.75it/s, loss=0.0192, lr=1.80e-05, step=4086] Training: 41%|████ | 4087/10000 [49:19<54:17, 1.82it/s, loss=0.0192, lr=1.80e-05, step=4086] Training: 41%|████ | 4087/10000 [49:19<54:17, 1.82it/s, loss=0.0111, lr=1.80e-05, step=4087] Training: 41%|████ | 4088/10000 [49:19<54:17, 1.81it/s, loss=0.0111, lr=1.80e-05, step=4087] Training: 41%|████ | 4088/10000 [49:19<54:17, 1.81it/s, loss=0.0145, lr=1.80e-05, step=4088] Training: 41%|████ | 4089/10000 [49:20<52:45, 1.87it/s, loss=0.0145, lr=1.80e-05, step=4088] Training: 41%|████ | 4089/10000 [49:20<52:45, 1.87it/s, loss=0.0019, lr=1.80e-05, step=4089]16:55:27.519 [I] step=4090 loss=0.0331 smoothed_loss=0.0149 lr=1.80e-05 grad_norm=0.5254 step_time=0.5123s data_time=0.0598s it/s=1.748 eta_to_10000=3380.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.1330 grad_shared_expert=0.6185 (10775:train_pytorch.py:850) + Training: 41%|████ | 4090/10000 [49:21<59:21, 1.66it/s, loss=0.0019, lr=1.80e-05, step=4089] Training: 41%|████ | 4090/10000 [49:21<59:21, 1.66it/s, loss=0.0331, lr=1.80e-05, step=4090] Training: 41%|████ | 4091/10000 [49:21<56:31, 1.74it/s, loss=0.0331, lr=1.80e-05, step=4090] Training: 41%|████ | 4091/10000 [49:21<56:31, 1.74it/s, loss=0.0087, lr=1.80e-05, step=4091] Training: 41%|████ | 4092/10000 [49:22<1:01:09, 1.61it/s, loss=0.0087, lr=1.80e-05, step=4091] Training: 41%|████ | 4092/10000 [49:22<1:01:09, 1.61it/s, loss=0.1262, lr=1.80e-05, step=4092] Training: 41%|████ | 4093/10000 [49:22<57:36, 1.71it/s, loss=0.1262, lr=1.80e-05, step=4092] Training: 41%|████ | 4093/10000 [49:22<57:36, 1.71it/s, loss=0.0218, lr=1.80e-05, step=4093] Training: 41%|████ | 4094/10000 [49:23<55:13, 1.78it/s, loss=0.0218, lr=1.80e-05, step=4093] Training: 41%|████ | 4094/10000 [49:23<55:13, 1.78it/s, loss=0.0690, lr=1.80e-05, step=4094] Training: 41%|████ | 4095/10000 [49:23<53:29, 1.84it/s, loss=0.0690, lr=1.80e-05, step=4094] Training: 41%|████ | 4095/10000 [49:23<53:29, 1.84it/s, loss=0.0099, lr=1.79e-05, step=4095] Training: 41%|████ | 4096/10000 [49:24<52:15, 1.88it/s, loss=0.0099, lr=1.79e-05, step=4095] Training: 41%|████ | 4096/10000 [49:24<52:15, 1.88it/s, loss=0.0090, lr=1.79e-05, step=4096] Training: 41%|████ | 4097/10000 [49:25<57:42, 1.70it/s, loss=0.0090, lr=1.79e-05, step=4096] Training: 41%|████ | 4097/10000 [49:25<57:42, 1.70it/s, loss=0.0051, lr=1.79e-05, step=4097] Training: 41%|████ | 4098/10000 [49:25<54:40, 1.80it/s, loss=0.0051, lr=1.79e-05, step=4097] Training: 41%|████ | 4098/10000 [49:25<54:40, 1.80it/s, loss=0.0062, lr=1.79e-05, step=4098] Training: 41%|████ | 4099/10000 [49:26<59:50, 1.64it/s, loss=0.0062, lr=1.79e-05, step=4098] Training: 41%|████ | 4099/10000 [49:26<59:50, 1.64it/s, loss=0.0365, lr=1.79e-05, step=4099]16:55:33.220 [I] step=4100 loss=0.0086 smoothed_loss=0.0219 lr=1.79e-05 grad_norm=0.4804 step_time=0.5074s data_time=0.0627s it/s=1.754 eta_to_10000=3363.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0116 grad_action_out_proj=0.1214 grad_shared_expert=0.4463 (10775:train_pytorch.py:850) + Training: 41%|████ | 4100/10000 [49:26<57:14, 1.72it/s, loss=0.0365, lr=1.79e-05, step=4099] Training: 41%|████ | 4100/10000 [49:26<57:14, 1.72it/s, loss=0.0086, lr=1.79e-05, step=4100] Training: 41%|████ | 4101/10000 [49:27<55:38, 1.77it/s, loss=0.0086, lr=1.79e-05, step=4100] Training: 41%|████ | 4101/10000 [49:27<55:38, 1.77it/s, loss=0.0021, lr=1.79e-05, step=4101] Training: 41%|████ | 4102/10000 [49:27<53:48, 1.83it/s, loss=0.0021, lr=1.79e-05, step=4101] Training: 41%|████ | 4102/10000 [49:27<53:48, 1.83it/s, loss=0.0032, lr=1.79e-05, step=4102] Training: 41%|████ | 4103/10000 [49:28<52:28, 1.87it/s, loss=0.0032, lr=1.79e-05, step=4102] Training: 41%|████ | 4103/10000 [49:28<52:28, 1.87it/s, loss=0.0084, lr=1.79e-05, step=4103] Training: 41%|████ | 4104/10000 [49:29<58:15, 1.69it/s, loss=0.0084, lr=1.79e-05, step=4103] Training: 41%|████ | 4104/10000 [49:29<58:15, 1.69it/s, loss=0.0046, lr=1.79e-05, step=4104] Training: 41%|████ | 4105/10000 [49:29<55:27, 1.77it/s, loss=0.0046, lr=1.79e-05, step=4104] Training: 41%|████ | 4105/10000 [49:29<55:27, 1.77it/s, loss=0.0119, lr=1.79e-05, step=4105] Training: 41%|████ | 4106/10000 [49:30<59:24, 1.65it/s, loss=0.0119, lr=1.79e-05, step=4105] Training: 41%|████ | 4106/10000 [49:30<59:24, 1.65it/s, loss=0.0395, lr=1.79e-05, step=4106] Training: 41%|████ | 4107/10000 [49:30<58:29, 1.68it/s, loss=0.0395, lr=1.79e-05, step=4106] Training: 41%|████ | 4107/10000 [49:30<58:29, 1.68it/s, loss=0.0203, lr=1.79e-05, step=4107] Training: 41%|████ | 4108/10000 [49:31<55:36, 1.77it/s, loss=0.0203, lr=1.79e-05, step=4107] Training: 41%|████ | 4108/10000 [49:31<55:36, 1.77it/s, loss=0.0290, lr=1.79e-05, step=4108] Training: 41%|████ | 4109/10000 [49:31<53:44, 1.83it/s, loss=0.0290, lr=1.79e-05, step=4108] Training: 41%|████ | 4109/10000 [49:31<53:44, 1.83it/s, loss=0.0119, lr=1.79e-05, step=4109]16:55:38.795 [I] step=4110 loss=0.0120 smoothed_loss=0.0179 lr=1.79e-05 grad_norm=0.5039 step_time=0.4937s data_time=0.0638s it/s=1.794 eta_to_10000=3283.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0129 grad_action_out_proj=0.1512 grad_shared_expert=0.3741 (10775:train_pytorch.py:850) + Training: 41%|████ | 4110/10000 [49:32<53:26, 1.84it/s, loss=0.0119, lr=1.79e-05, step=4109] Training: 41%|████ | 4110/10000 [49:32<53:26, 1.84it/s, loss=0.0120, lr=1.79e-05, step=4110] Training: 41%|████ | 4111/10000 [49:33<58:52, 1.67it/s, loss=0.0120, lr=1.79e-05, step=4110] Training: 41%|████ | 4111/10000 [49:33<58:52, 1.67it/s, loss=0.0177, lr=1.79e-05, step=4111] Training: 41%|████ | 4112/10000 [49:33<55:19, 1.77it/s, loss=0.0177, lr=1.79e-05, step=4111] Training: 41%|████ | 4112/10000 [49:33<55:19, 1.77it/s, loss=0.0333, lr=1.79e-05, step=4112] Training: 41%|████ | 4113/10000 [49:34<52:42, 1.86it/s, loss=0.0333, lr=1.79e-05, step=4112] Training: 41%|████ | 4113/10000 [49:34<52:42, 1.86it/s, loss=0.0087, lr=1.79e-05, step=4113] Training: 41%|████ | 4114/10000 [49:34<57:26, 1.71it/s, loss=0.0087, lr=1.79e-05, step=4113] Training: 41%|████ | 4114/10000 [49:34<57:26, 1.71it/s, loss=0.0062, lr=1.79e-05, step=4114] Training: 41%|████ | 4115/10000 [49:35<55:07, 1.78it/s, loss=0.0062, lr=1.79e-05, step=4114] Training: 41%|████ | 4115/10000 [49:35<55:07, 1.78it/s, loss=0.0463, lr=1.79e-05, step=4115] Training: 41%|████ | 4116/10000 [49:35<53:16, 1.84it/s, loss=0.0463, lr=1.79e-05, step=4115] Training: 41%|████ | 4116/10000 [49:35<53:16, 1.84it/s, loss=0.0361, lr=1.79e-05, step=4116] Training: 41%|████ | 4117/10000 [49:36<59:11, 1.66it/s, loss=0.0361, lr=1.79e-05, step=4116] Training: 41%|████ | 4117/10000 [49:36<59:11, 1.66it/s, loss=0.0128, lr=1.79e-05, step=4117] Training: 41%|████ | 4118/10000 [49:36<56:13, 1.74it/s, loss=0.0128, lr=1.79e-05, step=4117] Training: 41%|████ | 4118/10000 [49:36<56:13, 1.74it/s, loss=0.0294, lr=1.79e-05, step=4118] Training: 41%|████ | 4119/10000 [49:37<54:45, 1.79it/s, loss=0.0294, lr=1.79e-05, step=4118] Training: 41%|████ | 4119/10000 [49:37<54:45, 1.79it/s, loss=0.0438, lr=1.79e-05, step=4119]16:55:44.491 [I] step=4120 loss=0.0096 smoothed_loss=0.0224 lr=1.79e-05 grad_norm=0.5036 step_time=0.5105s data_time=0.0591s it/s=1.756 eta_to_10000=3348.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0088 grad_action_out_proj=0.1200 grad_shared_expert=0.3243 (10775:train_pytorch.py:850) + Training: 41%|████ | 4120/10000 [49:38<54:04, 1.81it/s, loss=0.0438, lr=1.79e-05, step=4119] Training: 41%|████ | 4120/10000 [49:38<54:04, 1.81it/s, loss=0.0096, lr=1.79e-05, step=4120] Training: 41%|████ | 4121/10000 [49:38<59:08, 1.66it/s, loss=0.0096, lr=1.79e-05, step=4120] Training: 41%|████ | 4121/10000 [49:38<59:08, 1.66it/s, loss=0.0119, lr=1.79e-05, step=4121] Training: 41%|████ | 4122/10000 [49:39<56:40, 1.73it/s, loss=0.0119, lr=1.79e-05, step=4121] Training: 41%|████ | 4122/10000 [49:39<56:40, 1.73it/s, loss=0.0074, lr=1.79e-05, step=4122] Training: 41%|████ | 4123/10000 [49:39<54:44, 1.79it/s, loss=0.0074, lr=1.79e-05, step=4122] Training: 41%|████ | 4123/10000 [49:39<54:44, 1.79it/s, loss=0.0313, lr=1.78e-05, step=4123] Training: 41%|████ | 4124/10000 [49:40<59:28, 1.65it/s, loss=0.0313, lr=1.78e-05, step=4123] Training: 41%|████ | 4124/10000 [49:40<59:28, 1.65it/s, loss=0.0180, lr=1.78e-05, step=4124] Training: 41%|████▏ | 4125/10000 [49:41<56:27, 1.73it/s, loss=0.0180, lr=1.78e-05, step=4124] Training: 41%|████▏ | 4125/10000 [49:41<56:27, 1.73it/s, loss=0.0117, lr=1.78e-05, step=4125] Training: 41%|████▏ | 4126/10000 [49:41<54:01, 1.81it/s, loss=0.0117, lr=1.78e-05, step=4125] Training: 41%|████▏ | 4126/10000 [49:41<54:01, 1.81it/s, loss=0.0128, lr=1.78e-05, step=4126] Training: 41%|████▏ | 4127/10000 [49:42<52:43, 1.86it/s, loss=0.0128, lr=1.78e-05, step=4126] Training: 41%|████▏ | 4127/10000 [49:42<52:43, 1.86it/s, loss=0.0187, lr=1.78e-05, step=4127] Training: 41%|████▏ | 4128/10000 [49:42<57:41, 1.70it/s, loss=0.0187, lr=1.78e-05, step=4127] Training: 41%|████▏ | 4128/10000 [49:42<57:41, 1.70it/s, loss=0.0062, lr=1.78e-05, step=4128] Training: 41%|████▏ | 4129/10000 [49:43<54:30, 1.80it/s, loss=0.0062, lr=1.78e-05, step=4128] Training: 41%|████▏ | 4129/10000 [49:43<54:30, 1.80it/s, loss=0.0086, lr=1.78e-05, step=4129]16:55:50.191 [I] step=4130 loss=0.0146 smoothed_loss=0.0167 lr=1.78e-05 grad_norm=0.4535 step_time=0.5108s data_time=0.0592s it/s=1.755 eta_to_10000=3345.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.1239 grad_shared_expert=0.4210 (10775:train_pytorch.py:850) + Training: 41%|████▏ | 4130/10000 [49:43<53:33, 1.83it/s, loss=0.0086, lr=1.78e-05, step=4129] Training: 41%|████▏ | 4130/10000 [49:43<53:33, 1.83it/s, loss=0.0146, lr=1.78e-05, step=4130] Training: 41%|████▏ | 4131/10000 [49:44<51:34, 1.90it/s, loss=0.0146, lr=1.78e-05, step=4130] Training: 41%|████▏ | 4131/10000 [49:44<51:34, 1.90it/s, loss=0.0078, lr=1.78e-05, step=4131] Training: 41%|████▏ | 4132/10000 [49:44<56:43, 1.72it/s, loss=0.0078, lr=1.78e-05, step=4131] Training: 41%|████▏ | 4132/10000 [49:44<56:43, 1.72it/s, loss=0.0030, lr=1.78e-05, step=4132] Training: 41%|████▏ | 4133/10000 [49:45<54:15, 1.80it/s, loss=0.0030, lr=1.78e-05, step=4132] Training: 41%|████▏ | 4133/10000 [49:45<54:15, 1.80it/s, loss=0.0332, lr=1.78e-05, step=4133] Training: 41%|████▏ | 4134/10000 [49:45<52:34, 1.86it/s, loss=0.0332, lr=1.78e-05, step=4133] Training: 41%|████▏ | 4134/10000 [49:45<52:34, 1.86it/s, loss=0.0185, lr=1.78e-05, step=4134] Training: 41%|████▏ | 4135/10000 [49:46<57:38, 1.70it/s, loss=0.0185, lr=1.78e-05, step=4134] Training: 41%|████▏ | 4135/10000 [49:46<57:38, 1.70it/s, loss=0.0351, lr=1.78e-05, step=4135] Training: 41%|████▏ | 4136/10000 [49:47<55:00, 1.78it/s, loss=0.0351, lr=1.78e-05, step=4135] Training: 41%|████▏ | 4136/10000 [49:47<55:00, 1.78it/s, loss=0.0115, lr=1.78e-05, step=4136] Training: 41%|████▏ | 4137/10000 [49:47<53:02, 1.84it/s, loss=0.0115, lr=1.78e-05, step=4136] Training: 41%|████▏ | 4137/10000 [49:47<53:02, 1.84it/s, loss=0.0127, lr=1.78e-05, step=4137] Training: 41%|████▏ | 4138/10000 [49:48<51:47, 1.89it/s, loss=0.0127, lr=1.78e-05, step=4137] Training: 41%|████▏ | 4138/10000 [49:48<51:47, 1.89it/s, loss=0.0104, lr=1.78e-05, step=4138] Training: 41%|████▏ | 4139/10000 [49:48<51:37, 1.89it/s, loss=0.0104, lr=1.78e-05, step=4138] Training: 41%|████▏ | 4139/10000 [49:48<51:37, 1.89it/s, loss=0.0062, lr=1.78e-05, step=4139]16:55:55.855 [I] step=4140 loss=0.0249 smoothed_loss=0.0165 lr=1.78e-05 grad_norm=0.4533 step_time=0.5048s data_time=0.0617s it/s=1.766 eta_to_10000=3318.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0290 grad_action_out_proj=0.2231 grad_shared_expert=0.5815 (10775:train_pytorch.py:850) + Training: 41%|████▏ | 4140/10000 [49:49<58:15, 1.68it/s, loss=0.0062, lr=1.78e-05, step=4139] Training: 41%|████▏ | 4140/10000 [49:49<58:15, 1.68it/s, loss=0.0249, lr=1.78e-05, step=4140] Training: 41%|████▏ | 4141/10000 [49:49<55:29, 1.76it/s, loss=0.0249, lr=1.78e-05, step=4140] Training: 41%|████▏ | 4141/10000 [49:49<55:29, 1.76it/s, loss=0.0234, lr=1.78e-05, step=4141] Training: 41%|████▏ | 4142/10000 [49:50<59:50, 1.63it/s, loss=0.0234, lr=1.78e-05, step=4141] Training: 41%|████▏ | 4142/10000 [49:50<59:50, 1.63it/s, loss=0.0155, lr=1.78e-05, step=4142] Training: 41%|████▏ | 4143/10000 [49:51<58:16, 1.67it/s, loss=0.0155, lr=1.78e-05, step=4142] Training: 41%|████▏ | 4143/10000 [49:51<58:16, 1.67it/s, loss=0.0083, lr=1.78e-05, step=4143] Training: 41%|████▏ | 4144/10000 [49:51<55:06, 1.77it/s, loss=0.0083, lr=1.78e-05, step=4143] Training: 41%|████▏ | 4144/10000 [49:51<55:06, 1.77it/s, loss=0.0079, lr=1.78e-05, step=4144] Training: 41%|████▏ | 4145/10000 [49:52<53:15, 1.83it/s, loss=0.0079, lr=1.78e-05, step=4144] Training: 41%|████▏ | 4145/10000 [49:52<53:15, 1.83it/s, loss=0.0053, lr=1.78e-05, step=4145] Training: 41%|████▏ | 4146/10000 [49:52<51:40, 1.89it/s, loss=0.0053, lr=1.78e-05, step=4145] Training: 41%|████▏ | 4146/10000 [49:52<51:40, 1.89it/s, loss=0.0136, lr=1.78e-05, step=4146] Training: 41%|████▏ | 4147/10000 [49:53<56:58, 1.71it/s, loss=0.0136, lr=1.78e-05, step=4146] Training: 41%|████▏ | 4147/10000 [49:53<56:58, 1.71it/s, loss=0.0020, lr=1.78e-05, step=4147] Training: 41%|████▏ | 4148/10000 [49:53<53:57, 1.81it/s, loss=0.0020, lr=1.78e-05, step=4147] Training: 41%|████▏ | 4148/10000 [49:53<53:57, 1.81it/s, loss=0.0130, lr=1.78e-05, step=4148] Training: 41%|████▏ | 4149/10000 [49:54<1:02:09, 1.57it/s, loss=0.0130, lr=1.78e-05, step=4148] Training: 41%|████▏ | 4149/10000 [49:54<1:02:09, 1.57it/s, loss=0.0173, lr=1.78e-05, step=4149]16:56:01.735 [I] step=4150 loss=0.0209 smoothed_loss=0.0142 lr=1.78e-05 grad_norm=0.4245 step_time=0.5218s data_time=0.0663s it/s=1.701 eta_to_10000=3438.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0212 grad_action_out_proj=0.1055 grad_shared_expert=0.3459 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4150/10000 [49:55<1:00:48, 1.60it/s, loss=0.0173, lr=1.78e-05, step=4149] Training: 42%|████▏ | 4150/10000 [49:55<1:00:48, 1.60it/s, loss=0.0209, lr=1.78e-05, step=4150] Training: 42%|████▏ | 4151/10000 [49:55<57:36, 1.69it/s, loss=0.0209, lr=1.78e-05, step=4150] Training: 42%|████▏ | 4151/10000 [49:55<57:36, 1.69it/s, loss=0.0103, lr=1.78e-05, step=4151] Training: 42%|████▏ | 4152/10000 [49:56<54:57, 1.77it/s, loss=0.0103, lr=1.78e-05, step=4151] Training: 42%|████▏ | 4152/10000 [49:56<54:57, 1.77it/s, loss=0.0159, lr=1.77e-05, step=4152] Training: 42%|████▏ | 4153/10000 [49:56<53:05, 1.84it/s, loss=0.0159, lr=1.77e-05, step=4152] Training: 42%|████▏ | 4153/10000 [49:56<53:05, 1.84it/s, loss=0.0064, lr=1.77e-05, step=4153] Training: 42%|████▏ | 4154/10000 [49:57<51:50, 1.88it/s, loss=0.0064, lr=1.77e-05, step=4153] Training: 42%|████▏ | 4154/10000 [49:57<51:50, 1.88it/s, loss=0.0382, lr=1.77e-05, step=4154] Training: 42%|████▏ | 4155/10000 [49:58<58:41, 1.66it/s, loss=0.0382, lr=1.77e-05, step=4154] Training: 42%|████▏ | 4155/10000 [49:58<58:41, 1.66it/s, loss=0.0051, lr=1.77e-05, step=4155] Training: 42%|████▏ | 4156/10000 [49:58<55:33, 1.75it/s, loss=0.0051, lr=1.77e-05, step=4155] Training: 42%|████▏ | 4156/10000 [49:58<55:33, 1.75it/s, loss=0.0217, lr=1.77e-05, step=4156] Training: 42%|████▏ | 4157/10000 [49:59<59:55, 1.63it/s, loss=0.0217, lr=1.77e-05, step=4156] Training: 42%|████▏ | 4157/10000 [49:59<59:55, 1.63it/s, loss=0.0092, lr=1.77e-05, step=4157] Training: 42%|████▏ | 4158/10000 [49:59<57:05, 1.71it/s, loss=0.0092, lr=1.77e-05, step=4157] Training: 42%|████▏ | 4158/10000 [49:59<57:05, 1.71it/s, loss=0.0522, lr=1.77e-05, step=4158] Training: 42%|████▏ | 4159/10000 [50:00<54:47, 1.78it/s, loss=0.0522, lr=1.77e-05, step=4158] Training: 42%|████▏ | 4159/10000 [50:00<54:47, 1.78it/s, loss=0.0709, lr=1.77e-05, step=4159]16:56:07.313 [I] step=4160 loss=0.0182 smoothed_loss=0.0232 lr=1.77e-05 grad_norm=0.4779 step_time=0.4963s data_time=0.0614s it/s=1.793 eta_to_10000=3256.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1398 grad_shared_expert=0.5803 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4160/10000 [50:00<54:25, 1.79it/s, loss=0.0709, lr=1.77e-05, step=4159] Training: 42%|████▏ | 4160/10000 [50:00<54:25, 1.79it/s, loss=0.0182, lr=1.77e-05, step=4160] Training: 42%|████▏ | 4161/10000 [50:01<53:06, 1.83it/s, loss=0.0182, lr=1.77e-05, step=4160] Training: 42%|████▏ | 4161/10000 [50:01<53:06, 1.83it/s, loss=0.0176, lr=1.77e-05, step=4161] Training: 42%|████▏ | 4162/10000 [50:02<57:48, 1.68it/s, loss=0.0176, lr=1.77e-05, step=4161] Training: 42%|████▏ | 4162/10000 [50:02<57:48, 1.68it/s, loss=0.1028, lr=1.77e-05, step=4162] Training: 42%|████▏ | 4163/10000 [50:02<54:51, 1.77it/s, loss=0.1028, lr=1.77e-05, step=4162] Training: 42%|████▏ | 4163/10000 [50:02<54:51, 1.77it/s, loss=0.0109, lr=1.77e-05, step=4163] Training: 42%|████▏ | 4164/10000 [50:03<59:32, 1.63it/s, loss=0.0109, lr=1.77e-05, step=4163] Training: 42%|████▏ | 4164/10000 [50:03<59:32, 1.63it/s, loss=0.0559, lr=1.77e-05, step=4164] Training: 42%|████▏ | 4165/10000 [50:03<56:31, 1.72it/s, loss=0.0559, lr=1.77e-05, step=4164] Training: 42%|████▏ | 4165/10000 [50:03<56:31, 1.72it/s, loss=0.0110, lr=1.77e-05, step=4165] Training: 42%|████▏ | 4166/10000 [50:04<54:23, 1.79it/s, loss=0.0110, lr=1.77e-05, step=4165] Training: 42%|████▏ | 4166/10000 [50:04<54:23, 1.79it/s, loss=0.0188, lr=1.77e-05, step=4166] Training: 42%|████▏ | 4167/10000 [50:04<54:50, 1.77it/s, loss=0.0188, lr=1.77e-05, step=4166] Training: 42%|████▏ | 4167/10000 [50:04<54:50, 1.77it/s, loss=0.0048, lr=1.77e-05, step=4167] Training: 42%|████▏ | 4168/10000 [50:05<53:20, 1.82it/s, loss=0.0048, lr=1.77e-05, step=4167] Training: 42%|████▏ | 4168/10000 [50:05<53:20, 1.82it/s, loss=0.0321, lr=1.77e-05, step=4168] Training: 42%|████▏ | 4169/10000 [50:06<58:02, 1.67it/s, loss=0.0321, lr=1.77e-05, step=4168] Training: 42%|████▏ | 4169/10000 [50:06<58:02, 1.67it/s, loss=0.0138, lr=1.77e-05, step=4169]16:56:13.095 [I] step=4170 loss=0.0293 smoothed_loss=0.0257 lr=1.77e-05 grad_norm=0.5204 step_time=0.5151s data_time=0.0631s it/s=1.730 eta_to_10000=3370.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0189 grad_action_out_proj=0.1730 grad_shared_expert=0.4758 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4170/10000 [50:06<55:58, 1.74it/s, loss=0.0138, lr=1.77e-05, step=4169] Training: 42%|████▏ | 4170/10000 [50:06<55:58, 1.74it/s, loss=0.0293, lr=1.77e-05, step=4170] Training: 42%|████▏ | 4171/10000 [50:07<1:00:12, 1.61it/s, loss=0.0293, lr=1.77e-05, step=4170] Training: 42%|████▏ | 4171/10000 [50:07<1:00:12, 1.61it/s, loss=0.0172, lr=1.77e-05, step=4171] Training: 42%|████▏ | 4172/10000 [50:07<56:22, 1.72it/s, loss=0.0172, lr=1.77e-05, step=4171] Training: 42%|████▏ | 4172/10000 [50:07<56:22, 1.72it/s, loss=0.0221, lr=1.77e-05, step=4172] Training: 42%|████▏ | 4173/10000 [50:08<53:42, 1.81it/s, loss=0.0221, lr=1.77e-05, step=4172] Training: 42%|████▏ | 4173/10000 [50:08<53:42, 1.81it/s, loss=0.0066, lr=1.77e-05, step=4173] Training: 42%|████▏ | 4174/10000 [50:08<52:01, 1.87it/s, loss=0.0066, lr=1.77e-05, step=4173] Training: 42%|████▏ | 4174/10000 [50:08<52:01, 1.87it/s, loss=0.0221, lr=1.77e-05, step=4174] Training: 42%|████▏ | 4175/10000 [50:09<50:58, 1.90it/s, loss=0.0221, lr=1.77e-05, step=4174] Training: 42%|████▏ | 4175/10000 [50:09<50:58, 1.90it/s, loss=0.0049, lr=1.77e-05, step=4175] Training: 42%|████▏ | 4176/10000 [50:10<57:29, 1.69it/s, loss=0.0049, lr=1.77e-05, step=4175] Training: 42%|████▏ | 4176/10000 [50:10<57:29, 1.69it/s, loss=0.0340, lr=1.77e-05, step=4176] Training: 42%|████▏ | 4177/10000 [50:10<54:46, 1.77it/s, loss=0.0340, lr=1.77e-05, step=4176] Training: 42%|████▏ | 4177/10000 [50:10<54:46, 1.77it/s, loss=0.0109, lr=1.77e-05, step=4177] Training: 42%|████▏ | 4178/10000 [50:11<59:22, 1.63it/s, loss=0.0109, lr=1.77e-05, step=4177] Training: 42%|████▏ | 4178/10000 [50:11<59:22, 1.63it/s, loss=0.0460, lr=1.77e-05, step=4178] Training: 42%|████▏ | 4179/10000 [50:11<56:15, 1.72it/s, loss=0.0460, lr=1.77e-05, step=4178] Training: 42%|████▏ | 4179/10000 [50:11<56:15, 1.72it/s, loss=0.0073, lr=1.77e-05, step=4179]16:56:18.802 [I] step=4180 loss=0.0087 smoothed_loss=0.0206 lr=1.77e-05 grad_norm=0.5061 step_time=0.5115s data_time=0.0592s it/s=1.753 eta_to_10000=3320.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0112 grad_action_out_proj=0.1514 grad_shared_expert=0.5149 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4180/10000 [50:12<54:57, 1.77it/s, loss=0.0073, lr=1.77e-05, step=4179] Training: 42%|████▏ | 4180/10000 [50:12<54:57, 1.77it/s, loss=0.0087, lr=1.77e-05, step=4180] Training: 42%|████▏ | 4181/10000 [50:12<52:26, 1.85it/s, loss=0.0087, lr=1.77e-05, step=4180] Training: 42%|████▏ | 4181/10000 [50:12<52:26, 1.85it/s, loss=0.0365, lr=1.76e-05, step=4181] Training: 42%|████▏ | 4182/10000 [50:13<50:50, 1.91it/s, loss=0.0365, lr=1.76e-05, step=4181] Training: 42%|████▏ | 4182/10000 [50:13<50:50, 1.91it/s, loss=0.0253, lr=1.76e-05, step=4182] Training: 42%|████▏ | 4183/10000 [50:14<59:17, 1.64it/s, loss=0.0253, lr=1.76e-05, step=4182] Training: 42%|████▏ | 4183/10000 [50:14<59:17, 1.64it/s, loss=0.0215, lr=1.76e-05, step=4183] Training: 42%|████▏ | 4184/10000 [50:14<57:03, 1.70it/s, loss=0.0215, lr=1.76e-05, step=4183] Training: 42%|████▏ | 4184/10000 [50:14<57:03, 1.70it/s, loss=0.0097, lr=1.76e-05, step=4184] Training: 42%|████▏ | 4185/10000 [50:15<1:00:30, 1.60it/s, loss=0.0097, lr=1.76e-05, step=4184] Training: 42%|████▏ | 4185/10000 [50:15<1:00:30, 1.60it/s, loss=0.0073, lr=1.76e-05, step=4185] Training: 42%|████▏ | 4186/10000 [50:15<57:34, 1.68it/s, loss=0.0073, lr=1.76e-05, step=4185] Training: 42%|████▏ | 4186/10000 [50:15<57:34, 1.68it/s, loss=0.0126, lr=1.76e-05, step=4186] Training: 42%|████▏ | 4187/10000 [50:16<56:57, 1.70it/s, loss=0.0126, lr=1.76e-05, step=4186] Training: 42%|████▏ | 4187/10000 [50:16<56:57, 1.70it/s, loss=0.0075, lr=1.76e-05, step=4187] Training: 42%|████▏ | 4188/10000 [50:17<56:01, 1.73it/s, loss=0.0075, lr=1.76e-05, step=4187] Training: 42%|████▏ | 4188/10000 [50:17<56:01, 1.73it/s, loss=0.0022, lr=1.76e-05, step=4188] Training: 42%|████▏ | 4189/10000 [50:17<53:25, 1.81it/s, loss=0.0022, lr=1.76e-05, step=4188] Training: 42%|████▏ | 4189/10000 [50:17<53:25, 1.81it/s, loss=0.0088, lr=1.76e-05, step=4189]16:56:24.713 [I] step=4190 loss=0.0340 smoothed_loss=0.0174 lr=1.76e-05 grad_norm=0.4803 step_time=0.5312s data_time=0.0599s it/s=1.692 eta_to_10000=3434.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.0988 grad_shared_expert=0.3710 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4190/10000 [50:18<59:01, 1.64it/s, loss=0.0088, lr=1.76e-05, step=4189] Training: 42%|████▏ | 4190/10000 [50:18<59:01, 1.64it/s, loss=0.0340, lr=1.76e-05, step=4190] Training: 42%|████▏ | 4191/10000 [50:18<56:18, 1.72it/s, loss=0.0340, lr=1.76e-05, step=4190] Training: 42%|████▏ | 4191/10000 [50:18<56:18, 1.72it/s, loss=0.0065, lr=1.76e-05, step=4191] Training: 42%|████▏ | 4192/10000 [50:19<1:00:17, 1.61it/s, loss=0.0065, lr=1.76e-05, step=4191] Training: 42%|████▏ | 4192/10000 [50:19<1:00:17, 1.61it/s, loss=0.0322, lr=1.76e-05, step=4192] Training: 42%|████▏ | 4193/10000 [50:20<58:41, 1.65it/s, loss=0.0322, lr=1.76e-05, step=4192] Training: 42%|████▏ | 4193/10000 [50:20<58:41, 1.65it/s, loss=0.0360, lr=1.76e-05, step=4193] Training: 42%|████▏ | 4194/10000 [50:20<57:18, 1.69it/s, loss=0.0360, lr=1.76e-05, step=4193] Training: 42%|████▏ | 4194/10000 [50:20<57:18, 1.69it/s, loss=0.0138, lr=1.76e-05, step=4194] Training: 42%|████▏ | 4195/10000 [50:21<54:07, 1.79it/s, loss=0.0138, lr=1.76e-05, step=4194] Training: 42%|████▏ | 4195/10000 [50:21<54:07, 1.79it/s, loss=0.0132, lr=1.76e-05, step=4195] Training: 42%|████▏ | 4196/10000 [50:21<52:17, 1.85it/s, loss=0.0132, lr=1.76e-05, step=4195] Training: 42%|████▏ | 4196/10000 [50:21<52:17, 1.85it/s, loss=0.0117, lr=1.76e-05, step=4196] Training: 42%|████▏ | 4197/10000 [50:22<59:38, 1.62it/s, loss=0.0117, lr=1.76e-05, step=4196] Training: 42%|████▏ | 4197/10000 [50:22<59:38, 1.62it/s, loss=0.0220, lr=1.76e-05, step=4197] Training: 42%|████▏ | 4198/10000 [50:22<56:35, 1.71it/s, loss=0.0220, lr=1.76e-05, step=4197] Training: 42%|████▏ | 4198/10000 [50:22<56:35, 1.71it/s, loss=0.0050, lr=1.76e-05, step=4198] Training: 42%|████▏ | 4199/10000 [50:23<54:41, 1.77it/s, loss=0.0050, lr=1.76e-05, step=4198] Training: 42%|████▏ | 4199/10000 [50:23<54:41, 1.77it/s, loss=0.0028, lr=1.76e-05, step=4199]16:56:30.663 [I] step=4200 loss=0.0044 smoothed_loss=0.0144 lr=1.76e-05 grad_norm=0.4731 step_time=0.5338s data_time=0.0613s it/s=1.681 eta_to_10000=3450.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0197 grad_action_out_proj=0.1710 grad_shared_expert=0.5710 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4200/10000 [50:24<1:00:57, 1.59it/s, loss=0.0028, lr=1.76e-05, step=4199] Training: 42%|████▏ | 4200/10000 [50:24<1:00:57, 1.59it/s, loss=0.0044, lr=1.76e-05, step=4200] Training: 42%|████▏ | 4201/10000 [50:24<57:33, 1.68it/s, loss=0.0044, lr=1.76e-05, step=4200] Training: 42%|████▏ | 4201/10000 [50:24<57:33, 1.68it/s, loss=0.0218, lr=1.76e-05, step=4201] Training: 42%|████▏ | 4202/10000 [50:25<54:41, 1.77it/s, loss=0.0218, lr=1.76e-05, step=4201] Training: 42%|████▏ | 4202/10000 [50:25<54:41, 1.77it/s, loss=0.0080, lr=1.76e-05, step=4202] Training: 42%|████▏ | 4203/10000 [50:25<52:26, 1.84it/s, loss=0.0080, lr=1.76e-05, step=4202] Training: 42%|████▏ | 4203/10000 [50:25<52:26, 1.84it/s, loss=0.0312, lr=1.76e-05, step=4203] Training: 42%|████▏ | 4204/10000 [50:26<51:01, 1.89it/s, loss=0.0312, lr=1.76e-05, step=4203] Training: 42%|████▏ | 4204/10000 [50:26<51:01, 1.89it/s, loss=0.0112, lr=1.76e-05, step=4204] Training: 42%|████▏ | 4205/10000 [50:26<56:40, 1.70it/s, loss=0.0112, lr=1.76e-05, step=4204] Training: 42%|████▏ | 4205/10000 [50:26<56:40, 1.70it/s, loss=0.0133, lr=1.76e-05, step=4205] Training: 42%|████▏ | 4206/10000 [50:27<53:49, 1.79it/s, loss=0.0133, lr=1.76e-05, step=4205] Training: 42%|████▏ | 4206/10000 [50:27<53:49, 1.79it/s, loss=0.0033, lr=1.76e-05, step=4206] Training: 42%|████▏ | 4207/10000 [50:28<59:00, 1.64it/s, loss=0.0033, lr=1.76e-05, step=4206] Training: 42%|████▏ | 4207/10000 [50:28<59:00, 1.64it/s, loss=0.0101, lr=1.76e-05, step=4207] Training: 42%|████▏ | 4208/10000 [50:28<59:28, 1.62it/s, loss=0.0101, lr=1.76e-05, step=4207] Training: 42%|████▏ | 4208/10000 [50:28<59:28, 1.62it/s, loss=0.0238, lr=1.76e-05, step=4208] Training: 42%|████▏ | 4209/10000 [50:29<55:58, 1.72it/s, loss=0.0238, lr=1.76e-05, step=4208] Training: 42%|████▏ | 4209/10000 [50:29<55:58, 1.72it/s, loss=0.0282, lr=1.76e-05, step=4209]16:56:36.256 [I] step=4210 loss=0.0154 smoothed_loss=0.0161 lr=1.76e-05 grad_norm=0.4837 step_time=0.4967s data_time=0.0626s it/s=1.788 eta_to_10000=3237.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0967 grad_shared_expert=0.4065 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4210/10000 [50:29<54:27, 1.77it/s, loss=0.0282, lr=1.76e-05, step=4209] Training: 42%|████▏ | 4210/10000 [50:29<54:27, 1.77it/s, loss=0.0154, lr=1.75e-05, step=4210] Training: 42%|████▏ | 4211/10000 [50:30<52:36, 1.83it/s, loss=0.0154, lr=1.75e-05, step=4210] Training: 42%|████▏ | 4211/10000 [50:30<52:36, 1.83it/s, loss=0.0230, lr=1.75e-05, step=4211] Training: 42%|████▏ | 4212/10000 [50:31<57:24, 1.68it/s, loss=0.0230, lr=1.75e-05, step=4211] Training: 42%|████▏ | 4212/10000 [50:31<57:24, 1.68it/s, loss=0.0182, lr=1.75e-05, step=4212] Training: 42%|████▏ | 4213/10000 [50:31<54:23, 1.77it/s, loss=0.0182, lr=1.75e-05, step=4212] Training: 42%|████▏ | 4213/10000 [50:31<54:23, 1.77it/s, loss=0.0086, lr=1.75e-05, step=4213] Training: 42%|████▏ | 4214/10000 [50:32<58:37, 1.65it/s, loss=0.0086, lr=1.75e-05, step=4213] Training: 42%|████▏ | 4214/10000 [50:32<58:37, 1.65it/s, loss=0.0102, lr=1.75e-05, step=4214] Training: 42%|████▏ | 4215/10000 [50:32<55:24, 1.74it/s, loss=0.0102, lr=1.75e-05, step=4214] Training: 42%|████▏ | 4215/10000 [50:32<55:24, 1.74it/s, loss=0.0338, lr=1.75e-05, step=4215] Training: 42%|████▏ | 4216/10000 [50:33<54:22, 1.77it/s, loss=0.0338, lr=1.75e-05, step=4215] Training: 42%|████▏ | 4216/10000 [50:33<54:22, 1.77it/s, loss=0.0068, lr=1.75e-05, step=4216] Training: 42%|████▏ | 4217/10000 [50:33<51:52, 1.86it/s, loss=0.0068, lr=1.75e-05, step=4216] Training: 42%|████▏ | 4217/10000 [50:33<51:52, 1.86it/s, loss=0.0090, lr=1.75e-05, step=4217] Training: 42%|████▏ | 4218/10000 [50:34<51:28, 1.87it/s, loss=0.0090, lr=1.75e-05, step=4217] Training: 42%|████▏ | 4218/10000 [50:34<51:28, 1.87it/s, loss=0.0311, lr=1.75e-05, step=4218] Training: 42%|████▏ | 4219/10000 [50:34<53:16, 1.81it/s, loss=0.0311, lr=1.75e-05, step=4218] Training: 42%|████▏ | 4219/10000 [50:34<53:16, 1.81it/s, loss=0.0067, lr=1.75e-05, step=4219]16:56:42.050 [I] step=4220 loss=0.0069 smoothed_loss=0.0151 lr=1.75e-05 grad_norm=0.4797 step_time=0.5086s data_time=0.0708s it/s=1.727 eta_to_10000=3347.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.1518 grad_shared_expert=0.3755 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4220/10000 [50:35<58:46, 1.64it/s, loss=0.0067, lr=1.75e-05, step=4219] Training: 42%|████▏ | 4220/10000 [50:35<58:46, 1.64it/s, loss=0.0069, lr=1.75e-05, step=4220] Training: 42%|████▏ | 4221/10000 [50:36<55:46, 1.73it/s, loss=0.0069, lr=1.75e-05, step=4220] Training: 42%|████▏ | 4221/10000 [50:36<55:46, 1.73it/s, loss=0.0035, lr=1.75e-05, step=4221] Training: 42%|████▏ | 4222/10000 [50:36<59:36, 1.62it/s, loss=0.0035, lr=1.75e-05, step=4221] Training: 42%|████▏ | 4222/10000 [50:36<59:36, 1.62it/s, loss=0.0073, lr=1.75e-05, step=4222] Training: 42%|████▏ | 4223/10000 [50:37<57:04, 1.69it/s, loss=0.0073, lr=1.75e-05, step=4222] Training: 42%|████▏ | 4223/10000 [50:37<57:04, 1.69it/s, loss=0.0076, lr=1.75e-05, step=4223] Training: 42%|████▏ | 4224/10000 [50:37<53:45, 1.79it/s, loss=0.0076, lr=1.75e-05, step=4223] Training: 42%|████▏ | 4224/10000 [50:37<53:45, 1.79it/s, loss=0.0118, lr=1.75e-05, step=4224] Training: 42%|████▏ | 4225/10000 [50:38<51:29, 1.87it/s, loss=0.0118, lr=1.75e-05, step=4224] Training: 42%|████▏ | 4225/10000 [50:38<51:29, 1.87it/s, loss=0.0032, lr=1.75e-05, step=4225] Training: 42%|████▏ | 4226/10000 [50:38<53:43, 1.79it/s, loss=0.0032, lr=1.75e-05, step=4225] Training: 42%|████▏ | 4226/10000 [50:38<53:43, 1.79it/s, loss=0.0376, lr=1.75e-05, step=4226] Training: 42%|████▏ | 4227/10000 [50:39<58:32, 1.64it/s, loss=0.0376, lr=1.75e-05, step=4226] Training: 42%|████▏ | 4227/10000 [50:39<58:32, 1.64it/s, loss=0.0263, lr=1.75e-05, step=4227] Training: 42%|████▏ | 4228/10000 [50:40<55:17, 1.74it/s, loss=0.0263, lr=1.75e-05, step=4227] Training: 42%|████▏ | 4228/10000 [50:40<55:17, 1.74it/s, loss=0.0134, lr=1.75e-05, step=4228] Training: 42%|████▏ | 4229/10000 [50:40<59:34, 1.61it/s, loss=0.0134, lr=1.75e-05, step=4228] Training: 42%|████▏ | 4229/10000 [50:40<59:34, 1.61it/s, loss=0.0264, lr=1.75e-05, step=4229]16:56:47.894 [I] step=4230 loss=0.0120 smoothed_loss=0.0159 lr=1.75e-05 grad_norm=0.4357 step_time=0.5204s data_time=0.0639s it/s=1.711 eta_to_10000=3371.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0179 grad_action_out_proj=0.1696 grad_shared_expert=0.3652 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4230/10000 [50:41<58:21, 1.65it/s, loss=0.0264, lr=1.75e-05, step=4229] Training: 42%|████▏ | 4230/10000 [50:41<58:21, 1.65it/s, loss=0.0120, lr=1.75e-05, step=4230] Training: 42%|████▏ | 4231/10000 [50:42<1:02:04, 1.55it/s, loss=0.0120, lr=1.75e-05, step=4230] Training: 42%|████▏ | 4231/10000 [50:42<1:02:04, 1.55it/s, loss=0.0643, lr=1.75e-05, step=4231] Training: 42%|████▏ | 4232/10000 [50:42<1:04:28, 1.49it/s, loss=0.0643, lr=1.75e-05, step=4231] Training: 42%|████▏ | 4232/10000 [50:42<1:04:28, 1.49it/s, loss=0.0135, lr=1.75e-05, step=4232] Training: 42%|████▏ | 4233/10000 [50:43<1:01:28, 1.56it/s, loss=0.0135, lr=1.75e-05, step=4232] Training: 42%|████▏ | 4233/10000 [50:43<1:01:28, 1.56it/s, loss=0.0111, lr=1.75e-05, step=4233] Training: 42%|████▏ | 4234/10000 [50:44<1:00:38, 1.58it/s, loss=0.0111, lr=1.75e-05, step=4233] Training: 42%|████▏ | 4234/10000 [50:44<1:00:38, 1.58it/s, loss=0.0088, lr=1.75e-05, step=4234] Training: 42%|████▏ | 4235/10000 [50:44<1:03:49, 1.51it/s, loss=0.0088, lr=1.75e-05, step=4234] Training: 42%|████▏ | 4235/10000 [50:44<1:03:49, 1.51it/s, loss=0.0093, lr=1.75e-05, step=4235] Training: 42%|████▏ | 4236/10000 [50:45<58:48, 1.63it/s, loss=0.0093, lr=1.75e-05, step=4235] Training: 42%|████▏ | 4236/10000 [50:45<58:48, 1.63it/s, loss=0.0282, lr=1.75e-05, step=4236] Training: 42%|████▏ | 4237/10000 [50:46<1:01:08, 1.57it/s, loss=0.0282, lr=1.75e-05, step=4236] Training: 42%|████▏ | 4237/10000 [50:46<1:01:08, 1.57it/s, loss=0.0065, lr=1.75e-05, step=4237] Training: 42%|████▏ | 4238/10000 [50:46<1:05:12, 1.47it/s, loss=0.0065, lr=1.75e-05, step=4237] Training: 42%|████▏ | 4238/10000 [50:46<1:05:12, 1.47it/s, loss=0.0054, lr=1.74e-05, step=4238] Training: 42%|████▏ | 4239/10000 [50:47<1:05:36, 1.46it/s, loss=0.0054, lr=1.74e-05, step=4238] Training: 42%|████▏ | 4239/10000 [50:47<1:05:36, 1.46it/s, loss=0.0167, lr=1.74e-05, step=4239]16:56:54.603 [I] step=4240 loss=0.0189 smoothed_loss=0.0163 lr=1.75e-05 grad_norm=0.4293 step_time=0.5389s data_time=0.1320s it/s=1.491 eta_to_10000=3863.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0288 grad_action_out_proj=0.1946 grad_shared_expert=0.5475 (10775:train_pytorch.py:850) + Training: 42%|████▏ | 4240/10000 [50:48<1:05:09, 1.47it/s, loss=0.0167, lr=1.74e-05, step=4239] Training: 42%|████▏ | 4240/10000 [50:48<1:05:09, 1.47it/s, loss=0.0189, lr=1.74e-05, step=4240] Training: 42%|████▏ | 4241/10000 [50:48<1:01:37, 1.56it/s, loss=0.0189, lr=1.74e-05, step=4240] Training: 42%|████▏ | 4241/10000 [50:48<1:01:37, 1.56it/s, loss=0.0106, lr=1.74e-05, step=4241] Training: 42%|████▏ | 4242/10000 [50:49<1:07:22, 1.42it/s, loss=0.0106, lr=1.74e-05, step=4241] Training: 42%|████▏ | 4242/10000 [50:49<1:07:22, 1.42it/s, loss=0.0130, lr=1.74e-05, step=4242] Training: 42%|████▏ | 4243/10000 [50:50<1:08:20, 1.40it/s, loss=0.0130, lr=1.74e-05, step=4242] Training: 42%|████▏ | 4243/10000 [50:50<1:08:20, 1.40it/s, loss=0.0130, lr=1.74e-05, step=4243] Training: 42%|████▏ | 4244/10000 [50:50<1:06:13, 1.45it/s, loss=0.0130, lr=1.74e-05, step=4243] Training: 42%|████▏ | 4244/10000 [50:50<1:06:13, 1.45it/s, loss=0.0089, lr=1.74e-05, step=4244] Training: 42%|████▏ | 4245/10000 [50:51<1:04:15, 1.49it/s, loss=0.0089, lr=1.74e-05, step=4244] Training: 42%|████▏ | 4245/10000 [50:51<1:04:15, 1.49it/s, loss=0.0136, lr=1.74e-05, step=4245] Training: 42%|████▏ | 4246/10000 [50:52<59:09, 1.62it/s, loss=0.0136, lr=1.74e-05, step=4245] Training: 42%|████▏ | 4246/10000 [50:52<59:09, 1.62it/s, loss=0.0103, lr=1.74e-05, step=4246] Training: 42%|████▏ | 4247/10000 [50:52<55:36, 1.72it/s, loss=0.0103, lr=1.74e-05, step=4246] Training: 42%|████▏ | 4247/10000 [50:52<55:36, 1.72it/s, loss=0.0090, lr=1.74e-05, step=4247] Training: 42%|████▏ | 4248/10000 [50:53<56:49, 1.69it/s, loss=0.0090, lr=1.74e-05, step=4247] Training: 42%|████▏ | 4248/10000 [50:53<56:49, 1.69it/s, loss=0.0209, lr=1.74e-05, step=4248] Training: 42%|████▏ | 4249/10000 [50:54<1:04:11, 1.49it/s, loss=0.0209, lr=1.74e-05, step=4248] Training: 42%|████▏ | 4249/10000 [50:54<1:04:11, 1.49it/s, loss=0.0098, lr=1.74e-05, step=4249]16:57:01.293 [I] step=4250 loss=0.0182 smoothed_loss=0.0143 lr=1.74e-05 grad_norm=0.4589 step_time=0.5863s data_time=0.0828s it/s=1.495 eta_to_10000=3846.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0074 grad_action_out_proj=0.1181 grad_shared_expert=0.4152 (10775:train_pytorch.py:850) + Training: 42%|████▎ | 4250/10000 [50:54<1:08:56, 1.39it/s, loss=0.0098, lr=1.74e-05, step=4249] Training: 42%|████▎ | 4250/10000 [50:54<1:08:56, 1.39it/s, loss=0.0182, lr=1.74e-05, step=4250] Training: 43%|████▎ | 4251/10000 [50:55<1:01:56, 1.55it/s, loss=0.0182, lr=1.74e-05, step=4250] Training: 43%|████▎ | 4251/10000 [50:55<1:01:56, 1.55it/s, loss=0.0113, lr=1.74e-05, step=4251] Training: 43%|████▎ | 4252/10000 [50:55<57:40, 1.66it/s, loss=0.0113, lr=1.74e-05, step=4251] Training: 43%|████▎ | 4252/10000 [50:55<57:40, 1.66it/s, loss=0.0218, lr=1.74e-05, step=4252] Training: 43%|████▎ | 4253/10000 [50:56<55:08, 1.74it/s, loss=0.0218, lr=1.74e-05, step=4252] Training: 43%|████▎ | 4253/10000 [50:56<55:08, 1.74it/s, loss=0.0054, lr=1.74e-05, step=4253] Training: 43%|████▎ | 4254/10000 [50:57<57:29, 1.67it/s, loss=0.0054, lr=1.74e-05, step=4253] Training: 43%|████▎ | 4254/10000 [50:57<57:29, 1.67it/s, loss=0.0266, lr=1.74e-05, step=4254] Training: 43%|████▎ | 4255/10000 [50:57<55:03, 1.74it/s, loss=0.0266, lr=1.74e-05, step=4254] Training: 43%|████▎ | 4255/10000 [50:57<55:03, 1.74it/s, loss=0.0303, lr=1.74e-05, step=4255] Training: 43%|████▎ | 4256/10000 [50:58<59:17, 1.61it/s, loss=0.0303, lr=1.74e-05, step=4255] Training: 43%|████▎ | 4256/10000 [50:58<59:17, 1.61it/s, loss=0.0089, lr=1.74e-05, step=4256] Training: 43%|████▎ | 4257/10000 [50:59<1:04:04, 1.49it/s, loss=0.0089, lr=1.74e-05, step=4256] Training: 43%|████▎ | 4257/10000 [50:59<1:04:04, 1.49it/s, loss=0.0077, lr=1.74e-05, step=4257] Training: 43%|████▎ | 4258/10000 [50:59<1:02:56, 1.52it/s, loss=0.0077, lr=1.74e-05, step=4257] Training: 43%|████▎ | 4258/10000 [50:59<1:02:56, 1.52it/s, loss=0.0130, lr=1.74e-05, step=4258] Training: 43%|████▎ | 4259/10000 [51:00<1:01:50, 1.55it/s, loss=0.0130, lr=1.74e-05, step=4258] Training: 43%|████▎ | 4259/10000 [51:00<1:01:50, 1.55it/s, loss=0.0110, lr=1.74e-05, step=4259]16:57:07.232 [I] step=4260 loss=0.0135 smoothed_loss=0.0144 lr=1.74e-05 grad_norm=0.4982 step_time=0.5141s data_time=0.0798s it/s=1.684 eta_to_10000=3408.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0093 grad_action_out_proj=0.0757 grad_shared_expert=0.4201 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4260/10000 [51:00<58:07, 1.65it/s, loss=0.0110, lr=1.74e-05, step=4259] Training: 43%|████▎ | 4260/10000 [51:00<58:07, 1.65it/s, loss=0.0135, lr=1.74e-05, step=4260] Training: 43%|████▎ | 4261/10000 [51:01<54:55, 1.74it/s, loss=0.0135, lr=1.74e-05, step=4260] Training: 43%|████▎ | 4261/10000 [51:01<54:55, 1.74it/s, loss=0.0045, lr=1.74e-05, step=4261] Training: 43%|████▎ | 4262/10000 [51:01<54:18, 1.76it/s, loss=0.0045, lr=1.74e-05, step=4261] Training: 43%|████▎ | 4262/10000 [51:01<54:18, 1.76it/s, loss=0.0093, lr=1.74e-05, step=4262] Training: 43%|████▎ | 4263/10000 [51:02<1:06:53, 1.43it/s, loss=0.0093, lr=1.74e-05, step=4262] Training: 43%|████▎ | 4263/10000 [51:02<1:06:53, 1.43it/s, loss=0.0291, lr=1.74e-05, step=4263] Training: 43%|████▎ | 4264/10000 [51:03<1:15:28, 1.27it/s, loss=0.0291, lr=1.74e-05, step=4263] Training: 43%|████▎ | 4264/10000 [51:03<1:15:28, 1.27it/s, loss=0.0094, lr=1.74e-05, step=4264] Training: 43%|████▎ | 4265/10000 [51:04<1:06:51, 1.43it/s, loss=0.0094, lr=1.74e-05, step=4264] Training: 43%|████▎ | 4265/10000 [51:04<1:06:51, 1.43it/s, loss=0.0539, lr=1.74e-05, step=4265] Training: 43%|████▎ | 4266/10000 [51:05<1:12:11, 1.32it/s, loss=0.0539, lr=1.74e-05, step=4265] Training: 43%|████▎ | 4266/10000 [51:05<1:12:11, 1.32it/s, loss=0.0107, lr=1.74e-05, step=4266] Training: 43%|████▎ | 4267/10000 [51:05<1:07:38, 1.41it/s, loss=0.0107, lr=1.74e-05, step=4266] Training: 43%|████▎ | 4267/10000 [51:05<1:07:38, 1.41it/s, loss=0.0061, lr=1.73e-05, step=4267] Training: 43%|████▎ | 4268/10000 [51:06<1:06:22, 1.44it/s, loss=0.0061, lr=1.73e-05, step=4267] Training: 43%|████▎ | 4268/10000 [51:06<1:06:22, 1.44it/s, loss=0.0050, lr=1.73e-05, step=4268] Training: 43%|████▎ | 4269/10000 [51:07<1:15:46, 1.26it/s, loss=0.0050, lr=1.73e-05, step=4268] Training: 43%|████▎ | 4269/10000 [51:07<1:15:46, 1.26it/s, loss=0.0129, lr=1.73e-05, step=4269]16:57:14.656 [I] step=4270 loss=0.0134 smoothed_loss=0.0147 lr=1.74e-05 grad_norm=0.4879 step_time=0.6132s data_time=0.1292s it/s=1.347 eta_to_10000=4253.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.1341 grad_shared_expert=0.3541 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4270/10000 [51:08<1:13:20, 1.30it/s, loss=0.0129, lr=1.73e-05, step=4269] Training: 43%|████▎ | 4270/10000 [51:08<1:13:20, 1.30it/s, loss=0.0134, lr=1.73e-05, step=4270] Training: 43%|████▎ | 4271/10000 [51:09<1:15:29, 1.26it/s, loss=0.0134, lr=1.73e-05, step=4270] Training: 43%|████▎ | 4271/10000 [51:09<1:15:29, 1.26it/s, loss=0.0186, lr=1.73e-05, step=4271] Training: 43%|████▎ | 4272/10000 [51:09<1:10:13, 1.36it/s, loss=0.0186, lr=1.73e-05, step=4271] Training: 43%|████▎ | 4272/10000 [51:09<1:10:13, 1.36it/s, loss=0.0118, lr=1.73e-05, step=4272] Training: 43%|████▎ | 4273/10000 [51:10<1:06:20, 1.44it/s, loss=0.0118, lr=1.73e-05, step=4272] Training: 43%|████▎ | 4273/10000 [51:10<1:06:20, 1.44it/s, loss=0.0027, lr=1.73e-05, step=4273] Training: 43%|████▎ | 4274/10000 [51:10<1:03:31, 1.50it/s, loss=0.0027, lr=1.73e-05, step=4273] Training: 43%|████▎ | 4274/10000 [51:10<1:03:31, 1.50it/s, loss=0.0181, lr=1.73e-05, step=4274] Training: 43%|████▎ | 4275/10000 [51:11<58:50, 1.62it/s, loss=0.0181, lr=1.73e-05, step=4274] Training: 43%|████▎ | 4275/10000 [51:11<58:50, 1.62it/s, loss=0.0172, lr=1.73e-05, step=4275] Training: 43%|████▎ | 4276/10000 [51:12<1:03:33, 1.50it/s, loss=0.0172, lr=1.73e-05, step=4275] Training: 43%|████▎ | 4276/10000 [51:12<1:03:33, 1.50it/s, loss=0.0091, lr=1.73e-05, step=4276] Training: 43%|████▎ | 4277/10000 [51:12<1:04:40, 1.47it/s, loss=0.0091, lr=1.73e-05, step=4276] Training: 43%|████▎ | 4277/10000 [51:12<1:04:40, 1.47it/s, loss=0.0036, lr=1.73e-05, step=4277] Training: 43%|████▎ | 4278/10000 [51:13<1:07:06, 1.42it/s, loss=0.0036, lr=1.73e-05, step=4277] Training: 43%|████▎ | 4278/10000 [51:13<1:07:06, 1.42it/s, loss=0.0226, lr=1.73e-05, step=4278] Training: 43%|████▎ | 4279/10000 [51:14<1:01:34, 1.55it/s, loss=0.0226, lr=1.73e-05, step=4278] Training: 43%|████▎ | 4279/10000 [51:14<1:01:34, 1.55it/s, loss=0.0165, lr=1.73e-05, step=4279]16:57:21.143 [I] step=4280 loss=0.0067 smoothed_loss=0.0133 lr=1.73e-05 grad_norm=0.4074 step_time=0.5539s data_time=0.0948s it/s=1.542 eta_to_10000=3709.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0281 grad_action_out_proj=0.1625 grad_shared_expert=0.4628 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4280/10000 [51:14<59:31, 1.60it/s, loss=0.0165, lr=1.73e-05, step=4279] Training: 43%|████▎ | 4280/10000 [51:14<59:31, 1.60it/s, loss=0.0067, lr=1.73e-05, step=4280] Training: 43%|████▎ | 4281/10000 [51:15<56:12, 1.70it/s, loss=0.0067, lr=1.73e-05, step=4280] Training: 43%|████▎ | 4281/10000 [51:15<56:12, 1.70it/s, loss=0.0223, lr=1.73e-05, step=4281] Training: 43%|████▎ | 4282/10000 [51:15<1:00:33, 1.57it/s, loss=0.0223, lr=1.73e-05, step=4281] Training: 43%|████▎ | 4282/10000 [51:15<1:00:33, 1.57it/s, loss=0.0049, lr=1.73e-05, step=4282] Training: 43%|████▎ | 4283/10000 [51:16<1:02:03, 1.54it/s, loss=0.0049, lr=1.73e-05, step=4282] Training: 43%|████▎ | 4283/10000 [51:16<1:02:03, 1.54it/s, loss=0.0261, lr=1.73e-05, step=4283] Training: 43%|████▎ | 4284/10000 [51:17<1:06:27, 1.43it/s, loss=0.0261, lr=1.73e-05, step=4283] Training: 43%|████▎ | 4284/10000 [51:17<1:06:27, 1.43it/s, loss=0.0294, lr=1.73e-05, step=4284] Training: 43%|████▎ | 4285/10000 [51:18<1:05:26, 1.46it/s, loss=0.0294, lr=1.73e-05, step=4284] Training: 43%|████▎ | 4285/10000 [51:18<1:05:26, 1.46it/s, loss=0.0042, lr=1.73e-05, step=4285] Training: 43%|████▎ | 4286/10000 [51:19<1:11:43, 1.33it/s, loss=0.0042, lr=1.73e-05, step=4285] Training: 43%|████▎ | 4286/10000 [51:19<1:11:43, 1.33it/s, loss=0.0069, lr=1.73e-05, step=4286] Training: 43%|████▎ | 4287/10000 [51:19<1:08:59, 1.38it/s, loss=0.0069, lr=1.73e-05, step=4286] Training: 43%|████▎ | 4287/10000 [51:19<1:08:59, 1.38it/s, loss=0.0114, lr=1.73e-05, step=4287] Training: 43%|████▎ | 4288/10000 [51:20<1:05:25, 1.46it/s, loss=0.0114, lr=1.73e-05, step=4287] Training: 43%|████▎ | 4288/10000 [51:20<1:05:25, 1.46it/s, loss=0.0149, lr=1.73e-05, step=4288] Training: 43%|████▎ | 4289/10000 [51:20<1:02:47, 1.52it/s, loss=0.0149, lr=1.73e-05, step=4288] Training: 43%|████▎ | 4289/10000 [51:20<1:02:47, 1.52it/s, loss=0.0077, lr=1.73e-05, step=4289]16:57:28.029 [I] step=4290 loss=0.0114 smoothed_loss=0.0131 lr=1.73e-05 grad_norm=0.5118 step_time=0.5705s data_time=0.1180s it/s=1.453 eta_to_10000=3930.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.1009 grad_shared_expert=0.3843 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4290/10000 [51:21<1:04:25, 1.48it/s, loss=0.0077, lr=1.73e-05, step=4289] Training: 43%|████▎ | 4290/10000 [51:21<1:04:25, 1.48it/s, loss=0.0114, lr=1.73e-05, step=4290] Training: 43%|████▎ | 4291/10000 [51:22<1:06:18, 1.44it/s, loss=0.0114, lr=1.73e-05, step=4290] Training: 43%|████▎ | 4291/10000 [51:22<1:06:18, 1.44it/s, loss=0.0113, lr=1.73e-05, step=4291] Training: 43%|████▎ | 4292/10000 [51:22<1:00:24, 1.57it/s, loss=0.0113, lr=1.73e-05, step=4291] Training: 43%|████▎ | 4292/10000 [51:22<1:00:24, 1.57it/s, loss=0.0067, lr=1.73e-05, step=4292] Training: 43%|████▎ | 4293/10000 [51:23<1:02:17, 1.53it/s, loss=0.0067, lr=1.73e-05, step=4292] Training: 43%|████▎ | 4293/10000 [51:23<1:02:17, 1.53it/s, loss=0.0055, lr=1.73e-05, step=4293] Training: 43%|████▎ | 4294/10000 [51:24<58:14, 1.63it/s, loss=0.0055, lr=1.73e-05, step=4293] Training: 43%|████▎ | 4294/10000 [51:24<58:14, 1.63it/s, loss=0.0032, lr=1.73e-05, step=4294] Training: 43%|████▎ | 4295/10000 [51:24<57:36, 1.65it/s, loss=0.0032, lr=1.73e-05, step=4294] Training: 43%|████▎ | 4295/10000 [51:24<57:36, 1.65it/s, loss=0.0245, lr=1.72e-05, step=4295] Training: 43%|████▎ | 4296/10000 [51:25<54:48, 1.73it/s, loss=0.0245, lr=1.72e-05, step=4295] Training: 43%|████▎ | 4296/10000 [51:25<54:48, 1.73it/s, loss=0.0585, lr=1.72e-05, step=4296] Training: 43%|████▎ | 4297/10000 [51:25<52:43, 1.80it/s, loss=0.0585, lr=1.72e-05, step=4296] Training: 43%|████▎ | 4297/10000 [51:25<52:43, 1.80it/s, loss=0.0416, lr=1.72e-05, step=4297] Training: 43%|████▎ | 4298/10000 [51:26<50:59, 1.86it/s, loss=0.0416, lr=1.72e-05, step=4297] Training: 43%|████▎ | 4298/10000 [51:26<50:59, 1.86it/s, loss=0.0053, lr=1.72e-05, step=4298] Training: 43%|████▎ | 4299/10000 [51:26<56:44, 1.67it/s, loss=0.0053, lr=1.72e-05, step=4298] Training: 43%|████▎ | 4299/10000 [51:26<56:44, 1.67it/s, loss=0.0249, lr=1.72e-05, step=4299]16:57:34.030 [I] step=4300 loss=0.1237 smoothed_loss=0.0291 lr=1.72e-05 grad_norm=0.4939 step_time=0.5411s data_time=0.0590s it/s=1.667 eta_to_10000=3420.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0337 grad_action_out_proj=0.2473 grad_shared_expert=0.6097 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4300/10000 [51:27<1:00:09, 1.58it/s, loss=0.0249, lr=1.72e-05, step=4299] Training: 43%|████▎ | 4300/10000 [51:27<1:00:09, 1.58it/s, loss=0.1237, lr=1.72e-05, step=4300] Training: 43%|████▎ | 4301/10000 [51:28<56:35, 1.68it/s, loss=0.1237, lr=1.72e-05, step=4300] Training: 43%|████▎ | 4301/10000 [51:28<56:35, 1.68it/s, loss=0.0277, lr=1.72e-05, step=4301] Training: 43%|████▎ | 4302/10000 [51:28<54:00, 1.76it/s, loss=0.0277, lr=1.72e-05, step=4301] Training: 43%|████▎ | 4302/10000 [51:28<54:00, 1.76it/s, loss=0.0142, lr=1.72e-05, step=4302] Training: 43%|████▎ | 4303/10000 [51:29<52:31, 1.81it/s, loss=0.0142, lr=1.72e-05, step=4302] Training: 43%|████▎ | 4303/10000 [51:29<52:31, 1.81it/s, loss=0.0097, lr=1.72e-05, step=4303] Training: 43%|████▎ | 4304/10000 [51:29<51:27, 1.84it/s, loss=0.0097, lr=1.72e-05, step=4303] Training: 43%|████▎ | 4304/10000 [51:29<51:27, 1.84it/s, loss=0.0046, lr=1.72e-05, step=4304] Training: 43%|████▎ | 4305/10000 [51:30<51:46, 1.83it/s, loss=0.0046, lr=1.72e-05, step=4304] Training: 43%|████▎ | 4305/10000 [51:30<51:46, 1.83it/s, loss=0.0110, lr=1.72e-05, step=4305] Training: 43%|████▎ | 4306/10000 [51:30<56:11, 1.69it/s, loss=0.0110, lr=1.72e-05, step=4305] Training: 43%|████▎ | 4306/10000 [51:30<56:11, 1.69it/s, loss=0.0320, lr=1.72e-05, step=4306] Training: 43%|████▎ | 4307/10000 [51:31<59:03, 1.61it/s, loss=0.0320, lr=1.72e-05, step=4306] Training: 43%|████▎ | 4307/10000 [51:31<59:03, 1.61it/s, loss=0.0079, lr=1.72e-05, step=4307] Training: 43%|████▎ | 4308/10000 [51:32<56:00, 1.69it/s, loss=0.0079, lr=1.72e-05, step=4307] Training: 43%|████▎ | 4308/10000 [51:32<56:00, 1.69it/s, loss=0.0083, lr=1.72e-05, step=4308] Training: 43%|████▎ | 4309/10000 [51:32<53:27, 1.77it/s, loss=0.0083, lr=1.72e-05, step=4308] Training: 43%|████▎ | 4309/10000 [51:32<53:27, 1.77it/s, loss=0.0060, lr=1.72e-05, step=4309]16:57:39.601 [I] step=4310 loss=0.0017 smoothed_loss=0.0173 lr=1.72e-05 grad_norm=0.5050 step_time=0.4930s data_time=0.0641s it/s=1.795 eta_to_10000=3169.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0127 grad_action_out_proj=0.1307 grad_shared_expert=0.3242 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4310/10000 [51:33<53:19, 1.78it/s, loss=0.0060, lr=1.72e-05, step=4309] Training: 43%|████▎ | 4310/10000 [51:33<53:19, 1.78it/s, loss=0.0017, lr=1.72e-05, step=4310] Training: 43%|████▎ | 4311/10000 [51:33<51:40, 1.84it/s, loss=0.0017, lr=1.72e-05, step=4310] Training: 43%|████▎ | 4311/10000 [51:33<51:40, 1.84it/s, loss=0.0050, lr=1.72e-05, step=4311] Training: 43%|████▎ | 4312/10000 [51:34<52:24, 1.81it/s, loss=0.0050, lr=1.72e-05, step=4311] Training: 43%|████▎ | 4312/10000 [51:34<52:24, 1.81it/s, loss=0.0104, lr=1.72e-05, step=4312] Training: 43%|████▎ | 4313/10000 [51:34<57:40, 1.64it/s, loss=0.0104, lr=1.72e-05, step=4312] Training: 43%|████▎ | 4313/10000 [51:34<57:40, 1.64it/s, loss=0.0115, lr=1.72e-05, step=4313] Training: 43%|████▎ | 4314/10000 [51:35<1:04:11, 1.48it/s, loss=0.0115, lr=1.72e-05, step=4313] Training: 43%|████▎ | 4314/10000 [51:35<1:04:11, 1.48it/s, loss=0.0035, lr=1.72e-05, step=4314] Training: 43%|████▎ | 4315/10000 [51:36<58:53, 1.61it/s, loss=0.0035, lr=1.72e-05, step=4314] Training: 43%|████▎ | 4315/10000 [51:36<58:53, 1.61it/s, loss=0.0057, lr=1.72e-05, step=4315] Training: 43%|████▎ | 4316/10000 [51:36<1:00:39, 1.56it/s, loss=0.0057, lr=1.72e-05, step=4315] Training: 43%|████▎ | 4316/10000 [51:36<1:00:39, 1.56it/s, loss=0.0148, lr=1.72e-05, step=4316] Training: 43%|████▎ | 4317/10000 [51:37<56:52, 1.67it/s, loss=0.0148, lr=1.72e-05, step=4316] Training: 43%|████▎ | 4317/10000 [51:37<56:52, 1.67it/s, loss=0.0094, lr=1.72e-05, step=4317] Training: 43%|████▎ | 4318/10000 [51:38<58:03, 1.63it/s, loss=0.0094, lr=1.72e-05, step=4317] Training: 43%|████▎ | 4318/10000 [51:38<58:03, 1.63it/s, loss=0.0447, lr=1.72e-05, step=4318] Training: 43%|████▎ | 4319/10000 [51:38<54:28, 1.74it/s, loss=0.0447, lr=1.72e-05, step=4318] Training: 43%|████▎ | 4319/10000 [51:38<54:28, 1.74it/s, loss=0.0196, lr=1.72e-05, step=4319]16:57:45.743 [I] step=4320 loss=0.0407 smoothed_loss=0.0188 lr=1.72e-05 grad_norm=0.4528 step_time=0.5122s data_time=0.1021s it/s=1.628 eta_to_10000=3488.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0233 grad_action_out_proj=0.1836 grad_shared_expert=0.5688 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4320/10000 [51:39<57:21, 1.65it/s, loss=0.0196, lr=1.72e-05, step=4319] Training: 43%|████▎ | 4320/10000 [51:39<57:21, 1.65it/s, loss=0.0407, lr=1.72e-05, step=4320] Training: 43%|████▎ | 4321/10000 [51:40<1:01:25, 1.54it/s, loss=0.0407, lr=1.72e-05, step=4320] Training: 43%|████▎ | 4321/10000 [51:40<1:01:25, 1.54it/s, loss=0.0018, lr=1.72e-05, step=4321] Training: 43%|████▎ | 4322/10000 [51:40<56:37, 1.67it/s, loss=0.0018, lr=1.72e-05, step=4321] Training: 43%|████▎ | 4322/10000 [51:40<56:37, 1.67it/s, loss=0.0020, lr=1.72e-05, step=4322] Training: 43%|████▎ | 4323/10000 [51:41<59:51, 1.58it/s, loss=0.0020, lr=1.72e-05, step=4322] Training: 43%|████▎ | 4323/10000 [51:41<59:51, 1.58it/s, loss=0.0066, lr=1.71e-05, step=4323] Training: 43%|████▎ | 4324/10000 [51:41<57:40, 1.64it/s, loss=0.0066, lr=1.71e-05, step=4323] Training: 43%|████▎ | 4324/10000 [51:41<57:40, 1.64it/s, loss=0.0426, lr=1.71e-05, step=4324] Training: 43%|████▎ | 4325/10000 [51:42<55:06, 1.72it/s, loss=0.0426, lr=1.71e-05, step=4324] Training: 43%|████▎ | 4325/10000 [51:42<55:06, 1.72it/s, loss=0.0473, lr=1.71e-05, step=4325] Training: 43%|████▎ | 4326/10000 [51:42<54:09, 1.75it/s, loss=0.0473, lr=1.71e-05, step=4325] Training: 43%|████▎ | 4326/10000 [51:42<54:09, 1.75it/s, loss=0.0364, lr=1.71e-05, step=4326] Training: 43%|████▎ | 4327/10000 [51:43<52:45, 1.79it/s, loss=0.0364, lr=1.71e-05, step=4326] Training: 43%|████▎ | 4327/10000 [51:43<52:45, 1.79it/s, loss=0.0486, lr=1.71e-05, step=4327] Training: 43%|████▎ | 4328/10000 [51:44<57:52, 1.63it/s, loss=0.0486, lr=1.71e-05, step=4327] Training: 43%|████▎ | 4328/10000 [51:44<57:52, 1.63it/s, loss=0.0020, lr=1.71e-05, step=4328] Training: 43%|████▎ | 4329/10000 [51:44<1:01:36, 1.53it/s, loss=0.0020, lr=1.71e-05, step=4328] Training: 43%|████▎ | 4329/10000 [51:44<1:01:36, 1.53it/s, loss=0.0053, lr=1.71e-05, step=4329]16:57:51.872 [I] step=4330 loss=0.0158 smoothed_loss=0.0202 lr=1.71e-05 grad_norm=0.4666 step_time=0.5411s data_time=0.0718s it/s=1.632 eta_to_10000=3474.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.0910 grad_shared_expert=0.2449 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4330/10000 [51:45<58:51, 1.61it/s, loss=0.0053, lr=1.71e-05, step=4329] Training: 43%|████▎ | 4330/10000 [51:45<58:51, 1.61it/s, loss=0.0158, lr=1.71e-05, step=4330] Training: 43%|████▎ | 4331/10000 [51:45<55:08, 1.71it/s, loss=0.0158, lr=1.71e-05, step=4330] Training: 43%|████▎ | 4331/10000 [51:45<55:08, 1.71it/s, loss=0.0065, lr=1.71e-05, step=4331] Training: 43%|████▎ | 4332/10000 [51:46<52:53, 1.79it/s, loss=0.0065, lr=1.71e-05, step=4331] Training: 43%|████▎ | 4332/10000 [51:46<52:53, 1.79it/s, loss=0.0237, lr=1.71e-05, step=4332] Training: 43%|████▎ | 4333/10000 [51:46<51:03, 1.85it/s, loss=0.0237, lr=1.71e-05, step=4332] Training: 43%|████▎ | 4333/10000 [51:46<51:03, 1.85it/s, loss=0.0038, lr=1.71e-05, step=4333] Training: 43%|████▎ | 4334/10000 [51:47<50:11, 1.88it/s, loss=0.0038, lr=1.71e-05, step=4333] Training: 43%|████▎ | 4334/10000 [51:47<50:11, 1.88it/s, loss=0.0064, lr=1.71e-05, step=4334] Training: 43%|████▎ | 4335/10000 [51:48<54:59, 1.72it/s, loss=0.0064, lr=1.71e-05, step=4334] Training: 43%|████▎ | 4335/10000 [51:48<54:59, 1.72it/s, loss=0.0085, lr=1.71e-05, step=4335] Training: 43%|████▎ | 4336/10000 [51:48<1:00:25, 1.56it/s, loss=0.0085, lr=1.71e-05, step=4335] Training: 43%|████▎ | 4336/10000 [51:48<1:00:25, 1.56it/s, loss=0.0127, lr=1.71e-05, step=4336] Training: 43%|████▎ | 4337/10000 [51:49<56:40, 1.67it/s, loss=0.0127, lr=1.71e-05, step=4336] Training: 43%|████▎ | 4337/10000 [51:49<56:40, 1.67it/s, loss=0.0076, lr=1.71e-05, step=4337] Training: 43%|████▎ | 4338/10000 [51:49<53:38, 1.76it/s, loss=0.0076, lr=1.71e-05, step=4337] Training: 43%|████▎ | 4338/10000 [51:49<53:38, 1.76it/s, loss=0.0033, lr=1.71e-05, step=4338] Training: 43%|████▎ | 4339/10000 [51:50<51:22, 1.84it/s, loss=0.0033, lr=1.71e-05, step=4338] Training: 43%|████▎ | 4339/10000 [51:50<51:22, 1.84it/s, loss=0.0078, lr=1.71e-05, step=4339]16:57:57.378 [I] step=4340 loss=0.0086 smoothed_loss=0.0126 lr=1.71e-05 grad_norm=0.4676 step_time=0.4912s data_time=0.0593s it/s=1.817 eta_to_10000=3115.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0229 grad_action_out_proj=0.1381 grad_shared_expert=0.4161 (10775:train_pytorch.py:850) + Training: 43%|████▎ | 4340/10000 [51:50<51:07, 1.84it/s, loss=0.0078, lr=1.71e-05, step=4339] Training: 43%|████▎ | 4340/10000 [51:50<51:07, 1.84it/s, loss=0.0086, lr=1.71e-05, step=4340] Training: 43%|████▎ | 4341/10000 [51:51<49:31, 1.90it/s, loss=0.0086, lr=1.71e-05, step=4340] Training: 43%|████▎ | 4341/10000 [51:51<49:31, 1.90it/s, loss=0.0063, lr=1.71e-05, step=4341] Training: 43%|████▎ | 4342/10000 [51:52<57:26, 1.64it/s, loss=0.0063, lr=1.71e-05, step=4341] Training: 43%|████▎ | 4342/10000 [51:52<57:26, 1.64it/s, loss=0.0056, lr=1.71e-05, step=4342] Training: 43%|████▎ | 4343/10000 [51:52<1:01:27, 1.53it/s, loss=0.0056, lr=1.71e-05, step=4342] Training: 43%|████▎ | 4343/10000 [51:52<1:01:27, 1.53it/s, loss=0.0079, lr=1.71e-05, step=4343] Training: 43%|████▎ | 4344/10000 [51:53<57:48, 1.63it/s, loss=0.0079, lr=1.71e-05, step=4343] Training: 43%|████▎ | 4344/10000 [51:53<57:48, 1.63it/s, loss=0.0075, lr=1.71e-05, step=4344] Training: 43%|████▎ | 4345/10000 [51:54<54:42, 1.72it/s, loss=0.0075, lr=1.71e-05, step=4344] Training: 43%|████▎ | 4345/10000 [51:54<54:42, 1.72it/s, loss=0.0103, lr=1.71e-05, step=4345] Training: 43%|████▎ | 4346/10000 [51:54<52:32, 1.79it/s, loss=0.0103, lr=1.71e-05, step=4345] Training: 43%|████▎ | 4346/10000 [51:54<52:32, 1.79it/s, loss=0.0190, lr=1.71e-05, step=4346] Training: 43%|████▎ | 4347/10000 [51:55<51:16, 1.84it/s, loss=0.0190, lr=1.71e-05, step=4346] Training: 43%|████▎ | 4347/10000 [51:55<51:16, 1.84it/s, loss=0.0102, lr=1.71e-05, step=4347] Training: 43%|████▎ | 4348/10000 [51:55<56:38, 1.66it/s, loss=0.0102, lr=1.71e-05, step=4347] Training: 43%|████▎ | 4348/10000 [51:55<56:38, 1.66it/s, loss=0.0211, lr=1.71e-05, step=4348] Training: 43%|████▎ | 4349/10000 [51:56<53:20, 1.77it/s, loss=0.0211, lr=1.71e-05, step=4348] Training: 43%|████▎ | 4349/10000 [51:56<53:20, 1.77it/s, loss=0.0219, lr=1.71e-05, step=4349]16:58:03.432 [I] step=4350 loss=0.0134 smoothed_loss=0.0133 lr=1.71e-05 grad_norm=0.4282 step_time=0.5414s data_time=0.0640s it/s=1.652 eta_to_10000=3419.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.1183 grad_shared_expert=0.4713 (10775:train_pytorch.py:850) + Training: 44%|████▎ | 4350/10000 [51:56<58:26, 1.61it/s, loss=0.0219, lr=1.71e-05, step=4349] Training: 44%|████▎ | 4350/10000 [51:56<58:26, 1.61it/s, loss=0.0134, lr=1.71e-05, step=4350] Training: 44%|████▎ | 4351/10000 [51:57<55:21, 1.70it/s, loss=0.0134, lr=1.71e-05, step=4350] Training: 44%|████▎ | 4351/10000 [51:57<55:21, 1.70it/s, loss=0.0068, lr=1.70e-05, step=4351] Training: 44%|████▎ | 4352/10000 [51:58<52:47, 1.78it/s, loss=0.0068, lr=1.70e-05, step=4351] Training: 44%|████▎ | 4352/10000 [51:58<52:47, 1.78it/s, loss=0.0247, lr=1.70e-05, step=4352] Training: 44%|████▎ | 4353/10000 [51:58<51:14, 1.84it/s, loss=0.0247, lr=1.70e-05, step=4352] Training: 44%|████▎ | 4353/10000 [51:58<51:14, 1.84it/s, loss=0.0048, lr=1.70e-05, step=4353] Training: 44%|████▎ | 4354/10000 [51:59<50:10, 1.88it/s, loss=0.0048, lr=1.70e-05, step=4353] Training: 44%|████▎ | 4354/10000 [51:59<50:10, 1.88it/s, loss=0.0067, lr=1.70e-05, step=4354] Training: 44%|████▎ | 4355/10000 [51:59<55:00, 1.71it/s, loss=0.0067, lr=1.70e-05, step=4354] Training: 44%|████▎ | 4355/10000 [51:59<55:00, 1.71it/s, loss=0.0152, lr=1.70e-05, step=4355] Training: 44%|████▎ | 4356/10000 [52:00<52:26, 1.79it/s, loss=0.0152, lr=1.70e-05, step=4355] Training: 44%|████▎ | 4356/10000 [52:00<52:26, 1.79it/s, loss=0.0039, lr=1.70e-05, step=4356] Training: 44%|████▎ | 4357/10000 [52:00<56:54, 1.65it/s, loss=0.0039, lr=1.70e-05, step=4356] Training: 44%|████▎ | 4357/10000 [52:00<56:54, 1.65it/s, loss=0.0279, lr=1.70e-05, step=4357] Training: 44%|████▎ | 4358/10000 [52:01<53:50, 1.75it/s, loss=0.0279, lr=1.70e-05, step=4357] Training: 44%|████▎ | 4358/10000 [52:01<53:50, 1.75it/s, loss=0.0229, lr=1.70e-05, step=4358] Training: 44%|████▎ | 4359/10000 [52:01<51:46, 1.82it/s, loss=0.0229, lr=1.70e-05, step=4358] Training: 44%|████▎ | 4359/10000 [52:01<51:46, 1.82it/s, loss=0.0596, lr=1.70e-05, step=4359]16:58:08.905 [I] step=4360 loss=0.0126 smoothed_loss=0.0182 lr=1.70e-05 grad_norm=0.5105 step_time=0.4843s data_time=0.0630s it/s=1.827 eta_to_10000=3086.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0119 grad_action_out_proj=0.1459 grad_shared_expert=0.5443 (10775:train_pytorch.py:850) + Training: 44%|████▎ | 4360/10000 [52:02<51:26, 1.83it/s, loss=0.0596, lr=1.70e-05, step=4359] Training: 44%|████▎ | 4360/10000 [52:02<51:26, 1.83it/s, loss=0.0126, lr=1.70e-05, step=4360] Training: 44%|████▎ | 4361/10000 [52:02<49:33, 1.90it/s, loss=0.0126, lr=1.70e-05, step=4360] Training: 44%|████▎ | 4361/10000 [52:02<49:33, 1.90it/s, loss=0.0059, lr=1.70e-05, step=4361] Training: 44%|████▎ | 4362/10000 [52:03<48:13, 1.95it/s, loss=0.0059, lr=1.70e-05, step=4361] Training: 44%|████▎ | 4362/10000 [52:03<48:13, 1.95it/s, loss=0.0111, lr=1.70e-05, step=4362] Training: 44%|████▎ | 4363/10000 [52:04<53:49, 1.75it/s, loss=0.0111, lr=1.70e-05, step=4362] Training: 44%|████▎ | 4363/10000 [52:04<53:49, 1.75it/s, loss=0.0037, lr=1.70e-05, step=4363] Training: 44%|████▎ | 4364/10000 [52:04<57:07, 1.64it/s, loss=0.0037, lr=1.70e-05, step=4363] Training: 44%|████▎ | 4364/10000 [52:04<57:07, 1.64it/s, loss=0.0184, lr=1.70e-05, step=4364] Training: 44%|████▎ | 4365/10000 [52:05<53:57, 1.74it/s, loss=0.0184, lr=1.70e-05, step=4364] Training: 44%|████▎ | 4365/10000 [52:05<53:57, 1.74it/s, loss=0.0218, lr=1.70e-05, step=4365] Training: 44%|████▎ | 4366/10000 [52:05<51:48, 1.81it/s, loss=0.0218, lr=1.70e-05, step=4365] Training: 44%|████▎ | 4366/10000 [52:05<51:48, 1.81it/s, loss=0.0109, lr=1.70e-05, step=4366] Training: 44%|████▎ | 4367/10000 [52:06<50:22, 1.86it/s, loss=0.0109, lr=1.70e-05, step=4366] Training: 44%|████▎ | 4367/10000 [52:06<50:22, 1.86it/s, loss=0.0109, lr=1.70e-05, step=4367] Training: 44%|████▎ | 4368/10000 [52:06<49:11, 1.91it/s, loss=0.0109, lr=1.70e-05, step=4367] Training: 44%|████▎ | 4368/10000 [52:06<49:11, 1.91it/s, loss=0.0034, lr=1.70e-05, step=4368] Training: 44%|████▎ | 4369/10000 [52:07<48:20, 1.94it/s, loss=0.0034, lr=1.70e-05, step=4368] Training: 44%|████▎ | 4369/10000 [52:07<48:20, 1.94it/s, loss=0.0061, lr=1.70e-05, step=4369]16:58:14.505 [I] step=4370 loss=0.0248 smoothed_loss=0.0143 lr=1.70e-05 grad_norm=0.5039 step_time=0.4994s data_time=0.0606s it/s=1.786 eta_to_10000=3152.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0258 grad_action_out_proj=0.2271 grad_shared_expert=0.6223 (10775:train_pytorch.py:850) + Training: 44%|████▎ | 4370/10000 [52:08<54:59, 1.71it/s, loss=0.0061, lr=1.70e-05, step=4369] Training: 44%|████▎ | 4370/10000 [52:08<54:59, 1.71it/s, loss=0.0248, lr=1.70e-05, step=4370] Training: 44%|████▎ | 4371/10000 [52:08<51:49, 1.81it/s, loss=0.0248, lr=1.70e-05, step=4370] Training: 44%|████▎ | 4371/10000 [52:08<51:49, 1.81it/s, loss=0.0137, lr=1.70e-05, step=4371] Training: 44%|████▎ | 4372/10000 [52:09<55:42, 1.68it/s, loss=0.0137, lr=1.70e-05, step=4371] Training: 44%|████▎ | 4372/10000 [52:09<55:42, 1.68it/s, loss=0.0131, lr=1.70e-05, step=4372] Training: 44%|████▎ | 4373/10000 [52:09<52:56, 1.77it/s, loss=0.0131, lr=1.70e-05, step=4372] Training: 44%|████▎ | 4373/10000 [52:09<52:56, 1.77it/s, loss=0.0138, lr=1.70e-05, step=4373] Training: 44%|████▎ | 4374/10000 [52:10<51:24, 1.82it/s, loss=0.0138, lr=1.70e-05, step=4373] Training: 44%|████▎ | 4374/10000 [52:10<51:24, 1.82it/s, loss=0.0220, lr=1.70e-05, step=4374] Training: 44%|████▍ | 4375/10000 [52:10<51:40, 1.81it/s, loss=0.0220, lr=1.70e-05, step=4374] Training: 44%|████▍ | 4375/10000 [52:10<51:40, 1.81it/s, loss=0.0154, lr=1.70e-05, step=4375] Training: 44%|████▍ | 4376/10000 [52:11<50:35, 1.85it/s, loss=0.0154, lr=1.70e-05, step=4375] Training: 44%|████▍ | 4376/10000 [52:11<50:35, 1.85it/s, loss=0.0663, lr=1.70e-05, step=4376] Training: 44%|████▍ | 4377/10000 [52:11<49:51, 1.88it/s, loss=0.0663, lr=1.70e-05, step=4376] Training: 44%|████▍ | 4377/10000 [52:11<49:51, 1.88it/s, loss=0.0276, lr=1.70e-05, step=4377] Training: 44%|████▍ | 4378/10000 [52:12<55:06, 1.70it/s, loss=0.0276, lr=1.70e-05, step=4377] Training: 44%|████▍ | 4378/10000 [52:12<55:06, 1.70it/s, loss=0.0093, lr=1.70e-05, step=4378] Training: 44%|████▍ | 4379/10000 [52:13<58:51, 1.59it/s, loss=0.0093, lr=1.70e-05, step=4378] Training: 44%|████▍ | 4379/10000 [52:13<58:51, 1.59it/s, loss=0.0139, lr=1.69e-05, step=4379]16:58:20.216 [I] step=4380 loss=0.0160 smoothed_loss=0.0188 lr=1.70e-05 grad_norm=0.5110 step_time=0.5117s data_time=0.0595s it/s=1.751 eta_to_10000=3209.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.0907 grad_shared_expert=0.3660 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4380/10000 [52:13<55:40, 1.68it/s, loss=0.0139, lr=1.69e-05, step=4379] Training: 44%|████▍ | 4380/10000 [52:13<55:40, 1.68it/s, loss=0.0160, lr=1.69e-05, step=4380] Training: 44%|████▍ | 4381/10000 [52:14<52:54, 1.77it/s, loss=0.0160, lr=1.69e-05, step=4380] Training: 44%|████▍ | 4381/10000 [52:14<52:54, 1.77it/s, loss=0.0038, lr=1.69e-05, step=4381] Training: 44%|████▍ | 4382/10000 [52:14<53:12, 1.76it/s, loss=0.0038, lr=1.69e-05, step=4381] Training: 44%|████▍ | 4382/10000 [52:14<53:12, 1.76it/s, loss=0.0081, lr=1.69e-05, step=4382] Training: 44%|████▍ | 4383/10000 [52:15<56:20, 1.66it/s, loss=0.0081, lr=1.69e-05, step=4382] Training: 44%|████▍ | 4383/10000 [52:15<56:20, 1.66it/s, loss=0.0124, lr=1.69e-05, step=4383] Training: 44%|████▍ | 4384/10000 [52:16<57:24, 1.63it/s, loss=0.0124, lr=1.69e-05, step=4383] Training: 44%|████▍ | 4384/10000 [52:16<57:24, 1.63it/s, loss=0.0105, lr=1.69e-05, step=4384] Training: 44%|████▍ | 4385/10000 [52:16<53:42, 1.74it/s, loss=0.0105, lr=1.69e-05, step=4384] Training: 44%|████▍ | 4385/10000 [52:16<53:42, 1.74it/s, loss=0.0112, lr=1.69e-05, step=4385] Training: 44%|████▍ | 4386/10000 [52:17<59:17, 1.58it/s, loss=0.0112, lr=1.69e-05, step=4385] Training: 44%|████▍ | 4386/10000 [52:17<59:17, 1.58it/s, loss=0.0185, lr=1.69e-05, step=4386] Training: 44%|████▍ | 4387/10000 [52:17<55:26, 1.69it/s, loss=0.0185, lr=1.69e-05, step=4386] Training: 44%|████▍ | 4387/10000 [52:17<55:26, 1.69it/s, loss=0.0024, lr=1.69e-05, step=4387] Training: 44%|████▍ | 4388/10000 [52:18<55:08, 1.70it/s, loss=0.0024, lr=1.69e-05, step=4387] Training: 44%|████▍ | 4388/10000 [52:18<55:08, 1.70it/s, loss=0.0303, lr=1.69e-05, step=4388] Training: 44%|████▍ | 4389/10000 [52:19<57:28, 1.63it/s, loss=0.0303, lr=1.69e-05, step=4388] Training: 44%|████▍ | 4389/10000 [52:19<57:28, 1.63it/s, loss=0.0173, lr=1.69e-05, step=4389]16:58:26.139 [I] step=4390 loss=0.0234 smoothed_loss=0.0166 lr=1.69e-05 grad_norm=0.4135 step_time=0.5138s data_time=0.0785s it/s=1.689 eta_to_10000=3322.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0072 grad_action_out_proj=0.0897 grad_shared_expert=0.3624 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4390/10000 [52:19<54:54, 1.70it/s, loss=0.0173, lr=1.69e-05, step=4389] Training: 44%|████▍ | 4390/10000 [52:19<54:54, 1.70it/s, loss=0.0234, lr=1.69e-05, step=4390] Training: 44%|████▍ | 4391/10000 [52:20<53:48, 1.74it/s, loss=0.0234, lr=1.69e-05, step=4390] Training: 44%|████▍ | 4391/10000 [52:20<53:48, 1.74it/s, loss=0.0090, lr=1.69e-05, step=4391] Training: 44%|████▍ | 4392/10000 [52:20<56:59, 1.64it/s, loss=0.0090, lr=1.69e-05, step=4391] Training: 44%|████▍ | 4392/10000 [52:20<56:59, 1.64it/s, loss=0.0070, lr=1.69e-05, step=4392] Training: 44%|████▍ | 4393/10000 [52:21<1:01:17, 1.52it/s, loss=0.0070, lr=1.69e-05, step=4392] Training: 44%|████▍ | 4393/10000 [52:21<1:01:17, 1.52it/s, loss=0.0080, lr=1.69e-05, step=4393] Training: 44%|████▍ | 4394/10000 [52:22<1:03:48, 1.46it/s, loss=0.0080, lr=1.69e-05, step=4393] Training: 44%|████▍ | 4394/10000 [52:22<1:03:48, 1.46it/s, loss=0.0118, lr=1.69e-05, step=4394] Training: 44%|████▍ | 4395/10000 [52:23<1:01:23, 1.52it/s, loss=0.0118, lr=1.69e-05, step=4394] Training: 44%|████▍ | 4395/10000 [52:23<1:01:23, 1.52it/s, loss=0.0114, lr=1.69e-05, step=4395] Training: 44%|████▍ | 4396/10000 [52:23<1:03:06, 1.48it/s, loss=0.0114, lr=1.69e-05, step=4395] Training: 44%|████▍ | 4396/10000 [52:23<1:03:06, 1.48it/s, loss=0.0093, lr=1.69e-05, step=4396] Training: 44%|████▍ | 4397/10000 [52:24<57:25, 1.63it/s, loss=0.0093, lr=1.69e-05, step=4396] Training: 44%|████▍ | 4397/10000 [52:24<57:25, 1.63it/s, loss=0.0111, lr=1.69e-05, step=4397] Training: 44%|████▍ | 4398/10000 [52:24<54:11, 1.72it/s, loss=0.0111, lr=1.69e-05, step=4397] Training: 44%|████▍ | 4398/10000 [52:24<54:11, 1.72it/s, loss=0.0181, lr=1.69e-05, step=4398] Training: 44%|████▍ | 4399/10000 [52:25<52:11, 1.79it/s, loss=0.0181, lr=1.69e-05, step=4398] Training: 44%|████▍ | 4399/10000 [52:25<52:11, 1.79it/s, loss=0.0304, lr=1.69e-05, step=4399]16:58:32.637 [I] step=4400 loss=0.0092 smoothed_loss=0.0147 lr=1.69e-05 grad_norm=0.4798 step_time=0.5619s data_time=0.0878s it/s=1.539 eta_to_10000=3637.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0157 grad_action_out_proj=0.1058 grad_shared_expert=0.3642 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4400/10000 [52:26<1:03:09, 1.48it/s, loss=0.0304, lr=1.69e-05, step=4399] Training: 44%|████▍ | 4400/10000 [52:26<1:03:09, 1.48it/s, loss=0.0092, lr=1.69e-05, step=4400] Training: 44%|████▍ | 4401/10000 [52:26<1:05:05, 1.43it/s, loss=0.0092, lr=1.69e-05, step=4400] Training: 44%|████▍ | 4401/10000 [52:26<1:05:05, 1.43it/s, loss=0.0023, lr=1.69e-05, step=4401] Training: 44%|████▍ | 4402/10000 [52:27<1:04:05, 1.46it/s, loss=0.0023, lr=1.69e-05, step=4401] Training: 44%|████▍ | 4402/10000 [52:27<1:04:05, 1.46it/s, loss=0.0152, lr=1.69e-05, step=4402] Training: 44%|████▍ | 4403/10000 [52:28<59:11, 1.58it/s, loss=0.0152, lr=1.69e-05, step=4402] Training: 44%|████▍ | 4403/10000 [52:28<59:11, 1.58it/s, loss=0.0073, lr=1.69e-05, step=4403] Training: 44%|████▍ | 4404/10000 [52:28<55:28, 1.68it/s, loss=0.0073, lr=1.69e-05, step=4403] Training: 44%|████▍ | 4404/10000 [52:28<55:28, 1.68it/s, loss=0.0065, lr=1.69e-05, step=4404] Training: 44%|████▍ | 4405/10000 [52:29<53:01, 1.76it/s, loss=0.0065, lr=1.69e-05, step=4404] Training: 44%|████▍ | 4405/10000 [52:29<53:01, 1.76it/s, loss=0.0152, lr=1.69e-05, step=4405] Training: 44%|████▍ | 4406/10000 [52:29<51:00, 1.83it/s, loss=0.0152, lr=1.69e-05, step=4405] Training: 44%|████▍ | 4406/10000 [52:29<51:00, 1.83it/s, loss=0.0124, lr=1.69e-05, step=4406] Training: 44%|████▍ | 4407/10000 [52:30<56:59, 1.64it/s, loss=0.0124, lr=1.69e-05, step=4406] Training: 44%|████▍ | 4407/10000 [52:30<56:59, 1.64it/s, loss=0.0339, lr=1.68e-05, step=4407] Training: 44%|████▍ | 4408/10000 [52:31<58:29, 1.59it/s, loss=0.0339, lr=1.68e-05, step=4407] Training: 44%|████▍ | 4408/10000 [52:31<58:29, 1.59it/s, loss=0.0058, lr=1.68e-05, step=4408] Training: 44%|████▍ | 4409/10000 [52:31<59:33, 1.56it/s, loss=0.0058, lr=1.68e-05, step=4408] Training: 44%|████▍ | 4409/10000 [52:31<59:33, 1.56it/s, loss=0.0025, lr=1.68e-05, step=4409]16:58:38.778 [I] step=4410 loss=0.0021 smoothed_loss=0.0116 lr=1.69e-05 grad_norm=0.4617 step_time=0.5223s data_time=0.0918s it/s=1.629 eta_to_10000=3432.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0126 grad_action_out_proj=0.1495 grad_shared_expert=0.3952 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4410/10000 [52:32<59:02, 1.58it/s, loss=0.0025, lr=1.68e-05, step=4409] Training: 44%|████▍ | 4410/10000 [52:32<59:02, 1.58it/s, loss=0.0021, lr=1.68e-05, step=4410] Training: 44%|████▍ | 4411/10000 [52:32<54:43, 1.70it/s, loss=0.0021, lr=1.68e-05, step=4410] Training: 44%|████▍ | 4411/10000 [52:32<54:43, 1.70it/s, loss=0.0145, lr=1.68e-05, step=4411] Training: 44%|████▍ | 4412/10000 [52:33<52:05, 1.79it/s, loss=0.0145, lr=1.68e-05, step=4411] Training: 44%|████▍ | 4412/10000 [52:33<52:05, 1.79it/s, loss=0.0035, lr=1.68e-05, step=4412] Training: 44%|████▍ | 4413/10000 [52:34<55:57, 1.66it/s, loss=0.0035, lr=1.68e-05, step=4412] Training: 44%|████▍ | 4413/10000 [52:34<55:57, 1.66it/s, loss=0.0057, lr=1.68e-05, step=4413] Training: 44%|████▍ | 4414/10000 [52:34<1:01:24, 1.52it/s, loss=0.0057, lr=1.68e-05, step=4413] Training: 44%|████▍ | 4414/10000 [52:34<1:01:24, 1.52it/s, loss=0.0153, lr=1.68e-05, step=4414] Training: 44%|████▍ | 4415/10000 [52:35<1:03:11, 1.47it/s, loss=0.0153, lr=1.68e-05, step=4414] Training: 44%|████▍ | 4415/10000 [52:35<1:03:11, 1.47it/s, loss=0.0711, lr=1.68e-05, step=4415] Training: 44%|████▍ | 4416/10000 [52:36<1:00:18, 1.54it/s, loss=0.0711, lr=1.68e-05, step=4415] Training: 44%|████▍ | 4416/10000 [52:36<1:00:18, 1.54it/s, loss=0.0028, lr=1.68e-05, step=4416] Training: 44%|████▍ | 4417/10000 [52:36<56:29, 1.65it/s, loss=0.0028, lr=1.68e-05, step=4416] Training: 44%|████▍ | 4417/10000 [52:36<56:29, 1.65it/s, loss=0.0057, lr=1.68e-05, step=4417] Training: 44%|████▍ | 4418/10000 [52:37<54:34, 1.70it/s, loss=0.0057, lr=1.68e-05, step=4417] Training: 44%|████▍ | 4418/10000 [52:37<54:34, 1.70it/s, loss=0.0234, lr=1.68e-05, step=4418] Training: 44%|████▍ | 4419/10000 [52:37<51:32, 1.80it/s, loss=0.0234, lr=1.68e-05, step=4418] Training: 44%|████▍ | 4419/10000 [52:37<51:32, 1.80it/s, loss=0.0151, lr=1.68e-05, step=4419]16:58:44.599 [I] step=4420 loss=0.0095 smoothed_loss=0.0149 lr=1.68e-05 grad_norm=0.4566 step_time=0.5066s data_time=0.0755s it/s=1.718 eta_to_10000=3247.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.1328 grad_shared_expert=0.3901 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4420/10000 [52:38<50:43, 1.83it/s, loss=0.0151, lr=1.68e-05, step=4419] Training: 44%|████▍ | 4420/10000 [52:38<50:43, 1.83it/s, loss=0.0095, lr=1.68e-05, step=4420] Training: 44%|████▍ | 4421/10000 [52:38<56:30, 1.65it/s, loss=0.0095, lr=1.68e-05, step=4420] Training: 44%|████▍ | 4421/10000 [52:38<56:30, 1.65it/s, loss=0.0133, lr=1.68e-05, step=4421] Training: 44%|████▍ | 4422/10000 [52:39<59:03, 1.57it/s, loss=0.0133, lr=1.68e-05, step=4421] Training: 44%|████▍ | 4422/10000 [52:39<59:03, 1.57it/s, loss=0.0020, lr=1.68e-05, step=4422] Training: 44%|████▍ | 4423/10000 [52:40<55:32, 1.67it/s, loss=0.0020, lr=1.68e-05, step=4422] Training: 44%|████▍ | 4423/10000 [52:40<55:32, 1.67it/s, loss=0.0115, lr=1.68e-05, step=4423] Training: 44%|████▍ | 4424/10000 [52:40<52:35, 1.77it/s, loss=0.0115, lr=1.68e-05, step=4423] Training: 44%|████▍ | 4424/10000 [52:40<52:35, 1.77it/s, loss=0.0091, lr=1.68e-05, step=4424] Training: 44%|████▍ | 4425/10000 [52:41<50:20, 1.85it/s, loss=0.0091, lr=1.68e-05, step=4424] Training: 44%|████▍ | 4425/10000 [52:41<50:20, 1.85it/s, loss=0.0105, lr=1.68e-05, step=4425] Training: 44%|████▍ | 4426/10000 [52:41<50:50, 1.83it/s, loss=0.0105, lr=1.68e-05, step=4425] Training: 44%|████▍ | 4426/10000 [52:41<50:50, 1.83it/s, loss=0.0062, lr=1.68e-05, step=4426] Training: 44%|████▍ | 4427/10000 [52:42<48:48, 1.90it/s, loss=0.0062, lr=1.68e-05, step=4426] Training: 44%|████▍ | 4427/10000 [52:42<48:48, 1.90it/s, loss=0.0060, lr=1.68e-05, step=4427] Training: 44%|████▍ | 4428/10000 [52:42<54:00, 1.72it/s, loss=0.0060, lr=1.68e-05, step=4427] Training: 44%|████▍ | 4428/10000 [52:42<54:00, 1.72it/s, loss=0.0147, lr=1.68e-05, step=4428] Training: 44%|████▍ | 4429/10000 [52:43<57:17, 1.62it/s, loss=0.0147, lr=1.68e-05, step=4428] Training: 44%|████▍ | 4429/10000 [52:43<57:17, 1.62it/s, loss=0.0260, lr=1.68e-05, step=4429]16:58:50.591 [I] step=4430 loss=0.0088 smoothed_loss=0.0127 lr=1.68e-05 grad_norm=0.4455 step_time=0.5344s data_time=0.0651s it/s=1.669 eta_to_10000=3337.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0080 grad_action_out_proj=0.0948 grad_shared_expert=0.3573 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4430/10000 [52:44<56:57, 1.63it/s, loss=0.0260, lr=1.68e-05, step=4429] Training: 44%|████▍ | 4430/10000 [52:44<56:57, 1.63it/s, loss=0.0088, lr=1.68e-05, step=4430] Training: 44%|████▍ | 4431/10000 [52:44<55:21, 1.68it/s, loss=0.0088, lr=1.68e-05, step=4430] Training: 44%|████▍ | 4431/10000 [52:44<55:21, 1.68it/s, loss=0.0249, lr=1.68e-05, step=4431] Training: 44%|████▍ | 4432/10000 [52:45<52:45, 1.76it/s, loss=0.0249, lr=1.68e-05, step=4431] Training: 44%|████▍ | 4432/10000 [52:45<52:45, 1.76it/s, loss=0.0116, lr=1.68e-05, step=4432] Training: 44%|████▍ | 4433/10000 [52:45<51:07, 1.82it/s, loss=0.0116, lr=1.68e-05, step=4432] Training: 44%|████▍ | 4433/10000 [52:45<51:07, 1.82it/s, loss=0.0148, lr=1.68e-05, step=4433] Training: 44%|████▍ | 4434/10000 [52:46<49:45, 1.86it/s, loss=0.0148, lr=1.68e-05, step=4433] Training: 44%|████▍ | 4434/10000 [52:46<49:45, 1.86it/s, loss=0.0120, lr=1.68e-05, step=4434] Training: 44%|████▍ | 4435/10000 [52:46<54:57, 1.69it/s, loss=0.0120, lr=1.68e-05, step=4434] Training: 44%|████▍ | 4435/10000 [52:46<54:57, 1.69it/s, loss=0.0050, lr=1.67e-05, step=4435] Training: 44%|████▍ | 4436/10000 [52:47<58:24, 1.59it/s, loss=0.0050, lr=1.67e-05, step=4435] Training: 44%|████▍ | 4436/10000 [52:47<58:24, 1.59it/s, loss=0.0127, lr=1.67e-05, step=4436] Training: 44%|████▍ | 4437/10000 [52:48<54:14, 1.71it/s, loss=0.0127, lr=1.67e-05, step=4436] Training: 44%|████▍ | 4437/10000 [52:48<54:14, 1.71it/s, loss=0.0243, lr=1.67e-05, step=4437] Training: 44%|████▍ | 4438/10000 [52:48<51:42, 1.79it/s, loss=0.0243, lr=1.67e-05, step=4437] Training: 44%|████▍ | 4438/10000 [52:48<51:42, 1.79it/s, loss=0.0182, lr=1.67e-05, step=4438] Training: 44%|████▍ | 4439/10000 [52:49<49:53, 1.86it/s, loss=0.0182, lr=1.67e-05, step=4438] Training: 44%|████▍ | 4439/10000 [52:49<49:53, 1.86it/s, loss=0.0054, lr=1.67e-05, step=4439]16:58:56.107 [I] step=4440 loss=0.0100 smoothed_loss=0.0131 lr=1.67e-05 grad_norm=0.4180 step_time=0.4939s data_time=0.0575s it/s=1.813 eta_to_10000=3066.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0059 grad_action_out_proj=0.0891 grad_shared_expert=0.3431 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4440/10000 [52:49<49:49, 1.86it/s, loss=0.0054, lr=1.67e-05, step=4439] Training: 44%|████▍ | 4440/10000 [52:49<49:49, 1.86it/s, loss=0.0100, lr=1.67e-05, step=4440] Training: 44%|████▍ | 4441/10000 [52:50<48:55, 1.89it/s, loss=0.0100, lr=1.67e-05, step=4440] Training: 44%|████▍ | 4441/10000 [52:50<48:55, 1.89it/s, loss=0.0095, lr=1.67e-05, step=4441] Training: 44%|████▍ | 4442/10000 [52:50<54:31, 1.70it/s, loss=0.0095, lr=1.67e-05, step=4441] Training: 44%|████▍ | 4442/10000 [52:50<54:31, 1.70it/s, loss=0.0205, lr=1.67e-05, step=4442] Training: 44%|████▍ | 4443/10000 [52:51<57:45, 1.60it/s, loss=0.0205, lr=1.67e-05, step=4442] Training: 44%|████▍ | 4443/10000 [52:51<57:45, 1.60it/s, loss=0.0039, lr=1.67e-05, step=4443] Training: 44%|████▍ | 4444/10000 [52:52<54:28, 1.70it/s, loss=0.0039, lr=1.67e-05, step=4443] Training: 44%|████▍ | 4444/10000 [52:52<54:28, 1.70it/s, loss=0.0312, lr=1.67e-05, step=4444] Training: 44%|████▍ | 4445/10000 [52:52<51:55, 1.78it/s, loss=0.0312, lr=1.67e-05, step=4444] Training: 44%|████▍ | 4445/10000 [52:52<51:55, 1.78it/s, loss=0.0096, lr=1.67e-05, step=4445] Training: 44%|████▍ | 4446/10000 [52:53<52:17, 1.77it/s, loss=0.0096, lr=1.67e-05, step=4445] Training: 44%|████▍ | 4446/10000 [52:53<52:17, 1.77it/s, loss=0.0071, lr=1.67e-05, step=4446] Training: 44%|████▍ | 4447/10000 [52:53<50:46, 1.82it/s, loss=0.0071, lr=1.67e-05, step=4446] Training: 44%|████▍ | 4447/10000 [52:53<50:46, 1.82it/s, loss=0.0089, lr=1.67e-05, step=4447] Training: 44%|████▍ | 4448/10000 [52:54<49:53, 1.85it/s, loss=0.0089, lr=1.67e-05, step=4447] Training: 44%|████▍ | 4448/10000 [52:54<49:53, 1.85it/s, loss=0.0322, lr=1.67e-05, step=4448] Training: 44%|████▍ | 4449/10000 [52:54<48:32, 1.91it/s, loss=0.0322, lr=1.67e-05, step=4448] Training: 44%|████▍ | 4449/10000 [52:54<48:32, 1.91it/s, loss=0.0140, lr=1.67e-05, step=4449]16:59:01.919 [I] step=4450 loss=0.0075 smoothed_loss=0.0140 lr=1.67e-05 grad_norm=0.4288 step_time=0.5169s data_time=0.0643s it/s=1.721 eta_to_10000=3224.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0733 grad_shared_expert=0.3633 (10775:train_pytorch.py:850) + Training: 44%|████▍ | 4450/10000 [52:55<55:28, 1.67it/s, loss=0.0140, lr=1.67e-05, step=4449] Training: 44%|████▍ | 4450/10000 [52:55<55:28, 1.67it/s, loss=0.0075, lr=1.67e-05, step=4450] Training: 45%|████▍ | 4451/10000 [52:55<52:20, 1.77it/s, loss=0.0075, lr=1.67e-05, step=4450] Training: 45%|████▍ | 4451/10000 [52:55<52:20, 1.77it/s, loss=0.0165, lr=1.67e-05, step=4451] Training: 45%|████▍ | 4452/10000 [52:56<49:58, 1.85it/s, loss=0.0165, lr=1.67e-05, step=4451] Training: 45%|████▍ | 4452/10000 [52:56<49:58, 1.85it/s, loss=0.0086, lr=1.67e-05, step=4452] Training: 45%|████▍ | 4453/10000 [52:56<48:22, 1.91it/s, loss=0.0086, lr=1.67e-05, step=4452] Training: 45%|████▍ | 4453/10000 [52:56<48:22, 1.91it/s, loss=0.0121, lr=1.67e-05, step=4453] Training: 45%|████▍ | 4454/10000 [52:57<47:47, 1.93it/s, loss=0.0121, lr=1.67e-05, step=4453] Training: 45%|████▍ | 4454/10000 [52:57<47:47, 1.93it/s, loss=0.0092, lr=1.67e-05, step=4454] Training: 45%|████▍ | 4455/10000 [52:57<47:39, 1.94it/s, loss=0.0092, lr=1.67e-05, step=4454] Training: 45%|████▍ | 4455/10000 [52:57<47:39, 1.94it/s, loss=0.0081, lr=1.67e-05, step=4455] Training: 45%|████▍ | 4456/10000 [52:58<46:43, 1.98it/s, loss=0.0081, lr=1.67e-05, step=4455] Training: 45%|████▍ | 4456/10000 [52:58<46:43, 1.98it/s, loss=0.0120, lr=1.67e-05, step=4456] Training: 45%|████▍ | 4457/10000 [52:59<52:06, 1.77it/s, loss=0.0120, lr=1.67e-05, step=4456] Training: 45%|████▍ | 4457/10000 [52:59<52:06, 1.77it/s, loss=0.0066, lr=1.67e-05, step=4457] Training: 45%|████▍ | 4458/10000 [52:59<56:29, 1.63it/s, loss=0.0066, lr=1.67e-05, step=4457] Training: 45%|████▍ | 4458/10000 [52:59<56:29, 1.63it/s, loss=0.0028, lr=1.67e-05, step=4458] Training: 45%|████▍ | 4459/10000 [53:00<52:59, 1.74it/s, loss=0.0028, lr=1.67e-05, step=4458] Training: 45%|████▍ | 4459/10000 [53:00<52:59, 1.74it/s, loss=0.0103, lr=1.67e-05, step=4459]16:59:07.385 [I] step=4460 loss=0.0056 smoothed_loss=0.0104 lr=1.67e-05 grad_norm=0.3608 step_time=0.4893s data_time=0.0576s it/s=1.830 eta_to_10000=3027.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0061 grad_action_out_proj=0.0691 grad_shared_expert=0.2755 (10775:train_pytorch.py:850) + Training: 45%|████▍ | 4460/10000 [53:00<53:57, 1.71it/s, loss=0.0103, lr=1.67e-05, step=4459] Training: 45%|████▍ | 4460/10000 [53:00<53:57, 1.71it/s, loss=0.0056, lr=1.67e-05, step=4460] Training: 45%|████▍ | 4461/10000 [53:01<51:02, 1.81it/s, loss=0.0056, lr=1.67e-05, step=4460] Training: 45%|████▍ | 4461/10000 [53:01<51:02, 1.81it/s, loss=0.0043, lr=1.67e-05, step=4461] Training: 45%|████▍ | 4462/10000 [53:01<49:32, 1.86it/s, loss=0.0043, lr=1.67e-05, step=4461] Training: 45%|████▍ | 4462/10000 [53:01<49:32, 1.86it/s, loss=0.0062, lr=1.67e-05, step=4462] Training: 45%|████▍ | 4463/10000 [53:02<48:59, 1.88it/s, loss=0.0062, lr=1.67e-05, step=4462] Training: 45%|████▍ | 4463/10000 [53:02<48:59, 1.88it/s, loss=0.0153, lr=1.66e-05, step=4463] Training: 45%|████▍ | 4464/10000 [53:02<47:50, 1.93it/s, loss=0.0153, lr=1.66e-05, step=4463] Training: 45%|████▍ | 4464/10000 [53:02<47:50, 1.93it/s, loss=0.0119, lr=1.66e-05, step=4464] Training: 45%|████▍ | 4465/10000 [53:03<54:07, 1.70it/s, loss=0.0119, lr=1.66e-05, step=4464] Training: 45%|████▍ | 4465/10000 [53:03<54:07, 1.70it/s, loss=0.0024, lr=1.66e-05, step=4465] Training: 45%|████▍ | 4466/10000 [53:04<51:59, 1.77it/s, loss=0.0024, lr=1.66e-05, step=4465] Training: 45%|████▍ | 4466/10000 [53:04<51:59, 1.77it/s, loss=0.0558, lr=1.66e-05, step=4466] Training: 45%|████▍ | 4467/10000 [53:04<50:05, 1.84it/s, loss=0.0558, lr=1.66e-05, step=4466] Training: 45%|████▍ | 4467/10000 [53:04<50:05, 1.84it/s, loss=0.0079, lr=1.66e-05, step=4467] Training: 45%|████▍ | 4468/10000 [53:05<48:52, 1.89it/s, loss=0.0079, lr=1.66e-05, step=4467] Training: 45%|████▍ | 4468/10000 [53:05<48:52, 1.89it/s, loss=0.0119, lr=1.66e-05, step=4468] Training: 45%|████▍ | 4469/10000 [53:05<48:08, 1.92it/s, loss=0.0119, lr=1.66e-05, step=4468] Training: 45%|████▍ | 4469/10000 [53:05<48:08, 1.92it/s, loss=0.0087, lr=1.66e-05, step=4469]16:59:12.672 [I] step=4470 loss=0.0164 smoothed_loss=0.0132 lr=1.66e-05 grad_norm=0.5093 step_time=0.4705s data_time=0.0579s it/s=1.892 eta_to_10000=2923.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0284 grad_action_out_proj=0.1837 grad_shared_expert=0.4914 (10775:train_pytorch.py:850) + Training: 45%|████▍ | 4470/10000 [53:06<48:48, 1.89it/s, loss=0.0087, lr=1.66e-05, step=4469] Training: 45%|████▍ | 4470/10000 [53:06<48:48, 1.89it/s, loss=0.0164, lr=1.66e-05, step=4470] Training: 45%|████▍ | 4471/10000 [53:06<47:46, 1.93it/s, loss=0.0164, lr=1.66e-05, step=4470] Training: 45%|████▍ | 4471/10000 [53:06<47:46, 1.93it/s, loss=0.0990, lr=1.66e-05, step=4471] Training: 45%|████▍ | 4472/10000 [53:07<53:22, 1.73it/s, loss=0.0990, lr=1.66e-05, step=4471] Training: 45%|████▍ | 4472/10000 [53:07<53:22, 1.73it/s, loss=0.0041, lr=1.66e-05, step=4472] Training: 45%|████▍ | 4473/10000 [53:07<50:56, 1.81it/s, loss=0.0041, lr=1.66e-05, step=4472] Training: 45%|████▍ | 4473/10000 [53:07<50:56, 1.81it/s, loss=0.0091, lr=1.66e-05, step=4473] Training: 45%|████▍ | 4474/10000 [53:08<49:13, 1.87it/s, loss=0.0091, lr=1.66e-05, step=4473] Training: 45%|████▍ | 4474/10000 [53:08<49:13, 1.87it/s, loss=0.0051, lr=1.66e-05, step=4474] Training: 45%|████▍ | 4475/10000 [53:08<48:09, 1.91it/s, loss=0.0051, lr=1.66e-05, step=4474] Training: 45%|████▍ | 4475/10000 [53:08<48:09, 1.91it/s, loss=0.0187, lr=1.66e-05, step=4475] Training: 45%|████▍ | 4476/10000 [53:09<47:35, 1.93it/s, loss=0.0187, lr=1.66e-05, step=4475] Training: 45%|████▍ | 4476/10000 [53:09<47:35, 1.93it/s, loss=0.0050, lr=1.66e-05, step=4476] Training: 45%|████▍ | 4477/10000 [53:09<47:05, 1.95it/s, loss=0.0050, lr=1.66e-05, step=4476] Training: 45%|████▍ | 4477/10000 [53:09<47:05, 1.95it/s, loss=0.0333, lr=1.66e-05, step=4477] Training: 45%|████▍ | 4478/10000 [53:10<46:39, 1.97it/s, loss=0.0333, lr=1.66e-05, step=4477] Training: 45%|████▍ | 4478/10000 [53:10<46:39, 1.97it/s, loss=0.0124, lr=1.66e-05, step=4478] Training: 45%|████▍ | 4479/10000 [53:11<52:40, 1.75it/s, loss=0.0124, lr=1.66e-05, step=4478] Training: 45%|████▍ | 4479/10000 [53:11<52:40, 1.75it/s, loss=0.0099, lr=1.66e-05, step=4479]16:59:18.102 [I] step=4480 loss=0.0120 smoothed_loss=0.0163 lr=1.66e-05 grad_norm=0.5707 step_time=0.4828s data_time=0.0602s it/s=1.842 eta_to_10000=2997.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.1205 grad_shared_expert=0.3816 (10775:train_pytorch.py:850) + Training: 45%|████▍ | 4480/10000 [53:11<51:03, 1.80it/s, loss=0.0099, lr=1.66e-05, step=4479] Training: 45%|████▍ | 4480/10000 [53:11<51:03, 1.80it/s, loss=0.0120, lr=1.66e-05, step=4480] Training: 45%|████▍ | 4481/10000 [53:12<48:47, 1.89it/s, loss=0.0120, lr=1.66e-05, step=4480] Training: 45%|████▍ | 4481/10000 [53:12<48:47, 1.89it/s, loss=0.0068, lr=1.66e-05, step=4481] Training: 45%|████▍ | 4482/10000 [53:12<47:34, 1.93it/s, loss=0.0068, lr=1.66e-05, step=4481] Training: 45%|████▍ | 4482/10000 [53:12<47:34, 1.93it/s, loss=0.0302, lr=1.66e-05, step=4482] Training: 45%|████▍ | 4483/10000 [53:13<47:09, 1.95it/s, loss=0.0302, lr=1.66e-05, step=4482] Training: 45%|████▍ | 4483/10000 [53:13<47:09, 1.95it/s, loss=0.0091, lr=1.66e-05, step=4483] Training: 45%|████▍ | 4484/10000 [53:13<46:38, 1.97it/s, loss=0.0091, lr=1.66e-05, step=4483] Training: 45%|████▍ | 4484/10000 [53:13<46:38, 1.97it/s, loss=0.0505, lr=1.66e-05, step=4484] Training: 45%|████▍ | 4485/10000 [53:14<47:53, 1.92it/s, loss=0.0505, lr=1.66e-05, step=4484] Training: 45%|████▍ | 4485/10000 [53:14<47:53, 1.92it/s, loss=0.0105, lr=1.66e-05, step=4485] Training: 45%|████▍ | 4486/10000 [53:14<53:54, 1.70it/s, loss=0.0105, lr=1.66e-05, step=4485] Training: 45%|████▍ | 4486/10000 [53:14<53:54, 1.70it/s, loss=0.0098, lr=1.66e-05, step=4486] Training: 45%|████▍ | 4487/10000 [53:15<51:32, 1.78it/s, loss=0.0098, lr=1.66e-05, step=4486] Training: 45%|████▍ | 4487/10000 [53:15<51:32, 1.78it/s, loss=0.0096, lr=1.66e-05, step=4487] Training: 45%|████▍ | 4488/10000 [53:15<49:56, 1.84it/s, loss=0.0096, lr=1.66e-05, step=4487] Training: 45%|████▍ | 4488/10000 [53:15<49:56, 1.84it/s, loss=0.0085, lr=1.66e-05, step=4488] Training: 45%|████▍ | 4489/10000 [53:16<48:21, 1.90it/s, loss=0.0085, lr=1.66e-05, step=4488] Training: 45%|████▍ | 4489/10000 [53:16<48:21, 1.90it/s, loss=0.0346, lr=1.66e-05, step=4489]16:59:23.358 [I] step=4490 loss=0.0251 smoothed_loss=0.0186 lr=1.66e-05 grad_norm=0.5481 step_time=0.4638s data_time=0.0617s it/s=1.903 eta_to_10000=2895.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0063 grad_action_out_proj=0.0822 grad_shared_expert=0.3769 (10775:train_pytorch.py:850) + Training: 45%|████▍ | 4490/10000 [53:16<48:03, 1.91it/s, loss=0.0346, lr=1.66e-05, step=4489] Training: 45%|████▍ | 4490/10000 [53:16<48:03, 1.91it/s, loss=0.0251, lr=1.66e-05, step=4490] Training: 45%|████▍ | 4491/10000 [53:17<47:06, 1.95it/s, loss=0.0251, lr=1.66e-05, step=4490] Training: 45%|████▍ | 4491/10000 [53:17<47:06, 1.95it/s, loss=0.0271, lr=1.65e-05, step=4491] Training: 45%|████▍ | 4492/10000 [53:17<46:08, 1.99it/s, loss=0.0271, lr=1.65e-05, step=4491] Training: 45%|████▍ | 4492/10000 [53:17<46:08, 1.99it/s, loss=0.0041, lr=1.65e-05, step=4492] Training: 45%|████▍ | 4493/10000 [53:18<51:35, 1.78it/s, loss=0.0041, lr=1.65e-05, step=4492] Training: 45%|████▍ | 4493/10000 [53:18<51:35, 1.78it/s, loss=0.0125, lr=1.65e-05, step=4493] Training: 45%|████▍ | 4494/10000 [53:19<49:29, 1.85it/s, loss=0.0125, lr=1.65e-05, step=4493] Training: 45%|████▍ | 4494/10000 [53:19<49:29, 1.85it/s, loss=0.0067, lr=1.65e-05, step=4494] Training: 45%|████▍ | 4495/10000 [53:19<48:28, 1.89it/s, loss=0.0067, lr=1.65e-05, step=4494] Training: 45%|████▍ | 4495/10000 [53:19<48:28, 1.89it/s, loss=0.0020, lr=1.65e-05, step=4495] Training: 45%|████▍ | 4496/10000 [53:20<47:07, 1.95it/s, loss=0.0020, lr=1.65e-05, step=4495] Training: 45%|████▍ | 4496/10000 [53:20<47:07, 1.95it/s, loss=0.0051, lr=1.65e-05, step=4496] Training: 45%|████▍ | 4497/10000 [53:20<46:03, 1.99it/s, loss=0.0051, lr=1.65e-05, step=4496] Training: 45%|████▍ | 4497/10000 [53:20<46:03, 1.99it/s, loss=0.0395, lr=1.65e-05, step=4497] Training: 45%|████▍ | 4498/10000 [53:21<45:41, 2.01it/s, loss=0.0395, lr=1.65e-05, step=4497] Training: 45%|████▍ | 4498/10000 [53:21<45:41, 2.01it/s, loss=0.0130, lr=1.65e-05, step=4498] Training: 45%|████▍ | 4499/10000 [53:21<51:19, 1.79it/s, loss=0.0130, lr=1.65e-05, step=4498] Training: 45%|████▍ | 4499/10000 [53:21<51:19, 1.79it/s, loss=0.0160, lr=1.65e-05, step=4499]16:59:28.692 [I] step=4500 loss=0.0159 smoothed_loss=0.0161 lr=1.65e-05 grad_norm=0.4226 step_time=0.4755s data_time=0.0579s it/s=1.875 eta_to_10000=2933.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1015 grad_shared_expert=0.3055 (10775:train_pytorch.py:850) + Training: 45%|████▌ | 4500/10000 [53:22<50:28, 1.82it/s, loss=0.0160, lr=1.65e-05, step=4499] Training: 45%|████▌ | 4500/10000 [53:22<50:28, 1.82it/s, loss=0.0159, lr=1.65e-05, step=4500] Training: 45%|████▌ | 4501/10000 [53:22<55:16, 1.66it/s, loss=0.0159, lr=1.65e-05, step=4500] Training: 45%|████▌ | 4501/10000 [53:22<55:16, 1.66it/s, loss=0.0129, lr=1.65e-05, step=4501] Training: 45%|████▌ | 4502/10000 [53:23<54:14, 1.69it/s, loss=0.0129, lr=1.65e-05, step=4501] Training: 45%|████▌ | 4502/10000 [53:23<54:14, 1.69it/s, loss=0.0088, lr=1.65e-05, step=4502] Training: 45%|████▌ | 4503/10000 [53:24<52:36, 1.74it/s, loss=0.0088, lr=1.65e-05, step=4502] Training: 45%|████▌ | 4503/10000 [53:24<52:36, 1.74it/s, loss=0.1412, lr=1.65e-05, step=4503] Training: 45%|████▌ | 4504/10000 [53:24<50:03, 1.83it/s, loss=0.1412, lr=1.65e-05, step=4503] Training: 45%|████▌ | 4504/10000 [53:24<50:03, 1.83it/s, loss=0.0211, lr=1.65e-05, step=4504] Training: 45%|████▌ | 4505/10000 [53:25<48:30, 1.89it/s, loss=0.0211, lr=1.65e-05, step=4504] Training: 45%|████▌ | 4505/10000 [53:25<48:30, 1.89it/s, loss=0.0032, lr=1.65e-05, step=4505] Training: 45%|████▌ | 4506/10000 [53:25<53:37, 1.71it/s, loss=0.0032, lr=1.65e-05, step=4505] Training: 45%|████▌ | 4506/10000 [53:25<53:37, 1.71it/s, loss=0.0044, lr=1.65e-05, step=4506] Training: 45%|████▌ | 4507/10000 [53:26<50:57, 1.80it/s, loss=0.0044, lr=1.65e-05, step=4506] Training: 45%|████▌ | 4507/10000 [53:26<50:57, 1.80it/s, loss=0.0082, lr=1.65e-05, step=4507] Training: 45%|████▌ | 4508/10000 [53:26<54:57, 1.67it/s, loss=0.0082, lr=1.65e-05, step=4507] Training: 45%|████▌ | 4508/10000 [53:26<54:57, 1.67it/s, loss=0.0264, lr=1.65e-05, step=4508] Training: 45%|████▌ | 4509/10000 [53:27<53:02, 1.73it/s, loss=0.0264, lr=1.65e-05, step=4508] Training: 45%|████▌ | 4509/10000 [53:27<53:02, 1.73it/s, loss=0.0127, lr=1.65e-05, step=4509]16:59:34.495 [I] step=4510 loss=0.0339 smoothed_loss=0.0221 lr=1.65e-05 grad_norm=0.4554 step_time=0.5215s data_time=0.0587s it/s=1.724 eta_to_10000=3185.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0102 grad_action_out_proj=0.1272 grad_shared_expert=0.3673 (10775:train_pytorch.py:850) + Training: 45%|████▌ | 4510/10000 [53:28<52:42, 1.74it/s, loss=0.0127, lr=1.65e-05, step=4509] Training: 45%|████▌ | 4510/10000 [53:28<52:42, 1.74it/s, loss=0.0339, lr=1.65e-05, step=4510] Training: 45%|████▌ | 4511/10000 [53:28<50:26, 1.81it/s, loss=0.0339, lr=1.65e-05, step=4510] Training: 45%|████▌ | 4511/10000 [53:28<50:26, 1.81it/s, loss=0.0054, lr=1.65e-05, step=4511] Training: 45%|████▌ | 4512/10000 [53:29<49:01, 1.87it/s, loss=0.0054, lr=1.65e-05, step=4511] Training: 45%|████▌ | 4512/10000 [53:29<49:01, 1.87it/s, loss=0.0523, lr=1.65e-05, step=4512] Training: 45%|████▌ | 4513/10000 [53:29<47:36, 1.92it/s, loss=0.0523, lr=1.65e-05, step=4512] Training: 45%|████▌ | 4513/10000 [53:29<47:36, 1.92it/s, loss=0.0029, lr=1.65e-05, step=4513] Training: 45%|████▌ | 4514/10000 [53:30<52:20, 1.75it/s, loss=0.0029, lr=1.65e-05, step=4513] Training: 45%|████▌ | 4514/10000 [53:30<52:20, 1.75it/s, loss=0.0077, lr=1.65e-05, step=4514] Training: 45%|████▌ | 4515/10000 [53:30<56:08, 1.63it/s, loss=0.0077, lr=1.65e-05, step=4514] Training: 45%|████▌ | 4515/10000 [53:30<56:08, 1.63it/s, loss=0.0091, lr=1.65e-05, step=4515] Training: 45%|████▌ | 4516/10000 [53:31<52:45, 1.73it/s, loss=0.0091, lr=1.65e-05, step=4515] Training: 45%|████▌ | 4516/10000 [53:31<52:45, 1.73it/s, loss=0.0061, lr=1.65e-05, step=4516] Training: 45%|████▌ | 4517/10000 [53:31<50:26, 1.81it/s, loss=0.0061, lr=1.65e-05, step=4516] Training: 45%|████▌ | 4517/10000 [53:31<50:26, 1.81it/s, loss=0.0147, lr=1.65e-05, step=4517] Training: 45%|████▌ | 4518/10000 [53:32<49:04, 1.86it/s, loss=0.0147, lr=1.65e-05, step=4517] Training: 45%|████▌ | 4518/10000 [53:32<49:04, 1.86it/s, loss=0.0137, lr=1.65e-05, step=4518] Training: 45%|████▌ | 4519/10000 [53:32<48:24, 1.89it/s, loss=0.0137, lr=1.65e-05, step=4518] Training: 45%|████▌ | 4519/10000 [53:32<48:24, 1.89it/s, loss=0.0128, lr=1.64e-05, step=4519]16:59:39.927 [I] step=4520 loss=0.0252 smoothed_loss=0.0175 lr=1.65e-05 grad_norm=0.5057 step_time=0.4810s data_time=0.0622s it/s=1.841 eta_to_10000=2976.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.1817 grad_shared_expert=0.4705 (10775:train_pytorch.py:850) + Training: 45%|████▌ | 4520/10000 [53:33<48:54, 1.87it/s, loss=0.0128, lr=1.64e-05, step=4519] Training: 45%|████▌ | 4520/10000 [53:33<48:54, 1.87it/s, loss=0.0252, lr=1.64e-05, step=4520] Training: 45%|████▌ | 4521/10000 [53:34<53:13, 1.72it/s, loss=0.0252, lr=1.64e-05, step=4520] Training: 45%|████▌ | 4521/10000 [53:34<53:13, 1.72it/s, loss=0.0103, lr=1.64e-05, step=4521] Training: 45%|████▌ | 4522/10000 [53:34<56:29, 1.62it/s, loss=0.0103, lr=1.64e-05, step=4521] Training: 45%|████▌ | 4522/10000 [53:34<56:29, 1.62it/s, loss=0.0474, lr=1.64e-05, step=4522] Training: 45%|████▌ | 4523/10000 [53:35<52:35, 1.74it/s, loss=0.0474, lr=1.64e-05, step=4522] Training: 45%|████▌ | 4523/10000 [53:35<52:35, 1.74it/s, loss=0.0076, lr=1.64e-05, step=4523] Training: 45%|████▌ | 4524/10000 [53:35<49:53, 1.83it/s, loss=0.0076, lr=1.64e-05, step=4523] Training: 45%|████▌ | 4524/10000 [53:35<49:53, 1.83it/s, loss=0.0155, lr=1.64e-05, step=4524] Training: 45%|████▌ | 4525/10000 [53:36<48:04, 1.90it/s, loss=0.0155, lr=1.64e-05, step=4524] Training: 45%|████▌ | 4525/10000 [53:36<48:04, 1.90it/s, loss=0.0059, lr=1.64e-05, step=4525] Training: 45%|████▌ | 4526/10000 [53:36<47:16, 1.93it/s, loss=0.0059, lr=1.64e-05, step=4525] Training: 45%|████▌ | 4526/10000 [53:36<47:16, 1.93it/s, loss=0.0081, lr=1.64e-05, step=4526] Training: 45%|████▌ | 4527/10000 [53:37<46:58, 1.94it/s, loss=0.0081, lr=1.64e-05, step=4526] Training: 45%|████▌ | 4527/10000 [53:37<46:58, 1.94it/s, loss=0.0062, lr=1.64e-05, step=4527] Training: 45%|████▌ | 4528/10000 [53:37<46:22, 1.97it/s, loss=0.0062, lr=1.64e-05, step=4527] Training: 45%|████▌ | 4528/10000 [53:37<46:22, 1.97it/s, loss=0.0485, lr=1.64e-05, step=4528] Training: 45%|████▌ | 4529/10000 [53:38<52:20, 1.74it/s, loss=0.0485, lr=1.64e-05, step=4528] Training: 45%|████▌ | 4529/10000 [53:38<52:20, 1.74it/s, loss=0.0014, lr=1.64e-05, step=4529]16:59:45.517 [I] step=4530 loss=0.0144 smoothed_loss=0.0166 lr=1.64e-05 grad_norm=0.4726 step_time=0.5001s data_time=0.0588s it/s=1.789 eta_to_10000=3057.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0140 grad_action_out_proj=0.1415 grad_shared_expert=0.4230 (10775:train_pytorch.py:850) + Training: 45%|████▌ | 4530/10000 [53:39<51:11, 1.78it/s, loss=0.0014, lr=1.64e-05, step=4529] Training: 45%|████▌ | 4530/10000 [53:39<51:11, 1.78it/s, loss=0.0144, lr=1.64e-05, step=4530] Training: 45%|████▌ | 4531/10000 [53:39<48:54, 1.86it/s, loss=0.0144, lr=1.64e-05, step=4530] Training: 45%|████▌ | 4531/10000 [53:39<48:54, 1.86it/s, loss=0.0232, lr=1.64e-05, step=4531] Training: 45%|████▌ | 4532/10000 [53:40<48:07, 1.89it/s, loss=0.0232, lr=1.64e-05, step=4531] Training: 45%|████▌ | 4532/10000 [53:40<48:07, 1.89it/s, loss=0.0123, lr=1.64e-05, step=4532] Training: 45%|████▌ | 4533/10000 [53:40<46:37, 1.95it/s, loss=0.0123, lr=1.64e-05, step=4532] Training: 45%|████▌ | 4533/10000 [53:40<46:37, 1.95it/s, loss=0.0083, lr=1.64e-05, step=4533] Training: 45%|████▌ | 4534/10000 [53:41<46:10, 1.97it/s, loss=0.0083, lr=1.64e-05, step=4533] Training: 45%|████▌ | 4534/10000 [53:41<46:10, 1.97it/s, loss=0.0251, lr=1.64e-05, step=4534] Training: 45%|████▌ | 4535/10000 [53:41<46:06, 1.98it/s, loss=0.0251, lr=1.64e-05, step=4534] Training: 45%|████▌ | 4535/10000 [53:41<46:06, 1.98it/s, loss=0.0077, lr=1.64e-05, step=4535] Training: 45%|████▌ | 4536/10000 [53:42<52:49, 1.72it/s, loss=0.0077, lr=1.64e-05, step=4535] Training: 45%|████▌ | 4536/10000 [53:42<52:49, 1.72it/s, loss=0.0027, lr=1.64e-05, step=4536] Training: 45%|████▌ | 4537/10000 [53:42<50:15, 1.81it/s, loss=0.0027, lr=1.64e-05, step=4536] Training: 45%|████▌ | 4537/10000 [53:42<50:15, 1.81it/s, loss=0.0219, lr=1.64e-05, step=4537] Training: 45%|████▌ | 4538/10000 [53:43<48:21, 1.88it/s, loss=0.0219, lr=1.64e-05, step=4537] Training: 45%|████▌ | 4538/10000 [53:43<48:21, 1.88it/s, loss=0.0061, lr=1.64e-05, step=4538] Training: 45%|████▌ | 4539/10000 [53:43<47:28, 1.92it/s, loss=0.0061, lr=1.64e-05, step=4538] Training: 45%|████▌ | 4539/10000 [53:43<47:28, 1.92it/s, loss=0.0117, lr=1.64e-05, step=4539]16:59:50.724 [I] step=4540 loss=0.0050 smoothed_loss=0.0132 lr=1.64e-05 grad_norm=0.4739 step_time=0.4640s data_time=0.0569s it/s=1.921 eta_to_10000=2842.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.0880 grad_shared_expert=0.2342 (10775:train_pytorch.py:850) + Training: 45%|████▌ | 4540/10000 [53:44<47:36, 1.91it/s, loss=0.0117, lr=1.64e-05, step=4539] Training: 45%|████▌ | 4540/10000 [53:44<47:36, 1.91it/s, loss=0.0050, lr=1.64e-05, step=4540] Training: 45%|████▌ | 4541/10000 [53:44<46:38, 1.95it/s, loss=0.0050, lr=1.64e-05, step=4540] Training: 45%|████▌ | 4541/10000 [53:44<46:38, 1.95it/s, loss=0.0257, lr=1.64e-05, step=4541] Training: 45%|████▌ | 4542/10000 [53:45<46:17, 1.97it/s, loss=0.0257, lr=1.64e-05, step=4541] Training: 45%|████▌ | 4542/10000 [53:45<46:17, 1.97it/s, loss=0.0580, lr=1.64e-05, step=4542] Training: 45%|████▌ | 4543/10000 [53:45<51:44, 1.76it/s, loss=0.0580, lr=1.64e-05, step=4542] Training: 45%|████▌ | 4543/10000 [53:45<51:44, 1.76it/s, loss=0.0186, lr=1.64e-05, step=4543] Training: 45%|████▌ | 4544/10000 [53:46<55:30, 1.64it/s, loss=0.0186, lr=1.64e-05, step=4543] Training: 45%|████▌ | 4544/10000 [53:46<55:30, 1.64it/s, loss=0.0122, lr=1.64e-05, step=4544] Training: 45%|████▌ | 4545/10000 [53:47<52:39, 1.73it/s, loss=0.0122, lr=1.64e-05, step=4544] Training: 45%|████▌ | 4545/10000 [53:47<52:39, 1.73it/s, loss=0.0161, lr=1.64e-05, step=4545] Training: 45%|████▌ | 4546/10000 [53:47<50:51, 1.79it/s, loss=0.0161, lr=1.64e-05, step=4545] Training: 45%|████▌ | 4546/10000 [53:47<50:51, 1.79it/s, loss=0.0082, lr=1.63e-05, step=4546] Training: 45%|████▌ | 4547/10000 [53:48<49:04, 1.85it/s, loss=0.0082, lr=1.63e-05, step=4546] Training: 45%|████▌ | 4547/10000 [53:48<49:04, 1.85it/s, loss=0.0133, lr=1.63e-05, step=4547] Training: 45%|████▌ | 4548/10000 [53:48<48:02, 1.89it/s, loss=0.0133, lr=1.63e-05, step=4547] Training: 45%|████▌ | 4548/10000 [53:48<48:02, 1.89it/s, loss=0.0052, lr=1.63e-05, step=4548] Training: 45%|████▌ | 4549/10000 [53:49<47:25, 1.92it/s, loss=0.0052, lr=1.63e-05, step=4548] Training: 45%|████▌ | 4549/10000 [53:49<47:25, 1.92it/s, loss=0.0180, lr=1.63e-05, step=4549]16:59:56.418 [I] step=4550 loss=0.0150 smoothed_loss=0.0156 lr=1.64e-05 grad_norm=0.4618 step_time=0.5132s data_time=0.0562s it/s=1.756 eta_to_10000=3102.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0073 grad_action_out_proj=0.0884 grad_shared_expert=0.3918 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4550/10000 [53:49<54:04, 1.68it/s, loss=0.0180, lr=1.63e-05, step=4549] Training: 46%|████▌ | 4550/10000 [53:49<54:04, 1.68it/s, loss=0.0150, lr=1.63e-05, step=4550] Training: 46%|████▌ | 4551/10000 [53:50<57:39, 1.57it/s, loss=0.0150, lr=1.63e-05, step=4550] Training: 46%|████▌ | 4551/10000 [53:50<57:39, 1.57it/s, loss=0.0136, lr=1.63e-05, step=4551] Training: 46%|████▌ | 4552/10000 [53:51<54:00, 1.68it/s, loss=0.0136, lr=1.63e-05, step=4551] Training: 46%|████▌ | 4552/10000 [53:51<54:00, 1.68it/s, loss=0.0050, lr=1.63e-05, step=4552] Training: 46%|████▌ | 4553/10000 [53:51<51:33, 1.76it/s, loss=0.0050, lr=1.63e-05, step=4552] Training: 46%|████▌ | 4553/10000 [53:51<51:33, 1.76it/s, loss=0.0045, lr=1.63e-05, step=4553] Training: 46%|████▌ | 4554/10000 [53:52<49:18, 1.84it/s, loss=0.0045, lr=1.63e-05, step=4553] Training: 46%|████▌ | 4554/10000 [53:52<49:18, 1.84it/s, loss=0.0070, lr=1.63e-05, step=4554] Training: 46%|████▌ | 4555/10000 [53:52<48:02, 1.89it/s, loss=0.0070, lr=1.63e-05, step=4554] Training: 46%|████▌ | 4555/10000 [53:52<48:02, 1.89it/s, loss=0.0040, lr=1.63e-05, step=4555] Training: 46%|████▌ | 4556/10000 [53:53<47:00, 1.93it/s, loss=0.0040, lr=1.63e-05, step=4555] Training: 46%|████▌ | 4556/10000 [53:53<47:00, 1.93it/s, loss=0.0082, lr=1.63e-05, step=4556] Training: 46%|████▌ | 4557/10000 [53:53<52:29, 1.73it/s, loss=0.0082, lr=1.63e-05, step=4556] Training: 46%|████▌ | 4557/10000 [53:53<52:29, 1.73it/s, loss=0.0054, lr=1.63e-05, step=4557] Training: 46%|████▌ | 4558/10000 [53:54<1:00:27, 1.50it/s, loss=0.0054, lr=1.63e-05, step=4557] Training: 46%|████▌ | 4558/10000 [53:54<1:00:27, 1.50it/s, loss=0.0181, lr=1.63e-05, step=4558] Training: 46%|████▌ | 4559/10000 [53:55<56:36, 1.60it/s, loss=0.0181, lr=1.63e-05, step=4558] Training: 46%|████▌ | 4559/10000 [53:55<56:36, 1.60it/s, loss=0.0069, lr=1.63e-05, step=4559]17:00:02.353 [I] step=4560 loss=0.0439 smoothed_loss=0.0144 lr=1.63e-05 grad_norm=0.3797 step_time=0.5281s data_time=0.0654s it/s=1.685 eta_to_10000=3228.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.1668 grad_shared_expert=0.4854 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4560/10000 [53:55<56:12, 1.61it/s, loss=0.0069, lr=1.63e-05, step=4559] Training: 46%|████▌ | 4560/10000 [53:55<56:12, 1.61it/s, loss=0.0439, lr=1.63e-05, step=4560] Training: 46%|████▌ | 4561/10000 [53:56<53:20, 1.70it/s, loss=0.0439, lr=1.63e-05, step=4560] Training: 46%|████▌ | 4561/10000 [53:56<53:20, 1.70it/s, loss=0.0094, lr=1.63e-05, step=4561] Training: 46%|████▌ | 4562/10000 [53:56<50:55, 1.78it/s, loss=0.0094, lr=1.63e-05, step=4561] Training: 46%|████▌ | 4562/10000 [53:56<50:55, 1.78it/s, loss=0.0090, lr=1.63e-05, step=4562] Training: 46%|████▌ | 4563/10000 [53:57<49:17, 1.84it/s, loss=0.0090, lr=1.63e-05, step=4562] Training: 46%|████▌ | 4563/10000 [53:57<49:17, 1.84it/s, loss=0.0046, lr=1.63e-05, step=4563] Training: 46%|████▌ | 4564/10000 [53:58<53:49, 1.68it/s, loss=0.0046, lr=1.63e-05, step=4563] Training: 46%|████▌ | 4564/10000 [53:58<53:49, 1.68it/s, loss=0.0076, lr=1.63e-05, step=4564] Training: 46%|████▌ | 4565/10000 [53:58<51:08, 1.77it/s, loss=0.0076, lr=1.63e-05, step=4564] Training: 46%|████▌ | 4565/10000 [53:58<51:08, 1.77it/s, loss=0.0081, lr=1.63e-05, step=4565] Training: 46%|████▌ | 4566/10000 [53:59<56:00, 1.62it/s, loss=0.0081, lr=1.63e-05, step=4565] Training: 46%|████▌ | 4566/10000 [53:59<56:00, 1.62it/s, loss=0.0290, lr=1.63e-05, step=4566] Training: 46%|████▌ | 4567/10000 [53:59<53:14, 1.70it/s, loss=0.0290, lr=1.63e-05, step=4566] Training: 46%|████▌ | 4567/10000 [53:59<53:14, 1.70it/s, loss=0.0092, lr=1.63e-05, step=4567] Training: 46%|████▌ | 4568/10000 [54:00<50:24, 1.80it/s, loss=0.0092, lr=1.63e-05, step=4567] Training: 46%|████▌ | 4568/10000 [54:00<50:24, 1.80it/s, loss=0.0100, lr=1.63e-05, step=4568] Training: 46%|████▌ | 4569/10000 [54:00<48:08, 1.88it/s, loss=0.0100, lr=1.63e-05, step=4568] Training: 46%|████▌ | 4569/10000 [54:00<48:08, 1.88it/s, loss=0.0036, lr=1.63e-05, step=4569]17:00:07.808 [I] step=4570 loss=0.0041 smoothed_loss=0.0110 lr=1.63e-05 grad_norm=0.4593 step_time=0.4878s data_time=0.0576s it/s=1.834 eta_to_10000=2961.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.1109 grad_shared_expert=0.2934 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4570/10000 [54:01<47:35, 1.90it/s, loss=0.0036, lr=1.63e-05, step=4569] Training: 46%|████▌ | 4570/10000 [54:01<47:35, 1.90it/s, loss=0.0041, lr=1.63e-05, step=4570] Training: 46%|████▌ | 4571/10000 [54:02<52:15, 1.73it/s, loss=0.0041, lr=1.63e-05, step=4570] Training: 46%|████▌ | 4571/10000 [54:02<52:15, 1.73it/s, loss=0.0108, lr=1.63e-05, step=4571] Training: 46%|████▌ | 4572/10000 [54:02<49:23, 1.83it/s, loss=0.0108, lr=1.63e-05, step=4571] Training: 46%|████▌ | 4572/10000 [54:02<49:23, 1.83it/s, loss=0.0050, lr=1.63e-05, step=4572] Training: 46%|████▌ | 4573/10000 [54:03<53:57, 1.68it/s, loss=0.0050, lr=1.63e-05, step=4572] Training: 46%|████▌ | 4573/10000 [54:03<53:57, 1.68it/s, loss=0.0029, lr=1.63e-05, step=4573] Training: 46%|████▌ | 4574/10000 [54:03<51:27, 1.76it/s, loss=0.0029, lr=1.63e-05, step=4573] Training: 46%|████▌ | 4574/10000 [54:03<51:27, 1.76it/s, loss=0.0157, lr=1.62e-05, step=4574] Training: 46%|████▌ | 4575/10000 [54:04<49:23, 1.83it/s, loss=0.0157, lr=1.62e-05, step=4574] Training: 46%|████▌ | 4575/10000 [54:04<49:23, 1.83it/s, loss=0.0038, lr=1.62e-05, step=4575] Training: 46%|████▌ | 4576/10000 [54:04<47:44, 1.89it/s, loss=0.0038, lr=1.62e-05, step=4575] Training: 46%|████▌ | 4576/10000 [54:04<47:44, 1.89it/s, loss=0.0138, lr=1.62e-05, step=4576] Training: 46%|████▌ | 4577/10000 [54:05<46:38, 1.94it/s, loss=0.0138, lr=1.62e-05, step=4576] Training: 46%|████▌ | 4577/10000 [54:05<46:38, 1.94it/s, loss=0.0029, lr=1.62e-05, step=4577] Training: 46%|████▌ | 4578/10000 [54:05<51:48, 1.74it/s, loss=0.0029, lr=1.62e-05, step=4577] Training: 46%|████▌ | 4578/10000 [54:05<51:48, 1.74it/s, loss=0.0074, lr=1.62e-05, step=4578] Training: 46%|████▌ | 4579/10000 [54:06<55:53, 1.62it/s, loss=0.0074, lr=1.62e-05, step=4578] Training: 46%|████▌ | 4579/10000 [54:06<55:53, 1.62it/s, loss=0.0104, lr=1.62e-05, step=4579]17:00:13.614 [I] step=4580 loss=0.0328 smoothed_loss=0.0116 lr=1.62e-05 grad_norm=0.4783 step_time=0.5241s data_time=0.0566s it/s=1.723 eta_to_10000=3146.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0156 grad_action_out_proj=0.1648 grad_shared_expert=0.4354 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4580/10000 [54:07<53:09, 1.70it/s, loss=0.0104, lr=1.62e-05, step=4579] Training: 46%|████▌ | 4580/10000 [54:07<53:09, 1.70it/s, loss=0.0328, lr=1.62e-05, step=4580] Training: 46%|████▌ | 4581/10000 [54:07<50:06, 1.80it/s, loss=0.0328, lr=1.62e-05, step=4580] Training: 46%|████▌ | 4581/10000 [54:07<50:06, 1.80it/s, loss=0.0044, lr=1.62e-05, step=4581] Training: 46%|████▌ | 4582/10000 [54:08<48:27, 1.86it/s, loss=0.0044, lr=1.62e-05, step=4581] Training: 46%|████▌ | 4582/10000 [54:08<48:27, 1.86it/s, loss=0.0410, lr=1.62e-05, step=4582] Training: 46%|████▌ | 4583/10000 [54:08<47:16, 1.91it/s, loss=0.0410, lr=1.62e-05, step=4582] Training: 46%|████▌ | 4583/10000 [54:08<47:16, 1.91it/s, loss=0.0144, lr=1.62e-05, step=4583] Training: 46%|████▌ | 4584/10000 [54:09<46:16, 1.95it/s, loss=0.0144, lr=1.62e-05, step=4583] Training: 46%|████▌ | 4584/10000 [54:09<46:16, 1.95it/s, loss=0.0287, lr=1.62e-05, step=4584] Training: 46%|████▌ | 4585/10000 [54:09<51:59, 1.74it/s, loss=0.0287, lr=1.62e-05, step=4584] Training: 46%|████▌ | 4585/10000 [54:09<51:59, 1.74it/s, loss=0.0184, lr=1.62e-05, step=4585] Training: 46%|████▌ | 4586/10000 [54:10<49:24, 1.83it/s, loss=0.0184, lr=1.62e-05, step=4585] Training: 46%|████▌ | 4586/10000 [54:10<49:24, 1.83it/s, loss=0.0166, lr=1.62e-05, step=4586] Training: 46%|████▌ | 4587/10000 [54:11<53:47, 1.68it/s, loss=0.0166, lr=1.62e-05, step=4586] Training: 46%|████▌ | 4587/10000 [54:11<53:47, 1.68it/s, loss=0.0038, lr=1.62e-05, step=4587] Training: 46%|████▌ | 4588/10000 [54:11<50:47, 1.78it/s, loss=0.0038, lr=1.62e-05, step=4587] Training: 46%|████▌ | 4588/10000 [54:11<50:47, 1.78it/s, loss=0.0149, lr=1.62e-05, step=4588] Training: 46%|████▌ | 4589/10000 [54:12<49:08, 1.84it/s, loss=0.0149, lr=1.62e-05, step=4588] Training: 46%|████▌ | 4589/10000 [54:12<49:08, 1.84it/s, loss=0.0122, lr=1.62e-05, step=4589]17:00:19.011 [I] step=4590 loss=0.0116 smoothed_loss=0.0141 lr=1.62e-05 grad_norm=0.4232 step_time=0.4814s data_time=0.0583s it/s=1.853 eta_to_10000=2919.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0157 grad_action_out_proj=0.1275 grad_shared_expert=0.4577 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4590/10000 [54:12<49:04, 1.84it/s, loss=0.0122, lr=1.62e-05, step=4589] Training: 46%|████▌ | 4590/10000 [54:12<49:04, 1.84it/s, loss=0.0116, lr=1.62e-05, step=4590] Training: 46%|████▌ | 4591/10000 [54:13<47:38, 1.89it/s, loss=0.0116, lr=1.62e-05, step=4590] Training: 46%|████▌ | 4591/10000 [54:13<47:38, 1.89it/s, loss=0.0152, lr=1.62e-05, step=4591] Training: 46%|████▌ | 4592/10000 [54:13<48:14, 1.87it/s, loss=0.0152, lr=1.62e-05, step=4591] Training: 46%|████▌ | 4592/10000 [54:13<48:14, 1.87it/s, loss=0.0421, lr=1.62e-05, step=4592] Training: 46%|████▌ | 4593/10000 [54:14<53:45, 1.68it/s, loss=0.0421, lr=1.62e-05, step=4592] Training: 46%|████▌ | 4593/10000 [54:14<53:45, 1.68it/s, loss=0.0171, lr=1.62e-05, step=4593] Training: 46%|████▌ | 4594/10000 [54:15<55:58, 1.61it/s, loss=0.0171, lr=1.62e-05, step=4593] Training: 46%|████▌ | 4594/10000 [54:15<55:58, 1.61it/s, loss=0.0235, lr=1.62e-05, step=4594] Training: 46%|████▌ | 4595/10000 [54:15<52:39, 1.71it/s, loss=0.0235, lr=1.62e-05, step=4594] Training: 46%|████▌ | 4595/10000 [54:15<52:39, 1.71it/s, loss=0.0062, lr=1.62e-05, step=4595] Training: 46%|████▌ | 4596/10000 [54:16<50:35, 1.78it/s, loss=0.0062, lr=1.62e-05, step=4595] Training: 46%|████▌ | 4596/10000 [54:16<50:35, 1.78it/s, loss=0.0125, lr=1.62e-05, step=4596] Training: 46%|████▌ | 4597/10000 [54:16<49:05, 1.83it/s, loss=0.0125, lr=1.62e-05, step=4596] Training: 46%|████▌ | 4597/10000 [54:16<49:05, 1.83it/s, loss=0.0108, lr=1.62e-05, step=4597] Training: 46%|████▌ | 4598/10000 [54:17<47:37, 1.89it/s, loss=0.0108, lr=1.62e-05, step=4597] Training: 46%|████▌ | 4598/10000 [54:17<47:37, 1.89it/s, loss=0.0077, lr=1.62e-05, step=4598] Training: 46%|████▌ | 4599/10000 [54:17<46:43, 1.93it/s, loss=0.0077, lr=1.62e-05, step=4598] Training: 46%|████▌ | 4599/10000 [54:17<46:43, 1.93it/s, loss=0.0032, lr=1.62e-05, step=4599]17:00:24.707 [I] step=4600 loss=0.0089 smoothed_loss=0.0132 lr=1.62e-05 grad_norm=0.4902 step_time=0.5098s data_time=0.0598s it/s=1.756 eta_to_10000=3075.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.1014 grad_shared_expert=0.3484 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4600/10000 [54:18<52:31, 1.71it/s, loss=0.0032, lr=1.62e-05, step=4599] Training: 46%|████▌ | 4600/10000 [54:18<52:31, 1.71it/s, loss=0.0089, lr=1.62e-05, step=4600] Training: 46%|████▌ | 4601/10000 [54:18<54:54, 1.64it/s, loss=0.0089, lr=1.62e-05, step=4600] Training: 46%|████▌ | 4601/10000 [54:18<54:54, 1.64it/s, loss=0.0125, lr=1.61e-05, step=4601] Training: 46%|████▌ | 4602/10000 [54:19<57:12, 1.57it/s, loss=0.0125, lr=1.61e-05, step=4601] Training: 46%|████▌ | 4602/10000 [54:19<57:12, 1.57it/s, loss=0.0317, lr=1.61e-05, step=4602] Training: 46%|████▌ | 4603/10000 [54:20<1:00:23, 1.49it/s, loss=0.0317, lr=1.61e-05, step=4602] Training: 46%|████▌ | 4603/10000 [54:20<1:00:23, 1.49it/s, loss=0.0119, lr=1.61e-05, step=4603] Training: 46%|████▌ | 4604/10000 [54:20<57:48, 1.56it/s, loss=0.0119, lr=1.61e-05, step=4603] Training: 46%|████▌ | 4604/10000 [54:20<57:48, 1.56it/s, loss=0.0248, lr=1.61e-05, step=4604] Training: 46%|████▌ | 4605/10000 [54:21<57:36, 1.56it/s, loss=0.0248, lr=1.61e-05, step=4604] Training: 46%|████▌ | 4605/10000 [54:21<57:36, 1.56it/s, loss=0.0164, lr=1.61e-05, step=4605] Training: 46%|████▌ | 4606/10000 [54:22<54:14, 1.66it/s, loss=0.0164, lr=1.61e-05, step=4605] Training: 46%|████▌ | 4606/10000 [54:22<54:14, 1.66it/s, loss=0.0139, lr=1.61e-05, step=4606] Training: 46%|████▌ | 4607/10000 [54:22<54:02, 1.66it/s, loss=0.0139, lr=1.61e-05, step=4606] Training: 46%|████▌ | 4607/10000 [54:22<54:02, 1.66it/s, loss=0.0134, lr=1.61e-05, step=4607] Training: 46%|████▌ | 4608/10000 [54:23<58:10, 1.54it/s, loss=0.0134, lr=1.61e-05, step=4607] Training: 46%|████▌ | 4608/10000 [54:23<58:10, 1.54it/s, loss=0.0090, lr=1.61e-05, step=4608] Training: 46%|████▌ | 4609/10000 [54:24<55:02, 1.63it/s, loss=0.0090, lr=1.61e-05, step=4608] Training: 46%|████▌ | 4609/10000 [54:24<55:02, 1.63it/s, loss=0.0018, lr=1.61e-05, step=4609]17:00:30.985 [I] step=4610 loss=0.0111 smoothed_loss=0.0132 lr=1.61e-05 grad_norm=0.4381 step_time=0.5351s data_time=0.0927s it/s=1.593 eta_to_10000=3383.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0262 grad_action_out_proj=0.1617 grad_shared_expert=0.3987 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4610/10000 [54:24<53:12, 1.69it/s, loss=0.0018, lr=1.61e-05, step=4609] Training: 46%|████▌ | 4610/10000 [54:24<53:12, 1.69it/s, loss=0.0111, lr=1.61e-05, step=4610] Training: 46%|████▌ | 4611/10000 [54:25<50:29, 1.78it/s, loss=0.0111, lr=1.61e-05, step=4610] Training: 46%|████▌ | 4611/10000 [54:25<50:29, 1.78it/s, loss=0.0088, lr=1.61e-05, step=4611] Training: 46%|████▌ | 4612/10000 [54:25<48:41, 1.84it/s, loss=0.0088, lr=1.61e-05, step=4611] Training: 46%|████▌ | 4612/10000 [54:25<48:41, 1.84it/s, loss=0.0130, lr=1.61e-05, step=4612] Training: 46%|████▌ | 4613/10000 [54:26<47:11, 1.90it/s, loss=0.0130, lr=1.61e-05, step=4612] Training: 46%|████▌ | 4613/10000 [54:26<47:11, 1.90it/s, loss=0.0066, lr=1.61e-05, step=4613] Training: 46%|████▌ | 4614/10000 [54:26<46:30, 1.93it/s, loss=0.0066, lr=1.61e-05, step=4613] Training: 46%|████▌ | 4614/10000 [54:26<46:30, 1.93it/s, loss=0.0268, lr=1.61e-05, step=4614] Training: 46%|████▌ | 4615/10000 [54:27<57:56, 1.55it/s, loss=0.0268, lr=1.61e-05, step=4614] Training: 46%|████▌ | 4615/10000 [54:27<57:56, 1.55it/s, loss=0.0148, lr=1.61e-05, step=4615] Training: 46%|████▌ | 4616/10000 [54:27<53:27, 1.68it/s, loss=0.0148, lr=1.61e-05, step=4615] Training: 46%|████▌ | 4616/10000 [54:27<53:27, 1.68it/s, loss=0.1122, lr=1.61e-05, step=4616] Training: 46%|████▌ | 4617/10000 [54:28<50:42, 1.77it/s, loss=0.1122, lr=1.61e-05, step=4616] Training: 46%|████▌ | 4617/10000 [54:28<50:42, 1.77it/s, loss=0.0291, lr=1.61e-05, step=4617] Training: 46%|████▌ | 4618/10000 [54:28<48:47, 1.84it/s, loss=0.0291, lr=1.61e-05, step=4617] Training: 46%|████▌ | 4618/10000 [54:28<48:47, 1.84it/s, loss=0.0067, lr=1.61e-05, step=4618] Training: 46%|████▌ | 4619/10000 [54:29<47:31, 1.89it/s, loss=0.0067, lr=1.61e-05, step=4618] Training: 46%|████▌ | 4619/10000 [54:29<47:31, 1.89it/s, loss=0.0200, lr=1.61e-05, step=4619]17:00:36.407 [I] step=4620 loss=0.0188 smoothed_loss=0.0218 lr=1.61e-05 grad_norm=0.4860 step_time=0.4763s data_time=0.0658s it/s=1.845 eta_to_10000=2916.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0215 grad_action_out_proj=0.1691 grad_shared_expert=0.4296 (10775:train_pytorch.py:850) + Training: 46%|████▌ | 4620/10000 [54:29<47:44, 1.88it/s, loss=0.0200, lr=1.61e-05, step=4619] Training: 46%|████▌ | 4620/10000 [54:29<47:44, 1.88it/s, loss=0.0188, lr=1.61e-05, step=4620] Training: 46%|████▌ | 4621/10000 [54:30<47:11, 1.90it/s, loss=0.0188, lr=1.61e-05, step=4620] Training: 46%|████▌ | 4621/10000 [54:30<47:11, 1.90it/s, loss=0.0102, lr=1.61e-05, step=4621] Training: 46%|████▌ | 4622/10000 [54:31<56:46, 1.58it/s, loss=0.0102, lr=1.61e-05, step=4621] Training: 46%|████▌ | 4622/10000 [54:31<56:46, 1.58it/s, loss=0.0070, lr=1.61e-05, step=4622] Training: 46%|████▌ | 4623/10000 [54:32<1:04:55, 1.38it/s, loss=0.0070, lr=1.61e-05, step=4622] Training: 46%|████▌ | 4623/10000 [54:32<1:04:55, 1.38it/s, loss=0.0093, lr=1.61e-05, step=4623] Training: 46%|████▌ | 4624/10000 [54:32<58:27, 1.53it/s, loss=0.0093, lr=1.61e-05, step=4623] Training: 46%|████▌ | 4624/10000 [54:32<58:27, 1.53it/s, loss=0.0114, lr=1.61e-05, step=4624] Training: 46%|████▋ | 4625/10000 [54:33<53:39, 1.67it/s, loss=0.0114, lr=1.61e-05, step=4624] Training: 46%|████▋ | 4625/10000 [54:33<53:39, 1.67it/s, loss=0.0126, lr=1.61e-05, step=4625] Training: 46%|████▋ | 4626/10000 [54:33<56:12, 1.59it/s, loss=0.0126, lr=1.61e-05, step=4625] Training: 46%|████▋ | 4626/10000 [54:33<56:12, 1.59it/s, loss=0.0093, lr=1.61e-05, step=4626] Training: 46%|████▋ | 4627/10000 [54:34<52:35, 1.70it/s, loss=0.0093, lr=1.61e-05, step=4626] Training: 46%|████▋ | 4627/10000 [54:34<52:35, 1.70it/s, loss=0.0138, lr=1.61e-05, step=4627] Training: 46%|████▋ | 4628/10000 [54:34<49:58, 1.79it/s, loss=0.0138, lr=1.61e-05, step=4627] Training: 46%|████▋ | 4628/10000 [54:34<49:58, 1.79it/s, loss=0.0078, lr=1.61e-05, step=4628] Training: 46%|████▋ | 4629/10000 [54:35<50:06, 1.79it/s, loss=0.0078, lr=1.61e-05, step=4628] Training: 46%|████▋ | 4629/10000 [54:35<50:06, 1.79it/s, loss=0.0049, lr=1.60e-05, step=4629]17:00:42.792 [I] step=4630 loss=0.0208 smoothed_loss=0.0149 lr=1.61e-05 grad_norm=0.4650 step_time=0.5607s data_time=0.0777s it/s=1.566 eta_to_10000=3428.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0155 grad_action_out_proj=0.1197 grad_shared_expert=0.4260 (10775:train_pytorch.py:850) + Training: 46%|████▋ | 4630/10000 [54:36<57:57, 1.54it/s, loss=0.0049, lr=1.60e-05, step=4629] Training: 46%|████▋ | 4630/10000 [54:36<57:57, 1.54it/s, loss=0.0208, lr=1.60e-05, step=4630] Training: 46%|████▋ | 4631/10000 [54:36<53:34, 1.67it/s, loss=0.0208, lr=1.60e-05, step=4630] Training: 46%|████▋ | 4631/10000 [54:36<53:34, 1.67it/s, loss=0.0089, lr=1.60e-05, step=4631] Training: 46%|████▋ | 4632/10000 [54:37<54:58, 1.63it/s, loss=0.0089, lr=1.60e-05, step=4631] Training: 46%|████▋ | 4632/10000 [54:37<54:58, 1.63it/s, loss=0.0341, lr=1.60e-05, step=4632] Training: 46%|████▋ | 4633/10000 [54:37<52:02, 1.72it/s, loss=0.0341, lr=1.60e-05, step=4632] Training: 46%|████▋ | 4633/10000 [54:37<52:02, 1.72it/s, loss=0.0084, lr=1.60e-05, step=4633] Training: 46%|████▋ | 4634/10000 [54:38<50:38, 1.77it/s, loss=0.0084, lr=1.60e-05, step=4633] Training: 46%|████▋ | 4634/10000 [54:38<50:38, 1.77it/s, loss=0.0186, lr=1.60e-05, step=4634] Training: 46%|████▋ | 4635/10000 [54:39<49:21, 1.81it/s, loss=0.0186, lr=1.60e-05, step=4634] Training: 46%|████▋ | 4635/10000 [54:39<49:21, 1.81it/s, loss=0.0055, lr=1.60e-05, step=4635] Training: 46%|████▋ | 4636/10000 [54:39<48:51, 1.83it/s, loss=0.0055, lr=1.60e-05, step=4635] Training: 46%|████▋ | 4636/10000 [54:39<48:51, 1.83it/s, loss=0.0072, lr=1.60e-05, step=4636] Training: 46%|████▋ | 4637/10000 [54:40<54:05, 1.65it/s, loss=0.0072, lr=1.60e-05, step=4636] Training: 46%|████▋ | 4637/10000 [54:40<54:05, 1.65it/s, loss=0.0213, lr=1.60e-05, step=4637] Training: 46%|████▋ | 4638/10000 [54:40<51:37, 1.73it/s, loss=0.0213, lr=1.60e-05, step=4637] Training: 46%|████▋ | 4638/10000 [54:40<51:37, 1.73it/s, loss=0.0193, lr=1.60e-05, step=4638] Training: 46%|████▋ | 4639/10000 [54:41<51:13, 1.74it/s, loss=0.0193, lr=1.60e-05, step=4638] Training: 46%|████▋ | 4639/10000 [54:41<51:13, 1.74it/s, loss=0.0092, lr=1.60e-05, step=4639]17:00:48.356 [I] step=4640 loss=0.0016 smoothed_loss=0.0133 lr=1.60e-05 grad_norm=0.4992 step_time=0.4878s data_time=0.0686s it/s=1.797 eta_to_10000=2982.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0151 grad_action_out_proj=0.1041 grad_shared_expert=0.5554 (10775:train_pytorch.py:850) + Training: 46%|████▋ | 4640/10000 [54:41<49:50, 1.79it/s, loss=0.0092, lr=1.60e-05, step=4639] Training: 46%|████▋ | 4640/10000 [54:41<49:50, 1.79it/s, loss=0.0016, lr=1.60e-05, step=4640] Training: 46%|████▋ | 4641/10000 [54:42<48:11, 1.85it/s, loss=0.0016, lr=1.60e-05, step=4640] Training: 46%|████▋ | 4641/10000 [54:42<48:11, 1.85it/s, loss=0.0399, lr=1.60e-05, step=4641] Training: 46%|████▋ | 4642/10000 [54:43<54:24, 1.64it/s, loss=0.0399, lr=1.60e-05, step=4641] Training: 46%|████▋ | 4642/10000 [54:43<54:24, 1.64it/s, loss=0.0108, lr=1.60e-05, step=4642] Training: 46%|████▋ | 4643/10000 [54:43<50:52, 1.76it/s, loss=0.0108, lr=1.60e-05, step=4642] Training: 46%|████▋ | 4643/10000 [54:43<50:52, 1.76it/s, loss=0.0204, lr=1.60e-05, step=4643] Training: 46%|████▋ | 4644/10000 [54:44<56:12, 1.59it/s, loss=0.0204, lr=1.60e-05, step=4643] Training: 46%|████▋ | 4644/10000 [54:44<56:12, 1.59it/s, loss=0.0044, lr=1.60e-05, step=4644] Training: 46%|████▋ | 4645/10000 [54:44<52:33, 1.70it/s, loss=0.0044, lr=1.60e-05, step=4644] Training: 46%|████▋ | 4645/10000 [54:44<52:33, 1.70it/s, loss=0.0141, lr=1.60e-05, step=4645] Training: 46%|████▋ | 4646/10000 [54:45<50:23, 1.77it/s, loss=0.0141, lr=1.60e-05, step=4645] Training: 46%|████▋ | 4646/10000 [54:45<50:23, 1.77it/s, loss=0.0129, lr=1.60e-05, step=4646] Training: 46%|████▋ | 4647/10000 [54:45<48:50, 1.83it/s, loss=0.0129, lr=1.60e-05, step=4646] Training: 46%|████▋ | 4647/10000 [54:45<48:50, 1.83it/s, loss=0.0276, lr=1.60e-05, step=4647] Training: 46%|████▋ | 4648/10000 [54:46<48:27, 1.84it/s, loss=0.0276, lr=1.60e-05, step=4647] Training: 46%|████▋ | 4648/10000 [54:46<48:27, 1.84it/s, loss=0.0122, lr=1.60e-05, step=4648] Training: 46%|████▋ | 4649/10000 [54:46<46:59, 1.90it/s, loss=0.0122, lr=1.60e-05, step=4648] Training: 46%|████▋ | 4649/10000 [54:46<46:59, 1.90it/s, loss=0.0080, lr=1.60e-05, step=4649]17:00:54.229 [I] step=4650 loss=0.0101 smoothed_loss=0.0143 lr=1.60e-05 grad_norm=0.4148 step_time=0.5161s data_time=0.0712s it/s=1.703 eta_to_10000=3141.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0118 grad_action_out_proj=0.1078 grad_shared_expert=0.3885 (10775:train_pytorch.py:850) + Training: 46%|████▋ | 4650/10000 [54:47<54:58, 1.62it/s, loss=0.0080, lr=1.60e-05, step=4649] Training: 46%|████▋ | 4650/10000 [54:47<54:58, 1.62it/s, loss=0.0101, lr=1.60e-05, step=4650] Training: 47%|████▋ | 4651/10000 [54:48<58:36, 1.52it/s, loss=0.0101, lr=1.60e-05, step=4650] Training: 47%|████▋ | 4651/10000 [54:48<58:36, 1.52it/s, loss=0.0254, lr=1.60e-05, step=4651] Training: 47%|████▋ | 4652/10000 [54:49<54:23, 1.64it/s, loss=0.0254, lr=1.60e-05, step=4651] Training: 47%|████▋ | 4652/10000 [54:49<54:23, 1.64it/s, loss=0.0067, lr=1.60e-05, step=4652] Training: 47%|████▋ | 4653/10000 [54:49<54:42, 1.63it/s, loss=0.0067, lr=1.60e-05, step=4652] Training: 47%|████▋ | 4653/10000 [54:49<54:42, 1.63it/s, loss=0.0301, lr=1.60e-05, step=4653] Training: 47%|████▋ | 4654/10000 [54:50<54:43, 1.63it/s, loss=0.0301, lr=1.60e-05, step=4653] Training: 47%|████▋ | 4654/10000 [54:50<54:43, 1.63it/s, loss=0.1244, lr=1.60e-05, step=4654] Training: 47%|████▋ | 4655/10000 [54:50<51:27, 1.73it/s, loss=0.1244, lr=1.60e-05, step=4654] Training: 47%|████▋ | 4655/10000 [54:50<51:27, 1.73it/s, loss=0.0171, lr=1.60e-05, step=4655] Training: 47%|████▋ | 4656/10000 [54:51<48:45, 1.83it/s, loss=0.0171, lr=1.60e-05, step=4655] Training: 47%|████▋ | 4656/10000 [54:51<48:45, 1.83it/s, loss=0.0134, lr=1.59e-05, step=4656] Training: 47%|████▋ | 4657/10000 [54:51<53:03, 1.68it/s, loss=0.0134, lr=1.59e-05, step=4656] Training: 47%|████▋ | 4657/10000 [54:51<53:03, 1.68it/s, loss=0.0220, lr=1.59e-05, step=4657] Training: 47%|████▋ | 4658/10000 [54:52<55:54, 1.59it/s, loss=0.0220, lr=1.59e-05, step=4657] Training: 47%|████▋ | 4658/10000 [54:52<55:54, 1.59it/s, loss=0.0105, lr=1.59e-05, step=4658] Training: 47%|████▋ | 4659/10000 [54:53<53:55, 1.65it/s, loss=0.0105, lr=1.59e-05, step=4658] Training: 47%|████▋ | 4659/10000 [54:53<53:55, 1.65it/s, loss=0.0311, lr=1.59e-05, step=4659]17:01:00.243 [I] step=4660 loss=0.0089 smoothed_loss=0.0223 lr=1.60e-05 grad_norm=0.5015 step_time=0.5297s data_time=0.0717s it/s=1.663 eta_to_10000=3211.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0137 grad_action_out_proj=0.1936 grad_shared_expert=0.4067 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4660/10000 [54:53<53:28, 1.66it/s, loss=0.0311, lr=1.59e-05, step=4659] Training: 47%|████▋ | 4660/10000 [54:53<53:28, 1.66it/s, loss=0.0089, lr=1.59e-05, step=4660] Training: 47%|████▋ | 4661/10000 [54:54<50:42, 1.76it/s, loss=0.0089, lr=1.59e-05, step=4660] Training: 47%|████▋ | 4661/10000 [54:54<50:42, 1.76it/s, loss=0.0040, lr=1.59e-05, step=4661] Training: 47%|████▋ | 4662/10000 [54:54<49:10, 1.81it/s, loss=0.0040, lr=1.59e-05, step=4661] Training: 47%|████▋ | 4662/10000 [54:54<49:10, 1.81it/s, loss=0.0118, lr=1.59e-05, step=4662] Training: 47%|████▋ | 4663/10000 [54:55<48:06, 1.85it/s, loss=0.0118, lr=1.59e-05, step=4662] Training: 47%|████▋ | 4663/10000 [54:55<48:06, 1.85it/s, loss=0.0331, lr=1.59e-05, step=4663] Training: 47%|████▋ | 4664/10000 [54:56<56:15, 1.58it/s, loss=0.0331, lr=1.59e-05, step=4663] Training: 47%|████▋ | 4664/10000 [54:56<56:15, 1.58it/s, loss=0.0088, lr=1.59e-05, step=4664] Training: 47%|████▋ | 4665/10000 [54:56<57:49, 1.54it/s, loss=0.0088, lr=1.59e-05, step=4664] Training: 47%|████▋ | 4665/10000 [54:56<57:49, 1.54it/s, loss=0.0107, lr=1.59e-05, step=4665] Training: 47%|████▋ | 4666/10000 [54:57<53:53, 1.65it/s, loss=0.0107, lr=1.59e-05, step=4665] Training: 47%|████▋ | 4666/10000 [54:57<53:53, 1.65it/s, loss=0.0114, lr=1.59e-05, step=4666] Training: 47%|████▋ | 4667/10000 [54:58<55:36, 1.60it/s, loss=0.0114, lr=1.59e-05, step=4666] Training: 47%|████▋ | 4667/10000 [54:58<55:36, 1.60it/s, loss=0.0029, lr=1.59e-05, step=4667] Training: 47%|████▋ | 4668/10000 [54:58<52:12, 1.70it/s, loss=0.0029, lr=1.59e-05, step=4667] Training: 47%|████▋ | 4668/10000 [54:58<52:12, 1.70it/s, loss=0.0051, lr=1.59e-05, step=4668] Training: 47%|████▋ | 4669/10000 [54:59<49:54, 1.78it/s, loss=0.0051, lr=1.59e-05, step=4668] Training: 47%|████▋ | 4669/10000 [54:59<49:54, 1.78it/s, loss=0.0395, lr=1.59e-05, step=4669]17:01:06.025 [I] step=4670 loss=0.0126 smoothed_loss=0.0173 lr=1.59e-05 grad_norm=0.5271 step_time=0.5127s data_time=0.0654s it/s=1.730 eta_to_10000=3081.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0071 grad_action_out_proj=0.0812 grad_shared_expert=0.5162 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4670/10000 [54:59<49:28, 1.80it/s, loss=0.0395, lr=1.59e-05, step=4669] Training: 47%|████▋ | 4670/10000 [54:59<49:28, 1.80it/s, loss=0.0126, lr=1.59e-05, step=4670] Training: 47%|████▋ | 4671/10000 [55:00<48:23, 1.84it/s, loss=0.0126, lr=1.59e-05, step=4670] Training: 47%|████▋ | 4671/10000 [55:00<48:23, 1.84it/s, loss=0.0151, lr=1.59e-05, step=4671] Training: 47%|████▋ | 4672/10000 [55:00<53:29, 1.66it/s, loss=0.0151, lr=1.59e-05, step=4671] Training: 47%|████▋ | 4672/10000 [55:00<53:29, 1.66it/s, loss=0.0041, lr=1.59e-05, step=4672] Training: 47%|████▋ | 4673/10000 [55:01<56:36, 1.57it/s, loss=0.0041, lr=1.59e-05, step=4672] Training: 47%|████▋ | 4673/10000 [55:01<56:36, 1.57it/s, loss=0.0099, lr=1.59e-05, step=4673] Training: 47%|████▋ | 4674/10000 [55:02<53:11, 1.67it/s, loss=0.0099, lr=1.59e-05, step=4673] Training: 47%|████▋ | 4674/10000 [55:02<53:11, 1.67it/s, loss=0.0131, lr=1.59e-05, step=4674] Training: 47%|████▋ | 4675/10000 [55:02<51:02, 1.74it/s, loss=0.0131, lr=1.59e-05, step=4674] Training: 47%|████▋ | 4675/10000 [55:02<51:02, 1.74it/s, loss=0.0041, lr=1.59e-05, step=4675] Training: 47%|████▋ | 4676/10000 [55:03<49:03, 1.81it/s, loss=0.0041, lr=1.59e-05, step=4675] Training: 47%|████▋ | 4676/10000 [55:03<49:03, 1.81it/s, loss=0.0357, lr=1.59e-05, step=4676] Training: 47%|████▋ | 4677/10000 [55:03<47:43, 1.86it/s, loss=0.0357, lr=1.59e-05, step=4676] Training: 47%|████▋ | 4677/10000 [55:03<47:43, 1.86it/s, loss=0.0163, lr=1.59e-05, step=4677] Training: 47%|████▋ | 4678/10000 [55:04<46:23, 1.91it/s, loss=0.0163, lr=1.59e-05, step=4677] Training: 47%|████▋ | 4678/10000 [55:04<46:23, 1.91it/s, loss=0.0260, lr=1.59e-05, step=4678] Training: 47%|████▋ | 4679/10000 [55:04<46:36, 1.90it/s, loss=0.0260, lr=1.59e-05, step=4678] Training: 47%|████▋ | 4679/10000 [55:04<46:36, 1.90it/s, loss=0.0068, lr=1.59e-05, step=4679]17:01:11.831 [I] step=4680 loss=0.0091 smoothed_loss=0.0154 lr=1.59e-05 grad_norm=0.4238 step_time=0.5247s data_time=0.0560s it/s=1.723 eta_to_10000=3088.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.0965 grad_shared_expert=0.3090 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4680/10000 [55:05<53:23, 1.66it/s, loss=0.0068, lr=1.59e-05, step=4679] Training: 47%|████▋ | 4680/10000 [55:05<53:23, 1.66it/s, loss=0.0091, lr=1.59e-05, step=4680] Training: 47%|████▋ | 4681/10000 [55:05<50:39, 1.75it/s, loss=0.0091, lr=1.59e-05, step=4680] Training: 47%|████▋ | 4681/10000 [55:05<50:39, 1.75it/s, loss=0.0055, lr=1.59e-05, step=4681] Training: 47%|████▋ | 4682/10000 [55:06<48:29, 1.83it/s, loss=0.0055, lr=1.59e-05, step=4681] Training: 47%|████▋ | 4682/10000 [55:06<48:29, 1.83it/s, loss=0.0025, lr=1.59e-05, step=4682] Training: 47%|████▋ | 4683/10000 [55:07<50:55, 1.74it/s, loss=0.0025, lr=1.59e-05, step=4682] Training: 47%|████▋ | 4683/10000 [55:07<50:55, 1.74it/s, loss=0.0105, lr=1.59e-05, step=4683] Training: 47%|████▋ | 4684/10000 [55:07<49:05, 1.80it/s, loss=0.0105, lr=1.59e-05, step=4683] Training: 47%|████▋ | 4684/10000 [55:07<49:05, 1.80it/s, loss=0.0159, lr=1.58e-05, step=4684] Training: 47%|████▋ | 4685/10000 [55:08<47:37, 1.86it/s, loss=0.0159, lr=1.58e-05, step=4684] Training: 47%|████▋ | 4685/10000 [55:08<47:37, 1.86it/s, loss=0.0134, lr=1.58e-05, step=4685] Training: 47%|████▋ | 4686/10000 [55:08<47:08, 1.88it/s, loss=0.0134, lr=1.58e-05, step=4685] Training: 47%|████▋ | 4686/10000 [55:08<47:08, 1.88it/s, loss=0.0176, lr=1.58e-05, step=4686] Training: 47%|████▋ | 4687/10000 [55:09<55:26, 1.60it/s, loss=0.0176, lr=1.58e-05, step=4686] Training: 47%|████▋ | 4687/10000 [55:09<55:26, 1.60it/s, loss=0.0362, lr=1.58e-05, step=4687] Training: 47%|████▋ | 4688/10000 [55:09<52:22, 1.69it/s, loss=0.0362, lr=1.58e-05, step=4687] Training: 47%|████▋ | 4688/10000 [55:09<52:22, 1.69it/s, loss=0.0128, lr=1.58e-05, step=4688] Training: 47%|████▋ | 4689/10000 [55:10<51:14, 1.73it/s, loss=0.0128, lr=1.58e-05, step=4688] Training: 47%|████▋ | 4689/10000 [55:10<51:14, 1.73it/s, loss=0.0173, lr=1.58e-05, step=4689]17:01:17.450 [I] step=4690 loss=0.0057 smoothed_loss=0.0148 lr=1.58e-05 grad_norm=0.5179 step_time=0.5003s data_time=0.0616s it/s=1.780 eta_to_10000=2982.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0057 grad_action_out_proj=0.0682 grad_shared_expert=0.2384 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4690/10000 [55:11<50:41, 1.75it/s, loss=0.0173, lr=1.58e-05, step=4689] Training: 47%|████▋ | 4690/10000 [55:11<50:41, 1.75it/s, loss=0.0057, lr=1.58e-05, step=4690] Training: 47%|████▋ | 4691/10000 [55:11<49:12, 1.80it/s, loss=0.0057, lr=1.58e-05, step=4690] Training: 47%|████▋ | 4691/10000 [55:11<49:12, 1.80it/s, loss=0.0042, lr=1.58e-05, step=4691] Training: 47%|████▋ | 4692/10000 [55:12<48:01, 1.84it/s, loss=0.0042, lr=1.58e-05, step=4691] Training: 47%|████▋ | 4692/10000 [55:12<48:01, 1.84it/s, loss=0.0394, lr=1.58e-05, step=4692] Training: 47%|████▋ | 4693/10000 [55:12<47:19, 1.87it/s, loss=0.0394, lr=1.58e-05, step=4692] Training: 47%|████▋ | 4693/10000 [55:12<47:19, 1.87it/s, loss=0.0059, lr=1.58e-05, step=4693] Training: 47%|████▋ | 4694/10000 [55:13<53:32, 1.65it/s, loss=0.0059, lr=1.58e-05, step=4693] Training: 47%|████▋ | 4694/10000 [55:13<53:32, 1.65it/s, loss=0.0126, lr=1.58e-05, step=4694] Training: 47%|████▋ | 4695/10000 [55:13<51:27, 1.72it/s, loss=0.0126, lr=1.58e-05, step=4694] Training: 47%|████▋ | 4695/10000 [55:13<51:27, 1.72it/s, loss=0.0035, lr=1.58e-05, step=4695] Training: 47%|████▋ | 4696/10000 [55:14<49:26, 1.79it/s, loss=0.0035, lr=1.58e-05, step=4695] Training: 47%|████▋ | 4696/10000 [55:14<49:26, 1.79it/s, loss=0.0222, lr=1.58e-05, step=4696] Training: 47%|████▋ | 4697/10000 [55:14<48:02, 1.84it/s, loss=0.0222, lr=1.58e-05, step=4696] Training: 47%|████▋ | 4697/10000 [55:14<48:02, 1.84it/s, loss=0.0152, lr=1.58e-05, step=4697] Training: 47%|████▋ | 4698/10000 [55:15<47:08, 1.87it/s, loss=0.0152, lr=1.58e-05, step=4697] Training: 47%|████▋ | 4698/10000 [55:15<47:08, 1.87it/s, loss=0.0172, lr=1.58e-05, step=4698] Training: 47%|████▋ | 4699/10000 [55:15<46:15, 1.91it/s, loss=0.0172, lr=1.58e-05, step=4698] Training: 47%|████▋ | 4699/10000 [55:15<46:15, 1.91it/s, loss=0.0117, lr=1.58e-05, step=4699]17:01:22.854 [I] step=4700 loss=0.0147 smoothed_loss=0.0146 lr=1.58e-05 grad_norm=0.4729 step_time=0.4848s data_time=0.0557s it/s=1.851 eta_to_10000=2863.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0150 grad_action_out_proj=0.1557 grad_shared_expert=0.3552 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4700/10000 [55:16<46:38, 1.89it/s, loss=0.0117, lr=1.58e-05, step=4699] Training: 47%|████▋ | 4700/10000 [55:16<46:38, 1.89it/s, loss=0.0147, lr=1.58e-05, step=4700] Training: 47%|████▋ | 4701/10000 [55:17<53:03, 1.66it/s, loss=0.0147, lr=1.58e-05, step=4700] Training: 47%|████▋ | 4701/10000 [55:17<53:03, 1.66it/s, loss=0.0438, lr=1.58e-05, step=4701] Training: 47%|████▋ | 4702/10000 [55:17<56:21, 1.57it/s, loss=0.0438, lr=1.58e-05, step=4701] Training: 47%|████▋ | 4702/10000 [55:17<56:21, 1.57it/s, loss=0.0090, lr=1.58e-05, step=4702] Training: 47%|████▋ | 4703/10000 [55:18<53:07, 1.66it/s, loss=0.0090, lr=1.58e-05, step=4702] Training: 47%|████▋ | 4703/10000 [55:18<53:07, 1.66it/s, loss=0.0204, lr=1.58e-05, step=4703] Training: 47%|████▋ | 4704/10000 [55:18<50:40, 1.74it/s, loss=0.0204, lr=1.58e-05, step=4703] Training: 47%|████▋ | 4704/10000 [55:18<50:40, 1.74it/s, loss=0.0071, lr=1.58e-05, step=4704] Training: 47%|████▋ | 4705/10000 [55:19<48:26, 1.82it/s, loss=0.0071, lr=1.58e-05, step=4704] Training: 47%|████▋ | 4705/10000 [55:19<48:26, 1.82it/s, loss=0.0342, lr=1.58e-05, step=4705] Training: 47%|████▋ | 4706/10000 [55:19<46:59, 1.88it/s, loss=0.0342, lr=1.58e-05, step=4705] Training: 47%|████▋ | 4706/10000 [55:19<46:59, 1.88it/s, loss=0.0081, lr=1.58e-05, step=4706] Training: 47%|████▋ | 4707/10000 [55:20<46:25, 1.90it/s, loss=0.0081, lr=1.58e-05, step=4706] Training: 47%|████▋ | 4707/10000 [55:20<46:25, 1.90it/s, loss=0.0082, lr=1.58e-05, step=4707] Training: 47%|████▋ | 4708/10000 [55:21<51:48, 1.70it/s, loss=0.0082, lr=1.58e-05, step=4707] Training: 47%|████▋ | 4708/10000 [55:21<51:48, 1.70it/s, loss=0.0098, lr=1.58e-05, step=4708] Training: 47%|████▋ | 4709/10000 [55:21<55:16, 1.60it/s, loss=0.0098, lr=1.58e-05, step=4708] Training: 47%|████▋ | 4709/10000 [55:21<55:16, 1.60it/s, loss=0.0859, lr=1.58e-05, step=4709]17:01:28.869 [I] step=4710 loss=0.0045 smoothed_loss=0.0207 lr=1.58e-05 grad_norm=0.4874 step_time=0.5394s data_time=0.0620s it/s=1.663 eta_to_10000=3180.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0061 grad_action_out_proj=0.0754 grad_shared_expert=0.2627 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4710/10000 [55:22<53:08, 1.66it/s, loss=0.0859, lr=1.58e-05, step=4709] Training: 47%|████▋ | 4710/10000 [55:22<53:08, 1.66it/s, loss=0.0045, lr=1.58e-05, step=4710] Training: 47%|████▋ | 4711/10000 [55:22<50:52, 1.73it/s, loss=0.0045, lr=1.58e-05, step=4710] Training: 47%|████▋ | 4711/10000 [55:22<50:52, 1.73it/s, loss=0.0039, lr=1.57e-05, step=4711] Training: 47%|████▋ | 4712/10000 [55:23<49:19, 1.79it/s, loss=0.0039, lr=1.57e-05, step=4711] Training: 47%|████▋ | 4712/10000 [55:23<49:19, 1.79it/s, loss=0.0077, lr=1.57e-05, step=4712] Training: 47%|████▋ | 4713/10000 [55:23<47:50, 1.84it/s, loss=0.0077, lr=1.57e-05, step=4712] Training: 47%|████▋ | 4713/10000 [55:23<47:50, 1.84it/s, loss=0.0170, lr=1.57e-05, step=4713] Training: 47%|████▋ | 4714/10000 [55:24<46:36, 1.89it/s, loss=0.0170, lr=1.57e-05, step=4713] Training: 47%|████▋ | 4714/10000 [55:24<46:36, 1.89it/s, loss=0.0141, lr=1.57e-05, step=4714] Training: 47%|████▋ | 4715/10000 [55:25<49:49, 1.77it/s, loss=0.0141, lr=1.57e-05, step=4714] Training: 47%|████▋ | 4715/10000 [55:25<49:49, 1.77it/s, loss=0.0111, lr=1.57e-05, step=4715] Training: 47%|████▋ | 4716/10000 [55:25<54:24, 1.62it/s, loss=0.0111, lr=1.57e-05, step=4715] Training: 47%|████▋ | 4716/10000 [55:25<54:24, 1.62it/s, loss=0.0168, lr=1.57e-05, step=4716] Training: 47%|████▋ | 4717/10000 [55:26<51:10, 1.72it/s, loss=0.0168, lr=1.57e-05, step=4716] Training: 47%|████▋ | 4717/10000 [55:26<51:10, 1.72it/s, loss=0.0161, lr=1.57e-05, step=4717] Training: 47%|████▋ | 4718/10000 [55:26<49:32, 1.78it/s, loss=0.0161, lr=1.57e-05, step=4717] Training: 47%|████▋ | 4718/10000 [55:26<49:32, 1.78it/s, loss=0.0079, lr=1.57e-05, step=4718] Training: 47%|████▋ | 4719/10000 [55:27<49:55, 1.76it/s, loss=0.0079, lr=1.57e-05, step=4718] Training: 47%|████▋ | 4719/10000 [55:27<49:55, 1.76it/s, loss=0.0018, lr=1.57e-05, step=4719]17:01:34.489 [I] step=4720 loss=0.0221 smoothed_loss=0.0152 lr=1.57e-05 grad_norm=0.4275 step_time=0.4982s data_time=0.0638s it/s=1.780 eta_to_10000=2966.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0083 grad_action_out_proj=0.1270 grad_shared_expert=0.3130 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4720/10000 [55:28<50:46, 1.73it/s, loss=0.0018, lr=1.57e-05, step=4719] Training: 47%|████▋ | 4720/10000 [55:28<50:46, 1.73it/s, loss=0.0221, lr=1.57e-05, step=4720] Training: 47%|████▋ | 4721/10000 [55:28<48:24, 1.82it/s, loss=0.0221, lr=1.57e-05, step=4720] Training: 47%|████▋ | 4721/10000 [55:28<48:24, 1.82it/s, loss=0.0233, lr=1.57e-05, step=4721] Training: 47%|████▋ | 4722/10000 [55:29<47:48, 1.84it/s, loss=0.0233, lr=1.57e-05, step=4721] Training: 47%|████▋ | 4722/10000 [55:29<47:48, 1.84it/s, loss=0.0099, lr=1.57e-05, step=4722] Training: 47%|████▋ | 4723/10000 [55:29<56:27, 1.56it/s, loss=0.0099, lr=1.57e-05, step=4722] Training: 47%|████▋ | 4723/10000 [55:29<56:27, 1.56it/s, loss=0.0053, lr=1.57e-05, step=4723] Training: 47%|████▋ | 4724/10000 [55:30<55:52, 1.57it/s, loss=0.0053, lr=1.57e-05, step=4723] Training: 47%|████▋ | 4724/10000 [55:30<55:52, 1.57it/s, loss=0.0110, lr=1.57e-05, step=4724] Training: 47%|████▋ | 4725/10000 [55:31<53:49, 1.63it/s, loss=0.0110, lr=1.57e-05, step=4724] Training: 47%|████▋ | 4725/10000 [55:31<53:49, 1.63it/s, loss=0.0142, lr=1.57e-05, step=4725] Training: 47%|████▋ | 4726/10000 [55:31<54:59, 1.60it/s, loss=0.0142, lr=1.57e-05, step=4725] Training: 47%|████▋ | 4726/10000 [55:31<54:59, 1.60it/s, loss=0.0133, lr=1.57e-05, step=4726] Training: 47%|████▋ | 4727/10000 [55:32<54:08, 1.62it/s, loss=0.0133, lr=1.57e-05, step=4726] Training: 47%|████▋ | 4727/10000 [55:32<54:08, 1.62it/s, loss=0.0071, lr=1.57e-05, step=4727] Training: 47%|████▋ | 4728/10000 [55:32<53:49, 1.63it/s, loss=0.0071, lr=1.57e-05, step=4727] Training: 47%|████▋ | 4728/10000 [55:32<53:49, 1.63it/s, loss=0.0041, lr=1.57e-05, step=4728] Training: 47%|████▋ | 4729/10000 [55:33<58:09, 1.51it/s, loss=0.0041, lr=1.57e-05, step=4728] Training: 47%|████▋ | 4729/10000 [55:33<58:09, 1.51it/s, loss=0.0086, lr=1.57e-05, step=4729]17:01:41.018 [I] step=4730 loss=0.0195 smoothed_loss=0.0128 lr=1.57e-05 grad_norm=0.4714 step_time=0.5475s data_time=0.1054s it/s=1.532 eta_to_10000=3440.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.1224 grad_shared_expert=0.8202 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4730/10000 [55:34<1:02:35, 1.40it/s, loss=0.0086, lr=1.57e-05, step=4729] Training: 47%|████▋ | 4730/10000 [55:34<1:02:35, 1.40it/s, loss=0.0195, lr=1.57e-05, step=4730] Training: 47%|████▋ | 4731/10000 [55:35<56:56, 1.54it/s, loss=0.0195, lr=1.57e-05, step=4730] Training: 47%|████▋ | 4731/10000 [55:35<56:56, 1.54it/s, loss=0.0057, lr=1.57e-05, step=4731] Training: 47%|████▋ | 4732/10000 [55:35<54:16, 1.62it/s, loss=0.0057, lr=1.57e-05, step=4731] Training: 47%|████▋ | 4732/10000 [55:35<54:16, 1.62it/s, loss=0.0144, lr=1.57e-05, step=4732] Training: 47%|████▋ | 4733/10000 [55:36<50:58, 1.72it/s, loss=0.0144, lr=1.57e-05, step=4732] Training: 47%|████▋ | 4733/10000 [55:36<50:58, 1.72it/s, loss=0.0046, lr=1.57e-05, step=4733] Training: 47%|████▋ | 4734/10000 [55:36<49:14, 1.78it/s, loss=0.0046, lr=1.57e-05, step=4733] Training: 47%|████▋ | 4734/10000 [55:36<49:14, 1.78it/s, loss=0.0155, lr=1.57e-05, step=4734] Training: 47%|████▋ | 4735/10000 [55:37<53:01, 1.65it/s, loss=0.0155, lr=1.57e-05, step=4734] Training: 47%|████▋ | 4735/10000 [55:37<53:01, 1.65it/s, loss=0.0059, lr=1.57e-05, step=4735] Training: 47%|████▋ | 4736/10000 [55:37<54:01, 1.62it/s, loss=0.0059, lr=1.57e-05, step=4735] Training: 47%|████▋ | 4736/10000 [55:37<54:01, 1.62it/s, loss=0.0048, lr=1.57e-05, step=4736] Training: 47%|████▋ | 4737/10000 [55:38<1:02:59, 1.39it/s, loss=0.0048, lr=1.57e-05, step=4736] Training: 47%|████▋ | 4737/10000 [55:38<1:02:59, 1.39it/s, loss=0.0100, lr=1.57e-05, step=4737] Training: 47%|████▋ | 4738/10000 [55:39<1:05:26, 1.34it/s, loss=0.0100, lr=1.57e-05, step=4737] Training: 47%|████▋ | 4738/10000 [55:39<1:05:26, 1.34it/s, loss=0.0100, lr=1.56e-05, step=4738] Training: 47%|████▋ | 4739/10000 [55:40<1:00:04, 1.46it/s, loss=0.0100, lr=1.56e-05, step=4738] Training: 47%|████▋ | 4739/10000 [55:40<1:00:04, 1.46it/s, loss=0.0256, lr=1.56e-05, step=4739]17:01:47.268 [I] step=4740 loss=0.0130 smoothed_loss=0.0121 lr=1.57e-05 grad_norm=0.4226 step_time=0.5361s data_time=0.0888s it/s=1.600 eta_to_10000=3287.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0069 grad_action_out_proj=0.0931 grad_shared_expert=0.2788 (10775:train_pytorch.py:850) + Training: 47%|████▋ | 4740/10000 [55:40<56:09, 1.56it/s, loss=0.0256, lr=1.56e-05, step=4739] Training: 47%|████▋ | 4740/10000 [55:40<56:09, 1.56it/s, loss=0.0130, lr=1.56e-05, step=4740] Training: 47%|████▋ | 4741/10000 [55:41<52:25, 1.67it/s, loss=0.0130, lr=1.56e-05, step=4740] Training: 47%|████▋ | 4741/10000 [55:41<52:25, 1.67it/s, loss=0.0181, lr=1.56e-05, step=4741] Training: 47%|████▋ | 4742/10000 [55:41<50:54, 1.72it/s, loss=0.0181, lr=1.56e-05, step=4741] Training: 47%|████▋ | 4742/10000 [55:41<50:54, 1.72it/s, loss=0.0056, lr=1.56e-05, step=4742] Training: 47%|████▋ | 4743/10000 [55:42<48:55, 1.79it/s, loss=0.0056, lr=1.56e-05, step=4742] Training: 47%|████▋ | 4743/10000 [55:42<48:55, 1.79it/s, loss=0.0041, lr=1.56e-05, step=4743] Training: 47%|████▋ | 4744/10000 [55:43<57:07, 1.53it/s, loss=0.0041, lr=1.56e-05, step=4743] Training: 47%|████▋ | 4744/10000 [55:43<57:07, 1.53it/s, loss=0.0296, lr=1.56e-05, step=4744] Training: 47%|████▋ | 4745/10000 [55:43<58:27, 1.50it/s, loss=0.0296, lr=1.56e-05, step=4744] Training: 47%|████▋ | 4745/10000 [55:43<58:27, 1.50it/s, loss=0.0038, lr=1.56e-05, step=4745] Training: 47%|████▋ | 4746/10000 [55:44<54:55, 1.59it/s, loss=0.0038, lr=1.56e-05, step=4745] Training: 47%|████▋ | 4746/10000 [55:44<54:55, 1.59it/s, loss=0.0989, lr=1.56e-05, step=4746] Training: 47%|████▋ | 4747/10000 [55:45<55:02, 1.59it/s, loss=0.0989, lr=1.56e-05, step=4746] Training: 47%|████▋ | 4747/10000 [55:45<55:02, 1.59it/s, loss=0.0120, lr=1.56e-05, step=4747] Training: 47%|████▋ | 4748/10000 [55:45<59:50, 1.46it/s, loss=0.0120, lr=1.56e-05, step=4747] Training: 47%|████▋ | 4748/10000 [55:45<59:50, 1.46it/s, loss=0.0084, lr=1.56e-05, step=4748] Training: 47%|████▋ | 4749/10000 [55:46<55:12, 1.59it/s, loss=0.0084, lr=1.56e-05, step=4748] Training: 47%|████▋ | 4749/10000 [55:46<55:12, 1.59it/s, loss=0.0057, lr=1.56e-05, step=4749]17:01:53.469 [I] step=4750 loss=0.0114 smoothed_loss=0.0169 lr=1.56e-05 grad_norm=0.4754 step_time=0.5169s data_time=0.1033s it/s=1.613 eta_to_10000=3255.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0253 grad_action_out_proj=0.2191 grad_shared_expert=0.4973 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4750/10000 [55:47<54:19, 1.61it/s, loss=0.0057, lr=1.56e-05, step=4749] Training: 48%|████▊ | 4750/10000 [55:47<54:19, 1.61it/s, loss=0.0114, lr=1.56e-05, step=4750] Training: 48%|████▊ | 4751/10000 [55:47<59:29, 1.47it/s, loss=0.0114, lr=1.56e-05, step=4750] Training: 48%|████▊ | 4751/10000 [55:47<59:29, 1.47it/s, loss=0.0792, lr=1.56e-05, step=4751] Training: 48%|████▊ | 4752/10000 [55:48<1:00:19, 1.45it/s, loss=0.0792, lr=1.56e-05, step=4751] Training: 48%|████▊ | 4752/10000 [55:48<1:00:19, 1.45it/s, loss=0.0125, lr=1.56e-05, step=4752] Training: 48%|████▊ | 4753/10000 [55:49<57:57, 1.51it/s, loss=0.0125, lr=1.56e-05, step=4752] Training: 48%|████▊ | 4753/10000 [55:49<57:57, 1.51it/s, loss=0.0529, lr=1.56e-05, step=4753] Training: 48%|████▊ | 4754/10000 [55:49<56:32, 1.55it/s, loss=0.0529, lr=1.56e-05, step=4753] Training: 48%|████▊ | 4754/10000 [55:49<56:32, 1.55it/s, loss=0.0044, lr=1.56e-05, step=4754] Training: 48%|████▊ | 4755/10000 [55:50<1:03:36, 1.37it/s, loss=0.0044, lr=1.56e-05, step=4754] Training: 48%|████▊ | 4755/10000 [55:50<1:03:36, 1.37it/s, loss=0.0126, lr=1.56e-05, step=4755] Training: 48%|████▊ | 4756/10000 [55:51<1:09:52, 1.25it/s, loss=0.0126, lr=1.56e-05, step=4755] Training: 48%|████▊ | 4756/10000 [55:51<1:09:52, 1.25it/s, loss=0.0081, lr=1.56e-05, step=4756] Training: 48%|████▊ | 4757/10000 [55:52<1:05:02, 1.34it/s, loss=0.0081, lr=1.56e-05, step=4756] Training: 48%|████▊ | 4757/10000 [55:52<1:05:02, 1.34it/s, loss=0.0039, lr=1.56e-05, step=4757] Training: 48%|████▊ | 4758/10000 [55:52<59:05, 1.48it/s, loss=0.0039, lr=1.56e-05, step=4757] Training: 48%|████▊ | 4758/10000 [55:52<59:05, 1.48it/s, loss=0.0013, lr=1.56e-05, step=4758] Training: 48%|████▊ | 4759/10000 [55:53<1:00:08, 1.45it/s, loss=0.0013, lr=1.56e-05, step=4758] Training: 48%|████▊ | 4759/10000 [55:53<1:00:08, 1.45it/s, loss=0.0091, lr=1.56e-05, step=4759]17:02:00.695 [I] step=4760 loss=0.0208 smoothed_loss=0.0168 lr=1.56e-05 grad_norm=0.4632 step_time=0.6158s data_time=0.1068s it/s=1.384 eta_to_10000=3785.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0996 grad_shared_expert=0.5140 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4760/10000 [55:54<1:01:47, 1.41it/s, loss=0.0091, lr=1.56e-05, step=4759] Training: 48%|████▊ | 4760/10000 [55:54<1:01:47, 1.41it/s, loss=0.0208, lr=1.56e-05, step=4760] Training: 48%|████▊ | 4761/10000 [55:54<56:10, 1.55it/s, loss=0.0208, lr=1.56e-05, step=4760] Training: 48%|████▊ | 4761/10000 [55:54<56:10, 1.55it/s, loss=0.0289, lr=1.56e-05, step=4761] Training: 48%|████▊ | 4762/10000 [55:55<54:06, 1.61it/s, loss=0.0289, lr=1.56e-05, step=4761] Training: 48%|████▊ | 4762/10000 [55:55<54:06, 1.61it/s, loss=0.0130, lr=1.56e-05, step=4762] Training: 48%|████▊ | 4763/10000 [55:55<51:44, 1.69it/s, loss=0.0130, lr=1.56e-05, step=4762] Training: 48%|████▊ | 4763/10000 [55:55<51:44, 1.69it/s, loss=0.0318, lr=1.56e-05, step=4763] Training: 48%|████▊ | 4764/10000 [55:56<49:15, 1.77it/s, loss=0.0318, lr=1.56e-05, step=4763] Training: 48%|████▊ | 4764/10000 [55:56<49:15, 1.77it/s, loss=0.0063, lr=1.56e-05, step=4764] Training: 48%|████▊ | 4765/10000 [55:56<47:38, 1.83it/s, loss=0.0063, lr=1.56e-05, step=4764] Training: 48%|████▊ | 4765/10000 [55:56<47:38, 1.83it/s, loss=0.0031, lr=1.56e-05, step=4765] Training: 48%|████▊ | 4766/10000 [55:57<52:51, 1.65it/s, loss=0.0031, lr=1.56e-05, step=4765] Training: 48%|████▊ | 4766/10000 [55:57<52:51, 1.65it/s, loss=0.0039, lr=1.55e-05, step=4766] Training: 48%|████▊ | 4767/10000 [55:58<56:11, 1.55it/s, loss=0.0039, lr=1.55e-05, step=4766] Training: 48%|████▊ | 4767/10000 [55:58<56:11, 1.55it/s, loss=0.0796, lr=1.55e-05, step=4767] Training: 48%|████▊ | 4768/10000 [55:58<52:43, 1.65it/s, loss=0.0796, lr=1.55e-05, step=4767] Training: 48%|████▊ | 4768/10000 [55:58<52:43, 1.65it/s, loss=0.0239, lr=1.55e-05, step=4768] Training: 48%|████▊ | 4769/10000 [55:59<50:04, 1.74it/s, loss=0.0239, lr=1.55e-05, step=4768] Training: 48%|████▊ | 4769/10000 [55:59<50:04, 1.74it/s, loss=0.0185, lr=1.55e-05, step=4769]17:02:06.329 [I] step=4770 loss=0.0063 smoothed_loss=0.0199 lr=1.55e-05 grad_norm=0.5236 step_time=0.5051s data_time=0.0584s it/s=1.775 eta_to_10000=2946.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0196 grad_action_out_proj=0.1511 grad_shared_expert=0.6628 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4770/10000 [55:59<49:23, 1.76it/s, loss=0.0185, lr=1.55e-05, step=4769] Training: 48%|████▊ | 4770/10000 [55:59<49:23, 1.76it/s, loss=0.0063, lr=1.55e-05, step=4770] Training: 48%|████▊ | 4771/10000 [56:00<48:04, 1.81it/s, loss=0.0063, lr=1.55e-05, step=4770] Training: 48%|████▊ | 4771/10000 [56:00<48:04, 1.81it/s, loss=0.0268, lr=1.55e-05, step=4771] Training: 48%|████▊ | 4772/10000 [56:00<47:18, 1.84it/s, loss=0.0268, lr=1.55e-05, step=4771] Training: 48%|████▊ | 4772/10000 [56:00<47:18, 1.84it/s, loss=0.0033, lr=1.55e-05, step=4772] Training: 48%|████▊ | 4773/10000 [56:01<53:12, 1.64it/s, loss=0.0033, lr=1.55e-05, step=4772] Training: 48%|████▊ | 4773/10000 [56:01<53:12, 1.64it/s, loss=0.0065, lr=1.55e-05, step=4773] Training: 48%|████▊ | 4774/10000 [56:02<51:29, 1.69it/s, loss=0.0065, lr=1.55e-05, step=4773] Training: 48%|████▊ | 4774/10000 [56:02<51:29, 1.69it/s, loss=0.0101, lr=1.55e-05, step=4774] Training: 48%|████▊ | 4775/10000 [56:02<54:58, 1.58it/s, loss=0.0101, lr=1.55e-05, step=4774] Training: 48%|████▊ | 4775/10000 [56:02<54:58, 1.58it/s, loss=0.0033, lr=1.55e-05, step=4775] Training: 48%|████▊ | 4776/10000 [56:03<51:11, 1.70it/s, loss=0.0033, lr=1.55e-05, step=4775] Training: 48%|████▊ | 4776/10000 [56:03<51:11, 1.70it/s, loss=0.0052, lr=1.55e-05, step=4776] Training: 48%|████▊ | 4777/10000 [56:03<48:46, 1.78it/s, loss=0.0052, lr=1.55e-05, step=4776] Training: 48%|████▊ | 4777/10000 [56:03<48:46, 1.78it/s, loss=0.0063, lr=1.55e-05, step=4777] Training: 48%|████▊ | 4778/10000 [56:04<46:59, 1.85it/s, loss=0.0063, lr=1.55e-05, step=4777] Training: 48%|████▊ | 4778/10000 [56:04<46:59, 1.85it/s, loss=0.0348, lr=1.55e-05, step=4778] Training: 48%|████▊ | 4779/10000 [56:04<46:29, 1.87it/s, loss=0.0348, lr=1.55e-05, step=4778] Training: 48%|████▊ | 4779/10000 [56:04<46:29, 1.87it/s, loss=0.0210, lr=1.55e-05, step=4779]17:02:12.140 [I] step=4780 loss=0.0073 smoothed_loss=0.0154 lr=1.55e-05 grad_norm=0.4943 step_time=0.5248s data_time=0.0563s it/s=1.721 eta_to_10000=3032.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1379 grad_shared_expert=0.3674 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4780/10000 [56:05<51:44, 1.68it/s, loss=0.0210, lr=1.55e-05, step=4779] Training: 48%|████▊ | 4780/10000 [56:05<51:44, 1.68it/s, loss=0.0073, lr=1.55e-05, step=4780] Training: 48%|████▊ | 4781/10000 [56:06<48:56, 1.78it/s, loss=0.0073, lr=1.55e-05, step=4780] Training: 48%|████▊ | 4781/10000 [56:06<48:56, 1.78it/s, loss=0.0197, lr=1.55e-05, step=4781] Training: 48%|████▊ | 4782/10000 [56:06<52:44, 1.65it/s, loss=0.0197, lr=1.55e-05, step=4781] Training: 48%|████▊ | 4782/10000 [56:06<52:44, 1.65it/s, loss=0.0169, lr=1.55e-05, step=4782] Training: 48%|████▊ | 4783/10000 [56:07<53:18, 1.63it/s, loss=0.0169, lr=1.55e-05, step=4782] Training: 48%|████▊ | 4783/10000 [56:07<53:18, 1.63it/s, loss=0.0218, lr=1.55e-05, step=4783] Training: 48%|████▊ | 4784/10000 [56:08<51:48, 1.68it/s, loss=0.0218, lr=1.55e-05, step=4783] Training: 48%|████▊ | 4784/10000 [56:08<51:48, 1.68it/s, loss=0.0058, lr=1.55e-05, step=4784] Training: 48%|████▊ | 4785/10000 [56:08<49:07, 1.77it/s, loss=0.0058, lr=1.55e-05, step=4784] Training: 48%|████▊ | 4785/10000 [56:08<49:07, 1.77it/s, loss=0.0078, lr=1.55e-05, step=4785] Training: 48%|████▊ | 4786/10000 [56:09<47:17, 1.84it/s, loss=0.0078, lr=1.55e-05, step=4785] Training: 48%|████▊ | 4786/10000 [56:09<47:17, 1.84it/s, loss=0.0159, lr=1.55e-05, step=4786] Training: 48%|████▊ | 4787/10000 [56:09<51:38, 1.68it/s, loss=0.0159, lr=1.55e-05, step=4786] Training: 48%|████▊ | 4787/10000 [56:09<51:38, 1.68it/s, loss=0.0055, lr=1.55e-05, step=4787] Training: 48%|████▊ | 4788/10000 [56:10<49:59, 1.74it/s, loss=0.0055, lr=1.55e-05, step=4787] Training: 48%|████▊ | 4788/10000 [56:10<49:59, 1.74it/s, loss=0.0050, lr=1.55e-05, step=4788] Training: 48%|████▊ | 4789/10000 [56:11<54:57, 1.58it/s, loss=0.0050, lr=1.55e-05, step=4788] Training: 48%|████▊ | 4789/10000 [56:11<54:57, 1.58it/s, loss=0.0280, lr=1.55e-05, step=4789]17:02:18.148 [I] step=4790 loss=0.0246 smoothed_loss=0.0155 lr=1.55e-05 grad_norm=0.5103 step_time=0.5317s data_time=0.0692s it/s=1.665 eta_to_10000=3129.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0281 grad_action_out_proj=0.2413 grad_shared_expert=0.6740 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4790/10000 [56:11<54:52, 1.58it/s, loss=0.0280, lr=1.55e-05, step=4789] Training: 48%|████▊ | 4790/10000 [56:11<54:52, 1.58it/s, loss=0.0246, lr=1.55e-05, step=4790] Training: 48%|████▊ | 4791/10000 [56:12<52:16, 1.66it/s, loss=0.0246, lr=1.55e-05, step=4790] Training: 48%|████▊ | 4791/10000 [56:12<52:16, 1.66it/s, loss=0.0094, lr=1.55e-05, step=4791] Training: 48%|████▊ | 4792/10000 [56:12<50:19, 1.72it/s, loss=0.0094, lr=1.55e-05, step=4791] Training: 48%|████▊ | 4792/10000 [56:12<50:19, 1.72it/s, loss=0.0193, lr=1.55e-05, step=4792] Training: 48%|████▊ | 4793/10000 [56:13<48:48, 1.78it/s, loss=0.0193, lr=1.55e-05, step=4792] Training: 48%|████▊ | 4793/10000 [56:13<48:48, 1.78it/s, loss=0.0081, lr=1.54e-05, step=4793] Training: 48%|████▊ | 4794/10000 [56:14<53:59, 1.61it/s, loss=0.0081, lr=1.54e-05, step=4793] Training: 48%|████▊ | 4794/10000 [56:14<53:59, 1.61it/s, loss=0.0096, lr=1.54e-05, step=4794] Training: 48%|████▊ | 4795/10000 [56:14<51:21, 1.69it/s, loss=0.0096, lr=1.54e-05, step=4794] Training: 48%|████▊ | 4795/10000 [56:14<51:21, 1.69it/s, loss=0.0153, lr=1.54e-05, step=4795] Training: 48%|████▊ | 4796/10000 [56:15<54:46, 1.58it/s, loss=0.0153, lr=1.54e-05, step=4795] Training: 48%|████▊ | 4796/10000 [56:15<54:46, 1.58it/s, loss=0.0111, lr=1.54e-05, step=4796] Training: 48%|████▊ | 4797/10000 [56:15<52:09, 1.66it/s, loss=0.0111, lr=1.54e-05, step=4796] Training: 48%|████▊ | 4797/10000 [56:15<52:09, 1.66it/s, loss=0.0035, lr=1.54e-05, step=4797] Training: 48%|████▊ | 4798/10000 [56:16<50:00, 1.73it/s, loss=0.0035, lr=1.54e-05, step=4797] Training: 48%|████▊ | 4798/10000 [56:16<50:00, 1.73it/s, loss=0.0420, lr=1.54e-05, step=4798] Training: 48%|████▊ | 4799/10000 [56:16<50:29, 1.72it/s, loss=0.0420, lr=1.54e-05, step=4798] Training: 48%|████▊ | 4799/10000 [56:16<50:29, 1.72it/s, loss=0.0188, lr=1.54e-05, step=4799]17:02:23.981 [I] step=4800 loss=0.0075 smoothed_loss=0.0152 lr=1.54e-05 grad_norm=0.4318 step_time=0.5193s data_time=0.0640s it/s=1.715 eta_to_10000=3032.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0227 grad_action_out_proj=0.1411 grad_shared_expert=0.4632 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4800/10000 [56:17<50:51, 1.70it/s, loss=0.0188, lr=1.54e-05, step=4799] Training: 48%|████▊ | 4800/10000 [56:17<50:51, 1.70it/s, loss=0.0075, lr=1.54e-05, step=4800] Training: 48%|████▊ | 4801/10000 [56:18<48:36, 1.78it/s, loss=0.0075, lr=1.54e-05, step=4800] Training: 48%|████▊ | 4801/10000 [56:18<48:36, 1.78it/s, loss=0.0027, lr=1.54e-05, step=4801] Training: 48%|████▊ | 4802/10000 [56:18<52:38, 1.65it/s, loss=0.0027, lr=1.54e-05, step=4801] Training: 48%|████▊ | 4802/10000 [56:18<52:38, 1.65it/s, loss=0.0032, lr=1.54e-05, step=4802] Training: 48%|████▊ | 4803/10000 [56:19<56:38, 1.53it/s, loss=0.0032, lr=1.54e-05, step=4802] Training: 48%|████▊ | 4803/10000 [56:19<56:38, 1.53it/s, loss=0.0133, lr=1.54e-05, step=4803] Training: 48%|████▊ | 4804/10000 [56:20<53:01, 1.63it/s, loss=0.0133, lr=1.54e-05, step=4803] Training: 48%|████▊ | 4804/10000 [56:20<53:01, 1.63it/s, loss=0.0163, lr=1.54e-05, step=4804] Training: 48%|████▊ | 4805/10000 [56:20<50:18, 1.72it/s, loss=0.0163, lr=1.54e-05, step=4804] Training: 48%|████▊ | 4805/10000 [56:20<50:18, 1.72it/s, loss=0.0102, lr=1.54e-05, step=4805] Training: 48%|████▊ | 4806/10000 [56:21<49:00, 1.77it/s, loss=0.0102, lr=1.54e-05, step=4805] Training: 48%|████▊ | 4806/10000 [56:21<49:00, 1.77it/s, loss=0.0171, lr=1.54e-05, step=4806] Training: 48%|████▊ | 4807/10000 [56:21<47:44, 1.81it/s, loss=0.0171, lr=1.54e-05, step=4806] Training: 48%|████▊ | 4807/10000 [56:21<47:44, 1.81it/s, loss=0.0035, lr=1.54e-05, step=4807] Training: 48%|████▊ | 4808/10000 [56:22<47:44, 1.81it/s, loss=0.0035, lr=1.54e-05, step=4807] Training: 48%|████▊ | 4808/10000 [56:22<47:44, 1.81it/s, loss=0.0131, lr=1.54e-05, step=4808] Training: 48%|████▊ | 4809/10000 [56:22<52:33, 1.65it/s, loss=0.0131, lr=1.54e-05, step=4808] Training: 48%|████▊ | 4809/10000 [56:22<52:33, 1.65it/s, loss=0.0201, lr=1.54e-05, step=4809]17:02:30.101 [I] step=4810 loss=0.0355 smoothed_loss=0.0155 lr=1.54e-05 grad_norm=0.4602 step_time=0.5512s data_time=0.0607s it/s=1.634 eta_to_10000=3175.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0282 grad_action_out_proj=0.1751 grad_shared_expert=0.5740 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4810/10000 [56:23<56:59, 1.52it/s, loss=0.0201, lr=1.54e-05, step=4809] Training: 48%|████▊ | 4810/10000 [56:23<56:59, 1.52it/s, loss=0.0355, lr=1.54e-05, step=4810] Training: 48%|████▊ | 4811/10000 [56:24<52:41, 1.64it/s, loss=0.0355, lr=1.54e-05, step=4810] Training: 48%|████▊ | 4811/10000 [56:24<52:41, 1.64it/s, loss=0.0116, lr=1.54e-05, step=4811] Training: 48%|████▊ | 4812/10000 [56:24<54:34, 1.58it/s, loss=0.0116, lr=1.54e-05, step=4811] Training: 48%|████▊ | 4812/10000 [56:24<54:34, 1.58it/s, loss=0.0454, lr=1.54e-05, step=4812] Training: 48%|████▊ | 4813/10000 [56:25<55:16, 1.56it/s, loss=0.0454, lr=1.54e-05, step=4812] Training: 48%|████▊ | 4813/10000 [56:25<55:16, 1.56it/s, loss=0.0188, lr=1.54e-05, step=4813] Training: 48%|████▊ | 4814/10000 [56:26<57:17, 1.51it/s, loss=0.0188, lr=1.54e-05, step=4813] Training: 48%|████▊ | 4814/10000 [56:26<57:17, 1.51it/s, loss=0.0249, lr=1.54e-05, step=4814] Training: 48%|████▊ | 4815/10000 [56:26<58:41, 1.47it/s, loss=0.0249, lr=1.54e-05, step=4814] Training: 48%|████▊ | 4815/10000 [56:26<58:41, 1.47it/s, loss=0.0063, lr=1.54e-05, step=4815] Training: 48%|████▊ | 4816/10000 [56:27<1:04:31, 1.34it/s, loss=0.0063, lr=1.54e-05, step=4815] Training: 48%|████▊ | 4816/10000 [56:27<1:04:31, 1.34it/s, loss=0.0048, lr=1.54e-05, step=4816] Training: 48%|████▊ | 4817/10000 [56:28<1:03:21, 1.36it/s, loss=0.0048, lr=1.54e-05, step=4816] Training: 48%|████▊ | 4817/10000 [56:28<1:03:21, 1.36it/s, loss=0.0206, lr=1.54e-05, step=4817] Training: 48%|████▊ | 4818/10000 [56:29<1:01:32, 1.40it/s, loss=0.0206, lr=1.54e-05, step=4817] Training: 48%|████▊ | 4818/10000 [56:29<1:01:32, 1.40it/s, loss=0.0153, lr=1.54e-05, step=4818] Training: 48%|████▊ | 4819/10000 [56:29<56:22, 1.53it/s, loss=0.0153, lr=1.54e-05, step=4818] Training: 48%|████▊ | 4819/10000 [56:29<56:22, 1.53it/s, loss=0.0034, lr=1.54e-05, step=4819]17:02:36.699 [I] step=4820 loss=0.0098 smoothed_loss=0.0147 lr=1.54e-05 grad_norm=0.4653 step_time=0.5624s data_time=0.0974s it/s=1.516 eta_to_10000=3417.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0261 grad_action_out_proj=0.1549 grad_shared_expert=0.5145 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4820/10000 [56:30<53:33, 1.61it/s, loss=0.0034, lr=1.54e-05, step=4819] Training: 48%|████▊ | 4820/10000 [56:30<53:33, 1.61it/s, loss=0.0098, lr=1.53e-05, step=4820] Training: 48%|████▊ | 4821/10000 [56:30<54:45, 1.58it/s, loss=0.0098, lr=1.53e-05, step=4820] Training: 48%|████▊ | 4821/10000 [56:30<54:45, 1.58it/s, loss=0.0163, lr=1.53e-05, step=4821] Training: 48%|████▊ | 4822/10000 [56:31<54:47, 1.58it/s, loss=0.0163, lr=1.53e-05, step=4821] Training: 48%|████▊ | 4822/10000 [56:31<54:47, 1.58it/s, loss=0.0085, lr=1.53e-05, step=4822] Training: 48%|████▊ | 4823/10000 [56:32<56:29, 1.53it/s, loss=0.0085, lr=1.53e-05, step=4822] Training: 48%|████▊ | 4823/10000 [56:32<56:29, 1.53it/s, loss=0.0025, lr=1.53e-05, step=4823] Training: 48%|████▊ | 4824/10000 [56:32<55:42, 1.55it/s, loss=0.0025, lr=1.53e-05, step=4823] Training: 48%|████▊ | 4824/10000 [56:32<55:42, 1.55it/s, loss=0.0285, lr=1.53e-05, step=4824] Training: 48%|████▊ | 4825/10000 [56:33<1:01:57, 1.39it/s, loss=0.0285, lr=1.53e-05, step=4824] Training: 48%|████▊ | 4825/10000 [56:33<1:01:57, 1.39it/s, loss=0.0113, lr=1.53e-05, step=4825] Training: 48%|████▊ | 4826/10000 [56:34<55:54, 1.54it/s, loss=0.0113, lr=1.53e-05, step=4825] Training: 48%|████▊ | 4826/10000 [56:34<55:54, 1.54it/s, loss=0.0131, lr=1.53e-05, step=4826] Training: 48%|████▊ | 4827/10000 [56:34<51:56, 1.66it/s, loss=0.0131, lr=1.53e-05, step=4826] Training: 48%|████▊ | 4827/10000 [56:34<51:56, 1.66it/s, loss=0.0156, lr=1.53e-05, step=4827] Training: 48%|████▊ | 4828/10000 [56:35<49:09, 1.75it/s, loss=0.0156, lr=1.53e-05, step=4827] Training: 48%|████▊ | 4828/10000 [56:35<49:09, 1.75it/s, loss=0.0050, lr=1.53e-05, step=4828] Training: 48%|████▊ | 4829/10000 [56:35<51:04, 1.69it/s, loss=0.0050, lr=1.53e-05, step=4828] Training: 48%|████▊ | 4829/10000 [56:35<51:04, 1.69it/s, loss=0.0040, lr=1.53e-05, step=4829]17:02:43.075 [I] step=4830 loss=0.0056 smoothed_loss=0.0118 lr=1.53e-05 grad_norm=0.4098 step_time=0.5515s data_time=0.0862s it/s=1.569 eta_to_10000=3296.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0064 grad_action_out_proj=0.1109 grad_shared_expert=0.4317 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4830/10000 [56:36<54:52, 1.57it/s, loss=0.0040, lr=1.53e-05, step=4829] Training: 48%|████▊ | 4830/10000 [56:36<54:52, 1.57it/s, loss=0.0056, lr=1.53e-05, step=4830] Training: 48%|████▊ | 4831/10000 [56:37<51:45, 1.66it/s, loss=0.0056, lr=1.53e-05, step=4830] Training: 48%|████▊ | 4831/10000 [56:37<51:45, 1.66it/s, loss=0.0130, lr=1.53e-05, step=4831] Training: 48%|████▊ | 4832/10000 [56:37<56:36, 1.52it/s, loss=0.0130, lr=1.53e-05, step=4831] Training: 48%|████▊ | 4832/10000 [56:37<56:36, 1.52it/s, loss=0.0084, lr=1.53e-05, step=4832] Training: 48%|████▊ | 4833/10000 [56:38<52:47, 1.63it/s, loss=0.0084, lr=1.53e-05, step=4832] Training: 48%|████▊ | 4833/10000 [56:38<52:47, 1.63it/s, loss=0.0097, lr=1.53e-05, step=4833] Training: 48%|████▊ | 4834/10000 [56:39<51:26, 1.67it/s, loss=0.0097, lr=1.53e-05, step=4833] Training: 48%|████▊ | 4834/10000 [56:39<51:26, 1.67it/s, loss=0.0068, lr=1.53e-05, step=4834] Training: 48%|████▊ | 4835/10000 [56:39<53:40, 1.60it/s, loss=0.0068, lr=1.53e-05, step=4834] Training: 48%|████▊ | 4835/10000 [56:39<53:40, 1.60it/s, loss=0.0294, lr=1.53e-05, step=4835] Training: 48%|████▊ | 4836/10000 [56:40<50:23, 1.71it/s, loss=0.0294, lr=1.53e-05, step=4835] Training: 48%|████▊ | 4836/10000 [56:40<50:23, 1.71it/s, loss=0.0127, lr=1.53e-05, step=4836] Training: 48%|████▊ | 4837/10000 [56:40<53:24, 1.61it/s, loss=0.0127, lr=1.53e-05, step=4836] Training: 48%|████▊ | 4837/10000 [56:40<53:24, 1.61it/s, loss=0.0084, lr=1.53e-05, step=4837] Training: 48%|████▊ | 4838/10000 [56:41<54:51, 1.57it/s, loss=0.0084, lr=1.53e-05, step=4837] Training: 48%|████▊ | 4838/10000 [56:41<54:51, 1.57it/s, loss=0.0244, lr=1.53e-05, step=4838] Training: 48%|████▊ | 4839/10000 [56:42<52:03, 1.65it/s, loss=0.0244, lr=1.53e-05, step=4838] Training: 48%|████▊ | 4839/10000 [56:42<52:03, 1.65it/s, loss=0.0076, lr=1.53e-05, step=4839]17:02:49.347 [I] step=4840 loss=0.0114 smoothed_loss=0.0128 lr=1.53e-05 grad_norm=0.4673 step_time=0.5507s data_time=0.0765s it/s=1.595 eta_to_10000=3235.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0159 grad_action_out_proj=0.1317 grad_shared_expert=0.6183 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4840/10000 [56:42<57:10, 1.50it/s, loss=0.0076, lr=1.53e-05, step=4839] Training: 48%|████▊ | 4840/10000 [56:42<57:10, 1.50it/s, loss=0.0114, lr=1.53e-05, step=4840] Training: 48%|████▊ | 4841/10000 [56:43<55:36, 1.55it/s, loss=0.0114, lr=1.53e-05, step=4840] Training: 48%|████▊ | 4841/10000 [56:43<55:36, 1.55it/s, loss=0.0033, lr=1.53e-05, step=4841] Training: 48%|████▊ | 4842/10000 [56:44<51:56, 1.66it/s, loss=0.0033, lr=1.53e-05, step=4841] Training: 48%|████▊ | 4842/10000 [56:44<51:56, 1.66it/s, loss=0.0115, lr=1.53e-05, step=4842] Training: 48%|████▊ | 4843/10000 [56:44<49:30, 1.74it/s, loss=0.0115, lr=1.53e-05, step=4842] Training: 48%|████▊ | 4843/10000 [56:44<49:30, 1.74it/s, loss=0.0056, lr=1.53e-05, step=4843] Training: 48%|████▊ | 4844/10000 [56:45<47:38, 1.80it/s, loss=0.0056, lr=1.53e-05, step=4843] Training: 48%|████▊ | 4844/10000 [56:45<47:38, 1.80it/s, loss=0.0240, lr=1.53e-05, step=4844] Training: 48%|████▊ | 4845/10000 [56:45<52:10, 1.65it/s, loss=0.0240, lr=1.53e-05, step=4844] Training: 48%|████▊ | 4845/10000 [56:45<52:10, 1.65it/s, loss=0.0173, lr=1.53e-05, step=4845] Training: 48%|████▊ | 4846/10000 [56:46<49:26, 1.74it/s, loss=0.0173, lr=1.53e-05, step=4845] Training: 48%|████▊ | 4846/10000 [56:46<49:26, 1.74it/s, loss=0.0355, lr=1.53e-05, step=4846] Training: 48%|████▊ | 4847/10000 [56:47<54:35, 1.57it/s, loss=0.0355, lr=1.53e-05, step=4846] Training: 48%|████▊ | 4847/10000 [56:47<54:35, 1.57it/s, loss=0.0629, lr=1.52e-05, step=4847] Training: 48%|████▊ | 4848/10000 [56:47<51:20, 1.67it/s, loss=0.0629, lr=1.52e-05, step=4847] Training: 48%|████▊ | 4848/10000 [56:47<51:20, 1.67it/s, loss=0.0040, lr=1.52e-05, step=4848] Training: 48%|████▊ | 4849/10000 [56:48<48:54, 1.76it/s, loss=0.0040, lr=1.52e-05, step=4848] Training: 48%|████▊ | 4849/10000 [56:48<48:54, 1.76it/s, loss=0.0081, lr=1.52e-05, step=4849]17:02:55.032 [I] step=4850 loss=0.0223 smoothed_loss=0.0178 lr=1.53e-05 grad_norm=0.4573 step_time=0.5106s data_time=0.0578s it/s=1.759 eta_to_10000=2927.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0208 grad_action_out_proj=0.1891 grad_shared_expert=0.5177 (10775:train_pytorch.py:850) + Training: 48%|████▊ | 4850/10000 [56:48<48:06, 1.78it/s, loss=0.0081, lr=1.52e-05, step=4849] Training: 48%|████▊ | 4850/10000 [56:48<48:06, 1.78it/s, loss=0.0223, lr=1.52e-05, step=4850] Training: 49%|████▊ | 4851/10000 [56:49<46:47, 1.83it/s, loss=0.0223, lr=1.52e-05, step=4850] Training: 49%|████▊ | 4851/10000 [56:49<46:47, 1.83it/s, loss=0.0158, lr=1.52e-05, step=4851] Training: 49%|████▊ | 4852/10000 [56:49<51:11, 1.68it/s, loss=0.0158, lr=1.52e-05, step=4851] Training: 49%|████▊ | 4852/10000 [56:49<51:11, 1.68it/s, loss=0.0082, lr=1.52e-05, step=4852] Training: 49%|████▊ | 4853/10000 [56:50<49:19, 1.74it/s, loss=0.0082, lr=1.52e-05, step=4852] Training: 49%|████▊ | 4853/10000 [56:50<49:19, 1.74it/s, loss=0.0268, lr=1.52e-05, step=4853] Training: 49%|████▊ | 4854/10000 [56:50<47:31, 1.80it/s, loss=0.0268, lr=1.52e-05, step=4853] Training: 49%|████▊ | 4854/10000 [56:50<47:31, 1.80it/s, loss=0.0259, lr=1.52e-05, step=4854] Training: 49%|████▊ | 4855/10000 [56:51<53:42, 1.60it/s, loss=0.0259, lr=1.52e-05, step=4854] Training: 49%|████▊ | 4855/10000 [56:51<53:42, 1.60it/s, loss=0.0045, lr=1.52e-05, step=4855] Training: 49%|████▊ | 4856/10000 [56:52<52:53, 1.62it/s, loss=0.0045, lr=1.52e-05, step=4855] Training: 49%|████▊ | 4856/10000 [56:52<52:53, 1.62it/s, loss=0.0106, lr=1.52e-05, step=4856] Training: 49%|████▊ | 4857/10000 [56:52<50:03, 1.71it/s, loss=0.0106, lr=1.52e-05, step=4856] Training: 49%|████▊ | 4857/10000 [56:52<50:03, 1.71it/s, loss=0.0236, lr=1.52e-05, step=4857] Training: 49%|████▊ | 4858/10000 [56:53<47:56, 1.79it/s, loss=0.0236, lr=1.52e-05, step=4857] Training: 49%|████▊ | 4858/10000 [56:53<47:56, 1.79it/s, loss=0.0087, lr=1.52e-05, step=4858] Training: 49%|████▊ | 4859/10000 [56:53<52:22, 1.64it/s, loss=0.0087, lr=1.52e-05, step=4858] Training: 49%|████▊ | 4859/10000 [56:53<52:22, 1.64it/s, loss=0.0055, lr=1.52e-05, step=4859]17:03:00.958 [I] step=4860 loss=0.0162 smoothed_loss=0.0153 lr=1.52e-05 grad_norm=0.4722 step_time=0.5343s data_time=0.0583s it/s=1.688 eta_to_10000=3045.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0289 grad_action_out_proj=0.2505 grad_shared_expert=0.5745 (10775:train_pytorch.py:850) + Training: 49%|████▊ | 4860/10000 [56:54<50:30, 1.70it/s, loss=0.0055, lr=1.52e-05, step=4859] Training: 49%|████▊ | 4860/10000 [56:54<50:30, 1.70it/s, loss=0.0162, lr=1.52e-05, step=4860] Training: 49%|████▊ | 4861/10000 [56:55<48:42, 1.76it/s, loss=0.0162, lr=1.52e-05, step=4860] Training: 49%|████▊ | 4861/10000 [56:55<48:42, 1.76it/s, loss=0.0196, lr=1.52e-05, step=4861] Training: 49%|████▊ | 4862/10000 [56:55<54:38, 1.57it/s, loss=0.0196, lr=1.52e-05, step=4861] Training: 49%|████▊ | 4862/10000 [56:55<54:38, 1.57it/s, loss=0.0127, lr=1.52e-05, step=4862] Training: 49%|████▊ | 4863/10000 [56:56<51:02, 1.68it/s, loss=0.0127, lr=1.52e-05, step=4862] Training: 49%|████▊ | 4863/10000 [56:56<51:02, 1.68it/s, loss=0.0058, lr=1.52e-05, step=4863] Training: 49%|████▊ | 4864/10000 [56:56<48:41, 1.76it/s, loss=0.0058, lr=1.52e-05, step=4863] Training: 49%|████▊ | 4864/10000 [56:56<48:41, 1.76it/s, loss=0.0093, lr=1.52e-05, step=4864] Training: 49%|████▊ | 4865/10000 [56:57<47:09, 1.81it/s, loss=0.0093, lr=1.52e-05, step=4864] Training: 49%|████▊ | 4865/10000 [56:57<47:09, 1.81it/s, loss=0.0135, lr=1.52e-05, step=4865] Training: 49%|████▊ | 4866/10000 [56:58<51:10, 1.67it/s, loss=0.0135, lr=1.52e-05, step=4865] Training: 49%|████▊ | 4866/10000 [56:58<51:10, 1.67it/s, loss=0.0334, lr=1.52e-05, step=4866] Training: 49%|████▊ | 4867/10000 [56:58<48:45, 1.75it/s, loss=0.0334, lr=1.52e-05, step=4866] Training: 49%|████▊ | 4867/10000 [56:58<48:45, 1.75it/s, loss=0.0185, lr=1.52e-05, step=4867] Training: 49%|████▊ | 4868/10000 [56:59<46:49, 1.83it/s, loss=0.0185, lr=1.52e-05, step=4867] Training: 49%|████▊ | 4868/10000 [56:59<46:49, 1.83it/s, loss=0.0063, lr=1.52e-05, step=4868] Training: 49%|████▊ | 4869/10000 [56:59<51:46, 1.65it/s, loss=0.0063, lr=1.52e-05, step=4868] Training: 49%|████▊ | 4869/10000 [56:59<51:46, 1.65it/s, loss=0.0138, lr=1.52e-05, step=4869]17:03:06.775 [I] step=4870 loss=0.0219 smoothed_loss=0.0157 lr=1.52e-05 grad_norm=0.4723 step_time=0.5213s data_time=0.0604s it/s=1.719 eta_to_10000=2983.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0796 grad_shared_expert=0.3795 (10775:train_pytorch.py:850) + Training: 49%|████▊ | 4870/10000 [57:00<49:59, 1.71it/s, loss=0.0138, lr=1.52e-05, step=4869] Training: 49%|████▊ | 4870/10000 [57:00<49:59, 1.71it/s, loss=0.0219, lr=1.52e-05, step=4870] Training: 49%|████▊ | 4871/10000 [57:00<47:48, 1.79it/s, loss=0.0219, lr=1.52e-05, step=4870] Training: 49%|████▊ | 4871/10000 [57:00<47:48, 1.79it/s, loss=0.0124, lr=1.52e-05, step=4871] Training: 49%|████▊ | 4872/10000 [57:01<46:54, 1.82it/s, loss=0.0124, lr=1.52e-05, step=4871] Training: 49%|████▊ | 4872/10000 [57:01<46:54, 1.82it/s, loss=0.0067, lr=1.52e-05, step=4872] Training: 49%|████▊ | 4873/10000 [57:02<51:16, 1.67it/s, loss=0.0067, lr=1.52e-05, step=4872] Training: 49%|████▊ | 4873/10000 [57:02<51:16, 1.67it/s, loss=0.0118, lr=1.52e-05, step=4873] Training: 49%|████▊ | 4874/10000 [57:02<49:07, 1.74it/s, loss=0.0118, lr=1.52e-05, step=4873] Training: 49%|████▊ | 4874/10000 [57:02<49:07, 1.74it/s, loss=0.0075, lr=1.51e-05, step=4874] Training: 49%|████▉ | 4875/10000 [57:03<47:17, 1.81it/s, loss=0.0075, lr=1.51e-05, step=4874] Training: 49%|████▉ | 4875/10000 [57:03<47:17, 1.81it/s, loss=0.0052, lr=1.51e-05, step=4875] Training: 49%|████▉ | 4876/10000 [57:03<52:31, 1.63it/s, loss=0.0052, lr=1.51e-05, step=4875] Training: 49%|████▉ | 4876/10000 [57:03<52:31, 1.63it/s, loss=0.0259, lr=1.51e-05, step=4876] Training: 49%|████▉ | 4877/10000 [57:04<49:45, 1.72it/s, loss=0.0259, lr=1.51e-05, step=4876] Training: 49%|████▉ | 4877/10000 [57:04<49:45, 1.72it/s, loss=0.0095, lr=1.51e-05, step=4877] Training: 49%|████▉ | 4878/10000 [57:04<47:51, 1.78it/s, loss=0.0095, lr=1.51e-05, step=4877] Training: 49%|████▉ | 4878/10000 [57:04<47:51, 1.78it/s, loss=0.0045, lr=1.51e-05, step=4878] Training: 49%|████▉ | 4879/10000 [57:05<46:33, 1.83it/s, loss=0.0045, lr=1.51e-05, step=4878] Training: 49%|████▉ | 4879/10000 [57:05<46:33, 1.83it/s, loss=0.0150, lr=1.51e-05, step=4879]17:03:12.606 [I] step=4880 loss=0.0096 smoothed_loss=0.0126 lr=1.51e-05 grad_norm=0.4962 step_time=0.5245s data_time=0.0585s it/s=1.715 eta_to_10000=2984.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0198 grad_action_out_proj=0.1920 grad_shared_expert=0.5521 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4880/10000 [57:06<52:34, 1.62it/s, loss=0.0150, lr=1.51e-05, step=4879] Training: 49%|████▉ | 4880/10000 [57:06<52:34, 1.62it/s, loss=0.0096, lr=1.51e-05, step=4880] Training: 49%|████▉ | 4881/10000 [57:06<49:35, 1.72it/s, loss=0.0096, lr=1.51e-05, step=4880] Training: 49%|████▉ | 4881/10000 [57:06<49:35, 1.72it/s, loss=0.0059, lr=1.51e-05, step=4881] Training: 49%|████▉ | 4882/10000 [57:07<47:24, 1.80it/s, loss=0.0059, lr=1.51e-05, step=4881] Training: 49%|████▉ | 4882/10000 [57:07<47:24, 1.80it/s, loss=0.0089, lr=1.51e-05, step=4882] Training: 49%|████▉ | 4883/10000 [57:07<51:54, 1.64it/s, loss=0.0089, lr=1.51e-05, step=4882] Training: 49%|████▉ | 4883/10000 [57:07<51:54, 1.64it/s, loss=0.0218, lr=1.51e-05, step=4883] Training: 49%|████▉ | 4884/10000 [57:08<49:26, 1.72it/s, loss=0.0218, lr=1.51e-05, step=4883] Training: 49%|████▉ | 4884/10000 [57:08<49:26, 1.72it/s, loss=0.0196, lr=1.51e-05, step=4884] Training: 49%|████▉ | 4885/10000 [57:08<47:43, 1.79it/s, loss=0.0196, lr=1.51e-05, step=4884] Training: 49%|████▉ | 4885/10000 [57:08<47:43, 1.79it/s, loss=0.0356, lr=1.51e-05, step=4885] Training: 49%|████▉ | 4886/10000 [57:09<46:36, 1.83it/s, loss=0.0356, lr=1.51e-05, step=4885] Training: 49%|████▉ | 4886/10000 [57:09<46:36, 1.83it/s, loss=0.0232, lr=1.51e-05, step=4886] Training: 49%|████▉ | 4887/10000 [57:09<45:23, 1.88it/s, loss=0.0232, lr=1.51e-05, step=4886] Training: 49%|████▉ | 4887/10000 [57:09<45:23, 1.88it/s, loss=0.0248, lr=1.51e-05, step=4887] Training: 49%|████▉ | 4888/10000 [57:10<50:03, 1.70it/s, loss=0.0248, lr=1.51e-05, step=4887] Training: 49%|████▉ | 4888/10000 [57:10<50:03, 1.70it/s, loss=0.0137, lr=1.51e-05, step=4888] Training: 49%|████▉ | 4889/10000 [57:11<56:30, 1.51it/s, loss=0.0137, lr=1.51e-05, step=4888] Training: 49%|████▉ | 4889/10000 [57:11<56:30, 1.51it/s, loss=0.0053, lr=1.51e-05, step=4889]17:03:18.470 [I] step=4890 loss=0.0056 smoothed_loss=0.0147 lr=1.51e-05 grad_norm=0.4496 step_time=0.5200s data_time=0.0664s it/s=1.705 eta_to_10000=2996.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0180 grad_action_out_proj=0.1429 grad_shared_expert=0.3655 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4890/10000 [57:12<53:18, 1.60it/s, loss=0.0053, lr=1.51e-05, step=4889] Training: 49%|████▉ | 4890/10000 [57:12<53:18, 1.60it/s, loss=0.0056, lr=1.51e-05, step=4890] Training: 49%|████▉ | 4891/10000 [57:12<50:04, 1.70it/s, loss=0.0056, lr=1.51e-05, step=4890] Training: 49%|████▉ | 4891/10000 [57:12<50:04, 1.70it/s, loss=0.0056, lr=1.51e-05, step=4891] Training: 49%|████▉ | 4892/10000 [57:13<47:47, 1.78it/s, loss=0.0056, lr=1.51e-05, step=4891] Training: 49%|████▉ | 4892/10000 [57:13<47:47, 1.78it/s, loss=0.0358, lr=1.51e-05, step=4892] Training: 49%|████▉ | 4893/10000 [57:13<46:18, 1.84it/s, loss=0.0358, lr=1.51e-05, step=4892] Training: 49%|████▉ | 4893/10000 [57:13<46:18, 1.84it/s, loss=0.0080, lr=1.51e-05, step=4893] Training: 49%|████▉ | 4894/10000 [57:14<45:11, 1.88it/s, loss=0.0080, lr=1.51e-05, step=4893] Training: 49%|████▉ | 4894/10000 [57:14<45:11, 1.88it/s, loss=0.0304, lr=1.51e-05, step=4894] Training: 49%|████▉ | 4895/10000 [57:14<50:20, 1.69it/s, loss=0.0304, lr=1.51e-05, step=4894] Training: 49%|████▉ | 4895/10000 [57:14<50:20, 1.69it/s, loss=0.0222, lr=1.51e-05, step=4895] Training: 49%|████▉ | 4896/10000 [57:15<54:43, 1.55it/s, loss=0.0222, lr=1.51e-05, step=4895] Training: 49%|████▉ | 4896/10000 [57:15<54:43, 1.55it/s, loss=0.0073, lr=1.51e-05, step=4896] Training: 49%|████▉ | 4897/10000 [57:16<51:00, 1.67it/s, loss=0.0073, lr=1.51e-05, step=4896] Training: 49%|████▉ | 4897/10000 [57:16<51:00, 1.67it/s, loss=0.0117, lr=1.51e-05, step=4897] Training: 49%|████▉ | 4898/10000 [57:16<48:34, 1.75it/s, loss=0.0117, lr=1.51e-05, step=4897] Training: 49%|████▉ | 4898/10000 [57:16<48:34, 1.75it/s, loss=0.0035, lr=1.51e-05, step=4898] Training: 49%|████▉ | 4899/10000 [57:17<51:13, 1.66it/s, loss=0.0035, lr=1.51e-05, step=4898] Training: 49%|████▉ | 4899/10000 [57:17<51:13, 1.66it/s, loss=0.0067, lr=1.51e-05, step=4899]17:03:24.211 [I] step=4900 loss=0.0121 smoothed_loss=0.0136 lr=1.51e-05 grad_norm=0.4593 step_time=0.5032s data_time=0.0709s it/s=1.742 eta_to_10000=2927.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0083 grad_action_out_proj=0.1069 grad_shared_expert=0.3265 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4900/10000 [57:17<50:11, 1.69it/s, loss=0.0067, lr=1.51e-05, step=4899] Training: 49%|████▉ | 4900/10000 [57:17<50:11, 1.69it/s, loss=0.0121, lr=1.51e-05, step=4900] Training: 49%|████▉ | 4901/10000 [57:18<47:49, 1.78it/s, loss=0.0121, lr=1.51e-05, step=4900] Training: 49%|████▉ | 4901/10000 [57:18<47:49, 1.78it/s, loss=0.0067, lr=1.50e-05, step=4901] Training: 49%|████▉ | 4902/10000 [57:19<52:00, 1.63it/s, loss=0.0067, lr=1.50e-05, step=4901] Training: 49%|████▉ | 4902/10000 [57:19<52:00, 1.63it/s, loss=0.0125, lr=1.50e-05, step=4902] Training: 49%|████▉ | 4903/10000 [57:19<49:39, 1.71it/s, loss=0.0125, lr=1.50e-05, step=4902] Training: 49%|████▉ | 4903/10000 [57:19<49:39, 1.71it/s, loss=0.0186, lr=1.50e-05, step=4903] Training: 49%|████▉ | 4904/10000 [57:20<53:41, 1.58it/s, loss=0.0186, lr=1.50e-05, step=4903] Training: 49%|████▉ | 4904/10000 [57:20<53:41, 1.58it/s, loss=0.0212, lr=1.50e-05, step=4904] Training: 49%|████▉ | 4905/10000 [57:20<50:21, 1.69it/s, loss=0.0212, lr=1.50e-05, step=4904] Training: 49%|████▉ | 4905/10000 [57:20<50:21, 1.69it/s, loss=0.0139, lr=1.50e-05, step=4905] Training: 49%|████▉ | 4906/10000 [57:21<47:40, 1.78it/s, loss=0.0139, lr=1.50e-05, step=4905] Training: 49%|████▉ | 4906/10000 [57:21<47:40, 1.78it/s, loss=0.0127, lr=1.50e-05, step=4906] Training: 49%|████▉ | 4907/10000 [57:21<46:01, 1.84it/s, loss=0.0127, lr=1.50e-05, step=4906] Training: 49%|████▉ | 4907/10000 [57:21<46:01, 1.84it/s, loss=0.0189, lr=1.50e-05, step=4907] Training: 49%|████▉ | 4908/10000 [57:22<45:11, 1.88it/s, loss=0.0189, lr=1.50e-05, step=4907] Training: 49%|████▉ | 4908/10000 [57:22<45:11, 1.88it/s, loss=0.0177, lr=1.50e-05, step=4908] Training: 49%|████▉ | 4909/10000 [57:22<44:10, 1.92it/s, loss=0.0177, lr=1.50e-05, step=4908] Training: 49%|████▉ | 4909/10000 [57:22<44:10, 1.92it/s, loss=0.0078, lr=1.50e-05, step=4909]17:03:29.940 [I] step=4910 loss=0.0180 smoothed_loss=0.0145 lr=1.50e-05 grad_norm=0.4714 step_time=0.5150s data_time=0.0579s it/s=1.746 eta_to_10000=2915.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0240 grad_action_out_proj=0.2115 grad_shared_expert=0.5912 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4910/10000 [57:23<50:01, 1.70it/s, loss=0.0078, lr=1.50e-05, step=4909] Training: 49%|████▉ | 4910/10000 [57:23<50:01, 1.70it/s, loss=0.0180, lr=1.50e-05, step=4910] Training: 49%|████▉ | 4911/10000 [57:24<53:54, 1.57it/s, loss=0.0180, lr=1.50e-05, step=4910] Training: 49%|████▉ | 4911/10000 [57:24<53:54, 1.57it/s, loss=0.0265, lr=1.50e-05, step=4911] Training: 49%|████▉ | 4912/10000 [57:24<51:06, 1.66it/s, loss=0.0265, lr=1.50e-05, step=4911] Training: 49%|████▉ | 4912/10000 [57:24<51:06, 1.66it/s, loss=0.0480, lr=1.50e-05, step=4912] Training: 49%|████▉ | 4913/10000 [57:25<48:21, 1.75it/s, loss=0.0480, lr=1.50e-05, step=4912] Training: 49%|████▉ | 4913/10000 [57:25<48:21, 1.75it/s, loss=0.0018, lr=1.50e-05, step=4913] Training: 49%|████▉ | 4914/10000 [57:25<46:52, 1.81it/s, loss=0.0018, lr=1.50e-05, step=4913] Training: 49%|████▉ | 4914/10000 [57:25<46:52, 1.81it/s, loss=0.0077, lr=1.50e-05, step=4914] Training: 49%|████▉ | 4915/10000 [57:26<45:40, 1.86it/s, loss=0.0077, lr=1.50e-05, step=4914] Training: 49%|████▉ | 4915/10000 [57:26<45:40, 1.86it/s, loss=0.0075, lr=1.50e-05, step=4915] Training: 49%|████▉ | 4916/10000 [57:26<44:25, 1.91it/s, loss=0.0075, lr=1.50e-05, step=4915] Training: 49%|████▉ | 4916/10000 [57:26<44:25, 1.91it/s, loss=0.0125, lr=1.50e-05, step=4916] Training: 49%|████▉ | 4917/10000 [57:27<48:47, 1.74it/s, loss=0.0125, lr=1.50e-05, step=4916] Training: 49%|████▉ | 4917/10000 [57:27<48:47, 1.74it/s, loss=0.0169, lr=1.50e-05, step=4917] Training: 49%|████▉ | 4918/10000 [57:27<46:30, 1.82it/s, loss=0.0169, lr=1.50e-05, step=4917] Training: 49%|████▉ | 4918/10000 [57:27<46:30, 1.82it/s, loss=0.0037, lr=1.50e-05, step=4918] Training: 49%|████▉ | 4919/10000 [57:28<50:58, 1.66it/s, loss=0.0037, lr=1.50e-05, step=4918] Training: 49%|████▉ | 4919/10000 [57:28<50:58, 1.66it/s, loss=0.0065, lr=1.50e-05, step=4919]17:03:35.674 [I] step=4920 loss=0.0165 smoothed_loss=0.0137 lr=1.50e-05 grad_norm=0.5552 step_time=0.5159s data_time=0.0576s it/s=1.744 eta_to_10000=2912.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0200 grad_action_out_proj=0.1625 grad_shared_expert=0.5360 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4920/10000 [57:29<49:44, 1.70it/s, loss=0.0065, lr=1.50e-05, step=4919] Training: 49%|████▉ | 4920/10000 [57:29<49:44, 1.70it/s, loss=0.0165, lr=1.50e-05, step=4920] Training: 49%|████▉ | 4921/10000 [57:29<47:54, 1.77it/s, loss=0.0165, lr=1.50e-05, step=4920] Training: 49%|████▉ | 4921/10000 [57:29<47:54, 1.77it/s, loss=0.0136, lr=1.50e-05, step=4921] Training: 49%|████▉ | 4922/10000 [57:30<47:00, 1.80it/s, loss=0.0136, lr=1.50e-05, step=4921] Training: 49%|████▉ | 4922/10000 [57:30<47:00, 1.80it/s, loss=0.0091, lr=1.50e-05, step=4922] Training: 49%|████▉ | 4923/10000 [57:31<51:14, 1.65it/s, loss=0.0091, lr=1.50e-05, step=4922] Training: 49%|████▉ | 4923/10000 [57:31<51:14, 1.65it/s, loss=0.0217, lr=1.50e-05, step=4923] Training: 49%|████▉ | 4924/10000 [57:31<48:23, 1.75it/s, loss=0.0217, lr=1.50e-05, step=4923] Training: 49%|████▉ | 4924/10000 [57:31<48:23, 1.75it/s, loss=0.0086, lr=1.50e-05, step=4924] Training: 49%|████▉ | 4925/10000 [57:32<46:37, 1.81it/s, loss=0.0086, lr=1.50e-05, step=4924] Training: 49%|████▉ | 4925/10000 [57:32<46:37, 1.81it/s, loss=0.0065, lr=1.50e-05, step=4925] Training: 49%|████▉ | 4926/10000 [57:32<50:58, 1.66it/s, loss=0.0065, lr=1.50e-05, step=4925] Training: 49%|████▉ | 4926/10000 [57:32<50:58, 1.66it/s, loss=0.0071, lr=1.50e-05, step=4926] Training: 49%|████▉ | 4927/10000 [57:33<48:53, 1.73it/s, loss=0.0071, lr=1.50e-05, step=4926] Training: 49%|████▉ | 4927/10000 [57:33<48:53, 1.73it/s, loss=0.0168, lr=1.50e-05, step=4927] Training: 49%|████▉ | 4928/10000 [57:33<46:53, 1.80it/s, loss=0.0168, lr=1.50e-05, step=4927] Training: 49%|████▉ | 4928/10000 [57:33<46:53, 1.80it/s, loss=0.0220, lr=1.49e-05, step=4928] Training: 49%|████▉ | 4929/10000 [57:34<45:54, 1.84it/s, loss=0.0220, lr=1.49e-05, step=4928] Training: 49%|████▉ | 4929/10000 [57:34<45:54, 1.84it/s, loss=0.0056, lr=1.49e-05, step=4929]17:03:41.250 [I] step=4930 loss=0.0111 smoothed_loss=0.0126 lr=1.50e-05 grad_norm=0.4791 step_time=0.4976s data_time=0.0599s it/s=1.794 eta_to_10000=2826.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0236 grad_action_out_proj=0.1274 grad_shared_expert=0.4371 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4930/10000 [57:34<46:03, 1.83it/s, loss=0.0056, lr=1.49e-05, step=4929] Training: 49%|████▉ | 4930/10000 [57:34<46:03, 1.83it/s, loss=0.0111, lr=1.49e-05, step=4930] Training: 49%|████▉ | 4931/10000 [57:35<50:54, 1.66it/s, loss=0.0111, lr=1.49e-05, step=4930] Training: 49%|████▉ | 4931/10000 [57:35<50:54, 1.66it/s, loss=0.0417, lr=1.49e-05, step=4931] Training: 49%|████▉ | 4932/10000 [57:36<48:50, 1.73it/s, loss=0.0417, lr=1.49e-05, step=4931] Training: 49%|████▉ | 4932/10000 [57:36<48:50, 1.73it/s, loss=0.0076, lr=1.49e-05, step=4932] Training: 49%|████▉ | 4933/10000 [57:36<53:14, 1.59it/s, loss=0.0076, lr=1.49e-05, step=4932] Training: 49%|████▉ | 4933/10000 [57:36<53:14, 1.59it/s, loss=0.0090, lr=1.49e-05, step=4933] Training: 49%|████▉ | 4934/10000 [57:37<50:17, 1.68it/s, loss=0.0090, lr=1.49e-05, step=4933] Training: 49%|████▉ | 4934/10000 [57:37<50:17, 1.68it/s, loss=0.0088, lr=1.49e-05, step=4934] Training: 49%|████▉ | 4935/10000 [57:37<48:30, 1.74it/s, loss=0.0088, lr=1.49e-05, step=4934] Training: 49%|████▉ | 4935/10000 [57:37<48:30, 1.74it/s, loss=0.0254, lr=1.49e-05, step=4935] Training: 49%|████▉ | 4936/10000 [57:38<46:58, 1.80it/s, loss=0.0254, lr=1.49e-05, step=4935] Training: 49%|████▉ | 4936/10000 [57:38<46:58, 1.80it/s, loss=0.0141, lr=1.49e-05, step=4936] Training: 49%|████▉ | 4937/10000 [57:38<45:32, 1.85it/s, loss=0.0141, lr=1.49e-05, step=4936] Training: 49%|████▉ | 4937/10000 [57:38<45:32, 1.85it/s, loss=0.0266, lr=1.49e-05, step=4937] Training: 49%|████▉ | 4938/10000 [57:39<51:23, 1.64it/s, loss=0.0266, lr=1.49e-05, step=4937] Training: 49%|████▉ | 4938/10000 [57:39<51:23, 1.64it/s, loss=0.0081, lr=1.49e-05, step=4938] Training: 49%|████▉ | 4939/10000 [57:40<49:04, 1.72it/s, loss=0.0081, lr=1.49e-05, step=4938] Training: 49%|████▉ | 4939/10000 [57:40<49:04, 1.72it/s, loss=0.0488, lr=1.49e-05, step=4939]17:03:47.173 [I] step=4940 loss=0.0062 smoothed_loss=0.0173 lr=1.49e-05 grad_norm=0.4575 step_time=0.5333s data_time=0.0590s it/s=1.689 eta_to_10000=2996.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.1203 grad_shared_expert=0.3897 (10775:train_pytorch.py:850) + Training: 49%|████▉ | 4940/10000 [57:40<48:44, 1.73it/s, loss=0.0488, lr=1.49e-05, step=4939] Training: 49%|████▉ | 4940/10000 [57:40<48:44, 1.73it/s, loss=0.0062, lr=1.49e-05, step=4940] Training: 49%|████▉ | 4941/10000 [57:41<52:35, 1.60it/s, loss=0.0062, lr=1.49e-05, step=4940] Training: 49%|████▉ | 4941/10000 [57:41<52:35, 1.60it/s, loss=0.0030, lr=1.49e-05, step=4941] Training: 49%|████▉ | 4942/10000 [57:41<49:52, 1.69it/s, loss=0.0030, lr=1.49e-05, step=4941] Training: 49%|████▉ | 4942/10000 [57:41<49:52, 1.69it/s, loss=0.0041, lr=1.49e-05, step=4942] Training: 49%|████▉ | 4943/10000 [57:42<48:00, 1.76it/s, loss=0.0041, lr=1.49e-05, step=4942] Training: 49%|████▉ | 4943/10000 [57:42<48:00, 1.76it/s, loss=0.0107, lr=1.49e-05, step=4943] Training: 49%|████▉ | 4944/10000 [57:43<47:10, 1.79it/s, loss=0.0107, lr=1.49e-05, step=4943] Training: 49%|████▉ | 4944/10000 [57:43<47:10, 1.79it/s, loss=0.0982, lr=1.49e-05, step=4944] Training: 49%|████▉ | 4945/10000 [57:43<52:40, 1.60it/s, loss=0.0982, lr=1.49e-05, step=4944] Training: 49%|████▉ | 4945/10000 [57:43<52:40, 1.60it/s, loss=0.0041, lr=1.49e-05, step=4945] Training: 49%|████▉ | 4946/10000 [57:44<49:39, 1.70it/s, loss=0.0041, lr=1.49e-05, step=4945] Training: 49%|████▉ | 4946/10000 [57:44<49:39, 1.70it/s, loss=0.0171, lr=1.49e-05, step=4946] Training: 49%|████▉ | 4947/10000 [57:44<47:53, 1.76it/s, loss=0.0171, lr=1.49e-05, step=4946] Training: 49%|████▉ | 4947/10000 [57:44<47:53, 1.76it/s, loss=0.0044, lr=1.49e-05, step=4947] Training: 49%|████▉ | 4948/10000 [57:45<52:25, 1.61it/s, loss=0.0044, lr=1.49e-05, step=4947] Training: 49%|████▉ | 4948/10000 [57:45<52:25, 1.61it/s, loss=0.0028, lr=1.49e-05, step=4948] Training: 49%|████▉ | 4949/10000 [57:46<48:53, 1.72it/s, loss=0.0028, lr=1.49e-05, step=4948] Training: 49%|████▉ | 4949/10000 [57:46<48:53, 1.72it/s, loss=0.0090, lr=1.49e-05, step=4949]17:03:53.047 [I] step=4950 loss=0.0095 smoothed_loss=0.0157 lr=1.49e-05 grad_norm=0.4565 step_time=0.5291s data_time=0.0586s it/s=1.703 eta_to_10000=2965.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0340 grad_action_out_proj=0.2413 grad_shared_expert=0.6007 (10775:train_pytorch.py:850) + Training: 50%|████▉ | 4950/10000 [57:46<47:45, 1.76it/s, loss=0.0090, lr=1.49e-05, step=4949] Training: 50%|████▉ | 4950/10000 [57:46<47:45, 1.76it/s, loss=0.0095, lr=1.49e-05, step=4950] Training: 50%|████▉ | 4951/10000 [57:47<46:28, 1.81it/s, loss=0.0095, lr=1.49e-05, step=4950] Training: 50%|████▉ | 4951/10000 [57:47<46:28, 1.81it/s, loss=0.0076, lr=1.49e-05, step=4951] Training: 50%|████▉ | 4952/10000 [57:47<51:00, 1.65it/s, loss=0.0076, lr=1.49e-05, step=4951] Training: 50%|████▉ | 4952/10000 [57:47<51:00, 1.65it/s, loss=0.0059, lr=1.49e-05, step=4952] Training: 50%|████▉ | 4953/10000 [57:48<48:09, 1.75it/s, loss=0.0059, lr=1.49e-05, step=4952] Training: 50%|████▉ | 4953/10000 [57:48<48:09, 1.75it/s, loss=0.0050, lr=1.49e-05, step=4953] Training: 50%|████▉ | 4954/10000 [57:48<46:10, 1.82it/s, loss=0.0050, lr=1.49e-05, step=4953] Training: 50%|████▉ | 4954/10000 [57:48<46:10, 1.82it/s, loss=0.0207, lr=1.49e-05, step=4954] Training: 50%|████▉ | 4955/10000 [57:49<50:05, 1.68it/s, loss=0.0207, lr=1.49e-05, step=4954] Training: 50%|████▉ | 4955/10000 [57:49<50:05, 1.68it/s, loss=0.0142, lr=1.48e-05, step=4955] Training: 50%|████▉ | 4956/10000 [57:50<47:27, 1.77it/s, loss=0.0142, lr=1.48e-05, step=4955] Training: 50%|████▉ | 4956/10000 [57:50<47:27, 1.77it/s, loss=0.0087, lr=1.48e-05, step=4956] Training: 50%|████▉ | 4957/10000 [57:50<45:56, 1.83it/s, loss=0.0087, lr=1.48e-05, step=4956] Training: 50%|████▉ | 4957/10000 [57:50<45:56, 1.83it/s, loss=0.0110, lr=1.48e-05, step=4957] Training: 50%|████▉ | 4958/10000 [57:51<44:58, 1.87it/s, loss=0.0110, lr=1.48e-05, step=4957] Training: 50%|████▉ | 4958/10000 [57:51<44:58, 1.87it/s, loss=0.0046, lr=1.48e-05, step=4958] Training: 50%|████▉ | 4959/10000 [57:51<49:52, 1.68it/s, loss=0.0046, lr=1.48e-05, step=4958] Training: 50%|████▉ | 4959/10000 [57:51<49:52, 1.68it/s, loss=0.0140, lr=1.48e-05, step=4959]17:03:58.779 [I] step=4960 loss=0.0482 smoothed_loss=0.0160 lr=1.48e-05 grad_norm=0.4822 step_time=0.5139s data_time=0.0592s it/s=1.745 eta_to_10000=2888.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0222 grad_action_out_proj=0.1400 grad_shared_expert=0.3642 (10775:train_pytorch.py:850) + Training: 50%|████▉ | 4960/10000 [57:52<48:54, 1.72it/s, loss=0.0140, lr=1.48e-05, step=4959] Training: 50%|████▉ | 4960/10000 [57:52<48:54, 1.72it/s, loss=0.0482, lr=1.48e-05, step=4960] Training: 50%|████▉ | 4961/10000 [57:52<46:36, 1.80it/s, loss=0.0482, lr=1.48e-05, step=4960] Training: 50%|████▉ | 4961/10000 [57:52<46:36, 1.80it/s, loss=0.0098, lr=1.48e-05, step=4961] Training: 50%|████▉ | 4962/10000 [57:53<50:45, 1.65it/s, loss=0.0098, lr=1.48e-05, step=4961] Training: 50%|████▉ | 4962/10000 [57:53<50:45, 1.65it/s, loss=0.0434, lr=1.48e-05, step=4962] Training: 50%|████▉ | 4963/10000 [57:54<47:41, 1.76it/s, loss=0.0434, lr=1.48e-05, step=4962] Training: 50%|████▉ | 4963/10000 [57:54<47:41, 1.76it/s, loss=0.0031, lr=1.48e-05, step=4963] Training: 50%|████▉ | 4964/10000 [57:54<45:52, 1.83it/s, loss=0.0031, lr=1.48e-05, step=4963] Training: 50%|████▉ | 4964/10000 [57:54<45:52, 1.83it/s, loss=0.0093, lr=1.48e-05, step=4964] Training: 50%|████▉ | 4965/10000 [57:55<44:37, 1.88it/s, loss=0.0093, lr=1.48e-05, step=4964] Training: 50%|████▉ | 4965/10000 [57:55<44:37, 1.88it/s, loss=0.0047, lr=1.48e-05, step=4965] Training: 50%|████▉ | 4966/10000 [57:55<49:07, 1.71it/s, loss=0.0047, lr=1.48e-05, step=4965] Training: 50%|████▉ | 4966/10000 [57:55<49:07, 1.71it/s, loss=0.0089, lr=1.48e-05, step=4966] Training: 50%|████▉ | 4967/10000 [57:56<47:06, 1.78it/s, loss=0.0089, lr=1.48e-05, step=4966] Training: 50%|████▉ | 4967/10000 [57:56<47:06, 1.78it/s, loss=0.0306, lr=1.48e-05, step=4967] Training: 50%|████▉ | 4968/10000 [57:57<52:08, 1.61it/s, loss=0.0306, lr=1.48e-05, step=4967] Training: 50%|████▉ | 4968/10000 [57:57<52:08, 1.61it/s, loss=0.0023, lr=1.48e-05, step=4968] Training: 50%|████▉ | 4969/10000 [57:57<49:13, 1.70it/s, loss=0.0023, lr=1.48e-05, step=4968] Training: 50%|████▉ | 4969/10000 [57:57<49:13, 1.70it/s, loss=0.0029, lr=1.48e-05, step=4969]17:04:04.497 [I] step=4970 loss=0.0120 smoothed_loss=0.0132 lr=1.48e-05 grad_norm=0.3964 step_time=0.5115s data_time=0.0602s it/s=1.749 eta_to_10000=2875.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0057 grad_action_out_proj=0.0960 grad_shared_expert=0.2964 (10775:train_pytorch.py:850) + Training: 50%|████▉ | 4970/10000 [57:58<48:07, 1.74it/s, loss=0.0029, lr=1.48e-05, step=4969] Training: 50%|████▉ | 4970/10000 [57:58<48:07, 1.74it/s, loss=0.0120, lr=1.48e-05, step=4970] Training: 50%|████▉ | 4971/10000 [57:58<46:10, 1.81it/s, loss=0.0120, lr=1.48e-05, step=4970] Training: 50%|████▉ | 4971/10000 [57:58<46:10, 1.81it/s, loss=0.0482, lr=1.48e-05, step=4971] Training: 50%|████▉ | 4972/10000 [57:59<44:42, 1.87it/s, loss=0.0482, lr=1.48e-05, step=4971] Training: 50%|████▉ | 4972/10000 [57:59<44:42, 1.87it/s, loss=0.0028, lr=1.48e-05, step=4972] Training: 50%|████▉ | 4973/10000 [57:59<43:40, 1.92it/s, loss=0.0028, lr=1.48e-05, step=4972] Training: 50%|████▉ | 4973/10000 [57:59<43:40, 1.92it/s, loss=0.0281, lr=1.48e-05, step=4973] Training: 50%|████▉ | 4974/10000 [58:00<49:11, 1.70it/s, loss=0.0281, lr=1.48e-05, step=4973] Training: 50%|████▉ | 4974/10000 [58:00<49:11, 1.70it/s, loss=0.0065, lr=1.48e-05, step=4974] Training: 50%|████▉ | 4975/10000 [58:01<52:34, 1.59it/s, loss=0.0065, lr=1.48e-05, step=4974] Training: 50%|████▉ | 4975/10000 [58:01<52:34, 1.59it/s, loss=0.0132, lr=1.48e-05, step=4975] Training: 50%|████▉ | 4976/10000 [58:01<49:31, 1.69it/s, loss=0.0132, lr=1.48e-05, step=4975] Training: 50%|████▉ | 4976/10000 [58:01<49:31, 1.69it/s, loss=0.0106, lr=1.48e-05, step=4976] Training: 50%|████▉ | 4977/10000 [58:02<47:42, 1.76it/s, loss=0.0106, lr=1.48e-05, step=4976] Training: 50%|████▉ | 4977/10000 [58:02<47:42, 1.76it/s, loss=0.0079, lr=1.48e-05, step=4977] Training: 50%|████▉ | 4978/10000 [58:02<46:01, 1.82it/s, loss=0.0079, lr=1.48e-05, step=4977] Training: 50%|████▉ | 4978/10000 [58:02<46:01, 1.82it/s, loss=0.0075, lr=1.48e-05, step=4978] Training: 50%|████▉ | 4979/10000 [58:03<44:49, 1.87it/s, loss=0.0075, lr=1.48e-05, step=4978] Training: 50%|████▉ | 4979/10000 [58:03<44:49, 1.87it/s, loss=0.0065, lr=1.48e-05, step=4979]17:04:10.019 [I] step=4980 loss=0.0257 smoothed_loss=0.0141 lr=1.48e-05 grad_norm=0.4361 step_time=0.4939s data_time=0.0583s it/s=1.811 eta_to_10000=2771.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0151 grad_action_out_proj=0.1783 grad_shared_expert=0.5273 (10775:train_pytorch.py:850) + Training: 50%|████▉ | 4980/10000 [58:03<45:01, 1.86it/s, loss=0.0065, lr=1.48e-05, step=4979] Training: 50%|████▉ | 4980/10000 [58:03<45:01, 1.86it/s, loss=0.0257, lr=1.48e-05, step=4980] Training: 50%|████▉ | 4981/10000 [58:04<49:23, 1.69it/s, loss=0.0257, lr=1.48e-05, step=4980] Training: 50%|████▉ | 4981/10000 [58:04<49:23, 1.69it/s, loss=0.2252, lr=1.48e-05, step=4981] Training: 50%|████▉ | 4982/10000 [58:04<47:36, 1.76it/s, loss=0.2252, lr=1.48e-05, step=4981] Training: 50%|████▉ | 4982/10000 [58:04<47:36, 1.76it/s, loss=0.0072, lr=1.47e-05, step=4982] Training: 50%|████▉ | 4983/10000 [58:05<52:11, 1.60it/s, loss=0.0072, lr=1.47e-05, step=4982] Training: 50%|████▉ | 4983/10000 [58:05<52:11, 1.60it/s, loss=0.0239, lr=1.47e-05, step=4983] Training: 50%|████▉ | 4984/10000 [58:06<49:17, 1.70it/s, loss=0.0239, lr=1.47e-05, step=4983] Training: 50%|████▉ | 4984/10000 [58:06<49:17, 1.70it/s, loss=0.0016, lr=1.47e-05, step=4984] Training: 50%|████▉ | 4985/10000 [58:06<46:57, 1.78it/s, loss=0.0016, lr=1.47e-05, step=4984] Training: 50%|████▉ | 4985/10000 [58:06<46:57, 1.78it/s, loss=0.0178, lr=1.47e-05, step=4985] Training: 50%|████▉ | 4986/10000 [58:07<45:34, 1.83it/s, loss=0.0178, lr=1.47e-05, step=4985] Training: 50%|████▉ | 4986/10000 [58:07<45:34, 1.83it/s, loss=0.0177, lr=1.47e-05, step=4986] Training: 50%|████▉ | 4987/10000 [58:07<44:11, 1.89it/s, loss=0.0177, lr=1.47e-05, step=4986] Training: 50%|████▉ | 4987/10000 [58:07<44:11, 1.89it/s, loss=0.0113, lr=1.47e-05, step=4987] Training: 50%|████▉ | 4988/10000 [58:08<49:36, 1.68it/s, loss=0.0113, lr=1.47e-05, step=4987] Training: 50%|████▉ | 4988/10000 [58:08<49:36, 1.68it/s, loss=0.0192, lr=1.47e-05, step=4988] Training: 50%|████▉ | 4989/10000 [58:08<47:43, 1.75it/s, loss=0.0192, lr=1.47e-05, step=4988] Training: 50%|████▉ | 4989/10000 [58:08<47:43, 1.75it/s, loss=0.0174, lr=1.47e-05, step=4989]17:04:16.046 [I] step=4990 loss=0.0027 smoothed_loss=0.0216 lr=1.47e-05 grad_norm=0.5428 step_time=0.5430s data_time=0.0598s it/s=1.659 eta_to_10000=3019.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0159 grad_action_out_proj=0.1287 grad_shared_expert=0.3766 (10775:train_pytorch.py:850) + Training: 50%|████▉ | 4990/10000 [58:09<52:48, 1.58it/s, loss=0.0174, lr=1.47e-05, step=4989] Training: 50%|████▉ | 4990/10000 [58:09<52:48, 1.58it/s, loss=0.0027, lr=1.47e-05, step=4990] Training: 50%|████▉ | 4991/10000 [58:10<49:49, 1.68it/s, loss=0.0027, lr=1.47e-05, step=4990] Training: 50%|████▉ | 4991/10000 [58:10<49:49, 1.68it/s, loss=0.0150, lr=1.47e-05, step=4991] Training: 50%|████▉ | 4992/10000 [58:10<48:17, 1.73it/s, loss=0.0150, lr=1.47e-05, step=4991] Training: 50%|████▉ | 4992/10000 [58:10<48:17, 1.73it/s, loss=0.0459, lr=1.47e-05, step=4992] Training: 50%|████▉ | 4993/10000 [58:11<45:54, 1.82it/s, loss=0.0459, lr=1.47e-05, step=4992] Training: 50%|████▉ | 4993/10000 [58:11<45:54, 1.82it/s, loss=0.0050, lr=1.47e-05, step=4993] Training: 50%|████▉ | 4994/10000 [58:11<44:29, 1.88it/s, loss=0.0050, lr=1.47e-05, step=4993] Training: 50%|████▉ | 4994/10000 [58:11<44:29, 1.88it/s, loss=0.0117, lr=1.47e-05, step=4994] Training: 50%|████▉ | 4995/10000 [58:12<49:05, 1.70it/s, loss=0.0117, lr=1.47e-05, step=4994] Training: 50%|████▉ | 4995/10000 [58:12<49:05, 1.70it/s, loss=0.0100, lr=1.47e-05, step=4995] Training: 50%|████▉ | 4996/10000 [58:12<46:50, 1.78it/s, loss=0.0100, lr=1.47e-05, step=4995] Training: 50%|████▉ | 4996/10000 [58:12<46:50, 1.78it/s, loss=0.0058, lr=1.47e-05, step=4996] Training: 50%|████▉ | 4997/10000 [58:13<45:02, 1.85it/s, loss=0.0058, lr=1.47e-05, step=4996] Training: 50%|████▉ | 4997/10000 [58:13<45:02, 1.85it/s, loss=0.0212, lr=1.47e-05, step=4997] Training: 50%|████▉ | 4998/10000 [58:14<50:28, 1.65it/s, loss=0.0212, lr=1.47e-05, step=4997] Training: 50%|████▉ | 4998/10000 [58:14<50:28, 1.65it/s, loss=0.0037, lr=1.47e-05, step=4998] Training: 50%|████▉ | 4999/10000 [58:14<48:18, 1.73it/s, loss=0.0037, lr=1.47e-05, step=4998] Training: 50%|████▉ | 4999/10000 [58:14<48:18, 1.73it/s, loss=0.0257, lr=1.47e-05, step=4999]17:04:21.626 [I] step=5000 loss=0.0038 smoothed_loss=0.0165 lr=1.47e-05 grad_norm=0.5112 step_time=0.4974s data_time=0.0606s it/s=1.792 eta_to_10000=2789.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1353 grad_shared_expert=1.1505 (10775:train_pytorch.py:850) +17:05:36.535 [I] Saved checkpoint at step 5000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000 (10775:train_pytorch.py:350) + Training: 50%|█████ | 5000/10000 [59:30<32:00:51, 23.05s/it, loss=0.0257, lr=1.47e-05, step=4999] Training: 50%|█████ | 5000/10000 [59:30<32:00:51, 23.05s/it, loss=0.0038, lr=1.47e-05, step=5000] Training: 50%|█████ | 5001/10000 [59:30<22:41:18, 16.34s/it, loss=0.0038, lr=1.47e-05, step=5000] Training: 50%|█████ | 5001/10000 [59:30<22:41:18, 16.34s/it, loss=0.0169, lr=1.47e-05, step=5001] Training: 50%|█████ | 5002/10000 [59:31<16:14:31, 11.70s/it, loss=0.0169, lr=1.47e-05, step=5001] Training: 50%|█████ | 5002/10000 [59:31<16:14:31, 11.70s/it, loss=0.0037, lr=1.47e-05, step=5002] Training: 50%|█████ | 5003/10000 [59:32<11:42:01, 8.43s/it, loss=0.0037, lr=1.47e-05, step=5002] Training: 50%|█████ | 5003/10000 [59:32<11:42:01, 8.43s/it, loss=0.0163, lr=1.47e-05, step=5003] Training: 50%|█████ | 5004/10000 [59:32<8:24:27, 6.06s/it, loss=0.0163, lr=1.47e-05, step=5003] Training: 50%|█████ | 5004/10000 [59:32<8:24:27, 6.06s/it, loss=0.0132, lr=1.47e-05, step=5004] Training: 50%|█████ | 5005/10000 [59:33<6:17:04, 4.53s/it, loss=0.0132, lr=1.47e-05, step=5004] Training: 50%|█████ | 5005/10000 [59:33<6:17:04, 4.53s/it, loss=0.0024, lr=1.47e-05, step=5005] Training: 50%|█████ | 5006/10000 [59:34<4:37:52, 3.34s/it, loss=0.0024, lr=1.47e-05, step=5005] Training: 50%|█████ | 5006/10000 [59:34<4:37:52, 3.34s/it, loss=0.0095, lr=1.47e-05, step=5006] Training: 50%|█████ | 5007/10000 [59:35<3:27:34, 2.49s/it, loss=0.0095, lr=1.47e-05, step=5006] Training: 50%|█████ | 5007/10000 [59:35<3:27:34, 2.49s/it, loss=0.0328, lr=1.47e-05, step=5007] Training: 50%|█████ | 5008/10000 [59:35<2:38:12, 1.90s/it, loss=0.0328, lr=1.47e-05, step=5007] Training: 50%|█████ | 5008/10000 [59:35<2:38:12, 1.90s/it, loss=0.0108, lr=1.47e-05, step=5008] Training: 50%|█████ | 5009/10000 [59:36<2:12:07, 1.59s/it, loss=0.0108, lr=1.47e-05, step=5008] Training: 50%|█████ | 5009/10000 [59:36<2:12:07, 1.59s/it, loss=0.0041, lr=1.46e-05, step=5009]17:05:43.400 [I] step=5010 loss=0.0162 smoothed_loss=0.0141 lr=1.47e-05 grad_norm=0.4896 step_time=0.5870s data_time=7.5904s it/s=0.122 eta_to_10000=40804.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0176 grad_action_out_proj=0.1201 grad_shared_expert=0.3928 (10775:train_pytorch.py:850) + Training: 50%|█████ | 5010/10000 [59:36<1:46:27, 1.28s/it, loss=0.0041, lr=1.46e-05, step=5009] Training: 50%|█████ | 5010/10000 [59:36<1:46:27, 1.28s/it, loss=0.0162, lr=1.46e-05, step=5010] Training: 50%|█████ | 5011/10000 [59:37<1:32:24, 1.11s/it, loss=0.0162, lr=1.46e-05, step=5010] Training: 50%|█████ | 5011/10000 [59:37<1:32:24, 1.11s/it, loss=0.0021, lr=1.46e-05, step=5011] Training: 50%|█████ | 5012/10000 [59:38<1:17:35, 1.07it/s, loss=0.0021, lr=1.46e-05, step=5011] Training: 50%|█████ | 5012/10000 [59:38<1:17:35, 1.07it/s, loss=0.0036, lr=1.46e-05, step=5012] Training: 50%|█████ | 5013/10000 [59:38<1:12:17, 1.15it/s, loss=0.0036, lr=1.46e-05, step=5012] Training: 50%|█████ | 5013/10000 [59:38<1:12:17, 1.15it/s, loss=0.0233, lr=1.46e-05, step=5013] Training: 50%|█████ | 5014/10000 [59:39<1:03:12, 1.31it/s, loss=0.0233, lr=1.46e-05, step=5013] Training: 50%|█████ | 5014/10000 [59:39<1:03:12, 1.31it/s, loss=0.0025, lr=1.46e-05, step=5014] Training: 50%|█████ | 5015/10000 [59:40<1:02:30, 1.33it/s, loss=0.0025, lr=1.46e-05, step=5014] Training: 50%|█████ | 5015/10000 [59:40<1:02:30, 1.33it/s, loss=0.0091, lr=1.46e-05, step=5015] Training: 50%|█████ | 5016/10000 [59:40<1:01:39, 1.35it/s, loss=0.0091, lr=1.46e-05, step=5015] Training: 50%|█████ | 5016/10000 [59:40<1:01:39, 1.35it/s, loss=0.0044, lr=1.46e-05, step=5016] Training: 50%|█████ | 5017/10000 [59:41<1:05:42, 1.26it/s, loss=0.0044, lr=1.46e-05, step=5016] Training: 50%|█████ | 5017/10000 [59:41<1:05:42, 1.26it/s, loss=0.0091, lr=1.46e-05, step=5017] Training: 50%|█████ | 5018/10000 [59:42<1:04:50, 1.28it/s, loss=0.0091, lr=1.46e-05, step=5017] Training: 50%|█████ | 5018/10000 [59:42<1:04:50, 1.28it/s, loss=0.0024, lr=1.46e-05, step=5018] Training: 50%|█████ | 5019/10000 [59:43<1:01:37, 1.35it/s, loss=0.0024, lr=1.46e-05, step=5018] Training: 50%|█████ | 5019/10000 [59:43<1:01:37, 1.35it/s, loss=0.0141, lr=1.46e-05, step=5019]17:05:50.160 [I] step=5020 loss=0.0063 smoothed_loss=0.0100 lr=1.46e-05 grad_norm=0.4216 step_time=0.5711s data_time=0.1049s it/s=1.480 eta_to_10000=3365.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0142 grad_action_out_proj=0.1367 grad_shared_expert=0.3499 (10775:train_pytorch.py:850) + Training: 50%|█████ | 5020/10000 [59:43<56:20, 1.47it/s, loss=0.0141, lr=1.46e-05, step=5019] Training: 50%|█████ | 5020/10000 [59:43<56:20, 1.47it/s, loss=0.0063, lr=1.46e-05, step=5020] Training: 50%|█████ | 5021/10000 [59:44<55:04, 1.51it/s, loss=0.0063, lr=1.46e-05, step=5020] Training: 50%|█████ | 5021/10000 [59:44<55:04, 1.51it/s, loss=0.0229, lr=1.46e-05, step=5021] Training: 50%|█████ | 5022/10000 [59:44<51:53, 1.60it/s, loss=0.0229, lr=1.46e-05, step=5021] Training: 50%|█████ | 5022/10000 [59:44<51:53, 1.60it/s, loss=0.0034, lr=1.46e-05, step=5022] Training: 50%|█████ | 5023/10000 [59:45<52:29, 1.58it/s, loss=0.0034, lr=1.46e-05, step=5022] Training: 50%|█████ | 5023/10000 [59:45<52:29, 1.58it/s, loss=0.0154, lr=1.46e-05, step=5023] Training: 50%|█████ | 5024/10000 [59:46<56:26, 1.47it/s, loss=0.0154, lr=1.46e-05, step=5023] Training: 50%|█████ | 5024/10000 [59:46<56:26, 1.47it/s, loss=0.0078, lr=1.46e-05, step=5024] Training: 50%|█████ | 5025/10000 [59:47<1:03:20, 1.31it/s, loss=0.0078, lr=1.46e-05, step=5024] Training: 50%|█████ | 5025/10000 [59:47<1:03:20, 1.31it/s, loss=0.0117, lr=1.46e-05, step=5025] Training: 50%|█████ | 5026/10000 [59:47<58:17, 1.42it/s, loss=0.0117, lr=1.46e-05, step=5025] Training: 50%|█████ | 5026/10000 [59:47<58:17, 1.42it/s, loss=0.0027, lr=1.46e-05, step=5026] Training: 50%|█████ | 5027/10000 [59:48<55:48, 1.49it/s, loss=0.0027, lr=1.46e-05, step=5026] Training: 50%|█████ | 5027/10000 [59:48<55:48, 1.49it/s, loss=0.0052, lr=1.46e-05, step=5027] Training: 50%|█████ | 5028/10000 [59:49<53:00, 1.56it/s, loss=0.0052, lr=1.46e-05, step=5027] Training: 50%|█████ | 5028/10000 [59:49<53:00, 1.56it/s, loss=0.0059, lr=1.46e-05, step=5028] Training: 50%|█████ | 5029/10000 [59:49<50:41, 1.63it/s, loss=0.0059, lr=1.46e-05, step=5028] Training: 50%|█████ | 5029/10000 [59:49<50:41, 1.63it/s, loss=0.0333, lr=1.46e-05, step=5029]17:05:56.657 [I] step=5030 loss=0.0039 smoothed_loss=0.0108 lr=1.46e-05 grad_norm=0.4535 step_time=0.5667s data_time=0.0833s it/s=1.539 eta_to_10000=3228.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0134 grad_action_out_proj=0.1110 grad_shared_expert=0.3926 (10775:train_pytorch.py:850) + Training: 50%|█████ | 5030/10000 [59:50<51:51, 1.60it/s, loss=0.0333, lr=1.46e-05, step=5029] Training: 50%|█████ | 5030/10000 [59:50<51:51, 1.60it/s, loss=0.0039, lr=1.46e-05, step=5030] Training: 50%|█████ | 5031/10000 [59:51<56:51, 1.46it/s, loss=0.0039, lr=1.46e-05, step=5030] Training: 50%|█████ | 5031/10000 [59:51<56:51, 1.46it/s, loss=0.0187, lr=1.46e-05, step=5031] Training: 50%|█████ | 5032/10000 [59:51<53:19, 1.55it/s, loss=0.0187, lr=1.46e-05, step=5031] Training: 50%|█████ | 5032/10000 [59:51<53:19, 1.55it/s, loss=0.0244, lr=1.46e-05, step=5032] Training: 50%|█████ | 5033/10000 [59:52<59:46, 1.39it/s, loss=0.0244, lr=1.46e-05, step=5032] Training: 50%|█████ | 5033/10000 [59:52<59:46, 1.39it/s, loss=0.0116, lr=1.46e-05, step=5033] Training: 50%|█████ | 5034/10000 [59:53<55:09, 1.50it/s, loss=0.0116, lr=1.46e-05, step=5033] Training: 50%|█████ | 5034/10000 [59:53<55:09, 1.50it/s, loss=0.0090, lr=1.46e-05, step=5034] Training: 50%|█████ | 5035/10000 [59:53<51:43, 1.60it/s, loss=0.0090, lr=1.46e-05, step=5034] Training: 50%|█████ | 5035/10000 [59:53<51:43, 1.60it/s, loss=0.0150, lr=1.46e-05, step=5035] Training: 50%|█████ | 5036/10000 [59:54<56:30, 1.46it/s, loss=0.0150, lr=1.46e-05, step=5035] Training: 50%|█████ | 5036/10000 [59:54<56:30, 1.46it/s, loss=0.0128, lr=1.45e-05, step=5036] Training: 50%|█████ | 5037/10000 [59:54<53:18, 1.55it/s, loss=0.0128, lr=1.45e-05, step=5036] Training: 50%|█████ | 5037/10000 [59:54<53:18, 1.55it/s, loss=0.0104, lr=1.45e-05, step=5037] Training: 50%|█████ | 5038/10000 [59:55<56:22, 1.47it/s, loss=0.0104, lr=1.45e-05, step=5037] Training: 50%|█████ | 5038/10000 [59:55<56:22, 1.47it/s, loss=0.0084, lr=1.45e-05, step=5038] Training: 50%|█████ | 5039/10000 [59:56<52:21, 1.58it/s, loss=0.0084, lr=1.45e-05, step=5038] Training: 50%|█████ | 5039/10000 [59:56<52:21, 1.58it/s, loss=0.0042, lr=1.45e-05, step=5039]17:06:03.541 [I] step=5040 loss=0.0062 smoothed_loss=0.0107 lr=1.46e-05 grad_norm=0.7330 step_time=0.5944s data_time=0.0939s it/s=1.453 eta_to_10000=3413.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0343 grad_action_out_proj=0.2555 grad_shared_expert=0.5839 (10775:train_pytorch.py:850) + Training: 50%|█████ | 5040/10000 [59:57<58:28, 1.41it/s, loss=0.0042, lr=1.45e-05, step=5039] Training: 50%|█████ | 5040/10000 [59:57<58:28, 1.41it/s, loss=0.0062, lr=1.45e-05, step=5040] Training: 50%|█████ | 5041/10000 [59:57<53:57, 1.53it/s, loss=0.0062, lr=1.45e-05, step=5040] Training: 50%|█████ | 5041/10000 [59:57<53:57, 1.53it/s, loss=0.0028, lr=1.45e-05, step=5041] Training: 50%|█████ | 5042/10000 [59:58<50:29, 1.64it/s, loss=0.0028, lr=1.45e-05, step=5041] Training: 50%|█████ | 5042/10000 [59:58<50:29, 1.64it/s, loss=0.0128, lr=1.45e-05, step=5042] Training: 50%|█████ | 5043/10000 [59:58<48:48, 1.69it/s, loss=0.0128, lr=1.45e-05, step=5042] Training: 50%|█████ | 5043/10000 [59:58<48:48, 1.69it/s, loss=0.0061, lr=1.45e-05, step=5043] Training: 50%|█████ | 5044/10000 [59:59<46:58, 1.76it/s, loss=0.0061, lr=1.45e-05, step=5043] Training: 50%|█████ | 5044/10000 [59:59<46:58, 1.76it/s, loss=0.0026, lr=1.45e-05, step=5044] Training: 50%|█████ | 5045/10000 [1:00:00<52:37, 1.57it/s, loss=0.0026, lr=1.45e-05, step=5044] Training: 50%|█████ | 5045/10000 [1:00:00<52:37, 1.57it/s, loss=0.0392, lr=1.45e-05, step=5045] Training: 50%|█████ | 5046/10000 [1:00:00<54:17, 1.52it/s, loss=0.0392, lr=1.45e-05, step=5045] Training: 50%|█████ | 5046/10000 [1:00:00<54:17, 1.52it/s, loss=0.0129, lr=1.45e-05, step=5046] Training: 50%|█████ | 5047/10000 [1:00:01<51:03, 1.62it/s, loss=0.0129, lr=1.45e-05, step=5046] Training: 50%|█████ | 5047/10000 [1:00:01<51:03, 1.62it/s, loss=0.0153, lr=1.45e-05, step=5047] Training: 50%|█████ | 5048/10000 [1:00:02<56:29, 1.46it/s, loss=0.0153, lr=1.45e-05, step=5047] Training: 50%|█████ | 5048/10000 [1:00:02<56:29, 1.46it/s, loss=0.0020, lr=1.45e-05, step=5048] Training: 50%|█████ | 5049/10000 [1:00:02<53:34, 1.54it/s, loss=0.0020, lr=1.45e-05, step=5048] Training: 50%|█████ | 5049/10000 [1:00:02<53:34, 1.54it/s, loss=0.0172, lr=1.45e-05, step=5049]17:06:09.950 [I] step=5050 loss=0.0134 smoothed_loss=0.0121 lr=1.45e-05 grad_norm=0.5093 step_time=0.5622s data_time=0.0786s it/s=1.561 eta_to_10000=3171.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.1548 grad_shared_expert=0.4825 (10775:train_pytorch.py:850) + Training: 50%|█████ | 5050/10000 [1:00:03<59:07, 1.40it/s, loss=0.0172, lr=1.45e-05, step=5049] Training: 50%|█████ | 5050/10000 [1:00:03<59:07, 1.40it/s, loss=0.0134, lr=1.45e-05, step=5050] Training: 51%|█████ | 5051/10000 [1:00:04<54:25, 1.52it/s, loss=0.0134, lr=1.45e-05, step=5050] Training: 51%|█████ | 5051/10000 [1:00:04<54:25, 1.52it/s, loss=0.0080, lr=1.45e-05, step=5051] Training: 51%|█████ | 5052/10000 [1:00:04<1:01:27, 1.34it/s, loss=0.0080, lr=1.45e-05, step=5051] Training: 51%|█████ | 5052/10000 [1:00:04<1:01:27, 1.34it/s, loss=0.0024, lr=1.45e-05, step=5052] Training: 51%|█████ | 5053/10000 [1:00:05<1:05:33, 1.26it/s, loss=0.0024, lr=1.45e-05, step=5052] Training: 51%|█████ | 5053/10000 [1:00:05<1:05:33, 1.26it/s, loss=0.0012, lr=1.45e-05, step=5053] Training: 51%|█████ | 5054/10000 [1:00:06<1:03:59, 1.29it/s, loss=0.0012, lr=1.45e-05, step=5053] Training: 51%|█████ | 5054/10000 [1:00:06<1:03:59, 1.29it/s, loss=0.0120, lr=1.45e-05, step=5054] Training: 51%|█████ | 5055/10000 [1:00:07<1:11:45, 1.15it/s, loss=0.0120, lr=1.45e-05, step=5054] Training: 51%|█████ | 5055/10000 [1:00:07<1:11:45, 1.15it/s, loss=0.0108, lr=1.45e-05, step=5055] Training: 51%|█████ | 5056/10000 [1:00:08<1:04:50, 1.27it/s, loss=0.0108, lr=1.45e-05, step=5055] Training: 51%|█████ | 5056/10000 [1:00:08<1:04:50, 1.27it/s, loss=0.0058, lr=1.45e-05, step=5056] Training: 51%|█████ | 5057/10000 [1:00:08<59:54, 1.38it/s, loss=0.0058, lr=1.45e-05, step=5056] Training: 51%|█████ | 5057/10000 [1:00:08<59:54, 1.38it/s, loss=0.0150, lr=1.45e-05, step=5057] Training: 51%|█████ | 5058/10000 [1:00:09<57:09, 1.44it/s, loss=0.0150, lr=1.45e-05, step=5057] Training: 51%|█████ | 5058/10000 [1:00:09<57:09, 1.44it/s, loss=0.0103, lr=1.45e-05, step=5058] Training: 51%|█████ | 5059/10000 [1:00:10<54:35, 1.51it/s, loss=0.0103, lr=1.45e-05, step=5058] Training: 51%|█████ | 5059/10000 [1:00:10<54:35, 1.51it/s, loss=0.0221, lr=1.45e-05, step=5059]17:06:17.497 [I] step=5060 loss=0.0138 smoothed_loss=0.0117 lr=1.45e-05 grad_norm=0.4829 step_time=0.6313s data_time=0.1235s it/s=1.325 eta_to_10000=3728.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0065 grad_action_out_proj=0.0761 grad_shared_expert=0.4058 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5060/10000 [1:00:11<1:01:48, 1.33it/s, loss=0.0221, lr=1.45e-05, step=5059] Training: 51%|█████ | 5060/10000 [1:00:11<1:01:48, 1.33it/s, loss=0.0138, lr=1.45e-05, step=5060] Training: 51%|█████ | 5061/10000 [1:00:11<1:00:48, 1.35it/s, loss=0.0138, lr=1.45e-05, step=5060] Training: 51%|█████ | 5061/10000 [1:00:11<1:00:48, 1.35it/s, loss=0.0136, lr=1.45e-05, step=5061] Training: 51%|█████ | 5062/10000 [1:00:12<1:07:03, 1.23it/s, loss=0.0136, lr=1.45e-05, step=5061] Training: 51%|█████ | 5062/10000 [1:00:12<1:07:03, 1.23it/s, loss=0.0033, lr=1.45e-05, step=5062] Training: 51%|█████ | 5063/10000 [1:00:13<1:12:40, 1.13it/s, loss=0.0033, lr=1.45e-05, step=5062] Training: 51%|█████ | 5063/10000 [1:00:13<1:12:40, 1.13it/s, loss=0.0046, lr=1.44e-05, step=5063] Training: 51%|█████ | 5064/10000 [1:00:14<1:05:32, 1.26it/s, loss=0.0046, lr=1.44e-05, step=5063] Training: 51%|█████ | 5064/10000 [1:00:14<1:05:32, 1.26it/s, loss=0.0188, lr=1.44e-05, step=5064] Training: 51%|█████ | 5065/10000 [1:00:15<1:03:20, 1.30it/s, loss=0.0188, lr=1.44e-05, step=5064] Training: 51%|█████ | 5065/10000 [1:00:15<1:03:20, 1.30it/s, loss=0.0217, lr=1.44e-05, step=5065] Training: 51%|█████ | 5066/10000 [1:00:15<57:42, 1.42it/s, loss=0.0217, lr=1.44e-05, step=5065] Training: 51%|█████ | 5066/10000 [1:00:15<57:42, 1.42it/s, loss=0.0082, lr=1.44e-05, step=5066] Training: 51%|█████ | 5067/10000 [1:00:16<1:02:12, 1.32it/s, loss=0.0082, lr=1.44e-05, step=5066] Training: 51%|█████ | 5067/10000 [1:00:16<1:02:12, 1.32it/s, loss=0.0529, lr=1.44e-05, step=5067] Training: 51%|█████ | 5068/10000 [1:00:17<57:35, 1.43it/s, loss=0.0529, lr=1.44e-05, step=5067] Training: 51%|█████ | 5068/10000 [1:00:17<57:35, 1.43it/s, loss=0.0570, lr=1.44e-05, step=5068] Training: 51%|█████ | 5069/10000 [1:00:17<55:35, 1.48it/s, loss=0.0570, lr=1.44e-05, step=5068] Training: 51%|█████ | 5069/10000 [1:00:17<55:35, 1.48it/s, loss=0.0042, lr=1.44e-05, step=5069]17:06:25.458 [I] step=5070 loss=0.0059 smoothed_loss=0.0172 lr=1.44e-05 grad_norm=0.6996 step_time=0.6565s data_time=0.1395s it/s=1.256 eta_to_10000=3924.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0080 grad_action_out_proj=0.0788 grad_shared_expert=0.3142 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5070/10000 [1:00:19<1:10:54, 1.16it/s, loss=0.0042, lr=1.44e-05, step=5069] Training: 51%|█████ | 5070/10000 [1:00:19<1:10:54, 1.16it/s, loss=0.0059, lr=1.44e-05, step=5070] Training: 51%|█████ | 5071/10000 [1:00:19<1:04:20, 1.28it/s, loss=0.0059, lr=1.44e-05, step=5070] Training: 51%|█████ | 5071/10000 [1:00:19<1:04:20, 1.28it/s, loss=0.0064, lr=1.44e-05, step=5071] Training: 51%|█████ | 5072/10000 [1:00:20<1:00:34, 1.36it/s, loss=0.0064, lr=1.44e-05, step=5071] Training: 51%|█████ | 5072/10000 [1:00:20<1:00:34, 1.36it/s, loss=0.0015, lr=1.44e-05, step=5072] Training: 51%|█████ | 5073/10000 [1:00:20<59:17, 1.38it/s, loss=0.0015, lr=1.44e-05, step=5072] Training: 51%|█████ | 5073/10000 [1:00:20<59:17, 1.38it/s, loss=0.1020, lr=1.44e-05, step=5073] Training: 51%|█████ | 5074/10000 [1:00:21<1:05:10, 1.26it/s, loss=0.1020, lr=1.44e-05, step=5073] Training: 51%|█████ | 5074/10000 [1:00:21<1:05:10, 1.26it/s, loss=0.0054, lr=1.44e-05, step=5074] Training: 51%|█████ | 5075/10000 [1:00:22<1:04:55, 1.26it/s, loss=0.0054, lr=1.44e-05, step=5074] Training: 51%|█████ | 5075/10000 [1:00:22<1:04:55, 1.26it/s, loss=0.0311, lr=1.44e-05, step=5075] Training: 51%|█████ | 5076/10000 [1:00:23<1:05:25, 1.25it/s, loss=0.0311, lr=1.44e-05, step=5075] Training: 51%|█████ | 5076/10000 [1:00:23<1:05:25, 1.25it/s, loss=0.0067, lr=1.44e-05, step=5076] Training: 51%|█████ | 5077/10000 [1:00:24<1:17:13, 1.06it/s, loss=0.0067, lr=1.44e-05, step=5076] Training: 51%|█████ | 5077/10000 [1:00:24<1:17:13, 1.06it/s, loss=0.0071, lr=1.44e-05, step=5077] Training: 51%|█████ | 5078/10000 [1:00:25<1:10:00, 1.17it/s, loss=0.0071, lr=1.44e-05, step=5077] Training: 51%|█████ | 5078/10000 [1:00:25<1:10:00, 1.17it/s, loss=0.0066, lr=1.44e-05, step=5078] Training: 51%|█████ | 5079/10000 [1:00:26<1:09:55, 1.17it/s, loss=0.0066, lr=1.44e-05, step=5078] Training: 51%|█████ | 5079/10000 [1:00:26<1:09:55, 1.17it/s, loss=0.0099, lr=1.44e-05, step=5079]17:06:33.408 [I] step=5080 loss=0.0138 smoothed_loss=0.0171 lr=1.44e-05 grad_norm=0.4443 step_time=0.6266s data_time=0.1684s it/s=1.258 eta_to_10000=3909.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0186 grad_action_out_proj=0.1404 grad_shared_expert=0.4497 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5080/10000 [1:00:26<1:06:24, 1.23it/s, loss=0.0099, lr=1.44e-05, step=5079] Training: 51%|█████ | 5080/10000 [1:00:26<1:06:24, 1.23it/s, loss=0.0138, lr=1.44e-05, step=5080] Training: 51%|█████ | 5081/10000 [1:00:27<1:06:59, 1.22it/s, loss=0.0138, lr=1.44e-05, step=5080] Training: 51%|█████ | 5081/10000 [1:00:27<1:06:59, 1.22it/s, loss=0.0056, lr=1.44e-05, step=5081] Training: 51%|█████ | 5082/10000 [1:00:28<1:04:32, 1.27it/s, loss=0.0056, lr=1.44e-05, step=5081] Training: 51%|█████ | 5082/10000 [1:00:28<1:04:32, 1.27it/s, loss=0.0023, lr=1.44e-05, step=5082] Training: 51%|█████ | 5083/10000 [1:00:29<1:02:41, 1.31it/s, loss=0.0023, lr=1.44e-05, step=5082] Training: 51%|█████ | 5083/10000 [1:00:29<1:02:41, 1.31it/s, loss=0.0333, lr=1.44e-05, step=5083] Training: 51%|█████ | 5084/10000 [1:00:30<1:21:35, 1.00it/s, loss=0.0333, lr=1.44e-05, step=5083] Training: 51%|█████ | 5084/10000 [1:00:30<1:21:35, 1.00it/s, loss=0.0069, lr=1.44e-05, step=5084] Training: 51%|█████ | 5085/10000 [1:00:32<1:37:55, 1.20s/it, loss=0.0069, lr=1.44e-05, step=5084] Training: 51%|█████ | 5085/10000 [1:00:32<1:37:55, 1.20s/it, loss=0.0070, lr=1.44e-05, step=5085] Training: 51%|█████ | 5086/10000 [1:00:34<2:09:52, 1.59s/it, loss=0.0070, lr=1.44e-05, step=5085] Training: 51%|█████ | 5086/10000 [1:00:34<2:09:52, 1.59s/it, loss=0.0108, lr=1.44e-05, step=5086] Training: 51%|█████ | 5087/10000 [1:00:36<2:07:25, 1.56s/it, loss=0.0108, lr=1.44e-05, step=5086] Training: 51%|█████ | 5087/10000 [1:00:36<2:07:25, 1.56s/it, loss=0.0025, lr=1.44e-05, step=5087] Training: 51%|█████ | 5088/10000 [1:00:37<1:51:43, 1.36s/it, loss=0.0025, lr=1.44e-05, step=5087] Training: 51%|█████ | 5088/10000 [1:00:37<1:51:43, 1.36s/it, loss=0.0071, lr=1.44e-05, step=5088] Training: 51%|█████ | 5089/10000 [1:00:38<1:43:11, 1.26s/it, loss=0.0071, lr=1.44e-05, step=5088] Training: 51%|█████ | 5089/10000 [1:00:38<1:43:11, 1.26s/it, loss=0.0107, lr=1.44e-05, step=5089]17:06:46.168 [I] step=5090 loss=0.0099 smoothed_loss=0.0121 lr=1.44e-05 grad_norm=0.5322 step_time=0.8661s data_time=0.4099s it/s=0.784 eta_to_10000=6259.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0150 grad_action_out_proj=0.1513 grad_shared_expert=0.4700 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5090/10000 [1:00:39<1:48:53, 1.33s/it, loss=0.0107, lr=1.44e-05, step=5089] Training: 51%|█████ | 5090/10000 [1:00:39<1:48:53, 1.33s/it, loss=0.0099, lr=1.43e-05, step=5090] Training: 51%|█████ | 5091/10000 [1:00:41<1:59:24, 1.46s/it, loss=0.0099, lr=1.43e-05, step=5090] Training: 51%|█████ | 5091/10000 [1:00:41<1:59:24, 1.46s/it, loss=0.0114, lr=1.43e-05, step=5091] Training: 51%|█████ | 5092/10000 [1:00:42<1:54:47, 1.40s/it, loss=0.0114, lr=1.43e-05, step=5091] Training: 51%|█████ | 5092/10000 [1:00:42<1:54:47, 1.40s/it, loss=0.0035, lr=1.43e-05, step=5092] Training: 51%|█████ | 5093/10000 [1:00:43<1:42:58, 1.26s/it, loss=0.0035, lr=1.43e-05, step=5092] Training: 51%|█████ | 5093/10000 [1:00:43<1:42:58, 1.26s/it, loss=0.0063, lr=1.43e-05, step=5093] Training: 51%|█████ | 5094/10000 [1:00:44<1:35:48, 1.17s/it, loss=0.0063, lr=1.43e-05, step=5093] Training: 51%|█████ | 5094/10000 [1:00:44<1:35:48, 1.17s/it, loss=0.0344, lr=1.43e-05, step=5094] Training: 51%|█████ | 5095/10000 [1:00:46<1:40:21, 1.23s/it, loss=0.0344, lr=1.43e-05, step=5094] Training: 51%|█████ | 5095/10000 [1:00:46<1:40:21, 1.23s/it, loss=0.0279, lr=1.43e-05, step=5095] Training: 51%|█████ | 5096/10000 [1:00:46<1:29:43, 1.10s/it, loss=0.0279, lr=1.43e-05, step=5095] Training: 51%|█████ | 5096/10000 [1:00:46<1:29:43, 1.10s/it, loss=0.0050, lr=1.43e-05, step=5096] Training: 51%|█████ | 5097/10000 [1:00:48<1:35:32, 1.17s/it, loss=0.0050, lr=1.43e-05, step=5096] Training: 51%|█████ | 5097/10000 [1:00:48<1:35:32, 1.17s/it, loss=0.0015, lr=1.43e-05, step=5097] Training: 51%|█████ | 5098/10000 [1:00:49<1:26:19, 1.06s/it, loss=0.0015, lr=1.43e-05, step=5097] Training: 51%|█████ | 5098/10000 [1:00:49<1:26:19, 1.06s/it, loss=0.0197, lr=1.43e-05, step=5098] Training: 51%|█████ | 5099/10000 [1:00:50<1:32:37, 1.13s/it, loss=0.0197, lr=1.43e-05, step=5098] Training: 51%|█████ | 5099/10000 [1:00:50<1:32:37, 1.13s/it, loss=0.0027, lr=1.43e-05, step=5099]17:06:57.703 [I] step=5100 loss=0.0020 smoothed_loss=0.0111 lr=1.43e-05 grad_norm=0.4363 step_time=0.7983s data_time=0.3553s it/s=0.876 eta_to_10000=5590.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0082 grad_action_out_proj=0.0916 grad_shared_expert=0.3207 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5100/10000 [1:00:51<1:26:42, 1.06s/it, loss=0.0027, lr=1.43e-05, step=5099] Training: 51%|█████ | 5100/10000 [1:00:51<1:26:42, 1.06s/it, loss=0.0020, lr=1.43e-05, step=5100] Training: 51%|█████ | 5101/10000 [1:00:51<1:16:00, 1.07it/s, loss=0.0020, lr=1.43e-05, step=5100] Training: 51%|█████ | 5101/10000 [1:00:51<1:16:00, 1.07it/s, loss=0.0320, lr=1.43e-05, step=5101] Training: 51%|█████ | 5102/10000 [1:00:52<1:07:41, 1.21it/s, loss=0.0320, lr=1.43e-05, step=5101] Training: 51%|█████ | 5102/10000 [1:00:52<1:07:41, 1.21it/s, loss=0.0052, lr=1.43e-05, step=5102] Training: 51%|█████ | 5103/10000 [1:00:53<1:06:01, 1.24it/s, loss=0.0052, lr=1.43e-05, step=5102] Training: 51%|█████ | 5103/10000 [1:00:53<1:06:01, 1.24it/s, loss=0.0079, lr=1.43e-05, step=5103] Training: 51%|█████ | 5104/10000 [1:00:54<1:15:17, 1.08it/s, loss=0.0079, lr=1.43e-05, step=5103] Training: 51%|█████ | 5104/10000 [1:00:54<1:15:17, 1.08it/s, loss=0.0632, lr=1.43e-05, step=5104] Training: 51%|█████ | 5105/10000 [1:00:55<1:17:17, 1.06it/s, loss=0.0632, lr=1.43e-05, step=5104] Training: 51%|█████ | 5105/10000 [1:00:55<1:17:17, 1.06it/s, loss=0.0160, lr=1.43e-05, step=5105] Training: 51%|█████ | 5106/10000 [1:00:56<1:08:45, 1.19it/s, loss=0.0160, lr=1.43e-05, step=5105] Training: 51%|█████ | 5106/10000 [1:00:56<1:08:45, 1.19it/s, loss=0.0102, lr=1.43e-05, step=5106] Training: 51%|█████ | 5107/10000 [1:00:56<1:03:31, 1.28it/s, loss=0.0102, lr=1.43e-05, step=5106] Training: 51%|█████ | 5107/10000 [1:00:56<1:03:31, 1.28it/s, loss=0.0164, lr=1.43e-05, step=5107] Training: 51%|█████ | 5108/10000 [1:00:57<1:00:43, 1.34it/s, loss=0.0164, lr=1.43e-05, step=5107] Training: 51%|█████ | 5108/10000 [1:00:57<1:00:43, 1.34it/s, loss=0.0166, lr=1.43e-05, step=5108] Training: 51%|█████ | 5109/10000 [1:00:58<1:05:36, 1.24it/s, loss=0.0166, lr=1.43e-05, step=5108] Training: 51%|█████ | 5109/10000 [1:00:58<1:05:36, 1.24it/s, loss=0.0125, lr=1.43e-05, step=5109]17:07:06.193 [I] step=5110 loss=0.0082 smoothed_loss=0.0152 lr=1.43e-05 grad_norm=0.4658 step_time=0.6827s data_time=0.1662s it/s=1.179 eta_to_10000=4149.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0062 grad_action_out_proj=0.0900 grad_shared_expert=0.3438 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5110/10000 [1:00:59<1:22:08, 1.01s/it, loss=0.0125, lr=1.43e-05, step=5109] Training: 51%|█████ | 5110/10000 [1:00:59<1:22:08, 1.01s/it, loss=0.0082, lr=1.43e-05, step=5110] Training: 51%|█████ | 5111/10000 [1:01:00<1:11:36, 1.14it/s, loss=0.0082, lr=1.43e-05, step=5110] Training: 51%|█████ | 5111/10000 [1:01:00<1:11:36, 1.14it/s, loss=0.0134, lr=1.43e-05, step=5111] Training: 51%|█████ | 5112/10000 [1:01:01<1:10:13, 1.16it/s, loss=0.0134, lr=1.43e-05, step=5111] Training: 51%|█████ | 5112/10000 [1:01:01<1:10:13, 1.16it/s, loss=0.0081, lr=1.43e-05, step=5112] Training: 51%|█████ | 5113/10000 [1:01:02<1:10:13, 1.16it/s, loss=0.0081, lr=1.43e-05, step=5112] Training: 51%|█████ | 5113/10000 [1:01:02<1:10:13, 1.16it/s, loss=0.0043, lr=1.43e-05, step=5113] Training: 51%|█████ | 5114/10000 [1:01:02<1:07:09, 1.21it/s, loss=0.0043, lr=1.43e-05, step=5113] Training: 51%|█████ | 5114/10000 [1:01:02<1:07:09, 1.21it/s, loss=0.0163, lr=1.43e-05, step=5114] Training: 51%|█████ | 5115/10000 [1:01:03<1:02:53, 1.29it/s, loss=0.0163, lr=1.43e-05, step=5114] Training: 51%|█████ | 5115/10000 [1:01:03<1:02:53, 1.29it/s, loss=0.0397, lr=1.43e-05, step=5115] Training: 51%|█████ | 5116/10000 [1:01:04<1:04:14, 1.27it/s, loss=0.0397, lr=1.43e-05, step=5115] Training: 51%|█████ | 5116/10000 [1:01:04<1:04:14, 1.27it/s, loss=0.0142, lr=1.43e-05, step=5116] Training: 51%|█████ | 5117/10000 [1:01:05<1:06:30, 1.22it/s, loss=0.0142, lr=1.43e-05, step=5116] Training: 51%|█████ | 5117/10000 [1:01:05<1:06:30, 1.22it/s, loss=0.0148, lr=1.42e-05, step=5117] Training: 51%|█████ | 5118/10000 [1:01:06<1:08:49, 1.18it/s, loss=0.0148, lr=1.42e-05, step=5117] Training: 51%|█████ | 5118/10000 [1:01:06<1:08:49, 1.18it/s, loss=0.0171, lr=1.42e-05, step=5118] Training: 51%|█████ | 5119/10000 [1:01:06<1:05:29, 1.24it/s, loss=0.0171, lr=1.42e-05, step=5118] Training: 51%|█████ | 5119/10000 [1:01:06<1:05:29, 1.24it/s, loss=0.0155, lr=1.42e-05, step=5119]17:07:14.264 [I] step=5120 loss=0.0028 smoothed_loss=0.0146 lr=1.43e-05 grad_norm=0.4548 step_time=0.6341s data_time=0.1730s it/s=1.240 eta_to_10000=3934.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0105 grad_action_out_proj=0.1045 grad_shared_expert=0.2748 (10775:train_pytorch.py:850) + Training: 51%|█████ | 5120/10000 [1:01:07<1:12:14, 1.13it/s, loss=0.0155, lr=1.42e-05, step=5119] Training: 51%|█████ | 5120/10000 [1:01:07<1:12:14, 1.13it/s, loss=0.0028, lr=1.42e-05, step=5120] Training: 51%|█████ | 5121/10000 [1:01:08<1:08:01, 1.20it/s, loss=0.0028, lr=1.42e-05, step=5120] Training: 51%|█████ | 5121/10000 [1:01:08<1:08:01, 1.20it/s, loss=0.0019, lr=1.42e-05, step=5121] Training: 51%|█████ | 5122/10000 [1:01:09<1:05:21, 1.24it/s, loss=0.0019, lr=1.42e-05, step=5121] Training: 51%|█████ | 5122/10000 [1:01:09<1:05:21, 1.24it/s, loss=0.0108, lr=1.42e-05, step=5122] Training: 51%|█████ | 5123/10000 [1:01:10<1:08:19, 1.19it/s, loss=0.0108, lr=1.42e-05, step=5122] Training: 51%|█████ | 5123/10000 [1:01:10<1:08:19, 1.19it/s, loss=0.0079, lr=1.42e-05, step=5123] Training: 51%|█████ | 5124/10000 [1:01:10<1:06:32, 1.22it/s, loss=0.0079, lr=1.42e-05, step=5123] Training: 51%|█████ | 5124/10000 [1:01:10<1:06:32, 1.22it/s, loss=0.0154, lr=1.42e-05, step=5124] Training: 51%|█████▏ | 5125/10000 [1:01:11<1:04:35, 1.26it/s, loss=0.0154, lr=1.42e-05, step=5124] Training: 51%|█████▏ | 5125/10000 [1:01:11<1:04:35, 1.26it/s, loss=0.0080, lr=1.42e-05, step=5125] Training: 51%|█████▏ | 5126/10000 [1:01:12<1:04:36, 1.26it/s, loss=0.0080, lr=1.42e-05, step=5125] Training: 51%|█████▏ | 5126/10000 [1:01:12<1:04:36, 1.26it/s, loss=0.0052, lr=1.42e-05, step=5126] Training: 51%|█████▏ | 5127/10000 [1:01:13<1:09:28, 1.17it/s, loss=0.0052, lr=1.42e-05, step=5126] Training: 51%|█████▏ | 5127/10000 [1:01:13<1:09:28, 1.17it/s, loss=0.0036, lr=1.42e-05, step=5127] Training: 51%|█████▏ | 5128/10000 [1:01:14<1:08:44, 1.18it/s, loss=0.0036, lr=1.42e-05, step=5127] Training: 51%|█████▏ | 5128/10000 [1:01:14<1:08:44, 1.18it/s, loss=0.0170, lr=1.42e-05, step=5128] Training: 51%|█████▏ | 5129/10000 [1:01:14<1:01:17, 1.32it/s, loss=0.0170, lr=1.42e-05, step=5128] Training: 51%|█████▏ | 5129/10000 [1:01:14<1:01:17, 1.32it/s, loss=0.0249, lr=1.42e-05, step=5129]17:07:22.187 [I] step=5130 loss=0.0190 smoothed_loss=0.0134 lr=1.42e-05 grad_norm=0.4397 step_time=0.6244s data_time=0.1679s it/s=1.263 eta_to_10000=3856.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0112 grad_action_out_proj=0.1165 grad_shared_expert=0.3541 (10775:train_pytorch.py:850) + Training: 51%|█████▏ | 5130/10000 [1:01:15<1:04:28, 1.26it/s, loss=0.0249, lr=1.42e-05, step=5129] Training: 51%|█████▏ | 5130/10000 [1:01:15<1:04:28, 1.26it/s, loss=0.0190, lr=1.42e-05, step=5130] Training: 51%|█████▏ | 5131/10000 [1:01:16<1:05:14, 1.24it/s, loss=0.0190, lr=1.42e-05, step=5130] Training: 51%|█████▏ | 5131/10000 [1:01:16<1:05:14, 1.24it/s, loss=0.0201, lr=1.42e-05, step=5131] Training: 51%|█████▏ | 5132/10000 [1:01:17<1:01:06, 1.33it/s, loss=0.0201, lr=1.42e-05, step=5131] Training: 51%|█████▏ | 5132/10000 [1:01:17<1:01:06, 1.33it/s, loss=0.0075, lr=1.42e-05, step=5132] Training: 51%|█████▏ | 5133/10000 [1:01:17<56:25, 1.44it/s, loss=0.0075, lr=1.42e-05, step=5132] Training: 51%|█████▏ | 5133/10000 [1:01:17<56:25, 1.44it/s, loss=0.0101, lr=1.42e-05, step=5133] Training: 51%|█████▏ | 5134/10000 [1:01:18<1:04:01, 1.27it/s, loss=0.0101, lr=1.42e-05, step=5133] Training: 51%|█████▏ | 5134/10000 [1:01:18<1:04:01, 1.27it/s, loss=0.0098, lr=1.42e-05, step=5134] Training: 51%|█████▏ | 5135/10000 [1:01:19<1:05:24, 1.24it/s, loss=0.0098, lr=1.42e-05, step=5134] Training: 51%|█████▏ | 5135/10000 [1:01:19<1:05:24, 1.24it/s, loss=0.0285, lr=1.42e-05, step=5135] Training: 51%|█████▏ | 5136/10000 [1:01:20<1:05:20, 1.24it/s, loss=0.0285, lr=1.42e-05, step=5135] Training: 51%|█████▏ | 5136/10000 [1:01:20<1:05:20, 1.24it/s, loss=0.0055, lr=1.42e-05, step=5136] Training: 51%|█████▏ | 5137/10000 [1:01:21<1:05:54, 1.23it/s, loss=0.0055, lr=1.42e-05, step=5136] Training: 51%|█████▏ | 5137/10000 [1:01:21<1:05:54, 1.23it/s, loss=0.0284, lr=1.42e-05, step=5137] Training: 51%|█████▏ | 5138/10000 [1:01:22<1:08:19, 1.19it/s, loss=0.0284, lr=1.42e-05, step=5137] Training: 51%|█████▏ | 5138/10000 [1:01:22<1:08:19, 1.19it/s, loss=0.0126, lr=1.42e-05, step=5138] Training: 51%|█████▏ | 5139/10000 [1:01:22<1:05:07, 1.24it/s, loss=0.0126, lr=1.42e-05, step=5138] Training: 51%|█████▏ | 5139/10000 [1:01:22<1:05:07, 1.24it/s, loss=0.0049, lr=1.42e-05, step=5139]17:07:30.212 [I] step=5140 loss=0.0159 smoothed_loss=0.0140 lr=1.42e-05 grad_norm=0.3989 step_time=0.6384s data_time=0.1642s it/s=1.247 eta_to_10000=3898.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0130 grad_action_out_proj=0.1171 grad_shared_expert=0.4698 (10775:train_pytorch.py:850) + Training: 51%|█████▏ | 5140/10000 [1:01:23<1:07:52, 1.19it/s, loss=0.0049, lr=1.42e-05, step=5139] Training: 51%|█████▏ | 5140/10000 [1:01:23<1:07:52, 1.19it/s, loss=0.0159, lr=1.42e-05, step=5140] Training: 51%|█████▏ | 5141/10000 [1:01:24<1:11:35, 1.13it/s, loss=0.0159, lr=1.42e-05, step=5140] Training: 51%|█████▏ | 5141/10000 [1:01:24<1:11:35, 1.13it/s, loss=0.0117, lr=1.42e-05, step=5141] Training: 51%|█████▏ | 5142/10000 [1:01:25<1:13:30, 1.10it/s, loss=0.0117, lr=1.42e-05, step=5141] Training: 51%|█████▏ | 5142/10000 [1:01:25<1:13:30, 1.10it/s, loss=0.0131, lr=1.42e-05, step=5142] Training: 51%|█████▏ | 5143/10000 [1:01:26<1:18:15, 1.03it/s, loss=0.0131, lr=1.42e-05, step=5142] Training: 51%|█████▏ | 5143/10000 [1:01:26<1:18:15, 1.03it/s, loss=0.0228, lr=1.42e-05, step=5143] Training: 51%|█████▏ | 5144/10000 [1:01:27<1:17:45, 1.04it/s, loss=0.0228, lr=1.42e-05, step=5143] Training: 51%|█████▏ | 5144/10000 [1:01:27<1:17:45, 1.04it/s, loss=0.0238, lr=1.41e-05, step=5144] Training: 51%|█████▏ | 5145/10000 [1:01:28<1:16:32, 1.06it/s, loss=0.0238, lr=1.41e-05, step=5144] Training: 51%|█████▏ | 5145/10000 [1:01:28<1:16:32, 1.06it/s, loss=0.0076, lr=1.41e-05, step=5145] Training: 51%|█████▏ | 5146/10000 [1:01:29<1:13:16, 1.10it/s, loss=0.0076, lr=1.41e-05, step=5145] Training: 51%|█████▏ | 5146/10000 [1:01:29<1:13:16, 1.10it/s, loss=0.0099, lr=1.41e-05, step=5146] Training: 51%|█████▏ | 5147/10000 [1:01:30<1:07:49, 1.19it/s, loss=0.0099, lr=1.41e-05, step=5146] Training: 51%|█████▏ | 5147/10000 [1:01:30<1:07:49, 1.19it/s, loss=0.0244, lr=1.41e-05, step=5147] Training: 51%|█████▏ | 5148/10000 [1:01:30<1:02:09, 1.30it/s, loss=0.0244, lr=1.41e-05, step=5147] Training: 51%|█████▏ | 5148/10000 [1:01:30<1:02:09, 1.30it/s, loss=0.0121, lr=1.41e-05, step=5148] Training: 51%|█████▏ | 5149/10000 [1:01:31<1:08:56, 1.17it/s, loss=0.0121, lr=1.41e-05, step=5148] Training: 51%|█████▏ | 5149/10000 [1:01:31<1:08:56, 1.17it/s, loss=0.0266, lr=1.41e-05, step=5149]17:07:39.186 [I] step=5150 loss=0.0261 smoothed_loss=0.0171 lr=1.41e-05 grad_norm=0.4707 step_time=0.6758s data_time=0.2217s it/s=1.118 eta_to_10000=4336.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0083 grad_action_out_proj=0.1082 grad_shared_expert=0.3038 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5150/10000 [1:01:32<1:09:31, 1.16it/s, loss=0.0266, lr=1.41e-05, step=5149] Training: 52%|█████▏ | 5150/10000 [1:01:32<1:09:31, 1.16it/s, loss=0.0261, lr=1.41e-05, step=5150] Training: 52%|█████▏ | 5151/10000 [1:01:33<1:03:11, 1.28it/s, loss=0.0261, lr=1.41e-05, step=5150] Training: 52%|█████▏ | 5151/10000 [1:01:33<1:03:11, 1.28it/s, loss=0.0105, lr=1.41e-05, step=5151] Training: 52%|█████▏ | 5152/10000 [1:01:34<1:06:36, 1.21it/s, loss=0.0105, lr=1.41e-05, step=5151] Training: 52%|█████▏ | 5152/10000 [1:01:34<1:06:36, 1.21it/s, loss=0.0121, lr=1.41e-05, step=5152] Training: 52%|█████▏ | 5153/10000 [1:01:35<1:06:41, 1.21it/s, loss=0.0121, lr=1.41e-05, step=5152] Training: 52%|█████▏ | 5153/10000 [1:01:35<1:06:41, 1.21it/s, loss=0.0082, lr=1.41e-05, step=5153] Training: 52%|█████▏ | 5154/10000 [1:01:35<1:02:38, 1.29it/s, loss=0.0082, lr=1.41e-05, step=5153] Training: 52%|█████▏ | 5154/10000 [1:01:35<1:02:38, 1.29it/s, loss=0.0090, lr=1.41e-05, step=5154] Training: 52%|█████▏ | 5155/10000 [1:01:36<1:04:48, 1.25it/s, loss=0.0090, lr=1.41e-05, step=5154] Training: 52%|█████▏ | 5155/10000 [1:01:36<1:04:48, 1.25it/s, loss=0.0641, lr=1.41e-05, step=5155] Training: 52%|█████▏ | 5156/10000 [1:01:37<1:05:29, 1.23it/s, loss=0.0641, lr=1.41e-05, step=5155] Training: 52%|█████▏ | 5156/10000 [1:01:37<1:05:29, 1.23it/s, loss=0.0021, lr=1.41e-05, step=5156] Training: 52%|█████▏ | 5157/10000 [1:01:38<1:02:40, 1.29it/s, loss=0.0021, lr=1.41e-05, step=5156] Training: 52%|█████▏ | 5157/10000 [1:01:38<1:02:40, 1.29it/s, loss=0.0090, lr=1.41e-05, step=5157] Training: 52%|█████▏ | 5158/10000 [1:01:38<57:53, 1.39it/s, loss=0.0090, lr=1.41e-05, step=5157] Training: 52%|█████▏ | 5158/10000 [1:01:38<57:53, 1.39it/s, loss=0.0161, lr=1.41e-05, step=5158] Training: 52%|█████▏ | 5159/10000 [1:01:39<55:26, 1.46it/s, loss=0.0161, lr=1.41e-05, step=5158] Training: 52%|█████▏ | 5159/10000 [1:01:39<55:26, 1.46it/s, loss=0.0176, lr=1.41e-05, step=5159]17:07:46.627 [I] step=5160 loss=0.0092 smoothed_loss=0.0161 lr=1.41e-05 grad_norm=0.4882 step_time=0.6077s data_time=0.1364s it/s=1.344 eta_to_10000=3600.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0184 grad_action_out_proj=0.1879 grad_shared_expert=0.6751 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5160/10000 [1:01:40<59:11, 1.36it/s, loss=0.0176, lr=1.41e-05, step=5159] Training: 52%|█████▏ | 5160/10000 [1:01:40<59:11, 1.36it/s, loss=0.0092, lr=1.41e-05, step=5160] Training: 52%|█████▏ | 5161/10000 [1:01:40<55:41, 1.45it/s, loss=0.0092, lr=1.41e-05, step=5160] Training: 52%|█████▏ | 5161/10000 [1:01:40<55:41, 1.45it/s, loss=0.0210, lr=1.41e-05, step=5161] Training: 52%|█████▏ | 5162/10000 [1:01:41<53:10, 1.52it/s, loss=0.0210, lr=1.41e-05, step=5161] Training: 52%|█████▏ | 5162/10000 [1:01:41<53:10, 1.52it/s, loss=0.0178, lr=1.41e-05, step=5162] Training: 52%|█████▏ | 5163/10000 [1:01:42<1:05:13, 1.24it/s, loss=0.0178, lr=1.41e-05, step=5162] Training: 52%|█████▏ | 5163/10000 [1:01:42<1:05:13, 1.24it/s, loss=0.0250, lr=1.41e-05, step=5163] Training: 52%|█████▏ | 5164/10000 [1:01:43<1:03:24, 1.27it/s, loss=0.0250, lr=1.41e-05, step=5163] Training: 52%|█████▏ | 5164/10000 [1:01:43<1:03:24, 1.27it/s, loss=0.0263, lr=1.41e-05, step=5164] Training: 52%|█████▏ | 5165/10000 [1:01:43<58:32, 1.38it/s, loss=0.0263, lr=1.41e-05, step=5164] Training: 52%|█████▏ | 5165/10000 [1:01:43<58:32, 1.38it/s, loss=0.0061, lr=1.41e-05, step=5165] Training: 52%|█████▏ | 5166/10000 [1:01:45<1:14:08, 1.09it/s, loss=0.0061, lr=1.41e-05, step=5165] Training: 52%|█████▏ | 5166/10000 [1:01:45<1:14:08, 1.09it/s, loss=0.0091, lr=1.41e-05, step=5166] Training: 52%|█████▏ | 5167/10000 [1:01:45<1:10:00, 1.15it/s, loss=0.0091, lr=1.41e-05, step=5166] Training: 52%|█████▏ | 5167/10000 [1:01:45<1:10:00, 1.15it/s, loss=0.0086, lr=1.41e-05, step=5167] Training: 52%|█████▏ | 5168/10000 [1:01:46<1:04:56, 1.24it/s, loss=0.0086, lr=1.41e-05, step=5167] Training: 52%|█████▏ | 5168/10000 [1:01:46<1:04:56, 1.24it/s, loss=0.0043, lr=1.41e-05, step=5168] Training: 52%|█████▏ | 5169/10000 [1:01:47<1:10:01, 1.15it/s, loss=0.0043, lr=1.41e-05, step=5168] Training: 52%|█████▏ | 5169/10000 [1:01:47<1:10:01, 1.15it/s, loss=0.0165, lr=1.41e-05, step=5169]17:07:54.954 [I] step=5170 loss=0.0104 smoothed_loss=0.0143 lr=1.41e-05 grad_norm=0.4586 step_time=0.6881s data_time=0.1445s it/s=1.201 eta_to_10000=4020.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0296 grad_action_out_proj=0.1789 grad_shared_expert=0.5862 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5170/10000 [1:01:48<1:10:04, 1.15it/s, loss=0.0165, lr=1.41e-05, step=5169] Training: 52%|█████▏ | 5170/10000 [1:01:48<1:10:04, 1.15it/s, loss=0.0104, lr=1.41e-05, step=5170] Training: 52%|█████▏ | 5171/10000 [1:01:49<1:04:16, 1.25it/s, loss=0.0104, lr=1.41e-05, step=5170] Training: 52%|█████▏ | 5171/10000 [1:01:49<1:04:16, 1.25it/s, loss=0.0048, lr=1.40e-05, step=5171] Training: 52%|█████▏ | 5172/10000 [1:01:50<1:15:29, 1.07it/s, loss=0.0048, lr=1.40e-05, step=5171] Training: 52%|█████▏ | 5172/10000 [1:01:50<1:15:29, 1.07it/s, loss=0.0100, lr=1.40e-05, step=5172] Training: 52%|█████▏ | 5173/10000 [1:01:51<1:12:55, 1.10it/s, loss=0.0100, lr=1.40e-05, step=5172] Training: 52%|█████▏ | 5173/10000 [1:01:51<1:12:55, 1.10it/s, loss=0.0152, lr=1.40e-05, step=5173] Training: 52%|█████▏ | 5174/10000 [1:01:52<1:14:27, 1.08it/s, loss=0.0152, lr=1.40e-05, step=5173] Training: 52%|█████▏ | 5174/10000 [1:01:52<1:14:27, 1.08it/s, loss=0.0088, lr=1.40e-05, step=5174] Training: 52%|█████▏ | 5175/10000 [1:01:52<1:07:54, 1.18it/s, loss=0.0088, lr=1.40e-05, step=5174] Training: 52%|█████▏ | 5175/10000 [1:01:52<1:07:54, 1.18it/s, loss=0.0486, lr=1.40e-05, step=5175] Training: 52%|█████▏ | 5176/10000 [1:01:53<1:02:48, 1.28it/s, loss=0.0486, lr=1.40e-05, step=5175] Training: 52%|█████▏ | 5176/10000 [1:01:53<1:02:48, 1.28it/s, loss=0.0161, lr=1.40e-05, step=5176] Training: 52%|█████▏ | 5177/10000 [1:01:54<1:08:50, 1.17it/s, loss=0.0161, lr=1.40e-05, step=5176] Training: 52%|█████▏ | 5177/10000 [1:01:54<1:08:50, 1.17it/s, loss=0.0063, lr=1.40e-05, step=5177] Training: 52%|█████▏ | 5178/10000 [1:01:55<1:09:20, 1.16it/s, loss=0.0063, lr=1.40e-05, step=5177] Training: 52%|█████▏ | 5178/10000 [1:01:55<1:09:20, 1.16it/s, loss=0.0104, lr=1.40e-05, step=5178] Training: 52%|█████▏ | 5179/10000 [1:01:56<1:03:41, 1.26it/s, loss=0.0104, lr=1.40e-05, step=5178] Training: 52%|█████▏ | 5179/10000 [1:01:56<1:03:41, 1.26it/s, loss=0.0060, lr=1.40e-05, step=5179]17:08:03.331 [I] step=5180 loss=0.0075 smoothed_loss=0.0133 lr=1.40e-05 grad_norm=0.4661 step_time=0.6530s data_time=0.1847s it/s=1.194 eta_to_10000=4036.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0074 grad_action_out_proj=0.0744 grad_shared_expert=0.4550 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5180/10000 [1:01:56<1:05:08, 1.23it/s, loss=0.0060, lr=1.40e-05, step=5179] Training: 52%|█████▏ | 5180/10000 [1:01:56<1:05:08, 1.23it/s, loss=0.0075, lr=1.40e-05, step=5180] Training: 52%|█████▏ | 5181/10000 [1:01:57<1:07:48, 1.18it/s, loss=0.0075, lr=1.40e-05, step=5180] Training: 52%|█████▏ | 5181/10000 [1:01:57<1:07:48, 1.18it/s, loss=0.0085, lr=1.40e-05, step=5181] Training: 52%|█████▏ | 5182/10000 [1:01:58<1:02:59, 1.27it/s, loss=0.0085, lr=1.40e-05, step=5181] Training: 52%|█████▏ | 5182/10000 [1:01:58<1:02:59, 1.27it/s, loss=0.0213, lr=1.40e-05, step=5182] Training: 52%|█████▏ | 5183/10000 [1:01:59<1:09:02, 1.16it/s, loss=0.0213, lr=1.40e-05, step=5182] Training: 52%|█████▏ | 5183/10000 [1:01:59<1:09:02, 1.16it/s, loss=0.0026, lr=1.40e-05, step=5183] Training: 52%|█████▏ | 5184/10000 [1:02:00<1:07:16, 1.19it/s, loss=0.0026, lr=1.40e-05, step=5183] Training: 52%|█████▏ | 5184/10000 [1:02:00<1:07:16, 1.19it/s, loss=0.0051, lr=1.40e-05, step=5184] Training: 52%|█████▏ | 5185/10000 [1:02:01<1:10:17, 1.14it/s, loss=0.0051, lr=1.40e-05, step=5184] Training: 52%|█████▏ | 5185/10000 [1:02:01<1:10:17, 1.14it/s, loss=0.0045, lr=1.40e-05, step=5185] Training: 52%|█████▏ | 5186/10000 [1:02:02<1:08:51, 1.17it/s, loss=0.0045, lr=1.40e-05, step=5185] Training: 52%|█████▏ | 5186/10000 [1:02:02<1:08:51, 1.17it/s, loss=0.0098, lr=1.40e-05, step=5186] Training: 52%|█████▏ | 5187/10000 [1:02:02<1:06:59, 1.20it/s, loss=0.0098, lr=1.40e-05, step=5186] Training: 52%|█████▏ | 5187/10000 [1:02:02<1:06:59, 1.20it/s, loss=0.0096, lr=1.40e-05, step=5187] Training: 52%|█████▏ | 5188/10000 [1:02:03<1:01:13, 1.31it/s, loss=0.0096, lr=1.40e-05, step=5187] Training: 52%|█████▏ | 5188/10000 [1:02:03<1:01:13, 1.31it/s, loss=0.0082, lr=1.40e-05, step=5188] Training: 52%|█████▏ | 5189/10000 [1:02:04<1:04:09, 1.25it/s, loss=0.0082, lr=1.40e-05, step=5188] Training: 52%|█████▏ | 5189/10000 [1:02:04<1:04:09, 1.25it/s, loss=0.0102, lr=1.40e-05, step=5189]17:08:11.367 [I] step=5190 loss=0.0060 smoothed_loss=0.0101 lr=1.40e-05 grad_norm=0.4854 step_time=0.6462s data_time=0.1573s it/s=1.245 eta_to_10000=3863.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0107 grad_action_out_proj=0.1095 grad_shared_expert=0.5800 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5190/10000 [1:02:04<59:27, 1.35it/s, loss=0.0102, lr=1.40e-05, step=5189] Training: 52%|█████▏ | 5190/10000 [1:02:04<59:27, 1.35it/s, loss=0.0060, lr=1.40e-05, step=5190] Training: 52%|█████▏ | 5191/10000 [1:02:05<55:10, 1.45it/s, loss=0.0060, lr=1.40e-05, step=5190] Training: 52%|█████▏ | 5191/10000 [1:02:05<55:10, 1.45it/s, loss=0.0140, lr=1.40e-05, step=5191] Training: 52%|█████▏ | 5192/10000 [1:02:06<1:06:43, 1.20it/s, loss=0.0140, lr=1.40e-05, step=5191] Training: 52%|█████▏ | 5192/10000 [1:02:06<1:06:43, 1.20it/s, loss=0.0131, lr=1.40e-05, step=5192] Training: 52%|█████▏ | 5193/10000 [1:02:07<1:02:04, 1.29it/s, loss=0.0131, lr=1.40e-05, step=5192] Training: 52%|█████▏ | 5193/10000 [1:02:07<1:02:04, 1.29it/s, loss=0.0014, lr=1.40e-05, step=5193] Training: 52%|█████▏ | 5194/10000 [1:02:08<1:02:58, 1.27it/s, loss=0.0014, lr=1.40e-05, step=5193] Training: 52%|█████▏ | 5194/10000 [1:02:08<1:02:58, 1.27it/s, loss=0.0033, lr=1.40e-05, step=5194] Training: 52%|█████▏ | 5195/10000 [1:02:08<57:25, 1.39it/s, loss=0.0033, lr=1.40e-05, step=5194] Training: 52%|█████▏ | 5195/10000 [1:02:08<57:25, 1.39it/s, loss=0.0090, lr=1.40e-05, step=5195] Training: 52%|█████▏ | 5196/10000 [1:02:09<1:02:50, 1.27it/s, loss=0.0090, lr=1.40e-05, step=5195] Training: 52%|█████▏ | 5196/10000 [1:02:09<1:02:50, 1.27it/s, loss=0.0325, lr=1.40e-05, step=5196] Training: 52%|█████▏ | 5197/10000 [1:02:10<59:03, 1.36it/s, loss=0.0325, lr=1.40e-05, step=5196] Training: 52%|█████▏ | 5197/10000 [1:02:10<59:03, 1.36it/s, loss=0.0340, lr=1.40e-05, step=5197] Training: 52%|█████▏ | 5198/10000 [1:02:11<1:05:45, 1.22it/s, loss=0.0340, lr=1.40e-05, step=5197] Training: 52%|█████▏ | 5198/10000 [1:02:11<1:05:45, 1.22it/s, loss=0.0159, lr=1.39e-05, step=5198] Training: 52%|█████▏ | 5199/10000 [1:02:12<1:09:23, 1.15it/s, loss=0.0159, lr=1.39e-05, step=5198] Training: 52%|█████▏ | 5199/10000 [1:02:12<1:09:23, 1.15it/s, loss=0.0102, lr=1.39e-05, step=5199]17:08:20.268 [I] step=5200 loss=0.0122 smoothed_loss=0.0134 lr=1.40e-05 grad_norm=0.4432 step_time=0.6885s data_time=0.2016s it/s=1.125 eta_to_10000=4268.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0883 grad_shared_expert=0.6119 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5200/10000 [1:02:13<1:26:48, 1.09s/it, loss=0.0102, lr=1.39e-05, step=5199] Training: 52%|█████▏ | 5200/10000 [1:02:13<1:26:48, 1.09s/it, loss=0.0122, lr=1.39e-05, step=5200] Training: 52%|█████▏ | 5201/10000 [1:02:14<1:15:30, 1.06it/s, loss=0.0122, lr=1.39e-05, step=5200] Training: 52%|█████▏ | 5201/10000 [1:02:14<1:15:30, 1.06it/s, loss=0.0021, lr=1.39e-05, step=5201] Training: 52%|█████▏ | 5202/10000 [1:02:15<1:09:10, 1.16it/s, loss=0.0021, lr=1.39e-05, step=5201] Training: 52%|█████▏ | 5202/10000 [1:02:15<1:09:10, 1.16it/s, loss=0.0024, lr=1.39e-05, step=5202] Training: 52%|█████▏ | 5203/10000 [1:02:16<1:12:15, 1.11it/s, loss=0.0024, lr=1.39e-05, step=5202] Training: 52%|█████▏ | 5203/10000 [1:02:16<1:12:15, 1.11it/s, loss=0.0027, lr=1.39e-05, step=5203] Training: 52%|█████▏ | 5204/10000 [1:02:16<1:11:11, 1.12it/s, loss=0.0027, lr=1.39e-05, step=5203] Training: 52%|█████▏ | 5204/10000 [1:02:17<1:11:11, 1.12it/s, loss=0.0116, lr=1.39e-05, step=5204] Training: 52%|█████▏ | 5205/10000 [1:02:17<1:08:31, 1.17it/s, loss=0.0116, lr=1.39e-05, step=5204] Training: 52%|█████▏ | 5205/10000 [1:02:17<1:08:31, 1.17it/s, loss=0.0040, lr=1.39e-05, step=5205] Training: 52%|█████▏ | 5206/10000 [1:02:18<1:06:58, 1.19it/s, loss=0.0040, lr=1.39e-05, step=5205] Training: 52%|█████▏ | 5206/10000 [1:02:18<1:06:58, 1.19it/s, loss=0.0085, lr=1.39e-05, step=5206] Training: 52%|█████▏ | 5207/10000 [1:02:19<1:11:06, 1.12it/s, loss=0.0085, lr=1.39e-05, step=5206] Training: 52%|█████▏ | 5207/10000 [1:02:19<1:11:06, 1.12it/s, loss=0.0038, lr=1.39e-05, step=5207] Training: 52%|█████▏ | 5208/10000 [1:02:20<1:15:17, 1.06it/s, loss=0.0038, lr=1.39e-05, step=5207] Training: 52%|█████▏ | 5208/10000 [1:02:20<1:15:17, 1.06it/s, loss=0.0092, lr=1.39e-05, step=5208] Training: 52%|█████▏ | 5209/10000 [1:02:21<1:14:04, 1.08it/s, loss=0.0092, lr=1.39e-05, step=5208] Training: 52%|█████▏ | 5209/10000 [1:02:21<1:14:04, 1.08it/s, loss=0.0285, lr=1.39e-05, step=5209]17:08:29.196 [I] step=5210 loss=0.0243 smoothed_loss=0.0124 lr=1.39e-05 grad_norm=0.5010 step_time=0.6809s data_time=0.2119s it/s=1.121 eta_to_10000=4274.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0326 grad_action_out_proj=0.1776 grad_shared_expert=0.5024 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5210/10000 [1:02:22<1:21:33, 1.02s/it, loss=0.0285, lr=1.39e-05, step=5209] Training: 52%|█████▏ | 5210/10000 [1:02:22<1:21:33, 1.02s/it, loss=0.0243, lr=1.39e-05, step=5210] Training: 52%|█████▏ | 5211/10000 [1:02:23<1:10:27, 1.13it/s, loss=0.0243, lr=1.39e-05, step=5210] Training: 52%|█████▏ | 5211/10000 [1:02:23<1:10:27, 1.13it/s, loss=0.0156, lr=1.39e-05, step=5211] Training: 52%|█████▏ | 5212/10000 [1:02:24<1:13:09, 1.09it/s, loss=0.0156, lr=1.39e-05, step=5211] Training: 52%|█████▏ | 5212/10000 [1:02:24<1:13:09, 1.09it/s, loss=0.0119, lr=1.39e-05, step=5212] Training: 52%|█████▏ | 5213/10000 [1:02:25<1:11:57, 1.11it/s, loss=0.0119, lr=1.39e-05, step=5212] Training: 52%|█████▏ | 5213/10000 [1:02:25<1:11:57, 1.11it/s, loss=0.0025, lr=1.39e-05, step=5213] Training: 52%|█████▏ | 5214/10000 [1:02:26<1:13:15, 1.09it/s, loss=0.0025, lr=1.39e-05, step=5213] Training: 52%|█████▏ | 5214/10000 [1:02:26<1:13:15, 1.09it/s, loss=0.0335, lr=1.39e-05, step=5214] Training: 52%|█████▏ | 5215/10000 [1:02:27<1:23:06, 1.04s/it, loss=0.0335, lr=1.39e-05, step=5214] Training: 52%|█████▏ | 5215/10000 [1:02:27<1:23:06, 1.04s/it, loss=0.0164, lr=1.39e-05, step=5215] Training: 52%|█████▏ | 5216/10000 [1:02:28<1:19:32, 1.00it/s, loss=0.0164, lr=1.39e-05, step=5215] Training: 52%|█████▏ | 5216/10000 [1:02:28<1:19:32, 1.00it/s, loss=0.0025, lr=1.39e-05, step=5216] Training: 52%|█████▏ | 5217/10000 [1:02:29<1:22:16, 1.03s/it, loss=0.0025, lr=1.39e-05, step=5216] Training: 52%|█████▏ | 5217/10000 [1:02:29<1:22:16, 1.03s/it, loss=0.0495, lr=1.39e-05, step=5217] Training: 52%|█████▏ | 5218/10000 [1:02:30<1:18:32, 1.01it/s, loss=0.0495, lr=1.39e-05, step=5217] Training: 52%|█████▏ | 5218/10000 [1:02:30<1:18:32, 1.01it/s, loss=0.0209, lr=1.39e-05, step=5218] Training: 52%|█████▏ | 5219/10000 [1:02:31<1:12:31, 1.10it/s, loss=0.0209, lr=1.39e-05, step=5218] Training: 52%|█████▏ | 5219/10000 [1:02:31<1:12:31, 1.10it/s, loss=0.0612, lr=1.39e-05, step=5219]17:08:38.412 [I] step=5220 loss=0.0127 smoothed_loss=0.0206 lr=1.39e-05 grad_norm=0.4287 step_time=0.6975s data_time=0.2241s it/s=1.086 eta_to_10000=4400.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0346 grad_action_out_proj=0.1743 grad_shared_expert=0.6020 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5220/10000 [1:02:31<1:11:53, 1.11it/s, loss=0.0612, lr=1.39e-05, step=5219] Training: 52%|█████▏ | 5220/10000 [1:02:31<1:11:53, 1.11it/s, loss=0.0127, lr=1.39e-05, step=5220] Training: 52%|█████▏ | 5221/10000 [1:02:32<1:08:34, 1.16it/s, loss=0.0127, lr=1.39e-05, step=5220] Training: 52%|█████▏ | 5221/10000 [1:02:32<1:08:34, 1.16it/s, loss=0.0097, lr=1.39e-05, step=5221] Training: 52%|█████▏ | 5222/10000 [1:02:33<1:10:47, 1.12it/s, loss=0.0097, lr=1.39e-05, step=5221] Training: 52%|█████▏ | 5222/10000 [1:02:33<1:10:47, 1.12it/s, loss=0.0244, lr=1.39e-05, step=5222] Training: 52%|█████▏ | 5223/10000 [1:02:34<1:06:20, 1.20it/s, loss=0.0244, lr=1.39e-05, step=5222] Training: 52%|█████▏ | 5223/10000 [1:02:34<1:06:20, 1.20it/s, loss=0.0064, lr=1.39e-05, step=5223] Training: 52%|█████▏ | 5224/10000 [1:02:35<1:11:51, 1.11it/s, loss=0.0064, lr=1.39e-05, step=5223] Training: 52%|█████▏ | 5224/10000 [1:02:35<1:11:51, 1.11it/s, loss=0.0104, lr=1.39e-05, step=5224] Training: 52%|█████▏ | 5225/10000 [1:02:36<1:09:00, 1.15it/s, loss=0.0104, lr=1.39e-05, step=5224] Training: 52%|█████▏ | 5225/10000 [1:02:36<1:09:00, 1.15it/s, loss=0.0084, lr=1.38e-05, step=5225] Training: 52%|█████▏ | 5226/10000 [1:02:36<1:03:06, 1.26it/s, loss=0.0084, lr=1.38e-05, step=5225] Training: 52%|█████▏ | 5226/10000 [1:02:36<1:03:06, 1.26it/s, loss=0.0126, lr=1.38e-05, step=5226] Training: 52%|█████▏ | 5227/10000 [1:02:37<1:05:12, 1.22it/s, loss=0.0126, lr=1.38e-05, step=5226] Training: 52%|█████▏ | 5227/10000 [1:02:37<1:05:12, 1.22it/s, loss=0.0082, lr=1.38e-05, step=5227] Training: 52%|█████▏ | 5228/10000 [1:02:38<1:05:12, 1.22it/s, loss=0.0082, lr=1.38e-05, step=5227] Training: 52%|█████▏ | 5228/10000 [1:02:38<1:05:12, 1.22it/s, loss=0.0041, lr=1.38e-05, step=5228] Training: 52%|█████▏ | 5229/10000 [1:02:39<1:09:59, 1.14it/s, loss=0.0041, lr=1.38e-05, step=5228] Training: 52%|█████▏ | 5229/10000 [1:02:39<1:09:59, 1.14it/s, loss=0.0097, lr=1.38e-05, step=5229]17:08:47.088 [I] step=5230 loss=0.0280 smoothed_loss=0.0154 lr=1.38e-05 grad_norm=0.4378 step_time=0.7042s data_time=0.1634s it/s=1.153 eta_to_10000=4136.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0147 grad_action_out_proj=0.1656 grad_shared_expert=0.3753 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5230/10000 [1:02:40<1:14:21, 1.07it/s, loss=0.0097, lr=1.38e-05, step=5229] Training: 52%|█████▏ | 5230/10000 [1:02:40<1:14:21, 1.07it/s, loss=0.0280, lr=1.38e-05, step=5230] Training: 52%|█████▏ | 5231/10000 [1:02:41<1:09:39, 1.14it/s, loss=0.0280, lr=1.38e-05, step=5230] Training: 52%|█████▏ | 5231/10000 [1:02:41<1:09:39, 1.14it/s, loss=0.0131, lr=1.38e-05, step=5231] Training: 52%|█████▏ | 5232/10000 [1:02:42<1:10:18, 1.13it/s, loss=0.0131, lr=1.38e-05, step=5231] Training: 52%|█████▏ | 5232/10000 [1:02:42<1:10:18, 1.13it/s, loss=0.0287, lr=1.38e-05, step=5232] Training: 52%|█████▏ | 5233/10000 [1:02:43<1:11:04, 1.12it/s, loss=0.0287, lr=1.38e-05, step=5232] Training: 52%|█████▏ | 5233/10000 [1:02:43<1:11:04, 1.12it/s, loss=0.0100, lr=1.38e-05, step=5233] Training: 52%|█████▏ | 5234/10000 [1:02:43<1:06:01, 1.20it/s, loss=0.0100, lr=1.38e-05, step=5233] Training: 52%|█████▏ | 5234/10000 [1:02:43<1:06:01, 1.20it/s, loss=0.0063, lr=1.38e-05, step=5234] Training: 52%|█████▏ | 5235/10000 [1:02:44<1:02:13, 1.28it/s, loss=0.0063, lr=1.38e-05, step=5234] Training: 52%|█████▏ | 5235/10000 [1:02:44<1:02:13, 1.28it/s, loss=0.0047, lr=1.38e-05, step=5235] Training: 52%|█████▏ | 5236/10000 [1:02:45<1:07:02, 1.18it/s, loss=0.0047, lr=1.38e-05, step=5235] Training: 52%|█████▏ | 5236/10000 [1:02:45<1:07:02, 1.18it/s, loss=0.0064, lr=1.38e-05, step=5236] Training: 52%|█████▏ | 5237/10000 [1:02:46<1:04:16, 1.24it/s, loss=0.0064, lr=1.38e-05, step=5236] Training: 52%|█████▏ | 5237/10000 [1:02:46<1:04:16, 1.24it/s, loss=0.0150, lr=1.38e-05, step=5237] Training: 52%|█████▏ | 5238/10000 [1:02:46<58:01, 1.37it/s, loss=0.0150, lr=1.38e-05, step=5237] Training: 52%|█████▏ | 5238/10000 [1:02:46<58:01, 1.37it/s, loss=0.0042, lr=1.38e-05, step=5238] Training: 52%|█████▏ | 5239/10000 [1:02:47<1:03:12, 1.26it/s, loss=0.0042, lr=1.38e-05, step=5238] Training: 52%|█████▏ | 5239/10000 [1:02:47<1:03:12, 1.26it/s, loss=0.0027, lr=1.38e-05, step=5239]17:08:55.020 [I] step=5240 loss=0.0084 smoothed_loss=0.0111 lr=1.38e-05 grad_norm=0.4223 step_time=0.6469s data_time=0.1463s it/s=1.262 eta_to_10000=3772.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0118 grad_action_out_proj=0.1256 grad_shared_expert=0.6506 (10775:train_pytorch.py:850) + Training: 52%|█████▏ | 5240/10000 [1:02:48<1:03:19, 1.25it/s, loss=0.0027, lr=1.38e-05, step=5239] Training: 52%|█████▏ | 5240/10000 [1:02:48<1:03:19, 1.25it/s, loss=0.0084, lr=1.38e-05, step=5240] Training: 52%|█████▏ | 5241/10000 [1:02:49<56:55, 1.39it/s, loss=0.0084, lr=1.38e-05, step=5240] Training: 52%|█████▏ | 5241/10000 [1:02:49<56:55, 1.39it/s, loss=0.0343, lr=1.38e-05, step=5241] Training: 52%|█████▏ | 5242/10000 [1:02:49<56:58, 1.39it/s, loss=0.0343, lr=1.38e-05, step=5241] Training: 52%|█████▏ | 5242/10000 [1:02:49<56:58, 1.39it/s, loss=0.0595, lr=1.38e-05, step=5242] Training: 52%|█████▏ | 5243/10000 [1:02:50<1:02:19, 1.27it/s, loss=0.0595, lr=1.38e-05, step=5242] Training: 52%|█████▏ | 5243/10000 [1:02:50<1:02:19, 1.27it/s, loss=0.0239, lr=1.38e-05, step=5243] Training: 52%|█████▏ | 5244/10000 [1:02:51<59:14, 1.34it/s, loss=0.0239, lr=1.38e-05, step=5243] Training: 52%|█████▏ | 5244/10000 [1:02:51<59:14, 1.34it/s, loss=0.0182, lr=1.38e-05, step=5244] Training: 52%|█████▏ | 5245/10000 [1:02:52<55:04, 1.44it/s, loss=0.0182, lr=1.38e-05, step=5244] Training: 52%|█████▏ | 5245/10000 [1:02:52<55:04, 1.44it/s, loss=0.0171, lr=1.38e-05, step=5245] Training: 52%|█████▏ | 5246/10000 [1:02:52<56:14, 1.41it/s, loss=0.0171, lr=1.38e-05, step=5245] Training: 52%|█████▏ | 5246/10000 [1:02:52<56:14, 1.41it/s, loss=0.0089, lr=1.38e-05, step=5246] Training: 52%|█████▏ | 5247/10000 [1:02:53<53:53, 1.47it/s, loss=0.0089, lr=1.38e-05, step=5246] Training: 52%|█████▏ | 5247/10000 [1:02:53<53:53, 1.47it/s, loss=0.0299, lr=1.38e-05, step=5247] Training: 52%|█████▏ | 5248/10000 [1:02:53<52:30, 1.51it/s, loss=0.0299, lr=1.38e-05, step=5247] Training: 52%|█████▏ | 5248/10000 [1:02:53<52:30, 1.51it/s, loss=0.0685, lr=1.38e-05, step=5248] Training: 52%|█████▏ | 5249/10000 [1:02:54<59:55, 1.32it/s, loss=0.0685, lr=1.38e-05, step=5248] Training: 52%|█████▏ | 5249/10000 [1:02:54<59:55, 1.32it/s, loss=0.0045, lr=1.38e-05, step=5249]17:09:02.181 [I] step=5250 loss=0.0144 smoothed_loss=0.0210 lr=1.38e-05 grad_norm=0.4996 step_time=0.6173s data_time=0.0987s it/s=1.397 eta_to_10000=3399.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0162 grad_action_out_proj=0.1721 grad_shared_expert=0.5346 (10775:train_pytorch.py:850) + Training: 52%|█████▎ | 5250/10000 [1:02:55<1:00:31, 1.31it/s, loss=0.0045, lr=1.38e-05, step=5249] Training: 52%|█████▎ | 5250/10000 [1:02:55<1:00:31, 1.31it/s, loss=0.0144, lr=1.38e-05, step=5250] Training: 53%|█████▎ | 5251/10000 [1:02:56<56:29, 1.40it/s, loss=0.0144, lr=1.38e-05, step=5250] Training: 53%|█████▎ | 5251/10000 [1:02:56<56:29, 1.40it/s, loss=0.0119, lr=1.38e-05, step=5251] Training: 53%|█████▎ | 5252/10000 [1:02:56<52:56, 1.49it/s, loss=0.0119, lr=1.38e-05, step=5251] Training: 53%|█████▎ | 5252/10000 [1:02:56<52:56, 1.49it/s, loss=0.0119, lr=1.37e-05, step=5252] Training: 53%|█████▎ | 5253/10000 [1:02:57<50:51, 1.56it/s, loss=0.0119, lr=1.37e-05, step=5252] Training: 53%|█████▎ | 5253/10000 [1:02:57<50:51, 1.56it/s, loss=0.0063, lr=1.37e-05, step=5253] Training: 53%|█████▎ | 5254/10000 [1:02:58<53:35, 1.48it/s, loss=0.0063, lr=1.37e-05, step=5253] Training: 53%|█████▎ | 5254/10000 [1:02:58<53:35, 1.48it/s, loss=0.0065, lr=1.37e-05, step=5254] Training: 53%|█████▎ | 5255/10000 [1:02:58<51:40, 1.53it/s, loss=0.0065, lr=1.37e-05, step=5254] Training: 53%|█████▎ | 5255/10000 [1:02:58<51:40, 1.53it/s, loss=0.0136, lr=1.37e-05, step=5255] Training: 53%|█████▎ | 5256/10000 [1:02:59<58:27, 1.35it/s, loss=0.0136, lr=1.37e-05, step=5255] Training: 53%|█████▎ | 5256/10000 [1:02:59<58:27, 1.35it/s, loss=0.0049, lr=1.37e-05, step=5256] Training: 53%|█████▎ | 5257/10000 [1:03:00<1:02:59, 1.25it/s, loss=0.0049, lr=1.37e-05, step=5256] Training: 53%|█████▎ | 5257/10000 [1:03:00<1:02:59, 1.25it/s, loss=0.0787, lr=1.37e-05, step=5257] Training: 53%|█████▎ | 5258/10000 [1:03:01<59:48, 1.32it/s, loss=0.0787, lr=1.37e-05, step=5257] Training: 53%|█████▎ | 5258/10000 [1:03:01<59:48, 1.32it/s, loss=0.0181, lr=1.37e-05, step=5258] Training: 53%|█████▎ | 5259/10000 [1:03:02<56:51, 1.39it/s, loss=0.0181, lr=1.37e-05, step=5258] Training: 53%|█████▎ | 5259/10000 [1:03:02<56:51, 1.39it/s, loss=0.0093, lr=1.37e-05, step=5259]17:09:09.676 [I] step=5260 loss=0.0010 smoothed_loss=0.0182 lr=1.37e-05 grad_norm=0.4383 step_time=0.5853s data_time=0.1642s it/s=1.335 eta_to_10000=3550.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0248 grad_action_out_proj=0.1257 grad_shared_expert=0.3621 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5260/10000 [1:03:03<1:13:11, 1.08it/s, loss=0.0093, lr=1.37e-05, step=5259] Training: 53%|█████▎ | 5260/10000 [1:03:03<1:13:11, 1.08it/s, loss=0.0010, lr=1.37e-05, step=5260] Training: 53%|█████▎ | 5261/10000 [1:03:04<1:12:44, 1.09it/s, loss=0.0010, lr=1.37e-05, step=5260] Training: 53%|█████▎ | 5261/10000 [1:03:04<1:12:44, 1.09it/s, loss=0.0114, lr=1.37e-05, step=5261] Training: 53%|█████▎ | 5262/10000 [1:03:04<1:06:06, 1.19it/s, loss=0.0114, lr=1.37e-05, step=5261] Training: 53%|█████▎ | 5262/10000 [1:03:04<1:06:06, 1.19it/s, loss=0.0116, lr=1.37e-05, step=5262] Training: 53%|█████▎ | 5263/10000 [1:03:05<1:00:46, 1.30it/s, loss=0.0116, lr=1.37e-05, step=5262] Training: 53%|█████▎ | 5263/10000 [1:03:05<1:00:46, 1.30it/s, loss=0.0880, lr=1.37e-05, step=5263] Training: 53%|█████▎ | 5264/10000 [1:03:06<1:02:01, 1.27it/s, loss=0.0880, lr=1.37e-05, step=5263] Training: 53%|█████▎ | 5264/10000 [1:03:06<1:02:01, 1.27it/s, loss=0.0055, lr=1.37e-05, step=5264] Training: 53%|█████▎ | 5265/10000 [1:03:06<57:21, 1.38it/s, loss=0.0055, lr=1.37e-05, step=5264] Training: 53%|█████▎ | 5265/10000 [1:03:07<57:21, 1.38it/s, loss=0.0057, lr=1.37e-05, step=5265] Training: 53%|█████▎ | 5266/10000 [1:03:07<54:51, 1.44it/s, loss=0.0057, lr=1.37e-05, step=5265] Training: 53%|█████▎ | 5266/10000 [1:03:07<54:51, 1.44it/s, loss=0.0112, lr=1.37e-05, step=5266] Training: 53%|█████▎ | 5267/10000 [1:03:08<57:28, 1.37it/s, loss=0.0112, lr=1.37e-05, step=5266] Training: 53%|█████▎ | 5267/10000 [1:03:08<57:28, 1.37it/s, loss=0.0084, lr=1.37e-05, step=5267] Training: 53%|█████▎ | 5268/10000 [1:03:09<56:54, 1.39it/s, loss=0.0084, lr=1.37e-05, step=5267] Training: 53%|█████▎ | 5268/10000 [1:03:09<56:54, 1.39it/s, loss=0.0348, lr=1.37e-05, step=5268] Training: 53%|█████▎ | 5269/10000 [1:03:09<54:00, 1.46it/s, loss=0.0348, lr=1.37e-05, step=5268] Training: 53%|█████▎ | 5269/10000 [1:03:09<54:00, 1.46it/s, loss=0.0043, lr=1.37e-05, step=5269]17:09:16.825 [I] step=5270 loss=0.0144 smoothed_loss=0.0181 lr=1.37e-05 grad_norm=0.4541 step_time=0.5952s data_time=0.1197s it/s=1.435 eta_to_10000=3295.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1010 grad_shared_expert=0.4696 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5270/10000 [1:03:10<53:26, 1.48it/s, loss=0.0043, lr=1.37e-05, step=5269] Training: 53%|█████▎ | 5270/10000 [1:03:10<53:26, 1.48it/s, loss=0.0144, lr=1.37e-05, step=5270] Training: 53%|█████▎ | 5271/10000 [1:03:11<58:05, 1.36it/s, loss=0.0144, lr=1.37e-05, step=5270] Training: 53%|█████▎ | 5271/10000 [1:03:11<58:05, 1.36it/s, loss=0.0114, lr=1.37e-05, step=5271] Training: 53%|█████▎ | 5272/10000 [1:03:11<55:04, 1.43it/s, loss=0.0114, lr=1.37e-05, step=5271] Training: 53%|█████▎ | 5272/10000 [1:03:11<55:04, 1.43it/s, loss=0.0066, lr=1.37e-05, step=5272] Training: 53%|█████▎ | 5273/10000 [1:03:12<52:53, 1.49it/s, loss=0.0066, lr=1.37e-05, step=5272] Training: 53%|█████▎ | 5273/10000 [1:03:12<52:53, 1.49it/s, loss=0.0124, lr=1.37e-05, step=5273] Training: 53%|█████▎ | 5274/10000 [1:03:13<53:05, 1.48it/s, loss=0.0124, lr=1.37e-05, step=5273] Training: 53%|█████▎ | 5274/10000 [1:03:13<53:05, 1.48it/s, loss=0.0215, lr=1.37e-05, step=5274] Training: 53%|█████▎ | 5275/10000 [1:03:14<58:46, 1.34it/s, loss=0.0215, lr=1.37e-05, step=5274] Training: 53%|█████▎ | 5275/10000 [1:03:14<58:46, 1.34it/s, loss=0.0025, lr=1.37e-05, step=5275] Training: 53%|█████▎ | 5276/10000 [1:03:14<54:45, 1.44it/s, loss=0.0025, lr=1.37e-05, step=5275] Training: 53%|█████▎ | 5276/10000 [1:03:14<54:45, 1.44it/s, loss=0.0055, lr=1.37e-05, step=5276] Training: 53%|█████▎ | 5277/10000 [1:03:15<51:08, 1.54it/s, loss=0.0055, lr=1.37e-05, step=5276] Training: 53%|█████▎ | 5277/10000 [1:03:15<51:08, 1.54it/s, loss=0.0027, lr=1.37e-05, step=5277] Training: 53%|█████▎ | 5278/10000 [1:03:15<49:35, 1.59it/s, loss=0.0027, lr=1.37e-05, step=5277] Training: 53%|█████▎ | 5278/10000 [1:03:15<49:35, 1.59it/s, loss=0.0035, lr=1.36e-05, step=5278] Training: 53%|█████▎ | 5279/10000 [1:03:16<52:45, 1.49it/s, loss=0.0035, lr=1.36e-05, step=5278] Training: 53%|█████▎ | 5279/10000 [1:03:16<52:45, 1.49it/s, loss=0.0195, lr=1.36e-05, step=5279]17:09:23.591 [I] step=5280 loss=0.0251 smoothed_loss=0.0140 lr=1.37e-05 grad_norm=0.4173 step_time=0.5841s data_time=0.0925s it/s=1.479 eta_to_10000=3192.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0224 grad_action_out_proj=0.2121 grad_shared_expert=0.5972 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5280/10000 [1:03:17<51:15, 1.53it/s, loss=0.0195, lr=1.36e-05, step=5279] Training: 53%|█████▎ | 5280/10000 [1:03:17<51:15, 1.53it/s, loss=0.0251, lr=1.36e-05, step=5280] Training: 53%|█████▎ | 5281/10000 [1:03:17<49:00, 1.61it/s, loss=0.0251, lr=1.36e-05, step=5280] Training: 53%|█████▎ | 5281/10000 [1:03:17<49:00, 1.61it/s, loss=0.0139, lr=1.36e-05, step=5281] Training: 53%|█████▎ | 5282/10000 [1:03:18<56:33, 1.39it/s, loss=0.0139, lr=1.36e-05, step=5281] Training: 53%|█████▎ | 5282/10000 [1:03:18<56:33, 1.39it/s, loss=0.0034, lr=1.36e-05, step=5282] Training: 53%|█████▎ | 5283/10000 [1:03:19<54:59, 1.43it/s, loss=0.0034, lr=1.36e-05, step=5282] Training: 53%|█████▎ | 5283/10000 [1:03:19<54:59, 1.43it/s, loss=0.0096, lr=1.36e-05, step=5283] Training: 53%|█████▎ | 5284/10000 [1:03:20<56:10, 1.40it/s, loss=0.0096, lr=1.36e-05, step=5283] Training: 53%|█████▎ | 5284/10000 [1:03:20<56:10, 1.40it/s, loss=0.0093, lr=1.36e-05, step=5284] Training: 53%|█████▎ | 5285/10000 [1:03:20<54:41, 1.44it/s, loss=0.0093, lr=1.36e-05, step=5284] Training: 53%|█████▎ | 5285/10000 [1:03:20<54:41, 1.44it/s, loss=0.0134, lr=1.36e-05, step=5285] Training: 53%|█████▎ | 5286/10000 [1:03:21<1:00:06, 1.31it/s, loss=0.0134, lr=1.36e-05, step=5285] Training: 53%|█████▎ | 5286/10000 [1:03:21<1:00:06, 1.31it/s, loss=0.0128, lr=1.36e-05, step=5286] Training: 53%|█████▎ | 5287/10000 [1:03:22<59:38, 1.32it/s, loss=0.0128, lr=1.36e-05, step=5286] Training: 53%|█████▎ | 5287/10000 [1:03:22<59:38, 1.32it/s, loss=0.0083, lr=1.36e-05, step=5287] Training: 53%|█████▎ | 5288/10000 [1:03:23<56:32, 1.39it/s, loss=0.0083, lr=1.36e-05, step=5287] Training: 53%|█████▎ | 5288/10000 [1:03:23<56:32, 1.39it/s, loss=0.0042, lr=1.36e-05, step=5288] Training: 53%|█████▎ | 5289/10000 [1:03:23<1:01:12, 1.28it/s, loss=0.0042, lr=1.36e-05, step=5288] Training: 53%|█████▎ | 5289/10000 [1:03:23<1:01:12, 1.28it/s, loss=0.0088, lr=1.36e-05, step=5289]17:09:31.273 [I] step=5290 loss=0.0027 smoothed_loss=0.0102 lr=1.36e-05 grad_norm=0.4877 step_time=0.6232s data_time=0.1450s it/s=1.302 eta_to_10000=3617.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0084 grad_action_out_proj=0.1034 grad_shared_expert=0.3231 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5290/10000 [1:03:24<1:04:15, 1.22it/s, loss=0.0088, lr=1.36e-05, step=5289] Training: 53%|█████▎ | 5290/10000 [1:03:24<1:04:15, 1.22it/s, loss=0.0027, lr=1.36e-05, step=5290] Training: 53%|█████▎ | 5291/10000 [1:03:25<59:44, 1.31it/s, loss=0.0027, lr=1.36e-05, step=5290] Training: 53%|█████▎ | 5291/10000 [1:03:25<59:44, 1.31it/s, loss=0.0050, lr=1.36e-05, step=5291] Training: 53%|█████▎ | 5292/10000 [1:03:26<56:53, 1.38it/s, loss=0.0050, lr=1.36e-05, step=5291] Training: 53%|█████▎ | 5292/10000 [1:03:26<56:53, 1.38it/s, loss=0.0057, lr=1.36e-05, step=5292] Training: 53%|█████▎ | 5293/10000 [1:03:27<1:00:57, 1.29it/s, loss=0.0057, lr=1.36e-05, step=5292] Training: 53%|█████▎ | 5293/10000 [1:03:27<1:00:57, 1.29it/s, loss=0.0024, lr=1.36e-05, step=5293] Training: 53%|█████▎ | 5294/10000 [1:03:27<57:47, 1.36it/s, loss=0.0024, lr=1.36e-05, step=5293] Training: 53%|█████▎ | 5294/10000 [1:03:27<57:47, 1.36it/s, loss=0.0069, lr=1.36e-05, step=5294] Training: 53%|█████▎ | 5295/10000 [1:03:28<54:24, 1.44it/s, loss=0.0069, lr=1.36e-05, step=5294] Training: 53%|█████▎ | 5295/10000 [1:03:28<54:24, 1.44it/s, loss=0.0118, lr=1.36e-05, step=5295] Training: 53%|█████▎ | 5296/10000 [1:03:29<1:03:40, 1.23it/s, loss=0.0118, lr=1.36e-05, step=5295] Training: 53%|█████▎ | 5296/10000 [1:03:29<1:03:40, 1.23it/s, loss=0.0195, lr=1.36e-05, step=5296] Training: 53%|█████▎ | 5297/10000 [1:03:30<1:03:09, 1.24it/s, loss=0.0195, lr=1.36e-05, step=5296] Training: 53%|█████▎ | 5297/10000 [1:03:30<1:03:09, 1.24it/s, loss=0.0027, lr=1.36e-05, step=5297] Training: 53%|█████▎ | 5298/10000 [1:03:30<59:27, 1.32it/s, loss=0.0027, lr=1.36e-05, step=5297] Training: 53%|█████▎ | 5298/10000 [1:03:30<59:27, 1.32it/s, loss=0.0192, lr=1.36e-05, step=5298] Training: 53%|█████▎ | 5299/10000 [1:03:31<55:55, 1.40it/s, loss=0.0192, lr=1.36e-05, step=5298] Training: 53%|█████▎ | 5299/10000 [1:03:31<55:55, 1.40it/s, loss=0.0080, lr=1.36e-05, step=5299]17:09:38.455 [I] step=5300 loss=0.0082 smoothed_loss=0.0097 lr=1.36e-05 grad_norm=0.4710 step_time=0.5903s data_time=0.1279s it/s=1.393 eta_to_10000=3374.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0216 grad_action_out_proj=0.1614 grad_shared_expert=0.5357 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5300/10000 [1:03:32<54:24, 1.44it/s, loss=0.0080, lr=1.36e-05, step=5299] Training: 53%|█████▎ | 5300/10000 [1:03:32<54:24, 1.44it/s, loss=0.0082, lr=1.36e-05, step=5300] Training: 53%|█████▎ | 5301/10000 [1:03:32<57:30, 1.36it/s, loss=0.0082, lr=1.36e-05, step=5300] Training: 53%|█████▎ | 5301/10000 [1:03:32<57:30, 1.36it/s, loss=0.0079, lr=1.36e-05, step=5301] Training: 53%|█████▎ | 5302/10000 [1:03:33<52:26, 1.49it/s, loss=0.0079, lr=1.36e-05, step=5301] Training: 53%|█████▎ | 5302/10000 [1:03:33<52:26, 1.49it/s, loss=0.0051, lr=1.36e-05, step=5302] Training: 53%|█████▎ | 5303/10000 [1:03:34<57:10, 1.37it/s, loss=0.0051, lr=1.36e-05, step=5302] Training: 53%|█████▎ | 5303/10000 [1:03:34<57:10, 1.37it/s, loss=0.0120, lr=1.36e-05, step=5303] Training: 53%|█████▎ | 5304/10000 [1:03:34<57:32, 1.36it/s, loss=0.0120, lr=1.36e-05, step=5303] Training: 53%|█████▎ | 5304/10000 [1:03:34<57:32, 1.36it/s, loss=0.0077, lr=1.36e-05, step=5304] Training: 53%|█████▎ | 5305/10000 [1:03:35<1:02:04, 1.26it/s, loss=0.0077, lr=1.36e-05, step=5304] Training: 53%|█████▎ | 5305/10000 [1:03:35<1:02:04, 1.26it/s, loss=0.0123, lr=1.35e-05, step=5305] Training: 53%|█████▎ | 5306/10000 [1:03:36<59:39, 1.31it/s, loss=0.0123, lr=1.35e-05, step=5305] Training: 53%|█████▎ | 5306/10000 [1:03:36<59:39, 1.31it/s, loss=0.0052, lr=1.35e-05, step=5306] Training: 53%|█████▎ | 5307/10000 [1:03:37<1:03:13, 1.24it/s, loss=0.0052, lr=1.35e-05, step=5306] Training: 53%|█████▎ | 5307/10000 [1:03:37<1:03:13, 1.24it/s, loss=0.0044, lr=1.35e-05, step=5307] Training: 53%|█████▎ | 5308/10000 [1:03:38<1:04:38, 1.21it/s, loss=0.0044, lr=1.35e-05, step=5307] Training: 53%|█████▎ | 5308/10000 [1:03:38<1:04:38, 1.21it/s, loss=0.0078, lr=1.35e-05, step=5308] Training: 53%|█████▎ | 5309/10000 [1:03:39<1:00:27, 1.29it/s, loss=0.0078, lr=1.35e-05, step=5308] Training: 53%|█████▎ | 5309/10000 [1:03:39<1:00:27, 1.29it/s, loss=0.0110, lr=1.35e-05, step=5309]17:09:46.347 [I] step=5310 loss=0.0041 smoothed_loss=0.0083 lr=1.35e-05 grad_norm=0.4483 step_time=0.6430s data_time=0.1462s it/s=1.269 eta_to_10000=3696.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.0837 grad_shared_expert=0.3836 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5310/10000 [1:03:39<1:02:37, 1.25it/s, loss=0.0110, lr=1.35e-05, step=5309] Training: 53%|█████▎ | 5310/10000 [1:03:39<1:02:37, 1.25it/s, loss=0.0041, lr=1.35e-05, step=5310] Training: 53%|█████▎ | 5311/10000 [1:03:40<57:11, 1.37it/s, loss=0.0041, lr=1.35e-05, step=5310] Training: 53%|█████▎ | 5311/10000 [1:03:40<57:11, 1.37it/s, loss=0.0118, lr=1.35e-05, step=5311] Training: 53%|█████▎ | 5312/10000 [1:03:41<55:18, 1.41it/s, loss=0.0118, lr=1.35e-05, step=5311] Training: 53%|█████▎ | 5312/10000 [1:03:41<55:18, 1.41it/s, loss=0.0379, lr=1.35e-05, step=5312] Training: 53%|█████▎ | 5313/10000 [1:03:41<53:53, 1.45it/s, loss=0.0379, lr=1.35e-05, step=5312] Training: 53%|█████▎ | 5313/10000 [1:03:41<53:53, 1.45it/s, loss=0.0132, lr=1.35e-05, step=5313] Training: 53%|█████▎ | 5314/10000 [1:03:42<55:02, 1.42it/s, loss=0.0132, lr=1.35e-05, step=5313] Training: 53%|█████▎ | 5314/10000 [1:03:42<55:02, 1.42it/s, loss=0.0315, lr=1.35e-05, step=5314] Training: 53%|█████▎ | 5315/10000 [1:03:43<1:05:58, 1.18it/s, loss=0.0315, lr=1.35e-05, step=5314] Training: 53%|█████▎ | 5315/10000 [1:03:43<1:05:58, 1.18it/s, loss=0.0083, lr=1.35e-05, step=5315] Training: 53%|█████▎ | 5316/10000 [1:03:44<1:01:36, 1.27it/s, loss=0.0083, lr=1.35e-05, step=5315] Training: 53%|█████▎ | 5316/10000 [1:03:44<1:01:36, 1.27it/s, loss=0.0021, lr=1.35e-05, step=5316] Training: 53%|█████▎ | 5317/10000 [1:03:44<56:39, 1.38it/s, loss=0.0021, lr=1.35e-05, step=5316] Training: 53%|█████▎ | 5317/10000 [1:03:44<56:39, 1.38it/s, loss=0.0095, lr=1.35e-05, step=5317] Training: 53%|█████▎ | 5318/10000 [1:03:46<1:06:24, 1.18it/s, loss=0.0095, lr=1.35e-05, step=5317] Training: 53%|█████▎ | 5318/10000 [1:03:46<1:06:24, 1.18it/s, loss=0.0146, lr=1.35e-05, step=5318] Training: 53%|█████▎ | 5319/10000 [1:03:46<1:03:48, 1.22it/s, loss=0.0146, lr=1.35e-05, step=5318] Training: 53%|█████▎ | 5319/10000 [1:03:46<1:03:48, 1.22it/s, loss=0.0273, lr=1.35e-05, step=5319]17:09:53.974 [I] step=5320 loss=0.0150 smoothed_loss=0.0138 lr=1.35e-05 grad_norm=0.6120 step_time=0.6233s data_time=0.1394s it/s=1.311 eta_to_10000=3568.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0147 grad_action_out_proj=0.1268 grad_shared_expert=0.6698 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5320/10000 [1:03:47<1:01:46, 1.26it/s, loss=0.0273, lr=1.35e-05, step=5319] Training: 53%|█████▎ | 5320/10000 [1:03:47<1:01:46, 1.26it/s, loss=0.0150, lr=1.35e-05, step=5320] Training: 53%|█████▎ | 5321/10000 [1:03:48<57:57, 1.35it/s, loss=0.0150, lr=1.35e-05, step=5320] Training: 53%|█████▎ | 5321/10000 [1:03:48<57:57, 1.35it/s, loss=0.0042, lr=1.35e-05, step=5321] Training: 53%|█████▎ | 5322/10000 [1:03:49<59:58, 1.30it/s, loss=0.0042, lr=1.35e-05, step=5321] Training: 53%|█████▎ | 5322/10000 [1:03:49<59:58, 1.30it/s, loss=0.0110, lr=1.35e-05, step=5322] Training: 53%|█████▎ | 5323/10000 [1:03:49<1:02:47, 1.24it/s, loss=0.0110, lr=1.35e-05, step=5322] Training: 53%|█████▎ | 5323/10000 [1:03:49<1:02:47, 1.24it/s, loss=0.0437, lr=1.35e-05, step=5323] Training: 53%|█████▎ | 5324/10000 [1:03:50<59:36, 1.31it/s, loss=0.0437, lr=1.35e-05, step=5323] Training: 53%|█████▎ | 5324/10000 [1:03:50<59:36, 1.31it/s, loss=0.0080, lr=1.35e-05, step=5324] Training: 53%|█████▎ | 5325/10000 [1:03:51<1:04:05, 1.22it/s, loss=0.0080, lr=1.35e-05, step=5324] Training: 53%|█████▎ | 5325/10000 [1:03:51<1:04:05, 1.22it/s, loss=0.0057, lr=1.35e-05, step=5325] Training: 53%|█████▎ | 5326/10000 [1:03:52<1:05:19, 1.19it/s, loss=0.0057, lr=1.35e-05, step=5325] Training: 53%|█████▎ | 5326/10000 [1:03:52<1:05:19, 1.19it/s, loss=0.0082, lr=1.35e-05, step=5326] Training: 53%|█████▎ | 5327/10000 [1:03:53<1:00:59, 1.28it/s, loss=0.0082, lr=1.35e-05, step=5326] Training: 53%|█████▎ | 5327/10000 [1:03:53<1:00:59, 1.28it/s, loss=0.0126, lr=1.35e-05, step=5327] Training: 53%|█████▎ | 5328/10000 [1:03:54<1:12:31, 1.07it/s, loss=0.0126, lr=1.35e-05, step=5327] Training: 53%|█████▎ | 5328/10000 [1:03:54<1:12:31, 1.07it/s, loss=0.0070, lr=1.35e-05, step=5328] Training: 53%|█████▎ | 5329/10000 [1:03:55<1:12:09, 1.08it/s, loss=0.0070, lr=1.35e-05, step=5328] Training: 53%|█████▎ | 5329/10000 [1:03:55<1:12:09, 1.08it/s, loss=0.0024, lr=1.35e-05, step=5329]17:10:02.365 [I] step=5330 loss=0.0033 smoothed_loss=0.0109 lr=1.35e-05 grad_norm=0.4476 step_time=0.6181s data_time=0.2211s it/s=1.193 eta_to_10000=3914.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0076 grad_action_out_proj=0.0906 grad_shared_expert=0.3266 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5330/10000 [1:03:56<1:10:43, 1.10it/s, loss=0.0024, lr=1.35e-05, step=5329] Training: 53%|█████▎ | 5330/10000 [1:03:56<1:10:43, 1.10it/s, loss=0.0033, lr=1.35e-05, step=5330] Training: 53%|█████▎ | 5331/10000 [1:03:56<1:06:08, 1.18it/s, loss=0.0033, lr=1.35e-05, step=5330] Training: 53%|█████▎ | 5331/10000 [1:03:56<1:06:08, 1.18it/s, loss=0.0131, lr=1.35e-05, step=5331] Training: 53%|█████▎ | 5332/10000 [1:03:57<1:09:56, 1.11it/s, loss=0.0131, lr=1.35e-05, step=5331] Training: 53%|█████▎ | 5332/10000 [1:03:57<1:09:56, 1.11it/s, loss=0.0167, lr=1.34e-05, step=5332] Training: 53%|█████▎ | 5333/10000 [1:03:58<1:04:02, 1.21it/s, loss=0.0167, lr=1.34e-05, step=5332] Training: 53%|█████▎ | 5333/10000 [1:03:58<1:04:02, 1.21it/s, loss=0.0410, lr=1.34e-05, step=5333] Training: 53%|█████▎ | 5334/10000 [1:03:59<1:00:37, 1.28it/s, loss=0.0410, lr=1.34e-05, step=5333] Training: 53%|█████▎ | 5334/10000 [1:03:59<1:00:37, 1.28it/s, loss=0.0077, lr=1.34e-05, step=5334] Training: 53%|█████▎ | 5335/10000 [1:03:59<1:01:45, 1.26it/s, loss=0.0077, lr=1.34e-05, step=5334] Training: 53%|█████▎ | 5335/10000 [1:03:59<1:01:45, 1.26it/s, loss=0.0997, lr=1.34e-05, step=5335] Training: 53%|█████▎ | 5336/10000 [1:04:00<56:41, 1.37it/s, loss=0.0997, lr=1.34e-05, step=5335] Training: 53%|█████▎ | 5336/10000 [1:04:00<56:41, 1.37it/s, loss=0.0054, lr=1.34e-05, step=5336] Training: 53%|█████▎ | 5337/10000 [1:04:01<54:39, 1.42it/s, loss=0.0054, lr=1.34e-05, step=5336] Training: 53%|█████▎ | 5337/10000 [1:04:01<54:39, 1.42it/s, loss=0.0280, lr=1.34e-05, step=5337] Training: 53%|█████▎ | 5338/10000 [1:04:01<54:39, 1.42it/s, loss=0.0280, lr=1.34e-05, step=5337] Training: 53%|█████▎ | 5338/10000 [1:04:01<54:39, 1.42it/s, loss=0.0097, lr=1.34e-05, step=5338] Training: 53%|█████▎ | 5339/10000 [1:04:03<1:09:41, 1.11it/s, loss=0.0097, lr=1.34e-05, step=5338] Training: 53%|█████▎ | 5339/10000 [1:04:03<1:09:41, 1.11it/s, loss=0.0162, lr=1.34e-05, step=5339]17:10:10.586 [I] step=5340 loss=0.0151 smoothed_loss=0.0194 lr=1.34e-05 grad_norm=0.4695 step_time=0.6499s data_time=0.1722s it/s=1.244 eta_to_10000=3745.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.1019 grad_shared_expert=0.3330 (10775:train_pytorch.py:850) + Training: 53%|█████▎ | 5340/10000 [1:04:04<1:09:35, 1.12it/s, loss=0.0162, lr=1.34e-05, step=5339] Training: 53%|█████▎ | 5340/10000 [1:04:04<1:09:35, 1.12it/s, loss=0.0151, lr=1.34e-05, step=5340] Training: 53%|█████▎ | 5341/10000 [1:04:04<1:03:11, 1.23it/s, loss=0.0151, lr=1.34e-05, step=5340] Training: 53%|█████▎ | 5341/10000 [1:04:04<1:03:11, 1.23it/s, loss=0.0100, lr=1.34e-05, step=5341] Training: 53%|█████▎ | 5342/10000 [1:04:05<59:26, 1.31it/s, loss=0.0100, lr=1.34e-05, step=5341] Training: 53%|█████▎ | 5342/10000 [1:04:05<59:26, 1.31it/s, loss=0.0197, lr=1.34e-05, step=5342] Training: 53%|█████▎ | 5343/10000 [1:04:06<1:01:33, 1.26it/s, loss=0.0197, lr=1.34e-05, step=5342] Training: 53%|█████▎ | 5343/10000 [1:04:06<1:01:33, 1.26it/s, loss=0.0301, lr=1.34e-05, step=5343] Training: 53%|█████▎ | 5344/10000 [1:04:07<1:00:45, 1.28it/s, loss=0.0301, lr=1.34e-05, step=5343] Training: 53%|█████▎ | 5344/10000 [1:04:07<1:00:45, 1.28it/s, loss=0.0163, lr=1.34e-05, step=5344] Training: 53%|█████▎ | 5345/10000 [1:04:08<1:06:50, 1.16it/s, loss=0.0163, lr=1.34e-05, step=5344] Training: 53%|█████▎ | 5345/10000 [1:04:08<1:06:50, 1.16it/s, loss=0.0058, lr=1.34e-05, step=5345] Training: 53%|█████▎ | 5346/10000 [1:04:09<1:08:07, 1.14it/s, loss=0.0058, lr=1.34e-05, step=5345] Training: 53%|█████▎ | 5346/10000 [1:04:09<1:08:07, 1.14it/s, loss=0.0104, lr=1.34e-05, step=5346] Training: 53%|█████▎ | 5347/10000 [1:04:09<1:05:01, 1.19it/s, loss=0.0104, lr=1.34e-05, step=5346] Training: 53%|█████▎ | 5347/10000 [1:04:09<1:05:01, 1.19it/s, loss=0.0115, lr=1.34e-05, step=5347] Training: 53%|█████▎ | 5348/10000 [1:04:10<1:12:15, 1.07it/s, loss=0.0115, lr=1.34e-05, step=5347] Training: 53%|█████▎ | 5348/10000 [1:04:10<1:12:15, 1.07it/s, loss=0.0147, lr=1.34e-05, step=5348] Training: 53%|█████▎ | 5349/10000 [1:04:12<1:25:23, 1.10s/it, loss=0.0147, lr=1.34e-05, step=5348] Training: 53%|█████▎ | 5349/10000 [1:04:12<1:25:23, 1.10s/it, loss=0.0100, lr=1.34e-05, step=5349]17:10:19.733 [I] step=5350 loss=0.0048 smoothed_loss=0.0147 lr=1.34e-05 grad_norm=0.4926 step_time=0.6993s data_time=0.2154s it/s=1.094 eta_to_10000=4249.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.0968 grad_shared_expert=0.4269 (10775:train_pytorch.py:850) + Training: 54%|█████▎ | 5350/10000 [1:04:13<1:20:48, 1.04s/it, loss=0.0100, lr=1.34e-05, step=5349] Training: 54%|█████▎ | 5350/10000 [1:04:13<1:20:48, 1.04s/it, loss=0.0048, lr=1.34e-05, step=5350] Training: 54%|█████▎ | 5351/10000 [1:04:13<1:10:26, 1.10it/s, loss=0.0048, lr=1.34e-05, step=5350] Training: 54%|█████▎ | 5351/10000 [1:04:13<1:10:26, 1.10it/s, loss=0.0579, lr=1.34e-05, step=5351] Training: 54%|█████▎ | 5352/10000 [1:04:14<1:02:52, 1.23it/s, loss=0.0579, lr=1.34e-05, step=5351] Training: 54%|█████▎ | 5352/10000 [1:04:14<1:02:52, 1.23it/s, loss=0.0089, lr=1.34e-05, step=5352] Training: 54%|█████▎ | 5353/10000 [1:04:15<1:09:17, 1.12it/s, loss=0.0089, lr=1.34e-05, step=5352] Training: 54%|█████▎ | 5353/10000 [1:04:15<1:09:17, 1.12it/s, loss=0.0012, lr=1.34e-05, step=5353] Training: 54%|█████▎ | 5354/10000 [1:04:16<1:02:44, 1.23it/s, loss=0.0012, lr=1.34e-05, step=5353] Training: 54%|█████▎ | 5354/10000 [1:04:16<1:02:44, 1.23it/s, loss=0.0233, lr=1.34e-05, step=5354] Training: 54%|█████▎ | 5355/10000 [1:04:16<1:01:24, 1.26it/s, loss=0.0233, lr=1.34e-05, step=5354] Training: 54%|█████▎ | 5355/10000 [1:04:16<1:01:24, 1.26it/s, loss=0.0107, lr=1.34e-05, step=5355] Training: 54%|█████▎ | 5356/10000 [1:04:17<1:04:54, 1.19it/s, loss=0.0107, lr=1.34e-05, step=5355] Training: 54%|█████▎ | 5356/10000 [1:04:17<1:04:54, 1.19it/s, loss=0.0290, lr=1.34e-05, step=5356] Training: 54%|█████▎ | 5357/10000 [1:04:18<1:05:30, 1.18it/s, loss=0.0290, lr=1.34e-05, step=5356] Training: 54%|█████▎ | 5357/10000 [1:04:18<1:05:30, 1.18it/s, loss=0.0086, lr=1.34e-05, step=5357] Training: 54%|█████▎ | 5358/10000 [1:04:19<1:06:15, 1.17it/s, loss=0.0086, lr=1.34e-05, step=5357] Training: 54%|█████▎ | 5358/10000 [1:04:19<1:06:15, 1.17it/s, loss=0.0219, lr=1.34e-05, step=5358] Training: 54%|█████▎ | 5359/10000 [1:04:20<1:00:24, 1.28it/s, loss=0.0219, lr=1.34e-05, step=5358] Training: 54%|█████▎ | 5359/10000 [1:04:20<1:00:24, 1.28it/s, loss=0.0166, lr=1.33e-05, step=5359]17:10:27.410 [I] step=5360 loss=0.0039 smoothed_loss=0.0159 lr=1.34e-05 grad_norm=0.4148 step_time=0.6009s data_time=0.1668s it/s=1.305 eta_to_10000=3555.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1218 grad_shared_expert=0.3033 (10775:train_pytorch.py:850) + Training: 54%|█████▎ | 5360/10000 [1:04:20<59:19, 1.30it/s, loss=0.0166, lr=1.33e-05, step=5359] Training: 54%|█████▎ | 5360/10000 [1:04:20<59:19, 1.30it/s, loss=0.0039, lr=1.33e-05, step=5360] Training: 54%|█████▎ | 5361/10000 [1:04:21<59:50, 1.29it/s, loss=0.0039, lr=1.33e-05, step=5360] Training: 54%|█████▎ | 5361/10000 [1:04:21<59:50, 1.29it/s, loss=0.0093, lr=1.33e-05, step=5361] Training: 54%|█████▎ | 5362/10000 [1:04:22<58:12, 1.33it/s, loss=0.0093, lr=1.33e-05, step=5361] Training: 54%|█████▎ | 5362/10000 [1:04:22<58:12, 1.33it/s, loss=0.0320, lr=1.33e-05, step=5362] Training: 54%|█████▎ | 5363/10000 [1:04:23<58:27, 1.32it/s, loss=0.0320, lr=1.33e-05, step=5362] Training: 54%|█████▎ | 5363/10000 [1:04:23<58:27, 1.32it/s, loss=0.0265, lr=1.33e-05, step=5363] Training: 54%|█████▎ | 5364/10000 [1:04:23<54:14, 1.42it/s, loss=0.0265, lr=1.33e-05, step=5363] Training: 54%|█████▎ | 5364/10000 [1:04:23<54:14, 1.42it/s, loss=0.0058, lr=1.33e-05, step=5364] Training: 54%|█████▎ | 5365/10000 [1:04:24<51:16, 1.51it/s, loss=0.0058, lr=1.33e-05, step=5364] Training: 54%|█████▎ | 5365/10000 [1:04:24<51:16, 1.51it/s, loss=0.0088, lr=1.33e-05, step=5365] Training: 54%|█████▎ | 5366/10000 [1:04:25<53:32, 1.44it/s, loss=0.0088, lr=1.33e-05, step=5365] Training: 54%|█████▎ | 5366/10000 [1:04:25<53:32, 1.44it/s, loss=0.0051, lr=1.33e-05, step=5366] Training: 54%|█████▎ | 5367/10000 [1:04:25<50:36, 1.53it/s, loss=0.0051, lr=1.33e-05, step=5366] Training: 54%|█████▎ | 5367/10000 [1:04:25<50:36, 1.53it/s, loss=0.0027, lr=1.33e-05, step=5367] Training: 54%|█████▎ | 5368/10000 [1:04:26<54:16, 1.42it/s, loss=0.0027, lr=1.33e-05, step=5367] Training: 54%|█████▎ | 5368/10000 [1:04:26<54:16, 1.42it/s, loss=0.0033, lr=1.33e-05, step=5368] Training: 54%|█████▎ | 5369/10000 [1:04:27<51:59, 1.48it/s, loss=0.0033, lr=1.33e-05, step=5368] Training: 54%|█████▎ | 5369/10000 [1:04:27<51:59, 1.48it/s, loss=0.0017, lr=1.33e-05, step=5369]17:10:34.365 [I] step=5370 loss=0.0021 smoothed_loss=0.0105 lr=1.33e-05 grad_norm=0.3858 step_time=0.5901s data_time=0.1054s it/s=1.438 eta_to_10000=3219.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0057 grad_action_out_proj=0.0955 grad_shared_expert=0.3254 (10775:train_pytorch.py:850) + Training: 54%|█████▎ | 5370/10000 [1:04:27<54:50, 1.41it/s, loss=0.0017, lr=1.33e-05, step=5369] Training: 54%|█████▎ | 5370/10000 [1:04:27<54:50, 1.41it/s, loss=0.0021, lr=1.33e-05, step=5370] Training: 54%|█████▎ | 5371/10000 [1:04:28<50:55, 1.52it/s, loss=0.0021, lr=1.33e-05, step=5370] Training: 54%|█████▎ | 5371/10000 [1:04:28<50:55, 1.52it/s, loss=0.0032, lr=1.33e-05, step=5371] Training: 54%|█████▎ | 5372/10000 [1:04:28<47:27, 1.63it/s, loss=0.0032, lr=1.33e-05, step=5371] Training: 54%|█████▎ | 5372/10000 [1:04:28<47:27, 1.63it/s, loss=0.0066, lr=1.33e-05, step=5372] Training: 54%|█████▎ | 5373/10000 [1:04:29<53:29, 1.44it/s, loss=0.0066, lr=1.33e-05, step=5372] Training: 54%|█████▎ | 5373/10000 [1:04:29<53:29, 1.44it/s, loss=0.0776, lr=1.33e-05, step=5373] Training: 54%|█████▎ | 5374/10000 [1:04:30<55:01, 1.40it/s, loss=0.0776, lr=1.33e-05, step=5373] Training: 54%|█████▎ | 5374/10000 [1:04:30<55:01, 1.40it/s, loss=0.0060, lr=1.33e-05, step=5374] Training: 54%|█████▍ | 5375/10000 [1:04:31<55:51, 1.38it/s, loss=0.0060, lr=1.33e-05, step=5374] Training: 54%|█████▍ | 5375/10000 [1:04:31<55:51, 1.38it/s, loss=0.0256, lr=1.33e-05, step=5375] Training: 54%|█████▍ | 5376/10000 [1:04:32<54:59, 1.40it/s, loss=0.0256, lr=1.33e-05, step=5375] Training: 54%|█████▍ | 5376/10000 [1:04:32<54:59, 1.40it/s, loss=0.0095, lr=1.33e-05, step=5376] Training: 54%|█████▍ | 5377/10000 [1:04:32<51:52, 1.49it/s, loss=0.0095, lr=1.33e-05, step=5376] Training: 54%|█████▍ | 5377/10000 [1:04:32<51:52, 1.49it/s, loss=0.0182, lr=1.33e-05, step=5377] Training: 54%|█████▍ | 5378/10000 [1:04:33<54:51, 1.40it/s, loss=0.0182, lr=1.33e-05, step=5377] Training: 54%|█████▍ | 5378/10000 [1:04:33<54:51, 1.40it/s, loss=0.0042, lr=1.33e-05, step=5378] Training: 54%|█████▍ | 5379/10000 [1:04:34<51:35, 1.49it/s, loss=0.0042, lr=1.33e-05, step=5378] Training: 54%|█████▍ | 5379/10000 [1:04:34<51:35, 1.49it/s, loss=0.0328, lr=1.33e-05, step=5379]17:10:41.251 [I] step=5380 loss=0.0122 smoothed_loss=0.0161 lr=1.33e-05 grad_norm=0.4974 step_time=0.5704s data_time=0.1182s it/s=1.452 eta_to_10000=3181.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0249 grad_action_out_proj=0.1835 grad_shared_expert=0.4782 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5380/10000 [1:04:34<54:46, 1.41it/s, loss=0.0328, lr=1.33e-05, step=5379] Training: 54%|█████▍ | 5380/10000 [1:04:34<54:46, 1.41it/s, loss=0.0122, lr=1.33e-05, step=5380] Training: 54%|█████▍ | 5381/10000 [1:04:35<51:29, 1.50it/s, loss=0.0122, lr=1.33e-05, step=5380] Training: 54%|█████▍ | 5381/10000 [1:04:35<51:29, 1.50it/s, loss=0.1203, lr=1.33e-05, step=5381] Training: 54%|█████▍ | 5382/10000 [1:04:36<54:56, 1.40it/s, loss=0.1203, lr=1.33e-05, step=5381] Training: 54%|█████▍ | 5382/10000 [1:04:36<54:56, 1.40it/s, loss=0.0042, lr=1.33e-05, step=5382] Training: 54%|█████▍ | 5383/10000 [1:04:36<52:09, 1.48it/s, loss=0.0042, lr=1.33e-05, step=5382] Training: 54%|█████▍ | 5383/10000 [1:04:36<52:09, 1.48it/s, loss=0.0089, lr=1.33e-05, step=5383] Training: 54%|█████▍ | 5384/10000 [1:04:37<54:11, 1.42it/s, loss=0.0089, lr=1.33e-05, step=5383] Training: 54%|█████▍ | 5384/10000 [1:04:37<54:11, 1.42it/s, loss=0.0163, lr=1.33e-05, step=5384] Training: 54%|█████▍ | 5385/10000 [1:04:38<56:49, 1.35it/s, loss=0.0163, lr=1.33e-05, step=5384] Training: 54%|█████▍ | 5385/10000 [1:04:38<56:49, 1.35it/s, loss=0.0115, lr=1.33e-05, step=5385] Training: 54%|█████▍ | 5386/10000 [1:04:38<52:19, 1.47it/s, loss=0.0115, lr=1.33e-05, step=5385] Training: 54%|█████▍ | 5386/10000 [1:04:38<52:19, 1.47it/s, loss=0.0091, lr=1.32e-05, step=5386] Training: 54%|█████▍ | 5387/10000 [1:04:39<55:00, 1.40it/s, loss=0.0091, lr=1.32e-05, step=5386] Training: 54%|█████▍ | 5387/10000 [1:04:39<55:00, 1.40it/s, loss=0.0023, lr=1.32e-05, step=5387] Training: 54%|█████▍ | 5388/10000 [1:04:40<51:03, 1.51it/s, loss=0.0023, lr=1.32e-05, step=5387] Training: 54%|█████▍ | 5388/10000 [1:04:40<51:03, 1.51it/s, loss=0.0168, lr=1.32e-05, step=5388] Training: 54%|█████▍ | 5389/10000 [1:04:41<54:34, 1.41it/s, loss=0.0168, lr=1.32e-05, step=5388] Training: 54%|█████▍ | 5389/10000 [1:04:41<54:34, 1.41it/s, loss=0.0031, lr=1.32e-05, step=5389]17:10:48.316 [I] step=5390 loss=0.0058 smoothed_loss=0.0154 lr=1.32e-05 grad_norm=0.4186 step_time=0.5884s data_time=0.1181s it/s=1.416 eta_to_10000=3255.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0922 grad_shared_expert=0.4013 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5390/10000 [1:04:41<56:25, 1.36it/s, loss=0.0031, lr=1.32e-05, step=5389] Training: 54%|█████▍ | 5390/10000 [1:04:41<56:25, 1.36it/s, loss=0.0058, lr=1.32e-05, step=5390] Training: 54%|█████▍ | 5391/10000 [1:04:42<51:51, 1.48it/s, loss=0.0058, lr=1.32e-05, step=5390] Training: 54%|█████▍ | 5391/10000 [1:04:42<51:51, 1.48it/s, loss=0.0019, lr=1.32e-05, step=5391] Training: 54%|█████▍ | 5392/10000 [1:04:43<49:46, 1.54it/s, loss=0.0019, lr=1.32e-05, step=5391] Training: 54%|█████▍ | 5392/10000 [1:04:43<49:46, 1.54it/s, loss=0.0236, lr=1.32e-05, step=5392] Training: 54%|█████▍ | 5393/10000 [1:04:43<51:20, 1.50it/s, loss=0.0236, lr=1.32e-05, step=5392] Training: 54%|█████▍ | 5393/10000 [1:04:43<51:20, 1.50it/s, loss=0.0144, lr=1.32e-05, step=5393] Training: 54%|█████▍ | 5394/10000 [1:04:44<53:30, 1.43it/s, loss=0.0144, lr=1.32e-05, step=5393] Training: 54%|█████▍ | 5394/10000 [1:04:44<53:30, 1.43it/s, loss=0.0047, lr=1.32e-05, step=5394] Training: 54%|█████▍ | 5395/10000 [1:04:45<53:12, 1.44it/s, loss=0.0047, lr=1.32e-05, step=5394] Training: 54%|█████▍ | 5395/10000 [1:04:45<53:12, 1.44it/s, loss=0.0214, lr=1.32e-05, step=5395] Training: 54%|█████▍ | 5396/10000 [1:04:45<55:29, 1.38it/s, loss=0.0214, lr=1.32e-05, step=5395] Training: 54%|█████▍ | 5396/10000 [1:04:45<55:29, 1.38it/s, loss=0.0026, lr=1.32e-05, step=5396] Training: 54%|█████▍ | 5397/10000 [1:04:46<52:34, 1.46it/s, loss=0.0026, lr=1.32e-05, step=5396] Training: 54%|█████▍ | 5397/10000 [1:04:46<52:34, 1.46it/s, loss=0.0076, lr=1.32e-05, step=5397] Training: 54%|█████▍ | 5398/10000 [1:04:47<52:45, 1.45it/s, loss=0.0076, lr=1.32e-05, step=5397] Training: 54%|█████▍ | 5398/10000 [1:04:47<52:45, 1.45it/s, loss=0.0082, lr=1.32e-05, step=5398] Training: 54%|█████▍ | 5399/10000 [1:04:47<53:09, 1.44it/s, loss=0.0082, lr=1.32e-05, step=5398] Training: 54%|█████▍ | 5399/10000 [1:04:47<53:09, 1.44it/s, loss=0.0195, lr=1.32e-05, step=5399]17:10:54.978 [I] step=5400 loss=0.0120 smoothed_loss=0.0130 lr=1.32e-05 grad_norm=0.4956 step_time=0.5493s data_time=0.1169s it/s=1.501 eta_to_10000=3063.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0093 grad_action_out_proj=0.1023 grad_shared_expert=0.3381 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5400/10000 [1:04:48<50:40, 1.51it/s, loss=0.0195, lr=1.32e-05, step=5399] Training: 54%|█████▍ | 5400/10000 [1:04:48<50:40, 1.51it/s, loss=0.0120, lr=1.32e-05, step=5400] Training: 54%|█████▍ | 5401/10000 [1:04:49<59:39, 1.28it/s, loss=0.0120, lr=1.32e-05, step=5400] Training: 54%|█████▍ | 5401/10000 [1:04:49<59:39, 1.28it/s, loss=0.0263, lr=1.32e-05, step=5401] Training: 54%|█████▍ | 5402/10000 [1:04:50<1:03:35, 1.21it/s, loss=0.0263, lr=1.32e-05, step=5401] Training: 54%|█████▍ | 5402/10000 [1:04:50<1:03:35, 1.21it/s, loss=0.0022, lr=1.32e-05, step=5402] Training: 54%|█████▍ | 5403/10000 [1:04:51<58:39, 1.31it/s, loss=0.0022, lr=1.32e-05, step=5402] Training: 54%|█████▍ | 5403/10000 [1:04:51<58:39, 1.31it/s, loss=0.0058, lr=1.32e-05, step=5403] Training: 54%|█████▍ | 5404/10000 [1:04:52<1:00:26, 1.27it/s, loss=0.0058, lr=1.32e-05, step=5403] Training: 54%|█████▍ | 5404/10000 [1:04:52<1:00:26, 1.27it/s, loss=0.0060, lr=1.32e-05, step=5404] Training: 54%|█████▍ | 5405/10000 [1:04:52<1:02:38, 1.22it/s, loss=0.0060, lr=1.32e-05, step=5404] Training: 54%|█████▍ | 5405/10000 [1:04:52<1:02:38, 1.22it/s, loss=0.0085, lr=1.32e-05, step=5405] Training: 54%|█████▍ | 5406/10000 [1:04:53<1:00:24, 1.27it/s, loss=0.0085, lr=1.32e-05, step=5405] Training: 54%|█████▍ | 5406/10000 [1:04:53<1:00:24, 1.27it/s, loss=0.0203, lr=1.32e-05, step=5406] Training: 54%|█████▍ | 5407/10000 [1:04:54<55:37, 1.38it/s, loss=0.0203, lr=1.32e-05, step=5406] Training: 54%|█████▍ | 5407/10000 [1:04:54<55:37, 1.38it/s, loss=0.0093, lr=1.32e-05, step=5407] Training: 54%|█████▍ | 5408/10000 [1:04:54<56:41, 1.35it/s, loss=0.0093, lr=1.32e-05, step=5407] Training: 54%|█████▍ | 5408/10000 [1:04:54<56:41, 1.35it/s, loss=0.0048, lr=1.32e-05, step=5408] Training: 54%|█████▍ | 5409/10000 [1:04:55<56:36, 1.35it/s, loss=0.0048, lr=1.32e-05, step=5408] Training: 54%|█████▍ | 5409/10000 [1:04:55<56:36, 1.35it/s, loss=0.0074, lr=1.32e-05, step=5409]17:11:02.859 [I] step=5410 loss=0.0078 smoothed_loss=0.0106 lr=1.32e-05 grad_norm=0.4391 step_time=0.6278s data_time=0.1604s it/s=1.269 eta_to_10000=3617.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0033 grad_action_out_proj=0.0518 grad_shared_expert=0.1927 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5410/10000 [1:04:56<56:08, 1.36it/s, loss=0.0074, lr=1.32e-05, step=5409] Training: 54%|█████▍ | 5410/10000 [1:04:56<56:08, 1.36it/s, loss=0.0078, lr=1.32e-05, step=5410] Training: 54%|█████▍ | 5411/10000 [1:04:57<56:36, 1.35it/s, loss=0.0078, lr=1.32e-05, step=5410] Training: 54%|█████▍ | 5411/10000 [1:04:57<56:36, 1.35it/s, loss=0.0091, lr=1.32e-05, step=5411] Training: 54%|█████▍ | 5412/10000 [1:04:57<52:42, 1.45it/s, loss=0.0091, lr=1.32e-05, step=5411] Training: 54%|█████▍ | 5412/10000 [1:04:57<52:42, 1.45it/s, loss=0.0112, lr=1.32e-05, step=5412] Training: 54%|█████▍ | 5413/10000 [1:04:58<49:25, 1.55it/s, loss=0.0112, lr=1.32e-05, step=5412] Training: 54%|█████▍ | 5413/10000 [1:04:58<49:25, 1.55it/s, loss=0.0025, lr=1.31e-05, step=5413] Training: 54%|█████▍ | 5414/10000 [1:04:58<47:00, 1.63it/s, loss=0.0025, lr=1.31e-05, step=5413] Training: 54%|█████▍ | 5414/10000 [1:04:58<47:00, 1.63it/s, loss=0.0098, lr=1.31e-05, step=5414] Training: 54%|█████▍ | 5415/10000 [1:04:59<56:11, 1.36it/s, loss=0.0098, lr=1.31e-05, step=5414] Training: 54%|█████▍ | 5415/10000 [1:04:59<56:11, 1.36it/s, loss=0.0133, lr=1.31e-05, step=5415] Training: 54%|█████▍ | 5416/10000 [1:05:00<53:43, 1.42it/s, loss=0.0133, lr=1.31e-05, step=5415] Training: 54%|█████▍ | 5416/10000 [1:05:00<53:43, 1.42it/s, loss=0.0103, lr=1.31e-05, step=5416] Training: 54%|█████▍ | 5417/10000 [1:05:01<54:44, 1.40it/s, loss=0.0103, lr=1.31e-05, step=5416] Training: 54%|█████▍ | 5417/10000 [1:05:01<54:44, 1.40it/s, loss=0.0095, lr=1.31e-05, step=5417] Training: 54%|█████▍ | 5418/10000 [1:05:02<56:45, 1.35it/s, loss=0.0095, lr=1.31e-05, step=5417] Training: 54%|█████▍ | 5418/10000 [1:05:02<56:45, 1.35it/s, loss=0.0239, lr=1.31e-05, step=5418] Training: 54%|█████▍ | 5419/10000 [1:05:02<53:47, 1.42it/s, loss=0.0239, lr=1.31e-05, step=5418] Training: 54%|█████▍ | 5419/10000 [1:05:02<53:47, 1.42it/s, loss=0.0115, lr=1.31e-05, step=5419]17:11:09.695 [I] step=5420 loss=0.0120 smoothed_loss=0.0115 lr=1.31e-05 grad_norm=0.4032 step_time=0.5833s data_time=0.1003s it/s=1.463 eta_to_10000=3130.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0065 grad_action_out_proj=0.0969 grad_shared_expert=0.3918 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5420/10000 [1:05:03<51:36, 1.48it/s, loss=0.0115, lr=1.31e-05, step=5419] Training: 54%|█████▍ | 5420/10000 [1:05:03<51:36, 1.48it/s, loss=0.0120, lr=1.31e-05, step=5420] Training: 54%|█████▍ | 5421/10000 [1:05:03<48:52, 1.56it/s, loss=0.0120, lr=1.31e-05, step=5420] Training: 54%|█████▍ | 5421/10000 [1:05:03<48:52, 1.56it/s, loss=0.0091, lr=1.31e-05, step=5421] Training: 54%|█████▍ | 5422/10000 [1:05:04<57:37, 1.32it/s, loss=0.0091, lr=1.31e-05, step=5421] Training: 54%|█████▍ | 5422/10000 [1:05:04<57:37, 1.32it/s, loss=0.0024, lr=1.31e-05, step=5422] Training: 54%|█████▍ | 5423/10000 [1:05:05<55:08, 1.38it/s, loss=0.0024, lr=1.31e-05, step=5422] Training: 54%|█████▍ | 5423/10000 [1:05:05<55:08, 1.38it/s, loss=0.0062, lr=1.31e-05, step=5423] Training: 54%|█████▍ | 5424/10000 [1:05:06<52:35, 1.45it/s, loss=0.0062, lr=1.31e-05, step=5423] Training: 54%|█████▍ | 5424/10000 [1:05:06<52:35, 1.45it/s, loss=0.0228, lr=1.31e-05, step=5424] Training: 54%|█████▍ | 5425/10000 [1:05:06<49:37, 1.54it/s, loss=0.0228, lr=1.31e-05, step=5424] Training: 54%|█████▍ | 5425/10000 [1:05:06<49:37, 1.54it/s, loss=0.0041, lr=1.31e-05, step=5425] Training: 54%|█████▍ | 5426/10000 [1:05:07<53:30, 1.42it/s, loss=0.0041, lr=1.31e-05, step=5425] Training: 54%|█████▍ | 5426/10000 [1:05:07<53:30, 1.42it/s, loss=0.0339, lr=1.31e-05, step=5426] Training: 54%|█████▍ | 5427/10000 [1:05:08<52:22, 1.46it/s, loss=0.0339, lr=1.31e-05, step=5426] Training: 54%|█████▍ | 5427/10000 [1:05:08<52:22, 1.46it/s, loss=0.0037, lr=1.31e-05, step=5427] Training: 54%|█████▍ | 5428/10000 [1:05:08<50:52, 1.50it/s, loss=0.0037, lr=1.31e-05, step=5427] Training: 54%|█████▍ | 5428/10000 [1:05:08<50:52, 1.50it/s, loss=0.0096, lr=1.31e-05, step=5428] Training: 54%|█████▍ | 5429/10000 [1:05:09<47:48, 1.59it/s, loss=0.0096, lr=1.31e-05, step=5428] Training: 54%|█████▍ | 5429/10000 [1:05:09<47:48, 1.59it/s, loss=0.0018, lr=1.31e-05, step=5429]17:11:16.535 [I] step=5430 loss=0.0193 smoothed_loss=0.0116 lr=1.31e-05 grad_norm=0.4870 step_time=0.5692s data_time=0.1147s it/s=1.462 eta_to_10000=3125.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0180 grad_action_out_proj=0.1441 grad_shared_expert=0.4606 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5430/10000 [1:05:10<51:57, 1.47it/s, loss=0.0018, lr=1.31e-05, step=5429] Training: 54%|█████▍ | 5430/10000 [1:05:10<51:57, 1.47it/s, loss=0.0193, lr=1.31e-05, step=5430] Training: 54%|█████▍ | 5431/10000 [1:05:10<48:19, 1.58it/s, loss=0.0193, lr=1.31e-05, step=5430] Training: 54%|█████▍ | 5431/10000 [1:05:10<48:19, 1.58it/s, loss=0.0181, lr=1.31e-05, step=5431] Training: 54%|█████▍ | 5432/10000 [1:05:11<48:25, 1.57it/s, loss=0.0181, lr=1.31e-05, step=5431] Training: 54%|█████▍ | 5432/10000 [1:05:11<48:25, 1.57it/s, loss=0.0066, lr=1.31e-05, step=5432] Training: 54%|█████▍ | 5433/10000 [1:05:12<52:05, 1.46it/s, loss=0.0066, lr=1.31e-05, step=5432] Training: 54%|█████▍ | 5433/10000 [1:05:12<52:05, 1.46it/s, loss=0.0398, lr=1.31e-05, step=5433] Training: 54%|█████▍ | 5434/10000 [1:05:12<49:03, 1.55it/s, loss=0.0398, lr=1.31e-05, step=5433] Training: 54%|█████▍ | 5434/10000 [1:05:12<49:03, 1.55it/s, loss=0.0308, lr=1.31e-05, step=5434] Training: 54%|█████▍ | 5435/10000 [1:05:13<46:53, 1.62it/s, loss=0.0308, lr=1.31e-05, step=5434] Training: 54%|█████▍ | 5435/10000 [1:05:13<46:53, 1.62it/s, loss=0.0016, lr=1.31e-05, step=5435] Training: 54%|█████▍ | 5436/10000 [1:05:13<49:48, 1.53it/s, loss=0.0016, lr=1.31e-05, step=5435] Training: 54%|█████▍ | 5436/10000 [1:05:13<49:48, 1.53it/s, loss=0.0060, lr=1.31e-05, step=5436] Training: 54%|█████▍ | 5437/10000 [1:05:14<52:39, 1.44it/s, loss=0.0060, lr=1.31e-05, step=5436] Training: 54%|█████▍ | 5437/10000 [1:05:14<52:39, 1.44it/s, loss=0.0063, lr=1.31e-05, step=5437] Training: 54%|█████▍ | 5438/10000 [1:05:15<50:00, 1.52it/s, loss=0.0063, lr=1.31e-05, step=5437] Training: 54%|█████▍ | 5438/10000 [1:05:15<50:00, 1.52it/s, loss=0.0094, lr=1.31e-05, step=5438] Training: 54%|█████▍ | 5439/10000 [1:05:16<55:16, 1.38it/s, loss=0.0094, lr=1.31e-05, step=5438] Training: 54%|█████▍ | 5439/10000 [1:05:16<55:16, 1.38it/s, loss=0.0251, lr=1.31e-05, step=5439]17:11:23.232 [I] step=5440 loss=0.0037 smoothed_loss=0.0129 lr=1.31e-05 grad_norm=0.4771 step_time=0.5700s data_time=0.0998s it/s=1.493 eta_to_10000=3054.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.0890 grad_shared_expert=0.3745 (10775:train_pytorch.py:850) + Training: 54%|█████▍ | 5440/10000 [1:05:16<53:23, 1.42it/s, loss=0.0251, lr=1.31e-05, step=5439] Training: 54%|█████▍ | 5440/10000 [1:05:16<53:23, 1.42it/s, loss=0.0037, lr=1.30e-05, step=5440] Training: 54%|█████▍ | 5441/10000 [1:05:17<52:52, 1.44it/s, loss=0.0037, lr=1.30e-05, step=5440] Training: 54%|█████▍ | 5441/10000 [1:05:17<52:52, 1.44it/s, loss=0.0013, lr=1.30e-05, step=5441] Training: 54%|█████▍ | 5442/10000 [1:05:18<50:57, 1.49it/s, loss=0.0013, lr=1.30e-05, step=5441] Training: 54%|█████▍ | 5442/10000 [1:05:18<50:57, 1.49it/s, loss=0.0156, lr=1.30e-05, step=5442] Training: 54%|█████▍ | 5443/10000 [1:05:18<50:01, 1.52it/s, loss=0.0156, lr=1.30e-05, step=5442] Training: 54%|█████▍ | 5443/10000 [1:05:18<50:01, 1.52it/s, loss=0.0032, lr=1.30e-05, step=5443] Training: 54%|█████▍ | 5444/10000 [1:05:19<48:39, 1.56it/s, loss=0.0032, lr=1.30e-05, step=5443] Training: 54%|█████▍ | 5444/10000 [1:05:19<48:39, 1.56it/s, loss=0.0047, lr=1.30e-05, step=5444] Training: 54%|█████▍ | 5445/10000 [1:05:20<51:54, 1.46it/s, loss=0.0047, lr=1.30e-05, step=5444] Training: 54%|█████▍ | 5445/10000 [1:05:20<51:54, 1.46it/s, loss=0.0082, lr=1.30e-05, step=5445] Training: 54%|█████▍ | 5446/10000 [1:05:20<51:34, 1.47it/s, loss=0.0082, lr=1.30e-05, step=5445] Training: 54%|█████▍ | 5446/10000 [1:05:20<51:34, 1.47it/s, loss=0.0228, lr=1.30e-05, step=5446] Training: 54%|█████▍ | 5447/10000 [1:05:21<57:22, 1.32it/s, loss=0.0228, lr=1.30e-05, step=5446] Training: 54%|█████▍ | 5447/10000 [1:05:21<57:22, 1.32it/s, loss=0.0060, lr=1.30e-05, step=5447] Training: 54%|█████▍ | 5448/10000 [1:05:22<54:45, 1.39it/s, loss=0.0060, lr=1.30e-05, step=5447] Training: 54%|█████▍ | 5448/10000 [1:05:22<54:45, 1.39it/s, loss=0.0131, lr=1.30e-05, step=5448] Training: 54%|█████▍ | 5449/10000 [1:05:22<50:35, 1.50it/s, loss=0.0131, lr=1.30e-05, step=5448] Training: 54%|█████▍ | 5449/10000 [1:05:22<50:35, 1.50it/s, loss=0.0065, lr=1.30e-05, step=5449]17:11:29.977 [I] step=5450 loss=0.0048 smoothed_loss=0.0102 lr=1.30e-05 grad_norm=0.5300 step_time=0.5850s data_time=0.0895s it/s=1.483 eta_to_10000=3068.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.1533 grad_shared_expert=0.5846 (10775:train_pytorch.py:850) + Training: 55%|█████▍ | 5450/10000 [1:05:23<50:15, 1.51it/s, loss=0.0065, lr=1.30e-05, step=5449] Training: 55%|█████▍ | 5450/10000 [1:05:23<50:15, 1.51it/s, loss=0.0048, lr=1.30e-05, step=5450] Training: 55%|█████▍ | 5451/10000 [1:05:24<51:33, 1.47it/s, loss=0.0048, lr=1.30e-05, step=5450] Training: 55%|█████▍ | 5451/10000 [1:05:24<51:33, 1.47it/s, loss=0.0180, lr=1.30e-05, step=5451] Training: 55%|█████▍ | 5452/10000 [1:05:25<54:08, 1.40it/s, loss=0.0180, lr=1.30e-05, step=5451] Training: 55%|█████▍ | 5452/10000 [1:05:25<54:08, 1.40it/s, loss=0.0131, lr=1.30e-05, step=5452] Training: 55%|█████▍ | 5453/10000 [1:05:25<54:26, 1.39it/s, loss=0.0131, lr=1.30e-05, step=5452] Training: 55%|█████▍ | 5453/10000 [1:05:25<54:26, 1.39it/s, loss=0.0033, lr=1.30e-05, step=5453] Training: 55%|█████▍ | 5454/10000 [1:05:26<1:02:09, 1.22it/s, loss=0.0033, lr=1.30e-05, step=5453] Training: 55%|█████▍ | 5454/10000 [1:05:26<1:02:09, 1.22it/s, loss=0.0047, lr=1.30e-05, step=5454] Training: 55%|█████▍ | 5455/10000 [1:05:27<55:43, 1.36it/s, loss=0.0047, lr=1.30e-05, step=5454] Training: 55%|█████▍ | 5455/10000 [1:05:27<55:43, 1.36it/s, loss=0.0045, lr=1.30e-05, step=5455] Training: 55%|█████▍ | 5456/10000 [1:05:27<52:29, 1.44it/s, loss=0.0045, lr=1.30e-05, step=5455] Training: 55%|█████▍ | 5456/10000 [1:05:27<52:29, 1.44it/s, loss=0.0545, lr=1.30e-05, step=5456] Training: 55%|█████▍ | 5457/10000 [1:05:28<50:45, 1.49it/s, loss=0.0545, lr=1.30e-05, step=5456] Training: 55%|█████▍ | 5457/10000 [1:05:28<50:45, 1.49it/s, loss=0.0029, lr=1.30e-05, step=5457] Training: 55%|█████▍ | 5458/10000 [1:05:29<50:06, 1.51it/s, loss=0.0029, lr=1.30e-05, step=5457] Training: 55%|█████▍ | 5458/10000 [1:05:29<50:06, 1.51it/s, loss=0.0065, lr=1.30e-05, step=5458] Training: 55%|█████▍ | 5459/10000 [1:05:30<54:28, 1.39it/s, loss=0.0065, lr=1.30e-05, step=5458] Training: 55%|█████▍ | 5459/10000 [1:05:30<54:28, 1.39it/s, loss=0.0378, lr=1.30e-05, step=5459]17:11:37.360 [I] step=5460 loss=0.0022 smoothed_loss=0.0134 lr=1.30e-05 grad_norm=0.4434 step_time=0.6233s data_time=0.1151s it/s=1.354 eta_to_10000=3351.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0173 grad_action_out_proj=0.1675 grad_shared_expert=0.4123 (10775:train_pytorch.py:850) + Training: 55%|█████▍ | 5460/10000 [1:05:30<57:08, 1.32it/s, loss=0.0378, lr=1.30e-05, step=5459] Training: 55%|█████▍ | 5460/10000 [1:05:30<57:08, 1.32it/s, loss=0.0022, lr=1.30e-05, step=5460] Training: 55%|█████▍ | 5461/10000 [1:05:31<1:00:31, 1.25it/s, loss=0.0022, lr=1.30e-05, step=5460] Training: 55%|█████▍ | 5461/10000 [1:05:31<1:00:31, 1.25it/s, loss=0.0135, lr=1.30e-05, step=5461] Training: 55%|█████▍ | 5462/10000 [1:05:32<1:02:44, 1.21it/s, loss=0.0135, lr=1.30e-05, step=5461] Training: 55%|█████▍ | 5462/10000 [1:05:32<1:02:44, 1.21it/s, loss=0.0119, lr=1.30e-05, step=5462] Training: 55%|█████▍ | 5463/10000 [1:05:33<1:00:41, 1.25it/s, loss=0.0119, lr=1.30e-05, step=5462] Training: 55%|█████▍ | 5463/10000 [1:05:33<1:00:41, 1.25it/s, loss=0.0099, lr=1.30e-05, step=5463] Training: 55%|█████▍ | 5464/10000 [1:05:34<1:00:16, 1.25it/s, loss=0.0099, lr=1.30e-05, step=5463] Training: 55%|█████▍ | 5464/10000 [1:05:34<1:00:16, 1.25it/s, loss=0.0081, lr=1.30e-05, step=5464] Training: 55%|█████▍ | 5465/10000 [1:05:34<57:26, 1.32it/s, loss=0.0081, lr=1.30e-05, step=5464] Training: 55%|█████▍ | 5465/10000 [1:05:34<57:26, 1.32it/s, loss=0.0307, lr=1.30e-05, step=5465] Training: 55%|█████▍ | 5466/10000 [1:05:35<1:01:21, 1.23it/s, loss=0.0307, lr=1.30e-05, step=5465] Training: 55%|█████▍ | 5466/10000 [1:05:35<1:01:21, 1.23it/s, loss=0.0121, lr=1.30e-05, step=5466] Training: 55%|█████▍ | 5467/10000 [1:05:36<55:32, 1.36it/s, loss=0.0121, lr=1.30e-05, step=5466] Training: 55%|█████▍ | 5467/10000 [1:05:36<55:32, 1.36it/s, loss=0.0103, lr=1.29e-05, step=5467] Training: 55%|█████▍ | 5468/10000 [1:05:37<57:59, 1.30it/s, loss=0.0103, lr=1.29e-05, step=5467] Training: 55%|█████▍ | 5468/10000 [1:05:37<57:59, 1.30it/s, loss=0.0219, lr=1.29e-05, step=5468] Training: 55%|█████▍ | 5469/10000 [1:05:38<1:00:15, 1.25it/s, loss=0.0219, lr=1.29e-05, step=5468] Training: 55%|█████▍ | 5469/10000 [1:05:38<1:00:15, 1.25it/s, loss=0.0016, lr=1.29e-05, step=5469]17:11:45.144 [I] step=5470 loss=0.0200 smoothed_loss=0.0139 lr=1.30e-05 grad_norm=0.5195 step_time=0.6288s data_time=0.1495s it/s=1.285 eta_to_10000=3525.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0443 grad_action_out_proj=0.2602 grad_shared_expert=0.6829 (10775:train_pytorch.py:850) + Training: 55%|█████▍ | 5470/10000 [1:05:38<55:18, 1.36it/s, loss=0.0016, lr=1.29e-05, step=5469] Training: 55%|█████▍ | 5470/10000 [1:05:38<55:18, 1.36it/s, loss=0.0200, lr=1.29e-05, step=5470] Training: 55%|█████▍ | 5471/10000 [1:05:39<51:51, 1.46it/s, loss=0.0200, lr=1.29e-05, step=5470] Training: 55%|█████▍ | 5471/10000 [1:05:39<51:51, 1.46it/s, loss=0.0066, lr=1.29e-05, step=5471] Training: 55%|█████▍ | 5472/10000 [1:05:39<49:31, 1.52it/s, loss=0.0066, lr=1.29e-05, step=5471] Training: 55%|█████▍ | 5472/10000 [1:05:39<49:31, 1.52it/s, loss=0.0079, lr=1.29e-05, step=5472] Training: 55%|█████▍ | 5473/10000 [1:05:40<52:20, 1.44it/s, loss=0.0079, lr=1.29e-05, step=5472] Training: 55%|█████▍ | 5473/10000 [1:05:40<52:20, 1.44it/s, loss=0.0162, lr=1.29e-05, step=5473] Training: 55%|█████▍ | 5474/10000 [1:05:41<52:31, 1.44it/s, loss=0.0162, lr=1.29e-05, step=5473] Training: 55%|█████▍ | 5474/10000 [1:05:41<52:31, 1.44it/s, loss=0.0449, lr=1.29e-05, step=5474] Training: 55%|█████▍ | 5475/10000 [1:05:42<1:03:11, 1.19it/s, loss=0.0449, lr=1.29e-05, step=5474] Training: 55%|█████▍ | 5475/10000 [1:05:42<1:03:11, 1.19it/s, loss=0.0129, lr=1.29e-05, step=5475] Training: 55%|█████▍ | 5476/10000 [1:05:43<1:02:53, 1.20it/s, loss=0.0129, lr=1.29e-05, step=5475] Training: 55%|█████▍ | 5476/10000 [1:05:43<1:02:53, 1.20it/s, loss=0.0146, lr=1.29e-05, step=5476] Training: 55%|█████▍ | 5477/10000 [1:05:44<1:02:33, 1.21it/s, loss=0.0146, lr=1.29e-05, step=5476] Training: 55%|█████▍ | 5477/10000 [1:05:44<1:02:33, 1.21it/s, loss=0.0177, lr=1.29e-05, step=5477] Training: 55%|█████▍ | 5478/10000 [1:05:44<1:01:29, 1.23it/s, loss=0.0177, lr=1.29e-05, step=5477] Training: 55%|█████▍ | 5478/10000 [1:05:44<1:01:29, 1.23it/s, loss=0.0017, lr=1.29e-05, step=5478] Training: 55%|█████▍ | 5479/10000 [1:05:45<1:01:16, 1.23it/s, loss=0.0017, lr=1.29e-05, step=5478] Training: 55%|█████▍ | 5479/10000 [1:05:45<1:01:16, 1.23it/s, loss=0.0395, lr=1.29e-05, step=5479]17:11:52.943 [I] step=5480 loss=0.0055 smoothed_loss=0.0159 lr=1.29e-05 grad_norm=0.5021 step_time=0.6263s data_time=0.1536s it/s=1.282 eta_to_10000=3524.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1062 grad_shared_expert=0.4616 (10775:train_pytorch.py:850) + Training: 55%|█████▍ | 5480/10000 [1:05:46<59:45, 1.26it/s, loss=0.0395, lr=1.29e-05, step=5479] Training: 55%|█████▍ | 5480/10000 [1:05:46<59:45, 1.26it/s, loss=0.0055, lr=1.29e-05, step=5480] Training: 55%|█████▍ | 5481/10000 [1:05:47<58:30, 1.29it/s, loss=0.0055, lr=1.29e-05, step=5480] Training: 55%|█████▍ | 5481/10000 [1:05:47<58:30, 1.29it/s, loss=0.0062, lr=1.29e-05, step=5481] Training: 55%|█████▍ | 5482/10000 [1:05:48<1:04:01, 1.18it/s, loss=0.0062, lr=1.29e-05, step=5481] Training: 55%|█████▍ | 5482/10000 [1:05:48<1:04:01, 1.18it/s, loss=0.0115, lr=1.29e-05, step=5482] Training: 55%|█████▍ | 5483/10000 [1:05:49<1:02:50, 1.20it/s, loss=0.0115, lr=1.29e-05, step=5482] Training: 55%|█████▍ | 5483/10000 [1:05:49<1:02:50, 1.20it/s, loss=0.0024, lr=1.29e-05, step=5483] Training: 55%|█████▍ | 5484/10000 [1:05:49<59:00, 1.28it/s, loss=0.0024, lr=1.29e-05, step=5483] Training: 55%|█████▍ | 5484/10000 [1:05:49<59:00, 1.28it/s, loss=0.0109, lr=1.29e-05, step=5484] Training: 55%|█████▍ | 5485/10000 [1:05:50<57:38, 1.31it/s, loss=0.0109, lr=1.29e-05, step=5484] Training: 55%|█████▍ | 5485/10000 [1:05:50<57:38, 1.31it/s, loss=0.0185, lr=1.29e-05, step=5485] Training: 55%|█████▍ | 5486/10000 [1:05:51<1:05:15, 1.15it/s, loss=0.0185, lr=1.29e-05, step=5485] Training: 55%|█████▍ | 5486/10000 [1:05:51<1:05:15, 1.15it/s, loss=0.0160, lr=1.29e-05, step=5486] Training: 55%|█████▍ | 5487/10000 [1:05:52<1:08:32, 1.10it/s, loss=0.0160, lr=1.29e-05, step=5486] Training: 55%|█████▍ | 5487/10000 [1:05:52<1:08:32, 1.10it/s, loss=0.0215, lr=1.29e-05, step=5487] Training: 55%|█████▍ | 5488/10000 [1:05:53<1:01:44, 1.22it/s, loss=0.0215, lr=1.29e-05, step=5487] Training: 55%|█████▍ | 5488/10000 [1:05:53<1:01:44, 1.22it/s, loss=0.0254, lr=1.29e-05, step=5488] Training: 55%|█████▍ | 5489/10000 [1:05:53<57:12, 1.31it/s, loss=0.0254, lr=1.29e-05, step=5488] Training: 55%|█████▍ | 5489/10000 [1:05:53<57:12, 1.31it/s, loss=0.0063, lr=1.29e-05, step=5489]17:12:01.116 [I] step=5490 loss=0.0036 smoothed_loss=0.0137 lr=1.29e-05 grad_norm=0.4232 step_time=0.6420s data_time=0.1753s it/s=1.224 eta_to_10000=3685.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0042 grad_action_out_proj=0.0565 grad_shared_expert=0.1587 (10775:train_pytorch.py:850) + Training: 55%|█████▍ | 5490/10000 [1:05:54<59:48, 1.26it/s, loss=0.0063, lr=1.29e-05, step=5489] Training: 55%|█████▍ | 5490/10000 [1:05:54<59:48, 1.26it/s, loss=0.0036, lr=1.29e-05, step=5490] Training: 55%|█████▍ | 5491/10000 [1:05:55<58:57, 1.27it/s, loss=0.0036, lr=1.29e-05, step=5490] Training: 55%|█████▍ | 5491/10000 [1:05:55<58:57, 1.27it/s, loss=0.0093, lr=1.29e-05, step=5491] Training: 55%|█████▍ | 5492/10000 [1:05:56<59:22, 1.27it/s, loss=0.0093, lr=1.29e-05, step=5491] Training: 55%|█████▍ | 5492/10000 [1:05:56<59:22, 1.27it/s, loss=0.0056, lr=1.29e-05, step=5492] Training: 55%|█████▍ | 5493/10000 [1:05:56<55:29, 1.35it/s, loss=0.0056, lr=1.29e-05, step=5492] Training: 55%|█████▍ | 5493/10000 [1:05:56<55:29, 1.35it/s, loss=0.0047, lr=1.29e-05, step=5493] Training: 55%|█████▍ | 5494/10000 [1:05:57<56:27, 1.33it/s, loss=0.0047, lr=1.29e-05, step=5493] Training: 55%|█████▍ | 5494/10000 [1:05:57<56:27, 1.33it/s, loss=0.0185, lr=1.28e-05, step=5494] Training: 55%|█████▍ | 5495/10000 [1:05:58<53:23, 1.41it/s, loss=0.0185, lr=1.28e-05, step=5494] Training: 55%|█████▍ | 5495/10000 [1:05:58<53:23, 1.41it/s, loss=0.0118, lr=1.28e-05, step=5495] Training: 55%|█████▍ | 5496/10000 [1:05:58<49:53, 1.50it/s, loss=0.0118, lr=1.28e-05, step=5495] Training: 55%|█████▍ | 5496/10000 [1:05:58<49:53, 1.50it/s, loss=0.0038, lr=1.28e-05, step=5496] Training: 55%|█████▍ | 5497/10000 [1:05:59<54:12, 1.38it/s, loss=0.0038, lr=1.28e-05, step=5496] Training: 55%|█████▍ | 5497/10000 [1:05:59<54:12, 1.38it/s, loss=0.0114, lr=1.28e-05, step=5497] Training: 55%|█████▍ | 5498/10000 [1:06:00<55:15, 1.36it/s, loss=0.0114, lr=1.28e-05, step=5497] Training: 55%|█████▍ | 5498/10000 [1:06:00<55:15, 1.36it/s, loss=0.0152, lr=1.28e-05, step=5498] Training: 55%|█████▍ | 5499/10000 [1:06:01<56:22, 1.33it/s, loss=0.0152, lr=1.28e-05, step=5498] Training: 55%|█████▍ | 5499/10000 [1:06:01<56:22, 1.33it/s, loss=0.0131, lr=1.28e-05, step=5499]17:12:08.297 [I] step=5500 loss=0.0052 smoothed_loss=0.0113 lr=1.28e-05 grad_norm=0.3931 step_time=0.5929s data_time=0.1252s it/s=1.393 eta_to_10000=3230.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0903 grad_shared_expert=0.4283 (10775:train_pytorch.py:850) + Training: 55%|█████▌ | 5500/10000 [1:06:01<53:40, 1.40it/s, loss=0.0131, lr=1.28e-05, step=5499] Training: 55%|█████▌ | 5500/10000 [1:06:01<53:40, 1.40it/s, loss=0.0052, lr=1.28e-05, step=5500] Training: 55%|█████▌ | 5501/10000 [1:06:02<55:49, 1.34it/s, loss=0.0052, lr=1.28e-05, step=5500] Training: 55%|█████▌ | 5501/10000 [1:06:02<55:49, 1.34it/s, loss=0.0070, lr=1.28e-05, step=5501] Training: 55%|█████▌ | 5502/10000 [1:06:03<53:17, 1.41it/s, loss=0.0070, lr=1.28e-05, step=5501] Training: 55%|█████▌ | 5502/10000 [1:06:03<53:17, 1.41it/s, loss=0.0068, lr=1.28e-05, step=5502] Training: 55%|█████▌ | 5503/10000 [1:06:04<54:20, 1.38it/s, loss=0.0068, lr=1.28e-05, step=5502] Training: 55%|█████▌ | 5503/10000 [1:06:04<54:20, 1.38it/s, loss=0.0070, lr=1.28e-05, step=5503] Training: 55%|█████▌ | 5504/10000 [1:06:05<1:00:53, 1.23it/s, loss=0.0070, lr=1.28e-05, step=5503] Training: 55%|█████▌ | 5504/10000 [1:06:05<1:00:53, 1.23it/s, loss=0.0110, lr=1.28e-05, step=5504] Training: 55%|█████▌ | 5505/10000 [1:06:05<1:02:37, 1.20it/s, loss=0.0110, lr=1.28e-05, step=5504] Training: 55%|█████▌ | 5505/10000 [1:06:05<1:02:37, 1.20it/s, loss=0.0049, lr=1.28e-05, step=5505] Training: 55%|█████▌ | 5506/10000 [1:06:06<56:54, 1.32it/s, loss=0.0049, lr=1.28e-05, step=5505] Training: 55%|█████▌ | 5506/10000 [1:06:06<56:54, 1.32it/s, loss=0.0087, lr=1.28e-05, step=5506] Training: 55%|█████▌ | 5507/10000 [1:06:07<56:47, 1.32it/s, loss=0.0087, lr=1.28e-05, step=5506] Training: 55%|█████▌ | 5507/10000 [1:06:07<56:47, 1.32it/s, loss=0.0048, lr=1.28e-05, step=5507] Training: 55%|█████▌ | 5508/10000 [1:06:07<52:14, 1.43it/s, loss=0.0048, lr=1.28e-05, step=5507] Training: 55%|█████▌ | 5508/10000 [1:06:07<52:14, 1.43it/s, loss=0.0150, lr=1.28e-05, step=5508] Training: 55%|█████▌ | 5509/10000 [1:06:09<1:05:05, 1.15it/s, loss=0.0150, lr=1.28e-05, step=5508] Training: 55%|█████▌ | 5509/10000 [1:06:09<1:05:05, 1.15it/s, loss=0.0125, lr=1.28e-05, step=5509]17:12:16.435 [I] step=5510 loss=0.0278 smoothed_loss=0.0118 lr=1.28e-05 grad_norm=0.4977 step_time=0.6372s data_time=0.1766s it/s=1.229 eta_to_10000=3653.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0076 grad_action_out_proj=0.0845 grad_shared_expert=0.6132 (10775:train_pytorch.py:850) + Training: 55%|█████▌ | 5510/10000 [1:06:10<1:04:58, 1.15it/s, loss=0.0125, lr=1.28e-05, step=5509] Training: 55%|█████▌ | 5510/10000 [1:06:10<1:04:58, 1.15it/s, loss=0.0278, lr=1.28e-05, step=5510] Training: 55%|█████▌ | 5511/10000 [1:06:10<1:03:23, 1.18it/s, loss=0.0278, lr=1.28e-05, step=5510] Training: 55%|█████▌ | 5511/10000 [1:06:10<1:03:23, 1.18it/s, loss=0.0025, lr=1.28e-05, step=5511] Training: 55%|█████▌ | 5512/10000 [1:06:11<1:03:35, 1.18it/s, loss=0.0025, lr=1.28e-05, step=5511] Training: 55%|█████▌ | 5512/10000 [1:06:11<1:03:35, 1.18it/s, loss=0.0267, lr=1.28e-05, step=5512] Training: 55%|█████▌ | 5513/10000 [1:06:12<1:05:11, 1.15it/s, loss=0.0267, lr=1.28e-05, step=5512] Training: 55%|█████▌ | 5513/10000 [1:06:12<1:05:11, 1.15it/s, loss=0.0046, lr=1.28e-05, step=5513] Training: 55%|█████▌ | 5514/10000 [1:06:13<1:00:01, 1.25it/s, loss=0.0046, lr=1.28e-05, step=5513] Training: 55%|█████▌ | 5514/10000 [1:06:13<1:00:01, 1.25it/s, loss=0.0148, lr=1.28e-05, step=5514] Training: 55%|█████▌ | 5515/10000 [1:06:13<57:51, 1.29it/s, loss=0.0148, lr=1.28e-05, step=5514] Training: 55%|█████▌ | 5515/10000 [1:06:13<57:51, 1.29it/s, loss=0.0061, lr=1.28e-05, step=5515] Training: 55%|█████▌ | 5516/10000 [1:06:14<58:02, 1.29it/s, loss=0.0061, lr=1.28e-05, step=5515] Training: 55%|█████▌ | 5516/10000 [1:06:14<58:02, 1.29it/s, loss=0.0045, lr=1.28e-05, step=5516] Training: 55%|█████▌ | 5517/10000 [1:06:15<58:34, 1.28it/s, loss=0.0045, lr=1.28e-05, step=5516] Training: 55%|█████▌ | 5517/10000 [1:06:15<58:34, 1.28it/s, loss=0.0144, lr=1.28e-05, step=5517] Training: 55%|█████▌ | 5518/10000 [1:06:16<1:00:20, 1.24it/s, loss=0.0144, lr=1.28e-05, step=5517] Training: 55%|█████▌ | 5518/10000 [1:06:16<1:00:20, 1.24it/s, loss=0.0078, lr=1.28e-05, step=5518] Training: 55%|█████▌ | 5519/10000 [1:06:17<57:20, 1.30it/s, loss=0.0078, lr=1.28e-05, step=5518] Training: 55%|█████▌ | 5519/10000 [1:06:17<57:20, 1.30it/s, loss=0.0083, lr=1.28e-05, step=5519]17:12:24.235 [I] step=5520 loss=0.0080 smoothed_loss=0.0102 lr=1.28e-05 grad_norm=0.4525 step_time=0.6264s data_time=0.1536s it/s=1.282 eta_to_10000=3493.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0745 grad_shared_expert=0.2461 (10775:train_pytorch.py:850) + Training: 55%|█████▌ | 5520/10000 [1:06:17<56:58, 1.31it/s, loss=0.0083, lr=1.28e-05, step=5519] Training: 55%|█████▌ | 5520/10000 [1:06:17<56:58, 1.31it/s, loss=0.0080, lr=1.28e-05, step=5520] Training: 55%|█████▌ | 5521/10000 [1:06:18<54:07, 1.38it/s, loss=0.0080, lr=1.28e-05, step=5520] Training: 55%|█████▌ | 5521/10000 [1:06:18<54:07, 1.38it/s, loss=0.0446, lr=1.27e-05, step=5521] Training: 55%|█████▌ | 5522/10000 [1:06:19<52:33, 1.42it/s, loss=0.0446, lr=1.27e-05, step=5521] Training: 55%|█████▌ | 5522/10000 [1:06:19<52:33, 1.42it/s, loss=0.0098, lr=1.27e-05, step=5522] Training: 55%|█████▌ | 5523/10000 [1:06:19<56:00, 1.33it/s, loss=0.0098, lr=1.27e-05, step=5522] Training: 55%|█████▌ | 5523/10000 [1:06:19<56:00, 1.33it/s, loss=0.0197, lr=1.27e-05, step=5523] Training: 55%|█████▌ | 5524/10000 [1:06:20<56:34, 1.32it/s, loss=0.0197, lr=1.27e-05, step=5523] Training: 55%|█████▌ | 5524/10000 [1:06:20<56:34, 1.32it/s, loss=0.0026, lr=1.27e-05, step=5524] Training: 55%|█████▌ | 5525/10000 [1:06:21<1:02:42, 1.19it/s, loss=0.0026, lr=1.27e-05, step=5524] Training: 55%|█████▌ | 5525/10000 [1:06:21<1:02:42, 1.19it/s, loss=0.0052, lr=1.27e-05, step=5525] Training: 55%|█████▌ | 5526/10000 [1:06:22<1:02:26, 1.19it/s, loss=0.0052, lr=1.27e-05, step=5525] Training: 55%|█████▌ | 5526/10000 [1:06:22<1:02:26, 1.19it/s, loss=0.0089, lr=1.27e-05, step=5526] Training: 55%|█████▌ | 5527/10000 [1:06:23<58:54, 1.27it/s, loss=0.0089, lr=1.27e-05, step=5526] Training: 55%|█████▌ | 5527/10000 [1:06:23<58:54, 1.27it/s, loss=0.0031, lr=1.27e-05, step=5527] Training: 55%|█████▌ | 5528/10000 [1:06:23<54:19, 1.37it/s, loss=0.0031, lr=1.27e-05, step=5527] Training: 55%|█████▌ | 5528/10000 [1:06:23<54:19, 1.37it/s, loss=0.0028, lr=1.27e-05, step=5528] Training: 55%|█████▌ | 5529/10000 [1:06:24<52:12, 1.43it/s, loss=0.0028, lr=1.27e-05, step=5528] Training: 55%|█████▌ | 5529/10000 [1:06:24<52:12, 1.43it/s, loss=0.0154, lr=1.27e-05, step=5529]17:12:31.842 [I] step=5530 loss=0.0197 smoothed_loss=0.0115 lr=1.27e-05 grad_norm=0.6100 step_time=0.6356s data_time=0.1251s it/s=1.315 eta_to_10000=3399.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.1717 grad_shared_expert=0.6394 (10775:train_pytorch.py:850) + Training: 55%|█████▌ | 5530/10000 [1:06:25<57:01, 1.31it/s, loss=0.0154, lr=1.27e-05, step=5529] Training: 55%|█████▌ | 5530/10000 [1:06:25<57:01, 1.31it/s, loss=0.0197, lr=1.27e-05, step=5530] Training: 55%|█████▌ | 5531/10000 [1:06:26<58:04, 1.28it/s, loss=0.0197, lr=1.27e-05, step=5530] Training: 55%|█████▌ | 5531/10000 [1:06:26<58:04, 1.28it/s, loss=0.0176, lr=1.27e-05, step=5531] Training: 55%|█████▌ | 5532/10000 [1:06:26<55:35, 1.34it/s, loss=0.0176, lr=1.27e-05, step=5531] Training: 55%|█████▌ | 5532/10000 [1:06:26<55:35, 1.34it/s, loss=0.0069, lr=1.27e-05, step=5532] Training: 55%|█████▌ | 5533/10000 [1:06:27<1:02:53, 1.18it/s, loss=0.0069, lr=1.27e-05, step=5532] Training: 55%|█████▌ | 5533/10000 [1:06:27<1:02:53, 1.18it/s, loss=0.0042, lr=1.27e-05, step=5533] Training: 55%|█████▌ | 5534/10000 [1:06:28<1:01:00, 1.22it/s, loss=0.0042, lr=1.27e-05, step=5533] Training: 55%|█████▌ | 5534/10000 [1:06:28<1:01:00, 1.22it/s, loss=0.0018, lr=1.27e-05, step=5534] Training: 55%|█████▌ | 5535/10000 [1:06:29<59:36, 1.25it/s, loss=0.0018, lr=1.27e-05, step=5534] Training: 55%|█████▌ | 5535/10000 [1:06:29<59:36, 1.25it/s, loss=0.0353, lr=1.27e-05, step=5535] Training: 55%|█████▌ | 5536/10000 [1:06:30<55:21, 1.34it/s, loss=0.0353, lr=1.27e-05, step=5535] Training: 55%|█████▌ | 5536/10000 [1:06:30<55:21, 1.34it/s, loss=0.0114, lr=1.27e-05, step=5536] Training: 55%|█████▌ | 5537/10000 [1:06:30<57:51, 1.29it/s, loss=0.0114, lr=1.27e-05, step=5536] Training: 55%|█████▌ | 5537/10000 [1:06:30<57:51, 1.29it/s, loss=0.0319, lr=1.27e-05, step=5537] Training: 55%|█████▌ | 5538/10000 [1:06:31<55:51, 1.33it/s, loss=0.0319, lr=1.27e-05, step=5537] Training: 55%|█████▌ | 5538/10000 [1:06:31<55:51, 1.33it/s, loss=0.0377, lr=1.27e-05, step=5538] Training: 55%|█████▌ | 5539/10000 [1:06:32<56:36, 1.31it/s, loss=0.0377, lr=1.27e-05, step=5538] Training: 55%|█████▌ | 5539/10000 [1:06:32<56:36, 1.31it/s, loss=0.0109, lr=1.27e-05, step=5539]17:12:40.009 [I] step=5540 loss=0.0031 smoothed_loss=0.0148 lr=1.27e-05 grad_norm=0.5606 step_time=0.6520s data_time=0.1647s it/s=1.225 eta_to_10000=3642.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.0935 grad_shared_expert=0.3691 (10775:train_pytorch.py:850) + Training: 55%|█████▌ | 5540/10000 [1:06:33<1:05:16, 1.14it/s, loss=0.0109, lr=1.27e-05, step=5539] Training: 55%|█████▌ | 5540/10000 [1:06:33<1:05:16, 1.14it/s, loss=0.0031, lr=1.27e-05, step=5540] Training: 55%|█████▌ | 5541/10000 [1:06:34<59:53, 1.24it/s, loss=0.0031, lr=1.27e-05, step=5540] Training: 55%|█████▌ | 5541/10000 [1:06:34<59:53, 1.24it/s, loss=0.0285, lr=1.27e-05, step=5541] Training: 55%|█████▌ | 5542/10000 [1:06:34<57:05, 1.30it/s, loss=0.0285, lr=1.27e-05, step=5541] Training: 55%|█████▌ | 5542/10000 [1:06:34<57:05, 1.30it/s, loss=0.0011, lr=1.27e-05, step=5542] Training: 55%|█████▌ | 5543/10000 [1:06:35<55:09, 1.35it/s, loss=0.0011, lr=1.27e-05, step=5542] Training: 55%|█████▌ | 5543/10000 [1:06:35<55:09, 1.35it/s, loss=0.0171, lr=1.27e-05, step=5543] Training: 55%|█████▌ | 5544/10000 [1:06:36<58:00, 1.28it/s, loss=0.0171, lr=1.27e-05, step=5543] Training: 55%|█████▌ | 5544/10000 [1:06:36<58:00, 1.28it/s, loss=0.0183, lr=1.27e-05, step=5544] Training: 55%|█████▌ | 5545/10000 [1:06:37<54:11, 1.37it/s, loss=0.0183, lr=1.27e-05, step=5544] Training: 55%|█████▌ | 5545/10000 [1:06:37<54:11, 1.37it/s, loss=0.0167, lr=1.27e-05, step=5545] Training: 55%|█████▌ | 5546/10000 [1:06:37<51:44, 1.43it/s, loss=0.0167, lr=1.27e-05, step=5545] Training: 55%|█████▌ | 5546/10000 [1:06:37<51:44, 1.43it/s, loss=0.0051, lr=1.27e-05, step=5546] Training: 55%|█████▌ | 5547/10000 [1:06:38<55:27, 1.34it/s, loss=0.0051, lr=1.27e-05, step=5546] Training: 55%|█████▌ | 5547/10000 [1:06:38<55:27, 1.34it/s, loss=0.0025, lr=1.27e-05, step=5547] Training: 55%|█████▌ | 5548/10000 [1:06:39<51:39, 1.44it/s, loss=0.0025, lr=1.27e-05, step=5547] Training: 55%|█████▌ | 5548/10000 [1:06:39<51:39, 1.44it/s, loss=0.0043, lr=1.26e-05, step=5548] Training: 55%|█████▌ | 5549/10000 [1:06:39<50:06, 1.48it/s, loss=0.0043, lr=1.26e-05, step=5548] Training: 55%|█████▌ | 5549/10000 [1:06:39<50:06, 1.48it/s, loss=0.0049, lr=1.26e-05, step=5549]17:12:46.761 [I] step=5550 loss=0.0017 smoothed_loss=0.0106 lr=1.27e-05 grad_norm=0.4402 step_time=0.5819s data_time=0.0933s it/s=1.481 eta_to_10000=3003.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0043 grad_action_out_proj=0.0563 grad_shared_expert=0.1659 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5550/10000 [1:06:40<47:58, 1.55it/s, loss=0.0049, lr=1.26e-05, step=5549] Training: 56%|█████▌ | 5550/10000 [1:06:40<47:58, 1.55it/s, loss=0.0017, lr=1.26e-05, step=5550] Training: 56%|█████▌ | 5551/10000 [1:06:41<50:47, 1.46it/s, loss=0.0017, lr=1.26e-05, step=5550] Training: 56%|█████▌ | 5551/10000 [1:06:41<50:47, 1.46it/s, loss=0.0035, lr=1.26e-05, step=5551] Training: 56%|█████▌ | 5552/10000 [1:06:41<49:10, 1.51it/s, loss=0.0035, lr=1.26e-05, step=5551] Training: 56%|█████▌ | 5552/10000 [1:06:41<49:10, 1.51it/s, loss=0.0534, lr=1.26e-05, step=5552] Training: 56%|█████▌ | 5553/10000 [1:06:42<51:35, 1.44it/s, loss=0.0534, lr=1.26e-05, step=5552] Training: 56%|█████▌ | 5553/10000 [1:06:42<51:35, 1.44it/s, loss=0.0066, lr=1.26e-05, step=5553] Training: 56%|█████▌ | 5554/10000 [1:06:43<52:00, 1.42it/s, loss=0.0066, lr=1.26e-05, step=5553] Training: 56%|█████▌ | 5554/10000 [1:06:43<52:00, 1.42it/s, loss=0.0221, lr=1.26e-05, step=5554] Training: 56%|█████▌ | 5555/10000 [1:06:44<54:22, 1.36it/s, loss=0.0221, lr=1.26e-05, step=5554] Training: 56%|█████▌ | 5555/10000 [1:06:44<54:22, 1.36it/s, loss=0.0066, lr=1.26e-05, step=5555] Training: 56%|█████▌ | 5556/10000 [1:06:44<54:09, 1.37it/s, loss=0.0066, lr=1.26e-05, step=5555] Training: 56%|█████▌ | 5556/10000 [1:06:44<54:09, 1.37it/s, loss=0.0099, lr=1.26e-05, step=5556] Training: 56%|█████▌ | 5557/10000 [1:06:45<51:51, 1.43it/s, loss=0.0099, lr=1.26e-05, step=5556] Training: 56%|█████▌ | 5557/10000 [1:06:45<51:51, 1.43it/s, loss=0.0169, lr=1.26e-05, step=5557] Training: 56%|█████▌ | 5558/10000 [1:06:46<57:39, 1.28it/s, loss=0.0169, lr=1.26e-05, step=5557] Training: 56%|█████▌ | 5558/10000 [1:06:46<57:39, 1.28it/s, loss=0.0083, lr=1.26e-05, step=5558] Training: 56%|█████▌ | 5559/10000 [1:06:47<57:05, 1.30it/s, loss=0.0083, lr=1.26e-05, step=5558] Training: 56%|█████▌ | 5559/10000 [1:06:47<57:05, 1.30it/s, loss=0.0130, lr=1.26e-05, step=5559]17:12:54.302 [I] step=5560 loss=0.0014 smoothed_loss=0.0119 lr=1.26e-05 grad_norm=0.4621 step_time=0.6177s data_time=0.1363s it/s=1.326 eta_to_10000=3347.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0122 grad_action_out_proj=0.0897 grad_shared_expert=0.3359 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5560/10000 [1:06:47<57:28, 1.29it/s, loss=0.0130, lr=1.26e-05, step=5559] Training: 56%|█████▌ | 5560/10000 [1:06:47<57:28, 1.29it/s, loss=0.0014, lr=1.26e-05, step=5560] Training: 56%|█████▌ | 5561/10000 [1:06:48<1:01:58, 1.19it/s, loss=0.0014, lr=1.26e-05, step=5560] Training: 56%|█████▌ | 5561/10000 [1:06:48<1:01:58, 1.19it/s, loss=0.0038, lr=1.26e-05, step=5561] Training: 56%|█████▌ | 5562/10000 [1:06:49<55:38, 1.33it/s, loss=0.0038, lr=1.26e-05, step=5561] Training: 56%|█████▌ | 5562/10000 [1:06:49<55:38, 1.33it/s, loss=0.0096, lr=1.26e-05, step=5562] Training: 56%|█████▌ | 5563/10000 [1:06:50<52:21, 1.41it/s, loss=0.0096, lr=1.26e-05, step=5562] Training: 56%|█████▌ | 5563/10000 [1:06:50<52:21, 1.41it/s, loss=0.0037, lr=1.26e-05, step=5563] Training: 56%|█████▌ | 5564/10000 [1:06:50<52:39, 1.40it/s, loss=0.0037, lr=1.26e-05, step=5563] Training: 56%|█████▌ | 5564/10000 [1:06:50<52:39, 1.40it/s, loss=0.0094, lr=1.26e-05, step=5564] Training: 56%|█████▌ | 5565/10000 [1:06:51<54:25, 1.36it/s, loss=0.0094, lr=1.26e-05, step=5564] Training: 56%|█████▌ | 5565/10000 [1:06:51<54:25, 1.36it/s, loss=0.0176, lr=1.26e-05, step=5565] Training: 56%|█████▌ | 5566/10000 [1:06:52<54:20, 1.36it/s, loss=0.0176, lr=1.26e-05, step=5565] Training: 56%|█████▌ | 5566/10000 [1:06:52<54:20, 1.36it/s, loss=0.0219, lr=1.26e-05, step=5566] Training: 56%|█████▌ | 5567/10000 [1:06:52<52:47, 1.40it/s, loss=0.0219, lr=1.26e-05, step=5566] Training: 56%|█████▌ | 5567/10000 [1:06:52<52:47, 1.40it/s, loss=0.0017, lr=1.26e-05, step=5567] Training: 56%|█████▌ | 5568/10000 [1:06:53<54:51, 1.35it/s, loss=0.0017, lr=1.26e-05, step=5567] Training: 56%|█████▌ | 5568/10000 [1:06:53<54:51, 1.35it/s, loss=0.0169, lr=1.26e-05, step=5568] Training: 56%|█████▌ | 5569/10000 [1:06:54<53:22, 1.38it/s, loss=0.0169, lr=1.26e-05, step=5568] Training: 56%|█████▌ | 5569/10000 [1:06:54<53:22, 1.38it/s, loss=0.0070, lr=1.26e-05, step=5569]17:13:01.486 [I] step=5570 loss=0.0031 smoothed_loss=0.0103 lr=1.26e-05 grad_norm=0.4773 step_time=0.5979s data_time=0.1206s it/s=1.392 eta_to_10000=3182.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0647 grad_shared_expert=0.4252 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5570/10000 [1:06:55<51:42, 1.43it/s, loss=0.0070, lr=1.26e-05, step=5569] Training: 56%|█████▌ | 5570/10000 [1:06:55<51:42, 1.43it/s, loss=0.0031, lr=1.26e-05, step=5570] Training: 56%|█████▌ | 5571/10000 [1:06:55<49:28, 1.49it/s, loss=0.0031, lr=1.26e-05, step=5570] Training: 56%|█████▌ | 5571/10000 [1:06:55<49:28, 1.49it/s, loss=0.0201, lr=1.26e-05, step=5571] Training: 56%|█████▌ | 5572/10000 [1:06:56<52:20, 1.41it/s, loss=0.0201, lr=1.26e-05, step=5571] Training: 56%|█████▌ | 5572/10000 [1:06:56<52:20, 1.41it/s, loss=0.0033, lr=1.26e-05, step=5572] Training: 56%|█████▌ | 5573/10000 [1:06:57<1:01:45, 1.19it/s, loss=0.0033, lr=1.26e-05, step=5572] Training: 56%|█████▌ | 5573/10000 [1:06:57<1:01:45, 1.19it/s, loss=0.0065, lr=1.26e-05, step=5573] Training: 56%|█████▌ | 5574/10000 [1:06:58<1:00:20, 1.22it/s, loss=0.0065, lr=1.26e-05, step=5573] Training: 56%|█████▌ | 5574/10000 [1:06:58<1:00:20, 1.22it/s, loss=0.0190, lr=1.26e-05, step=5574] Training: 56%|█████▌ | 5575/10000 [1:06:58<55:01, 1.34it/s, loss=0.0190, lr=1.26e-05, step=5574] Training: 56%|█████▌ | 5575/10000 [1:06:58<55:01, 1.34it/s, loss=0.0040, lr=1.25e-05, step=5575] Training: 56%|█████▌ | 5576/10000 [1:06:59<1:01:26, 1.20it/s, loss=0.0040, lr=1.25e-05, step=5575] Training: 56%|█████▌ | 5576/10000 [1:06:59<1:01:26, 1.20it/s, loss=0.0073, lr=1.25e-05, step=5576] Training: 56%|█████▌ | 5577/10000 [1:07:00<1:00:46, 1.21it/s, loss=0.0073, lr=1.25e-05, step=5576] Training: 56%|█████▌ | 5577/10000 [1:07:00<1:00:46, 1.21it/s, loss=0.0191, lr=1.25e-05, step=5577] Training: 56%|█████▌ | 5578/10000 [1:07:01<1:00:02, 1.23it/s, loss=0.0191, lr=1.25e-05, step=5577] Training: 56%|█████▌ | 5578/10000 [1:07:01<1:00:02, 1.23it/s, loss=0.0315, lr=1.25e-05, step=5578] Training: 56%|█████▌ | 5579/10000 [1:07:02<59:32, 1.24it/s, loss=0.0315, lr=1.25e-05, step=5578] Training: 56%|█████▌ | 5579/10000 [1:07:02<59:32, 1.24it/s, loss=0.0075, lr=1.25e-05, step=5579]17:13:09.820 [I] step=5580 loss=0.0048 smoothed_loss=0.0116 lr=1.25e-05 grad_norm=0.4614 step_time=0.6672s data_time=0.1661s it/s=1.200 eta_to_10000=3682.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0173 grad_action_out_proj=0.1370 grad_shared_expert=0.4225 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5580/10000 [1:07:03<1:04:14, 1.15it/s, loss=0.0075, lr=1.25e-05, step=5579] Training: 56%|█████▌ | 5580/10000 [1:07:03<1:04:14, 1.15it/s, loss=0.0048, lr=1.25e-05, step=5580] Training: 56%|█████▌ | 5581/10000 [1:07:03<58:11, 1.27it/s, loss=0.0048, lr=1.25e-05, step=5580] Training: 56%|█████▌ | 5581/10000 [1:07:03<58:11, 1.27it/s, loss=0.0066, lr=1.25e-05, step=5581] Training: 56%|█████▌ | 5582/10000 [1:07:04<53:38, 1.37it/s, loss=0.0066, lr=1.25e-05, step=5581] Training: 56%|█████▌ | 5582/10000 [1:07:04<53:38, 1.37it/s, loss=0.0132, lr=1.25e-05, step=5582] Training: 56%|█████▌ | 5583/10000 [1:07:05<56:27, 1.30it/s, loss=0.0132, lr=1.25e-05, step=5582] Training: 56%|█████▌ | 5583/10000 [1:07:05<56:27, 1.30it/s, loss=0.0118, lr=1.25e-05, step=5583] Training: 56%|█████▌ | 5584/10000 [1:07:06<57:09, 1.29it/s, loss=0.0118, lr=1.25e-05, step=5583] Training: 56%|█████▌ | 5584/10000 [1:07:06<57:09, 1.29it/s, loss=0.0115, lr=1.25e-05, step=5584] Training: 56%|█████▌ | 5585/10000 [1:07:07<57:12, 1.29it/s, loss=0.0115, lr=1.25e-05, step=5584] Training: 56%|█████▌ | 5585/10000 [1:07:07<57:12, 1.29it/s, loss=0.0039, lr=1.25e-05, step=5585] Training: 56%|█████▌ | 5586/10000 [1:07:07<59:32, 1.24it/s, loss=0.0039, lr=1.25e-05, step=5585] Training: 56%|█████▌ | 5586/10000 [1:07:07<59:32, 1.24it/s, loss=0.0292, lr=1.25e-05, step=5586] Training: 56%|█████▌ | 5587/10000 [1:07:08<58:01, 1.27it/s, loss=0.0292, lr=1.25e-05, step=5586] Training: 56%|█████▌ | 5587/10000 [1:07:08<58:01, 1.27it/s, loss=0.0243, lr=1.25e-05, step=5587] Training: 56%|█████▌ | 5588/10000 [1:07:09<1:02:48, 1.17it/s, loss=0.0243, lr=1.25e-05, step=5587] Training: 56%|█████▌ | 5588/10000 [1:07:09<1:02:48, 1.17it/s, loss=0.0024, lr=1.25e-05, step=5588] Training: 56%|█████▌ | 5589/10000 [1:07:10<1:01:32, 1.19it/s, loss=0.0024, lr=1.25e-05, step=5588] Training: 56%|█████▌ | 5589/10000 [1:07:10<1:01:32, 1.19it/s, loss=0.0053, lr=1.25e-05, step=5589]17:13:17.637 [I] step=5590 loss=0.0144 smoothed_loss=0.0121 lr=1.25e-05 grad_norm=0.5193 step_time=0.6259s data_time=0.1560s it/s=1.279 eta_to_10000=3447.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0939 grad_shared_expert=0.2815 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5590/10000 [1:07:11<1:00:02, 1.22it/s, loss=0.0053, lr=1.25e-05, step=5589] Training: 56%|█████▌ | 5590/10000 [1:07:11<1:00:02, 1.22it/s, loss=0.0144, lr=1.25e-05, step=5590] Training: 56%|█████▌ | 5591/10000 [1:07:11<54:37, 1.35it/s, loss=0.0144, lr=1.25e-05, step=5590] Training: 56%|█████▌ | 5591/10000 [1:07:11<54:37, 1.35it/s, loss=0.0083, lr=1.25e-05, step=5591] Training: 56%|█████▌ | 5592/10000 [1:07:12<52:50, 1.39it/s, loss=0.0083, lr=1.25e-05, step=5591] Training: 56%|█████▌ | 5592/10000 [1:07:12<52:50, 1.39it/s, loss=0.0163, lr=1.25e-05, step=5592] Training: 56%|█████▌ | 5593/10000 [1:07:13<50:14, 1.46it/s, loss=0.0163, lr=1.25e-05, step=5592] Training: 56%|█████▌ | 5593/10000 [1:07:13<50:14, 1.46it/s, loss=0.0018, lr=1.25e-05, step=5593] Training: 56%|█████▌ | 5594/10000 [1:07:13<49:18, 1.49it/s, loss=0.0018, lr=1.25e-05, step=5593] Training: 56%|█████▌ | 5594/10000 [1:07:13<49:18, 1.49it/s, loss=0.0355, lr=1.25e-05, step=5594] Training: 56%|█████▌ | 5595/10000 [1:07:14<51:39, 1.42it/s, loss=0.0355, lr=1.25e-05, step=5594] Training: 56%|█████▌ | 5595/10000 [1:07:14<51:39, 1.42it/s, loss=0.0035, lr=1.25e-05, step=5595] Training: 56%|█████▌ | 5596/10000 [1:07:15<49:33, 1.48it/s, loss=0.0035, lr=1.25e-05, step=5595] Training: 56%|█████▌ | 5596/10000 [1:07:15<49:33, 1.48it/s, loss=0.0050, lr=1.25e-05, step=5596] Training: 56%|█████▌ | 5597/10000 [1:07:15<49:57, 1.47it/s, loss=0.0050, lr=1.25e-05, step=5596] Training: 56%|█████▌ | 5597/10000 [1:07:15<49:57, 1.47it/s, loss=0.0106, lr=1.25e-05, step=5597] Training: 56%|█████▌ | 5598/10000 [1:07:16<55:29, 1.32it/s, loss=0.0106, lr=1.25e-05, step=5597] Training: 56%|█████▌ | 5598/10000 [1:07:16<55:29, 1.32it/s, loss=0.0052, lr=1.25e-05, step=5598] Training: 56%|█████▌ | 5599/10000 [1:07:17<57:36, 1.27it/s, loss=0.0052, lr=1.25e-05, step=5598] Training: 56%|█████▌ | 5599/10000 [1:07:17<57:36, 1.27it/s, loss=0.0073, lr=1.25e-05, step=5599]17:13:24.900 [I] step=5600 loss=0.0149 smoothed_loss=0.0111 lr=1.25e-05 grad_norm=0.5253 step_time=0.5927s data_time=0.1336s it/s=1.377 eta_to_10000=3195.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0348 grad_action_out_proj=0.2154 grad_shared_expert=0.6566 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5600/10000 [1:07:18<1:00:28, 1.21it/s, loss=0.0073, lr=1.25e-05, step=5599] Training: 56%|█████▌ | 5600/10000 [1:07:18<1:00:28, 1.21it/s, loss=0.0149, lr=1.25e-05, step=5600] Training: 56%|█████▌ | 5601/10000 [1:07:19<55:37, 1.32it/s, loss=0.0149, lr=1.25e-05, step=5600] Training: 56%|█████▌ | 5601/10000 [1:07:19<55:37, 1.32it/s, loss=0.0417, lr=1.25e-05, step=5601] Training: 56%|█████▌ | 5602/10000 [1:07:19<57:31, 1.27it/s, loss=0.0417, lr=1.25e-05, step=5601] Training: 56%|█████▌ | 5602/10000 [1:07:19<57:31, 1.27it/s, loss=0.0042, lr=1.24e-05, step=5602] Training: 56%|█████▌ | 5603/10000 [1:07:20<59:41, 1.23it/s, loss=0.0042, lr=1.24e-05, step=5602] Training: 56%|█████▌ | 5603/10000 [1:07:20<59:41, 1.23it/s, loss=0.0071, lr=1.24e-05, step=5603] Training: 56%|█████▌ | 5604/10000 [1:07:21<53:54, 1.36it/s, loss=0.0071, lr=1.24e-05, step=5603] Training: 56%|█████▌ | 5604/10000 [1:07:21<53:54, 1.36it/s, loss=0.0050, lr=1.24e-05, step=5604] Training: 56%|█████▌ | 5605/10000 [1:07:22<53:54, 1.36it/s, loss=0.0050, lr=1.24e-05, step=5604] Training: 56%|█████▌ | 5605/10000 [1:07:22<53:54, 1.36it/s, loss=0.0246, lr=1.24e-05, step=5605] Training: 56%|█████▌ | 5606/10000 [1:07:22<49:59, 1.46it/s, loss=0.0246, lr=1.24e-05, step=5605] Training: 56%|█████▌ | 5606/10000 [1:07:22<49:59, 1.46it/s, loss=0.0118, lr=1.24e-05, step=5606] Training: 56%|█████▌ | 5607/10000 [1:07:23<49:02, 1.49it/s, loss=0.0118, lr=1.24e-05, step=5606] Training: 56%|█████▌ | 5607/10000 [1:07:23<49:02, 1.49it/s, loss=0.0021, lr=1.24e-05, step=5607] Training: 56%|█████▌ | 5608/10000 [1:07:23<48:08, 1.52it/s, loss=0.0021, lr=1.24e-05, step=5607] Training: 56%|█████▌ | 5608/10000 [1:07:23<48:08, 1.52it/s, loss=0.0116, lr=1.24e-05, step=5608] Training: 56%|█████▌ | 5609/10000 [1:07:24<50:16, 1.46it/s, loss=0.0116, lr=1.24e-05, step=5608] Training: 56%|█████▌ | 5609/10000 [1:07:24<50:16, 1.46it/s, loss=0.0171, lr=1.24e-05, step=5609]17:13:32.158 [I] step=5610 loss=0.0091 smoothed_loss=0.0120 lr=1.24e-05 grad_norm=0.4239 step_time=0.5839s data_time=0.1419s it/s=1.378 eta_to_10000=3185.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0072 grad_action_out_proj=0.0716 grad_shared_expert=0.2527 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5610/10000 [1:07:25<58:17, 1.26it/s, loss=0.0171, lr=1.24e-05, step=5609] Training: 56%|█████▌ | 5610/10000 [1:07:25<58:17, 1.26it/s, loss=0.0091, lr=1.24e-05, step=5610] Training: 56%|█████▌ | 5611/10000 [1:07:26<1:03:50, 1.15it/s, loss=0.0091, lr=1.24e-05, step=5610] Training: 56%|█████▌ | 5611/10000 [1:07:26<1:03:50, 1.15it/s, loss=0.0044, lr=1.24e-05, step=5611] Training: 56%|█████▌ | 5612/10000 [1:07:27<1:01:05, 1.20it/s, loss=0.0044, lr=1.24e-05, step=5611] Training: 56%|█████▌ | 5612/10000 [1:07:27<1:01:05, 1.20it/s, loss=0.0076, lr=1.24e-05, step=5612] Training: 56%|█████▌ | 5613/10000 [1:07:28<56:24, 1.30it/s, loss=0.0076, lr=1.24e-05, step=5612] Training: 56%|█████▌ | 5613/10000 [1:07:28<56:24, 1.30it/s, loss=0.0166, lr=1.24e-05, step=5613] Training: 56%|█████▌ | 5614/10000 [1:07:28<57:28, 1.27it/s, loss=0.0166, lr=1.24e-05, step=5613] Training: 56%|█████▌ | 5614/10000 [1:07:28<57:28, 1.27it/s, loss=0.0060, lr=1.24e-05, step=5614] Training: 56%|█████▌ | 5615/10000 [1:07:29<1:00:14, 1.21it/s, loss=0.0060, lr=1.24e-05, step=5614] Training: 56%|█████▌ | 5615/10000 [1:07:29<1:00:14, 1.21it/s, loss=0.0087, lr=1.24e-05, step=5615] Training: 56%|█████▌ | 5616/10000 [1:07:30<1:01:18, 1.19it/s, loss=0.0087, lr=1.24e-05, step=5615] Training: 56%|█████▌ | 5616/10000 [1:07:30<1:01:18, 1.19it/s, loss=0.0069, lr=1.24e-05, step=5616] Training: 56%|█████▌ | 5617/10000 [1:07:31<1:06:05, 1.11it/s, loss=0.0069, lr=1.24e-05, step=5616] Training: 56%|█████▌ | 5617/10000 [1:07:31<1:06:05, 1.11it/s, loss=0.0478, lr=1.24e-05, step=5617] Training: 56%|█████▌ | 5618/10000 [1:07:32<1:03:16, 1.15it/s, loss=0.0478, lr=1.24e-05, step=5617] Training: 56%|█████▌ | 5618/10000 [1:07:32<1:03:16, 1.15it/s, loss=0.0254, lr=1.24e-05, step=5618] Training: 56%|█████▌ | 5619/10000 [1:07:33<1:07:38, 1.08it/s, loss=0.0254, lr=1.24e-05, step=5618] Training: 56%|█████▌ | 5619/10000 [1:07:33<1:07:38, 1.08it/s, loss=0.0203, lr=1.24e-05, step=5619]17:13:40.885 [I] step=5620 loss=0.0135 smoothed_loss=0.0155 lr=1.24e-05 grad_norm=0.4608 step_time=0.6838s data_time=0.1888s it/s=1.146 eta_to_10000=3821.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0080 grad_action_out_proj=0.0770 grad_shared_expert=0.3385 (10775:train_pytorch.py:850) + Training: 56%|█████▌ | 5620/10000 [1:07:34<1:04:48, 1.13it/s, loss=0.0203, lr=1.24e-05, step=5619] Training: 56%|█████▌ | 5620/10000 [1:07:34<1:04:48, 1.13it/s, loss=0.0135, lr=1.24e-05, step=5620] Training: 56%|█████▌ | 5621/10000 [1:07:35<1:02:36, 1.17it/s, loss=0.0135, lr=1.24e-05, step=5620] Training: 56%|█████▌ | 5621/10000 [1:07:35<1:02:36, 1.17it/s, loss=0.0151, lr=1.24e-05, step=5621] Training: 56%|█████▌ | 5622/10000 [1:07:35<58:16, 1.25it/s, loss=0.0151, lr=1.24e-05, step=5621] Training: 56%|█████▌ | 5622/10000 [1:07:35<58:16, 1.25it/s, loss=0.0050, lr=1.24e-05, step=5622] Training: 56%|█████▌ | 5623/10000 [1:07:36<58:25, 1.25it/s, loss=0.0050, lr=1.24e-05, step=5622] Training: 56%|█████▌ | 5623/10000 [1:07:36<58:25, 1.25it/s, loss=0.0051, lr=1.24e-05, step=5623] Training: 56%|█████▌ | 5624/10000 [1:07:37<1:03:19, 1.15it/s, loss=0.0051, lr=1.24e-05, step=5623] Training: 56%|█████▌ | 5624/10000 [1:07:37<1:03:19, 1.15it/s, loss=0.0122, lr=1.24e-05, step=5624] Training: 56%|█████▋ | 5625/10000 [1:07:38<1:04:23, 1.13it/s, loss=0.0122, lr=1.24e-05, step=5624] Training: 56%|█████▋ | 5625/10000 [1:07:38<1:04:23, 1.13it/s, loss=0.0056, lr=1.24e-05, step=5625] Training: 56%|█████▋ | 5626/10000 [1:07:39<1:07:24, 1.08it/s, loss=0.0056, lr=1.24e-05, step=5625] Training: 56%|█████▋ | 5626/10000 [1:07:39<1:07:24, 1.08it/s, loss=0.0433, lr=1.24e-05, step=5626] Training: 56%|█████▋ | 5627/10000 [1:07:40<1:03:04, 1.16it/s, loss=0.0433, lr=1.24e-05, step=5626] Training: 56%|█████▋ | 5627/10000 [1:07:40<1:03:04, 1.16it/s, loss=0.0027, lr=1.24e-05, step=5627] Training: 56%|█████▋ | 5628/10000 [1:07:40<57:02, 1.28it/s, loss=0.0027, lr=1.24e-05, step=5627] Training: 56%|█████▋ | 5628/10000 [1:07:40<57:02, 1.28it/s, loss=0.0088, lr=1.24e-05, step=5628] Training: 56%|█████▋ | 5629/10000 [1:07:41<53:24, 1.36it/s, loss=0.0088, lr=1.24e-05, step=5628] Training: 56%|█████▋ | 5629/10000 [1:07:41<53:24, 1.36it/s, loss=0.0113, lr=1.23e-05, step=5629]17:13:48.912 [I] step=5630 loss=0.0124 smoothed_loss=0.0134 lr=1.24e-05 grad_norm=0.4228 step_time=0.6467s data_time=0.1560s it/s=1.246 eta_to_10000=3507.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0139 grad_action_out_proj=0.1470 grad_shared_expert=0.3907 (10775:train_pytorch.py:850) + Training: 56%|█████▋ | 5630/10000 [1:07:42<56:26, 1.29it/s, loss=0.0113, lr=1.23e-05, step=5629] Training: 56%|█████▋ | 5630/10000 [1:07:42<56:26, 1.29it/s, loss=0.0124, lr=1.23e-05, step=5630] Training: 56%|█████▋ | 5631/10000 [1:07:43<55:38, 1.31it/s, loss=0.0124, lr=1.23e-05, step=5630] Training: 56%|█████▋ | 5631/10000 [1:07:43<55:38, 1.31it/s, loss=0.0046, lr=1.23e-05, step=5631] Training: 56%|█████▋ | 5632/10000 [1:07:44<59:25, 1.22it/s, loss=0.0046, lr=1.23e-05, step=5631] Training: 56%|█████▋ | 5632/10000 [1:07:44<59:25, 1.22it/s, loss=0.0029, lr=1.23e-05, step=5632] Training: 56%|█████▋ | 5633/10000 [1:07:45<1:05:40, 1.11it/s, loss=0.0029, lr=1.23e-05, step=5632] Training: 56%|█████▋ | 5633/10000 [1:07:45<1:05:40, 1.11it/s, loss=0.0101, lr=1.23e-05, step=5633] Training: 56%|█████▋ | 5634/10000 [1:07:45<1:00:28, 1.20it/s, loss=0.0101, lr=1.23e-05, step=5633] Training: 56%|█████▋ | 5634/10000 [1:07:45<1:00:28, 1.20it/s, loss=0.0180, lr=1.23e-05, step=5634] Training: 56%|█████▋ | 5635/10000 [1:07:46<59:39, 1.22it/s, loss=0.0180, lr=1.23e-05, step=5634] Training: 56%|█████▋ | 5635/10000 [1:07:46<59:39, 1.22it/s, loss=0.0262, lr=1.23e-05, step=5635] Training: 56%|█████▋ | 5636/10000 [1:07:47<58:18, 1.25it/s, loss=0.0262, lr=1.23e-05, step=5635] Training: 56%|█████▋ | 5636/10000 [1:07:47<58:18, 1.25it/s, loss=0.0039, lr=1.23e-05, step=5636] Training: 56%|█████▋ | 5637/10000 [1:07:48<59:45, 1.22it/s, loss=0.0039, lr=1.23e-05, step=5636] Training: 56%|█████▋ | 5637/10000 [1:07:48<59:45, 1.22it/s, loss=0.0113, lr=1.23e-05, step=5637] Training: 56%|█████▋ | 5638/10000 [1:07:49<58:23, 1.24it/s, loss=0.0113, lr=1.23e-05, step=5637] Training: 56%|█████▋ | 5638/10000 [1:07:49<58:23, 1.24it/s, loss=0.0566, lr=1.23e-05, step=5638] Training: 56%|█████▋ | 5639/10000 [1:07:49<54:55, 1.32it/s, loss=0.0566, lr=1.23e-05, step=5638] Training: 56%|█████▋ | 5639/10000 [1:07:49<54:55, 1.32it/s, loss=0.0065, lr=1.23e-05, step=5639]17:13:57.165 [I] step=5640 loss=0.0041 smoothed_loss=0.0146 lr=1.23e-05 grad_norm=0.3785 step_time=0.6680s data_time=0.1573s it/s=1.212 eta_to_10000=3597.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.0988 grad_shared_expert=0.3654 (10775:train_pytorch.py:850) + Training: 56%|█████▋ | 5640/10000 [1:07:50<59:50, 1.21it/s, loss=0.0065, lr=1.23e-05, step=5639] Training: 56%|█████▋ | 5640/10000 [1:07:50<59:50, 1.21it/s, loss=0.0041, lr=1.23e-05, step=5640] Training: 56%|█████▋ | 5641/10000 [1:07:51<57:53, 1.26it/s, loss=0.0041, lr=1.23e-05, step=5640] Training: 56%|█████▋ | 5641/10000 [1:07:51<57:53, 1.26it/s, loss=0.0088, lr=1.23e-05, step=5641] Training: 56%|█████▋ | 5642/10000 [1:07:51<51:48, 1.40it/s, loss=0.0088, lr=1.23e-05, step=5641] Training: 56%|█████▋ | 5642/10000 [1:07:51<51:48, 1.40it/s, loss=0.0097, lr=1.23e-05, step=5642] Training: 56%|█████▋ | 5643/10000 [1:07:52<52:51, 1.37it/s, loss=0.0097, lr=1.23e-05, step=5642] Training: 56%|█████▋ | 5643/10000 [1:07:52<52:51, 1.37it/s, loss=0.0078, lr=1.23e-05, step=5643] Training: 56%|█████▋ | 5644/10000 [1:07:53<59:19, 1.22it/s, loss=0.0078, lr=1.23e-05, step=5643] Training: 56%|█████▋ | 5644/10000 [1:07:53<59:19, 1.22it/s, loss=0.0357, lr=1.23e-05, step=5644] Training: 56%|█████▋ | 5645/10000 [1:07:54<1:00:22, 1.20it/s, loss=0.0357, lr=1.23e-05, step=5644] Training: 56%|█████▋ | 5645/10000 [1:07:54<1:00:22, 1.20it/s, loss=0.0128, lr=1.23e-05, step=5645] Training: 56%|█████▋ | 5646/10000 [1:07:55<1:03:39, 1.14it/s, loss=0.0128, lr=1.23e-05, step=5645] Training: 56%|█████▋ | 5646/10000 [1:07:55<1:03:39, 1.14it/s, loss=0.0041, lr=1.23e-05, step=5646] Training: 56%|█████▋ | 5647/10000 [1:07:56<1:05:54, 1.10it/s, loss=0.0041, lr=1.23e-05, step=5646] Training: 56%|█████▋ | 5647/10000 [1:07:56<1:05:54, 1.10it/s, loss=0.0170, lr=1.23e-05, step=5647] Training: 56%|█████▋ | 5648/10000 [1:07:57<1:01:22, 1.18it/s, loss=0.0170, lr=1.23e-05, step=5647] Training: 56%|█████▋ | 5648/10000 [1:07:57<1:01:22, 1.18it/s, loss=0.0137, lr=1.23e-05, step=5648] Training: 56%|█████▋ | 5649/10000 [1:07:58<59:02, 1.23it/s, loss=0.0137, lr=1.23e-05, step=5648] Training: 56%|█████▋ | 5649/10000 [1:07:58<59:02, 1.23it/s, loss=0.0111, lr=1.23e-05, step=5649]17:14:05.517 [I] step=5650 loss=0.0048 smoothed_loss=0.0130 lr=1.23e-05 grad_norm=0.4111 step_time=0.6482s data_time=0.1870s it/s=1.197 eta_to_10000=3632.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0082 grad_action_out_proj=0.0892 grad_shared_expert=0.3030 (10775:train_pytorch.py:850) + Training: 56%|█████▋ | 5650/10000 [1:07:59<1:03:58, 1.13it/s, loss=0.0111, lr=1.23e-05, step=5649] Training: 56%|█████▋ | 5650/10000 [1:07:59<1:03:58, 1.13it/s, loss=0.0048, lr=1.23e-05, step=5650] Training: 57%|█████▋ | 5651/10000 [1:07:59<1:01:34, 1.18it/s, loss=0.0048, lr=1.23e-05, step=5650] Training: 57%|█████▋ | 5651/10000 [1:07:59<1:01:34, 1.18it/s, loss=0.0117, lr=1.23e-05, step=5651] Training: 57%|█████▋ | 5652/10000 [1:08:00<1:00:47, 1.19it/s, loss=0.0117, lr=1.23e-05, step=5651] Training: 57%|█████▋ | 5652/10000 [1:08:00<1:00:47, 1.19it/s, loss=0.0083, lr=1.23e-05, step=5652] Training: 57%|█████▋ | 5653/10000 [1:08:01<59:33, 1.22it/s, loss=0.0083, lr=1.23e-05, step=5652] Training: 57%|█████▋ | 5653/10000 [1:08:01<59:33, 1.22it/s, loss=0.0056, lr=1.23e-05, step=5653] Training: 57%|█████▋ | 5654/10000 [1:08:02<1:02:05, 1.17it/s, loss=0.0056, lr=1.23e-05, step=5653] Training: 57%|█████▋ | 5654/10000 [1:08:02<1:02:05, 1.17it/s, loss=0.0507, lr=1.23e-05, step=5654] Training: 57%|█████▋ | 5655/10000 [1:08:03<58:13, 1.24it/s, loss=0.0507, lr=1.23e-05, step=5654] Training: 57%|█████▋ | 5655/10000 [1:08:03<58:13, 1.24it/s, loss=0.0134, lr=1.23e-05, step=5655] Training: 57%|█████▋ | 5656/10000 [1:08:03<57:57, 1.25it/s, loss=0.0134, lr=1.23e-05, step=5655] Training: 57%|█████▋ | 5656/10000 [1:08:03<57:57, 1.25it/s, loss=0.0374, lr=1.22e-05, step=5656] Training: 57%|█████▋ | 5657/10000 [1:08:04<58:50, 1.23it/s, loss=0.0374, lr=1.22e-05, step=5656] Training: 57%|█████▋ | 5657/10000 [1:08:04<58:50, 1.23it/s, loss=0.0127, lr=1.22e-05, step=5657] Training: 57%|█████▋ | 5658/10000 [1:08:05<1:01:46, 1.17it/s, loss=0.0127, lr=1.22e-05, step=5657] Training: 57%|█████▋ | 5658/10000 [1:08:05<1:01:46, 1.17it/s, loss=0.0032, lr=1.22e-05, step=5658] Training: 57%|█████▋ | 5659/10000 [1:08:06<1:02:41, 1.15it/s, loss=0.0032, lr=1.22e-05, step=5658] Training: 57%|█████▋ | 5659/10000 [1:08:06<1:02:41, 1.15it/s, loss=0.0036, lr=1.22e-05, step=5659]17:14:14.198 [I] step=5660 loss=0.0112 smoothed_loss=0.0142 lr=1.22e-05 grad_norm=0.4477 step_time=0.6892s data_time=0.1789s it/s=1.152 eta_to_10000=3766.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0053 grad_action_out_proj=0.0604 grad_shared_expert=0.2915 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5660/10000 [1:08:07<1:10:21, 1.03it/s, loss=0.0036, lr=1.22e-05, step=5659] Training: 57%|█████▋ | 5660/10000 [1:08:07<1:10:21, 1.03it/s, loss=0.0112, lr=1.22e-05, step=5660] Training: 57%|█████▋ | 5661/10000 [1:08:08<1:11:04, 1.02it/s, loss=0.0112, lr=1.22e-05, step=5660] Training: 57%|█████▋ | 5661/10000 [1:08:08<1:11:04, 1.02it/s, loss=0.0026, lr=1.22e-05, step=5661] Training: 57%|█████▋ | 5662/10000 [1:08:09<1:14:56, 1.04s/it, loss=0.0026, lr=1.22e-05, step=5661] Training: 57%|█████▋ | 5662/10000 [1:08:09<1:14:56, 1.04s/it, loss=0.0050, lr=1.22e-05, step=5662] Training: 57%|█████▋ | 5663/10000 [1:08:10<1:15:17, 1.04s/it, loss=0.0050, lr=1.22e-05, step=5662] Training: 57%|█████▋ | 5663/10000 [1:08:10<1:15:17, 1.04s/it, loss=0.0154, lr=1.22e-05, step=5663] Training: 57%|█████▋ | 5664/10000 [1:08:11<1:14:12, 1.03s/it, loss=0.0154, lr=1.22e-05, step=5663] Training: 57%|█████▋ | 5664/10000 [1:08:11<1:14:12, 1.03s/it, loss=0.0600, lr=1.22e-05, step=5664] Training: 57%|█████▋ | 5665/10000 [1:08:12<1:11:12, 1.01it/s, loss=0.0600, lr=1.22e-05, step=5664] Training: 57%|█████▋ | 5665/10000 [1:08:12<1:11:12, 1.01it/s, loss=0.0018, lr=1.22e-05, step=5665] Training: 57%|█████▋ | 5666/10000 [1:08:13<1:07:15, 1.07it/s, loss=0.0018, lr=1.22e-05, step=5665] Training: 57%|█████▋ | 5666/10000 [1:08:13<1:07:15, 1.07it/s, loss=0.0028, lr=1.22e-05, step=5666] Training: 57%|█████▋ | 5667/10000 [1:08:14<1:07:36, 1.07it/s, loss=0.0028, lr=1.22e-05, step=5666] Training: 57%|█████▋ | 5667/10000 [1:08:14<1:07:36, 1.07it/s, loss=0.0018, lr=1.22e-05, step=5667] Training: 57%|█████▋ | 5668/10000 [1:08:15<1:03:37, 1.13it/s, loss=0.0018, lr=1.22e-05, step=5667] Training: 57%|█████▋ | 5668/10000 [1:08:15<1:03:37, 1.13it/s, loss=0.0132, lr=1.22e-05, step=5668] Training: 57%|█████▋ | 5669/10000 [1:08:16<1:06:40, 1.08it/s, loss=0.0132, lr=1.22e-05, step=5668] Training: 57%|█████▋ | 5669/10000 [1:08:16<1:06:40, 1.08it/s, loss=0.0029, lr=1.22e-05, step=5669]17:14:23.607 [I] step=5670 loss=0.0145 smoothed_loss=0.0124 lr=1.22e-05 grad_norm=0.4765 step_time=0.7165s data_time=0.2244s it/s=1.064 eta_to_10000=4070.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0212 grad_action_out_proj=0.1736 grad_shared_expert=0.6702 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5670/10000 [1:08:17<1:03:19, 1.14it/s, loss=0.0029, lr=1.22e-05, step=5669] Training: 57%|█████▋ | 5670/10000 [1:08:17<1:03:19, 1.14it/s, loss=0.0145, lr=1.22e-05, step=5670] Training: 57%|█████▋ | 5671/10000 [1:08:17<1:00:23, 1.19it/s, loss=0.0145, lr=1.22e-05, step=5670] Training: 57%|█████▋ | 5671/10000 [1:08:17<1:00:23, 1.19it/s, loss=0.0053, lr=1.22e-05, step=5671] Training: 57%|█████▋ | 5672/10000 [1:08:18<58:38, 1.23it/s, loss=0.0053, lr=1.22e-05, step=5671] Training: 57%|█████▋ | 5672/10000 [1:08:18<58:38, 1.23it/s, loss=0.0059, lr=1.22e-05, step=5672] Training: 57%|█████▋ | 5673/10000 [1:08:19<58:23, 1.24it/s, loss=0.0059, lr=1.22e-05, step=5672] Training: 57%|█████▋ | 5673/10000 [1:08:19<58:23, 1.24it/s, loss=0.0156, lr=1.22e-05, step=5673] Training: 57%|█████▋ | 5674/10000 [1:08:20<1:05:41, 1.10it/s, loss=0.0156, lr=1.22e-05, step=5673] Training: 57%|█████▋ | 5674/10000 [1:08:20<1:05:41, 1.10it/s, loss=0.0321, lr=1.22e-05, step=5674] Training: 57%|█████▋ | 5675/10000 [1:08:21<1:04:33, 1.12it/s, loss=0.0321, lr=1.22e-05, step=5674] Training: 57%|█████▋ | 5675/10000 [1:08:21<1:04:33, 1.12it/s, loss=0.0051, lr=1.22e-05, step=5675] Training: 57%|█████▋ | 5676/10000 [1:08:22<1:01:54, 1.16it/s, loss=0.0051, lr=1.22e-05, step=5675] Training: 57%|█████▋ | 5676/10000 [1:08:22<1:01:54, 1.16it/s, loss=0.0040, lr=1.22e-05, step=5676] Training: 57%|█████▋ | 5677/10000 [1:08:22<57:49, 1.25it/s, loss=0.0040, lr=1.22e-05, step=5676] Training: 57%|█████▋ | 5677/10000 [1:08:22<57:49, 1.25it/s, loss=0.0089, lr=1.22e-05, step=5677] Training: 57%|█████▋ | 5678/10000 [1:08:23<57:08, 1.26it/s, loss=0.0089, lr=1.22e-05, step=5677] Training: 57%|█████▋ | 5678/10000 [1:08:23<57:08, 1.26it/s, loss=0.0133, lr=1.22e-05, step=5678] Training: 57%|█████▋ | 5679/10000 [1:08:24<54:25, 1.32it/s, loss=0.0133, lr=1.22e-05, step=5678] Training: 57%|█████▋ | 5679/10000 [1:08:24<54:25, 1.32it/s, loss=0.0473, lr=1.22e-05, step=5679]17:14:31.625 [I] step=5680 loss=0.0098 smoothed_loss=0.0148 lr=1.22e-05 grad_norm=0.4953 step_time=0.6398s data_time=0.1619s it/s=1.247 eta_to_10000=3463.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0076 grad_action_out_proj=0.0801 grad_shared_expert=0.4385 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5680/10000 [1:08:25<55:54, 1.29it/s, loss=0.0473, lr=1.22e-05, step=5679] Training: 57%|█████▋ | 5680/10000 [1:08:25<55:54, 1.29it/s, loss=0.0098, lr=1.22e-05, step=5680] Training: 57%|█████▋ | 5681/10000 [1:08:25<55:26, 1.30it/s, loss=0.0098, lr=1.22e-05, step=5680] Training: 57%|█████▋ | 5681/10000 [1:08:25<55:26, 1.30it/s, loss=0.0060, lr=1.22e-05, step=5681] Training: 57%|█████▋ | 5682/10000 [1:08:26<56:07, 1.28it/s, loss=0.0060, lr=1.22e-05, step=5681] Training: 57%|█████▋ | 5682/10000 [1:08:26<56:07, 1.28it/s, loss=0.0148, lr=1.22e-05, step=5682] Training: 57%|█████▋ | 5683/10000 [1:08:27<1:00:24, 1.19it/s, loss=0.0148, lr=1.22e-05, step=5682] Training: 57%|█████▋ | 5683/10000 [1:08:27<1:00:24, 1.19it/s, loss=0.0027, lr=1.21e-05, step=5683] Training: 57%|█████▋ | 5684/10000 [1:08:28<57:50, 1.24it/s, loss=0.0027, lr=1.21e-05, step=5683] Training: 57%|█████▋ | 5684/10000 [1:08:28<57:50, 1.24it/s, loss=0.0317, lr=1.21e-05, step=5684] Training: 57%|█████▋ | 5685/10000 [1:08:29<57:05, 1.26it/s, loss=0.0317, lr=1.21e-05, step=5684] Training: 57%|█████▋ | 5685/10000 [1:08:29<57:05, 1.26it/s, loss=0.0095, lr=1.21e-05, step=5685] Training: 57%|█████▋ | 5686/10000 [1:08:30<58:42, 1.22it/s, loss=0.0095, lr=1.21e-05, step=5685] Training: 57%|█████▋ | 5686/10000 [1:08:30<58:42, 1.22it/s, loss=0.0099, lr=1.21e-05, step=5686] Training: 57%|█████▋ | 5687/10000 [1:08:31<1:01:32, 1.17it/s, loss=0.0099, lr=1.21e-05, step=5686] Training: 57%|█████▋ | 5687/10000 [1:08:31<1:01:32, 1.17it/s, loss=0.0108, lr=1.21e-05, step=5687] Training: 57%|█████▋ | 5688/10000 [1:08:31<1:00:04, 1.20it/s, loss=0.0108, lr=1.21e-05, step=5687] Training: 57%|█████▋ | 5688/10000 [1:08:31<1:00:04, 1.20it/s, loss=0.0296, lr=1.21e-05, step=5688] Training: 57%|█████▋ | 5689/10000 [1:08:33<1:09:34, 1.03it/s, loss=0.0296, lr=1.21e-05, step=5688] Training: 57%|█████▋ | 5689/10000 [1:08:33<1:09:34, 1.03it/s, loss=0.0160, lr=1.21e-05, step=5689]17:14:40.710 [I] step=5690 loss=0.0069 smoothed_loss=0.0144 lr=1.21e-05 grad_norm=0.4525 step_time=0.7173s data_time=0.1912s it/s=1.101 eta_to_10000=3914.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.0900 grad_shared_expert=0.5734 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5690/10000 [1:08:34<1:13:57, 1.03s/it, loss=0.0160, lr=1.21e-05, step=5689] Training: 57%|█████▋ | 5690/10000 [1:08:34<1:13:57, 1.03s/it, loss=0.0069, lr=1.21e-05, step=5690] Training: 57%|█████▋ | 5691/10000 [1:08:35<1:13:08, 1.02s/it, loss=0.0069, lr=1.21e-05, step=5690] Training: 57%|█████▋ | 5691/10000 [1:08:35<1:13:08, 1.02s/it, loss=0.0058, lr=1.21e-05, step=5691] Training: 57%|█████▋ | 5692/10000 [1:08:36<1:11:07, 1.01it/s, loss=0.0058, lr=1.21e-05, step=5691] Training: 57%|█████▋ | 5692/10000 [1:08:36<1:11:07, 1.01it/s, loss=0.0028, lr=1.21e-05, step=5692] Training: 57%|█████▋ | 5693/10000 [1:08:37<1:08:46, 1.04it/s, loss=0.0028, lr=1.21e-05, step=5692] Training: 57%|█████▋ | 5693/10000 [1:08:37<1:08:46, 1.04it/s, loss=0.0066, lr=1.21e-05, step=5693] Training: 57%|█████▋ | 5694/10000 [1:08:37<1:01:23, 1.17it/s, loss=0.0066, lr=1.21e-05, step=5693] Training: 57%|█████▋ | 5694/10000 [1:08:37<1:01:23, 1.17it/s, loss=0.0069, lr=1.21e-05, step=5694] Training: 57%|█████▋ | 5695/10000 [1:08:38<1:01:03, 1.18it/s, loss=0.0069, lr=1.21e-05, step=5694] Training: 57%|█████▋ | 5695/10000 [1:08:38<1:01:03, 1.18it/s, loss=0.0065, lr=1.21e-05, step=5695] Training: 57%|█████▋ | 5696/10000 [1:08:39<1:06:06, 1.09it/s, loss=0.0065, lr=1.21e-05, step=5695] Training: 57%|█████▋ | 5696/10000 [1:08:39<1:06:06, 1.09it/s, loss=0.0140, lr=1.21e-05, step=5696] Training: 57%|█████▋ | 5697/10000 [1:08:40<1:07:47, 1.06it/s, loss=0.0140, lr=1.21e-05, step=5696] Training: 57%|█████▋ | 5697/10000 [1:08:40<1:07:47, 1.06it/s, loss=0.0060, lr=1.21e-05, step=5697] Training: 57%|█████▋ | 5698/10000 [1:08:41<1:01:10, 1.17it/s, loss=0.0060, lr=1.21e-05, step=5697] Training: 57%|█████▋ | 5698/10000 [1:08:41<1:01:10, 1.17it/s, loss=0.0151, lr=1.21e-05, step=5698] Training: 57%|█████▋ | 5699/10000 [1:08:41<58:01, 1.24it/s, loss=0.0151, lr=1.21e-05, step=5698] Training: 57%|█████▋ | 5699/10000 [1:08:41<58:01, 1.24it/s, loss=0.0091, lr=1.21e-05, step=5699]17:14:49.137 [I] step=5700 loss=0.0714 smoothed_loss=0.0170 lr=1.21e-05 grad_norm=0.5679 step_time=0.6561s data_time=0.1867s it/s=1.187 eta_to_10000=3623.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0242 grad_action_out_proj=0.1511 grad_shared_expert=1.4243 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5700/10000 [1:08:42<56:28, 1.27it/s, loss=0.0091, lr=1.21e-05, step=5699] Training: 57%|█████▋ | 5700/10000 [1:08:42<56:28, 1.27it/s, loss=0.0714, lr=1.21e-05, step=5700] Training: 57%|█████▋ | 5701/10000 [1:08:43<56:15, 1.27it/s, loss=0.0714, lr=1.21e-05, step=5700] Training: 57%|█████▋ | 5701/10000 [1:08:43<56:15, 1.27it/s, loss=0.0043, lr=1.21e-05, step=5701] Training: 57%|█████▋ | 5702/10000 [1:08:44<1:00:22, 1.19it/s, loss=0.0043, lr=1.21e-05, step=5701] Training: 57%|█████▋ | 5702/10000 [1:08:44<1:00:22, 1.19it/s, loss=0.0127, lr=1.21e-05, step=5702] Training: 57%|█████▋ | 5703/10000 [1:08:45<1:10:55, 1.01it/s, loss=0.0127, lr=1.21e-05, step=5702] Training: 57%|█████▋ | 5703/10000 [1:08:45<1:10:55, 1.01it/s, loss=0.0020, lr=1.21e-05, step=5703] Training: 57%|█████▋ | 5704/10000 [1:08:46<1:11:12, 1.01it/s, loss=0.0020, lr=1.21e-05, step=5703] Training: 57%|█████▋ | 5704/10000 [1:08:46<1:11:12, 1.01it/s, loss=0.0052, lr=1.21e-05, step=5704] Training: 57%|█████▋ | 5705/10000 [1:08:47<1:13:09, 1.02s/it, loss=0.0052, lr=1.21e-05, step=5704] Training: 57%|█████▋ | 5705/10000 [1:08:47<1:13:09, 1.02s/it, loss=0.0049, lr=1.21e-05, step=5705] Training: 57%|█████▋ | 5706/10000 [1:08:48<1:10:43, 1.01it/s, loss=0.0049, lr=1.21e-05, step=5705] Training: 57%|█████▋ | 5706/10000 [1:08:48<1:10:43, 1.01it/s, loss=0.0121, lr=1.21e-05, step=5706] Training: 57%|█████▋ | 5707/10000 [1:08:49<1:04:18, 1.11it/s, loss=0.0121, lr=1.21e-05, step=5706] Training: 57%|█████▋ | 5707/10000 [1:08:49<1:04:18, 1.11it/s, loss=0.0057, lr=1.21e-05, step=5707] Training: 57%|█████▋ | 5708/10000 [1:08:50<1:01:30, 1.16it/s, loss=0.0057, lr=1.21e-05, step=5707] Training: 57%|█████▋ | 5708/10000 [1:08:50<1:01:30, 1.16it/s, loss=0.0028, lr=1.21e-05, step=5708] Training: 57%|█████▋ | 5709/10000 [1:08:51<1:03:29, 1.13it/s, loss=0.0028, lr=1.21e-05, step=5708] Training: 57%|█████▋ | 5709/10000 [1:08:51<1:03:29, 1.13it/s, loss=0.0188, lr=1.21e-05, step=5709]17:14:58.971 [I] step=5710 loss=0.0420 smoothed_loss=0.0146 lr=1.21e-05 grad_norm=0.4702 step_time=0.7319s data_time=0.2516s it/s=1.017 eta_to_10000=4218.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0290 grad_action_out_proj=0.1758 grad_shared_expert=0.6973 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5710/10000 [1:08:52<1:12:58, 1.02s/it, loss=0.0188, lr=1.21e-05, step=5709] Training: 57%|█████▋ | 5710/10000 [1:08:52<1:12:58, 1.02s/it, loss=0.0420, lr=1.20e-05, step=5710] Training: 57%|█████▋ | 5711/10000 [1:08:53<1:10:09, 1.02it/s, loss=0.0420, lr=1.20e-05, step=5710] Training: 57%|█████▋ | 5711/10000 [1:08:53<1:10:09, 1.02it/s, loss=0.0117, lr=1.20e-05, step=5711] Training: 57%|█████▋ | 5712/10000 [1:08:54<1:11:57, 1.01s/it, loss=0.0117, lr=1.20e-05, step=5711] Training: 57%|█████▋ | 5712/10000 [1:08:54<1:11:57, 1.01s/it, loss=0.0017, lr=1.20e-05, step=5712] Training: 57%|█████▋ | 5713/10000 [1:08:55<1:08:21, 1.05it/s, loss=0.0017, lr=1.20e-05, step=5712] Training: 57%|█████▋ | 5713/10000 [1:08:55<1:08:21, 1.05it/s, loss=0.0069, lr=1.20e-05, step=5713] Training: 57%|█████▋ | 5714/10000 [1:08:56<1:05:03, 1.10it/s, loss=0.0069, lr=1.20e-05, step=5713] Training: 57%|█████▋ | 5714/10000 [1:08:56<1:05:03, 1.10it/s, loss=0.0047, lr=1.20e-05, step=5714] Training: 57%|█████▋ | 5715/10000 [1:08:56<1:03:41, 1.12it/s, loss=0.0047, lr=1.20e-05, step=5714] Training: 57%|█████▋ | 5715/10000 [1:08:56<1:03:41, 1.12it/s, loss=0.0085, lr=1.20e-05, step=5715] Training: 57%|█████▋ | 5716/10000 [1:08:57<1:02:54, 1.14it/s, loss=0.0085, lr=1.20e-05, step=5715] Training: 57%|█████▋ | 5716/10000 [1:08:57<1:02:54, 1.14it/s, loss=0.0091, lr=1.20e-05, step=5716] Training: 57%|█████▋ | 5717/10000 [1:08:58<1:05:12, 1.09it/s, loss=0.0091, lr=1.20e-05, step=5716] Training: 57%|█████▋ | 5717/10000 [1:08:58<1:05:12, 1.09it/s, loss=0.0229, lr=1.20e-05, step=5717] Training: 57%|█████▋ | 5718/10000 [1:08:59<1:03:49, 1.12it/s, loss=0.0229, lr=1.20e-05, step=5717] Training: 57%|█████▋ | 5718/10000 [1:08:59<1:03:49, 1.12it/s, loss=0.0058, lr=1.20e-05, step=5718] Training: 57%|█████▋ | 5719/10000 [1:09:00<1:08:35, 1.04it/s, loss=0.0058, lr=1.20e-05, step=5718] Training: 57%|█████▋ | 5719/10000 [1:09:00<1:08:35, 1.04it/s, loss=0.0364, lr=1.20e-05, step=5719]17:15:07.941 [I] step=5720 loss=0.0140 smoothed_loss=0.0141 lr=1.20e-05 grad_norm=0.5149 step_time=0.6918s data_time=0.2052s it/s=1.115 eta_to_10000=3838.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0201 grad_action_out_proj=0.1572 grad_shared_expert=0.6217 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5720/10000 [1:09:01<1:03:13, 1.13it/s, loss=0.0364, lr=1.20e-05, step=5719] Training: 57%|█████▋ | 5720/10000 [1:09:01<1:03:13, 1.13it/s, loss=0.0140, lr=1.20e-05, step=5720] Training: 57%|█████▋ | 5721/10000 [1:09:02<56:06, 1.27it/s, loss=0.0140, lr=1.20e-05, step=5720] Training: 57%|█████▋ | 5721/10000 [1:09:02<56:06, 1.27it/s, loss=0.0087, lr=1.20e-05, step=5721] Training: 57%|█████▋ | 5722/10000 [1:09:02<55:15, 1.29it/s, loss=0.0087, lr=1.20e-05, step=5721] Training: 57%|█████▋ | 5722/10000 [1:09:02<55:15, 1.29it/s, loss=0.0321, lr=1.20e-05, step=5722] Training: 57%|█████▋ | 5723/10000 [1:09:03<53:08, 1.34it/s, loss=0.0321, lr=1.20e-05, step=5722] Training: 57%|█████▋ | 5723/10000 [1:09:03<53:08, 1.34it/s, loss=0.0098, lr=1.20e-05, step=5723] Training: 57%|█████▋ | 5724/10000 [1:09:04<56:55, 1.25it/s, loss=0.0098, lr=1.20e-05, step=5723] Training: 57%|█████▋ | 5724/10000 [1:09:04<56:55, 1.25it/s, loss=0.0106, lr=1.20e-05, step=5724] Training: 57%|█████▋ | 5725/10000 [1:09:05<53:44, 1.33it/s, loss=0.0106, lr=1.20e-05, step=5724] Training: 57%|█████▋ | 5725/10000 [1:09:05<53:44, 1.33it/s, loss=0.0073, lr=1.20e-05, step=5725] Training: 57%|█████▋ | 5726/10000 [1:09:06<1:00:25, 1.18it/s, loss=0.0073, lr=1.20e-05, step=5725] Training: 57%|█████▋ | 5726/10000 [1:09:06<1:00:25, 1.18it/s, loss=0.0024, lr=1.20e-05, step=5726] Training: 57%|█████▋ | 5727/10000 [1:09:06<55:46, 1.28it/s, loss=0.0024, lr=1.20e-05, step=5726] Training: 57%|█████▋ | 5727/10000 [1:09:06<55:46, 1.28it/s, loss=0.0027, lr=1.20e-05, step=5727] Training: 57%|█████▋ | 5728/10000 [1:09:07<54:06, 1.32it/s, loss=0.0027, lr=1.20e-05, step=5727] Training: 57%|█████▋ | 5728/10000 [1:09:07<54:06, 1.32it/s, loss=0.0521, lr=1.20e-05, step=5728] Training: 57%|█████▋ | 5729/10000 [1:09:08<53:44, 1.32it/s, loss=0.0521, lr=1.20e-05, step=5728] Training: 57%|█████▋ | 5729/10000 [1:09:08<53:44, 1.32it/s, loss=0.0852, lr=1.20e-05, step=5729]17:15:15.219 [I] step=5730 loss=0.0297 smoothed_loss=0.0233 lr=1.20e-05 grad_norm=0.4969 step_time=0.5996s data_time=0.1281s it/s=1.374 eta_to_10000=3107.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0254 grad_action_out_proj=0.2105 grad_shared_expert=0.6594 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5730/10000 [1:09:08<50:02, 1.42it/s, loss=0.0852, lr=1.20e-05, step=5729] Training: 57%|█████▋ | 5730/10000 [1:09:08<50:02, 1.42it/s, loss=0.0297, lr=1.20e-05, step=5730] Training: 57%|█████▋ | 5731/10000 [1:09:09<50:27, 1.41it/s, loss=0.0297, lr=1.20e-05, step=5730] Training: 57%|█████▋ | 5731/10000 [1:09:09<50:27, 1.41it/s, loss=0.0158, lr=1.20e-05, step=5731] Training: 57%|█████▋ | 5732/10000 [1:09:10<54:04, 1.32it/s, loss=0.0158, lr=1.20e-05, step=5731] Training: 57%|█████▋ | 5732/10000 [1:09:10<54:04, 1.32it/s, loss=0.0047, lr=1.20e-05, step=5732] Training: 57%|█████▋ | 5733/10000 [1:09:11<1:00:57, 1.17it/s, loss=0.0047, lr=1.20e-05, step=5732] Training: 57%|█████▋ | 5733/10000 [1:09:11<1:00:57, 1.17it/s, loss=0.0100, lr=1.20e-05, step=5733] Training: 57%|█████▋ | 5734/10000 [1:09:12<1:03:24, 1.12it/s, loss=0.0100, lr=1.20e-05, step=5733] Training: 57%|█████▋ | 5734/10000 [1:09:12<1:03:24, 1.12it/s, loss=0.0102, lr=1.20e-05, step=5734] Training: 57%|█████▋ | 5735/10000 [1:09:12<56:08, 1.27it/s, loss=0.0102, lr=1.20e-05, step=5734] Training: 57%|█████▋ | 5735/10000 [1:09:12<56:08, 1.27it/s, loss=0.0079, lr=1.20e-05, step=5735] Training: 57%|█████▋ | 5736/10000 [1:09:13<54:48, 1.30it/s, loss=0.0079, lr=1.20e-05, step=5735] Training: 57%|█████▋ | 5736/10000 [1:09:13<54:48, 1.30it/s, loss=0.0082, lr=1.20e-05, step=5736] Training: 57%|█████▋ | 5737/10000 [1:09:14<54:59, 1.29it/s, loss=0.0082, lr=1.20e-05, step=5736] Training: 57%|█████▋ | 5737/10000 [1:09:14<54:59, 1.29it/s, loss=0.0085, lr=1.19e-05, step=5737] Training: 57%|█████▋ | 5738/10000 [1:09:15<56:14, 1.26it/s, loss=0.0085, lr=1.19e-05, step=5737] Training: 57%|█████▋ | 5738/10000 [1:09:15<56:14, 1.26it/s, loss=0.0201, lr=1.19e-05, step=5738] Training: 57%|█████▋ | 5739/10000 [1:09:15<52:20, 1.36it/s, loss=0.0201, lr=1.19e-05, step=5738] Training: 57%|█████▋ | 5739/10000 [1:09:15<52:20, 1.36it/s, loss=0.0030, lr=1.19e-05, step=5739]17:15:23.564 [I] step=5740 loss=0.0147 smoothed_loss=0.0150 lr=1.20e-05 grad_norm=0.4206 step_time=0.6741s data_time=0.1607s it/s=1.199 eta_to_10000=3552.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0120 grad_action_out_proj=0.1070 grad_shared_expert=0.5630 (10775:train_pytorch.py:850) + Training: 57%|█████▋ | 5740/10000 [1:09:17<1:01:45, 1.15it/s, loss=0.0030, lr=1.19e-05, step=5739] Training: 57%|█████▋ | 5740/10000 [1:09:17<1:01:45, 1.15it/s, loss=0.0147, lr=1.19e-05, step=5740] Training: 57%|█████▋ | 5741/10000 [1:09:17<59:14, 1.20it/s, loss=0.0147, lr=1.19e-05, step=5740] Training: 57%|█████▋ | 5741/10000 [1:09:17<59:14, 1.20it/s, loss=0.0026, lr=1.19e-05, step=5741] Training: 57%|█████▋ | 5742/10000 [1:09:18<54:32, 1.30it/s, loss=0.0026, lr=1.19e-05, step=5741] Training: 57%|█████▋ | 5742/10000 [1:09:18<54:32, 1.30it/s, loss=0.0058, lr=1.19e-05, step=5742] Training: 57%|█████▋ | 5743/10000 [1:09:19<55:44, 1.27it/s, loss=0.0058, lr=1.19e-05, step=5742] Training: 57%|█████▋ | 5743/10000 [1:09:19<55:44, 1.27it/s, loss=0.0119, lr=1.19e-05, step=5743] Training: 57%|█████▋ | 5744/10000 [1:09:20<56:26, 1.26it/s, loss=0.0119, lr=1.19e-05, step=5743] Training: 57%|█████▋ | 5744/10000 [1:09:20<56:26, 1.26it/s, loss=0.0014, lr=1.19e-05, step=5744] Training: 57%|█████▋ | 5745/10000 [1:09:20<56:13, 1.26it/s, loss=0.0014, lr=1.19e-05, step=5744] Training: 57%|█████▋ | 5745/10000 [1:09:20<56:13, 1.26it/s, loss=0.0043, lr=1.19e-05, step=5745] Training: 57%|█████▋ | 5746/10000 [1:09:21<52:14, 1.36it/s, loss=0.0043, lr=1.19e-05, step=5745] Training: 57%|█████▋ | 5746/10000 [1:09:21<52:14, 1.36it/s, loss=0.0051, lr=1.19e-05, step=5746] Training: 57%|█████▋ | 5747/10000 [1:09:22<56:47, 1.25it/s, loss=0.0051, lr=1.19e-05, step=5746] Training: 57%|█████▋ | 5747/10000 [1:09:22<56:47, 1.25it/s, loss=0.0073, lr=1.19e-05, step=5747] Training: 57%|█████▋ | 5748/10000 [1:09:23<1:01:02, 1.16it/s, loss=0.0073, lr=1.19e-05, step=5747] Training: 57%|█████▋ | 5748/10000 [1:09:23<1:01:02, 1.16it/s, loss=0.0134, lr=1.19e-05, step=5748] Training: 57%|█████▋ | 5749/10000 [1:09:24<54:51, 1.29it/s, loss=0.0134, lr=1.19e-05, step=5748] Training: 57%|█████▋ | 5749/10000 [1:09:24<54:51, 1.29it/s, loss=0.0105, lr=1.19e-05, step=5749]17:15:31.074 [I] step=5750 loss=0.0047 smoothed_loss=0.0098 lr=1.19e-05 grad_norm=0.4146 step_time=0.6152s data_time=0.1356s it/s=1.332 eta_to_10000=3191.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0204 grad_action_out_proj=0.1174 grad_shared_expert=0.6485 (10775:train_pytorch.py:850) + Training: 57%|█████▊ | 5750/10000 [1:09:24<50:45, 1.40it/s, loss=0.0105, lr=1.19e-05, step=5749] Training: 57%|█████▊ | 5750/10000 [1:09:24<50:45, 1.40it/s, loss=0.0047, lr=1.19e-05, step=5750] Training: 58%|█████▊ | 5751/10000 [1:09:25<47:47, 1.48it/s, loss=0.0047, lr=1.19e-05, step=5750] Training: 58%|█████▊ | 5751/10000 [1:09:25<47:47, 1.48it/s, loss=0.0092, lr=1.19e-05, step=5751] Training: 58%|█████▊ | 5752/10000 [1:09:26<50:52, 1.39it/s, loss=0.0092, lr=1.19e-05, step=5751] Training: 58%|█████▊ | 5752/10000 [1:09:26<50:52, 1.39it/s, loss=0.0176, lr=1.19e-05, step=5752] Training: 58%|█████▊ | 5753/10000 [1:09:26<51:35, 1.37it/s, loss=0.0176, lr=1.19e-05, step=5752] Training: 58%|█████▊ | 5753/10000 [1:09:26<51:35, 1.37it/s, loss=0.0028, lr=1.19e-05, step=5753] Training: 58%|█████▊ | 5754/10000 [1:09:27<48:53, 1.45it/s, loss=0.0028, lr=1.19e-05, step=5753] Training: 58%|█████▊ | 5754/10000 [1:09:27<48:53, 1.45it/s, loss=0.0158, lr=1.19e-05, step=5754] Training: 58%|█████▊ | 5755/10000 [1:09:28<56:36, 1.25it/s, loss=0.0158, lr=1.19e-05, step=5754] Training: 58%|█████▊ | 5755/10000 [1:09:28<56:36, 1.25it/s, loss=0.0104, lr=1.19e-05, step=5755] Training: 58%|█████▊ | 5756/10000 [1:09:29<56:01, 1.26it/s, loss=0.0104, lr=1.19e-05, step=5755] Training: 58%|█████▊ | 5756/10000 [1:09:29<56:01, 1.26it/s, loss=0.0023, lr=1.19e-05, step=5756] Training: 58%|█████▊ | 5757/10000 [1:09:30<56:39, 1.25it/s, loss=0.0023, lr=1.19e-05, step=5756] Training: 58%|█████▊ | 5757/10000 [1:09:30<56:39, 1.25it/s, loss=0.0045, lr=1.19e-05, step=5757] Training: 58%|█████▊ | 5758/10000 [1:09:30<58:30, 1.21it/s, loss=0.0045, lr=1.19e-05, step=5757] Training: 58%|█████▊ | 5758/10000 [1:09:30<58:30, 1.21it/s, loss=0.0171, lr=1.19e-05, step=5758] Training: 58%|█████▊ | 5759/10000 [1:09:31<54:41, 1.29it/s, loss=0.0171, lr=1.19e-05, step=5758] Training: 58%|█████▊ | 5759/10000 [1:09:31<54:41, 1.29it/s, loss=0.0151, lr=1.19e-05, step=5759]17:15:38.607 [I] step=5760 loss=0.0105 smoothed_loss=0.0104 lr=1.19e-05 grad_norm=0.3792 step_time=0.5963s data_time=0.1570s it/s=1.328 eta_to_10000=3193.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0100 grad_action_out_proj=0.1265 grad_shared_expert=0.5221 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5760/10000 [1:09:32<50:49, 1.39it/s, loss=0.0151, lr=1.19e-05, step=5759] Training: 58%|█████▊ | 5760/10000 [1:09:32<50:49, 1.39it/s, loss=0.0105, lr=1.19e-05, step=5760] Training: 58%|█████▊ | 5761/10000 [1:09:32<52:18, 1.35it/s, loss=0.0105, lr=1.19e-05, step=5760] Training: 58%|█████▊ | 5761/10000 [1:09:32<52:18, 1.35it/s, loss=0.0049, lr=1.19e-05, step=5761] Training: 58%|█████▊ | 5762/10000 [1:09:34<1:01:35, 1.15it/s, loss=0.0049, lr=1.19e-05, step=5761] Training: 58%|█████▊ | 5762/10000 [1:09:34<1:01:35, 1.15it/s, loss=0.0374, lr=1.19e-05, step=5762] Training: 58%|█████▊ | 5763/10000 [1:09:34<1:00:30, 1.17it/s, loss=0.0374, lr=1.19e-05, step=5762] Training: 58%|█████▊ | 5763/10000 [1:09:34<1:00:30, 1.17it/s, loss=0.0037, lr=1.19e-05, step=5763] Training: 58%|█████▊ | 5764/10000 [1:09:35<59:13, 1.19it/s, loss=0.0037, lr=1.19e-05, step=5763] Training: 58%|█████▊ | 5764/10000 [1:09:35<59:13, 1.19it/s, loss=0.0127, lr=1.19e-05, step=5764] Training: 58%|█████▊ | 5765/10000 [1:09:36<56:51, 1.24it/s, loss=0.0127, lr=1.19e-05, step=5764] Training: 58%|█████▊ | 5765/10000 [1:09:36<56:51, 1.24it/s, loss=0.0051, lr=1.18e-05, step=5765] Training: 58%|█████▊ | 5766/10000 [1:09:37<54:59, 1.28it/s, loss=0.0051, lr=1.18e-05, step=5765] Training: 58%|█████▊ | 5766/10000 [1:09:37<54:59, 1.28it/s, loss=0.0196, lr=1.18e-05, step=5766] Training: 58%|█████▊ | 5767/10000 [1:09:37<55:10, 1.28it/s, loss=0.0196, lr=1.18e-05, step=5766] Training: 58%|█████▊ | 5767/10000 [1:09:37<55:10, 1.28it/s, loss=0.0156, lr=1.18e-05, step=5767] Training: 58%|█████▊ | 5768/10000 [1:09:38<58:24, 1.21it/s, loss=0.0156, lr=1.18e-05, step=5767] Training: 58%|█████▊ | 5768/10000 [1:09:38<58:24, 1.21it/s, loss=0.0053, lr=1.18e-05, step=5768] Training: 58%|█████▊ | 5769/10000 [1:09:39<1:00:55, 1.16it/s, loss=0.0053, lr=1.18e-05, step=5768] Training: 58%|█████▊ | 5769/10000 [1:09:39<1:00:55, 1.16it/s, loss=0.0055, lr=1.18e-05, step=5769]17:15:47.445 [I] step=5770 loss=0.1741 smoothed_loss=0.0273 lr=1.18e-05 grad_norm=0.4321 step_time=0.7013s data_time=0.1826s it/s=1.132 eta_to_10000=3737.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0303 grad_action_out_proj=0.2007 grad_shared_expert=0.5700 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5770/10000 [1:09:41<1:06:35, 1.06it/s, loss=0.0055, lr=1.18e-05, step=5769] Training: 58%|█████▊ | 5770/10000 [1:09:41<1:06:35, 1.06it/s, loss=0.1741, lr=1.18e-05, step=5770] Training: 58%|█████▊ | 5771/10000 [1:09:41<1:01:32, 1.15it/s, loss=0.1741, lr=1.18e-05, step=5770] Training: 58%|█████▊ | 5771/10000 [1:09:41<1:01:32, 1.15it/s, loss=0.0107, lr=1.18e-05, step=5771] Training: 58%|█████▊ | 5772/10000 [1:09:42<55:55, 1.26it/s, loss=0.0107, lr=1.18e-05, step=5771] Training: 58%|█████▊ | 5772/10000 [1:09:42<55:55, 1.26it/s, loss=0.0113, lr=1.18e-05, step=5772] Training: 58%|█████▊ | 5773/10000 [1:09:42<51:48, 1.36it/s, loss=0.0113, lr=1.18e-05, step=5772] Training: 58%|█████▊ | 5773/10000 [1:09:42<51:48, 1.36it/s, loss=0.0138, lr=1.18e-05, step=5773] Training: 58%|█████▊ | 5774/10000 [1:09:43<52:13, 1.35it/s, loss=0.0138, lr=1.18e-05, step=5773] Training: 58%|█████▊ | 5774/10000 [1:09:43<52:13, 1.35it/s, loss=0.0314, lr=1.18e-05, step=5774] Training: 58%|█████▊ | 5775/10000 [1:09:44<53:35, 1.31it/s, loss=0.0314, lr=1.18e-05, step=5774] Training: 58%|█████▊ | 5775/10000 [1:09:44<53:35, 1.31it/s, loss=0.0190, lr=1.18e-05, step=5775] Training: 58%|█████▊ | 5776/10000 [1:09:45<57:13, 1.23it/s, loss=0.0190, lr=1.18e-05, step=5775] Training: 58%|█████▊ | 5776/10000 [1:09:45<57:13, 1.23it/s, loss=0.0016, lr=1.18e-05, step=5776] Training: 58%|█████▊ | 5777/10000 [1:09:46<59:50, 1.18it/s, loss=0.0016, lr=1.18e-05, step=5776] Training: 58%|█████▊ | 5777/10000 [1:09:46<59:50, 1.18it/s, loss=0.0076, lr=1.18e-05, step=5777] Training: 58%|█████▊ | 5778/10000 [1:09:46<54:39, 1.29it/s, loss=0.0076, lr=1.18e-05, step=5777] Training: 58%|█████▊ | 5778/10000 [1:09:46<54:39, 1.29it/s, loss=0.0331, lr=1.18e-05, step=5778] Training: 58%|█████▊ | 5779/10000 [1:09:47<55:55, 1.26it/s, loss=0.0331, lr=1.18e-05, step=5778] Training: 58%|█████▊ | 5779/10000 [1:09:47<55:55, 1.26it/s, loss=0.0245, lr=1.18e-05, step=5779]17:15:55.137 [I] step=5780 loss=0.0161 smoothed_loss=0.0210 lr=1.18e-05 grad_norm=0.5330 step_time=0.6171s data_time=0.1522s it/s=1.300 eta_to_10000=3245.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0358 grad_action_out_proj=0.1539 grad_shared_expert=0.5869 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5780/10000 [1:09:48<58:09, 1.21it/s, loss=0.0245, lr=1.18e-05, step=5779] Training: 58%|█████▊ | 5780/10000 [1:09:48<58:09, 1.21it/s, loss=0.0161, lr=1.18e-05, step=5780] Training: 58%|█████▊ | 5781/10000 [1:09:49<57:51, 1.22it/s, loss=0.0161, lr=1.18e-05, step=5780] Training: 58%|█████▊ | 5781/10000 [1:09:49<57:51, 1.22it/s, loss=0.0026, lr=1.18e-05, step=5781] Training: 58%|█████▊ | 5782/10000 [1:09:50<56:52, 1.24it/s, loss=0.0026, lr=1.18e-05, step=5781] Training: 58%|█████▊ | 5782/10000 [1:09:50<56:52, 1.24it/s, loss=0.0059, lr=1.18e-05, step=5782] Training: 58%|█████▊ | 5783/10000 [1:09:51<58:41, 1.20it/s, loss=0.0059, lr=1.18e-05, step=5782] Training: 58%|█████▊ | 5783/10000 [1:09:51<58:41, 1.20it/s, loss=0.0017, lr=1.18e-05, step=5783] Training: 58%|█████▊ | 5784/10000 [1:09:51<56:45, 1.24it/s, loss=0.0017, lr=1.18e-05, step=5783] Training: 58%|█████▊ | 5784/10000 [1:09:51<56:45, 1.24it/s, loss=0.0042, lr=1.18e-05, step=5784] Training: 58%|█████▊ | 5785/10000 [1:09:52<52:09, 1.35it/s, loss=0.0042, lr=1.18e-05, step=5784] Training: 58%|█████▊ | 5785/10000 [1:09:52<52:09, 1.35it/s, loss=0.0033, lr=1.18e-05, step=5785] Training: 58%|█████▊ | 5786/10000 [1:09:53<48:49, 1.44it/s, loss=0.0033, lr=1.18e-05, step=5785] Training: 58%|█████▊ | 5786/10000 [1:09:53<48:49, 1.44it/s, loss=0.0261, lr=1.18e-05, step=5786] Training: 58%|█████▊ | 5787/10000 [1:09:53<48:30, 1.45it/s, loss=0.0261, lr=1.18e-05, step=5786] Training: 58%|█████▊ | 5787/10000 [1:09:53<48:30, 1.45it/s, loss=0.0038, lr=1.18e-05, step=5787] Training: 58%|█████▊ | 5788/10000 [1:09:54<52:40, 1.33it/s, loss=0.0038, lr=1.18e-05, step=5787] Training: 58%|█████▊ | 5788/10000 [1:09:54<52:40, 1.33it/s, loss=0.0064, lr=1.18e-05, step=5788] Training: 58%|█████▊ | 5789/10000 [1:09:55<55:52, 1.26it/s, loss=0.0064, lr=1.18e-05, step=5788] Training: 58%|█████▊ | 5789/10000 [1:09:55<55:52, 1.26it/s, loss=0.0310, lr=1.18e-05, step=5789]17:16:03.130 [I] step=5790 loss=0.0019 smoothed_loss=0.0137 lr=1.18e-05 grad_norm=0.4051 step_time=0.6354s data_time=0.1638s it/s=1.251 eta_to_10000=3364.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0099 grad_action_out_proj=0.1046 grad_shared_expert=0.5195 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5790/10000 [1:09:56<1:02:34, 1.12it/s, loss=0.0310, lr=1.18e-05, step=5789] Training: 58%|█████▊ | 5790/10000 [1:09:56<1:02:34, 1.12it/s, loss=0.0019, lr=1.18e-05, step=5790] Training: 58%|█████▊ | 5791/10000 [1:09:57<1:07:11, 1.04it/s, loss=0.0019, lr=1.18e-05, step=5790] Training: 58%|█████▊ | 5791/10000 [1:09:57<1:07:11, 1.04it/s, loss=0.0159, lr=1.18e-05, step=5791] Training: 58%|█████▊ | 5792/10000 [1:09:58<1:04:22, 1.09it/s, loss=0.0159, lr=1.18e-05, step=5791] Training: 58%|█████▊ | 5792/10000 [1:09:58<1:04:22, 1.09it/s, loss=0.0131, lr=1.17e-05, step=5792] Training: 58%|█████▊ | 5793/10000 [1:09:59<1:02:31, 1.12it/s, loss=0.0131, lr=1.17e-05, step=5792] Training: 58%|█████▊ | 5793/10000 [1:09:59<1:02:31, 1.12it/s, loss=0.0026, lr=1.17e-05, step=5793] Training: 58%|█████▊ | 5794/10000 [1:09:59<54:37, 1.28it/s, loss=0.0026, lr=1.17e-05, step=5793] Training: 58%|█████▊ | 5794/10000 [1:09:59<54:37, 1.28it/s, loss=0.0070, lr=1.17e-05, step=5794] Training: 58%|█████▊ | 5795/10000 [1:10:00<49:52, 1.41it/s, loss=0.0070, lr=1.17e-05, step=5794] Training: 58%|█████▊ | 5795/10000 [1:10:00<49:52, 1.41it/s, loss=0.0030, lr=1.17e-05, step=5795] Training: 58%|█████▊ | 5796/10000 [1:10:01<52:32, 1.33it/s, loss=0.0030, lr=1.17e-05, step=5795] Training: 58%|█████▊ | 5796/10000 [1:10:01<52:32, 1.33it/s, loss=0.0039, lr=1.17e-05, step=5796] Training: 58%|█████▊ | 5797/10000 [1:10:02<58:53, 1.19it/s, loss=0.0039, lr=1.17e-05, step=5796] Training: 58%|█████▊ | 5797/10000 [1:10:02<58:53, 1.19it/s, loss=0.0081, lr=1.17e-05, step=5797] Training: 58%|█████▊ | 5798/10000 [1:10:03<1:03:57, 1.10it/s, loss=0.0081, lr=1.17e-05, step=5797] Training: 58%|█████▊ | 5798/10000 [1:10:03<1:03:57, 1.10it/s, loss=0.0075, lr=1.17e-05, step=5798] Training: 58%|█████▊ | 5799/10000 [1:10:04<1:04:03, 1.09it/s, loss=0.0075, lr=1.17e-05, step=5798] Training: 58%|█████▊ | 5799/10000 [1:10:04<1:04:03, 1.09it/s, loss=0.0059, lr=1.17e-05, step=5799]17:16:11.541 [I] step=5800 loss=0.0052 smoothed_loss=0.0091 lr=1.17e-05 grad_norm=0.4118 step_time=0.6809s data_time=0.1604s it/s=1.189 eta_to_10000=3532.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.1271 grad_shared_expert=0.3924 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5800/10000 [1:10:05<59:06, 1.18it/s, loss=0.0059, lr=1.17e-05, step=5799] Training: 58%|█████▊ | 5800/10000 [1:10:05<59:06, 1.18it/s, loss=0.0052, lr=1.17e-05, step=5800] Training: 58%|█████▊ | 5801/10000 [1:10:05<54:06, 1.29it/s, loss=0.0052, lr=1.17e-05, step=5800] Training: 58%|█████▊ | 5801/10000 [1:10:05<54:06, 1.29it/s, loss=0.0072, lr=1.17e-05, step=5801] Training: 58%|█████▊ | 5802/10000 [1:10:06<50:11, 1.39it/s, loss=0.0072, lr=1.17e-05, step=5801] Training: 58%|█████▊ | 5802/10000 [1:10:06<50:11, 1.39it/s, loss=0.0128, lr=1.17e-05, step=5802] Training: 58%|█████▊ | 5803/10000 [1:10:06<49:08, 1.42it/s, loss=0.0128, lr=1.17e-05, step=5802] Training: 58%|█████▊ | 5803/10000 [1:10:06<49:08, 1.42it/s, loss=0.0080, lr=1.17e-05, step=5803] Training: 58%|█████▊ | 5804/10000 [1:10:07<49:27, 1.41it/s, loss=0.0080, lr=1.17e-05, step=5803] Training: 58%|█████▊ | 5804/10000 [1:10:07<49:27, 1.41it/s, loss=0.0248, lr=1.17e-05, step=5804] Training: 58%|█████▊ | 5805/10000 [1:10:08<54:11, 1.29it/s, loss=0.0248, lr=1.17e-05, step=5804] Training: 58%|█████▊ | 5805/10000 [1:10:08<54:11, 1.29it/s, loss=0.0031, lr=1.17e-05, step=5805] Training: 58%|█████▊ | 5806/10000 [1:10:09<49:51, 1.40it/s, loss=0.0031, lr=1.17e-05, step=5805] Training: 58%|█████▊ | 5806/10000 [1:10:09<49:51, 1.40it/s, loss=0.0145, lr=1.17e-05, step=5806] Training: 58%|█████▊ | 5807/10000 [1:10:10<52:30, 1.33it/s, loss=0.0145, lr=1.17e-05, step=5806] Training: 58%|█████▊ | 5807/10000 [1:10:10<52:30, 1.33it/s, loss=0.0287, lr=1.17e-05, step=5807] Training: 58%|█████▊ | 5808/10000 [1:10:10<52:56, 1.32it/s, loss=0.0287, lr=1.17e-05, step=5807] Training: 58%|█████▊ | 5808/10000 [1:10:10<52:56, 1.32it/s, loss=0.0124, lr=1.17e-05, step=5808] Training: 58%|█████▊ | 5809/10000 [1:10:11<55:03, 1.27it/s, loss=0.0124, lr=1.17e-05, step=5808] Training: 58%|█████▊ | 5809/10000 [1:10:11<55:03, 1.27it/s, loss=0.0181, lr=1.17e-05, step=5809]17:16:18.727 [I] step=5810 loss=0.0186 smoothed_loss=0.0134 lr=1.17e-05 grad_norm=0.4328 step_time=0.5782s data_time=0.1402s it/s=1.392 eta_to_10000=3010.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0104 grad_action_out_proj=0.1039 grad_shared_expert=0.3722 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5810/10000 [1:10:12<51:44, 1.35it/s, loss=0.0181, lr=1.17e-05, step=5809] Training: 58%|█████▊ | 5810/10000 [1:10:12<51:44, 1.35it/s, loss=0.0186, lr=1.17e-05, step=5810] Training: 58%|█████▊ | 5811/10000 [1:10:12<47:27, 1.47it/s, loss=0.0186, lr=1.17e-05, step=5810] Training: 58%|█████▊ | 5811/10000 [1:10:12<47:27, 1.47it/s, loss=0.0049, lr=1.17e-05, step=5811] Training: 58%|█████▊ | 5812/10000 [1:10:13<53:03, 1.32it/s, loss=0.0049, lr=1.17e-05, step=5811] Training: 58%|█████▊ | 5812/10000 [1:10:13<53:03, 1.32it/s, loss=0.0055, lr=1.17e-05, step=5812] Training: 58%|█████▊ | 5813/10000 [1:10:14<51:22, 1.36it/s, loss=0.0055, lr=1.17e-05, step=5812] Training: 58%|█████▊ | 5813/10000 [1:10:14<51:22, 1.36it/s, loss=0.0015, lr=1.17e-05, step=5813] Training: 58%|█████▊ | 5814/10000 [1:10:15<49:18, 1.41it/s, loss=0.0015, lr=1.17e-05, step=5813] Training: 58%|█████▊ | 5814/10000 [1:10:15<49:18, 1.41it/s, loss=0.0513, lr=1.17e-05, step=5814] Training: 58%|█████▊ | 5815/10000 [1:10:15<51:36, 1.35it/s, loss=0.0513, lr=1.17e-05, step=5814] Training: 58%|█████▊ | 5815/10000 [1:10:15<51:36, 1.35it/s, loss=0.0175, lr=1.17e-05, step=5815] Training: 58%|█████▊ | 5816/10000 [1:10:16<53:54, 1.29it/s, loss=0.0175, lr=1.17e-05, step=5815] Training: 58%|█████▊ | 5816/10000 [1:10:16<53:54, 1.29it/s, loss=0.0041, lr=1.17e-05, step=5816] Training: 58%|█████▊ | 5817/10000 [1:10:17<50:00, 1.39it/s, loss=0.0041, lr=1.17e-05, step=5816] Training: 58%|█████▊ | 5817/10000 [1:10:17<50:00, 1.39it/s, loss=0.0045, lr=1.17e-05, step=5817] Training: 58%|█████▊ | 5818/10000 [1:10:17<46:45, 1.49it/s, loss=0.0045, lr=1.17e-05, step=5817] Training: 58%|█████▊ | 5818/10000 [1:10:17<46:45, 1.49it/s, loss=0.0061, lr=1.17e-05, step=5818] Training: 58%|█████▊ | 5819/10000 [1:10:18<53:46, 1.30it/s, loss=0.0061, lr=1.17e-05, step=5818] Training: 58%|█████▊ | 5819/10000 [1:10:18<53:46, 1.30it/s, loss=0.0054, lr=1.16e-05, step=5819]17:16:26.398 [I] step=5820 loss=0.0068 smoothed_loss=0.0112 lr=1.17e-05 grad_norm=0.4745 step_time=0.6461s data_time=0.1209s it/s=1.304 eta_to_10000=3206.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1072 grad_shared_expert=0.3135 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5820/10000 [1:10:19<59:27, 1.17it/s, loss=0.0054, lr=1.16e-05, step=5819] Training: 58%|█████▊ | 5820/10000 [1:10:19<59:27, 1.17it/s, loss=0.0068, lr=1.16e-05, step=5820] Training: 58%|█████▊ | 5821/10000 [1:10:20<55:08, 1.26it/s, loss=0.0068, lr=1.16e-05, step=5820] Training: 58%|█████▊ | 5821/10000 [1:10:20<55:08, 1.26it/s, loss=0.0650, lr=1.16e-05, step=5821] Training: 58%|█████▊ | 5822/10000 [1:10:21<51:10, 1.36it/s, loss=0.0650, lr=1.16e-05, step=5821] Training: 58%|█████▊ | 5822/10000 [1:10:21<51:10, 1.36it/s, loss=0.0154, lr=1.16e-05, step=5822] Training: 58%|█████▊ | 5823/10000 [1:10:22<53:28, 1.30it/s, loss=0.0154, lr=1.16e-05, step=5822] Training: 58%|█████▊ | 5823/10000 [1:10:22<53:28, 1.30it/s, loss=0.0033, lr=1.16e-05, step=5823] Training: 58%|█████▊ | 5824/10000 [1:10:22<50:30, 1.38it/s, loss=0.0033, lr=1.16e-05, step=5823] Training: 58%|█████▊ | 5824/10000 [1:10:22<50:30, 1.38it/s, loss=0.0051, lr=1.16e-05, step=5824] Training: 58%|█████▊ | 5825/10000 [1:10:23<48:50, 1.42it/s, loss=0.0051, lr=1.16e-05, step=5824] Training: 58%|█████▊ | 5825/10000 [1:10:23<48:50, 1.42it/s, loss=0.0091, lr=1.16e-05, step=5825] Training: 58%|█████▊ | 5826/10000 [1:10:24<55:02, 1.26it/s, loss=0.0091, lr=1.16e-05, step=5825] Training: 58%|█████▊ | 5826/10000 [1:10:24<55:02, 1.26it/s, loss=0.0071, lr=1.16e-05, step=5826] Training: 58%|█████▊ | 5827/10000 [1:10:25<56:57, 1.22it/s, loss=0.0071, lr=1.16e-05, step=5826] Training: 58%|█████▊ | 5827/10000 [1:10:25<56:57, 1.22it/s, loss=0.0085, lr=1.16e-05, step=5827] Training: 58%|█████▊ | 5828/10000 [1:10:25<51:19, 1.35it/s, loss=0.0085, lr=1.16e-05, step=5827] Training: 58%|█████▊ | 5828/10000 [1:10:25<51:19, 1.35it/s, loss=0.0056, lr=1.16e-05, step=5828] Training: 58%|█████▊ | 5829/10000 [1:10:26<48:59, 1.42it/s, loss=0.0056, lr=1.16e-05, step=5828] Training: 58%|█████▊ | 5829/10000 [1:10:26<48:59, 1.42it/s, loss=0.0184, lr=1.16e-05, step=5829]17:16:33.667 [I] step=5830 loss=0.0254 smoothed_loss=0.0138 lr=1.16e-05 grad_norm=0.4160 step_time=0.5977s data_time=0.1292s it/s=1.376 eta_to_10000=3030.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0135 grad_action_out_proj=0.1487 grad_shared_expert=0.4489 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5830/10000 [1:10:27<51:48, 1.34it/s, loss=0.0184, lr=1.16e-05, step=5829] Training: 58%|█████▊ | 5830/10000 [1:10:27<51:48, 1.34it/s, loss=0.0254, lr=1.16e-05, step=5830] Training: 58%|█████▊ | 5831/10000 [1:10:28<56:23, 1.23it/s, loss=0.0254, lr=1.16e-05, step=5830] Training: 58%|█████▊ | 5831/10000 [1:10:28<56:23, 1.23it/s, loss=0.0113, lr=1.16e-05, step=5831] Training: 58%|█████▊ | 5832/10000 [1:10:28<52:46, 1.32it/s, loss=0.0113, lr=1.16e-05, step=5831] Training: 58%|█████▊ | 5832/10000 [1:10:28<52:46, 1.32it/s, loss=0.0189, lr=1.16e-05, step=5832] Training: 58%|█████▊ | 5833/10000 [1:10:29<48:38, 1.43it/s, loss=0.0189, lr=1.16e-05, step=5832] Training: 58%|█████▊ | 5833/10000 [1:10:29<48:38, 1.43it/s, loss=0.0203, lr=1.16e-05, step=5833] Training: 58%|█████▊ | 5834/10000 [1:10:30<50:28, 1.38it/s, loss=0.0203, lr=1.16e-05, step=5833] Training: 58%|█████▊ | 5834/10000 [1:10:30<50:28, 1.38it/s, loss=0.0063, lr=1.16e-05, step=5834] Training: 58%|█████▊ | 5835/10000 [1:10:31<56:00, 1.24it/s, loss=0.0063, lr=1.16e-05, step=5834] Training: 58%|█████▊ | 5835/10000 [1:10:31<56:00, 1.24it/s, loss=0.0168, lr=1.16e-05, step=5835] Training: 58%|█████▊ | 5836/10000 [1:10:31<55:40, 1.25it/s, loss=0.0168, lr=1.16e-05, step=5835] Training: 58%|█████▊ | 5836/10000 [1:10:31<55:40, 1.25it/s, loss=0.0140, lr=1.16e-05, step=5836] Training: 58%|█████▊ | 5837/10000 [1:10:32<52:37, 1.32it/s, loss=0.0140, lr=1.16e-05, step=5836] Training: 58%|█████▊ | 5837/10000 [1:10:32<52:37, 1.32it/s, loss=0.0072, lr=1.16e-05, step=5837] Training: 58%|█████▊ | 5838/10000 [1:10:33<51:04, 1.36it/s, loss=0.0072, lr=1.16e-05, step=5837] Training: 58%|█████▊ | 5838/10000 [1:10:33<51:04, 1.36it/s, loss=0.0108, lr=1.16e-05, step=5838] Training: 58%|█████▊ | 5839/10000 [1:10:34<51:10, 1.36it/s, loss=0.0108, lr=1.16e-05, step=5838] Training: 58%|█████▊ | 5839/10000 [1:10:34<51:10, 1.36it/s, loss=0.0035, lr=1.16e-05, step=5839]17:16:41.448 [I] step=5840 loss=0.0168 smoothed_loss=0.0127 lr=1.16e-05 grad_norm=0.4468 step_time=0.6183s data_time=0.1598s it/s=1.285 eta_to_10000=3236.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0172 grad_action_out_proj=0.1455 grad_shared_expert=0.5311 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5840/10000 [1:10:35<55:45, 1.24it/s, loss=0.0035, lr=1.16e-05, step=5839] Training: 58%|█████▊ | 5840/10000 [1:10:35<55:45, 1.24it/s, loss=0.0168, lr=1.16e-05, step=5840] Training: 58%|█████▊ | 5841/10000 [1:10:35<55:19, 1.25it/s, loss=0.0168, lr=1.16e-05, step=5840] Training: 58%|█████▊ | 5841/10000 [1:10:35<55:19, 1.25it/s, loss=0.0014, lr=1.16e-05, step=5841] Training: 58%|█████▊ | 5842/10000 [1:10:36<57:30, 1.20it/s, loss=0.0014, lr=1.16e-05, step=5841] Training: 58%|█████▊ | 5842/10000 [1:10:36<57:30, 1.20it/s, loss=0.0161, lr=1.16e-05, step=5842] Training: 58%|█████▊ | 5843/10000 [1:10:37<52:05, 1.33it/s, loss=0.0161, lr=1.16e-05, step=5842] Training: 58%|█████▊ | 5843/10000 [1:10:37<52:05, 1.33it/s, loss=0.0027, lr=1.16e-05, step=5843] Training: 58%|█████▊ | 5844/10000 [1:10:37<49:28, 1.40it/s, loss=0.0027, lr=1.16e-05, step=5843] Training: 58%|█████▊ | 5844/10000 [1:10:37<49:28, 1.40it/s, loss=0.0106, lr=1.16e-05, step=5844] Training: 58%|█████▊ | 5845/10000 [1:10:38<47:10, 1.47it/s, loss=0.0106, lr=1.16e-05, step=5844] Training: 58%|█████▊ | 5845/10000 [1:10:38<47:10, 1.47it/s, loss=0.0037, lr=1.16e-05, step=5845] Training: 58%|█████▊ | 5846/10000 [1:10:39<53:47, 1.29it/s, loss=0.0037, lr=1.16e-05, step=5845] Training: 58%|█████▊ | 5846/10000 [1:10:39<53:47, 1.29it/s, loss=0.0166, lr=1.16e-05, step=5846] Training: 58%|█████▊ | 5847/10000 [1:10:40<58:22, 1.19it/s, loss=0.0166, lr=1.16e-05, step=5846] Training: 58%|█████▊ | 5847/10000 [1:10:40<58:22, 1.19it/s, loss=0.0230, lr=1.15e-05, step=5847] Training: 58%|█████▊ | 5848/10000 [1:10:41<1:02:52, 1.10it/s, loss=0.0230, lr=1.15e-05, step=5847] Training: 58%|█████▊ | 5848/10000 [1:10:41<1:02:52, 1.10it/s, loss=0.0074, lr=1.15e-05, step=5848] Training: 58%|█████▊ | 5849/10000 [1:10:42<1:04:48, 1.07it/s, loss=0.0074, lr=1.15e-05, step=5848] Training: 58%|█████▊ | 5849/10000 [1:10:42<1:04:48, 1.07it/s, loss=0.0115, lr=1.15e-05, step=5849]17:16:49.664 [I] step=5850 loss=0.0036 smoothed_loss=0.0108 lr=1.16e-05 grad_norm=0.4227 step_time=0.6865s data_time=0.1352s it/s=1.217 eta_to_10000=3409.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0146 grad_action_out_proj=0.1332 grad_shared_expert=0.4709 (10775:train_pytorch.py:850) + Training: 58%|█████▊ | 5850/10000 [1:10:43<59:10, 1.17it/s, loss=0.0115, lr=1.15e-05, step=5849] Training: 58%|█████▊ | 5850/10000 [1:10:43<59:10, 1.17it/s, loss=0.0036, lr=1.15e-05, step=5850] Training: 59%|█████▊ | 5851/10000 [1:10:43<55:03, 1.26it/s, loss=0.0036, lr=1.15e-05, step=5850] Training: 59%|█████▊ | 5851/10000 [1:10:43<55:03, 1.26it/s, loss=0.0068, lr=1.15e-05, step=5851] Training: 59%|█████▊ | 5852/10000 [1:10:44<52:24, 1.32it/s, loss=0.0068, lr=1.15e-05, step=5851] Training: 59%|█████▊ | 5852/10000 [1:10:44<52:24, 1.32it/s, loss=0.0234, lr=1.15e-05, step=5852] Training: 59%|█████▊ | 5853/10000 [1:10:45<53:02, 1.30it/s, loss=0.0234, lr=1.15e-05, step=5852] Training: 59%|█████▊ | 5853/10000 [1:10:45<53:02, 1.30it/s, loss=0.0016, lr=1.15e-05, step=5853] Training: 59%|█████▊ | 5854/10000 [1:10:45<48:53, 1.41it/s, loss=0.0016, lr=1.15e-05, step=5853] Training: 59%|█████▊ | 5854/10000 [1:10:45<48:53, 1.41it/s, loss=0.0025, lr=1.15e-05, step=5854] Training: 59%|█████▊ | 5855/10000 [1:10:46<53:27, 1.29it/s, loss=0.0025, lr=1.15e-05, step=5854] Training: 59%|█████▊ | 5855/10000 [1:10:46<53:27, 1.29it/s, loss=0.0062, lr=1.15e-05, step=5855] Training: 59%|█████▊ | 5856/10000 [1:10:48<1:01:42, 1.12it/s, loss=0.0062, lr=1.15e-05, step=5855] Training: 59%|█████▊ | 5856/10000 [1:10:48<1:01:42, 1.12it/s, loss=0.0090, lr=1.15e-05, step=5856] Training: 59%|█████▊ | 5857/10000 [1:10:48<58:19, 1.18it/s, loss=0.0090, lr=1.15e-05, step=5856] Training: 59%|█████▊ | 5857/10000 [1:10:48<58:19, 1.18it/s, loss=0.0181, lr=1.15e-05, step=5857] Training: 59%|█████▊ | 5858/10000 [1:10:49<52:33, 1.31it/s, loss=0.0181, lr=1.15e-05, step=5857] Training: 59%|█████▊ | 5858/10000 [1:10:49<52:33, 1.31it/s, loss=0.0032, lr=1.15e-05, step=5858] Training: 59%|█████▊ | 5859/10000 [1:10:49<48:12, 1.43it/s, loss=0.0032, lr=1.15e-05, step=5858] Training: 59%|█████▊ | 5859/10000 [1:10:49<48:12, 1.43it/s, loss=0.0052, lr=1.15e-05, step=5859]17:16:56.863 [I] step=5860 loss=0.0165 smoothed_loss=0.0099 lr=1.15e-05 grad_norm=0.4929 step_time=0.5890s data_time=0.1308s it/s=1.389 eta_to_10000=2979.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0260 grad_action_out_proj=0.1905 grad_shared_expert=0.9047 (10775:train_pytorch.py:850) + Training: 59%|█████▊ | 5860/10000 [1:10:50<45:23, 1.52it/s, loss=0.0052, lr=1.15e-05, step=5859] Training: 59%|█████▊ | 5860/10000 [1:10:50<45:23, 1.52it/s, loss=0.0165, lr=1.15e-05, step=5860] Training: 59%|█████▊ | 5861/10000 [1:10:50<43:26, 1.59it/s, loss=0.0165, lr=1.15e-05, step=5860] Training: 59%|█████▊ | 5861/10000 [1:10:50<43:26, 1.59it/s, loss=0.0237, lr=1.15e-05, step=5861] Training: 59%|█████▊ | 5862/10000 [1:10:51<46:51, 1.47it/s, loss=0.0237, lr=1.15e-05, step=5861] Training: 59%|█████▊ | 5862/10000 [1:10:51<46:51, 1.47it/s, loss=0.0053, lr=1.15e-05, step=5862] Training: 59%|█████▊ | 5863/10000 [1:10:52<55:09, 1.25it/s, loss=0.0053, lr=1.15e-05, step=5862] Training: 59%|█████▊ | 5863/10000 [1:10:52<55:09, 1.25it/s, loss=0.0204, lr=1.15e-05, step=5863] Training: 59%|█████▊ | 5864/10000 [1:10:53<55:28, 1.24it/s, loss=0.0204, lr=1.15e-05, step=5863] Training: 59%|█████▊ | 5864/10000 [1:10:53<55:28, 1.24it/s, loss=0.0028, lr=1.15e-05, step=5864] Training: 59%|█████▊ | 5865/10000 [1:10:54<57:33, 1.20it/s, loss=0.0028, lr=1.15e-05, step=5864] Training: 59%|█████▊ | 5865/10000 [1:10:54<57:33, 1.20it/s, loss=0.0335, lr=1.15e-05, step=5865] Training: 59%|█████▊ | 5866/10000 [1:10:55<1:02:28, 1.10it/s, loss=0.0335, lr=1.15e-05, step=5865] Training: 59%|█████▊ | 5866/10000 [1:10:55<1:02:28, 1.10it/s, loss=0.0053, lr=1.15e-05, step=5866] Training: 59%|█████▊ | 5867/10000 [1:10:56<1:05:29, 1.05it/s, loss=0.0053, lr=1.15e-05, step=5866] Training: 59%|█████▊ | 5867/10000 [1:10:56<1:05:29, 1.05it/s, loss=0.0016, lr=1.15e-05, step=5867] Training: 59%|█████▊ | 5868/10000 [1:10:57<1:07:45, 1.02it/s, loss=0.0016, lr=1.15e-05, step=5867] Training: 59%|█████▊ | 5868/10000 [1:10:57<1:07:45, 1.02it/s, loss=0.0040, lr=1.15e-05, step=5868] Training: 59%|█████▊ | 5869/10000 [1:10:58<1:10:39, 1.03s/it, loss=0.0040, lr=1.15e-05, step=5868] Training: 59%|█████▊ | 5869/10000 [1:10:58<1:10:39, 1.03s/it, loss=0.0202, lr=1.15e-05, step=5869]17:17:06.478 [I] step=5870 loss=0.0123 smoothed_loss=0.0115 lr=1.15e-05 grad_norm=0.4297 step_time=0.7284s data_time=0.2331s it/s=1.040 eta_to_10000=3970.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0225 grad_action_out_proj=0.1613 grad_shared_expert=0.3878 (10775:train_pytorch.py:850) + Training: 59%|█████▊ | 5870/10000 [1:11:00<1:12:58, 1.06s/it, loss=0.0202, lr=1.15e-05, step=5869] Training: 59%|█████▊ | 5870/10000 [1:11:00<1:12:58, 1.06s/it, loss=0.0123, lr=1.15e-05, step=5870] Training: 59%|█████▊ | 5871/10000 [1:11:00<1:08:00, 1.01it/s, loss=0.0123, lr=1.15e-05, step=5870] Training: 59%|█████▊ | 5871/10000 [1:11:00<1:08:00, 1.01it/s, loss=0.0014, lr=1.15e-05, step=5871] Training: 59%|█████▊ | 5872/10000 [1:11:01<1:04:24, 1.07it/s, loss=0.0014, lr=1.15e-05, step=5871] Training: 59%|█████▊ | 5872/10000 [1:11:01<1:04:24, 1.07it/s, loss=0.0478, lr=1.15e-05, step=5872] Training: 59%|█████▊ | 5873/10000 [1:11:02<1:02:24, 1.10it/s, loss=0.0478, lr=1.15e-05, step=5872] Training: 59%|█████▊ | 5873/10000 [1:11:02<1:02:24, 1.10it/s, loss=0.0183, lr=1.15e-05, step=5873] Training: 59%|█████▊ | 5874/10000 [1:11:03<56:14, 1.22it/s, loss=0.0183, lr=1.15e-05, step=5873] Training: 59%|█████▊ | 5874/10000 [1:11:03<56:14, 1.22it/s, loss=0.0085, lr=1.14e-05, step=5874] Training: 59%|█████▉ | 5875/10000 [1:11:03<50:47, 1.35it/s, loss=0.0085, lr=1.14e-05, step=5874] Training: 59%|█████▉ | 5875/10000 [1:11:03<50:47, 1.35it/s, loss=0.0037, lr=1.14e-05, step=5875] Training: 59%|█████▉ | 5876/10000 [1:11:04<56:25, 1.22it/s, loss=0.0037, lr=1.14e-05, step=5875] Training: 59%|█████▉ | 5876/10000 [1:11:04<56:25, 1.22it/s, loss=0.0031, lr=1.14e-05, step=5876] Training: 59%|█████▉ | 5877/10000 [1:11:05<1:02:07, 1.11it/s, loss=0.0031, lr=1.14e-05, step=5876] Training: 59%|█████▉ | 5877/10000 [1:11:05<1:02:07, 1.11it/s, loss=0.0244, lr=1.14e-05, step=5877] Training: 59%|█████▉ | 5878/10000 [1:11:06<1:01:29, 1.12it/s, loss=0.0244, lr=1.14e-05, step=5877] Training: 59%|█████▉ | 5878/10000 [1:11:06<1:01:29, 1.12it/s, loss=0.0690, lr=1.14e-05, step=5878] Training: 59%|█████▉ | 5879/10000 [1:11:07<1:00:12, 1.14it/s, loss=0.0690, lr=1.14e-05, step=5878] Training: 59%|█████▉ | 5879/10000 [1:11:07<1:00:12, 1.14it/s, loss=0.0042, lr=1.14e-05, step=5879]17:17:14.775 [I] step=5880 loss=0.0174 smoothed_loss=0.0174 lr=1.14e-05 grad_norm=0.4768 step_time=0.6592s data_time=0.1705s it/s=1.206 eta_to_10000=3417.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0105 grad_action_out_proj=0.1079 grad_shared_expert=0.6793 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5880/10000 [1:11:08<59:25, 1.16it/s, loss=0.0042, lr=1.14e-05, step=5879] Training: 59%|█████▉ | 5880/10000 [1:11:08<59:25, 1.16it/s, loss=0.0174, lr=1.14e-05, step=5880] Training: 59%|█████▉ | 5881/10000 [1:11:09<58:20, 1.18it/s, loss=0.0174, lr=1.14e-05, step=5880] Training: 59%|█████▉ | 5881/10000 [1:11:09<58:20, 1.18it/s, loss=0.0030, lr=1.14e-05, step=5881] Training: 59%|█████▉ | 5882/10000 [1:11:09<52:53, 1.30it/s, loss=0.0030, lr=1.14e-05, step=5881] Training: 59%|█████▉ | 5882/10000 [1:11:09<52:53, 1.30it/s, loss=0.0093, lr=1.14e-05, step=5882] Training: 59%|█████▉ | 5883/10000 [1:11:10<49:31, 1.39it/s, loss=0.0093, lr=1.14e-05, step=5882] Training: 59%|█████▉ | 5883/10000 [1:11:10<49:31, 1.39it/s, loss=0.0109, lr=1.14e-05, step=5883] Training: 59%|█████▉ | 5884/10000 [1:11:11<54:21, 1.26it/s, loss=0.0109, lr=1.14e-05, step=5883] Training: 59%|█████▉ | 5884/10000 [1:11:11<54:21, 1.26it/s, loss=0.0227, lr=1.14e-05, step=5884] Training: 59%|█████▉ | 5885/10000 [1:11:11<50:42, 1.35it/s, loss=0.0227, lr=1.14e-05, step=5884] Training: 59%|█████▉ | 5885/10000 [1:11:11<50:42, 1.35it/s, loss=0.0071, lr=1.14e-05, step=5885] Training: 59%|█████▉ | 5886/10000 [1:11:12<51:06, 1.34it/s, loss=0.0071, lr=1.14e-05, step=5885] Training: 59%|█████▉ | 5886/10000 [1:11:12<51:06, 1.34it/s, loss=0.0018, lr=1.14e-05, step=5886] Training: 59%|█████▉ | 5887/10000 [1:11:13<55:33, 1.23it/s, loss=0.0018, lr=1.14e-05, step=5886] Training: 59%|█████▉ | 5887/10000 [1:11:13<55:33, 1.23it/s, loss=0.0024, lr=1.14e-05, step=5887] Training: 59%|█████▉ | 5888/10000 [1:11:14<1:00:41, 1.13it/s, loss=0.0024, lr=1.14e-05, step=5887] Training: 59%|█████▉ | 5888/10000 [1:11:14<1:00:41, 1.13it/s, loss=0.0114, lr=1.14e-05, step=5888] Training: 59%|█████▉ | 5889/10000 [1:11:15<59:53, 1.14it/s, loss=0.0114, lr=1.14e-05, step=5888] Training: 59%|█████▉ | 5889/10000 [1:11:15<59:53, 1.14it/s, loss=0.0072, lr=1.14e-05, step=5889]17:17:22.704 [I] step=5890 loss=0.0035 smoothed_loss=0.0109 lr=1.14e-05 grad_norm=0.4367 step_time=0.6255s data_time=0.1675s it/s=1.261 eta_to_10000=3258.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0137 grad_action_out_proj=0.1166 grad_shared_expert=0.3636 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5890/10000 [1:11:16<56:41, 1.21it/s, loss=0.0072, lr=1.14e-05, step=5889] Training: 59%|█████▉ | 5890/10000 [1:11:16<56:41, 1.21it/s, loss=0.0035, lr=1.14e-05, step=5890] Training: 59%|█████▉ | 5891/10000 [1:11:17<59:32, 1.15it/s, loss=0.0035, lr=1.14e-05, step=5890] Training: 59%|█████▉ | 5891/10000 [1:11:17<59:32, 1.15it/s, loss=0.0337, lr=1.14e-05, step=5891] Training: 59%|█████▉ | 5892/10000 [1:11:17<53:24, 1.28it/s, loss=0.0337, lr=1.14e-05, step=5891] Training: 59%|█████▉ | 5892/10000 [1:11:17<53:24, 1.28it/s, loss=0.0060, lr=1.14e-05, step=5892] Training: 59%|█████▉ | 5893/10000 [1:11:18<53:09, 1.29it/s, loss=0.0060, lr=1.14e-05, step=5892] Training: 59%|█████▉ | 5893/10000 [1:11:18<53:09, 1.29it/s, loss=0.0114, lr=1.14e-05, step=5893] Training: 59%|█████▉ | 5894/10000 [1:11:19<48:07, 1.42it/s, loss=0.0114, lr=1.14e-05, step=5893] Training: 59%|█████▉ | 5894/10000 [1:11:19<48:07, 1.42it/s, loss=0.0019, lr=1.14e-05, step=5894] Training: 59%|█████▉ | 5895/10000 [1:11:19<44:59, 1.52it/s, loss=0.0019, lr=1.14e-05, step=5894] Training: 59%|█████▉ | 5895/10000 [1:11:19<44:59, 1.52it/s, loss=0.0071, lr=1.14e-05, step=5895] Training: 59%|█████▉ | 5896/10000 [1:11:20<48:29, 1.41it/s, loss=0.0071, lr=1.14e-05, step=5895] Training: 59%|█████▉ | 5896/10000 [1:11:20<48:29, 1.41it/s, loss=0.0119, lr=1.14e-05, step=5896] Training: 59%|█████▉ | 5897/10000 [1:11:21<45:55, 1.49it/s, loss=0.0119, lr=1.14e-05, step=5896] Training: 59%|█████▉ | 5897/10000 [1:11:21<45:55, 1.49it/s, loss=0.0403, lr=1.14e-05, step=5897] Training: 59%|█████▉ | 5898/10000 [1:11:21<47:53, 1.43it/s, loss=0.0403, lr=1.14e-05, step=5897] Training: 59%|█████▉ | 5898/10000 [1:11:21<47:53, 1.43it/s, loss=0.0080, lr=1.14e-05, step=5898] Training: 59%|█████▉ | 5899/10000 [1:11:22<51:41, 1.32it/s, loss=0.0080, lr=1.14e-05, step=5898] Training: 59%|█████▉ | 5899/10000 [1:11:22<51:41, 1.32it/s, loss=0.0025, lr=1.14e-05, step=5899]17:17:29.863 [I] step=5900 loss=0.0104 smoothed_loss=0.0121 lr=1.14e-05 grad_norm=0.5247 step_time=0.5957s data_time=0.1201s it/s=1.397 eta_to_10000=2935.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0123 grad_action_out_proj=0.1175 grad_shared_expert=0.3426 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5900/10000 [1:11:23<50:33, 1.35it/s, loss=0.0025, lr=1.14e-05, step=5899] Training: 59%|█████▉ | 5900/10000 [1:11:23<50:33, 1.35it/s, loss=0.0104, lr=1.14e-05, step=5900] Training: 59%|█████▉ | 5901/10000 [1:11:23<46:47, 1.46it/s, loss=0.0104, lr=1.14e-05, step=5900] Training: 59%|█████▉ | 5901/10000 [1:11:23<46:47, 1.46it/s, loss=0.0062, lr=1.14e-05, step=5901] Training: 59%|█████▉ | 5902/10000 [1:11:24<42:58, 1.59it/s, loss=0.0062, lr=1.14e-05, step=5901] Training: 59%|█████▉ | 5902/10000 [1:11:24<42:58, 1.59it/s, loss=0.0097, lr=1.13e-05, step=5902] Training: 59%|█████▉ | 5903/10000 [1:11:25<46:09, 1.48it/s, loss=0.0097, lr=1.13e-05, step=5902] Training: 59%|█████▉ | 5903/10000 [1:11:25<46:09, 1.48it/s, loss=0.0151, lr=1.13e-05, step=5903] Training: 59%|█████▉ | 5904/10000 [1:11:25<45:22, 1.50it/s, loss=0.0151, lr=1.13e-05, step=5903] Training: 59%|█████▉ | 5904/10000 [1:11:25<45:22, 1.50it/s, loss=0.0035, lr=1.13e-05, step=5904] Training: 59%|█████▉ | 5905/10000 [1:11:26<52:44, 1.29it/s, loss=0.0035, lr=1.13e-05, step=5904] Training: 59%|█████▉ | 5905/10000 [1:11:26<52:44, 1.29it/s, loss=0.0046, lr=1.13e-05, step=5905] Training: 59%|█████▉ | 5906/10000 [1:11:27<57:05, 1.20it/s, loss=0.0046, lr=1.13e-05, step=5905] Training: 59%|█████▉ | 5906/10000 [1:11:27<57:05, 1.20it/s, loss=0.0141, lr=1.13e-05, step=5906] Training: 59%|█████▉ | 5907/10000 [1:11:28<52:58, 1.29it/s, loss=0.0141, lr=1.13e-05, step=5906] Training: 59%|█████▉ | 5907/10000 [1:11:28<52:58, 1.29it/s, loss=0.0383, lr=1.13e-05, step=5907] Training: 59%|█████▉ | 5908/10000 [1:11:29<50:49, 1.34it/s, loss=0.0383, lr=1.13e-05, step=5907] Training: 59%|█████▉ | 5908/10000 [1:11:29<50:49, 1.34it/s, loss=0.0365, lr=1.13e-05, step=5908] Training: 59%|█████▉ | 5909/10000 [1:11:29<48:00, 1.42it/s, loss=0.0365, lr=1.13e-05, step=5908] Training: 59%|█████▉ | 5909/10000 [1:11:29<48:00, 1.42it/s, loss=0.0019, lr=1.13e-05, step=5909]17:17:36.955 [I] step=5910 loss=0.0132 smoothed_loss=0.0142 lr=1.13e-05 grad_norm=0.4737 step_time=0.5821s data_time=0.1271s it/s=1.410 eta_to_10000=2900.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0128 grad_action_out_proj=0.1119 grad_shared_expert=0.3898 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5910/10000 [1:11:30<47:36, 1.43it/s, loss=0.0019, lr=1.13e-05, step=5909] Training: 59%|█████▉ | 5910/10000 [1:11:30<47:36, 1.43it/s, loss=0.0132, lr=1.13e-05, step=5910] Training: 59%|█████▉ | 5911/10000 [1:11:31<43:45, 1.56it/s, loss=0.0132, lr=1.13e-05, step=5910] Training: 59%|█████▉ | 5911/10000 [1:11:31<43:45, 1.56it/s, loss=0.0138, lr=1.13e-05, step=5911] Training: 59%|█████▉ | 5912/10000 [1:11:31<45:33, 1.50it/s, loss=0.0138, lr=1.13e-05, step=5911] Training: 59%|█████▉ | 5912/10000 [1:11:31<45:33, 1.50it/s, loss=0.0161, lr=1.13e-05, step=5912] Training: 59%|█████▉ | 5913/10000 [1:11:32<47:51, 1.42it/s, loss=0.0161, lr=1.13e-05, step=5912] Training: 59%|█████▉ | 5913/10000 [1:11:32<47:51, 1.42it/s, loss=0.0132, lr=1.13e-05, step=5913] Training: 59%|█████▉ | 5914/10000 [1:11:33<49:30, 1.38it/s, loss=0.0132, lr=1.13e-05, step=5913] Training: 59%|█████▉ | 5914/10000 [1:11:33<49:30, 1.38it/s, loss=0.0291, lr=1.13e-05, step=5914] Training: 59%|█████▉ | 5915/10000 [1:11:33<47:32, 1.43it/s, loss=0.0291, lr=1.13e-05, step=5914] Training: 59%|█████▉ | 5915/10000 [1:11:33<47:32, 1.43it/s, loss=0.0013, lr=1.13e-05, step=5915] Training: 59%|█████▉ | 5916/10000 [1:11:34<46:43, 1.46it/s, loss=0.0013, lr=1.13e-05, step=5915] Training: 59%|█████▉ | 5916/10000 [1:11:34<46:43, 1.46it/s, loss=0.0032, lr=1.13e-05, step=5916] Training: 59%|█████▉ | 5917/10000 [1:11:35<46:03, 1.48it/s, loss=0.0032, lr=1.13e-05, step=5916] Training: 59%|█████▉ | 5917/10000 [1:11:35<46:03, 1.48it/s, loss=0.0345, lr=1.13e-05, step=5917] Training: 59%|█████▉ | 5918/10000 [1:11:36<47:48, 1.42it/s, loss=0.0345, lr=1.13e-05, step=5917] Training: 59%|█████▉ | 5918/10000 [1:11:36<47:48, 1.42it/s, loss=0.0084, lr=1.13e-05, step=5918] Training: 59%|█████▉ | 5919/10000 [1:11:36<46:10, 1.47it/s, loss=0.0084, lr=1.13e-05, step=5918] Training: 59%|█████▉ | 5919/10000 [1:11:36<46:10, 1.47it/s, loss=0.0305, lr=1.13e-05, step=5919]17:17:44.003 [I] step=5920 loss=0.0222 smoothed_loss=0.0168 lr=1.13e-05 grad_norm=0.4966 step_time=0.5773s data_time=0.1275s it/s=1.419 eta_to_10000=2875.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.1370 grad_shared_expert=0.6597 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5920/10000 [1:11:37<50:52, 1.34it/s, loss=0.0305, lr=1.13e-05, step=5919] Training: 59%|█████▉ | 5920/10000 [1:11:37<50:52, 1.34it/s, loss=0.0222, lr=1.13e-05, step=5920] Training: 59%|█████▉ | 5921/10000 [1:11:38<52:28, 1.30it/s, loss=0.0222, lr=1.13e-05, step=5920] Training: 59%|█████▉ | 5921/10000 [1:11:38<52:28, 1.30it/s, loss=0.0138, lr=1.13e-05, step=5921] Training: 59%|█████▉ | 5922/10000 [1:11:38<46:50, 1.45it/s, loss=0.0138, lr=1.13e-05, step=5921] Training: 59%|█████▉ | 5922/10000 [1:11:38<46:50, 1.45it/s, loss=0.0013, lr=1.13e-05, step=5922] Training: 59%|█████▉ | 5923/10000 [1:11:39<48:02, 1.41it/s, loss=0.0013, lr=1.13e-05, step=5922] Training: 59%|█████▉ | 5923/10000 [1:11:39<48:02, 1.41it/s, loss=0.0091, lr=1.13e-05, step=5923] Training: 59%|█████▉ | 5924/10000 [1:11:40<47:51, 1.42it/s, loss=0.0091, lr=1.13e-05, step=5923] Training: 59%|█████▉ | 5924/10000 [1:11:40<47:51, 1.42it/s, loss=0.0282, lr=1.13e-05, step=5924] Training: 59%|█████▉ | 5925/10000 [1:11:41<49:57, 1.36it/s, loss=0.0282, lr=1.13e-05, step=5924] Training: 59%|█████▉ | 5925/10000 [1:11:41<49:57, 1.36it/s, loss=0.0125, lr=1.13e-05, step=5925] Training: 59%|█████▉ | 5926/10000 [1:11:41<45:18, 1.50it/s, loss=0.0125, lr=1.13e-05, step=5925] Training: 59%|█████▉ | 5926/10000 [1:11:41<45:18, 1.50it/s, loss=0.0223, lr=1.13e-05, step=5926] Training: 59%|█████▉ | 5927/10000 [1:11:42<52:25, 1.29it/s, loss=0.0223, lr=1.13e-05, step=5926] Training: 59%|█████▉ | 5927/10000 [1:11:42<52:25, 1.29it/s, loss=0.0053, lr=1.13e-05, step=5927] Training: 59%|█████▉ | 5928/10000 [1:11:43<53:52, 1.26it/s, loss=0.0053, lr=1.13e-05, step=5927] Training: 59%|█████▉ | 5928/10000 [1:11:43<53:52, 1.26it/s, loss=0.0049, lr=1.13e-05, step=5928] Training: 59%|█████▉ | 5929/10000 [1:11:44<47:47, 1.42it/s, loss=0.0049, lr=1.13e-05, step=5928] Training: 59%|█████▉ | 5929/10000 [1:11:44<47:47, 1.42it/s, loss=0.0083, lr=1.12e-05, step=5929]17:17:50.996 [I] step=5930 loss=0.0213 smoothed_loss=0.0142 lr=1.13e-05 grad_norm=0.3928 step_time=0.5873s data_time=0.1119s it/s=1.430 eta_to_10000=2845.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0963 grad_shared_expert=0.2581 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5930/10000 [1:11:44<44:37, 1.52it/s, loss=0.0083, lr=1.12e-05, step=5929] Training: 59%|█████▉ | 5930/10000 [1:11:44<44:37, 1.52it/s, loss=0.0213, lr=1.12e-05, step=5930] Training: 59%|█████▉ | 5931/10000 [1:11:45<45:52, 1.48it/s, loss=0.0213, lr=1.12e-05, step=5930] Training: 59%|█████▉ | 5931/10000 [1:11:45<45:52, 1.48it/s, loss=0.0106, lr=1.12e-05, step=5931] Training: 59%|█████▉ | 5932/10000 [1:11:45<45:49, 1.48it/s, loss=0.0106, lr=1.12e-05, step=5931] Training: 59%|█████▉ | 5932/10000 [1:11:45<45:49, 1.48it/s, loss=0.0054, lr=1.12e-05, step=5932] Training: 59%|█████▉ | 5933/10000 [1:11:46<47:25, 1.43it/s, loss=0.0054, lr=1.12e-05, step=5932] Training: 59%|█████▉ | 5933/10000 [1:11:46<47:25, 1.43it/s, loss=0.0033, lr=1.12e-05, step=5933] Training: 59%|█████▉ | 5934/10000 [1:11:47<50:43, 1.34it/s, loss=0.0033, lr=1.12e-05, step=5933] Training: 59%|█████▉ | 5934/10000 [1:11:47<50:43, 1.34it/s, loss=0.0031, lr=1.12e-05, step=5934] Training: 59%|█████▉ | 5935/10000 [1:11:48<53:01, 1.28it/s, loss=0.0031, lr=1.12e-05, step=5934] Training: 59%|█████▉ | 5935/10000 [1:11:48<53:01, 1.28it/s, loss=0.0103, lr=1.12e-05, step=5935] Training: 59%|█████▉ | 5936/10000 [1:11:49<52:02, 1.30it/s, loss=0.0103, lr=1.12e-05, step=5935] Training: 59%|█████▉ | 5936/10000 [1:11:49<52:02, 1.30it/s, loss=0.0033, lr=1.12e-05, step=5936] Training: 59%|█████▉ | 5937/10000 [1:11:49<52:47, 1.28it/s, loss=0.0033, lr=1.12e-05, step=5936] Training: 59%|█████▉ | 5937/10000 [1:11:49<52:47, 1.28it/s, loss=0.0098, lr=1.12e-05, step=5937] Training: 59%|█████▉ | 5938/10000 [1:11:50<55:10, 1.23it/s, loss=0.0098, lr=1.12e-05, step=5937] Training: 59%|█████▉ | 5938/10000 [1:11:50<55:10, 1.23it/s, loss=0.0023, lr=1.12e-05, step=5938] Training: 59%|█████▉ | 5939/10000 [1:11:51<56:13, 1.20it/s, loss=0.0023, lr=1.12e-05, step=5938] Training: 59%|█████▉ | 5939/10000 [1:11:51<56:13, 1.20it/s, loss=0.0042, lr=1.12e-05, step=5939]17:17:59.030 [I] step=5940 loss=0.0115 smoothed_loss=0.0092 lr=1.12e-05 grad_norm=0.4284 step_time=0.6353s data_time=0.1681s it/s=1.245 eta_to_10000=3261.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.0813 grad_shared_expert=0.3872 (10775:train_pytorch.py:850) + Training: 59%|█████▉ | 5940/10000 [1:11:52<56:39, 1.19it/s, loss=0.0042, lr=1.12e-05, step=5939] Training: 59%|█████▉ | 5940/10000 [1:11:52<56:39, 1.19it/s, loss=0.0115, lr=1.12e-05, step=5940] Training: 59%|█████▉ | 5941/10000 [1:11:53<55:03, 1.23it/s, loss=0.0115, lr=1.12e-05, step=5940] Training: 59%|█████▉ | 5941/10000 [1:11:53<55:03, 1.23it/s, loss=0.0074, lr=1.12e-05, step=5941] Training: 59%|█████▉ | 5942/10000 [1:11:54<53:45, 1.26it/s, loss=0.0074, lr=1.12e-05, step=5941] Training: 59%|█████▉ | 5942/10000 [1:11:54<53:45, 1.26it/s, loss=0.0077, lr=1.12e-05, step=5942] Training: 59%|█████▉ | 5943/10000 [1:11:54<53:24, 1.27it/s, loss=0.0077, lr=1.12e-05, step=5942] Training: 59%|█████▉ | 5943/10000 [1:11:54<53:24, 1.27it/s, loss=0.0070, lr=1.12e-05, step=5943] Training: 59%|█████▉ | 5944/10000 [1:11:55<53:10, 1.27it/s, loss=0.0070, lr=1.12e-05, step=5943] Training: 59%|█████▉ | 5944/10000 [1:11:55<53:10, 1.27it/s, loss=0.0053, lr=1.12e-05, step=5944] Training: 59%|█████▉ | 5945/10000 [1:11:56<55:09, 1.23it/s, loss=0.0053, lr=1.12e-05, step=5944] Training: 59%|█████▉ | 5945/10000 [1:11:56<55:09, 1.23it/s, loss=0.0055, lr=1.12e-05, step=5945] Training: 59%|█████▉ | 5946/10000 [1:11:57<48:35, 1.39it/s, loss=0.0055, lr=1.12e-05, step=5945] Training: 59%|█████▉ | 5946/10000 [1:11:57<48:35, 1.39it/s, loss=0.0173, lr=1.12e-05, step=5946] Training: 59%|█████▉ | 5947/10000 [1:11:57<46:14, 1.46it/s, loss=0.0173, lr=1.12e-05, step=5946] Training: 59%|█████▉ | 5947/10000 [1:11:57<46:14, 1.46it/s, loss=0.0033, lr=1.12e-05, step=5947] Training: 59%|█████▉ | 5948/10000 [1:11:58<49:47, 1.36it/s, loss=0.0033, lr=1.12e-05, step=5947] Training: 59%|█████▉ | 5948/10000 [1:11:58<49:47, 1.36it/s, loss=0.0366, lr=1.12e-05, step=5948] Training: 59%|█████▉ | 5949/10000 [1:11:59<53:33, 1.26it/s, loss=0.0366, lr=1.12e-05, step=5948] Training: 59%|█████▉ | 5949/10000 [1:11:59<53:33, 1.26it/s, loss=0.0086, lr=1.12e-05, step=5949]17:18:06.883 [I] step=5950 loss=0.0023 smoothed_loss=0.0101 lr=1.12e-05 grad_norm=0.4422 step_time=0.6283s data_time=0.1572s it/s=1.273 eta_to_10000=3180.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0691 grad_shared_expert=0.2216 (10775:train_pytorch.py:850) + Training: 60%|█████▉ | 5950/10000 [1:12:00<58:09, 1.16it/s, loss=0.0086, lr=1.12e-05, step=5949] Training: 60%|█████▉ | 5950/10000 [1:12:00<58:09, 1.16it/s, loss=0.0023, lr=1.12e-05, step=5950] Training: 60%|█████▉ | 5951/10000 [1:12:01<59:02, 1.14it/s, loss=0.0023, lr=1.12e-05, step=5950] Training: 60%|█████▉ | 5951/10000 [1:12:01<59:02, 1.14it/s, loss=0.0117, lr=1.12e-05, step=5951] Training: 60%|█████▉ | 5952/10000 [1:12:02<58:30, 1.15it/s, loss=0.0117, lr=1.12e-05, step=5951] Training: 60%|█████▉ | 5952/10000 [1:12:02<58:30, 1.15it/s, loss=0.0072, lr=1.12e-05, step=5952] Training: 60%|█████▉ | 5953/10000 [1:12:03<58:59, 1.14it/s, loss=0.0072, lr=1.12e-05, step=5952] Training: 60%|█████▉ | 5953/10000 [1:12:03<58:59, 1.14it/s, loss=0.0167, lr=1.12e-05, step=5953] Training: 60%|█████▉ | 5954/10000 [1:12:03<56:10, 1.20it/s, loss=0.0167, lr=1.12e-05, step=5953] Training: 60%|█████▉ | 5954/10000 [1:12:03<56:10, 1.20it/s, loss=0.0115, lr=1.12e-05, step=5954] Training: 60%|█████▉ | 5955/10000 [1:12:04<58:44, 1.15it/s, loss=0.0115, lr=1.12e-05, step=5954] Training: 60%|█████▉ | 5955/10000 [1:12:04<58:44, 1.15it/s, loss=0.0201, lr=1.12e-05, step=5955] Training: 60%|█████▉ | 5956/10000 [1:12:05<57:24, 1.17it/s, loss=0.0201, lr=1.12e-05, step=5955] Training: 60%|█████▉ | 5956/10000 [1:12:05<57:24, 1.17it/s, loss=0.0107, lr=1.12e-05, step=5956] Training: 60%|█████▉ | 5957/10000 [1:12:06<55:39, 1.21it/s, loss=0.0107, lr=1.12e-05, step=5956] Training: 60%|█████▉ | 5957/10000 [1:12:06<55:39, 1.21it/s, loss=0.0235, lr=1.11e-05, step=5957] Training: 60%|█████▉ | 5958/10000 [1:12:07<57:22, 1.17it/s, loss=0.0235, lr=1.11e-05, step=5957] Training: 60%|█████▉ | 5958/10000 [1:12:07<57:22, 1.17it/s, loss=0.0064, lr=1.11e-05, step=5958] Training: 60%|█████▉ | 5959/10000 [1:12:08<56:25, 1.19it/s, loss=0.0064, lr=1.11e-05, step=5958] Training: 60%|█████▉ | 5959/10000 [1:12:08<56:25, 1.19it/s, loss=0.0100, lr=1.11e-05, step=5959]17:18:15.086 [I] step=5960 loss=0.0182 smoothed_loss=0.0125 lr=1.12e-05 grad_norm=0.4360 step_time=0.6267s data_time=0.1937s it/s=1.219 eta_to_10000=3313.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.0906 grad_shared_expert=0.2852 (10775:train_pytorch.py:850) + Training: 60%|█████▉ | 5960/10000 [1:12:08<50:59, 1.32it/s, loss=0.0100, lr=1.11e-05, step=5959] Training: 60%|█████▉ | 5960/10000 [1:12:08<50:59, 1.32it/s, loss=0.0182, lr=1.11e-05, step=5960] Training: 60%|█████▉ | 5961/10000 [1:12:09<48:00, 1.40it/s, loss=0.0182, lr=1.11e-05, step=5960] Training: 60%|█████▉ | 5961/10000 [1:12:09<48:00, 1.40it/s, loss=0.0161, lr=1.11e-05, step=5961] Training: 60%|█████▉ | 5962/10000 [1:12:09<46:35, 1.44it/s, loss=0.0161, lr=1.11e-05, step=5961] Training: 60%|█████▉ | 5962/10000 [1:12:09<46:35, 1.44it/s, loss=0.0107, lr=1.11e-05, step=5962] Training: 60%|█████▉ | 5963/10000 [1:12:10<51:14, 1.31it/s, loss=0.0107, lr=1.11e-05, step=5962] Training: 60%|█████▉ | 5963/10000 [1:12:10<51:14, 1.31it/s, loss=0.0020, lr=1.11e-05, step=5963] Training: 60%|█████▉ | 5964/10000 [1:12:11<50:29, 1.33it/s, loss=0.0020, lr=1.11e-05, step=5963] Training: 60%|█████▉ | 5964/10000 [1:12:11<50:29, 1.33it/s, loss=0.0180, lr=1.11e-05, step=5964] Training: 60%|█████▉ | 5965/10000 [1:12:12<46:32, 1.44it/s, loss=0.0180, lr=1.11e-05, step=5964] Training: 60%|█████▉ | 5965/10000 [1:12:12<46:32, 1.44it/s, loss=0.0366, lr=1.11e-05, step=5965] Training: 60%|█████▉ | 5966/10000 [1:12:12<48:29, 1.39it/s, loss=0.0366, lr=1.11e-05, step=5965] Training: 60%|█████▉ | 5966/10000 [1:12:12<48:29, 1.39it/s, loss=0.0058, lr=1.11e-05, step=5966] Training: 60%|█████▉ | 5967/10000 [1:12:13<49:15, 1.36it/s, loss=0.0058, lr=1.11e-05, step=5966] Training: 60%|█████▉ | 5967/10000 [1:12:13<49:15, 1.36it/s, loss=0.0017, lr=1.11e-05, step=5967] Training: 60%|█████▉ | 5968/10000 [1:12:14<51:12, 1.31it/s, loss=0.0017, lr=1.11e-05, step=5967] Training: 60%|█████▉ | 5968/10000 [1:12:14<51:12, 1.31it/s, loss=0.0127, lr=1.11e-05, step=5968] Training: 60%|█████▉ | 5969/10000 [1:12:15<47:32, 1.41it/s, loss=0.0127, lr=1.11e-05, step=5968] Training: 60%|█████▉ | 5969/10000 [1:12:15<47:32, 1.41it/s, loss=0.0063, lr=1.11e-05, step=5969]17:18:22.530 [I] step=5970 loss=0.0151 smoothed_loss=0.0123 lr=1.11e-05 grad_norm=0.3923 step_time=0.6108s data_time=0.1333s it/s=1.344 eta_to_10000=2999.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0203 grad_action_out_proj=0.1370 grad_shared_expert=0.3385 (10775:train_pytorch.py:850) + Training: 60%|█████▉ | 5970/10000 [1:12:16<53:57, 1.24it/s, loss=0.0063, lr=1.11e-05, step=5969] Training: 60%|█████▉ | 5970/10000 [1:12:16<53:57, 1.24it/s, loss=0.0151, lr=1.11e-05, step=5970] Training: 60%|█████▉ | 5971/10000 [1:12:16<53:09, 1.26it/s, loss=0.0151, lr=1.11e-05, step=5970] Training: 60%|█████▉ | 5971/10000 [1:12:16<53:09, 1.26it/s, loss=0.0040, lr=1.11e-05, step=5971] Training: 60%|█████▉ | 5972/10000 [1:12:17<51:46, 1.30it/s, loss=0.0040, lr=1.11e-05, step=5971] Training: 60%|█████▉ | 5972/10000 [1:12:17<51:46, 1.30it/s, loss=0.0117, lr=1.11e-05, step=5972] Training: 60%|█████▉ | 5973/10000 [1:12:18<47:07, 1.42it/s, loss=0.0117, lr=1.11e-05, step=5972] Training: 60%|█████▉ | 5973/10000 [1:12:18<47:07, 1.42it/s, loss=0.0071, lr=1.11e-05, step=5973] Training: 60%|█████▉ | 5974/10000 [1:12:18<47:29, 1.41it/s, loss=0.0071, lr=1.11e-05, step=5973] Training: 60%|█████▉ | 5974/10000 [1:12:18<47:29, 1.41it/s, loss=0.0041, lr=1.11e-05, step=5974] Training: 60%|█████▉ | 5975/10000 [1:12:19<49:43, 1.35it/s, loss=0.0041, lr=1.11e-05, step=5974] Training: 60%|█████▉ | 5975/10000 [1:12:19<49:43, 1.35it/s, loss=0.0749, lr=1.11e-05, step=5975] Training: 60%|█████▉ | 5976/10000 [1:12:20<48:25, 1.39it/s, loss=0.0749, lr=1.11e-05, step=5975] Training: 60%|█████▉ | 5976/10000 [1:12:20<48:25, 1.39it/s, loss=0.0034, lr=1.11e-05, step=5976] Training: 60%|█████▉ | 5977/10000 [1:12:21<54:46, 1.22it/s, loss=0.0034, lr=1.11e-05, step=5976] Training: 60%|█████▉ | 5977/10000 [1:12:21<54:46, 1.22it/s, loss=0.0097, lr=1.11e-05, step=5977] Training: 60%|█████▉ | 5978/10000 [1:12:22<1:02:45, 1.07it/s, loss=0.0097, lr=1.11e-05, step=5977] Training: 60%|█████▉ | 5978/10000 [1:12:22<1:02:45, 1.07it/s, loss=0.0128, lr=1.11e-05, step=5978] Training: 60%|█████▉ | 5979/10000 [1:12:23<1:01:21, 1.09it/s, loss=0.0128, lr=1.11e-05, step=5978] Training: 60%|█████▉ | 5979/10000 [1:12:23<1:01:21, 1.09it/s, loss=0.0053, lr=1.11e-05, step=5979]17:18:30.802 [I] step=5980 loss=0.0057 smoothed_loss=0.0129 lr=1.11e-05 grad_norm=0.4138 step_time=0.6560s data_time=0.1713s it/s=1.209 eta_to_10000=3324.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0484 grad_shared_expert=0.2279 (10775:train_pytorch.py:850) + Training: 60%|█████▉ | 5980/10000 [1:12:24<1:01:08, 1.10it/s, loss=0.0053, lr=1.11e-05, step=5979] Training: 60%|█████▉ | 5980/10000 [1:12:24<1:01:08, 1.10it/s, loss=0.0057, lr=1.11e-05, step=5980] Training: 60%|█████▉ | 5981/10000 [1:12:25<57:52, 1.16it/s, loss=0.0057, lr=1.11e-05, step=5980] Training: 60%|█████▉ | 5981/10000 [1:12:25<57:52, 1.16it/s, loss=0.0110, lr=1.11e-05, step=5981] Training: 60%|█████▉ | 5982/10000 [1:12:25<56:33, 1.18it/s, loss=0.0110, lr=1.11e-05, step=5981] Training: 60%|█████▉ | 5982/10000 [1:12:25<56:33, 1.18it/s, loss=0.0068, lr=1.11e-05, step=5982] Training: 60%|█████▉ | 5983/10000 [1:12:26<53:11, 1.26it/s, loss=0.0068, lr=1.11e-05, step=5982] Training: 60%|█████▉ | 5983/10000 [1:12:26<53:11, 1.26it/s, loss=0.0215, lr=1.11e-05, step=5983] Training: 60%|█████▉ | 5984/10000 [1:12:27<54:36, 1.23it/s, loss=0.0215, lr=1.11e-05, step=5983] Training: 60%|█████▉ | 5984/10000 [1:12:27<54:36, 1.23it/s, loss=0.0079, lr=1.10e-05, step=5984] Training: 60%|█████▉ | 5985/10000 [1:12:28<55:31, 1.21it/s, loss=0.0079, lr=1.10e-05, step=5984] Training: 60%|█████▉ | 5985/10000 [1:12:28<55:31, 1.21it/s, loss=0.0254, lr=1.10e-05, step=5985] Training: 60%|█████▉ | 5986/10000 [1:12:29<57:40, 1.16it/s, loss=0.0254, lr=1.10e-05, step=5985] Training: 60%|█████▉ | 5986/10000 [1:12:29<57:40, 1.16it/s, loss=0.0087, lr=1.10e-05, step=5986] Training: 60%|█████▉ | 5987/10000 [1:12:30<57:58, 1.15it/s, loss=0.0087, lr=1.10e-05, step=5986] Training: 60%|█████▉ | 5987/10000 [1:12:30<57:58, 1.15it/s, loss=0.0116, lr=1.10e-05, step=5987] Training: 60%|█████▉ | 5988/10000 [1:12:31<58:02, 1.15it/s, loss=0.0116, lr=1.10e-05, step=5987] Training: 60%|█████▉ | 5988/10000 [1:12:31<58:02, 1.15it/s, loss=0.0098, lr=1.10e-05, step=5988] Training: 60%|█████▉ | 5989/10000 [1:12:31<54:06, 1.24it/s, loss=0.0098, lr=1.10e-05, step=5988] Training: 60%|█████▉ | 5989/10000 [1:12:31<54:06, 1.24it/s, loss=0.0163, lr=1.10e-05, step=5989]17:18:38.744 [I] step=5990 loss=0.0974 smoothed_loss=0.0216 lr=1.10e-05 grad_norm=0.4690 step_time=0.6386s data_time=0.1555s it/s=1.259 eta_to_10000=3184.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0383 grad_action_out_proj=0.2206 grad_shared_expert=0.6704 (10775:train_pytorch.py:850) + Training: 60%|█████▉ | 5990/10000 [1:12:32<50:25, 1.33it/s, loss=0.0163, lr=1.10e-05, step=5989] Training: 60%|█████▉ | 5990/10000 [1:12:32<50:25, 1.33it/s, loss=0.0974, lr=1.10e-05, step=5990] Training: 60%|█████▉ | 5991/10000 [1:12:33<52:24, 1.28it/s, loss=0.0974, lr=1.10e-05, step=5990] Training: 60%|█████▉ | 5991/10000 [1:12:33<52:24, 1.28it/s, loss=0.0095, lr=1.10e-05, step=5991] Training: 60%|█████▉ | 5992/10000 [1:12:34<53:29, 1.25it/s, loss=0.0095, lr=1.10e-05, step=5991] Training: 60%|█████▉ | 5992/10000 [1:12:34<53:29, 1.25it/s, loss=0.0061, lr=1.10e-05, step=5992] Training: 60%|█████▉ | 5993/10000 [1:12:34<54:19, 1.23it/s, loss=0.0061, lr=1.10e-05, step=5992] Training: 60%|█████▉ | 5993/10000 [1:12:34<54:19, 1.23it/s, loss=0.0195, lr=1.10e-05, step=5993] Training: 60%|█████▉ | 5994/10000 [1:12:35<48:59, 1.36it/s, loss=0.0195, lr=1.10e-05, step=5993] Training: 60%|█████▉ | 5994/10000 [1:12:35<48:59, 1.36it/s, loss=0.0253, lr=1.10e-05, step=5994] Training: 60%|█████▉ | 5995/10000 [1:12:35<44:23, 1.50it/s, loss=0.0253, lr=1.10e-05, step=5994] Training: 60%|█████▉ | 5995/10000 [1:12:35<44:23, 1.50it/s, loss=0.0144, lr=1.10e-05, step=5995] Training: 60%|█████▉ | 5996/10000 [1:12:36<42:44, 1.56it/s, loss=0.0144, lr=1.10e-05, step=5995] Training: 60%|█████▉ | 5996/10000 [1:12:36<42:44, 1.56it/s, loss=0.0104, lr=1.10e-05, step=5996] Training: 60%|█████▉ | 5997/10000 [1:12:37<41:45, 1.60it/s, loss=0.0104, lr=1.10e-05, step=5996] Training: 60%|█████▉ | 5997/10000 [1:12:37<41:45, 1.60it/s, loss=0.0299, lr=1.10e-05, step=5997] Training: 60%|█████▉ | 5998/10000 [1:12:38<50:14, 1.33it/s, loss=0.0299, lr=1.10e-05, step=5997] Training: 60%|█████▉ | 5998/10000 [1:12:38<50:14, 1.33it/s, loss=0.0106, lr=1.10e-05, step=5998] Training: 60%|█████▉ | 5999/10000 [1:12:38<50:42, 1.32it/s, loss=0.0106, lr=1.10e-05, step=5998] Training: 60%|█████▉ | 5999/10000 [1:12:38<50:42, 1.32it/s, loss=0.0053, lr=1.10e-05, step=5999]17:18:46.268 [I] step=6000 loss=0.0108 smoothed_loss=0.0166 lr=1.10e-05 grad_norm=0.4662 step_time=0.6373s data_time=0.1151s it/s=1.329 eta_to_10000=3009.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.1015 grad_shared_expert=0.5939 (10775:train_pytorch.py:850) +17:19:56.648 [I] Saved checkpoint at step 6000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/6000 (10775:train_pytorch.py:350) + Training: 60%|██████ | 6000/10000 [1:13:50<24:21:46, 21.93s/it, loss=0.0053, lr=1.10e-05, step=5999] Training: 60%|██████ | 6000/10000 [1:13:50<24:21:46, 21.93s/it, loss=0.0108, lr=1.10e-05, step=6000] Training: 60%|██████ | 6001/10000 [1:13:50<17:16:00, 15.54s/it, loss=0.0108, lr=1.10e-05, step=6000] Training: 60%|██████ | 6001/10000 [1:13:50<17:16:00, 15.54s/it, loss=0.0120, lr=1.10e-05, step=6001] Training: 60%|██████ | 6002/10000 [1:13:51<12:14:56, 11.03s/it, loss=0.0120, lr=1.10e-05, step=6001] Training: 60%|██████ | 6002/10000 [1:13:51<12:14:56, 11.03s/it, loss=0.0017, lr=1.10e-05, step=6002] Training: 60%|██████ | 6003/10000 [1:13:51<8:47:00, 7.91s/it, loss=0.0017, lr=1.10e-05, step=6002] Training: 60%|██████ | 6003/10000 [1:13:51<8:47:00, 7.91s/it, loss=0.0055, lr=1.10e-05, step=6003] Training: 60%|██████ | 6004/10000 [1:13:52<6:18:49, 5.69s/it, loss=0.0055, lr=1.10e-05, step=6003] Training: 60%|██████ | 6004/10000 [1:13:52<6:18:49, 5.69s/it, loss=0.0013, lr=1.10e-05, step=6004] Training: 60%|██████ | 6005/10000 [1:13:53<4:39:24, 4.20s/it, loss=0.0013, lr=1.10e-05, step=6004] Training: 60%|██████ | 6005/10000 [1:13:53<4:39:24, 4.20s/it, loss=0.0035, lr=1.10e-05, step=6005] Training: 60%|██████ | 6006/10000 [1:13:54<3:34:56, 3.23s/it, loss=0.0035, lr=1.10e-05, step=6005] Training: 60%|██████ | 6006/10000 [1:13:54<3:34:56, 3.23s/it, loss=0.0044, lr=1.10e-05, step=6006] Training: 60%|██████ | 6007/10000 [1:13:54<2:43:07, 2.45s/it, loss=0.0044, lr=1.10e-05, step=6006] Training: 60%|██████ | 6007/10000 [1:13:54<2:43:07, 2.45s/it, loss=0.0382, lr=1.10e-05, step=6007] Training: 60%|██████ | 6008/10000 [1:13:55<2:08:45, 1.94s/it, loss=0.0382, lr=1.10e-05, step=6007] Training: 60%|██████ | 6008/10000 [1:13:55<2:08:45, 1.94s/it, loss=0.0382, lr=1.10e-05, step=6008] Training: 60%|██████ | 6009/10000 [1:13:56<1:40:17, 1.51s/it, loss=0.0382, lr=1.10e-05, step=6008] Training: 60%|██████ | 6009/10000 [1:13:56<1:40:17, 1.51s/it, loss=0.0046, lr=1.10e-05, step=6009]17:20:03.248 [I] step=6010 loss=0.0032 smoothed_loss=0.0138 lr=1.10e-05 grad_norm=0.4452 step_time=0.5486s data_time=7.1493s it/s=0.130 eta_to_10000=30714.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0078 grad_action_out_proj=0.0742 grad_shared_expert=0.2742 (10775:train_pytorch.py:850) + Training: 60%|██████ | 6010/10000 [1:13:56<1:25:09, 1.28s/it, loss=0.0046, lr=1.10e-05, step=6009] Training: 60%|██████ | 6010/10000 [1:13:56<1:25:09, 1.28s/it, loss=0.0032, lr=1.10e-05, step=6010] Training: 60%|██████ | 6011/10000 [1:13:57<1:09:21, 1.04s/it, loss=0.0032, lr=1.10e-05, step=6010] Training: 60%|██████ | 6011/10000 [1:13:57<1:09:21, 1.04s/it, loss=0.0034, lr=1.10e-05, step=6011] Training: 60%|██████ | 6012/10000 [1:13:57<1:01:51, 1.07it/s, loss=0.0034, lr=1.10e-05, step=6011] Training: 60%|██████ | 6012/10000 [1:13:57<1:01:51, 1.07it/s, loss=0.0036, lr=1.09e-05, step=6012] Training: 60%|██████ | 6013/10000 [1:13:58<59:27, 1.12it/s, loss=0.0036, lr=1.09e-05, step=6012] Training: 60%|██████ | 6013/10000 [1:13:58<59:27, 1.12it/s, loss=0.0088, lr=1.09e-05, step=6013] Training: 60%|██████ | 6014/10000 [1:13:59<58:28, 1.14it/s, loss=0.0088, lr=1.09e-05, step=6013] Training: 60%|██████ | 6014/10000 [1:13:59<58:28, 1.14it/s, loss=0.0066, lr=1.09e-05, step=6014] Training: 60%|██████ | 6015/10000 [1:14:00<50:45, 1.31it/s, loss=0.0066, lr=1.09e-05, step=6014] Training: 60%|██████ | 6015/10000 [1:14:00<50:45, 1.31it/s, loss=0.0146, lr=1.09e-05, step=6015] Training: 60%|██████ | 6016/10000 [1:14:00<45:14, 1.47it/s, loss=0.0146, lr=1.09e-05, step=6015] Training: 60%|██████ | 6016/10000 [1:14:00<45:14, 1.47it/s, loss=0.0022, lr=1.09e-05, step=6016] Training: 60%|██████ | 6017/10000 [1:14:01<46:00, 1.44it/s, loss=0.0022, lr=1.09e-05, step=6016] Training: 60%|██████ | 6017/10000 [1:14:01<46:00, 1.44it/s, loss=0.0041, lr=1.09e-05, step=6017] Training: 60%|██████ | 6018/10000 [1:14:01<44:02, 1.51it/s, loss=0.0041, lr=1.09e-05, step=6017] Training: 60%|██████ | 6018/10000 [1:14:01<44:02, 1.51it/s, loss=0.0087, lr=1.09e-05, step=6018] Training: 60%|██████ | 6019/10000 [1:14:02<41:21, 1.60it/s, loss=0.0087, lr=1.09e-05, step=6018] Training: 60%|██████ | 6019/10000 [1:14:02<41:21, 1.60it/s, loss=0.0137, lr=1.09e-05, step=6019]17:20:09.653 [I] step=6020 loss=0.0064 smoothed_loss=0.0097 lr=1.09e-05 grad_norm=0.4565 step_time=0.5474s data_time=0.0932s it/s=1.561 eta_to_10000=2549.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0202 grad_action_out_proj=0.1297 grad_shared_expert=0.5551 (10775:train_pytorch.py:850) + Training: 60%|██████ | 6020/10000 [1:14:03<44:10, 1.50it/s, loss=0.0137, lr=1.09e-05, step=6019] Training: 60%|██████ | 6020/10000 [1:14:03<44:10, 1.50it/s, loss=0.0064, lr=1.09e-05, step=6020] Training: 60%|██████ | 6021/10000 [1:14:04<48:39, 1.36it/s, loss=0.0064, lr=1.09e-05, step=6020] Training: 60%|██████ | 6021/10000 [1:14:04<48:39, 1.36it/s, loss=0.0038, lr=1.09e-05, step=6021] Training: 60%|██████ | 6022/10000 [1:14:04<47:24, 1.40it/s, loss=0.0038, lr=1.09e-05, step=6021] Training: 60%|██████ | 6022/10000 [1:14:04<47:24, 1.40it/s, loss=0.0567, lr=1.09e-05, step=6022] Training: 60%|██████ | 6023/10000 [1:14:05<46:02, 1.44it/s, loss=0.0567, lr=1.09e-05, step=6022] Training: 60%|██████ | 6023/10000 [1:14:05<46:02, 1.44it/s, loss=0.0101, lr=1.09e-05, step=6023] Training: 60%|██████ | 6024/10000 [1:14:06<47:42, 1.39it/s, loss=0.0101, lr=1.09e-05, step=6023] Training: 60%|██████ | 6024/10000 [1:14:06<47:42, 1.39it/s, loss=0.0036, lr=1.09e-05, step=6024] Training: 60%|██████ | 6025/10000 [1:14:06<43:03, 1.54it/s, loss=0.0036, lr=1.09e-05, step=6024] Training: 60%|██████ | 6025/10000 [1:14:06<43:03, 1.54it/s, loss=0.0234, lr=1.09e-05, step=6025] Training: 60%|██████ | 6026/10000 [1:14:07<45:55, 1.44it/s, loss=0.0234, lr=1.09e-05, step=6025] Training: 60%|██████ | 6026/10000 [1:14:07<45:55, 1.44it/s, loss=0.0053, lr=1.09e-05, step=6026] Training: 60%|██████ | 6027/10000 [1:14:08<50:14, 1.32it/s, loss=0.0053, lr=1.09e-05, step=6026] Training: 60%|██████ | 6027/10000 [1:14:08<50:14, 1.32it/s, loss=0.0198, lr=1.09e-05, step=6027] Training: 60%|██████ | 6028/10000 [1:14:08<45:00, 1.47it/s, loss=0.0198, lr=1.09e-05, step=6027] Training: 60%|██████ | 6028/10000 [1:14:08<45:00, 1.47it/s, loss=0.0063, lr=1.09e-05, step=6028] Training: 60%|██████ | 6029/10000 [1:14:09<45:55, 1.44it/s, loss=0.0063, lr=1.09e-05, step=6028] Training: 60%|██████ | 6029/10000 [1:14:09<45:55, 1.44it/s, loss=0.0052, lr=1.09e-05, step=6029]17:20:16.677 [I] step=6030 loss=0.0079 smoothed_loss=0.0116 lr=1.09e-05 grad_norm=0.4675 step_time=0.5829s data_time=0.1194s it/s=1.424 eta_to_10000=2787.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0093 grad_action_out_proj=0.0848 grad_shared_expert=0.4347 (10775:train_pytorch.py:850) + Training: 60%|██████ | 6030/10000 [1:14:10<44:25, 1.49it/s, loss=0.0052, lr=1.09e-05, step=6029] Training: 60%|██████ | 6030/10000 [1:14:10<44:25, 1.49it/s, loss=0.0079, lr=1.09e-05, step=6030] Training: 60%|██████ | 6031/10000 [1:14:10<41:03, 1.61it/s, loss=0.0079, lr=1.09e-05, step=6030] Training: 60%|██████ | 6031/10000 [1:14:10<41:03, 1.61it/s, loss=0.0292, lr=1.09e-05, step=6031] Training: 60%|██████ | 6032/10000 [1:14:11<39:37, 1.67it/s, loss=0.0292, lr=1.09e-05, step=6031] Training: 60%|██████ | 6032/10000 [1:14:11<39:37, 1.67it/s, loss=0.0040, lr=1.09e-05, step=6032] Training: 60%|██████ | 6033/10000 [1:14:12<42:40, 1.55it/s, loss=0.0040, lr=1.09e-05, step=6032] Training: 60%|██████ | 6033/10000 [1:14:12<42:40, 1.55it/s, loss=0.0030, lr=1.09e-05, step=6033] Training: 60%|██████ | 6034/10000 [1:14:13<51:02, 1.29it/s, loss=0.0030, lr=1.09e-05, step=6033] Training: 60%|██████ | 6034/10000 [1:14:13<51:02, 1.29it/s, loss=0.0083, lr=1.09e-05, step=6034] Training: 60%|██████ | 6035/10000 [1:14:13<45:39, 1.45it/s, loss=0.0083, lr=1.09e-05, step=6034] Training: 60%|██████ | 6035/10000 [1:14:13<45:39, 1.45it/s, loss=0.0217, lr=1.09e-05, step=6035] Training: 60%|██████ | 6036/10000 [1:14:14<45:54, 1.44it/s, loss=0.0217, lr=1.09e-05, step=6035] Training: 60%|██████ | 6036/10000 [1:14:14<45:54, 1.44it/s, loss=0.0205, lr=1.09e-05, step=6036] Training: 60%|██████ | 6037/10000 [1:14:15<45:57, 1.44it/s, loss=0.0205, lr=1.09e-05, step=6036] Training: 60%|██████ | 6037/10000 [1:14:15<45:57, 1.44it/s, loss=0.0012, lr=1.09e-05, step=6037] Training: 60%|██████ | 6038/10000 [1:14:15<47:58, 1.38it/s, loss=0.0012, lr=1.09e-05, step=6037] Training: 60%|██████ | 6038/10000 [1:14:15<47:58, 1.38it/s, loss=0.0036, lr=1.09e-05, step=6038] Training: 60%|██████ | 6039/10000 [1:14:16<44:01, 1.50it/s, loss=0.0036, lr=1.09e-05, step=6038] Training: 60%|██████ | 6039/10000 [1:14:16<44:01, 1.50it/s, loss=0.0233, lr=1.09e-05, step=6039]17:20:23.335 [I] step=6040 loss=0.0074 smoothed_loss=0.0118 lr=1.09e-05 grad_norm=0.4754 step_time=0.5518s data_time=0.1141s it/s=1.502 eta_to_10000=2636.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0122 grad_action_out_proj=0.1159 grad_shared_expert=0.3456 (10775:train_pytorch.py:850) + Training: 60%|██████ | 6040/10000 [1:14:16<41:49, 1.58it/s, loss=0.0233, lr=1.09e-05, step=6039] Training: 60%|██████ | 6040/10000 [1:14:16<41:49, 1.58it/s, loss=0.0074, lr=1.08e-05, step=6040] Training: 60%|██████ | 6041/10000 [1:14:17<45:27, 1.45it/s, loss=0.0074, lr=1.08e-05, step=6040] Training: 60%|██████ | 6041/10000 [1:14:17<45:27, 1.45it/s, loss=0.0065, lr=1.08e-05, step=6041] Training: 60%|██████ | 6042/10000 [1:14:18<42:14, 1.56it/s, loss=0.0065, lr=1.08e-05, step=6041] Training: 60%|██████ | 6042/10000 [1:14:18<42:14, 1.56it/s, loss=0.0049, lr=1.08e-05, step=6042] Training: 60%|██████ | 6043/10000 [1:14:18<42:03, 1.57it/s, loss=0.0049, lr=1.08e-05, step=6042] Training: 60%|██████ | 6043/10000 [1:14:18<42:03, 1.57it/s, loss=0.0049, lr=1.08e-05, step=6043] Training: 60%|██████ | 6044/10000 [1:14:19<44:45, 1.47it/s, loss=0.0049, lr=1.08e-05, step=6043] Training: 60%|██████ | 6044/10000 [1:14:19<44:45, 1.47it/s, loss=0.0025, lr=1.08e-05, step=6044] Training: 60%|██████ | 6045/10000 [1:14:20<42:01, 1.57it/s, loss=0.0025, lr=1.08e-05, step=6044] Training: 60%|██████ | 6045/10000 [1:14:20<42:01, 1.57it/s, loss=0.0084, lr=1.08e-05, step=6045] Training: 60%|██████ | 6046/10000 [1:14:20<39:07, 1.68it/s, loss=0.0084, lr=1.08e-05, step=6045] Training: 60%|██████ | 6046/10000 [1:14:20<39:07, 1.68it/s, loss=0.0071, lr=1.08e-05, step=6046] Training: 60%|██████ | 6047/10000 [1:14:21<37:17, 1.77it/s, loss=0.0071, lr=1.08e-05, step=6046] Training: 60%|██████ | 6047/10000 [1:14:21<37:17, 1.77it/s, loss=0.0041, lr=1.08e-05, step=6047] Training: 60%|██████ | 6048/10000 [1:14:21<37:17, 1.77it/s, loss=0.0041, lr=1.08e-05, step=6047] Training: 60%|██████ | 6048/10000 [1:14:21<37:17, 1.77it/s, loss=0.0077, lr=1.08e-05, step=6048] Training: 60%|██████ | 6049/10000 [1:14:22<40:55, 1.61it/s, loss=0.0077, lr=1.08e-05, step=6048] Training: 60%|██████ | 6049/10000 [1:14:22<40:55, 1.61it/s, loss=0.0031, lr=1.08e-05, step=6049]17:20:29.458 [I] step=6050 loss=0.0070 smoothed_loss=0.0078 lr=1.08e-05 grad_norm=0.3696 step_time=0.5377s data_time=0.0745s it/s=1.634 eta_to_10000=2417.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0871 grad_shared_expert=0.2742 (10775:train_pytorch.py:850) + Training: 60%|██████ | 6050/10000 [1:14:23<38:54, 1.69it/s, loss=0.0031, lr=1.08e-05, step=6049] Training: 60%|██████ | 6050/10000 [1:14:23<38:54, 1.69it/s, loss=0.0070, lr=1.08e-05, step=6050] Training: 61%|██████ | 6051/10000 [1:14:23<40:51, 1.61it/s, loss=0.0070, lr=1.08e-05, step=6050] Training: 61%|██████ | 6051/10000 [1:14:23<40:51, 1.61it/s, loss=0.0039, lr=1.08e-05, step=6051] Training: 61%|██████ | 6052/10000 [1:14:24<39:33, 1.66it/s, loss=0.0039, lr=1.08e-05, step=6051] Training: 61%|██████ | 6052/10000 [1:14:24<39:33, 1.66it/s, loss=0.0053, lr=1.08e-05, step=6052] Training: 61%|██████ | 6053/10000 [1:14:24<37:42, 1.74it/s, loss=0.0053, lr=1.08e-05, step=6052] Training: 61%|██████ | 6053/10000 [1:14:24<37:42, 1.74it/s, loss=0.0119, lr=1.08e-05, step=6053] Training: 61%|██████ | 6054/10000 [1:14:25<36:22, 1.81it/s, loss=0.0119, lr=1.08e-05, step=6053] Training: 61%|██████ | 6054/10000 [1:14:25<36:22, 1.81it/s, loss=0.0023, lr=1.08e-05, step=6054] Training: 61%|██████ | 6055/10000 [1:14:25<35:52, 1.83it/s, loss=0.0023, lr=1.08e-05, step=6054] Training: 61%|██████ | 6055/10000 [1:14:25<35:52, 1.83it/s, loss=0.0029, lr=1.08e-05, step=6055] Training: 61%|██████ | 6056/10000 [1:14:26<45:46, 1.44it/s, loss=0.0029, lr=1.08e-05, step=6055] Training: 61%|██████ | 6056/10000 [1:14:26<45:46, 1.44it/s, loss=0.0128, lr=1.08e-05, step=6056] Training: 61%|██████ | 6057/10000 [1:14:27<42:45, 1.54it/s, loss=0.0128, lr=1.08e-05, step=6056] Training: 61%|██████ | 6057/10000 [1:14:27<42:45, 1.54it/s, loss=0.0167, lr=1.08e-05, step=6057] Training: 61%|██████ | 6058/10000 [1:14:28<47:41, 1.38it/s, loss=0.0167, lr=1.08e-05, step=6057] Training: 61%|██████ | 6058/10000 [1:14:28<47:41, 1.38it/s, loss=0.0226, lr=1.08e-05, step=6058] Training: 61%|██████ | 6059/10000 [1:14:28<42:59, 1.53it/s, loss=0.0226, lr=1.08e-05, step=6058] Training: 61%|██████ | 6059/10000 [1:14:28<42:59, 1.53it/s, loss=0.0029, lr=1.08e-05, step=6059]17:20:35.764 [I] step=6060 loss=0.0091 smoothed_loss=0.0090 lr=1.08e-05 grad_norm=0.3595 step_time=0.5430s data_time=0.0876s it/s=1.586 eta_to_10000=2484.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0114 grad_action_out_proj=0.0852 grad_shared_expert=0.2392 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6060/10000 [1:14:29<40:40, 1.61it/s, loss=0.0029, lr=1.08e-05, step=6059] Training: 61%|██████ | 6060/10000 [1:14:29<40:40, 1.61it/s, loss=0.0091, lr=1.08e-05, step=6060] Training: 61%|██████ | 6061/10000 [1:14:29<41:30, 1.58it/s, loss=0.0091, lr=1.08e-05, step=6060] Training: 61%|██████ | 6061/10000 [1:14:29<41:30, 1.58it/s, loss=0.0190, lr=1.08e-05, step=6061] Training: 61%|██████ | 6062/10000 [1:14:30<43:46, 1.50it/s, loss=0.0190, lr=1.08e-05, step=6061] Training: 61%|██████ | 6062/10000 [1:14:30<43:46, 1.50it/s, loss=0.0177, lr=1.08e-05, step=6062] Training: 61%|██████ | 6063/10000 [1:14:31<47:42, 1.38it/s, loss=0.0177, lr=1.08e-05, step=6062] Training: 61%|██████ | 6063/10000 [1:14:31<47:42, 1.38it/s, loss=0.0059, lr=1.08e-05, step=6063] Training: 61%|██████ | 6064/10000 [1:14:32<44:22, 1.48it/s, loss=0.0059, lr=1.08e-05, step=6063] Training: 61%|██████ | 6064/10000 [1:14:32<44:22, 1.48it/s, loss=0.0041, lr=1.08e-05, step=6064] Training: 61%|██████ | 6065/10000 [1:14:32<44:46, 1.46it/s, loss=0.0041, lr=1.08e-05, step=6064] Training: 61%|██████ | 6065/10000 [1:14:32<44:46, 1.46it/s, loss=0.0049, lr=1.08e-05, step=6065] Training: 61%|██████ | 6066/10000 [1:14:33<41:26, 1.58it/s, loss=0.0049, lr=1.08e-05, step=6065] Training: 61%|██████ | 6066/10000 [1:14:33<41:26, 1.58it/s, loss=0.0095, lr=1.08e-05, step=6066] Training: 61%|██████ | 6067/10000 [1:14:33<38:41, 1.69it/s, loss=0.0095, lr=1.08e-05, step=6066] Training: 61%|██████ | 6067/10000 [1:14:33<38:41, 1.69it/s, loss=0.0061, lr=1.08e-05, step=6067] Training: 61%|██████ | 6068/10000 [1:14:34<37:11, 1.76it/s, loss=0.0061, lr=1.08e-05, step=6067] Training: 61%|██████ | 6068/10000 [1:14:34<37:11, 1.76it/s, loss=0.0043, lr=1.07e-05, step=6068] Training: 61%|██████ | 6069/10000 [1:14:35<38:27, 1.70it/s, loss=0.0043, lr=1.07e-05, step=6068] Training: 61%|██████ | 6069/10000 [1:14:35<38:27, 1.70it/s, loss=0.0064, lr=1.07e-05, step=6069]17:20:42.199 [I] step=6070 loss=0.0097 smoothed_loss=0.0084 lr=1.08e-05 grad_norm=0.3908 step_time=0.5493s data_time=0.0942s it/s=1.554 eta_to_10000=2528.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0072 grad_action_out_proj=0.0836 grad_shared_expert=0.2805 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6070/10000 [1:14:35<41:37, 1.57it/s, loss=0.0064, lr=1.07e-05, step=6069] Training: 61%|██████ | 6070/10000 [1:14:35<41:37, 1.57it/s, loss=0.0097, lr=1.07e-05, step=6070] Training: 61%|██████ | 6071/10000 [1:14:36<43:17, 1.51it/s, loss=0.0097, lr=1.07e-05, step=6070] Training: 61%|██████ | 6071/10000 [1:14:36<43:17, 1.51it/s, loss=0.0268, lr=1.07e-05, step=6071] Training: 61%|██████ | 6072/10000 [1:14:37<40:35, 1.61it/s, loss=0.0268, lr=1.07e-05, step=6071] Training: 61%|██████ | 6072/10000 [1:14:37<40:35, 1.61it/s, loss=0.0121, lr=1.07e-05, step=6072] Training: 61%|██████ | 6073/10000 [1:14:37<38:04, 1.72it/s, loss=0.0121, lr=1.07e-05, step=6072] Training: 61%|██████ | 6073/10000 [1:14:37<38:04, 1.72it/s, loss=0.0093, lr=1.07e-05, step=6073] Training: 61%|██████ | 6074/10000 [1:14:37<36:20, 1.80it/s, loss=0.0093, lr=1.07e-05, step=6073] Training: 61%|██████ | 6074/10000 [1:14:37<36:20, 1.80it/s, loss=0.0385, lr=1.07e-05, step=6074] Training: 61%|██████ | 6075/10000 [1:14:38<37:34, 1.74it/s, loss=0.0385, lr=1.07e-05, step=6074] Training: 61%|██████ | 6075/10000 [1:14:38<37:34, 1.74it/s, loss=0.0084, lr=1.07e-05, step=6075] Training: 61%|██████ | 6076/10000 [1:14:39<37:16, 1.75it/s, loss=0.0084, lr=1.07e-05, step=6075] Training: 61%|██████ | 6076/10000 [1:14:39<37:16, 1.75it/s, loss=0.0628, lr=1.07e-05, step=6076] Training: 61%|██████ | 6077/10000 [1:14:40<44:42, 1.46it/s, loss=0.0628, lr=1.07e-05, step=6076] Training: 61%|██████ | 6077/10000 [1:14:40<44:42, 1.46it/s, loss=0.0023, lr=1.07e-05, step=6077] Training: 61%|██████ | 6078/10000 [1:14:40<46:48, 1.40it/s, loss=0.0023, lr=1.07e-05, step=6077] Training: 61%|██████ | 6078/10000 [1:14:40<46:48, 1.40it/s, loss=0.0114, lr=1.07e-05, step=6078] Training: 61%|██████ | 6079/10000 [1:14:41<43:02, 1.52it/s, loss=0.0114, lr=1.07e-05, step=6078] Training: 61%|██████ | 6079/10000 [1:14:41<43:02, 1.52it/s, loss=0.0081, lr=1.07e-05, step=6079]17:20:48.546 [I] step=6080 loss=0.0028 smoothed_loss=0.0137 lr=1.07e-05 grad_norm=0.4124 step_time=0.5404s data_time=0.0943s it/s=1.576 eta_to_10000=2487.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0047 grad_action_out_proj=0.0688 grad_shared_expert=0.1931 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6080/10000 [1:14:42<43:16, 1.51it/s, loss=0.0081, lr=1.07e-05, step=6079] Training: 61%|██████ | 6080/10000 [1:14:42<43:16, 1.51it/s, loss=0.0028, lr=1.07e-05, step=6080] Training: 61%|██████ | 6081/10000 [1:14:42<40:33, 1.61it/s, loss=0.0028, lr=1.07e-05, step=6080] Training: 61%|██████ | 6081/10000 [1:14:42<40:33, 1.61it/s, loss=0.0089, lr=1.07e-05, step=6081] Training: 61%|██████ | 6082/10000 [1:14:43<45:00, 1.45it/s, loss=0.0089, lr=1.07e-05, step=6081] Training: 61%|██████ | 6082/10000 [1:14:43<45:00, 1.45it/s, loss=0.0080, lr=1.07e-05, step=6082] Training: 61%|██████ | 6083/10000 [1:14:44<42:28, 1.54it/s, loss=0.0080, lr=1.07e-05, step=6082] Training: 61%|██████ | 6083/10000 [1:14:44<42:28, 1.54it/s, loss=0.0201, lr=1.07e-05, step=6083] Training: 61%|██████ | 6084/10000 [1:14:44<44:57, 1.45it/s, loss=0.0201, lr=1.07e-05, step=6083] Training: 61%|██████ | 6084/10000 [1:14:44<44:57, 1.45it/s, loss=0.0155, lr=1.07e-05, step=6084] Training: 61%|██████ | 6085/10000 [1:14:45<48:48, 1.34it/s, loss=0.0155, lr=1.07e-05, step=6084] Training: 61%|██████ | 6085/10000 [1:14:45<48:48, 1.34it/s, loss=0.0068, lr=1.07e-05, step=6085] Training: 61%|██████ | 6086/10000 [1:14:46<47:22, 1.38it/s, loss=0.0068, lr=1.07e-05, step=6085] Training: 61%|██████ | 6086/10000 [1:14:46<47:22, 1.38it/s, loss=0.0092, lr=1.07e-05, step=6086] Training: 61%|██████ | 6087/10000 [1:14:46<43:18, 1.51it/s, loss=0.0092, lr=1.07e-05, step=6086] Training: 61%|██████ | 6087/10000 [1:14:46<43:18, 1.51it/s, loss=0.0142, lr=1.07e-05, step=6087] Training: 61%|██████ | 6088/10000 [1:14:47<44:25, 1.47it/s, loss=0.0142, lr=1.07e-05, step=6087] Training: 61%|██████ | 6088/10000 [1:14:47<44:25, 1.47it/s, loss=0.0093, lr=1.07e-05, step=6088] Training: 61%|██████ | 6089/10000 [1:14:48<43:30, 1.50it/s, loss=0.0093, lr=1.07e-05, step=6088] Training: 61%|██████ | 6089/10000 [1:14:48<43:30, 1.50it/s, loss=0.0022, lr=1.07e-05, step=6089]17:20:55.268 [I] step=6090 loss=0.0025 smoothed_loss=0.0105 lr=1.07e-05 grad_norm=0.4347 step_time=0.5532s data_time=0.1190s it/s=1.488 eta_to_10000=2627.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0220 grad_action_out_proj=0.2006 grad_shared_expert=0.4370 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6090/10000 [1:14:48<41:38, 1.56it/s, loss=0.0022, lr=1.07e-05, step=6089] Training: 61%|██████ | 6090/10000 [1:14:48<41:38, 1.56it/s, loss=0.0025, lr=1.07e-05, step=6090] Training: 61%|██████ | 6091/10000 [1:14:49<39:23, 1.65it/s, loss=0.0025, lr=1.07e-05, step=6090] Training: 61%|██████ | 6091/10000 [1:14:49<39:23, 1.65it/s, loss=0.0590, lr=1.07e-05, step=6091] Training: 61%|██████ | 6092/10000 [1:14:50<47:43, 1.36it/s, loss=0.0590, lr=1.07e-05, step=6091] Training: 61%|██████ | 6092/10000 [1:14:50<47:43, 1.36it/s, loss=0.0038, lr=1.07e-05, step=6092] Training: 61%|██████ | 6093/10000 [1:14:51<47:16, 1.38it/s, loss=0.0038, lr=1.07e-05, step=6092] Training: 61%|██████ | 6093/10000 [1:14:51<47:16, 1.38it/s, loss=0.0242, lr=1.07e-05, step=6093] Training: 61%|██████ | 6094/10000 [1:14:51<47:57, 1.36it/s, loss=0.0242, lr=1.07e-05, step=6093] Training: 61%|██████ | 6094/10000 [1:14:51<47:57, 1.36it/s, loss=0.0086, lr=1.07e-05, step=6094] Training: 61%|██████ | 6095/10000 [1:14:52<48:00, 1.36it/s, loss=0.0086, lr=1.07e-05, step=6094] Training: 61%|██████ | 6095/10000 [1:14:52<48:00, 1.36it/s, loss=0.0499, lr=1.07e-05, step=6095] Training: 61%|██████ | 6096/10000 [1:14:53<43:05, 1.51it/s, loss=0.0499, lr=1.07e-05, step=6095] Training: 61%|██████ | 6096/10000 [1:14:53<43:05, 1.51it/s, loss=0.0055, lr=1.06e-05, step=6096] Training: 61%|██████ | 6097/10000 [1:14:53<40:05, 1.62it/s, loss=0.0055, lr=1.06e-05, step=6096] Training: 61%|██████ | 6097/10000 [1:14:53<40:05, 1.62it/s, loss=0.0229, lr=1.06e-05, step=6097] Training: 61%|██████ | 6098/10000 [1:14:54<37:35, 1.73it/s, loss=0.0229, lr=1.06e-05, step=6097] Training: 61%|██████ | 6098/10000 [1:14:54<37:35, 1.73it/s, loss=0.0085, lr=1.06e-05, step=6098] Training: 61%|██████ | 6099/10000 [1:14:54<41:34, 1.56it/s, loss=0.0085, lr=1.06e-05, step=6098] Training: 61%|██████ | 6099/10000 [1:14:54<41:34, 1.56it/s, loss=0.0071, lr=1.06e-05, step=6099]17:21:01.829 [I] step=6100 loss=0.0096 smoothed_loss=0.0150 lr=1.06e-05 grad_norm=0.5231 step_time=0.5492s data_time=0.1069s it/s=1.524 eta_to_10000=2558.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0094 grad_action_out_proj=0.0951 grad_shared_expert=0.4486 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6100/10000 [1:14:55<39:21, 1.65it/s, loss=0.0071, lr=1.06e-05, step=6099] Training: 61%|██████ | 6100/10000 [1:14:55<39:21, 1.65it/s, loss=0.0096, lr=1.06e-05, step=6100] Training: 61%|██████ | 6101/10000 [1:14:56<44:31, 1.46it/s, loss=0.0096, lr=1.06e-05, step=6100] Training: 61%|██████ | 6101/10000 [1:14:56<44:31, 1.46it/s, loss=0.0052, lr=1.06e-05, step=6101] Training: 61%|██████ | 6102/10000 [1:14:56<45:10, 1.44it/s, loss=0.0052, lr=1.06e-05, step=6101] Training: 61%|██████ | 6102/10000 [1:14:56<45:10, 1.44it/s, loss=0.0032, lr=1.06e-05, step=6102] Training: 61%|██████ | 6103/10000 [1:14:57<41:39, 1.56it/s, loss=0.0032, lr=1.06e-05, step=6102] Training: 61%|██████ | 6103/10000 [1:14:57<41:39, 1.56it/s, loss=0.0178, lr=1.06e-05, step=6103] Training: 61%|██████ | 6104/10000 [1:14:57<38:44, 1.68it/s, loss=0.0178, lr=1.06e-05, step=6103] Training: 61%|██████ | 6104/10000 [1:14:57<38:44, 1.68it/s, loss=0.0189, lr=1.06e-05, step=6104] Training: 61%|██████ | 6105/10000 [1:14:58<37:02, 1.75it/s, loss=0.0189, lr=1.06e-05, step=6104] Training: 61%|██████ | 6105/10000 [1:14:58<37:02, 1.75it/s, loss=0.0154, lr=1.06e-05, step=6105] Training: 61%|██████ | 6106/10000 [1:14:59<39:56, 1.63it/s, loss=0.0154, lr=1.06e-05, step=6105] Training: 61%|██████ | 6106/10000 [1:14:59<39:56, 1.63it/s, loss=0.0088, lr=1.06e-05, step=6106] Training: 61%|██████ | 6107/10000 [1:14:59<41:49, 1.55it/s, loss=0.0088, lr=1.06e-05, step=6106] Training: 61%|██████ | 6107/10000 [1:14:59<41:49, 1.55it/s, loss=0.0105, lr=1.06e-05, step=6107] Training: 61%|██████ | 6108/10000 [1:15:00<45:46, 1.42it/s, loss=0.0105, lr=1.06e-05, step=6107] Training: 61%|██████ | 6108/10000 [1:15:00<45:46, 1.42it/s, loss=0.0054, lr=1.06e-05, step=6108] Training: 61%|██████ | 6109/10000 [1:15:01<41:46, 1.55it/s, loss=0.0054, lr=1.06e-05, step=6108] Training: 61%|██████ | 6109/10000 [1:15:01<41:46, 1.55it/s, loss=0.0301, lr=1.06e-05, step=6109]17:21:08.443 [I] step=6110 loss=0.0070 smoothed_loss=0.0135 lr=1.06e-05 grad_norm=0.4706 step_time=0.5484s data_time=0.1130s it/s=1.512 eta_to_10000=2572.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0154 grad_action_out_proj=0.1844 grad_shared_expert=0.4852 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6110/10000 [1:15:02<43:19, 1.50it/s, loss=0.0301, lr=1.06e-05, step=6109] Training: 61%|██████ | 6110/10000 [1:15:02<43:19, 1.50it/s, loss=0.0070, lr=1.06e-05, step=6110] Training: 61%|██████ | 6111/10000 [1:15:02<42:48, 1.51it/s, loss=0.0070, lr=1.06e-05, step=6110] Training: 61%|██████ | 6111/10000 [1:15:02<42:48, 1.51it/s, loss=0.0052, lr=1.06e-05, step=6111] Training: 61%|██████ | 6112/10000 [1:15:03<41:51, 1.55it/s, loss=0.0052, lr=1.06e-05, step=6111] Training: 61%|██████ | 6112/10000 [1:15:03<41:51, 1.55it/s, loss=0.0260, lr=1.06e-05, step=6112] Training: 61%|██████ | 6113/10000 [1:15:03<38:56, 1.66it/s, loss=0.0260, lr=1.06e-05, step=6112] Training: 61%|██████ | 6113/10000 [1:15:03<38:56, 1.66it/s, loss=0.0064, lr=1.06e-05, step=6113] Training: 61%|██████ | 6114/10000 [1:15:04<45:34, 1.42it/s, loss=0.0064, lr=1.06e-05, step=6113] Training: 61%|██████ | 6114/10000 [1:15:04<45:34, 1.42it/s, loss=0.0083, lr=1.06e-05, step=6114] Training: 61%|██████ | 6115/10000 [1:15:05<50:30, 1.28it/s, loss=0.0083, lr=1.06e-05, step=6114] Training: 61%|██████ | 6115/10000 [1:15:05<50:30, 1.28it/s, loss=0.0042, lr=1.06e-05, step=6115] Training: 61%|██████ | 6116/10000 [1:15:06<50:23, 1.28it/s, loss=0.0042, lr=1.06e-05, step=6115] Training: 61%|██████ | 6116/10000 [1:15:06<50:23, 1.28it/s, loss=0.0359, lr=1.06e-05, step=6116] Training: 61%|██████ | 6117/10000 [1:15:06<46:04, 1.40it/s, loss=0.0359, lr=1.06e-05, step=6116] Training: 61%|██████ | 6117/10000 [1:15:06<46:04, 1.40it/s, loss=0.0142, lr=1.06e-05, step=6117] Training: 61%|██████ | 6118/10000 [1:15:07<45:46, 1.41it/s, loss=0.0142, lr=1.06e-05, step=6117] Training: 61%|██████ | 6118/10000 [1:15:07<45:46, 1.41it/s, loss=0.0089, lr=1.06e-05, step=6118] Training: 61%|██████ | 6119/10000 [1:15:08<42:09, 1.53it/s, loss=0.0089, lr=1.06e-05, step=6118] Training: 61%|██████ | 6119/10000 [1:15:08<42:09, 1.53it/s, loss=0.0055, lr=1.06e-05, step=6119]17:21:15.349 [I] step=6120 loss=0.0153 smoothed_loss=0.0132 lr=1.06e-05 grad_norm=0.4617 step_time=0.5556s data_time=0.1350s it/s=1.448 eta_to_10000=2679.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0096 grad_action_out_proj=0.1312 grad_shared_expert=0.4227 (10775:train_pytorch.py:850) + Training: 61%|██████ | 6120/10000 [1:15:08<43:08, 1.50it/s, loss=0.0055, lr=1.06e-05, step=6119] Training: 61%|██████ | 6120/10000 [1:15:08<43:08, 1.50it/s, loss=0.0153, lr=1.06e-05, step=6120] Training: 61%|██████ | 6121/10000 [1:15:09<48:20, 1.34it/s, loss=0.0153, lr=1.06e-05, step=6120] Training: 61%|██████ | 6121/10000 [1:15:09<48:20, 1.34it/s, loss=0.0179, lr=1.06e-05, step=6121] Training: 61%|██████ | 6122/10000 [1:15:10<47:55, 1.35it/s, loss=0.0179, lr=1.06e-05, step=6121] Training: 61%|██████ | 6122/10000 [1:15:10<47:55, 1.35it/s, loss=0.0076, lr=1.06e-05, step=6122] Training: 61%|██████ | 6123/10000 [1:15:11<46:34, 1.39it/s, loss=0.0076, lr=1.06e-05, step=6122] Training: 61%|██████ | 6123/10000 [1:15:11<46:34, 1.39it/s, loss=0.0031, lr=1.06e-05, step=6123] Training: 61%|██████ | 6124/10000 [1:15:11<42:00, 1.54it/s, loss=0.0031, lr=1.06e-05, step=6123] Training: 61%|██████ | 6124/10000 [1:15:11<42:00, 1.54it/s, loss=0.0233, lr=1.05e-05, step=6124] Training: 61%|██████▏ | 6125/10000 [1:15:12<39:21, 1.64it/s, loss=0.0233, lr=1.05e-05, step=6124] Training: 61%|██████▏ | 6125/10000 [1:15:12<39:21, 1.64it/s, loss=0.0159, lr=1.05e-05, step=6125] Training: 61%|██████▏ | 6126/10000 [1:15:12<41:12, 1.57it/s, loss=0.0159, lr=1.05e-05, step=6125] Training: 61%|██████▏ | 6126/10000 [1:15:12<41:12, 1.57it/s, loss=0.0096, lr=1.05e-05, step=6126] Training: 61%|██████▏ | 6127/10000 [1:15:13<44:29, 1.45it/s, loss=0.0096, lr=1.05e-05, step=6126] Training: 61%|██████▏ | 6127/10000 [1:15:13<44:29, 1.45it/s, loss=0.0075, lr=1.05e-05, step=6127] Training: 61%|██████▏ | 6128/10000 [1:15:14<47:06, 1.37it/s, loss=0.0075, lr=1.05e-05, step=6127] Training: 61%|██████▏ | 6128/10000 [1:15:14<47:06, 1.37it/s, loss=0.0140, lr=1.05e-05, step=6128] Training: 61%|██████▏ | 6129/10000 [1:15:15<49:41, 1.30it/s, loss=0.0140, lr=1.05e-05, step=6128] Training: 61%|██████▏ | 6129/10000 [1:15:15<49:41, 1.30it/s, loss=0.0049, lr=1.05e-05, step=6129]17:21:22.607 [I] step=6130 loss=0.0065 smoothed_loss=0.0113 lr=1.05e-05 grad_norm=0.4387 step_time=0.5954s data_time=0.1304s it/s=1.378 eta_to_10000=2808.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.1411 grad_shared_expert=0.4460 (10775:train_pytorch.py:850) + Training: 61%|██████▏ | 6130/10000 [1:15:16<48:42, 1.32it/s, loss=0.0049, lr=1.05e-05, step=6129] Training: 61%|██████▏ | 6130/10000 [1:15:16<48:42, 1.32it/s, loss=0.0065, lr=1.05e-05, step=6130] Training: 61%|██████▏ | 6131/10000 [1:15:16<43:14, 1.49it/s, loss=0.0065, lr=1.05e-05, step=6130] Training: 61%|██████▏ | 6131/10000 [1:15:16<43:14, 1.49it/s, loss=0.0056, lr=1.05e-05, step=6131] Training: 61%|██████▏ | 6132/10000 [1:15:17<39:47, 1.62it/s, loss=0.0056, lr=1.05e-05, step=6131] Training: 61%|██████▏ | 6132/10000 [1:15:17<39:47, 1.62it/s, loss=0.0582, lr=1.05e-05, step=6132] Training: 61%|██████▏ | 6133/10000 [1:15:17<39:48, 1.62it/s, loss=0.0582, lr=1.05e-05, step=6132] Training: 61%|██████▏ | 6133/10000 [1:15:17<39:48, 1.62it/s, loss=0.0202, lr=1.05e-05, step=6133] Training: 61%|██████▏ | 6134/10000 [1:15:18<37:26, 1.72it/s, loss=0.0202, lr=1.05e-05, step=6133] Training: 61%|██████▏ | 6134/10000 [1:15:18<37:26, 1.72it/s, loss=0.0079, lr=1.05e-05, step=6134] Training: 61%|██████▏ | 6135/10000 [1:15:19<43:37, 1.48it/s, loss=0.0079, lr=1.05e-05, step=6134] Training: 61%|██████▏ | 6135/10000 [1:15:19<43:37, 1.48it/s, loss=0.0182, lr=1.05e-05, step=6135] Training: 61%|██████▏ | 6136/10000 [1:15:19<44:33, 1.45it/s, loss=0.0182, lr=1.05e-05, step=6135] Training: 61%|██████▏ | 6136/10000 [1:15:19<44:33, 1.45it/s, loss=0.0525, lr=1.05e-05, step=6136] Training: 61%|██████▏ | 6137/10000 [1:15:20<46:50, 1.37it/s, loss=0.0525, lr=1.05e-05, step=6136] Training: 61%|██████▏ | 6137/10000 [1:15:20<46:50, 1.37it/s, loss=0.0230, lr=1.05e-05, step=6137] Training: 61%|██████▏ | 6138/10000 [1:15:21<43:17, 1.49it/s, loss=0.0230, lr=1.05e-05, step=6137] Training: 61%|██████▏ | 6138/10000 [1:15:21<43:17, 1.49it/s, loss=0.0029, lr=1.05e-05, step=6138] Training: 61%|██████▏ | 6139/10000 [1:15:21<42:41, 1.51it/s, loss=0.0029, lr=1.05e-05, step=6138] Training: 61%|██████▏ | 6139/10000 [1:15:21<42:41, 1.51it/s, loss=0.0080, lr=1.05e-05, step=6139]17:21:29.253 [I] step=6140 loss=0.0045 smoothed_loss=0.0157 lr=1.05e-05 grad_norm=0.4318 step_time=0.5392s data_time=0.1254s it/s=1.505 eta_to_10000=2565.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0044 grad_action_out_proj=0.0606 grad_shared_expert=0.1837 (10775:train_pytorch.py:850) + Training: 61%|██████▏ | 6140/10000 [1:15:22<48:02, 1.34it/s, loss=0.0080, lr=1.05e-05, step=6139] Training: 61%|██████▏ | 6140/10000 [1:15:22<48:02, 1.34it/s, loss=0.0045, lr=1.05e-05, step=6140] Training: 61%|██████▏ | 6141/10000 [1:15:23<43:59, 1.46it/s, loss=0.0045, lr=1.05e-05, step=6140] Training: 61%|██████▏ | 6141/10000 [1:15:23<43:59, 1.46it/s, loss=0.0058, lr=1.05e-05, step=6141] Training: 61%|██████▏ | 6142/10000 [1:15:24<47:34, 1.35it/s, loss=0.0058, lr=1.05e-05, step=6141] Training: 61%|██████▏ | 6142/10000 [1:15:24<47:34, 1.35it/s, loss=0.0087, lr=1.05e-05, step=6142] Training: 61%|██████▏ | 6143/10000 [1:15:25<49:23, 1.30it/s, loss=0.0087, lr=1.05e-05, step=6142] Training: 61%|██████▏ | 6143/10000 [1:15:25<49:23, 1.30it/s, loss=0.0102, lr=1.05e-05, step=6143] Training: 61%|██████▏ | 6144/10000 [1:15:25<51:12, 1.26it/s, loss=0.0102, lr=1.05e-05, step=6143] Training: 61%|██████▏ | 6144/10000 [1:15:25<51:12, 1.26it/s, loss=0.0049, lr=1.05e-05, step=6144] Training: 61%|██████▏ | 6145/10000 [1:15:26<51:33, 1.25it/s, loss=0.0049, lr=1.05e-05, step=6144] Training: 61%|██████▏ | 6145/10000 [1:15:26<51:33, 1.25it/s, loss=0.0131, lr=1.05e-05, step=6145] Training: 61%|██████▏ | 6146/10000 [1:15:27<49:37, 1.29it/s, loss=0.0131, lr=1.05e-05, step=6145] Training: 61%|██████▏ | 6146/10000 [1:15:27<49:37, 1.29it/s, loss=0.0096, lr=1.05e-05, step=6146] Training: 61%|██████▏ | 6147/10000 [1:15:28<49:03, 1.31it/s, loss=0.0096, lr=1.05e-05, step=6146] Training: 61%|██████▏ | 6147/10000 [1:15:28<49:03, 1.31it/s, loss=0.0095, lr=1.05e-05, step=6147] Training: 61%|██████▏ | 6148/10000 [1:15:28<45:29, 1.41it/s, loss=0.0095, lr=1.05e-05, step=6147] Training: 61%|██████▏ | 6148/10000 [1:15:28<45:29, 1.41it/s, loss=0.0051, lr=1.05e-05, step=6148] Training: 61%|██████▏ | 6149/10000 [1:15:29<49:13, 1.30it/s, loss=0.0051, lr=1.05e-05, step=6148] Training: 61%|██████▏ | 6149/10000 [1:15:29<49:13, 1.30it/s, loss=0.0081, lr=1.05e-05, step=6149]17:21:36.673 [I] step=6150 loss=0.0048 smoothed_loss=0.0105 lr=1.05e-05 grad_norm=0.4372 step_time=0.6048s data_time=0.1373s it/s=1.348 eta_to_10000=2856.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0052 grad_action_out_proj=0.0655 grad_shared_expert=0.2953 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6150/10000 [1:15:30<45:24, 1.41it/s, loss=0.0081, lr=1.05e-05, step=6149] Training: 62%|██████▏ | 6150/10000 [1:15:30<45:24, 1.41it/s, loss=0.0048, lr=1.05e-05, step=6150] Training: 62%|██████▏ | 6151/10000 [1:15:30<45:31, 1.41it/s, loss=0.0048, lr=1.05e-05, step=6150] Training: 62%|██████▏ | 6151/10000 [1:15:30<45:31, 1.41it/s, loss=0.0027, lr=1.05e-05, step=6151] Training: 62%|██████▏ | 6152/10000 [1:15:31<45:29, 1.41it/s, loss=0.0027, lr=1.05e-05, step=6151] Training: 62%|██████▏ | 6152/10000 [1:15:31<45:29, 1.41it/s, loss=0.0127, lr=1.04e-05, step=6152] Training: 62%|██████▏ | 6153/10000 [1:15:32<41:15, 1.55it/s, loss=0.0127, lr=1.04e-05, step=6152] Training: 62%|██████▏ | 6153/10000 [1:15:32<41:15, 1.55it/s, loss=0.0026, lr=1.04e-05, step=6153] Training: 62%|██████▏ | 6154/10000 [1:15:32<38:37, 1.66it/s, loss=0.0026, lr=1.04e-05, step=6153] Training: 62%|██████▏ | 6154/10000 [1:15:32<38:37, 1.66it/s, loss=0.0215, lr=1.04e-05, step=6154] Training: 62%|██████▏ | 6155/10000 [1:15:33<43:07, 1.49it/s, loss=0.0215, lr=1.04e-05, step=6154] Training: 62%|██████▏ | 6155/10000 [1:15:33<43:07, 1.49it/s, loss=0.0092, lr=1.04e-05, step=6155] Training: 62%|██████▏ | 6156/10000 [1:15:34<44:19, 1.45it/s, loss=0.0092, lr=1.04e-05, step=6155] Training: 62%|██████▏ | 6156/10000 [1:15:34<44:19, 1.45it/s, loss=0.0544, lr=1.04e-05, step=6156] Training: 62%|██████▏ | 6157/10000 [1:15:35<47:22, 1.35it/s, loss=0.0544, lr=1.04e-05, step=6156] Training: 62%|██████▏ | 6157/10000 [1:15:35<47:22, 1.35it/s, loss=0.0020, lr=1.04e-05, step=6157] Training: 62%|██████▏ | 6158/10000 [1:15:35<43:40, 1.47it/s, loss=0.0020, lr=1.04e-05, step=6157] Training: 62%|██████▏ | 6158/10000 [1:15:35<43:40, 1.47it/s, loss=0.0092, lr=1.04e-05, step=6158] Training: 62%|██████▏ | 6159/10000 [1:15:36<40:09, 1.59it/s, loss=0.0092, lr=1.04e-05, step=6158] Training: 62%|██████▏ | 6159/10000 [1:15:36<40:09, 1.59it/s, loss=0.0067, lr=1.04e-05, step=6159]17:21:43.183 [I] step=6160 loss=0.0154 smoothed_loss=0.0127 lr=1.04e-05 grad_norm=0.4479 step_time=0.5438s data_time=0.1072s it/s=1.536 eta_to_10000=2499.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0073 grad_action_out_proj=0.0895 grad_shared_expert=0.3389 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6160/10000 [1:15:36<39:58, 1.60it/s, loss=0.0067, lr=1.04e-05, step=6159] Training: 62%|██████▏ | 6160/10000 [1:15:36<39:58, 1.60it/s, loss=0.0154, lr=1.04e-05, step=6160] Training: 62%|██████▏ | 6161/10000 [1:15:37<37:46, 1.69it/s, loss=0.0154, lr=1.04e-05, step=6160] Training: 62%|██████▏ | 6161/10000 [1:15:37<37:46, 1.69it/s, loss=0.0274, lr=1.04e-05, step=6161] Training: 62%|██████▏ | 6162/10000 [1:15:37<36:52, 1.74it/s, loss=0.0274, lr=1.04e-05, step=6161] Training: 62%|██████▏ | 6162/10000 [1:15:37<36:52, 1.74it/s, loss=0.0726, lr=1.04e-05, step=6162] Training: 62%|██████▏ | 6163/10000 [1:15:38<41:34, 1.54it/s, loss=0.0726, lr=1.04e-05, step=6162] Training: 62%|██████▏ | 6163/10000 [1:15:38<41:34, 1.54it/s, loss=0.0063, lr=1.04e-05, step=6163] Training: 62%|██████▏ | 6164/10000 [1:15:39<45:22, 1.41it/s, loss=0.0063, lr=1.04e-05, step=6163] Training: 62%|██████▏ | 6164/10000 [1:15:39<45:22, 1.41it/s, loss=0.0064, lr=1.04e-05, step=6164] Training: 62%|██████▏ | 6165/10000 [1:15:39<41:40, 1.53it/s, loss=0.0064, lr=1.04e-05, step=6164] Training: 62%|██████▏ | 6165/10000 [1:15:39<41:40, 1.53it/s, loss=0.0040, lr=1.04e-05, step=6165] Training: 62%|██████▏ | 6166/10000 [1:15:40<38:30, 1.66it/s, loss=0.0040, lr=1.04e-05, step=6165] Training: 62%|██████▏ | 6166/10000 [1:15:40<38:30, 1.66it/s, loss=0.0029, lr=1.04e-05, step=6166] Training: 62%|██████▏ | 6167/10000 [1:15:41<38:21, 1.67it/s, loss=0.0029, lr=1.04e-05, step=6166] Training: 62%|██████▏ | 6167/10000 [1:15:41<38:21, 1.67it/s, loss=0.0041, lr=1.04e-05, step=6167] Training: 62%|██████▏ | 6168/10000 [1:15:41<40:02, 1.59it/s, loss=0.0041, lr=1.04e-05, step=6167] Training: 62%|██████▏ | 6168/10000 [1:15:41<40:02, 1.59it/s, loss=0.0184, lr=1.04e-05, step=6168] Training: 62%|██████▏ | 6169/10000 [1:15:42<37:36, 1.70it/s, loss=0.0184, lr=1.04e-05, step=6168] Training: 62%|██████▏ | 6169/10000 [1:15:42<37:36, 1.70it/s, loss=0.0156, lr=1.04e-05, step=6169]17:21:49.611 [I] step=6170 loss=0.0055 smoothed_loss=0.0134 lr=1.04e-05 grad_norm=0.4963 step_time=0.5382s data_time=0.1045s it/s=1.556 eta_to_10000=2461.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0363 grad_action_out_proj=0.1370 grad_shared_expert=0.6139 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6170/10000 [1:15:43<43:48, 1.46it/s, loss=0.0156, lr=1.04e-05, step=6169] Training: 62%|██████▏ | 6170/10000 [1:15:43<43:48, 1.46it/s, loss=0.0055, lr=1.04e-05, step=6170] Training: 62%|██████▏ | 6171/10000 [1:15:43<43:45, 1.46it/s, loss=0.0055, lr=1.04e-05, step=6170] Training: 62%|██████▏ | 6171/10000 [1:15:43<43:45, 1.46it/s, loss=0.0059, lr=1.04e-05, step=6171] Training: 62%|██████▏ | 6172/10000 [1:15:44<40:11, 1.59it/s, loss=0.0059, lr=1.04e-05, step=6171] Training: 62%|██████▏ | 6172/10000 [1:15:44<40:11, 1.59it/s, loss=0.0128, lr=1.04e-05, step=6172] Training: 62%|██████▏ | 6173/10000 [1:15:44<37:46, 1.69it/s, loss=0.0128, lr=1.04e-05, step=6172] Training: 62%|██████▏ | 6173/10000 [1:15:44<37:46, 1.69it/s, loss=0.0044, lr=1.04e-05, step=6173] Training: 62%|██████▏ | 6174/10000 [1:15:45<41:13, 1.55it/s, loss=0.0044, lr=1.04e-05, step=6173] Training: 62%|██████▏ | 6174/10000 [1:15:45<41:13, 1.55it/s, loss=0.0084, lr=1.04e-05, step=6174] Training: 62%|██████▏ | 6175/10000 [1:15:46<41:49, 1.52it/s, loss=0.0084, lr=1.04e-05, step=6174] Training: 62%|██████▏ | 6175/10000 [1:15:46<41:49, 1.52it/s, loss=0.0198, lr=1.04e-05, step=6175] Training: 62%|██████▏ | 6176/10000 [1:15:47<43:23, 1.47it/s, loss=0.0198, lr=1.04e-05, step=6175] Training: 62%|██████▏ | 6176/10000 [1:15:47<43:23, 1.47it/s, loss=0.0036, lr=1.04e-05, step=6176] Training: 62%|██████▏ | 6177/10000 [1:15:47<40:26, 1.58it/s, loss=0.0036, lr=1.04e-05, step=6176] Training: 62%|██████▏ | 6177/10000 [1:15:47<40:26, 1.58it/s, loss=0.0041, lr=1.04e-05, step=6177] Training: 62%|██████▏ | 6178/10000 [1:15:48<43:54, 1.45it/s, loss=0.0041, lr=1.04e-05, step=6177] Training: 62%|██████▏ | 6178/10000 [1:15:48<43:54, 1.45it/s, loss=0.0155, lr=1.04e-05, step=6178] Training: 62%|██████▏ | 6179/10000 [1:15:48<40:30, 1.57it/s, loss=0.0155, lr=1.04e-05, step=6178] Training: 62%|██████▏ | 6179/10000 [1:15:48<40:30, 1.57it/s, loss=0.0215, lr=1.04e-05, step=6179]17:21:55.882 [I] step=6180 loss=0.0080 smoothed_loss=0.0118 lr=1.04e-05 grad_norm=0.3926 step_time=0.5325s data_time=0.0946s it/s=1.595 eta_to_10000=2395.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0047 grad_action_out_proj=0.0517 grad_shared_expert=0.1398 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6180/10000 [1:15:49<38:37, 1.65it/s, loss=0.0215, lr=1.04e-05, step=6179] Training: 62%|██████▏ | 6180/10000 [1:15:49<38:37, 1.65it/s, loss=0.0080, lr=1.03e-05, step=6180] Training: 62%|██████▏ | 6181/10000 [1:15:49<36:54, 1.72it/s, loss=0.0080, lr=1.03e-05, step=6180] Training: 62%|██████▏ | 6181/10000 [1:15:49<36:54, 1.72it/s, loss=0.0016, lr=1.03e-05, step=6181] Training: 62%|██████▏ | 6182/10000 [1:15:50<35:42, 1.78it/s, loss=0.0016, lr=1.03e-05, step=6181] Training: 62%|██████▏ | 6182/10000 [1:15:50<35:42, 1.78it/s, loss=0.0094, lr=1.03e-05, step=6182] Training: 62%|██████▏ | 6183/10000 [1:15:51<39:41, 1.60it/s, loss=0.0094, lr=1.03e-05, step=6182] Training: 62%|██████▏ | 6183/10000 [1:15:51<39:41, 1.60it/s, loss=0.0030, lr=1.03e-05, step=6183] Training: 62%|██████▏ | 6184/10000 [1:15:51<37:07, 1.71it/s, loss=0.0030, lr=1.03e-05, step=6183] Training: 62%|██████▏ | 6184/10000 [1:15:51<37:07, 1.71it/s, loss=0.0048, lr=1.03e-05, step=6184] Training: 62%|██████▏ | 6185/10000 [1:15:52<43:07, 1.47it/s, loss=0.0048, lr=1.03e-05, step=6184] Training: 62%|██████▏ | 6185/10000 [1:15:52<43:07, 1.47it/s, loss=0.0038, lr=1.03e-05, step=6185] Training: 62%|██████▏ | 6186/10000 [1:15:53<40:02, 1.59it/s, loss=0.0038, lr=1.03e-05, step=6185] Training: 62%|██████▏ | 6186/10000 [1:15:53<40:02, 1.59it/s, loss=0.0146, lr=1.03e-05, step=6186] Training: 62%|██████▏ | 6187/10000 [1:15:53<38:01, 1.67it/s, loss=0.0146, lr=1.03e-05, step=6186] Training: 62%|██████▏ | 6187/10000 [1:15:53<38:01, 1.67it/s, loss=0.0036, lr=1.03e-05, step=6187] Training: 62%|██████▏ | 6188/10000 [1:15:54<37:08, 1.71it/s, loss=0.0036, lr=1.03e-05, step=6187] Training: 62%|██████▏ | 6188/10000 [1:15:54<37:08, 1.71it/s, loss=0.0047, lr=1.03e-05, step=6188] Training: 62%|██████▏ | 6189/10000 [1:15:54<36:17, 1.75it/s, loss=0.0047, lr=1.03e-05, step=6188] Training: 62%|██████▏ | 6189/10000 [1:15:54<36:17, 1.75it/s, loss=0.0016, lr=1.03e-05, step=6189]17:22:02.081 [I] step=6190 loss=0.0040 smoothed_loss=0.0073 lr=1.03e-05 grad_norm=0.4432 step_time=0.5261s data_time=0.0938s it/s=1.613 eta_to_10000=2361.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.1084 grad_shared_expert=0.9480 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6190/10000 [1:15:55<41:58, 1.51it/s, loss=0.0016, lr=1.03e-05, step=6189] Training: 62%|██████▏ | 6190/10000 [1:15:55<41:58, 1.51it/s, loss=0.0040, lr=1.03e-05, step=6190] Training: 62%|██████▏ | 6191/10000 [1:15:56<38:48, 1.64it/s, loss=0.0040, lr=1.03e-05, step=6190] Training: 62%|██████▏ | 6191/10000 [1:15:56<38:48, 1.64it/s, loss=0.0228, lr=1.03e-05, step=6191] Training: 62%|██████▏ | 6192/10000 [1:15:57<43:58, 1.44it/s, loss=0.0228, lr=1.03e-05, step=6191] Training: 62%|██████▏ | 6192/10000 [1:15:57<43:58, 1.44it/s, loss=0.0172, lr=1.03e-05, step=6192] Training: 62%|██████▏ | 6193/10000 [1:15:57<40:00, 1.59it/s, loss=0.0172, lr=1.03e-05, step=6192] Training: 62%|██████▏ | 6193/10000 [1:15:57<40:00, 1.59it/s, loss=0.0229, lr=1.03e-05, step=6193] Training: 62%|██████▏ | 6194/10000 [1:15:58<37:19, 1.70it/s, loss=0.0229, lr=1.03e-05, step=6193] Training: 62%|██████▏ | 6194/10000 [1:15:58<37:19, 1.70it/s, loss=0.0065, lr=1.03e-05, step=6194] Training: 62%|██████▏ | 6195/10000 [1:15:58<35:58, 1.76it/s, loss=0.0065, lr=1.03e-05, step=6194] Training: 62%|██████▏ | 6195/10000 [1:15:58<35:58, 1.76it/s, loss=0.0096, lr=1.03e-05, step=6195] Training: 62%|██████▏ | 6196/10000 [1:15:59<34:49, 1.82it/s, loss=0.0096, lr=1.03e-05, step=6195] Training: 62%|██████▏ | 6196/10000 [1:15:59<34:49, 1.82it/s, loss=0.0127, lr=1.03e-05, step=6196] Training: 62%|██████▏ | 6197/10000 [1:15:59<35:15, 1.80it/s, loss=0.0127, lr=1.03e-05, step=6196] Training: 62%|██████▏ | 6197/10000 [1:15:59<35:15, 1.80it/s, loss=0.0055, lr=1.03e-05, step=6197] Training: 62%|██████▏ | 6198/10000 [1:16:00<36:06, 1.76it/s, loss=0.0055, lr=1.03e-05, step=6197] Training: 62%|██████▏ | 6198/10000 [1:16:00<36:06, 1.76it/s, loss=0.0231, lr=1.03e-05, step=6198] Training: 62%|██████▏ | 6199/10000 [1:16:01<40:35, 1.56it/s, loss=0.0231, lr=1.03e-05, step=6198] Training: 62%|██████▏ | 6199/10000 [1:16:01<40:35, 1.56it/s, loss=0.0030, lr=1.03e-05, step=6199]17:22:08.489 [I] step=6200 loss=0.0081 smoothed_loss=0.0104 lr=1.03e-05 grad_norm=0.5147 step_time=0.5399s data_time=0.1008s it/s=1.561 eta_to_10000=2434.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0375 grad_action_out_proj=0.2099 grad_shared_expert=0.5923 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6200/10000 [1:16:02<48:19, 1.31it/s, loss=0.0030, lr=1.03e-05, step=6199] Training: 62%|██████▏ | 6200/10000 [1:16:02<48:19, 1.31it/s, loss=0.0081, lr=1.03e-05, step=6200] Training: 62%|██████▏ | 6201/10000 [1:16:02<50:18, 1.26it/s, loss=0.0081, lr=1.03e-05, step=6200] Training: 62%|██████▏ | 6201/10000 [1:16:02<50:18, 1.26it/s, loss=0.0026, lr=1.03e-05, step=6201] Training: 62%|██████▏ | 6202/10000 [1:16:03<44:33, 1.42it/s, loss=0.0026, lr=1.03e-05, step=6201] Training: 62%|██████▏ | 6202/10000 [1:16:03<44:33, 1.42it/s, loss=0.0067, lr=1.03e-05, step=6202] Training: 62%|██████▏ | 6203/10000 [1:16:04<44:17, 1.43it/s, loss=0.0067, lr=1.03e-05, step=6202] Training: 62%|██████▏ | 6203/10000 [1:16:04<44:17, 1.43it/s, loss=0.0014, lr=1.03e-05, step=6203] Training: 62%|██████▏ | 6204/10000 [1:16:04<44:37, 1.42it/s, loss=0.0014, lr=1.03e-05, step=6203] Training: 62%|██████▏ | 6204/10000 [1:16:04<44:37, 1.42it/s, loss=0.0070, lr=1.03e-05, step=6204] Training: 62%|██████▏ | 6205/10000 [1:16:05<40:57, 1.54it/s, loss=0.0070, lr=1.03e-05, step=6204] Training: 62%|██████▏ | 6205/10000 [1:16:05<40:57, 1.54it/s, loss=0.0107, lr=1.03e-05, step=6205] Training: 62%|██████▏ | 6206/10000 [1:16:06<42:03, 1.50it/s, loss=0.0107, lr=1.03e-05, step=6205] Training: 62%|██████▏ | 6206/10000 [1:16:06<42:03, 1.50it/s, loss=0.0040, lr=1.03e-05, step=6206] Training: 62%|██████▏ | 6207/10000 [1:16:06<44:30, 1.42it/s, loss=0.0040, lr=1.03e-05, step=6206] Training: 62%|██████▏ | 6207/10000 [1:16:06<44:30, 1.42it/s, loss=0.0041, lr=1.03e-05, step=6207] Training: 62%|██████▏ | 6208/10000 [1:16:07<45:41, 1.38it/s, loss=0.0041, lr=1.03e-05, step=6207] Training: 62%|██████▏ | 6208/10000 [1:16:07<45:41, 1.38it/s, loss=0.0100, lr=1.02e-05, step=6208] Training: 62%|██████▏ | 6209/10000 [1:16:08<45:50, 1.38it/s, loss=0.0100, lr=1.02e-05, step=6208] Training: 62%|██████▏ | 6209/10000 [1:16:08<45:50, 1.38it/s, loss=0.0163, lr=1.02e-05, step=6209]17:22:15.297 [I] step=6210 loss=0.0139 smoothed_loss=0.0093 lr=1.03e-05 grad_norm=0.4109 step_time=0.5599s data_time=0.1209s it/s=1.469 eta_to_10000=2579.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0117 grad_action_out_proj=0.0929 grad_shared_expert=0.2549 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6210/10000 [1:16:08<42:03, 1.50it/s, loss=0.0163, lr=1.02e-05, step=6209] Training: 62%|██████▏ | 6210/10000 [1:16:08<42:03, 1.50it/s, loss=0.0139, lr=1.02e-05, step=6210] Training: 62%|██████▏ | 6211/10000 [1:16:09<44:22, 1.42it/s, loss=0.0139, lr=1.02e-05, step=6210] Training: 62%|██████▏ | 6211/10000 [1:16:09<44:22, 1.42it/s, loss=0.0171, lr=1.02e-05, step=6211] Training: 62%|██████▏ | 6212/10000 [1:16:10<44:29, 1.42it/s, loss=0.0171, lr=1.02e-05, step=6211] Training: 62%|██████▏ | 6212/10000 [1:16:10<44:29, 1.42it/s, loss=0.0018, lr=1.02e-05, step=6212] Training: 62%|██████▏ | 6213/10000 [1:16:11<44:52, 1.41it/s, loss=0.0018, lr=1.02e-05, step=6212] Training: 62%|██████▏ | 6213/10000 [1:16:11<44:52, 1.41it/s, loss=0.0236, lr=1.02e-05, step=6213] Training: 62%|██████▏ | 6214/10000 [1:16:11<44:50, 1.41it/s, loss=0.0236, lr=1.02e-05, step=6213] Training: 62%|██████▏ | 6214/10000 [1:16:11<44:50, 1.41it/s, loss=0.0465, lr=1.02e-05, step=6214] Training: 62%|██████▏ | 6215/10000 [1:16:12<47:52, 1.32it/s, loss=0.0465, lr=1.02e-05, step=6214] Training: 62%|██████▏ | 6215/10000 [1:16:12<47:52, 1.32it/s, loss=0.0065, lr=1.02e-05, step=6215] Training: 62%|██████▏ | 6216/10000 [1:16:13<42:52, 1.47it/s, loss=0.0065, lr=1.02e-05, step=6215] Training: 62%|██████▏ | 6216/10000 [1:16:13<42:52, 1.47it/s, loss=0.0079, lr=1.02e-05, step=6216] Training: 62%|██████▏ | 6217/10000 [1:16:13<40:26, 1.56it/s, loss=0.0079, lr=1.02e-05, step=6216] Training: 62%|██████▏ | 6217/10000 [1:16:13<40:26, 1.56it/s, loss=0.0250, lr=1.02e-05, step=6217] Training: 62%|██████▏ | 6218/10000 [1:16:14<37:23, 1.69it/s, loss=0.0250, lr=1.02e-05, step=6217] Training: 62%|██████▏ | 6218/10000 [1:16:14<37:23, 1.69it/s, loss=0.0072, lr=1.02e-05, step=6218] Training: 62%|██████▏ | 6219/10000 [1:16:14<36:29, 1.73it/s, loss=0.0072, lr=1.02e-05, step=6218] Training: 62%|██████▏ | 6219/10000 [1:16:14<36:29, 1.73it/s, loss=0.0090, lr=1.02e-05, step=6219]17:22:21.969 [I] step=6220 loss=0.0346 smoothed_loss=0.0152 lr=1.02e-05 grad_norm=0.5165 step_time=0.5631s data_time=0.1042s it/s=1.499 eta_to_10000=2521.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0215 grad_action_out_proj=0.1220 grad_shared_expert=0.3865 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6220/10000 [1:16:15<40:31, 1.55it/s, loss=0.0090, lr=1.02e-05, step=6219] Training: 62%|██████▏ | 6220/10000 [1:16:15<40:31, 1.55it/s, loss=0.0346, lr=1.02e-05, step=6220] Training: 62%|██████▏ | 6221/10000 [1:16:16<43:00, 1.46it/s, loss=0.0346, lr=1.02e-05, step=6220] Training: 62%|██████▏ | 6221/10000 [1:16:16<43:00, 1.46it/s, loss=0.0043, lr=1.02e-05, step=6221] Training: 62%|██████▏ | 6222/10000 [1:16:17<44:02, 1.43it/s, loss=0.0043, lr=1.02e-05, step=6221] Training: 62%|██████▏ | 6222/10000 [1:16:17<44:02, 1.43it/s, loss=0.0198, lr=1.02e-05, step=6222] Training: 62%|██████▏ | 6223/10000 [1:16:17<41:39, 1.51it/s, loss=0.0198, lr=1.02e-05, step=6222] Training: 62%|██████▏ | 6223/10000 [1:16:17<41:39, 1.51it/s, loss=0.0050, lr=1.02e-05, step=6223] Training: 62%|██████▏ | 6224/10000 [1:16:18<39:03, 1.61it/s, loss=0.0050, lr=1.02e-05, step=6223] Training: 62%|██████▏ | 6224/10000 [1:16:18<39:03, 1.61it/s, loss=0.0086, lr=1.02e-05, step=6224] Training: 62%|██████▏ | 6225/10000 [1:16:18<40:29, 1.55it/s, loss=0.0086, lr=1.02e-05, step=6224] Training: 62%|██████▏ | 6225/10000 [1:16:18<40:29, 1.55it/s, loss=0.0051, lr=1.02e-05, step=6225] Training: 62%|██████▏ | 6226/10000 [1:16:19<37:28, 1.68it/s, loss=0.0051, lr=1.02e-05, step=6225] Training: 62%|██████▏ | 6226/10000 [1:16:19<37:28, 1.68it/s, loss=0.0062, lr=1.02e-05, step=6226] Training: 62%|██████▏ | 6227/10000 [1:16:19<35:44, 1.76it/s, loss=0.0062, lr=1.02e-05, step=6226] Training: 62%|██████▏ | 6227/10000 [1:16:19<35:44, 1.76it/s, loss=0.0051, lr=1.02e-05, step=6227] Training: 62%|██████▏ | 6228/10000 [1:16:20<40:55, 1.54it/s, loss=0.0051, lr=1.02e-05, step=6227] Training: 62%|██████▏ | 6228/10000 [1:16:20<40:55, 1.54it/s, loss=0.0183, lr=1.02e-05, step=6228] Training: 62%|██████▏ | 6229/10000 [1:16:21<45:17, 1.39it/s, loss=0.0183, lr=1.02e-05, step=6228] Training: 62%|██████▏ | 6229/10000 [1:16:21<45:17, 1.39it/s, loss=0.0023, lr=1.02e-05, step=6229]17:22:28.706 [I] step=6230 loss=0.0093 smoothed_loss=0.0107 lr=1.02e-05 grad_norm=0.4855 step_time=0.5617s data_time=0.1121s it/s=1.485 eta_to_10000=2539.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.0829 grad_shared_expert=0.3281 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6230/10000 [1:16:22<45:07, 1.39it/s, loss=0.0023, lr=1.02e-05, step=6229] Training: 62%|██████▏ | 6230/10000 [1:16:22<45:07, 1.39it/s, loss=0.0093, lr=1.02e-05, step=6230] Training: 62%|██████▏ | 6231/10000 [1:16:22<41:04, 1.53it/s, loss=0.0093, lr=1.02e-05, step=6230] Training: 62%|██████▏ | 6231/10000 [1:16:22<41:04, 1.53it/s, loss=0.0130, lr=1.02e-05, step=6231] Training: 62%|██████▏ | 6232/10000 [1:16:23<43:39, 1.44it/s, loss=0.0130, lr=1.02e-05, step=6231] Training: 62%|██████▏ | 6232/10000 [1:16:23<43:39, 1.44it/s, loss=0.0191, lr=1.02e-05, step=6232] Training: 62%|██████▏ | 6233/10000 [1:16:24<39:52, 1.57it/s, loss=0.0191, lr=1.02e-05, step=6232] Training: 62%|██████▏ | 6233/10000 [1:16:24<39:52, 1.57it/s, loss=0.0122, lr=1.02e-05, step=6233] Training: 62%|██████▏ | 6234/10000 [1:16:24<37:25, 1.68it/s, loss=0.0122, lr=1.02e-05, step=6233] Training: 62%|██████▏ | 6234/10000 [1:16:24<37:25, 1.68it/s, loss=0.0042, lr=1.02e-05, step=6234] Training: 62%|██████▏ | 6235/10000 [1:16:25<44:50, 1.40it/s, loss=0.0042, lr=1.02e-05, step=6234] Training: 62%|██████▏ | 6235/10000 [1:16:25<44:50, 1.40it/s, loss=0.0166, lr=1.02e-05, step=6235] Training: 62%|██████▏ | 6236/10000 [1:16:26<50:33, 1.24it/s, loss=0.0166, lr=1.02e-05, step=6235] Training: 62%|██████▏ | 6236/10000 [1:16:26<50:33, 1.24it/s, loss=0.0328, lr=1.01e-05, step=6236] Training: 62%|██████▏ | 6237/10000 [1:16:27<48:03, 1.31it/s, loss=0.0328, lr=1.01e-05, step=6236] Training: 62%|██████▏ | 6237/10000 [1:16:27<48:03, 1.31it/s, loss=0.0259, lr=1.01e-05, step=6237] Training: 62%|██████▏ | 6238/10000 [1:16:27<46:59, 1.33it/s, loss=0.0259, lr=1.01e-05, step=6237] Training: 62%|██████▏ | 6238/10000 [1:16:27<46:59, 1.33it/s, loss=0.0041, lr=1.01e-05, step=6238] Training: 62%|██████▏ | 6239/10000 [1:16:28<44:28, 1.41it/s, loss=0.0041, lr=1.01e-05, step=6238] Training: 62%|██████▏ | 6239/10000 [1:16:28<44:28, 1.41it/s, loss=0.0051, lr=1.01e-05, step=6239]17:22:35.729 [I] step=6240 loss=0.0349 smoothed_loss=0.0152 lr=1.02e-05 grad_norm=0.4794 step_time=0.5726s data_time=0.1296s it/s=1.424 eta_to_10000=2640.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0355 grad_action_out_proj=0.1813 grad_shared_expert=0.4371 (10775:train_pytorch.py:850) + Training: 62%|██████▏ | 6240/10000 [1:16:29<44:36, 1.40it/s, loss=0.0051, lr=1.01e-05, step=6239] Training: 62%|██████▏ | 6240/10000 [1:16:29<44:36, 1.40it/s, loss=0.0349, lr=1.01e-05, step=6240] Training: 62%|██████▏ | 6241/10000 [1:16:29<41:26, 1.51it/s, loss=0.0349, lr=1.01e-05, step=6240] Training: 62%|██████▏ | 6241/10000 [1:16:29<41:26, 1.51it/s, loss=0.0097, lr=1.01e-05, step=6241] Training: 62%|██████▏ | 6242/10000 [1:16:30<47:15, 1.33it/s, loss=0.0097, lr=1.01e-05, step=6241] Training: 62%|██████▏ | 6242/10000 [1:16:30<47:15, 1.33it/s, loss=0.0228, lr=1.01e-05, step=6242] Training: 62%|██████▏ | 6243/10000 [1:16:31<46:36, 1.34it/s, loss=0.0228, lr=1.01e-05, step=6242] Training: 62%|██████▏ | 6243/10000 [1:16:31<46:36, 1.34it/s, loss=0.0025, lr=1.01e-05, step=6243] Training: 62%|██████▏ | 6244/10000 [1:16:32<47:42, 1.31it/s, loss=0.0025, lr=1.01e-05, step=6243] Training: 62%|██████▏ | 6244/10000 [1:16:32<47:42, 1.31it/s, loss=0.0020, lr=1.01e-05, step=6244] Training: 62%|██████▏ | 6245/10000 [1:16:32<43:11, 1.45it/s, loss=0.0020, lr=1.01e-05, step=6244] Training: 62%|██████▏ | 6245/10000 [1:16:32<43:11, 1.45it/s, loss=0.0039, lr=1.01e-05, step=6245] Training: 62%|██████▏ | 6246/10000 [1:16:33<39:12, 1.60it/s, loss=0.0039, lr=1.01e-05, step=6245] Training: 62%|██████▏ | 6246/10000 [1:16:33<39:12, 1.60it/s, loss=0.0117, lr=1.01e-05, step=6246] Training: 62%|██████▏ | 6247/10000 [1:16:33<39:06, 1.60it/s, loss=0.0117, lr=1.01e-05, step=6246] Training: 62%|██████▏ | 6247/10000 [1:16:33<39:06, 1.60it/s, loss=0.0200, lr=1.01e-05, step=6247] Training: 62%|██████▏ | 6248/10000 [1:16:34<36:45, 1.70it/s, loss=0.0200, lr=1.01e-05, step=6247] Training: 62%|██████▏ | 6248/10000 [1:16:34<36:45, 1.70it/s, loss=0.0102, lr=1.01e-05, step=6248] Training: 62%|██████▏ | 6249/10000 [1:16:35<39:15, 1.59it/s, loss=0.0102, lr=1.01e-05, step=6248] Training: 62%|██████▏ | 6249/10000 [1:16:35<39:15, 1.59it/s, loss=0.0408, lr=1.01e-05, step=6249]17:22:42.344 [I] step=6250 loss=0.0092 smoothed_loss=0.0147 lr=1.01e-05 grad_norm=0.3989 step_time=0.5681s data_time=0.0934s it/s=1.512 eta_to_10000=2480.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.1104 grad_shared_expert=0.3645 (10775:train_pytorch.py:850) + Training: 62%|██████▎ | 6250/10000 [1:16:35<41:10, 1.52it/s, loss=0.0408, lr=1.01e-05, step=6249] Training: 62%|██████▎ | 6250/10000 [1:16:35<41:10, 1.52it/s, loss=0.0092, lr=1.01e-05, step=6250] Training: 63%|██████▎ | 6251/10000 [1:16:36<40:53, 1.53it/s, loss=0.0092, lr=1.01e-05, step=6250] Training: 63%|██████▎ | 6251/10000 [1:16:36<40:53, 1.53it/s, loss=0.0045, lr=1.01e-05, step=6251] Training: 63%|██████▎ | 6252/10000 [1:16:37<37:45, 1.65it/s, loss=0.0045, lr=1.01e-05, step=6251] Training: 63%|██████▎ | 6252/10000 [1:16:37<37:45, 1.65it/s, loss=0.0075, lr=1.01e-05, step=6252] Training: 63%|██████▎ | 6253/10000 [1:16:37<40:50, 1.53it/s, loss=0.0075, lr=1.01e-05, step=6252] Training: 63%|██████▎ | 6253/10000 [1:16:37<40:50, 1.53it/s, loss=0.0057, lr=1.01e-05, step=6253] Training: 63%|██████▎ | 6254/10000 [1:16:38<37:55, 1.65it/s, loss=0.0057, lr=1.01e-05, step=6253] Training: 63%|██████▎ | 6254/10000 [1:16:38<37:55, 1.65it/s, loss=0.0045, lr=1.01e-05, step=6254] Training: 63%|██████▎ | 6255/10000 [1:16:38<36:33, 1.71it/s, loss=0.0045, lr=1.01e-05, step=6254] Training: 63%|██████▎ | 6255/10000 [1:16:38<36:33, 1.71it/s, loss=0.0121, lr=1.01e-05, step=6255] Training: 63%|██████▎ | 6256/10000 [1:16:39<45:20, 1.38it/s, loss=0.0121, lr=1.01e-05, step=6255] Training: 63%|██████▎ | 6256/10000 [1:16:39<45:20, 1.38it/s, loss=0.0060, lr=1.01e-05, step=6256] Training: 63%|██████▎ | 6257/10000 [1:16:40<47:46, 1.31it/s, loss=0.0060, lr=1.01e-05, step=6256] Training: 63%|██████▎ | 6257/10000 [1:16:40<47:46, 1.31it/s, loss=0.0086, lr=1.01e-05, step=6257] Training: 63%|██████▎ | 6258/10000 [1:16:41<47:10, 1.32it/s, loss=0.0086, lr=1.01e-05, step=6257] Training: 63%|██████▎ | 6258/10000 [1:16:41<47:10, 1.32it/s, loss=0.0058, lr=1.01e-05, step=6258] Training: 63%|██████▎ | 6259/10000 [1:16:42<42:59, 1.45it/s, loss=0.0058, lr=1.01e-05, step=6258] Training: 63%|██████▎ | 6259/10000 [1:16:42<42:59, 1.45it/s, loss=0.0359, lr=1.01e-05, step=6259]17:22:49.268 [I] step=6260 loss=0.0033 smoothed_loss=0.0119 lr=1.01e-05 grad_norm=0.4083 step_time=0.5592s data_time=0.1332s it/s=1.445 eta_to_10000=2589.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0091 grad_action_out_proj=0.0866 grad_shared_expert=0.2380 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6260/10000 [1:16:42<45:11, 1.38it/s, loss=0.0359, lr=1.01e-05, step=6259] Training: 63%|██████▎ | 6260/10000 [1:16:42<45:11, 1.38it/s, loss=0.0033, lr=1.01e-05, step=6260] Training: 63%|██████▎ | 6261/10000 [1:16:43<46:42, 1.33it/s, loss=0.0033, lr=1.01e-05, step=6260] Training: 63%|██████▎ | 6261/10000 [1:16:43<46:42, 1.33it/s, loss=0.0037, lr=1.01e-05, step=6261] Training: 63%|██████▎ | 6262/10000 [1:16:44<45:13, 1.38it/s, loss=0.0037, lr=1.01e-05, step=6261] Training: 63%|██████▎ | 6262/10000 [1:16:44<45:13, 1.38it/s, loss=0.0034, lr=1.01e-05, step=6262] Training: 63%|██████▎ | 6263/10000 [1:16:44<41:03, 1.52it/s, loss=0.0034, lr=1.01e-05, step=6262] Training: 63%|██████▎ | 6263/10000 [1:16:44<41:03, 1.52it/s, loss=0.0079, lr=1.01e-05, step=6263] Training: 63%|██████▎ | 6264/10000 [1:16:45<42:05, 1.48it/s, loss=0.0079, lr=1.01e-05, step=6263] Training: 63%|██████▎ | 6264/10000 [1:16:45<42:05, 1.48it/s, loss=0.0034, lr=1.01e-05, step=6264] Training: 63%|██████▎ | 6265/10000 [1:16:46<43:23, 1.43it/s, loss=0.0034, lr=1.01e-05, step=6264] Training: 63%|██████▎ | 6265/10000 [1:16:46<43:23, 1.43it/s, loss=0.0100, lr=1.00e-05, step=6265] Training: 63%|██████▎ | 6266/10000 [1:16:47<46:10, 1.35it/s, loss=0.0100, lr=1.00e-05, step=6265] Training: 63%|██████▎ | 6266/10000 [1:16:47<46:10, 1.35it/s, loss=0.0063, lr=1.00e-05, step=6266] Training: 63%|██████▎ | 6267/10000 [1:16:47<42:18, 1.47it/s, loss=0.0063, lr=1.00e-05, step=6266] Training: 63%|██████▎ | 6267/10000 [1:16:47<42:18, 1.47it/s, loss=0.0035, lr=1.00e-05, step=6267] Training: 63%|██████▎ | 6268/10000 [1:16:48<44:33, 1.40it/s, loss=0.0035, lr=1.00e-05, step=6267] Training: 63%|██████▎ | 6268/10000 [1:16:48<44:33, 1.40it/s, loss=0.0120, lr=1.00e-05, step=6268] Training: 63%|██████▎ | 6269/10000 [1:16:49<41:39, 1.49it/s, loss=0.0120, lr=1.00e-05, step=6268] Training: 63%|██████▎ | 6269/10000 [1:16:49<41:39, 1.49it/s, loss=0.0053, lr=1.00e-05, step=6269]17:22:56.021 [I] step=6270 loss=0.0029 smoothed_loss=0.0080 lr=1.00e-05 grad_norm=0.3941 step_time=0.5570s data_time=0.1183s it/s=1.481 eta_to_10000=2518.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0060 grad_action_out_proj=0.0728 grad_shared_expert=0.4052 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6270/10000 [1:16:49<39:40, 1.57it/s, loss=0.0053, lr=1.00e-05, step=6269] Training: 63%|██████▎ | 6270/10000 [1:16:49<39:40, 1.57it/s, loss=0.0029, lr=1.00e-05, step=6270] Training: 63%|██████▎ | 6271/10000 [1:16:50<41:31, 1.50it/s, loss=0.0029, lr=1.00e-05, step=6270] Training: 63%|██████▎ | 6271/10000 [1:16:50<41:31, 1.50it/s, loss=0.0047, lr=1.00e-05, step=6271] Training: 63%|██████▎ | 6272/10000 [1:16:51<48:16, 1.29it/s, loss=0.0047, lr=1.00e-05, step=6271] Training: 63%|██████▎ | 6272/10000 [1:16:51<48:16, 1.29it/s, loss=0.0134, lr=1.00e-05, step=6272] Training: 63%|██████▎ | 6273/10000 [1:16:51<44:17, 1.40it/s, loss=0.0134, lr=1.00e-05, step=6272] Training: 63%|██████▎ | 6273/10000 [1:16:51<44:17, 1.40it/s, loss=0.0042, lr=1.00e-05, step=6273] Training: 63%|██████▎ | 6274/10000 [1:16:52<41:45, 1.49it/s, loss=0.0042, lr=1.00e-05, step=6273] Training: 63%|██████▎ | 6274/10000 [1:16:52<41:45, 1.49it/s, loss=0.0196, lr=1.00e-05, step=6274] Training: 63%|██████▎ | 6275/10000 [1:16:53<41:45, 1.49it/s, loss=0.0196, lr=1.00e-05, step=6274] Training: 63%|██████▎ | 6275/10000 [1:16:53<41:45, 1.49it/s, loss=0.0020, lr=1.00e-05, step=6275] Training: 63%|██████▎ | 6276/10000 [1:16:53<43:41, 1.42it/s, loss=0.0020, lr=1.00e-05, step=6275] Training: 63%|██████▎ | 6276/10000 [1:16:53<43:41, 1.42it/s, loss=0.0027, lr=1.00e-05, step=6276] Training: 63%|██████▎ | 6277/10000 [1:16:54<40:49, 1.52it/s, loss=0.0027, lr=1.00e-05, step=6276] Training: 63%|██████▎ | 6277/10000 [1:16:54<40:49, 1.52it/s, loss=0.0064, lr=1.00e-05, step=6277] Training: 63%|██████▎ | 6278/10000 [1:16:55<44:42, 1.39it/s, loss=0.0064, lr=1.00e-05, step=6277] Training: 63%|██████▎ | 6278/10000 [1:16:55<44:42, 1.39it/s, loss=0.0068, lr=1.00e-05, step=6278] Training: 63%|██████▎ | 6279/10000 [1:16:56<45:48, 1.35it/s, loss=0.0068, lr=1.00e-05, step=6278] Training: 63%|██████▎ | 6279/10000 [1:16:56<45:48, 1.35it/s, loss=0.0420, lr=1.00e-05, step=6279]17:23:03.604 [I] step=6280 loss=0.0131 smoothed_loss=0.0112 lr=1.00e-05 grad_norm=0.3984 step_time=0.6331s data_time=0.1252s it/s=1.319 eta_to_10000=2820.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0047 grad_action_out_proj=0.0637 grad_shared_expert=0.2495 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6280/10000 [1:16:57<51:07, 1.21it/s, loss=0.0420, lr=1.00e-05, step=6279] Training: 63%|██████▎ | 6280/10000 [1:16:57<51:07, 1.21it/s, loss=0.0131, lr=1.00e-05, step=6280] Training: 63%|██████▎ | 6281/10000 [1:16:58<51:44, 1.20it/s, loss=0.0131, lr=1.00e-05, step=6280] Training: 63%|██████▎ | 6281/10000 [1:16:58<51:44, 1.20it/s, loss=0.0111, lr=9.99e-06, step=6281] Training: 63%|██████▎ | 6282/10000 [1:16:58<50:25, 1.23it/s, loss=0.0111, lr=9.99e-06, step=6281] Training: 63%|██████▎ | 6282/10000 [1:16:58<50:25, 1.23it/s, loss=0.0119, lr=9.99e-06, step=6282] Training: 63%|██████▎ | 6283/10000 [1:16:59<44:32, 1.39it/s, loss=0.0119, lr=9.99e-06, step=6282] Training: 63%|██████▎ | 6283/10000 [1:16:59<44:32, 1.39it/s, loss=0.0038, lr=9.98e-06, step=6283] Training: 63%|██████▎ | 6284/10000 [1:17:00<44:53, 1.38it/s, loss=0.0038, lr=9.98e-06, step=6283] Training: 63%|██████▎ | 6284/10000 [1:17:00<44:53, 1.38it/s, loss=0.0027, lr=9.98e-06, step=6284] Training: 63%|██████▎ | 6285/10000 [1:17:00<43:43, 1.42it/s, loss=0.0027, lr=9.98e-06, step=6284] Training: 63%|██████▎ | 6285/10000 [1:17:00<43:43, 1.42it/s, loss=0.0051, lr=9.98e-06, step=6285] Training: 63%|██████▎ | 6286/10000 [1:17:01<48:22, 1.28it/s, loss=0.0051, lr=9.98e-06, step=6285] Training: 63%|██████▎ | 6286/10000 [1:17:01<48:22, 1.28it/s, loss=0.0024, lr=9.97e-06, step=6286] Training: 63%|██████▎ | 6287/10000 [1:17:02<53:11, 1.16it/s, loss=0.0024, lr=9.97e-06, step=6286] Training: 63%|██████▎ | 6287/10000 [1:17:02<53:11, 1.16it/s, loss=0.0065, lr=9.97e-06, step=6287] Training: 63%|██████▎ | 6288/10000 [1:17:03<46:14, 1.34it/s, loss=0.0065, lr=9.97e-06, step=6287] Training: 63%|██████▎ | 6288/10000 [1:17:03<46:14, 1.34it/s, loss=0.0102, lr=9.97e-06, step=6288] Training: 63%|██████▎ | 6289/10000 [1:17:04<48:52, 1.27it/s, loss=0.0102, lr=9.97e-06, step=6288] Training: 63%|██████▎ | 6289/10000 [1:17:04<48:52, 1.27it/s, loss=0.0085, lr=9.96e-06, step=6289]17:23:11.371 [I] step=6290 loss=0.0010 smoothed_loss=0.0078 lr=9.98e-06 grad_norm=0.4164 step_time=0.6119s data_time=0.1648s it/s=1.288 eta_to_10000=2881.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0616 grad_shared_expert=0.3634 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6290/10000 [1:17:04<50:20, 1.23it/s, loss=0.0085, lr=9.96e-06, step=6289] Training: 63%|██████▎ | 6290/10000 [1:17:04<50:20, 1.23it/s, loss=0.0010, lr=9.96e-06, step=6290] Training: 63%|██████▎ | 6291/10000 [1:17:05<51:18, 1.20it/s, loss=0.0010, lr=9.96e-06, step=6290] Training: 63%|██████▎ | 6291/10000 [1:17:05<51:18, 1.20it/s, loss=0.0179, lr=9.96e-06, step=6291] Training: 63%|██████▎ | 6292/10000 [1:17:06<48:00, 1.29it/s, loss=0.0179, lr=9.96e-06, step=6291] Training: 63%|██████▎ | 6292/10000 [1:17:06<48:00, 1.29it/s, loss=0.0098, lr=9.95e-06, step=6292] Training: 63%|██████▎ | 6293/10000 [1:17:07<47:10, 1.31it/s, loss=0.0098, lr=9.95e-06, step=6292] Training: 63%|██████▎ | 6293/10000 [1:17:07<47:10, 1.31it/s, loss=0.0051, lr=9.95e-06, step=6293] Training: 63%|██████▎ | 6294/10000 [1:17:07<42:25, 1.46it/s, loss=0.0051, lr=9.95e-06, step=6293] Training: 63%|██████▎ | 6294/10000 [1:17:07<42:25, 1.46it/s, loss=0.0137, lr=9.95e-06, step=6294] Training: 63%|██████▎ | 6295/10000 [1:17:08<45:09, 1.37it/s, loss=0.0137, lr=9.95e-06, step=6294] Training: 63%|██████▎ | 6295/10000 [1:17:08<45:09, 1.37it/s, loss=0.0197, lr=9.94e-06, step=6295] Training: 63%|██████▎ | 6296/10000 [1:17:09<41:09, 1.50it/s, loss=0.0197, lr=9.94e-06, step=6295] Training: 63%|██████▎ | 6296/10000 [1:17:09<41:09, 1.50it/s, loss=0.0046, lr=9.94e-06, step=6296] Training: 63%|██████▎ | 6297/10000 [1:17:09<43:19, 1.42it/s, loss=0.0046, lr=9.94e-06, step=6296] Training: 63%|██████▎ | 6297/10000 [1:17:09<43:19, 1.42it/s, loss=0.0119, lr=9.94e-06, step=6297] Training: 63%|██████▎ | 6298/10000 [1:17:10<45:19, 1.36it/s, loss=0.0119, lr=9.94e-06, step=6297] Training: 63%|██████▎ | 6298/10000 [1:17:10<45:19, 1.36it/s, loss=0.0014, lr=9.93e-06, step=6298] Training: 63%|██████▎ | 6299/10000 [1:17:11<44:38, 1.38it/s, loss=0.0014, lr=9.93e-06, step=6298] Training: 63%|██████▎ | 6299/10000 [1:17:11<44:38, 1.38it/s, loss=0.0111, lr=9.93e-06, step=6299]17:23:18.405 [I] step=6300 loss=0.0091 smoothed_loss=0.0092 lr=9.94e-06 grad_norm=0.4439 step_time=0.5702s data_time=0.1333s it/s=1.422 eta_to_10000=2602.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0070 grad_action_out_proj=0.0699 grad_shared_expert=0.2761 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6300/10000 [1:17:11<42:53, 1.44it/s, loss=0.0111, lr=9.93e-06, step=6299] Training: 63%|██████▎ | 6300/10000 [1:17:11<42:53, 1.44it/s, loss=0.0091, lr=9.93e-06, step=6300] Training: 63%|██████▎ | 6301/10000 [1:17:12<39:44, 1.55it/s, loss=0.0091, lr=9.93e-06, step=6300] Training: 63%|██████▎ | 6301/10000 [1:17:12<39:44, 1.55it/s, loss=0.0035, lr=9.92e-06, step=6301] Training: 63%|██████▎ | 6302/10000 [1:17:13<42:02, 1.47it/s, loss=0.0035, lr=9.92e-06, step=6301] Training: 63%|██████▎ | 6302/10000 [1:17:13<42:02, 1.47it/s, loss=0.0211, lr=9.92e-06, step=6302] Training: 63%|██████▎ | 6303/10000 [1:17:13<38:55, 1.58it/s, loss=0.0211, lr=9.92e-06, step=6302] Training: 63%|██████▎ | 6303/10000 [1:17:13<38:55, 1.58it/s, loss=0.0158, lr=9.91e-06, step=6303] Training: 63%|██████▎ | 6304/10000 [1:17:14<38:00, 1.62it/s, loss=0.0158, lr=9.91e-06, step=6303] Training: 63%|██████▎ | 6304/10000 [1:17:14<38:00, 1.62it/s, loss=0.0058, lr=9.91e-06, step=6304] Training: 63%|██████▎ | 6305/10000 [1:17:14<35:44, 1.72it/s, loss=0.0058, lr=9.91e-06, step=6304] Training: 63%|██████▎ | 6305/10000 [1:17:14<35:44, 1.72it/s, loss=0.0088, lr=9.91e-06, step=6305] Training: 63%|██████▎ | 6306/10000 [1:17:15<37:51, 1.63it/s, loss=0.0088, lr=9.91e-06, step=6305] Training: 63%|██████▎ | 6306/10000 [1:17:15<37:51, 1.63it/s, loss=0.0064, lr=9.90e-06, step=6306] Training: 63%|██████▎ | 6307/10000 [1:17:16<40:12, 1.53it/s, loss=0.0064, lr=9.90e-06, step=6306] Training: 63%|██████▎ | 6307/10000 [1:17:16<40:12, 1.53it/s, loss=0.0133, lr=9.90e-06, step=6307] Training: 63%|██████▎ | 6308/10000 [1:17:16<38:50, 1.58it/s, loss=0.0133, lr=9.90e-06, step=6307] Training: 63%|██████▎ | 6308/10000 [1:17:16<38:50, 1.58it/s, loss=0.0032, lr=9.90e-06, step=6308] Training: 63%|██████▎ | 6309/10000 [1:17:17<41:31, 1.48it/s, loss=0.0032, lr=9.90e-06, step=6308] Training: 63%|██████▎ | 6309/10000 [1:17:17<41:31, 1.48it/s, loss=0.0045, lr=9.89e-06, step=6309]17:23:24.833 [I] step=6310 loss=0.0078 smoothed_loss=0.0087 lr=9.91e-06 grad_norm=0.4614 step_time=0.5421s data_time=0.1007s it/s=1.556 eta_to_10000=2371.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0250 grad_action_out_proj=0.1716 grad_shared_expert=0.5629 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6310/10000 [1:17:18<42:48, 1.44it/s, loss=0.0045, lr=9.89e-06, step=6309] Training: 63%|██████▎ | 6310/10000 [1:17:18<42:48, 1.44it/s, loss=0.0078, lr=9.89e-06, step=6310] Training: 63%|██████▎ | 6311/10000 [1:17:19<43:49, 1.40it/s, loss=0.0078, lr=9.89e-06, step=6310] Training: 63%|██████▎ | 6311/10000 [1:17:19<43:49, 1.40it/s, loss=0.0053, lr=9.89e-06, step=6311] Training: 63%|██████▎ | 6312/10000 [1:17:19<40:48, 1.51it/s, loss=0.0053, lr=9.89e-06, step=6311] Training: 63%|██████▎ | 6312/10000 [1:17:19<40:48, 1.51it/s, loss=0.0046, lr=9.88e-06, step=6312] Training: 63%|██████▎ | 6313/10000 [1:17:20<37:26, 1.64it/s, loss=0.0046, lr=9.88e-06, step=6312] Training: 63%|██████▎ | 6313/10000 [1:17:20<37:26, 1.64it/s, loss=0.0015, lr=9.88e-06, step=6313] Training: 63%|██████▎ | 6314/10000 [1:17:20<38:57, 1.58it/s, loss=0.0015, lr=9.88e-06, step=6313] Training: 63%|██████▎ | 6314/10000 [1:17:20<38:57, 1.58it/s, loss=0.0087, lr=9.88e-06, step=6314] Training: 63%|██████▎ | 6315/10000 [1:17:21<42:04, 1.46it/s, loss=0.0087, lr=9.88e-06, step=6314] Training: 63%|██████▎ | 6315/10000 [1:17:21<42:04, 1.46it/s, loss=0.0135, lr=9.87e-06, step=6315] Training: 63%|██████▎ | 6316/10000 [1:17:22<47:10, 1.30it/s, loss=0.0135, lr=9.87e-06, step=6315] Training: 63%|██████▎ | 6316/10000 [1:17:22<47:10, 1.30it/s, loss=0.0093, lr=9.87e-06, step=6316] Training: 63%|██████▎ | 6317/10000 [1:17:23<42:45, 1.44it/s, loss=0.0093, lr=9.87e-06, step=6316] Training: 63%|██████▎ | 6317/10000 [1:17:23<42:45, 1.44it/s, loss=0.0047, lr=9.87e-06, step=6317] Training: 63%|██████▎ | 6318/10000 [1:17:23<42:00, 1.46it/s, loss=0.0047, lr=9.87e-06, step=6317] Training: 63%|██████▎ | 6318/10000 [1:17:23<42:00, 1.46it/s, loss=0.0025, lr=9.86e-06, step=6318] Training: 63%|██████▎ | 6319/10000 [1:17:24<39:11, 1.57it/s, loss=0.0025, lr=9.86e-06, step=6318] Training: 63%|██████▎ | 6319/10000 [1:17:24<39:11, 1.57it/s, loss=0.0022, lr=9.86e-06, step=6319]17:23:31.311 [I] step=6320 loss=0.0240 smoothed_loss=0.0085 lr=9.87e-06 grad_norm=0.4590 step_time=0.5383s data_time=0.1095s it/s=1.544 eta_to_10000=2383.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0084 grad_action_out_proj=0.0829 grad_shared_expert=0.4106 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6320/10000 [1:17:24<36:57, 1.66it/s, loss=0.0022, lr=9.86e-06, step=6319] Training: 63%|██████▎ | 6320/10000 [1:17:24<36:57, 1.66it/s, loss=0.0240, lr=9.86e-06, step=6320] Training: 63%|██████▎ | 6321/10000 [1:17:25<44:14, 1.39it/s, loss=0.0240, lr=9.86e-06, step=6320] Training: 63%|██████▎ | 6321/10000 [1:17:25<44:14, 1.39it/s, loss=0.0048, lr=9.85e-06, step=6321] Training: 63%|██████▎ | 6322/10000 [1:17:26<40:06, 1.53it/s, loss=0.0048, lr=9.85e-06, step=6321] Training: 63%|██████▎ | 6322/10000 [1:17:26<40:06, 1.53it/s, loss=0.0059, lr=9.85e-06, step=6322] Training: 63%|██████▎ | 6323/10000 [1:17:27<47:34, 1.29it/s, loss=0.0059, lr=9.85e-06, step=6322] Training: 63%|██████▎ | 6323/10000 [1:17:27<47:34, 1.29it/s, loss=0.0011, lr=9.84e-06, step=6323] Training: 63%|██████▎ | 6324/10000 [1:17:28<46:01, 1.33it/s, loss=0.0011, lr=9.84e-06, step=6323] Training: 63%|██████▎ | 6324/10000 [1:17:28<46:01, 1.33it/s, loss=0.0135, lr=9.84e-06, step=6324] Training: 63%|██████▎ | 6325/10000 [1:17:28<47:03, 1.30it/s, loss=0.0135, lr=9.84e-06, step=6324] Training: 63%|██████▎ | 6325/10000 [1:17:28<47:03, 1.30it/s, loss=0.0416, lr=9.84e-06, step=6325] Training: 63%|██████▎ | 6326/10000 [1:17:29<48:13, 1.27it/s, loss=0.0416, lr=9.84e-06, step=6325] Training: 63%|██████▎ | 6326/10000 [1:17:29<48:13, 1.27it/s, loss=0.0157, lr=9.83e-06, step=6326] Training: 63%|██████▎ | 6327/10000 [1:17:30<48:29, 1.26it/s, loss=0.0157, lr=9.83e-06, step=6326] Training: 63%|██████▎ | 6327/10000 [1:17:30<48:29, 1.26it/s, loss=0.0143, lr=9.83e-06, step=6327] Training: 63%|██████▎ | 6328/10000 [1:17:31<47:23, 1.29it/s, loss=0.0143, lr=9.83e-06, step=6327] Training: 63%|██████▎ | 6328/10000 [1:17:31<47:23, 1.29it/s, loss=0.0036, lr=9.83e-06, step=6328] Training: 63%|██████▎ | 6329/10000 [1:17:32<46:01, 1.33it/s, loss=0.0036, lr=9.83e-06, step=6328] Training: 63%|██████▎ | 6329/10000 [1:17:32<46:01, 1.33it/s, loss=0.0095, lr=9.82e-06, step=6329]17:23:39.032 [I] step=6330 loss=0.0027 smoothed_loss=0.0101 lr=9.84e-06 grad_norm=0.4704 step_time=0.6246s data_time=0.1476s it/s=1.295 eta_to_10000=2833.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0067 grad_action_out_proj=0.0711 grad_shared_expert=0.4407 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6330/10000 [1:17:32<43:06, 1.42it/s, loss=0.0095, lr=9.82e-06, step=6329] Training: 63%|██████▎ | 6330/10000 [1:17:32<43:06, 1.42it/s, loss=0.0027, lr=9.82e-06, step=6330] Training: 63%|██████▎ | 6331/10000 [1:17:33<39:00, 1.57it/s, loss=0.0027, lr=9.82e-06, step=6330] Training: 63%|██████▎ | 6331/10000 [1:17:33<39:00, 1.57it/s, loss=0.0141, lr=9.82e-06, step=6331] Training: 63%|██████▎ | 6332/10000 [1:17:33<36:18, 1.68it/s, loss=0.0141, lr=9.82e-06, step=6331] Training: 63%|██████▎ | 6332/10000 [1:17:33<36:18, 1.68it/s, loss=0.0071, lr=9.81e-06, step=6332] Training: 63%|██████▎ | 6333/10000 [1:17:34<36:20, 1.68it/s, loss=0.0071, lr=9.81e-06, step=6332] Training: 63%|██████▎ | 6333/10000 [1:17:34<36:20, 1.68it/s, loss=0.0030, lr=9.81e-06, step=6333] Training: 63%|██████▎ | 6334/10000 [1:17:34<34:21, 1.78it/s, loss=0.0030, lr=9.81e-06, step=6333] Training: 63%|██████▎ | 6334/10000 [1:17:34<34:21, 1.78it/s, loss=0.0147, lr=9.81e-06, step=6334] Training: 63%|██████▎ | 6335/10000 [1:17:35<38:04, 1.60it/s, loss=0.0147, lr=9.81e-06, step=6334] Training: 63%|██████▎ | 6335/10000 [1:17:35<38:04, 1.60it/s, loss=0.0065, lr=9.80e-06, step=6335] Training: 63%|██████▎ | 6336/10000 [1:17:36<43:40, 1.40it/s, loss=0.0065, lr=9.80e-06, step=6335] Training: 63%|██████▎ | 6336/10000 [1:17:36<43:40, 1.40it/s, loss=0.0042, lr=9.80e-06, step=6336] Training: 63%|██████▎ | 6337/10000 [1:17:36<40:29, 1.51it/s, loss=0.0042, lr=9.80e-06, step=6336] Training: 63%|██████▎ | 6337/10000 [1:17:36<40:29, 1.51it/s, loss=0.0041, lr=9.80e-06, step=6337] Training: 63%|██████▎ | 6338/10000 [1:17:37<43:01, 1.42it/s, loss=0.0041, lr=9.80e-06, step=6337] Training: 63%|██████▎ | 6338/10000 [1:17:37<43:01, 1.42it/s, loss=0.0118, lr=9.79e-06, step=6338] Training: 63%|██████▎ | 6339/10000 [1:17:38<44:56, 1.36it/s, loss=0.0118, lr=9.79e-06, step=6338] Training: 63%|██████▎ | 6339/10000 [1:17:38<44:56, 1.36it/s, loss=0.0040, lr=9.79e-06, step=6339]17:23:45.582 [I] step=6340 loss=0.0130 smoothed_loss=0.0089 lr=9.80e-06 grad_norm=0.3953 step_time=0.5525s data_time=0.1025s it/s=1.527 eta_to_10000=2396.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0064 grad_action_out_proj=0.0704 grad_shared_expert=0.2309 (10775:train_pytorch.py:850) + Training: 63%|██████▎ | 6340/10000 [1:17:39<43:14, 1.41it/s, loss=0.0040, lr=9.79e-06, step=6339] Training: 63%|██████▎ | 6340/10000 [1:17:39<43:14, 1.41it/s, loss=0.0130, lr=9.79e-06, step=6340] Training: 63%|██████▎ | 6341/10000 [1:17:39<43:51, 1.39it/s, loss=0.0130, lr=9.79e-06, step=6340] Training: 63%|██████▎ | 6341/10000 [1:17:39<43:51, 1.39it/s, loss=0.0059, lr=9.78e-06, step=6341] Training: 63%|██████▎ | 6342/10000 [1:17:40<44:53, 1.36it/s, loss=0.0059, lr=9.78e-06, step=6341] Training: 63%|██████▎ | 6342/10000 [1:17:40<44:53, 1.36it/s, loss=0.0069, lr=9.78e-06, step=6342] Training: 63%|██████▎ | 6343/10000 [1:17:41<44:56, 1.36it/s, loss=0.0069, lr=9.78e-06, step=6342] Training: 63%|██████▎ | 6343/10000 [1:17:41<44:56, 1.36it/s, loss=0.0146, lr=9.78e-06, step=6343] Training: 63%|██████▎ | 6344/10000 [1:17:42<46:27, 1.31it/s, loss=0.0146, lr=9.78e-06, step=6343] Training: 63%|██████▎ | 6344/10000 [1:17:42<46:27, 1.31it/s, loss=0.0109, lr=9.77e-06, step=6344] Training: 63%|██████▎ | 6345/10000 [1:17:42<46:18, 1.32it/s, loss=0.0109, lr=9.77e-06, step=6344] Training: 63%|██████▎ | 6345/10000 [1:17:42<46:18, 1.32it/s, loss=0.0196, lr=9.77e-06, step=6345] Training: 63%|██████▎ | 6346/10000 [1:17:43<47:25, 1.28it/s, loss=0.0196, lr=9.77e-06, step=6345] Training: 63%|██████▎ | 6346/10000 [1:17:43<47:25, 1.28it/s, loss=0.0094, lr=9.76e-06, step=6346] Training: 63%|██████▎ | 6347/10000 [1:17:44<46:40, 1.30it/s, loss=0.0094, lr=9.76e-06, step=6346] Training: 63%|██████▎ | 6347/10000 [1:17:44<46:40, 1.30it/s, loss=0.0225, lr=9.76e-06, step=6347] Training: 63%|██████▎ | 6348/10000 [1:17:45<45:43, 1.33it/s, loss=0.0225, lr=9.76e-06, step=6347] Training: 63%|██████▎ | 6348/10000 [1:17:45<45:43, 1.33it/s, loss=0.0048, lr=9.76e-06, step=6348] Training: 63%|██████▎ | 6349/10000 [1:17:45<41:17, 1.47it/s, loss=0.0048, lr=9.76e-06, step=6348] Training: 63%|██████▎ | 6349/10000 [1:17:45<41:17, 1.47it/s, loss=0.0127, lr=9.75e-06, step=6349]17:23:53.273 [I] step=6350 loss=0.0146 smoothed_loss=0.0113 lr=9.77e-06 grad_norm=0.4166 step_time=0.6297s data_time=0.1394s it/s=1.300 eta_to_10000=2806.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0168 grad_action_out_proj=0.1507 grad_shared_expert=0.4342 (10775:train_pytorch.py:850) + Training: 64%|██████▎ | 6350/10000 [1:17:46<48:25, 1.26it/s, loss=0.0127, lr=9.75e-06, step=6349] Training: 64%|██████▎ | 6350/10000 [1:17:46<48:25, 1.26it/s, loss=0.0146, lr=9.75e-06, step=6350] Training: 64%|██████▎ | 6351/10000 [1:17:47<51:47, 1.17it/s, loss=0.0146, lr=9.75e-06, step=6350] Training: 64%|██████▎ | 6351/10000 [1:17:47<51:47, 1.17it/s, loss=0.0283, lr=9.75e-06, step=6351] Training: 64%|██████▎ | 6352/10000 [1:17:48<50:47, 1.20it/s, loss=0.0283, lr=9.75e-06, step=6351] Training: 64%|██████▎ | 6352/10000 [1:17:48<50:47, 1.20it/s, loss=0.0826, lr=9.74e-06, step=6352] Training: 64%|██████▎ | 6353/10000 [1:17:49<50:45, 1.20it/s, loss=0.0826, lr=9.74e-06, step=6352] Training: 64%|██████▎ | 6353/10000 [1:17:49<50:45, 1.20it/s, loss=0.0085, lr=9.74e-06, step=6353] Training: 64%|██████▎ | 6354/10000 [1:17:50<48:41, 1.25it/s, loss=0.0085, lr=9.74e-06, step=6353] Training: 64%|██████▎ | 6354/10000 [1:17:50<48:41, 1.25it/s, loss=0.0114, lr=9.74e-06, step=6354] Training: 64%|██████▎ | 6355/10000 [1:17:50<47:45, 1.27it/s, loss=0.0114, lr=9.74e-06, step=6354] Training: 64%|██████▎ | 6355/10000 [1:17:50<47:45, 1.27it/s, loss=0.0290, lr=9.73e-06, step=6355] Training: 64%|██████▎ | 6356/10000 [1:17:51<42:53, 1.42it/s, loss=0.0290, lr=9.73e-06, step=6355] Training: 64%|██████▎ | 6356/10000 [1:17:51<42:53, 1.42it/s, loss=0.0519, lr=9.73e-06, step=6356] Training: 64%|██████▎ | 6357/10000 [1:17:52<44:20, 1.37it/s, loss=0.0519, lr=9.73e-06, step=6356] Training: 64%|██████▎ | 6357/10000 [1:17:52<44:20, 1.37it/s, loss=0.0021, lr=9.73e-06, step=6357] Training: 64%|██████▎ | 6358/10000 [1:17:52<40:31, 1.50it/s, loss=0.0021, lr=9.73e-06, step=6357] Training: 64%|██████▎ | 6358/10000 [1:17:52<40:31, 1.50it/s, loss=0.0017, lr=9.72e-06, step=6358] Training: 64%|██████▎ | 6359/10000 [1:17:53<47:39, 1.27it/s, loss=0.0017, lr=9.72e-06, step=6358] Training: 64%|██████▎ | 6359/10000 [1:17:53<47:39, 1.27it/s, loss=0.0146, lr=9.72e-06, step=6359]17:24:00.907 [I] step=6360 loss=0.0017 smoothed_loss=0.0165 lr=9.73e-06 grad_norm=0.4751 step_time=0.6191s data_time=0.1443s it/s=1.310 eta_to_10000=2778.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0153 grad_action_out_proj=0.1510 grad_shared_expert=0.4497 (10775:train_pytorch.py:850) + Training: 64%|██████▎ | 6360/10000 [1:17:54<45:21, 1.34it/s, loss=0.0146, lr=9.72e-06, step=6359] Training: 64%|██████▎ | 6360/10000 [1:17:54<45:21, 1.34it/s, loss=0.0017, lr=9.72e-06, step=6360] Training: 64%|██████▎ | 6361/10000 [1:17:55<44:16, 1.37it/s, loss=0.0017, lr=9.72e-06, step=6360] Training: 64%|██████▎ | 6361/10000 [1:17:55<44:16, 1.37it/s, loss=0.0047, lr=9.71e-06, step=6361] Training: 64%|██████▎ | 6362/10000 [1:17:55<41:10, 1.47it/s, loss=0.0047, lr=9.71e-06, step=6361] Training: 64%|██████▎ | 6362/10000 [1:17:55<41:10, 1.47it/s, loss=0.0028, lr=9.71e-06, step=6362] Training: 64%|██████▎ | 6363/10000 [1:17:56<41:30, 1.46it/s, loss=0.0028, lr=9.71e-06, step=6362] Training: 64%|██████▎ | 6363/10000 [1:17:56<41:30, 1.46it/s, loss=0.0062, lr=9.71e-06, step=6363] Training: 64%|██████▎ | 6364/10000 [1:17:57<41:58, 1.44it/s, loss=0.0062, lr=9.71e-06, step=6363] Training: 64%|██████▎ | 6364/10000 [1:17:57<41:58, 1.44it/s, loss=0.0230, lr=9.70e-06, step=6364] Training: 64%|██████▎ | 6365/10000 [1:17:57<38:31, 1.57it/s, loss=0.0230, lr=9.70e-06, step=6364] Training: 64%|██████▎ | 6365/10000 [1:17:57<38:31, 1.57it/s, loss=0.0137, lr=9.70e-06, step=6365] Training: 64%|██████▎ | 6366/10000 [1:17:58<39:45, 1.52it/s, loss=0.0137, lr=9.70e-06, step=6365] Training: 64%|██████▎ | 6366/10000 [1:17:58<39:45, 1.52it/s, loss=0.0031, lr=9.70e-06, step=6366] Training: 64%|██████▎ | 6367/10000 [1:17:59<41:41, 1.45it/s, loss=0.0031, lr=9.70e-06, step=6366] Training: 64%|██████▎ | 6367/10000 [1:17:59<41:41, 1.45it/s, loss=0.0068, lr=9.69e-06, step=6367] Training: 64%|██████▎ | 6368/10000 [1:17:59<43:46, 1.38it/s, loss=0.0068, lr=9.69e-06, step=6367] Training: 64%|██████▎ | 6368/10000 [1:17:59<43:46, 1.38it/s, loss=0.0063, lr=9.69e-06, step=6368] Training: 64%|██████▎ | 6369/10000 [1:18:00<39:32, 1.53it/s, loss=0.0063, lr=9.69e-06, step=6368] Training: 64%|██████▎ | 6369/10000 [1:18:00<39:32, 1.53it/s, loss=0.0024, lr=9.68e-06, step=6369]17:24:07.613 [I] step=6370 loss=0.0109 smoothed_loss=0.0109 lr=9.70e-06 grad_norm=0.3674 step_time=0.5563s data_time=0.1144s it/s=1.491 eta_to_10000=2433.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0074 grad_action_out_proj=0.0891 grad_shared_expert=0.3788 (10775:train_pytorch.py:850) + Training: 64%|██████▎ | 6370/10000 [1:18:01<41:52, 1.45it/s, loss=0.0024, lr=9.68e-06, step=6369] Training: 64%|██████▎ | 6370/10000 [1:18:01<41:52, 1.45it/s, loss=0.0109, lr=9.68e-06, step=6370] Training: 64%|██████▎ | 6371/10000 [1:18:02<45:16, 1.34it/s, loss=0.0109, lr=9.68e-06, step=6370] Training: 64%|██████▎ | 6371/10000 [1:18:02<45:16, 1.34it/s, loss=0.0015, lr=9.68e-06, step=6371] Training: 64%|██████▎ | 6372/10000 [1:18:02<44:49, 1.35it/s, loss=0.0015, lr=9.68e-06, step=6371] Training: 64%|██████▎ | 6372/10000 [1:18:02<44:49, 1.35it/s, loss=0.0261, lr=9.67e-06, step=6372] Training: 64%|██████▎ | 6373/10000 [1:18:03<48:39, 1.24it/s, loss=0.0261, lr=9.67e-06, step=6372] Training: 64%|██████▎ | 6373/10000 [1:18:03<48:39, 1.24it/s, loss=0.0231, lr=9.67e-06, step=6373] Training: 64%|██████▎ | 6374/10000 [1:18:04<50:00, 1.21it/s, loss=0.0231, lr=9.67e-06, step=6373] Training: 64%|██████▎ | 6374/10000 [1:18:04<50:00, 1.21it/s, loss=0.0130, lr=9.67e-06, step=6374] Training: 64%|██████▍ | 6375/10000 [1:18:05<44:00, 1.37it/s, loss=0.0130, lr=9.67e-06, step=6374] Training: 64%|██████▍ | 6375/10000 [1:18:05<44:00, 1.37it/s, loss=0.0147, lr=9.66e-06, step=6375] Training: 64%|██████▍ | 6376/10000 [1:18:05<43:32, 1.39it/s, loss=0.0147, lr=9.66e-06, step=6375] Training: 64%|██████▍ | 6376/10000 [1:18:05<43:32, 1.39it/s, loss=0.0230, lr=9.66e-06, step=6376] Training: 64%|██████▍ | 6377/10000 [1:18:06<44:23, 1.36it/s, loss=0.0230, lr=9.66e-06, step=6376] Training: 64%|██████▍ | 6377/10000 [1:18:06<44:23, 1.36it/s, loss=0.0088, lr=9.66e-06, step=6377] Training: 64%|██████▍ | 6378/10000 [1:18:07<51:25, 1.17it/s, loss=0.0088, lr=9.66e-06, step=6377] Training: 64%|██████▍ | 6378/10000 [1:18:07<51:25, 1.17it/s, loss=0.0078, lr=9.65e-06, step=6378] Training: 64%|██████▍ | 6379/10000 [1:18:08<52:35, 1.15it/s, loss=0.0078, lr=9.65e-06, step=6378] Training: 64%|██████▍ | 6379/10000 [1:18:08<52:35, 1.15it/s, loss=0.0160, lr=9.65e-06, step=6379]17:24:16.161 [I] step=6380 loss=0.0039 smoothed_loss=0.0123 lr=9.66e-06 grad_norm=0.4795 step_time=0.6489s data_time=0.2058s it/s=1.170 eta_to_10000=3093.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0061 grad_action_out_proj=0.0712 grad_shared_expert=0.2121 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6380/10000 [1:18:09<56:41, 1.06it/s, loss=0.0160, lr=9.65e-06, step=6379] Training: 64%|██████▍ | 6380/10000 [1:18:09<56:41, 1.06it/s, loss=0.0039, lr=9.65e-06, step=6380] Training: 64%|██████▍ | 6381/10000 [1:18:10<48:32, 1.24it/s, loss=0.0039, lr=9.65e-06, step=6380] Training: 64%|██████▍ | 6381/10000 [1:18:10<48:32, 1.24it/s, loss=0.0053, lr=9.64e-06, step=6381] Training: 64%|██████▍ | 6382/10000 [1:18:11<48:24, 1.25it/s, loss=0.0053, lr=9.64e-06, step=6381] Training: 64%|██████▍ | 6382/10000 [1:18:11<48:24, 1.25it/s, loss=0.0027, lr=9.64e-06, step=6382] Training: 64%|██████▍ | 6383/10000 [1:18:11<47:20, 1.27it/s, loss=0.0027, lr=9.64e-06, step=6382] Training: 64%|██████▍ | 6383/10000 [1:18:11<47:20, 1.27it/s, loss=0.0377, lr=9.64e-06, step=6383] Training: 64%|██████▍ | 6384/10000 [1:18:12<47:13, 1.28it/s, loss=0.0377, lr=9.64e-06, step=6383] Training: 64%|██████▍ | 6384/10000 [1:18:12<47:13, 1.28it/s, loss=0.0085, lr=9.63e-06, step=6384] Training: 64%|██████▍ | 6385/10000 [1:18:13<49:06, 1.23it/s, loss=0.0085, lr=9.63e-06, step=6384] Training: 64%|██████▍ | 6385/10000 [1:18:13<49:06, 1.23it/s, loss=0.0059, lr=9.63e-06, step=6385] Training: 64%|██████▍ | 6386/10000 [1:18:13<43:49, 1.37it/s, loss=0.0059, lr=9.63e-06, step=6385] Training: 64%|██████▍ | 6386/10000 [1:18:13<43:49, 1.37it/s, loss=0.0079, lr=9.63e-06, step=6386] Training: 64%|██████▍ | 6387/10000 [1:18:15<49:55, 1.21it/s, loss=0.0079, lr=9.63e-06, step=6386] Training: 64%|██████▍ | 6387/10000 [1:18:15<49:55, 1.21it/s, loss=0.0024, lr=9.62e-06, step=6387] Training: 64%|██████▍ | 6388/10000 [1:18:15<49:21, 1.22it/s, loss=0.0024, lr=9.62e-06, step=6387] Training: 64%|██████▍ | 6388/10000 [1:18:15<49:21, 1.22it/s, loss=0.0079, lr=9.62e-06, step=6388] Training: 64%|██████▍ | 6389/10000 [1:18:16<46:48, 1.29it/s, loss=0.0079, lr=9.62e-06, step=6388] Training: 64%|██████▍ | 6389/10000 [1:18:16<46:48, 1.29it/s, loss=0.0263, lr=9.62e-06, step=6389]17:24:23.482 [I] step=6390 loss=0.0237 smoothed_loss=0.0133 lr=9.63e-06 grad_norm=0.4370 step_time=0.5917s data_time=0.1404s it/s=1.366 eta_to_10000=2642.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0133 grad_action_out_proj=0.1140 grad_shared_expert=0.5281 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6390/10000 [1:18:17<42:44, 1.41it/s, loss=0.0263, lr=9.62e-06, step=6389] Training: 64%|██████▍ | 6390/10000 [1:18:17<42:44, 1.41it/s, loss=0.0237, lr=9.61e-06, step=6390] Training: 64%|██████▍ | 6391/10000 [1:18:17<42:23, 1.42it/s, loss=0.0237, lr=9.61e-06, step=6390] Training: 64%|██████▍ | 6391/10000 [1:18:17<42:23, 1.42it/s, loss=0.0012, lr=9.61e-06, step=6391] Training: 64%|██████▍ | 6392/10000 [1:18:18<42:04, 1.43it/s, loss=0.0012, lr=9.61e-06, step=6391] Training: 64%|██████▍ | 6392/10000 [1:18:18<42:04, 1.43it/s, loss=0.0078, lr=9.61e-06, step=6392] Training: 64%|██████▍ | 6393/10000 [1:18:19<42:23, 1.42it/s, loss=0.0078, lr=9.61e-06, step=6392] Training: 64%|██████▍ | 6393/10000 [1:18:19<42:23, 1.42it/s, loss=0.0018, lr=9.60e-06, step=6393] Training: 64%|██████▍ | 6394/10000 [1:18:19<42:42, 1.41it/s, loss=0.0018, lr=9.60e-06, step=6393] Training: 64%|██████▍ | 6394/10000 [1:18:19<42:42, 1.41it/s, loss=0.0133, lr=9.60e-06, step=6394] Training: 64%|██████▍ | 6395/10000 [1:18:20<42:05, 1.43it/s, loss=0.0133, lr=9.60e-06, step=6394] Training: 64%|██████▍ | 6395/10000 [1:18:20<42:05, 1.43it/s, loss=0.0050, lr=9.59e-06, step=6395] Training: 64%|██████▍ | 6396/10000 [1:18:21<43:03, 1.40it/s, loss=0.0050, lr=9.59e-06, step=6395] Training: 64%|██████▍ | 6396/10000 [1:18:21<43:03, 1.40it/s, loss=0.0083, lr=9.59e-06, step=6396] Training: 64%|██████▍ | 6397/10000 [1:18:22<43:54, 1.37it/s, loss=0.0083, lr=9.59e-06, step=6396] Training: 64%|██████▍ | 6397/10000 [1:18:22<43:54, 1.37it/s, loss=0.0112, lr=9.59e-06, step=6397] Training: 64%|██████▍ | 6398/10000 [1:18:22<40:44, 1.47it/s, loss=0.0112, lr=9.59e-06, step=6397] Training: 64%|██████▍ | 6398/10000 [1:18:22<40:44, 1.47it/s, loss=0.0121, lr=9.58e-06, step=6398] Training: 64%|██████▍ | 6399/10000 [1:18:23<40:36, 1.48it/s, loss=0.0121, lr=9.58e-06, step=6398] Training: 64%|██████▍ | 6399/10000 [1:18:23<40:36, 1.48it/s, loss=0.0423, lr=9.58e-06, step=6399]17:24:30.529 [I] step=6400 loss=0.0191 smoothed_loss=0.0141 lr=9.59e-06 grad_norm=0.4858 step_time=0.5791s data_time=0.1257s it/s=1.419 eta_to_10000=2536.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0233 grad_action_out_proj=0.1597 grad_shared_expert=0.5572 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6400/10000 [1:18:24<42:51, 1.40it/s, loss=0.0423, lr=9.58e-06, step=6399] Training: 64%|██████▍ | 6400/10000 [1:18:24<42:51, 1.40it/s, loss=0.0191, lr=9.58e-06, step=6400] Training: 64%|██████▍ | 6401/10000 [1:18:25<48:30, 1.24it/s, loss=0.0191, lr=9.58e-06, step=6400] Training: 64%|██████▍ | 6401/10000 [1:18:25<48:30, 1.24it/s, loss=0.0046, lr=9.57e-06, step=6401] Training: 64%|██████▍ | 6402/10000 [1:18:25<45:58, 1.30it/s, loss=0.0046, lr=9.57e-06, step=6401] Training: 64%|██████▍ | 6402/10000 [1:18:25<45:58, 1.30it/s, loss=0.0148, lr=9.57e-06, step=6402] Training: 64%|██████▍ | 6403/10000 [1:18:26<45:42, 1.31it/s, loss=0.0148, lr=9.57e-06, step=6402] Training: 64%|██████▍ | 6403/10000 [1:18:26<45:42, 1.31it/s, loss=0.0097, lr=9.57e-06, step=6403] Training: 64%|██████▍ | 6404/10000 [1:18:27<42:00, 1.43it/s, loss=0.0097, lr=9.57e-06, step=6403] Training: 64%|██████▍ | 6404/10000 [1:18:27<42:00, 1.43it/s, loss=0.0067, lr=9.56e-06, step=6404] Training: 64%|██████▍ | 6405/10000 [1:18:27<38:27, 1.56it/s, loss=0.0067, lr=9.56e-06, step=6404] Training: 64%|██████▍ | 6405/10000 [1:18:27<38:27, 1.56it/s, loss=0.0047, lr=9.56e-06, step=6405] Training: 64%|██████▍ | 6406/10000 [1:18:28<38:46, 1.54it/s, loss=0.0047, lr=9.56e-06, step=6405] Training: 64%|██████▍ | 6406/10000 [1:18:28<38:46, 1.54it/s, loss=0.0103, lr=9.56e-06, step=6406] Training: 64%|██████▍ | 6407/10000 [1:18:29<44:35, 1.34it/s, loss=0.0103, lr=9.56e-06, step=6406] Training: 64%|██████▍ | 6407/10000 [1:18:29<44:35, 1.34it/s, loss=0.0173, lr=9.55e-06, step=6407] Training: 64%|██████▍ | 6408/10000 [1:18:29<43:50, 1.37it/s, loss=0.0173, lr=9.55e-06, step=6407] Training: 64%|██████▍ | 6408/10000 [1:18:29<43:50, 1.37it/s, loss=0.0104, lr=9.55e-06, step=6408] Training: 64%|██████▍ | 6409/10000 [1:18:30<39:46, 1.50it/s, loss=0.0104, lr=9.55e-06, step=6408] Training: 64%|██████▍ | 6409/10000 [1:18:30<39:46, 1.50it/s, loss=0.0050, lr=9.55e-06, step=6409]17:24:37.658 [I] step=6410 loss=0.0042 smoothed_loss=0.0105 lr=9.56e-06 grad_norm=0.3693 step_time=0.5790s data_time=0.1340s it/s=1.403 eta_to_10000=2559.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0064 grad_action_out_proj=0.0775 grad_shared_expert=0.2957 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6410/10000 [1:18:31<41:45, 1.43it/s, loss=0.0050, lr=9.55e-06, step=6409] Training: 64%|██████▍ | 6410/10000 [1:18:31<41:45, 1.43it/s, loss=0.0042, lr=9.54e-06, step=6410] Training: 64%|██████▍ | 6411/10000 [1:18:31<39:00, 1.53it/s, loss=0.0042, lr=9.54e-06, step=6410] Training: 64%|██████▍ | 6411/10000 [1:18:31<39:00, 1.53it/s, loss=0.0090, lr=9.54e-06, step=6411] Training: 64%|██████▍ | 6412/10000 [1:18:32<40:47, 1.47it/s, loss=0.0090, lr=9.54e-06, step=6411] Training: 64%|██████▍ | 6412/10000 [1:18:32<40:47, 1.47it/s, loss=0.0088, lr=9.54e-06, step=6412] Training: 64%|██████▍ | 6413/10000 [1:18:33<38:57, 1.53it/s, loss=0.0088, lr=9.54e-06, step=6412] Training: 64%|██████▍ | 6413/10000 [1:18:33<38:57, 1.53it/s, loss=0.0072, lr=9.53e-06, step=6413] Training: 64%|██████▍ | 6414/10000 [1:18:33<41:26, 1.44it/s, loss=0.0072, lr=9.53e-06, step=6413] Training: 64%|██████▍ | 6414/10000 [1:18:33<41:26, 1.44it/s, loss=0.0150, lr=9.53e-06, step=6414] Training: 64%|██████▍ | 6415/10000 [1:18:34<44:30, 1.34it/s, loss=0.0150, lr=9.53e-06, step=6414] Training: 64%|██████▍ | 6415/10000 [1:18:34<44:30, 1.34it/s, loss=0.0060, lr=9.53e-06, step=6415] Training: 64%|██████▍ | 6416/10000 [1:18:35<43:32, 1.37it/s, loss=0.0060, lr=9.53e-06, step=6415] Training: 64%|██████▍ | 6416/10000 [1:18:35<43:32, 1.37it/s, loss=0.0110, lr=9.52e-06, step=6416] Training: 64%|██████▍ | 6417/10000 [1:18:36<44:07, 1.35it/s, loss=0.0110, lr=9.52e-06, step=6416] Training: 64%|██████▍ | 6417/10000 [1:18:36<44:07, 1.35it/s, loss=0.0040, lr=9.52e-06, step=6417] Training: 64%|██████▍ | 6418/10000 [1:18:36<42:19, 1.41it/s, loss=0.0040, lr=9.52e-06, step=6417] Training: 64%|██████▍ | 6418/10000 [1:18:36<42:19, 1.41it/s, loss=0.0121, lr=9.52e-06, step=6418] Training: 64%|██████▍ | 6419/10000 [1:18:37<38:52, 1.53it/s, loss=0.0121, lr=9.52e-06, step=6418] Training: 64%|██████▍ | 6419/10000 [1:18:37<38:52, 1.53it/s, loss=0.0066, lr=9.51e-06, step=6419]17:24:44.489 [I] step=6420 loss=0.0066 smoothed_loss=0.0091 lr=9.52e-06 grad_norm=0.4044 step_time=0.5613s data_time=0.1217s it/s=1.464 eta_to_10000=2444.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.0814 grad_shared_expert=0.2885 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6420/10000 [1:18:38<39:28, 1.51it/s, loss=0.0066, lr=9.51e-06, step=6419] Training: 64%|██████▍ | 6420/10000 [1:18:38<39:28, 1.51it/s, loss=0.0066, lr=9.51e-06, step=6420] Training: 64%|██████▍ | 6421/10000 [1:18:38<42:03, 1.42it/s, loss=0.0066, lr=9.51e-06, step=6420] Training: 64%|██████▍ | 6421/10000 [1:18:38<42:03, 1.42it/s, loss=0.0054, lr=9.51e-06, step=6421] Training: 64%|██████▍ | 6422/10000 [1:18:39<40:33, 1.47it/s, loss=0.0054, lr=9.51e-06, step=6421] Training: 64%|██████▍ | 6422/10000 [1:18:39<40:33, 1.47it/s, loss=0.0107, lr=9.50e-06, step=6422] Training: 64%|██████▍ | 6423/10000 [1:18:40<40:54, 1.46it/s, loss=0.0107, lr=9.50e-06, step=6422] Training: 64%|██████▍ | 6423/10000 [1:18:40<40:54, 1.46it/s, loss=0.0271, lr=9.50e-06, step=6423] Training: 64%|██████▍ | 6424/10000 [1:18:40<40:02, 1.49it/s, loss=0.0271, lr=9.50e-06, step=6423] Training: 64%|██████▍ | 6424/10000 [1:18:40<40:02, 1.49it/s, loss=0.0037, lr=9.49e-06, step=6424] Training: 64%|██████▍ | 6425/10000 [1:18:41<38:19, 1.55it/s, loss=0.0037, lr=9.49e-06, step=6424] Training: 64%|██████▍ | 6425/10000 [1:18:41<38:19, 1.55it/s, loss=0.0154, lr=9.49e-06, step=6425] Training: 64%|██████▍ | 6426/10000 [1:18:41<35:30, 1.68it/s, loss=0.0154, lr=9.49e-06, step=6425] Training: 64%|██████▍ | 6426/10000 [1:18:41<35:30, 1.68it/s, loss=0.0059, lr=9.49e-06, step=6426] Training: 64%|██████▍ | 6427/10000 [1:18:42<38:26, 1.55it/s, loss=0.0059, lr=9.49e-06, step=6426] Training: 64%|██████▍ | 6427/10000 [1:18:42<38:26, 1.55it/s, loss=0.0067, lr=9.48e-06, step=6427] Training: 64%|██████▍ | 6428/10000 [1:18:43<44:29, 1.34it/s, loss=0.0067, lr=9.48e-06, step=6427] Training: 64%|██████▍ | 6428/10000 [1:18:43<44:29, 1.34it/s, loss=0.0271, lr=9.48e-06, step=6428] Training: 64%|██████▍ | 6429/10000 [1:18:44<40:26, 1.47it/s, loss=0.0271, lr=9.48e-06, step=6428] Training: 64%|██████▍ | 6429/10000 [1:18:44<40:26, 1.47it/s, loss=0.0265, lr=9.48e-06, step=6429]17:24:51.318 [I] step=6430 loss=0.0032 smoothed_loss=0.0120 lr=9.49e-06 grad_norm=0.4763 step_time=0.5801s data_time=0.1029s it/s=1.464 eta_to_10000=2438.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0204 grad_action_out_proj=0.1297 grad_shared_expert=0.4928 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6430/10000 [1:18:44<41:23, 1.44it/s, loss=0.0265, lr=9.48e-06, step=6429] Training: 64%|██████▍ | 6430/10000 [1:18:44<41:23, 1.44it/s, loss=0.0032, lr=9.47e-06, step=6430] Training: 64%|██████▍ | 6431/10000 [1:18:45<41:50, 1.42it/s, loss=0.0032, lr=9.47e-06, step=6430] Training: 64%|██████▍ | 6431/10000 [1:18:45<41:50, 1.42it/s, loss=0.0034, lr=9.47e-06, step=6431] Training: 64%|██████▍ | 6432/10000 [1:18:46<39:56, 1.49it/s, loss=0.0034, lr=9.47e-06, step=6431] Training: 64%|██████▍ | 6432/10000 [1:18:46<39:56, 1.49it/s, loss=0.0111, lr=9.47e-06, step=6432] Training: 64%|██████▍ | 6433/10000 [1:18:46<40:10, 1.48it/s, loss=0.0111, lr=9.47e-06, step=6432] Training: 64%|██████▍ | 6433/10000 [1:18:46<40:10, 1.48it/s, loss=0.0030, lr=9.46e-06, step=6433] Training: 64%|██████▍ | 6434/10000 [1:18:47<46:21, 1.28it/s, loss=0.0030, lr=9.46e-06, step=6433] Training: 64%|██████▍ | 6434/10000 [1:18:47<46:21, 1.28it/s, loss=0.0025, lr=9.46e-06, step=6434] Training: 64%|██████▍ | 6435/10000 [1:18:48<48:30, 1.22it/s, loss=0.0025, lr=9.46e-06, step=6434] Training: 64%|██████▍ | 6435/10000 [1:18:48<48:30, 1.22it/s, loss=0.0034, lr=9.46e-06, step=6435] Training: 64%|██████▍ | 6436/10000 [1:18:50<55:27, 1.07it/s, loss=0.0034, lr=9.46e-06, step=6435] Training: 64%|██████▍ | 6436/10000 [1:18:50<55:27, 1.07it/s, loss=0.0058, lr=9.45e-06, step=6436] Training: 64%|██████▍ | 6437/10000 [1:18:50<53:47, 1.10it/s, loss=0.0058, lr=9.45e-06, step=6436] Training: 64%|██████▍ | 6437/10000 [1:18:50<53:47, 1.10it/s, loss=0.0241, lr=9.45e-06, step=6437] Training: 64%|██████▍ | 6438/10000 [1:18:51<50:38, 1.17it/s, loss=0.0241, lr=9.45e-06, step=6437] Training: 64%|██████▍ | 6438/10000 [1:18:51<50:38, 1.17it/s, loss=0.0047, lr=9.45e-06, step=6438] Training: 64%|██████▍ | 6439/10000 [1:18:52<52:11, 1.14it/s, loss=0.0047, lr=9.45e-06, step=6438] Training: 64%|██████▍ | 6439/10000 [1:18:52<52:11, 1.14it/s, loss=0.0114, lr=9.44e-06, step=6439]17:24:59.768 [I] step=6440 loss=0.0111 smoothed_loss=0.0099 lr=9.46e-06 grad_norm=0.4240 step_time=0.6527s data_time=0.1923s it/s=1.184 eta_to_10000=3007.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0718 grad_shared_expert=0.2664 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6440/10000 [1:18:53<50:48, 1.17it/s, loss=0.0114, lr=9.44e-06, step=6439] Training: 64%|██████▍ | 6440/10000 [1:18:53<50:48, 1.17it/s, loss=0.0111, lr=9.44e-06, step=6440] Training: 64%|██████▍ | 6441/10000 [1:18:53<44:47, 1.32it/s, loss=0.0111, lr=9.44e-06, step=6440] Training: 64%|██████▍ | 6441/10000 [1:18:53<44:47, 1.32it/s, loss=0.0384, lr=9.44e-06, step=6441] Training: 64%|██████▍ | 6442/10000 [1:18:54<45:04, 1.32it/s, loss=0.0384, lr=9.44e-06, step=6441] Training: 64%|██████▍ | 6442/10000 [1:18:54<45:04, 1.32it/s, loss=0.0016, lr=9.43e-06, step=6442] Training: 64%|██████▍ | 6443/10000 [1:18:55<49:29, 1.20it/s, loss=0.0016, lr=9.43e-06, step=6442] Training: 64%|██████▍ | 6443/10000 [1:18:55<49:29, 1.20it/s, loss=0.0044, lr=9.43e-06, step=6443] Training: 64%|██████▍ | 6444/10000 [1:18:56<48:31, 1.22it/s, loss=0.0044, lr=9.43e-06, step=6443] Training: 64%|██████▍ | 6444/10000 [1:18:56<48:31, 1.22it/s, loss=0.0086, lr=9.43e-06, step=6444] Training: 64%|██████▍ | 6445/10000 [1:18:57<52:46, 1.12it/s, loss=0.0086, lr=9.43e-06, step=6444] Training: 64%|██████▍ | 6445/10000 [1:18:57<52:46, 1.12it/s, loss=0.0098, lr=9.42e-06, step=6445] Training: 64%|██████▍ | 6446/10000 [1:18:58<49:27, 1.20it/s, loss=0.0098, lr=9.42e-06, step=6445] Training: 64%|██████▍ | 6446/10000 [1:18:58<49:27, 1.20it/s, loss=0.0249, lr=9.42e-06, step=6446] Training: 64%|██████▍ | 6447/10000 [1:18:58<47:45, 1.24it/s, loss=0.0249, lr=9.42e-06, step=6446] Training: 64%|██████▍ | 6447/10000 [1:18:58<47:45, 1.24it/s, loss=0.0139, lr=9.42e-06, step=6447] Training: 64%|██████▍ | 6448/10000 [1:18:59<49:10, 1.20it/s, loss=0.0139, lr=9.42e-06, step=6447] Training: 64%|██████▍ | 6448/10000 [1:18:59<49:10, 1.20it/s, loss=0.0126, lr=9.41e-06, step=6448] Training: 64%|██████▍ | 6449/10000 [1:19:00<49:34, 1.19it/s, loss=0.0126, lr=9.41e-06, step=6448] Training: 64%|██████▍ | 6449/10000 [1:19:00<49:34, 1.19it/s, loss=0.0145, lr=9.41e-06, step=6449]17:25:08.023 [I] step=6450 loss=0.0138 smoothed_loss=0.0126 lr=9.42e-06 grad_norm=0.4940 step_time=0.6408s data_time=0.1846s it/s=1.212 eta_to_10000=2929.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0051 grad_action_out_proj=0.0624 grad_shared_expert=0.2539 (10775:train_pytorch.py:850) + Training: 64%|██████▍ | 6450/10000 [1:19:01<51:09, 1.16it/s, loss=0.0145, lr=9.41e-06, step=6449] Training: 64%|██████▍ | 6450/10000 [1:19:01<51:09, 1.16it/s, loss=0.0138, lr=9.41e-06, step=6450] Training: 65%|██████▍ | 6451/10000 [1:19:02<44:53, 1.32it/s, loss=0.0138, lr=9.41e-06, step=6450] Training: 65%|██████▍ | 6451/10000 [1:19:02<44:53, 1.32it/s, loss=0.0053, lr=9.40e-06, step=6451] Training: 65%|██████▍ | 6452/10000 [1:19:02<42:21, 1.40it/s, loss=0.0053, lr=9.40e-06, step=6451] Training: 65%|██████▍ | 6452/10000 [1:19:02<42:21, 1.40it/s, loss=0.0147, lr=9.40e-06, step=6452] Training: 65%|██████▍ | 6453/10000 [1:19:03<44:44, 1.32it/s, loss=0.0147, lr=9.40e-06, step=6452] Training: 65%|██████▍ | 6453/10000 [1:19:03<44:44, 1.32it/s, loss=0.0061, lr=9.40e-06, step=6453] Training: 65%|██████▍ | 6454/10000 [1:19:04<42:40, 1.38it/s, loss=0.0061, lr=9.40e-06, step=6453] Training: 65%|██████▍ | 6454/10000 [1:19:04<42:40, 1.38it/s, loss=0.0342, lr=9.39e-06, step=6454] Training: 65%|██████▍ | 6455/10000 [1:19:04<40:40, 1.45it/s, loss=0.0342, lr=9.39e-06, step=6454] Training: 65%|██████▍ | 6455/10000 [1:19:04<40:40, 1.45it/s, loss=0.0039, lr=9.39e-06, step=6455] Training: 65%|██████▍ | 6456/10000 [1:19:05<37:21, 1.58it/s, loss=0.0039, lr=9.39e-06, step=6455] Training: 65%|██████▍ | 6456/10000 [1:19:05<37:21, 1.58it/s, loss=0.0077, lr=9.38e-06, step=6456] Training: 65%|██████▍ | 6457/10000 [1:19:05<37:28, 1.58it/s, loss=0.0077, lr=9.38e-06, step=6456] Training: 65%|██████▍ | 6457/10000 [1:19:05<37:28, 1.58it/s, loss=0.0097, lr=9.38e-06, step=6457] Training: 65%|██████▍ | 6458/10000 [1:19:07<45:17, 1.30it/s, loss=0.0097, lr=9.38e-06, step=6457] Training: 65%|██████▍ | 6458/10000 [1:19:07<45:17, 1.30it/s, loss=0.0026, lr=9.38e-06, step=6458] Training: 65%|██████▍ | 6459/10000 [1:19:07<47:34, 1.24it/s, loss=0.0026, lr=9.38e-06, step=6458] Training: 65%|██████▍ | 6459/10000 [1:19:07<47:34, 1.24it/s, loss=0.0043, lr=9.37e-06, step=6459]17:25:15.460 [I] step=6460 loss=0.0248 smoothed_loss=0.0119 lr=9.39e-06 grad_norm=0.4657 step_time=0.6013s data_time=0.1425s it/s=1.345 eta_to_10000=2632.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0093 grad_action_out_proj=0.1185 grad_shared_expert=0.4307 (10775:train_pytorch.py:850) + Training: 65%|██████▍ | 6460/10000 [1:19:09<52:36, 1.12it/s, loss=0.0043, lr=9.37e-06, step=6459] Training: 65%|██████▍ | 6460/10000 [1:19:09<52:36, 1.12it/s, loss=0.0248, lr=9.37e-06, step=6460] Training: 65%|██████▍ | 6461/10000 [1:19:09<52:00, 1.13it/s, loss=0.0248, lr=9.37e-06, step=6460] Training: 65%|██████▍ | 6461/10000 [1:19:09<52:00, 1.13it/s, loss=0.0042, lr=9.37e-06, step=6461] Training: 65%|██████▍ | 6462/10000 [1:19:10<48:04, 1.23it/s, loss=0.0042, lr=9.37e-06, step=6461] Training: 65%|██████▍ | 6462/10000 [1:19:10<48:04, 1.23it/s, loss=0.0412, lr=9.36e-06, step=6462] Training: 65%|██████▍ | 6463/10000 [1:19:11<42:44, 1.38it/s, loss=0.0412, lr=9.36e-06, step=6462] Training: 65%|██████▍ | 6463/10000 [1:19:11<42:44, 1.38it/s, loss=0.0036, lr=9.36e-06, step=6463] Training: 65%|██████▍ | 6464/10000 [1:19:11<38:46, 1.52it/s, loss=0.0036, lr=9.36e-06, step=6463] Training: 65%|██████▍ | 6464/10000 [1:19:11<38:46, 1.52it/s, loss=0.0030, lr=9.36e-06, step=6464] Training: 65%|██████▍ | 6465/10000 [1:19:12<45:21, 1.30it/s, loss=0.0030, lr=9.36e-06, step=6464] Training: 65%|██████▍ | 6465/10000 [1:19:12<45:21, 1.30it/s, loss=0.0495, lr=9.35e-06, step=6465] Training: 65%|██████▍ | 6466/10000 [1:19:13<45:56, 1.28it/s, loss=0.0495, lr=9.35e-06, step=6465] Training: 65%|██████▍ | 6466/10000 [1:19:13<45:56, 1.28it/s, loss=0.0057, lr=9.35e-06, step=6466] Training: 65%|██████▍ | 6467/10000 [1:19:14<44:15, 1.33it/s, loss=0.0057, lr=9.35e-06, step=6466] Training: 65%|██████▍ | 6467/10000 [1:19:14<44:15, 1.33it/s, loss=0.0055, lr=9.35e-06, step=6467] Training: 65%|██████▍ | 6468/10000 [1:19:14<40:37, 1.45it/s, loss=0.0055, lr=9.35e-06, step=6467] Training: 65%|██████▍ | 6468/10000 [1:19:14<40:37, 1.45it/s, loss=0.0042, lr=9.34e-06, step=6468] Training: 65%|██████▍ | 6469/10000 [1:19:15<43:00, 1.37it/s, loss=0.0042, lr=9.34e-06, step=6468] Training: 65%|██████▍ | 6469/10000 [1:19:15<43:00, 1.37it/s, loss=0.0039, lr=9.34e-06, step=6469]17:25:22.801 [I] step=6470 loss=0.0031 smoothed_loss=0.0111 lr=9.35e-06 grad_norm=0.4221 step_time=0.5835s data_time=0.1505s it/s=1.363 eta_to_10000=2590.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0080 grad_action_out_proj=0.0880 grad_shared_expert=0.2845 (10775:train_pytorch.py:850) + Training: 65%|██████▍ | 6470/10000 [1:19:16<46:13, 1.27it/s, loss=0.0039, lr=9.34e-06, step=6469] Training: 65%|██████▍ | 6470/10000 [1:19:16<46:13, 1.27it/s, loss=0.0031, lr=9.34e-06, step=6470] Training: 65%|██████▍ | 6471/10000 [1:19:17<47:57, 1.23it/s, loss=0.0031, lr=9.34e-06, step=6470] Training: 65%|██████▍ | 6471/10000 [1:19:17<47:57, 1.23it/s, loss=0.0028, lr=9.33e-06, step=6471] Training: 65%|██████▍ | 6472/10000 [1:19:18<48:42, 1.21it/s, loss=0.0028, lr=9.33e-06, step=6471] Training: 65%|██████▍ | 6472/10000 [1:19:18<48:42, 1.21it/s, loss=0.0047, lr=9.33e-06, step=6472] Training: 65%|██████▍ | 6473/10000 [1:19:18<45:49, 1.28it/s, loss=0.0047, lr=9.33e-06, step=6472] Training: 65%|██████▍ | 6473/10000 [1:19:18<45:49, 1.28it/s, loss=0.0026, lr=9.33e-06, step=6473] Training: 65%|██████▍ | 6474/10000 [1:19:19<50:36, 1.16it/s, loss=0.0026, lr=9.33e-06, step=6473] Training: 65%|██████▍ | 6474/10000 [1:19:19<50:36, 1.16it/s, loss=0.0024, lr=9.32e-06, step=6474] Training: 65%|██████▍ | 6475/10000 [1:19:20<53:02, 1.11it/s, loss=0.0024, lr=9.32e-06, step=6474] Training: 65%|██████▍ | 6475/10000 [1:19:20<53:02, 1.11it/s, loss=0.0024, lr=9.32e-06, step=6475] Training: 65%|██████▍ | 6476/10000 [1:19:21<48:41, 1.21it/s, loss=0.0024, lr=9.32e-06, step=6475] Training: 65%|██████▍ | 6476/10000 [1:19:21<48:41, 1.21it/s, loss=0.0113, lr=9.32e-06, step=6476] Training: 65%|██████▍ | 6477/10000 [1:19:22<49:19, 1.19it/s, loss=0.0113, lr=9.32e-06, step=6476] Training: 65%|██████▍ | 6477/10000 [1:19:22<49:19, 1.19it/s, loss=0.0148, lr=9.31e-06, step=6477] Training: 65%|██████▍ | 6478/10000 [1:19:23<50:48, 1.16it/s, loss=0.0148, lr=9.31e-06, step=6477] Training: 65%|██████▍ | 6478/10000 [1:19:23<50:48, 1.16it/s, loss=0.0134, lr=9.31e-06, step=6478] Training: 65%|██████▍ | 6479/10000 [1:19:24<55:30, 1.06it/s, loss=0.0134, lr=9.31e-06, step=6478] Training: 65%|██████▍ | 6479/10000 [1:19:24<55:30, 1.06it/s, loss=0.0071, lr=9.31e-06, step=6479]17:25:31.742 [I] step=6480 loss=0.0031 smoothed_loss=0.0084 lr=9.32e-06 grad_norm=0.4472 step_time=0.6779s data_time=0.2162s it/s=1.119 eta_to_10000=3146.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0040 grad_action_out_proj=0.0577 grad_shared_expert=0.1578 (10775:train_pytorch.py:850) + Training: 65%|██████▍ | 6480/10000 [1:19:25<54:39, 1.07it/s, loss=0.0071, lr=9.31e-06, step=6479] Training: 65%|██████▍ | 6480/10000 [1:19:25<54:39, 1.07it/s, loss=0.0031, lr=9.30e-06, step=6480] Training: 65%|██████▍ | 6481/10000 [1:19:26<51:25, 1.14it/s, loss=0.0031, lr=9.30e-06, step=6480] Training: 65%|██████▍ | 6481/10000 [1:19:26<51:25, 1.14it/s, loss=0.0062, lr=9.30e-06, step=6481] Training: 65%|██████▍ | 6482/10000 [1:19:26<51:51, 1.13it/s, loss=0.0062, lr=9.30e-06, step=6481] Training: 65%|██████▍ | 6482/10000 [1:19:26<51:51, 1.13it/s, loss=0.0076, lr=9.30e-06, step=6482] Training: 65%|██████▍ | 6483/10000 [1:19:27<53:02, 1.10it/s, loss=0.0076, lr=9.30e-06, step=6482] Training: 65%|██████▍ | 6483/10000 [1:19:27<53:02, 1.10it/s, loss=0.0060, lr=9.29e-06, step=6483] Training: 65%|██████▍ | 6484/10000 [1:19:28<56:01, 1.05it/s, loss=0.0060, lr=9.29e-06, step=6483] Training: 65%|██████▍ | 6484/10000 [1:19:28<56:01, 1.05it/s, loss=0.0038, lr=9.29e-06, step=6484] Training: 65%|██████▍ | 6485/10000 [1:19:29<53:12, 1.10it/s, loss=0.0038, lr=9.29e-06, step=6484] Training: 65%|██████▍ | 6485/10000 [1:19:29<53:12, 1.10it/s, loss=0.0097, lr=9.29e-06, step=6485] Training: 65%|██████▍ | 6486/10000 [1:19:30<54:05, 1.08it/s, loss=0.0097, lr=9.29e-06, step=6485] Training: 65%|██████▍ | 6486/10000 [1:19:30<54:05, 1.08it/s, loss=0.0167, lr=9.28e-06, step=6486] Training: 65%|██████▍ | 6487/10000 [1:19:31<52:26, 1.12it/s, loss=0.0167, lr=9.28e-06, step=6486] Training: 65%|██████▍ | 6487/10000 [1:19:31<52:26, 1.12it/s, loss=0.0378, lr=9.28e-06, step=6487] Training: 65%|██████▍ | 6488/10000 [1:19:32<52:25, 1.12it/s, loss=0.0378, lr=9.28e-06, step=6487] Training: 65%|██████▍ | 6488/10000 [1:19:32<52:25, 1.12it/s, loss=0.0038, lr=9.28e-06, step=6488] Training: 65%|██████▍ | 6489/10000 [1:19:33<47:21, 1.24it/s, loss=0.0038, lr=9.28e-06, step=6488] Training: 65%|██████▍ | 6489/10000 [1:19:33<47:21, 1.24it/s, loss=0.0115, lr=9.27e-06, step=6489]17:25:40.368 [I] step=6490 loss=0.0038 smoothed_loss=0.0101 lr=9.28e-06 grad_norm=0.3873 step_time=0.6611s data_time=0.2015s it/s=1.159 eta_to_10000=3027.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0036 grad_action_out_proj=0.0545 grad_shared_expert=0.2783 (10775:train_pytorch.py:850) + Training: 65%|██████▍ | 6490/10000 [1:19:33<48:11, 1.21it/s, loss=0.0115, lr=9.27e-06, step=6489] Training: 65%|██████▍ | 6490/10000 [1:19:33<48:11, 1.21it/s, loss=0.0038, lr=9.27e-06, step=6490] Training: 65%|██████▍ | 6491/10000 [1:19:34<48:35, 1.20it/s, loss=0.0038, lr=9.27e-06, step=6490] Training: 65%|██████▍ | 6491/10000 [1:19:34<48:35, 1.20it/s, loss=0.0017, lr=9.27e-06, step=6491] Training: 65%|██████▍ | 6492/10000 [1:19:35<44:53, 1.30it/s, loss=0.0017, lr=9.27e-06, step=6491] Training: 65%|██████▍ | 6492/10000 [1:19:35<44:53, 1.30it/s, loss=0.0214, lr=9.26e-06, step=6492] Training: 65%|██████▍ | 6493/10000 [1:19:36<51:17, 1.14it/s, loss=0.0214, lr=9.26e-06, step=6492] Training: 65%|██████▍ | 6493/10000 [1:19:36<51:17, 1.14it/s, loss=0.0097, lr=9.26e-06, step=6493] Training: 65%|██████▍ | 6494/10000 [1:19:37<55:56, 1.04it/s, loss=0.0097, lr=9.26e-06, step=6493] Training: 65%|██████▍ | 6494/10000 [1:19:37<55:56, 1.04it/s, loss=0.0397, lr=9.25e-06, step=6494] Training: 65%|██████▍ | 6495/10000 [1:19:38<54:07, 1.08it/s, loss=0.0397, lr=9.25e-06, step=6494] Training: 65%|██████▍ | 6495/10000 [1:19:38<54:07, 1.08it/s, loss=0.0249, lr=9.25e-06, step=6495] Training: 65%|██████▍ | 6496/10000 [1:19:39<48:22, 1.21it/s, loss=0.0249, lr=9.25e-06, step=6495] Training: 65%|██████▍ | 6496/10000 [1:19:39<48:22, 1.21it/s, loss=0.0129, lr=9.25e-06, step=6496] Training: 65%|██████▍ | 6497/10000 [1:19:39<44:33, 1.31it/s, loss=0.0129, lr=9.25e-06, step=6496] Training: 65%|██████▍ | 6497/10000 [1:19:39<44:33, 1.31it/s, loss=0.0253, lr=9.24e-06, step=6497] Training: 65%|██████▍ | 6498/10000 [1:19:40<42:31, 1.37it/s, loss=0.0253, lr=9.24e-06, step=6497] Training: 65%|██████▍ | 6498/10000 [1:19:40<42:31, 1.37it/s, loss=0.0065, lr=9.24e-06, step=6498] Training: 65%|██████▍ | 6499/10000 [1:19:40<38:27, 1.52it/s, loss=0.0065, lr=9.24e-06, step=6498] Training: 65%|██████▍ | 6499/10000 [1:19:40<38:27, 1.52it/s, loss=0.0077, lr=9.24e-06, step=6499]17:25:48.061 [I] step=6500 loss=0.0154 smoothed_loss=0.0140 lr=9.25e-06 grad_norm=0.4666 step_time=0.6104s data_time=0.1590s it/s=1.300 eta_to_10000=2692.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0109 grad_action_out_proj=0.0772 grad_shared_expert=0.5765 (10775:train_pytorch.py:850) + Training: 65%|██████▌ | 6500/10000 [1:19:41<39:50, 1.46it/s, loss=0.0077, lr=9.24e-06, step=6499] Training: 65%|██████▌ | 6500/10000 [1:19:41<39:50, 1.46it/s, loss=0.0154, lr=9.23e-06, step=6500] Training: 65%|██████▌ | 6501/10000 [1:19:42<42:38, 1.37it/s, loss=0.0154, lr=9.23e-06, step=6500] Training: 65%|██████▌ | 6501/10000 [1:19:42<42:38, 1.37it/s, loss=0.0118, lr=9.23e-06, step=6501] Training: 65%|██████▌ | 6502/10000 [1:19:43<48:19, 1.21it/s, loss=0.0118, lr=9.23e-06, step=6501] Training: 65%|██████▌ | 6502/10000 [1:19:43<48:19, 1.21it/s, loss=0.0275, lr=9.23e-06, step=6502] Training: 65%|██████▌ | 6503/10000 [1:19:44<48:08, 1.21it/s, loss=0.0275, lr=9.23e-06, step=6502] Training: 65%|██████▌ | 6503/10000 [1:19:44<48:08, 1.21it/s, loss=0.0099, lr=9.22e-06, step=6503] Training: 65%|██████▌ | 6504/10000 [1:19:45<48:17, 1.21it/s, loss=0.0099, lr=9.22e-06, step=6503] Training: 65%|██████▌ | 6504/10000 [1:19:45<48:17, 1.21it/s, loss=0.0076, lr=9.22e-06, step=6504] Training: 65%|██████▌ | 6505/10000 [1:19:46<50:12, 1.16it/s, loss=0.0076, lr=9.22e-06, step=6504] Training: 65%|██████▌ | 6505/10000 [1:19:46<50:12, 1.16it/s, loss=0.0256, lr=9.22e-06, step=6505] Training: 65%|██████▌ | 6506/10000 [1:19:47<51:59, 1.12it/s, loss=0.0256, lr=9.22e-06, step=6505] Training: 65%|██████▌ | 6506/10000 [1:19:47<51:59, 1.12it/s, loss=0.0100, lr=9.21e-06, step=6506] Training: 65%|██████▌ | 6507/10000 [1:19:48<54:34, 1.07it/s, loss=0.0100, lr=9.21e-06, step=6506] Training: 65%|██████▌ | 6507/10000 [1:19:48<54:34, 1.07it/s, loss=0.0011, lr=9.21e-06, step=6507] Training: 65%|██████▌ | 6508/10000 [1:19:49<54:37, 1.07it/s, loss=0.0011, lr=9.21e-06, step=6507] Training: 65%|██████▌ | 6508/10000 [1:19:49<54:37, 1.07it/s, loss=0.0040, lr=9.21e-06, step=6508] Training: 65%|██████▌ | 6509/10000 [1:19:49<52:05, 1.12it/s, loss=0.0040, lr=9.21e-06, step=6508] Training: 65%|██████▌ | 6509/10000 [1:19:49<52:05, 1.12it/s, loss=0.0033, lr=9.20e-06, step=6509]17:25:57.165 [I] step=6510 loss=0.0061 smoothed_loss=0.0109 lr=9.22e-06 grad_norm=0.4959 step_time=0.6846s data_time=0.2258s it/s=1.099 eta_to_10000=3176.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0168 grad_action_out_proj=0.1677 grad_shared_expert=0.4779 (10775:train_pytorch.py:850) + Training: 65%|██████▌ | 6510/10000 [1:19:50<51:35, 1.13it/s, loss=0.0033, lr=9.20e-06, step=6509] Training: 65%|██████▌ | 6510/10000 [1:19:50<51:35, 1.13it/s, loss=0.0061, lr=9.20e-06, step=6510] Training: 65%|██████▌ | 6511/10000 [1:19:51<47:43, 1.22it/s, loss=0.0061, lr=9.20e-06, step=6510] Training: 65%|██████▌ | 6511/10000 [1:19:51<47:43, 1.22it/s, loss=0.0026, lr=9.20e-06, step=6511] Training: 65%|██████▌ | 6512/10000 [1:19:52<44:11, 1.32it/s, loss=0.0026, lr=9.20e-06, step=6511] Training: 65%|██████▌ | 6512/10000 [1:19:52<44:11, 1.32it/s, loss=0.0023, lr=9.19e-06, step=6512] Training: 65%|██████▌ | 6513/10000 [1:19:52<44:12, 1.31it/s, loss=0.0023, lr=9.19e-06, step=6512] Training: 65%|██████▌ | 6513/10000 [1:19:52<44:12, 1.31it/s, loss=0.0050, lr=9.19e-06, step=6513] Training: 65%|██████▌ | 6514/10000 [1:19:54<52:33, 1.11it/s, loss=0.0050, lr=9.19e-06, step=6513] Training: 65%|██████▌ | 6514/10000 [1:19:54<52:33, 1.11it/s, loss=0.0089, lr=9.19e-06, step=6514] Training: 65%|██████▌ | 6515/10000 [1:19:54<47:18, 1.23it/s, loss=0.0089, lr=9.19e-06, step=6514] Training: 65%|██████▌ | 6515/10000 [1:19:54<47:18, 1.23it/s, loss=0.0078, lr=9.18e-06, step=6515] Training: 65%|██████▌ | 6516/10000 [1:19:55<42:25, 1.37it/s, loss=0.0078, lr=9.18e-06, step=6515] Training: 65%|██████▌ | 6516/10000 [1:19:55<42:25, 1.37it/s, loss=0.0095, lr=9.18e-06, step=6516] Training: 65%|██████▌ | 6517/10000 [1:19:55<42:12, 1.38it/s, loss=0.0095, lr=9.18e-06, step=6516] Training: 65%|██████▌ | 6517/10000 [1:19:55<42:12, 1.38it/s, loss=0.0120, lr=9.18e-06, step=6517] Training: 65%|██████▌ | 6518/10000 [1:19:56<40:01, 1.45it/s, loss=0.0120, lr=9.18e-06, step=6517] Training: 65%|██████▌ | 6518/10000 [1:19:56<40:01, 1.45it/s, loss=0.0031, lr=9.17e-06, step=6518] Training: 65%|██████▌ | 6519/10000 [1:19:57<41:27, 1.40it/s, loss=0.0031, lr=9.17e-06, step=6518] Training: 65%|██████▌ | 6519/10000 [1:19:57<41:27, 1.40it/s, loss=0.0042, lr=9.17e-06, step=6519]17:26:04.399 [I] step=6520 loss=0.0084 smoothed_loss=0.0081 lr=9.18e-06 grad_norm=0.4543 step_time=0.5943s data_time=0.1291s it/s=1.383 eta_to_10000=2517.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.1025 grad_shared_expert=0.4122 (10775:train_pytorch.py:850) + Training: 65%|██████▌ | 6520/10000 [1:19:57<41:25, 1.40it/s, loss=0.0042, lr=9.17e-06, step=6519] Training: 65%|██████▌ | 6520/10000 [1:19:57<41:25, 1.40it/s, loss=0.0084, lr=9.17e-06, step=6520] Training: 65%|██████▌ | 6521/10000 [1:19:58<42:02, 1.38it/s, loss=0.0084, lr=9.17e-06, step=6520] Training: 65%|██████▌ | 6521/10000 [1:19:58<42:02, 1.38it/s, loss=0.0026, lr=9.16e-06, step=6521] Training: 65%|██████▌ | 6522/10000 [1:19:59<45:46, 1.27it/s, loss=0.0026, lr=9.16e-06, step=6521] Training: 65%|██████▌ | 6522/10000 [1:19:59<45:46, 1.27it/s, loss=0.0034, lr=9.16e-06, step=6522] Training: 65%|██████▌ | 6523/10000 [1:20:00<41:30, 1.40it/s, loss=0.0034, lr=9.16e-06, step=6522] Training: 65%|██████▌ | 6523/10000 [1:20:00<41:30, 1.40it/s, loss=0.0121, lr=9.16e-06, step=6523] Training: 65%|██████▌ | 6524/10000 [1:20:00<40:59, 1.41it/s, loss=0.0121, lr=9.16e-06, step=6523] Training: 65%|██████▌ | 6524/10000 [1:20:00<40:59, 1.41it/s, loss=0.0436, lr=9.15e-06, step=6524] Training: 65%|██████▌ | 6525/10000 [1:20:01<37:23, 1.55it/s, loss=0.0436, lr=9.15e-06, step=6524] Training: 65%|██████▌ | 6525/10000 [1:20:01<37:23, 1.55it/s, loss=0.0075, lr=9.15e-06, step=6525] Training: 65%|██████▌ | 6526/10000 [1:20:01<34:22, 1.68it/s, loss=0.0075, lr=9.15e-06, step=6525] Training: 65%|██████▌ | 6526/10000 [1:20:01<34:22, 1.68it/s, loss=0.0178, lr=9.15e-06, step=6526] Training: 65%|██████▌ | 6527/10000 [1:20:02<37:49, 1.53it/s, loss=0.0178, lr=9.15e-06, step=6526] Training: 65%|██████▌ | 6527/10000 [1:20:02<37:49, 1.53it/s, loss=0.0010, lr=9.14e-06, step=6527] Training: 65%|██████▌ | 6528/10000 [1:20:03<35:13, 1.64it/s, loss=0.0010, lr=9.14e-06, step=6527] Training: 65%|██████▌ | 6528/10000 [1:20:03<35:13, 1.64it/s, loss=0.0284, lr=9.14e-06, step=6528] Training: 65%|██████▌ | 6529/10000 [1:20:03<37:30, 1.54it/s, loss=0.0284, lr=9.14e-06, step=6528] Training: 65%|██████▌ | 6529/10000 [1:20:03<37:30, 1.54it/s, loss=0.0076, lr=9.14e-06, step=6529]17:26:11.221 [I] step=6530 loss=0.0130 smoothed_loss=0.0119 lr=9.15e-06 grad_norm=0.6308 step_time=0.5700s data_time=0.1122s it/s=1.466 eta_to_10000=2366.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0167 grad_action_out_proj=0.1528 grad_shared_expert=2.1178 (10775:train_pytorch.py:850) + Training: 65%|██████▌ | 6530/10000 [1:20:04<41:38, 1.39it/s, loss=0.0076, lr=9.14e-06, step=6529] Training: 65%|██████▌ | 6530/10000 [1:20:04<41:38, 1.39it/s, loss=0.0130, lr=9.13e-06, step=6530] Training: 65%|██████▌ | 6531/10000 [1:20:05<43:07, 1.34it/s, loss=0.0130, lr=9.13e-06, step=6530] Training: 65%|██████▌ | 6531/10000 [1:20:05<43:07, 1.34it/s, loss=0.0102, lr=9.13e-06, step=6531] Training: 65%|██████▌ | 6532/10000 [1:20:06<46:51, 1.23it/s, loss=0.0102, lr=9.13e-06, step=6531] Training: 65%|██████▌ | 6532/10000 [1:20:06<46:51, 1.23it/s, loss=0.0097, lr=9.13e-06, step=6532] Training: 65%|██████▌ | 6533/10000 [1:20:07<47:46, 1.21it/s, loss=0.0097, lr=9.13e-06, step=6532] Training: 65%|██████▌ | 6533/10000 [1:20:07<47:46, 1.21it/s, loss=0.0138, lr=9.12e-06, step=6533] Training: 65%|██████▌ | 6534/10000 [1:20:08<47:00, 1.23it/s, loss=0.0138, lr=9.12e-06, step=6533] Training: 65%|██████▌ | 6534/10000 [1:20:08<47:00, 1.23it/s, loss=0.0080, lr=9.12e-06, step=6534] Training: 65%|██████▌ | 6535/10000 [1:20:08<45:35, 1.27it/s, loss=0.0080, lr=9.12e-06, step=6534] Training: 65%|██████▌ | 6535/10000 [1:20:08<45:35, 1.27it/s, loss=0.0125, lr=9.12e-06, step=6535] Training: 65%|██████▌ | 6536/10000 [1:20:09<48:48, 1.18it/s, loss=0.0125, lr=9.12e-06, step=6535] Training: 65%|██████▌ | 6536/10000 [1:20:09<48:48, 1.18it/s, loss=0.0066, lr=9.11e-06, step=6536] Training: 65%|██████▌ | 6537/10000 [1:20:10<47:31, 1.21it/s, loss=0.0066, lr=9.11e-06, step=6536] Training: 65%|██████▌ | 6537/10000 [1:20:10<47:31, 1.21it/s, loss=0.0129, lr=9.11e-06, step=6537] Training: 65%|██████▌ | 6538/10000 [1:20:11<42:59, 1.34it/s, loss=0.0129, lr=9.11e-06, step=6537] Training: 65%|██████▌ | 6538/10000 [1:20:11<42:59, 1.34it/s, loss=0.0052, lr=9.11e-06, step=6538] Training: 65%|██████▌ | 6539/10000 [1:20:12<46:23, 1.24it/s, loss=0.0052, lr=9.11e-06, step=6538] Training: 65%|██████▌ | 6539/10000 [1:20:12<46:23, 1.24it/s, loss=0.0032, lr=9.10e-06, step=6539]17:26:19.277 [I] step=6540 loss=0.0092 smoothed_loss=0.0098 lr=9.11e-06 grad_norm=0.4523 step_time=0.6225s data_time=0.1830s it/s=1.242 eta_to_10000=2786.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0112 grad_action_out_proj=0.0734 grad_shared_expert=0.4396 (10775:train_pytorch.py:850) + Training: 65%|██████▌ | 6540/10000 [1:20:12<43:46, 1.32it/s, loss=0.0032, lr=9.10e-06, step=6539] Training: 65%|██████▌ | 6540/10000 [1:20:12<43:46, 1.32it/s, loss=0.0092, lr=9.10e-06, step=6540] Training: 65%|██████▌ | 6541/10000 [1:20:13<41:15, 1.40it/s, loss=0.0092, lr=9.10e-06, step=6540] Training: 65%|██████▌ | 6541/10000 [1:20:13<41:15, 1.40it/s, loss=0.0045, lr=9.10e-06, step=6541] Training: 65%|██████▌ | 6542/10000 [1:20:14<41:47, 1.38it/s, loss=0.0045, lr=9.10e-06, step=6541] Training: 65%|██████▌ | 6542/10000 [1:20:14<41:47, 1.38it/s, loss=0.0026, lr=9.09e-06, step=6542] Training: 65%|██████▌ | 6543/10000 [1:20:15<45:24, 1.27it/s, loss=0.0026, lr=9.09e-06, step=6542] Training: 65%|██████▌ | 6543/10000 [1:20:15<45:24, 1.27it/s, loss=0.0379, lr=9.09e-06, step=6543] Training: 65%|██████▌ | 6544/10000 [1:20:16<47:01, 1.22it/s, loss=0.0379, lr=9.09e-06, step=6543] Training: 65%|██████▌ | 6544/10000 [1:20:16<47:01, 1.22it/s, loss=0.0103, lr=9.08e-06, step=6544] Training: 65%|██████▌ | 6545/10000 [1:20:16<47:36, 1.21it/s, loss=0.0103, lr=9.08e-06, step=6544] Training: 65%|██████▌ | 6545/10000 [1:20:16<47:36, 1.21it/s, loss=0.0062, lr=9.08e-06, step=6545] Training: 65%|██████▌ | 6546/10000 [1:20:17<50:47, 1.13it/s, loss=0.0062, lr=9.08e-06, step=6545] Training: 65%|██████▌ | 6546/10000 [1:20:17<50:47, 1.13it/s, loss=0.0048, lr=9.08e-06, step=6546] Training: 65%|██████▌ | 6547/10000 [1:20:18<45:29, 1.27it/s, loss=0.0048, lr=9.08e-06, step=6546] Training: 65%|██████▌ | 6547/10000 [1:20:18<45:29, 1.27it/s, loss=0.0014, lr=9.07e-06, step=6547] Training: 65%|██████▌ | 6548/10000 [1:20:19<44:45, 1.29it/s, loss=0.0014, lr=9.07e-06, step=6547] Training: 65%|██████▌ | 6548/10000 [1:20:19<44:45, 1.29it/s, loss=0.0054, lr=9.07e-06, step=6548] Training: 65%|██████▌ | 6549/10000 [1:20:19<39:55, 1.44it/s, loss=0.0054, lr=9.07e-06, step=6548] Training: 65%|██████▌ | 6549/10000 [1:20:19<39:55, 1.44it/s, loss=0.0147, lr=9.07e-06, step=6549]17:26:27.163 [I] step=6550 loss=0.0110 smoothed_loss=0.0097 lr=9.08e-06 grad_norm=0.5361 step_time=0.6214s data_time=0.1672s it/s=1.268 eta_to_10000=2720.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0104 grad_action_out_proj=0.1157 grad_shared_expert=0.3954 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6550/10000 [1:20:20<45:33, 1.26it/s, loss=0.0147, lr=9.07e-06, step=6549] Training: 66%|██████▌ | 6550/10000 [1:20:20<45:33, 1.26it/s, loss=0.0110, lr=9.06e-06, step=6550] Training: 66%|██████▌ | 6551/10000 [1:20:21<40:43, 1.41it/s, loss=0.0110, lr=9.06e-06, step=6550] Training: 66%|██████▌ | 6551/10000 [1:20:21<40:43, 1.41it/s, loss=0.0150, lr=9.06e-06, step=6551] Training: 66%|██████▌ | 6552/10000 [1:20:22<42:45, 1.34it/s, loss=0.0150, lr=9.06e-06, step=6551] Training: 66%|██████▌ | 6552/10000 [1:20:22<42:45, 1.34it/s, loss=0.0036, lr=9.06e-06, step=6552] Training: 66%|██████▌ | 6553/10000 [1:20:23<46:38, 1.23it/s, loss=0.0036, lr=9.06e-06, step=6552] Training: 66%|██████▌ | 6553/10000 [1:20:23<46:38, 1.23it/s, loss=0.0119, lr=9.05e-06, step=6553] Training: 66%|██████▌ | 6554/10000 [1:20:23<40:56, 1.40it/s, loss=0.0119, lr=9.05e-06, step=6553] Training: 66%|██████▌ | 6554/10000 [1:20:23<40:56, 1.40it/s, loss=0.0099, lr=9.05e-06, step=6554] Training: 66%|██████▌ | 6555/10000 [1:20:24<43:49, 1.31it/s, loss=0.0099, lr=9.05e-06, step=6554] Training: 66%|██████▌ | 6555/10000 [1:20:24<43:49, 1.31it/s, loss=0.0019, lr=9.05e-06, step=6555] Training: 66%|██████▌ | 6556/10000 [1:20:25<44:06, 1.30it/s, loss=0.0019, lr=9.05e-06, step=6555] Training: 66%|██████▌ | 6556/10000 [1:20:25<44:06, 1.30it/s, loss=0.0101, lr=9.04e-06, step=6556] Training: 66%|██████▌ | 6557/10000 [1:20:25<44:51, 1.28it/s, loss=0.0101, lr=9.04e-06, step=6556] Training: 66%|██████▌ | 6557/10000 [1:20:25<44:51, 1.28it/s, loss=0.0009, lr=9.04e-06, step=6557] Training: 66%|██████▌ | 6558/10000 [1:20:26<44:49, 1.28it/s, loss=0.0009, lr=9.04e-06, step=6557] Training: 66%|██████▌ | 6558/10000 [1:20:26<44:49, 1.28it/s, loss=0.0266, lr=9.04e-06, step=6558] Training: 66%|██████▌ | 6559/10000 [1:20:27<49:15, 1.16it/s, loss=0.0266, lr=9.04e-06, step=6558] Training: 66%|██████▌ | 6559/10000 [1:20:27<49:15, 1.16it/s, loss=0.0202, lr=9.03e-06, step=6559]17:26:35.444 [I] step=6560 loss=0.0027 smoothed_loss=0.0103 lr=9.05e-06 grad_norm=0.4760 step_time=0.6505s data_time=0.1776s it/s=1.208 eta_to_10000=2848.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0044 grad_action_out_proj=0.0548 grad_shared_expert=0.2042 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6560/10000 [1:20:29<55:01, 1.04it/s, loss=0.0202, lr=9.03e-06, step=6559] Training: 66%|██████▌ | 6560/10000 [1:20:29<55:01, 1.04it/s, loss=0.0027, lr=9.03e-06, step=6560] Training: 66%|██████▌ | 6561/10000 [1:20:29<53:44, 1.07it/s, loss=0.0027, lr=9.03e-06, step=6560] Training: 66%|██████▌ | 6561/10000 [1:20:29<53:44, 1.07it/s, loss=0.2549, lr=9.03e-06, step=6561] Training: 66%|██████▌ | 6562/10000 [1:20:30<53:31, 1.07it/s, loss=0.2549, lr=9.03e-06, step=6561] Training: 66%|██████▌ | 6562/10000 [1:20:30<53:31, 1.07it/s, loss=0.0159, lr=9.02e-06, step=6562] Training: 66%|██████▌ | 6563/10000 [1:20:31<51:29, 1.11it/s, loss=0.0159, lr=9.02e-06, step=6562] Training: 66%|██████▌ | 6563/10000 [1:20:31<51:29, 1.11it/s, loss=0.0040, lr=9.02e-06, step=6563] Training: 66%|██████▌ | 6564/10000 [1:20:32<48:23, 1.18it/s, loss=0.0040, lr=9.02e-06, step=6563] Training: 66%|██████▌ | 6564/10000 [1:20:32<48:23, 1.18it/s, loss=0.0149, lr=9.02e-06, step=6564] Training: 66%|██████▌ | 6565/10000 [1:20:33<50:27, 1.13it/s, loss=0.0149, lr=9.02e-06, step=6564] Training: 66%|██████▌ | 6565/10000 [1:20:33<50:27, 1.13it/s, loss=0.0132, lr=9.01e-06, step=6565] Training: 66%|██████▌ | 6566/10000 [1:20:34<52:16, 1.09it/s, loss=0.0132, lr=9.01e-06, step=6565] Training: 66%|██████▌ | 6566/10000 [1:20:34<52:16, 1.09it/s, loss=0.0033, lr=9.01e-06, step=6566] Training: 66%|██████▌ | 6567/10000 [1:20:35<56:59, 1.00it/s, loss=0.0033, lr=9.01e-06, step=6566] Training: 66%|██████▌ | 6567/10000 [1:20:35<56:59, 1.00it/s, loss=0.0117, lr=9.01e-06, step=6567] Training: 66%|██████▌ | 6568/10000 [1:20:36<57:53, 1.01s/it, loss=0.0117, lr=9.01e-06, step=6567] Training: 66%|██████▌ | 6568/10000 [1:20:36<57:53, 1.01s/it, loss=0.0058, lr=9.00e-06, step=6568] Training: 66%|██████▌ | 6569/10000 [1:20:37<53:04, 1.08it/s, loss=0.0058, lr=9.00e-06, step=6568] Training: 66%|██████▌ | 6569/10000 [1:20:37<53:04, 1.08it/s, loss=0.0306, lr=9.00e-06, step=6569]17:26:44.717 [I] step=6570 loss=0.0386 smoothed_loss=0.0241 lr=9.01e-06 grad_norm=0.4878 step_time=0.6882s data_time=0.2391s it/s=1.079 eta_to_10000=3180.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0237 grad_action_out_proj=0.1588 grad_shared_expert=0.5311 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6570/10000 [1:20:38<54:17, 1.05it/s, loss=0.0306, lr=9.00e-06, step=6569] Training: 66%|██████▌ | 6570/10000 [1:20:38<54:17, 1.05it/s, loss=0.0386, lr=9.00e-06, step=6570] Training: 66%|██████▌ | 6571/10000 [1:20:38<49:27, 1.16it/s, loss=0.0386, lr=9.00e-06, step=6570] Training: 66%|██████▌ | 6571/10000 [1:20:38<49:27, 1.16it/s, loss=0.0284, lr=8.99e-06, step=6571] Training: 66%|██████▌ | 6572/10000 [1:20:40<56:12, 1.02it/s, loss=0.0284, lr=8.99e-06, step=6571] Training: 66%|██████▌ | 6572/10000 [1:20:40<56:12, 1.02it/s, loss=0.0337, lr=8.99e-06, step=6572] Training: 66%|██████▌ | 6573/10000 [1:20:41<57:18, 1.00s/it, loss=0.0337, lr=8.99e-06, step=6572] Training: 66%|██████▌ | 6573/10000 [1:20:41<57:18, 1.00s/it, loss=0.0088, lr=8.99e-06, step=6573] Training: 66%|██████▌ | 6574/10000 [1:20:42<1:01:01, 1.07s/it, loss=0.0088, lr=8.99e-06, step=6573] Training: 66%|██████▌ | 6574/10000 [1:20:42<1:01:01, 1.07s/it, loss=0.0034, lr=8.98e-06, step=6574] Training: 66%|██████▌ | 6575/10000 [1:20:43<1:00:15, 1.06s/it, loss=0.0034, lr=8.98e-06, step=6574] Training: 66%|██████▌ | 6575/10000 [1:20:43<1:00:15, 1.06s/it, loss=0.0068, lr=8.98e-06, step=6575] Training: 66%|██████▌ | 6576/10000 [1:20:44<56:07, 1.02it/s, loss=0.0068, lr=8.98e-06, step=6575] Training: 66%|██████▌ | 6576/10000 [1:20:44<56:07, 1.02it/s, loss=0.0148, lr=8.98e-06, step=6576] Training: 66%|██████▌ | 6577/10000 [1:20:45<54:04, 1.06it/s, loss=0.0148, lr=8.98e-06, step=6576] Training: 66%|██████▌ | 6577/10000 [1:20:45<54:04, 1.06it/s, loss=0.0155, lr=8.97e-06, step=6577] Training: 66%|██████▌ | 6578/10000 [1:20:46<51:57, 1.10it/s, loss=0.0155, lr=8.97e-06, step=6577] Training: 66%|██████▌ | 6578/10000 [1:20:46<51:57, 1.10it/s, loss=0.0017, lr=8.97e-06, step=6578] Training: 66%|██████▌ | 6579/10000 [1:20:47<55:04, 1.04it/s, loss=0.0017, lr=8.97e-06, step=6578] Training: 66%|██████▌ | 6579/10000 [1:20:47<55:04, 1.04it/s, loss=0.0200, lr=8.97e-06, step=6579]17:26:54.074 [I] step=6580 loss=0.0061 smoothed_loss=0.0166 lr=8.98e-06 grad_norm=0.5750 step_time=0.7137s data_time=0.2220s it/s=1.069 eta_to_10000=3199.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0082 grad_action_out_proj=0.1024 grad_shared_expert=0.3656 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6580/10000 [1:20:47<47:39, 1.20it/s, loss=0.0200, lr=8.97e-06, step=6579] Training: 66%|██████▌ | 6580/10000 [1:20:47<47:39, 1.20it/s, loss=0.0061, lr=8.96e-06, step=6580] Training: 66%|██████▌ | 6581/10000 [1:20:48<47:07, 1.21it/s, loss=0.0061, lr=8.96e-06, step=6580] Training: 66%|██████▌ | 6581/10000 [1:20:48<47:07, 1.21it/s, loss=0.0017, lr=8.96e-06, step=6581] Training: 66%|██████▌ | 6582/10000 [1:20:49<47:46, 1.19it/s, loss=0.0017, lr=8.96e-06, step=6581] Training: 66%|██████▌ | 6582/10000 [1:20:49<47:46, 1.19it/s, loss=0.0110, lr=8.96e-06, step=6582] Training: 66%|██████▌ | 6583/10000 [1:20:49<42:23, 1.34it/s, loss=0.0110, lr=8.96e-06, step=6582] Training: 66%|██████▌ | 6583/10000 [1:20:49<42:23, 1.34it/s, loss=0.0047, lr=8.95e-06, step=6583] Training: 66%|██████▌ | 6584/10000 [1:20:50<42:41, 1.33it/s, loss=0.0047, lr=8.95e-06, step=6583] Training: 66%|██████▌ | 6584/10000 [1:20:50<42:41, 1.33it/s, loss=0.0018, lr=8.95e-06, step=6584] Training: 66%|██████▌ | 6585/10000 [1:20:51<42:37, 1.34it/s, loss=0.0018, lr=8.95e-06, step=6584] Training: 66%|██████▌ | 6585/10000 [1:20:51<42:37, 1.34it/s, loss=0.0027, lr=8.95e-06, step=6585] Training: 66%|██████▌ | 6586/10000 [1:20:52<44:53, 1.27it/s, loss=0.0027, lr=8.95e-06, step=6585] Training: 66%|██████▌ | 6586/10000 [1:20:52<44:53, 1.27it/s, loss=0.0076, lr=8.94e-06, step=6586] Training: 66%|██████▌ | 6587/10000 [1:20:52<43:05, 1.32it/s, loss=0.0076, lr=8.94e-06, step=6586] Training: 66%|██████▌ | 6587/10000 [1:20:52<43:05, 1.32it/s, loss=0.0015, lr=8.94e-06, step=6587] Training: 66%|██████▌ | 6588/10000 [1:20:53<40:52, 1.39it/s, loss=0.0015, lr=8.94e-06, step=6587] Training: 66%|██████▌ | 6588/10000 [1:20:53<40:52, 1.39it/s, loss=0.0053, lr=8.94e-06, step=6588] Training: 66%|██████▌ | 6589/10000 [1:20:54<37:11, 1.53it/s, loss=0.0053, lr=8.94e-06, step=6588] Training: 66%|██████▌ | 6589/10000 [1:20:54<37:11, 1.53it/s, loss=0.0110, lr=8.93e-06, step=6589]17:27:01.366 [I] step=6590 loss=0.0177 smoothed_loss=0.0106 lr=8.94e-06 grad_norm=0.4455 step_time=0.6019s data_time=0.1273s it/s=1.371 eta_to_10000=2486.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0132 grad_action_out_proj=0.1402 grad_shared_expert=0.5067 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6590/10000 [1:20:54<41:10, 1.38it/s, loss=0.0110, lr=8.93e-06, step=6589] Training: 66%|██████▌ | 6590/10000 [1:20:54<41:10, 1.38it/s, loss=0.0177, lr=8.93e-06, step=6590] Training: 66%|██████▌ | 6591/10000 [1:20:55<38:17, 1.48it/s, loss=0.0177, lr=8.93e-06, step=6590] Training: 66%|██████▌ | 6591/10000 [1:20:55<38:17, 1.48it/s, loss=0.0039, lr=8.93e-06, step=6591] Training: 66%|██████▌ | 6592/10000 [1:20:56<41:32, 1.37it/s, loss=0.0039, lr=8.93e-06, step=6591] Training: 66%|██████▌ | 6592/10000 [1:20:56<41:32, 1.37it/s, loss=0.0055, lr=8.92e-06, step=6592] Training: 66%|██████▌ | 6593/10000 [1:20:57<43:24, 1.31it/s, loss=0.0055, lr=8.92e-06, step=6592] Training: 66%|██████▌ | 6593/10000 [1:20:57<43:24, 1.31it/s, loss=0.0119, lr=8.92e-06, step=6593] Training: 66%|██████▌ | 6594/10000 [1:20:57<42:50, 1.33it/s, loss=0.0119, lr=8.92e-06, step=6593] Training: 66%|██████▌ | 6594/10000 [1:20:57<42:50, 1.33it/s, loss=0.0031, lr=8.92e-06, step=6594] Training: 66%|██████▌ | 6595/10000 [1:20:58<41:16, 1.37it/s, loss=0.0031, lr=8.92e-06, step=6594] Training: 66%|██████▌ | 6595/10000 [1:20:58<41:16, 1.37it/s, loss=0.0125, lr=8.91e-06, step=6595] Training: 66%|██████▌ | 6596/10000 [1:20:59<38:39, 1.47it/s, loss=0.0125, lr=8.91e-06, step=6595] Training: 66%|██████▌ | 6596/10000 [1:20:59<38:39, 1.47it/s, loss=0.0099, lr=8.91e-06, step=6596] Training: 66%|██████▌ | 6597/10000 [1:20:59<38:11, 1.49it/s, loss=0.0099, lr=8.91e-06, step=6596] Training: 66%|██████▌ | 6597/10000 [1:20:59<38:11, 1.49it/s, loss=0.0101, lr=8.91e-06, step=6597] Training: 66%|██████▌ | 6598/10000 [1:21:00<38:48, 1.46it/s, loss=0.0101, lr=8.91e-06, step=6597] Training: 66%|██████▌ | 6598/10000 [1:21:00<38:48, 1.46it/s, loss=0.0084, lr=8.90e-06, step=6598] Training: 66%|██████▌ | 6599/10000 [1:21:01<39:28, 1.44it/s, loss=0.0084, lr=8.90e-06, step=6598] Training: 66%|██████▌ | 6599/10000 [1:21:01<39:28, 1.44it/s, loss=0.1242, lr=8.90e-06, step=6599]17:27:08.736 [I] step=6600 loss=0.0060 smoothed_loss=0.0194 lr=8.91e-06 grad_norm=0.4360 step_time=0.6098s data_time=0.1272s it/s=1.357 eta_to_10000=2505.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0045 grad_action_out_proj=0.0606 grad_shared_expert=0.2424 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6600/10000 [1:21:02<45:27, 1.25it/s, loss=0.1242, lr=8.90e-06, step=6599] Training: 66%|██████▌ | 6600/10000 [1:21:02<45:27, 1.25it/s, loss=0.0060, lr=8.90e-06, step=6600] Training: 66%|██████▌ | 6601/10000 [1:21:03<45:43, 1.24it/s, loss=0.0060, lr=8.90e-06, step=6600] Training: 66%|██████▌ | 6601/10000 [1:21:03<45:43, 1.24it/s, loss=0.0464, lr=8.89e-06, step=6601] Training: 66%|██████▌ | 6602/10000 [1:21:04<47:30, 1.19it/s, loss=0.0464, lr=8.89e-06, step=6601] Training: 66%|██████▌ | 6602/10000 [1:21:04<47:30, 1.19it/s, loss=0.0045, lr=8.89e-06, step=6602] Training: 66%|██████▌ | 6603/10000 [1:21:04<48:49, 1.16it/s, loss=0.0045, lr=8.89e-06, step=6602] Training: 66%|██████▌ | 6603/10000 [1:21:04<48:49, 1.16it/s, loss=0.0016, lr=8.89e-06, step=6603] Training: 66%|██████▌ | 6604/10000 [1:21:05<47:40, 1.19it/s, loss=0.0016, lr=8.89e-06, step=6603] Training: 66%|██████▌ | 6604/10000 [1:21:05<47:40, 1.19it/s, loss=0.0126, lr=8.88e-06, step=6604] Training: 66%|██████▌ | 6605/10000 [1:21:06<49:42, 1.14it/s, loss=0.0126, lr=8.88e-06, step=6604] Training: 66%|██████▌ | 6605/10000 [1:21:06<49:42, 1.14it/s, loss=0.0067, lr=8.88e-06, step=6605] Training: 66%|██████▌ | 6606/10000 [1:21:07<48:05, 1.18it/s, loss=0.0067, lr=8.88e-06, step=6605] Training: 66%|██████▌ | 6606/10000 [1:21:07<48:05, 1.18it/s, loss=0.0142, lr=8.88e-06, step=6606] Training: 66%|██████▌ | 6607/10000 [1:21:08<48:00, 1.18it/s, loss=0.0142, lr=8.88e-06, step=6606] Training: 66%|██████▌ | 6607/10000 [1:21:08<48:00, 1.18it/s, loss=0.0094, lr=8.87e-06, step=6607] Training: 66%|██████▌ | 6608/10000 [1:21:09<53:05, 1.06it/s, loss=0.0094, lr=8.87e-06, step=6607] Training: 66%|██████▌ | 6608/10000 [1:21:09<53:05, 1.06it/s, loss=0.0027, lr=8.87e-06, step=6608] Training: 66%|██████▌ | 6609/10000 [1:21:10<51:16, 1.10it/s, loss=0.0027, lr=8.87e-06, step=6608] Training: 66%|██████▌ | 6609/10000 [1:21:10<51:16, 1.10it/s, loss=0.0049, lr=8.87e-06, step=6609]17:27:17.308 [I] step=6610 loss=0.0075 smoothed_loss=0.0129 lr=8.88e-06 grad_norm=0.4718 step_time=0.6649s data_time=0.1923s it/s=1.167 eta_to_10000=2905.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0077 grad_action_out_proj=0.0890 grad_shared_expert=0.2418 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6610/10000 [1:21:10<45:13, 1.25it/s, loss=0.0049, lr=8.87e-06, step=6609] Training: 66%|██████▌ | 6610/10000 [1:21:10<45:13, 1.25it/s, loss=0.0075, lr=8.86e-06, step=6610] Training: 66%|██████▌ | 6611/10000 [1:21:11<44:51, 1.26it/s, loss=0.0075, lr=8.86e-06, step=6610] Training: 66%|██████▌ | 6611/10000 [1:21:11<44:51, 1.26it/s, loss=0.0078, lr=8.86e-06, step=6611] Training: 66%|██████▌ | 6612/10000 [1:21:12<45:33, 1.24it/s, loss=0.0078, lr=8.86e-06, step=6611] Training: 66%|██████▌ | 6612/10000 [1:21:12<45:33, 1.24it/s, loss=0.0125, lr=8.86e-06, step=6612] Training: 66%|██████▌ | 6613/10000 [1:21:13<47:09, 1.20it/s, loss=0.0125, lr=8.86e-06, step=6612] Training: 66%|██████▌ | 6613/10000 [1:21:13<47:09, 1.20it/s, loss=0.0038, lr=8.85e-06, step=6613] Training: 66%|██████▌ | 6614/10000 [1:21:14<45:51, 1.23it/s, loss=0.0038, lr=8.85e-06, step=6613] Training: 66%|██████▌ | 6614/10000 [1:21:14<45:51, 1.23it/s, loss=0.0067, lr=8.85e-06, step=6614] Training: 66%|██████▌ | 6615/10000 [1:21:15<51:06, 1.10it/s, loss=0.0067, lr=8.85e-06, step=6614] Training: 66%|██████▌ | 6615/10000 [1:21:15<51:06, 1.10it/s, loss=0.0033, lr=8.85e-06, step=6615] Training: 66%|██████▌ | 6616/10000 [1:21:16<50:46, 1.11it/s, loss=0.0033, lr=8.85e-06, step=6615] Training: 66%|██████▌ | 6616/10000 [1:21:16<50:46, 1.11it/s, loss=0.0029, lr=8.84e-06, step=6616] Training: 66%|██████▌ | 6617/10000 [1:21:17<54:20, 1.04it/s, loss=0.0029, lr=8.84e-06, step=6616] Training: 66%|██████▌ | 6617/10000 [1:21:17<54:20, 1.04it/s, loss=0.0037, lr=8.84e-06, step=6617] Training: 66%|██████▌ | 6618/10000 [1:21:18<52:04, 1.08it/s, loss=0.0037, lr=8.84e-06, step=6617] Training: 66%|██████▌ | 6618/10000 [1:21:18<52:04, 1.08it/s, loss=0.0046, lr=8.84e-06, step=6618] Training: 66%|██████▌ | 6619/10000 [1:21:18<48:15, 1.17it/s, loss=0.0046, lr=8.84e-06, step=6618] Training: 66%|██████▌ | 6619/10000 [1:21:18<48:15, 1.17it/s, loss=0.0075, lr=8.83e-06, step=6619]17:27:25.954 [I] step=6620 loss=0.0025 smoothed_loss=0.0078 lr=8.84e-06 grad_norm=0.4202 step_time=0.6860s data_time=0.1786s it/s=1.157 eta_to_10000=2922.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0055 grad_action_out_proj=0.0666 grad_shared_expert=0.2066 (10775:train_pytorch.py:850) + Training: 66%|██████▌ | 6620/10000 [1:21:19<45:53, 1.23it/s, loss=0.0075, lr=8.83e-06, step=6619] Training: 66%|██████▌ | 6620/10000 [1:21:19<45:53, 1.23it/s, loss=0.0025, lr=8.83e-06, step=6620] Training: 66%|██████▌ | 6621/10000 [1:21:20<45:33, 1.24it/s, loss=0.0025, lr=8.83e-06, step=6620] Training: 66%|██████▌ | 6621/10000 [1:21:20<45:33, 1.24it/s, loss=0.0084, lr=8.83e-06, step=6621] Training: 66%|██████▌ | 6622/10000 [1:21:21<51:26, 1.09it/s, loss=0.0084, lr=8.83e-06, step=6621] Training: 66%|██████▌ | 6622/10000 [1:21:21<51:26, 1.09it/s, loss=0.0076, lr=8.82e-06, step=6622] Training: 66%|██████▌ | 6623/10000 [1:21:22<52:54, 1.06it/s, loss=0.0076, lr=8.82e-06, step=6622] Training: 66%|██████▌ | 6623/10000 [1:21:22<52:54, 1.06it/s, loss=0.0086, lr=8.82e-06, step=6623] Training: 66%|██████▌ | 6624/10000 [1:21:23<50:56, 1.10it/s, loss=0.0086, lr=8.82e-06, step=6623] Training: 66%|██████▌ | 6624/10000 [1:21:23<50:56, 1.10it/s, loss=0.0020, lr=8.82e-06, step=6624] Training: 66%|██████▋ | 6625/10000 [1:21:23<46:16, 1.22it/s, loss=0.0020, lr=8.82e-06, step=6624] Training: 66%|██████▋ | 6625/10000 [1:21:23<46:16, 1.22it/s, loss=0.0107, lr=8.81e-06, step=6625] Training: 66%|██████▋ | 6626/10000 [1:21:25<52:32, 1.07it/s, loss=0.0107, lr=8.81e-06, step=6625] Training: 66%|██████▋ | 6626/10000 [1:21:25<52:32, 1.07it/s, loss=0.0165, lr=8.81e-06, step=6626] Training: 66%|██████▋ | 6627/10000 [1:21:25<45:09, 1.25it/s, loss=0.0165, lr=8.81e-06, step=6626] Training: 66%|██████▋ | 6627/10000 [1:21:25<45:09, 1.25it/s, loss=0.0131, lr=8.81e-06, step=6627] Training: 66%|██████▋ | 6628/10000 [1:21:26<40:35, 1.38it/s, loss=0.0131, lr=8.81e-06, step=6627] Training: 66%|██████▋ | 6628/10000 [1:21:26<40:35, 1.38it/s, loss=0.0095, lr=8.80e-06, step=6628] Training: 66%|██████▋ | 6629/10000 [1:21:26<36:59, 1.52it/s, loss=0.0095, lr=8.80e-06, step=6628] Training: 66%|██████▋ | 6629/10000 [1:21:26<36:59, 1.52it/s, loss=0.0067, lr=8.80e-06, step=6629]17:27:33.834 [I] step=6630 loss=0.0144 smoothed_loss=0.0094 lr=8.81e-06 grad_norm=0.4697 step_time=0.6322s data_time=0.1558s it/s=1.269 eta_to_10000=2654.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0243 grad_action_out_proj=0.1874 grad_shared_expert=0.6131 (10775:train_pytorch.py:850) + Training: 66%|██████▋ | 6630/10000 [1:21:27<38:15, 1.47it/s, loss=0.0067, lr=8.80e-06, step=6629] Training: 66%|██████▋ | 6630/10000 [1:21:27<38:15, 1.47it/s, loss=0.0144, lr=8.80e-06, step=6630] Training: 66%|██████▋ | 6631/10000 [1:21:27<36:02, 1.56it/s, loss=0.0144, lr=8.80e-06, step=6630] Training: 66%|██████▋ | 6631/10000 [1:21:27<36:02, 1.56it/s, loss=0.0056, lr=8.79e-06, step=6631] Training: 66%|██████▋ | 6632/10000 [1:21:28<34:18, 1.64it/s, loss=0.0056, lr=8.79e-06, step=6631] Training: 66%|██████▋ | 6632/10000 [1:21:28<34:18, 1.64it/s, loss=0.0047, lr=8.79e-06, step=6632] Training: 66%|██████▋ | 6633/10000 [1:21:29<41:12, 1.36it/s, loss=0.0047, lr=8.79e-06, step=6632] Training: 66%|██████▋ | 6633/10000 [1:21:29<41:12, 1.36it/s, loss=0.0104, lr=8.79e-06, step=6633] Training: 66%|██████▋ | 6634/10000 [1:21:30<42:52, 1.31it/s, loss=0.0104, lr=8.79e-06, step=6633] Training: 66%|██████▋ | 6634/10000 [1:21:30<42:52, 1.31it/s, loss=0.0067, lr=8.78e-06, step=6634] Training: 66%|██████▋ | 6635/10000 [1:21:31<42:39, 1.31it/s, loss=0.0067, lr=8.78e-06, step=6634] Training: 66%|██████▋ | 6635/10000 [1:21:31<42:39, 1.31it/s, loss=0.0041, lr=8.78e-06, step=6635] Training: 66%|██████▋ | 6636/10000 [1:21:32<46:19, 1.21it/s, loss=0.0041, lr=8.78e-06, step=6635] Training: 66%|██████▋ | 6636/10000 [1:21:32<46:19, 1.21it/s, loss=0.0034, lr=8.78e-06, step=6636] Training: 66%|██████▋ | 6637/10000 [1:21:33<53:10, 1.05it/s, loss=0.0034, lr=8.78e-06, step=6636] Training: 66%|██████▋ | 6637/10000 [1:21:33<53:10, 1.05it/s, loss=0.0057, lr=8.77e-06, step=6637] Training: 66%|██████▋ | 6638/10000 [1:21:33<45:48, 1.22it/s, loss=0.0057, lr=8.77e-06, step=6637] Training: 66%|██████▋ | 6638/10000 [1:21:33<45:48, 1.22it/s, loss=0.0025, lr=8.77e-06, step=6638] Training: 66%|██████▋ | 6639/10000 [1:21:34<41:17, 1.36it/s, loss=0.0025, lr=8.77e-06, step=6638] Training: 66%|██████▋ | 6639/10000 [1:21:34<41:17, 1.36it/s, loss=0.0273, lr=8.77e-06, step=6639]17:27:41.573 [I] step=6640 loss=0.0097 smoothed_loss=0.0090 lr=8.78e-06 grad_norm=0.4584 step_time=0.6155s data_time=0.1584s it/s=1.292 eta_to_10000=2600.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0144 grad_action_out_proj=0.1077 grad_shared_expert=0.4142 (10775:train_pytorch.py:850) + Training: 66%|██████▋ | 6640/10000 [1:21:35<41:47, 1.34it/s, loss=0.0273, lr=8.77e-06, step=6639] Training: 66%|██████▋ | 6640/10000 [1:21:35<41:47, 1.34it/s, loss=0.0097, lr=8.76e-06, step=6640] Training: 66%|██████▋ | 6641/10000 [1:21:35<38:06, 1.47it/s, loss=0.0097, lr=8.76e-06, step=6640] Training: 66%|██████▋ | 6641/10000 [1:21:35<38:06, 1.47it/s, loss=0.0421, lr=8.76e-06, step=6641] Training: 66%|██████▋ | 6642/10000 [1:21:36<35:11, 1.59it/s, loss=0.0421, lr=8.76e-06, step=6641] Training: 66%|██████▋ | 6642/10000 [1:21:36<35:11, 1.59it/s, loss=0.0028, lr=8.76e-06, step=6642] Training: 66%|██████▋ | 6643/10000 [1:21:37<40:43, 1.37it/s, loss=0.0028, lr=8.76e-06, step=6642] Training: 66%|██████▋ | 6643/10000 [1:21:37<40:43, 1.37it/s, loss=0.0075, lr=8.75e-06, step=6643] Training: 66%|██████▋ | 6644/10000 [1:21:37<38:36, 1.45it/s, loss=0.0075, lr=8.75e-06, step=6643] Training: 66%|██████▋ | 6644/10000 [1:21:37<38:36, 1.45it/s, loss=0.0094, lr=8.75e-06, step=6644] Training: 66%|██████▋ | 6645/10000 [1:21:38<46:55, 1.19it/s, loss=0.0094, lr=8.75e-06, step=6644] Training: 66%|██████▋ | 6645/10000 [1:21:38<46:55, 1.19it/s, loss=0.0357, lr=8.75e-06, step=6645] Training: 66%|██████▋ | 6646/10000 [1:21:40<51:16, 1.09it/s, loss=0.0357, lr=8.75e-06, step=6645] Training: 66%|██████▋ | 6646/10000 [1:21:40<51:16, 1.09it/s, loss=0.0120, lr=8.74e-06, step=6646] Training: 66%|██████▋ | 6647/10000 [1:21:41<58:38, 1.05s/it, loss=0.0120, lr=8.74e-06, step=6646] Training: 66%|██████▋ | 6647/10000 [1:21:41<58:38, 1.05s/it, loss=0.0007, lr=8.74e-06, step=6647] Training: 66%|██████▋ | 6648/10000 [1:21:42<58:11, 1.04s/it, loss=0.0007, lr=8.74e-06, step=6647] Training: 66%|██████▋ | 6648/10000 [1:21:42<58:11, 1.04s/it, loss=0.0111, lr=8.74e-06, step=6648] Training: 66%|██████▋ | 6649/10000 [1:21:43<55:43, 1.00it/s, loss=0.0111, lr=8.74e-06, step=6648] Training: 66%|██████▋ | 6649/10000 [1:21:43<55:43, 1.00it/s, loss=0.0055, lr=8.73e-06, step=6649]17:27:50.661 [I] step=6650 loss=0.0033 smoothed_loss=0.0104 lr=8.74e-06 grad_norm=0.4646 step_time=0.6783s data_time=0.2306s it/s=1.100 eta_to_10000=3044.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0180 grad_action_out_proj=0.1083 grad_shared_expert=0.3408 (10775:train_pytorch.py:850) + Training: 66%|██████▋ | 6650/10000 [1:21:44<54:32, 1.02it/s, loss=0.0055, lr=8.73e-06, step=6649] Training: 66%|██████▋ | 6650/10000 [1:21:44<54:32, 1.02it/s, loss=0.0033, lr=8.73e-06, step=6650] Training: 67%|██████▋ | 6651/10000 [1:21:45<53:02, 1.05it/s, loss=0.0033, lr=8.73e-06, step=6650] Training: 67%|██████▋ | 6651/10000 [1:21:45<53:02, 1.05it/s, loss=0.0097, lr=8.73e-06, step=6651] Training: 67%|██████▋ | 6652/10000 [1:21:46<55:14, 1.01it/s, loss=0.0097, lr=8.73e-06, step=6651] Training: 67%|██████▋ | 6652/10000 [1:21:46<55:14, 1.01it/s, loss=0.0045, lr=8.72e-06, step=6652] Training: 67%|██████▋ | 6653/10000 [1:21:47<56:53, 1.02s/it, loss=0.0045, lr=8.72e-06, step=6652] Training: 67%|██████▋ | 6653/10000 [1:21:47<56:53, 1.02s/it, loss=0.0037, lr=8.72e-06, step=6653] Training: 67%|██████▋ | 6654/10000 [1:21:48<57:45, 1.04s/it, loss=0.0037, lr=8.72e-06, step=6653] Training: 67%|██████▋ | 6654/10000 [1:21:48<57:45, 1.04s/it, loss=0.0079, lr=8.72e-06, step=6654] Training: 67%|██████▋ | 6655/10000 [1:21:49<1:01:06, 1.10s/it, loss=0.0079, lr=8.72e-06, step=6654] Training: 67%|██████▋ | 6655/10000 [1:21:49<1:01:06, 1.10s/it, loss=0.0212, lr=8.71e-06, step=6655] Training: 67%|██████▋ | 6656/10000 [1:21:50<58:32, 1.05s/it, loss=0.0212, lr=8.71e-06, step=6655] Training: 67%|██████▋ | 6656/10000 [1:21:50<58:32, 1.05s/it, loss=0.0143, lr=8.71e-06, step=6656] Training: 67%|██████▋ | 6657/10000 [1:21:51<55:05, 1.01it/s, loss=0.0143, lr=8.71e-06, step=6656] Training: 67%|██████▋ | 6657/10000 [1:21:51<55:05, 1.01it/s, loss=0.0039, lr=8.71e-06, step=6657] Training: 67%|██████▋ | 6658/10000 [1:21:52<1:02:15, 1.12s/it, loss=0.0039, lr=8.71e-06, step=6657] Training: 67%|██████▋ | 6658/10000 [1:21:52<1:02:15, 1.12s/it, loss=0.0053, lr=8.70e-06, step=6658] Training: 67%|██████▋ | 6659/10000 [1:21:53<57:59, 1.04s/it, loss=0.0053, lr=8.70e-06, step=6658] Training: 67%|██████▋ | 6659/10000 [1:21:53<57:59, 1.04s/it, loss=0.0134, lr=8.70e-06, step=6659]17:28:00.793 [I] step=6660 loss=0.0026 smoothed_loss=0.0092 lr=8.71e-06 grad_norm=0.4279 step_time=0.7464s data_time=0.2667s it/s=0.987 eta_to_10000=3383.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0200 grad_action_out_proj=0.1622 grad_shared_expert=0.4295 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6660/10000 [1:21:54<52:06, 1.07it/s, loss=0.0134, lr=8.70e-06, step=6659] Training: 67%|██████▋ | 6660/10000 [1:21:54<52:06, 1.07it/s, loss=0.0026, lr=8.70e-06, step=6660] Training: 67%|██████▋ | 6661/10000 [1:21:55<51:44, 1.08it/s, loss=0.0026, lr=8.70e-06, step=6660] Training: 67%|██████▋ | 6661/10000 [1:21:55<51:44, 1.08it/s, loss=0.0065, lr=8.69e-06, step=6661] Training: 67%|██████▋ | 6662/10000 [1:21:56<53:30, 1.04it/s, loss=0.0065, lr=8.69e-06, step=6661] Training: 67%|██████▋ | 6662/10000 [1:21:56<53:30, 1.04it/s, loss=0.0058, lr=8.69e-06, step=6662] Training: 67%|██████▋ | 6663/10000 [1:21:57<53:39, 1.04it/s, loss=0.0058, lr=8.69e-06, step=6662] Training: 67%|██████▋ | 6663/10000 [1:21:57<53:39, 1.04it/s, loss=0.0030, lr=8.69e-06, step=6663] Training: 67%|██████▋ | 6664/10000 [1:21:58<51:12, 1.09it/s, loss=0.0030, lr=8.69e-06, step=6663] Training: 67%|██████▋ | 6664/10000 [1:21:58<51:12, 1.09it/s, loss=0.0123, lr=8.68e-06, step=6664] Training: 67%|██████▋ | 6665/10000 [1:21:59<54:31, 1.02it/s, loss=0.0123, lr=8.68e-06, step=6664] Training: 67%|██████▋ | 6665/10000 [1:21:59<54:31, 1.02it/s, loss=0.0074, lr=8.68e-06, step=6665] Training: 67%|██████▋ | 6666/10000 [1:21:59<47:03, 1.18it/s, loss=0.0074, lr=8.68e-06, step=6665] Training: 67%|██████▋ | 6666/10000 [1:21:59<47:03, 1.18it/s, loss=0.0031, lr=8.68e-06, step=6666] Training: 67%|██████▋ | 6667/10000 [1:22:00<41:25, 1.34it/s, loss=0.0031, lr=8.68e-06, step=6666] Training: 67%|██████▋ | 6667/10000 [1:22:00<41:25, 1.34it/s, loss=0.0077, lr=8.67e-06, step=6667] Training: 67%|██████▋ | 6668/10000 [1:22:00<38:25, 1.45it/s, loss=0.0077, lr=8.67e-06, step=6667] Training: 67%|██████▋ | 6668/10000 [1:22:00<38:25, 1.45it/s, loss=0.0028, lr=8.67e-06, step=6668] Training: 67%|██████▋ | 6669/10000 [1:22:01<38:03, 1.46it/s, loss=0.0028, lr=8.67e-06, step=6668] Training: 67%|██████▋ | 6669/10000 [1:22:01<38:03, 1.46it/s, loss=0.0077, lr=8.67e-06, step=6669]17:28:08.956 [I] step=6670 loss=0.0121 smoothed_loss=0.0078 lr=8.68e-06 grad_norm=0.4548 step_time=0.6525s data_time=0.1638s it/s=1.225 eta_to_10000=2718.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0070 grad_action_out_proj=0.0688 grad_shared_expert=0.4250 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6670/10000 [1:22:02<43:37, 1.27it/s, loss=0.0077, lr=8.67e-06, step=6669] Training: 67%|██████▋ | 6670/10000 [1:22:02<43:37, 1.27it/s, loss=0.0121, lr=8.66e-06, step=6670] Training: 67%|██████▋ | 6671/10000 [1:22:03<38:40, 1.43it/s, loss=0.0121, lr=8.66e-06, step=6670] Training: 67%|██████▋ | 6671/10000 [1:22:03<38:40, 1.43it/s, loss=0.0116, lr=8.66e-06, step=6671] Training: 67%|██████▋ | 6672/10000 [1:22:03<39:27, 1.41it/s, loss=0.0116, lr=8.66e-06, step=6671] Training: 67%|██████▋ | 6672/10000 [1:22:03<39:27, 1.41it/s, loss=0.0032, lr=8.66e-06, step=6672] Training: 67%|██████▋ | 6673/10000 [1:22:04<36:35, 1.52it/s, loss=0.0032, lr=8.66e-06, step=6672] Training: 67%|██████▋ | 6673/10000 [1:22:04<36:35, 1.52it/s, loss=0.0025, lr=8.65e-06, step=6673] Training: 67%|██████▋ | 6674/10000 [1:22:05<39:57, 1.39it/s, loss=0.0025, lr=8.65e-06, step=6673] Training: 67%|██████▋ | 6674/10000 [1:22:05<39:57, 1.39it/s, loss=0.0103, lr=8.65e-06, step=6674] Training: 67%|██████▋ | 6675/10000 [1:22:05<41:31, 1.33it/s, loss=0.0103, lr=8.65e-06, step=6674] Training: 67%|██████▋ | 6675/10000 [1:22:05<41:31, 1.33it/s, loss=0.0055, lr=8.65e-06, step=6675] Training: 67%|██████▋ | 6676/10000 [1:22:06<42:45, 1.30it/s, loss=0.0055, lr=8.65e-06, step=6675] Training: 67%|██████▋ | 6676/10000 [1:22:06<42:45, 1.30it/s, loss=0.0103, lr=8.64e-06, step=6676] Training: 67%|██████▋ | 6677/10000 [1:22:07<46:23, 1.19it/s, loss=0.0103, lr=8.64e-06, step=6676] Training: 67%|██████▋ | 6677/10000 [1:22:07<46:23, 1.19it/s, loss=0.0098, lr=8.64e-06, step=6677] Training: 67%|██████▋ | 6678/10000 [1:22:08<46:34, 1.19it/s, loss=0.0098, lr=8.64e-06, step=6677] Training: 67%|██████▋ | 6678/10000 [1:22:08<46:34, 1.19it/s, loss=0.0188, lr=8.64e-06, step=6678] Training: 67%|██████▋ | 6679/10000 [1:22:09<49:58, 1.11it/s, loss=0.0188, lr=8.64e-06, step=6678] Training: 67%|██████▋ | 6679/10000 [1:22:09<49:58, 1.11it/s, loss=0.0030, lr=8.63e-06, step=6679]17:28:16.964 [I] step=6680 loss=0.0054 smoothed_loss=0.0080 lr=8.64e-06 grad_norm=0.5099 step_time=0.6410s data_time=0.1598s it/s=1.249 eta_to_10000=2658.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0215 grad_action_out_proj=0.1683 grad_shared_expert=0.8534 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6680/10000 [1:22:10<48:58, 1.13it/s, loss=0.0030, lr=8.63e-06, step=6679] Training: 67%|██████▋ | 6680/10000 [1:22:10<48:58, 1.13it/s, loss=0.0054, lr=8.63e-06, step=6680] Training: 67%|██████▋ | 6681/10000 [1:22:11<42:39, 1.30it/s, loss=0.0054, lr=8.63e-06, step=6680] Training: 67%|██████▋ | 6681/10000 [1:22:11<42:39, 1.30it/s, loss=0.0055, lr=8.63e-06, step=6681] Training: 67%|██████▋ | 6682/10000 [1:22:11<38:35, 1.43it/s, loss=0.0055, lr=8.63e-06, step=6681] Training: 67%|██████▋ | 6682/10000 [1:22:11<38:35, 1.43it/s, loss=0.0034, lr=8.62e-06, step=6682] Training: 67%|██████▋ | 6683/10000 [1:22:12<39:52, 1.39it/s, loss=0.0034, lr=8.62e-06, step=6682] Training: 67%|██████▋ | 6683/10000 [1:22:12<39:52, 1.39it/s, loss=0.0110, lr=8.62e-06, step=6683] Training: 67%|██████▋ | 6684/10000 [1:22:13<45:41, 1.21it/s, loss=0.0110, lr=8.62e-06, step=6683] Training: 67%|██████▋ | 6684/10000 [1:22:13<45:41, 1.21it/s, loss=0.0522, lr=8.62e-06, step=6684] Training: 67%|██████▋ | 6685/10000 [1:22:14<51:13, 1.08it/s, loss=0.0522, lr=8.62e-06, step=6684] Training: 67%|██████▋ | 6685/10000 [1:22:14<51:13, 1.08it/s, loss=0.0012, lr=8.61e-06, step=6685] Training: 67%|██████▋ | 6686/10000 [1:22:15<54:08, 1.02it/s, loss=0.0012, lr=8.61e-06, step=6685] Training: 67%|██████▋ | 6686/10000 [1:22:15<54:08, 1.02it/s, loss=0.0030, lr=8.61e-06, step=6686] Training: 67%|██████▋ | 6687/10000 [1:22:16<54:46, 1.01it/s, loss=0.0030, lr=8.61e-06, step=6686] Training: 67%|██████▋ | 6687/10000 [1:22:16<54:46, 1.01it/s, loss=0.0084, lr=8.61e-06, step=6687] Training: 67%|██████▋ | 6688/10000 [1:22:17<52:05, 1.06it/s, loss=0.0084, lr=8.61e-06, step=6687] Training: 67%|██████▋ | 6688/10000 [1:22:17<52:05, 1.06it/s, loss=0.0349, lr=8.60e-06, step=6688] Training: 67%|██████▋ | 6689/10000 [1:22:18<48:25, 1.14it/s, loss=0.0349, lr=8.60e-06, step=6688] Training: 67%|██████▋ | 6689/10000 [1:22:18<48:25, 1.14it/s, loss=0.0109, lr=8.60e-06, step=6689]17:28:25.540 [I] step=6690 loss=0.0060 smoothed_loss=0.0117 lr=8.61e-06 grad_norm=0.4862 step_time=0.6285s data_time=0.2292s it/s=1.166 eta_to_10000=2838.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0102 grad_action_out_proj=0.1206 grad_shared_expert=0.6703 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6690/10000 [1:22:19<48:02, 1.15it/s, loss=0.0109, lr=8.60e-06, step=6689] Training: 67%|██████▋ | 6690/10000 [1:22:19<48:02, 1.15it/s, loss=0.0060, lr=8.60e-06, step=6690] Training: 67%|██████▋ | 6691/10000 [1:22:19<44:32, 1.24it/s, loss=0.0060, lr=8.60e-06, step=6690] Training: 67%|██████▋ | 6691/10000 [1:22:19<44:32, 1.24it/s, loss=0.0024, lr=8.59e-06, step=6691] Training: 67%|██████▋ | 6692/10000 [1:22:20<44:34, 1.24it/s, loss=0.0024, lr=8.59e-06, step=6691] Training: 67%|██████▋ | 6692/10000 [1:22:20<44:34, 1.24it/s, loss=0.0178, lr=8.59e-06, step=6692] Training: 67%|██████▋ | 6693/10000 [1:22:21<42:47, 1.29it/s, loss=0.0178, lr=8.59e-06, step=6692] Training: 67%|██████▋ | 6693/10000 [1:22:21<42:47, 1.29it/s, loss=0.0031, lr=8.59e-06, step=6693] Training: 67%|██████▋ | 6694/10000 [1:22:22<47:31, 1.16it/s, loss=0.0031, lr=8.59e-06, step=6693] Training: 67%|██████▋ | 6694/10000 [1:22:22<47:31, 1.16it/s, loss=0.0016, lr=8.58e-06, step=6694] Training: 67%|██████▋ | 6695/10000 [1:22:23<47:38, 1.16it/s, loss=0.0016, lr=8.58e-06, step=6694] Training: 67%|██████▋ | 6695/10000 [1:22:23<47:38, 1.16it/s, loss=0.0084, lr=8.58e-06, step=6695] Training: 67%|██████▋ | 6696/10000 [1:22:23<45:21, 1.21it/s, loss=0.0084, lr=8.58e-06, step=6695] Training: 67%|██████▋ | 6696/10000 [1:22:23<45:21, 1.21it/s, loss=0.0014, lr=8.58e-06, step=6696] Training: 67%|██████▋ | 6697/10000 [1:22:24<47:18, 1.16it/s, loss=0.0014, lr=8.58e-06, step=6696] Training: 67%|██████▋ | 6697/10000 [1:22:24<47:18, 1.16it/s, loss=0.0160, lr=8.57e-06, step=6697] Training: 67%|██████▋ | 6698/10000 [1:22:25<45:49, 1.20it/s, loss=0.0160, lr=8.57e-06, step=6697] Training: 67%|██████▋ | 6698/10000 [1:22:25<45:49, 1.20it/s, loss=0.0062, lr=8.57e-06, step=6698] Training: 67%|██████▋ | 6699/10000 [1:22:26<49:11, 1.12it/s, loss=0.0062, lr=8.57e-06, step=6698] Training: 67%|██████▋ | 6699/10000 [1:22:26<49:11, 1.12it/s, loss=0.0006, lr=8.57e-06, step=6699]17:28:34.213 [I] step=6700 loss=0.0135 smoothed_loss=0.0088 lr=8.58e-06 grad_norm=0.4922 step_time=0.6611s data_time=0.2062s it/s=1.153 eta_to_10000=2861.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0393 grad_action_out_proj=0.2061 grad_shared_expert=0.5453 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6700/10000 [1:22:27<52:23, 1.05it/s, loss=0.0006, lr=8.57e-06, step=6699] Training: 67%|██████▋ | 6700/10000 [1:22:27<52:23, 1.05it/s, loss=0.0135, lr=8.56e-06, step=6700] Training: 67%|██████▋ | 6701/10000 [1:22:28<54:25, 1.01it/s, loss=0.0135, lr=8.56e-06, step=6700] Training: 67%|██████▋ | 6701/10000 [1:22:28<54:25, 1.01it/s, loss=0.0224, lr=8.56e-06, step=6701] Training: 67%|██████▋ | 6702/10000 [1:22:29<52:37, 1.04it/s, loss=0.0224, lr=8.56e-06, step=6701] Training: 67%|██████▋ | 6702/10000 [1:22:29<52:37, 1.04it/s, loss=0.0238, lr=8.56e-06, step=6702] Training: 67%|██████▋ | 6703/10000 [1:22:30<53:35, 1.03it/s, loss=0.0238, lr=8.56e-06, step=6702] Training: 67%|██████▋ | 6703/10000 [1:22:30<53:35, 1.03it/s, loss=0.0156, lr=8.55e-06, step=6703] Training: 67%|██████▋ | 6704/10000 [1:22:31<52:04, 1.05it/s, loss=0.0156, lr=8.55e-06, step=6703] Training: 67%|██████▋ | 6704/10000 [1:22:31<52:04, 1.05it/s, loss=0.0078, lr=8.55e-06, step=6704] Training: 67%|██████▋ | 6705/10000 [1:22:32<46:59, 1.17it/s, loss=0.0078, lr=8.55e-06, step=6704] Training: 67%|██████▋ | 6705/10000 [1:22:32<46:59, 1.17it/s, loss=0.0036, lr=8.55e-06, step=6705] Training: 67%|██████▋ | 6706/10000 [1:22:33<52:53, 1.04it/s, loss=0.0036, lr=8.55e-06, step=6705] Training: 67%|██████▋ | 6706/10000 [1:22:33<52:53, 1.04it/s, loss=0.0024, lr=8.54e-06, step=6706] Training: 67%|██████▋ | 6707/10000 [1:22:34<57:23, 1.05s/it, loss=0.0024, lr=8.54e-06, step=6706] Training: 67%|██████▋ | 6707/10000 [1:22:34<57:23, 1.05s/it, loss=0.0116, lr=8.54e-06, step=6707] Training: 67%|██████▋ | 6708/10000 [1:22:36<1:01:29, 1.12s/it, loss=0.0116, lr=8.54e-06, step=6707] Training: 67%|██████▋ | 6708/10000 [1:22:36<1:01:29, 1.12s/it, loss=0.0041, lr=8.54e-06, step=6708] Training: 67%|██████▋ | 6709/10000 [1:22:36<55:05, 1.00s/it, loss=0.0041, lr=8.54e-06, step=6708] Training: 67%|██████▋ | 6709/10000 [1:22:36<55:05, 1.00s/it, loss=0.0019, lr=8.53e-06, step=6709]17:28:43.747 [I] step=6710 loss=0.0152 smoothed_loss=0.0094 lr=8.55e-06 grad_norm=0.4412 step_time=0.6997s data_time=0.2537s it/s=1.049 eta_to_10000=3136.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.1132 grad_shared_expert=0.4726 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6710/10000 [1:22:37<47:38, 1.15it/s, loss=0.0019, lr=8.53e-06, step=6709] Training: 67%|██████▋ | 6710/10000 [1:22:37<47:38, 1.15it/s, loss=0.0152, lr=8.53e-06, step=6710] Training: 67%|██████▋ | 6711/10000 [1:22:38<47:16, 1.16it/s, loss=0.0152, lr=8.53e-06, step=6710] Training: 67%|██████▋ | 6711/10000 [1:22:38<47:16, 1.16it/s, loss=0.0159, lr=8.53e-06, step=6711] Training: 67%|██████▋ | 6712/10000 [1:22:38<46:10, 1.19it/s, loss=0.0159, lr=8.53e-06, step=6711] Training: 67%|██████▋ | 6712/10000 [1:22:38<46:10, 1.19it/s, loss=0.0305, lr=8.52e-06, step=6712] Training: 67%|██████▋ | 6713/10000 [1:22:39<47:40, 1.15it/s, loss=0.0305, lr=8.52e-06, step=6712] Training: 67%|██████▋ | 6713/10000 [1:22:39<47:40, 1.15it/s, loss=0.0105, lr=8.52e-06, step=6713] Training: 67%|██████▋ | 6714/10000 [1:22:40<42:38, 1.28it/s, loss=0.0105, lr=8.52e-06, step=6713] Training: 67%|██████▋ | 6714/10000 [1:22:40<42:38, 1.28it/s, loss=0.0093, lr=8.52e-06, step=6714] Training: 67%|██████▋ | 6715/10000 [1:22:41<46:58, 1.17it/s, loss=0.0093, lr=8.52e-06, step=6714] Training: 67%|██████▋ | 6715/10000 [1:22:41<46:58, 1.17it/s, loss=0.0022, lr=8.51e-06, step=6715] Training: 67%|██████▋ | 6716/10000 [1:22:42<43:13, 1.27it/s, loss=0.0022, lr=8.51e-06, step=6715] Training: 67%|██████▋ | 6716/10000 [1:22:42<43:13, 1.27it/s, loss=0.0352, lr=8.51e-06, step=6716] Training: 67%|██████▋ | 6717/10000 [1:22:42<42:20, 1.29it/s, loss=0.0352, lr=8.51e-06, step=6716] Training: 67%|██████▋ | 6717/10000 [1:22:42<42:20, 1.29it/s, loss=0.0053, lr=8.51e-06, step=6717] Training: 67%|██████▋ | 6718/10000 [1:22:43<43:59, 1.24it/s, loss=0.0053, lr=8.51e-06, step=6717] Training: 67%|██████▋ | 6718/10000 [1:22:43<43:59, 1.24it/s, loss=0.0039, lr=8.50e-06, step=6718] Training: 67%|██████▋ | 6719/10000 [1:22:44<49:34, 1.10it/s, loss=0.0039, lr=8.50e-06, step=6718] Training: 67%|██████▋ | 6719/10000 [1:22:44<49:34, 1.10it/s, loss=0.0144, lr=8.50e-06, step=6719]17:28:52.257 [I] step=6720 loss=0.0022 smoothed_loss=0.0109 lr=8.51e-06 grad_norm=0.4401 step_time=0.6620s data_time=0.1890s it/s=1.175 eta_to_10000=2790.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0081 grad_action_out_proj=0.0728 grad_shared_expert=0.5237 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6720/10000 [1:22:45<50:02, 1.09it/s, loss=0.0144, lr=8.50e-06, step=6719] Training: 67%|██████▋ | 6720/10000 [1:22:45<50:02, 1.09it/s, loss=0.0022, lr=8.50e-06, step=6720] Training: 67%|██████▋ | 6721/10000 [1:22:46<48:59, 1.12it/s, loss=0.0022, lr=8.50e-06, step=6720] Training: 67%|██████▋ | 6721/10000 [1:22:46<48:59, 1.12it/s, loss=0.0083, lr=8.49e-06, step=6721] Training: 67%|██████▋ | 6722/10000 [1:22:47<50:49, 1.07it/s, loss=0.0083, lr=8.49e-06, step=6721] Training: 67%|██████▋ | 6722/10000 [1:22:47<50:49, 1.07it/s, loss=0.0069, lr=8.49e-06, step=6722] Training: 67%|██████▋ | 6723/10000 [1:22:48<44:29, 1.23it/s, loss=0.0069, lr=8.49e-06, step=6722] Training: 67%|██████▋ | 6723/10000 [1:22:48<44:29, 1.23it/s, loss=0.0089, lr=8.49e-06, step=6723] Training: 67%|██████▋ | 6724/10000 [1:22:49<44:16, 1.23it/s, loss=0.0089, lr=8.49e-06, step=6723] Training: 67%|██████▋ | 6724/10000 [1:22:49<44:16, 1.23it/s, loss=0.0130, lr=8.48e-06, step=6724] Training: 67%|██████▋ | 6725/10000 [1:22:49<38:58, 1.40it/s, loss=0.0130, lr=8.48e-06, step=6724] Training: 67%|██████▋ | 6725/10000 [1:22:49<38:58, 1.40it/s, loss=0.0072, lr=8.48e-06, step=6725] Training: 67%|██████▋ | 6726/10000 [1:22:50<42:50, 1.27it/s, loss=0.0072, lr=8.48e-06, step=6725] Training: 67%|██████▋ | 6726/10000 [1:22:50<42:50, 1.27it/s, loss=0.0037, lr=8.48e-06, step=6726] Training: 67%|██████▋ | 6727/10000 [1:22:51<44:52, 1.22it/s, loss=0.0037, lr=8.48e-06, step=6726] Training: 67%|██████▋ | 6727/10000 [1:22:51<44:52, 1.22it/s, loss=0.0120, lr=8.47e-06, step=6727] Training: 67%|██████▋ | 6728/10000 [1:22:52<42:55, 1.27it/s, loss=0.0120, lr=8.47e-06, step=6727] Training: 67%|██████▋ | 6728/10000 [1:22:52<42:55, 1.27it/s, loss=0.0082, lr=8.47e-06, step=6728] Training: 67%|██████▋ | 6729/10000 [1:22:52<42:04, 1.30it/s, loss=0.0082, lr=8.47e-06, step=6728] Training: 67%|██████▋ | 6729/10000 [1:22:52<42:04, 1.30it/s, loss=0.0167, lr=8.47e-06, step=6729]17:29:00.174 [I] step=6730 loss=0.0057 smoothed_loss=0.0098 lr=8.48e-06 grad_norm=0.3820 step_time=0.6268s data_time=0.1649s it/s=1.263 eta_to_10000=2588.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.0783 grad_shared_expert=0.3421 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6730/10000 [1:22:53<44:28, 1.23it/s, loss=0.0167, lr=8.47e-06, step=6729] Training: 67%|██████▋ | 6730/10000 [1:22:53<44:28, 1.23it/s, loss=0.0057, lr=8.46e-06, step=6730] Training: 67%|██████▋ | 6731/10000 [1:22:54<43:48, 1.24it/s, loss=0.0057, lr=8.46e-06, step=6730] Training: 67%|██████▋ | 6731/10000 [1:22:54<43:48, 1.24it/s, loss=0.0162, lr=8.46e-06, step=6731] Training: 67%|██████▋ | 6732/10000 [1:22:55<38:35, 1.41it/s, loss=0.0162, lr=8.46e-06, step=6731] Training: 67%|██████▋ | 6732/10000 [1:22:55<38:35, 1.41it/s, loss=0.0048, lr=8.46e-06, step=6732] Training: 67%|██████▋ | 6733/10000 [1:22:55<35:05, 1.55it/s, loss=0.0048, lr=8.46e-06, step=6732] Training: 67%|██████▋ | 6733/10000 [1:22:55<35:05, 1.55it/s, loss=0.0028, lr=8.45e-06, step=6733] Training: 67%|██████▋ | 6734/10000 [1:22:56<38:02, 1.43it/s, loss=0.0028, lr=8.45e-06, step=6733] Training: 67%|██████▋ | 6734/10000 [1:22:56<38:02, 1.43it/s, loss=0.0559, lr=8.45e-06, step=6734] Training: 67%|██████▋ | 6735/10000 [1:22:56<35:10, 1.55it/s, loss=0.0559, lr=8.45e-06, step=6734] Training: 67%|██████▋ | 6735/10000 [1:22:56<35:10, 1.55it/s, loss=0.0088, lr=8.45e-06, step=6735] Training: 67%|██████▋ | 6736/10000 [1:22:57<36:06, 1.51it/s, loss=0.0088, lr=8.45e-06, step=6735] Training: 67%|██████▋ | 6736/10000 [1:22:57<36:06, 1.51it/s, loss=0.0077, lr=8.44e-06, step=6736] Training: 67%|██████▋ | 6737/10000 [1:22:58<39:26, 1.38it/s, loss=0.0077, lr=8.44e-06, step=6736] Training: 67%|██████▋ | 6737/10000 [1:22:58<39:26, 1.38it/s, loss=0.0113, lr=8.44e-06, step=6737] Training: 67%|██████▋ | 6738/10000 [1:22:58<35:42, 1.52it/s, loss=0.0113, lr=8.44e-06, step=6737] Training: 67%|██████▋ | 6738/10000 [1:22:58<35:42, 1.52it/s, loss=0.0309, lr=8.44e-06, step=6738] Training: 67%|██████▋ | 6739/10000 [1:22:59<33:50, 1.61it/s, loss=0.0309, lr=8.44e-06, step=6738] Training: 67%|██████▋ | 6739/10000 [1:22:59<33:50, 1.61it/s, loss=0.0082, lr=8.43e-06, step=6739]17:29:06.665 [I] step=6740 loss=0.0152 smoothed_loss=0.0140 lr=8.45e-06 grad_norm=0.4281 step_time=0.5434s data_time=0.1057s it/s=1.541 eta_to_10000=2115.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0211 grad_action_out_proj=0.0950 grad_shared_expert=0.3499 (10775:train_pytorch.py:850) + Training: 67%|██████▋ | 6740/10000 [1:23:00<36:15, 1.50it/s, loss=0.0082, lr=8.43e-06, step=6739] Training: 67%|██████▋ | 6740/10000 [1:23:00<36:15, 1.50it/s, loss=0.0152, lr=8.43e-06, step=6740] Training: 67%|██████▋ | 6741/10000 [1:23:01<42:08, 1.29it/s, loss=0.0152, lr=8.43e-06, step=6740] Training: 67%|██████▋ | 6741/10000 [1:23:01<42:08, 1.29it/s, loss=0.0109, lr=8.43e-06, step=6741] Training: 67%|██████▋ | 6742/10000 [1:23:01<40:47, 1.33it/s, loss=0.0109, lr=8.43e-06, step=6741] Training: 67%|██████▋ | 6742/10000 [1:23:01<40:47, 1.33it/s, loss=0.0281, lr=8.43e-06, step=6742] Training: 67%|██████▋ | 6743/10000 [1:23:02<41:15, 1.32it/s, loss=0.0281, lr=8.43e-06, step=6742] Training: 67%|██████▋ | 6743/10000 [1:23:02<41:15, 1.32it/s, loss=0.0071, lr=8.42e-06, step=6743] Training: 67%|██████▋ | 6744/10000 [1:23:03<40:38, 1.33it/s, loss=0.0071, lr=8.42e-06, step=6743] Training: 67%|██████▋ | 6744/10000 [1:23:03<40:38, 1.33it/s, loss=0.0093, lr=8.42e-06, step=6744] Training: 67%|██████▋ | 6745/10000 [1:23:04<40:17, 1.35it/s, loss=0.0093, lr=8.42e-06, step=6744] Training: 67%|██████▋ | 6745/10000 [1:23:04<40:17, 1.35it/s, loss=0.0035, lr=8.42e-06, step=6745] Training: 67%|██████▋ | 6746/10000 [1:23:04<39:15, 1.38it/s, loss=0.0035, lr=8.42e-06, step=6745] Training: 67%|██████▋ | 6746/10000 [1:23:04<39:15, 1.38it/s, loss=0.0055, lr=8.41e-06, step=6746] Training: 67%|██████▋ | 6747/10000 [1:23:05<41:52, 1.29it/s, loss=0.0055, lr=8.41e-06, step=6746] Training: 67%|██████▋ | 6747/10000 [1:23:05<41:52, 1.29it/s, loss=0.0061, lr=8.41e-06, step=6747] Training: 67%|██████▋ | 6748/10000 [1:23:06<37:23, 1.45it/s, loss=0.0061, lr=8.41e-06, step=6747] Training: 67%|██████▋ | 6748/10000 [1:23:06<37:23, 1.45it/s, loss=0.0098, lr=8.41e-06, step=6748] Training: 67%|██████▋ | 6749/10000 [1:23:06<36:41, 1.48it/s, loss=0.0098, lr=8.41e-06, step=6748] Training: 67%|██████▋ | 6749/10000 [1:23:06<36:41, 1.48it/s, loss=0.0147, lr=8.40e-06, step=6749]17:29:13.869 [I] step=6750 loss=0.0017 smoothed_loss=0.0106 lr=8.41e-06 grad_norm=0.7323 step_time=0.5916s data_time=0.1287s it/s=1.388 eta_to_10000=2340.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0174 grad_action_out_proj=0.1450 grad_shared_expert=0.3623 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6750/10000 [1:23:07<34:24, 1.57it/s, loss=0.0147, lr=8.40e-06, step=6749] Training: 68%|██████▊ | 6750/10000 [1:23:07<34:24, 1.57it/s, loss=0.0017, lr=8.40e-06, step=6750] Training: 68%|██████▊ | 6751/10000 [1:23:08<36:38, 1.48it/s, loss=0.0017, lr=8.40e-06, step=6750] Training: 68%|██████▊ | 6751/10000 [1:23:08<36:38, 1.48it/s, loss=0.0063, lr=8.40e-06, step=6751] Training: 68%|██████▊ | 6752/10000 [1:23:08<35:07, 1.54it/s, loss=0.0063, lr=8.40e-06, step=6751] Training: 68%|██████▊ | 6752/10000 [1:23:08<35:07, 1.54it/s, loss=0.0208, lr=8.39e-06, step=6752] Training: 68%|██████▊ | 6753/10000 [1:23:09<35:58, 1.50it/s, loss=0.0208, lr=8.39e-06, step=6752] Training: 68%|██████▊ | 6753/10000 [1:23:09<35:58, 1.50it/s, loss=0.0057, lr=8.39e-06, step=6753] Training: 68%|██████▊ | 6754/10000 [1:23:09<33:21, 1.62it/s, loss=0.0057, lr=8.39e-06, step=6753] Training: 68%|██████▊ | 6754/10000 [1:23:09<33:21, 1.62it/s, loss=0.0062, lr=8.39e-06, step=6754] Training: 68%|██████▊ | 6755/10000 [1:23:10<31:17, 1.73it/s, loss=0.0062, lr=8.39e-06, step=6754] Training: 68%|██████▊ | 6755/10000 [1:23:10<31:17, 1.73it/s, loss=0.0247, lr=8.38e-06, step=6755] Training: 68%|██████▊ | 6756/10000 [1:23:11<34:44, 1.56it/s, loss=0.0247, lr=8.38e-06, step=6755] Training: 68%|██████▊ | 6756/10000 [1:23:11<34:44, 1.56it/s, loss=0.0046, lr=8.38e-06, step=6756] Training: 68%|██████▊ | 6757/10000 [1:23:11<32:32, 1.66it/s, loss=0.0046, lr=8.38e-06, step=6756] Training: 68%|██████▊ | 6757/10000 [1:23:11<32:32, 1.66it/s, loss=0.0043, lr=8.38e-06, step=6757] Training: 68%|██████▊ | 6758/10000 [1:23:12<40:55, 1.32it/s, loss=0.0043, lr=8.38e-06, step=6757] Training: 68%|██████▊ | 6758/10000 [1:23:12<40:55, 1.32it/s, loss=0.0029, lr=8.37e-06, step=6758] Training: 68%|██████▊ | 6759/10000 [1:23:13<36:58, 1.46it/s, loss=0.0029, lr=8.37e-06, step=6758] Training: 68%|██████▊ | 6759/10000 [1:23:13<36:58, 1.46it/s, loss=0.0063, lr=8.37e-06, step=6759]17:29:20.595 [I] step=6760 loss=0.0078 smoothed_loss=0.0091 lr=8.38e-06 grad_norm=0.4340 step_time=0.5738s data_time=0.0988s it/s=1.487 eta_to_10000=2178.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0099 grad_action_out_proj=0.1696 grad_shared_expert=0.4910 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6760/10000 [1:23:14<37:50, 1.43it/s, loss=0.0063, lr=8.37e-06, step=6759] Training: 68%|██████▊ | 6760/10000 [1:23:14<37:50, 1.43it/s, loss=0.0078, lr=8.37e-06, step=6760] Training: 68%|██████▊ | 6761/10000 [1:23:14<37:25, 1.44it/s, loss=0.0078, lr=8.37e-06, step=6760] Training: 68%|██████▊ | 6761/10000 [1:23:14<37:25, 1.44it/s, loss=0.0025, lr=8.36e-06, step=6761] Training: 68%|██████▊ | 6762/10000 [1:23:15<36:36, 1.47it/s, loss=0.0025, lr=8.36e-06, step=6761] Training: 68%|██████▊ | 6762/10000 [1:23:15<36:36, 1.47it/s, loss=0.0105, lr=8.36e-06, step=6762] Training: 68%|██████▊ | 6763/10000 [1:23:16<40:02, 1.35it/s, loss=0.0105, lr=8.36e-06, step=6762] Training: 68%|██████▊ | 6763/10000 [1:23:16<40:02, 1.35it/s, loss=0.0129, lr=8.36e-06, step=6763] Training: 68%|██████▊ | 6764/10000 [1:23:16<37:31, 1.44it/s, loss=0.0129, lr=8.36e-06, step=6763] Training: 68%|██████▊ | 6764/10000 [1:23:16<37:31, 1.44it/s, loss=0.0085, lr=8.35e-06, step=6764] Training: 68%|██████▊ | 6765/10000 [1:23:17<42:08, 1.28it/s, loss=0.0085, lr=8.35e-06, step=6764] Training: 68%|██████▊ | 6765/10000 [1:23:17<42:08, 1.28it/s, loss=0.0055, lr=8.35e-06, step=6765] Training: 68%|██████▊ | 6766/10000 [1:23:18<41:11, 1.31it/s, loss=0.0055, lr=8.35e-06, step=6765] Training: 68%|██████▊ | 6766/10000 [1:23:18<41:11, 1.31it/s, loss=0.0197, lr=8.35e-06, step=6766] Training: 68%|██████▊ | 6767/10000 [1:23:19<40:04, 1.34it/s, loss=0.0197, lr=8.35e-06, step=6766] Training: 68%|██████▊ | 6767/10000 [1:23:19<40:04, 1.34it/s, loss=0.0144, lr=8.34e-06, step=6767] Training: 68%|██████▊ | 6768/10000 [1:23:20<39:00, 1.38it/s, loss=0.0144, lr=8.34e-06, step=6767] Training: 68%|██████▊ | 6768/10000 [1:23:20<39:00, 1.38it/s, loss=0.0194, lr=8.34e-06, step=6768] Training: 68%|██████▊ | 6769/10000 [1:23:20<38:00, 1.42it/s, loss=0.0194, lr=8.34e-06, step=6768] Training: 68%|██████▊ | 6769/10000 [1:23:20<38:00, 1.42it/s, loss=0.0397, lr=8.34e-06, step=6769]17:29:27.660 [I] step=6770 loss=0.0279 smoothed_loss=0.0154 lr=8.35e-06 grad_norm=0.5581 step_time=0.5687s data_time=0.1378s it/s=1.416 eta_to_10000=2281.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.1205 grad_shared_expert=0.7929 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6770/10000 [1:23:21<35:03, 1.54it/s, loss=0.0397, lr=8.34e-06, step=6769] Training: 68%|██████▊ | 6770/10000 [1:23:21<35:03, 1.54it/s, loss=0.0279, lr=8.33e-06, step=6770] Training: 68%|██████▊ | 6771/10000 [1:23:22<38:29, 1.40it/s, loss=0.0279, lr=8.33e-06, step=6770] Training: 68%|██████▊ | 6771/10000 [1:23:22<38:29, 1.40it/s, loss=0.0077, lr=8.33e-06, step=6771] Training: 68%|██████▊ | 6772/10000 [1:23:23<42:10, 1.28it/s, loss=0.0077, lr=8.33e-06, step=6771] Training: 68%|██████▊ | 6772/10000 [1:23:23<42:10, 1.28it/s, loss=0.0261, lr=8.33e-06, step=6772] Training: 68%|██████▊ | 6773/10000 [1:23:23<43:02, 1.25it/s, loss=0.0261, lr=8.33e-06, step=6772] Training: 68%|██████▊ | 6773/10000 [1:23:23<43:02, 1.25it/s, loss=0.0023, lr=8.32e-06, step=6773] Training: 68%|██████▊ | 6774/10000 [1:23:24<41:04, 1.31it/s, loss=0.0023, lr=8.32e-06, step=6773] Training: 68%|██████▊ | 6774/10000 [1:23:24<41:04, 1.31it/s, loss=0.0253, lr=8.32e-06, step=6774] Training: 68%|██████▊ | 6775/10000 [1:23:25<36:48, 1.46it/s, loss=0.0253, lr=8.32e-06, step=6774] Training: 68%|██████▊ | 6775/10000 [1:23:25<36:48, 1.46it/s, loss=0.0242, lr=8.32e-06, step=6775] Training: 68%|██████▊ | 6776/10000 [1:23:25<36:44, 1.46it/s, loss=0.0242, lr=8.32e-06, step=6775] Training: 68%|██████▊ | 6776/10000 [1:23:25<36:44, 1.46it/s, loss=0.0059, lr=8.31e-06, step=6776] Training: 68%|██████▊ | 6777/10000 [1:23:26<37:19, 1.44it/s, loss=0.0059, lr=8.31e-06, step=6776] Training: 68%|██████▊ | 6777/10000 [1:23:26<37:19, 1.44it/s, loss=0.0013, lr=8.31e-06, step=6777] Training: 68%|██████▊ | 6778/10000 [1:23:27<40:58, 1.31it/s, loss=0.0013, lr=8.31e-06, step=6777] Training: 68%|██████▊ | 6778/10000 [1:23:27<40:58, 1.31it/s, loss=0.0027, lr=8.31e-06, step=6778] Training: 68%|██████▊ | 6779/10000 [1:23:28<39:49, 1.35it/s, loss=0.0027, lr=8.31e-06, step=6778] Training: 68%|██████▊ | 6779/10000 [1:23:28<39:49, 1.35it/s, loss=0.0051, lr=8.30e-06, step=6779]17:29:35.476 [I] step=6780 loss=0.0501 smoothed_loss=0.0158 lr=8.32e-06 grad_norm=0.4557 step_time=0.6255s data_time=0.1561s it/s=1.280 eta_to_10000=2516.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0248 grad_action_out_proj=0.1819 grad_shared_expert=0.4957 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6780/10000 [1:23:29<43:32, 1.23it/s, loss=0.0051, lr=8.30e-06, step=6779] Training: 68%|██████▊ | 6780/10000 [1:23:29<43:32, 1.23it/s, loss=0.0501, lr=8.30e-06, step=6780] Training: 68%|██████▊ | 6781/10000 [1:23:29<43:49, 1.22it/s, loss=0.0501, lr=8.30e-06, step=6780] Training: 68%|██████▊ | 6781/10000 [1:23:29<43:49, 1.22it/s, loss=0.0278, lr=8.30e-06, step=6781] Training: 68%|██████▊ | 6782/10000 [1:23:30<44:58, 1.19it/s, loss=0.0278, lr=8.30e-06, step=6781] Training: 68%|██████▊ | 6782/10000 [1:23:30<44:58, 1.19it/s, loss=0.0209, lr=8.29e-06, step=6782] Training: 68%|██████▊ | 6783/10000 [1:23:31<45:10, 1.19it/s, loss=0.0209, lr=8.29e-06, step=6782] Training: 68%|██████▊ | 6783/10000 [1:23:31<45:10, 1.19it/s, loss=0.0279, lr=8.29e-06, step=6783] Training: 68%|██████▊ | 6784/10000 [1:23:32<44:55, 1.19it/s, loss=0.0279, lr=8.29e-06, step=6783] Training: 68%|██████▊ | 6784/10000 [1:23:32<44:55, 1.19it/s, loss=0.0029, lr=8.29e-06, step=6784] Training: 68%|██████▊ | 6785/10000 [1:23:33<46:48, 1.14it/s, loss=0.0029, lr=8.29e-06, step=6784] Training: 68%|██████▊ | 6785/10000 [1:23:33<46:48, 1.14it/s, loss=0.0068, lr=8.28e-06, step=6785] Training: 68%|██████▊ | 6786/10000 [1:23:33<41:09, 1.30it/s, loss=0.0068, lr=8.28e-06, step=6785] Training: 68%|██████▊ | 6786/10000 [1:23:33<41:09, 1.30it/s, loss=0.0237, lr=8.28e-06, step=6786] Training: 68%|██████▊ | 6787/10000 [1:23:34<45:33, 1.18it/s, loss=0.0237, lr=8.28e-06, step=6786] Training: 68%|██████▊ | 6787/10000 [1:23:34<45:33, 1.18it/s, loss=0.0009, lr=8.28e-06, step=6787] Training: 68%|██████▊ | 6788/10000 [1:23:35<44:57, 1.19it/s, loss=0.0009, lr=8.28e-06, step=6787] Training: 68%|██████▊ | 6788/10000 [1:23:35<44:57, 1.19it/s, loss=0.0020, lr=8.27e-06, step=6788] Training: 68%|██████▊ | 6789/10000 [1:23:36<39:30, 1.35it/s, loss=0.0020, lr=8.27e-06, step=6788] Training: 68%|██████▊ | 6789/10000 [1:23:36<39:30, 1.35it/s, loss=0.0257, lr=8.27e-06, step=6789]17:29:43.720 [I] step=6790 loss=0.0186 smoothed_loss=0.0154 lr=8.28e-06 grad_norm=0.5012 step_time=0.6459s data_time=0.1785s it/s=1.213 eta_to_10000=2646.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.0986 grad_shared_expert=0.4404 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6790/10000 [1:23:37<43:49, 1.22it/s, loss=0.0257, lr=8.27e-06, step=6789] Training: 68%|██████▊ | 6790/10000 [1:23:37<43:49, 1.22it/s, loss=0.0186, lr=8.27e-06, step=6790] Training: 68%|██████▊ | 6791/10000 [1:23:37<41:07, 1.30it/s, loss=0.0186, lr=8.27e-06, step=6790] Training: 68%|██████▊ | 6791/10000 [1:23:37<41:07, 1.30it/s, loss=0.0044, lr=8.27e-06, step=6791] Training: 68%|██████▊ | 6792/10000 [1:23:38<45:03, 1.19it/s, loss=0.0044, lr=8.27e-06, step=6791] Training: 68%|██████▊ | 6792/10000 [1:23:38<45:03, 1.19it/s, loss=0.0092, lr=8.26e-06, step=6792] Training: 68%|██████▊ | 6793/10000 [1:23:39<42:38, 1.25it/s, loss=0.0092, lr=8.26e-06, step=6792] Training: 68%|██████▊ | 6793/10000 [1:23:39<42:38, 1.25it/s, loss=0.0041, lr=8.26e-06, step=6793] Training: 68%|██████▊ | 6794/10000 [1:23:40<50:15, 1.06it/s, loss=0.0041, lr=8.26e-06, step=6793] Training: 68%|██████▊ | 6794/10000 [1:23:40<50:15, 1.06it/s, loss=0.0133, lr=8.26e-06, step=6794] Training: 68%|██████▊ | 6795/10000 [1:23:41<49:06, 1.09it/s, loss=0.0133, lr=8.26e-06, step=6794] Training: 68%|██████▊ | 6795/10000 [1:23:41<49:06, 1.09it/s, loss=0.0025, lr=8.25e-06, step=6795] Training: 68%|██████▊ | 6796/10000 [1:23:42<44:48, 1.19it/s, loss=0.0025, lr=8.25e-06, step=6795] Training: 68%|██████▊ | 6796/10000 [1:23:42<44:48, 1.19it/s, loss=0.0034, lr=8.25e-06, step=6796] Training: 68%|██████▊ | 6797/10000 [1:23:42<39:59, 1.33it/s, loss=0.0034, lr=8.25e-06, step=6796] Training: 68%|██████▊ | 6797/10000 [1:23:42<39:59, 1.33it/s, loss=0.0112, lr=8.25e-06, step=6797] Training: 68%|██████▊ | 6798/10000 [1:23:43<39:16, 1.36it/s, loss=0.0112, lr=8.25e-06, step=6797] Training: 68%|██████▊ | 6798/10000 [1:23:43<39:16, 1.36it/s, loss=0.0102, lr=8.24e-06, step=6798] Training: 68%|██████▊ | 6799/10000 [1:23:44<38:35, 1.38it/s, loss=0.0102, lr=8.24e-06, step=6798] Training: 68%|██████▊ | 6799/10000 [1:23:44<38:35, 1.38it/s, loss=0.0042, lr=8.24e-06, step=6799]17:29:51.398 [I] step=6800 loss=0.0092 smoothed_loss=0.0101 lr=8.25e-06 grad_norm=0.4692 step_time=0.6171s data_time=0.1507s it/s=1.303 eta_to_10000=2456.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0690 grad_action_out_proj=0.2389 grad_shared_expert=0.7537 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6800/10000 [1:23:44<36:20, 1.47it/s, loss=0.0042, lr=8.24e-06, step=6799] Training: 68%|██████▊ | 6800/10000 [1:23:44<36:20, 1.47it/s, loss=0.0092, lr=8.24e-06, step=6800] Training: 68%|██████▊ | 6801/10000 [1:23:45<38:11, 1.40it/s, loss=0.0092, lr=8.24e-06, step=6800] Training: 68%|██████▊ | 6801/10000 [1:23:45<38:11, 1.40it/s, loss=0.0073, lr=8.23e-06, step=6801] Training: 68%|██████▊ | 6802/10000 [1:23:46<42:47, 1.25it/s, loss=0.0073, lr=8.23e-06, step=6801] Training: 68%|██████▊ | 6802/10000 [1:23:46<42:47, 1.25it/s, loss=0.0044, lr=8.23e-06, step=6802] Training: 68%|██████▊ | 6803/10000 [1:23:47<43:33, 1.22it/s, loss=0.0044, lr=8.23e-06, step=6802] Training: 68%|██████▊ | 6803/10000 [1:23:47<43:33, 1.22it/s, loss=0.0086, lr=8.23e-06, step=6803] Training: 68%|██████▊ | 6804/10000 [1:23:48<44:07, 1.21it/s, loss=0.0086, lr=8.23e-06, step=6803] Training: 68%|██████▊ | 6804/10000 [1:23:48<44:07, 1.21it/s, loss=0.0214, lr=8.22e-06, step=6804] Training: 68%|██████▊ | 6805/10000 [1:23:49<43:44, 1.22it/s, loss=0.0214, lr=8.22e-06, step=6804] Training: 68%|██████▊ | 6805/10000 [1:23:49<43:44, 1.22it/s, loss=0.0053, lr=8.22e-06, step=6805] Training: 68%|██████▊ | 6806/10000 [1:23:49<38:38, 1.38it/s, loss=0.0053, lr=8.22e-06, step=6805] Training: 68%|██████▊ | 6806/10000 [1:23:49<38:38, 1.38it/s, loss=0.0251, lr=8.22e-06, step=6806] Training: 68%|██████▊ | 6807/10000 [1:23:50<40:48, 1.30it/s, loss=0.0251, lr=8.22e-06, step=6806] Training: 68%|██████▊ | 6807/10000 [1:23:50<40:48, 1.30it/s, loss=0.0017, lr=8.21e-06, step=6807] Training: 68%|██████▊ | 6808/10000 [1:23:51<42:25, 1.25it/s, loss=0.0017, lr=8.21e-06, step=6807] Training: 68%|██████▊ | 6808/10000 [1:23:51<42:25, 1.25it/s, loss=0.0057, lr=8.21e-06, step=6808] Training: 68%|██████▊ | 6809/10000 [1:23:52<44:47, 1.19it/s, loss=0.0057, lr=8.21e-06, step=6808] Training: 68%|██████▊ | 6809/10000 [1:23:52<44:47, 1.19it/s, loss=0.0084, lr=8.21e-06, step=6809]17:30:00.012 [I] step=6810 loss=0.0085 smoothed_loss=0.0097 lr=8.22e-06 grad_norm=0.4539 step_time=0.6811s data_time=0.1803s it/s=1.161 eta_to_10000=2747.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0198 grad_action_out_proj=0.1296 grad_shared_expert=0.6356 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6810/10000 [1:23:53<49:12, 1.08it/s, loss=0.0084, lr=8.21e-06, step=6809] Training: 68%|██████▊ | 6810/10000 [1:23:53<49:12, 1.08it/s, loss=0.0085, lr=8.20e-06, step=6810] Training: 68%|██████▊ | 6811/10000 [1:23:54<50:09, 1.06it/s, loss=0.0085, lr=8.20e-06, step=6810] Training: 68%|██████▊ | 6811/10000 [1:23:54<50:09, 1.06it/s, loss=0.0213, lr=8.20e-06, step=6811] Training: 68%|██████▊ | 6812/10000 [1:23:55<47:38, 1.12it/s, loss=0.0213, lr=8.20e-06, step=6811] Training: 68%|██████▊ | 6812/10000 [1:23:55<47:38, 1.12it/s, loss=0.0113, lr=8.20e-06, step=6812] Training: 68%|██████▊ | 6813/10000 [1:23:56<50:06, 1.06it/s, loss=0.0113, lr=8.20e-06, step=6812] Training: 68%|██████▊ | 6813/10000 [1:23:56<50:06, 1.06it/s, loss=0.0187, lr=8.19e-06, step=6813] Training: 68%|██████▊ | 6814/10000 [1:23:56<43:22, 1.22it/s, loss=0.0187, lr=8.19e-06, step=6813] Training: 68%|██████▊ | 6814/10000 [1:23:56<43:22, 1.22it/s, loss=0.0134, lr=8.19e-06, step=6814] Training: 68%|██████▊ | 6815/10000 [1:23:58<50:45, 1.05it/s, loss=0.0134, lr=8.19e-06, step=6814] Training: 68%|██████▊ | 6815/10000 [1:23:58<50:45, 1.05it/s, loss=0.0053, lr=8.19e-06, step=6815] Training: 68%|██████▊ | 6816/10000 [1:23:59<52:21, 1.01it/s, loss=0.0053, lr=8.19e-06, step=6815] Training: 68%|██████▊ | 6816/10000 [1:23:59<52:21, 1.01it/s, loss=0.0034, lr=8.18e-06, step=6816] Training: 68%|██████▊ | 6817/10000 [1:24:00<48:43, 1.09it/s, loss=0.0034, lr=8.18e-06, step=6816] Training: 68%|██████▊ | 6817/10000 [1:24:00<48:43, 1.09it/s, loss=0.0141, lr=8.18e-06, step=6817] Training: 68%|██████▊ | 6818/10000 [1:24:00<45:56, 1.15it/s, loss=0.0141, lr=8.18e-06, step=6817] Training: 68%|██████▊ | 6818/10000 [1:24:00<45:56, 1.15it/s, loss=0.0009, lr=8.18e-06, step=6818] Training: 68%|██████▊ | 6819/10000 [1:24:01<43:32, 1.22it/s, loss=0.0009, lr=8.18e-06, step=6818] Training: 68%|██████▊ | 6819/10000 [1:24:01<43:32, 1.22it/s, loss=0.0061, lr=8.17e-06, step=6819]17:30:08.735 [I] step=6820 loss=0.0103 smoothed_loss=0.0095 lr=8.19e-06 grad_norm=0.3872 step_time=0.6497s data_time=0.2226s it/s=1.147 eta_to_10000=2773.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0052 grad_action_out_proj=0.0716 grad_shared_expert=0.3297 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6820/10000 [1:24:02<43:27, 1.22it/s, loss=0.0061, lr=8.17e-06, step=6819] Training: 68%|██████▊ | 6820/10000 [1:24:02<43:27, 1.22it/s, loss=0.0103, lr=8.17e-06, step=6820] Training: 68%|██████▊ | 6821/10000 [1:24:03<43:42, 1.21it/s, loss=0.0103, lr=8.17e-06, step=6820] Training: 68%|██████▊ | 6821/10000 [1:24:03<43:42, 1.21it/s, loss=0.0074, lr=8.17e-06, step=6821] Training: 68%|██████▊ | 6822/10000 [1:24:03<42:45, 1.24it/s, loss=0.0074, lr=8.17e-06, step=6821] Training: 68%|██████▊ | 6822/10000 [1:24:03<42:45, 1.24it/s, loss=0.0026, lr=8.16e-06, step=6822] Training: 68%|██████▊ | 6823/10000 [1:24:05<48:16, 1.10it/s, loss=0.0026, lr=8.16e-06, step=6822] Training: 68%|██████▊ | 6823/10000 [1:24:05<48:16, 1.10it/s, loss=0.0320, lr=8.16e-06, step=6823] Training: 68%|██████▊ | 6824/10000 [1:24:05<48:16, 1.10it/s, loss=0.0320, lr=8.16e-06, step=6823] Training: 68%|██████▊ | 6824/10000 [1:24:05<48:16, 1.10it/s, loss=0.0092, lr=8.16e-06, step=6824] Training: 68%|██████▊ | 6825/10000 [1:24:06<45:51, 1.15it/s, loss=0.0092, lr=8.16e-06, step=6824] Training: 68%|██████▊ | 6825/10000 [1:24:06<45:51, 1.15it/s, loss=0.0196, lr=8.16e-06, step=6825] Training: 68%|██████▊ | 6826/10000 [1:24:07<41:49, 1.26it/s, loss=0.0196, lr=8.16e-06, step=6825] Training: 68%|██████▊ | 6826/10000 [1:24:07<41:49, 1.26it/s, loss=0.0224, lr=8.15e-06, step=6826] Training: 68%|██████▊ | 6827/10000 [1:24:08<42:12, 1.25it/s, loss=0.0224, lr=8.15e-06, step=6826] Training: 68%|██████▊ | 6827/10000 [1:24:08<42:12, 1.25it/s, loss=0.0073, lr=8.15e-06, step=6827] Training: 68%|██████▊ | 6828/10000 [1:24:08<42:08, 1.25it/s, loss=0.0073, lr=8.15e-06, step=6827] Training: 68%|██████▊ | 6828/10000 [1:24:08<42:08, 1.25it/s, loss=0.0090, lr=8.15e-06, step=6828] Training: 68%|██████▊ | 6829/10000 [1:24:09<38:06, 1.39it/s, loss=0.0090, lr=8.15e-06, step=6828] Training: 68%|██████▊ | 6829/10000 [1:24:09<38:06, 1.39it/s, loss=0.0067, lr=8.14e-06, step=6829]17:30:16.772 [I] step=6830 loss=0.0032 smoothed_loss=0.0105 lr=8.15e-06 grad_norm=0.4768 step_time=0.6339s data_time=0.1698s it/s=1.244 eta_to_10000=2547.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0052 grad_action_out_proj=0.0803 grad_shared_expert=0.2802 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6830/10000 [1:24:10<39:56, 1.32it/s, loss=0.0067, lr=8.14e-06, step=6829] Training: 68%|██████▊ | 6830/10000 [1:24:10<39:56, 1.32it/s, loss=0.0032, lr=8.14e-06, step=6830] Training: 68%|██████▊ | 6831/10000 [1:24:11<40:35, 1.30it/s, loss=0.0032, lr=8.14e-06, step=6830] Training: 68%|██████▊ | 6831/10000 [1:24:11<40:35, 1.30it/s, loss=0.0108, lr=8.14e-06, step=6831] Training: 68%|██████▊ | 6832/10000 [1:24:11<36:33, 1.44it/s, loss=0.0108, lr=8.14e-06, step=6831] Training: 68%|██████▊ | 6832/10000 [1:24:11<36:33, 1.44it/s, loss=0.0017, lr=8.13e-06, step=6832] Training: 68%|██████▊ | 6833/10000 [1:24:12<34:26, 1.53it/s, loss=0.0017, lr=8.13e-06, step=6832] Training: 68%|██████▊ | 6833/10000 [1:24:12<34:26, 1.53it/s, loss=0.0028, lr=8.13e-06, step=6833] Training: 68%|██████▊ | 6834/10000 [1:24:12<35:37, 1.48it/s, loss=0.0028, lr=8.13e-06, step=6833] Training: 68%|██████▊ | 6834/10000 [1:24:12<35:37, 1.48it/s, loss=0.0032, lr=8.13e-06, step=6834] Training: 68%|██████▊ | 6835/10000 [1:24:13<33:11, 1.59it/s, loss=0.0032, lr=8.13e-06, step=6834] Training: 68%|██████▊ | 6835/10000 [1:24:13<33:11, 1.59it/s, loss=0.0039, lr=8.12e-06, step=6835] Training: 68%|██████▊ | 6836/10000 [1:24:14<37:21, 1.41it/s, loss=0.0039, lr=8.12e-06, step=6835] Training: 68%|██████▊ | 6836/10000 [1:24:14<37:21, 1.41it/s, loss=0.0011, lr=8.12e-06, step=6836] Training: 68%|██████▊ | 6837/10000 [1:24:15<39:01, 1.35it/s, loss=0.0011, lr=8.12e-06, step=6836] Training: 68%|██████▊ | 6837/10000 [1:24:15<39:01, 1.35it/s, loss=0.0241, lr=8.12e-06, step=6837] Training: 68%|██████▊ | 6838/10000 [1:24:15<35:27, 1.49it/s, loss=0.0241, lr=8.12e-06, step=6837] Training: 68%|██████▊ | 6838/10000 [1:24:15<35:27, 1.49it/s, loss=0.0134, lr=8.11e-06, step=6838] Training: 68%|██████▊ | 6839/10000 [1:24:16<32:51, 1.60it/s, loss=0.0134, lr=8.11e-06, step=6838] Training: 68%|██████▊ | 6839/10000 [1:24:16<32:51, 1.60it/s, loss=0.0042, lr=8.11e-06, step=6839]17:30:23.368 [I] step=6840 loss=0.0180 smoothed_loss=0.0098 lr=8.12e-06 grad_norm=0.4342 step_time=0.5453s data_time=0.1144s it/s=1.516 eta_to_10000=2084.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0043 grad_action_out_proj=0.0582 grad_shared_expert=0.4101 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6840/10000 [1:24:16<34:43, 1.52it/s, loss=0.0042, lr=8.11e-06, step=6839] Training: 68%|██████▊ | 6840/10000 [1:24:16<34:43, 1.52it/s, loss=0.0180, lr=8.11e-06, step=6840] Training: 68%|██████▊ | 6841/10000 [1:24:17<36:55, 1.43it/s, loss=0.0180, lr=8.11e-06, step=6840] Training: 68%|██████▊ | 6841/10000 [1:24:17<36:55, 1.43it/s, loss=0.0014, lr=8.10e-06, step=6841] Training: 68%|██████▊ | 6842/10000 [1:24:18<39:29, 1.33it/s, loss=0.0014, lr=8.10e-06, step=6841] Training: 68%|██████▊ | 6842/10000 [1:24:18<39:29, 1.33it/s, loss=0.0115, lr=8.10e-06, step=6842] Training: 68%|██████▊ | 6843/10000 [1:24:19<38:49, 1.36it/s, loss=0.0115, lr=8.10e-06, step=6842] Training: 68%|██████▊ | 6843/10000 [1:24:19<38:49, 1.36it/s, loss=0.0052, lr=8.10e-06, step=6843] Training: 68%|██████▊ | 6844/10000 [1:24:20<39:31, 1.33it/s, loss=0.0052, lr=8.10e-06, step=6843] Training: 68%|██████▊ | 6844/10000 [1:24:20<39:31, 1.33it/s, loss=0.0038, lr=8.09e-06, step=6844] Training: 68%|██████▊ | 6845/10000 [1:24:20<38:49, 1.35it/s, loss=0.0038, lr=8.09e-06, step=6844] Training: 68%|██████▊ | 6845/10000 [1:24:20<38:49, 1.35it/s, loss=0.0066, lr=8.09e-06, step=6845] Training: 68%|██████▊ | 6846/10000 [1:24:21<38:17, 1.37it/s, loss=0.0066, lr=8.09e-06, step=6845] Training: 68%|██████▊ | 6846/10000 [1:24:21<38:17, 1.37it/s, loss=0.0038, lr=8.09e-06, step=6846] Training: 68%|██████▊ | 6847/10000 [1:24:22<34:39, 1.52it/s, loss=0.0038, lr=8.09e-06, step=6846] Training: 68%|██████▊ | 6847/10000 [1:24:22<34:39, 1.52it/s, loss=0.0094, lr=8.08e-06, step=6847] Training: 68%|██████▊ | 6848/10000 [1:24:22<32:26, 1.62it/s, loss=0.0094, lr=8.08e-06, step=6847] Training: 68%|██████▊ | 6848/10000 [1:24:22<32:26, 1.62it/s, loss=0.0018, lr=8.08e-06, step=6848] Training: 68%|██████▊ | 6849/10000 [1:24:23<32:23, 1.62it/s, loss=0.0018, lr=8.08e-06, step=6848] Training: 68%|██████▊ | 6849/10000 [1:24:23<32:23, 1.62it/s, loss=0.0240, lr=8.08e-06, step=6849]17:30:30.269 [I] step=6850 loss=0.0095 smoothed_loss=0.0090 lr=8.09e-06 grad_norm=0.4118 step_time=0.5733s data_time=0.1168s it/s=1.449 eta_to_10000=2173.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0149 grad_action_out_proj=0.1398 grad_shared_expert=0.4590 (10775:train_pytorch.py:850) + Training: 68%|██████▊ | 6850/10000 [1:24:23<33:39, 1.56it/s, loss=0.0240, lr=8.08e-06, step=6849] Training: 68%|██████▊ | 6850/10000 [1:24:23<33:39, 1.56it/s, loss=0.0095, lr=8.07e-06, step=6850] Training: 69%|██████▊ | 6851/10000 [1:24:24<36:09, 1.45it/s, loss=0.0095, lr=8.07e-06, step=6850] Training: 69%|██████▊ | 6851/10000 [1:24:24<36:09, 1.45it/s, loss=0.0073, lr=8.07e-06, step=6851] Training: 69%|██████▊ | 6852/10000 [1:24:25<35:56, 1.46it/s, loss=0.0073, lr=8.07e-06, step=6851] Training: 69%|██████▊ | 6852/10000 [1:24:25<35:56, 1.46it/s, loss=0.0102, lr=8.07e-06, step=6852] Training: 69%|██████▊ | 6853/10000 [1:24:25<33:10, 1.58it/s, loss=0.0102, lr=8.07e-06, step=6852] Training: 69%|██████▊ | 6853/10000 [1:24:25<33:10, 1.58it/s, loss=0.0212, lr=8.06e-06, step=6853] Training: 69%|██████▊ | 6854/10000 [1:24:26<31:01, 1.69it/s, loss=0.0212, lr=8.06e-06, step=6853] Training: 69%|██████▊ | 6854/10000 [1:24:26<31:01, 1.69it/s, loss=0.0097, lr=8.06e-06, step=6854] Training: 69%|██████▊ | 6855/10000 [1:24:26<29:20, 1.79it/s, loss=0.0097, lr=8.06e-06, step=6854] Training: 69%|██████▊ | 6855/10000 [1:24:26<29:20, 1.79it/s, loss=0.0027, lr=8.06e-06, step=6855] Training: 69%|██████▊ | 6856/10000 [1:24:27<28:16, 1.85it/s, loss=0.0027, lr=8.06e-06, step=6855] Training: 69%|██████▊ | 6856/10000 [1:24:27<28:16, 1.85it/s, loss=0.0452, lr=8.06e-06, step=6856] Training: 69%|██████▊ | 6857/10000 [1:24:27<29:57, 1.75it/s, loss=0.0452, lr=8.06e-06, step=6856] Training: 69%|██████▊ | 6857/10000 [1:24:27<29:57, 1.75it/s, loss=0.0026, lr=8.05e-06, step=6857] Training: 69%|██████▊ | 6858/10000 [1:24:28<35:18, 1.48it/s, loss=0.0026, lr=8.05e-06, step=6857] Training: 69%|██████▊ | 6858/10000 [1:24:28<35:18, 1.48it/s, loss=0.0076, lr=8.05e-06, step=6858] Training: 69%|██████▊ | 6859/10000 [1:24:29<38:38, 1.35it/s, loss=0.0076, lr=8.05e-06, step=6858] Training: 69%|██████▊ | 6859/10000 [1:24:29<38:38, 1.35it/s, loss=0.0015, lr=8.05e-06, step=6859]17:30:37.031 [I] step=6860 loss=0.0100 smoothed_loss=0.0105 lr=8.06e-06 grad_norm=0.4583 step_time=0.5775s data_time=0.0987s it/s=1.479 eta_to_10000=2123.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0044 grad_action_out_proj=0.0603 grad_shared_expert=0.2797 (10775:train_pytorch.py:850) + Training: 69%|██████▊ | 6860/10000 [1:24:30<40:27, 1.29it/s, loss=0.0015, lr=8.05e-06, step=6859] Training: 69%|██████▊ | 6860/10000 [1:24:30<40:27, 1.29it/s, loss=0.0100, lr=8.04e-06, step=6860] Training: 69%|██████▊ | 6861/10000 [1:24:31<37:50, 1.38it/s, loss=0.0100, lr=8.04e-06, step=6860] Training: 69%|██████▊ | 6861/10000 [1:24:31<37:50, 1.38it/s, loss=0.0062, lr=8.04e-06, step=6861] Training: 69%|██████▊ | 6862/10000 [1:24:32<39:59, 1.31it/s, loss=0.0062, lr=8.04e-06, step=6861] Training: 69%|██████▊ | 6862/10000 [1:24:32<39:59, 1.31it/s, loss=0.0022, lr=8.04e-06, step=6862] Training: 69%|██████▊ | 6863/10000 [1:24:32<41:51, 1.25it/s, loss=0.0022, lr=8.04e-06, step=6862] Training: 69%|██████▊ | 6863/10000 [1:24:32<41:51, 1.25it/s, loss=0.0488, lr=8.03e-06, step=6863] Training: 69%|██████▊ | 6864/10000 [1:24:34<45:53, 1.14it/s, loss=0.0488, lr=8.03e-06, step=6863] Training: 69%|██████▊ | 6864/10000 [1:24:34<45:53, 1.14it/s, loss=0.0043, lr=8.03e-06, step=6864] Training: 69%|██████▊ | 6865/10000 [1:24:34<43:14, 1.21it/s, loss=0.0043, lr=8.03e-06, step=6864] Training: 69%|██████▊ | 6865/10000 [1:24:34<43:14, 1.21it/s, loss=0.0009, lr=8.03e-06, step=6865] Training: 69%|██████▊ | 6866/10000 [1:24:35<46:24, 1.13it/s, loss=0.0009, lr=8.03e-06, step=6865] Training: 69%|██████▊ | 6866/10000 [1:24:35<46:24, 1.13it/s, loss=0.0027, lr=8.02e-06, step=6866] Training: 69%|██████▊ | 6867/10000 [1:24:36<45:53, 1.14it/s, loss=0.0027, lr=8.02e-06, step=6866] Training: 69%|██████▊ | 6867/10000 [1:24:36<45:53, 1.14it/s, loss=0.0053, lr=8.02e-06, step=6867] Training: 69%|██████▊ | 6868/10000 [1:24:37<45:49, 1.14it/s, loss=0.0053, lr=8.02e-06, step=6867] Training: 69%|██████▊ | 6868/10000 [1:24:37<45:49, 1.14it/s, loss=0.0035, lr=8.02e-06, step=6868] Training: 69%|██████▊ | 6869/10000 [1:24:38<45:04, 1.16it/s, loss=0.0035, lr=8.02e-06, step=6868] Training: 69%|██████▊ | 6869/10000 [1:24:38<45:04, 1.16it/s, loss=0.0022, lr=8.01e-06, step=6869]17:30:45.698 [I] step=6870 loss=0.0052 smoothed_loss=0.0081 lr=8.02e-06 grad_norm=0.3871 step_time=0.6651s data_time=0.2016s it/s=1.154 eta_to_10000=2712.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0205 grad_action_out_proj=0.1127 grad_shared_expert=0.3264 (10775:train_pytorch.py:850) + Training: 69%|██████▊ | 6870/10000 [1:24:39<46:26, 1.12it/s, loss=0.0022, lr=8.01e-06, step=6869] Training: 69%|██████▊ | 6870/10000 [1:24:39<46:26, 1.12it/s, loss=0.0052, lr=8.01e-06, step=6870] Training: 69%|██████▊ | 6871/10000 [1:24:40<46:59, 1.11it/s, loss=0.0052, lr=8.01e-06, step=6870] Training: 69%|██████▊ | 6871/10000 [1:24:40<46:59, 1.11it/s, loss=0.0021, lr=8.01e-06, step=6871] Training: 69%|██████▊ | 6872/10000 [1:24:41<46:05, 1.13it/s, loss=0.0021, lr=8.01e-06, step=6871] Training: 69%|██████▊ | 6872/10000 [1:24:41<46:05, 1.13it/s, loss=0.0018, lr=8.00e-06, step=6872] Training: 69%|██████▊ | 6873/10000 [1:24:42<49:53, 1.04it/s, loss=0.0018, lr=8.00e-06, step=6872] Training: 69%|██████▊ | 6873/10000 [1:24:42<49:53, 1.04it/s, loss=0.0367, lr=8.00e-06, step=6873] Training: 69%|██████▊ | 6874/10000 [1:24:42<42:47, 1.22it/s, loss=0.0367, lr=8.00e-06, step=6873] Training: 69%|██████▊ | 6874/10000 [1:24:42<42:47, 1.22it/s, loss=0.0798, lr=8.00e-06, step=6874] Training: 69%|██████▉ | 6875/10000 [1:24:43<38:40, 1.35it/s, loss=0.0798, lr=8.00e-06, step=6874] Training: 69%|██████▉ | 6875/10000 [1:24:43<38:40, 1.35it/s, loss=0.0100, lr=7.99e-06, step=6875] Training: 69%|██████▉ | 6876/10000 [1:24:43<36:57, 1.41it/s, loss=0.0100, lr=7.99e-06, step=6875] Training: 69%|██████▉ | 6876/10000 [1:24:43<36:57, 1.41it/s, loss=0.0276, lr=7.99e-06, step=6876] Training: 69%|██████▉ | 6877/10000 [1:24:44<34:26, 1.51it/s, loss=0.0276, lr=7.99e-06, step=6876] Training: 69%|██████▉ | 6877/10000 [1:24:44<34:26, 1.51it/s, loss=0.0018, lr=7.99e-06, step=6877] Training: 69%|██████▉ | 6878/10000 [1:24:45<35:14, 1.48it/s, loss=0.0018, lr=7.99e-06, step=6877] Training: 69%|██████▉ | 6878/10000 [1:24:45<35:14, 1.48it/s, loss=0.0028, lr=7.98e-06, step=6878] Training: 69%|██████▉ | 6879/10000 [1:24:45<38:00, 1.37it/s, loss=0.0028, lr=7.98e-06, step=6878] Training: 69%|██████▉ | 6879/10000 [1:24:45<38:00, 1.37it/s, loss=0.0243, lr=7.98e-06, step=6879]17:30:53.689 [I] step=6880 loss=0.0059 smoothed_loss=0.0145 lr=7.99e-06 grad_norm=0.4923 step_time=0.6473s data_time=0.1518s it/s=1.252 eta_to_10000=2492.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0192 grad_action_out_proj=0.1207 grad_shared_expert=0.5083 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6880/10000 [1:24:47<46:30, 1.12it/s, loss=0.0243, lr=7.98e-06, step=6879] Training: 69%|██████▉ | 6880/10000 [1:24:47<46:30, 1.12it/s, loss=0.0059, lr=7.98e-06, step=6880] Training: 69%|██████▉ | 6881/10000 [1:24:48<46:33, 1.12it/s, loss=0.0059, lr=7.98e-06, step=6880] Training: 69%|██████▉ | 6881/10000 [1:24:48<46:33, 1.12it/s, loss=0.0045, lr=7.98e-06, step=6881] Training: 69%|██████▉ | 6882/10000 [1:24:49<49:39, 1.05it/s, loss=0.0045, lr=7.98e-06, step=6881] Training: 69%|██████▉ | 6882/10000 [1:24:49<49:39, 1.05it/s, loss=0.0109, lr=7.97e-06, step=6882] Training: 69%|██████▉ | 6883/10000 [1:24:50<50:58, 1.02it/s, loss=0.0109, lr=7.97e-06, step=6882] Training: 69%|██████▉ | 6883/10000 [1:24:50<50:58, 1.02it/s, loss=0.0043, lr=7.97e-06, step=6883] Training: 69%|██████▉ | 6884/10000 [1:24:51<47:50, 1.09it/s, loss=0.0043, lr=7.97e-06, step=6883] Training: 69%|██████▉ | 6884/10000 [1:24:51<47:50, 1.09it/s, loss=0.0128, lr=7.97e-06, step=6884] Training: 69%|██████▉ | 6885/10000 [1:24:52<47:58, 1.08it/s, loss=0.0128, lr=7.97e-06, step=6884] Training: 69%|██████▉ | 6885/10000 [1:24:52<47:58, 1.08it/s, loss=0.0030, lr=7.96e-06, step=6885] Training: 69%|██████▉ | 6886/10000 [1:24:52<46:27, 1.12it/s, loss=0.0030, lr=7.96e-06, step=6885] Training: 69%|██████▉ | 6886/10000 [1:24:52<46:27, 1.12it/s, loss=0.0016, lr=7.96e-06, step=6886] Training: 69%|██████▉ | 6887/10000 [1:24:53<46:48, 1.11it/s, loss=0.0016, lr=7.96e-06, step=6886] Training: 69%|██████▉ | 6887/10000 [1:24:53<46:48, 1.11it/s, loss=0.0020, lr=7.96e-06, step=6887] Training: 69%|██████▉ | 6888/10000 [1:24:54<41:14, 1.26it/s, loss=0.0020, lr=7.96e-06, step=6887] Training: 69%|██████▉ | 6888/10000 [1:24:54<41:14, 1.26it/s, loss=0.0015, lr=7.95e-06, step=6888] Training: 69%|██████▉ | 6889/10000 [1:24:55<43:44, 1.19it/s, loss=0.0015, lr=7.95e-06, step=6888] Training: 69%|██████▉ | 6889/10000 [1:24:55<43:44, 1.19it/s, loss=0.0096, lr=7.95e-06, step=6889]17:31:02.621 [I] step=6890 loss=0.0433 smoothed_loss=0.0123 lr=7.96e-06 grad_norm=0.4442 step_time=0.6525s data_time=0.2406s it/s=1.120 eta_to_10000=2777.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0135 grad_action_out_proj=0.1162 grad_shared_expert=0.5382 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6890/10000 [1:24:56<45:09, 1.15it/s, loss=0.0096, lr=7.95e-06, step=6889] Training: 69%|██████▉ | 6890/10000 [1:24:56<45:09, 1.15it/s, loss=0.0433, lr=7.95e-06, step=6890] Training: 69%|██████▉ | 6891/10000 [1:24:56<40:27, 1.28it/s, loss=0.0433, lr=7.95e-06, step=6890] Training: 69%|██████▉ | 6891/10000 [1:24:56<40:27, 1.28it/s, loss=0.0011, lr=7.94e-06, step=6891] Training: 69%|██████▉ | 6892/10000 [1:24:57<40:59, 1.26it/s, loss=0.0011, lr=7.94e-06, step=6891] Training: 69%|██████▉ | 6892/10000 [1:24:57<40:59, 1.26it/s, loss=0.0093, lr=7.94e-06, step=6892] Training: 69%|██████▉ | 6893/10000 [1:24:58<36:17, 1.43it/s, loss=0.0093, lr=7.94e-06, step=6892] Training: 69%|██████▉ | 6893/10000 [1:24:58<36:17, 1.43it/s, loss=0.0056, lr=7.94e-06, step=6893] Training: 69%|██████▉ | 6894/10000 [1:24:58<38:07, 1.36it/s, loss=0.0056, lr=7.94e-06, step=6893] Training: 69%|██████▉ | 6894/10000 [1:24:58<38:07, 1.36it/s, loss=0.0103, lr=7.93e-06, step=6894] Training: 69%|██████▉ | 6895/10000 [1:24:59<39:18, 1.32it/s, loss=0.0103, lr=7.93e-06, step=6894] Training: 69%|██████▉ | 6895/10000 [1:24:59<39:18, 1.32it/s, loss=0.0029, lr=7.93e-06, step=6895] Training: 69%|██████▉ | 6896/10000 [1:25:00<42:03, 1.23it/s, loss=0.0029, lr=7.93e-06, step=6895] Training: 69%|██████▉ | 6896/10000 [1:25:00<42:03, 1.23it/s, loss=0.0017, lr=7.93e-06, step=6896] Training: 69%|██████▉ | 6897/10000 [1:25:01<43:37, 1.19it/s, loss=0.0017, lr=7.93e-06, step=6896] Training: 69%|██████▉ | 6897/10000 [1:25:01<43:37, 1.19it/s, loss=0.0061, lr=7.92e-06, step=6897] Training: 69%|██████▉ | 6898/10000 [1:25:02<38:36, 1.34it/s, loss=0.0061, lr=7.92e-06, step=6897] Training: 69%|██████▉ | 6898/10000 [1:25:02<38:36, 1.34it/s, loss=0.0029, lr=7.92e-06, step=6898] Training: 69%|██████▉ | 6899/10000 [1:25:02<39:27, 1.31it/s, loss=0.0029, lr=7.92e-06, step=6898] Training: 69%|██████▉ | 6899/10000 [1:25:02<39:27, 1.31it/s, loss=0.0109, lr=7.92e-06, step=6899]17:31:10.336 [I] step=6900 loss=0.0174 smoothed_loss=0.0093 lr=7.93e-06 grad_norm=0.4496 step_time=0.6218s data_time=0.1497s it/s=1.296 eta_to_10000=2391.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0109 grad_action_out_proj=0.1300 grad_shared_expert=0.6347 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6900/10000 [1:25:03<43:36, 1.18it/s, loss=0.0109, lr=7.92e-06, step=6899] Training: 69%|██████▉ | 6900/10000 [1:25:03<43:36, 1.18it/s, loss=0.0174, lr=7.91e-06, step=6900] Training: 69%|██████▉ | 6901/10000 [1:25:05<47:31, 1.09it/s, loss=0.0174, lr=7.91e-06, step=6900] Training: 69%|██████▉ | 6901/10000 [1:25:05<47:31, 1.09it/s, loss=0.0017, lr=7.91e-06, step=6901] Training: 69%|██████▉ | 6902/10000 [1:25:05<41:28, 1.25it/s, loss=0.0017, lr=7.91e-06, step=6901] Training: 69%|██████▉ | 6902/10000 [1:25:05<41:28, 1.25it/s, loss=0.0067, lr=7.91e-06, step=6902] Training: 69%|██████▉ | 6903/10000 [1:25:06<43:48, 1.18it/s, loss=0.0067, lr=7.91e-06, step=6902] Training: 69%|██████▉ | 6903/10000 [1:25:06<43:48, 1.18it/s, loss=0.0067, lr=7.91e-06, step=6903] Training: 69%|██████▉ | 6904/10000 [1:25:07<44:06, 1.17it/s, loss=0.0067, lr=7.91e-06, step=6903] Training: 69%|██████▉ | 6904/10000 [1:25:07<44:06, 1.17it/s, loss=0.0056, lr=7.90e-06, step=6904] Training: 69%|██████▉ | 6905/10000 [1:25:08<43:30, 1.19it/s, loss=0.0056, lr=7.90e-06, step=6904] Training: 69%|██████▉ | 6905/10000 [1:25:08<43:30, 1.19it/s, loss=0.0157, lr=7.90e-06, step=6905] Training: 69%|██████▉ | 6906/10000 [1:25:08<42:39, 1.21it/s, loss=0.0157, lr=7.90e-06, step=6905] Training: 69%|██████▉ | 6906/10000 [1:25:08<42:39, 1.21it/s, loss=0.0118, lr=7.90e-06, step=6906] Training: 69%|██████▉ | 6907/10000 [1:25:10<46:06, 1.12it/s, loss=0.0118, lr=7.90e-06, step=6906] Training: 69%|██████▉ | 6907/10000 [1:25:10<46:06, 1.12it/s, loss=0.1390, lr=7.89e-06, step=6907] Training: 69%|██████▉ | 6908/10000 [1:25:10<42:43, 1.21it/s, loss=0.1390, lr=7.89e-06, step=6907] Training: 69%|██████▉ | 6908/10000 [1:25:10<42:43, 1.21it/s, loss=0.0016, lr=7.89e-06, step=6908] Training: 69%|██████▉ | 6909/10000 [1:25:11<46:08, 1.12it/s, loss=0.0016, lr=7.89e-06, step=6908] Training: 69%|██████▉ | 6909/10000 [1:25:11<46:08, 1.12it/s, loss=0.0229, lr=7.89e-06, step=6909]17:31:18.898 [I] step=6910 loss=0.0018 smoothed_loss=0.0184 lr=7.90e-06 grad_norm=0.5174 step_time=0.6664s data_time=0.1897s it/s=1.168 eta_to_10000=2645.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0094 grad_action_out_proj=0.0799 grad_shared_expert=0.5407 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6910/10000 [1:25:12<43:28, 1.18it/s, loss=0.0229, lr=7.89e-06, step=6909] Training: 69%|██████▉ | 6910/10000 [1:25:12<43:28, 1.18it/s, loss=0.0018, lr=7.88e-06, step=6910] Training: 69%|██████▉ | 6911/10000 [1:25:13<39:14, 1.31it/s, loss=0.0018, lr=7.88e-06, step=6910] Training: 69%|██████▉ | 6911/10000 [1:25:13<39:14, 1.31it/s, loss=0.0051, lr=7.88e-06, step=6911] Training: 69%|██████▉ | 6912/10000 [1:25:13<35:50, 1.44it/s, loss=0.0051, lr=7.88e-06, step=6911] Training: 69%|██████▉ | 6912/10000 [1:25:13<35:50, 1.44it/s, loss=0.0019, lr=7.88e-06, step=6912] Training: 69%|██████▉ | 6913/10000 [1:25:14<36:25, 1.41it/s, loss=0.0019, lr=7.88e-06, step=6912] Training: 69%|██████▉ | 6913/10000 [1:25:14<36:25, 1.41it/s, loss=0.0088, lr=7.87e-06, step=6913] Training: 69%|██████▉ | 6914/10000 [1:25:14<33:36, 1.53it/s, loss=0.0088, lr=7.87e-06, step=6913] Training: 69%|██████▉ | 6914/10000 [1:25:14<33:36, 1.53it/s, loss=0.0140, lr=7.87e-06, step=6914] Training: 69%|██████▉ | 6915/10000 [1:25:15<34:54, 1.47it/s, loss=0.0140, lr=7.87e-06, step=6914] Training: 69%|██████▉ | 6915/10000 [1:25:15<34:54, 1.47it/s, loss=0.0031, lr=7.87e-06, step=6915] Training: 69%|██████▉ | 6916/10000 [1:25:16<43:06, 1.19it/s, loss=0.0031, lr=7.87e-06, step=6915] Training: 69%|██████▉ | 6916/10000 [1:25:16<43:06, 1.19it/s, loss=0.0369, lr=7.86e-06, step=6916] Training: 69%|██████▉ | 6917/10000 [1:25:17<42:34, 1.21it/s, loss=0.0369, lr=7.86e-06, step=6916] Training: 69%|██████▉ | 6917/10000 [1:25:17<42:34, 1.21it/s, loss=0.0246, lr=7.86e-06, step=6917] Training: 69%|██████▉ | 6918/10000 [1:25:18<37:53, 1.36it/s, loss=0.0246, lr=7.86e-06, step=6917] Training: 69%|██████▉ | 6918/10000 [1:25:18<37:53, 1.36it/s, loss=0.0137, lr=7.86e-06, step=6918] Training: 69%|██████▉ | 6919/10000 [1:25:18<36:52, 1.39it/s, loss=0.0137, lr=7.86e-06, step=6918] Training: 69%|██████▉ | 6919/10000 [1:25:18<36:52, 1.39it/s, loss=0.0211, lr=7.85e-06, step=6919]17:31:25.770 [I] step=6920 loss=0.0016 smoothed_loss=0.0154 lr=7.87e-06 grad_norm=0.4622 step_time=0.5673s data_time=0.1199s it/s=1.455 eta_to_10000=2116.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0228 grad_action_out_proj=0.1140 grad_shared_expert=0.3810 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6920/10000 [1:25:19<34:10, 1.50it/s, loss=0.0211, lr=7.85e-06, step=6919] Training: 69%|██████▉ | 6920/10000 [1:25:19<34:10, 1.50it/s, loss=0.0016, lr=7.85e-06, step=6920] Training: 69%|██████▉ | 6921/10000 [1:25:19<32:56, 1.56it/s, loss=0.0016, lr=7.85e-06, step=6920] Training: 69%|██████▉ | 6921/10000 [1:25:19<32:56, 1.56it/s, loss=0.0133, lr=7.85e-06, step=6921] Training: 69%|██████▉ | 6922/10000 [1:25:20<38:03, 1.35it/s, loss=0.0133, lr=7.85e-06, step=6921] Training: 69%|██████▉ | 6922/10000 [1:25:20<38:03, 1.35it/s, loss=0.0091, lr=7.84e-06, step=6922] Training: 69%|██████▉ | 6923/10000 [1:25:21<43:28, 1.18it/s, loss=0.0091, lr=7.84e-06, step=6922] Training: 69%|██████▉ | 6923/10000 [1:25:21<43:28, 1.18it/s, loss=0.0015, lr=7.84e-06, step=6923] Training: 69%|██████▉ | 6924/10000 [1:25:22<43:14, 1.19it/s, loss=0.0015, lr=7.84e-06, step=6923] Training: 69%|██████▉ | 6924/10000 [1:25:22<43:14, 1.19it/s, loss=0.0120, lr=7.84e-06, step=6924] Training: 69%|██████▉ | 6925/10000 [1:25:23<38:31, 1.33it/s, loss=0.0120, lr=7.84e-06, step=6924] Training: 69%|██████▉ | 6925/10000 [1:25:23<38:31, 1.33it/s, loss=0.0270, lr=7.84e-06, step=6925] Training: 69%|██████▉ | 6926/10000 [1:25:24<40:08, 1.28it/s, loss=0.0270, lr=7.84e-06, step=6925] Training: 69%|██████▉ | 6926/10000 [1:25:24<40:08, 1.28it/s, loss=0.0067, lr=7.83e-06, step=6926] Training: 69%|██████▉ | 6927/10000 [1:25:24<35:33, 1.44it/s, loss=0.0067, lr=7.83e-06, step=6926] Training: 69%|██████▉ | 6927/10000 [1:25:24<35:33, 1.44it/s, loss=0.0212, lr=7.83e-06, step=6927] Training: 69%|██████▉ | 6928/10000 [1:25:25<35:52, 1.43it/s, loss=0.0212, lr=7.83e-06, step=6927] Training: 69%|██████▉ | 6928/10000 [1:25:25<35:52, 1.43it/s, loss=0.0125, lr=7.83e-06, step=6928] Training: 69%|██████▉ | 6929/10000 [1:25:26<36:45, 1.39it/s, loss=0.0125, lr=7.83e-06, step=6928] Training: 69%|██████▉ | 6929/10000 [1:25:26<36:45, 1.39it/s, loss=0.0535, lr=7.82e-06, step=6929]17:31:33.435 [I] step=6930 loss=0.0035 smoothed_loss=0.0168 lr=7.83e-06 grad_norm=0.5384 step_time=0.6173s data_time=0.1492s it/s=1.305 eta_to_10000=2352.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0174 grad_action_out_proj=0.1377 grad_shared_expert=0.4515 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6930/10000 [1:25:27<38:18, 1.34it/s, loss=0.0535, lr=7.82e-06, step=6929] Training: 69%|██████▉ | 6930/10000 [1:25:27<38:18, 1.34it/s, loss=0.0035, lr=7.82e-06, step=6930] Training: 69%|██████▉ | 6931/10000 [1:25:27<38:24, 1.33it/s, loss=0.0035, lr=7.82e-06, step=6930] Training: 69%|██████▉ | 6931/10000 [1:25:27<38:24, 1.33it/s, loss=0.0181, lr=7.82e-06, step=6931] Training: 69%|██████▉ | 6932/10000 [1:25:28<36:11, 1.41it/s, loss=0.0181, lr=7.82e-06, step=6931] Training: 69%|██████▉ | 6932/10000 [1:25:28<36:11, 1.41it/s, loss=0.0060, lr=7.81e-06, step=6932] Training: 69%|██████▉ | 6933/10000 [1:25:29<39:15, 1.30it/s, loss=0.0060, lr=7.81e-06, step=6932] Training: 69%|██████▉ | 6933/10000 [1:25:29<39:15, 1.30it/s, loss=0.0116, lr=7.81e-06, step=6933] Training: 69%|██████▉ | 6934/10000 [1:25:30<41:07, 1.24it/s, loss=0.0116, lr=7.81e-06, step=6933] Training: 69%|██████▉ | 6934/10000 [1:25:30<41:07, 1.24it/s, loss=0.0158, lr=7.81e-06, step=6934] Training: 69%|██████▉ | 6935/10000 [1:25:31<42:13, 1.21it/s, loss=0.0158, lr=7.81e-06, step=6934] Training: 69%|██████▉ | 6935/10000 [1:25:31<42:13, 1.21it/s, loss=0.0304, lr=7.80e-06, step=6935] Training: 69%|██████▉ | 6936/10000 [1:25:31<41:09, 1.24it/s, loss=0.0304, lr=7.80e-06, step=6935] Training: 69%|██████▉ | 6936/10000 [1:25:31<41:09, 1.24it/s, loss=0.0026, lr=7.80e-06, step=6936] Training: 69%|██████▉ | 6937/10000 [1:25:32<45:09, 1.13it/s, loss=0.0026, lr=7.80e-06, step=6936] Training: 69%|██████▉ | 6937/10000 [1:25:32<45:09, 1.13it/s, loss=0.0205, lr=7.80e-06, step=6937] Training: 69%|██████▉ | 6938/10000 [1:25:33<40:05, 1.27it/s, loss=0.0205, lr=7.80e-06, step=6937] Training: 69%|██████▉ | 6938/10000 [1:25:33<40:05, 1.27it/s, loss=0.0291, lr=7.79e-06, step=6938] Training: 69%|██████▉ | 6939/10000 [1:25:33<36:25, 1.40it/s, loss=0.0291, lr=7.79e-06, step=6938] Training: 69%|██████▉ | 6939/10000 [1:25:33<36:25, 1.40it/s, loss=0.0045, lr=7.79e-06, step=6939]17:31:41.211 [I] step=6940 loss=0.0095 smoothed_loss=0.0154 lr=7.80e-06 grad_norm=0.5030 step_time=0.6172s data_time=0.1604s it/s=1.286 eta_to_10000=2379.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.1010 grad_shared_expert=0.3813 (10775:train_pytorch.py:850) + Training: 69%|██████▉ | 6940/10000 [1:25:34<37:52, 1.35it/s, loss=0.0045, lr=7.79e-06, step=6939] Training: 69%|██████▉ | 6940/10000 [1:25:34<37:52, 1.35it/s, loss=0.0095, lr=7.79e-06, step=6940] Training: 69%|██████▉ | 6941/10000 [1:25:35<36:01, 1.42it/s, loss=0.0095, lr=7.79e-06, step=6940] Training: 69%|██████▉ | 6941/10000 [1:25:35<36:01, 1.42it/s, loss=0.0079, lr=7.78e-06, step=6941] Training: 69%|██████▉ | 6942/10000 [1:25:36<39:32, 1.29it/s, loss=0.0079, lr=7.78e-06, step=6941] Training: 69%|██████▉ | 6942/10000 [1:25:36<39:32, 1.29it/s, loss=0.0038, lr=7.78e-06, step=6942] Training: 69%|██████▉ | 6943/10000 [1:25:37<43:59, 1.16it/s, loss=0.0038, lr=7.78e-06, step=6942] Training: 69%|██████▉ | 6943/10000 [1:25:37<43:59, 1.16it/s, loss=0.0300, lr=7.78e-06, step=6943] Training: 69%|██████▉ | 6944/10000 [1:25:38<42:59, 1.18it/s, loss=0.0300, lr=7.78e-06, step=6943] Training: 69%|██████▉ | 6944/10000 [1:25:38<42:59, 1.18it/s, loss=0.0046, lr=7.78e-06, step=6944] Training: 69%|██████▉ | 6945/10000 [1:25:39<42:34, 1.20it/s, loss=0.0046, lr=7.78e-06, step=6944] Training: 69%|██████▉ | 6945/10000 [1:25:39<42:34, 1.20it/s, loss=0.0140, lr=7.77e-06, step=6945] Training: 69%|██████▉ | 6946/10000 [1:25:39<39:57, 1.27it/s, loss=0.0140, lr=7.77e-06, step=6945] Training: 69%|██████▉ | 6946/10000 [1:25:39<39:57, 1.27it/s, loss=0.0077, lr=7.77e-06, step=6946] Training: 69%|██████▉ | 6947/10000 [1:25:40<41:14, 1.23it/s, loss=0.0077, lr=7.77e-06, step=6946] Training: 69%|██████▉ | 6947/10000 [1:25:40<41:14, 1.23it/s, loss=0.0333, lr=7.77e-06, step=6947] Training: 69%|██████▉ | 6948/10000 [1:25:41<42:26, 1.20it/s, loss=0.0333, lr=7.77e-06, step=6947] Training: 69%|██████▉ | 6948/10000 [1:25:41<42:26, 1.20it/s, loss=0.0014, lr=7.76e-06, step=6948] Training: 69%|██████▉ | 6949/10000 [1:25:42<40:26, 1.26it/s, loss=0.0014, lr=7.76e-06, step=6948] Training: 69%|██████▉ | 6949/10000 [1:25:42<40:26, 1.26it/s, loss=0.0045, lr=7.76e-06, step=6949]17:31:49.313 [I] step=6950 loss=0.0061 smoothed_loss=0.0124 lr=7.77e-06 grad_norm=0.5494 step_time=0.6446s data_time=0.1656s it/s=1.234 eta_to_10000=2470.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0228 grad_action_out_proj=0.1314 grad_shared_expert=0.4266 (10775:train_pytorch.py:850) + Training: 70%|██████▉ | 6950/10000 [1:25:42<39:24, 1.29it/s, loss=0.0045, lr=7.76e-06, step=6949] Training: 70%|██████▉ | 6950/10000 [1:25:42<39:24, 1.29it/s, loss=0.0061, lr=7.76e-06, step=6950] Training: 70%|██████▉ | 6951/10000 [1:25:43<37:12, 1.37it/s, loss=0.0061, lr=7.76e-06, step=6950] Training: 70%|██████▉ | 6951/10000 [1:25:43<37:12, 1.37it/s, loss=0.0096, lr=7.75e-06, step=6951] Training: 70%|██████▉ | 6952/10000 [1:25:44<37:42, 1.35it/s, loss=0.0096, lr=7.75e-06, step=6951] Training: 70%|██████▉ | 6952/10000 [1:25:44<37:42, 1.35it/s, loss=0.0089, lr=7.75e-06, step=6952] Training: 70%|██████▉ | 6953/10000 [1:25:44<33:56, 1.50it/s, loss=0.0089, lr=7.75e-06, step=6952] Training: 70%|██████▉ | 6953/10000 [1:25:44<33:56, 1.50it/s, loss=0.0026, lr=7.75e-06, step=6953] Training: 70%|██████▉ | 6954/10000 [1:25:45<32:26, 1.56it/s, loss=0.0026, lr=7.75e-06, step=6953] Training: 70%|██████▉ | 6954/10000 [1:25:45<32:26, 1.56it/s, loss=0.0043, lr=7.74e-06, step=6954] Training: 70%|██████▉ | 6955/10000 [1:25:45<30:16, 1.68it/s, loss=0.0043, lr=7.74e-06, step=6954] Training: 70%|██████▉ | 6955/10000 [1:25:45<30:16, 1.68it/s, loss=0.0088, lr=7.74e-06, step=6955] Training: 70%|██████▉ | 6956/10000 [1:25:46<31:58, 1.59it/s, loss=0.0088, lr=7.74e-06, step=6955] Training: 70%|██████▉ | 6956/10000 [1:25:46<31:58, 1.59it/s, loss=0.0653, lr=7.74e-06, step=6956] Training: 70%|██████▉ | 6957/10000 [1:25:47<31:48, 1.59it/s, loss=0.0653, lr=7.74e-06, step=6956] Training: 70%|██████▉ | 6957/10000 [1:25:47<31:48, 1.59it/s, loss=0.0317, lr=7.73e-06, step=6957] Training: 70%|██████▉ | 6958/10000 [1:25:47<32:11, 1.58it/s, loss=0.0317, lr=7.73e-06, step=6957] Training: 70%|██████▉ | 6958/10000 [1:25:47<32:11, 1.58it/s, loss=0.0297, lr=7.73e-06, step=6958] Training: 70%|██████▉ | 6959/10000 [1:25:48<36:13, 1.40it/s, loss=0.0297, lr=7.73e-06, step=6958] Training: 70%|██████▉ | 6959/10000 [1:25:48<36:13, 1.40it/s, loss=0.0013, lr=7.73e-06, step=6959]17:31:55.699 [I] step=6960 loss=0.0448 smoothed_loss=0.0195 lr=7.74e-06 grad_norm=0.4732 step_time=0.5529s data_time=0.0857s it/s=1.566 eta_to_10000=1941.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0372 grad_action_out_proj=0.1561 grad_shared_expert=0.6129 (10775:train_pytorch.py:850) + Training: 70%|██████▉ | 6960/10000 [1:25:49<33:35, 1.51it/s, loss=0.0013, lr=7.73e-06, step=6959] Training: 70%|██████▉ | 6960/10000 [1:25:49<33:35, 1.51it/s, loss=0.0448, lr=7.73e-06, step=6960] Training: 70%|██████▉ | 6961/10000 [1:25:50<36:20, 1.39it/s, loss=0.0448, lr=7.73e-06, step=6960] Training: 70%|██████▉ | 6961/10000 [1:25:50<36:20, 1.39it/s, loss=0.0017, lr=7.72e-06, step=6961] Training: 70%|██████▉ | 6962/10000 [1:25:51<44:45, 1.13it/s, loss=0.0017, lr=7.72e-06, step=6961] Training: 70%|██████▉ | 6962/10000 [1:25:51<44:45, 1.13it/s, loss=0.0013, lr=7.72e-06, step=6962] Training: 70%|██████▉ | 6963/10000 [1:25:52<46:50, 1.08it/s, loss=0.0013, lr=7.72e-06, step=6962] Training: 70%|██████▉ | 6963/10000 [1:25:52<46:50, 1.08it/s, loss=0.0073, lr=7.72e-06, step=6963] Training: 70%|██████▉ | 6964/10000 [1:25:53<51:35, 1.02s/it, loss=0.0073, lr=7.72e-06, step=6963] Training: 70%|██████▉ | 6964/10000 [1:25:53<51:35, 1.02s/it, loss=0.0034, lr=7.71e-06, step=6964] Training: 70%|██████▉ | 6965/10000 [1:25:54<51:27, 1.02s/it, loss=0.0034, lr=7.71e-06, step=6964] Training: 70%|██████▉ | 6965/10000 [1:25:54<51:27, 1.02s/it, loss=0.0043, lr=7.71e-06, step=6965] Training: 70%|██████▉ | 6966/10000 [1:25:55<56:10, 1.11s/it, loss=0.0043, lr=7.71e-06, step=6965] Training: 70%|██████▉ | 6966/10000 [1:25:55<56:10, 1.11s/it, loss=0.0073, lr=7.71e-06, step=6966] Training: 70%|██████▉ | 6967/10000 [1:25:57<55:07, 1.09s/it, loss=0.0073, lr=7.71e-06, step=6966] Training: 70%|██████▉ | 6967/10000 [1:25:57<55:07, 1.09s/it, loss=0.0078, lr=7.70e-06, step=6967] Training: 70%|██████▉ | 6968/10000 [1:25:58<55:35, 1.10s/it, loss=0.0078, lr=7.70e-06, step=6967] Training: 70%|██████▉ | 6968/10000 [1:25:58<55:35, 1.10s/it, loss=0.0024, lr=7.70e-06, step=6968] Training: 70%|██████▉ | 6969/10000 [1:25:59<55:07, 1.09s/it, loss=0.0024, lr=7.70e-06, step=6968] Training: 70%|██████▉ | 6969/10000 [1:25:59<55:07, 1.09s/it, loss=0.0350, lr=7.70e-06, step=6969]17:32:06.742 [I] step=6970 loss=0.0426 smoothed_loss=0.0164 lr=7.71e-06 grad_norm=0.5222 step_time=0.7599s data_time=0.3445s it/s=0.906 eta_to_10000=3345.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0503 grad_action_out_proj=0.2581 grad_shared_expert=0.9567 (10775:train_pytorch.py:850) + Training: 70%|██████▉ | 6970/10000 [1:26:00<55:07, 1.09s/it, loss=0.0350, lr=7.70e-06, step=6969] Training: 70%|██████▉ | 6970/10000 [1:26:00<55:07, 1.09s/it, loss=0.0426, lr=7.69e-06, step=6970] Training: 70%|██████▉ | 6971/10000 [1:26:01<55:05, 1.09s/it, loss=0.0426, lr=7.69e-06, step=6970] Training: 70%|██████▉ | 6971/10000 [1:26:01<55:05, 1.09s/it, loss=0.0353, lr=7.69e-06, step=6971] Training: 70%|██████▉ | 6972/10000 [1:26:02<52:59, 1.05s/it, loss=0.0353, lr=7.69e-06, step=6971] Training: 70%|██████▉ | 6972/10000 [1:26:02<52:59, 1.05s/it, loss=0.0014, lr=7.69e-06, step=6972] Training: 70%|██████▉ | 6973/10000 [1:26:03<55:28, 1.10s/it, loss=0.0014, lr=7.69e-06, step=6972] Training: 70%|██████▉ | 6973/10000 [1:26:03<55:28, 1.10s/it, loss=0.0107, lr=7.68e-06, step=6973] Training: 70%|██████▉ | 6974/10000 [1:26:04<57:35, 1.14s/it, loss=0.0107, lr=7.68e-06, step=6973] Training: 70%|██████▉ | 6974/10000 [1:26:04<57:35, 1.14s/it, loss=0.0069, lr=7.68e-06, step=6974] Training: 70%|██████▉ | 6975/10000 [1:26:05<55:49, 1.11s/it, loss=0.0069, lr=7.68e-06, step=6974] Training: 70%|██████▉ | 6975/10000 [1:26:05<55:49, 1.11s/it, loss=0.0196, lr=7.68e-06, step=6975] Training: 70%|██████▉ | 6976/10000 [1:26:06<51:24, 1.02s/it, loss=0.0196, lr=7.68e-06, step=6975] Training: 70%|██████▉ | 6976/10000 [1:26:06<51:24, 1.02s/it, loss=0.0090, lr=7.67e-06, step=6976] Training: 70%|██████▉ | 6977/10000 [1:26:07<51:05, 1.01s/it, loss=0.0090, lr=7.67e-06, step=6976] Training: 70%|██████▉ | 6977/10000 [1:26:07<51:05, 1.01s/it, loss=0.0049, lr=7.67e-06, step=6977] Training: 70%|██████▉ | 6978/10000 [1:26:08<48:39, 1.04it/s, loss=0.0049, lr=7.67e-06, step=6977] Training: 70%|██████▉ | 6978/10000 [1:26:08<48:39, 1.04it/s, loss=0.0173, lr=7.67e-06, step=6978] Training: 70%|██████▉ | 6979/10000 [1:26:09<54:29, 1.08s/it, loss=0.0173, lr=7.67e-06, step=6978] Training: 70%|██████▉ | 6979/10000 [1:26:09<54:29, 1.08s/it, loss=0.0035, lr=7.67e-06, step=6979]17:32:17.604 [I] step=6980 loss=0.0070 smoothed_loss=0.0125 lr=7.68e-06 grad_norm=0.4690 step_time=0.7888s data_time=0.2972s it/s=0.921 eta_to_10000=3278.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0263 grad_action_out_proj=0.1108 grad_shared_expert=0.4049 (10775:train_pytorch.py:850) + Training: 70%|██████▉ | 6980/10000 [1:26:11<57:50, 1.15s/it, loss=0.0035, lr=7.67e-06, step=6979] Training: 70%|██████▉ | 6980/10000 [1:26:11<57:50, 1.15s/it, loss=0.0070, lr=7.66e-06, step=6980] Training: 70%|██████▉ | 6981/10000 [1:26:12<55:20, 1.10s/it, loss=0.0070, lr=7.66e-06, step=6980] Training: 70%|██████▉ | 6981/10000 [1:26:12<55:20, 1.10s/it, loss=0.0024, lr=7.66e-06, step=6981] Training: 70%|██████▉ | 6982/10000 [1:26:13<54:12, 1.08s/it, loss=0.0024, lr=7.66e-06, step=6981] Training: 70%|██████▉ | 6982/10000 [1:26:13<54:12, 1.08s/it, loss=0.0018, lr=7.66e-06, step=6982] Training: 70%|██████▉ | 6983/10000 [1:26:13<47:16, 1.06it/s, loss=0.0018, lr=7.66e-06, step=6982] Training: 70%|██████▉ | 6983/10000 [1:26:13<47:16, 1.06it/s, loss=0.0040, lr=7.65e-06, step=6983] Training: 70%|██████▉ | 6984/10000 [1:26:14<46:10, 1.09it/s, loss=0.0040, lr=7.65e-06, step=6983] Training: 70%|██████▉ | 6984/10000 [1:26:14<46:10, 1.09it/s, loss=0.0011, lr=7.65e-06, step=6984] Training: 70%|██████▉ | 6985/10000 [1:26:15<48:04, 1.05it/s, loss=0.0011, lr=7.65e-06, step=6984] Training: 70%|██████▉ | 6985/10000 [1:26:15<48:04, 1.05it/s, loss=0.0020, lr=7.65e-06, step=6985] Training: 70%|██████▉ | 6986/10000 [1:26:16<52:18, 1.04s/it, loss=0.0020, lr=7.65e-06, step=6985] Training: 70%|██████▉ | 6986/10000 [1:26:16<52:18, 1.04s/it, loss=0.0056, lr=7.64e-06, step=6986] Training: 70%|██████▉ | 6987/10000 [1:26:17<49:33, 1.01it/s, loss=0.0056, lr=7.64e-06, step=6986] Training: 70%|██████▉ | 6987/10000 [1:26:17<49:33, 1.01it/s, loss=0.0196, lr=7.64e-06, step=6987] Training: 70%|██████▉ | 6988/10000 [1:26:18<47:12, 1.06it/s, loss=0.0196, lr=7.64e-06, step=6987] Training: 70%|██████▉ | 6988/10000 [1:26:18<47:12, 1.06it/s, loss=0.0166, lr=7.64e-06, step=6988] Training: 70%|██████▉ | 6989/10000 [1:26:19<40:53, 1.23it/s, loss=0.0166, lr=7.64e-06, step=6988] Training: 70%|██████▉ | 6989/10000 [1:26:19<40:53, 1.23it/s, loss=0.0114, lr=7.63e-06, step=6989]17:32:26.500 [I] step=6990 loss=0.0014 smoothed_loss=0.0092 lr=7.65e-06 grad_norm=0.4258 step_time=0.6751s data_time=0.2145s it/s=1.124 eta_to_10000=2677.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.0833 grad_shared_expert=0.3147 (10775:train_pytorch.py:850) + Training: 70%|██████▉ | 6990/10000 [1:26:20<42:09, 1.19it/s, loss=0.0114, lr=7.63e-06, step=6989] Training: 70%|██████▉ | 6990/10000 [1:26:20<42:09, 1.19it/s, loss=0.0014, lr=7.63e-06, step=6990] Training: 70%|██████▉ | 6991/10000 [1:26:21<43:35, 1.15it/s, loss=0.0014, lr=7.63e-06, step=6990] Training: 70%|██████▉ | 6991/10000 [1:26:21<43:35, 1.15it/s, loss=0.0136, lr=7.63e-06, step=6991] Training: 70%|██████▉ | 6992/10000 [1:26:21<43:00, 1.17it/s, loss=0.0136, lr=7.63e-06, step=6991] Training: 70%|██████▉ | 6992/10000 [1:26:21<43:00, 1.17it/s, loss=0.0095, lr=7.62e-06, step=6992] Training: 70%|██████▉ | 6993/10000 [1:26:22<43:55, 1.14it/s, loss=0.0095, lr=7.62e-06, step=6992] Training: 70%|██████▉ | 6993/10000 [1:26:22<43:55, 1.14it/s, loss=0.0170, lr=7.62e-06, step=6993] Training: 70%|██████▉ | 6994/10000 [1:26:23<41:12, 1.22it/s, loss=0.0170, lr=7.62e-06, step=6993] Training: 70%|██████▉ | 6994/10000 [1:26:23<41:12, 1.22it/s, loss=0.0012, lr=7.62e-06, step=6994] Training: 70%|██████▉ | 6995/10000 [1:26:24<47:13, 1.06it/s, loss=0.0012, lr=7.62e-06, step=6994] Training: 70%|██████▉ | 6995/10000 [1:26:24<47:13, 1.06it/s, loss=0.0177, lr=7.62e-06, step=6995] Training: 70%|██████▉ | 6996/10000 [1:26:25<47:18, 1.06it/s, loss=0.0177, lr=7.62e-06, step=6995] Training: 70%|██████▉ | 6996/10000 [1:26:25<47:18, 1.06it/s, loss=0.0059, lr=7.61e-06, step=6996] Training: 70%|██████▉ | 6997/10000 [1:26:26<44:59, 1.11it/s, loss=0.0059, lr=7.61e-06, step=6996] Training: 70%|██████▉ | 6997/10000 [1:26:26<44:59, 1.11it/s, loss=0.0101, lr=7.61e-06, step=6997] Training: 70%|██████▉ | 6998/10000 [1:26:27<43:51, 1.14it/s, loss=0.0101, lr=7.61e-06, step=6997] Training: 70%|██████▉ | 6998/10000 [1:26:27<43:51, 1.14it/s, loss=0.0043, lr=7.61e-06, step=6998] Training: 70%|██████▉ | 6999/10000 [1:26:27<39:00, 1.28it/s, loss=0.0043, lr=7.61e-06, step=6998] Training: 70%|██████▉ | 6999/10000 [1:26:27<39:00, 1.28it/s, loss=0.0105, lr=7.60e-06, step=6999]17:32:35.179 [I] step=7000 loss=0.0146 smoothed_loss=0.0099 lr=7.61e-06 grad_norm=0.4394 step_time=0.6604s data_time=0.2075s it/s=1.152 eta_to_10000=2603.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0115 grad_action_out_proj=0.1022 grad_shared_expert=0.4005 (10775:train_pytorch.py:850) +17:34:35.906 [I] Saved checkpoint at step 7000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/7000 (10775:train_pytorch.py:350) + Training: 70%|███████ | 7000/10000 [1:28:29<30:52:33, 37.05s/it, loss=0.0105, lr=7.60e-06, step=6999] Training: 70%|███████ | 7000/10000 [1:28:29<30:52:33, 37.05s/it, loss=0.0146, lr=7.60e-06, step=7000] Training: 70%|███████ | 7001/10000 [1:28:30<21:50:37, 26.22s/it, loss=0.0146, lr=7.60e-06, step=7000] Training: 70%|███████ | 7001/10000 [1:28:30<21:50:37, 26.22s/it, loss=0.0037, lr=7.60e-06, step=7001] Training: 70%|███████ | 7002/10000 [1:28:31<15:38:03, 18.77s/it, loss=0.0037, lr=7.60e-06, step=7001] Training: 70%|███████ | 7002/10000 [1:28:31<15:38:03, 18.77s/it, loss=0.0045, lr=7.59e-06, step=7002] Training: 70%|███████ | 7003/10000 [1:28:32<11:11:26, 13.44s/it, loss=0.0045, lr=7.59e-06, step=7002] Training: 70%|███████ | 7003/10000 [1:28:32<11:11:26, 13.44s/it, loss=0.0013, lr=7.59e-06, step=7003] Training: 70%|███████ | 7004/10000 [1:28:33<8:04:50, 9.71s/it, loss=0.0013, lr=7.59e-06, step=7003] Training: 70%|███████ | 7004/10000 [1:28:33<8:04:50, 9.71s/it, loss=0.0291, lr=7.59e-06, step=7004] Training: 70%|███████ | 7005/10000 [1:28:34<5:54:14, 7.10s/it, loss=0.0291, lr=7.59e-06, step=7004] Training: 70%|███████ | 7005/10000 [1:28:34<5:54:14, 7.10s/it, loss=0.0099, lr=7.58e-06, step=7005] Training: 70%|███████ | 7006/10000 [1:28:35<4:22:26, 5.26s/it, loss=0.0099, lr=7.58e-06, step=7005] Training: 70%|███████ | 7006/10000 [1:28:35<4:22:26, 5.26s/it, loss=0.0044, lr=7.58e-06, step=7006] Training: 70%|███████ | 7007/10000 [1:28:36<3:20:42, 4.02s/it, loss=0.0044, lr=7.58e-06, step=7006] Training: 70%|███████ | 7007/10000 [1:28:36<3:20:42, 4.02s/it, loss=0.0037, lr=7.58e-06, step=7007] Training: 70%|███████ | 7008/10000 [1:28:37<2:34:22, 3.10s/it, loss=0.0037, lr=7.58e-06, step=7007] Training: 70%|███████ | 7008/10000 [1:28:37<2:34:22, 3.10s/it, loss=0.0083, lr=7.58e-06, step=7008] Training: 70%|███████ | 7009/10000 [1:28:38<2:02:43, 2.46s/it, loss=0.0083, lr=7.58e-06, step=7008] Training: 70%|███████ | 7009/10000 [1:28:38<2:02:43, 2.46s/it, loss=0.0074, lr=7.57e-06, step=7009]17:34:46.163 [I] step=7010 loss=0.0076 smoothed_loss=0.0086 lr=7.58e-06 grad_norm=0.3813 step_time=0.7704s data_time=12.3280s it/s=0.076 eta_to_10000=39164.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0676 grad_shared_expert=0.2587 (10775:train_pytorch.py:850) + Training: 70%|███████ | 7010/10000 [1:28:39<1:38:57, 1.99s/it, loss=0.0074, lr=7.57e-06, step=7009] Training: 70%|███████ | 7010/10000 [1:28:39<1:38:57, 1.99s/it, loss=0.0076, lr=7.57e-06, step=7010] Training: 70%|███████ | 7011/10000 [1:28:40<1:18:33, 1.58s/it, loss=0.0076, lr=7.57e-06, step=7010] Training: 70%|███████ | 7011/10000 [1:28:40<1:18:33, 1.58s/it, loss=0.0057, lr=7.57e-06, step=7011] Training: 70%|███████ | 7012/10000 [1:28:41<1:05:56, 1.32s/it, loss=0.0057, lr=7.57e-06, step=7011] Training: 70%|███████ | 7012/10000 [1:28:41<1:05:56, 1.32s/it, loss=0.0171, lr=7.56e-06, step=7012] Training: 70%|███████ | 7013/10000 [1:28:42<1:03:53, 1.28s/it, loss=0.0171, lr=7.56e-06, step=7012] Training: 70%|███████ | 7013/10000 [1:28:42<1:03:53, 1.28s/it, loss=0.0082, lr=7.56e-06, step=7013] Training: 70%|███████ | 7014/10000 [1:28:43<56:05, 1.13s/it, loss=0.0082, lr=7.56e-06, step=7013] Training: 70%|███████ | 7014/10000 [1:28:43<56:05, 1.13s/it, loss=0.0151, lr=7.56e-06, step=7014] Training: 70%|███████ | 7015/10000 [1:28:43<51:44, 1.04s/it, loss=0.0151, lr=7.56e-06, step=7014] Training: 70%|███████ | 7015/10000 [1:28:43<51:44, 1.04s/it, loss=0.0057, lr=7.55e-06, step=7015] Training: 70%|███████ | 7016/10000 [1:28:44<47:52, 1.04it/s, loss=0.0057, lr=7.55e-06, step=7015] Training: 70%|███████ | 7016/10000 [1:28:44<47:52, 1.04it/s, loss=0.0021, lr=7.55e-06, step=7016] Training: 70%|███████ | 7017/10000 [1:28:45<42:12, 1.18it/s, loss=0.0021, lr=7.55e-06, step=7016] Training: 70%|███████ | 7017/10000 [1:28:45<42:12, 1.18it/s, loss=0.0048, lr=7.55e-06, step=7017] Training: 70%|███████ | 7018/10000 [1:28:45<37:13, 1.34it/s, loss=0.0048, lr=7.55e-06, step=7017] Training: 70%|███████ | 7018/10000 [1:28:45<37:13, 1.34it/s, loss=0.0123, lr=7.54e-06, step=7018] Training: 70%|███████ | 7019/10000 [1:28:46<34:06, 1.46it/s, loss=0.0123, lr=7.54e-06, step=7018] Training: 70%|███████ | 7019/10000 [1:28:46<34:06, 1.46it/s, loss=0.0064, lr=7.54e-06, step=7019]17:34:53.353 [I] step=7020 loss=0.0054 smoothed_loss=0.0081 lr=7.55e-06 grad_norm=0.4347 step_time=0.5740s data_time=0.1450s it/s=1.391 eta_to_10000=2142.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0101 grad_action_out_proj=0.1129 grad_shared_expert=0.4213 (10775:train_pytorch.py:850) + Training: 70%|███████ | 7020/10000 [1:28:46<33:08, 1.50it/s, loss=0.0064, lr=7.54e-06, step=7019] Training: 70%|███████ | 7020/10000 [1:28:46<33:08, 1.50it/s, loss=0.0054, lr=7.54e-06, step=7020] Training: 70%|███████ | 7021/10000 [1:28:47<33:47, 1.47it/s, loss=0.0054, lr=7.54e-06, step=7020] Training: 70%|███████ | 7021/10000 [1:28:47<33:47, 1.47it/s, loss=0.0067, lr=7.53e-06, step=7021] Training: 70%|███████ | 7022/10000 [1:28:48<34:21, 1.44it/s, loss=0.0067, lr=7.53e-06, step=7021] Training: 70%|███████ | 7022/10000 [1:28:48<34:21, 1.44it/s, loss=0.0130, lr=7.53e-06, step=7022] Training: 70%|███████ | 7023/10000 [1:28:49<37:23, 1.33it/s, loss=0.0130, lr=7.53e-06, step=7022] Training: 70%|███████ | 7023/10000 [1:28:49<37:23, 1.33it/s, loss=0.0024, lr=7.53e-06, step=7023] Training: 70%|███████ | 7024/10000 [1:28:49<36:24, 1.36it/s, loss=0.0024, lr=7.53e-06, step=7023] Training: 70%|███████ | 7024/10000 [1:28:49<36:24, 1.36it/s, loss=0.0094, lr=7.53e-06, step=7024] Training: 70%|███████ | 7025/10000 [1:28:50<33:12, 1.49it/s, loss=0.0094, lr=7.53e-06, step=7024] Training: 70%|███████ | 7025/10000 [1:28:50<33:12, 1.49it/s, loss=0.0163, lr=7.52e-06, step=7025] Training: 70%|███████ | 7026/10000 [1:28:51<33:39, 1.47it/s, loss=0.0163, lr=7.52e-06, step=7025] Training: 70%|███████ | 7026/10000 [1:28:51<33:39, 1.47it/s, loss=0.0059, lr=7.52e-06, step=7026] Training: 70%|███████ | 7027/10000 [1:28:51<31:40, 1.56it/s, loss=0.0059, lr=7.52e-06, step=7026] Training: 70%|███████ | 7027/10000 [1:28:51<31:40, 1.56it/s, loss=0.0087, lr=7.52e-06, step=7027] Training: 70%|███████ | 7028/10000 [1:28:52<29:44, 1.67it/s, loss=0.0087, lr=7.52e-06, step=7027] Training: 70%|███████ | 7028/10000 [1:28:52<29:44, 1.67it/s, loss=0.0165, lr=7.51e-06, step=7028] Training: 70%|███████ | 7029/10000 [1:28:53<34:21, 1.44it/s, loss=0.0165, lr=7.51e-06, step=7028] Training: 70%|███████ | 7029/10000 [1:28:53<34:21, 1.44it/s, loss=0.0040, lr=7.51e-06, step=7029]17:35:00.659 [I] step=7030 loss=0.0038 smoothed_loss=0.0083 lr=7.52e-06 grad_norm=0.3900 step_time=0.5976s data_time=0.1329s it/s=1.369 eta_to_10000=2169.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0712 grad_shared_expert=0.3955 (10775:train_pytorch.py:850) + Training: 70%|███████ | 7030/10000 [1:28:54<40:23, 1.23it/s, loss=0.0040, lr=7.51e-06, step=7029] Training: 70%|███████ | 7030/10000 [1:28:54<40:23, 1.23it/s, loss=0.0038, lr=7.51e-06, step=7030] Training: 70%|███████ | 7031/10000 [1:28:55<39:47, 1.24it/s, loss=0.0038, lr=7.51e-06, step=7030] Training: 70%|███████ | 7031/10000 [1:28:55<39:47, 1.24it/s, loss=0.0044, lr=7.50e-06, step=7031] Training: 70%|███████ | 7032/10000 [1:28:55<36:17, 1.36it/s, loss=0.0044, lr=7.50e-06, step=7031] Training: 70%|███████ | 7032/10000 [1:28:55<36:17, 1.36it/s, loss=0.0046, lr=7.50e-06, step=7032] Training: 70%|███████ | 7033/10000 [1:28:56<32:54, 1.50it/s, loss=0.0046, lr=7.50e-06, step=7032] Training: 70%|███████ | 7033/10000 [1:28:56<32:54, 1.50it/s, loss=0.0140, lr=7.50e-06, step=7033] Training: 70%|███████ | 7034/10000 [1:28:56<34:10, 1.45it/s, loss=0.0140, lr=7.50e-06, step=7033] Training: 70%|███████ | 7034/10000 [1:28:56<34:10, 1.45it/s, loss=0.0025, lr=7.49e-06, step=7034] Training: 70%|███████ | 7035/10000 [1:28:57<31:30, 1.57it/s, loss=0.0025, lr=7.49e-06, step=7034] Training: 70%|███████ | 7035/10000 [1:28:57<31:30, 1.57it/s, loss=0.0056, lr=7.49e-06, step=7035] Training: 70%|███████ | 7036/10000 [1:28:58<33:44, 1.46it/s, loss=0.0056, lr=7.49e-06, step=7035] Training: 70%|███████ | 7036/10000 [1:28:58<33:44, 1.46it/s, loss=0.0092, lr=7.49e-06, step=7036] Training: 70%|███████ | 7037/10000 [1:28:58<34:50, 1.42it/s, loss=0.0092, lr=7.49e-06, step=7036] Training: 70%|███████ | 7037/10000 [1:28:58<34:50, 1.42it/s, loss=0.0030, lr=7.49e-06, step=7037] Training: 70%|███████ | 7038/10000 [1:28:59<39:25, 1.25it/s, loss=0.0030, lr=7.49e-06, step=7037] Training: 70%|███████ | 7038/10000 [1:28:59<39:25, 1.25it/s, loss=0.0050, lr=7.48e-06, step=7038] Training: 70%|███████ | 7039/10000 [1:29:00<39:32, 1.25it/s, loss=0.0050, lr=7.48e-06, step=7038] Training: 70%|███████ | 7039/10000 [1:29:00<39:32, 1.25it/s, loss=0.0026, lr=7.48e-06, step=7039]17:35:07.941 [I] step=7040 loss=0.0019 smoothed_loss=0.0061 lr=7.49e-06 grad_norm=0.4080 step_time=0.5820s data_time=0.1462s it/s=1.374 eta_to_10000=2155.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0160 grad_action_out_proj=0.1288 grad_shared_expert=0.4859 (10775:train_pytorch.py:850) + Training: 70%|███████ | 7040/10000 [1:29:01<39:27, 1.25it/s, loss=0.0026, lr=7.48e-06, step=7039] Training: 70%|███████ | 7040/10000 [1:29:01<39:27, 1.25it/s, loss=0.0019, lr=7.48e-06, step=7040] Training: 70%|███████ | 7041/10000 [1:29:02<37:55, 1.30it/s, loss=0.0019, lr=7.48e-06, step=7040] Training: 70%|███████ | 7041/10000 [1:29:02<37:55, 1.30it/s, loss=0.0069, lr=7.47e-06, step=7041] Training: 70%|███████ | 7042/10000 [1:29:02<33:39, 1.46it/s, loss=0.0069, lr=7.47e-06, step=7041] Training: 70%|███████ | 7042/10000 [1:29:02<33:39, 1.46it/s, loss=0.0053, lr=7.47e-06, step=7042] Training: 70%|███████ | 7043/10000 [1:29:03<34:36, 1.42it/s, loss=0.0053, lr=7.47e-06, step=7042] Training: 70%|███████ | 7043/10000 [1:29:03<34:36, 1.42it/s, loss=0.0033, lr=7.47e-06, step=7043] Training: 70%|███████ | 7044/10000 [1:29:04<34:06, 1.44it/s, loss=0.0033, lr=7.47e-06, step=7043] Training: 70%|███████ | 7044/10000 [1:29:04<34:06, 1.44it/s, loss=0.0027, lr=7.46e-06, step=7044] Training: 70%|███████ | 7045/10000 [1:29:05<39:15, 1.25it/s, loss=0.0027, lr=7.46e-06, step=7044] Training: 70%|███████ | 7045/10000 [1:29:05<39:15, 1.25it/s, loss=0.0031, lr=7.46e-06, step=7045] Training: 70%|███████ | 7046/10000 [1:29:05<35:39, 1.38it/s, loss=0.0031, lr=7.46e-06, step=7045] Training: 70%|███████ | 7046/10000 [1:29:05<35:39, 1.38it/s, loss=0.0176, lr=7.46e-06, step=7046] Training: 70%|███████ | 7047/10000 [1:29:06<32:05, 1.53it/s, loss=0.0176, lr=7.46e-06, step=7046] Training: 70%|███████ | 7047/10000 [1:29:06<32:05, 1.53it/s, loss=0.0134, lr=7.45e-06, step=7047] Training: 70%|███████ | 7048/10000 [1:29:07<36:24, 1.35it/s, loss=0.0134, lr=7.45e-06, step=7047] Training: 70%|███████ | 7048/10000 [1:29:07<36:24, 1.35it/s, loss=0.0059, lr=7.45e-06, step=7048] Training: 70%|███████ | 7049/10000 [1:29:07<33:29, 1.47it/s, loss=0.0059, lr=7.45e-06, step=7048] Training: 70%|███████ | 7049/10000 [1:29:07<33:29, 1.47it/s, loss=0.0088, lr=7.45e-06, step=7049]17:35:14.667 [I] step=7050 loss=0.0664 smoothed_loss=0.0131 lr=7.46e-06 grad_norm=0.4820 step_time=0.5352s data_time=0.1374s it/s=1.487 eta_to_10000=1983.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0252 grad_action_out_proj=0.1369 grad_shared_expert=0.5362 (10775:train_pytorch.py:850) + Training: 70%|███████ | 7050/10000 [1:29:08<31:43, 1.55it/s, loss=0.0088, lr=7.45e-06, step=7049] Training: 70%|███████ | 7050/10000 [1:29:08<31:43, 1.55it/s, loss=0.0664, lr=7.45e-06, step=7050] Training: 71%|███████ | 7051/10000 [1:29:08<32:18, 1.52it/s, loss=0.0664, lr=7.45e-06, step=7050] Training: 71%|███████ | 7051/10000 [1:29:08<32:18, 1.52it/s, loss=0.0030, lr=7.44e-06, step=7051] Training: 71%|███████ | 7052/10000 [1:29:09<36:08, 1.36it/s, loss=0.0030, lr=7.44e-06, step=7051] Training: 71%|███████ | 7052/10000 [1:29:09<36:08, 1.36it/s, loss=0.0049, lr=7.44e-06, step=7052] Training: 71%|███████ | 7053/10000 [1:29:10<35:13, 1.39it/s, loss=0.0049, lr=7.44e-06, step=7052] Training: 71%|███████ | 7053/10000 [1:29:10<35:13, 1.39it/s, loss=0.0064, lr=7.44e-06, step=7053] Training: 71%|███████ | 7054/10000 [1:29:11<33:07, 1.48it/s, loss=0.0064, lr=7.44e-06, step=7053] Training: 71%|███████ | 7054/10000 [1:29:11<33:07, 1.48it/s, loss=0.0020, lr=7.43e-06, step=7054] Training: 71%|███████ | 7055/10000 [1:29:11<31:48, 1.54it/s, loss=0.0020, lr=7.43e-06, step=7054] Training: 71%|███████ | 7055/10000 [1:29:11<31:48, 1.54it/s, loss=0.0110, lr=7.43e-06, step=7055] Training: 71%|███████ | 7056/10000 [1:29:12<35:09, 1.40it/s, loss=0.0110, lr=7.43e-06, step=7055] Training: 71%|███████ | 7056/10000 [1:29:12<35:09, 1.40it/s, loss=0.0033, lr=7.43e-06, step=7056] Training: 71%|███████ | 7057/10000 [1:29:13<34:48, 1.41it/s, loss=0.0033, lr=7.43e-06, step=7056] Training: 71%|███████ | 7057/10000 [1:29:13<34:48, 1.41it/s, loss=0.0150, lr=7.42e-06, step=7057] Training: 71%|███████ | 7058/10000 [1:29:13<34:56, 1.40it/s, loss=0.0150, lr=7.42e-06, step=7057] Training: 71%|███████ | 7058/10000 [1:29:13<34:56, 1.40it/s, loss=0.0084, lr=7.42e-06, step=7058] Training: 71%|███████ | 7059/10000 [1:29:15<42:52, 1.14it/s, loss=0.0084, lr=7.42e-06, step=7058] Training: 71%|███████ | 7059/10000 [1:29:15<42:52, 1.14it/s, loss=0.0061, lr=7.42e-06, step=7059]17:35:22.351 [I] step=7060 loss=0.0027 smoothed_loss=0.0088 lr=7.43e-06 grad_norm=0.3797 step_time=0.6346s data_time=0.1339s it/s=1.301 eta_to_10000=2259.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0239 grad_action_out_proj=0.1960 grad_shared_expert=0.5914 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7060/10000 [1:29:15<40:20, 1.21it/s, loss=0.0061, lr=7.42e-06, step=7059] Training: 71%|███████ | 7060/10000 [1:29:15<40:20, 1.21it/s, loss=0.0027, lr=7.41e-06, step=7060] Training: 71%|███████ | 7061/10000 [1:29:16<35:22, 1.38it/s, loss=0.0027, lr=7.41e-06, step=7060] Training: 71%|███████ | 7061/10000 [1:29:16<35:22, 1.38it/s, loss=0.0117, lr=7.41e-06, step=7061] Training: 71%|███████ | 7062/10000 [1:29:17<37:06, 1.32it/s, loss=0.0117, lr=7.41e-06, step=7061] Training: 71%|███████ | 7062/10000 [1:29:17<37:06, 1.32it/s, loss=0.0055, lr=7.41e-06, step=7062] Training: 71%|███████ | 7063/10000 [1:29:18<38:09, 1.28it/s, loss=0.0055, lr=7.41e-06, step=7062] Training: 71%|███████ | 7063/10000 [1:29:18<38:09, 1.28it/s, loss=0.0016, lr=7.41e-06, step=7063] Training: 71%|███████ | 7064/10000 [1:29:18<38:17, 1.28it/s, loss=0.0016, lr=7.41e-06, step=7063] Training: 71%|███████ | 7064/10000 [1:29:18<38:17, 1.28it/s, loss=0.0090, lr=7.40e-06, step=7064] Training: 71%|███████ | 7065/10000 [1:29:19<39:42, 1.23it/s, loss=0.0090, lr=7.40e-06, step=7064] Training: 71%|███████ | 7065/10000 [1:29:19<39:42, 1.23it/s, loss=0.0015, lr=7.40e-06, step=7065] Training: 71%|███████ | 7066/10000 [1:29:20<38:53, 1.26it/s, loss=0.0015, lr=7.40e-06, step=7065] Training: 71%|███████ | 7066/10000 [1:29:20<38:53, 1.26it/s, loss=0.0028, lr=7.40e-06, step=7066] Training: 71%|███████ | 7067/10000 [1:29:21<39:33, 1.24it/s, loss=0.0028, lr=7.40e-06, step=7066] Training: 71%|███████ | 7067/10000 [1:29:21<39:33, 1.24it/s, loss=0.0025, lr=7.39e-06, step=7067] Training: 71%|███████ | 7068/10000 [1:29:21<35:47, 1.37it/s, loss=0.0025, lr=7.39e-06, step=7067] Training: 71%|███████ | 7068/10000 [1:29:21<35:47, 1.37it/s, loss=0.0366, lr=7.39e-06, step=7068] Training: 71%|███████ | 7069/10000 [1:29:22<36:40, 1.33it/s, loss=0.0366, lr=7.39e-06, step=7068] Training: 71%|███████ | 7069/10000 [1:29:22<36:40, 1.33it/s, loss=0.0293, lr=7.39e-06, step=7069]17:35:29.705 [I] step=7070 loss=0.0050 smoothed_loss=0.0109 lr=7.40e-06 grad_norm=0.5365 step_time=0.5752s data_time=0.1602s it/s=1.360 eta_to_10000=2154.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0268 grad_action_out_proj=0.1706 grad_shared_expert=0.6827 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7070/10000 [1:29:23<34:10, 1.43it/s, loss=0.0293, lr=7.39e-06, step=7069] Training: 71%|███████ | 7070/10000 [1:29:23<34:10, 1.43it/s, loss=0.0050, lr=7.38e-06, step=7070] Training: 71%|███████ | 7071/10000 [1:29:24<34:52, 1.40it/s, loss=0.0050, lr=7.38e-06, step=7070] Training: 71%|███████ | 7071/10000 [1:29:24<34:52, 1.40it/s, loss=0.0018, lr=7.38e-06, step=7071] Training: 71%|███████ | 7072/10000 [1:29:25<40:05, 1.22it/s, loss=0.0018, lr=7.38e-06, step=7071] Training: 71%|███████ | 7072/10000 [1:29:25<40:05, 1.22it/s, loss=0.0127, lr=7.38e-06, step=7072] Training: 71%|███████ | 7073/10000 [1:29:26<44:57, 1.09it/s, loss=0.0127, lr=7.38e-06, step=7072] Training: 71%|███████ | 7073/10000 [1:29:26<44:57, 1.09it/s, loss=0.0172, lr=7.37e-06, step=7073] Training: 71%|███████ | 7074/10000 [1:29:26<40:59, 1.19it/s, loss=0.0172, lr=7.37e-06, step=7073] Training: 71%|███████ | 7074/10000 [1:29:26<40:59, 1.19it/s, loss=0.0041, lr=7.37e-06, step=7074] Training: 71%|███████ | 7075/10000 [1:29:27<42:55, 1.14it/s, loss=0.0041, lr=7.37e-06, step=7074] Training: 71%|███████ | 7075/10000 [1:29:27<42:55, 1.14it/s, loss=0.0122, lr=7.37e-06, step=7075] Training: 71%|███████ | 7076/10000 [1:29:28<38:24, 1.27it/s, loss=0.0122, lr=7.37e-06, step=7075] Training: 71%|███████ | 7076/10000 [1:29:28<38:24, 1.27it/s, loss=0.0071, lr=7.37e-06, step=7076] Training: 71%|███████ | 7077/10000 [1:29:28<34:15, 1.42it/s, loss=0.0071, lr=7.37e-06, step=7076] Training: 71%|███████ | 7077/10000 [1:29:28<34:15, 1.42it/s, loss=0.0023, lr=7.36e-06, step=7077] Training: 71%|███████ | 7078/10000 [1:29:29<34:08, 1.43it/s, loss=0.0023, lr=7.36e-06, step=7077] Training: 71%|███████ | 7078/10000 [1:29:29<34:08, 1.43it/s, loss=0.0068, lr=7.36e-06, step=7078] Training: 71%|███████ | 7079/10000 [1:29:30<36:42, 1.33it/s, loss=0.0068, lr=7.36e-06, step=7078] Training: 71%|███████ | 7079/10000 [1:29:30<36:42, 1.33it/s, loss=0.0123, lr=7.36e-06, step=7079]17:35:37.780 [I] step=7080 loss=0.0154 smoothed_loss=0.0100 lr=7.37e-06 grad_norm=0.3962 step_time=0.6492s data_time=0.1583s it/s=1.239 eta_to_10000=2357.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.1159 grad_shared_expert=0.3217 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7080/10000 [1:29:31<37:43, 1.29it/s, loss=0.0123, lr=7.36e-06, step=7079] Training: 71%|███████ | 7080/10000 [1:29:31<37:43, 1.29it/s, loss=0.0154, lr=7.35e-06, step=7080] Training: 71%|███████ | 7081/10000 [1:29:32<39:21, 1.24it/s, loss=0.0154, lr=7.35e-06, step=7080] Training: 71%|███████ | 7081/10000 [1:29:32<39:21, 1.24it/s, loss=0.0048, lr=7.35e-06, step=7081] Training: 71%|███████ | 7082/10000 [1:29:32<36:38, 1.33it/s, loss=0.0048, lr=7.35e-06, step=7081] Training: 71%|███████ | 7082/10000 [1:29:32<36:38, 1.33it/s, loss=0.0265, lr=7.35e-06, step=7082] Training: 71%|███████ | 7083/10000 [1:29:33<33:20, 1.46it/s, loss=0.0265, lr=7.35e-06, step=7082] Training: 71%|███████ | 7083/10000 [1:29:33<33:20, 1.46it/s, loss=0.0077, lr=7.34e-06, step=7083] Training: 71%|███████ | 7084/10000 [1:29:33<30:43, 1.58it/s, loss=0.0077, lr=7.34e-06, step=7083] Training: 71%|███████ | 7084/10000 [1:29:33<30:43, 1.58it/s, loss=0.0209, lr=7.34e-06, step=7084] Training: 71%|███████ | 7085/10000 [1:29:34<35:50, 1.36it/s, loss=0.0209, lr=7.34e-06, step=7084] Training: 71%|███████ | 7085/10000 [1:29:34<35:50, 1.36it/s, loss=0.0041, lr=7.34e-06, step=7085] Training: 71%|███████ | 7086/10000 [1:29:35<37:12, 1.31it/s, loss=0.0041, lr=7.34e-06, step=7085] Training: 71%|███████ | 7086/10000 [1:29:35<37:12, 1.31it/s, loss=0.0171, lr=7.33e-06, step=7086] Training: 71%|███████ | 7087/10000 [1:29:36<36:42, 1.32it/s, loss=0.0171, lr=7.33e-06, step=7086] Training: 71%|███████ | 7087/10000 [1:29:36<36:42, 1.32it/s, loss=0.0058, lr=7.33e-06, step=7087] Training: 71%|███████ | 7088/10000 [1:29:37<37:42, 1.29it/s, loss=0.0058, lr=7.33e-06, step=7087] Training: 71%|███████ | 7088/10000 [1:29:37<37:42, 1.29it/s, loss=0.0172, lr=7.33e-06, step=7088] Training: 71%|███████ | 7089/10000 [1:29:37<34:11, 1.42it/s, loss=0.0172, lr=7.33e-06, step=7088] Training: 71%|███████ | 7089/10000 [1:29:37<34:11, 1.42it/s, loss=0.0126, lr=7.33e-06, step=7089]17:35:45.325 [I] step=7090 loss=0.0019 smoothed_loss=0.0108 lr=7.34e-06 grad_norm=0.4023 step_time=0.6040s data_time=0.1505s it/s=1.326 eta_to_10000=2195.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0120 grad_action_out_proj=0.1154 grad_shared_expert=0.4222 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7090/10000 [1:29:38<39:42, 1.22it/s, loss=0.0126, lr=7.33e-06, step=7089] Training: 71%|███████ | 7090/10000 [1:29:38<39:42, 1.22it/s, loss=0.0019, lr=7.32e-06, step=7090] Training: 71%|███████ | 7091/10000 [1:29:39<40:20, 1.20it/s, loss=0.0019, lr=7.32e-06, step=7090] Training: 71%|███████ | 7091/10000 [1:29:39<40:20, 1.20it/s, loss=0.0662, lr=7.32e-06, step=7091] Training: 71%|███████ | 7092/10000 [1:29:40<37:14, 1.30it/s, loss=0.0662, lr=7.32e-06, step=7091] Training: 71%|███████ | 7092/10000 [1:29:40<37:14, 1.30it/s, loss=0.0133, lr=7.32e-06, step=7092] Training: 71%|███████ | 7093/10000 [1:29:41<40:36, 1.19it/s, loss=0.0133, lr=7.32e-06, step=7092] Training: 71%|███████ | 7093/10000 [1:29:41<40:36, 1.19it/s, loss=0.0079, lr=7.31e-06, step=7093] Training: 71%|███████ | 7094/10000 [1:29:41<37:29, 1.29it/s, loss=0.0079, lr=7.31e-06, step=7093] Training: 71%|███████ | 7094/10000 [1:29:41<37:29, 1.29it/s, loss=0.0031, lr=7.31e-06, step=7094] Training: 71%|███████ | 7095/10000 [1:29:43<45:40, 1.06it/s, loss=0.0031, lr=7.31e-06, step=7094] Training: 71%|███████ | 7095/10000 [1:29:43<45:40, 1.06it/s, loss=0.0042, lr=7.31e-06, step=7095] Training: 71%|███████ | 7096/10000 [1:29:44<45:59, 1.05it/s, loss=0.0042, lr=7.31e-06, step=7095] Training: 71%|███████ | 7096/10000 [1:29:44<45:59, 1.05it/s, loss=0.0175, lr=7.30e-06, step=7096] Training: 71%|███████ | 7097/10000 [1:29:45<45:23, 1.07it/s, loss=0.0175, lr=7.30e-06, step=7096] Training: 71%|███████ | 7097/10000 [1:29:45<45:23, 1.07it/s, loss=0.0046, lr=7.30e-06, step=7097] Training: 71%|███████ | 7098/10000 [1:29:45<42:13, 1.15it/s, loss=0.0046, lr=7.30e-06, step=7097] Training: 71%|███████ | 7098/10000 [1:29:45<42:13, 1.15it/s, loss=0.0028, lr=7.30e-06, step=7098] Training: 71%|███████ | 7099/10000 [1:29:46<37:15, 1.30it/s, loss=0.0028, lr=7.30e-06, step=7098] Training: 71%|███████ | 7099/10000 [1:29:46<37:15, 1.30it/s, loss=0.0025, lr=7.29e-06, step=7099]17:35:53.625 [I] step=7100 loss=0.0106 smoothed_loss=0.0107 lr=7.31e-06 grad_norm=0.3653 step_time=0.6445s data_time=0.1855s it/s=1.205 eta_to_10000=2406.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.0950 grad_shared_expert=0.3708 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7100/10000 [1:29:47<36:35, 1.32it/s, loss=0.0025, lr=7.29e-06, step=7099] Training: 71%|███████ | 7100/10000 [1:29:47<36:35, 1.32it/s, loss=0.0106, lr=7.29e-06, step=7100] Training: 71%|███████ | 7101/10000 [1:29:48<39:39, 1.22it/s, loss=0.0106, lr=7.29e-06, step=7100] Training: 71%|███████ | 7101/10000 [1:29:48<39:39, 1.22it/s, loss=0.0009, lr=7.29e-06, step=7101] Training: 71%|███████ | 7102/10000 [1:29:49<42:10, 1.15it/s, loss=0.0009, lr=7.29e-06, step=7101] Training: 71%|███████ | 7102/10000 [1:29:49<42:10, 1.15it/s, loss=0.0075, lr=7.29e-06, step=7102] Training: 71%|███████ | 7103/10000 [1:29:49<41:19, 1.17it/s, loss=0.0075, lr=7.29e-06, step=7102] Training: 71%|███████ | 7103/10000 [1:29:49<41:19, 1.17it/s, loss=0.0091, lr=7.28e-06, step=7103] Training: 71%|███████ | 7104/10000 [1:29:50<36:14, 1.33it/s, loss=0.0091, lr=7.28e-06, step=7103] Training: 71%|███████ | 7104/10000 [1:29:50<36:14, 1.33it/s, loss=0.0068, lr=7.28e-06, step=7104] Training: 71%|███████ | 7105/10000 [1:29:51<35:22, 1.36it/s, loss=0.0068, lr=7.28e-06, step=7104] Training: 71%|███████ | 7105/10000 [1:29:51<35:22, 1.36it/s, loss=0.0028, lr=7.28e-06, step=7105] Training: 71%|███████ | 7106/10000 [1:29:51<33:54, 1.42it/s, loss=0.0028, lr=7.28e-06, step=7105] Training: 71%|███████ | 7106/10000 [1:29:51<33:54, 1.42it/s, loss=0.0158, lr=7.27e-06, step=7106] Training: 71%|███████ | 7107/10000 [1:29:52<38:23, 1.26it/s, loss=0.0158, lr=7.27e-06, step=7106] Training: 71%|███████ | 7107/10000 [1:29:52<38:23, 1.26it/s, loss=0.0026, lr=7.27e-06, step=7107] Training: 71%|███████ | 7108/10000 [1:29:53<41:43, 1.16it/s, loss=0.0026, lr=7.27e-06, step=7107] Training: 71%|███████ | 7108/10000 [1:29:53<41:43, 1.16it/s, loss=0.0183, lr=7.27e-06, step=7108] Training: 71%|███████ | 7109/10000 [1:29:54<42:03, 1.15it/s, loss=0.0183, lr=7.27e-06, step=7108] Training: 71%|███████ | 7109/10000 [1:29:54<42:03, 1.15it/s, loss=0.0033, lr=7.26e-06, step=7109]17:36:01.753 [I] step=7110 loss=0.0144 smoothed_loss=0.0095 lr=7.28e-06 grad_norm=0.3954 step_time=0.6415s data_time=0.1713s it/s=1.231 eta_to_10000=2348.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0300 grad_action_out_proj=0.1790 grad_shared_expert=0.6459 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7110/10000 [1:29:55<37:54, 1.27it/s, loss=0.0033, lr=7.26e-06, step=7109] Training: 71%|███████ | 7110/10000 [1:29:55<37:54, 1.27it/s, loss=0.0144, lr=7.26e-06, step=7110] Training: 71%|███████ | 7111/10000 [1:29:56<36:50, 1.31it/s, loss=0.0144, lr=7.26e-06, step=7110] Training: 71%|███████ | 7111/10000 [1:29:56<36:50, 1.31it/s, loss=0.0053, lr=7.26e-06, step=7111] Training: 71%|███████ | 7112/10000 [1:29:57<40:32, 1.19it/s, loss=0.0053, lr=7.26e-06, step=7111] Training: 71%|███████ | 7112/10000 [1:29:57<40:32, 1.19it/s, loss=0.0024, lr=7.26e-06, step=7112] Training: 71%|███████ | 7113/10000 [1:29:57<35:43, 1.35it/s, loss=0.0024, lr=7.26e-06, step=7112] Training: 71%|███████ | 7113/10000 [1:29:57<35:43, 1.35it/s, loss=0.0091, lr=7.25e-06, step=7113] Training: 71%|███████ | 7114/10000 [1:29:58<34:32, 1.39it/s, loss=0.0091, lr=7.25e-06, step=7113] Training: 71%|███████ | 7114/10000 [1:29:58<34:32, 1.39it/s, loss=0.0251, lr=7.25e-06, step=7114] Training: 71%|███████ | 7115/10000 [1:29:58<34:19, 1.40it/s, loss=0.0251, lr=7.25e-06, step=7114] Training: 71%|███████ | 7115/10000 [1:29:58<34:19, 1.40it/s, loss=0.0118, lr=7.25e-06, step=7115] Training: 71%|███████ | 7116/10000 [1:30:00<41:05, 1.17it/s, loss=0.0118, lr=7.25e-06, step=7115] Training: 71%|███████ | 7116/10000 [1:30:00<41:05, 1.17it/s, loss=0.0226, lr=7.24e-06, step=7116] Training: 71%|███████ | 7117/10000 [1:30:00<39:14, 1.22it/s, loss=0.0226, lr=7.24e-06, step=7116] Training: 71%|███████ | 7117/10000 [1:30:00<39:14, 1.22it/s, loss=0.0051, lr=7.24e-06, step=7117] Training: 71%|███████ | 7118/10000 [1:30:01<35:43, 1.34it/s, loss=0.0051, lr=7.24e-06, step=7117] Training: 71%|███████ | 7118/10000 [1:30:01<35:43, 1.34it/s, loss=0.0141, lr=7.24e-06, step=7118] Training: 71%|███████ | 7119/10000 [1:30:02<39:32, 1.21it/s, loss=0.0141, lr=7.24e-06, step=7118] Training: 71%|███████ | 7119/10000 [1:30:02<39:32, 1.21it/s, loss=0.0047, lr=7.23e-06, step=7119]17:36:09.534 [I] step=7120 loss=0.0032 smoothed_loss=0.0098 lr=7.24e-06 grad_norm=0.4306 step_time=0.6208s data_time=0.1573s it/s=1.285 eta_to_10000=2240.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0319 grad_action_out_proj=0.1830 grad_shared_expert=0.5940 (10775:train_pytorch.py:850) + Training: 71%|███████ | 7120/10000 [1:30:03<37:24, 1.28it/s, loss=0.0047, lr=7.23e-06, step=7119] Training: 71%|███████ | 7120/10000 [1:30:03<37:24, 1.28it/s, loss=0.0032, lr=7.23e-06, step=7120] Training: 71%|███████ | 7121/10000 [1:30:03<38:43, 1.24it/s, loss=0.0032, lr=7.23e-06, step=7120] Training: 71%|███████ | 7121/10000 [1:30:03<38:43, 1.24it/s, loss=0.0132, lr=7.23e-06, step=7121] Training: 71%|███████ | 7122/10000 [1:30:04<36:36, 1.31it/s, loss=0.0132, lr=7.23e-06, step=7121] Training: 71%|███████ | 7122/10000 [1:30:04<36:36, 1.31it/s, loss=0.0082, lr=7.23e-06, step=7122] Training: 71%|███████ | 7123/10000 [1:30:05<36:21, 1.32it/s, loss=0.0082, lr=7.23e-06, step=7122] Training: 71%|███████ | 7123/10000 [1:30:05<36:21, 1.32it/s, loss=0.0101, lr=7.22e-06, step=7123] Training: 71%|███████ | 7124/10000 [1:30:06<38:48, 1.24it/s, loss=0.0101, lr=7.22e-06, step=7123] Training: 71%|███████ | 7124/10000 [1:30:06<38:48, 1.24it/s, loss=0.0091, lr=7.22e-06, step=7124] Training: 71%|███████▏ | 7125/10000 [1:30:07<39:28, 1.21it/s, loss=0.0091, lr=7.22e-06, step=7124] Training: 71%|███████▏ | 7125/10000 [1:30:07<39:28, 1.21it/s, loss=0.0091, lr=7.22e-06, step=7125] Training: 71%|███████▏ | 7126/10000 [1:30:07<35:37, 1.34it/s, loss=0.0091, lr=7.22e-06, step=7125] Training: 71%|███████▏ | 7126/10000 [1:30:07<35:37, 1.34it/s, loss=0.0016, lr=7.21e-06, step=7126] Training: 71%|███████▏ | 7127/10000 [1:30:08<36:00, 1.33it/s, loss=0.0016, lr=7.21e-06, step=7126] Training: 71%|███████▏ | 7127/10000 [1:30:08<36:00, 1.33it/s, loss=0.0044, lr=7.21e-06, step=7127] Training: 71%|███████▏ | 7128/10000 [1:30:09<32:54, 1.45it/s, loss=0.0044, lr=7.21e-06, step=7127] Training: 71%|███████▏ | 7128/10000 [1:30:09<32:54, 1.45it/s, loss=0.0123, lr=7.21e-06, step=7128] Training: 71%|███████▏ | 7129/10000 [1:30:09<32:05, 1.49it/s, loss=0.0123, lr=7.21e-06, step=7128] Training: 71%|███████▏ | 7129/10000 [1:30:09<32:05, 1.49it/s, loss=0.0146, lr=7.20e-06, step=7129]17:36:16.842 [I] step=7130 loss=0.0051 smoothed_loss=0.0090 lr=7.21e-06 grad_norm=0.5466 step_time=0.6041s data_time=0.1267s it/s=1.369 eta_to_10000=2096.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0098 grad_action_out_proj=0.0911 grad_shared_expert=0.3407 (10775:train_pytorch.py:850) + Training: 71%|███████▏ | 7130/10000 [1:30:10<33:08, 1.44it/s, loss=0.0146, lr=7.20e-06, step=7129] Training: 71%|███████▏ | 7130/10000 [1:30:10<33:08, 1.44it/s, loss=0.0051, lr=7.20e-06, step=7130] Training: 71%|███████▏ | 7131/10000 [1:30:11<34:33, 1.38it/s, loss=0.0051, lr=7.20e-06, step=7130] Training: 71%|███████▏ | 7131/10000 [1:30:11<34:33, 1.38it/s, loss=0.0136, lr=7.20e-06, step=7131] Training: 71%|███████▏ | 7132/10000 [1:30:11<32:45, 1.46it/s, loss=0.0136, lr=7.20e-06, step=7131] Training: 71%|███████▏ | 7132/10000 [1:30:11<32:45, 1.46it/s, loss=0.0134, lr=7.19e-06, step=7132] Training: 71%|███████▏ | 7133/10000 [1:30:12<30:18, 1.58it/s, loss=0.0134, lr=7.19e-06, step=7132] Training: 71%|███████▏ | 7133/10000 [1:30:12<30:18, 1.58it/s, loss=0.0060, lr=7.19e-06, step=7133] Training: 71%|███████▏ | 7134/10000 [1:30:12<28:18, 1.69it/s, loss=0.0060, lr=7.19e-06, step=7133] Training: 71%|███████▏ | 7134/10000 [1:30:12<28:18, 1.69it/s, loss=0.0099, lr=7.19e-06, step=7134] Training: 71%|███████▏ | 7135/10000 [1:30:13<33:12, 1.44it/s, loss=0.0099, lr=7.19e-06, step=7134] Training: 71%|███████▏ | 7135/10000 [1:30:13<33:12, 1.44it/s, loss=0.0159, lr=7.19e-06, step=7135] Training: 71%|███████▏ | 7136/10000 [1:30:14<30:42, 1.55it/s, loss=0.0159, lr=7.19e-06, step=7135] Training: 71%|███████▏ | 7136/10000 [1:30:14<30:42, 1.55it/s, loss=0.0012, lr=7.18e-06, step=7136] Training: 71%|███████▏ | 7137/10000 [1:30:14<31:39, 1.51it/s, loss=0.0012, lr=7.18e-06, step=7136] Training: 71%|███████▏ | 7137/10000 [1:30:14<31:39, 1.51it/s, loss=0.0119, lr=7.18e-06, step=7137] Training: 71%|███████▏ | 7138/10000 [1:30:15<33:08, 1.44it/s, loss=0.0119, lr=7.18e-06, step=7137] Training: 71%|███████▏ | 7138/10000 [1:30:15<33:08, 1.44it/s, loss=0.0031, lr=7.18e-06, step=7138] Training: 71%|███████▏ | 7139/10000 [1:30:16<37:06, 1.28it/s, loss=0.0031, lr=7.18e-06, step=7138] Training: 71%|███████▏ | 7139/10000 [1:30:16<37:06, 1.28it/s, loss=0.0110, lr=7.17e-06, step=7139]17:36:23.692 [I] step=7140 loss=0.0035 smoothed_loss=0.0085 lr=7.18e-06 grad_norm=0.3823 step_time=0.5697s data_time=0.1154s it/s=1.460 eta_to_10000=1959.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0084 grad_action_out_proj=0.0832 grad_shared_expert=0.3281 (10775:train_pytorch.py:850) + Training: 71%|███████▏ | 7140/10000 [1:30:17<33:41, 1.41it/s, loss=0.0110, lr=7.17e-06, step=7139] Training: 71%|███████▏ | 7140/10000 [1:30:17<33:41, 1.41it/s, loss=0.0035, lr=7.17e-06, step=7140] Training: 71%|███████▏ | 7141/10000 [1:30:17<31:46, 1.50it/s, loss=0.0035, lr=7.17e-06, step=7140] Training: 71%|███████▏ | 7141/10000 [1:30:17<31:46, 1.50it/s, loss=0.0046, lr=7.17e-06, step=7141] Training: 71%|███████▏ | 7142/10000 [1:30:18<29:31, 1.61it/s, loss=0.0046, lr=7.17e-06, step=7141] Training: 71%|███████▏ | 7142/10000 [1:30:18<29:31, 1.61it/s, loss=0.0192, lr=7.16e-06, step=7142] Training: 71%|███████▏ | 7143/10000 [1:30:19<30:44, 1.55it/s, loss=0.0192, lr=7.16e-06, step=7142] Training: 71%|███████▏ | 7143/10000 [1:30:19<30:44, 1.55it/s, loss=0.0011, lr=7.16e-06, step=7143] Training: 71%|███████▏ | 7144/10000 [1:30:19<31:19, 1.52it/s, loss=0.0011, lr=7.16e-06, step=7143] Training: 71%|███████▏ | 7144/10000 [1:30:19<31:19, 1.52it/s, loss=0.0019, lr=7.16e-06, step=7144] Training: 71%|███████▏ | 7145/10000 [1:30:20<30:28, 1.56it/s, loss=0.0019, lr=7.16e-06, step=7144] Training: 71%|███████▏ | 7145/10000 [1:30:20<30:28, 1.56it/s, loss=0.0548, lr=7.16e-06, step=7145] Training: 71%|███████▏ | 7146/10000 [1:30:21<33:31, 1.42it/s, loss=0.0548, lr=7.16e-06, step=7145] Training: 71%|███████▏ | 7146/10000 [1:30:21<33:31, 1.42it/s, loss=0.0206, lr=7.15e-06, step=7146] Training: 71%|███████▏ | 7147/10000 [1:30:21<30:57, 1.54it/s, loss=0.0206, lr=7.15e-06, step=7146] Training: 71%|███████▏ | 7147/10000 [1:30:21<30:57, 1.54it/s, loss=0.0329, lr=7.15e-06, step=7147] Training: 71%|███████▏ | 7148/10000 [1:30:22<31:31, 1.51it/s, loss=0.0329, lr=7.15e-06, step=7147] Training: 71%|███████▏ | 7148/10000 [1:30:22<31:31, 1.51it/s, loss=0.0036, lr=7.15e-06, step=7148] Training: 71%|███████▏ | 7149/10000 [1:30:23<32:22, 1.47it/s, loss=0.0036, lr=7.15e-06, step=7148] Training: 71%|███████▏ | 7149/10000 [1:30:23<32:22, 1.47it/s, loss=0.0075, lr=7.14e-06, step=7149]17:36:30.549 [I] step=7150 loss=0.0848 smoothed_loss=0.0206 lr=7.15e-06 grad_norm=0.4636 step_time=0.5651s data_time=0.1206s it/s=1.459 eta_to_10000=1953.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0409 grad_action_out_proj=0.2027 grad_shared_expert=0.5652 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7150/10000 [1:30:24<36:42, 1.29it/s, loss=0.0075, lr=7.14e-06, step=7149] Training: 72%|███████▏ | 7150/10000 [1:30:24<36:42, 1.29it/s, loss=0.0848, lr=7.14e-06, step=7150] Training: 72%|███████▏ | 7151/10000 [1:30:25<40:19, 1.18it/s, loss=0.0848, lr=7.14e-06, step=7150] Training: 72%|███████▏ | 7151/10000 [1:30:25<40:19, 1.18it/s, loss=0.0092, lr=7.14e-06, step=7151] Training: 72%|███████▏ | 7152/10000 [1:30:25<37:41, 1.26it/s, loss=0.0092, lr=7.14e-06, step=7151] Training: 72%|███████▏ | 7152/10000 [1:30:25<37:41, 1.26it/s, loss=0.0998, lr=7.13e-06, step=7152] Training: 72%|███████▏ | 7153/10000 [1:30:26<39:52, 1.19it/s, loss=0.0998, lr=7.13e-06, step=7152] Training: 72%|███████▏ | 7153/10000 [1:30:26<39:52, 1.19it/s, loss=0.0034, lr=7.13e-06, step=7153] Training: 72%|███████▏ | 7154/10000 [1:30:27<37:38, 1.26it/s, loss=0.0034, lr=7.13e-06, step=7153] Training: 72%|███████▏ | 7154/10000 [1:30:27<37:38, 1.26it/s, loss=0.0011, lr=7.13e-06, step=7154] Training: 72%|███████▏ | 7155/10000 [1:30:28<37:11, 1.27it/s, loss=0.0011, lr=7.13e-06, step=7154] Training: 72%|███████▏ | 7155/10000 [1:30:28<37:11, 1.27it/s, loss=0.0056, lr=7.13e-06, step=7155] Training: 72%|███████▏ | 7156/10000 [1:30:28<37:12, 1.27it/s, loss=0.0056, lr=7.13e-06, step=7155] Training: 72%|███████▏ | 7156/10000 [1:30:28<37:12, 1.27it/s, loss=0.0419, lr=7.12e-06, step=7156] Training: 72%|███████▏ | 7157/10000 [1:30:29<39:36, 1.20it/s, loss=0.0419, lr=7.12e-06, step=7156] Training: 72%|███████▏ | 7157/10000 [1:30:29<39:36, 1.20it/s, loss=0.0050, lr=7.12e-06, step=7157] Training: 72%|███████▏ | 7158/10000 [1:30:30<35:26, 1.34it/s, loss=0.0050, lr=7.12e-06, step=7157] Training: 72%|███████▏ | 7158/10000 [1:30:30<35:26, 1.34it/s, loss=0.0157, lr=7.12e-06, step=7158] Training: 72%|███████▏ | 7159/10000 [1:30:31<35:49, 1.32it/s, loss=0.0157, lr=7.12e-06, step=7158] Training: 72%|███████▏ | 7159/10000 [1:30:31<35:49, 1.32it/s, loss=0.0097, lr=7.11e-06, step=7159]17:36:38.443 [I] step=7160 loss=0.0156 smoothed_loss=0.0192 lr=7.12e-06 grad_norm=0.4121 step_time=0.6401s data_time=0.1493s it/s=1.267 eta_to_10000=2241.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0317 grad_action_out_proj=0.1752 grad_shared_expert=0.5001 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7160/10000 [1:30:32<35:38, 1.33it/s, loss=0.0097, lr=7.11e-06, step=7159] Training: 72%|███████▏ | 7160/10000 [1:30:32<35:38, 1.33it/s, loss=0.0156, lr=7.11e-06, step=7160] Training: 72%|███████▏ | 7161/10000 [1:30:32<32:20, 1.46it/s, loss=0.0156, lr=7.11e-06, step=7160] Training: 72%|███████▏ | 7161/10000 [1:30:32<32:20, 1.46it/s, loss=0.0008, lr=7.11e-06, step=7161] Training: 72%|███████▏ | 7162/10000 [1:30:33<29:40, 1.59it/s, loss=0.0008, lr=7.11e-06, step=7161] Training: 72%|███████▏ | 7162/10000 [1:30:33<29:40, 1.59it/s, loss=0.0229, lr=7.10e-06, step=7162] Training: 72%|███████▏ | 7163/10000 [1:30:33<27:50, 1.70it/s, loss=0.0229, lr=7.10e-06, step=7162] Training: 72%|███████▏ | 7163/10000 [1:30:33<27:50, 1.70it/s, loss=0.0022, lr=7.10e-06, step=7163] Training: 72%|███████▏ | 7164/10000 [1:30:34<34:32, 1.37it/s, loss=0.0022, lr=7.10e-06, step=7163] Training: 72%|███████▏ | 7164/10000 [1:30:34<34:32, 1.37it/s, loss=0.0043, lr=7.10e-06, step=7164] Training: 72%|███████▏ | 7165/10000 [1:30:35<31:32, 1.50it/s, loss=0.0043, lr=7.10e-06, step=7164] Training: 72%|███████▏ | 7165/10000 [1:30:35<31:32, 1.50it/s, loss=0.0031, lr=7.10e-06, step=7165] Training: 72%|███████▏ | 7166/10000 [1:30:35<29:33, 1.60it/s, loss=0.0031, lr=7.10e-06, step=7165] Training: 72%|███████▏ | 7166/10000 [1:30:35<29:33, 1.60it/s, loss=0.0031, lr=7.09e-06, step=7166] Training: 72%|███████▏ | 7167/10000 [1:30:36<32:18, 1.46it/s, loss=0.0031, lr=7.09e-06, step=7166] Training: 72%|███████▏ | 7167/10000 [1:30:36<32:18, 1.46it/s, loss=0.0039, lr=7.09e-06, step=7167] Training: 72%|███████▏ | 7168/10000 [1:30:36<29:57, 1.58it/s, loss=0.0039, lr=7.09e-06, step=7167] Training: 72%|███████▏ | 7168/10000 [1:30:36<29:57, 1.58it/s, loss=0.0068, lr=7.09e-06, step=7168] Training: 72%|███████▏ | 7169/10000 [1:30:37<28:01, 1.68it/s, loss=0.0068, lr=7.09e-06, step=7168] Training: 72%|███████▏ | 7169/10000 [1:30:37<28:01, 1.68it/s, loss=0.0221, lr=7.08e-06, step=7169]17:36:44.591 [I] step=7170 loss=0.0055 smoothed_loss=0.0118 lr=7.09e-06 grad_norm=0.3978 step_time=0.5222s data_time=0.0926s it/s=1.627 eta_to_10000=1739.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0128 grad_action_out_proj=0.0908 grad_shared_expert=0.3423 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7170/10000 [1:30:38<29:15, 1.61it/s, loss=0.0221, lr=7.08e-06, step=7169] Training: 72%|███████▏ | 7170/10000 [1:30:38<29:15, 1.61it/s, loss=0.0055, lr=7.08e-06, step=7170] Training: 72%|███████▏ | 7171/10000 [1:30:38<32:18, 1.46it/s, loss=0.0055, lr=7.08e-06, step=7170] Training: 72%|███████▏ | 7171/10000 [1:30:38<32:18, 1.46it/s, loss=0.0043, lr=7.08e-06, step=7171] Training: 72%|███████▏ | 7172/10000 [1:30:40<37:08, 1.27it/s, loss=0.0043, lr=7.08e-06, step=7171] Training: 72%|███████▏ | 7172/10000 [1:30:40<37:08, 1.27it/s, loss=0.0015, lr=7.07e-06, step=7172] Training: 72%|███████▏ | 7173/10000 [1:30:40<36:38, 1.29it/s, loss=0.0015, lr=7.07e-06, step=7172] Training: 72%|███████▏ | 7173/10000 [1:30:40<36:38, 1.29it/s, loss=0.0108, lr=7.07e-06, step=7173] Training: 72%|███████▏ | 7174/10000 [1:30:41<38:23, 1.23it/s, loss=0.0108, lr=7.07e-06, step=7173] Training: 72%|███████▏ | 7174/10000 [1:30:41<38:23, 1.23it/s, loss=0.0035, lr=7.07e-06, step=7174] Training: 72%|███████▏ | 7175/10000 [1:30:42<37:34, 1.25it/s, loss=0.0035, lr=7.07e-06, step=7174] Training: 72%|███████▏ | 7175/10000 [1:30:42<37:34, 1.25it/s, loss=0.0057, lr=7.07e-06, step=7175] Training: 72%|███████▏ | 7176/10000 [1:30:42<33:40, 1.40it/s, loss=0.0057, lr=7.07e-06, step=7175] Training: 72%|███████▏ | 7176/10000 [1:30:42<33:40, 1.40it/s, loss=0.0053, lr=7.06e-06, step=7176] Training: 72%|███████▏ | 7177/10000 [1:30:43<38:02, 1.24it/s, loss=0.0053, lr=7.06e-06, step=7176] Training: 72%|███████▏ | 7177/10000 [1:30:43<38:02, 1.24it/s, loss=0.0069, lr=7.06e-06, step=7177] Training: 72%|███████▏ | 7178/10000 [1:30:44<38:29, 1.22it/s, loss=0.0069, lr=7.06e-06, step=7177] Training: 72%|███████▏ | 7178/10000 [1:30:44<38:29, 1.22it/s, loss=0.0072, lr=7.06e-06, step=7178] Training: 72%|███████▏ | 7179/10000 [1:30:45<37:37, 1.25it/s, loss=0.0072, lr=7.06e-06, step=7178] Training: 72%|███████▏ | 7179/10000 [1:30:45<37:37, 1.25it/s, loss=0.0074, lr=7.05e-06, step=7179]17:36:52.693 [I] step=7180 loss=0.0215 smoothed_loss=0.0096 lr=7.06e-06 grad_norm=0.5249 step_time=0.6436s data_time=0.1667s it/s=1.234 eta_to_10000=2284.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0358 grad_action_out_proj=0.1447 grad_shared_expert=0.6794 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7180/10000 [1:30:46<35:51, 1.31it/s, loss=0.0074, lr=7.05e-06, step=7179] Training: 72%|███████▏ | 7180/10000 [1:30:46<35:51, 1.31it/s, loss=0.0215, lr=7.05e-06, step=7180] Training: 72%|███████▏ | 7181/10000 [1:30:47<45:28, 1.03it/s, loss=0.0215, lr=7.05e-06, step=7180] Training: 72%|███████▏ | 7181/10000 [1:30:47<45:28, 1.03it/s, loss=0.0261, lr=7.05e-06, step=7181] Training: 72%|███████▏ | 7182/10000 [1:30:48<43:38, 1.08it/s, loss=0.0261, lr=7.05e-06, step=7181] Training: 72%|███████▏ | 7182/10000 [1:30:48<43:38, 1.08it/s, loss=0.0048, lr=7.04e-06, step=7182] Training: 72%|███████▏ | 7183/10000 [1:30:49<42:30, 1.10it/s, loss=0.0048, lr=7.04e-06, step=7182] Training: 72%|███████▏ | 7183/10000 [1:30:49<42:30, 1.10it/s, loss=0.0110, lr=7.04e-06, step=7183] Training: 72%|███████▏ | 7184/10000 [1:30:49<36:46, 1.28it/s, loss=0.0110, lr=7.04e-06, step=7183] Training: 72%|███████▏ | 7184/10000 [1:30:49<36:46, 1.28it/s, loss=0.0132, lr=7.04e-06, step=7184] Training: 72%|███████▏ | 7185/10000 [1:30:50<37:22, 1.26it/s, loss=0.0132, lr=7.04e-06, step=7184] Training: 72%|███████▏ | 7185/10000 [1:30:50<37:22, 1.26it/s, loss=0.0095, lr=7.04e-06, step=7185] Training: 72%|███████▏ | 7186/10000 [1:30:51<36:47, 1.27it/s, loss=0.0095, lr=7.04e-06, step=7185] Training: 72%|███████▏ | 7186/10000 [1:30:51<36:47, 1.27it/s, loss=0.0222, lr=7.03e-06, step=7186] Training: 72%|███████▏ | 7187/10000 [1:30:52<40:39, 1.15it/s, loss=0.0222, lr=7.03e-06, step=7186] Training: 72%|███████▏ | 7187/10000 [1:30:52<40:39, 1.15it/s, loss=0.0126, lr=7.03e-06, step=7187] Training: 72%|███████▏ | 7188/10000 [1:30:53<44:47, 1.05it/s, loss=0.0126, lr=7.03e-06, step=7187] Training: 72%|███████▏ | 7188/10000 [1:30:53<44:47, 1.05it/s, loss=0.0127, lr=7.03e-06, step=7188] Training: 72%|███████▏ | 7189/10000 [1:30:54<43:57, 1.07it/s, loss=0.0127, lr=7.03e-06, step=7188] Training: 72%|███████▏ | 7189/10000 [1:30:54<43:57, 1.07it/s, loss=0.0017, lr=7.02e-06, step=7189]17:37:01.877 [I] step=7190 loss=0.0084 smoothed_loss=0.0107 lr=7.03e-06 grad_norm=0.4431 step_time=0.7177s data_time=0.2006s it/s=1.089 eta_to_10000=2580.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.0795 grad_shared_expert=0.5247 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7190/10000 [1:30:55<42:38, 1.10it/s, loss=0.0017, lr=7.02e-06, step=7189] Training: 72%|███████▏ | 7190/10000 [1:30:55<42:38, 1.10it/s, loss=0.0084, lr=7.02e-06, step=7190] Training: 72%|███████▏ | 7191/10000 [1:30:56<39:45, 1.18it/s, loss=0.0084, lr=7.02e-06, step=7190] Training: 72%|███████▏ | 7191/10000 [1:30:56<39:45, 1.18it/s, loss=0.0072, lr=7.02e-06, step=7191] Training: 72%|███████▏ | 7192/10000 [1:30:56<38:53, 1.20it/s, loss=0.0072, lr=7.02e-06, step=7191] Training: 72%|███████▏ | 7192/10000 [1:30:56<38:53, 1.20it/s, loss=0.0130, lr=7.01e-06, step=7192] Training: 72%|███████▏ | 7193/10000 [1:30:57<34:37, 1.35it/s, loss=0.0130, lr=7.01e-06, step=7192] Training: 72%|███████▏ | 7193/10000 [1:30:57<34:37, 1.35it/s, loss=0.0165, lr=7.01e-06, step=7193] Training: 72%|███████▏ | 7194/10000 [1:30:58<38:24, 1.22it/s, loss=0.0165, lr=7.01e-06, step=7193] Training: 72%|███████▏ | 7194/10000 [1:30:58<38:24, 1.22it/s, loss=0.0044, lr=7.01e-06, step=7194] Training: 72%|███████▏ | 7195/10000 [1:30:59<37:54, 1.23it/s, loss=0.0044, lr=7.01e-06, step=7194] Training: 72%|███████▏ | 7195/10000 [1:30:59<37:54, 1.23it/s, loss=0.0237, lr=7.01e-06, step=7195] Training: 72%|███████▏ | 7196/10000 [1:30:59<33:46, 1.38it/s, loss=0.0237, lr=7.01e-06, step=7195] Training: 72%|███████▏ | 7196/10000 [1:30:59<33:46, 1.38it/s, loss=0.0078, lr=7.00e-06, step=7196] Training: 72%|███████▏ | 7197/10000 [1:31:00<34:30, 1.35it/s, loss=0.0078, lr=7.00e-06, step=7196] Training: 72%|███████▏ | 7197/10000 [1:31:00<34:30, 1.35it/s, loss=0.0079, lr=7.00e-06, step=7197] Training: 72%|███████▏ | 7198/10000 [1:31:01<36:14, 1.29it/s, loss=0.0079, lr=7.00e-06, step=7197] Training: 72%|███████▏ | 7198/10000 [1:31:01<36:14, 1.29it/s, loss=0.0026, lr=7.00e-06, step=7198] Training: 72%|███████▏ | 7199/10000 [1:31:01<32:24, 1.44it/s, loss=0.0026, lr=7.00e-06, step=7198] Training: 72%|███████▏ | 7199/10000 [1:31:01<32:24, 1.44it/s, loss=0.0076, lr=6.99e-06, step=7199]17:37:09.119 [I] step=7200 loss=0.0158 smoothed_loss=0.0106 lr=7.00e-06 grad_norm=0.4388 step_time=0.5786s data_time=0.1457s it/s=1.381 eta_to_10000=2027.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0873 grad_shared_expert=0.2515 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7200/10000 [1:31:02<33:21, 1.40it/s, loss=0.0076, lr=6.99e-06, step=7199] Training: 72%|███████▏ | 7200/10000 [1:31:02<33:21, 1.40it/s, loss=0.0158, lr=6.99e-06, step=7200] Training: 72%|███████▏ | 7201/10000 [1:31:03<30:49, 1.51it/s, loss=0.0158, lr=6.99e-06, step=7200] Training: 72%|███████▏ | 7201/10000 [1:31:03<30:49, 1.51it/s, loss=0.0012, lr=6.99e-06, step=7201] Training: 72%|███████▏ | 7202/10000 [1:31:04<37:53, 1.23it/s, loss=0.0012, lr=6.99e-06, step=7201] Training: 72%|███████▏ | 7202/10000 [1:31:04<37:53, 1.23it/s, loss=0.0107, lr=6.98e-06, step=7202] Training: 72%|███████▏ | 7203/10000 [1:31:05<37:40, 1.24it/s, loss=0.0107, lr=6.98e-06, step=7202] Training: 72%|███████▏ | 7203/10000 [1:31:05<37:40, 1.24it/s, loss=0.0426, lr=6.98e-06, step=7203] Training: 72%|███████▏ | 7204/10000 [1:31:05<37:19, 1.25it/s, loss=0.0426, lr=6.98e-06, step=7203] Training: 72%|███████▏ | 7204/10000 [1:31:05<37:19, 1.25it/s, loss=0.0092, lr=6.98e-06, step=7204] Training: 72%|███████▏ | 7205/10000 [1:31:06<34:16, 1.36it/s, loss=0.0092, lr=6.98e-06, step=7204] Training: 72%|███████▏ | 7205/10000 [1:31:06<34:16, 1.36it/s, loss=0.0030, lr=6.98e-06, step=7205] Training: 72%|███████▏ | 7206/10000 [1:31:07<34:32, 1.35it/s, loss=0.0030, lr=6.98e-06, step=7205] Training: 72%|███████▏ | 7206/10000 [1:31:07<34:32, 1.35it/s, loss=0.0043, lr=6.97e-06, step=7206] Training: 72%|███████▏ | 7207/10000 [1:31:07<31:29, 1.48it/s, loss=0.0043, lr=6.97e-06, step=7206] Training: 72%|███████▏ | 7207/10000 [1:31:07<31:29, 1.48it/s, loss=0.0033, lr=6.97e-06, step=7207] Training: 72%|███████▏ | 7208/10000 [1:31:08<32:31, 1.43it/s, loss=0.0033, lr=6.97e-06, step=7207] Training: 72%|███████▏ | 7208/10000 [1:31:08<32:31, 1.43it/s, loss=0.0076, lr=6.97e-06, step=7208] Training: 72%|███████▏ | 7209/10000 [1:31:09<36:55, 1.26it/s, loss=0.0076, lr=6.97e-06, step=7208] Training: 72%|███████▏ | 7209/10000 [1:31:09<36:55, 1.26it/s, loss=0.0144, lr=6.96e-06, step=7209]17:37:17.149 [I] step=7210 loss=0.0173 smoothed_loss=0.0111 lr=6.97e-06 grad_norm=0.4576 step_time=0.6400s data_time=0.1630s it/s=1.245 eta_to_10000=2240.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1131 grad_shared_expert=0.3729 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7210/10000 [1:31:10<41:24, 1.12it/s, loss=0.0144, lr=6.96e-06, step=7209] Training: 72%|███████▏ | 7210/10000 [1:31:10<41:24, 1.12it/s, loss=0.0173, lr=6.96e-06, step=7210] Training: 72%|███████▏ | 7211/10000 [1:31:11<39:09, 1.19it/s, loss=0.0173, lr=6.96e-06, step=7210] Training: 72%|███████▏ | 7211/10000 [1:31:11<39:09, 1.19it/s, loss=0.0036, lr=6.96e-06, step=7211] Training: 72%|███████▏ | 7212/10000 [1:31:11<34:20, 1.35it/s, loss=0.0036, lr=6.96e-06, step=7211] Training: 72%|███████▏ | 7212/10000 [1:31:11<34:20, 1.35it/s, loss=0.0158, lr=6.96e-06, step=7212] Training: 72%|███████▏ | 7213/10000 [1:31:12<34:02, 1.36it/s, loss=0.0158, lr=6.96e-06, step=7212] Training: 72%|███████▏ | 7213/10000 [1:31:12<34:02, 1.36it/s, loss=0.0100, lr=6.95e-06, step=7213] Training: 72%|███████▏ | 7214/10000 [1:31:13<33:17, 1.39it/s, loss=0.0100, lr=6.95e-06, step=7213] Training: 72%|███████▏ | 7214/10000 [1:31:13<33:17, 1.39it/s, loss=0.0218, lr=6.95e-06, step=7214] Training: 72%|███████▏ | 7215/10000 [1:31:13<30:42, 1.51it/s, loss=0.0218, lr=6.95e-06, step=7214] Training: 72%|███████▏ | 7215/10000 [1:31:13<30:42, 1.51it/s, loss=0.0215, lr=6.95e-06, step=7215] Training: 72%|███████▏ | 7216/10000 [1:31:14<34:37, 1.34it/s, loss=0.0215, lr=6.95e-06, step=7215] Training: 72%|███████▏ | 7216/10000 [1:31:14<34:37, 1.34it/s, loss=0.0057, lr=6.94e-06, step=7216] Training: 72%|███████▏ | 7217/10000 [1:31:15<38:32, 1.20it/s, loss=0.0057, lr=6.94e-06, step=7216] Training: 72%|███████▏ | 7217/10000 [1:31:15<38:32, 1.20it/s, loss=0.0044, lr=6.94e-06, step=7217] Training: 72%|███████▏ | 7218/10000 [1:31:16<38:36, 1.20it/s, loss=0.0044, lr=6.94e-06, step=7217] Training: 72%|███████▏ | 7218/10000 [1:31:16<38:36, 1.20it/s, loss=0.0120, lr=6.94e-06, step=7218] Training: 72%|███████▏ | 7219/10000 [1:31:17<39:30, 1.17it/s, loss=0.0120, lr=6.94e-06, step=7218] Training: 72%|███████▏ | 7219/10000 [1:31:17<39:30, 1.17it/s, loss=0.0084, lr=6.93e-06, step=7219]17:37:24.973 [I] step=7220 loss=0.0108 smoothed_loss=0.0111 lr=6.94e-06 grad_norm=0.4390 step_time=0.6236s data_time=0.1589s it/s=1.278 eta_to_10000=2174.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0865 grad_shared_expert=0.3517 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7220/10000 [1:31:18<40:57, 1.13it/s, loss=0.0084, lr=6.93e-06, step=7219] Training: 72%|███████▏ | 7220/10000 [1:31:18<40:57, 1.13it/s, loss=0.0108, lr=6.93e-06, step=7220] Training: 72%|███████▏ | 7221/10000 [1:31:19<41:14, 1.12it/s, loss=0.0108, lr=6.93e-06, step=7220] Training: 72%|███████▏ | 7221/10000 [1:31:19<41:14, 1.12it/s, loss=0.0057, lr=6.93e-06, step=7221] Training: 72%|███████▏ | 7222/10000 [1:31:19<36:10, 1.28it/s, loss=0.0057, lr=6.93e-06, step=7221] Training: 72%|███████▏ | 7222/10000 [1:31:19<36:10, 1.28it/s, loss=0.0035, lr=6.93e-06, step=7222] Training: 72%|███████▏ | 7223/10000 [1:31:20<39:11, 1.18it/s, loss=0.0035, lr=6.93e-06, step=7222] Training: 72%|███████▏ | 7223/10000 [1:31:20<39:11, 1.18it/s, loss=0.0211, lr=6.92e-06, step=7223] Training: 72%|███████▏ | 7224/10000 [1:31:22<42:37, 1.09it/s, loss=0.0211, lr=6.92e-06, step=7223] Training: 72%|███████▏ | 7224/10000 [1:31:22<42:37, 1.09it/s, loss=0.0011, lr=6.92e-06, step=7224] Training: 72%|███████▏ | 7225/10000 [1:31:22<42:05, 1.10it/s, loss=0.0011, lr=6.92e-06, step=7224] Training: 72%|███████▏ | 7225/10000 [1:31:22<42:05, 1.10it/s, loss=0.0051, lr=6.92e-06, step=7225] Training: 72%|███████▏ | 7226/10000 [1:31:23<36:57, 1.25it/s, loss=0.0051, lr=6.92e-06, step=7225] Training: 72%|███████▏ | 7226/10000 [1:31:23<36:57, 1.25it/s, loss=0.0019, lr=6.91e-06, step=7226] Training: 72%|███████▏ | 7227/10000 [1:31:24<34:03, 1.36it/s, loss=0.0019, lr=6.91e-06, step=7226] Training: 72%|███████▏ | 7227/10000 [1:31:24<34:03, 1.36it/s, loss=0.0085, lr=6.91e-06, step=7227] Training: 72%|███████▏ | 7228/10000 [1:31:24<34:18, 1.35it/s, loss=0.0085, lr=6.91e-06, step=7227] Training: 72%|███████▏ | 7228/10000 [1:31:24<34:18, 1.35it/s, loss=0.0048, lr=6.91e-06, step=7228] Training: 72%|███████▏ | 7229/10000 [1:31:25<34:04, 1.36it/s, loss=0.0048, lr=6.91e-06, step=7228] Training: 72%|███████▏ | 7229/10000 [1:31:25<34:04, 1.36it/s, loss=0.0018, lr=6.90e-06, step=7229]17:37:32.631 [I] step=7230 loss=0.0114 smoothed_loss=0.0080 lr=6.92e-06 grad_norm=0.3283 step_time=0.6062s data_time=0.1594s it/s=1.306 eta_to_10000=2120.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0125 grad_action_out_proj=0.0881 grad_shared_expert=0.2713 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7230/10000 [1:31:26<32:35, 1.42it/s, loss=0.0018, lr=6.90e-06, step=7229] Training: 72%|███████▏ | 7230/10000 [1:31:26<32:35, 1.42it/s, loss=0.0114, lr=6.90e-06, step=7230] Training: 72%|███████▏ | 7231/10000 [1:31:27<37:09, 1.24it/s, loss=0.0114, lr=6.90e-06, step=7230] Training: 72%|███████▏ | 7231/10000 [1:31:27<37:09, 1.24it/s, loss=0.0070, lr=6.90e-06, step=7231] Training: 72%|███████▏ | 7232/10000 [1:31:27<33:30, 1.38it/s, loss=0.0070, lr=6.90e-06, step=7231] Training: 72%|███████▏ | 7232/10000 [1:31:27<33:30, 1.38it/s, loss=0.0160, lr=6.90e-06, step=7232] Training: 72%|███████▏ | 7233/10000 [1:31:28<32:52, 1.40it/s, loss=0.0160, lr=6.90e-06, step=7232] Training: 72%|███████▏ | 7233/10000 [1:31:28<32:52, 1.40it/s, loss=0.0074, lr=6.89e-06, step=7233] Training: 72%|███████▏ | 7234/10000 [1:31:28<29:50, 1.54it/s, loss=0.0074, lr=6.89e-06, step=7233] Training: 72%|███████▏ | 7234/10000 [1:31:28<29:50, 1.54it/s, loss=0.0060, lr=6.89e-06, step=7234] Training: 72%|███████▏ | 7235/10000 [1:31:29<28:26, 1.62it/s, loss=0.0060, lr=6.89e-06, step=7234] Training: 72%|███████▏ | 7235/10000 [1:31:29<28:26, 1.62it/s, loss=0.0007, lr=6.89e-06, step=7235] Training: 72%|███████▏ | 7236/10000 [1:31:30<29:33, 1.56it/s, loss=0.0007, lr=6.89e-06, step=7235] Training: 72%|███████▏ | 7236/10000 [1:31:30<29:33, 1.56it/s, loss=0.0059, lr=6.88e-06, step=7236] Training: 72%|███████▏ | 7237/10000 [1:31:30<30:13, 1.52it/s, loss=0.0059, lr=6.88e-06, step=7236] Training: 72%|███████▏ | 7237/10000 [1:31:30<30:13, 1.52it/s, loss=0.0050, lr=6.88e-06, step=7237] Training: 72%|███████▏ | 7238/10000 [1:31:31<32:11, 1.43it/s, loss=0.0050, lr=6.88e-06, step=7237] Training: 72%|███████▏ | 7238/10000 [1:31:31<32:11, 1.43it/s, loss=0.0219, lr=6.88e-06, step=7238] Training: 72%|███████▏ | 7239/10000 [1:31:32<33:42, 1.37it/s, loss=0.0219, lr=6.88e-06, step=7238] Training: 72%|███████▏ | 7239/10000 [1:31:32<33:42, 1.37it/s, loss=0.0047, lr=6.88e-06, step=7239]17:37:39.601 [I] step=7240 loss=0.0045 smoothed_loss=0.0079 lr=6.89e-06 grad_norm=0.4444 step_time=0.5856s data_time=0.1114s it/s=1.435 eta_to_10000=1923.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0077 grad_action_out_proj=0.0653 grad_shared_expert=0.2503 (10775:train_pytorch.py:850) + Training: 72%|███████▏ | 7240/10000 [1:31:33<32:49, 1.40it/s, loss=0.0047, lr=6.88e-06, step=7239] Training: 72%|███████▏ | 7240/10000 [1:31:33<32:49, 1.40it/s, loss=0.0045, lr=6.87e-06, step=7240] Training: 72%|███████▏ | 7241/10000 [1:31:33<34:07, 1.35it/s, loss=0.0045, lr=6.87e-06, step=7240] Training: 72%|███████▏ | 7241/10000 [1:31:33<34:07, 1.35it/s, loss=0.0177, lr=6.87e-06, step=7241] Training: 72%|███████▏ | 7242/10000 [1:31:34<35:29, 1.30it/s, loss=0.0177, lr=6.87e-06, step=7241] Training: 72%|███████▏ | 7242/10000 [1:31:34<35:29, 1.30it/s, loss=0.0545, lr=6.87e-06, step=7242] Training: 72%|███████▏ | 7243/10000 [1:31:35<35:08, 1.31it/s, loss=0.0545, lr=6.87e-06, step=7242] Training: 72%|███████▏ | 7243/10000 [1:31:35<35:08, 1.31it/s, loss=0.0043, lr=6.86e-06, step=7243] Training: 72%|███████▏ | 7244/10000 [1:31:36<34:16, 1.34it/s, loss=0.0043, lr=6.86e-06, step=7243] Training: 72%|███████▏ | 7244/10000 [1:31:36<34:16, 1.34it/s, loss=0.0017, lr=6.86e-06, step=7244] Training: 72%|███████▏ | 7245/10000 [1:31:37<39:44, 1.16it/s, loss=0.0017, lr=6.86e-06, step=7244] Training: 72%|███████▏ | 7245/10000 [1:31:37<39:44, 1.16it/s, loss=0.0096, lr=6.86e-06, step=7245] Training: 72%|███████▏ | 7246/10000 [1:31:37<35:23, 1.30it/s, loss=0.0096, lr=6.86e-06, step=7245] Training: 72%|███████▏ | 7246/10000 [1:31:37<35:23, 1.30it/s, loss=0.0035, lr=6.85e-06, step=7246] Training: 72%|███████▏ | 7247/10000 [1:31:38<35:12, 1.30it/s, loss=0.0035, lr=6.85e-06, step=7246] Training: 72%|███████▏ | 7247/10000 [1:31:38<35:12, 1.30it/s, loss=0.0072, lr=6.85e-06, step=7247] Training: 72%|███████▏ | 7248/10000 [1:31:39<33:48, 1.36it/s, loss=0.0072, lr=6.85e-06, step=7247] Training: 72%|███████▏ | 7248/10000 [1:31:39<33:48, 1.36it/s, loss=0.0157, lr=6.85e-06, step=7248] Training: 72%|███████▏ | 7249/10000 [1:31:40<34:00, 1.35it/s, loss=0.0157, lr=6.85e-06, step=7248] Training: 72%|███████▏ | 7249/10000 [1:31:40<34:00, 1.35it/s, loss=0.0108, lr=6.85e-06, step=7249]17:37:47.344 [I] step=7250 loss=0.0131 smoothed_loss=0.0110 lr=6.86e-06 grad_norm=0.4561 step_time=0.6163s data_time=0.1581s it/s=1.292 eta_to_10000=2129.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0293 grad_action_out_proj=0.1409 grad_shared_expert=0.5115 (10775:train_pytorch.py:850) + Training: 72%|███████▎ | 7250/10000 [1:31:40<34:23, 1.33it/s, loss=0.0108, lr=6.85e-06, step=7249] Training: 72%|███████▎ | 7250/10000 [1:31:40<34:23, 1.33it/s, loss=0.0131, lr=6.84e-06, step=7250] Training: 73%|███████▎ | 7251/10000 [1:31:41<37:18, 1.23it/s, loss=0.0131, lr=6.84e-06, step=7250] Training: 73%|███████▎ | 7251/10000 [1:31:41<37:18, 1.23it/s, loss=0.0017, lr=6.84e-06, step=7251] Training: 73%|███████▎ | 7252/10000 [1:31:42<35:27, 1.29it/s, loss=0.0017, lr=6.84e-06, step=7251] Training: 73%|███████▎ | 7252/10000 [1:31:42<35:27, 1.29it/s, loss=0.0090, lr=6.84e-06, step=7252] Training: 73%|███████▎ | 7253/10000 [1:31:43<38:09, 1.20it/s, loss=0.0090, lr=6.84e-06, step=7252] Training: 73%|███████▎ | 7253/10000 [1:31:43<38:09, 1.20it/s, loss=0.0048, lr=6.83e-06, step=7253] Training: 73%|███████▎ | 7254/10000 [1:31:44<38:39, 1.18it/s, loss=0.0048, lr=6.83e-06, step=7253] Training: 73%|███████▎ | 7254/10000 [1:31:44<38:39, 1.18it/s, loss=0.0195, lr=6.83e-06, step=7254] Training: 73%|███████▎ | 7255/10000 [1:31:44<33:47, 1.35it/s, loss=0.0195, lr=6.83e-06, step=7254] Training: 73%|███████▎ | 7255/10000 [1:31:44<33:47, 1.35it/s, loss=0.0037, lr=6.83e-06, step=7255] Training: 73%|███████▎ | 7256/10000 [1:31:45<31:25, 1.46it/s, loss=0.0037, lr=6.83e-06, step=7255] Training: 73%|███████▎ | 7256/10000 [1:31:45<31:25, 1.46it/s, loss=0.0030, lr=6.83e-06, step=7256] Training: 73%|███████▎ | 7257/10000 [1:31:46<32:16, 1.42it/s, loss=0.0030, lr=6.83e-06, step=7256] Training: 73%|███████▎ | 7257/10000 [1:31:46<32:16, 1.42it/s, loss=0.0027, lr=6.82e-06, step=7257] Training: 73%|███████▎ | 7258/10000 [1:31:47<36:48, 1.24it/s, loss=0.0027, lr=6.82e-06, step=7257] Training: 73%|███████▎ | 7258/10000 [1:31:47<36:48, 1.24it/s, loss=0.0042, lr=6.82e-06, step=7258] Training: 73%|███████▎ | 7259/10000 [1:31:48<39:36, 1.15it/s, loss=0.0042, lr=6.82e-06, step=7258] Training: 73%|███████▎ | 7259/10000 [1:31:48<39:36, 1.15it/s, loss=0.0041, lr=6.82e-06, step=7259]17:37:55.742 [I] step=7260 loss=0.0017 smoothed_loss=0.0070 lr=6.83e-06 grad_norm=0.3825 step_time=0.6681s data_time=0.1716s it/s=1.191 eta_to_10000=2300.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.0917 grad_shared_expert=0.4045 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7260/10000 [1:31:49<42:08, 1.08it/s, loss=0.0041, lr=6.82e-06, step=7259] Training: 73%|███████▎ | 7260/10000 [1:31:49<42:08, 1.08it/s, loss=0.0017, lr=6.81e-06, step=7260] Training: 73%|███████▎ | 7261/10000 [1:31:50<42:08, 1.08it/s, loss=0.0017, lr=6.81e-06, step=7260] Training: 73%|███████▎ | 7261/10000 [1:31:50<42:08, 1.08it/s, loss=0.0043, lr=6.81e-06, step=7261] Training: 73%|███████▎ | 7262/10000 [1:31:50<37:44, 1.21it/s, loss=0.0043, lr=6.81e-06, step=7261] Training: 73%|███████▎ | 7262/10000 [1:31:50<37:44, 1.21it/s, loss=0.0129, lr=6.81e-06, step=7262] Training: 73%|███████▎ | 7263/10000 [1:31:51<37:32, 1.21it/s, loss=0.0129, lr=6.81e-06, step=7262] Training: 73%|███████▎ | 7263/10000 [1:31:51<37:32, 1.21it/s, loss=0.0029, lr=6.80e-06, step=7263] Training: 73%|███████▎ | 7264/10000 [1:31:52<35:11, 1.30it/s, loss=0.0029, lr=6.80e-06, step=7263] Training: 73%|███████▎ | 7264/10000 [1:31:52<35:11, 1.30it/s, loss=0.0051, lr=6.80e-06, step=7264] Training: 73%|███████▎ | 7265/10000 [1:31:52<32:14, 1.41it/s, loss=0.0051, lr=6.80e-06, step=7264] Training: 73%|███████▎ | 7265/10000 [1:31:52<32:14, 1.41it/s, loss=0.0146, lr=6.80e-06, step=7265] Training: 73%|███████▎ | 7266/10000 [1:31:53<36:29, 1.25it/s, loss=0.0146, lr=6.80e-06, step=7265] Training: 73%|███████▎ | 7266/10000 [1:31:53<36:29, 1.25it/s, loss=0.0059, lr=6.80e-06, step=7266] Training: 73%|███████▎ | 7267/10000 [1:31:54<36:58, 1.23it/s, loss=0.0059, lr=6.80e-06, step=7266] Training: 73%|███████▎ | 7267/10000 [1:31:54<36:58, 1.23it/s, loss=0.0148, lr=6.79e-06, step=7267] Training: 73%|███████▎ | 7268/10000 [1:31:55<39:29, 1.15it/s, loss=0.0148, lr=6.79e-06, step=7267] Training: 73%|███████▎ | 7268/10000 [1:31:55<39:29, 1.15it/s, loss=0.0030, lr=6.79e-06, step=7268] Training: 73%|███████▎ | 7269/10000 [1:31:56<38:32, 1.18it/s, loss=0.0030, lr=6.79e-06, step=7268] Training: 73%|███████▎ | 7269/10000 [1:31:56<38:32, 1.18it/s, loss=0.0074, lr=6.79e-06, step=7269]17:38:03.721 [I] step=7270 loss=0.0131 smoothed_loss=0.0081 lr=6.80e-06 grad_norm=0.3836 step_time=0.6351s data_time=0.1628s it/s=1.253 eta_to_10000=2178.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.0844 grad_shared_expert=0.3436 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7270/10000 [1:31:57<37:34, 1.21it/s, loss=0.0074, lr=6.79e-06, step=7269] Training: 73%|███████▎ | 7270/10000 [1:31:57<37:34, 1.21it/s, loss=0.0131, lr=6.78e-06, step=7270] Training: 73%|███████▎ | 7271/10000 [1:31:58<36:31, 1.25it/s, loss=0.0131, lr=6.78e-06, step=7270] Training: 73%|███████▎ | 7271/10000 [1:31:58<36:31, 1.25it/s, loss=0.0208, lr=6.78e-06, step=7271] Training: 73%|███████▎ | 7272/10000 [1:31:58<32:39, 1.39it/s, loss=0.0208, lr=6.78e-06, step=7271] Training: 73%|███████▎ | 7272/10000 [1:31:58<32:39, 1.39it/s, loss=0.0223, lr=6.78e-06, step=7272] Training: 73%|███████▎ | 7273/10000 [1:31:59<32:12, 1.41it/s, loss=0.0223, lr=6.78e-06, step=7272] Training: 73%|███████▎ | 7273/10000 [1:31:59<32:12, 1.41it/s, loss=0.0062, lr=6.78e-06, step=7273] Training: 73%|███████▎ | 7274/10000 [1:32:00<32:53, 1.38it/s, loss=0.0062, lr=6.78e-06, step=7273] Training: 73%|███████▎ | 7274/10000 [1:32:00<32:53, 1.38it/s, loss=0.0112, lr=6.77e-06, step=7274] Training: 73%|███████▎ | 7275/10000 [1:32:00<31:58, 1.42it/s, loss=0.0112, lr=6.77e-06, step=7274] Training: 73%|███████▎ | 7275/10000 [1:32:00<31:58, 1.42it/s, loss=0.0094, lr=6.77e-06, step=7275] Training: 73%|███████▎ | 7276/10000 [1:32:01<35:11, 1.29it/s, loss=0.0094, lr=6.77e-06, step=7275] Training: 73%|███████▎ | 7276/10000 [1:32:01<35:11, 1.29it/s, loss=0.0056, lr=6.77e-06, step=7276] Training: 73%|███████▎ | 7277/10000 [1:32:02<31:56, 1.42it/s, loss=0.0056, lr=6.77e-06, step=7276] Training: 73%|███████▎ | 7277/10000 [1:32:02<31:56, 1.42it/s, loss=0.0056, lr=6.76e-06, step=7277] Training: 73%|███████▎ | 7278/10000 [1:32:02<31:23, 1.44it/s, loss=0.0056, lr=6.76e-06, step=7277] Training: 73%|███████▎ | 7278/10000 [1:32:02<31:23, 1.44it/s, loss=0.0316, lr=6.76e-06, step=7278] Training: 73%|███████▎ | 7279/10000 [1:32:03<28:36, 1.58it/s, loss=0.0316, lr=6.76e-06, step=7278] Training: 73%|███████▎ | 7279/10000 [1:32:03<28:36, 1.58it/s, loss=0.0034, lr=6.76e-06, step=7279]17:38:10.497 [I] step=7280 loss=0.0073 smoothed_loss=0.0104 lr=6.77e-06 grad_norm=0.4571 step_time=0.5534s data_time=0.1242s it/s=1.476 eta_to_10000=1842.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0131 grad_action_out_proj=0.0955 grad_shared_expert=0.3079 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7280/10000 [1:32:04<30:30, 1.49it/s, loss=0.0034, lr=6.76e-06, step=7279] Training: 73%|███████▎ | 7280/10000 [1:32:04<30:30, 1.49it/s, loss=0.0073, lr=6.76e-06, step=7280] Training: 73%|███████▎ | 7281/10000 [1:32:04<31:51, 1.42it/s, loss=0.0073, lr=6.76e-06, step=7280] Training: 73%|███████▎ | 7281/10000 [1:32:04<31:51, 1.42it/s, loss=0.0057, lr=6.75e-06, step=7281] Training: 73%|███████▎ | 7282/10000 [1:32:05<29:02, 1.56it/s, loss=0.0057, lr=6.75e-06, step=7281] Training: 73%|███████▎ | 7282/10000 [1:32:05<29:02, 1.56it/s, loss=0.0071, lr=6.75e-06, step=7282] Training: 73%|███████▎ | 7283/10000 [1:32:06<31:36, 1.43it/s, loss=0.0071, lr=6.75e-06, step=7282] Training: 73%|███████▎ | 7283/10000 [1:32:06<31:36, 1.43it/s, loss=0.0042, lr=6.75e-06, step=7283] Training: 73%|███████▎ | 7284/10000 [1:32:06<30:48, 1.47it/s, loss=0.0042, lr=6.75e-06, step=7283] Training: 73%|███████▎ | 7284/10000 [1:32:06<30:48, 1.47it/s, loss=0.0146, lr=6.74e-06, step=7284] Training: 73%|███████▎ | 7285/10000 [1:32:07<28:41, 1.58it/s, loss=0.0146, lr=6.74e-06, step=7284] Training: 73%|███████▎ | 7285/10000 [1:32:07<28:41, 1.58it/s, loss=0.0067, lr=6.74e-06, step=7285] Training: 73%|███████▎ | 7286/10000 [1:32:08<31:45, 1.42it/s, loss=0.0067, lr=6.74e-06, step=7285] Training: 73%|███████▎ | 7286/10000 [1:32:08<31:45, 1.42it/s, loss=0.0105, lr=6.74e-06, step=7286] Training: 73%|███████▎ | 7287/10000 [1:32:08<28:50, 1.57it/s, loss=0.0105, lr=6.74e-06, step=7286] Training: 73%|███████▎ | 7287/10000 [1:32:08<28:50, 1.57it/s, loss=0.0089, lr=6.73e-06, step=7287] Training: 73%|███████▎ | 7288/10000 [1:32:09<30:18, 1.49it/s, loss=0.0089, lr=6.73e-06, step=7287] Training: 73%|███████▎ | 7288/10000 [1:32:09<30:18, 1.49it/s, loss=0.0109, lr=6.73e-06, step=7288] Training: 73%|███████▎ | 7289/10000 [1:32:10<32:04, 1.41it/s, loss=0.0109, lr=6.73e-06, step=7288] Training: 73%|███████▎ | 7289/10000 [1:32:10<32:04, 1.41it/s, loss=0.0028, lr=6.73e-06, step=7289]17:38:17.544 [I] step=7290 loss=0.0055 smoothed_loss=0.0085 lr=6.74e-06 grad_norm=0.4757 step_time=0.5731s data_time=0.1316s it/s=1.419 eta_to_10000=1909.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0056 grad_action_out_proj=0.0666 grad_shared_expert=0.2657 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7290/10000 [1:32:11<34:25, 1.31it/s, loss=0.0028, lr=6.73e-06, step=7289] Training: 73%|███████▎ | 7290/10000 [1:32:11<34:25, 1.31it/s, loss=0.0055, lr=6.73e-06, step=7290] Training: 73%|███████▎ | 7291/10000 [1:32:11<32:45, 1.38it/s, loss=0.0055, lr=6.73e-06, step=7290] Training: 73%|███████▎ | 7291/10000 [1:32:11<32:45, 1.38it/s, loss=0.0082, lr=6.72e-06, step=7291] Training: 73%|███████▎ | 7292/10000 [1:32:12<29:45, 1.52it/s, loss=0.0082, lr=6.72e-06, step=7291] Training: 73%|███████▎ | 7292/10000 [1:32:12<29:45, 1.52it/s, loss=0.0030, lr=6.72e-06, step=7292] Training: 73%|███████▎ | 7293/10000 [1:32:12<27:21, 1.65it/s, loss=0.0030, lr=6.72e-06, step=7292] Training: 73%|███████▎ | 7293/10000 [1:32:12<27:21, 1.65it/s, loss=0.0065, lr=6.72e-06, step=7293] Training: 73%|███████▎ | 7294/10000 [1:32:13<28:02, 1.61it/s, loss=0.0065, lr=6.72e-06, step=7293] Training: 73%|███████▎ | 7294/10000 [1:32:13<28:02, 1.61it/s, loss=0.0547, lr=6.71e-06, step=7294] Training: 73%|███████▎ | 7295/10000 [1:32:14<33:17, 1.35it/s, loss=0.0547, lr=6.71e-06, step=7294] Training: 73%|███████▎ | 7295/10000 [1:32:14<33:17, 1.35it/s, loss=0.0742, lr=6.71e-06, step=7295] Training: 73%|███████▎ | 7296/10000 [1:32:15<34:28, 1.31it/s, loss=0.0742, lr=6.71e-06, step=7295] Training: 73%|███████▎ | 7296/10000 [1:32:15<34:28, 1.31it/s, loss=0.0058, lr=6.71e-06, step=7296] Training: 73%|███████▎ | 7297/10000 [1:32:15<31:02, 1.45it/s, loss=0.0058, lr=6.71e-06, step=7296] Training: 73%|███████▎ | 7297/10000 [1:32:15<31:02, 1.45it/s, loss=0.0036, lr=6.71e-06, step=7297] Training: 73%|███████▎ | 7298/10000 [1:32:16<30:59, 1.45it/s, loss=0.0036, lr=6.71e-06, step=7297] Training: 73%|███████▎ | 7298/10000 [1:32:16<30:59, 1.45it/s, loss=0.0018, lr=6.70e-06, step=7298] Training: 73%|███████▎ | 7299/10000 [1:32:17<31:04, 1.45it/s, loss=0.0018, lr=6.70e-06, step=7298] Training: 73%|███████▎ | 7299/10000 [1:32:17<31:04, 1.45it/s, loss=0.0036, lr=6.70e-06, step=7299]17:38:24.383 [I] step=7300 loss=0.0241 smoothed_loss=0.0145 lr=6.71e-06 grad_norm=0.4384 step_time=0.5601s data_time=0.1239s it/s=1.462 eta_to_10000=1846.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0219 grad_action_out_proj=0.1210 grad_shared_expert=0.4632 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7300/10000 [1:32:17<32:50, 1.37it/s, loss=0.0036, lr=6.70e-06, step=7299] Training: 73%|███████▎ | 7300/10000 [1:32:17<32:50, 1.37it/s, loss=0.0241, lr=6.70e-06, step=7300] Training: 73%|███████▎ | 7301/10000 [1:32:18<29:54, 1.50it/s, loss=0.0241, lr=6.70e-06, step=7300] Training: 73%|███████▎ | 7301/10000 [1:32:18<29:54, 1.50it/s, loss=0.0056, lr=6.69e-06, step=7301] Training: 73%|███████▎ | 7302/10000 [1:32:19<31:00, 1.45it/s, loss=0.0056, lr=6.69e-06, step=7301] Training: 73%|███████▎ | 7302/10000 [1:32:19<31:00, 1.45it/s, loss=0.0076, lr=6.69e-06, step=7302] Training: 73%|███████▎ | 7303/10000 [1:32:20<36:52, 1.22it/s, loss=0.0076, lr=6.69e-06, step=7302] Training: 73%|███████▎ | 7303/10000 [1:32:20<36:52, 1.22it/s, loss=0.0091, lr=6.69e-06, step=7303] Training: 73%|███████▎ | 7304/10000 [1:32:20<33:26, 1.34it/s, loss=0.0091, lr=6.69e-06, step=7303] Training: 73%|███████▎ | 7304/10000 [1:32:20<33:26, 1.34it/s, loss=0.0339, lr=6.69e-06, step=7304] Training: 73%|███████▎ | 7305/10000 [1:32:21<31:40, 1.42it/s, loss=0.0339, lr=6.69e-06, step=7304] Training: 73%|███████▎ | 7305/10000 [1:32:21<31:40, 1.42it/s, loss=0.0124, lr=6.68e-06, step=7305] Training: 73%|███████▎ | 7306/10000 [1:32:22<31:02, 1.45it/s, loss=0.0124, lr=6.68e-06, step=7305] Training: 73%|███████▎ | 7306/10000 [1:32:22<31:02, 1.45it/s, loss=0.0086, lr=6.68e-06, step=7306] Training: 73%|███████▎ | 7307/10000 [1:32:22<28:44, 1.56it/s, loss=0.0086, lr=6.68e-06, step=7306] Training: 73%|███████▎ | 7307/10000 [1:32:22<28:44, 1.56it/s, loss=0.0017, lr=6.68e-06, step=7307] Training: 73%|███████▎ | 7308/10000 [1:32:23<29:34, 1.52it/s, loss=0.0017, lr=6.68e-06, step=7307] Training: 73%|███████▎ | 7308/10000 [1:32:23<29:34, 1.52it/s, loss=0.0033, lr=6.67e-06, step=7308] Training: 73%|███████▎ | 7309/10000 [1:32:24<29:31, 1.52it/s, loss=0.0033, lr=6.67e-06, step=7308] Training: 73%|███████▎ | 7309/10000 [1:32:24<29:31, 1.52it/s, loss=0.0041, lr=6.67e-06, step=7309]17:38:31.370 [I] step=7310 loss=0.0029 smoothed_loss=0.0102 lr=6.68e-06 grad_norm=0.4136 step_time=0.5997s data_time=0.0990s it/s=1.431 eta_to_10000=1879.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0251 grad_action_out_proj=0.1251 grad_shared_expert=0.5817 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7310/10000 [1:32:24<32:30, 1.38it/s, loss=0.0041, lr=6.67e-06, step=7309] Training: 73%|███████▎ | 7310/10000 [1:32:24<32:30, 1.38it/s, loss=0.0029, lr=6.67e-06, step=7310] Training: 73%|███████▎ | 7311/10000 [1:32:25<29:26, 1.52it/s, loss=0.0029, lr=6.67e-06, step=7310] Training: 73%|███████▎ | 7311/10000 [1:32:25<29:26, 1.52it/s, loss=0.0143, lr=6.67e-06, step=7311] Training: 73%|███████▎ | 7312/10000 [1:32:25<27:28, 1.63it/s, loss=0.0143, lr=6.67e-06, step=7311] Training: 73%|███████▎ | 7312/10000 [1:32:25<27:28, 1.63it/s, loss=0.0068, lr=6.66e-06, step=7312] Training: 73%|███████▎ | 7313/10000 [1:32:26<26:36, 1.68it/s, loss=0.0068, lr=6.66e-06, step=7312] Training: 73%|███████▎ | 7313/10000 [1:32:26<26:36, 1.68it/s, loss=0.0051, lr=6.66e-06, step=7313] Training: 73%|███████▎ | 7314/10000 [1:32:27<25:36, 1.75it/s, loss=0.0051, lr=6.66e-06, step=7313] Training: 73%|███████▎ | 7314/10000 [1:32:27<25:36, 1.75it/s, loss=0.0016, lr=6.66e-06, step=7314] Training: 73%|███████▎ | 7315/10000 [1:32:27<28:22, 1.58it/s, loss=0.0016, lr=6.66e-06, step=7314] Training: 73%|███████▎ | 7315/10000 [1:32:27<28:22, 1.58it/s, loss=0.0028, lr=6.65e-06, step=7315] Training: 73%|███████▎ | 7316/10000 [1:32:28<27:45, 1.61it/s, loss=0.0028, lr=6.65e-06, step=7315] Training: 73%|███████▎ | 7316/10000 [1:32:28<27:45, 1.61it/s, loss=0.0098, lr=6.65e-06, step=7316] Training: 73%|███████▎ | 7317/10000 [1:32:28<26:46, 1.67it/s, loss=0.0098, lr=6.65e-06, step=7316] Training: 73%|███████▎ | 7317/10000 [1:32:28<26:46, 1.67it/s, loss=0.0048, lr=6.65e-06, step=7317] Training: 73%|███████▎ | 7318/10000 [1:32:29<31:24, 1.42it/s, loss=0.0048, lr=6.65e-06, step=7317] Training: 73%|███████▎ | 7318/10000 [1:32:29<31:24, 1.42it/s, loss=0.0498, lr=6.65e-06, step=7318] Training: 73%|███████▎ | 7319/10000 [1:32:30<29:35, 1.51it/s, loss=0.0498, lr=6.65e-06, step=7318] Training: 73%|███████▎ | 7319/10000 [1:32:30<29:35, 1.51it/s, loss=0.0029, lr=6.64e-06, step=7319]17:38:37.590 [I] step=7320 loss=0.0011 smoothed_loss=0.0103 lr=6.65e-06 grad_norm=0.4112 step_time=0.5315s data_time=0.0904s it/s=1.608 eta_to_10000=1666.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0139 grad_action_out_proj=0.1262 grad_shared_expert=0.5130 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7320/10000 [1:32:31<30:14, 1.48it/s, loss=0.0029, lr=6.64e-06, step=7319] Training: 73%|███████▎ | 7320/10000 [1:32:31<30:14, 1.48it/s, loss=0.0011, lr=6.64e-06, step=7320] Training: 73%|███████▎ | 7321/10000 [1:32:31<27:57, 1.60it/s, loss=0.0011, lr=6.64e-06, step=7320] Training: 73%|███████▎ | 7321/10000 [1:32:31<27:57, 1.60it/s, loss=0.0111, lr=6.64e-06, step=7321] Training: 73%|███████▎ | 7322/10000 [1:32:32<29:54, 1.49it/s, loss=0.0111, lr=6.64e-06, step=7321] Training: 73%|███████▎ | 7322/10000 [1:32:32<29:54, 1.49it/s, loss=0.0121, lr=6.63e-06, step=7322] Training: 73%|███████▎ | 7323/10000 [1:32:33<30:09, 1.48it/s, loss=0.0121, lr=6.63e-06, step=7322] Training: 73%|███████▎ | 7323/10000 [1:32:33<30:09, 1.48it/s, loss=0.0054, lr=6.63e-06, step=7323] Training: 73%|███████▎ | 7324/10000 [1:32:33<28:23, 1.57it/s, loss=0.0054, lr=6.63e-06, step=7323] Training: 73%|███████▎ | 7324/10000 [1:32:33<28:23, 1.57it/s, loss=0.0016, lr=6.63e-06, step=7324] Training: 73%|███████▎ | 7325/10000 [1:32:34<32:11, 1.38it/s, loss=0.0016, lr=6.63e-06, step=7324] Training: 73%|███████▎ | 7325/10000 [1:32:34<32:11, 1.38it/s, loss=0.0011, lr=6.62e-06, step=7325] Training: 73%|███████▎ | 7326/10000 [1:32:35<29:47, 1.50it/s, loss=0.0011, lr=6.62e-06, step=7325] Training: 73%|███████▎ | 7326/10000 [1:32:35<29:47, 1.50it/s, loss=0.0057, lr=6.62e-06, step=7326] Training: 73%|███████▎ | 7327/10000 [1:32:35<27:46, 1.60it/s, loss=0.0057, lr=6.62e-06, step=7326] Training: 73%|███████▎ | 7327/10000 [1:32:35<27:46, 1.60it/s, loss=0.0072, lr=6.62e-06, step=7327] Training: 73%|███████▎ | 7328/10000 [1:32:36<28:10, 1.58it/s, loss=0.0072, lr=6.62e-06, step=7327] Training: 73%|███████▎ | 7328/10000 [1:32:36<28:10, 1.58it/s, loss=0.0206, lr=6.62e-06, step=7328] Training: 73%|███████▎ | 7329/10000 [1:32:36<28:13, 1.58it/s, loss=0.0206, lr=6.62e-06, step=7328] Training: 73%|███████▎ | 7329/10000 [1:32:36<28:13, 1.58it/s, loss=0.0096, lr=6.61e-06, step=7329]17:38:44.099 [I] step=7330 loss=0.0136 smoothed_loss=0.0097 lr=6.62e-06 grad_norm=0.4671 step_time=0.5480s data_time=0.1030s it/s=1.537 eta_to_10000=1737.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0100 grad_action_out_proj=0.0966 grad_shared_expert=0.4268 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7330/10000 [1:32:37<29:22, 1.52it/s, loss=0.0096, lr=6.61e-06, step=7329] Training: 73%|███████▎ | 7330/10000 [1:32:37<29:22, 1.52it/s, loss=0.0136, lr=6.61e-06, step=7330] Training: 73%|███████▎ | 7331/10000 [1:32:38<31:01, 1.43it/s, loss=0.0136, lr=6.61e-06, step=7330] Training: 73%|███████▎ | 7331/10000 [1:32:38<31:01, 1.43it/s, loss=0.0185, lr=6.61e-06, step=7331] Training: 73%|███████▎ | 7332/10000 [1:32:39<30:47, 1.44it/s, loss=0.0185, lr=6.61e-06, step=7331] Training: 73%|███████▎ | 7332/10000 [1:32:39<30:47, 1.44it/s, loss=0.0181, lr=6.60e-06, step=7332] Training: 73%|███████▎ | 7333/10000 [1:32:39<33:01, 1.35it/s, loss=0.0181, lr=6.60e-06, step=7332] Training: 73%|███████▎ | 7333/10000 [1:32:39<33:01, 1.35it/s, loss=0.0021, lr=6.60e-06, step=7333] Training: 73%|███████▎ | 7334/10000 [1:32:40<29:50, 1.49it/s, loss=0.0021, lr=6.60e-06, step=7333] Training: 73%|███████▎ | 7334/10000 [1:32:40<29:50, 1.49it/s, loss=0.0151, lr=6.60e-06, step=7334] Training: 73%|███████▎ | 7335/10000 [1:32:41<30:49, 1.44it/s, loss=0.0151, lr=6.60e-06, step=7334] Training: 73%|███████▎ | 7335/10000 [1:32:41<30:49, 1.44it/s, loss=0.0009, lr=6.60e-06, step=7335] Training: 73%|███████▎ | 7336/10000 [1:32:41<28:44, 1.54it/s, loss=0.0009, lr=6.60e-06, step=7335] Training: 73%|███████▎ | 7336/10000 [1:32:41<28:44, 1.54it/s, loss=0.0124, lr=6.59e-06, step=7336] Training: 73%|███████▎ | 7337/10000 [1:32:42<28:13, 1.57it/s, loss=0.0124, lr=6.59e-06, step=7336] Training: 73%|███████▎ | 7337/10000 [1:32:42<28:13, 1.57it/s, loss=0.0121, lr=6.59e-06, step=7337] Training: 73%|███████▎ | 7338/10000 [1:32:43<28:52, 1.54it/s, loss=0.0121, lr=6.59e-06, step=7337] Training: 73%|███████▎ | 7338/10000 [1:32:43<28:52, 1.54it/s, loss=0.0496, lr=6.59e-06, step=7338] Training: 73%|███████▎ | 7339/10000 [1:32:44<34:41, 1.28it/s, loss=0.0496, lr=6.59e-06, step=7338] Training: 73%|███████▎ | 7339/10000 [1:32:44<34:41, 1.28it/s, loss=0.0022, lr=6.58e-06, step=7339]17:38:51.170 [I] step=7340 loss=0.0109 smoothed_loss=0.0129 lr=6.59e-06 grad_norm=0.4169 step_time=0.5897s data_time=0.1174s it/s=1.415 eta_to_10000=1880.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0089 grad_action_out_proj=0.0804 grad_shared_expert=0.3922 (10775:train_pytorch.py:850) + Training: 73%|███████▎ | 7340/10000 [1:32:44<31:51, 1.39it/s, loss=0.0022, lr=6.58e-06, step=7339] Training: 73%|███████▎ | 7340/10000 [1:32:44<31:51, 1.39it/s, loss=0.0109, lr=6.58e-06, step=7340] Training: 73%|███████▎ | 7341/10000 [1:32:45<31:48, 1.39it/s, loss=0.0109, lr=6.58e-06, step=7340] Training: 73%|███████▎ | 7341/10000 [1:32:45<31:48, 1.39it/s, loss=0.0060, lr=6.58e-06, step=7341] Training: 73%|███████▎ | 7342/10000 [1:32:46<31:05, 1.43it/s, loss=0.0060, lr=6.58e-06, step=7341] Training: 73%|███████▎ | 7342/10000 [1:32:46<31:05, 1.43it/s, loss=0.0056, lr=6.58e-06, step=7342] Training: 73%|███████▎ | 7343/10000 [1:32:46<28:52, 1.53it/s, loss=0.0056, lr=6.58e-06, step=7342] Training: 73%|███████▎ | 7343/10000 [1:32:46<28:52, 1.53it/s, loss=0.0037, lr=6.57e-06, step=7343] Training: 73%|███████▎ | 7344/10000 [1:32:47<28:25, 1.56it/s, loss=0.0037, lr=6.57e-06, step=7343] Training: 73%|███████▎ | 7344/10000 [1:32:47<28:25, 1.56it/s, loss=0.0194, lr=6.57e-06, step=7344] Training: 73%|███████▎ | 7345/10000 [1:32:48<30:40, 1.44it/s, loss=0.0194, lr=6.57e-06, step=7344] Training: 73%|███████▎ | 7345/10000 [1:32:48<30:40, 1.44it/s, loss=0.0072, lr=6.57e-06, step=7345] Training: 73%|███████▎ | 7346/10000 [1:32:49<34:26, 1.28it/s, loss=0.0072, lr=6.57e-06, step=7345] Training: 73%|███████▎ | 7346/10000 [1:32:49<34:26, 1.28it/s, loss=0.0012, lr=6.56e-06, step=7346] Training: 73%|███████▎ | 7347/10000 [1:32:49<32:52, 1.35it/s, loss=0.0012, lr=6.56e-06, step=7346] Training: 73%|███████▎ | 7347/10000 [1:32:49<32:52, 1.35it/s, loss=0.0030, lr=6.56e-06, step=7347] Training: 73%|███████▎ | 7348/10000 [1:32:50<33:00, 1.34it/s, loss=0.0030, lr=6.56e-06, step=7347] Training: 73%|███████▎ | 7348/10000 [1:32:50<33:00, 1.34it/s, loss=0.1743, lr=6.56e-06, step=7348] Training: 73%|███████▎ | 7349/10000 [1:32:51<31:54, 1.38it/s, loss=0.1743, lr=6.56e-06, step=7348] Training: 73%|███████▎ | 7349/10000 [1:32:51<31:54, 1.38it/s, loss=0.0154, lr=6.56e-06, step=7349]17:38:58.156 [I] step=7350 loss=0.0022 smoothed_loss=0.0226 lr=6.57e-06 grad_norm=0.4163 step_time=0.5675s data_time=0.1311s it/s=1.432 eta_to_10000=1850.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.0921 grad_shared_expert=0.2930 (10775:train_pytorch.py:850) + Training: 74%|███████▎ | 7350/10000 [1:32:51<30:00, 1.47it/s, loss=0.0154, lr=6.56e-06, step=7349] Training: 74%|███████▎ | 7350/10000 [1:32:51<30:00, 1.47it/s, loss=0.0022, lr=6.55e-06, step=7350] Training: 74%|███████▎ | 7351/10000 [1:32:52<29:13, 1.51it/s, loss=0.0022, lr=6.55e-06, step=7350] Training: 74%|███████▎ | 7351/10000 [1:32:52<29:13, 1.51it/s, loss=0.0042, lr=6.55e-06, step=7351] Training: 74%|███████▎ | 7352/10000 [1:32:53<32:20, 1.36it/s, loss=0.0042, lr=6.55e-06, step=7351] Training: 74%|███████▎ | 7352/10000 [1:32:53<32:20, 1.36it/s, loss=0.0685, lr=6.55e-06, step=7352] Training: 74%|███████▎ | 7353/10000 [1:32:54<38:29, 1.15it/s, loss=0.0685, lr=6.55e-06, step=7352] Training: 74%|███████▎ | 7353/10000 [1:32:54<38:29, 1.15it/s, loss=0.0177, lr=6.54e-06, step=7353] Training: 74%|███████▎ | 7354/10000 [1:32:55<35:00, 1.26it/s, loss=0.0177, lr=6.54e-06, step=7353] Training: 74%|███████▎ | 7354/10000 [1:32:55<35:00, 1.26it/s, loss=0.0061, lr=6.54e-06, step=7354] Training: 74%|███████▎ | 7355/10000 [1:32:55<33:12, 1.33it/s, loss=0.0061, lr=6.54e-06, step=7354] Training: 74%|███████▎ | 7355/10000 [1:32:55<33:12, 1.33it/s, loss=0.0021, lr=6.54e-06, step=7355] Training: 74%|███████▎ | 7356/10000 [1:32:56<33:22, 1.32it/s, loss=0.0021, lr=6.54e-06, step=7355] Training: 74%|███████▎ | 7356/10000 [1:32:56<33:22, 1.32it/s, loss=0.0106, lr=6.54e-06, step=7356] Training: 74%|███████▎ | 7357/10000 [1:32:57<31:17, 1.41it/s, loss=0.0106, lr=6.54e-06, step=7356] Training: 74%|███████▎ | 7357/10000 [1:32:57<31:17, 1.41it/s, loss=0.0013, lr=6.53e-06, step=7357] Training: 74%|███████▎ | 7358/10000 [1:32:57<32:59, 1.33it/s, loss=0.0013, lr=6.53e-06, step=7357] Training: 74%|███████▎ | 7358/10000 [1:32:57<32:59, 1.33it/s, loss=0.0218, lr=6.53e-06, step=7358] Training: 74%|███████▎ | 7359/10000 [1:32:58<34:42, 1.27it/s, loss=0.0218, lr=6.53e-06, step=7358] Training: 74%|███████▎ | 7359/10000 [1:32:58<34:42, 1.27it/s, loss=0.0522, lr=6.53e-06, step=7359]17:39:06.402 [I] step=7360 loss=0.0387 smoothed_loss=0.0234 lr=6.54e-06 grad_norm=0.4596 step_time=0.6489s data_time=0.1757s it/s=1.213 eta_to_10000=2176.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0204 grad_action_out_proj=0.1682 grad_shared_expert=0.4860 (10775:train_pytorch.py:850) + Training: 74%|███████▎ | 7360/10000 [1:32:59<39:44, 1.11it/s, loss=0.0522, lr=6.53e-06, step=7359] Training: 74%|███████▎ | 7360/10000 [1:32:59<39:44, 1.11it/s, loss=0.0387, lr=6.52e-06, step=7360] Training: 74%|███████▎ | 7361/10000 [1:33:00<37:14, 1.18it/s, loss=0.0387, lr=6.52e-06, step=7360] Training: 74%|███████▎ | 7361/10000 [1:33:00<37:14, 1.18it/s, loss=0.0067, lr=6.52e-06, step=7361] Training: 74%|███████▎ | 7362/10000 [1:33:01<36:59, 1.19it/s, loss=0.0067, lr=6.52e-06, step=7361] Training: 74%|███████▎ | 7362/10000 [1:33:01<36:59, 1.19it/s, loss=0.0116, lr=6.52e-06, step=7362] Training: 74%|███████▎ | 7363/10000 [1:33:02<37:45, 1.16it/s, loss=0.0116, lr=6.52e-06, step=7362] Training: 74%|███████▎ | 7363/10000 [1:33:02<37:45, 1.16it/s, loss=0.0027, lr=6.52e-06, step=7363] Training: 74%|███████▎ | 7364/10000 [1:33:02<33:26, 1.31it/s, loss=0.0027, lr=6.52e-06, step=7363] Training: 74%|███████▎ | 7364/10000 [1:33:02<33:26, 1.31it/s, loss=0.0640, lr=6.51e-06, step=7364] Training: 74%|███████▎ | 7365/10000 [1:33:03<34:17, 1.28it/s, loss=0.0640, lr=6.51e-06, step=7364] Training: 74%|███████▎ | 7365/10000 [1:33:03<34:17, 1.28it/s, loss=0.0206, lr=6.51e-06, step=7365] Training: 74%|███████▎ | 7366/10000 [1:33:04<32:26, 1.35it/s, loss=0.0206, lr=6.51e-06, step=7365] Training: 74%|███████▎ | 7366/10000 [1:33:04<32:26, 1.35it/s, loss=0.0125, lr=6.51e-06, step=7366] Training: 74%|███████▎ | 7367/10000 [1:33:05<32:40, 1.34it/s, loss=0.0125, lr=6.51e-06, step=7366] Training: 74%|███████▎ | 7367/10000 [1:33:05<32:40, 1.34it/s, loss=0.0271, lr=6.50e-06, step=7367] Training: 74%|███████▎ | 7368/10000 [1:33:05<31:13, 1.40it/s, loss=0.0271, lr=6.50e-06, step=7367] Training: 74%|███████▎ | 7368/10000 [1:33:05<31:13, 1.40it/s, loss=0.0039, lr=6.50e-06, step=7368] Training: 74%|███████▎ | 7369/10000 [1:33:06<33:24, 1.31it/s, loss=0.0039, lr=6.50e-06, step=7368] Training: 74%|███████▎ | 7369/10000 [1:33:06<33:24, 1.31it/s, loss=0.0185, lr=6.50e-06, step=7369]17:39:13.794 [I] step=7370 loss=0.0018 smoothed_loss=0.0186 lr=6.51e-06 grad_norm=0.4727 step_time=0.5890s data_time=0.1502s it/s=1.353 eta_to_10000=1943.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.0943 grad_shared_expert=0.5563 (10775:train_pytorch.py:850) + Training: 74%|███████▎ | 7370/10000 [1:33:07<32:16, 1.36it/s, loss=0.0185, lr=6.50e-06, step=7369] Training: 74%|███████▎ | 7370/10000 [1:33:07<32:16, 1.36it/s, loss=0.0018, lr=6.50e-06, step=7370] Training: 74%|███████▎ | 7371/10000 [1:33:08<33:28, 1.31it/s, loss=0.0018, lr=6.50e-06, step=7370] Training: 74%|███████▎ | 7371/10000 [1:33:08<33:28, 1.31it/s, loss=0.0059, lr=6.49e-06, step=7371] Training: 74%|███████▎ | 7372/10000 [1:33:08<31:23, 1.39it/s, loss=0.0059, lr=6.49e-06, step=7371] Training: 74%|███████▎ | 7372/10000 [1:33:08<31:23, 1.39it/s, loss=0.0073, lr=6.49e-06, step=7372] Training: 74%|███████▎ | 7373/10000 [1:33:09<30:33, 1.43it/s, loss=0.0073, lr=6.49e-06, step=7372] Training: 74%|███████▎ | 7373/10000 [1:33:09<30:33, 1.43it/s, loss=0.0029, lr=6.49e-06, step=7373] Training: 74%|███████▎ | 7374/10000 [1:33:10<34:51, 1.26it/s, loss=0.0029, lr=6.49e-06, step=7373] Training: 74%|███████▎ | 7374/10000 [1:33:10<34:51, 1.26it/s, loss=0.0130, lr=6.48e-06, step=7374] Training: 74%|███████▍ | 7375/10000 [1:33:11<32:54, 1.33it/s, loss=0.0130, lr=6.48e-06, step=7374] Training: 74%|███████▍ | 7375/10000 [1:33:11<32:54, 1.33it/s, loss=0.0038, lr=6.48e-06, step=7375] Training: 74%|███████▍ | 7376/10000 [1:33:11<30:00, 1.46it/s, loss=0.0038, lr=6.48e-06, step=7375] Training: 74%|███████▍ | 7376/10000 [1:33:11<30:00, 1.46it/s, loss=0.0053, lr=6.48e-06, step=7376] Training: 74%|███████▍ | 7377/10000 [1:33:12<28:09, 1.55it/s, loss=0.0053, lr=6.48e-06, step=7376] Training: 74%|███████▍ | 7377/10000 [1:33:12<28:09, 1.55it/s, loss=0.0056, lr=6.48e-06, step=7377] Training: 74%|███████▍ | 7378/10000 [1:33:13<30:21, 1.44it/s, loss=0.0056, lr=6.48e-06, step=7377] Training: 74%|███████▍ | 7378/10000 [1:33:13<30:21, 1.44it/s, loss=0.0045, lr=6.47e-06, step=7378] Training: 74%|███████▍ | 7379/10000 [1:33:13<31:32, 1.38it/s, loss=0.0045, lr=6.47e-06, step=7378] Training: 74%|███████▍ | 7379/10000 [1:33:13<31:32, 1.38it/s, loss=0.0026, lr=6.47e-06, step=7379]17:39:20.815 [I] step=7380 loss=0.0063 smoothed_loss=0.0101 lr=6.48e-06 grad_norm=0.4123 step_time=0.5711s data_time=0.1311s it/s=1.424 eta_to_10000=1839.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.0833 grad_shared_expert=0.4583 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7380/10000 [1:33:14<29:39, 1.47it/s, loss=0.0026, lr=6.47e-06, step=7379] Training: 74%|███████▍ | 7380/10000 [1:33:14<29:39, 1.47it/s, loss=0.0063, lr=6.47e-06, step=7380] Training: 74%|███████▍ | 7381/10000 [1:33:15<33:00, 1.32it/s, loss=0.0063, lr=6.47e-06, step=7380] Training: 74%|███████▍ | 7381/10000 [1:33:15<33:00, 1.32it/s, loss=0.0033, lr=6.46e-06, step=7381] Training: 74%|███████▍ | 7382/10000 [1:33:16<37:38, 1.16it/s, loss=0.0033, lr=6.46e-06, step=7381] Training: 74%|███████▍ | 7382/10000 [1:33:16<37:38, 1.16it/s, loss=0.0039, lr=6.46e-06, step=7382] Training: 74%|███████▍ | 7383/10000 [1:33:17<34:23, 1.27it/s, loss=0.0039, lr=6.46e-06, step=7382] Training: 74%|███████▍ | 7383/10000 [1:33:17<34:23, 1.27it/s, loss=0.0059, lr=6.46e-06, step=7383] Training: 74%|███████▍ | 7384/10000 [1:33:17<34:04, 1.28it/s, loss=0.0059, lr=6.46e-06, step=7383] Training: 74%|███████▍ | 7384/10000 [1:33:17<34:04, 1.28it/s, loss=0.0033, lr=6.46e-06, step=7384] Training: 74%|███████▍ | 7385/10000 [1:33:18<31:12, 1.40it/s, loss=0.0033, lr=6.46e-06, step=7384] Training: 74%|███████▍ | 7385/10000 [1:33:18<31:12, 1.40it/s, loss=0.0022, lr=6.45e-06, step=7385] Training: 74%|███████▍ | 7386/10000 [1:33:18<29:25, 1.48it/s, loss=0.0022, lr=6.45e-06, step=7385] Training: 74%|███████▍ | 7386/10000 [1:33:18<29:25, 1.48it/s, loss=0.0033, lr=6.45e-06, step=7386] Training: 74%|███████▍ | 7387/10000 [1:33:19<29:42, 1.47it/s, loss=0.0033, lr=6.45e-06, step=7386] Training: 74%|███████▍ | 7387/10000 [1:33:19<29:42, 1.47it/s, loss=0.0030, lr=6.45e-06, step=7387] Training: 74%|███████▍ | 7388/10000 [1:33:20<32:00, 1.36it/s, loss=0.0030, lr=6.45e-06, step=7387] Training: 74%|███████▍ | 7388/10000 [1:33:20<32:00, 1.36it/s, loss=0.0022, lr=6.45e-06, step=7388] Training: 74%|███████▍ | 7389/10000 [1:33:21<33:05, 1.32it/s, loss=0.0022, lr=6.45e-06, step=7388] Training: 74%|███████▍ | 7389/10000 [1:33:21<33:05, 1.32it/s, loss=0.0244, lr=6.44e-06, step=7389]17:39:28.422 [I] step=7390 loss=0.0073 smoothed_loss=0.0079 lr=6.45e-06 grad_norm=0.3525 step_time=0.6370s data_time=0.1236s it/s=1.315 eta_to_10000=1985.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0129 grad_action_out_proj=0.1230 grad_shared_expert=0.5558 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7390/10000 [1:33:21<31:46, 1.37it/s, loss=0.0244, lr=6.44e-06, step=7389] Training: 74%|███████▍ | 7390/10000 [1:33:21<31:46, 1.37it/s, loss=0.0073, lr=6.44e-06, step=7390] Training: 74%|███████▍ | 7391/10000 [1:33:22<32:44, 1.33it/s, loss=0.0073, lr=6.44e-06, step=7390] Training: 74%|███████▍ | 7391/10000 [1:33:22<32:44, 1.33it/s, loss=0.0034, lr=6.44e-06, step=7391] Training: 74%|███████▍ | 7392/10000 [1:33:23<30:57, 1.40it/s, loss=0.0034, lr=6.44e-06, step=7391] Training: 74%|███████▍ | 7392/10000 [1:33:23<30:57, 1.40it/s, loss=0.0600, lr=6.43e-06, step=7392] Training: 74%|███████▍ | 7393/10000 [1:33:24<29:37, 1.47it/s, loss=0.0600, lr=6.43e-06, step=7392] Training: 74%|███████▍ | 7393/10000 [1:33:24<29:37, 1.47it/s, loss=0.0036, lr=6.43e-06, step=7393] Training: 74%|███████▍ | 7394/10000 [1:33:24<28:18, 1.53it/s, loss=0.0036, lr=6.43e-06, step=7393] Training: 74%|███████▍ | 7394/10000 [1:33:24<28:18, 1.53it/s, loss=0.0481, lr=6.43e-06, step=7394] Training: 74%|███████▍ | 7395/10000 [1:33:25<29:43, 1.46it/s, loss=0.0481, lr=6.43e-06, step=7394] Training: 74%|███████▍ | 7395/10000 [1:33:25<29:43, 1.46it/s, loss=0.0147, lr=6.43e-06, step=7395] Training: 74%|███████▍ | 7396/10000 [1:33:26<31:40, 1.37it/s, loss=0.0147, lr=6.43e-06, step=7395] Training: 74%|███████▍ | 7396/10000 [1:33:26<31:40, 1.37it/s, loss=0.0030, lr=6.42e-06, step=7396] Training: 74%|███████▍ | 7397/10000 [1:33:26<29:10, 1.49it/s, loss=0.0030, lr=6.42e-06, step=7396] Training: 74%|███████▍ | 7397/10000 [1:33:26<29:10, 1.49it/s, loss=0.0100, lr=6.42e-06, step=7397] Training: 74%|███████▍ | 7398/10000 [1:33:27<28:01, 1.55it/s, loss=0.0100, lr=6.42e-06, step=7397] Training: 74%|███████▍ | 7398/10000 [1:33:27<28:01, 1.55it/s, loss=0.0421, lr=6.42e-06, step=7398] Training: 74%|███████▍ | 7399/10000 [1:33:27<27:03, 1.60it/s, loss=0.0421, lr=6.42e-06, step=7398] Training: 74%|███████▍ | 7399/10000 [1:33:27<27:03, 1.60it/s, loss=0.0082, lr=6.41e-06, step=7399]17:39:34.871 [I] step=7400 loss=0.0049 smoothed_loss=0.0146 lr=6.42e-06 grad_norm=0.4585 step_time=0.5356s data_time=0.1093s it/s=1.551 eta_to_10000=1676.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.0672 grad_shared_expert=0.2197 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7400/10000 [1:33:28<25:57, 1.67it/s, loss=0.0082, lr=6.41e-06, step=7399] Training: 74%|███████▍ | 7400/10000 [1:33:28<25:57, 1.67it/s, loss=0.0049, lr=6.41e-06, step=7400] Training: 74%|███████▍ | 7401/10000 [1:33:29<27:28, 1.58it/s, loss=0.0049, lr=6.41e-06, step=7400] Training: 74%|███████▍ | 7401/10000 [1:33:29<27:28, 1.58it/s, loss=0.0031, lr=6.41e-06, step=7401] Training: 74%|███████▍ | 7402/10000 [1:33:29<28:22, 1.53it/s, loss=0.0031, lr=6.41e-06, step=7401] Training: 74%|███████▍ | 7402/10000 [1:33:29<28:22, 1.53it/s, loss=0.0513, lr=6.41e-06, step=7402] Training: 74%|███████▍ | 7403/10000 [1:33:30<30:06, 1.44it/s, loss=0.0513, lr=6.41e-06, step=7402] Training: 74%|███████▍ | 7403/10000 [1:33:30<30:06, 1.44it/s, loss=0.0267, lr=6.40e-06, step=7403] Training: 74%|███████▍ | 7404/10000 [1:33:31<28:35, 1.51it/s, loss=0.0267, lr=6.40e-06, step=7403] Training: 74%|███████▍ | 7404/10000 [1:33:31<28:35, 1.51it/s, loss=0.0023, lr=6.40e-06, step=7404] Training: 74%|███████▍ | 7405/10000 [1:33:31<27:32, 1.57it/s, loss=0.0023, lr=6.40e-06, step=7404] Training: 74%|███████▍ | 7405/10000 [1:33:31<27:32, 1.57it/s, loss=0.0024, lr=6.40e-06, step=7405] Training: 74%|███████▍ | 7406/10000 [1:33:32<26:37, 1.62it/s, loss=0.0024, lr=6.40e-06, step=7405] Training: 74%|███████▍ | 7406/10000 [1:33:32<26:37, 1.62it/s, loss=0.0161, lr=6.39e-06, step=7406] Training: 74%|███████▍ | 7407/10000 [1:33:33<27:48, 1.55it/s, loss=0.0161, lr=6.39e-06, step=7406] Training: 74%|███████▍ | 7407/10000 [1:33:33<27:48, 1.55it/s, loss=0.0041, lr=6.39e-06, step=7407] Training: 74%|███████▍ | 7408/10000 [1:33:33<26:49, 1.61it/s, loss=0.0041, lr=6.39e-06, step=7407] Training: 74%|███████▍ | 7408/10000 [1:33:33<26:49, 1.61it/s, loss=0.1541, lr=6.39e-06, step=7408] Training: 74%|███████▍ | 7409/10000 [1:33:34<28:34, 1.51it/s, loss=0.1541, lr=6.39e-06, step=7408] Training: 74%|███████▍ | 7409/10000 [1:33:34<28:34, 1.51it/s, loss=0.0017, lr=6.39e-06, step=7409]17:39:41.839 [I] step=7410 loss=0.0018 smoothed_loss=0.0231 lr=6.40e-06 grad_norm=0.4757 step_time=0.5868s data_time=0.1099s it/s=1.436 eta_to_10000=1804.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.1029 grad_shared_expert=0.3246 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7410/10000 [1:33:35<32:53, 1.31it/s, loss=0.0017, lr=6.39e-06, step=7409] Training: 74%|███████▍ | 7410/10000 [1:33:35<32:53, 1.31it/s, loss=0.0018, lr=6.38e-06, step=7410] Training: 74%|███████▍ | 7411/10000 [1:33:36<35:03, 1.23it/s, loss=0.0018, lr=6.38e-06, step=7410] Training: 74%|███████▍ | 7411/10000 [1:33:36<35:03, 1.23it/s, loss=0.0121, lr=6.38e-06, step=7411] Training: 74%|███████▍ | 7412/10000 [1:33:36<32:08, 1.34it/s, loss=0.0121, lr=6.38e-06, step=7411] Training: 74%|███████▍ | 7412/10000 [1:33:36<32:08, 1.34it/s, loss=0.0102, lr=6.38e-06, step=7412] Training: 74%|███████▍ | 7413/10000 [1:33:37<30:37, 1.41it/s, loss=0.0102, lr=6.38e-06, step=7412] Training: 74%|███████▍ | 7413/10000 [1:33:37<30:37, 1.41it/s, loss=0.0017, lr=6.37e-06, step=7413] Training: 74%|███████▍ | 7414/10000 [1:33:38<30:05, 1.43it/s, loss=0.0017, lr=6.37e-06, step=7413] Training: 74%|███████▍ | 7414/10000 [1:33:38<30:05, 1.43it/s, loss=0.0039, lr=6.37e-06, step=7414] Training: 74%|███████▍ | 7415/10000 [1:33:39<32:51, 1.31it/s, loss=0.0039, lr=6.37e-06, step=7414] Training: 74%|███████▍ | 7415/10000 [1:33:39<32:51, 1.31it/s, loss=0.0036, lr=6.37e-06, step=7415] Training: 74%|███████▍ | 7416/10000 [1:33:39<31:54, 1.35it/s, loss=0.0036, lr=6.37e-06, step=7415] Training: 74%|███████▍ | 7416/10000 [1:33:39<31:54, 1.35it/s, loss=0.0078, lr=6.37e-06, step=7416] Training: 74%|███████▍ | 7417/10000 [1:33:40<33:49, 1.27it/s, loss=0.0078, lr=6.37e-06, step=7416] Training: 74%|███████▍ | 7417/10000 [1:33:40<33:49, 1.27it/s, loss=0.0032, lr=6.36e-06, step=7417] Training: 74%|███████▍ | 7418/10000 [1:33:41<32:21, 1.33it/s, loss=0.0032, lr=6.36e-06, step=7417] Training: 74%|███████▍ | 7418/10000 [1:33:41<32:21, 1.33it/s, loss=0.0056, lr=6.36e-06, step=7418] Training: 74%|███████▍ | 7419/10000 [1:33:41<30:09, 1.43it/s, loss=0.0056, lr=6.36e-06, step=7418] Training: 74%|███████▍ | 7419/10000 [1:33:41<30:09, 1.43it/s, loss=0.0047, lr=6.36e-06, step=7419]17:39:49.269 [I] step=7420 loss=0.0030 smoothed_loss=0.0114 lr=6.37e-06 grad_norm=0.4472 step_time=0.5830s data_time=0.1600s it/s=1.346 eta_to_10000=1916.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0280 grad_action_out_proj=0.1580 grad_shared_expert=0.4659 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7420/10000 [1:33:42<32:15, 1.33it/s, loss=0.0047, lr=6.36e-06, step=7419] Training: 74%|███████▍ | 7420/10000 [1:33:42<32:15, 1.33it/s, loss=0.0030, lr=6.35e-06, step=7420] Training: 74%|███████▍ | 7421/10000 [1:33:43<31:27, 1.37it/s, loss=0.0030, lr=6.35e-06, step=7420] Training: 74%|███████▍ | 7421/10000 [1:33:43<31:27, 1.37it/s, loss=0.0064, lr=6.35e-06, step=7421] Training: 74%|███████▍ | 7422/10000 [1:33:44<30:05, 1.43it/s, loss=0.0064, lr=6.35e-06, step=7421] Training: 74%|███████▍ | 7422/10000 [1:33:44<30:05, 1.43it/s, loss=0.0068, lr=6.35e-06, step=7422] Training: 74%|███████▍ | 7423/10000 [1:33:44<29:06, 1.48it/s, loss=0.0068, lr=6.35e-06, step=7422] Training: 74%|███████▍ | 7423/10000 [1:33:44<29:06, 1.48it/s, loss=0.0112, lr=6.35e-06, step=7423] Training: 74%|███████▍ | 7424/10000 [1:33:45<28:24, 1.51it/s, loss=0.0112, lr=6.35e-06, step=7423] Training: 74%|███████▍ | 7424/10000 [1:33:45<28:24, 1.51it/s, loss=0.0065, lr=6.34e-06, step=7424] Training: 74%|███████▍ | 7425/10000 [1:33:46<31:33, 1.36it/s, loss=0.0065, lr=6.34e-06, step=7424] Training: 74%|███████▍ | 7425/10000 [1:33:46<31:33, 1.36it/s, loss=0.0146, lr=6.34e-06, step=7425] Training: 74%|███████▍ | 7426/10000 [1:33:47<33:39, 1.27it/s, loss=0.0146, lr=6.34e-06, step=7425] Training: 74%|███████▍ | 7426/10000 [1:33:47<33:39, 1.27it/s, loss=0.0019, lr=6.34e-06, step=7426] Training: 74%|███████▍ | 7427/10000 [1:33:47<31:40, 1.35it/s, loss=0.0019, lr=6.34e-06, step=7426] Training: 74%|███████▍ | 7427/10000 [1:33:47<31:40, 1.35it/s, loss=0.0074, lr=6.34e-06, step=7427] Training: 74%|███████▍ | 7428/10000 [1:33:48<28:48, 1.49it/s, loss=0.0074, lr=6.34e-06, step=7427] Training: 74%|███████▍ | 7428/10000 [1:33:48<28:48, 1.49it/s, loss=0.0123, lr=6.33e-06, step=7428] Training: 74%|███████▍ | 7429/10000 [1:33:49<28:55, 1.48it/s, loss=0.0123, lr=6.33e-06, step=7428] Training: 74%|███████▍ | 7429/10000 [1:33:49<28:55, 1.48it/s, loss=0.0127, lr=6.33e-06, step=7429]17:39:56.389 [I] step=7430 loss=0.0128 smoothed_loss=0.0103 lr=6.34e-06 grad_norm=0.5062 step_time=0.5979s data_time=0.1141s it/s=1.405 eta_to_10000=1829.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0183 grad_action_out_proj=0.1687 grad_shared_expert=0.4916 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7430/10000 [1:33:49<32:01, 1.34it/s, loss=0.0127, lr=6.33e-06, step=7429] Training: 74%|███████▍ | 7430/10000 [1:33:49<32:01, 1.34it/s, loss=0.0128, lr=6.33e-06, step=7430] Training: 74%|███████▍ | 7431/10000 [1:33:50<33:19, 1.28it/s, loss=0.0128, lr=6.33e-06, step=7430] Training: 74%|███████▍ | 7431/10000 [1:33:50<33:19, 1.28it/s, loss=0.0037, lr=6.32e-06, step=7431] Training: 74%|███████▍ | 7432/10000 [1:33:51<36:50, 1.16it/s, loss=0.0037, lr=6.32e-06, step=7431] Training: 74%|███████▍ | 7432/10000 [1:33:51<36:50, 1.16it/s, loss=0.0020, lr=6.32e-06, step=7432] Training: 74%|███████▍ | 7433/10000 [1:33:52<32:18, 1.32it/s, loss=0.0020, lr=6.32e-06, step=7432] Training: 74%|███████▍ | 7433/10000 [1:33:52<32:18, 1.32it/s, loss=0.0102, lr=6.32e-06, step=7433] Training: 74%|███████▍ | 7434/10000 [1:33:52<28:52, 1.48it/s, loss=0.0102, lr=6.32e-06, step=7433] Training: 74%|███████▍ | 7434/10000 [1:33:52<28:52, 1.48it/s, loss=0.0040, lr=6.32e-06, step=7434] Training: 74%|███████▍ | 7435/10000 [1:33:53<29:00, 1.47it/s, loss=0.0040, lr=6.32e-06, step=7434] Training: 74%|███████▍ | 7435/10000 [1:33:53<29:00, 1.47it/s, loss=0.0037, lr=6.31e-06, step=7435] Training: 74%|███████▍ | 7436/10000 [1:33:54<26:42, 1.60it/s, loss=0.0037, lr=6.31e-06, step=7435] Training: 74%|███████▍ | 7436/10000 [1:33:54<26:42, 1.60it/s, loss=0.0032, lr=6.31e-06, step=7436] Training: 74%|███████▍ | 7437/10000 [1:33:54<29:06, 1.47it/s, loss=0.0032, lr=6.31e-06, step=7436] Training: 74%|███████▍ | 7437/10000 [1:33:54<29:06, 1.47it/s, loss=0.0037, lr=6.31e-06, step=7437] Training: 74%|███████▍ | 7438/10000 [1:33:55<30:02, 1.42it/s, loss=0.0037, lr=6.31e-06, step=7437] Training: 74%|███████▍ | 7438/10000 [1:33:55<30:02, 1.42it/s, loss=0.0522, lr=6.30e-06, step=7438] Training: 74%|███████▍ | 7439/10000 [1:33:56<31:16, 1.37it/s, loss=0.0522, lr=6.30e-06, step=7438] Training: 74%|███████▍ | 7439/10000 [1:33:56<31:16, 1.37it/s, loss=0.0033, lr=6.30e-06, step=7439]17:40:03.740 [I] step=7440 loss=0.0039 smoothed_loss=0.0101 lr=6.31e-06 grad_norm=0.3664 step_time=0.5874s data_time=0.1478s it/s=1.360 eta_to_10000=1881.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0269 grad_action_out_proj=0.1746 grad_shared_expert=0.4594 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7440/10000 [1:33:57<33:20, 1.28it/s, loss=0.0033, lr=6.30e-06, step=7439] Training: 74%|███████▍ | 7440/10000 [1:33:57<33:20, 1.28it/s, loss=0.0039, lr=6.30e-06, step=7440] Training: 74%|███████▍ | 7441/10000 [1:33:58<33:05, 1.29it/s, loss=0.0039, lr=6.30e-06, step=7440] Training: 74%|███████▍ | 7441/10000 [1:33:58<33:05, 1.29it/s, loss=0.0050, lr=6.30e-06, step=7441] Training: 74%|███████▍ | 7442/10000 [1:33:58<31:12, 1.37it/s, loss=0.0050, lr=6.30e-06, step=7441] Training: 74%|███████▍ | 7442/10000 [1:33:58<31:12, 1.37it/s, loss=0.0180, lr=6.29e-06, step=7442] Training: 74%|███████▍ | 7443/10000 [1:33:59<32:13, 1.32it/s, loss=0.0180, lr=6.29e-06, step=7442] Training: 74%|███████▍ | 7443/10000 [1:33:59<32:13, 1.32it/s, loss=0.0134, lr=6.29e-06, step=7443] Training: 74%|███████▍ | 7444/10000 [1:34:00<31:04, 1.37it/s, loss=0.0134, lr=6.29e-06, step=7443] Training: 74%|███████▍ | 7444/10000 [1:34:00<31:04, 1.37it/s, loss=0.0028, lr=6.29e-06, step=7444] Training: 74%|███████▍ | 7445/10000 [1:34:00<30:09, 1.41it/s, loss=0.0028, lr=6.29e-06, step=7444] Training: 74%|███████▍ | 7445/10000 [1:34:00<30:09, 1.41it/s, loss=0.0044, lr=6.29e-06, step=7445] Training: 74%|███████▍ | 7446/10000 [1:34:01<33:38, 1.27it/s, loss=0.0044, lr=6.29e-06, step=7445] Training: 74%|███████▍ | 7446/10000 [1:34:01<33:38, 1.27it/s, loss=0.0090, lr=6.28e-06, step=7446] Training: 74%|███████▍ | 7447/10000 [1:34:02<32:00, 1.33it/s, loss=0.0090, lr=6.28e-06, step=7446] Training: 74%|███████▍ | 7447/10000 [1:34:02<32:00, 1.33it/s, loss=0.0029, lr=6.28e-06, step=7447] Training: 74%|███████▍ | 7448/10000 [1:34:03<31:09, 1.36it/s, loss=0.0029, lr=6.28e-06, step=7447] Training: 74%|███████▍ | 7448/10000 [1:34:03<31:09, 1.36it/s, loss=0.0066, lr=6.28e-06, step=7448] Training: 74%|███████▍ | 7449/10000 [1:34:03<31:07, 1.37it/s, loss=0.0066, lr=6.28e-06, step=7448] Training: 74%|███████▍ | 7449/10000 [1:34:03<31:07, 1.37it/s, loss=0.0025, lr=6.27e-06, step=7449]17:40:11.197 [I] step=7450 loss=0.0047 smoothed_loss=0.0076 lr=6.28e-06 grad_norm=0.4342 step_time=0.5904s data_time=0.1552s it/s=1.341 eta_to_10000=1901.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0122 grad_action_out_proj=0.1079 grad_shared_expert=0.2842 (10775:train_pytorch.py:850) + Training: 74%|███████▍ | 7450/10000 [1:34:04<32:45, 1.30it/s, loss=0.0025, lr=6.27e-06, step=7449] Training: 74%|███████▍ | 7450/10000 [1:34:04<32:45, 1.30it/s, loss=0.0047, lr=6.27e-06, step=7450] Training: 75%|███████▍ | 7451/10000 [1:34:05<31:13, 1.36it/s, loss=0.0047, lr=6.27e-06, step=7450] Training: 75%|███████▍ | 7451/10000 [1:34:05<31:13, 1.36it/s, loss=0.0048, lr=6.27e-06, step=7451] Training: 75%|███████▍ | 7452/10000 [1:34:06<29:28, 1.44it/s, loss=0.0048, lr=6.27e-06, step=7451] Training: 75%|███████▍ | 7452/10000 [1:34:06<29:28, 1.44it/s, loss=0.0073, lr=6.27e-06, step=7452] Training: 75%|███████▍ | 7453/10000 [1:34:06<31:58, 1.33it/s, loss=0.0073, lr=6.27e-06, step=7452] Training: 75%|███████▍ | 7453/10000 [1:34:06<31:58, 1.33it/s, loss=0.0042, lr=6.26e-06, step=7453] Training: 75%|███████▍ | 7454/10000 [1:34:07<29:34, 1.43it/s, loss=0.0042, lr=6.26e-06, step=7453] Training: 75%|███████▍ | 7454/10000 [1:34:07<29:34, 1.43it/s, loss=0.0036, lr=6.26e-06, step=7454] Training: 75%|███████▍ | 7455/10000 [1:34:08<28:34, 1.48it/s, loss=0.0036, lr=6.26e-06, step=7454] Training: 75%|███████▍ | 7455/10000 [1:34:08<28:34, 1.48it/s, loss=0.0020, lr=6.26e-06, step=7455] Training: 75%|███████▍ | 7456/10000 [1:34:08<26:12, 1.62it/s, loss=0.0020, lr=6.26e-06, step=7455] Training: 75%|███████▍ | 7456/10000 [1:34:08<26:12, 1.62it/s, loss=0.0030, lr=6.25e-06, step=7456] Training: 75%|███████▍ | 7457/10000 [1:34:09<24:50, 1.71it/s, loss=0.0030, lr=6.25e-06, step=7456] Training: 75%|███████▍ | 7457/10000 [1:34:09<24:50, 1.71it/s, loss=0.0285, lr=6.25e-06, step=7457] Training: 75%|███████▍ | 7458/10000 [1:34:09<25:11, 1.68it/s, loss=0.0285, lr=6.25e-06, step=7457] Training: 75%|███████▍ | 7458/10000 [1:34:09<25:11, 1.68it/s, loss=0.0054, lr=6.25e-06, step=7458] Training: 75%|███████▍ | 7459/10000 [1:34:10<28:07, 1.51it/s, loss=0.0054, lr=6.25e-06, step=7458] Training: 75%|███████▍ | 7459/10000 [1:34:10<28:07, 1.51it/s, loss=0.0216, lr=6.25e-06, step=7459]17:40:17.793 [I] step=7460 loss=0.0035 smoothed_loss=0.0087 lr=6.26e-06 grad_norm=0.4218 step_time=0.5560s data_time=0.1036s it/s=1.516 eta_to_10000=1675.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0872 grad_shared_expert=0.2216 (10775:train_pytorch.py:850) + Training: 75%|███████▍ | 7460/10000 [1:34:11<30:13, 1.40it/s, loss=0.0216, lr=6.25e-06, step=7459] Training: 75%|███████▍ | 7460/10000 [1:34:11<30:13, 1.40it/s, loss=0.0035, lr=6.24e-06, step=7460] Training: 75%|███████▍ | 7461/10000 [1:34:12<30:10, 1.40it/s, loss=0.0035, lr=6.24e-06, step=7460] Training: 75%|███████▍ | 7461/10000 [1:34:12<30:10, 1.40it/s, loss=0.0012, lr=6.24e-06, step=7461] Training: 75%|███████▍ | 7462/10000 [1:34:12<32:37, 1.30it/s, loss=0.0012, lr=6.24e-06, step=7461] Training: 75%|███████▍ | 7462/10000 [1:34:12<32:37, 1.30it/s, loss=0.0018, lr=6.24e-06, step=7462] Training: 75%|███████▍ | 7463/10000 [1:34:13<29:28, 1.43it/s, loss=0.0018, lr=6.24e-06, step=7462] Training: 75%|███████▍ | 7463/10000 [1:34:13<29:28, 1.43it/s, loss=0.0062, lr=6.24e-06, step=7463] Training: 75%|███████▍ | 7464/10000 [1:34:13<26:53, 1.57it/s, loss=0.0062, lr=6.24e-06, step=7463] Training: 75%|███████▍ | 7464/10000 [1:34:13<26:53, 1.57it/s, loss=0.0140, lr=6.23e-06, step=7464] Training: 75%|███████▍ | 7465/10000 [1:34:14<25:36, 1.65it/s, loss=0.0140, lr=6.23e-06, step=7464] Training: 75%|███████▍ | 7465/10000 [1:34:14<25:36, 1.65it/s, loss=0.0038, lr=6.23e-06, step=7465] Training: 75%|███████▍ | 7466/10000 [1:34:15<26:47, 1.58it/s, loss=0.0038, lr=6.23e-06, step=7465] Training: 75%|███████▍ | 7466/10000 [1:34:15<26:47, 1.58it/s, loss=0.0123, lr=6.23e-06, step=7466] Training: 75%|███████▍ | 7467/10000 [1:34:15<25:45, 1.64it/s, loss=0.0123, lr=6.23e-06, step=7466] Training: 75%|███████▍ | 7467/10000 [1:34:15<25:45, 1.64it/s, loss=0.0104, lr=6.22e-06, step=7467] Training: 75%|███████▍ | 7468/10000 [1:34:16<28:11, 1.50it/s, loss=0.0104, lr=6.22e-06, step=7467] Training: 75%|███████▍ | 7468/10000 [1:34:16<28:11, 1.50it/s, loss=0.0036, lr=6.22e-06, step=7468] Training: 75%|███████▍ | 7469/10000 [1:34:17<29:06, 1.45it/s, loss=0.0036, lr=6.22e-06, step=7468] Training: 75%|███████▍ | 7469/10000 [1:34:17<29:06, 1.45it/s, loss=0.0088, lr=6.22e-06, step=7469]17:40:24.448 [I] step=7470 loss=0.0329 smoothed_loss=0.0104 lr=6.23e-06 grad_norm=0.4206 step_time=0.5639s data_time=0.1015s it/s=1.503 eta_to_10000=1683.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0142 grad_action_out_proj=0.1799 grad_shared_expert=0.5257 (10775:train_pytorch.py:850) + Training: 75%|███████▍ | 7470/10000 [1:34:18<29:01, 1.45it/s, loss=0.0088, lr=6.22e-06, step=7469] Training: 75%|███████▍ | 7470/10000 [1:34:18<29:01, 1.45it/s, loss=0.0329, lr=6.22e-06, step=7470] Training: 75%|███████▍ | 7471/10000 [1:34:18<28:28, 1.48it/s, loss=0.0329, lr=6.22e-06, step=7470] Training: 75%|███████▍ | 7471/10000 [1:34:18<28:28, 1.48it/s, loss=0.0088, lr=6.21e-06, step=7471] Training: 75%|███████▍ | 7472/10000 [1:34:19<26:11, 1.61it/s, loss=0.0088, lr=6.21e-06, step=7471] Training: 75%|███████▍ | 7472/10000 [1:34:19<26:11, 1.61it/s, loss=0.0285, lr=6.21e-06, step=7472] Training: 75%|███████▍ | 7473/10000 [1:34:19<24:37, 1.71it/s, loss=0.0285, lr=6.21e-06, step=7472] Training: 75%|███████▍ | 7473/10000 [1:34:19<24:37, 1.71it/s, loss=0.0010, lr=6.21e-06, step=7473] Training: 75%|███████▍ | 7474/10000 [1:34:20<25:56, 1.62it/s, loss=0.0010, lr=6.21e-06, step=7473] Training: 75%|███████▍ | 7474/10000 [1:34:20<25:56, 1.62it/s, loss=0.0098, lr=6.20e-06, step=7474] Training: 75%|███████▍ | 7475/10000 [1:34:21<30:24, 1.38it/s, loss=0.0098, lr=6.20e-06, step=7474] Training: 75%|███████▍ | 7475/10000 [1:34:21<30:24, 1.38it/s, loss=0.0864, lr=6.20e-06, step=7475] Training: 75%|███████▍ | 7476/10000 [1:34:21<27:54, 1.51it/s, loss=0.0864, lr=6.20e-06, step=7475] Training: 75%|███████▍ | 7476/10000 [1:34:21<27:54, 1.51it/s, loss=0.0085, lr=6.20e-06, step=7476] Training: 75%|███████▍ | 7477/10000 [1:34:22<27:25, 1.53it/s, loss=0.0085, lr=6.20e-06, step=7476] Training: 75%|███████▍ | 7477/10000 [1:34:22<27:25, 1.53it/s, loss=0.0017, lr=6.20e-06, step=7477] Training: 75%|███████▍ | 7478/10000 [1:34:23<26:52, 1.56it/s, loss=0.0017, lr=6.20e-06, step=7477] Training: 75%|███████▍ | 7478/10000 [1:34:23<26:52, 1.56it/s, loss=0.0422, lr=6.19e-06, step=7478] Training: 75%|███████▍ | 7479/10000 [1:34:23<24:58, 1.68it/s, loss=0.0422, lr=6.19e-06, step=7478] Training: 75%|███████▍ | 7479/10000 [1:34:23<24:58, 1.68it/s, loss=0.0605, lr=6.19e-06, step=7479]17:40:30.544 [I] step=7480 loss=0.0072 smoothed_loss=0.0211 lr=6.20e-06 grad_norm=0.5161 step_time=0.5143s data_time=0.0954s it/s=1.641 eta_to_10000=1535.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0162 grad_action_out_proj=0.1370 grad_shared_expert=0.4836 (10775:train_pytorch.py:850) + Training: 75%|███████▍ | 7480/10000 [1:34:24<24:20, 1.73it/s, loss=0.0605, lr=6.19e-06, step=7479] Training: 75%|███████▍ | 7480/10000 [1:34:24<24:20, 1.73it/s, loss=0.0072, lr=6.19e-06, step=7480] Training: 75%|███████▍ | 7481/10000 [1:34:24<26:30, 1.58it/s, loss=0.0072, lr=6.19e-06, step=7480] Training: 75%|███████▍ | 7481/10000 [1:34:24<26:30, 1.58it/s, loss=0.0007, lr=6.19e-06, step=7481] Training: 75%|███████▍ | 7482/10000 [1:34:25<29:42, 1.41it/s, loss=0.0007, lr=6.19e-06, step=7481] Training: 75%|███████▍ | 7482/10000 [1:34:25<29:42, 1.41it/s, loss=0.0047, lr=6.18e-06, step=7482] Training: 75%|███████▍ | 7483/10000 [1:34:26<27:21, 1.53it/s, loss=0.0047, lr=6.18e-06, step=7482] Training: 75%|███████▍ | 7483/10000 [1:34:26<27:21, 1.53it/s, loss=0.0104, lr=6.18e-06, step=7483] Training: 75%|███████▍ | 7484/10000 [1:34:26<25:19, 1.66it/s, loss=0.0104, lr=6.18e-06, step=7483] Training: 75%|███████▍ | 7484/10000 [1:34:26<25:19, 1.66it/s, loss=0.0016, lr=6.18e-06, step=7484] Training: 75%|███████▍ | 7485/10000 [1:34:27<23:53, 1.75it/s, loss=0.0016, lr=6.18e-06, step=7484] Training: 75%|███████▍ | 7485/10000 [1:34:27<23:53, 1.75it/s, loss=0.0064, lr=6.17e-06, step=7485] Training: 75%|███████▍ | 7486/10000 [1:34:27<23:22, 1.79it/s, loss=0.0064, lr=6.17e-06, step=7485] Training: 75%|███████▍ | 7486/10000 [1:34:27<23:22, 1.79it/s, loss=0.0150, lr=6.17e-06, step=7486] Training: 75%|███████▍ | 7487/10000 [1:34:28<22:42, 1.84it/s, loss=0.0150, lr=6.17e-06, step=7486] Training: 75%|███████▍ | 7487/10000 [1:34:28<22:42, 1.84it/s, loss=0.0163, lr=6.17e-06, step=7487] Training: 75%|███████▍ | 7488/10000 [1:34:28<22:28, 1.86it/s, loss=0.0163, lr=6.17e-06, step=7487] Training: 75%|███████▍ | 7488/10000 [1:34:28<22:28, 1.86it/s, loss=0.0047, lr=6.17e-06, step=7488] Training: 75%|███████▍ | 7489/10000 [1:34:29<24:21, 1.72it/s, loss=0.0047, lr=6.17e-06, step=7488] Training: 75%|███████▍ | 7489/10000 [1:34:29<24:21, 1.72it/s, loss=0.0019, lr=6.16e-06, step=7489]17:40:36.741 [I] step=7490 loss=0.0699 smoothed_loss=0.0183 lr=6.17e-06 grad_norm=0.4143 step_time=0.5414s data_time=0.0783s it/s=1.614 eta_to_10000=1555.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0374 grad_action_out_proj=0.1572 grad_shared_expert=0.5161 (10775:train_pytorch.py:850) + Training: 75%|███████▍ | 7490/10000 [1:34:30<27:10, 1.54it/s, loss=0.0019, lr=6.16e-06, step=7489] Training: 75%|███████▍ | 7490/10000 [1:34:30<27:10, 1.54it/s, loss=0.0699, lr=6.16e-06, step=7490] Training: 75%|███████▍ | 7491/10000 [1:34:30<25:06, 1.67it/s, loss=0.0699, lr=6.16e-06, step=7490] Training: 75%|███████▍ | 7491/10000 [1:34:30<25:06, 1.67it/s, loss=0.0027, lr=6.16e-06, step=7491] Training: 75%|███████▍ | 7492/10000 [1:34:31<25:21, 1.65it/s, loss=0.0027, lr=6.16e-06, step=7491] Training: 75%|███████▍ | 7492/10000 [1:34:31<25:21, 1.65it/s, loss=0.0023, lr=6.16e-06, step=7492] Training: 75%|███████▍ | 7493/10000 [1:34:31<24:06, 1.73it/s, loss=0.0023, lr=6.16e-06, step=7492] Training: 75%|███████▍ | 7493/10000 [1:34:31<24:06, 1.73it/s, loss=0.0451, lr=6.15e-06, step=7493] Training: 75%|███████▍ | 7494/10000 [1:34:32<26:06, 1.60it/s, loss=0.0451, lr=6.15e-06, step=7493] Training: 75%|███████▍ | 7494/10000 [1:34:32<26:06, 1.60it/s, loss=0.0625, lr=6.15e-06, step=7494] Training: 75%|███████▍ | 7495/10000 [1:34:33<24:35, 1.70it/s, loss=0.0625, lr=6.15e-06, step=7494] Training: 75%|███████▍ | 7495/10000 [1:34:33<24:35, 1.70it/s, loss=0.0027, lr=6.15e-06, step=7495] Training: 75%|███████▍ | 7496/10000 [1:34:33<25:26, 1.64it/s, loss=0.0027, lr=6.15e-06, step=7495] Training: 75%|███████▍ | 7496/10000 [1:34:33<25:26, 1.64it/s, loss=0.0054, lr=6.14e-06, step=7496] Training: 75%|███████▍ | 7497/10000 [1:34:34<28:17, 1.47it/s, loss=0.0054, lr=6.14e-06, step=7496] Training: 75%|███████▍ | 7497/10000 [1:34:34<28:17, 1.47it/s, loss=0.0026, lr=6.14e-06, step=7497] Training: 75%|███████▍ | 7498/10000 [1:34:35<26:40, 1.56it/s, loss=0.0026, lr=6.14e-06, step=7497] Training: 75%|███████▍ | 7498/10000 [1:34:35<26:40, 1.56it/s, loss=0.0013, lr=6.14e-06, step=7498] Training: 75%|███████▍ | 7499/10000 [1:34:35<25:53, 1.61it/s, loss=0.0013, lr=6.14e-06, step=7498] Training: 75%|███████▍ | 7499/10000 [1:34:35<25:53, 1.61it/s, loss=0.0018, lr=6.14e-06, step=7499]17:40:42.760 [I] step=7500 loss=0.0051 smoothed_loss=0.0135 lr=6.15e-06 grad_norm=0.4014 step_time=0.5153s data_time=0.0866s it/s=1.662 eta_to_10000=1504.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0160 grad_action_out_proj=0.1234 grad_shared_expert=0.5458 (10775:train_pytorch.py:850) + Training: 75%|███████▌ | 7500/10000 [1:34:36<24:50, 1.68it/s, loss=0.0018, lr=6.14e-06, step=7499] Training: 75%|███████▌ | 7500/10000 [1:34:36<24:50, 1.68it/s, loss=0.0051, lr=6.13e-06, step=7500] Training: 75%|███████▌ | 7501/10000 [1:34:36<24:41, 1.69it/s, loss=0.0051, lr=6.13e-06, step=7500] Training: 75%|███████▌ | 7501/10000 [1:34:36<24:41, 1.69it/s, loss=0.0030, lr=6.13e-06, step=7501] Training: 75%|███████▌ | 7502/10000 [1:34:37<24:37, 1.69it/s, loss=0.0030, lr=6.13e-06, step=7501] Training: 75%|███████▌ | 7502/10000 [1:34:37<24:37, 1.69it/s, loss=0.0028, lr=6.13e-06, step=7502] Training: 75%|███████▌ | 7503/10000 [1:34:38<26:28, 1.57it/s, loss=0.0028, lr=6.13e-06, step=7502] Training: 75%|███████▌ | 7503/10000 [1:34:38<26:28, 1.57it/s, loss=0.0027, lr=6.13e-06, step=7503] Training: 75%|███████▌ | 7504/10000 [1:34:38<24:49, 1.68it/s, loss=0.0027, lr=6.13e-06, step=7503] Training: 75%|███████▌ | 7504/10000 [1:34:38<24:49, 1.68it/s, loss=0.0069, lr=6.12e-06, step=7504] Training: 75%|███████▌ | 7505/10000 [1:34:39<24:06, 1.72it/s, loss=0.0069, lr=6.12e-06, step=7504] Training: 75%|███████▌ | 7505/10000 [1:34:39<24:06, 1.72it/s, loss=0.0077, lr=6.12e-06, step=7505] Training: 75%|███████▌ | 7506/10000 [1:34:39<23:48, 1.75it/s, loss=0.0077, lr=6.12e-06, step=7505] Training: 75%|███████▌ | 7506/10000 [1:34:39<23:48, 1.75it/s, loss=0.0147, lr=6.12e-06, step=7506] Training: 75%|███████▌ | 7507/10000 [1:34:40<24:25, 1.70it/s, loss=0.0147, lr=6.12e-06, step=7506] Training: 75%|███████▌ | 7507/10000 [1:34:40<24:25, 1.70it/s, loss=0.0060, lr=6.11e-06, step=7507] Training: 75%|███████▌ | 7508/10000 [1:34:41<24:52, 1.67it/s, loss=0.0060, lr=6.11e-06, step=7507] Training: 75%|███████▌ | 7508/10000 [1:34:41<24:52, 1.67it/s, loss=0.0090, lr=6.11e-06, step=7508] Training: 75%|███████▌ | 7509/10000 [1:34:41<23:45, 1.75it/s, loss=0.0090, lr=6.11e-06, step=7508] Training: 75%|███████▌ | 7509/10000 [1:34:41<23:45, 1.75it/s, loss=0.0016, lr=6.11e-06, step=7509]17:40:48.744 [I] step=7510 loss=0.0038 smoothed_loss=0.0086 lr=6.12e-06 grad_norm=0.3868 step_time=0.5160s data_time=0.0824s it/s=1.671 eta_to_10000=1489.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.0973 grad_shared_expert=0.3685 (10775:train_pytorch.py:850) + Training: 75%|███████▌ | 7510/10000 [1:34:42<25:28, 1.63it/s, loss=0.0016, lr=6.11e-06, step=7509] Training: 75%|███████▌ | 7510/10000 [1:34:42<25:28, 1.63it/s, loss=0.0038, lr=6.11e-06, step=7510] Training: 75%|███████▌ | 7511/10000 [1:34:43<27:08, 1.53it/s, loss=0.0038, lr=6.11e-06, step=7510] Training: 75%|███████▌ | 7511/10000 [1:34:43<27:08, 1.53it/s, loss=0.0029, lr=6.10e-06, step=7511] Training: 75%|███████▌ | 7512/10000 [1:34:43<25:22, 1.63it/s, loss=0.0029, lr=6.10e-06, step=7511] Training: 75%|███████▌ | 7512/10000 [1:34:43<25:22, 1.63it/s, loss=0.0408, lr=6.10e-06, step=7512] Training: 75%|███████▌ | 7513/10000 [1:34:44<25:33, 1.62it/s, loss=0.0408, lr=6.10e-06, step=7512] Training: 75%|███████▌ | 7513/10000 [1:34:44<25:33, 1.62it/s, loss=0.0041, lr=6.10e-06, step=7513] Training: 75%|███████▌ | 7514/10000 [1:34:44<24:07, 1.72it/s, loss=0.0041, lr=6.10e-06, step=7513] Training: 75%|███████▌ | 7514/10000 [1:34:44<24:07, 1.72it/s, loss=0.0054, lr=6.10e-06, step=7514] Training: 75%|███████▌ | 7515/10000 [1:34:45<23:10, 1.79it/s, loss=0.0054, lr=6.10e-06, step=7514] Training: 75%|███████▌ | 7515/10000 [1:34:45<23:10, 1.79it/s, loss=0.0308, lr=6.09e-06, step=7515] Training: 75%|███████▌ | 7516/10000 [1:34:45<22:51, 1.81it/s, loss=0.0308, lr=6.09e-06, step=7515] Training: 75%|███████▌ | 7516/10000 [1:34:45<22:51, 1.81it/s, loss=0.0013, lr=6.09e-06, step=7516] Training: 75%|███████▌ | 7517/10000 [1:34:46<24:23, 1.70it/s, loss=0.0013, lr=6.09e-06, step=7516] Training: 75%|███████▌ | 7517/10000 [1:34:46<24:23, 1.70it/s, loss=0.0062, lr=6.09e-06, step=7517] Training: 75%|███████▌ | 7518/10000 [1:34:47<26:42, 1.55it/s, loss=0.0062, lr=6.09e-06, step=7517] Training: 75%|███████▌ | 7518/10000 [1:34:47<26:42, 1.55it/s, loss=0.0158, lr=6.08e-06, step=7518] Training: 75%|███████▌ | 7519/10000 [1:34:47<24:48, 1.67it/s, loss=0.0158, lr=6.08e-06, step=7518] Training: 75%|███████▌ | 7519/10000 [1:34:47<24:48, 1.67it/s, loss=0.0091, lr=6.08e-06, step=7519]17:40:54.702 [I] step=7520 loss=0.0198 smoothed_loss=0.0118 lr=6.09e-06 grad_norm=0.4516 step_time=0.5246s data_time=0.0712s it/s=1.679 eta_to_10000=1477.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0141 grad_action_out_proj=0.1082 grad_shared_expert=0.5338 (10775:train_pytorch.py:850) + Training: 75%|███████▌ | 7520/10000 [1:34:48<24:31, 1.68it/s, loss=0.0091, lr=6.08e-06, step=7519] Training: 75%|███████▌ | 7520/10000 [1:34:48<24:31, 1.68it/s, loss=0.0198, lr=6.08e-06, step=7520] Training: 75%|███████▌ | 7521/10000 [1:34:48<24:20, 1.70it/s, loss=0.0198, lr=6.08e-06, step=7520] Training: 75%|███████▌ | 7521/10000 [1:34:48<24:20, 1.70it/s, loss=0.0151, lr=6.08e-06, step=7521] Training: 75%|███████▌ | 7522/10000 [1:34:49<23:11, 1.78it/s, loss=0.0151, lr=6.08e-06, step=7521] Training: 75%|███████▌ | 7522/10000 [1:34:49<23:11, 1.78it/s, loss=0.0023, lr=6.07e-06, step=7522] Training: 75%|███████▌ | 7523/10000 [1:34:49<22:28, 1.84it/s, loss=0.0023, lr=6.07e-06, step=7522] Training: 75%|███████▌ | 7523/10000 [1:34:49<22:28, 1.84it/s, loss=0.0087, lr=6.07e-06, step=7523] Training: 75%|███████▌ | 7524/10000 [1:34:50<23:42, 1.74it/s, loss=0.0087, lr=6.07e-06, step=7523] Training: 75%|███████▌ | 7524/10000 [1:34:50<23:42, 1.74it/s, loss=0.0396, lr=6.07e-06, step=7524] Training: 75%|███████▌ | 7525/10000 [1:34:51<26:12, 1.57it/s, loss=0.0396, lr=6.07e-06, step=7524] Training: 75%|███████▌ | 7525/10000 [1:34:51<26:12, 1.57it/s, loss=0.0020, lr=6.07e-06, step=7525] Training: 75%|███████▌ | 7526/10000 [1:34:51<25:27, 1.62it/s, loss=0.0020, lr=6.07e-06, step=7525] Training: 75%|███████▌ | 7526/10000 [1:34:51<25:27, 1.62it/s, loss=0.0016, lr=6.06e-06, step=7526] Training: 75%|███████▌ | 7527/10000 [1:34:52<23:55, 1.72it/s, loss=0.0016, lr=6.06e-06, step=7526] Training: 75%|███████▌ | 7527/10000 [1:34:52<23:55, 1.72it/s, loss=0.0036, lr=6.06e-06, step=7527] Training: 75%|███████▌ | 7528/10000 [1:34:52<22:50, 1.80it/s, loss=0.0036, lr=6.06e-06, step=7527] Training: 75%|███████▌ | 7528/10000 [1:34:52<22:50, 1.80it/s, loss=0.0047, lr=6.06e-06, step=7528] Training: 75%|███████▌ | 7529/10000 [1:34:53<22:06, 1.86it/s, loss=0.0047, lr=6.06e-06, step=7528] Training: 75%|███████▌ | 7529/10000 [1:34:53<22:06, 1.86it/s, loss=0.0030, lr=6.05e-06, step=7529]17:41:00.532 [I] step=7530 loss=0.0128 smoothed_loss=0.0097 lr=6.06e-06 grad_norm=0.4565 step_time=0.5003s data_time=0.0826s it/s=1.716 eta_to_10000=1439.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0144 grad_action_out_proj=0.1283 grad_shared_expert=0.3876 (10775:train_pytorch.py:850) + Training: 75%|███████▌ | 7530/10000 [1:34:54<24:56, 1.65it/s, loss=0.0030, lr=6.05e-06, step=7529] Training: 75%|███████▌ | 7530/10000 [1:34:54<24:56, 1.65it/s, loss=0.0128, lr=6.05e-06, step=7530] Training: 75%|███████▌ | 7531/10000 [1:34:54<25:30, 1.61it/s, loss=0.0128, lr=6.05e-06, step=7530] Training: 75%|███████▌ | 7531/10000 [1:34:54<25:30, 1.61it/s, loss=0.0244, lr=6.05e-06, step=7531] Training: 75%|███████▌ | 7532/10000 [1:34:55<26:50, 1.53it/s, loss=0.0244, lr=6.05e-06, step=7531] Training: 75%|███████▌ | 7532/10000 [1:34:55<26:50, 1.53it/s, loss=0.0035, lr=6.05e-06, step=7532] Training: 75%|███████▌ | 7533/10000 [1:34:55<24:51, 1.65it/s, loss=0.0035, lr=6.05e-06, step=7532] Training: 75%|███████▌ | 7533/10000 [1:34:55<24:51, 1.65it/s, loss=0.0057, lr=6.04e-06, step=7533] Training: 75%|███████▌ | 7534/10000 [1:34:56<23:46, 1.73it/s, loss=0.0057, lr=6.04e-06, step=7533] Training: 75%|███████▌ | 7534/10000 [1:34:56<23:46, 1.73it/s, loss=0.0061, lr=6.04e-06, step=7534] Training: 75%|███████▌ | 7535/10000 [1:34:56<22:40, 1.81it/s, loss=0.0061, lr=6.04e-06, step=7534] Training: 75%|███████▌ | 7535/10000 [1:34:56<22:40, 1.81it/s, loss=0.0029, lr=6.04e-06, step=7535] Training: 75%|███████▌ | 7536/10000 [1:34:57<22:08, 1.85it/s, loss=0.0029, lr=6.04e-06, step=7535] Training: 75%|███████▌ | 7536/10000 [1:34:57<22:08, 1.85it/s, loss=0.0035, lr=6.04e-06, step=7536] Training: 75%|███████▌ | 7537/10000 [1:34:58<24:58, 1.64it/s, loss=0.0035, lr=6.04e-06, step=7536] Training: 75%|███████▌ | 7537/10000 [1:34:58<24:58, 1.64it/s, loss=0.0023, lr=6.03e-06, step=7537] Training: 75%|███████▌ | 7538/10000 [1:34:58<25:32, 1.61it/s, loss=0.0023, lr=6.03e-06, step=7537] Training: 75%|███████▌ | 7538/10000 [1:34:58<25:32, 1.61it/s, loss=0.0043, lr=6.03e-06, step=7538] Training: 75%|███████▌ | 7539/10000 [1:34:59<27:01, 1.52it/s, loss=0.0043, lr=6.03e-06, step=7538] Training: 75%|███████▌ | 7539/10000 [1:34:59<27:01, 1.52it/s, loss=0.0023, lr=6.03e-06, step=7539]17:41:06.637 [I] step=7540 loss=0.0246 smoothed_loss=0.0087 lr=6.04e-06 grad_norm=0.4375 step_time=0.5338s data_time=0.0767s it/s=1.638 eta_to_10000=1501.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0264 grad_action_out_proj=0.2225 grad_shared_expert=0.7478 (10775:train_pytorch.py:850) + Training: 75%|███████▌ | 7540/10000 [1:35:00<25:37, 1.60it/s, loss=0.0023, lr=6.03e-06, step=7539] Training: 75%|███████▌ | 7540/10000 [1:35:00<25:37, 1.60it/s, loss=0.0246, lr=6.02e-06, step=7540] Training: 75%|███████▌ | 7541/10000 [1:35:00<25:14, 1.62it/s, loss=0.0246, lr=6.02e-06, step=7540] Training: 75%|███████▌ | 7541/10000 [1:35:00<25:14, 1.62it/s, loss=0.0026, lr=6.02e-06, step=7541] Training: 75%|███████▌ | 7542/10000 [1:35:01<26:06, 1.57it/s, loss=0.0026, lr=6.02e-06, step=7541] Training: 75%|███████▌ | 7542/10000 [1:35:01<26:06, 1.57it/s, loss=0.0071, lr=6.02e-06, step=7542] Training: 75%|███████▌ | 7543/10000 [1:35:02<25:03, 1.63it/s, loss=0.0071, lr=6.02e-06, step=7542] Training: 75%|███████▌ | 7543/10000 [1:35:02<25:03, 1.63it/s, loss=0.0061, lr=6.02e-06, step=7543] Training: 75%|███████▌ | 7544/10000 [1:35:02<23:30, 1.74it/s, loss=0.0061, lr=6.02e-06, step=7543] Training: 75%|███████▌ | 7544/10000 [1:35:02<23:30, 1.74it/s, loss=0.0048, lr=6.01e-06, step=7544] Training: 75%|███████▌ | 7545/10000 [1:35:03<24:30, 1.67it/s, loss=0.0048, lr=6.01e-06, step=7544] Training: 75%|███████▌ | 7545/10000 [1:35:03<24:30, 1.67it/s, loss=0.0512, lr=6.01e-06, step=7545] Training: 75%|███████▌ | 7546/10000 [1:35:03<25:57, 1.58it/s, loss=0.0512, lr=6.01e-06, step=7545] Training: 75%|███████▌ | 7546/10000 [1:35:03<25:57, 1.58it/s, loss=0.0140, lr=6.01e-06, step=7546] Training: 75%|███████▌ | 7547/10000 [1:35:04<24:36, 1.66it/s, loss=0.0140, lr=6.01e-06, step=7546] Training: 75%|███████▌ | 7547/10000 [1:35:04<24:36, 1.66it/s, loss=0.0132, lr=6.01e-06, step=7547] Training: 75%|███████▌ | 7548/10000 [1:35:04<23:24, 1.75it/s, loss=0.0132, lr=6.01e-06, step=7547] Training: 75%|███████▌ | 7548/10000 [1:35:04<23:24, 1.75it/s, loss=0.0052, lr=6.00e-06, step=7548] Training: 75%|███████▌ | 7549/10000 [1:35:05<22:15, 1.83it/s, loss=0.0052, lr=6.00e-06, step=7548] Training: 75%|███████▌ | 7549/10000 [1:35:05<22:15, 1.83it/s, loss=0.0131, lr=6.00e-06, step=7549]17:41:12.401 [I] step=7550 loss=0.0036 smoothed_loss=0.0108 lr=6.01e-06 grad_norm=0.4701 step_time=0.5068s data_time=0.0696s it/s=1.735 eta_to_10000=1411.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0084 grad_action_out_proj=0.0893 grad_shared_expert=0.2518 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7550/10000 [1:35:05<22:23, 1.82it/s, loss=0.0131, lr=6.00e-06, step=7549] Training: 76%|███████▌ | 7550/10000 [1:35:05<22:23, 1.82it/s, loss=0.0036, lr=6.00e-06, step=7550] Training: 76%|███████▌ | 7551/10000 [1:35:06<21:33, 1.89it/s, loss=0.0036, lr=6.00e-06, step=7550] Training: 76%|███████▌ | 7551/10000 [1:35:06<21:33, 1.89it/s, loss=0.0037, lr=5.99e-06, step=7551] Training: 76%|███████▌ | 7552/10000 [1:35:06<21:36, 1.89it/s, loss=0.0037, lr=5.99e-06, step=7551] Training: 76%|███████▌ | 7552/10000 [1:35:06<21:36, 1.89it/s, loss=0.0060, lr=5.99e-06, step=7552] Training: 76%|███████▌ | 7553/10000 [1:35:07<27:04, 1.51it/s, loss=0.0060, lr=5.99e-06, step=7552] Training: 76%|███████▌ | 7553/10000 [1:35:07<27:04, 1.51it/s, loss=0.0018, lr=5.99e-06, step=7553] Training: 76%|███████▌ | 7554/10000 [1:35:08<28:41, 1.42it/s, loss=0.0018, lr=5.99e-06, step=7553] Training: 76%|███████▌ | 7554/10000 [1:35:08<28:41, 1.42it/s, loss=0.0231, lr=5.99e-06, step=7554] Training: 76%|███████▌ | 7555/10000 [1:35:09<28:06, 1.45it/s, loss=0.0231, lr=5.99e-06, step=7554] Training: 76%|███████▌ | 7555/10000 [1:35:09<28:06, 1.45it/s, loss=0.0072, lr=5.98e-06, step=7555] Training: 76%|███████▌ | 7556/10000 [1:35:09<26:24, 1.54it/s, loss=0.0072, lr=5.98e-06, step=7555] Training: 76%|███████▌ | 7556/10000 [1:35:09<26:24, 1.54it/s, loss=0.0065, lr=5.98e-06, step=7556] Training: 76%|███████▌ | 7557/10000 [1:35:10<26:25, 1.54it/s, loss=0.0065, lr=5.98e-06, step=7556] Training: 76%|███████▌ | 7557/10000 [1:35:10<26:25, 1.54it/s, loss=0.0027, lr=5.98e-06, step=7557] Training: 76%|███████▌ | 7558/10000 [1:35:11<25:49, 1.58it/s, loss=0.0027, lr=5.98e-06, step=7557] Training: 76%|███████▌ | 7558/10000 [1:35:11<25:49, 1.58it/s, loss=0.0049, lr=5.98e-06, step=7558] Training: 76%|███████▌ | 7559/10000 [1:35:11<27:27, 1.48it/s, loss=0.0049, lr=5.98e-06, step=7558] Training: 76%|███████▌ | 7559/10000 [1:35:11<27:27, 1.48it/s, loss=0.0091, lr=5.97e-06, step=7559]17:41:19.466 [I] step=7560 loss=0.0029 smoothed_loss=0.0080 lr=5.98e-06 grad_norm=0.4119 step_time=0.5683s data_time=0.1382s it/s=1.416 eta_to_10000=1723.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0186 grad_action_out_proj=0.0959 grad_shared_expert=0.3808 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7560/10000 [1:35:13<32:00, 1.27it/s, loss=0.0091, lr=5.97e-06, step=7559] Training: 76%|███████▌ | 7560/10000 [1:35:13<32:00, 1.27it/s, loss=0.0029, lr=5.97e-06, step=7560] Training: 76%|███████▌ | 7561/10000 [1:35:13<32:58, 1.23it/s, loss=0.0029, lr=5.97e-06, step=7560] Training: 76%|███████▌ | 7561/10000 [1:35:13<32:58, 1.23it/s, loss=0.0055, lr=5.97e-06, step=7561] Training: 76%|███████▌ | 7562/10000 [1:35:14<31:08, 1.30it/s, loss=0.0055, lr=5.97e-06, step=7561] Training: 76%|███████▌ | 7562/10000 [1:35:14<31:08, 1.30it/s, loss=0.0181, lr=5.97e-06, step=7562] Training: 76%|███████▌ | 7563/10000 [1:35:15<29:42, 1.37it/s, loss=0.0181, lr=5.97e-06, step=7562] Training: 76%|███████▌ | 7563/10000 [1:35:15<29:42, 1.37it/s, loss=0.0037, lr=5.96e-06, step=7563] Training: 76%|███████▌ | 7564/10000 [1:35:15<27:46, 1.46it/s, loss=0.0037, lr=5.96e-06, step=7563] Training: 76%|███████▌ | 7564/10000 [1:35:15<27:46, 1.46it/s, loss=0.0094, lr=5.96e-06, step=7564] Training: 76%|███████▌ | 7565/10000 [1:35:16<26:07, 1.55it/s, loss=0.0094, lr=5.96e-06, step=7564] Training: 76%|███████▌ | 7565/10000 [1:35:16<26:07, 1.55it/s, loss=0.0064, lr=5.96e-06, step=7565] Training: 76%|███████▌ | 7566/10000 [1:35:17<30:45, 1.32it/s, loss=0.0064, lr=5.96e-06, step=7565] Training: 76%|███████▌ | 7566/10000 [1:35:17<30:45, 1.32it/s, loss=0.0163, lr=5.95e-06, step=7566] Training: 76%|███████▌ | 7567/10000 [1:35:17<28:49, 1.41it/s, loss=0.0163, lr=5.95e-06, step=7566] Training: 76%|███████▌ | 7567/10000 [1:35:17<28:49, 1.41it/s, loss=0.0120, lr=5.95e-06, step=7567] Training: 76%|███████▌ | 7568/10000 [1:35:19<36:44, 1.10it/s, loss=0.0120, lr=5.95e-06, step=7567] Training: 76%|███████▌ | 7568/10000 [1:35:19<36:44, 1.10it/s, loss=0.0082, lr=5.95e-06, step=7568] Training: 76%|███████▌ | 7569/10000 [1:35:19<31:38, 1.28it/s, loss=0.0082, lr=5.95e-06, step=7568] Training: 76%|███████▌ | 7569/10000 [1:35:19<31:38, 1.28it/s, loss=0.0329, lr=5.95e-06, step=7569]17:41:26.773 [I] step=7570 loss=0.0041 smoothed_loss=0.0108 lr=5.96e-06 grad_norm=0.4601 step_time=0.5578s data_time=0.1728s it/s=1.369 eta_to_10000=1775.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0192 grad_action_out_proj=0.1351 grad_shared_expert=0.4243 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7570/10000 [1:35:20<28:33, 1.42it/s, loss=0.0329, lr=5.95e-06, step=7569] Training: 76%|███████▌ | 7570/10000 [1:35:20<28:33, 1.42it/s, loss=0.0041, lr=5.94e-06, step=7570] Training: 76%|███████▌ | 7571/10000 [1:35:21<28:05, 1.44it/s, loss=0.0041, lr=5.94e-06, step=7570] Training: 76%|███████▌ | 7571/10000 [1:35:21<28:05, 1.44it/s, loss=0.0127, lr=5.94e-06, step=7571] Training: 76%|███████▌ | 7572/10000 [1:35:21<28:07, 1.44it/s, loss=0.0127, lr=5.94e-06, step=7571] Training: 76%|███████▌ | 7572/10000 [1:35:21<28:07, 1.44it/s, loss=0.0041, lr=5.94e-06, step=7572] Training: 76%|███████▌ | 7573/10000 [1:35:22<25:57, 1.56it/s, loss=0.0041, lr=5.94e-06, step=7572] Training: 76%|███████▌ | 7573/10000 [1:35:22<25:57, 1.56it/s, loss=0.0483, lr=5.94e-06, step=7573] Training: 76%|███████▌ | 7574/10000 [1:35:22<24:25, 1.66it/s, loss=0.0483, lr=5.94e-06, step=7573] Training: 76%|███████▌ | 7574/10000 [1:35:22<24:25, 1.66it/s, loss=0.0028, lr=5.93e-06, step=7574] Training: 76%|███████▌ | 7575/10000 [1:35:23<26:55, 1.50it/s, loss=0.0028, lr=5.93e-06, step=7574] Training: 76%|███████▌ | 7575/10000 [1:35:23<26:55, 1.50it/s, loss=0.0068, lr=5.93e-06, step=7575] Training: 76%|███████▌ | 7576/10000 [1:35:24<24:52, 1.62it/s, loss=0.0068, lr=5.93e-06, step=7575] Training: 76%|███████▌ | 7576/10000 [1:35:24<24:52, 1.62it/s, loss=0.0050, lr=5.93e-06, step=7576] Training: 76%|███████▌ | 7577/10000 [1:35:24<23:40, 1.71it/s, loss=0.0050, lr=5.93e-06, step=7576] Training: 76%|███████▌ | 7577/10000 [1:35:24<23:40, 1.71it/s, loss=0.0121, lr=5.92e-06, step=7577] Training: 76%|███████▌ | 7578/10000 [1:35:25<23:31, 1.72it/s, loss=0.0121, lr=5.92e-06, step=7577] Training: 76%|███████▌ | 7578/10000 [1:35:25<23:31, 1.72it/s, loss=0.1010, lr=5.92e-06, step=7578] Training: 76%|███████▌ | 7579/10000 [1:35:25<22:43, 1.78it/s, loss=0.1010, lr=5.92e-06, step=7578] Training: 76%|███████▌ | 7579/10000 [1:35:25<22:43, 1.78it/s, loss=0.0047, lr=5.92e-06, step=7579]17:41:32.659 [I] step=7580 loss=0.0029 smoothed_loss=0.0174 lr=5.93e-06 grad_norm=0.4063 step_time=0.5032s data_time=0.0854s it/s=1.699 eta_to_10000=1424.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0067 grad_action_out_proj=0.0689 grad_shared_expert=0.2665 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7580/10000 [1:35:26<22:46, 1.77it/s, loss=0.0047, lr=5.92e-06, step=7579] Training: 76%|███████▌ | 7580/10000 [1:35:26<22:46, 1.77it/s, loss=0.0029, lr=5.92e-06, step=7580] Training: 76%|███████▌ | 7581/10000 [1:35:26<22:00, 1.83it/s, loss=0.0029, lr=5.92e-06, step=7580] Training: 76%|███████▌ | 7581/10000 [1:35:26<22:00, 1.83it/s, loss=0.0016, lr=5.91e-06, step=7581] Training: 76%|███████▌ | 7582/10000 [1:35:27<25:55, 1.55it/s, loss=0.0016, lr=5.91e-06, step=7581] Training: 76%|███████▌ | 7582/10000 [1:35:27<25:55, 1.55it/s, loss=0.0132, lr=5.91e-06, step=7582] Training: 76%|███████▌ | 7583/10000 [1:35:28<26:16, 1.53it/s, loss=0.0132, lr=5.91e-06, step=7582] Training: 76%|███████▌ | 7583/10000 [1:35:28<26:16, 1.53it/s, loss=0.0102, lr=5.91e-06, step=7583] Training: 76%|███████▌ | 7584/10000 [1:35:28<26:11, 1.54it/s, loss=0.0102, lr=5.91e-06, step=7583] Training: 76%|███████▌ | 7584/10000 [1:35:28<26:11, 1.54it/s, loss=0.0091, lr=5.91e-06, step=7584] Training: 76%|███████▌ | 7585/10000 [1:35:29<27:43, 1.45it/s, loss=0.0091, lr=5.91e-06, step=7584] Training: 76%|███████▌ | 7585/10000 [1:35:29<27:43, 1.45it/s, loss=0.0158, lr=5.90e-06, step=7585] Training: 76%|███████▌ | 7586/10000 [1:35:30<26:40, 1.51it/s, loss=0.0158, lr=5.90e-06, step=7585] Training: 76%|███████▌ | 7586/10000 [1:35:30<26:40, 1.51it/s, loss=0.0089, lr=5.90e-06, step=7586] Training: 76%|███████▌ | 7587/10000 [1:35:30<25:53, 1.55it/s, loss=0.0089, lr=5.90e-06, step=7586] Training: 76%|███████▌ | 7587/10000 [1:35:30<25:53, 1.55it/s, loss=0.0081, lr=5.90e-06, step=7587] Training: 76%|███████▌ | 7588/10000 [1:35:31<24:25, 1.65it/s, loss=0.0081, lr=5.90e-06, step=7587] Training: 76%|███████▌ | 7588/10000 [1:35:31<24:25, 1.65it/s, loss=0.0078, lr=5.90e-06, step=7588] Training: 76%|███████▌ | 7589/10000 [1:35:32<27:34, 1.46it/s, loss=0.0078, lr=5.90e-06, step=7588] Training: 76%|███████▌ | 7589/10000 [1:35:32<27:34, 1.46it/s, loss=0.0029, lr=5.89e-06, step=7589]17:41:39.493 [I] step=7590 loss=0.0078 smoothed_loss=0.0114 lr=5.90e-06 grad_norm=0.3959 step_time=0.5809s data_time=0.1025s it/s=1.464 eta_to_10000=1646.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0262 grad_action_out_proj=0.1231 grad_shared_expert=0.5746 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7590/10000 [1:35:33<28:33, 1.41it/s, loss=0.0029, lr=5.89e-06, step=7589] Training: 76%|███████▌ | 7590/10000 [1:35:33<28:33, 1.41it/s, loss=0.0078, lr=5.89e-06, step=7590] Training: 76%|███████▌ | 7591/10000 [1:35:33<26:18, 1.53it/s, loss=0.0078, lr=5.89e-06, step=7590] Training: 76%|███████▌ | 7591/10000 [1:35:33<26:18, 1.53it/s, loss=0.0082, lr=5.89e-06, step=7591] Training: 76%|███████▌ | 7592/10000 [1:35:34<24:45, 1.62it/s, loss=0.0082, lr=5.89e-06, step=7591] Training: 76%|███████▌ | 7592/10000 [1:35:34<24:45, 1.62it/s, loss=0.0110, lr=5.88e-06, step=7592] Training: 76%|███████▌ | 7593/10000 [1:35:34<24:43, 1.62it/s, loss=0.0110, lr=5.88e-06, step=7592] Training: 76%|███████▌ | 7593/10000 [1:35:34<24:43, 1.62it/s, loss=0.0071, lr=5.88e-06, step=7593] Training: 76%|███████▌ | 7594/10000 [1:35:35<24:24, 1.64it/s, loss=0.0071, lr=5.88e-06, step=7593] Training: 76%|███████▌ | 7594/10000 [1:35:35<24:24, 1.64it/s, loss=0.0051, lr=5.88e-06, step=7594] Training: 76%|███████▌ | 7595/10000 [1:35:35<23:57, 1.67it/s, loss=0.0051, lr=5.88e-06, step=7594] Training: 76%|███████▌ | 7595/10000 [1:35:35<23:57, 1.67it/s, loss=0.0044, lr=5.88e-06, step=7595] Training: 76%|███████▌ | 7596/10000 [1:35:36<26:22, 1.52it/s, loss=0.0044, lr=5.88e-06, step=7595] Training: 76%|███████▌ | 7596/10000 [1:35:36<26:22, 1.52it/s, loss=0.0019, lr=5.87e-06, step=7596] Training: 76%|███████▌ | 7597/10000 [1:35:37<29:10, 1.37it/s, loss=0.0019, lr=5.87e-06, step=7596] Training: 76%|███████▌ | 7597/10000 [1:35:37<29:10, 1.37it/s, loss=0.0066, lr=5.87e-06, step=7597] Training: 76%|███████▌ | 7598/10000 [1:35:38<28:46, 1.39it/s, loss=0.0066, lr=5.87e-06, step=7597] Training: 76%|███████▌ | 7598/10000 [1:35:38<28:46, 1.39it/s, loss=0.0011, lr=5.87e-06, step=7598] Training: 76%|███████▌ | 7599/10000 [1:35:38<28:10, 1.42it/s, loss=0.0011, lr=5.87e-06, step=7598] Training: 76%|███████▌ | 7599/10000 [1:35:38<28:10, 1.42it/s, loss=0.0205, lr=5.87e-06, step=7599]17:41:45.971 [I] step=7600 loss=0.0060 smoothed_loss=0.0088 lr=5.88e-06 grad_norm=0.4206 step_time=0.5338s data_time=0.1139s it/s=1.544 eta_to_10000=1554.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0678 grad_shared_expert=0.3898 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7600/10000 [1:35:39<26:47, 1.49it/s, loss=0.0205, lr=5.87e-06, step=7599] Training: 76%|███████▌ | 7600/10000 [1:35:39<26:47, 1.49it/s, loss=0.0060, lr=5.86e-06, step=7600] Training: 76%|███████▌ | 7601/10000 [1:35:40<24:48, 1.61it/s, loss=0.0060, lr=5.86e-06, step=7600] Training: 76%|███████▌ | 7601/10000 [1:35:40<24:48, 1.61it/s, loss=0.0045, lr=5.86e-06, step=7601] Training: 76%|███████▌ | 7602/10000 [1:35:40<26:01, 1.54it/s, loss=0.0045, lr=5.86e-06, step=7601] Training: 76%|███████▌ | 7602/10000 [1:35:40<26:01, 1.54it/s, loss=0.0115, lr=5.86e-06, step=7602] Training: 76%|███████▌ | 7603/10000 [1:35:41<25:22, 1.57it/s, loss=0.0115, lr=5.86e-06, step=7602] Training: 76%|███████▌ | 7603/10000 [1:35:41<25:22, 1.57it/s, loss=0.0023, lr=5.86e-06, step=7603] Training: 76%|███████▌ | 7604/10000 [1:35:42<27:45, 1.44it/s, loss=0.0023, lr=5.86e-06, step=7603] Training: 76%|███████▌ | 7604/10000 [1:35:42<27:45, 1.44it/s, loss=0.0049, lr=5.85e-06, step=7604] Training: 76%|███████▌ | 7605/10000 [1:35:42<25:38, 1.56it/s, loss=0.0049, lr=5.85e-06, step=7604] Training: 76%|███████▌ | 7605/10000 [1:35:42<25:38, 1.56it/s, loss=0.0111, lr=5.85e-06, step=7605] Training: 76%|███████▌ | 7606/10000 [1:35:43<24:07, 1.65it/s, loss=0.0111, lr=5.85e-06, step=7605] Training: 76%|███████▌ | 7606/10000 [1:35:43<24:07, 1.65it/s, loss=0.0061, lr=5.85e-06, step=7606] Training: 76%|███████▌ | 7607/10000 [1:35:43<22:57, 1.74it/s, loss=0.0061, lr=5.85e-06, step=7606] Training: 76%|███████▌ | 7607/10000 [1:35:43<22:57, 1.74it/s, loss=0.0039, lr=5.85e-06, step=7607] Training: 76%|███████▌ | 7608/10000 [1:35:44<22:21, 1.78it/s, loss=0.0039, lr=5.85e-06, step=7607] Training: 76%|███████▌ | 7608/10000 [1:35:44<22:21, 1.78it/s, loss=0.0122, lr=5.84e-06, step=7608] Training: 76%|███████▌ | 7609/10000 [1:35:44<21:33, 1.85it/s, loss=0.0122, lr=5.84e-06, step=7608] Training: 76%|███████▌ | 7609/10000 [1:35:44<21:33, 1.85it/s, loss=0.0092, lr=5.84e-06, step=7609]17:41:51.893 [I] step=7610 loss=0.0034 smoothed_loss=0.0076 lr=5.85e-06 grad_norm=0.3867 step_time=0.5038s data_time=0.0883s it/s=1.689 eta_to_10000=1414.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0114 grad_action_out_proj=0.0820 grad_shared_expert=0.2910 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7610/10000 [1:35:45<23:23, 1.70it/s, loss=0.0092, lr=5.84e-06, step=7609] Training: 76%|███████▌ | 7610/10000 [1:35:45<23:23, 1.70it/s, loss=0.0034, lr=5.84e-06, step=7610] Training: 76%|███████▌ | 7611/10000 [1:35:46<25:30, 1.56it/s, loss=0.0034, lr=5.84e-06, step=7610] Training: 76%|███████▌ | 7611/10000 [1:35:46<25:30, 1.56it/s, loss=0.0124, lr=5.83e-06, step=7611] Training: 76%|███████▌ | 7612/10000 [1:35:46<24:04, 1.65it/s, loss=0.0124, lr=5.83e-06, step=7611] Training: 76%|███████▌ | 7612/10000 [1:35:46<24:04, 1.65it/s, loss=0.0041, lr=5.83e-06, step=7612] Training: 76%|███████▌ | 7613/10000 [1:35:47<22:37, 1.76it/s, loss=0.0041, lr=5.83e-06, step=7612] Training: 76%|███████▌ | 7613/10000 [1:35:47<22:37, 1.76it/s, loss=0.0083, lr=5.83e-06, step=7613] Training: 76%|███████▌ | 7614/10000 [1:35:47<21:54, 1.81it/s, loss=0.0083, lr=5.83e-06, step=7613] Training: 76%|███████▌ | 7614/10000 [1:35:47<21:54, 1.81it/s, loss=0.0057, lr=5.83e-06, step=7614] Training: 76%|███████▌ | 7615/10000 [1:35:48<24:14, 1.64it/s, loss=0.0057, lr=5.83e-06, step=7614] Training: 76%|███████▌ | 7615/10000 [1:35:48<24:14, 1.64it/s, loss=0.0026, lr=5.82e-06, step=7615] Training: 76%|███████▌ | 7616/10000 [1:35:48<22:54, 1.73it/s, loss=0.0026, lr=5.82e-06, step=7615] Training: 76%|███████▌ | 7616/10000 [1:35:48<22:54, 1.73it/s, loss=0.0058, lr=5.82e-06, step=7616] Training: 76%|███████▌ | 7617/10000 [1:35:49<24:10, 1.64it/s, loss=0.0058, lr=5.82e-06, step=7616] Training: 76%|███████▌ | 7617/10000 [1:35:49<24:10, 1.64it/s, loss=0.0061, lr=5.82e-06, step=7617] Training: 76%|███████▌ | 7618/10000 [1:35:50<26:20, 1.51it/s, loss=0.0061, lr=5.82e-06, step=7617] Training: 76%|███████▌ | 7618/10000 [1:35:50<26:20, 1.51it/s, loss=0.0068, lr=5.82e-06, step=7618] Training: 76%|███████▌ | 7619/10000 [1:35:50<24:29, 1.62it/s, loss=0.0068, lr=5.82e-06, step=7618] Training: 76%|███████▌ | 7619/10000 [1:35:50<24:29, 1.62it/s, loss=0.0286, lr=5.81e-06, step=7619]17:41:57.944 [I] step=7620 loss=0.0022 smoothed_loss=0.0083 lr=5.82e-06 grad_norm=0.4177 step_time=0.5324s data_time=0.0727s it/s=1.653 eta_to_10000=1440.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0283 grad_action_out_proj=0.1576 grad_shared_expert=0.4428 (10775:train_pytorch.py:850) + Training: 76%|███████▌ | 7620/10000 [1:35:51<23:34, 1.68it/s, loss=0.0286, lr=5.81e-06, step=7619] Training: 76%|███████▌ | 7620/10000 [1:35:51<23:34, 1.68it/s, loss=0.0022, lr=5.81e-06, step=7620] Training: 76%|███████▌ | 7621/10000 [1:35:51<22:13, 1.78it/s, loss=0.0022, lr=5.81e-06, step=7620] Training: 76%|███████▌ | 7621/10000 [1:35:51<22:13, 1.78it/s, loss=0.0124, lr=5.81e-06, step=7621] Training: 76%|███████▌ | 7622/10000 [1:35:52<21:45, 1.82it/s, loss=0.0124, lr=5.81e-06, step=7621] Training: 76%|███████▌ | 7622/10000 [1:35:52<21:45, 1.82it/s, loss=0.0054, lr=5.81e-06, step=7622] Training: 76%|███████▌ | 7623/10000 [1:35:53<21:13, 1.87it/s, loss=0.0054, lr=5.81e-06, step=7622] Training: 76%|███████▌ | 7623/10000 [1:35:53<21:13, 1.87it/s, loss=0.0071, lr=5.80e-06, step=7623] Training: 76%|███████▌ | 7624/10000 [1:35:53<24:55, 1.59it/s, loss=0.0071, lr=5.80e-06, step=7623] Training: 76%|███████▌ | 7624/10000 [1:35:53<24:55, 1.59it/s, loss=0.0056, lr=5.80e-06, step=7624] Training: 76%|███████▋ | 7625/10000 [1:35:54<26:35, 1.49it/s, loss=0.0056, lr=5.80e-06, step=7624] Training: 76%|███████▋ | 7625/10000 [1:35:54<26:35, 1.49it/s, loss=0.0097, lr=5.80e-06, step=7625] Training: 76%|███████▋ | 7626/10000 [1:35:55<25:22, 1.56it/s, loss=0.0097, lr=5.80e-06, step=7625] Training: 76%|███████▋ | 7626/10000 [1:35:55<25:22, 1.56it/s, loss=0.0059, lr=5.80e-06, step=7626] Training: 76%|███████▋ | 7627/10000 [1:35:55<24:14, 1.63it/s, loss=0.0059, lr=5.80e-06, step=7626] Training: 76%|███████▋ | 7627/10000 [1:35:55<24:14, 1.63it/s, loss=0.0078, lr=5.79e-06, step=7627] Training: 76%|███████▋ | 7628/10000 [1:35:56<23:37, 1.67it/s, loss=0.0078, lr=5.79e-06, step=7627] Training: 76%|███████▋ | 7628/10000 [1:35:56<23:37, 1.67it/s, loss=0.0055, lr=5.79e-06, step=7628] Training: 76%|███████▋ | 7629/10000 [1:35:56<23:14, 1.70it/s, loss=0.0055, lr=5.79e-06, step=7628] Training: 76%|███████▋ | 7629/10000 [1:35:56<23:14, 1.70it/s, loss=0.0026, lr=5.79e-06, step=7629]17:42:03.860 [I] step=7630 loss=0.0146 smoothed_loss=0.0079 lr=5.80e-06 grad_norm=0.4106 step_time=0.5076s data_time=0.0840s it/s=1.691 eta_to_10000=1401.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0071 grad_action_out_proj=0.0808 grad_shared_expert=0.4166 (10775:train_pytorch.py:850) + Training: 76%|███████▋ | 7630/10000 [1:35:57<22:42, 1.74it/s, loss=0.0026, lr=5.79e-06, step=7629] Training: 76%|███████▋ | 7630/10000 [1:35:57<22:42, 1.74it/s, loss=0.0146, lr=5.78e-06, step=7630] Training: 76%|███████▋ | 7631/10000 [1:35:57<21:44, 1.82it/s, loss=0.0146, lr=5.78e-06, step=7630] Training: 76%|███████▋ | 7631/10000 [1:35:57<21:44, 1.82it/s, loss=0.0010, lr=5.78e-06, step=7631] Training: 76%|███████▋ | 7632/10000 [1:35:58<24:49, 1.59it/s, loss=0.0010, lr=5.78e-06, step=7631] Training: 76%|███████▋ | 7632/10000 [1:35:58<24:49, 1.59it/s, loss=0.0053, lr=5.78e-06, step=7632] Training: 76%|███████▋ | 7633/10000 [1:35:59<24:25, 1.62it/s, loss=0.0053, lr=5.78e-06, step=7632] Training: 76%|███████▋ | 7633/10000 [1:35:59<24:25, 1.62it/s, loss=0.0133, lr=5.78e-06, step=7633] Training: 76%|███████▋ | 7634/10000 [1:36:00<26:55, 1.46it/s, loss=0.0133, lr=5.78e-06, step=7633] Training: 76%|███████▋ | 7634/10000 [1:36:00<26:55, 1.46it/s, loss=0.0046, lr=5.77e-06, step=7634] Training: 76%|███████▋ | 7635/10000 [1:36:00<25:06, 1.57it/s, loss=0.0046, lr=5.77e-06, step=7634] Training: 76%|███████▋ | 7635/10000 [1:36:00<25:06, 1.57it/s, loss=0.0233, lr=5.77e-06, step=7635] Training: 76%|███████▋ | 7636/10000 [1:36:01<24:18, 1.62it/s, loss=0.0233, lr=5.77e-06, step=7635] Training: 76%|███████▋ | 7636/10000 [1:36:01<24:18, 1.62it/s, loss=0.0079, lr=5.77e-06, step=7636] Training: 76%|███████▋ | 7637/10000 [1:36:01<24:05, 1.63it/s, loss=0.0079, lr=5.77e-06, step=7636] Training: 76%|███████▋ | 7637/10000 [1:36:01<24:05, 1.63it/s, loss=0.0224, lr=5.77e-06, step=7637] Training: 76%|███████▋ | 7638/10000 [1:36:02<23:54, 1.65it/s, loss=0.0224, lr=5.77e-06, step=7637] Training: 76%|███████▋ | 7638/10000 [1:36:02<23:54, 1.65it/s, loss=0.0018, lr=5.76e-06, step=7638] Training: 76%|███████▋ | 7639/10000 [1:36:03<24:19, 1.62it/s, loss=0.0018, lr=5.76e-06, step=7638] Training: 76%|███████▋ | 7639/10000 [1:36:03<24:19, 1.62it/s, loss=0.0095, lr=5.76e-06, step=7639]17:42:10.459 [I] step=7640 loss=0.0035 smoothed_loss=0.0088 lr=5.77e-06 grad_norm=0.4251 step_time=0.5437s data_time=0.1162s it/s=1.516 eta_to_10000=1557.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0180 grad_action_out_proj=0.1541 grad_shared_expert=0.4093 (10775:train_pytorch.py:850) + Training: 76%|███████▋ | 7640/10000 [1:36:04<27:56, 1.41it/s, loss=0.0095, lr=5.76e-06, step=7639] Training: 76%|███████▋ | 7640/10000 [1:36:04<27:56, 1.41it/s, loss=0.0035, lr=5.76e-06, step=7640] Training: 76%|███████▋ | 7641/10000 [1:36:04<25:43, 1.53it/s, loss=0.0035, lr=5.76e-06, step=7640] Training: 76%|███████▋ | 7641/10000 [1:36:04<25:43, 1.53it/s, loss=0.0034, lr=5.76e-06, step=7641] Training: 76%|███████▋ | 7642/10000 [1:36:05<27:16, 1.44it/s, loss=0.0034, lr=5.76e-06, step=7641] Training: 76%|███████▋ | 7642/10000 [1:36:05<27:16, 1.44it/s, loss=0.0045, lr=5.75e-06, step=7642] Training: 76%|███████▋ | 7643/10000 [1:36:05<25:18, 1.55it/s, loss=0.0045, lr=5.75e-06, step=7642] Training: 76%|███████▋ | 7643/10000 [1:36:05<25:18, 1.55it/s, loss=0.0068, lr=5.75e-06, step=7643] Training: 76%|███████▋ | 7644/10000 [1:36:06<23:54, 1.64it/s, loss=0.0068, lr=5.75e-06, step=7643] Training: 76%|███████▋ | 7644/10000 [1:36:06<23:54, 1.64it/s, loss=0.0080, lr=5.75e-06, step=7644] Training: 76%|███████▋ | 7645/10000 [1:36:06<22:26, 1.75it/s, loss=0.0080, lr=5.75e-06, step=7644] Training: 76%|███████▋ | 7645/10000 [1:36:06<22:26, 1.75it/s, loss=0.0025, lr=5.75e-06, step=7645] Training: 76%|███████▋ | 7646/10000 [1:36:07<21:33, 1.82it/s, loss=0.0025, lr=5.75e-06, step=7645] Training: 76%|███████▋ | 7646/10000 [1:36:07<21:33, 1.82it/s, loss=0.0073, lr=5.74e-06, step=7646] Training: 76%|███████▋ | 7647/10000 [1:36:08<26:56, 1.46it/s, loss=0.0073, lr=5.74e-06, step=7646] Training: 76%|███████▋ | 7647/10000 [1:36:08<26:56, 1.46it/s, loss=0.0026, lr=5.74e-06, step=7647] Training: 76%|███████▋ | 7648/10000 [1:36:08<25:01, 1.57it/s, loss=0.0026, lr=5.74e-06, step=7647] Training: 76%|███████▋ | 7648/10000 [1:36:08<25:01, 1.57it/s, loss=0.0011, lr=5.74e-06, step=7648] Training: 76%|███████▋ | 7649/10000 [1:36:09<24:54, 1.57it/s, loss=0.0011, lr=5.74e-06, step=7648] Training: 76%|███████▋ | 7649/10000 [1:36:09<24:54, 1.57it/s, loss=0.0072, lr=5.73e-06, step=7649]17:42:16.783 [I] step=7650 loss=0.0019 smoothed_loss=0.0059 lr=5.74e-06 grad_norm=0.3544 step_time=0.5240s data_time=0.1085s it/s=1.582 eta_to_10000=1485.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0044 grad_action_out_proj=0.0591 grad_shared_expert=0.3781 (10775:train_pytorch.py:850) + Training: 76%|███████▋ | 7650/10000 [1:36:10<27:00, 1.45it/s, loss=0.0072, lr=5.73e-06, step=7649] Training: 76%|███████▋ | 7650/10000 [1:36:10<27:00, 1.45it/s, loss=0.0019, lr=5.73e-06, step=7650] Training: 77%|███████▋ | 7651/10000 [1:36:11<26:33, 1.47it/s, loss=0.0019, lr=5.73e-06, step=7650] Training: 77%|███████▋ | 7651/10000 [1:36:11<26:33, 1.47it/s, loss=0.0164, lr=5.73e-06, step=7651] Training: 77%|███████▋ | 7652/10000 [1:36:11<27:02, 1.45it/s, loss=0.0164, lr=5.73e-06, step=7651] Training: 77%|███████▋ | 7652/10000 [1:36:11<27:02, 1.45it/s, loss=0.0148, lr=5.73e-06, step=7652] Training: 77%|███████▋ | 7653/10000 [1:36:12<25:51, 1.51it/s, loss=0.0148, lr=5.73e-06, step=7652] Training: 77%|███████▋ | 7653/10000 [1:36:12<25:51, 1.51it/s, loss=0.0019, lr=5.72e-06, step=7653] Training: 77%|███████▋ | 7654/10000 [1:36:13<28:43, 1.36it/s, loss=0.0019, lr=5.72e-06, step=7653] Training: 77%|███████▋ | 7654/10000 [1:36:13<28:43, 1.36it/s, loss=0.0013, lr=5.72e-06, step=7654] Training: 77%|███████▋ | 7655/10000 [1:36:13<29:06, 1.34it/s, loss=0.0013, lr=5.72e-06, step=7654] Training: 77%|███████▋ | 7655/10000 [1:36:13<29:06, 1.34it/s, loss=0.0170, lr=5.72e-06, step=7655] Training: 77%|███████▋ | 7656/10000 [1:36:14<29:14, 1.34it/s, loss=0.0170, lr=5.72e-06, step=7655] Training: 77%|███████▋ | 7656/10000 [1:36:14<29:14, 1.34it/s, loss=0.0062, lr=5.72e-06, step=7656] Training: 77%|███████▋ | 7657/10000 [1:36:15<27:26, 1.42it/s, loss=0.0062, lr=5.72e-06, step=7656] Training: 77%|███████▋ | 7657/10000 [1:36:15<27:26, 1.42it/s, loss=0.0145, lr=5.71e-06, step=7657] Training: 77%|███████▋ | 7658/10000 [1:36:15<25:56, 1.50it/s, loss=0.0145, lr=5.71e-06, step=7657] Training: 77%|███████▋ | 7658/10000 [1:36:15<25:56, 1.50it/s, loss=0.0050, lr=5.71e-06, step=7658] Training: 77%|███████▋ | 7659/10000 [1:36:16<24:09, 1.62it/s, loss=0.0050, lr=5.71e-06, step=7658] Training: 77%|███████▋ | 7659/10000 [1:36:16<24:09, 1.62it/s, loss=0.0062, lr=5.71e-06, step=7659]17:42:23.488 [I] step=7660 loss=0.0045 smoothed_loss=0.0074 lr=5.72e-06 grad_norm=0.4114 step_time=0.5496s data_time=0.1208s it/s=1.492 eta_to_10000=1568.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.0729 grad_shared_expert=0.3384 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7660/10000 [1:36:17<24:11, 1.61it/s, loss=0.0062, lr=5.71e-06, step=7659] Training: 77%|███████▋ | 7660/10000 [1:36:17<24:11, 1.61it/s, loss=0.0045, lr=5.71e-06, step=7660] Training: 77%|███████▋ | 7661/10000 [1:36:17<25:12, 1.55it/s, loss=0.0045, lr=5.71e-06, step=7660] Training: 77%|███████▋ | 7661/10000 [1:36:17<25:12, 1.55it/s, loss=0.0030, lr=5.70e-06, step=7661] Training: 77%|███████▋ | 7662/10000 [1:36:19<32:41, 1.19it/s, loss=0.0030, lr=5.70e-06, step=7661] Training: 77%|███████▋ | 7662/10000 [1:36:19<32:41, 1.19it/s, loss=0.0139, lr=5.70e-06, step=7662] Training: 77%|███████▋ | 7663/10000 [1:36:19<31:14, 1.25it/s, loss=0.0139, lr=5.70e-06, step=7662] Training: 77%|███████▋ | 7663/10000 [1:36:19<31:14, 1.25it/s, loss=0.0035, lr=5.70e-06, step=7663] Training: 77%|███████▋ | 7664/10000 [1:36:20<32:18, 1.21it/s, loss=0.0035, lr=5.70e-06, step=7663] Training: 77%|███████▋ | 7664/10000 [1:36:20<32:18, 1.21it/s, loss=0.0090, lr=5.70e-06, step=7664] Training: 77%|███████▋ | 7665/10000 [1:36:21<29:59, 1.30it/s, loss=0.0090, lr=5.70e-06, step=7664] Training: 77%|███████▋ | 7665/10000 [1:36:21<29:59, 1.30it/s, loss=0.0166, lr=5.69e-06, step=7665] Training: 77%|███████▋ | 7666/10000 [1:36:21<28:12, 1.38it/s, loss=0.0166, lr=5.69e-06, step=7665] Training: 77%|███████▋ | 7666/10000 [1:36:21<28:12, 1.38it/s, loss=0.0047, lr=5.69e-06, step=7666] Training: 77%|███████▋ | 7667/10000 [1:36:22<26:32, 1.47it/s, loss=0.0047, lr=5.69e-06, step=7666] Training: 77%|███████▋ | 7667/10000 [1:36:22<26:32, 1.47it/s, loss=0.0012, lr=5.69e-06, step=7667] Training: 77%|███████▋ | 7668/10000 [1:36:23<28:04, 1.38it/s, loss=0.0012, lr=5.69e-06, step=7667] Training: 77%|███████▋ | 7668/10000 [1:36:23<28:04, 1.38it/s, loss=0.0052, lr=5.69e-06, step=7668] Training: 77%|███████▋ | 7669/10000 [1:36:24<29:52, 1.30it/s, loss=0.0052, lr=5.69e-06, step=7668] Training: 77%|███████▋ | 7669/10000 [1:36:24<29:52, 1.30it/s, loss=0.0040, lr=5.68e-06, step=7669]17:42:31.441 [I] step=7670 loss=0.0128 smoothed_loss=0.0074 lr=5.69e-06 grad_norm=0.4406 step_time=0.6453s data_time=0.1500s it/s=1.258 eta_to_10000=1852.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0111 grad_action_out_proj=0.0838 grad_shared_expert=0.3621 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7670/10000 [1:36:25<30:27, 1.28it/s, loss=0.0040, lr=5.68e-06, step=7669] Training: 77%|███████▋ | 7670/10000 [1:36:25<30:27, 1.28it/s, loss=0.0128, lr=5.68e-06, step=7670] Training: 77%|███████▋ | 7671/10000 [1:36:25<27:46, 1.40it/s, loss=0.0128, lr=5.68e-06, step=7670] Training: 77%|███████▋ | 7671/10000 [1:36:25<27:46, 1.40it/s, loss=0.0103, lr=5.68e-06, step=7671] Training: 77%|███████▋ | 7672/10000 [1:36:26<28:18, 1.37it/s, loss=0.0103, lr=5.68e-06, step=7671] Training: 77%|███████▋ | 7672/10000 [1:36:26<28:18, 1.37it/s, loss=0.0044, lr=5.67e-06, step=7672] Training: 77%|███████▋ | 7673/10000 [1:36:26<26:26, 1.47it/s, loss=0.0044, lr=5.67e-06, step=7672] Training: 77%|███████▋ | 7673/10000 [1:36:26<26:26, 1.47it/s, loss=0.0397, lr=5.67e-06, step=7673] Training: 77%|███████▋ | 7674/10000 [1:36:27<27:20, 1.42it/s, loss=0.0397, lr=5.67e-06, step=7673] Training: 77%|███████▋ | 7674/10000 [1:36:27<27:20, 1.42it/s, loss=0.0020, lr=5.67e-06, step=7674] Training: 77%|███████▋ | 7675/10000 [1:36:28<30:16, 1.28it/s, loss=0.0020, lr=5.67e-06, step=7674] Training: 77%|███████▋ | 7675/10000 [1:36:28<30:16, 1.28it/s, loss=0.0328, lr=5.67e-06, step=7675] Training: 77%|███████▋ | 7676/10000 [1:36:29<33:01, 1.17it/s, loss=0.0328, lr=5.67e-06, step=7675] Training: 77%|███████▋ | 7676/10000 [1:36:29<33:01, 1.17it/s, loss=0.0353, lr=5.66e-06, step=7676] Training: 77%|███████▋ | 7677/10000 [1:36:30<31:51, 1.22it/s, loss=0.0353, lr=5.66e-06, step=7676] Training: 77%|███████▋ | 7677/10000 [1:36:30<31:51, 1.22it/s, loss=0.0054, lr=5.66e-06, step=7677] Training: 77%|███████▋ | 7678/10000 [1:36:30<28:22, 1.36it/s, loss=0.0054, lr=5.66e-06, step=7677] Training: 77%|███████▋ | 7678/10000 [1:36:30<28:22, 1.36it/s, loss=0.0159, lr=5.66e-06, step=7678] Training: 77%|███████▋ | 7679/10000 [1:36:31<26:57, 1.44it/s, loss=0.0159, lr=5.66e-06, step=7678] Training: 77%|███████▋ | 7679/10000 [1:36:31<26:57, 1.44it/s, loss=0.0058, lr=5.66e-06, step=7679]17:42:38.704 [I] step=7680 loss=0.0088 smoothed_loss=0.0125 lr=5.67e-06 grad_norm=0.4383 step_time=0.5897s data_time=0.1365s it/s=1.377 eta_to_10000=1684.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0096 grad_action_out_proj=0.0796 grad_shared_expert=0.4381 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7680/10000 [1:36:32<27:32, 1.40it/s, loss=0.0058, lr=5.66e-06, step=7679] Training: 77%|███████▋ | 7680/10000 [1:36:32<27:32, 1.40it/s, loss=0.0088, lr=5.65e-06, step=7680] Training: 77%|███████▋ | 7681/10000 [1:36:33<32:26, 1.19it/s, loss=0.0088, lr=5.65e-06, step=7680] Training: 77%|███████▋ | 7681/10000 [1:36:33<32:26, 1.19it/s, loss=0.0067, lr=5.65e-06, step=7681] Training: 77%|███████▋ | 7682/10000 [1:36:34<34:03, 1.13it/s, loss=0.0067, lr=5.65e-06, step=7681] Training: 77%|███████▋ | 7682/10000 [1:36:34<34:03, 1.13it/s, loss=0.0029, lr=5.65e-06, step=7682] Training: 77%|███████▋ | 7683/10000 [1:36:35<37:53, 1.02it/s, loss=0.0029, lr=5.65e-06, step=7682] Training: 77%|███████▋ | 7683/10000 [1:36:35<37:53, 1.02it/s, loss=0.0007, lr=5.65e-06, step=7683] Training: 77%|███████▋ | 7684/10000 [1:36:36<33:38, 1.15it/s, loss=0.0007, lr=5.65e-06, step=7683] Training: 77%|███████▋ | 7684/10000 [1:36:36<33:38, 1.15it/s, loss=0.0091, lr=5.64e-06, step=7684] Training: 77%|███████▋ | 7685/10000 [1:36:36<32:14, 1.20it/s, loss=0.0091, lr=5.64e-06, step=7684] Training: 77%|███████▋ | 7685/10000 [1:36:36<32:14, 1.20it/s, loss=0.0057, lr=5.64e-06, step=7685] Training: 77%|███████▋ | 7686/10000 [1:36:37<30:02, 1.28it/s, loss=0.0057, lr=5.64e-06, step=7685] Training: 77%|███████▋ | 7686/10000 [1:36:37<30:02, 1.28it/s, loss=0.0014, lr=5.64e-06, step=7686] Training: 77%|███████▋ | 7687/10000 [1:36:38<31:58, 1.21it/s, loss=0.0014, lr=5.64e-06, step=7686] Training: 77%|███████▋ | 7687/10000 [1:36:38<31:58, 1.21it/s, loss=0.0143, lr=5.64e-06, step=7687] Training: 77%|███████▋ | 7688/10000 [1:36:39<33:25, 1.15it/s, loss=0.0143, lr=5.64e-06, step=7687] Training: 77%|███████▋ | 7688/10000 [1:36:39<33:25, 1.15it/s, loss=0.0096, lr=5.63e-06, step=7688] Training: 77%|███████▋ | 7689/10000 [1:36:40<31:54, 1.21it/s, loss=0.0096, lr=5.63e-06, step=7688] Training: 77%|███████▋ | 7689/10000 [1:36:40<31:54, 1.21it/s, loss=0.0180, lr=5.63e-06, step=7689]17:42:47.796 [I] step=7690 loss=0.0072 smoothed_loss=0.0099 lr=5.64e-06 grad_norm=0.4035 step_time=0.7228s data_time=0.1863s it/s=1.100 eta_to_10000=2099.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0150 grad_action_out_proj=0.1208 grad_shared_expert=0.4197 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7690/10000 [1:36:41<35:07, 1.10it/s, loss=0.0180, lr=5.63e-06, step=7689] Training: 77%|███████▋ | 7690/10000 [1:36:41<35:07, 1.10it/s, loss=0.0072, lr=5.63e-06, step=7690] Training: 77%|███████▋ | 7691/10000 [1:36:42<34:11, 1.13it/s, loss=0.0072, lr=5.63e-06, step=7690] Training: 77%|███████▋ | 7691/10000 [1:36:42<34:11, 1.13it/s, loss=0.0115, lr=5.63e-06, step=7691] Training: 77%|███████▋ | 7692/10000 [1:36:43<34:00, 1.13it/s, loss=0.0115, lr=5.63e-06, step=7691] Training: 77%|███████▋ | 7692/10000 [1:36:43<34:00, 1.13it/s, loss=0.0029, lr=5.62e-06, step=7692] Training: 77%|███████▋ | 7693/10000 [1:36:43<33:58, 1.13it/s, loss=0.0029, lr=5.62e-06, step=7692] Training: 77%|███████▋ | 7693/10000 [1:36:43<33:58, 1.13it/s, loss=0.0108, lr=5.62e-06, step=7693] Training: 77%|███████▋ | 7694/10000 [1:36:44<33:08, 1.16it/s, loss=0.0108, lr=5.62e-06, step=7693] Training: 77%|███████▋ | 7694/10000 [1:36:44<33:08, 1.16it/s, loss=0.0036, lr=5.62e-06, step=7694] Training: 77%|███████▋ | 7695/10000 [1:36:45<33:08, 1.16it/s, loss=0.0036, lr=5.62e-06, step=7694] Training: 77%|███████▋ | 7695/10000 [1:36:45<33:08, 1.16it/s, loss=0.0031, lr=5.62e-06, step=7695] Training: 77%|███████▋ | 7696/10000 [1:36:46<32:18, 1.19it/s, loss=0.0031, lr=5.62e-06, step=7695] Training: 77%|███████▋ | 7696/10000 [1:36:46<32:18, 1.19it/s, loss=0.0117, lr=5.61e-06, step=7696] Training: 77%|███████▋ | 7697/10000 [1:36:47<34:25, 1.11it/s, loss=0.0117, lr=5.61e-06, step=7696] Training: 77%|███████▋ | 7697/10000 [1:36:47<34:25, 1.11it/s, loss=0.0061, lr=5.61e-06, step=7697] Training: 77%|███████▋ | 7698/10000 [1:36:48<32:24, 1.18it/s, loss=0.0061, lr=5.61e-06, step=7697] Training: 77%|███████▋ | 7698/10000 [1:36:48<32:24, 1.18it/s, loss=0.0415, lr=5.61e-06, step=7698] Training: 77%|███████▋ | 7699/10000 [1:36:49<32:20, 1.19it/s, loss=0.0415, lr=5.61e-06, step=7698] Training: 77%|███████▋ | 7699/10000 [1:36:49<32:20, 1.19it/s, loss=0.0018, lr=5.61e-06, step=7699]17:42:56.362 [I] step=7700 loss=0.0126 smoothed_loss=0.0109 lr=5.61e-06 grad_norm=0.4054 step_time=0.6452s data_time=0.2114s it/s=1.168 eta_to_10000=1969.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0049 grad_action_out_proj=0.0674 grad_shared_expert=0.2376 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7700/10000 [1:36:49<33:11, 1.15it/s, loss=0.0018, lr=5.61e-06, step=7699] Training: 77%|███████▋ | 7700/10000 [1:36:49<33:11, 1.15it/s, loss=0.0126, lr=5.60e-06, step=7700] Training: 77%|███████▋ | 7701/10000 [1:36:50<32:48, 1.17it/s, loss=0.0126, lr=5.60e-06, step=7700] Training: 77%|███████▋ | 7701/10000 [1:36:50<32:48, 1.17it/s, loss=0.0061, lr=5.60e-06, step=7701] Training: 77%|███████▋ | 7702/10000 [1:36:51<32:16, 1.19it/s, loss=0.0061, lr=5.60e-06, step=7701] Training: 77%|███████▋ | 7702/10000 [1:36:51<32:16, 1.19it/s, loss=0.0048, lr=5.60e-06, step=7702] Training: 77%|███████▋ | 7703/10000 [1:36:52<28:09, 1.36it/s, loss=0.0048, lr=5.60e-06, step=7702] Training: 77%|███████▋ | 7703/10000 [1:36:52<28:09, 1.36it/s, loss=0.0107, lr=5.60e-06, step=7703] Training: 77%|███████▋ | 7704/10000 [1:36:52<28:45, 1.33it/s, loss=0.0107, lr=5.60e-06, step=7703] Training: 77%|███████▋ | 7704/10000 [1:36:52<28:45, 1.33it/s, loss=0.0018, lr=5.59e-06, step=7704] Training: 77%|███████▋ | 7705/10000 [1:36:53<27:49, 1.37it/s, loss=0.0018, lr=5.59e-06, step=7704] Training: 77%|███████▋ | 7705/10000 [1:36:53<27:49, 1.37it/s, loss=0.0161, lr=5.59e-06, step=7705] Training: 77%|███████▋ | 7706/10000 [1:36:54<27:15, 1.40it/s, loss=0.0161, lr=5.59e-06, step=7705] Training: 77%|███████▋ | 7706/10000 [1:36:54<27:15, 1.40it/s, loss=0.0043, lr=5.59e-06, step=7706] Training: 77%|███████▋ | 7707/10000 [1:36:54<24:54, 1.53it/s, loss=0.0043, lr=5.59e-06, step=7706] Training: 77%|███████▋ | 7707/10000 [1:36:54<24:54, 1.53it/s, loss=0.0090, lr=5.58e-06, step=7707] Training: 77%|███████▋ | 7708/10000 [1:36:55<23:11, 1.65it/s, loss=0.0090, lr=5.58e-06, step=7707] Training: 77%|███████▋ | 7708/10000 [1:36:55<23:11, 1.65it/s, loss=0.0069, lr=5.58e-06, step=7708] Training: 77%|███████▋ | 7709/10000 [1:36:56<25:26, 1.50it/s, loss=0.0069, lr=5.58e-06, step=7708] Training: 77%|███████▋ | 7709/10000 [1:36:56<25:26, 1.50it/s, loss=0.0032, lr=5.58e-06, step=7709]17:43:03.031 [I] step=7710 loss=0.0073 smoothed_loss=0.0083 lr=5.59e-06 grad_norm=0.4464 step_time=0.5577s data_time=0.1092s it/s=1.500 eta_to_10000=1527.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0558 grad_shared_expert=0.1890 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7710/10000 [1:36:56<24:29, 1.56it/s, loss=0.0032, lr=5.58e-06, step=7709] Training: 77%|███████▋ | 7710/10000 [1:36:56<24:29, 1.56it/s, loss=0.0073, lr=5.58e-06, step=7710] Training: 77%|███████▋ | 7711/10000 [1:36:57<27:33, 1.38it/s, loss=0.0073, lr=5.58e-06, step=7710] Training: 77%|███████▋ | 7711/10000 [1:36:57<27:33, 1.38it/s, loss=0.0083, lr=5.57e-06, step=7711] Training: 77%|███████▋ | 7712/10000 [1:36:58<29:31, 1.29it/s, loss=0.0083, lr=5.57e-06, step=7711] Training: 77%|███████▋ | 7712/10000 [1:36:58<29:31, 1.29it/s, loss=0.0082, lr=5.57e-06, step=7712] Training: 77%|███████▋ | 7713/10000 [1:36:59<28:47, 1.32it/s, loss=0.0082, lr=5.57e-06, step=7712] Training: 77%|███████▋ | 7713/10000 [1:36:59<28:47, 1.32it/s, loss=0.0104, lr=5.57e-06, step=7713] Training: 77%|███████▋ | 7714/10000 [1:36:59<27:31, 1.38it/s, loss=0.0104, lr=5.57e-06, step=7713] Training: 77%|███████▋ | 7714/10000 [1:36:59<27:31, 1.38it/s, loss=0.0243, lr=5.57e-06, step=7714] Training: 77%|███████▋ | 7715/10000 [1:37:00<26:58, 1.41it/s, loss=0.0243, lr=5.57e-06, step=7714] Training: 77%|███████▋ | 7715/10000 [1:37:00<26:58, 1.41it/s, loss=0.0073, lr=5.56e-06, step=7715] Training: 77%|███████▋ | 7716/10000 [1:37:00<24:39, 1.54it/s, loss=0.0073, lr=5.56e-06, step=7715] Training: 77%|███████▋ | 7716/10000 [1:37:00<24:39, 1.54it/s, loss=0.0077, lr=5.56e-06, step=7716] Training: 77%|███████▋ | 7717/10000 [1:37:01<23:02, 1.65it/s, loss=0.0077, lr=5.56e-06, step=7716] Training: 77%|███████▋ | 7717/10000 [1:37:01<23:02, 1.65it/s, loss=0.0034, lr=5.56e-06, step=7717] Training: 77%|███████▋ | 7718/10000 [1:37:02<25:35, 1.49it/s, loss=0.0034, lr=5.56e-06, step=7717] Training: 77%|███████▋ | 7718/10000 [1:37:02<25:35, 1.49it/s, loss=0.0069, lr=5.56e-06, step=7718] Training: 77%|███████▋ | 7719/10000 [1:37:03<26:55, 1.41it/s, loss=0.0069, lr=5.56e-06, step=7718] Training: 77%|███████▋ | 7719/10000 [1:37:03<26:55, 1.41it/s, loss=0.0159, lr=5.55e-06, step=7719]17:43:10.273 [I] step=7720 loss=0.0137 smoothed_loss=0.0099 lr=5.56e-06 grad_norm=0.4010 step_time=0.6138s data_time=0.1104s it/s=1.381 eta_to_10000=1650.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0124 grad_action_out_proj=0.0784 grad_shared_expert=0.3504 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7720/10000 [1:37:03<27:36, 1.38it/s, loss=0.0159, lr=5.55e-06, step=7719] Training: 77%|███████▋ | 7720/10000 [1:37:03<27:36, 1.38it/s, loss=0.0137, lr=5.55e-06, step=7720] Training: 77%|███████▋ | 7721/10000 [1:37:04<27:15, 1.39it/s, loss=0.0137, lr=5.55e-06, step=7720] Training: 77%|███████▋ | 7721/10000 [1:37:04<27:15, 1.39it/s, loss=0.0116, lr=5.55e-06, step=7721] Training: 77%|███████▋ | 7722/10000 [1:37:05<29:24, 1.29it/s, loss=0.0116, lr=5.55e-06, step=7721] Training: 77%|███████▋ | 7722/10000 [1:37:05<29:24, 1.29it/s, loss=0.0110, lr=5.55e-06, step=7722] Training: 77%|███████▋ | 7723/10000 [1:37:06<27:35, 1.38it/s, loss=0.0110, lr=5.55e-06, step=7722] Training: 77%|███████▋ | 7723/10000 [1:37:06<27:35, 1.38it/s, loss=0.0326, lr=5.54e-06, step=7723] Training: 77%|███████▋ | 7724/10000 [1:37:06<26:13, 1.45it/s, loss=0.0326, lr=5.54e-06, step=7723] Training: 77%|███████▋ | 7724/10000 [1:37:06<26:13, 1.45it/s, loss=0.0016, lr=5.54e-06, step=7724] Training: 77%|███████▋ | 7725/10000 [1:37:07<25:52, 1.47it/s, loss=0.0016, lr=5.54e-06, step=7724] Training: 77%|███████▋ | 7725/10000 [1:37:07<25:52, 1.47it/s, loss=0.0224, lr=5.54e-06, step=7725] Training: 77%|███████▋ | 7726/10000 [1:37:08<27:50, 1.36it/s, loss=0.0224, lr=5.54e-06, step=7725] Training: 77%|███████▋ | 7726/10000 [1:37:08<27:50, 1.36it/s, loss=0.0143, lr=5.54e-06, step=7726] Training: 77%|███████▋ | 7727/10000 [1:37:08<27:56, 1.36it/s, loss=0.0143, lr=5.54e-06, step=7726] Training: 77%|███████▋ | 7727/10000 [1:37:08<27:56, 1.36it/s, loss=0.0031, lr=5.53e-06, step=7727] Training: 77%|███████▋ | 7728/10000 [1:37:09<29:25, 1.29it/s, loss=0.0031, lr=5.53e-06, step=7727] Training: 77%|███████▋ | 7728/10000 [1:37:09<29:25, 1.29it/s, loss=0.0256, lr=5.53e-06, step=7728] Training: 77%|███████▋ | 7729/10000 [1:37:10<27:55, 1.36it/s, loss=0.0256, lr=5.53e-06, step=7728] Training: 77%|███████▋ | 7729/10000 [1:37:10<27:55, 1.36it/s, loss=0.0100, lr=5.53e-06, step=7729]17:43:17.453 [I] step=7730 loss=0.0065 smoothed_loss=0.0121 lr=5.54e-06 grad_norm=0.3954 step_time=0.5910s data_time=0.1268s it/s=1.393 eta_to_10000=1629.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0071 grad_action_out_proj=0.0858 grad_shared_expert=0.3243 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7730/10000 [1:37:11<26:03, 1.45it/s, loss=0.0100, lr=5.53e-06, step=7729] Training: 77%|███████▋ | 7730/10000 [1:37:11<26:03, 1.45it/s, loss=0.0065, lr=5.53e-06, step=7730] Training: 77%|███████▋ | 7731/10000 [1:37:11<24:48, 1.52it/s, loss=0.0065, lr=5.53e-06, step=7730] Training: 77%|███████▋ | 7731/10000 [1:37:11<24:48, 1.52it/s, loss=0.0021, lr=5.52e-06, step=7731] Training: 77%|███████▋ | 7732/10000 [1:37:12<27:16, 1.39it/s, loss=0.0021, lr=5.52e-06, step=7731] Training: 77%|███████▋ | 7732/10000 [1:37:12<27:16, 1.39it/s, loss=0.0025, lr=5.52e-06, step=7732] Training: 77%|███████▋ | 7733/10000 [1:37:13<28:51, 1.31it/s, loss=0.0025, lr=5.52e-06, step=7732] Training: 77%|███████▋ | 7733/10000 [1:37:13<28:51, 1.31it/s, loss=0.0022, lr=5.52e-06, step=7733] Training: 77%|███████▋ | 7734/10000 [1:37:13<26:39, 1.42it/s, loss=0.0022, lr=5.52e-06, step=7733] Training: 77%|███████▋ | 7734/10000 [1:37:13<26:39, 1.42it/s, loss=0.0108, lr=5.52e-06, step=7734] Training: 77%|███████▋ | 7735/10000 [1:37:14<26:00, 1.45it/s, loss=0.0108, lr=5.52e-06, step=7734] Training: 77%|███████▋ | 7735/10000 [1:37:14<26:00, 1.45it/s, loss=0.0197, lr=5.51e-06, step=7735] Training: 77%|███████▋ | 7736/10000 [1:37:15<24:02, 1.57it/s, loss=0.0197, lr=5.51e-06, step=7735] Training: 77%|███████▋ | 7736/10000 [1:37:15<24:02, 1.57it/s, loss=0.0111, lr=5.51e-06, step=7736] Training: 77%|███████▋ | 7737/10000 [1:37:15<23:12, 1.62it/s, loss=0.0111, lr=5.51e-06, step=7736] Training: 77%|███████▋ | 7737/10000 [1:37:15<23:12, 1.62it/s, loss=0.0020, lr=5.51e-06, step=7737] Training: 77%|███████▋ | 7738/10000 [1:37:16<24:57, 1.51it/s, loss=0.0020, lr=5.51e-06, step=7737] Training: 77%|███████▋ | 7738/10000 [1:37:16<24:57, 1.51it/s, loss=0.0155, lr=5.51e-06, step=7738] Training: 77%|███████▋ | 7739/10000 [1:37:16<24:02, 1.57it/s, loss=0.0155, lr=5.51e-06, step=7738] Training: 77%|███████▋ | 7739/10000 [1:37:16<24:02, 1.57it/s, loss=0.0038, lr=5.50e-06, step=7739]17:43:24.282 [I] step=7740 loss=0.0033 smoothed_loss=0.0091 lr=5.51e-06 grad_norm=0.3501 step_time=0.5794s data_time=0.1035s it/s=1.465 eta_to_10000=1543.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.0796 grad_shared_expert=0.3764 (10775:train_pytorch.py:850) + Training: 77%|███████▋ | 7740/10000 [1:37:17<26:31, 1.42it/s, loss=0.0038, lr=5.50e-06, step=7739] Training: 77%|███████▋ | 7740/10000 [1:37:17<26:31, 1.42it/s, loss=0.0033, lr=5.50e-06, step=7740] Training: 77%|███████▋ | 7741/10000 [1:37:18<24:47, 1.52it/s, loss=0.0033, lr=5.50e-06, step=7740] Training: 77%|███████▋ | 7741/10000 [1:37:18<24:47, 1.52it/s, loss=0.0032, lr=5.50e-06, step=7741] Training: 77%|███████▋ | 7742/10000 [1:37:19<25:50, 1.46it/s, loss=0.0032, lr=5.50e-06, step=7741] Training: 77%|███████▋ | 7742/10000 [1:37:19<25:50, 1.46it/s, loss=0.0033, lr=5.50e-06, step=7742] Training: 77%|███████▋ | 7743/10000 [1:37:19<23:52, 1.58it/s, loss=0.0033, lr=5.50e-06, step=7742] Training: 77%|███████▋ | 7743/10000 [1:37:19<23:52, 1.58it/s, loss=0.0014, lr=5.49e-06, step=7743] Training: 77%|███████▋ | 7744/10000 [1:37:20<22:39, 1.66it/s, loss=0.0014, lr=5.49e-06, step=7743] Training: 77%|███████▋ | 7744/10000 [1:37:20<22:39, 1.66it/s, loss=0.0116, lr=5.49e-06, step=7744] Training: 77%|███████▋ | 7745/10000 [1:37:20<22:27, 1.67it/s, loss=0.0116, lr=5.49e-06, step=7744] Training: 77%|███████▋ | 7745/10000 [1:37:20<22:27, 1.67it/s, loss=0.0018, lr=5.49e-06, step=7745] Training: 77%|███████▋ | 7746/10000 [1:37:21<21:25, 1.75it/s, loss=0.0018, lr=5.49e-06, step=7745] Training: 77%|███████▋ | 7746/10000 [1:37:21<21:25, 1.75it/s, loss=0.0064, lr=5.49e-06, step=7746] Training: 77%|███████▋ | 7747/10000 [1:37:22<26:55, 1.39it/s, loss=0.0064, lr=5.49e-06, step=7746] Training: 77%|███████▋ | 7747/10000 [1:37:22<26:55, 1.39it/s, loss=0.0011, lr=5.48e-06, step=7747] Training: 77%|███████▋ | 7748/10000 [1:37:22<25:41, 1.46it/s, loss=0.0011, lr=5.48e-06, step=7747] Training: 77%|███████▋ | 7748/10000 [1:37:22<25:41, 1.46it/s, loss=0.0295, lr=5.48e-06, step=7748] Training: 77%|███████▋ | 7749/10000 [1:37:23<26:44, 1.40it/s, loss=0.0295, lr=5.48e-06, step=7748] Training: 77%|███████▋ | 7749/10000 [1:37:23<26:44, 1.40it/s, loss=0.0067, lr=5.48e-06, step=7749]17:43:30.982 [I] step=7750 loss=0.0059 smoothed_loss=0.0083 lr=5.49e-06 grad_norm=0.4090 step_time=0.5737s data_time=0.0962s it/s=1.493 eta_to_10000=1507.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0062 grad_action_out_proj=0.0662 grad_shared_expert=0.2217 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7750/10000 [1:37:24<27:53, 1.34it/s, loss=0.0067, lr=5.48e-06, step=7749] Training: 78%|███████▊ | 7750/10000 [1:37:24<27:53, 1.34it/s, loss=0.0059, lr=5.48e-06, step=7750] Training: 78%|███████▊ | 7751/10000 [1:37:25<25:11, 1.49it/s, loss=0.0059, lr=5.48e-06, step=7750] Training: 78%|███████▊ | 7751/10000 [1:37:25<25:11, 1.49it/s, loss=0.0037, lr=5.47e-06, step=7751] Training: 78%|███████▊ | 7752/10000 [1:37:25<23:30, 1.59it/s, loss=0.0037, lr=5.47e-06, step=7751] Training: 78%|███████▊ | 7752/10000 [1:37:25<23:30, 1.59it/s, loss=0.0265, lr=5.47e-06, step=7752] Training: 78%|███████▊ | 7753/10000 [1:37:26<23:12, 1.61it/s, loss=0.0265, lr=5.47e-06, step=7752] Training: 78%|███████▊ | 7753/10000 [1:37:26<23:12, 1.61it/s, loss=0.0146, lr=5.47e-06, step=7753] Training: 78%|███████▊ | 7754/10000 [1:37:27<25:35, 1.46it/s, loss=0.0146, lr=5.47e-06, step=7753] Training: 78%|███████▊ | 7754/10000 [1:37:27<25:35, 1.46it/s, loss=0.0067, lr=5.47e-06, step=7754] Training: 78%|███████▊ | 7755/10000 [1:37:27<24:05, 1.55it/s, loss=0.0067, lr=5.47e-06, step=7754] Training: 78%|███████▊ | 7755/10000 [1:37:27<24:05, 1.55it/s, loss=0.0022, lr=5.46e-06, step=7755] Training: 78%|███████▊ | 7756/10000 [1:37:28<26:01, 1.44it/s, loss=0.0022, lr=5.46e-06, step=7755] Training: 78%|███████▊ | 7756/10000 [1:37:28<26:01, 1.44it/s, loss=0.0030, lr=5.46e-06, step=7756] Training: 78%|███████▊ | 7757/10000 [1:37:28<24:48, 1.51it/s, loss=0.0030, lr=5.46e-06, step=7756] Training: 78%|███████▊ | 7757/10000 [1:37:28<24:48, 1.51it/s, loss=0.0099, lr=5.46e-06, step=7757] Training: 78%|███████▊ | 7758/10000 [1:37:29<25:58, 1.44it/s, loss=0.0099, lr=5.46e-06, step=7757] Training: 78%|███████▊ | 7758/10000 [1:37:29<25:58, 1.44it/s, loss=0.0397, lr=5.46e-06, step=7758] Training: 78%|███████▊ | 7759/10000 [1:37:30<23:40, 1.58it/s, loss=0.0397, lr=5.46e-06, step=7758] Training: 78%|███████▊ | 7759/10000 [1:37:30<23:40, 1.58it/s, loss=0.0086, lr=5.45e-06, step=7759]17:43:37.256 [I] step=7760 loss=0.0164 smoothed_loss=0.0119 lr=5.46e-06 grad_norm=0.4655 step_time=0.5316s data_time=0.0958s it/s=1.594 eta_to_10000=1405.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0260 grad_action_out_proj=0.1267 grad_shared_expert=0.7992 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7760/10000 [1:37:30<23:13, 1.61it/s, loss=0.0086, lr=5.45e-06, step=7759] Training: 78%|███████▊ | 7760/10000 [1:37:30<23:13, 1.61it/s, loss=0.0164, lr=5.45e-06, step=7760] Training: 78%|███████▊ | 7761/10000 [1:37:31<25:31, 1.46it/s, loss=0.0164, lr=5.45e-06, step=7760] Training: 78%|███████▊ | 7761/10000 [1:37:31<25:31, 1.46it/s, loss=0.0016, lr=5.45e-06, step=7761] Training: 78%|███████▊ | 7762/10000 [1:37:32<23:57, 1.56it/s, loss=0.0016, lr=5.45e-06, step=7761] Training: 78%|███████▊ | 7762/10000 [1:37:32<23:57, 1.56it/s, loss=0.0051, lr=5.45e-06, step=7762] Training: 78%|███████▊ | 7763/10000 [1:37:33<26:14, 1.42it/s, loss=0.0051, lr=5.45e-06, step=7762] Training: 78%|███████▊ | 7763/10000 [1:37:33<26:14, 1.42it/s, loss=0.0034, lr=5.44e-06, step=7763] Training: 78%|███████▊ | 7764/10000 [1:37:33<25:19, 1.47it/s, loss=0.0034, lr=5.44e-06, step=7763] Training: 78%|███████▊ | 7764/10000 [1:37:33<25:19, 1.47it/s, loss=0.0258, lr=5.44e-06, step=7764] Training: 78%|███████▊ | 7765/10000 [1:37:34<23:32, 1.58it/s, loss=0.0258, lr=5.44e-06, step=7764] Training: 78%|███████▊ | 7765/10000 [1:37:34<23:32, 1.58it/s, loss=0.0039, lr=5.44e-06, step=7765] Training: 78%|███████▊ | 7766/10000 [1:37:35<25:36, 1.45it/s, loss=0.0039, lr=5.44e-06, step=7765] Training: 78%|███████▊ | 7766/10000 [1:37:35<25:36, 1.45it/s, loss=0.0068, lr=5.44e-06, step=7766] Training: 78%|███████▊ | 7767/10000 [1:37:35<23:51, 1.56it/s, loss=0.0068, lr=5.44e-06, step=7766] Training: 78%|███████▊ | 7767/10000 [1:37:35<23:51, 1.56it/s, loss=0.0062, lr=5.43e-06, step=7767] Training: 78%|███████▊ | 7768/10000 [1:37:36<22:42, 1.64it/s, loss=0.0062, lr=5.43e-06, step=7767] Training: 78%|███████▊ | 7768/10000 [1:37:36<22:42, 1.64it/s, loss=0.0099, lr=5.43e-06, step=7768] Training: 78%|███████▊ | 7769/10000 [1:37:36<24:56, 1.49it/s, loss=0.0099, lr=5.43e-06, step=7768] Training: 78%|███████▊ | 7769/10000 [1:37:36<24:56, 1.49it/s, loss=0.0049, lr=5.43e-06, step=7769]17:43:44.057 [I] step=7770 loss=0.0041 smoothed_loss=0.0087 lr=5.44e-06 grad_norm=0.3538 step_time=0.5717s data_time=0.1085s it/s=1.470 eta_to_10000=1516.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0054 grad_action_out_proj=0.0696 grad_shared_expert=0.2679 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7770/10000 [1:37:37<25:39, 1.45it/s, loss=0.0049, lr=5.43e-06, step=7769] Training: 78%|███████▊ | 7770/10000 [1:37:37<25:39, 1.45it/s, loss=0.0041, lr=5.43e-06, step=7770] Training: 78%|███████▊ | 7771/10000 [1:37:38<23:31, 1.58it/s, loss=0.0041, lr=5.43e-06, step=7770] Training: 78%|███████▊ | 7771/10000 [1:37:38<23:31, 1.58it/s, loss=0.0030, lr=5.42e-06, step=7771] Training: 78%|███████▊ | 7772/10000 [1:37:38<23:04, 1.61it/s, loss=0.0030, lr=5.42e-06, step=7771] Training: 78%|███████▊ | 7772/10000 [1:37:38<23:04, 1.61it/s, loss=0.0107, lr=5.42e-06, step=7772] Training: 78%|███████▊ | 7773/10000 [1:37:39<22:49, 1.63it/s, loss=0.0107, lr=5.42e-06, step=7772] Training: 78%|███████▊ | 7773/10000 [1:37:39<22:49, 1.63it/s, loss=0.0480, lr=5.42e-06, step=7773] Training: 78%|███████▊ | 7774/10000 [1:37:39<22:28, 1.65it/s, loss=0.0480, lr=5.42e-06, step=7773] Training: 78%|███████▊ | 7774/10000 [1:37:39<22:28, 1.65it/s, loss=0.0034, lr=5.42e-06, step=7774] Training: 78%|███████▊ | 7775/10000 [1:37:40<21:29, 1.73it/s, loss=0.0034, lr=5.42e-06, step=7774] Training: 78%|███████▊ | 7775/10000 [1:37:40<21:29, 1.73it/s, loss=0.0011, lr=5.41e-06, step=7775] Training: 78%|███████▊ | 7776/10000 [1:37:41<25:45, 1.44it/s, loss=0.0011, lr=5.41e-06, step=7775] Training: 78%|███████▊ | 7776/10000 [1:37:41<25:45, 1.44it/s, loss=0.0067, lr=5.41e-06, step=7776] Training: 78%|███████▊ | 7777/10000 [1:37:42<26:19, 1.41it/s, loss=0.0067, lr=5.41e-06, step=7776] Training: 78%|███████▊ | 7777/10000 [1:37:42<26:19, 1.41it/s, loss=0.0071, lr=5.41e-06, step=7777] Training: 78%|███████▊ | 7778/10000 [1:37:42<24:37, 1.50it/s, loss=0.0071, lr=5.41e-06, step=7777] Training: 78%|███████▊ | 7778/10000 [1:37:42<24:37, 1.50it/s, loss=0.0018, lr=5.41e-06, step=7778] Training: 78%|███████▊ | 7779/10000 [1:37:43<23:50, 1.55it/s, loss=0.0018, lr=5.41e-06, step=7778] Training: 78%|███████▊ | 7779/10000 [1:37:43<23:50, 1.55it/s, loss=0.0044, lr=5.40e-06, step=7779]17:43:50.356 [I] step=7780 loss=0.0058 smoothed_loss=0.0082 lr=5.41e-06 grad_norm=0.4690 step_time=0.5333s data_time=0.0965s it/s=1.588 eta_to_10000=1398.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0031 grad_action_out_proj=0.0490 grad_shared_expert=0.2257 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7780/10000 [1:37:43<23:45, 1.56it/s, loss=0.0044, lr=5.40e-06, step=7779] Training: 78%|███████▊ | 7780/10000 [1:37:43<23:45, 1.56it/s, loss=0.0058, lr=5.40e-06, step=7780] Training: 78%|███████▊ | 7781/10000 [1:37:44<22:51, 1.62it/s, loss=0.0058, lr=5.40e-06, step=7780] Training: 78%|███████▊ | 7781/10000 [1:37:44<22:51, 1.62it/s, loss=0.0029, lr=5.40e-06, step=7781] Training: 78%|███████▊ | 7782/10000 [1:37:45<22:57, 1.61it/s, loss=0.0029, lr=5.40e-06, step=7781] Training: 78%|███████▊ | 7782/10000 [1:37:45<22:57, 1.61it/s, loss=0.0049, lr=5.40e-06, step=7782] Training: 78%|███████▊ | 7783/10000 [1:37:45<25:10, 1.47it/s, loss=0.0049, lr=5.40e-06, step=7782] Training: 78%|███████▊ | 7783/10000 [1:37:45<25:10, 1.47it/s, loss=0.0007, lr=5.39e-06, step=7783] Training: 78%|███████▊ | 7784/10000 [1:37:46<26:34, 1.39it/s, loss=0.0007, lr=5.39e-06, step=7783] Training: 78%|███████▊ | 7784/10000 [1:37:46<26:34, 1.39it/s, loss=0.0018, lr=5.39e-06, step=7784] Training: 78%|███████▊ | 7785/10000 [1:37:47<26:57, 1.37it/s, loss=0.0018, lr=5.39e-06, step=7784] Training: 78%|███████▊ | 7785/10000 [1:37:47<26:57, 1.37it/s, loss=0.0031, lr=5.39e-06, step=7785] Training: 78%|███████▊ | 7786/10000 [1:37:48<26:27, 1.39it/s, loss=0.0031, lr=5.39e-06, step=7785] Training: 78%|███████▊ | 7786/10000 [1:37:48<26:27, 1.39it/s, loss=0.0029, lr=5.39e-06, step=7786] Training: 78%|███████▊ | 7787/10000 [1:37:48<24:11, 1.52it/s, loss=0.0029, lr=5.39e-06, step=7786] Training: 78%|███████▊ | 7787/10000 [1:37:48<24:11, 1.52it/s, loss=0.0032, lr=5.38e-06, step=7787] Training: 78%|███████▊ | 7788/10000 [1:37:49<24:52, 1.48it/s, loss=0.0032, lr=5.38e-06, step=7787] Training: 78%|███████▊ | 7788/10000 [1:37:49<24:52, 1.48it/s, loss=0.0227, lr=5.38e-06, step=7788] Training: 78%|███████▊ | 7789/10000 [1:37:49<23:34, 1.56it/s, loss=0.0227, lr=5.38e-06, step=7788] Training: 78%|███████▊ | 7789/10000 [1:37:49<23:34, 1.56it/s, loss=0.0025, lr=5.38e-06, step=7789]17:43:57.419 [I] step=7790 loss=0.0103 smoothed_loss=0.0070 lr=5.39e-06 grad_norm=0.4923 step_time=0.5980s data_time=0.1084s it/s=1.416 eta_to_10000=1560.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0249 grad_action_out_proj=0.1574 grad_shared_expert=0.5988 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7790/10000 [1:37:50<27:40, 1.33it/s, loss=0.0025, lr=5.38e-06, step=7789] Training: 78%|███████▊ | 7790/10000 [1:37:50<27:40, 1.33it/s, loss=0.0103, lr=5.38e-06, step=7790] Training: 78%|███████▊ | 7791/10000 [1:37:51<26:09, 1.41it/s, loss=0.0103, lr=5.38e-06, step=7790] Training: 78%|███████▊ | 7791/10000 [1:37:51<26:09, 1.41it/s, loss=0.0137, lr=5.37e-06, step=7791] Training: 78%|███████▊ | 7792/10000 [1:37:52<25:45, 1.43it/s, loss=0.0137, lr=5.37e-06, step=7791] Training: 78%|███████▊ | 7792/10000 [1:37:52<25:45, 1.43it/s, loss=0.0040, lr=5.37e-06, step=7792] Training: 78%|███████▊ | 7793/10000 [1:37:52<25:16, 1.46it/s, loss=0.0040, lr=5.37e-06, step=7792] Training: 78%|███████▊ | 7793/10000 [1:37:52<25:16, 1.46it/s, loss=0.0027, lr=5.37e-06, step=7793] Training: 78%|███████▊ | 7794/10000 [1:37:53<24:08, 1.52it/s, loss=0.0027, lr=5.37e-06, step=7793] Training: 78%|███████▊ | 7794/10000 [1:37:53<24:08, 1.52it/s, loss=0.0085, lr=5.37e-06, step=7794] Training: 78%|███████▊ | 7795/10000 [1:37:54<22:27, 1.64it/s, loss=0.0085, lr=5.37e-06, step=7794] Training: 78%|███████▊ | 7795/10000 [1:37:54<22:27, 1.64it/s, loss=0.0025, lr=5.36e-06, step=7795] Training: 78%|███████▊ | 7796/10000 [1:37:54<21:22, 1.72it/s, loss=0.0025, lr=5.36e-06, step=7795] Training: 78%|███████▊ | 7796/10000 [1:37:54<21:22, 1.72it/s, loss=0.0090, lr=5.36e-06, step=7796] Training: 78%|███████▊ | 7797/10000 [1:37:55<25:36, 1.43it/s, loss=0.0090, lr=5.36e-06, step=7796] Training: 78%|███████▊ | 7797/10000 [1:37:55<25:36, 1.43it/s, loss=0.0080, lr=5.36e-06, step=7797] Training: 78%|███████▊ | 7798/10000 [1:37:56<23:40, 1.55it/s, loss=0.0080, lr=5.36e-06, step=7797] Training: 78%|███████▊ | 7798/10000 [1:37:56<23:40, 1.55it/s, loss=0.0049, lr=5.36e-06, step=7798] Training: 78%|███████▊ | 7799/10000 [1:37:56<22:18, 1.64it/s, loss=0.0049, lr=5.36e-06, step=7798] Training: 78%|███████▊ | 7799/10000 [1:37:56<22:18, 1.64it/s, loss=0.0081, lr=5.35e-06, step=7799]17:44:03.732 [I] step=7800 loss=0.0054 smoothed_loss=0.0067 lr=5.36e-06 grad_norm=0.3543 step_time=0.5453s data_time=0.0859s it/s=1.584 eta_to_10000=1388.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0044 grad_action_out_proj=0.0574 grad_shared_expert=0.2591 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7800/10000 [1:37:57<23:50, 1.54it/s, loss=0.0081, lr=5.35e-06, step=7799] Training: 78%|███████▊ | 7800/10000 [1:37:57<23:50, 1.54it/s, loss=0.0054, lr=5.35e-06, step=7800] Training: 78%|███████▊ | 7801/10000 [1:37:57<23:20, 1.57it/s, loss=0.0054, lr=5.35e-06, step=7800] Training: 78%|███████▊ | 7801/10000 [1:37:57<23:20, 1.57it/s, loss=0.0030, lr=5.35e-06, step=7801] Training: 78%|███████▊ | 7802/10000 [1:37:58<22:03, 1.66it/s, loss=0.0030, lr=5.35e-06, step=7801] Training: 78%|███████▊ | 7802/10000 [1:37:58<22:03, 1.66it/s, loss=0.0188, lr=5.35e-06, step=7802] Training: 78%|███████▊ | 7803/10000 [1:37:58<21:27, 1.71it/s, loss=0.0188, lr=5.35e-06, step=7802] Training: 78%|███████▊ | 7803/10000 [1:37:58<21:27, 1.71it/s, loss=0.0098, lr=5.34e-06, step=7803] Training: 78%|███████▊ | 7804/10000 [1:38:00<27:00, 1.35it/s, loss=0.0098, lr=5.34e-06, step=7803] Training: 78%|███████▊ | 7804/10000 [1:38:00<27:00, 1.35it/s, loss=0.0918, lr=5.34e-06, step=7804] Training: 78%|███████▊ | 7805/10000 [1:38:00<25:34, 1.43it/s, loss=0.0918, lr=5.34e-06, step=7804] Training: 78%|███████▊ | 7805/10000 [1:38:00<25:34, 1.43it/s, loss=0.0092, lr=5.34e-06, step=7805] Training: 78%|███████▊ | 7806/10000 [1:38:01<24:30, 1.49it/s, loss=0.0092, lr=5.34e-06, step=7805] Training: 78%|███████▊ | 7806/10000 [1:38:01<24:30, 1.49it/s, loss=0.0018, lr=5.34e-06, step=7806] Training: 78%|███████▊ | 7807/10000 [1:38:02<25:21, 1.44it/s, loss=0.0018, lr=5.34e-06, step=7806] Training: 78%|███████▊ | 7807/10000 [1:38:02<25:21, 1.44it/s, loss=0.0067, lr=5.33e-06, step=7807] Training: 78%|███████▊ | 7808/10000 [1:38:02<25:57, 1.41it/s, loss=0.0067, lr=5.33e-06, step=7807] Training: 78%|███████▊ | 7808/10000 [1:38:02<25:57, 1.41it/s, loss=0.0191, lr=5.33e-06, step=7808] Training: 78%|███████▊ | 7809/10000 [1:38:03<23:34, 1.55it/s, loss=0.0191, lr=5.33e-06, step=7808] Training: 78%|███████▊ | 7809/10000 [1:38:03<23:34, 1.55it/s, loss=0.0054, lr=5.33e-06, step=7809]17:44:10.247 [I] step=7810 loss=0.0142 smoothed_loss=0.0132 lr=5.34e-06 grad_norm=0.4625 step_time=0.5375s data_time=0.1140s it/s=1.535 eta_to_10000=1426.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0157 grad_action_out_proj=0.1044 grad_shared_expert=0.4074 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7810/10000 [1:38:03<22:25, 1.63it/s, loss=0.0054, lr=5.33e-06, step=7809] Training: 78%|███████▊ | 7810/10000 [1:38:03<22:25, 1.63it/s, loss=0.0142, lr=5.33e-06, step=7810] Training: 78%|███████▊ | 7811/10000 [1:38:04<21:19, 1.71it/s, loss=0.0142, lr=5.33e-06, step=7810] Training: 78%|███████▊ | 7811/10000 [1:38:04<21:19, 1.71it/s, loss=0.0097, lr=5.32e-06, step=7811] Training: 78%|███████▊ | 7812/10000 [1:38:05<24:35, 1.48it/s, loss=0.0097, lr=5.32e-06, step=7811] Training: 78%|███████▊ | 7812/10000 [1:38:05<24:35, 1.48it/s, loss=0.0074, lr=5.32e-06, step=7812] Training: 78%|███████▊ | 7813/10000 [1:38:05<22:51, 1.59it/s, loss=0.0074, lr=5.32e-06, step=7812] Training: 78%|███████▊ | 7813/10000 [1:38:05<22:51, 1.59it/s, loss=0.0034, lr=5.32e-06, step=7813] Training: 78%|███████▊ | 7814/10000 [1:38:06<21:44, 1.68it/s, loss=0.0034, lr=5.32e-06, step=7813] Training: 78%|███████▊ | 7814/10000 [1:38:06<21:44, 1.68it/s, loss=0.0039, lr=5.32e-06, step=7814] Training: 78%|███████▊ | 7815/10000 [1:38:07<23:34, 1.54it/s, loss=0.0039, lr=5.32e-06, step=7814] Training: 78%|███████▊ | 7815/10000 [1:38:07<23:34, 1.54it/s, loss=0.0029, lr=5.31e-06, step=7815] Training: 78%|███████▊ | 7816/10000 [1:38:07<22:13, 1.64it/s, loss=0.0029, lr=5.31e-06, step=7815] Training: 78%|███████▊ | 7816/10000 [1:38:07<22:13, 1.64it/s, loss=0.0113, lr=5.31e-06, step=7816] Training: 78%|███████▊ | 7817/10000 [1:38:08<21:06, 1.72it/s, loss=0.0113, lr=5.31e-06, step=7816] Training: 78%|███████▊ | 7817/10000 [1:38:08<21:06, 1.72it/s, loss=0.0060, lr=5.31e-06, step=7817] Training: 78%|███████▊ | 7818/10000 [1:38:08<22:07, 1.64it/s, loss=0.0060, lr=5.31e-06, step=7817] Training: 78%|███████▊ | 7818/10000 [1:38:08<22:07, 1.64it/s, loss=0.0168, lr=5.31e-06, step=7818] Training: 78%|███████▊ | 7819/10000 [1:38:09<23:51, 1.52it/s, loss=0.0168, lr=5.31e-06, step=7818] Training: 78%|███████▊ | 7819/10000 [1:38:09<23:51, 1.52it/s, loss=0.0443, lr=5.30e-06, step=7819]17:44:16.483 [I] step=7820 loss=0.0034 smoothed_loss=0.0127 lr=5.31e-06 grad_norm=0.3906 step_time=0.5421s data_time=0.0815s it/s=1.604 eta_to_10000=1359.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0037 grad_action_out_proj=0.0425 grad_shared_expert=0.2025 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7820/10000 [1:38:10<22:43, 1.60it/s, loss=0.0443, lr=5.30e-06, step=7819] Training: 78%|███████▊ | 7820/10000 [1:38:10<22:43, 1.60it/s, loss=0.0034, lr=5.30e-06, step=7820] Training: 78%|███████▊ | 7821/10000 [1:38:10<21:13, 1.71it/s, loss=0.0034, lr=5.30e-06, step=7820] Training: 78%|███████▊ | 7821/10000 [1:38:10<21:13, 1.71it/s, loss=0.0088, lr=5.30e-06, step=7821] Training: 78%|███████▊ | 7822/10000 [1:38:11<22:18, 1.63it/s, loss=0.0088, lr=5.30e-06, step=7821] Training: 78%|███████▊ | 7822/10000 [1:38:11<22:18, 1.63it/s, loss=0.0042, lr=5.30e-06, step=7822] Training: 78%|███████▊ | 7823/10000 [1:38:11<21:13, 1.71it/s, loss=0.0042, lr=5.30e-06, step=7822] Training: 78%|███████▊ | 7823/10000 [1:38:11<21:13, 1.71it/s, loss=0.0041, lr=5.29e-06, step=7823] Training: 78%|███████▊ | 7824/10000 [1:38:12<20:25, 1.78it/s, loss=0.0041, lr=5.29e-06, step=7823] Training: 78%|███████▊ | 7824/10000 [1:38:12<20:25, 1.78it/s, loss=0.0126, lr=5.29e-06, step=7824] Training: 78%|███████▊ | 7825/10000 [1:38:12<22:09, 1.64it/s, loss=0.0126, lr=5.29e-06, step=7824] Training: 78%|███████▊ | 7825/10000 [1:38:12<22:09, 1.64it/s, loss=0.0194, lr=5.29e-06, step=7825] Training: 78%|███████▊ | 7826/10000 [1:38:13<23:48, 1.52it/s, loss=0.0194, lr=5.29e-06, step=7825] Training: 78%|███████▊ | 7826/10000 [1:38:13<23:48, 1.52it/s, loss=0.0187, lr=5.29e-06, step=7826] Training: 78%|███████▊ | 7827/10000 [1:38:14<22:28, 1.61it/s, loss=0.0187, lr=5.29e-06, step=7826] Training: 78%|███████▊ | 7827/10000 [1:38:14<22:28, 1.61it/s, loss=0.0123, lr=5.28e-06, step=7827] Training: 78%|███████▊ | 7828/10000 [1:38:14<21:38, 1.67it/s, loss=0.0123, lr=5.28e-06, step=7827] Training: 78%|███████▊ | 7828/10000 [1:38:14<21:38, 1.67it/s, loss=0.0048, lr=5.28e-06, step=7828] Training: 78%|███████▊ | 7829/10000 [1:38:15<23:02, 1.57it/s, loss=0.0048, lr=5.28e-06, step=7828] Training: 78%|███████▊ | 7829/10000 [1:38:15<23:02, 1.57it/s, loss=0.0051, lr=5.28e-06, step=7829]17:44:22.576 [I] step=7830 loss=0.0103 smoothed_loss=0.0110 lr=5.29e-06 grad_norm=0.4765 step_time=0.5214s data_time=0.0879s it/s=1.641 eta_to_10000=1322.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0314 grad_action_out_proj=0.1867 grad_shared_expert=0.6395 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7830/10000 [1:38:16<22:34, 1.60it/s, loss=0.0051, lr=5.28e-06, step=7829] Training: 78%|███████▊ | 7830/10000 [1:38:16<22:34, 1.60it/s, loss=0.0103, lr=5.28e-06, step=7830] Training: 78%|███████▊ | 7831/10000 [1:38:16<21:18, 1.70it/s, loss=0.0103, lr=5.28e-06, step=7830] Training: 78%|███████▊ | 7831/10000 [1:38:16<21:18, 1.70it/s, loss=0.0088, lr=5.27e-06, step=7831] Training: 78%|███████▊ | 7832/10000 [1:38:17<20:32, 1.76it/s, loss=0.0088, lr=5.27e-06, step=7831] Training: 78%|███████▊ | 7832/10000 [1:38:17<20:32, 1.76it/s, loss=0.0032, lr=5.27e-06, step=7832] Training: 78%|███████▊ | 7833/10000 [1:38:17<22:52, 1.58it/s, loss=0.0032, lr=5.27e-06, step=7832] Training: 78%|███████▊ | 7833/10000 [1:38:17<22:52, 1.58it/s, loss=0.0043, lr=5.27e-06, step=7833] Training: 78%|███████▊ | 7834/10000 [1:38:18<24:20, 1.48it/s, loss=0.0043, lr=5.27e-06, step=7833] Training: 78%|███████▊ | 7834/10000 [1:38:18<24:20, 1.48it/s, loss=0.0042, lr=5.27e-06, step=7834] Training: 78%|███████▊ | 7835/10000 [1:38:19<23:09, 1.56it/s, loss=0.0042, lr=5.27e-06, step=7834] Training: 78%|███████▊ | 7835/10000 [1:38:19<23:09, 1.56it/s, loss=0.0027, lr=5.26e-06, step=7835] Training: 78%|███████▊ | 7836/10000 [1:38:19<23:23, 1.54it/s, loss=0.0027, lr=5.26e-06, step=7835] Training: 78%|███████▊ | 7836/10000 [1:38:19<23:23, 1.54it/s, loss=0.0107, lr=5.26e-06, step=7836] Training: 78%|███████▊ | 7837/10000 [1:38:20<21:49, 1.65it/s, loss=0.0107, lr=5.26e-06, step=7836] Training: 78%|███████▊ | 7837/10000 [1:38:20<21:49, 1.65it/s, loss=0.0024, lr=5.26e-06, step=7837] Training: 78%|███████▊ | 7838/10000 [1:38:20<20:45, 1.74it/s, loss=0.0024, lr=5.26e-06, step=7837] Training: 78%|███████▊ | 7838/10000 [1:38:20<20:45, 1.74it/s, loss=0.0034, lr=5.26e-06, step=7838] Training: 78%|███████▊ | 7839/10000 [1:38:21<20:03, 1.80it/s, loss=0.0034, lr=5.26e-06, step=7838] Training: 78%|███████▊ | 7839/10000 [1:38:21<20:03, 1.80it/s, loss=0.0105, lr=5.25e-06, step=7839]17:44:28.466 [I] step=7840 loss=0.0077 smoothed_loss=0.0078 lr=5.26e-06 grad_norm=0.3303 step_time=0.5090s data_time=0.0800s it/s=1.698 eta_to_10000=1272.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0077 grad_action_out_proj=0.0676 grad_shared_expert=0.2029 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7840/10000 [1:38:22<20:00, 1.80it/s, loss=0.0105, lr=5.25e-06, step=7839] Training: 78%|███████▊ | 7840/10000 [1:38:22<20:00, 1.80it/s, loss=0.0077, lr=5.25e-06, step=7840] Training: 78%|███████▊ | 7841/10000 [1:38:22<23:31, 1.53it/s, loss=0.0077, lr=5.25e-06, step=7840] Training: 78%|███████▊ | 7841/10000 [1:38:22<23:31, 1.53it/s, loss=0.0221, lr=5.25e-06, step=7841] Training: 78%|███████▊ | 7842/10000 [1:38:23<21:49, 1.65it/s, loss=0.0221, lr=5.25e-06, step=7841] Training: 78%|███████▊ | 7842/10000 [1:38:23<21:49, 1.65it/s, loss=0.0515, lr=5.25e-06, step=7842] Training: 78%|███████▊ | 7843/10000 [1:38:24<22:26, 1.60it/s, loss=0.0515, lr=5.25e-06, step=7842] Training: 78%|███████▊ | 7843/10000 [1:38:24<22:26, 1.60it/s, loss=0.0018, lr=5.25e-06, step=7843] Training: 78%|███████▊ | 7844/10000 [1:38:24<21:08, 1.70it/s, loss=0.0018, lr=5.25e-06, step=7843] Training: 78%|███████▊ | 7844/10000 [1:38:24<21:08, 1.70it/s, loss=0.0059, lr=5.24e-06, step=7844] Training: 78%|███████▊ | 7845/10000 [1:38:25<20:24, 1.76it/s, loss=0.0059, lr=5.24e-06, step=7844] Training: 78%|███████▊ | 7845/10000 [1:38:25<20:24, 1.76it/s, loss=0.0037, lr=5.24e-06, step=7845] Training: 78%|███████▊ | 7846/10000 [1:38:25<20:02, 1.79it/s, loss=0.0037, lr=5.24e-06, step=7845] Training: 78%|███████▊ | 7846/10000 [1:38:25<20:02, 1.79it/s, loss=0.0037, lr=5.24e-06, step=7846] Training: 78%|███████▊ | 7847/10000 [1:38:26<24:50, 1.44it/s, loss=0.0037, lr=5.24e-06, step=7846] Training: 78%|███████▊ | 7847/10000 [1:38:26<24:50, 1.44it/s, loss=0.0114, lr=5.24e-06, step=7847] Training: 78%|███████▊ | 7848/10000 [1:38:27<22:55, 1.56it/s, loss=0.0114, lr=5.24e-06, step=7847] Training: 78%|███████▊ | 7848/10000 [1:38:27<22:55, 1.56it/s, loss=0.0018, lr=5.23e-06, step=7848] Training: 78%|███████▊ | 7849/10000 [1:38:27<23:43, 1.51it/s, loss=0.0018, lr=5.23e-06, step=7848] Training: 78%|███████▊ | 7849/10000 [1:38:27<23:43, 1.51it/s, loss=0.0011, lr=5.23e-06, step=7849]17:44:34.853 [I] step=7850 loss=0.0090 smoothed_loss=0.0086 lr=5.24e-06 grad_norm=0.5022 step_time=0.5524s data_time=0.0864s it/s=1.566 eta_to_10000=1373.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0454 grad_action_out_proj=0.1846 grad_shared_expert=0.5776 (10775:train_pytorch.py:850) + Training: 78%|███████▊ | 7850/10000 [1:38:28<22:28, 1.59it/s, loss=0.0011, lr=5.23e-06, step=7849] Training: 78%|███████▊ | 7850/10000 [1:38:28<22:28, 1.59it/s, loss=0.0090, lr=5.23e-06, step=7850] Training: 79%|███████▊ | 7851/10000 [1:38:28<21:07, 1.70it/s, loss=0.0090, lr=5.23e-06, step=7850] Training: 79%|███████▊ | 7851/10000 [1:38:28<21:07, 1.70it/s, loss=0.0321, lr=5.23e-06, step=7851] Training: 79%|███████▊ | 7852/10000 [1:38:29<20:40, 1.73it/s, loss=0.0321, lr=5.23e-06, step=7851] Training: 79%|███████▊ | 7852/10000 [1:38:29<20:40, 1.73it/s, loss=0.0129, lr=5.22e-06, step=7852] Training: 79%|███████▊ | 7853/10000 [1:38:30<20:15, 1.77it/s, loss=0.0129, lr=5.22e-06, step=7852] Training: 79%|███████▊ | 7853/10000 [1:38:30<20:15, 1.77it/s, loss=0.0092, lr=5.22e-06, step=7853] Training: 79%|███████▊ | 7854/10000 [1:38:30<19:53, 1.80it/s, loss=0.0092, lr=5.22e-06, step=7853] Training: 79%|███████▊ | 7854/10000 [1:38:30<19:53, 1.80it/s, loss=0.0059, lr=5.22e-06, step=7854] Training: 79%|███████▊ | 7855/10000 [1:38:31<22:38, 1.58it/s, loss=0.0059, lr=5.22e-06, step=7854] Training: 79%|███████▊ | 7855/10000 [1:38:31<22:38, 1.58it/s, loss=0.0031, lr=5.22e-06, step=7855] Training: 79%|███████▊ | 7856/10000 [1:38:32<23:26, 1.52it/s, loss=0.0031, lr=5.22e-06, step=7855] Training: 79%|███████▊ | 7856/10000 [1:38:32<23:26, 1.52it/s, loss=0.0040, lr=5.21e-06, step=7856] Training: 79%|███████▊ | 7857/10000 [1:38:32<21:55, 1.63it/s, loss=0.0040, lr=5.21e-06, step=7856] Training: 79%|███████▊ | 7857/10000 [1:38:32<21:55, 1.63it/s, loss=0.0106, lr=5.21e-06, step=7857] Training: 79%|███████▊ | 7858/10000 [1:38:33<21:40, 1.65it/s, loss=0.0106, lr=5.21e-06, step=7857] Training: 79%|███████▊ | 7858/10000 [1:38:33<21:40, 1.65it/s, loss=0.0099, lr=5.21e-06, step=7858] Training: 79%|███████▊ | 7859/10000 [1:38:33<20:43, 1.72it/s, loss=0.0099, lr=5.21e-06, step=7858] Training: 79%|███████▊ | 7859/10000 [1:38:33<20:43, 1.72it/s, loss=0.0073, lr=5.21e-06, step=7859]17:44:40.832 [I] step=7860 loss=0.0070 smoothed_loss=0.0089 lr=5.21e-06 grad_norm=0.4499 step_time=0.5072s data_time=0.0907s it/s=1.673 eta_to_10000=1279.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0189 grad_action_out_proj=0.1294 grad_shared_expert=0.4682 (10775:train_pytorch.py:850) + Training: 79%|███████▊ | 7860/10000 [1:38:34<22:04, 1.62it/s, loss=0.0073, lr=5.21e-06, step=7859] Training: 79%|███████▊ | 7860/10000 [1:38:34<22:04, 1.62it/s, loss=0.0070, lr=5.20e-06, step=7860] Training: 79%|███████▊ | 7861/10000 [1:38:34<21:20, 1.67it/s, loss=0.0070, lr=5.20e-06, step=7860] Training: 79%|███████▊ | 7861/10000 [1:38:34<21:20, 1.67it/s, loss=0.0043, lr=5.20e-06, step=7861] Training: 79%|███████▊ | 7862/10000 [1:38:35<24:45, 1.44it/s, loss=0.0043, lr=5.20e-06, step=7861] Training: 79%|███████▊ | 7862/10000 [1:38:35<24:45, 1.44it/s, loss=0.0015, lr=5.20e-06, step=7862] Training: 79%|███████▊ | 7863/10000 [1:38:36<22:32, 1.58it/s, loss=0.0015, lr=5.20e-06, step=7862] Training: 79%|███████▊ | 7863/10000 [1:38:36<22:32, 1.58it/s, loss=0.0097, lr=5.20e-06, step=7863] Training: 79%|███████▊ | 7864/10000 [1:38:37<22:48, 1.56it/s, loss=0.0097, lr=5.20e-06, step=7863] Training: 79%|███████▊ | 7864/10000 [1:38:37<22:48, 1.56it/s, loss=0.0124, lr=5.19e-06, step=7864] Training: 79%|███████▊ | 7865/10000 [1:38:37<21:19, 1.67it/s, loss=0.0124, lr=5.19e-06, step=7864] Training: 79%|███████▊ | 7865/10000 [1:38:37<21:19, 1.67it/s, loss=0.0108, lr=5.19e-06, step=7865] Training: 79%|███████▊ | 7866/10000 [1:38:38<21:06, 1.69it/s, loss=0.0108, lr=5.19e-06, step=7865] Training: 79%|███████▊ | 7866/10000 [1:38:38<21:06, 1.69it/s, loss=0.0124, lr=5.19e-06, step=7866] Training: 79%|███████▊ | 7867/10000 [1:38:38<20:29, 1.73it/s, loss=0.0124, lr=5.19e-06, step=7866] Training: 79%|███████▊ | 7867/10000 [1:38:38<20:29, 1.73it/s, loss=0.0143, lr=5.19e-06, step=7867] Training: 79%|███████▊ | 7868/10000 [1:38:39<20:04, 1.77it/s, loss=0.0143, lr=5.19e-06, step=7867] Training: 79%|███████▊ | 7868/10000 [1:38:39<20:04, 1.77it/s, loss=0.0087, lr=5.18e-06, step=7868] Training: 79%|███████▊ | 7869/10000 [1:38:40<23:23, 1.52it/s, loss=0.0087, lr=5.18e-06, step=7868] Training: 79%|███████▊ | 7869/10000 [1:38:40<23:23, 1.52it/s, loss=0.0100, lr=5.18e-06, step=7869]17:44:47.251 [I] step=7870 loss=0.0013 smoothed_loss=0.0087 lr=5.19e-06 grad_norm=0.4694 step_time=0.5533s data_time=0.0887s it/s=1.558 eta_to_10000=1367.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0093 grad_action_out_proj=0.1312 grad_shared_expert=0.3361 (10775:train_pytorch.py:850) + Training: 79%|███████▊ | 7870/10000 [1:38:40<24:32, 1.45it/s, loss=0.0100, lr=5.18e-06, step=7869] Training: 79%|███████▊ | 7870/10000 [1:38:40<24:32, 1.45it/s, loss=0.0013, lr=5.18e-06, step=7870] Training: 79%|███████▊ | 7871/10000 [1:38:41<24:33, 1.44it/s, loss=0.0013, lr=5.18e-06, step=7870] Training: 79%|███████▊ | 7871/10000 [1:38:41<24:33, 1.44it/s, loss=0.0234, lr=5.18e-06, step=7871] Training: 79%|███████▊ | 7872/10000 [1:38:42<22:28, 1.58it/s, loss=0.0234, lr=5.18e-06, step=7871] Training: 79%|███████▊ | 7872/10000 [1:38:42<22:28, 1.58it/s, loss=0.0059, lr=5.17e-06, step=7872] Training: 79%|███████▊ | 7873/10000 [1:38:42<20:54, 1.70it/s, loss=0.0059, lr=5.17e-06, step=7872] Training: 79%|███████▊ | 7873/10000 [1:38:42<20:54, 1.70it/s, loss=0.0017, lr=5.17e-06, step=7873] Training: 79%|███████▊ | 7874/10000 [1:38:43<20:51, 1.70it/s, loss=0.0017, lr=5.17e-06, step=7873] Training: 79%|███████▊ | 7874/10000 [1:38:43<20:51, 1.70it/s, loss=0.0069, lr=5.17e-06, step=7874] Training: 79%|███████▉ | 7875/10000 [1:38:43<21:24, 1.65it/s, loss=0.0069, lr=5.17e-06, step=7874] Training: 79%|███████▉ | 7875/10000 [1:38:43<21:24, 1.65it/s, loss=0.0043, lr=5.17e-06, step=7875] Training: 79%|███████▉ | 7876/10000 [1:38:44<24:43, 1.43it/s, loss=0.0043, lr=5.17e-06, step=7875] Training: 79%|███████▉ | 7876/10000 [1:38:44<24:43, 1.43it/s, loss=0.0053, lr=5.17e-06, step=7876] Training: 79%|███████▉ | 7877/10000 [1:38:45<23:10, 1.53it/s, loss=0.0053, lr=5.17e-06, step=7876] Training: 79%|███████▉ | 7877/10000 [1:38:45<23:10, 1.53it/s, loss=0.0075, lr=5.16e-06, step=7877] Training: 79%|███████▉ | 7878/10000 [1:38:45<21:42, 1.63it/s, loss=0.0075, lr=5.16e-06, step=7877] Training: 79%|███████▉ | 7878/10000 [1:38:45<21:42, 1.63it/s, loss=0.0066, lr=5.16e-06, step=7878] Training: 79%|███████▉ | 7879/10000 [1:38:46<22:25, 1.58it/s, loss=0.0066, lr=5.16e-06, step=7878] Training: 79%|███████▉ | 7879/10000 [1:38:46<22:25, 1.58it/s, loss=0.0020, lr=5.16e-06, step=7879]17:44:53.407 [I] step=7880 loss=0.0014 smoothed_loss=0.0066 lr=5.17e-06 grad_norm=0.4225 step_time=0.5287s data_time=0.0868s it/s=1.625 eta_to_10000=1304.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0048 grad_action_out_proj=0.0659 grad_shared_expert=0.3177 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7880/10000 [1:38:46<21:48, 1.62it/s, loss=0.0020, lr=5.16e-06, step=7879] Training: 79%|███████▉ | 7880/10000 [1:38:46<21:48, 1.62it/s, loss=0.0014, lr=5.16e-06, step=7880] Training: 79%|███████▉ | 7881/10000 [1:38:47<22:22, 1.58it/s, loss=0.0014, lr=5.16e-06, step=7880] Training: 79%|███████▉ | 7881/10000 [1:38:47<22:22, 1.58it/s, loss=0.0024, lr=5.15e-06, step=7881] Training: 79%|███████▉ | 7882/10000 [1:38:48<21:52, 1.61it/s, loss=0.0024, lr=5.15e-06, step=7881] Training: 79%|███████▉ | 7882/10000 [1:38:48<21:52, 1.61it/s, loss=0.0034, lr=5.15e-06, step=7882] Training: 79%|███████▉ | 7883/10000 [1:38:49<24:45, 1.43it/s, loss=0.0034, lr=5.15e-06, step=7882] Training: 79%|███████▉ | 7883/10000 [1:38:49<24:45, 1.43it/s, loss=0.0037, lr=5.15e-06, step=7883] Training: 79%|███████▉ | 7884/10000 [1:38:49<22:42, 1.55it/s, loss=0.0037, lr=5.15e-06, step=7883] Training: 79%|███████▉ | 7884/10000 [1:38:49<22:42, 1.55it/s, loss=0.0010, lr=5.15e-06, step=7884] Training: 79%|███████▉ | 7885/10000 [1:38:50<21:57, 1.60it/s, loss=0.0010, lr=5.15e-06, step=7884] Training: 79%|███████▉ | 7885/10000 [1:38:50<21:57, 1.60it/s, loss=0.0278, lr=5.14e-06, step=7885] Training: 79%|███████▉ | 7886/10000 [1:38:50<23:22, 1.51it/s, loss=0.0278, lr=5.14e-06, step=7885] Training: 79%|███████▉ | 7886/10000 [1:38:50<23:22, 1.51it/s, loss=0.0016, lr=5.14e-06, step=7886] Training: 79%|███████▉ | 7887/10000 [1:38:51<22:43, 1.55it/s, loss=0.0016, lr=5.14e-06, step=7886] Training: 79%|███████▉ | 7887/10000 [1:38:51<22:43, 1.55it/s, loss=0.0525, lr=5.14e-06, step=7887] Training: 79%|███████▉ | 7888/10000 [1:38:52<22:33, 1.56it/s, loss=0.0525, lr=5.14e-06, step=7887] Training: 79%|███████▉ | 7888/10000 [1:38:52<22:33, 1.56it/s, loss=0.0062, lr=5.14e-06, step=7888] Training: 79%|███████▉ | 7889/10000 [1:38:52<22:05, 1.59it/s, loss=0.0062, lr=5.14e-06, step=7888] Training: 79%|███████▉ | 7889/10000 [1:38:52<22:05, 1.59it/s, loss=0.0039, lr=5.13e-06, step=7889]17:45:00.123 [I] step=7890 loss=0.0059 smoothed_loss=0.0098 lr=5.14e-06 grad_norm=0.4845 step_time=0.5629s data_time=0.1086s it/s=1.489 eta_to_10000=1416.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0096 grad_action_out_proj=0.1090 grad_shared_expert=0.5183 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7890/10000 [1:38:53<24:51, 1.41it/s, loss=0.0039, lr=5.13e-06, step=7889] Training: 79%|███████▉ | 7890/10000 [1:38:53<24:51, 1.41it/s, loss=0.0059, lr=5.13e-06, step=7890] Training: 79%|███████▉ | 7891/10000 [1:38:54<22:51, 1.54it/s, loss=0.0059, lr=5.13e-06, step=7890] Training: 79%|███████▉ | 7891/10000 [1:38:54<22:51, 1.54it/s, loss=0.0118, lr=5.13e-06, step=7891] Training: 79%|███████▉ | 7892/10000 [1:38:54<22:49, 1.54it/s, loss=0.0118, lr=5.13e-06, step=7891] Training: 79%|███████▉ | 7892/10000 [1:38:54<22:49, 1.54it/s, loss=0.0010, lr=5.13e-06, step=7892] Training: 79%|███████▉ | 7893/10000 [1:38:55<23:18, 1.51it/s, loss=0.0010, lr=5.13e-06, step=7892] Training: 79%|███████▉ | 7893/10000 [1:38:55<23:18, 1.51it/s, loss=0.0137, lr=5.12e-06, step=7893] Training: 79%|███████▉ | 7894/10000 [1:38:56<21:44, 1.61it/s, loss=0.0137, lr=5.12e-06, step=7893] Training: 79%|███████▉ | 7894/10000 [1:38:56<21:44, 1.61it/s, loss=0.0037, lr=5.12e-06, step=7894] Training: 79%|███████▉ | 7895/10000 [1:38:56<20:47, 1.69it/s, loss=0.0037, lr=5.12e-06, step=7894] Training: 79%|███████▉ | 7895/10000 [1:38:56<20:47, 1.69it/s, loss=0.0030, lr=5.12e-06, step=7895] Training: 79%|███████▉ | 7896/10000 [1:38:57<22:05, 1.59it/s, loss=0.0030, lr=5.12e-06, step=7895] Training: 79%|███████▉ | 7896/10000 [1:38:57<22:05, 1.59it/s, loss=0.0120, lr=5.12e-06, step=7896] Training: 79%|███████▉ | 7897/10000 [1:38:57<21:57, 1.60it/s, loss=0.0120, lr=5.12e-06, step=7896] Training: 79%|███████▉ | 7897/10000 [1:38:57<21:57, 1.60it/s, loss=0.0056, lr=5.12e-06, step=7897] Training: 79%|███████▉ | 7898/10000 [1:38:58<26:32, 1.32it/s, loss=0.0056, lr=5.12e-06, step=7897] Training: 79%|███████▉ | 7898/10000 [1:38:58<26:32, 1.32it/s, loss=0.0115, lr=5.11e-06, step=7898] Training: 79%|███████▉ | 7899/10000 [1:38:59<24:48, 1.41it/s, loss=0.0115, lr=5.11e-06, step=7898] Training: 79%|███████▉ | 7899/10000 [1:38:59<24:48, 1.41it/s, loss=0.0053, lr=5.11e-06, step=7899]17:45:06.692 [I] step=7900 loss=0.0023 smoothed_loss=0.0078 lr=5.12e-06 grad_norm=0.3455 step_time=0.5614s data_time=0.0955s it/s=1.522 eta_to_10000=1379.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0071 grad_action_out_proj=0.0548 grad_shared_expert=0.1953 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7900/10000 [1:39:00<24:21, 1.44it/s, loss=0.0053, lr=5.11e-06, step=7899] Training: 79%|███████▉ | 7900/10000 [1:39:00<24:21, 1.44it/s, loss=0.0023, lr=5.11e-06, step=7900] Training: 79%|███████▉ | 7901/10000 [1:39:00<24:34, 1.42it/s, loss=0.0023, lr=5.11e-06, step=7900] Training: 79%|███████▉ | 7901/10000 [1:39:00<24:34, 1.42it/s, loss=0.0165, lr=5.11e-06, step=7901] Training: 79%|███████▉ | 7902/10000 [1:39:01<25:13, 1.39it/s, loss=0.0165, lr=5.11e-06, step=7901] Training: 79%|███████▉ | 7902/10000 [1:39:01<25:13, 1.39it/s, loss=0.0033, lr=5.10e-06, step=7902] Training: 79%|███████▉ | 7903/10000 [1:39:02<26:54, 1.30it/s, loss=0.0033, lr=5.10e-06, step=7902] Training: 79%|███████▉ | 7903/10000 [1:39:02<26:54, 1.30it/s, loss=0.0058, lr=5.10e-06, step=7903] Training: 79%|███████▉ | 7904/10000 [1:39:03<25:33, 1.37it/s, loss=0.0058, lr=5.10e-06, step=7903] Training: 79%|███████▉ | 7904/10000 [1:39:03<25:33, 1.37it/s, loss=0.0073, lr=5.10e-06, step=7904] Training: 79%|███████▉ | 7905/10000 [1:39:04<28:11, 1.24it/s, loss=0.0073, lr=5.10e-06, step=7904] Training: 79%|███████▉ | 7905/10000 [1:39:04<28:11, 1.24it/s, loss=0.0399, lr=5.10e-06, step=7905] Training: 79%|███████▉ | 7906/10000 [1:39:04<25:36, 1.36it/s, loss=0.0399, lr=5.10e-06, step=7905] Training: 79%|███████▉ | 7906/10000 [1:39:04<25:36, 1.36it/s, loss=0.0325, lr=5.09e-06, step=7906] Training: 79%|███████▉ | 7907/10000 [1:39:05<23:14, 1.50it/s, loss=0.0325, lr=5.09e-06, step=7906] Training: 79%|███████▉ | 7907/10000 [1:39:05<23:14, 1.50it/s, loss=0.0092, lr=5.09e-06, step=7907] Training: 79%|███████▉ | 7908/10000 [1:39:06<23:54, 1.46it/s, loss=0.0092, lr=5.09e-06, step=7907] Training: 79%|███████▉ | 7908/10000 [1:39:06<23:54, 1.46it/s, loss=0.0050, lr=5.09e-06, step=7908] Training: 79%|███████▉ | 7909/10000 [1:39:06<22:52, 1.52it/s, loss=0.0050, lr=5.09e-06, step=7908] Training: 79%|███████▉ | 7909/10000 [1:39:06<22:52, 1.52it/s, loss=0.0084, lr=5.09e-06, step=7909]17:45:13.926 [I] step=7910 loss=0.0040 smoothed_loss=0.0109 lr=5.09e-06 grad_norm=0.4321 step_time=0.5875s data_time=0.1359s it/s=1.383 eta_to_10000=1511.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0036 grad_action_out_proj=0.0516 grad_shared_expert=0.2738 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7910/10000 [1:39:07<24:54, 1.40it/s, loss=0.0084, lr=5.09e-06, step=7909] Training: 79%|███████▉ | 7910/10000 [1:39:07<24:54, 1.40it/s, loss=0.0040, lr=5.08e-06, step=7910] Training: 79%|███████▉ | 7911/10000 [1:39:08<24:03, 1.45it/s, loss=0.0040, lr=5.08e-06, step=7910] Training: 79%|███████▉ | 7911/10000 [1:39:08<24:03, 1.45it/s, loss=0.0110, lr=5.08e-06, step=7911] Training: 79%|███████▉ | 7912/10000 [1:39:09<29:45, 1.17it/s, loss=0.0110, lr=5.08e-06, step=7911] Training: 79%|███████▉ | 7912/10000 [1:39:09<29:45, 1.17it/s, loss=0.0013, lr=5.08e-06, step=7912] Training: 79%|███████▉ | 7913/10000 [1:39:09<26:53, 1.29it/s, loss=0.0013, lr=5.08e-06, step=7912] Training: 79%|███████▉ | 7913/10000 [1:39:09<26:53, 1.29it/s, loss=0.0044, lr=5.08e-06, step=7913] Training: 79%|███████▉ | 7914/10000 [1:39:10<27:18, 1.27it/s, loss=0.0044, lr=5.08e-06, step=7913] Training: 79%|███████▉ | 7914/10000 [1:39:10<27:18, 1.27it/s, loss=0.0228, lr=5.07e-06, step=7914] Training: 79%|███████▉ | 7915/10000 [1:39:11<29:38, 1.17it/s, loss=0.0228, lr=5.07e-06, step=7914] Training: 79%|███████▉ | 7915/10000 [1:39:11<29:38, 1.17it/s, loss=0.0016, lr=5.07e-06, step=7915] Training: 79%|███████▉ | 7916/10000 [1:39:12<29:44, 1.17it/s, loss=0.0016, lr=5.07e-06, step=7915] Training: 79%|███████▉ | 7916/10000 [1:39:12<29:44, 1.17it/s, loss=0.0108, lr=5.07e-06, step=7916] Training: 79%|███████▉ | 7917/10000 [1:39:13<26:05, 1.33it/s, loss=0.0108, lr=5.07e-06, step=7916] Training: 79%|███████▉ | 7917/10000 [1:39:13<26:05, 1.33it/s, loss=0.0017, lr=5.07e-06, step=7917] Training: 79%|███████▉ | 7918/10000 [1:39:13<23:27, 1.48it/s, loss=0.0017, lr=5.07e-06, step=7917] Training: 79%|███████▉ | 7918/10000 [1:39:13<23:27, 1.48it/s, loss=0.0115, lr=5.07e-06, step=7918] Training: 79%|███████▉ | 7919/10000 [1:39:14<27:29, 1.26it/s, loss=0.0115, lr=5.07e-06, step=7918] Training: 79%|███████▉ | 7919/10000 [1:39:14<27:29, 1.26it/s, loss=0.0088, lr=5.06e-06, step=7919]17:45:21.976 [I] step=7920 loss=0.0395 smoothed_loss=0.0123 lr=5.07e-06 grad_norm=0.5088 step_time=0.6407s data_time=0.1642s it/s=1.243 eta_to_10000=1674.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0171 grad_action_out_proj=0.1194 grad_shared_expert=0.4875 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7920/10000 [1:39:15<27:54, 1.24it/s, loss=0.0088, lr=5.06e-06, step=7919] Training: 79%|███████▉ | 7920/10000 [1:39:15<27:54, 1.24it/s, loss=0.0395, lr=5.06e-06, step=7920] Training: 79%|███████▉ | 7921/10000 [1:39:16<27:39, 1.25it/s, loss=0.0395, lr=5.06e-06, step=7920] Training: 79%|███████▉ | 7921/10000 [1:39:16<27:39, 1.25it/s, loss=0.0104, lr=5.06e-06, step=7921] Training: 79%|███████▉ | 7922/10000 [1:39:17<27:01, 1.28it/s, loss=0.0104, lr=5.06e-06, step=7921] Training: 79%|███████▉ | 7922/10000 [1:39:17<27:01, 1.28it/s, loss=0.0108, lr=5.06e-06, step=7922] Training: 79%|███████▉ | 7923/10000 [1:39:17<24:21, 1.42it/s, loss=0.0108, lr=5.06e-06, step=7922] Training: 79%|███████▉ | 7923/10000 [1:39:17<24:21, 1.42it/s, loss=0.0059, lr=5.05e-06, step=7923] Training: 79%|███████▉ | 7924/10000 [1:39:18<27:38, 1.25it/s, loss=0.0059, lr=5.05e-06, step=7923] Training: 79%|███████▉ | 7924/10000 [1:39:18<27:38, 1.25it/s, loss=0.0078, lr=5.05e-06, step=7924] Training: 79%|███████▉ | 7925/10000 [1:39:19<27:39, 1.25it/s, loss=0.0078, lr=5.05e-06, step=7924] Training: 79%|███████▉ | 7925/10000 [1:39:19<27:39, 1.25it/s, loss=0.0137, lr=5.05e-06, step=7925] Training: 79%|███████▉ | 7926/10000 [1:39:20<28:35, 1.21it/s, loss=0.0137, lr=5.05e-06, step=7925] Training: 79%|███████▉ | 7926/10000 [1:39:20<28:35, 1.21it/s, loss=0.0153, lr=5.05e-06, step=7926] Training: 79%|███████▉ | 7927/10000 [1:39:21<27:52, 1.24it/s, loss=0.0153, lr=5.05e-06, step=7926] Training: 79%|███████▉ | 7927/10000 [1:39:21<27:52, 1.24it/s, loss=0.0130, lr=5.04e-06, step=7927] Training: 79%|███████▉ | 7928/10000 [1:39:22<29:18, 1.18it/s, loss=0.0130, lr=5.04e-06, step=7927] Training: 79%|███████▉ | 7928/10000 [1:39:22<29:18, 1.18it/s, loss=0.0147, lr=5.04e-06, step=7928] Training: 79%|███████▉ | 7929/10000 [1:39:22<29:35, 1.17it/s, loss=0.0147, lr=5.04e-06, step=7928] Training: 79%|███████▉ | 7929/10000 [1:39:22<29:35, 1.17it/s, loss=0.0593, lr=5.04e-06, step=7929]17:45:29.861 [I] step=7930 loss=0.0041 smoothed_loss=0.0155 lr=5.05e-06 grad_norm=0.4066 step_time=0.6040s data_time=0.1845s it/s=1.268 eta_to_10000=1632.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0141 grad_action_out_proj=0.0852 grad_shared_expert=0.2464 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7930/10000 [1:39:23<26:19, 1.31it/s, loss=0.0593, lr=5.04e-06, step=7929] Training: 79%|███████▉ | 7930/10000 [1:39:23<26:19, 1.31it/s, loss=0.0041, lr=5.04e-06, step=7930] Training: 79%|███████▉ | 7931/10000 [1:39:24<24:53, 1.38it/s, loss=0.0041, lr=5.04e-06, step=7930] Training: 79%|███████▉ | 7931/10000 [1:39:24<24:53, 1.38it/s, loss=0.0014, lr=5.03e-06, step=7931] Training: 79%|███████▉ | 7932/10000 [1:39:24<22:57, 1.50it/s, loss=0.0014, lr=5.03e-06, step=7931] Training: 79%|███████▉ | 7932/10000 [1:39:24<22:57, 1.50it/s, loss=0.0046, lr=5.03e-06, step=7932] Training: 79%|███████▉ | 7933/10000 [1:39:25<25:16, 1.36it/s, loss=0.0046, lr=5.03e-06, step=7932] Training: 79%|███████▉ | 7933/10000 [1:39:25<25:16, 1.36it/s, loss=0.0142, lr=5.03e-06, step=7933] Training: 79%|███████▉ | 7934/10000 [1:39:26<25:05, 1.37it/s, loss=0.0142, lr=5.03e-06, step=7933] Training: 79%|███████▉ | 7934/10000 [1:39:26<25:05, 1.37it/s, loss=0.0349, lr=5.03e-06, step=7934] Training: 79%|███████▉ | 7935/10000 [1:39:27<27:42, 1.24it/s, loss=0.0349, lr=5.03e-06, step=7934] Training: 79%|███████▉ | 7935/10000 [1:39:27<27:42, 1.24it/s, loss=0.0138, lr=5.03e-06, step=7935] Training: 79%|███████▉ | 7936/10000 [1:39:28<30:42, 1.12it/s, loss=0.0138, lr=5.03e-06, step=7935] Training: 79%|███████▉ | 7936/10000 [1:39:28<30:42, 1.12it/s, loss=0.0044, lr=5.02e-06, step=7936] Training: 79%|███████▉ | 7937/10000 [1:39:29<29:25, 1.17it/s, loss=0.0044, lr=5.02e-06, step=7936] Training: 79%|███████▉ | 7937/10000 [1:39:29<29:25, 1.17it/s, loss=0.0029, lr=5.02e-06, step=7937] Training: 79%|███████▉ | 7938/10000 [1:39:29<28:26, 1.21it/s, loss=0.0029, lr=5.02e-06, step=7937] Training: 79%|███████▉ | 7938/10000 [1:39:29<28:26, 1.21it/s, loss=0.0333, lr=5.02e-06, step=7938] Training: 79%|███████▉ | 7939/10000 [1:39:30<25:12, 1.36it/s, loss=0.0333, lr=5.02e-06, step=7938] Training: 79%|███████▉ | 7939/10000 [1:39:30<25:12, 1.36it/s, loss=0.0220, lr=5.02e-06, step=7939]17:45:37.636 [I] step=7940 loss=0.0188 smoothed_loss=0.0161 lr=5.02e-06 grad_norm=0.4506 step_time=0.6039s data_time=0.1736s it/s=1.286 eta_to_10000=1601.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0214 grad_action_out_proj=0.1318 grad_shared_expert=0.4975 (10775:train_pytorch.py:850) + Training: 79%|███████▉ | 7940/10000 [1:39:31<26:40, 1.29it/s, loss=0.0220, lr=5.02e-06, step=7939] Training: 79%|███████▉ | 7940/10000 [1:39:31<26:40, 1.29it/s, loss=0.0188, lr=5.01e-06, step=7940] Training: 79%|███████▉ | 7941/10000 [1:39:32<26:56, 1.27it/s, loss=0.0188, lr=5.01e-06, step=7940] Training: 79%|███████▉ | 7941/10000 [1:39:32<26:56, 1.27it/s, loss=0.0082, lr=5.01e-06, step=7941] Training: 79%|███████▉ | 7942/10000 [1:39:32<27:32, 1.25it/s, loss=0.0082, lr=5.01e-06, step=7941] Training: 79%|███████▉ | 7942/10000 [1:39:32<27:32, 1.25it/s, loss=0.0028, lr=5.01e-06, step=7942] Training: 79%|███████▉ | 7943/10000 [1:39:33<26:24, 1.30it/s, loss=0.0028, lr=5.01e-06, step=7942] Training: 79%|███████▉ | 7943/10000 [1:39:33<26:24, 1.30it/s, loss=0.0246, lr=5.01e-06, step=7943] Training: 79%|███████▉ | 7944/10000 [1:39:34<27:50, 1.23it/s, loss=0.0246, lr=5.01e-06, step=7943] Training: 79%|███████▉ | 7944/10000 [1:39:34<27:50, 1.23it/s, loss=0.0059, lr=5.00e-06, step=7944] Training: 79%|███████▉ | 7945/10000 [1:39:35<27:45, 1.23it/s, loss=0.0059, lr=5.00e-06, step=7944] Training: 79%|███████▉ | 7945/10000 [1:39:35<27:45, 1.23it/s, loss=0.0023, lr=5.00e-06, step=7945] Training: 79%|███████▉ | 7946/10000 [1:39:35<24:54, 1.37it/s, loss=0.0023, lr=5.00e-06, step=7945] Training: 79%|███████▉ | 7946/10000 [1:39:35<24:54, 1.37it/s, loss=0.0010, lr=5.00e-06, step=7946] Training: 79%|███████▉ | 7947/10000 [1:39:36<24:59, 1.37it/s, loss=0.0010, lr=5.00e-06, step=7946] Training: 79%|███████▉ | 7947/10000 [1:39:36<24:59, 1.37it/s, loss=0.0044, lr=5.00e-06, step=7947] Training: 79%|███████▉ | 7948/10000 [1:39:37<30:59, 1.10it/s, loss=0.0044, lr=5.00e-06, step=7947] Training: 79%|███████▉ | 7948/10000 [1:39:37<30:59, 1.10it/s, loss=0.0081, lr=4.99e-06, step=7948] Training: 79%|███████▉ | 7949/10000 [1:39:38<28:23, 1.20it/s, loss=0.0081, lr=4.99e-06, step=7948] Training: 79%|███████▉ | 7949/10000 [1:39:38<28:23, 1.20it/s, loss=0.0069, lr=4.99e-06, step=7949]17:45:45.788 [I] step=7950 loss=0.0071 smoothed_loss=0.0100 lr=5.00e-06 grad_norm=0.4308 step_time=0.6438s data_time=0.1714s it/s=1.227 eta_to_10000=1670.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0139 grad_action_out_proj=0.1245 grad_shared_expert=0.3845 (10775:train_pytorch.py:850) + Training: 80%|███████▉ | 7950/10000 [1:39:39<28:34, 1.20it/s, loss=0.0069, lr=4.99e-06, step=7949] Training: 80%|███████▉ | 7950/10000 [1:39:39<28:34, 1.20it/s, loss=0.0071, lr=4.99e-06, step=7950] Training: 80%|███████▉ | 7951/10000 [1:39:40<27:03, 1.26it/s, loss=0.0071, lr=4.99e-06, step=7950] Training: 80%|███████▉ | 7951/10000 [1:39:40<27:03, 1.26it/s, loss=0.0171, lr=4.99e-06, step=7951] Training: 80%|███████▉ | 7952/10000 [1:39:40<26:58, 1.27it/s, loss=0.0171, lr=4.99e-06, step=7951] Training: 80%|███████▉ | 7952/10000 [1:39:40<26:58, 1.27it/s, loss=0.0036, lr=4.99e-06, step=7952] Training: 80%|███████▉ | 7953/10000 [1:39:41<26:45, 1.27it/s, loss=0.0036, lr=4.99e-06, step=7952] Training: 80%|███████▉ | 7953/10000 [1:39:41<26:45, 1.27it/s, loss=0.0041, lr=4.98e-06, step=7953] Training: 80%|███████▉ | 7954/10000 [1:39:42<27:35, 1.24it/s, loss=0.0041, lr=4.98e-06, step=7953] Training: 80%|███████▉ | 7954/10000 [1:39:42<27:35, 1.24it/s, loss=0.0011, lr=4.98e-06, step=7954] Training: 80%|███████▉ | 7955/10000 [1:39:43<28:03, 1.21it/s, loss=0.0011, lr=4.98e-06, step=7954] Training: 80%|███████▉ | 7955/10000 [1:39:43<28:03, 1.21it/s, loss=0.0088, lr=4.98e-06, step=7955] Training: 80%|███████▉ | 7956/10000 [1:39:44<27:20, 1.25it/s, loss=0.0088, lr=4.98e-06, step=7955] Training: 80%|███████▉ | 7956/10000 [1:39:44<27:20, 1.25it/s, loss=0.0024, lr=4.98e-06, step=7956] Training: 80%|███████▉ | 7957/10000 [1:39:44<25:44, 1.32it/s, loss=0.0024, lr=4.98e-06, step=7956] Training: 80%|███████▉ | 7957/10000 [1:39:44<25:44, 1.32it/s, loss=0.0603, lr=4.97e-06, step=7957] Training: 80%|███████▉ | 7958/10000 [1:39:45<26:45, 1.27it/s, loss=0.0603, lr=4.97e-06, step=7957] Training: 80%|███████▉ | 7958/10000 [1:39:45<26:45, 1.27it/s, loss=0.0047, lr=4.97e-06, step=7958] Training: 80%|███████▉ | 7959/10000 [1:39:46<26:56, 1.26it/s, loss=0.0047, lr=4.97e-06, step=7958] Training: 80%|███████▉ | 7959/10000 [1:39:46<26:56, 1.26it/s, loss=0.0100, lr=4.97e-06, step=7959]17:45:53.606 [I] step=7960 loss=0.0033 smoothed_loss=0.0113 lr=4.98e-06 grad_norm=0.4106 step_time=0.6288s data_time=0.1531s it/s=1.279 eta_to_10000=1594.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0465 grad_action_out_proj=0.1614 grad_shared_expert=0.5500 (10775:train_pytorch.py:850) + Training: 80%|███████▉ | 7960/10000 [1:39:47<26:52, 1.27it/s, loss=0.0100, lr=4.97e-06, step=7959] Training: 80%|███████▉ | 7960/10000 [1:39:47<26:52, 1.27it/s, loss=0.0033, lr=4.97e-06, step=7960] Training: 80%|███████▉ | 7961/10000 [1:39:47<25:51, 1.31it/s, loss=0.0033, lr=4.97e-06, step=7960] Training: 80%|███████▉ | 7961/10000 [1:39:47<25:51, 1.31it/s, loss=0.0059, lr=4.96e-06, step=7961] Training: 80%|███████▉ | 7962/10000 [1:39:48<27:33, 1.23it/s, loss=0.0059, lr=4.96e-06, step=7961] Training: 80%|███████▉ | 7962/10000 [1:39:48<27:33, 1.23it/s, loss=0.0268, lr=4.96e-06, step=7962] Training: 80%|███████▉ | 7963/10000 [1:39:49<24:46, 1.37it/s, loss=0.0268, lr=4.96e-06, step=7962] Training: 80%|███████▉ | 7963/10000 [1:39:49<24:46, 1.37it/s, loss=0.1111, lr=4.96e-06, step=7963] Training: 80%|███████▉ | 7964/10000 [1:39:49<23:53, 1.42it/s, loss=0.1111, lr=4.96e-06, step=7963] Training: 80%|███████▉ | 7964/10000 [1:39:49<23:53, 1.42it/s, loss=0.0090, lr=4.96e-06, step=7964] Training: 80%|███████▉ | 7965/10000 [1:39:50<24:55, 1.36it/s, loss=0.0090, lr=4.96e-06, step=7964] Training: 80%|███████▉ | 7965/10000 [1:39:50<24:55, 1.36it/s, loss=0.0030, lr=4.96e-06, step=7965] Training: 80%|███████▉ | 7966/10000 [1:39:51<24:48, 1.37it/s, loss=0.0030, lr=4.96e-06, step=7965] Training: 80%|███████▉ | 7966/10000 [1:39:51<24:48, 1.37it/s, loss=0.0087, lr=4.95e-06, step=7966] Training: 80%|███████▉ | 7967/10000 [1:39:52<22:48, 1.49it/s, loss=0.0087, lr=4.95e-06, step=7966] Training: 80%|███████▉ | 7967/10000 [1:39:52<22:48, 1.49it/s, loss=0.0248, lr=4.95e-06, step=7967] Training: 80%|███████▉ | 7968/10000 [1:39:52<21:03, 1.61it/s, loss=0.0248, lr=4.95e-06, step=7967] Training: 80%|███████▉ | 7968/10000 [1:39:52<21:03, 1.61it/s, loss=0.0009, lr=4.95e-06, step=7968] Training: 80%|███████▉ | 7969/10000 [1:39:53<24:38, 1.37it/s, loss=0.0009, lr=4.95e-06, step=7968] Training: 80%|███████▉ | 7969/10000 [1:39:53<24:38, 1.37it/s, loss=0.0156, lr=4.95e-06, step=7969]17:46:00.593 [I] step=7970 loss=0.0136 smoothed_loss=0.0165 lr=4.95e-06 grad_norm=0.4334 step_time=0.5888s data_time=0.1098s it/s=1.432 eta_to_10000=1418.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0147 grad_action_out_proj=0.0870 grad_shared_expert=0.2455 (10775:train_pytorch.py:850) + Training: 80%|███████▉ | 7970/10000 [1:39:54<23:41, 1.43it/s, loss=0.0156, lr=4.95e-06, step=7969] Training: 80%|███████▉ | 7970/10000 [1:39:54<23:41, 1.43it/s, loss=0.0136, lr=4.94e-06, step=7970] Training: 80%|███████▉ | 7971/10000 [1:39:54<23:50, 1.42it/s, loss=0.0136, lr=4.94e-06, step=7970] Training: 80%|███████▉ | 7971/10000 [1:39:54<23:50, 1.42it/s, loss=0.0646, lr=4.94e-06, step=7971] Training: 80%|███████▉ | 7972/10000 [1:39:55<24:05, 1.40it/s, loss=0.0646, lr=4.94e-06, step=7971] Training: 80%|███████▉ | 7972/10000 [1:39:55<24:05, 1.40it/s, loss=0.0030, lr=4.94e-06, step=7972] Training: 80%|███████▉ | 7973/10000 [1:39:56<28:19, 1.19it/s, loss=0.0030, lr=4.94e-06, step=7972] Training: 80%|███████▉ | 7973/10000 [1:39:56<28:19, 1.19it/s, loss=0.0025, lr=4.94e-06, step=7973] Training: 80%|███████▉ | 7974/10000 [1:39:57<28:54, 1.17it/s, loss=0.0025, lr=4.94e-06, step=7973] Training: 80%|███████▉ | 7974/10000 [1:39:57<28:54, 1.17it/s, loss=0.0342, lr=4.93e-06, step=7974] Training: 80%|███████▉ | 7975/10000 [1:39:58<28:30, 1.18it/s, loss=0.0342, lr=4.93e-06, step=7974] Training: 80%|███████▉ | 7975/10000 [1:39:58<28:30, 1.18it/s, loss=0.0087, lr=4.93e-06, step=7975] Training: 80%|███████▉ | 7976/10000 [1:39:59<30:56, 1.09it/s, loss=0.0087, lr=4.93e-06, step=7975] Training: 80%|███████▉ | 7976/10000 [1:39:59<30:56, 1.09it/s, loss=0.0034, lr=4.93e-06, step=7976] Training: 80%|███████▉ | 7977/10000 [1:40:00<30:15, 1.11it/s, loss=0.0034, lr=4.93e-06, step=7976] Training: 80%|███████▉ | 7977/10000 [1:40:00<30:15, 1.11it/s, loss=0.0203, lr=4.93e-06, step=7977] Training: 80%|███████▉ | 7978/10000 [1:40:00<26:49, 1.26it/s, loss=0.0203, lr=4.93e-06, step=7977] Training: 80%|███████▉ | 7978/10000 [1:40:00<26:49, 1.26it/s, loss=0.0081, lr=4.92e-06, step=7978] Training: 80%|███████▉ | 7979/10000 [1:40:01<27:22, 1.23it/s, loss=0.0081, lr=4.92e-06, step=7978] Training: 80%|███████▉ | 7979/10000 [1:40:01<27:22, 1.23it/s, loss=0.0043, lr=4.92e-06, step=7979]17:46:09.269 [I] step=7980 loss=0.0055 smoothed_loss=0.0141 lr=4.93e-06 grad_norm=0.5184 step_time=0.6674s data_time=0.2002s it/s=1.153 eta_to_10000=1752.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.0837 grad_shared_expert=0.9289 (10775:train_pytorch.py:850) + Training: 80%|███████▉ | 7980/10000 [1:40:02<29:35, 1.14it/s, loss=0.0043, lr=4.92e-06, step=7979] Training: 80%|███████▉ | 7980/10000 [1:40:02<29:35, 1.14it/s, loss=0.0055, lr=4.92e-06, step=7980] Training: 80%|███████▉ | 7981/10000 [1:40:03<29:10, 1.15it/s, loss=0.0055, lr=4.92e-06, step=7980] Training: 80%|███████▉ | 7981/10000 [1:40:03<29:10, 1.15it/s, loss=0.0065, lr=4.92e-06, step=7981] Training: 80%|███████▉ | 7982/10000 [1:40:04<26:18, 1.28it/s, loss=0.0065, lr=4.92e-06, step=7981] Training: 80%|███████▉ | 7982/10000 [1:40:04<26:18, 1.28it/s, loss=0.0058, lr=4.92e-06, step=7982] Training: 80%|███████▉ | 7983/10000 [1:40:04<25:20, 1.33it/s, loss=0.0058, lr=4.92e-06, step=7982] Training: 80%|███████▉ | 7983/10000 [1:40:04<25:20, 1.33it/s, loss=0.0028, lr=4.91e-06, step=7983] Training: 80%|███████▉ | 7984/10000 [1:40:06<29:15, 1.15it/s, loss=0.0028, lr=4.91e-06, step=7983] Training: 80%|███████▉ | 7984/10000 [1:40:06<29:15, 1.15it/s, loss=0.0055, lr=4.91e-06, step=7984] Training: 80%|███████▉ | 7985/10000 [1:40:06<26:21, 1.27it/s, loss=0.0055, lr=4.91e-06, step=7984] Training: 80%|███████▉ | 7985/10000 [1:40:06<26:21, 1.27it/s, loss=0.0010, lr=4.91e-06, step=7985] Training: 80%|███████▉ | 7986/10000 [1:40:07<24:20, 1.38it/s, loss=0.0010, lr=4.91e-06, step=7985] Training: 80%|███████▉ | 7986/10000 [1:40:07<24:20, 1.38it/s, loss=0.0060, lr=4.91e-06, step=7986] Training: 80%|███████▉ | 7987/10000 [1:40:07<24:10, 1.39it/s, loss=0.0060, lr=4.91e-06, step=7986] Training: 80%|███████▉ | 7987/10000 [1:40:07<24:10, 1.39it/s, loss=0.0034, lr=4.90e-06, step=7987] Training: 80%|███████▉ | 7988/10000 [1:40:08<23:46, 1.41it/s, loss=0.0034, lr=4.90e-06, step=7987] Training: 80%|███████▉ | 7988/10000 [1:40:08<23:46, 1.41it/s, loss=0.0044, lr=4.90e-06, step=7988] Training: 80%|███████▉ | 7989/10000 [1:40:09<22:49, 1.47it/s, loss=0.0044, lr=4.90e-06, step=7988] Training: 80%|███████▉ | 7989/10000 [1:40:09<22:49, 1.47it/s, loss=0.0121, lr=4.90e-06, step=7989]17:46:16.299 [I] step=7990 loss=0.0026 smoothed_loss=0.0083 lr=4.91e-06 grad_norm=0.3703 step_time=0.5817s data_time=0.1213s it/s=1.423 eta_to_10000=1412.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0056 grad_action_out_proj=0.0477 grad_shared_expert=0.1387 (10775:train_pytorch.py:850) + Training: 80%|███████▉ | 7990/10000 [1:40:09<21:58, 1.52it/s, loss=0.0121, lr=4.90e-06, step=7989] Training: 80%|███████▉ | 7990/10000 [1:40:09<21:58, 1.52it/s, loss=0.0026, lr=4.90e-06, step=7990] Training: 80%|███████▉ | 7991/10000 [1:40:10<23:35, 1.42it/s, loss=0.0026, lr=4.90e-06, step=7990] Training: 80%|███████▉ | 7991/10000 [1:40:10<23:35, 1.42it/s, loss=0.0043, lr=4.90e-06, step=7991] Training: 80%|███████▉ | 7992/10000 [1:40:11<23:39, 1.41it/s, loss=0.0043, lr=4.90e-06, step=7991] Training: 80%|███████▉ | 7992/10000 [1:40:11<23:39, 1.41it/s, loss=0.0088, lr=4.89e-06, step=7992] Training: 80%|███████▉ | 7993/10000 [1:40:11<21:32, 1.55it/s, loss=0.0088, lr=4.89e-06, step=7992] Training: 80%|███████▉ | 7993/10000 [1:40:11<21:32, 1.55it/s, loss=0.0068, lr=4.89e-06, step=7993] Training: 80%|███████▉ | 7994/10000 [1:40:12<24:38, 1.36it/s, loss=0.0068, lr=4.89e-06, step=7993] Training: 80%|███████▉ | 7994/10000 [1:40:12<24:38, 1.36it/s, loss=0.0282, lr=4.89e-06, step=7994] Training: 80%|███████▉ | 7995/10000 [1:40:13<25:41, 1.30it/s, loss=0.0282, lr=4.89e-06, step=7994] Training: 80%|███████▉ | 7995/10000 [1:40:13<25:41, 1.30it/s, loss=0.0015, lr=4.89e-06, step=7995] Training: 80%|███████▉ | 7996/10000 [1:40:14<25:24, 1.31it/s, loss=0.0015, lr=4.89e-06, step=7995] Training: 80%|███████▉ | 7996/10000 [1:40:14<25:24, 1.31it/s, loss=0.0096, lr=4.88e-06, step=7996] Training: 80%|███████▉ | 7997/10000 [1:40:15<24:36, 1.36it/s, loss=0.0096, lr=4.88e-06, step=7996] Training: 80%|███████▉ | 7997/10000 [1:40:15<24:36, 1.36it/s, loss=0.0102, lr=4.88e-06, step=7997] Training: 80%|███████▉ | 7998/10000 [1:40:16<28:04, 1.19it/s, loss=0.0102, lr=4.88e-06, step=7997] Training: 80%|███████▉ | 7998/10000 [1:40:16<28:04, 1.19it/s, loss=0.0009, lr=4.88e-06, step=7998] Training: 80%|███████▉ | 7999/10000 [1:40:17<28:40, 1.16it/s, loss=0.0009, lr=4.88e-06, step=7998] Training: 80%|███████▉ | 7999/10000 [1:40:17<28:40, 1.16it/s, loss=0.0267, lr=4.88e-06, step=7999]17:46:24.193 [I] step=8000 loss=0.0079 smoothed_loss=0.0100 lr=4.88e-06 grad_norm=0.3952 step_time=0.6392s data_time=0.1502s it/s=1.267 eta_to_10000=1578.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0713 grad_shared_expert=0.3033 (10775:train_pytorch.py:850) +17:48:19.855 [I] Saved checkpoint at step 8000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/8000 (10775:train_pytorch.py:350) + Training: 80%|████████ | 8000/10000 [1:42:13<19:43:22, 35.50s/it, loss=0.0267, lr=4.88e-06, step=7999] Training: 80%|████████ | 8000/10000 [1:42:13<19:43:22, 35.50s/it, loss=0.0079, lr=4.87e-06, step=8000] Training: 80%|████████ | 8001/10000 [1:42:14<13:58:08, 25.16s/it, loss=0.0079, lr=4.87e-06, step=8000] Training: 80%|████████ | 8001/10000 [1:42:14<13:58:08, 25.16s/it, loss=0.0120, lr=4.87e-06, step=8001] Training: 80%|████████ | 8002/10000 [1:42:14<9:51:41, 17.77s/it, loss=0.0120, lr=4.87e-06, step=8001] Training: 80%|████████ | 8002/10000 [1:42:14<9:51:41, 17.77s/it, loss=0.0128, lr=4.87e-06, step=8002] Training: 80%|████████ | 8003/10000 [1:42:15<7:02:56, 12.71s/it, loss=0.0128, lr=4.87e-06, step=8002] Training: 80%|████████ | 8003/10000 [1:42:15<7:02:56, 12.71s/it, loss=0.0015, lr=4.87e-06, step=8003] Training: 80%|████████ | 8004/10000 [1:42:16<5:01:08, 9.05s/it, loss=0.0015, lr=4.87e-06, step=8003] Training: 80%|████████ | 8004/10000 [1:42:16<5:01:08, 9.05s/it, loss=0.0016, lr=4.87e-06, step=8004] Training: 80%|████████ | 8005/10000 [1:42:17<3:38:17, 6.57s/it, loss=0.0016, lr=4.87e-06, step=8004] Training: 80%|████████ | 8005/10000 [1:42:17<3:38:17, 6.57s/it, loss=0.0138, lr=4.86e-06, step=8005] Training: 80%|████████ | 8006/10000 [1:42:17<2:39:11, 4.79s/it, loss=0.0138, lr=4.86e-06, step=8005] Training: 80%|████████ | 8006/10000 [1:42:17<2:39:11, 4.79s/it, loss=0.0063, lr=4.86e-06, step=8006] Training: 80%|████████ | 8007/10000 [1:42:18<1:57:54, 3.55s/it, loss=0.0063, lr=4.86e-06, step=8006] Training: 80%|████████ | 8007/10000 [1:42:18<1:57:54, 3.55s/it, loss=0.0047, lr=4.86e-06, step=8007] Training: 80%|████████ | 8008/10000 [1:42:19<1:30:47, 2.73s/it, loss=0.0047, lr=4.86e-06, step=8007] Training: 80%|████████ | 8008/10000 [1:42:19<1:30:47, 2.73s/it, loss=0.0171, lr=4.86e-06, step=8008] Training: 80%|████████ | 8009/10000 [1:42:20<1:11:53, 2.17s/it, loss=0.0171, lr=4.86e-06, step=8008] Training: 80%|████████ | 8009/10000 [1:42:20<1:11:53, 2.17s/it, loss=0.0381, lr=4.85e-06, step=8009]17:48:27.357 [I] step=8010 loss=0.0102 smoothed_loss=0.0121 lr=4.86e-06 grad_norm=0.4165 step_time=0.6027s data_time=11.7137s it/s=0.081 eta_to_10000=24509.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.1238 grad_shared_expert=0.4087 (10775:train_pytorch.py:850) + Training: 80%|████████ | 8010/10000 [1:42:20<58:04, 1.75s/it, loss=0.0381, lr=4.85e-06, step=8009] Training: 80%|████████ | 8010/10000 [1:42:20<58:04, 1.75s/it, loss=0.0102, lr=4.85e-06, step=8010] Training: 80%|████████ | 8011/10000 [1:42:21<47:38, 1.44s/it, loss=0.0102, lr=4.85e-06, step=8010] Training: 80%|████████ | 8011/10000 [1:42:21<47:38, 1.44s/it, loss=0.0014, lr=4.85e-06, step=8011] Training: 80%|████████ | 8012/10000 [1:42:22<41:19, 1.25s/it, loss=0.0014, lr=4.85e-06, step=8011] Training: 80%|████████ | 8012/10000 [1:42:22<41:19, 1.25s/it, loss=0.0026, lr=4.85e-06, step=8012] Training: 80%|████████ | 8013/10000 [1:42:23<39:40, 1.20s/it, loss=0.0026, lr=4.85e-06, step=8012] Training: 80%|████████ | 8013/10000 [1:42:23<39:40, 1.20s/it, loss=0.0035, lr=4.84e-06, step=8013] Training: 80%|████████ | 8014/10000 [1:42:24<33:19, 1.01s/it, loss=0.0035, lr=4.84e-06, step=8013] Training: 80%|████████ | 8014/10000 [1:42:24<33:19, 1.01s/it, loss=0.0099, lr=4.84e-06, step=8014] Training: 80%|████████ | 8015/10000 [1:42:24<29:42, 1.11it/s, loss=0.0099, lr=4.84e-06, step=8014] Training: 80%|████████ | 8015/10000 [1:42:24<29:42, 1.11it/s, loss=0.0019, lr=4.84e-06, step=8015] Training: 80%|████████ | 8016/10000 [1:42:25<28:17, 1.17it/s, loss=0.0019, lr=4.84e-06, step=8015] Training: 80%|████████ | 8016/10000 [1:42:25<28:17, 1.17it/s, loss=0.0020, lr=4.84e-06, step=8016] Training: 80%|████████ | 8017/10000 [1:42:26<27:01, 1.22it/s, loss=0.0020, lr=4.84e-06, step=8016] Training: 80%|████████ | 8017/10000 [1:42:26<27:01, 1.22it/s, loss=0.0156, lr=4.84e-06, step=8017] Training: 80%|████████ | 8018/10000 [1:42:26<26:02, 1.27it/s, loss=0.0156, lr=4.84e-06, step=8017] Training: 80%|████████ | 8018/10000 [1:42:26<26:02, 1.27it/s, loss=0.0086, lr=4.83e-06, step=8018] Training: 80%|████████ | 8019/10000 [1:42:27<28:28, 1.16it/s, loss=0.0086, lr=4.83e-06, step=8018] Training: 80%|████████ | 8019/10000 [1:42:27<28:28, 1.16it/s, loss=0.0151, lr=4.83e-06, step=8019]17:48:35.151 [I] step=8020 loss=0.0125 smoothed_loss=0.0097 lr=4.84e-06 grad_norm=0.4313 step_time=0.6099s data_time=0.1695s it/s=1.283 eta_to_10000=1543.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.0986 grad_shared_expert=0.4674 (10775:train_pytorch.py:850) + Training: 80%|████████ | 8020/10000 [1:42:28<27:24, 1.20it/s, loss=0.0151, lr=4.83e-06, step=8019] Training: 80%|████████ | 8020/10000 [1:42:28<27:24, 1.20it/s, loss=0.0125, lr=4.83e-06, step=8020] Training: 80%|████████ | 8021/10000 [1:42:29<26:54, 1.23it/s, loss=0.0125, lr=4.83e-06, step=8020] Training: 80%|████████ | 8021/10000 [1:42:29<26:54, 1.23it/s, loss=0.0352, lr=4.83e-06, step=8021] Training: 80%|████████ | 8022/10000 [1:42:30<23:50, 1.38it/s, loss=0.0352, lr=4.83e-06, step=8021] Training: 80%|████████ | 8022/10000 [1:42:30<23:50, 1.38it/s, loss=0.0160, lr=4.82e-06, step=8022] Training: 80%|████████ | 8023/10000 [1:42:30<26:03, 1.26it/s, loss=0.0160, lr=4.82e-06, step=8022] Training: 80%|████████ | 8023/10000 [1:42:30<26:03, 1.26it/s, loss=0.0178, lr=4.82e-06, step=8023] Training: 80%|████████ | 8024/10000 [1:42:31<27:16, 1.21it/s, loss=0.0178, lr=4.82e-06, step=8023] Training: 80%|████████ | 8024/10000 [1:42:31<27:16, 1.21it/s, loss=0.0083, lr=4.82e-06, step=8024] Training: 80%|████████ | 8025/10000 [1:42:32<26:39, 1.23it/s, loss=0.0083, lr=4.82e-06, step=8024] Training: 80%|████████ | 8025/10000 [1:42:32<26:39, 1.23it/s, loss=0.0038, lr=4.82e-06, step=8025] Training: 80%|████████ | 8026/10000 [1:42:33<26:43, 1.23it/s, loss=0.0038, lr=4.82e-06, step=8025] Training: 80%|████████ | 8026/10000 [1:42:33<26:43, 1.23it/s, loss=0.0297, lr=4.82e-06, step=8026] Training: 80%|████████ | 8027/10000 [1:42:34<27:41, 1.19it/s, loss=0.0297, lr=4.82e-06, step=8026] Training: 80%|████████ | 8027/10000 [1:42:34<27:41, 1.19it/s, loss=0.0103, lr=4.81e-06, step=8027] Training: 80%|████████ | 8028/10000 [1:42:35<28:00, 1.17it/s, loss=0.0103, lr=4.81e-06, step=8027] Training: 80%|████████ | 8028/10000 [1:42:35<28:00, 1.17it/s, loss=0.0018, lr=4.81e-06, step=8028] Training: 80%|████████ | 8029/10000 [1:42:36<27:54, 1.18it/s, loss=0.0018, lr=4.81e-06, step=8028] Training: 80%|████████ | 8029/10000 [1:42:36<27:54, 1.18it/s, loss=0.0030, lr=4.81e-06, step=8029]17:48:43.407 [I] step=8030 loss=0.0029 smoothed_loss=0.0104 lr=4.82e-06 grad_norm=0.4488 step_time=0.6394s data_time=0.1863s it/s=1.211 eta_to_10000=1626.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0067 grad_action_out_proj=0.0793 grad_shared_expert=0.2458 (10775:train_pytorch.py:850) + Training: 80%|████████ | 8030/10000 [1:42:36<28:15, 1.16it/s, loss=0.0030, lr=4.81e-06, step=8029] Training: 80%|████████ | 8030/10000 [1:42:36<28:15, 1.16it/s, loss=0.0029, lr=4.81e-06, step=8030] Training: 80%|████████ | 8031/10000 [1:42:37<28:08, 1.17it/s, loss=0.0029, lr=4.81e-06, step=8030] Training: 80%|████████ | 8031/10000 [1:42:37<28:08, 1.17it/s, loss=0.0070, lr=4.80e-06, step=8031] Training: 80%|████████ | 8032/10000 [1:42:38<26:12, 1.25it/s, loss=0.0070, lr=4.80e-06, step=8031] Training: 80%|████████ | 8032/10000 [1:42:38<26:12, 1.25it/s, loss=0.0047, lr=4.80e-06, step=8032] Training: 80%|████████ | 8033/10000 [1:42:39<25:48, 1.27it/s, loss=0.0047, lr=4.80e-06, step=8032] Training: 80%|████████ | 8033/10000 [1:42:39<25:48, 1.27it/s, loss=0.0015, lr=4.80e-06, step=8033] Training: 80%|████████ | 8034/10000 [1:42:39<25:20, 1.29it/s, loss=0.0015, lr=4.80e-06, step=8033] Training: 80%|████████ | 8034/10000 [1:42:39<25:20, 1.29it/s, loss=0.0095, lr=4.80e-06, step=8034] Training: 80%|████████ | 8035/10000 [1:42:40<23:13, 1.41it/s, loss=0.0095, lr=4.80e-06, step=8034] Training: 80%|████████ | 8035/10000 [1:42:40<23:13, 1.41it/s, loss=0.0163, lr=4.80e-06, step=8035] Training: 80%|████████ | 8036/10000 [1:42:41<23:28, 1.39it/s, loss=0.0163, lr=4.80e-06, step=8035] Training: 80%|████████ | 8036/10000 [1:42:41<23:28, 1.39it/s, loss=0.0016, lr=4.79e-06, step=8036] Training: 80%|████████ | 8037/10000 [1:42:42<26:36, 1.23it/s, loss=0.0016, lr=4.79e-06, step=8036] Training: 80%|████████ | 8037/10000 [1:42:42<26:36, 1.23it/s, loss=0.0018, lr=4.79e-06, step=8037] Training: 80%|████████ | 8038/10000 [1:42:43<26:42, 1.22it/s, loss=0.0018, lr=4.79e-06, step=8037] Training: 80%|████████ | 8038/10000 [1:42:43<26:42, 1.22it/s, loss=0.0153, lr=4.79e-06, step=8038] Training: 80%|████████ | 8039/10000 [1:42:43<23:39, 1.38it/s, loss=0.0153, lr=4.79e-06, step=8038] Training: 80%|████████ | 8039/10000 [1:42:43<23:39, 1.38it/s, loss=0.0317, lr=4.79e-06, step=8039]17:48:50.942 [I] step=8040 loss=0.0034 smoothed_loss=0.0103 lr=4.79e-06 grad_norm=0.3759 step_time=0.6014s data_time=0.1520s it/s=1.327 eta_to_10000=1476.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0066 grad_action_out_proj=0.0817 grad_shared_expert=0.2259 (10775:train_pytorch.py:850) + Training: 80%|████████ | 8040/10000 [1:42:44<24:56, 1.31it/s, loss=0.0317, lr=4.79e-06, step=8039] Training: 80%|████████ | 8040/10000 [1:42:44<24:56, 1.31it/s, loss=0.0034, lr=4.78e-06, step=8040] Training: 80%|████████ | 8041/10000 [1:42:45<25:24, 1.28it/s, loss=0.0034, lr=4.78e-06, step=8040] Training: 80%|████████ | 8041/10000 [1:42:45<25:24, 1.28it/s, loss=0.0089, lr=4.78e-06, step=8041] Training: 80%|████████ | 8042/10000 [1:42:46<28:44, 1.14it/s, loss=0.0089, lr=4.78e-06, step=8041] Training: 80%|████████ | 8042/10000 [1:42:46<28:44, 1.14it/s, loss=0.0177, lr=4.78e-06, step=8042] Training: 80%|████████ | 8043/10000 [1:42:46<25:05, 1.30it/s, loss=0.0177, lr=4.78e-06, step=8042] Training: 80%|████████ | 8043/10000 [1:42:46<25:05, 1.30it/s, loss=0.0078, lr=4.78e-06, step=8043] Training: 80%|████████ | 8044/10000 [1:42:47<25:03, 1.30it/s, loss=0.0078, lr=4.78e-06, step=8043] Training: 80%|████████ | 8044/10000 [1:42:47<25:03, 1.30it/s, loss=0.0053, lr=4.77e-06, step=8044] Training: 80%|████████ | 8045/10000 [1:42:48<25:11, 1.29it/s, loss=0.0053, lr=4.77e-06, step=8044] Training: 80%|████████ | 8045/10000 [1:42:48<25:11, 1.29it/s, loss=0.0046, lr=4.77e-06, step=8045] Training: 80%|████████ | 8046/10000 [1:42:48<22:25, 1.45it/s, loss=0.0046, lr=4.77e-06, step=8045] Training: 80%|████████ | 8046/10000 [1:42:48<22:25, 1.45it/s, loss=0.0049, lr=4.77e-06, step=8046] Training: 80%|████████ | 8047/10000 [1:42:49<23:16, 1.40it/s, loss=0.0049, lr=4.77e-06, step=8046] Training: 80%|████████ | 8047/10000 [1:42:49<23:16, 1.40it/s, loss=0.0046, lr=4.77e-06, step=8047] Training: 80%|████████ | 8048/10000 [1:42:50<25:48, 1.26it/s, loss=0.0046, lr=4.77e-06, step=8047] Training: 80%|████████ | 8048/10000 [1:42:50<25:48, 1.26it/s, loss=0.0122, lr=4.77e-06, step=8048] Training: 80%|████████ | 8049/10000 [1:42:51<25:49, 1.26it/s, loss=0.0122, lr=4.77e-06, step=8048] Training: 80%|████████ | 8049/10000 [1:42:51<25:49, 1.26it/s, loss=0.0066, lr=4.76e-06, step=8049]17:48:58.510 [I] step=8050 loss=0.0031 smoothed_loss=0.0082 lr=4.77e-06 grad_norm=0.4363 step_time=0.6268s data_time=0.1300s it/s=1.322 eta_to_10000=1475.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0040 grad_action_out_proj=0.0435 grad_shared_expert=0.1971 (10775:train_pytorch.py:850) + Training: 80%|████████ | 8050/10000 [1:42:52<23:17, 1.40it/s, loss=0.0066, lr=4.76e-06, step=8049] Training: 80%|████████ | 8050/10000 [1:42:52<23:17, 1.40it/s, loss=0.0031, lr=4.76e-06, step=8050] Training: 81%|████████ | 8051/10000 [1:42:52<22:55, 1.42it/s, loss=0.0031, lr=4.76e-06, step=8050] Training: 81%|████████ | 8051/10000 [1:42:52<22:55, 1.42it/s, loss=0.0048, lr=4.76e-06, step=8051] Training: 81%|████████ | 8052/10000 [1:42:53<21:42, 1.50it/s, loss=0.0048, lr=4.76e-06, step=8051] Training: 81%|████████ | 8052/10000 [1:42:53<21:42, 1.50it/s, loss=0.0038, lr=4.76e-06, step=8052] Training: 81%|████████ | 8053/10000 [1:42:54<22:14, 1.46it/s, loss=0.0038, lr=4.76e-06, step=8052] Training: 81%|████████ | 8053/10000 [1:42:54<22:14, 1.46it/s, loss=0.0037, lr=4.75e-06, step=8053] Training: 81%|████████ | 8054/10000 [1:42:54<22:30, 1.44it/s, loss=0.0037, lr=4.75e-06, step=8053] Training: 81%|████████ | 8054/10000 [1:42:54<22:30, 1.44it/s, loss=0.0108, lr=4.75e-06, step=8054] Training: 81%|████████ | 8055/10000 [1:42:55<23:07, 1.40it/s, loss=0.0108, lr=4.75e-06, step=8054] Training: 81%|████████ | 8055/10000 [1:42:55<23:07, 1.40it/s, loss=0.0036, lr=4.75e-06, step=8055] Training: 81%|████████ | 8056/10000 [1:42:56<21:53, 1.48it/s, loss=0.0036, lr=4.75e-06, step=8055] Training: 81%|████████ | 8056/10000 [1:42:56<21:53, 1.48it/s, loss=0.0078, lr=4.75e-06, step=8056] Training: 81%|████████ | 8057/10000 [1:42:57<25:08, 1.29it/s, loss=0.0078, lr=4.75e-06, step=8056] Training: 81%|████████ | 8057/10000 [1:42:57<25:08, 1.29it/s, loss=0.0118, lr=4.75e-06, step=8057] Training: 81%|████████ | 8058/10000 [1:42:57<23:30, 1.38it/s, loss=0.0118, lr=4.75e-06, step=8057] Training: 81%|████████ | 8058/10000 [1:42:57<23:30, 1.38it/s, loss=0.0243, lr=4.74e-06, step=8058] Training: 81%|████████ | 8059/10000 [1:42:58<23:47, 1.36it/s, loss=0.0243, lr=4.74e-06, step=8058] Training: 81%|████████ | 8059/10000 [1:42:58<23:47, 1.36it/s, loss=0.0022, lr=4.74e-06, step=8059]17:49:05.816 [I] step=8060 loss=0.0083 smoothed_loss=0.0085 lr=4.75e-06 grad_norm=0.3887 step_time=0.5945s data_time=0.1360s it/s=1.369 eta_to_10000=1417.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0070 grad_action_out_proj=0.0606 grad_shared_expert=0.2871 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8060/10000 [1:42:59<25:11, 1.28it/s, loss=0.0022, lr=4.74e-06, step=8059] Training: 81%|████████ | 8060/10000 [1:42:59<25:11, 1.28it/s, loss=0.0083, lr=4.74e-06, step=8060] Training: 81%|████████ | 8061/10000 [1:42:59<23:33, 1.37it/s, loss=0.0083, lr=4.74e-06, step=8060] Training: 81%|████████ | 8061/10000 [1:42:59<23:33, 1.37it/s, loss=0.0166, lr=4.74e-06, step=8061] Training: 81%|████████ | 8062/10000 [1:43:00<23:31, 1.37it/s, loss=0.0166, lr=4.74e-06, step=8061] Training: 81%|████████ | 8062/10000 [1:43:00<23:31, 1.37it/s, loss=0.0204, lr=4.73e-06, step=8062] Training: 81%|████████ | 8063/10000 [1:43:01<24:22, 1.32it/s, loss=0.0204, lr=4.73e-06, step=8062] Training: 81%|████████ | 8063/10000 [1:43:01<24:22, 1.32it/s, loss=0.0038, lr=4.73e-06, step=8063] Training: 81%|████████ | 8064/10000 [1:43:02<29:04, 1.11it/s, loss=0.0038, lr=4.73e-06, step=8063] Training: 81%|████████ | 8064/10000 [1:43:02<29:04, 1.11it/s, loss=0.0017, lr=4.73e-06, step=8064] Training: 81%|████████ | 8065/10000 [1:43:03<28:53, 1.12it/s, loss=0.0017, lr=4.73e-06, step=8064] Training: 81%|████████ | 8065/10000 [1:43:03<28:53, 1.12it/s, loss=0.0060, lr=4.73e-06, step=8065] Training: 81%|████████ | 8066/10000 [1:43:04<26:34, 1.21it/s, loss=0.0060, lr=4.73e-06, step=8065] Training: 81%|████████ | 8066/10000 [1:43:04<26:34, 1.21it/s, loss=0.0075, lr=4.73e-06, step=8066] Training: 81%|████████ | 8067/10000 [1:43:05<26:47, 1.20it/s, loss=0.0075, lr=4.73e-06, step=8066] Training: 81%|████████ | 8067/10000 [1:43:05<26:47, 1.20it/s, loss=0.0256, lr=4.72e-06, step=8067] Training: 81%|████████ | 8068/10000 [1:43:05<23:33, 1.37it/s, loss=0.0256, lr=4.72e-06, step=8067] Training: 81%|████████ | 8068/10000 [1:43:05<23:33, 1.37it/s, loss=0.0009, lr=4.72e-06, step=8068] Training: 81%|████████ | 8069/10000 [1:43:06<22:48, 1.41it/s, loss=0.0009, lr=4.72e-06, step=8068] Training: 81%|████████ | 8069/10000 [1:43:06<22:48, 1.41it/s, loss=0.0021, lr=4.72e-06, step=8069]17:49:13.945 [I] step=8070 loss=0.0221 smoothed_loss=0.0100 lr=4.73e-06 grad_norm=0.4972 step_time=0.6541s data_time=0.1589s it/s=1.230 eta_to_10000=1568.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0142 grad_action_out_proj=0.1225 grad_shared_expert=0.4643 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8070/10000 [1:43:07<27:25, 1.17it/s, loss=0.0021, lr=4.72e-06, step=8069] Training: 81%|████████ | 8070/10000 [1:43:07<27:25, 1.17it/s, loss=0.0221, lr=4.72e-06, step=8070] Training: 81%|████████ | 8071/10000 [1:43:08<27:09, 1.18it/s, loss=0.0221, lr=4.72e-06, step=8070] Training: 81%|████████ | 8071/10000 [1:43:08<27:09, 1.18it/s, loss=0.0167, lr=4.71e-06, step=8071] Training: 81%|████████ | 8072/10000 [1:43:09<29:07, 1.10it/s, loss=0.0167, lr=4.71e-06, step=8071] Training: 81%|████████ | 8072/10000 [1:43:09<29:07, 1.10it/s, loss=0.0340, lr=4.71e-06, step=8072] Training: 81%|████████ | 8073/10000 [1:43:10<26:59, 1.19it/s, loss=0.0340, lr=4.71e-06, step=8072] Training: 81%|████████ | 8073/10000 [1:43:10<26:59, 1.19it/s, loss=0.0088, lr=4.71e-06, step=8073] Training: 81%|████████ | 8074/10000 [1:43:10<23:47, 1.35it/s, loss=0.0088, lr=4.71e-06, step=8073] Training: 81%|████████ | 8074/10000 [1:43:10<23:47, 1.35it/s, loss=0.0076, lr=4.71e-06, step=8074] Training: 81%|████████ | 8075/10000 [1:43:11<21:43, 1.48it/s, loss=0.0076, lr=4.71e-06, step=8074] Training: 81%|████████ | 8075/10000 [1:43:11<21:43, 1.48it/s, loss=0.0110, lr=4.71e-06, step=8075] Training: 81%|████████ | 8076/10000 [1:43:11<22:54, 1.40it/s, loss=0.0110, lr=4.71e-06, step=8075] Training: 81%|████████ | 8076/10000 [1:43:11<22:54, 1.40it/s, loss=0.0078, lr=4.70e-06, step=8076] Training: 81%|████████ | 8077/10000 [1:43:13<27:49, 1.15it/s, loss=0.0078, lr=4.70e-06, step=8076] Training: 81%|████████ | 8077/10000 [1:43:13<27:49, 1.15it/s, loss=0.0064, lr=4.70e-06, step=8077] Training: 81%|████████ | 8078/10000 [1:43:14<28:36, 1.12it/s, loss=0.0064, lr=4.70e-06, step=8077] Training: 81%|████████ | 8078/10000 [1:43:14<28:36, 1.12it/s, loss=0.0030, lr=4.70e-06, step=8078] Training: 81%|████████ | 8079/10000 [1:43:14<27:56, 1.15it/s, loss=0.0030, lr=4.70e-06, step=8078] Training: 81%|████████ | 8079/10000 [1:43:14<27:56, 1.15it/s, loss=0.0088, lr=4.70e-06, step=8079]17:49:22.395 [I] step=8080 loss=0.0072 smoothed_loss=0.0098 lr=4.70e-06 grad_norm=0.4631 step_time=0.6634s data_time=0.1816s it/s=1.184 eta_to_10000=1622.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0193 grad_action_out_proj=0.1459 grad_shared_expert=0.4024 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8080/10000 [1:43:15<29:34, 1.08it/s, loss=0.0088, lr=4.70e-06, step=8079] Training: 81%|████████ | 8080/10000 [1:43:15<29:34, 1.08it/s, loss=0.0072, lr=4.69e-06, step=8080] Training: 81%|████████ | 8081/10000 [1:43:16<28:37, 1.12it/s, loss=0.0072, lr=4.69e-06, step=8080] Training: 81%|████████ | 8081/10000 [1:43:16<28:37, 1.12it/s, loss=0.0085, lr=4.69e-06, step=8081] Training: 81%|████████ | 8082/10000 [1:43:17<28:15, 1.13it/s, loss=0.0085, lr=4.69e-06, step=8081] Training: 81%|████████ | 8082/10000 [1:43:17<28:15, 1.13it/s, loss=0.0097, lr=4.69e-06, step=8082] Training: 81%|████████ | 8083/10000 [1:43:18<26:09, 1.22it/s, loss=0.0097, lr=4.69e-06, step=8082] Training: 81%|████████ | 8083/10000 [1:43:18<26:09, 1.22it/s, loss=0.0244, lr=4.69e-06, step=8083] Training: 81%|████████ | 8084/10000 [1:43:19<28:50, 1.11it/s, loss=0.0244, lr=4.69e-06, step=8083] Training: 81%|████████ | 8084/10000 [1:43:19<28:50, 1.11it/s, loss=0.0040, lr=4.69e-06, step=8084] Training: 81%|████████ | 8085/10000 [1:43:20<28:51, 1.11it/s, loss=0.0040, lr=4.69e-06, step=8084] Training: 81%|████████ | 8085/10000 [1:43:20<28:51, 1.11it/s, loss=0.0020, lr=4.68e-06, step=8085] Training: 81%|████████ | 8086/10000 [1:43:20<25:11, 1.27it/s, loss=0.0020, lr=4.68e-06, step=8085] Training: 81%|████████ | 8086/10000 [1:43:20<25:11, 1.27it/s, loss=0.0132, lr=4.68e-06, step=8086] Training: 81%|████████ | 8087/10000 [1:43:21<25:07, 1.27it/s, loss=0.0132, lr=4.68e-06, step=8086] Training: 81%|████████ | 8087/10000 [1:43:21<25:07, 1.27it/s, loss=0.0016, lr=4.68e-06, step=8087] Training: 81%|████████ | 8088/10000 [1:43:22<22:18, 1.43it/s, loss=0.0016, lr=4.68e-06, step=8087] Training: 81%|████████ | 8088/10000 [1:43:22<22:18, 1.43it/s, loss=0.0055, lr=4.68e-06, step=8088] Training: 81%|████████ | 8089/10000 [1:43:22<20:39, 1.54it/s, loss=0.0055, lr=4.68e-06, step=8088] Training: 81%|████████ | 8089/10000 [1:43:22<20:39, 1.54it/s, loss=0.0025, lr=4.67e-06, step=8089]17:49:29.853 [I] step=8090 loss=0.0145 smoothed_loss=0.0088 lr=4.68e-06 grad_norm=0.4512 step_time=0.5967s data_time=0.1492s it/s=1.341 eta_to_10000=1424.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0090 grad_action_out_proj=0.0759 grad_shared_expert=0.7688 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8090/10000 [1:43:23<21:48, 1.46it/s, loss=0.0025, lr=4.67e-06, step=8089] Training: 81%|████████ | 8090/10000 [1:43:23<21:48, 1.46it/s, loss=0.0145, lr=4.67e-06, step=8090] Training: 81%|████████ | 8091/10000 [1:43:24<25:10, 1.26it/s, loss=0.0145, lr=4.67e-06, step=8090] Training: 81%|████████ | 8091/10000 [1:43:24<25:10, 1.26it/s, loss=0.0119, lr=4.67e-06, step=8091] Training: 81%|████████ | 8092/10000 [1:43:24<22:24, 1.42it/s, loss=0.0119, lr=4.67e-06, step=8091] Training: 81%|████████ | 8092/10000 [1:43:24<22:24, 1.42it/s, loss=0.0033, lr=4.67e-06, step=8092] Training: 81%|████████ | 8093/10000 [1:43:25<21:31, 1.48it/s, loss=0.0033, lr=4.67e-06, step=8092] Training: 81%|████████ | 8093/10000 [1:43:25<21:31, 1.48it/s, loss=0.0031, lr=4.67e-06, step=8093] Training: 81%|████████ | 8094/10000 [1:43:26<23:40, 1.34it/s, loss=0.0031, lr=4.67e-06, step=8093] Training: 81%|████████ | 8094/10000 [1:43:26<23:40, 1.34it/s, loss=0.0034, lr=4.66e-06, step=8094] Training: 81%|████████ | 8095/10000 [1:43:27<25:52, 1.23it/s, loss=0.0034, lr=4.66e-06, step=8094] Training: 81%|████████ | 8095/10000 [1:43:27<25:52, 1.23it/s, loss=0.0043, lr=4.66e-06, step=8095] Training: 81%|████████ | 8096/10000 [1:43:28<24:59, 1.27it/s, loss=0.0043, lr=4.66e-06, step=8095] Training: 81%|████████ | 8096/10000 [1:43:28<24:59, 1.27it/s, loss=0.0051, lr=4.66e-06, step=8096] Training: 81%|████████ | 8097/10000 [1:43:28<24:52, 1.27it/s, loss=0.0051, lr=4.66e-06, step=8096] Training: 81%|████████ | 8097/10000 [1:43:28<24:52, 1.27it/s, loss=0.0054, lr=4.66e-06, step=8097] Training: 81%|████████ | 8098/10000 [1:43:30<27:30, 1.15it/s, loss=0.0054, lr=4.66e-06, step=8097] Training: 81%|████████ | 8098/10000 [1:43:30<27:30, 1.15it/s, loss=0.0104, lr=4.66e-06, step=8098] Training: 81%|████████ | 8099/10000 [1:43:30<27:45, 1.14it/s, loss=0.0104, lr=4.66e-06, step=8098] Training: 81%|████████ | 8099/10000 [1:43:30<27:45, 1.14it/s, loss=0.0024, lr=4.65e-06, step=8099]17:49:38.032 [I] step=8100 loss=0.0082 smoothed_loss=0.0068 lr=4.66e-06 grad_norm=0.3823 step_time=0.6563s data_time=0.1616s it/s=1.223 eta_to_10000=1553.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0255 grad_action_out_proj=0.1928 grad_shared_expert=0.4736 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8100/10000 [1:43:31<25:55, 1.22it/s, loss=0.0024, lr=4.65e-06, step=8099] Training: 81%|████████ | 8100/10000 [1:43:31<25:55, 1.22it/s, loss=0.0082, lr=4.65e-06, step=8100] Training: 81%|████████ | 8101/10000 [1:43:32<25:54, 1.22it/s, loss=0.0082, lr=4.65e-06, step=8100] Training: 81%|████████ | 8101/10000 [1:43:32<25:54, 1.22it/s, loss=0.0325, lr=4.65e-06, step=8101] Training: 81%|████████ | 8102/10000 [1:43:33<26:33, 1.19it/s, loss=0.0325, lr=4.65e-06, step=8101] Training: 81%|████████ | 8102/10000 [1:43:33<26:33, 1.19it/s, loss=0.0039, lr=4.65e-06, step=8102] Training: 81%|████████ | 8103/10000 [1:43:34<26:23, 1.20it/s, loss=0.0039, lr=4.65e-06, step=8102] Training: 81%|████████ | 8103/10000 [1:43:34<26:23, 1.20it/s, loss=0.0033, lr=4.64e-06, step=8103] Training: 81%|████████ | 8104/10000 [1:43:34<25:04, 1.26it/s, loss=0.0033, lr=4.64e-06, step=8103] Training: 81%|████████ | 8104/10000 [1:43:34<25:04, 1.26it/s, loss=0.0030, lr=4.64e-06, step=8104] Training: 81%|████████ | 8105/10000 [1:43:35<28:13, 1.12it/s, loss=0.0030, lr=4.64e-06, step=8104] Training: 81%|████████ | 8105/10000 [1:43:35<28:13, 1.12it/s, loss=0.0022, lr=4.64e-06, step=8105] Training: 81%|████████ | 8106/10000 [1:43:36<27:37, 1.14it/s, loss=0.0022, lr=4.64e-06, step=8105] Training: 81%|████████ | 8106/10000 [1:43:36<27:37, 1.14it/s, loss=0.0015, lr=4.64e-06, step=8106] Training: 81%|████████ | 8107/10000 [1:43:37<27:18, 1.16it/s, loss=0.0015, lr=4.64e-06, step=8106] Training: 81%|████████ | 8107/10000 [1:43:37<27:18, 1.16it/s, loss=0.0086, lr=4.64e-06, step=8107] Training: 81%|████████ | 8108/10000 [1:43:38<24:10, 1.30it/s, loss=0.0086, lr=4.64e-06, step=8107] Training: 81%|████████ | 8108/10000 [1:43:38<24:10, 1.30it/s, loss=0.0110, lr=4.63e-06, step=8108] Training: 81%|████████ | 8109/10000 [1:43:39<26:04, 1.21it/s, loss=0.0110, lr=4.63e-06, step=8108] Training: 81%|████████ | 8109/10000 [1:43:39<26:04, 1.21it/s, loss=0.0072, lr=4.63e-06, step=8109]17:49:46.427 [I] step=8110 loss=0.0069 smoothed_loss=0.0072 lr=4.64e-06 grad_norm=0.4429 step_time=0.6421s data_time=0.1973s it/s=1.191 eta_to_10000=1586.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0080 grad_action_out_proj=0.0674 grad_shared_expert=0.4659 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8110/10000 [1:43:39<26:21, 1.19it/s, loss=0.0072, lr=4.63e-06, step=8109] Training: 81%|████████ | 8110/10000 [1:43:39<26:21, 1.19it/s, loss=0.0069, lr=4.63e-06, step=8110] Training: 81%|████████ | 8111/10000 [1:43:40<27:56, 1.13it/s, loss=0.0069, lr=4.63e-06, step=8110] Training: 81%|████████ | 8111/10000 [1:43:40<27:56, 1.13it/s, loss=0.0173, lr=4.63e-06, step=8111] Training: 81%|████████ | 8112/10000 [1:43:41<27:50, 1.13it/s, loss=0.0173, lr=4.63e-06, step=8111] Training: 81%|████████ | 8112/10000 [1:43:41<27:50, 1.13it/s, loss=0.0064, lr=4.62e-06, step=8112] Training: 81%|████████ | 8113/10000 [1:43:42<28:13, 1.11it/s, loss=0.0064, lr=4.62e-06, step=8112] Training: 81%|████████ | 8113/10000 [1:43:42<28:13, 1.11it/s, loss=0.0042, lr=4.62e-06, step=8113] Training: 81%|████████ | 8114/10000 [1:43:43<27:06, 1.16it/s, loss=0.0042, lr=4.62e-06, step=8113] Training: 81%|████████ | 8114/10000 [1:43:43<27:06, 1.16it/s, loss=0.0488, lr=4.62e-06, step=8114] Training: 81%|████████ | 8115/10000 [1:43:44<26:50, 1.17it/s, loss=0.0488, lr=4.62e-06, step=8114] Training: 81%|████████ | 8115/10000 [1:43:44<26:50, 1.17it/s, loss=0.0048, lr=4.62e-06, step=8115] Training: 81%|████████ | 8116/10000 [1:43:45<27:22, 1.15it/s, loss=0.0048, lr=4.62e-06, step=8115] Training: 81%|████████ | 8116/10000 [1:43:45<27:22, 1.15it/s, loss=0.0010, lr=4.62e-06, step=8116] Training: 81%|████████ | 8117/10000 [1:43:46<27:02, 1.16it/s, loss=0.0010, lr=4.62e-06, step=8116] Training: 81%|████████ | 8117/10000 [1:43:46<27:02, 1.16it/s, loss=0.0117, lr=4.61e-06, step=8117] Training: 81%|████████ | 8118/10000 [1:43:46<26:19, 1.19it/s, loss=0.0117, lr=4.61e-06, step=8117] Training: 81%|████████ | 8118/10000 [1:43:46<26:19, 1.19it/s, loss=0.0152, lr=4.61e-06, step=8118] Training: 81%|████████ | 8119/10000 [1:43:47<26:03, 1.20it/s, loss=0.0152, lr=4.61e-06, step=8118] Training: 81%|████████ | 8119/10000 [1:43:47<26:03, 1.20it/s, loss=0.0091, lr=4.61e-06, step=8119]17:49:55.321 [I] step=8120 loss=0.0031 smoothed_loss=0.0098 lr=4.62e-06 grad_norm=0.4118 step_time=0.7092s data_time=0.1801s it/s=1.125 eta_to_10000=1671.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0055 grad_action_out_proj=0.0588 grad_shared_expert=0.2575 (10775:train_pytorch.py:850) + Training: 81%|████████ | 8120/10000 [1:43:48<28:43, 1.09it/s, loss=0.0091, lr=4.61e-06, step=8119] Training: 81%|████████ | 8120/10000 [1:43:48<28:43, 1.09it/s, loss=0.0031, lr=4.61e-06, step=8120] Training: 81%|████████ | 8121/10000 [1:43:49<28:30, 1.10it/s, loss=0.0031, lr=4.61e-06, step=8120] Training: 81%|████████ | 8121/10000 [1:43:49<28:30, 1.10it/s, loss=0.0026, lr=4.61e-06, step=8121] Training: 81%|████████ | 8122/10000 [1:43:50<28:02, 1.12it/s, loss=0.0026, lr=4.61e-06, step=8121] Training: 81%|████████ | 8122/10000 [1:43:50<28:02, 1.12it/s, loss=0.0046, lr=4.60e-06, step=8122] Training: 81%|████████ | 8123/10000 [1:43:51<29:03, 1.08it/s, loss=0.0046, lr=4.60e-06, step=8122] Training: 81%|████████ | 8123/10000 [1:43:51<29:03, 1.08it/s, loss=0.0027, lr=4.60e-06, step=8123] Training: 81%|████████ | 8124/10000 [1:43:52<28:50, 1.08it/s, loss=0.0027, lr=4.60e-06, step=8123] Training: 81%|████████ | 8124/10000 [1:43:52<28:50, 1.08it/s, loss=0.0153, lr=4.60e-06, step=8124] Training: 81%|████████▏ | 8125/10000 [1:43:53<28:32, 1.09it/s, loss=0.0153, lr=4.60e-06, step=8124] Training: 81%|████████▏ | 8125/10000 [1:43:53<28:32, 1.09it/s, loss=0.0072, lr=4.60e-06, step=8125] Training: 81%|████████▏ | 8126/10000 [1:43:54<27:30, 1.14it/s, loss=0.0072, lr=4.60e-06, step=8125] Training: 81%|████████▏ | 8126/10000 [1:43:54<27:30, 1.14it/s, loss=0.0114, lr=4.59e-06, step=8126] Training: 81%|████████▏ | 8127/10000 [1:43:55<29:50, 1.05it/s, loss=0.0114, lr=4.59e-06, step=8126] Training: 81%|████████▏ | 8127/10000 [1:43:55<29:50, 1.05it/s, loss=0.0428, lr=4.59e-06, step=8127] Training: 81%|████████▏ | 8128/10000 [1:43:56<28:37, 1.09it/s, loss=0.0428, lr=4.59e-06, step=8127] Training: 81%|████████▏ | 8128/10000 [1:43:56<28:37, 1.09it/s, loss=0.0124, lr=4.59e-06, step=8128] Training: 81%|████████▏ | 8129/10000 [1:43:57<28:41, 1.09it/s, loss=0.0124, lr=4.59e-06, step=8128] Training: 81%|████████▏ | 8129/10000 [1:43:57<28:41, 1.09it/s, loss=0.0504, lr=4.59e-06, step=8129]17:50:04.640 [I] step=8130 loss=0.0085 smoothed_loss=0.0153 lr=4.60e-06 grad_norm=0.4983 step_time=0.7012s data_time=0.2308s it/s=1.073 eta_to_10000=1742.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0470 grad_action_out_proj=0.2262 grad_shared_expert=0.7000 (10775:train_pytorch.py:850) + Training: 81%|████████▏ | 8130/10000 [1:43:58<30:02, 1.04it/s, loss=0.0504, lr=4.59e-06, step=8129] Training: 81%|████████▏ | 8130/10000 [1:43:58<30:02, 1.04it/s, loss=0.0085, lr=4.59e-06, step=8130] Training: 81%|████████▏ | 8131/10000 [1:43:59<29:59, 1.04it/s, loss=0.0085, lr=4.59e-06, step=8130] Training: 81%|████████▏ | 8131/10000 [1:43:59<29:59, 1.04it/s, loss=0.0686, lr=4.58e-06, step=8131] Training: 81%|████████▏ | 8132/10000 [1:44:00<29:13, 1.07it/s, loss=0.0686, lr=4.58e-06, step=8131] Training: 81%|████████▏ | 8132/10000 [1:44:00<29:13, 1.07it/s, loss=0.0084, lr=4.58e-06, step=8132] Training: 81%|████████▏ | 8133/10000 [1:44:01<31:19, 1.01s/it, loss=0.0084, lr=4.58e-06, step=8132] Training: 81%|████████▏ | 8133/10000 [1:44:01<31:19, 1.01s/it, loss=0.0052, lr=4.58e-06, step=8133] Training: 81%|████████▏ | 8134/10000 [1:44:02<32:40, 1.05s/it, loss=0.0052, lr=4.58e-06, step=8133] Training: 81%|████████▏ | 8134/10000 [1:44:02<32:40, 1.05s/it, loss=0.0093, lr=4.58e-06, step=8134] Training: 81%|████████▏ | 8135/10000 [1:44:03<30:48, 1.01it/s, loss=0.0093, lr=4.58e-06, step=8134] Training: 81%|████████▏ | 8135/10000 [1:44:03<30:48, 1.01it/s, loss=0.0060, lr=4.57e-06, step=8135] Training: 81%|████████▏ | 8136/10000 [1:44:03<28:17, 1.10it/s, loss=0.0060, lr=4.57e-06, step=8135] Training: 81%|████████▏ | 8136/10000 [1:44:03<28:17, 1.10it/s, loss=0.0023, lr=4.57e-06, step=8136] Training: 81%|████████▏ | 8137/10000 [1:44:04<29:30, 1.05it/s, loss=0.0023, lr=4.57e-06, step=8136] Training: 81%|████████▏ | 8137/10000 [1:44:04<29:30, 1.05it/s, loss=0.0129, lr=4.57e-06, step=8137] Training: 81%|████████▏ | 8138/10000 [1:44:05<29:09, 1.06it/s, loss=0.0129, lr=4.57e-06, step=8137] Training: 81%|████████▏ | 8138/10000 [1:44:05<29:09, 1.06it/s, loss=0.0252, lr=4.57e-06, step=8138] Training: 81%|████████▏ | 8139/10000 [1:44:06<28:37, 1.08it/s, loss=0.0252, lr=4.57e-06, step=8138] Training: 81%|████████▏ | 8139/10000 [1:44:06<28:37, 1.08it/s, loss=0.0088, lr=4.57e-06, step=8139]17:50:14.084 [I] step=8140 loss=0.0016 smoothed_loss=0.0135 lr=4.57e-06 grad_norm=0.4627 step_time=0.7157s data_time=0.2286s it/s=1.059 eta_to_10000=1756.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0033 grad_action_out_proj=0.0450 grad_shared_expert=0.1971 (10775:train_pytorch.py:850) + Training: 81%|████████▏ | 8140/10000 [1:44:07<28:05, 1.10it/s, loss=0.0088, lr=4.57e-06, step=8139] Training: 81%|████████▏ | 8140/10000 [1:44:07<28:05, 1.10it/s, loss=0.0016, lr=4.56e-06, step=8140] Training: 81%|████████▏ | 8141/10000 [1:44:08<26:38, 1.16it/s, loss=0.0016, lr=4.56e-06, step=8140] Training: 81%|████████▏ | 8141/10000 [1:44:08<26:38, 1.16it/s, loss=0.0011, lr=4.56e-06, step=8141] Training: 81%|████████▏ | 8142/10000 [1:44:09<26:49, 1.15it/s, loss=0.0011, lr=4.56e-06, step=8141] Training: 81%|████████▏ | 8142/10000 [1:44:09<26:49, 1.15it/s, loss=0.0018, lr=4.56e-06, step=8142] Training: 81%|████████▏ | 8143/10000 [1:44:10<26:40, 1.16it/s, loss=0.0018, lr=4.56e-06, step=8142] Training: 81%|████████▏ | 8143/10000 [1:44:10<26:40, 1.16it/s, loss=0.0127, lr=4.56e-06, step=8143] Training: 81%|████████▏ | 8144/10000 [1:44:11<27:09, 1.14it/s, loss=0.0127, lr=4.56e-06, step=8143] Training: 81%|████████▏ | 8144/10000 [1:44:11<27:09, 1.14it/s, loss=0.0023, lr=4.56e-06, step=8144] Training: 81%|████████▏ | 8145/10000 [1:44:12<28:09, 1.10it/s, loss=0.0023, lr=4.56e-06, step=8144] Training: 81%|████████▏ | 8145/10000 [1:44:12<28:09, 1.10it/s, loss=0.0114, lr=4.55e-06, step=8145] Training: 81%|████████▏ | 8146/10000 [1:44:12<28:29, 1.08it/s, loss=0.0114, lr=4.55e-06, step=8145] Training: 81%|████████▏ | 8146/10000 [1:44:12<28:29, 1.08it/s, loss=0.0091, lr=4.55e-06, step=8146] Training: 81%|████████▏ | 8147/10000 [1:44:13<29:04, 1.06it/s, loss=0.0091, lr=4.55e-06, step=8146] Training: 81%|████████▏ | 8147/10000 [1:44:13<29:04, 1.06it/s, loss=0.0043, lr=4.55e-06, step=8147] Training: 81%|████████▏ | 8148/10000 [1:44:15<31:39, 1.03s/it, loss=0.0043, lr=4.55e-06, step=8147] Training: 81%|████████▏ | 8148/10000 [1:44:15<31:39, 1.03s/it, loss=0.0137, lr=4.55e-06, step=8148] Training: 81%|████████▏ | 8149/10000 [1:44:16<31:29, 1.02s/it, loss=0.0137, lr=4.55e-06, step=8148] Training: 81%|████████▏ | 8149/10000 [1:44:16<31:29, 1.02s/it, loss=0.0020, lr=4.54e-06, step=8149]17:50:23.650 [I] step=8150 loss=0.0112 smoothed_loss=0.0096 lr=4.55e-06 grad_norm=0.3240 step_time=0.7278s data_time=0.2288s it/s=1.045 eta_to_10000=1769.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0057 grad_action_out_proj=0.0612 grad_shared_expert=0.2329 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8150/10000 [1:44:17<31:23, 1.02s/it, loss=0.0020, lr=4.54e-06, step=8149] Training: 82%|████████▏ | 8150/10000 [1:44:17<31:23, 1.02s/it, loss=0.0112, lr=4.54e-06, step=8150] Training: 82%|████████▏ | 8151/10000 [1:44:18<30:00, 1.03it/s, loss=0.0112, lr=4.54e-06, step=8150] Training: 82%|████████▏ | 8151/10000 [1:44:18<30:00, 1.03it/s, loss=0.0017, lr=4.54e-06, step=8151] Training: 82%|████████▏ | 8152/10000 [1:44:19<30:28, 1.01it/s, loss=0.0017, lr=4.54e-06, step=8151] Training: 82%|████████▏ | 8152/10000 [1:44:19<30:28, 1.01it/s, loss=0.0027, lr=4.54e-06, step=8152] Training: 82%|████████▏ | 8153/10000 [1:44:20<29:40, 1.04it/s, loss=0.0027, lr=4.54e-06, step=8152] Training: 82%|████████▏ | 8153/10000 [1:44:20<29:40, 1.04it/s, loss=0.0059, lr=4.54e-06, step=8153] Training: 82%|████████▏ | 8154/10000 [1:44:20<29:10, 1.05it/s, loss=0.0059, lr=4.54e-06, step=8153] Training: 82%|████████▏ | 8154/10000 [1:44:20<29:10, 1.05it/s, loss=0.0015, lr=4.53e-06, step=8154] Training: 82%|████████▏ | 8155/10000 [1:44:21<29:02, 1.06it/s, loss=0.0015, lr=4.53e-06, step=8154] Training: 82%|████████▏ | 8155/10000 [1:44:21<29:02, 1.06it/s, loss=0.0013, lr=4.53e-06, step=8155] Training: 82%|████████▏ | 8156/10000 [1:44:22<30:40, 1.00it/s, loss=0.0013, lr=4.53e-06, step=8155] Training: 82%|████████▏ | 8156/10000 [1:44:22<30:40, 1.00it/s, loss=0.0019, lr=4.53e-06, step=8156] Training: 82%|████████▏ | 8157/10000 [1:44:23<29:07, 1.05it/s, loss=0.0019, lr=4.53e-06, step=8156] Training: 82%|████████▏ | 8157/10000 [1:44:23<29:07, 1.05it/s, loss=0.0043, lr=4.53e-06, step=8157] Training: 82%|████████▏ | 8158/10000 [1:44:24<29:08, 1.05it/s, loss=0.0043, lr=4.53e-06, step=8157] Training: 82%|████████▏ | 8158/10000 [1:44:24<29:08, 1.05it/s, loss=0.0056, lr=4.53e-06, step=8158] Training: 82%|████████▏ | 8159/10000 [1:44:25<27:51, 1.10it/s, loss=0.0056, lr=4.53e-06, step=8158] Training: 82%|████████▏ | 8159/10000 [1:44:25<27:51, 1.10it/s, loss=0.0153, lr=4.52e-06, step=8159]17:50:33.032 [I] step=8160 loss=0.0059 smoothed_loss=0.0068 lr=4.53e-06 grad_norm=0.3886 step_time=0.7095s data_time=0.2287s it/s=1.066 eta_to_10000=1726.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0211 grad_action_out_proj=0.1866 grad_shared_expert=0.4990 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8160/10000 [1:44:26<28:50, 1.06it/s, loss=0.0153, lr=4.52e-06, step=8159] Training: 82%|████████▏ | 8160/10000 [1:44:26<28:50, 1.06it/s, loss=0.0059, lr=4.52e-06, step=8160] Training: 82%|████████▏ | 8161/10000 [1:44:27<27:22, 1.12it/s, loss=0.0059, lr=4.52e-06, step=8160] Training: 82%|████████▏ | 8161/10000 [1:44:27<27:22, 1.12it/s, loss=0.0061, lr=4.52e-06, step=8161] Training: 82%|████████▏ | 8162/10000 [1:44:28<27:15, 1.12it/s, loss=0.0061, lr=4.52e-06, step=8161] Training: 82%|████████▏ | 8162/10000 [1:44:28<27:15, 1.12it/s, loss=0.0033, lr=4.52e-06, step=8162] Training: 82%|████████▏ | 8163/10000 [1:44:29<28:41, 1.07it/s, loss=0.0033, lr=4.52e-06, step=8162] Training: 82%|████████▏ | 8163/10000 [1:44:29<28:41, 1.07it/s, loss=0.0530, lr=4.51e-06, step=8163] Training: 82%|████████▏ | 8164/10000 [1:44:30<28:03, 1.09it/s, loss=0.0530, lr=4.51e-06, step=8163] Training: 82%|████████▏ | 8164/10000 [1:44:30<28:03, 1.09it/s, loss=0.0053, lr=4.51e-06, step=8164] Training: 82%|████████▏ | 8165/10000 [1:44:31<27:13, 1.12it/s, loss=0.0053, lr=4.51e-06, step=8164] Training: 82%|████████▏ | 8165/10000 [1:44:31<27:13, 1.12it/s, loss=0.0048, lr=4.51e-06, step=8165] Training: 82%|████████▏ | 8166/10000 [1:44:31<26:27, 1.16it/s, loss=0.0048, lr=4.51e-06, step=8165] Training: 82%|████████▏ | 8166/10000 [1:44:31<26:27, 1.16it/s, loss=0.0023, lr=4.51e-06, step=8166] Training: 82%|████████▏ | 8167/10000 [1:44:32<28:28, 1.07it/s, loss=0.0023, lr=4.51e-06, step=8166] Training: 82%|████████▏ | 8167/10000 [1:44:32<28:28, 1.07it/s, loss=0.0062, lr=4.51e-06, step=8167] Training: 82%|████████▏ | 8168/10000 [1:44:33<24:25, 1.25it/s, loss=0.0062, lr=4.51e-06, step=8167] Training: 82%|████████▏ | 8168/10000 [1:44:33<24:25, 1.25it/s, loss=0.0031, lr=4.50e-06, step=8168] Training: 82%|████████▏ | 8169/10000 [1:44:34<23:59, 1.27it/s, loss=0.0031, lr=4.50e-06, step=8168] Training: 82%|████████▏ | 8169/10000 [1:44:34<23:59, 1.27it/s, loss=0.0030, lr=4.50e-06, step=8169]17:50:41.441 [I] step=8170 loss=0.0058 smoothed_loss=0.0076 lr=4.51e-06 grad_norm=0.4063 step_time=0.6732s data_time=0.1677s it/s=1.189 eta_to_10000=1538.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0037 grad_action_out_proj=0.0468 grad_shared_expert=0.1919 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8170/10000 [1:44:35<24:38, 1.24it/s, loss=0.0030, lr=4.50e-06, step=8169] Training: 82%|████████▏ | 8170/10000 [1:44:35<24:38, 1.24it/s, loss=0.0058, lr=4.50e-06, step=8170] Training: 82%|████████▏ | 8171/10000 [1:44:35<25:03, 1.22it/s, loss=0.0058, lr=4.50e-06, step=8170] Training: 82%|████████▏ | 8171/10000 [1:44:35<25:03, 1.22it/s, loss=0.0061, lr=4.50e-06, step=8171] Training: 82%|████████▏ | 8172/10000 [1:44:36<25:26, 1.20it/s, loss=0.0061, lr=4.50e-06, step=8171] Training: 82%|████████▏ | 8172/10000 [1:44:36<25:26, 1.20it/s, loss=0.0100, lr=4.50e-06, step=8172] Training: 82%|████████▏ | 8173/10000 [1:44:37<25:35, 1.19it/s, loss=0.0100, lr=4.50e-06, step=8172] Training: 82%|████████▏ | 8173/10000 [1:44:37<25:35, 1.19it/s, loss=0.0170, lr=4.49e-06, step=8173] Training: 82%|████████▏ | 8174/10000 [1:44:38<22:54, 1.33it/s, loss=0.0170, lr=4.49e-06, step=8173] Training: 82%|████████▏ | 8174/10000 [1:44:38<22:54, 1.33it/s, loss=0.0115, lr=4.49e-06, step=8174] Training: 82%|████████▏ | 8175/10000 [1:44:38<23:52, 1.27it/s, loss=0.0115, lr=4.49e-06, step=8174] Training: 82%|████████▏ | 8175/10000 [1:44:38<23:52, 1.27it/s, loss=0.0216, lr=4.49e-06, step=8175] Training: 82%|████████▏ | 8176/10000 [1:44:39<21:31, 1.41it/s, loss=0.0216, lr=4.49e-06, step=8175] Training: 82%|████████▏ | 8176/10000 [1:44:39<21:31, 1.41it/s, loss=0.0074, lr=4.49e-06, step=8176] Training: 82%|████████▏ | 8177/10000 [1:44:40<21:52, 1.39it/s, loss=0.0074, lr=4.49e-06, step=8176] Training: 82%|████████▏ | 8177/10000 [1:44:40<21:52, 1.39it/s, loss=0.0222, lr=4.49e-06, step=8177] Training: 82%|████████▏ | 8178/10000 [1:44:41<22:29, 1.35it/s, loss=0.0222, lr=4.49e-06, step=8177] Training: 82%|████████▏ | 8178/10000 [1:44:41<22:29, 1.35it/s, loss=0.0215, lr=4.48e-06, step=8178] Training: 82%|████████▏ | 8179/10000 [1:44:41<23:37, 1.28it/s, loss=0.0215, lr=4.48e-06, step=8178] Training: 82%|████████▏ | 8179/10000 [1:44:41<23:37, 1.28it/s, loss=0.0009, lr=4.48e-06, step=8179]17:50:48.916 [I] step=8180 loss=0.0235 smoothed_loss=0.0123 lr=4.49e-06 grad_norm=0.5527 step_time=0.5990s data_time=0.1485s it/s=1.338 eta_to_10000=1360.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0256 grad_action_out_proj=0.1369 grad_shared_expert=0.7313 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8180/10000 [1:44:42<21:37, 1.40it/s, loss=0.0009, lr=4.48e-06, step=8179] Training: 82%|████████▏ | 8180/10000 [1:44:42<21:37, 1.40it/s, loss=0.0235, lr=4.48e-06, step=8180] Training: 82%|████████▏ | 8181/10000 [1:44:43<21:19, 1.42it/s, loss=0.0235, lr=4.48e-06, step=8180] Training: 82%|████████▏ | 8181/10000 [1:44:43<21:19, 1.42it/s, loss=0.0022, lr=4.48e-06, step=8181] Training: 82%|████████▏ | 8182/10000 [1:44:44<23:21, 1.30it/s, loss=0.0022, lr=4.48e-06, step=8181] Training: 82%|████████▏ | 8182/10000 [1:44:44<23:21, 1.30it/s, loss=0.0112, lr=4.47e-06, step=8182] Training: 82%|████████▏ | 8183/10000 [1:44:44<24:32, 1.23it/s, loss=0.0112, lr=4.47e-06, step=8182] Training: 82%|████████▏ | 8183/10000 [1:44:44<24:32, 1.23it/s, loss=0.0031, lr=4.47e-06, step=8183] Training: 82%|████████▏ | 8184/10000 [1:44:45<25:12, 1.20it/s, loss=0.0031, lr=4.47e-06, step=8183] Training: 82%|████████▏ | 8184/10000 [1:44:45<25:12, 1.20it/s, loss=0.0034, lr=4.47e-06, step=8184] Training: 82%|████████▏ | 8185/10000 [1:44:46<25:20, 1.19it/s, loss=0.0034, lr=4.47e-06, step=8184] Training: 82%|████████▏ | 8185/10000 [1:44:46<25:20, 1.19it/s, loss=0.0131, lr=4.47e-06, step=8185] Training: 82%|████████▏ | 8186/10000 [1:44:47<25:13, 1.20it/s, loss=0.0131, lr=4.47e-06, step=8185] Training: 82%|████████▏ | 8186/10000 [1:44:47<25:13, 1.20it/s, loss=0.0029, lr=4.47e-06, step=8186] Training: 82%|████████▏ | 8187/10000 [1:44:48<22:13, 1.36it/s, loss=0.0029, lr=4.47e-06, step=8186] Training: 82%|████████▏ | 8187/10000 [1:44:48<22:13, 1.36it/s, loss=0.0066, lr=4.46e-06, step=8187] Training: 82%|████████▏ | 8188/10000 [1:44:48<22:18, 1.35it/s, loss=0.0066, lr=4.46e-06, step=8187] Training: 82%|████████▏ | 8188/10000 [1:44:48<22:18, 1.35it/s, loss=0.0054, lr=4.46e-06, step=8188] Training: 82%|████████▏ | 8189/10000 [1:44:49<22:34, 1.34it/s, loss=0.0054, lr=4.46e-06, step=8188] Training: 82%|████████▏ | 8189/10000 [1:44:49<22:34, 1.34it/s, loss=0.0050, lr=4.46e-06, step=8189]17:50:56.989 [I] step=8190 loss=0.0036 smoothed_loss=0.0079 lr=4.47e-06 grad_norm=0.4633 step_time=0.6362s data_time=0.1711s it/s=1.239 eta_to_10000=1461.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0051 grad_action_out_proj=0.0728 grad_shared_expert=0.2923 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8190/10000 [1:44:50<24:39, 1.22it/s, loss=0.0050, lr=4.46e-06, step=8189] Training: 82%|████████▏ | 8190/10000 [1:44:50<24:39, 1.22it/s, loss=0.0036, lr=4.46e-06, step=8190] Training: 82%|████████▏ | 8191/10000 [1:44:51<24:08, 1.25it/s, loss=0.0036, lr=4.46e-06, step=8190] Training: 82%|████████▏ | 8191/10000 [1:44:51<24:08, 1.25it/s, loss=0.0075, lr=4.46e-06, step=8191] Training: 82%|████████▏ | 8192/10000 [1:44:52<25:46, 1.17it/s, loss=0.0075, lr=4.46e-06, step=8191] Training: 82%|████████▏ | 8192/10000 [1:44:52<25:46, 1.17it/s, loss=0.0089, lr=4.45e-06, step=8192] Training: 82%|████████▏ | 8193/10000 [1:44:53<26:55, 1.12it/s, loss=0.0089, lr=4.45e-06, step=8192] Training: 82%|████████▏ | 8193/10000 [1:44:53<26:55, 1.12it/s, loss=0.0258, lr=4.45e-06, step=8193] Training: 82%|████████▏ | 8194/10000 [1:44:54<26:15, 1.15it/s, loss=0.0258, lr=4.45e-06, step=8193] Training: 82%|████████▏ | 8194/10000 [1:44:54<26:15, 1.15it/s, loss=0.0844, lr=4.45e-06, step=8194] Training: 82%|████████▏ | 8195/10000 [1:44:54<25:42, 1.17it/s, loss=0.0844, lr=4.45e-06, step=8194] Training: 82%|████████▏ | 8195/10000 [1:44:54<25:42, 1.17it/s, loss=0.0066, lr=4.45e-06, step=8195] Training: 82%|████████▏ | 8196/10000 [1:44:55<26:32, 1.13it/s, loss=0.0066, lr=4.45e-06, step=8195] Training: 82%|████████▏ | 8196/10000 [1:44:55<26:32, 1.13it/s, loss=0.0070, lr=4.45e-06, step=8196] Training: 82%|████████▏ | 8197/10000 [1:44:56<26:08, 1.15it/s, loss=0.0070, lr=4.45e-06, step=8196] Training: 82%|████████▏ | 8197/10000 [1:44:56<26:08, 1.15it/s, loss=0.0118, lr=4.44e-06, step=8197] Training: 82%|████████▏ | 8198/10000 [1:44:57<27:05, 1.11it/s, loss=0.0118, lr=4.44e-06, step=8197] Training: 82%|████████▏ | 8198/10000 [1:44:57<27:05, 1.11it/s, loss=0.0173, lr=4.44e-06, step=8198] Training: 82%|████████▏ | 8199/10000 [1:44:58<28:14, 1.06it/s, loss=0.0173, lr=4.44e-06, step=8198] Training: 82%|████████▏ | 8199/10000 [1:44:58<28:14, 1.06it/s, loss=0.0005, lr=4.44e-06, step=8199]17:51:05.998 [I] step=8200 loss=0.0108 smoothed_loss=0.0134 lr=4.45e-06 grad_norm=0.4201 step_time=0.7156s data_time=0.1853s it/s=1.110 eta_to_10000=1621.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0070 grad_action_out_proj=0.0781 grad_shared_expert=0.3648 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8200/10000 [1:44:59<27:24, 1.09it/s, loss=0.0005, lr=4.44e-06, step=8199] Training: 82%|████████▏ | 8200/10000 [1:44:59<27:24, 1.09it/s, loss=0.0108, lr=4.44e-06, step=8200] Training: 82%|████████▏ | 8201/10000 [1:45:00<25:02, 1.20it/s, loss=0.0108, lr=4.44e-06, step=8200] Training: 82%|████████▏ | 8201/10000 [1:45:00<25:02, 1.20it/s, loss=0.0367, lr=4.43e-06, step=8201] Training: 82%|████████▏ | 8202/10000 [1:45:01<25:24, 1.18it/s, loss=0.0367, lr=4.43e-06, step=8201] Training: 82%|████████▏ | 8202/10000 [1:45:01<25:24, 1.18it/s, loss=0.0007, lr=4.43e-06, step=8202] Training: 82%|████████▏ | 8203/10000 [1:45:02<27:30, 1.09it/s, loss=0.0007, lr=4.43e-06, step=8202] Training: 82%|████████▏ | 8203/10000 [1:45:02<27:30, 1.09it/s, loss=0.0086, lr=4.43e-06, step=8203] Training: 82%|████████▏ | 8204/10000 [1:45:02<23:57, 1.25it/s, loss=0.0086, lr=4.43e-06, step=8203] Training: 82%|████████▏ | 8204/10000 [1:45:02<23:57, 1.25it/s, loss=0.0054, lr=4.43e-06, step=8204] Training: 82%|████████▏ | 8205/10000 [1:45:03<24:44, 1.21it/s, loss=0.0054, lr=4.43e-06, step=8204] Training: 82%|████████▏ | 8205/10000 [1:45:03<24:44, 1.21it/s, loss=0.0103, lr=4.43e-06, step=8205] Training: 82%|████████▏ | 8206/10000 [1:45:04<28:26, 1.05it/s, loss=0.0103, lr=4.43e-06, step=8205] Training: 82%|████████▏ | 8206/10000 [1:45:04<28:26, 1.05it/s, loss=0.0010, lr=4.42e-06, step=8206] Training: 82%|████████▏ | 8207/10000 [1:45:05<28:21, 1.05it/s, loss=0.0010, lr=4.42e-06, step=8206] Training: 82%|████████▏ | 8207/10000 [1:45:05<28:21, 1.05it/s, loss=0.0031, lr=4.42e-06, step=8207] Training: 82%|████████▏ | 8208/10000 [1:45:06<28:34, 1.05it/s, loss=0.0031, lr=4.42e-06, step=8207] Training: 82%|████████▏ | 8208/10000 [1:45:06<28:34, 1.05it/s, loss=0.0055, lr=4.42e-06, step=8208] Training: 82%|████████▏ | 8209/10000 [1:45:07<27:54, 1.07it/s, loss=0.0055, lr=4.42e-06, step=8208] Training: 82%|████████▏ | 8209/10000 [1:45:07<27:54, 1.07it/s, loss=0.0081, lr=4.42e-06, step=8209]17:51:14.667 [I] step=8210 loss=0.0038 smoothed_loss=0.0093 lr=4.43e-06 grad_norm=0.3941 step_time=0.6610s data_time=0.2059s it/s=1.154 eta_to_10000=1551.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0063 grad_action_out_proj=0.0798 grad_shared_expert=0.2780 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8210/10000 [1:45:08<24:52, 1.20it/s, loss=0.0081, lr=4.42e-06, step=8209] Training: 82%|████████▏ | 8210/10000 [1:45:08<24:52, 1.20it/s, loss=0.0038, lr=4.42e-06, step=8210] Training: 82%|████████▏ | 8211/10000 [1:45:09<25:25, 1.17it/s, loss=0.0038, lr=4.42e-06, step=8210] Training: 82%|████████▏ | 8211/10000 [1:45:09<25:25, 1.17it/s, loss=0.0009, lr=4.41e-06, step=8211] Training: 82%|████████▏ | 8212/10000 [1:45:09<24:32, 1.21it/s, loss=0.0009, lr=4.41e-06, step=8211] Training: 82%|████████▏ | 8212/10000 [1:45:09<24:32, 1.21it/s, loss=0.0119, lr=4.41e-06, step=8212] Training: 82%|████████▏ | 8213/10000 [1:45:11<27:30, 1.08it/s, loss=0.0119, lr=4.41e-06, step=8212] Training: 82%|████████▏ | 8213/10000 [1:45:11<27:30, 1.08it/s, loss=0.0097, lr=4.41e-06, step=8213] Training: 82%|████████▏ | 8214/10000 [1:45:11<27:17, 1.09it/s, loss=0.0097, lr=4.41e-06, step=8213] Training: 82%|████████▏ | 8214/10000 [1:45:11<27:17, 1.09it/s, loss=0.0014, lr=4.41e-06, step=8214] Training: 82%|████████▏ | 8215/10000 [1:45:12<26:56, 1.10it/s, loss=0.0014, lr=4.41e-06, step=8214] Training: 82%|████████▏ | 8215/10000 [1:45:12<26:56, 1.10it/s, loss=0.0096, lr=4.41e-06, step=8215] Training: 82%|████████▏ | 8216/10000 [1:45:13<25:10, 1.18it/s, loss=0.0096, lr=4.41e-06, step=8215] Training: 82%|████████▏ | 8216/10000 [1:45:13<25:10, 1.18it/s, loss=0.0036, lr=4.40e-06, step=8216] Training: 82%|████████▏ | 8217/10000 [1:45:14<22:14, 1.34it/s, loss=0.0036, lr=4.40e-06, step=8216] Training: 82%|████████▏ | 8217/10000 [1:45:14<22:14, 1.34it/s, loss=0.0083, lr=4.40e-06, step=8217] Training: 82%|████████▏ | 8218/10000 [1:45:14<22:31, 1.32it/s, loss=0.0083, lr=4.40e-06, step=8217] Training: 82%|████████▏ | 8218/10000 [1:45:14<22:31, 1.32it/s, loss=0.0017, lr=4.40e-06, step=8218] Training: 82%|████████▏ | 8219/10000 [1:45:15<20:17, 1.46it/s, loss=0.0017, lr=4.40e-06, step=8218] Training: 82%|████████▏ | 8219/10000 [1:45:15<20:17, 1.46it/s, loss=0.0130, lr=4.40e-06, step=8219]17:51:22.863 [I] step=8220 loss=0.0155 smoothed_loss=0.0086 lr=4.40e-06 grad_norm=0.4138 step_time=0.6614s data_time=0.1581s it/s=1.220 eta_to_10000=1458.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.1700 grad_shared_expert=0.7446 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8220/10000 [1:45:16<23:55, 1.24it/s, loss=0.0130, lr=4.40e-06, step=8219] Training: 82%|████████▏ | 8220/10000 [1:45:16<23:55, 1.24it/s, loss=0.0155, lr=4.40e-06, step=8220] Training: 82%|████████▏ | 8221/10000 [1:45:17<22:38, 1.31it/s, loss=0.0155, lr=4.40e-06, step=8220] Training: 82%|████████▏ | 8221/10000 [1:45:17<22:38, 1.31it/s, loss=0.0058, lr=4.39e-06, step=8221] Training: 82%|████████▏ | 8222/10000 [1:45:17<23:31, 1.26it/s, loss=0.0058, lr=4.39e-06, step=8221] Training: 82%|████████▏ | 8222/10000 [1:45:17<23:31, 1.26it/s, loss=0.0187, lr=4.39e-06, step=8222] Training: 82%|████████▏ | 8223/10000 [1:45:18<24:41, 1.20it/s, loss=0.0187, lr=4.39e-06, step=8222] Training: 82%|████████▏ | 8223/10000 [1:45:18<24:41, 1.20it/s, loss=0.0081, lr=4.39e-06, step=8223] Training: 82%|████████▏ | 8224/10000 [1:45:19<24:32, 1.21it/s, loss=0.0081, lr=4.39e-06, step=8223] Training: 82%|████████▏ | 8224/10000 [1:45:19<24:32, 1.21it/s, loss=0.0043, lr=4.39e-06, step=8224] Training: 82%|████████▏ | 8225/10000 [1:45:20<22:03, 1.34it/s, loss=0.0043, lr=4.39e-06, step=8224] Training: 82%|████████▏ | 8225/10000 [1:45:20<22:03, 1.34it/s, loss=0.0082, lr=4.39e-06, step=8225] Training: 82%|████████▏ | 8226/10000 [1:45:20<21:00, 1.41it/s, loss=0.0082, lr=4.39e-06, step=8225] Training: 82%|████████▏ | 8226/10000 [1:45:20<21:00, 1.41it/s, loss=0.0104, lr=4.38e-06, step=8226] Training: 82%|████████▏ | 8227/10000 [1:45:21<22:58, 1.29it/s, loss=0.0104, lr=4.38e-06, step=8226] Training: 82%|████████▏ | 8227/10000 [1:45:21<22:58, 1.29it/s, loss=0.0191, lr=4.38e-06, step=8227] Training: 82%|████████▏ | 8228/10000 [1:45:22<23:10, 1.27it/s, loss=0.0191, lr=4.38e-06, step=8227] Training: 82%|████████▏ | 8228/10000 [1:45:22<23:10, 1.27it/s, loss=0.0058, lr=4.38e-06, step=8228] Training: 82%|████████▏ | 8229/10000 [1:45:23<20:35, 1.43it/s, loss=0.0058, lr=4.38e-06, step=8228] Training: 82%|████████▏ | 8229/10000 [1:45:23<20:35, 1.43it/s, loss=0.0023, lr=4.38e-06, step=8229]17:51:30.367 [I] step=8230 loss=0.0029 smoothed_loss=0.0082 lr=4.38e-06 grad_norm=0.4715 step_time=0.5809s data_time=0.1695s it/s=1.334 eta_to_10000=1326.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0078 grad_action_out_proj=0.0821 grad_shared_expert=0.2039 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8230/10000 [1:45:23<21:36, 1.37it/s, loss=0.0023, lr=4.38e-06, step=8229] Training: 82%|████████▏ | 8230/10000 [1:45:23<21:36, 1.37it/s, loss=0.0029, lr=4.37e-06, step=8230] Training: 82%|████████▏ | 8231/10000 [1:45:24<23:46, 1.24it/s, loss=0.0029, lr=4.37e-06, step=8230] Training: 82%|████████▏ | 8231/10000 [1:45:24<23:46, 1.24it/s, loss=0.0066, lr=4.37e-06, step=8231] Training: 82%|████████▏ | 8232/10000 [1:45:25<24:19, 1.21it/s, loss=0.0066, lr=4.37e-06, step=8231] Training: 82%|████████▏ | 8232/10000 [1:45:25<24:19, 1.21it/s, loss=0.0034, lr=4.37e-06, step=8232] Training: 82%|████████▏ | 8233/10000 [1:45:26<22:05, 1.33it/s, loss=0.0034, lr=4.37e-06, step=8232] Training: 82%|████████▏ | 8233/10000 [1:45:26<22:05, 1.33it/s, loss=0.0103, lr=4.37e-06, step=8233] Training: 82%|████████▏ | 8234/10000 [1:45:27<24:33, 1.20it/s, loss=0.0103, lr=4.37e-06, step=8233] Training: 82%|████████▏ | 8234/10000 [1:45:27<24:33, 1.20it/s, loss=0.0145, lr=4.37e-06, step=8234] Training: 82%|████████▏ | 8235/10000 [1:45:28<24:41, 1.19it/s, loss=0.0145, lr=4.37e-06, step=8234] Training: 82%|████████▏ | 8235/10000 [1:45:28<24:41, 1.19it/s, loss=0.0028, lr=4.36e-06, step=8235] Training: 82%|████████▏ | 8236/10000 [1:45:28<23:18, 1.26it/s, loss=0.0028, lr=4.36e-06, step=8235] Training: 82%|████████▏ | 8236/10000 [1:45:28<23:18, 1.26it/s, loss=0.0096, lr=4.36e-06, step=8236] Training: 82%|████████▏ | 8237/10000 [1:45:29<23:59, 1.22it/s, loss=0.0096, lr=4.36e-06, step=8236] Training: 82%|████████▏ | 8237/10000 [1:45:29<23:59, 1.22it/s, loss=0.0146, lr=4.36e-06, step=8237] Training: 82%|████████▏ | 8238/10000 [1:45:30<23:59, 1.22it/s, loss=0.0146, lr=4.36e-06, step=8237] Training: 82%|████████▏ | 8238/10000 [1:45:30<23:59, 1.22it/s, loss=0.0111, lr=4.36e-06, step=8238] Training: 82%|████████▏ | 8239/10000 [1:45:31<23:45, 1.24it/s, loss=0.0111, lr=4.36e-06, step=8238] Training: 82%|████████▏ | 8239/10000 [1:45:31<23:45, 1.24it/s, loss=0.0045, lr=4.36e-06, step=8239]17:51:38.743 [I] step=8240 loss=0.0034 smoothed_loss=0.0080 lr=4.36e-06 grad_norm=0.3656 step_time=0.6493s data_time=0.1883s it/s=1.194 eta_to_10000=1474.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0792 grad_shared_expert=0.2827 (10775:train_pytorch.py:850) + Training: 82%|████████▏ | 8240/10000 [1:45:32<24:35, 1.19it/s, loss=0.0045, lr=4.36e-06, step=8239] Training: 82%|████████▏ | 8240/10000 [1:45:32<24:35, 1.19it/s, loss=0.0034, lr=4.35e-06, step=8240] Training: 82%|████████▏ | 8241/10000 [1:45:32<22:54, 1.28it/s, loss=0.0034, lr=4.35e-06, step=8240] Training: 82%|████████▏ | 8241/10000 [1:45:32<22:54, 1.28it/s, loss=0.0032, lr=4.35e-06, step=8241] Training: 82%|████████▏ | 8242/10000 [1:45:33<23:20, 1.26it/s, loss=0.0032, lr=4.35e-06, step=8241] Training: 82%|████████▏ | 8242/10000 [1:45:33<23:20, 1.26it/s, loss=0.0044, lr=4.35e-06, step=8242] Training: 82%|████████▏ | 8243/10000 [1:45:34<24:16, 1.21it/s, loss=0.0044, lr=4.35e-06, step=8242] Training: 82%|████████▏ | 8243/10000 [1:45:34<24:16, 1.21it/s, loss=0.0022, lr=4.35e-06, step=8243] Training: 82%|████████▏ | 8244/10000 [1:45:35<21:29, 1.36it/s, loss=0.0022, lr=4.35e-06, step=8243] Training: 82%|████████▏ | 8244/10000 [1:45:35<21:29, 1.36it/s, loss=0.0097, lr=4.35e-06, step=8244] Training: 82%|████████▏ | 8245/10000 [1:45:35<20:32, 1.42it/s, loss=0.0097, lr=4.35e-06, step=8244] Training: 82%|████████▏ | 8245/10000 [1:45:35<20:32, 1.42it/s, loss=0.0048, lr=4.34e-06, step=8245] Training: 82%|████████▏ | 8246/10000 [1:45:36<21:16, 1.37it/s, loss=0.0048, lr=4.34e-06, step=8245] Training: 82%|████████▏ | 8246/10000 [1:45:36<21:16, 1.37it/s, loss=0.0288, lr=4.34e-06, step=8246] Training: 82%|████████▏ | 8247/10000 [1:45:37<19:26, 1.50it/s, loss=0.0288, lr=4.34e-06, step=8246] Training: 82%|████████▏ | 8247/10000 [1:45:37<19:26, 1.50it/s, loss=0.0081, lr=4.34e-06, step=8247] Training: 82%|████████▏ | 8248/10000 [1:45:37<17:57, 1.63it/s, loss=0.0081, lr=4.34e-06, step=8247] Training: 82%|████████▏ | 8248/10000 [1:45:37<17:57, 1.63it/s, loss=0.1128, lr=4.34e-06, step=8248] Training: 82%|████████▏ | 8249/10000 [1:45:38<21:57, 1.33it/s, loss=0.1128, lr=4.34e-06, step=8248] Training: 82%|████████▏ | 8249/10000 [1:45:38<21:57, 1.33it/s, loss=0.0105, lr=4.34e-06, step=8249]17:51:45.745 [I] step=8250 loss=0.0059 smoothed_loss=0.0172 lr=4.34e-06 grad_norm=0.3798 step_time=0.5769s data_time=0.1233s it/s=1.428 eta_to_10000=1225.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.1039 grad_shared_expert=0.4070 (10775:train_pytorch.py:850) + Training: 82%|████████▎ | 8250/10000 [1:45:39<20:35, 1.42it/s, loss=0.0105, lr=4.34e-06, step=8249] Training: 82%|████████▎ | 8250/10000 [1:45:39<20:35, 1.42it/s, loss=0.0059, lr=4.33e-06, step=8250] Training: 83%|████████▎ | 8251/10000 [1:45:39<20:25, 1.43it/s, loss=0.0059, lr=4.33e-06, step=8250] Training: 83%|████████▎ | 8251/10000 [1:45:40<20:25, 1.43it/s, loss=0.0037, lr=4.33e-06, step=8251] Training: 83%|████████▎ | 8252/10000 [1:45:40<18:51, 1.54it/s, loss=0.0037, lr=4.33e-06, step=8251] Training: 83%|████████▎ | 8252/10000 [1:45:40<18:51, 1.54it/s, loss=0.0074, lr=4.33e-06, step=8252] Training: 83%|████████▎ | 8253/10000 [1:45:41<22:47, 1.28it/s, loss=0.0074, lr=4.33e-06, step=8252] Training: 83%|████████▎ | 8253/10000 [1:45:41<22:47, 1.28it/s, loss=0.0047, lr=4.33e-06, step=8253] Training: 83%|████████▎ | 8254/10000 [1:45:42<20:12, 1.44it/s, loss=0.0047, lr=4.33e-06, step=8253] Training: 83%|████████▎ | 8254/10000 [1:45:42<20:12, 1.44it/s, loss=0.0048, lr=4.33e-06, step=8254] Training: 83%|████████▎ | 8255/10000 [1:45:42<18:52, 1.54it/s, loss=0.0048, lr=4.33e-06, step=8254] Training: 83%|████████▎ | 8255/10000 [1:45:42<18:52, 1.54it/s, loss=0.0086, lr=4.32e-06, step=8255] Training: 83%|████████▎ | 8256/10000 [1:45:43<22:23, 1.30it/s, loss=0.0086, lr=4.32e-06, step=8255] Training: 83%|████████▎ | 8256/10000 [1:45:43<22:23, 1.30it/s, loss=0.0206, lr=4.32e-06, step=8256] Training: 83%|████████▎ | 8257/10000 [1:45:44<21:27, 1.35it/s, loss=0.0206, lr=4.32e-06, step=8256] Training: 83%|████████▎ | 8257/10000 [1:45:44<21:27, 1.35it/s, loss=0.0030, lr=4.32e-06, step=8257] Training: 83%|████████▎ | 8258/10000 [1:45:45<21:06, 1.37it/s, loss=0.0030, lr=4.32e-06, step=8257] Training: 83%|████████▎ | 8258/10000 [1:45:45<21:06, 1.37it/s, loss=0.0067, lr=4.32e-06, step=8258] Training: 83%|████████▎ | 8259/10000 [1:45:45<22:37, 1.28it/s, loss=0.0067, lr=4.32e-06, step=8258] Training: 83%|████████▎ | 8259/10000 [1:45:45<22:37, 1.28it/s, loss=0.0063, lr=4.32e-06, step=8259]17:51:53.050 [I] step=8260 loss=0.0035 smoothed_loss=0.0105 lr=4.32e-06 grad_norm=0.4020 step_time=0.5813s data_time=0.1491s it/s=1.369 eta_to_10000=1270.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.1488 grad_shared_expert=0.3450 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8260/10000 [1:45:46<21:28, 1.35it/s, loss=0.0063, lr=4.32e-06, step=8259] Training: 83%|████████▎ | 8260/10000 [1:45:46<21:28, 1.35it/s, loss=0.0035, lr=4.31e-06, step=8260] Training: 83%|████████▎ | 8261/10000 [1:45:47<23:25, 1.24it/s, loss=0.0035, lr=4.31e-06, step=8260] Training: 83%|████████▎ | 8261/10000 [1:45:47<23:25, 1.24it/s, loss=0.0214, lr=4.31e-06, step=8261] Training: 83%|████████▎ | 8262/10000 [1:45:48<22:20, 1.30it/s, loss=0.0214, lr=4.31e-06, step=8261] Training: 83%|████████▎ | 8262/10000 [1:45:48<22:20, 1.30it/s, loss=0.0106, lr=4.31e-06, step=8262] Training: 83%|████████▎ | 8263/10000 [1:45:49<22:58, 1.26it/s, loss=0.0106, lr=4.31e-06, step=8262] Training: 83%|████████▎ | 8263/10000 [1:45:49<22:58, 1.26it/s, loss=0.0058, lr=4.31e-06, step=8263] Training: 83%|████████▎ | 8264/10000 [1:45:49<21:48, 1.33it/s, loss=0.0058, lr=4.31e-06, step=8263] Training: 83%|████████▎ | 8264/10000 [1:45:49<21:48, 1.33it/s, loss=0.0525, lr=4.31e-06, step=8264] Training: 83%|████████▎ | 8265/10000 [1:45:50<20:16, 1.43it/s, loss=0.0525, lr=4.31e-06, step=8264] Training: 83%|████████▎ | 8265/10000 [1:45:50<20:16, 1.43it/s, loss=0.0060, lr=4.30e-06, step=8265] Training: 83%|████████▎ | 8266/10000 [1:45:51<20:04, 1.44it/s, loss=0.0060, lr=4.30e-06, step=8265] Training: 83%|████████▎ | 8266/10000 [1:45:51<20:04, 1.44it/s, loss=0.0390, lr=4.30e-06, step=8266] Training: 83%|████████▎ | 8267/10000 [1:45:51<19:47, 1.46it/s, loss=0.0390, lr=4.30e-06, step=8266] Training: 83%|████████▎ | 8267/10000 [1:45:51<19:47, 1.46it/s, loss=0.0039, lr=4.30e-06, step=8267] Training: 83%|████████▎ | 8268/10000 [1:45:52<21:17, 1.36it/s, loss=0.0039, lr=4.30e-06, step=8267] Training: 83%|████████▎ | 8268/10000 [1:45:52<21:17, 1.36it/s, loss=0.0073, lr=4.30e-06, step=8268] Training: 83%|████████▎ | 8269/10000 [1:45:53<20:46, 1.39it/s, loss=0.0073, lr=4.30e-06, step=8268] Training: 83%|████████▎ | 8269/10000 [1:45:53<20:46, 1.39it/s, loss=0.0038, lr=4.30e-06, step=8269]17:52:00.525 [I] step=8270 loss=0.0042 smoothed_loss=0.0126 lr=4.30e-06 grad_norm=0.4376 step_time=0.6079s data_time=0.1396s it/s=1.339 eta_to_10000=1291.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0152 grad_action_out_proj=0.1014 grad_shared_expert=0.5045 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8270/10000 [1:45:54<21:53, 1.32it/s, loss=0.0038, lr=4.30e-06, step=8269] Training: 83%|████████▎ | 8270/10000 [1:45:54<21:53, 1.32it/s, loss=0.0042, lr=4.29e-06, step=8270] Training: 83%|████████▎ | 8271/10000 [1:45:54<19:53, 1.45it/s, loss=0.0042, lr=4.29e-06, step=8270] Training: 83%|████████▎ | 8271/10000 [1:45:54<19:53, 1.45it/s, loss=0.0027, lr=4.29e-06, step=8271] Training: 83%|████████▎ | 8272/10000 [1:45:55<19:24, 1.48it/s, loss=0.0027, lr=4.29e-06, step=8271] Training: 83%|████████▎ | 8272/10000 [1:45:55<19:24, 1.48it/s, loss=0.0055, lr=4.29e-06, step=8272] Training: 83%|████████▎ | 8273/10000 [1:45:55<18:13, 1.58it/s, loss=0.0055, lr=4.29e-06, step=8272] Training: 83%|████████▎ | 8273/10000 [1:45:55<18:13, 1.58it/s, loss=0.0027, lr=4.29e-06, step=8273] Training: 83%|████████▎ | 8274/10000 [1:45:56<16:53, 1.70it/s, loss=0.0027, lr=4.29e-06, step=8273] Training: 83%|████████▎ | 8274/10000 [1:45:56<16:53, 1.70it/s, loss=0.0238, lr=4.29e-06, step=8274] Training: 83%|████████▎ | 8275/10000 [1:45:57<18:28, 1.56it/s, loss=0.0238, lr=4.29e-06, step=8274] Training: 83%|████████▎ | 8275/10000 [1:45:57<18:28, 1.56it/s, loss=0.0021, lr=4.28e-06, step=8275] Training: 83%|████████▎ | 8276/10000 [1:45:57<19:14, 1.49it/s, loss=0.0021, lr=4.28e-06, step=8275] Training: 83%|████████▎ | 8276/10000 [1:45:57<19:14, 1.49it/s, loss=0.0015, lr=4.28e-06, step=8276] Training: 83%|████████▎ | 8277/10000 [1:45:58<20:12, 1.42it/s, loss=0.0015, lr=4.28e-06, step=8276] Training: 83%|████████▎ | 8277/10000 [1:45:58<20:12, 1.42it/s, loss=0.0039, lr=4.28e-06, step=8277] Training: 83%|████████▎ | 8278/10000 [1:45:59<21:39, 1.33it/s, loss=0.0039, lr=4.28e-06, step=8277] Training: 83%|████████▎ | 8278/10000 [1:45:59<21:39, 1.33it/s, loss=0.0051, lr=4.28e-06, step=8278] Training: 83%|████████▎ | 8279/10000 [1:45:59<19:21, 1.48it/s, loss=0.0051, lr=4.28e-06, step=8278] Training: 83%|████████▎ | 8279/10000 [1:45:59<19:21, 1.48it/s, loss=0.0012, lr=4.28e-06, step=8279]17:52:07.216 [I] step=8280 loss=0.0022 smoothed_loss=0.0074 lr=4.28e-06 grad_norm=0.3715 step_time=0.5385s data_time=0.1306s it/s=1.495 eta_to_10000=1150.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0038 grad_action_out_proj=0.0493 grad_shared_expert=0.2099 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8280/10000 [1:46:00<20:54, 1.37it/s, loss=0.0012, lr=4.28e-06, step=8279] Training: 83%|████████▎ | 8280/10000 [1:46:00<20:54, 1.37it/s, loss=0.0022, lr=4.27e-06, step=8280] Training: 83%|████████▎ | 8281/10000 [1:46:01<21:17, 1.35it/s, loss=0.0022, lr=4.27e-06, step=8280] Training: 83%|████████▎ | 8281/10000 [1:46:01<21:17, 1.35it/s, loss=0.0015, lr=4.27e-06, step=8281] Training: 83%|████████▎ | 8282/10000 [1:46:02<22:44, 1.26it/s, loss=0.0015, lr=4.27e-06, step=8281] Training: 83%|████████▎ | 8282/10000 [1:46:02<22:44, 1.26it/s, loss=0.0157, lr=4.27e-06, step=8282] Training: 83%|████████▎ | 8283/10000 [1:46:03<22:37, 1.27it/s, loss=0.0157, lr=4.27e-06, step=8282] Training: 83%|████████▎ | 8283/10000 [1:46:03<22:37, 1.27it/s, loss=0.0032, lr=4.27e-06, step=8283] Training: 83%|████████▎ | 8284/10000 [1:46:04<22:21, 1.28it/s, loss=0.0032, lr=4.27e-06, step=8283] Training: 83%|████████▎ | 8284/10000 [1:46:04<22:21, 1.28it/s, loss=0.0007, lr=4.27e-06, step=8284] Training: 83%|████████▎ | 8285/10000 [1:46:05<24:46, 1.15it/s, loss=0.0007, lr=4.27e-06, step=8284] Training: 83%|████████▎ | 8285/10000 [1:46:05<24:46, 1.15it/s, loss=0.0026, lr=4.26e-06, step=8285] Training: 83%|████████▎ | 8286/10000 [1:46:05<24:35, 1.16it/s, loss=0.0026, lr=4.26e-06, step=8285] Training: 83%|████████▎ | 8286/10000 [1:46:05<24:35, 1.16it/s, loss=0.0049, lr=4.26e-06, step=8286] Training: 83%|████████▎ | 8287/10000 [1:46:06<23:16, 1.23it/s, loss=0.0049, lr=4.26e-06, step=8286] Training: 83%|████████▎ | 8287/10000 [1:46:06<23:16, 1.23it/s, loss=0.0033, lr=4.26e-06, step=8287] Training: 83%|████████▎ | 8288/10000 [1:46:07<23:32, 1.21it/s, loss=0.0033, lr=4.26e-06, step=8287] Training: 83%|████████▎ | 8288/10000 [1:46:07<23:32, 1.21it/s, loss=0.0009, lr=4.26e-06, step=8288] Training: 83%|████████▎ | 8289/10000 [1:46:08<22:49, 1.25it/s, loss=0.0009, lr=4.26e-06, step=8288] Training: 83%|████████▎ | 8289/10000 [1:46:08<22:49, 1.25it/s, loss=0.0034, lr=4.26e-06, step=8289]17:52:15.375 [I] step=8290 loss=0.0017 smoothed_loss=0.0048 lr=4.26e-06 grad_norm=0.3493 step_time=0.6632s data_time=0.1527s it/s=1.226 eta_to_10000=1395.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0067 grad_action_out_proj=0.0534 grad_shared_expert=0.1975 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8290/10000 [1:46:08<22:05, 1.29it/s, loss=0.0034, lr=4.26e-06, step=8289] Training: 83%|████████▎ | 8290/10000 [1:46:08<22:05, 1.29it/s, loss=0.0017, lr=4.25e-06, step=8290] Training: 83%|████████▎ | 8291/10000 [1:46:09<21:55, 1.30it/s, loss=0.0017, lr=4.25e-06, step=8290] Training: 83%|████████▎ | 8291/10000 [1:46:09<21:55, 1.30it/s, loss=0.0415, lr=4.25e-06, step=8291] Training: 83%|████████▎ | 8292/10000 [1:46:10<24:28, 1.16it/s, loss=0.0415, lr=4.25e-06, step=8291] Training: 83%|████████▎ | 8292/10000 [1:46:10<24:28, 1.16it/s, loss=0.0078, lr=4.25e-06, step=8292] Training: 83%|████████▎ | 8293/10000 [1:46:11<24:07, 1.18it/s, loss=0.0078, lr=4.25e-06, step=8292] Training: 83%|████████▎ | 8293/10000 [1:46:11<24:07, 1.18it/s, loss=0.0072, lr=4.25e-06, step=8293] Training: 83%|████████▎ | 8294/10000 [1:46:12<20:58, 1.36it/s, loss=0.0072, lr=4.25e-06, step=8293] Training: 83%|████████▎ | 8294/10000 [1:46:12<20:58, 1.36it/s, loss=0.0128, lr=4.25e-06, step=8294] Training: 83%|████████▎ | 8295/10000 [1:46:12<22:34, 1.26it/s, loss=0.0128, lr=4.25e-06, step=8294] Training: 83%|████████▎ | 8295/10000 [1:46:12<22:34, 1.26it/s, loss=0.0177, lr=4.24e-06, step=8295] Training: 83%|████████▎ | 8296/10000 [1:46:13<19:59, 1.42it/s, loss=0.0177, lr=4.24e-06, step=8295] Training: 83%|████████▎ | 8296/10000 [1:46:13<19:59, 1.42it/s, loss=0.0111, lr=4.24e-06, step=8296] Training: 83%|████████▎ | 8297/10000 [1:46:13<18:11, 1.56it/s, loss=0.0111, lr=4.24e-06, step=8296] Training: 83%|████████▎ | 8297/10000 [1:46:13<18:11, 1.56it/s, loss=0.0094, lr=4.24e-06, step=8297] Training: 83%|████████▎ | 8298/10000 [1:46:14<19:44, 1.44it/s, loss=0.0094, lr=4.24e-06, step=8297] Training: 83%|████████▎ | 8298/10000 [1:46:14<19:44, 1.44it/s, loss=0.0109, lr=4.24e-06, step=8298] Training: 83%|████████▎ | 8299/10000 [1:46:15<23:20, 1.21it/s, loss=0.0109, lr=4.24e-06, step=8298] Training: 83%|████████▎ | 8299/10000 [1:46:15<23:20, 1.21it/s, loss=0.0094, lr=4.24e-06, step=8299]17:52:23.201 [I] step=8300 loss=0.0602 smoothed_loss=0.0148 lr=4.24e-06 grad_norm=0.5339 step_time=0.6066s data_time=0.1760s it/s=1.278 eta_to_10000=1330.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0290 grad_action_out_proj=0.1829 grad_shared_expert=0.5369 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8300/10000 [1:46:16<23:30, 1.20it/s, loss=0.0094, lr=4.24e-06, step=8299] Training: 83%|████████▎ | 8300/10000 [1:46:16<23:30, 1.20it/s, loss=0.0602, lr=4.23e-06, step=8300] Training: 83%|████████▎ | 8301/10000 [1:46:17<22:03, 1.28it/s, loss=0.0602, lr=4.23e-06, step=8300] Training: 83%|████████▎ | 8301/10000 [1:46:17<22:03, 1.28it/s, loss=0.0022, lr=4.23e-06, step=8301] Training: 83%|████████▎ | 8302/10000 [1:46:18<22:04, 1.28it/s, loss=0.0022, lr=4.23e-06, step=8301] Training: 83%|████████▎ | 8302/10000 [1:46:18<22:04, 1.28it/s, loss=0.0023, lr=4.23e-06, step=8302] Training: 83%|████████▎ | 8303/10000 [1:46:19<22:59, 1.23it/s, loss=0.0023, lr=4.23e-06, step=8302] Training: 83%|████████▎ | 8303/10000 [1:46:19<22:59, 1.23it/s, loss=0.0087, lr=4.23e-06, step=8303] Training: 83%|████████▎ | 8304/10000 [1:46:19<20:08, 1.40it/s, loss=0.0087, lr=4.23e-06, step=8303] Training: 83%|████████▎ | 8304/10000 [1:46:19<20:08, 1.40it/s, loss=0.0597, lr=4.23e-06, step=8304] Training: 83%|████████▎ | 8305/10000 [1:46:20<20:54, 1.35it/s, loss=0.0597, lr=4.23e-06, step=8304] Training: 83%|████████▎ | 8305/10000 [1:46:20<20:54, 1.35it/s, loss=0.0051, lr=4.22e-06, step=8305] Training: 83%|████████▎ | 8306/10000 [1:46:21<20:59, 1.35it/s, loss=0.0051, lr=4.22e-06, step=8305] Training: 83%|████████▎ | 8306/10000 [1:46:21<20:59, 1.35it/s, loss=0.0234, lr=4.22e-06, step=8306] Training: 83%|████████▎ | 8307/10000 [1:46:21<19:30, 1.45it/s, loss=0.0234, lr=4.22e-06, step=8306] Training: 83%|████████▎ | 8307/10000 [1:46:21<19:30, 1.45it/s, loss=0.0368, lr=4.22e-06, step=8307] Training: 83%|████████▎ | 8308/10000 [1:46:22<22:57, 1.23it/s, loss=0.0368, lr=4.22e-06, step=8307] Training: 83%|████████▎ | 8308/10000 [1:46:22<22:57, 1.23it/s, loss=0.0048, lr=4.22e-06, step=8308] Training: 83%|████████▎ | 8309/10000 [1:46:23<22:12, 1.27it/s, loss=0.0048, lr=4.22e-06, step=8308] Training: 83%|████████▎ | 8309/10000 [1:46:23<22:12, 1.27it/s, loss=0.0015, lr=4.22e-06, step=8309]17:52:30.808 [I] step=8310 loss=0.0075 smoothed_loss=0.0147 lr=4.22e-06 grad_norm=0.3895 step_time=0.6169s data_time=0.1438s it/s=1.316 eta_to_10000=1284.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0064 grad_action_out_proj=0.0709 grad_shared_expert=0.2301 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8310/10000 [1:46:24<22:37, 1.24it/s, loss=0.0015, lr=4.22e-06, step=8309] Training: 83%|████████▎ | 8310/10000 [1:46:24<22:37, 1.24it/s, loss=0.0075, lr=4.21e-06, step=8310] Training: 83%|████████▎ | 8311/10000 [1:46:24<20:22, 1.38it/s, loss=0.0075, lr=4.21e-06, step=8310] Training: 83%|████████▎ | 8311/10000 [1:46:24<20:22, 1.38it/s, loss=0.0048, lr=4.21e-06, step=8311] Training: 83%|████████▎ | 8312/10000 [1:46:25<20:24, 1.38it/s, loss=0.0048, lr=4.21e-06, step=8311] Training: 83%|████████▎ | 8312/10000 [1:46:25<20:24, 1.38it/s, loss=0.0021, lr=4.21e-06, step=8312] Training: 83%|████████▎ | 8313/10000 [1:46:26<21:16, 1.32it/s, loss=0.0021, lr=4.21e-06, step=8312] Training: 83%|████████▎ | 8313/10000 [1:46:26<21:16, 1.32it/s, loss=0.0046, lr=4.21e-06, step=8313] Training: 83%|████████▎ | 8314/10000 [1:46:27<20:38, 1.36it/s, loss=0.0046, lr=4.21e-06, step=8313] Training: 83%|████████▎ | 8314/10000 [1:46:27<20:38, 1.36it/s, loss=0.0037, lr=4.21e-06, step=8314] Training: 83%|████████▎ | 8315/10000 [1:46:27<19:12, 1.46it/s, loss=0.0037, lr=4.21e-06, step=8314] Training: 83%|████████▎ | 8315/10000 [1:46:27<19:12, 1.46it/s, loss=0.0026, lr=4.20e-06, step=8315] Training: 83%|████████▎ | 8316/10000 [1:46:28<18:49, 1.49it/s, loss=0.0026, lr=4.20e-06, step=8315] Training: 83%|████████▎ | 8316/10000 [1:46:28<18:49, 1.49it/s, loss=0.0234, lr=4.20e-06, step=8316] Training: 83%|████████▎ | 8317/10000 [1:46:28<17:54, 1.57it/s, loss=0.0234, lr=4.20e-06, step=8316] Training: 83%|████████▎ | 8317/10000 [1:46:28<17:54, 1.57it/s, loss=0.0090, lr=4.20e-06, step=8317] Training: 83%|████████▎ | 8318/10000 [1:46:29<21:08, 1.33it/s, loss=0.0090, lr=4.20e-06, step=8317] Training: 83%|████████▎ | 8318/10000 [1:46:29<21:08, 1.33it/s, loss=0.0053, lr=4.20e-06, step=8318] Training: 83%|████████▎ | 8319/10000 [1:46:30<19:21, 1.45it/s, loss=0.0053, lr=4.20e-06, step=8318] Training: 83%|████████▎ | 8319/10000 [1:46:30<19:21, 1.45it/s, loss=0.0035, lr=4.20e-06, step=8319]17:52:37.669 [I] step=8320 loss=0.0215 smoothed_loss=0.0111 lr=4.20e-06 grad_norm=0.3588 step_time=0.5643s data_time=0.1218s it/s=1.458 eta_to_10000=1152.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0117 grad_action_out_proj=0.0924 grad_shared_expert=0.5836 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8320/10000 [1:46:31<19:47, 1.41it/s, loss=0.0035, lr=4.20e-06, step=8319] Training: 83%|████████▎ | 8320/10000 [1:46:31<19:47, 1.41it/s, loss=0.0215, lr=4.19e-06, step=8320] Training: 83%|████████▎ | 8321/10000 [1:46:31<17:49, 1.57it/s, loss=0.0215, lr=4.19e-06, step=8320] Training: 83%|████████▎ | 8321/10000 [1:46:31<17:49, 1.57it/s, loss=0.0102, lr=4.19e-06, step=8321] Training: 83%|████████▎ | 8322/10000 [1:46:32<18:35, 1.50it/s, loss=0.0102, lr=4.19e-06, step=8321] Training: 83%|████████▎ | 8322/10000 [1:46:32<18:35, 1.50it/s, loss=0.0238, lr=4.19e-06, step=8322] Training: 83%|████████▎ | 8323/10000 [1:46:33<19:01, 1.47it/s, loss=0.0238, lr=4.19e-06, step=8322] Training: 83%|████████▎ | 8323/10000 [1:46:33<19:01, 1.47it/s, loss=0.0065, lr=4.19e-06, step=8323] Training: 83%|████████▎ | 8324/10000 [1:46:33<17:44, 1.57it/s, loss=0.0065, lr=4.19e-06, step=8323] Training: 83%|████████▎ | 8324/10000 [1:46:33<17:44, 1.57it/s, loss=0.0013, lr=4.19e-06, step=8324] Training: 83%|████████▎ | 8325/10000 [1:46:34<20:00, 1.40it/s, loss=0.0013, lr=4.19e-06, step=8324] Training: 83%|████████▎ | 8325/10000 [1:46:34<20:00, 1.40it/s, loss=0.0025, lr=4.18e-06, step=8325] Training: 83%|████████▎ | 8326/10000 [1:46:35<23:06, 1.21it/s, loss=0.0025, lr=4.18e-06, step=8325] Training: 83%|████████▎ | 8326/10000 [1:46:35<23:06, 1.21it/s, loss=0.0041, lr=4.18e-06, step=8326] Training: 83%|████████▎ | 8327/10000 [1:46:36<24:12, 1.15it/s, loss=0.0041, lr=4.18e-06, step=8326] Training: 83%|████████▎ | 8327/10000 [1:46:36<24:12, 1.15it/s, loss=0.0036, lr=4.18e-06, step=8327] Training: 83%|████████▎ | 8328/10000 [1:46:37<22:59, 1.21it/s, loss=0.0036, lr=4.18e-06, step=8327] Training: 83%|████████▎ | 8328/10000 [1:46:37<22:59, 1.21it/s, loss=0.0190, lr=4.18e-06, step=8328] Training: 83%|████████▎ | 8329/10000 [1:46:37<20:44, 1.34it/s, loss=0.0190, lr=4.18e-06, step=8328] Training: 83%|████████▎ | 8329/10000 [1:46:37<20:44, 1.34it/s, loss=0.0080, lr=4.18e-06, step=8329]17:52:45.045 [I] step=8330 loss=0.0061 smoothed_loss=0.0092 lr=4.18e-06 grad_norm=0.4515 step_time=0.5811s data_time=0.1565s it/s=1.356 eta_to_10000=1231.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0051 grad_action_out_proj=0.0656 grad_shared_expert=0.2476 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8330/10000 [1:46:38<20:14, 1.37it/s, loss=0.0080, lr=4.18e-06, step=8329] Training: 83%|████████▎ | 8330/10000 [1:46:38<20:14, 1.37it/s, loss=0.0061, lr=4.17e-06, step=8330] Training: 83%|████████▎ | 8331/10000 [1:46:39<20:44, 1.34it/s, loss=0.0061, lr=4.17e-06, step=8330] Training: 83%|████████▎ | 8331/10000 [1:46:39<20:44, 1.34it/s, loss=0.0142, lr=4.17e-06, step=8331] Training: 83%|████████▎ | 8332/10000 [1:46:39<18:48, 1.48it/s, loss=0.0142, lr=4.17e-06, step=8331] Training: 83%|████████▎ | 8332/10000 [1:46:39<18:48, 1.48it/s, loss=0.0275, lr=4.17e-06, step=8332] Training: 83%|████████▎ | 8333/10000 [1:46:40<19:09, 1.45it/s, loss=0.0275, lr=4.17e-06, step=8332] Training: 83%|████████▎ | 8333/10000 [1:46:40<19:09, 1.45it/s, loss=0.0077, lr=4.17e-06, step=8333] Training: 83%|████████▎ | 8334/10000 [1:46:41<17:54, 1.55it/s, loss=0.0077, lr=4.17e-06, step=8333] Training: 83%|████████▎ | 8334/10000 [1:46:41<17:54, 1.55it/s, loss=0.0126, lr=4.17e-06, step=8334] Training: 83%|████████▎ | 8335/10000 [1:46:42<20:30, 1.35it/s, loss=0.0126, lr=4.17e-06, step=8334] Training: 83%|████████▎ | 8335/10000 [1:46:42<20:30, 1.35it/s, loss=0.0241, lr=4.16e-06, step=8335] Training: 83%|████████▎ | 8336/10000 [1:46:42<20:46, 1.34it/s, loss=0.0241, lr=4.16e-06, step=8335] Training: 83%|████████▎ | 8336/10000 [1:46:42<20:46, 1.34it/s, loss=0.0027, lr=4.16e-06, step=8336] Training: 83%|████████▎ | 8337/10000 [1:46:43<18:57, 1.46it/s, loss=0.0027, lr=4.16e-06, step=8336] Training: 83%|████████▎ | 8337/10000 [1:46:43<18:57, 1.46it/s, loss=0.0032, lr=4.16e-06, step=8337] Training: 83%|████████▎ | 8338/10000 [1:46:44<20:46, 1.33it/s, loss=0.0032, lr=4.16e-06, step=8337] Training: 83%|████████▎ | 8338/10000 [1:46:44<20:46, 1.33it/s, loss=0.0045, lr=4.16e-06, step=8338] Training: 83%|████████▎ | 8339/10000 [1:46:45<20:51, 1.33it/s, loss=0.0045, lr=4.16e-06, step=8338] Training: 83%|████████▎ | 8339/10000 [1:46:45<20:51, 1.33it/s, loss=0.0014, lr=4.16e-06, step=8339]17:52:52.526 [I] step=8340 loss=0.0078 smoothed_loss=0.0091 lr=4.16e-06 grad_norm=0.4806 step_time=0.6043s data_time=0.1438s it/s=1.337 eta_to_10000=1241.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0117 grad_action_out_proj=0.1604 grad_shared_expert=0.4367 (10775:train_pytorch.py:850) + Training: 83%|████████▎ | 8340/10000 [1:46:46<22:47, 1.21it/s, loss=0.0014, lr=4.16e-06, step=8339] Training: 83%|████████▎ | 8340/10000 [1:46:46<22:47, 1.21it/s, loss=0.0078, lr=4.15e-06, step=8340] Training: 83%|████████▎ | 8341/10000 [1:46:46<22:16, 1.24it/s, loss=0.0078, lr=4.15e-06, step=8340] Training: 83%|████████▎ | 8341/10000 [1:46:46<22:16, 1.24it/s, loss=0.0034, lr=4.15e-06, step=8341] Training: 83%|████████▎ | 8342/10000 [1:46:47<22:49, 1.21it/s, loss=0.0034, lr=4.15e-06, step=8341] Training: 83%|████████▎ | 8342/10000 [1:46:47<22:49, 1.21it/s, loss=0.0053, lr=4.15e-06, step=8342] Training: 83%|████████▎ | 8343/10000 [1:46:48<20:24, 1.35it/s, loss=0.0053, lr=4.15e-06, step=8342] Training: 83%|████████▎ | 8343/10000 [1:46:48<20:24, 1.35it/s, loss=0.0248, lr=4.15e-06, step=8343] Training: 83%|████████▎ | 8344/10000 [1:46:48<19:58, 1.38it/s, loss=0.0248, lr=4.15e-06, step=8343] Training: 83%|████████▎ | 8344/10000 [1:46:48<19:58, 1.38it/s, loss=0.0014, lr=4.15e-06, step=8344] Training: 83%|████████▎ | 8345/10000 [1:46:49<19:39, 1.40it/s, loss=0.0014, lr=4.15e-06, step=8344] Training: 83%|████████▎ | 8345/10000 [1:46:49<19:39, 1.40it/s, loss=0.0129, lr=4.15e-06, step=8345] Training: 83%|████████▎ | 8346/10000 [1:46:50<21:09, 1.30it/s, loss=0.0129, lr=4.15e-06, step=8345] Training: 83%|████████▎ | 8346/10000 [1:46:50<21:09, 1.30it/s, loss=0.0408, lr=4.14e-06, step=8346] Training: 83%|████████▎ | 8347/10000 [1:46:51<22:47, 1.21it/s, loss=0.0408, lr=4.14e-06, step=8346] Training: 83%|████████▎ | 8347/10000 [1:46:51<22:47, 1.21it/s, loss=0.0034, lr=4.14e-06, step=8347] Training: 83%|████████▎ | 8348/10000 [1:46:52<21:33, 1.28it/s, loss=0.0034, lr=4.14e-06, step=8347] Training: 83%|████████▎ | 8348/10000 [1:46:52<21:33, 1.28it/s, loss=0.0070, lr=4.14e-06, step=8348] Training: 83%|████████▎ | 8349/10000 [1:46:52<20:13, 1.36it/s, loss=0.0070, lr=4.14e-06, step=8348] Training: 83%|████████▎ | 8349/10000 [1:46:52<20:13, 1.36it/s, loss=0.0074, lr=4.14e-06, step=8349]17:53:00.376 [I] step=8350 loss=0.0101 smoothed_loss=0.0107 lr=4.14e-06 grad_norm=0.3855 step_time=0.6244s data_time=0.1606s it/s=1.274 eta_to_10000=1295.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0876 grad_shared_expert=0.2790 (10775:train_pytorch.py:850) + Training: 84%|████████▎ | 8350/10000 [1:46:53<23:31, 1.17it/s, loss=0.0074, lr=4.14e-06, step=8349] Training: 84%|████████▎ | 8350/10000 [1:46:53<23:31, 1.17it/s, loss=0.0101, lr=4.14e-06, step=8350] Training: 84%|████████▎ | 8351/10000 [1:46:54<23:19, 1.18it/s, loss=0.0101, lr=4.14e-06, step=8350] Training: 84%|████████▎ | 8351/10000 [1:46:54<23:19, 1.18it/s, loss=0.0034, lr=4.13e-06, step=8351] Training: 84%|████████▎ | 8352/10000 [1:46:55<20:22, 1.35it/s, loss=0.0034, lr=4.13e-06, step=8351] Training: 84%|████████▎ | 8352/10000 [1:46:55<20:22, 1.35it/s, loss=0.0095, lr=4.13e-06, step=8352] Training: 84%|████████▎ | 8353/10000 [1:46:55<19:04, 1.44it/s, loss=0.0095, lr=4.13e-06, step=8352] Training: 84%|████████▎ | 8353/10000 [1:46:55<19:04, 1.44it/s, loss=0.0066, lr=4.13e-06, step=8353] Training: 84%|████████▎ | 8354/10000 [1:46:56<19:31, 1.41it/s, loss=0.0066, lr=4.13e-06, step=8353] Training: 84%|████████▎ | 8354/10000 [1:46:56<19:31, 1.41it/s, loss=0.0095, lr=4.13e-06, step=8354] Training: 84%|████████▎ | 8355/10000 [1:46:57<18:43, 1.46it/s, loss=0.0095, lr=4.13e-06, step=8354] Training: 84%|████████▎ | 8355/10000 [1:46:57<18:43, 1.46it/s, loss=0.0027, lr=4.13e-06, step=8355] Training: 84%|████████▎ | 8356/10000 [1:46:57<17:49, 1.54it/s, loss=0.0027, lr=4.13e-06, step=8355] Training: 84%|████████▎ | 8356/10000 [1:46:57<17:49, 1.54it/s, loss=0.0044, lr=4.12e-06, step=8356] Training: 84%|████████▎ | 8357/10000 [1:46:58<19:19, 1.42it/s, loss=0.0044, lr=4.12e-06, step=8356] Training: 84%|████████▎ | 8357/10000 [1:46:58<19:19, 1.42it/s, loss=0.0058, lr=4.12e-06, step=8357] Training: 84%|████████▎ | 8358/10000 [1:46:59<20:13, 1.35it/s, loss=0.0058, lr=4.12e-06, step=8357] Training: 84%|████████▎ | 8358/10000 [1:46:59<20:13, 1.35it/s, loss=0.0205, lr=4.12e-06, step=8358] Training: 84%|████████▎ | 8359/10000 [1:46:59<18:10, 1.51it/s, loss=0.0205, lr=4.12e-06, step=8358] Training: 84%|████████▎ | 8359/10000 [1:46:59<18:10, 1.51it/s, loss=0.0147, lr=4.12e-06, step=8359]17:53:06.983 [I] step=8360 loss=0.0068 smoothed_loss=0.0096 lr=4.12e-06 grad_norm=0.4441 step_time=0.5497s data_time=0.1110s it/s=1.514 eta_to_10000=1083.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0054 grad_action_out_proj=0.0704 grad_shared_expert=0.2088 (10775:train_pytorch.py:850) + Training: 84%|████████▎ | 8360/10000 [1:47:00<17:44, 1.54it/s, loss=0.0147, lr=4.12e-06, step=8359] Training: 84%|████████▎ | 8360/10000 [1:47:00<17:44, 1.54it/s, loss=0.0068, lr=4.12e-06, step=8360] Training: 84%|████████▎ | 8361/10000 [1:47:01<20:26, 1.34it/s, loss=0.0068, lr=4.12e-06, step=8360] Training: 84%|████████▎ | 8361/10000 [1:47:01<20:26, 1.34it/s, loss=0.0082, lr=4.11e-06, step=8361] Training: 84%|████████▎ | 8362/10000 [1:47:02<21:08, 1.29it/s, loss=0.0082, lr=4.11e-06, step=8361] Training: 84%|████████▎ | 8362/10000 [1:47:02<21:08, 1.29it/s, loss=0.0048, lr=4.11e-06, step=8362] Training: 84%|████████▎ | 8363/10000 [1:47:03<20:47, 1.31it/s, loss=0.0048, lr=4.11e-06, step=8362] Training: 84%|████████▎ | 8363/10000 [1:47:03<20:47, 1.31it/s, loss=0.0042, lr=4.11e-06, step=8363] Training: 84%|████████▎ | 8364/10000 [1:47:03<18:45, 1.45it/s, loss=0.0042, lr=4.11e-06, step=8363] Training: 84%|████████▎ | 8364/10000 [1:47:03<18:45, 1.45it/s, loss=0.0125, lr=4.11e-06, step=8364] Training: 84%|████████▎ | 8365/10000 [1:47:04<18:05, 1.51it/s, loss=0.0125, lr=4.11e-06, step=8364] Training: 84%|████████▎ | 8365/10000 [1:47:04<18:05, 1.51it/s, loss=0.0429, lr=4.11e-06, step=8365] Training: 84%|████████▎ | 8366/10000 [1:47:04<16:44, 1.63it/s, loss=0.0429, lr=4.11e-06, step=8365] Training: 84%|████████▎ | 8366/10000 [1:47:04<16:44, 1.63it/s, loss=0.0332, lr=4.10e-06, step=8366] Training: 84%|████████▎ | 8367/10000 [1:47:05<18:41, 1.46it/s, loss=0.0332, lr=4.10e-06, step=8366] Training: 84%|████████▎ | 8367/10000 [1:47:05<18:41, 1.46it/s, loss=0.0020, lr=4.10e-06, step=8367] Training: 84%|████████▎ | 8368/10000 [1:47:06<19:51, 1.37it/s, loss=0.0020, lr=4.10e-06, step=8367] Training: 84%|████████▎ | 8368/10000 [1:47:06<19:51, 1.37it/s, loss=0.0047, lr=4.10e-06, step=8368] Training: 84%|████████▎ | 8369/10000 [1:47:07<19:19, 1.41it/s, loss=0.0047, lr=4.10e-06, step=8368] Training: 84%|████████▎ | 8369/10000 [1:47:07<19:19, 1.41it/s, loss=0.0099, lr=4.10e-06, step=8369]17:53:14.490 [I] step=8370 loss=0.0049 smoothed_loss=0.0114 lr=4.11e-06 grad_norm=0.3999 step_time=0.5911s data_time=0.1596s it/s=1.332 eta_to_10000=1223.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0085 grad_action_out_proj=0.0859 grad_shared_expert=0.3712 (10775:train_pytorch.py:850) + Training: 84%|████████▎ | 8370/10000 [1:47:08<21:31, 1.26it/s, loss=0.0099, lr=4.10e-06, step=8369] Training: 84%|████████▎ | 8370/10000 [1:47:08<21:31, 1.26it/s, loss=0.0049, lr=4.10e-06, step=8370] Training: 84%|████████▎ | 8371/10000 [1:47:09<23:33, 1.15it/s, loss=0.0049, lr=4.10e-06, step=8370] Training: 84%|████████▎ | 8371/10000 [1:47:09<23:33, 1.15it/s, loss=0.0014, lr=4.10e-06, step=8371] Training: 84%|████████▎ | 8372/10000 [1:47:09<22:41, 1.20it/s, loss=0.0014, lr=4.10e-06, step=8371] Training: 84%|████████▎ | 8372/10000 [1:47:09<22:41, 1.20it/s, loss=0.0060, lr=4.09e-06, step=8372] Training: 84%|████████▎ | 8373/10000 [1:47:10<21:49, 1.24it/s, loss=0.0060, lr=4.09e-06, step=8372] Training: 84%|████████▎ | 8373/10000 [1:47:10<21:49, 1.24it/s, loss=0.0014, lr=4.09e-06, step=8373] Training: 84%|████████▎ | 8374/10000 [1:47:11<19:14, 1.41it/s, loss=0.0014, lr=4.09e-06, step=8373] Training: 84%|████████▎ | 8374/10000 [1:47:11<19:14, 1.41it/s, loss=0.0347, lr=4.09e-06, step=8374] Training: 84%|████████▍ | 8375/10000 [1:47:12<21:42, 1.25it/s, loss=0.0347, lr=4.09e-06, step=8374] Training: 84%|████████▍ | 8375/10000 [1:47:12<21:42, 1.25it/s, loss=0.0227, lr=4.09e-06, step=8375] Training: 84%|████████▍ | 8376/10000 [1:47:12<19:09, 1.41it/s, loss=0.0227, lr=4.09e-06, step=8375] Training: 84%|████████▍ | 8376/10000 [1:47:12<19:09, 1.41it/s, loss=0.0055, lr=4.09e-06, step=8376] Training: 84%|████████▍ | 8377/10000 [1:47:13<17:27, 1.55it/s, loss=0.0055, lr=4.09e-06, step=8376] Training: 84%|████████▍ | 8377/10000 [1:47:13<17:27, 1.55it/s, loss=0.0033, lr=4.08e-06, step=8377] Training: 84%|████████▍ | 8378/10000 [1:47:13<18:31, 1.46it/s, loss=0.0033, lr=4.08e-06, step=8377] Training: 84%|████████▍ | 8378/10000 [1:47:13<18:31, 1.46it/s, loss=0.0070, lr=4.08e-06, step=8378] Training: 84%|████████▍ | 8379/10000 [1:47:14<19:36, 1.38it/s, loss=0.0070, lr=4.08e-06, step=8378] Training: 84%|████████▍ | 8379/10000 [1:47:14<19:36, 1.38it/s, loss=0.0099, lr=4.08e-06, step=8379]17:53:21.645 [I] step=8380 loss=0.0053 smoothed_loss=0.0101 lr=4.09e-06 grad_norm=0.3796 step_time=0.5895s data_time=0.1260s it/s=1.398 eta_to_10000=1158.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0134 grad_action_out_proj=0.1030 grad_shared_expert=0.2899 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8380/10000 [1:47:15<17:58, 1.50it/s, loss=0.0099, lr=4.08e-06, step=8379] Training: 84%|████████▍ | 8380/10000 [1:47:15<17:58, 1.50it/s, loss=0.0053, lr=4.08e-06, step=8380] Training: 84%|████████▍ | 8381/10000 [1:47:15<16:41, 1.62it/s, loss=0.0053, lr=4.08e-06, step=8380] Training: 84%|████████▍ | 8381/10000 [1:47:15<16:41, 1.62it/s, loss=0.0016, lr=4.08e-06, step=8381] Training: 84%|████████▍ | 8382/10000 [1:47:16<19:21, 1.39it/s, loss=0.0016, lr=4.08e-06, step=8381] Training: 84%|████████▍ | 8382/10000 [1:47:16<19:21, 1.39it/s, loss=0.0039, lr=4.07e-06, step=8382] Training: 84%|████████▍ | 8383/10000 [1:47:17<19:35, 1.38it/s, loss=0.0039, lr=4.07e-06, step=8382] Training: 84%|████████▍ | 8383/10000 [1:47:17<19:35, 1.38it/s, loss=0.0115, lr=4.07e-06, step=8383] Training: 84%|████████▍ | 8384/10000 [1:47:17<17:44, 1.52it/s, loss=0.0115, lr=4.07e-06, step=8383] Training: 84%|████████▍ | 8384/10000 [1:47:17<17:44, 1.52it/s, loss=0.0022, lr=4.07e-06, step=8384] Training: 84%|████████▍ | 8385/10000 [1:47:19<21:58, 1.22it/s, loss=0.0022, lr=4.07e-06, step=8384] Training: 84%|████████▍ | 8385/10000 [1:47:19<21:58, 1.22it/s, loss=0.0112, lr=4.07e-06, step=8385] Training: 84%|████████▍ | 8386/10000 [1:47:19<21:26, 1.25it/s, loss=0.0112, lr=4.07e-06, step=8385] Training: 84%|████████▍ | 8386/10000 [1:47:19<21:26, 1.25it/s, loss=0.0069, lr=4.07e-06, step=8386] Training: 84%|████████▍ | 8387/10000 [1:47:20<20:08, 1.33it/s, loss=0.0069, lr=4.07e-06, step=8386] Training: 84%|████████▍ | 8387/10000 [1:47:20<20:08, 1.33it/s, loss=0.0238, lr=4.06e-06, step=8387] Training: 84%|████████▍ | 8388/10000 [1:47:21<19:20, 1.39it/s, loss=0.0238, lr=4.06e-06, step=8387] Training: 84%|████████▍ | 8388/10000 [1:47:21<19:20, 1.39it/s, loss=0.0478, lr=4.06e-06, step=8388] Training: 84%|████████▍ | 8389/10000 [1:47:21<19:39, 1.37it/s, loss=0.0478, lr=4.06e-06, step=8388] Training: 84%|████████▍ | 8389/10000 [1:47:21<19:39, 1.37it/s, loss=0.0265, lr=4.06e-06, step=8389]17:53:29.286 [I] step=8390 loss=0.0012 smoothed_loss=0.0136 lr=4.07e-06 grad_norm=0.5205 step_time=0.6023s data_time=0.1620s it/s=1.309 eta_to_10000=1230.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0213 grad_action_out_proj=0.1226 grad_shared_expert=0.6062 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8390/10000 [1:47:22<21:23, 1.25it/s, loss=0.0265, lr=4.06e-06, step=8389] Training: 84%|████████▍ | 8390/10000 [1:47:22<21:23, 1.25it/s, loss=0.0012, lr=4.06e-06, step=8390] Training: 84%|████████▍ | 8391/10000 [1:47:23<21:03, 1.27it/s, loss=0.0012, lr=4.06e-06, step=8390] Training: 84%|████████▍ | 8391/10000 [1:47:23<21:03, 1.27it/s, loss=0.0172, lr=4.06e-06, step=8391] Training: 84%|████████▍ | 8392/10000 [1:47:24<22:40, 1.18it/s, loss=0.0172, lr=4.06e-06, step=8391] Training: 84%|████████▍ | 8392/10000 [1:47:24<22:40, 1.18it/s, loss=0.0019, lr=4.06e-06, step=8392] Training: 84%|████████▍ | 8393/10000 [1:47:25<19:51, 1.35it/s, loss=0.0019, lr=4.06e-06, step=8392] Training: 84%|████████▍ | 8393/10000 [1:47:25<19:51, 1.35it/s, loss=0.0071, lr=4.05e-06, step=8393] Training: 84%|████████▍ | 8394/10000 [1:47:25<20:34, 1.30it/s, loss=0.0071, lr=4.05e-06, step=8393] Training: 84%|████████▍ | 8394/10000 [1:47:25<20:34, 1.30it/s, loss=0.0067, lr=4.05e-06, step=8394] Training: 84%|████████▍ | 8395/10000 [1:47:26<18:35, 1.44it/s, loss=0.0067, lr=4.05e-06, step=8394] Training: 84%|████████▍ | 8395/10000 [1:47:26<18:35, 1.44it/s, loss=0.0067, lr=4.05e-06, step=8395] Training: 84%|████████▍ | 8396/10000 [1:47:26<17:15, 1.55it/s, loss=0.0067, lr=4.05e-06, step=8395] Training: 84%|████████▍ | 8396/10000 [1:47:26<17:15, 1.55it/s, loss=0.0028, lr=4.05e-06, step=8396] Training: 84%|████████▍ | 8397/10000 [1:47:27<19:42, 1.36it/s, loss=0.0028, lr=4.05e-06, step=8396] Training: 84%|████████▍ | 8397/10000 [1:47:27<19:42, 1.36it/s, loss=0.0192, lr=4.05e-06, step=8397] Training: 84%|████████▍ | 8398/10000 [1:47:28<17:58, 1.49it/s, loss=0.0192, lr=4.05e-06, step=8397] Training: 84%|████████▍ | 8398/10000 [1:47:28<17:58, 1.49it/s, loss=0.0058, lr=4.04e-06, step=8398] Training: 84%|████████▍ | 8399/10000 [1:47:29<22:19, 1.20it/s, loss=0.0058, lr=4.04e-06, step=8398] Training: 84%|████████▍ | 8399/10000 [1:47:29<22:19, 1.20it/s, loss=0.0054, lr=4.04e-06, step=8399]17:53:36.832 [I] step=8400 loss=0.0021 smoothed_loss=0.0093 lr=4.05e-06 grad_norm=0.4245 step_time=0.6133s data_time=0.1412s it/s=1.326 eta_to_10000=1207.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0065 grad_action_out_proj=0.0832 grad_shared_expert=0.4161 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8400/10000 [1:47:30<21:24, 1.25it/s, loss=0.0054, lr=4.04e-06, step=8399] Training: 84%|████████▍ | 8400/10000 [1:47:30<21:24, 1.25it/s, loss=0.0021, lr=4.04e-06, step=8400] Training: 84%|████████▍ | 8401/10000 [1:47:31<20:55, 1.27it/s, loss=0.0021, lr=4.04e-06, step=8400] Training: 84%|████████▍ | 8401/10000 [1:47:31<20:55, 1.27it/s, loss=0.0039, lr=4.04e-06, step=8401] Training: 84%|████████▍ | 8402/10000 [1:47:31<20:21, 1.31it/s, loss=0.0039, lr=4.04e-06, step=8401] Training: 84%|████████▍ | 8402/10000 [1:47:31<20:21, 1.31it/s, loss=0.0030, lr=4.04e-06, step=8402] Training: 84%|████████▍ | 8403/10000 [1:47:32<20:12, 1.32it/s, loss=0.0030, lr=4.04e-06, step=8402] Training: 84%|████████▍ | 8403/10000 [1:47:32<20:12, 1.32it/s, loss=0.0346, lr=4.03e-06, step=8403] Training: 84%|████████▍ | 8404/10000 [1:47:33<18:05, 1.47it/s, loss=0.0346, lr=4.03e-06, step=8403] Training: 84%|████████▍ | 8404/10000 [1:47:33<18:05, 1.47it/s, loss=0.0010, lr=4.03e-06, step=8404] Training: 84%|████████▍ | 8405/10000 [1:47:34<20:56, 1.27it/s, loss=0.0010, lr=4.03e-06, step=8404] Training: 84%|████████▍ | 8405/10000 [1:47:34<20:56, 1.27it/s, loss=0.0154, lr=4.03e-06, step=8405] Training: 84%|████████▍ | 8406/10000 [1:47:35<21:59, 1.21it/s, loss=0.0154, lr=4.03e-06, step=8405] Training: 84%|████████▍ | 8406/10000 [1:47:35<21:59, 1.21it/s, loss=0.0088, lr=4.03e-06, step=8406] Training: 84%|████████▍ | 8407/10000 [1:47:35<20:10, 1.32it/s, loss=0.0088, lr=4.03e-06, step=8406] Training: 84%|████████▍ | 8407/10000 [1:47:35<20:10, 1.32it/s, loss=0.0103, lr=4.03e-06, step=8407] Training: 84%|████████▍ | 8408/10000 [1:47:36<21:04, 1.26it/s, loss=0.0103, lr=4.03e-06, step=8407] Training: 84%|████████▍ | 8408/10000 [1:47:36<21:04, 1.26it/s, loss=0.0017, lr=4.03e-06, step=8408] Training: 84%|████████▍ | 8409/10000 [1:47:37<21:12, 1.25it/s, loss=0.0017, lr=4.03e-06, step=8408] Training: 84%|████████▍ | 8409/10000 [1:47:37<21:12, 1.25it/s, loss=0.0169, lr=4.02e-06, step=8409]17:53:44.691 [I] step=8410 loss=0.0029 smoothed_loss=0.0094 lr=4.03e-06 grad_norm=0.4027 step_time=0.6320s data_time=0.1539s it/s=1.273 eta_to_10000=1249.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0135 grad_action_out_proj=0.1045 grad_shared_expert=0.3604 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8410/10000 [1:47:38<22:02, 1.20it/s, loss=0.0169, lr=4.02e-06, step=8409] Training: 84%|████████▍ | 8410/10000 [1:47:38<22:02, 1.20it/s, loss=0.0029, lr=4.02e-06, step=8410] Training: 84%|████████▍ | 8411/10000 [1:47:39<22:07, 1.20it/s, loss=0.0029, lr=4.02e-06, step=8410] Training: 84%|████████▍ | 8411/10000 [1:47:39<22:07, 1.20it/s, loss=0.0079, lr=4.02e-06, step=8411] Training: 84%|████████▍ | 8412/10000 [1:47:40<23:38, 1.12it/s, loss=0.0079, lr=4.02e-06, step=8411] Training: 84%|████████▍ | 8412/10000 [1:47:40<23:38, 1.12it/s, loss=0.0009, lr=4.02e-06, step=8412] Training: 84%|████████▍ | 8413/10000 [1:47:41<24:40, 1.07it/s, loss=0.0009, lr=4.02e-06, step=8412] Training: 84%|████████▍ | 8413/10000 [1:47:41<24:40, 1.07it/s, loss=0.0260, lr=4.02e-06, step=8413] Training: 84%|████████▍ | 8414/10000 [1:47:42<26:32, 1.00s/it, loss=0.0260, lr=4.02e-06, step=8413] Training: 84%|████████▍ | 8414/10000 [1:47:42<26:32, 1.00s/it, loss=0.0085, lr=4.01e-06, step=8414] Training: 84%|████████▍ | 8415/10000 [1:47:43<26:16, 1.01it/s, loss=0.0085, lr=4.01e-06, step=8414] Training: 84%|████████▍ | 8415/10000 [1:47:43<26:16, 1.01it/s, loss=0.0337, lr=4.01e-06, step=8415] Training: 84%|████████▍ | 8416/10000 [1:47:44<26:29, 1.00s/it, loss=0.0337, lr=4.01e-06, step=8415] Training: 84%|████████▍ | 8416/10000 [1:47:44<26:29, 1.00s/it, loss=0.0098, lr=4.01e-06, step=8416] Training: 84%|████████▍ | 8417/10000 [1:47:44<23:17, 1.13it/s, loss=0.0098, lr=4.01e-06, step=8416] Training: 84%|████████▍ | 8417/10000 [1:47:44<23:17, 1.13it/s, loss=0.0051, lr=4.01e-06, step=8417] Training: 84%|████████▍ | 8418/10000 [1:47:45<21:26, 1.23it/s, loss=0.0051, lr=4.01e-06, step=8417] Training: 84%|████████▍ | 8418/10000 [1:47:45<21:26, 1.23it/s, loss=0.0135, lr=4.01e-06, step=8418] Training: 84%|████████▍ | 8419/10000 [1:47:46<20:35, 1.28it/s, loss=0.0135, lr=4.01e-06, step=8418] Training: 84%|████████▍ | 8419/10000 [1:47:46<20:35, 1.28it/s, loss=0.0059, lr=4.00e-06, step=8419]17:53:53.592 [I] step=8420 loss=0.0062 smoothed_loss=0.0106 lr=4.01e-06 grad_norm=0.4336 step_time=0.6853s data_time=0.2049s it/s=1.124 eta_to_10000=1406.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0059 grad_action_out_proj=0.0606 grad_shared_expert=0.3500 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8420/10000 [1:47:47<21:20, 1.23it/s, loss=0.0059, lr=4.00e-06, step=8419] Training: 84%|████████▍ | 8420/10000 [1:47:47<21:20, 1.23it/s, loss=0.0062, lr=4.00e-06, step=8420] Training: 84%|████████▍ | 8421/10000 [1:47:48<23:06, 1.14it/s, loss=0.0062, lr=4.00e-06, step=8420] Training: 84%|████████▍ | 8421/10000 [1:47:48<23:06, 1.14it/s, loss=0.0035, lr=4.00e-06, step=8421] Training: 84%|████████▍ | 8422/10000 [1:47:49<22:40, 1.16it/s, loss=0.0035, lr=4.00e-06, step=8421] Training: 84%|████████▍ | 8422/10000 [1:47:49<22:40, 1.16it/s, loss=0.0032, lr=4.00e-06, step=8422] Training: 84%|████████▍ | 8423/10000 [1:47:49<22:49, 1.15it/s, loss=0.0032, lr=4.00e-06, step=8422] Training: 84%|████████▍ | 8423/10000 [1:47:49<22:49, 1.15it/s, loss=0.0031, lr=4.00e-06, step=8423] Training: 84%|████████▍ | 8424/10000 [1:47:50<22:47, 1.15it/s, loss=0.0031, lr=4.00e-06, step=8423] Training: 84%|████████▍ | 8424/10000 [1:47:50<22:47, 1.15it/s, loss=0.0775, lr=4.00e-06, step=8424] Training: 84%|████████▍ | 8425/10000 [1:47:51<24:22, 1.08it/s, loss=0.0775, lr=4.00e-06, step=8424] Training: 84%|████████▍ | 8425/10000 [1:47:51<24:22, 1.08it/s, loss=0.0114, lr=3.99e-06, step=8425] Training: 84%|████████▍ | 8426/10000 [1:47:52<25:17, 1.04it/s, loss=0.0114, lr=3.99e-06, step=8425] Training: 84%|████████▍ | 8426/10000 [1:47:52<25:17, 1.04it/s, loss=0.0082, lr=3.99e-06, step=8426] Training: 84%|████████▍ | 8427/10000 [1:47:53<22:06, 1.19it/s, loss=0.0082, lr=3.99e-06, step=8426] Training: 84%|████████▍ | 8427/10000 [1:47:53<22:06, 1.19it/s, loss=0.0009, lr=3.99e-06, step=8427] Training: 84%|████████▍ | 8428/10000 [1:47:54<21:07, 1.24it/s, loss=0.0009, lr=3.99e-06, step=8427] Training: 84%|████████▍ | 8428/10000 [1:47:54<21:07, 1.24it/s, loss=0.0016, lr=3.99e-06, step=8428] Training: 84%|████████▍ | 8429/10000 [1:47:55<21:38, 1.21it/s, loss=0.0016, lr=3.99e-06, step=8428] Training: 84%|████████▍ | 8429/10000 [1:47:55<21:38, 1.21it/s, loss=0.0081, lr=3.99e-06, step=8429]17:54:02.019 [I] step=8430 loss=0.0547 smoothed_loss=0.0158 lr=3.99e-06 grad_norm=0.4875 step_time=0.6620s data_time=0.1807s it/s=1.187 eta_to_10000=1322.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0278 grad_action_out_proj=0.1624 grad_shared_expert=0.4405 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8430/10000 [1:47:55<19:24, 1.35it/s, loss=0.0081, lr=3.99e-06, step=8429] Training: 84%|████████▍ | 8430/10000 [1:47:55<19:24, 1.35it/s, loss=0.0547, lr=3.98e-06, step=8430] Training: 84%|████████▍ | 8431/10000 [1:47:56<17:27, 1.50it/s, loss=0.0547, lr=3.98e-06, step=8430] Training: 84%|████████▍ | 8431/10000 [1:47:56<17:27, 1.50it/s, loss=0.0230, lr=3.98e-06, step=8431] Training: 84%|████████▍ | 8432/10000 [1:47:56<16:06, 1.62it/s, loss=0.0230, lr=3.98e-06, step=8431] Training: 84%|████████▍ | 8432/10000 [1:47:56<16:06, 1.62it/s, loss=0.0058, lr=3.98e-06, step=8432] Training: 84%|████████▍ | 8433/10000 [1:47:57<18:19, 1.43it/s, loss=0.0058, lr=3.98e-06, step=8432] Training: 84%|████████▍ | 8433/10000 [1:47:57<18:19, 1.43it/s, loss=0.0028, lr=3.98e-06, step=8433] Training: 84%|████████▍ | 8434/10000 [1:47:58<16:56, 1.54it/s, loss=0.0028, lr=3.98e-06, step=8433] Training: 84%|████████▍ | 8434/10000 [1:47:58<16:56, 1.54it/s, loss=0.0063, lr=3.98e-06, step=8434] Training: 84%|████████▍ | 8435/10000 [1:47:58<18:46, 1.39it/s, loss=0.0063, lr=3.98e-06, step=8434] Training: 84%|████████▍ | 8435/10000 [1:47:58<18:46, 1.39it/s, loss=0.0151, lr=3.98e-06, step=8435] Training: 84%|████████▍ | 8436/10000 [1:47:59<19:10, 1.36it/s, loss=0.0151, lr=3.98e-06, step=8435] Training: 84%|████████▍ | 8436/10000 [1:47:59<19:10, 1.36it/s, loss=0.0082, lr=3.97e-06, step=8436] Training: 84%|████████▍ | 8437/10000 [1:48:00<17:51, 1.46it/s, loss=0.0082, lr=3.97e-06, step=8436] Training: 84%|████████▍ | 8437/10000 [1:48:00<17:51, 1.46it/s, loss=0.0057, lr=3.97e-06, step=8437] Training: 84%|████████▍ | 8438/10000 [1:48:00<17:50, 1.46it/s, loss=0.0057, lr=3.97e-06, step=8437] Training: 84%|████████▍ | 8438/10000 [1:48:00<17:50, 1.46it/s, loss=0.0188, lr=3.97e-06, step=8438] Training: 84%|████████▍ | 8439/10000 [1:48:01<17:32, 1.48it/s, loss=0.0188, lr=3.97e-06, step=8438] Training: 84%|████████▍ | 8439/10000 [1:48:01<17:32, 1.48it/s, loss=0.0085, lr=3.97e-06, step=8439]17:54:08.873 [I] step=8440 loss=0.0040 smoothed_loss=0.0117 lr=3.97e-06 grad_norm=0.3852 step_time=0.5500s data_time=0.1354s it/s=1.459 eta_to_10000=1069.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0055 grad_action_out_proj=0.0394 grad_shared_expert=0.4089 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8440/10000 [1:48:02<19:06, 1.36it/s, loss=0.0085, lr=3.97e-06, step=8439] Training: 84%|████████▍ | 8440/10000 [1:48:02<19:06, 1.36it/s, loss=0.0040, lr=3.97e-06, step=8440] Training: 84%|████████▍ | 8441/10000 [1:48:02<17:42, 1.47it/s, loss=0.0040, lr=3.97e-06, step=8440] Training: 84%|████████▍ | 8441/10000 [1:48:02<17:42, 1.47it/s, loss=0.0233, lr=3.96e-06, step=8441] Training: 84%|████████▍ | 8442/10000 [1:48:03<18:37, 1.39it/s, loss=0.0233, lr=3.96e-06, step=8441] Training: 84%|████████▍ | 8442/10000 [1:48:03<18:37, 1.39it/s, loss=0.0033, lr=3.96e-06, step=8442] Training: 84%|████████▍ | 8443/10000 [1:48:04<17:38, 1.47it/s, loss=0.0033, lr=3.96e-06, step=8442] Training: 84%|████████▍ | 8443/10000 [1:48:04<17:38, 1.47it/s, loss=0.0075, lr=3.96e-06, step=8443] Training: 84%|████████▍ | 8444/10000 [1:48:05<18:19, 1.42it/s, loss=0.0075, lr=3.96e-06, step=8443] Training: 84%|████████▍ | 8444/10000 [1:48:05<18:19, 1.42it/s, loss=0.0583, lr=3.96e-06, step=8444] Training: 84%|████████▍ | 8445/10000 [1:48:06<20:33, 1.26it/s, loss=0.0583, lr=3.96e-06, step=8444] Training: 84%|████████▍ | 8445/10000 [1:48:06<20:33, 1.26it/s, loss=0.0070, lr=3.96e-06, step=8445] Training: 84%|████████▍ | 8446/10000 [1:48:07<23:03, 1.12it/s, loss=0.0070, lr=3.96e-06, step=8445] Training: 84%|████████▍ | 8446/10000 [1:48:07<23:03, 1.12it/s, loss=0.0077, lr=3.95e-06, step=8446] Training: 84%|████████▍ | 8447/10000 [1:48:08<23:21, 1.11it/s, loss=0.0077, lr=3.95e-06, step=8446] Training: 84%|████████▍ | 8447/10000 [1:48:08<23:21, 1.11it/s, loss=0.0068, lr=3.95e-06, step=8447] Training: 84%|████████▍ | 8448/10000 [1:48:08<20:03, 1.29it/s, loss=0.0068, lr=3.95e-06, step=8447] Training: 84%|████████▍ | 8448/10000 [1:48:08<20:03, 1.29it/s, loss=0.0021, lr=3.95e-06, step=8448] Training: 84%|████████▍ | 8449/10000 [1:48:09<22:49, 1.13it/s, loss=0.0021, lr=3.95e-06, step=8448] Training: 84%|████████▍ | 8449/10000 [1:48:09<22:49, 1.13it/s, loss=0.0097, lr=3.95e-06, step=8449]17:54:16.773 [I] step=8450 loss=0.0268 smoothed_loss=0.0137 lr=3.96e-06 grad_norm=0.4530 step_time=0.6235s data_time=0.1665s it/s=1.266 eta_to_10000=1224.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0169 grad_action_out_proj=0.1931 grad_shared_expert=0.4687 (10775:train_pytorch.py:850) + Training: 84%|████████▍ | 8450/10000 [1:48:10<20:02, 1.29it/s, loss=0.0097, lr=3.95e-06, step=8449] Training: 84%|████████▍ | 8450/10000 [1:48:10<20:02, 1.29it/s, loss=0.0268, lr=3.95e-06, step=8450] Training: 85%|████████▍ | 8451/10000 [1:48:11<19:34, 1.32it/s, loss=0.0268, lr=3.95e-06, step=8450] Training: 85%|████████▍ | 8451/10000 [1:48:11<19:34, 1.32it/s, loss=0.0125, lr=3.95e-06, step=8451] Training: 85%|████████▍ | 8452/10000 [1:48:11<19:30, 1.32it/s, loss=0.0125, lr=3.95e-06, step=8451] Training: 85%|████████▍ | 8452/10000 [1:48:11<19:30, 1.32it/s, loss=0.0050, lr=3.94e-06, step=8452] Training: 85%|████████▍ | 8453/10000 [1:48:12<19:16, 1.34it/s, loss=0.0050, lr=3.94e-06, step=8452] Training: 85%|████████▍ | 8453/10000 [1:48:12<19:16, 1.34it/s, loss=0.0680, lr=3.94e-06, step=8453] Training: 85%|████████▍ | 8454/10000 [1:48:13<21:26, 1.20it/s, loss=0.0680, lr=3.94e-06, step=8453] Training: 85%|████████▍ | 8454/10000 [1:48:13<21:26, 1.20it/s, loss=0.0075, lr=3.94e-06, step=8454] Training: 85%|████████▍ | 8455/10000 [1:48:14<21:08, 1.22it/s, loss=0.0075, lr=3.94e-06, step=8454] Training: 85%|████████▍ | 8455/10000 [1:48:14<21:08, 1.22it/s, loss=0.0052, lr=3.94e-06, step=8455] Training: 85%|████████▍ | 8456/10000 [1:48:15<20:08, 1.28it/s, loss=0.0052, lr=3.94e-06, step=8455] Training: 85%|████████▍ | 8456/10000 [1:48:15<20:08, 1.28it/s, loss=0.0128, lr=3.94e-06, step=8456] Training: 85%|████████▍ | 8457/10000 [1:48:16<21:36, 1.19it/s, loss=0.0128, lr=3.94e-06, step=8456] Training: 85%|████████▍ | 8457/10000 [1:48:16<21:36, 1.19it/s, loss=0.0024, lr=3.93e-06, step=8457] Training: 85%|████████▍ | 8458/10000 [1:48:16<19:30, 1.32it/s, loss=0.0024, lr=3.93e-06, step=8457] Training: 85%|████████▍ | 8458/10000 [1:48:16<19:30, 1.32it/s, loss=0.0026, lr=3.93e-06, step=8458] Training: 85%|████████▍ | 8459/10000 [1:48:17<19:08, 1.34it/s, loss=0.0026, lr=3.93e-06, step=8458] Training: 85%|████████▍ | 8459/10000 [1:48:17<19:08, 1.34it/s, loss=0.0012, lr=3.93e-06, step=8459]17:54:24.331 [I] step=8460 loss=0.0141 smoothed_loss=0.0122 lr=3.94e-06 grad_norm=0.3932 step_time=0.5777s data_time=0.1781s it/s=1.323 eta_to_10000=1163.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0191 grad_action_out_proj=0.1244 grad_shared_expert=0.5898 (10775:train_pytorch.py:850) + Training: 85%|████████▍ | 8460/10000 [1:48:17<17:54, 1.43it/s, loss=0.0012, lr=3.93e-06, step=8459] Training: 85%|████████▍ | 8460/10000 [1:48:17<17:54, 1.43it/s, loss=0.0141, lr=3.93e-06, step=8460] Training: 85%|████████▍ | 8461/10000 [1:48:18<20:10, 1.27it/s, loss=0.0141, lr=3.93e-06, step=8460] Training: 85%|████████▍ | 8461/10000 [1:48:18<20:10, 1.27it/s, loss=0.0067, lr=3.93e-06, step=8461] Training: 85%|████████▍ | 8462/10000 [1:48:19<19:28, 1.32it/s, loss=0.0067, lr=3.93e-06, step=8461] Training: 85%|████████▍ | 8462/10000 [1:48:19<19:28, 1.32it/s, loss=0.0058, lr=3.93e-06, step=8462] Training: 85%|████████▍ | 8463/10000 [1:48:20<20:33, 1.25it/s, loss=0.0058, lr=3.93e-06, step=8462] Training: 85%|████████▍ | 8463/10000 [1:48:20<20:33, 1.25it/s, loss=0.0103, lr=3.92e-06, step=8463] Training: 85%|████████▍ | 8464/10000 [1:48:21<23:29, 1.09it/s, loss=0.0103, lr=3.92e-06, step=8463] Training: 85%|████████▍ | 8464/10000 [1:48:21<23:29, 1.09it/s, loss=0.0099, lr=3.92e-06, step=8464] Training: 85%|████████▍ | 8465/10000 [1:48:22<22:41, 1.13it/s, loss=0.0099, lr=3.92e-06, step=8464] Training: 85%|████████▍ | 8465/10000 [1:48:22<22:41, 1.13it/s, loss=0.0016, lr=3.92e-06, step=8465] Training: 85%|████████▍ | 8466/10000 [1:48:23<21:58, 1.16it/s, loss=0.0016, lr=3.92e-06, step=8465] Training: 85%|████████▍ | 8466/10000 [1:48:23<21:58, 1.16it/s, loss=0.0012, lr=3.92e-06, step=8466] Training: 85%|████████▍ | 8467/10000 [1:48:23<19:22, 1.32it/s, loss=0.0012, lr=3.92e-06, step=8466] Training: 85%|████████▍ | 8467/10000 [1:48:23<19:22, 1.32it/s, loss=0.0226, lr=3.92e-06, step=8467] Training: 85%|████████▍ | 8468/10000 [1:48:24<19:24, 1.32it/s, loss=0.0226, lr=3.92e-06, step=8467] Training: 85%|████████▍ | 8468/10000 [1:48:24<19:24, 1.32it/s, loss=0.0178, lr=3.91e-06, step=8468] Training: 85%|████████▍ | 8469/10000 [1:48:25<20:30, 1.24it/s, loss=0.0178, lr=3.91e-06, step=8468] Training: 85%|████████▍ | 8469/10000 [1:48:25<20:30, 1.24it/s, loss=0.0125, lr=3.91e-06, step=8469]17:54:32.432 [I] step=8470 loss=0.0029 smoothed_loss=0.0105 lr=3.92e-06 grad_norm=0.4605 step_time=0.6310s data_time=0.1791s it/s=1.235 eta_to_10000=1239.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.0854 grad_shared_expert=0.3956 (10775:train_pytorch.py:850) + Training: 85%|████████▍ | 8470/10000 [1:48:25<18:18, 1.39it/s, loss=0.0125, lr=3.91e-06, step=8469] Training: 85%|████████▍ | 8470/10000 [1:48:25<18:18, 1.39it/s, loss=0.0029, lr=3.91e-06, step=8470] Training: 85%|████████▍ | 8471/10000 [1:48:27<21:17, 1.20it/s, loss=0.0029, lr=3.91e-06, step=8470] Training: 85%|████████▍ | 8471/10000 [1:48:27<21:17, 1.20it/s, loss=0.0014, lr=3.91e-06, step=8471] Training: 85%|████████▍ | 8472/10000 [1:48:28<22:23, 1.14it/s, loss=0.0014, lr=3.91e-06, step=8471] Training: 85%|████████▍ | 8472/10000 [1:48:28<22:23, 1.14it/s, loss=0.0172, lr=3.91e-06, step=8472] Training: 85%|████████▍ | 8473/10000 [1:48:29<23:37, 1.08it/s, loss=0.0172, lr=3.91e-06, step=8472] Training: 85%|████████▍ | 8473/10000 [1:48:29<23:37, 1.08it/s, loss=0.0168, lr=3.91e-06, step=8473] Training: 85%|████████▍ | 8474/10000 [1:48:30<23:27, 1.08it/s, loss=0.0168, lr=3.91e-06, step=8473] Training: 85%|████████▍ | 8474/10000 [1:48:30<23:27, 1.08it/s, loss=0.0027, lr=3.90e-06, step=8474] Training: 85%|████████▍ | 8475/10000 [1:48:30<20:17, 1.25it/s, loss=0.0027, lr=3.90e-06, step=8474] Training: 85%|████████▍ | 8475/10000 [1:48:30<20:17, 1.25it/s, loss=0.0183, lr=3.90e-06, step=8475] Training: 85%|████████▍ | 8476/10000 [1:48:31<20:26, 1.24it/s, loss=0.0183, lr=3.90e-06, step=8475] Training: 85%|████████▍ | 8476/10000 [1:48:31<20:26, 1.24it/s, loss=0.0037, lr=3.90e-06, step=8476] Training: 85%|████████▍ | 8477/10000 [1:48:32<20:17, 1.25it/s, loss=0.0037, lr=3.90e-06, step=8476] Training: 85%|████████▍ | 8477/10000 [1:48:32<20:17, 1.25it/s, loss=0.0089, lr=3.90e-06, step=8477] Training: 85%|████████▍ | 8478/10000 [1:48:33<22:27, 1.13it/s, loss=0.0089, lr=3.90e-06, step=8477] Training: 85%|████████▍ | 8478/10000 [1:48:33<22:27, 1.13it/s, loss=0.0041, lr=3.90e-06, step=8478] Training: 85%|████████▍ | 8479/10000 [1:48:33<19:51, 1.28it/s, loss=0.0041, lr=3.90e-06, step=8478] Training: 85%|████████▍ | 8479/10000 [1:48:33<19:51, 1.28it/s, loss=0.0029, lr=3.90e-06, step=8479]17:54:41.021 [I] step=8480 loss=0.0025 smoothed_loss=0.0082 lr=3.90e-06 grad_norm=0.4560 step_time=0.6600s data_time=0.1989s it/s=1.164 eta_to_10000=1305.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.0887 grad_shared_expert=0.4670 (10775:train_pytorch.py:850) + Training: 85%|████████▍ | 8480/10000 [1:48:34<19:58, 1.27it/s, loss=0.0029, lr=3.90e-06, step=8479] Training: 85%|████████▍ | 8480/10000 [1:48:34<19:58, 1.27it/s, loss=0.0025, lr=3.89e-06, step=8480] Training: 85%|████████▍ | 8481/10000 [1:48:35<21:24, 1.18it/s, loss=0.0025, lr=3.89e-06, step=8480] Training: 85%|████████▍ | 8481/10000 [1:48:35<21:24, 1.18it/s, loss=0.0024, lr=3.89e-06, step=8481] Training: 85%|████████▍ | 8482/10000 [1:48:36<23:14, 1.09it/s, loss=0.0024, lr=3.89e-06, step=8481] Training: 85%|████████▍ | 8482/10000 [1:48:36<23:14, 1.09it/s, loss=0.0018, lr=3.89e-06, step=8482] Training: 85%|████████▍ | 8483/10000 [1:48:37<23:54, 1.06it/s, loss=0.0018, lr=3.89e-06, step=8482] Training: 85%|████████▍ | 8483/10000 [1:48:37<23:54, 1.06it/s, loss=0.0088, lr=3.89e-06, step=8483] Training: 85%|████████▍ | 8484/10000 [1:48:38<20:19, 1.24it/s, loss=0.0088, lr=3.89e-06, step=8483] Training: 85%|████████▍ | 8484/10000 [1:48:38<20:19, 1.24it/s, loss=0.0188, lr=3.89e-06, step=8484] Training: 85%|████████▍ | 8485/10000 [1:48:39<22:06, 1.14it/s, loss=0.0188, lr=3.89e-06, step=8484] Training: 85%|████████▍ | 8485/10000 [1:48:39<22:06, 1.14it/s, loss=0.0025, lr=3.88e-06, step=8485] Training: 85%|████████▍ | 8486/10000 [1:48:40<23:00, 1.10it/s, loss=0.0025, lr=3.88e-06, step=8485] Training: 85%|████████▍ | 8486/10000 [1:48:40<23:00, 1.10it/s, loss=0.0262, lr=3.88e-06, step=8486] Training: 85%|████████▍ | 8487/10000 [1:48:40<20:16, 1.24it/s, loss=0.0262, lr=3.88e-06, step=8486] Training: 85%|████████▍ | 8487/10000 [1:48:40<20:16, 1.24it/s, loss=0.0022, lr=3.88e-06, step=8487] Training: 85%|████████▍ | 8488/10000 [1:48:41<20:24, 1.24it/s, loss=0.0022, lr=3.88e-06, step=8487] Training: 85%|████████▍ | 8488/10000 [1:48:41<20:24, 1.24it/s, loss=0.0156, lr=3.88e-06, step=8488] Training: 85%|████████▍ | 8489/10000 [1:48:42<17:54, 1.41it/s, loss=0.0156, lr=3.88e-06, step=8488] Training: 85%|████████▍ | 8489/10000 [1:48:42<17:54, 1.41it/s, loss=0.0035, lr=3.88e-06, step=8489]17:54:49.487 [I] step=8490 loss=0.0017 smoothed_loss=0.0082 lr=3.88e-06 grad_norm=0.4296 step_time=0.6420s data_time=0.2045s it/s=1.181 eta_to_10000=1278.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0052 grad_action_out_proj=0.0706 grad_shared_expert=0.3212 (10775:train_pytorch.py:850) + Training: 85%|████████▍ | 8490/10000 [1:48:43<20:13, 1.24it/s, loss=0.0035, lr=3.88e-06, step=8489] Training: 85%|████████▍ | 8490/10000 [1:48:43<20:13, 1.24it/s, loss=0.0017, lr=3.88e-06, step=8490] Training: 85%|████████▍ | 8491/10000 [1:48:43<17:57, 1.40it/s, loss=0.0017, lr=3.88e-06, step=8490] Training: 85%|████████▍ | 8491/10000 [1:48:43<17:57, 1.40it/s, loss=0.0079, lr=3.87e-06, step=8491] Training: 85%|████████▍ | 8492/10000 [1:48:44<17:51, 1.41it/s, loss=0.0079, lr=3.87e-06, step=8491] Training: 85%|████████▍ | 8492/10000 [1:48:44<17:51, 1.41it/s, loss=0.0035, lr=3.87e-06, step=8492] Training: 85%|████████▍ | 8493/10000 [1:48:44<16:20, 1.54it/s, loss=0.0035, lr=3.87e-06, step=8492] Training: 85%|████████▍ | 8493/10000 [1:48:44<16:20, 1.54it/s, loss=0.0054, lr=3.87e-06, step=8493] Training: 85%|████████▍ | 8494/10000 [1:48:45<15:19, 1.64it/s, loss=0.0054, lr=3.87e-06, step=8493] Training: 85%|████████▍ | 8494/10000 [1:48:45<15:19, 1.64it/s, loss=0.0140, lr=3.87e-06, step=8494] Training: 85%|████████▍ | 8495/10000 [1:48:45<14:20, 1.75it/s, loss=0.0140, lr=3.87e-06, step=8494] Training: 85%|████████▍ | 8495/10000 [1:48:45<14:20, 1.75it/s, loss=0.0019, lr=3.87e-06, step=8495] Training: 85%|████████▍ | 8496/10000 [1:48:46<13:52, 1.81it/s, loss=0.0019, lr=3.87e-06, step=8495] Training: 85%|████████▍ | 8496/10000 [1:48:46<13:52, 1.81it/s, loss=0.0125, lr=3.86e-06, step=8496] Training: 85%|████████▍ | 8497/10000 [1:48:46<14:47, 1.69it/s, loss=0.0125, lr=3.86e-06, step=8496] Training: 85%|████████▍ | 8497/10000 [1:48:46<14:47, 1.69it/s, loss=0.0187, lr=3.86e-06, step=8497] Training: 85%|████████▍ | 8498/10000 [1:48:47<14:04, 1.78it/s, loss=0.0187, lr=3.86e-06, step=8497] Training: 85%|████████▍ | 8498/10000 [1:48:47<14:04, 1.78it/s, loss=0.0014, lr=3.86e-06, step=8498] Training: 85%|████████▍ | 8499/10000 [1:48:47<13:33, 1.84it/s, loss=0.0014, lr=3.86e-06, step=8498] Training: 85%|████████▍ | 8499/10000 [1:48:47<13:33, 1.84it/s, loss=0.0072, lr=3.86e-06, step=8499]17:54:55.393 [I] step=8500 loss=0.0049 smoothed_loss=0.0079 lr=3.87e-06 grad_norm=0.3953 step_time=0.5193s data_time=0.0714s it/s=1.693 eta_to_10000=885.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0169 grad_action_out_proj=0.1439 grad_shared_expert=0.4403 (10775:train_pytorch.py:850) + Training: 85%|████████▌ | 8500/10000 [1:48:48<17:03, 1.47it/s, loss=0.0072, lr=3.86e-06, step=8499] Training: 85%|████████▌ | 8500/10000 [1:48:48<17:03, 1.47it/s, loss=0.0049, lr=3.86e-06, step=8500] Training: 85%|████████▌ | 8501/10000 [1:48:49<15:53, 1.57it/s, loss=0.0049, lr=3.86e-06, step=8500] Training: 85%|████████▌ | 8501/10000 [1:48:49<15:53, 1.57it/s, loss=0.0196, lr=3.86e-06, step=8501] Training: 85%|████████▌ | 8502/10000 [1:48:49<14:47, 1.69it/s, loss=0.0196, lr=3.86e-06, step=8501] Training: 85%|████████▌ | 8502/10000 [1:48:49<14:47, 1.69it/s, loss=0.0057, lr=3.85e-06, step=8502] Training: 85%|████████▌ | 8503/10000 [1:48:50<14:01, 1.78it/s, loss=0.0057, lr=3.85e-06, step=8502] Training: 85%|████████▌ | 8503/10000 [1:48:50<14:01, 1.78it/s, loss=0.0121, lr=3.85e-06, step=8503] Training: 85%|████████▌ | 8504/10000 [1:48:51<15:14, 1.64it/s, loss=0.0121, lr=3.85e-06, step=8503] Training: 85%|████████▌ | 8504/10000 [1:48:51<15:14, 1.64it/s, loss=0.0017, lr=3.85e-06, step=8504] Training: 85%|████████▌ | 8505/10000 [1:48:51<14:17, 1.74it/s, loss=0.0017, lr=3.85e-06, step=8504] Training: 85%|████████▌ | 8505/10000 [1:48:51<14:17, 1.74it/s, loss=0.0066, lr=3.85e-06, step=8505] Training: 85%|████████▌ | 8506/10000 [1:48:52<13:39, 1.82it/s, loss=0.0066, lr=3.85e-06, step=8505] Training: 85%|████████▌ | 8506/10000 [1:48:52<13:39, 1.82it/s, loss=0.0330, lr=3.85e-06, step=8506] Training: 85%|████████▌ | 8507/10000 [1:48:52<15:08, 1.64it/s, loss=0.0330, lr=3.85e-06, step=8506] Training: 85%|████████▌ | 8507/10000 [1:48:52<15:08, 1.64it/s, loss=0.0119, lr=3.85e-06, step=8507] Training: 85%|████████▌ | 8508/10000 [1:48:53<15:37, 1.59it/s, loss=0.0119, lr=3.85e-06, step=8507] Training: 85%|████████▌ | 8508/10000 [1:48:53<15:37, 1.59it/s, loss=0.0020, lr=3.84e-06, step=8508] Training: 85%|████████▌ | 8509/10000 [1:48:54<15:28, 1.61it/s, loss=0.0020, lr=3.84e-06, step=8508] Training: 85%|████████▌ | 8509/10000 [1:48:54<15:28, 1.61it/s, loss=0.0408, lr=3.84e-06, step=8509]17:55:01.380 [I] step=8510 loss=0.0021 smoothed_loss=0.0119 lr=3.85e-06 grad_norm=0.4711 step_time=0.5129s data_time=0.0858s it/s=1.670 eta_to_10000=892.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0167 grad_action_out_proj=0.1106 grad_shared_expert=0.3536 (10775:train_pytorch.py:850) + Training: 85%|████████▌ | 8510/10000 [1:48:54<16:21, 1.52it/s, loss=0.0408, lr=3.84e-06, step=8509] Training: 85%|████████▌ | 8510/10000 [1:48:54<16:21, 1.52it/s, loss=0.0021, lr=3.84e-06, step=8510] Training: 85%|████████▌ | 8511/10000 [1:48:55<16:41, 1.49it/s, loss=0.0021, lr=3.84e-06, step=8510] Training: 85%|████████▌ | 8511/10000 [1:48:55<16:41, 1.49it/s, loss=0.0048, lr=3.84e-06, step=8511] Training: 85%|████████▌ | 8512/10000 [1:48:56<17:58, 1.38it/s, loss=0.0048, lr=3.84e-06, step=8511] Training: 85%|████████▌ | 8512/10000 [1:48:56<17:58, 1.38it/s, loss=0.0061, lr=3.84e-06, step=8512] Training: 85%|████████▌ | 8513/10000 [1:48:57<18:35, 1.33it/s, loss=0.0061, lr=3.84e-06, step=8512] Training: 85%|████████▌ | 8513/10000 [1:48:57<18:35, 1.33it/s, loss=0.0177, lr=3.83e-06, step=8513] Training: 85%|████████▌ | 8514/10000 [1:48:58<20:33, 1.20it/s, loss=0.0177, lr=3.83e-06, step=8513] Training: 85%|████████▌ | 8514/10000 [1:48:58<20:33, 1.20it/s, loss=0.0025, lr=3.83e-06, step=8514] Training: 85%|████████▌ | 8515/10000 [1:48:58<18:39, 1.33it/s, loss=0.0025, lr=3.83e-06, step=8514] Training: 85%|████████▌ | 8515/10000 [1:48:58<18:39, 1.33it/s, loss=0.0241, lr=3.83e-06, step=8515] Training: 85%|████████▌ | 8516/10000 [1:48:59<16:37, 1.49it/s, loss=0.0241, lr=3.83e-06, step=8515] Training: 85%|████████▌ | 8516/10000 [1:48:59<16:37, 1.49it/s, loss=0.0073, lr=3.83e-06, step=8516] Training: 85%|████████▌ | 8517/10000 [1:48:59<15:32, 1.59it/s, loss=0.0073, lr=3.83e-06, step=8516] Training: 85%|████████▌ | 8517/10000 [1:48:59<15:32, 1.59it/s, loss=0.0073, lr=3.83e-06, step=8517] Training: 85%|████████▌ | 8518/10000 [1:49:00<15:43, 1.57it/s, loss=0.0073, lr=3.83e-06, step=8517] Training: 85%|████████▌ | 8518/10000 [1:49:00<15:43, 1.57it/s, loss=0.0059, lr=3.83e-06, step=8518] Training: 85%|████████▌ | 8519/10000 [1:49:01<16:20, 1.51it/s, loss=0.0059, lr=3.83e-06, step=8518] Training: 85%|████████▌ | 8519/10000 [1:49:01<16:20, 1.51it/s, loss=0.0191, lr=3.82e-06, step=8519]17:55:08.247 [I] step=8520 loss=0.0063 smoothed_loss=0.0108 lr=3.83e-06 grad_norm=0.4256 step_time=0.5444s data_time=0.1424s it/s=1.456 eta_to_10000=1016.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0072 grad_action_out_proj=0.0639 grad_shared_expert=0.1994 (10775:train_pytorch.py:850) + Training: 85%|████████▌ | 8520/10000 [1:49:01<15:20, 1.61it/s, loss=0.0191, lr=3.82e-06, step=8519] Training: 85%|████████▌ | 8520/10000 [1:49:01<15:20, 1.61it/s, loss=0.0063, lr=3.82e-06, step=8520] Training: 85%|████████▌ | 8521/10000 [1:49:02<14:25, 1.71it/s, loss=0.0063, lr=3.82e-06, step=8520] Training: 85%|████████▌ | 8521/10000 [1:49:02<14:25, 1.71it/s, loss=0.0103, lr=3.82e-06, step=8521] Training: 85%|████████▌ | 8522/10000 [1:49:03<17:37, 1.40it/s, loss=0.0103, lr=3.82e-06, step=8521] Training: 85%|████████▌ | 8522/10000 [1:49:03<17:37, 1.40it/s, loss=0.0070, lr=3.82e-06, step=8522] Training: 85%|████████▌ | 8523/10000 [1:49:03<16:03, 1.53it/s, loss=0.0070, lr=3.82e-06, step=8522] Training: 85%|████████▌ | 8523/10000 [1:49:03<16:03, 1.53it/s, loss=0.0150, lr=3.82e-06, step=8523] Training: 85%|████████▌ | 8524/10000 [1:49:04<17:05, 1.44it/s, loss=0.0150, lr=3.82e-06, step=8523] Training: 85%|████████▌ | 8524/10000 [1:49:04<17:05, 1.44it/s, loss=0.0031, lr=3.82e-06, step=8524] Training: 85%|████████▌ | 8525/10000 [1:49:05<17:24, 1.41it/s, loss=0.0031, lr=3.82e-06, step=8524] Training: 85%|████████▌ | 8525/10000 [1:49:05<17:24, 1.41it/s, loss=0.0124, lr=3.81e-06, step=8525] Training: 85%|████████▌ | 8526/10000 [1:49:05<16:01, 1.53it/s, loss=0.0124, lr=3.81e-06, step=8525] Training: 85%|████████▌ | 8526/10000 [1:49:05<16:01, 1.53it/s, loss=0.0056, lr=3.81e-06, step=8526] Training: 85%|████████▌ | 8527/10000 [1:49:06<17:40, 1.39it/s, loss=0.0056, lr=3.81e-06, step=8526] Training: 85%|████████▌ | 8527/10000 [1:49:06<17:40, 1.39it/s, loss=0.0154, lr=3.81e-06, step=8527] Training: 85%|████████▌ | 8528/10000 [1:49:07<15:59, 1.53it/s, loss=0.0154, lr=3.81e-06, step=8527] Training: 85%|████████▌ | 8528/10000 [1:49:07<15:59, 1.53it/s, loss=0.0802, lr=3.81e-06, step=8528] Training: 85%|████████▌ | 8529/10000 [1:49:08<18:30, 1.32it/s, loss=0.0802, lr=3.81e-06, step=8528] Training: 85%|████████▌ | 8529/10000 [1:49:08<18:30, 1.32it/s, loss=0.0087, lr=3.81e-06, step=8529]17:55:15.228 [I] step=8530 loss=0.0017 smoothed_loss=0.0150 lr=3.81e-06 grad_norm=0.4462 step_time=0.5567s data_time=0.1414s it/s=1.433 eta_to_10000=1026.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0119 grad_action_out_proj=0.1093 grad_shared_expert=0.2995 (10775:train_pytorch.py:850) + Training: 85%|████████▌ | 8530/10000 [1:49:08<16:52, 1.45it/s, loss=0.0087, lr=3.81e-06, step=8529] Training: 85%|████████▌ | 8530/10000 [1:49:08<16:52, 1.45it/s, loss=0.0017, lr=3.81e-06, step=8530] Training: 85%|████████▌ | 8531/10000 [1:49:09<15:26, 1.59it/s, loss=0.0017, lr=3.81e-06, step=8530] Training: 85%|████████▌ | 8531/10000 [1:49:09<15:26, 1.59it/s, loss=0.0082, lr=3.80e-06, step=8531] Training: 85%|████████▌ | 8532/10000 [1:49:10<16:48, 1.46it/s, loss=0.0082, lr=3.80e-06, step=8531] Training: 85%|████████▌ | 8532/10000 [1:49:10<16:48, 1.46it/s, loss=0.0021, lr=3.80e-06, step=8532] Training: 85%|████████▌ | 8533/10000 [1:49:10<17:06, 1.43it/s, loss=0.0021, lr=3.80e-06, step=8532] Training: 85%|████████▌ | 8533/10000 [1:49:10<17:06, 1.43it/s, loss=0.0029, lr=3.80e-06, step=8533] Training: 85%|████████▌ | 8534/10000 [1:49:11<15:39, 1.56it/s, loss=0.0029, lr=3.80e-06, step=8533] Training: 85%|████████▌ | 8534/10000 [1:49:11<15:39, 1.56it/s, loss=0.0072, lr=3.80e-06, step=8534] Training: 85%|████████▌ | 8535/10000 [1:49:12<18:51, 1.29it/s, loss=0.0072, lr=3.80e-06, step=8534] Training: 85%|████████▌ | 8535/10000 [1:49:12<18:51, 1.29it/s, loss=0.0129, lr=3.80e-06, step=8535] Training: 85%|████████▌ | 8536/10000 [1:49:13<18:24, 1.33it/s, loss=0.0129, lr=3.80e-06, step=8535] Training: 85%|████████▌ | 8536/10000 [1:49:13<18:24, 1.33it/s, loss=0.0022, lr=3.79e-06, step=8536] Training: 85%|████████▌ | 8537/10000 [1:49:13<16:33, 1.47it/s, loss=0.0022, lr=3.79e-06, step=8536] Training: 85%|████████▌ | 8537/10000 [1:49:13<16:33, 1.47it/s, loss=0.0051, lr=3.79e-06, step=8537] Training: 85%|████████▌ | 8538/10000 [1:49:14<15:09, 1.61it/s, loss=0.0051, lr=3.79e-06, step=8537] Training: 85%|████████▌ | 8538/10000 [1:49:14<15:09, 1.61it/s, loss=0.0016, lr=3.79e-06, step=8538] Training: 85%|████████▌ | 8539/10000 [1:49:14<14:14, 1.71it/s, loss=0.0016, lr=3.79e-06, step=8538] Training: 85%|████████▌ | 8539/10000 [1:49:14<14:14, 1.71it/s, loss=0.0242, lr=3.79e-06, step=8539]17:55:21.919 [I] step=8540 loss=0.0045 smoothed_loss=0.0102 lr=3.80e-06 grad_norm=0.3981 step_time=0.5402s data_time=0.1289s it/s=1.495 eta_to_10000=976.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0447 grad_action_out_proj=0.1545 grad_shared_expert=0.4847 (10775:train_pytorch.py:850) + Training: 85%|████████▌ | 8540/10000 [1:49:15<16:15, 1.50it/s, loss=0.0242, lr=3.79e-06, step=8539] Training: 85%|████████▌ | 8540/10000 [1:49:15<16:15, 1.50it/s, loss=0.0045, lr=3.79e-06, step=8540] Training: 85%|████████▌ | 8541/10000 [1:49:16<15:29, 1.57it/s, loss=0.0045, lr=3.79e-06, step=8540] Training: 85%|████████▌ | 8541/10000 [1:49:16<15:29, 1.57it/s, loss=0.0025, lr=3.79e-06, step=8541] Training: 85%|████████▌ | 8542/10000 [1:49:16<15:41, 1.55it/s, loss=0.0025, lr=3.79e-06, step=8541] Training: 85%|████████▌ | 8542/10000 [1:49:16<15:41, 1.55it/s, loss=0.0031, lr=3.78e-06, step=8542] Training: 85%|████████▌ | 8543/10000 [1:49:17<18:45, 1.29it/s, loss=0.0031, lr=3.78e-06, step=8542] Training: 85%|████████▌ | 8543/10000 [1:49:17<18:45, 1.29it/s, loss=0.0037, lr=3.78e-06, step=8543] Training: 85%|████████▌ | 8544/10000 [1:49:18<16:41, 1.45it/s, loss=0.0037, lr=3.78e-06, step=8543] Training: 85%|████████▌ | 8544/10000 [1:49:18<16:41, 1.45it/s, loss=0.0044, lr=3.78e-06, step=8544] Training: 85%|████████▌ | 8545/10000 [1:49:18<15:16, 1.59it/s, loss=0.0044, lr=3.78e-06, step=8544] Training: 85%|████████▌ | 8545/10000 [1:49:18<15:16, 1.59it/s, loss=0.0063, lr=3.78e-06, step=8545] Training: 85%|████████▌ | 8546/10000 [1:49:19<17:41, 1.37it/s, loss=0.0063, lr=3.78e-06, step=8545] Training: 85%|████████▌ | 8546/10000 [1:49:19<17:41, 1.37it/s, loss=0.0146, lr=3.78e-06, step=8546] Training: 85%|████████▌ | 8547/10000 [1:49:20<19:06, 1.27it/s, loss=0.0146, lr=3.78e-06, step=8546] Training: 85%|████████▌ | 8547/10000 [1:49:20<19:06, 1.27it/s, loss=0.0037, lr=3.78e-06, step=8547] Training: 85%|████████▌ | 8548/10000 [1:49:21<18:29, 1.31it/s, loss=0.0037, lr=3.78e-06, step=8547] Training: 85%|████████▌ | 8548/10000 [1:49:21<18:29, 1.31it/s, loss=0.0016, lr=3.77e-06, step=8548] Training: 85%|████████▌ | 8549/10000 [1:49:21<17:09, 1.41it/s, loss=0.0016, lr=3.77e-06, step=8548] Training: 85%|████████▌ | 8549/10000 [1:49:21<17:09, 1.41it/s, loss=0.0022, lr=3.77e-06, step=8549]17:55:29.315 [I] step=8550 loss=0.0120 smoothed_loss=0.0073 lr=3.78e-06 grad_norm=0.3784 step_time=0.6051s data_time=0.1345s it/s=1.352 eta_to_10000=1072.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.1076 grad_shared_expert=0.4817 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8550/10000 [1:49:22<18:45, 1.29it/s, loss=0.0022, lr=3.77e-06, step=8549] Training: 86%|████████▌ | 8550/10000 [1:49:22<18:45, 1.29it/s, loss=0.0120, lr=3.77e-06, step=8550] Training: 86%|████████▌ | 8551/10000 [1:49:23<18:28, 1.31it/s, loss=0.0120, lr=3.77e-06, step=8550] Training: 86%|████████▌ | 8551/10000 [1:49:23<18:28, 1.31it/s, loss=0.0056, lr=3.77e-06, step=8551] Training: 86%|████████▌ | 8552/10000 [1:49:24<17:11, 1.40it/s, loss=0.0056, lr=3.77e-06, step=8551] Training: 86%|████████▌ | 8552/10000 [1:49:24<17:11, 1.40it/s, loss=0.0189, lr=3.77e-06, step=8552] Training: 86%|████████▌ | 8553/10000 [1:49:25<18:43, 1.29it/s, loss=0.0189, lr=3.77e-06, step=8552] Training: 86%|████████▌ | 8553/10000 [1:49:25<18:43, 1.29it/s, loss=0.0023, lr=3.77e-06, step=8553] Training: 86%|████████▌ | 8554/10000 [1:49:25<18:10, 1.33it/s, loss=0.0023, lr=3.77e-06, step=8553] Training: 86%|████████▌ | 8554/10000 [1:49:25<18:10, 1.33it/s, loss=0.0074, lr=3.76e-06, step=8554] Training: 86%|████████▌ | 8555/10000 [1:49:26<17:55, 1.34it/s, loss=0.0074, lr=3.76e-06, step=8554] Training: 86%|████████▌ | 8555/10000 [1:49:26<17:55, 1.34it/s, loss=0.0188, lr=3.76e-06, step=8555] Training: 86%|████████▌ | 8556/10000 [1:49:27<19:26, 1.24it/s, loss=0.0188, lr=3.76e-06, step=8555] Training: 86%|████████▌ | 8556/10000 [1:49:27<19:26, 1.24it/s, loss=0.0056, lr=3.76e-06, step=8556] Training: 86%|████████▌ | 8557/10000 [1:49:28<20:35, 1.17it/s, loss=0.0056, lr=3.76e-06, step=8556] Training: 86%|████████▌ | 8557/10000 [1:49:28<20:35, 1.17it/s, loss=0.0142, lr=3.76e-06, step=8557] Training: 86%|████████▌ | 8558/10000 [1:49:29<20:07, 1.19it/s, loss=0.0142, lr=3.76e-06, step=8557] Training: 86%|████████▌ | 8558/10000 [1:49:29<20:07, 1.19it/s, loss=0.0066, lr=3.76e-06, step=8558] Training: 86%|████████▌ | 8559/10000 [1:49:29<19:08, 1.25it/s, loss=0.0066, lr=3.76e-06, step=8558] Training: 86%|████████▌ | 8559/10000 [1:49:29<19:08, 1.25it/s, loss=0.0039, lr=3.76e-06, step=8559]17:55:37.036 [I] step=8560 loss=0.0076 smoothed_loss=0.0083 lr=3.76e-06 grad_norm=0.3981 step_time=0.6000s data_time=0.1721s it/s=1.295 eta_to_10000=1111.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0152 grad_action_out_proj=0.1174 grad_shared_expert=0.3488 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8560/10000 [1:49:30<17:52, 1.34it/s, loss=0.0039, lr=3.76e-06, step=8559] Training: 86%|████████▌ | 8560/10000 [1:49:30<17:52, 1.34it/s, loss=0.0076, lr=3.75e-06, step=8560] Training: 86%|████████▌ | 8561/10000 [1:49:31<19:06, 1.26it/s, loss=0.0076, lr=3.75e-06, step=8560] Training: 86%|████████▌ | 8561/10000 [1:49:31<19:06, 1.26it/s, loss=0.0035, lr=3.75e-06, step=8561] Training: 86%|████████▌ | 8562/10000 [1:49:32<19:56, 1.20it/s, loss=0.0035, lr=3.75e-06, step=8561] Training: 86%|████████▌ | 8562/10000 [1:49:32<19:56, 1.20it/s, loss=0.0026, lr=3.75e-06, step=8562] Training: 86%|████████▌ | 8563/10000 [1:49:33<22:13, 1.08it/s, loss=0.0026, lr=3.75e-06, step=8562] Training: 86%|████████▌ | 8563/10000 [1:49:33<22:13, 1.08it/s, loss=0.0337, lr=3.75e-06, step=8563] Training: 86%|████████▌ | 8564/10000 [1:49:34<24:04, 1.01s/it, loss=0.0337, lr=3.75e-06, step=8563] Training: 86%|████████▌ | 8564/10000 [1:49:34<24:04, 1.01s/it, loss=0.0030, lr=3.75e-06, step=8564] Training: 86%|████████▌ | 8565/10000 [1:49:35<21:47, 1.10it/s, loss=0.0030, lr=3.75e-06, step=8564] Training: 86%|████████▌ | 8565/10000 [1:49:35<21:47, 1.10it/s, loss=0.0168, lr=3.74e-06, step=8565] Training: 86%|████████▌ | 8566/10000 [1:49:36<21:00, 1.14it/s, loss=0.0168, lr=3.74e-06, step=8565] Training: 86%|████████▌ | 8566/10000 [1:49:36<21:00, 1.14it/s, loss=0.0009, lr=3.74e-06, step=8566] Training: 86%|████████▌ | 8567/10000 [1:49:37<20:30, 1.16it/s, loss=0.0009, lr=3.74e-06, step=8566] Training: 86%|████████▌ | 8567/10000 [1:49:37<20:30, 1.16it/s, loss=0.0034, lr=3.74e-06, step=8567] Training: 86%|████████▌ | 8568/10000 [1:49:38<21:16, 1.12it/s, loss=0.0034, lr=3.74e-06, step=8567] Training: 86%|████████▌ | 8568/10000 [1:49:38<21:16, 1.12it/s, loss=0.0036, lr=3.74e-06, step=8568] Training: 86%|████████▌ | 8569/10000 [1:49:38<20:06, 1.19it/s, loss=0.0036, lr=3.74e-06, step=8568] Training: 86%|████████▌ | 8569/10000 [1:49:38<20:06, 1.19it/s, loss=0.0306, lr=3.74e-06, step=8569]17:55:46.351 [I] step=8570 loss=0.0014 smoothed_loss=0.0094 lr=3.74e-06 grad_norm=0.3828 step_time=0.6894s data_time=0.2421s it/s=1.074 eta_to_10000=1331.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0054 grad_action_out_proj=0.0747 grad_shared_expert=0.3021 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8570/10000 [1:49:39<22:12, 1.07it/s, loss=0.0306, lr=3.74e-06, step=8569] Training: 86%|████████▌ | 8570/10000 [1:49:39<22:12, 1.07it/s, loss=0.0014, lr=3.74e-06, step=8570] Training: 86%|████████▌ | 8571/10000 [1:49:40<21:06, 1.13it/s, loss=0.0014, lr=3.74e-06, step=8570] Training: 86%|████████▌ | 8571/10000 [1:49:40<21:06, 1.13it/s, loss=0.0023, lr=3.73e-06, step=8571] Training: 86%|████████▌ | 8572/10000 [1:49:41<20:20, 1.17it/s, loss=0.0023, lr=3.73e-06, step=8571] Training: 86%|████████▌ | 8572/10000 [1:49:41<20:20, 1.17it/s, loss=0.0173, lr=3.73e-06, step=8572] Training: 86%|████████▌ | 8573/10000 [1:49:42<19:05, 1.25it/s, loss=0.0173, lr=3.73e-06, step=8572] Training: 86%|████████▌ | 8573/10000 [1:49:42<19:05, 1.25it/s, loss=0.0076, lr=3.73e-06, step=8573] Training: 86%|████████▌ | 8574/10000 [1:49:42<17:23, 1.37it/s, loss=0.0076, lr=3.73e-06, step=8573] Training: 86%|████████▌ | 8574/10000 [1:49:42<17:23, 1.37it/s, loss=0.0115, lr=3.73e-06, step=8574] Training: 86%|████████▌ | 8575/10000 [1:49:43<16:30, 1.44it/s, loss=0.0115, lr=3.73e-06, step=8574] Training: 86%|████████▌ | 8575/10000 [1:49:43<16:30, 1.44it/s, loss=0.0018, lr=3.73e-06, step=8575] Training: 86%|████████▌ | 8576/10000 [1:49:43<15:25, 1.54it/s, loss=0.0018, lr=3.73e-06, step=8575] Training: 86%|████████▌ | 8576/10000 [1:49:43<15:25, 1.54it/s, loss=0.0022, lr=3.73e-06, step=8576] Training: 86%|████████▌ | 8577/10000 [1:49:44<15:39, 1.51it/s, loss=0.0022, lr=3.73e-06, step=8576] Training: 86%|████████▌ | 8577/10000 [1:49:44<15:39, 1.51it/s, loss=0.0012, lr=3.72e-06, step=8577] Training: 86%|████████▌ | 8578/10000 [1:49:45<16:15, 1.46it/s, loss=0.0012, lr=3.72e-06, step=8577] Training: 86%|████████▌ | 8578/10000 [1:49:45<16:15, 1.46it/s, loss=0.0007, lr=3.72e-06, step=8578] Training: 86%|████████▌ | 8579/10000 [1:49:45<15:26, 1.53it/s, loss=0.0007, lr=3.72e-06, step=8578] Training: 86%|████████▌ | 8579/10000 [1:49:45<15:26, 1.53it/s, loss=0.0009, lr=3.72e-06, step=8579]17:55:52.917 [I] step=8580 loss=0.0049 smoothed_loss=0.0061 lr=3.73e-06 grad_norm=0.4402 step_time=0.5595s data_time=0.0971s it/s=1.523 eta_to_10000=932.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0071 grad_action_out_proj=0.1070 grad_shared_expert=0.4450 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8580/10000 [1:49:46<15:03, 1.57it/s, loss=0.0009, lr=3.72e-06, step=8579] Training: 86%|████████▌ | 8580/10000 [1:49:46<15:03, 1.57it/s, loss=0.0049, lr=3.72e-06, step=8580] Training: 86%|████████▌ | 8581/10000 [1:49:47<16:24, 1.44it/s, loss=0.0049, lr=3.72e-06, step=8580] Training: 86%|████████▌ | 8581/10000 [1:49:47<16:24, 1.44it/s, loss=0.0070, lr=3.72e-06, step=8581] Training: 86%|████████▌ | 8582/10000 [1:49:47<14:59, 1.58it/s, loss=0.0070, lr=3.72e-06, step=8581] Training: 86%|████████▌ | 8582/10000 [1:49:47<14:59, 1.58it/s, loss=0.0316, lr=3.72e-06, step=8582] Training: 86%|████████▌ | 8583/10000 [1:49:48<14:59, 1.58it/s, loss=0.0316, lr=3.72e-06, step=8582] Training: 86%|████████▌ | 8583/10000 [1:49:48<14:59, 1.58it/s, loss=0.0139, lr=3.71e-06, step=8583] Training: 86%|████████▌ | 8584/10000 [1:49:49<17:20, 1.36it/s, loss=0.0139, lr=3.71e-06, step=8583] Training: 86%|████████▌ | 8584/10000 [1:49:49<17:20, 1.36it/s, loss=0.0172, lr=3.71e-06, step=8584] Training: 86%|████████▌ | 8585/10000 [1:49:50<16:27, 1.43it/s, loss=0.0172, lr=3.71e-06, step=8584] Training: 86%|████████▌ | 8585/10000 [1:49:50<16:27, 1.43it/s, loss=0.0253, lr=3.71e-06, step=8585] Training: 86%|████████▌ | 8586/10000 [1:49:51<18:26, 1.28it/s, loss=0.0253, lr=3.71e-06, step=8585] Training: 86%|████████▌ | 8586/10000 [1:49:51<18:26, 1.28it/s, loss=0.0042, lr=3.71e-06, step=8586] Training: 86%|████████▌ | 8587/10000 [1:49:51<18:57, 1.24it/s, loss=0.0042, lr=3.71e-06, step=8586] Training: 86%|████████▌ | 8587/10000 [1:49:51<18:57, 1.24it/s, loss=0.0042, lr=3.71e-06, step=8587] Training: 86%|████████▌ | 8588/10000 [1:49:52<17:00, 1.38it/s, loss=0.0042, lr=3.71e-06, step=8587] Training: 86%|████████▌ | 8588/10000 [1:49:52<17:00, 1.38it/s, loss=0.0026, lr=3.71e-06, step=8588] Training: 86%|████████▌ | 8589/10000 [1:49:53<18:14, 1.29it/s, loss=0.0026, lr=3.71e-06, step=8588] Training: 86%|████████▌ | 8589/10000 [1:49:53<18:14, 1.29it/s, loss=0.0101, lr=3.70e-06, step=8589]17:56:00.454 [I] step=8590 loss=0.0017 smoothed_loss=0.0087 lr=3.71e-06 grad_norm=0.3867 step_time=0.5841s data_time=0.1696s it/s=1.327 eta_to_10000=1062.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0884 grad_shared_expert=0.2983 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8590/10000 [1:49:54<17:54, 1.31it/s, loss=0.0101, lr=3.70e-06, step=8589] Training: 86%|████████▌ | 8590/10000 [1:49:54<17:54, 1.31it/s, loss=0.0017, lr=3.70e-06, step=8590] Training: 86%|████████▌ | 8591/10000 [1:49:54<16:46, 1.40it/s, loss=0.0017, lr=3.70e-06, step=8590] Training: 86%|████████▌ | 8591/10000 [1:49:54<16:46, 1.40it/s, loss=0.0070, lr=3.70e-06, step=8591] Training: 86%|████████▌ | 8592/10000 [1:49:55<15:22, 1.53it/s, loss=0.0070, lr=3.70e-06, step=8591] Training: 86%|████████▌ | 8592/10000 [1:49:55<15:22, 1.53it/s, loss=0.0048, lr=3.70e-06, step=8592] Training: 86%|████████▌ | 8593/10000 [1:49:56<16:56, 1.38it/s, loss=0.0048, lr=3.70e-06, step=8592] Training: 86%|████████▌ | 8593/10000 [1:49:56<16:56, 1.38it/s, loss=0.0163, lr=3.70e-06, step=8593] Training: 86%|████████▌ | 8594/10000 [1:49:56<16:22, 1.43it/s, loss=0.0163, lr=3.70e-06, step=8593] Training: 86%|████████▌ | 8594/10000 [1:49:56<16:22, 1.43it/s, loss=0.0045, lr=3.70e-06, step=8594] Training: 86%|████████▌ | 8595/10000 [1:49:57<15:12, 1.54it/s, loss=0.0045, lr=3.70e-06, step=8594] Training: 86%|████████▌ | 8595/10000 [1:49:57<15:12, 1.54it/s, loss=0.0165, lr=3.69e-06, step=8595] Training: 86%|████████▌ | 8596/10000 [1:49:58<16:38, 1.41it/s, loss=0.0165, lr=3.69e-06, step=8595] Training: 86%|████████▌ | 8596/10000 [1:49:58<16:38, 1.41it/s, loss=0.0038, lr=3.69e-06, step=8596] Training: 86%|████████▌ | 8597/10000 [1:49:58<17:11, 1.36it/s, loss=0.0038, lr=3.69e-06, step=8596] Training: 86%|████████▌ | 8597/10000 [1:49:58<17:11, 1.36it/s, loss=0.0074, lr=3.69e-06, step=8597] Training: 86%|████████▌ | 8598/10000 [1:49:59<15:38, 1.49it/s, loss=0.0074, lr=3.69e-06, step=8597] Training: 86%|████████▌ | 8598/10000 [1:49:59<15:38, 1.49it/s, loss=0.0172, lr=3.69e-06, step=8598] Training: 86%|████████▌ | 8599/10000 [1:50:00<17:07, 1.36it/s, loss=0.0172, lr=3.69e-06, step=8598] Training: 86%|████████▌ | 8599/10000 [1:50:00<17:07, 1.36it/s, loss=0.0030, lr=3.69e-06, step=8599]17:56:07.476 [I] step=8600 loss=0.0036 smoothed_loss=0.0083 lr=3.69e-06 grad_norm=0.4321 step_time=0.5791s data_time=0.1232s it/s=1.424 eta_to_10000=983.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0070 grad_action_out_proj=0.0848 grad_shared_expert=0.3857 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8600/10000 [1:50:01<17:34, 1.33it/s, loss=0.0030, lr=3.69e-06, step=8599] Training: 86%|████████▌ | 8600/10000 [1:50:01<17:34, 1.33it/s, loss=0.0036, lr=3.69e-06, step=8600] Training: 86%|████████▌ | 8601/10000 [1:50:01<18:11, 1.28it/s, loss=0.0036, lr=3.69e-06, step=8600] Training: 86%|████████▌ | 8601/10000 [1:50:01<18:11, 1.28it/s, loss=0.0088, lr=3.68e-06, step=8601] Training: 86%|████████▌ | 8602/10000 [1:50:02<16:13, 1.44it/s, loss=0.0088, lr=3.68e-06, step=8601] Training: 86%|████████▌ | 8602/10000 [1:50:02<16:13, 1.44it/s, loss=0.0034, lr=3.68e-06, step=8602] Training: 86%|████████▌ | 8603/10000 [1:50:02<15:19, 1.52it/s, loss=0.0034, lr=3.68e-06, step=8602] Training: 86%|████████▌ | 8603/10000 [1:50:02<15:19, 1.52it/s, loss=0.0102, lr=3.68e-06, step=8603] Training: 86%|████████▌ | 8604/10000 [1:50:03<17:55, 1.30it/s, loss=0.0102, lr=3.68e-06, step=8603] Training: 86%|████████▌ | 8604/10000 [1:50:03<17:55, 1.30it/s, loss=0.0100, lr=3.68e-06, step=8604] Training: 86%|████████▌ | 8605/10000 [1:50:04<17:55, 1.30it/s, loss=0.0100, lr=3.68e-06, step=8604] Training: 86%|████████▌ | 8605/10000 [1:50:04<17:55, 1.30it/s, loss=0.0021, lr=3.68e-06, step=8605] Training: 86%|████████▌ | 8606/10000 [1:50:05<19:37, 1.18it/s, loss=0.0021, lr=3.68e-06, step=8605] Training: 86%|████████▌ | 8606/10000 [1:50:05<19:37, 1.18it/s, loss=0.0107, lr=3.68e-06, step=8606] Training: 86%|████████▌ | 8607/10000 [1:50:06<19:05, 1.22it/s, loss=0.0107, lr=3.68e-06, step=8606] Training: 86%|████████▌ | 8607/10000 [1:50:06<19:05, 1.22it/s, loss=0.0057, lr=3.67e-06, step=8607] Training: 86%|████████▌ | 8608/10000 [1:50:07<18:35, 1.25it/s, loss=0.0057, lr=3.67e-06, step=8607] Training: 86%|████████▌ | 8608/10000 [1:50:07<18:35, 1.25it/s, loss=0.0054, lr=3.67e-06, step=8608] Training: 86%|████████▌ | 8609/10000 [1:50:08<18:59, 1.22it/s, loss=0.0054, lr=3.67e-06, step=8608] Training: 86%|████████▌ | 8609/10000 [1:50:08<18:59, 1.22it/s, loss=0.0182, lr=3.67e-06, step=8609]17:56:15.224 [I] step=8610 loss=0.0020 smoothed_loss=0.0079 lr=3.68e-06 grad_norm=0.4750 step_time=0.6075s data_time=0.1673s it/s=1.291 eta_to_10000=1076.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0133 grad_action_out_proj=0.0996 grad_shared_expert=0.4741 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8610/10000 [1:50:08<17:40, 1.31it/s, loss=0.0182, lr=3.67e-06, step=8609] Training: 86%|████████▌ | 8610/10000 [1:50:08<17:40, 1.31it/s, loss=0.0020, lr=3.67e-06, step=8610] Training: 86%|████████▌ | 8611/10000 [1:50:09<15:48, 1.46it/s, loss=0.0020, lr=3.67e-06, step=8610] Training: 86%|████████▌ | 8611/10000 [1:50:09<15:48, 1.46it/s, loss=0.0107, lr=3.67e-06, step=8611] Training: 86%|████████▌ | 8612/10000 [1:50:09<15:53, 1.46it/s, loss=0.0107, lr=3.67e-06, step=8611] Training: 86%|████████▌ | 8612/10000 [1:50:09<15:53, 1.46it/s, loss=0.0095, lr=3.67e-06, step=8612] Training: 86%|████████▌ | 8613/10000 [1:50:10<15:06, 1.53it/s, loss=0.0095, lr=3.67e-06, step=8612] Training: 86%|████████▌ | 8613/10000 [1:50:10<15:06, 1.53it/s, loss=0.0408, lr=3.66e-06, step=8613] Training: 86%|████████▌ | 8614/10000 [1:50:11<16:29, 1.40it/s, loss=0.0408, lr=3.66e-06, step=8613] Training: 86%|████████▌ | 8614/10000 [1:50:11<16:29, 1.40it/s, loss=0.0041, lr=3.66e-06, step=8614] Training: 86%|████████▌ | 8615/10000 [1:50:12<16:43, 1.38it/s, loss=0.0041, lr=3.66e-06, step=8614] Training: 86%|████████▌ | 8615/10000 [1:50:12<16:43, 1.38it/s, loss=0.0033, lr=3.66e-06, step=8615] Training: 86%|████████▌ | 8616/10000 [1:50:13<18:22, 1.25it/s, loss=0.0033, lr=3.66e-06, step=8615] Training: 86%|████████▌ | 8616/10000 [1:50:13<18:22, 1.25it/s, loss=0.0056, lr=3.66e-06, step=8616] Training: 86%|████████▌ | 8617/10000 [1:50:13<17:24, 1.32it/s, loss=0.0056, lr=3.66e-06, step=8616] Training: 86%|████████▌ | 8617/10000 [1:50:13<17:24, 1.32it/s, loss=0.0026, lr=3.66e-06, step=8617] Training: 86%|████████▌ | 8618/10000 [1:50:14<18:29, 1.25it/s, loss=0.0026, lr=3.66e-06, step=8617] Training: 86%|████████▌ | 8618/10000 [1:50:14<18:29, 1.25it/s, loss=0.0093, lr=3.66e-06, step=8618] Training: 86%|████████▌ | 8619/10000 [1:50:15<18:31, 1.24it/s, loss=0.0093, lr=3.66e-06, step=8618] Training: 86%|████████▌ | 8619/10000 [1:50:15<18:31, 1.24it/s, loss=0.0076, lr=3.65e-06, step=8619]17:56:22.697 [I] step=8620 loss=0.0052 smoothed_loss=0.0085 lr=3.66e-06 grad_norm=0.4850 step_time=0.5985s data_time=0.1488s it/s=1.338 eta_to_10000=1031.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0242 grad_action_out_proj=0.1433 grad_shared_expert=0.5158 (10775:train_pytorch.py:850) + Training: 86%|████████▌ | 8620/10000 [1:50:16<18:09, 1.27it/s, loss=0.0076, lr=3.65e-06, step=8619] Training: 86%|████████▌ | 8620/10000 [1:50:16<18:09, 1.27it/s, loss=0.0052, lr=3.65e-06, step=8620] Training: 86%|████████▌ | 8621/10000 [1:50:17<20:03, 1.15it/s, loss=0.0052, lr=3.65e-06, step=8620] Training: 86%|████████▌ | 8621/10000 [1:50:17<20:03, 1.15it/s, loss=0.0043, lr=3.65e-06, step=8621] Training: 86%|████████▌ | 8622/10000 [1:50:17<18:34, 1.24it/s, loss=0.0043, lr=3.65e-06, step=8621] Training: 86%|████████▌ | 8622/10000 [1:50:17<18:34, 1.24it/s, loss=0.0109, lr=3.65e-06, step=8622] Training: 86%|████████▌ | 8623/10000 [1:50:18<19:07, 1.20it/s, loss=0.0109, lr=3.65e-06, step=8622] Training: 86%|████████▌ | 8623/10000 [1:50:18<19:07, 1.20it/s, loss=0.0052, lr=3.65e-06, step=8623] Training: 86%|████████▌ | 8624/10000 [1:50:19<18:20, 1.25it/s, loss=0.0052, lr=3.65e-06, step=8623] Training: 86%|████████▌ | 8624/10000 [1:50:19<18:20, 1.25it/s, loss=0.0162, lr=3.65e-06, step=8624] Training: 86%|████████▋ | 8625/10000 [1:50:20<18:53, 1.21it/s, loss=0.0162, lr=3.65e-06, step=8624] Training: 86%|████████▋ | 8625/10000 [1:50:20<18:53, 1.21it/s, loss=0.0019, lr=3.64e-06, step=8625] Training: 86%|████████▋ | 8626/10000 [1:50:21<19:45, 1.16it/s, loss=0.0019, lr=3.64e-06, step=8625] Training: 86%|████████▋ | 8626/10000 [1:50:21<19:45, 1.16it/s, loss=0.0062, lr=3.64e-06, step=8626] Training: 86%|████████▋ | 8627/10000 [1:50:22<21:19, 1.07it/s, loss=0.0062, lr=3.64e-06, step=8626] Training: 86%|████████▋ | 8627/10000 [1:50:22<21:19, 1.07it/s, loss=0.0045, lr=3.64e-06, step=8627] Training: 86%|████████▋ | 8628/10000 [1:50:23<20:23, 1.12it/s, loss=0.0045, lr=3.64e-06, step=8627] Training: 86%|████████▋ | 8628/10000 [1:50:23<20:23, 1.12it/s, loss=0.0030, lr=3.64e-06, step=8628] Training: 86%|████████▋ | 8629/10000 [1:50:24<20:20, 1.12it/s, loss=0.0030, lr=3.64e-06, step=8628] Training: 86%|████████▋ | 8629/10000 [1:50:24<20:20, 1.12it/s, loss=0.0024, lr=3.64e-06, step=8629]17:56:31.336 [I] step=8630 loss=0.0059 smoothed_loss=0.0066 lr=3.64e-06 grad_norm=0.3594 step_time=0.6718s data_time=0.1921s it/s=1.158 eta_to_10000=1183.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0091 grad_action_out_proj=0.0884 grad_shared_expert=0.2790 (10775:train_pytorch.py:850) + Training: 86%|████████▋ | 8630/10000 [1:50:24<18:56, 1.21it/s, loss=0.0024, lr=3.64e-06, step=8629] Training: 86%|████████▋ | 8630/10000 [1:50:24<18:56, 1.21it/s, loss=0.0059, lr=3.64e-06, step=8630] Training: 86%|████████▋ | 8631/10000 [1:50:25<19:01, 1.20it/s, loss=0.0059, lr=3.64e-06, step=8630] Training: 86%|████████▋ | 8631/10000 [1:50:25<19:01, 1.20it/s, loss=0.0019, lr=3.63e-06, step=8631] Training: 86%|████████▋ | 8632/10000 [1:50:26<17:31, 1.30it/s, loss=0.0019, lr=3.63e-06, step=8631] Training: 86%|████████▋ | 8632/10000 [1:50:26<17:31, 1.30it/s, loss=0.0006, lr=3.63e-06, step=8632] Training: 86%|████████▋ | 8633/10000 [1:50:26<16:03, 1.42it/s, loss=0.0006, lr=3.63e-06, step=8632] Training: 86%|████████▋ | 8633/10000 [1:50:26<16:03, 1.42it/s, loss=0.0028, lr=3.63e-06, step=8633] Training: 86%|████████▋ | 8634/10000 [1:50:27<16:20, 1.39it/s, loss=0.0028, lr=3.63e-06, step=8633] Training: 86%|████████▋ | 8634/10000 [1:50:27<16:20, 1.39it/s, loss=0.0046, lr=3.63e-06, step=8634] Training: 86%|████████▋ | 8635/10000 [1:50:28<15:49, 1.44it/s, loss=0.0046, lr=3.63e-06, step=8634] Training: 86%|████████▋ | 8635/10000 [1:50:28<15:49, 1.44it/s, loss=0.0059, lr=3.63e-06, step=8635] Training: 86%|████████▋ | 8636/10000 [1:50:29<17:36, 1.29it/s, loss=0.0059, lr=3.63e-06, step=8635] Training: 86%|████████▋ | 8636/10000 [1:50:29<17:36, 1.29it/s, loss=0.0063, lr=3.63e-06, step=8636] Training: 86%|████████▋ | 8637/10000 [1:50:30<18:30, 1.23it/s, loss=0.0063, lr=3.63e-06, step=8636] Training: 86%|████████▋ | 8637/10000 [1:50:30<18:30, 1.23it/s, loss=0.0069, lr=3.63e-06, step=8637] Training: 86%|████████▋ | 8638/10000 [1:50:31<19:51, 1.14it/s, loss=0.0069, lr=3.63e-06, step=8637] Training: 86%|████████▋ | 8638/10000 [1:50:31<19:51, 1.14it/s, loss=0.0542, lr=3.62e-06, step=8638] Training: 86%|████████▋ | 8639/10000 [1:50:32<20:26, 1.11it/s, loss=0.0542, lr=3.62e-06, step=8638] Training: 86%|████████▋ | 8639/10000 [1:50:32<20:26, 1.11it/s, loss=0.0031, lr=3.62e-06, step=8639]17:56:39.542 [I] step=8640 loss=0.0038 smoothed_loss=0.0091 lr=3.63e-06 grad_norm=0.4332 step_time=0.6347s data_time=0.1859s it/s=1.219 eta_to_10000=1115.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0181 grad_action_out_proj=0.1359 grad_shared_expert=0.3749 (10775:train_pytorch.py:850) + Training: 86%|████████▋ | 8640/10000 [1:50:33<20:46, 1.09it/s, loss=0.0031, lr=3.62e-06, step=8639] Training: 86%|████████▋ | 8640/10000 [1:50:33<20:46, 1.09it/s, loss=0.0038, lr=3.62e-06, step=8640] Training: 86%|████████▋ | 8641/10000 [1:50:33<19:56, 1.14it/s, loss=0.0038, lr=3.62e-06, step=8640] Training: 86%|████████▋ | 8641/10000 [1:50:33<19:56, 1.14it/s, loss=0.0107, lr=3.62e-06, step=8641] Training: 86%|████████▋ | 8642/10000 [1:50:34<20:03, 1.13it/s, loss=0.0107, lr=3.62e-06, step=8641] Training: 86%|████████▋ | 8642/10000 [1:50:34<20:03, 1.13it/s, loss=0.0080, lr=3.62e-06, step=8642] Training: 86%|████████▋ | 8643/10000 [1:50:35<19:44, 1.15it/s, loss=0.0080, lr=3.62e-06, step=8642] Training: 86%|████████▋ | 8643/10000 [1:50:35<19:44, 1.15it/s, loss=0.0046, lr=3.62e-06, step=8643] Training: 86%|████████▋ | 8644/10000 [1:50:36<19:12, 1.18it/s, loss=0.0046, lr=3.62e-06, step=8643] Training: 86%|████████▋ | 8644/10000 [1:50:36<19:12, 1.18it/s, loss=0.0083, lr=3.61e-06, step=8644] Training: 86%|████████▋ | 8645/10000 [1:50:37<20:04, 1.13it/s, loss=0.0083, lr=3.61e-06, step=8644] Training: 86%|████████▋ | 8645/10000 [1:50:37<20:04, 1.13it/s, loss=0.0073, lr=3.61e-06, step=8645] Training: 86%|████████▋ | 8646/10000 [1:50:38<19:17, 1.17it/s, loss=0.0073, lr=3.61e-06, step=8645] Training: 86%|████████▋ | 8646/10000 [1:50:38<19:17, 1.17it/s, loss=0.1020, lr=3.61e-06, step=8646] Training: 86%|████████▋ | 8647/10000 [1:50:38<18:17, 1.23it/s, loss=0.1020, lr=3.61e-06, step=8646] Training: 86%|████████▋ | 8647/10000 [1:50:38<18:17, 1.23it/s, loss=0.0079, lr=3.61e-06, step=8647] Training: 86%|████████▋ | 8648/10000 [1:50:39<18:35, 1.21it/s, loss=0.0079, lr=3.61e-06, step=8647] Training: 86%|████████▋ | 8648/10000 [1:50:39<18:35, 1.21it/s, loss=0.0020, lr=3.61e-06, step=8648] Training: 86%|████████▋ | 8649/10000 [1:50:40<17:27, 1.29it/s, loss=0.0020, lr=3.61e-06, step=8648] Training: 86%|████████▋ | 8649/10000 [1:50:40<17:27, 1.29it/s, loss=0.0068, lr=3.61e-06, step=8649]17:56:47.615 [I] step=8650 loss=0.0101 smoothed_loss=0.0141 lr=3.61e-06 grad_norm=0.4733 step_time=0.6579s data_time=0.1495s it/s=1.239 eta_to_10000=1089.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0211 grad_action_out_proj=0.1012 grad_shared_expert=0.3194 (10775:train_pytorch.py:850) + Training: 86%|████████▋ | 8650/10000 [1:50:41<17:19, 1.30it/s, loss=0.0068, lr=3.61e-06, step=8649] Training: 86%|████████▋ | 8650/10000 [1:50:41<17:19, 1.30it/s, loss=0.0101, lr=3.60e-06, step=8650] Training: 87%|████████▋ | 8651/10000 [1:50:41<17:03, 1.32it/s, loss=0.0101, lr=3.60e-06, step=8650] Training: 87%|████████▋ | 8651/10000 [1:50:41<17:03, 1.32it/s, loss=0.0077, lr=3.60e-06, step=8651] Training: 87%|████████▋ | 8652/10000 [1:50:42<16:51, 1.33it/s, loss=0.0077, lr=3.60e-06, step=8651] Training: 87%|████████▋ | 8652/10000 [1:50:42<16:51, 1.33it/s, loss=0.0020, lr=3.60e-06, step=8652] Training: 87%|████████▋ | 8653/10000 [1:50:43<18:39, 1.20it/s, loss=0.0020, lr=3.60e-06, step=8652] Training: 87%|████████▋ | 8653/10000 [1:50:43<18:39, 1.20it/s, loss=0.0072, lr=3.60e-06, step=8653] Training: 87%|████████▋ | 8654/10000 [1:50:44<18:27, 1.22it/s, loss=0.0072, lr=3.60e-06, step=8653] Training: 87%|████████▋ | 8654/10000 [1:50:44<18:27, 1.22it/s, loss=0.0017, lr=3.60e-06, step=8654] Training: 87%|████████▋ | 8655/10000 [1:50:45<18:42, 1.20it/s, loss=0.0017, lr=3.60e-06, step=8654] Training: 87%|████████▋ | 8655/10000 [1:50:45<18:42, 1.20it/s, loss=0.0252, lr=3.60e-06, step=8655] Training: 87%|████████▋ | 8656/10000 [1:50:46<20:05, 1.12it/s, loss=0.0252, lr=3.60e-06, step=8655] Training: 87%|████████▋ | 8656/10000 [1:50:46<20:05, 1.12it/s, loss=0.0285, lr=3.59e-06, step=8656] Training: 87%|████████▋ | 8657/10000 [1:50:47<20:51, 1.07it/s, loss=0.0285, lr=3.59e-06, step=8656] Training: 87%|████████▋ | 8657/10000 [1:50:47<20:51, 1.07it/s, loss=0.0047, lr=3.59e-06, step=8657] Training: 87%|████████▋ | 8658/10000 [1:50:48<19:54, 1.12it/s, loss=0.0047, lr=3.59e-06, step=8657] Training: 87%|████████▋ | 8658/10000 [1:50:48<19:54, 1.12it/s, loss=0.0142, lr=3.59e-06, step=8658] Training: 87%|████████▋ | 8659/10000 [1:50:48<18:13, 1.23it/s, loss=0.0142, lr=3.59e-06, step=8658] Training: 87%|████████▋ | 8659/10000 [1:50:48<18:13, 1.23it/s, loss=0.0030, lr=3.59e-06, step=8659]17:56:55.926 [I] step=8660 loss=0.0103 smoothed_loss=0.0119 lr=3.60e-06 grad_norm=0.4133 step_time=0.6238s data_time=0.2073s it/s=1.203 eta_to_10000=1113.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0104 grad_action_out_proj=0.0855 grad_shared_expert=0.3258 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8660/10000 [1:50:49<17:15, 1.29it/s, loss=0.0030, lr=3.59e-06, step=8659] Training: 87%|████████▋ | 8660/10000 [1:50:49<17:15, 1.29it/s, loss=0.0103, lr=3.59e-06, step=8660] Training: 87%|████████▋ | 8661/10000 [1:50:50<15:54, 1.40it/s, loss=0.0103, lr=3.59e-06, step=8660] Training: 87%|████████▋ | 8661/10000 [1:50:50<15:54, 1.40it/s, loss=0.0021, lr=3.59e-06, step=8661] Training: 87%|████████▋ | 8662/10000 [1:50:50<17:09, 1.30it/s, loss=0.0021, lr=3.59e-06, step=8661] Training: 87%|████████▋ | 8662/10000 [1:50:50<17:09, 1.30it/s, loss=0.0298, lr=3.58e-06, step=8662] Training: 87%|████████▋ | 8663/10000 [1:50:51<17:12, 1.30it/s, loss=0.0298, lr=3.58e-06, step=8662] Training: 87%|████████▋ | 8663/10000 [1:50:51<17:12, 1.30it/s, loss=0.0143, lr=3.58e-06, step=8663] Training: 87%|████████▋ | 8664/10000 [1:50:52<18:35, 1.20it/s, loss=0.0143, lr=3.58e-06, step=8663] Training: 87%|████████▋ | 8664/10000 [1:50:52<18:35, 1.20it/s, loss=0.0020, lr=3.58e-06, step=8664] Training: 87%|████████▋ | 8665/10000 [1:50:53<16:46, 1.33it/s, loss=0.0020, lr=3.58e-06, step=8664] Training: 87%|████████▋ | 8665/10000 [1:50:53<16:46, 1.33it/s, loss=0.0016, lr=3.58e-06, step=8665] Training: 87%|████████▋ | 8666/10000 [1:50:53<15:49, 1.40it/s, loss=0.0016, lr=3.58e-06, step=8665] Training: 87%|████████▋ | 8666/10000 [1:50:53<15:49, 1.40it/s, loss=0.0119, lr=3.58e-06, step=8666] Training: 87%|████████▋ | 8667/10000 [1:50:54<15:14, 1.46it/s, loss=0.0119, lr=3.58e-06, step=8666] Training: 87%|████████▋ | 8667/10000 [1:50:54<15:14, 1.46it/s, loss=0.0116, lr=3.58e-06, step=8667] Training: 87%|████████▋ | 8668/10000 [1:50:55<16:17, 1.36it/s, loss=0.0116, lr=3.58e-06, step=8667] Training: 87%|████████▋ | 8668/10000 [1:50:55<16:17, 1.36it/s, loss=0.0031, lr=3.58e-06, step=8668] Training: 87%|████████▋ | 8669/10000 [1:50:56<16:06, 1.38it/s, loss=0.0031, lr=3.58e-06, step=8668] Training: 87%|████████▋ | 8669/10000 [1:50:56<16:06, 1.38it/s, loss=0.0033, lr=3.57e-06, step=8669]17:57:03.363 [I] step=8670 loss=0.0063 smoothed_loss=0.0092 lr=3.58e-06 grad_norm=0.4443 step_time=0.5854s data_time=0.1583s it/s=1.345 eta_to_10000=989.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0174 grad_action_out_proj=0.1103 grad_shared_expert=0.4008 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8670/10000 [1:50:56<16:53, 1.31it/s, loss=0.0033, lr=3.57e-06, step=8669] Training: 87%|████████▋ | 8670/10000 [1:50:56<16:53, 1.31it/s, loss=0.0063, lr=3.57e-06, step=8670] Training: 87%|████████▋ | 8671/10000 [1:50:57<18:23, 1.20it/s, loss=0.0063, lr=3.57e-06, step=8670] Training: 87%|████████▋ | 8671/10000 [1:50:57<18:23, 1.20it/s, loss=0.0062, lr=3.57e-06, step=8671] Training: 87%|████████▋ | 8672/10000 [1:50:58<19:25, 1.14it/s, loss=0.0062, lr=3.57e-06, step=8671] Training: 87%|████████▋ | 8672/10000 [1:50:58<19:25, 1.14it/s, loss=0.0033, lr=3.57e-06, step=8672] Training: 87%|████████▋ | 8673/10000 [1:50:59<19:28, 1.14it/s, loss=0.0033, lr=3.57e-06, step=8672] Training: 87%|████████▋ | 8673/10000 [1:50:59<19:28, 1.14it/s, loss=0.0193, lr=3.57e-06, step=8673] Training: 87%|████████▋ | 8674/10000 [1:51:00<17:26, 1.27it/s, loss=0.0193, lr=3.57e-06, step=8673] Training: 87%|████████▋ | 8674/10000 [1:51:00<17:26, 1.27it/s, loss=0.0045, lr=3.57e-06, step=8674] Training: 87%|████████▋ | 8675/10000 [1:51:01<16:33, 1.33it/s, loss=0.0045, lr=3.57e-06, step=8674] Training: 87%|████████▋ | 8675/10000 [1:51:01<16:33, 1.33it/s, loss=0.0020, lr=3.56e-06, step=8675] Training: 87%|████████▋ | 8676/10000 [1:51:01<15:07, 1.46it/s, loss=0.0020, lr=3.56e-06, step=8675] Training: 87%|████████▋ | 8676/10000 [1:51:01<15:07, 1.46it/s, loss=0.0076, lr=3.56e-06, step=8676] Training: 87%|████████▋ | 8677/10000 [1:51:02<17:12, 1.28it/s, loss=0.0076, lr=3.56e-06, step=8676] Training: 87%|████████▋ | 8677/10000 [1:51:02<17:12, 1.28it/s, loss=0.0103, lr=3.56e-06, step=8677] Training: 87%|████████▋ | 8678/10000 [1:51:03<17:46, 1.24it/s, loss=0.0103, lr=3.56e-06, step=8677] Training: 87%|████████▋ | 8678/10000 [1:51:03<17:46, 1.24it/s, loss=0.0094, lr=3.56e-06, step=8678] Training: 87%|████████▋ | 8679/10000 [1:51:04<20:10, 1.09it/s, loss=0.0094, lr=3.56e-06, step=8678] Training: 87%|████████▋ | 8679/10000 [1:51:04<20:10, 1.09it/s, loss=0.0058, lr=3.56e-06, step=8679]17:57:11.646 [I] step=8680 loss=0.0016 smoothed_loss=0.0076 lr=3.56e-06 grad_norm=0.4742 step_time=0.6599s data_time=0.1685s it/s=1.207 eta_to_10000=1093.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.1418 grad_shared_expert=0.5365 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8680/10000 [1:51:05<18:06, 1.21it/s, loss=0.0058, lr=3.56e-06, step=8679] Training: 87%|████████▋ | 8680/10000 [1:51:05<18:06, 1.21it/s, loss=0.0016, lr=3.56e-06, step=8680] Training: 87%|████████▋ | 8681/10000 [1:51:06<18:40, 1.18it/s, loss=0.0016, lr=3.56e-06, step=8680] Training: 87%|████████▋ | 8681/10000 [1:51:06<18:40, 1.18it/s, loss=0.0025, lr=3.55e-06, step=8681] Training: 87%|████████▋ | 8682/10000 [1:51:07<20:30, 1.07it/s, loss=0.0025, lr=3.55e-06, step=8681] Training: 87%|████████▋ | 8682/10000 [1:51:07<20:30, 1.07it/s, loss=0.0275, lr=3.55e-06, step=8682] Training: 87%|████████▋ | 8683/10000 [1:51:08<21:17, 1.03it/s, loss=0.0275, lr=3.55e-06, step=8682] Training: 87%|████████▋ | 8683/10000 [1:51:08<21:17, 1.03it/s, loss=0.0096, lr=3.55e-06, step=8683] Training: 87%|████████▋ | 8684/10000 [1:51:09<19:55, 1.10it/s, loss=0.0096, lr=3.55e-06, step=8683] Training: 87%|████████▋ | 8684/10000 [1:51:09<19:55, 1.10it/s, loss=0.0024, lr=3.55e-06, step=8684] Training: 87%|████████▋ | 8685/10000 [1:51:10<20:10, 1.09it/s, loss=0.0024, lr=3.55e-06, step=8684] Training: 87%|████████▋ | 8685/10000 [1:51:10<20:10, 1.09it/s, loss=0.0090, lr=3.55e-06, step=8685] Training: 87%|████████▋ | 8686/10000 [1:51:11<21:23, 1.02it/s, loss=0.0090, lr=3.55e-06, step=8685] Training: 87%|████████▋ | 8686/10000 [1:51:11<21:23, 1.02it/s, loss=0.0344, lr=3.55e-06, step=8686] Training: 87%|████████▋ | 8687/10000 [1:51:11<20:08, 1.09it/s, loss=0.0344, lr=3.55e-06, step=8686] Training: 87%|████████▋ | 8687/10000 [1:51:11<20:08, 1.09it/s, loss=0.0014, lr=3.55e-06, step=8687] Training: 87%|████████▋ | 8688/10000 [1:51:12<20:23, 1.07it/s, loss=0.0014, lr=3.55e-06, step=8687] Training: 87%|████████▋ | 8688/10000 [1:51:12<20:23, 1.07it/s, loss=0.0026, lr=3.54e-06, step=8688] Training: 87%|████████▋ | 8689/10000 [1:51:14<22:02, 1.01s/it, loss=0.0026, lr=3.54e-06, step=8688] Training: 87%|████████▋ | 8689/10000 [1:51:14<22:02, 1.01s/it, loss=0.0061, lr=3.54e-06, step=8689]17:57:21.635 [I] step=8690 loss=0.0073 smoothed_loss=0.0089 lr=3.55e-06 grad_norm=0.4296 step_time=0.7507s data_time=0.2482s it/s=1.001 eta_to_10000=1308.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0148 grad_action_out_proj=0.1430 grad_shared_expert=0.4491 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8690/10000 [1:51:15<22:51, 1.05s/it, loss=0.0061, lr=3.54e-06, step=8689] Training: 87%|████████▋ | 8690/10000 [1:51:15<22:51, 1.05s/it, loss=0.0073, lr=3.54e-06, step=8690] Training: 87%|████████▋ | 8691/10000 [1:51:16<22:09, 1.02s/it, loss=0.0073, lr=3.54e-06, step=8690] Training: 87%|████████▋ | 8691/10000 [1:51:16<22:09, 1.02s/it, loss=0.0113, lr=3.54e-06, step=8691] Training: 87%|████████▋ | 8692/10000 [1:51:17<21:35, 1.01it/s, loss=0.0113, lr=3.54e-06, step=8691] Training: 87%|████████▋ | 8692/10000 [1:51:17<21:35, 1.01it/s, loss=0.0080, lr=3.54e-06, step=8692] Training: 87%|████████▋ | 8693/10000 [1:51:17<19:46, 1.10it/s, loss=0.0080, lr=3.54e-06, step=8692] Training: 87%|████████▋ | 8693/10000 [1:51:17<19:46, 1.10it/s, loss=0.0027, lr=3.54e-06, step=8693] Training: 87%|████████▋ | 8694/10000 [1:51:18<18:51, 1.15it/s, loss=0.0027, lr=3.54e-06, step=8693] Training: 87%|████████▋ | 8694/10000 [1:51:18<18:51, 1.15it/s, loss=0.0020, lr=3.53e-06, step=8694] Training: 87%|████████▋ | 8695/10000 [1:51:19<16:37, 1.31it/s, loss=0.0020, lr=3.53e-06, step=8694] Training: 87%|████████▋ | 8695/10000 [1:51:19<16:37, 1.31it/s, loss=0.0021, lr=3.53e-06, step=8695] Training: 87%|████████▋ | 8696/10000 [1:51:19<15:24, 1.41it/s, loss=0.0021, lr=3.53e-06, step=8695] Training: 87%|████████▋ | 8696/10000 [1:51:19<15:24, 1.41it/s, loss=0.0153, lr=3.53e-06, step=8696] Training: 87%|████████▋ | 8697/10000 [1:51:20<14:47, 1.47it/s, loss=0.0153, lr=3.53e-06, step=8696] Training: 87%|████████▋ | 8697/10000 [1:51:20<14:47, 1.47it/s, loss=0.0188, lr=3.53e-06, step=8697] Training: 87%|████████▋ | 8698/10000 [1:51:21<15:23, 1.41it/s, loss=0.0188, lr=3.53e-06, step=8697] Training: 87%|████████▋ | 8698/10000 [1:51:21<15:23, 1.41it/s, loss=0.0136, lr=3.53e-06, step=8698] Training: 87%|████████▋ | 8699/10000 [1:51:22<16:56, 1.28it/s, loss=0.0136, lr=3.53e-06, step=8698] Training: 87%|████████▋ | 8699/10000 [1:51:22<16:56, 1.28it/s, loss=0.0039, lr=3.53e-06, step=8699]17:57:29.664 [I] step=8700 loss=0.0639 smoothed_loss=0.0145 lr=3.53e-06 grad_norm=0.4556 step_time=0.6220s data_time=0.1809s it/s=1.246 eta_to_10000=1043.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0321 grad_action_out_proj=0.1592 grad_shared_expert=0.5489 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8700/10000 [1:51:23<19:48, 1.09it/s, loss=0.0039, lr=3.53e-06, step=8699] Training: 87%|████████▋ | 8700/10000 [1:51:23<19:48, 1.09it/s, loss=0.0639, lr=3.53e-06, step=8700] Training: 87%|████████▋ | 8701/10000 [1:51:24<19:01, 1.14it/s, loss=0.0639, lr=3.53e-06, step=8700] Training: 87%|████████▋ | 8701/10000 [1:51:24<19:01, 1.14it/s, loss=0.0114, lr=3.52e-06, step=8701] Training: 87%|████████▋ | 8702/10000 [1:51:24<19:04, 1.13it/s, loss=0.0114, lr=3.52e-06, step=8701] Training: 87%|████████▋ | 8702/10000 [1:51:24<19:04, 1.13it/s, loss=0.0501, lr=3.52e-06, step=8702] Training: 87%|████████▋ | 8703/10000 [1:51:25<19:52, 1.09it/s, loss=0.0501, lr=3.52e-06, step=8702] Training: 87%|████████▋ | 8703/10000 [1:51:25<19:52, 1.09it/s, loss=0.0210, lr=3.52e-06, step=8703] Training: 87%|████████▋ | 8704/10000 [1:51:26<18:39, 1.16it/s, loss=0.0210, lr=3.52e-06, step=8703] Training: 87%|████████▋ | 8704/10000 [1:51:26<18:39, 1.16it/s, loss=0.0250, lr=3.52e-06, step=8704] Training: 87%|████████▋ | 8705/10000 [1:51:27<16:14, 1.33it/s, loss=0.0250, lr=3.52e-06, step=8704] Training: 87%|████████▋ | 8705/10000 [1:51:27<16:14, 1.33it/s, loss=0.0064, lr=3.52e-06, step=8705] Training: 87%|████████▋ | 8706/10000 [1:51:27<16:50, 1.28it/s, loss=0.0064, lr=3.52e-06, step=8705] Training: 87%|████████▋ | 8706/10000 [1:51:27<16:50, 1.28it/s, loss=0.0433, lr=3.52e-06, step=8706] Training: 87%|████████▋ | 8707/10000 [1:51:28<16:26, 1.31it/s, loss=0.0433, lr=3.52e-06, step=8706] Training: 87%|████████▋ | 8707/10000 [1:51:28<16:26, 1.31it/s, loss=0.0077, lr=3.51e-06, step=8707] Training: 87%|████████▋ | 8708/10000 [1:51:29<15:23, 1.40it/s, loss=0.0077, lr=3.51e-06, step=8707] Training: 87%|████████▋ | 8708/10000 [1:51:29<15:23, 1.40it/s, loss=0.0152, lr=3.51e-06, step=8708] Training: 87%|████████▋ | 8709/10000 [1:51:30<15:11, 1.42it/s, loss=0.0152, lr=3.51e-06, step=8708] Training: 87%|████████▋ | 8709/10000 [1:51:30<15:11, 1.42it/s, loss=0.0021, lr=3.51e-06, step=8709]17:57:37.123 [I] step=8710 loss=0.0047 smoothed_loss=0.0157 lr=3.52e-06 grad_norm=0.4417 step_time=0.5886s data_time=0.1573s it/s=1.341 eta_to_10000=962.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0045 grad_action_out_proj=0.0479 grad_shared_expert=0.2445 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8710/10000 [1:51:30<15:02, 1.43it/s, loss=0.0021, lr=3.51e-06, step=8709] Training: 87%|████████▋ | 8710/10000 [1:51:30<15:02, 1.43it/s, loss=0.0047, lr=3.51e-06, step=8710] Training: 87%|████████▋ | 8711/10000 [1:51:31<13:51, 1.55it/s, loss=0.0047, lr=3.51e-06, step=8710] Training: 87%|████████▋ | 8711/10000 [1:51:31<13:51, 1.55it/s, loss=0.0215, lr=3.51e-06, step=8711] Training: 87%|████████▋ | 8712/10000 [1:51:31<13:00, 1.65it/s, loss=0.0215, lr=3.51e-06, step=8711] Training: 87%|████████▋ | 8712/10000 [1:51:31<13:00, 1.65it/s, loss=0.0019, lr=3.51e-06, step=8712] Training: 87%|████████▋ | 8713/10000 [1:51:32<12:11, 1.76it/s, loss=0.0019, lr=3.51e-06, step=8712] Training: 87%|████████▋ | 8713/10000 [1:51:32<12:11, 1.76it/s, loss=0.0018, lr=3.51e-06, step=8713] Training: 87%|████████▋ | 8714/10000 [1:51:33<14:32, 1.47it/s, loss=0.0018, lr=3.51e-06, step=8713] Training: 87%|████████▋ | 8714/10000 [1:51:33<14:32, 1.47it/s, loss=0.0088, lr=3.50e-06, step=8714] Training: 87%|████████▋ | 8715/10000 [1:51:33<15:36, 1.37it/s, loss=0.0088, lr=3.50e-06, step=8714] Training: 87%|████████▋ | 8715/10000 [1:51:33<15:36, 1.37it/s, loss=0.0032, lr=3.50e-06, step=8715] Training: 87%|████████▋ | 8716/10000 [1:51:34<14:24, 1.49it/s, loss=0.0032, lr=3.50e-06, step=8715] Training: 87%|████████▋ | 8716/10000 [1:51:34<14:24, 1.49it/s, loss=0.0019, lr=3.50e-06, step=8716] Training: 87%|████████▋ | 8717/10000 [1:51:35<15:33, 1.37it/s, loss=0.0019, lr=3.50e-06, step=8716] Training: 87%|████████▋ | 8717/10000 [1:51:35<15:33, 1.37it/s, loss=0.0096, lr=3.50e-06, step=8717] Training: 87%|████████▋ | 8718/10000 [1:51:35<14:32, 1.47it/s, loss=0.0096, lr=3.50e-06, step=8717] Training: 87%|████████▋ | 8718/10000 [1:51:35<14:32, 1.47it/s, loss=0.0145, lr=3.50e-06, step=8718] Training: 87%|████████▋ | 8719/10000 [1:51:36<13:45, 1.55it/s, loss=0.0145, lr=3.50e-06, step=8718] Training: 87%|████████▋ | 8719/10000 [1:51:36<13:45, 1.55it/s, loss=0.0053, lr=3.50e-06, step=8719]17:57:43.770 [I] step=8720 loss=0.0038 smoothed_loss=0.0100 lr=3.50e-06 grad_norm=0.5380 step_time=0.5422s data_time=0.1226s it/s=1.505 eta_to_10000=850.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0711 grad_shared_expert=0.1363 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8720/10000 [1:51:37<14:53, 1.43it/s, loss=0.0053, lr=3.50e-06, step=8719] Training: 87%|████████▋ | 8720/10000 [1:51:37<14:53, 1.43it/s, loss=0.0038, lr=3.49e-06, step=8720] Training: 87%|████████▋ | 8721/10000 [1:51:37<13:39, 1.56it/s, loss=0.0038, lr=3.49e-06, step=8720] Training: 87%|████████▋ | 8721/10000 [1:51:37<13:39, 1.56it/s, loss=0.0019, lr=3.49e-06, step=8721] Training: 87%|████████▋ | 8722/10000 [1:51:38<15:01, 1.42it/s, loss=0.0019, lr=3.49e-06, step=8721] Training: 87%|████████▋ | 8722/10000 [1:51:38<15:01, 1.42it/s, loss=0.0024, lr=3.49e-06, step=8722] Training: 87%|████████▋ | 8723/10000 [1:51:39<15:34, 1.37it/s, loss=0.0024, lr=3.49e-06, step=8722] Training: 87%|████████▋ | 8723/10000 [1:51:39<15:34, 1.37it/s, loss=0.0058, lr=3.49e-06, step=8723] Training: 87%|████████▋ | 8724/10000 [1:51:40<15:08, 1.40it/s, loss=0.0058, lr=3.49e-06, step=8723] Training: 87%|████████▋ | 8724/10000 [1:51:40<15:08, 1.40it/s, loss=0.0035, lr=3.49e-06, step=8724] Training: 87%|████████▋ | 8725/10000 [1:51:40<14:01, 1.52it/s, loss=0.0035, lr=3.49e-06, step=8724] Training: 87%|████████▋ | 8725/10000 [1:51:40<14:01, 1.52it/s, loss=0.0243, lr=3.49e-06, step=8725] Training: 87%|████████▋ | 8726/10000 [1:51:41<13:01, 1.63it/s, loss=0.0243, lr=3.49e-06, step=8725] Training: 87%|████████▋ | 8726/10000 [1:51:41<13:01, 1.63it/s, loss=0.0107, lr=3.49e-06, step=8726] Training: 87%|████████▋ | 8727/10000 [1:51:41<12:22, 1.71it/s, loss=0.0107, lr=3.49e-06, step=8726] Training: 87%|████████▋ | 8727/10000 [1:51:41<12:22, 1.71it/s, loss=0.0055, lr=3.48e-06, step=8727] Training: 87%|████████▋ | 8728/10000 [1:51:42<13:55, 1.52it/s, loss=0.0055, lr=3.48e-06, step=8727] Training: 87%|████████▋ | 8728/10000 [1:51:42<13:55, 1.52it/s, loss=0.0019, lr=3.48e-06, step=8728] Training: 87%|████████▋ | 8729/10000 [1:51:43<17:05, 1.24it/s, loss=0.0019, lr=3.48e-06, step=8728] Training: 87%|████████▋ | 8729/10000 [1:51:43<17:05, 1.24it/s, loss=0.0198, lr=3.48e-06, step=8729]17:57:50.678 [I] step=8730 loss=0.0027 smoothed_loss=0.0089 lr=3.49e-06 grad_norm=0.4761 step_time=0.5671s data_time=0.1238s it/s=1.448 eta_to_10000=877.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0207 grad_action_out_proj=0.1785 grad_shared_expert=0.4409 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8730/10000 [1:51:44<15:24, 1.37it/s, loss=0.0198, lr=3.48e-06, step=8729] Training: 87%|████████▋ | 8730/10000 [1:51:44<15:24, 1.37it/s, loss=0.0027, lr=3.48e-06, step=8730] Training: 87%|████████▋ | 8731/10000 [1:51:44<15:20, 1.38it/s, loss=0.0027, lr=3.48e-06, step=8730] Training: 87%|████████▋ | 8731/10000 [1:51:44<15:20, 1.38it/s, loss=0.0339, lr=3.48e-06, step=8731] Training: 87%|████████▋ | 8732/10000 [1:51:45<14:02, 1.51it/s, loss=0.0339, lr=3.48e-06, step=8731] Training: 87%|████████▋ | 8732/10000 [1:51:45<14:02, 1.51it/s, loss=0.0045, lr=3.48e-06, step=8732] Training: 87%|████████▋ | 8733/10000 [1:51:46<13:43, 1.54it/s, loss=0.0045, lr=3.48e-06, step=8732] Training: 87%|████████▋ | 8733/10000 [1:51:46<13:43, 1.54it/s, loss=0.0056, lr=3.47e-06, step=8733] Training: 87%|████████▋ | 8734/10000 [1:51:46<13:06, 1.61it/s, loss=0.0056, lr=3.47e-06, step=8733] Training: 87%|████████▋ | 8734/10000 [1:51:46<13:06, 1.61it/s, loss=0.0032, lr=3.47e-06, step=8734] Training: 87%|████████▋ | 8735/10000 [1:51:47<14:47, 1.43it/s, loss=0.0032, lr=3.47e-06, step=8734] Training: 87%|████████▋ | 8735/10000 [1:51:47<14:47, 1.43it/s, loss=0.0034, lr=3.47e-06, step=8735] Training: 87%|████████▋ | 8736/10000 [1:51:48<17:28, 1.21it/s, loss=0.0034, lr=3.47e-06, step=8735] Training: 87%|████████▋ | 8736/10000 [1:51:48<17:28, 1.21it/s, loss=0.0029, lr=3.47e-06, step=8736] Training: 87%|████████▋ | 8737/10000 [1:51:49<16:18, 1.29it/s, loss=0.0029, lr=3.47e-06, step=8736] Training: 87%|████████▋ | 8737/10000 [1:51:49<16:18, 1.29it/s, loss=0.0056, lr=3.47e-06, step=8737] Training: 87%|████████▋ | 8738/10000 [1:51:49<15:23, 1.37it/s, loss=0.0056, lr=3.47e-06, step=8737] Training: 87%|████████▋ | 8738/10000 [1:51:49<15:23, 1.37it/s, loss=0.0013, lr=3.47e-06, step=8738] Training: 87%|████████▋ | 8739/10000 [1:51:50<14:33, 1.44it/s, loss=0.0013, lr=3.47e-06, step=8738] Training: 87%|████████▋ | 8739/10000 [1:51:50<14:33, 1.44it/s, loss=0.0179, lr=3.47e-06, step=8739]17:57:57.627 [I] step=8740 loss=0.0130 smoothed_loss=0.0088 lr=3.47e-06 grad_norm=0.3790 step_time=0.5630s data_time=0.1318s it/s=1.439 eta_to_10000=875.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0121 grad_action_out_proj=0.0943 grad_shared_expert=0.3007 (10775:train_pytorch.py:850) + Training: 87%|████████▋ | 8740/10000 [1:51:51<14:13, 1.48it/s, loss=0.0179, lr=3.47e-06, step=8739] Training: 87%|████████▋ | 8740/10000 [1:51:51<14:13, 1.48it/s, loss=0.0130, lr=3.46e-06, step=8740] Training: 87%|████████▋ | 8741/10000 [1:51:52<15:31, 1.35it/s, loss=0.0130, lr=3.46e-06, step=8740] Training: 87%|████████▋ | 8741/10000 [1:51:52<15:31, 1.35it/s, loss=0.0176, lr=3.46e-06, step=8741] Training: 87%|████████▋ | 8742/10000 [1:51:52<14:44, 1.42it/s, loss=0.0176, lr=3.46e-06, step=8741] Training: 87%|████████▋ | 8742/10000 [1:51:52<14:44, 1.42it/s, loss=0.0065, lr=3.46e-06, step=8742] Training: 87%|████████▋ | 8743/10000 [1:51:53<15:25, 1.36it/s, loss=0.0065, lr=3.46e-06, step=8742] Training: 87%|████████▋ | 8743/10000 [1:51:53<15:25, 1.36it/s, loss=0.0147, lr=3.46e-06, step=8743] Training: 87%|████████▋ | 8744/10000 [1:51:54<16:24, 1.28it/s, loss=0.0147, lr=3.46e-06, step=8743] Training: 87%|████████▋ | 8744/10000 [1:51:54<16:24, 1.28it/s, loss=0.0125, lr=3.46e-06, step=8744] Training: 87%|████████▋ | 8745/10000 [1:51:55<16:09, 1.29it/s, loss=0.0125, lr=3.46e-06, step=8744] Training: 87%|████████▋ | 8745/10000 [1:51:55<16:09, 1.29it/s, loss=0.0055, lr=3.46e-06, step=8745] Training: 87%|████████▋ | 8746/10000 [1:51:55<14:22, 1.45it/s, loss=0.0055, lr=3.46e-06, step=8745] Training: 87%|████████▋ | 8746/10000 [1:51:55<14:22, 1.45it/s, loss=0.0060, lr=3.46e-06, step=8746] Training: 87%|████████▋ | 8747/10000 [1:51:56<14:58, 1.39it/s, loss=0.0060, lr=3.46e-06, step=8746] Training: 87%|████████▋ | 8747/10000 [1:51:56<14:58, 1.39it/s, loss=0.0582, lr=3.45e-06, step=8747] Training: 87%|████████▋ | 8748/10000 [1:51:57<15:56, 1.31it/s, loss=0.0582, lr=3.45e-06, step=8747] Training: 87%|████████▋ | 8748/10000 [1:51:57<15:56, 1.31it/s, loss=0.0056, lr=3.45e-06, step=8748] Training: 87%|████████▋ | 8749/10000 [1:51:57<15:11, 1.37it/s, loss=0.0056, lr=3.45e-06, step=8748] Training: 87%|████████▋ | 8749/10000 [1:51:57<15:11, 1.37it/s, loss=0.0040, lr=3.45e-06, step=8749]17:58:05.419 [I] step=8750 loss=0.0051 smoothed_loss=0.0117 lr=3.46e-06 grad_norm=0.4530 step_time=0.6174s data_time=0.1618s it/s=1.284 eta_to_10000=973.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0109 grad_action_out_proj=0.0898 grad_shared_expert=0.3252 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8750/10000 [1:51:58<17:07, 1.22it/s, loss=0.0040, lr=3.45e-06, step=8749] Training: 88%|████████▊ | 8750/10000 [1:51:58<17:07, 1.22it/s, loss=0.0051, lr=3.45e-06, step=8750] Training: 88%|████████▊ | 8751/10000 [1:51:59<15:24, 1.35it/s, loss=0.0051, lr=3.45e-06, step=8750] Training: 88%|████████▊ | 8751/10000 [1:51:59<15:24, 1.35it/s, loss=0.0009, lr=3.45e-06, step=8751] Training: 88%|████████▊ | 8752/10000 [1:52:00<14:24, 1.44it/s, loss=0.0009, lr=3.45e-06, step=8751] Training: 88%|████████▊ | 8752/10000 [1:52:00<14:24, 1.44it/s, loss=0.0074, lr=3.45e-06, step=8752] Training: 88%|████████▊ | 8753/10000 [1:52:00<14:20, 1.45it/s, loss=0.0074, lr=3.45e-06, step=8752] Training: 88%|████████▊ | 8753/10000 [1:52:00<14:20, 1.45it/s, loss=0.0077, lr=3.44e-06, step=8753] Training: 88%|████████▊ | 8754/10000 [1:52:01<14:19, 1.45it/s, loss=0.0077, lr=3.44e-06, step=8753] Training: 88%|████████▊ | 8754/10000 [1:52:01<14:19, 1.45it/s, loss=0.0071, lr=3.44e-06, step=8754] Training: 88%|████████▊ | 8755/10000 [1:52:02<15:06, 1.37it/s, loss=0.0071, lr=3.44e-06, step=8754] Training: 88%|████████▊ | 8755/10000 [1:52:02<15:06, 1.37it/s, loss=0.0314, lr=3.44e-06, step=8755] Training: 88%|████████▊ | 8756/10000 [1:52:02<13:40, 1.52it/s, loss=0.0314, lr=3.44e-06, step=8755] Training: 88%|████████▊ | 8756/10000 [1:52:02<13:40, 1.52it/s, loss=0.0134, lr=3.44e-06, step=8756] Training: 88%|████████▊ | 8757/10000 [1:52:03<13:49, 1.50it/s, loss=0.0134, lr=3.44e-06, step=8756] Training: 88%|████████▊ | 8757/10000 [1:52:03<13:49, 1.50it/s, loss=0.0148, lr=3.44e-06, step=8757] Training: 88%|████████▊ | 8758/10000 [1:52:04<15:48, 1.31it/s, loss=0.0148, lr=3.44e-06, step=8757] Training: 88%|████████▊ | 8758/10000 [1:52:04<15:48, 1.31it/s, loss=0.0091, lr=3.44e-06, step=8758] Training: 88%|████████▊ | 8759/10000 [1:52:05<15:31, 1.33it/s, loss=0.0091, lr=3.44e-06, step=8758] Training: 88%|████████▊ | 8759/10000 [1:52:05<15:31, 1.33it/s, loss=0.0015, lr=3.44e-06, step=8759]17:58:12.498 [I] step=8760 loss=0.0076 smoothed_loss=0.0106 lr=3.44e-06 grad_norm=0.4014 step_time=0.5741s data_time=0.1337s it/s=1.413 eta_to_10000=877.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.0824 grad_shared_expert=0.5847 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8760/10000 [1:52:06<16:12, 1.28it/s, loss=0.0015, lr=3.44e-06, step=8759] Training: 88%|████████▊ | 8760/10000 [1:52:06<16:12, 1.28it/s, loss=0.0076, lr=3.43e-06, step=8760] Training: 88%|████████▊ | 8761/10000 [1:52:06<15:30, 1.33it/s, loss=0.0076, lr=3.43e-06, step=8760] Training: 88%|████████▊ | 8761/10000 [1:52:06<15:30, 1.33it/s, loss=0.0036, lr=3.43e-06, step=8761] Training: 88%|████████▊ | 8762/10000 [1:52:07<13:59, 1.48it/s, loss=0.0036, lr=3.43e-06, step=8761] Training: 88%|████████▊ | 8762/10000 [1:52:07<13:59, 1.48it/s, loss=0.0024, lr=3.43e-06, step=8762] Training: 88%|████████▊ | 8763/10000 [1:52:07<13:06, 1.57it/s, loss=0.0024, lr=3.43e-06, step=8762] Training: 88%|████████▊ | 8763/10000 [1:52:07<13:06, 1.57it/s, loss=0.0096, lr=3.43e-06, step=8763] Training: 88%|████████▊ | 8764/10000 [1:52:08<12:50, 1.60it/s, loss=0.0096, lr=3.43e-06, step=8763] Training: 88%|████████▊ | 8764/10000 [1:52:08<12:50, 1.60it/s, loss=0.0019, lr=3.43e-06, step=8764] Training: 88%|████████▊ | 8765/10000 [1:52:09<13:59, 1.47it/s, loss=0.0019, lr=3.43e-06, step=8764] Training: 88%|████████▊ | 8765/10000 [1:52:09<13:59, 1.47it/s, loss=0.0057, lr=3.43e-06, step=8765] Training: 88%|████████▊ | 8766/10000 [1:52:09<12:56, 1.59it/s, loss=0.0057, lr=3.43e-06, step=8765] Training: 88%|████████▊ | 8766/10000 [1:52:09<12:56, 1.59it/s, loss=0.0036, lr=3.43e-06, step=8766] Training: 88%|████████▊ | 8767/10000 [1:52:10<13:46, 1.49it/s, loss=0.0036, lr=3.43e-06, step=8766] Training: 88%|████████▊ | 8767/10000 [1:52:10<13:46, 1.49it/s, loss=0.0110, lr=3.42e-06, step=8767] Training: 88%|████████▊ | 8768/10000 [1:52:11<15:08, 1.36it/s, loss=0.0110, lr=3.42e-06, step=8767] Training: 88%|████████▊ | 8768/10000 [1:52:11<15:08, 1.36it/s, loss=0.0403, lr=3.42e-06, step=8768] Training: 88%|████████▊ | 8769/10000 [1:52:12<16:14, 1.26it/s, loss=0.0403, lr=3.42e-06, step=8768] Training: 88%|████████▊ | 8769/10000 [1:52:12<16:14, 1.26it/s, loss=0.0020, lr=3.42e-06, step=8769]17:58:19.538 [I] step=8770 loss=0.0029 smoothed_loss=0.0096 lr=3.43e-06 grad_norm=0.4070 step_time=0.5581s data_time=0.1460s it/s=1.421 eta_to_10000=865.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0098 grad_action_out_proj=0.0813 grad_shared_expert=0.5001 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8770/10000 [1:52:13<16:25, 1.25it/s, loss=0.0020, lr=3.42e-06, step=8769] Training: 88%|████████▊ | 8770/10000 [1:52:13<16:25, 1.25it/s, loss=0.0029, lr=3.42e-06, step=8770] Training: 88%|████████▊ | 8771/10000 [1:52:13<15:52, 1.29it/s, loss=0.0029, lr=3.42e-06, step=8770] Training: 88%|████████▊ | 8771/10000 [1:52:13<15:52, 1.29it/s, loss=0.0025, lr=3.42e-06, step=8771] Training: 88%|████████▊ | 8772/10000 [1:52:14<18:08, 1.13it/s, loss=0.0025, lr=3.42e-06, step=8771] Training: 88%|████████▊ | 8772/10000 [1:52:14<18:08, 1.13it/s, loss=0.0046, lr=3.42e-06, step=8772] Training: 88%|████████▊ | 8773/10000 [1:52:15<18:09, 1.13it/s, loss=0.0046, lr=3.42e-06, step=8772] Training: 88%|████████▊ | 8773/10000 [1:52:15<18:09, 1.13it/s, loss=0.0064, lr=3.41e-06, step=8773] Training: 88%|████████▊ | 8774/10000 [1:52:16<17:58, 1.14it/s, loss=0.0064, lr=3.41e-06, step=8773] Training: 88%|████████▊ | 8774/10000 [1:52:16<17:58, 1.14it/s, loss=0.0222, lr=3.41e-06, step=8774] Training: 88%|████████▊ | 8775/10000 [1:52:17<16:29, 1.24it/s, loss=0.0222, lr=3.41e-06, step=8774] Training: 88%|████████▊ | 8775/10000 [1:52:17<16:29, 1.24it/s, loss=0.0075, lr=3.41e-06, step=8775] Training: 88%|████████▊ | 8776/10000 [1:52:17<15:21, 1.33it/s, loss=0.0075, lr=3.41e-06, step=8775] Training: 88%|████████▊ | 8776/10000 [1:52:17<15:21, 1.33it/s, loss=0.0237, lr=3.41e-06, step=8776] Training: 88%|████████▊ | 8777/10000 [1:52:18<13:41, 1.49it/s, loss=0.0237, lr=3.41e-06, step=8776] Training: 88%|████████▊ | 8777/10000 [1:52:18<13:41, 1.49it/s, loss=0.0012, lr=3.41e-06, step=8777] Training: 88%|████████▊ | 8778/10000 [1:52:19<14:57, 1.36it/s, loss=0.0012, lr=3.41e-06, step=8777] Training: 88%|████████▊ | 8778/10000 [1:52:19<14:57, 1.36it/s, loss=0.0056, lr=3.41e-06, step=8778] Training: 88%|████████▊ | 8779/10000 [1:52:20<16:14, 1.25it/s, loss=0.0056, lr=3.41e-06, step=8778] Training: 88%|████████▊ | 8779/10000 [1:52:20<16:14, 1.25it/s, loss=0.0233, lr=3.41e-06, step=8779]17:58:27.434 [I] step=8780 loss=0.0077 smoothed_loss=0.0106 lr=3.41e-06 grad_norm=0.4683 step_time=0.6129s data_time=0.1767s it/s=1.267 eta_to_10000=963.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0646 grad_shared_expert=0.3111 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8780/10000 [1:52:20<15:42, 1.29it/s, loss=0.0233, lr=3.41e-06, step=8779] Training: 88%|████████▊ | 8780/10000 [1:52:21<15:42, 1.29it/s, loss=0.0077, lr=3.40e-06, step=8780] Training: 88%|████████▊ | 8781/10000 [1:52:21<14:08, 1.44it/s, loss=0.0077, lr=3.40e-06, step=8780] Training: 88%|████████▊ | 8781/10000 [1:52:21<14:08, 1.44it/s, loss=0.0037, lr=3.40e-06, step=8781] Training: 88%|████████▊ | 8782/10000 [1:52:22<13:27, 1.51it/s, loss=0.0037, lr=3.40e-06, step=8781] Training: 88%|████████▊ | 8782/10000 [1:52:22<13:27, 1.51it/s, loss=0.0051, lr=3.40e-06, step=8782] Training: 88%|████████▊ | 8783/10000 [1:52:22<14:45, 1.37it/s, loss=0.0051, lr=3.40e-06, step=8782] Training: 88%|████████▊ | 8783/10000 [1:52:22<14:45, 1.37it/s, loss=0.0013, lr=3.40e-06, step=8783] Training: 88%|████████▊ | 8784/10000 [1:52:23<14:55, 1.36it/s, loss=0.0013, lr=3.40e-06, step=8783] Training: 88%|████████▊ | 8784/10000 [1:52:23<14:55, 1.36it/s, loss=0.0127, lr=3.40e-06, step=8784] Training: 88%|████████▊ | 8785/10000 [1:52:24<13:20, 1.52it/s, loss=0.0127, lr=3.40e-06, step=8784] Training: 88%|████████▊ | 8785/10000 [1:52:24<13:20, 1.52it/s, loss=0.0315, lr=3.40e-06, step=8785] Training: 88%|████████▊ | 8786/10000 [1:52:25<14:39, 1.38it/s, loss=0.0315, lr=3.40e-06, step=8785] Training: 88%|████████▊ | 8786/10000 [1:52:25<14:39, 1.38it/s, loss=0.0056, lr=3.40e-06, step=8786] Training: 88%|████████▊ | 8787/10000 [1:52:26<16:37, 1.22it/s, loss=0.0056, lr=3.40e-06, step=8786] Training: 88%|████████▊ | 8787/10000 [1:52:26<16:37, 1.22it/s, loss=0.0192, lr=3.39e-06, step=8787] Training: 88%|████████▊ | 8788/10000 [1:52:26<15:51, 1.27it/s, loss=0.0192, lr=3.39e-06, step=8787] Training: 88%|████████▊ | 8788/10000 [1:52:26<15:51, 1.27it/s, loss=0.0534, lr=3.39e-06, step=8788] Training: 88%|████████▊ | 8789/10000 [1:52:27<14:09, 1.43it/s, loss=0.0534, lr=3.39e-06, step=8788] Training: 88%|████████▊ | 8789/10000 [1:52:27<14:09, 1.43it/s, loss=0.0333, lr=3.39e-06, step=8789]17:58:34.371 [I] step=8790 loss=0.0134 smoothed_loss=0.0171 lr=3.40e-06 grad_norm=0.6204 step_time=0.5645s data_time=0.1292s it/s=1.442 eta_to_10000=839.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.1090 grad_shared_expert=0.3260 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8790/10000 [1:52:27<13:26, 1.50it/s, loss=0.0333, lr=3.39e-06, step=8789] Training: 88%|████████▊ | 8790/10000 [1:52:27<13:26, 1.50it/s, loss=0.0134, lr=3.39e-06, step=8790] Training: 88%|████████▊ | 8791/10000 [1:52:28<12:18, 1.64it/s, loss=0.0134, lr=3.39e-06, step=8790] Training: 88%|████████▊ | 8791/10000 [1:52:28<12:18, 1.64it/s, loss=0.0029, lr=3.39e-06, step=8791] Training: 88%|████████▊ | 8792/10000 [1:52:29<12:08, 1.66it/s, loss=0.0029, lr=3.39e-06, step=8791] Training: 88%|████████▊ | 8792/10000 [1:52:29<12:08, 1.66it/s, loss=0.0029, lr=3.39e-06, step=8792] Training: 88%|████████▊ | 8793/10000 [1:52:29<12:59, 1.55it/s, loss=0.0029, lr=3.39e-06, step=8792] Training: 88%|████████▊ | 8793/10000 [1:52:29<12:59, 1.55it/s, loss=0.0074, lr=3.39e-06, step=8793] Training: 88%|████████▊ | 8794/10000 [1:52:30<12:01, 1.67it/s, loss=0.0074, lr=3.39e-06, step=8793] Training: 88%|████████▊ | 8794/10000 [1:52:30<12:01, 1.67it/s, loss=0.0075, lr=3.38e-06, step=8794] Training: 88%|████████▊ | 8795/10000 [1:52:30<12:21, 1.63it/s, loss=0.0075, lr=3.38e-06, step=8794] Training: 88%|████████▊ | 8795/10000 [1:52:30<12:21, 1.63it/s, loss=0.0022, lr=3.38e-06, step=8795] Training: 88%|████████▊ | 8796/10000 [1:52:31<11:31, 1.74it/s, loss=0.0022, lr=3.38e-06, step=8795] Training: 88%|████████▊ | 8796/10000 [1:52:31<11:31, 1.74it/s, loss=0.0083, lr=3.38e-06, step=8796] Training: 88%|████████▊ | 8797/10000 [1:52:31<11:06, 1.81it/s, loss=0.0083, lr=3.38e-06, step=8796] Training: 88%|████████▊ | 8797/10000 [1:52:31<11:06, 1.81it/s, loss=0.0174, lr=3.38e-06, step=8797] Training: 88%|████████▊ | 8798/10000 [1:52:32<10:43, 1.87it/s, loss=0.0174, lr=3.38e-06, step=8797] Training: 88%|████████▊ | 8798/10000 [1:52:32<10:43, 1.87it/s, loss=0.0623, lr=3.38e-06, step=8798] Training: 88%|████████▊ | 8799/10000 [1:52:32<11:05, 1.80it/s, loss=0.0623, lr=3.38e-06, step=8798] Training: 88%|████████▊ | 8799/10000 [1:52:32<11:05, 1.80it/s, loss=0.0019, lr=3.38e-06, step=8799]17:58:40.094 [I] step=8800 loss=0.0067 smoothed_loss=0.0148 lr=3.38e-06 grad_norm=0.4798 step_time=0.4991s data_time=0.0733s it/s=1.748 eta_to_10000=686.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0071 grad_action_out_proj=0.1206 grad_shared_expert=0.8084 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8800/10000 [1:52:33<11:55, 1.68it/s, loss=0.0019, lr=3.38e-06, step=8799] Training: 88%|████████▊ | 8800/10000 [1:52:33<11:55, 1.68it/s, loss=0.0067, lr=3.38e-06, step=8800] Training: 88%|████████▊ | 8801/10000 [1:52:34<13:52, 1.44it/s, loss=0.0067, lr=3.38e-06, step=8800] Training: 88%|████████▊ | 8801/10000 [1:52:34<13:52, 1.44it/s, loss=0.0035, lr=3.37e-06, step=8801] Training: 88%|████████▊ | 8802/10000 [1:52:35<14:27, 1.38it/s, loss=0.0035, lr=3.37e-06, step=8801] Training: 88%|████████▊ | 8802/10000 [1:52:35<14:27, 1.38it/s, loss=0.0053, lr=3.37e-06, step=8802] Training: 88%|████████▊ | 8803/10000 [1:52:36<14:18, 1.39it/s, loss=0.0053, lr=3.37e-06, step=8802] Training: 88%|████████▊ | 8803/10000 [1:52:36<14:18, 1.39it/s, loss=0.0140, lr=3.37e-06, step=8803] Training: 88%|████████▊ | 8804/10000 [1:52:36<13:36, 1.47it/s, loss=0.0140, lr=3.37e-06, step=8803] Training: 88%|████████▊ | 8804/10000 [1:52:36<13:36, 1.47it/s, loss=0.0079, lr=3.37e-06, step=8804] Training: 88%|████████▊ | 8805/10000 [1:52:37<12:34, 1.58it/s, loss=0.0079, lr=3.37e-06, step=8804] Training: 88%|████████▊ | 8805/10000 [1:52:37<12:34, 1.58it/s, loss=0.0059, lr=3.37e-06, step=8805] Training: 88%|████████▊ | 8806/10000 [1:52:37<13:06, 1.52it/s, loss=0.0059, lr=3.37e-06, step=8805] Training: 88%|████████▊ | 8806/10000 [1:52:37<13:06, 1.52it/s, loss=0.0062, lr=3.37e-06, step=8806] Training: 88%|████████▊ | 8807/10000 [1:52:38<12:07, 1.64it/s, loss=0.0062, lr=3.37e-06, step=8806] Training: 88%|████████▊ | 8807/10000 [1:52:38<12:07, 1.64it/s, loss=0.0087, lr=3.37e-06, step=8807] Training: 88%|████████▊ | 8808/10000 [1:52:39<13:16, 1.50it/s, loss=0.0087, lr=3.37e-06, step=8807] Training: 88%|████████▊ | 8808/10000 [1:52:39<13:16, 1.50it/s, loss=0.0250, lr=3.36e-06, step=8808] Training: 88%|████████▊ | 8809/10000 [1:52:40<14:31, 1.37it/s, loss=0.0250, lr=3.36e-06, step=8808] Training: 88%|████████▊ | 8809/10000 [1:52:40<14:31, 1.37it/s, loss=0.0067, lr=3.36e-06, step=8809]17:58:47.143 [I] step=8810 loss=0.0156 smoothed_loss=0.0122 lr=3.37e-06 grad_norm=0.4121 step_time=0.5857s data_time=0.1192s it/s=1.419 eta_to_10000=838.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0198 grad_action_out_proj=0.1863 grad_shared_expert=0.5994 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8810/10000 [1:52:40<13:49, 1.43it/s, loss=0.0067, lr=3.36e-06, step=8809] Training: 88%|████████▊ | 8810/10000 [1:52:40<13:49, 1.43it/s, loss=0.0156, lr=3.36e-06, step=8810] Training: 88%|████████▊ | 8811/10000 [1:52:41<13:00, 1.52it/s, loss=0.0156, lr=3.36e-06, step=8810] Training: 88%|████████▊ | 8811/10000 [1:52:41<13:00, 1.52it/s, loss=0.0074, lr=3.36e-06, step=8811] Training: 88%|████████▊ | 8812/10000 [1:52:42<13:26, 1.47it/s, loss=0.0074, lr=3.36e-06, step=8811] Training: 88%|████████▊ | 8812/10000 [1:52:42<13:26, 1.47it/s, loss=0.0073, lr=3.36e-06, step=8812] Training: 88%|████████▊ | 8813/10000 [1:52:42<12:32, 1.58it/s, loss=0.0073, lr=3.36e-06, step=8812] Training: 88%|████████▊ | 8813/10000 [1:52:42<12:32, 1.58it/s, loss=0.0093, lr=3.36e-06, step=8813] Training: 88%|████████▊ | 8814/10000 [1:52:43<13:16, 1.49it/s, loss=0.0093, lr=3.36e-06, step=8813] Training: 88%|████████▊ | 8814/10000 [1:52:43<13:16, 1.49it/s, loss=0.0100, lr=3.36e-06, step=8814] Training: 88%|████████▊ | 8815/10000 [1:52:44<13:53, 1.42it/s, loss=0.0100, lr=3.36e-06, step=8814] Training: 88%|████████▊ | 8815/10000 [1:52:44<13:53, 1.42it/s, loss=0.0361, lr=3.35e-06, step=8815] Training: 88%|████████▊ | 8816/10000 [1:52:44<13:47, 1.43it/s, loss=0.0361, lr=3.35e-06, step=8815] Training: 88%|████████▊ | 8816/10000 [1:52:44<13:47, 1.43it/s, loss=0.0352, lr=3.35e-06, step=8816] Training: 88%|████████▊ | 8817/10000 [1:52:45<13:07, 1.50it/s, loss=0.0352, lr=3.35e-06, step=8816] Training: 88%|████████▊ | 8817/10000 [1:52:45<13:07, 1.50it/s, loss=0.0189, lr=3.35e-06, step=8817] Training: 88%|████████▊ | 8818/10000 [1:52:46<13:42, 1.44it/s, loss=0.0189, lr=3.35e-06, step=8817] Training: 88%|████████▊ | 8818/10000 [1:52:46<13:42, 1.44it/s, loss=0.0013, lr=3.35e-06, step=8818] Training: 88%|████████▊ | 8819/10000 [1:52:46<12:36, 1.56it/s, loss=0.0013, lr=3.35e-06, step=8818] Training: 88%|████████▊ | 8819/10000 [1:52:46<12:36, 1.56it/s, loss=0.0052, lr=3.35e-06, step=8819]17:58:53.864 [I] step=8820 loss=0.0215 smoothed_loss=0.0144 lr=3.35e-06 grad_norm=0.4744 step_time=0.5532s data_time=0.1188s it/s=1.488 eta_to_10000=792.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0400 grad_action_out_proj=0.2137 grad_shared_expert=0.6823 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8820/10000 [1:52:47<13:35, 1.45it/s, loss=0.0052, lr=3.35e-06, step=8819] Training: 88%|████████▊ | 8820/10000 [1:52:47<13:35, 1.45it/s, loss=0.0215, lr=3.35e-06, step=8820] Training: 88%|████████▊ | 8821/10000 [1:52:48<13:35, 1.45it/s, loss=0.0215, lr=3.35e-06, step=8820] Training: 88%|████████▊ | 8821/10000 [1:52:48<13:35, 1.45it/s, loss=0.0070, lr=3.35e-06, step=8821] Training: 88%|████████▊ | 8822/10000 [1:52:48<13:40, 1.44it/s, loss=0.0070, lr=3.35e-06, step=8821] Training: 88%|████████▊ | 8822/10000 [1:52:48<13:40, 1.44it/s, loss=0.0050, lr=3.34e-06, step=8822] Training: 88%|████████▊ | 8823/10000 [1:52:49<14:10, 1.38it/s, loss=0.0050, lr=3.34e-06, step=8822] Training: 88%|████████▊ | 8823/10000 [1:52:49<14:10, 1.38it/s, loss=0.0023, lr=3.34e-06, step=8823] Training: 88%|████████▊ | 8824/10000 [1:52:50<14:35, 1.34it/s, loss=0.0023, lr=3.34e-06, step=8823] Training: 88%|████████▊ | 8824/10000 [1:52:50<14:35, 1.34it/s, loss=0.0034, lr=3.34e-06, step=8824] Training: 88%|████████▊ | 8825/10000 [1:52:51<14:41, 1.33it/s, loss=0.0034, lr=3.34e-06, step=8824] Training: 88%|████████▊ | 8825/10000 [1:52:51<14:41, 1.33it/s, loss=0.0087, lr=3.34e-06, step=8825] Training: 88%|████████▊ | 8826/10000 [1:52:51<13:50, 1.41it/s, loss=0.0087, lr=3.34e-06, step=8825] Training: 88%|████████▊ | 8826/10000 [1:52:51<13:50, 1.41it/s, loss=0.0067, lr=3.34e-06, step=8826] Training: 88%|████████▊ | 8827/10000 [1:52:52<14:20, 1.36it/s, loss=0.0067, lr=3.34e-06, step=8826] Training: 88%|████████▊ | 8827/10000 [1:52:52<14:20, 1.36it/s, loss=0.0052, lr=3.34e-06, step=8827] Training: 88%|████████▊ | 8828/10000 [1:52:53<14:41, 1.33it/s, loss=0.0052, lr=3.34e-06, step=8827] Training: 88%|████████▊ | 8828/10000 [1:52:53<14:41, 1.33it/s, loss=0.0035, lr=3.34e-06, step=8828] Training: 88%|████████▊ | 8829/10000 [1:52:54<15:05, 1.29it/s, loss=0.0035, lr=3.34e-06, step=8828] Training: 88%|████████▊ | 8829/10000 [1:52:54<15:05, 1.29it/s, loss=0.0013, lr=3.33e-06, step=8829]17:59:01.377 [I] step=8830 loss=0.0046 smoothed_loss=0.0080 lr=3.34e-06 grad_norm=0.3453 step_time=0.6115s data_time=0.1399s it/s=1.331 eta_to_10000=879.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0538 grad_shared_expert=0.3280 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8830/10000 [1:52:54<14:56, 1.31it/s, loss=0.0013, lr=3.33e-06, step=8829] Training: 88%|████████▊ | 8830/10000 [1:52:54<14:56, 1.31it/s, loss=0.0046, lr=3.33e-06, step=8830] Training: 88%|████████▊ | 8831/10000 [1:52:55<14:49, 1.31it/s, loss=0.0046, lr=3.33e-06, step=8830] Training: 88%|████████▊ | 8831/10000 [1:52:55<14:49, 1.31it/s, loss=0.0032, lr=3.33e-06, step=8831] Training: 88%|████████▊ | 8832/10000 [1:52:56<15:19, 1.27it/s, loss=0.0032, lr=3.33e-06, step=8831] Training: 88%|████████▊ | 8832/10000 [1:52:56<15:19, 1.27it/s, loss=0.0085, lr=3.33e-06, step=8832] Training: 88%|████████▊ | 8833/10000 [1:52:57<13:36, 1.43it/s, loss=0.0085, lr=3.33e-06, step=8832] Training: 88%|████████▊ | 8833/10000 [1:52:57<13:36, 1.43it/s, loss=0.0143, lr=3.33e-06, step=8833] Training: 88%|████████▊ | 8834/10000 [1:52:57<12:27, 1.56it/s, loss=0.0143, lr=3.33e-06, step=8833] Training: 88%|████████▊ | 8834/10000 [1:52:57<12:27, 1.56it/s, loss=0.0030, lr=3.33e-06, step=8834] Training: 88%|████████▊ | 8835/10000 [1:52:58<13:01, 1.49it/s, loss=0.0030, lr=3.33e-06, step=8834] Training: 88%|████████▊ | 8835/10000 [1:52:58<13:01, 1.49it/s, loss=0.0097, lr=3.33e-06, step=8835] Training: 88%|████████▊ | 8836/10000 [1:52:59<15:04, 1.29it/s, loss=0.0097, lr=3.33e-06, step=8835] Training: 88%|████████▊ | 8836/10000 [1:52:59<15:04, 1.29it/s, loss=0.0194, lr=3.32e-06, step=8836] Training: 88%|████████▊ | 8837/10000 [1:52:59<13:57, 1.39it/s, loss=0.0194, lr=3.32e-06, step=8836] Training: 88%|████████▊ | 8837/10000 [1:52:59<13:57, 1.39it/s, loss=0.0274, lr=3.32e-06, step=8837] Training: 88%|████████▊ | 8838/10000 [1:53:00<13:41, 1.42it/s, loss=0.0274, lr=3.32e-06, step=8837] Training: 88%|████████▊ | 8838/10000 [1:53:00<13:41, 1.42it/s, loss=0.0020, lr=3.32e-06, step=8838] Training: 88%|████████▊ | 8839/10000 [1:53:01<13:01, 1.48it/s, loss=0.0020, lr=3.32e-06, step=8838] Training: 88%|████████▊ | 8839/10000 [1:53:01<13:01, 1.48it/s, loss=0.0072, lr=3.32e-06, step=8839]17:59:08.339 [I] step=8840 loss=0.0043 smoothed_loss=0.0092 lr=3.33e-06 grad_norm=0.4168 step_time=0.5558s data_time=0.1405s it/s=1.437 eta_to_10000=807.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.0841 grad_shared_expert=0.2843 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8840/10000 [1:53:01<13:24, 1.44it/s, loss=0.0072, lr=3.32e-06, step=8839] Training: 88%|████████▊ | 8840/10000 [1:53:01<13:24, 1.44it/s, loss=0.0043, lr=3.32e-06, step=8840] Training: 88%|████████▊ | 8841/10000 [1:53:02<14:05, 1.37it/s, loss=0.0043, lr=3.32e-06, step=8840] Training: 88%|████████▊ | 8841/10000 [1:53:02<14:05, 1.37it/s, loss=0.0226, lr=3.32e-06, step=8841] Training: 88%|████████▊ | 8842/10000 [1:53:03<14:19, 1.35it/s, loss=0.0226, lr=3.32e-06, step=8841] Training: 88%|████████▊ | 8842/10000 [1:53:03<14:19, 1.35it/s, loss=0.0087, lr=3.32e-06, step=8842] Training: 88%|████████▊ | 8843/10000 [1:53:04<14:54, 1.29it/s, loss=0.0087, lr=3.32e-06, step=8842] Training: 88%|████████▊ | 8843/10000 [1:53:04<14:54, 1.29it/s, loss=0.0084, lr=3.31e-06, step=8843] Training: 88%|████████▊ | 8844/10000 [1:53:05<15:36, 1.23it/s, loss=0.0084, lr=3.31e-06, step=8843] Training: 88%|████████▊ | 8844/10000 [1:53:05<15:36, 1.23it/s, loss=0.0090, lr=3.31e-06, step=8844] Training: 88%|████████▊ | 8845/10000 [1:53:06<15:40, 1.23it/s, loss=0.0090, lr=3.31e-06, step=8844] Training: 88%|████████▊ | 8845/10000 [1:53:06<15:40, 1.23it/s, loss=0.0042, lr=3.31e-06, step=8845] Training: 88%|████████▊ | 8846/10000 [1:53:06<14:06, 1.36it/s, loss=0.0042, lr=3.31e-06, step=8845] Training: 88%|████████▊ | 8846/10000 [1:53:06<14:06, 1.36it/s, loss=0.0072, lr=3.31e-06, step=8846] Training: 88%|████████▊ | 8847/10000 [1:53:07<14:44, 1.30it/s, loss=0.0072, lr=3.31e-06, step=8846] Training: 88%|████████▊ | 8847/10000 [1:53:07<14:44, 1.30it/s, loss=0.0035, lr=3.31e-06, step=8847] Training: 88%|████████▊ | 8848/10000 [1:53:08<13:31, 1.42it/s, loss=0.0035, lr=3.31e-06, step=8847] Training: 88%|████████▊ | 8848/10000 [1:53:08<13:31, 1.42it/s, loss=0.0092, lr=3.31e-06, step=8848] Training: 88%|████████▊ | 8849/10000 [1:53:08<12:33, 1.53it/s, loss=0.0092, lr=3.31e-06, step=8848] Training: 88%|████████▊ | 8849/10000 [1:53:08<12:33, 1.53it/s, loss=0.0031, lr=3.31e-06, step=8849]17:59:15.781 [I] step=8850 loss=0.0074 smoothed_loss=0.0081 lr=3.31e-06 grad_norm=0.4923 step_time=0.5989s data_time=0.1453s it/s=1.344 eta_to_10000=855.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.0854 grad_shared_expert=0.4180 (10775:train_pytorch.py:850) + Training: 88%|████████▊ | 8850/10000 [1:53:09<13:24, 1.43it/s, loss=0.0031, lr=3.31e-06, step=8849] Training: 88%|████████▊ | 8850/10000 [1:53:09<13:24, 1.43it/s, loss=0.0074, lr=3.31e-06, step=8850] Training: 89%|████████▊ | 8851/10000 [1:53:10<13:39, 1.40it/s, loss=0.0074, lr=3.31e-06, step=8850] Training: 89%|████████▊ | 8851/10000 [1:53:10<13:39, 1.40it/s, loss=0.0030, lr=3.30e-06, step=8851] Training: 89%|████████▊ | 8852/10000 [1:53:10<13:52, 1.38it/s, loss=0.0030, lr=3.30e-06, step=8851] Training: 89%|████████▊ | 8852/10000 [1:53:10<13:52, 1.38it/s, loss=0.0016, lr=3.30e-06, step=8852] Training: 89%|████████▊ | 8853/10000 [1:53:11<13:18, 1.44it/s, loss=0.0016, lr=3.30e-06, step=8852] Training: 89%|████████▊ | 8853/10000 [1:53:11<13:18, 1.44it/s, loss=0.0014, lr=3.30e-06, step=8853] Training: 89%|████████▊ | 8854/10000 [1:53:12<13:37, 1.40it/s, loss=0.0014, lr=3.30e-06, step=8853] Training: 89%|████████▊ | 8854/10000 [1:53:12<13:37, 1.40it/s, loss=0.0025, lr=3.30e-06, step=8854] Training: 89%|████████▊ | 8855/10000 [1:53:12<13:12, 1.44it/s, loss=0.0025, lr=3.30e-06, step=8854] Training: 89%|████████▊ | 8855/10000 [1:53:12<13:12, 1.44it/s, loss=0.0014, lr=3.30e-06, step=8855] Training: 89%|████████▊ | 8856/10000 [1:53:13<13:31, 1.41it/s, loss=0.0014, lr=3.30e-06, step=8855] Training: 89%|████████▊ | 8856/10000 [1:53:13<13:31, 1.41it/s, loss=0.0097, lr=3.30e-06, step=8856] Training: 89%|████████▊ | 8857/10000 [1:53:14<13:07, 1.45it/s, loss=0.0097, lr=3.30e-06, step=8856] Training: 89%|████████▊ | 8857/10000 [1:53:14<13:07, 1.45it/s, loss=0.0041, lr=3.30e-06, step=8857] Training: 89%|████████▊ | 8858/10000 [1:53:15<14:23, 1.32it/s, loss=0.0041, lr=3.30e-06, step=8857] Training: 89%|████████▊ | 8858/10000 [1:53:15<14:23, 1.32it/s, loss=0.0067, lr=3.29e-06, step=8858] Training: 89%|████████▊ | 8859/10000 [1:53:16<15:20, 1.24it/s, loss=0.0067, lr=3.29e-06, step=8858] Training: 89%|████████▊ | 8859/10000 [1:53:16<15:20, 1.24it/s, loss=0.0134, lr=3.29e-06, step=8859]17:59:23.419 [I] step=8860 loss=0.0021 smoothed_loss=0.0062 lr=3.30e-06 grad_norm=0.3872 step_time=0.6135s data_time=0.1503s it/s=1.309 eta_to_10000=870.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0136 grad_action_out_proj=0.1091 grad_shared_expert=0.4307 (10775:train_pytorch.py:850) + Training: 89%|████████▊ | 8860/10000 [1:53:16<15:46, 1.20it/s, loss=0.0134, lr=3.29e-06, step=8859] Training: 89%|████████▊ | 8860/10000 [1:53:16<15:46, 1.20it/s, loss=0.0021, lr=3.29e-06, step=8860] Training: 89%|████████▊ | 8861/10000 [1:53:17<15:07, 1.25it/s, loss=0.0021, lr=3.29e-06, step=8860] Training: 89%|████████▊ | 8861/10000 [1:53:17<15:07, 1.25it/s, loss=0.0059, lr=3.29e-06, step=8861] Training: 89%|████████▊ | 8862/10000 [1:53:18<13:28, 1.41it/s, loss=0.0059, lr=3.29e-06, step=8861] Training: 89%|████████▊ | 8862/10000 [1:53:18<13:28, 1.41it/s, loss=0.0103, lr=3.29e-06, step=8862] Training: 89%|████████▊ | 8863/10000 [1:53:18<12:14, 1.55it/s, loss=0.0103, lr=3.29e-06, step=8862] Training: 89%|████████▊ | 8863/10000 [1:53:18<12:14, 1.55it/s, loss=0.0043, lr=3.29e-06, step=8863] Training: 89%|████████▊ | 8864/10000 [1:53:19<12:32, 1.51it/s, loss=0.0043, lr=3.29e-06, step=8863] Training: 89%|████████▊ | 8864/10000 [1:53:19<12:32, 1.51it/s, loss=0.0039, lr=3.29e-06, step=8864] Training: 89%|████████▊ | 8865/10000 [1:53:19<11:45, 1.61it/s, loss=0.0039, lr=3.29e-06, step=8864] Training: 89%|████████▊ | 8865/10000 [1:53:19<11:45, 1.61it/s, loss=0.0045, lr=3.28e-06, step=8865] Training: 89%|████████▊ | 8866/10000 [1:53:20<13:59, 1.35it/s, loss=0.0045, lr=3.28e-06, step=8865] Training: 89%|████████▊ | 8866/10000 [1:53:20<13:59, 1.35it/s, loss=0.0104, lr=3.28e-06, step=8866] Training: 89%|████████▊ | 8867/10000 [1:53:21<13:05, 1.44it/s, loss=0.0104, lr=3.28e-06, step=8866] Training: 89%|████████▊ | 8867/10000 [1:53:21<13:05, 1.44it/s, loss=0.0202, lr=3.28e-06, step=8867] Training: 89%|████████▊ | 8868/10000 [1:53:22<13:54, 1.36it/s, loss=0.0202, lr=3.28e-06, step=8867] Training: 89%|████████▊ | 8868/10000 [1:53:22<13:54, 1.36it/s, loss=0.0063, lr=3.28e-06, step=8868] Training: 89%|████████▊ | 8869/10000 [1:53:22<13:11, 1.43it/s, loss=0.0063, lr=3.28e-06, step=8868] Training: 89%|████████▊ | 8869/10000 [1:53:22<13:11, 1.43it/s, loss=0.0098, lr=3.28e-06, step=8869]17:59:30.288 [I] step=8870 loss=0.0081 smoothed_loss=0.0079 lr=3.28e-06 grad_norm=0.3217 step_time=0.5625s data_time=0.1244s it/s=1.456 eta_to_10000=776.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0065 grad_action_out_proj=0.0683 grad_shared_expert=0.2354 (10775:train_pytorch.py:850) + Training: 89%|████████▊ | 8870/10000 [1:53:23<14:07, 1.33it/s, loss=0.0098, lr=3.28e-06, step=8869] Training: 89%|████████▊ | 8870/10000 [1:53:23<14:07, 1.33it/s, loss=0.0081, lr=3.28e-06, step=8870] Training: 89%|████████▊ | 8871/10000 [1:53:24<13:17, 1.42it/s, loss=0.0081, lr=3.28e-06, step=8870] Training: 89%|████████▊ | 8871/10000 [1:53:24<13:17, 1.42it/s, loss=0.0026, lr=3.28e-06, step=8871] Training: 89%|████████▊ | 8872/10000 [1:53:25<13:51, 1.36it/s, loss=0.0026, lr=3.28e-06, step=8871] Training: 89%|████████▊ | 8872/10000 [1:53:25<13:51, 1.36it/s, loss=0.0214, lr=3.28e-06, step=8872] Training: 89%|████████▊ | 8873/10000 [1:53:26<15:39, 1.20it/s, loss=0.0214, lr=3.28e-06, step=8872] Training: 89%|████████▊ | 8873/10000 [1:53:26<15:39, 1.20it/s, loss=0.0211, lr=3.27e-06, step=8873] Training: 89%|████████▊ | 8874/10000 [1:53:27<15:37, 1.20it/s, loss=0.0211, lr=3.27e-06, step=8873] Training: 89%|████████▊ | 8874/10000 [1:53:27<15:37, 1.20it/s, loss=0.0104, lr=3.27e-06, step=8874] Training: 89%|████████▉ | 8875/10000 [1:53:27<14:20, 1.31it/s, loss=0.0104, lr=3.27e-06, step=8874] Training: 89%|████████▉ | 8875/10000 [1:53:27<14:20, 1.31it/s, loss=0.0045, lr=3.27e-06, step=8875] Training: 89%|████████▉ | 8876/10000 [1:53:28<13:42, 1.37it/s, loss=0.0045, lr=3.27e-06, step=8875] Training: 89%|████████▉ | 8876/10000 [1:53:28<13:42, 1.37it/s, loss=0.0042, lr=3.27e-06, step=8876] Training: 89%|████████▉ | 8877/10000 [1:53:29<12:52, 1.45it/s, loss=0.0042, lr=3.27e-06, step=8876] Training: 89%|████████▉ | 8877/10000 [1:53:29<12:52, 1.45it/s, loss=0.0387, lr=3.27e-06, step=8877] Training: 89%|████████▉ | 8878/10000 [1:53:29<11:50, 1.58it/s, loss=0.0387, lr=3.27e-06, step=8877] Training: 89%|████████▉ | 8878/10000 [1:53:29<11:50, 1.58it/s, loss=0.0021, lr=3.27e-06, step=8878] Training: 89%|████████▉ | 8879/10000 [1:53:30<12:21, 1.51it/s, loss=0.0021, lr=3.27e-06, step=8878] Training: 89%|████████▉ | 8879/10000 [1:53:30<12:21, 1.51it/s, loss=0.0036, lr=3.27e-06, step=8879]17:59:37.231 [I] step=8880 loss=0.0036 smoothed_loss=0.0095 lr=3.27e-06 grad_norm=0.3516 step_time=0.5630s data_time=0.1313s it/s=1.441 eta_to_10000=777.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0875 grad_shared_expert=0.3212 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8880/10000 [1:53:30<11:46, 1.59it/s, loss=0.0036, lr=3.27e-06, step=8879] Training: 89%|████████▉ | 8880/10000 [1:53:30<11:46, 1.59it/s, loss=0.0036, lr=3.26e-06, step=8880] Training: 89%|████████▉ | 8881/10000 [1:53:31<11:57, 1.56it/s, loss=0.0036, lr=3.26e-06, step=8880] Training: 89%|████████▉ | 8881/10000 [1:53:31<11:57, 1.56it/s, loss=0.0026, lr=3.26e-06, step=8881] Training: 89%|████████▉ | 8882/10000 [1:53:31<11:04, 1.68it/s, loss=0.0026, lr=3.26e-06, step=8881] Training: 89%|████████▉ | 8882/10000 [1:53:31<11:04, 1.68it/s, loss=0.0345, lr=3.26e-06, step=8882] Training: 89%|████████▉ | 8883/10000 [1:53:32<11:48, 1.58it/s, loss=0.0345, lr=3.26e-06, step=8882] Training: 89%|████████▉ | 8883/10000 [1:53:32<11:48, 1.58it/s, loss=0.0082, lr=3.26e-06, step=8883] Training: 89%|████████▉ | 8884/10000 [1:53:33<11:54, 1.56it/s, loss=0.0082, lr=3.26e-06, step=8883] Training: 89%|████████▉ | 8884/10000 [1:53:33<11:54, 1.56it/s, loss=0.0024, lr=3.26e-06, step=8884] Training: 89%|████████▉ | 8885/10000 [1:53:33<11:20, 1.64it/s, loss=0.0024, lr=3.26e-06, step=8884] Training: 89%|████████▉ | 8885/10000 [1:53:33<11:20, 1.64it/s, loss=0.0073, lr=3.26e-06, step=8885] Training: 89%|████████▉ | 8886/10000 [1:53:34<10:39, 1.74it/s, loss=0.0073, lr=3.26e-06, step=8885] Training: 89%|████████▉ | 8886/10000 [1:53:34<10:39, 1.74it/s, loss=0.0021, lr=3.26e-06, step=8886] Training: 89%|████████▉ | 8887/10000 [1:53:35<12:27, 1.49it/s, loss=0.0021, lr=3.26e-06, step=8886] Training: 89%|████████▉ | 8887/10000 [1:53:35<12:27, 1.49it/s, loss=0.0046, lr=3.25e-06, step=8887] Training: 89%|████████▉ | 8888/10000 [1:53:36<13:36, 1.36it/s, loss=0.0046, lr=3.25e-06, step=8887] Training: 89%|████████▉ | 8888/10000 [1:53:36<13:36, 1.36it/s, loss=0.0090, lr=3.25e-06, step=8888] Training: 89%|████████▉ | 8889/10000 [1:53:36<12:47, 1.45it/s, loss=0.0090, lr=3.25e-06, step=8888] Training: 89%|████████▉ | 8889/10000 [1:53:36<12:47, 1.45it/s, loss=0.0044, lr=3.25e-06, step=8889]17:59:43.798 [I] step=8890 loss=0.0093 smoothed_loss=0.0084 lr=3.26e-06 grad_norm=0.4889 step_time=0.5499s data_time=0.1068s it/s=1.523 eta_to_10000=728.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0054 grad_action_out_proj=0.0658 grad_shared_expert=0.3863 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8890/10000 [1:53:37<12:28, 1.48it/s, loss=0.0044, lr=3.25e-06, step=8889] Training: 89%|████████▉ | 8890/10000 [1:53:37<12:28, 1.48it/s, loss=0.0093, lr=3.25e-06, step=8890] Training: 89%|████████▉ | 8891/10000 [1:53:38<12:21, 1.50it/s, loss=0.0093, lr=3.25e-06, step=8890] Training: 89%|████████▉ | 8891/10000 [1:53:38<12:21, 1.50it/s, loss=0.0098, lr=3.25e-06, step=8891] Training: 89%|████████▉ | 8892/10000 [1:53:38<12:21, 1.49it/s, loss=0.0098, lr=3.25e-06, step=8891] Training: 89%|████████▉ | 8892/10000 [1:53:38<12:21, 1.49it/s, loss=0.0011, lr=3.25e-06, step=8892] Training: 89%|████████▉ | 8893/10000 [1:53:39<12:32, 1.47it/s, loss=0.0011, lr=3.25e-06, step=8892] Training: 89%|████████▉ | 8893/10000 [1:53:39<12:32, 1.47it/s, loss=0.0081, lr=3.25e-06, step=8893] Training: 89%|████████▉ | 8894/10000 [1:53:40<13:19, 1.38it/s, loss=0.0081, lr=3.25e-06, step=8893] Training: 89%|████████▉ | 8894/10000 [1:53:40<13:19, 1.38it/s, loss=0.0131, lr=3.25e-06, step=8894] Training: 89%|████████▉ | 8895/10000 [1:53:41<13:55, 1.32it/s, loss=0.0131, lr=3.25e-06, step=8894] Training: 89%|████████▉ | 8895/10000 [1:53:41<13:55, 1.32it/s, loss=0.0048, lr=3.24e-06, step=8895] Training: 89%|████████▉ | 8896/10000 [1:53:41<13:29, 1.36it/s, loss=0.0048, lr=3.24e-06, step=8895] Training: 89%|████████▉ | 8896/10000 [1:53:41<13:29, 1.36it/s, loss=0.0096, lr=3.24e-06, step=8896] Training: 89%|████████▉ | 8897/10000 [1:53:42<13:39, 1.35it/s, loss=0.0096, lr=3.24e-06, step=8896] Training: 89%|████████▉ | 8897/10000 [1:53:42<13:39, 1.35it/s, loss=0.0161, lr=3.24e-06, step=8897] Training: 89%|████████▉ | 8898/10000 [1:53:42<12:18, 1.49it/s, loss=0.0161, lr=3.24e-06, step=8897] Training: 89%|████████▉ | 8898/10000 [1:53:42<12:18, 1.49it/s, loss=0.0178, lr=3.24e-06, step=8898] Training: 89%|████████▉ | 8899/10000 [1:53:43<12:22, 1.48it/s, loss=0.0178, lr=3.24e-06, step=8898] Training: 89%|████████▉ | 8899/10000 [1:53:43<12:22, 1.48it/s, loss=0.0041, lr=3.24e-06, step=8899]17:59:50.764 [I] step=8900 loss=0.0050 smoothed_loss=0.0088 lr=3.24e-06 grad_norm=0.4508 step_time=0.5779s data_time=0.1187s it/s=1.436 eta_to_10000=766.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0370 grad_action_out_proj=0.1892 grad_shared_expert=0.6850 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8900/10000 [1:53:44<12:13, 1.50it/s, loss=0.0041, lr=3.24e-06, step=8899] Training: 89%|████████▉ | 8900/10000 [1:53:44<12:13, 1.50it/s, loss=0.0050, lr=3.24e-06, step=8900] Training: 89%|████████▉ | 8901/10000 [1:53:45<13:23, 1.37it/s, loss=0.0050, lr=3.24e-06, step=8900] Training: 89%|████████▉ | 8901/10000 [1:53:45<13:23, 1.37it/s, loss=0.0366, lr=3.24e-06, step=8901] Training: 89%|████████▉ | 8902/10000 [1:53:46<13:42, 1.33it/s, loss=0.0366, lr=3.24e-06, step=8901] Training: 89%|████████▉ | 8902/10000 [1:53:46<13:42, 1.33it/s, loss=0.0130, lr=3.23e-06, step=8902] Training: 89%|████████▉ | 8903/10000 [1:53:46<12:47, 1.43it/s, loss=0.0130, lr=3.23e-06, step=8902] Training: 89%|████████▉ | 8903/10000 [1:53:46<12:47, 1.43it/s, loss=0.0025, lr=3.23e-06, step=8903] Training: 89%|████████▉ | 8904/10000 [1:53:47<12:52, 1.42it/s, loss=0.0025, lr=3.23e-06, step=8903] Training: 89%|████████▉ | 8904/10000 [1:53:47<12:52, 1.42it/s, loss=0.0065, lr=3.23e-06, step=8904] Training: 89%|████████▉ | 8905/10000 [1:53:47<12:33, 1.45it/s, loss=0.0065, lr=3.23e-06, step=8904] Training: 89%|████████▉ | 8905/10000 [1:53:47<12:33, 1.45it/s, loss=0.0094, lr=3.23e-06, step=8905] Training: 89%|████████▉ | 8906/10000 [1:53:48<13:24, 1.36it/s, loss=0.0094, lr=3.23e-06, step=8905] Training: 89%|████████▉ | 8906/10000 [1:53:48<13:24, 1.36it/s, loss=0.0059, lr=3.23e-06, step=8906] Training: 89%|████████▉ | 8907/10000 [1:53:49<12:14, 1.49it/s, loss=0.0059, lr=3.23e-06, step=8906] Training: 89%|████████▉ | 8907/10000 [1:53:49<12:14, 1.49it/s, loss=0.0080, lr=3.23e-06, step=8907] Training: 89%|████████▉ | 8908/10000 [1:53:50<14:52, 1.22it/s, loss=0.0080, lr=3.23e-06, step=8907] Training: 89%|████████▉ | 8908/10000 [1:53:50<14:52, 1.22it/s, loss=0.0020, lr=3.23e-06, step=8908] Training: 89%|████████▉ | 8909/10000 [1:53:51<14:16, 1.27it/s, loss=0.0020, lr=3.23e-06, step=8908] Training: 89%|████████▉ | 8909/10000 [1:53:51<14:16, 1.27it/s, loss=0.0011, lr=3.23e-06, step=8909]17:59:58.434 [I] step=8910 loss=0.0094 smoothed_loss=0.0083 lr=3.23e-06 grad_norm=0.4252 step_time=0.6182s data_time=0.1487s it/s=1.304 eta_to_10000=835.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0150 grad_action_out_proj=0.1243 grad_shared_expert=0.3996 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8910/10000 [1:53:51<14:24, 1.26it/s, loss=0.0011, lr=3.23e-06, step=8909] Training: 89%|████████▉ | 8910/10000 [1:53:52<14:24, 1.26it/s, loss=0.0094, lr=3.22e-06, step=8910] Training: 89%|████████▉ | 8911/10000 [1:53:52<13:09, 1.38it/s, loss=0.0094, lr=3.22e-06, step=8910] Training: 89%|████████▉ | 8911/10000 [1:53:52<13:09, 1.38it/s, loss=0.0070, lr=3.22e-06, step=8911] Training: 89%|████████▉ | 8912/10000 [1:53:53<13:04, 1.39it/s, loss=0.0070, lr=3.22e-06, step=8911] Training: 89%|████████▉ | 8912/10000 [1:53:53<13:04, 1.39it/s, loss=0.0062, lr=3.22e-06, step=8912] Training: 89%|████████▉ | 8913/10000 [1:53:53<12:16, 1.48it/s, loss=0.0062, lr=3.22e-06, step=8912] Training: 89%|████████▉ | 8913/10000 [1:53:53<12:16, 1.48it/s, loss=0.0020, lr=3.22e-06, step=8913] Training: 89%|████████▉ | 8914/10000 [1:53:54<13:37, 1.33it/s, loss=0.0020, lr=3.22e-06, step=8913] Training: 89%|████████▉ | 8914/10000 [1:53:54<13:37, 1.33it/s, loss=0.0033, lr=3.22e-06, step=8914] Training: 89%|████████▉ | 8915/10000 [1:53:55<14:19, 1.26it/s, loss=0.0033, lr=3.22e-06, step=8914] Training: 89%|████████▉ | 8915/10000 [1:53:55<14:19, 1.26it/s, loss=0.0014, lr=3.22e-06, step=8915] Training: 89%|████████▉ | 8916/10000 [1:53:56<12:40, 1.42it/s, loss=0.0014, lr=3.22e-06, step=8915] Training: 89%|████████▉ | 8916/10000 [1:53:56<12:40, 1.42it/s, loss=0.0074, lr=3.22e-06, step=8916] Training: 89%|████████▉ | 8917/10000 [1:53:56<11:42, 1.54it/s, loss=0.0074, lr=3.22e-06, step=8916] Training: 89%|████████▉ | 8917/10000 [1:53:56<11:42, 1.54it/s, loss=0.0039, lr=3.22e-06, step=8917] Training: 89%|████████▉ | 8918/10000 [1:53:57<12:19, 1.46it/s, loss=0.0039, lr=3.22e-06, step=8917] Training: 89%|████████▉ | 8918/10000 [1:53:57<12:19, 1.46it/s, loss=0.0013, lr=3.21e-06, step=8918] Training: 89%|████████▉ | 8919/10000 [1:53:58<13:13, 1.36it/s, loss=0.0013, lr=3.21e-06, step=8918] Training: 89%|████████▉ | 8919/10000 [1:53:58<13:13, 1.36it/s, loss=0.0011, lr=3.21e-06, step=8919]18:00:05.634 [I] step=8920 loss=0.0019 smoothed_loss=0.0049 lr=3.22e-06 grad_norm=0.3391 step_time=0.5696s data_time=0.1504s it/s=1.389 eta_to_10000=777.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0138 grad_action_out_proj=0.0863 grad_shared_expert=0.2722 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8920/10000 [1:53:59<14:06, 1.28it/s, loss=0.0011, lr=3.21e-06, step=8919] Training: 89%|████████▉ | 8920/10000 [1:53:59<14:06, 1.28it/s, loss=0.0019, lr=3.21e-06, step=8920] Training: 89%|████████▉ | 8921/10000 [1:54:00<14:45, 1.22it/s, loss=0.0019, lr=3.21e-06, step=8920] Training: 89%|████████▉ | 8921/10000 [1:54:00<14:45, 1.22it/s, loss=0.0035, lr=3.21e-06, step=8921] Training: 89%|████████▉ | 8922/10000 [1:54:00<14:49, 1.21it/s, loss=0.0035, lr=3.21e-06, step=8921] Training: 89%|████████▉ | 8922/10000 [1:54:00<14:49, 1.21it/s, loss=0.0017, lr=3.21e-06, step=8922] Training: 89%|████████▉ | 8923/10000 [1:54:01<15:39, 1.15it/s, loss=0.0017, lr=3.21e-06, step=8922] Training: 89%|████████▉ | 8923/10000 [1:54:01<15:39, 1.15it/s, loss=0.0600, lr=3.21e-06, step=8923] Training: 89%|████████▉ | 8924/10000 [1:54:02<16:04, 1.12it/s, loss=0.0600, lr=3.21e-06, step=8923] Training: 89%|████████▉ | 8924/10000 [1:54:02<16:04, 1.12it/s, loss=0.0034, lr=3.21e-06, step=8924] Training: 89%|████████▉ | 8925/10000 [1:54:03<15:55, 1.13it/s, loss=0.0034, lr=3.21e-06, step=8924] Training: 89%|████████▉ | 8925/10000 [1:54:03<15:55, 1.13it/s, loss=0.0045, lr=3.20e-06, step=8925] Training: 89%|████████▉ | 8926/10000 [1:54:04<14:59, 1.19it/s, loss=0.0045, lr=3.20e-06, step=8925] Training: 89%|████████▉ | 8926/10000 [1:54:04<14:59, 1.19it/s, loss=0.0106, lr=3.20e-06, step=8926] Training: 89%|████████▉ | 8927/10000 [1:54:05<14:31, 1.23it/s, loss=0.0106, lr=3.20e-06, step=8926] Training: 89%|████████▉ | 8927/10000 [1:54:05<14:31, 1.23it/s, loss=0.0076, lr=3.20e-06, step=8927] Training: 89%|████████▉ | 8928/10000 [1:54:05<13:26, 1.33it/s, loss=0.0076, lr=3.20e-06, step=8927] Training: 89%|████████▉ | 8928/10000 [1:54:05<13:26, 1.33it/s, loss=0.0079, lr=3.20e-06, step=8928] Training: 89%|████████▉ | 8929/10000 [1:54:06<13:17, 1.34it/s, loss=0.0079, lr=3.20e-06, step=8928] Training: 89%|████████▉ | 8929/10000 [1:54:06<13:17, 1.34it/s, loss=0.0006, lr=3.20e-06, step=8929]18:00:14.039 [I] step=8930 loss=0.0072 smoothed_loss=0.0079 lr=3.20e-06 grad_norm=0.3912 step_time=0.6597s data_time=0.1809s it/s=1.190 eta_to_10000=899.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0093 grad_action_out_proj=0.0873 grad_shared_expert=0.3618 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8930/10000 [1:54:07<14:53, 1.20it/s, loss=0.0006, lr=3.20e-06, step=8929] Training: 89%|████████▉ | 8930/10000 [1:54:07<14:53, 1.20it/s, loss=0.0072, lr=3.20e-06, step=8930] Training: 89%|████████▉ | 8931/10000 [1:54:08<13:46, 1.29it/s, loss=0.0072, lr=3.20e-06, step=8930] Training: 89%|████████▉ | 8931/10000 [1:54:08<13:46, 1.29it/s, loss=0.0127, lr=3.20e-06, step=8931] Training: 89%|████████▉ | 8932/10000 [1:54:08<13:06, 1.36it/s, loss=0.0127, lr=3.20e-06, step=8931] Training: 89%|████████▉ | 8932/10000 [1:54:08<13:06, 1.36it/s, loss=0.0008, lr=3.20e-06, step=8932] Training: 89%|████████▉ | 8933/10000 [1:54:09<12:05, 1.47it/s, loss=0.0008, lr=3.20e-06, step=8932] Training: 89%|████████▉ | 8933/10000 [1:54:09<12:05, 1.47it/s, loss=0.0024, lr=3.19e-06, step=8933] Training: 89%|████████▉ | 8934/10000 [1:54:09<11:20, 1.57it/s, loss=0.0024, lr=3.19e-06, step=8933] Training: 89%|████████▉ | 8934/10000 [1:54:09<11:20, 1.57it/s, loss=0.0037, lr=3.19e-06, step=8934] Training: 89%|████████▉ | 8935/10000 [1:54:10<11:48, 1.50it/s, loss=0.0037, lr=3.19e-06, step=8934] Training: 89%|████████▉ | 8935/10000 [1:54:10<11:48, 1.50it/s, loss=0.0018, lr=3.19e-06, step=8935] Training: 89%|████████▉ | 8936/10000 [1:54:11<11:34, 1.53it/s, loss=0.0018, lr=3.19e-06, step=8935] Training: 89%|████████▉ | 8936/10000 [1:54:11<11:34, 1.53it/s, loss=0.0036, lr=3.19e-06, step=8936] Training: 89%|████████▉ | 8937/10000 [1:54:12<13:30, 1.31it/s, loss=0.0036, lr=3.19e-06, step=8936] Training: 89%|████████▉ | 8937/10000 [1:54:12<13:30, 1.31it/s, loss=0.0015, lr=3.19e-06, step=8937] Training: 89%|████████▉ | 8938/10000 [1:54:13<13:52, 1.28it/s, loss=0.0015, lr=3.19e-06, step=8937] Training: 89%|████████▉ | 8938/10000 [1:54:13<13:52, 1.28it/s, loss=0.0074, lr=3.19e-06, step=8938] Training: 89%|████████▉ | 8939/10000 [1:54:13<12:51, 1.38it/s, loss=0.0074, lr=3.19e-06, step=8938] Training: 89%|████████▉ | 8939/10000 [1:54:13<12:51, 1.38it/s, loss=0.0106, lr=3.19e-06, step=8939]18:00:20.908 [I] step=8940 loss=0.0023 smoothed_loss=0.0058 lr=3.19e-06 grad_norm=0.3753 step_time=0.5661s data_time=0.1208s it/s=1.456 eta_to_10000=727.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0054 grad_action_out_proj=0.0537 grad_shared_expert=0.2648 (10775:train_pytorch.py:850) + Training: 89%|████████▉ | 8940/10000 [1:54:14<12:42, 1.39it/s, loss=0.0106, lr=3.19e-06, step=8939] Training: 89%|████████▉ | 8940/10000 [1:54:14<12:42, 1.39it/s, loss=0.0023, lr=3.19e-06, step=8940] Training: 89%|████████▉ | 8941/10000 [1:54:15<11:43, 1.50it/s, loss=0.0023, lr=3.19e-06, step=8940] Training: 89%|████████▉ | 8941/10000 [1:54:15<11:43, 1.50it/s, loss=0.0036, lr=3.18e-06, step=8941] Training: 89%|████████▉ | 8942/10000 [1:54:15<12:43, 1.39it/s, loss=0.0036, lr=3.18e-06, step=8941] Training: 89%|████████▉ | 8942/10000 [1:54:15<12:43, 1.39it/s, loss=0.0105, lr=3.18e-06, step=8942] Training: 89%|████████▉ | 8943/10000 [1:54:16<13:18, 1.32it/s, loss=0.0105, lr=3.18e-06, step=8942] Training: 89%|████████▉ | 8943/10000 [1:54:16<13:18, 1.32it/s, loss=0.0113, lr=3.18e-06, step=8943] Training: 89%|████████▉ | 8944/10000 [1:54:17<14:42, 1.20it/s, loss=0.0113, lr=3.18e-06, step=8943] Training: 89%|████████▉ | 8944/10000 [1:54:17<14:42, 1.20it/s, loss=0.0091, lr=3.18e-06, step=8944] Training: 89%|████████▉ | 8945/10000 [1:54:18<14:28, 1.21it/s, loss=0.0091, lr=3.18e-06, step=8944] Training: 89%|████████▉ | 8945/10000 [1:54:18<14:28, 1.21it/s, loss=0.0054, lr=3.18e-06, step=8945] Training: 89%|████████▉ | 8946/10000 [1:54:19<16:11, 1.09it/s, loss=0.0054, lr=3.18e-06, step=8945] Training: 89%|████████▉ | 8946/10000 [1:54:19<16:11, 1.09it/s, loss=0.0186, lr=3.18e-06, step=8946] Training: 89%|████████▉ | 8947/10000 [1:54:20<16:18, 1.08it/s, loss=0.0186, lr=3.18e-06, step=8946] Training: 89%|████████▉ | 8947/10000 [1:54:20<16:18, 1.08it/s, loss=0.0090, lr=3.18e-06, step=8947] Training: 89%|████████▉ | 8948/10000 [1:54:21<15:00, 1.17it/s, loss=0.0090, lr=3.18e-06, step=8947] Training: 89%|████████▉ | 8948/10000 [1:54:21<15:00, 1.17it/s, loss=0.0057, lr=3.18e-06, step=8948] Training: 89%|████████▉ | 8949/10000 [1:54:22<15:34, 1.12it/s, loss=0.0057, lr=3.18e-06, step=8948] Training: 89%|████████▉ | 8949/10000 [1:54:22<15:34, 1.12it/s, loss=0.0027, lr=3.17e-06, step=8949]18:00:29.465 [I] step=8950 loss=0.0120 smoothed_loss=0.0077 lr=3.18e-06 grad_norm=0.7654 step_time=0.6619s data_time=0.1938s it/s=1.169 eta_to_10000=898.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0363 grad_action_out_proj=0.2035 grad_shared_expert=0.6761 (10775:train_pytorch.py:850) + Training: 90%|████████▉ | 8950/10000 [1:54:23<14:55, 1.17it/s, loss=0.0027, lr=3.17e-06, step=8949] Training: 90%|████████▉ | 8950/10000 [1:54:23<14:55, 1.17it/s, loss=0.0120, lr=3.17e-06, step=8950] Training: 90%|████████▉ | 8951/10000 [1:54:24<15:38, 1.12it/s, loss=0.0120, lr=3.17e-06, step=8950] Training: 90%|████████▉ | 8951/10000 [1:54:24<15:38, 1.12it/s, loss=0.0296, lr=3.17e-06, step=8951] Training: 90%|████████▉ | 8952/10000 [1:54:24<15:22, 1.14it/s, loss=0.0296, lr=3.17e-06, step=8951] Training: 90%|████████▉ | 8952/10000 [1:54:24<15:22, 1.14it/s, loss=0.0018, lr=3.17e-06, step=8952] Training: 90%|████████▉ | 8953/10000 [1:54:25<14:58, 1.17it/s, loss=0.0018, lr=3.17e-06, step=8952] Training: 90%|████████▉ | 8953/10000 [1:54:25<14:58, 1.17it/s, loss=0.0026, lr=3.17e-06, step=8953] Training: 90%|████████▉ | 8954/10000 [1:54:26<13:58, 1.25it/s, loss=0.0026, lr=3.17e-06, step=8953] Training: 90%|████████▉ | 8954/10000 [1:54:26<13:58, 1.25it/s, loss=0.0192, lr=3.17e-06, step=8954] Training: 90%|████████▉ | 8955/10000 [1:54:26<13:04, 1.33it/s, loss=0.0192, lr=3.17e-06, step=8954] Training: 90%|████████▉ | 8955/10000 [1:54:26<13:04, 1.33it/s, loss=0.0163, lr=3.17e-06, step=8955] Training: 90%|████████▉ | 8956/10000 [1:54:27<13:19, 1.31it/s, loss=0.0163, lr=3.17e-06, step=8955] Training: 90%|████████▉ | 8956/10000 [1:54:27<13:19, 1.31it/s, loss=0.0028, lr=3.17e-06, step=8956] Training: 90%|████████▉ | 8957/10000 [1:54:28<13:25, 1.29it/s, loss=0.0028, lr=3.17e-06, step=8956] Training: 90%|████████▉ | 8957/10000 [1:54:28<13:25, 1.29it/s, loss=0.0056, lr=3.16e-06, step=8957] Training: 90%|████████▉ | 8958/10000 [1:54:29<15:02, 1.15it/s, loss=0.0056, lr=3.16e-06, step=8957] Training: 90%|████████▉ | 8958/10000 [1:54:29<15:02, 1.15it/s, loss=0.0026, lr=3.16e-06, step=8958] Training: 90%|████████▉ | 8959/10000 [1:54:30<13:09, 1.32it/s, loss=0.0026, lr=3.16e-06, step=8958] Training: 90%|████████▉ | 8959/10000 [1:54:30<13:09, 1.32it/s, loss=0.0077, lr=3.16e-06, step=8959]18:00:37.295 [I] step=8960 loss=0.0046 smoothed_loss=0.0080 lr=3.17e-06 grad_norm=0.4122 step_time=0.6201s data_time=0.1629s it/s=1.278 eta_to_10000=813.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0054 grad_action_out_proj=0.0645 grad_shared_expert=0.4155 (10775:train_pytorch.py:850) + Training: 90%|████████▉ | 8960/10000 [1:54:30<12:49, 1.35it/s, loss=0.0077, lr=3.16e-06, step=8959] Training: 90%|████████▉ | 8960/10000 [1:54:30<12:49, 1.35it/s, loss=0.0046, lr=3.16e-06, step=8960] Training: 90%|████████▉ | 8961/10000 [1:54:31<11:48, 1.47it/s, loss=0.0046, lr=3.16e-06, step=8960] Training: 90%|████████▉ | 8961/10000 [1:54:31<11:48, 1.47it/s, loss=0.0067, lr=3.16e-06, step=8961] Training: 90%|████████▉ | 8962/10000 [1:54:32<12:12, 1.42it/s, loss=0.0067, lr=3.16e-06, step=8961] Training: 90%|████████▉ | 8962/10000 [1:54:32<12:12, 1.42it/s, loss=0.0067, lr=3.16e-06, step=8962] Training: 90%|████████▉ | 8963/10000 [1:54:32<12:46, 1.35it/s, loss=0.0067, lr=3.16e-06, step=8962] Training: 90%|████████▉ | 8963/10000 [1:54:32<12:46, 1.35it/s, loss=0.0296, lr=3.16e-06, step=8963] Training: 90%|████████▉ | 8964/10000 [1:54:33<12:52, 1.34it/s, loss=0.0296, lr=3.16e-06, step=8963] Training: 90%|████████▉ | 8964/10000 [1:54:33<12:52, 1.34it/s, loss=0.0027, lr=3.16e-06, step=8964] Training: 90%|████████▉ | 8965/10000 [1:54:34<14:55, 1.16it/s, loss=0.0027, lr=3.16e-06, step=8964] Training: 90%|████████▉ | 8965/10000 [1:54:34<14:55, 1.16it/s, loss=0.0035, lr=3.15e-06, step=8965] Training: 90%|████████▉ | 8966/10000 [1:54:35<14:35, 1.18it/s, loss=0.0035, lr=3.15e-06, step=8965] Training: 90%|████████▉ | 8966/10000 [1:54:35<14:35, 1.18it/s, loss=0.0062, lr=3.15e-06, step=8966] Training: 90%|████████▉ | 8967/10000 [1:54:36<14:50, 1.16it/s, loss=0.0062, lr=3.15e-06, step=8966] Training: 90%|████████▉ | 8967/10000 [1:54:36<14:50, 1.16it/s, loss=0.0022, lr=3.15e-06, step=8967] Training: 90%|████████▉ | 8968/10000 [1:54:37<14:17, 1.20it/s, loss=0.0022, lr=3.15e-06, step=8967] Training: 90%|████████▉ | 8968/10000 [1:54:37<14:17, 1.20it/s, loss=0.0028, lr=3.15e-06, step=8968] Training: 90%|████████▉ | 8969/10000 [1:54:38<14:16, 1.20it/s, loss=0.0028, lr=3.15e-06, step=8968] Training: 90%|████████▉ | 8969/10000 [1:54:38<14:16, 1.20it/s, loss=0.0061, lr=3.15e-06, step=8969]18:00:45.585 [I] step=8970 loss=0.0017 smoothed_loss=0.0066 lr=3.15e-06 grad_norm=0.4100 step_time=0.6464s data_time=0.1827s it/s=1.206 eta_to_10000=853.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0032 grad_action_out_proj=0.0408 grad_shared_expert=0.1715 (10775:train_pytorch.py:850) + Training: 90%|████████▉ | 8970/10000 [1:54:39<14:59, 1.15it/s, loss=0.0061, lr=3.15e-06, step=8969] Training: 90%|████████▉ | 8970/10000 [1:54:39<14:59, 1.15it/s, loss=0.0017, lr=3.15e-06, step=8970] Training: 90%|████████▉ | 8971/10000 [1:54:40<15:31, 1.10it/s, loss=0.0017, lr=3.15e-06, step=8970] Training: 90%|████████▉ | 8971/10000 [1:54:40<15:31, 1.10it/s, loss=0.0018, lr=3.15e-06, step=8971] Training: 90%|████████▉ | 8972/10000 [1:54:40<15:07, 1.13it/s, loss=0.0018, lr=3.15e-06, step=8971] Training: 90%|████████▉ | 8972/10000 [1:54:40<15:07, 1.13it/s, loss=0.0060, lr=3.15e-06, step=8972] Training: 90%|████████▉ | 8973/10000 [1:54:42<16:23, 1.04it/s, loss=0.0060, lr=3.15e-06, step=8972] Training: 90%|████████▉ | 8973/10000 [1:54:42<16:23, 1.04it/s, loss=0.0147, lr=3.14e-06, step=8973] Training: 90%|████████▉ | 8974/10000 [1:54:42<16:05, 1.06it/s, loss=0.0147, lr=3.14e-06, step=8973] Training: 90%|████████▉ | 8974/10000 [1:54:42<16:05, 1.06it/s, loss=0.0329, lr=3.14e-06, step=8974] Training: 90%|████████▉ | 8975/10000 [1:54:43<15:16, 1.12it/s, loss=0.0329, lr=3.14e-06, step=8974] Training: 90%|████████▉ | 8975/10000 [1:54:43<15:16, 1.12it/s, loss=0.0028, lr=3.14e-06, step=8975] Training: 90%|████████▉ | 8976/10000 [1:54:44<14:41, 1.16it/s, loss=0.0028, lr=3.14e-06, step=8975] Training: 90%|████████▉ | 8976/10000 [1:54:44<14:41, 1.16it/s, loss=0.0046, lr=3.14e-06, step=8976] Training: 90%|████████▉ | 8977/10000 [1:54:45<12:54, 1.32it/s, loss=0.0046, lr=3.14e-06, step=8976] Training: 90%|████████▉ | 8977/10000 [1:54:45<12:54, 1.32it/s, loss=0.0068, lr=3.14e-06, step=8977] Training: 90%|████████▉ | 8978/10000 [1:54:45<11:33, 1.47it/s, loss=0.0068, lr=3.14e-06, step=8977] Training: 90%|████████▉ | 8978/10000 [1:54:45<11:33, 1.47it/s, loss=0.0039, lr=3.14e-06, step=8978] Training: 90%|████████▉ | 8979/10000 [1:54:46<10:39, 1.60it/s, loss=0.0039, lr=3.14e-06, step=8978] Training: 90%|████████▉ | 8979/10000 [1:54:46<10:39, 1.60it/s, loss=0.0030, lr=3.14e-06, step=8979]18:00:53.434 [I] step=8980 loss=0.0012 smoothed_loss=0.0068 lr=3.14e-06 grad_norm=0.4652 step_time=0.6313s data_time=0.1535s it/s=1.274 eta_to_10000=800.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0120 grad_action_out_proj=0.0925 grad_shared_expert=0.8640 (10775:train_pytorch.py:850) + Training: 90%|████████▉ | 8980/10000 [1:54:46<12:08, 1.40it/s, loss=0.0030, lr=3.14e-06, step=8979] Training: 90%|████████▉ | 8980/10000 [1:54:46<12:08, 1.40it/s, loss=0.0012, lr=3.14e-06, step=8980] Training: 90%|████████▉ | 8981/10000 [1:54:47<12:24, 1.37it/s, loss=0.0012, lr=3.14e-06, step=8980] Training: 90%|████████▉ | 8981/10000 [1:54:47<12:24, 1.37it/s, loss=0.0032, lr=3.13e-06, step=8981] Training: 90%|████████▉ | 8982/10000 [1:54:48<11:23, 1.49it/s, loss=0.0032, lr=3.13e-06, step=8981] Training: 90%|████████▉ | 8982/10000 [1:54:48<11:23, 1.49it/s, loss=0.0054, lr=3.13e-06, step=8982] Training: 90%|████████▉ | 8983/10000 [1:54:48<11:01, 1.54it/s, loss=0.0054, lr=3.13e-06, step=8982] Training: 90%|████████▉ | 8983/10000 [1:54:48<11:01, 1.54it/s, loss=0.0016, lr=3.13e-06, step=8983] Training: 90%|████████▉ | 8984/10000 [1:54:49<12:15, 1.38it/s, loss=0.0016, lr=3.13e-06, step=8983] Training: 90%|████████▉ | 8984/10000 [1:54:49<12:15, 1.38it/s, loss=0.0015, lr=3.13e-06, step=8984] Training: 90%|████████▉ | 8985/10000 [1:54:50<12:14, 1.38it/s, loss=0.0015, lr=3.13e-06, step=8984] Training: 90%|████████▉ | 8985/10000 [1:54:50<12:14, 1.38it/s, loss=0.0043, lr=3.13e-06, step=8985] Training: 90%|████████▉ | 8986/10000 [1:54:51<12:12, 1.39it/s, loss=0.0043, lr=3.13e-06, step=8985] Training: 90%|████████▉ | 8986/10000 [1:54:51<12:12, 1.39it/s, loss=0.0037, lr=3.13e-06, step=8986] Training: 90%|████████▉ | 8987/10000 [1:54:52<12:57, 1.30it/s, loss=0.0037, lr=3.13e-06, step=8986] Training: 90%|████████▉ | 8987/10000 [1:54:52<12:57, 1.30it/s, loss=0.0048, lr=3.13e-06, step=8987] Training: 90%|████████▉ | 8988/10000 [1:54:52<12:32, 1.35it/s, loss=0.0048, lr=3.13e-06, step=8987] Training: 90%|████████▉ | 8988/10000 [1:54:52<12:32, 1.35it/s, loss=0.0016, lr=3.13e-06, step=8988] Training: 90%|████████▉ | 8989/10000 [1:54:53<11:42, 1.44it/s, loss=0.0016, lr=3.13e-06, step=8988] Training: 90%|████████▉ | 8989/10000 [1:54:53<11:42, 1.44it/s, loss=0.0219, lr=3.12e-06, step=8989]18:01:00.588 [I] step=8990 loss=0.0038 smoothed_loss=0.0062 lr=3.13e-06 grad_norm=0.3891 step_time=0.5801s data_time=0.1353s it/s=1.398 eta_to_10000=722.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0102 grad_action_out_proj=0.0719 grad_shared_expert=0.7492 (10775:train_pytorch.py:850) + Training: 90%|████████▉ | 8990/10000 [1:54:54<12:05, 1.39it/s, loss=0.0219, lr=3.12e-06, step=8989] Training: 90%|████████▉ | 8990/10000 [1:54:54<12:05, 1.39it/s, loss=0.0038, lr=3.12e-06, step=8990] Training: 90%|████████▉ | 8991/10000 [1:54:54<11:52, 1.42it/s, loss=0.0038, lr=3.12e-06, step=8990] Training: 90%|████████▉ | 8991/10000 [1:54:54<11:52, 1.42it/s, loss=0.0711, lr=3.12e-06, step=8991] Training: 90%|████████▉ | 8992/10000 [1:54:55<10:46, 1.56it/s, loss=0.0711, lr=3.12e-06, step=8991] Training: 90%|████████▉ | 8992/10000 [1:54:55<10:46, 1.56it/s, loss=0.0069, lr=3.12e-06, step=8992] Training: 90%|████████▉ | 8993/10000 [1:54:56<11:25, 1.47it/s, loss=0.0069, lr=3.12e-06, step=8992] Training: 90%|████████▉ | 8993/10000 [1:54:56<11:25, 1.47it/s, loss=0.0062, lr=3.12e-06, step=8993] Training: 90%|████████▉ | 8994/10000 [1:54:56<11:48, 1.42it/s, loss=0.0062, lr=3.12e-06, step=8993] Training: 90%|████████▉ | 8994/10000 [1:54:56<11:48, 1.42it/s, loss=0.0104, lr=3.12e-06, step=8994] Training: 90%|████████▉ | 8995/10000 [1:54:57<11:42, 1.43it/s, loss=0.0104, lr=3.12e-06, step=8994] Training: 90%|████████▉ | 8995/10000 [1:54:57<11:42, 1.43it/s, loss=0.0110, lr=3.12e-06, step=8995] Training: 90%|████████▉ | 8996/10000 [1:54:58<11:30, 1.45it/s, loss=0.0110, lr=3.12e-06, step=8995] Training: 90%|████████▉ | 8996/10000 [1:54:58<11:30, 1.45it/s, loss=0.0059, lr=3.12e-06, step=8996] Training: 90%|████████▉ | 8997/10000 [1:54:58<11:25, 1.46it/s, loss=0.0059, lr=3.12e-06, step=8996] Training: 90%|████████▉ | 8997/10000 [1:54:58<11:25, 1.46it/s, loss=0.0270, lr=3.11e-06, step=8997] Training: 90%|████████▉ | 8998/10000 [1:54:59<10:46, 1.55it/s, loss=0.0270, lr=3.11e-06, step=8997] Training: 90%|████████▉ | 8998/10000 [1:54:59<10:46, 1.55it/s, loss=0.0027, lr=3.11e-06, step=8998] Training: 90%|████████▉ | 8999/10000 [1:55:00<11:28, 1.45it/s, loss=0.0027, lr=3.11e-06, step=8998] Training: 90%|████████▉ | 8999/10000 [1:55:00<11:28, 1.45it/s, loss=0.0035, lr=3.11e-06, step=8999]18:01:07.182 [I] step=9000 loss=0.0034 smoothed_loss=0.0100 lr=3.12e-06 grad_norm=0.3807 step_time=0.5450s data_time=0.1145s it/s=1.517 eta_to_10000=659.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0204 grad_action_out_proj=0.1373 grad_shared_expert=0.3867 (10775:train_pytorch.py:850) +18:02:59.063 [I] Saved checkpoint at step 9000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/9000 (10775:train_pytorch.py:350) + Training: 90%|█████████ | 9000/10000 [1:56:52<9:30:05, 34.21s/it, loss=0.0035, lr=3.11e-06, step=8999] Training: 90%|█████████ | 9000/10000 [1:56:52<9:30:05, 34.21s/it, loss=0.0034, lr=3.11e-06, step=9000] Training: 90%|█████████ | 9001/10000 [1:56:53<6:44:15, 24.28s/it, loss=0.0034, lr=3.11e-06, step=9000] Training: 90%|█████████ | 9001/10000 [1:56:53<6:44:15, 24.28s/it, loss=0.0015, lr=3.11e-06, step=9001] Training: 90%|█████████ | 9002/10000 [1:56:54<4:46:34, 17.23s/it, loss=0.0015, lr=3.11e-06, step=9001] Training: 90%|█████████ | 9002/10000 [1:56:54<4:46:34, 17.23s/it, loss=0.0056, lr=3.11e-06, step=9002] Training: 90%|█████████ | 9003/10000 [1:56:55<3:23:27, 12.24s/it, loss=0.0056, lr=3.11e-06, step=9002] Training: 90%|█████████ | 9003/10000 [1:56:55<3:23:27, 12.24s/it, loss=0.0091, lr=3.11e-06, step=9003] Training: 90%|█████████ | 9004/10000 [1:56:55<2:26:22, 8.82s/it, loss=0.0091, lr=3.11e-06, step=9003] Training: 90%|█████████ | 9004/10000 [1:56:55<2:26:22, 8.82s/it, loss=0.0025, lr=3.11e-06, step=9004] Training: 90%|█████████ | 9005/10000 [1:56:56<1:46:14, 6.41s/it, loss=0.0025, lr=3.11e-06, step=9004] Training: 90%|█████████ | 9005/10000 [1:56:56<1:46:14, 6.41s/it, loss=0.0046, lr=3.10e-06, step=9005] Training: 90%|█████████ | 9006/10000 [1:56:57<1:18:03, 4.71s/it, loss=0.0046, lr=3.10e-06, step=9005] Training: 90%|█████████ | 9006/10000 [1:56:57<1:18:03, 4.71s/it, loss=0.0086, lr=3.10e-06, step=9006] Training: 90%|█████████ | 9007/10000 [1:56:58<58:12, 3.52s/it, loss=0.0086, lr=3.10e-06, step=9006] Training: 90%|█████████ | 9007/10000 [1:56:58<58:12, 3.52s/it, loss=0.0063, lr=3.10e-06, step=9007] Training: 90%|█████████ | 9008/10000 [1:56:59<46:07, 2.79s/it, loss=0.0063, lr=3.10e-06, step=9007] Training: 90%|█████████ | 9008/10000 [1:56:59<46:07, 2.79s/it, loss=0.0073, lr=3.10e-06, step=9008] Training: 90%|█████████ | 9009/10000 [1:57:00<38:24, 2.33s/it, loss=0.0073, lr=3.10e-06, step=9008] Training: 90%|█████████ | 9009/10000 [1:57:00<38:24, 2.33s/it, loss=0.0119, lr=3.10e-06, step=9009]18:03:07.655 [I] step=9010 loss=0.0081 smoothed_loss=0.0081 lr=3.10e-06 grad_norm=0.3965 step_time=0.6696s data_time=11.3777s it/s=0.083 eta_to_10000=11926.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0550 grad_shared_expert=0.4289 (10775:train_pytorch.py:850) + Training: 90%|█████████ | 9010/10000 [1:57:01<30:06, 1.82s/it, loss=0.0119, lr=3.10e-06, step=9009] Training: 90%|█████████ | 9010/10000 [1:57:01<30:06, 1.82s/it, loss=0.0081, lr=3.10e-06, step=9010] Training: 90%|█████████ | 9011/10000 [1:57:01<24:32, 1.49s/it, loss=0.0081, lr=3.10e-06, step=9010] Training: 90%|█████████ | 9011/10000 [1:57:01<24:32, 1.49s/it, loss=0.0036, lr=3.10e-06, step=9011] Training: 90%|█████████ | 9012/10000 [1:57:02<21:07, 1.28s/it, loss=0.0036, lr=3.10e-06, step=9011] Training: 90%|█████████ | 9012/10000 [1:57:02<21:07, 1.28s/it, loss=0.0015, lr=3.10e-06, step=9012] Training: 90%|█████████ | 9013/10000 [1:57:03<18:53, 1.15s/it, loss=0.0015, lr=3.10e-06, step=9012] Training: 90%|█████████ | 9013/10000 [1:57:03<18:53, 1.15s/it, loss=0.0053, lr=3.10e-06, step=9013] Training: 90%|█████████ | 9014/10000 [1:57:04<15:42, 1.05it/s, loss=0.0053, lr=3.10e-06, step=9013] Training: 90%|█████████ | 9014/10000 [1:57:04<15:42, 1.05it/s, loss=0.0076, lr=3.09e-06, step=9014] Training: 90%|█████████ | 9015/10000 [1:57:04<13:28, 1.22it/s, loss=0.0076, lr=3.09e-06, step=9014] Training: 90%|█████████ | 9015/10000 [1:57:04<13:28, 1.22it/s, loss=0.0184, lr=3.09e-06, step=9015] Training: 90%|█████████ | 9016/10000 [1:57:05<14:56, 1.10it/s, loss=0.0184, lr=3.09e-06, step=9015] Training: 90%|█████████ | 9016/10000 [1:57:05<14:56, 1.10it/s, loss=0.0514, lr=3.09e-06, step=9016] Training: 90%|█████████ | 9017/10000 [1:57:06<14:41, 1.11it/s, loss=0.0514, lr=3.09e-06, step=9016] Training: 90%|█████████ | 9017/10000 [1:57:06<14:41, 1.11it/s, loss=0.0048, lr=3.09e-06, step=9017] Training: 90%|█████████ | 9018/10000 [1:57:07<14:14, 1.15it/s, loss=0.0048, lr=3.09e-06, step=9017] Training: 90%|█████████ | 9018/10000 [1:57:07<14:14, 1.15it/s, loss=0.0027, lr=3.09e-06, step=9018] Training: 90%|█████████ | 9019/10000 [1:57:08<13:38, 1.20it/s, loss=0.0027, lr=3.09e-06, step=9018] Training: 90%|█████████ | 9019/10000 [1:57:08<13:38, 1.20it/s, loss=0.0350, lr=3.09e-06, step=9019]18:03:15.319 [I] step=9020 loss=0.0030 smoothed_loss=0.0122 lr=3.09e-06 grad_norm=0.4253 step_time=0.6061s data_time=0.1603s it/s=1.305 eta_to_10000=750.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0035 grad_action_out_proj=0.0455 grad_shared_expert=0.1945 (10775:train_pytorch.py:850) + Training: 90%|█████████ | 9020/10000 [1:57:08<13:17, 1.23it/s, loss=0.0350, lr=3.09e-06, step=9019] Training: 90%|█████████ | 9020/10000 [1:57:08<13:17, 1.23it/s, loss=0.0030, lr=3.09e-06, step=9020] Training: 90%|█████████ | 9021/10000 [1:57:09<12:07, 1.34it/s, loss=0.0030, lr=3.09e-06, step=9020] Training: 90%|█████████ | 9021/10000 [1:57:09<12:07, 1.34it/s, loss=0.0061, lr=3.09e-06, step=9021] Training: 90%|█████████ | 9022/10000 [1:57:10<11:10, 1.46it/s, loss=0.0061, lr=3.09e-06, step=9021] Training: 90%|█████████ | 9022/10000 [1:57:10<11:10, 1.46it/s, loss=0.0016, lr=3.08e-06, step=9022] Training: 90%|█████████ | 9023/10000 [1:57:10<11:59, 1.36it/s, loss=0.0016, lr=3.08e-06, step=9022] Training: 90%|█████████ | 9023/10000 [1:57:10<11:59, 1.36it/s, loss=0.0063, lr=3.08e-06, step=9023] Training: 90%|█████████ | 9024/10000 [1:57:11<11:03, 1.47it/s, loss=0.0063, lr=3.08e-06, step=9023] Training: 90%|█████████ | 9024/10000 [1:57:11<11:03, 1.47it/s, loss=0.0079, lr=3.08e-06, step=9024] Training: 90%|█████████ | 9025/10000 [1:57:12<11:03, 1.47it/s, loss=0.0079, lr=3.08e-06, step=9024] Training: 90%|█████████ | 9025/10000 [1:57:12<11:03, 1.47it/s, loss=0.0735, lr=3.08e-06, step=9025] Training: 90%|█████████ | 9026/10000 [1:57:12<11:13, 1.45it/s, loss=0.0735, lr=3.08e-06, step=9025] Training: 90%|█████████ | 9026/10000 [1:57:12<11:13, 1.45it/s, loss=0.0023, lr=3.08e-06, step=9026] Training: 90%|█████████ | 9027/10000 [1:57:13<11:49, 1.37it/s, loss=0.0023, lr=3.08e-06, step=9026] Training: 90%|█████████ | 9027/10000 [1:57:13<11:49, 1.37it/s, loss=0.0025, lr=3.08e-06, step=9027] Training: 90%|█████████ | 9028/10000 [1:57:14<12:27, 1.30it/s, loss=0.0025, lr=3.08e-06, step=9027] Training: 90%|█████████ | 9028/10000 [1:57:14<12:27, 1.30it/s, loss=0.0044, lr=3.08e-06, step=9028] Training: 90%|█████████ | 9029/10000 [1:57:15<11:45, 1.38it/s, loss=0.0044, lr=3.08e-06, step=9028] Training: 90%|█████████ | 9029/10000 [1:57:15<11:45, 1.38it/s, loss=0.0030, lr=3.08e-06, step=9029]18:03:22.701 [I] step=9030 loss=0.0043 smoothed_loss=0.0110 lr=3.08e-06 grad_norm=0.3637 step_time=0.5897s data_time=0.1485s it/s=1.355 eta_to_10000=715.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0411 grad_action_out_proj=0.1424 grad_shared_expert=0.5603 (10775:train_pytorch.py:850) + Training: 90%|█████████ | 9030/10000 [1:57:16<13:45, 1.17it/s, loss=0.0030, lr=3.08e-06, step=9029] Training: 90%|█████████ | 9030/10000 [1:57:16<13:45, 1.17it/s, loss=0.0043, lr=3.08e-06, step=9030] Training: 90%|█████████ | 9031/10000 [1:57:17<13:30, 1.20it/s, loss=0.0043, lr=3.08e-06, step=9030] Training: 90%|█████████ | 9031/10000 [1:57:17<13:30, 1.20it/s, loss=0.0042, lr=3.07e-06, step=9031] Training: 90%|█████████ | 9032/10000 [1:57:17<12:21, 1.31it/s, loss=0.0042, lr=3.07e-06, step=9031] Training: 90%|█████████ | 9032/10000 [1:57:17<12:21, 1.31it/s, loss=0.0049, lr=3.07e-06, step=9032] Training: 90%|█████████ | 9033/10000 [1:57:18<11:11, 1.44it/s, loss=0.0049, lr=3.07e-06, step=9032] Training: 90%|█████████ | 9033/10000 [1:57:18<11:11, 1.44it/s, loss=0.0031, lr=3.07e-06, step=9033] Training: 90%|█████████ | 9034/10000 [1:57:18<10:44, 1.50it/s, loss=0.0031, lr=3.07e-06, step=9033] Training: 90%|█████████ | 9034/10000 [1:57:18<10:44, 1.50it/s, loss=0.0065, lr=3.07e-06, step=9034] Training: 90%|█████████ | 9035/10000 [1:57:19<11:12, 1.44it/s, loss=0.0065, lr=3.07e-06, step=9034] Training: 90%|█████████ | 9035/10000 [1:57:19<11:12, 1.44it/s, loss=0.0141, lr=3.07e-06, step=9035] Training: 90%|█████████ | 9036/10000 [1:57:20<11:54, 1.35it/s, loss=0.0141, lr=3.07e-06, step=9035] Training: 90%|█████████ | 9036/10000 [1:57:20<11:54, 1.35it/s, loss=0.0078, lr=3.07e-06, step=9036] Training: 90%|█████████ | 9037/10000 [1:57:21<14:04, 1.14it/s, loss=0.0078, lr=3.07e-06, step=9036] Training: 90%|█████████ | 9037/10000 [1:57:21<14:04, 1.14it/s, loss=0.0059, lr=3.07e-06, step=9037] Training: 90%|█████████ | 9038/10000 [1:57:22<13:28, 1.19it/s, loss=0.0059, lr=3.07e-06, step=9037] Training: 90%|█████████ | 9038/10000 [1:57:22<13:28, 1.19it/s, loss=0.0013, lr=3.07e-06, step=9038] Training: 90%|█████████ | 9039/10000 [1:57:23<13:27, 1.19it/s, loss=0.0013, lr=3.07e-06, step=9038] Training: 90%|█████████ | 9039/10000 [1:57:23<13:27, 1.19it/s, loss=0.0038, lr=3.06e-06, step=9039]18:03:30.451 [I] step=9040 loss=0.0073 smoothed_loss=0.0077 lr=3.07e-06 grad_norm=0.3098 step_time=0.6047s data_time=0.1702s it/s=1.290 eta_to_10000=743.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0059 grad_action_out_proj=0.0636 grad_shared_expert=0.2207 (10775:train_pytorch.py:850) + Training: 90%|█████████ | 9040/10000 [1:57:24<13:19, 1.20it/s, loss=0.0038, lr=3.06e-06, step=9039] Training: 90%|█████████ | 9040/10000 [1:57:24<13:19, 1.20it/s, loss=0.0073, lr=3.06e-06, step=9040] Training: 90%|█████████ | 9041/10000 [1:57:24<13:18, 1.20it/s, loss=0.0073, lr=3.06e-06, step=9040] Training: 90%|█████████ | 9041/10000 [1:57:24<13:18, 1.20it/s, loss=0.0043, lr=3.06e-06, step=9041] Training: 90%|█████████ | 9042/10000 [1:57:25<12:27, 1.28it/s, loss=0.0043, lr=3.06e-06, step=9041] Training: 90%|█████████ | 9042/10000 [1:57:25<12:27, 1.28it/s, loss=0.0077, lr=3.06e-06, step=9042] Training: 90%|█████████ | 9043/10000 [1:57:26<12:39, 1.26it/s, loss=0.0077, lr=3.06e-06, step=9042] Training: 90%|█████████ | 9043/10000 [1:57:26<12:39, 1.26it/s, loss=0.0061, lr=3.06e-06, step=9043] Training: 90%|█████████ | 9044/10000 [1:57:27<12:36, 1.26it/s, loss=0.0061, lr=3.06e-06, step=9043] Training: 90%|█████████ | 9044/10000 [1:57:27<12:36, 1.26it/s, loss=0.0081, lr=3.06e-06, step=9044] Training: 90%|█████████ | 9045/10000 [1:57:27<12:48, 1.24it/s, loss=0.0081, lr=3.06e-06, step=9044] Training: 90%|█████████ | 9045/10000 [1:57:27<12:48, 1.24it/s, loss=0.0064, lr=3.06e-06, step=9045] Training: 90%|█████████ | 9046/10000 [1:57:28<11:19, 1.40it/s, loss=0.0064, lr=3.06e-06, step=9045] Training: 90%|█████████ | 9046/10000 [1:57:28<11:19, 1.40it/s, loss=0.0062, lr=3.06e-06, step=9046] Training: 90%|█████████ | 9047/10000 [1:57:29<10:45, 1.48it/s, loss=0.0062, lr=3.06e-06, step=9046] Training: 90%|█████████ | 9047/10000 [1:57:29<10:45, 1.48it/s, loss=0.0014, lr=3.06e-06, step=9047] Training: 90%|█████████ | 9048/10000 [1:57:29<10:08, 1.57it/s, loss=0.0014, lr=3.06e-06, step=9047] Training: 90%|█████████ | 9048/10000 [1:57:29<10:08, 1.57it/s, loss=0.0238, lr=3.05e-06, step=9048] Training: 90%|█████████ | 9049/10000 [1:57:30<11:16, 1.41it/s, loss=0.0238, lr=3.05e-06, step=9048] Training: 90%|█████████ | 9049/10000 [1:57:30<11:16, 1.41it/s, loss=0.0039, lr=3.05e-06, step=9049]18:03:37.749 [I] step=9050 loss=0.0052 smoothed_loss=0.0076 lr=3.06e-06 grad_norm=0.3544 step_time=0.5741s data_time=0.1557s it/s=1.370 eta_to_10000=693.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0036 grad_action_out_proj=0.0610 grad_shared_expert=0.2522 (10775:train_pytorch.py:850) + Training: 90%|█████████ | 9050/10000 [1:57:31<11:52, 1.33it/s, loss=0.0039, lr=3.05e-06, step=9049] Training: 90%|█████████ | 9050/10000 [1:57:31<11:52, 1.33it/s, loss=0.0052, lr=3.05e-06, step=9050] Training: 91%|█████████ | 9051/10000 [1:57:32<12:57, 1.22it/s, loss=0.0052, lr=3.05e-06, step=9050] Training: 91%|█████████ | 9051/10000 [1:57:32<12:57, 1.22it/s, loss=0.0098, lr=3.05e-06, step=9051] Training: 91%|█████████ | 9052/10000 [1:57:33<12:43, 1.24it/s, loss=0.0098, lr=3.05e-06, step=9051] Training: 91%|█████████ | 9052/10000 [1:57:33<12:43, 1.24it/s, loss=0.0022, lr=3.05e-06, step=9052] Training: 91%|█████████ | 9053/10000 [1:57:33<12:01, 1.31it/s, loss=0.0022, lr=3.05e-06, step=9052] Training: 91%|█████████ | 9053/10000 [1:57:33<12:01, 1.31it/s, loss=0.0131, lr=3.05e-06, step=9053] Training: 91%|█████████ | 9054/10000 [1:57:34<11:26, 1.38it/s, loss=0.0131, lr=3.05e-06, step=9053] Training: 91%|█████████ | 9054/10000 [1:57:34<11:26, 1.38it/s, loss=0.0097, lr=3.05e-06, step=9054] Training: 91%|█████████ | 9055/10000 [1:57:35<11:18, 1.39it/s, loss=0.0097, lr=3.05e-06, step=9054] Training: 91%|█████████ | 9055/10000 [1:57:35<11:18, 1.39it/s, loss=0.0017, lr=3.05e-06, step=9055] Training: 91%|█████████ | 9056/10000 [1:57:35<11:58, 1.31it/s, loss=0.0017, lr=3.05e-06, step=9055] Training: 91%|█████████ | 9056/10000 [1:57:35<11:58, 1.31it/s, loss=0.0732, lr=3.04e-06, step=9056] Training: 91%|█████████ | 9057/10000 [1:57:36<11:07, 1.41it/s, loss=0.0732, lr=3.04e-06, step=9056] Training: 91%|█████████ | 9057/10000 [1:57:36<11:07, 1.41it/s, loss=0.0058, lr=3.04e-06, step=9057] Training: 91%|█████████ | 9058/10000 [1:57:37<11:26, 1.37it/s, loss=0.0058, lr=3.04e-06, step=9057] Training: 91%|█████████ | 9058/10000 [1:57:37<11:26, 1.37it/s, loss=0.0028, lr=3.04e-06, step=9058] Training: 91%|█████████ | 9059/10000 [1:57:38<12:48, 1.22it/s, loss=0.0028, lr=3.04e-06, step=9058] Training: 91%|█████████ | 9059/10000 [1:57:38<12:48, 1.22it/s, loss=0.0017, lr=3.04e-06, step=9059]18:03:45.785 [I] step=9060 loss=0.0214 smoothed_loss=0.0121 lr=3.05e-06 grad_norm=0.4264 step_time=0.6547s data_time=0.1489s it/s=1.245 eta_to_10000=755.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0127 grad_action_out_proj=0.0886 grad_shared_expert=0.3299 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9060/10000 [1:57:39<13:49, 1.13it/s, loss=0.0017, lr=3.04e-06, step=9059] Training: 91%|█████████ | 9060/10000 [1:57:39<13:49, 1.13it/s, loss=0.0214, lr=3.04e-06, step=9060] Training: 91%|█████████ | 9061/10000 [1:57:39<12:23, 1.26it/s, loss=0.0214, lr=3.04e-06, step=9060] Training: 91%|█████████ | 9061/10000 [1:57:39<12:23, 1.26it/s, loss=0.0016, lr=3.04e-06, step=9061] Training: 91%|█████████ | 9062/10000 [1:57:40<12:07, 1.29it/s, loss=0.0016, lr=3.04e-06, step=9061] Training: 91%|█████████ | 9062/10000 [1:57:40<12:07, 1.29it/s, loss=0.0024, lr=3.04e-06, step=9062] Training: 91%|█████████ | 9063/10000 [1:57:41<11:17, 1.38it/s, loss=0.0024, lr=3.04e-06, step=9062] Training: 91%|█████████ | 9063/10000 [1:57:41<11:17, 1.38it/s, loss=0.0112, lr=3.04e-06, step=9063] Training: 91%|█████████ | 9064/10000 [1:57:41<10:37, 1.47it/s, loss=0.0112, lr=3.04e-06, step=9063] Training: 91%|█████████ | 9064/10000 [1:57:41<10:37, 1.47it/s, loss=0.0244, lr=3.04e-06, step=9064] Training: 91%|█████████ | 9065/10000 [1:57:42<10:51, 1.43it/s, loss=0.0244, lr=3.04e-06, step=9064] Training: 91%|█████████ | 9065/10000 [1:57:42<10:51, 1.43it/s, loss=0.0100, lr=3.03e-06, step=9065] Training: 91%|█████████ | 9066/10000 [1:57:43<13:02, 1.19it/s, loss=0.0100, lr=3.03e-06, step=9065] Training: 91%|█████████ | 9066/10000 [1:57:43<13:02, 1.19it/s, loss=0.0145, lr=3.03e-06, step=9066] Training: 91%|█████████ | 9067/10000 [1:57:45<15:05, 1.03it/s, loss=0.0145, lr=3.03e-06, step=9066] Training: 91%|█████████ | 9067/10000 [1:57:45<15:05, 1.03it/s, loss=0.0049, lr=3.03e-06, step=9067] Training: 91%|█████████ | 9068/10000 [1:57:45<14:07, 1.10it/s, loss=0.0049, lr=3.03e-06, step=9067] Training: 91%|█████████ | 9068/10000 [1:57:45<14:07, 1.10it/s, loss=0.0054, lr=3.03e-06, step=9068] Training: 91%|█████████ | 9069/10000 [1:57:46<12:53, 1.20it/s, loss=0.0054, lr=3.03e-06, step=9068] Training: 91%|█████████ | 9069/10000 [1:57:46<12:53, 1.20it/s, loss=0.0116, lr=3.03e-06, step=9069]18:03:54.128 [I] step=9070 loss=0.0120 smoothed_loss=0.0108 lr=3.03e-06 grad_norm=0.4611 step_time=0.6718s data_time=0.1624s it/s=1.199 eta_to_10000=775.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0105 grad_action_out_proj=0.0733 grad_shared_expert=0.3870 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9070/10000 [1:57:47<14:47, 1.05it/s, loss=0.0116, lr=3.03e-06, step=9069] Training: 91%|█████████ | 9070/10000 [1:57:47<14:47, 1.05it/s, loss=0.0120, lr=3.03e-06, step=9070] Training: 91%|█████████ | 9071/10000 [1:57:48<13:51, 1.12it/s, loss=0.0120, lr=3.03e-06, step=9070] Training: 91%|█████████ | 9071/10000 [1:57:48<13:51, 1.12it/s, loss=0.0035, lr=3.03e-06, step=9071] Training: 91%|█████████ | 9072/10000 [1:57:49<12:24, 1.25it/s, loss=0.0035, lr=3.03e-06, step=9071] Training: 91%|█████████ | 9072/10000 [1:57:49<12:24, 1.25it/s, loss=0.0146, lr=3.03e-06, step=9072] Training: 91%|█████████ | 9073/10000 [1:57:50<13:35, 1.14it/s, loss=0.0146, lr=3.03e-06, step=9072] Training: 91%|█████████ | 9073/10000 [1:57:50<13:35, 1.14it/s, loss=0.0127, lr=3.03e-06, step=9073] Training: 91%|█████████ | 9074/10000 [1:57:50<13:00, 1.19it/s, loss=0.0127, lr=3.03e-06, step=9073] Training: 91%|█████████ | 9074/10000 [1:57:50<13:00, 1.19it/s, loss=0.0732, lr=3.02e-06, step=9074] Training: 91%|█████████ | 9075/10000 [1:57:51<13:04, 1.18it/s, loss=0.0732, lr=3.02e-06, step=9074] Training: 91%|█████████ | 9075/10000 [1:57:51<13:04, 1.18it/s, loss=0.0036, lr=3.02e-06, step=9075] Training: 91%|█████████ | 9076/10000 [1:57:52<12:30, 1.23it/s, loss=0.0036, lr=3.02e-06, step=9075] Training: 91%|█████████ | 9076/10000 [1:57:52<12:30, 1.23it/s, loss=0.0021, lr=3.02e-06, step=9076] Training: 91%|█████████ | 9077/10000 [1:57:53<13:12, 1.16it/s, loss=0.0021, lr=3.02e-06, step=9076] Training: 91%|█████████ | 9077/10000 [1:57:53<13:12, 1.16it/s, loss=0.0076, lr=3.02e-06, step=9077] Training: 91%|█████████ | 9078/10000 [1:57:54<12:13, 1.26it/s, loss=0.0076, lr=3.02e-06, step=9077] Training: 91%|█████████ | 9078/10000 [1:57:54<12:13, 1.26it/s, loss=0.0011, lr=3.02e-06, step=9078] Training: 91%|█████████ | 9079/10000 [1:57:54<11:31, 1.33it/s, loss=0.0011, lr=3.02e-06, step=9078] Training: 91%|█████████ | 9079/10000 [1:57:54<11:31, 1.33it/s, loss=0.0356, lr=3.02e-06, step=9079]18:04:01.938 [I] step=9080 loss=0.0036 smoothed_loss=0.0136 lr=3.02e-06 grad_norm=0.3998 step_time=0.6327s data_time=0.1483s it/s=1.280 eta_to_10000=718.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0041 grad_action_out_proj=0.0655 grad_shared_expert=0.2298 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9080/10000 [1:57:55<11:44, 1.31it/s, loss=0.0356, lr=3.02e-06, step=9079] Training: 91%|█████████ | 9080/10000 [1:57:55<11:44, 1.31it/s, loss=0.0036, lr=3.02e-06, step=9080] Training: 91%|█████████ | 9081/10000 [1:57:56<11:29, 1.33it/s, loss=0.0036, lr=3.02e-06, step=9080] Training: 91%|█████████ | 9081/10000 [1:57:56<11:29, 1.33it/s, loss=0.0112, lr=3.02e-06, step=9081] Training: 91%|█████████ | 9082/10000 [1:57:57<12:07, 1.26it/s, loss=0.0112, lr=3.02e-06, step=9081] Training: 91%|█████████ | 9082/10000 [1:57:57<12:07, 1.26it/s, loss=0.0017, lr=3.02e-06, step=9082] Training: 91%|█████████ | 9083/10000 [1:57:57<12:30, 1.22it/s, loss=0.0017, lr=3.02e-06, step=9082] Training: 91%|█████████ | 9083/10000 [1:57:57<12:30, 1.22it/s, loss=0.0012, lr=3.01e-06, step=9083] Training: 91%|█████████ | 9084/10000 [1:57:58<11:23, 1.34it/s, loss=0.0012, lr=3.01e-06, step=9083] Training: 91%|█████████ | 9084/10000 [1:57:58<11:23, 1.34it/s, loss=0.0175, lr=3.01e-06, step=9084] Training: 91%|█████████ | 9085/10000 [1:57:59<10:50, 1.41it/s, loss=0.0175, lr=3.01e-06, step=9084] Training: 91%|█████████ | 9085/10000 [1:57:59<10:50, 1.41it/s, loss=0.0041, lr=3.01e-06, step=9085] Training: 91%|█████████ | 9086/10000 [1:57:59<10:50, 1.41it/s, loss=0.0041, lr=3.01e-06, step=9085] Training: 91%|█████████ | 9086/10000 [1:57:59<10:50, 1.41it/s, loss=0.0019, lr=3.01e-06, step=9086] Training: 91%|█████████ | 9087/10000 [1:58:00<11:37, 1.31it/s, loss=0.0019, lr=3.01e-06, step=9086] Training: 91%|█████████ | 9087/10000 [1:58:00<11:37, 1.31it/s, loss=0.0080, lr=3.01e-06, step=9087] Training: 91%|█████████ | 9088/10000 [1:58:01<12:06, 1.26it/s, loss=0.0080, lr=3.01e-06, step=9087] Training: 91%|█████████ | 9088/10000 [1:58:01<12:06, 1.26it/s, loss=0.0100, lr=3.01e-06, step=9088] Training: 91%|█████████ | 9089/10000 [1:58:02<11:49, 1.28it/s, loss=0.0100, lr=3.01e-06, step=9088] Training: 91%|█████████ | 9089/10000 [1:58:02<11:49, 1.28it/s, loss=0.0145, lr=3.01e-06, step=9089]18:04:09.361 [I] step=9090 loss=0.0036 smoothed_loss=0.0097 lr=3.01e-06 grad_norm=0.3718 step_time=0.6110s data_time=0.1314s it/s=1.347 eta_to_10000=675.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0099 grad_action_out_proj=0.0983 grad_shared_expert=0.3056 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9090/10000 [1:58:02<10:39, 1.42it/s, loss=0.0145, lr=3.01e-06, step=9089] Training: 91%|█████████ | 9090/10000 [1:58:02<10:39, 1.42it/s, loss=0.0036, lr=3.01e-06, step=9090] Training: 91%|█████████ | 9091/10000 [1:58:03<11:42, 1.29it/s, loss=0.0036, lr=3.01e-06, step=9090] Training: 91%|█████████ | 9091/10000 [1:58:03<11:42, 1.29it/s, loss=0.0041, lr=3.01e-06, step=9091] Training: 91%|█████████ | 9092/10000 [1:58:04<10:56, 1.38it/s, loss=0.0041, lr=3.01e-06, step=9091] Training: 91%|█████████ | 9092/10000 [1:58:04<10:56, 1.38it/s, loss=0.0058, lr=3.00e-06, step=9092] Training: 91%|█████████ | 9093/10000 [1:58:05<11:27, 1.32it/s, loss=0.0058, lr=3.00e-06, step=9092] Training: 91%|█████████ | 9093/10000 [1:58:05<11:27, 1.32it/s, loss=0.0027, lr=3.00e-06, step=9093] Training: 91%|█████████ | 9094/10000 [1:58:06<13:19, 1.13it/s, loss=0.0027, lr=3.00e-06, step=9093] Training: 91%|█████████ | 9094/10000 [1:58:06<13:19, 1.13it/s, loss=0.0058, lr=3.00e-06, step=9094] Training: 91%|█████████ | 9095/10000 [1:58:07<13:02, 1.16it/s, loss=0.0058, lr=3.00e-06, step=9094] Training: 91%|█████████ | 9095/10000 [1:58:07<13:02, 1.16it/s, loss=0.0041, lr=3.00e-06, step=9095] Training: 91%|█████████ | 9096/10000 [1:58:08<12:51, 1.17it/s, loss=0.0041, lr=3.00e-06, step=9095] Training: 91%|█████████ | 9096/10000 [1:58:08<12:51, 1.17it/s, loss=0.0025, lr=3.00e-06, step=9096] Training: 91%|█████████ | 9097/10000 [1:58:08<12:26, 1.21it/s, loss=0.0025, lr=3.00e-06, step=9096] Training: 91%|█████████ | 9097/10000 [1:58:08<12:26, 1.21it/s, loss=0.0042, lr=3.00e-06, step=9097] Training: 91%|█████████ | 9098/10000 [1:58:09<12:34, 1.20it/s, loss=0.0042, lr=3.00e-06, step=9097] Training: 91%|█████████ | 9098/10000 [1:58:09<12:34, 1.20it/s, loss=0.0030, lr=3.00e-06, step=9098] Training: 91%|█████████ | 9099/10000 [1:58:10<12:33, 1.20it/s, loss=0.0030, lr=3.00e-06, step=9098] Training: 91%|█████████ | 9099/10000 [1:58:10<12:33, 1.20it/s, loss=0.0055, lr=3.00e-06, step=9099]18:04:17.893 [I] step=9100 loss=0.0089 smoothed_loss=0.0066 lr=3.00e-06 grad_norm=0.3828 step_time=0.6590s data_time=0.1941s it/s=1.172 eta_to_10000=767.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0508 grad_action_out_proj=0.1685 grad_shared_expert=0.5391 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9100/10000 [1:58:11<12:40, 1.18it/s, loss=0.0055, lr=3.00e-06, step=9099] Training: 91%|█████████ | 9100/10000 [1:58:11<12:40, 1.18it/s, loss=0.0089, lr=3.00e-06, step=9100] Training: 91%|█████████ | 9101/10000 [1:58:11<11:12, 1.34it/s, loss=0.0089, lr=3.00e-06, step=9100] Training: 91%|█████████ | 9101/10000 [1:58:11<11:12, 1.34it/s, loss=0.0068, lr=2.99e-06, step=9101] Training: 91%|█████████ | 9102/10000 [1:58:12<12:20, 1.21it/s, loss=0.0068, lr=2.99e-06, step=9101] Training: 91%|█████████ | 9102/10000 [1:58:12<12:20, 1.21it/s, loss=0.0018, lr=2.99e-06, step=9102] Training: 91%|█████████ | 9103/10000 [1:58:13<11:44, 1.27it/s, loss=0.0018, lr=2.99e-06, step=9102] Training: 91%|█████████ | 9103/10000 [1:58:13<11:44, 1.27it/s, loss=0.0004, lr=2.99e-06, step=9103] Training: 91%|█████████ | 9104/10000 [1:58:14<12:44, 1.17it/s, loss=0.0004, lr=2.99e-06, step=9103] Training: 91%|█████████ | 9104/10000 [1:58:14<12:44, 1.17it/s, loss=0.0010, lr=2.99e-06, step=9104] Training: 91%|█████████ | 9105/10000 [1:58:15<12:38, 1.18it/s, loss=0.0010, lr=2.99e-06, step=9104] Training: 91%|█████████ | 9105/10000 [1:58:15<12:38, 1.18it/s, loss=0.0054, lr=2.99e-06, step=9105] Training: 91%|█████████ | 9106/10000 [1:58:16<12:15, 1.22it/s, loss=0.0054, lr=2.99e-06, step=9105] Training: 91%|█████████ | 9106/10000 [1:58:16<12:15, 1.22it/s, loss=0.0030, lr=2.99e-06, step=9106] Training: 91%|█████████ | 9107/10000 [1:58:16<11:20, 1.31it/s, loss=0.0030, lr=2.99e-06, step=9106] Training: 91%|█████████ | 9107/10000 [1:58:16<11:20, 1.31it/s, loss=0.0019, lr=2.99e-06, step=9107] Training: 91%|█████████ | 9108/10000 [1:58:17<11:44, 1.27it/s, loss=0.0019, lr=2.99e-06, step=9107] Training: 91%|█████████ | 9108/10000 [1:58:17<11:44, 1.27it/s, loss=0.0041, lr=2.99e-06, step=9108] Training: 91%|█████████ | 9109/10000 [1:58:18<12:28, 1.19it/s, loss=0.0041, lr=2.99e-06, step=9108] Training: 91%|█████████ | 9109/10000 [1:58:18<12:28, 1.19it/s, loss=0.0015, lr=2.99e-06, step=9109]18:04:26.000 [I] step=9110 loss=0.0044 smoothed_loss=0.0043 lr=2.99e-06 grad_norm=0.3458 step_time=0.6602s data_time=0.1505s it/s=1.234 eta_to_10000=721.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0096 grad_action_out_proj=0.0955 grad_shared_expert=0.4496 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9110/10000 [1:58:19<12:29, 1.19it/s, loss=0.0015, lr=2.99e-06, step=9109] Training: 91%|█████████ | 9110/10000 [1:58:19<12:29, 1.19it/s, loss=0.0044, lr=2.98e-06, step=9110] Training: 91%|█████████ | 9111/10000 [1:58:20<11:04, 1.34it/s, loss=0.0044, lr=2.98e-06, step=9110] Training: 91%|█████████ | 9111/10000 [1:58:20<11:04, 1.34it/s, loss=0.0048, lr=2.98e-06, step=9111] Training: 91%|█████████ | 9112/10000 [1:58:20<10:57, 1.35it/s, loss=0.0048, lr=2.98e-06, step=9111] Training: 91%|█████████ | 9112/10000 [1:58:20<10:57, 1.35it/s, loss=0.0157, lr=2.98e-06, step=9112] Training: 91%|█████████ | 9113/10000 [1:58:21<09:56, 1.49it/s, loss=0.0157, lr=2.98e-06, step=9112] Training: 91%|█████████ | 9113/10000 [1:58:21<09:56, 1.49it/s, loss=0.0137, lr=2.98e-06, step=9113] Training: 91%|█████████ | 9114/10000 [1:58:21<09:21, 1.58it/s, loss=0.0137, lr=2.98e-06, step=9113] Training: 91%|█████████ | 9114/10000 [1:58:21<09:21, 1.58it/s, loss=0.0076, lr=2.98e-06, step=9114] Training: 91%|█████████ | 9115/10000 [1:58:22<09:41, 1.52it/s, loss=0.0076, lr=2.98e-06, step=9114] Training: 91%|█████████ | 9115/10000 [1:58:22<09:41, 1.52it/s, loss=0.0154, lr=2.98e-06, step=9115] Training: 91%|█████████ | 9116/10000 [1:58:23<11:58, 1.23it/s, loss=0.0154, lr=2.98e-06, step=9115] Training: 91%|█████████ | 9116/10000 [1:58:23<11:58, 1.23it/s, loss=0.0041, lr=2.98e-06, step=9116] Training: 91%|█████████ | 9117/10000 [1:58:24<11:26, 1.29it/s, loss=0.0041, lr=2.98e-06, step=9116] Training: 91%|█████████ | 9117/10000 [1:58:24<11:26, 1.29it/s, loss=0.0200, lr=2.98e-06, step=9117] Training: 91%|█████████ | 9118/10000 [1:58:25<11:32, 1.27it/s, loss=0.0200, lr=2.98e-06, step=9117] Training: 91%|█████████ | 9118/10000 [1:58:25<11:32, 1.27it/s, loss=0.0050, lr=2.98e-06, step=9118] Training: 91%|█████████ | 9119/10000 [1:58:25<10:43, 1.37it/s, loss=0.0050, lr=2.98e-06, step=9118] Training: 91%|█████████ | 9119/10000 [1:58:25<10:43, 1.37it/s, loss=0.0968, lr=2.98e-06, step=9119]18:04:33.121 [I] step=9120 loss=0.0012 smoothed_loss=0.0153 lr=2.98e-06 grad_norm=0.4674 step_time=0.5860s data_time=0.1260s it/s=1.405 eta_to_10000=626.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0263 grad_action_out_proj=0.1273 grad_shared_expert=0.4751 (10775:train_pytorch.py:850) + Training: 91%|█████████ | 9120/10000 [1:58:26<11:07, 1.32it/s, loss=0.0968, lr=2.98e-06, step=9119] Training: 91%|█████████ | 9120/10000 [1:58:26<11:07, 1.32it/s, loss=0.0012, lr=2.97e-06, step=9120] Training: 91%|█████████ | 9121/10000 [1:58:27<11:43, 1.25it/s, loss=0.0012, lr=2.97e-06, step=9120] Training: 91%|█████████ | 9121/10000 [1:58:27<11:43, 1.25it/s, loss=0.0027, lr=2.97e-06, step=9121] Training: 91%|█████████ | 9122/10000 [1:58:28<11:58, 1.22it/s, loss=0.0027, lr=2.97e-06, step=9121] Training: 91%|█████████ | 9122/10000 [1:58:28<11:58, 1.22it/s, loss=0.0044, lr=2.97e-06, step=9122] Training: 91%|█████████ | 9123/10000 [1:58:29<12:53, 1.13it/s, loss=0.0044, lr=2.97e-06, step=9122] Training: 91%|█████████ | 9123/10000 [1:58:29<12:53, 1.13it/s, loss=0.0070, lr=2.97e-06, step=9123] Training: 91%|█████████ | 9124/10000 [1:58:30<11:26, 1.28it/s, loss=0.0070, lr=2.97e-06, step=9123] Training: 91%|█████████ | 9124/10000 [1:58:30<11:26, 1.28it/s, loss=0.0029, lr=2.97e-06, step=9124] Training: 91%|█████████▏| 9125/10000 [1:58:31<12:47, 1.14it/s, loss=0.0029, lr=2.97e-06, step=9124] Training: 91%|█████████▏| 9125/10000 [1:58:31<12:47, 1.14it/s, loss=0.0172, lr=2.97e-06, step=9125] Training: 91%|█████████▏| 9126/10000 [1:58:31<12:26, 1.17it/s, loss=0.0172, lr=2.97e-06, step=9125] Training: 91%|█████████▏| 9126/10000 [1:58:31<12:26, 1.17it/s, loss=0.0053, lr=2.97e-06, step=9126] Training: 91%|█████████▏| 9127/10000 [1:58:32<11:47, 1.23it/s, loss=0.0053, lr=2.97e-06, step=9126] Training: 91%|█████████▏| 9127/10000 [1:58:32<11:47, 1.23it/s, loss=0.0040, lr=2.97e-06, step=9127] Training: 91%|█████████▏| 9128/10000 [1:58:33<12:43, 1.14it/s, loss=0.0040, lr=2.97e-06, step=9127] Training: 91%|█████████▏| 9128/10000 [1:58:33<12:43, 1.14it/s, loss=0.0802, lr=2.97e-06, step=9128] Training: 91%|█████████▏| 9129/10000 [1:58:34<13:12, 1.10it/s, loss=0.0802, lr=2.97e-06, step=9128] Training: 91%|█████████▏| 9129/10000 [1:58:34<13:12, 1.10it/s, loss=0.0024, lr=2.96e-06, step=9129]18:04:41.964 [I] step=9130 loss=0.0153 smoothed_loss=0.0160 lr=2.97e-06 grad_norm=0.5309 step_time=0.6556s data_time=0.2288s it/s=1.131 eta_to_10000=769.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0184 grad_action_out_proj=0.0936 grad_shared_expert=1.3522 (10775:train_pytorch.py:850) + Training: 91%|█████████▏| 9130/10000 [1:58:35<13:03, 1.11it/s, loss=0.0024, lr=2.96e-06, step=9129] Training: 91%|█████████▏| 9130/10000 [1:58:35<13:03, 1.11it/s, loss=0.0153, lr=2.96e-06, step=9130] Training: 91%|█████████▏| 9131/10000 [1:58:36<12:23, 1.17it/s, loss=0.0153, lr=2.96e-06, step=9130] Training: 91%|█████████▏| 9131/10000 [1:58:36<12:23, 1.17it/s, loss=0.0063, lr=2.96e-06, step=9131] Training: 91%|█████████▏| 9132/10000 [1:58:37<12:16, 1.18it/s, loss=0.0063, lr=2.96e-06, step=9131] Training: 91%|█████████▏| 9132/10000 [1:58:37<12:16, 1.18it/s, loss=0.0150, lr=2.96e-06, step=9132] Training: 91%|█████████▏| 9133/10000 [1:58:37<11:32, 1.25it/s, loss=0.0150, lr=2.96e-06, step=9132] Training: 91%|█████████▏| 9133/10000 [1:58:37<11:32, 1.25it/s, loss=0.0021, lr=2.96e-06, step=9133] Training: 91%|█████████▏| 9134/10000 [1:58:38<11:38, 1.24it/s, loss=0.0021, lr=2.96e-06, step=9133] Training: 91%|█████████▏| 9134/10000 [1:58:38<11:38, 1.24it/s, loss=0.0118, lr=2.96e-06, step=9134] Training: 91%|█████████▏| 9135/10000 [1:58:39<11:58, 1.20it/s, loss=0.0118, lr=2.96e-06, step=9134] Training: 91%|█████████▏| 9135/10000 [1:58:39<11:58, 1.20it/s, loss=0.0035, lr=2.96e-06, step=9135] Training: 91%|█████████▏| 9136/10000 [1:58:40<12:13, 1.18it/s, loss=0.0035, lr=2.96e-06, step=9135] Training: 91%|█████████▏| 9136/10000 [1:58:40<12:13, 1.18it/s, loss=0.0183, lr=2.96e-06, step=9136] Training: 91%|█████████▏| 9137/10000 [1:58:41<13:06, 1.10it/s, loss=0.0183, lr=2.96e-06, step=9136] Training: 91%|█████████▏| 9137/10000 [1:58:41<13:06, 1.10it/s, loss=0.0017, lr=2.96e-06, step=9137] Training: 91%|█████████▏| 9138/10000 [1:58:42<13:57, 1.03it/s, loss=0.0017, lr=2.96e-06, step=9137] Training: 91%|█████████▏| 9138/10000 [1:58:42<13:57, 1.03it/s, loss=0.0027, lr=2.96e-06, step=9138] Training: 91%|█████████▏| 9139/10000 [1:58:43<14:30, 1.01s/it, loss=0.0027, lr=2.96e-06, step=9138] Training: 91%|█████████▏| 9139/10000 [1:58:43<14:30, 1.01s/it, loss=0.0166, lr=2.95e-06, step=9139]18:04:51.323 [I] step=9140 loss=0.0159 smoothed_loss=0.0120 lr=2.96e-06 grad_norm=0.4003 step_time=0.6878s data_time=0.2481s it/s=1.069 eta_to_10000=804.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0255 grad_action_out_proj=0.1681 grad_shared_expert=0.5326 (10775:train_pytorch.py:850) + Training: 91%|█████████▏| 9140/10000 [1:58:44<15:23, 1.07s/it, loss=0.0166, lr=2.95e-06, step=9139] Training: 91%|█████████▏| 9140/10000 [1:58:44<15:23, 1.07s/it, loss=0.0159, lr=2.95e-06, step=9140] Training: 91%|█████████▏| 9141/10000 [1:58:45<14:39, 1.02s/it, loss=0.0159, lr=2.95e-06, step=9140] Training: 91%|█████████▏| 9141/10000 [1:58:45<14:39, 1.02s/it, loss=0.0065, lr=2.95e-06, step=9141] Training: 91%|█████████▏| 9142/10000 [1:58:46<14:09, 1.01it/s, loss=0.0065, lr=2.95e-06, step=9141] Training: 91%|█████████▏| 9142/10000 [1:58:46<14:09, 1.01it/s, loss=0.0022, lr=2.95e-06, step=9142] Training: 91%|█████████▏| 9143/10000 [1:58:47<12:23, 1.15it/s, loss=0.0022, lr=2.95e-06, step=9142] Training: 91%|█████████▏| 9143/10000 [1:58:47<12:23, 1.15it/s, loss=0.0051, lr=2.95e-06, step=9143] Training: 91%|█████████▏| 9144/10000 [1:58:47<11:38, 1.23it/s, loss=0.0051, lr=2.95e-06, step=9143] Training: 91%|█████████▏| 9144/10000 [1:58:47<11:38, 1.23it/s, loss=0.0041, lr=2.95e-06, step=9144] Training: 91%|█████████▏| 9145/10000 [1:58:48<11:58, 1.19it/s, loss=0.0041, lr=2.95e-06, step=9144] Training: 91%|█████████▏| 9145/10000 [1:58:48<11:58, 1.19it/s, loss=0.0039, lr=2.95e-06, step=9145] Training: 91%|█████████▏| 9146/10000 [1:58:49<11:25, 1.25it/s, loss=0.0039, lr=2.95e-06, step=9145] Training: 91%|█████████▏| 9146/10000 [1:58:49<11:25, 1.25it/s, loss=0.0155, lr=2.95e-06, step=9146] Training: 91%|█████████▏| 9147/10000 [1:58:50<11:30, 1.23it/s, loss=0.0155, lr=2.95e-06, step=9146] Training: 91%|█████████▏| 9147/10000 [1:58:50<11:30, 1.23it/s, loss=0.0027, lr=2.95e-06, step=9147] Training: 91%|█████████▏| 9148/10000 [1:58:51<10:43, 1.32it/s, loss=0.0027, lr=2.95e-06, step=9147] Training: 91%|█████████▏| 9148/10000 [1:58:51<10:43, 1.32it/s, loss=0.0226, lr=2.94e-06, step=9148] Training: 91%|█████████▏| 9149/10000 [1:58:51<10:18, 1.38it/s, loss=0.0226, lr=2.94e-06, step=9148] Training: 91%|█████████▏| 9149/10000 [1:58:51<10:18, 1.38it/s, loss=0.0069, lr=2.94e-06, step=9149]18:04:59.011 [I] step=9150 loss=0.0033 smoothed_loss=0.0092 lr=2.95e-06 grad_norm=0.3861 step_time=0.6180s data_time=0.1509s it/s=1.301 eta_to_10000=653.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0078 grad_action_out_proj=0.0626 grad_shared_expert=0.1623 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9150/10000 [1:58:52<10:52, 1.30it/s, loss=0.0069, lr=2.94e-06, step=9149] Training: 92%|█████████▏| 9150/10000 [1:58:52<10:52, 1.30it/s, loss=0.0033, lr=2.94e-06, step=9150] Training: 92%|█████████▏| 9151/10000 [1:58:53<11:08, 1.27it/s, loss=0.0033, lr=2.94e-06, step=9150] Training: 92%|█████████▏| 9151/10000 [1:58:53<11:08, 1.27it/s, loss=0.0012, lr=2.94e-06, step=9151] Training: 92%|█████████▏| 9152/10000 [1:58:54<12:46, 1.11it/s, loss=0.0012, lr=2.94e-06, step=9151] Training: 92%|█████████▏| 9152/10000 [1:58:54<12:46, 1.11it/s, loss=0.0015, lr=2.94e-06, step=9152] Training: 92%|█████████▏| 9153/10000 [1:58:55<12:18, 1.15it/s, loss=0.0015, lr=2.94e-06, step=9152] Training: 92%|█████████▏| 9153/10000 [1:58:55<12:18, 1.15it/s, loss=0.0016, lr=2.94e-06, step=9153] Training: 92%|█████████▏| 9154/10000 [1:58:56<11:53, 1.19it/s, loss=0.0016, lr=2.94e-06, step=9153] Training: 92%|█████████▏| 9154/10000 [1:58:56<11:53, 1.19it/s, loss=0.0082, lr=2.94e-06, step=9154] Training: 92%|█████████▏| 9155/10000 [1:58:56<11:41, 1.20it/s, loss=0.0082, lr=2.94e-06, step=9154] Training: 92%|█████████▏| 9155/10000 [1:58:56<11:41, 1.20it/s, loss=0.0005, lr=2.94e-06, step=9155] Training: 92%|█████████▏| 9156/10000 [1:58:57<12:08, 1.16it/s, loss=0.0005, lr=2.94e-06, step=9155] Training: 92%|█████████▏| 9156/10000 [1:58:57<12:08, 1.16it/s, loss=0.0066, lr=2.94e-06, step=9156] Training: 92%|█████████▏| 9157/10000 [1:58:58<10:45, 1.31it/s, loss=0.0066, lr=2.94e-06, step=9156] Training: 92%|█████████▏| 9157/10000 [1:58:58<10:45, 1.31it/s, loss=0.0046, lr=2.94e-06, step=9157] Training: 92%|█████████▏| 9158/10000 [1:58:59<10:15, 1.37it/s, loss=0.0046, lr=2.94e-06, step=9157] Training: 92%|█████████▏| 9158/10000 [1:58:59<10:15, 1.37it/s, loss=0.0256, lr=2.93e-06, step=9158] Training: 92%|█████████▏| 9159/10000 [1:58:59<10:50, 1.29it/s, loss=0.0256, lr=2.93e-06, step=9158] Training: 92%|█████████▏| 9159/10000 [1:58:59<10:50, 1.29it/s, loss=0.0096, lr=2.93e-06, step=9159]18:05:07.222 [I] step=9160 loss=0.0074 smoothed_loss=0.0083 lr=2.94e-06 grad_norm=0.4134 step_time=0.6674s data_time=0.1537s it/s=1.218 eta_to_10000=689.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0035 grad_action_out_proj=0.0428 grad_shared_expert=0.2705 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9160/10000 [1:59:00<11:03, 1.27it/s, loss=0.0096, lr=2.93e-06, step=9159] Training: 92%|█████████▏| 9160/10000 [1:59:00<11:03, 1.27it/s, loss=0.0074, lr=2.93e-06, step=9160] Training: 92%|█████████▏| 9161/10000 [1:59:01<10:02, 1.39it/s, loss=0.0074, lr=2.93e-06, step=9160] Training: 92%|█████████▏| 9161/10000 [1:59:01<10:02, 1.39it/s, loss=0.0061, lr=2.93e-06, step=9161] Training: 92%|█████████▏| 9162/10000 [1:59:02<10:20, 1.35it/s, loss=0.0061, lr=2.93e-06, step=9161] Training: 92%|█████████▏| 9162/10000 [1:59:02<10:20, 1.35it/s, loss=0.0039, lr=2.93e-06, step=9162] Training: 92%|█████████▏| 9163/10000 [1:59:02<10:07, 1.38it/s, loss=0.0039, lr=2.93e-06, step=9162] Training: 92%|█████████▏| 9163/10000 [1:59:02<10:07, 1.38it/s, loss=0.0029, lr=2.93e-06, step=9163] Training: 92%|█████████▏| 9164/10000 [1:59:03<10:14, 1.36it/s, loss=0.0029, lr=2.93e-06, step=9163] Training: 92%|█████████▏| 9164/10000 [1:59:03<10:14, 1.36it/s, loss=0.0147, lr=2.93e-06, step=9164] Training: 92%|█████████▏| 9165/10000 [1:59:04<10:09, 1.37it/s, loss=0.0147, lr=2.93e-06, step=9164] Training: 92%|█████████▏| 9165/10000 [1:59:04<10:09, 1.37it/s, loss=0.0086, lr=2.93e-06, step=9165] Training: 92%|█████████▏| 9166/10000 [1:59:05<11:12, 1.24it/s, loss=0.0086, lr=2.93e-06, step=9165] Training: 92%|█████████▏| 9166/10000 [1:59:05<11:12, 1.24it/s, loss=0.0053, lr=2.93e-06, step=9166] Training: 92%|█████████▏| 9167/10000 [1:59:06<11:19, 1.23it/s, loss=0.0053, lr=2.93e-06, step=9166] Training: 92%|█████████▏| 9167/10000 [1:59:06<11:19, 1.23it/s, loss=0.0064, lr=2.93e-06, step=9167] Training: 92%|█████████▏| 9168/10000 [1:59:06<10:56, 1.27it/s, loss=0.0064, lr=2.93e-06, step=9167] Training: 92%|█████████▏| 9168/10000 [1:59:06<10:56, 1.27it/s, loss=0.0033, lr=2.92e-06, step=9168] Training: 92%|█████████▏| 9169/10000 [1:59:07<10:42, 1.29it/s, loss=0.0033, lr=2.92e-06, step=9168] Training: 92%|█████████▏| 9169/10000 [1:59:07<10:42, 1.29it/s, loss=0.0208, lr=2.92e-06, step=9169]18:05:14.842 [I] step=9170 loss=0.0064 smoothed_loss=0.0083 lr=2.93e-06 grad_norm=0.4466 step_time=0.6171s data_time=0.1448s it/s=1.313 eta_to_10000=632.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0068 grad_action_out_proj=0.0756 grad_shared_expert=0.2059 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9170/10000 [1:59:08<10:54, 1.27it/s, loss=0.0208, lr=2.92e-06, step=9169] Training: 92%|█████████▏| 9170/10000 [1:59:08<10:54, 1.27it/s, loss=0.0064, lr=2.92e-06, step=9170] Training: 92%|█████████▏| 9171/10000 [1:59:09<10:55, 1.26it/s, loss=0.0064, lr=2.92e-06, step=9170] Training: 92%|█████████▏| 9171/10000 [1:59:09<10:55, 1.26it/s, loss=0.0062, lr=2.92e-06, step=9171] Training: 92%|█████████▏| 9172/10000 [1:59:09<10:20, 1.33it/s, loss=0.0062, lr=2.92e-06, step=9171] Training: 92%|█████████▏| 9172/10000 [1:59:09<10:20, 1.33it/s, loss=0.0041, lr=2.92e-06, step=9172] Training: 92%|█████████▏| 9173/10000 [1:59:10<11:29, 1.20it/s, loss=0.0041, lr=2.92e-06, step=9172] Training: 92%|█████████▏| 9173/10000 [1:59:10<11:29, 1.20it/s, loss=0.0090, lr=2.92e-06, step=9173] Training: 92%|█████████▏| 9174/10000 [1:59:11<10:12, 1.35it/s, loss=0.0090, lr=2.92e-06, step=9173] Training: 92%|█████████▏| 9174/10000 [1:59:11<10:12, 1.35it/s, loss=0.0128, lr=2.92e-06, step=9174] Training: 92%|█████████▏| 9175/10000 [1:59:11<09:18, 1.48it/s, loss=0.0128, lr=2.92e-06, step=9174] Training: 92%|█████████▏| 9175/10000 [1:59:11<09:18, 1.48it/s, loss=0.0051, lr=2.92e-06, step=9175] Training: 92%|█████████▏| 9176/10000 [1:59:12<10:26, 1.32it/s, loss=0.0051, lr=2.92e-06, step=9175] Training: 92%|█████████▏| 9176/10000 [1:59:12<10:26, 1.32it/s, loss=0.0228, lr=2.92e-06, step=9176] Training: 92%|█████████▏| 9177/10000 [1:59:13<11:28, 1.19it/s, loss=0.0228, lr=2.92e-06, step=9176] Training: 92%|█████████▏| 9177/10000 [1:59:13<11:28, 1.19it/s, loss=0.0069, lr=2.92e-06, step=9177] Training: 92%|█████████▏| 9178/10000 [1:59:14<11:27, 1.20it/s, loss=0.0069, lr=2.92e-06, step=9177] Training: 92%|█████████▏| 9178/10000 [1:59:14<11:27, 1.20it/s, loss=0.0095, lr=2.91e-06, step=9178] Training: 92%|█████████▏| 9179/10000 [1:59:15<10:11, 1.34it/s, loss=0.0095, lr=2.91e-06, step=9178] Training: 92%|█████████▏| 9179/10000 [1:59:15<10:11, 1.34it/s, loss=0.0142, lr=2.91e-06, step=9179]18:05:22.858 [I] step=9180 loss=0.0103 smoothed_loss=0.0098 lr=2.92e-06 grad_norm=0.4200 step_time=0.6198s data_time=0.1818s it/s=1.248 eta_to_10000=657.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.0857 grad_shared_expert=0.4316 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9180/10000 [1:59:16<11:49, 1.16it/s, loss=0.0142, lr=2.91e-06, step=9179] Training: 92%|█████████▏| 9180/10000 [1:59:16<11:49, 1.16it/s, loss=0.0103, lr=2.91e-06, step=9180] Training: 92%|█████████▏| 9181/10000 [1:59:16<10:31, 1.30it/s, loss=0.0103, lr=2.91e-06, step=9180] Training: 92%|█████████▏| 9181/10000 [1:59:16<10:31, 1.30it/s, loss=0.0180, lr=2.91e-06, step=9181] Training: 92%|█████████▏| 9182/10000 [1:59:17<10:54, 1.25it/s, loss=0.0180, lr=2.91e-06, step=9181] Training: 92%|█████████▏| 9182/10000 [1:59:17<10:54, 1.25it/s, loss=0.0110, lr=2.91e-06, step=9182] Training: 92%|█████████▏| 9183/10000 [1:59:18<10:29, 1.30it/s, loss=0.0110, lr=2.91e-06, step=9182] Training: 92%|█████████▏| 9183/10000 [1:59:18<10:29, 1.30it/s, loss=0.0064, lr=2.91e-06, step=9183] Training: 92%|█████████▏| 9184/10000 [1:59:19<09:31, 1.43it/s, loss=0.0064, lr=2.91e-06, step=9183] Training: 92%|█████████▏| 9184/10000 [1:59:19<09:31, 1.43it/s, loss=0.0140, lr=2.91e-06, step=9184] Training: 92%|█████████▏| 9185/10000 [1:59:19<09:01, 1.51it/s, loss=0.0140, lr=2.91e-06, step=9184] Training: 92%|█████████▏| 9185/10000 [1:59:19<09:01, 1.51it/s, loss=0.0049, lr=2.91e-06, step=9185] Training: 92%|█████████▏| 9186/10000 [1:59:20<08:35, 1.58it/s, loss=0.0049, lr=2.91e-06, step=9185] Training: 92%|█████████▏| 9186/10000 [1:59:20<08:35, 1.58it/s, loss=0.0034, lr=2.91e-06, step=9186] Training: 92%|█████████▏| 9187/10000 [1:59:21<09:19, 1.45it/s, loss=0.0034, lr=2.91e-06, step=9186] Training: 92%|█████████▏| 9187/10000 [1:59:21<09:19, 1.45it/s, loss=0.0038, lr=2.91e-06, step=9187] Training: 92%|█████████▏| 9188/10000 [1:59:22<11:22, 1.19it/s, loss=0.0038, lr=2.91e-06, step=9187] Training: 92%|█████████▏| 9188/10000 [1:59:22<11:22, 1.19it/s, loss=0.0086, lr=2.90e-06, step=9188] Training: 92%|█████████▏| 9189/10000 [1:59:23<12:08, 1.11it/s, loss=0.0086, lr=2.90e-06, step=9188] Training: 92%|█████████▏| 9189/10000 [1:59:23<12:08, 1.11it/s, loss=0.0071, lr=2.90e-06, step=9189]18:05:30.545 [I] step=9190 loss=0.0128 smoothed_loss=0.0090 lr=2.91e-06 grad_norm=0.3965 step_time=0.6213s data_time=0.1474s it/s=1.301 eta_to_10000=622.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0117 grad_action_out_proj=0.0810 grad_shared_expert=0.3363 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9190/10000 [1:59:24<11:54, 1.13it/s, loss=0.0071, lr=2.90e-06, step=9189] Training: 92%|█████████▏| 9190/10000 [1:59:24<11:54, 1.13it/s, loss=0.0128, lr=2.90e-06, step=9190] Training: 92%|█████████▏| 9191/10000 [1:59:24<10:42, 1.26it/s, loss=0.0128, lr=2.90e-06, step=9190] Training: 92%|█████████▏| 9191/10000 [1:59:24<10:42, 1.26it/s, loss=0.0011, lr=2.90e-06, step=9191] Training: 92%|█████████▏| 9192/10000 [1:59:25<11:43, 1.15it/s, loss=0.0011, lr=2.90e-06, step=9191] Training: 92%|█████████▏| 9192/10000 [1:59:25<11:43, 1.15it/s, loss=0.0010, lr=2.90e-06, step=9192] Training: 92%|█████████▏| 9193/10000 [1:59:26<11:27, 1.17it/s, loss=0.0010, lr=2.90e-06, step=9192] Training: 92%|█████████▏| 9193/10000 [1:59:26<11:27, 1.17it/s, loss=0.0016, lr=2.90e-06, step=9193] Training: 92%|█████████▏| 9194/10000 [1:59:27<10:18, 1.30it/s, loss=0.0016, lr=2.90e-06, step=9193] Training: 92%|█████████▏| 9194/10000 [1:59:27<10:18, 1.30it/s, loss=0.0011, lr=2.90e-06, step=9194] Training: 92%|█████████▏| 9195/10000 [1:59:27<10:37, 1.26it/s, loss=0.0011, lr=2.90e-06, step=9194] Training: 92%|█████████▏| 9195/10000 [1:59:27<10:37, 1.26it/s, loss=0.0104, lr=2.90e-06, step=9195] Training: 92%|█████████▏| 9196/10000 [1:59:29<11:53, 1.13it/s, loss=0.0104, lr=2.90e-06, step=9195] Training: 92%|█████████▏| 9196/10000 [1:59:29<11:53, 1.13it/s, loss=0.0162, lr=2.90e-06, step=9196] Training: 92%|█████████▏| 9197/10000 [1:59:30<12:14, 1.09it/s, loss=0.0162, lr=2.90e-06, step=9196] Training: 92%|█████████▏| 9197/10000 [1:59:30<12:14, 1.09it/s, loss=0.0023, lr=2.90e-06, step=9197] Training: 92%|█████████▏| 9198/10000 [1:59:30<11:39, 1.15it/s, loss=0.0023, lr=2.90e-06, step=9197] Training: 92%|█████████▏| 9198/10000 [1:59:30<11:39, 1.15it/s, loss=0.0021, lr=2.89e-06, step=9198] Training: 92%|█████████▏| 9199/10000 [1:59:31<10:46, 1.24it/s, loss=0.0021, lr=2.89e-06, step=9198] Training: 92%|█████████▏| 9199/10000 [1:59:31<10:46, 1.24it/s, loss=0.0060, lr=2.89e-06, step=9199]18:05:38.864 [I] step=9200 loss=0.0019 smoothed_loss=0.0061 lr=2.90e-06 grad_norm=0.3647 step_time=0.6438s data_time=0.1881s it/s=1.202 eta_to_10000=665.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0056 grad_action_out_proj=0.0578 grad_shared_expert=0.2340 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9200/10000 [1:59:32<11:17, 1.18it/s, loss=0.0060, lr=2.89e-06, step=9199] Training: 92%|█████████▏| 9200/10000 [1:59:32<11:17, 1.18it/s, loss=0.0019, lr=2.89e-06, step=9200] Training: 92%|█████████▏| 9201/10000 [1:59:33<11:58, 1.11it/s, loss=0.0019, lr=2.89e-06, step=9200] Training: 92%|█████████▏| 9201/10000 [1:59:33<11:58, 1.11it/s, loss=0.0199, lr=2.89e-06, step=9201] Training: 92%|█████████▏| 9202/10000 [1:59:34<12:52, 1.03it/s, loss=0.0199, lr=2.89e-06, step=9201] Training: 92%|█████████▏| 9202/10000 [1:59:34<12:52, 1.03it/s, loss=0.0019, lr=2.89e-06, step=9202] Training: 92%|█████████▏| 9203/10000 [1:59:35<11:06, 1.20it/s, loss=0.0019, lr=2.89e-06, step=9202] Training: 92%|█████████▏| 9203/10000 [1:59:35<11:06, 1.20it/s, loss=0.0057, lr=2.89e-06, step=9203] Training: 92%|█████████▏| 9204/10000 [1:59:35<10:33, 1.26it/s, loss=0.0057, lr=2.89e-06, step=9203] Training: 92%|█████████▏| 9204/10000 [1:59:35<10:33, 1.26it/s, loss=0.0008, lr=2.89e-06, step=9204] Training: 92%|█████████▏| 9205/10000 [1:59:36<11:30, 1.15it/s, loss=0.0008, lr=2.89e-06, step=9204] Training: 92%|█████████▏| 9205/10000 [1:59:36<11:30, 1.15it/s, loss=0.0033, lr=2.89e-06, step=9205] Training: 92%|█████████▏| 9206/10000 [1:59:37<10:54, 1.21it/s, loss=0.0033, lr=2.89e-06, step=9205] Training: 92%|█████████▏| 9206/10000 [1:59:37<10:54, 1.21it/s, loss=0.0033, lr=2.89e-06, step=9206] Training: 92%|█████████▏| 9207/10000 [1:59:38<09:38, 1.37it/s, loss=0.0033, lr=2.89e-06, step=9206] Training: 92%|█████████▏| 9207/10000 [1:59:38<09:38, 1.37it/s, loss=0.0037, lr=2.89e-06, step=9207] Training: 92%|█████████▏| 9208/10000 [1:59:38<10:04, 1.31it/s, loss=0.0037, lr=2.89e-06, step=9207] Training: 92%|█████████▏| 9208/10000 [1:59:38<10:04, 1.31it/s, loss=0.0025, lr=2.88e-06, step=9208] Training: 92%|█████████▏| 9209/10000 [1:59:39<09:12, 1.43it/s, loss=0.0025, lr=2.88e-06, step=9208] Training: 92%|█████████▏| 9209/10000 [1:59:39<09:12, 1.43it/s, loss=0.0025, lr=2.88e-06, step=9209]18:05:47.023 [I] step=9210 loss=0.0070 smoothed_loss=0.0051 lr=2.89e-06 grad_norm=0.4857 step_time=0.6336s data_time=0.1823s it/s=1.226 eta_to_10000=644.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0094 grad_action_out_proj=0.0908 grad_shared_expert=0.3534 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9210/10000 [1:59:40<10:51, 1.21it/s, loss=0.0025, lr=2.88e-06, step=9209] Training: 92%|█████████▏| 9210/10000 [1:59:40<10:51, 1.21it/s, loss=0.0070, lr=2.88e-06, step=9210] Training: 92%|█████████▏| 9211/10000 [1:59:41<11:02, 1.19it/s, loss=0.0070, lr=2.88e-06, step=9210] Training: 92%|█████████▏| 9211/10000 [1:59:41<11:02, 1.19it/s, loss=0.0091, lr=2.88e-06, step=9211] Training: 92%|█████████▏| 9212/10000 [1:59:42<10:23, 1.26it/s, loss=0.0091, lr=2.88e-06, step=9211] Training: 92%|█████████▏| 9212/10000 [1:59:42<10:23, 1.26it/s, loss=0.0175, lr=2.88e-06, step=9212] Training: 92%|█████████▏| 9213/10000 [1:59:42<09:42, 1.35it/s, loss=0.0175, lr=2.88e-06, step=9212] Training: 92%|█████████▏| 9213/10000 [1:59:42<09:42, 1.35it/s, loss=0.0097, lr=2.88e-06, step=9213] Training: 92%|█████████▏| 9214/10000 [1:59:43<10:05, 1.30it/s, loss=0.0097, lr=2.88e-06, step=9213] Training: 92%|█████████▏| 9214/10000 [1:59:43<10:05, 1.30it/s, loss=0.0034, lr=2.88e-06, step=9214] Training: 92%|█████████▏| 9215/10000 [1:59:44<09:30, 1.38it/s, loss=0.0034, lr=2.88e-06, step=9214] Training: 92%|█████████▏| 9215/10000 [1:59:44<09:30, 1.38it/s, loss=0.0125, lr=2.88e-06, step=9215] Training: 92%|█████████▏| 9216/10000 [1:59:45<09:50, 1.33it/s, loss=0.0125, lr=2.88e-06, step=9215] Training: 92%|█████████▏| 9216/10000 [1:59:45<09:50, 1.33it/s, loss=0.0022, lr=2.88e-06, step=9216] Training: 92%|█████████▏| 9217/10000 [1:59:46<11:40, 1.12it/s, loss=0.0022, lr=2.88e-06, step=9216] Training: 92%|█████████▏| 9217/10000 [1:59:46<11:40, 1.12it/s, loss=0.0088, lr=2.88e-06, step=9217] Training: 92%|█████████▏| 9218/10000 [1:59:46<10:24, 1.25it/s, loss=0.0088, lr=2.88e-06, step=9217] Training: 92%|█████████▏| 9218/10000 [1:59:46<10:24, 1.25it/s, loss=0.0132, lr=2.88e-06, step=9218] Training: 92%|█████████▏| 9219/10000 [1:59:47<10:44, 1.21it/s, loss=0.0132, lr=2.88e-06, step=9218] Training: 92%|█████████▏| 9219/10000 [1:59:47<10:44, 1.21it/s, loss=0.0073, lr=2.87e-06, step=9219]18:05:55.096 [I] step=9220 loss=0.0048 smoothed_loss=0.0073 lr=2.88e-06 grad_norm=0.4120 step_time=0.6303s data_time=0.1770s it/s=1.239 eta_to_10000=629.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0030 grad_action_out_proj=0.0481 grad_shared_expert=0.1851 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9220/10000 [1:59:48<11:09, 1.17it/s, loss=0.0073, lr=2.87e-06, step=9219] Training: 92%|█████████▏| 9220/10000 [1:59:48<11:09, 1.17it/s, loss=0.0048, lr=2.87e-06, step=9220] Training: 92%|█████████▏| 9221/10000 [1:59:49<11:11, 1.16it/s, loss=0.0048, lr=2.87e-06, step=9220] Training: 92%|█████████▏| 9221/10000 [1:59:49<11:11, 1.16it/s, loss=0.0027, lr=2.87e-06, step=9221] Training: 92%|█████████▏| 9222/10000 [1:59:50<10:25, 1.24it/s, loss=0.0027, lr=2.87e-06, step=9221] Training: 92%|█████████▏| 9222/10000 [1:59:50<10:25, 1.24it/s, loss=0.0036, lr=2.87e-06, step=9222] Training: 92%|█████████▏| 9223/10000 [1:59:51<11:31, 1.12it/s, loss=0.0036, lr=2.87e-06, step=9222] Training: 92%|█████████▏| 9223/10000 [1:59:51<11:31, 1.12it/s, loss=0.0029, lr=2.87e-06, step=9223] Training: 92%|█████████▏| 9224/10000 [1:59:52<11:10, 1.16it/s, loss=0.0029, lr=2.87e-06, step=9223] Training: 92%|█████████▏| 9224/10000 [1:59:52<11:10, 1.16it/s, loss=0.0139, lr=2.87e-06, step=9224] Training: 92%|█████████▏| 9225/10000 [1:59:53<11:53, 1.09it/s, loss=0.0139, lr=2.87e-06, step=9224] Training: 92%|█████████▏| 9225/10000 [1:59:53<11:53, 1.09it/s, loss=0.0046, lr=2.87e-06, step=9225] Training: 92%|█████████▏| 9226/10000 [1:59:54<12:12, 1.06it/s, loss=0.0046, lr=2.87e-06, step=9225] Training: 92%|█████████▏| 9226/10000 [1:59:54<12:12, 1.06it/s, loss=0.0038, lr=2.87e-06, step=9226] Training: 92%|█████████▏| 9227/10000 [1:59:55<11:59, 1.07it/s, loss=0.0038, lr=2.87e-06, step=9226] Training: 92%|█████████▏| 9227/10000 [1:59:55<11:59, 1.07it/s, loss=0.0117, lr=2.87e-06, step=9227] Training: 92%|█████████▏| 9228/10000 [1:59:55<10:20, 1.24it/s, loss=0.0117, lr=2.87e-06, step=9227] Training: 92%|█████████▏| 9228/10000 [1:59:55<10:20, 1.24it/s, loss=0.0918, lr=2.87e-06, step=9228] Training: 92%|█████████▏| 9229/10000 [1:59:56<10:23, 1.24it/s, loss=0.0918, lr=2.87e-06, step=9228] Training: 92%|█████████▏| 9229/10000 [1:59:56<10:23, 1.24it/s, loss=0.0035, lr=2.86e-06, step=9229]18:06:03.554 [I] step=9230 loss=0.0054 smoothed_loss=0.0133 lr=2.87e-06 grad_norm=0.3829 step_time=0.6192s data_time=0.2266s it/s=1.182 eta_to_10000=651.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0095 grad_action_out_proj=0.0789 grad_shared_expert=0.3080 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9230/10000 [1:59:57<10:07, 1.27it/s, loss=0.0035, lr=2.86e-06, step=9229] Training: 92%|█████████▏| 9230/10000 [1:59:57<10:07, 1.27it/s, loss=0.0054, lr=2.86e-06, step=9230] Training: 92%|█████████▏| 9231/10000 [1:59:57<10:22, 1.24it/s, loss=0.0054, lr=2.86e-06, step=9230] Training: 92%|█████████▏| 9231/10000 [1:59:57<10:22, 1.24it/s, loss=0.0017, lr=2.86e-06, step=9231] Training: 92%|█████████▏| 9232/10000 [1:59:58<09:15, 1.38it/s, loss=0.0017, lr=2.86e-06, step=9231] Training: 92%|█████████▏| 9232/10000 [1:59:58<09:15, 1.38it/s, loss=0.0016, lr=2.86e-06, step=9232] Training: 92%|█████████▏| 9233/10000 [1:59:59<10:45, 1.19it/s, loss=0.0016, lr=2.86e-06, step=9232] Training: 92%|█████████▏| 9233/10000 [1:59:59<10:45, 1.19it/s, loss=0.0134, lr=2.86e-06, step=9233] Training: 92%|█████████▏| 9234/10000 [2:00:00<09:53, 1.29it/s, loss=0.0134, lr=2.86e-06, step=9233] Training: 92%|█████████▏| 9234/10000 [2:00:00<09:53, 1.29it/s, loss=0.0140, lr=2.86e-06, step=9234] Training: 92%|█████████▏| 9235/10000 [2:00:00<09:22, 1.36it/s, loss=0.0140, lr=2.86e-06, step=9234] Training: 92%|█████████▏| 9235/10000 [2:00:00<09:22, 1.36it/s, loss=0.0145, lr=2.86e-06, step=9235] Training: 92%|█████████▏| 9236/10000 [2:00:01<09:21, 1.36it/s, loss=0.0145, lr=2.86e-06, step=9235] Training: 92%|█████████▏| 9236/10000 [2:00:01<09:21, 1.36it/s, loss=0.0036, lr=2.86e-06, step=9236] Training: 92%|█████████▏| 9237/10000 [2:00:02<10:49, 1.17it/s, loss=0.0036, lr=2.86e-06, step=9236] Training: 92%|█████████▏| 9237/10000 [2:00:02<10:49, 1.17it/s, loss=0.0024, lr=2.86e-06, step=9237] Training: 92%|█████████▏| 9238/10000 [2:00:03<11:14, 1.13it/s, loss=0.0024, lr=2.86e-06, step=9237] Training: 92%|█████████▏| 9238/10000 [2:00:03<11:14, 1.13it/s, loss=0.0152, lr=2.86e-06, step=9238] Training: 92%|█████████▏| 9239/10000 [2:00:04<10:00, 1.27it/s, loss=0.0152, lr=2.86e-06, step=9238] Training: 92%|█████████▏| 9239/10000 [2:00:04<10:00, 1.27it/s, loss=0.0013, lr=2.86e-06, step=9239]18:06:11.241 [I] step=9240 loss=0.0057 smoothed_loss=0.0093 lr=2.86e-06 grad_norm=0.3647 step_time=0.6375s data_time=0.1312s it/s=1.301 eta_to_10000=584.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0086 grad_action_out_proj=0.0648 grad_shared_expert=0.3614 (10775:train_pytorch.py:850) + Training: 92%|█████████▏| 9240/10000 [2:00:04<09:03, 1.40it/s, loss=0.0013, lr=2.86e-06, step=9239] Training: 92%|█████████▏| 9240/10000 [2:00:04<09:03, 1.40it/s, loss=0.0057, lr=2.85e-06, step=9240] Training: 92%|█████████▏| 9241/10000 [2:00:05<08:48, 1.44it/s, loss=0.0057, lr=2.85e-06, step=9240] Training: 92%|█████████▏| 9241/10000 [2:00:05<08:48, 1.44it/s, loss=0.0025, lr=2.85e-06, step=9241] Training: 92%|█████████▏| 9242/10000 [2:00:06<08:22, 1.51it/s, loss=0.0025, lr=2.85e-06, step=9241] Training: 92%|█████████▏| 9242/10000 [2:00:06<08:22, 1.51it/s, loss=0.0025, lr=2.85e-06, step=9242] Training: 92%|█████████▏| 9243/10000 [2:00:06<07:45, 1.63it/s, loss=0.0025, lr=2.85e-06, step=9242] Training: 92%|█████████▏| 9243/10000 [2:00:06<07:45, 1.63it/s, loss=0.0253, lr=2.85e-06, step=9243] Training: 92%|█████████▏| 9244/10000 [2:00:07<07:15, 1.73it/s, loss=0.0253, lr=2.85e-06, step=9243] Training: 92%|█████████▏| 9244/10000 [2:00:07<07:15, 1.73it/s, loss=0.0160, lr=2.85e-06, step=9244] Training: 92%|█████████▏| 9245/10000 [2:00:08<09:16, 1.36it/s, loss=0.0160, lr=2.85e-06, step=9244] Training: 92%|█████████▏| 9245/10000 [2:00:08<09:16, 1.36it/s, loss=0.0101, lr=2.85e-06, step=9245] Training: 92%|█████████▏| 9246/10000 [2:00:09<09:56, 1.26it/s, loss=0.0101, lr=2.85e-06, step=9245] Training: 92%|█████████▏| 9246/10000 [2:00:09<09:56, 1.26it/s, loss=0.0034, lr=2.85e-06, step=9246] Training: 92%|█████████▏| 9247/10000 [2:00:09<10:12, 1.23it/s, loss=0.0034, lr=2.85e-06, step=9246] Training: 92%|█████████▏| 9247/10000 [2:00:09<10:12, 1.23it/s, loss=0.0009, lr=2.85e-06, step=9247] Training: 92%|█████████▏| 9248/10000 [2:00:10<10:54, 1.15it/s, loss=0.0009, lr=2.85e-06, step=9247] Training: 92%|█████████▏| 9248/10000 [2:00:10<10:54, 1.15it/s, loss=0.0087, lr=2.85e-06, step=9248] Training: 92%|█████████▏| 9249/10000 [2:00:11<09:41, 1.29it/s, loss=0.0087, lr=2.85e-06, step=9248] Training: 92%|█████████▏| 9249/10000 [2:00:11<09:41, 1.29it/s, loss=0.0084, lr=2.85e-06, step=9249]18:06:18.440 [I] step=9250 loss=0.0038 smoothed_loss=0.0083 lr=2.85e-06 grad_norm=0.3417 step_time=0.5988s data_time=0.1211s it/s=1.389 eta_to_10000=539.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0045 grad_action_out_proj=0.0506 grad_shared_expert=0.2800 (10775:train_pytorch.py:850) + Training: 92%|█████████▎| 9250/10000 [2:00:12<08:44, 1.43it/s, loss=0.0084, lr=2.85e-06, step=9249] Training: 92%|█████████▎| 9250/10000 [2:00:12<08:44, 1.43it/s, loss=0.0038, lr=2.85e-06, step=9250] Training: 93%|█████████▎| 9251/10000 [2:00:12<08:36, 1.45it/s, loss=0.0038, lr=2.85e-06, step=9250] Training: 93%|█████████▎| 9251/10000 [2:00:12<08:36, 1.45it/s, loss=0.0062, lr=2.84e-06, step=9251] Training: 93%|█████████▎| 9252/10000 [2:00:13<08:59, 1.39it/s, loss=0.0062, lr=2.84e-06, step=9251] Training: 93%|█████████▎| 9252/10000 [2:00:13<08:59, 1.39it/s, loss=0.0057, lr=2.84e-06, step=9252] Training: 93%|█████████▎| 9253/10000 [2:00:14<08:19, 1.50it/s, loss=0.0057, lr=2.84e-06, step=9252] Training: 93%|█████████▎| 9253/10000 [2:00:14<08:19, 1.50it/s, loss=0.0057, lr=2.84e-06, step=9253] Training: 93%|█████████▎| 9254/10000 [2:00:14<09:06, 1.37it/s, loss=0.0057, lr=2.84e-06, step=9253] Training: 93%|█████████▎| 9254/10000 [2:00:14<09:06, 1.37it/s, loss=0.0030, lr=2.84e-06, step=9254] Training: 93%|█████████▎| 9255/10000 [2:00:15<10:21, 1.20it/s, loss=0.0030, lr=2.84e-06, step=9254] Training: 93%|█████████▎| 9255/10000 [2:00:15<10:21, 1.20it/s, loss=0.0039, lr=2.84e-06, step=9255] Training: 93%|█████████▎| 9256/10000 [2:00:16<10:19, 1.20it/s, loss=0.0039, lr=2.84e-06, step=9255] Training: 93%|█████████▎| 9256/10000 [2:00:16<10:19, 1.20it/s, loss=0.0027, lr=2.84e-06, step=9256] Training: 93%|█████████▎| 9257/10000 [2:00:17<10:14, 1.21it/s, loss=0.0027, lr=2.84e-06, step=9256] Training: 93%|█████████▎| 9257/10000 [2:00:17<10:14, 1.21it/s, loss=0.0119, lr=2.84e-06, step=9257] Training: 93%|█████████▎| 9258/10000 [2:00:18<09:47, 1.26it/s, loss=0.0119, lr=2.84e-06, step=9257] Training: 93%|█████████▎| 9258/10000 [2:00:18<09:47, 1.26it/s, loss=0.0310, lr=2.84e-06, step=9258] Training: 93%|█████████▎| 9259/10000 [2:00:19<10:17, 1.20it/s, loss=0.0310, lr=2.84e-06, step=9258] Training: 93%|█████████▎| 9259/10000 [2:00:19<10:17, 1.20it/s, loss=0.0341, lr=2.84e-06, step=9259]18:06:26.550 [I] step=9260 loss=0.0058 smoothed_loss=0.0112 lr=2.84e-06 grad_norm=0.4050 step_time=0.6352s data_time=0.1758s it/s=1.233 eta_to_10000=600.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0103 grad_action_out_proj=0.0901 grad_shared_expert=0.6081 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9260/10000 [2:00:20<10:24, 1.19it/s, loss=0.0341, lr=2.84e-06, step=9259] Training: 93%|█████████▎| 9260/10000 [2:00:20<10:24, 1.19it/s, loss=0.0058, lr=2.84e-06, step=9260] Training: 93%|█████████▎| 9261/10000 [2:00:20<10:20, 1.19it/s, loss=0.0058, lr=2.84e-06, step=9260] Training: 93%|█████████▎| 9261/10000 [2:00:20<10:20, 1.19it/s, loss=0.0172, lr=2.84e-06, step=9261] Training: 93%|█████████▎| 9262/10000 [2:00:22<11:10, 1.10it/s, loss=0.0172, lr=2.84e-06, step=9261] Training: 93%|█████████▎| 9262/10000 [2:00:22<11:10, 1.10it/s, loss=0.0054, lr=2.83e-06, step=9262] Training: 93%|█████████▎| 9263/10000 [2:00:22<10:07, 1.21it/s, loss=0.0054, lr=2.83e-06, step=9262] Training: 93%|█████████▎| 9263/10000 [2:00:22<10:07, 1.21it/s, loss=0.0022, lr=2.83e-06, step=9263] Training: 93%|█████████▎| 9264/10000 [2:00:23<08:57, 1.37it/s, loss=0.0022, lr=2.83e-06, step=9263] Training: 93%|█████████▎| 9264/10000 [2:00:23<08:57, 1.37it/s, loss=0.0114, lr=2.83e-06, step=9264] Training: 93%|█████████▎| 9265/10000 [2:00:23<08:09, 1.50it/s, loss=0.0114, lr=2.83e-06, step=9264] Training: 93%|█████████▎| 9265/10000 [2:00:23<08:09, 1.50it/s, loss=0.0065, lr=2.83e-06, step=9265] Training: 93%|█████████▎| 9266/10000 [2:00:24<08:32, 1.43it/s, loss=0.0065, lr=2.83e-06, step=9265] Training: 93%|█████████▎| 9266/10000 [2:00:24<08:32, 1.43it/s, loss=0.0033, lr=2.83e-06, step=9266] Training: 93%|█████████▎| 9267/10000 [2:00:25<08:06, 1.51it/s, loss=0.0033, lr=2.83e-06, step=9266] Training: 93%|█████████▎| 9267/10000 [2:00:25<08:06, 1.51it/s, loss=0.0289, lr=2.83e-06, step=9267] Training: 93%|█████████▎| 9268/10000 [2:00:25<08:08, 1.50it/s, loss=0.0289, lr=2.83e-06, step=9267] Training: 93%|█████████▎| 9268/10000 [2:00:25<08:08, 1.50it/s, loss=0.0017, lr=2.83e-06, step=9268] Training: 93%|█████████▎| 9269/10000 [2:00:26<08:35, 1.42it/s, loss=0.0017, lr=2.83e-06, step=9268] Training: 93%|█████████▎| 9269/10000 [2:00:26<08:35, 1.42it/s, loss=0.0016, lr=2.83e-06, step=9269]18:06:33.490 [I] step=9270 loss=0.0086 smoothed_loss=0.0094 lr=2.83e-06 grad_norm=0.4630 step_time=0.5735s data_time=0.1205s it/s=1.441 eta_to_10000=506.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0059 grad_action_out_proj=0.0560 grad_shared_expert=0.2782 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9270/10000 [2:00:27<08:02, 1.51it/s, loss=0.0016, lr=2.83e-06, step=9269] Training: 93%|█████████▎| 9270/10000 [2:00:27<08:02, 1.51it/s, loss=0.0086, lr=2.83e-06, step=9270] Training: 93%|█████████▎| 9271/10000 [2:00:27<07:31, 1.62it/s, loss=0.0086, lr=2.83e-06, step=9270] Training: 93%|█████████▎| 9271/10000 [2:00:27<07:31, 1.62it/s, loss=0.0071, lr=2.83e-06, step=9271] Training: 93%|█████████▎| 9272/10000 [2:00:28<07:58, 1.52it/s, loss=0.0071, lr=2.83e-06, step=9271] Training: 93%|█████████▎| 9272/10000 [2:00:28<07:58, 1.52it/s, loss=0.0205, lr=2.83e-06, step=9272] Training: 93%|█████████▎| 9273/10000 [2:00:28<07:20, 1.65it/s, loss=0.0205, lr=2.83e-06, step=9272] Training: 93%|█████████▎| 9273/10000 [2:00:28<07:20, 1.65it/s, loss=0.0096, lr=2.82e-06, step=9273] Training: 93%|█████████▎| 9274/10000 [2:00:29<08:10, 1.48it/s, loss=0.0096, lr=2.82e-06, step=9273] Training: 93%|█████████▎| 9274/10000 [2:00:29<08:10, 1.48it/s, loss=0.0060, lr=2.82e-06, step=9274] Training: 93%|█████████▎| 9275/10000 [2:00:30<08:07, 1.49it/s, loss=0.0060, lr=2.82e-06, step=9274] Training: 93%|█████████▎| 9275/10000 [2:00:30<08:07, 1.49it/s, loss=0.0042, lr=2.82e-06, step=9275] Training: 93%|█████████▎| 9276/10000 [2:00:30<07:31, 1.60it/s, loss=0.0042, lr=2.82e-06, step=9275] Training: 93%|█████████▎| 9276/10000 [2:00:30<07:31, 1.60it/s, loss=0.0060, lr=2.82e-06, step=9276] Training: 93%|█████████▎| 9277/10000 [2:00:31<08:01, 1.50it/s, loss=0.0060, lr=2.82e-06, step=9276] Training: 93%|█████████▎| 9277/10000 [2:00:31<08:01, 1.50it/s, loss=0.0025, lr=2.82e-06, step=9277] Training: 93%|█████████▎| 9278/10000 [2:00:32<07:35, 1.59it/s, loss=0.0025, lr=2.82e-06, step=9277] Training: 93%|█████████▎| 9278/10000 [2:00:32<07:35, 1.59it/s, loss=0.0051, lr=2.82e-06, step=9278] Training: 93%|█████████▎| 9279/10000 [2:00:32<07:46, 1.55it/s, loss=0.0051, lr=2.82e-06, step=9278] Training: 93%|█████████▎| 9279/10000 [2:00:32<07:46, 1.55it/s, loss=0.0114, lr=2.82e-06, step=9279]18:06:40.104 [I] step=9280 loss=0.0065 smoothed_loss=0.0081 lr=2.82e-06 grad_norm=0.4231 step_time=0.5601s data_time=0.1013s it/s=1.512 eta_to_10000=476.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0143 grad_action_out_proj=0.1096 grad_shared_expert=0.4456 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9280/10000 [2:00:33<08:29, 1.41it/s, loss=0.0114, lr=2.82e-06, step=9279] Training: 93%|█████████▎| 9280/10000 [2:00:33<08:29, 1.41it/s, loss=0.0065, lr=2.82e-06, step=9280] Training: 93%|█████████▎| 9281/10000 [2:00:34<10:12, 1.17it/s, loss=0.0065, lr=2.82e-06, step=9280] Training: 93%|█████████▎| 9281/10000 [2:00:34<10:12, 1.17it/s, loss=0.0126, lr=2.82e-06, step=9281] Training: 93%|█████████▎| 9282/10000 [2:00:35<09:38, 1.24it/s, loss=0.0126, lr=2.82e-06, step=9281] Training: 93%|█████████▎| 9282/10000 [2:00:35<09:38, 1.24it/s, loss=0.0064, lr=2.82e-06, step=9282] Training: 93%|█████████▎| 9283/10000 [2:00:36<09:12, 1.30it/s, loss=0.0064, lr=2.82e-06, step=9282] Training: 93%|█████████▎| 9283/10000 [2:00:36<09:12, 1.30it/s, loss=0.0065, lr=2.82e-06, step=9283] Training: 93%|█████████▎| 9284/10000 [2:00:36<08:21, 1.43it/s, loss=0.0065, lr=2.82e-06, step=9283] Training: 93%|█████████▎| 9284/10000 [2:00:36<08:21, 1.43it/s, loss=0.0060, lr=2.81e-06, step=9284] Training: 93%|█████████▎| 9285/10000 [2:00:37<08:34, 1.39it/s, loss=0.0060, lr=2.81e-06, step=9284] Training: 93%|█████████▎| 9285/10000 [2:00:37<08:34, 1.39it/s, loss=0.0215, lr=2.81e-06, step=9285] Training: 93%|█████████▎| 9286/10000 [2:00:38<07:46, 1.53it/s, loss=0.0215, lr=2.81e-06, step=9285] Training: 93%|█████████▎| 9286/10000 [2:00:38<07:46, 1.53it/s, loss=0.0052, lr=2.81e-06, step=9286] Training: 93%|█████████▎| 9287/10000 [2:00:38<07:42, 1.54it/s, loss=0.0052, lr=2.81e-06, step=9286] Training: 93%|█████████▎| 9287/10000 [2:00:38<07:42, 1.54it/s, loss=0.0030, lr=2.81e-06, step=9287] Training: 93%|█████████▎| 9288/10000 [2:00:39<09:19, 1.27it/s, loss=0.0030, lr=2.81e-06, step=9287] Training: 93%|█████████▎| 9288/10000 [2:00:39<09:19, 1.27it/s, loss=0.0017, lr=2.81e-06, step=9288] Training: 93%|█████████▎| 9289/10000 [2:00:40<08:25, 1.41it/s, loss=0.0017, lr=2.81e-06, step=9288] Training: 93%|█████████▎| 9289/10000 [2:00:40<08:25, 1.41it/s, loss=0.0060, lr=2.81e-06, step=9289]18:06:47.627 [I] step=9290 loss=0.0028 smoothed_loss=0.0070 lr=2.81e-06 grad_norm=0.3680 step_time=0.6147s data_time=0.1376s it/s=1.330 eta_to_10000=534.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0037 grad_action_out_proj=0.0430 grad_shared_expert=0.1828 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9290/10000 [2:00:41<08:57, 1.32it/s, loss=0.0060, lr=2.81e-06, step=9289] Training: 93%|█████████▎| 9290/10000 [2:00:41<08:57, 1.32it/s, loss=0.0028, lr=2.81e-06, step=9290] Training: 93%|█████████▎| 9291/10000 [2:00:41<08:01, 1.47it/s, loss=0.0028, lr=2.81e-06, step=9290] Training: 93%|█████████▎| 9291/10000 [2:00:41<08:01, 1.47it/s, loss=0.0103, lr=2.81e-06, step=9291] Training: 93%|█████████▎| 9292/10000 [2:00:42<07:26, 1.59it/s, loss=0.0103, lr=2.81e-06, step=9291] Training: 93%|█████████▎| 9292/10000 [2:00:42<07:26, 1.59it/s, loss=0.0143, lr=2.81e-06, step=9292] Training: 93%|█████████▎| 9293/10000 [2:00:42<07:56, 1.49it/s, loss=0.0143, lr=2.81e-06, step=9292] Training: 93%|█████████▎| 9293/10000 [2:00:42<07:56, 1.49it/s, loss=0.0027, lr=2.81e-06, step=9293] Training: 93%|█████████▎| 9294/10000 [2:00:43<07:16, 1.62it/s, loss=0.0027, lr=2.81e-06, step=9293] Training: 93%|█████████▎| 9294/10000 [2:00:43<07:16, 1.62it/s, loss=0.0092, lr=2.81e-06, step=9294] Training: 93%|█████████▎| 9295/10000 [2:00:44<08:16, 1.42it/s, loss=0.0092, lr=2.81e-06, step=9294] Training: 93%|█████████▎| 9295/10000 [2:00:44<08:16, 1.42it/s, loss=0.0064, lr=2.81e-06, step=9295] Training: 93%|█████████▎| 9296/10000 [2:00:45<08:20, 1.41it/s, loss=0.0064, lr=2.81e-06, step=9295] Training: 93%|█████████▎| 9296/10000 [2:00:45<08:20, 1.41it/s, loss=0.0030, lr=2.80e-06, step=9296] Training: 93%|█████████▎| 9297/10000 [2:00:45<08:35, 1.36it/s, loss=0.0030, lr=2.80e-06, step=9296] Training: 93%|█████████▎| 9297/10000 [2:00:45<08:35, 1.36it/s, loss=0.0021, lr=2.80e-06, step=9297] Training: 93%|█████████▎| 9298/10000 [2:00:46<08:32, 1.37it/s, loss=0.0021, lr=2.80e-06, step=9297] Training: 93%|█████████▎| 9298/10000 [2:00:46<08:32, 1.37it/s, loss=0.0877, lr=2.80e-06, step=9298] Training: 93%|█████████▎| 9299/10000 [2:00:47<07:46, 1.50it/s, loss=0.0877, lr=2.80e-06, step=9298] Training: 93%|█████████▎| 9299/10000 [2:00:47<07:46, 1.50it/s, loss=0.0056, lr=2.80e-06, step=9299]18:06:54.211 [I] step=9300 loss=0.0079 smoothed_loss=0.0132 lr=2.80e-06 grad_norm=0.4053 step_time=0.5453s data_time=0.1131s it/s=1.519 eta_to_10000=460.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0069 grad_action_out_proj=0.0858 grad_shared_expert=0.3299 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9300/10000 [2:00:47<07:43, 1.51it/s, loss=0.0056, lr=2.80e-06, step=9299] Training: 93%|█████████▎| 9300/10000 [2:00:47<07:43, 1.51it/s, loss=0.0079, lr=2.80e-06, step=9300] Training: 93%|█████████▎| 9301/10000 [2:00:48<07:59, 1.46it/s, loss=0.0079, lr=2.80e-06, step=9300] Training: 93%|█████████▎| 9301/10000 [2:00:48<07:59, 1.46it/s, loss=0.0117, lr=2.80e-06, step=9301] Training: 93%|█████████▎| 9302/10000 [2:00:49<08:34, 1.36it/s, loss=0.0117, lr=2.80e-06, step=9301] Training: 93%|█████████▎| 9302/10000 [2:00:49<08:34, 1.36it/s, loss=0.0092, lr=2.80e-06, step=9302] Training: 93%|█████████▎| 9303/10000 [2:00:49<07:47, 1.49it/s, loss=0.0092, lr=2.80e-06, step=9302] Training: 93%|█████████▎| 9303/10000 [2:00:49<07:47, 1.49it/s, loss=0.0052, lr=2.80e-06, step=9303] Training: 93%|█████████▎| 9304/10000 [2:00:50<08:12, 1.41it/s, loss=0.0052, lr=2.80e-06, step=9303] Training: 93%|█████████▎| 9304/10000 [2:00:50<08:12, 1.41it/s, loss=0.0245, lr=2.80e-06, step=9304] Training: 93%|█████████▎| 9305/10000 [2:00:51<07:32, 1.54it/s, loss=0.0245, lr=2.80e-06, step=9304] Training: 93%|█████████▎| 9305/10000 [2:00:51<07:32, 1.54it/s, loss=0.0099, lr=2.80e-06, step=9305] Training: 93%|█████████▎| 9306/10000 [2:00:52<08:02, 1.44it/s, loss=0.0099, lr=2.80e-06, step=9305] Training: 93%|█████████▎| 9306/10000 [2:00:52<08:02, 1.44it/s, loss=0.0039, lr=2.80e-06, step=9306] Training: 93%|█████████▎| 9307/10000 [2:00:52<07:24, 1.56it/s, loss=0.0039, lr=2.80e-06, step=9306] Training: 93%|█████████▎| 9307/10000 [2:00:52<07:24, 1.56it/s, loss=0.0015, lr=2.79e-06, step=9307] Training: 93%|█████████▎| 9308/10000 [2:00:53<06:54, 1.67it/s, loss=0.0015, lr=2.79e-06, step=9307] Training: 93%|█████████▎| 9308/10000 [2:00:53<06:54, 1.67it/s, loss=0.0052, lr=2.79e-06, step=9308] Training: 93%|█████████▎| 9309/10000 [2:00:53<08:11, 1.41it/s, loss=0.0052, lr=2.79e-06, step=9308] Training: 93%|█████████▎| 9309/10000 [2:00:53<08:11, 1.41it/s, loss=0.0069, lr=2.79e-06, step=9309]18:07:00.951 [I] step=9310 loss=0.0088 smoothed_loss=0.0099 lr=2.80e-06 grad_norm=0.5043 step_time=0.5613s data_time=0.1127s it/s=1.484 eta_to_10000=465.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0224 grad_action_out_proj=0.1585 grad_shared_expert=0.4269 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9310/10000 [2:00:54<07:32, 1.53it/s, loss=0.0069, lr=2.79e-06, step=9309] Training: 93%|█████████▎| 9310/10000 [2:00:54<07:32, 1.53it/s, loss=0.0088, lr=2.79e-06, step=9310] Training: 93%|█████████▎| 9311/10000 [2:00:55<06:58, 1.65it/s, loss=0.0088, lr=2.79e-06, step=9310] Training: 93%|█████████▎| 9311/10000 [2:00:55<06:58, 1.65it/s, loss=0.0006, lr=2.79e-06, step=9311] Training: 93%|█████████▎| 9312/10000 [2:00:55<07:40, 1.49it/s, loss=0.0006, lr=2.79e-06, step=9311] Training: 93%|█████████▎| 9312/10000 [2:00:55<07:40, 1.49it/s, loss=0.0086, lr=2.79e-06, step=9312] Training: 93%|█████████▎| 9313/10000 [2:00:56<07:51, 1.46it/s, loss=0.0086, lr=2.79e-06, step=9312] Training: 93%|█████████▎| 9313/10000 [2:00:56<07:51, 1.46it/s, loss=0.0074, lr=2.79e-06, step=9313] Training: 93%|█████████▎| 9314/10000 [2:00:57<08:04, 1.42it/s, loss=0.0074, lr=2.79e-06, step=9313] Training: 93%|█████████▎| 9314/10000 [2:00:57<08:04, 1.42it/s, loss=0.0109, lr=2.79e-06, step=9314] Training: 93%|█████████▎| 9315/10000 [2:00:58<08:26, 1.35it/s, loss=0.0109, lr=2.79e-06, step=9314] Training: 93%|█████████▎| 9315/10000 [2:00:58<08:26, 1.35it/s, loss=0.0120, lr=2.79e-06, step=9315] Training: 93%|█████████▎| 9316/10000 [2:00:58<07:56, 1.44it/s, loss=0.0120, lr=2.79e-06, step=9315] Training: 93%|█████████▎| 9316/10000 [2:00:58<07:56, 1.44it/s, loss=0.0263, lr=2.79e-06, step=9316] Training: 93%|█████████▎| 9317/10000 [2:00:59<08:58, 1.27it/s, loss=0.0263, lr=2.79e-06, step=9316] Training: 93%|█████████▎| 9317/10000 [2:00:59<08:58, 1.27it/s, loss=0.0420, lr=2.79e-06, step=9317] Training: 93%|█████████▎| 9318/10000 [2:01:00<08:57, 1.27it/s, loss=0.0420, lr=2.79e-06, step=9317] Training: 93%|█████████▎| 9318/10000 [2:01:00<08:57, 1.27it/s, loss=0.0014, lr=2.79e-06, step=9318] Training: 93%|█████████▎| 9319/10000 [2:01:01<08:57, 1.27it/s, loss=0.0014, lr=2.79e-06, step=9318] Training: 93%|█████████▎| 9319/10000 [2:01:01<08:57, 1.27it/s, loss=0.0058, lr=2.78e-06, step=9319]18:07:08.798 [I] step=9320 loss=0.0029 smoothed_loss=0.0112 lr=2.79e-06 grad_norm=0.4975 step_time=0.6176s data_time=0.1671s it/s=1.274 eta_to_10000=533.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0231 grad_action_out_proj=0.1905 grad_shared_expert=0.5606 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9320/10000 [2:01:02<09:52, 1.15it/s, loss=0.0058, lr=2.78e-06, step=9319] Training: 93%|█████████▎| 9320/10000 [2:01:02<09:52, 1.15it/s, loss=0.0029, lr=2.78e-06, step=9320] Training: 93%|█████████▎| 9321/10000 [2:01:03<09:22, 1.21it/s, loss=0.0029, lr=2.78e-06, step=9320] Training: 93%|█████████▎| 9321/10000 [2:01:03<09:22, 1.21it/s, loss=0.0030, lr=2.78e-06, step=9321] Training: 93%|█████████▎| 9322/10000 [2:01:03<08:21, 1.35it/s, loss=0.0030, lr=2.78e-06, step=9321] Training: 93%|█████████▎| 9322/10000 [2:01:03<08:21, 1.35it/s, loss=0.0049, lr=2.78e-06, step=9322] Training: 93%|█████████▎| 9323/10000 [2:01:04<08:40, 1.30it/s, loss=0.0049, lr=2.78e-06, step=9322] Training: 93%|█████████▎| 9323/10000 [2:01:04<08:40, 1.30it/s, loss=0.0398, lr=2.78e-06, step=9323] Training: 93%|█████████▎| 9324/10000 [2:01:05<09:06, 1.24it/s, loss=0.0398, lr=2.78e-06, step=9323] Training: 93%|█████████▎| 9324/10000 [2:01:05<09:06, 1.24it/s, loss=0.0166, lr=2.78e-06, step=9324] Training: 93%|█████████▎| 9325/10000 [2:01:06<09:48, 1.15it/s, loss=0.0166, lr=2.78e-06, step=9324] Training: 93%|█████████▎| 9325/10000 [2:01:06<09:48, 1.15it/s, loss=0.0026, lr=2.78e-06, step=9325] Training: 93%|█████████▎| 9326/10000 [2:01:07<09:34, 1.17it/s, loss=0.0026, lr=2.78e-06, step=9325] Training: 93%|█████████▎| 9326/10000 [2:01:07<09:34, 1.17it/s, loss=0.0080, lr=2.78e-06, step=9326] Training: 93%|█████████▎| 9327/10000 [2:01:07<08:52, 1.26it/s, loss=0.0080, lr=2.78e-06, step=9326] Training: 93%|█████████▎| 9327/10000 [2:01:07<08:52, 1.26it/s, loss=0.0081, lr=2.78e-06, step=9327] Training: 93%|█████████▎| 9328/10000 [2:01:08<07:52, 1.42it/s, loss=0.0081, lr=2.78e-06, step=9327] Training: 93%|█████████▎| 9328/10000 [2:01:08<07:52, 1.42it/s, loss=0.0038, lr=2.78e-06, step=9328] Training: 93%|█████████▎| 9329/10000 [2:01:09<08:08, 1.37it/s, loss=0.0038, lr=2.78e-06, step=9328] Training: 93%|█████████▎| 9329/10000 [2:01:09<08:08, 1.37it/s, loss=0.0384, lr=2.78e-06, step=9329]18:07:16.493 [I] step=9330 loss=0.0086 smoothed_loss=0.0129 lr=2.78e-06 grad_norm=0.4142 step_time=0.5877s data_time=0.1817s it/s=1.300 eta_to_10000=515.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0320 grad_action_out_proj=0.1540 grad_shared_expert=0.5304 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9330/10000 [2:01:10<08:50, 1.26it/s, loss=0.0384, lr=2.78e-06, step=9329] Training: 93%|█████████▎| 9330/10000 [2:01:10<08:50, 1.26it/s, loss=0.0086, lr=2.78e-06, step=9330] Training: 93%|█████████▎| 9331/10000 [2:01:10<08:44, 1.27it/s, loss=0.0086, lr=2.78e-06, step=9330] Training: 93%|█████████▎| 9331/10000 [2:01:10<08:44, 1.27it/s, loss=0.0023, lr=2.78e-06, step=9331] Training: 93%|█████████▎| 9332/10000 [2:01:11<08:29, 1.31it/s, loss=0.0023, lr=2.78e-06, step=9331] Training: 93%|█████████▎| 9332/10000 [2:01:11<08:29, 1.31it/s, loss=0.0027, lr=2.77e-06, step=9332] Training: 93%|█████████▎| 9333/10000 [2:01:12<07:37, 1.46it/s, loss=0.0027, lr=2.77e-06, step=9332] Training: 93%|█████████▎| 9333/10000 [2:01:12<07:37, 1.46it/s, loss=0.0060, lr=2.77e-06, step=9333] Training: 93%|█████████▎| 9334/10000 [2:01:13<08:40, 1.28it/s, loss=0.0060, lr=2.77e-06, step=9333] Training: 93%|█████████▎| 9334/10000 [2:01:13<08:40, 1.28it/s, loss=0.0056, lr=2.77e-06, step=9334] Training: 93%|█████████▎| 9335/10000 [2:01:13<07:44, 1.43it/s, loss=0.0056, lr=2.77e-06, step=9334] Training: 93%|█████████▎| 9335/10000 [2:01:13<07:44, 1.43it/s, loss=0.0016, lr=2.77e-06, step=9335] Training: 93%|█████████▎| 9336/10000 [2:01:14<08:11, 1.35it/s, loss=0.0016, lr=2.77e-06, step=9335] Training: 93%|█████████▎| 9336/10000 [2:01:14<08:11, 1.35it/s, loss=0.0070, lr=2.77e-06, step=9336] Training: 93%|█████████▎| 9337/10000 [2:01:15<08:03, 1.37it/s, loss=0.0070, lr=2.77e-06, step=9336] Training: 93%|█████████▎| 9337/10000 [2:01:15<08:03, 1.37it/s, loss=0.0148, lr=2.77e-06, step=9337] Training: 93%|█████████▎| 9338/10000 [2:01:16<08:42, 1.27it/s, loss=0.0148, lr=2.77e-06, step=9337] Training: 93%|█████████▎| 9338/10000 [2:01:16<08:42, 1.27it/s, loss=0.0122, lr=2.77e-06, step=9338] Training: 93%|█████████▎| 9339/10000 [2:01:16<07:47, 1.42it/s, loss=0.0122, lr=2.77e-06, step=9338] Training: 93%|█████████▎| 9339/10000 [2:01:16<07:47, 1.42it/s, loss=0.0036, lr=2.77e-06, step=9339]18:07:23.826 [I] step=9340 loss=0.0049 smoothed_loss=0.0087 lr=2.77e-06 grad_norm=0.4020 step_time=0.5972s data_time=0.1362s it/s=1.364 eta_to_10000=484.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0053 grad_action_out_proj=0.0616 grad_shared_expert=0.2974 (10775:train_pytorch.py:850) + Training: 93%|█████████▎| 9340/10000 [2:01:17<08:15, 1.33it/s, loss=0.0036, lr=2.77e-06, step=9339] Training: 93%|█████████▎| 9340/10000 [2:01:17<08:15, 1.33it/s, loss=0.0049, lr=2.77e-06, step=9340] Training: 93%|█████████▎| 9341/10000 [2:01:18<08:28, 1.30it/s, loss=0.0049, lr=2.77e-06, step=9340] Training: 93%|█████████▎| 9341/10000 [2:01:18<08:28, 1.30it/s, loss=0.0050, lr=2.77e-06, step=9341] Training: 93%|█████████▎| 9342/10000 [2:01:18<07:58, 1.37it/s, loss=0.0050, lr=2.77e-06, step=9341] Training: 93%|█████████▎| 9342/10000 [2:01:18<07:58, 1.37it/s, loss=0.0178, lr=2.77e-06, step=9342] Training: 93%|█████████▎| 9343/10000 [2:01:19<07:27, 1.47it/s, loss=0.0178, lr=2.77e-06, step=9342] Training: 93%|█████████▎| 9343/10000 [2:01:19<07:27, 1.47it/s, loss=0.0049, lr=2.77e-06, step=9343] Training: 93%|█████████▎| 9344/10000 [2:01:19<07:06, 1.54it/s, loss=0.0049, lr=2.77e-06, step=9343] Training: 93%|█████████▎| 9344/10000 [2:01:19<07:06, 1.54it/s, loss=0.0060, lr=2.76e-06, step=9344] Training: 93%|█████████▎| 9345/10000 [2:01:20<07:39, 1.43it/s, loss=0.0060, lr=2.76e-06, step=9344] Training: 93%|█████████▎| 9345/10000 [2:01:20<07:39, 1.43it/s, loss=0.0072, lr=2.76e-06, step=9345] Training: 93%|█████████▎| 9346/10000 [2:01:21<07:58, 1.37it/s, loss=0.0072, lr=2.76e-06, step=9345] Training: 93%|█████████▎| 9346/10000 [2:01:21<07:58, 1.37it/s, loss=0.0062, lr=2.76e-06, step=9346] Training: 93%|█████████▎| 9347/10000 [2:01:22<07:16, 1.50it/s, loss=0.0062, lr=2.76e-06, step=9346] Training: 93%|█████████▎| 9347/10000 [2:01:22<07:16, 1.50it/s, loss=0.0038, lr=2.76e-06, step=9347] Training: 93%|█████████▎| 9348/10000 [2:01:22<07:22, 1.47it/s, loss=0.0038, lr=2.76e-06, step=9347] Training: 93%|█████████▎| 9348/10000 [2:01:22<07:22, 1.47it/s, loss=0.0088, lr=2.76e-06, step=9348] Training: 93%|█████████▎| 9349/10000 [2:01:23<06:49, 1.59it/s, loss=0.0088, lr=2.76e-06, step=9348] Training: 93%|█████████▎| 9349/10000 [2:01:23<06:49, 1.59it/s, loss=0.0012, lr=2.76e-06, step=9349]18:07:30.342 [I] step=9350 loss=0.0103 smoothed_loss=0.0075 lr=2.76e-06 grad_norm=0.3554 step_time=0.5443s data_time=0.1073s it/s=1.535 eta_to_10000=423.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.0857 grad_shared_expert=0.4649 (10775:train_pytorch.py:850) + Training: 94%|█████████▎| 9350/10000 [2:01:23<06:35, 1.65it/s, loss=0.0012, lr=2.76e-06, step=9349] Training: 94%|█████████▎| 9350/10000 [2:01:23<06:35, 1.65it/s, loss=0.0103, lr=2.76e-06, step=9350] Training: 94%|█████████▎| 9351/10000 [2:01:24<06:46, 1.60it/s, loss=0.0103, lr=2.76e-06, step=9350] Training: 94%|█████████▎| 9351/10000 [2:01:24<06:46, 1.60it/s, loss=0.0017, lr=2.76e-06, step=9351] Training: 94%|█████████▎| 9352/10000 [2:01:25<07:36, 1.42it/s, loss=0.0017, lr=2.76e-06, step=9351] Training: 94%|█████████▎| 9352/10000 [2:01:25<07:36, 1.42it/s, loss=0.0017, lr=2.76e-06, step=9352] Training: 94%|█████████▎| 9353/10000 [2:01:25<06:56, 1.55it/s, loss=0.0017, lr=2.76e-06, step=9352] Training: 94%|█████████▎| 9353/10000 [2:01:25<06:56, 1.55it/s, loss=0.0055, lr=2.76e-06, step=9353] Training: 94%|█████████▎| 9354/10000 [2:01:26<07:35, 1.42it/s, loss=0.0055, lr=2.76e-06, step=9353] Training: 94%|█████████▎| 9354/10000 [2:01:26<07:35, 1.42it/s, loss=0.0067, lr=2.76e-06, step=9354] Training: 94%|█████████▎| 9355/10000 [2:01:27<07:28, 1.44it/s, loss=0.0067, lr=2.76e-06, step=9354] Training: 94%|█████████▎| 9355/10000 [2:01:27<07:28, 1.44it/s, loss=0.0174, lr=2.76e-06, step=9355] Training: 94%|█████████▎| 9356/10000 [2:01:28<07:10, 1.50it/s, loss=0.0174, lr=2.76e-06, step=9355] Training: 94%|█████████▎| 9356/10000 [2:01:28<07:10, 1.50it/s, loss=0.0102, lr=2.75e-06, step=9356] Training: 94%|█████████▎| 9357/10000 [2:01:28<06:42, 1.60it/s, loss=0.0102, lr=2.75e-06, step=9356] Training: 94%|█████████▎| 9357/10000 [2:01:28<06:42, 1.60it/s, loss=0.0067, lr=2.75e-06, step=9357] Training: 94%|█████████▎| 9358/10000 [2:01:29<07:08, 1.50it/s, loss=0.0067, lr=2.75e-06, step=9357] Training: 94%|█████████▎| 9358/10000 [2:01:29<07:08, 1.50it/s, loss=0.0025, lr=2.75e-06, step=9358] Training: 94%|█████████▎| 9359/10000 [2:01:29<06:34, 1.62it/s, loss=0.0025, lr=2.75e-06, step=9358] Training: 94%|█████████▎| 9359/10000 [2:01:29<06:34, 1.62it/s, loss=0.0146, lr=2.75e-06, step=9359]18:07:37.144 [I] step=9360 loss=0.0076 smoothed_loss=0.0078 lr=2.76e-06 grad_norm=0.3848 step_time=0.5562s data_time=0.1240s it/s=1.470 eta_to_10000=435.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0091 grad_action_out_proj=0.0708 grad_shared_expert=0.2960 (10775:train_pytorch.py:850) + Training: 94%|█████████▎| 9360/10000 [2:01:30<07:14, 1.47it/s, loss=0.0146, lr=2.75e-06, step=9359] Training: 94%|█████████▎| 9360/10000 [2:01:30<07:14, 1.47it/s, loss=0.0076, lr=2.75e-06, step=9360] Training: 94%|█████████▎| 9361/10000 [2:01:31<07:45, 1.37it/s, loss=0.0076, lr=2.75e-06, step=9360] Training: 94%|█████████▎| 9361/10000 [2:01:31<07:45, 1.37it/s, loss=0.0254, lr=2.75e-06, step=9361] Training: 94%|█████████▎| 9362/10000 [2:01:32<08:25, 1.26it/s, loss=0.0254, lr=2.75e-06, step=9361] Training: 94%|█████████▎| 9362/10000 [2:01:32<08:25, 1.26it/s, loss=0.0134, lr=2.75e-06, step=9362] Training: 94%|█████████▎| 9363/10000 [2:01:33<08:31, 1.25it/s, loss=0.0134, lr=2.75e-06, step=9362] Training: 94%|█████████▎| 9363/10000 [2:01:33<08:31, 1.25it/s, loss=0.0035, lr=2.75e-06, step=9363] Training: 94%|█████████▎| 9364/10000 [2:01:33<07:47, 1.36it/s, loss=0.0035, lr=2.75e-06, step=9363] Training: 94%|█████████▎| 9364/10000 [2:01:33<07:47, 1.36it/s, loss=0.0026, lr=2.75e-06, step=9364] Training: 94%|█████████▎| 9365/10000 [2:01:34<07:43, 1.37it/s, loss=0.0026, lr=2.75e-06, step=9364] Training: 94%|█████████▎| 9365/10000 [2:01:34<07:43, 1.37it/s, loss=0.0014, lr=2.75e-06, step=9365] Training: 94%|█████████▎| 9366/10000 [2:01:35<07:52, 1.34it/s, loss=0.0014, lr=2.75e-06, step=9365] Training: 94%|█████████▎| 9366/10000 [2:01:35<07:52, 1.34it/s, loss=0.0122, lr=2.75e-06, step=9366] Training: 94%|█████████▎| 9367/10000 [2:01:36<07:57, 1.33it/s, loss=0.0122, lr=2.75e-06, step=9366] Training: 94%|█████████▎| 9367/10000 [2:01:36<07:57, 1.33it/s, loss=0.0059, lr=2.75e-06, step=9367] Training: 94%|█████████▎| 9368/10000 [2:01:36<07:10, 1.47it/s, loss=0.0059, lr=2.75e-06, step=9367] Training: 94%|█████████▎| 9368/10000 [2:01:36<07:10, 1.47it/s, loss=0.0076, lr=2.75e-06, step=9368] Training: 94%|█████████▎| 9369/10000 [2:01:37<06:55, 1.52it/s, loss=0.0076, lr=2.75e-06, step=9368] Training: 94%|█████████▎| 9369/10000 [2:01:37<06:55, 1.52it/s, loss=0.0007, lr=2.74e-06, step=9369]18:07:44.442 [I] step=9370 loss=0.0060 smoothed_loss=0.0072 lr=2.75e-06 grad_norm=0.3792 step_time=0.5878s data_time=0.1420s it/s=1.371 eta_to_10000=459.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0040 grad_action_out_proj=0.0553 grad_shared_expert=0.2013 (10775:train_pytorch.py:850) + Training: 94%|█████████▎| 9370/10000 [2:01:38<07:06, 1.48it/s, loss=0.0007, lr=2.74e-06, step=9369] Training: 94%|█████████▎| 9370/10000 [2:01:38<07:06, 1.48it/s, loss=0.0060, lr=2.74e-06, step=9370] Training: 94%|█████████▎| 9371/10000 [2:01:38<06:30, 1.61it/s, loss=0.0060, lr=2.74e-06, step=9370] Training: 94%|█████████▎| 9371/10000 [2:01:38<06:30, 1.61it/s, loss=0.0012, lr=2.74e-06, step=9371] Training: 94%|█████████▎| 9372/10000 [2:01:38<06:05, 1.72it/s, loss=0.0012, lr=2.74e-06, step=9371] Training: 94%|█████████▎| 9372/10000 [2:01:38<06:05, 1.72it/s, loss=0.0281, lr=2.74e-06, step=9372] Training: 94%|█████████▎| 9373/10000 [2:01:39<06:45, 1.55it/s, loss=0.0281, lr=2.74e-06, step=9372] Training: 94%|█████████▎| 9373/10000 [2:01:39<06:45, 1.55it/s, loss=0.0027, lr=2.74e-06, step=9373] Training: 94%|█████████▎| 9374/10000 [2:01:40<08:04, 1.29it/s, loss=0.0027, lr=2.74e-06, step=9373] Training: 94%|█████████▎| 9374/10000 [2:01:40<08:04, 1.29it/s, loss=0.0112, lr=2.74e-06, step=9374] Training: 94%|█████████▍| 9375/10000 [2:01:41<07:20, 1.42it/s, loss=0.0112, lr=2.74e-06, step=9374] Training: 94%|█████████▍| 9375/10000 [2:01:41<07:20, 1.42it/s, loss=0.0052, lr=2.74e-06, step=9375] Training: 94%|█████████▍| 9376/10000 [2:01:42<07:00, 1.48it/s, loss=0.0052, lr=2.74e-06, step=9375] Training: 94%|█████████▍| 9376/10000 [2:01:42<07:00, 1.48it/s, loss=0.0025, lr=2.74e-06, step=9376] Training: 94%|█████████▍| 9377/10000 [2:01:42<07:05, 1.47it/s, loss=0.0025, lr=2.74e-06, step=9376] Training: 94%|█████████▍| 9377/10000 [2:01:42<07:05, 1.47it/s, loss=0.0028, lr=2.74e-06, step=9377] Training: 94%|█████████▍| 9378/10000 [2:01:43<06:43, 1.54it/s, loss=0.0028, lr=2.74e-06, step=9377] Training: 94%|█████████▍| 9378/10000 [2:01:43<06:43, 1.54it/s, loss=0.0012, lr=2.74e-06, step=9378] Training: 94%|█████████▍| 9379/10000 [2:01:43<06:22, 1.62it/s, loss=0.0012, lr=2.74e-06, step=9378] Training: 94%|█████████▍| 9379/10000 [2:01:43<06:22, 1.62it/s, loss=0.0018, lr=2.74e-06, step=9379]18:07:51.129 [I] step=9380 loss=0.0016 smoothed_loss=0.0056 lr=2.74e-06 grad_norm=0.4211 step_time=0.5598s data_time=0.1090s it/s=1.496 eta_to_10000=414.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0036 grad_action_out_proj=0.0479 grad_shared_expert=0.1158 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9380/10000 [2:01:44<07:10, 1.44it/s, loss=0.0018, lr=2.74e-06, step=9379] Training: 94%|█████████▍| 9380/10000 [2:01:44<07:10, 1.44it/s, loss=0.0016, lr=2.74e-06, step=9380] Training: 94%|█████████▍| 9381/10000 [2:01:45<06:50, 1.51it/s, loss=0.0016, lr=2.74e-06, step=9380] Training: 94%|█████████▍| 9381/10000 [2:01:45<06:50, 1.51it/s, loss=0.0045, lr=2.74e-06, step=9381] Training: 94%|█████████▍| 9382/10000 [2:01:46<08:00, 1.29it/s, loss=0.0045, lr=2.74e-06, step=9381] Training: 94%|█████████▍| 9382/10000 [2:01:46<08:00, 1.29it/s, loss=0.0048, lr=2.73e-06, step=9382] Training: 94%|█████████▍| 9383/10000 [2:01:46<07:30, 1.37it/s, loss=0.0048, lr=2.73e-06, step=9382] Training: 94%|█████████▍| 9383/10000 [2:01:46<07:30, 1.37it/s, loss=0.0113, lr=2.73e-06, step=9383] Training: 94%|█████████▍| 9384/10000 [2:01:47<07:41, 1.33it/s, loss=0.0113, lr=2.73e-06, step=9383] Training: 94%|█████████▍| 9384/10000 [2:01:47<07:41, 1.33it/s, loss=0.0125, lr=2.73e-06, step=9384] Training: 94%|█████████▍| 9385/10000 [2:01:48<08:20, 1.23it/s, loss=0.0125, lr=2.73e-06, step=9384] Training: 94%|█████████▍| 9385/10000 [2:01:48<08:20, 1.23it/s, loss=0.0035, lr=2.73e-06, step=9385] Training: 94%|█████████▍| 9386/10000 [2:01:49<07:20, 1.39it/s, loss=0.0035, lr=2.73e-06, step=9385] Training: 94%|█████████▍| 9386/10000 [2:01:49<07:20, 1.39it/s, loss=0.0036, lr=2.73e-06, step=9386] Training: 94%|█████████▍| 9387/10000 [2:01:49<07:32, 1.35it/s, loss=0.0036, lr=2.73e-06, step=9386] Training: 94%|█████████▍| 9387/10000 [2:01:49<07:32, 1.35it/s, loss=0.0073, lr=2.73e-06, step=9387] Training: 94%|█████████▍| 9388/10000 [2:01:50<06:51, 1.49it/s, loss=0.0073, lr=2.73e-06, step=9387] Training: 94%|█████████▍| 9388/10000 [2:01:50<06:51, 1.49it/s, loss=0.0097, lr=2.73e-06, step=9388] Training: 94%|█████████▍| 9389/10000 [2:01:51<08:52, 1.15it/s, loss=0.0097, lr=2.73e-06, step=9388] Training: 94%|█████████▍| 9389/10000 [2:01:51<08:52, 1.15it/s, loss=0.0024, lr=2.73e-06, step=9389]18:07:58.830 [I] step=9390 loss=0.0237 smoothed_loss=0.0079 lr=2.73e-06 grad_norm=0.4958 step_time=0.6069s data_time=0.1632s it/s=1.299 eta_to_10000=469.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0201 grad_action_out_proj=0.1336 grad_shared_expert=0.7281 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9390/10000 [2:01:52<07:53, 1.29it/s, loss=0.0024, lr=2.73e-06, step=9389] Training: 94%|█████████▍| 9390/10000 [2:01:52<07:53, 1.29it/s, loss=0.0237, lr=2.73e-06, step=9390] Training: 94%|█████████▍| 9391/10000 [2:01:52<07:00, 1.45it/s, loss=0.0237, lr=2.73e-06, step=9390] Training: 94%|█████████▍| 9391/10000 [2:01:52<07:00, 1.45it/s, loss=0.0031, lr=2.73e-06, step=9391] Training: 94%|█████████▍| 9392/10000 [2:01:53<07:54, 1.28it/s, loss=0.0031, lr=2.73e-06, step=9391] Training: 94%|█████████▍| 9392/10000 [2:01:53<07:54, 1.28it/s, loss=0.0107, lr=2.73e-06, step=9392] Training: 94%|█████████▍| 9393/10000 [2:01:54<07:39, 1.32it/s, loss=0.0107, lr=2.73e-06, step=9392] Training: 94%|█████████▍| 9393/10000 [2:01:54<07:39, 1.32it/s, loss=0.0190, lr=2.73e-06, step=9393] Training: 94%|█████████▍| 9394/10000 [2:01:55<07:02, 1.43it/s, loss=0.0190, lr=2.73e-06, step=9393] Training: 94%|█████████▍| 9394/10000 [2:01:55<07:02, 1.43it/s, loss=0.0069, lr=2.73e-06, step=9394] Training: 94%|█████████▍| 9395/10000 [2:01:55<07:29, 1.35it/s, loss=0.0069, lr=2.73e-06, step=9394] Training: 94%|█████████▍| 9395/10000 [2:01:55<07:29, 1.35it/s, loss=0.0026, lr=2.73e-06, step=9395] Training: 94%|█████████▍| 9396/10000 [2:01:56<07:23, 1.36it/s, loss=0.0026, lr=2.73e-06, step=9395] Training: 94%|█████████▍| 9396/10000 [2:01:56<07:23, 1.36it/s, loss=0.0081, lr=2.72e-06, step=9396] Training: 94%|█████████▍| 9397/10000 [2:01:57<07:25, 1.35it/s, loss=0.0081, lr=2.72e-06, step=9396] Training: 94%|█████████▍| 9397/10000 [2:01:57<07:25, 1.35it/s, loss=0.0359, lr=2.72e-06, step=9397] Training: 94%|█████████▍| 9398/10000 [2:01:58<07:37, 1.32it/s, loss=0.0359, lr=2.72e-06, step=9397] Training: 94%|█████████▍| 9398/10000 [2:01:58<07:37, 1.32it/s, loss=0.0284, lr=2.72e-06, step=9398] Training: 94%|█████████▍| 9399/10000 [2:01:59<07:48, 1.28it/s, loss=0.0284, lr=2.72e-06, step=9398] Training: 94%|█████████▍| 9399/10000 [2:01:59<07:48, 1.28it/s, loss=0.0063, lr=2.72e-06, step=9399]18:08:06.062 [I] step=9400 loss=0.0047 smoothed_loss=0.0112 lr=2.72e-06 grad_norm=0.3745 step_time=0.5850s data_time=0.1382s it/s=1.383 eta_to_10000=433.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0057 grad_action_out_proj=0.0685 grad_shared_expert=0.2892 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9400/10000 [2:01:59<07:05, 1.41it/s, loss=0.0063, lr=2.72e-06, step=9399] Training: 94%|█████████▍| 9400/10000 [2:01:59<07:05, 1.41it/s, loss=0.0047, lr=2.72e-06, step=9400] Training: 94%|█████████▍| 9401/10000 [2:02:00<06:26, 1.55it/s, loss=0.0047, lr=2.72e-06, step=9400] Training: 94%|█████████▍| 9401/10000 [2:02:00<06:26, 1.55it/s, loss=0.0026, lr=2.72e-06, step=9401] Training: 94%|█████████▍| 9402/10000 [2:02:00<06:02, 1.65it/s, loss=0.0026, lr=2.72e-06, step=9401] Training: 94%|█████████▍| 9402/10000 [2:02:00<06:02, 1.65it/s, loss=0.0061, lr=2.72e-06, step=9402] Training: 94%|█████████▍| 9403/10000 [2:02:01<06:37, 1.50it/s, loss=0.0061, lr=2.72e-06, step=9402] Training: 94%|█████████▍| 9403/10000 [2:02:01<06:37, 1.50it/s, loss=0.0074, lr=2.72e-06, step=9403] Training: 94%|█████████▍| 9404/10000 [2:02:02<06:40, 1.49it/s, loss=0.0074, lr=2.72e-06, step=9403] Training: 94%|█████████▍| 9404/10000 [2:02:02<06:40, 1.49it/s, loss=0.0024, lr=2.72e-06, step=9404] Training: 94%|█████████▍| 9405/10000 [2:02:02<07:03, 1.40it/s, loss=0.0024, lr=2.72e-06, step=9404] Training: 94%|█████████▍| 9405/10000 [2:02:02<07:03, 1.40it/s, loss=0.0105, lr=2.72e-06, step=9405] Training: 94%|█████████▍| 9406/10000 [2:02:03<07:23, 1.34it/s, loss=0.0105, lr=2.72e-06, step=9405] Training: 94%|█████████▍| 9406/10000 [2:02:03<07:23, 1.34it/s, loss=0.0055, lr=2.72e-06, step=9406] Training: 94%|█████████▍| 9407/10000 [2:02:04<07:14, 1.37it/s, loss=0.0055, lr=2.72e-06, step=9406] Training: 94%|█████████▍| 9407/10000 [2:02:04<07:14, 1.37it/s, loss=0.0109, lr=2.72e-06, step=9407] Training: 94%|█████████▍| 9408/10000 [2:02:05<06:50, 1.44it/s, loss=0.0109, lr=2.72e-06, step=9407] Training: 94%|█████████▍| 9408/10000 [2:02:05<06:50, 1.44it/s, loss=0.0059, lr=2.72e-06, step=9408] Training: 94%|█████████▍| 9409/10000 [2:02:05<07:14, 1.36it/s, loss=0.0059, lr=2.72e-06, step=9408] Training: 94%|█████████▍| 9409/10000 [2:02:05<07:14, 1.36it/s, loss=0.0015, lr=2.71e-06, step=9409]18:08:13.319 [I] step=9410 loss=0.0108 smoothed_loss=0.0082 lr=2.72e-06 grad_norm=0.3748 step_time=0.5859s data_time=0.1398s it/s=1.378 eta_to_10000=428.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0221 grad_action_out_proj=0.1288 grad_shared_expert=0.4823 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9410/10000 [2:02:06<07:58, 1.23it/s, loss=0.0015, lr=2.71e-06, step=9409] Training: 94%|█████████▍| 9410/10000 [2:02:06<07:58, 1.23it/s, loss=0.0108, lr=2.71e-06, step=9410] Training: 94%|█████████▍| 9411/10000 [2:02:07<07:34, 1.30it/s, loss=0.0108, lr=2.71e-06, step=9410] Training: 94%|█████████▍| 9411/10000 [2:02:07<07:34, 1.30it/s, loss=0.0044, lr=2.71e-06, step=9411] Training: 94%|█████████▍| 9412/10000 [2:02:08<06:44, 1.45it/s, loss=0.0044, lr=2.71e-06, step=9411] Training: 94%|█████████▍| 9412/10000 [2:02:08<06:44, 1.45it/s, loss=0.0099, lr=2.71e-06, step=9412] Training: 94%|█████████▍| 9413/10000 [2:02:08<06:53, 1.42it/s, loss=0.0099, lr=2.71e-06, step=9412] Training: 94%|█████████▍| 9413/10000 [2:02:08<06:53, 1.42it/s, loss=0.0034, lr=2.71e-06, step=9413] Training: 94%|█████████▍| 9414/10000 [2:02:09<07:10, 1.36it/s, loss=0.0034, lr=2.71e-06, step=9413] Training: 94%|█████████▍| 9414/10000 [2:02:09<07:10, 1.36it/s, loss=0.0060, lr=2.71e-06, step=9414] Training: 94%|█████████▍| 9415/10000 [2:02:10<07:36, 1.28it/s, loss=0.0060, lr=2.71e-06, step=9414] Training: 94%|█████████▍| 9415/10000 [2:02:10<07:36, 1.28it/s, loss=0.0033, lr=2.71e-06, step=9415] Training: 94%|█████████▍| 9416/10000 [2:02:10<06:46, 1.44it/s, loss=0.0033, lr=2.71e-06, step=9415] Training: 94%|█████████▍| 9416/10000 [2:02:10<06:46, 1.44it/s, loss=0.0126, lr=2.71e-06, step=9416] Training: 94%|█████████▍| 9417/10000 [2:02:11<06:48, 1.43it/s, loss=0.0126, lr=2.71e-06, step=9416] Training: 94%|█████████▍| 9417/10000 [2:02:11<06:48, 1.43it/s, loss=0.0041, lr=2.71e-06, step=9417] Training: 94%|█████████▍| 9418/10000 [2:02:12<07:09, 1.35it/s, loss=0.0041, lr=2.71e-06, step=9417] Training: 94%|█████████▍| 9418/10000 [2:02:12<07:09, 1.35it/s, loss=0.0064, lr=2.71e-06, step=9418] Training: 94%|█████████▍| 9419/10000 [2:02:13<07:21, 1.32it/s, loss=0.0064, lr=2.71e-06, step=9418] Training: 94%|█████████▍| 9419/10000 [2:02:13<07:21, 1.32it/s, loss=0.0017, lr=2.71e-06, step=9419]18:08:20.327 [I] step=9420 loss=0.0054 smoothed_loss=0.0065 lr=2.71e-06 grad_norm=0.4498 step_time=0.5733s data_time=0.1276s it/s=1.427 eta_to_10000=406.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0624 grad_shared_expert=0.3575 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9420/10000 [2:02:13<06:44, 1.43it/s, loss=0.0017, lr=2.71e-06, step=9419] Training: 94%|█████████▍| 9420/10000 [2:02:13<06:44, 1.43it/s, loss=0.0054, lr=2.71e-06, step=9420] Training: 94%|█████████▍| 9421/10000 [2:02:14<06:57, 1.39it/s, loss=0.0054, lr=2.71e-06, step=9420] Training: 94%|█████████▍| 9421/10000 [2:02:14<06:57, 1.39it/s, loss=0.0087, lr=2.71e-06, step=9421] Training: 94%|█████████▍| 9422/10000 [2:02:15<06:18, 1.53it/s, loss=0.0087, lr=2.71e-06, step=9421] Training: 94%|█████████▍| 9422/10000 [2:02:15<06:18, 1.53it/s, loss=0.0212, lr=2.71e-06, step=9422] Training: 94%|█████████▍| 9423/10000 [2:02:15<05:56, 1.62it/s, loss=0.0212, lr=2.71e-06, step=9422] Training: 94%|█████████▍| 9423/10000 [2:02:15<05:56, 1.62it/s, loss=0.0070, lr=2.70e-06, step=9423] Training: 94%|█████████▍| 9424/10000 [2:02:16<06:13, 1.54it/s, loss=0.0070, lr=2.70e-06, step=9423] Training: 94%|█████████▍| 9424/10000 [2:02:16<06:13, 1.54it/s, loss=0.0204, lr=2.70e-06, step=9424] Training: 94%|█████████▍| 9425/10000 [2:02:17<06:41, 1.43it/s, loss=0.0204, lr=2.70e-06, step=9424] Training: 94%|█████████▍| 9425/10000 [2:02:17<06:41, 1.43it/s, loss=0.0063, lr=2.70e-06, step=9425] Training: 94%|█████████▍| 9426/10000 [2:02:18<07:22, 1.30it/s, loss=0.0063, lr=2.70e-06, step=9425] Training: 94%|█████████▍| 9426/10000 [2:02:18<07:22, 1.30it/s, loss=0.0044, lr=2.70e-06, step=9426] Training: 94%|█████████▍| 9427/10000 [2:02:19<07:34, 1.26it/s, loss=0.0044, lr=2.70e-06, step=9426] Training: 94%|█████████▍| 9427/10000 [2:02:19<07:34, 1.26it/s, loss=0.0092, lr=2.70e-06, step=9427] Training: 94%|█████████▍| 9428/10000 [2:02:19<07:32, 1.26it/s, loss=0.0092, lr=2.70e-06, step=9427] Training: 94%|█████████▍| 9428/10000 [2:02:19<07:32, 1.26it/s, loss=0.0024, lr=2.70e-06, step=9428] Training: 94%|█████████▍| 9429/10000 [2:02:20<06:44, 1.41it/s, loss=0.0024, lr=2.70e-06, step=9428] Training: 94%|█████████▍| 9429/10000 [2:02:20<06:44, 1.41it/s, loss=0.0051, lr=2.70e-06, step=9429]18:08:27.542 [I] step=9430 loss=0.0095 smoothed_loss=0.0079 lr=2.70e-06 grad_norm=0.4312 step_time=0.5818s data_time=0.1397s it/s=1.386 eta_to_10000=411.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0192 grad_action_out_proj=0.1227 grad_shared_expert=0.4525 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9430/10000 [2:02:21<06:56, 1.37it/s, loss=0.0051, lr=2.70e-06, step=9429] Training: 94%|█████████▍| 9430/10000 [2:02:21<06:56, 1.37it/s, loss=0.0095, lr=2.70e-06, step=9430] Training: 94%|█████████▍| 9431/10000 [2:02:21<07:02, 1.35it/s, loss=0.0095, lr=2.70e-06, step=9430] Training: 94%|█████████▍| 9431/10000 [2:02:21<07:02, 1.35it/s, loss=0.0123, lr=2.70e-06, step=9431] Training: 94%|█████████▍| 9432/10000 [2:02:22<06:22, 1.48it/s, loss=0.0123, lr=2.70e-06, step=9431] Training: 94%|█████████▍| 9432/10000 [2:02:22<06:22, 1.48it/s, loss=0.0016, lr=2.70e-06, step=9432] Training: 94%|█████████▍| 9433/10000 [2:02:22<05:59, 1.58it/s, loss=0.0016, lr=2.70e-06, step=9432] Training: 94%|█████████▍| 9433/10000 [2:02:22<05:59, 1.58it/s, loss=0.0061, lr=2.70e-06, step=9433] Training: 94%|█████████▍| 9434/10000 [2:02:23<07:03, 1.34it/s, loss=0.0061, lr=2.70e-06, step=9433] Training: 94%|█████████▍| 9434/10000 [2:02:23<07:03, 1.34it/s, loss=0.0232, lr=2.70e-06, step=9434] Training: 94%|█████████▍| 9435/10000 [2:02:24<07:35, 1.24it/s, loss=0.0232, lr=2.70e-06, step=9434] Training: 94%|█████████▍| 9435/10000 [2:02:24<07:35, 1.24it/s, loss=0.0290, lr=2.70e-06, step=9435] Training: 94%|█████████▍| 9436/10000 [2:02:25<06:53, 1.36it/s, loss=0.0290, lr=2.70e-06, step=9435] Training: 94%|█████████▍| 9436/10000 [2:02:25<06:53, 1.36it/s, loss=0.0143, lr=2.70e-06, step=9436] Training: 94%|█████████▍| 9437/10000 [2:02:25<06:19, 1.48it/s, loss=0.0143, lr=2.70e-06, step=9436] Training: 94%|█████████▍| 9437/10000 [2:02:25<06:19, 1.48it/s, loss=0.0050, lr=2.70e-06, step=9437] Training: 94%|█████████▍| 9438/10000 [2:02:26<06:27, 1.45it/s, loss=0.0050, lr=2.70e-06, step=9437] Training: 94%|█████████▍| 9438/10000 [2:02:26<06:27, 1.45it/s, loss=0.0071, lr=2.69e-06, step=9438] Training: 94%|█████████▍| 9439/10000 [2:02:27<06:58, 1.34it/s, loss=0.0071, lr=2.69e-06, step=9438] Training: 94%|█████████▍| 9439/10000 [2:02:27<06:58, 1.34it/s, loss=0.0058, lr=2.69e-06, step=9439]18:08:34.971 [I] step=9440 loss=0.0043 smoothed_loss=0.0093 lr=2.70e-06 grad_norm=0.4106 step_time=0.6077s data_time=0.1352s it/s=1.346 eta_to_10000=416.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0594 grad_shared_expert=0.1800 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9440/10000 [2:02:28<07:31, 1.24it/s, loss=0.0058, lr=2.69e-06, step=9439] Training: 94%|█████████▍| 9440/10000 [2:02:28<07:31, 1.24it/s, loss=0.0043, lr=2.69e-06, step=9440] Training: 94%|█████████▍| 9441/10000 [2:02:29<07:56, 1.17it/s, loss=0.0043, lr=2.69e-06, step=9440] Training: 94%|█████████▍| 9441/10000 [2:02:29<07:56, 1.17it/s, loss=0.0172, lr=2.69e-06, step=9441] Training: 94%|█████████▍| 9442/10000 [2:02:30<07:53, 1.18it/s, loss=0.0172, lr=2.69e-06, step=9441] Training: 94%|█████████▍| 9442/10000 [2:02:30<07:53, 1.18it/s, loss=0.0068, lr=2.69e-06, step=9442] Training: 94%|█████████▍| 9443/10000 [2:02:30<07:11, 1.29it/s, loss=0.0068, lr=2.69e-06, step=9442] Training: 94%|█████████▍| 9443/10000 [2:02:30<07:11, 1.29it/s, loss=0.0194, lr=2.69e-06, step=9443] Training: 94%|█████████▍| 9444/10000 [2:02:31<06:38, 1.40it/s, loss=0.0194, lr=2.69e-06, step=9443] Training: 94%|█████████▍| 9444/10000 [2:02:31<06:38, 1.40it/s, loss=0.0031, lr=2.69e-06, step=9444] Training: 94%|█████████▍| 9445/10000 [2:02:32<06:04, 1.52it/s, loss=0.0031, lr=2.69e-06, step=9444] Training: 94%|█████████▍| 9445/10000 [2:02:32<06:04, 1.52it/s, loss=0.0183, lr=2.69e-06, step=9445] Training: 94%|█████████▍| 9446/10000 [2:02:32<06:33, 1.41it/s, loss=0.0183, lr=2.69e-06, step=9445] Training: 94%|█████████▍| 9446/10000 [2:02:32<06:33, 1.41it/s, loss=0.0056, lr=2.69e-06, step=9446] Training: 94%|█████████▍| 9447/10000 [2:02:33<05:58, 1.54it/s, loss=0.0056, lr=2.69e-06, step=9446] Training: 94%|█████████▍| 9447/10000 [2:02:33<05:58, 1.54it/s, loss=0.0032, lr=2.69e-06, step=9447] Training: 94%|█████████▍| 9448/10000 [2:02:33<05:36, 1.64it/s, loss=0.0032, lr=2.69e-06, step=9447] Training: 94%|█████████▍| 9448/10000 [2:02:33<05:36, 1.64it/s, loss=0.0021, lr=2.69e-06, step=9448] Training: 94%|█████████▍| 9449/10000 [2:02:34<06:12, 1.48it/s, loss=0.0021, lr=2.69e-06, step=9448] Training: 94%|█████████▍| 9449/10000 [2:02:34<06:12, 1.48it/s, loss=0.0040, lr=2.69e-06, step=9449]18:08:42.038 [I] step=9450 loss=0.0023 smoothed_loss=0.0077 lr=2.69e-06 grad_norm=0.3921 step_time=0.5749s data_time=0.1317s it/s=1.415 eta_to_10000=388.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0665 grad_shared_expert=0.2440 (10775:train_pytorch.py:850) + Training: 94%|█████████▍| 9450/10000 [2:02:35<06:44, 1.36it/s, loss=0.0040, lr=2.69e-06, step=9449] Training: 94%|█████████▍| 9450/10000 [2:02:35<06:44, 1.36it/s, loss=0.0023, lr=2.69e-06, step=9450] Training: 95%|█████████▍| 9451/10000 [2:02:36<06:03, 1.51it/s, loss=0.0023, lr=2.69e-06, step=9450] Training: 95%|█████████▍| 9451/10000 [2:02:36<06:03, 1.51it/s, loss=0.0113, lr=2.69e-06, step=9451] Training: 95%|█████████▍| 9452/10000 [2:02:36<06:02, 1.51it/s, loss=0.0113, lr=2.69e-06, step=9451] Training: 95%|█████████▍| 9452/10000 [2:02:36<06:02, 1.51it/s, loss=0.0057, lr=2.68e-06, step=9452] Training: 95%|█████████▍| 9453/10000 [2:02:37<06:30, 1.40it/s, loss=0.0057, lr=2.68e-06, step=9452] Training: 95%|█████████▍| 9453/10000 [2:02:37<06:30, 1.40it/s, loss=0.0196, lr=2.68e-06, step=9453] Training: 95%|█████████▍| 9454/10000 [2:02:38<05:53, 1.55it/s, loss=0.0196, lr=2.68e-06, step=9453] Training: 95%|█████████▍| 9454/10000 [2:02:38<05:53, 1.55it/s, loss=0.0051, lr=2.68e-06, step=9454] Training: 95%|█████████▍| 9455/10000 [2:02:38<05:49, 1.56it/s, loss=0.0051, lr=2.68e-06, step=9454] Training: 95%|█████████▍| 9455/10000 [2:02:38<05:49, 1.56it/s, loss=0.0591, lr=2.68e-06, step=9455] Training: 95%|█████████▍| 9456/10000 [2:02:39<05:59, 1.51it/s, loss=0.0591, lr=2.68e-06, step=9455] Training: 95%|█████████▍| 9456/10000 [2:02:39<05:59, 1.51it/s, loss=0.0016, lr=2.68e-06, step=9456] Training: 95%|█████████▍| 9457/10000 [2:02:39<05:36, 1.61it/s, loss=0.0016, lr=2.68e-06, step=9456] Training: 95%|█████████▍| 9457/10000 [2:02:39<05:36, 1.61it/s, loss=0.0041, lr=2.68e-06, step=9457] Training: 95%|█████████▍| 9458/10000 [2:02:40<05:50, 1.55it/s, loss=0.0041, lr=2.68e-06, step=9457] Training: 95%|█████████▍| 9458/10000 [2:02:40<05:50, 1.55it/s, loss=0.0010, lr=2.68e-06, step=9458] Training: 95%|█████████▍| 9459/10000 [2:02:41<05:31, 1.63it/s, loss=0.0010, lr=2.68e-06, step=9458] Training: 95%|█████████▍| 9459/10000 [2:02:41<05:31, 1.63it/s, loss=0.0034, lr=2.68e-06, step=9459]18:08:48.776 [I] step=9460 loss=0.0206 smoothed_loss=0.0109 lr=2.68e-06 grad_norm=0.4454 step_time=0.5558s data_time=0.1180s it/s=1.484 eta_to_10000=363.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0140 grad_action_out_proj=0.0911 grad_shared_expert=0.4005 (10775:train_pytorch.py:850) + Training: 95%|█████████▍| 9460/10000 [2:02:42<06:59, 1.29it/s, loss=0.0034, lr=2.68e-06, step=9459] Training: 95%|█████████▍| 9460/10000 [2:02:42<06:59, 1.29it/s, loss=0.0206, lr=2.68e-06, step=9460] Training: 95%|█████████▍| 9461/10000 [2:02:43<06:45, 1.33it/s, loss=0.0206, lr=2.68e-06, step=9460] Training: 95%|█████████▍| 9461/10000 [2:02:43<06:45, 1.33it/s, loss=0.0099, lr=2.68e-06, step=9461] Training: 95%|█████████▍| 9462/10000 [2:02:43<06:08, 1.46it/s, loss=0.0099, lr=2.68e-06, step=9461] Training: 95%|█████████▍| 9462/10000 [2:02:43<06:08, 1.46it/s, loss=0.0011, lr=2.68e-06, step=9462] Training: 95%|█████████▍| 9463/10000 [2:02:44<06:17, 1.42it/s, loss=0.0011, lr=2.68e-06, step=9462] Training: 95%|█████████▍| 9463/10000 [2:02:44<06:17, 1.42it/s, loss=0.0013, lr=2.68e-06, step=9463] Training: 95%|█████████▍| 9464/10000 [2:02:44<05:49, 1.53it/s, loss=0.0013, lr=2.68e-06, step=9463] Training: 95%|█████████▍| 9464/10000 [2:02:44<05:49, 1.53it/s, loss=0.0016, lr=2.68e-06, step=9464] Training: 95%|█████████▍| 9465/10000 [2:02:45<06:05, 1.46it/s, loss=0.0016, lr=2.68e-06, step=9464] Training: 95%|█████████▍| 9465/10000 [2:02:45<06:05, 1.46it/s, loss=0.0087, lr=2.68e-06, step=9465] Training: 95%|█████████▍| 9466/10000 [2:02:46<06:29, 1.37it/s, loss=0.0087, lr=2.68e-06, step=9465] Training: 95%|█████████▍| 9466/10000 [2:02:46<06:29, 1.37it/s, loss=0.0064, lr=2.68e-06, step=9466] Training: 95%|█████████▍| 9467/10000 [2:02:47<06:24, 1.39it/s, loss=0.0064, lr=2.68e-06, step=9466] Training: 95%|█████████▍| 9467/10000 [2:02:47<06:24, 1.39it/s, loss=0.0007, lr=2.67e-06, step=9467] Training: 95%|█████████▍| 9468/10000 [2:02:47<05:48, 1.53it/s, loss=0.0007, lr=2.67e-06, step=9467] Training: 95%|█████████▍| 9468/10000 [2:02:47<05:48, 1.53it/s, loss=0.0127, lr=2.67e-06, step=9468] Training: 95%|█████████▍| 9469/10000 [2:02:48<05:24, 1.64it/s, loss=0.0127, lr=2.67e-06, step=9468] Training: 95%|█████████▍| 9469/10000 [2:02:48<05:24, 1.64it/s, loss=0.0057, lr=2.67e-06, step=9469]18:08:55.497 [I] step=9470 loss=0.0086 smoothed_loss=0.0078 lr=2.68e-06 grad_norm=0.4208 step_time=0.5695s data_time=0.1026s it/s=1.488 eta_to_10000=356.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0082 grad_action_out_proj=0.0785 grad_shared_expert=0.2694 (10775:train_pytorch.py:850) + Training: 95%|█████████▍| 9470/10000 [2:02:49<06:12, 1.42it/s, loss=0.0057, lr=2.67e-06, step=9469] Training: 95%|█████████▍| 9470/10000 [2:02:49<06:12, 1.42it/s, loss=0.0086, lr=2.67e-06, step=9470] Training: 95%|█████████▍| 9471/10000 [2:02:50<06:56, 1.27it/s, loss=0.0086, lr=2.67e-06, step=9470] Training: 95%|█████████▍| 9471/10000 [2:02:50<06:56, 1.27it/s, loss=0.0118, lr=2.67e-06, step=9471] Training: 95%|█████████▍| 9472/10000 [2:02:50<06:10, 1.42it/s, loss=0.0118, lr=2.67e-06, step=9471] Training: 95%|█████████▍| 9472/10000 [2:02:50<06:10, 1.42it/s, loss=0.0051, lr=2.67e-06, step=9472] Training: 95%|█████████▍| 9473/10000 [2:02:51<06:12, 1.41it/s, loss=0.0051, lr=2.67e-06, step=9472] Training: 95%|█████████▍| 9473/10000 [2:02:51<06:12, 1.41it/s, loss=0.0202, lr=2.67e-06, step=9473] Training: 95%|█████████▍| 9474/10000 [2:02:52<07:06, 1.23it/s, loss=0.0202, lr=2.67e-06, step=9473] Training: 95%|█████████▍| 9474/10000 [2:02:52<07:06, 1.23it/s, loss=0.0027, lr=2.67e-06, step=9474] Training: 95%|█████████▍| 9475/10000 [2:02:53<07:04, 1.24it/s, loss=0.0027, lr=2.67e-06, step=9474] Training: 95%|█████████▍| 9475/10000 [2:02:53<07:04, 1.24it/s, loss=0.0109, lr=2.67e-06, step=9475] Training: 95%|█████████▍| 9476/10000 [2:02:53<06:13, 1.40it/s, loss=0.0109, lr=2.67e-06, step=9475] Training: 95%|█████████▍| 9476/10000 [2:02:53<06:13, 1.40it/s, loss=0.0024, lr=2.67e-06, step=9476] Training: 95%|█████████▍| 9477/10000 [2:02:54<06:11, 1.41it/s, loss=0.0024, lr=2.67e-06, step=9476] Training: 95%|█████████▍| 9477/10000 [2:02:54<06:11, 1.41it/s, loss=0.0011, lr=2.67e-06, step=9477] Training: 95%|█████████▍| 9478/10000 [2:02:54<05:34, 1.56it/s, loss=0.0011, lr=2.67e-06, step=9477] Training: 95%|█████████▍| 9478/10000 [2:02:54<05:34, 1.56it/s, loss=0.0010, lr=2.67e-06, step=9478] Training: 95%|█████████▍| 9479/10000 [2:02:55<05:08, 1.69it/s, loss=0.0010, lr=2.67e-06, step=9478] Training: 95%|█████████▍| 9479/10000 [2:02:55<05:08, 1.69it/s, loss=0.0103, lr=2.67e-06, step=9479]18:09:02.485 [I] step=9480 loss=0.0198 smoothed_loss=0.0084 lr=2.67e-06 grad_norm=0.4853 step_time=0.5675s data_time=0.1314s it/s=1.431 eta_to_10000=363.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0173 grad_action_out_proj=0.1112 grad_shared_expert=0.5434 (10775:train_pytorch.py:850) + Training: 95%|█████████▍| 9480/10000 [2:02:56<05:36, 1.54it/s, loss=0.0103, lr=2.67e-06, step=9479] Training: 95%|█████████▍| 9480/10000 [2:02:56<05:36, 1.54it/s, loss=0.0198, lr=2.67e-06, step=9480] Training: 95%|█████████▍| 9481/10000 [2:02:56<05:48, 1.49it/s, loss=0.0198, lr=2.67e-06, step=9480] Training: 95%|█████████▍| 9481/10000 [2:02:56<05:48, 1.49it/s, loss=0.0549, lr=2.67e-06, step=9481] Training: 95%|█████████▍| 9482/10000 [2:02:57<05:33, 1.55it/s, loss=0.0549, lr=2.67e-06, step=9481] Training: 95%|█████████▍| 9482/10000 [2:02:57<05:33, 1.55it/s, loss=0.0040, lr=2.67e-06, step=9482] Training: 95%|█████████▍| 9483/10000 [2:02:58<06:02, 1.42it/s, loss=0.0040, lr=2.67e-06, step=9482] Training: 95%|█████████▍| 9483/10000 [2:02:58<06:02, 1.42it/s, loss=0.0071, lr=2.66e-06, step=9483] Training: 95%|█████████▍| 9484/10000 [2:02:58<06:00, 1.43it/s, loss=0.0071, lr=2.66e-06, step=9483] Training: 95%|█████████▍| 9484/10000 [2:02:58<06:00, 1.43it/s, loss=0.0035, lr=2.66e-06, step=9484] Training: 95%|█████████▍| 9485/10000 [2:02:59<05:30, 1.56it/s, loss=0.0035, lr=2.66e-06, step=9484] Training: 95%|█████████▍| 9485/10000 [2:02:59<05:30, 1.56it/s, loss=0.0049, lr=2.66e-06, step=9485] Training: 95%|█████████▍| 9486/10000 [2:02:59<05:05, 1.68it/s, loss=0.0049, lr=2.66e-06, step=9485] Training: 95%|█████████▍| 9486/10000 [2:02:59<05:05, 1.68it/s, loss=0.0025, lr=2.66e-06, step=9486] Training: 95%|█████████▍| 9487/10000 [2:03:00<04:50, 1.77it/s, loss=0.0025, lr=2.66e-06, step=9486] Training: 95%|█████████▍| 9487/10000 [2:03:00<04:50, 1.77it/s, loss=0.0120, lr=2.66e-06, step=9487] Training: 95%|█████████▍| 9488/10000 [2:03:01<05:07, 1.66it/s, loss=0.0120, lr=2.66e-06, step=9487] Training: 95%|█████████▍| 9488/10000 [2:03:01<05:07, 1.66it/s, loss=0.0017, lr=2.66e-06, step=9488] Training: 95%|█████████▍| 9489/10000 [2:03:02<06:03, 1.41it/s, loss=0.0017, lr=2.66e-06, step=9488] Training: 95%|█████████▍| 9489/10000 [2:03:02<06:03, 1.41it/s, loss=0.0020, lr=2.66e-06, step=9489]18:09:09.136 [I] step=9490 loss=0.0238 smoothed_loss=0.0098 lr=2.66e-06 grad_norm=0.4431 step_time=0.5574s data_time=0.1076s it/s=1.504 eta_to_10000=339.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0075 grad_action_out_proj=0.1141 grad_shared_expert=0.7838 (10775:train_pytorch.py:850) + Training: 95%|█████████▍| 9490/10000 [2:03:02<05:55, 1.43it/s, loss=0.0020, lr=2.66e-06, step=9489] Training: 95%|█████████▍| 9490/10000 [2:03:02<05:55, 1.43it/s, loss=0.0238, lr=2.66e-06, step=9490] Training: 95%|█████████▍| 9491/10000 [2:03:03<05:56, 1.43it/s, loss=0.0238, lr=2.66e-06, step=9490] Training: 95%|█████████▍| 9491/10000 [2:03:03<05:56, 1.43it/s, loss=0.0264, lr=2.66e-06, step=9491] Training: 95%|█████████▍| 9492/10000 [2:03:03<05:24, 1.56it/s, loss=0.0264, lr=2.66e-06, step=9491] Training: 95%|█████████▍| 9492/10000 [2:03:03<05:24, 1.56it/s, loss=0.0045, lr=2.66e-06, step=9492] Training: 95%|█████████▍| 9493/10000 [2:03:04<05:26, 1.55it/s, loss=0.0045, lr=2.66e-06, step=9492] Training: 95%|█████████▍| 9493/10000 [2:03:04<05:26, 1.55it/s, loss=0.0066, lr=2.66e-06, step=9493] Training: 95%|█████████▍| 9494/10000 [2:03:05<05:04, 1.66it/s, loss=0.0066, lr=2.66e-06, step=9493] Training: 95%|█████████▍| 9494/10000 [2:03:05<05:04, 1.66it/s, loss=0.0013, lr=2.66e-06, step=9494] Training: 95%|█████████▍| 9495/10000 [2:03:05<05:03, 1.67it/s, loss=0.0013, lr=2.66e-06, step=9494] Training: 95%|█████████▍| 9495/10000 [2:03:05<05:03, 1.67it/s, loss=0.0174, lr=2.66e-06, step=9495] Training: 95%|█████████▍| 9496/10000 [2:03:06<05:33, 1.51it/s, loss=0.0174, lr=2.66e-06, step=9495] Training: 95%|█████████▍| 9496/10000 [2:03:06<05:33, 1.51it/s, loss=0.0032, lr=2.66e-06, step=9496] Training: 95%|█████████▍| 9497/10000 [2:03:06<05:06, 1.64it/s, loss=0.0032, lr=2.66e-06, step=9496] Training: 95%|█████████▍| 9497/10000 [2:03:06<05:06, 1.64it/s, loss=0.0042, lr=2.66e-06, step=9497] Training: 95%|█████████▍| 9498/10000 [2:03:08<06:17, 1.33it/s, loss=0.0042, lr=2.66e-06, step=9497] Training: 95%|█████████▍| 9498/10000 [2:03:08<06:17, 1.33it/s, loss=0.0159, lr=2.66e-06, step=9498] Training: 95%|█████████▍| 9499/10000 [2:03:08<06:17, 1.33it/s, loss=0.0159, lr=2.66e-06, step=9498] Training: 95%|█████████▍| 9499/10000 [2:03:08<06:17, 1.33it/s, loss=0.0041, lr=2.65e-06, step=9499]18:09:15.746 [I] step=9500 loss=0.0029 smoothed_loss=0.0085 lr=2.66e-06 grad_norm=0.4148 step_time=0.5516s data_time=0.1095s it/s=1.513 eta_to_10000=330.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0036 grad_action_out_proj=0.0456 grad_shared_expert=0.2314 (10775:train_pytorch.py:850) + Training: 95%|█████████▌| 9500/10000 [2:03:09<05:41, 1.47it/s, loss=0.0041, lr=2.65e-06, step=9499] Training: 95%|█████████▌| 9500/10000 [2:03:09<05:41, 1.47it/s, loss=0.0029, lr=2.65e-06, step=9500] Training: 95%|█████████▌| 9501/10000 [2:03:09<05:24, 1.54it/s, loss=0.0029, lr=2.65e-06, step=9500] Training: 95%|█████████▌| 9501/10000 [2:03:09<05:24, 1.54it/s, loss=0.0012, lr=2.65e-06, step=9501] Training: 95%|█████████▌| 9502/10000 [2:03:10<05:00, 1.66it/s, loss=0.0012, lr=2.65e-06, step=9501] Training: 95%|█████████▌| 9502/10000 [2:03:10<05:00, 1.66it/s, loss=0.0525, lr=2.65e-06, step=9502] Training: 95%|█████████▌| 9503/10000 [2:03:11<05:14, 1.58it/s, loss=0.0525, lr=2.65e-06, step=9502] Training: 95%|█████████▌| 9503/10000 [2:03:11<05:14, 1.58it/s, loss=0.0158, lr=2.65e-06, step=9503] Training: 95%|█████████▌| 9504/10000 [2:03:11<04:58, 1.66it/s, loss=0.0158, lr=2.65e-06, step=9503] Training: 95%|█████████▌| 9504/10000 [2:03:11<04:58, 1.66it/s, loss=0.0080, lr=2.65e-06, step=9504] Training: 95%|█████████▌| 9505/10000 [2:03:12<05:14, 1.57it/s, loss=0.0080, lr=2.65e-06, step=9504] Training: 95%|█████████▌| 9505/10000 [2:03:12<05:14, 1.57it/s, loss=0.0100, lr=2.65e-06, step=9505] Training: 95%|█████████▌| 9506/10000 [2:03:12<04:51, 1.69it/s, loss=0.0100, lr=2.65e-06, step=9505] Training: 95%|█████████▌| 9506/10000 [2:03:12<04:51, 1.69it/s, loss=0.0012, lr=2.65e-06, step=9506] Training: 95%|█████████▌| 9507/10000 [2:03:13<04:35, 1.79it/s, loss=0.0012, lr=2.65e-06, step=9506] Training: 95%|█████████▌| 9507/10000 [2:03:13<04:35, 1.79it/s, loss=0.0030, lr=2.65e-06, step=9507] Training: 95%|█████████▌| 9508/10000 [2:03:13<04:22, 1.87it/s, loss=0.0030, lr=2.65e-06, step=9507] Training: 95%|█████████▌| 9508/10000 [2:03:13<04:22, 1.87it/s, loss=0.0107, lr=2.65e-06, step=9508] Training: 95%|█████████▌| 9509/10000 [2:03:14<04:20, 1.89it/s, loss=0.0107, lr=2.65e-06, step=9508] Training: 95%|█████████▌| 9509/10000 [2:03:14<04:20, 1.89it/s, loss=0.0024, lr=2.65e-06, step=9509]18:09:21.556 [I] step=9510 loss=0.0327 smoothed_loss=0.0117 lr=2.65e-06 grad_norm=0.5235 step_time=0.5139s data_time=0.0671s it/s=1.722 eta_to_10000=284.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0241 grad_action_out_proj=0.1341 grad_shared_expert=0.4542 (10775:train_pytorch.py:850) + Training: 95%|█████████▌| 9510/10000 [2:03:15<05:03, 1.61it/s, loss=0.0024, lr=2.65e-06, step=9509] Training: 95%|█████████▌| 9510/10000 [2:03:15<05:03, 1.61it/s, loss=0.0327, lr=2.65e-06, step=9510] Training: 95%|█████████▌| 9511/10000 [2:03:15<04:44, 1.72it/s, loss=0.0327, lr=2.65e-06, step=9510] Training: 95%|█████████▌| 9511/10000 [2:03:15<04:44, 1.72it/s, loss=0.0021, lr=2.65e-06, step=9511] Training: 95%|█████████▌| 9512/10000 [2:03:16<04:34, 1.77it/s, loss=0.0021, lr=2.65e-06, step=9511] Training: 95%|█████████▌| 9512/10000 [2:03:16<04:34, 1.77it/s, loss=0.0046, lr=2.65e-06, step=9512] Training: 95%|█████████▌| 9513/10000 [2:03:16<04:51, 1.67it/s, loss=0.0046, lr=2.65e-06, step=9512] Training: 95%|█████████▌| 9513/10000 [2:03:16<04:51, 1.67it/s, loss=0.0051, lr=2.65e-06, step=9513] Training: 95%|█████████▌| 9514/10000 [2:03:17<04:32, 1.78it/s, loss=0.0051, lr=2.65e-06, step=9513] Training: 95%|█████████▌| 9514/10000 [2:03:17<04:32, 1.78it/s, loss=0.0032, lr=2.65e-06, step=9514] Training: 95%|█████████▌| 9515/10000 [2:03:17<04:20, 1.86it/s, loss=0.0032, lr=2.65e-06, step=9514] Training: 95%|█████████▌| 9515/10000 [2:03:17<04:20, 1.86it/s, loss=0.0060, lr=2.64e-06, step=9515] Training: 95%|█████████▌| 9516/10000 [2:03:18<04:24, 1.83it/s, loss=0.0060, lr=2.64e-06, step=9515] Training: 95%|█████████▌| 9516/10000 [2:03:18<04:24, 1.83it/s, loss=0.0056, lr=2.64e-06, step=9516] Training: 95%|█████████▌| 9517/10000 [2:03:19<04:48, 1.68it/s, loss=0.0056, lr=2.64e-06, step=9516] Training: 95%|█████████▌| 9517/10000 [2:03:19<04:48, 1.68it/s, loss=0.0029, lr=2.64e-06, step=9517] Training: 95%|█████████▌| 9518/10000 [2:03:19<04:34, 1.76it/s, loss=0.0029, lr=2.64e-06, step=9517] Training: 95%|█████████▌| 9518/10000 [2:03:19<04:34, 1.76it/s, loss=0.0168, lr=2.64e-06, step=9518] Training: 95%|█████████▌| 9519/10000 [2:03:20<04:22, 1.83it/s, loss=0.0168, lr=2.64e-06, step=9518] Training: 95%|█████████▌| 9519/10000 [2:03:20<04:22, 1.83it/s, loss=0.0172, lr=2.64e-06, step=9519]18:09:27.342 [I] step=9520 loss=0.0064 smoothed_loss=0.0093 lr=2.64e-06 grad_norm=0.3688 step_time=0.5155s data_time=0.0631s it/s=1.728 eta_to_10000=277.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0572 grad_shared_expert=0.2390 (10775:train_pytorch.py:850) + Training: 95%|█████████▌| 9520/10000 [2:03:20<05:07, 1.56it/s, loss=0.0172, lr=2.64e-06, step=9519] Training: 95%|█████████▌| 9520/10000 [2:03:20<05:07, 1.56it/s, loss=0.0064, lr=2.64e-06, step=9520] Training: 95%|█████████▌| 9521/10000 [2:03:21<05:34, 1.43it/s, loss=0.0064, lr=2.64e-06, step=9520] Training: 95%|█████████▌| 9521/10000 [2:03:21<05:34, 1.43it/s, loss=0.0158, lr=2.64e-06, step=9521] Training: 95%|█████████▌| 9522/10000 [2:03:22<05:10, 1.54it/s, loss=0.0158, lr=2.64e-06, step=9521] Training: 95%|█████████▌| 9522/10000 [2:03:22<05:10, 1.54it/s, loss=0.0204, lr=2.64e-06, step=9522] Training: 95%|█████████▌| 9523/10000 [2:03:22<04:53, 1.62it/s, loss=0.0204, lr=2.64e-06, step=9522] Training: 95%|█████████▌| 9523/10000 [2:03:22<04:53, 1.62it/s, loss=0.0031, lr=2.64e-06, step=9523] Training: 95%|█████████▌| 9524/10000 [2:03:23<05:04, 1.56it/s, loss=0.0031, lr=2.64e-06, step=9523] Training: 95%|█████████▌| 9524/10000 [2:03:23<05:04, 1.56it/s, loss=0.0031, lr=2.64e-06, step=9524] Training: 95%|█████████▌| 9525/10000 [2:03:24<04:45, 1.66it/s, loss=0.0031, lr=2.64e-06, step=9524] Training: 95%|█████████▌| 9525/10000 [2:03:24<04:45, 1.66it/s, loss=0.0020, lr=2.64e-06, step=9525] Training: 95%|█████████▌| 9526/10000 [2:03:24<04:31, 1.74it/s, loss=0.0020, lr=2.64e-06, step=9525] Training: 95%|█████████▌| 9526/10000 [2:03:24<04:31, 1.74it/s, loss=0.0069, lr=2.64e-06, step=9526] Training: 95%|█████████▌| 9527/10000 [2:03:25<04:33, 1.73it/s, loss=0.0069, lr=2.64e-06, step=9526] Training: 95%|█████████▌| 9527/10000 [2:03:25<04:33, 1.73it/s, loss=0.0020, lr=2.64e-06, step=9527] Training: 95%|█████████▌| 9528/10000 [2:03:25<04:48, 1.64it/s, loss=0.0020, lr=2.64e-06, step=9527] Training: 95%|█████████▌| 9528/10000 [2:03:25<04:48, 1.64it/s, loss=0.0052, lr=2.64e-06, step=9528] Training: 95%|█████████▌| 9529/10000 [2:03:26<04:32, 1.73it/s, loss=0.0052, lr=2.64e-06, step=9528] Training: 95%|█████████▌| 9529/10000 [2:03:26<04:32, 1.73it/s, loss=0.0138, lr=2.64e-06, step=9529]18:09:33.275 [I] step=9530 loss=0.0039 smoothed_loss=0.0078 lr=2.64e-06 grad_norm=0.3187 step_time=0.5103s data_time=0.0830s it/s=1.686 eta_to_10000=278.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0035 grad_action_out_proj=0.0433 grad_shared_expert=0.1850 (10775:train_pytorch.py:850) + Training: 95%|█████████▌| 9530/10000 [2:03:26<04:25, 1.77it/s, loss=0.0138, lr=2.64e-06, step=9529] Training: 95%|█████████▌| 9530/10000 [2:03:26<04:25, 1.77it/s, loss=0.0039, lr=2.64e-06, step=9530] Training: 95%|█████████▌| 9531/10000 [2:03:27<04:19, 1.81it/s, loss=0.0039, lr=2.64e-06, step=9530] Training: 95%|█████████▌| 9531/10000 [2:03:27<04:19, 1.81it/s, loss=0.0195, lr=2.64e-06, step=9531] Training: 95%|█████████▌| 9532/10000 [2:03:28<04:51, 1.61it/s, loss=0.0195, lr=2.64e-06, step=9531] Training: 95%|█████████▌| 9532/10000 [2:03:28<04:51, 1.61it/s, loss=0.0245, lr=2.64e-06, step=9532] Training: 95%|█████████▌| 9533/10000 [2:03:28<04:50, 1.61it/s, loss=0.0245, lr=2.64e-06, step=9532] Training: 95%|█████████▌| 9533/10000 [2:03:28<04:50, 1.61it/s, loss=0.0209, lr=2.63e-06, step=9533] Training: 95%|█████████▌| 9534/10000 [2:03:29<04:32, 1.71it/s, loss=0.0209, lr=2.63e-06, step=9533] Training: 95%|█████████▌| 9534/10000 [2:03:29<04:32, 1.71it/s, loss=0.0257, lr=2.63e-06, step=9534] Training: 95%|█████████▌| 9535/10000 [2:03:29<04:45, 1.63it/s, loss=0.0257, lr=2.63e-06, step=9534] Training: 95%|█████████▌| 9535/10000 [2:03:29<04:45, 1.63it/s, loss=0.0276, lr=2.63e-06, step=9535] Training: 95%|█████████▌| 9536/10000 [2:03:30<04:32, 1.71it/s, loss=0.0276, lr=2.63e-06, step=9535] Training: 95%|█████████▌| 9536/10000 [2:03:30<04:32, 1.71it/s, loss=0.0016, lr=2.63e-06, step=9536] Training: 95%|█████████▌| 9537/10000 [2:03:31<04:26, 1.74it/s, loss=0.0016, lr=2.63e-06, step=9536] Training: 95%|█████████▌| 9537/10000 [2:03:31<04:26, 1.74it/s, loss=0.0028, lr=2.63e-06, step=9537] Training: 95%|█████████▌| 9538/10000 [2:03:31<04:23, 1.75it/s, loss=0.0028, lr=2.63e-06, step=9537] Training: 95%|█████████▌| 9538/10000 [2:03:31<04:23, 1.75it/s, loss=0.0053, lr=2.63e-06, step=9538] Training: 95%|█████████▌| 9539/10000 [2:03:32<05:08, 1.49it/s, loss=0.0053, lr=2.63e-06, step=9538] Training: 95%|█████████▌| 9539/10000 [2:03:32<05:08, 1.49it/s, loss=0.0125, lr=2.63e-06, step=9539]18:09:39.500 [I] step=9540 loss=0.0010 smoothed_loss=0.0105 lr=2.63e-06 grad_norm=0.4398 step_time=0.5464s data_time=0.0761s it/s=1.607 eta_to_10000=286.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0176 grad_action_out_proj=0.1120 grad_shared_expert=0.5587 (10775:train_pytorch.py:850) + Training: 95%|█████████▌| 9540/10000 [2:03:33<04:56, 1.55it/s, loss=0.0125, lr=2.63e-06, step=9539] Training: 95%|█████████▌| 9540/10000 [2:03:33<04:56, 1.55it/s, loss=0.0010, lr=2.63e-06, step=9540] Training: 95%|█████████▌| 9541/10000 [2:03:33<05:05, 1.50it/s, loss=0.0010, lr=2.63e-06, step=9540] Training: 95%|█████████▌| 9541/10000 [2:03:33<05:05, 1.50it/s, loss=0.0040, lr=2.63e-06, step=9541] Training: 95%|█████████▌| 9542/10000 [2:03:34<05:06, 1.49it/s, loss=0.0040, lr=2.63e-06, step=9541] Training: 95%|█████████▌| 9542/10000 [2:03:34<05:06, 1.49it/s, loss=0.0070, lr=2.63e-06, step=9542] Training: 95%|█████████▌| 9543/10000 [2:03:35<05:09, 1.48it/s, loss=0.0070, lr=2.63e-06, step=9542] Training: 95%|█████████▌| 9543/10000 [2:03:35<05:09, 1.48it/s, loss=0.0151, lr=2.63e-06, step=9543] Training: 95%|█████████▌| 9544/10000 [2:03:35<04:46, 1.59it/s, loss=0.0151, lr=2.63e-06, step=9543] Training: 95%|█████████▌| 9544/10000 [2:03:35<04:46, 1.59it/s, loss=0.0119, lr=2.63e-06, step=9544] Training: 95%|█████████▌| 9545/10000 [2:03:36<04:33, 1.66it/s, loss=0.0119, lr=2.63e-06, step=9544] Training: 95%|█████████▌| 9545/10000 [2:03:36<04:33, 1.66it/s, loss=0.0058, lr=2.63e-06, step=9545] Training: 95%|█████████▌| 9546/10000 [2:03:37<05:36, 1.35it/s, loss=0.0058, lr=2.63e-06, step=9545] Training: 95%|█████████▌| 9546/10000 [2:03:37<05:36, 1.35it/s, loss=0.0012, lr=2.63e-06, step=9546] Training: 95%|█████████▌| 9547/10000 [2:03:37<05:10, 1.46it/s, loss=0.0012, lr=2.63e-06, step=9546] Training: 95%|█████████▌| 9547/10000 [2:03:37<05:10, 1.46it/s, loss=0.0072, lr=2.63e-06, step=9547] Training: 95%|█████████▌| 9548/10000 [2:03:38<04:49, 1.56it/s, loss=0.0072, lr=2.63e-06, step=9547] Training: 95%|█████████▌| 9548/10000 [2:03:38<04:49, 1.56it/s, loss=0.0035, lr=2.63e-06, step=9548] Training: 95%|█████████▌| 9549/10000 [2:03:38<04:29, 1.67it/s, loss=0.0035, lr=2.63e-06, step=9548] Training: 95%|█████████▌| 9549/10000 [2:03:38<04:29, 1.67it/s, loss=0.0171, lr=2.63e-06, step=9549]18:09:46.051 [I] step=9550 loss=0.0036 smoothed_loss=0.0086 lr=2.63e-06 grad_norm=0.4349 step_time=0.5653s data_time=0.0898s it/s=1.527 eta_to_10000=294.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0087 grad_action_out_proj=0.0939 grad_shared_expert=0.3151 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9550/10000 [2:03:39<04:49, 1.55it/s, loss=0.0171, lr=2.63e-06, step=9549] Training: 96%|█████████▌| 9550/10000 [2:03:39<04:49, 1.55it/s, loss=0.0036, lr=2.62e-06, step=9550] Training: 96%|█████████▌| 9551/10000 [2:03:40<04:50, 1.54it/s, loss=0.0036, lr=2.62e-06, step=9550] Training: 96%|█████████▌| 9551/10000 [2:03:40<04:50, 1.54it/s, loss=0.0399, lr=2.62e-06, step=9551] Training: 96%|█████████▌| 9552/10000 [2:03:40<04:39, 1.60it/s, loss=0.0399, lr=2.62e-06, step=9551] Training: 96%|█████████▌| 9552/10000 [2:03:40<04:39, 1.60it/s, loss=0.0039, lr=2.62e-06, step=9552] Training: 96%|█████████▌| 9553/10000 [2:03:41<04:27, 1.67it/s, loss=0.0039, lr=2.62e-06, step=9552] Training: 96%|█████████▌| 9553/10000 [2:03:41<04:27, 1.67it/s, loss=0.0038, lr=2.62e-06, step=9553] Training: 96%|█████████▌| 9554/10000 [2:03:42<04:52, 1.53it/s, loss=0.0038, lr=2.62e-06, step=9553] Training: 96%|█████████▌| 9554/10000 [2:03:42<04:52, 1.53it/s, loss=0.0112, lr=2.62e-06, step=9554] Training: 96%|█████████▌| 9555/10000 [2:03:42<04:31, 1.64it/s, loss=0.0112, lr=2.62e-06, step=9554] Training: 96%|█████████▌| 9555/10000 [2:03:42<04:31, 1.64it/s, loss=0.0021, lr=2.62e-06, step=9555] Training: 96%|█████████▌| 9556/10000 [2:03:43<04:43, 1.57it/s, loss=0.0021, lr=2.62e-06, step=9555] Training: 96%|█████████▌| 9556/10000 [2:03:43<04:43, 1.57it/s, loss=0.0069, lr=2.62e-06, step=9556] Training: 96%|█████████▌| 9557/10000 [2:03:44<05:23, 1.37it/s, loss=0.0069, lr=2.62e-06, step=9556] Training: 96%|█████████▌| 9557/10000 [2:03:44<05:23, 1.37it/s, loss=0.0016, lr=2.62e-06, step=9557] Training: 96%|█████████▌| 9558/10000 [2:03:44<04:49, 1.53it/s, loss=0.0016, lr=2.62e-06, step=9557] Training: 96%|█████████▌| 9558/10000 [2:03:44<04:49, 1.53it/s, loss=0.0088, lr=2.62e-06, step=9558] Training: 96%|█████████▌| 9559/10000 [2:03:45<04:27, 1.65it/s, loss=0.0088, lr=2.62e-06, step=9558] Training: 96%|█████████▌| 9559/10000 [2:03:45<04:27, 1.65it/s, loss=0.0183, lr=2.62e-06, step=9559]18:09:52.294 [I] step=9560 loss=0.0082 smoothed_loss=0.0093 lr=2.62e-06 grad_norm=0.4303 step_time=0.5515s data_time=0.0727s it/s=1.602 eta_to_10000=274.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0284 grad_action_out_proj=0.2408 grad_shared_expert=0.6269 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9560/10000 [2:03:45<04:21, 1.68it/s, loss=0.0183, lr=2.62e-06, step=9559] Training: 96%|█████████▌| 9560/10000 [2:03:45<04:21, 1.68it/s, loss=0.0082, lr=2.62e-06, step=9560] Training: 96%|█████████▌| 9561/10000 [2:03:46<04:33, 1.60it/s, loss=0.0082, lr=2.62e-06, step=9560] Training: 96%|█████████▌| 9561/10000 [2:03:46<04:33, 1.60it/s, loss=0.0052, lr=2.62e-06, step=9561] Training: 96%|█████████▌| 9562/10000 [2:03:47<04:23, 1.66it/s, loss=0.0052, lr=2.62e-06, step=9561] Training: 96%|█████████▌| 9562/10000 [2:03:47<04:23, 1.66it/s, loss=0.0018, lr=2.62e-06, step=9562] Training: 96%|█████████▌| 9563/10000 [2:03:47<04:21, 1.67it/s, loss=0.0018, lr=2.62e-06, step=9562] Training: 96%|█████████▌| 9563/10000 [2:03:47<04:21, 1.67it/s, loss=0.0122, lr=2.62e-06, step=9563] Training: 96%|█████████▌| 9564/10000 [2:03:48<04:32, 1.60it/s, loss=0.0122, lr=2.62e-06, step=9563] Training: 96%|█████████▌| 9564/10000 [2:03:48<04:32, 1.60it/s, loss=0.0230, lr=2.62e-06, step=9564] Training: 96%|█████████▌| 9565/10000 [2:03:49<04:31, 1.60it/s, loss=0.0230, lr=2.62e-06, step=9564] Training: 96%|█████████▌| 9565/10000 [2:03:49<04:31, 1.60it/s, loss=0.0083, lr=2.62e-06, step=9565] Training: 96%|█████████▌| 9566/10000 [2:03:49<04:18, 1.68it/s, loss=0.0083, lr=2.62e-06, step=9565] Training: 96%|█████████▌| 9566/10000 [2:03:49<04:18, 1.68it/s, loss=0.0038, lr=2.62e-06, step=9566] Training: 96%|█████████▌| 9567/10000 [2:03:50<05:30, 1.31it/s, loss=0.0038, lr=2.62e-06, step=9566] Training: 96%|█████████▌| 9567/10000 [2:03:50<05:30, 1.31it/s, loss=0.0015, lr=2.62e-06, step=9567] Training: 96%|█████████▌| 9568/10000 [2:03:51<05:12, 1.38it/s, loss=0.0015, lr=2.62e-06, step=9567] Training: 96%|█████████▌| 9568/10000 [2:03:51<05:12, 1.38it/s, loss=0.0113, lr=2.62e-06, step=9568] Training: 96%|█████████▌| 9569/10000 [2:03:51<05:03, 1.42it/s, loss=0.0113, lr=2.62e-06, step=9568] Training: 96%|█████████▌| 9569/10000 [2:03:51<05:03, 1.42it/s, loss=0.0129, lr=2.61e-06, step=9569]18:09:59.154 [I] step=9570 loss=0.0028 smoothed_loss=0.0086 lr=2.62e-06 grad_norm=0.4051 step_time=0.5735s data_time=0.1125s it/s=1.458 eta_to_10000=294.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.1235 grad_shared_expert=0.4193 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9570/10000 [2:03:52<05:08, 1.40it/s, loss=0.0129, lr=2.61e-06, step=9569] Training: 96%|█████████▌| 9570/10000 [2:03:52<05:08, 1.40it/s, loss=0.0028, lr=2.61e-06, step=9570] Training: 96%|█████████▌| 9571/10000 [2:03:53<04:40, 1.53it/s, loss=0.0028, lr=2.61e-06, step=9570] Training: 96%|█████████▌| 9571/10000 [2:03:53<04:40, 1.53it/s, loss=0.0173, lr=2.61e-06, step=9571] Training: 96%|█████████▌| 9572/10000 [2:03:53<04:21, 1.64it/s, loss=0.0173, lr=2.61e-06, step=9571] Training: 96%|█████████▌| 9572/10000 [2:03:53<04:21, 1.64it/s, loss=0.0256, lr=2.61e-06, step=9572] Training: 96%|█████████▌| 9573/10000 [2:03:54<04:06, 1.73it/s, loss=0.0256, lr=2.61e-06, step=9572] Training: 96%|█████████▌| 9573/10000 [2:03:54<04:06, 1.73it/s, loss=0.0105, lr=2.61e-06, step=9573] Training: 96%|█████████▌| 9574/10000 [2:03:54<04:27, 1.59it/s, loss=0.0105, lr=2.61e-06, step=9573] Training: 96%|█████████▌| 9574/10000 [2:03:54<04:27, 1.59it/s, loss=0.0071, lr=2.61e-06, step=9574] Training: 96%|█████████▌| 9575/10000 [2:03:56<05:18, 1.33it/s, loss=0.0071, lr=2.61e-06, step=9574] Training: 96%|█████████▌| 9575/10000 [2:03:56<05:18, 1.33it/s, loss=0.0033, lr=2.61e-06, step=9575] Training: 96%|█████████▌| 9576/10000 [2:03:56<04:48, 1.47it/s, loss=0.0033, lr=2.61e-06, step=9575] Training: 96%|█████████▌| 9576/10000 [2:03:56<04:48, 1.47it/s, loss=0.0051, lr=2.61e-06, step=9576] Training: 96%|█████████▌| 9577/10000 [2:03:57<04:51, 1.45it/s, loss=0.0051, lr=2.61e-06, step=9576] Training: 96%|█████████▌| 9577/10000 [2:03:57<04:51, 1.45it/s, loss=0.0020, lr=2.61e-06, step=9577] Training: 96%|█████████▌| 9578/10000 [2:03:57<04:29, 1.57it/s, loss=0.0020, lr=2.61e-06, step=9577] Training: 96%|█████████▌| 9578/10000 [2:03:57<04:29, 1.57it/s, loss=0.0034, lr=2.61e-06, step=9578] Training: 96%|█████████▌| 9579/10000 [2:03:58<04:53, 1.44it/s, loss=0.0034, lr=2.61e-06, step=9578] Training: 96%|█████████▌| 9579/10000 [2:03:58<04:53, 1.44it/s, loss=0.0012, lr=2.61e-06, step=9579]18:10:05.574 [I] step=9580 loss=0.0023 smoothed_loss=0.0069 lr=2.61e-06 grad_norm=0.3893 step_time=0.5437s data_time=0.0983s it/s=1.558 eta_to_10000=269.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0314 grad_action_out_proj=0.1238 grad_shared_expert=0.4530 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9580/10000 [2:03:59<04:32, 1.54it/s, loss=0.0012, lr=2.61e-06, step=9579] Training: 96%|█████████▌| 9580/10000 [2:03:59<04:32, 1.54it/s, loss=0.0023, lr=2.61e-06, step=9580] Training: 96%|█████████▌| 9581/10000 [2:03:59<04:13, 1.65it/s, loss=0.0023, lr=2.61e-06, step=9580] Training: 96%|█████████▌| 9581/10000 [2:03:59<04:13, 1.65it/s, loss=0.0046, lr=2.61e-06, step=9581] Training: 96%|█████████▌| 9582/10000 [2:04:00<04:48, 1.45it/s, loss=0.0046, lr=2.61e-06, step=9581] Training: 96%|█████████▌| 9582/10000 [2:04:00<04:48, 1.45it/s, loss=0.0010, lr=2.61e-06, step=9582] Training: 96%|█████████▌| 9583/10000 [2:04:01<04:47, 1.45it/s, loss=0.0010, lr=2.61e-06, step=9582] Training: 96%|█████████▌| 9583/10000 [2:04:01<04:47, 1.45it/s, loss=0.0019, lr=2.61e-06, step=9583] Training: 96%|█████████▌| 9584/10000 [2:04:01<04:44, 1.46it/s, loss=0.0019, lr=2.61e-06, step=9583] Training: 96%|█████████▌| 9584/10000 [2:04:01<04:44, 1.46it/s, loss=0.0075, lr=2.61e-06, step=9584] Training: 96%|█████████▌| 9585/10000 [2:04:02<04:54, 1.41it/s, loss=0.0075, lr=2.61e-06, step=9584] Training: 96%|█████████▌| 9585/10000 [2:04:02<04:54, 1.41it/s, loss=0.0045, lr=2.61e-06, step=9585] Training: 96%|█████████▌| 9586/10000 [2:04:03<04:27, 1.55it/s, loss=0.0045, lr=2.61e-06, step=9585] Training: 96%|█████████▌| 9586/10000 [2:04:03<04:27, 1.55it/s, loss=0.0136, lr=2.61e-06, step=9586] Training: 96%|█████████▌| 9587/10000 [2:04:03<04:08, 1.66it/s, loss=0.0136, lr=2.61e-06, step=9586] Training: 96%|█████████▌| 9587/10000 [2:04:03<04:08, 1.66it/s, loss=0.0033, lr=2.61e-06, step=9587] Training: 96%|█████████▌| 9588/10000 [2:04:04<04:14, 1.62it/s, loss=0.0033, lr=2.61e-06, step=9587] Training: 96%|█████████▌| 9588/10000 [2:04:04<04:14, 1.62it/s, loss=0.0200, lr=2.60e-06, step=9588] Training: 96%|█████████▌| 9589/10000 [2:04:05<04:48, 1.42it/s, loss=0.0200, lr=2.60e-06, step=9588] Training: 96%|█████████▌| 9589/10000 [2:04:05<04:48, 1.42it/s, loss=0.0006, lr=2.60e-06, step=9589]18:10:12.172 [I] step=9590 loss=0.0056 smoothed_loss=0.0068 lr=2.61e-06 grad_norm=0.4466 step_time=0.5513s data_time=0.1084s it/s=1.516 eta_to_10000=270.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0192 grad_action_out_proj=0.1177 grad_shared_expert=0.5837 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9590/10000 [2:04:05<04:26, 1.54it/s, loss=0.0006, lr=2.60e-06, step=9589] Training: 96%|█████████▌| 9590/10000 [2:04:05<04:26, 1.54it/s, loss=0.0056, lr=2.60e-06, step=9590] Training: 96%|█████████▌| 9591/10000 [2:04:06<04:07, 1.65it/s, loss=0.0056, lr=2.60e-06, step=9590] Training: 96%|█████████▌| 9591/10000 [2:04:06<04:07, 1.65it/s, loss=0.0136, lr=2.60e-06, step=9591] Training: 96%|█████████▌| 9592/10000 [2:04:06<04:15, 1.60it/s, loss=0.0136, lr=2.60e-06, step=9591] Training: 96%|█████████▌| 9592/10000 [2:04:06<04:15, 1.60it/s, loss=0.0040, lr=2.60e-06, step=9592] Training: 96%|█████████▌| 9593/10000 [2:04:07<03:58, 1.71it/s, loss=0.0040, lr=2.60e-06, step=9592] Training: 96%|█████████▌| 9593/10000 [2:04:07<03:58, 1.71it/s, loss=0.0048, lr=2.60e-06, step=9593] Training: 96%|█████████▌| 9594/10000 [2:04:08<04:03, 1.67it/s, loss=0.0048, lr=2.60e-06, step=9593] Training: 96%|█████████▌| 9594/10000 [2:04:08<04:03, 1.67it/s, loss=0.0036, lr=2.60e-06, step=9594] Training: 96%|█████████▌| 9595/10000 [2:04:08<03:51, 1.75it/s, loss=0.0036, lr=2.60e-06, step=9594] Training: 96%|█████████▌| 9595/10000 [2:04:08<03:51, 1.75it/s, loss=0.0075, lr=2.60e-06, step=9595] Training: 96%|█████████▌| 9596/10000 [2:04:09<04:34, 1.47it/s, loss=0.0075, lr=2.60e-06, step=9595] Training: 96%|█████████▌| 9596/10000 [2:04:09<04:34, 1.47it/s, loss=0.0195, lr=2.60e-06, step=9596] Training: 96%|█████████▌| 9597/10000 [2:04:10<04:43, 1.42it/s, loss=0.0195, lr=2.60e-06, step=9596] Training: 96%|█████████▌| 9597/10000 [2:04:10<04:43, 1.42it/s, loss=0.0055, lr=2.60e-06, step=9597] Training: 96%|█████████▌| 9598/10000 [2:04:10<04:47, 1.40it/s, loss=0.0055, lr=2.60e-06, step=9597] Training: 96%|█████████▌| 9598/10000 [2:04:10<04:47, 1.40it/s, loss=0.0127, lr=2.60e-06, step=9598] Training: 96%|█████████▌| 9599/10000 [2:04:11<04:54, 1.36it/s, loss=0.0127, lr=2.60e-06, step=9598] Training: 96%|█████████▌| 9599/10000 [2:04:11<04:54, 1.36it/s, loss=0.0041, lr=2.60e-06, step=9599]18:10:18.977 [I] step=9600 loss=0.0088 smoothed_loss=0.0079 lr=2.60e-06 grad_norm=0.3636 step_time=0.5445s data_time=0.1361s it/s=1.470 eta_to_10000=272.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0092 grad_action_out_proj=0.0855 grad_shared_expert=0.3968 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9600/10000 [2:04:12<05:00, 1.33it/s, loss=0.0041, lr=2.60e-06, step=9599] Training: 96%|█████████▌| 9600/10000 [2:04:12<05:00, 1.33it/s, loss=0.0088, lr=2.60e-06, step=9600] Training: 96%|█████████▌| 9601/10000 [2:04:13<04:29, 1.48it/s, loss=0.0088, lr=2.60e-06, step=9600] Training: 96%|█████████▌| 9601/10000 [2:04:13<04:29, 1.48it/s, loss=0.0036, lr=2.60e-06, step=9601] Training: 96%|█████████▌| 9602/10000 [2:04:13<04:11, 1.58it/s, loss=0.0036, lr=2.60e-06, step=9601] Training: 96%|█████████▌| 9602/10000 [2:04:13<04:11, 1.58it/s, loss=0.0072, lr=2.60e-06, step=9602] Training: 96%|█████████▌| 9603/10000 [2:04:14<04:18, 1.54it/s, loss=0.0072, lr=2.60e-06, step=9602] Training: 96%|█████████▌| 9603/10000 [2:04:14<04:18, 1.54it/s, loss=0.0060, lr=2.60e-06, step=9603] Training: 96%|█████████▌| 9604/10000 [2:04:14<04:09, 1.58it/s, loss=0.0060, lr=2.60e-06, step=9603] Training: 96%|█████████▌| 9604/10000 [2:04:14<04:09, 1.58it/s, loss=0.0031, lr=2.60e-06, step=9604] Training: 96%|█████████▌| 9605/10000 [2:04:15<03:56, 1.67it/s, loss=0.0031, lr=2.60e-06, step=9604] Training: 96%|█████████▌| 9605/10000 [2:04:15<03:56, 1.67it/s, loss=0.0059, lr=2.60e-06, step=9605] Training: 96%|█████████▌| 9606/10000 [2:04:15<03:58, 1.65it/s, loss=0.0059, lr=2.60e-06, step=9605] Training: 96%|█████████▌| 9606/10000 [2:04:15<03:58, 1.65it/s, loss=0.0113, lr=2.60e-06, step=9606] Training: 96%|█████████▌| 9607/10000 [2:04:16<04:23, 1.49it/s, loss=0.0113, lr=2.60e-06, step=9606] Training: 96%|█████████▌| 9607/10000 [2:04:16<04:23, 1.49it/s, loss=0.0048, lr=2.60e-06, step=9607] Training: 96%|█████████▌| 9608/10000 [2:04:17<04:15, 1.53it/s, loss=0.0048, lr=2.60e-06, step=9607] Training: 96%|█████████▌| 9608/10000 [2:04:17<04:15, 1.53it/s, loss=0.0038, lr=2.59e-06, step=9608] Training: 96%|█████████▌| 9609/10000 [2:04:18<04:25, 1.47it/s, loss=0.0038, lr=2.59e-06, step=9608] Training: 96%|█████████▌| 9609/10000 [2:04:18<04:25, 1.47it/s, loss=0.0057, lr=2.59e-06, step=9609]18:10:25.338 [I] step=9610 loss=0.0068 smoothed_loss=0.0066 lr=2.60e-06 grad_norm=0.3476 step_time=0.5318s data_time=0.1043s it/s=1.572 eta_to_10000=248.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0058 grad_action_out_proj=0.0597 grad_shared_expert=0.2740 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9610/10000 [2:04:18<04:31, 1.44it/s, loss=0.0057, lr=2.59e-06, step=9609] Training: 96%|█████████▌| 9610/10000 [2:04:18<04:31, 1.44it/s, loss=0.0068, lr=2.59e-06, step=9610] Training: 96%|█████████▌| 9611/10000 [2:04:19<04:17, 1.51it/s, loss=0.0068, lr=2.59e-06, step=9610] Training: 96%|█████████▌| 9611/10000 [2:04:19<04:17, 1.51it/s, loss=0.0045, lr=2.59e-06, step=9611] Training: 96%|█████████▌| 9612/10000 [2:04:19<03:58, 1.63it/s, loss=0.0045, lr=2.59e-06, step=9611] Training: 96%|█████████▌| 9612/10000 [2:04:19<03:58, 1.63it/s, loss=0.0017, lr=2.59e-06, step=9612] Training: 96%|█████████▌| 9613/10000 [2:04:20<04:03, 1.59it/s, loss=0.0017, lr=2.59e-06, step=9612] Training: 96%|█████████▌| 9613/10000 [2:04:20<04:03, 1.59it/s, loss=0.0098, lr=2.59e-06, step=9613] Training: 96%|█████████▌| 9614/10000 [2:04:21<04:59, 1.29it/s, loss=0.0098, lr=2.59e-06, step=9613] Training: 96%|█████████▌| 9614/10000 [2:04:21<04:59, 1.29it/s, loss=0.0015, lr=2.59e-06, step=9614] Training: 96%|█████████▌| 9615/10000 [2:04:22<04:43, 1.36it/s, loss=0.0015, lr=2.59e-06, step=9614] Training: 96%|█████████▌| 9615/10000 [2:04:22<04:43, 1.36it/s, loss=0.0039, lr=2.59e-06, step=9615] Training: 96%|█████████▌| 9616/10000 [2:04:23<04:33, 1.40it/s, loss=0.0039, lr=2.59e-06, step=9615] Training: 96%|█████████▌| 9616/10000 [2:04:23<04:33, 1.40it/s, loss=0.0132, lr=2.59e-06, step=9616] Training: 96%|█████████▌| 9617/10000 [2:04:23<04:20, 1.47it/s, loss=0.0132, lr=2.59e-06, step=9616] Training: 96%|█████████▌| 9617/10000 [2:04:23<04:20, 1.47it/s, loss=0.0205, lr=2.59e-06, step=9617] Training: 96%|█████████▌| 9618/10000 [2:04:24<04:32, 1.40it/s, loss=0.0205, lr=2.59e-06, step=9617] Training: 96%|█████████▌| 9618/10000 [2:04:24<04:32, 1.40it/s, loss=0.0072, lr=2.59e-06, step=9618] Training: 96%|█████████▌| 9619/10000 [2:04:25<04:51, 1.31it/s, loss=0.0072, lr=2.59e-06, step=9618] Training: 96%|█████████▌| 9619/10000 [2:04:25<04:51, 1.31it/s, loss=0.0018, lr=2.59e-06, step=9619]18:10:32.408 [I] step=9620 loss=0.0081 smoothed_loss=0.0073 lr=2.59e-06 grad_norm=0.3827 step_time=0.5735s data_time=0.1334s it/s=1.415 eta_to_10000=268.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0155 grad_action_out_proj=0.0990 grad_shared_expert=0.4694 (10775:train_pytorch.py:850) + Training: 96%|█████████▌| 9620/10000 [2:04:25<04:34, 1.39it/s, loss=0.0018, lr=2.59e-06, step=9619] Training: 96%|█████████▌| 9620/10000 [2:04:25<04:34, 1.39it/s, loss=0.0081, lr=2.59e-06, step=9620] Training: 96%|█████████▌| 9621/10000 [2:04:26<04:10, 1.51it/s, loss=0.0081, lr=2.59e-06, step=9620] Training: 96%|█████████▌| 9621/10000 [2:04:26<04:10, 1.51it/s, loss=0.0026, lr=2.59e-06, step=9621] Training: 96%|█████████▌| 9622/10000 [2:04:27<04:11, 1.50it/s, loss=0.0026, lr=2.59e-06, step=9621] Training: 96%|█████████▌| 9622/10000 [2:04:27<04:11, 1.50it/s, loss=0.0018, lr=2.59e-06, step=9622] Training: 96%|█████████▌| 9623/10000 [2:04:27<03:54, 1.61it/s, loss=0.0018, lr=2.59e-06, step=9622] Training: 96%|█████████▌| 9623/10000 [2:04:27<03:54, 1.61it/s, loss=0.0017, lr=2.59e-06, step=9623] Training: 96%|█████████▌| 9624/10000 [2:04:28<04:01, 1.56it/s, loss=0.0017, lr=2.59e-06, step=9623] Training: 96%|█████████▌| 9624/10000 [2:04:28<04:01, 1.56it/s, loss=0.0077, lr=2.59e-06, step=9624] Training: 96%|█████████▋| 9625/10000 [2:04:29<04:26, 1.41it/s, loss=0.0077, lr=2.59e-06, step=9624] Training: 96%|█████████▋| 9625/10000 [2:04:29<04:26, 1.41it/s, loss=0.0047, lr=2.59e-06, step=9625] Training: 96%|█████████▋| 9626/10000 [2:04:29<04:06, 1.52it/s, loss=0.0047, lr=2.59e-06, step=9625] Training: 96%|█████████▋| 9626/10000 [2:04:29<04:06, 1.52it/s, loss=0.0130, lr=2.59e-06, step=9626] Training: 96%|█████████▋| 9627/10000 [2:04:30<03:49, 1.62it/s, loss=0.0130, lr=2.59e-06, step=9626] Training: 96%|█████████▋| 9627/10000 [2:04:30<03:49, 1.62it/s, loss=0.0060, lr=2.59e-06, step=9627] Training: 96%|█████████▋| 9628/10000 [2:04:31<03:58, 1.56it/s, loss=0.0060, lr=2.59e-06, step=9627] Training: 96%|█████████▋| 9628/10000 [2:04:31<03:58, 1.56it/s, loss=0.0079, lr=2.59e-06, step=9628] Training: 96%|█████████▋| 9629/10000 [2:04:31<04:32, 1.36it/s, loss=0.0079, lr=2.59e-06, step=9628] Training: 96%|█████████▋| 9629/10000 [2:04:31<04:32, 1.36it/s, loss=0.0191, lr=2.59e-06, step=9629]18:10:39.075 [I] step=9630 loss=0.0014 smoothed_loss=0.0073 lr=2.59e-06 grad_norm=0.3641 step_time=0.5474s data_time=0.1193s it/s=1.500 eta_to_10000=246.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0030 grad_action_out_proj=0.0404 grad_shared_expert=0.2206 (10775:train_pytorch.py:850) + Training: 96%|█████████▋| 9630/10000 [2:04:32<04:26, 1.39it/s, loss=0.0191, lr=2.59e-06, step=9629] Training: 96%|█████████▋| 9630/10000 [2:04:32<04:26, 1.39it/s, loss=0.0014, lr=2.58e-06, step=9630] Training: 96%|█████████▋| 9631/10000 [2:04:33<04:27, 1.38it/s, loss=0.0014, lr=2.58e-06, step=9630] Training: 96%|█████████▋| 9631/10000 [2:04:33<04:27, 1.38it/s, loss=0.0027, lr=2.58e-06, step=9631] Training: 96%|█████████▋| 9632/10000 [2:04:34<04:37, 1.32it/s, loss=0.0027, lr=2.58e-06, step=9631] Training: 96%|█████████▋| 9632/10000 [2:04:34<04:37, 1.32it/s, loss=0.0098, lr=2.58e-06, step=9632] Training: 96%|█████████▋| 9633/10000 [2:04:34<04:11, 1.46it/s, loss=0.0098, lr=2.58e-06, step=9632] Training: 96%|█████████▋| 9633/10000 [2:04:34<04:11, 1.46it/s, loss=0.0031, lr=2.58e-06, step=9633] Training: 96%|█████████▋| 9634/10000 [2:04:35<03:53, 1.57it/s, loss=0.0031, lr=2.58e-06, step=9633] Training: 96%|█████████▋| 9634/10000 [2:04:35<03:53, 1.57it/s, loss=0.0034, lr=2.58e-06, step=9634] Training: 96%|█████████▋| 9635/10000 [2:04:35<03:37, 1.68it/s, loss=0.0034, lr=2.58e-06, step=9634] Training: 96%|█████████▋| 9635/10000 [2:04:35<03:37, 1.68it/s, loss=0.0269, lr=2.58e-06, step=9635] Training: 96%|█████████▋| 9636/10000 [2:04:36<03:47, 1.60it/s, loss=0.0269, lr=2.58e-06, step=9635] Training: 96%|█████████▋| 9636/10000 [2:04:36<03:47, 1.60it/s, loss=0.0200, lr=2.58e-06, step=9636] Training: 96%|█████████▋| 9637/10000 [2:04:37<03:43, 1.62it/s, loss=0.0200, lr=2.58e-06, step=9636] Training: 96%|█████████▋| 9637/10000 [2:04:37<03:43, 1.62it/s, loss=0.0039, lr=2.58e-06, step=9637] Training: 96%|█████████▋| 9638/10000 [2:04:37<03:35, 1.68it/s, loss=0.0039, lr=2.58e-06, step=9637] Training: 96%|█████████▋| 9638/10000 [2:04:37<03:35, 1.68it/s, loss=0.0232, lr=2.58e-06, step=9638] Training: 96%|█████████▋| 9639/10000 [2:04:38<04:02, 1.49it/s, loss=0.0232, lr=2.58e-06, step=9638] Training: 96%|█████████▋| 9639/10000 [2:04:38<04:02, 1.49it/s, loss=0.0052, lr=2.58e-06, step=9639]18:10:45.402 [I] step=9640 loss=0.0053 smoothed_loss=0.0094 lr=2.58e-06 grad_norm=0.4816 step_time=0.5446s data_time=0.0881s it/s=1.581 eta_to_10000=227.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0226 grad_action_out_proj=0.1272 grad_shared_expert=0.4744 (10775:train_pytorch.py:850) + Training: 96%|█████████▋| 9640/10000 [2:04:38<03:47, 1.59it/s, loss=0.0052, lr=2.58e-06, step=9639] Training: 96%|█████████▋| 9640/10000 [2:04:38<03:47, 1.59it/s, loss=0.0053, lr=2.58e-06, step=9640] Training: 96%|█████████▋| 9641/10000 [2:04:39<03:31, 1.70it/s, loss=0.0053, lr=2.58e-06, step=9640] Training: 96%|█████████▋| 9641/10000 [2:04:39<03:31, 1.70it/s, loss=0.0026, lr=2.58e-06, step=9641] Training: 96%|█████████▋| 9642/10000 [2:04:39<03:23, 1.76it/s, loss=0.0026, lr=2.58e-06, step=9641] Training: 96%|█████████▋| 9642/10000 [2:04:39<03:23, 1.76it/s, loss=0.0048, lr=2.58e-06, step=9642] Training: 96%|█████████▋| 9643/10000 [2:04:40<03:55, 1.51it/s, loss=0.0048, lr=2.58e-06, step=9642] Training: 96%|█████████▋| 9643/10000 [2:04:40<03:55, 1.51it/s, loss=0.0035, lr=2.58e-06, step=9643] Training: 96%|█████████▋| 9644/10000 [2:04:41<04:06, 1.44it/s, loss=0.0035, lr=2.58e-06, step=9643] Training: 96%|█████████▋| 9644/10000 [2:04:41<04:06, 1.44it/s, loss=0.0041, lr=2.58e-06, step=9644] Training: 96%|█████████▋| 9645/10000 [2:04:42<03:46, 1.57it/s, loss=0.0041, lr=2.58e-06, step=9644] Training: 96%|█████████▋| 9645/10000 [2:04:42<03:46, 1.57it/s, loss=0.0036, lr=2.58e-06, step=9645] Training: 96%|█████████▋| 9646/10000 [2:04:42<03:58, 1.49it/s, loss=0.0036, lr=2.58e-06, step=9645] Training: 96%|█████████▋| 9646/10000 [2:04:42<03:58, 1.49it/s, loss=0.0086, lr=2.58e-06, step=9646] Training: 96%|█████████▋| 9647/10000 [2:04:43<03:39, 1.61it/s, loss=0.0086, lr=2.58e-06, step=9646] Training: 96%|█████████▋| 9647/10000 [2:04:43<03:39, 1.61it/s, loss=0.0154, lr=2.58e-06, step=9647] Training: 96%|█████████▋| 9648/10000 [2:04:44<03:51, 1.52it/s, loss=0.0154, lr=2.58e-06, step=9647] Training: 96%|█████████▋| 9648/10000 [2:04:44<03:51, 1.52it/s, loss=0.0120, lr=2.58e-06, step=9648] Training: 96%|█████████▋| 9649/10000 [2:04:44<03:37, 1.62it/s, loss=0.0120, lr=2.58e-06, step=9648] Training: 96%|█████████▋| 9649/10000 [2:04:44<03:37, 1.62it/s, loss=0.0047, lr=2.58e-06, step=9649]18:10:51.842 [I] step=9650 loss=0.0054 smoothed_loss=0.0078 lr=2.58e-06 grad_norm=0.4123 step_time=0.5401s data_time=0.1040s it/s=1.553 eta_to_10000=225.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0687 grad_shared_expert=0.2560 (10775:train_pytorch.py:850) + Training: 96%|█████████▋| 9650/10000 [2:04:45<03:50, 1.52it/s, loss=0.0047, lr=2.58e-06, step=9649] Training: 96%|█████████▋| 9650/10000 [2:04:45<03:50, 1.52it/s, loss=0.0054, lr=2.58e-06, step=9650] Training: 97%|█████████▋| 9651/10000 [2:04:46<03:54, 1.49it/s, loss=0.0054, lr=2.58e-06, step=9650] Training: 97%|█████████▋| 9651/10000 [2:04:46<03:54, 1.49it/s, loss=0.0069, lr=2.58e-06, step=9651] Training: 97%|█████████▋| 9652/10000 [2:04:46<04:04, 1.42it/s, loss=0.0069, lr=2.58e-06, step=9651] Training: 97%|█████████▋| 9652/10000 [2:04:46<04:04, 1.42it/s, loss=0.0105, lr=2.57e-06, step=9652] Training: 97%|█████████▋| 9653/10000 [2:04:47<04:18, 1.34it/s, loss=0.0105, lr=2.57e-06, step=9652] Training: 97%|█████████▋| 9653/10000 [2:04:47<04:18, 1.34it/s, loss=0.0127, lr=2.57e-06, step=9653] Training: 97%|█████████▋| 9654/10000 [2:04:48<04:21, 1.32it/s, loss=0.0127, lr=2.57e-06, step=9653] Training: 97%|█████████▋| 9654/10000 [2:04:48<04:21, 1.32it/s, loss=0.0049, lr=2.57e-06, step=9654] Training: 97%|█████████▋| 9655/10000 [2:04:49<04:00, 1.43it/s, loss=0.0049, lr=2.57e-06, step=9654] Training: 97%|█████████▋| 9655/10000 [2:04:49<04:00, 1.43it/s, loss=0.0053, lr=2.57e-06, step=9655] Training: 97%|█████████▋| 9656/10000 [2:04:49<03:45, 1.52it/s, loss=0.0053, lr=2.57e-06, step=9655] Training: 97%|█████████▋| 9656/10000 [2:04:49<03:45, 1.52it/s, loss=0.0043, lr=2.57e-06, step=9656] Training: 97%|█████████▋| 9657/10000 [2:04:50<04:09, 1.37it/s, loss=0.0043, lr=2.57e-06, step=9656] Training: 97%|█████████▋| 9657/10000 [2:04:50<04:09, 1.37it/s, loss=0.0104, lr=2.57e-06, step=9657] Training: 97%|█████████▋| 9658/10000 [2:04:51<04:00, 1.42it/s, loss=0.0104, lr=2.57e-06, step=9657] Training: 97%|█████████▋| 9658/10000 [2:04:51<04:00, 1.42it/s, loss=0.0046, lr=2.57e-06, step=9658] Training: 97%|█████████▋| 9659/10000 [2:04:51<03:38, 1.56it/s, loss=0.0046, lr=2.57e-06, step=9658] Training: 97%|█████████▋| 9659/10000 [2:04:51<03:38, 1.56it/s, loss=0.0054, lr=2.57e-06, step=9659]18:10:58.647 [I] step=9660 loss=0.0179 smoothed_loss=0.0083 lr=2.57e-06 grad_norm=0.4544 step_time=0.5704s data_time=0.1100s it/s=1.470 eta_to_10000=231.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0322 grad_action_out_proj=0.1862 grad_shared_expert=0.6386 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9660/10000 [2:04:52<03:28, 1.63it/s, loss=0.0054, lr=2.57e-06, step=9659] Training: 97%|█████████▋| 9660/10000 [2:04:52<03:28, 1.63it/s, loss=0.0179, lr=2.57e-06, step=9660] Training: 97%|█████████▋| 9661/10000 [2:04:53<03:47, 1.49it/s, loss=0.0179, lr=2.57e-06, step=9660] Training: 97%|█████████▋| 9661/10000 [2:04:53<03:47, 1.49it/s, loss=0.0032, lr=2.57e-06, step=9661] Training: 97%|█████████▋| 9662/10000 [2:04:53<03:33, 1.58it/s, loss=0.0032, lr=2.57e-06, step=9661] Training: 97%|█████████▋| 9662/10000 [2:04:53<03:33, 1.58it/s, loss=0.0014, lr=2.57e-06, step=9662] Training: 97%|█████████▋| 9663/10000 [2:04:54<03:18, 1.70it/s, loss=0.0014, lr=2.57e-06, step=9662] Training: 97%|█████████▋| 9663/10000 [2:04:54<03:18, 1.70it/s, loss=0.0087, lr=2.57e-06, step=9663] Training: 97%|█████████▋| 9664/10000 [2:04:54<03:37, 1.54it/s, loss=0.0087, lr=2.57e-06, step=9663] Training: 97%|█████████▋| 9664/10000 [2:04:54<03:37, 1.54it/s, loss=0.0128, lr=2.57e-06, step=9664] Training: 97%|█████████▋| 9665/10000 [2:04:55<03:47, 1.47it/s, loss=0.0128, lr=2.57e-06, step=9664] Training: 97%|█████████▋| 9665/10000 [2:04:55<03:47, 1.47it/s, loss=0.0044, lr=2.57e-06, step=9665] Training: 97%|█████████▋| 9666/10000 [2:04:56<03:40, 1.52it/s, loss=0.0044, lr=2.57e-06, step=9665] Training: 97%|█████████▋| 9666/10000 [2:04:56<03:40, 1.52it/s, loss=0.0072, lr=2.57e-06, step=9666] Training: 97%|█████████▋| 9667/10000 [2:04:56<03:34, 1.55it/s, loss=0.0072, lr=2.57e-06, step=9666] Training: 97%|█████████▋| 9667/10000 [2:04:56<03:34, 1.55it/s, loss=0.0859, lr=2.57e-06, step=9667] Training: 97%|█████████▋| 9668/10000 [2:04:57<04:18, 1.29it/s, loss=0.0859, lr=2.57e-06, step=9667] Training: 97%|█████████▋| 9668/10000 [2:04:57<04:18, 1.29it/s, loss=0.0300, lr=2.57e-06, step=9668] Training: 97%|█████████▋| 9669/10000 [2:04:58<04:17, 1.29it/s, loss=0.0300, lr=2.57e-06, step=9668] Training: 97%|█████████▋| 9669/10000 [2:04:58<04:17, 1.29it/s, loss=0.0048, lr=2.57e-06, step=9669]18:11:05.705 [I] step=9670 loss=0.0109 smoothed_loss=0.0151 lr=2.57e-06 grad_norm=0.4994 step_time=0.5935s data_time=0.1123s it/s=1.417 eta_to_10000=232.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0105 grad_action_out_proj=0.0779 grad_shared_expert=0.3020 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9670/10000 [2:04:59<03:58, 1.38it/s, loss=0.0048, lr=2.57e-06, step=9669] Training: 97%|█████████▋| 9670/10000 [2:04:59<03:58, 1.38it/s, loss=0.0109, lr=2.57e-06, step=9670] Training: 97%|█████████▋| 9671/10000 [2:04:59<03:51, 1.42it/s, loss=0.0109, lr=2.57e-06, step=9670] Training: 97%|█████████▋| 9671/10000 [2:04:59<03:51, 1.42it/s, loss=0.0024, lr=2.57e-06, step=9671] Training: 97%|█████████▋| 9672/10000 [2:05:00<03:47, 1.44it/s, loss=0.0024, lr=2.57e-06, step=9671] Training: 97%|█████████▋| 9672/10000 [2:05:00<03:47, 1.44it/s, loss=0.0022, lr=2.57e-06, step=9672] Training: 97%|█████████▋| 9673/10000 [2:05:01<03:29, 1.56it/s, loss=0.0022, lr=2.57e-06, step=9672] Training: 97%|█████████▋| 9673/10000 [2:05:01<03:29, 1.56it/s, loss=0.0080, lr=2.57e-06, step=9673] Training: 97%|█████████▋| 9674/10000 [2:05:01<03:15, 1.67it/s, loss=0.0080, lr=2.57e-06, step=9673] Training: 97%|█████████▋| 9674/10000 [2:05:01<03:15, 1.67it/s, loss=0.0075, lr=2.57e-06, step=9674] Training: 97%|█████████▋| 9675/10000 [2:05:02<03:36, 1.50it/s, loss=0.0075, lr=2.57e-06, step=9674] Training: 97%|█████████▋| 9675/10000 [2:05:02<03:36, 1.50it/s, loss=0.0031, lr=2.57e-06, step=9675] Training: 97%|█████████▋| 9676/10000 [2:05:02<03:18, 1.63it/s, loss=0.0031, lr=2.57e-06, step=9675] Training: 97%|█████████▋| 9676/10000 [2:05:02<03:18, 1.63it/s, loss=0.0055, lr=2.56e-06, step=9676] Training: 97%|█████████▋| 9677/10000 [2:05:03<03:08, 1.72it/s, loss=0.0055, lr=2.56e-06, step=9676] Training: 97%|█████████▋| 9677/10000 [2:05:03<03:08, 1.72it/s, loss=0.0064, lr=2.56e-06, step=9677] Training: 97%|█████████▋| 9678/10000 [2:05:03<03:00, 1.79it/s, loss=0.0064, lr=2.56e-06, step=9677] Training: 97%|█████████▋| 9678/10000 [2:05:03<03:00, 1.79it/s, loss=0.0124, lr=2.56e-06, step=9678] Training: 97%|█████████▋| 9679/10000 [2:05:04<02:54, 1.84it/s, loss=0.0124, lr=2.56e-06, step=9678] Training: 97%|█████████▋| 9679/10000 [2:05:04<02:54, 1.84it/s, loss=0.0015, lr=2.56e-06, step=9679]18:11:11.696 [I] step=9680 loss=0.0014 smoothed_loss=0.0085 lr=2.57e-06 grad_norm=0.3112 step_time=0.5183s data_time=0.0808s it/s=1.670 eta_to_10000=191.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0049 grad_action_out_proj=0.0693 grad_shared_expert=0.1947 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9680/10000 [2:05:05<03:19, 1.61it/s, loss=0.0015, lr=2.56e-06, step=9679] Training: 97%|█████████▋| 9680/10000 [2:05:05<03:19, 1.61it/s, loss=0.0014, lr=2.56e-06, step=9680] Training: 97%|█████████▋| 9681/10000 [2:05:05<03:05, 1.72it/s, loss=0.0014, lr=2.56e-06, step=9680] Training: 97%|█████████▋| 9681/10000 [2:05:05<03:05, 1.72it/s, loss=0.0217, lr=2.56e-06, step=9681] Training: 97%|█████████▋| 9682/10000 [2:05:06<03:21, 1.58it/s, loss=0.0217, lr=2.56e-06, step=9681] Training: 97%|█████████▋| 9682/10000 [2:05:06<03:21, 1.58it/s, loss=0.0109, lr=2.56e-06, step=9682] Training: 97%|█████████▋| 9683/10000 [2:05:07<03:08, 1.68it/s, loss=0.0109, lr=2.56e-06, step=9682] Training: 97%|█████████▋| 9683/10000 [2:05:07<03:08, 1.68it/s, loss=0.0514, lr=2.56e-06, step=9683] Training: 97%|█████████▋| 9684/10000 [2:05:07<03:00, 1.75it/s, loss=0.0514, lr=2.56e-06, step=9683] Training: 97%|█████████▋| 9684/10000 [2:05:07<03:00, 1.75it/s, loss=0.0069, lr=2.56e-06, step=9684] Training: 97%|█████████▋| 9685/10000 [2:05:08<02:58, 1.76it/s, loss=0.0069, lr=2.56e-06, step=9684] Training: 97%|█████████▋| 9685/10000 [2:05:08<02:58, 1.76it/s, loss=0.0154, lr=2.56e-06, step=9685] Training: 97%|█████████▋| 9686/10000 [2:05:08<03:16, 1.60it/s, loss=0.0154, lr=2.56e-06, step=9685] Training: 97%|█████████▋| 9686/10000 [2:05:08<03:16, 1.60it/s, loss=0.0040, lr=2.56e-06, step=9686] Training: 97%|█████████▋| 9687/10000 [2:05:09<03:19, 1.57it/s, loss=0.0040, lr=2.56e-06, step=9686] Training: 97%|█████████▋| 9687/10000 [2:05:09<03:19, 1.57it/s, loss=0.0403, lr=2.56e-06, step=9687] Training: 97%|█████████▋| 9688/10000 [2:05:10<03:18, 1.58it/s, loss=0.0403, lr=2.56e-06, step=9687] Training: 97%|█████████▋| 9688/10000 [2:05:10<03:18, 1.58it/s, loss=0.0071, lr=2.56e-06, step=9688] Training: 97%|█████████▋| 9689/10000 [2:05:10<03:30, 1.48it/s, loss=0.0071, lr=2.56e-06, step=9688] Training: 97%|█████████▋| 9689/10000 [2:05:10<03:30, 1.48it/s, loss=0.0775, lr=2.56e-06, step=9689]18:11:18.111 [I] step=9690 loss=0.0036 smoothed_loss=0.0191 lr=2.56e-06 grad_norm=0.4620 step_time=0.5468s data_time=0.0947s it/s=1.559 eta_to_10000=198.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0062 grad_action_out_proj=0.0685 grad_shared_expert=0.4864 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9690/10000 [2:05:11<03:38, 1.42it/s, loss=0.0775, lr=2.56e-06, step=9689] Training: 97%|█████████▋| 9690/10000 [2:05:11<03:38, 1.42it/s, loss=0.0036, lr=2.56e-06, step=9690] Training: 97%|█████████▋| 9691/10000 [2:05:12<03:52, 1.33it/s, loss=0.0036, lr=2.56e-06, step=9690] Training: 97%|█████████▋| 9691/10000 [2:05:12<03:52, 1.33it/s, loss=0.0066, lr=2.56e-06, step=9691] Training: 97%|█████████▋| 9692/10000 [2:05:13<04:01, 1.27it/s, loss=0.0066, lr=2.56e-06, step=9691] Training: 97%|█████████▋| 9692/10000 [2:05:13<04:01, 1.27it/s, loss=0.0026, lr=2.56e-06, step=9692] Training: 97%|█████████▋| 9693/10000 [2:05:14<04:12, 1.21it/s, loss=0.0026, lr=2.56e-06, step=9692] Training: 97%|█████████▋| 9693/10000 [2:05:14<04:12, 1.21it/s, loss=0.0261, lr=2.56e-06, step=9693] Training: 97%|█████████▋| 9694/10000 [2:05:14<03:42, 1.38it/s, loss=0.0261, lr=2.56e-06, step=9693] Training: 97%|█████████▋| 9694/10000 [2:05:14<03:42, 1.38it/s, loss=0.0139, lr=2.56e-06, step=9694] Training: 97%|█████████▋| 9695/10000 [2:05:15<03:48, 1.34it/s, loss=0.0139, lr=2.56e-06, step=9694] Training: 97%|█████████▋| 9695/10000 [2:05:15<03:48, 1.34it/s, loss=0.0006, lr=2.56e-06, step=9695] Training: 97%|█████████▋| 9696/10000 [2:05:16<04:06, 1.24it/s, loss=0.0006, lr=2.56e-06, step=9695] Training: 97%|█████████▋| 9696/10000 [2:05:16<04:06, 1.24it/s, loss=0.0027, lr=2.56e-06, step=9696] Training: 97%|█████████▋| 9697/10000 [2:05:17<04:09, 1.22it/s, loss=0.0027, lr=2.56e-06, step=9696] Training: 97%|█████████▋| 9697/10000 [2:05:17<04:09, 1.22it/s, loss=0.0022, lr=2.56e-06, step=9697] Training: 97%|█████████▋| 9698/10000 [2:05:18<04:15, 1.18it/s, loss=0.0022, lr=2.56e-06, step=9697] Training: 97%|█████████▋| 9698/10000 [2:05:18<04:15, 1.18it/s, loss=0.0065, lr=2.56e-06, step=9698] Training: 97%|█████████▋| 9699/10000 [2:05:19<04:09, 1.21it/s, loss=0.0065, lr=2.56e-06, step=9698] Training: 97%|█████████▋| 9699/10000 [2:05:19<04:09, 1.21it/s, loss=0.0021, lr=2.56e-06, step=9699]18:11:26.341 [I] step=9700 loss=0.0035 smoothed_loss=0.0105 lr=2.56e-06 grad_norm=0.3947 step_time=0.6285s data_time=0.1945s it/s=1.215 eta_to_10000=246.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0074 grad_action_out_proj=0.0845 grad_shared_expert=0.3729 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9700/10000 [2:05:19<04:05, 1.22it/s, loss=0.0021, lr=2.56e-06, step=9699] Training: 97%|█████████▋| 9700/10000 [2:05:19<04:05, 1.22it/s, loss=0.0035, lr=2.56e-06, step=9700] Training: 97%|█████████▋| 9701/10000 [2:05:20<03:46, 1.32it/s, loss=0.0035, lr=2.56e-06, step=9700] Training: 97%|█████████▋| 9701/10000 [2:05:20<03:46, 1.32it/s, loss=0.0079, lr=2.56e-06, step=9701] Training: 97%|█████████▋| 9702/10000 [2:05:21<03:46, 1.32it/s, loss=0.0079, lr=2.56e-06, step=9701] Training: 97%|█████████▋| 9702/10000 [2:05:21<03:46, 1.32it/s, loss=0.0038, lr=2.55e-06, step=9702] Training: 97%|█████████▋| 9703/10000 [2:05:21<03:31, 1.40it/s, loss=0.0038, lr=2.55e-06, step=9702] Training: 97%|█████████▋| 9703/10000 [2:05:21<03:31, 1.40it/s, loss=0.0131, lr=2.55e-06, step=9703] Training: 97%|█████████▋| 9704/10000 [2:05:22<03:38, 1.36it/s, loss=0.0131, lr=2.55e-06, step=9703] Training: 97%|█████████▋| 9704/10000 [2:05:22<03:38, 1.36it/s, loss=0.0130, lr=2.55e-06, step=9704] Training: 97%|█████████▋| 9705/10000 [2:05:23<03:42, 1.32it/s, loss=0.0130, lr=2.55e-06, step=9704] Training: 97%|█████████▋| 9705/10000 [2:05:23<03:42, 1.32it/s, loss=0.0082, lr=2.55e-06, step=9705] Training: 97%|█████████▋| 9706/10000 [2:05:23<03:21, 1.46it/s, loss=0.0082, lr=2.55e-06, step=9705] Training: 97%|█████████▋| 9706/10000 [2:05:23<03:21, 1.46it/s, loss=0.0300, lr=2.55e-06, step=9706] Training: 97%|█████████▋| 9707/10000 [2:05:24<03:07, 1.56it/s, loss=0.0300, lr=2.55e-06, step=9706] Training: 97%|█████████▋| 9707/10000 [2:05:24<03:07, 1.56it/s, loss=0.0041, lr=2.55e-06, step=9707] Training: 97%|█████████▋| 9708/10000 [2:05:25<03:06, 1.57it/s, loss=0.0041, lr=2.55e-06, step=9707] Training: 97%|█████████▋| 9708/10000 [2:05:25<03:06, 1.57it/s, loss=0.0167, lr=2.55e-06, step=9708] Training: 97%|█████████▋| 9709/10000 [2:05:26<03:27, 1.40it/s, loss=0.0167, lr=2.55e-06, step=9708] Training: 97%|█████████▋| 9709/10000 [2:05:26<03:27, 1.40it/s, loss=0.0036, lr=2.55e-06, step=9709]18:11:33.058 [I] step=9710 loss=0.0047 smoothed_loss=0.0103 lr=2.55e-06 grad_norm=0.4748 step_time=0.5552s data_time=0.1165s it/s=1.489 eta_to_10000=194.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0153 grad_action_out_proj=0.1502 grad_shared_expert=0.3428 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9710/10000 [2:05:26<03:14, 1.49it/s, loss=0.0036, lr=2.55e-06, step=9709] Training: 97%|█████████▋| 9710/10000 [2:05:26<03:14, 1.49it/s, loss=0.0047, lr=2.55e-06, step=9710] Training: 97%|█████████▋| 9711/10000 [2:05:27<03:39, 1.32it/s, loss=0.0047, lr=2.55e-06, step=9710] Training: 97%|█████████▋| 9711/10000 [2:05:27<03:39, 1.32it/s, loss=0.0101, lr=2.55e-06, step=9711] Training: 97%|█████████▋| 9712/10000 [2:05:28<03:32, 1.36it/s, loss=0.0101, lr=2.55e-06, step=9711] Training: 97%|█████████▋| 9712/10000 [2:05:28<03:32, 1.36it/s, loss=0.0125, lr=2.55e-06, step=9712] Training: 97%|█████████▋| 9713/10000 [2:05:28<03:25, 1.40it/s, loss=0.0125, lr=2.55e-06, step=9712] Training: 97%|█████████▋| 9713/10000 [2:05:28<03:25, 1.40it/s, loss=0.0039, lr=2.55e-06, step=9713] Training: 97%|█████████▋| 9714/10000 [2:05:29<03:10, 1.50it/s, loss=0.0039, lr=2.55e-06, step=9713] Training: 97%|█████████▋| 9714/10000 [2:05:29<03:10, 1.50it/s, loss=0.0043, lr=2.55e-06, step=9714] Training: 97%|█████████▋| 9715/10000 [2:05:30<03:16, 1.45it/s, loss=0.0043, lr=2.55e-06, step=9714] Training: 97%|█████████▋| 9715/10000 [2:05:30<03:16, 1.45it/s, loss=0.0056, lr=2.55e-06, step=9715] Training: 97%|█████████▋| 9716/10000 [2:05:31<03:37, 1.30it/s, loss=0.0056, lr=2.55e-06, step=9715] Training: 97%|█████████▋| 9716/10000 [2:05:31<03:37, 1.30it/s, loss=0.0038, lr=2.55e-06, step=9716] Training: 97%|█████████▋| 9717/10000 [2:05:31<03:32, 1.33it/s, loss=0.0038, lr=2.55e-06, step=9716] Training: 97%|█████████▋| 9717/10000 [2:05:31<03:32, 1.33it/s, loss=0.0041, lr=2.55e-06, step=9717] Training: 97%|█████████▋| 9718/10000 [2:05:32<03:43, 1.26it/s, loss=0.0041, lr=2.55e-06, step=9717] Training: 97%|█████████▋| 9718/10000 [2:05:32<03:43, 1.26it/s, loss=0.0030, lr=2.55e-06, step=9718] Training: 97%|█████████▋| 9719/10000 [2:05:33<03:20, 1.40it/s, loss=0.0030, lr=2.55e-06, step=9718] Training: 97%|█████████▋| 9719/10000 [2:05:33<03:20, 1.40it/s, loss=0.0081, lr=2.55e-06, step=9719]18:11:40.314 [I] step=9720 loss=0.0023 smoothed_loss=0.0070 lr=2.55e-06 grad_norm=0.3785 step_time=0.5964s data_time=0.1292s it/s=1.378 eta_to_10000=203.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0476 grad_action_out_proj=0.1391 grad_shared_expert=0.5027 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9720/10000 [2:05:33<03:07, 1.50it/s, loss=0.0081, lr=2.55e-06, step=9719] Training: 97%|█████████▋| 9720/10000 [2:05:33<03:07, 1.50it/s, loss=0.0023, lr=2.55e-06, step=9720] Training: 97%|█████████▋| 9721/10000 [2:05:34<03:04, 1.51it/s, loss=0.0023, lr=2.55e-06, step=9720] Training: 97%|█████████▋| 9721/10000 [2:05:34<03:04, 1.51it/s, loss=0.0103, lr=2.55e-06, step=9721] Training: 97%|█████████▋| 9722/10000 [2:05:35<03:09, 1.47it/s, loss=0.0103, lr=2.55e-06, step=9721] Training: 97%|█████████▋| 9722/10000 [2:05:35<03:09, 1.47it/s, loss=0.0076, lr=2.55e-06, step=9722] Training: 97%|█████████▋| 9723/10000 [2:05:35<03:07, 1.47it/s, loss=0.0076, lr=2.55e-06, step=9722] Training: 97%|█████████▋| 9723/10000 [2:05:35<03:07, 1.47it/s, loss=0.0047, lr=2.55e-06, step=9723] Training: 97%|█████████▋| 9724/10000 [2:05:36<03:08, 1.46it/s, loss=0.0047, lr=2.55e-06, step=9723] Training: 97%|█████████▋| 9724/10000 [2:05:36<03:08, 1.46it/s, loss=0.0281, lr=2.55e-06, step=9724] Training: 97%|█████████▋| 9725/10000 [2:05:37<03:19, 1.38it/s, loss=0.0281, lr=2.55e-06, step=9724] Training: 97%|█████████▋| 9725/10000 [2:05:37<03:19, 1.38it/s, loss=0.0101, lr=2.55e-06, step=9725] Training: 97%|█████████▋| 9726/10000 [2:05:38<03:31, 1.29it/s, loss=0.0101, lr=2.55e-06, step=9725] Training: 97%|█████████▋| 9726/10000 [2:05:38<03:31, 1.29it/s, loss=0.0070, lr=2.55e-06, step=9726] Training: 97%|█████████▋| 9727/10000 [2:05:38<03:22, 1.35it/s, loss=0.0070, lr=2.55e-06, step=9726] Training: 97%|█████████▋| 9727/10000 [2:05:38<03:22, 1.35it/s, loss=0.0020, lr=2.55e-06, step=9727] Training: 97%|█████████▋| 9728/10000 [2:05:39<03:01, 1.50it/s, loss=0.0020, lr=2.55e-06, step=9727] Training: 97%|█████████▋| 9728/10000 [2:05:39<03:01, 1.50it/s, loss=0.0077, lr=2.55e-06, step=9728] Training: 97%|█████████▋| 9729/10000 [2:05:40<03:10, 1.42it/s, loss=0.0077, lr=2.55e-06, step=9728] Training: 97%|█████████▋| 9729/10000 [2:05:40<03:10, 1.42it/s, loss=0.0105, lr=2.55e-06, step=9729]18:11:47.404 [I] step=9730 loss=0.0006 smoothed_loss=0.0077 lr=2.55e-06 grad_norm=0.4150 step_time=0.5756s data_time=0.1334s it/s=1.411 eta_to_10000=191.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0214 grad_action_out_proj=0.1021 grad_shared_expert=0.4479 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9730/10000 [2:05:40<03:08, 1.43it/s, loss=0.0105, lr=2.55e-06, step=9729] Training: 97%|█████████▋| 9730/10000 [2:05:40<03:08, 1.43it/s, loss=0.0006, lr=2.55e-06, step=9730] Training: 97%|█████████▋| 9731/10000 [2:05:41<02:54, 1.54it/s, loss=0.0006, lr=2.55e-06, step=9730] Training: 97%|█████████▋| 9731/10000 [2:05:41<02:54, 1.54it/s, loss=0.0114, lr=2.54e-06, step=9731] Training: 97%|█████████▋| 9732/10000 [2:05:42<03:06, 1.44it/s, loss=0.0114, lr=2.54e-06, step=9731] Training: 97%|█████████▋| 9732/10000 [2:05:42<03:06, 1.44it/s, loss=0.0069, lr=2.54e-06, step=9732] Training: 97%|█████████▋| 9733/10000 [2:05:43<03:23, 1.31it/s, loss=0.0069, lr=2.54e-06, step=9732] Training: 97%|█████████▋| 9733/10000 [2:05:43<03:23, 1.31it/s, loss=0.0020, lr=2.54e-06, step=9733] Training: 97%|█████████▋| 9734/10000 [2:05:43<03:08, 1.41it/s, loss=0.0020, lr=2.54e-06, step=9733] Training: 97%|█████████▋| 9734/10000 [2:05:43<03:08, 1.41it/s, loss=0.0016, lr=2.54e-06, step=9734] Training: 97%|█████████▋| 9735/10000 [2:05:44<03:00, 1.47it/s, loss=0.0016, lr=2.54e-06, step=9734] Training: 97%|█████████▋| 9735/10000 [2:05:44<03:00, 1.47it/s, loss=0.0046, lr=2.54e-06, step=9735] Training: 97%|█████████▋| 9736/10000 [2:05:45<02:59, 1.47it/s, loss=0.0046, lr=2.54e-06, step=9735] Training: 97%|█████████▋| 9736/10000 [2:05:45<02:59, 1.47it/s, loss=0.0027, lr=2.54e-06, step=9736] Training: 97%|█████████▋| 9737/10000 [2:05:45<03:03, 1.44it/s, loss=0.0027, lr=2.54e-06, step=9736] Training: 97%|█████████▋| 9737/10000 [2:05:45<03:03, 1.44it/s, loss=0.0097, lr=2.54e-06, step=9737] Training: 97%|█████████▋| 9738/10000 [2:05:46<03:21, 1.30it/s, loss=0.0097, lr=2.54e-06, step=9737] Training: 97%|█████████▋| 9738/10000 [2:05:46<03:21, 1.30it/s, loss=0.0016, lr=2.54e-06, step=9738] Training: 97%|█████████▋| 9739/10000 [2:05:47<03:27, 1.26it/s, loss=0.0016, lr=2.54e-06, step=9738] Training: 97%|█████████▋| 9739/10000 [2:05:47<03:27, 1.26it/s, loss=0.0051, lr=2.54e-06, step=9739]18:11:55.258 [I] step=9740 loss=0.0020 smoothed_loss=0.0056 lr=2.54e-06 grad_norm=0.4227 step_time=0.6422s data_time=0.1431s it/s=1.274 eta_to_10000=204.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0182 grad_action_out_proj=0.0791 grad_shared_expert=0.3250 (10775:train_pytorch.py:850) + Training: 97%|█████████▋| 9740/10000 [2:05:48<03:58, 1.09it/s, loss=0.0051, lr=2.54e-06, step=9739] Training: 97%|█████████▋| 9740/10000 [2:05:48<03:58, 1.09it/s, loss=0.0020, lr=2.54e-06, step=9740] Training: 97%|█████████▋| 9741/10000 [2:05:49<04:02, 1.07it/s, loss=0.0020, lr=2.54e-06, step=9740] Training: 97%|█████████▋| 9741/10000 [2:05:49<04:02, 1.07it/s, loss=0.0063, lr=2.54e-06, step=9741] Training: 97%|█████████▋| 9742/10000 [2:05:50<03:51, 1.11it/s, loss=0.0063, lr=2.54e-06, step=9741] Training: 97%|█████████▋| 9742/10000 [2:05:50<03:51, 1.11it/s, loss=0.0098, lr=2.54e-06, step=9742] Training: 97%|█████████▋| 9743/10000 [2:05:51<03:23, 1.26it/s, loss=0.0098, lr=2.54e-06, step=9742] Training: 97%|█████████▋| 9743/10000 [2:05:51<03:23, 1.26it/s, loss=0.0016, lr=2.54e-06, step=9743] Training: 97%|█████████▋| 9744/10000 [2:05:51<03:23, 1.26it/s, loss=0.0016, lr=2.54e-06, step=9743] Training: 97%|█████████▋| 9744/10000 [2:05:51<03:23, 1.26it/s, loss=0.0045, lr=2.54e-06, step=9744] Training: 97%|█████████▋| 9745/10000 [2:05:52<03:34, 1.19it/s, loss=0.0045, lr=2.54e-06, step=9744] Training: 97%|█████████▋| 9745/10000 [2:05:52<03:34, 1.19it/s, loss=0.0100, lr=2.54e-06, step=9745] Training: 97%|█████████▋| 9746/10000 [2:05:53<03:17, 1.29it/s, loss=0.0100, lr=2.54e-06, step=9745] Training: 97%|█████████▋| 9746/10000 [2:05:53<03:17, 1.29it/s, loss=0.0015, lr=2.54e-06, step=9746] Training: 97%|█████████▋| 9747/10000 [2:05:54<03:30, 1.20it/s, loss=0.0015, lr=2.54e-06, step=9746] Training: 97%|█████████▋| 9747/10000 [2:05:54<03:30, 1.20it/s, loss=0.0033, lr=2.54e-06, step=9747] Training: 97%|█████████▋| 9748/10000 [2:05:55<03:27, 1.21it/s, loss=0.0033, lr=2.54e-06, step=9747] Training: 97%|█████████▋| 9748/10000 [2:05:55<03:27, 1.21it/s, loss=0.0112, lr=2.54e-06, step=9748] Training: 97%|█████████▋| 9749/10000 [2:05:56<03:26, 1.22it/s, loss=0.0112, lr=2.54e-06, step=9748] Training: 97%|█████████▋| 9749/10000 [2:05:56<03:26, 1.22it/s, loss=0.0071, lr=2.54e-06, step=9749]18:12:03.373 [I] step=9750 loss=0.0016 smoothed_loss=0.0056 lr=2.54e-06 grad_norm=0.3245 step_time=0.6189s data_time=0.1926s it/s=1.232 eta_to_10000=202.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0079 grad_action_out_proj=0.0670 grad_shared_expert=0.1692 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9750/10000 [2:05:56<03:24, 1.22it/s, loss=0.0071, lr=2.54e-06, step=9749] Training: 98%|█████████▊| 9750/10000 [2:05:56<03:24, 1.22it/s, loss=0.0016, lr=2.54e-06, step=9750] Training: 98%|█████████▊| 9751/10000 [2:05:57<03:03, 1.36it/s, loss=0.0016, lr=2.54e-06, step=9750] Training: 98%|█████████▊| 9751/10000 [2:05:57<03:03, 1.36it/s, loss=0.0019, lr=2.54e-06, step=9751] Training: 98%|█████████▊| 9752/10000 [2:05:58<03:15, 1.27it/s, loss=0.0019, lr=2.54e-06, step=9751] Training: 98%|█████████▊| 9752/10000 [2:05:58<03:15, 1.27it/s, loss=0.0146, lr=2.54e-06, step=9752] Training: 98%|█████████▊| 9753/10000 [2:05:59<03:09, 1.30it/s, loss=0.0146, lr=2.54e-06, step=9752] Training: 98%|█████████▊| 9753/10000 [2:05:59<03:09, 1.30it/s, loss=0.0170, lr=2.54e-06, step=9753] Training: 98%|█████████▊| 9754/10000 [2:05:59<03:12, 1.28it/s, loss=0.0170, lr=2.54e-06, step=9753] Training: 98%|█████████▊| 9754/10000 [2:05:59<03:12, 1.28it/s, loss=0.0046, lr=2.54e-06, step=9754] Training: 98%|█████████▊| 9755/10000 [2:06:00<02:57, 1.38it/s, loss=0.0046, lr=2.54e-06, step=9754] Training: 98%|█████████▊| 9755/10000 [2:06:00<02:57, 1.38it/s, loss=0.0017, lr=2.54e-06, step=9755] Training: 98%|█████████▊| 9756/10000 [2:06:01<03:03, 1.33it/s, loss=0.0017, lr=2.54e-06, step=9755] Training: 98%|█████████▊| 9756/10000 [2:06:01<03:03, 1.33it/s, loss=0.0033, lr=2.54e-06, step=9756] Training: 98%|█████████▊| 9757/10000 [2:06:02<03:08, 1.29it/s, loss=0.0033, lr=2.54e-06, step=9756] Training: 98%|█████████▊| 9757/10000 [2:06:02<03:08, 1.29it/s, loss=0.0240, lr=2.54e-06, step=9757] Training: 98%|█████████▊| 9758/10000 [2:06:02<02:50, 1.42it/s, loss=0.0240, lr=2.54e-06, step=9757] Training: 98%|█████████▊| 9758/10000 [2:06:02<02:50, 1.42it/s, loss=0.0119, lr=2.54e-06, step=9758] Training: 98%|█████████▊| 9759/10000 [2:06:03<02:38, 1.52it/s, loss=0.0119, lr=2.54e-06, step=9758] Training: 98%|█████████▊| 9759/10000 [2:06:03<02:38, 1.52it/s, loss=0.0076, lr=2.54e-06, step=9759]18:12:10.421 [I] step=9760 loss=0.0016 smoothed_loss=0.0076 lr=2.54e-06 grad_norm=0.4855 step_time=0.5864s data_time=0.1184s it/s=1.419 eta_to_10000=169.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0372 grad_action_out_proj=0.1271 grad_shared_expert=0.4865 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9760/10000 [2:06:03<02:43, 1.47it/s, loss=0.0076, lr=2.54e-06, step=9759] Training: 98%|█████████▊| 9760/10000 [2:06:03<02:43, 1.47it/s, loss=0.0016, lr=2.54e-06, step=9760] Training: 98%|█████████▊| 9761/10000 [2:06:04<03:05, 1.29it/s, loss=0.0016, lr=2.54e-06, step=9760] Training: 98%|█████████▊| 9761/10000 [2:06:04<03:05, 1.29it/s, loss=0.0015, lr=2.54e-06, step=9761] Training: 98%|█████████▊| 9762/10000 [2:06:05<02:54, 1.36it/s, loss=0.0015, lr=2.54e-06, step=9761] Training: 98%|█████████▊| 9762/10000 [2:06:05<02:54, 1.36it/s, loss=0.0033, lr=2.54e-06, step=9762] Training: 98%|█████████▊| 9763/10000 [2:06:06<02:55, 1.35it/s, loss=0.0033, lr=2.54e-06, step=9762] Training: 98%|█████████▊| 9763/10000 [2:06:06<02:55, 1.35it/s, loss=0.0040, lr=2.53e-06, step=9763] Training: 98%|█████████▊| 9764/10000 [2:06:06<02:43, 1.44it/s, loss=0.0040, lr=2.53e-06, step=9763] Training: 98%|█████████▊| 9764/10000 [2:06:06<02:43, 1.44it/s, loss=0.0054, lr=2.53e-06, step=9764] Training: 98%|█████████▊| 9765/10000 [2:06:07<02:38, 1.48it/s, loss=0.0054, lr=2.53e-06, step=9764] Training: 98%|█████████▊| 9765/10000 [2:06:07<02:38, 1.48it/s, loss=0.0103, lr=2.53e-06, step=9765] Training: 98%|█████████▊| 9766/10000 [2:06:08<02:38, 1.47it/s, loss=0.0103, lr=2.53e-06, step=9765] Training: 98%|█████████▊| 9766/10000 [2:06:08<02:38, 1.47it/s, loss=0.0011, lr=2.53e-06, step=9766] Training: 98%|█████████▊| 9767/10000 [2:06:09<02:57, 1.31it/s, loss=0.0011, lr=2.53e-06, step=9766] Training: 98%|█████████▊| 9767/10000 [2:06:09<02:57, 1.31it/s, loss=0.0131, lr=2.53e-06, step=9767] Training: 98%|█████████▊| 9768/10000 [2:06:10<03:30, 1.10it/s, loss=0.0131, lr=2.53e-06, step=9767] Training: 98%|█████████▊| 9768/10000 [2:06:10<03:30, 1.10it/s, loss=0.0755, lr=2.53e-06, step=9768] Training: 98%|█████████▊| 9769/10000 [2:06:11<03:35, 1.07it/s, loss=0.0755, lr=2.53e-06, step=9768] Training: 98%|█████████▊| 9769/10000 [2:06:11<03:35, 1.07it/s, loss=0.0058, lr=2.53e-06, step=9769]18:12:18.874 [I] step=9770 loss=0.0148 smoothed_loss=0.0131 lr=2.53e-06 grad_norm=0.4726 step_time=0.6676s data_time=0.1777s it/s=1.183 eta_to_10000=194.4s max_cuda_memory=35.23GB grad_action_in_proj=0.0056 grad_action_out_proj=0.0886 grad_shared_expert=0.2900 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9770/10000 [2:06:12<03:36, 1.06it/s, loss=0.0058, lr=2.53e-06, step=9769] Training: 98%|█████████▊| 9770/10000 [2:06:12<03:36, 1.06it/s, loss=0.0148, lr=2.53e-06, step=9770] Training: 98%|█████████▊| 9771/10000 [2:06:13<03:30, 1.09it/s, loss=0.0148, lr=2.53e-06, step=9770] Training: 98%|█████████▊| 9771/10000 [2:06:13<03:30, 1.09it/s, loss=0.0033, lr=2.53e-06, step=9771] Training: 98%|█████████▊| 9772/10000 [2:06:13<03:12, 1.19it/s, loss=0.0033, lr=2.53e-06, step=9771] Training: 98%|█████████▊| 9772/10000 [2:06:13<03:12, 1.19it/s, loss=0.0052, lr=2.53e-06, step=9772] Training: 98%|█████████▊| 9773/10000 [2:06:15<03:25, 1.11it/s, loss=0.0052, lr=2.53e-06, step=9772] Training: 98%|█████████▊| 9773/10000 [2:06:15<03:25, 1.11it/s, loss=0.0011, lr=2.53e-06, step=9773] Training: 98%|█████████▊| 9774/10000 [2:06:15<02:58, 1.26it/s, loss=0.0011, lr=2.53e-06, step=9773] Training: 98%|█████████▊| 9774/10000 [2:06:15<02:58, 1.26it/s, loss=0.0061, lr=2.53e-06, step=9774] Training: 98%|█████████▊| 9775/10000 [2:06:16<02:57, 1.27it/s, loss=0.0061, lr=2.53e-06, step=9774] Training: 98%|█████████▊| 9775/10000 [2:06:16<02:57, 1.27it/s, loss=0.0023, lr=2.53e-06, step=9775] Training: 98%|█████████▊| 9776/10000 [2:06:17<03:04, 1.21it/s, loss=0.0023, lr=2.53e-06, step=9775] Training: 98%|█████████▊| 9776/10000 [2:06:17<03:04, 1.21it/s, loss=0.0110, lr=2.53e-06, step=9776] Training: 98%|█████████▊| 9777/10000 [2:06:18<03:11, 1.16it/s, loss=0.0110, lr=2.53e-06, step=9776] Training: 98%|█████████▊| 9777/10000 [2:06:18<03:11, 1.16it/s, loss=0.0070, lr=2.53e-06, step=9777] Training: 98%|█████████▊| 9778/10000 [2:06:19<03:10, 1.16it/s, loss=0.0070, lr=2.53e-06, step=9777] Training: 98%|█████████▊| 9778/10000 [2:06:19<03:10, 1.16it/s, loss=0.0094, lr=2.53e-06, step=9778] Training: 98%|█████████▊| 9779/10000 [2:06:19<02:54, 1.27it/s, loss=0.0094, lr=2.53e-06, step=9778] Training: 98%|█████████▊| 9779/10000 [2:06:19<02:54, 1.27it/s, loss=0.0118, lr=2.53e-06, step=9779]18:12:26.810 [I] step=9780 loss=0.0067 smoothed_loss=0.0092 lr=2.53e-06 grad_norm=0.4212 step_time=0.6204s data_time=0.1733s it/s=1.260 eta_to_10000=174.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0240 grad_action_out_proj=0.1408 grad_shared_expert=0.4093 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9780/10000 [2:06:20<02:48, 1.31it/s, loss=0.0118, lr=2.53e-06, step=9779] Training: 98%|█████████▊| 9780/10000 [2:06:20<02:48, 1.31it/s, loss=0.0067, lr=2.53e-06, step=9780] Training: 98%|█████████▊| 9781/10000 [2:06:20<02:32, 1.44it/s, loss=0.0067, lr=2.53e-06, step=9780] Training: 98%|█████████▊| 9781/10000 [2:06:20<02:32, 1.44it/s, loss=0.0136, lr=2.53e-06, step=9781] Training: 98%|█████████▊| 9782/10000 [2:06:21<02:38, 1.38it/s, loss=0.0136, lr=2.53e-06, step=9781] Training: 98%|█████████▊| 9782/10000 [2:06:21<02:38, 1.38it/s, loss=0.0007, lr=2.53e-06, step=9782] Training: 98%|█████████▊| 9783/10000 [2:06:22<02:44, 1.32it/s, loss=0.0007, lr=2.53e-06, step=9782] Training: 98%|█████████▊| 9783/10000 [2:06:22<02:44, 1.32it/s, loss=0.0019, lr=2.53e-06, step=9783] Training: 98%|█████████▊| 9784/10000 [2:06:23<02:33, 1.40it/s, loss=0.0019, lr=2.53e-06, step=9783] Training: 98%|█████████▊| 9784/10000 [2:06:23<02:33, 1.40it/s, loss=0.0030, lr=2.53e-06, step=9784] Training: 98%|█████████▊| 9785/10000 [2:06:23<02:33, 1.40it/s, loss=0.0030, lr=2.53e-06, step=9784] Training: 98%|█████████▊| 9785/10000 [2:06:23<02:33, 1.40it/s, loss=0.0029, lr=2.53e-06, step=9785] Training: 98%|█████████▊| 9786/10000 [2:06:24<02:37, 1.36it/s, loss=0.0029, lr=2.53e-06, step=9785] Training: 98%|█████████▊| 9786/10000 [2:06:24<02:37, 1.36it/s, loss=0.0162, lr=2.53e-06, step=9786] Training: 98%|█████████▊| 9787/10000 [2:06:25<02:22, 1.49it/s, loss=0.0162, lr=2.53e-06, step=9786] Training: 98%|█████████▊| 9787/10000 [2:06:25<02:22, 1.49it/s, loss=0.0009, lr=2.53e-06, step=9787] Training: 98%|█████████▊| 9788/10000 [2:06:25<02:12, 1.59it/s, loss=0.0009, lr=2.53e-06, step=9787] Training: 98%|█████████▊| 9788/10000 [2:06:25<02:12, 1.59it/s, loss=0.0274, lr=2.53e-06, step=9788] Training: 98%|█████████▊| 9789/10000 [2:06:26<02:38, 1.33it/s, loss=0.0274, lr=2.53e-06, step=9788] Training: 98%|█████████▊| 9789/10000 [2:06:26<02:38, 1.33it/s, loss=0.0054, lr=2.53e-06, step=9789]18:12:34.225 [I] step=9790 loss=0.0050 smoothed_loss=0.0085 lr=2.53e-06 grad_norm=0.4916 step_time=0.6133s data_time=0.1282s it/s=1.349 eta_to_10000=155.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0046 grad_action_out_proj=0.0619 grad_shared_expert=0.2566 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9790/10000 [2:06:27<02:56, 1.19it/s, loss=0.0054, lr=2.53e-06, step=9789] Training: 98%|█████████▊| 9790/10000 [2:06:27<02:56, 1.19it/s, loss=0.0050, lr=2.53e-06, step=9790] Training: 98%|█████████▊| 9791/10000 [2:06:28<02:44, 1.27it/s, loss=0.0050, lr=2.53e-06, step=9790] Training: 98%|█████████▊| 9791/10000 [2:06:28<02:44, 1.27it/s, loss=0.0008, lr=2.53e-06, step=9791] Training: 98%|█████████▊| 9792/10000 [2:06:28<02:26, 1.42it/s, loss=0.0008, lr=2.53e-06, step=9791] Training: 98%|█████████▊| 9792/10000 [2:06:28<02:26, 1.42it/s, loss=0.0272, lr=2.53e-06, step=9792] Training: 98%|█████████▊| 9793/10000 [2:06:29<02:14, 1.54it/s, loss=0.0272, lr=2.53e-06, step=9792] Training: 98%|█████████▊| 9793/10000 [2:06:29<02:14, 1.54it/s, loss=0.0005, lr=2.53e-06, step=9793] Training: 98%|█████████▊| 9794/10000 [2:06:29<02:04, 1.65it/s, loss=0.0005, lr=2.53e-06, step=9793] Training: 98%|█████████▊| 9794/10000 [2:06:29<02:04, 1.65it/s, loss=0.0016, lr=2.53e-06, step=9794] Training: 98%|█████████▊| 9795/10000 [2:06:30<01:59, 1.72it/s, loss=0.0016, lr=2.53e-06, step=9794] Training: 98%|█████████▊| 9795/10000 [2:06:30<01:59, 1.72it/s, loss=0.2155, lr=2.53e-06, step=9795] Training: 98%|█████████▊| 9796/10000 [2:06:31<02:09, 1.57it/s, loss=0.2155, lr=2.53e-06, step=9795] Training: 98%|█████████▊| 9796/10000 [2:06:31<02:09, 1.57it/s, loss=0.0021, lr=2.53e-06, step=9796] Training: 98%|█████████▊| 9797/10000 [2:06:32<02:23, 1.42it/s, loss=0.0021, lr=2.53e-06, step=9796] Training: 98%|█████████▊| 9797/10000 [2:06:32<02:23, 1.42it/s, loss=0.0013, lr=2.53e-06, step=9797] Training: 98%|█████████▊| 9798/10000 [2:06:32<02:25, 1.39it/s, loss=0.0013, lr=2.53e-06, step=9797] Training: 98%|█████████▊| 9798/10000 [2:06:32<02:25, 1.39it/s, loss=0.0020, lr=2.53e-06, step=9798] Training: 98%|█████████▊| 9799/10000 [2:06:33<02:14, 1.50it/s, loss=0.0020, lr=2.53e-06, step=9798] Training: 98%|█████████▊| 9799/10000 [2:06:33<02:14, 1.50it/s, loss=0.0030, lr=2.53e-06, step=9799]18:12:40.711 [I] step=9800 loss=0.0011 smoothed_loss=0.0178 lr=2.53e-06 grad_norm=0.3528 step_time=0.5512s data_time=0.0974s it/s=1.542 eta_to_10000=129.7s max_cuda_memory=35.23GB grad_action_in_proj=0.0072 grad_action_out_proj=0.0677 grad_shared_expert=0.3213 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9800/10000 [2:06:34<02:23, 1.39it/s, loss=0.0030, lr=2.53e-06, step=9799] Training: 98%|█████████▊| 9800/10000 [2:06:34<02:23, 1.39it/s, loss=0.0011, lr=2.52e-06, step=9800] Training: 98%|█████████▊| 9801/10000 [2:06:34<02:10, 1.53it/s, loss=0.0011, lr=2.52e-06, step=9800] Training: 98%|█████████▊| 9801/10000 [2:06:34<02:10, 1.53it/s, loss=0.0214, lr=2.52e-06, step=9801] Training: 98%|█████████▊| 9802/10000 [2:06:35<02:00, 1.64it/s, loss=0.0214, lr=2.52e-06, step=9801] Training: 98%|█████████▊| 9802/10000 [2:06:35<02:00, 1.64it/s, loss=0.0046, lr=2.52e-06, step=9802] Training: 98%|█████████▊| 9803/10000 [2:06:35<02:03, 1.60it/s, loss=0.0046, lr=2.52e-06, step=9802] Training: 98%|█████████▊| 9803/10000 [2:06:35<02:03, 1.60it/s, loss=0.0058, lr=2.52e-06, step=9803] Training: 98%|█████████▊| 9804/10000 [2:06:36<02:23, 1.37it/s, loss=0.0058, lr=2.52e-06, step=9803] Training: 98%|█████████▊| 9804/10000 [2:06:36<02:23, 1.37it/s, loss=0.0013, lr=2.52e-06, step=9804] Training: 98%|█████████▊| 9805/10000 [2:06:37<02:19, 1.39it/s, loss=0.0013, lr=2.52e-06, step=9804] Training: 98%|█████████▊| 9805/10000 [2:06:37<02:19, 1.39it/s, loss=0.0015, lr=2.52e-06, step=9805] Training: 98%|█████████▊| 9806/10000 [2:06:38<02:06, 1.53it/s, loss=0.0015, lr=2.52e-06, step=9805] Training: 98%|█████████▊| 9806/10000 [2:06:38<02:06, 1.53it/s, loss=0.0012, lr=2.52e-06, step=9806] Training: 98%|█████████▊| 9807/10000 [2:06:38<01:57, 1.65it/s, loss=0.0012, lr=2.52e-06, step=9806] Training: 98%|█████████▊| 9807/10000 [2:06:38<01:57, 1.65it/s, loss=0.0028, lr=2.52e-06, step=9807] Training: 98%|█████████▊| 9808/10000 [2:06:39<01:53, 1.68it/s, loss=0.0028, lr=2.52e-06, step=9807] Training: 98%|█████████▊| 9808/10000 [2:06:39<01:53, 1.68it/s, loss=0.0013, lr=2.52e-06, step=9808] Training: 98%|█████████▊| 9809/10000 [2:06:39<01:57, 1.63it/s, loss=0.0013, lr=2.52e-06, step=9808] Training: 98%|█████████▊| 9809/10000 [2:06:39<01:57, 1.63it/s, loss=0.0264, lr=2.52e-06, step=9809]18:12:47.144 [I] step=9810 loss=0.0300 smoothed_loss=0.0134 lr=2.52e-06 grad_norm=0.4502 step_time=0.5384s data_time=0.1049s it/s=1.555 eta_to_10000=122.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0157 grad_action_out_proj=0.1134 grad_shared_expert=0.6301 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9810/10000 [2:06:40<02:11, 1.45it/s, loss=0.0264, lr=2.52e-06, step=9809] Training: 98%|█████████▊| 9810/10000 [2:06:40<02:11, 1.45it/s, loss=0.0300, lr=2.52e-06, step=9810] Training: 98%|█████████▊| 9811/10000 [2:06:41<02:16, 1.38it/s, loss=0.0300, lr=2.52e-06, step=9810] Training: 98%|█████████▊| 9811/10000 [2:06:41<02:16, 1.38it/s, loss=0.0059, lr=2.52e-06, step=9811] Training: 98%|█████████▊| 9812/10000 [2:06:42<02:04, 1.51it/s, loss=0.0059, lr=2.52e-06, step=9811] Training: 98%|█████████▊| 9812/10000 [2:06:42<02:04, 1.51it/s, loss=0.0036, lr=2.52e-06, step=9812] Training: 98%|█████████▊| 9813/10000 [2:06:42<02:05, 1.49it/s, loss=0.0036, lr=2.52e-06, step=9812] Training: 98%|█████████▊| 9813/10000 [2:06:42<02:05, 1.49it/s, loss=0.0224, lr=2.52e-06, step=9813] Training: 98%|█████████▊| 9814/10000 [2:06:43<01:54, 1.62it/s, loss=0.0224, lr=2.52e-06, step=9813] Training: 98%|█████████▊| 9814/10000 [2:06:43<01:54, 1.62it/s, loss=0.0071, lr=2.52e-06, step=9814] Training: 98%|█████████▊| 9815/10000 [2:06:43<01:49, 1.69it/s, loss=0.0071, lr=2.52e-06, step=9814] Training: 98%|█████████▊| 9815/10000 [2:06:43<01:49, 1.69it/s, loss=0.0116, lr=2.52e-06, step=9815] Training: 98%|█████████▊| 9816/10000 [2:06:44<02:01, 1.51it/s, loss=0.0116, lr=2.52e-06, step=9815] Training: 98%|█████████▊| 9816/10000 [2:06:44<02:01, 1.51it/s, loss=0.0110, lr=2.52e-06, step=9816] Training: 98%|█████████▊| 9817/10000 [2:06:45<01:59, 1.54it/s, loss=0.0110, lr=2.52e-06, step=9816] Training: 98%|█████████▊| 9817/10000 [2:06:45<01:59, 1.54it/s, loss=0.0115, lr=2.52e-06, step=9817] Training: 98%|█████████▊| 9818/10000 [2:06:46<02:17, 1.32it/s, loss=0.0115, lr=2.52e-06, step=9817] Training: 98%|█████████▊| 9818/10000 [2:06:46<02:17, 1.32it/s, loss=0.0043, lr=2.52e-06, step=9818] Training: 98%|█████████▊| 9819/10000 [2:06:46<02:16, 1.32it/s, loss=0.0043, lr=2.52e-06, step=9818] Training: 98%|█████████▊| 9819/10000 [2:06:46<02:16, 1.32it/s, loss=0.0025, lr=2.52e-06, step=9819]18:12:54.235 [I] step=9820 loss=0.0150 smoothed_loss=0.0108 lr=2.52e-06 grad_norm=0.4588 step_time=0.5785s data_time=0.1306s it/s=1.410 eta_to_10000=127.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0065 grad_action_out_proj=0.0829 grad_shared_expert=0.3710 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9820/10000 [2:06:47<02:21, 1.27it/s, loss=0.0025, lr=2.52e-06, step=9819] Training: 98%|█████████▊| 9820/10000 [2:06:47<02:21, 1.27it/s, loss=0.0150, lr=2.52e-06, step=9820] Training: 98%|█████████▊| 9821/10000 [2:06:48<02:14, 1.33it/s, loss=0.0150, lr=2.52e-06, step=9820] Training: 98%|█████████▊| 9821/10000 [2:06:48<02:14, 1.33it/s, loss=0.0006, lr=2.52e-06, step=9821] Training: 98%|█████████▊| 9822/10000 [2:06:49<02:02, 1.45it/s, loss=0.0006, lr=2.52e-06, step=9821] Training: 98%|█████████▊| 9822/10000 [2:06:49<02:02, 1.45it/s, loss=0.0056, lr=2.52e-06, step=9822] Training: 98%|█████████▊| 9823/10000 [2:06:49<01:53, 1.56it/s, loss=0.0056, lr=2.52e-06, step=9822] Training: 98%|█████████▊| 9823/10000 [2:06:49<01:53, 1.56it/s, loss=0.0039, lr=2.52e-06, step=9823] Training: 98%|█████████▊| 9824/10000 [2:06:50<01:53, 1.55it/s, loss=0.0039, lr=2.52e-06, step=9823] Training: 98%|█████████▊| 9824/10000 [2:06:50<01:53, 1.55it/s, loss=0.0511, lr=2.52e-06, step=9824] Training: 98%|█████████▊| 9825/10000 [2:06:51<02:05, 1.39it/s, loss=0.0511, lr=2.52e-06, step=9824] Training: 98%|█████████▊| 9825/10000 [2:06:51<02:05, 1.39it/s, loss=0.0043, lr=2.52e-06, step=9825] Training: 98%|█████████▊| 9826/10000 [2:06:51<02:10, 1.34it/s, loss=0.0043, lr=2.52e-06, step=9825] Training: 98%|█████████▊| 9826/10000 [2:06:51<02:10, 1.34it/s, loss=0.0029, lr=2.52e-06, step=9826] Training: 98%|█████████▊| 9827/10000 [2:06:52<02:00, 1.43it/s, loss=0.0029, lr=2.52e-06, step=9826] Training: 98%|█████████▊| 9827/10000 [2:06:52<02:00, 1.43it/s, loss=0.0219, lr=2.52e-06, step=9827] Training: 98%|█████████▊| 9828/10000 [2:06:53<01:57, 1.46it/s, loss=0.0219, lr=2.52e-06, step=9827] Training: 98%|█████████▊| 9828/10000 [2:06:53<01:57, 1.46it/s, loss=0.0066, lr=2.52e-06, step=9828] Training: 98%|█████████▊| 9829/10000 [2:06:53<01:48, 1.58it/s, loss=0.0066, lr=2.52e-06, step=9828] Training: 98%|█████████▊| 9829/10000 [2:06:53<01:48, 1.58it/s, loss=0.0037, lr=2.52e-06, step=9829]18:13:00.857 [I] step=9830 loss=0.0011 smoothed_loss=0.0100 lr=2.52e-06 grad_norm=0.3867 step_time=0.5599s data_time=0.1024s it/s=1.510 eta_to_10000=112.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0050 grad_action_out_proj=0.0431 grad_shared_expert=0.1756 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9830/10000 [2:06:54<01:54, 1.49it/s, loss=0.0037, lr=2.52e-06, step=9829] Training: 98%|█████████▊| 9830/10000 [2:06:54<01:54, 1.49it/s, loss=0.0011, lr=2.52e-06, step=9830] Training: 98%|█████████▊| 9831/10000 [2:06:55<02:12, 1.28it/s, loss=0.0011, lr=2.52e-06, step=9830] Training: 98%|█████████▊| 9831/10000 [2:06:55<02:12, 1.28it/s, loss=0.0119, lr=2.52e-06, step=9831] Training: 98%|█████████▊| 9832/10000 [2:06:56<02:12, 1.26it/s, loss=0.0119, lr=2.52e-06, step=9831] Training: 98%|█████████▊| 9832/10000 [2:06:56<02:12, 1.26it/s, loss=0.0137, lr=2.52e-06, step=9832] Training: 98%|█████████▊| 9833/10000 [2:06:57<02:17, 1.22it/s, loss=0.0137, lr=2.52e-06, step=9832] Training: 98%|█████████▊| 9833/10000 [2:06:57<02:17, 1.22it/s, loss=0.0051, lr=2.52e-06, step=9833] Training: 98%|█████████▊| 9834/10000 [2:06:57<02:06, 1.31it/s, loss=0.0051, lr=2.52e-06, step=9833] Training: 98%|█████████▊| 9834/10000 [2:06:57<02:06, 1.31it/s, loss=0.0061, lr=2.52e-06, step=9834] Training: 98%|█████████▊| 9835/10000 [2:06:58<02:05, 1.31it/s, loss=0.0061, lr=2.52e-06, step=9834] Training: 98%|█████████▊| 9835/10000 [2:06:58<02:05, 1.31it/s, loss=0.0081, lr=2.52e-06, step=9835] Training: 98%|█████████▊| 9836/10000 [2:06:59<01:51, 1.47it/s, loss=0.0081, lr=2.52e-06, step=9835] Training: 98%|█████████▊| 9836/10000 [2:06:59<01:51, 1.47it/s, loss=0.0145, lr=2.52e-06, step=9836] Training: 98%|█████████▊| 9837/10000 [2:06:59<01:46, 1.52it/s, loss=0.0145, lr=2.52e-06, step=9836] Training: 98%|█████████▊| 9837/10000 [2:06:59<01:46, 1.52it/s, loss=0.0066, lr=2.52e-06, step=9837] Training: 98%|█████████▊| 9838/10000 [2:07:00<01:49, 1.48it/s, loss=0.0066, lr=2.52e-06, step=9837] Training: 98%|█████████▊| 9838/10000 [2:07:00<01:49, 1.48it/s, loss=0.0023, lr=2.52e-06, step=9838] Training: 98%|█████████▊| 9839/10000 [2:07:01<01:49, 1.48it/s, loss=0.0023, lr=2.52e-06, step=9838] Training: 98%|█████████▊| 9839/10000 [2:07:01<01:49, 1.48it/s, loss=0.0048, lr=2.52e-06, step=9839]18:13:08.348 [I] step=9840 loss=0.0068 smoothed_loss=0.0083 lr=2.52e-06 grad_norm=0.4328 step_time=0.6042s data_time=0.1448s it/s=1.335 eta_to_10000=119.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0117 grad_action_out_proj=0.1196 grad_shared_expert=0.5125 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9840/10000 [2:07:01<01:57, 1.36it/s, loss=0.0048, lr=2.52e-06, step=9839] Training: 98%|█████████▊| 9840/10000 [2:07:01<01:57, 1.36it/s, loss=0.0068, lr=2.52e-06, step=9840] Training: 98%|█████████▊| 9841/10000 [2:07:02<01:45, 1.51it/s, loss=0.0068, lr=2.52e-06, step=9840] Training: 98%|█████████▊| 9841/10000 [2:07:02<01:45, 1.51it/s, loss=0.0072, lr=2.52e-06, step=9841] Training: 98%|█████████▊| 9842/10000 [2:07:02<01:36, 1.63it/s, loss=0.0072, lr=2.52e-06, step=9841] Training: 98%|█████████▊| 9842/10000 [2:07:02<01:36, 1.63it/s, loss=0.0132, lr=2.52e-06, step=9842] Training: 98%|█████████▊| 9843/10000 [2:07:03<01:32, 1.70it/s, loss=0.0132, lr=2.52e-06, step=9842] Training: 98%|█████████▊| 9843/10000 [2:07:03<01:32, 1.70it/s, loss=0.0435, lr=2.52e-06, step=9843] Training: 98%|█████████▊| 9844/10000 [2:07:04<01:36, 1.61it/s, loss=0.0435, lr=2.52e-06, step=9843] Training: 98%|█████████▊| 9844/10000 [2:07:04<01:36, 1.61it/s, loss=0.0037, lr=2.52e-06, step=9844] Training: 98%|█████████▊| 9845/10000 [2:07:04<01:46, 1.45it/s, loss=0.0037, lr=2.52e-06, step=9844] Training: 98%|█████████▊| 9845/10000 [2:07:04<01:46, 1.45it/s, loss=0.0043, lr=2.51e-06, step=9845] Training: 98%|█████████▊| 9846/10000 [2:07:05<01:57, 1.31it/s, loss=0.0043, lr=2.51e-06, step=9845] Training: 98%|█████████▊| 9846/10000 [2:07:05<01:57, 1.31it/s, loss=0.0084, lr=2.51e-06, step=9846] Training: 98%|█████████▊| 9847/10000 [2:07:07<02:12, 1.15it/s, loss=0.0084, lr=2.51e-06, step=9846] Training: 98%|█████████▊| 9847/10000 [2:07:07<02:12, 1.15it/s, loss=0.0042, lr=2.51e-06, step=9847] Training: 98%|█████████▊| 9848/10000 [2:07:07<02:12, 1.14it/s, loss=0.0042, lr=2.51e-06, step=9847] Training: 98%|█████████▊| 9848/10000 [2:07:07<02:12, 1.14it/s, loss=0.0028, lr=2.51e-06, step=9848] Training: 98%|█████████▊| 9849/10000 [2:07:08<01:57, 1.28it/s, loss=0.0028, lr=2.51e-06, step=9848] Training: 98%|█████████▊| 9849/10000 [2:07:08<01:57, 1.28it/s, loss=0.0126, lr=2.51e-06, step=9849]18:13:15.543 [I] step=9850 loss=0.0082 smoothed_loss=0.0093 lr=2.51e-06 grad_norm=0.4160 step_time=0.5749s data_time=0.1447s it/s=1.390 eta_to_10000=107.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0124 grad_action_out_proj=0.0980 grad_shared_expert=0.3461 (10775:train_pytorch.py:850) + Training: 98%|█████████▊| 9850/10000 [2:07:09<01:50, 1.36it/s, loss=0.0126, lr=2.51e-06, step=9849] Training: 98%|█████████▊| 9850/10000 [2:07:09<01:50, 1.36it/s, loss=0.0082, lr=2.51e-06, step=9850] Training: 99%|█████████▊| 9851/10000 [2:07:09<01:48, 1.37it/s, loss=0.0082, lr=2.51e-06, step=9850] Training: 99%|█████████▊| 9851/10000 [2:07:09<01:48, 1.37it/s, loss=0.0031, lr=2.51e-06, step=9851] Training: 99%|█████████▊| 9852/10000 [2:07:10<02:01, 1.22it/s, loss=0.0031, lr=2.51e-06, step=9851] Training: 99%|█████████▊| 9852/10000 [2:07:10<02:01, 1.22it/s, loss=0.0086, lr=2.51e-06, step=9852] Training: 99%|█████████▊| 9853/10000 [2:07:11<02:11, 1.12it/s, loss=0.0086, lr=2.51e-06, step=9852] Training: 99%|█████████▊| 9853/10000 [2:07:11<02:11, 1.12it/s, loss=0.0097, lr=2.51e-06, step=9853] Training: 99%|█████████▊| 9854/10000 [2:07:13<02:19, 1.05it/s, loss=0.0097, lr=2.51e-06, step=9853] Training: 99%|█████████▊| 9854/10000 [2:07:13<02:19, 1.05it/s, loss=0.0020, lr=2.51e-06, step=9854] Training: 99%|█████████▊| 9855/10000 [2:07:13<02:16, 1.06it/s, loss=0.0020, lr=2.51e-06, step=9854] Training: 99%|█████████▊| 9855/10000 [2:07:13<02:16, 1.06it/s, loss=0.0102, lr=2.51e-06, step=9855] Training: 99%|█████████▊| 9856/10000 [2:07:14<01:57, 1.23it/s, loss=0.0102, lr=2.51e-06, step=9855] Training: 99%|█████████▊| 9856/10000 [2:07:14<01:57, 1.23it/s, loss=0.0039, lr=2.51e-06, step=9856] Training: 99%|█████████▊| 9857/10000 [2:07:15<02:02, 1.17it/s, loss=0.0039, lr=2.51e-06, step=9856] Training: 99%|█████████▊| 9857/10000 [2:07:15<02:02, 1.17it/s, loss=0.0074, lr=2.51e-06, step=9857] Training: 99%|█████████▊| 9858/10000 [2:07:15<01:48, 1.30it/s, loss=0.0074, lr=2.51e-06, step=9857] Training: 99%|█████████▊| 9858/10000 [2:07:15<01:48, 1.30it/s, loss=0.0218, lr=2.51e-06, step=9858] Training: 99%|█████████▊| 9859/10000 [2:07:16<01:51, 1.27it/s, loss=0.0218, lr=2.51e-06, step=9858] Training: 99%|█████████▊| 9859/10000 [2:07:16<01:51, 1.27it/s, loss=0.0012, lr=2.51e-06, step=9859]18:13:24.221 [I] step=9860 loss=0.0117 smoothed_loss=0.0088 lr=2.51e-06 grad_norm=0.3767 step_time=0.6528s data_time=0.2150s it/s=1.153 eta_to_10000=121.5s max_cuda_memory=35.23GB grad_action_in_proj=0.0128 grad_action_out_proj=0.0820 grad_shared_expert=0.3272 (10775:train_pytorch.py:850) + Training: 99%|█████████▊| 9860/10000 [2:07:17<01:58, 1.18it/s, loss=0.0012, lr=2.51e-06, step=9859] Training: 99%|█████████▊| 9860/10000 [2:07:17<01:58, 1.18it/s, loss=0.0117, lr=2.51e-06, step=9860] Training: 99%|█████████▊| 9861/10000 [2:07:18<01:53, 1.22it/s, loss=0.0117, lr=2.51e-06, step=9860] Training: 99%|█████████▊| 9861/10000 [2:07:18<01:53, 1.22it/s, loss=0.0155, lr=2.51e-06, step=9861] Training: 99%|█████████▊| 9862/10000 [2:07:19<01:43, 1.33it/s, loss=0.0155, lr=2.51e-06, step=9861] Training: 99%|█████████▊| 9862/10000 [2:07:19<01:43, 1.33it/s, loss=0.0236, lr=2.51e-06, step=9862] Training: 99%|█████████▊| 9863/10000 [2:07:19<01:42, 1.34it/s, loss=0.0236, lr=2.51e-06, step=9862] Training: 99%|█████████▊| 9863/10000 [2:07:19<01:42, 1.34it/s, loss=0.0018, lr=2.51e-06, step=9863] Training: 99%|█████████▊| 9864/10000 [2:07:20<01:50, 1.23it/s, loss=0.0018, lr=2.51e-06, step=9863] Training: 99%|█████████▊| 9864/10000 [2:07:20<01:50, 1.23it/s, loss=0.0083, lr=2.51e-06, step=9864] Training: 99%|█████████▊| 9865/10000 [2:07:21<01:44, 1.29it/s, loss=0.0083, lr=2.51e-06, step=9864] Training: 99%|█████████▊| 9865/10000 [2:07:21<01:44, 1.29it/s, loss=0.0028, lr=2.51e-06, step=9865] Training: 99%|█████████▊| 9866/10000 [2:07:22<01:32, 1.46it/s, loss=0.0028, lr=2.51e-06, step=9865] Training: 99%|█████████▊| 9866/10000 [2:07:22<01:32, 1.46it/s, loss=0.0067, lr=2.51e-06, step=9866] Training: 99%|█████████▊| 9867/10000 [2:07:22<01:23, 1.59it/s, loss=0.0067, lr=2.51e-06, step=9866] Training: 99%|█████████▊| 9867/10000 [2:07:22<01:23, 1.59it/s, loss=0.0036, lr=2.51e-06, step=9867] Training: 99%|█████████▊| 9868/10000 [2:07:23<01:29, 1.47it/s, loss=0.0036, lr=2.51e-06, step=9867] Training: 99%|█████████▊| 9868/10000 [2:07:23<01:29, 1.47it/s, loss=0.0009, lr=2.51e-06, step=9868] Training: 99%|█████████▊| 9869/10000 [2:07:23<01:24, 1.55it/s, loss=0.0009, lr=2.51e-06, step=9868] Training: 99%|█████████▊| 9869/10000 [2:07:23<01:24, 1.55it/s, loss=0.0031, lr=2.51e-06, step=9869]18:13:31.111 [I] step=9870 loss=0.0059 smoothed_loss=0.0070 lr=2.51e-06 grad_norm=0.5194 step_time=0.5583s data_time=0.1308s it/s=1.451 eta_to_10000=89.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0191 grad_action_out_proj=0.1788 grad_shared_expert=0.6869 (10775:train_pytorch.py:850) + Training: 99%|█████████▊| 9870/10000 [2:07:24<01:30, 1.44it/s, loss=0.0031, lr=2.51e-06, step=9869] Training: 99%|█████████▊| 9870/10000 [2:07:24<01:30, 1.44it/s, loss=0.0059, lr=2.51e-06, step=9870] Training: 99%|█████████▊| 9871/10000 [2:07:25<01:22, 1.57it/s, loss=0.0059, lr=2.51e-06, step=9870] Training: 99%|█████████▊| 9871/10000 [2:07:25<01:22, 1.57it/s, loss=0.0029, lr=2.51e-06, step=9871] Training: 99%|█████████▊| 9872/10000 [2:07:26<01:31, 1.40it/s, loss=0.0029, lr=2.51e-06, step=9871] Training: 99%|█████████▊| 9872/10000 [2:07:26<01:31, 1.40it/s, loss=0.0079, lr=2.51e-06, step=9872] Training: 99%|█████████▊| 9873/10000 [2:07:26<01:33, 1.35it/s, loss=0.0079, lr=2.51e-06, step=9872] Training: 99%|█████████▊| 9873/10000 [2:07:26<01:33, 1.35it/s, loss=0.0148, lr=2.51e-06, step=9873] Training: 99%|█████████▊| 9874/10000 [2:07:27<01:35, 1.32it/s, loss=0.0148, lr=2.51e-06, step=9873] Training: 99%|█████████▊| 9874/10000 [2:07:27<01:35, 1.32it/s, loss=0.0017, lr=2.51e-06, step=9874] Training: 99%|█████████▉| 9875/10000 [2:07:28<01:50, 1.13it/s, loss=0.0017, lr=2.51e-06, step=9874] Training: 99%|█████████▉| 9875/10000 [2:07:28<01:50, 1.13it/s, loss=0.0025, lr=2.51e-06, step=9875] Training: 99%|█████████▉| 9876/10000 [2:07:29<01:44, 1.18it/s, loss=0.0025, lr=2.51e-06, step=9875] Training: 99%|█████████▉| 9876/10000 [2:07:29<01:44, 1.18it/s, loss=0.0174, lr=2.51e-06, step=9876] Training: 99%|█████████▉| 9877/10000 [2:07:30<01:41, 1.21it/s, loss=0.0174, lr=2.51e-06, step=9876] Training: 99%|█████████▉| 9877/10000 [2:07:30<01:41, 1.21it/s, loss=0.0036, lr=2.51e-06, step=9877] Training: 99%|█████████▉| 9878/10000 [2:07:30<01:31, 1.33it/s, loss=0.0036, lr=2.51e-06, step=9877] Training: 99%|█████████▉| 9878/10000 [2:07:30<01:31, 1.33it/s, loss=0.0067, lr=2.51e-06, step=9878] Training: 99%|█████████▉| 9879/10000 [2:07:31<01:22, 1.47it/s, loss=0.0067, lr=2.51e-06, step=9878] Training: 99%|█████████▉| 9879/10000 [2:07:31<01:22, 1.47it/s, loss=0.0021, lr=2.51e-06, step=9879]18:13:38.970 [I] step=9880 loss=0.0041 smoothed_loss=0.0064 lr=2.51e-06 grad_norm=0.4778 step_time=0.6064s data_time=0.1795s it/s=1.273 eta_to_10000=94.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0447 grad_action_out_proj=0.2242 grad_shared_expert=0.6269 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9880/10000 [2:07:32<01:35, 1.26it/s, loss=0.0021, lr=2.51e-06, step=9879] Training: 99%|█████████▉| 9880/10000 [2:07:32<01:35, 1.26it/s, loss=0.0041, lr=2.51e-06, step=9880] Training: 99%|█████████▉| 9881/10000 [2:07:33<01:34, 1.26it/s, loss=0.0041, lr=2.51e-06, step=9880] Training: 99%|█████████▉| 9881/10000 [2:07:33<01:34, 1.26it/s, loss=0.0044, lr=2.51e-06, step=9881] Training: 99%|█████████▉| 9882/10000 [2:07:34<01:37, 1.21it/s, loss=0.0044, lr=2.51e-06, step=9881] Training: 99%|█████████▉| 9882/10000 [2:07:34<01:37, 1.21it/s, loss=0.0358, lr=2.51e-06, step=9882] Training: 99%|█████████▉| 9883/10000 [2:07:34<01:33, 1.25it/s, loss=0.0358, lr=2.51e-06, step=9882] Training: 99%|█████████▉| 9883/10000 [2:07:34<01:33, 1.25it/s, loss=0.0061, lr=2.51e-06, step=9883] Training: 99%|█████████▉| 9884/10000 [2:07:35<01:28, 1.30it/s, loss=0.0061, lr=2.51e-06, step=9883] Training: 99%|█████████▉| 9884/10000 [2:07:35<01:28, 1.30it/s, loss=0.0015, lr=2.51e-06, step=9884] Training: 99%|█████████▉| 9885/10000 [2:07:36<01:35, 1.21it/s, loss=0.0015, lr=2.51e-06, step=9884] Training: 99%|█████████▉| 9885/10000 [2:07:36<01:35, 1.21it/s, loss=0.0074, lr=2.51e-06, step=9885] Training: 99%|█████████▉| 9886/10000 [2:07:37<01:24, 1.36it/s, loss=0.0074, lr=2.51e-06, step=9885] Training: 99%|█████████▉| 9886/10000 [2:07:37<01:24, 1.36it/s, loss=0.0109, lr=2.51e-06, step=9886] Training: 99%|█████████▉| 9887/10000 [2:07:38<01:29, 1.27it/s, loss=0.0109, lr=2.51e-06, step=9886] Training: 99%|█████████▉| 9887/10000 [2:07:38<01:29, 1.27it/s, loss=0.0003, lr=2.51e-06, step=9887] Training: 99%|█████████▉| 9888/10000 [2:07:39<01:35, 1.17it/s, loss=0.0003, lr=2.51e-06, step=9887] Training: 99%|█████████▉| 9888/10000 [2:07:39<01:35, 1.17it/s, loss=0.0165, lr=2.51e-06, step=9888] Training: 99%|█████████▉| 9889/10000 [2:07:39<01:30, 1.23it/s, loss=0.0165, lr=2.51e-06, step=9888] Training: 99%|█████████▉| 9889/10000 [2:07:39<01:30, 1.23it/s, loss=0.0086, lr=2.51e-06, step=9889]18:13:47.092 [I] step=9890 loss=0.0060 smoothed_loss=0.0082 lr=2.51e-06 grad_norm=0.4531 step_time=0.6583s data_time=0.1540s it/s=1.231 eta_to_10000=89.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0177 grad_action_out_proj=0.1205 grad_shared_expert=0.4349 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9890/10000 [2:07:40<01:31, 1.21it/s, loss=0.0086, lr=2.51e-06, step=9889] Training: 99%|█████████▉| 9890/10000 [2:07:40<01:31, 1.21it/s, loss=0.0060, lr=2.51e-06, step=9890] Training: 99%|█████████▉| 9891/10000 [2:07:41<01:23, 1.31it/s, loss=0.0060, lr=2.51e-06, step=9890] Training: 99%|█████████▉| 9891/10000 [2:07:41<01:23, 1.31it/s, loss=0.0192, lr=2.51e-06, step=9891] Training: 99%|█████████▉| 9892/10000 [2:07:41<01:20, 1.34it/s, loss=0.0192, lr=2.51e-06, step=9891] Training: 99%|█████████▉| 9892/10000 [2:07:41<01:20, 1.34it/s, loss=0.0187, lr=2.51e-06, step=9892] Training: 99%|█████████▉| 9893/10000 [2:07:42<01:15, 1.41it/s, loss=0.0187, lr=2.51e-06, step=9892] Training: 99%|█████████▉| 9893/10000 [2:07:42<01:15, 1.41it/s, loss=0.0059, lr=2.51e-06, step=9893] Training: 99%|█████████▉| 9894/10000 [2:07:43<01:18, 1.34it/s, loss=0.0059, lr=2.51e-06, step=9893] Training: 99%|█████████▉| 9894/10000 [2:07:43<01:18, 1.34it/s, loss=0.0055, lr=2.51e-06, step=9894] Training: 99%|█████████▉| 9895/10000 [2:07:44<01:17, 1.35it/s, loss=0.0055, lr=2.51e-06, step=9894] Training: 99%|█████████▉| 9895/10000 [2:07:44<01:17, 1.35it/s, loss=0.0016, lr=2.51e-06, step=9895] Training: 99%|█████████▉| 9896/10000 [2:07:44<01:09, 1.50it/s, loss=0.0016, lr=2.51e-06, step=9895] Training: 99%|█████████▉| 9896/10000 [2:07:44<01:09, 1.50it/s, loss=0.0030, lr=2.51e-06, step=9896] Training: 99%|█████████▉| 9897/10000 [2:07:45<01:20, 1.29it/s, loss=0.0030, lr=2.51e-06, step=9896] Training: 99%|█████████▉| 9897/10000 [2:07:45<01:20, 1.29it/s, loss=0.0026, lr=2.51e-06, step=9897] Training: 99%|█████████▉| 9898/10000 [2:07:46<01:28, 1.16it/s, loss=0.0026, lr=2.51e-06, step=9897] Training: 99%|█████████▉| 9898/10000 [2:07:46<01:28, 1.16it/s, loss=0.0307, lr=2.51e-06, step=9898] Training: 99%|█████████▉| 9899/10000 [2:07:47<01:22, 1.23it/s, loss=0.0307, lr=2.51e-06, step=9898] Training: 99%|█████████▉| 9899/10000 [2:07:47<01:22, 1.23it/s, loss=0.0118, lr=2.51e-06, step=9899]18:13:54.553 [I] step=9900 loss=0.0202 smoothed_loss=0.0110 lr=2.51e-06 grad_norm=0.4201 step_time=0.5837s data_time=0.1623s it/s=1.341 eta_to_10000=74.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0083 grad_action_out_proj=0.0658 grad_shared_expert=0.3654 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9900/10000 [2:07:48<01:16, 1.30it/s, loss=0.0118, lr=2.51e-06, step=9899] Training: 99%|█████████▉| 9900/10000 [2:07:48<01:16, 1.30it/s, loss=0.0202, lr=2.51e-06, step=9900] Training: 99%|█████████▉| 9901/10000 [2:07:48<01:08, 1.45it/s, loss=0.0202, lr=2.51e-06, step=9900] Training: 99%|█████████▉| 9901/10000 [2:07:48<01:08, 1.45it/s, loss=0.0051, lr=2.51e-06, step=9901] Training: 99%|█████████▉| 9902/10000 [2:07:49<01:02, 1.57it/s, loss=0.0051, lr=2.51e-06, step=9901] Training: 99%|█████████▉| 9902/10000 [2:07:49<01:02, 1.57it/s, loss=0.0085, lr=2.51e-06, step=9902] Training: 99%|█████████▉| 9903/10000 [2:07:50<01:10, 1.38it/s, loss=0.0085, lr=2.51e-06, step=9902] Training: 99%|█████████▉| 9903/10000 [2:07:50<01:10, 1.38it/s, loss=0.0082, lr=2.51e-06, step=9903] Training: 99%|█████████▉| 9904/10000 [2:07:50<01:02, 1.53it/s, loss=0.0082, lr=2.51e-06, step=9903] Training: 99%|█████████▉| 9904/10000 [2:07:50<01:02, 1.53it/s, loss=0.0255, lr=2.51e-06, step=9904] Training: 99%|█████████▉| 9905/10000 [2:07:51<01:17, 1.23it/s, loss=0.0255, lr=2.51e-06, step=9904] Training: 99%|█████████▉| 9905/10000 [2:07:51<01:17, 1.23it/s, loss=0.0042, lr=2.51e-06, step=9905] Training: 99%|█████████▉| 9906/10000 [2:07:52<01:08, 1.36it/s, loss=0.0042, lr=2.51e-06, step=9905] Training: 99%|█████████▉| 9906/10000 [2:07:52<01:08, 1.36it/s, loss=0.0028, lr=2.51e-06, step=9906] Training: 99%|█████████▉| 9907/10000 [2:07:53<01:08, 1.35it/s, loss=0.0028, lr=2.51e-06, step=9906] Training: 99%|█████████▉| 9907/10000 [2:07:53<01:08, 1.35it/s, loss=0.0080, lr=2.51e-06, step=9907] Training: 99%|█████████▉| 9908/10000 [2:07:53<01:01, 1.49it/s, loss=0.0080, lr=2.51e-06, step=9907] Training: 99%|█████████▉| 9908/10000 [2:07:53<01:01, 1.49it/s, loss=0.0012, lr=2.51e-06, step=9908] Training: 99%|█████████▉| 9909/10000 [2:07:54<01:04, 1.41it/s, loss=0.0012, lr=2.51e-06, step=9908] Training: 99%|█████████▉| 9909/10000 [2:07:54<01:04, 1.41it/s, loss=0.0057, lr=2.51e-06, step=9909]18:14:01.975 [I] step=9910 loss=0.0915 smoothed_loss=0.0169 lr=2.51e-06 grad_norm=0.3870 step_time=0.5930s data_time=0.1493s it/s=1.347 eta_to_10000=66.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0384 grad_action_out_proj=0.1925 grad_shared_expert=0.4918 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9910/10000 [2:07:55<01:16, 1.17it/s, loss=0.0057, lr=2.51e-06, step=9909] Training: 99%|█████████▉| 9910/10000 [2:07:55<01:16, 1.17it/s, loss=0.0915, lr=2.51e-06, step=9910] Training: 99%|█████████▉| 9911/10000 [2:07:56<01:12, 1.22it/s, loss=0.0915, lr=2.51e-06, step=9910] Training: 99%|█████████▉| 9911/10000 [2:07:56<01:12, 1.22it/s, loss=0.0127, lr=2.50e-06, step=9911] Training: 99%|█████████▉| 9912/10000 [2:07:56<01:03, 1.38it/s, loss=0.0127, lr=2.50e-06, step=9911] Training: 99%|█████████▉| 9912/10000 [2:07:56<01:03, 1.38it/s, loss=0.0117, lr=2.50e-06, step=9912] Training: 99%|█████████▉| 9913/10000 [2:07:57<00:57, 1.50it/s, loss=0.0117, lr=2.50e-06, step=9912] Training: 99%|█████████▉| 9913/10000 [2:07:57<00:57, 1.50it/s, loss=0.0060, lr=2.50e-06, step=9913] Training: 99%|█████████▉| 9914/10000 [2:07:57<00:53, 1.61it/s, loss=0.0060, lr=2.50e-06, step=9913] Training: 99%|█████████▉| 9914/10000 [2:07:57<00:53, 1.61it/s, loss=0.0069, lr=2.50e-06, step=9914] Training: 99%|█████████▉| 9915/10000 [2:07:58<00:51, 1.65it/s, loss=0.0069, lr=2.50e-06, step=9914] Training: 99%|█████████▉| 9915/10000 [2:07:58<00:51, 1.65it/s, loss=0.0308, lr=2.50e-06, step=9915] Training: 99%|█████████▉| 9916/10000 [2:07:58<00:48, 1.74it/s, loss=0.0308, lr=2.50e-06, step=9915] Training: 99%|█████████▉| 9916/10000 [2:07:58<00:48, 1.74it/s, loss=0.0246, lr=2.50e-06, step=9916] Training: 99%|█████████▉| 9917/10000 [2:07:59<00:46, 1.80it/s, loss=0.0246, lr=2.50e-06, step=9916] Training: 99%|█████████▉| 9917/10000 [2:07:59<00:46, 1.80it/s, loss=0.0064, lr=2.50e-06, step=9917] Training: 99%|█████████▉| 9918/10000 [2:08:00<00:48, 1.67it/s, loss=0.0064, lr=2.50e-06, step=9917] Training: 99%|█████████▉| 9918/10000 [2:08:00<00:48, 1.67it/s, loss=0.0022, lr=2.50e-06, step=9918] Training: 99%|█████████▉| 9919/10000 [2:08:00<00:51, 1.57it/s, loss=0.0022, lr=2.50e-06, step=9918] Training: 99%|█████████▉| 9919/10000 [2:08:00<00:51, 1.57it/s, loss=0.0034, lr=2.50e-06, step=9919]18:14:07.801 [I] step=9920 loss=0.0012 smoothed_loss=0.0121 lr=2.50e-06 grad_norm=0.4127 step_time=0.5111s data_time=0.0715s it/s=1.717 eta_to_10000=46.6s max_cuda_memory=35.23GB grad_action_in_proj=0.0183 grad_action_out_proj=0.1225 grad_shared_expert=0.3161 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9920/10000 [2:08:01<00:48, 1.65it/s, loss=0.0034, lr=2.50e-06, step=9919] Training: 99%|█████████▉| 9920/10000 [2:08:01<00:48, 1.65it/s, loss=0.0012, lr=2.50e-06, step=9920] Training: 99%|█████████▉| 9921/10000 [2:08:02<00:50, 1.57it/s, loss=0.0012, lr=2.50e-06, step=9920] Training: 99%|█████████▉| 9921/10000 [2:08:02<00:50, 1.57it/s, loss=0.0033, lr=2.50e-06, step=9921] Training: 99%|█████████▉| 9922/10000 [2:08:02<00:52, 1.49it/s, loss=0.0033, lr=2.50e-06, step=9921] Training: 99%|█████████▉| 9922/10000 [2:08:02<00:52, 1.49it/s, loss=0.0087, lr=2.50e-06, step=9922] Training: 99%|█████████▉| 9923/10000 [2:08:03<00:48, 1.58it/s, loss=0.0087, lr=2.50e-06, step=9922] Training: 99%|█████████▉| 9923/10000 [2:08:03<00:48, 1.58it/s, loss=0.0057, lr=2.50e-06, step=9923] Training: 99%|█████████▉| 9924/10000 [2:08:04<00:54, 1.40it/s, loss=0.0057, lr=2.50e-06, step=9923] Training: 99%|█████████▉| 9924/10000 [2:08:04<00:54, 1.40it/s, loss=0.0108, lr=2.50e-06, step=9924] Training: 99%|█████████▉| 9925/10000 [2:08:04<00:51, 1.46it/s, loss=0.0108, lr=2.50e-06, step=9924] Training: 99%|█████████▉| 9925/10000 [2:08:04<00:51, 1.46it/s, loss=0.0029, lr=2.50e-06, step=9925] Training: 99%|█████████▉| 9926/10000 [2:08:05<00:56, 1.31it/s, loss=0.0029, lr=2.50e-06, step=9925] Training: 99%|█████████▉| 9926/10000 [2:08:05<00:56, 1.31it/s, loss=0.0270, lr=2.50e-06, step=9926] Training: 99%|█████████▉| 9927/10000 [2:08:06<00:50, 1.46it/s, loss=0.0270, lr=2.50e-06, step=9926] Training: 99%|█████████▉| 9927/10000 [2:08:06<00:50, 1.46it/s, loss=0.0048, lr=2.50e-06, step=9927] Training: 99%|█████████▉| 9928/10000 [2:08:07<00:50, 1.43it/s, loss=0.0048, lr=2.50e-06, step=9927] Training: 99%|█████████▉| 9928/10000 [2:08:07<00:50, 1.43it/s, loss=0.0260, lr=2.50e-06, step=9928] Training: 99%|█████████▉| 9929/10000 [2:08:07<00:45, 1.57it/s, loss=0.0260, lr=2.50e-06, step=9928] Training: 99%|█████████▉| 9929/10000 [2:08:07<00:45, 1.57it/s, loss=0.0063, lr=2.50e-06, step=9929]18:14:14.535 [I] step=9930 loss=0.0096 smoothed_loss=0.0115 lr=2.50e-06 grad_norm=0.4252 step_time=0.5457s data_time=0.1277s it/s=1.485 eta_to_10000=47.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0073 grad_action_out_proj=0.0860 grad_shared_expert=0.3860 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9930/10000 [2:08:08<00:42, 1.65it/s, loss=0.0063, lr=2.50e-06, step=9929] Training: 99%|█████████▉| 9930/10000 [2:08:08<00:42, 1.65it/s, loss=0.0096, lr=2.50e-06, step=9930] Training: 99%|█████████▉| 9931/10000 [2:08:08<00:39, 1.76it/s, loss=0.0096, lr=2.50e-06, step=9930] Training: 99%|█████████▉| 9931/10000 [2:08:08<00:39, 1.76it/s, loss=0.0162, lr=2.50e-06, step=9931] Training: 99%|█████████▉| 9932/10000 [2:08:09<00:37, 1.83it/s, loss=0.0162, lr=2.50e-06, step=9931] Training: 99%|█████████▉| 9932/10000 [2:08:09<00:37, 1.83it/s, loss=0.0066, lr=2.50e-06, step=9932] Training: 99%|█████████▉| 9933/10000 [2:08:09<00:41, 1.60it/s, loss=0.0066, lr=2.50e-06, step=9932] Training: 99%|█████████▉| 9933/10000 [2:08:09<00:41, 1.60it/s, loss=0.0043, lr=2.50e-06, step=9933] Training: 99%|█████████▉| 9934/10000 [2:08:10<00:38, 1.69it/s, loss=0.0043, lr=2.50e-06, step=9933] Training: 99%|█████████▉| 9934/10000 [2:08:10<00:38, 1.69it/s, loss=0.0051, lr=2.50e-06, step=9934] Training: 99%|█████████▉| 9935/10000 [2:08:11<00:42, 1.53it/s, loss=0.0051, lr=2.50e-06, step=9934] Training: 99%|█████████▉| 9935/10000 [2:08:11<00:42, 1.53it/s, loss=0.0084, lr=2.50e-06, step=9935] Training: 99%|█████████▉| 9936/10000 [2:08:11<00:40, 1.58it/s, loss=0.0084, lr=2.50e-06, step=9935] Training: 99%|█████████▉| 9936/10000 [2:08:11<00:40, 1.58it/s, loss=0.0091, lr=2.50e-06, step=9936] Training: 99%|█████████▉| 9937/10000 [2:08:12<00:43, 1.44it/s, loss=0.0091, lr=2.50e-06, step=9936] Training: 99%|█████████▉| 9937/10000 [2:08:12<00:43, 1.44it/s, loss=0.0151, lr=2.50e-06, step=9937] Training: 99%|█████████▉| 9938/10000 [2:08:13<00:43, 1.41it/s, loss=0.0151, lr=2.50e-06, step=9937] Training: 99%|█████████▉| 9938/10000 [2:08:13<00:43, 1.41it/s, loss=0.0027, lr=2.50e-06, step=9938] Training: 99%|█████████▉| 9939/10000 [2:08:13<00:41, 1.48it/s, loss=0.0027, lr=2.50e-06, step=9938] Training: 99%|█████████▉| 9939/10000 [2:08:13<00:41, 1.48it/s, loss=0.0117, lr=2.50e-06, step=9939]18:14:21.422 [I] step=9940 loss=0.0050 smoothed_loss=0.0094 lr=2.50e-06 grad_norm=0.5446 step_time=0.5607s data_time=0.1280s it/s=1.452 eta_to_10000=41.3s max_cuda_memory=35.23GB grad_action_in_proj=0.0175 grad_action_out_proj=0.1005 grad_shared_expert=0.4694 (10775:train_pytorch.py:850) + Training: 99%|█████████▉| 9940/10000 [2:08:14<00:46, 1.28it/s, loss=0.0117, lr=2.50e-06, step=9939] Training: 99%|█████████▉| 9940/10000 [2:08:14<00:46, 1.28it/s, loss=0.0050, lr=2.50e-06, step=9940] Training: 99%|█████████▉| 9941/10000 [2:08:15<00:42, 1.39it/s, loss=0.0050, lr=2.50e-06, step=9940] Training: 99%|█████████▉| 9941/10000 [2:08:15<00:42, 1.39it/s, loss=0.0010, lr=2.50e-06, step=9941] Training: 99%|█████████▉| 9942/10000 [2:08:16<00:42, 1.38it/s, loss=0.0010, lr=2.50e-06, step=9941] Training: 99%|█████████▉| 9942/10000 [2:08:16<00:42, 1.38it/s, loss=0.0022, lr=2.50e-06, step=9942] Training: 99%|█████████▉| 9943/10000 [2:08:17<00:41, 1.37it/s, loss=0.0022, lr=2.50e-06, step=9942] Training: 99%|█████████▉| 9943/10000 [2:08:17<00:41, 1.37it/s, loss=0.0143, lr=2.50e-06, step=9943] Training: 99%|█████████▉| 9944/10000 [2:08:17<00:37, 1.48it/s, loss=0.0143, lr=2.50e-06, step=9943] Training: 99%|█████████▉| 9944/10000 [2:08:17<00:37, 1.48it/s, loss=0.0199, lr=2.50e-06, step=9944] Training: 99%|█████████▉| 9945/10000 [2:08:18<00:34, 1.59it/s, loss=0.0199, lr=2.50e-06, step=9944] Training: 99%|█████████▉| 9945/10000 [2:08:18<00:34, 1.59it/s, loss=0.0041, lr=2.50e-06, step=9945] Training: 99%|█████████▉| 9946/10000 [2:08:18<00:32, 1.64it/s, loss=0.0041, lr=2.50e-06, step=9945] Training: 99%|█████████▉| 9946/10000 [2:08:18<00:32, 1.64it/s, loss=0.0032, lr=2.50e-06, step=9946] Training: 99%|█████████▉| 9947/10000 [2:08:19<00:34, 1.54it/s, loss=0.0032, lr=2.50e-06, step=9946] Training: 99%|█████████▉| 9947/10000 [2:08:19<00:34, 1.54it/s, loss=0.0675, lr=2.50e-06, step=9947] Training: 99%|█████████▉| 9948/10000 [2:08:20<00:35, 1.46it/s, loss=0.0675, lr=2.50e-06, step=9947] Training: 99%|█████████▉| 9948/10000 [2:08:20<00:35, 1.46it/s, loss=0.0065, lr=2.50e-06, step=9948] Training: 99%|█████████▉| 9949/10000 [2:08:20<00:33, 1.50it/s, loss=0.0065, lr=2.50e-06, step=9948] Training: 99%|█████████▉| 9949/10000 [2:08:20<00:33, 1.50it/s, loss=0.0038, lr=2.50e-06, step=9949]18:14:27.783 [I] step=9950 loss=0.0039 smoothed_loss=0.0118 lr=2.50e-06 grad_norm=0.5098 step_time=0.5326s data_time=0.1035s it/s=1.572 eta_to_10000=31.8s max_cuda_memory=35.23GB grad_action_in_proj=0.0097 grad_action_out_proj=0.1392 grad_shared_expert=0.9805 (10775:train_pytorch.py:850) + Training: 100%|█████████▉| 9950/10000 [2:08:21<00:31, 1.59it/s, loss=0.0038, lr=2.50e-06, step=9949] Training: 100%|█████████▉| 9950/10000 [2:08:21<00:31, 1.59it/s, loss=0.0039, lr=2.50e-06, step=9950] Training: 100%|█████████▉| 9951/10000 [2:08:21<00:28, 1.69it/s, loss=0.0039, lr=2.50e-06, step=9950] Training: 100%|█████████▉| 9951/10000 [2:08:21<00:28, 1.69it/s, loss=0.0838, lr=2.50e-06, step=9951] Training: 100%|█████████▉| 9952/10000 [2:08:22<00:27, 1.75it/s, loss=0.0838, lr=2.50e-06, step=9951] Training: 100%|█████████▉| 9952/10000 [2:08:22<00:27, 1.75it/s, loss=0.0067, lr=2.50e-06, step=9952] Training: 100%|█████████▉| 9953/10000 [2:08:23<00:29, 1.57it/s, loss=0.0067, lr=2.50e-06, step=9952] Training: 100%|█████████▉| 9953/10000 [2:08:23<00:29, 1.57it/s, loss=0.0081, lr=2.50e-06, step=9953] Training: 100%|█████████▉| 9954/10000 [2:08:23<00:30, 1.49it/s, loss=0.0081, lr=2.50e-06, step=9953] Training: 100%|█████████▉| 9954/10000 [2:08:23<00:30, 1.49it/s, loss=0.0049, lr=2.50e-06, step=9954] Training: 100%|█████████▉| 9955/10000 [2:08:24<00:27, 1.62it/s, loss=0.0049, lr=2.50e-06, step=9954] Training: 100%|█████████▉| 9955/10000 [2:08:24<00:27, 1.62it/s, loss=0.0078, lr=2.50e-06, step=9955] Training: 100%|█████████▉| 9956/10000 [2:08:24<00:25, 1.72it/s, loss=0.0078, lr=2.50e-06, step=9955] Training: 100%|█████████▉| 9956/10000 [2:08:24<00:25, 1.72it/s, loss=0.0038, lr=2.50e-06, step=9956] Training: 100%|█████████▉| 9957/10000 [2:08:25<00:25, 1.68it/s, loss=0.0038, lr=2.50e-06, step=9956] Training: 100%|█████████▉| 9957/10000 [2:08:25<00:25, 1.68it/s, loss=0.0127, lr=2.50e-06, step=9957] Training: 100%|█████████▉| 9958/10000 [2:08:26<00:23, 1.77it/s, loss=0.0127, lr=2.50e-06, step=9957] Training: 100%|█████████▉| 9958/10000 [2:08:26<00:23, 1.77it/s, loss=0.0324, lr=2.50e-06, step=9958] Training: 100%|█████████▉| 9959/10000 [2:08:26<00:23, 1.76it/s, loss=0.0324, lr=2.50e-06, step=9958] Training: 100%|█████████▉| 9959/10000 [2:08:26<00:23, 1.76it/s, loss=0.0018, lr=2.50e-06, step=9959]18:14:33.566 [I] step=9960 loss=0.0366 smoothed_loss=0.0164 lr=2.50e-06 grad_norm=0.4185 step_time=0.4790s data_time=0.0993s it/s=1.730 eta_to_10000=23.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0357 grad_action_out_proj=0.1786 grad_shared_expert=0.9393 (10775:train_pytorch.py:850) + Training: 100%|█████████▉| 9960/10000 [2:08:27<00:22, 1.80it/s, loss=0.0018, lr=2.50e-06, step=9959] Training: 100%|█████████▉| 9960/10000 [2:08:27<00:22, 1.80it/s, loss=0.0366, lr=2.50e-06, step=9960] Training: 100%|█████████▉| 9961/10000 [2:08:27<00:23, 1.68it/s, loss=0.0366, lr=2.50e-06, step=9960] Training: 100%|█████████▉| 9961/10000 [2:08:27<00:23, 1.68it/s, loss=0.0181, lr=2.50e-06, step=9961] Training: 100%|█████████▉| 9962/10000 [2:08:28<00:24, 1.55it/s, loss=0.0181, lr=2.50e-06, step=9961] Training: 100%|█████████▉| 9962/10000 [2:08:28<00:24, 1.55it/s, loss=0.0082, lr=2.50e-06, step=9962] Training: 100%|█████████▉| 9963/10000 [2:08:29<00:22, 1.64it/s, loss=0.0082, lr=2.50e-06, step=9962] Training: 100%|█████████▉| 9963/10000 [2:08:29<00:22, 1.64it/s, loss=0.0029, lr=2.50e-06, step=9963] Training: 100%|█████████▉| 9964/10000 [2:08:29<00:22, 1.57it/s, loss=0.0029, lr=2.50e-06, step=9963] Training: 100%|█████████▉| 9964/10000 [2:08:29<00:22, 1.57it/s, loss=0.0113, lr=2.50e-06, step=9964] Training: 100%|█████████▉| 9965/10000 [2:08:30<00:22, 1.57it/s, loss=0.0113, lr=2.50e-06, step=9964] Training: 100%|█████████▉| 9965/10000 [2:08:30<00:22, 1.57it/s, loss=0.0013, lr=2.50e-06, step=9965] Training: 100%|█████████▉| 9966/10000 [2:08:31<00:21, 1.56it/s, loss=0.0013, lr=2.50e-06, step=9965] Training: 100%|█████████▉| 9966/10000 [2:08:31<00:21, 1.56it/s, loss=0.0026, lr=2.50e-06, step=9966] Training: 100%|█████████▉| 9967/10000 [2:08:31<00:21, 1.51it/s, loss=0.0026, lr=2.50e-06, step=9966] Training: 100%|█████████▉| 9967/10000 [2:08:31<00:21, 1.51it/s, loss=0.0069, lr=2.50e-06, step=9967] Training: 100%|█████████▉| 9968/10000 [2:08:32<00:22, 1.44it/s, loss=0.0069, lr=2.50e-06, step=9967] Training: 100%|█████████▉| 9968/10000 [2:08:32<00:22, 1.44it/s, loss=0.0094, lr=2.50e-06, step=9968] Training: 100%|█████████▉| 9969/10000 [2:08:33<00:25, 1.23it/s, loss=0.0094, lr=2.50e-06, step=9968] Training: 100%|█████████▉| 9969/10000 [2:08:33<00:25, 1.23it/s, loss=0.0135, lr=2.50e-06, step=9969]18:14:40.626 [I] step=9970 loss=0.0123 smoothed_loss=0.0115 lr=2.50e-06 grad_norm=0.4304 step_time=0.5929s data_time=0.1131s it/s=1.417 eta_to_10000=21.2s max_cuda_memory=35.23GB grad_action_in_proj=0.0112 grad_action_out_proj=0.1085 grad_shared_expert=0.3551 (10775:train_pytorch.py:850) + Training: 100%|█████████▉| 9970/10000 [2:08:34<00:21, 1.38it/s, loss=0.0135, lr=2.50e-06, step=9969] Training: 100%|█████████▉| 9970/10000 [2:08:34<00:21, 1.38it/s, loss=0.0123, lr=2.50e-06, step=9970] Training: 100%|█████████▉| 9971/10000 [2:08:34<00:19, 1.47it/s, loss=0.0123, lr=2.50e-06, step=9970] Training: 100%|█████████▉| 9971/10000 [2:08:34<00:19, 1.47it/s, loss=0.0032, lr=2.50e-06, step=9971] Training: 100%|█████████▉| 9972/10000 [2:08:35<00:18, 1.51it/s, loss=0.0032, lr=2.50e-06, step=9971] Training: 100%|█████████▉| 9972/10000 [2:08:35<00:18, 1.51it/s, loss=0.0128, lr=2.50e-06, step=9972] Training: 100%|█████████▉| 9973/10000 [2:08:35<00:16, 1.62it/s, loss=0.0128, lr=2.50e-06, step=9972] Training: 100%|█████████▉| 9973/10000 [2:08:35<00:16, 1.62it/s, loss=0.0606, lr=2.50e-06, step=9973] Training: 100%|█████████▉| 9974/10000 [2:08:36<00:15, 1.72it/s, loss=0.0606, lr=2.50e-06, step=9973] Training: 100%|█████████▉| 9974/10000 [2:08:36<00:15, 1.72it/s, loss=0.0091, lr=2.50e-06, step=9974] Training: 100%|█████████▉| 9975/10000 [2:08:37<00:16, 1.52it/s, loss=0.0091, lr=2.50e-06, step=9974] Training: 100%|█████████▉| 9975/10000 [2:08:37<00:16, 1.52it/s, loss=0.0023, lr=2.50e-06, step=9975] Training: 100%|█████████▉| 9976/10000 [2:08:38<00:16, 1.43it/s, loss=0.0023, lr=2.50e-06, step=9975] Training: 100%|█████████▉| 9976/10000 [2:08:38<00:16, 1.43it/s, loss=0.0022, lr=2.50e-06, step=9976] Training: 100%|█████████▉| 9977/10000 [2:08:38<00:15, 1.52it/s, loss=0.0022, lr=2.50e-06, step=9976] Training: 100%|█████████▉| 9977/10000 [2:08:38<00:15, 1.52it/s, loss=0.0084, lr=2.50e-06, step=9977] Training: 100%|█████████▉| 9978/10000 [2:08:39<00:13, 1.64it/s, loss=0.0084, lr=2.50e-06, step=9977] Training: 100%|█████████▉| 9978/10000 [2:08:39<00:13, 1.64it/s, loss=0.0012, lr=2.50e-06, step=9978] Training: 100%|█████████▉| 9979/10000 [2:08:39<00:12, 1.72it/s, loss=0.0012, lr=2.50e-06, step=9978] Training: 100%|█████████▉| 9979/10000 [2:08:39<00:12, 1.72it/s, loss=0.0014, lr=2.50e-06, step=9979]18:14:47.083 [I] step=9980 loss=0.0029 smoothed_loss=0.0095 lr=2.50e-06 grad_norm=0.3902 step_time=0.5540s data_time=0.0917s it/s=1.549 eta_to_10000=12.9s max_cuda_memory=35.23GB grad_action_in_proj=0.0022 grad_action_out_proj=0.0268 grad_shared_expert=0.1216 (10775:train_pytorch.py:850) + Training: 100%|█████████▉| 9980/10000 [2:08:40<00:14, 1.38it/s, loss=0.0014, lr=2.50e-06, step=9979] Training: 100%|█████████▉| 9980/10000 [2:08:40<00:14, 1.38it/s, loss=0.0029, lr=2.50e-06, step=9980] Training: 100%|█████████▉| 9981/10000 [2:08:41<00:13, 1.41it/s, loss=0.0029, lr=2.50e-06, step=9980] Training: 100%|█████████▉| 9981/10000 [2:08:41<00:13, 1.41it/s, loss=0.0088, lr=2.50e-06, step=9981] Training: 100%|█████████▉| 9982/10000 [2:08:42<00:12, 1.39it/s, loss=0.0088, lr=2.50e-06, step=9981] Training: 100%|█████████▉| 9982/10000 [2:08:42<00:12, 1.39it/s, loss=0.0091, lr=2.50e-06, step=9982] Training: 100%|█████████▉| 9983/10000 [2:08:42<00:13, 1.28it/s, loss=0.0091, lr=2.50e-06, step=9982] Training: 100%|█████████▉| 9983/10000 [2:08:42<00:13, 1.28it/s, loss=0.0025, lr=2.50e-06, step=9983] Training: 100%|█████████▉| 9984/10000 [2:08:43<00:11, 1.37it/s, loss=0.0025, lr=2.50e-06, step=9983] Training: 100%|█████████▉| 9984/10000 [2:08:43<00:11, 1.37it/s, loss=0.0069, lr=2.50e-06, step=9984] Training: 100%|█████████▉| 9985/10000 [2:08:44<00:10, 1.47it/s, loss=0.0069, lr=2.50e-06, step=9984] Training: 100%|█████████▉| 9985/10000 [2:08:44<00:10, 1.47it/s, loss=0.0013, lr=2.50e-06, step=9985] Training: 100%|█████████▉| 9986/10000 [2:08:44<00:09, 1.49it/s, loss=0.0013, lr=2.50e-06, step=9985] Training: 100%|█████████▉| 9986/10000 [2:08:44<00:09, 1.49it/s, loss=0.0009, lr=2.50e-06, step=9986] Training: 100%|█████████▉| 9987/10000 [2:08:45<00:08, 1.59it/s, loss=0.0009, lr=2.50e-06, step=9986] Training: 100%|█████████▉| 9987/10000 [2:08:45<00:08, 1.59it/s, loss=0.0050, lr=2.50e-06, step=9987] Training: 100%|█████████▉| 9988/10000 [2:08:46<00:08, 1.47it/s, loss=0.0050, lr=2.50e-06, step=9987] Training: 100%|█████████▉| 9988/10000 [2:08:46<00:08, 1.47it/s, loss=0.0136, lr=2.50e-06, step=9988] Training: 100%|█████████▉| 9989/10000 [2:08:46<00:07, 1.40it/s, loss=0.0136, lr=2.50e-06, step=9988] Training: 100%|█████████▉| 9989/10000 [2:08:46<00:07, 1.40it/s, loss=0.0064, lr=2.50e-06, step=9989]18:14:54.208 [I] step=9990 loss=0.0028 smoothed_loss=0.0070 lr=2.50e-06 grad_norm=0.4925 step_time=0.5849s data_time=0.1276s it/s=1.404 eta_to_10000=7.1s max_cuda_memory=35.23GB grad_action_in_proj=0.0106 grad_action_out_proj=0.0706 grad_shared_expert=0.6721 (10775:train_pytorch.py:850) + Training: 100%|█████████▉| 9990/10000 [2:08:47<00:07, 1.33it/s, loss=0.0064, lr=2.50e-06, step=9989] Training: 100%|█████████▉| 9990/10000 [2:08:47<00:07, 1.33it/s, loss=0.0028, lr=2.50e-06, step=9990] Training: 100%|█████████▉| 9991/10000 [2:08:48<00:06, 1.48it/s, loss=0.0028, lr=2.50e-06, step=9990] Training: 100%|█████████▉| 9991/10000 [2:08:48<00:06, 1.48it/s, loss=0.0240, lr=2.50e-06, step=9991] Training: 100%|█████████▉| 9992/10000 [2:08:48<00:05, 1.59it/s, loss=0.0240, lr=2.50e-06, step=9991] Training: 100%|█████████▉| 9992/10000 [2:08:48<00:05, 1.59it/s, loss=0.0032, lr=2.50e-06, step=9992] Training: 100%|█████████▉| 9993/10000 [2:08:49<00:04, 1.46it/s, loss=0.0032, lr=2.50e-06, step=9992] Training: 100%|█████████▉| 9993/10000 [2:08:49<00:04, 1.46it/s, loss=0.0025, lr=2.50e-06, step=9993] Training: 100%|█████████▉| 9994/10000 [2:08:50<00:04, 1.44it/s, loss=0.0025, lr=2.50e-06, step=9993] Training: 100%|█████████▉| 9994/10000 [2:08:50<00:04, 1.44it/s, loss=0.0017, lr=2.50e-06, step=9994] Training: 100%|█████████▉| 9995/10000 [2:08:51<00:03, 1.31it/s, loss=0.0017, lr=2.50e-06, step=9994] Training: 100%|█████████▉| 9995/10000 [2:08:51<00:03, 1.31it/s, loss=0.0060, lr=2.50e-06, step=9995] Training: 100%|█████████▉| 9996/10000 [2:08:51<00:02, 1.45it/s, loss=0.0060, lr=2.50e-06, step=9995] Training: 100%|█████████▉| 9996/10000 [2:08:51<00:02, 1.45it/s, loss=0.0033, lr=2.50e-06, step=9996] Training: 100%|█████████▉| 9997/10000 [2:08:52<00:02, 1.39it/s, loss=0.0033, lr=2.50e-06, step=9996] Training: 100%|█████████▉| 9997/10000 [2:08:52<00:02, 1.39it/s, loss=0.0086, lr=2.50e-06, step=9997] Training: 100%|█████████▉| 9998/10000 [2:08:53<00:01, 1.53it/s, loss=0.0086, lr=2.50e-06, step=9997] Training: 100%|█████████▉| 9998/10000 [2:08:53<00:01, 1.53it/s, loss=0.0024, lr=2.50e-06, step=9998] Training: 100%|█████████▉| 9999/10000 [2:08:53<00:00, 1.56it/s, loss=0.0024, lr=2.50e-06, step=9998] Training: 100%|█████████▉| 9999/10000 [2:08:53<00:00, 1.56it/s, loss=0.1186, lr=2.50e-06, step=9999]18:15:00.659 [I] step=10000 loss=0.0141 smoothed_loss=0.0172 lr=2.50e-06 grad_norm=0.4377 step_time=0.5241s data_time=0.1210s it/s=1.550 eta_to_10000=0.0s max_cuda_memory=35.23GB grad_action_in_proj=0.0125 grad_action_out_proj=0.1342 grad_shared_expert=0.4184 (10775:train_pytorch.py:850) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +18:16:58.135 [I] Saved checkpoint at step 10000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000 (10775:train_pytorch.py:350) + Training: 100%|██████████| 10000/10000 [2:10:51<00:00, 35.86s/it, loss=0.1186, lr=2.50e-06, step=9999] Training: 100%|██████████| 10000/10000 [2:10:51<00:00, 35.86s/it, loss=0.0141, lr=2.50e-06, step=1e+4] Training: 100%|██████████| 10000/10000 [2:10:52<00:00, 1.27it/s, loss=0.0141, lr=2.50e-06, step=1e+4] +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_1000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_1000.log new file mode 100644 index 0000000000000000000000000000000000000000..3a9c8be666002ddfe3fb7e0da6a3ccf8b4938c81 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_1000.log @@ -0,0 +1,148 @@ +starting_eval config=pi05_twin_handover_256_packed_baseline_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=50 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.046618 left_arm_loss=0.035624 right_arm_loss=0.057612 imbalance=0.021988 batch_time_s=0.8837 +eval_batch=2 loss=0.011211 left_arm_loss=0.010756 right_arm_loss=0.011666 imbalance=0.000910 batch_time_s=0.2997 +eval_batch=3 loss=0.014357 left_arm_loss=0.020238 right_arm_loss=0.008477 imbalance=0.011761 batch_time_s=0.2246 +eval_batch=4 loss=0.065152 left_arm_loss=0.061398 right_arm_loss=0.068906 imbalance=0.007508 batch_time_s=0.5019 +eval_batch=5 loss=0.045531 left_arm_loss=0.063439 right_arm_loss=0.027622 imbalance=0.035817 batch_time_s=0.2430 +eval_batch=6 loss=0.048678 left_arm_loss=0.092346 right_arm_loss=0.005009 imbalance=0.087337 batch_time_s=0.2351 +eval_batch=7 loss=0.037585 left_arm_loss=0.070659 right_arm_loss=0.004512 imbalance=0.066146 batch_time_s=0.3137 +eval_batch=8 loss=0.016246 left_arm_loss=0.029937 right_arm_loss=0.002555 imbalance=0.027382 batch_time_s=0.5194 +eval_batch=9 loss=0.027677 left_arm_loss=0.053019 right_arm_loss=0.002335 imbalance=0.050684 batch_time_s=0.5638 +eval_batch=10 loss=0.028385 left_arm_loss=0.054602 right_arm_loss=0.002167 imbalance=0.052435 batch_time_s=0.4029 +eval_batch=11 loss=0.029503 left_arm_loss=0.055273 right_arm_loss=0.003732 imbalance=0.051541 batch_time_s=0.3176 +eval_batch=12 loss=0.043170 left_arm_loss=0.082558 right_arm_loss=0.003782 imbalance=0.078776 batch_time_s=0.2468 +eval_batch=13 loss=0.052655 left_arm_loss=0.101415 right_arm_loss=0.003895 imbalance=0.097519 batch_time_s=0.2813 +eval_batch=14 loss=0.067551 left_arm_loss=0.115959 right_arm_loss=0.019144 imbalance=0.096815 batch_time_s=0.3179 +eval_batch=15 loss=0.086284 left_arm_loss=0.032746 right_arm_loss=0.139821 imbalance=0.107075 batch_time_s=0.2862 +eval_batch=16 loss=0.076913 left_arm_loss=0.047023 right_arm_loss=0.106803 imbalance=0.059780 batch_time_s=0.2262 +eval_batch=17 loss=0.055457 left_arm_loss=0.100819 right_arm_loss=0.010095 imbalance=0.090724 batch_time_s=0.2535 +eval_batch=18 loss=0.070395 left_arm_loss=0.077499 right_arm_loss=0.063291 imbalance=0.014207 batch_time_s=0.2412 +eval_batch=19 loss=0.031461 left_arm_loss=0.041223 right_arm_loss=0.021699 imbalance=0.019524 batch_time_s=0.3031 +eval_batch=20 loss=0.026952 left_arm_loss=0.041134 right_arm_loss=0.012770 imbalance=0.028364 batch_time_s=0.2572 +eval_batch=21 loss=0.025842 left_arm_loss=0.040805 right_arm_loss=0.010879 imbalance=0.029926 batch_time_s=0.2815 +eval_batch=22 loss=0.056536 left_arm_loss=0.058355 right_arm_loss=0.054717 imbalance=0.003638 batch_time_s=0.7272 +eval_batch=23 loss=0.077286 left_arm_loss=0.129516 right_arm_loss=0.025057 imbalance=0.104459 batch_time_s=0.2620 +eval_batch=24 loss=0.108069 left_arm_loss=0.203466 right_arm_loss=0.012671 imbalance=0.190795 batch_time_s=0.2676 +eval_batch=25 loss=0.082836 left_arm_loss=0.162669 right_arm_loss=0.003003 imbalance=0.159666 batch_time_s=0.2385 +eval_batch=26 loss=0.036761 left_arm_loss=0.066170 right_arm_loss=0.007353 imbalance=0.058817 batch_time_s=0.2609 +eval_batch=27 loss=0.037065 left_arm_loss=0.065602 right_arm_loss=0.008527 imbalance=0.057075 batch_time_s=0.2331 +eval_batch=28 loss=0.035955 left_arm_loss=0.069021 right_arm_loss=0.002889 imbalance=0.066132 batch_time_s=0.3208 +eval_batch=29 loss=0.060579 left_arm_loss=0.118573 right_arm_loss=0.002585 imbalance=0.115988 batch_time_s=0.3175 +eval_batch=30 loss=0.100699 left_arm_loss=0.197816 right_arm_loss=0.003583 imbalance=0.194233 batch_time_s=0.2390 +eval_batch=31 loss=0.187748 left_arm_loss=0.361111 right_arm_loss=0.014385 imbalance=0.346726 batch_time_s=0.2807 +eval_batch=32 loss=0.108934 left_arm_loss=0.117864 right_arm_loss=0.100004 imbalance=0.017860 batch_time_s=0.3261 +eval_batch=33 loss=0.072897 left_arm_loss=0.035474 right_arm_loss=0.110320 imbalance=0.074846 batch_time_s=0.3380 +eval_batch=34 loss=0.079352 left_arm_loss=0.131144 right_arm_loss=0.027560 imbalance=0.103585 batch_time_s=0.2874 +eval_batch=35 loss=0.062093 left_arm_loss=0.110691 right_arm_loss=0.013495 imbalance=0.097196 batch_time_s=0.2346 +eval_batch=36 loss=0.050124 left_arm_loss=0.062390 right_arm_loss=0.037857 imbalance=0.024533 batch_time_s=0.2303 +eval_batch=37 loss=0.028622 left_arm_loss=0.044315 right_arm_loss=0.012930 imbalance=0.031385 batch_time_s=0.2376 +eval_batch=38 loss=0.064885 left_arm_loss=0.078474 right_arm_loss=0.051295 imbalance=0.027179 batch_time_s=0.2284 +eval_batch=39 loss=0.073221 left_arm_loss=0.047691 right_arm_loss=0.098751 imbalance=0.051060 batch_time_s=0.2703 +eval_batch=40 loss=0.039382 left_arm_loss=0.045306 right_arm_loss=0.033458 imbalance=0.011848 batch_time_s=0.2373 +eval_batch=41 loss=0.071908 left_arm_loss=0.139208 right_arm_loss=0.004608 imbalance=0.134601 batch_time_s=0.2347 +eval_batch=42 loss=0.041757 left_arm_loss=0.079108 right_arm_loss=0.004406 imbalance=0.074702 batch_time_s=0.3166 +eval_batch=43 loss=0.018202 left_arm_loss=0.030615 right_arm_loss=0.005788 imbalance=0.024827 batch_time_s=0.2292 +eval_batch=44 loss=0.020007 left_arm_loss=0.035204 right_arm_loss=0.004809 imbalance=0.030394 batch_time_s=0.2328 +eval_batch=45 loss=0.021428 left_arm_loss=0.038985 right_arm_loss=0.003871 imbalance=0.035115 batch_time_s=0.2296 +eval_batch=46 loss=0.039452 left_arm_loss=0.073343 right_arm_loss=0.005561 imbalance=0.067782 batch_time_s=0.2299 +eval_batch=47 loss=0.131330 left_arm_loss=0.042242 right_arm_loss=0.220417 imbalance=0.178175 batch_time_s=0.2279 +eval_batch=48 loss=0.248957 left_arm_loss=0.015340 right_arm_loss=0.482575 imbalance=0.467235 batch_time_s=0.2493 +eval_batch=49 loss=0.046603 left_arm_loss=0.014231 right_arm_loss=0.078976 imbalance=0.064745 batch_time_s=0.2881 +eval_batch=50 loss=0.146214 left_arm_loss=0.068633 right_arm_loss=0.223796 imbalance=0.155163 batch_time_s=0.2250 +config_name: pi05_twin_handover_256_packed_baseline_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/1000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 50 +mean_val_loss: 0.061130 +std_val_loss: 0.043921 +mean_left_arm_loss: 0.077421 +std_left_arm_loss: 0.059309 +mean_right_arm_loss: 0.044840 +std_right_arm_loss: 0.080634 +mean_left_joint_loss: 0.082092 +std_left_joint_loss: 0.066740 +mean_left_gripper_loss: 0.044720 +std_left_gripper_loss: 0.088365 +mean_right_joint_loss: 0.046274 +std_right_joint_loss: 0.087919 +mean_right_gripper_loss: 0.034807 +std_right_gripper_loss: 0.076825 +mean_left_right_imbalance: 0.080120 +std_left_right_imbalance: 0.083456 +per_batch_timing_seconds: mean=0.3040 std=0.1266 min=0.2246 max=0.8837 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.110773 left_arm_mae=0.098997 right_arm_mae=0.122549 imbalance_mae=0.023551 batch_time_s=0.2698 +sample_eval_batch=2 num_steps=4 masked_mae=0.053114 left_arm_mae=0.054272 right_arm_mae=0.051956 imbalance_mae=0.002316 batch_time_s=0.2901 +sample_eval_batch=3 num_steps=4 masked_mae=0.064381 left_arm_mae=0.067260 right_arm_mae=0.061502 imbalance_mae=0.005757 batch_time_s=0.3011 +sample_eval_batch=4 num_steps=4 masked_mae=0.109693 left_arm_mae=0.111135 right_arm_mae=0.108252 imbalance_mae=0.002883 batch_time_s=0.2649 +sample_eval_batch=5 num_steps=4 masked_mae=0.077213 left_arm_mae=0.098610 right_arm_mae=0.055816 imbalance_mae=0.042794 batch_time_s=0.3085 +sample_eval_batch=6 num_steps=4 masked_mae=0.091437 left_arm_mae=0.160364 right_arm_mae=0.022511 imbalance_mae=0.137853 batch_time_s=0.3781 +sample_eval_batch=7 num_steps=4 masked_mae=0.091958 left_arm_mae=0.164175 right_arm_mae=0.019740 imbalance_mae=0.144435 batch_time_s=0.3430 +sample_eval_batch=8 num_steps=4 masked_mae=0.065797 left_arm_mae=0.112976 right_arm_mae=0.018618 imbalance_mae=0.094358 batch_time_s=0.3558 +sample_eval_batch=9 num_steps=4 masked_mae=0.072095 left_arm_mae=0.126277 right_arm_mae=0.017913 imbalance_mae=0.108364 batch_time_s=0.2688 +sample_eval_batch=10 num_steps=4 masked_mae=0.079846 left_arm_mae=0.139709 right_arm_mae=0.019984 imbalance_mae=0.119725 batch_time_s=0.2815 +sample_eval_batch=11 num_steps=4 masked_mae=0.072607 left_arm_mae=0.124672 right_arm_mae=0.020542 imbalance_mae=0.104131 batch_time_s=0.3351 +sample_eval_batch=12 num_steps=4 masked_mae=0.097009 left_arm_mae=0.172318 right_arm_mae=0.021700 imbalance_mae=0.150618 batch_time_s=0.3060 +sample_eval_batch=13 num_steps=4 masked_mae=0.102344 left_arm_mae=0.182477 right_arm_mae=0.022212 imbalance_mae=0.160265 batch_time_s=0.3382 +sample_eval_batch=14 num_steps=4 masked_mae=0.125010 left_arm_mae=0.204377 right_arm_mae=0.045644 imbalance_mae=0.158733 batch_time_s=0.2661 +sample_eval_batch=15 num_steps=4 masked_mae=0.132648 left_arm_mae=0.043128 right_arm_mae=0.222168 imbalance_mae=0.179040 batch_time_s=0.3299 +sample_eval_batch=16 num_steps=4 masked_mae=0.109078 left_arm_mae=0.065883 right_arm_mae=0.152274 imbalance_mae=0.086391 batch_time_s=0.3721 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.090938 +sample_eval_num_steps_4_std_masked_mae: 0.022240 +sample_eval_num_steps_4_mean_left_arm_mae: 0.120414 +sample_eval_num_steps_4_std_left_arm_mae: 0.046606 +sample_eval_num_steps_4_mean_right_arm_mae: 0.061461 +sample_eval_num_steps_4_std_right_arm_mae: 0.058026 +sample_eval_num_steps_4_mean_left_joint_mae: 0.130966 +sample_eval_num_steps_4_std_left_joint_mae: 0.054578 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.046552 +sample_eval_num_steps_4_std_left_gripper_mae: 0.067920 +sample_eval_num_steps_4_mean_right_joint_mae: 0.063945 +sample_eval_num_steps_4_std_right_joint_mae: 0.062779 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.044077 +sample_eval_num_steps_4_std_right_gripper_mae: 0.053987 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.095076 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.059464 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.3131 std=0.0370 min=0.2649 max=0.3781 +sample_eval_batch=1 num_steps=10 masked_mae=0.125925 left_arm_mae=0.112806 right_arm_mae=0.139044 imbalance_mae=0.026238 batch_time_s=0.3393 +sample_eval_batch=2 num_steps=10 masked_mae=0.065916 left_arm_mae=0.067937 right_arm_mae=0.063895 imbalance_mae=0.004043 batch_time_s=0.3368 +sample_eval_batch=3 num_steps=10 masked_mae=0.075489 left_arm_mae=0.077150 right_arm_mae=0.073827 imbalance_mae=0.003322 batch_time_s=0.3428 +sample_eval_batch=4 num_steps=10 masked_mae=0.119956 left_arm_mae=0.122138 right_arm_mae=0.117774 imbalance_mae=0.004364 batch_time_s=0.3683 +sample_eval_batch=5 num_steps=10 masked_mae=0.086405 left_arm_mae=0.108638 right_arm_mae=0.064172 imbalance_mae=0.044466 batch_time_s=0.3385 +sample_eval_batch=6 num_steps=10 masked_mae=0.102866 left_arm_mae=0.179362 right_arm_mae=0.026370 imbalance_mae=0.152992 batch_time_s=0.4448 +sample_eval_batch=7 num_steps=10 masked_mae=0.099225 left_arm_mae=0.175145 right_arm_mae=0.023305 imbalance_mae=0.151840 batch_time_s=0.4423 +sample_eval_batch=8 num_steps=10 masked_mae=0.070220 left_arm_mae=0.118236 right_arm_mae=0.022204 imbalance_mae=0.096032 batch_time_s=0.4572 +sample_eval_batch=9 num_steps=10 masked_mae=0.080352 left_arm_mae=0.138299 right_arm_mae=0.022405 imbalance_mae=0.115894 batch_time_s=0.3420 +sample_eval_batch=10 num_steps=10 masked_mae=0.088702 left_arm_mae=0.154109 right_arm_mae=0.023295 imbalance_mae=0.130813 batch_time_s=0.3360 +sample_eval_batch=11 num_steps=10 masked_mae=0.080649 left_arm_mae=0.139099 right_arm_mae=0.022199 imbalance_mae=0.116900 batch_time_s=0.3334 +sample_eval_batch=12 num_steps=10 masked_mae=0.105638 left_arm_mae=0.185946 right_arm_mae=0.025330 imbalance_mae=0.160616 batch_time_s=0.3391 +sample_eval_batch=13 num_steps=10 masked_mae=0.111236 left_arm_mae=0.196994 right_arm_mae=0.025478 imbalance_mae=0.171516 batch_time_s=0.3333 +sample_eval_batch=14 num_steps=10 masked_mae=0.133621 left_arm_mae=0.214837 right_arm_mae=0.052405 imbalance_mae=0.162432 batch_time_s=0.3404 +sample_eval_batch=15 num_steps=10 masked_mae=0.147807 left_arm_mae=0.053255 right_arm_mae=0.242359 imbalance_mae=0.189104 batch_time_s=0.3383 +sample_eval_batch=16 num_steps=10 masked_mae=0.121862 left_arm_mae=0.073953 right_arm_mae=0.169770 imbalance_mae=0.095817 batch_time_s=0.3914 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.100992 +sample_eval_num_steps_10_std_masked_mae: 0.023502 +sample_eval_num_steps_10_mean_left_arm_mae: 0.132369 +sample_eval_num_steps_10_std_left_arm_mae: 0.047803 +sample_eval_num_steps_10_mean_right_arm_mae: 0.069615 +sample_eval_num_steps_10_std_right_arm_mae: 0.063335 +sample_eval_num_steps_10_mean_left_joint_mae: 0.143677 +sample_eval_num_steps_10_std_left_joint_mae: 0.056155 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.053215 +sample_eval_num_steps_10_std_left_gripper_mae: 0.074232 +sample_eval_num_steps_10_mean_right_joint_mae: 0.072165 +sample_eval_num_steps_10_std_right_joint_mae: 0.068555 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.051764 +sample_eval_num_steps_10_std_right_gripper_mae: 0.054067 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.101649 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.063159 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.3640 std=0.0430 min=0.3333 max=0.4572 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_10000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_10000.log new file mode 100644 index 0000000000000000000000000000000000000000..c1a5448e0336d7e2d10c236cd8acfeff833337a1 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_10000.log @@ -0,0 +1,198 @@ +starting_eval config=pi05_twin_handover_256_packed_baseline_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=100 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.008560 left_arm_loss=0.008768 right_arm_loss=0.008353 imbalance=0.000415 batch_time_s=0.8903 +eval_batch=2 loss=0.001335 left_arm_loss=0.001273 right_arm_loss=0.001396 imbalance=0.000123 batch_time_s=0.2357 +eval_batch=3 loss=0.001543 left_arm_loss=0.001703 right_arm_loss=0.001384 imbalance=0.000319 batch_time_s=0.3261 +eval_batch=4 loss=0.014131 left_arm_loss=0.013319 right_arm_loss=0.014943 imbalance=0.001624 batch_time_s=0.2341 +eval_batch=5 loss=0.010147 left_arm_loss=0.011514 right_arm_loss=0.008780 imbalance=0.002734 batch_time_s=0.2311 +eval_batch=6 loss=0.008744 left_arm_loss=0.016569 right_arm_loss=0.000919 imbalance=0.015649 batch_time_s=0.2410 +eval_batch=7 loss=0.007124 left_arm_loss=0.013695 right_arm_loss=0.000553 imbalance=0.013142 batch_time_s=0.2401 +eval_batch=8 loss=0.009670 left_arm_loss=0.018982 right_arm_loss=0.000359 imbalance=0.018624 batch_time_s=0.2342 +eval_batch=9 loss=0.016381 left_arm_loss=0.032430 right_arm_loss=0.000333 imbalance=0.032097 batch_time_s=0.2400 +eval_batch=10 loss=0.023648 left_arm_loss=0.046801 right_arm_loss=0.000495 imbalance=0.046306 batch_time_s=0.2375 +eval_batch=11 loss=0.019174 left_arm_loss=0.037797 right_arm_loss=0.000552 imbalance=0.037244 batch_time_s=0.2783 +eval_batch=12 loss=0.017045 left_arm_loss=0.033491 right_arm_loss=0.000599 imbalance=0.032891 batch_time_s=0.2462 +eval_batch=13 loss=0.023033 left_arm_loss=0.045382 right_arm_loss=0.000684 imbalance=0.044698 batch_time_s=0.2408 +eval_batch=14 loss=0.005748 left_arm_loss=0.009245 right_arm_loss=0.002251 imbalance=0.006994 batch_time_s=0.2910 +eval_batch=15 loss=0.030839 left_arm_loss=0.010502 right_arm_loss=0.051175 imbalance=0.040673 batch_time_s=0.2852 +eval_batch=16 loss=0.024088 left_arm_loss=0.018003 right_arm_loss=0.030173 imbalance=0.012170 batch_time_s=0.3083 +eval_batch=17 loss=0.012957 left_arm_loss=0.024034 right_arm_loss=0.001880 imbalance=0.022154 batch_time_s=0.2456 +eval_batch=18 loss=0.038324 left_arm_loss=0.056229 right_arm_loss=0.020419 imbalance=0.035809 batch_time_s=0.2414 +eval_batch=19 loss=0.011172 left_arm_loss=0.019652 right_arm_loss=0.002692 imbalance=0.016960 batch_time_s=0.2383 +eval_batch=20 loss=0.021376 left_arm_loss=0.039737 right_arm_loss=0.003016 imbalance=0.036721 batch_time_s=0.2315 +eval_batch=21 loss=0.092819 left_arm_loss=0.183719 right_arm_loss=0.001919 imbalance=0.181800 batch_time_s=0.2421 +eval_batch=22 loss=0.107753 left_arm_loss=0.187696 right_arm_loss=0.027809 imbalance=0.159887 batch_time_s=0.2272 +eval_batch=23 loss=0.039306 left_arm_loss=0.070818 right_arm_loss=0.007795 imbalance=0.063023 batch_time_s=0.2290 +eval_batch=24 loss=0.105801 left_arm_loss=0.210136 right_arm_loss=0.001466 imbalance=0.208669 batch_time_s=0.2295 +eval_batch=25 loss=0.074643 left_arm_loss=0.148833 right_arm_loss=0.000454 imbalance=0.148380 batch_time_s=0.2341 +eval_batch=26 loss=0.053766 left_arm_loss=0.106145 right_arm_loss=0.001387 imbalance=0.104758 batch_time_s=0.2310 +eval_batch=27 loss=0.059274 left_arm_loss=0.117549 right_arm_loss=0.000999 imbalance=0.116551 batch_time_s=0.2295 +eval_batch=28 loss=0.024668 left_arm_loss=0.048976 right_arm_loss=0.000361 imbalance=0.048615 batch_time_s=0.2339 +eval_batch=29 loss=0.033007 left_arm_loss=0.065406 right_arm_loss=0.000608 imbalance=0.064798 batch_time_s=0.2921 +eval_batch=30 loss=0.025443 left_arm_loss=0.049679 right_arm_loss=0.001208 imbalance=0.048470 batch_time_s=0.2567 +eval_batch=31 loss=0.047262 left_arm_loss=0.092545 right_arm_loss=0.001978 imbalance=0.090567 batch_time_s=0.2269 +eval_batch=32 loss=0.017943 left_arm_loss=0.028981 right_arm_loss=0.006904 imbalance=0.022077 batch_time_s=0.2486 +eval_batch=33 loss=0.049076 left_arm_loss=0.023606 right_arm_loss=0.074545 imbalance=0.050939 batch_time_s=0.4874 +eval_batch=34 loss=0.078728 left_arm_loss=0.128720 right_arm_loss=0.028735 imbalance=0.099985 batch_time_s=0.2288 +eval_batch=35 loss=0.027250 left_arm_loss=0.051945 right_arm_loss=0.002554 imbalance=0.049391 batch_time_s=0.2350 +eval_batch=36 loss=0.007188 left_arm_loss=0.003737 right_arm_loss=0.010639 imbalance=0.006902 batch_time_s=0.2371 +eval_batch=37 loss=0.001722 left_arm_loss=0.002658 right_arm_loss=0.000786 imbalance=0.001872 batch_time_s=0.2310 +eval_batch=38 loss=0.024799 left_arm_loss=0.031059 right_arm_loss=0.018538 imbalance=0.012521 batch_time_s=0.2716 +eval_batch=39 loss=0.022045 left_arm_loss=0.009727 right_arm_loss=0.034363 imbalance=0.024636 batch_time_s=0.2294 +eval_batch=40 loss=0.010673 left_arm_loss=0.012038 right_arm_loss=0.009308 imbalance=0.002730 batch_time_s=0.3581 +eval_batch=41 loss=0.019520 left_arm_loss=0.038111 right_arm_loss=0.000930 imbalance=0.037180 batch_time_s=0.2449 +eval_batch=42 loss=0.010768 left_arm_loss=0.020798 right_arm_loss=0.000738 imbalance=0.020060 batch_time_s=0.2284 +eval_batch=43 loss=0.002843 left_arm_loss=0.005206 right_arm_loss=0.000481 imbalance=0.004725 batch_time_s=0.2434 +eval_batch=44 loss=0.001429 left_arm_loss=0.002409 right_arm_loss=0.000449 imbalance=0.001960 batch_time_s=0.2829 +eval_batch=45 loss=0.003741 left_arm_loss=0.006856 right_arm_loss=0.000625 imbalance=0.006232 batch_time_s=0.2430 +eval_batch=46 loss=0.011082 left_arm_loss=0.021369 right_arm_loss=0.000795 imbalance=0.020575 batch_time_s=0.2387 +eval_batch=47 loss=0.102886 left_arm_loss=0.004540 right_arm_loss=0.201233 imbalance=0.196693 batch_time_s=0.2470 +eval_batch=48 loss=0.097341 left_arm_loss=0.003805 right_arm_loss=0.190878 imbalance=0.187074 batch_time_s=0.2359 +eval_batch=49 loss=0.006523 left_arm_loss=0.002001 right_arm_loss=0.011044 imbalance=0.009043 batch_time_s=0.2302 +eval_batch=50 loss=0.046598 left_arm_loss=0.022808 right_arm_loss=0.070389 imbalance=0.047580 batch_time_s=0.2539 +eval_batch=51 loss=0.009135 left_arm_loss=0.013602 right_arm_loss=0.004667 imbalance=0.008934 batch_time_s=0.2427 +eval_batch=52 loss=0.011514 left_arm_loss=0.016650 right_arm_loss=0.006378 imbalance=0.010272 batch_time_s=0.2395 +eval_batch=53 loss=0.001132 left_arm_loss=0.001374 right_arm_loss=0.000890 imbalance=0.000485 batch_time_s=0.2332 +eval_batch=54 loss=0.002559 left_arm_loss=0.002530 right_arm_loss=0.002589 imbalance=0.000059 batch_time_s=0.2354 +eval_batch=55 loss=0.017277 left_arm_loss=0.018273 right_arm_loss=0.016282 imbalance=0.001991 batch_time_s=0.2345 +eval_batch=56 loss=0.024251 left_arm_loss=0.005012 right_arm_loss=0.043491 imbalance=0.038479 batch_time_s=0.2376 +eval_batch=57 loss=0.009270 left_arm_loss=0.013307 right_arm_loss=0.005233 imbalance=0.008074 batch_time_s=0.2310 +eval_batch=58 loss=0.008486 left_arm_loss=0.015714 right_arm_loss=0.001258 imbalance=0.014456 batch_time_s=0.2297 +eval_batch=59 loss=0.012955 left_arm_loss=0.025000 right_arm_loss=0.000911 imbalance=0.024089 batch_time_s=0.2376 +eval_batch=60 loss=0.004419 left_arm_loss=0.007966 right_arm_loss=0.000873 imbalance=0.007093 batch_time_s=0.2301 +eval_batch=61 loss=0.001311 left_arm_loss=0.002374 right_arm_loss=0.000248 imbalance=0.002127 batch_time_s=0.2271 +eval_batch=62 loss=0.001412 left_arm_loss=0.002614 right_arm_loss=0.000210 imbalance=0.002404 batch_time_s=0.3047 +eval_batch=63 loss=0.003703 left_arm_loss=0.006794 right_arm_loss=0.000613 imbalance=0.006181 batch_time_s=0.2590 +eval_batch=64 loss=0.003989 left_arm_loss=0.007039 right_arm_loss=0.000940 imbalance=0.006100 batch_time_s=0.2326 +eval_batch=65 loss=0.011913 left_arm_loss=0.002230 right_arm_loss=0.021597 imbalance=0.019367 batch_time_s=0.2301 +eval_batch=66 loss=0.003909 left_arm_loss=0.004961 right_arm_loss=0.002856 imbalance=0.002105 batch_time_s=0.2372 +eval_batch=67 loss=0.005047 left_arm_loss=0.009273 right_arm_loss=0.000820 imbalance=0.008453 batch_time_s=0.2308 +eval_batch=68 loss=0.018646 left_arm_loss=0.024080 right_arm_loss=0.013213 imbalance=0.010868 batch_time_s=0.2338 +eval_batch=69 loss=0.035457 left_arm_loss=0.055589 right_arm_loss=0.015326 imbalance=0.040263 batch_time_s=0.2278 +eval_batch=70 loss=0.023561 left_arm_loss=0.014324 right_arm_loss=0.032799 imbalance=0.018475 batch_time_s=0.2263 +eval_batch=71 loss=0.012966 left_arm_loss=0.005677 right_arm_loss=0.020255 imbalance=0.014578 batch_time_s=0.2334 +eval_batch=72 loss=0.030692 left_arm_loss=0.052390 right_arm_loss=0.008995 imbalance=0.043395 batch_time_s=0.2307 +eval_batch=73 loss=0.019974 left_arm_loss=0.037737 right_arm_loss=0.002212 imbalance=0.035525 batch_time_s=0.2324 +eval_batch=74 loss=0.017243 left_arm_loss=0.033196 right_arm_loss=0.001290 imbalance=0.031907 batch_time_s=0.2301 +eval_batch=75 loss=0.006222 left_arm_loss=0.012136 right_arm_loss=0.000308 imbalance=0.011828 batch_time_s=0.2352 +eval_batch=76 loss=0.022304 left_arm_loss=0.036135 right_arm_loss=0.008474 imbalance=0.027661 batch_time_s=0.2309 +eval_batch=77 loss=0.007094 left_arm_loss=0.013553 right_arm_loss=0.000635 imbalance=0.012918 batch_time_s=0.2285 +eval_batch=78 loss=0.027930 left_arm_loss=0.004931 right_arm_loss=0.050929 imbalance=0.045998 batch_time_s=0.2580 +eval_batch=79 loss=0.073179 left_arm_loss=0.013695 right_arm_loss=0.132664 imbalance=0.118969 batch_time_s=0.2436 +eval_batch=80 loss=0.052136 left_arm_loss=0.014955 right_arm_loss=0.089316 imbalance=0.074362 batch_time_s=0.2320 +eval_batch=81 loss=0.021709 left_arm_loss=0.017623 right_arm_loss=0.025794 imbalance=0.008171 batch_time_s=0.2394 +eval_batch=82 loss=0.008380 left_arm_loss=0.006955 right_arm_loss=0.009806 imbalance=0.002851 batch_time_s=0.2371 +eval_batch=83 loss=0.011753 left_arm_loss=0.013604 right_arm_loss=0.009902 imbalance=0.003702 batch_time_s=0.2396 +eval_batch=84 loss=0.007405 left_arm_loss=0.010580 right_arm_loss=0.004231 imbalance=0.006349 batch_time_s=0.2440 +eval_batch=85 loss=0.017654 left_arm_loss=0.011350 right_arm_loss=0.023958 imbalance=0.012608 batch_time_s=0.2373 +eval_batch=86 loss=0.015883 left_arm_loss=0.002756 right_arm_loss=0.029009 imbalance=0.026253 batch_time_s=0.2370 +eval_batch=87 loss=0.006524 left_arm_loss=0.005036 right_arm_loss=0.008012 imbalance=0.002976 batch_time_s=0.2359 +eval_batch=88 loss=0.016466 left_arm_loss=0.031930 right_arm_loss=0.001001 imbalance=0.030929 batch_time_s=0.2377 +eval_batch=89 loss=0.015712 left_arm_loss=0.027821 right_arm_loss=0.003602 imbalance=0.024219 batch_time_s=0.2324 +eval_batch=90 loss=0.002218 left_arm_loss=0.004046 right_arm_loss=0.000389 imbalance=0.003657 batch_time_s=0.2310 +eval_batch=91 loss=0.001704 left_arm_loss=0.002999 right_arm_loss=0.000409 imbalance=0.002590 batch_time_s=0.2355 +eval_batch=92 loss=0.001195 left_arm_loss=0.002064 right_arm_loss=0.000326 imbalance=0.001738 batch_time_s=0.2340 +eval_batch=93 loss=0.007939 left_arm_loss=0.014865 right_arm_loss=0.001012 imbalance=0.013853 batch_time_s=0.2389 +eval_batch=94 loss=0.003085 left_arm_loss=0.003954 right_arm_loss=0.002217 imbalance=0.001737 batch_time_s=0.2660 +eval_batch=95 loss=0.008923 left_arm_loss=0.003261 right_arm_loss=0.014584 imbalance=0.011323 batch_time_s=0.2413 +eval_batch=96 loss=0.007102 left_arm_loss=0.008076 right_arm_loss=0.006127 imbalance=0.001949 batch_time_s=0.2398 +eval_batch=97 loss=0.014454 left_arm_loss=0.026941 right_arm_loss=0.001967 imbalance=0.024973 batch_time_s=0.2452 +eval_batch=98 loss=0.033711 left_arm_loss=0.063071 right_arm_loss=0.004351 imbalance=0.058720 batch_time_s=0.2431 +eval_batch=99 loss=0.020718 left_arm_loss=0.037961 right_arm_loss=0.003476 imbalance=0.034484 batch_time_s=0.2347 +eval_batch=100 loss=0.021343 left_arm_loss=0.036917 right_arm_loss=0.005769 imbalance=0.031148 batch_time_s=0.2456 +config_name: pi05_twin_handover_256_packed_baseline_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/10000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 100 +mean_val_loss: 0.022345 +std_val_loss: 0.024337 +mean_left_arm_loss: 0.029659 +std_left_arm_loss: 0.039896 +mean_right_arm_loss: 0.015031 +std_right_arm_loss: 0.032929 +mean_left_joint_loss: 0.031507 +std_left_joint_loss: 0.044637 +mean_left_gripper_loss: 0.016725 +std_left_gripper_loss: 0.040894 +mean_right_joint_loss: 0.015776 +std_right_joint_loss: 0.036308 +mean_right_gripper_loss: 0.009818 +std_right_gripper_loss: 0.028543 +mean_left_right_imbalance: 0.034067 +std_left_right_imbalance: 0.045126 +per_batch_timing_seconds: mean=0.2524 std=0.0719 min=0.2263 max=0.8903 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.029301 left_arm_mae=0.027942 right_arm_mae=0.030659 imbalance_mae=0.002717 batch_time_s=0.2769 +sample_eval_batch=2 num_steps=4 masked_mae=0.015433 left_arm_mae=0.017666 right_arm_mae=0.013200 imbalance_mae=0.004466 batch_time_s=0.2659 +sample_eval_batch=3 num_steps=4 masked_mae=0.016550 left_arm_mae=0.019292 right_arm_mae=0.013809 imbalance_mae=0.005483 batch_time_s=0.2699 +sample_eval_batch=4 num_steps=4 masked_mae=0.025598 left_arm_mae=0.022401 right_arm_mae=0.028794 imbalance_mae=0.006393 batch_time_s=0.2676 +sample_eval_batch=5 num_steps=4 masked_mae=0.026846 left_arm_mae=0.028538 right_arm_mae=0.025154 imbalance_mae=0.003384 batch_time_s=0.2658 +sample_eval_batch=6 num_steps=4 masked_mae=0.026403 left_arm_mae=0.044187 right_arm_mae=0.008619 imbalance_mae=0.035567 batch_time_s=0.2680 +sample_eval_batch=7 num_steps=4 masked_mae=0.026617 left_arm_mae=0.047235 right_arm_mae=0.005999 imbalance_mae=0.041236 batch_time_s=0.2669 +sample_eval_batch=8 num_steps=4 masked_mae=0.022871 left_arm_mae=0.039870 right_arm_mae=0.005872 imbalance_mae=0.033999 batch_time_s=0.2625 +sample_eval_batch=9 num_steps=4 masked_mae=0.034925 left_arm_mae=0.062935 right_arm_mae=0.006915 imbalance_mae=0.056020 batch_time_s=0.2726 +sample_eval_batch=10 num_steps=4 masked_mae=0.043991 left_arm_mae=0.080034 right_arm_mae=0.007949 imbalance_mae=0.072084 batch_time_s=0.2678 +sample_eval_batch=11 num_steps=4 masked_mae=0.034402 left_arm_mae=0.062883 right_arm_mae=0.005922 imbalance_mae=0.056961 batch_time_s=0.2676 +sample_eval_batch=12 num_steps=4 masked_mae=0.037064 left_arm_mae=0.067084 right_arm_mae=0.007043 imbalance_mae=0.060041 batch_time_s=0.3375 +sample_eval_batch=13 num_steps=4 masked_mae=0.033553 left_arm_mae=0.059772 right_arm_mae=0.007334 imbalance_mae=0.052438 batch_time_s=0.2909 +sample_eval_batch=14 num_steps=4 masked_mae=0.024011 left_arm_mae=0.037121 right_arm_mae=0.010900 imbalance_mae=0.026221 batch_time_s=0.2666 +sample_eval_batch=15 num_steps=4 masked_mae=0.043051 left_arm_mae=0.017745 right_arm_mae=0.068357 imbalance_mae=0.050613 batch_time_s=0.3469 +sample_eval_batch=16 num_steps=4 masked_mae=0.038342 left_arm_mae=0.022290 right_arm_mae=0.054393 imbalance_mae=0.032104 batch_time_s=0.2756 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.029935 +sample_eval_num_steps_4_std_masked_mae: 0.008200 +sample_eval_num_steps_4_mean_left_arm_mae: 0.041062 +sample_eval_num_steps_4_std_left_arm_mae: 0.019621 +sample_eval_num_steps_4_mean_right_arm_mae: 0.018807 +sample_eval_num_steps_4_std_right_arm_mae: 0.018117 +sample_eval_num_steps_4_mean_left_joint_mae: 0.044440 +sample_eval_num_steps_4_std_left_joint_mae: 0.022950 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.017416 +sample_eval_num_steps_4_std_left_gripper_mae: 0.016394 +sample_eval_num_steps_4_mean_right_joint_mae: 0.019500 +sample_eval_num_steps_4_std_right_joint_mae: 0.019305 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.013963 +sample_eval_num_steps_4_std_right_gripper_mae: 0.019504 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.033733 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.022691 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.2793 std=0.0247 min=0.2625 max=0.3469 +sample_eval_batch=1 num_steps=10 masked_mae=0.031892 left_arm_mae=0.030478 right_arm_mae=0.033307 imbalance_mae=0.002830 batch_time_s=0.3695 +sample_eval_batch=2 num_steps=10 masked_mae=0.019175 left_arm_mae=0.021774 right_arm_mae=0.016576 imbalance_mae=0.005198 batch_time_s=0.3460 +sample_eval_batch=3 num_steps=10 masked_mae=0.019252 left_arm_mae=0.020680 right_arm_mae=0.017824 imbalance_mae=0.002856 batch_time_s=0.4095 +sample_eval_batch=4 num_steps=10 masked_mae=0.027861 left_arm_mae=0.023648 right_arm_mae=0.032074 imbalance_mae=0.008427 batch_time_s=0.3446 +sample_eval_batch=5 num_steps=10 masked_mae=0.027615 left_arm_mae=0.028965 right_arm_mae=0.026265 imbalance_mae=0.002700 batch_time_s=0.3563 +sample_eval_batch=6 num_steps=10 masked_mae=0.025685 left_arm_mae=0.044061 right_arm_mae=0.007309 imbalance_mae=0.036752 batch_time_s=0.3497 +sample_eval_batch=7 num_steps=10 masked_mae=0.026870 left_arm_mae=0.048038 right_arm_mae=0.005701 imbalance_mae=0.042337 batch_time_s=0.3481 +sample_eval_batch=8 num_steps=10 masked_mae=0.021825 left_arm_mae=0.037125 right_arm_mae=0.006526 imbalance_mae=0.030599 batch_time_s=0.4133 +sample_eval_batch=9 num_steps=10 masked_mae=0.033276 left_arm_mae=0.061431 right_arm_mae=0.005121 imbalance_mae=0.056311 batch_time_s=0.3536 +sample_eval_batch=10 num_steps=10 masked_mae=0.041929 left_arm_mae=0.078845 right_arm_mae=0.005012 imbalance_mae=0.073833 batch_time_s=0.3504 +sample_eval_batch=11 num_steps=10 masked_mae=0.034225 left_arm_mae=0.062573 right_arm_mae=0.005876 imbalance_mae=0.056697 batch_time_s=0.4244 +sample_eval_batch=12 num_steps=10 masked_mae=0.036636 left_arm_mae=0.066864 right_arm_mae=0.006408 imbalance_mae=0.060457 batch_time_s=0.4686 +sample_eval_batch=13 num_steps=10 masked_mae=0.033938 left_arm_mae=0.061894 right_arm_mae=0.005982 imbalance_mae=0.055912 batch_time_s=0.4191 +sample_eval_batch=14 num_steps=10 masked_mae=0.023608 left_arm_mae=0.038211 right_arm_mae=0.009005 imbalance_mae=0.029206 batch_time_s=0.4448 +sample_eval_batch=15 num_steps=10 masked_mae=0.043359 left_arm_mae=0.016845 right_arm_mae=0.069874 imbalance_mae=0.053029 batch_time_s=0.3755 +sample_eval_batch=16 num_steps=10 masked_mae=0.037564 left_arm_mae=0.019482 right_arm_mae=0.055646 imbalance_mae=0.036164 batch_time_s=0.3432 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.030294 +sample_eval_num_steps_10_std_masked_mae: 0.007277 +sample_eval_num_steps_10_mean_left_arm_mae: 0.041307 +sample_eval_num_steps_10_std_left_arm_mae: 0.019181 +sample_eval_num_steps_10_mean_right_arm_mae: 0.019282 +sample_eval_num_steps_10_std_right_arm_mae: 0.019077 +sample_eval_num_steps_10_mean_left_joint_mae: 0.045179 +sample_eval_num_steps_10_std_left_joint_mae: 0.022508 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.014207 +sample_eval_num_steps_10_std_left_gripper_mae: 0.016425 +sample_eval_num_steps_10_mean_right_joint_mae: 0.020231 +sample_eval_num_steps_10_std_right_joint_mae: 0.020465 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.012640 +sample_eval_num_steps_10_std_right_gripper_mae: 0.018571 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.034582 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.023261 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.3823 std=0.0398 min=0.3432 max=0.4686 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_2000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_2000.log new file mode 100644 index 0000000000000000000000000000000000000000..2a4e8c9239642dfb18d2d0bdbecf49045ff1f015 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_2000.log @@ -0,0 +1,148 @@ +starting_eval config=pi05_twin_handover_256_packed_baseline_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=50 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.026397 left_arm_loss=0.019776 right_arm_loss=0.033018 imbalance=0.013242 batch_time_s=0.8257 +eval_batch=2 loss=0.009284 left_arm_loss=0.009409 right_arm_loss=0.009160 imbalance=0.000248 batch_time_s=0.2272 +eval_batch=3 loss=0.011369 left_arm_loss=0.014311 right_arm_loss=0.008427 imbalance=0.005883 batch_time_s=0.2242 +eval_batch=4 loss=0.036861 left_arm_loss=0.035507 right_arm_loss=0.038216 imbalance=0.002709 batch_time_s=0.2250 +eval_batch=5 loss=0.025353 left_arm_loss=0.036053 right_arm_loss=0.014654 imbalance=0.021399 batch_time_s=0.2409 +eval_batch=6 loss=0.046137 left_arm_loss=0.089046 right_arm_loss=0.003228 imbalance=0.085817 batch_time_s=0.2292 +eval_batch=7 loss=0.032558 left_arm_loss=0.062428 right_arm_loss=0.002688 imbalance=0.059740 batch_time_s=0.2458 +eval_batch=8 loss=0.009889 left_arm_loss=0.018301 right_arm_loss=0.001477 imbalance=0.016824 batch_time_s=0.2339 +eval_batch=9 loss=0.012928 left_arm_loss=0.024751 right_arm_loss=0.001106 imbalance=0.023645 batch_time_s=0.2444 +eval_batch=10 loss=0.019660 left_arm_loss=0.037839 right_arm_loss=0.001480 imbalance=0.036359 batch_time_s=0.2305 +eval_batch=11 loss=0.018686 left_arm_loss=0.034757 right_arm_loss=0.002615 imbalance=0.032142 batch_time_s=0.2271 +eval_batch=12 loss=0.021816 left_arm_loss=0.041631 right_arm_loss=0.002000 imbalance=0.039631 batch_time_s=0.2597 +eval_batch=13 loss=0.043429 left_arm_loss=0.084094 right_arm_loss=0.002765 imbalance=0.081329 batch_time_s=0.2239 +eval_batch=14 loss=0.042424 left_arm_loss=0.069546 right_arm_loss=0.015301 imbalance=0.054244 batch_time_s=0.2338 +eval_batch=15 loss=0.060247 left_arm_loss=0.023033 right_arm_loss=0.097461 imbalance=0.074428 batch_time_s=0.2298 +eval_batch=16 loss=0.078912 left_arm_loss=0.039402 right_arm_loss=0.118422 imbalance=0.079020 batch_time_s=0.2336 +eval_batch=17 loss=0.034687 left_arm_loss=0.061929 right_arm_loss=0.007444 imbalance=0.054485 batch_time_s=0.2323 +eval_batch=18 loss=0.049051 left_arm_loss=0.068182 right_arm_loss=0.029920 imbalance=0.038262 batch_time_s=0.2351 +eval_batch=19 loss=0.021375 left_arm_loss=0.026341 right_arm_loss=0.016409 imbalance=0.009933 batch_time_s=0.3373 +eval_batch=20 loss=0.012990 left_arm_loss=0.014628 right_arm_loss=0.011352 imbalance=0.003276 batch_time_s=0.2283 +eval_batch=21 loss=0.024979 left_arm_loss=0.038678 right_arm_loss=0.011280 imbalance=0.027397 batch_time_s=0.2371 +eval_batch=22 loss=0.056881 left_arm_loss=0.061898 right_arm_loss=0.051865 imbalance=0.010034 batch_time_s=0.2315 +eval_batch=23 loss=0.054200 left_arm_loss=0.092598 right_arm_loss=0.015802 imbalance=0.076795 batch_time_s=0.2385 +eval_batch=24 loss=0.052739 left_arm_loss=0.099767 right_arm_loss=0.005711 imbalance=0.094056 batch_time_s=0.2319 +eval_batch=25 loss=0.056198 left_arm_loss=0.111241 right_arm_loss=0.001156 imbalance=0.110086 batch_time_s=0.2370 +eval_batch=26 loss=0.029912 left_arm_loss=0.055838 right_arm_loss=0.003987 imbalance=0.051851 batch_time_s=0.2339 +eval_batch=27 loss=0.023166 left_arm_loss=0.042513 right_arm_loss=0.003819 imbalance=0.038695 batch_time_s=0.2326 +eval_batch=28 loss=0.021049 left_arm_loss=0.040540 right_arm_loss=0.001557 imbalance=0.038983 batch_time_s=0.2299 +eval_batch=29 loss=0.038624 left_arm_loss=0.074662 right_arm_loss=0.002585 imbalance=0.072077 batch_time_s=0.2458 +eval_batch=30 loss=0.054455 left_arm_loss=0.106275 right_arm_loss=0.002636 imbalance=0.103639 batch_time_s=0.2291 +eval_batch=31 loss=0.082370 left_arm_loss=0.156108 right_arm_loss=0.008633 imbalance=0.147475 batch_time_s=0.2312 +eval_batch=32 loss=0.076872 left_arm_loss=0.071586 right_arm_loss=0.082158 imbalance=0.010573 batch_time_s=0.2272 +eval_batch=33 loss=0.048834 left_arm_loss=0.020280 right_arm_loss=0.077388 imbalance=0.057109 batch_time_s=0.2342 +eval_batch=34 loss=0.073862 left_arm_loss=0.108661 right_arm_loss=0.039063 imbalance=0.069597 batch_time_s=0.2314 +eval_batch=35 loss=0.045368 left_arm_loss=0.079731 right_arm_loss=0.011004 imbalance=0.068727 batch_time_s=0.2324 +eval_batch=36 loss=0.026741 left_arm_loss=0.018015 right_arm_loss=0.035468 imbalance=0.017453 batch_time_s=0.2344 +eval_batch=37 loss=0.011712 left_arm_loss=0.014219 right_arm_loss=0.009205 imbalance=0.005013 batch_time_s=0.2332 +eval_batch=38 loss=0.046667 left_arm_loss=0.060838 right_arm_loss=0.032495 imbalance=0.028343 batch_time_s=0.2303 +eval_batch=39 loss=0.056612 left_arm_loss=0.036946 right_arm_loss=0.076279 imbalance=0.039333 batch_time_s=0.2286 +eval_batch=40 loss=0.026539 left_arm_loss=0.029187 right_arm_loss=0.023891 imbalance=0.005296 batch_time_s=0.2277 +eval_batch=41 loss=0.057449 left_arm_loss=0.112006 right_arm_loss=0.002892 imbalance=0.109114 batch_time_s=0.2290 +eval_batch=42 loss=0.025764 left_arm_loss=0.048528 right_arm_loss=0.003000 imbalance=0.045528 batch_time_s=0.2396 +eval_batch=43 loss=0.011870 left_arm_loss=0.020990 right_arm_loss=0.002750 imbalance=0.018240 batch_time_s=0.2340 +eval_batch=44 loss=0.013696 left_arm_loss=0.025204 right_arm_loss=0.002189 imbalance=0.023015 batch_time_s=0.2451 +eval_batch=45 loss=0.018640 left_arm_loss=0.034554 right_arm_loss=0.002726 imbalance=0.031828 batch_time_s=0.2325 +eval_batch=46 loss=0.026927 left_arm_loss=0.049770 right_arm_loss=0.004084 imbalance=0.045686 batch_time_s=0.2301 +eval_batch=47 loss=0.133834 left_arm_loss=0.017368 right_arm_loss=0.250299 imbalance=0.232931 batch_time_s=0.2997 +eval_batch=48 loss=0.162658 left_arm_loss=0.010945 right_arm_loss=0.314371 imbalance=0.303426 batch_time_s=0.2318 +eval_batch=49 loss=0.020931 left_arm_loss=0.005021 right_arm_loss=0.036841 imbalance=0.031820 batch_time_s=0.2316 +eval_batch=50 loss=0.086151 left_arm_loss=0.041024 right_arm_loss=0.131277 imbalance=0.090253 batch_time_s=0.2341 +config_name: pi05_twin_handover_256_packed_baseline_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/2000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 50 +mean_val_loss: 0.041595 +std_val_loss: 0.030015 +mean_left_arm_loss: 0.049919 +std_left_arm_loss: 0.033208 +mean_right_arm_loss: 0.033271 +std_right_arm_loss: 0.059873 +mean_left_joint_loss: 0.051501 +std_left_joint_loss: 0.035502 +mean_left_gripper_loss: 0.038846 +std_left_gripper_loss: 0.082622 +mean_right_joint_loss: 0.034159 +std_right_joint_loss: 0.066139 +mean_right_gripper_loss: 0.027055 +std_right_gripper_loss: 0.066540 +mean_left_right_imbalance: 0.054740 +std_left_right_imbalance: 0.055247 +per_batch_timing_seconds: mean=0.2487 std=0.0844 min=0.2239 max=0.8257 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.064759 left_arm_mae=0.058327 right_arm_mae=0.071190 imbalance_mae=0.012863 batch_time_s=0.2814 +sample_eval_batch=2 num_steps=4 masked_mae=0.040229 left_arm_mae=0.048076 right_arm_mae=0.032383 imbalance_mae=0.015693 batch_time_s=0.2705 +sample_eval_batch=3 num_steps=4 masked_mae=0.039452 left_arm_mae=0.043037 right_arm_mae=0.035867 imbalance_mae=0.007170 batch_time_s=0.2690 +sample_eval_batch=4 num_steps=4 masked_mae=0.057066 left_arm_mae=0.061438 right_arm_mae=0.052694 imbalance_mae=0.008743 batch_time_s=0.2702 +sample_eval_batch=5 num_steps=4 masked_mae=0.051264 left_arm_mae=0.059876 right_arm_mae=0.042652 imbalance_mae=0.017224 batch_time_s=0.2814 +sample_eval_batch=6 num_steps=4 masked_mae=0.079315 left_arm_mae=0.141172 right_arm_mae=0.017458 imbalance_mae=0.123714 batch_time_s=0.2681 +sample_eval_batch=7 num_steps=4 masked_mae=0.064131 left_arm_mae=0.113531 right_arm_mae=0.014731 imbalance_mae=0.098800 batch_time_s=0.2998 +sample_eval_batch=8 num_steps=4 masked_mae=0.036546 left_arm_mae=0.060300 right_arm_mae=0.012791 imbalance_mae=0.047508 batch_time_s=0.2774 +sample_eval_batch=9 num_steps=4 masked_mae=0.042204 left_arm_mae=0.072879 right_arm_mae=0.011529 imbalance_mae=0.061350 batch_time_s=0.3000 +sample_eval_batch=10 num_steps=4 masked_mae=0.053692 left_arm_mae=0.094078 right_arm_mae=0.013305 imbalance_mae=0.080773 batch_time_s=0.2674 +sample_eval_batch=11 num_steps=4 masked_mae=0.047388 left_arm_mae=0.079979 right_arm_mae=0.014798 imbalance_mae=0.065181 batch_time_s=0.3285 +sample_eval_batch=12 num_steps=4 masked_mae=0.050189 left_arm_mae=0.085965 right_arm_mae=0.014413 imbalance_mae=0.071552 batch_time_s=0.3060 +sample_eval_batch=13 num_steps=4 masked_mae=0.073749 left_arm_mae=0.132138 right_arm_mae=0.015360 imbalance_mae=0.116778 batch_time_s=0.3753 +sample_eval_batch=14 num_steps=4 masked_mae=0.082068 left_arm_mae=0.126276 right_arm_mae=0.037859 imbalance_mae=0.088417 batch_time_s=0.3704 +sample_eval_batch=15 num_steps=4 masked_mae=0.084759 left_arm_mae=0.030757 right_arm_mae=0.138762 imbalance_mae=0.108006 batch_time_s=0.3056 +sample_eval_batch=16 num_steps=4 masked_mae=0.097239 left_arm_mae=0.051779 right_arm_mae=0.142698 imbalance_mae=0.090919 batch_time_s=0.3393 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.060253 +sample_eval_num_steps_4_std_masked_mae: 0.017936 +sample_eval_num_steps_4_mean_left_arm_mae: 0.078725 +sample_eval_num_steps_4_std_left_arm_mae: 0.032786 +sample_eval_num_steps_4_mean_right_arm_mae: 0.041781 +sample_eval_num_steps_4_std_right_arm_mae: 0.040910 +sample_eval_num_steps_4_mean_left_joint_mae: 0.083688 +sample_eval_num_steps_4_std_left_joint_mae: 0.036089 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.043985 +sample_eval_num_steps_4_std_left_gripper_mae: 0.072901 +sample_eval_num_steps_4_mean_right_joint_mae: 0.042767 +sample_eval_num_steps_4_std_right_joint_mae: 0.041669 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.034874 +sample_eval_num_steps_4_std_right_gripper_mae: 0.058769 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.063418 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.039412 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.3006 std=0.0345 min=0.2674 max=0.3753 +sample_eval_batch=1 num_steps=10 masked_mae=0.071056 left_arm_mae=0.066950 right_arm_mae=0.075162 imbalance_mae=0.008212 batch_time_s=0.4220 +sample_eval_batch=2 num_steps=10 masked_mae=0.047812 left_arm_mae=0.056756 right_arm_mae=0.038868 imbalance_mae=0.017888 batch_time_s=0.3396 +sample_eval_batch=3 num_steps=10 masked_mae=0.045826 left_arm_mae=0.051423 right_arm_mae=0.040229 imbalance_mae=0.011195 batch_time_s=0.3502 +sample_eval_batch=4 num_steps=10 masked_mae=0.065155 left_arm_mae=0.070466 right_arm_mae=0.059845 imbalance_mae=0.010622 batch_time_s=0.3414 +sample_eval_batch=5 num_steps=10 masked_mae=0.057679 left_arm_mae=0.065192 right_arm_mae=0.050167 imbalance_mae=0.015025 batch_time_s=0.3405 +sample_eval_batch=6 num_steps=10 masked_mae=0.084349 left_arm_mae=0.148198 right_arm_mae=0.020499 imbalance_mae=0.127699 batch_time_s=0.3414 +sample_eval_batch=7 num_steps=10 masked_mae=0.067378 left_arm_mae=0.119032 right_arm_mae=0.015724 imbalance_mae=0.103307 batch_time_s=0.3734 +sample_eval_batch=8 num_steps=10 masked_mae=0.041997 left_arm_mae=0.070063 right_arm_mae=0.013930 imbalance_mae=0.056133 batch_time_s=0.3433 +sample_eval_batch=9 num_steps=10 masked_mae=0.048462 left_arm_mae=0.083206 right_arm_mae=0.013718 imbalance_mae=0.069487 batch_time_s=0.3682 +sample_eval_batch=10 num_steps=10 masked_mae=0.059187 left_arm_mae=0.103132 right_arm_mae=0.015243 imbalance_mae=0.087889 batch_time_s=0.4041 +sample_eval_batch=11 num_steps=10 masked_mae=0.052531 left_arm_mae=0.088090 right_arm_mae=0.016972 imbalance_mae=0.071118 batch_time_s=0.3420 +sample_eval_batch=12 num_steps=10 masked_mae=0.057733 left_arm_mae=0.096639 right_arm_mae=0.018827 imbalance_mae=0.077812 batch_time_s=0.3407 +sample_eval_batch=13 num_steps=10 masked_mae=0.078588 left_arm_mae=0.139026 right_arm_mae=0.018150 imbalance_mae=0.120876 batch_time_s=0.3427 +sample_eval_batch=14 num_steps=10 masked_mae=0.085513 left_arm_mae=0.132507 right_arm_mae=0.038519 imbalance_mae=0.093988 batch_time_s=0.3408 +sample_eval_batch=15 num_steps=10 masked_mae=0.088594 left_arm_mae=0.035055 right_arm_mae=0.142132 imbalance_mae=0.107077 batch_time_s=0.3833 +sample_eval_batch=16 num_steps=10 masked_mae=0.100376 left_arm_mae=0.056270 right_arm_mae=0.144482 imbalance_mae=0.088212 batch_time_s=0.3644 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.065765 +sample_eval_num_steps_10_std_masked_mae: 0.016923 +sample_eval_num_steps_10_mean_left_arm_mae: 0.086375 +sample_eval_num_steps_10_std_left_arm_mae: 0.032761 +sample_eval_num_steps_10_mean_right_arm_mae: 0.045154 +sample_eval_num_steps_10_std_right_arm_mae: 0.041131 +sample_eval_num_steps_10_mean_left_joint_mae: 0.092111 +sample_eval_num_steps_10_std_left_joint_mae: 0.036788 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.046224 +sample_eval_num_steps_10_std_left_gripper_mae: 0.076043 +sample_eval_num_steps_10_mean_right_joint_mae: 0.046163 +sample_eval_num_steps_10_std_right_joint_mae: 0.042138 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.038093 +sample_eval_num_steps_10_std_right_gripper_mae: 0.056179 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.066659 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.040501 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.3586 std=0.0248 min=0.3396 max=0.4220 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_5000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_5000.log new file mode 100644 index 0000000000000000000000000000000000000000..44e3d4639d53f7e82fba3edbad9146696697824c --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_baseline_10k_val_5000.log @@ -0,0 +1,148 @@ +starting_eval config=pi05_twin_handover_256_packed_baseline_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=50 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.022598 left_arm_loss=0.025243 right_arm_loss=0.019952 imbalance=0.005291 batch_time_s=0.7730 +eval_batch=2 loss=0.003257 left_arm_loss=0.002496 right_arm_loss=0.004018 imbalance=0.001522 batch_time_s=0.2212 +eval_batch=3 loss=0.003316 left_arm_loss=0.003317 right_arm_loss=0.003314 imbalance=0.000004 batch_time_s=0.2266 +eval_batch=4 loss=0.017987 left_arm_loss=0.019024 right_arm_loss=0.016950 imbalance=0.002074 batch_time_s=0.2267 +eval_batch=5 loss=0.012783 left_arm_loss=0.016612 right_arm_loss=0.008953 imbalance=0.007659 batch_time_s=0.3403 +eval_batch=6 loss=0.010879 left_arm_loss=0.020326 right_arm_loss=0.001432 imbalance=0.018894 batch_time_s=0.2233 +eval_batch=7 loss=0.012325 left_arm_loss=0.023537 right_arm_loss=0.001113 imbalance=0.022423 batch_time_s=0.2308 +eval_batch=8 loss=0.008239 left_arm_loss=0.015663 right_arm_loss=0.000815 imbalance=0.014848 batch_time_s=0.2327 +eval_batch=9 loss=0.013383 left_arm_loss=0.025981 right_arm_loss=0.000785 imbalance=0.025196 batch_time_s=0.2296 +eval_batch=10 loss=0.022633 left_arm_loss=0.044221 right_arm_loss=0.001046 imbalance=0.043176 batch_time_s=0.2398 +eval_batch=11 loss=0.015927 left_arm_loss=0.030194 right_arm_loss=0.001659 imbalance=0.028536 batch_time_s=0.2276 +eval_batch=12 loss=0.016066 left_arm_loss=0.031229 right_arm_loss=0.000902 imbalance=0.030327 batch_time_s=0.2228 +eval_batch=13 loss=0.019034 left_arm_loss=0.036426 right_arm_loss=0.001641 imbalance=0.034785 batch_time_s=0.3058 +eval_batch=14 loss=0.016662 left_arm_loss=0.016339 right_arm_loss=0.016986 imbalance=0.000647 batch_time_s=0.2226 +eval_batch=15 loss=0.055849 left_arm_loss=0.016080 right_arm_loss=0.095619 imbalance=0.079538 batch_time_s=0.2277 +eval_batch=16 loss=0.035661 left_arm_loss=0.017943 right_arm_loss=0.053379 imbalance=0.035436 batch_time_s=0.2301 +eval_batch=17 loss=0.021186 left_arm_loss=0.039219 right_arm_loss=0.003153 imbalance=0.036066 batch_time_s=0.2307 +eval_batch=18 loss=0.033071 left_arm_loss=0.046249 right_arm_loss=0.019893 imbalance=0.026356 batch_time_s=0.2317 +eval_batch=19 loss=0.010998 left_arm_loss=0.017014 right_arm_loss=0.004983 imbalance=0.012032 batch_time_s=0.2259 +eval_batch=20 loss=0.016367 left_arm_loss=0.027997 right_arm_loss=0.004737 imbalance=0.023260 batch_time_s=0.2276 +eval_batch=21 loss=0.070861 left_arm_loss=0.138458 right_arm_loss=0.003263 imbalance=0.135195 batch_time_s=0.2659 +eval_batch=22 loss=0.086826 left_arm_loss=0.136300 right_arm_loss=0.037352 imbalance=0.098947 batch_time_s=0.2656 +eval_batch=23 loss=0.041515 left_arm_loss=0.074249 right_arm_loss=0.008781 imbalance=0.065469 batch_time_s=0.2287 +eval_batch=24 loss=0.075753 left_arm_loss=0.148664 right_arm_loss=0.002842 imbalance=0.145822 batch_time_s=0.2883 +eval_batch=25 loss=0.063371 left_arm_loss=0.125955 right_arm_loss=0.000787 imbalance=0.125168 batch_time_s=0.2283 +eval_batch=26 loss=0.031963 left_arm_loss=0.061717 right_arm_loss=0.002209 imbalance=0.059508 batch_time_s=0.2304 +eval_batch=27 loss=0.029457 left_arm_loss=0.055315 right_arm_loss=0.003600 imbalance=0.051715 batch_time_s=0.2292 +eval_batch=28 loss=0.015485 left_arm_loss=0.030234 right_arm_loss=0.000735 imbalance=0.029499 batch_time_s=0.3076 +eval_batch=29 loss=0.024835 left_arm_loss=0.047639 right_arm_loss=0.002031 imbalance=0.045607 batch_time_s=0.2278 +eval_batch=30 loss=0.026867 left_arm_loss=0.050554 right_arm_loss=0.003179 imbalance=0.047374 batch_time_s=0.3279 +eval_batch=31 loss=0.048694 left_arm_loss=0.092962 right_arm_loss=0.004426 imbalance=0.088536 batch_time_s=0.3195 +eval_batch=32 loss=0.032212 left_arm_loss=0.041649 right_arm_loss=0.022774 imbalance=0.018875 batch_time_s=0.2350 +eval_batch=33 loss=0.037968 left_arm_loss=0.012033 right_arm_loss=0.063903 imbalance=0.051870 batch_time_s=0.2801 +eval_batch=34 loss=0.070101 left_arm_loss=0.121847 right_arm_loss=0.018354 imbalance=0.103493 batch_time_s=0.2352 +eval_batch=35 loss=0.036351 left_arm_loss=0.069739 right_arm_loss=0.002963 imbalance=0.066775 batch_time_s=0.2946 +eval_batch=36 loss=0.015255 left_arm_loss=0.009489 right_arm_loss=0.021021 imbalance=0.011532 batch_time_s=0.2311 +eval_batch=37 loss=0.003919 left_arm_loss=0.005172 right_arm_loss=0.002666 imbalance=0.002506 batch_time_s=0.2330 +eval_batch=38 loss=0.034404 left_arm_loss=0.039350 right_arm_loss=0.029457 imbalance=0.009893 batch_time_s=0.2376 +eval_batch=39 loss=0.031972 left_arm_loss=0.013650 right_arm_loss=0.050293 imbalance=0.036643 batch_time_s=0.2325 +eval_batch=40 loss=0.013568 left_arm_loss=0.016394 right_arm_loss=0.010741 imbalance=0.005654 batch_time_s=0.2671 +eval_batch=41 loss=0.026423 left_arm_loss=0.051625 right_arm_loss=0.001222 imbalance=0.050402 batch_time_s=0.2496 +eval_batch=42 loss=0.011443 left_arm_loss=0.021655 right_arm_loss=0.001231 imbalance=0.020424 batch_time_s=0.2390 +eval_batch=43 loss=0.004324 left_arm_loss=0.007171 right_arm_loss=0.001478 imbalance=0.005693 batch_time_s=0.2313 +eval_batch=44 loss=0.002703 left_arm_loss=0.004312 right_arm_loss=0.001093 imbalance=0.003219 batch_time_s=0.2279 +eval_batch=45 loss=0.007087 left_arm_loss=0.012914 right_arm_loss=0.001261 imbalance=0.011654 batch_time_s=0.2363 +eval_batch=46 loss=0.022314 left_arm_loss=0.043007 right_arm_loss=0.001622 imbalance=0.041385 batch_time_s=0.2282 +eval_batch=47 loss=0.029021 left_arm_loss=0.008937 right_arm_loss=0.049105 imbalance=0.040168 batch_time_s=0.3012 +eval_batch=48 loss=0.033211 left_arm_loss=0.005827 right_arm_loss=0.060594 imbalance=0.054767 batch_time_s=0.2974 +eval_batch=49 loss=0.006837 left_arm_loss=0.002519 right_arm_loss=0.011154 imbalance=0.008635 batch_time_s=0.2310 +eval_batch=50 loss=0.063237 left_arm_loss=0.031470 right_arm_loss=0.095004 imbalance=0.063534 batch_time_s=0.3002 +config_name: pi05_twin_handover_256_packed_baseline_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/handover_packed_baseline_10k/5000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 50 +mean_val_loss: 0.027324 +std_val_loss: 0.020404 +mean_left_arm_loss: 0.039118 +std_left_arm_loss: 0.037404 +mean_right_arm_loss: 0.015529 +std_right_arm_loss: 0.023314 +mean_left_joint_loss: 0.042035 +std_left_joint_loss: 0.041763 +mean_left_gripper_loss: 0.018705 +std_left_gripper_loss: 0.031815 +mean_right_joint_loss: 0.015711 +std_right_joint_loss: 0.023929 +mean_right_gripper_loss: 0.014261 +std_right_gripper_loss: 0.030013 +mean_left_right_imbalance: 0.038961 +std_left_right_imbalance: 0.035474 +per_batch_timing_seconds: mean=0.2601 std=0.0801 min=0.2212 max=0.7730 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.049127 left_arm_mae=0.051359 right_arm_mae=0.046895 imbalance_mae=0.004464 batch_time_s=0.2731 +sample_eval_batch=2 num_steps=4 masked_mae=0.021553 left_arm_mae=0.021278 right_arm_mae=0.021828 imbalance_mae=0.000550 batch_time_s=0.3528 +sample_eval_batch=3 num_steps=4 masked_mae=0.020387 left_arm_mae=0.018467 right_arm_mae=0.022306 imbalance_mae=0.003839 batch_time_s=0.2626 +sample_eval_batch=4 num_steps=4 masked_mae=0.035600 left_arm_mae=0.030283 right_arm_mae=0.040917 imbalance_mae=0.010633 batch_time_s=0.2712 +sample_eval_batch=5 num_steps=4 masked_mae=0.032516 left_arm_mae=0.037471 right_arm_mae=0.027560 imbalance_mae=0.009911 batch_time_s=0.2688 +sample_eval_batch=6 num_steps=4 masked_mae=0.034533 left_arm_mae=0.058071 right_arm_mae=0.010994 imbalance_mae=0.047077 batch_time_s=0.3179 +sample_eval_batch=7 num_steps=4 masked_mae=0.035423 left_arm_mae=0.061402 right_arm_mae=0.009444 imbalance_mae=0.051958 batch_time_s=0.3146 +sample_eval_batch=8 num_steps=4 masked_mae=0.026805 left_arm_mae=0.046320 right_arm_mae=0.007290 imbalance_mae=0.039029 batch_time_s=0.3397 +sample_eval_batch=9 num_steps=4 masked_mae=0.040398 left_arm_mae=0.072072 right_arm_mae=0.008723 imbalance_mae=0.063349 batch_time_s=0.3298 +sample_eval_batch=10 num_steps=4 masked_mae=0.050191 left_arm_mae=0.090027 right_arm_mae=0.010354 imbalance_mae=0.079673 batch_time_s=0.2585 +sample_eval_batch=11 num_steps=4 masked_mae=0.034508 left_arm_mae=0.059912 right_arm_mae=0.009105 imbalance_mae=0.050807 batch_time_s=0.2612 +sample_eval_batch=12 num_steps=4 masked_mae=0.041212 left_arm_mae=0.073254 right_arm_mae=0.009170 imbalance_mae=0.064084 batch_time_s=0.2658 +sample_eval_batch=13 num_steps=4 masked_mae=0.035764 left_arm_mae=0.060856 right_arm_mae=0.010673 imbalance_mae=0.050183 batch_time_s=0.3511 +sample_eval_batch=14 num_steps=4 masked_mae=0.035192 left_arm_mae=0.048918 right_arm_mae=0.021465 imbalance_mae=0.027453 batch_time_s=0.2694 +sample_eval_batch=15 num_steps=4 masked_mae=0.081409 left_arm_mae=0.025452 right_arm_mae=0.137367 imbalance_mae=0.111915 batch_time_s=0.2695 +sample_eval_batch=16 num_steps=4 masked_mae=0.060902 left_arm_mae=0.032682 right_arm_mae=0.089121 imbalance_mae=0.056439 batch_time_s=0.2659 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.039720 +sample_eval_num_steps_4_std_masked_mae: 0.014654 +sample_eval_num_steps_4_mean_left_arm_mae: 0.049239 +sample_eval_num_steps_4_std_left_arm_mae: 0.019869 +sample_eval_num_steps_4_mean_right_arm_mae: 0.030201 +sample_eval_num_steps_4_std_right_arm_mae: 0.034473 +sample_eval_num_steps_4_mean_left_joint_mae: 0.052215 +sample_eval_num_steps_4_std_left_joint_mae: 0.023235 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.028408 +sample_eval_num_steps_4_std_left_gripper_mae: 0.028427 +sample_eval_num_steps_4_mean_right_joint_mae: 0.031159 +sample_eval_num_steps_4_std_right_joint_mae: 0.037572 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.023490 +sample_eval_num_steps_4_std_right_gripper_mae: 0.024208 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.041960 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.030152 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.2920 std=0.0342 min=0.2585 max=0.3528 +sample_eval_batch=1 num_steps=10 masked_mae=0.058142 left_arm_mae=0.062580 right_arm_mae=0.053705 imbalance_mae=0.008875 batch_time_s=0.3521 +sample_eval_batch=2 num_steps=10 masked_mae=0.027516 left_arm_mae=0.027185 right_arm_mae=0.027846 imbalance_mae=0.000661 batch_time_s=0.3613 +sample_eval_batch=3 num_steps=10 masked_mae=0.026459 left_arm_mae=0.024776 right_arm_mae=0.028142 imbalance_mae=0.003366 batch_time_s=0.3707 +sample_eval_batch=4 num_steps=10 masked_mae=0.042321 left_arm_mae=0.037100 right_arm_mae=0.047541 imbalance_mae=0.010441 batch_time_s=0.4291 +sample_eval_batch=5 num_steps=10 masked_mae=0.035501 left_arm_mae=0.039882 right_arm_mae=0.031121 imbalance_mae=0.008761 batch_time_s=0.3789 +sample_eval_batch=6 num_steps=10 masked_mae=0.037181 left_arm_mae=0.063069 right_arm_mae=0.011292 imbalance_mae=0.051776 batch_time_s=0.3463 +sample_eval_batch=7 num_steps=10 masked_mae=0.037960 left_arm_mae=0.065358 right_arm_mae=0.010561 imbalance_mae=0.054798 batch_time_s=0.3618 +sample_eval_batch=8 num_steps=10 masked_mae=0.030014 left_arm_mae=0.052116 right_arm_mae=0.007913 imbalance_mae=0.044203 batch_time_s=0.4241 +sample_eval_batch=9 num_steps=10 masked_mae=0.045459 left_arm_mae=0.080979 right_arm_mae=0.009940 imbalance_mae=0.071039 batch_time_s=0.4006 +sample_eval_batch=10 num_steps=10 masked_mae=0.052380 left_arm_mae=0.092981 right_arm_mae=0.011778 imbalance_mae=0.081203 batch_time_s=0.4774 +sample_eval_batch=11 num_steps=10 masked_mae=0.036979 left_arm_mae=0.064074 right_arm_mae=0.009883 imbalance_mae=0.054191 batch_time_s=0.4397 +sample_eval_batch=12 num_steps=10 masked_mae=0.044283 left_arm_mae=0.078149 right_arm_mae=0.010416 imbalance_mae=0.067733 batch_time_s=0.3574 +sample_eval_batch=13 num_steps=10 masked_mae=0.037810 left_arm_mae=0.063530 right_arm_mae=0.012089 imbalance_mae=0.051441 batch_time_s=0.3996 +sample_eval_batch=14 num_steps=10 masked_mae=0.037400 left_arm_mae=0.052177 right_arm_mae=0.022623 imbalance_mae=0.029554 batch_time_s=0.3962 +sample_eval_batch=15 num_steps=10 masked_mae=0.080721 left_arm_mae=0.024507 right_arm_mae=0.136936 imbalance_mae=0.112428 batch_time_s=0.4037 +sample_eval_batch=16 num_steps=10 masked_mae=0.063413 left_arm_mae=0.032152 right_arm_mae=0.094674 imbalance_mae=0.062522 batch_time_s=0.4226 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.043346 +sample_eval_num_steps_10_std_masked_mae: 0.013818 +sample_eval_num_steps_10_mean_left_arm_mae: 0.053788 +sample_eval_num_steps_10_std_left_arm_mae: 0.020493 +sample_eval_num_steps_10_mean_right_arm_mae: 0.032904 +sample_eval_num_steps_10_std_right_arm_mae: 0.034889 +sample_eval_num_steps_10_mean_left_joint_mae: 0.057689 +sample_eval_num_steps_10_std_left_joint_mae: 0.024439 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.026486 +sample_eval_num_steps_10_std_left_gripper_mae: 0.029864 +sample_eval_num_steps_10_mean_right_joint_mae: 0.033700 +sample_eval_num_steps_10_std_right_joint_mae: 0.038002 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.027331 +sample_eval_num_steps_10_std_right_gripper_mae: 0.027093 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.044562 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.030999 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.3951 std=0.0357 min=0.3463 max=0.4774 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k.log new file mode 100644 index 0000000000000000000000000000000000000000..36e2ac9a99b39abcab1dd1313ea6fd6a058cd79b --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k.log @@ -0,0 +1,1154 @@ +W0309 18:41:27.847000 18566 torch/distributed/run.py:766] +W0309 18:41:27.847000 18566 torch/distributed/run.py:766] ***************************************** +W0309 18:41:27.847000 18566 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0309 18:41:27.847000 18566 torch/distributed/run.py:766] ***************************************** +18:42:25.260 [I] Created experiment checkpoint directory: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k (18633:train_pytorch.py:505) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank0]:[W309 18:42:25.838770876 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank3]:[W309 18:42:25.864279915 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank2]:[W309 18:42:27.891368769 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank1]:[W309 18:42:29.778079486 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +18:42:30.660 [I] Using batch size per GPU: 4 (total batch size across 4 GPUs: 16) (18633:train_pytorch.py:524) +18:42:30.797 [I] Loaded norm stats from /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train (18633:config.py:234) +18:42:30.800 [I] data_config: DataConfig(repo_id='lsnu/twin_handover_256_train', asset_id='lsnu/twin_handover_256_train', norm_stats={'state': NormStats(mean=array([ 0.40321857, 0.17899239, -0.07588876, -2.06326795, -0.46418607, + 1.79356563, 0.70229131, 0.48194093, 0.93952829, 0.86693275, + -1.03168762, -1.9056077 , -0.53421056, 1.87584054, 2.36738205, + 0.91249251]), std=array([0.73344636, 0.47653052, 0.72710407, 0.42399687, 0.63613892, + 0.61144608, 1.11724186, 0.49967375, 0.86981195, 0.75071597, + 0.90787333, 0.35008711, 0.51183224, 0.36600712, 0.56947577, + 0.28257725]), q01=array([-1.52408956, -1.32446341, -1.91092197, -2.89885788, -1.66315554, + 0.59010215, -2.27611645, 0. , -1.77352981, -1.62131719, + -1.77092851, -2.19172778, -2.03159353, 0.55409113, 0.79255736, + 0. ]), q99=array([ 2.16638614, 1.38857444, 1.93436338, -0.88548369, 1.39976143, + 2.99162304, 2.8194857 , 0.9998 , 1.46557211, 1.74660106, + 1.58644652, -0.87876934, 2.25910752, 2.54628449, 2.89347284, + 0.9998 ])), 'actions': NormStats(mean=array([ 0.05879939, -0.00704042, -0.02719213, -0.07685276, -0.07520971, + -0.00498583, 0.03577602, 0.48164892, 0.06564316, 0.06023132, + -0.10068271, -0.09547432, -0.0526481 , 0.08205888, 0.13954687, + 0.88333535]), std=array([0.18337056, 0.28128958, 0.18525195, 0.29767084, 0.22944973, + 0.40312037, 0.3896611 , 0.49966311, 0.21938531, 0.16883859, + 0.20206179, 0.14864719, 0.12629333, 0.15546791, 0.23423795, + 0.32102022]), q01=array([-0.34140511, -0.71597991, -0.55301429, -0.8233152 , -0.68097536, + -0.87723451, -0.86000918, 0. , -0.53261366, -0.49289397, + -0.48524564, -0.35752607, -0.42426748, -0.18230745, -0.09212705, + 0. ]), q99=array([0.55444025, 0.69361174, 0.44115428, 0.550829 , 0.49707318, + 0.68353445, 0.82907713, 0.9998 , 0.42654409, 0.44255511, + 0.4114292 , 0.01550327, 0.38038206, 0.71452535, 0.62808441, + 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (18633:data_loader.py:283) +18:42:30.806 [I] Using existing local LeRobot dataset mirror for lsnu/twin_handover_256_train: /workspace/lerobot/lsnu/twin_handover_256_train (18633:data_loader.py:149) +18:42:35.224 [I] local_batch_size: 4 (18633:data_loader.py:364) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +18:44:11.355 [I] Enabled gradient checkpointing for PI0Pytorch model (18633:pi0_pytorch.py:150) +18:44:11.359 [I] Enabled gradient checkpointing for memory optimization (18633:train_pytorch.py:596) +18:44:11.361 [I] Step 0 (after_model_creation): GPU memory - allocated: 7.48GB, reserved: 7.48GB, free: 0.00GB, peak_allocated: 7.48GB, peak_reserved: 7.48GB | DDP: rank=0, world_size=4 (18633:train_pytorch.py:465) +18:44:29.950 [I] Loading weights from: /workspace/checkpoints/pi05_base_parallel_packed_from_single (18633:train_pytorch.py:625) +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +18:44:31.818 [I] Weight loading missing key count: 0 (18633:train_pytorch.py:629) +18:44:31.818 [I] Weight loading missing keys: set() (18633:train_pytorch.py:630) +18:44:31.819 [I] Weight loading unexpected key count: 0 (18633:train_pytorch.py:631) +18:44:31.819 [I] Weight loading unexpected keys: [] (18633:train_pytorch.py:632) +18:44:31.819 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_parallel_packed_from_single (18633:train_pytorch.py:633) +18:44:31.822 [I] Running on: 9a96de7d560b | world_size=4 (18633:train_pytorch.py:673) +18:44:31.823 [I] Training config: batch_size=16, effective_batch_size=4, num_train_steps=10000 (18633:train_pytorch.py:674) +18:44:31.823 [I] Memory optimizations: gradient_checkpointing=True (18633:train_pytorch.py:677) +18:44:31.823 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (18633:train_pytorch.py:678) +18:44:31.824 [I] LR schedule: warmup=500, peak_lr=2.50e-05, decay_steps=10000, end_lr=2.50e-06 (18633:train_pytorch.py:679) +18:44:31.824 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (18633:train_pytorch.py:682) +18:44:31.824 [I] EMA is not supported for PyTorch training (18633:train_pytorch.py:685) +18:44:31.824 [I] Training precision: bfloat16 (18633:train_pytorch.py:686) +18:44:31.826 [I] Resolved config name: pi05_twin_handover_256_packed_parallel_pytorch_10k (18633:train_pytorch.py:280) +18:44:31.827 [I] Dataset repo_id: lsnu/twin_handover_256_train (18633:train_pytorch.py:281) +18:44:31.827 [I] Norm-stats file path: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (18633:train_pytorch.py:282) +18:44:31.827 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (18633:train_pytorch.py:283) +18:44:31.827 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_parallel_packed_from_single (18633:train_pytorch.py:284) +18:44:31.827 [I] Model type: parallel (18633:train_pytorch.py:285) +18:44:31.828 [I] Packed transforms active: True (18633:train_pytorch.py:286) +18:44:31.828 [I] World size: 4 (18633:train_pytorch.py:287) +18:44:31.828 [I] Batch size: local=4, global=16 (18633:train_pytorch.py:288) +18:44:31.828 [I] num_workers: 8 (18633:train_pytorch.py:289) +18:44:31.828 [I] Precision: bfloat16 (18633:train_pytorch.py:290) +18:44:31.828 [I] LR schedule summary: warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (18633:train_pytorch.py:291) +18:44:31.828 [I] Save/log intervals: save_interval=1000, log_interval=10 (18633:train_pytorch.py:298) +18:44:31.828 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (18633:train_pytorch.py:299) +18:44:31.828 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (18633:train_pytorch.py:300) +18:44:31.829 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (18633:train_pytorch.py:301) +18:44:31.829 [I] Gradient bucket diagnostics: action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert (18633:train_pytorch.py:694) + Training: 0%| | 0/10000 [00:00 /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000 (18633:train_pytorch.py:350) + Training: 10%|█ | 1000/10000 [13:42<85:11:52, 34.08s/it, loss=0.0436, lr=2.48e-05, step=999] Training: 10%|█ | 1000/10000 [13:42<85:11:52, 34.08s/it, loss=0.0246, lr=2.48e-05, step=1000] Training: 10%|█ | 1001/10000 [13:42<60:07:23, 24.05s/it, loss=0.0246, lr=2.48e-05, step=1000] Training: 10%|█ | 1001/10000 [13:42<60:07:23, 24.05s/it, loss=0.0384, lr=2.48e-05, step=1001] Training: 10%|█ | 1002/10000 [13:43<42:39:14, 17.07s/it, loss=0.0384, lr=2.48e-05, step=1001] Training: 10%|█ | 1002/10000 [13:43<42:39:14, 17.07s/it, loss=0.0319, lr=2.48e-05, step=1002] Training: 10%|█ | 1003/10000 [13:44<30:20:14, 12.14s/it, loss=0.0319, lr=2.48e-05, step=1002] Training: 10%|█ | 1003/10000 [13:44<30:20:14, 12.14s/it, loss=0.0552, lr=2.48e-05, step=1003] Training: 10%|█ | 1004/10000 [13:45<21:48:12, 8.73s/it, loss=0.0552, lr=2.48e-05, step=1003] Training: 10%|█ | 1004/10000 [13:45<21:48:12, 8.73s/it, loss=0.1076, lr=2.48e-05, step=1004] Training: 10%|█ | 1005/10000 [13:45<15:37:55, 6.26s/it, loss=0.1076, lr=2.48e-05, step=1004] Training: 10%|█ | 1005/10000 [13:45<15:37:55, 6.26s/it, loss=0.0218, lr=2.48e-05, step=1005] Training: 10%|█ | 1006/10000 [13:46<11:23:35, 4.56s/it, loss=0.0218, lr=2.48e-05, step=1005] Training: 10%|█ | 1006/10000 [13:46<11:23:35, 4.56s/it, loss=0.0289, lr=2.48e-05, step=1006] Training: 10%|█ | 1007/10000 [13:47<8:35:14, 3.44s/it, loss=0.0289, lr=2.48e-05, step=1006] Training: 10%|█ | 1007/10000 [13:47<8:35:14, 3.44s/it, loss=0.0490, lr=2.48e-05, step=1007] Training: 10%|█ | 1008/10000 [13:47<6:23:03, 2.56s/it, loss=0.0490, lr=2.48e-05, step=1007] Training: 10%|█ | 1008/10000 [13:47<6:23:03, 2.56s/it, loss=0.0322, lr=2.48e-05, step=1008] Training: 10%|█ | 1009/10000 [13:48<4:50:00, 1.94s/it, loss=0.0322, lr=2.48e-05, step=1008] Training: 10%|█ | 1009/10000 [13:48<4:50:00, 1.94s/it, loss=0.0249, lr=2.48e-05, step=1009]18:58:20.550 [I] step=1010 loss=0.0290 smoothed_loss=0.0429 lr=2.48e-05 grad_norm=0.9173 step_time=0.5516s data_time=11.2188s it/s=0.085 eta_to_10000=105814.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1944 grad_arm_token_fuse=0.0948 grad_shared_expert=0.7120 (18633:train_pytorch.py:850) + Training: 10%|█ | 1010/10000 [13:48<3:54:07, 1.56s/it, loss=0.0249, lr=2.48e-05, step=1009] Training: 10%|█ | 1010/10000 [13:48<3:54:07, 1.56s/it, loss=0.0290, lr=2.48e-05, step=1010] Training: 10%|█ | 1011/10000 [13:49<3:16:24, 1.31s/it, loss=0.0290, lr=2.48e-05, step=1010] Training: 10%|█ | 1011/10000 [13:49<3:16:24, 1.31s/it, loss=0.0423, lr=2.48e-05, step=1011] Training: 10%|█ | 1012/10000 [13:50<2:42:56, 1.09s/it, loss=0.0423, lr=2.48e-05, step=1011] Training: 10%|█ | 1012/10000 [13:50<2:42:56, 1.09s/it, loss=0.0412, lr=2.48e-05, step=1012] Training: 10%|█ | 1013/10000 [13:50<2:34:32, 1.03s/it, loss=0.0412, lr=2.48e-05, step=1012] Training: 10%|█ | 1013/10000 [13:50<2:34:32, 1.03s/it, loss=0.0639, lr=2.48e-05, step=1013] Training: 10%|█ | 1014/10000 [13:51<2:21:38, 1.06it/s, loss=0.0639, lr=2.48e-05, step=1013] Training: 10%|█ | 1014/10000 [13:51<2:21:38, 1.06it/s, loss=0.0298, lr=2.48e-05, step=1014] Training: 10%|█ | 1015/10000 [13:52<2:02:14, 1.22it/s, loss=0.0298, lr=2.48e-05, step=1014] Training: 10%|█ | 1015/10000 [13:52<2:02:14, 1.22it/s, loss=0.0451, lr=2.48e-05, step=1015] Training: 10%|█ | 1016/10000 [13:52<1:58:05, 1.27it/s, loss=0.0451, lr=2.48e-05, step=1015] Training: 10%|█ | 1016/10000 [13:52<1:58:05, 1.27it/s, loss=0.0323, lr=2.48e-05, step=1016] Training: 10%|█ | 1017/10000 [13:53<1:53:26, 1.32it/s, loss=0.0323, lr=2.48e-05, step=1016] Training: 10%|█ | 1017/10000 [13:53<1:53:26, 1.32it/s, loss=0.0223, lr=2.48e-05, step=1017] Training: 10%|█ | 1018/10000 [13:54<1:47:58, 1.39it/s, loss=0.0223, lr=2.48e-05, step=1017] Training: 10%|█ | 1018/10000 [13:54<1:47:58, 1.39it/s, loss=0.0387, lr=2.48e-05, step=1018] Training: 10%|█ | 1019/10000 [13:54<1:43:19, 1.45it/s, loss=0.0387, lr=2.48e-05, step=1018] Training: 10%|█ | 1019/10000 [13:54<1:43:19, 1.45it/s, loss=0.0392, lr=2.48e-05, step=1019]18:58:27.458 [I] step=1020 loss=0.0388 smoothed_loss=0.0400 lr=2.48e-05 grad_norm=0.8194 step_time=0.5832s data_time=0.1075s it/s=1.448 eta_to_10000=6201.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.1844 grad_arm_token_fuse=0.0480 grad_shared_expert=0.5752 (18633:train_pytorch.py:850) + Training: 10%|█ | 1020/10000 [13:55<1:47:53, 1.39it/s, loss=0.0392, lr=2.48e-05, step=1019] Training: 10%|█ | 1020/10000 [13:55<1:47:53, 1.39it/s, loss=0.0388, lr=2.48e-05, step=1020] Training: 10%|█ | 1021/10000 [13:56<1:55:01, 1.30it/s, loss=0.0388, lr=2.48e-05, step=1020] Training: 10%|█ | 1021/10000 [13:56<1:55:01, 1.30it/s, loss=0.0523, lr=2.48e-05, step=1021] Training: 10%|█ | 1022/10000 [13:57<1:43:37, 1.44it/s, loss=0.0523, lr=2.48e-05, step=1021] Training: 10%|█ | 1022/10000 [13:57<1:43:37, 1.44it/s, loss=0.0515, lr=2.48e-05, step=1022] Training: 10%|█ | 1023/10000 [13:57<1:36:36, 1.55it/s, loss=0.0515, lr=2.48e-05, step=1022] Training: 10%|█ | 1023/10000 [13:57<1:36:36, 1.55it/s, loss=0.0210, lr=2.48e-05, step=1023] Training: 10%|█ | 1024/10000 [13:58<1:42:24, 1.46it/s, loss=0.0210, lr=2.48e-05, step=1023] Training: 10%|█ | 1024/10000 [13:58<1:42:24, 1.46it/s, loss=0.0443, lr=2.48e-05, step=1024] Training: 10%|█ | 1025/10000 [13:58<1:41:26, 1.47it/s, loss=0.0443, lr=2.48e-05, step=1024] Training: 10%|█ | 1025/10000 [13:59<1:41:26, 1.47it/s, loss=0.0391, lr=2.48e-05, step=1025] Training: 10%|█ | 1026/10000 [13:59<1:38:43, 1.52it/s, loss=0.0391, lr=2.48e-05, step=1025] Training: 10%|█ | 1026/10000 [13:59<1:38:43, 1.52it/s, loss=0.0503, lr=2.48e-05, step=1026] Training: 10%|█ | 1027/10000 [14:00<1:41:39, 1.47it/s, loss=0.0503, lr=2.48e-05, step=1026] Training: 10%|█ | 1027/10000 [14:00<1:41:39, 1.47it/s, loss=0.1119, lr=2.48e-05, step=1027] Training: 10%|█ | 1028/10000 [14:01<1:45:42, 1.41it/s, loss=0.1119, lr=2.48e-05, step=1027] Training: 10%|█ | 1028/10000 [14:01<1:45:42, 1.41it/s, loss=0.0432, lr=2.48e-05, step=1028] Training: 10%|█ | 1029/10000 [14:01<1:38:58, 1.51it/s, loss=0.0432, lr=2.48e-05, step=1028] Training: 10%|█ | 1029/10000 [14:01<1:38:58, 1.51it/s, loss=0.0402, lr=2.48e-05, step=1029]18:58:34.140 [I] step=1030 loss=0.0220 smoothed_loss=0.0446 lr=2.48e-05 grad_norm=0.8237 step_time=0.5601s data_time=0.1080s it/s=1.497 eta_to_10000=5992.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0175 grad_action_out_proj_arms=0.1958 grad_arm_token_fuse=0.0912 grad_shared_expert=0.7368 (18633:train_pytorch.py:850) + Training: 10%|█ | 1030/10000 [14:02<1:38:00, 1.53it/s, loss=0.0402, lr=2.48e-05, step=1029] Training: 10%|█ | 1030/10000 [14:02<1:38:00, 1.53it/s, loss=0.0220, lr=2.48e-05, step=1030] Training: 10%|█ | 1031/10000 [14:02<1:34:17, 1.59it/s, loss=0.0220, lr=2.48e-05, step=1030] Training: 10%|█ | 1031/10000 [14:02<1:34:17, 1.59it/s, loss=0.0430, lr=2.48e-05, step=1031] Training: 10%|█ | 1032/10000 [14:03<1:34:41, 1.58it/s, loss=0.0430, lr=2.48e-05, step=1031] Training: 10%|█ | 1032/10000 [14:03<1:34:41, 1.58it/s, loss=0.0434, lr=2.48e-05, step=1032] Training: 10%|█ | 1033/10000 [14:04<1:33:44, 1.59it/s, loss=0.0434, lr=2.48e-05, step=1032] Training: 10%|█ | 1033/10000 [14:04<1:33:44, 1.59it/s, loss=0.0460, lr=2.48e-05, step=1033] Training: 10%|█ | 1034/10000 [14:04<1:33:02, 1.61it/s, loss=0.0460, lr=2.48e-05, step=1033] Training: 10%|█ | 1034/10000 [14:04<1:33:02, 1.61it/s, loss=0.0097, lr=2.48e-05, step=1034] Training: 10%|█ | 1035/10000 [14:05<1:29:26, 1.67it/s, loss=0.0097, lr=2.48e-05, step=1034] Training: 10%|█ | 1035/10000 [14:05<1:29:26, 1.67it/s, loss=0.0475, lr=2.48e-05, step=1035] Training: 10%|█ | 1036/10000 [14:06<1:37:50, 1.53it/s, loss=0.0475, lr=2.48e-05, step=1035] Training: 10%|█ | 1036/10000 [14:06<1:37:50, 1.53it/s, loss=0.0248, lr=2.48e-05, step=1036] Training: 10%|█ | 1037/10000 [14:06<1:32:31, 1.61it/s, loss=0.0248, lr=2.48e-05, step=1036] Training: 10%|█ | 1037/10000 [14:06<1:32:31, 1.61it/s, loss=0.0196, lr=2.48e-05, step=1037] Training: 10%|█ | 1038/10000 [14:07<1:39:02, 1.51it/s, loss=0.0196, lr=2.48e-05, step=1037] Training: 10%|█ | 1038/10000 [14:07<1:39:02, 1.51it/s, loss=0.0612, lr=2.48e-05, step=1038] Training: 10%|█ | 1039/10000 [14:07<1:31:16, 1.64it/s, loss=0.0612, lr=2.48e-05, step=1038] Training: 10%|█ | 1039/10000 [14:07<1:31:16, 1.64it/s, loss=0.0344, lr=2.48e-05, step=1039]18:58:40.298 [I] step=1040 loss=0.0793 smoothed_loss=0.0437 lr=2.48e-05 grad_norm=0.8432 step_time=0.5297s data_time=0.0861s it/s=1.624 eta_to_10000=5516.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0139 grad_action_out_proj_arms=0.1995 grad_arm_token_fuse=0.0709 grad_shared_expert=0.8590 (18633:train_pytorch.py:850) + Training: 10%|█ | 1040/10000 [14:08<1:30:47, 1.64it/s, loss=0.0344, lr=2.48e-05, step=1039] Training: 10%|█ | 1040/10000 [14:08<1:30:47, 1.64it/s, loss=0.0793, lr=2.48e-05, step=1040] Training: 10%|█ | 1041/10000 [14:09<1:28:40, 1.68it/s, loss=0.0793, lr=2.48e-05, step=1040] Training: 10%|█ | 1041/10000 [14:09<1:28:40, 1.68it/s, loss=0.0152, lr=2.48e-05, step=1041] Training: 10%|█ | 1042/10000 [14:09<1:28:54, 1.68it/s, loss=0.0152, lr=2.48e-05, step=1041] Training: 10%|█ | 1042/10000 [14:09<1:28:54, 1.68it/s, loss=0.0246, lr=2.48e-05, step=1042] Training: 10%|█ | 1043/10000 [14:10<1:55:46, 1.29it/s, loss=0.0246, lr=2.48e-05, step=1042] Training: 10%|█ | 1043/10000 [14:10<1:55:46, 1.29it/s, loss=0.0472, lr=2.48e-05, step=1043] Training: 10%|█ | 1044/10000 [14:11<1:47:23, 1.39it/s, loss=0.0472, lr=2.48e-05, step=1043] Training: 10%|█ | 1044/10000 [14:11<1:47:23, 1.39it/s, loss=0.0223, lr=2.48e-05, step=1044] Training: 10%|█ | 1045/10000 [14:12<1:55:23, 1.29it/s, loss=0.0223, lr=2.48e-05, step=1044] Training: 10%|█ | 1045/10000 [14:12<1:55:23, 1.29it/s, loss=0.0330, lr=2.48e-05, step=1045] Training: 10%|█ | 1046/10000 [14:12<1:49:53, 1.36it/s, loss=0.0330, lr=2.48e-05, step=1045] Training: 10%|█ | 1046/10000 [14:12<1:49:53, 1.36it/s, loss=0.0311, lr=2.48e-05, step=1046] Training: 10%|█ | 1047/10000 [14:13<1:44:38, 1.43it/s, loss=0.0311, lr=2.48e-05, step=1046] Training: 10%|█ | 1047/10000 [14:13<1:44:38, 1.43it/s, loss=0.0411, lr=2.48e-05, step=1047] Training: 10%|█ | 1048/10000 [14:14<1:42:08, 1.46it/s, loss=0.0411, lr=2.48e-05, step=1047] Training: 10%|█ | 1048/10000 [14:14<1:42:08, 1.46it/s, loss=0.0557, lr=2.48e-05, step=1048] Training: 10%|█ | 1049/10000 [14:14<1:36:43, 1.54it/s, loss=0.0557, lr=2.48e-05, step=1048] Training: 10%|█ | 1049/10000 [14:14<1:36:43, 1.54it/s, loss=0.0327, lr=2.48e-05, step=1049]18:58:47.509 [I] step=1050 loss=0.0305 smoothed_loss=0.0378 lr=2.48e-05 grad_norm=0.9045 step_time=0.5945s data_time=0.1267s it/s=1.387 eta_to_10000=6453.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0147 grad_action_out_proj_arms=0.2151 grad_arm_token_fuse=0.0802 grad_shared_expert=0.6648 (18633:train_pytorch.py:850) + Training: 10%|█ | 1050/10000 [14:15<1:47:28, 1.39it/s, loss=0.0327, lr=2.48e-05, step=1049] Training: 10%|█ | 1050/10000 [14:15<1:47:28, 1.39it/s, loss=0.0305, lr=2.48e-05, step=1050] Training: 11%|█ | 1051/10000 [14:16<1:41:08, 1.47it/s, loss=0.0305, lr=2.48e-05, step=1050] Training: 11%|█ | 1051/10000 [14:16<1:41:08, 1.47it/s, loss=0.0479, lr=2.48e-05, step=1051] Training: 11%|█ | 1052/10000 [14:17<1:56:00, 1.29it/s, loss=0.0479, lr=2.48e-05, step=1051] Training: 11%|█ | 1052/10000 [14:17<1:56:00, 1.29it/s, loss=0.0276, lr=2.48e-05, step=1052] Training: 11%|█ | 1053/10000 [14:18<1:58:24, 1.26it/s, loss=0.0276, lr=2.48e-05, step=1052] Training: 11%|█ | 1053/10000 [14:18<1:58:24, 1.26it/s, loss=0.0486, lr=2.48e-05, step=1053] Training: 11%|█ | 1054/10000 [14:18<1:45:36, 1.41it/s, loss=0.0486, lr=2.48e-05, step=1053] Training: 11%|█ | 1054/10000 [14:18<1:45:36, 1.41it/s, loss=0.0530, lr=2.48e-05, step=1054] Training: 11%|█ | 1055/10000 [14:19<1:47:54, 1.38it/s, loss=0.0530, lr=2.48e-05, step=1054] Training: 11%|█ | 1055/10000 [14:19<1:47:54, 1.38it/s, loss=0.0410, lr=2.48e-05, step=1055] Training: 11%|█ | 1056/10000 [14:19<1:41:29, 1.47it/s, loss=0.0410, lr=2.48e-05, step=1055] Training: 11%|█ | 1056/10000 [14:19<1:41:29, 1.47it/s, loss=0.0243, lr=2.48e-05, step=1056] Training: 11%|█ | 1057/10000 [14:20<1:47:49, 1.38it/s, loss=0.0243, lr=2.48e-05, step=1056] Training: 11%|█ | 1057/10000 [14:20<1:47:49, 1.38it/s, loss=0.0443, lr=2.48e-05, step=1057] Training: 11%|█ | 1058/10000 [14:21<1:49:40, 1.36it/s, loss=0.0443, lr=2.48e-05, step=1057] Training: 11%|█ | 1058/10000 [14:21<1:49:40, 1.36it/s, loss=0.0515, lr=2.48e-05, step=1058] Training: 11%|█ | 1059/10000 [14:22<1:46:53, 1.39it/s, loss=0.0515, lr=2.48e-05, step=1058] Training: 11%|█ | 1059/10000 [14:22<1:46:53, 1.39it/s, loss=0.0483, lr=2.48e-05, step=1059]18:58:54.711 [I] step=1060 loss=0.0473 smoothed_loss=0.0419 lr=2.48e-05 grad_norm=0.8403 step_time=0.5853s data_time=0.1348s it/s=1.389 eta_to_10000=6437.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0191 grad_action_out_proj_arms=0.2407 grad_arm_token_fuse=0.1053 grad_shared_expert=0.5989 (18633:train_pytorch.py:850) + Training: 11%|█ | 1060/10000 [14:22<1:44:44, 1.42it/s, loss=0.0483, lr=2.48e-05, step=1059] Training: 11%|█ | 1060/10000 [14:22<1:44:44, 1.42it/s, loss=0.0473, lr=2.48e-05, step=1060] Training: 11%|█ | 1061/10000 [14:23<1:36:05, 1.55it/s, loss=0.0473, lr=2.48e-05, step=1060] Training: 11%|█ | 1061/10000 [14:23<1:36:05, 1.55it/s, loss=0.0435, lr=2.48e-05, step=1061] Training: 11%|█ | 1062/10000 [14:23<1:31:07, 1.63it/s, loss=0.0435, lr=2.48e-05, step=1061] Training: 11%|█ | 1062/10000 [14:23<1:31:07, 1.63it/s, loss=0.0263, lr=2.48e-05, step=1062] Training: 11%|█ | 1063/10000 [14:24<1:35:57, 1.55it/s, loss=0.0263, lr=2.48e-05, step=1062] Training: 11%|█ | 1063/10000 [14:24<1:35:57, 1.55it/s, loss=0.0753, lr=2.48e-05, step=1063] Training: 11%|█ | 1064/10000 [14:25<1:43:33, 1.44it/s, loss=0.0753, lr=2.48e-05, step=1063] Training: 11%|█ | 1064/10000 [14:25<1:43:33, 1.44it/s, loss=0.0451, lr=2.48e-05, step=1064] Training: 11%|█ | 1065/10000 [14:26<1:40:37, 1.48it/s, loss=0.0451, lr=2.48e-05, step=1064] Training: 11%|█ | 1065/10000 [14:26<1:40:37, 1.48it/s, loss=0.0376, lr=2.48e-05, step=1065] Training: 11%|█ | 1066/10000 [14:26<1:49:04, 1.37it/s, loss=0.0376, lr=2.48e-05, step=1065] Training: 11%|█ | 1066/10000 [14:26<1:49:04, 1.37it/s, loss=0.0921, lr=2.48e-05, step=1066] Training: 11%|█ | 1067/10000 [14:27<1:41:12, 1.47it/s, loss=0.0921, lr=2.48e-05, step=1066] Training: 11%|█ | 1067/10000 [14:27<1:41:12, 1.47it/s, loss=0.0421, lr=2.48e-05, step=1067] Training: 11%|█ | 1068/10000 [14:28<1:43:39, 1.44it/s, loss=0.0421, lr=2.48e-05, step=1067] Training: 11%|█ | 1068/10000 [14:28<1:43:39, 1.44it/s, loss=0.0337, lr=2.48e-05, step=1068] Training: 11%|█ | 1069/10000 [14:29<1:57:23, 1.27it/s, loss=0.0337, lr=2.48e-05, step=1068] Training: 11%|█ | 1069/10000 [14:29<1:57:23, 1.27it/s, loss=0.0259, lr=2.48e-05, step=1069]18:59:01.842 [I] step=1070 loss=0.0365 smoothed_loss=0.0435 lr=2.48e-05 grad_norm=0.8376 step_time=0.5854s data_time=0.1277s it/s=1.402 eta_to_10000=6367.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0181 grad_action_out_proj_arms=0.2402 grad_arm_token_fuse=0.0942 grad_shared_expert=0.8575 (18633:train_pytorch.py:850) + Training: 11%|█ | 1070/10000 [14:30<1:56:11, 1.28it/s, loss=0.0259, lr=2.48e-05, step=1069] Training: 11%|█ | 1070/10000 [14:30<1:56:11, 1.28it/s, loss=0.0365, lr=2.48e-05, step=1070] Training: 11%|█ | 1071/10000 [14:30<1:57:50, 1.26it/s, loss=0.0365, lr=2.48e-05, step=1070] Training: 11%|█ | 1071/10000 [14:30<1:57:50, 1.26it/s, loss=0.0331, lr=2.48e-05, step=1071] Training: 11%|█ | 1072/10000 [14:31<1:59:58, 1.24it/s, loss=0.0331, lr=2.48e-05, step=1071] Training: 11%|█ | 1072/10000 [14:31<1:59:58, 1.24it/s, loss=0.0312, lr=2.48e-05, step=1072] Training: 11%|█ | 1073/10000 [14:32<1:46:02, 1.40it/s, loss=0.0312, lr=2.48e-05, step=1072] Training: 11%|█ | 1073/10000 [14:32<1:46:02, 1.40it/s, loss=0.0354, lr=2.48e-05, step=1073] Training: 11%|█ | 1074/10000 [14:32<1:48:32, 1.37it/s, loss=0.0354, lr=2.48e-05, step=1073] Training: 11%|█ | 1074/10000 [14:32<1:48:32, 1.37it/s, loss=0.0317, lr=2.48e-05, step=1074] Training: 11%|█ | 1075/10000 [14:33<1:39:18, 1.50it/s, loss=0.0317, lr=2.48e-05, step=1074] Training: 11%|█ | 1075/10000 [14:33<1:39:18, 1.50it/s, loss=0.0245, lr=2.48e-05, step=1075] Training: 11%|█ | 1076/10000 [14:34<1:38:54, 1.50it/s, loss=0.0245, lr=2.48e-05, step=1075] Training: 11%|█ | 1076/10000 [14:34<1:38:54, 1.50it/s, loss=0.0283, lr=2.48e-05, step=1076] Training: 11%|█ | 1077/10000 [14:34<1:32:49, 1.60it/s, loss=0.0283, lr=2.48e-05, step=1076] Training: 11%|█ | 1077/10000 [14:34<1:32:49, 1.60it/s, loss=0.0188, lr=2.48e-05, step=1077] Training: 11%|█ | 1078/10000 [14:35<1:44:27, 1.42it/s, loss=0.0188, lr=2.48e-05, step=1077] Training: 11%|█ | 1078/10000 [14:35<1:44:27, 1.42it/s, loss=0.0224, lr=2.48e-05, step=1078] Training: 11%|█ | 1079/10000 [14:36<1:36:34, 1.54it/s, loss=0.0224, lr=2.48e-05, step=1078] Training: 11%|█ | 1079/10000 [14:36<1:36:34, 1.54it/s, loss=0.0222, lr=2.48e-05, step=1079]18:59:08.646 [I] step=1080 loss=0.0235 smoothed_loss=0.0320 lr=2.48e-05 grad_norm=0.7849 step_time=0.5758s data_time=0.1046s it/s=1.470 eta_to_10000=6068.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1972 grad_arm_token_fuse=0.0827 grad_shared_expert=0.7827 (18633:train_pytorch.py:850) + Training: 11%|█ | 1080/10000 [14:36<1:41:32, 1.46it/s, loss=0.0222, lr=2.48e-05, step=1079] Training: 11%|█ | 1080/10000 [14:36<1:41:32, 1.46it/s, loss=0.0235, lr=2.48e-05, step=1080] Training: 11%|█ | 1081/10000 [14:37<1:34:11, 1.58it/s, loss=0.0235, lr=2.48e-05, step=1080] Training: 11%|█ | 1081/10000 [14:37<1:34:11, 1.58it/s, loss=0.0779, lr=2.48e-05, step=1081] Training: 11%|█ | 1082/10000 [14:37<1:30:41, 1.64it/s, loss=0.0779, lr=2.48e-05, step=1081] Training: 11%|█ | 1082/10000 [14:37<1:30:41, 1.64it/s, loss=0.0183, lr=2.48e-05, step=1082] Training: 11%|█ | 1083/10000 [14:38<1:28:03, 1.69it/s, loss=0.0183, lr=2.48e-05, step=1082] Training: 11%|█ | 1083/10000 [14:38<1:28:03, 1.69it/s, loss=0.0906, lr=2.48e-05, step=1083] Training: 11%|█ | 1084/10000 [14:39<1:34:33, 1.57it/s, loss=0.0906, lr=2.48e-05, step=1083] Training: 11%|█ | 1084/10000 [14:39<1:34:33, 1.57it/s, loss=0.1150, lr=2.48e-05, step=1084] Training: 11%|█ | 1085/10000 [14:39<1:30:33, 1.64it/s, loss=0.1150, lr=2.48e-05, step=1084] Training: 11%|█ | 1085/10000 [14:39<1:30:33, 1.64it/s, loss=0.0376, lr=2.48e-05, step=1085] Training: 11%|█ | 1086/10000 [14:40<1:53:58, 1.30it/s, loss=0.0376, lr=2.48e-05, step=1085] Training: 11%|█ | 1086/10000 [14:40<1:53:58, 1.30it/s, loss=0.0419, lr=2.48e-05, step=1086] Training: 11%|█ | 1087/10000 [14:41<1:50:00, 1.35it/s, loss=0.0419, lr=2.48e-05, step=1086] Training: 11%|█ | 1087/10000 [14:41<1:50:00, 1.35it/s, loss=0.0399, lr=2.48e-05, step=1087] Training: 11%|█ | 1088/10000 [14:42<1:40:39, 1.48it/s, loss=0.0399, lr=2.48e-05, step=1087] Training: 11%|█ | 1088/10000 [14:42<1:40:39, 1.48it/s, loss=0.0247, lr=2.48e-05, step=1088] Training: 11%|█ | 1089/10000 [14:42<1:37:32, 1.52it/s, loss=0.0247, lr=2.48e-05, step=1088] Training: 11%|█ | 1089/10000 [14:42<1:37:32, 1.52it/s, loss=0.0160, lr=2.48e-05, step=1089]18:59:15.029 [I] step=1090 loss=0.0463 smoothed_loss=0.0414 lr=2.48e-05 grad_norm=0.6990 step_time=0.5175s data_time=0.1208s it/s=1.567 eta_to_10000=5686.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0234 grad_action_out_proj_arms=0.1777 grad_arm_token_fuse=0.1277 grad_shared_expert=0.6976 (18633:train_pytorch.py:850) + Training: 11%|█ | 1090/10000 [14:43<1:31:26, 1.62it/s, loss=0.0160, lr=2.48e-05, step=1089] Training: 11%|█ | 1090/10000 [14:43<1:31:26, 1.62it/s, loss=0.0463, lr=2.48e-05, step=1090] Training: 11%|█ | 1091/10000 [14:43<1:29:56, 1.65it/s, loss=0.0463, lr=2.48e-05, step=1090] Training: 11%|█ | 1091/10000 [14:43<1:29:56, 1.65it/s, loss=0.0232, lr=2.48e-05, step=1091] Training: 11%|█ | 1092/10000 [14:44<1:27:42, 1.69it/s, loss=0.0232, lr=2.48e-05, step=1091] Training: 11%|█ | 1092/10000 [14:44<1:27:42, 1.69it/s, loss=0.0200, lr=2.48e-05, step=1092] Training: 11%|█ | 1093/10000 [14:45<1:39:42, 1.49it/s, loss=0.0200, lr=2.48e-05, step=1092] Training: 11%|█ | 1093/10000 [14:45<1:39:42, 1.49it/s, loss=0.0224, lr=2.48e-05, step=1093] Training: 11%|█ | 1094/10000 [14:45<1:32:16, 1.61it/s, loss=0.0224, lr=2.48e-05, step=1093] Training: 11%|█ | 1094/10000 [14:45<1:32:16, 1.61it/s, loss=0.0427, lr=2.48e-05, step=1094] Training: 11%|█ | 1095/10000 [14:46<1:34:17, 1.57it/s, loss=0.0427, lr=2.48e-05, step=1094] Training: 11%|█ | 1095/10000 [14:46<1:34:17, 1.57it/s, loss=0.0510, lr=2.48e-05, step=1095] Training: 11%|█ | 1096/10000 [14:46<1:28:34, 1.68it/s, loss=0.0510, lr=2.48e-05, step=1095] Training: 11%|█ | 1096/10000 [14:46<1:28:34, 1.68it/s, loss=0.0348, lr=2.48e-05, step=1096] Training: 11%|█ | 1097/10000 [14:47<1:23:33, 1.78it/s, loss=0.0348, lr=2.48e-05, step=1096] Training: 11%|█ | 1097/10000 [14:47<1:23:33, 1.78it/s, loss=0.0414, lr=2.48e-05, step=1097] Training: 11%|█ | 1098/10000 [14:47<1:20:34, 1.84it/s, loss=0.0414, lr=2.48e-05, step=1097] Training: 11%|█ | 1098/10000 [14:47<1:20:34, 1.84it/s, loss=0.0492, lr=2.48e-05, step=1098] Training: 11%|█ | 1099/10000 [14:48<1:20:03, 1.85it/s, loss=0.0492, lr=2.48e-05, step=1098] Training: 11%|█ | 1099/10000 [14:48<1:20:03, 1.85it/s, loss=0.0458, lr=2.48e-05, step=1099]18:59:20.999 [I] step=1100 loss=0.0429 smoothed_loss=0.0402 lr=2.48e-05 grad_norm=0.7795 step_time=0.5211s data_time=0.0759s it/s=1.676 eta_to_10000=5311.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1808 grad_arm_token_fuse=0.0749 grad_shared_expert=0.8600 (18633:train_pytorch.py:850) + Training: 11%|█ | 1100/10000 [14:49<1:30:42, 1.64it/s, loss=0.0458, lr=2.48e-05, step=1099] Training: 11%|█ | 1100/10000 [14:49<1:30:42, 1.64it/s, loss=0.0429, lr=2.48e-05, step=1100] Training: 11%|█ | 1101/10000 [14:49<1:33:12, 1.59it/s, loss=0.0429, lr=2.48e-05, step=1100] Training: 11%|█ | 1101/10000 [14:49<1:33:12, 1.59it/s, loss=0.2405, lr=2.48e-05, step=1101] Training: 11%|█ | 1102/10000 [14:50<1:34:32, 1.57it/s, loss=0.2405, lr=2.48e-05, step=1101] Training: 11%|█ | 1102/10000 [14:50<1:34:32, 1.57it/s, loss=0.0285, lr=2.48e-05, step=1102] Training: 11%|█ | 1103/10000 [14:51<1:39:34, 1.49it/s, loss=0.0285, lr=2.48e-05, step=1102] Training: 11%|█ | 1103/10000 [14:51<1:39:34, 1.49it/s, loss=0.0859, lr=2.48e-05, step=1103] Training: 11%|█ | 1104/10000 [14:51<1:37:18, 1.52it/s, loss=0.0859, lr=2.48e-05, step=1103] Training: 11%|█ | 1104/10000 [14:51<1:37:18, 1.52it/s, loss=0.0203, lr=2.48e-05, step=1104] Training: 11%|█ | 1105/10000 [14:52<1:35:07, 1.56it/s, loss=0.0203, lr=2.48e-05, step=1104] Training: 11%|█ | 1105/10000 [14:52<1:35:07, 1.56it/s, loss=0.0282, lr=2.48e-05, step=1105] Training: 11%|█ | 1106/10000 [14:53<1:32:19, 1.61it/s, loss=0.0282, lr=2.48e-05, step=1105] Training: 11%|█ | 1106/10000 [14:53<1:32:19, 1.61it/s, loss=0.0388, lr=2.48e-05, step=1106] Training: 11%|█ | 1107/10000 [14:53<1:41:34, 1.46it/s, loss=0.0388, lr=2.48e-05, step=1106] Training: 11%|█ | 1107/10000 [14:53<1:41:34, 1.46it/s, loss=0.0456, lr=2.48e-05, step=1107] Training: 11%|█ | 1108/10000 [14:54<1:33:21, 1.59it/s, loss=0.0456, lr=2.48e-05, step=1107] Training: 11%|█ | 1108/10000 [14:54<1:33:21, 1.59it/s, loss=0.0481, lr=2.48e-05, step=1108] Training: 11%|█ | 1109/10000 [14:54<1:27:35, 1.69it/s, loss=0.0481, lr=2.48e-05, step=1108] Training: 11%|█ | 1109/10000 [14:54<1:27:35, 1.69it/s, loss=0.0526, lr=2.48e-05, step=1109]18:59:27.433 [I] step=1110 loss=0.0889 smoothed_loss=0.0548 lr=2.48e-05 grad_norm=0.7852 step_time=0.5443s data_time=0.0992s it/s=1.554 eta_to_10000=5719.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0200 grad_action_out_proj_arms=0.2290 grad_arm_token_fuse=0.1159 grad_shared_expert=0.5363 (18633:train_pytorch.py:850) + Training: 11%|█ | 1110/10000 [14:55<1:33:10, 1.59it/s, loss=0.0526, lr=2.48e-05, step=1109] Training: 11%|█ | 1110/10000 [14:55<1:33:10, 1.59it/s, loss=0.0889, lr=2.48e-05, step=1110] Training: 11%|█ | 1111/10000 [14:56<1:27:20, 1.70it/s, loss=0.0889, lr=2.48e-05, step=1110] Training: 11%|█ | 1111/10000 [14:56<1:27:20, 1.70it/s, loss=0.0425, lr=2.48e-05, step=1111] Training: 11%|█ | 1112/10000 [14:56<1:26:22, 1.72it/s, loss=0.0425, lr=2.48e-05, step=1111] Training: 11%|█ | 1112/10000 [14:56<1:26:22, 1.72it/s, loss=0.0167, lr=2.48e-05, step=1112] Training: 11%|█ | 1113/10000 [14:57<1:22:55, 1.79it/s, loss=0.0167, lr=2.48e-05, step=1112] Training: 11%|█ | 1113/10000 [14:57<1:22:55, 1.79it/s, loss=0.0197, lr=2.48e-05, step=1113] Training: 11%|█ | 1114/10000 [14:58<1:36:35, 1.53it/s, loss=0.0197, lr=2.48e-05, step=1113] Training: 11%|█ | 1114/10000 [14:58<1:36:35, 1.53it/s, loss=0.0602, lr=2.48e-05, step=1114] Training: 11%|█ | 1115/10000 [14:58<1:44:53, 1.41it/s, loss=0.0602, lr=2.48e-05, step=1114] Training: 11%|█ | 1115/10000 [14:58<1:44:53, 1.41it/s, loss=0.0290, lr=2.48e-05, step=1115] Training: 11%|█ | 1116/10000 [14:59<1:42:06, 1.45it/s, loss=0.0290, lr=2.48e-05, step=1115] Training: 11%|█ | 1116/10000 [14:59<1:42:06, 1.45it/s, loss=0.0126, lr=2.48e-05, step=1116] Training: 11%|█ | 1117/10000 [15:00<1:45:47, 1.40it/s, loss=0.0126, lr=2.48e-05, step=1116] Training: 11%|█ | 1117/10000 [15:00<1:45:47, 1.40it/s, loss=0.0560, lr=2.48e-05, step=1117] Training: 11%|█ | 1118/10000 [15:00<1:36:34, 1.53it/s, loss=0.0560, lr=2.48e-05, step=1117] Training: 11%|█ | 1118/10000 [15:00<1:36:34, 1.53it/s, loss=0.0219, lr=2.48e-05, step=1118] Training: 11%|█ | 1119/10000 [15:01<1:31:10, 1.62it/s, loss=0.0219, lr=2.48e-05, step=1118] Training: 11%|█ | 1119/10000 [15:01<1:31:10, 1.62it/s, loss=0.0555, lr=2.48e-05, step=1119]18:59:33.731 [I] step=1120 loss=0.0394 smoothed_loss=0.0430 lr=2.48e-05 grad_norm=0.7397 step_time=0.5412s data_time=0.0887s it/s=1.588 eta_to_10000=5592.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0163 grad_action_out_proj_arms=0.2267 grad_arm_token_fuse=0.0870 grad_shared_expert=0.7686 (18633:train_pytorch.py:850) + Training: 11%|█ | 1120/10000 [15:01<1:28:48, 1.67it/s, loss=0.0555, lr=2.48e-05, step=1119] Training: 11%|█ | 1120/10000 [15:01<1:28:48, 1.67it/s, loss=0.0394, lr=2.48e-05, step=1120] Training: 11%|█ | 1121/10000 [15:02<1:33:33, 1.58it/s, loss=0.0394, lr=2.48e-05, step=1120] Training: 11%|█ | 1121/10000 [15:02<1:33:33, 1.58it/s, loss=0.0224, lr=2.48e-05, step=1121] Training: 11%|█ | 1122/10000 [15:03<1:28:09, 1.68it/s, loss=0.0224, lr=2.48e-05, step=1121] Training: 11%|█ | 1122/10000 [15:03<1:28:09, 1.68it/s, loss=0.0381, lr=2.48e-05, step=1122] Training: 11%|█ | 1123/10000 [15:03<1:24:15, 1.76it/s, loss=0.0381, lr=2.48e-05, step=1122] Training: 11%|█ | 1123/10000 [15:03<1:24:15, 1.76it/s, loss=0.0353, lr=2.48e-05, step=1123] Training: 11%|█ | 1124/10000 [15:04<1:32:32, 1.60it/s, loss=0.0353, lr=2.48e-05, step=1123] Training: 11%|█ | 1124/10000 [15:04<1:32:32, 1.60it/s, loss=0.0291, lr=2.48e-05, step=1124] Training: 11%|█▏ | 1125/10000 [15:04<1:27:08, 1.70it/s, loss=0.0291, lr=2.48e-05, step=1124] Training: 11%|█▏ | 1125/10000 [15:04<1:27:08, 1.70it/s, loss=0.0359, lr=2.48e-05, step=1125] Training: 11%|█▏ | 1126/10000 [15:05<1:23:39, 1.77it/s, loss=0.0359, lr=2.48e-05, step=1125] Training: 11%|█▏ | 1126/10000 [15:05<1:23:39, 1.77it/s, loss=0.0263, lr=2.48e-05, step=1126] Training: 11%|█▏ | 1127/10000 [15:05<1:23:50, 1.76it/s, loss=0.0263, lr=2.48e-05, step=1126] Training: 11%|█▏ | 1127/10000 [15:05<1:23:50, 1.76it/s, loss=0.0242, lr=2.48e-05, step=1127] Training: 11%|█▏ | 1128/10000 [15:06<1:39:11, 1.49it/s, loss=0.0242, lr=2.48e-05, step=1127] Training: 11%|█▏ | 1128/10000 [15:06<1:39:11, 1.49it/s, loss=0.0454, lr=2.48e-05, step=1128] Training: 11%|█▏ | 1129/10000 [15:07<1:42:03, 1.45it/s, loss=0.0454, lr=2.48e-05, step=1128] Training: 11%|█▏ | 1129/10000 [15:07<1:42:03, 1.45it/s, loss=0.0483, lr=2.48e-05, step=1129]18:59:39.981 [I] step=1130 loss=0.0812 smoothed_loss=0.0425 lr=2.48e-05 grad_norm=0.7304 step_time=0.5314s data_time=0.0935s it/s=1.600 eta_to_10000=5542.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0150 grad_action_out_proj_arms=0.2368 grad_arm_token_fuse=0.0737 grad_shared_expert=0.6937 (18633:train_pytorch.py:850) + Training: 11%|█▏ | 1130/10000 [15:08<1:35:03, 1.56it/s, loss=0.0483, lr=2.48e-05, step=1129] Training: 11%|█▏ | 1130/10000 [15:08<1:35:03, 1.56it/s, loss=0.0812, lr=2.48e-05, step=1130] Training: 11%|█▏ | 1131/10000 [15:08<1:31:24, 1.62it/s, loss=0.0812, lr=2.48e-05, step=1130] Training: 11%|█▏ | 1131/10000 [15:08<1:31:24, 1.62it/s, loss=0.0718, lr=2.48e-05, step=1131] Training: 11%|█▏ | 1132/10000 [15:09<1:44:11, 1.42it/s, loss=0.0718, lr=2.48e-05, step=1131] Training: 11%|█▏ | 1132/10000 [15:09<1:44:11, 1.42it/s, loss=0.0144, lr=2.48e-05, step=1132] Training: 11%|█▏ | 1133/10000 [15:10<1:39:03, 1.49it/s, loss=0.0144, lr=2.48e-05, step=1132] Training: 11%|█▏ | 1133/10000 [15:10<1:39:03, 1.49it/s, loss=0.0121, lr=2.48e-05, step=1133] Training: 11%|█▏ | 1134/10000 [15:10<1:31:51, 1.61it/s, loss=0.0121, lr=2.48e-05, step=1133] Training: 11%|█▏ | 1134/10000 [15:10<1:31:51, 1.61it/s, loss=0.0169, lr=2.48e-05, step=1134] Training: 11%|█▏ | 1135/10000 [15:11<1:30:54, 1.63it/s, loss=0.0169, lr=2.48e-05, step=1134] Training: 11%|█▏ | 1135/10000 [15:11<1:30:54, 1.63it/s, loss=0.0289, lr=2.48e-05, step=1135] Training: 11%|█▏ | 1136/10000 [15:12<1:39:37, 1.48it/s, loss=0.0289, lr=2.48e-05, step=1135] Training: 11%|█▏ | 1136/10000 [15:12<1:39:37, 1.48it/s, loss=0.0266, lr=2.48e-05, step=1136] Training: 11%|█▏ | 1137/10000 [15:12<1:31:48, 1.61it/s, loss=0.0266, lr=2.48e-05, step=1136] Training: 11%|█▏ | 1137/10000 [15:12<1:31:48, 1.61it/s, loss=0.0175, lr=2.48e-05, step=1137] Training: 11%|█▏ | 1138/10000 [15:13<1:36:21, 1.53it/s, loss=0.0175, lr=2.48e-05, step=1137] Training: 11%|█▏ | 1138/10000 [15:13<1:36:21, 1.53it/s, loss=0.0529, lr=2.48e-05, step=1138] Training: 11%|█▏ | 1139/10000 [15:13<1:35:11, 1.55it/s, loss=0.0529, lr=2.48e-05, step=1138] Training: 11%|█▏ | 1139/10000 [15:13<1:35:11, 1.55it/s, loss=0.0117, lr=2.48e-05, step=1139]18:59:46.516 [I] step=1140 loss=0.0141 smoothed_loss=0.0312 lr=2.48e-05 grad_norm=0.7545 step_time=0.5588s data_time=0.0948s it/s=1.530 eta_to_10000=5789.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0127 grad_action_out_proj_arms=0.1838 grad_arm_token_fuse=0.0589 grad_shared_expert=0.5282 (18633:train_pytorch.py:850) + Training: 11%|█▏ | 1140/10000 [15:14<1:37:59, 1.51it/s, loss=0.0117, lr=2.48e-05, step=1139] Training: 11%|█▏ | 1140/10000 [15:14<1:37:59, 1.51it/s, loss=0.0141, lr=2.47e-05, step=1140] Training: 11%|█▏ | 1141/10000 [15:15<1:30:58, 1.62it/s, loss=0.0141, lr=2.47e-05, step=1140] Training: 11%|█▏ | 1141/10000 [15:15<1:30:58, 1.62it/s, loss=0.0430, lr=2.47e-05, step=1141] Training: 11%|█▏ | 1142/10000 [15:15<1:28:08, 1.68it/s, loss=0.0430, lr=2.47e-05, step=1141] Training: 11%|█▏ | 1142/10000 [15:15<1:28:08, 1.68it/s, loss=0.0499, lr=2.47e-05, step=1142] Training: 11%|█▏ | 1143/10000 [15:16<1:41:49, 1.45it/s, loss=0.0499, lr=2.47e-05, step=1142] Training: 11%|█▏ | 1143/10000 [15:16<1:41:49, 1.45it/s, loss=0.0347, lr=2.47e-05, step=1143] Training: 11%|█▏ | 1144/10000 [15:17<1:33:57, 1.57it/s, loss=0.0347, lr=2.47e-05, step=1143] Training: 11%|█▏ | 1144/10000 [15:17<1:33:57, 1.57it/s, loss=0.0355, lr=2.47e-05, step=1144] Training: 11%|█▏ | 1145/10000 [15:18<1:42:58, 1.43it/s, loss=0.0355, lr=2.47e-05, step=1144] Training: 11%|█▏ | 1145/10000 [15:18<1:42:58, 1.43it/s, loss=0.0262, lr=2.47e-05, step=1145] Training: 11%|█▏ | 1146/10000 [15:18<1:37:53, 1.51it/s, loss=0.0262, lr=2.47e-05, step=1145] Training: 11%|█▏ | 1146/10000 [15:18<1:37:53, 1.51it/s, loss=0.0098, lr=2.47e-05, step=1146] Training: 11%|█▏ | 1147/10000 [15:19<1:45:29, 1.40it/s, loss=0.0098, lr=2.47e-05, step=1146] Training: 11%|█▏ | 1147/10000 [15:19<1:45:29, 1.40it/s, loss=0.0479, lr=2.47e-05, step=1147] Training: 11%|█▏ | 1148/10000 [15:20<1:48:34, 1.36it/s, loss=0.0479, lr=2.47e-05, step=1147] Training: 11%|█▏ | 1148/10000 [15:20<1:48:34, 1.36it/s, loss=0.0427, lr=2.47e-05, step=1148] Training: 11%|█▏ | 1149/10000 [15:20<1:44:24, 1.41it/s, loss=0.0427, lr=2.47e-05, step=1148] Training: 11%|█▏ | 1149/10000 [15:20<1:44:24, 1.41it/s, loss=0.0353, lr=2.47e-05, step=1149]18:59:53.600 [I] step=1150 loss=0.0276 smoothed_loss=0.0333 lr=2.47e-05 grad_norm=0.7081 step_time=0.6260s data_time=0.0825s it/s=1.412 eta_to_10000=6268.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.2196 grad_arm_token_fuse=0.0810 grad_shared_expert=0.6355 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1150/10000 [15:21<1:53:52, 1.30it/s, loss=0.0353, lr=2.47e-05, step=1149] Training: 12%|█▏ | 1150/10000 [15:21<1:53:52, 1.30it/s, loss=0.0276, lr=2.47e-05, step=1150] Training: 12%|█▏ | 1151/10000 [15:22<1:51:54, 1.32it/s, loss=0.0276, lr=2.47e-05, step=1150] Training: 12%|█▏ | 1151/10000 [15:22<1:51:54, 1.32it/s, loss=0.0403, lr=2.47e-05, step=1151] Training: 12%|█▏ | 1152/10000 [15:22<1:39:33, 1.48it/s, loss=0.0403, lr=2.47e-05, step=1151] Training: 12%|█▏ | 1152/10000 [15:22<1:39:33, 1.48it/s, loss=0.0363, lr=2.47e-05, step=1152] Training: 12%|█▏ | 1153/10000 [15:23<1:38:13, 1.50it/s, loss=0.0363, lr=2.47e-05, step=1152] Training: 12%|█▏ | 1153/10000 [15:23<1:38:13, 1.50it/s, loss=0.0506, lr=2.47e-05, step=1153] Training: 12%|█▏ | 1154/10000 [15:24<1:33:14, 1.58it/s, loss=0.0506, lr=2.47e-05, step=1153] Training: 12%|█▏ | 1154/10000 [15:24<1:33:14, 1.58it/s, loss=0.0331, lr=2.47e-05, step=1154] Training: 12%|█▏ | 1155/10000 [15:24<1:30:54, 1.62it/s, loss=0.0331, lr=2.47e-05, step=1154] Training: 12%|█▏ | 1155/10000 [15:24<1:30:54, 1.62it/s, loss=0.0961, lr=2.47e-05, step=1155] Training: 12%|█▏ | 1156/10000 [15:25<1:32:57, 1.59it/s, loss=0.0961, lr=2.47e-05, step=1155] Training: 12%|█▏ | 1156/10000 [15:25<1:32:57, 1.59it/s, loss=0.0261, lr=2.47e-05, step=1156] Training: 12%|█▏ | 1157/10000 [15:26<1:39:16, 1.48it/s, loss=0.0261, lr=2.47e-05, step=1156] Training: 12%|█▏ | 1157/10000 [15:26<1:39:16, 1.48it/s, loss=0.0236, lr=2.47e-05, step=1157] Training: 12%|█▏ | 1158/10000 [15:26<1:34:29, 1.56it/s, loss=0.0236, lr=2.47e-05, step=1157] Training: 12%|█▏ | 1158/10000 [15:26<1:34:29, 1.56it/s, loss=0.0650, lr=2.47e-05, step=1158] Training: 12%|█▏ | 1159/10000 [15:27<1:34:29, 1.56it/s, loss=0.0650, lr=2.47e-05, step=1158] Training: 12%|█▏ | 1159/10000 [15:27<1:34:29, 1.56it/s, loss=0.0465, lr=2.47e-05, step=1159]19:00:00.038 [I] step=1160 loss=0.0206 smoothed_loss=0.0395 lr=2.47e-05 grad_norm=0.8436 step_time=0.5523s data_time=0.0913s it/s=1.553 eta_to_10000=5690.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.1694 grad_arm_token_fuse=0.0852 grad_shared_expert=0.6914 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1160/10000 [15:28<1:41:51, 1.45it/s, loss=0.0465, lr=2.47e-05, step=1159] Training: 12%|█▏ | 1160/10000 [15:28<1:41:51, 1.45it/s, loss=0.0206, lr=2.47e-05, step=1160] Training: 12%|█▏ | 1161/10000 [15:28<1:36:39, 1.52it/s, loss=0.0206, lr=2.47e-05, step=1160] Training: 12%|█▏ | 1161/10000 [15:28<1:36:39, 1.52it/s, loss=0.0642, lr=2.47e-05, step=1161] Training: 12%|█▏ | 1162/10000 [15:29<1:30:47, 1.62it/s, loss=0.0642, lr=2.47e-05, step=1161] Training: 12%|█▏ | 1162/10000 [15:29<1:30:47, 1.62it/s, loss=0.0335, lr=2.47e-05, step=1162] Training: 12%|█▏ | 1163/10000 [15:29<1:28:27, 1.66it/s, loss=0.0335, lr=2.47e-05, step=1162] Training: 12%|█▏ | 1163/10000 [15:29<1:28:27, 1.66it/s, loss=0.0336, lr=2.47e-05, step=1163] Training: 12%|█▏ | 1164/10000 [15:30<1:36:11, 1.53it/s, loss=0.0336, lr=2.47e-05, step=1163] Training: 12%|█▏ | 1164/10000 [15:30<1:36:11, 1.53it/s, loss=0.0306, lr=2.47e-05, step=1164] Training: 12%|█▏ | 1165/10000 [15:31<1:36:15, 1.53it/s, loss=0.0306, lr=2.47e-05, step=1164] Training: 12%|█▏ | 1165/10000 [15:31<1:36:15, 1.53it/s, loss=0.0201, lr=2.47e-05, step=1165] Training: 12%|█▏ | 1166/10000 [15:31<1:33:49, 1.57it/s, loss=0.0201, lr=2.47e-05, step=1165] Training: 12%|█▏ | 1166/10000 [15:31<1:33:49, 1.57it/s, loss=0.0539, lr=2.47e-05, step=1166] Training: 12%|█▏ | 1167/10000 [15:32<1:34:20, 1.56it/s, loss=0.0539, lr=2.47e-05, step=1166] Training: 12%|█▏ | 1167/10000 [15:32<1:34:20, 1.56it/s, loss=0.0299, lr=2.47e-05, step=1167] Training: 12%|█▏ | 1168/10000 [15:33<1:42:49, 1.43it/s, loss=0.0299, lr=2.47e-05, step=1167] Training: 12%|█▏ | 1168/10000 [15:33<1:42:49, 1.43it/s, loss=0.0211, lr=2.47e-05, step=1168] Training: 12%|█▏ | 1169/10000 [15:34<1:39:20, 1.48it/s, loss=0.0211, lr=2.47e-05, step=1168] Training: 12%|█▏ | 1169/10000 [15:34<1:39:20, 1.48it/s, loss=0.0789, lr=2.47e-05, step=1169]19:00:06.702 [I] step=1170 loss=0.0506 smoothed_loss=0.0417 lr=2.47e-05 grad_norm=0.8244 step_time=0.5708s data_time=0.0956s it/s=1.501 eta_to_10000=5882.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.1753 grad_arm_token_fuse=0.0582 grad_shared_expert=0.8377 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1170/10000 [15:34<1:47:55, 1.36it/s, loss=0.0789, lr=2.47e-05, step=1169] Training: 12%|█▏ | 1170/10000 [15:34<1:47:55, 1.36it/s, loss=0.0506, lr=2.47e-05, step=1170] Training: 12%|█▏ | 1171/10000 [15:35<1:53:31, 1.30it/s, loss=0.0506, lr=2.47e-05, step=1170] Training: 12%|█▏ | 1171/10000 [15:35<1:53:31, 1.30it/s, loss=0.0864, lr=2.47e-05, step=1171] Training: 12%|█▏ | 1172/10000 [15:36<1:51:53, 1.31it/s, loss=0.0864, lr=2.47e-05, step=1171] Training: 12%|█▏ | 1172/10000 [15:36<1:51:53, 1.31it/s, loss=0.0621, lr=2.47e-05, step=1172] Training: 12%|█▏ | 1173/10000 [15:37<1:46:02, 1.39it/s, loss=0.0621, lr=2.47e-05, step=1172] Training: 12%|█▏ | 1173/10000 [15:37<1:46:02, 1.39it/s, loss=0.0323, lr=2.47e-05, step=1173] Training: 12%|█▏ | 1174/10000 [15:37<1:43:26, 1.42it/s, loss=0.0323, lr=2.47e-05, step=1173] Training: 12%|█▏ | 1174/10000 [15:37<1:43:26, 1.42it/s, loss=0.0506, lr=2.47e-05, step=1174] Training: 12%|█▏ | 1175/10000 [15:38<1:49:41, 1.34it/s, loss=0.0506, lr=2.47e-05, step=1174] Training: 12%|█▏ | 1175/10000 [15:38<1:49:41, 1.34it/s, loss=0.0418, lr=2.47e-05, step=1175] Training: 12%|█▏ | 1176/10000 [15:39<1:42:38, 1.43it/s, loss=0.0418, lr=2.47e-05, step=1175] Training: 12%|█▏ | 1176/10000 [15:39<1:42:38, 1.43it/s, loss=0.0146, lr=2.47e-05, step=1176] Training: 12%|█▏ | 1177/10000 [15:39<1:38:40, 1.49it/s, loss=0.0146, lr=2.47e-05, step=1176] Training: 12%|█▏ | 1177/10000 [15:39<1:38:40, 1.49it/s, loss=0.0810, lr=2.47e-05, step=1177] Training: 12%|█▏ | 1178/10000 [15:40<1:46:07, 1.39it/s, loss=0.0810, lr=2.47e-05, step=1177] Training: 12%|█▏ | 1178/10000 [15:40<1:46:07, 1.39it/s, loss=0.0560, lr=2.47e-05, step=1178] Training: 12%|█▏ | 1179/10000 [15:41<1:51:39, 1.32it/s, loss=0.0560, lr=2.47e-05, step=1178] Training: 12%|█▏ | 1179/10000 [15:41<1:51:39, 1.32it/s, loss=0.0976, lr=2.47e-05, step=1179]19:00:14.039 [I] step=1180 loss=0.0720 smoothed_loss=0.0547 lr=2.47e-05 grad_norm=0.8426 step_time=0.5980s data_time=0.1358s it/s=1.363 eta_to_10000=6470.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0134 grad_action_out_proj_arms=0.1964 grad_arm_token_fuse=0.0708 grad_shared_expert=0.8491 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1180/10000 [15:42<1:50:05, 1.34it/s, loss=0.0976, lr=2.47e-05, step=1179] Training: 12%|█▏ | 1180/10000 [15:42<1:50:05, 1.34it/s, loss=0.0720, lr=2.47e-05, step=1180] Training: 12%|█▏ | 1181/10000 [15:43<1:53:05, 1.30it/s, loss=0.0720, lr=2.47e-05, step=1180] Training: 12%|█▏ | 1181/10000 [15:43<1:53:05, 1.30it/s, loss=0.0254, lr=2.47e-05, step=1181] Training: 12%|█▏ | 1182/10000 [15:43<1:49:11, 1.35it/s, loss=0.0254, lr=2.47e-05, step=1181] Training: 12%|█▏ | 1182/10000 [15:43<1:49:11, 1.35it/s, loss=0.0447, lr=2.47e-05, step=1182] Training: 12%|█▏ | 1183/10000 [15:44<1:48:34, 1.35it/s, loss=0.0447, lr=2.47e-05, step=1182] Training: 12%|█▏ | 1183/10000 [15:44<1:48:34, 1.35it/s, loss=0.0215, lr=2.47e-05, step=1183] Training: 12%|█▏ | 1184/10000 [15:44<1:39:26, 1.48it/s, loss=0.0215, lr=2.47e-05, step=1183] Training: 12%|█▏ | 1184/10000 [15:44<1:39:26, 1.48it/s, loss=0.0266, lr=2.47e-05, step=1184] Training: 12%|█▏ | 1185/10000 [15:45<1:33:17, 1.57it/s, loss=0.0266, lr=2.47e-05, step=1184] Training: 12%|█▏ | 1185/10000 [15:45<1:33:17, 1.57it/s, loss=0.0358, lr=2.47e-05, step=1185] Training: 12%|█▏ | 1186/10000 [15:46<1:52:48, 1.30it/s, loss=0.0358, lr=2.47e-05, step=1185] Training: 12%|█▏ | 1186/10000 [15:46<1:52:48, 1.30it/s, loss=0.1411, lr=2.47e-05, step=1186] Training: 12%|█▏ | 1187/10000 [15:47<1:43:07, 1.42it/s, loss=0.1411, lr=2.47e-05, step=1186] Training: 12%|█▏ | 1187/10000 [15:47<1:43:07, 1.42it/s, loss=0.0518, lr=2.47e-05, step=1187] Training: 12%|█▏ | 1188/10000 [15:48<1:54:01, 1.29it/s, loss=0.0518, lr=2.47e-05, step=1187] Training: 12%|█▏ | 1188/10000 [15:48<1:54:01, 1.29it/s, loss=0.1015, lr=2.47e-05, step=1188] Training: 12%|█▏ | 1189/10000 [15:48<1:46:31, 1.38it/s, loss=0.1015, lr=2.47e-05, step=1188] Training: 12%|█▏ | 1189/10000 [15:48<1:46:31, 1.38it/s, loss=0.0849, lr=2.47e-05, step=1189]19:00:21.288 [I] step=1190 loss=0.0325 smoothed_loss=0.0587 lr=2.47e-05 grad_norm=0.7397 step_time=0.5863s data_time=0.1386s it/s=1.380 eta_to_10000=6385.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0234 grad_action_out_proj_arms=0.2577 grad_arm_token_fuse=0.1170 grad_shared_expert=0.6850 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1190/10000 [15:49<1:48:24, 1.35it/s, loss=0.0849, lr=2.47e-05, step=1189] Training: 12%|█▏ | 1190/10000 [15:49<1:48:24, 1.35it/s, loss=0.0325, lr=2.47e-05, step=1190] Training: 12%|█▏ | 1191/10000 [15:49<1:38:19, 1.49it/s, loss=0.0325, lr=2.47e-05, step=1190] Training: 12%|█▏ | 1191/10000 [15:49<1:38:19, 1.49it/s, loss=0.0447, lr=2.47e-05, step=1191] Training: 12%|█▏ | 1192/10000 [15:50<1:31:38, 1.60it/s, loss=0.0447, lr=2.47e-05, step=1191] Training: 12%|█▏ | 1192/10000 [15:50<1:31:38, 1.60it/s, loss=0.0167, lr=2.47e-05, step=1192] Training: 12%|█▏ | 1193/10000 [15:51<1:52:58, 1.30it/s, loss=0.0167, lr=2.47e-05, step=1192] Training: 12%|█▏ | 1193/10000 [15:51<1:52:58, 1.30it/s, loss=0.0334, lr=2.47e-05, step=1193] Training: 12%|█▏ | 1194/10000 [15:52<1:52:32, 1.30it/s, loss=0.0334, lr=2.47e-05, step=1193] Training: 12%|█▏ | 1194/10000 [15:52<1:52:32, 1.30it/s, loss=0.0269, lr=2.47e-05, step=1194] Training: 12%|█▏ | 1195/10000 [15:53<2:02:40, 1.20it/s, loss=0.0269, lr=2.47e-05, step=1194] Training: 12%|█▏ | 1195/10000 [15:53<2:02:40, 1.20it/s, loss=0.0430, lr=2.47e-05, step=1195] Training: 12%|█▏ | 1196/10000 [15:54<2:01:21, 1.21it/s, loss=0.0430, lr=2.47e-05, step=1195] Training: 12%|█▏ | 1196/10000 [15:54<2:01:21, 1.21it/s, loss=0.0412, lr=2.47e-05, step=1196] Training: 12%|█▏ | 1197/10000 [15:54<1:49:43, 1.34it/s, loss=0.0412, lr=2.47e-05, step=1196] Training: 12%|█▏ | 1197/10000 [15:54<1:49:43, 1.34it/s, loss=0.0199, lr=2.47e-05, step=1197] Training: 12%|█▏ | 1198/10000 [15:55<1:51:03, 1.32it/s, loss=0.0199, lr=2.47e-05, step=1197] Training: 12%|█▏ | 1198/10000 [15:55<1:51:03, 1.32it/s, loss=0.0467, lr=2.47e-05, step=1198] Training: 12%|█▏ | 1199/10000 [15:56<1:47:04, 1.37it/s, loss=0.0467, lr=2.47e-05, step=1198] Training: 12%|█▏ | 1199/10000 [15:56<1:47:04, 1.37it/s, loss=0.0161, lr=2.47e-05, step=1199]19:00:28.919 [I] step=1200 loss=0.0531 smoothed_loss=0.0432 lr=2.47e-05 grad_norm=0.6303 step_time=0.6518s data_time=0.1113s it/s=1.311 eta_to_10000=6714.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0170 grad_action_out_proj_arms=0.2143 grad_arm_token_fuse=0.0872 grad_shared_expert=0.5745 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1200/10000 [15:57<1:55:30, 1.27it/s, loss=0.0161, lr=2.47e-05, step=1199] Training: 12%|█▏ | 1200/10000 [15:57<1:55:30, 1.27it/s, loss=0.0531, lr=2.47e-05, step=1200] Training: 12%|█▏ | 1201/10000 [15:57<1:45:47, 1.39it/s, loss=0.0531, lr=2.47e-05, step=1200] Training: 12%|█▏ | 1201/10000 [15:57<1:45:47, 1.39it/s, loss=0.0716, lr=2.47e-05, step=1201] Training: 12%|█▏ | 1202/10000 [15:58<1:42:53, 1.43it/s, loss=0.0716, lr=2.47e-05, step=1201] Training: 12%|█▏ | 1202/10000 [15:58<1:42:53, 1.43it/s, loss=0.0518, lr=2.47e-05, step=1202] Training: 12%|█▏ | 1203/10000 [15:58<1:40:02, 1.47it/s, loss=0.0518, lr=2.47e-05, step=1202] Training: 12%|█▏ | 1203/10000 [15:58<1:40:02, 1.47it/s, loss=0.1428, lr=2.47e-05, step=1203] Training: 12%|█▏ | 1204/10000 [15:59<1:36:21, 1.52it/s, loss=0.1428, lr=2.47e-05, step=1203] Training: 12%|█▏ | 1204/10000 [15:59<1:36:21, 1.52it/s, loss=0.0349, lr=2.47e-05, step=1204] Training: 12%|█▏ | 1205/10000 [16:00<1:41:30, 1.44it/s, loss=0.0349, lr=2.47e-05, step=1204] Training: 12%|█▏ | 1205/10000 [16:00<1:41:30, 1.44it/s, loss=0.0320, lr=2.47e-05, step=1205] Training: 12%|█▏ | 1206/10000 [16:01<1:47:54, 1.36it/s, loss=0.0320, lr=2.47e-05, step=1205] Training: 12%|█▏ | 1206/10000 [16:01<1:47:54, 1.36it/s, loss=0.0811, lr=2.47e-05, step=1206] Training: 12%|█▏ | 1207/10000 [16:01<1:51:46, 1.31it/s, loss=0.0811, lr=2.47e-05, step=1206] Training: 12%|█▏ | 1207/10000 [16:01<1:51:46, 1.31it/s, loss=0.0235, lr=2.47e-05, step=1207] Training: 12%|█▏ | 1208/10000 [16:02<1:39:53, 1.47it/s, loss=0.0235, lr=2.47e-05, step=1207] Training: 12%|█▏ | 1208/10000 [16:02<1:39:53, 1.47it/s, loss=0.0803, lr=2.47e-05, step=1208] Training: 12%|█▏ | 1209/10000 [16:02<1:31:49, 1.60it/s, loss=0.0803, lr=2.47e-05, step=1208] Training: 12%|█▏ | 1209/10000 [16:02<1:31:49, 1.60it/s, loss=0.0491, lr=2.47e-05, step=1209]19:00:35.340 [I] step=1210 loss=0.0379 smoothed_loss=0.0524 lr=2.47e-05 grad_norm=0.7050 step_time=0.5518s data_time=0.0904s it/s=1.557 eta_to_10000=5643.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.2233 grad_arm_token_fuse=0.0893 grad_shared_expert=0.5515 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1210/10000 [16:03<1:27:46, 1.67it/s, loss=0.0491, lr=2.47e-05, step=1209] Training: 12%|█▏ | 1210/10000 [16:03<1:27:46, 1.67it/s, loss=0.0379, lr=2.47e-05, step=1210] Training: 12%|█▏ | 1211/10000 [16:04<1:23:57, 1.74it/s, loss=0.0379, lr=2.47e-05, step=1210] Training: 12%|█▏ | 1211/10000 [16:04<1:23:57, 1.74it/s, loss=0.0277, lr=2.47e-05, step=1211] Training: 12%|█▏ | 1212/10000 [16:04<1:31:04, 1.61it/s, loss=0.0277, lr=2.47e-05, step=1211] Training: 12%|█▏ | 1212/10000 [16:04<1:31:04, 1.61it/s, loss=0.0423, lr=2.47e-05, step=1212] Training: 12%|█▏ | 1213/10000 [16:05<1:42:00, 1.44it/s, loss=0.0423, lr=2.47e-05, step=1212] Training: 12%|█▏ | 1213/10000 [16:05<1:42:00, 1.44it/s, loss=0.0280, lr=2.47e-05, step=1213] Training: 12%|█▏ | 1214/10000 [16:06<2:02:14, 1.20it/s, loss=0.0280, lr=2.47e-05, step=1213] Training: 12%|█▏ | 1214/10000 [16:06<2:02:14, 1.20it/s, loss=0.0419, lr=2.47e-05, step=1214] Training: 12%|█▏ | 1215/10000 [16:07<1:49:12, 1.34it/s, loss=0.0419, lr=2.47e-05, step=1214] Training: 12%|█▏ | 1215/10000 [16:07<1:49:12, 1.34it/s, loss=0.0188, lr=2.47e-05, step=1215] Training: 12%|█▏ | 1216/10000 [16:07<1:40:02, 1.46it/s, loss=0.0188, lr=2.47e-05, step=1215] Training: 12%|█▏ | 1216/10000 [16:07<1:40:02, 1.46it/s, loss=0.0421, lr=2.47e-05, step=1216] Training: 12%|█▏ | 1217/10000 [16:08<1:32:34, 1.58it/s, loss=0.0421, lr=2.47e-05, step=1216] Training: 12%|█▏ | 1217/10000 [16:08<1:32:34, 1.58it/s, loss=0.0579, lr=2.47e-05, step=1217] Training: 12%|█▏ | 1218/10000 [16:08<1:27:35, 1.67it/s, loss=0.0579, lr=2.47e-05, step=1217] Training: 12%|█▏ | 1218/10000 [16:08<1:27:35, 1.67it/s, loss=0.0386, lr=2.47e-05, step=1218] Training: 12%|█▏ | 1219/10000 [16:10<1:59:29, 1.22it/s, loss=0.0386, lr=2.47e-05, step=1218] Training: 12%|█▏ | 1219/10000 [16:10<1:59:29, 1.22it/s, loss=0.0252, lr=2.47e-05, step=1219]19:00:42.907 [I] step=1220 loss=0.0397 smoothed_loss=0.0422 lr=2.47e-05 grad_norm=0.8563 step_time=0.6312s data_time=0.1255s it/s=1.322 eta_to_10000=6643.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0201 grad_action_out_proj_arms=0.2020 grad_arm_token_fuse=0.1090 grad_shared_expert=1.0032 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1220/10000 [16:11<2:01:16, 1.21it/s, loss=0.0252, lr=2.47e-05, step=1219] Training: 12%|█▏ | 1220/10000 [16:11<2:01:16, 1.21it/s, loss=0.0397, lr=2.47e-05, step=1220] Training: 12%|█▏ | 1221/10000 [16:11<2:02:03, 1.20it/s, loss=0.0397, lr=2.47e-05, step=1220] Training: 12%|█▏ | 1221/10000 [16:11<2:02:03, 1.20it/s, loss=0.0448, lr=2.47e-05, step=1221] Training: 12%|█▏ | 1222/10000 [16:12<1:52:25, 1.30it/s, loss=0.0448, lr=2.47e-05, step=1221] Training: 12%|█▏ | 1222/10000 [16:12<1:52:25, 1.30it/s, loss=0.0116, lr=2.47e-05, step=1222] Training: 12%|█▏ | 1223/10000 [16:13<1:42:06, 1.43it/s, loss=0.0116, lr=2.47e-05, step=1222] Training: 12%|█▏ | 1223/10000 [16:13<1:42:06, 1.43it/s, loss=0.0350, lr=2.47e-05, step=1223] Training: 12%|█▏ | 1224/10000 [16:13<1:39:28, 1.47it/s, loss=0.0350, lr=2.47e-05, step=1223] Training: 12%|█▏ | 1224/10000 [16:13<1:39:28, 1.47it/s, loss=0.0253, lr=2.47e-05, step=1224] Training: 12%|█▏ | 1225/10000 [16:14<1:44:36, 1.40it/s, loss=0.0253, lr=2.47e-05, step=1224] Training: 12%|█▏ | 1225/10000 [16:14<1:44:36, 1.40it/s, loss=0.0841, lr=2.47e-05, step=1225] Training: 12%|█▏ | 1226/10000 [16:15<1:47:06, 1.37it/s, loss=0.0841, lr=2.47e-05, step=1225] Training: 12%|█▏ | 1226/10000 [16:15<1:47:06, 1.37it/s, loss=0.0481, lr=2.47e-05, step=1226] Training: 12%|█▏ | 1227/10000 [16:15<1:43:06, 1.42it/s, loss=0.0481, lr=2.47e-05, step=1226] Training: 12%|█▏ | 1227/10000 [16:15<1:43:06, 1.42it/s, loss=0.0220, lr=2.47e-05, step=1227] Training: 12%|█▏ | 1228/10000 [16:16<1:47:30, 1.36it/s, loss=0.0220, lr=2.47e-05, step=1227] Training: 12%|█▏ | 1228/10000 [16:16<1:47:30, 1.36it/s, loss=0.0221, lr=2.47e-05, step=1228] Training: 12%|█▏ | 1229/10000 [16:17<1:58:07, 1.24it/s, loss=0.0221, lr=2.47e-05, step=1228] Training: 12%|█▏ | 1229/10000 [16:17<1:58:07, 1.24it/s, loss=0.0610, lr=2.47e-05, step=1229]19:00:50.476 [I] step=1230 loss=0.0297 smoothed_loss=0.0399 lr=2.47e-05 grad_norm=0.6583 step_time=0.5995s data_time=0.1574s it/s=1.321 eta_to_10000=6636.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.1900 grad_arm_token_fuse=0.0732 grad_shared_expert=0.4631 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1230/10000 [16:18<2:03:53, 1.18it/s, loss=0.0610, lr=2.47e-05, step=1229] Training: 12%|█▏ | 1230/10000 [16:18<2:03:53, 1.18it/s, loss=0.0297, lr=2.47e-05, step=1230] Training: 12%|█▏ | 1231/10000 [16:19<2:07:53, 1.14it/s, loss=0.0297, lr=2.47e-05, step=1230] Training: 12%|█▏ | 1231/10000 [16:19<2:07:53, 1.14it/s, loss=0.0333, lr=2.47e-05, step=1231] Training: 12%|█▏ | 1232/10000 [16:20<1:57:14, 1.25it/s, loss=0.0333, lr=2.47e-05, step=1231] Training: 12%|█▏ | 1232/10000 [16:20<1:57:14, 1.25it/s, loss=0.0478, lr=2.47e-05, step=1232] Training: 12%|█▏ | 1233/10000 [16:21<1:56:46, 1.25it/s, loss=0.0478, lr=2.47e-05, step=1232] Training: 12%|█▏ | 1233/10000 [16:21<1:56:46, 1.25it/s, loss=0.0645, lr=2.47e-05, step=1233] Training: 12%|█▏ | 1234/10000 [16:21<1:53:46, 1.28it/s, loss=0.0645, lr=2.47e-05, step=1233] Training: 12%|█▏ | 1234/10000 [16:21<1:53:46, 1.28it/s, loss=0.0100, lr=2.47e-05, step=1234] Training: 12%|█▏ | 1235/10000 [16:22<1:53:25, 1.29it/s, loss=0.0100, lr=2.47e-05, step=1234] Training: 12%|█▏ | 1235/10000 [16:22<1:53:25, 1.29it/s, loss=0.0232, lr=2.47e-05, step=1235] Training: 12%|█▏ | 1236/10000 [16:23<1:55:01, 1.27it/s, loss=0.0232, lr=2.47e-05, step=1235] Training: 12%|█▏ | 1236/10000 [16:23<1:55:01, 1.27it/s, loss=0.0169, lr=2.47e-05, step=1236] Training: 12%|█▏ | 1237/10000 [16:23<1:43:23, 1.41it/s, loss=0.0169, lr=2.47e-05, step=1236] Training: 12%|█▏ | 1237/10000 [16:23<1:43:23, 1.41it/s, loss=0.0685, lr=2.47e-05, step=1237] Training: 12%|█▏ | 1238/10000 [16:24<1:37:26, 1.50it/s, loss=0.0685, lr=2.47e-05, step=1237] Training: 12%|█▏ | 1238/10000 [16:24<1:37:26, 1.50it/s, loss=0.0271, lr=2.47e-05, step=1238] Training: 12%|█▏ | 1239/10000 [16:25<1:33:54, 1.55it/s, loss=0.0271, lr=2.47e-05, step=1238] Training: 12%|█▏ | 1239/10000 [16:25<1:33:54, 1.55it/s, loss=0.0467, lr=2.47e-05, step=1239]19:00:57.511 [I] step=1240 loss=0.0375 smoothed_loss=0.0385 lr=2.47e-05 grad_norm=0.7127 step_time=0.5590s data_time=0.1446s it/s=1.422 eta_to_10000=6162.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0110 grad_action_out_proj_arms=0.1633 grad_arm_token_fuse=0.0621 grad_shared_expert=0.6229 (18633:train_pytorch.py:850) + Training: 12%|█▏ | 1240/10000 [16:25<1:35:16, 1.53it/s, loss=0.0467, lr=2.47e-05, step=1239] Training: 12%|█▏ | 1240/10000 [16:25<1:35:16, 1.53it/s, loss=0.0375, lr=2.47e-05, step=1240] Training: 12%|█▏ | 1241/10000 [16:26<1:52:53, 1.29it/s, loss=0.0375, lr=2.47e-05, step=1240] Training: 12%|█▏ | 1241/10000 [16:26<1:52:53, 1.29it/s, loss=0.0584, lr=2.47e-05, step=1241] Training: 12%|█▏ | 1242/10000 [16:27<2:00:19, 1.21it/s, loss=0.0584, lr=2.47e-05, step=1241] Training: 12%|█▏ | 1242/10000 [16:27<2:00:19, 1.21it/s, loss=0.0196, lr=2.47e-05, step=1242] Training: 12%|█▏ | 1243/10000 [16:28<2:02:36, 1.19it/s, loss=0.0196, lr=2.47e-05, step=1242] Training: 12%|█▏ | 1243/10000 [16:28<2:02:36, 1.19it/s, loss=0.0219, lr=2.47e-05, step=1243] Training: 12%|█▏ | 1244/10000 [16:29<1:57:05, 1.25it/s, loss=0.0219, lr=2.47e-05, step=1243] Training: 12%|█▏ | 1244/10000 [16:29<1:57:05, 1.25it/s, loss=0.0596, lr=2.47e-05, step=1244] Training: 12%|█▏ | 1245/10000 [16:30<1:55:09, 1.27it/s, loss=0.0596, lr=2.47e-05, step=1244] Training: 12%|█▏ | 1245/10000 [16:30<1:55:09, 1.27it/s, loss=0.0512, lr=2.47e-05, step=1245] Training: 12%|█▏ | 1246/10000 [16:30<2:00:19, 1.21it/s, loss=0.0512, lr=2.47e-05, step=1245] Training: 12%|█▏ | 1246/10000 [16:30<2:00:19, 1.21it/s, loss=0.0388, lr=2.47e-05, step=1246] Training: 12%|█▏ | 1247/10000 [16:31<1:48:58, 1.34it/s, loss=0.0388, lr=2.47e-05, step=1246] Training: 12%|█▏ | 1247/10000 [16:31<1:48:58, 1.34it/s, loss=0.0800, lr=2.47e-05, step=1247] Training: 12%|█▏ | 1248/10000 [16:32<1:48:06, 1.35it/s, loss=0.0800, lr=2.47e-05, step=1247] Training: 12%|█▏ | 1248/10000 [16:32<1:48:06, 1.35it/s, loss=0.0269, lr=2.47e-05, step=1248] Training: 12%|█▏ | 1249/10000 [16:32<1:43:20, 1.41it/s, loss=0.0269, lr=2.47e-05, step=1248] Training: 12%|█▏ | 1249/10000 [16:32<1:43:20, 1.41it/s, loss=0.0221, lr=2.47e-05, step=1249]19:01:05.611 [I] step=1250 loss=0.0163 smoothed_loss=0.0379 lr=2.47e-05 grad_norm=0.6020 step_time=0.6444s data_time=0.1655s it/s=1.235 eta_to_10000=7086.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0099 grad_action_out_proj_arms=0.1710 grad_arm_token_fuse=0.0537 grad_shared_expert=0.5359 (18633:train_pytorch.py:850) + Training: 12%|█▎ | 1250/10000 [16:33<1:52:30, 1.30it/s, loss=0.0221, lr=2.47e-05, step=1249] Training: 12%|█▎ | 1250/10000 [16:33<1:52:30, 1.30it/s, loss=0.0163, lr=2.47e-05, step=1250] Training: 13%|█▎ | 1251/10000 [16:34<1:45:12, 1.39it/s, loss=0.0163, lr=2.47e-05, step=1250] Training: 13%|█▎ | 1251/10000 [16:34<1:45:12, 1.39it/s, loss=0.0309, lr=2.47e-05, step=1251] Training: 13%|█▎ | 1252/10000 [16:35<1:48:08, 1.35it/s, loss=0.0309, lr=2.47e-05, step=1251] Training: 13%|█▎ | 1252/10000 [16:35<1:48:08, 1.35it/s, loss=0.0559, lr=2.47e-05, step=1252] Training: 13%|█▎ | 1253/10000 [16:35<1:45:14, 1.39it/s, loss=0.0559, lr=2.47e-05, step=1252] Training: 13%|█▎ | 1253/10000 [16:35<1:45:14, 1.39it/s, loss=0.0348, lr=2.47e-05, step=1253] Training: 13%|█▎ | 1254/10000 [16:36<1:41:39, 1.43it/s, loss=0.0348, lr=2.47e-05, step=1253] Training: 13%|█▎ | 1254/10000 [16:36<1:41:39, 1.43it/s, loss=0.0689, lr=2.47e-05, step=1254] Training: 13%|█▎ | 1255/10000 [16:37<1:43:56, 1.40it/s, loss=0.0689, lr=2.47e-05, step=1254] Training: 13%|█▎ | 1255/10000 [16:37<1:43:56, 1.40it/s, loss=0.0189, lr=2.47e-05, step=1255] Training: 13%|█▎ | 1256/10000 [16:37<1:39:13, 1.47it/s, loss=0.0189, lr=2.47e-05, step=1255] Training: 13%|█▎ | 1256/10000 [16:37<1:39:13, 1.47it/s, loss=0.0821, lr=2.47e-05, step=1256] Training: 13%|█▎ | 1257/10000 [16:38<1:55:18, 1.26it/s, loss=0.0821, lr=2.47e-05, step=1256] Training: 13%|█▎ | 1257/10000 [16:38<1:55:18, 1.26it/s, loss=0.0312, lr=2.47e-05, step=1257] Training: 13%|█▎ | 1258/10000 [16:39<1:50:41, 1.32it/s, loss=0.0312, lr=2.47e-05, step=1257] Training: 13%|█▎ | 1258/10000 [16:39<1:50:41, 1.32it/s, loss=0.0220, lr=2.46e-05, step=1258] Training: 13%|█▎ | 1259/10000 [16:40<1:45:19, 1.38it/s, loss=0.0220, lr=2.46e-05, step=1258] Training: 13%|█▎ | 1259/10000 [16:40<1:45:19, 1.38it/s, loss=0.0438, lr=2.46e-05, step=1259]19:01:12.778 [I] step=1260 loss=0.0591 smoothed_loss=0.0426 lr=2.47e-05 grad_norm=0.8187 step_time=0.5784s data_time=0.1384s it/s=1.395 eta_to_10000=6263.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0127 grad_action_out_proj_arms=0.2164 grad_arm_token_fuse=0.0681 grad_shared_expert=0.9012 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1260/10000 [16:40<1:45:38, 1.38it/s, loss=0.0438, lr=2.46e-05, step=1259] Training: 13%|█▎ | 1260/10000 [16:40<1:45:38, 1.38it/s, loss=0.0591, lr=2.46e-05, step=1260] Training: 13%|█▎ | 1261/10000 [16:41<1:51:52, 1.30it/s, loss=0.0591, lr=2.46e-05, step=1260] Training: 13%|█▎ | 1261/10000 [16:41<1:51:52, 1.30it/s, loss=0.0120, lr=2.46e-05, step=1261] Training: 13%|█▎ | 1262/10000 [16:42<1:50:31, 1.32it/s, loss=0.0120, lr=2.46e-05, step=1261] Training: 13%|█▎ | 1262/10000 [16:42<1:50:31, 1.32it/s, loss=0.0374, lr=2.46e-05, step=1262] Training: 13%|█▎ | 1263/10000 [16:43<1:50:25, 1.32it/s, loss=0.0374, lr=2.46e-05, step=1262] Training: 13%|█▎ | 1263/10000 [16:43<1:50:25, 1.32it/s, loss=0.0456, lr=2.46e-05, step=1263] Training: 13%|█▎ | 1264/10000 [16:44<2:11:10, 1.11it/s, loss=0.0456, lr=2.46e-05, step=1263] Training: 13%|█▎ | 1264/10000 [16:44<2:11:10, 1.11it/s, loss=0.0914, lr=2.46e-05, step=1264] Training: 13%|█▎ | 1265/10000 [16:45<2:07:58, 1.14it/s, loss=0.0914, lr=2.46e-05, step=1264] Training: 13%|█▎ | 1265/10000 [16:45<2:07:58, 1.14it/s, loss=0.0431, lr=2.46e-05, step=1265] Training: 13%|█▎ | 1266/10000 [16:46<1:59:55, 1.21it/s, loss=0.0431, lr=2.46e-05, step=1265] Training: 13%|█▎ | 1266/10000 [16:46<1:59:55, 1.21it/s, loss=0.0787, lr=2.46e-05, step=1266] Training: 13%|█▎ | 1267/10000 [16:47<2:04:44, 1.17it/s, loss=0.0787, lr=2.46e-05, step=1266] Training: 13%|█▎ | 1267/10000 [16:47<2:04:44, 1.17it/s, loss=0.0313, lr=2.46e-05, step=1267] Training: 13%|█▎ | 1268/10000 [16:47<1:57:14, 1.24it/s, loss=0.0313, lr=2.46e-05, step=1267] Training: 13%|█▎ | 1268/10000 [16:47<1:57:14, 1.24it/s, loss=0.0196, lr=2.46e-05, step=1268] Training: 13%|█▎ | 1269/10000 [16:48<1:48:50, 1.34it/s, loss=0.0196, lr=2.46e-05, step=1268] Training: 13%|█▎ | 1269/10000 [16:48<1:48:50, 1.34it/s, loss=0.0531, lr=2.46e-05, step=1269]19:01:20.831 [I] step=1270 loss=0.0220 smoothed_loss=0.0425 lr=2.46e-05 grad_norm=0.7673 step_time=0.6177s data_time=0.1876s it/s=1.242 eta_to_10000=7029.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.2193 grad_arm_token_fuse=0.0860 grad_shared_expert=0.6339 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1270/10000 [16:49<1:46:43, 1.36it/s, loss=0.0531, lr=2.46e-05, step=1269] Training: 13%|█▎ | 1270/10000 [16:49<1:46:43, 1.36it/s, loss=0.0220, lr=2.46e-05, step=1270] Training: 13%|█▎ | 1271/10000 [16:49<1:50:29, 1.32it/s, loss=0.0220, lr=2.46e-05, step=1270] Training: 13%|█▎ | 1271/10000 [16:49<1:50:29, 1.32it/s, loss=0.0211, lr=2.46e-05, step=1271] Training: 13%|█▎ | 1272/10000 [16:50<1:56:26, 1.25it/s, loss=0.0211, lr=2.46e-05, step=1271] Training: 13%|█▎ | 1272/10000 [16:50<1:56:26, 1.25it/s, loss=0.0304, lr=2.46e-05, step=1272] Training: 13%|█▎ | 1273/10000 [16:51<1:55:42, 1.26it/s, loss=0.0304, lr=2.46e-05, step=1272] Training: 13%|█▎ | 1273/10000 [16:51<1:55:42, 1.26it/s, loss=0.0272, lr=2.46e-05, step=1273] Training: 13%|█▎ | 1274/10000 [16:52<1:46:20, 1.37it/s, loss=0.0272, lr=2.46e-05, step=1273] Training: 13%|█▎ | 1274/10000 [16:52<1:46:20, 1.37it/s, loss=0.0437, lr=2.46e-05, step=1274] Training: 13%|█▎ | 1275/10000 [16:52<1:42:50, 1.41it/s, loss=0.0437, lr=2.46e-05, step=1274] Training: 13%|█▎ | 1275/10000 [16:52<1:42:50, 1.41it/s, loss=0.0527, lr=2.46e-05, step=1275] Training: 13%|█▎ | 1276/10000 [16:53<1:38:12, 1.48it/s, loss=0.0527, lr=2.46e-05, step=1275] Training: 13%|█▎ | 1276/10000 [16:53<1:38:12, 1.48it/s, loss=0.0382, lr=2.46e-05, step=1276] Training: 13%|█▎ | 1277/10000 [16:53<1:36:58, 1.50it/s, loss=0.0382, lr=2.46e-05, step=1276] Training: 13%|█▎ | 1277/10000 [16:53<1:36:58, 1.50it/s, loss=0.0179, lr=2.46e-05, step=1277] Training: 13%|█▎ | 1278/10000 [16:54<1:48:36, 1.34it/s, loss=0.0179, lr=2.46e-05, step=1277] Training: 13%|█▎ | 1278/10000 [16:54<1:48:36, 1.34it/s, loss=0.0118, lr=2.46e-05, step=1278] Training: 13%|█▎ | 1279/10000 [16:55<1:51:41, 1.30it/s, loss=0.0118, lr=2.46e-05, step=1278] Training: 13%|█▎ | 1279/10000 [16:55<1:51:41, 1.30it/s, loss=0.0885, lr=2.46e-05, step=1279]19:01:28.484 [I] step=1280 loss=0.0189 smoothed_loss=0.0383 lr=2.46e-05 grad_norm=0.7068 step_time=0.6262s data_time=0.1391s it/s=1.307 eta_to_10000=6672.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0173 grad_action_out_proj_arms=0.2374 grad_arm_token_fuse=0.0971 grad_shared_expert=0.7644 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1280/10000 [16:56<1:58:15, 1.23it/s, loss=0.0885, lr=2.46e-05, step=1279] Training: 13%|█▎ | 1280/10000 [16:56<1:58:15, 1.23it/s, loss=0.0189, lr=2.46e-05, step=1280] Training: 13%|█▎ | 1281/10000 [16:57<1:53:05, 1.28it/s, loss=0.0189, lr=2.46e-05, step=1280] Training: 13%|█▎ | 1281/10000 [16:57<1:53:05, 1.28it/s, loss=0.0215, lr=2.46e-05, step=1281] Training: 13%|█▎ | 1282/10000 [16:57<1:46:15, 1.37it/s, loss=0.0215, lr=2.46e-05, step=1281] Training: 13%|█▎ | 1282/10000 [16:57<1:46:15, 1.37it/s, loss=0.0266, lr=2.46e-05, step=1282] Training: 13%|█▎ | 1283/10000 [16:58<1:57:59, 1.23it/s, loss=0.0266, lr=2.46e-05, step=1282] Training: 13%|█▎ | 1283/10000 [16:58<1:57:59, 1.23it/s, loss=0.1426, lr=2.46e-05, step=1283] Training: 13%|█▎ | 1284/10000 [16:59<1:50:40, 1.31it/s, loss=0.1426, lr=2.46e-05, step=1283] Training: 13%|█▎ | 1284/10000 [16:59<1:50:40, 1.31it/s, loss=0.0119, lr=2.46e-05, step=1284] Training: 13%|█▎ | 1285/10000 [17:00<1:55:46, 1.25it/s, loss=0.0119, lr=2.46e-05, step=1284] Training: 13%|█▎ | 1285/10000 [17:00<1:55:46, 1.25it/s, loss=0.0841, lr=2.46e-05, step=1285] Training: 13%|█▎ | 1286/10000 [17:01<1:56:17, 1.25it/s, loss=0.0841, lr=2.46e-05, step=1285] Training: 13%|█▎ | 1286/10000 [17:01<1:56:17, 1.25it/s, loss=0.0133, lr=2.46e-05, step=1286] Training: 13%|█▎ | 1287/10000 [17:02<1:55:26, 1.26it/s, loss=0.0133, lr=2.46e-05, step=1286] Training: 13%|█▎ | 1287/10000 [17:02<1:55:26, 1.26it/s, loss=0.0854, lr=2.46e-05, step=1287] Training: 13%|█▎ | 1288/10000 [17:02<1:48:35, 1.34it/s, loss=0.0854, lr=2.46e-05, step=1287] Training: 13%|█▎ | 1288/10000 [17:02<1:48:35, 1.34it/s, loss=0.0500, lr=2.46e-05, step=1288] Training: 13%|█▎ | 1289/10000 [17:03<1:50:54, 1.31it/s, loss=0.0500, lr=2.46e-05, step=1288] Training: 13%|█▎ | 1289/10000 [17:03<1:50:54, 1.31it/s, loss=0.0356, lr=2.46e-05, step=1289]19:01:36.126 [I] step=1290 loss=0.0352 smoothed_loss=0.0456 lr=2.46e-05 grad_norm=0.7469 step_time=0.6109s data_time=0.1533s it/s=1.309 eta_to_10000=6655.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.1733 grad_arm_token_fuse=0.0547 grad_shared_expert=0.8393 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1290/10000 [17:04<1:51:11, 1.31it/s, loss=0.0356, lr=2.46e-05, step=1289] Training: 13%|█▎ | 1290/10000 [17:04<1:51:11, 1.31it/s, loss=0.0352, lr=2.46e-05, step=1290] Training: 13%|█▎ | 1291/10000 [17:04<1:43:05, 1.41it/s, loss=0.0352, lr=2.46e-05, step=1290] Training: 13%|█▎ | 1291/10000 [17:04<1:43:05, 1.41it/s, loss=0.0461, lr=2.46e-05, step=1291] Training: 13%|█▎ | 1292/10000 [17:05<1:57:39, 1.23it/s, loss=0.0461, lr=2.46e-05, step=1291] Training: 13%|█▎ | 1292/10000 [17:05<1:57:39, 1.23it/s, loss=0.0348, lr=2.46e-05, step=1292] Training: 13%|█▎ | 1293/10000 [17:07<2:11:54, 1.10it/s, loss=0.0348, lr=2.46e-05, step=1292] Training: 13%|█▎ | 1293/10000 [17:07<2:11:54, 1.10it/s, loss=0.0726, lr=2.46e-05, step=1293] Training: 13%|█▎ | 1294/10000 [17:07<2:06:11, 1.15it/s, loss=0.0726, lr=2.46e-05, step=1293] Training: 13%|█▎ | 1294/10000 [17:07<2:06:11, 1.15it/s, loss=0.1092, lr=2.46e-05, step=1294] Training: 13%|█▎ | 1295/10000 [17:08<2:13:16, 1.09it/s, loss=0.1092, lr=2.46e-05, step=1294] Training: 13%|█▎ | 1295/10000 [17:08<2:13:16, 1.09it/s, loss=0.0285, lr=2.46e-05, step=1295] Training: 13%|█▎ | 1296/10000 [17:09<2:02:17, 1.19it/s, loss=0.0285, lr=2.46e-05, step=1295] Training: 13%|█▎ | 1296/10000 [17:09<2:02:17, 1.19it/s, loss=0.0318, lr=2.46e-05, step=1296] Training: 13%|█▎ | 1297/10000 [17:10<2:19:00, 1.04it/s, loss=0.0318, lr=2.46e-05, step=1296] Training: 13%|█▎ | 1297/10000 [17:10<2:19:00, 1.04it/s, loss=0.0355, lr=2.46e-05, step=1297] Training: 13%|█▎ | 1298/10000 [17:11<2:07:46, 1.14it/s, loss=0.0355, lr=2.46e-05, step=1297] Training: 13%|█▎ | 1298/10000 [17:11<2:07:46, 1.14it/s, loss=0.0499, lr=2.46e-05, step=1298] Training: 13%|█▎ | 1299/10000 [17:12<2:03:55, 1.17it/s, loss=0.0499, lr=2.46e-05, step=1298] Training: 13%|█▎ | 1299/10000 [17:12<2:03:55, 1.17it/s, loss=0.0338, lr=2.46e-05, step=1299]19:01:45.134 [I] step=1300 loss=0.0292 smoothed_loss=0.0448 lr=2.46e-05 grad_norm=0.7324 step_time=0.7408s data_time=0.1601s it/s=1.110 eta_to_10000=7836.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0165 grad_action_out_proj_arms=0.1689 grad_arm_token_fuse=0.0835 grad_shared_expert=0.6470 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1300/10000 [17:13<2:12:15, 1.10it/s, loss=0.0338, lr=2.46e-05, step=1299] Training: 13%|█▎ | 1300/10000 [17:13<2:12:15, 1.10it/s, loss=0.0292, lr=2.46e-05, step=1300] Training: 13%|█▎ | 1301/10000 [17:13<1:58:51, 1.22it/s, loss=0.0292, lr=2.46e-05, step=1300] Training: 13%|█▎ | 1301/10000 [17:13<1:58:51, 1.22it/s, loss=0.0300, lr=2.46e-05, step=1301] Training: 13%|█▎ | 1302/10000 [17:14<1:50:18, 1.31it/s, loss=0.0300, lr=2.46e-05, step=1301] Training: 13%|█▎ | 1302/10000 [17:14<1:50:18, 1.31it/s, loss=0.0452, lr=2.46e-05, step=1302] Training: 13%|█▎ | 1303/10000 [17:15<1:42:54, 1.41it/s, loss=0.0452, lr=2.46e-05, step=1302] Training: 13%|█▎ | 1303/10000 [17:15<1:42:54, 1.41it/s, loss=0.0224, lr=2.46e-05, step=1303] Training: 13%|█▎ | 1304/10000 [17:15<1:43:56, 1.39it/s, loss=0.0224, lr=2.46e-05, step=1303] Training: 13%|█▎ | 1304/10000 [17:15<1:43:56, 1.39it/s, loss=0.0369, lr=2.46e-05, step=1304] Training: 13%|█▎ | 1305/10000 [17:16<1:42:29, 1.41it/s, loss=0.0369, lr=2.46e-05, step=1304] Training: 13%|█▎ | 1305/10000 [17:16<1:42:29, 1.41it/s, loss=0.0581, lr=2.46e-05, step=1305] Training: 13%|█▎ | 1306/10000 [17:17<1:45:27, 1.37it/s, loss=0.0581, lr=2.46e-05, step=1305] Training: 13%|█▎ | 1306/10000 [17:17<1:45:27, 1.37it/s, loss=0.0251, lr=2.46e-05, step=1306] Training: 13%|█▎ | 1307/10000 [17:18<1:56:16, 1.25it/s, loss=0.0251, lr=2.46e-05, step=1306] Training: 13%|█▎ | 1307/10000 [17:18<1:56:16, 1.25it/s, loss=0.0224, lr=2.46e-05, step=1307] Training: 13%|█▎ | 1308/10000 [17:19<2:00:01, 1.21it/s, loss=0.0224, lr=2.46e-05, step=1307] Training: 13%|█▎ | 1308/10000 [17:19<2:00:01, 1.21it/s, loss=0.0165, lr=2.46e-05, step=1308] Training: 13%|█▎ | 1309/10000 [17:19<1:50:44, 1.31it/s, loss=0.0165, lr=2.46e-05, step=1308] Training: 13%|█▎ | 1309/10000 [17:19<1:50:44, 1.31it/s, loss=0.0352, lr=2.46e-05, step=1309]19:01:52.279 [I] step=1310 loss=0.0180 smoothed_loss=0.0348 lr=2.46e-05 grad_norm=0.6781 step_time=0.5695s data_time=0.1450s it/s=1.400 eta_to_10000=6207.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0185 grad_action_out_proj_arms=0.2019 grad_arm_token_fuse=0.0854 grad_shared_expert=0.5628 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1310/10000 [17:20<1:45:45, 1.37it/s, loss=0.0352, lr=2.46e-05, step=1309] Training: 13%|█▎ | 1310/10000 [17:20<1:45:45, 1.37it/s, loss=0.0180, lr=2.46e-05, step=1310] Training: 13%|█▎ | 1311/10000 [17:21<1:45:08, 1.38it/s, loss=0.0180, lr=2.46e-05, step=1310] Training: 13%|█▎ | 1311/10000 [17:21<1:45:08, 1.38it/s, loss=0.0906, lr=2.46e-05, step=1311] Training: 13%|█▎ | 1312/10000 [17:22<2:05:31, 1.15it/s, loss=0.0906, lr=2.46e-05, step=1311] Training: 13%|█▎ | 1312/10000 [17:22<2:05:31, 1.15it/s, loss=0.0647, lr=2.46e-05, step=1312] Training: 13%|█▎ | 1313/10000 [17:22<1:51:53, 1.29it/s, loss=0.0647, lr=2.46e-05, step=1312] Training: 13%|█▎ | 1313/10000 [17:22<1:51:53, 1.29it/s, loss=0.0527, lr=2.46e-05, step=1313] Training: 13%|█▎ | 1314/10000 [17:23<1:59:34, 1.21it/s, loss=0.0527, lr=2.46e-05, step=1313] Training: 13%|█▎ | 1314/10000 [17:23<1:59:34, 1.21it/s, loss=0.0451, lr=2.46e-05, step=1314] Training: 13%|█▎ | 1315/10000 [17:24<1:57:55, 1.23it/s, loss=0.0451, lr=2.46e-05, step=1314] Training: 13%|█▎ | 1315/10000 [17:24<1:57:55, 1.23it/s, loss=0.0541, lr=2.46e-05, step=1315] Training: 13%|█▎ | 1316/10000 [17:25<1:50:34, 1.31it/s, loss=0.0541, lr=2.46e-05, step=1315] Training: 13%|█▎ | 1316/10000 [17:25<1:50:34, 1.31it/s, loss=0.0203, lr=2.46e-05, step=1316] Training: 13%|█▎ | 1317/10000 [17:25<1:42:43, 1.41it/s, loss=0.0203, lr=2.46e-05, step=1316] Training: 13%|█▎ | 1317/10000 [17:25<1:42:43, 1.41it/s, loss=0.0412, lr=2.46e-05, step=1317] Training: 13%|█▎ | 1318/10000 [17:26<1:41:22, 1.43it/s, loss=0.0412, lr=2.46e-05, step=1317] Training: 13%|█▎ | 1318/10000 [17:26<1:41:22, 1.43it/s, loss=0.0141, lr=2.46e-05, step=1318] Training: 13%|█▎ | 1319/10000 [17:27<1:37:47, 1.48it/s, loss=0.0141, lr=2.46e-05, step=1318] Training: 13%|█▎ | 1319/10000 [17:27<1:37:47, 1.48it/s, loss=0.0368, lr=2.46e-05, step=1319]19:02:00.173 [I] step=1320 loss=0.1465 smoothed_loss=0.0500 lr=2.46e-05 grad_norm=0.7532 step_time=0.6279s data_time=0.1615s it/s=1.267 eta_to_10000=6851.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0270 grad_action_out_proj_arms=0.2238 grad_arm_token_fuse=0.1594 grad_shared_expert=0.6368 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1320/10000 [17:28<1:58:58, 1.22it/s, loss=0.0368, lr=2.46e-05, step=1319] Training: 13%|█▎ | 1320/10000 [17:28<1:58:58, 1.22it/s, loss=0.1465, lr=2.46e-05, step=1320] Training: 13%|█▎ | 1321/10000 [17:29<1:59:46, 1.21it/s, loss=0.1465, lr=2.46e-05, step=1320] Training: 13%|█▎ | 1321/10000 [17:29<1:59:46, 1.21it/s, loss=0.0817, lr=2.46e-05, step=1321] Training: 13%|█▎ | 1322/10000 [17:29<1:50:00, 1.31it/s, loss=0.0817, lr=2.46e-05, step=1321] Training: 13%|█▎ | 1322/10000 [17:29<1:50:00, 1.31it/s, loss=0.0688, lr=2.46e-05, step=1322] Training: 13%|█▎ | 1323/10000 [17:30<1:43:05, 1.40it/s, loss=0.0688, lr=2.46e-05, step=1322] Training: 13%|█▎ | 1323/10000 [17:30<1:43:05, 1.40it/s, loss=0.0187, lr=2.46e-05, step=1323] Training: 13%|█▎ | 1324/10000 [17:31<1:49:37, 1.32it/s, loss=0.0187, lr=2.46e-05, step=1323] Training: 13%|█▎ | 1324/10000 [17:31<1:49:37, 1.32it/s, loss=0.1001, lr=2.46e-05, step=1324] Training: 13%|█▎ | 1325/10000 [17:31<1:47:59, 1.34it/s, loss=0.1001, lr=2.46e-05, step=1324] Training: 13%|█▎ | 1325/10000 [17:31<1:47:59, 1.34it/s, loss=0.0403, lr=2.46e-05, step=1325] Training: 13%|█▎ | 1326/10000 [17:32<1:41:13, 1.43it/s, loss=0.0403, lr=2.46e-05, step=1325] Training: 13%|█▎ | 1326/10000 [17:32<1:41:13, 1.43it/s, loss=0.0355, lr=2.46e-05, step=1326] Training: 13%|█▎ | 1327/10000 [17:33<1:50:39, 1.31it/s, loss=0.0355, lr=2.46e-05, step=1326] Training: 13%|█▎ | 1327/10000 [17:33<1:50:39, 1.31it/s, loss=0.0579, lr=2.46e-05, step=1327] Training: 13%|█▎ | 1328/10000 [17:34<2:03:00, 1.17it/s, loss=0.0579, lr=2.46e-05, step=1327] Training: 13%|█▎ | 1328/10000 [17:34<2:03:00, 1.17it/s, loss=0.0822, lr=2.46e-05, step=1328] Training: 13%|█▎ | 1329/10000 [17:35<1:53:28, 1.27it/s, loss=0.0822, lr=2.46e-05, step=1328] Training: 13%|█▎ | 1329/10000 [17:35<1:53:28, 1.27it/s, loss=0.0523, lr=2.46e-05, step=1329]19:02:07.537 [I] step=1330 loss=0.0269 smoothed_loss=0.0527 lr=2.46e-05 grad_norm=0.7676 step_time=0.6015s data_time=0.1349s it/s=1.358 eta_to_10000=6383.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1866 grad_arm_token_fuse=0.0571 grad_shared_expert=0.4999 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1330/10000 [17:35<1:42:56, 1.40it/s, loss=0.0523, lr=2.46e-05, step=1329] Training: 13%|█▎ | 1330/10000 [17:35<1:42:56, 1.40it/s, loss=0.0269, lr=2.46e-05, step=1330] Training: 13%|█▎ | 1331/10000 [17:36<1:38:11, 1.47it/s, loss=0.0269, lr=2.46e-05, step=1330] Training: 13%|█▎ | 1331/10000 [17:36<1:38:11, 1.47it/s, loss=0.0504, lr=2.46e-05, step=1331] Training: 13%|█▎ | 1332/10000 [17:36<1:32:10, 1.57it/s, loss=0.0504, lr=2.46e-05, step=1331] Training: 13%|█▎ | 1332/10000 [17:36<1:32:10, 1.57it/s, loss=0.0469, lr=2.46e-05, step=1332] Training: 13%|█▎ | 1333/10000 [17:37<1:26:22, 1.67it/s, loss=0.0469, lr=2.46e-05, step=1332] Training: 13%|█▎ | 1333/10000 [17:37<1:26:22, 1.67it/s, loss=0.0223, lr=2.46e-05, step=1333] Training: 13%|█▎ | 1334/10000 [17:37<1:23:15, 1.73it/s, loss=0.0223, lr=2.46e-05, step=1333] Training: 13%|█▎ | 1334/10000 [17:37<1:23:15, 1.73it/s, loss=0.0803, lr=2.46e-05, step=1334] Training: 13%|█▎ | 1335/10000 [17:38<1:31:29, 1.58it/s, loss=0.0803, lr=2.46e-05, step=1334] Training: 13%|█▎ | 1335/10000 [17:38<1:31:29, 1.58it/s, loss=0.0584, lr=2.46e-05, step=1335] Training: 13%|█▎ | 1336/10000 [17:39<1:37:03, 1.49it/s, loss=0.0584, lr=2.46e-05, step=1335] Training: 13%|█▎ | 1336/10000 [17:39<1:37:03, 1.49it/s, loss=0.0160, lr=2.46e-05, step=1336] Training: 13%|█▎ | 1337/10000 [17:40<1:35:00, 1.52it/s, loss=0.0160, lr=2.46e-05, step=1336] Training: 13%|█▎ | 1337/10000 [17:40<1:35:00, 1.52it/s, loss=0.0468, lr=2.46e-05, step=1337] Training: 13%|█▎ | 1338/10000 [17:40<1:32:56, 1.55it/s, loss=0.0468, lr=2.46e-05, step=1337] Training: 13%|█▎ | 1338/10000 [17:40<1:32:56, 1.55it/s, loss=0.0448, lr=2.46e-05, step=1338] Training: 13%|█▎ | 1339/10000 [17:41<1:31:13, 1.58it/s, loss=0.0448, lr=2.46e-05, step=1338] Training: 13%|█▎ | 1339/10000 [17:41<1:31:13, 1.58it/s, loss=0.0337, lr=2.46e-05, step=1339]19:02:13.726 [I] step=1340 loss=0.0332 smoothed_loss=0.0456 lr=2.46e-05 grad_norm=0.7554 step_time=0.5265s data_time=0.0924s it/s=1.616 eta_to_10000=5358.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1731 grad_arm_token_fuse=0.0629 grad_shared_expert=0.4757 (18633:train_pytorch.py:850) + Training: 13%|█▎ | 1340/10000 [17:41<1:31:46, 1.57it/s, loss=0.0337, lr=2.46e-05, step=1339] Training: 13%|█▎ | 1340/10000 [17:41<1:31:46, 1.57it/s, loss=0.0332, lr=2.46e-05, step=1340] Training: 13%|█▎ | 1341/10000 [17:42<1:25:22, 1.69it/s, loss=0.0332, lr=2.46e-05, step=1340] Training: 13%|█▎ | 1341/10000 [17:42<1:25:22, 1.69it/s, loss=0.0169, lr=2.46e-05, step=1341] Training: 13%|█▎ | 1342/10000 [17:43<1:27:51, 1.64it/s, loss=0.0169, lr=2.46e-05, step=1341] Training: 13%|█▎ | 1342/10000 [17:43<1:27:51, 1.64it/s, loss=0.0380, lr=2.46e-05, step=1342] Training: 13%|█▎ | 1343/10000 [17:43<1:41:21, 1.42it/s, loss=0.0380, lr=2.46e-05, step=1342] Training: 13%|█▎ | 1343/10000 [17:43<1:41:21, 1.42it/s, loss=0.0247, lr=2.46e-05, step=1343] Training: 13%|█▎ | 1344/10000 [17:44<1:41:37, 1.42it/s, loss=0.0247, lr=2.46e-05, step=1343] Training: 13%|█▎ | 1344/10000 [17:44<1:41:37, 1.42it/s, loss=0.0445, lr=2.46e-05, step=1344] Training: 13%|█▎ | 1345/10000 [17:45<1:38:26, 1.47it/s, loss=0.0445, lr=2.46e-05, step=1344] Training: 13%|█▎ | 1345/10000 [17:45<1:38:26, 1.47it/s, loss=0.0256, lr=2.46e-05, step=1345] Training: 13%|█▎ | 1346/10000 [17:46<1:40:29, 1.44it/s, loss=0.0256, lr=2.46e-05, step=1345] Training: 13%|█▎ | 1346/10000 [17:46<1:40:29, 1.44it/s, loss=0.0550, lr=2.46e-05, step=1346] Training: 13%|█▎ | 1347/10000 [17:46<1:37:42, 1.48it/s, loss=0.0550, lr=2.46e-05, step=1346] Training: 13%|█▎ | 1347/10000 [17:46<1:37:42, 1.48it/s, loss=0.0752, lr=2.46e-05, step=1347] Training: 13%|█▎ | 1348/10000 [17:47<1:34:23, 1.53it/s, loss=0.0752, lr=2.46e-05, step=1347] Training: 13%|█▎ | 1348/10000 [17:47<1:34:23, 1.53it/s, loss=0.0522, lr=2.46e-05, step=1348] Training: 13%|█▎ | 1349/10000 [17:47<1:36:59, 1.49it/s, loss=0.0522, lr=2.46e-05, step=1348] Training: 13%|█▎ | 1349/10000 [17:47<1:36:59, 1.49it/s, loss=0.0425, lr=2.46e-05, step=1349]19:02:20.761 [I] step=1350 loss=0.0650 smoothed_loss=0.0469 lr=2.46e-05 grad_norm=0.6417 step_time=0.5840s data_time=0.1195s it/s=1.422 eta_to_10000=6084.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.2265 grad_arm_token_fuse=0.0719 grad_shared_expert=0.6469 (18633:train_pytorch.py:850) + Training: 14%|█▎ | 1350/10000 [17:48<1:49:17, 1.32it/s, loss=0.0425, lr=2.46e-05, step=1349] Training: 14%|█▎ | 1350/10000 [17:48<1:49:17, 1.32it/s, loss=0.0650, lr=2.46e-05, step=1350] Training: 14%|█▎ | 1351/10000 [17:49<1:43:04, 1.40it/s, loss=0.0650, lr=2.46e-05, step=1350] Training: 14%|█▎ | 1351/10000 [17:49<1:43:04, 1.40it/s, loss=0.0270, lr=2.46e-05, step=1351] Training: 14%|█▎ | 1352/10000 [17:50<1:39:53, 1.44it/s, loss=0.0270, lr=2.46e-05, step=1351] Training: 14%|█▎ | 1352/10000 [17:50<1:39:53, 1.44it/s, loss=0.0169, lr=2.46e-05, step=1352] Training: 14%|█▎ | 1353/10000 [17:50<1:36:24, 1.49it/s, loss=0.0169, lr=2.46e-05, step=1352] Training: 14%|█▎ | 1353/10000 [17:50<1:36:24, 1.49it/s, loss=0.0358, lr=2.46e-05, step=1353] Training: 14%|█▎ | 1354/10000 [17:51<1:34:14, 1.53it/s, loss=0.0358, lr=2.46e-05, step=1353] Training: 14%|█▎ | 1354/10000 [17:51<1:34:14, 1.53it/s, loss=0.0630, lr=2.46e-05, step=1354] Training: 14%|█▎ | 1355/10000 [17:51<1:31:00, 1.58it/s, loss=0.0630, lr=2.46e-05, step=1354] Training: 14%|█▎ | 1355/10000 [17:52<1:31:00, 1.58it/s, loss=0.0482, lr=2.46e-05, step=1355] Training: 14%|█▎ | 1356/10000 [17:52<1:28:50, 1.62it/s, loss=0.0482, lr=2.46e-05, step=1355] Training: 14%|█▎ | 1356/10000 [17:52<1:28:50, 1.62it/s, loss=0.0205, lr=2.46e-05, step=1356] Training: 14%|█▎ | 1357/10000 [17:53<1:35:15, 1.51it/s, loss=0.0205, lr=2.46e-05, step=1356] Training: 14%|█▎ | 1357/10000 [17:53<1:35:15, 1.51it/s, loss=0.0230, lr=2.46e-05, step=1357] Training: 14%|█▎ | 1358/10000 [17:53<1:27:47, 1.64it/s, loss=0.0230, lr=2.46e-05, step=1357] Training: 14%|█▎ | 1358/10000 [17:53<1:27:47, 1.64it/s, loss=0.0161, lr=2.46e-05, step=1358] Training: 14%|█▎ | 1359/10000 [17:54<1:29:06, 1.62it/s, loss=0.0161, lr=2.46e-05, step=1358] Training: 14%|█▎ | 1359/10000 [17:54<1:29:06, 1.62it/s, loss=0.0397, lr=2.46e-05, step=1359]19:02:26.906 [I] step=1360 loss=0.0202 smoothed_loss=0.0359 lr=2.46e-05 grad_norm=0.5651 step_time=0.5246s data_time=0.0899s it/s=1.628 eta_to_10000=5308.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1162 grad_arm_token_fuse=0.0588 grad_shared_expert=0.3670 (18633:train_pytorch.py:850) + Training: 14%|█▎ | 1360/10000 [17:55<1:28:20, 1.63it/s, loss=0.0397, lr=2.46e-05, step=1359] Training: 14%|█▎ | 1360/10000 [17:55<1:28:20, 1.63it/s, loss=0.0202, lr=2.45e-05, step=1360] Training: 14%|█▎ | 1361/10000 [17:55<1:22:51, 1.74it/s, loss=0.0202, lr=2.45e-05, step=1360] Training: 14%|█▎ | 1361/10000 [17:55<1:22:51, 1.74it/s, loss=0.0302, lr=2.45e-05, step=1361] Training: 14%|█▎ | 1362/10000 [17:56<1:25:03, 1.69it/s, loss=0.0302, lr=2.45e-05, step=1361] Training: 14%|█▎ | 1362/10000 [17:56<1:25:03, 1.69it/s, loss=0.0337, lr=2.45e-05, step=1362] Training: 14%|█▎ | 1363/10000 [17:56<1:20:39, 1.78it/s, loss=0.0337, lr=2.45e-05, step=1362] Training: 14%|█▎ | 1363/10000 [17:56<1:20:39, 1.78it/s, loss=0.0340, lr=2.45e-05, step=1363] Training: 14%|█▎ | 1364/10000 [17:57<1:29:05, 1.62it/s, loss=0.0340, lr=2.45e-05, step=1363] Training: 14%|█▎ | 1364/10000 [17:57<1:29:05, 1.62it/s, loss=0.0183, lr=2.45e-05, step=1364] Training: 14%|█▎ | 1365/10000 [17:58<1:31:31, 1.57it/s, loss=0.0183, lr=2.45e-05, step=1364] Training: 14%|█▎ | 1365/10000 [17:58<1:31:31, 1.57it/s, loss=0.0319, lr=2.45e-05, step=1365] Training: 14%|█▎ | 1366/10000 [17:58<1:27:22, 1.65it/s, loss=0.0319, lr=2.45e-05, step=1365] Training: 14%|█▎ | 1366/10000 [17:58<1:27:22, 1.65it/s, loss=0.0251, lr=2.45e-05, step=1366] Training: 14%|█▎ | 1367/10000 [17:59<1:30:18, 1.59it/s, loss=0.0251, lr=2.45e-05, step=1366] Training: 14%|█▎ | 1367/10000 [17:59<1:30:18, 1.59it/s, loss=0.0339, lr=2.45e-05, step=1367] Training: 14%|█▎ | 1368/10000 [17:59<1:25:59, 1.67it/s, loss=0.0339, lr=2.45e-05, step=1367] Training: 14%|█▎ | 1368/10000 [17:59<1:25:59, 1.67it/s, loss=0.0132, lr=2.45e-05, step=1368] Training: 14%|█▎ | 1369/10000 [18:00<1:24:41, 1.70it/s, loss=0.0132, lr=2.45e-05, step=1368] Training: 14%|█▎ | 1369/10000 [18:00<1:24:41, 1.70it/s, loss=0.0228, lr=2.45e-05, step=1369]19:02:32.931 [I] step=1370 loss=0.0197 smoothed_loss=0.0288 lr=2.45e-05 grad_norm=0.6260 step_time=0.5151s data_time=0.0874s it/s=1.660 eta_to_10000=5198.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0159 grad_action_out_proj_arms=0.1756 grad_arm_token_fuse=0.0854 grad_shared_expert=0.6493 (18633:train_pytorch.py:850) + Training: 14%|█▎ | 1370/10000 [18:01<1:28:36, 1.62it/s, loss=0.0228, lr=2.45e-05, step=1369] Training: 14%|█▎ | 1370/10000 [18:01<1:28:36, 1.62it/s, loss=0.0197, lr=2.45e-05, step=1370] Training: 14%|█▎ | 1371/10000 [18:01<1:36:27, 1.49it/s, loss=0.0197, lr=2.45e-05, step=1370] Training: 14%|█▎ | 1371/10000 [18:01<1:36:27, 1.49it/s, loss=0.0216, lr=2.45e-05, step=1371] Training: 14%|█▎ | 1372/10000 [18:02<1:36:01, 1.50it/s, loss=0.0216, lr=2.45e-05, step=1371] Training: 14%|█▎ | 1372/10000 [18:02<1:36:01, 1.50it/s, loss=0.0633, lr=2.45e-05, step=1372] Training: 14%|█▎ | 1373/10000 [18:03<1:29:01, 1.62it/s, loss=0.0633, lr=2.45e-05, step=1372] Training: 14%|█▎ | 1373/10000 [18:03<1:29:01, 1.62it/s, loss=0.0240, lr=2.45e-05, step=1373] Training: 14%|█▎ | 1374/10000 [18:03<1:28:27, 1.63it/s, loss=0.0240, lr=2.45e-05, step=1373] Training: 14%|█▎ | 1374/10000 [18:03<1:28:27, 1.63it/s, loss=0.0356, lr=2.45e-05, step=1374] Training: 14%|█▍ | 1375/10000 [18:04<1:22:57, 1.73it/s, loss=0.0356, lr=2.45e-05, step=1374] Training: 14%|█▍ | 1375/10000 [18:04<1:22:57, 1.73it/s, loss=0.0400, lr=2.45e-05, step=1375] Training: 14%|█▍ | 1376/10000 [18:04<1:19:05, 1.82it/s, loss=0.0400, lr=2.45e-05, step=1375] Training: 14%|█▍ | 1376/10000 [18:04<1:19:05, 1.82it/s, loss=0.0525, lr=2.45e-05, step=1376] Training: 14%|█▍ | 1377/10000 [18:05<1:16:01, 1.89it/s, loss=0.0525, lr=2.45e-05, step=1376] Training: 14%|█▍ | 1377/10000 [18:05<1:16:01, 1.89it/s, loss=0.0238, lr=2.45e-05, step=1377] Training: 14%|█▍ | 1378/10000 [18:05<1:25:58, 1.67it/s, loss=0.0238, lr=2.45e-05, step=1377] Training: 14%|█▍ | 1378/10000 [18:05<1:25:58, 1.67it/s, loss=0.0269, lr=2.45e-05, step=1378] Training: 14%|█▍ | 1379/10000 [18:06<1:36:44, 1.49it/s, loss=0.0269, lr=2.45e-05, step=1378] Training: 14%|█▍ | 1379/10000 [18:06<1:36:44, 1.49it/s, loss=0.0299, lr=2.45e-05, step=1379]19:02:39.146 [I] step=1380 loss=0.0372 smoothed_loss=0.0328 lr=2.45e-05 grad_norm=0.7184 step_time=0.5480s data_time=0.0735s it/s=1.609 eta_to_10000=5356.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0138 grad_action_out_proj_arms=0.1706 grad_arm_token_fuse=0.0791 grad_shared_expert=0.7708 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1380/10000 [18:07<1:32:43, 1.55it/s, loss=0.0299, lr=2.45e-05, step=1379] Training: 14%|█▍ | 1380/10000 [18:07<1:32:43, 1.55it/s, loss=0.0372, lr=2.45e-05, step=1380] Training: 14%|█▍ | 1381/10000 [18:07<1:28:43, 1.62it/s, loss=0.0372, lr=2.45e-05, step=1380] Training: 14%|█▍ | 1381/10000 [18:07<1:28:43, 1.62it/s, loss=0.0233, lr=2.45e-05, step=1381] Training: 14%|█▍ | 1382/10000 [18:08<1:27:34, 1.64it/s, loss=0.0233, lr=2.45e-05, step=1381] Training: 14%|█▍ | 1382/10000 [18:08<1:27:34, 1.64it/s, loss=0.0162, lr=2.45e-05, step=1382] Training: 14%|█▍ | 1383/10000 [18:09<1:34:46, 1.52it/s, loss=0.0162, lr=2.45e-05, step=1382] Training: 14%|█▍ | 1383/10000 [18:09<1:34:46, 1.52it/s, loss=0.0342, lr=2.45e-05, step=1383] Training: 14%|█▍ | 1384/10000 [18:09<1:31:42, 1.57it/s, loss=0.0342, lr=2.45e-05, step=1383] Training: 14%|█▍ | 1384/10000 [18:09<1:31:42, 1.57it/s, loss=0.0240, lr=2.45e-05, step=1384] Training: 14%|█▍ | 1385/10000 [18:10<1:34:58, 1.51it/s, loss=0.0240, lr=2.45e-05, step=1384] Training: 14%|█▍ | 1385/10000 [18:10<1:34:58, 1.51it/s, loss=0.0389, lr=2.45e-05, step=1385] Training: 14%|█▍ | 1386/10000 [18:11<1:47:11, 1.34it/s, loss=0.0389, lr=2.45e-05, step=1385] Training: 14%|█▍ | 1386/10000 [18:11<1:47:11, 1.34it/s, loss=0.0218, lr=2.45e-05, step=1386] Training: 14%|█▍ | 1387/10000 [18:12<1:37:42, 1.47it/s, loss=0.0218, lr=2.45e-05, step=1386] Training: 14%|█▍ | 1387/10000 [18:12<1:37:42, 1.47it/s, loss=0.0239, lr=2.45e-05, step=1387] Training: 14%|█▍ | 1388/10000 [18:12<1:49:24, 1.31it/s, loss=0.0239, lr=2.45e-05, step=1387] Training: 14%|█▍ | 1388/10000 [18:12<1:49:24, 1.31it/s, loss=0.0781, lr=2.45e-05, step=1388] Training: 14%|█▍ | 1389/10000 [18:13<1:43:53, 1.38it/s, loss=0.0781, lr=2.45e-05, step=1388] Training: 14%|█▍ | 1389/10000 [18:13<1:43:53, 1.38it/s, loss=0.0272, lr=2.45e-05, step=1389]19:02:46.114 [I] step=1390 loss=0.0569 smoothed_loss=0.0359 lr=2.45e-05 grad_norm=0.7087 step_time=0.5831s data_time=0.1137s it/s=1.435 eta_to_10000=5998.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1289 grad_arm_token_fuse=0.0540 grad_shared_expert=0.4897 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1390/10000 [18:14<1:42:10, 1.40it/s, loss=0.0272, lr=2.45e-05, step=1389] Training: 14%|█▍ | 1390/10000 [18:14<1:42:10, 1.40it/s, loss=0.0569, lr=2.45e-05, step=1390] Training: 14%|█▍ | 1391/10000 [18:14<1:37:09, 1.48it/s, loss=0.0569, lr=2.45e-05, step=1390] Training: 14%|█▍ | 1391/10000 [18:14<1:37:09, 1.48it/s, loss=0.0379, lr=2.45e-05, step=1391] Training: 14%|█▍ | 1392/10000 [18:15<1:37:45, 1.47it/s, loss=0.0379, lr=2.45e-05, step=1391] Training: 14%|█▍ | 1392/10000 [18:15<1:37:45, 1.47it/s, loss=0.0155, lr=2.45e-05, step=1392] Training: 14%|█▍ | 1393/10000 [18:16<1:54:13, 1.26it/s, loss=0.0155, lr=2.45e-05, step=1392] Training: 14%|█▍ | 1393/10000 [18:16<1:54:13, 1.26it/s, loss=0.0358, lr=2.45e-05, step=1393] Training: 14%|█▍ | 1394/10000 [18:17<1:45:32, 1.36it/s, loss=0.0358, lr=2.45e-05, step=1393] Training: 14%|█▍ | 1394/10000 [18:17<1:45:32, 1.36it/s, loss=0.0154, lr=2.45e-05, step=1394] Training: 14%|█▍ | 1395/10000 [18:17<1:37:39, 1.47it/s, loss=0.0154, lr=2.45e-05, step=1394] Training: 14%|█▍ | 1395/10000 [18:17<1:37:39, 1.47it/s, loss=0.0249, lr=2.45e-05, step=1395] Training: 14%|█▍ | 1396/10000 [18:18<1:34:48, 1.51it/s, loss=0.0249, lr=2.45e-05, step=1395] Training: 14%|█▍ | 1396/10000 [18:18<1:34:48, 1.51it/s, loss=0.0419, lr=2.45e-05, step=1396] Training: 14%|█▍ | 1397/10000 [18:19<1:34:04, 1.52it/s, loss=0.0419, lr=2.45e-05, step=1396] Training: 14%|█▍ | 1397/10000 [18:19<1:34:04, 1.52it/s, loss=0.0675, lr=2.45e-05, step=1397] Training: 14%|█▍ | 1398/10000 [18:19<1:38:50, 1.45it/s, loss=0.0675, lr=2.45e-05, step=1397] Training: 14%|█▍ | 1398/10000 [18:19<1:38:50, 1.45it/s, loss=0.0287, lr=2.45e-05, step=1398] Training: 14%|█▍ | 1399/10000 [18:20<1:36:18, 1.49it/s, loss=0.0287, lr=2.45e-05, step=1398] Training: 14%|█▍ | 1399/10000 [18:20<1:36:18, 1.49it/s, loss=0.0328, lr=2.45e-05, step=1399]19:02:53.093 [I] step=1400 loss=0.0556 smoothed_loss=0.0371 lr=2.45e-05 grad_norm=0.7077 step_time=0.5830s data_time=0.1150s it/s=1.433 eta_to_10000=6001.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0268 grad_action_out_proj_arms=0.2455 grad_arm_token_fuse=0.1355 grad_shared_expert=0.6964 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1400/10000 [18:21<1:42:48, 1.39it/s, loss=0.0328, lr=2.45e-05, step=1399] Training: 14%|█▍ | 1400/10000 [18:21<1:42:48, 1.39it/s, loss=0.0556, lr=2.45e-05, step=1400] Training: 14%|█▍ | 1401/10000 [18:21<1:34:59, 1.51it/s, loss=0.0556, lr=2.45e-05, step=1400] Training: 14%|█▍ | 1401/10000 [18:21<1:34:59, 1.51it/s, loss=0.0140, lr=2.45e-05, step=1401] Training: 14%|█▍ | 1402/10000 [18:22<1:28:37, 1.62it/s, loss=0.0140, lr=2.45e-05, step=1401] Training: 14%|█▍ | 1402/10000 [18:22<1:28:37, 1.62it/s, loss=0.0266, lr=2.45e-05, step=1402] Training: 14%|█▍ | 1403/10000 [18:22<1:23:12, 1.72it/s, loss=0.0266, lr=2.45e-05, step=1402] Training: 14%|█▍ | 1403/10000 [18:22<1:23:12, 1.72it/s, loss=0.0384, lr=2.45e-05, step=1403] Training: 14%|█▍ | 1404/10000 [18:23<1:29:14, 1.61it/s, loss=0.0384, lr=2.45e-05, step=1403] Training: 14%|█▍ | 1404/10000 [18:23<1:29:14, 1.61it/s, loss=0.0781, lr=2.45e-05, step=1404] Training: 14%|█▍ | 1405/10000 [18:24<1:30:52, 1.58it/s, loss=0.0781, lr=2.45e-05, step=1404] Training: 14%|█▍ | 1405/10000 [18:24<1:30:52, 1.58it/s, loss=0.0524, lr=2.45e-05, step=1405] Training: 14%|█▍ | 1406/10000 [18:24<1:34:38, 1.51it/s, loss=0.0524, lr=2.45e-05, step=1405] Training: 14%|█▍ | 1406/10000 [18:24<1:34:38, 1.51it/s, loss=0.0294, lr=2.45e-05, step=1406] Training: 14%|█▍ | 1407/10000 [18:25<1:43:33, 1.38it/s, loss=0.0294, lr=2.45e-05, step=1406] Training: 14%|█▍ | 1407/10000 [18:25<1:43:33, 1.38it/s, loss=0.0368, lr=2.45e-05, step=1407] Training: 14%|█▍ | 1408/10000 [18:26<1:51:42, 1.28it/s, loss=0.0368, lr=2.45e-05, step=1407] Training: 14%|█▍ | 1408/10000 [18:26<1:51:42, 1.28it/s, loss=0.0151, lr=2.45e-05, step=1408] Training: 14%|█▍ | 1409/10000 [18:27<1:44:22, 1.37it/s, loss=0.0151, lr=2.45e-05, step=1408] Training: 14%|█▍ | 1409/10000 [18:27<1:44:22, 1.37it/s, loss=0.0314, lr=2.45e-05, step=1409]19:02:59.806 [I] step=1410 loss=0.0419 smoothed_loss=0.0366 lr=2.45e-05 grad_norm=0.7837 step_time=0.5452s data_time=0.1260s it/s=1.490 eta_to_10000=5765.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.2158 grad_arm_token_fuse=0.0786 grad_shared_expert=0.5569 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1410/10000 [18:27<1:41:58, 1.40it/s, loss=0.0314, lr=2.45e-05, step=1409] Training: 14%|█▍ | 1410/10000 [18:27<1:41:58, 1.40it/s, loss=0.0419, lr=2.45e-05, step=1410] Training: 14%|█▍ | 1411/10000 [18:28<1:36:45, 1.48it/s, loss=0.0419, lr=2.45e-05, step=1410] Training: 14%|█▍ | 1411/10000 [18:28<1:36:45, 1.48it/s, loss=0.0123, lr=2.45e-05, step=1411] Training: 14%|█▍ | 1412/10000 [18:29<1:33:42, 1.53it/s, loss=0.0123, lr=2.45e-05, step=1411] Training: 14%|█▍ | 1412/10000 [18:29<1:33:42, 1.53it/s, loss=0.0352, lr=2.45e-05, step=1412] Training: 14%|█▍ | 1413/10000 [18:29<1:31:53, 1.56it/s, loss=0.0352, lr=2.45e-05, step=1412] Training: 14%|█▍ | 1413/10000 [18:29<1:31:53, 1.56it/s, loss=0.0222, lr=2.45e-05, step=1413] Training: 14%|█▍ | 1414/10000 [18:30<1:41:29, 1.41it/s, loss=0.0222, lr=2.45e-05, step=1413] Training: 14%|█▍ | 1414/10000 [18:30<1:41:29, 1.41it/s, loss=0.0378, lr=2.45e-05, step=1414] Training: 14%|█▍ | 1415/10000 [18:31<1:46:11, 1.35it/s, loss=0.0378, lr=2.45e-05, step=1414] Training: 14%|█▍ | 1415/10000 [18:31<1:46:11, 1.35it/s, loss=0.0424, lr=2.45e-05, step=1415] Training: 14%|█▍ | 1416/10000 [18:32<1:45:58, 1.35it/s, loss=0.0424, lr=2.45e-05, step=1415] Training: 14%|█▍ | 1416/10000 [18:32<1:45:58, 1.35it/s, loss=0.0217, lr=2.45e-05, step=1416] Training: 14%|█▍ | 1417/10000 [18:32<1:43:37, 1.38it/s, loss=0.0217, lr=2.45e-05, step=1416] Training: 14%|█▍ | 1417/10000 [18:32<1:43:37, 1.38it/s, loss=0.0226, lr=2.45e-05, step=1417] Training: 14%|█▍ | 1418/10000 [18:33<1:43:40, 1.38it/s, loss=0.0226, lr=2.45e-05, step=1417] Training: 14%|█▍ | 1418/10000 [18:33<1:43:40, 1.38it/s, loss=0.0427, lr=2.45e-05, step=1418] Training: 14%|█▍ | 1419/10000 [18:34<1:33:49, 1.52it/s, loss=0.0427, lr=2.45e-05, step=1418] Training: 14%|█▍ | 1419/10000 [18:34<1:33:49, 1.52it/s, loss=0.0284, lr=2.45e-05, step=1419]19:03:06.493 [I] step=1420 loss=0.0396 smoothed_loss=0.0334 lr=2.45e-05 grad_norm=0.7347 step_time=0.5458s data_time=0.1229s it/s=1.496 eta_to_10000=5736.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0153 grad_action_out_proj_arms=0.1414 grad_arm_token_fuse=0.0792 grad_shared_expert=0.6606 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1420/10000 [18:34<1:29:11, 1.60it/s, loss=0.0284, lr=2.45e-05, step=1419] Training: 14%|█▍ | 1420/10000 [18:34<1:29:11, 1.60it/s, loss=0.0396, lr=2.45e-05, step=1420] Training: 14%|█▍ | 1421/10000 [18:35<1:33:44, 1.53it/s, loss=0.0396, lr=2.45e-05, step=1420] Training: 14%|█▍ | 1421/10000 [18:35<1:33:44, 1.53it/s, loss=0.0283, lr=2.45e-05, step=1421] Training: 14%|█▍ | 1422/10000 [18:36<1:56:06, 1.23it/s, loss=0.0283, lr=2.45e-05, step=1421] Training: 14%|█▍ | 1422/10000 [18:36<1:56:06, 1.23it/s, loss=0.0261, lr=2.45e-05, step=1422] Training: 14%|█▍ | 1423/10000 [18:37<1:55:44, 1.24it/s, loss=0.0261, lr=2.45e-05, step=1422] Training: 14%|█▍ | 1423/10000 [18:37<1:55:44, 1.24it/s, loss=0.0468, lr=2.45e-05, step=1423] Training: 14%|█▍ | 1424/10000 [18:37<1:46:59, 1.34it/s, loss=0.0468, lr=2.45e-05, step=1423] Training: 14%|█▍ | 1424/10000 [18:37<1:46:59, 1.34it/s, loss=0.0582, lr=2.45e-05, step=1424] Training: 14%|█▍ | 1425/10000 [18:38<1:46:03, 1.35it/s, loss=0.0582, lr=2.45e-05, step=1424] Training: 14%|█▍ | 1425/10000 [18:38<1:46:03, 1.35it/s, loss=0.0150, lr=2.45e-05, step=1425] Training: 14%|█▍ | 1426/10000 [18:39<1:45:43, 1.35it/s, loss=0.0150, lr=2.45e-05, step=1425] Training: 14%|█▍ | 1426/10000 [18:39<1:45:43, 1.35it/s, loss=0.0296, lr=2.45e-05, step=1426] Training: 14%|█▍ | 1427/10000 [18:40<1:43:00, 1.39it/s, loss=0.0296, lr=2.45e-05, step=1426] Training: 14%|█▍ | 1427/10000 [18:40<1:43:00, 1.39it/s, loss=0.0371, lr=2.45e-05, step=1427] Training: 14%|█▍ | 1428/10000 [18:41<1:50:52, 1.29it/s, loss=0.0371, lr=2.45e-05, step=1427] Training: 14%|█▍ | 1428/10000 [18:41<1:50:52, 1.29it/s, loss=0.1348, lr=2.45e-05, step=1428] Training: 14%|█▍ | 1429/10000 [18:41<1:44:35, 1.37it/s, loss=0.1348, lr=2.45e-05, step=1428] Training: 14%|█▍ | 1429/10000 [18:41<1:44:35, 1.37it/s, loss=0.0471, lr=2.45e-05, step=1429]19:03:14.492 [I] step=1430 loss=0.0645 smoothed_loss=0.0463 lr=2.45e-05 grad_norm=0.6815 step_time=0.6696s data_time=0.1302s it/s=1.250 eta_to_10000=6853.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0162 grad_action_out_proj_arms=0.2143 grad_arm_token_fuse=0.0927 grad_shared_expert=0.7901 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1430/10000 [18:42<1:56:25, 1.23it/s, loss=0.0471, lr=2.45e-05, step=1429] Training: 14%|█▍ | 1430/10000 [18:42<1:56:25, 1.23it/s, loss=0.0645, lr=2.45e-05, step=1430] Training: 14%|█▍ | 1431/10000 [18:43<1:43:35, 1.38it/s, loss=0.0645, lr=2.45e-05, step=1430] Training: 14%|█▍ | 1431/10000 [18:43<1:43:35, 1.38it/s, loss=0.0202, lr=2.45e-05, step=1431] Training: 14%|█▍ | 1432/10000 [18:43<1:39:42, 1.43it/s, loss=0.0202, lr=2.45e-05, step=1431] Training: 14%|█▍ | 1432/10000 [18:43<1:39:42, 1.43it/s, loss=0.0475, lr=2.45e-05, step=1432] Training: 14%|█▍ | 1433/10000 [18:44<1:40:26, 1.42it/s, loss=0.0475, lr=2.45e-05, step=1432] Training: 14%|█▍ | 1433/10000 [18:44<1:40:26, 1.42it/s, loss=0.0498, lr=2.45e-05, step=1433] Training: 14%|█▍ | 1434/10000 [18:45<1:34:34, 1.51it/s, loss=0.0498, lr=2.45e-05, step=1433] Training: 14%|█▍ | 1434/10000 [18:45<1:34:34, 1.51it/s, loss=0.0273, lr=2.45e-05, step=1434] Training: 14%|█▍ | 1435/10000 [18:45<1:30:59, 1.57it/s, loss=0.0273, lr=2.45e-05, step=1434] Training: 14%|█▍ | 1435/10000 [18:45<1:30:59, 1.57it/s, loss=0.0670, lr=2.45e-05, step=1435] Training: 14%|█▍ | 1436/10000 [18:46<1:38:06, 1.45it/s, loss=0.0670, lr=2.45e-05, step=1435] Training: 14%|█▍ | 1436/10000 [18:46<1:38:06, 1.45it/s, loss=0.0166, lr=2.45e-05, step=1436] Training: 14%|█▍ | 1437/10000 [18:47<1:36:01, 1.49it/s, loss=0.0166, lr=2.45e-05, step=1436] Training: 14%|█▍ | 1437/10000 [18:47<1:36:01, 1.49it/s, loss=0.0396, lr=2.45e-05, step=1437] Training: 14%|█▍ | 1438/10000 [18:48<1:46:05, 1.35it/s, loss=0.0396, lr=2.45e-05, step=1437] Training: 14%|█▍ | 1438/10000 [18:48<1:46:05, 1.35it/s, loss=0.0148, lr=2.45e-05, step=1438] Training: 14%|█▍ | 1439/10000 [18:49<2:04:16, 1.15it/s, loss=0.0148, lr=2.45e-05, step=1438] Training: 14%|█▍ | 1439/10000 [18:49<2:04:16, 1.15it/s, loss=0.0327, lr=2.45e-05, step=1439]19:03:21.718 [I] step=1440 loss=0.0349 smoothed_loss=0.0384 lr=2.45e-05 grad_norm=0.7119 step_time=0.5910s data_time=0.1316s it/s=1.384 eta_to_10000=6184.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0127 grad_action_out_proj_arms=0.1841 grad_arm_token_fuse=0.0661 grad_shared_expert=0.6052 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1440/10000 [18:49<1:56:41, 1.22it/s, loss=0.0327, lr=2.45e-05, step=1439] Training: 14%|█▍ | 1440/10000 [18:49<1:56:41, 1.22it/s, loss=0.0349, lr=2.45e-05, step=1440] Training: 14%|█▍ | 1441/10000 [18:50<1:47:23, 1.33it/s, loss=0.0349, lr=2.45e-05, step=1440] Training: 14%|█▍ | 1441/10000 [18:50<1:47:23, 1.33it/s, loss=0.0192, lr=2.45e-05, step=1441] Training: 14%|█▍ | 1442/10000 [18:51<1:43:04, 1.38it/s, loss=0.0192, lr=2.45e-05, step=1441] Training: 14%|█▍ | 1442/10000 [18:51<1:43:04, 1.38it/s, loss=0.0434, lr=2.45e-05, step=1442] Training: 14%|█▍ | 1443/10000 [18:52<1:51:27, 1.28it/s, loss=0.0434, lr=2.45e-05, step=1442] Training: 14%|█▍ | 1443/10000 [18:52<1:51:27, 1.28it/s, loss=0.0216, lr=2.45e-05, step=1443] Training: 14%|█▍ | 1444/10000 [18:53<2:00:00, 1.19it/s, loss=0.0216, lr=2.45e-05, step=1443] Training: 14%|█▍ | 1444/10000 [18:53<2:00:00, 1.19it/s, loss=0.0502, lr=2.45e-05, step=1444] Training: 14%|█▍ | 1445/10000 [18:54<2:05:04, 1.14it/s, loss=0.0502, lr=2.45e-05, step=1444] Training: 14%|█▍ | 1445/10000 [18:54<2:05:04, 1.14it/s, loss=0.1545, lr=2.45e-05, step=1445] Training: 14%|█▍ | 1446/10000 [18:54<1:50:08, 1.29it/s, loss=0.1545, lr=2.45e-05, step=1445] Training: 14%|█▍ | 1446/10000 [18:54<1:50:08, 1.29it/s, loss=0.0543, lr=2.45e-05, step=1446] Training: 14%|█▍ | 1447/10000 [18:55<1:38:16, 1.45it/s, loss=0.0543, lr=2.45e-05, step=1446] Training: 14%|█▍ | 1447/10000 [18:55<1:38:16, 1.45it/s, loss=0.0250, lr=2.45e-05, step=1447] Training: 14%|█▍ | 1448/10000 [18:55<1:29:36, 1.59it/s, loss=0.0250, lr=2.45e-05, step=1447] Training: 14%|█▍ | 1448/10000 [18:55<1:29:36, 1.59it/s, loss=0.0293, lr=2.45e-05, step=1448] Training: 14%|█▍ | 1449/10000 [18:56<1:24:36, 1.68it/s, loss=0.0293, lr=2.45e-05, step=1448] Training: 14%|█▍ | 1449/10000 [18:56<1:24:36, 1.68it/s, loss=0.0189, lr=2.45e-05, step=1449]19:03:28.674 [I] step=1450 loss=0.0644 smoothed_loss=0.0447 lr=2.45e-05 grad_norm=0.7383 step_time=0.5951s data_time=0.1005s it/s=1.438 eta_to_10000=5946.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0160 grad_action_out_proj_arms=0.1920 grad_arm_token_fuse=0.0822 grad_shared_expert=0.6591 (18633:train_pytorch.py:850) + Training: 14%|█▍ | 1450/10000 [18:56<1:34:14, 1.51it/s, loss=0.0189, lr=2.45e-05, step=1449] Training: 14%|█▍ | 1450/10000 [18:56<1:34:14, 1.51it/s, loss=0.0644, lr=2.45e-05, step=1450] Training: 15%|█▍ | 1451/10000 [18:57<1:27:15, 1.63it/s, loss=0.0644, lr=2.45e-05, step=1450] Training: 15%|█▍ | 1451/10000 [18:57<1:27:15, 1.63it/s, loss=0.0239, lr=2.44e-05, step=1451] Training: 15%|█▍ | 1452/10000 [18:58<1:29:45, 1.59it/s, loss=0.0239, lr=2.44e-05, step=1451] Training: 15%|█▍ | 1452/10000 [18:58<1:29:45, 1.59it/s, loss=0.0130, lr=2.44e-05, step=1452] Training: 15%|█▍ | 1453/10000 [18:58<1:23:56, 1.70it/s, loss=0.0130, lr=2.44e-05, step=1452] Training: 15%|█▍ | 1453/10000 [18:58<1:23:56, 1.70it/s, loss=0.0375, lr=2.44e-05, step=1453] Training: 15%|█▍ | 1454/10000 [18:59<1:20:56, 1.76it/s, loss=0.0375, lr=2.44e-05, step=1453] Training: 15%|█▍ | 1454/10000 [18:59<1:20:56, 1.76it/s, loss=0.0453, lr=2.44e-05, step=1454] Training: 15%|█▍ | 1455/10000 [18:59<1:18:10, 1.82it/s, loss=0.0453, lr=2.44e-05, step=1454] Training: 15%|█▍ | 1455/10000 [18:59<1:18:10, 1.82it/s, loss=0.1361, lr=2.44e-05, step=1455] Training: 15%|█▍ | 1456/10000 [19:00<1:17:07, 1.85it/s, loss=0.1361, lr=2.44e-05, step=1455] Training: 15%|█▍ | 1456/10000 [19:00<1:17:07, 1.85it/s, loss=0.0168, lr=2.44e-05, step=1456] Training: 15%|█▍ | 1457/10000 [19:00<1:26:04, 1.65it/s, loss=0.0168, lr=2.44e-05, step=1456] Training: 15%|█▍ | 1457/10000 [19:00<1:26:04, 1.65it/s, loss=0.0233, lr=2.44e-05, step=1457] Training: 15%|█▍ | 1458/10000 [19:01<1:21:20, 1.75it/s, loss=0.0233, lr=2.44e-05, step=1457] Training: 15%|█▍ | 1458/10000 [19:01<1:21:20, 1.75it/s, loss=0.0335, lr=2.44e-05, step=1458] Training: 15%|█▍ | 1459/10000 [19:01<1:18:17, 1.82it/s, loss=0.0335, lr=2.44e-05, step=1458] Training: 15%|█▍ | 1459/10000 [19:01<1:18:17, 1.82it/s, loss=0.0266, lr=2.44e-05, step=1459]19:03:34.323 [I] step=1460 loss=0.0863 smoothed_loss=0.0459 lr=2.44e-05 grad_norm=0.7713 step_time=0.4978s data_time=0.0671s it/s=1.770 eta_to_10000=4823.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.2512 grad_arm_token_fuse=0.0861 grad_shared_expert=0.5724 (18633:train_pytorch.py:850) + Training: 15%|█▍ | 1460/10000 [19:02<1:24:22, 1.69it/s, loss=0.0266, lr=2.44e-05, step=1459] Training: 15%|█▍ | 1460/10000 [19:02<1:24:22, 1.69it/s, loss=0.0863, lr=2.44e-05, step=1460] Training: 15%|█▍ | 1461/10000 [19:02<1:20:08, 1.78it/s, loss=0.0863, lr=2.44e-05, step=1460] Training: 15%|█▍ | 1461/10000 [19:02<1:20:08, 1.78it/s, loss=0.0122, lr=2.44e-05, step=1461] Training: 15%|█▍ | 1462/10000 [19:03<1:17:16, 1.84it/s, loss=0.0122, lr=2.44e-05, step=1461] Training: 15%|█▍ | 1462/10000 [19:03<1:17:16, 1.84it/s, loss=0.0548, lr=2.44e-05, step=1462] Training: 15%|█▍ | 1463/10000 [19:03<1:15:07, 1.89it/s, loss=0.0548, lr=2.44e-05, step=1462] Training: 15%|█▍ | 1463/10000 [19:03<1:15:07, 1.89it/s, loss=0.0291, lr=2.44e-05, step=1463] Training: 15%|█▍ | 1464/10000 [19:04<1:23:28, 1.70it/s, loss=0.0291, lr=2.44e-05, step=1463] Training: 15%|█▍ | 1464/10000 [19:04<1:23:28, 1.70it/s, loss=0.0328, lr=2.44e-05, step=1464] Training: 15%|█▍ | 1465/10000 [19:05<1:19:31, 1.79it/s, loss=0.0328, lr=2.44e-05, step=1464] Training: 15%|█▍ | 1465/10000 [19:05<1:19:31, 1.79it/s, loss=0.0904, lr=2.44e-05, step=1465] Training: 15%|█▍ | 1466/10000 [19:05<1:17:15, 1.84it/s, loss=0.0904, lr=2.44e-05, step=1465] Training: 15%|█▍ | 1466/10000 [19:05<1:17:15, 1.84it/s, loss=0.0300, lr=2.44e-05, step=1466] Training: 15%|█▍ | 1467/10000 [19:06<1:16:22, 1.86it/s, loss=0.0300, lr=2.44e-05, step=1466] Training: 15%|█▍ | 1467/10000 [19:06<1:16:22, 1.86it/s, loss=0.0470, lr=2.44e-05, step=1467] Training: 15%|█▍ | 1468/10000 [19:06<1:21:11, 1.75it/s, loss=0.0470, lr=2.44e-05, step=1467] Training: 15%|█▍ | 1468/10000 [19:06<1:21:11, 1.75it/s, loss=0.0274, lr=2.44e-05, step=1468] Training: 15%|█▍ | 1469/10000 [19:07<1:20:03, 1.78it/s, loss=0.0274, lr=2.44e-05, step=1468] Training: 15%|█▍ | 1469/10000 [19:07<1:20:03, 1.78it/s, loss=0.0158, lr=2.44e-05, step=1469]19:03:39.811 [I] step=1470 loss=0.0362 smoothed_loss=0.0399 lr=2.44e-05 grad_norm=0.6318 step_time=0.4822s data_time=0.0666s it/s=1.823 eta_to_10000=4680.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0221 grad_action_out_proj_arms=0.2091 grad_arm_token_fuse=0.1217 grad_shared_expert=0.6148 (18633:train_pytorch.py:850) + Training: 15%|█▍ | 1470/10000 [19:07<1:20:01, 1.78it/s, loss=0.0158, lr=2.44e-05, step=1469] Training: 15%|█▍ | 1470/10000 [19:07<1:20:01, 1.78it/s, loss=0.0362, lr=2.44e-05, step=1470] Training: 15%|█▍ | 1471/10000 [19:08<1:26:36, 1.64it/s, loss=0.0362, lr=2.44e-05, step=1470] Training: 15%|█▍ | 1471/10000 [19:08<1:26:36, 1.64it/s, loss=0.0687, lr=2.44e-05, step=1471] Training: 15%|█▍ | 1472/10000 [19:09<1:28:20, 1.61it/s, loss=0.0687, lr=2.44e-05, step=1471] Training: 15%|█▍ | 1472/10000 [19:09<1:28:20, 1.61it/s, loss=0.0194, lr=2.44e-05, step=1472] Training: 15%|█▍ | 1473/10000 [19:09<1:24:39, 1.68it/s, loss=0.0194, lr=2.44e-05, step=1472] Training: 15%|█▍ | 1473/10000 [19:09<1:24:39, 1.68it/s, loss=0.0289, lr=2.44e-05, step=1473] Training: 15%|█▍ | 1474/10000 [19:10<1:23:36, 1.70it/s, loss=0.0289, lr=2.44e-05, step=1473] Training: 15%|█▍ | 1474/10000 [19:10<1:23:36, 1.70it/s, loss=0.0216, lr=2.44e-05, step=1474] Training: 15%|█▍ | 1475/10000 [19:11<1:30:20, 1.57it/s, loss=0.0216, lr=2.44e-05, step=1474] Training: 15%|█▍ | 1475/10000 [19:11<1:30:20, 1.57it/s, loss=0.0431, lr=2.44e-05, step=1475] Training: 15%|█▍ | 1476/10000 [19:11<1:29:16, 1.59it/s, loss=0.0431, lr=2.44e-05, step=1475] Training: 15%|█▍ | 1476/10000 [19:11<1:29:16, 1.59it/s, loss=0.0412, lr=2.44e-05, step=1476] Training: 15%|█▍ | 1477/10000 [19:12<1:23:38, 1.70it/s, loss=0.0412, lr=2.44e-05, step=1476] Training: 15%|█▍ | 1477/10000 [19:12<1:23:38, 1.70it/s, loss=0.0161, lr=2.44e-05, step=1477] Training: 15%|█▍ | 1478/10000 [19:13<1:31:24, 1.55it/s, loss=0.0161, lr=2.44e-05, step=1477] Training: 15%|█▍ | 1478/10000 [19:13<1:31:24, 1.55it/s, loss=0.0113, lr=2.44e-05, step=1478] Training: 15%|█▍ | 1479/10000 [19:13<1:25:31, 1.66it/s, loss=0.0113, lr=2.44e-05, step=1478] Training: 15%|█▍ | 1479/10000 [19:13<1:25:31, 1.66it/s, loss=0.0248, lr=2.44e-05, step=1479]19:03:45.942 [I] step=1480 loss=0.0184 smoothed_loss=0.0314 lr=2.44e-05 grad_norm=0.7564 step_time=0.5324s data_time=0.0808s it/s=1.631 eta_to_10000=5223.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0198 grad_action_out_proj_arms=0.2176 grad_arm_token_fuse=0.1064 grad_shared_expert=0.5787 (18633:train_pytorch.py:850) + Training: 15%|█▍ | 1480/10000 [19:14<1:22:17, 1.73it/s, loss=0.0248, lr=2.44e-05, step=1479] Training: 15%|█▍ | 1480/10000 [19:14<1:22:17, 1.73it/s, loss=0.0184, lr=2.44e-05, step=1480] Training: 15%|█▍ | 1481/10000 [19:14<1:22:04, 1.73it/s, loss=0.0184, lr=2.44e-05, step=1480] Training: 15%|█▍ | 1481/10000 [19:14<1:22:04, 1.73it/s, loss=0.0394, lr=2.44e-05, step=1481] Training: 15%|█▍ | 1482/10000 [19:15<1:25:21, 1.66it/s, loss=0.0394, lr=2.44e-05, step=1481] Training: 15%|█▍ | 1482/10000 [19:15<1:25:21, 1.66it/s, loss=0.0184, lr=2.44e-05, step=1482] Training: 15%|█▍ | 1483/10000 [19:15<1:21:39, 1.74it/s, loss=0.0184, lr=2.44e-05, step=1482] Training: 15%|█▍ | 1483/10000 [19:15<1:21:39, 1.74it/s, loss=0.0719, lr=2.44e-05, step=1483] Training: 15%|█▍ | 1484/10000 [19:16<1:18:59, 1.80it/s, loss=0.0719, lr=2.44e-05, step=1483] Training: 15%|█▍ | 1484/10000 [19:16<1:18:59, 1.80it/s, loss=0.0203, lr=2.44e-05, step=1484] Training: 15%|█▍ | 1485/10000 [19:16<1:16:10, 1.86it/s, loss=0.0203, lr=2.44e-05, step=1484] Training: 15%|█▍ | 1485/10000 [19:16<1:16:10, 1.86it/s, loss=0.0448, lr=2.44e-05, step=1485] Training: 15%|█▍ | 1486/10000 [19:17<1:29:06, 1.59it/s, loss=0.0448, lr=2.44e-05, step=1485] Training: 15%|█▍ | 1486/10000 [19:17<1:29:06, 1.59it/s, loss=0.0221, lr=2.44e-05, step=1486] Training: 15%|█▍ | 1487/10000 [19:18<1:23:58, 1.69it/s, loss=0.0221, lr=2.44e-05, step=1486] Training: 15%|█▍ | 1487/10000 [19:18<1:23:58, 1.69it/s, loss=0.0131, lr=2.44e-05, step=1487] Training: 15%|█▍ | 1488/10000 [19:18<1:20:17, 1.77it/s, loss=0.0131, lr=2.44e-05, step=1487] Training: 15%|█▍ | 1488/10000 [19:18<1:20:17, 1.77it/s, loss=0.0222, lr=2.44e-05, step=1488] Training: 15%|█▍ | 1489/10000 [19:19<1:18:34, 1.81it/s, loss=0.0222, lr=2.44e-05, step=1488] Training: 15%|█▍ | 1489/10000 [19:19<1:18:34, 1.81it/s, loss=0.0191, lr=2.44e-05, step=1489]19:03:51.748 [I] step=1490 loss=0.0250 smoothed_loss=0.0288 lr=2.44e-05 grad_norm=0.7436 step_time=0.5105s data_time=0.0701s it/s=1.723 eta_to_10000=4939.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1729 grad_arm_token_fuse=0.0625 grad_shared_expert=0.5518 (18633:train_pytorch.py:850) + Training: 15%|█▍ | 1490/10000 [19:19<1:23:48, 1.69it/s, loss=0.0191, lr=2.44e-05, step=1489] Training: 15%|█▍ | 1490/10000 [19:19<1:23:48, 1.69it/s, loss=0.0250, lr=2.44e-05, step=1490] Training: 15%|█▍ | 1491/10000 [19:20<1:20:20, 1.77it/s, loss=0.0250, lr=2.44e-05, step=1490] Training: 15%|█▍ | 1491/10000 [19:20<1:20:20, 1.77it/s, loss=0.0116, lr=2.44e-05, step=1491] Training: 15%|█▍ | 1492/10000 [19:20<1:17:56, 1.82it/s, loss=0.0116, lr=2.44e-05, step=1491] Training: 15%|█▍ | 1492/10000 [19:20<1:17:56, 1.82it/s, loss=0.0401, lr=2.44e-05, step=1492] Training: 15%|█▍ | 1493/10000 [19:21<1:27:35, 1.62it/s, loss=0.0401, lr=2.44e-05, step=1492] Training: 15%|█▍ | 1493/10000 [19:21<1:27:35, 1.62it/s, loss=0.0445, lr=2.44e-05, step=1493] Training: 15%|█▍ | 1494/10000 [19:22<1:22:38, 1.72it/s, loss=0.0445, lr=2.44e-05, step=1493] Training: 15%|█▍ | 1494/10000 [19:22<1:22:38, 1.72it/s, loss=0.0759, lr=2.44e-05, step=1494] Training: 15%|█▍ | 1495/10000 [19:22<1:20:00, 1.77it/s, loss=0.0759, lr=2.44e-05, step=1494] Training: 15%|█▍ | 1495/10000 [19:22<1:20:00, 1.77it/s, loss=0.0409, lr=2.44e-05, step=1495] Training: 15%|█▍ | 1496/10000 [19:23<1:17:33, 1.83it/s, loss=0.0409, lr=2.44e-05, step=1495] Training: 15%|█▍ | 1496/10000 [19:23<1:17:33, 1.83it/s, loss=0.1264, lr=2.44e-05, step=1496] Training: 15%|█▍ | 1497/10000 [19:23<1:21:56, 1.73it/s, loss=0.1264, lr=2.44e-05, step=1496] Training: 15%|█▍ | 1497/10000 [19:23<1:21:56, 1.73it/s, loss=0.0291, lr=2.44e-05, step=1497] Training: 15%|█▍ | 1498/10000 [19:24<1:18:43, 1.80it/s, loss=0.0291, lr=2.44e-05, step=1497] Training: 15%|█▍ | 1498/10000 [19:24<1:18:43, 1.80it/s, loss=0.0389, lr=2.44e-05, step=1498] Training: 15%|█▍ | 1499/10000 [19:24<1:16:27, 1.85it/s, loss=0.0389, lr=2.44e-05, step=1498] Training: 15%|█▍ | 1499/10000 [19:24<1:16:27, 1.85it/s, loss=0.0318, lr=2.44e-05, step=1499]19:03:57.586 [I] step=1500 loss=0.0165 smoothed_loss=0.0389 lr=2.44e-05 grad_norm=0.6866 step_time=0.5182s data_time=0.0656s it/s=1.713 eta_to_10000=4961.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0195 grad_action_out_proj_arms=0.1744 grad_arm_token_fuse=0.1046 grad_shared_expert=0.4480 (18633:train_pytorch.py:850) + Training: 15%|█▌ | 1500/10000 [19:25<1:29:52, 1.58it/s, loss=0.0318, lr=2.44e-05, step=1499] Training: 15%|█▌ | 1500/10000 [19:25<1:29:52, 1.58it/s, loss=0.0165, lr=2.44e-05, step=1500] Training: 15%|█▌ | 1501/10000 [19:26<1:24:50, 1.67it/s, loss=0.0165, lr=2.44e-05, step=1500] Training: 15%|█▌ | 1501/10000 [19:26<1:24:50, 1.67it/s, loss=0.0101, lr=2.44e-05, step=1501] Training: 15%|█▌ | 1502/10000 [19:26<1:20:47, 1.75it/s, loss=0.0101, lr=2.44e-05, step=1501] Training: 15%|█▌ | 1502/10000 [19:26<1:20:47, 1.75it/s, loss=0.0101, lr=2.44e-05, step=1502] Training: 15%|█▌ | 1503/10000 [19:27<1:21:42, 1.73it/s, loss=0.0101, lr=2.44e-05, step=1502] Training: 15%|█▌ | 1503/10000 [19:27<1:21:42, 1.73it/s, loss=0.0232, lr=2.44e-05, step=1503] Training: 15%|█▌ | 1504/10000 [19:28<1:25:08, 1.66it/s, loss=0.0232, lr=2.44e-05, step=1503] Training: 15%|█▌ | 1504/10000 [19:28<1:25:08, 1.66it/s, loss=0.0350, lr=2.44e-05, step=1504] Training: 15%|█▌ | 1505/10000 [19:28<1:22:38, 1.71it/s, loss=0.0350, lr=2.44e-05, step=1504] Training: 15%|█▌ | 1505/10000 [19:28<1:22:38, 1.71it/s, loss=0.0230, lr=2.44e-05, step=1505] Training: 15%|█▌ | 1506/10000 [19:29<1:18:52, 1.79it/s, loss=0.0230, lr=2.44e-05, step=1505] Training: 15%|█▌ | 1506/10000 [19:29<1:18:52, 1.79it/s, loss=0.0275, lr=2.44e-05, step=1506] Training: 15%|█▌ | 1507/10000 [19:29<1:27:22, 1.62it/s, loss=0.0275, lr=2.44e-05, step=1506] Training: 15%|█▌ | 1507/10000 [19:29<1:27:22, 1.62it/s, loss=0.0490, lr=2.44e-05, step=1507] Training: 15%|█▌ | 1508/10000 [19:30<1:26:50, 1.63it/s, loss=0.0490, lr=2.44e-05, step=1507] Training: 15%|█▌ | 1508/10000 [19:30<1:26:50, 1.63it/s, loss=0.1025, lr=2.44e-05, step=1508] Training: 15%|█▌ | 1509/10000 [19:30<1:22:05, 1.72it/s, loss=0.1025, lr=2.44e-05, step=1508] Training: 15%|█▌ | 1509/10000 [19:30<1:22:05, 1.72it/s, loss=0.0571, lr=2.44e-05, step=1509]19:04:03.328 [I] step=1510 loss=0.0099 smoothed_loss=0.0385 lr=2.44e-05 grad_norm=0.8025 step_time=0.4953s data_time=0.0788s it/s=1.742 eta_to_10000=4874.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0139 grad_action_out_proj_arms=0.1467 grad_arm_token_fuse=0.0771 grad_shared_expert=0.4531 (18633:train_pytorch.py:850) + Training: 15%|█▌ | 1510/10000 [19:31<1:21:37, 1.73it/s, loss=0.0571, lr=2.44e-05, step=1509] Training: 15%|█▌ | 1510/10000 [19:31<1:21:37, 1.73it/s, loss=0.0099, lr=2.44e-05, step=1510] Training: 15%|█▌ | 1511/10000 [19:31<1:17:57, 1.81it/s, loss=0.0099, lr=2.44e-05, step=1510] Training: 15%|█▌ | 1511/10000 [19:31<1:17:57, 1.81it/s, loss=0.0465, lr=2.44e-05, step=1511] Training: 15%|█▌ | 1512/10000 [19:32<1:25:05, 1.66it/s, loss=0.0465, lr=2.44e-05, step=1511] Training: 15%|█▌ | 1512/10000 [19:32<1:25:05, 1.66it/s, loss=0.0401, lr=2.44e-05, step=1512] Training: 15%|█▌ | 1513/10000 [19:33<1:21:44, 1.73it/s, loss=0.0401, lr=2.44e-05, step=1512] Training: 15%|█▌ | 1513/10000 [19:33<1:21:44, 1.73it/s, loss=0.0469, lr=2.44e-05, step=1513] Training: 15%|█▌ | 1514/10000 [19:33<1:29:18, 1.58it/s, loss=0.0469, lr=2.44e-05, step=1513] Training: 15%|█▌ | 1514/10000 [19:33<1:29:18, 1.58it/s, loss=0.0192, lr=2.44e-05, step=1514] Training: 15%|█▌ | 1515/10000 [19:34<1:30:01, 1.57it/s, loss=0.0192, lr=2.44e-05, step=1514] Training: 15%|█▌ | 1515/10000 [19:34<1:30:01, 1.57it/s, loss=0.0285, lr=2.44e-05, step=1515] Training: 15%|█▌ | 1516/10000 [19:35<1:32:19, 1.53it/s, loss=0.0285, lr=2.44e-05, step=1515] Training: 15%|█▌ | 1516/10000 [19:35<1:32:19, 1.53it/s, loss=0.0230, lr=2.44e-05, step=1516] Training: 15%|█▌ | 1517/10000 [19:35<1:26:31, 1.63it/s, loss=0.0230, lr=2.44e-05, step=1516] Training: 15%|█▌ | 1517/10000 [19:35<1:26:31, 1.63it/s, loss=0.0219, lr=2.44e-05, step=1517] Training: 15%|█▌ | 1518/10000 [19:36<1:36:32, 1.46it/s, loss=0.0219, lr=2.44e-05, step=1517] Training: 15%|█▌ | 1518/10000 [19:36<1:36:32, 1.46it/s, loss=0.0340, lr=2.44e-05, step=1518] Training: 15%|█▌ | 1519/10000 [19:37<1:29:40, 1.58it/s, loss=0.0340, lr=2.44e-05, step=1518] Training: 15%|█▌ | 1519/10000 [19:37<1:29:40, 1.58it/s, loss=0.0092, lr=2.44e-05, step=1519]19:04:09.583 [I] step=1520 loss=0.0559 smoothed_loss=0.0342 lr=2.44e-05 grad_norm=0.6501 step_time=0.5343s data_time=0.0912s it/s=1.599 eta_to_10000=5303.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0249 grad_action_out_proj_arms=0.1968 grad_arm_token_fuse=0.1366 grad_shared_expert=0.5752 (18633:train_pytorch.py:850) + Training: 15%|█▌ | 1520/10000 [19:37<1:25:39, 1.65it/s, loss=0.0092, lr=2.44e-05, step=1519] Training: 15%|█▌ | 1520/10000 [19:37<1:25:39, 1.65it/s, loss=0.0559, lr=2.44e-05, step=1520] Training: 15%|█▌ | 1521/10000 [19:38<1:31:40, 1.54it/s, loss=0.0559, lr=2.44e-05, step=1520] Training: 15%|█▌ | 1521/10000 [19:38<1:31:40, 1.54it/s, loss=0.0469, lr=2.44e-05, step=1521] Training: 15%|█▌ | 1522/10000 [19:39<1:25:38, 1.65it/s, loss=0.0469, lr=2.44e-05, step=1521] Training: 15%|█▌ | 1522/10000 [19:39<1:25:38, 1.65it/s, loss=0.0429, lr=2.44e-05, step=1522] Training: 15%|█▌ | 1523/10000 [19:39<1:24:09, 1.68it/s, loss=0.0429, lr=2.44e-05, step=1522] Training: 15%|█▌ | 1523/10000 [19:39<1:24:09, 1.68it/s, loss=0.0233, lr=2.44e-05, step=1523] Training: 15%|█▌ | 1524/10000 [19:40<1:20:40, 1.75it/s, loss=0.0233, lr=2.44e-05, step=1523] Training: 15%|█▌ | 1524/10000 [19:40<1:20:40, 1.75it/s, loss=0.0176, lr=2.44e-05, step=1524] Training: 15%|█▌ | 1525/10000 [19:40<1:25:46, 1.65it/s, loss=0.0176, lr=2.44e-05, step=1524] Training: 15%|█▌ | 1525/10000 [19:40<1:25:46, 1.65it/s, loss=0.0397, lr=2.44e-05, step=1525] Training: 15%|█▌ | 1526/10000 [19:41<1:21:16, 1.74it/s, loss=0.0397, lr=2.44e-05, step=1525] Training: 15%|█▌ | 1526/10000 [19:41<1:21:16, 1.74it/s, loss=0.0220, lr=2.44e-05, step=1526] Training: 15%|█▌ | 1527/10000 [19:41<1:17:51, 1.81it/s, loss=0.0220, lr=2.44e-05, step=1526] Training: 15%|█▌ | 1527/10000 [19:41<1:17:51, 1.81it/s, loss=0.0422, lr=2.44e-05, step=1527] Training: 15%|█▌ | 1528/10000 [19:42<1:27:10, 1.62it/s, loss=0.0422, lr=2.44e-05, step=1527] Training: 15%|█▌ | 1528/10000 [19:42<1:27:10, 1.62it/s, loss=0.0738, lr=2.44e-05, step=1528] Training: 15%|█▌ | 1529/10000 [19:43<1:28:00, 1.60it/s, loss=0.0738, lr=2.44e-05, step=1528] Training: 15%|█▌ | 1529/10000 [19:43<1:28:00, 1.60it/s, loss=0.0202, lr=2.44e-05, step=1529]19:04:15.682 [I] step=1530 loss=0.0720 smoothed_loss=0.0395 lr=2.44e-05 grad_norm=0.6473 step_time=0.5293s data_time=0.0806s it/s=1.640 eta_to_10000=5165.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.1754 grad_arm_token_fuse=0.0826 grad_shared_expert=0.8320 (18633:train_pytorch.py:850) + Training: 15%|█▌ | 1530/10000 [19:43<1:29:40, 1.57it/s, loss=0.0202, lr=2.44e-05, step=1529] Training: 15%|█▌ | 1530/10000 [19:43<1:29:40, 1.57it/s, loss=0.0720, lr=2.44e-05, step=1530] Training: 15%|█▌ | 1531/10000 [19:44<1:24:00, 1.68it/s, loss=0.0720, lr=2.44e-05, step=1530] Training: 15%|█▌ | 1531/10000 [19:44<1:24:00, 1.68it/s, loss=0.0091, lr=2.44e-05, step=1531] Training: 15%|█▌ | 1532/10000 [19:45<1:26:34, 1.63it/s, loss=0.0091, lr=2.44e-05, step=1531] Training: 15%|█▌ | 1532/10000 [19:45<1:26:34, 1.63it/s, loss=0.0192, lr=2.44e-05, step=1532] Training: 15%|█▌ | 1533/10000 [19:45<1:21:15, 1.74it/s, loss=0.0192, lr=2.44e-05, step=1532] Training: 15%|█▌ | 1533/10000 [19:45<1:21:15, 1.74it/s, loss=0.0216, lr=2.44e-05, step=1533] Training: 15%|█▌ | 1534/10000 [19:46<1:18:11, 1.80it/s, loss=0.0216, lr=2.44e-05, step=1533] Training: 15%|█▌ | 1534/10000 [19:46<1:18:11, 1.80it/s, loss=0.0222, lr=2.43e-05, step=1534] Training: 15%|█▌ | 1535/10000 [19:46<1:16:41, 1.84it/s, loss=0.0222, lr=2.43e-05, step=1534] Training: 15%|█▌ | 1535/10000 [19:46<1:16:41, 1.84it/s, loss=0.0394, lr=2.43e-05, step=1535] Training: 15%|█▌ | 1536/10000 [19:47<1:27:31, 1.61it/s, loss=0.0394, lr=2.43e-05, step=1535] Training: 15%|█▌ | 1536/10000 [19:47<1:27:31, 1.61it/s, loss=0.0360, lr=2.43e-05, step=1536] Training: 15%|█▌ | 1537/10000 [19:47<1:24:13, 1.67it/s, loss=0.0360, lr=2.43e-05, step=1536] Training: 15%|█▌ | 1537/10000 [19:47<1:24:13, 1.67it/s, loss=0.0239, lr=2.43e-05, step=1537] Training: 15%|█▌ | 1538/10000 [19:48<1:20:07, 1.76it/s, loss=0.0239, lr=2.43e-05, step=1537] Training: 15%|█▌ | 1538/10000 [19:48<1:20:07, 1.76it/s, loss=0.0662, lr=2.43e-05, step=1538] Training: 15%|█▌ | 1539/10000 [19:49<1:30:57, 1.55it/s, loss=0.0662, lr=2.43e-05, step=1538] Training: 15%|█▌ | 1539/10000 [19:49<1:30:57, 1.55it/s, loss=0.0173, lr=2.43e-05, step=1539]19:04:21.730 [I] step=1540 loss=0.0334 smoothed_loss=0.0339 lr=2.43e-05 grad_norm=0.6346 step_time=0.5308s data_time=0.0740s it/s=1.654 eta_to_10000=5115.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0099 grad_action_out_proj_arms=0.1289 grad_arm_token_fuse=0.0494 grad_shared_expert=0.6417 (18633:train_pytorch.py:850) + Training: 15%|█▌ | 1540/10000 [19:49<1:33:46, 1.50it/s, loss=0.0173, lr=2.43e-05, step=1539] Training: 15%|█▌ | 1540/10000 [19:49<1:33:46, 1.50it/s, loss=0.0334, lr=2.43e-05, step=1540] Training: 15%|█▌ | 1541/10000 [19:50<1:27:24, 1.61it/s, loss=0.0334, lr=2.43e-05, step=1540] Training: 15%|█▌ | 1541/10000 [19:50<1:27:24, 1.61it/s, loss=0.0098, lr=2.43e-05, step=1541] Training: 15%|█▌ | 1542/10000 [19:50<1:22:22, 1.71it/s, loss=0.0098, lr=2.43e-05, step=1541] Training: 15%|█▌ | 1542/10000 [19:50<1:22:22, 1.71it/s, loss=0.0339, lr=2.43e-05, step=1542] Training: 15%|█▌ | 1543/10000 [19:51<1:29:47, 1.57it/s, loss=0.0339, lr=2.43e-05, step=1542] Training: 15%|█▌ | 1543/10000 [19:51<1:29:47, 1.57it/s, loss=0.1195, lr=2.43e-05, step=1543] Training: 15%|█▌ | 1544/10000 [19:52<1:24:01, 1.68it/s, loss=0.1195, lr=2.43e-05, step=1543] Training: 15%|█▌ | 1544/10000 [19:52<1:24:01, 1.68it/s, loss=0.0984, lr=2.43e-05, step=1544] Training: 15%|█▌ | 1545/10000 [19:52<1:19:36, 1.77it/s, loss=0.0984, lr=2.43e-05, step=1544] Training: 15%|█▌ | 1545/10000 [19:52<1:19:36, 1.77it/s, loss=0.0759, lr=2.43e-05, step=1545] Training: 15%|█▌ | 1546/10000 [19:53<1:27:59, 1.60it/s, loss=0.0759, lr=2.43e-05, step=1545] Training: 15%|█▌ | 1546/10000 [19:53<1:27:59, 1.60it/s, loss=0.0696, lr=2.43e-05, step=1546] Training: 15%|█▌ | 1547/10000 [19:53<1:23:36, 1.69it/s, loss=0.0696, lr=2.43e-05, step=1546] Training: 15%|█▌ | 1547/10000 [19:53<1:23:36, 1.69it/s, loss=0.0953, lr=2.43e-05, step=1547] Training: 15%|█▌ | 1548/10000 [19:54<1:20:45, 1.74it/s, loss=0.0953, lr=2.43e-05, step=1547] Training: 15%|█▌ | 1548/10000 [19:54<1:20:45, 1.74it/s, loss=0.0481, lr=2.43e-05, step=1548] Training: 15%|█▌ | 1549/10000 [19:54<1:17:08, 1.83it/s, loss=0.0481, lr=2.43e-05, step=1548] Training: 15%|█▌ | 1549/10000 [19:54<1:17:08, 1.83it/s, loss=0.0263, lr=2.43e-05, step=1549]19:04:27.651 [I] step=1550 loss=0.0462 smoothed_loss=0.0515 lr=2.43e-05 grad_norm=0.7521 step_time=0.5258s data_time=0.0663s it/s=1.689 eta_to_10000=5002.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0156 grad_action_out_proj_arms=0.2116 grad_arm_token_fuse=0.0860 grad_shared_expert=0.6348 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1550/10000 [19:55<1:30:02, 1.56it/s, loss=0.0263, lr=2.43e-05, step=1549] Training: 16%|█▌ | 1550/10000 [19:55<1:30:02, 1.56it/s, loss=0.0462, lr=2.43e-05, step=1550] Training: 16%|█▌ | 1551/10000 [19:56<1:28:09, 1.60it/s, loss=0.0462, lr=2.43e-05, step=1550] Training: 16%|█▌ | 1551/10000 [19:56<1:28:09, 1.60it/s, loss=0.0482, lr=2.43e-05, step=1551] Training: 16%|█▌ | 1552/10000 [19:56<1:22:55, 1.70it/s, loss=0.0482, lr=2.43e-05, step=1551] Training: 16%|█▌ | 1552/10000 [19:56<1:22:55, 1.70it/s, loss=0.0299, lr=2.43e-05, step=1552] Training: 16%|█▌ | 1553/10000 [19:57<1:19:33, 1.77it/s, loss=0.0299, lr=2.43e-05, step=1552] Training: 16%|█▌ | 1553/10000 [19:57<1:19:33, 1.77it/s, loss=0.0201, lr=2.43e-05, step=1553] Training: 16%|█▌ | 1554/10000 [19:58<1:31:00, 1.55it/s, loss=0.0201, lr=2.43e-05, step=1553] Training: 16%|█▌ | 1554/10000 [19:58<1:31:00, 1.55it/s, loss=0.0252, lr=2.43e-05, step=1554] Training: 16%|█▌ | 1555/10000 [19:58<1:34:24, 1.49it/s, loss=0.0252, lr=2.43e-05, step=1554] Training: 16%|█▌ | 1555/10000 [19:58<1:34:24, 1.49it/s, loss=0.0301, lr=2.43e-05, step=1555] Training: 16%|█▌ | 1556/10000 [19:59<1:28:32, 1.59it/s, loss=0.0301, lr=2.43e-05, step=1555] Training: 16%|█▌ | 1556/10000 [19:59<1:28:32, 1.59it/s, loss=0.1005, lr=2.43e-05, step=1556] Training: 16%|█▌ | 1557/10000 [20:00<1:34:14, 1.49it/s, loss=0.1005, lr=2.43e-05, step=1556] Training: 16%|█▌ | 1557/10000 [20:00<1:34:14, 1.49it/s, loss=0.0283, lr=2.43e-05, step=1557] Training: 16%|█▌ | 1558/10000 [20:00<1:34:04, 1.50it/s, loss=0.0283, lr=2.43e-05, step=1557] Training: 16%|█▌ | 1558/10000 [20:00<1:34:04, 1.50it/s, loss=0.0211, lr=2.43e-05, step=1558] Training: 16%|█▌ | 1559/10000 [20:01<1:34:26, 1.49it/s, loss=0.0211, lr=2.43e-05, step=1558] Training: 16%|█▌ | 1559/10000 [20:01<1:34:26, 1.49it/s, loss=0.0432, lr=2.43e-05, step=1559]19:04:34.255 [I] step=1560 loss=0.0233 smoothed_loss=0.0418 lr=2.43e-05 grad_norm=0.6680 step_time=0.5681s data_time=0.0923s it/s=1.514 eta_to_10000=5573.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0187 grad_action_out_proj_arms=0.1946 grad_arm_token_fuse=0.0960 grad_shared_expert=0.5892 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1560/10000 [20:02<1:39:36, 1.41it/s, loss=0.0432, lr=2.43e-05, step=1559] Training: 16%|█▌ | 1560/10000 [20:02<1:39:36, 1.41it/s, loss=0.0233, lr=2.43e-05, step=1560] Training: 16%|█▌ | 1561/10000 [20:03<1:35:56, 1.47it/s, loss=0.0233, lr=2.43e-05, step=1560] Training: 16%|█▌ | 1561/10000 [20:03<1:35:56, 1.47it/s, loss=0.0193, lr=2.43e-05, step=1561] Training: 16%|█▌ | 1562/10000 [20:03<1:36:37, 1.46it/s, loss=0.0193, lr=2.43e-05, step=1561] Training: 16%|█▌ | 1562/10000 [20:03<1:36:37, 1.46it/s, loss=0.0374, lr=2.43e-05, step=1562] Training: 16%|█▌ | 1563/10000 [20:04<1:35:47, 1.47it/s, loss=0.0374, lr=2.43e-05, step=1562] Training: 16%|█▌ | 1563/10000 [20:04<1:35:47, 1.47it/s, loss=0.1471, lr=2.43e-05, step=1563] Training: 16%|█▌ | 1564/10000 [20:05<1:45:54, 1.33it/s, loss=0.1471, lr=2.43e-05, step=1563] Training: 16%|█▌ | 1564/10000 [20:05<1:45:54, 1.33it/s, loss=0.0655, lr=2.43e-05, step=1564] Training: 16%|█▌ | 1565/10000 [20:06<1:42:47, 1.37it/s, loss=0.0655, lr=2.43e-05, step=1564] Training: 16%|█▌ | 1565/10000 [20:06<1:42:47, 1.37it/s, loss=0.0462, lr=2.43e-05, step=1565] Training: 16%|█▌ | 1566/10000 [20:06<1:36:27, 1.46it/s, loss=0.0462, lr=2.43e-05, step=1565] Training: 16%|█▌ | 1566/10000 [20:06<1:36:27, 1.46it/s, loss=0.0595, lr=2.43e-05, step=1566] Training: 16%|█▌ | 1567/10000 [20:07<1:41:36, 1.38it/s, loss=0.0595, lr=2.43e-05, step=1566] Training: 16%|█▌ | 1567/10000 [20:07<1:41:36, 1.38it/s, loss=0.0442, lr=2.43e-05, step=1567] Training: 16%|█▌ | 1568/10000 [20:08<1:38:44, 1.42it/s, loss=0.0442, lr=2.43e-05, step=1567] Training: 16%|█▌ | 1568/10000 [20:08<1:38:44, 1.42it/s, loss=0.0197, lr=2.43e-05, step=1568] Training: 16%|█▌ | 1569/10000 [20:08<1:30:44, 1.55it/s, loss=0.0197, lr=2.43e-05, step=1568] Training: 16%|█▌ | 1569/10000 [20:08<1:30:44, 1.55it/s, loss=0.0307, lr=2.43e-05, step=1569]19:04:40.938 [I] step=1570 loss=0.0142 smoothed_loss=0.0431 lr=2.43e-05 grad_norm=0.7641 step_time=0.5636s data_time=0.1047s it/s=1.497 eta_to_10000=5632.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0266 grad_action_out_proj_arms=0.2700 grad_arm_token_fuse=0.1253 grad_shared_expert=1.0730 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1570/10000 [20:09<1:26:06, 1.63it/s, loss=0.0307, lr=2.43e-05, step=1569] Training: 16%|█▌ | 1570/10000 [20:09<1:26:06, 1.63it/s, loss=0.0142, lr=2.43e-05, step=1570] Training: 16%|█▌ | 1571/10000 [20:09<1:34:53, 1.48it/s, loss=0.0142, lr=2.43e-05, step=1570] Training: 16%|█▌ | 1571/10000 [20:09<1:34:53, 1.48it/s, loss=0.0382, lr=2.43e-05, step=1571] Training: 16%|█▌ | 1572/10000 [20:10<1:27:50, 1.60it/s, loss=0.0382, lr=2.43e-05, step=1571] Training: 16%|█▌ | 1572/10000 [20:10<1:27:50, 1.60it/s, loss=0.0288, lr=2.43e-05, step=1572] Training: 16%|█▌ | 1573/10000 [20:10<1:22:37, 1.70it/s, loss=0.0288, lr=2.43e-05, step=1572] Training: 16%|█▌ | 1573/10000 [20:10<1:22:37, 1.70it/s, loss=0.0412, lr=2.43e-05, step=1573] Training: 16%|█▌ | 1574/10000 [20:11<1:18:59, 1.78it/s, loss=0.0412, lr=2.43e-05, step=1573] Training: 16%|█▌ | 1574/10000 [20:11<1:18:59, 1.78it/s, loss=0.0703, lr=2.43e-05, step=1574] Training: 16%|█▌ | 1575/10000 [20:12<1:23:11, 1.69it/s, loss=0.0703, lr=2.43e-05, step=1574] Training: 16%|█▌ | 1575/10000 [20:12<1:23:11, 1.69it/s, loss=0.0277, lr=2.43e-05, step=1575] Training: 16%|█▌ | 1576/10000 [20:12<1:22:14, 1.71it/s, loss=0.0277, lr=2.43e-05, step=1575] Training: 16%|█▌ | 1576/10000 [20:12<1:22:14, 1.71it/s, loss=0.0411, lr=2.43e-05, step=1576] Training: 16%|█▌ | 1577/10000 [20:13<1:18:15, 1.79it/s, loss=0.0411, lr=2.43e-05, step=1576] Training: 16%|█▌ | 1577/10000 [20:13<1:18:15, 1.79it/s, loss=0.0344, lr=2.43e-05, step=1577] Training: 16%|█▌ | 1578/10000 [20:13<1:26:12, 1.63it/s, loss=0.0344, lr=2.43e-05, step=1577] Training: 16%|█▌ | 1578/10000 [20:13<1:26:12, 1.63it/s, loss=0.0059, lr=2.43e-05, step=1578] Training: 16%|█▌ | 1579/10000 [20:14<1:24:57, 1.65it/s, loss=0.0059, lr=2.43e-05, step=1578] Training: 16%|█▌ | 1579/10000 [20:14<1:24:57, 1.65it/s, loss=0.0278, lr=2.43e-05, step=1579]19:04:46.869 [I] step=1580 loss=0.0182 smoothed_loss=0.0351 lr=2.43e-05 grad_norm=0.6919 step_time=0.5182s data_time=0.0749s it/s=1.686 eta_to_10000=4993.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1494 grad_arm_token_fuse=0.0505 grad_shared_expert=0.4146 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1580/10000 [20:15<1:22:16, 1.71it/s, loss=0.0278, lr=2.43e-05, step=1579] Training: 16%|█▌ | 1580/10000 [20:15<1:22:16, 1.71it/s, loss=0.0182, lr=2.43e-05, step=1580] Training: 16%|█▌ | 1581/10000 [20:15<1:23:04, 1.69it/s, loss=0.0182, lr=2.43e-05, step=1580] Training: 16%|█▌ | 1581/10000 [20:15<1:23:04, 1.69it/s, loss=0.0975, lr=2.43e-05, step=1581] Training: 16%|█▌ | 1582/10000 [20:16<1:26:49, 1.62it/s, loss=0.0975, lr=2.43e-05, step=1581] Training: 16%|█▌ | 1582/10000 [20:16<1:26:49, 1.62it/s, loss=0.0268, lr=2.43e-05, step=1582] Training: 16%|█▌ | 1583/10000 [20:16<1:21:24, 1.72it/s, loss=0.0268, lr=2.43e-05, step=1582] Training: 16%|█▌ | 1583/10000 [20:16<1:21:24, 1.72it/s, loss=0.0212, lr=2.43e-05, step=1583] Training: 16%|█▌ | 1584/10000 [20:17<1:20:39, 1.74it/s, loss=0.0212, lr=2.43e-05, step=1583] Training: 16%|█▌ | 1584/10000 [20:17<1:20:39, 1.74it/s, loss=0.0315, lr=2.43e-05, step=1584] Training: 16%|█▌ | 1585/10000 [20:17<1:20:57, 1.73it/s, loss=0.0315, lr=2.43e-05, step=1584] Training: 16%|█▌ | 1585/10000 [20:17<1:20:57, 1.73it/s, loss=0.0119, lr=2.43e-05, step=1585] Training: 16%|█▌ | 1586/10000 [20:19<1:44:33, 1.34it/s, loss=0.0119, lr=2.43e-05, step=1585] Training: 16%|█▌ | 1586/10000 [20:19<1:44:33, 1.34it/s, loss=0.0379, lr=2.43e-05, step=1586] Training: 16%|█▌ | 1587/10000 [20:19<1:40:05, 1.40it/s, loss=0.0379, lr=2.43e-05, step=1586] Training: 16%|█▌ | 1587/10000 [20:19<1:40:05, 1.40it/s, loss=0.0279, lr=2.43e-05, step=1587] Training: 16%|█▌ | 1588/10000 [20:20<1:36:58, 1.45it/s, loss=0.0279, lr=2.43e-05, step=1587] Training: 16%|█▌ | 1588/10000 [20:20<1:36:58, 1.45it/s, loss=0.0155, lr=2.43e-05, step=1588] Training: 16%|█▌ | 1589/10000 [20:21<1:34:03, 1.49it/s, loss=0.0155, lr=2.43e-05, step=1588] Training: 16%|█▌ | 1589/10000 [20:21<1:34:03, 1.49it/s, loss=0.0083, lr=2.43e-05, step=1589]19:04:53.743 [I] step=1590 loss=0.0239 smoothed_loss=0.0295 lr=2.43e-05 grad_norm=0.6951 step_time=0.5838s data_time=0.1036s it/s=1.455 eta_to_10000=5780.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1989 grad_arm_token_fuse=0.0573 grad_shared_expert=0.4585 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1590/10000 [20:21<1:44:07, 1.35it/s, loss=0.0083, lr=2.43e-05, step=1589] Training: 16%|█▌ | 1590/10000 [20:21<1:44:07, 1.35it/s, loss=0.0239, lr=2.43e-05, step=1590] Training: 16%|█▌ | 1591/10000 [20:22<1:38:20, 1.43it/s, loss=0.0239, lr=2.43e-05, step=1590] Training: 16%|█▌ | 1591/10000 [20:22<1:38:20, 1.43it/s, loss=0.0577, lr=2.43e-05, step=1591] Training: 16%|█▌ | 1592/10000 [20:23<1:39:06, 1.41it/s, loss=0.0577, lr=2.43e-05, step=1591] Training: 16%|█▌ | 1592/10000 [20:23<1:39:06, 1.41it/s, loss=0.0133, lr=2.43e-05, step=1592] Training: 16%|█▌ | 1593/10000 [20:24<1:47:47, 1.30it/s, loss=0.0133, lr=2.43e-05, step=1592] Training: 16%|█▌ | 1593/10000 [20:24<1:47:47, 1.30it/s, loss=0.0787, lr=2.43e-05, step=1593] Training: 16%|█▌ | 1594/10000 [20:24<1:41:14, 1.38it/s, loss=0.0787, lr=2.43e-05, step=1593] Training: 16%|█▌ | 1594/10000 [20:24<1:41:14, 1.38it/s, loss=0.0181, lr=2.43e-05, step=1594] Training: 16%|█▌ | 1595/10000 [20:25<1:39:47, 1.40it/s, loss=0.0181, lr=2.43e-05, step=1594] Training: 16%|█▌ | 1595/10000 [20:25<1:39:47, 1.40it/s, loss=0.0379, lr=2.43e-05, step=1595] Training: 16%|█▌ | 1596/10000 [20:26<1:34:34, 1.48it/s, loss=0.0379, lr=2.43e-05, step=1595] Training: 16%|█▌ | 1596/10000 [20:26<1:34:34, 1.48it/s, loss=0.0184, lr=2.43e-05, step=1596] Training: 16%|█▌ | 1597/10000 [20:26<1:37:16, 1.44it/s, loss=0.0184, lr=2.43e-05, step=1596] Training: 16%|█▌ | 1597/10000 [20:26<1:37:16, 1.44it/s, loss=0.0352, lr=2.43e-05, step=1597] Training: 16%|█▌ | 1598/10000 [20:27<1:32:54, 1.51it/s, loss=0.0352, lr=2.43e-05, step=1597] Training: 16%|█▌ | 1598/10000 [20:27<1:32:54, 1.51it/s, loss=0.0137, lr=2.43e-05, step=1598] Training: 16%|█▌ | 1599/10000 [20:28<1:31:52, 1.52it/s, loss=0.0137, lr=2.43e-05, step=1598] Training: 16%|█▌ | 1599/10000 [20:28<1:31:52, 1.52it/s, loss=0.0341, lr=2.43e-05, step=1599]19:05:00.681 [I] step=1600 loss=0.0258 smoothed_loss=0.0306 lr=2.43e-05 grad_norm=0.6823 step_time=0.5834s data_time=0.1104s it/s=1.442 eta_to_10000=5826.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.1631 grad_arm_token_fuse=0.0739 grad_shared_expert=0.7295 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1600/10000 [20:28<1:39:28, 1.41it/s, loss=0.0341, lr=2.43e-05, step=1599] Training: 16%|█▌ | 1600/10000 [20:28<1:39:28, 1.41it/s, loss=0.0258, lr=2.43e-05, step=1600] Training: 16%|█▌ | 1601/10000 [20:29<1:36:37, 1.45it/s, loss=0.0258, lr=2.43e-05, step=1600] Training: 16%|█▌ | 1601/10000 [20:29<1:36:37, 1.45it/s, loss=0.0250, lr=2.43e-05, step=1601] Training: 16%|█▌ | 1602/10000 [20:30<1:40:34, 1.39it/s, loss=0.0250, lr=2.43e-05, step=1601] Training: 16%|█▌ | 1602/10000 [20:30<1:40:34, 1.39it/s, loss=0.0317, lr=2.43e-05, step=1602] Training: 16%|█▌ | 1603/10000 [20:30<1:40:16, 1.40it/s, loss=0.0317, lr=2.43e-05, step=1602] Training: 16%|█▌ | 1603/10000 [20:30<1:40:16, 1.40it/s, loss=0.0371, lr=2.43e-05, step=1603] Training: 16%|█▌ | 1604/10000 [20:31<1:50:03, 1.27it/s, loss=0.0371, lr=2.43e-05, step=1603] Training: 16%|█▌ | 1604/10000 [20:31<1:50:03, 1.27it/s, loss=0.0312, lr=2.43e-05, step=1604] Training: 16%|█▌ | 1605/10000 [20:32<1:42:42, 1.36it/s, loss=0.0312, lr=2.43e-05, step=1604] Training: 16%|█▌ | 1605/10000 [20:32<1:42:42, 1.36it/s, loss=0.0160, lr=2.43e-05, step=1605] Training: 16%|█▌ | 1606/10000 [20:33<1:37:21, 1.44it/s, loss=0.0160, lr=2.43e-05, step=1605] Training: 16%|█▌ | 1606/10000 [20:33<1:37:21, 1.44it/s, loss=0.2454, lr=2.43e-05, step=1606] Training: 16%|█▌ | 1607/10000 [20:34<1:44:34, 1.34it/s, loss=0.2454, lr=2.43e-05, step=1606] Training: 16%|█▌ | 1607/10000 [20:34<1:44:34, 1.34it/s, loss=0.0190, lr=2.43e-05, step=1607] Training: 16%|█▌ | 1608/10000 [20:34<1:42:26, 1.37it/s, loss=0.0190, lr=2.43e-05, step=1607] Training: 16%|█▌ | 1608/10000 [20:34<1:42:26, 1.37it/s, loss=0.0382, lr=2.43e-05, step=1608] Training: 16%|█▌ | 1609/10000 [20:35<1:39:49, 1.40it/s, loss=0.0382, lr=2.43e-05, step=1608] Training: 16%|█▌ | 1609/10000 [20:35<1:39:49, 1.40it/s, loss=0.0117, lr=2.43e-05, step=1609]19:05:08.042 [I] step=1610 loss=0.0131 smoothed_loss=0.0403 lr=2.43e-05 grad_norm=0.7426 step_time=0.5999s data_time=0.1363s it/s=1.359 eta_to_10000=6175.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.2195 grad_arm_token_fuse=0.0717 grad_shared_expert=0.6116 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1610/10000 [20:36<1:44:11, 1.34it/s, loss=0.0117, lr=2.43e-05, step=1609] Training: 16%|█▌ | 1610/10000 [20:36<1:44:11, 1.34it/s, loss=0.0131, lr=2.43e-05, step=1610] Training: 16%|█▌ | 1611/10000 [20:36<1:36:05, 1.46it/s, loss=0.0131, lr=2.43e-05, step=1610] Training: 16%|█▌ | 1611/10000 [20:36<1:36:05, 1.46it/s, loss=0.0284, lr=2.43e-05, step=1611] Training: 16%|█▌ | 1612/10000 [20:37<1:38:59, 1.41it/s, loss=0.0284, lr=2.43e-05, step=1611] Training: 16%|█▌ | 1612/10000 [20:37<1:38:59, 1.41it/s, loss=0.0487, lr=2.42e-05, step=1612] Training: 16%|█▌ | 1613/10000 [20:38<1:39:20, 1.41it/s, loss=0.0487, lr=2.42e-05, step=1612] Training: 16%|█▌ | 1613/10000 [20:38<1:39:20, 1.41it/s, loss=0.0223, lr=2.42e-05, step=1613] Training: 16%|█▌ | 1614/10000 [20:39<1:47:17, 1.30it/s, loss=0.0223, lr=2.42e-05, step=1613] Training: 16%|█▌ | 1614/10000 [20:39<1:47:17, 1.30it/s, loss=0.0112, lr=2.42e-05, step=1614] Training: 16%|█▌ | 1615/10000 [20:39<1:41:45, 1.37it/s, loss=0.0112, lr=2.42e-05, step=1614] Training: 16%|█▌ | 1615/10000 [20:39<1:41:45, 1.37it/s, loss=0.0162, lr=2.42e-05, step=1615] Training: 16%|█▌ | 1616/10000 [20:40<1:36:58, 1.44it/s, loss=0.0162, lr=2.42e-05, step=1615] Training: 16%|█▌ | 1616/10000 [20:40<1:36:58, 1.44it/s, loss=0.0422, lr=2.42e-05, step=1616] Training: 16%|█▌ | 1617/10000 [20:41<1:36:45, 1.44it/s, loss=0.0422, lr=2.42e-05, step=1616] Training: 16%|█▌ | 1617/10000 [20:41<1:36:45, 1.44it/s, loss=0.0801, lr=2.42e-05, step=1617] Training: 16%|█▌ | 1618/10000 [20:41<1:41:57, 1.37it/s, loss=0.0801, lr=2.42e-05, step=1617] Training: 16%|█▌ | 1618/10000 [20:41<1:41:57, 1.37it/s, loss=0.0557, lr=2.42e-05, step=1618] Training: 16%|█▌ | 1619/10000 [20:42<1:35:52, 1.46it/s, loss=0.0557, lr=2.42e-05, step=1618] Training: 16%|█▌ | 1619/10000 [20:42<1:35:52, 1.46it/s, loss=0.0227, lr=2.42e-05, step=1619]19:05:15.001 [I] step=1620 loss=0.0275 smoothed_loss=0.0378 lr=2.42e-05 grad_norm=0.6838 step_time=0.5800s data_time=0.1160s it/s=1.437 eta_to_10000=5830.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0161 grad_action_out_proj_arms=0.2408 grad_arm_token_fuse=0.0862 grad_shared_expert=1.3863 (18633:train_pytorch.py:850) + Training: 16%|█▌ | 1620/10000 [20:43<1:36:05, 1.45it/s, loss=0.0227, lr=2.42e-05, step=1619] Training: 16%|█▌ | 1620/10000 [20:43<1:36:05, 1.45it/s, loss=0.0275, lr=2.42e-05, step=1620] Training: 16%|█▌ | 1621/10000 [20:43<1:38:55, 1.41it/s, loss=0.0275, lr=2.42e-05, step=1620] Training: 16%|█▌ | 1621/10000 [20:43<1:38:55, 1.41it/s, loss=0.0459, lr=2.42e-05, step=1621] Training: 16%|█▌ | 1622/10000 [20:44<1:34:02, 1.48it/s, loss=0.0459, lr=2.42e-05, step=1621] Training: 16%|█▌ | 1622/10000 [20:44<1:34:02, 1.48it/s, loss=0.0116, lr=2.42e-05, step=1622] Training: 16%|█▌ | 1623/10000 [20:45<1:26:39, 1.61it/s, loss=0.0116, lr=2.42e-05, step=1622] Training: 16%|█▌ | 1623/10000 [20:45<1:26:39, 1.61it/s, loss=0.0306, lr=2.42e-05, step=1623] Training: 16%|█▌ | 1624/10000 [20:45<1:22:18, 1.70it/s, loss=0.0306, lr=2.42e-05, step=1623] Training: 16%|█▌ | 1624/10000 [20:45<1:22:18, 1.70it/s, loss=0.0437, lr=2.42e-05, step=1624] Training: 16%|█▋ | 1625/10000 [20:46<1:29:22, 1.56it/s, loss=0.0437, lr=2.42e-05, step=1624] Training: 16%|█▋ | 1625/10000 [20:46<1:29:22, 1.56it/s, loss=0.0475, lr=2.42e-05, step=1625] Training: 16%|█▋ | 1626/10000 [20:46<1:24:20, 1.65it/s, loss=0.0475, lr=2.42e-05, step=1625] Training: 16%|█▋ | 1626/10000 [20:46<1:24:20, 1.65it/s, loss=0.0310, lr=2.42e-05, step=1626] Training: 16%|█▋ | 1627/10000 [20:47<1:20:24, 1.74it/s, loss=0.0310, lr=2.42e-05, step=1626] Training: 16%|█▋ | 1627/10000 [20:47<1:20:24, 1.74it/s, loss=0.0349, lr=2.42e-05, step=1627] Training: 16%|█▋ | 1628/10000 [20:48<1:34:58, 1.47it/s, loss=0.0349, lr=2.42e-05, step=1627] Training: 16%|█▋ | 1628/10000 [20:48<1:34:58, 1.47it/s, loss=0.0253, lr=2.42e-05, step=1628] Training: 16%|█▋ | 1629/10000 [20:48<1:27:48, 1.59it/s, loss=0.0253, lr=2.42e-05, step=1628] Training: 16%|█▋ | 1629/10000 [20:48<1:27:48, 1.59it/s, loss=0.0535, lr=2.42e-05, step=1629]19:05:21.182 [I] step=1630 loss=0.0339 smoothed_loss=0.0369 lr=2.42e-05 grad_norm=0.6173 step_time=0.5372s data_time=0.0808s it/s=1.618 eta_to_10000=5172.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0170 grad_action_out_proj_arms=0.1967 grad_arm_token_fuse=0.0901 grad_shared_expert=0.6284 (18633:train_pytorch.py:850) + Training: 16%|█▋ | 1630/10000 [20:49<1:26:19, 1.62it/s, loss=0.0535, lr=2.42e-05, step=1629] Training: 16%|█▋ | 1630/10000 [20:49<1:26:19, 1.62it/s, loss=0.0339, lr=2.42e-05, step=1630] Training: 16%|█▋ | 1631/10000 [20:49<1:21:10, 1.72it/s, loss=0.0339, lr=2.42e-05, step=1630] Training: 16%|█▋ | 1631/10000 [20:49<1:21:10, 1.72it/s, loss=0.0202, lr=2.42e-05, step=1631] Training: 16%|█▋ | 1632/10000 [20:50<1:24:21, 1.65it/s, loss=0.0202, lr=2.42e-05, step=1631] Training: 16%|█▋ | 1632/10000 [20:50<1:24:21, 1.65it/s, loss=0.0164, lr=2.42e-05, step=1632] Training: 16%|█▋ | 1633/10000 [20:50<1:19:33, 1.75it/s, loss=0.0164, lr=2.42e-05, step=1632] Training: 16%|█▋ | 1633/10000 [20:50<1:19:33, 1.75it/s, loss=0.0353, lr=2.42e-05, step=1633] Training: 16%|█▋ | 1634/10000 [20:51<1:16:07, 1.83it/s, loss=0.0353, lr=2.42e-05, step=1633] Training: 16%|█▋ | 1634/10000 [20:51<1:16:07, 1.83it/s, loss=0.0176, lr=2.42e-05, step=1634] Training: 16%|█▋ | 1635/10000 [20:51<1:14:07, 1.88it/s, loss=0.0176, lr=2.42e-05, step=1634] Training: 16%|█▋ | 1635/10000 [20:51<1:14:07, 1.88it/s, loss=0.0084, lr=2.42e-05, step=1635] Training: 16%|█▋ | 1636/10000 [20:52<1:25:05, 1.64it/s, loss=0.0084, lr=2.42e-05, step=1635] Training: 16%|█▋ | 1636/10000 [20:52<1:25:05, 1.64it/s, loss=0.0116, lr=2.42e-05, step=1636] Training: 16%|█▋ | 1637/10000 [20:53<1:20:11, 1.74it/s, loss=0.0116, lr=2.42e-05, step=1636] Training: 16%|█▋ | 1637/10000 [20:53<1:20:11, 1.74it/s, loss=0.0401, lr=2.42e-05, step=1637] Training: 16%|█▋ | 1638/10000 [20:53<1:17:02, 1.81it/s, loss=0.0401, lr=2.42e-05, step=1637] Training: 16%|█▋ | 1638/10000 [20:53<1:17:02, 1.81it/s, loss=0.0170, lr=2.42e-05, step=1638] Training: 16%|█▋ | 1639/10000 [20:54<1:21:04, 1.72it/s, loss=0.0170, lr=2.42e-05, step=1638] Training: 16%|█▋ | 1639/10000 [20:54<1:21:04, 1.72it/s, loss=0.0084, lr=2.42e-05, step=1639]19:05:26.788 [I] step=1640 loss=0.0373 smoothed_loss=0.0270 lr=2.42e-05 grad_norm=0.5958 step_time=0.4973s data_time=0.0634s it/s=1.784 eta_to_10000=4686.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0216 grad_action_out_proj_arms=0.2435 grad_arm_token_fuse=0.1195 grad_shared_expert=0.6856 (18633:train_pytorch.py:850) + Training: 16%|█▋ | 1640/10000 [20:54<1:19:12, 1.76it/s, loss=0.0084, lr=2.42e-05, step=1639] Training: 16%|█▋ | 1640/10000 [20:54<1:19:12, 1.76it/s, loss=0.0373, lr=2.42e-05, step=1640] Training: 16%|█▋ | 1641/10000 [20:55<1:16:10, 1.83it/s, loss=0.0373, lr=2.42e-05, step=1640] Training: 16%|█▋ | 1641/10000 [20:55<1:16:10, 1.83it/s, loss=0.0086, lr=2.42e-05, step=1641] Training: 16%|█▋ | 1642/10000 [20:55<1:13:41, 1.89it/s, loss=0.0086, lr=2.42e-05, step=1641] Training: 16%|█▋ | 1642/10000 [20:55<1:13:41, 1.89it/s, loss=0.0100, lr=2.42e-05, step=1642] Training: 16%|█▋ | 1643/10000 [20:56<1:23:59, 1.66it/s, loss=0.0100, lr=2.42e-05, step=1642] Training: 16%|█▋ | 1643/10000 [20:56<1:23:59, 1.66it/s, loss=0.0465, lr=2.42e-05, step=1643] Training: 16%|█▋ | 1644/10000 [20:57<1:19:37, 1.75it/s, loss=0.0465, lr=2.42e-05, step=1643] Training: 16%|█▋ | 1644/10000 [20:57<1:19:37, 1.75it/s, loss=0.0385, lr=2.42e-05, step=1644] Training: 16%|█▋ | 1645/10000 [20:57<1:17:18, 1.80it/s, loss=0.0385, lr=2.42e-05, step=1644] Training: 16%|█▋ | 1645/10000 [20:57<1:17:18, 1.80it/s, loss=0.0521, lr=2.42e-05, step=1645] Training: 16%|█▋ | 1646/10000 [20:58<1:21:14, 1.71it/s, loss=0.0521, lr=2.42e-05, step=1645] Training: 16%|█▋ | 1646/10000 [20:58<1:21:14, 1.71it/s, loss=0.0623, lr=2.42e-05, step=1646] Training: 16%|█▋ | 1647/10000 [20:58<1:17:15, 1.80it/s, loss=0.0623, lr=2.42e-05, step=1646] Training: 16%|█▋ | 1647/10000 [20:58<1:17:15, 1.80it/s, loss=0.0283, lr=2.42e-05, step=1647] Training: 16%|█▋ | 1648/10000 [20:59<1:22:09, 1.69it/s, loss=0.0283, lr=2.42e-05, step=1647] Training: 16%|█▋ | 1648/10000 [20:59<1:22:09, 1.69it/s, loss=0.0342, lr=2.42e-05, step=1648] Training: 16%|█▋ | 1649/10000 [21:00<1:17:52, 1.79it/s, loss=0.0342, lr=2.42e-05, step=1648] Training: 16%|█▋ | 1649/10000 [21:00<1:17:52, 1.79it/s, loss=0.0144, lr=2.42e-05, step=1649]19:05:32.675 [I] step=1650 loss=0.0484 smoothed_loss=0.0326 lr=2.42e-05 grad_norm=0.6035 step_time=0.5151s data_time=0.0736s it/s=1.699 eta_to_10000=4914.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0168 grad_action_out_proj_arms=0.2092 grad_arm_token_fuse=0.0898 grad_shared_expert=0.6384 (18633:train_pytorch.py:850) + Training: 16%|█▋ | 1650/10000 [21:00<1:28:28, 1.57it/s, loss=0.0144, lr=2.42e-05, step=1649] Training: 16%|█▋ | 1650/10000 [21:00<1:28:28, 1.57it/s, loss=0.0484, lr=2.42e-05, step=1650] Training: 17%|█▋ | 1651/10000 [21:01<1:22:39, 1.68it/s, loss=0.0484, lr=2.42e-05, step=1650] Training: 17%|█▋ | 1651/10000 [21:01<1:22:39, 1.68it/s, loss=0.0287, lr=2.42e-05, step=1651] Training: 17%|█▋ | 1652/10000 [21:01<1:18:59, 1.76it/s, loss=0.0287, lr=2.42e-05, step=1651] Training: 17%|█▋ | 1652/10000 [21:01<1:18:59, 1.76it/s, loss=0.0169, lr=2.42e-05, step=1652] Training: 17%|█▋ | 1653/10000 [21:02<1:22:39, 1.68it/s, loss=0.0169, lr=2.42e-05, step=1652] Training: 17%|█▋ | 1653/10000 [21:02<1:22:39, 1.68it/s, loss=0.0141, lr=2.42e-05, step=1653] Training: 17%|█▋ | 1654/10000 [21:02<1:18:09, 1.78it/s, loss=0.0141, lr=2.42e-05, step=1653] Training: 17%|█▋ | 1654/10000 [21:02<1:18:09, 1.78it/s, loss=0.0256, lr=2.42e-05, step=1654] Training: 17%|█▋ | 1655/10000 [21:03<1:15:06, 1.85it/s, loss=0.0256, lr=2.42e-05, step=1654] Training: 17%|█▋ | 1655/10000 [21:03<1:15:06, 1.85it/s, loss=0.0222, lr=2.42e-05, step=1655] Training: 17%|█▋ | 1656/10000 [21:03<1:13:22, 1.90it/s, loss=0.0222, lr=2.42e-05, step=1655] Training: 17%|█▋ | 1656/10000 [21:03<1:13:22, 1.90it/s, loss=0.0225, lr=2.42e-05, step=1656] Training: 17%|█▋ | 1657/10000 [21:04<1:23:32, 1.66it/s, loss=0.0225, lr=2.42e-05, step=1656] Training: 17%|█▋ | 1657/10000 [21:04<1:23:32, 1.66it/s, loss=0.0252, lr=2.42e-05, step=1657] Training: 17%|█▋ | 1658/10000 [21:05<1:18:32, 1.77it/s, loss=0.0252, lr=2.42e-05, step=1657] Training: 17%|█▋ | 1658/10000 [21:05<1:18:32, 1.77it/s, loss=0.0384, lr=2.42e-05, step=1658] Training: 17%|█▋ | 1659/10000 [21:05<1:16:17, 1.82it/s, loss=0.0384, lr=2.42e-05, step=1658] Training: 17%|█▋ | 1659/10000 [21:05<1:16:17, 1.82it/s, loss=0.1182, lr=2.42e-05, step=1659]19:05:38.280 [I] step=1660 loss=0.0238 smoothed_loss=0.0360 lr=2.42e-05 grad_norm=0.5806 step_time=0.4954s data_time=0.0651s it/s=1.784 eta_to_10000=4673.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1329 grad_arm_token_fuse=0.0775 grad_shared_expert=0.5350 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1660/10000 [21:06<1:22:55, 1.68it/s, loss=0.1182, lr=2.42e-05, step=1659] Training: 17%|█▋ | 1660/10000 [21:06<1:22:55, 1.68it/s, loss=0.0238, lr=2.42e-05, step=1660] Training: 17%|█▋ | 1661/10000 [21:06<1:18:45, 1.76it/s, loss=0.0238, lr=2.42e-05, step=1660] Training: 17%|█▋ | 1661/10000 [21:06<1:18:45, 1.76it/s, loss=0.0136, lr=2.42e-05, step=1661] Training: 17%|█▋ | 1662/10000 [21:07<1:16:00, 1.83it/s, loss=0.0136, lr=2.42e-05, step=1661] Training: 17%|█▋ | 1662/10000 [21:07<1:16:00, 1.83it/s, loss=0.0129, lr=2.42e-05, step=1662] Training: 17%|█▋ | 1663/10000 [21:07<1:15:15, 1.85it/s, loss=0.0129, lr=2.42e-05, step=1662] Training: 17%|█▋ | 1663/10000 [21:07<1:15:15, 1.85it/s, loss=0.0452, lr=2.42e-05, step=1663] Training: 17%|█▋ | 1664/10000 [21:08<1:23:25, 1.67it/s, loss=0.0452, lr=2.42e-05, step=1663] Training: 17%|█▋ | 1664/10000 [21:08<1:23:25, 1.67it/s, loss=0.0262, lr=2.42e-05, step=1664] Training: 17%|█▋ | 1665/10000 [21:09<1:18:41, 1.77it/s, loss=0.0262, lr=2.42e-05, step=1664] Training: 17%|█▋ | 1665/10000 [21:09<1:18:41, 1.77it/s, loss=0.0698, lr=2.42e-05, step=1665] Training: 17%|█▋ | 1666/10000 [21:09<1:16:38, 1.81it/s, loss=0.0698, lr=2.42e-05, step=1665] Training: 17%|█▋ | 1666/10000 [21:09<1:16:38, 1.81it/s, loss=0.0209, lr=2.42e-05, step=1666] Training: 17%|█▋ | 1667/10000 [21:10<1:21:08, 1.71it/s, loss=0.0209, lr=2.42e-05, step=1666] Training: 17%|█▋ | 1667/10000 [21:10<1:21:08, 1.71it/s, loss=0.0102, lr=2.42e-05, step=1667] Training: 17%|█▋ | 1668/10000 [21:10<1:17:37, 1.79it/s, loss=0.0102, lr=2.42e-05, step=1667] Training: 17%|█▋ | 1668/10000 [21:10<1:17:37, 1.79it/s, loss=0.0821, lr=2.42e-05, step=1668] Training: 17%|█▋ | 1669/10000 [21:11<1:15:47, 1.83it/s, loss=0.0821, lr=2.42e-05, step=1668] Training: 17%|█▋ | 1669/10000 [21:11<1:15:47, 1.83it/s, loss=0.0460, lr=2.42e-05, step=1669]19:05:43.762 [I] step=1670 loss=0.0346 smoothed_loss=0.0377 lr=2.42e-05 grad_norm=0.7035 step_time=0.4822s data_time=0.0660s it/s=1.825 eta_to_10000=4565.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0143 grad_action_out_proj_arms=0.2296 grad_arm_token_fuse=0.0685 grad_shared_expert=0.6212 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1670/10000 [21:11<1:15:24, 1.84it/s, loss=0.0460, lr=2.42e-05, step=1669] Training: 17%|█▋ | 1670/10000 [21:11<1:15:24, 1.84it/s, loss=0.0346, lr=2.42e-05, step=1670] Training: 17%|█▋ | 1671/10000 [21:12<1:23:27, 1.66it/s, loss=0.0346, lr=2.42e-05, step=1670] Training: 17%|█▋ | 1671/10000 [21:12<1:23:27, 1.66it/s, loss=0.1298, lr=2.42e-05, step=1671] Training: 17%|█▋ | 1672/10000 [21:13<1:18:47, 1.76it/s, loss=0.1298, lr=2.42e-05, step=1671] Training: 17%|█▋ | 1672/10000 [21:13<1:18:47, 1.76it/s, loss=0.0279, lr=2.42e-05, step=1672] Training: 17%|█▋ | 1673/10000 [21:13<1:15:36, 1.84it/s, loss=0.0279, lr=2.42e-05, step=1672] Training: 17%|█▋ | 1673/10000 [21:13<1:15:36, 1.84it/s, loss=0.0262, lr=2.42e-05, step=1673] Training: 17%|█▋ | 1674/10000 [21:14<1:20:58, 1.71it/s, loss=0.0262, lr=2.42e-05, step=1673] Training: 17%|█▋ | 1674/10000 [21:14<1:20:58, 1.71it/s, loss=0.0365, lr=2.42e-05, step=1674] Training: 17%|█▋ | 1675/10000 [21:14<1:18:14, 1.77it/s, loss=0.0365, lr=2.42e-05, step=1674] Training: 17%|█▋ | 1675/10000 [21:14<1:18:14, 1.77it/s, loss=0.0248, lr=2.42e-05, step=1675] Training: 17%|█▋ | 1676/10000 [21:15<1:16:30, 1.81it/s, loss=0.0248, lr=2.42e-05, step=1675] Training: 17%|█▋ | 1676/10000 [21:15<1:16:30, 1.81it/s, loss=0.0384, lr=2.42e-05, step=1676] Training: 17%|█▋ | 1677/10000 [21:15<1:15:22, 1.84it/s, loss=0.0384, lr=2.42e-05, step=1676] Training: 17%|█▋ | 1677/10000 [21:15<1:15:22, 1.84it/s, loss=0.0456, lr=2.42e-05, step=1677] Training: 17%|█▋ | 1678/10000 [21:16<1:31:06, 1.52it/s, loss=0.0456, lr=2.42e-05, step=1677] Training: 17%|█▋ | 1678/10000 [21:16<1:31:06, 1.52it/s, loss=0.0252, lr=2.42e-05, step=1678] Training: 17%|█▋ | 1679/10000 [21:17<1:25:20, 1.62it/s, loss=0.0252, lr=2.42e-05, step=1678] Training: 17%|█▋ | 1679/10000 [21:17<1:25:20, 1.62it/s, loss=0.0320, lr=2.42e-05, step=1679]19:05:49.682 [I] step=1680 loss=0.0194 smoothed_loss=0.0367 lr=2.42e-05 grad_norm=0.6466 step_time=0.5265s data_time=0.0656s it/s=1.689 eta_to_10000=4925.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0130 grad_action_out_proj_arms=0.1691 grad_arm_token_fuse=0.0662 grad_shared_expert=0.5645 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1680/10000 [21:17<1:21:30, 1.70it/s, loss=0.0320, lr=2.42e-05, step=1679] Training: 17%|█▋ | 1680/10000 [21:17<1:21:30, 1.70it/s, loss=0.0194, lr=2.42e-05, step=1680] Training: 17%|█▋ | 1681/10000 [21:18<1:18:32, 1.77it/s, loss=0.0194, lr=2.42e-05, step=1680] Training: 17%|█▋ | 1681/10000 [21:18<1:18:32, 1.77it/s, loss=0.0297, lr=2.42e-05, step=1681] Training: 17%|█▋ | 1682/10000 [21:19<1:23:36, 1.66it/s, loss=0.0297, lr=2.42e-05, step=1681] Training: 17%|█▋ | 1682/10000 [21:19<1:23:36, 1.66it/s, loss=0.0397, lr=2.42e-05, step=1682] Training: 17%|█▋ | 1683/10000 [21:19<1:19:35, 1.74it/s, loss=0.0397, lr=2.42e-05, step=1682] Training: 17%|█▋ | 1683/10000 [21:19<1:19:35, 1.74it/s, loss=0.0198, lr=2.42e-05, step=1683] Training: 17%|█▋ | 1684/10000 [21:20<1:16:26, 1.81it/s, loss=0.0198, lr=2.42e-05, step=1683] Training: 17%|█▋ | 1684/10000 [21:20<1:16:26, 1.81it/s, loss=0.0250, lr=2.42e-05, step=1684] Training: 17%|█▋ | 1685/10000 [21:20<1:14:16, 1.87it/s, loss=0.0250, lr=2.42e-05, step=1684] Training: 17%|█▋ | 1685/10000 [21:20<1:14:16, 1.87it/s, loss=0.0256, lr=2.41e-05, step=1685] Training: 17%|█▋ | 1686/10000 [21:21<1:24:01, 1.65it/s, loss=0.0256, lr=2.41e-05, step=1685] Training: 17%|█▋ | 1686/10000 [21:21<1:24:01, 1.65it/s, loss=0.0247, lr=2.41e-05, step=1686] Training: 17%|█▋ | 1687/10000 [21:21<1:19:56, 1.73it/s, loss=0.0247, lr=2.41e-05, step=1686] Training: 17%|█▋ | 1687/10000 [21:21<1:19:56, 1.73it/s, loss=0.0473, lr=2.41e-05, step=1687] Training: 17%|█▋ | 1688/10000 [21:22<1:16:21, 1.81it/s, loss=0.0473, lr=2.41e-05, step=1687] Training: 17%|█▋ | 1688/10000 [21:22<1:16:21, 1.81it/s, loss=0.0235, lr=2.41e-05, step=1688] Training: 17%|█▋ | 1689/10000 [21:23<1:23:21, 1.66it/s, loss=0.0235, lr=2.41e-05, step=1688] Training: 17%|█▋ | 1689/10000 [21:23<1:23:21, 1.66it/s, loss=0.0210, lr=2.41e-05, step=1689]19:05:55.398 [I] step=1690 loss=0.0532 smoothed_loss=0.0336 lr=2.41e-05 grad_norm=0.6479 step_time=0.5072s data_time=0.0643s it/s=1.750 eta_to_10000=4749.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.2488 grad_arm_token_fuse=0.0735 grad_shared_expert=0.7097 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1690/10000 [21:23<1:19:45, 1.74it/s, loss=0.0210, lr=2.41e-05, step=1689] Training: 17%|█▋ | 1690/10000 [21:23<1:19:45, 1.74it/s, loss=0.0532, lr=2.41e-05, step=1690] Training: 17%|█▋ | 1691/10000 [21:24<1:16:17, 1.82it/s, loss=0.0532, lr=2.41e-05, step=1690] Training: 17%|█▋ | 1691/10000 [21:24<1:16:17, 1.82it/s, loss=0.0257, lr=2.41e-05, step=1691] Training: 17%|█▋ | 1692/10000 [21:24<1:20:20, 1.72it/s, loss=0.0257, lr=2.41e-05, step=1691] Training: 17%|█▋ | 1692/10000 [21:24<1:20:20, 1.72it/s, loss=0.0147, lr=2.41e-05, step=1692] Training: 17%|█▋ | 1693/10000 [21:25<1:28:35, 1.56it/s, loss=0.0147, lr=2.41e-05, step=1692] Training: 17%|█▋ | 1693/10000 [21:25<1:28:35, 1.56it/s, loss=0.0532, lr=2.41e-05, step=1693] Training: 17%|█▋ | 1694/10000 [21:25<1:22:37, 1.68it/s, loss=0.0532, lr=2.41e-05, step=1693] Training: 17%|█▋ | 1694/10000 [21:25<1:22:37, 1.68it/s, loss=0.0171, lr=2.41e-05, step=1694] Training: 17%|█▋ | 1695/10000 [21:26<1:22:02, 1.69it/s, loss=0.0171, lr=2.41e-05, step=1694] Training: 17%|█▋ | 1695/10000 [21:26<1:22:02, 1.69it/s, loss=0.0610, lr=2.41e-05, step=1695] Training: 17%|█▋ | 1696/10000 [21:27<1:17:42, 1.78it/s, loss=0.0610, lr=2.41e-05, step=1695] Training: 17%|█▋ | 1696/10000 [21:27<1:17:42, 1.78it/s, loss=0.0901, lr=2.41e-05, step=1696] Training: 17%|█▋ | 1697/10000 [21:27<1:22:20, 1.68it/s, loss=0.0901, lr=2.41e-05, step=1696] Training: 17%|█▋ | 1697/10000 [21:27<1:22:20, 1.68it/s, loss=0.0858, lr=2.41e-05, step=1697] Training: 17%|█▋ | 1698/10000 [21:28<1:18:54, 1.75it/s, loss=0.0858, lr=2.41e-05, step=1697] Training: 17%|█▋ | 1698/10000 [21:28<1:18:54, 1.75it/s, loss=0.0233, lr=2.41e-05, step=1698] Training: 17%|█▋ | 1699/10000 [21:28<1:15:51, 1.82it/s, loss=0.0233, lr=2.41e-05, step=1698] Training: 17%|█▋ | 1699/10000 [21:28<1:15:51, 1.82it/s, loss=0.0143, lr=2.41e-05, step=1699]19:06:01.380 [I] step=1700 loss=0.0486 smoothed_loss=0.0406 lr=2.41e-05 grad_norm=0.7110 step_time=0.5236s data_time=0.0746s it/s=1.672 eta_to_10000=4964.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0248 grad_action_out_proj_arms=0.3108 grad_arm_token_fuse=0.1241 grad_shared_expert=0.7008 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1700/10000 [21:29<1:26:45, 1.59it/s, loss=0.0143, lr=2.41e-05, step=1699] Training: 17%|█▋ | 1700/10000 [21:29<1:26:45, 1.59it/s, loss=0.0486, lr=2.41e-05, step=1700] Training: 17%|█▋ | 1701/10000 [21:30<1:23:19, 1.66it/s, loss=0.0486, lr=2.41e-05, step=1700] Training: 17%|█▋ | 1701/10000 [21:30<1:23:19, 1.66it/s, loss=0.0154, lr=2.41e-05, step=1701] Training: 17%|█▋ | 1702/10000 [21:30<1:20:18, 1.72it/s, loss=0.0154, lr=2.41e-05, step=1701] Training: 17%|█▋ | 1702/10000 [21:30<1:20:18, 1.72it/s, loss=0.0395, lr=2.41e-05, step=1702] Training: 17%|█▋ | 1703/10000 [21:31<1:27:14, 1.58it/s, loss=0.0395, lr=2.41e-05, step=1702] Training: 17%|█▋ | 1703/10000 [21:31<1:27:14, 1.58it/s, loss=0.0095, lr=2.41e-05, step=1703] Training: 17%|█▋ | 1704/10000 [21:32<1:27:55, 1.57it/s, loss=0.0095, lr=2.41e-05, step=1703] Training: 17%|█▋ | 1704/10000 [21:32<1:27:55, 1.57it/s, loss=0.0443, lr=2.41e-05, step=1704] Training: 17%|█▋ | 1705/10000 [21:32<1:22:17, 1.68it/s, loss=0.0443, lr=2.41e-05, step=1704] Training: 17%|█▋ | 1705/10000 [21:32<1:22:17, 1.68it/s, loss=0.0309, lr=2.41e-05, step=1705] Training: 17%|█▋ | 1706/10000 [21:33<1:22:39, 1.67it/s, loss=0.0309, lr=2.41e-05, step=1705] Training: 17%|█▋ | 1706/10000 [21:33<1:22:39, 1.67it/s, loss=0.0294, lr=2.41e-05, step=1706] Training: 17%|█▋ | 1707/10000 [21:33<1:29:11, 1.55it/s, loss=0.0294, lr=2.41e-05, step=1706] Training: 17%|█▋ | 1707/10000 [21:33<1:29:11, 1.55it/s, loss=0.0173, lr=2.41e-05, step=1707] Training: 17%|█▋ | 1708/10000 [21:34<1:23:50, 1.65it/s, loss=0.0173, lr=2.41e-05, step=1707] Training: 17%|█▋ | 1708/10000 [21:34<1:23:50, 1.65it/s, loss=0.0209, lr=2.41e-05, step=1708] Training: 17%|█▋ | 1709/10000 [21:34<1:19:33, 1.74it/s, loss=0.0209, lr=2.41e-05, step=1708] Training: 17%|█▋ | 1709/10000 [21:34<1:19:33, 1.74it/s, loss=0.0101, lr=2.41e-05, step=1709]19:06:07.260 [I] step=1710 loss=0.0144 smoothed_loss=0.0283 lr=2.41e-05 grad_norm=0.6042 step_time=0.5027s data_time=0.0854s it/s=1.701 eta_to_10000=4873.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0136 grad_action_out_proj_arms=0.1782 grad_arm_token_fuse=0.0617 grad_shared_expert=0.5886 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1710/10000 [21:35<1:17:37, 1.78it/s, loss=0.0101, lr=2.41e-05, step=1709] Training: 17%|█▋ | 1710/10000 [21:35<1:17:37, 1.78it/s, loss=0.0144, lr=2.41e-05, step=1710] Training: 17%|█▋ | 1711/10000 [21:35<1:15:01, 1.84it/s, loss=0.0144, lr=2.41e-05, step=1710] Training: 17%|█▋ | 1711/10000 [21:35<1:15:01, 1.84it/s, loss=0.0642, lr=2.41e-05, step=1711] Training: 17%|█▋ | 1712/10000 [21:36<1:24:57, 1.63it/s, loss=0.0642, lr=2.41e-05, step=1711] Training: 17%|█▋ | 1712/10000 [21:36<1:24:57, 1.63it/s, loss=0.0379, lr=2.41e-05, step=1712] Training: 17%|█▋ | 1713/10000 [21:37<1:28:54, 1.55it/s, loss=0.0379, lr=2.41e-05, step=1712] Training: 17%|█▋ | 1713/10000 [21:37<1:28:54, 1.55it/s, loss=0.0182, lr=2.41e-05, step=1713] Training: 17%|█▋ | 1714/10000 [21:38<1:35:06, 1.45it/s, loss=0.0182, lr=2.41e-05, step=1713] Training: 17%|█▋ | 1714/10000 [21:38<1:35:06, 1.45it/s, loss=0.0445, lr=2.41e-05, step=1714] Training: 17%|█▋ | 1715/10000 [21:38<1:35:38, 1.44it/s, loss=0.0445, lr=2.41e-05, step=1714] Training: 17%|█▋ | 1715/10000 [21:38<1:35:38, 1.44it/s, loss=0.0307, lr=2.41e-05, step=1715] Training: 17%|█▋ | 1716/10000 [21:39<1:36:32, 1.43it/s, loss=0.0307, lr=2.41e-05, step=1715] Training: 17%|█▋ | 1716/10000 [21:39<1:36:32, 1.43it/s, loss=0.0224, lr=2.41e-05, step=1716] Training: 17%|█▋ | 1717/10000 [21:40<1:34:29, 1.46it/s, loss=0.0224, lr=2.41e-05, step=1716] Training: 17%|█▋ | 1717/10000 [21:40<1:34:29, 1.46it/s, loss=0.0176, lr=2.41e-05, step=1717] Training: 17%|█▋ | 1718/10000 [21:40<1:34:15, 1.46it/s, loss=0.0176, lr=2.41e-05, step=1717] Training: 17%|█▋ | 1718/10000 [21:40<1:34:15, 1.46it/s, loss=0.0303, lr=2.41e-05, step=1718] Training: 17%|█▋ | 1719/10000 [21:41<1:39:09, 1.39it/s, loss=0.0303, lr=2.41e-05, step=1718] Training: 17%|█▋ | 1719/10000 [21:41<1:39:09, 1.39it/s, loss=0.1371, lr=2.41e-05, step=1719]19:06:14.457 [I] step=1720 loss=0.0355 smoothed_loss=0.0401 lr=2.41e-05 grad_norm=0.6471 step_time=0.5811s data_time=0.1386s it/s=1.390 eta_to_10000=5958.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1716 grad_arm_token_fuse=0.0730 grad_shared_expert=0.6572 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1720/10000 [21:42<1:45:09, 1.31it/s, loss=0.1371, lr=2.41e-05, step=1719] Training: 17%|█▋ | 1720/10000 [21:42<1:45:09, 1.31it/s, loss=0.0355, lr=2.41e-05, step=1720] Training: 17%|█▋ | 1721/10000 [21:43<1:53:06, 1.22it/s, loss=0.0355, lr=2.41e-05, step=1720] Training: 17%|█▋ | 1721/10000 [21:43<1:53:06, 1.22it/s, loss=0.0374, lr=2.41e-05, step=1721] Training: 17%|█▋ | 1722/10000 [21:44<1:46:18, 1.30it/s, loss=0.0374, lr=2.41e-05, step=1721] Training: 17%|█▋ | 1722/10000 [21:44<1:46:18, 1.30it/s, loss=0.0300, lr=2.41e-05, step=1722] Training: 17%|█▋ | 1723/10000 [21:45<1:46:21, 1.30it/s, loss=0.0300, lr=2.41e-05, step=1722] Training: 17%|█▋ | 1723/10000 [21:45<1:46:21, 1.30it/s, loss=0.0316, lr=2.41e-05, step=1723] Training: 17%|█▋ | 1724/10000 [21:45<1:42:51, 1.34it/s, loss=0.0316, lr=2.41e-05, step=1723] Training: 17%|█▋ | 1724/10000 [21:45<1:42:51, 1.34it/s, loss=0.0231, lr=2.41e-05, step=1724] Training: 17%|█▋ | 1725/10000 [21:46<1:45:52, 1.30it/s, loss=0.0231, lr=2.41e-05, step=1724] Training: 17%|█▋ | 1725/10000 [21:46<1:45:52, 1.30it/s, loss=0.0096, lr=2.41e-05, step=1725] Training: 17%|█▋ | 1726/10000 [21:47<1:41:42, 1.36it/s, loss=0.0096, lr=2.41e-05, step=1725] Training: 17%|█▋ | 1726/10000 [21:47<1:41:42, 1.36it/s, loss=0.0717, lr=2.41e-05, step=1726] Training: 17%|█▋ | 1727/10000 [21:48<1:55:40, 1.19it/s, loss=0.0717, lr=2.41e-05, step=1726] Training: 17%|█▋ | 1727/10000 [21:48<1:55:40, 1.19it/s, loss=0.0151, lr=2.41e-05, step=1727] Training: 17%|█▋ | 1728/10000 [21:49<2:04:01, 1.11it/s, loss=0.0151, lr=2.41e-05, step=1727] Training: 17%|█▋ | 1728/10000 [21:49<2:04:01, 1.11it/s, loss=0.0207, lr=2.41e-05, step=1728] Training: 17%|█▋ | 1729/10000 [21:50<1:57:52, 1.17it/s, loss=0.0207, lr=2.41e-05, step=1728] Training: 17%|█▋ | 1729/10000 [21:50<1:57:52, 1.17it/s, loss=0.0183, lr=2.41e-05, step=1729]19:06:22.516 [I] step=1730 loss=0.0104 smoothed_loss=0.0302 lr=2.41e-05 grad_norm=0.7345 step_time=0.6594s data_time=0.1465s it/s=1.241 eta_to_10000=6663.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0137 grad_action_out_proj_arms=0.1846 grad_arm_token_fuse=0.0665 grad_shared_expert=0.7704 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1730/10000 [21:50<1:48:46, 1.27it/s, loss=0.0183, lr=2.41e-05, step=1729] Training: 17%|█▋ | 1730/10000 [21:50<1:48:46, 1.27it/s, loss=0.0104, lr=2.41e-05, step=1730] Training: 17%|█▋ | 1731/10000 [21:51<1:45:46, 1.30it/s, loss=0.0104, lr=2.41e-05, step=1730] Training: 17%|█▋ | 1731/10000 [21:51<1:45:46, 1.30it/s, loss=0.0316, lr=2.41e-05, step=1731] Training: 17%|█▋ | 1732/10000 [21:52<1:44:33, 1.32it/s, loss=0.0316, lr=2.41e-05, step=1731] Training: 17%|█▋ | 1732/10000 [21:52<1:44:33, 1.32it/s, loss=0.0533, lr=2.41e-05, step=1732] Training: 17%|█▋ | 1733/10000 [21:52<1:40:49, 1.37it/s, loss=0.0533, lr=2.41e-05, step=1732] Training: 17%|█▋ | 1733/10000 [21:52<1:40:49, 1.37it/s, loss=0.0384, lr=2.41e-05, step=1733] Training: 17%|█▋ | 1734/10000 [21:53<1:44:35, 1.32it/s, loss=0.0384, lr=2.41e-05, step=1733] Training: 17%|█▋ | 1734/10000 [21:53<1:44:35, 1.32it/s, loss=0.0887, lr=2.41e-05, step=1734] Training: 17%|█▋ | 1735/10000 [21:54<1:45:14, 1.31it/s, loss=0.0887, lr=2.41e-05, step=1734] Training: 17%|█▋ | 1735/10000 [21:54<1:45:14, 1.31it/s, loss=0.0399, lr=2.41e-05, step=1735] Training: 17%|█▋ | 1736/10000 [21:55<1:54:22, 1.20it/s, loss=0.0399, lr=2.41e-05, step=1735] Training: 17%|█▋ | 1736/10000 [21:55<1:54:22, 1.20it/s, loss=0.0187, lr=2.41e-05, step=1736] Training: 17%|█▋ | 1737/10000 [21:56<1:55:26, 1.19it/s, loss=0.0187, lr=2.41e-05, step=1736] Training: 17%|█▋ | 1737/10000 [21:56<1:55:26, 1.19it/s, loss=0.0218, lr=2.41e-05, step=1737] Training: 17%|█▋ | 1738/10000 [21:56<1:49:20, 1.26it/s, loss=0.0218, lr=2.41e-05, step=1737] Training: 17%|█▋ | 1738/10000 [21:56<1:49:20, 1.26it/s, loss=0.0197, lr=2.41e-05, step=1738] Training: 17%|█▋ | 1739/10000 [21:57<1:42:41, 1.34it/s, loss=0.0197, lr=2.41e-05, step=1738] Training: 17%|█▋ | 1739/10000 [21:57<1:42:41, 1.34it/s, loss=0.0322, lr=2.41e-05, step=1739]19:06:30.267 [I] step=1740 loss=0.0080 smoothed_loss=0.0311 lr=2.41e-05 grad_norm=0.7110 step_time=0.6175s data_time=0.1576s it/s=1.290 eta_to_10000=6401.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0215 grad_action_out_proj_arms=0.2194 grad_arm_token_fuse=0.1054 grad_shared_expert=0.5143 (18633:train_pytorch.py:850) + Training: 17%|█▋ | 1740/10000 [21:58<1:47:29, 1.28it/s, loss=0.0322, lr=2.41e-05, step=1739] Training: 17%|█▋ | 1740/10000 [21:58<1:47:29, 1.28it/s, loss=0.0080, lr=2.41e-05, step=1740] Training: 17%|█▋ | 1741/10000 [21:59<1:40:56, 1.36it/s, loss=0.0080, lr=2.41e-05, step=1740] Training: 17%|█▋ | 1741/10000 [21:59<1:40:56, 1.36it/s, loss=0.0136, lr=2.41e-05, step=1741] Training: 17%|█▋ | 1742/10000 [21:59<1:46:40, 1.29it/s, loss=0.0136, lr=2.41e-05, step=1741] Training: 17%|█▋ | 1742/10000 [21:59<1:46:40, 1.29it/s, loss=0.0229, lr=2.41e-05, step=1742] Training: 17%|█▋ | 1743/10000 [22:00<1:54:31, 1.20it/s, loss=0.0229, lr=2.41e-05, step=1742] Training: 17%|█▋ | 1743/10000 [22:00<1:54:31, 1.20it/s, loss=0.0397, lr=2.41e-05, step=1743] Training: 17%|█▋ | 1744/10000 [22:01<1:49:17, 1.26it/s, loss=0.0397, lr=2.41e-05, step=1743] Training: 17%|█▋ | 1744/10000 [22:01<1:49:17, 1.26it/s, loss=0.0761, lr=2.41e-05, step=1744] Training: 17%|█▋ | 1745/10000 [22:02<1:43:53, 1.32it/s, loss=0.0761, lr=2.41e-05, step=1744] Training: 17%|█▋ | 1745/10000 [22:02<1:43:53, 1.32it/s, loss=0.0118, lr=2.41e-05, step=1745] Training: 17%|█▋ | 1746/10000 [22:03<1:44:32, 1.32it/s, loss=0.0118, lr=2.41e-05, step=1745] Training: 17%|█▋ | 1746/10000 [22:03<1:44:32, 1.32it/s, loss=0.0159, lr=2.41e-05, step=1746] Training: 17%|█▋ | 1747/10000 [22:03<1:36:05, 1.43it/s, loss=0.0159, lr=2.41e-05, step=1746] Training: 17%|█▋ | 1747/10000 [22:03<1:36:05, 1.43it/s, loss=0.0400, lr=2.41e-05, step=1747] Training: 17%|█▋ | 1748/10000 [22:04<1:36:53, 1.42it/s, loss=0.0400, lr=2.41e-05, step=1747] Training: 17%|█▋ | 1748/10000 [22:04<1:36:53, 1.42it/s, loss=0.0294, lr=2.41e-05, step=1748] Training: 17%|█▋ | 1749/10000 [22:05<1:38:07, 1.40it/s, loss=0.0294, lr=2.41e-05, step=1748] Training: 17%|█▋ | 1749/10000 [22:05<1:38:07, 1.40it/s, loss=0.0446, lr=2.41e-05, step=1749]19:06:38.204 [I] step=1750 loss=0.0478 smoothed_loss=0.0341 lr=2.41e-05 grad_norm=0.6419 step_time=0.6228s data_time=0.1709s it/s=1.260 eta_to_10000=6546.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0199 grad_action_out_proj_arms=0.2154 grad_arm_token_fuse=0.0988 grad_shared_expert=0.8680 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1750/10000 [22:06<2:03:24, 1.11it/s, loss=0.0446, lr=2.41e-05, step=1749] Training: 18%|█▊ | 1750/10000 [22:06<2:03:24, 1.11it/s, loss=0.0478, lr=2.41e-05, step=1750] Training: 18%|█▊ | 1751/10000 [22:07<2:01:17, 1.13it/s, loss=0.0478, lr=2.41e-05, step=1750] Training: 18%|█▊ | 1751/10000 [22:07<2:01:17, 1.13it/s, loss=0.0368, lr=2.41e-05, step=1751] Training: 18%|█▊ | 1752/10000 [22:07<1:53:38, 1.21it/s, loss=0.0368, lr=2.41e-05, step=1751] Training: 18%|█▊ | 1752/10000 [22:07<1:53:38, 1.21it/s, loss=0.0674, lr=2.41e-05, step=1752] Training: 18%|█▊ | 1753/10000 [22:08<1:45:20, 1.30it/s, loss=0.0674, lr=2.41e-05, step=1752] Training: 18%|█▊ | 1753/10000 [22:08<1:45:20, 1.30it/s, loss=0.0308, lr=2.40e-05, step=1753] Training: 18%|█▊ | 1754/10000 [22:09<1:41:24, 1.36it/s, loss=0.0308, lr=2.40e-05, step=1753] Training: 18%|█▊ | 1754/10000 [22:09<1:41:24, 1.36it/s, loss=0.1873, lr=2.40e-05, step=1754] Training: 18%|█▊ | 1755/10000 [22:09<1:38:07, 1.40it/s, loss=0.1873, lr=2.40e-05, step=1754] Training: 18%|█▊ | 1755/10000 [22:09<1:38:07, 1.40it/s, loss=0.0270, lr=2.40e-05, step=1755] Training: 18%|█▊ | 1756/10000 [22:10<1:33:56, 1.46it/s, loss=0.0270, lr=2.40e-05, step=1755] Training: 18%|█▊ | 1756/10000 [22:10<1:33:56, 1.46it/s, loss=0.0493, lr=2.40e-05, step=1756] Training: 18%|█▊ | 1757/10000 [22:11<1:52:43, 1.22it/s, loss=0.0493, lr=2.40e-05, step=1756] Training: 18%|█▊ | 1757/10000 [22:11<1:52:43, 1.22it/s, loss=0.0271, lr=2.40e-05, step=1757] Training: 18%|█▊ | 1758/10000 [22:12<1:49:00, 1.26it/s, loss=0.0271, lr=2.40e-05, step=1757] Training: 18%|█▊ | 1758/10000 [22:12<1:49:00, 1.26it/s, loss=0.0855, lr=2.40e-05, step=1758] Training: 18%|█▊ | 1759/10000 [22:13<1:51:19, 1.23it/s, loss=0.0855, lr=2.40e-05, step=1758] Training: 18%|█▊ | 1759/10000 [22:13<1:51:19, 1.23it/s, loss=0.0657, lr=2.40e-05, step=1759]19:06:45.693 [I] step=1760 loss=0.0128 smoothed_loss=0.0486 lr=2.40e-05 grad_norm=0.6482 step_time=0.5831s data_time=0.1658s it/s=1.335 eta_to_10000=6170.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0203 grad_action_out_proj_arms=0.1805 grad_arm_token_fuse=0.1082 grad_shared_expert=0.4590 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1760/10000 [22:13<1:44:59, 1.31it/s, loss=0.0657, lr=2.40e-05, step=1759] Training: 18%|█▊ | 1760/10000 [22:13<1:44:59, 1.31it/s, loss=0.0128, lr=2.40e-05, step=1760] Training: 18%|█▊ | 1761/10000 [22:14<1:42:13, 1.34it/s, loss=0.0128, lr=2.40e-05, step=1760] Training: 18%|█▊ | 1761/10000 [22:14<1:42:13, 1.34it/s, loss=0.0272, lr=2.40e-05, step=1761] Training: 18%|█▊ | 1762/10000 [22:15<1:37:43, 1.40it/s, loss=0.0272, lr=2.40e-05, step=1761] Training: 18%|█▊ | 1762/10000 [22:15<1:37:43, 1.40it/s, loss=0.0774, lr=2.40e-05, step=1762] Training: 18%|█▊ | 1763/10000 [22:15<1:30:59, 1.51it/s, loss=0.0774, lr=2.40e-05, step=1762] Training: 18%|█▊ | 1763/10000 [22:15<1:30:59, 1.51it/s, loss=0.0161, lr=2.40e-05, step=1763] Training: 18%|█▊ | 1764/10000 [22:16<1:39:37, 1.38it/s, loss=0.0161, lr=2.40e-05, step=1763] Training: 18%|█▊ | 1764/10000 [22:16<1:39:37, 1.38it/s, loss=0.0225, lr=2.40e-05, step=1764] Training: 18%|█▊ | 1765/10000 [22:17<1:31:41, 1.50it/s, loss=0.0225, lr=2.40e-05, step=1764] Training: 18%|█▊ | 1765/10000 [22:17<1:31:41, 1.50it/s, loss=0.0075, lr=2.40e-05, step=1765] Training: 18%|█▊ | 1766/10000 [22:17<1:33:49, 1.46it/s, loss=0.0075, lr=2.40e-05, step=1765] Training: 18%|█▊ | 1766/10000 [22:17<1:33:49, 1.46it/s, loss=0.0279, lr=2.40e-05, step=1766] Training: 18%|█▊ | 1767/10000 [22:18<1:26:03, 1.59it/s, loss=0.0279, lr=2.40e-05, step=1766] Training: 18%|█▊ | 1767/10000 [22:18<1:26:03, 1.59it/s, loss=0.0372, lr=2.40e-05, step=1767] Training: 18%|█▊ | 1768/10000 [22:19<1:33:54, 1.46it/s, loss=0.0372, lr=2.40e-05, step=1767] Training: 18%|█▊ | 1768/10000 [22:19<1:33:54, 1.46it/s, loss=0.0458, lr=2.40e-05, step=1768] Training: 18%|█▊ | 1769/10000 [22:19<1:35:30, 1.44it/s, loss=0.0458, lr=2.40e-05, step=1768] Training: 18%|█▊ | 1769/10000 [22:19<1:35:30, 1.44it/s, loss=0.0308, lr=2.40e-05, step=1769]19:06:52.447 [I] step=1770 loss=0.0305 smoothed_loss=0.0378 lr=2.40e-05 grad_norm=0.7349 step_time=0.5515s data_time=0.1239s it/s=1.481 eta_to_10000=5557.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0159 grad_action_out_proj_arms=0.1906 grad_arm_token_fuse=0.0753 grad_shared_expert=0.6005 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1770/10000 [22:20<1:36:00, 1.43it/s, loss=0.0308, lr=2.40e-05, step=1769] Training: 18%|█▊ | 1770/10000 [22:20<1:36:00, 1.43it/s, loss=0.0305, lr=2.40e-05, step=1770] Training: 18%|█▊ | 1771/10000 [22:21<1:55:19, 1.19it/s, loss=0.0305, lr=2.40e-05, step=1770] Training: 18%|█▊ | 1771/10000 [22:21<1:55:19, 1.19it/s, loss=0.0419, lr=2.40e-05, step=1771] Training: 18%|█▊ | 1772/10000 [22:22<2:05:33, 1.09it/s, loss=0.0419, lr=2.40e-05, step=1771] Training: 18%|█▊ | 1772/10000 [22:22<2:05:33, 1.09it/s, loss=0.0321, lr=2.40e-05, step=1772] Training: 18%|█▊ | 1773/10000 [22:23<1:55:14, 1.19it/s, loss=0.0321, lr=2.40e-05, step=1772] Training: 18%|█▊ | 1773/10000 [22:23<1:55:14, 1.19it/s, loss=0.0357, lr=2.40e-05, step=1773] Training: 18%|█▊ | 1774/10000 [22:24<1:46:50, 1.28it/s, loss=0.0357, lr=2.40e-05, step=1773] Training: 18%|█▊ | 1774/10000 [22:24<1:46:50, 1.28it/s, loss=0.0259, lr=2.40e-05, step=1774] Training: 18%|█▊ | 1775/10000 [22:25<1:50:29, 1.24it/s, loss=0.0259, lr=2.40e-05, step=1774] Training: 18%|█▊ | 1775/10000 [22:25<1:50:29, 1.24it/s, loss=0.0317, lr=2.40e-05, step=1775] Training: 18%|█▊ | 1776/10000 [22:25<1:39:09, 1.38it/s, loss=0.0317, lr=2.40e-05, step=1775] Training: 18%|█▊ | 1776/10000 [22:25<1:39:09, 1.38it/s, loss=0.0121, lr=2.40e-05, step=1776] Training: 18%|█▊ | 1777/10000 [22:26<1:40:38, 1.36it/s, loss=0.0121, lr=2.40e-05, step=1776] Training: 18%|█▊ | 1777/10000 [22:26<1:40:38, 1.36it/s, loss=0.0329, lr=2.40e-05, step=1777] Training: 18%|█▊ | 1778/10000 [22:27<1:53:46, 1.20it/s, loss=0.0329, lr=2.40e-05, step=1777] Training: 18%|█▊ | 1778/10000 [22:27<1:53:46, 1.20it/s, loss=0.0331, lr=2.40e-05, step=1778] Training: 18%|█▊ | 1779/10000 [22:28<1:55:36, 1.19it/s, loss=0.0331, lr=2.40e-05, step=1778] Training: 18%|█▊ | 1779/10000 [22:28<1:55:36, 1.19it/s, loss=0.0420, lr=2.40e-05, step=1779]19:07:00.758 [I] step=1780 loss=0.0222 smoothed_loss=0.0330 lr=2.40e-05 grad_norm=0.5879 step_time=0.6742s data_time=0.1569s it/s=1.203 eta_to_10000=6830.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.1657 grad_arm_token_fuse=0.0681 grad_shared_expert=0.4276 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1780/10000 [22:28<1:48:05, 1.27it/s, loss=0.0420, lr=2.40e-05, step=1779] Training: 18%|█▊ | 1780/10000 [22:28<1:48:05, 1.27it/s, loss=0.0222, lr=2.40e-05, step=1780] Training: 18%|█▊ | 1781/10000 [22:29<1:41:56, 1.34it/s, loss=0.0222, lr=2.40e-05, step=1780] Training: 18%|█▊ | 1781/10000 [22:29<1:41:56, 1.34it/s, loss=0.0137, lr=2.40e-05, step=1781] Training: 18%|█▊ | 1782/10000 [22:30<1:40:26, 1.36it/s, loss=0.0137, lr=2.40e-05, step=1781] Training: 18%|█▊ | 1782/10000 [22:30<1:40:26, 1.36it/s, loss=0.0357, lr=2.40e-05, step=1782] Training: 18%|█▊ | 1783/10000 [22:31<1:43:07, 1.33it/s, loss=0.0357, lr=2.40e-05, step=1782] Training: 18%|█▊ | 1783/10000 [22:31<1:43:07, 1.33it/s, loss=0.0797, lr=2.40e-05, step=1783] Training: 18%|█▊ | 1784/10000 [22:31<1:38:08, 1.40it/s, loss=0.0797, lr=2.40e-05, step=1783] Training: 18%|█▊ | 1784/10000 [22:31<1:38:08, 1.40it/s, loss=0.0424, lr=2.40e-05, step=1784] Training: 18%|█▊ | 1785/10000 [22:32<1:36:55, 1.41it/s, loss=0.0424, lr=2.40e-05, step=1784] Training: 18%|█▊ | 1785/10000 [22:32<1:36:55, 1.41it/s, loss=0.0235, lr=2.40e-05, step=1785] Training: 18%|█▊ | 1786/10000 [22:33<1:49:25, 1.25it/s, loss=0.0235, lr=2.40e-05, step=1785] Training: 18%|█▊ | 1786/10000 [22:33<1:49:25, 1.25it/s, loss=0.0091, lr=2.40e-05, step=1786] Training: 18%|█▊ | 1787/10000 [22:34<1:52:10, 1.22it/s, loss=0.0091, lr=2.40e-05, step=1786] Training: 18%|█▊ | 1787/10000 [22:34<1:52:10, 1.22it/s, loss=0.0299, lr=2.40e-05, step=1787] Training: 18%|█▊ | 1788/10000 [22:35<1:51:00, 1.23it/s, loss=0.0299, lr=2.40e-05, step=1787] Training: 18%|█▊ | 1788/10000 [22:35<1:51:00, 1.23it/s, loss=0.0540, lr=2.40e-05, step=1788] Training: 18%|█▊ | 1789/10000 [22:35<1:45:46, 1.29it/s, loss=0.0540, lr=2.40e-05, step=1788] Training: 18%|█▊ | 1789/10000 [22:35<1:45:46, 1.29it/s, loss=0.0327, lr=2.40e-05, step=1789]19:07:08.186 [I] step=1790 loss=0.0215 smoothed_loss=0.0333 lr=2.40e-05 grad_norm=0.6438 step_time=0.6041s data_time=0.1387s it/s=1.346 eta_to_10000=6097.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0098 grad_action_out_proj_arms=0.1645 grad_arm_token_fuse=0.0546 grad_shared_expert=0.5018 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1790/10000 [22:36<1:38:59, 1.38it/s, loss=0.0327, lr=2.40e-05, step=1789] Training: 18%|█▊ | 1790/10000 [22:36<1:38:59, 1.38it/s, loss=0.0215, lr=2.40e-05, step=1790] Training: 18%|█▊ | 1791/10000 [22:37<1:37:54, 1.40it/s, loss=0.0215, lr=2.40e-05, step=1790] Training: 18%|█▊ | 1791/10000 [22:37<1:37:54, 1.40it/s, loss=0.0217, lr=2.40e-05, step=1791] Training: 18%|█▊ | 1792/10000 [22:37<1:33:33, 1.46it/s, loss=0.0217, lr=2.40e-05, step=1791] Training: 18%|█▊ | 1792/10000 [22:37<1:33:33, 1.46it/s, loss=0.0133, lr=2.40e-05, step=1792] Training: 18%|█▊ | 1793/10000 [22:38<1:47:44, 1.27it/s, loss=0.0133, lr=2.40e-05, step=1792] Training: 18%|█▊ | 1793/10000 [22:38<1:47:44, 1.27it/s, loss=0.0534, lr=2.40e-05, step=1793] Training: 18%|█▊ | 1794/10000 [22:39<1:46:26, 1.28it/s, loss=0.0534, lr=2.40e-05, step=1793] Training: 18%|█▊ | 1794/10000 [22:39<1:46:26, 1.28it/s, loss=0.0130, lr=2.40e-05, step=1794] Training: 18%|█▊ | 1795/10000 [22:40<1:45:58, 1.29it/s, loss=0.0130, lr=2.40e-05, step=1794] Training: 18%|█▊ | 1795/10000 [22:40<1:45:58, 1.29it/s, loss=0.0273, lr=2.40e-05, step=1795] Training: 18%|█▊ | 1796/10000 [22:40<1:41:06, 1.35it/s, loss=0.0273, lr=2.40e-05, step=1795] Training: 18%|█▊ | 1796/10000 [22:40<1:41:06, 1.35it/s, loss=0.0226, lr=2.40e-05, step=1796] Training: 18%|█▊ | 1797/10000 [22:41<1:47:16, 1.27it/s, loss=0.0226, lr=2.40e-05, step=1796] Training: 18%|█▊ | 1797/10000 [22:41<1:47:16, 1.27it/s, loss=0.0280, lr=2.40e-05, step=1797] Training: 18%|█▊ | 1798/10000 [22:42<1:40:45, 1.36it/s, loss=0.0280, lr=2.40e-05, step=1797] Training: 18%|█▊ | 1798/10000 [22:42<1:40:45, 1.36it/s, loss=0.0239, lr=2.40e-05, step=1798] Training: 18%|█▊ | 1799/10000 [22:43<1:36:45, 1.41it/s, loss=0.0239, lr=2.40e-05, step=1798] Training: 18%|█▊ | 1799/10000 [22:43<1:36:45, 1.41it/s, loss=0.0107, lr=2.40e-05, step=1799]19:07:15.810 [I] step=1800 loss=0.0999 smoothed_loss=0.0343 lr=2.40e-05 grad_norm=0.7122 step_time=0.6266s data_time=0.1359s it/s=1.312 eta_to_10000=6251.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0232 grad_action_out_proj_arms=0.2762 grad_arm_token_fuse=0.1284 grad_shared_expert=0.7935 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1800/10000 [22:43<1:46:42, 1.28it/s, loss=0.0107, lr=2.40e-05, step=1799] Training: 18%|█▊ | 1800/10000 [22:43<1:46:42, 1.28it/s, loss=0.0999, lr=2.40e-05, step=1800] Training: 18%|█▊ | 1801/10000 [22:45<2:11:36, 1.04it/s, loss=0.0999, lr=2.40e-05, step=1800] Training: 18%|█▊ | 1801/10000 [22:45<2:11:36, 1.04it/s, loss=0.0140, lr=2.40e-05, step=1801] Training: 18%|█▊ | 1802/10000 [22:46<2:08:17, 1.07it/s, loss=0.0140, lr=2.40e-05, step=1801] Training: 18%|█▊ | 1802/10000 [22:46<2:08:17, 1.07it/s, loss=0.0087, lr=2.40e-05, step=1802] Training: 18%|█▊ | 1803/10000 [22:47<2:08:52, 1.06it/s, loss=0.0087, lr=2.40e-05, step=1802] Training: 18%|█▊ | 1803/10000 [22:47<2:08:52, 1.06it/s, loss=0.0134, lr=2.40e-05, step=1803] Training: 18%|█▊ | 1804/10000 [22:47<1:56:09, 1.18it/s, loss=0.0134, lr=2.40e-05, step=1803] Training: 18%|█▊ | 1804/10000 [22:47<1:56:09, 1.18it/s, loss=0.0372, lr=2.40e-05, step=1804] Training: 18%|█▊ | 1805/10000 [22:48<1:47:38, 1.27it/s, loss=0.0372, lr=2.40e-05, step=1804] Training: 18%|█▊ | 1805/10000 [22:48<1:47:38, 1.27it/s, loss=0.0144, lr=2.40e-05, step=1805] Training: 18%|█▊ | 1806/10000 [22:49<1:43:11, 1.32it/s, loss=0.0144, lr=2.40e-05, step=1805] Training: 18%|█▊ | 1806/10000 [22:49<1:43:11, 1.32it/s, loss=0.0175, lr=2.40e-05, step=1806] Training: 18%|█▊ | 1807/10000 [22:50<1:48:39, 1.26it/s, loss=0.0175, lr=2.40e-05, step=1806] Training: 18%|█▊ | 1807/10000 [22:50<1:48:39, 1.26it/s, loss=0.0176, lr=2.40e-05, step=1807] Training: 18%|█▊ | 1808/10000 [22:50<1:49:21, 1.25it/s, loss=0.0176, lr=2.40e-05, step=1807] Training: 18%|█▊ | 1808/10000 [22:50<1:49:21, 1.25it/s, loss=0.0306, lr=2.40e-05, step=1808] Training: 18%|█▊ | 1809/10000 [22:51<1:53:05, 1.21it/s, loss=0.0306, lr=2.40e-05, step=1808] Training: 18%|█▊ | 1809/10000 [22:51<1:53:05, 1.21it/s, loss=0.0467, lr=2.40e-05, step=1809]19:07:24.262 [I] step=1810 loss=0.0138 smoothed_loss=0.0268 lr=2.40e-05 grad_norm=0.5938 step_time=0.6335s data_time=0.2117s it/s=1.183 eta_to_10000=6921.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1326 grad_arm_token_fuse=0.0551 grad_shared_expert=0.5075 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1810/10000 [22:52<1:46:50, 1.28it/s, loss=0.0467, lr=2.40e-05, step=1809] Training: 18%|█▊ | 1810/10000 [22:52<1:46:50, 1.28it/s, loss=0.0138, lr=2.40e-05, step=1810] Training: 18%|█▊ | 1811/10000 [22:53<1:50:05, 1.24it/s, loss=0.0138, lr=2.40e-05, step=1810] Training: 18%|█▊ | 1811/10000 [22:53<1:50:05, 1.24it/s, loss=0.0251, lr=2.40e-05, step=1811] Training: 18%|█▊ | 1812/10000 [22:54<1:48:28, 1.26it/s, loss=0.0251, lr=2.40e-05, step=1811] Training: 18%|█▊ | 1812/10000 [22:54<1:48:28, 1.26it/s, loss=0.0149, lr=2.40e-05, step=1812] Training: 18%|█▊ | 1813/10000 [22:54<1:40:47, 1.35it/s, loss=0.0149, lr=2.40e-05, step=1812] Training: 18%|█▊ | 1813/10000 [22:54<1:40:47, 1.35it/s, loss=0.0137, lr=2.40e-05, step=1813] Training: 18%|█▊ | 1814/10000 [22:55<1:48:54, 1.25it/s, loss=0.0137, lr=2.40e-05, step=1813] Training: 18%|█▊ | 1814/10000 [22:55<1:48:54, 1.25it/s, loss=0.0169, lr=2.40e-05, step=1814] Training: 18%|█▊ | 1815/10000 [22:56<2:09:36, 1.05it/s, loss=0.0169, lr=2.40e-05, step=1814] Training: 18%|█▊ | 1815/10000 [22:56<2:09:36, 1.05it/s, loss=0.0452, lr=2.40e-05, step=1815] Training: 18%|█▊ | 1816/10000 [22:57<1:55:42, 1.18it/s, loss=0.0452, lr=2.40e-05, step=1815] Training: 18%|█▊ | 1816/10000 [22:57<1:55:42, 1.18it/s, loss=0.0435, lr=2.40e-05, step=1816] Training: 18%|█▊ | 1817/10000 [22:58<1:58:56, 1.15it/s, loss=0.0435, lr=2.40e-05, step=1816] Training: 18%|█▊ | 1817/10000 [22:58<1:58:56, 1.15it/s, loss=0.0448, lr=2.40e-05, step=1817] Training: 18%|█▊ | 1818/10000 [22:59<1:52:06, 1.22it/s, loss=0.0448, lr=2.40e-05, step=1817] Training: 18%|█▊ | 1818/10000 [22:59<1:52:06, 1.22it/s, loss=0.0429, lr=2.39e-05, step=1818] Training: 18%|█▊ | 1819/10000 [23:00<1:54:13, 1.19it/s, loss=0.0429, lr=2.39e-05, step=1818] Training: 18%|█▊ | 1819/10000 [23:00<1:54:13, 1.19it/s, loss=0.0413, lr=2.39e-05, step=1819]19:07:32.552 [I] step=1820 loss=0.0140 smoothed_loss=0.0299 lr=2.40e-05 grad_norm=0.6707 step_time=0.6612s data_time=0.1678s it/s=1.207 eta_to_10000=6779.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0149 grad_action_out_proj_arms=0.2032 grad_arm_token_fuse=0.0697 grad_shared_expert=0.6875 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1820/10000 [23:00<1:48:16, 1.26it/s, loss=0.0413, lr=2.39e-05, step=1819] Training: 18%|█▊ | 1820/10000 [23:00<1:48:16, 1.26it/s, loss=0.0140, lr=2.39e-05, step=1820] Training: 18%|█▊ | 1821/10000 [23:01<1:52:35, 1.21it/s, loss=0.0140, lr=2.39e-05, step=1820] Training: 18%|█▊ | 1821/10000 [23:01<1:52:35, 1.21it/s, loss=0.0154, lr=2.39e-05, step=1821] Training: 18%|█▊ | 1822/10000 [23:02<1:56:09, 1.17it/s, loss=0.0154, lr=2.39e-05, step=1821] Training: 18%|█▊ | 1822/10000 [23:02<1:56:09, 1.17it/s, loss=0.0524, lr=2.39e-05, step=1822] Training: 18%|█▊ | 1823/10000 [23:03<1:48:16, 1.26it/s, loss=0.0524, lr=2.39e-05, step=1822] Training: 18%|█▊ | 1823/10000 [23:03<1:48:16, 1.26it/s, loss=0.0066, lr=2.39e-05, step=1823] Training: 18%|█▊ | 1824/10000 [23:03<1:48:08, 1.26it/s, loss=0.0066, lr=2.39e-05, step=1823] Training: 18%|█▊ | 1824/10000 [23:03<1:48:08, 1.26it/s, loss=0.0364, lr=2.39e-05, step=1824] Training: 18%|█▊ | 1825/10000 [23:04<1:41:17, 1.35it/s, loss=0.0364, lr=2.39e-05, step=1824] Training: 18%|█▊ | 1825/10000 [23:04<1:41:17, 1.35it/s, loss=0.0172, lr=2.39e-05, step=1825] Training: 18%|█▊ | 1826/10000 [23:05<1:41:04, 1.35it/s, loss=0.0172, lr=2.39e-05, step=1825] Training: 18%|█▊ | 1826/10000 [23:05<1:41:04, 1.35it/s, loss=0.0087, lr=2.39e-05, step=1826] Training: 18%|█▊ | 1827/10000 [23:06<1:40:01, 1.36it/s, loss=0.0087, lr=2.39e-05, step=1826] Training: 18%|█▊ | 1827/10000 [23:06<1:40:01, 1.36it/s, loss=0.0120, lr=2.39e-05, step=1827] Training: 18%|█▊ | 1828/10000 [23:07<2:00:39, 1.13it/s, loss=0.0120, lr=2.39e-05, step=1827] Training: 18%|█▊ | 1828/10000 [23:07<2:00:39, 1.13it/s, loss=0.0270, lr=2.39e-05, step=1828] Training: 18%|█▊ | 1829/10000 [23:07<1:52:28, 1.21it/s, loss=0.0270, lr=2.39e-05, step=1828] Training: 18%|█▊ | 1829/10000 [23:07<1:52:28, 1.21it/s, loss=0.0440, lr=2.39e-05, step=1829]19:07:40.550 [I] step=1830 loss=0.0175 smoothed_loss=0.0259 lr=2.39e-05 grad_norm=0.5750 step_time=0.6480s data_time=0.1518s it/s=1.251 eta_to_10000=6533.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0139 grad_action_out_proj_arms=0.1595 grad_arm_token_fuse=0.0696 grad_shared_expert=0.4718 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1830/10000 [23:08<1:48:24, 1.26it/s, loss=0.0440, lr=2.39e-05, step=1829] Training: 18%|█▊ | 1830/10000 [23:08<1:48:24, 1.26it/s, loss=0.0175, lr=2.39e-05, step=1830] Training: 18%|█▊ | 1831/10000 [23:09<1:45:46, 1.29it/s, loss=0.0175, lr=2.39e-05, step=1830] Training: 18%|█▊ | 1831/10000 [23:09<1:45:46, 1.29it/s, loss=0.0369, lr=2.39e-05, step=1831] Training: 18%|█▊ | 1832/10000 [23:10<1:47:02, 1.27it/s, loss=0.0369, lr=2.39e-05, step=1831] Training: 18%|█▊ | 1832/10000 [23:10<1:47:02, 1.27it/s, loss=0.0267, lr=2.39e-05, step=1832] Training: 18%|█▊ | 1833/10000 [23:10<1:43:50, 1.31it/s, loss=0.0267, lr=2.39e-05, step=1832] Training: 18%|█▊ | 1833/10000 [23:10<1:43:50, 1.31it/s, loss=0.0240, lr=2.39e-05, step=1833] Training: 18%|█▊ | 1834/10000 [23:11<1:33:28, 1.46it/s, loss=0.0240, lr=2.39e-05, step=1833] Training: 18%|█▊ | 1834/10000 [23:11<1:33:28, 1.46it/s, loss=0.0355, lr=2.39e-05, step=1834] Training: 18%|█▊ | 1835/10000 [23:12<2:03:46, 1.10it/s, loss=0.0355, lr=2.39e-05, step=1834] Training: 18%|█▊ | 1835/10000 [23:12<2:03:46, 1.10it/s, loss=0.0055, lr=2.39e-05, step=1835] Training: 18%|█▊ | 1836/10000 [23:13<2:04:05, 1.10it/s, loss=0.0055, lr=2.39e-05, step=1835] Training: 18%|█▊ | 1836/10000 [23:13<2:04:05, 1.10it/s, loss=0.0537, lr=2.39e-05, step=1836] Training: 18%|█▊ | 1837/10000 [23:14<1:56:53, 1.16it/s, loss=0.0537, lr=2.39e-05, step=1836] Training: 18%|█▊ | 1837/10000 [23:14<1:56:53, 1.16it/s, loss=0.0181, lr=2.39e-05, step=1837] Training: 18%|█▊ | 1838/10000 [23:15<1:51:20, 1.22it/s, loss=0.0181, lr=2.39e-05, step=1837] Training: 18%|█▊ | 1838/10000 [23:15<1:51:20, 1.22it/s, loss=0.0219, lr=2.39e-05, step=1838] Training: 18%|█▊ | 1839/10000 [23:16<1:52:31, 1.21it/s, loss=0.0219, lr=2.39e-05, step=1838] Training: 18%|█▊ | 1839/10000 [23:16<1:52:31, 1.21it/s, loss=0.0163, lr=2.39e-05, step=1839]19:07:48.650 [I] step=1840 loss=0.0256 smoothed_loss=0.0256 lr=2.39e-05 grad_norm=0.6718 step_time=0.6488s data_time=0.1612s it/s=1.235 eta_to_10000=6609.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1563 grad_arm_token_fuse=0.0527 grad_shared_expert=0.5841 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1840/10000 [23:16<1:46:50, 1.27it/s, loss=0.0163, lr=2.39e-05, step=1839] Training: 18%|█▊ | 1840/10000 [23:16<1:46:50, 1.27it/s, loss=0.0256, lr=2.39e-05, step=1840] Training: 18%|█▊ | 1841/10000 [23:17<1:39:27, 1.37it/s, loss=0.0256, lr=2.39e-05, step=1840] Training: 18%|█▊ | 1841/10000 [23:17<1:39:27, 1.37it/s, loss=0.0106, lr=2.39e-05, step=1841] Training: 18%|█▊ | 1842/10000 [23:18<1:53:15, 1.20it/s, loss=0.0106, lr=2.39e-05, step=1841] Training: 18%|█▊ | 1842/10000 [23:18<1:53:15, 1.20it/s, loss=0.0190, lr=2.39e-05, step=1842] Training: 18%|█▊ | 1843/10000 [23:19<2:01:22, 1.12it/s, loss=0.0190, lr=2.39e-05, step=1842] Training: 18%|█▊ | 1843/10000 [23:19<2:01:22, 1.12it/s, loss=0.0231, lr=2.39e-05, step=1843] Training: 18%|█▊ | 1844/10000 [23:20<1:52:18, 1.21it/s, loss=0.0231, lr=2.39e-05, step=1843] Training: 18%|█▊ | 1844/10000 [23:20<1:52:18, 1.21it/s, loss=0.0299, lr=2.39e-05, step=1844] Training: 18%|█▊ | 1845/10000 [23:20<1:45:06, 1.29it/s, loss=0.0299, lr=2.39e-05, step=1844] Training: 18%|█▊ | 1845/10000 [23:20<1:45:06, 1.29it/s, loss=0.0196, lr=2.39e-05, step=1845] Training: 18%|█▊ | 1846/10000 [23:21<1:41:04, 1.34it/s, loss=0.0196, lr=2.39e-05, step=1845] Training: 18%|█▊ | 1846/10000 [23:21<1:41:04, 1.34it/s, loss=0.0218, lr=2.39e-05, step=1846] Training: 18%|█▊ | 1847/10000 [23:22<1:51:45, 1.22it/s, loss=0.0218, lr=2.39e-05, step=1846] Training: 18%|█▊ | 1847/10000 [23:22<1:51:45, 1.22it/s, loss=0.0433, lr=2.39e-05, step=1847] Training: 18%|█▊ | 1848/10000 [23:23<1:50:09, 1.23it/s, loss=0.0433, lr=2.39e-05, step=1847] Training: 18%|█▊ | 1848/10000 [23:23<1:50:09, 1.23it/s, loss=0.0337, lr=2.39e-05, step=1848] Training: 18%|█▊ | 1849/10000 [23:23<1:44:41, 1.30it/s, loss=0.0337, lr=2.39e-05, step=1848] Training: 18%|█▊ | 1849/10000 [23:23<1:44:41, 1.30it/s, loss=0.0115, lr=2.39e-05, step=1849]19:07:57.103 [I] step=1850 loss=0.0144 smoothed_loss=0.0238 lr=2.39e-05 grad_norm=0.6204 step_time=0.6841s data_time=0.1612s it/s=1.183 eta_to_10000=6887.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0118 grad_action_out_proj_arms=0.1351 grad_arm_token_fuse=0.0592 grad_shared_expert=0.4311 (18633:train_pytorch.py:850) + Training: 18%|█▊ | 1850/10000 [23:25<2:05:33, 1.08it/s, loss=0.0115, lr=2.39e-05, step=1849] Training: 18%|█▊ | 1850/10000 [23:25<2:05:33, 1.08it/s, loss=0.0144, lr=2.39e-05, step=1850] Training: 19%|█▊ | 1851/10000 [23:25<1:52:09, 1.21it/s, loss=0.0144, lr=2.39e-05, step=1850] Training: 19%|█▊ | 1851/10000 [23:25<1:52:09, 1.21it/s, loss=0.0089, lr=2.39e-05, step=1851] Training: 19%|█▊ | 1852/10000 [23:26<1:41:59, 1.33it/s, loss=0.0089, lr=2.39e-05, step=1851] Training: 19%|█▊ | 1852/10000 [23:26<1:41:59, 1.33it/s, loss=0.0677, lr=2.39e-05, step=1852] Training: 19%|█▊ | 1853/10000 [23:27<1:39:35, 1.36it/s, loss=0.0677, lr=2.39e-05, step=1852] Training: 19%|█▊ | 1853/10000 [23:27<1:39:35, 1.36it/s, loss=0.0187, lr=2.39e-05, step=1853] Training: 19%|█▊ | 1854/10000 [23:28<1:46:52, 1.27it/s, loss=0.0187, lr=2.39e-05, step=1853] Training: 19%|█▊ | 1854/10000 [23:28<1:46:52, 1.27it/s, loss=0.0238, lr=2.39e-05, step=1854] Training: 19%|█▊ | 1855/10000 [23:28<1:39:06, 1.37it/s, loss=0.0238, lr=2.39e-05, step=1854] Training: 19%|█▊ | 1855/10000 [23:28<1:39:06, 1.37it/s, loss=0.0570, lr=2.39e-05, step=1855] Training: 19%|█▊ | 1856/10000 [23:29<1:31:11, 1.49it/s, loss=0.0570, lr=2.39e-05, step=1855] Training: 19%|█▊ | 1856/10000 [23:29<1:31:11, 1.49it/s, loss=0.0304, lr=2.39e-05, step=1856] Training: 19%|█▊ | 1857/10000 [23:30<1:44:41, 1.30it/s, loss=0.0304, lr=2.39e-05, step=1856] Training: 19%|█▊ | 1857/10000 [23:30<1:44:41, 1.30it/s, loss=0.0803, lr=2.39e-05, step=1857] Training: 19%|█▊ | 1858/10000 [23:31<1:49:57, 1.23it/s, loss=0.0803, lr=2.39e-05, step=1857] Training: 19%|█▊ | 1858/10000 [23:31<1:49:57, 1.23it/s, loss=0.0285, lr=2.39e-05, step=1858] Training: 19%|█▊ | 1859/10000 [23:31<1:48:35, 1.25it/s, loss=0.0285, lr=2.39e-05, step=1858] Training: 19%|█▊ | 1859/10000 [23:31<1:48:35, 1.25it/s, loss=0.0350, lr=2.39e-05, step=1859]19:08:04.414 [I] step=1860 loss=0.0175 smoothed_loss=0.0321 lr=2.39e-05 grad_norm=0.7131 step_time=0.5917s data_time=0.1394s it/s=1.368 eta_to_10000=5950.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0178 grad_action_out_proj_arms=0.2110 grad_arm_token_fuse=0.1045 grad_shared_expert=0.5490 (18633:train_pytorch.py:850) + Training: 19%|█▊ | 1860/10000 [23:32<1:45:15, 1.29it/s, loss=0.0350, lr=2.39e-05, step=1859] Training: 19%|█▊ | 1860/10000 [23:32<1:45:15, 1.29it/s, loss=0.0175, lr=2.39e-05, step=1860] Training: 19%|█▊ | 1861/10000 [23:33<1:41:19, 1.34it/s, loss=0.0175, lr=2.39e-05, step=1860] Training: 19%|█▊ | 1861/10000 [23:33<1:41:19, 1.34it/s, loss=0.0284, lr=2.39e-05, step=1861] Training: 19%|█▊ | 1862/10000 [23:34<1:48:37, 1.25it/s, loss=0.0284, lr=2.39e-05, step=1861] Training: 19%|█▊ | 1862/10000 [23:34<1:48:37, 1.25it/s, loss=0.0563, lr=2.39e-05, step=1862] Training: 19%|█▊ | 1863/10000 [23:34<1:40:39, 1.35it/s, loss=0.0563, lr=2.39e-05, step=1862] Training: 19%|█▊ | 1863/10000 [23:34<1:40:39, 1.35it/s, loss=0.0216, lr=2.39e-05, step=1863] Training: 19%|█▊ | 1864/10000 [23:35<1:42:16, 1.33it/s, loss=0.0216, lr=2.39e-05, step=1863] Training: 19%|█▊ | 1864/10000 [23:35<1:42:16, 1.33it/s, loss=0.0361, lr=2.39e-05, step=1864] Training: 19%|█▊ | 1865/10000 [23:36<1:38:19, 1.38it/s, loss=0.0361, lr=2.39e-05, step=1864] Training: 19%|█▊ | 1865/10000 [23:36<1:38:19, 1.38it/s, loss=0.0244, lr=2.39e-05, step=1865] Training: 19%|█▊ | 1866/10000 [23:37<1:42:49, 1.32it/s, loss=0.0244, lr=2.39e-05, step=1865] Training: 19%|█▊ | 1866/10000 [23:37<1:42:49, 1.32it/s, loss=0.0268, lr=2.39e-05, step=1866] Training: 19%|█▊ | 1867/10000 [23:37<1:43:46, 1.31it/s, loss=0.0268, lr=2.39e-05, step=1866] Training: 19%|█▊ | 1867/10000 [23:37<1:43:46, 1.31it/s, loss=0.0310, lr=2.39e-05, step=1867] Training: 19%|█▊ | 1868/10000 [23:38<1:37:54, 1.38it/s, loss=0.0310, lr=2.39e-05, step=1867] Training: 19%|█▊ | 1868/10000 [23:38<1:37:54, 1.38it/s, loss=0.0355, lr=2.39e-05, step=1868] Training: 19%|█▊ | 1869/10000 [23:39<1:35:02, 1.43it/s, loss=0.0355, lr=2.39e-05, step=1868] Training: 19%|█▊ | 1869/10000 [23:39<1:35:02, 1.43it/s, loss=0.0181, lr=2.39e-05, step=1869]19:08:11.658 [I] step=1870 loss=0.0386 smoothed_loss=0.0315 lr=2.39e-05 grad_norm=0.6103 step_time=0.6045s data_time=0.1199s it/s=1.381 eta_to_10000=5888.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1448 grad_arm_token_fuse=0.0750 grad_shared_expert=0.6229 (18633:train_pytorch.py:850) + Training: 19%|█▊ | 1870/10000 [23:39<1:34:59, 1.43it/s, loss=0.0181, lr=2.39e-05, step=1869] Training: 19%|█▊ | 1870/10000 [23:39<1:34:59, 1.43it/s, loss=0.0386, lr=2.39e-05, step=1870] Training: 19%|█▊ | 1871/10000 [23:40<1:40:27, 1.35it/s, loss=0.0386, lr=2.39e-05, step=1870] Training: 19%|█▊ | 1871/10000 [23:40<1:40:27, 1.35it/s, loss=0.0533, lr=2.39e-05, step=1871] Training: 19%|█▊ | 1872/10000 [23:41<1:35:58, 1.41it/s, loss=0.0533, lr=2.39e-05, step=1871] Training: 19%|█▊ | 1872/10000 [23:41<1:35:58, 1.41it/s, loss=0.0516, lr=2.39e-05, step=1872] Training: 19%|█▊ | 1873/10000 [23:42<1:42:37, 1.32it/s, loss=0.0516, lr=2.39e-05, step=1872] Training: 19%|█▊ | 1873/10000 [23:42<1:42:37, 1.32it/s, loss=0.0344, lr=2.39e-05, step=1873] Training: 19%|█▊ | 1874/10000 [23:43<1:45:58, 1.28it/s, loss=0.0344, lr=2.39e-05, step=1873] Training: 19%|█▊ | 1874/10000 [23:43<1:45:58, 1.28it/s, loss=0.0301, lr=2.39e-05, step=1874] Training: 19%|█▉ | 1875/10000 [23:43<1:45:55, 1.28it/s, loss=0.0301, lr=2.39e-05, step=1874] Training: 19%|█▉ | 1875/10000 [23:43<1:45:55, 1.28it/s, loss=0.0098, lr=2.39e-05, step=1875] Training: 19%|█▉ | 1876/10000 [23:44<1:40:37, 1.35it/s, loss=0.0098, lr=2.39e-05, step=1875] Training: 19%|█▉ | 1876/10000 [23:44<1:40:37, 1.35it/s, loss=0.0553, lr=2.39e-05, step=1876] Training: 19%|█▉ | 1877/10000 [23:45<1:59:20, 1.13it/s, loss=0.0553, lr=2.39e-05, step=1876] Training: 19%|█▉ | 1877/10000 [23:45<1:59:20, 1.13it/s, loss=0.0268, lr=2.39e-05, step=1877] Training: 19%|█▉ | 1878/10000 [23:46<2:14:17, 1.01it/s, loss=0.0268, lr=2.39e-05, step=1877] Training: 19%|█▉ | 1878/10000 [23:46<2:14:17, 1.01it/s, loss=0.0563, lr=2.39e-05, step=1878] Training: 19%|█▉ | 1879/10000 [23:48<2:19:18, 1.03s/it, loss=0.0563, lr=2.39e-05, step=1878] Training: 19%|█▉ | 1879/10000 [23:48<2:19:18, 1.03s/it, loss=0.0317, lr=2.39e-05, step=1879]19:08:20.767 [I] step=1880 loss=0.0224 smoothed_loss=0.0343 lr=2.39e-05 grad_norm=0.6823 step_time=0.6999s data_time=0.2111s it/s=1.098 eta_to_10000=7396.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0111 grad_action_out_proj_arms=0.1645 grad_arm_token_fuse=0.0607 grad_shared_expert=1.2589 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1880/10000 [23:48<2:15:11, 1.00it/s, loss=0.0317, lr=2.39e-05, step=1879] Training: 19%|█▉ | 1880/10000 [23:48<2:15:11, 1.00it/s, loss=0.0224, lr=2.39e-05, step=1880] Training: 19%|█▉ | 1881/10000 [23:49<1:54:41, 1.18it/s, loss=0.0224, lr=2.39e-05, step=1880] Training: 19%|█▉ | 1881/10000 [23:49<1:54:41, 1.18it/s, loss=0.0127, lr=2.38e-05, step=1881] Training: 19%|█▉ | 1882/10000 [23:50<1:47:36, 1.26it/s, loss=0.0127, lr=2.38e-05, step=1881] Training: 19%|█▉ | 1882/10000 [23:50<1:47:36, 1.26it/s, loss=0.0128, lr=2.38e-05, step=1882] Training: 19%|█▉ | 1883/10000 [23:50<1:44:48, 1.29it/s, loss=0.0128, lr=2.38e-05, step=1882] Training: 19%|█▉ | 1883/10000 [23:50<1:44:48, 1.29it/s, loss=0.0110, lr=2.38e-05, step=1883] Training: 19%|█▉ | 1884/10000 [23:51<1:54:15, 1.18it/s, loss=0.0110, lr=2.38e-05, step=1883] Training: 19%|█▉ | 1884/10000 [23:51<1:54:15, 1.18it/s, loss=0.0223, lr=2.38e-05, step=1884] Training: 19%|█▉ | 1885/10000 [23:52<1:49:56, 1.23it/s, loss=0.0223, lr=2.38e-05, step=1884] Training: 19%|█▉ | 1885/10000 [23:52<1:49:56, 1.23it/s, loss=0.0248, lr=2.38e-05, step=1885] Training: 19%|█▉ | 1886/10000 [23:53<2:02:22, 1.11it/s, loss=0.0248, lr=2.38e-05, step=1885] Training: 19%|█▉ | 1886/10000 [23:53<2:02:22, 1.11it/s, loss=0.0145, lr=2.38e-05, step=1886] Training: 19%|█▉ | 1887/10000 [23:54<1:58:50, 1.14it/s, loss=0.0145, lr=2.38e-05, step=1886] Training: 19%|█▉ | 1887/10000 [23:54<1:58:50, 1.14it/s, loss=0.0575, lr=2.38e-05, step=1887] Training: 19%|█▉ | 1888/10000 [23:55<1:45:36, 1.28it/s, loss=0.0575, lr=2.38e-05, step=1887] Training: 19%|█▉ | 1888/10000 [23:55<1:45:36, 1.28it/s, loss=0.0135, lr=2.38e-05, step=1888] Training: 19%|█▉ | 1889/10000 [23:55<1:49:09, 1.24it/s, loss=0.0135, lr=2.38e-05, step=1888] Training: 19%|█▉ | 1889/10000 [23:55<1:49:09, 1.24it/s, loss=0.0194, lr=2.38e-05, step=1889]19:08:28.390 [I] step=1890 loss=0.0142 smoothed_loss=0.0256 lr=2.38e-05 grad_norm=0.5904 step_time=0.6097s data_time=0.1527s it/s=1.312 eta_to_10000=6179.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1699 grad_arm_token_fuse=0.1005 grad_shared_expert=0.3683 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1890/10000 [23:56<1:41:31, 1.33it/s, loss=0.0194, lr=2.38e-05, step=1889] Training: 19%|█▉ | 1890/10000 [23:56<1:41:31, 1.33it/s, loss=0.0142, lr=2.38e-05, step=1890] Training: 19%|█▉ | 1891/10000 [23:57<1:49:25, 1.24it/s, loss=0.0142, lr=2.38e-05, step=1890] Training: 19%|█▉ | 1891/10000 [23:57<1:49:25, 1.24it/s, loss=0.0347, lr=2.38e-05, step=1891] Training: 19%|█▉ | 1892/10000 [23:58<2:00:31, 1.12it/s, loss=0.0347, lr=2.38e-05, step=1891] Training: 19%|█▉ | 1892/10000 [23:58<2:00:31, 1.12it/s, loss=0.0224, lr=2.38e-05, step=1892] Training: 19%|█▉ | 1893/10000 [23:59<1:58:53, 1.14it/s, loss=0.0224, lr=2.38e-05, step=1892] Training: 19%|█▉ | 1893/10000 [23:59<1:58:53, 1.14it/s, loss=0.0163, lr=2.38e-05, step=1893] Training: 19%|█▉ | 1894/10000 [24:00<1:57:42, 1.15it/s, loss=0.0163, lr=2.38e-05, step=1893] Training: 19%|█▉ | 1894/10000 [24:00<1:57:42, 1.15it/s, loss=0.0189, lr=2.38e-05, step=1894] Training: 19%|█▉ | 1895/10000 [24:01<1:54:37, 1.18it/s, loss=0.0189, lr=2.38e-05, step=1894] Training: 19%|█▉ | 1895/10000 [24:01<1:54:37, 1.18it/s, loss=0.0258, lr=2.38e-05, step=1895] Training: 19%|█▉ | 1896/10000 [24:01<1:45:27, 1.28it/s, loss=0.0258, lr=2.38e-05, step=1895] Training: 19%|█▉ | 1896/10000 [24:01<1:45:27, 1.28it/s, loss=0.0741, lr=2.38e-05, step=1896] Training: 19%|█▉ | 1897/10000 [24:02<1:39:59, 1.35it/s, loss=0.0741, lr=2.38e-05, step=1896] Training: 19%|█▉ | 1897/10000 [24:02<1:39:59, 1.35it/s, loss=0.0310, lr=2.38e-05, step=1897] Training: 19%|█▉ | 1898/10000 [24:03<1:44:06, 1.30it/s, loss=0.0310, lr=2.38e-05, step=1897] Training: 19%|█▉ | 1898/10000 [24:03<1:44:06, 1.30it/s, loss=0.0090, lr=2.38e-05, step=1898] Training: 19%|█▉ | 1899/10000 [24:04<1:52:21, 1.20it/s, loss=0.0090, lr=2.38e-05, step=1898] Training: 19%|█▉ | 1899/10000 [24:04<1:52:21, 1.20it/s, loss=0.0374, lr=2.38e-05, step=1899]19:08:36.915 [I] step=1900 loss=0.0122 smoothed_loss=0.0270 lr=2.38e-05 grad_norm=0.6805 step_time=0.6622s data_time=0.1902s it/s=1.173 eta_to_10000=6904.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0126 grad_action_out_proj_arms=0.1887 grad_arm_token_fuse=0.0605 grad_shared_expert=0.4870 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1900/10000 [24:05<1:55:30, 1.17it/s, loss=0.0374, lr=2.38e-05, step=1899] Training: 19%|█▉ | 1900/10000 [24:05<1:55:30, 1.17it/s, loss=0.0122, lr=2.38e-05, step=1900] Training: 19%|█▉ | 1901/10000 [24:05<1:45:16, 1.28it/s, loss=0.0122, lr=2.38e-05, step=1900] Training: 19%|█▉ | 1901/10000 [24:05<1:45:16, 1.28it/s, loss=0.0489, lr=2.38e-05, step=1901] Training: 19%|█▉ | 1902/10000 [24:06<1:34:03, 1.43it/s, loss=0.0489, lr=2.38e-05, step=1901] Training: 19%|█▉ | 1902/10000 [24:06<1:34:03, 1.43it/s, loss=0.0276, lr=2.38e-05, step=1902] Training: 19%|█▉ | 1903/10000 [24:06<1:29:05, 1.51it/s, loss=0.0276, lr=2.38e-05, step=1902] Training: 19%|█▉ | 1903/10000 [24:06<1:29:05, 1.51it/s, loss=0.0079, lr=2.38e-05, step=1903] Training: 19%|█▉ | 1904/10000 [24:07<1:31:10, 1.48it/s, loss=0.0079, lr=2.38e-05, step=1903] Training: 19%|█▉ | 1904/10000 [24:07<1:31:10, 1.48it/s, loss=0.0121, lr=2.38e-05, step=1904] Training: 19%|█▉ | 1905/10000 [24:08<1:29:28, 1.51it/s, loss=0.0121, lr=2.38e-05, step=1904] Training: 19%|█▉ | 1905/10000 [24:08<1:29:28, 1.51it/s, loss=0.0109, lr=2.38e-05, step=1905] Training: 19%|█▉ | 1906/10000 [24:08<1:32:55, 1.45it/s, loss=0.0109, lr=2.38e-05, step=1905] Training: 19%|█▉ | 1906/10000 [24:08<1:32:55, 1.45it/s, loss=0.0998, lr=2.38e-05, step=1906] Training: 19%|█▉ | 1907/10000 [24:09<1:42:44, 1.31it/s, loss=0.0998, lr=2.38e-05, step=1906] Training: 19%|█▉ | 1907/10000 [24:09<1:42:44, 1.31it/s, loss=0.0490, lr=2.38e-05, step=1907] Training: 19%|█▉ | 1908/10000 [24:10<1:38:41, 1.37it/s, loss=0.0490, lr=2.38e-05, step=1907] Training: 19%|█▉ | 1908/10000 [24:10<1:38:41, 1.37it/s, loss=0.0149, lr=2.38e-05, step=1908] Training: 19%|█▉ | 1909/10000 [24:11<1:35:45, 1.41it/s, loss=0.0149, lr=2.38e-05, step=1908] Training: 19%|█▉ | 1909/10000 [24:11<1:35:45, 1.41it/s, loss=0.0100, lr=2.38e-05, step=1909]19:08:43.548 [I] step=1910 loss=0.0766 smoothed_loss=0.0340 lr=2.38e-05 grad_norm=0.6078 step_time=0.5408s data_time=0.1225s it/s=1.508 eta_to_10000=5365.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0341 grad_action_out_proj_arms=0.2469 grad_arm_token_fuse=0.1871 grad_shared_expert=0.7141 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1910/10000 [24:11<1:31:26, 1.47it/s, loss=0.0100, lr=2.38e-05, step=1909] Training: 19%|█▉ | 1910/10000 [24:11<1:31:26, 1.47it/s, loss=0.0766, lr=2.38e-05, step=1910] Training: 19%|█▉ | 1911/10000 [24:12<1:24:06, 1.60it/s, loss=0.0766, lr=2.38e-05, step=1910] Training: 19%|█▉ | 1911/10000 [24:12<1:24:06, 1.60it/s, loss=0.0178, lr=2.38e-05, step=1911] Training: 19%|█▉ | 1912/10000 [24:12<1:28:00, 1.53it/s, loss=0.0178, lr=2.38e-05, step=1911] Training: 19%|█▉ | 1912/10000 [24:12<1:28:00, 1.53it/s, loss=0.0069, lr=2.38e-05, step=1912] Training: 19%|█▉ | 1913/10000 [24:13<1:30:38, 1.49it/s, loss=0.0069, lr=2.38e-05, step=1912] Training: 19%|█▉ | 1913/10000 [24:13<1:30:38, 1.49it/s, loss=0.0171, lr=2.38e-05, step=1913] Training: 19%|█▉ | 1914/10000 [24:14<1:43:57, 1.30it/s, loss=0.0171, lr=2.38e-05, step=1913] Training: 19%|█▉ | 1914/10000 [24:14<1:43:57, 1.30it/s, loss=0.0284, lr=2.38e-05, step=1914] Training: 19%|█▉ | 1915/10000 [24:15<1:51:29, 1.21it/s, loss=0.0284, lr=2.38e-05, step=1914] Training: 19%|█▉ | 1915/10000 [24:15<1:51:29, 1.21it/s, loss=0.0282, lr=2.38e-05, step=1915] Training: 19%|█▉ | 1916/10000 [24:16<1:43:34, 1.30it/s, loss=0.0282, lr=2.38e-05, step=1915] Training: 19%|█▉ | 1916/10000 [24:16<1:43:34, 1.30it/s, loss=0.0229, lr=2.38e-05, step=1916] Training: 19%|█▉ | 1917/10000 [24:16<1:36:24, 1.40it/s, loss=0.0229, lr=2.38e-05, step=1916] Training: 19%|█▉ | 1917/10000 [24:16<1:36:24, 1.40it/s, loss=0.0028, lr=2.38e-05, step=1917] Training: 19%|█▉ | 1918/10000 [24:17<1:35:18, 1.41it/s, loss=0.0028, lr=2.38e-05, step=1917] Training: 19%|█▉ | 1918/10000 [24:17<1:35:18, 1.41it/s, loss=0.0350, lr=2.38e-05, step=1918] Training: 19%|█▉ | 1919/10000 [24:18<1:34:31, 1.42it/s, loss=0.0350, lr=2.38e-05, step=1918] Training: 19%|█▉ | 1919/10000 [24:18<1:34:31, 1.42it/s, loss=0.0259, lr=2.38e-05, step=1919]19:08:51.009 [I] step=1920 loss=0.0517 smoothed_loss=0.0289 lr=2.38e-05 grad_norm=0.5853 step_time=0.5847s data_time=0.1614s it/s=1.340 eta_to_10000=6027.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0223 grad_action_out_proj_arms=0.1658 grad_arm_token_fuse=0.1152 grad_shared_expert=0.4647 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1920/10000 [24:19<1:45:10, 1.28it/s, loss=0.0259, lr=2.38e-05, step=1919] Training: 19%|█▉ | 1920/10000 [24:19<1:45:10, 1.28it/s, loss=0.0517, lr=2.38e-05, step=1920] Training: 19%|█▉ | 1921/10000 [24:20<1:54:08, 1.18it/s, loss=0.0517, lr=2.38e-05, step=1920] Training: 19%|█▉ | 1921/10000 [24:20<1:54:08, 1.18it/s, loss=0.0588, lr=2.38e-05, step=1921] Training: 19%|█▉ | 1922/10000 [24:21<1:57:46, 1.14it/s, loss=0.0588, lr=2.38e-05, step=1921] Training: 19%|█▉ | 1922/10000 [24:21<1:57:46, 1.14it/s, loss=0.0098, lr=2.38e-05, step=1922] Training: 19%|█▉ | 1923/10000 [24:21<1:47:22, 1.25it/s, loss=0.0098, lr=2.38e-05, step=1922] Training: 19%|█▉ | 1923/10000 [24:21<1:47:22, 1.25it/s, loss=0.0221, lr=2.38e-05, step=1923] Training: 19%|█▉ | 1924/10000 [24:22<1:49:42, 1.23it/s, loss=0.0221, lr=2.38e-05, step=1923] Training: 19%|█▉ | 1924/10000 [24:22<1:49:42, 1.23it/s, loss=0.0653, lr=2.38e-05, step=1924] Training: 19%|█▉ | 1925/10000 [24:23<1:46:55, 1.26it/s, loss=0.0653, lr=2.38e-05, step=1924] Training: 19%|█▉ | 1925/10000 [24:23<1:46:55, 1.26it/s, loss=0.0205, lr=2.38e-05, step=1925] Training: 19%|█▉ | 1926/10000 [24:24<1:54:08, 1.18it/s, loss=0.0205, lr=2.38e-05, step=1925] Training: 19%|█▉ | 1926/10000 [24:24<1:54:08, 1.18it/s, loss=0.0308, lr=2.38e-05, step=1926] Training: 19%|█▉ | 1927/10000 [24:25<1:49:09, 1.23it/s, loss=0.0308, lr=2.38e-05, step=1926] Training: 19%|█▉ | 1927/10000 [24:25<1:49:09, 1.23it/s, loss=0.0244, lr=2.38e-05, step=1927] Training: 19%|█▉ | 1928/10000 [24:26<2:08:41, 1.05it/s, loss=0.0244, lr=2.38e-05, step=1927] Training: 19%|█▉ | 1928/10000 [24:26<2:08:41, 1.05it/s, loss=0.0467, lr=2.38e-05, step=1928] Training: 19%|█▉ | 1929/10000 [24:27<2:14:29, 1.00it/s, loss=0.0467, lr=2.38e-05, step=1928] Training: 19%|█▉ | 1929/10000 [24:27<2:14:29, 1.00it/s, loss=0.0220, lr=2.38e-05, step=1929]19:08:59.915 [I] step=1930 loss=0.0736 smoothed_loss=0.0354 lr=2.38e-05 grad_norm=0.6598 step_time=0.7107s data_time=0.1798s it/s=1.123 eta_to_10000=7184.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0259 grad_action_out_proj_arms=0.1859 grad_arm_token_fuse=0.1342 grad_shared_expert=0.6000 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1930/10000 [24:28<2:00:17, 1.12it/s, loss=0.0220, lr=2.38e-05, step=1929] Training: 19%|█▉ | 1930/10000 [24:28<2:00:17, 1.12it/s, loss=0.0736, lr=2.38e-05, step=1930] Training: 19%|█▉ | 1931/10000 [24:28<1:51:22, 1.21it/s, loss=0.0736, lr=2.38e-05, step=1930] Training: 19%|█▉ | 1931/10000 [24:28<1:51:22, 1.21it/s, loss=0.0844, lr=2.38e-05, step=1931] Training: 19%|█▉ | 1932/10000 [24:29<1:39:49, 1.35it/s, loss=0.0844, lr=2.38e-05, step=1931] Training: 19%|█▉ | 1932/10000 [24:29<1:39:49, 1.35it/s, loss=0.0113, lr=2.38e-05, step=1932] Training: 19%|█▉ | 1933/10000 [24:29<1:34:26, 1.42it/s, loss=0.0113, lr=2.38e-05, step=1932] Training: 19%|█▉ | 1933/10000 [24:29<1:34:26, 1.42it/s, loss=0.0073, lr=2.38e-05, step=1933] Training: 19%|█▉ | 1934/10000 [24:30<1:31:42, 1.47it/s, loss=0.0073, lr=2.38e-05, step=1933] Training: 19%|█▉ | 1934/10000 [24:30<1:31:42, 1.47it/s, loss=0.0667, lr=2.38e-05, step=1934] Training: 19%|█▉ | 1935/10000 [24:31<1:29:13, 1.51it/s, loss=0.0667, lr=2.38e-05, step=1934] Training: 19%|█▉ | 1935/10000 [24:31<1:29:13, 1.51it/s, loss=0.0305, lr=2.38e-05, step=1935] Training: 19%|█▉ | 1936/10000 [24:32<1:39:59, 1.34it/s, loss=0.0305, lr=2.38e-05, step=1935] Training: 19%|█▉ | 1936/10000 [24:32<1:39:59, 1.34it/s, loss=0.0219, lr=2.38e-05, step=1936] Training: 19%|█▉ | 1937/10000 [24:33<1:54:51, 1.17it/s, loss=0.0219, lr=2.38e-05, step=1936] Training: 19%|█▉ | 1937/10000 [24:33<1:54:51, 1.17it/s, loss=0.0139, lr=2.38e-05, step=1937] Training: 19%|█▉ | 1938/10000 [24:33<1:49:04, 1.23it/s, loss=0.0139, lr=2.38e-05, step=1937] Training: 19%|█▉ | 1938/10000 [24:33<1:49:04, 1.23it/s, loss=0.0194, lr=2.38e-05, step=1938] Training: 19%|█▉ | 1939/10000 [24:34<1:42:19, 1.31it/s, loss=0.0194, lr=2.38e-05, step=1938] Training: 19%|█▉ | 1939/10000 [24:34<1:42:19, 1.31it/s, loss=0.0105, lr=2.38e-05, step=1939]19:09:07.130 [I] step=1940 loss=0.0893 smoothed_loss=0.0357 lr=2.38e-05 grad_norm=0.6141 step_time=0.5679s data_time=0.1536s it/s=1.386 eta_to_10000=5813.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0180 grad_action_out_proj_arms=0.1976 grad_arm_token_fuse=0.0957 grad_shared_expert=0.7395 (18633:train_pytorch.py:850) + Training: 19%|█▉ | 1940/10000 [24:35<1:41:09, 1.33it/s, loss=0.0105, lr=2.38e-05, step=1939] Training: 19%|█▉ | 1940/10000 [24:35<1:41:09, 1.33it/s, loss=0.0893, lr=2.38e-05, step=1940] Training: 19%|█▉ | 1941/10000 [24:35<1:35:23, 1.41it/s, loss=0.0893, lr=2.38e-05, step=1940] Training: 19%|█▉ | 1941/10000 [24:35<1:35:23, 1.41it/s, loss=0.0456, lr=2.37e-05, step=1941] Training: 19%|█▉ | 1942/10000 [24:36<1:35:15, 1.41it/s, loss=0.0456, lr=2.37e-05, step=1941] Training: 19%|█▉ | 1942/10000 [24:36<1:35:15, 1.41it/s, loss=0.0463, lr=2.37e-05, step=1942] Training: 19%|█▉ | 1943/10000 [24:37<1:42:14, 1.31it/s, loss=0.0463, lr=2.37e-05, step=1942] Training: 19%|█▉ | 1943/10000 [24:37<1:42:14, 1.31it/s, loss=0.0320, lr=2.37e-05, step=1943] Training: 19%|█▉ | 1944/10000 [24:38<1:43:53, 1.29it/s, loss=0.0320, lr=2.37e-05, step=1943] Training: 19%|█▉ | 1944/10000 [24:38<1:43:53, 1.29it/s, loss=0.0901, lr=2.37e-05, step=1944] Training: 19%|█▉ | 1945/10000 [24:38<1:40:38, 1.33it/s, loss=0.0901, lr=2.37e-05, step=1944] Training: 19%|█▉ | 1945/10000 [24:38<1:40:38, 1.33it/s, loss=0.0224, lr=2.37e-05, step=1945] Training: 19%|█▉ | 1946/10000 [24:39<1:39:14, 1.35it/s, loss=0.0224, lr=2.37e-05, step=1945] Training: 19%|█▉ | 1946/10000 [24:39<1:39:14, 1.35it/s, loss=0.0169, lr=2.37e-05, step=1946] Training: 19%|█▉ | 1947/10000 [24:40<1:35:05, 1.41it/s, loss=0.0169, lr=2.37e-05, step=1946] Training: 19%|█▉ | 1947/10000 [24:40<1:35:05, 1.41it/s, loss=0.0327, lr=2.37e-05, step=1947] Training: 19%|█▉ | 1948/10000 [24:41<1:39:13, 1.35it/s, loss=0.0327, lr=2.37e-05, step=1947] Training: 19%|█▉ | 1948/10000 [24:41<1:39:13, 1.35it/s, loss=0.0149, lr=2.37e-05, step=1948] Training: 19%|█▉ | 1949/10000 [24:41<1:38:18, 1.36it/s, loss=0.0149, lr=2.37e-05, step=1948] Training: 19%|█▉ | 1949/10000 [24:41<1:38:18, 1.36it/s, loss=0.0129, lr=2.37e-05, step=1949]19:09:14.639 [I] step=1950 loss=0.0195 smoothed_loss=0.0317 lr=2.37e-05 grad_norm=0.5467 step_time=0.6137s data_time=0.1372s it/s=1.332 eta_to_10000=6044.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0171 grad_action_out_proj_arms=0.1826 grad_arm_token_fuse=0.0853 grad_shared_expert=0.5761 (18633:train_pytorch.py:850) + Training: 20%|█▉ | 1950/10000 [24:42<1:46:20, 1.26it/s, loss=0.0129, lr=2.37e-05, step=1949] Training: 20%|█▉ | 1950/10000 [24:42<1:46:20, 1.26it/s, loss=0.0195, lr=2.37e-05, step=1950] Training: 20%|█▉ | 1951/10000 [24:43<1:35:45, 1.40it/s, loss=0.0195, lr=2.37e-05, step=1950] Training: 20%|█▉ | 1951/10000 [24:43<1:35:45, 1.40it/s, loss=0.0214, lr=2.37e-05, step=1951] Training: 20%|█▉ | 1952/10000 [24:44<1:38:22, 1.36it/s, loss=0.0214, lr=2.37e-05, step=1951] Training: 20%|█▉ | 1952/10000 [24:44<1:38:22, 1.36it/s, loss=0.0673, lr=2.37e-05, step=1952] Training: 20%|█▉ | 1953/10000 [24:44<1:30:32, 1.48it/s, loss=0.0673, lr=2.37e-05, step=1952] Training: 20%|█▉ | 1953/10000 [24:44<1:30:32, 1.48it/s, loss=0.0781, lr=2.37e-05, step=1953] Training: 20%|█▉ | 1954/10000 [24:45<1:44:19, 1.29it/s, loss=0.0781, lr=2.37e-05, step=1953] Training: 20%|█▉ | 1954/10000 [24:45<1:44:19, 1.29it/s, loss=0.0283, lr=2.37e-05, step=1954] Training: 20%|█▉ | 1955/10000 [24:46<1:48:13, 1.24it/s, loss=0.0283, lr=2.37e-05, step=1954] Training: 20%|█▉ | 1955/10000 [24:46<1:48:13, 1.24it/s, loss=0.0207, lr=2.37e-05, step=1955] Training: 20%|█▉ | 1956/10000 [24:47<1:43:47, 1.29it/s, loss=0.0207, lr=2.37e-05, step=1955] Training: 20%|█▉ | 1956/10000 [24:47<1:43:47, 1.29it/s, loss=0.0542, lr=2.37e-05, step=1956] Training: 20%|█▉ | 1957/10000 [24:48<1:51:57, 1.20it/s, loss=0.0542, lr=2.37e-05, step=1956] Training: 20%|█▉ | 1957/10000 [24:48<1:51:57, 1.20it/s, loss=0.0921, lr=2.37e-05, step=1957] Training: 20%|█▉ | 1958/10000 [24:49<1:58:43, 1.13it/s, loss=0.0921, lr=2.37e-05, step=1957] Training: 20%|█▉ | 1958/10000 [24:49<1:58:43, 1.13it/s, loss=0.0689, lr=2.37e-05, step=1958] Training: 20%|█▉ | 1959/10000 [24:50<2:01:26, 1.10it/s, loss=0.0689, lr=2.37e-05, step=1958] Training: 20%|█▉ | 1959/10000 [24:50<2:01:26, 1.10it/s, loss=0.0417, lr=2.37e-05, step=1959]19:09:22.575 [I] step=1960 loss=0.0267 smoothed_loss=0.0435 lr=2.37e-05 grad_norm=0.7081 step_time=0.6319s data_time=0.1617s it/s=1.260 eta_to_10000=6380.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0267 grad_action_out_proj_arms=0.1927 grad_arm_token_fuse=0.1302 grad_shared_expert=0.7528 (18633:train_pytorch.py:850) + Training: 20%|█▉ | 1960/10000 [24:50<1:47:37, 1.25it/s, loss=0.0417, lr=2.37e-05, step=1959] Training: 20%|█▉ | 1960/10000 [24:50<1:47:37, 1.25it/s, loss=0.0267, lr=2.37e-05, step=1960] Training: 20%|█▉ | 1961/10000 [24:51<1:39:40, 1.34it/s, loss=0.0267, lr=2.37e-05, step=1960] Training: 20%|█▉ | 1961/10000 [24:51<1:39:40, 1.34it/s, loss=0.0124, lr=2.37e-05, step=1961] Training: 20%|█▉ | 1962/10000 [24:52<1:42:36, 1.31it/s, loss=0.0124, lr=2.37e-05, step=1961] Training: 20%|█▉ | 1962/10000 [24:52<1:42:36, 1.31it/s, loss=0.0452, lr=2.37e-05, step=1962] Training: 20%|█▉ | 1963/10000 [24:52<1:37:40, 1.37it/s, loss=0.0452, lr=2.37e-05, step=1962] Training: 20%|█▉ | 1963/10000 [24:52<1:37:40, 1.37it/s, loss=0.0126, lr=2.37e-05, step=1963] Training: 20%|█▉ | 1964/10000 [24:53<1:45:40, 1.27it/s, loss=0.0126, lr=2.37e-05, step=1963] Training: 20%|█▉ | 1964/10000 [24:53<1:45:40, 1.27it/s, loss=0.0118, lr=2.37e-05, step=1964] Training: 20%|█▉ | 1965/10000 [24:54<1:39:08, 1.35it/s, loss=0.0118, lr=2.37e-05, step=1964] Training: 20%|█▉ | 1965/10000 [24:54<1:39:08, 1.35it/s, loss=0.0171, lr=2.37e-05, step=1965] Training: 20%|█▉ | 1966/10000 [24:54<1:32:55, 1.44it/s, loss=0.0171, lr=2.37e-05, step=1965] Training: 20%|█▉ | 1966/10000 [24:54<1:32:55, 1.44it/s, loss=0.0155, lr=2.37e-05, step=1966] Training: 20%|█▉ | 1967/10000 [24:55<1:40:08, 1.34it/s, loss=0.0155, lr=2.37e-05, step=1966] Training: 20%|█▉ | 1967/10000 [24:55<1:40:08, 1.34it/s, loss=0.0335, lr=2.37e-05, step=1967] Training: 20%|█▉ | 1968/10000 [24:56<1:37:14, 1.38it/s, loss=0.0335, lr=2.37e-05, step=1967] Training: 20%|█▉ | 1968/10000 [24:56<1:37:14, 1.38it/s, loss=0.0185, lr=2.37e-05, step=1968] Training: 20%|█▉ | 1969/10000 [24:57<1:45:38, 1.27it/s, loss=0.0185, lr=2.37e-05, step=1968] Training: 20%|█▉ | 1969/10000 [24:57<1:45:38, 1.27it/s, loss=0.0927, lr=2.37e-05, step=1969]19:09:29.834 [I] step=1970 loss=0.0325 smoothed_loss=0.0364 lr=2.37e-05 grad_norm=0.6703 step_time=0.6026s data_time=0.1233s it/s=1.378 eta_to_10000=5828.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1819 grad_arm_token_fuse=0.0613 grad_shared_expert=0.7383 (18633:train_pytorch.py:850) + Training: 20%|█▉ | 1970/10000 [24:58<1:36:38, 1.38it/s, loss=0.0927, lr=2.37e-05, step=1969] Training: 20%|█▉ | 1970/10000 [24:58<1:36:38, 1.38it/s, loss=0.0325, lr=2.37e-05, step=1970] Training: 20%|█▉ | 1971/10000 [24:58<1:41:25, 1.32it/s, loss=0.0325, lr=2.37e-05, step=1970] Training: 20%|█▉ | 1971/10000 [24:58<1:41:25, 1.32it/s, loss=0.0088, lr=2.37e-05, step=1971] Training: 20%|█▉ | 1972/10000 [24:59<1:36:17, 1.39it/s, loss=0.0088, lr=2.37e-05, step=1971] Training: 20%|█▉ | 1972/10000 [24:59<1:36:17, 1.39it/s, loss=0.0216, lr=2.37e-05, step=1972] Training: 20%|█▉ | 1973/10000 [25:00<1:39:22, 1.35it/s, loss=0.0216, lr=2.37e-05, step=1972] Training: 20%|█▉ | 1973/10000 [25:00<1:39:22, 1.35it/s, loss=0.0357, lr=2.37e-05, step=1973] Training: 20%|█▉ | 1974/10000 [25:01<1:44:55, 1.27it/s, loss=0.0357, lr=2.37e-05, step=1973] Training: 20%|█▉ | 1974/10000 [25:01<1:44:55, 1.27it/s, loss=0.0380, lr=2.37e-05, step=1974] Training: 20%|█▉ | 1975/10000 [25:01<1:40:18, 1.33it/s, loss=0.0380, lr=2.37e-05, step=1974] Training: 20%|█▉ | 1975/10000 [25:01<1:40:18, 1.33it/s, loss=0.0403, lr=2.37e-05, step=1975] Training: 20%|█▉ | 1976/10000 [25:02<1:33:14, 1.43it/s, loss=0.0403, lr=2.37e-05, step=1975] Training: 20%|█▉ | 1976/10000 [25:02<1:33:14, 1.43it/s, loss=0.0345, lr=2.37e-05, step=1976] Training: 20%|█▉ | 1977/10000 [25:03<1:33:49, 1.43it/s, loss=0.0345, lr=2.37e-05, step=1976] Training: 20%|█▉ | 1977/10000 [25:03<1:33:49, 1.43it/s, loss=0.0241, lr=2.37e-05, step=1977] Training: 20%|█▉ | 1978/10000 [25:04<1:54:44, 1.17it/s, loss=0.0241, lr=2.37e-05, step=1977] Training: 20%|█▉ | 1978/10000 [25:04<1:54:44, 1.17it/s, loss=0.0170, lr=2.37e-05, step=1978] Training: 20%|█▉ | 1979/10000 [25:05<1:48:21, 1.23it/s, loss=0.0170, lr=2.37e-05, step=1978] Training: 20%|█▉ | 1979/10000 [25:05<1:48:21, 1.23it/s, loss=0.0459, lr=2.37e-05, step=1979]19:09:37.528 [I] step=1980 loss=0.0104 smoothed_loss=0.0306 lr=2.37e-05 grad_norm=0.5608 step_time=0.6290s data_time=0.1404s it/s=1.300 eta_to_10000=6169.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0091 grad_action_out_proj_arms=0.1574 grad_arm_token_fuse=0.0483 grad_shared_expert=0.3474 (18633:train_pytorch.py:850) + Training: 20%|█▉ | 1980/10000 [25:05<1:42:29, 1.30it/s, loss=0.0459, lr=2.37e-05, step=1979] Training: 20%|█▉ | 1980/10000 [25:05<1:42:29, 1.30it/s, loss=0.0104, lr=2.37e-05, step=1980] Training: 20%|█▉ | 1981/10000 [25:06<1:36:19, 1.39it/s, loss=0.0104, lr=2.37e-05, step=1980] Training: 20%|█▉ | 1981/10000 [25:06<1:36:19, 1.39it/s, loss=0.0245, lr=2.37e-05, step=1981] Training: 20%|█▉ | 1982/10000 [25:07<1:41:06, 1.32it/s, loss=0.0245, lr=2.37e-05, step=1981] Training: 20%|█▉ | 1982/10000 [25:07<1:41:06, 1.32it/s, loss=0.0411, lr=2.37e-05, step=1982] Training: 20%|█▉ | 1983/10000 [25:07<1:37:45, 1.37it/s, loss=0.0411, lr=2.37e-05, step=1982] Training: 20%|█▉ | 1983/10000 [25:07<1:37:45, 1.37it/s, loss=0.0265, lr=2.37e-05, step=1983] Training: 20%|█▉ | 1984/10000 [25:08<1:50:28, 1.21it/s, loss=0.0265, lr=2.37e-05, step=1983] Training: 20%|█▉ | 1984/10000 [25:08<1:50:28, 1.21it/s, loss=0.0854, lr=2.37e-05, step=1984] Training: 20%|█▉ | 1985/10000 [25:09<1:39:36, 1.34it/s, loss=0.0854, lr=2.37e-05, step=1984] Training: 20%|█▉ | 1985/10000 [25:09<1:39:36, 1.34it/s, loss=0.0200, lr=2.37e-05, step=1985] Training: 20%|█▉ | 1986/10000 [25:10<1:45:18, 1.27it/s, loss=0.0200, lr=2.37e-05, step=1985] Training: 20%|█▉ | 1986/10000 [25:10<1:45:18, 1.27it/s, loss=0.0257, lr=2.37e-05, step=1986] Training: 20%|█▉ | 1987/10000 [25:10<1:37:15, 1.37it/s, loss=0.0257, lr=2.37e-05, step=1986] Training: 20%|█▉ | 1987/10000 [25:10<1:37:15, 1.37it/s, loss=0.0259, lr=2.37e-05, step=1987] Training: 20%|█▉ | 1988/10000 [25:11<1:32:32, 1.44it/s, loss=0.0259, lr=2.37e-05, step=1987] Training: 20%|█▉ | 1988/10000 [25:11<1:32:32, 1.44it/s, loss=0.0462, lr=2.37e-05, step=1988] Training: 20%|█▉ | 1989/10000 [25:12<1:37:32, 1.37it/s, loss=0.0462, lr=2.37e-05, step=1988] Training: 20%|█▉ | 1989/10000 [25:12<1:37:32, 1.37it/s, loss=0.0134, lr=2.37e-05, step=1989]19:09:44.861 [I] step=1990 loss=0.0317 smoothed_loss=0.0321 lr=2.37e-05 grad_norm=0.6389 step_time=0.6116s data_time=0.1218s it/s=1.364 eta_to_10000=5872.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.1736 grad_arm_token_fuse=0.0778 grad_shared_expert=0.6810 (18633:train_pytorch.py:850) + Training: 20%|█▉ | 1990/10000 [25:13<1:36:08, 1.39it/s, loss=0.0134, lr=2.37e-05, step=1989] Training: 20%|█▉ | 1990/10000 [25:13<1:36:08, 1.39it/s, loss=0.0317, lr=2.37e-05, step=1990] Training: 20%|█▉ | 1991/10000 [25:14<1:46:07, 1.26it/s, loss=0.0317, lr=2.37e-05, step=1990] Training: 20%|█▉ | 1991/10000 [25:14<1:46:07, 1.26it/s, loss=0.0264, lr=2.37e-05, step=1991] Training: 20%|█▉ | 1992/10000 [25:14<1:52:09, 1.19it/s, loss=0.0264, lr=2.37e-05, step=1991] Training: 20%|█▉ | 1992/10000 [25:14<1:52:09, 1.19it/s, loss=0.0276, lr=2.37e-05, step=1992] Training: 20%|█▉ | 1993/10000 [25:16<2:04:53, 1.07it/s, loss=0.0276, lr=2.37e-05, step=1992] Training: 20%|█▉ | 1993/10000 [25:16<2:04:53, 1.07it/s, loss=0.0151, lr=2.37e-05, step=1993] Training: 20%|█▉ | 1994/10000 [25:16<2:00:13, 1.11it/s, loss=0.0151, lr=2.37e-05, step=1993] Training: 20%|█▉ | 1994/10000 [25:16<2:00:13, 1.11it/s, loss=0.0385, lr=2.37e-05, step=1994] Training: 20%|█▉ | 1995/10000 [25:17<1:50:36, 1.21it/s, loss=0.0385, lr=2.37e-05, step=1994] Training: 20%|█▉ | 1995/10000 [25:17<1:50:36, 1.21it/s, loss=0.0106, lr=2.37e-05, step=1995] Training: 20%|█▉ | 1996/10000 [25:18<1:44:34, 1.28it/s, loss=0.0106, lr=2.37e-05, step=1995] Training: 20%|█▉ | 1996/10000 [25:18<1:44:34, 1.28it/s, loss=0.0306, lr=2.37e-05, step=1996] Training: 20%|█▉ | 1997/10000 [25:18<1:40:55, 1.32it/s, loss=0.0306, lr=2.37e-05, step=1996] Training: 20%|█▉ | 1997/10000 [25:18<1:40:55, 1.32it/s, loss=0.0226, lr=2.37e-05, step=1997] Training: 20%|█▉ | 1998/10000 [25:19<1:48:16, 1.23it/s, loss=0.0226, lr=2.37e-05, step=1997] Training: 20%|█▉ | 1998/10000 [25:19<1:48:16, 1.23it/s, loss=0.0239, lr=2.36e-05, step=1998] Training: 20%|█▉ | 1999/10000 [25:20<1:51:58, 1.19it/s, loss=0.0239, lr=2.36e-05, step=1998] Training: 20%|█▉ | 1999/10000 [25:20<1:51:58, 1.19it/s, loss=0.0173, lr=2.36e-05, step=1999]19:09:53.627 [I] step=2000 loss=0.0280 smoothed_loss=0.0267 lr=2.37e-05 grad_norm=0.6051 step_time=0.7138s data_time=0.1628s it/s=1.141 eta_to_10000=7012.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0180 grad_action_out_proj_arms=0.1784 grad_arm_token_fuse=0.0955 grad_shared_expert=0.5627 (18633:train_pytorch.py:850) +19:12:06.795 [I] Saved checkpoint at step 2000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000 (18633:train_pytorch.py:350) + Training: 20%|██ | 2000/10000 [27:34<90:44:51, 40.84s/it, loss=0.0173, lr=2.36e-05, step=1999] Training: 20%|██ | 2000/10000 [27:34<90:44:51, 40.84s/it, loss=0.0280, lr=2.36e-05, step=2000] Training: 20%|██ | 2001/10000 [27:35<63:56:14, 28.78s/it, loss=0.0280, lr=2.36e-05, step=2000] Training: 20%|██ | 2001/10000 [27:35<63:56:14, 28.78s/it, loss=0.0072, lr=2.36e-05, step=2001] Training: 20%|██ | 2002/10000 [27:36<45:17:08, 20.38s/it, loss=0.0072, lr=2.36e-05, step=2001] Training: 20%|██ | 2002/10000 [27:36<45:17:08, 20.38s/it, loss=0.0455, lr=2.36e-05, step=2002] Training: 20%|██ | 2003/10000 [27:36<32:05:31, 14.45s/it, loss=0.0455, lr=2.36e-05, step=2002] Training: 20%|██ | 2003/10000 [27:36<32:05:31, 14.45s/it, loss=0.0464, lr=2.36e-05, step=2003] Training: 20%|██ | 2004/10000 [27:37<22:55:46, 10.32s/it, loss=0.0464, lr=2.36e-05, step=2003] Training: 20%|██ | 2004/10000 [27:37<22:55:46, 10.32s/it, loss=0.0499, lr=2.36e-05, step=2004] Training: 20%|██ | 2005/10000 [27:38<16:26:03, 7.40s/it, loss=0.0499, lr=2.36e-05, step=2004] Training: 20%|██ | 2005/10000 [27:38<16:26:03, 7.40s/it, loss=0.0048, lr=2.36e-05, step=2005] Training: 20%|██ | 2006/10000 [27:38<11:52:36, 5.35s/it, loss=0.0048, lr=2.36e-05, step=2005] Training: 20%|██ | 2006/10000 [27:38<11:52:36, 5.35s/it, loss=0.0158, lr=2.36e-05, step=2006] Training: 20%|██ | 2007/10000 [27:39<9:02:04, 4.07s/it, loss=0.0158, lr=2.36e-05, step=2006] Training: 20%|██ | 2007/10000 [27:39<9:02:04, 4.07s/it, loss=0.0355, lr=2.36e-05, step=2007] Training: 20%|██ | 2008/10000 [27:40<6:48:24, 3.07s/it, loss=0.0355, lr=2.36e-05, step=2007] Training: 20%|██ | 2008/10000 [27:40<6:48:24, 3.07s/it, loss=0.0096, lr=2.36e-05, step=2008] Training: 20%|██ | 2009/10000 [27:41<5:11:26, 2.34s/it, loss=0.0096, lr=2.36e-05, step=2008] Training: 20%|██ | 2009/10000 [27:41<5:11:26, 2.34s/it, loss=0.0593, lr=2.36e-05, step=2009]19:12:13.900 [I] step=2010 loss=0.0441 smoothed_loss=0.0309 lr=2.36e-05 grad_norm=0.5684 step_time=0.5752s data_time=13.4521s it/s=0.071 eta_to_10000=112077.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0282 grad_action_out_proj_arms=0.2208 grad_arm_token_fuse=0.1501 grad_shared_expert=0.5833 (18633:train_pytorch.py:850) + Training: 20%|██ | 2010/10000 [27:42<4:09:12, 1.87s/it, loss=0.0593, lr=2.36e-05, step=2009] Training: 20%|██ | 2010/10000 [27:42<4:09:12, 1.87s/it, loss=0.0441, lr=2.36e-05, step=2010] Training: 20%|██ | 2011/10000 [27:42<3:25:54, 1.55s/it, loss=0.0441, lr=2.36e-05, step=2010] Training: 20%|██ | 2011/10000 [27:42<3:25:54, 1.55s/it, loss=0.0338, lr=2.36e-05, step=2011] Training: 20%|██ | 2012/10000 [27:43<2:54:28, 1.31s/it, loss=0.0338, lr=2.36e-05, step=2011] Training: 20%|██ | 2012/10000 [27:43<2:54:28, 1.31s/it, loss=0.0522, lr=2.36e-05, step=2012] Training: 20%|██ | 2013/10000 [27:44<2:34:10, 1.16s/it, loss=0.0522, lr=2.36e-05, step=2012] Training: 20%|██ | 2013/10000 [27:44<2:34:10, 1.16s/it, loss=0.0242, lr=2.36e-05, step=2013] Training: 20%|██ | 2014/10000 [27:45<2:29:12, 1.12s/it, loss=0.0242, lr=2.36e-05, step=2013] Training: 20%|██ | 2014/10000 [27:45<2:29:12, 1.12s/it, loss=0.0103, lr=2.36e-05, step=2014] Training: 20%|██ | 2015/10000 [27:45<2:04:53, 1.07it/s, loss=0.0103, lr=2.36e-05, step=2014] Training: 20%|██ | 2015/10000 [27:45<2:04:53, 1.07it/s, loss=0.0191, lr=2.36e-05, step=2015] Training: 20%|██ | 2016/10000 [27:46<1:50:10, 1.21it/s, loss=0.0191, lr=2.36e-05, step=2015] Training: 20%|██ | 2016/10000 [27:46<1:50:10, 1.21it/s, loss=0.0305, lr=2.36e-05, step=2016] Training: 20%|██ | 2017/10000 [27:47<1:48:11, 1.23it/s, loss=0.0305, lr=2.36e-05, step=2016] Training: 20%|██ | 2017/10000 [27:47<1:48:11, 1.23it/s, loss=0.0256, lr=2.36e-05, step=2017] Training: 20%|██ | 2018/10000 [27:48<1:55:07, 1.16it/s, loss=0.0256, lr=2.36e-05, step=2017] Training: 20%|██ | 2018/10000 [27:48<1:55:07, 1.16it/s, loss=0.0210, lr=2.36e-05, step=2018] Training: 20%|██ | 2019/10000 [27:48<1:42:57, 1.29it/s, loss=0.0210, lr=2.36e-05, step=2018] Training: 20%|██ | 2019/10000 [27:48<1:42:57, 1.29it/s, loss=0.0344, lr=2.36e-05, step=2019]19:12:21.382 [I] step=2020 loss=0.0095 smoothed_loss=0.0268 lr=2.36e-05 grad_norm=0.6474 step_time=0.6031s data_time=0.1451s it/s=1.337 eta_to_10000=5969.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0170 grad_action_out_proj_arms=0.1521 grad_arm_token_fuse=0.0906 grad_shared_expert=0.6150 (18633:train_pytorch.py:850) + Training: 20%|██ | 2020/10000 [27:49<1:39:27, 1.34it/s, loss=0.0344, lr=2.36e-05, step=2019] Training: 20%|██ | 2020/10000 [27:49<1:39:27, 1.34it/s, loss=0.0095, lr=2.36e-05, step=2020] Training: 20%|██ | 2021/10000 [27:50<1:43:24, 1.29it/s, loss=0.0095, lr=2.36e-05, step=2020] Training: 20%|██ | 2021/10000 [27:50<1:43:24, 1.29it/s, loss=0.0315, lr=2.36e-05, step=2021] Training: 20%|██ | 2022/10000 [27:50<1:36:18, 1.38it/s, loss=0.0315, lr=2.36e-05, step=2021] Training: 20%|██ | 2022/10000 [27:51<1:36:18, 1.38it/s, loss=0.0420, lr=2.36e-05, step=2022] Training: 20%|██ | 2023/10000 [27:51<1:38:47, 1.35it/s, loss=0.0420, lr=2.36e-05, step=2022] Training: 20%|██ | 2023/10000 [27:51<1:38:47, 1.35it/s, loss=0.0385, lr=2.36e-05, step=2023] Training: 20%|██ | 2024/10000 [27:52<1:39:43, 1.33it/s, loss=0.0385, lr=2.36e-05, step=2023] Training: 20%|██ | 2024/10000 [27:52<1:39:43, 1.33it/s, loss=0.0332, lr=2.36e-05, step=2024] Training: 20%|██ | 2025/10000 [27:53<1:47:28, 1.24it/s, loss=0.0332, lr=2.36e-05, step=2024] Training: 20%|██ | 2025/10000 [27:53<1:47:28, 1.24it/s, loss=0.0225, lr=2.36e-05, step=2025] Training: 20%|██ | 2026/10000 [27:54<1:45:22, 1.26it/s, loss=0.0225, lr=2.36e-05, step=2025] Training: 20%|██ | 2026/10000 [27:54<1:45:22, 1.26it/s, loss=0.0115, lr=2.36e-05, step=2026] Training: 20%|██ | 2027/10000 [27:55<1:43:42, 1.28it/s, loss=0.0115, lr=2.36e-05, step=2026] Training: 20%|██ | 2027/10000 [27:55<1:43:42, 1.28it/s, loss=0.0630, lr=2.36e-05, step=2027] Training: 20%|██ | 2028/10000 [27:56<1:52:14, 1.18it/s, loss=0.0630, lr=2.36e-05, step=2027] Training: 20%|██ | 2028/10000 [27:56<1:52:14, 1.18it/s, loss=0.0494, lr=2.36e-05, step=2028] Training: 20%|██ | 2029/10000 [27:56<1:52:21, 1.18it/s, loss=0.0494, lr=2.36e-05, step=2028] Training: 20%|██ | 2029/10000 [27:56<1:52:21, 1.18it/s, loss=0.0334, lr=2.36e-05, step=2029]19:12:29.593 [I] step=2030 loss=0.0230 smoothed_loss=0.0320 lr=2.36e-05 grad_norm=0.5594 step_time=0.6550s data_time=0.1662s it/s=1.218 eta_to_10000=6543.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0221 grad_action_out_proj_arms=0.2304 grad_arm_token_fuse=0.1217 grad_shared_expert=0.5267 (18633:train_pytorch.py:850) + Training: 20%|██ | 2030/10000 [27:57<1:55:06, 1.15it/s, loss=0.0334, lr=2.36e-05, step=2029] Training: 20%|██ | 2030/10000 [27:57<1:55:06, 1.15it/s, loss=0.0230, lr=2.36e-05, step=2030] Training: 20%|██ | 2031/10000 [27:58<1:40:41, 1.32it/s, loss=0.0230, lr=2.36e-05, step=2030] Training: 20%|██ | 2031/10000 [27:58<1:40:41, 1.32it/s, loss=0.0350, lr=2.36e-05, step=2031] Training: 20%|██ | 2032/10000 [27:59<1:49:25, 1.21it/s, loss=0.0350, lr=2.36e-05, step=2031] Training: 20%|██ | 2032/10000 [27:59<1:49:25, 1.21it/s, loss=0.0137, lr=2.36e-05, step=2032] Training: 20%|██ | 2033/10000 [28:00<1:51:23, 1.19it/s, loss=0.0137, lr=2.36e-05, step=2032] Training: 20%|██ | 2033/10000 [28:00<1:51:23, 1.19it/s, loss=0.0227, lr=2.36e-05, step=2033] Training: 20%|██ | 2034/10000 [28:00<1:51:33, 1.19it/s, loss=0.0227, lr=2.36e-05, step=2033] Training: 20%|██ | 2034/10000 [28:00<1:51:33, 1.19it/s, loss=0.0443, lr=2.36e-05, step=2034] Training: 20%|██ | 2035/10000 [28:01<1:50:06, 1.21it/s, loss=0.0443, lr=2.36e-05, step=2034] Training: 20%|██ | 2035/10000 [28:01<1:50:06, 1.21it/s, loss=0.0143, lr=2.36e-05, step=2035] Training: 20%|██ | 2036/10000 [28:02<2:01:29, 1.09it/s, loss=0.0143, lr=2.36e-05, step=2035] Training: 20%|██ | 2036/10000 [28:02<2:01:29, 1.09it/s, loss=0.0230, lr=2.36e-05, step=2036] Training: 20%|██ | 2037/10000 [28:03<1:54:16, 1.16it/s, loss=0.0230, lr=2.36e-05, step=2036] Training: 20%|██ | 2037/10000 [28:03<1:54:16, 1.16it/s, loss=0.0098, lr=2.36e-05, step=2037] Training: 20%|██ | 2038/10000 [28:04<1:50:06, 1.21it/s, loss=0.0098, lr=2.36e-05, step=2037] Training: 20%|██ | 2038/10000 [28:04<1:50:06, 1.21it/s, loss=0.0430, lr=2.36e-05, step=2038] Training: 20%|██ | 2039/10000 [28:05<1:53:57, 1.16it/s, loss=0.0430, lr=2.36e-05, step=2038] Training: 20%|██ | 2039/10000 [28:05<1:53:57, 1.16it/s, loss=0.0666, lr=2.36e-05, step=2039]19:12:38.028 [I] step=2040 loss=0.0598 smoothed_loss=0.0351 lr=2.36e-05 grad_norm=0.5957 step_time=0.6497s data_time=0.1937s it/s=1.186 eta_to_10000=6712.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0229 grad_action_out_proj_arms=0.2128 grad_arm_token_fuse=0.1253 grad_shared_expert=0.6465 (18633:train_pytorch.py:850) + Training: 20%|██ | 2040/10000 [28:06<1:55:25, 1.15it/s, loss=0.0666, lr=2.36e-05, step=2039] Training: 20%|██ | 2040/10000 [28:06<1:55:25, 1.15it/s, loss=0.0598, lr=2.36e-05, step=2040] Training: 20%|██ | 2041/10000 [28:06<1:50:25, 1.20it/s, loss=0.0598, lr=2.36e-05, step=2040] Training: 20%|██ | 2041/10000 [28:06<1:50:25, 1.20it/s, loss=0.0264, lr=2.36e-05, step=2041] Training: 20%|██ | 2042/10000 [28:07<1:40:31, 1.32it/s, loss=0.0264, lr=2.36e-05, step=2041] Training: 20%|██ | 2042/10000 [28:07<1:40:31, 1.32it/s, loss=0.0230, lr=2.36e-05, step=2042] Training: 20%|██ | 2043/10000 [28:08<1:53:49, 1.17it/s, loss=0.0230, lr=2.36e-05, step=2042] Training: 20%|██ | 2043/10000 [28:08<1:53:49, 1.17it/s, loss=0.0145, lr=2.36e-05, step=2043] Training: 20%|██ | 2044/10000 [28:09<1:41:35, 1.31it/s, loss=0.0145, lr=2.36e-05, step=2043] Training: 20%|██ | 2044/10000 [28:09<1:41:35, 1.31it/s, loss=0.0378, lr=2.36e-05, step=2044] Training: 20%|██ | 2045/10000 [28:09<1:31:07, 1.45it/s, loss=0.0378, lr=2.36e-05, step=2044] Training: 20%|██ | 2045/10000 [28:09<1:31:07, 1.45it/s, loss=0.0287, lr=2.36e-05, step=2045] Training: 20%|██ | 2046/10000 [28:10<1:36:10, 1.38it/s, loss=0.0287, lr=2.36e-05, step=2045] Training: 20%|██ | 2046/10000 [28:10<1:36:10, 1.38it/s, loss=0.0336, lr=2.36e-05, step=2046] Training: 20%|██ | 2047/10000 [28:11<1:28:17, 1.50it/s, loss=0.0336, lr=2.36e-05, step=2046] Training: 20%|██ | 2047/10000 [28:11<1:28:17, 1.50it/s, loss=0.0258, lr=2.36e-05, step=2047] Training: 20%|██ | 2048/10000 [28:11<1:36:31, 1.37it/s, loss=0.0258, lr=2.36e-05, step=2047] Training: 20%|██ | 2048/10000 [28:11<1:36:31, 1.37it/s, loss=0.0161, lr=2.36e-05, step=2048] Training: 20%|██ | 2049/10000 [28:12<1:48:35, 1.22it/s, loss=0.0161, lr=2.36e-05, step=2048] Training: 20%|██ | 2049/10000 [28:12<1:48:35, 1.22it/s, loss=0.0374, lr=2.36e-05, step=2049]19:12:45.867 [I] step=2050 loss=0.0154 smoothed_loss=0.0289 lr=2.36e-05 grad_norm=0.6164 step_time=0.6304s data_time=0.1535s it/s=1.276 eta_to_10000=6231.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1924 grad_arm_token_fuse=0.0750 grad_shared_expert=0.7113 (18633:train_pytorch.py:850) + Training: 20%|██ | 2050/10000 [28:14<2:00:22, 1.10it/s, loss=0.0374, lr=2.36e-05, step=2049] Training: 20%|██ | 2050/10000 [28:14<2:00:22, 1.10it/s, loss=0.0154, lr=2.36e-05, step=2050] Training: 21%|██ | 2051/10000 [28:14<1:54:13, 1.16it/s, loss=0.0154, lr=2.36e-05, step=2050] Training: 21%|██ | 2051/10000 [28:14<1:54:13, 1.16it/s, loss=0.0082, lr=2.36e-05, step=2051] Training: 21%|██ | 2052/10000 [28:15<1:51:58, 1.18it/s, loss=0.0082, lr=2.36e-05, step=2051] Training: 21%|██ | 2052/10000 [28:15<1:51:58, 1.18it/s, loss=0.0116, lr=2.36e-05, step=2052] Training: 21%|██ | 2053/10000 [28:16<1:54:46, 1.15it/s, loss=0.0116, lr=2.36e-05, step=2052] Training: 21%|██ | 2053/10000 [28:16<1:54:46, 1.15it/s, loss=0.0292, lr=2.36e-05, step=2053] Training: 21%|██ | 2054/10000 [28:17<1:49:14, 1.21it/s, loss=0.0292, lr=2.36e-05, step=2053] Training: 21%|██ | 2054/10000 [28:17<1:49:14, 1.21it/s, loss=0.0308, lr=2.35e-05, step=2054] Training: 21%|██ | 2055/10000 [28:17<1:36:07, 1.38it/s, loss=0.0308, lr=2.35e-05, step=2054] Training: 21%|██ | 2055/10000 [28:17<1:36:07, 1.38it/s, loss=0.1893, lr=2.35e-05, step=2055] Training: 21%|██ | 2056/10000 [28:18<1:29:20, 1.48it/s, loss=0.1893, lr=2.35e-05, step=2055] Training: 21%|██ | 2056/10000 [28:18<1:29:20, 1.48it/s, loss=0.0454, lr=2.35e-05, step=2056] Training: 21%|██ | 2057/10000 [28:19<1:31:25, 1.45it/s, loss=0.0454, lr=2.35e-05, step=2056] Training: 21%|██ | 2057/10000 [28:19<1:31:25, 1.45it/s, loss=0.0063, lr=2.35e-05, step=2057] Training: 21%|██ | 2058/10000 [28:19<1:30:38, 1.46it/s, loss=0.0063, lr=2.35e-05, step=2057] Training: 21%|██ | 2058/10000 [28:19<1:30:38, 1.46it/s, loss=0.0174, lr=2.35e-05, step=2058] Training: 21%|██ | 2059/10000 [28:20<1:31:44, 1.44it/s, loss=0.0174, lr=2.35e-05, step=2058] Training: 21%|██ | 2059/10000 [28:20<1:31:44, 1.44it/s, loss=0.0214, lr=2.35e-05, step=2059]19:12:53.345 [I] step=2060 loss=0.0314 smoothed_loss=0.0350 lr=2.35e-05 grad_norm=0.5571 step_time=0.6012s data_time=0.1466s it/s=1.337 eta_to_10000=5936.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0183 grad_action_out_proj_arms=0.2079 grad_arm_token_fuse=0.0979 grad_shared_expert=0.4581 (18633:train_pytorch.py:850) + Training: 21%|██ | 2060/10000 [28:21<1:48:21, 1.22it/s, loss=0.0214, lr=2.35e-05, step=2059] Training: 21%|██ | 2060/10000 [28:21<1:48:21, 1.22it/s, loss=0.0314, lr=2.35e-05, step=2060] Training: 21%|██ | 2061/10000 [28:22<1:52:08, 1.18it/s, loss=0.0314, lr=2.35e-05, step=2060] Training: 21%|██ | 2061/10000 [28:22<1:52:08, 1.18it/s, loss=0.0178, lr=2.35e-05, step=2061] Training: 21%|██ | 2062/10000 [28:23<1:46:20, 1.24it/s, loss=0.0178, lr=2.35e-05, step=2061] Training: 21%|██ | 2062/10000 [28:23<1:46:20, 1.24it/s, loss=0.0231, lr=2.35e-05, step=2062] Training: 21%|██ | 2063/10000 [28:23<1:46:30, 1.24it/s, loss=0.0231, lr=2.35e-05, step=2062] Training: 21%|██ | 2063/10000 [28:23<1:46:30, 1.24it/s, loss=0.0412, lr=2.35e-05, step=2063] Training: 21%|██ | 2064/10000 [28:25<1:59:55, 1.10it/s, loss=0.0412, lr=2.35e-05, step=2063] Training: 21%|██ | 2064/10000 [28:25<1:59:55, 1.10it/s, loss=0.0338, lr=2.35e-05, step=2064] Training: 21%|██ | 2065/10000 [28:25<1:55:54, 1.14it/s, loss=0.0338, lr=2.35e-05, step=2064] Training: 21%|██ | 2065/10000 [28:25<1:55:54, 1.14it/s, loss=0.0271, lr=2.35e-05, step=2065] Training: 21%|██ | 2066/10000 [28:26<1:53:22, 1.17it/s, loss=0.0271, lr=2.35e-05, step=2065] Training: 21%|██ | 2066/10000 [28:26<1:53:22, 1.17it/s, loss=0.0239, lr=2.35e-05, step=2066] Training: 21%|██ | 2067/10000 [28:27<1:58:15, 1.12it/s, loss=0.0239, lr=2.35e-05, step=2066] Training: 21%|██ | 2067/10000 [28:27<1:58:15, 1.12it/s, loss=0.0297, lr=2.35e-05, step=2067] Training: 21%|██ | 2068/10000 [28:28<1:42:55, 1.28it/s, loss=0.0297, lr=2.35e-05, step=2067] Training: 21%|██ | 2068/10000 [28:28<1:42:55, 1.28it/s, loss=0.0387, lr=2.35e-05, step=2068] Training: 21%|██ | 2069/10000 [28:28<1:34:44, 1.40it/s, loss=0.0387, lr=2.35e-05, step=2068] Training: 21%|██ | 2069/10000 [28:28<1:34:44, 1.40it/s, loss=0.0412, lr=2.35e-05, step=2069]19:13:01.369 [I] step=2070 loss=0.0184 smoothed_loss=0.0317 lr=2.35e-05 grad_norm=0.5616 step_time=0.6225s data_time=0.1800s it/s=1.246 eta_to_10000=6362.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0201 grad_action_out_proj_arms=0.1684 grad_arm_token_fuse=0.1146 grad_shared_expert=0.4988 (18633:train_pytorch.py:850) + Training: 21%|██ | 2070/10000 [28:29<1:37:04, 1.36it/s, loss=0.0412, lr=2.35e-05, step=2069] Training: 21%|██ | 2070/10000 [28:29<1:37:04, 1.36it/s, loss=0.0184, lr=2.35e-05, step=2070] Training: 21%|██ | 2071/10000 [28:30<1:42:16, 1.29it/s, loss=0.0184, lr=2.35e-05, step=2070] Training: 21%|██ | 2071/10000 [28:30<1:42:16, 1.29it/s, loss=0.0123, lr=2.35e-05, step=2071] Training: 21%|██ | 2072/10000 [28:30<1:31:26, 1.44it/s, loss=0.0123, lr=2.35e-05, step=2071] Training: 21%|██ | 2072/10000 [28:30<1:31:26, 1.44it/s, loss=0.0159, lr=2.35e-05, step=2072] Training: 21%|██ | 2073/10000 [28:31<1:24:00, 1.57it/s, loss=0.0159, lr=2.35e-05, step=2072] Training: 21%|██ | 2073/10000 [28:31<1:24:00, 1.57it/s, loss=0.0161, lr=2.35e-05, step=2073] Training: 21%|██ | 2074/10000 [28:31<1:18:34, 1.68it/s, loss=0.0161, lr=2.35e-05, step=2073] Training: 21%|██ | 2074/10000 [28:31<1:18:34, 1.68it/s, loss=0.0090, lr=2.35e-05, step=2074] Training: 21%|██ | 2075/10000 [28:32<1:23:48, 1.58it/s, loss=0.0090, lr=2.35e-05, step=2074] Training: 21%|██ | 2075/10000 [28:32<1:23:48, 1.58it/s, loss=0.0187, lr=2.35e-05, step=2075] Training: 21%|██ | 2076/10000 [28:33<1:25:25, 1.55it/s, loss=0.0187, lr=2.35e-05, step=2075] Training: 21%|██ | 2076/10000 [28:33<1:25:25, 1.55it/s, loss=0.0155, lr=2.35e-05, step=2076] Training: 21%|██ | 2077/10000 [28:33<1:24:51, 1.56it/s, loss=0.0155, lr=2.35e-05, step=2076] Training: 21%|██ | 2077/10000 [28:33<1:24:51, 1.56it/s, loss=0.0083, lr=2.35e-05, step=2077] Training: 21%|██ | 2078/10000 [28:34<1:29:41, 1.47it/s, loss=0.0083, lr=2.35e-05, step=2077] Training: 21%|██ | 2078/10000 [28:34<1:29:41, 1.47it/s, loss=0.0284, lr=2.35e-05, step=2078] Training: 21%|██ | 2079/10000 [28:35<1:27:21, 1.51it/s, loss=0.0284, lr=2.35e-05, step=2078] Training: 21%|██ | 2079/10000 [28:35<1:27:21, 1.51it/s, loss=0.0157, lr=2.35e-05, step=2079]19:13:07.715 [I] step=2080 loss=0.0170 smoothed_loss=0.0216 lr=2.35e-05 grad_norm=0.5138 step_time=0.5438s data_time=0.0907s it/s=1.576 eta_to_10000=5025.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1591 grad_arm_token_fuse=0.0524 grad_shared_expert=0.4468 (18633:train_pytorch.py:850) + Training: 21%|██ | 2080/10000 [28:35<1:23:05, 1.59it/s, loss=0.0157, lr=2.35e-05, step=2079] Training: 21%|██ | 2080/10000 [28:35<1:23:05, 1.59it/s, loss=0.0170, lr=2.35e-05, step=2080] Training: 21%|██ | 2081/10000 [28:36<1:24:59, 1.55it/s, loss=0.0170, lr=2.35e-05, step=2080] Training: 21%|██ | 2081/10000 [28:36<1:24:59, 1.55it/s, loss=0.1387, lr=2.35e-05, step=2081] Training: 21%|██ | 2082/10000 [28:37<1:21:47, 1.61it/s, loss=0.1387, lr=2.35e-05, step=2081] Training: 21%|██ | 2082/10000 [28:37<1:21:47, 1.61it/s, loss=0.0452, lr=2.35e-05, step=2082] Training: 21%|██ | 2083/10000 [28:37<1:25:49, 1.54it/s, loss=0.0452, lr=2.35e-05, step=2082] Training: 21%|██ | 2083/10000 [28:37<1:25:49, 1.54it/s, loss=0.0064, lr=2.35e-05, step=2083] Training: 21%|██ | 2084/10000 [28:38<1:20:52, 1.63it/s, loss=0.0064, lr=2.35e-05, step=2083] Training: 21%|██ | 2084/10000 [28:38<1:20:52, 1.63it/s, loss=0.0445, lr=2.35e-05, step=2084] Training: 21%|██ | 2085/10000 [28:39<1:30:23, 1.46it/s, loss=0.0445, lr=2.35e-05, step=2084] Training: 21%|██ | 2085/10000 [28:39<1:30:23, 1.46it/s, loss=0.0390, lr=2.35e-05, step=2085] Training: 21%|██ | 2086/10000 [28:40<1:36:24, 1.37it/s, loss=0.0390, lr=2.35e-05, step=2085] Training: 21%|██ | 2086/10000 [28:40<1:36:24, 1.37it/s, loss=0.0179, lr=2.35e-05, step=2086] Training: 21%|██ | 2087/10000 [28:40<1:40:23, 1.31it/s, loss=0.0179, lr=2.35e-05, step=2086] Training: 21%|██ | 2087/10000 [28:40<1:40:23, 1.31it/s, loss=0.0067, lr=2.35e-05, step=2087] Training: 21%|██ | 2088/10000 [28:41<1:48:50, 1.21it/s, loss=0.0067, lr=2.35e-05, step=2087] Training: 21%|██ | 2088/10000 [28:41<1:48:50, 1.21it/s, loss=0.0341, lr=2.35e-05, step=2088] Training: 21%|██ | 2089/10000 [28:42<1:49:57, 1.20it/s, loss=0.0341, lr=2.35e-05, step=2088] Training: 21%|██ | 2089/10000 [28:42<1:49:57, 1.20it/s, loss=0.0160, lr=2.35e-05, step=2089]19:13:15.358 [I] step=2090 loss=0.0328 smoothed_loss=0.0290 lr=2.35e-05 grad_norm=0.6198 step_time=0.6042s data_time=0.1601s it/s=1.309 eta_to_10000=6044.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0158 grad_action_out_proj_arms=0.1785 grad_arm_token_fuse=0.0805 grad_shared_expert=0.5652 (18633:train_pytorch.py:850) + Training: 21%|██ | 2090/10000 [28:43<1:48:38, 1.21it/s, loss=0.0160, lr=2.35e-05, step=2089] Training: 21%|██ | 2090/10000 [28:43<1:48:38, 1.21it/s, loss=0.0328, lr=2.35e-05, step=2090] Training: 21%|██ | 2091/10000 [28:44<1:35:43, 1.38it/s, loss=0.0328, lr=2.35e-05, step=2090] Training: 21%|██ | 2091/10000 [28:44<1:35:43, 1.38it/s, loss=0.0158, lr=2.35e-05, step=2091] Training: 21%|██ | 2092/10000 [28:44<1:38:55, 1.33it/s, loss=0.0158, lr=2.35e-05, step=2091] Training: 21%|██ | 2092/10000 [28:44<1:38:55, 1.33it/s, loss=0.0458, lr=2.35e-05, step=2092] Training: 21%|██ | 2093/10000 [28:46<1:56:00, 1.14it/s, loss=0.0458, lr=2.35e-05, step=2092] Training: 21%|██ | 2093/10000 [28:46<1:56:00, 1.14it/s, loss=0.0414, lr=2.35e-05, step=2093] Training: 21%|██ | 2094/10000 [28:46<1:40:44, 1.31it/s, loss=0.0414, lr=2.35e-05, step=2093] Training: 21%|██ | 2094/10000 [28:46<1:40:44, 1.31it/s, loss=0.0267, lr=2.35e-05, step=2094] Training: 21%|██ | 2095/10000 [28:47<1:39:12, 1.33it/s, loss=0.0267, lr=2.35e-05, step=2094] Training: 21%|██ | 2095/10000 [28:47<1:39:12, 1.33it/s, loss=0.0084, lr=2.35e-05, step=2095] Training: 21%|██ | 2096/10000 [28:48<1:42:25, 1.29it/s, loss=0.0084, lr=2.35e-05, step=2095] Training: 21%|██ | 2096/10000 [28:48<1:42:25, 1.29it/s, loss=0.0451, lr=2.35e-05, step=2096] Training: 21%|██ | 2097/10000 [28:48<1:31:01, 1.45it/s, loss=0.0451, lr=2.35e-05, step=2096] Training: 21%|██ | 2097/10000 [28:48<1:31:01, 1.45it/s, loss=0.0316, lr=2.35e-05, step=2097] Training: 21%|██ | 2098/10000 [28:49<1:35:45, 1.38it/s, loss=0.0316, lr=2.35e-05, step=2097] Training: 21%|██ | 2098/10000 [28:49<1:35:45, 1.38it/s, loss=0.0103, lr=2.35e-05, step=2098] Training: 21%|██ | 2099/10000 [28:49<1:27:13, 1.51it/s, loss=0.0103, lr=2.35e-05, step=2098] Training: 21%|██ | 2099/10000 [28:49<1:27:13, 1.51it/s, loss=0.0605, lr=2.35e-05, step=2099]19:13:22.627 [I] step=2100 loss=0.0271 smoothed_loss=0.0308 lr=2.35e-05 grad_norm=0.6641 step_time=0.5892s data_time=0.1377s it/s=1.376 eta_to_10000=5742.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0244 grad_action_out_proj_arms=0.2344 grad_arm_token_fuse=0.1299 grad_shared_expert=0.6544 (18633:train_pytorch.py:850) + Training: 21%|██ | 2100/10000 [28:50<1:37:11, 1.35it/s, loss=0.0605, lr=2.35e-05, step=2099] Training: 21%|██ | 2100/10000 [28:50<1:37:11, 1.35it/s, loss=0.0271, lr=2.35e-05, step=2100] Training: 21%|██ | 2101/10000 [28:51<1:29:39, 1.47it/s, loss=0.0271, lr=2.35e-05, step=2100] Training: 21%|██ | 2101/10000 [28:51<1:29:39, 1.47it/s, loss=0.0393, lr=2.35e-05, step=2101] Training: 21%|██ | 2102/10000 [28:52<1:29:47, 1.47it/s, loss=0.0393, lr=2.35e-05, step=2101] Training: 21%|██ | 2102/10000 [28:52<1:29:47, 1.47it/s, loss=0.0286, lr=2.35e-05, step=2102] Training: 21%|██ | 2103/10000 [28:52<1:26:43, 1.52it/s, loss=0.0286, lr=2.35e-05, step=2102] Training: 21%|██ | 2103/10000 [28:52<1:26:43, 1.52it/s, loss=0.0694, lr=2.35e-05, step=2103] Training: 21%|██ | 2104/10000 [28:53<1:24:34, 1.56it/s, loss=0.0694, lr=2.35e-05, step=2103] Training: 21%|██ | 2104/10000 [28:53<1:24:34, 1.56it/s, loss=0.0168, lr=2.35e-05, step=2104] Training: 21%|██ | 2105/10000 [28:54<1:29:38, 1.47it/s, loss=0.0168, lr=2.35e-05, step=2104] Training: 21%|██ | 2105/10000 [28:54<1:29:38, 1.47it/s, loss=0.0142, lr=2.35e-05, step=2105] Training: 21%|██ | 2106/10000 [28:54<1:31:07, 1.44it/s, loss=0.0142, lr=2.35e-05, step=2105] Training: 21%|██ | 2106/10000 [28:54<1:31:07, 1.44it/s, loss=0.0206, lr=2.35e-05, step=2106] Training: 21%|██ | 2107/10000 [28:55<1:47:37, 1.22it/s, loss=0.0206, lr=2.35e-05, step=2106] Training: 21%|██ | 2107/10000 [28:55<1:47:37, 1.22it/s, loss=0.0176, lr=2.35e-05, step=2107] Training: 21%|██ | 2108/10000 [28:56<1:41:20, 1.30it/s, loss=0.0176, lr=2.35e-05, step=2107] Training: 21%|██ | 2108/10000 [28:56<1:41:20, 1.30it/s, loss=0.0749, lr=2.34e-05, step=2108] Training: 21%|██ | 2109/10000 [28:57<1:49:40, 1.20it/s, loss=0.0749, lr=2.34e-05, step=2108] Training: 21%|██ | 2109/10000 [28:57<1:49:40, 1.20it/s, loss=0.0139, lr=2.34e-05, step=2109]19:13:29.842 [I] step=2110 loss=0.0220 smoothed_loss=0.0307 lr=2.35e-05 grad_norm=0.6519 step_time=0.5709s data_time=0.1506s it/s=1.386 eta_to_10000=5691.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0140 grad_action_out_proj_arms=0.1431 grad_arm_token_fuse=0.0708 grad_shared_expert=0.5445 (18633:train_pytorch.py:850) + Training: 21%|██ | 2110/10000 [28:58<1:37:40, 1.35it/s, loss=0.0139, lr=2.34e-05, step=2109] Training: 21%|██ | 2110/10000 [28:58<1:37:40, 1.35it/s, loss=0.0220, lr=2.34e-05, step=2110] Training: 21%|██ | 2111/10000 [28:58<1:29:18, 1.47it/s, loss=0.0220, lr=2.34e-05, step=2110] Training: 21%|██ | 2111/10000 [28:58<1:29:18, 1.47it/s, loss=0.0131, lr=2.34e-05, step=2111] Training: 21%|██ | 2112/10000 [28:59<1:21:58, 1.60it/s, loss=0.0131, lr=2.34e-05, step=2111] Training: 21%|██ | 2112/10000 [28:59<1:21:58, 1.60it/s, loss=0.0237, lr=2.34e-05, step=2112] Training: 21%|██ | 2113/10000 [28:59<1:16:35, 1.72it/s, loss=0.0237, lr=2.34e-05, step=2112] Training: 21%|██ | 2113/10000 [28:59<1:16:35, 1.72it/s, loss=0.0229, lr=2.34e-05, step=2113] Training: 21%|██ | 2114/10000 [29:00<1:28:07, 1.49it/s, loss=0.0229, lr=2.34e-05, step=2113] Training: 21%|██ | 2114/10000 [29:00<1:28:07, 1.49it/s, loss=0.0184, lr=2.34e-05, step=2114] Training: 21%|██ | 2115/10000 [29:00<1:21:03, 1.62it/s, loss=0.0184, lr=2.34e-05, step=2114] Training: 21%|██ | 2115/10000 [29:00<1:21:03, 1.62it/s, loss=0.0317, lr=2.34e-05, step=2115] Training: 21%|██ | 2116/10000 [29:01<1:23:41, 1.57it/s, loss=0.0317, lr=2.34e-05, step=2115] Training: 21%|██ | 2116/10000 [29:01<1:23:41, 1.57it/s, loss=0.0406, lr=2.34e-05, step=2116] Training: 21%|██ | 2117/10000 [29:02<1:18:14, 1.68it/s, loss=0.0406, lr=2.34e-05, step=2116] Training: 21%|██ | 2117/10000 [29:02<1:18:14, 1.68it/s, loss=0.0128, lr=2.34e-05, step=2117] Training: 21%|██ | 2118/10000 [29:02<1:29:18, 1.47it/s, loss=0.0128, lr=2.34e-05, step=2117] Training: 21%|██ | 2118/10000 [29:02<1:29:18, 1.47it/s, loss=0.0208, lr=2.34e-05, step=2118] Training: 21%|██ | 2119/10000 [29:03<1:30:46, 1.45it/s, loss=0.0208, lr=2.34e-05, step=2118] Training: 21%|██ | 2119/10000 [29:03<1:30:46, 1.45it/s, loss=0.0390, lr=2.34e-05, step=2119]19:13:36.050 [I] step=2120 loss=0.0657 smoothed_loss=0.0315 lr=2.34e-05 grad_norm=0.6137 step_time=0.5220s data_time=0.0989s it/s=1.611 eta_to_10000=4891.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0197 grad_action_out_proj_arms=0.1916 grad_arm_token_fuse=0.1080 grad_shared_expert=0.5001 (18633:train_pytorch.py:850) + Training: 21%|██ | 2120/10000 [29:04<1:25:18, 1.54it/s, loss=0.0390, lr=2.34e-05, step=2119] Training: 21%|██ | 2120/10000 [29:04<1:25:18, 1.54it/s, loss=0.0657, lr=2.34e-05, step=2120] Training: 21%|██ | 2121/10000 [29:05<1:36:56, 1.35it/s, loss=0.0657, lr=2.34e-05, step=2120] Training: 21%|██ | 2121/10000 [29:05<1:36:56, 1.35it/s, loss=0.0239, lr=2.34e-05, step=2121] Training: 21%|██ | 2122/10000 [29:05<1:34:16, 1.39it/s, loss=0.0239, lr=2.34e-05, step=2121] Training: 21%|██ | 2122/10000 [29:05<1:34:16, 1.39it/s, loss=0.0286, lr=2.34e-05, step=2122] Training: 21%|██ | 2123/10000 [29:06<1:40:48, 1.30it/s, loss=0.0286, lr=2.34e-05, step=2122] Training: 21%|██ | 2123/10000 [29:06<1:40:48, 1.30it/s, loss=0.0349, lr=2.34e-05, step=2123] Training: 21%|██ | 2124/10000 [29:07<1:30:31, 1.45it/s, loss=0.0349, lr=2.34e-05, step=2123] Training: 21%|██ | 2124/10000 [29:07<1:30:31, 1.45it/s, loss=0.0366, lr=2.34e-05, step=2124] Training: 21%|██▏ | 2125/10000 [29:07<1:32:22, 1.42it/s, loss=0.0366, lr=2.34e-05, step=2124] Training: 21%|██▏ | 2125/10000 [29:07<1:32:22, 1.42it/s, loss=0.0163, lr=2.34e-05, step=2125] Training: 21%|██▏ | 2126/10000 [29:08<1:35:55, 1.37it/s, loss=0.0163, lr=2.34e-05, step=2125] Training: 21%|██▏ | 2126/10000 [29:08<1:35:55, 1.37it/s, loss=0.0211, lr=2.34e-05, step=2126] Training: 21%|██▏ | 2127/10000 [29:09<1:39:16, 1.32it/s, loss=0.0211, lr=2.34e-05, step=2126] Training: 21%|██▏ | 2127/10000 [29:09<1:39:16, 1.32it/s, loss=0.0158, lr=2.34e-05, step=2127] Training: 21%|██▏ | 2128/10000 [29:10<1:51:51, 1.17it/s, loss=0.0158, lr=2.34e-05, step=2127] Training: 21%|██▏ | 2128/10000 [29:10<1:51:51, 1.17it/s, loss=0.0122, lr=2.34e-05, step=2128] Training: 21%|██▏ | 2129/10000 [29:11<1:47:34, 1.22it/s, loss=0.0122, lr=2.34e-05, step=2128] Training: 21%|██▏ | 2129/10000 [29:11<1:47:34, 1.22it/s, loss=0.0166, lr=2.34e-05, step=2129]19:13:43.841 [I] step=2130 loss=0.0125 smoothed_loss=0.0240 lr=2.34e-05 grad_norm=0.6547 step_time=0.6337s data_time=0.1454s it/s=1.284 eta_to_10000=6131.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0162 grad_action_out_proj_arms=0.1790 grad_arm_token_fuse=0.0867 grad_shared_expert=0.5061 (18633:train_pytorch.py:850) + Training: 21%|██▏ | 2130/10000 [29:12<1:39:33, 1.32it/s, loss=0.0166, lr=2.34e-05, step=2129] Training: 21%|██▏ | 2130/10000 [29:12<1:39:33, 1.32it/s, loss=0.0125, lr=2.34e-05, step=2130] Training: 21%|██▏ | 2131/10000 [29:12<1:46:11, 1.23it/s, loss=0.0125, lr=2.34e-05, step=2130] Training: 21%|██▏ | 2131/10000 [29:12<1:46:11, 1.23it/s, loss=0.0618, lr=2.34e-05, step=2131] Training: 21%|██▏ | 2132/10000 [29:13<1:43:07, 1.27it/s, loss=0.0618, lr=2.34e-05, step=2131] Training: 21%|██▏ | 2132/10000 [29:13<1:43:07, 1.27it/s, loss=0.0156, lr=2.34e-05, step=2132] Training: 21%|██▏ | 2133/10000 [29:14<1:38:54, 1.33it/s, loss=0.0156, lr=2.34e-05, step=2132] Training: 21%|██▏ | 2133/10000 [29:14<1:38:54, 1.33it/s, loss=0.0162, lr=2.34e-05, step=2133] Training: 21%|██▏ | 2134/10000 [29:15<1:40:57, 1.30it/s, loss=0.0162, lr=2.34e-05, step=2133] Training: 21%|██▏ | 2134/10000 [29:15<1:40:57, 1.30it/s, loss=0.0179, lr=2.34e-05, step=2134] Training: 21%|██▏ | 2135/10000 [29:15<1:31:01, 1.44it/s, loss=0.0179, lr=2.34e-05, step=2134] Training: 21%|██▏ | 2135/10000 [29:15<1:31:01, 1.44it/s, loss=0.0244, lr=2.34e-05, step=2135] Training: 21%|██▏ | 2136/10000 [29:16<1:48:06, 1.21it/s, loss=0.0244, lr=2.34e-05, step=2135] Training: 21%|██▏ | 2136/10000 [29:16<1:48:06, 1.21it/s, loss=0.0129, lr=2.34e-05, step=2136] Training: 21%|██▏ | 2137/10000 [29:17<1:37:15, 1.35it/s, loss=0.0129, lr=2.34e-05, step=2136] Training: 21%|██▏ | 2137/10000 [29:17<1:37:15, 1.35it/s, loss=0.0086, lr=2.34e-05, step=2137] Training: 21%|██▏ | 2138/10000 [29:18<1:41:54, 1.29it/s, loss=0.0086, lr=2.34e-05, step=2137] Training: 21%|██▏ | 2138/10000 [29:18<1:41:54, 1.29it/s, loss=0.0104, lr=2.34e-05, step=2138] Training: 21%|██▏ | 2139/10000 [29:18<1:40:51, 1.30it/s, loss=0.0104, lr=2.34e-05, step=2138] Training: 21%|██▏ | 2139/10000 [29:18<1:40:51, 1.30it/s, loss=0.0294, lr=2.34e-05, step=2139]19:13:51.389 [I] step=2140 loss=0.0206 smoothed_loss=0.0216 lr=2.34e-05 grad_norm=0.5490 step_time=0.5957s data_time=0.1591s it/s=1.325 eta_to_10000=5931.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1737 grad_arm_token_fuse=0.0640 grad_shared_expert=0.5218 (18633:train_pytorch.py:850) + Training: 21%|██▏ | 2140/10000 [29:19<1:33:52, 1.40it/s, loss=0.0294, lr=2.34e-05, step=2139] Training: 21%|██▏ | 2140/10000 [29:19<1:33:52, 1.40it/s, loss=0.0206, lr=2.34e-05, step=2140] Training: 21%|██▏ | 2141/10000 [29:20<1:26:04, 1.52it/s, loss=0.0206, lr=2.34e-05, step=2140] Training: 21%|██▏ | 2141/10000 [29:20<1:26:04, 1.52it/s, loss=0.0307, lr=2.34e-05, step=2141] Training: 21%|██▏ | 2142/10000 [29:20<1:25:19, 1.53it/s, loss=0.0307, lr=2.34e-05, step=2141] Training: 21%|██▏ | 2142/10000 [29:20<1:25:19, 1.53it/s, loss=0.0105, lr=2.34e-05, step=2142] Training: 21%|██▏ | 2143/10000 [29:21<1:39:15, 1.32it/s, loss=0.0105, lr=2.34e-05, step=2142] Training: 21%|██▏ | 2143/10000 [29:21<1:39:15, 1.32it/s, loss=0.0291, lr=2.34e-05, step=2143] Training: 21%|██▏ | 2144/10000 [29:22<1:45:37, 1.24it/s, loss=0.0291, lr=2.34e-05, step=2143] Training: 21%|██▏ | 2144/10000 [29:22<1:45:37, 1.24it/s, loss=0.0117, lr=2.34e-05, step=2144] Training: 21%|██▏ | 2145/10000 [29:23<1:33:51, 1.39it/s, loss=0.0117, lr=2.34e-05, step=2144] Training: 21%|██▏ | 2145/10000 [29:23<1:33:51, 1.39it/s, loss=0.0339, lr=2.34e-05, step=2145] Training: 21%|██▏ | 2146/10000 [29:23<1:35:28, 1.37it/s, loss=0.0339, lr=2.34e-05, step=2145] Training: 21%|██▏ | 2146/10000 [29:23<1:35:28, 1.37it/s, loss=0.0331, lr=2.34e-05, step=2146] Training: 21%|██▏ | 2147/10000 [29:24<1:39:44, 1.31it/s, loss=0.0331, lr=2.34e-05, step=2146] Training: 21%|██▏ | 2147/10000 [29:24<1:39:44, 1.31it/s, loss=0.0269, lr=2.34e-05, step=2147] Training: 21%|██▏ | 2148/10000 [29:25<1:34:37, 1.38it/s, loss=0.0269, lr=2.34e-05, step=2147] Training: 21%|██▏ | 2148/10000 [29:25<1:34:37, 1.38it/s, loss=0.0079, lr=2.34e-05, step=2148] Training: 21%|██▏ | 2149/10000 [29:26<1:30:52, 1.44it/s, loss=0.0079, lr=2.34e-05, step=2148] Training: 21%|██▏ | 2149/10000 [29:26<1:30:52, 1.44it/s, loss=0.0360, lr=2.34e-05, step=2149]19:13:58.905 [I] step=2150 loss=0.0179 smoothed_loss=0.0230 lr=2.34e-05 grad_norm=0.5342 step_time=0.6072s data_time=0.1444s it/s=1.331 eta_to_10000=5898.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0214 grad_action_out_proj_arms=0.1827 grad_arm_token_fuse=0.1233 grad_shared_expert=0.4606 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2150/10000 [29:27<1:45:31, 1.24it/s, loss=0.0360, lr=2.34e-05, step=2149] Training: 22%|██▏ | 2150/10000 [29:27<1:45:31, 1.24it/s, loss=0.0179, lr=2.34e-05, step=2150] Training: 22%|██▏ | 2151/10000 [29:27<1:34:23, 1.39it/s, loss=0.0179, lr=2.34e-05, step=2150] Training: 22%|██▏ | 2151/10000 [29:27<1:34:23, 1.39it/s, loss=0.0277, lr=2.34e-05, step=2151] Training: 22%|██▏ | 2152/10000 [29:28<1:35:22, 1.37it/s, loss=0.0277, lr=2.34e-05, step=2151] Training: 22%|██▏ | 2152/10000 [29:28<1:35:22, 1.37it/s, loss=0.0187, lr=2.34e-05, step=2152] Training: 22%|██▏ | 2153/10000 [29:29<1:38:16, 1.33it/s, loss=0.0187, lr=2.34e-05, step=2152] Training: 22%|██▏ | 2153/10000 [29:29<1:38:16, 1.33it/s, loss=0.0260, lr=2.34e-05, step=2153] Training: 22%|██▏ | 2154/10000 [29:29<1:31:00, 1.44it/s, loss=0.0260, lr=2.34e-05, step=2153] Training: 22%|██▏ | 2154/10000 [29:29<1:31:00, 1.44it/s, loss=0.0113, lr=2.34e-05, step=2154] Training: 22%|██▏ | 2155/10000 [29:30<1:36:21, 1.36it/s, loss=0.0113, lr=2.34e-05, step=2154] Training: 22%|██▏ | 2155/10000 [29:30<1:36:21, 1.36it/s, loss=0.0914, lr=2.34e-05, step=2155] Training: 22%|██▏ | 2156/10000 [29:31<1:37:53, 1.34it/s, loss=0.0914, lr=2.34e-05, step=2155] Training: 22%|██▏ | 2156/10000 [29:31<1:37:53, 1.34it/s, loss=0.0401, lr=2.34e-05, step=2156] Training: 22%|██▏ | 2157/10000 [29:32<1:53:46, 1.15it/s, loss=0.0401, lr=2.34e-05, step=2156] Training: 22%|██▏ | 2157/10000 [29:32<1:53:46, 1.15it/s, loss=0.0210, lr=2.34e-05, step=2157] Training: 22%|██▏ | 2158/10000 [29:33<1:54:06, 1.15it/s, loss=0.0210, lr=2.34e-05, step=2157] Training: 22%|██▏ | 2158/10000 [29:33<1:54:06, 1.15it/s, loss=0.0250, lr=2.34e-05, step=2158] Training: 22%|██▏ | 2159/10000 [29:34<1:59:16, 1.10it/s, loss=0.0250, lr=2.34e-05, step=2158] Training: 22%|██▏ | 2159/10000 [29:34<1:59:16, 1.10it/s, loss=0.0740, lr=2.34e-05, step=2159]19:14:07.233 [I] step=2160 loss=0.0183 smoothed_loss=0.0318 lr=2.34e-05 grad_norm=0.5424 step_time=0.6463s data_time=0.1865s it/s=1.201 eta_to_10000=6528.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0192 grad_action_out_proj_arms=0.1966 grad_arm_token_fuse=0.1026 grad_shared_expert=0.4306 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2160/10000 [29:35<2:04:16, 1.05it/s, loss=0.0740, lr=2.34e-05, step=2159] Training: 22%|██▏ | 2160/10000 [29:35<2:04:16, 1.05it/s, loss=0.0183, lr=2.33e-05, step=2160] Training: 22%|██▏ | 2161/10000 [29:36<2:01:34, 1.07it/s, loss=0.0183, lr=2.33e-05, step=2160] Training: 22%|██▏ | 2161/10000 [29:36<2:01:34, 1.07it/s, loss=0.0953, lr=2.33e-05, step=2161] Training: 22%|██▏ | 2162/10000 [29:36<1:48:30, 1.20it/s, loss=0.0953, lr=2.33e-05, step=2161] Training: 22%|██▏ | 2162/10000 [29:36<1:48:30, 1.20it/s, loss=0.0091, lr=2.33e-05, step=2162] Training: 22%|██▏ | 2163/10000 [29:37<1:36:25, 1.35it/s, loss=0.0091, lr=2.33e-05, step=2162] Training: 22%|██▏ | 2163/10000 [29:37<1:36:25, 1.35it/s, loss=0.0272, lr=2.33e-05, step=2163] Training: 22%|██▏ | 2164/10000 [29:38<1:39:39, 1.31it/s, loss=0.0272, lr=2.33e-05, step=2163] Training: 22%|██▏ | 2164/10000 [29:38<1:39:39, 1.31it/s, loss=0.0362, lr=2.33e-05, step=2164] Training: 22%|██▏ | 2165/10000 [29:38<1:30:46, 1.44it/s, loss=0.0362, lr=2.33e-05, step=2164] Training: 22%|██▏ | 2165/10000 [29:38<1:30:46, 1.44it/s, loss=0.0122, lr=2.33e-05, step=2165] Training: 22%|██▏ | 2166/10000 [29:39<1:23:25, 1.57it/s, loss=0.0122, lr=2.33e-05, step=2165] Training: 22%|██▏ | 2166/10000 [29:39<1:23:25, 1.57it/s, loss=0.0189, lr=2.33e-05, step=2166] Training: 22%|██▏ | 2167/10000 [29:40<1:28:09, 1.48it/s, loss=0.0189, lr=2.33e-05, step=2166] Training: 22%|██▏ | 2167/10000 [29:40<1:28:09, 1.48it/s, loss=0.0136, lr=2.33e-05, step=2167] Training: 22%|██▏ | 2168/10000 [29:40<1:33:54, 1.39it/s, loss=0.0136, lr=2.33e-05, step=2167] Training: 22%|██▏ | 2168/10000 [29:40<1:33:54, 1.39it/s, loss=0.0131, lr=2.33e-05, step=2168] Training: 22%|██▏ | 2169/10000 [29:41<1:26:43, 1.50it/s, loss=0.0131, lr=2.33e-05, step=2168] Training: 22%|██▏ | 2169/10000 [29:41<1:26:43, 1.50it/s, loss=0.0178, lr=2.33e-05, step=2169]19:14:14.105 [I] step=2170 loss=0.0220 smoothed_loss=0.0262 lr=2.33e-05 grad_norm=0.5086 step_time=0.5624s data_time=0.1247s it/s=1.455 eta_to_10000=5379.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0156 grad_action_out_proj_arms=0.1405 grad_arm_token_fuse=0.0788 grad_shared_expert=0.4417 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2170/10000 [29:42<1:35:20, 1.37it/s, loss=0.0178, lr=2.33e-05, step=2169] Training: 22%|██▏ | 2170/10000 [29:42<1:35:20, 1.37it/s, loss=0.0220, lr=2.33e-05, step=2170] Training: 22%|██▏ | 2171/10000 [29:43<1:51:31, 1.17it/s, loss=0.0220, lr=2.33e-05, step=2170] Training: 22%|██▏ | 2171/10000 [29:43<1:51:31, 1.17it/s, loss=0.0190, lr=2.33e-05, step=2171] Training: 22%|██▏ | 2172/10000 [29:43<1:40:44, 1.30it/s, loss=0.0190, lr=2.33e-05, step=2171] Training: 22%|██▏ | 2172/10000 [29:44<1:40:44, 1.30it/s, loss=0.0093, lr=2.33e-05, step=2172] Training: 22%|██▏ | 2173/10000 [29:44<1:36:00, 1.36it/s, loss=0.0093, lr=2.33e-05, step=2172] Training: 22%|██▏ | 2173/10000 [29:44<1:36:00, 1.36it/s, loss=0.0565, lr=2.33e-05, step=2173] Training: 22%|██▏ | 2174/10000 [29:45<1:29:04, 1.46it/s, loss=0.0565, lr=2.33e-05, step=2173] Training: 22%|██▏ | 2174/10000 [29:45<1:29:04, 1.46it/s, loss=0.0280, lr=2.33e-05, step=2174] Training: 22%|██▏ | 2175/10000 [29:46<1:35:19, 1.37it/s, loss=0.0280, lr=2.33e-05, step=2174] Training: 22%|██▏ | 2175/10000 [29:46<1:35:19, 1.37it/s, loss=0.0265, lr=2.33e-05, step=2175] Training: 22%|██▏ | 2176/10000 [29:46<1:40:22, 1.30it/s, loss=0.0265, lr=2.33e-05, step=2175] Training: 22%|██▏ | 2176/10000 [29:46<1:40:22, 1.30it/s, loss=0.0123, lr=2.33e-05, step=2176] Training: 22%|██▏ | 2177/10000 [29:47<1:39:22, 1.31it/s, loss=0.0123, lr=2.33e-05, step=2176] Training: 22%|██▏ | 2177/10000 [29:47<1:39:22, 1.31it/s, loss=0.0244, lr=2.33e-05, step=2177] Training: 22%|██▏ | 2178/10000 [29:48<1:53:26, 1.15it/s, loss=0.0244, lr=2.33e-05, step=2177] Training: 22%|██▏ | 2178/10000 [29:48<1:53:26, 1.15it/s, loss=0.0169, lr=2.33e-05, step=2178] Training: 22%|██▏ | 2179/10000 [29:49<1:55:50, 1.13it/s, loss=0.0169, lr=2.33e-05, step=2178] Training: 22%|██▏ | 2179/10000 [29:49<1:55:50, 1.13it/s, loss=0.0233, lr=2.33e-05, step=2179]19:14:22.139 [I] step=2180 loss=0.0104 smoothed_loss=0.0231 lr=2.33e-05 grad_norm=0.5984 step_time=0.6254s data_time=0.1781s it/s=1.245 eta_to_10000=6282.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0143 grad_action_out_proj_arms=0.1549 grad_arm_token_fuse=0.0723 grad_shared_expert=0.5079 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2180/10000 [29:50<1:44:28, 1.25it/s, loss=0.0233, lr=2.33e-05, step=2179] Training: 22%|██▏ | 2180/10000 [29:50<1:44:28, 1.25it/s, loss=0.0104, lr=2.33e-05, step=2180] Training: 22%|██▏ | 2181/10000 [29:51<1:40:52, 1.29it/s, loss=0.0104, lr=2.33e-05, step=2180] Training: 22%|██▏ | 2181/10000 [29:51<1:40:52, 1.29it/s, loss=0.0227, lr=2.33e-05, step=2181] Training: 22%|██▏ | 2182/10000 [29:51<1:35:17, 1.37it/s, loss=0.0227, lr=2.33e-05, step=2181] Training: 22%|██▏ | 2182/10000 [29:51<1:35:17, 1.37it/s, loss=0.0235, lr=2.33e-05, step=2182] Training: 22%|██▏ | 2183/10000 [29:52<1:42:36, 1.27it/s, loss=0.0235, lr=2.33e-05, step=2182] Training: 22%|██▏ | 2183/10000 [29:52<1:42:36, 1.27it/s, loss=0.0279, lr=2.33e-05, step=2183] Training: 22%|██▏ | 2184/10000 [29:53<1:38:46, 1.32it/s, loss=0.0279, lr=2.33e-05, step=2183] Training: 22%|██▏ | 2184/10000 [29:53<1:38:46, 1.32it/s, loss=0.0132, lr=2.33e-05, step=2184] Training: 22%|██▏ | 2185/10000 [29:53<1:28:13, 1.48it/s, loss=0.0132, lr=2.33e-05, step=2184] Training: 22%|██▏ | 2185/10000 [29:53<1:28:13, 1.48it/s, loss=0.0107, lr=2.33e-05, step=2185] Training: 22%|██▏ | 2186/10000 [29:54<1:32:32, 1.41it/s, loss=0.0107, lr=2.33e-05, step=2185] Training: 22%|██▏ | 2186/10000 [29:54<1:32:32, 1.41it/s, loss=0.0160, lr=2.33e-05, step=2186] Training: 22%|██▏ | 2187/10000 [29:55<1:24:17, 1.54it/s, loss=0.0160, lr=2.33e-05, step=2186] Training: 22%|██▏ | 2187/10000 [29:55<1:24:17, 1.54it/s, loss=0.1117, lr=2.33e-05, step=2187] Training: 22%|██▏ | 2188/10000 [29:55<1:18:59, 1.65it/s, loss=0.1117, lr=2.33e-05, step=2187] Training: 22%|██▏ | 2188/10000 [29:55<1:18:59, 1.65it/s, loss=0.0273, lr=2.33e-05, step=2188] Training: 22%|██▏ | 2189/10000 [29:56<1:18:16, 1.66it/s, loss=0.0273, lr=2.33e-05, step=2188] Training: 22%|██▏ | 2189/10000 [29:56<1:18:16, 1.66it/s, loss=0.0524, lr=2.33e-05, step=2189]19:14:28.777 [I] step=2190 loss=0.0074 smoothed_loss=0.0295 lr=2.33e-05 grad_norm=0.5972 step_time=0.5512s data_time=0.1125s it/s=1.507 eta_to_10000=5182.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0062 grad_action_out_proj_arms=0.0821 grad_arm_token_fuse=0.0302 grad_shared_expert=0.2958 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2190/10000 [29:56<1:26:25, 1.51it/s, loss=0.0524, lr=2.33e-05, step=2189] Training: 22%|██▏ | 2190/10000 [29:56<1:26:25, 1.51it/s, loss=0.0074, lr=2.33e-05, step=2190] Training: 22%|██▏ | 2191/10000 [29:57<1:28:57, 1.46it/s, loss=0.0074, lr=2.33e-05, step=2190] Training: 22%|██▏ | 2191/10000 [29:57<1:28:57, 1.46it/s, loss=0.0716, lr=2.33e-05, step=2191] Training: 22%|██▏ | 2192/10000 [29:58<1:27:21, 1.49it/s, loss=0.0716, lr=2.33e-05, step=2191] Training: 22%|██▏ | 2192/10000 [29:58<1:27:21, 1.49it/s, loss=0.0275, lr=2.33e-05, step=2192] Training: 22%|██▏ | 2193/10000 [29:59<1:32:21, 1.41it/s, loss=0.0275, lr=2.33e-05, step=2192] Training: 22%|██▏ | 2193/10000 [29:59<1:32:21, 1.41it/s, loss=0.0436, lr=2.33e-05, step=2193] Training: 22%|██▏ | 2194/10000 [29:59<1:25:17, 1.53it/s, loss=0.0436, lr=2.33e-05, step=2193] Training: 22%|██▏ | 2194/10000 [29:59<1:25:17, 1.53it/s, loss=0.0085, lr=2.33e-05, step=2194] Training: 22%|██▏ | 2195/10000 [30:00<1:22:02, 1.59it/s, loss=0.0085, lr=2.33e-05, step=2194] Training: 22%|██▏ | 2195/10000 [30:00<1:22:02, 1.59it/s, loss=0.0161, lr=2.33e-05, step=2195] Training: 22%|██▏ | 2196/10000 [30:00<1:24:26, 1.54it/s, loss=0.0161, lr=2.33e-05, step=2195] Training: 22%|██▏ | 2196/10000 [30:00<1:24:26, 1.54it/s, loss=0.0314, lr=2.33e-05, step=2196] Training: 22%|██▏ | 2197/10000 [30:01<1:18:55, 1.65it/s, loss=0.0314, lr=2.33e-05, step=2196] Training: 22%|██▏ | 2197/10000 [30:01<1:18:55, 1.65it/s, loss=0.0129, lr=2.33e-05, step=2197] Training: 22%|██▏ | 2198/10000 [30:02<1:35:46, 1.36it/s, loss=0.0129, lr=2.33e-05, step=2197] Training: 22%|██▏ | 2198/10000 [30:02<1:35:46, 1.36it/s, loss=0.0158, lr=2.33e-05, step=2198] Training: 22%|██▏ | 2199/10000 [30:03<1:29:30, 1.45it/s, loss=0.0158, lr=2.33e-05, step=2198] Training: 22%|██▏ | 2199/10000 [30:03<1:29:30, 1.45it/s, loss=0.0084, lr=2.33e-05, step=2199]19:14:35.689 [I] step=2200 loss=0.0297 smoothed_loss=0.0257 lr=2.33e-05 grad_norm=0.6252 step_time=0.5716s data_time=0.1196s it/s=1.447 eta_to_10000=5390.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0269 grad_action_out_proj_arms=0.2761 grad_arm_token_fuse=0.1414 grad_shared_expert=0.8229 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2200/10000 [30:03<1:34:45, 1.37it/s, loss=0.0084, lr=2.33e-05, step=2199] Training: 22%|██▏ | 2200/10000 [30:03<1:34:45, 1.37it/s, loss=0.0297, lr=2.33e-05, step=2200] Training: 22%|██▏ | 2201/10000 [30:04<1:40:21, 1.30it/s, loss=0.0297, lr=2.33e-05, step=2200] Training: 22%|██▏ | 2201/10000 [30:04<1:40:21, 1.30it/s, loss=0.0121, lr=2.33e-05, step=2201] Training: 22%|██▏ | 2202/10000 [30:05<1:45:29, 1.23it/s, loss=0.0121, lr=2.33e-05, step=2201] Training: 22%|██▏ | 2202/10000 [30:05<1:45:29, 1.23it/s, loss=0.0109, lr=2.33e-05, step=2202] Training: 22%|██▏ | 2203/10000 [30:06<1:49:03, 1.19it/s, loss=0.0109, lr=2.33e-05, step=2202] Training: 22%|██▏ | 2203/10000 [30:06<1:49:03, 1.19it/s, loss=0.0545, lr=2.33e-05, step=2203] Training: 22%|██▏ | 2204/10000 [30:07<1:37:42, 1.33it/s, loss=0.0545, lr=2.33e-05, step=2203] Training: 22%|██▏ | 2204/10000 [30:07<1:37:42, 1.33it/s, loss=0.0459, lr=2.33e-05, step=2204] Training: 22%|██▏ | 2205/10000 [30:08<1:49:27, 1.19it/s, loss=0.0459, lr=2.33e-05, step=2204] Training: 22%|██▏ | 2205/10000 [30:08<1:49:27, 1.19it/s, loss=0.0351, lr=2.33e-05, step=2205] Training: 22%|██▏ | 2206/10000 [30:08<1:36:47, 1.34it/s, loss=0.0351, lr=2.33e-05, step=2205] Training: 22%|██▏ | 2206/10000 [30:08<1:36:47, 1.34it/s, loss=0.0509, lr=2.33e-05, step=2206] Training: 22%|██▏ | 2207/10000 [30:09<1:40:10, 1.30it/s, loss=0.0509, lr=2.33e-05, step=2206] Training: 22%|██▏ | 2207/10000 [30:09<1:40:10, 1.30it/s, loss=0.0314, lr=2.33e-05, step=2207] Training: 22%|██▏ | 2208/10000 [30:10<1:45:19, 1.23it/s, loss=0.0314, lr=2.33e-05, step=2207] Training: 22%|██▏ | 2208/10000 [30:10<1:45:19, 1.23it/s, loss=0.0036, lr=2.33e-05, step=2208] Training: 22%|██▏ | 2209/10000 [30:11<1:43:55, 1.25it/s, loss=0.0036, lr=2.33e-05, step=2208] Training: 22%|██▏ | 2209/10000 [30:11<1:43:55, 1.25it/s, loss=0.0333, lr=2.33e-05, step=2209]19:14:43.530 [I] step=2210 loss=0.0048 smoothed_loss=0.0264 lr=2.33e-05 grad_norm=0.5325 step_time=0.6079s data_time=0.1761s it/s=1.276 eta_to_10000=6107.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0171 grad_action_out_proj_arms=0.1671 grad_arm_token_fuse=0.0940 grad_shared_expert=0.5297 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2210/10000 [30:11<1:33:19, 1.39it/s, loss=0.0333, lr=2.33e-05, step=2209] Training: 22%|██▏ | 2210/10000 [30:11<1:33:19, 1.39it/s, loss=0.0048, lr=2.33e-05, step=2210] Training: 22%|██▏ | 2211/10000 [30:12<1:29:32, 1.45it/s, loss=0.0048, lr=2.33e-05, step=2210] Training: 22%|██▏ | 2211/10000 [30:12<1:29:32, 1.45it/s, loss=0.0151, lr=2.32e-05, step=2211] Training: 22%|██▏ | 2212/10000 [30:13<1:35:09, 1.36it/s, loss=0.0151, lr=2.32e-05, step=2211] Training: 22%|██▏ | 2212/10000 [30:13<1:35:09, 1.36it/s, loss=0.0071, lr=2.32e-05, step=2212] Training: 22%|██▏ | 2213/10000 [30:13<1:34:51, 1.37it/s, loss=0.0071, lr=2.32e-05, step=2212] Training: 22%|██▏ | 2213/10000 [30:13<1:34:51, 1.37it/s, loss=0.0400, lr=2.32e-05, step=2213] Training: 22%|██▏ | 2214/10000 [30:15<1:53:09, 1.15it/s, loss=0.0400, lr=2.32e-05, step=2213] Training: 22%|██▏ | 2214/10000 [30:15<1:53:09, 1.15it/s, loss=0.0218, lr=2.32e-05, step=2214] Training: 22%|██▏ | 2215/10000 [30:15<1:44:39, 1.24it/s, loss=0.0218, lr=2.32e-05, step=2214] Training: 22%|██▏ | 2215/10000 [30:15<1:44:39, 1.24it/s, loss=0.0220, lr=2.32e-05, step=2215] Training: 22%|██▏ | 2216/10000 [30:16<1:34:07, 1.38it/s, loss=0.0220, lr=2.32e-05, step=2215] Training: 22%|██▏ | 2216/10000 [30:16<1:34:07, 1.38it/s, loss=0.0223, lr=2.32e-05, step=2216] Training: 22%|██▏ | 2217/10000 [30:16<1:33:44, 1.38it/s, loss=0.0223, lr=2.32e-05, step=2216] Training: 22%|██▏ | 2217/10000 [30:16<1:33:44, 1.38it/s, loss=0.0721, lr=2.32e-05, step=2217] Training: 22%|██▏ | 2218/10000 [30:17<1:33:42, 1.38it/s, loss=0.0721, lr=2.32e-05, step=2217] Training: 22%|██▏ | 2218/10000 [30:17<1:33:42, 1.38it/s, loss=0.0210, lr=2.32e-05, step=2218] Training: 22%|██▏ | 2219/10000 [30:18<1:41:59, 1.27it/s, loss=0.0210, lr=2.32e-05, step=2218] Training: 22%|██▏ | 2219/10000 [30:18<1:41:59, 1.27it/s, loss=0.0308, lr=2.32e-05, step=2219]19:14:51.154 [I] step=2220 loss=0.0342 smoothed_loss=0.0291 lr=2.32e-05 grad_norm=0.5975 step_time=0.5874s data_time=0.1751s it/s=1.312 eta_to_10000=5930.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0143 grad_action_out_proj_arms=0.1959 grad_arm_token_fuse=0.0758 grad_shared_expert=0.5077 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2220/10000 [30:19<1:37:42, 1.33it/s, loss=0.0308, lr=2.32e-05, step=2219] Training: 22%|██▏ | 2220/10000 [30:19<1:37:42, 1.33it/s, loss=0.0342, lr=2.32e-05, step=2220] Training: 22%|██▏ | 2221/10000 [30:20<1:52:07, 1.16it/s, loss=0.0342, lr=2.32e-05, step=2220] Training: 22%|██▏ | 2221/10000 [30:20<1:52:07, 1.16it/s, loss=0.0114, lr=2.32e-05, step=2221] Training: 22%|██▏ | 2222/10000 [30:21<1:54:22, 1.13it/s, loss=0.0114, lr=2.32e-05, step=2221] Training: 22%|██▏ | 2222/10000 [30:21<1:54:22, 1.13it/s, loss=0.0452, lr=2.32e-05, step=2222] Training: 22%|██▏ | 2223/10000 [30:21<1:39:27, 1.30it/s, loss=0.0452, lr=2.32e-05, step=2222] Training: 22%|██▏ | 2223/10000 [30:21<1:39:27, 1.30it/s, loss=0.0130, lr=2.32e-05, step=2223] Training: 22%|██▏ | 2224/10000 [30:22<1:32:05, 1.41it/s, loss=0.0130, lr=2.32e-05, step=2223] Training: 22%|██▏ | 2224/10000 [30:22<1:32:05, 1.41it/s, loss=0.0187, lr=2.32e-05, step=2224] Training: 22%|██▏ | 2225/10000 [30:23<1:34:41, 1.37it/s, loss=0.0187, lr=2.32e-05, step=2224] Training: 22%|██▏ | 2225/10000 [30:23<1:34:41, 1.37it/s, loss=0.0854, lr=2.32e-05, step=2225] Training: 22%|██▏ | 2226/10000 [30:24<1:47:54, 1.20it/s, loss=0.0854, lr=2.32e-05, step=2225] Training: 22%|██▏ | 2226/10000 [30:24<1:47:54, 1.20it/s, loss=0.0296, lr=2.32e-05, step=2226] Training: 22%|██▏ | 2227/10000 [30:25<1:46:21, 1.22it/s, loss=0.0296, lr=2.32e-05, step=2226] Training: 22%|██▏ | 2227/10000 [30:25<1:46:21, 1.22it/s, loss=0.0185, lr=2.32e-05, step=2227] Training: 22%|██▏ | 2228/10000 [30:26<1:56:52, 1.11it/s, loss=0.0185, lr=2.32e-05, step=2227] Training: 22%|██▏ | 2228/10000 [30:26<1:56:52, 1.11it/s, loss=0.0199, lr=2.32e-05, step=2228] Training: 22%|██▏ | 2229/10000 [30:27<1:54:31, 1.13it/s, loss=0.0199, lr=2.32e-05, step=2228] Training: 22%|██▏ | 2229/10000 [30:27<1:54:31, 1.13it/s, loss=0.0126, lr=2.32e-05, step=2229]19:14:59.383 [I] step=2230 loss=0.0299 smoothed_loss=0.0282 lr=2.32e-05 grad_norm=0.6065 step_time=0.6168s data_time=0.2061s it/s=1.215 eta_to_10000=6393.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.2058 grad_arm_token_fuse=0.0755 grad_shared_expert=0.6015 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2230/10000 [30:27<1:40:41, 1.29it/s, loss=0.0126, lr=2.32e-05, step=2229] Training: 22%|██▏ | 2230/10000 [30:27<1:40:41, 1.29it/s, loss=0.0299, lr=2.32e-05, step=2230] Training: 22%|██▏ | 2231/10000 [30:28<1:39:38, 1.30it/s, loss=0.0299, lr=2.32e-05, step=2230] Training: 22%|██▏ | 2231/10000 [30:28<1:39:38, 1.30it/s, loss=0.0237, lr=2.32e-05, step=2231] Training: 22%|██▏ | 2232/10000 [30:29<1:40:21, 1.29it/s, loss=0.0237, lr=2.32e-05, step=2231] Training: 22%|██▏ | 2232/10000 [30:29<1:40:21, 1.29it/s, loss=0.0335, lr=2.32e-05, step=2232] Training: 22%|██▏ | 2233/10000 [30:30<1:48:53, 1.19it/s, loss=0.0335, lr=2.32e-05, step=2232] Training: 22%|██▏ | 2233/10000 [30:30<1:48:53, 1.19it/s, loss=0.0167, lr=2.32e-05, step=2233] Training: 22%|██▏ | 2234/10000 [30:30<1:36:08, 1.35it/s, loss=0.0167, lr=2.32e-05, step=2233] Training: 22%|██▏ | 2234/10000 [30:30<1:36:08, 1.35it/s, loss=0.0121, lr=2.32e-05, step=2234] Training: 22%|██▏ | 2235/10000 [30:31<1:30:12, 1.43it/s, loss=0.0121, lr=2.32e-05, step=2234] Training: 22%|██▏ | 2235/10000 [30:31<1:30:12, 1.43it/s, loss=0.0285, lr=2.32e-05, step=2235] Training: 22%|██▏ | 2236/10000 [30:32<1:43:51, 1.25it/s, loss=0.0285, lr=2.32e-05, step=2235] Training: 22%|██▏ | 2236/10000 [30:32<1:43:51, 1.25it/s, loss=0.0442, lr=2.32e-05, step=2236] Training: 22%|██▏ | 2237/10000 [30:33<1:43:28, 1.25it/s, loss=0.0442, lr=2.32e-05, step=2236] Training: 22%|██▏ | 2237/10000 [30:33<1:43:28, 1.25it/s, loss=0.0094, lr=2.32e-05, step=2237] Training: 22%|██▏ | 2238/10000 [30:33<1:43:38, 1.25it/s, loss=0.0094, lr=2.32e-05, step=2237] Training: 22%|██▏ | 2238/10000 [30:33<1:43:38, 1.25it/s, loss=0.0224, lr=2.32e-05, step=2238] Training: 22%|██▏ | 2239/10000 [30:34<1:32:06, 1.40it/s, loss=0.0224, lr=2.32e-05, step=2238] Training: 22%|██▏ | 2239/10000 [30:34<1:32:06, 1.40it/s, loss=0.0074, lr=2.32e-05, step=2239]19:15:07.033 [I] step=2240 loss=0.0333 smoothed_loss=0.0247 lr=2.32e-05 grad_norm=0.5207 step_time=0.5933s data_time=0.1717s it/s=1.307 eta_to_10000=5935.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1505 grad_arm_token_fuse=0.0622 grad_shared_expert=0.4722 (18633:train_pytorch.py:850) + Training: 22%|██▏ | 2240/10000 [30:35<1:37:53, 1.32it/s, loss=0.0074, lr=2.32e-05, step=2239] Training: 22%|██▏ | 2240/10000 [30:35<1:37:53, 1.32it/s, loss=0.0333, lr=2.32e-05, step=2240] Training: 22%|██▏ | 2241/10000 [30:35<1:39:17, 1.30it/s, loss=0.0333, lr=2.32e-05, step=2240] Training: 22%|██▏ | 2241/10000 [30:35<1:39:17, 1.30it/s, loss=0.0660, lr=2.32e-05, step=2241] Training: 22%|██▏ | 2242/10000 [30:36<1:29:37, 1.44it/s, loss=0.0660, lr=2.32e-05, step=2241] Training: 22%|██▏ | 2242/10000 [30:36<1:29:37, 1.44it/s, loss=0.0227, lr=2.32e-05, step=2242] Training: 22%|██▏ | 2243/10000 [30:37<1:33:50, 1.38it/s, loss=0.0227, lr=2.32e-05, step=2242] Training: 22%|██▏ | 2243/10000 [30:37<1:33:50, 1.38it/s, loss=0.0197, lr=2.32e-05, step=2243] Training: 22%|██▏ | 2244/10000 [30:37<1:25:48, 1.51it/s, loss=0.0197, lr=2.32e-05, step=2243] Training: 22%|██▏ | 2244/10000 [30:37<1:25:48, 1.51it/s, loss=0.0174, lr=2.32e-05, step=2244] Training: 22%|██▏ | 2245/10000 [30:38<1:27:14, 1.48it/s, loss=0.0174, lr=2.32e-05, step=2244] Training: 22%|██▏ | 2245/10000 [30:38<1:27:14, 1.48it/s, loss=0.0203, lr=2.32e-05, step=2245] Training: 22%|██▏ | 2246/10000 [30:39<1:21:10, 1.59it/s, loss=0.0203, lr=2.32e-05, step=2245] Training: 22%|██▏ | 2246/10000 [30:39<1:21:10, 1.59it/s, loss=0.0155, lr=2.32e-05, step=2246] Training: 22%|██▏ | 2247/10000 [30:39<1:25:51, 1.51it/s, loss=0.0155, lr=2.32e-05, step=2246] Training: 22%|██▏ | 2247/10000 [30:39<1:25:51, 1.51it/s, loss=0.0300, lr=2.32e-05, step=2247] Training: 22%|██▏ | 2248/10000 [30:40<1:19:22, 1.63it/s, loss=0.0300, lr=2.32e-05, step=2247] Training: 22%|██▏ | 2248/10000 [30:40<1:19:22, 1.63it/s, loss=0.0280, lr=2.32e-05, step=2248] Training: 22%|██▏ | 2249/10000 [30:40<1:15:03, 1.72it/s, loss=0.0280, lr=2.32e-05, step=2248] Training: 22%|██▏ | 2249/10000 [30:40<1:15:03, 1.72it/s, loss=0.0152, lr=2.32e-05, step=2249]19:15:13.470 [I] step=2250 loss=0.0477 smoothed_loss=0.0268 lr=2.32e-05 grad_norm=0.5262 step_time=0.5401s data_time=0.1036s it/s=1.554 eta_to_10000=4987.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1527 grad_arm_token_fuse=0.0598 grad_shared_expert=0.4806 (18633:train_pytorch.py:850) + Training: 22%|██▎ | 2250/10000 [30:41<1:24:49, 1.52it/s, loss=0.0152, lr=2.32e-05, step=2249] Training: 22%|██▎ | 2250/10000 [30:41<1:24:49, 1.52it/s, loss=0.0477, lr=2.32e-05, step=2250] Training: 23%|██▎ | 2251/10000 [30:42<1:20:23, 1.61it/s, loss=0.0477, lr=2.32e-05, step=2250] Training: 23%|██▎ | 2251/10000 [30:42<1:20:23, 1.61it/s, loss=0.0446, lr=2.32e-05, step=2251] Training: 23%|██▎ | 2252/10000 [30:42<1:15:47, 1.70it/s, loss=0.0446, lr=2.32e-05, step=2251] Training: 23%|██▎ | 2252/10000 [30:42<1:15:47, 1.70it/s, loss=0.0148, lr=2.32e-05, step=2252] Training: 23%|██▎ | 2253/10000 [30:43<1:14:56, 1.72it/s, loss=0.0148, lr=2.32e-05, step=2252] Training: 23%|██▎ | 2253/10000 [30:43<1:14:56, 1.72it/s, loss=0.0029, lr=2.32e-05, step=2253] Training: 23%|██▎ | 2254/10000 [30:44<1:22:02, 1.57it/s, loss=0.0029, lr=2.32e-05, step=2253] Training: 23%|██▎ | 2254/10000 [30:44<1:22:02, 1.57it/s, loss=0.0635, lr=2.32e-05, step=2254] Training: 23%|██▎ | 2255/10000 [30:44<1:24:07, 1.53it/s, loss=0.0635, lr=2.32e-05, step=2254] Training: 23%|██▎ | 2255/10000 [30:44<1:24:07, 1.53it/s, loss=0.0106, lr=2.32e-05, step=2255] Training: 23%|██▎ | 2256/10000 [30:45<1:23:19, 1.55it/s, loss=0.0106, lr=2.32e-05, step=2255] Training: 23%|██▎ | 2256/10000 [30:45<1:23:19, 1.55it/s, loss=0.0031, lr=2.32e-05, step=2256] Training: 23%|██▎ | 2257/10000 [30:46<1:42:48, 1.26it/s, loss=0.0031, lr=2.32e-05, step=2256] Training: 23%|██▎ | 2257/10000 [30:46<1:42:48, 1.26it/s, loss=0.0163, lr=2.32e-05, step=2257] Training: 23%|██▎ | 2258/10000 [30:47<1:35:17, 1.35it/s, loss=0.0163, lr=2.32e-05, step=2257] Training: 23%|██▎ | 2258/10000 [30:47<1:35:17, 1.35it/s, loss=0.0403, lr=2.32e-05, step=2258] Training: 23%|██▎ | 2259/10000 [30:47<1:26:49, 1.49it/s, loss=0.0403, lr=2.32e-05, step=2258] Training: 23%|██▎ | 2259/10000 [30:47<1:26:49, 1.49it/s, loss=0.0373, lr=2.32e-05, step=2259]19:15:20.157 [I] step=2260 loss=0.0200 smoothed_loss=0.0259 lr=2.32e-05 grad_norm=0.6786 step_time=0.5509s data_time=0.1178s it/s=1.496 eta_to_10000=5175.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0205 grad_action_out_proj_arms=0.1665 grad_arm_token_fuse=0.1068 grad_shared_expert=0.6506 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2260/10000 [30:48<1:28:35, 1.46it/s, loss=0.0373, lr=2.32e-05, step=2259] Training: 23%|██▎ | 2260/10000 [30:48<1:28:35, 1.46it/s, loss=0.0200, lr=2.31e-05, step=2260] Training: 23%|██▎ | 2261/10000 [30:49<1:29:22, 1.44it/s, loss=0.0200, lr=2.31e-05, step=2260] Training: 23%|██▎ | 2261/10000 [30:49<1:29:22, 1.44it/s, loss=0.0128, lr=2.31e-05, step=2261] Training: 23%|██▎ | 2262/10000 [30:49<1:24:05, 1.53it/s, loss=0.0128, lr=2.31e-05, step=2261] Training: 23%|██▎ | 2262/10000 [30:49<1:24:05, 1.53it/s, loss=0.0351, lr=2.31e-05, step=2262] Training: 23%|██▎ | 2263/10000 [30:50<1:19:24, 1.62it/s, loss=0.0351, lr=2.31e-05, step=2262] Training: 23%|██▎ | 2263/10000 [30:50<1:19:24, 1.62it/s, loss=0.0619, lr=2.31e-05, step=2263] Training: 23%|██▎ | 2264/10000 [30:51<1:32:47, 1.39it/s, loss=0.0619, lr=2.31e-05, step=2263] Training: 23%|██▎ | 2264/10000 [30:51<1:32:47, 1.39it/s, loss=0.0176, lr=2.31e-05, step=2264] Training: 23%|██▎ | 2265/10000 [30:51<1:23:30, 1.54it/s, loss=0.0176, lr=2.31e-05, step=2264] Training: 23%|██▎ | 2265/10000 [30:51<1:23:30, 1.54it/s, loss=0.0187, lr=2.31e-05, step=2265] Training: 23%|██▎ | 2266/10000 [30:52<1:31:14, 1.41it/s, loss=0.0187, lr=2.31e-05, step=2265] Training: 23%|██▎ | 2266/10000 [30:52<1:31:14, 1.41it/s, loss=0.0203, lr=2.31e-05, step=2266] Training: 23%|██▎ | 2267/10000 [30:52<1:23:21, 1.55it/s, loss=0.0203, lr=2.31e-05, step=2266] Training: 23%|██▎ | 2267/10000 [30:52<1:23:21, 1.55it/s, loss=0.0174, lr=2.31e-05, step=2267] Training: 23%|██▎ | 2268/10000 [30:53<1:33:00, 1.39it/s, loss=0.0174, lr=2.31e-05, step=2267] Training: 23%|██▎ | 2268/10000 [30:53<1:33:00, 1.39it/s, loss=0.0481, lr=2.31e-05, step=2268] Training: 23%|██▎ | 2269/10000 [30:54<1:25:49, 1.50it/s, loss=0.0481, lr=2.31e-05, step=2268] Training: 23%|██▎ | 2269/10000 [30:54<1:25:49, 1.50it/s, loss=0.0326, lr=2.31e-05, step=2269]19:15:26.815 [I] step=2270 loss=0.0175 smoothed_loss=0.0272 lr=2.31e-05 grad_norm=0.5522 step_time=0.5569s data_time=0.1089s it/s=1.502 eta_to_10000=5145.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.1364 grad_arm_token_fuse=0.0511 grad_shared_expert=0.4607 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2270/10000 [30:54<1:24:36, 1.52it/s, loss=0.0326, lr=2.31e-05, step=2269] Training: 23%|██▎ | 2270/10000 [30:54<1:24:36, 1.52it/s, loss=0.0175, lr=2.31e-05, step=2270] Training: 23%|██▎ | 2271/10000 [30:56<1:42:15, 1.26it/s, loss=0.0175, lr=2.31e-05, step=2270] Training: 23%|██▎ | 2271/10000 [30:56<1:42:15, 1.26it/s, loss=0.0290, lr=2.31e-05, step=2271] Training: 23%|██▎ | 2272/10000 [30:56<1:34:37, 1.36it/s, loss=0.0290, lr=2.31e-05, step=2271] Training: 23%|██▎ | 2272/10000 [30:56<1:34:37, 1.36it/s, loss=0.0107, lr=2.31e-05, step=2272] Training: 23%|██▎ | 2273/10000 [30:57<1:26:51, 1.48it/s, loss=0.0107, lr=2.31e-05, step=2272] Training: 23%|██▎ | 2273/10000 [30:57<1:26:51, 1.48it/s, loss=0.0288, lr=2.31e-05, step=2273] Training: 23%|██▎ | 2274/10000 [30:58<1:36:19, 1.34it/s, loss=0.0288, lr=2.31e-05, step=2273] Training: 23%|██▎ | 2274/10000 [30:58<1:36:19, 1.34it/s, loss=0.0119, lr=2.31e-05, step=2274] Training: 23%|██▎ | 2275/10000 [30:58<1:34:40, 1.36it/s, loss=0.0119, lr=2.31e-05, step=2274] Training: 23%|██▎ | 2275/10000 [30:58<1:34:40, 1.36it/s, loss=0.0151, lr=2.31e-05, step=2275] Training: 23%|██▎ | 2276/10000 [30:59<1:33:15, 1.38it/s, loss=0.0151, lr=2.31e-05, step=2275] Training: 23%|██▎ | 2276/10000 [30:59<1:33:15, 1.38it/s, loss=0.0250, lr=2.31e-05, step=2276] Training: 23%|██▎ | 2277/10000 [31:00<1:26:29, 1.49it/s, loss=0.0250, lr=2.31e-05, step=2276] Training: 23%|██▎ | 2277/10000 [31:00<1:26:29, 1.49it/s, loss=0.0201, lr=2.31e-05, step=2277] Training: 23%|██▎ | 2278/10000 [31:00<1:31:45, 1.40it/s, loss=0.0201, lr=2.31e-05, step=2277] Training: 23%|██▎ | 2278/10000 [31:00<1:31:45, 1.40it/s, loss=0.0263, lr=2.31e-05, step=2278] Training: 23%|██▎ | 2279/10000 [31:01<1:23:22, 1.54it/s, loss=0.0263, lr=2.31e-05, step=2278] Training: 23%|██▎ | 2279/10000 [31:01<1:23:22, 1.54it/s, loss=0.0200, lr=2.31e-05, step=2279]19:15:33.882 [I] step=2280 loss=0.0543 smoothed_loss=0.0264 lr=2.31e-05 grad_norm=0.5947 step_time=0.5889s data_time=0.1178s it/s=1.415 eta_to_10000=5454.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0250 grad_action_out_proj_arms=0.2159 grad_arm_token_fuse=0.1293 grad_shared_expert=0.5894 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2280/10000 [31:02<1:23:11, 1.55it/s, loss=0.0200, lr=2.31e-05, step=2279] Training: 23%|██▎ | 2280/10000 [31:02<1:23:11, 1.55it/s, loss=0.0543, lr=2.31e-05, step=2280] Training: 23%|██▎ | 2281/10000 [31:02<1:17:36, 1.66it/s, loss=0.0543, lr=2.31e-05, step=2280] Training: 23%|██▎ | 2281/10000 [31:02<1:17:36, 1.66it/s, loss=0.0232, lr=2.31e-05, step=2281] Training: 23%|██▎ | 2282/10000 [31:03<1:20:23, 1.60it/s, loss=0.0232, lr=2.31e-05, step=2281] Training: 23%|██▎ | 2282/10000 [31:03<1:20:23, 1.60it/s, loss=0.0250, lr=2.31e-05, step=2282] Training: 23%|██▎ | 2283/10000 [31:03<1:24:13, 1.53it/s, loss=0.0250, lr=2.31e-05, step=2282] Training: 23%|██▎ | 2283/10000 [31:03<1:24:13, 1.53it/s, loss=0.0213, lr=2.31e-05, step=2283] Training: 23%|██▎ | 2284/10000 [31:04<1:19:18, 1.62it/s, loss=0.0213, lr=2.31e-05, step=2283] Training: 23%|██▎ | 2284/10000 [31:04<1:19:18, 1.62it/s, loss=0.0117, lr=2.31e-05, step=2284] Training: 23%|██▎ | 2285/10000 [31:04<1:15:10, 1.71it/s, loss=0.0117, lr=2.31e-05, step=2284] Training: 23%|██▎ | 2285/10000 [31:04<1:15:10, 1.71it/s, loss=0.0164, lr=2.31e-05, step=2285] Training: 23%|██▎ | 2286/10000 [31:05<1:22:33, 1.56it/s, loss=0.0164, lr=2.31e-05, step=2285] Training: 23%|██▎ | 2286/10000 [31:05<1:22:33, 1.56it/s, loss=0.0649, lr=2.31e-05, step=2286] Training: 23%|██▎ | 2287/10000 [31:06<1:18:00, 1.65it/s, loss=0.0649, lr=2.31e-05, step=2286] Training: 23%|██▎ | 2287/10000 [31:06<1:18:00, 1.65it/s, loss=0.0443, lr=2.31e-05, step=2287] Training: 23%|██▎ | 2288/10000 [31:06<1:15:06, 1.71it/s, loss=0.0443, lr=2.31e-05, step=2287] Training: 23%|██▎ | 2288/10000 [31:06<1:15:06, 1.71it/s, loss=0.0365, lr=2.31e-05, step=2288] Training: 23%|██▎ | 2289/10000 [31:07<1:26:28, 1.49it/s, loss=0.0365, lr=2.31e-05, step=2288] Training: 23%|██▎ | 2289/10000 [31:07<1:26:28, 1.49it/s, loss=0.0109, lr=2.31e-05, step=2289]19:15:40.076 [I] step=2290 loss=0.0280 smoothed_loss=0.0280 lr=2.31e-05 grad_norm=0.6412 step_time=0.5292s data_time=0.0903s it/s=1.615 eta_to_10000=4775.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0253 grad_action_out_proj_arms=0.2154 grad_arm_token_fuse=0.1336 grad_shared_expert=0.5023 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2290/10000 [31:08<1:21:26, 1.58it/s, loss=0.0109, lr=2.31e-05, step=2289] Training: 23%|██▎ | 2290/10000 [31:08<1:21:26, 1.58it/s, loss=0.0280, lr=2.31e-05, step=2290] Training: 23%|██▎ | 2291/10000 [31:08<1:16:05, 1.69it/s, loss=0.0280, lr=2.31e-05, step=2290] Training: 23%|██▎ | 2291/10000 [31:08<1:16:05, 1.69it/s, loss=0.0066, lr=2.31e-05, step=2291] Training: 23%|██▎ | 2292/10000 [31:09<1:21:41, 1.57it/s, loss=0.0066, lr=2.31e-05, step=2291] Training: 23%|██▎ | 2292/10000 [31:09<1:21:41, 1.57it/s, loss=0.0359, lr=2.31e-05, step=2292] Training: 23%|██▎ | 2293/10000 [31:10<1:36:46, 1.33it/s, loss=0.0359, lr=2.31e-05, step=2292] Training: 23%|██▎ | 2293/10000 [31:10<1:36:46, 1.33it/s, loss=0.0160, lr=2.31e-05, step=2293] Training: 23%|██▎ | 2294/10000 [31:11<1:37:38, 1.32it/s, loss=0.0160, lr=2.31e-05, step=2293] Training: 23%|██▎ | 2294/10000 [31:11<1:37:38, 1.32it/s, loss=0.0274, lr=2.31e-05, step=2294] Training: 23%|██▎ | 2295/10000 [31:11<1:29:30, 1.43it/s, loss=0.0274, lr=2.31e-05, step=2294] Training: 23%|██▎ | 2295/10000 [31:11<1:29:30, 1.43it/s, loss=0.0053, lr=2.31e-05, step=2295] Training: 23%|██▎ | 2296/10000 [31:12<1:28:12, 1.46it/s, loss=0.0053, lr=2.31e-05, step=2295] Training: 23%|██▎ | 2296/10000 [31:12<1:28:12, 1.46it/s, loss=0.0336, lr=2.31e-05, step=2296] Training: 23%|██▎ | 2297/10000 [31:13<1:22:20, 1.56it/s, loss=0.0336, lr=2.31e-05, step=2296] Training: 23%|██▎ | 2297/10000 [31:13<1:22:20, 1.56it/s, loss=0.0401, lr=2.31e-05, step=2297] Training: 23%|██▎ | 2298/10000 [31:13<1:19:06, 1.62it/s, loss=0.0401, lr=2.31e-05, step=2297] Training: 23%|██▎ | 2298/10000 [31:13<1:19:06, 1.62it/s, loss=0.0336, lr=2.31e-05, step=2298] Training: 23%|██▎ | 2299/10000 [31:14<1:14:57, 1.71it/s, loss=0.0336, lr=2.31e-05, step=2298] Training: 23%|██▎ | 2299/10000 [31:14<1:14:57, 1.71it/s, loss=0.0124, lr=2.31e-05, step=2299]19:15:46.732 [I] step=2300 loss=0.0320 smoothed_loss=0.0263 lr=2.31e-05 grad_norm=0.6155 step_time=0.5725s data_time=0.0931s it/s=1.503 eta_to_10000=5124.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0116 grad_action_out_proj_arms=0.1684 grad_arm_token_fuse=0.0544 grad_shared_expert=0.5484 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2300/10000 [31:14<1:23:27, 1.54it/s, loss=0.0124, lr=2.31e-05, step=2299] Training: 23%|██▎ | 2300/10000 [31:14<1:23:27, 1.54it/s, loss=0.0320, lr=2.31e-05, step=2300] Training: 23%|██▎ | 2301/10000 [31:15<1:17:18, 1.66it/s, loss=0.0320, lr=2.31e-05, step=2300] Training: 23%|██▎ | 2301/10000 [31:15<1:17:18, 1.66it/s, loss=0.0144, lr=2.31e-05, step=2301] Training: 23%|██▎ | 2302/10000 [31:15<1:13:16, 1.75it/s, loss=0.0144, lr=2.31e-05, step=2301] Training: 23%|██▎ | 2302/10000 [31:15<1:13:16, 1.75it/s, loss=0.0164, lr=2.31e-05, step=2302] Training: 23%|██▎ | 2303/10000 [31:16<1:16:42, 1.67it/s, loss=0.0164, lr=2.31e-05, step=2302] Training: 23%|██▎ | 2303/10000 [31:16<1:16:42, 1.67it/s, loss=0.0195, lr=2.31e-05, step=2303] Training: 23%|██▎ | 2304/10000 [31:17<1:14:37, 1.72it/s, loss=0.0195, lr=2.31e-05, step=2303] Training: 23%|██▎ | 2304/10000 [31:17<1:14:37, 1.72it/s, loss=0.0365, lr=2.31e-05, step=2304] Training: 23%|██▎ | 2305/10000 [31:17<1:13:32, 1.74it/s, loss=0.0365, lr=2.31e-05, step=2304] Training: 23%|██▎ | 2305/10000 [31:17<1:13:32, 1.74it/s, loss=0.1108, lr=2.31e-05, step=2305] Training: 23%|██▎ | 2306/10000 [31:18<1:12:40, 1.76it/s, loss=0.1108, lr=2.31e-05, step=2305] Training: 23%|██▎ | 2306/10000 [31:18<1:12:40, 1.76it/s, loss=0.0304, lr=2.31e-05, step=2306] Training: 23%|██▎ | 2307/10000 [31:18<1:21:22, 1.58it/s, loss=0.0304, lr=2.31e-05, step=2306] Training: 23%|██▎ | 2307/10000 [31:18<1:21:22, 1.58it/s, loss=0.0333, lr=2.31e-05, step=2307] Training: 23%|██▎ | 2308/10000 [31:19<1:18:22, 1.64it/s, loss=0.0333, lr=2.31e-05, step=2307] Training: 23%|██▎ | 2308/10000 [31:19<1:18:22, 1.64it/s, loss=0.0193, lr=2.31e-05, step=2308] Training: 23%|██▎ | 2309/10000 [31:20<1:15:54, 1.69it/s, loss=0.0193, lr=2.31e-05, step=2308] Training: 23%|██▎ | 2309/10000 [31:20<1:15:54, 1.69it/s, loss=0.0404, lr=2.30e-05, step=2309]19:15:52.466 [I] step=2310 loss=0.0801 smoothed_loss=0.0375 lr=2.31e-05 grad_norm=0.5936 step_time=0.5047s data_time=0.0687s it/s=1.744 eta_to_10000=4408.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0233 grad_action_out_proj_arms=0.2358 grad_arm_token_fuse=0.1348 grad_shared_expert=0.6627 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2310/10000 [31:20<1:13:46, 1.74it/s, loss=0.0404, lr=2.30e-05, step=2309] Training: 23%|██▎ | 2310/10000 [31:20<1:13:46, 1.74it/s, loss=0.0801, lr=2.30e-05, step=2310] Training: 23%|██▎ | 2311/10000 [31:21<1:16:07, 1.68it/s, loss=0.0801, lr=2.30e-05, step=2310] Training: 23%|██▎ | 2311/10000 [31:21<1:16:07, 1.68it/s, loss=0.0099, lr=2.30e-05, step=2311] Training: 23%|██▎ | 2312/10000 [31:21<1:11:51, 1.78it/s, loss=0.0099, lr=2.30e-05, step=2311] Training: 23%|██▎ | 2312/10000 [31:21<1:11:51, 1.78it/s, loss=0.0148, lr=2.30e-05, step=2312] Training: 23%|██▎ | 2313/10000 [31:22<1:09:48, 1.84it/s, loss=0.0148, lr=2.30e-05, step=2312] Training: 23%|██▎ | 2313/10000 [31:22<1:09:48, 1.84it/s, loss=0.0315, lr=2.30e-05, step=2313] Training: 23%|██▎ | 2314/10000 [31:23<1:20:42, 1.59it/s, loss=0.0315, lr=2.30e-05, step=2313] Training: 23%|██▎ | 2314/10000 [31:23<1:20:42, 1.59it/s, loss=0.0155, lr=2.30e-05, step=2314] Training: 23%|██▎ | 2315/10000 [31:23<1:15:15, 1.70it/s, loss=0.0155, lr=2.30e-05, step=2314] Training: 23%|██▎ | 2315/10000 [31:23<1:15:15, 1.70it/s, loss=0.0281, lr=2.30e-05, step=2315] Training: 23%|██▎ | 2316/10000 [31:24<1:11:54, 1.78it/s, loss=0.0281, lr=2.30e-05, step=2315] Training: 23%|██▎ | 2316/10000 [31:24<1:11:54, 1.78it/s, loss=0.0246, lr=2.30e-05, step=2316] Training: 23%|██▎ | 2317/10000 [31:24<1:10:17, 1.82it/s, loss=0.0246, lr=2.30e-05, step=2316] Training: 23%|██▎ | 2317/10000 [31:24<1:10:17, 1.82it/s, loss=0.0195, lr=2.30e-05, step=2317] Training: 23%|██▎ | 2318/10000 [31:25<1:08:19, 1.87it/s, loss=0.0195, lr=2.30e-05, step=2317] Training: 23%|██▎ | 2318/10000 [31:25<1:08:19, 1.87it/s, loss=0.0199, lr=2.30e-05, step=2318] Training: 23%|██▎ | 2319/10000 [31:25<1:12:43, 1.76it/s, loss=0.0199, lr=2.30e-05, step=2318] Training: 23%|██▎ | 2319/10000 [31:25<1:12:43, 1.76it/s, loss=0.0218, lr=2.30e-05, step=2319]19:15:58.105 [I] step=2320 loss=0.1007 smoothed_loss=0.0347 lr=2.30e-05 grad_norm=0.5551 step_time=0.4933s data_time=0.0705s it/s=1.774 eta_to_10000=4330.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0256 grad_action_out_proj_arms=0.1877 grad_arm_token_fuse=0.1308 grad_shared_expert=0.5658 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2320/10000 [31:26<1:11:08, 1.80it/s, loss=0.0218, lr=2.30e-05, step=2319] Training: 23%|██▎ | 2320/10000 [31:26<1:11:08, 1.80it/s, loss=0.1007, lr=2.30e-05, step=2320] Training: 23%|██▎ | 2321/10000 [31:26<1:17:24, 1.65it/s, loss=0.1007, lr=2.30e-05, step=2320] Training: 23%|██▎ | 2321/10000 [31:26<1:17:24, 1.65it/s, loss=0.0338, lr=2.30e-05, step=2321] Training: 23%|██▎ | 2322/10000 [31:27<1:14:10, 1.73it/s, loss=0.0338, lr=2.30e-05, step=2321] Training: 23%|██▎ | 2322/10000 [31:27<1:14:10, 1.73it/s, loss=0.0401, lr=2.30e-05, step=2322] Training: 23%|██▎ | 2323/10000 [31:28<1:12:31, 1.76it/s, loss=0.0401, lr=2.30e-05, step=2322] Training: 23%|██▎ | 2323/10000 [31:28<1:12:31, 1.76it/s, loss=0.0204, lr=2.30e-05, step=2323] Training: 23%|██▎ | 2324/10000 [31:28<1:10:17, 1.82it/s, loss=0.0204, lr=2.30e-05, step=2323] Training: 23%|██▎ | 2324/10000 [31:28<1:10:17, 1.82it/s, loss=0.0260, lr=2.30e-05, step=2324] Training: 23%|██▎ | 2325/10000 [31:29<1:15:19, 1.70it/s, loss=0.0260, lr=2.30e-05, step=2324] Training: 23%|██▎ | 2325/10000 [31:29<1:15:19, 1.70it/s, loss=0.0146, lr=2.30e-05, step=2325] Training: 23%|██▎ | 2326/10000 [31:29<1:18:06, 1.64it/s, loss=0.0146, lr=2.30e-05, step=2325] Training: 23%|██▎ | 2326/10000 [31:29<1:18:06, 1.64it/s, loss=0.0104, lr=2.30e-05, step=2326] Training: 23%|██▎ | 2327/10000 [31:30<1:15:32, 1.69it/s, loss=0.0104, lr=2.30e-05, step=2326] Training: 23%|██▎ | 2327/10000 [31:30<1:15:32, 1.69it/s, loss=0.0550, lr=2.30e-05, step=2327] Training: 23%|██▎ | 2328/10000 [31:31<1:21:35, 1.57it/s, loss=0.0550, lr=2.30e-05, step=2327] Training: 23%|██▎ | 2328/10000 [31:31<1:21:35, 1.57it/s, loss=0.0442, lr=2.30e-05, step=2328] Training: 23%|██▎ | 2329/10000 [31:31<1:17:09, 1.66it/s, loss=0.0442, lr=2.30e-05, step=2328] Training: 23%|██▎ | 2329/10000 [31:31<1:17:09, 1.66it/s, loss=0.0229, lr=2.30e-05, step=2329]19:16:04.107 [I] step=2330 loss=0.0246 smoothed_loss=0.0312 lr=2.30e-05 grad_norm=0.5141 step_time=0.5221s data_time=0.0782s it/s=1.666 eta_to_10000=4603.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0118 grad_action_out_proj_arms=0.1303 grad_arm_token_fuse=0.0595 grad_shared_expert=0.4893 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2330/10000 [31:32<1:15:26, 1.69it/s, loss=0.0229, lr=2.30e-05, step=2329] Training: 23%|██▎ | 2330/10000 [31:32<1:15:26, 1.69it/s, loss=0.0246, lr=2.30e-05, step=2330] Training: 23%|██▎ | 2331/10000 [31:32<1:11:56, 1.78it/s, loss=0.0246, lr=2.30e-05, step=2330] Training: 23%|██▎ | 2331/10000 [31:32<1:11:56, 1.78it/s, loss=0.0144, lr=2.30e-05, step=2331] Training: 23%|██▎ | 2332/10000 [31:33<1:10:52, 1.80it/s, loss=0.0144, lr=2.30e-05, step=2331] Training: 23%|██▎ | 2332/10000 [31:33<1:10:52, 1.80it/s, loss=0.0323, lr=2.30e-05, step=2332] Training: 23%|██▎ | 2333/10000 [31:33<1:14:31, 1.71it/s, loss=0.0323, lr=2.30e-05, step=2332] Training: 23%|██▎ | 2333/10000 [31:33<1:14:31, 1.71it/s, loss=0.0279, lr=2.30e-05, step=2333] Training: 23%|██▎ | 2334/10000 [31:34<1:11:30, 1.79it/s, loss=0.0279, lr=2.30e-05, step=2333] Training: 23%|██▎ | 2334/10000 [31:34<1:11:30, 1.79it/s, loss=0.0197, lr=2.30e-05, step=2334] Training: 23%|██▎ | 2335/10000 [31:34<1:08:51, 1.86it/s, loss=0.0197, lr=2.30e-05, step=2334] Training: 23%|██▎ | 2335/10000 [31:34<1:08:51, 1.86it/s, loss=0.0104, lr=2.30e-05, step=2335] Training: 23%|██▎ | 2336/10000 [31:35<1:17:02, 1.66it/s, loss=0.0104, lr=2.30e-05, step=2335] Training: 23%|██▎ | 2336/10000 [31:35<1:17:02, 1.66it/s, loss=0.0120, lr=2.30e-05, step=2336] Training: 23%|██▎ | 2337/10000 [31:36<1:13:29, 1.74it/s, loss=0.0120, lr=2.30e-05, step=2336] Training: 23%|██▎ | 2337/10000 [31:36<1:13:29, 1.74it/s, loss=0.0155, lr=2.30e-05, step=2337] Training: 23%|██▎ | 2338/10000 [31:36<1:12:34, 1.76it/s, loss=0.0155, lr=2.30e-05, step=2337] Training: 23%|██▎ | 2338/10000 [31:36<1:12:34, 1.76it/s, loss=0.0110, lr=2.30e-05, step=2338] Training: 23%|██▎ | 2339/10000 [31:37<1:23:38, 1.53it/s, loss=0.0110, lr=2.30e-05, step=2338] Training: 23%|██▎ | 2339/10000 [31:37<1:23:38, 1.53it/s, loss=0.0077, lr=2.30e-05, step=2339]19:16:10.056 [I] step=2340 loss=0.0185 smoothed_loss=0.0212 lr=2.30e-05 grad_norm=0.6848 step_time=0.5186s data_time=0.0763s it/s=1.681 eta_to_10000=4556.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1679 grad_arm_token_fuse=0.0736 grad_shared_expert=0.4759 (18633:train_pytorch.py:850) + Training: 23%|██▎ | 2340/10000 [31:38<1:21:24, 1.57it/s, loss=0.0077, lr=2.30e-05, step=2339] Training: 23%|██▎ | 2340/10000 [31:38<1:21:24, 1.57it/s, loss=0.0185, lr=2.30e-05, step=2340] Training: 23%|██▎ | 2341/10000 [31:38<1:23:31, 1.53it/s, loss=0.0185, lr=2.30e-05, step=2340] Training: 23%|██▎ | 2341/10000 [31:38<1:23:31, 1.53it/s, loss=0.0188, lr=2.30e-05, step=2341] Training: 23%|██▎ | 2342/10000 [31:39<1:16:58, 1.66it/s, loss=0.0188, lr=2.30e-05, step=2341] Training: 23%|██▎ | 2342/10000 [31:39<1:16:58, 1.66it/s, loss=0.0397, lr=2.30e-05, step=2342] Training: 23%|██▎ | 2343/10000 [31:40<1:24:20, 1.51it/s, loss=0.0397, lr=2.30e-05, step=2342] Training: 23%|██▎ | 2343/10000 [31:40<1:24:20, 1.51it/s, loss=0.0289, lr=2.30e-05, step=2343] Training: 23%|██▎ | 2344/10000 [31:40<1:18:05, 1.63it/s, loss=0.0289, lr=2.30e-05, step=2343] Training: 23%|██▎ | 2344/10000 [31:40<1:18:05, 1.63it/s, loss=0.0351, lr=2.30e-05, step=2344] Training: 23%|██▎ | 2345/10000 [31:41<1:13:52, 1.73it/s, loss=0.0351, lr=2.30e-05, step=2344] Training: 23%|██▎ | 2345/10000 [31:41<1:13:52, 1.73it/s, loss=0.0153, lr=2.30e-05, step=2345] Training: 23%|██▎ | 2346/10000 [31:41<1:10:39, 1.81it/s, loss=0.0153, lr=2.30e-05, step=2345] Training: 23%|██▎ | 2346/10000 [31:41<1:10:39, 1.81it/s, loss=0.0108, lr=2.30e-05, step=2346] Training: 23%|██▎ | 2347/10000 [31:42<1:08:09, 1.87it/s, loss=0.0108, lr=2.30e-05, step=2346] Training: 23%|██▎ | 2347/10000 [31:42<1:08:09, 1.87it/s, loss=0.0271, lr=2.30e-05, step=2347] Training: 23%|██▎ | 2348/10000 [31:43<1:23:55, 1.52it/s, loss=0.0271, lr=2.30e-05, step=2347] Training: 23%|██▎ | 2348/10000 [31:43<1:23:55, 1.52it/s, loss=0.0184, lr=2.30e-05, step=2348] Training: 23%|██▎ | 2349/10000 [31:43<1:17:10, 1.65it/s, loss=0.0184, lr=2.30e-05, step=2348] Training: 23%|██▎ | 2349/10000 [31:43<1:17:10, 1.65it/s, loss=0.0130, lr=2.30e-05, step=2349]19:16:16.269 [I] step=2350 loss=0.0324 smoothed_loss=0.0226 lr=2.30e-05 grad_norm=0.5726 step_time=0.5418s data_time=0.0795s it/s=1.610 eta_to_10000=4751.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0216 grad_action_out_proj_arms=0.2015 grad_arm_token_fuse=0.1126 grad_shared_expert=0.5222 (18633:train_pytorch.py:850) + Training: 24%|██▎ | 2350/10000 [31:44<1:25:38, 1.49it/s, loss=0.0130, lr=2.30e-05, step=2349] Training: 24%|██▎ | 2350/10000 [31:44<1:25:38, 1.49it/s, loss=0.0324, lr=2.30e-05, step=2350] Training: 24%|██▎ | 2351/10000 [31:44<1:19:50, 1.60it/s, loss=0.0324, lr=2.30e-05, step=2350] Training: 24%|██▎ | 2351/10000 [31:44<1:19:50, 1.60it/s, loss=0.0149, lr=2.30e-05, step=2351] Training: 24%|██▎ | 2352/10000 [31:45<1:24:40, 1.51it/s, loss=0.0149, lr=2.30e-05, step=2351] Training: 24%|██▎ | 2352/10000 [31:45<1:24:40, 1.51it/s, loss=0.0647, lr=2.30e-05, step=2352] Training: 24%|██▎ | 2353/10000 [31:46<1:18:38, 1.62it/s, loss=0.0647, lr=2.30e-05, step=2352] Training: 24%|██▎ | 2353/10000 [31:46<1:18:38, 1.62it/s, loss=0.0338, lr=2.30e-05, step=2353] Training: 24%|██▎ | 2354/10000 [31:46<1:20:11, 1.59it/s, loss=0.0338, lr=2.30e-05, step=2353] Training: 24%|██▎ | 2354/10000 [31:46<1:20:11, 1.59it/s, loss=0.0163, lr=2.30e-05, step=2354] Training: 24%|██▎ | 2355/10000 [31:47<1:19:08, 1.61it/s, loss=0.0163, lr=2.30e-05, step=2354] Training: 24%|██▎ | 2355/10000 [31:47<1:19:08, 1.61it/s, loss=0.0433, lr=2.30e-05, step=2355] Training: 24%|██▎ | 2356/10000 [31:48<1:20:10, 1.59it/s, loss=0.0433, lr=2.30e-05, step=2355] Training: 24%|██▎ | 2356/10000 [31:48<1:20:10, 1.59it/s, loss=0.0243, lr=2.29e-05, step=2356] Training: 24%|██▎ | 2357/10000 [31:48<1:27:43, 1.45it/s, loss=0.0243, lr=2.29e-05, step=2356] Training: 24%|██▎ | 2357/10000 [31:48<1:27:43, 1.45it/s, loss=0.0211, lr=2.29e-05, step=2357] Training: 24%|██▎ | 2358/10000 [31:49<1:21:51, 1.56it/s, loss=0.0211, lr=2.29e-05, step=2357] Training: 24%|██▎ | 2358/10000 [31:49<1:21:51, 1.56it/s, loss=0.0502, lr=2.29e-05, step=2358] Training: 24%|██▎ | 2359/10000 [31:50<1:17:10, 1.65it/s, loss=0.0502, lr=2.29e-05, step=2358] Training: 24%|██▎ | 2359/10000 [31:50<1:17:10, 1.65it/s, loss=0.0177, lr=2.29e-05, step=2359]19:16:22.370 [I] step=2360 loss=0.0137 smoothed_loss=0.0264 lr=2.29e-05 grad_norm=0.5920 step_time=0.5114s data_time=0.0987s it/s=1.639 eta_to_10000=4660.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1273 grad_arm_token_fuse=0.0657 grad_shared_expert=0.3158 (18633:train_pytorch.py:850) + Training: 24%|██▎ | 2360/10000 [31:50<1:14:15, 1.71it/s, loss=0.0177, lr=2.29e-05, step=2359] Training: 24%|██▎ | 2360/10000 [31:50<1:14:15, 1.71it/s, loss=0.0137, lr=2.29e-05, step=2360] Training: 24%|██▎ | 2361/10000 [31:51<1:11:41, 1.78it/s, loss=0.0137, lr=2.29e-05, step=2360] Training: 24%|██▎ | 2361/10000 [31:51<1:11:41, 1.78it/s, loss=0.0065, lr=2.29e-05, step=2361] Training: 24%|██▎ | 2362/10000 [31:51<1:09:01, 1.84it/s, loss=0.0065, lr=2.29e-05, step=2361] Training: 24%|██▎ | 2362/10000 [31:51<1:09:01, 1.84it/s, loss=0.0140, lr=2.29e-05, step=2362] Training: 24%|██▎ | 2363/10000 [31:52<1:12:43, 1.75it/s, loss=0.0140, lr=2.29e-05, step=2362] Training: 24%|██▎ | 2363/10000 [31:52<1:12:43, 1.75it/s, loss=0.0080, lr=2.29e-05, step=2363] Training: 24%|██▎ | 2364/10000 [31:52<1:19:05, 1.61it/s, loss=0.0080, lr=2.29e-05, step=2363] Training: 24%|██▎ | 2364/10000 [31:52<1:19:05, 1.61it/s, loss=0.0450, lr=2.29e-05, step=2364] Training: 24%|██▎ | 2365/10000 [31:53<1:15:01, 1.70it/s, loss=0.0450, lr=2.29e-05, step=2364] Training: 24%|██▎ | 2365/10000 [31:53<1:15:01, 1.70it/s, loss=0.0166, lr=2.29e-05, step=2365] Training: 24%|██▎ | 2366/10000 [31:53<1:12:12, 1.76it/s, loss=0.0166, lr=2.29e-05, step=2365] Training: 24%|██▎ | 2366/10000 [31:53<1:12:12, 1.76it/s, loss=0.0373, lr=2.29e-05, step=2366] Training: 24%|██▎ | 2367/10000 [31:54<1:09:05, 1.84it/s, loss=0.0373, lr=2.29e-05, step=2366] Training: 24%|██▎ | 2367/10000 [31:54<1:09:05, 1.84it/s, loss=0.0162, lr=2.29e-05, step=2367] Training: 24%|██▎ | 2368/10000 [31:55<1:14:36, 1.71it/s, loss=0.0162, lr=2.29e-05, step=2367] Training: 24%|██▎ | 2368/10000 [31:55<1:14:36, 1.71it/s, loss=0.0528, lr=2.29e-05, step=2368] Training: 24%|██▎ | 2369/10000 [31:55<1:11:16, 1.78it/s, loss=0.0528, lr=2.29e-05, step=2368] Training: 24%|██▎ | 2369/10000 [31:55<1:11:16, 1.78it/s, loss=0.0215, lr=2.29e-05, step=2369]19:16:28.266 [I] step=2370 loss=0.0200 smoothed_loss=0.0257 lr=2.29e-05 grad_norm=0.5386 step_time=0.4944s data_time=0.0952s it/s=1.696 eta_to_10000=4497.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0071 grad_action_out_proj_arms=0.1174 grad_arm_token_fuse=0.0379 grad_shared_expert=0.3762 (18633:train_pytorch.py:850) + Training: 24%|██▎ | 2370/10000 [31:56<1:20:31, 1.58it/s, loss=0.0215, lr=2.29e-05, step=2369] Training: 24%|██▎ | 2370/10000 [31:56<1:20:31, 1.58it/s, loss=0.0200, lr=2.29e-05, step=2370] Training: 24%|██▎ | 2371/10000 [31:57<1:23:48, 1.52it/s, loss=0.0200, lr=2.29e-05, step=2370] Training: 24%|██▎ | 2371/10000 [31:57<1:23:48, 1.52it/s, loss=0.0066, lr=2.29e-05, step=2371] Training: 24%|██▎ | 2372/10000 [31:57<1:17:21, 1.64it/s, loss=0.0066, lr=2.29e-05, step=2371] Training: 24%|██▎ | 2372/10000 [31:57<1:17:21, 1.64it/s, loss=0.0169, lr=2.29e-05, step=2372] Training: 24%|██▎ | 2373/10000 [31:58<1:13:46, 1.72it/s, loss=0.0169, lr=2.29e-05, step=2372] Training: 24%|██▎ | 2373/10000 [31:58<1:13:46, 1.72it/s, loss=0.0373, lr=2.29e-05, step=2373] Training: 24%|██▎ | 2374/10000 [31:58<1:10:21, 1.81it/s, loss=0.0373, lr=2.29e-05, step=2373] Training: 24%|██▎ | 2374/10000 [31:58<1:10:21, 1.81it/s, loss=0.0326, lr=2.29e-05, step=2374] Training: 24%|██▍ | 2375/10000 [31:59<1:07:38, 1.88it/s, loss=0.0326, lr=2.29e-05, step=2374] Training: 24%|██▍ | 2375/10000 [31:59<1:07:38, 1.88it/s, loss=0.0205, lr=2.29e-05, step=2375] Training: 24%|██▍ | 2376/10000 [31:59<1:05:56, 1.93it/s, loss=0.0205, lr=2.29e-05, step=2375] Training: 24%|██▍ | 2376/10000 [31:59<1:05:56, 1.93it/s, loss=0.0147, lr=2.29e-05, step=2376] Training: 24%|██▍ | 2377/10000 [32:00<1:04:56, 1.96it/s, loss=0.0147, lr=2.29e-05, step=2376] Training: 24%|██▍ | 2377/10000 [32:00<1:04:56, 1.96it/s, loss=0.0303, lr=2.29e-05, step=2377] Training: 24%|██▍ | 2378/10000 [32:00<1:12:56, 1.74it/s, loss=0.0303, lr=2.29e-05, step=2377] Training: 24%|██▍ | 2378/10000 [32:00<1:12:56, 1.74it/s, loss=0.0313, lr=2.29e-05, step=2378] Training: 24%|██▍ | 2379/10000 [32:01<1:09:56, 1.82it/s, loss=0.0313, lr=2.29e-05, step=2378] Training: 24%|██▍ | 2379/10000 [32:01<1:09:56, 1.82it/s, loss=0.0032, lr=2.29e-05, step=2379]19:16:33.699 [I] step=2380 loss=0.0445 smoothed_loss=0.0251 lr=2.29e-05 grad_norm=0.5655 step_time=0.4805s data_time=0.0628s it/s=1.841 eta_to_10000=4139.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0247 grad_action_out_proj_arms=0.2160 grad_arm_token_fuse=0.1330 grad_shared_expert=0.4830 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2380/10000 [32:01<1:09:24, 1.83it/s, loss=0.0032, lr=2.29e-05, step=2379] Training: 24%|██▍ | 2380/10000 [32:01<1:09:24, 1.83it/s, loss=0.0445, lr=2.29e-05, step=2380] Training: 24%|██▍ | 2381/10000 [32:02<1:07:19, 1.89it/s, loss=0.0445, lr=2.29e-05, step=2380] Training: 24%|██▍ | 2381/10000 [32:02<1:07:19, 1.89it/s, loss=0.0537, lr=2.29e-05, step=2381] Training: 24%|██▍ | 2382/10000 [32:02<1:05:58, 1.92it/s, loss=0.0537, lr=2.29e-05, step=2381] Training: 24%|██▍ | 2382/10000 [32:02<1:05:58, 1.92it/s, loss=0.0234, lr=2.29e-05, step=2382] Training: 24%|██▍ | 2383/10000 [32:03<1:04:57, 1.95it/s, loss=0.0234, lr=2.29e-05, step=2382] Training: 24%|██▍ | 2383/10000 [32:03<1:04:57, 1.95it/s, loss=0.0134, lr=2.29e-05, step=2383] Training: 24%|██▍ | 2384/10000 [32:03<1:04:11, 1.98it/s, loss=0.0134, lr=2.29e-05, step=2383] Training: 24%|██▍ | 2384/10000 [32:03<1:04:11, 1.98it/s, loss=0.0192, lr=2.29e-05, step=2384] Training: 24%|██▍ | 2385/10000 [32:04<1:09:29, 1.83it/s, loss=0.0192, lr=2.29e-05, step=2384] Training: 24%|██▍ | 2385/10000 [32:04<1:09:29, 1.83it/s, loss=0.0269, lr=2.29e-05, step=2385] Training: 24%|██▍ | 2386/10000 [32:05<1:17:20, 1.64it/s, loss=0.0269, lr=2.29e-05, step=2385] Training: 24%|██▍ | 2386/10000 [32:05<1:17:20, 1.64it/s, loss=0.0460, lr=2.29e-05, step=2386] Training: 24%|██▍ | 2387/10000 [32:05<1:13:58, 1.72it/s, loss=0.0460, lr=2.29e-05, step=2386] Training: 24%|██▍ | 2387/10000 [32:05<1:13:58, 1.72it/s, loss=0.0549, lr=2.29e-05, step=2387] Training: 24%|██▍ | 2388/10000 [32:06<1:10:46, 1.79it/s, loss=0.0549, lr=2.29e-05, step=2387] Training: 24%|██▍ | 2388/10000 [32:06<1:10:46, 1.79it/s, loss=0.0283, lr=2.29e-05, step=2388] Training: 24%|██▍ | 2389/10000 [32:06<1:16:56, 1.65it/s, loss=0.0283, lr=2.29e-05, step=2388] Training: 24%|██▍ | 2389/10000 [32:06<1:16:56, 1.65it/s, loss=0.0343, lr=2.29e-05, step=2389]19:16:39.359 [I] step=2390 loss=0.0074 smoothed_loss=0.0282 lr=2.29e-05 grad_norm=0.5667 step_time=0.4889s data_time=0.0771s it/s=1.767 eta_to_10000=4306.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0076 grad_action_out_proj_arms=0.1352 grad_arm_token_fuse=0.0392 grad_shared_expert=0.3903 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2390/10000 [32:07<1:14:42, 1.70it/s, loss=0.0343, lr=2.29e-05, step=2389] Training: 24%|██▍ | 2390/10000 [32:07<1:14:42, 1.70it/s, loss=0.0074, lr=2.29e-05, step=2390] Training: 24%|██▍ | 2391/10000 [32:08<1:10:50, 1.79it/s, loss=0.0074, lr=2.29e-05, step=2390] Training: 24%|██▍ | 2391/10000 [32:08<1:10:50, 1.79it/s, loss=0.0246, lr=2.29e-05, step=2391] Training: 24%|██▍ | 2392/10000 [32:08<1:14:39, 1.70it/s, loss=0.0246, lr=2.29e-05, step=2391] Training: 24%|██▍ | 2392/10000 [32:08<1:14:39, 1.70it/s, loss=0.0140, lr=2.29e-05, step=2392] Training: 24%|██▍ | 2393/10000 [32:09<1:20:01, 1.58it/s, loss=0.0140, lr=2.29e-05, step=2392] Training: 24%|██▍ | 2393/10000 [32:09<1:20:01, 1.58it/s, loss=0.0149, lr=2.29e-05, step=2393] Training: 24%|██▍ | 2394/10000 [32:09<1:15:07, 1.69it/s, loss=0.0149, lr=2.29e-05, step=2393] Training: 24%|██▍ | 2394/10000 [32:09<1:15:07, 1.69it/s, loss=0.0128, lr=2.29e-05, step=2394] Training: 24%|██▍ | 2395/10000 [32:10<1:12:04, 1.76it/s, loss=0.0128, lr=2.29e-05, step=2394] Training: 24%|██▍ | 2395/10000 [32:10<1:12:04, 1.76it/s, loss=0.0239, lr=2.29e-05, step=2395] Training: 24%|██▍ | 2396/10000 [32:10<1:09:21, 1.83it/s, loss=0.0239, lr=2.29e-05, step=2395] Training: 24%|██▍ | 2396/10000 [32:10<1:09:21, 1.83it/s, loss=0.0227, lr=2.29e-05, step=2396] Training: 24%|██▍ | 2397/10000 [32:11<1:08:27, 1.85it/s, loss=0.0227, lr=2.29e-05, step=2396] Training: 24%|██▍ | 2397/10000 [32:11<1:08:27, 1.85it/s, loss=0.0219, lr=2.29e-05, step=2397] Training: 24%|██▍ | 2398/10000 [32:11<1:06:14, 1.91it/s, loss=0.0219, lr=2.29e-05, step=2397] Training: 24%|██▍ | 2398/10000 [32:11<1:06:14, 1.91it/s, loss=0.0218, lr=2.29e-05, step=2398] Training: 24%|██▍ | 2399/10000 [32:12<1:11:03, 1.78it/s, loss=0.0218, lr=2.29e-05, step=2398] Training: 24%|██▍ | 2399/10000 [32:12<1:11:03, 1.78it/s, loss=0.0147, lr=2.29e-05, step=2399]19:16:45.394 [I] step=2400 loss=0.0151 smoothed_loss=0.0219 lr=2.29e-05 grad_norm=0.5844 step_time=0.5267s data_time=0.0768s it/s=1.657 eta_to_10000=4585.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1367 grad_arm_token_fuse=0.0676 grad_shared_expert=0.6095 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2400/10000 [32:13<1:27:19, 1.45it/s, loss=0.0147, lr=2.29e-05, step=2399] Training: 24%|██▍ | 2400/10000 [32:13<1:27:19, 1.45it/s, loss=0.0151, lr=2.29e-05, step=2400] Training: 24%|██▍ | 2401/10000 [32:14<1:19:34, 1.59it/s, loss=0.0151, lr=2.29e-05, step=2400] Training: 24%|██▍ | 2401/10000 [32:14<1:19:34, 1.59it/s, loss=0.0233, lr=2.29e-05, step=2401] Training: 24%|██▍ | 2402/10000 [32:14<1:14:12, 1.71it/s, loss=0.0233, lr=2.29e-05, step=2401] Training: 24%|██▍ | 2402/10000 [32:14<1:14:12, 1.71it/s, loss=0.0361, lr=2.28e-05, step=2402] Training: 24%|██▍ | 2403/10000 [32:15<1:13:25, 1.72it/s, loss=0.0361, lr=2.28e-05, step=2402] Training: 24%|██▍ | 2403/10000 [32:15<1:13:25, 1.72it/s, loss=0.0292, lr=2.28e-05, step=2403] Training: 24%|██▍ | 2404/10000 [32:15<1:10:21, 1.80it/s, loss=0.0292, lr=2.28e-05, step=2403] Training: 24%|██▍ | 2404/10000 [32:15<1:10:21, 1.80it/s, loss=0.0290, lr=2.28e-05, step=2404] Training: 24%|██▍ | 2405/10000 [32:16<1:13:23, 1.72it/s, loss=0.0290, lr=2.28e-05, step=2404] Training: 24%|██▍ | 2405/10000 [32:16<1:13:23, 1.72it/s, loss=0.0080, lr=2.28e-05, step=2405] Training: 24%|██▍ | 2406/10000 [32:16<1:19:48, 1.59it/s, loss=0.0080, lr=2.28e-05, step=2405] Training: 24%|██▍ | 2406/10000 [32:16<1:19:48, 1.59it/s, loss=0.0154, lr=2.28e-05, step=2406] Training: 24%|██▍ | 2407/10000 [32:17<1:25:55, 1.47it/s, loss=0.0154, lr=2.28e-05, step=2406] Training: 24%|██▍ | 2407/10000 [32:17<1:25:55, 1.47it/s, loss=0.0100, lr=2.28e-05, step=2407] Training: 24%|██▍ | 2408/10000 [32:18<1:23:14, 1.52it/s, loss=0.0100, lr=2.28e-05, step=2407] Training: 24%|██▍ | 2408/10000 [32:18<1:23:14, 1.52it/s, loss=0.0258, lr=2.28e-05, step=2408] Training: 24%|██▍ | 2409/10000 [32:18<1:20:31, 1.57it/s, loss=0.0258, lr=2.28e-05, step=2408] Training: 24%|██▍ | 2409/10000 [32:18<1:20:31, 1.57it/s, loss=0.0060, lr=2.28e-05, step=2409]19:16:51.381 [I] step=2410 loss=0.0245 smoothed_loss=0.0203 lr=2.28e-05 grad_norm=0.4979 step_time=0.5122s data_time=0.0865s it/s=1.671 eta_to_10000=4543.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0120 grad_action_out_proj_arms=0.1144 grad_arm_token_fuse=0.0559 grad_shared_expert=0.5339 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2410/10000 [32:19<1:18:16, 1.62it/s, loss=0.0060, lr=2.28e-05, step=2409] Training: 24%|██▍ | 2410/10000 [32:19<1:18:16, 1.62it/s, loss=0.0245, lr=2.28e-05, step=2410] Training: 24%|██▍ | 2411/10000 [32:20<1:23:08, 1.52it/s, loss=0.0245, lr=2.28e-05, step=2410] Training: 24%|██▍ | 2411/10000 [32:20<1:23:08, 1.52it/s, loss=0.1009, lr=2.28e-05, step=2411] Training: 24%|██▍ | 2412/10000 [32:20<1:21:13, 1.56it/s, loss=0.1009, lr=2.28e-05, step=2411] Training: 24%|██▍ | 2412/10000 [32:20<1:21:13, 1.56it/s, loss=0.0192, lr=2.28e-05, step=2412] Training: 24%|██▍ | 2413/10000 [32:21<1:26:53, 1.46it/s, loss=0.0192, lr=2.28e-05, step=2412] Training: 24%|██▍ | 2413/10000 [32:21<1:26:53, 1.46it/s, loss=0.0699, lr=2.28e-05, step=2413] Training: 24%|██▍ | 2414/10000 [32:22<1:31:14, 1.39it/s, loss=0.0699, lr=2.28e-05, step=2413] Training: 24%|██▍ | 2414/10000 [32:22<1:31:14, 1.39it/s, loss=0.0640, lr=2.28e-05, step=2414] Training: 24%|██▍ | 2415/10000 [32:23<1:28:19, 1.43it/s, loss=0.0640, lr=2.28e-05, step=2414] Training: 24%|██▍ | 2415/10000 [32:23<1:28:19, 1.43it/s, loss=0.0193, lr=2.28e-05, step=2415] Training: 24%|██▍ | 2416/10000 [32:23<1:25:16, 1.48it/s, loss=0.0193, lr=2.28e-05, step=2415] Training: 24%|██▍ | 2416/10000 [32:23<1:25:16, 1.48it/s, loss=0.0089, lr=2.28e-05, step=2416] Training: 24%|██▍ | 2417/10000 [32:24<1:22:51, 1.53it/s, loss=0.0089, lr=2.28e-05, step=2416] Training: 24%|██▍ | 2417/10000 [32:24<1:22:51, 1.53it/s, loss=0.0331, lr=2.28e-05, step=2417] Training: 24%|██▍ | 2418/10000 [32:24<1:20:18, 1.57it/s, loss=0.0331, lr=2.28e-05, step=2417] Training: 24%|██▍ | 2418/10000 [32:24<1:20:18, 1.57it/s, loss=0.1802, lr=2.28e-05, step=2418] Training: 24%|██▍ | 2419/10000 [32:25<1:18:10, 1.62it/s, loss=0.1802, lr=2.28e-05, step=2418] Training: 24%|██▍ | 2419/10000 [32:25<1:18:10, 1.62it/s, loss=0.0302, lr=2.28e-05, step=2419]19:16:58.073 [I] step=2420 loss=0.0189 smoothed_loss=0.0419 lr=2.28e-05 grad_norm=0.6382 step_time=0.5588s data_time=0.1104s it/s=1.495 eta_to_10000=5071.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0202 grad_action_out_proj_arms=0.2083 grad_arm_token_fuse=0.1181 grad_shared_expert=0.6415 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2420/10000 [32:26<1:21:14, 1.56it/s, loss=0.0302, lr=2.28e-05, step=2419] Training: 24%|██▍ | 2420/10000 [32:26<1:21:14, 1.56it/s, loss=0.0189, lr=2.28e-05, step=2420] Training: 24%|██▍ | 2421/10000 [32:27<1:28:11, 1.43it/s, loss=0.0189, lr=2.28e-05, step=2420] Training: 24%|██▍ | 2421/10000 [32:27<1:28:11, 1.43it/s, loss=0.0547, lr=2.28e-05, step=2421] Training: 24%|██▍ | 2422/10000 [32:27<1:20:27, 1.57it/s, loss=0.0547, lr=2.28e-05, step=2421] Training: 24%|██▍ | 2422/10000 [32:27<1:20:27, 1.57it/s, loss=0.0210, lr=2.28e-05, step=2422] Training: 24%|██▍ | 2423/10000 [32:28<1:15:02, 1.68it/s, loss=0.0210, lr=2.28e-05, step=2422] Training: 24%|██▍ | 2423/10000 [32:28<1:15:02, 1.68it/s, loss=0.0060, lr=2.28e-05, step=2423] Training: 24%|██▍ | 2424/10000 [32:28<1:21:18, 1.55it/s, loss=0.0060, lr=2.28e-05, step=2423] Training: 24%|██▍ | 2424/10000 [32:28<1:21:18, 1.55it/s, loss=0.0155, lr=2.28e-05, step=2424] Training: 24%|██▍ | 2425/10000 [32:29<1:22:06, 1.54it/s, loss=0.0155, lr=2.28e-05, step=2424] Training: 24%|██▍ | 2425/10000 [32:29<1:22:06, 1.54it/s, loss=0.0310, lr=2.28e-05, step=2425] Training: 24%|██▍ | 2426/10000 [32:29<1:16:39, 1.65it/s, loss=0.0310, lr=2.28e-05, step=2425] Training: 24%|██▍ | 2426/10000 [32:29<1:16:39, 1.65it/s, loss=0.0129, lr=2.28e-05, step=2426] Training: 24%|██▍ | 2427/10000 [32:30<1:29:49, 1.41it/s, loss=0.0129, lr=2.28e-05, step=2426] Training: 24%|██▍ | 2427/10000 [32:30<1:29:49, 1.41it/s, loss=0.0354, lr=2.28e-05, step=2427] Training: 24%|██▍ | 2428/10000 [32:31<1:34:28, 1.34it/s, loss=0.0354, lr=2.28e-05, step=2427] Training: 24%|██▍ | 2428/10000 [32:31<1:34:28, 1.34it/s, loss=0.0077, lr=2.28e-05, step=2428] Training: 24%|██▍ | 2429/10000 [32:32<1:31:54, 1.37it/s, loss=0.0077, lr=2.28e-05, step=2428] Training: 24%|██▍ | 2429/10000 [32:32<1:31:54, 1.37it/s, loss=0.0179, lr=2.28e-05, step=2429]19:17:04.973 [I] step=2430 loss=0.0581 smoothed_loss=0.0321 lr=2.28e-05 grad_norm=0.6099 step_time=0.5746s data_time=0.1154s it/s=1.449 eta_to_10000=5222.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0158 grad_action_out_proj_arms=0.2241 grad_arm_token_fuse=0.0869 grad_shared_expert=0.6354 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2430/10000 [32:33<1:30:07, 1.40it/s, loss=0.0179, lr=2.28e-05, step=2429] Training: 24%|██▍ | 2430/10000 [32:33<1:30:07, 1.40it/s, loss=0.0581, lr=2.28e-05, step=2430] Training: 24%|██▍ | 2431/10000 [32:33<1:28:17, 1.43it/s, loss=0.0581, lr=2.28e-05, step=2430] Training: 24%|██▍ | 2431/10000 [32:33<1:28:17, 1.43it/s, loss=0.0190, lr=2.28e-05, step=2431] Training: 24%|██▍ | 2432/10000 [32:34<1:29:48, 1.40it/s, loss=0.0190, lr=2.28e-05, step=2431] Training: 24%|██▍ | 2432/10000 [32:34<1:29:48, 1.40it/s, loss=0.0251, lr=2.28e-05, step=2432] Training: 24%|██▍ | 2433/10000 [32:35<1:21:38, 1.54it/s, loss=0.0251, lr=2.28e-05, step=2432] Training: 24%|██▍ | 2433/10000 [32:35<1:21:38, 1.54it/s, loss=0.0625, lr=2.28e-05, step=2433] Training: 24%|██▍ | 2434/10000 [32:35<1:15:34, 1.67it/s, loss=0.0625, lr=2.28e-05, step=2433] Training: 24%|██▍ | 2434/10000 [32:35<1:15:34, 1.67it/s, loss=0.0161, lr=2.28e-05, step=2434] Training: 24%|██▍ | 2435/10000 [32:36<1:18:05, 1.61it/s, loss=0.0161, lr=2.28e-05, step=2434] Training: 24%|██▍ | 2435/10000 [32:36<1:18:05, 1.61it/s, loss=0.0247, lr=2.28e-05, step=2435] Training: 24%|██▍ | 2436/10000 [32:36<1:23:41, 1.51it/s, loss=0.0247, lr=2.28e-05, step=2435] Training: 24%|██▍ | 2436/10000 [32:36<1:23:41, 1.51it/s, loss=0.0331, lr=2.28e-05, step=2436] Training: 24%|██▍ | 2437/10000 [32:37<1:17:35, 1.62it/s, loss=0.0331, lr=2.28e-05, step=2436] Training: 24%|██▍ | 2437/10000 [32:37<1:17:35, 1.62it/s, loss=0.0154, lr=2.28e-05, step=2437] Training: 24%|██▍ | 2438/10000 [32:37<1:12:53, 1.73it/s, loss=0.0154, lr=2.28e-05, step=2437] Training: 24%|██▍ | 2438/10000 [32:37<1:12:53, 1.73it/s, loss=0.0615, lr=2.28e-05, step=2438] Training: 24%|██▍ | 2439/10000 [32:38<1:09:22, 1.82it/s, loss=0.0615, lr=2.28e-05, step=2438] Training: 24%|██▍ | 2439/10000 [32:38<1:09:22, 1.82it/s, loss=0.0099, lr=2.28e-05, step=2439]19:17:10.810 [I] step=2440 loss=0.0095 smoothed_loss=0.0284 lr=2.28e-05 grad_norm=0.4721 step_time=0.5055s data_time=0.0782s it/s=1.714 eta_to_10000=4411.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0128 grad_action_out_proj_arms=0.1534 grad_arm_token_fuse=0.0697 grad_shared_expert=0.3680 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2440/10000 [32:38<1:08:41, 1.83it/s, loss=0.0099, lr=2.28e-05, step=2439] Training: 24%|██▍ | 2440/10000 [32:38<1:08:41, 1.83it/s, loss=0.0095, lr=2.28e-05, step=2440] Training: 24%|██▍ | 2441/10000 [32:39<1:13:27, 1.72it/s, loss=0.0095, lr=2.28e-05, step=2440] Training: 24%|██▍ | 2441/10000 [32:39<1:13:27, 1.72it/s, loss=0.0591, lr=2.28e-05, step=2441] Training: 24%|██▍ | 2442/10000 [32:40<1:18:33, 1.60it/s, loss=0.0591, lr=2.28e-05, step=2441] Training: 24%|██▍ | 2442/10000 [32:40<1:18:33, 1.60it/s, loss=0.0037, lr=2.28e-05, step=2442] Training: 24%|██▍ | 2443/10000 [32:41<1:25:40, 1.47it/s, loss=0.0037, lr=2.28e-05, step=2442] Training: 24%|██▍ | 2443/10000 [32:41<1:25:40, 1.47it/s, loss=0.0094, lr=2.28e-05, step=2443] Training: 24%|██▍ | 2444/10000 [32:41<1:19:07, 1.59it/s, loss=0.0094, lr=2.28e-05, step=2443] Training: 24%|██▍ | 2444/10000 [32:41<1:19:07, 1.59it/s, loss=0.0069, lr=2.28e-05, step=2444] Training: 24%|██▍ | 2445/10000 [32:42<1:15:43, 1.66it/s, loss=0.0069, lr=2.28e-05, step=2444] Training: 24%|██▍ | 2445/10000 [32:42<1:15:43, 1.66it/s, loss=0.0166, lr=2.28e-05, step=2445] Training: 24%|██▍ | 2446/10000 [32:42<1:11:36, 1.76it/s, loss=0.0166, lr=2.28e-05, step=2445] Training: 24%|██▍ | 2446/10000 [32:42<1:11:36, 1.76it/s, loss=0.0401, lr=2.28e-05, step=2446] Training: 24%|██▍ | 2447/10000 [32:43<1:09:06, 1.82it/s, loss=0.0401, lr=2.28e-05, step=2446] Training: 24%|██▍ | 2447/10000 [32:43<1:09:06, 1.82it/s, loss=0.0178, lr=2.27e-05, step=2447] Training: 24%|██▍ | 2448/10000 [32:43<1:13:32, 1.71it/s, loss=0.0178, lr=2.27e-05, step=2447] Training: 24%|██▍ | 2448/10000 [32:43<1:13:32, 1.71it/s, loss=0.0213, lr=2.27e-05, step=2448] Training: 24%|██▍ | 2449/10000 [32:44<1:19:49, 1.58it/s, loss=0.0213, lr=2.27e-05, step=2448] Training: 24%|██▍ | 2449/10000 [32:44<1:19:49, 1.58it/s, loss=0.0341, lr=2.27e-05, step=2449]19:17:17.655 [I] step=2450 loss=0.0219 smoothed_loss=0.0251 lr=2.28e-05 grad_norm=0.5551 step_time=0.5750s data_time=0.1096s it/s=1.461 eta_to_10000=5167.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0084 grad_action_out_proj_arms=0.1106 grad_arm_token_fuse=0.0464 grad_shared_expert=0.3042 (18633:train_pytorch.py:850) + Training: 24%|██▍ | 2450/10000 [32:45<1:40:34, 1.25it/s, loss=0.0341, lr=2.27e-05, step=2449] Training: 24%|██▍ | 2450/10000 [32:45<1:40:34, 1.25it/s, loss=0.0219, lr=2.27e-05, step=2450] Training: 25%|██▍ | 2451/10000 [32:46<1:31:17, 1.38it/s, loss=0.0219, lr=2.27e-05, step=2450] Training: 25%|██▍ | 2451/10000 [32:46<1:31:17, 1.38it/s, loss=0.0170, lr=2.27e-05, step=2451] Training: 25%|██▍ | 2452/10000 [32:46<1:22:43, 1.52it/s, loss=0.0170, lr=2.27e-05, step=2451] Training: 25%|██▍ | 2452/10000 [32:46<1:22:43, 1.52it/s, loss=0.0164, lr=2.27e-05, step=2452] Training: 25%|██▍ | 2453/10000 [32:47<1:17:04, 1.63it/s, loss=0.0164, lr=2.27e-05, step=2452] Training: 25%|██▍ | 2453/10000 [32:47<1:17:04, 1.63it/s, loss=0.0080, lr=2.27e-05, step=2453] Training: 25%|██▍ | 2454/10000 [32:48<1:18:09, 1.61it/s, loss=0.0080, lr=2.27e-05, step=2453] Training: 25%|██▍ | 2454/10000 [32:48<1:18:09, 1.61it/s, loss=0.0377, lr=2.27e-05, step=2454] Training: 25%|██▍ | 2455/10000 [32:48<1:24:48, 1.48it/s, loss=0.0377, lr=2.27e-05, step=2454] Training: 25%|██▍ | 2455/10000 [32:48<1:24:48, 1.48it/s, loss=0.0121, lr=2.27e-05, step=2455] Training: 25%|██▍ | 2456/10000 [32:49<1:18:13, 1.61it/s, loss=0.0121, lr=2.27e-05, step=2455] Training: 25%|██▍ | 2456/10000 [32:49<1:18:13, 1.61it/s, loss=0.0252, lr=2.27e-05, step=2456] Training: 25%|██▍ | 2457/10000 [32:50<1:25:08, 1.48it/s, loss=0.0252, lr=2.27e-05, step=2456] Training: 25%|██▍ | 2457/10000 [32:50<1:25:08, 1.48it/s, loss=0.0167, lr=2.27e-05, step=2457] Training: 25%|██▍ | 2458/10000 [32:50<1:17:44, 1.62it/s, loss=0.0167, lr=2.27e-05, step=2457] Training: 25%|██▍ | 2458/10000 [32:50<1:17:44, 1.62it/s, loss=0.0068, lr=2.27e-05, step=2458] Training: 25%|██▍ | 2459/10000 [32:51<1:20:09, 1.57it/s, loss=0.0068, lr=2.27e-05, step=2458] Training: 25%|██▍ | 2459/10000 [32:51<1:20:09, 1.57it/s, loss=0.0150, lr=2.27e-05, step=2459]19:17:23.860 [I] step=2460 loss=0.0318 smoothed_loss=0.0212 lr=2.27e-05 grad_norm=0.4913 step_time=0.5176s data_time=0.1029s it/s=1.612 eta_to_10000=4677.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0147 grad_action_out_proj_arms=0.1711 grad_arm_token_fuse=0.0738 grad_shared_expert=0.4441 (18633:train_pytorch.py:850) + Training: 25%|██▍ | 2460/10000 [32:52<1:23:46, 1.50it/s, loss=0.0150, lr=2.27e-05, step=2459] Training: 25%|██▍ | 2460/10000 [32:52<1:23:46, 1.50it/s, loss=0.0318, lr=2.27e-05, step=2460] Training: 25%|██▍ | 2461/10000 [32:52<1:16:50, 1.64it/s, loss=0.0318, lr=2.27e-05, step=2460] Training: 25%|██▍ | 2461/10000 [32:52<1:16:50, 1.64it/s, loss=0.0134, lr=2.27e-05, step=2461] Training: 25%|██▍ | 2462/10000 [32:53<1:17:52, 1.61it/s, loss=0.0134, lr=2.27e-05, step=2461] Training: 25%|██▍ | 2462/10000 [32:53<1:17:52, 1.61it/s, loss=0.0161, lr=2.27e-05, step=2462] Training: 25%|██▍ | 2463/10000 [32:53<1:12:43, 1.73it/s, loss=0.0161, lr=2.27e-05, step=2462] Training: 25%|██▍ | 2463/10000 [32:53<1:12:43, 1.73it/s, loss=0.0236, lr=2.27e-05, step=2463] Training: 25%|██▍ | 2464/10000 [32:54<1:17:26, 1.62it/s, loss=0.0236, lr=2.27e-05, step=2463] Training: 25%|██▍ | 2464/10000 [32:54<1:17:26, 1.62it/s, loss=0.0208, lr=2.27e-05, step=2464] Training: 25%|██▍ | 2465/10000 [32:54<1:12:34, 1.73it/s, loss=0.0208, lr=2.27e-05, step=2464] Training: 25%|██▍ | 2465/10000 [32:54<1:12:34, 1.73it/s, loss=0.0266, lr=2.27e-05, step=2465] Training: 25%|██▍ | 2466/10000 [32:55<1:18:29, 1.60it/s, loss=0.0266, lr=2.27e-05, step=2465] Training: 25%|██▍ | 2466/10000 [32:55<1:18:29, 1.60it/s, loss=0.0488, lr=2.27e-05, step=2466] Training: 25%|██▍ | 2467/10000 [32:56<1:26:50, 1.45it/s, loss=0.0488, lr=2.27e-05, step=2466] Training: 25%|██▍ | 2467/10000 [32:56<1:26:50, 1.45it/s, loss=0.0679, lr=2.27e-05, step=2467] Training: 25%|██▍ | 2468/10000 [32:57<1:24:39, 1.48it/s, loss=0.0679, lr=2.27e-05, step=2467] Training: 25%|██▍ | 2468/10000 [32:57<1:24:39, 1.48it/s, loss=0.0233, lr=2.27e-05, step=2468] Training: 25%|██▍ | 2469/10000 [32:58<1:40:32, 1.25it/s, loss=0.0233, lr=2.27e-05, step=2468] Training: 25%|██▍ | 2469/10000 [32:58<1:40:32, 1.25it/s, loss=0.0108, lr=2.27e-05, step=2469]19:17:31.022 [I] step=2470 loss=0.0362 smoothed_loss=0.0270 lr=2.27e-05 grad_norm=0.5364 step_time=0.5596s data_time=0.1566s it/s=1.396 eta_to_10000=5392.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0199 grad_action_out_proj_arms=0.2280 grad_arm_token_fuse=0.0987 grad_shared_expert=0.6443 (18633:train_pytorch.py:850) + Training: 25%|██▍ | 2470/10000 [32:59<1:49:58, 1.14it/s, loss=0.0108, lr=2.27e-05, step=2469] Training: 25%|██▍ | 2470/10000 [32:59<1:49:58, 1.14it/s, loss=0.0362, lr=2.27e-05, step=2470] Training: 25%|██▍ | 2471/10000 [32:59<1:44:05, 1.21it/s, loss=0.0362, lr=2.27e-05, step=2470] Training: 25%|██▍ | 2471/10000 [32:59<1:44:05, 1.21it/s, loss=0.0215, lr=2.27e-05, step=2471] Training: 25%|██▍ | 2472/10000 [33:00<1:35:29, 1.31it/s, loss=0.0215, lr=2.27e-05, step=2471] Training: 25%|██▍ | 2472/10000 [33:00<1:35:29, 1.31it/s, loss=0.0209, lr=2.27e-05, step=2472] Training: 25%|██▍ | 2473/10000 [33:01<1:34:04, 1.33it/s, loss=0.0209, lr=2.27e-05, step=2472] Training: 25%|██▍ | 2473/10000 [33:01<1:34:04, 1.33it/s, loss=0.0162, lr=2.27e-05, step=2473] Training: 25%|██▍ | 2474/10000 [33:01<1:24:18, 1.49it/s, loss=0.0162, lr=2.27e-05, step=2473] Training: 25%|██▍ | 2474/10000 [33:01<1:24:18, 1.49it/s, loss=0.0313, lr=2.27e-05, step=2474] Training: 25%|██▍ | 2475/10000 [33:02<1:18:12, 1.60it/s, loss=0.0313, lr=2.27e-05, step=2474] Training: 25%|██▍ | 2475/10000 [33:02<1:18:12, 1.60it/s, loss=0.0111, lr=2.27e-05, step=2475] Training: 25%|██▍ | 2476/10000 [33:02<1:19:24, 1.58it/s, loss=0.0111, lr=2.27e-05, step=2475] Training: 25%|██▍ | 2476/10000 [33:02<1:19:24, 1.58it/s, loss=0.0071, lr=2.27e-05, step=2476] Training: 25%|██▍ | 2477/10000 [33:03<1:14:47, 1.68it/s, loss=0.0071, lr=2.27e-05, step=2476] Training: 25%|██▍ | 2477/10000 [33:03<1:14:47, 1.68it/s, loss=0.0180, lr=2.27e-05, step=2477] Training: 25%|██▍ | 2478/10000 [33:04<1:20:05, 1.57it/s, loss=0.0180, lr=2.27e-05, step=2477] Training: 25%|██▍ | 2478/10000 [33:04<1:20:05, 1.57it/s, loss=0.0127, lr=2.27e-05, step=2478] Training: 25%|██▍ | 2479/10000 [33:04<1:14:50, 1.67it/s, loss=0.0127, lr=2.27e-05, step=2478] Training: 25%|██▍ | 2479/10000 [33:04<1:14:50, 1.67it/s, loss=0.0254, lr=2.27e-05, step=2479]19:17:37.197 [I] step=2480 loss=0.0256 smoothed_loss=0.0219 lr=2.27e-05 grad_norm=0.5620 step_time=0.5223s data_time=0.0952s it/s=1.620 eta_to_10000=4642.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1921 grad_arm_token_fuse=0.0746 grad_shared_expert=0.4260 (18633:train_pytorch.py:850) + Training: 25%|██▍ | 2480/10000 [33:05<1:19:37, 1.57it/s, loss=0.0254, lr=2.27e-05, step=2479] Training: 25%|██▍ | 2480/10000 [33:05<1:19:37, 1.57it/s, loss=0.0256, lr=2.27e-05, step=2480] Training: 25%|██▍ | 2481/10000 [33:05<1:14:11, 1.69it/s, loss=0.0256, lr=2.27e-05, step=2480] Training: 25%|██▍ | 2481/10000 [33:05<1:14:11, 1.69it/s, loss=0.0284, lr=2.27e-05, step=2481] Training: 25%|██▍ | 2482/10000 [33:06<1:10:14, 1.78it/s, loss=0.0284, lr=2.27e-05, step=2481] Training: 25%|██▍ | 2482/10000 [33:06<1:10:14, 1.78it/s, loss=0.0887, lr=2.27e-05, step=2482] Training: 25%|██▍ | 2483/10000 [33:06<1:13:15, 1.71it/s, loss=0.0887, lr=2.27e-05, step=2482] Training: 25%|██▍ | 2483/10000 [33:06<1:13:15, 1.71it/s, loss=0.0147, lr=2.27e-05, step=2483] Training: 25%|██▍ | 2484/10000 [33:07<1:11:15, 1.76it/s, loss=0.0147, lr=2.27e-05, step=2483] Training: 25%|██▍ | 2484/10000 [33:07<1:11:15, 1.76it/s, loss=0.0234, lr=2.27e-05, step=2484] Training: 25%|██▍ | 2485/10000 [33:08<1:11:21, 1.76it/s, loss=0.0234, lr=2.27e-05, step=2484] Training: 25%|██▍ | 2485/10000 [33:08<1:11:21, 1.76it/s, loss=0.0372, lr=2.27e-05, step=2485] Training: 25%|██▍ | 2486/10000 [33:09<1:24:39, 1.48it/s, loss=0.0372, lr=2.27e-05, step=2485] Training: 25%|██▍ | 2486/10000 [33:09<1:24:39, 1.48it/s, loss=0.0100, lr=2.27e-05, step=2486] Training: 25%|██▍ | 2487/10000 [33:09<1:27:05, 1.44it/s, loss=0.0100, lr=2.27e-05, step=2486] Training: 25%|██▍ | 2487/10000 [33:09<1:27:05, 1.44it/s, loss=0.0534, lr=2.27e-05, step=2487] Training: 25%|██▍ | 2488/10000 [33:10<1:21:07, 1.54it/s, loss=0.0534, lr=2.27e-05, step=2487] Training: 25%|██▍ | 2488/10000 [33:10<1:21:07, 1.54it/s, loss=0.0062, lr=2.27e-05, step=2488] Training: 25%|██▍ | 2489/10000 [33:11<1:24:29, 1.48it/s, loss=0.0062, lr=2.27e-05, step=2488] Training: 25%|██▍ | 2489/10000 [33:11<1:24:29, 1.48it/s, loss=0.0467, lr=2.27e-05, step=2489]19:17:43.401 [I] step=2490 loss=0.0235 smoothed_loss=0.0283 lr=2.27e-05 grad_norm=0.5414 step_time=0.5222s data_time=0.0982s it/s=1.612 eta_to_10000=4658.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0180 grad_action_out_proj_arms=0.2060 grad_arm_token_fuse=0.0932 grad_shared_expert=0.5434 (18633:train_pytorch.py:850) + Training: 25%|██▍ | 2490/10000 [33:11<1:19:26, 1.58it/s, loss=0.0467, lr=2.27e-05, step=2489] Training: 25%|██▍ | 2490/10000 [33:11<1:19:26, 1.58it/s, loss=0.0235, lr=2.27e-05, step=2490] Training: 25%|██▍ | 2491/10000 [33:12<1:21:00, 1.54it/s, loss=0.0235, lr=2.27e-05, step=2490] Training: 25%|██▍ | 2491/10000 [33:12<1:21:00, 1.54it/s, loss=0.0220, lr=2.27e-05, step=2491] Training: 25%|██▍ | 2492/10000 [33:12<1:15:58, 1.65it/s, loss=0.0220, lr=2.27e-05, step=2491] Training: 25%|██▍ | 2492/10000 [33:12<1:15:58, 1.65it/s, loss=0.0261, lr=2.26e-05, step=2492] Training: 25%|██▍ | 2493/10000 [33:13<1:24:58, 1.47it/s, loss=0.0261, lr=2.26e-05, step=2492] Training: 25%|██▍ | 2493/10000 [33:13<1:24:58, 1.47it/s, loss=0.0124, lr=2.26e-05, step=2493] Training: 25%|██▍ | 2494/10000 [33:14<1:17:58, 1.60it/s, loss=0.0124, lr=2.26e-05, step=2493] Training: 25%|██▍ | 2494/10000 [33:14<1:17:58, 1.60it/s, loss=0.0137, lr=2.26e-05, step=2494] Training: 25%|██▍ | 2495/10000 [33:14<1:21:49, 1.53it/s, loss=0.0137, lr=2.26e-05, step=2494] Training: 25%|██▍ | 2495/10000 [33:14<1:21:49, 1.53it/s, loss=0.0141, lr=2.26e-05, step=2495] Training: 25%|██▍ | 2496/10000 [33:15<1:26:47, 1.44it/s, loss=0.0141, lr=2.26e-05, step=2495] Training: 25%|██▍ | 2496/10000 [33:15<1:26:47, 1.44it/s, loss=0.0435, lr=2.26e-05, step=2496] Training: 25%|██▍ | 2497/10000 [33:16<1:21:17, 1.54it/s, loss=0.0435, lr=2.26e-05, step=2496] Training: 25%|██▍ | 2497/10000 [33:16<1:21:17, 1.54it/s, loss=0.0088, lr=2.26e-05, step=2497] Training: 25%|██▍ | 2498/10000 [33:16<1:15:20, 1.66it/s, loss=0.0088, lr=2.26e-05, step=2497] Training: 25%|██▍ | 2498/10000 [33:16<1:15:20, 1.66it/s, loss=0.0285, lr=2.26e-05, step=2498] Training: 25%|██▍ | 2499/10000 [33:17<1:23:24, 1.50it/s, loss=0.0285, lr=2.26e-05, step=2498] Training: 25%|██▍ | 2499/10000 [33:17<1:23:24, 1.50it/s, loss=0.0174, lr=2.26e-05, step=2499]19:17:50.121 [I] step=2500 loss=0.0684 smoothed_loss=0.0282 lr=2.26e-05 grad_norm=0.5831 step_time=0.5741s data_time=0.0979s it/s=1.488 eta_to_10000=5038.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1804 grad_arm_token_fuse=0.0842 grad_shared_expert=0.5649 (18633:train_pytorch.py:850) + Training: 25%|██▌ | 2500/10000 [33:18<1:29:04, 1.40it/s, loss=0.0174, lr=2.26e-05, step=2499] Training: 25%|██▌ | 2500/10000 [33:18<1:29:04, 1.40it/s, loss=0.0684, lr=2.26e-05, step=2500] Training: 25%|██▌ | 2501/10000 [33:18<1:22:33, 1.51it/s, loss=0.0684, lr=2.26e-05, step=2500] Training: 25%|██▌ | 2501/10000 [33:18<1:22:33, 1.51it/s, loss=0.0144, lr=2.26e-05, step=2501] Training: 25%|██▌ | 2502/10000 [33:19<1:22:03, 1.52it/s, loss=0.0144, lr=2.26e-05, step=2501] Training: 25%|██▌ | 2502/10000 [33:19<1:22:03, 1.52it/s, loss=0.0141, lr=2.26e-05, step=2502] Training: 25%|██▌ | 2503/10000 [33:20<1:28:37, 1.41it/s, loss=0.0141, lr=2.26e-05, step=2502] Training: 25%|██▌ | 2503/10000 [33:20<1:28:37, 1.41it/s, loss=0.0240, lr=2.26e-05, step=2503] Training: 25%|██▌ | 2504/10000 [33:20<1:24:21, 1.48it/s, loss=0.0240, lr=2.26e-05, step=2503] Training: 25%|██▌ | 2504/10000 [33:20<1:24:21, 1.48it/s, loss=0.0241, lr=2.26e-05, step=2504] Training: 25%|██▌ | 2505/10000 [33:21<1:18:24, 1.59it/s, loss=0.0241, lr=2.26e-05, step=2504] Training: 25%|██▌ | 2505/10000 [33:21<1:18:24, 1.59it/s, loss=0.0151, lr=2.26e-05, step=2505] Training: 25%|██▌ | 2506/10000 [33:22<1:27:17, 1.43it/s, loss=0.0151, lr=2.26e-05, step=2505] Training: 25%|██▌ | 2506/10000 [33:22<1:27:17, 1.43it/s, loss=0.0163, lr=2.26e-05, step=2506] Training: 25%|██▌ | 2507/10000 [33:23<1:36:57, 1.29it/s, loss=0.0163, lr=2.26e-05, step=2506] Training: 25%|██▌ | 2507/10000 [33:23<1:36:57, 1.29it/s, loss=0.0056, lr=2.26e-05, step=2507] Training: 25%|██▌ | 2508/10000 [33:23<1:32:25, 1.35it/s, loss=0.0056, lr=2.26e-05, step=2507] Training: 25%|██▌ | 2508/10000 [33:23<1:32:25, 1.35it/s, loss=0.0268, lr=2.26e-05, step=2508] Training: 25%|██▌ | 2509/10000 [33:24<1:26:42, 1.44it/s, loss=0.0268, lr=2.26e-05, step=2508] Training: 25%|██▌ | 2509/10000 [33:24<1:26:42, 1.44it/s, loss=0.0418, lr=2.26e-05, step=2509]19:17:56.944 [I] step=2510 loss=0.0091 smoothed_loss=0.0226 lr=2.26e-05 grad_norm=0.5188 step_time=0.5641s data_time=0.1182s it/s=1.466 eta_to_10000=5109.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0224 grad_action_out_proj_arms=0.1571 grad_arm_token_fuse=0.1174 grad_shared_expert=0.3305 (18633:train_pytorch.py:850) + Training: 25%|██▌ | 2510/10000 [33:25<1:24:08, 1.48it/s, loss=0.0418, lr=2.26e-05, step=2509] Training: 25%|██▌ | 2510/10000 [33:25<1:24:08, 1.48it/s, loss=0.0091, lr=2.26e-05, step=2510] Training: 25%|██▌ | 2511/10000 [33:25<1:17:48, 1.60it/s, loss=0.0091, lr=2.26e-05, step=2510] Training: 25%|██▌ | 2511/10000 [33:25<1:17:48, 1.60it/s, loss=0.0149, lr=2.26e-05, step=2511] Training: 25%|██▌ | 2512/10000 [33:26<1:13:37, 1.70it/s, loss=0.0149, lr=2.26e-05, step=2511] Training: 25%|██▌ | 2512/10000 [33:26<1:13:37, 1.70it/s, loss=0.0340, lr=2.26e-05, step=2512] Training: 25%|██▌ | 2513/10000 [33:26<1:17:25, 1.61it/s, loss=0.0340, lr=2.26e-05, step=2512] Training: 25%|██▌ | 2513/10000 [33:26<1:17:25, 1.61it/s, loss=0.0229, lr=2.26e-05, step=2513] Training: 25%|██▌ | 2514/10000 [33:27<1:26:37, 1.44it/s, loss=0.0229, lr=2.26e-05, step=2513] Training: 25%|██▌ | 2514/10000 [33:27<1:26:37, 1.44it/s, loss=0.0374, lr=2.26e-05, step=2514] Training: 25%|██▌ | 2515/10000 [33:28<1:19:35, 1.57it/s, loss=0.0374, lr=2.26e-05, step=2514] Training: 25%|██▌ | 2515/10000 [33:28<1:19:35, 1.57it/s, loss=0.0208, lr=2.26e-05, step=2515] Training: 25%|██▌ | 2516/10000 [33:28<1:14:19, 1.68it/s, loss=0.0208, lr=2.26e-05, step=2515] Training: 25%|██▌ | 2516/10000 [33:28<1:14:19, 1.68it/s, loss=0.0037, lr=2.26e-05, step=2516] Training: 25%|██▌ | 2517/10000 [33:29<1:10:56, 1.76it/s, loss=0.0037, lr=2.26e-05, step=2516] Training: 25%|██▌ | 2517/10000 [33:29<1:10:56, 1.76it/s, loss=0.0294, lr=2.26e-05, step=2517] Training: 25%|██▌ | 2518/10000 [33:29<1:09:15, 1.80it/s, loss=0.0294, lr=2.26e-05, step=2517] Training: 25%|██▌ | 2518/10000 [33:29<1:09:15, 1.80it/s, loss=0.0871, lr=2.26e-05, step=2518] Training: 25%|██▌ | 2519/10000 [33:30<1:08:13, 1.83it/s, loss=0.0871, lr=2.26e-05, step=2518] Training: 25%|██▌ | 2519/10000 [33:30<1:08:13, 1.83it/s, loss=0.0102, lr=2.26e-05, step=2519]19:18:02.608 [I] step=2520 loss=0.0104 smoothed_loss=0.0256 lr=2.26e-05 grad_norm=0.5732 step_time=0.5008s data_time=0.0656s it/s=1.766 eta_to_10000=4236.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0300 grad_action_out_proj_arms=0.2717 grad_arm_token_fuse=0.1593 grad_shared_expert=0.5126 (18633:train_pytorch.py:850) + Training: 25%|██▌ | 2520/10000 [33:30<1:07:27, 1.85it/s, loss=0.0102, lr=2.26e-05, step=2519] Training: 25%|██▌ | 2520/10000 [33:30<1:07:27, 1.85it/s, loss=0.0104, lr=2.26e-05, step=2520] Training: 25%|██▌ | 2521/10000 [33:31<1:14:26, 1.67it/s, loss=0.0104, lr=2.26e-05, step=2520] Training: 25%|██▌ | 2521/10000 [33:31<1:14:26, 1.67it/s, loss=0.0128, lr=2.26e-05, step=2521] Training: 25%|██▌ | 2522/10000 [33:32<1:14:19, 1.68it/s, loss=0.0128, lr=2.26e-05, step=2521] Training: 25%|██▌ | 2522/10000 [33:32<1:14:19, 1.68it/s, loss=0.0120, lr=2.26e-05, step=2522] Training: 25%|██▌ | 2523/10000 [33:32<1:13:57, 1.69it/s, loss=0.0120, lr=2.26e-05, step=2522] Training: 25%|██▌ | 2523/10000 [33:32<1:13:57, 1.69it/s, loss=0.0212, lr=2.26e-05, step=2523] Training: 25%|██▌ | 2524/10000 [33:33<1:11:14, 1.75it/s, loss=0.0212, lr=2.26e-05, step=2523] Training: 25%|██▌ | 2524/10000 [33:33<1:11:14, 1.75it/s, loss=0.0586, lr=2.26e-05, step=2524] Training: 25%|██▌ | 2525/10000 [33:33<1:13:38, 1.69it/s, loss=0.0586, lr=2.26e-05, step=2524] Training: 25%|██▌ | 2525/10000 [33:33<1:13:38, 1.69it/s, loss=0.0085, lr=2.26e-05, step=2525] Training: 25%|██▌ | 2526/10000 [33:34<1:13:17, 1.70it/s, loss=0.0085, lr=2.26e-05, step=2525] Training: 25%|██▌ | 2526/10000 [33:34<1:13:17, 1.70it/s, loss=0.0377, lr=2.26e-05, step=2526] Training: 25%|██▌ | 2527/10000 [33:34<1:10:35, 1.76it/s, loss=0.0377, lr=2.26e-05, step=2526] Training: 25%|██▌ | 2527/10000 [33:34<1:10:35, 1.76it/s, loss=0.0345, lr=2.26e-05, step=2527] Training: 25%|██▌ | 2528/10000 [33:35<1:17:51, 1.60it/s, loss=0.0345, lr=2.26e-05, step=2527] Training: 25%|██▌ | 2528/10000 [33:35<1:17:51, 1.60it/s, loss=0.0106, lr=2.26e-05, step=2528] Training: 25%|██▌ | 2529/10000 [33:36<1:17:13, 1.61it/s, loss=0.0106, lr=2.26e-05, step=2528] Training: 25%|██▌ | 2529/10000 [33:36<1:17:13, 1.61it/s, loss=0.0385, lr=2.26e-05, step=2529]19:18:08.687 [I] step=2530 loss=0.0218 smoothed_loss=0.0261 lr=2.26e-05 grad_norm=0.5306 step_time=0.5206s data_time=0.0872s it/s=1.645 eta_to_10000=4539.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0173 grad_action_out_proj_arms=0.1509 grad_arm_token_fuse=0.0851 grad_shared_expert=0.4745 (18633:train_pytorch.py:850) + Training: 25%|██▌ | 2530/10000 [33:36<1:14:21, 1.67it/s, loss=0.0385, lr=2.26e-05, step=2529] Training: 25%|██▌ | 2530/10000 [33:36<1:14:21, 1.67it/s, loss=0.0218, lr=2.26e-05, step=2530] Training: 25%|██▌ | 2531/10000 [33:37<1:11:55, 1.73it/s, loss=0.0218, lr=2.26e-05, step=2530] Training: 25%|██▌ | 2531/10000 [33:37<1:11:55, 1.73it/s, loss=0.0079, lr=2.26e-05, step=2531] Training: 25%|██▌ | 2532/10000 [33:38<1:14:19, 1.67it/s, loss=0.0079, lr=2.26e-05, step=2531] Training: 25%|██▌ | 2532/10000 [33:38<1:14:19, 1.67it/s, loss=0.0116, lr=2.26e-05, step=2532] Training: 25%|██▌ | 2533/10000 [33:38<1:10:43, 1.76it/s, loss=0.0116, lr=2.26e-05, step=2532] Training: 25%|██▌ | 2533/10000 [33:38<1:10:43, 1.76it/s, loss=0.0108, lr=2.26e-05, step=2533] Training: 25%|██▌ | 2534/10000 [33:39<1:11:32, 1.74it/s, loss=0.0108, lr=2.26e-05, step=2533] Training: 25%|██▌ | 2534/10000 [33:39<1:11:32, 1.74it/s, loss=0.0202, lr=2.26e-05, step=2534] Training: 25%|██▌ | 2535/10000 [33:39<1:15:15, 1.65it/s, loss=0.0202, lr=2.26e-05, step=2534] Training: 25%|██▌ | 2535/10000 [33:39<1:15:15, 1.65it/s, loss=0.0125, lr=2.25e-05, step=2535] Training: 25%|██▌ | 2536/10000 [33:40<1:24:15, 1.48it/s, loss=0.0125, lr=2.25e-05, step=2535] Training: 25%|██▌ | 2536/10000 [33:40<1:24:15, 1.48it/s, loss=0.0228, lr=2.25e-05, step=2536] Training: 25%|██▌ | 2537/10000 [33:41<1:17:02, 1.61it/s, loss=0.0228, lr=2.25e-05, step=2536] Training: 25%|██▌ | 2537/10000 [33:41<1:17:02, 1.61it/s, loss=0.0433, lr=2.25e-05, step=2537] Training: 25%|██▌ | 2538/10000 [33:41<1:12:22, 1.72it/s, loss=0.0433, lr=2.25e-05, step=2537] Training: 25%|██▌ | 2538/10000 [33:41<1:12:22, 1.72it/s, loss=0.0147, lr=2.25e-05, step=2538] Training: 25%|██▌ | 2539/10000 [33:42<1:10:52, 1.75it/s, loss=0.0147, lr=2.25e-05, step=2538] Training: 25%|██▌ | 2539/10000 [33:42<1:10:52, 1.75it/s, loss=0.0156, lr=2.25e-05, step=2539]19:18:14.530 [I] step=2540 loss=0.0418 smoothed_loss=0.0237 lr=2.25e-05 grad_norm=0.5316 step_time=0.5014s data_time=0.0829s it/s=1.712 eta_to_10000=4357.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1445 grad_arm_token_fuse=0.0672 grad_shared_expert=0.6081 (18633:train_pytorch.py:850) + Training: 25%|██▌ | 2540/10000 [33:42<1:09:33, 1.79it/s, loss=0.0156, lr=2.25e-05, step=2539] Training: 25%|██▌ | 2540/10000 [33:42<1:09:33, 1.79it/s, loss=0.0418, lr=2.25e-05, step=2540] Training: 25%|██▌ | 2541/10000 [33:43<1:07:01, 1.85it/s, loss=0.0418, lr=2.25e-05, step=2540] Training: 25%|██▌ | 2541/10000 [33:43<1:07:01, 1.85it/s, loss=0.0138, lr=2.25e-05, step=2541] Training: 25%|██▌ | 2542/10000 [33:43<1:13:20, 1.69it/s, loss=0.0138, lr=2.25e-05, step=2541] Training: 25%|██▌ | 2542/10000 [33:43<1:13:20, 1.69it/s, loss=0.0195, lr=2.25e-05, step=2542] Training: 25%|██▌ | 2543/10000 [33:44<1:20:02, 1.55it/s, loss=0.0195, lr=2.25e-05, step=2542] Training: 25%|██▌ | 2543/10000 [33:44<1:20:02, 1.55it/s, loss=0.0082, lr=2.25e-05, step=2543] Training: 25%|██▌ | 2544/10000 [33:45<1:16:29, 1.62it/s, loss=0.0082, lr=2.25e-05, step=2543] Training: 25%|██▌ | 2544/10000 [33:45<1:16:29, 1.62it/s, loss=0.0079, lr=2.25e-05, step=2544] Training: 25%|██▌ | 2545/10000 [33:45<1:11:39, 1.73it/s, loss=0.0079, lr=2.25e-05, step=2544] Training: 25%|██▌ | 2545/10000 [33:45<1:11:39, 1.73it/s, loss=0.0250, lr=2.25e-05, step=2545] Training: 25%|██▌ | 2546/10000 [33:46<1:11:11, 1.75it/s, loss=0.0250, lr=2.25e-05, step=2545] Training: 25%|██▌ | 2546/10000 [33:46<1:11:11, 1.75it/s, loss=0.0283, lr=2.25e-05, step=2546] Training: 25%|██▌ | 2547/10000 [33:46<1:08:37, 1.81it/s, loss=0.0283, lr=2.25e-05, step=2546] Training: 25%|██▌ | 2547/10000 [33:46<1:08:37, 1.81it/s, loss=0.0128, lr=2.25e-05, step=2547] Training: 25%|██▌ | 2548/10000 [33:47<1:06:53, 1.86it/s, loss=0.0128, lr=2.25e-05, step=2547] Training: 25%|██▌ | 2548/10000 [33:47<1:06:53, 1.86it/s, loss=0.0301, lr=2.25e-05, step=2548] Training: 25%|██▌ | 2549/10000 [33:47<1:05:32, 1.89it/s, loss=0.0301, lr=2.25e-05, step=2548] Training: 25%|██▌ | 2549/10000 [33:47<1:05:32, 1.89it/s, loss=0.0148, lr=2.25e-05, step=2549]19:18:20.379 [I] step=2550 loss=0.0617 smoothed_loss=0.0246 lr=2.25e-05 grad_norm=0.5513 step_time=0.5096s data_time=0.0753s it/s=1.710 eta_to_10000=4356.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.1585 grad_arm_token_fuse=0.0617 grad_shared_expert=0.5779 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2550/10000 [33:48<1:14:22, 1.67it/s, loss=0.0148, lr=2.25e-05, step=2549] Training: 26%|██▌ | 2550/10000 [33:48<1:14:22, 1.67it/s, loss=0.0617, lr=2.25e-05, step=2550] Training: 26%|██▌ | 2551/10000 [33:49<1:16:50, 1.62it/s, loss=0.0617, lr=2.25e-05, step=2550] Training: 26%|██▌ | 2551/10000 [33:49<1:16:50, 1.62it/s, loss=0.0159, lr=2.25e-05, step=2551] Training: 26%|██▌ | 2552/10000 [33:49<1:12:23, 1.71it/s, loss=0.0159, lr=2.25e-05, step=2551] Training: 26%|██▌ | 2552/10000 [33:49<1:12:23, 1.71it/s, loss=0.0190, lr=2.25e-05, step=2552] Training: 26%|██▌ | 2553/10000 [33:50<1:10:00, 1.77it/s, loss=0.0190, lr=2.25e-05, step=2552] Training: 26%|██▌ | 2553/10000 [33:50<1:10:00, 1.77it/s, loss=0.0342, lr=2.25e-05, step=2553] Training: 26%|██▌ | 2554/10000 [33:50<1:08:08, 1.82it/s, loss=0.0342, lr=2.25e-05, step=2553] Training: 26%|██▌ | 2554/10000 [33:50<1:08:08, 1.82it/s, loss=0.0284, lr=2.25e-05, step=2554] Training: 26%|██▌ | 2555/10000 [33:51<1:08:42, 1.81it/s, loss=0.0284, lr=2.25e-05, step=2554] Training: 26%|██▌ | 2555/10000 [33:51<1:08:42, 1.81it/s, loss=0.0224, lr=2.25e-05, step=2555] Training: 26%|██▌ | 2556/10000 [33:51<1:06:30, 1.87it/s, loss=0.0224, lr=2.25e-05, step=2555] Training: 26%|██▌ | 2556/10000 [33:51<1:06:30, 1.87it/s, loss=0.0080, lr=2.25e-05, step=2556] Training: 26%|██▌ | 2557/10000 [33:52<1:16:18, 1.63it/s, loss=0.0080, lr=2.25e-05, step=2556] Training: 26%|██▌ | 2557/10000 [33:52<1:16:18, 1.63it/s, loss=0.0200, lr=2.25e-05, step=2557] Training: 26%|██▌ | 2558/10000 [33:53<1:17:27, 1.60it/s, loss=0.0200, lr=2.25e-05, step=2557] Training: 26%|██▌ | 2558/10000 [33:53<1:17:27, 1.60it/s, loss=0.0440, lr=2.25e-05, step=2558] Training: 26%|██▌ | 2559/10000 [33:53<1:11:57, 1.72it/s, loss=0.0440, lr=2.25e-05, step=2558] Training: 26%|██▌ | 2559/10000 [33:53<1:11:57, 1.72it/s, loss=0.0110, lr=2.25e-05, step=2559]19:18:26.129 [I] step=2560 loss=0.0302 smoothed_loss=0.0241 lr=2.25e-05 grad_norm=0.5529 step_time=0.5048s data_time=0.0702s it/s=1.739 eta_to_10000=4278.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.1159 grad_arm_token_fuse=0.0467 grad_shared_expert=0.5075 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2560/10000 [33:54<1:11:33, 1.73it/s, loss=0.0110, lr=2.25e-05, step=2559] Training: 26%|██▌ | 2560/10000 [33:54<1:11:33, 1.73it/s, loss=0.0302, lr=2.25e-05, step=2560] Training: 26%|██▌ | 2561/10000 [33:54<1:09:06, 1.79it/s, loss=0.0302, lr=2.25e-05, step=2560] Training: 26%|██▌ | 2561/10000 [33:54<1:09:06, 1.79it/s, loss=0.0452, lr=2.25e-05, step=2561] Training: 26%|██▌ | 2562/10000 [33:55<1:06:39, 1.86it/s, loss=0.0452, lr=2.25e-05, step=2561] Training: 26%|██▌ | 2562/10000 [33:55<1:06:39, 1.86it/s, loss=0.0269, lr=2.25e-05, step=2562] Training: 26%|██▌ | 2563/10000 [33:55<1:05:08, 1.90it/s, loss=0.0269, lr=2.25e-05, step=2562] Training: 26%|██▌ | 2563/10000 [33:55<1:05:08, 1.90it/s, loss=0.0479, lr=2.25e-05, step=2563] Training: 26%|██▌ | 2564/10000 [33:56<1:14:04, 1.67it/s, loss=0.0479, lr=2.25e-05, step=2563] Training: 26%|██▌ | 2564/10000 [33:56<1:14:04, 1.67it/s, loss=0.0148, lr=2.25e-05, step=2564] Training: 26%|██▌ | 2565/10000 [33:57<1:20:33, 1.54it/s, loss=0.0148, lr=2.25e-05, step=2564] Training: 26%|██▌ | 2565/10000 [33:57<1:20:33, 1.54it/s, loss=0.0174, lr=2.25e-05, step=2565] Training: 26%|██▌ | 2566/10000 [33:57<1:14:20, 1.67it/s, loss=0.0174, lr=2.25e-05, step=2565] Training: 26%|██▌ | 2566/10000 [33:57<1:14:20, 1.67it/s, loss=0.0230, lr=2.25e-05, step=2566] Training: 26%|██▌ | 2567/10000 [33:58<1:12:17, 1.71it/s, loss=0.0230, lr=2.25e-05, step=2566] Training: 26%|██▌ | 2567/10000 [33:58<1:12:17, 1.71it/s, loss=0.0244, lr=2.25e-05, step=2567] Training: 26%|██▌ | 2568/10000 [33:58<1:09:47, 1.77it/s, loss=0.0244, lr=2.25e-05, step=2567] Training: 26%|██▌ | 2568/10000 [33:58<1:09:47, 1.77it/s, loss=0.0078, lr=2.25e-05, step=2568] Training: 26%|██▌ | 2569/10000 [33:59<1:06:48, 1.85it/s, loss=0.0078, lr=2.25e-05, step=2568] Training: 26%|██▌ | 2569/10000 [33:59<1:06:48, 1.85it/s, loss=0.0548, lr=2.25e-05, step=2569]19:18:31.713 [I] step=2570 loss=0.0495 smoothed_loss=0.0292 lr=2.25e-05 grad_norm=0.5717 step_time=0.4877s data_time=0.0707s it/s=1.791 eta_to_10000=4148.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0122 grad_action_out_proj_arms=0.1832 grad_arm_token_fuse=0.0664 grad_shared_expert=0.4524 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2570/10000 [33:59<1:05:57, 1.88it/s, loss=0.0548, lr=2.25e-05, step=2569] Training: 26%|██▌ | 2570/10000 [33:59<1:05:57, 1.88it/s, loss=0.0495, lr=2.25e-05, step=2570] Training: 26%|██▌ | 2571/10000 [34:00<1:13:52, 1.68it/s, loss=0.0495, lr=2.25e-05, step=2570] Training: 26%|██▌ | 2571/10000 [34:00<1:13:52, 1.68it/s, loss=0.0095, lr=2.25e-05, step=2571] Training: 26%|██▌ | 2572/10000 [34:01<1:26:48, 1.43it/s, loss=0.0095, lr=2.25e-05, step=2571] Training: 26%|██▌ | 2572/10000 [34:01<1:26:48, 1.43it/s, loss=0.0305, lr=2.25e-05, step=2572] Training: 26%|██▌ | 2573/10000 [34:02<1:19:25, 1.56it/s, loss=0.0305, lr=2.25e-05, step=2572] Training: 26%|██▌ | 2573/10000 [34:02<1:19:25, 1.56it/s, loss=0.0120, lr=2.25e-05, step=2573] Training: 26%|██▌ | 2574/10000 [34:02<1:13:55, 1.67it/s, loss=0.0120, lr=2.25e-05, step=2573] Training: 26%|██▌ | 2574/10000 [34:02<1:13:55, 1.67it/s, loss=0.0119, lr=2.25e-05, step=2574] Training: 26%|██▌ | 2575/10000 [34:03<1:10:05, 1.77it/s, loss=0.0119, lr=2.25e-05, step=2574] Training: 26%|██▌ | 2575/10000 [34:03<1:10:05, 1.77it/s, loss=0.0133, lr=2.25e-05, step=2575] Training: 26%|██▌ | 2576/10000 [34:03<1:07:16, 1.84it/s, loss=0.0133, lr=2.25e-05, step=2575] Training: 26%|██▌ | 2576/10000 [34:03<1:07:16, 1.84it/s, loss=0.0196, lr=2.25e-05, step=2576] Training: 26%|██▌ | 2577/10000 [34:04<1:04:59, 1.90it/s, loss=0.0196, lr=2.25e-05, step=2576] Training: 26%|██▌ | 2577/10000 [34:04<1:04:59, 1.90it/s, loss=0.0082, lr=2.25e-05, step=2577] Training: 26%|██▌ | 2578/10000 [34:04<1:13:22, 1.69it/s, loss=0.0082, lr=2.25e-05, step=2577] Training: 26%|██▌ | 2578/10000 [34:04<1:13:22, 1.69it/s, loss=0.0437, lr=2.24e-05, step=2578] Training: 26%|██▌ | 2579/10000 [34:05<1:15:58, 1.63it/s, loss=0.0437, lr=2.24e-05, step=2578] Training: 26%|██▌ | 2579/10000 [34:05<1:15:58, 1.63it/s, loss=0.0240, lr=2.24e-05, step=2579]19:18:37.822 [I] step=2580 loss=0.0321 smoothed_loss=0.0246 lr=2.25e-05 grad_norm=0.5234 step_time=0.5319s data_time=0.0790s it/s=1.637 eta_to_10000=4532.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0145 grad_action_out_proj_arms=0.1533 grad_arm_token_fuse=0.0721 grad_shared_expert=0.5820 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2580/10000 [34:05<1:13:09, 1.69it/s, loss=0.0240, lr=2.24e-05, step=2579] Training: 26%|██▌ | 2580/10000 [34:05<1:13:09, 1.69it/s, loss=0.0321, lr=2.24e-05, step=2580] Training: 26%|██▌ | 2581/10000 [34:06<1:12:41, 1.70it/s, loss=0.0321, lr=2.24e-05, step=2580] Training: 26%|██▌ | 2581/10000 [34:06<1:12:41, 1.70it/s, loss=0.0145, lr=2.24e-05, step=2581] Training: 26%|██▌ | 2582/10000 [34:07<1:17:45, 1.59it/s, loss=0.0145, lr=2.24e-05, step=2581] Training: 26%|██▌ | 2582/10000 [34:07<1:17:45, 1.59it/s, loss=0.0184, lr=2.24e-05, step=2582] Training: 26%|██▌ | 2583/10000 [34:07<1:18:45, 1.57it/s, loss=0.0184, lr=2.24e-05, step=2582] Training: 26%|██▌ | 2583/10000 [34:07<1:18:45, 1.57it/s, loss=0.0586, lr=2.24e-05, step=2583] Training: 26%|██▌ | 2584/10000 [34:08<1:15:34, 1.64it/s, loss=0.0586, lr=2.24e-05, step=2583] Training: 26%|██▌ | 2584/10000 [34:08<1:15:34, 1.64it/s, loss=0.0419, lr=2.24e-05, step=2584] Training: 26%|██▌ | 2585/10000 [34:09<1:18:05, 1.58it/s, loss=0.0419, lr=2.24e-05, step=2584] Training: 26%|██▌ | 2585/10000 [34:09<1:18:05, 1.58it/s, loss=0.0121, lr=2.24e-05, step=2585] Training: 26%|██▌ | 2586/10000 [34:10<1:34:39, 1.31it/s, loss=0.0121, lr=2.24e-05, step=2585] Training: 26%|██▌ | 2586/10000 [34:10<1:34:39, 1.31it/s, loss=0.0104, lr=2.24e-05, step=2586] Training: 26%|██▌ | 2587/10000 [34:10<1:24:40, 1.46it/s, loss=0.0104, lr=2.24e-05, step=2586] Training: 26%|██▌ | 2587/10000 [34:10<1:24:40, 1.46it/s, loss=0.0338, lr=2.24e-05, step=2587] Training: 26%|██▌ | 2588/10000 [34:11<1:21:37, 1.51it/s, loss=0.0338, lr=2.24e-05, step=2587] Training: 26%|██▌ | 2588/10000 [34:11<1:21:37, 1.51it/s, loss=0.0171, lr=2.24e-05, step=2588] Training: 26%|██▌ | 2589/10000 [34:11<1:15:44, 1.63it/s, loss=0.0171, lr=2.24e-05, step=2588] Training: 26%|██▌ | 2589/10000 [34:11<1:15:44, 1.63it/s, loss=0.0120, lr=2.24e-05, step=2589]19:18:44.226 [I] step=2590 loss=0.1227 smoothed_loss=0.0336 lr=2.24e-05 grad_norm=0.6507 step_time=0.5320s data_time=0.1083s it/s=1.562 eta_to_10000=4744.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0221 grad_action_out_proj_arms=0.2433 grad_arm_token_fuse=0.1118 grad_shared_expert=1.4099 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2590/10000 [34:12<1:12:40, 1.70it/s, loss=0.0120, lr=2.24e-05, step=2589] Training: 26%|██▌ | 2590/10000 [34:12<1:12:40, 1.70it/s, loss=0.1227, lr=2.24e-05, step=2590] Training: 26%|██▌ | 2591/10000 [34:12<1:09:04, 1.79it/s, loss=0.1227, lr=2.24e-05, step=2590] Training: 26%|██▌ | 2591/10000 [34:12<1:09:04, 1.79it/s, loss=0.0071, lr=2.24e-05, step=2591] Training: 26%|██▌ | 2592/10000 [34:13<1:06:47, 1.85it/s, loss=0.0071, lr=2.24e-05, step=2591] Training: 26%|██▌ | 2592/10000 [34:13<1:06:47, 1.85it/s, loss=0.0551, lr=2.24e-05, step=2592] Training: 26%|██▌ | 2593/10000 [34:14<1:17:12, 1.60it/s, loss=0.0551, lr=2.24e-05, step=2592] Training: 26%|██▌ | 2593/10000 [34:14<1:17:12, 1.60it/s, loss=0.0229, lr=2.24e-05, step=2593] Training: 26%|██▌ | 2594/10000 [34:14<1:12:07, 1.71it/s, loss=0.0229, lr=2.24e-05, step=2593] Training: 26%|██▌ | 2594/10000 [34:14<1:12:07, 1.71it/s, loss=0.0293, lr=2.24e-05, step=2594] Training: 26%|██▌ | 2595/10000 [34:15<1:09:09, 1.78it/s, loss=0.0293, lr=2.24e-05, step=2594] Training: 26%|██▌ | 2595/10000 [34:15<1:09:09, 1.78it/s, loss=0.0545, lr=2.24e-05, step=2595] Training: 26%|██▌ | 2596/10000 [34:15<1:06:43, 1.85it/s, loss=0.0545, lr=2.24e-05, step=2595] Training: 26%|██▌ | 2596/10000 [34:15<1:06:43, 1.85it/s, loss=0.0110, lr=2.24e-05, step=2596] Training: 26%|██▌ | 2597/10000 [34:16<1:05:08, 1.89it/s, loss=0.0110, lr=2.24e-05, step=2596] Training: 26%|██▌ | 2597/10000 [34:16<1:05:08, 1.89it/s, loss=0.0455, lr=2.24e-05, step=2597] Training: 26%|██▌ | 2598/10000 [34:16<1:03:30, 1.94it/s, loss=0.0455, lr=2.24e-05, step=2597] Training: 26%|██▌ | 2598/10000 [34:16<1:03:30, 1.94it/s, loss=0.0305, lr=2.24e-05, step=2598] Training: 26%|██▌ | 2599/10000 [34:17<1:02:01, 1.99it/s, loss=0.0305, lr=2.24e-05, step=2598] Training: 26%|██▌ | 2599/10000 [34:17<1:02:01, 1.99it/s, loss=0.0140, lr=2.24e-05, step=2599]19:18:49.763 [I] step=2600 loss=0.0155 smoothed_loss=0.0295 lr=2.24e-05 grad_norm=0.6342 step_time=0.4879s data_time=0.0658s it/s=1.806 eta_to_10000=4096.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0216 grad_action_out_proj_arms=0.1688 grad_arm_token_fuse=0.1134 grad_shared_expert=0.4951 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2600/10000 [34:17<1:12:16, 1.71it/s, loss=0.0140, lr=2.24e-05, step=2599] Training: 26%|██▌ | 2600/10000 [34:17<1:12:16, 1.71it/s, loss=0.0155, lr=2.24e-05, step=2600] Training: 26%|██▌ | 2601/10000 [34:18<1:08:39, 1.80it/s, loss=0.0155, lr=2.24e-05, step=2600] Training: 26%|██▌ | 2601/10000 [34:18<1:08:39, 1.80it/s, loss=0.0874, lr=2.24e-05, step=2601] Training: 26%|██▌ | 2602/10000 [34:19<1:11:32, 1.72it/s, loss=0.0874, lr=2.24e-05, step=2601] Training: 26%|██▌ | 2602/10000 [34:19<1:11:32, 1.72it/s, loss=0.0110, lr=2.24e-05, step=2602] Training: 26%|██▌ | 2603/10000 [34:19<1:08:22, 1.80it/s, loss=0.0110, lr=2.24e-05, step=2602] Training: 26%|██▌ | 2603/10000 [34:19<1:08:22, 1.80it/s, loss=0.0213, lr=2.24e-05, step=2603] Training: 26%|██▌ | 2604/10000 [34:20<1:08:11, 1.81it/s, loss=0.0213, lr=2.24e-05, step=2603] Training: 26%|██▌ | 2604/10000 [34:20<1:08:11, 1.81it/s, loss=0.0125, lr=2.24e-05, step=2604] Training: 26%|██▌ | 2605/10000 [34:20<1:06:53, 1.84it/s, loss=0.0125, lr=2.24e-05, step=2604] Training: 26%|██▌ | 2605/10000 [34:20<1:06:53, 1.84it/s, loss=0.0144, lr=2.24e-05, step=2605] Training: 26%|██▌ | 2606/10000 [34:21<1:07:26, 1.83it/s, loss=0.0144, lr=2.24e-05, step=2605] Training: 26%|██▌ | 2606/10000 [34:21<1:07:26, 1.83it/s, loss=0.0046, lr=2.24e-05, step=2606] Training: 26%|██▌ | 2607/10000 [34:22<1:18:14, 1.57it/s, loss=0.0046, lr=2.24e-05, step=2606] Training: 26%|██▌ | 2607/10000 [34:22<1:18:14, 1.57it/s, loss=0.0634, lr=2.24e-05, step=2607] Training: 26%|██▌ | 2608/10000 [34:22<1:12:46, 1.69it/s, loss=0.0634, lr=2.24e-05, step=2607] Training: 26%|██▌ | 2608/10000 [34:22<1:12:46, 1.69it/s, loss=0.0217, lr=2.24e-05, step=2608] Training: 26%|██▌ | 2609/10000 [34:22<1:08:53, 1.79it/s, loss=0.0217, lr=2.24e-05, step=2608] Training: 26%|██▌ | 2609/10000 [34:22<1:08:53, 1.79it/s, loss=0.0145, lr=2.24e-05, step=2609]19:18:55.343 [I] step=2610 loss=0.0367 smoothed_loss=0.0283 lr=2.24e-05 grad_norm=0.6245 step_time=0.4812s data_time=0.0768s it/s=1.792 eta_to_10000=4122.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0125 grad_action_out_proj_arms=0.1629 grad_arm_token_fuse=0.0595 grad_shared_expert=0.4429 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2610/10000 [34:23<1:07:30, 1.82it/s, loss=0.0145, lr=2.24e-05, step=2609] Training: 26%|██▌ | 2610/10000 [34:23<1:07:30, 1.82it/s, loss=0.0367, lr=2.24e-05, step=2610] Training: 26%|██▌ | 2611/10000 [34:24<1:05:23, 1.88it/s, loss=0.0367, lr=2.24e-05, step=2610] Training: 26%|██▌ | 2611/10000 [34:24<1:05:23, 1.88it/s, loss=0.0142, lr=2.24e-05, step=2611] Training: 26%|██▌ | 2612/10000 [34:24<1:04:35, 1.91it/s, loss=0.0142, lr=2.24e-05, step=2611] Training: 26%|██▌ | 2612/10000 [34:24<1:04:35, 1.91it/s, loss=0.0331, lr=2.24e-05, step=2612] Training: 26%|██▌ | 2613/10000 [34:25<1:03:21, 1.94it/s, loss=0.0331, lr=2.24e-05, step=2612] Training: 26%|██▌ | 2613/10000 [34:25<1:03:21, 1.94it/s, loss=0.0232, lr=2.24e-05, step=2613] Training: 26%|██▌ | 2614/10000 [34:25<1:10:48, 1.74it/s, loss=0.0232, lr=2.24e-05, step=2613] Training: 26%|██▌ | 2614/10000 [34:25<1:10:48, 1.74it/s, loss=0.0153, lr=2.24e-05, step=2614] Training: 26%|██▌ | 2615/10000 [34:26<1:17:37, 1.59it/s, loss=0.0153, lr=2.24e-05, step=2614] Training: 26%|██▌ | 2615/10000 [34:26<1:17:37, 1.59it/s, loss=0.0290, lr=2.24e-05, step=2615] Training: 26%|██▌ | 2616/10000 [34:26<1:12:01, 1.71it/s, loss=0.0290, lr=2.24e-05, step=2615] Training: 26%|██▌ | 2616/10000 [34:26<1:12:01, 1.71it/s, loss=0.0441, lr=2.24e-05, step=2616] Training: 26%|██▌ | 2617/10000 [34:27<1:09:08, 1.78it/s, loss=0.0441, lr=2.24e-05, step=2616] Training: 26%|██▌ | 2617/10000 [34:27<1:09:08, 1.78it/s, loss=0.0193, lr=2.24e-05, step=2617] Training: 26%|██▌ | 2618/10000 [34:27<1:06:31, 1.85it/s, loss=0.0193, lr=2.24e-05, step=2617] Training: 26%|██▌ | 2618/10000 [34:27<1:06:31, 1.85it/s, loss=0.0191, lr=2.24e-05, step=2618] Training: 26%|██▌ | 2619/10000 [34:28<1:05:07, 1.89it/s, loss=0.0191, lr=2.24e-05, step=2618] Training: 26%|██▌ | 2619/10000 [34:28<1:05:07, 1.89it/s, loss=0.0111, lr=2.24e-05, step=2619]19:19:00.933 [I] step=2620 loss=0.0086 smoothed_loss=0.0232 lr=2.24e-05 grad_norm=0.5396 step_time=0.4897s data_time=0.0693s it/s=1.789 eta_to_10000=4124.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0128 grad_action_out_proj_arms=0.1221 grad_arm_token_fuse=0.0681 grad_shared_expert=0.5209 (18633:train_pytorch.py:850) + Training: 26%|██▌ | 2620/10000 [34:29<1:09:12, 1.78it/s, loss=0.0111, lr=2.24e-05, step=2619] Training: 26%|██▌ | 2620/10000 [34:29<1:09:12, 1.78it/s, loss=0.0086, lr=2.23e-05, step=2620] Training: 26%|██▌ | 2621/10000 [34:29<1:15:53, 1.62it/s, loss=0.0086, lr=2.23e-05, step=2620] Training: 26%|██▌ | 2621/10000 [34:29<1:15:53, 1.62it/s, loss=0.0231, lr=2.23e-05, step=2621] Training: 26%|██▌ | 2622/10000 [34:30<1:12:16, 1.70it/s, loss=0.0231, lr=2.23e-05, step=2621] Training: 26%|██▌ | 2622/10000 [34:30<1:12:16, 1.70it/s, loss=0.0236, lr=2.23e-05, step=2622] Training: 26%|██▌ | 2623/10000 [34:31<1:16:58, 1.60it/s, loss=0.0236, lr=2.23e-05, step=2622] Training: 26%|██▌ | 2623/10000 [34:31<1:16:58, 1.60it/s, loss=0.0184, lr=2.23e-05, step=2623] Training: 26%|██▌ | 2624/10000 [34:31<1:13:08, 1.68it/s, loss=0.0184, lr=2.23e-05, step=2623] Training: 26%|██▌ | 2624/10000 [34:31<1:13:08, 1.68it/s, loss=0.0107, lr=2.23e-05, step=2624] Training: 26%|██▋ | 2625/10000 [34:32<1:10:16, 1.75it/s, loss=0.0107, lr=2.23e-05, step=2624] Training: 26%|██▋ | 2625/10000 [34:32<1:10:16, 1.75it/s, loss=0.0079, lr=2.23e-05, step=2625] Training: 26%|██▋ | 2626/10000 [34:32<1:08:13, 1.80it/s, loss=0.0079, lr=2.23e-05, step=2625] Training: 26%|██▋ | 2626/10000 [34:32<1:08:13, 1.80it/s, loss=0.0234, lr=2.23e-05, step=2626] Training: 26%|██▋ | 2627/10000 [34:33<1:07:18, 1.83it/s, loss=0.0234, lr=2.23e-05, step=2626] Training: 26%|██▋ | 2627/10000 [34:33<1:07:18, 1.83it/s, loss=0.0126, lr=2.23e-05, step=2627] Training: 26%|██▋ | 2628/10000 [34:34<1:22:11, 1.49it/s, loss=0.0126, lr=2.23e-05, step=2627] Training: 26%|██▋ | 2628/10000 [34:34<1:22:11, 1.49it/s, loss=0.0075, lr=2.23e-05, step=2628] Training: 26%|██▋ | 2629/10000 [34:34<1:19:59, 1.54it/s, loss=0.0075, lr=2.23e-05, step=2628] Training: 26%|██▋ | 2629/10000 [34:34<1:19:59, 1.54it/s, loss=0.0170, lr=2.23e-05, step=2629]19:19:07.129 [I] step=2630 loss=0.0124 smoothed_loss=0.0177 lr=2.23e-05 grad_norm=0.4944 step_time=0.5364s data_time=0.0832s it/s=1.614 eta_to_10000=4566.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0235 grad_action_out_proj_arms=0.1675 grad_arm_token_fuse=0.1191 grad_shared_expert=0.4886 (18633:train_pytorch.py:850) + Training: 26%|██▋ | 2630/10000 [34:35<1:16:58, 1.60it/s, loss=0.0170, lr=2.23e-05, step=2629] Training: 26%|██▋ | 2630/10000 [34:35<1:16:58, 1.60it/s, loss=0.0124, lr=2.23e-05, step=2630] Training: 26%|██▋ | 2631/10000 [34:35<1:12:00, 1.71it/s, loss=0.0124, lr=2.23e-05, step=2630] Training: 26%|██▋ | 2631/10000 [34:35<1:12:00, 1.71it/s, loss=0.0053, lr=2.23e-05, step=2631] Training: 26%|██▋ | 2632/10000 [34:36<1:10:10, 1.75it/s, loss=0.0053, lr=2.23e-05, step=2631] Training: 26%|██▋ | 2632/10000 [34:36<1:10:10, 1.75it/s, loss=0.0241, lr=2.23e-05, step=2632] Training: 26%|██▋ | 2633/10000 [34:36<1:08:44, 1.79it/s, loss=0.0241, lr=2.23e-05, step=2632] Training: 26%|██▋ | 2633/10000 [34:36<1:08:44, 1.79it/s, loss=0.0143, lr=2.23e-05, step=2633] Training: 26%|██▋ | 2634/10000 [34:37<1:06:00, 1.86it/s, loss=0.0143, lr=2.23e-05, step=2633] Training: 26%|██▋ | 2634/10000 [34:37<1:06:00, 1.86it/s, loss=0.0353, lr=2.23e-05, step=2634] Training: 26%|██▋ | 2635/10000 [34:38<1:11:38, 1.71it/s, loss=0.0353, lr=2.23e-05, step=2634] Training: 26%|██▋ | 2635/10000 [34:38<1:11:38, 1.71it/s, loss=0.0102, lr=2.23e-05, step=2635] Training: 26%|██▋ | 2636/10000 [34:38<1:19:09, 1.55it/s, loss=0.0102, lr=2.23e-05, step=2635] Training: 26%|██▋ | 2636/10000 [34:38<1:19:09, 1.55it/s, loss=0.0079, lr=2.23e-05, step=2636] Training: 26%|██▋ | 2637/10000 [34:39<1:15:20, 1.63it/s, loss=0.0079, lr=2.23e-05, step=2636] Training: 26%|██▋ | 2637/10000 [34:39<1:15:20, 1.63it/s, loss=0.0089, lr=2.23e-05, step=2637] Training: 26%|██▋ | 2638/10000 [34:39<1:11:11, 1.72it/s, loss=0.0089, lr=2.23e-05, step=2637] Training: 26%|██▋ | 2638/10000 [34:39<1:11:11, 1.72it/s, loss=0.0090, lr=2.23e-05, step=2638] Training: 26%|██▋ | 2639/10000 [34:40<1:09:00, 1.78it/s, loss=0.0090, lr=2.23e-05, step=2638] Training: 26%|██▋ | 2639/10000 [34:40<1:09:00, 1.78it/s, loss=0.0148, lr=2.23e-05, step=2639]19:19:12.783 [I] step=2640 loss=0.0281 smoothed_loss=0.0166 lr=2.23e-05 grad_norm=0.5114 step_time=0.4898s data_time=0.0756s it/s=1.769 eta_to_10000=4160.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0220 grad_action_out_proj_arms=0.1929 grad_arm_token_fuse=0.1177 grad_shared_expert=0.5384 (18633:train_pytorch.py:850) + Training: 26%|██▋ | 2640/10000 [34:40<1:09:02, 1.78it/s, loss=0.0148, lr=2.23e-05, step=2639] Training: 26%|██▋ | 2640/10000 [34:40<1:09:02, 1.78it/s, loss=0.0281, lr=2.23e-05, step=2640] Training: 26%|██▋ | 2641/10000 [34:41<1:09:25, 1.77it/s, loss=0.0281, lr=2.23e-05, step=2640] Training: 26%|██▋ | 2641/10000 [34:41<1:09:25, 1.77it/s, loss=0.0172, lr=2.23e-05, step=2641] Training: 26%|██▋ | 2642/10000 [34:42<1:14:40, 1.64it/s, loss=0.0172, lr=2.23e-05, step=2641] Training: 26%|██▋ | 2642/10000 [34:42<1:14:40, 1.64it/s, loss=0.0140, lr=2.23e-05, step=2642] Training: 26%|██▋ | 2643/10000 [34:43<1:20:46, 1.52it/s, loss=0.0140, lr=2.23e-05, step=2642] Training: 26%|██▋ | 2643/10000 [34:43<1:20:46, 1.52it/s, loss=0.0236, lr=2.23e-05, step=2643] Training: 26%|██▋ | 2644/10000 [34:43<1:16:04, 1.61it/s, loss=0.0236, lr=2.23e-05, step=2643] Training: 26%|██▋ | 2644/10000 [34:43<1:16:04, 1.61it/s, loss=0.0036, lr=2.23e-05, step=2644] Training: 26%|██▋ | 2645/10000 [34:44<1:12:14, 1.70it/s, loss=0.0036, lr=2.23e-05, step=2644] Training: 26%|██▋ | 2645/10000 [34:44<1:12:14, 1.70it/s, loss=0.0303, lr=2.23e-05, step=2645] Training: 26%|██▋ | 2646/10000 [34:44<1:08:54, 1.78it/s, loss=0.0303, lr=2.23e-05, step=2645] Training: 26%|██▋ | 2646/10000 [34:44<1:08:54, 1.78it/s, loss=0.0084, lr=2.23e-05, step=2646] Training: 26%|██▋ | 2647/10000 [34:45<1:06:49, 1.83it/s, loss=0.0084, lr=2.23e-05, step=2646] Training: 26%|██▋ | 2647/10000 [34:45<1:06:49, 1.83it/s, loss=0.0348, lr=2.23e-05, step=2647] Training: 26%|██▋ | 2648/10000 [34:45<1:04:31, 1.90it/s, loss=0.0348, lr=2.23e-05, step=2647] Training: 26%|██▋ | 2648/10000 [34:45<1:04:31, 1.90it/s, loss=0.0132, lr=2.23e-05, step=2648] Training: 26%|██▋ | 2649/10000 [34:46<1:09:31, 1.76it/s, loss=0.0132, lr=2.23e-05, step=2648] Training: 26%|██▋ | 2649/10000 [34:46<1:09:31, 1.76it/s, loss=0.0129, lr=2.23e-05, step=2649]19:19:18.856 [I] step=2650 loss=0.0356 smoothed_loss=0.0190 lr=2.23e-05 grad_norm=0.5005 step_time=0.5332s data_time=0.0741s it/s=1.647 eta_to_10000=4462.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0167 grad_action_out_proj_arms=0.1246 grad_arm_token_fuse=0.0874 grad_shared_expert=0.4393 (18633:train_pytorch.py:850) + Training: 26%|██▋ | 2650/10000 [34:47<1:18:41, 1.56it/s, loss=0.0129, lr=2.23e-05, step=2649] Training: 26%|██▋ | 2650/10000 [34:47<1:18:41, 1.56it/s, loss=0.0356, lr=2.23e-05, step=2650] Training: 27%|██▋ | 2651/10000 [34:47<1:22:14, 1.49it/s, loss=0.0356, lr=2.23e-05, step=2650] Training: 27%|██▋ | 2651/10000 [34:47<1:22:14, 1.49it/s, loss=0.0084, lr=2.23e-05, step=2651] Training: 27%|██▋ | 2652/10000 [34:48<1:15:34, 1.62it/s, loss=0.0084, lr=2.23e-05, step=2651] Training: 27%|██▋ | 2652/10000 [34:48<1:15:34, 1.62it/s, loss=0.0297, lr=2.23e-05, step=2652] Training: 27%|██▋ | 2653/10000 [34:48<1:19:25, 1.54it/s, loss=0.0297, lr=2.23e-05, step=2652] Training: 27%|██▋ | 2653/10000 [34:48<1:19:25, 1.54it/s, loss=0.0134, lr=2.23e-05, step=2653] Training: 27%|██▋ | 2654/10000 [34:49<1:13:47, 1.66it/s, loss=0.0134, lr=2.23e-05, step=2653] Training: 27%|██▋ | 2654/10000 [34:49<1:13:47, 1.66it/s, loss=0.0462, lr=2.23e-05, step=2654] Training: 27%|██▋ | 2655/10000 [34:49<1:09:24, 1.76it/s, loss=0.0462, lr=2.23e-05, step=2654] Training: 27%|██▋ | 2655/10000 [34:49<1:09:24, 1.76it/s, loss=0.0198, lr=2.23e-05, step=2655] Training: 27%|██▋ | 2656/10000 [34:50<1:12:08, 1.70it/s, loss=0.0198, lr=2.23e-05, step=2655] Training: 27%|██▋ | 2656/10000 [34:50<1:12:08, 1.70it/s, loss=0.0145, lr=2.23e-05, step=2656] Training: 27%|██▋ | 2657/10000 [34:51<1:22:55, 1.48it/s, loss=0.0145, lr=2.23e-05, step=2656] Training: 27%|██▋ | 2657/10000 [34:51<1:22:55, 1.48it/s, loss=0.0051, lr=2.23e-05, step=2657] Training: 27%|██▋ | 2658/10000 [34:51<1:15:38, 1.62it/s, loss=0.0051, lr=2.23e-05, step=2657] Training: 27%|██▋ | 2658/10000 [34:51<1:15:38, 1.62it/s, loss=0.0136, lr=2.23e-05, step=2658] Training: 27%|██▋ | 2659/10000 [34:52<1:11:26, 1.71it/s, loss=0.0136, lr=2.23e-05, step=2658] Training: 27%|██▋ | 2659/10000 [34:52<1:11:26, 1.71it/s, loss=0.0213, lr=2.23e-05, step=2659]19:19:24.839 [I] step=2660 loss=0.0278 smoothed_loss=0.0196 lr=2.23e-05 grad_norm=0.5207 step_time=0.5006s data_time=0.0977s it/s=1.672 eta_to_10000=4390.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0096 grad_action_out_proj_arms=0.1465 grad_arm_token_fuse=0.0455 grad_shared_expert=0.3820 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2660/10000 [34:53<1:09:55, 1.75it/s, loss=0.0213, lr=2.23e-05, step=2659] Training: 27%|██▋ | 2660/10000 [34:53<1:09:55, 1.75it/s, loss=0.0278, lr=2.23e-05, step=2660] Training: 27%|██▋ | 2661/10000 [34:53<1:08:40, 1.78it/s, loss=0.0278, lr=2.23e-05, step=2660] Training: 27%|██▋ | 2661/10000 [34:53<1:08:40, 1.78it/s, loss=0.0356, lr=2.22e-05, step=2661] Training: 27%|██▋ | 2662/10000 [34:54<1:05:58, 1.85it/s, loss=0.0356, lr=2.22e-05, step=2661] Training: 27%|██▋ | 2662/10000 [34:54<1:05:58, 1.85it/s, loss=0.0062, lr=2.22e-05, step=2662] Training: 27%|██▋ | 2663/10000 [34:54<1:10:11, 1.74it/s, loss=0.0062, lr=2.22e-05, step=2662] Training: 27%|██▋ | 2663/10000 [34:54<1:10:11, 1.74it/s, loss=0.0150, lr=2.22e-05, step=2663] Training: 27%|██▋ | 2664/10000 [34:55<1:15:29, 1.62it/s, loss=0.0150, lr=2.22e-05, step=2663] Training: 27%|██▋ | 2664/10000 [34:55<1:15:29, 1.62it/s, loss=0.0104, lr=2.22e-05, step=2664] Training: 27%|██▋ | 2665/10000 [34:56<1:24:40, 1.44it/s, loss=0.0104, lr=2.22e-05, step=2664] Training: 27%|██▋ | 2665/10000 [34:56<1:24:40, 1.44it/s, loss=0.0050, lr=2.22e-05, step=2665] Training: 27%|██▋ | 2666/10000 [34:57<1:47:00, 1.14it/s, loss=0.0050, lr=2.22e-05, step=2665] Training: 27%|██▋ | 2666/10000 [34:57<1:47:00, 1.14it/s, loss=0.0151, lr=2.22e-05, step=2666] Training: 27%|██▋ | 2667/10000 [34:58<1:56:24, 1.05it/s, loss=0.0151, lr=2.22e-05, step=2666] Training: 27%|██▋ | 2667/10000 [34:58<1:56:24, 1.05it/s, loss=0.0147, lr=2.22e-05, step=2667] Training: 27%|██▋ | 2668/10000 [34:59<2:04:06, 1.02s/it, loss=0.0147, lr=2.22e-05, step=2667] Training: 27%|██▋ | 2668/10000 [34:59<2:04:06, 1.02s/it, loss=0.0344, lr=2.22e-05, step=2668] Training: 27%|██▋ | 2669/10000 [35:01<2:19:17, 1.14s/it, loss=0.0344, lr=2.22e-05, step=2668] Training: 27%|██▋ | 2669/10000 [35:01<2:19:17, 1.14s/it, loss=0.0173, lr=2.22e-05, step=2669]19:19:34.557 [I] step=2670 loss=0.0115 smoothed_loss=0.0176 lr=2.22e-05 grad_norm=0.4742 step_time=0.6650s data_time=0.3068s it/s=1.029 eta_to_10000=7122.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0112 grad_action_out_proj_arms=0.1141 grad_arm_token_fuse=0.0547 grad_shared_expert=0.3952 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2670/10000 [35:02<2:30:05, 1.23s/it, loss=0.0173, lr=2.22e-05, step=2669] Training: 27%|██▋ | 2670/10000 [35:02<2:30:05, 1.23s/it, loss=0.0115, lr=2.22e-05, step=2670] Training: 27%|██▋ | 2671/10000 [35:04<2:35:44, 1.28s/it, loss=0.0115, lr=2.22e-05, step=2670] Training: 27%|██▋ | 2671/10000 [35:04<2:35:44, 1.28s/it, loss=0.0098, lr=2.22e-05, step=2671] Training: 27%|██▋ | 2672/10000 [35:05<2:54:46, 1.43s/it, loss=0.0098, lr=2.22e-05, step=2671] Training: 27%|██▋ | 2672/10000 [35:05<2:54:46, 1.43s/it, loss=0.0221, lr=2.22e-05, step=2672] Training: 27%|██▋ | 2673/10000 [35:07<2:42:02, 1.33s/it, loss=0.0221, lr=2.22e-05, step=2672] Training: 27%|██▋ | 2673/10000 [35:07<2:42:02, 1.33s/it, loss=0.0071, lr=2.22e-05, step=2673] Training: 27%|██▋ | 2674/10000 [35:08<2:34:24, 1.26s/it, loss=0.0071, lr=2.22e-05, step=2673] Training: 27%|██▋ | 2674/10000 [35:08<2:34:24, 1.26s/it, loss=0.0329, lr=2.22e-05, step=2674] Training: 27%|██▋ | 2675/10000 [35:09<2:27:27, 1.21s/it, loss=0.0329, lr=2.22e-05, step=2674] Training: 27%|██▋ | 2675/10000 [35:09<2:27:27, 1.21s/it, loss=0.0294, lr=2.22e-05, step=2675] Training: 27%|██▋ | 2676/10000 [35:10<2:19:43, 1.14s/it, loss=0.0294, lr=2.22e-05, step=2675] Training: 27%|██▋ | 2676/10000 [35:10<2:19:43, 1.14s/it, loss=0.0076, lr=2.22e-05, step=2676] Training: 27%|██▋ | 2677/10000 [35:11<2:28:22, 1.22s/it, loss=0.0076, lr=2.22e-05, step=2676] Training: 27%|██▋ | 2677/10000 [35:11<2:28:22, 1.22s/it, loss=0.0276, lr=2.22e-05, step=2677] Training: 27%|██▋ | 2678/10000 [35:12<2:34:25, 1.27s/it, loss=0.0276, lr=2.22e-05, step=2677] Training: 27%|██▋ | 2678/10000 [35:12<2:34:25, 1.27s/it, loss=0.0165, lr=2.22e-05, step=2678] Training: 27%|██▋ | 2679/10000 [35:14<2:29:41, 1.23s/it, loss=0.0165, lr=2.22e-05, step=2678] Training: 27%|██▋ | 2679/10000 [35:14<2:29:41, 1.23s/it, loss=0.0162, lr=2.22e-05, step=2679]19:19:47.012 [I] step=2680 loss=0.0107 smoothed_loss=0.0177 lr=2.22e-05 grad_norm=0.4972 step_time=0.8740s data_time=0.3716s it/s=0.804 eta_to_10000=9107.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0075 grad_action_out_proj_arms=0.1118 grad_arm_token_fuse=0.0334 grad_shared_expert=0.3007 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2680/10000 [35:15<2:24:47, 1.19s/it, loss=0.0162, lr=2.22e-05, step=2679] Training: 27%|██▋ | 2680/10000 [35:15<2:24:47, 1.19s/it, loss=0.0107, lr=2.22e-05, step=2680] Training: 27%|██▋ | 2681/10000 [35:16<2:23:18, 1.17s/it, loss=0.0107, lr=2.22e-05, step=2680] Training: 27%|██▋ | 2681/10000 [35:16<2:23:18, 1.17s/it, loss=0.0051, lr=2.22e-05, step=2681] Training: 27%|██▋ | 2682/10000 [35:17<2:14:51, 1.11s/it, loss=0.0051, lr=2.22e-05, step=2681] Training: 27%|██▋ | 2682/10000 [35:17<2:14:51, 1.11s/it, loss=0.0217, lr=2.22e-05, step=2682] Training: 27%|██▋ | 2683/10000 [35:18<2:25:06, 1.19s/it, loss=0.0217, lr=2.22e-05, step=2682] Training: 27%|██▋ | 2683/10000 [35:18<2:25:06, 1.19s/it, loss=0.0257, lr=2.22e-05, step=2683] Training: 27%|██▋ | 2684/10000 [35:19<2:15:38, 1.11s/it, loss=0.0257, lr=2.22e-05, step=2683] Training: 27%|██▋ | 2684/10000 [35:19<2:15:38, 1.11s/it, loss=0.0390, lr=2.22e-05, step=2684] Training: 27%|██▋ | 2685/10000 [35:20<2:08:33, 1.05s/it, loss=0.0390, lr=2.22e-05, step=2684] Training: 27%|██▋ | 2685/10000 [35:20<2:08:33, 1.05s/it, loss=0.0186, lr=2.22e-05, step=2685] Training: 27%|██▋ | 2686/10000 [35:21<2:22:03, 1.17s/it, loss=0.0186, lr=2.22e-05, step=2685] Training: 27%|██▋ | 2686/10000 [35:21<2:22:03, 1.17s/it, loss=0.0075, lr=2.22e-05, step=2686] Training: 27%|██▋ | 2687/10000 [35:23<2:22:57, 1.17s/it, loss=0.0075, lr=2.22e-05, step=2686] Training: 27%|██▋ | 2687/10000 [35:23<2:22:57, 1.17s/it, loss=0.0257, lr=2.22e-05, step=2687] Training: 27%|██▋ | 2688/10000 [35:24<2:15:40, 1.11s/it, loss=0.0257, lr=2.22e-05, step=2687] Training: 27%|██▋ | 2688/10000 [35:24<2:15:40, 1.11s/it, loss=0.0181, lr=2.22e-05, step=2688] Training: 27%|██▋ | 2689/10000 [35:25<2:11:38, 1.08s/it, loss=0.0181, lr=2.22e-05, step=2688] Training: 27%|██▋ | 2689/10000 [35:25<2:11:38, 1.08s/it, loss=0.0323, lr=2.22e-05, step=2689]19:19:57.909 [I] step=2690 loss=0.0176 smoothed_loss=0.0202 lr=2.22e-05 grad_norm=0.4937 step_time=0.7200s data_time=0.3696s it/s=0.918 eta_to_10000=7961.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0201 grad_action_out_proj_arms=0.2239 grad_arm_token_fuse=0.1098 grad_shared_expert=0.5080 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2690/10000 [35:26<2:07:44, 1.05s/it, loss=0.0323, lr=2.22e-05, step=2689] Training: 27%|██▋ | 2690/10000 [35:26<2:07:44, 1.05s/it, loss=0.0176, lr=2.22e-05, step=2690] Training: 27%|██▋ | 2691/10000 [35:27<2:07:44, 1.05s/it, loss=0.0176, lr=2.22e-05, step=2690] Training: 27%|██▋ | 2691/10000 [35:27<2:07:44, 1.05s/it, loss=0.0114, lr=2.22e-05, step=2691] Training: 27%|██▋ | 2692/10000 [35:28<2:01:30, 1.00it/s, loss=0.0114, lr=2.22e-05, step=2691] Training: 27%|██▋ | 2692/10000 [35:28<2:01:30, 1.00it/s, loss=0.0254, lr=2.22e-05, step=2692] Training: 27%|██▋ | 2693/10000 [35:30<2:39:32, 1.31s/it, loss=0.0254, lr=2.22e-05, step=2692] Training: 27%|██▋ | 2693/10000 [35:30<2:39:32, 1.31s/it, loss=0.0150, lr=2.22e-05, step=2693] Training: 27%|██▋ | 2694/10000 [35:30<2:22:28, 1.17s/it, loss=0.0150, lr=2.22e-05, step=2693] Training: 27%|██▋ | 2694/10000 [35:30<2:22:28, 1.17s/it, loss=0.0272, lr=2.22e-05, step=2694] Training: 27%|██▋ | 2695/10000 [35:31<2:13:24, 1.10s/it, loss=0.0272, lr=2.22e-05, step=2694] Training: 27%|██▋ | 2695/10000 [35:31<2:13:24, 1.10s/it, loss=0.0093, lr=2.22e-05, step=2695] Training: 27%|██▋ | 2696/10000 [35:32<2:11:54, 1.08s/it, loss=0.0093, lr=2.22e-05, step=2695] Training: 27%|██▋ | 2696/10000 [35:32<2:11:54, 1.08s/it, loss=0.0170, lr=2.22e-05, step=2696] Training: 27%|██▋ | 2697/10000 [35:33<2:05:02, 1.03s/it, loss=0.0170, lr=2.22e-05, step=2696] Training: 27%|██▋ | 2697/10000 [35:33<2:05:02, 1.03s/it, loss=0.0175, lr=2.22e-05, step=2697] Training: 27%|██▋ | 2698/10000 [35:34<2:00:04, 1.01it/s, loss=0.0175, lr=2.22e-05, step=2697] Training: 27%|██▋ | 2698/10000 [35:34<2:00:04, 1.01it/s, loss=0.0069, lr=2.22e-05, step=2698] Training: 27%|██▋ | 2699/10000 [35:35<1:58:09, 1.03it/s, loss=0.0069, lr=2.22e-05, step=2698] Training: 27%|██▋ | 2699/10000 [35:35<1:58:09, 1.03it/s, loss=0.0137, lr=2.22e-05, step=2699]19:20:08.860 [I] step=2700 loss=0.0058 smoothed_loss=0.0161 lr=2.22e-05 grad_norm=0.5484 step_time=0.8121s data_time=0.2830s it/s=0.913 eta_to_10000=7991.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0111 grad_action_out_proj_arms=0.1219 grad_arm_token_fuse=0.0559 grad_shared_expert=0.3449 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2700/10000 [35:37<2:15:28, 1.11s/it, loss=0.0137, lr=2.22e-05, step=2699] Training: 27%|██▋ | 2700/10000 [35:37<2:15:28, 1.11s/it, loss=0.0058, lr=2.22e-05, step=2700] Training: 27%|██▋ | 2701/10000 [35:38<2:12:00, 1.09s/it, loss=0.0058, lr=2.22e-05, step=2700] Training: 27%|██▋ | 2701/10000 [35:38<2:12:00, 1.09s/it, loss=0.0343, lr=2.22e-05, step=2701] Training: 27%|██▋ | 2702/10000 [35:38<2:03:12, 1.01s/it, loss=0.0343, lr=2.22e-05, step=2701] Training: 27%|██▋ | 2702/10000 [35:38<2:03:12, 1.01s/it, loss=0.0049, lr=2.21e-05, step=2702] Training: 27%|██▋ | 2703/10000 [35:39<2:05:11, 1.03s/it, loss=0.0049, lr=2.21e-05, step=2702] Training: 27%|██▋ | 2703/10000 [35:39<2:05:11, 1.03s/it, loss=0.1024, lr=2.21e-05, step=2703] Training: 27%|██▋ | 2704/10000 [35:40<2:04:30, 1.02s/it, loss=0.1024, lr=2.21e-05, step=2703] Training: 27%|██▋ | 2704/10000 [35:40<2:04:30, 1.02s/it, loss=0.0303, lr=2.21e-05, step=2704] Training: 27%|██▋ | 2705/10000 [35:41<1:58:22, 1.03it/s, loss=0.0303, lr=2.21e-05, step=2704] Training: 27%|██▋ | 2705/10000 [35:41<1:58:22, 1.03it/s, loss=0.0190, lr=2.21e-05, step=2705] Training: 27%|██▋ | 2706/10000 [35:42<1:52:33, 1.08it/s, loss=0.0190, lr=2.21e-05, step=2705] Training: 27%|██▋ | 2706/10000 [35:42<1:52:33, 1.08it/s, loss=0.0155, lr=2.21e-05, step=2706] Training: 27%|██▋ | 2707/10000 [35:43<1:59:31, 1.02it/s, loss=0.0155, lr=2.21e-05, step=2706] Training: 27%|██▋ | 2707/10000 [35:43<1:59:31, 1.02it/s, loss=0.0236, lr=2.21e-05, step=2707] Training: 27%|██▋ | 2708/10000 [35:44<2:00:45, 1.01it/s, loss=0.0236, lr=2.21e-05, step=2707] Training: 27%|██▋ | 2708/10000 [35:44<2:00:45, 1.01it/s, loss=0.0114, lr=2.21e-05, step=2708] Training: 27%|██▋ | 2709/10000 [35:45<1:55:34, 1.05it/s, loss=0.0114, lr=2.21e-05, step=2708] Training: 27%|██▋ | 2709/10000 [35:45<1:55:34, 1.05it/s, loss=0.0107, lr=2.21e-05, step=2709]19:20:18.341 [I] step=2710 loss=0.0357 smoothed_loss=0.0230 lr=2.21e-05 grad_norm=0.5690 step_time=0.7255s data_time=0.2226s it/s=1.056 eta_to_10000=6905.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0206 grad_action_out_proj_arms=0.2175 grad_arm_token_fuse=0.1063 grad_shared_expert=0.5956 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2710/10000 [35:46<1:52:57, 1.08it/s, loss=0.0107, lr=2.21e-05, step=2709] Training: 27%|██▋ | 2710/10000 [35:46<1:52:57, 1.08it/s, loss=0.0357, lr=2.21e-05, step=2710] Training: 27%|██▋ | 2711/10000 [35:47<1:45:08, 1.16it/s, loss=0.0357, lr=2.21e-05, step=2710] Training: 27%|██▋ | 2711/10000 [35:47<1:45:08, 1.16it/s, loss=0.0368, lr=2.21e-05, step=2711] Training: 27%|██▋ | 2712/10000 [35:47<1:39:21, 1.22it/s, loss=0.0368, lr=2.21e-05, step=2711] Training: 27%|██▋ | 2712/10000 [35:47<1:39:21, 1.22it/s, loss=0.0142, lr=2.21e-05, step=2712] Training: 27%|██▋ | 2713/10000 [35:48<1:35:51, 1.27it/s, loss=0.0142, lr=2.21e-05, step=2712] Training: 27%|██▋ | 2713/10000 [35:48<1:35:51, 1.27it/s, loss=0.0158, lr=2.21e-05, step=2713] Training: 27%|██▋ | 2714/10000 [35:49<1:41:52, 1.19it/s, loss=0.0158, lr=2.21e-05, step=2713] Training: 27%|██▋ | 2714/10000 [35:49<1:41:52, 1.19it/s, loss=0.0066, lr=2.21e-05, step=2714] Training: 27%|██▋ | 2715/10000 [35:50<1:36:58, 1.25it/s, loss=0.0066, lr=2.21e-05, step=2714] Training: 27%|██▋ | 2715/10000 [35:50<1:36:58, 1.25it/s, loss=0.0186, lr=2.21e-05, step=2715] Training: 27%|██▋ | 2716/10000 [35:51<1:38:51, 1.23it/s, loss=0.0186, lr=2.21e-05, step=2715] Training: 27%|██▋ | 2716/10000 [35:51<1:38:51, 1.23it/s, loss=0.0129, lr=2.21e-05, step=2716] Training: 27%|██▋ | 2717/10000 [35:51<1:36:30, 1.26it/s, loss=0.0129, lr=2.21e-05, step=2716] Training: 27%|██▋ | 2717/10000 [35:51<1:36:30, 1.26it/s, loss=0.0381, lr=2.21e-05, step=2717] Training: 27%|██▋ | 2718/10000 [35:52<1:40:15, 1.21it/s, loss=0.0381, lr=2.21e-05, step=2717] Training: 27%|██▋ | 2718/10000 [35:52<1:40:15, 1.21it/s, loss=0.0177, lr=2.21e-05, step=2718] Training: 27%|██▋ | 2719/10000 [35:53<1:33:08, 1.30it/s, loss=0.0177, lr=2.21e-05, step=2718] Training: 27%|██▋ | 2719/10000 [35:53<1:33:08, 1.30it/s, loss=0.0190, lr=2.21e-05, step=2719]19:20:26.000 [I] step=2720 loss=0.0307 smoothed_loss=0.0221 lr=2.21e-05 grad_norm=0.4975 step_time=0.6115s data_time=0.1544s it/s=1.307 eta_to_10000=5570.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1846 grad_arm_token_fuse=0.0943 grad_shared_expert=0.4927 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2720/10000 [35:54<1:31:24, 1.33it/s, loss=0.0190, lr=2.21e-05, step=2719] Training: 27%|██▋ | 2720/10000 [35:54<1:31:24, 1.33it/s, loss=0.0307, lr=2.21e-05, step=2720] Training: 27%|██▋ | 2721/10000 [35:55<1:35:32, 1.27it/s, loss=0.0307, lr=2.21e-05, step=2720] Training: 27%|██▋ | 2721/10000 [35:55<1:35:32, 1.27it/s, loss=0.0049, lr=2.21e-05, step=2721] Training: 27%|██▋ | 2722/10000 [35:55<1:38:04, 1.24it/s, loss=0.0049, lr=2.21e-05, step=2721] Training: 27%|██▋ | 2722/10000 [35:55<1:38:04, 1.24it/s, loss=0.0763, lr=2.21e-05, step=2722] Training: 27%|██▋ | 2723/10000 [35:56<1:45:47, 1.15it/s, loss=0.0763, lr=2.21e-05, step=2722] Training: 27%|██▋ | 2723/10000 [35:56<1:45:47, 1.15it/s, loss=0.0353, lr=2.21e-05, step=2723] Training: 27%|██▋ | 2724/10000 [35:57<1:35:41, 1.27it/s, loss=0.0353, lr=2.21e-05, step=2723] Training: 27%|██▋ | 2724/10000 [35:57<1:35:41, 1.27it/s, loss=0.0302, lr=2.21e-05, step=2724] Training: 27%|██▋ | 2725/10000 [35:58<1:39:11, 1.22it/s, loss=0.0302, lr=2.21e-05, step=2724] Training: 27%|██▋ | 2725/10000 [35:58<1:39:11, 1.22it/s, loss=0.0175, lr=2.21e-05, step=2725] Training: 27%|██▋ | 2726/10000 [35:59<1:32:13, 1.31it/s, loss=0.0175, lr=2.21e-05, step=2725] Training: 27%|██▋ | 2726/10000 [35:59<1:32:13, 1.31it/s, loss=0.0151, lr=2.21e-05, step=2726] Training: 27%|██▋ | 2727/10000 [35:59<1:26:39, 1.40it/s, loss=0.0151, lr=2.21e-05, step=2726] Training: 27%|██▋ | 2727/10000 [35:59<1:26:39, 1.40it/s, loss=0.0058, lr=2.21e-05, step=2727] Training: 27%|██▋ | 2728/10000 [36:01<1:52:28, 1.08it/s, loss=0.0058, lr=2.21e-05, step=2727] Training: 27%|██▋ | 2728/10000 [36:01<1:52:28, 1.08it/s, loss=0.0053, lr=2.21e-05, step=2728] Training: 27%|██▋ | 2729/10000 [36:01<1:44:36, 1.16it/s, loss=0.0053, lr=2.21e-05, step=2728] Training: 27%|██▋ | 2729/10000 [36:01<1:44:36, 1.16it/s, loss=0.0147, lr=2.21e-05, step=2729]19:20:34.331 [I] step=2730 loss=0.0089 smoothed_loss=0.0196 lr=2.21e-05 grad_norm=0.5381 step_time=0.6826s data_time=0.1506s it/s=1.201 eta_to_10000=6054.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0118 grad_action_out_proj_arms=0.1252 grad_arm_token_fuse=0.0578 grad_shared_expert=0.3948 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2730/10000 [36:02<1:39:48, 1.21it/s, loss=0.0147, lr=2.21e-05, step=2729] Training: 27%|██▋ | 2730/10000 [36:02<1:39:48, 1.21it/s, loss=0.0089, lr=2.21e-05, step=2730] Training: 27%|██▋ | 2731/10000 [36:03<1:41:39, 1.19it/s, loss=0.0089, lr=2.21e-05, step=2730] Training: 27%|██▋ | 2731/10000 [36:03<1:41:39, 1.19it/s, loss=0.0125, lr=2.21e-05, step=2731] Training: 27%|██▋ | 2732/10000 [36:04<1:41:27, 1.19it/s, loss=0.0125, lr=2.21e-05, step=2731] Training: 27%|██▋ | 2732/10000 [36:04<1:41:27, 1.19it/s, loss=0.0172, lr=2.21e-05, step=2732] Training: 27%|██▋ | 2733/10000 [36:04<1:35:52, 1.26it/s, loss=0.0172, lr=2.21e-05, step=2732] Training: 27%|██▋ | 2733/10000 [36:04<1:35:52, 1.26it/s, loss=0.0081, lr=2.21e-05, step=2733] Training: 27%|██▋ | 2734/10000 [36:05<1:31:27, 1.32it/s, loss=0.0081, lr=2.21e-05, step=2733] Training: 27%|██▋ | 2734/10000 [36:05<1:31:27, 1.32it/s, loss=0.0086, lr=2.21e-05, step=2734] Training: 27%|██▋ | 2735/10000 [36:06<1:23:55, 1.44it/s, loss=0.0086, lr=2.21e-05, step=2734] Training: 27%|██▋ | 2735/10000 [36:06<1:23:55, 1.44it/s, loss=0.0400, lr=2.21e-05, step=2735] Training: 27%|██▋ | 2736/10000 [36:07<1:52:55, 1.07it/s, loss=0.0400, lr=2.21e-05, step=2735] Training: 27%|██▋ | 2736/10000 [36:07<1:52:55, 1.07it/s, loss=0.0310, lr=2.21e-05, step=2736] Training: 27%|██▋ | 2737/10000 [36:08<1:44:09, 1.16it/s, loss=0.0310, lr=2.21e-05, step=2736] Training: 27%|██▋ | 2737/10000 [36:08<1:44:09, 1.16it/s, loss=0.0156, lr=2.21e-05, step=2737] Training: 27%|██▋ | 2738/10000 [36:09<1:41:32, 1.19it/s, loss=0.0156, lr=2.21e-05, step=2737] Training: 27%|██▋ | 2738/10000 [36:09<1:41:32, 1.19it/s, loss=0.0164, lr=2.21e-05, step=2738] Training: 27%|██▋ | 2739/10000 [36:09<1:34:15, 1.28it/s, loss=0.0164, lr=2.21e-05, step=2738] Training: 27%|██▋ | 2739/10000 [36:09<1:34:15, 1.28it/s, loss=0.0109, lr=2.21e-05, step=2739]19:20:42.376 [I] step=2740 loss=0.0556 smoothed_loss=0.0223 lr=2.21e-05 grad_norm=0.5324 step_time=0.6120s data_time=0.1924s it/s=1.243 eta_to_10000=5838.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0219 grad_action_out_proj_arms=0.2113 grad_arm_token_fuse=0.1184 grad_shared_expert=0.6011 (18633:train_pytorch.py:850) + Training: 27%|██▋ | 2740/10000 [36:10<1:35:49, 1.26it/s, loss=0.0109, lr=2.21e-05, step=2739] Training: 27%|██▋ | 2740/10000 [36:10<1:35:49, 1.26it/s, loss=0.0556, lr=2.21e-05, step=2740] Training: 27%|██▋ | 2741/10000 [36:11<1:31:33, 1.32it/s, loss=0.0556, lr=2.21e-05, step=2740] Training: 27%|██▋ | 2741/10000 [36:11<1:31:33, 1.32it/s, loss=0.0181, lr=2.21e-05, step=2741] Training: 27%|██▋ | 2742/10000 [36:11<1:27:28, 1.38it/s, loss=0.0181, lr=2.21e-05, step=2741] Training: 27%|██▋ | 2742/10000 [36:11<1:27:28, 1.38it/s, loss=0.0204, lr=2.20e-05, step=2742] Training: 27%|██▋ | 2743/10000 [36:13<1:43:54, 1.16it/s, loss=0.0204, lr=2.20e-05, step=2742] Training: 27%|██▋ | 2743/10000 [36:13<1:43:54, 1.16it/s, loss=0.0297, lr=2.20e-05, step=2743] Training: 27%|██▋ | 2744/10000 [36:13<1:37:01, 1.25it/s, loss=0.0297, lr=2.20e-05, step=2743] Training: 27%|██▋ | 2744/10000 [36:13<1:37:01, 1.25it/s, loss=0.0058, lr=2.20e-05, step=2744] Training: 27%|██▋ | 2745/10000 [36:14<1:36:05, 1.26it/s, loss=0.0058, lr=2.20e-05, step=2744] Training: 27%|██▋ | 2745/10000 [36:14<1:36:05, 1.26it/s, loss=0.0111, lr=2.20e-05, step=2745] Training: 27%|██▋ | 2746/10000 [36:15<1:44:43, 1.15it/s, loss=0.0111, lr=2.20e-05, step=2745] Training: 27%|██▋ | 2746/10000 [36:15<1:44:43, 1.15it/s, loss=0.0254, lr=2.20e-05, step=2746] Training: 27%|██▋ | 2747/10000 [36:16<1:42:25, 1.18it/s, loss=0.0254, lr=2.20e-05, step=2746] Training: 27%|██▋ | 2747/10000 [36:16<1:42:25, 1.18it/s, loss=0.0288, lr=2.20e-05, step=2747] Training: 27%|██▋ | 2748/10000 [36:16<1:35:28, 1.27it/s, loss=0.0288, lr=2.20e-05, step=2747] Training: 27%|██▋ | 2748/10000 [36:16<1:35:28, 1.27it/s, loss=0.0047, lr=2.20e-05, step=2748] Training: 27%|██▋ | 2749/10000 [36:17<1:30:12, 1.34it/s, loss=0.0047, lr=2.20e-05, step=2748] Training: 27%|██▋ | 2749/10000 [36:17<1:30:12, 1.34it/s, loss=0.0277, lr=2.20e-05, step=2749]19:20:50.434 [I] step=2750 loss=0.0157 smoothed_loss=0.0200 lr=2.20e-05 grad_norm=0.4970 step_time=0.6140s data_time=0.1917s it/s=1.241 eta_to_10000=5839.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0112 grad_action_out_proj_arms=0.1694 grad_arm_token_fuse=0.0606 grad_shared_expert=0.5226 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2750/10000 [36:18<1:38:30, 1.23it/s, loss=0.0277, lr=2.20e-05, step=2749] Training: 28%|██▊ | 2750/10000 [36:18<1:38:30, 1.23it/s, loss=0.0157, lr=2.20e-05, step=2750] Training: 28%|██▊ | 2751/10000 [36:19<1:34:15, 1.28it/s, loss=0.0157, lr=2.20e-05, step=2750] Training: 28%|██▊ | 2751/10000 [36:19<1:34:15, 1.28it/s, loss=0.0131, lr=2.20e-05, step=2751] Training: 28%|██▊ | 2752/10000 [36:20<1:33:23, 1.29it/s, loss=0.0131, lr=2.20e-05, step=2751] Training: 28%|██▊ | 2752/10000 [36:20<1:33:23, 1.29it/s, loss=0.0172, lr=2.20e-05, step=2752] Training: 28%|██▊ | 2753/10000 [36:21<1:44:07, 1.16it/s, loss=0.0172, lr=2.20e-05, step=2752] Training: 28%|██▊ | 2753/10000 [36:21<1:44:07, 1.16it/s, loss=0.0114, lr=2.20e-05, step=2753] Training: 28%|██▊ | 2754/10000 [36:21<1:35:51, 1.26it/s, loss=0.0114, lr=2.20e-05, step=2753] Training: 28%|██▊ | 2754/10000 [36:21<1:35:51, 1.26it/s, loss=0.0169, lr=2.20e-05, step=2754] Training: 28%|██▊ | 2755/10000 [36:24<2:29:01, 1.23s/it, loss=0.0169, lr=2.20e-05, step=2754] Training: 28%|██▊ | 2755/10000 [36:24<2:29:01, 1.23s/it, loss=0.0186, lr=2.20e-05, step=2755] Training: 28%|██▊ | 2756/10000 [36:24<2:06:20, 1.05s/it, loss=0.0186, lr=2.20e-05, step=2755] Training: 28%|██▊ | 2756/10000 [36:24<2:06:20, 1.05s/it, loss=0.0190, lr=2.20e-05, step=2756] Training: 28%|██▊ | 2757/10000 [36:25<2:09:18, 1.07s/it, loss=0.0190, lr=2.20e-05, step=2756] Training: 28%|██▊ | 2757/10000 [36:25<2:09:18, 1.07s/it, loss=0.0065, lr=2.20e-05, step=2757] Training: 28%|██▊ | 2758/10000 [36:26<2:03:15, 1.02s/it, loss=0.0065, lr=2.20e-05, step=2757] Training: 28%|██▊ | 2758/10000 [36:26<2:03:15, 1.02s/it, loss=0.0050, lr=2.20e-05, step=2758] Training: 28%|██▊ | 2759/10000 [36:27<1:56:56, 1.03it/s, loss=0.0050, lr=2.20e-05, step=2758] Training: 28%|██▊ | 2759/10000 [36:27<1:56:56, 1.03it/s, loss=0.0166, lr=2.20e-05, step=2759]19:21:00.088 [I] step=2760 loss=0.0272 smoothed_loss=0.0171 lr=2.20e-05 grad_norm=0.4829 step_time=0.6599s data_time=0.3055s it/s=1.036 eta_to_10000=6989.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.1085 grad_arm_token_fuse=0.0484 grad_shared_expert=0.4126 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2760/10000 [36:28<1:48:48, 1.11it/s, loss=0.0166, lr=2.20e-05, step=2759] Training: 28%|██▊ | 2760/10000 [36:28<1:48:48, 1.11it/s, loss=0.0272, lr=2.20e-05, step=2760] Training: 28%|██▊ | 2761/10000 [36:28<1:37:30, 1.24it/s, loss=0.0272, lr=2.20e-05, step=2760] Training: 28%|██▊ | 2761/10000 [36:28<1:37:30, 1.24it/s, loss=0.0041, lr=2.20e-05, step=2761] Training: 28%|██▊ | 2762/10000 [36:29<1:32:35, 1.30it/s, loss=0.0041, lr=2.20e-05, step=2761] Training: 28%|██▊ | 2762/10000 [36:29<1:32:35, 1.30it/s, loss=0.0333, lr=2.20e-05, step=2762] Training: 28%|██▊ | 2763/10000 [36:30<1:43:30, 1.17it/s, loss=0.0333, lr=2.20e-05, step=2762] Training: 28%|██▊ | 2763/10000 [36:30<1:43:30, 1.17it/s, loss=0.0653, lr=2.20e-05, step=2763] Training: 28%|██▊ | 2764/10000 [36:31<1:58:05, 1.02it/s, loss=0.0653, lr=2.20e-05, step=2763] Training: 28%|██▊ | 2764/10000 [36:31<1:58:05, 1.02it/s, loss=0.0306, lr=2.20e-05, step=2764] Training: 28%|██▊ | 2765/10000 [36:32<1:57:22, 1.03it/s, loss=0.0306, lr=2.20e-05, step=2764] Training: 28%|██▊ | 2765/10000 [36:32<1:57:22, 1.03it/s, loss=0.0388, lr=2.20e-05, step=2765] Training: 28%|██▊ | 2766/10000 [36:33<1:53:45, 1.06it/s, loss=0.0388, lr=2.20e-05, step=2765] Training: 28%|██▊ | 2766/10000 [36:33<1:53:45, 1.06it/s, loss=0.0421, lr=2.20e-05, step=2766] Training: 28%|██▊ | 2767/10000 [36:34<1:39:42, 1.21it/s, loss=0.0421, lr=2.20e-05, step=2766] Training: 28%|██▊ | 2767/10000 [36:34<1:39:42, 1.21it/s, loss=0.0145, lr=2.20e-05, step=2767] Training: 28%|██▊ | 2768/10000 [36:34<1:32:42, 1.30it/s, loss=0.0145, lr=2.20e-05, step=2767] Training: 28%|██▊ | 2768/10000 [36:34<1:32:42, 1.30it/s, loss=0.0164, lr=2.20e-05, step=2768] Training: 28%|██▊ | 2769/10000 [36:35<1:26:38, 1.39it/s, loss=0.0164, lr=2.20e-05, step=2768] Training: 28%|██▊ | 2769/10000 [36:35<1:26:38, 1.39it/s, loss=0.0158, lr=2.20e-05, step=2769]19:21:07.956 [I] step=2770 loss=0.0139 smoothed_loss=0.0226 lr=2.20e-05 grad_norm=0.5479 step_time=0.5768s data_time=0.2100s it/s=1.271 eta_to_10000=5687.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.0963 grad_arm_token_fuse=0.0329 grad_shared_expert=0.4171 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2770/10000 [36:36<1:24:14, 1.43it/s, loss=0.0158, lr=2.20e-05, step=2769] Training: 28%|██▊ | 2770/10000 [36:36<1:24:14, 1.43it/s, loss=0.0139, lr=2.20e-05, step=2770] Training: 28%|██▊ | 2771/10000 [36:37<1:30:29, 1.33it/s, loss=0.0139, lr=2.20e-05, step=2770] Training: 28%|██▊ | 2771/10000 [36:37<1:30:29, 1.33it/s, loss=0.0105, lr=2.20e-05, step=2771] Training: 28%|██▊ | 2772/10000 [36:37<1:30:38, 1.33it/s, loss=0.0105, lr=2.20e-05, step=2771] Training: 28%|██▊ | 2772/10000 [36:37<1:30:38, 1.33it/s, loss=0.0491, lr=2.20e-05, step=2772] Training: 28%|██▊ | 2773/10000 [36:38<1:39:46, 1.21it/s, loss=0.0491, lr=2.20e-05, step=2772] Training: 28%|██▊ | 2773/10000 [36:38<1:39:46, 1.21it/s, loss=0.0454, lr=2.20e-05, step=2773] Training: 28%|██▊ | 2774/10000 [36:39<1:32:12, 1.31it/s, loss=0.0454, lr=2.20e-05, step=2773] Training: 28%|██▊ | 2774/10000 [36:39<1:32:12, 1.31it/s, loss=0.0271, lr=2.20e-05, step=2774] Training: 28%|██▊ | 2775/10000 [36:40<1:29:23, 1.35it/s, loss=0.0271, lr=2.20e-05, step=2774] Training: 28%|██▊ | 2775/10000 [36:40<1:29:23, 1.35it/s, loss=0.0073, lr=2.20e-05, step=2775] Training: 28%|██▊ | 2776/10000 [36:40<1:28:09, 1.37it/s, loss=0.0073, lr=2.20e-05, step=2775] Training: 28%|██▊ | 2776/10000 [36:40<1:28:09, 1.37it/s, loss=0.0087, lr=2.20e-05, step=2776] Training: 28%|██▊ | 2777/10000 [36:41<1:22:45, 1.45it/s, loss=0.0087, lr=2.20e-05, step=2776] Training: 28%|██▊ | 2777/10000 [36:41<1:22:45, 1.45it/s, loss=0.0467, lr=2.20e-05, step=2777] Training: 28%|██▊ | 2778/10000 [36:42<1:30:09, 1.34it/s, loss=0.0467, lr=2.20e-05, step=2777] Training: 28%|██▊ | 2778/10000 [36:42<1:30:09, 1.34it/s, loss=0.0354, lr=2.20e-05, step=2778] Training: 28%|██▊ | 2779/10000 [36:42<1:25:41, 1.40it/s, loss=0.0354, lr=2.20e-05, step=2778] Training: 28%|██▊ | 2779/10000 [36:42<1:25:41, 1.40it/s, loss=0.0074, lr=2.20e-05, step=2779]19:21:15.540 [I] step=2780 loss=0.0164 smoothed_loss=0.0236 lr=2.20e-05 grad_norm=0.4880 step_time=0.6330s data_time=0.1254s it/s=1.319 eta_to_10000=5472.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.1034 grad_arm_token_fuse=0.0353 grad_shared_expert=0.3744 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2780/10000 [36:43<1:30:01, 1.34it/s, loss=0.0074, lr=2.20e-05, step=2779] Training: 28%|██▊ | 2780/10000 [36:43<1:30:01, 1.34it/s, loss=0.0164, lr=2.20e-05, step=2780] Training: 28%|██▊ | 2781/10000 [36:44<1:26:50, 1.39it/s, loss=0.0164, lr=2.20e-05, step=2780] Training: 28%|██▊ | 2781/10000 [36:44<1:26:50, 1.39it/s, loss=0.0301, lr=2.20e-05, step=2781] Training: 28%|██▊ | 2782/10000 [36:45<1:24:11, 1.43it/s, loss=0.0301, lr=2.20e-05, step=2781] Training: 28%|██▊ | 2782/10000 [36:45<1:24:11, 1.43it/s, loss=0.0305, lr=2.19e-05, step=2782] Training: 28%|██▊ | 2783/10000 [36:45<1:22:04, 1.47it/s, loss=0.0305, lr=2.19e-05, step=2782] Training: 28%|██▊ | 2783/10000 [36:45<1:22:04, 1.47it/s, loss=0.0644, lr=2.19e-05, step=2783] Training: 28%|██▊ | 2784/10000 [36:46<1:18:42, 1.53it/s, loss=0.0644, lr=2.19e-05, step=2783] Training: 28%|██▊ | 2784/10000 [36:46<1:18:42, 1.53it/s, loss=0.0487, lr=2.19e-05, step=2784] Training: 28%|██▊ | 2785/10000 [36:46<1:15:54, 1.58it/s, loss=0.0487, lr=2.19e-05, step=2784] Training: 28%|██▊ | 2785/10000 [36:46<1:15:54, 1.58it/s, loss=0.0328, lr=2.19e-05, step=2785] Training: 28%|██▊ | 2786/10000 [36:47<1:24:42, 1.42it/s, loss=0.0328, lr=2.19e-05, step=2785] Training: 28%|██▊ | 2786/10000 [36:47<1:24:42, 1.42it/s, loss=0.0114, lr=2.19e-05, step=2786] Training: 28%|██▊ | 2787/10000 [36:48<1:36:36, 1.24it/s, loss=0.0114, lr=2.19e-05, step=2786] Training: 28%|██▊ | 2787/10000 [36:48<1:36:36, 1.24it/s, loss=0.0170, lr=2.19e-05, step=2787] Training: 28%|██▊ | 2788/10000 [36:49<1:30:17, 1.33it/s, loss=0.0170, lr=2.19e-05, step=2787] Training: 28%|██▊ | 2788/10000 [36:49<1:30:17, 1.33it/s, loss=0.0210, lr=2.19e-05, step=2788] Training: 28%|██▊ | 2789/10000 [36:50<1:27:39, 1.37it/s, loss=0.0210, lr=2.19e-05, step=2788] Training: 28%|██▊ | 2789/10000 [36:50<1:27:39, 1.37it/s, loss=0.0162, lr=2.19e-05, step=2789]19:21:22.526 [I] step=2790 loss=0.0062 smoothed_loss=0.0241 lr=2.19e-05 grad_norm=0.5330 step_time=0.5654s data_time=0.1332s it/s=1.432 eta_to_10000=5033.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1443 grad_arm_token_fuse=0.0672 grad_shared_expert=0.4396 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2790/10000 [36:50<1:24:42, 1.42it/s, loss=0.0162, lr=2.19e-05, step=2789] Training: 28%|██▊ | 2790/10000 [36:50<1:24:42, 1.42it/s, loss=0.0062, lr=2.19e-05, step=2790] Training: 28%|██▊ | 2791/10000 [36:51<1:26:08, 1.39it/s, loss=0.0062, lr=2.19e-05, step=2790] Training: 28%|██▊ | 2791/10000 [36:51<1:26:08, 1.39it/s, loss=0.0135, lr=2.19e-05, step=2791] Training: 28%|██▊ | 2792/10000 [36:52<1:24:25, 1.42it/s, loss=0.0135, lr=2.19e-05, step=2791] Training: 28%|██▊ | 2792/10000 [36:52<1:24:25, 1.42it/s, loss=0.0073, lr=2.19e-05, step=2792] Training: 28%|██▊ | 2793/10000 [36:52<1:31:06, 1.32it/s, loss=0.0073, lr=2.19e-05, step=2792] Training: 28%|██▊ | 2793/10000 [36:53<1:31:06, 1.32it/s, loss=0.0125, lr=2.19e-05, step=2793] Training: 28%|██▊ | 2794/10000 [36:53<1:36:00, 1.25it/s, loss=0.0125, lr=2.19e-05, step=2793] Training: 28%|██▊ | 2794/10000 [36:53<1:36:00, 1.25it/s, loss=0.0236, lr=2.19e-05, step=2794] Training: 28%|██▊ | 2795/10000 [36:54<1:34:15, 1.27it/s, loss=0.0236, lr=2.19e-05, step=2794] Training: 28%|██▊ | 2795/10000 [36:54<1:34:15, 1.27it/s, loss=0.0508, lr=2.19e-05, step=2795] Training: 28%|██▊ | 2796/10000 [36:55<1:40:59, 1.19it/s, loss=0.0508, lr=2.19e-05, step=2795] Training: 28%|██▊ | 2796/10000 [36:55<1:40:59, 1.19it/s, loss=0.0369, lr=2.19e-05, step=2796] Training: 28%|██▊ | 2797/10000 [36:56<1:31:25, 1.31it/s, loss=0.0369, lr=2.19e-05, step=2796] Training: 28%|██▊ | 2797/10000 [36:56<1:31:25, 1.31it/s, loss=0.0770, lr=2.19e-05, step=2797] Training: 28%|██▊ | 2798/10000 [36:56<1:31:55, 1.31it/s, loss=0.0770, lr=2.19e-05, step=2797] Training: 28%|██▊ | 2798/10000 [36:56<1:31:55, 1.31it/s, loss=0.0138, lr=2.19e-05, step=2798] Training: 28%|██▊ | 2799/10000 [36:57<1:23:45, 1.43it/s, loss=0.0138, lr=2.19e-05, step=2798] Training: 28%|██▊ | 2799/10000 [36:57<1:23:45, 1.43it/s, loss=0.0102, lr=2.19e-05, step=2799]19:21:30.175 [I] step=2800 loss=0.0506 smoothed_loss=0.0292 lr=2.19e-05 grad_norm=0.4862 step_time=0.5926s data_time=0.1724s it/s=1.307 eta_to_10000=5507.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0235 grad_action_out_proj_arms=0.1602 grad_arm_token_fuse=0.1168 grad_shared_expert=0.5073 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2800/10000 [36:58<1:28:45, 1.35it/s, loss=0.0102, lr=2.19e-05, step=2799] Training: 28%|██▊ | 2800/10000 [36:58<1:28:45, 1.35it/s, loss=0.0506, lr=2.19e-05, step=2800] Training: 28%|██▊ | 2801/10000 [36:59<1:35:20, 1.26it/s, loss=0.0506, lr=2.19e-05, step=2800] Training: 28%|██▊ | 2801/10000 [36:59<1:35:20, 1.26it/s, loss=0.0446, lr=2.19e-05, step=2801] Training: 28%|██▊ | 2802/10000 [37:00<1:44:14, 1.15it/s, loss=0.0446, lr=2.19e-05, step=2801] Training: 28%|██▊ | 2802/10000 [37:00<1:44:14, 1.15it/s, loss=0.0256, lr=2.19e-05, step=2802] Training: 28%|██▊ | 2803/10000 [37:00<1:33:00, 1.29it/s, loss=0.0256, lr=2.19e-05, step=2802] Training: 28%|██▊ | 2803/10000 [37:00<1:33:00, 1.29it/s, loss=0.0166, lr=2.19e-05, step=2803] Training: 28%|██▊ | 2804/10000 [37:01<1:35:48, 1.25it/s, loss=0.0166, lr=2.19e-05, step=2803] Training: 28%|██▊ | 2804/10000 [37:01<1:35:48, 1.25it/s, loss=0.0247, lr=2.19e-05, step=2804] Training: 28%|██▊ | 2805/10000 [37:02<1:32:38, 1.29it/s, loss=0.0247, lr=2.19e-05, step=2804] Training: 28%|██▊ | 2805/10000 [37:02<1:32:38, 1.29it/s, loss=0.0208, lr=2.19e-05, step=2805] Training: 28%|██▊ | 2806/10000 [37:04<2:07:53, 1.07s/it, loss=0.0208, lr=2.19e-05, step=2805] Training: 28%|██▊ | 2806/10000 [37:04<2:07:53, 1.07s/it, loss=0.0106, lr=2.19e-05, step=2806] Training: 28%|██▊ | 2807/10000 [37:05<2:00:21, 1.00s/it, loss=0.0106, lr=2.19e-05, step=2806] Training: 28%|██▊ | 2807/10000 [37:05<2:00:21, 1.00s/it, loss=0.0053, lr=2.19e-05, step=2807] Training: 28%|██▊ | 2808/10000 [37:06<2:23:14, 1.19s/it, loss=0.0053, lr=2.19e-05, step=2807] Training: 28%|██▊ | 2808/10000 [37:06<2:23:14, 1.19s/it, loss=0.0183, lr=2.19e-05, step=2808] Training: 28%|██▊ | 2809/10000 [37:07<2:02:15, 1.02s/it, loss=0.0183, lr=2.19e-05, step=2808] Training: 28%|██▊ | 2809/10000 [37:07<2:02:15, 1.02s/it, loss=0.0385, lr=2.19e-05, step=2809]19:21:40.460 [I] step=2810 loss=0.0084 smoothed_loss=0.0232 lr=2.19e-05 grad_norm=0.5652 step_time=0.6948s data_time=0.3336s it/s=0.972 eta_to_10000=7393.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0123 grad_action_out_proj_arms=0.1484 grad_arm_token_fuse=0.0595 grad_shared_expert=0.6626 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2810/10000 [37:08<2:13:33, 1.11s/it, loss=0.0385, lr=2.19e-05, step=2809] Training: 28%|██▊ | 2810/10000 [37:08<2:13:33, 1.11s/it, loss=0.0084, lr=2.19e-05, step=2810] Training: 28%|██▊ | 2811/10000 [37:09<1:55:00, 1.04it/s, loss=0.0084, lr=2.19e-05, step=2810] Training: 28%|██▊ | 2811/10000 [37:09<1:55:00, 1.04it/s, loss=0.0100, lr=2.19e-05, step=2811] Training: 28%|██▊ | 2812/10000 [37:10<1:50:16, 1.09it/s, loss=0.0100, lr=2.19e-05, step=2811] Training: 28%|██▊ | 2812/10000 [37:10<1:50:16, 1.09it/s, loss=0.0280, lr=2.19e-05, step=2812] Training: 28%|██▊ | 2813/10000 [37:12<2:31:46, 1.27s/it, loss=0.0280, lr=2.19e-05, step=2812] Training: 28%|██▊ | 2813/10000 [37:12<2:31:46, 1.27s/it, loss=0.0170, lr=2.19e-05, step=2813] Training: 28%|██▊ | 2814/10000 [37:12<2:16:49, 1.14s/it, loss=0.0170, lr=2.19e-05, step=2813] Training: 28%|██▊ | 2814/10000 [37:12<2:16:49, 1.14s/it, loss=0.0134, lr=2.19e-05, step=2814] Training: 28%|██▊ | 2815/10000 [37:14<2:16:05, 1.14s/it, loss=0.0134, lr=2.19e-05, step=2814] Training: 28%|██▊ | 2815/10000 [37:14<2:16:05, 1.14s/it, loss=0.0063, lr=2.19e-05, step=2815] Training: 28%|██▊ | 2816/10000 [37:14<1:57:25, 1.02it/s, loss=0.0063, lr=2.19e-05, step=2815] Training: 28%|██▊ | 2816/10000 [37:14<1:57:25, 1.02it/s, loss=0.0112, lr=2.19e-05, step=2816] Training: 28%|██▊ | 2817/10000 [37:15<1:45:32, 1.13it/s, loss=0.0112, lr=2.19e-05, step=2816] Training: 28%|██▊ | 2817/10000 [37:15<1:45:32, 1.13it/s, loss=0.0236, lr=2.19e-05, step=2817] Training: 28%|██▊ | 2818/10000 [37:16<1:57:38, 1.02it/s, loss=0.0236, lr=2.19e-05, step=2817] Training: 28%|██▊ | 2818/10000 [37:16<1:57:38, 1.02it/s, loss=0.0030, lr=2.19e-05, step=2818] Training: 28%|██▊ | 2819/10000 [37:17<1:44:19, 1.15it/s, loss=0.0030, lr=2.19e-05, step=2818] Training: 28%|██▊ | 2819/10000 [37:17<1:44:19, 1.15it/s, loss=0.0126, lr=2.19e-05, step=2819]19:21:49.873 [I] step=2820 loss=0.0361 smoothed_loss=0.0190 lr=2.19e-05 grad_norm=0.5122 step_time=0.6476s data_time=0.2936s it/s=1.063 eta_to_10000=6757.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0190 grad_action_out_proj_arms=0.1762 grad_arm_token_fuse=0.1021 grad_shared_expert=0.7007 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2820/10000 [37:18<1:43:02, 1.16it/s, loss=0.0126, lr=2.19e-05, step=2819] Training: 28%|██▊ | 2820/10000 [37:18<1:43:02, 1.16it/s, loss=0.0361, lr=2.19e-05, step=2820] Training: 28%|██▊ | 2821/10000 [37:18<1:44:56, 1.14it/s, loss=0.0361, lr=2.19e-05, step=2820] Training: 28%|██▊ | 2821/10000 [37:18<1:44:56, 1.14it/s, loss=0.0055, lr=2.18e-05, step=2821] Training: 28%|██▊ | 2822/10000 [37:19<1:39:45, 1.20it/s, loss=0.0055, lr=2.18e-05, step=2821] Training: 28%|██▊ | 2822/10000 [37:19<1:39:45, 1.20it/s, loss=0.0051, lr=2.18e-05, step=2822] Training: 28%|██▊ | 2823/10000 [37:20<1:40:54, 1.19it/s, loss=0.0051, lr=2.18e-05, step=2822] Training: 28%|██▊ | 2823/10000 [37:20<1:40:54, 1.19it/s, loss=0.0090, lr=2.18e-05, step=2823] Training: 28%|██▊ | 2824/10000 [37:21<1:46:59, 1.12it/s, loss=0.0090, lr=2.18e-05, step=2823] Training: 28%|██▊ | 2824/10000 [37:21<1:46:59, 1.12it/s, loss=0.0065, lr=2.18e-05, step=2824] Training: 28%|██▊ | 2825/10000 [37:22<1:41:08, 1.18it/s, loss=0.0065, lr=2.18e-05, step=2824] Training: 28%|██▊ | 2825/10000 [37:22<1:41:08, 1.18it/s, loss=0.0139, lr=2.18e-05, step=2825] Training: 28%|██▊ | 2826/10000 [37:22<1:30:33, 1.32it/s, loss=0.0139, lr=2.18e-05, step=2825] Training: 28%|██▊ | 2826/10000 [37:22<1:30:33, 1.32it/s, loss=0.0122, lr=2.18e-05, step=2826] Training: 28%|██▊ | 2827/10000 [37:23<1:24:50, 1.41it/s, loss=0.0122, lr=2.18e-05, step=2826] Training: 28%|██▊ | 2827/10000 [37:23<1:24:50, 1.41it/s, loss=0.0098, lr=2.18e-05, step=2827] Training: 28%|██▊ | 2828/10000 [37:24<1:29:30, 1.34it/s, loss=0.0098, lr=2.18e-05, step=2827] Training: 28%|██▊ | 2828/10000 [37:24<1:29:30, 1.34it/s, loss=0.0030, lr=2.18e-05, step=2828] Training: 28%|██▊ | 2829/10000 [37:24<1:26:59, 1.37it/s, loss=0.0030, lr=2.18e-05, step=2828] Training: 28%|██▊ | 2829/10000 [37:24<1:26:59, 1.37it/s, loss=0.0184, lr=2.18e-05, step=2829]19:21:57.583 [I] step=2830 loss=0.0058 smoothed_loss=0.0127 lr=2.18e-05 grad_norm=0.4606 step_time=0.6122s data_time=0.1587s it/s=1.297 eta_to_10000=5526.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0065 grad_action_out_proj_arms=0.0915 grad_arm_token_fuse=0.0353 grad_shared_expert=0.3276 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2830/10000 [37:25<1:28:52, 1.34it/s, loss=0.0184, lr=2.18e-05, step=2829] Training: 28%|██▊ | 2830/10000 [37:25<1:28:52, 1.34it/s, loss=0.0058, lr=2.18e-05, step=2830] Training: 28%|██▊ | 2831/10000 [37:26<1:22:06, 1.46it/s, loss=0.0058, lr=2.18e-05, step=2830] Training: 28%|██▊ | 2831/10000 [37:26<1:22:06, 1.46it/s, loss=0.0412, lr=2.18e-05, step=2831] Training: 28%|██▊ | 2832/10000 [37:26<1:17:14, 1.55it/s, loss=0.0412, lr=2.18e-05, step=2831] Training: 28%|██▊ | 2832/10000 [37:26<1:17:14, 1.55it/s, loss=0.0076, lr=2.18e-05, step=2832] Training: 28%|██▊ | 2833/10000 [37:27<1:16:18, 1.57it/s, loss=0.0076, lr=2.18e-05, step=2832] Training: 28%|██▊ | 2833/10000 [37:27<1:16:18, 1.57it/s, loss=0.0098, lr=2.18e-05, step=2833] Training: 28%|██▊ | 2834/10000 [37:27<1:11:24, 1.67it/s, loss=0.0098, lr=2.18e-05, step=2833] Training: 28%|██▊ | 2834/10000 [37:27<1:11:24, 1.67it/s, loss=0.0205, lr=2.18e-05, step=2834] Training: 28%|██▊ | 2835/10000 [37:28<1:11:02, 1.68it/s, loss=0.0205, lr=2.18e-05, step=2834] Training: 28%|██▊ | 2835/10000 [37:28<1:11:02, 1.68it/s, loss=0.0089, lr=2.18e-05, step=2835] Training: 28%|██▊ | 2836/10000 [37:29<1:19:56, 1.49it/s, loss=0.0089, lr=2.18e-05, step=2835] Training: 28%|██▊ | 2836/10000 [37:29<1:19:56, 1.49it/s, loss=0.0156, lr=2.18e-05, step=2836] Training: 28%|██▊ | 2837/10000 [37:29<1:13:40, 1.62it/s, loss=0.0156, lr=2.18e-05, step=2836] Training: 28%|██▊ | 2837/10000 [37:29<1:13:40, 1.62it/s, loss=0.0189, lr=2.18e-05, step=2837] Training: 28%|██▊ | 2838/10000 [37:30<1:09:56, 1.71it/s, loss=0.0189, lr=2.18e-05, step=2837] Training: 28%|██▊ | 2838/10000 [37:30<1:09:56, 1.71it/s, loss=0.0410, lr=2.18e-05, step=2838] Training: 28%|██▊ | 2839/10000 [37:30<1:08:14, 1.75it/s, loss=0.0410, lr=2.18e-05, step=2838] Training: 28%|██▊ | 2839/10000 [37:30<1:08:14, 1.75it/s, loss=0.0380, lr=2.18e-05, step=2839]19:22:03.389 [I] step=2840 loss=0.0141 smoothed_loss=0.0190 lr=2.18e-05 grad_norm=0.4922 step_time=0.5019s data_time=0.0787s it/s=1.723 eta_to_10000=4156.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0169 grad_action_out_proj_arms=0.1700 grad_arm_token_fuse=0.0903 grad_shared_expert=0.4407 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2840/10000 [37:31<1:09:09, 1.73it/s, loss=0.0380, lr=2.18e-05, step=2839] Training: 28%|██▊ | 2840/10000 [37:31<1:09:09, 1.73it/s, loss=0.0141, lr=2.18e-05, step=2840] Training: 28%|██▊ | 2841/10000 [37:32<1:06:52, 1.78it/s, loss=0.0141, lr=2.18e-05, step=2840] Training: 28%|██▊ | 2841/10000 [37:32<1:06:52, 1.78it/s, loss=0.0371, lr=2.18e-05, step=2841] Training: 28%|██▊ | 2842/10000 [37:32<1:04:17, 1.86it/s, loss=0.0371, lr=2.18e-05, step=2841] Training: 28%|██▊ | 2842/10000 [37:32<1:04:17, 1.86it/s, loss=0.0149, lr=2.18e-05, step=2842] Training: 28%|██▊ | 2843/10000 [37:33<1:12:43, 1.64it/s, loss=0.0149, lr=2.18e-05, step=2842] Training: 28%|██▊ | 2843/10000 [37:33<1:12:43, 1.64it/s, loss=0.0221, lr=2.18e-05, step=2843] Training: 28%|██▊ | 2844/10000 [37:33<1:11:22, 1.67it/s, loss=0.0221, lr=2.18e-05, step=2843] Training: 28%|██▊ | 2844/10000 [37:33<1:11:22, 1.67it/s, loss=0.0172, lr=2.18e-05, step=2844] Training: 28%|██▊ | 2845/10000 [37:34<1:16:11, 1.57it/s, loss=0.0172, lr=2.18e-05, step=2844] Training: 28%|██▊ | 2845/10000 [37:34<1:16:11, 1.57it/s, loss=0.0227, lr=2.18e-05, step=2845] Training: 28%|██▊ | 2846/10000 [37:35<1:12:53, 1.64it/s, loss=0.0227, lr=2.18e-05, step=2845] Training: 28%|██▊ | 2846/10000 [37:35<1:12:53, 1.64it/s, loss=0.0169, lr=2.18e-05, step=2846] Training: 28%|██▊ | 2847/10000 [37:35<1:10:19, 1.70it/s, loss=0.0169, lr=2.18e-05, step=2846] Training: 28%|██▊ | 2847/10000 [37:35<1:10:19, 1.70it/s, loss=0.0251, lr=2.18e-05, step=2847] Training: 28%|██▊ | 2848/10000 [37:36<1:09:48, 1.71it/s, loss=0.0251, lr=2.18e-05, step=2847] Training: 28%|██▊ | 2848/10000 [37:36<1:09:48, 1.71it/s, loss=0.0375, lr=2.18e-05, step=2848] Training: 28%|██▊ | 2849/10000 [37:36<1:10:54, 1.68it/s, loss=0.0375, lr=2.18e-05, step=2848] Training: 28%|██▊ | 2849/10000 [37:36<1:10:54, 1.68it/s, loss=0.0061, lr=2.18e-05, step=2849]19:22:10.208 [I] step=2850 loss=0.0155 smoothed_loss=0.0201 lr=2.18e-05 grad_norm=0.5022 step_time=0.5609s data_time=0.1210s it/s=1.467 eta_to_10000=4875.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.1380 grad_arm_token_fuse=0.0491 grad_shared_expert=0.5397 (18633:train_pytorch.py:850) + Training: 28%|██▊ | 2850/10000 [37:38<1:41:37, 1.17it/s, loss=0.0061, lr=2.18e-05, step=2849] Training: 28%|██▊ | 2850/10000 [37:38<1:41:37, 1.17it/s, loss=0.0155, lr=2.18e-05, step=2850] Training: 29%|██▊ | 2851/10000 [37:39<1:37:12, 1.23it/s, loss=0.0155, lr=2.18e-05, step=2850] Training: 29%|██▊ | 2851/10000 [37:39<1:37:12, 1.23it/s, loss=0.0616, lr=2.18e-05, step=2851] Training: 29%|██▊ | 2852/10000 [37:39<1:27:03, 1.37it/s, loss=0.0616, lr=2.18e-05, step=2851] Training: 29%|██▊ | 2852/10000 [37:39<1:27:03, 1.37it/s, loss=0.0107, lr=2.18e-05, step=2852] Training: 29%|██▊ | 2853/10000 [37:40<1:20:01, 1.49it/s, loss=0.0107, lr=2.18e-05, step=2852] Training: 29%|██▊ | 2853/10000 [37:40<1:20:01, 1.49it/s, loss=0.0069, lr=2.18e-05, step=2853] Training: 29%|██▊ | 2854/10000 [37:40<1:15:33, 1.58it/s, loss=0.0069, lr=2.18e-05, step=2853] Training: 29%|██▊ | 2854/10000 [37:40<1:15:33, 1.58it/s, loss=0.0218, lr=2.18e-05, step=2854] Training: 29%|██▊ | 2855/10000 [37:41<1:13:44, 1.62it/s, loss=0.0218, lr=2.18e-05, step=2854] Training: 29%|██▊ | 2855/10000 [37:41<1:13:44, 1.62it/s, loss=0.0180, lr=2.18e-05, step=2855] Training: 29%|██▊ | 2856/10000 [37:41<1:12:48, 1.64it/s, loss=0.0180, lr=2.18e-05, step=2855] Training: 29%|██▊ | 2856/10000 [37:41<1:12:48, 1.64it/s, loss=0.0123, lr=2.18e-05, step=2856] Training: 29%|██▊ | 2857/10000 [37:42<1:19:30, 1.50it/s, loss=0.0123, lr=2.18e-05, step=2856] Training: 29%|██▊ | 2857/10000 [37:42<1:19:30, 1.50it/s, loss=0.0279, lr=2.18e-05, step=2857] Training: 29%|██▊ | 2858/10000 [37:43<1:14:28, 1.60it/s, loss=0.0279, lr=2.18e-05, step=2857] Training: 29%|██▊ | 2858/10000 [37:43<1:14:28, 1.60it/s, loss=0.0270, lr=2.18e-05, step=2858] Training: 29%|██▊ | 2859/10000 [37:43<1:18:57, 1.51it/s, loss=0.0270, lr=2.18e-05, step=2858] Training: 29%|██▊ | 2859/10000 [37:43<1:18:57, 1.51it/s, loss=0.0268, lr=2.17e-05, step=2859]19:22:16.357 [I] step=2860 loss=0.0266 smoothed_loss=0.0225 lr=2.18e-05 grad_norm=0.4872 step_time=0.5351s data_time=0.0798s it/s=1.626 eta_to_10000=4390.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0140 grad_action_out_proj_arms=0.1237 grad_arm_token_fuse=0.0742 grad_shared_expert=0.3875 (18633:train_pytorch.py:850) + Training: 29%|██▊ | 2860/10000 [37:44<1:14:56, 1.59it/s, loss=0.0268, lr=2.17e-05, step=2859] Training: 29%|██▊ | 2860/10000 [37:44<1:14:56, 1.59it/s, loss=0.0266, lr=2.17e-05, step=2860] Training: 29%|██▊ | 2861/10000 [37:45<1:11:47, 1.66it/s, loss=0.0266, lr=2.17e-05, step=2860] Training: 29%|██▊ | 2861/10000 [37:45<1:11:47, 1.66it/s, loss=0.0207, lr=2.17e-05, step=2861] Training: 29%|██▊ | 2862/10000 [37:45<1:08:36, 1.73it/s, loss=0.0207, lr=2.17e-05, step=2861] Training: 29%|██▊ | 2862/10000 [37:45<1:08:36, 1.73it/s, loss=0.0094, lr=2.17e-05, step=2862] Training: 29%|██▊ | 2863/10000 [37:46<1:06:40, 1.78it/s, loss=0.0094, lr=2.17e-05, step=2862] Training: 29%|██▊ | 2863/10000 [37:46<1:06:40, 1.78it/s, loss=0.0222, lr=2.17e-05, step=2863] Training: 29%|██▊ | 2864/10000 [37:46<1:14:52, 1.59it/s, loss=0.0222, lr=2.17e-05, step=2863] Training: 29%|██▊ | 2864/10000 [37:46<1:14:52, 1.59it/s, loss=0.0159, lr=2.17e-05, step=2864] Training: 29%|██▊ | 2865/10000 [37:47<1:13:19, 1.62it/s, loss=0.0159, lr=2.17e-05, step=2864] Training: 29%|██▊ | 2865/10000 [37:47<1:13:19, 1.62it/s, loss=0.0363, lr=2.17e-05, step=2865] Training: 29%|██▊ | 2866/10000 [37:48<1:15:46, 1.57it/s, loss=0.0363, lr=2.17e-05, step=2865] Training: 29%|██▊ | 2866/10000 [37:48<1:15:46, 1.57it/s, loss=0.0206, lr=2.17e-05, step=2866] Training: 29%|██▊ | 2867/10000 [37:48<1:10:30, 1.69it/s, loss=0.0206, lr=2.17e-05, step=2866] Training: 29%|██▊ | 2867/10000 [37:48<1:10:30, 1.69it/s, loss=0.0259, lr=2.17e-05, step=2867] Training: 29%|██▊ | 2868/10000 [37:49<1:07:32, 1.76it/s, loss=0.0259, lr=2.17e-05, step=2867] Training: 29%|██▊ | 2868/10000 [37:49<1:07:32, 1.76it/s, loss=0.0137, lr=2.17e-05, step=2868] Training: 29%|██▊ | 2869/10000 [37:49<1:05:14, 1.82it/s, loss=0.0137, lr=2.17e-05, step=2868] Training: 29%|██▊ | 2869/10000 [37:49<1:05:14, 1.82it/s, loss=0.0152, lr=2.17e-05, step=2869]19:22:22.067 [I] step=2870 loss=0.0137 smoothed_loss=0.0202 lr=2.17e-05 grad_norm=0.5385 step_time=0.4984s data_time=0.0726s it/s=1.752 eta_to_10000=4070.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.2027 grad_arm_token_fuse=0.0678 grad_shared_expert=0.4955 (18633:train_pytorch.py:850) + Training: 29%|██▊ | 2870/10000 [37:50<1:05:44, 1.81it/s, loss=0.0152, lr=2.17e-05, step=2869] Training: 29%|██▊ | 2870/10000 [37:50<1:05:44, 1.81it/s, loss=0.0137, lr=2.17e-05, step=2870] Training: 29%|██▊ | 2871/10000 [37:51<1:17:48, 1.53it/s, loss=0.0137, lr=2.17e-05, step=2870] Training: 29%|██▊ | 2871/10000 [37:51<1:17:48, 1.53it/s, loss=0.0307, lr=2.17e-05, step=2871] Training: 29%|██▊ | 2872/10000 [37:51<1:12:25, 1.64it/s, loss=0.0307, lr=2.17e-05, step=2871] Training: 29%|██▊ | 2872/10000 [37:51<1:12:25, 1.64it/s, loss=0.0600, lr=2.17e-05, step=2872] Training: 29%|██▊ | 2873/10000 [37:52<1:15:35, 1.57it/s, loss=0.0600, lr=2.17e-05, step=2872] Training: 29%|██▊ | 2873/10000 [37:52<1:15:35, 1.57it/s, loss=0.0311, lr=2.17e-05, step=2873] Training: 29%|██▊ | 2874/10000 [37:52<1:10:12, 1.69it/s, loss=0.0311, lr=2.17e-05, step=2873] Training: 29%|██▊ | 2874/10000 [37:52<1:10:12, 1.69it/s, loss=0.0121, lr=2.17e-05, step=2874] Training: 29%|██▉ | 2875/10000 [37:53<1:08:45, 1.73it/s, loss=0.0121, lr=2.17e-05, step=2874] Training: 29%|██▉ | 2875/10000 [37:53<1:08:45, 1.73it/s, loss=0.0163, lr=2.17e-05, step=2875] Training: 29%|██▉ | 2876/10000 [37:54<1:28:44, 1.34it/s, loss=0.0163, lr=2.17e-05, step=2875] Training: 29%|██▉ | 2876/10000 [37:54<1:28:44, 1.34it/s, loss=0.0186, lr=2.17e-05, step=2876] Training: 29%|██▉ | 2877/10000 [37:55<1:21:37, 1.45it/s, loss=0.0186, lr=2.17e-05, step=2876] Training: 29%|██▉ | 2877/10000 [37:55<1:21:37, 1.45it/s, loss=0.0502, lr=2.17e-05, step=2877] Training: 29%|██▉ | 2878/10000 [37:55<1:30:22, 1.31it/s, loss=0.0502, lr=2.17e-05, step=2877] Training: 29%|██▉ | 2878/10000 [37:55<1:30:22, 1.31it/s, loss=0.0273, lr=2.17e-05, step=2878] Training: 29%|██▉ | 2879/10000 [37:56<1:22:37, 1.44it/s, loss=0.0273, lr=2.17e-05, step=2878] Training: 29%|██▉ | 2879/10000 [37:56<1:22:37, 1.44it/s, loss=0.0149, lr=2.17e-05, step=2879]19:22:28.926 [I] step=2880 loss=0.0116 smoothed_loss=0.0235 lr=2.17e-05 grad_norm=0.4965 step_time=0.5923s data_time=0.0937s it/s=1.458 eta_to_10000=4883.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0118 grad_action_out_proj_arms=0.2041 grad_arm_token_fuse=0.0580 grad_shared_expert=0.6080 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2880/10000 [37:57<1:17:51, 1.52it/s, loss=0.0149, lr=2.17e-05, step=2879] Training: 29%|██▉ | 2880/10000 [37:57<1:17:51, 1.52it/s, loss=0.0116, lr=2.17e-05, step=2880] Training: 29%|██▉ | 2881/10000 [37:57<1:18:38, 1.51it/s, loss=0.0116, lr=2.17e-05, step=2880] Training: 29%|██▉ | 2881/10000 [37:57<1:18:38, 1.51it/s, loss=0.0433, lr=2.17e-05, step=2881] Training: 29%|██▉ | 2882/10000 [37:58<1:17:34, 1.53it/s, loss=0.0433, lr=2.17e-05, step=2881] Training: 29%|██▉ | 2882/10000 [37:58<1:17:34, 1.53it/s, loss=0.0180, lr=2.17e-05, step=2882] Training: 29%|██▉ | 2883/10000 [37:59<1:15:50, 1.56it/s, loss=0.0180, lr=2.17e-05, step=2882] Training: 29%|██▉ | 2883/10000 [37:59<1:15:50, 1.56it/s, loss=0.0280, lr=2.17e-05, step=2883] Training: 29%|██▉ | 2884/10000 [37:59<1:12:39, 1.63it/s, loss=0.0280, lr=2.17e-05, step=2883] Training: 29%|██▉ | 2884/10000 [37:59<1:12:39, 1.63it/s, loss=0.0201, lr=2.17e-05, step=2884] Training: 29%|██▉ | 2885/10000 [38:00<1:09:40, 1.70it/s, loss=0.0201, lr=2.17e-05, step=2884] Training: 29%|██▉ | 2885/10000 [38:00<1:09:40, 1.70it/s, loss=0.0068, lr=2.17e-05, step=2885] Training: 29%|██▉ | 2886/10000 [38:00<1:18:12, 1.52it/s, loss=0.0068, lr=2.17e-05, step=2885] Training: 29%|██▉ | 2886/10000 [38:00<1:18:12, 1.52it/s, loss=0.0098, lr=2.17e-05, step=2886] Training: 29%|██▉ | 2887/10000 [38:01<1:14:20, 1.59it/s, loss=0.0098, lr=2.17e-05, step=2886] Training: 29%|██▉ | 2887/10000 [38:01<1:14:20, 1.59it/s, loss=0.0187, lr=2.17e-05, step=2887] Training: 29%|██▉ | 2888/10000 [38:02<1:31:25, 1.30it/s, loss=0.0187, lr=2.17e-05, step=2887] Training: 29%|██▉ | 2888/10000 [38:02<1:31:25, 1.30it/s, loss=0.0186, lr=2.17e-05, step=2888] Training: 29%|██▉ | 2889/10000 [38:03<1:23:56, 1.41it/s, loss=0.0186, lr=2.17e-05, step=2888] Training: 29%|██▉ | 2889/10000 [38:03<1:23:56, 1.41it/s, loss=0.0072, lr=2.17e-05, step=2889]19:22:35.602 [I] step=2890 loss=0.0142 smoothed_loss=0.0190 lr=2.17e-05 grad_norm=0.5482 step_time=0.5458s data_time=0.1217s it/s=1.498 eta_to_10000=4746.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0077 grad_action_out_proj_arms=0.0950 grad_arm_token_fuse=0.0351 grad_shared_expert=0.3394 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2890/10000 [38:03<1:21:11, 1.46it/s, loss=0.0072, lr=2.17e-05, step=2889] Training: 29%|██▉ | 2890/10000 [38:03<1:21:11, 1.46it/s, loss=0.0142, lr=2.17e-05, step=2890] Training: 29%|██▉ | 2891/10000 [38:04<1:13:55, 1.60it/s, loss=0.0142, lr=2.17e-05, step=2890] Training: 29%|██▉ | 2891/10000 [38:04<1:13:55, 1.60it/s, loss=0.0171, lr=2.17e-05, step=2891] Training: 29%|██▉ | 2892/10000 [38:04<1:10:06, 1.69it/s, loss=0.0171, lr=2.17e-05, step=2891] Training: 29%|██▉ | 2892/10000 [38:04<1:10:06, 1.69it/s, loss=0.0140, lr=2.17e-05, step=2892] Training: 29%|██▉ | 2893/10000 [38:05<1:16:41, 1.54it/s, loss=0.0140, lr=2.17e-05, step=2892] Training: 29%|██▉ | 2893/10000 [38:05<1:16:41, 1.54it/s, loss=0.0050, lr=2.17e-05, step=2893] Training: 29%|██▉ | 2894/10000 [38:06<1:11:56, 1.65it/s, loss=0.0050, lr=2.17e-05, step=2893] Training: 29%|██▉ | 2894/10000 [38:06<1:11:56, 1.65it/s, loss=0.0070, lr=2.17e-05, step=2894] Training: 29%|██▉ | 2895/10000 [38:06<1:08:36, 1.73it/s, loss=0.0070, lr=2.17e-05, step=2894] Training: 29%|██▉ | 2895/10000 [38:06<1:08:36, 1.73it/s, loss=0.0428, lr=2.17e-05, step=2895] Training: 29%|██▉ | 2896/10000 [38:07<1:12:36, 1.63it/s, loss=0.0428, lr=2.17e-05, step=2895] Training: 29%|██▉ | 2896/10000 [38:07<1:12:36, 1.63it/s, loss=0.0423, lr=2.17e-05, step=2896] Training: 29%|██▉ | 2897/10000 [38:07<1:09:30, 1.70it/s, loss=0.0423, lr=2.17e-05, step=2896] Training: 29%|██▉ | 2897/10000 [38:07<1:09:30, 1.70it/s, loss=0.0169, lr=2.16e-05, step=2897] Training: 29%|██▉ | 2898/10000 [38:08<1:06:21, 1.78it/s, loss=0.0169, lr=2.16e-05, step=2897] Training: 29%|██▉ | 2898/10000 [38:08<1:06:21, 1.78it/s, loss=0.0395, lr=2.16e-05, step=2898] Training: 29%|██▉ | 2899/10000 [38:08<1:06:08, 1.79it/s, loss=0.0395, lr=2.16e-05, step=2898] Training: 29%|██▉ | 2899/10000 [38:08<1:06:08, 1.79it/s, loss=0.0197, lr=2.16e-05, step=2899]19:22:41.522 [I] step=2900 loss=0.0158 smoothed_loss=0.0216 lr=2.17e-05 grad_norm=0.5148 step_time=0.5183s data_time=0.0737s it/s=1.689 eta_to_10000=4202.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0933 grad_arm_token_fuse=0.0377 grad_shared_expert=0.3448 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2900/10000 [38:09<1:16:17, 1.55it/s, loss=0.0197, lr=2.16e-05, step=2899] Training: 29%|██▉ | 2900/10000 [38:09<1:16:17, 1.55it/s, loss=0.0158, lr=2.16e-05, step=2900] Training: 29%|██▉ | 2901/10000 [38:10<1:11:18, 1.66it/s, loss=0.0158, lr=2.16e-05, step=2900] Training: 29%|██▉ | 2901/10000 [38:10<1:11:18, 1.66it/s, loss=0.0058, lr=2.16e-05, step=2901] Training: 29%|██▉ | 2902/10000 [38:10<1:07:44, 1.75it/s, loss=0.0058, lr=2.16e-05, step=2901] Training: 29%|██▉ | 2902/10000 [38:10<1:07:44, 1.75it/s, loss=0.0236, lr=2.16e-05, step=2902] Training: 29%|██▉ | 2903/10000 [38:11<1:14:26, 1.59it/s, loss=0.0236, lr=2.16e-05, step=2902] Training: 29%|██▉ | 2903/10000 [38:11<1:14:26, 1.59it/s, loss=0.0184, lr=2.16e-05, step=2903] Training: 29%|██▉ | 2904/10000 [38:11<1:09:56, 1.69it/s, loss=0.0184, lr=2.16e-05, step=2903] Training: 29%|██▉ | 2904/10000 [38:11<1:09:56, 1.69it/s, loss=0.0227, lr=2.16e-05, step=2904] Training: 29%|██▉ | 2905/10000 [38:12<1:06:54, 1.77it/s, loss=0.0227, lr=2.16e-05, step=2904] Training: 29%|██▉ | 2905/10000 [38:12<1:06:54, 1.77it/s, loss=0.0307, lr=2.16e-05, step=2905] Training: 29%|██▉ | 2906/10000 [38:13<1:07:44, 1.75it/s, loss=0.0307, lr=2.16e-05, step=2905] Training: 29%|██▉ | 2906/10000 [38:13<1:07:44, 1.75it/s, loss=0.0632, lr=2.16e-05, step=2906] Training: 29%|██▉ | 2907/10000 [38:13<1:15:35, 1.56it/s, loss=0.0632, lr=2.16e-05, step=2906] Training: 29%|██▉ | 2907/10000 [38:13<1:15:35, 1.56it/s, loss=0.0192, lr=2.16e-05, step=2907] Training: 29%|██▉ | 2908/10000 [38:14<1:11:45, 1.65it/s, loss=0.0192, lr=2.16e-05, step=2907] Training: 29%|██▉ | 2908/10000 [38:14<1:11:45, 1.65it/s, loss=0.0103, lr=2.16e-05, step=2908] Training: 29%|██▉ | 2909/10000 [38:15<1:13:02, 1.62it/s, loss=0.0103, lr=2.16e-05, step=2908] Training: 29%|██▉ | 2909/10000 [38:15<1:13:02, 1.62it/s, loss=0.0327, lr=2.16e-05, step=2909]19:22:47.407 [I] step=2910 loss=0.0281 smoothed_loss=0.0248 lr=2.16e-05 grad_norm=0.6104 step_time=0.5084s data_time=0.0801s it/s=1.700 eta_to_10000=4171.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0174 grad_action_out_proj_arms=0.2403 grad_arm_token_fuse=0.0966 grad_shared_expert=0.5880 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2910/10000 [38:15<1:10:32, 1.68it/s, loss=0.0327, lr=2.16e-05, step=2909] Training: 29%|██▉ | 2910/10000 [38:15<1:10:32, 1.68it/s, loss=0.0281, lr=2.16e-05, step=2910] Training: 29%|██▉ | 2911/10000 [38:16<1:14:57, 1.58it/s, loss=0.0281, lr=2.16e-05, step=2910] Training: 29%|██▉ | 2911/10000 [38:16<1:14:57, 1.58it/s, loss=0.0094, lr=2.16e-05, step=2911] Training: 29%|██▉ | 2912/10000 [38:17<1:32:30, 1.28it/s, loss=0.0094, lr=2.16e-05, step=2911] Training: 29%|██▉ | 2912/10000 [38:17<1:32:30, 1.28it/s, loss=0.0087, lr=2.16e-05, step=2912] Training: 29%|██▉ | 2913/10000 [38:18<1:27:01, 1.36it/s, loss=0.0087, lr=2.16e-05, step=2912] Training: 29%|██▉ | 2913/10000 [38:18<1:27:01, 1.36it/s, loss=0.0153, lr=2.16e-05, step=2913] Training: 29%|██▉ | 2914/10000 [38:18<1:28:49, 1.33it/s, loss=0.0153, lr=2.16e-05, step=2913] Training: 29%|██▉ | 2914/10000 [38:18<1:28:49, 1.33it/s, loss=0.0099, lr=2.16e-05, step=2914] Training: 29%|██▉ | 2915/10000 [38:19<1:20:32, 1.47it/s, loss=0.0099, lr=2.16e-05, step=2914] Training: 29%|██▉ | 2915/10000 [38:19<1:20:32, 1.47it/s, loss=0.0291, lr=2.16e-05, step=2915] Training: 29%|██▉ | 2916/10000 [38:19<1:15:14, 1.57it/s, loss=0.0291, lr=2.16e-05, step=2915] Training: 29%|██▉ | 2916/10000 [38:19<1:15:14, 1.57it/s, loss=0.0445, lr=2.16e-05, step=2916] Training: 29%|██▉ | 2917/10000 [38:20<1:11:16, 1.66it/s, loss=0.0445, lr=2.16e-05, step=2916] Training: 29%|██▉ | 2917/10000 [38:20<1:11:16, 1.66it/s, loss=0.0100, lr=2.16e-05, step=2917] Training: 29%|██▉ | 2918/10000 [38:21<1:15:04, 1.57it/s, loss=0.0100, lr=2.16e-05, step=2917] Training: 29%|██▉ | 2918/10000 [38:21<1:15:04, 1.57it/s, loss=0.0212, lr=2.16e-05, step=2918] Training: 29%|██▉ | 2919/10000 [38:21<1:11:05, 1.66it/s, loss=0.0212, lr=2.16e-05, step=2918] Training: 29%|██▉ | 2919/10000 [38:21<1:11:05, 1.66it/s, loss=0.0244, lr=2.16e-05, step=2919]19:22:54.056 [I] step=2920 loss=0.0071 smoothed_loss=0.0206 lr=2.16e-05 grad_norm=0.5159 step_time=0.5332s data_time=0.1316s it/s=1.504 eta_to_10000=4706.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0250 grad_action_out_proj_arms=0.1814 grad_arm_token_fuse=0.1286 grad_shared_expert=0.4983 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2920/10000 [38:22<1:09:52, 1.69it/s, loss=0.0244, lr=2.16e-05, step=2919] Training: 29%|██▉ | 2920/10000 [38:22<1:09:52, 1.69it/s, loss=0.0071, lr=2.16e-05, step=2920] Training: 29%|██▉ | 2921/10000 [38:22<1:15:13, 1.57it/s, loss=0.0071, lr=2.16e-05, step=2920] Training: 29%|██▉ | 2921/10000 [38:22<1:15:13, 1.57it/s, loss=0.0138, lr=2.16e-05, step=2921] Training: 29%|██▉ | 2922/10000 [38:23<1:13:24, 1.61it/s, loss=0.0138, lr=2.16e-05, step=2921] Training: 29%|██▉ | 2922/10000 [38:23<1:13:24, 1.61it/s, loss=0.0129, lr=2.16e-05, step=2922] Training: 29%|██▉ | 2923/10000 [38:24<1:12:35, 1.62it/s, loss=0.0129, lr=2.16e-05, step=2922] Training: 29%|██▉ | 2923/10000 [38:24<1:12:35, 1.62it/s, loss=0.0197, lr=2.16e-05, step=2923] Training: 29%|██▉ | 2924/10000 [38:24<1:11:41, 1.64it/s, loss=0.0197, lr=2.16e-05, step=2923] Training: 29%|██▉ | 2924/10000 [38:24<1:11:41, 1.64it/s, loss=0.0161, lr=2.16e-05, step=2924] Training: 29%|██▉ | 2925/10000 [38:25<1:16:43, 1.54it/s, loss=0.0161, lr=2.16e-05, step=2924] Training: 29%|██▉ | 2925/10000 [38:25<1:16:43, 1.54it/s, loss=0.0402, lr=2.16e-05, step=2925] Training: 29%|██▉ | 2926/10000 [38:26<1:11:31, 1.65it/s, loss=0.0402, lr=2.16e-05, step=2925] Training: 29%|██▉ | 2926/10000 [38:26<1:11:31, 1.65it/s, loss=0.0403, lr=2.16e-05, step=2926] Training: 29%|██▉ | 2927/10000 [38:26<1:09:00, 1.71it/s, loss=0.0403, lr=2.16e-05, step=2926] Training: 29%|██▉ | 2927/10000 [38:26<1:09:00, 1.71it/s, loss=0.0281, lr=2.16e-05, step=2927] Training: 29%|██▉ | 2928/10000 [38:27<1:17:01, 1.53it/s, loss=0.0281, lr=2.16e-05, step=2927] Training: 29%|██▉ | 2928/10000 [38:27<1:17:01, 1.53it/s, loss=0.0176, lr=2.16e-05, step=2928] Training: 29%|██▉ | 2929/10000 [38:27<1:11:42, 1.64it/s, loss=0.0176, lr=2.16e-05, step=2928] Training: 29%|██▉ | 2929/10000 [38:27<1:11:42, 1.64it/s, loss=0.0918, lr=2.16e-05, step=2929]19:23:00.272 [I] step=2930 loss=0.0223 smoothed_loss=0.0291 lr=2.16e-05 grad_norm=0.5616 step_time=0.5402s data_time=0.0815s it/s=1.609 eta_to_10000=4394.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0773 grad_arm_token_fuse=0.0360 grad_shared_expert=0.2959 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2930/10000 [38:28<1:11:03, 1.66it/s, loss=0.0918, lr=2.16e-05, step=2929] Training: 29%|██▉ | 2930/10000 [38:28<1:11:03, 1.66it/s, loss=0.0223, lr=2.16e-05, step=2930] Training: 29%|██▉ | 2931/10000 [38:28<1:07:40, 1.74it/s, loss=0.0223, lr=2.16e-05, step=2930] Training: 29%|██▉ | 2931/10000 [38:28<1:07:40, 1.74it/s, loss=0.0135, lr=2.16e-05, step=2931] Training: 29%|██▉ | 2932/10000 [38:29<1:14:42, 1.58it/s, loss=0.0135, lr=2.16e-05, step=2931] Training: 29%|██▉ | 2932/10000 [38:29<1:14:42, 1.58it/s, loss=0.0333, lr=2.16e-05, step=2932] Training: 29%|██▉ | 2933/10000 [38:30<1:10:14, 1.68it/s, loss=0.0333, lr=2.16e-05, step=2932] Training: 29%|██▉ | 2933/10000 [38:30<1:10:14, 1.68it/s, loss=0.0385, lr=2.16e-05, step=2933] Training: 29%|██▉ | 2934/10000 [38:30<1:09:07, 1.70it/s, loss=0.0385, lr=2.16e-05, step=2933] Training: 29%|██▉ | 2934/10000 [38:30<1:09:07, 1.70it/s, loss=0.0191, lr=2.16e-05, step=2934] Training: 29%|██▉ | 2935/10000 [38:31<1:16:03, 1.55it/s, loss=0.0191, lr=2.16e-05, step=2934] Training: 29%|██▉ | 2935/10000 [38:31<1:16:03, 1.55it/s, loss=0.0144, lr=2.15e-05, step=2935] Training: 29%|██▉ | 2936/10000 [38:32<1:25:39, 1.37it/s, loss=0.0144, lr=2.15e-05, step=2935] Training: 29%|██▉ | 2936/10000 [38:32<1:25:39, 1.37it/s, loss=0.0265, lr=2.15e-05, step=2936] Training: 29%|██▉ | 2937/10000 [38:33<1:17:57, 1.51it/s, loss=0.0265, lr=2.15e-05, step=2936] Training: 29%|██▉ | 2937/10000 [38:33<1:17:57, 1.51it/s, loss=0.0065, lr=2.15e-05, step=2937] Training: 29%|██▉ | 2938/10000 [38:33<1:13:39, 1.60it/s, loss=0.0065, lr=2.15e-05, step=2937] Training: 29%|██▉ | 2938/10000 [38:33<1:13:39, 1.60it/s, loss=0.0262, lr=2.15e-05, step=2938] Training: 29%|██▉ | 2939/10000 [38:34<1:09:10, 1.70it/s, loss=0.0262, lr=2.15e-05, step=2938] Training: 29%|██▉ | 2939/10000 [38:34<1:09:10, 1.70it/s, loss=0.0305, lr=2.15e-05, step=2939]19:23:06.651 [I] step=2940 loss=0.0171 smoothed_loss=0.0246 lr=2.15e-05 grad_norm=0.4731 step_time=0.5398s data_time=0.0981s it/s=1.568 eta_to_10000=4502.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1142 grad_arm_token_fuse=0.0539 grad_shared_expert=0.5367 (18633:train_pytorch.py:850) + Training: 29%|██▉ | 2940/10000 [38:34<1:15:41, 1.55it/s, loss=0.0305, lr=2.15e-05, step=2939] Training: 29%|██▉ | 2940/10000 [38:34<1:15:41, 1.55it/s, loss=0.0171, lr=2.15e-05, step=2940] Training: 29%|██▉ | 2941/10000 [38:35<1:11:02, 1.66it/s, loss=0.0171, lr=2.15e-05, step=2940] Training: 29%|██▉ | 2941/10000 [38:35<1:11:02, 1.66it/s, loss=0.0418, lr=2.15e-05, step=2941] Training: 29%|██▉ | 2942/10000 [38:35<1:08:03, 1.73it/s, loss=0.0418, lr=2.15e-05, step=2941] Training: 29%|██▉ | 2942/10000 [38:35<1:08:03, 1.73it/s, loss=0.0121, lr=2.15e-05, step=2942] Training: 29%|██▉ | 2943/10000 [38:36<1:15:56, 1.55it/s, loss=0.0121, lr=2.15e-05, step=2942] Training: 29%|██▉ | 2943/10000 [38:36<1:15:56, 1.55it/s, loss=0.0320, lr=2.15e-05, step=2943] Training: 29%|██▉ | 2944/10000 [38:37<1:13:23, 1.60it/s, loss=0.0320, lr=2.15e-05, step=2943] Training: 29%|██▉ | 2944/10000 [38:37<1:13:23, 1.60it/s, loss=0.0432, lr=2.15e-05, step=2944] Training: 29%|██▉ | 2945/10000 [38:37<1:12:12, 1.63it/s, loss=0.0432, lr=2.15e-05, step=2944] Training: 29%|██▉ | 2945/10000 [38:37<1:12:12, 1.63it/s, loss=0.0149, lr=2.15e-05, step=2945] Training: 29%|██▉ | 2946/10000 [38:38<1:07:59, 1.73it/s, loss=0.0149, lr=2.15e-05, step=2945] Training: 29%|██▉ | 2946/10000 [38:38<1:07:59, 1.73it/s, loss=0.0084, lr=2.15e-05, step=2946] Training: 29%|██▉ | 2947/10000 [38:38<1:11:26, 1.65it/s, loss=0.0084, lr=2.15e-05, step=2946] Training: 29%|██▉ | 2947/10000 [38:38<1:11:26, 1.65it/s, loss=0.0173, lr=2.15e-05, step=2947] Training: 29%|██▉ | 2948/10000 [38:39<1:08:17, 1.72it/s, loss=0.0173, lr=2.15e-05, step=2947] Training: 29%|██▉ | 2948/10000 [38:39<1:08:17, 1.72it/s, loss=0.0105, lr=2.15e-05, step=2948] Training: 29%|██▉ | 2949/10000 [38:40<1:05:59, 1.78it/s, loss=0.0105, lr=2.15e-05, step=2948] Training: 29%|██▉ | 2949/10000 [38:40<1:05:59, 1.78it/s, loss=0.0095, lr=2.15e-05, step=2949]19:23:12.697 [I] step=2950 loss=0.0137 smoothed_loss=0.0203 lr=2.15e-05 grad_norm=0.5309 step_time=0.5264s data_time=0.0782s it/s=1.654 eta_to_10000=4262.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0237 grad_action_out_proj_arms=0.1629 grad_arm_token_fuse=0.1215 grad_shared_expert=0.4928 (18633:train_pytorch.py:850) + Training: 30%|██▉ | 2950/10000 [38:40<1:15:52, 1.55it/s, loss=0.0095, lr=2.15e-05, step=2949] Training: 30%|██▉ | 2950/10000 [38:40<1:15:52, 1.55it/s, loss=0.0137, lr=2.15e-05, step=2950] Training: 30%|██▉ | 2951/10000 [38:41<1:10:44, 1.66it/s, loss=0.0137, lr=2.15e-05, step=2950] Training: 30%|██▉ | 2951/10000 [38:41<1:10:44, 1.66it/s, loss=0.0187, lr=2.15e-05, step=2951] Training: 30%|██▉ | 2952/10000 [38:41<1:07:18, 1.75it/s, loss=0.0187, lr=2.15e-05, step=2951] Training: 30%|██▉ | 2952/10000 [38:41<1:07:18, 1.75it/s, loss=0.0058, lr=2.15e-05, step=2952] Training: 30%|██▉ | 2953/10000 [38:42<1:05:37, 1.79it/s, loss=0.0058, lr=2.15e-05, step=2952] Training: 30%|██▉ | 2953/10000 [38:42<1:05:37, 1.79it/s, loss=0.0188, lr=2.15e-05, step=2953] Training: 30%|██▉ | 2954/10000 [38:42<1:03:26, 1.85it/s, loss=0.0188, lr=2.15e-05, step=2953] Training: 30%|██▉ | 2954/10000 [38:42<1:03:26, 1.85it/s, loss=0.0139, lr=2.15e-05, step=2954] Training: 30%|██▉ | 2955/10000 [38:43<1:07:56, 1.73it/s, loss=0.0139, lr=2.15e-05, step=2954] Training: 30%|██▉ | 2955/10000 [38:43<1:07:56, 1.73it/s, loss=0.0618, lr=2.15e-05, step=2955] Training: 30%|██▉ | 2956/10000 [38:44<1:05:58, 1.78it/s, loss=0.0618, lr=2.15e-05, step=2955] Training: 30%|██▉ | 2956/10000 [38:44<1:05:58, 1.78it/s, loss=0.0122, lr=2.15e-05, step=2956] Training: 30%|██▉ | 2957/10000 [38:44<1:13:57, 1.59it/s, loss=0.0122, lr=2.15e-05, step=2956] Training: 30%|██▉ | 2957/10000 [38:44<1:13:57, 1.59it/s, loss=0.0078, lr=2.15e-05, step=2957] Training: 30%|██▉ | 2958/10000 [38:45<1:10:57, 1.65it/s, loss=0.0078, lr=2.15e-05, step=2957] Training: 30%|██▉ | 2958/10000 [38:45<1:10:57, 1.65it/s, loss=0.0103, lr=2.15e-05, step=2958] Training: 30%|██▉ | 2959/10000 [38:45<1:09:38, 1.69it/s, loss=0.0103, lr=2.15e-05, step=2958] Training: 30%|██▉ | 2959/10000 [38:45<1:09:38, 1.69it/s, loss=0.0072, lr=2.15e-05, step=2959]19:23:18.373 [I] step=2960 loss=0.0066 smoothed_loss=0.0169 lr=2.15e-05 grad_norm=0.4517 step_time=0.4981s data_time=0.0694s it/s=1.762 eta_to_10000=3994.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0245 grad_action_out_proj_arms=0.2016 grad_arm_token_fuse=0.1343 grad_shared_expert=0.6550 (18633:train_pytorch.py:850) + Training: 30%|██▉ | 2960/10000 [38:46<1:08:15, 1.72it/s, loss=0.0072, lr=2.15e-05, step=2959] Training: 30%|██▉ | 2960/10000 [38:46<1:08:15, 1.72it/s, loss=0.0066, lr=2.15e-05, step=2960] Training: 30%|██▉ | 2961/10000 [38:47<1:04:45, 1.81it/s, loss=0.0066, lr=2.15e-05, step=2960] Training: 30%|██▉ | 2961/10000 [38:47<1:04:45, 1.81it/s, loss=0.0191, lr=2.15e-05, step=2961] Training: 30%|██▉ | 2962/10000 [38:47<1:09:09, 1.70it/s, loss=0.0191, lr=2.15e-05, step=2961] Training: 30%|██▉ | 2962/10000 [38:47<1:09:09, 1.70it/s, loss=0.0159, lr=2.15e-05, step=2962] Training: 30%|██▉ | 2963/10000 [38:48<1:05:33, 1.79it/s, loss=0.0159, lr=2.15e-05, step=2962] Training: 30%|██▉ | 2963/10000 [38:48<1:05:33, 1.79it/s, loss=0.0151, lr=2.15e-05, step=2963] Training: 30%|██▉ | 2964/10000 [38:49<1:19:02, 1.48it/s, loss=0.0151, lr=2.15e-05, step=2963] Training: 30%|██▉ | 2964/10000 [38:49<1:19:02, 1.48it/s, loss=0.0216, lr=2.15e-05, step=2964] Training: 30%|██▉ | 2965/10000 [38:49<1:12:46, 1.61it/s, loss=0.0216, lr=2.15e-05, step=2964] Training: 30%|██▉ | 2965/10000 [38:49<1:12:46, 1.61it/s, loss=0.0065, lr=2.15e-05, step=2965] Training: 30%|██▉ | 2966/10000 [38:50<1:08:35, 1.71it/s, loss=0.0065, lr=2.15e-05, step=2965] Training: 30%|██▉ | 2966/10000 [38:50<1:08:35, 1.71it/s, loss=0.0460, lr=2.15e-05, step=2966] Training: 30%|██▉ | 2967/10000 [38:50<1:04:42, 1.81it/s, loss=0.0460, lr=2.15e-05, step=2966] Training: 30%|██▉ | 2967/10000 [38:50<1:04:42, 1.81it/s, loss=0.0195, lr=2.15e-05, step=2967] Training: 30%|██▉ | 2968/10000 [38:51<1:02:48, 1.87it/s, loss=0.0195, lr=2.15e-05, step=2967] Training: 30%|██▉ | 2968/10000 [38:51<1:02:48, 1.87it/s, loss=0.0127, lr=2.15e-05, step=2968] Training: 30%|██▉ | 2969/10000 [38:51<1:01:15, 1.91it/s, loss=0.0127, lr=2.15e-05, step=2968] Training: 30%|██▉ | 2969/10000 [38:51<1:01:15, 1.91it/s, loss=0.0111, lr=2.15e-05, step=2969]19:23:24.126 [I] step=2970 loss=0.0162 smoothed_loss=0.0176 lr=2.15e-05 grad_norm=0.4830 step_time=0.5022s data_time=0.0731s it/s=1.738 eta_to_10000=4044.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.1203 grad_arm_token_fuse=0.0411 grad_shared_expert=0.3011 (18633:train_pytorch.py:850) + Training: 30%|██▉ | 2970/10000 [38:52<1:07:30, 1.74it/s, loss=0.0111, lr=2.15e-05, step=2969] Training: 30%|██▉ | 2970/10000 [38:52<1:07:30, 1.74it/s, loss=0.0162, lr=2.15e-05, step=2970] Training: 30%|██▉ | 2971/10000 [38:53<1:12:32, 1.61it/s, loss=0.0162, lr=2.15e-05, step=2970] Training: 30%|██▉ | 2971/10000 [38:53<1:12:32, 1.61it/s, loss=0.0082, lr=2.15e-05, step=2971] Training: 30%|██▉ | 2972/10000 [38:53<1:08:49, 1.70it/s, loss=0.0082, lr=2.15e-05, step=2971] Training: 30%|██▉ | 2972/10000 [38:53<1:08:49, 1.70it/s, loss=0.0082, lr=2.14e-05, step=2972] Training: 30%|██▉ | 2973/10000 [38:54<1:07:34, 1.73it/s, loss=0.0082, lr=2.14e-05, step=2972] Training: 30%|██▉ | 2973/10000 [38:54<1:07:34, 1.73it/s, loss=0.0213, lr=2.14e-05, step=2973] Training: 30%|██▉ | 2974/10000 [38:54<1:04:24, 1.82it/s, loss=0.0213, lr=2.14e-05, step=2973] Training: 30%|██▉ | 2974/10000 [38:54<1:04:24, 1.82it/s, loss=0.0210, lr=2.14e-05, step=2974] Training: 30%|██▉ | 2975/10000 [38:55<1:03:42, 1.84it/s, loss=0.0210, lr=2.14e-05, step=2974] Training: 30%|██▉ | 2975/10000 [38:55<1:03:42, 1.84it/s, loss=0.0091, lr=2.14e-05, step=2975] Training: 30%|██▉ | 2976/10000 [38:55<1:03:27, 1.84it/s, loss=0.0091, lr=2.14e-05, step=2975] Training: 30%|██▉ | 2976/10000 [38:55<1:03:27, 1.84it/s, loss=0.0141, lr=2.14e-05, step=2976] Training: 30%|██▉ | 2977/10000 [38:56<1:08:32, 1.71it/s, loss=0.0141, lr=2.14e-05, step=2976] Training: 30%|██▉ | 2977/10000 [38:56<1:08:32, 1.71it/s, loss=0.0185, lr=2.14e-05, step=2977] Training: 30%|██▉ | 2978/10000 [38:57<1:14:01, 1.58it/s, loss=0.0185, lr=2.14e-05, step=2977] Training: 30%|██▉ | 2978/10000 [38:57<1:14:01, 1.58it/s, loss=0.0070, lr=2.14e-05, step=2978] Training: 30%|██▉ | 2979/10000 [38:57<1:12:30, 1.61it/s, loss=0.0070, lr=2.14e-05, step=2978] Training: 30%|██▉ | 2979/10000 [38:57<1:12:30, 1.61it/s, loss=0.0102, lr=2.14e-05, step=2979]19:23:30.080 [I] step=2980 loss=0.0236 smoothed_loss=0.0156 lr=2.14e-05 grad_norm=0.4386 step_time=0.5170s data_time=0.0785s it/s=1.680 eta_to_10000=4179.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0103 grad_action_out_proj_arms=0.1275 grad_arm_token_fuse=0.0552 grad_shared_expert=0.4303 (18633:train_pytorch.py:850) + Training: 30%|██▉ | 2980/10000 [38:58<1:11:37, 1.63it/s, loss=0.0102, lr=2.14e-05, step=2979] Training: 30%|██▉ | 2980/10000 [38:58<1:11:37, 1.63it/s, loss=0.0236, lr=2.14e-05, step=2980] Training: 30%|██▉ | 2981/10000 [38:58<1:07:29, 1.73it/s, loss=0.0236, lr=2.14e-05, step=2980] Training: 30%|██▉ | 2981/10000 [38:58<1:07:29, 1.73it/s, loss=0.0107, lr=2.14e-05, step=2981] Training: 30%|██▉ | 2982/10000 [38:59<1:04:34, 1.81it/s, loss=0.0107, lr=2.14e-05, step=2981] Training: 30%|██▉ | 2982/10000 [38:59<1:04:34, 1.81it/s, loss=0.0074, lr=2.14e-05, step=2982] Training: 30%|██▉ | 2983/10000 [39:00<1:18:30, 1.49it/s, loss=0.0074, lr=2.14e-05, step=2982] Training: 30%|██▉ | 2983/10000 [39:00<1:18:30, 1.49it/s, loss=0.0313, lr=2.14e-05, step=2983] Training: 30%|██▉ | 2984/10000 [39:00<1:13:13, 1.60it/s, loss=0.0313, lr=2.14e-05, step=2983] Training: 30%|██▉ | 2984/10000 [39:00<1:13:13, 1.60it/s, loss=0.0273, lr=2.14e-05, step=2984] Training: 30%|██▉ | 2985/10000 [39:01<1:14:00, 1.58it/s, loss=0.0273, lr=2.14e-05, step=2984] Training: 30%|██▉ | 2985/10000 [39:01<1:14:00, 1.58it/s, loss=0.0393, lr=2.14e-05, step=2985] Training: 30%|██▉ | 2986/10000 [39:02<1:18:35, 1.49it/s, loss=0.0393, lr=2.14e-05, step=2985] Training: 30%|██▉ | 2986/10000 [39:02<1:18:35, 1.49it/s, loss=0.0383, lr=2.14e-05, step=2986] Training: 30%|██▉ | 2987/10000 [39:02<1:18:51, 1.48it/s, loss=0.0383, lr=2.14e-05, step=2986] Training: 30%|██▉ | 2987/10000 [39:02<1:18:51, 1.48it/s, loss=0.0253, lr=2.14e-05, step=2987] Training: 30%|██▉ | 2988/10000 [39:03<1:12:33, 1.61it/s, loss=0.0253, lr=2.14e-05, step=2987] Training: 30%|██▉ | 2988/10000 [39:03<1:12:33, 1.61it/s, loss=0.0239, lr=2.14e-05, step=2988] Training: 30%|██▉ | 2989/10000 [39:03<1:11:57, 1.62it/s, loss=0.0239, lr=2.14e-05, step=2988] Training: 30%|██▉ | 2989/10000 [39:03<1:11:57, 1.62it/s, loss=0.0359, lr=2.14e-05, step=2989]19:23:36.461 [I] step=2990 loss=0.0595 smoothed_loss=0.0269 lr=2.14e-05 grad_norm=0.5241 step_time=0.5357s data_time=0.1023s it/s=1.567 eta_to_10000=4472.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.2005 grad_arm_token_fuse=0.0614 grad_shared_expert=0.4129 (18633:train_pytorch.py:850) + Training: 30%|██▉ | 2990/10000 [39:04<1:15:54, 1.54it/s, loss=0.0359, lr=2.14e-05, step=2989] Training: 30%|██▉ | 2990/10000 [39:04<1:15:54, 1.54it/s, loss=0.0595, lr=2.14e-05, step=2990] Training: 30%|██▉ | 2991/10000 [39:05<1:10:43, 1.65it/s, loss=0.0595, lr=2.14e-05, step=2990] Training: 30%|██▉ | 2991/10000 [39:05<1:10:43, 1.65it/s, loss=0.0488, lr=2.14e-05, step=2991] Training: 30%|██▉ | 2992/10000 [39:05<1:08:15, 1.71it/s, loss=0.0488, lr=2.14e-05, step=2991] Training: 30%|██▉ | 2992/10000 [39:05<1:08:15, 1.71it/s, loss=0.0332, lr=2.14e-05, step=2992] Training: 30%|██▉ | 2993/10000 [39:06<1:16:06, 1.53it/s, loss=0.0332, lr=2.14e-05, step=2992] Training: 30%|██▉ | 2993/10000 [39:06<1:16:06, 1.53it/s, loss=0.0055, lr=2.14e-05, step=2993] Training: 30%|██▉ | 2994/10000 [39:06<1:11:04, 1.64it/s, loss=0.0055, lr=2.14e-05, step=2993] Training: 30%|██▉ | 2994/10000 [39:06<1:11:04, 1.64it/s, loss=0.0184, lr=2.14e-05, step=2994] Training: 30%|██▉ | 2995/10000 [39:07<1:08:08, 1.71it/s, loss=0.0184, lr=2.14e-05, step=2994] Training: 30%|██▉ | 2995/10000 [39:07<1:08:08, 1.71it/s, loss=0.0318, lr=2.14e-05, step=2995] Training: 30%|██▉ | 2996/10000 [39:08<1:07:21, 1.73it/s, loss=0.0318, lr=2.14e-05, step=2995] Training: 30%|██▉ | 2996/10000 [39:08<1:07:21, 1.73it/s, loss=0.0082, lr=2.14e-05, step=2996] Training: 30%|██▉ | 2997/10000 [39:08<1:11:28, 1.63it/s, loss=0.0082, lr=2.14e-05, step=2996] Training: 30%|██▉ | 2997/10000 [39:08<1:11:28, 1.63it/s, loss=0.0111, lr=2.14e-05, step=2997] Training: 30%|██▉ | 2998/10000 [39:09<1:09:49, 1.67it/s, loss=0.0111, lr=2.14e-05, step=2997] Training: 30%|██▉ | 2998/10000 [39:09<1:09:49, 1.67it/s, loss=0.0367, lr=2.14e-05, step=2998] Training: 30%|██▉ | 2999/10000 [39:10<1:16:30, 1.53it/s, loss=0.0367, lr=2.14e-05, step=2998] Training: 30%|██▉ | 2999/10000 [39:10<1:16:30, 1.53it/s, loss=0.0301, lr=2.14e-05, step=2999]19:23:42.798 [I] step=3000 loss=0.0201 smoothed_loss=0.0249 lr=2.14e-05 grad_norm=0.5353 step_time=0.5412s data_time=0.0925s it/s=1.578 eta_to_10000=4435.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0200 grad_action_out_proj_arms=0.1988 grad_arm_token_fuse=0.1050 grad_shared_expert=0.6035 (18633:train_pytorch.py:850) +19:25:59.695 [I] Saved checkpoint at step 3000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/3000 (18633:train_pytorch.py:350) + Training: 30%|███ | 3000/10000 [41:27<81:14:45, 41.78s/it, loss=0.0301, lr=2.14e-05, step=2999] Training: 30%|███ | 3000/10000 [41:27<81:14:45, 41.78s/it, loss=0.0201, lr=2.14e-05, step=3000] Training: 30%|███ | 3001/10000 [41:28<57:27:04, 29.55s/it, loss=0.0201, lr=2.14e-05, step=3000] Training: 30%|███ | 3001/10000 [41:28<57:27:04, 29.55s/it, loss=0.0038, lr=2.14e-05, step=3001] Training: 30%|███ | 3002/10000 [41:29<40:36:32, 20.89s/it, loss=0.0038, lr=2.14e-05, step=3001] Training: 30%|███ | 3002/10000 [41:29<40:36:32, 20.89s/it, loss=0.0041, lr=2.14e-05, step=3002] Training: 30%|███ | 3003/10000 [41:30<28:51:15, 14.85s/it, loss=0.0041, lr=2.14e-05, step=3002] Training: 30%|███ | 3003/10000 [41:30<28:51:15, 14.85s/it, loss=0.0219, lr=2.14e-05, step=3003] Training: 30%|███ | 3004/10000 [41:31<20:52:02, 10.74s/it, loss=0.0219, lr=2.14e-05, step=3003] Training: 30%|███ | 3004/10000 [41:31<20:52:02, 10.74s/it, loss=0.0181, lr=2.14e-05, step=3004] Training: 30%|███ | 3005/10000 [41:32<15:06:02, 7.77s/it, loss=0.0181, lr=2.14e-05, step=3004] Training: 30%|███ | 3005/10000 [41:32<15:06:02, 7.77s/it, loss=0.0131, lr=2.14e-05, step=3005] Training: 30%|███ | 3006/10000 [41:33<11:05:19, 5.71s/it, loss=0.0131, lr=2.14e-05, step=3005] Training: 30%|███ | 3006/10000 [41:33<11:05:19, 5.71s/it, loss=0.0193, lr=2.14e-05, step=3006] Training: 30%|███ | 3007/10000 [41:34<8:17:20, 4.27s/it, loss=0.0193, lr=2.14e-05, step=3006] Training: 30%|███ | 3007/10000 [41:34<8:17:20, 4.27s/it, loss=0.0227, lr=2.14e-05, step=3007] Training: 30%|███ | 3008/10000 [41:35<6:24:11, 3.30s/it, loss=0.0227, lr=2.14e-05, step=3007] Training: 30%|███ | 3008/10000 [41:35<6:24:11, 3.30s/it, loss=0.0379, lr=2.14e-05, step=3008] Training: 30%|███ | 3009/10000 [41:35<4:57:40, 2.55s/it, loss=0.0379, lr=2.14e-05, step=3008] Training: 30%|███ | 3009/10000 [41:35<4:57:40, 2.55s/it, loss=0.0221, lr=2.13e-05, step=3009]19:26:08.608 [I] step=3010 loss=0.0070 smoothed_loss=0.0205 lr=2.14e-05 grad_norm=0.6298 step_time=0.6527s data_time=13.9283s it/s=0.069 eta_to_10000=101920.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0318 grad_action_out_proj_arms=0.1761 grad_arm_token_fuse=0.1618 grad_shared_expert=0.6405 (18633:train_pytorch.py:850) + Training: 30%|███ | 3010/10000 [41:36<3:56:55, 2.03s/it, loss=0.0221, lr=2.13e-05, step=3009] Training: 30%|███ | 3010/10000 [41:36<3:56:55, 2.03s/it, loss=0.0070, lr=2.13e-05, step=3010] Training: 30%|███ | 3011/10000 [41:37<3:07:51, 1.61s/it, loss=0.0070, lr=2.13e-05, step=3010] Training: 30%|███ | 3011/10000 [41:37<3:07:51, 1.61s/it, loss=0.0168, lr=2.13e-05, step=3011] Training: 30%|███ | 3012/10000 [41:38<2:37:32, 1.35s/it, loss=0.0168, lr=2.13e-05, step=3011] Training: 30%|███ | 3012/10000 [41:38<2:37:32, 1.35s/it, loss=0.0499, lr=2.13e-05, step=3012] Training: 30%|███ | 3013/10000 [41:38<2:08:34, 1.10s/it, loss=0.0499, lr=2.13e-05, step=3012] Training: 30%|███ | 3013/10000 [41:38<2:08:34, 1.10s/it, loss=0.0085, lr=2.13e-05, step=3013] Training: 30%|███ | 3014/10000 [41:39<2:02:10, 1.05s/it, loss=0.0085, lr=2.13e-05, step=3013] Training: 30%|███ | 3014/10000 [41:39<2:02:10, 1.05s/it, loss=0.0186, lr=2.13e-05, step=3014] Training: 30%|███ | 3015/10000 [41:40<1:50:57, 1.05it/s, loss=0.0186, lr=2.13e-05, step=3014] Training: 30%|███ | 3015/10000 [41:40<1:50:57, 1.05it/s, loss=0.0037, lr=2.13e-05, step=3015] Training: 30%|███ | 3016/10000 [41:40<1:36:54, 1.20it/s, loss=0.0037, lr=2.13e-05, step=3015] Training: 30%|███ | 3016/10000 [41:40<1:36:54, 1.20it/s, loss=0.0144, lr=2.13e-05, step=3016] Training: 30%|███ | 3017/10000 [41:41<1:29:27, 1.30it/s, loss=0.0144, lr=2.13e-05, step=3016] Training: 30%|███ | 3017/10000 [41:41<1:29:27, 1.30it/s, loss=0.0207, lr=2.13e-05, step=3017] Training: 30%|███ | 3018/10000 [41:42<1:22:45, 1.41it/s, loss=0.0207, lr=2.13e-05, step=3017] Training: 30%|███ | 3018/10000 [41:42<1:22:45, 1.41it/s, loss=0.0384, lr=2.13e-05, step=3018] Training: 30%|███ | 3019/10000 [41:43<1:30:58, 1.28it/s, loss=0.0384, lr=2.13e-05, step=3018] Training: 30%|███ | 3019/10000 [41:43<1:30:58, 1.28it/s, loss=0.0122, lr=2.13e-05, step=3019]19:26:15.728 [I] step=3020 loss=0.0138 smoothed_loss=0.0196 lr=2.13e-05 grad_norm=0.5346 step_time=0.5938s data_time=0.1182s it/s=1.405 eta_to_10000=4969.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0095 grad_action_out_proj_arms=0.1090 grad_arm_token_fuse=0.0510 grad_shared_expert=0.4272 (18633:train_pytorch.py:850) + Training: 30%|███ | 3020/10000 [41:43<1:34:11, 1.24it/s, loss=0.0122, lr=2.13e-05, step=3019] Training: 30%|███ | 3020/10000 [41:43<1:34:11, 1.24it/s, loss=0.0138, lr=2.13e-05, step=3020] Training: 30%|███ | 3021/10000 [41:44<1:39:27, 1.17it/s, loss=0.0138, lr=2.13e-05, step=3020] Training: 30%|███ | 3021/10000 [41:44<1:39:27, 1.17it/s, loss=0.0534, lr=2.13e-05, step=3021] Training: 30%|███ | 3022/10000 [41:45<1:40:44, 1.15it/s, loss=0.0534, lr=2.13e-05, step=3021] Training: 30%|███ | 3022/10000 [41:45<1:40:44, 1.15it/s, loss=0.0320, lr=2.13e-05, step=3022] Training: 30%|███ | 3023/10000 [41:46<1:33:22, 1.25it/s, loss=0.0320, lr=2.13e-05, step=3022] Training: 30%|███ | 3023/10000 [41:46<1:33:22, 1.25it/s, loss=0.0213, lr=2.13e-05, step=3023] Training: 30%|███ | 3024/10000 [41:47<1:26:34, 1.34it/s, loss=0.0213, lr=2.13e-05, step=3023] Training: 30%|███ | 3024/10000 [41:47<1:26:34, 1.34it/s, loss=0.0068, lr=2.13e-05, step=3024] Training: 30%|███ | 3025/10000 [41:47<1:22:43, 1.41it/s, loss=0.0068, lr=2.13e-05, step=3024] Training: 30%|███ | 3025/10000 [41:47<1:22:43, 1.41it/s, loss=0.0104, lr=2.13e-05, step=3025] Training: 30%|███ | 3026/10000 [41:48<1:29:31, 1.30it/s, loss=0.0104, lr=2.13e-05, step=3025] Training: 30%|███ | 3026/10000 [41:48<1:29:31, 1.30it/s, loss=0.0224, lr=2.13e-05, step=3026] Training: 30%|███ | 3027/10000 [41:49<1:31:26, 1.27it/s, loss=0.0224, lr=2.13e-05, step=3026] Training: 30%|███ | 3027/10000 [41:49<1:31:26, 1.27it/s, loss=0.0175, lr=2.13e-05, step=3027] Training: 30%|███ | 3028/10000 [41:50<1:42:28, 1.13it/s, loss=0.0175, lr=2.13e-05, step=3027] Training: 30%|███ | 3028/10000 [41:50<1:42:28, 1.13it/s, loss=0.1049, lr=2.13e-05, step=3028] Training: 30%|███ | 3029/10000 [41:51<1:35:24, 1.22it/s, loss=0.1049, lr=2.13e-05, step=3028] Training: 30%|███ | 3029/10000 [41:51<1:35:24, 1.22it/s, loss=0.0847, lr=2.13e-05, step=3029]19:26:23.584 [I] step=3030 loss=0.0079 smoothed_loss=0.0319 lr=2.13e-05 grad_norm=0.5188 step_time=0.6205s data_time=0.1651s it/s=1.273 eta_to_10000=5475.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0311 grad_action_out_proj_arms=0.2589 grad_arm_token_fuse=0.1845 grad_shared_expert=0.8296 (18633:train_pytorch.py:850) + Training: 30%|███ | 3030/10000 [41:51<1:27:17, 1.33it/s, loss=0.0847, lr=2.13e-05, step=3029] Training: 30%|███ | 3030/10000 [41:51<1:27:17, 1.33it/s, loss=0.0079, lr=2.13e-05, step=3030] Training: 30%|███ | 3031/10000 [41:52<1:20:04, 1.45it/s, loss=0.0079, lr=2.13e-05, step=3030] Training: 30%|███ | 3031/10000 [41:52<1:20:04, 1.45it/s, loss=0.0095, lr=2.13e-05, step=3031] Training: 30%|███ | 3032/10000 [41:53<1:24:24, 1.38it/s, loss=0.0095, lr=2.13e-05, step=3031] Training: 30%|███ | 3032/10000 [41:53<1:24:24, 1.38it/s, loss=0.0107, lr=2.13e-05, step=3032] Training: 30%|███ | 3033/10000 [41:53<1:21:33, 1.42it/s, loss=0.0107, lr=2.13e-05, step=3032] Training: 30%|███ | 3033/10000 [41:53<1:21:33, 1.42it/s, loss=0.0276, lr=2.13e-05, step=3033] Training: 30%|███ | 3034/10000 [41:54<1:22:06, 1.41it/s, loss=0.0276, lr=2.13e-05, step=3033] Training: 30%|███ | 3034/10000 [41:54<1:22:06, 1.41it/s, loss=0.0322, lr=2.13e-05, step=3034] Training: 30%|███ | 3035/10000 [41:55<1:16:23, 1.52it/s, loss=0.0322, lr=2.13e-05, step=3034] Training: 30%|███ | 3035/10000 [41:55<1:16:23, 1.52it/s, loss=0.0077, lr=2.13e-05, step=3035] Training: 30%|███ | 3036/10000 [41:55<1:22:59, 1.40it/s, loss=0.0077, lr=2.13e-05, step=3035] Training: 30%|███ | 3036/10000 [41:55<1:22:59, 1.40it/s, loss=0.0237, lr=2.13e-05, step=3036] Training: 30%|███ | 3037/10000 [41:56<1:20:12, 1.45it/s, loss=0.0237, lr=2.13e-05, step=3036] Training: 30%|███ | 3037/10000 [41:56<1:20:12, 1.45it/s, loss=0.0073, lr=2.13e-05, step=3037] Training: 30%|███ | 3038/10000 [41:57<1:20:18, 1.44it/s, loss=0.0073, lr=2.13e-05, step=3037] Training: 30%|███ | 3038/10000 [41:57<1:20:18, 1.44it/s, loss=0.0164, lr=2.13e-05, step=3038] Training: 30%|███ | 3039/10000 [41:57<1:15:20, 1.54it/s, loss=0.0164, lr=2.13e-05, step=3038] Training: 30%|███ | 3039/10000 [41:57<1:15:20, 1.54it/s, loss=0.0231, lr=2.13e-05, step=3039]19:26:30.239 [I] step=3040 loss=0.0177 smoothed_loss=0.0227 lr=2.13e-05 grad_norm=0.5146 step_time=0.5518s data_time=0.1137s it/s=1.503 eta_to_10000=4631.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1197 grad_arm_token_fuse=0.0467 grad_shared_expert=0.3931 (18633:train_pytorch.py:850) + Training: 30%|███ | 3040/10000 [41:58<1:15:52, 1.53it/s, loss=0.0231, lr=2.13e-05, step=3039] Training: 30%|███ | 3040/10000 [41:58<1:15:52, 1.53it/s, loss=0.0177, lr=2.13e-05, step=3040] Training: 30%|███ | 3041/10000 [41:59<1:16:55, 1.51it/s, loss=0.0177, lr=2.13e-05, step=3040] Training: 30%|███ | 3041/10000 [41:59<1:16:55, 1.51it/s, loss=0.0218, lr=2.13e-05, step=3041] Training: 30%|███ | 3042/10000 [42:00<1:26:02, 1.35it/s, loss=0.0218, lr=2.13e-05, step=3041] Training: 30%|███ | 3042/10000 [42:00<1:26:02, 1.35it/s, loss=0.0356, lr=2.13e-05, step=3042] Training: 30%|███ | 3043/10000 [42:00<1:29:19, 1.30it/s, loss=0.0356, lr=2.13e-05, step=3042] Training: 30%|███ | 3043/10000 [42:00<1:29:19, 1.30it/s, loss=0.0264, lr=2.13e-05, step=3043] Training: 30%|███ | 3044/10000 [42:01<1:23:12, 1.39it/s, loss=0.0264, lr=2.13e-05, step=3043] Training: 30%|███ | 3044/10000 [42:01<1:23:12, 1.39it/s, loss=0.0305, lr=2.13e-05, step=3044] Training: 30%|███ | 3045/10000 [42:02<1:19:29, 1.46it/s, loss=0.0305, lr=2.13e-05, step=3044] Training: 30%|███ | 3045/10000 [42:02<1:19:29, 1.46it/s, loss=0.0359, lr=2.12e-05, step=3045] Training: 30%|███ | 3046/10000 [42:02<1:16:13, 1.52it/s, loss=0.0359, lr=2.12e-05, step=3045] Training: 30%|███ | 3046/10000 [42:02<1:16:13, 1.52it/s, loss=0.0025, lr=2.12e-05, step=3046] Training: 30%|███ | 3047/10000 [42:03<1:14:31, 1.55it/s, loss=0.0025, lr=2.12e-05, step=3046] Training: 30%|███ | 3047/10000 [42:03<1:14:31, 1.55it/s, loss=0.0268, lr=2.12e-05, step=3047] Training: 30%|███ | 3048/10000 [42:03<1:12:00, 1.61it/s, loss=0.0268, lr=2.12e-05, step=3047] Training: 30%|███ | 3048/10000 [42:03<1:12:00, 1.61it/s, loss=0.0176, lr=2.12e-05, step=3048] Training: 30%|███ | 3049/10000 [42:04<1:23:04, 1.39it/s, loss=0.0176, lr=2.12e-05, step=3048] Training: 30%|███ | 3049/10000 [42:04<1:23:04, 1.39it/s, loss=0.0138, lr=2.12e-05, step=3049]19:26:37.431 [I] step=3050 loss=0.0061 smoothed_loss=0.0207 lr=2.12e-05 grad_norm=0.6467 step_time=0.6057s data_time=0.1134s it/s=1.391 eta_to_10000=4995.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0078 grad_action_out_proj_arms=0.1142 grad_arm_token_fuse=0.0356 grad_shared_expert=0.3554 (18633:train_pytorch.py:850) + Training: 30%|███ | 3050/10000 [42:05<1:26:44, 1.34it/s, loss=0.0138, lr=2.12e-05, step=3049] Training: 30%|███ | 3050/10000 [42:05<1:26:44, 1.34it/s, loss=0.0061, lr=2.12e-05, step=3050] Training: 31%|███ | 3051/10000 [42:06<1:17:47, 1.49it/s, loss=0.0061, lr=2.12e-05, step=3050] Training: 31%|███ | 3051/10000 [42:06<1:17:47, 1.49it/s, loss=0.0086, lr=2.12e-05, step=3051] Training: 31%|███ | 3052/10000 [42:06<1:15:55, 1.53it/s, loss=0.0086, lr=2.12e-05, step=3051] Training: 31%|███ | 3052/10000 [42:06<1:15:55, 1.53it/s, loss=0.0134, lr=2.12e-05, step=3052] Training: 31%|███ | 3053/10000 [42:07<1:20:16, 1.44it/s, loss=0.0134, lr=2.12e-05, step=3052] Training: 31%|███ | 3053/10000 [42:07<1:20:16, 1.44it/s, loss=0.0095, lr=2.12e-05, step=3053] Training: 31%|███ | 3054/10000 [42:08<1:23:34, 1.39it/s, loss=0.0095, lr=2.12e-05, step=3053] Training: 31%|███ | 3054/10000 [42:08<1:23:34, 1.39it/s, loss=0.0291, lr=2.12e-05, step=3054] Training: 31%|███ | 3055/10000 [42:08<1:22:24, 1.40it/s, loss=0.0291, lr=2.12e-05, step=3054] Training: 31%|███ | 3055/10000 [42:08<1:22:24, 1.40it/s, loss=0.0118, lr=2.12e-05, step=3055] Training: 31%|███ | 3056/10000 [42:09<1:23:42, 1.38it/s, loss=0.0118, lr=2.12e-05, step=3055] Training: 31%|███ | 3056/10000 [42:09<1:23:42, 1.38it/s, loss=0.0231, lr=2.12e-05, step=3056] Training: 31%|███ | 3057/10000 [42:10<1:35:41, 1.21it/s, loss=0.0231, lr=2.12e-05, step=3056] Training: 31%|███ | 3057/10000 [42:10<1:35:41, 1.21it/s, loss=0.0120, lr=2.12e-05, step=3057] Training: 31%|███ | 3058/10000 [42:11<1:30:37, 1.28it/s, loss=0.0120, lr=2.12e-05, step=3057] Training: 31%|███ | 3058/10000 [42:11<1:30:37, 1.28it/s, loss=0.0065, lr=2.12e-05, step=3058] Training: 31%|███ | 3059/10000 [42:12<1:31:19, 1.27it/s, loss=0.0065, lr=2.12e-05, step=3058] Training: 31%|███ | 3059/10000 [42:12<1:31:19, 1.27it/s, loss=0.0470, lr=2.12e-05, step=3059]19:26:44.770 [I] step=3060 loss=0.0243 smoothed_loss=0.0204 lr=2.12e-05 grad_norm=0.4928 step_time=0.5739s data_time=0.1600s it/s=1.363 eta_to_10000=5092.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0261 grad_action_out_proj_arms=0.2474 grad_arm_token_fuse=0.1311 grad_shared_expert=0.5398 (18633:train_pytorch.py:850) + Training: 31%|███ | 3060/10000 [42:12<1:27:03, 1.33it/s, loss=0.0470, lr=2.12e-05, step=3059] Training: 31%|███ | 3060/10000 [42:12<1:27:03, 1.33it/s, loss=0.0243, lr=2.12e-05, step=3060] Training: 31%|███ | 3061/10000 [42:13<1:29:04, 1.30it/s, loss=0.0243, lr=2.12e-05, step=3060] Training: 31%|███ | 3061/10000 [42:13<1:29:04, 1.30it/s, loss=0.0434, lr=2.12e-05, step=3061] Training: 31%|███ | 3062/10000 [42:14<1:21:19, 1.42it/s, loss=0.0434, lr=2.12e-05, step=3061] Training: 31%|███ | 3062/10000 [42:14<1:21:19, 1.42it/s, loss=0.0101, lr=2.12e-05, step=3062] Training: 31%|███ | 3063/10000 [42:14<1:18:13, 1.48it/s, loss=0.0101, lr=2.12e-05, step=3062] Training: 31%|███ | 3063/10000 [42:14<1:18:13, 1.48it/s, loss=0.0162, lr=2.12e-05, step=3063] Training: 31%|███ | 3064/10000 [42:15<1:23:10, 1.39it/s, loss=0.0162, lr=2.12e-05, step=3063] Training: 31%|███ | 3064/10000 [42:15<1:23:10, 1.39it/s, loss=0.0345, lr=2.12e-05, step=3064] Training: 31%|███ | 3065/10000 [42:16<1:17:12, 1.50it/s, loss=0.0345, lr=2.12e-05, step=3064] Training: 31%|███ | 3065/10000 [42:16<1:17:12, 1.50it/s, loss=0.0178, lr=2.12e-05, step=3065] Training: 31%|███ | 3066/10000 [42:16<1:15:23, 1.53it/s, loss=0.0178, lr=2.12e-05, step=3065] Training: 31%|███ | 3066/10000 [42:16<1:15:23, 1.53it/s, loss=0.0058, lr=2.12e-05, step=3066] Training: 31%|███ | 3067/10000 [42:17<1:17:43, 1.49it/s, loss=0.0058, lr=2.12e-05, step=3066] Training: 31%|███ | 3067/10000 [42:17<1:17:43, 1.49it/s, loss=0.0087, lr=2.12e-05, step=3067] Training: 31%|███ | 3068/10000 [42:18<1:23:41, 1.38it/s, loss=0.0087, lr=2.12e-05, step=3067] Training: 31%|███ | 3068/10000 [42:18<1:23:41, 1.38it/s, loss=0.0130, lr=2.12e-05, step=3068] Training: 31%|███ | 3069/10000 [42:18<1:16:51, 1.50it/s, loss=0.0130, lr=2.12e-05, step=3068] Training: 31%|███ | 3069/10000 [42:18<1:16:51, 1.50it/s, loss=0.0373, lr=2.12e-05, step=3069]19:26:51.444 [I] step=3070 loss=0.0125 smoothed_loss=0.0196 lr=2.12e-05 grad_norm=0.5857 step_time=0.5545s data_time=0.1129s it/s=1.499 eta_to_10000=4624.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1532 grad_arm_token_fuse=0.0690 grad_shared_expert=0.7375 (18633:train_pytorch.py:850) + Training: 31%|███ | 3070/10000 [42:19<1:15:27, 1.53it/s, loss=0.0373, lr=2.12e-05, step=3069] Training: 31%|███ | 3070/10000 [42:19<1:15:27, 1.53it/s, loss=0.0125, lr=2.12e-05, step=3070] Training: 31%|███ | 3071/10000 [42:20<1:23:37, 1.38it/s, loss=0.0125, lr=2.12e-05, step=3070] Training: 31%|███ | 3071/10000 [42:20<1:23:37, 1.38it/s, loss=0.0072, lr=2.12e-05, step=3071] Training: 31%|███ | 3072/10000 [42:21<1:32:06, 1.25it/s, loss=0.0072, lr=2.12e-05, step=3071] Training: 31%|███ | 3072/10000 [42:21<1:32:06, 1.25it/s, loss=0.0323, lr=2.12e-05, step=3072] Training: 31%|███ | 3073/10000 [42:22<1:26:02, 1.34it/s, loss=0.0323, lr=2.12e-05, step=3072] Training: 31%|███ | 3073/10000 [42:22<1:26:02, 1.34it/s, loss=0.0296, lr=2.12e-05, step=3073] Training: 31%|███ | 3074/10000 [42:22<1:21:21, 1.42it/s, loss=0.0296, lr=2.12e-05, step=3073] Training: 31%|███ | 3074/10000 [42:22<1:21:21, 1.42it/s, loss=0.0111, lr=2.12e-05, step=3074] Training: 31%|███ | 3075/10000 [42:23<1:25:05, 1.36it/s, loss=0.0111, lr=2.12e-05, step=3074] Training: 31%|███ | 3075/10000 [42:23<1:25:05, 1.36it/s, loss=0.0254, lr=2.12e-05, step=3075] Training: 31%|███ | 3076/10000 [42:24<1:17:32, 1.49it/s, loss=0.0254, lr=2.12e-05, step=3075] Training: 31%|███ | 3076/10000 [42:24<1:17:32, 1.49it/s, loss=0.0202, lr=2.12e-05, step=3076] Training: 31%|███ | 3077/10000 [42:24<1:13:59, 1.56it/s, loss=0.0202, lr=2.12e-05, step=3076] Training: 31%|███ | 3077/10000 [42:24<1:13:59, 1.56it/s, loss=0.0166, lr=2.12e-05, step=3077] Training: 31%|███ | 3078/10000 [42:25<1:29:16, 1.29it/s, loss=0.0166, lr=2.12e-05, step=3077] Training: 31%|███ | 3078/10000 [42:25<1:29:16, 1.29it/s, loss=0.0106, lr=2.12e-05, step=3078] Training: 31%|███ | 3079/10000 [42:26<1:29:08, 1.29it/s, loss=0.0106, lr=2.12e-05, step=3078] Training: 31%|███ | 3079/10000 [42:26<1:29:08, 1.29it/s, loss=0.0613, lr=2.12e-05, step=3079]19:26:58.878 [I] step=3080 loss=0.0596 smoothed_loss=0.0269 lr=2.12e-05 grad_norm=0.4780 step_time=0.6166s data_time=0.1268s it/s=1.345 eta_to_10000=5143.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0272 grad_action_out_proj_arms=0.1916 grad_arm_token_fuse=0.1461 grad_shared_expert=0.4235 (18633:train_pytorch.py:850) + Training: 31%|███ | 3080/10000 [42:27<1:22:40, 1.40it/s, loss=0.0613, lr=2.12e-05, step=3079] Training: 31%|███ | 3080/10000 [42:27<1:22:40, 1.40it/s, loss=0.0596, lr=2.12e-05, step=3080] Training: 31%|███ | 3081/10000 [42:27<1:16:00, 1.52it/s, loss=0.0596, lr=2.12e-05, step=3080] Training: 31%|███ | 3081/10000 [42:27<1:16:00, 1.52it/s, loss=0.0045, lr=2.11e-05, step=3081] Training: 31%|███ | 3082/10000 [42:28<1:12:56, 1.58it/s, loss=0.0045, lr=2.11e-05, step=3081] Training: 31%|███ | 3082/10000 [42:28<1:12:56, 1.58it/s, loss=0.0093, lr=2.11e-05, step=3082] Training: 31%|███ | 3083/10000 [42:28<1:09:02, 1.67it/s, loss=0.0093, lr=2.11e-05, step=3082] Training: 31%|███ | 3083/10000 [42:28<1:09:02, 1.67it/s, loss=0.0736, lr=2.11e-05, step=3083] Training: 31%|███ | 3084/10000 [42:29<1:06:02, 1.75it/s, loss=0.0736, lr=2.11e-05, step=3083] Training: 31%|███ | 3084/10000 [42:29<1:06:02, 1.75it/s, loss=0.0205, lr=2.11e-05, step=3084] Training: 31%|███ | 3085/10000 [42:30<1:20:31, 1.43it/s, loss=0.0205, lr=2.11e-05, step=3084] Training: 31%|███ | 3085/10000 [42:30<1:20:31, 1.43it/s, loss=0.0269, lr=2.11e-05, step=3085] Training: 31%|███ | 3086/10000 [42:31<1:25:35, 1.35it/s, loss=0.0269, lr=2.11e-05, step=3085] Training: 31%|███ | 3086/10000 [42:31<1:25:35, 1.35it/s, loss=0.0049, lr=2.11e-05, step=3086] Training: 31%|███ | 3087/10000 [42:31<1:32:21, 1.25it/s, loss=0.0049, lr=2.11e-05, step=3086] Training: 31%|███ | 3087/10000 [42:31<1:32:21, 1.25it/s, loss=0.0446, lr=2.11e-05, step=3087] Training: 31%|███ | 3088/10000 [42:32<1:25:04, 1.35it/s, loss=0.0446, lr=2.11e-05, step=3087] Training: 31%|███ | 3088/10000 [42:32<1:25:04, 1.35it/s, loss=0.0079, lr=2.11e-05, step=3088] Training: 31%|███ | 3089/10000 [42:33<1:18:29, 1.47it/s, loss=0.0079, lr=2.11e-05, step=3088] Training: 31%|███ | 3089/10000 [42:33<1:18:29, 1.47it/s, loss=0.0145, lr=2.11e-05, step=3089]19:27:05.523 [I] step=3090 loss=0.0165 smoothed_loss=0.0233 lr=2.11e-05 grad_norm=0.5428 step_time=0.5431s data_time=0.1214s it/s=1.505 eta_to_10000=4591.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0221 grad_action_out_proj_arms=0.2302 grad_arm_token_fuse=0.1118 grad_shared_expert=0.5303 (18633:train_pytorch.py:850) + Training: 31%|███ | 3090/10000 [42:33<1:15:42, 1.52it/s, loss=0.0145, lr=2.11e-05, step=3089] Training: 31%|███ | 3090/10000 [42:33<1:15:42, 1.52it/s, loss=0.0165, lr=2.11e-05, step=3090] Training: 31%|███ | 3091/10000 [42:34<1:13:45, 1.56it/s, loss=0.0165, lr=2.11e-05, step=3090] Training: 31%|███ | 3091/10000 [42:34<1:13:45, 1.56it/s, loss=0.0085, lr=2.11e-05, step=3091] Training: 31%|███ | 3092/10000 [42:34<1:09:20, 1.66it/s, loss=0.0085, lr=2.11e-05, step=3091] Training: 31%|███ | 3092/10000 [42:34<1:09:20, 1.66it/s, loss=0.0709, lr=2.11e-05, step=3092] Training: 31%|███ | 3093/10000 [42:35<1:15:43, 1.52it/s, loss=0.0709, lr=2.11e-05, step=3092] Training: 31%|███ | 3093/10000 [42:35<1:15:43, 1.52it/s, loss=0.0159, lr=2.11e-05, step=3093] Training: 31%|███ | 3094/10000 [42:36<1:26:32, 1.33it/s, loss=0.0159, lr=2.11e-05, step=3093] Training: 31%|███ | 3094/10000 [42:36<1:26:32, 1.33it/s, loss=0.0305, lr=2.11e-05, step=3094] Training: 31%|███ | 3095/10000 [42:37<1:27:20, 1.32it/s, loss=0.0305, lr=2.11e-05, step=3094] Training: 31%|███ | 3095/10000 [42:37<1:27:20, 1.32it/s, loss=0.1383, lr=2.11e-05, step=3095] Training: 31%|███ | 3096/10000 [42:38<1:29:21, 1.29it/s, loss=0.1383, lr=2.11e-05, step=3095] Training: 31%|███ | 3096/10000 [42:38<1:29:21, 1.29it/s, loss=0.0042, lr=2.11e-05, step=3096] Training: 31%|███ | 3097/10000 [42:38<1:22:07, 1.40it/s, loss=0.0042, lr=2.11e-05, step=3096] Training: 31%|███ | 3097/10000 [42:38<1:22:07, 1.40it/s, loss=0.0071, lr=2.11e-05, step=3097] Training: 31%|███ | 3098/10000 [42:39<1:16:41, 1.50it/s, loss=0.0071, lr=2.11e-05, step=3097] Training: 31%|███ | 3098/10000 [42:39<1:16:41, 1.50it/s, loss=0.0529, lr=2.11e-05, step=3098] Training: 31%|███ | 3099/10000 [42:40<1:23:55, 1.37it/s, loss=0.0529, lr=2.11e-05, step=3098] Training: 31%|███ | 3099/10000 [42:40<1:23:55, 1.37it/s, loss=0.0257, lr=2.11e-05, step=3099]19:27:13.120 [I] step=3100 loss=0.1228 smoothed_loss=0.0417 lr=2.11e-05 grad_norm=0.5249 step_time=0.6132s data_time=0.1465s it/s=1.316 eta_to_10000=5241.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0252 grad_action_out_proj_arms=0.1981 grad_arm_token_fuse=0.1527 grad_shared_expert=0.4690 (18633:train_pytorch.py:850) + Training: 31%|███ | 3100/10000 [42:41<1:37:43, 1.18it/s, loss=0.0257, lr=2.11e-05, step=3099] Training: 31%|███ | 3100/10000 [42:41<1:37:43, 1.18it/s, loss=0.1228, lr=2.11e-05, step=3100] Training: 31%|███ | 3101/10000 [42:42<1:36:10, 1.20it/s, loss=0.1228, lr=2.11e-05, step=3100] Training: 31%|███ | 3101/10000 [42:42<1:36:10, 1.20it/s, loss=0.0437, lr=2.11e-05, step=3101] Training: 31%|███ | 3102/10000 [42:43<1:51:46, 1.03it/s, loss=0.0437, lr=2.11e-05, step=3101] Training: 31%|███ | 3102/10000 [42:43<1:51:46, 1.03it/s, loss=0.0083, lr=2.11e-05, step=3102] Training: 31%|███ | 3103/10000 [42:43<1:38:54, 1.16it/s, loss=0.0083, lr=2.11e-05, step=3102] Training: 31%|███ | 3103/10000 [42:43<1:38:54, 1.16it/s, loss=0.0187, lr=2.11e-05, step=3103] Training: 31%|███ | 3104/10000 [42:44<1:28:38, 1.30it/s, loss=0.0187, lr=2.11e-05, step=3103] Training: 31%|███ | 3104/10000 [42:44<1:28:38, 1.30it/s, loss=0.0047, lr=2.11e-05, step=3104] Training: 31%|███ | 3105/10000 [42:45<1:26:37, 1.33it/s, loss=0.0047, lr=2.11e-05, step=3104] Training: 31%|███ | 3105/10000 [42:45<1:26:37, 1.33it/s, loss=0.0259, lr=2.11e-05, step=3105] Training: 31%|███ | 3106/10000 [42:45<1:20:55, 1.42it/s, loss=0.0259, lr=2.11e-05, step=3105] Training: 31%|███ | 3106/10000 [42:45<1:20:55, 1.42it/s, loss=0.0289, lr=2.11e-05, step=3106] Training: 31%|███ | 3107/10000 [42:46<1:25:22, 1.35it/s, loss=0.0289, lr=2.11e-05, step=3106] Training: 31%|███ | 3107/10000 [42:46<1:25:22, 1.35it/s, loss=0.0243, lr=2.11e-05, step=3107] Training: 31%|███ | 3108/10000 [42:47<1:32:50, 1.24it/s, loss=0.0243, lr=2.11e-05, step=3107] Training: 31%|███ | 3108/10000 [42:47<1:32:50, 1.24it/s, loss=0.0122, lr=2.11e-05, step=3108] Training: 31%|███ | 3109/10000 [42:48<1:38:30, 1.17it/s, loss=0.0122, lr=2.11e-05, step=3108] Training: 31%|███ | 3109/10000 [42:48<1:38:30, 1.17it/s, loss=0.0208, lr=2.11e-05, step=3109]19:27:21.093 [I] step=3110 loss=0.0037 smoothed_loss=0.0262 lr=2.11e-05 grad_norm=0.5473 step_time=0.6052s data_time=0.1921s it/s=1.254 eta_to_10000=5492.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0149 grad_action_out_proj_arms=0.1677 grad_arm_token_fuse=0.0763 grad_shared_expert=0.4268 (18633:train_pytorch.py:850) + Training: 31%|███ | 3110/10000 [42:49<1:31:13, 1.26it/s, loss=0.0208, lr=2.11e-05, step=3109] Training: 31%|███ | 3110/10000 [42:49<1:31:13, 1.26it/s, loss=0.0037, lr=2.11e-05, step=3110] Training: 31%|███ | 3111/10000 [42:50<1:32:48, 1.24it/s, loss=0.0037, lr=2.11e-05, step=3110] Training: 31%|███ | 3111/10000 [42:50<1:32:48, 1.24it/s, loss=0.0159, lr=2.11e-05, step=3111] Training: 31%|███ | 3112/10000 [42:50<1:24:12, 1.36it/s, loss=0.0159, lr=2.11e-05, step=3111] Training: 31%|███ | 3112/10000 [42:50<1:24:12, 1.36it/s, loss=0.0068, lr=2.11e-05, step=3112] Training: 31%|███ | 3113/10000 [42:51<1:32:33, 1.24it/s, loss=0.0068, lr=2.11e-05, step=3112] Training: 31%|███ | 3113/10000 [42:51<1:32:33, 1.24it/s, loss=0.0185, lr=2.11e-05, step=3113] Training: 31%|███ | 3114/10000 [42:52<1:31:02, 1.26it/s, loss=0.0185, lr=2.11e-05, step=3113] Training: 31%|███ | 3114/10000 [42:52<1:31:02, 1.26it/s, loss=0.0439, lr=2.11e-05, step=3114] Training: 31%|███ | 3115/10000 [42:53<1:29:02, 1.29it/s, loss=0.0439, lr=2.11e-05, step=3114] Training: 31%|███ | 3115/10000 [42:53<1:29:02, 1.29it/s, loss=0.0379, lr=2.11e-05, step=3115] Training: 31%|███ | 3116/10000 [42:53<1:27:51, 1.31it/s, loss=0.0379, lr=2.11e-05, step=3115] Training: 31%|███ | 3116/10000 [42:53<1:27:51, 1.31it/s, loss=0.0094, lr=2.10e-05, step=3116] Training: 31%|███ | 3117/10000 [42:54<1:25:38, 1.34it/s, loss=0.0094, lr=2.10e-05, step=3116] Training: 31%|███ | 3117/10000 [42:54<1:25:38, 1.34it/s, loss=0.0246, lr=2.10e-05, step=3117] Training: 31%|███ | 3118/10000 [42:55<1:18:22, 1.46it/s, loss=0.0246, lr=2.10e-05, step=3117] Training: 31%|███ | 3118/10000 [42:55<1:18:22, 1.46it/s, loss=0.0164, lr=2.10e-05, step=3118] Training: 31%|███ | 3119/10000 [42:55<1:15:26, 1.52it/s, loss=0.0164, lr=2.10e-05, step=3118] Training: 31%|███ | 3119/10000 [42:55<1:15:26, 1.52it/s, loss=0.0230, lr=2.10e-05, step=3119]19:27:28.212 [I] step=3120 loss=0.0204 smoothed_loss=0.0233 lr=2.11e-05 grad_norm=0.4770 step_time=0.5573s data_time=0.1546s it/s=1.405 eta_to_10000=4897.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0190 grad_action_out_proj_arms=0.1919 grad_arm_token_fuse=0.1026 grad_shared_expert=0.3531 (18633:train_pytorch.py:850) + Training: 31%|███ | 3120/10000 [42:56<1:15:43, 1.51it/s, loss=0.0230, lr=2.10e-05, step=3119] Training: 31%|███ | 3120/10000 [42:56<1:15:43, 1.51it/s, loss=0.0204, lr=2.10e-05, step=3120] Training: 31%|███ | 3121/10000 [42:57<1:19:34, 1.44it/s, loss=0.0204, lr=2.10e-05, step=3120] Training: 31%|███ | 3121/10000 [42:57<1:19:34, 1.44it/s, loss=0.0141, lr=2.10e-05, step=3121] Training: 31%|███ | 3122/10000 [42:57<1:18:34, 1.46it/s, loss=0.0141, lr=2.10e-05, step=3121] Training: 31%|███ | 3122/10000 [42:57<1:18:34, 1.46it/s, loss=0.0140, lr=2.10e-05, step=3122] Training: 31%|███ | 3123/10000 [42:58<1:13:11, 1.57it/s, loss=0.0140, lr=2.10e-05, step=3122] Training: 31%|███ | 3123/10000 [42:58<1:13:11, 1.57it/s, loss=0.0164, lr=2.10e-05, step=3123] Training: 31%|███ | 3124/10000 [42:59<1:14:45, 1.53it/s, loss=0.0164, lr=2.10e-05, step=3123] Training: 31%|███ | 3124/10000 [42:59<1:14:45, 1.53it/s, loss=0.0427, lr=2.10e-05, step=3124] Training: 31%|███▏ | 3125/10000 [42:59<1:14:23, 1.54it/s, loss=0.0427, lr=2.10e-05, step=3124] Training: 31%|███▏ | 3125/10000 [42:59<1:14:23, 1.54it/s, loss=0.0030, lr=2.10e-05, step=3125] Training: 31%|███▏ | 3126/10000 [43:00<1:28:43, 1.29it/s, loss=0.0030, lr=2.10e-05, step=3125] Training: 31%|███▏ | 3126/10000 [43:00<1:28:43, 1.29it/s, loss=0.0190, lr=2.10e-05, step=3126] Training: 31%|███▏ | 3127/10000 [43:01<1:26:57, 1.32it/s, loss=0.0190, lr=2.10e-05, step=3126] Training: 31%|███▏ | 3127/10000 [43:01<1:26:57, 1.32it/s, loss=0.0206, lr=2.10e-05, step=3127] Training: 31%|███▏ | 3128/10000 [43:02<1:30:09, 1.27it/s, loss=0.0206, lr=2.10e-05, step=3127] Training: 31%|███▏ | 3128/10000 [43:02<1:30:09, 1.27it/s, loss=0.0120, lr=2.10e-05, step=3128] Training: 31%|███▏ | 3129/10000 [43:02<1:22:36, 1.39it/s, loss=0.0120, lr=2.10e-05, step=3128] Training: 31%|███▏ | 3129/10000 [43:02<1:22:36, 1.39it/s, loss=0.0062, lr=2.10e-05, step=3129]19:27:35.325 [I] step=3130 loss=0.0260 smoothed_loss=0.0194 lr=2.10e-05 grad_norm=0.5252 step_time=0.5630s data_time=0.1483s it/s=1.406 eta_to_10000=4886.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0151 grad_action_out_proj_arms=0.1677 grad_arm_token_fuse=0.0795 grad_shared_expert=0.4111 (18633:train_pytorch.py:850) + Training: 31%|███▏ | 3130/10000 [43:03<1:18:45, 1.45it/s, loss=0.0062, lr=2.10e-05, step=3129] Training: 31%|███▏ | 3130/10000 [43:03<1:18:45, 1.45it/s, loss=0.0260, lr=2.10e-05, step=3130] Training: 31%|███▏ | 3131/10000 [43:04<1:17:28, 1.48it/s, loss=0.0260, lr=2.10e-05, step=3130] Training: 31%|███▏ | 3131/10000 [43:04<1:17:28, 1.48it/s, loss=0.0121, lr=2.10e-05, step=3131] Training: 31%|███▏ | 3132/10000 [43:04<1:17:13, 1.48it/s, loss=0.0121, lr=2.10e-05, step=3131] Training: 31%|███▏ | 3132/10000 [43:04<1:17:13, 1.48it/s, loss=0.0493, lr=2.10e-05, step=3132] Training: 31%|███▏ | 3133/10000 [43:05<1:11:34, 1.60it/s, loss=0.0493, lr=2.10e-05, step=3132] Training: 31%|███▏ | 3133/10000 [43:05<1:11:34, 1.60it/s, loss=0.0081, lr=2.10e-05, step=3133] Training: 31%|███▏ | 3134/10000 [43:05<1:10:34, 1.62it/s, loss=0.0081, lr=2.10e-05, step=3133] Training: 31%|███▏ | 3134/10000 [43:05<1:10:34, 1.62it/s, loss=0.0098, lr=2.10e-05, step=3134] Training: 31%|███▏ | 3135/10000 [43:06<1:07:27, 1.70it/s, loss=0.0098, lr=2.10e-05, step=3134] Training: 31%|███▏ | 3135/10000 [43:06<1:07:27, 1.70it/s, loss=0.0031, lr=2.10e-05, step=3135] Training: 31%|███▏ | 3136/10000 [43:07<1:14:22, 1.54it/s, loss=0.0031, lr=2.10e-05, step=3135] Training: 31%|███▏ | 3136/10000 [43:07<1:14:22, 1.54it/s, loss=0.0183, lr=2.10e-05, step=3136] Training: 31%|███▏ | 3137/10000 [43:07<1:12:43, 1.57it/s, loss=0.0183, lr=2.10e-05, step=3136] Training: 31%|███▏ | 3137/10000 [43:07<1:12:43, 1.57it/s, loss=0.0146, lr=2.10e-05, step=3137] Training: 31%|███▏ | 3138/10000 [43:08<1:18:10, 1.46it/s, loss=0.0146, lr=2.10e-05, step=3137] Training: 31%|███▏ | 3138/10000 [43:08<1:18:10, 1.46it/s, loss=0.0229, lr=2.10e-05, step=3138] Training: 31%|███▏ | 3139/10000 [43:09<1:24:48, 1.35it/s, loss=0.0229, lr=2.10e-05, step=3138] Training: 31%|███▏ | 3139/10000 [43:09<1:24:48, 1.35it/s, loss=0.0090, lr=2.10e-05, step=3139]19:27:42.029 [I] step=3140 loss=0.0194 smoothed_loss=0.0173 lr=2.10e-05 grad_norm=0.5092 step_time=0.5545s data_time=0.1159s it/s=1.492 eta_to_10000=4598.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0195 grad_action_out_proj_arms=0.1228 grad_arm_token_fuse=0.1051 grad_shared_expert=0.4232 (18633:train_pytorch.py:850) + Training: 31%|███▏ | 3140/10000 [43:10<1:22:48, 1.38it/s, loss=0.0090, lr=2.10e-05, step=3139] Training: 31%|███▏ | 3140/10000 [43:10<1:22:48, 1.38it/s, loss=0.0194, lr=2.10e-05, step=3140] Training: 31%|███▏ | 3141/10000 [43:10<1:16:41, 1.49it/s, loss=0.0194, lr=2.10e-05, step=3140] Training: 31%|███▏ | 3141/10000 [43:10<1:16:41, 1.49it/s, loss=0.0120, lr=2.10e-05, step=3141] Training: 31%|███▏ | 3142/10000 [43:11<1:12:05, 1.59it/s, loss=0.0120, lr=2.10e-05, step=3141] Training: 31%|███▏ | 3142/10000 [43:11<1:12:05, 1.59it/s, loss=0.0128, lr=2.10e-05, step=3142] Training: 31%|███▏ | 3143/10000 [43:12<1:28:40, 1.29it/s, loss=0.0128, lr=2.10e-05, step=3142] Training: 31%|███▏ | 3143/10000 [43:12<1:28:40, 1.29it/s, loss=0.0091, lr=2.10e-05, step=3143] Training: 31%|███▏ | 3144/10000 [43:13<1:25:27, 1.34it/s, loss=0.0091, lr=2.10e-05, step=3143] Training: 31%|███▏ | 3144/10000 [43:13<1:25:27, 1.34it/s, loss=0.0230, lr=2.10e-05, step=3144] Training: 31%|███▏ | 3145/10000 [43:13<1:17:56, 1.47it/s, loss=0.0230, lr=2.10e-05, step=3144] Training: 31%|███▏ | 3145/10000 [43:13<1:17:56, 1.47it/s, loss=0.0198, lr=2.10e-05, step=3145] Training: 31%|███▏ | 3146/10000 [43:14<1:18:18, 1.46it/s, loss=0.0198, lr=2.10e-05, step=3145] Training: 31%|███▏ | 3146/10000 [43:14<1:18:18, 1.46it/s, loss=0.0594, lr=2.10e-05, step=3146] Training: 31%|███▏ | 3147/10000 [43:14<1:18:33, 1.45it/s, loss=0.0594, lr=2.10e-05, step=3146] Training: 31%|███▏ | 3147/10000 [43:14<1:18:33, 1.45it/s, loss=0.0109, lr=2.10e-05, step=3147] Training: 31%|███▏ | 3148/10000 [43:15<1:22:55, 1.38it/s, loss=0.0109, lr=2.10e-05, step=3147] Training: 31%|███▏ | 3148/10000 [43:15<1:22:55, 1.38it/s, loss=0.0090, lr=2.10e-05, step=3148] Training: 31%|███▏ | 3149/10000 [43:16<1:17:12, 1.48it/s, loss=0.0090, lr=2.10e-05, step=3148] Training: 31%|███▏ | 3149/10000 [43:16<1:17:12, 1.48it/s, loss=0.0052, lr=2.10e-05, step=3149]19:27:49.117 [I] step=3150 loss=0.0288 smoothed_loss=0.0186 lr=2.10e-05 grad_norm=0.4789 step_time=0.5887s data_time=0.1202s it/s=1.411 eta_to_10000=4855.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0203 grad_action_out_proj_arms=0.1593 grad_arm_token_fuse=0.1181 grad_shared_expert=0.5429 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3150/10000 [43:17<1:25:27, 1.34it/s, loss=0.0052, lr=2.10e-05, step=3149] Training: 32%|███▏ | 3150/10000 [43:17<1:25:27, 1.34it/s, loss=0.0288, lr=2.10e-05, step=3150] Training: 32%|███▏ | 3151/10000 [43:17<1:17:54, 1.47it/s, loss=0.0288, lr=2.10e-05, step=3150] Training: 32%|███▏ | 3151/10000 [43:17<1:17:54, 1.47it/s, loss=0.0074, lr=2.09e-05, step=3151] Training: 32%|███▏ | 3152/10000 [43:18<1:26:46, 1.32it/s, loss=0.0074, lr=2.09e-05, step=3151] Training: 32%|███▏ | 3152/10000 [43:18<1:26:46, 1.32it/s, loss=0.0118, lr=2.09e-05, step=3152] Training: 32%|███▏ | 3153/10000 [43:19<1:22:55, 1.38it/s, loss=0.0118, lr=2.09e-05, step=3152] Training: 32%|███▏ | 3153/10000 [43:19<1:22:55, 1.38it/s, loss=0.0202, lr=2.09e-05, step=3153] Training: 32%|███▏ | 3154/10000 [43:20<1:22:46, 1.38it/s, loss=0.0202, lr=2.09e-05, step=3153] Training: 32%|███▏ | 3154/10000 [43:20<1:22:46, 1.38it/s, loss=0.0157, lr=2.09e-05, step=3154] Training: 32%|███▏ | 3155/10000 [43:20<1:17:56, 1.46it/s, loss=0.0157, lr=2.09e-05, step=3154] Training: 32%|███▏ | 3155/10000 [43:20<1:17:56, 1.46it/s, loss=0.0127, lr=2.09e-05, step=3155] Training: 32%|███▏ | 3156/10000 [43:21<1:13:12, 1.56it/s, loss=0.0127, lr=2.09e-05, step=3155] Training: 32%|███▏ | 3156/10000 [43:21<1:13:12, 1.56it/s, loss=0.0226, lr=2.09e-05, step=3156] Training: 32%|███▏ | 3157/10000 [43:22<1:21:22, 1.40it/s, loss=0.0226, lr=2.09e-05, step=3156] Training: 32%|███▏ | 3157/10000 [43:22<1:21:22, 1.40it/s, loss=0.0257, lr=2.09e-05, step=3157] Training: 32%|███▏ | 3158/10000 [43:23<1:26:59, 1.31it/s, loss=0.0257, lr=2.09e-05, step=3157] Training: 32%|███▏ | 3158/10000 [43:23<1:26:59, 1.31it/s, loss=0.0729, lr=2.09e-05, step=3158] Training: 32%|███▏ | 3159/10000 [43:23<1:29:39, 1.27it/s, loss=0.0729, lr=2.09e-05, step=3158] Training: 32%|███▏ | 3159/10000 [43:23<1:29:39, 1.27it/s, loss=0.0360, lr=2.09e-05, step=3159]19:27:56.287 [I] step=3160 loss=0.0388 smoothed_loss=0.0262 lr=2.09e-05 grad_norm=0.5851 step_time=0.6019s data_time=0.1151s it/s=1.395 eta_to_10000=4903.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0206 grad_action_out_proj_arms=0.2228 grad_arm_token_fuse=0.1079 grad_shared_expert=0.5647 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3160/10000 [43:24<1:23:14, 1.37it/s, loss=0.0360, lr=2.09e-05, step=3159] Training: 32%|███▏ | 3160/10000 [43:24<1:23:14, 1.37it/s, loss=0.0388, lr=2.09e-05, step=3160] Training: 32%|███▏ | 3161/10000 [43:25<1:28:14, 1.29it/s, loss=0.0388, lr=2.09e-05, step=3160] Training: 32%|███▏ | 3161/10000 [43:25<1:28:14, 1.29it/s, loss=0.0362, lr=2.09e-05, step=3161] Training: 32%|███▏ | 3162/10000 [43:26<1:25:27, 1.33it/s, loss=0.0362, lr=2.09e-05, step=3161] Training: 32%|███▏ | 3162/10000 [43:26<1:25:27, 1.33it/s, loss=0.0096, lr=2.09e-05, step=3162] Training: 32%|███▏ | 3163/10000 [43:26<1:27:43, 1.30it/s, loss=0.0096, lr=2.09e-05, step=3162] Training: 32%|███▏ | 3163/10000 [43:26<1:27:43, 1.30it/s, loss=0.0094, lr=2.09e-05, step=3163] Training: 32%|███▏ | 3164/10000 [43:27<1:32:23, 1.23it/s, loss=0.0094, lr=2.09e-05, step=3163] Training: 32%|███▏ | 3164/10000 [43:27<1:32:23, 1.23it/s, loss=0.0099, lr=2.09e-05, step=3164] Training: 32%|███▏ | 3165/10000 [43:28<1:42:04, 1.12it/s, loss=0.0099, lr=2.09e-05, step=3164] Training: 32%|███▏ | 3165/10000 [43:28<1:42:04, 1.12it/s, loss=0.0114, lr=2.09e-05, step=3165] Training: 32%|███▏ | 3166/10000 [43:29<1:29:11, 1.28it/s, loss=0.0114, lr=2.09e-05, step=3165] Training: 32%|███▏ | 3166/10000 [43:29<1:29:11, 1.28it/s, loss=0.0440, lr=2.09e-05, step=3166] Training: 32%|███▏ | 3167/10000 [43:30<1:24:19, 1.35it/s, loss=0.0440, lr=2.09e-05, step=3166] Training: 32%|███▏ | 3167/10000 [43:30<1:24:19, 1.35it/s, loss=0.0206, lr=2.09e-05, step=3167] Training: 32%|███▏ | 3168/10000 [43:30<1:26:21, 1.32it/s, loss=0.0206, lr=2.09e-05, step=3167] Training: 32%|███▏ | 3168/10000 [43:30<1:26:21, 1.32it/s, loss=0.0595, lr=2.09e-05, step=3168] Training: 32%|███▏ | 3169/10000 [43:31<1:28:12, 1.29it/s, loss=0.0595, lr=2.09e-05, step=3168] Training: 32%|███▏ | 3169/10000 [43:31<1:28:12, 1.29it/s, loss=0.0131, lr=2.09e-05, step=3169]19:28:04.019 [I] step=3170 loss=0.0060 smoothed_loss=0.0236 lr=2.09e-05 grad_norm=0.5881 step_time=0.5981s data_time=0.1751s it/s=1.294 eta_to_10000=5280.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0135 grad_action_out_proj_arms=0.1455 grad_arm_token_fuse=0.0641 grad_shared_expert=0.3745 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3170/10000 [43:32<1:21:11, 1.40it/s, loss=0.0131, lr=2.09e-05, step=3169] Training: 32%|███▏ | 3170/10000 [43:32<1:21:11, 1.40it/s, loss=0.0060, lr=2.09e-05, step=3170] Training: 32%|███▏ | 3171/10000 [43:33<1:25:14, 1.34it/s, loss=0.0060, lr=2.09e-05, step=3170] Training: 32%|███▏ | 3171/10000 [43:33<1:25:14, 1.34it/s, loss=0.0051, lr=2.09e-05, step=3171] Training: 32%|███▏ | 3172/10000 [43:33<1:19:55, 1.42it/s, loss=0.0051, lr=2.09e-05, step=3171] Training: 32%|███▏ | 3172/10000 [43:33<1:19:55, 1.42it/s, loss=0.0050, lr=2.09e-05, step=3172] Training: 32%|███▏ | 3173/10000 [43:34<1:17:05, 1.48it/s, loss=0.0050, lr=2.09e-05, step=3172] Training: 32%|███▏ | 3173/10000 [43:34<1:17:05, 1.48it/s, loss=0.0077, lr=2.09e-05, step=3173] Training: 32%|███▏ | 3174/10000 [43:34<1:13:21, 1.55it/s, loss=0.0077, lr=2.09e-05, step=3173] Training: 32%|███▏ | 3174/10000 [43:34<1:13:21, 1.55it/s, loss=0.0039, lr=2.09e-05, step=3174] Training: 32%|███▏ | 3175/10000 [43:35<1:09:09, 1.64it/s, loss=0.0039, lr=2.09e-05, step=3174] Training: 32%|███▏ | 3175/10000 [43:35<1:09:09, 1.64it/s, loss=0.0135, lr=2.09e-05, step=3175] Training: 32%|███▏ | 3176/10000 [43:35<1:08:34, 1.66it/s, loss=0.0135, lr=2.09e-05, step=3175] Training: 32%|███▏ | 3176/10000 [43:35<1:08:34, 1.66it/s, loss=0.0188, lr=2.09e-05, step=3176] Training: 32%|███▏ | 3177/10000 [43:36<1:11:24, 1.59it/s, loss=0.0188, lr=2.09e-05, step=3176] Training: 32%|███▏ | 3177/10000 [43:36<1:11:24, 1.59it/s, loss=0.0250, lr=2.09e-05, step=3177] Training: 32%|███▏ | 3178/10000 [43:37<1:18:33, 1.45it/s, loss=0.0250, lr=2.09e-05, step=3177] Training: 32%|███▏ | 3178/10000 [43:37<1:18:33, 1.45it/s, loss=0.0140, lr=2.09e-05, step=3178] Training: 32%|███▏ | 3179/10000 [43:38<1:14:20, 1.53it/s, loss=0.0140, lr=2.09e-05, step=3178] Training: 32%|███▏ | 3179/10000 [43:38<1:14:20, 1.53it/s, loss=0.0046, lr=2.09e-05, step=3179]19:28:10.762 [I] step=3180 loss=0.0179 smoothed_loss=0.0164 lr=2.09e-05 grad_norm=0.4781 step_time=0.5720s data_time=0.1023s it/s=1.483 eta_to_10000=4598.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0286 grad_action_out_proj_arms=0.2471 grad_arm_token_fuse=0.1538 grad_shared_expert=0.6004 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3180/10000 [43:38<1:23:31, 1.36it/s, loss=0.0046, lr=2.09e-05, step=3179] Training: 32%|███▏ | 3180/10000 [43:38<1:23:31, 1.36it/s, loss=0.0179, lr=2.09e-05, step=3180] Training: 32%|███▏ | 3181/10000 [43:39<1:25:41, 1.33it/s, loss=0.0179, lr=2.09e-05, step=3180] Training: 32%|███▏ | 3181/10000 [43:39<1:25:41, 1.33it/s, loss=0.0052, lr=2.09e-05, step=3181] Training: 32%|███▏ | 3182/10000 [43:40<1:28:38, 1.28it/s, loss=0.0052, lr=2.09e-05, step=3181] Training: 32%|███▏ | 3182/10000 [43:40<1:28:38, 1.28it/s, loss=0.0245, lr=2.09e-05, step=3182] Training: 32%|███▏ | 3183/10000 [43:41<1:26:58, 1.31it/s, loss=0.0245, lr=2.09e-05, step=3182] Training: 32%|███▏ | 3183/10000 [43:41<1:26:58, 1.31it/s, loss=0.0094, lr=2.09e-05, step=3183] Training: 32%|███▏ | 3184/10000 [43:41<1:24:08, 1.35it/s, loss=0.0094, lr=2.09e-05, step=3183] Training: 32%|███▏ | 3184/10000 [43:41<1:24:08, 1.35it/s, loss=0.0363, lr=2.09e-05, step=3184] Training: 32%|███▏ | 3185/10000 [43:42<1:17:25, 1.47it/s, loss=0.0363, lr=2.09e-05, step=3184] Training: 32%|███▏ | 3185/10000 [43:42<1:17:25, 1.47it/s, loss=0.0202, lr=2.09e-05, step=3185] Training: 32%|███▏ | 3186/10000 [43:43<1:21:25, 1.39it/s, loss=0.0202, lr=2.09e-05, step=3185] Training: 32%|███▏ | 3186/10000 [43:43<1:21:25, 1.39it/s, loss=0.0058, lr=2.08e-05, step=3186] Training: 32%|███▏ | 3187/10000 [43:44<1:29:10, 1.27it/s, loss=0.0058, lr=2.08e-05, step=3186] Training: 32%|███▏ | 3187/10000 [43:44<1:29:10, 1.27it/s, loss=0.0417, lr=2.08e-05, step=3187] Training: 32%|███▏ | 3188/10000 [43:44<1:26:35, 1.31it/s, loss=0.0417, lr=2.08e-05, step=3187] Training: 32%|███▏ | 3188/10000 [43:44<1:26:35, 1.31it/s, loss=0.0089, lr=2.08e-05, step=3188] Training: 32%|███▏ | 3189/10000 [43:45<1:29:41, 1.27it/s, loss=0.0089, lr=2.08e-05, step=3188] Training: 32%|███▏ | 3189/10000 [43:45<1:29:41, 1.27it/s, loss=0.0161, lr=2.08e-05, step=3189]19:28:18.431 [I] step=3190 loss=0.0169 smoothed_loss=0.0178 lr=2.09e-05 grad_norm=0.4987 step_time=0.6148s data_time=0.1521s it/s=1.304 eta_to_10000=5222.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0306 grad_action_out_proj_arms=0.2713 grad_arm_token_fuse=0.1608 grad_shared_expert=0.5994 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3190/10000 [43:46<1:28:46, 1.28it/s, loss=0.0161, lr=2.08e-05, step=3189] Training: 32%|███▏ | 3190/10000 [43:46<1:28:46, 1.28it/s, loss=0.0169, lr=2.08e-05, step=3190] Training: 32%|███▏ | 3191/10000 [43:47<1:20:24, 1.41it/s, loss=0.0169, lr=2.08e-05, step=3190] Training: 32%|███▏ | 3191/10000 [43:47<1:20:24, 1.41it/s, loss=0.0219, lr=2.08e-05, step=3191] Training: 32%|███▏ | 3192/10000 [43:47<1:21:43, 1.39it/s, loss=0.0219, lr=2.08e-05, step=3191] Training: 32%|███▏ | 3192/10000 [43:47<1:21:43, 1.39it/s, loss=0.0085, lr=2.08e-05, step=3192] Training: 32%|███▏ | 3193/10000 [43:48<1:31:50, 1.24it/s, loss=0.0085, lr=2.08e-05, step=3192] Training: 32%|███▏ | 3193/10000 [43:48<1:31:50, 1.24it/s, loss=0.0126, lr=2.08e-05, step=3193] Training: 32%|███▏ | 3194/10000 [43:49<1:22:45, 1.37it/s, loss=0.0126, lr=2.08e-05, step=3193] Training: 32%|███▏ | 3194/10000 [43:49<1:22:45, 1.37it/s, loss=0.0337, lr=2.08e-05, step=3194] Training: 32%|███▏ | 3195/10000 [43:50<1:24:54, 1.34it/s, loss=0.0337, lr=2.08e-05, step=3194] Training: 32%|███▏ | 3195/10000 [43:50<1:24:54, 1.34it/s, loss=0.0286, lr=2.08e-05, step=3195] Training: 32%|███▏ | 3196/10000 [43:50<1:17:13, 1.47it/s, loss=0.0286, lr=2.08e-05, step=3195] Training: 32%|███▏ | 3196/10000 [43:50<1:17:13, 1.47it/s, loss=0.0918, lr=2.08e-05, step=3196] Training: 32%|███▏ | 3197/10000 [43:51<1:26:07, 1.32it/s, loss=0.0918, lr=2.08e-05, step=3196] Training: 32%|███▏ | 3197/10000 [43:51<1:26:07, 1.32it/s, loss=0.0154, lr=2.08e-05, step=3197] Training: 32%|███▏ | 3198/10000 [43:52<1:29:43, 1.26it/s, loss=0.0154, lr=2.08e-05, step=3197] Training: 32%|███▏ | 3198/10000 [43:52<1:29:43, 1.26it/s, loss=0.0215, lr=2.08e-05, step=3198] Training: 32%|███▏ | 3199/10000 [43:53<1:39:23, 1.14it/s, loss=0.0215, lr=2.08e-05, step=3198] Training: 32%|███▏ | 3199/10000 [43:53<1:39:23, 1.14it/s, loss=0.0027, lr=2.08e-05, step=3199]19:28:26.506 [I] step=3200 loss=0.0673 smoothed_loss=0.0274 lr=2.08e-05 grad_norm=0.5034 step_time=0.6530s data_time=0.1545s it/s=1.238 eta_to_10000=5490.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0276 grad_action_out_proj_arms=0.1715 grad_arm_token_fuse=0.1366 grad_shared_expert=0.4667 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3200/10000 [43:54<1:44:33, 1.08it/s, loss=0.0027, lr=2.08e-05, step=3199] Training: 32%|███▏ | 3200/10000 [43:54<1:44:33, 1.08it/s, loss=0.0673, lr=2.08e-05, step=3200] Training: 32%|███▏ | 3201/10000 [43:55<1:37:04, 1.17it/s, loss=0.0673, lr=2.08e-05, step=3200] Training: 32%|███▏ | 3201/10000 [43:55<1:37:04, 1.17it/s, loss=0.0059, lr=2.08e-05, step=3201] Training: 32%|███▏ | 3202/10000 [43:55<1:28:00, 1.29it/s, loss=0.0059, lr=2.08e-05, step=3201] Training: 32%|███▏ | 3202/10000 [43:55<1:28:00, 1.29it/s, loss=0.0104, lr=2.08e-05, step=3202] Training: 32%|███▏ | 3203/10000 [43:56<1:21:12, 1.40it/s, loss=0.0104, lr=2.08e-05, step=3202] Training: 32%|███▏ | 3203/10000 [43:56<1:21:12, 1.40it/s, loss=0.0127, lr=2.08e-05, step=3203] Training: 32%|███▏ | 3204/10000 [43:57<1:14:18, 1.52it/s, loss=0.0127, lr=2.08e-05, step=3203] Training: 32%|███▏ | 3204/10000 [43:57<1:14:18, 1.52it/s, loss=0.0072, lr=2.08e-05, step=3204] Training: 32%|███▏ | 3205/10000 [43:57<1:13:54, 1.53it/s, loss=0.0072, lr=2.08e-05, step=3204] Training: 32%|███▏ | 3205/10000 [43:57<1:13:54, 1.53it/s, loss=0.0116, lr=2.08e-05, step=3205] Training: 32%|███▏ | 3206/10000 [43:58<1:15:16, 1.50it/s, loss=0.0116, lr=2.08e-05, step=3205] Training: 32%|███▏ | 3206/10000 [43:58<1:15:16, 1.50it/s, loss=0.0125, lr=2.08e-05, step=3206] Training: 32%|███▏ | 3207/10000 [43:59<1:28:55, 1.27it/s, loss=0.0125, lr=2.08e-05, step=3206] Training: 32%|███▏ | 3207/10000 [43:59<1:28:55, 1.27it/s, loss=0.0056, lr=2.08e-05, step=3207] Training: 32%|███▏ | 3208/10000 [44:00<1:21:40, 1.39it/s, loss=0.0056, lr=2.08e-05, step=3207] Training: 32%|███▏ | 3208/10000 [44:00<1:21:40, 1.39it/s, loss=0.0564, lr=2.08e-05, step=3208] Training: 32%|███▏ | 3209/10000 [44:00<1:19:36, 1.42it/s, loss=0.0564, lr=2.08e-05, step=3208] Training: 32%|███▏ | 3209/10000 [44:00<1:19:36, 1.42it/s, loss=0.0087, lr=2.08e-05, step=3209]19:28:33.241 [I] step=3210 loss=0.0236 smoothed_loss=0.0208 lr=2.08e-05 grad_norm=0.5143 step_time=0.5561s data_time=0.1174s it/s=1.485 eta_to_10000=4572.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0207 grad_action_out_proj_arms=0.1880 grad_arm_token_fuse=0.1142 grad_shared_expert=0.4799 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3210/10000 [44:01<1:19:52, 1.42it/s, loss=0.0087, lr=2.08e-05, step=3209] Training: 32%|███▏ | 3210/10000 [44:01<1:19:52, 1.42it/s, loss=0.0236, lr=2.08e-05, step=3210] Training: 32%|███▏ | 3211/10000 [44:01<1:12:38, 1.56it/s, loss=0.0236, lr=2.08e-05, step=3210] Training: 32%|███▏ | 3211/10000 [44:01<1:12:38, 1.56it/s, loss=0.0272, lr=2.08e-05, step=3211] Training: 32%|███▏ | 3212/10000 [44:02<1:11:01, 1.59it/s, loss=0.0272, lr=2.08e-05, step=3211] Training: 32%|███▏ | 3212/10000 [44:02<1:11:01, 1.59it/s, loss=0.0484, lr=2.08e-05, step=3212] Training: 32%|███▏ | 3213/10000 [44:03<1:18:55, 1.43it/s, loss=0.0484, lr=2.08e-05, step=3212] Training: 32%|███▏ | 3213/10000 [44:03<1:18:55, 1.43it/s, loss=0.0302, lr=2.08e-05, step=3213] Training: 32%|███▏ | 3214/10000 [44:04<1:28:27, 1.28it/s, loss=0.0302, lr=2.08e-05, step=3213] Training: 32%|███▏ | 3214/10000 [44:04<1:28:27, 1.28it/s, loss=0.0306, lr=2.08e-05, step=3214] Training: 32%|███▏ | 3215/10000 [44:04<1:20:30, 1.40it/s, loss=0.0306, lr=2.08e-05, step=3214] Training: 32%|███▏ | 3215/10000 [44:04<1:20:30, 1.40it/s, loss=0.0392, lr=2.08e-05, step=3215] Training: 32%|███▏ | 3216/10000 [44:05<1:15:46, 1.49it/s, loss=0.0392, lr=2.08e-05, step=3215] Training: 32%|███▏ | 3216/10000 [44:05<1:15:46, 1.49it/s, loss=0.0149, lr=2.08e-05, step=3216] Training: 32%|███▏ | 3217/10000 [44:06<1:18:56, 1.43it/s, loss=0.0149, lr=2.08e-05, step=3216] Training: 32%|███▏ | 3217/10000 [44:06<1:18:56, 1.43it/s, loss=0.0311, lr=2.08e-05, step=3217] Training: 32%|███▏ | 3218/10000 [44:06<1:12:14, 1.56it/s, loss=0.0311, lr=2.08e-05, step=3217] Training: 32%|███▏ | 3218/10000 [44:06<1:12:14, 1.56it/s, loss=0.0071, lr=2.08e-05, step=3218] Training: 32%|███▏ | 3219/10000 [44:07<1:09:28, 1.63it/s, loss=0.0071, lr=2.08e-05, step=3218] Training: 32%|███▏ | 3219/10000 [44:07<1:09:28, 1.63it/s, loss=0.0103, lr=2.08e-05, step=3219]19:28:39.897 [I] step=3220 loss=0.0414 smoothed_loss=0.0247 lr=2.08e-05 grad_norm=0.5397 step_time=0.5503s data_time=0.1153s it/s=1.503 eta_to_10000=4512.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1605 grad_arm_token_fuse=0.0735 grad_shared_expert=0.4325 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3220/10000 [44:08<1:15:14, 1.50it/s, loss=0.0103, lr=2.08e-05, step=3219] Training: 32%|███▏ | 3220/10000 [44:08<1:15:14, 1.50it/s, loss=0.0414, lr=2.08e-05, step=3220] Training: 32%|███▏ | 3221/10000 [44:09<1:36:24, 1.17it/s, loss=0.0414, lr=2.08e-05, step=3220] Training: 32%|███▏ | 3221/10000 [44:09<1:36:24, 1.17it/s, loss=0.0163, lr=2.07e-05, step=3221] Training: 32%|███▏ | 3222/10000 [44:10<1:51:36, 1.01it/s, loss=0.0163, lr=2.07e-05, step=3221] Training: 32%|███▏ | 3222/10000 [44:10<1:51:36, 1.01it/s, loss=0.0261, lr=2.07e-05, step=3222] Training: 32%|███▏ | 3223/10000 [44:11<1:44:59, 1.08it/s, loss=0.0261, lr=2.07e-05, step=3222] Training: 32%|███▏ | 3223/10000 [44:11<1:44:59, 1.08it/s, loss=0.0955, lr=2.07e-05, step=3223] Training: 32%|███▏ | 3224/10000 [44:12<1:36:37, 1.17it/s, loss=0.0955, lr=2.07e-05, step=3223] Training: 32%|███▏ | 3224/10000 [44:12<1:36:37, 1.17it/s, loss=0.0149, lr=2.07e-05, step=3224] Training: 32%|███▏ | 3225/10000 [44:12<1:30:59, 1.24it/s, loss=0.0149, lr=2.07e-05, step=3224] Training: 32%|███▏ | 3225/10000 [44:12<1:30:59, 1.24it/s, loss=0.0061, lr=2.07e-05, step=3225] Training: 32%|███▏ | 3226/10000 [44:13<1:21:07, 1.39it/s, loss=0.0061, lr=2.07e-05, step=3225] Training: 32%|███▏ | 3226/10000 [44:13<1:21:07, 1.39it/s, loss=0.0107, lr=2.07e-05, step=3226] Training: 32%|███▏ | 3227/10000 [44:14<1:22:35, 1.37it/s, loss=0.0107, lr=2.07e-05, step=3226] Training: 32%|███▏ | 3227/10000 [44:14<1:22:35, 1.37it/s, loss=0.0182, lr=2.07e-05, step=3227] Training: 32%|███▏ | 3228/10000 [44:14<1:22:11, 1.37it/s, loss=0.0182, lr=2.07e-05, step=3227] Training: 32%|███▏ | 3228/10000 [44:14<1:22:11, 1.37it/s, loss=0.0170, lr=2.07e-05, step=3228] Training: 32%|███▏ | 3229/10000 [44:15<1:24:28, 1.34it/s, loss=0.0170, lr=2.07e-05, step=3228] Training: 32%|███▏ | 3229/10000 [44:15<1:24:28, 1.34it/s, loss=0.0078, lr=2.07e-05, step=3229]19:28:48.034 [I] step=3230 loss=0.0233 smoothed_loss=0.0225 lr=2.07e-05 grad_norm=0.5073 step_time=0.6744s data_time=0.1393s it/s=1.229 eta_to_10000=5508.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1314 grad_arm_token_fuse=0.0692 grad_shared_expert=0.4714 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3230/10000 [44:16<1:18:56, 1.43it/s, loss=0.0078, lr=2.07e-05, step=3229] Training: 32%|███▏ | 3230/10000 [44:16<1:18:56, 1.43it/s, loss=0.0233, lr=2.07e-05, step=3230] Training: 32%|███▏ | 3231/10000 [44:16<1:14:52, 1.51it/s, loss=0.0233, lr=2.07e-05, step=3230] Training: 32%|███▏ | 3231/10000 [44:16<1:14:52, 1.51it/s, loss=0.0331, lr=2.07e-05, step=3231] Training: 32%|███▏ | 3232/10000 [44:17<1:12:26, 1.56it/s, loss=0.0331, lr=2.07e-05, step=3231] Training: 32%|███▏ | 3232/10000 [44:17<1:12:26, 1.56it/s, loss=0.0181, lr=2.07e-05, step=3232] Training: 32%|███▏ | 3233/10000 [44:17<1:09:19, 1.63it/s, loss=0.0181, lr=2.07e-05, step=3232] Training: 32%|███▏ | 3233/10000 [44:17<1:09:19, 1.63it/s, loss=0.0168, lr=2.07e-05, step=3233] Training: 32%|███▏ | 3234/10000 [44:18<1:09:32, 1.62it/s, loss=0.0168, lr=2.07e-05, step=3233] Training: 32%|███▏ | 3234/10000 [44:18<1:09:32, 1.62it/s, loss=0.0184, lr=2.07e-05, step=3234] Training: 32%|███▏ | 3235/10000 [44:19<1:15:09, 1.50it/s, loss=0.0184, lr=2.07e-05, step=3234] Training: 32%|███▏ | 3235/10000 [44:19<1:15:09, 1.50it/s, loss=0.0213, lr=2.07e-05, step=3235] Training: 32%|███▏ | 3236/10000 [44:20<1:23:26, 1.35it/s, loss=0.0213, lr=2.07e-05, step=3235] Training: 32%|███▏ | 3236/10000 [44:20<1:23:26, 1.35it/s, loss=0.0100, lr=2.07e-05, step=3236] Training: 32%|███▏ | 3237/10000 [44:20<1:20:42, 1.40it/s, loss=0.0100, lr=2.07e-05, step=3236] Training: 32%|███▏ | 3237/10000 [44:20<1:20:42, 1.40it/s, loss=0.0149, lr=2.07e-05, step=3237] Training: 32%|███▏ | 3238/10000 [44:21<1:17:26, 1.46it/s, loss=0.0149, lr=2.07e-05, step=3237] Training: 32%|███▏ | 3238/10000 [44:21<1:17:26, 1.46it/s, loss=0.0338, lr=2.07e-05, step=3238] Training: 32%|███▏ | 3239/10000 [44:22<1:13:07, 1.54it/s, loss=0.0338, lr=2.07e-05, step=3238] Training: 32%|███▏ | 3239/10000 [44:22<1:13:07, 1.54it/s, loss=0.0130, lr=2.07e-05, step=3239]19:28:54.516 [I] step=3240 loss=0.0256 smoothed_loss=0.0212 lr=2.07e-05 grad_norm=0.5305 step_time=0.5383s data_time=0.1099s it/s=1.543 eta_to_10000=4381.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.1490 grad_arm_token_fuse=0.0652 grad_shared_expert=0.4982 (18633:train_pytorch.py:850) + Training: 32%|███▏ | 3240/10000 [44:22<1:11:35, 1.57it/s, loss=0.0130, lr=2.07e-05, step=3239] Training: 32%|███▏ | 3240/10000 [44:22<1:11:35, 1.57it/s, loss=0.0256, lr=2.07e-05, step=3240] Training: 32%|███▏ | 3241/10000 [44:23<1:16:02, 1.48it/s, loss=0.0256, lr=2.07e-05, step=3240] Training: 32%|███▏ | 3241/10000 [44:23<1:16:02, 1.48it/s, loss=0.0159, lr=2.07e-05, step=3241] Training: 32%|███▏ | 3242/10000 [44:24<1:19:17, 1.42it/s, loss=0.0159, lr=2.07e-05, step=3241] Training: 32%|███▏ | 3242/10000 [44:24<1:19:17, 1.42it/s, loss=0.0162, lr=2.07e-05, step=3242] Training: 32%|███▏ | 3243/10000 [44:25<1:24:36, 1.33it/s, loss=0.0162, lr=2.07e-05, step=3242] Training: 32%|███▏ | 3243/10000 [44:25<1:24:36, 1.33it/s, loss=0.0214, lr=2.07e-05, step=3243] Training: 32%|███▏ | 3244/10000 [44:26<1:30:28, 1.24it/s, loss=0.0214, lr=2.07e-05, step=3243] Training: 32%|███▏ | 3244/10000 [44:26<1:30:28, 1.24it/s, loss=0.0293, lr=2.07e-05, step=3244] Training: 32%|███▏ | 3245/10000 [44:26<1:21:21, 1.38it/s, loss=0.0293, lr=2.07e-05, step=3244] Training: 32%|███▏ | 3245/10000 [44:26<1:21:21, 1.38it/s, loss=0.0189, lr=2.07e-05, step=3245] Training: 32%|███▏ | 3246/10000 [44:27<1:16:02, 1.48it/s, loss=0.0189, lr=2.07e-05, step=3245] Training: 32%|███▏ | 3246/10000 [44:27<1:16:02, 1.48it/s, loss=0.0128, lr=2.07e-05, step=3246] Training: 32%|███▏ | 3247/10000 [44:27<1:17:29, 1.45it/s, loss=0.0128, lr=2.07e-05, step=3246] Training: 32%|███▏ | 3247/10000 [44:27<1:17:29, 1.45it/s, loss=0.0089, lr=2.07e-05, step=3247] Training: 32%|███▏ | 3248/10000 [44:28<1:17:19, 1.46it/s, loss=0.0089, lr=2.07e-05, step=3247] Training: 32%|███▏ | 3248/10000 [44:28<1:17:19, 1.46it/s, loss=0.0288, lr=2.07e-05, step=3248] Training: 32%|███▏ | 3249/10000 [44:29<1:11:06, 1.58it/s, loss=0.0288, lr=2.07e-05, step=3248] Training: 32%|███▏ | 3249/10000 [44:29<1:11:06, 1.58it/s, loss=0.0200, lr=2.07e-05, step=3249]19:29:01.997 [I] step=3250 loss=0.0069 smoothed_loss=0.0187 lr=2.07e-05 grad_norm=0.5243 step_time=0.6011s data_time=0.1469s it/s=1.337 eta_to_10000=5049.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0090 grad_action_out_proj_arms=0.0995 grad_arm_token_fuse=0.0427 grad_shared_expert=0.2295 (18633:train_pytorch.py:850) + Training: 32%|███▎ | 3250/10000 [44:30<1:28:36, 1.27it/s, loss=0.0200, lr=2.07e-05, step=3249] Training: 32%|███▎ | 3250/10000 [44:30<1:28:36, 1.27it/s, loss=0.0069, lr=2.07e-05, step=3250] Training: 33%|███▎ | 3251/10000 [44:30<1:27:16, 1.29it/s, loss=0.0069, lr=2.07e-05, step=3250] Training: 33%|███▎ | 3251/10000 [44:30<1:27:16, 1.29it/s, loss=0.0157, lr=2.07e-05, step=3251] Training: 33%|███▎ | 3252/10000 [44:31<1:30:11, 1.25it/s, loss=0.0157, lr=2.07e-05, step=3251] Training: 33%|███▎ | 3252/10000 [44:31<1:30:11, 1.25it/s, loss=0.0312, lr=2.07e-05, step=3252] Training: 33%|███▎ | 3253/10000 [44:32<1:29:22, 1.26it/s, loss=0.0312, lr=2.07e-05, step=3252] Training: 33%|███▎ | 3253/10000 [44:32<1:29:22, 1.26it/s, loss=0.0257, lr=2.07e-05, step=3253] Training: 33%|███▎ | 3254/10000 [44:33<1:20:18, 1.40it/s, loss=0.0257, lr=2.07e-05, step=3253] Training: 33%|███▎ | 3254/10000 [44:33<1:20:18, 1.40it/s, loss=0.1007, lr=2.07e-05, step=3254] Training: 33%|███▎ | 3255/10000 [44:33<1:26:03, 1.31it/s, loss=0.1007, lr=2.07e-05, step=3254] Training: 33%|███▎ | 3255/10000 [44:33<1:26:03, 1.31it/s, loss=0.0226, lr=2.06e-05, step=3255] Training: 33%|███▎ | 3256/10000 [44:34<1:20:28, 1.40it/s, loss=0.0226, lr=2.06e-05, step=3255] Training: 33%|███▎ | 3256/10000 [44:34<1:20:28, 1.40it/s, loss=0.0103, lr=2.06e-05, step=3256] Training: 33%|███▎ | 3257/10000 [44:35<1:27:18, 1.29it/s, loss=0.0103, lr=2.06e-05, step=3256] Training: 33%|███▎ | 3257/10000 [44:35<1:27:18, 1.29it/s, loss=0.0209, lr=2.06e-05, step=3257] Training: 33%|███▎ | 3258/10000 [44:36<1:35:16, 1.18it/s, loss=0.0209, lr=2.06e-05, step=3257] Training: 33%|███▎ | 3258/10000 [44:36<1:35:16, 1.18it/s, loss=0.0113, lr=2.06e-05, step=3258] Training: 33%|███▎ | 3259/10000 [44:37<1:45:13, 1.07it/s, loss=0.0113, lr=2.06e-05, step=3258] Training: 33%|███▎ | 3259/10000 [44:37<1:45:13, 1.07it/s, loss=0.2061, lr=2.06e-05, step=3259]19:29:10.223 [I] step=3260 loss=0.0312 smoothed_loss=0.0412 lr=2.06e-05 grad_norm=0.5695 step_time=0.6439s data_time=0.1787s it/s=1.216 eta_to_10000=5542.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.1480 grad_arm_token_fuse=0.0712 grad_shared_expert=0.4993 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3260/10000 [44:38<1:38:47, 1.14it/s, loss=0.2061, lr=2.06e-05, step=3259] Training: 33%|███▎ | 3260/10000 [44:38<1:38:47, 1.14it/s, loss=0.0312, lr=2.06e-05, step=3260] Training: 33%|███▎ | 3261/10000 [44:38<1:28:10, 1.27it/s, loss=0.0312, lr=2.06e-05, step=3260] Training: 33%|███▎ | 3261/10000 [44:38<1:28:10, 1.27it/s, loss=0.0125, lr=2.06e-05, step=3261] Training: 33%|███▎ | 3262/10000 [44:39<1:29:52, 1.25it/s, loss=0.0125, lr=2.06e-05, step=3261] Training: 33%|███▎ | 3262/10000 [44:39<1:29:52, 1.25it/s, loss=0.0383, lr=2.06e-05, step=3262] Training: 33%|███▎ | 3263/10000 [44:40<1:21:59, 1.37it/s, loss=0.0383, lr=2.06e-05, step=3262] Training: 33%|███▎ | 3263/10000 [44:40<1:21:59, 1.37it/s, loss=0.0063, lr=2.06e-05, step=3263] Training: 33%|███▎ | 3264/10000 [44:41<1:27:04, 1.29it/s, loss=0.0063, lr=2.06e-05, step=3263] Training: 33%|███▎ | 3264/10000 [44:41<1:27:04, 1.29it/s, loss=0.0217, lr=2.06e-05, step=3264] Training: 33%|███▎ | 3265/10000 [44:42<1:32:09, 1.22it/s, loss=0.0217, lr=2.06e-05, step=3264] Training: 33%|███▎ | 3265/10000 [44:42<1:32:09, 1.22it/s, loss=0.0924, lr=2.06e-05, step=3265] Training: 33%|███▎ | 3266/10000 [44:42<1:32:07, 1.22it/s, loss=0.0924, lr=2.06e-05, step=3265] Training: 33%|███▎ | 3266/10000 [44:42<1:32:07, 1.22it/s, loss=0.0338, lr=2.06e-05, step=3266] Training: 33%|███▎ | 3267/10000 [44:43<1:25:51, 1.31it/s, loss=0.0338, lr=2.06e-05, step=3266] Training: 33%|███▎ | 3267/10000 [44:43<1:25:51, 1.31it/s, loss=0.0178, lr=2.06e-05, step=3267] Training: 33%|███▎ | 3268/10000 [44:44<1:17:13, 1.45it/s, loss=0.0178, lr=2.06e-05, step=3267] Training: 33%|███▎ | 3268/10000 [44:44<1:17:13, 1.45it/s, loss=0.0125, lr=2.06e-05, step=3268] Training: 33%|███▎ | 3269/10000 [44:44<1:14:00, 1.52it/s, loss=0.0125, lr=2.06e-05, step=3268] Training: 33%|███▎ | 3269/10000 [44:44<1:14:00, 1.52it/s, loss=0.1188, lr=2.06e-05, step=3269]19:29:17.292 [I] step=3270 loss=0.0179 smoothed_loss=0.0404 lr=2.06e-05 grad_norm=0.6858 step_time=0.5668s data_time=0.1401s it/s=1.415 eta_to_10000=4757.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0087 grad_action_out_proj_arms=0.1235 grad_arm_token_fuse=0.0483 grad_shared_expert=0.3326 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3270/10000 [44:45<1:16:34, 1.46it/s, loss=0.1188, lr=2.06e-05, step=3269] Training: 33%|███▎ | 3270/10000 [44:45<1:16:34, 1.46it/s, loss=0.0179, lr=2.06e-05, step=3270] Training: 33%|███▎ | 3271/10000 [44:46<1:19:14, 1.42it/s, loss=0.0179, lr=2.06e-05, step=3270] Training: 33%|███▎ | 3271/10000 [44:46<1:19:14, 1.42it/s, loss=0.0117, lr=2.06e-05, step=3271] Training: 33%|███▎ | 3272/10000 [44:46<1:16:46, 1.46it/s, loss=0.0117, lr=2.06e-05, step=3271] Training: 33%|███▎ | 3272/10000 [44:46<1:16:46, 1.46it/s, loss=0.0816, lr=2.06e-05, step=3272] Training: 33%|███▎ | 3273/10000 [44:47<1:18:53, 1.42it/s, loss=0.0816, lr=2.06e-05, step=3272] Training: 33%|███▎ | 3273/10000 [44:47<1:18:53, 1.42it/s, loss=0.0144, lr=2.06e-05, step=3273] Training: 33%|███▎ | 3274/10000 [44:48<1:23:53, 1.34it/s, loss=0.0144, lr=2.06e-05, step=3273] Training: 33%|███▎ | 3274/10000 [44:48<1:23:53, 1.34it/s, loss=0.0087, lr=2.06e-05, step=3274] Training: 33%|███▎ | 3275/10000 [44:49<1:23:07, 1.35it/s, loss=0.0087, lr=2.06e-05, step=3274] Training: 33%|███▎ | 3275/10000 [44:49<1:23:07, 1.35it/s, loss=0.0180, lr=2.06e-05, step=3275] Training: 33%|███▎ | 3276/10000 [44:49<1:16:47, 1.46it/s, loss=0.0180, lr=2.06e-05, step=3275] Training: 33%|███▎ | 3276/10000 [44:49<1:16:47, 1.46it/s, loss=0.0368, lr=2.06e-05, step=3276] Training: 33%|███▎ | 3277/10000 [44:50<1:19:00, 1.42it/s, loss=0.0368, lr=2.06e-05, step=3276] Training: 33%|███▎ | 3277/10000 [44:50<1:19:00, 1.42it/s, loss=0.0204, lr=2.06e-05, step=3277] Training: 33%|███▎ | 3278/10000 [44:51<1:27:25, 1.28it/s, loss=0.0204, lr=2.06e-05, step=3277] Training: 33%|███▎ | 3278/10000 [44:51<1:27:25, 1.28it/s, loss=0.0090, lr=2.06e-05, step=3278] Training: 33%|███▎ | 3279/10000 [44:52<1:27:18, 1.28it/s, loss=0.0090, lr=2.06e-05, step=3278] Training: 33%|███▎ | 3279/10000 [44:52<1:27:18, 1.28it/s, loss=0.0381, lr=2.06e-05, step=3279]19:29:24.754 [I] step=3280 loss=0.0286 smoothed_loss=0.0312 lr=2.06e-05 grad_norm=0.5614 step_time=0.6223s data_time=0.1239s it/s=1.340 eta_to_10000=5013.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0168 grad_action_out_proj_arms=0.1680 grad_arm_token_fuse=0.0861 grad_shared_expert=0.4310 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3280/10000 [44:52<1:24:39, 1.32it/s, loss=0.0381, lr=2.06e-05, step=3279] Training: 33%|███▎ | 3280/10000 [44:52<1:24:39, 1.32it/s, loss=0.0286, lr=2.06e-05, step=3280] Training: 33%|███▎ | 3281/10000 [44:53<1:22:03, 1.36it/s, loss=0.0286, lr=2.06e-05, step=3280] Training: 33%|███▎ | 3281/10000 [44:53<1:22:03, 1.36it/s, loss=0.0120, lr=2.06e-05, step=3281] Training: 33%|███▎ | 3282/10000 [44:54<1:17:37, 1.44it/s, loss=0.0120, lr=2.06e-05, step=3281] Training: 33%|███▎ | 3282/10000 [44:54<1:17:37, 1.44it/s, loss=0.0194, lr=2.06e-05, step=3282] Training: 33%|███▎ | 3283/10000 [44:54<1:14:45, 1.50it/s, loss=0.0194, lr=2.06e-05, step=3282] Training: 33%|███▎ | 3283/10000 [44:54<1:14:45, 1.50it/s, loss=0.0164, lr=2.06e-05, step=3283] Training: 33%|███▎ | 3284/10000 [44:55<1:10:30, 1.59it/s, loss=0.0164, lr=2.06e-05, step=3283] Training: 33%|███▎ | 3284/10000 [44:55<1:10:30, 1.59it/s, loss=0.0140, lr=2.06e-05, step=3284] Training: 33%|███▎ | 3285/10000 [44:56<1:14:21, 1.51it/s, loss=0.0140, lr=2.06e-05, step=3284] Training: 33%|███▎ | 3285/10000 [44:56<1:14:21, 1.51it/s, loss=0.0100, lr=2.06e-05, step=3285] Training: 33%|███▎ | 3286/10000 [44:56<1:20:57, 1.38it/s, loss=0.0100, lr=2.06e-05, step=3285] Training: 33%|███▎ | 3286/10000 [44:56<1:20:57, 1.38it/s, loss=0.0144, lr=2.06e-05, step=3286] Training: 33%|███▎ | 3287/10000 [44:57<1:17:51, 1.44it/s, loss=0.0144, lr=2.06e-05, step=3286] Training: 33%|███▎ | 3287/10000 [44:57<1:17:51, 1.44it/s, loss=0.0050, lr=2.06e-05, step=3287] Training: 33%|███▎ | 3288/10000 [44:58<1:15:35, 1.48it/s, loss=0.0050, lr=2.06e-05, step=3287] Training: 33%|███▎ | 3288/10000 [44:58<1:15:35, 1.48it/s, loss=0.0471, lr=2.06e-05, step=3288] Training: 33%|███▎ | 3289/10000 [44:59<1:25:57, 1.30it/s, loss=0.0471, lr=2.06e-05, step=3288] Training: 33%|███▎ | 3289/10000 [44:59<1:25:57, 1.30it/s, loss=0.0249, lr=2.05e-05, step=3289]19:29:31.591 [I] step=3290 loss=0.0337 smoothed_loss=0.0250 lr=2.06e-05 grad_norm=0.6012 step_time=0.5643s data_time=0.1194s it/s=1.463 eta_to_10000=4586.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.1565 grad_arm_token_fuse=0.0712 grad_shared_expert=0.4981 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3290/10000 [44:59<1:18:48, 1.42it/s, loss=0.0249, lr=2.05e-05, step=3289] Training: 33%|███▎ | 3290/10000 [44:59<1:18:48, 1.42it/s, loss=0.0337, lr=2.05e-05, step=3290] Training: 33%|███▎ | 3291/10000 [45:00<1:23:25, 1.34it/s, loss=0.0337, lr=2.05e-05, step=3290] Training: 33%|███▎ | 3291/10000 [45:00<1:23:25, 1.34it/s, loss=0.0364, lr=2.05e-05, step=3291] Training: 33%|███▎ | 3292/10000 [45:01<1:20:42, 1.39it/s, loss=0.0364, lr=2.05e-05, step=3291] Training: 33%|███▎ | 3292/10000 [45:01<1:20:42, 1.39it/s, loss=0.0133, lr=2.05e-05, step=3292] Training: 33%|███▎ | 3293/10000 [45:02<1:25:18, 1.31it/s, loss=0.0133, lr=2.05e-05, step=3292] Training: 33%|███▎ | 3293/10000 [45:02<1:25:18, 1.31it/s, loss=0.0139, lr=2.05e-05, step=3293] Training: 33%|███▎ | 3294/10000 [45:02<1:24:43, 1.32it/s, loss=0.0139, lr=2.05e-05, step=3293] Training: 33%|███▎ | 3294/10000 [45:02<1:24:43, 1.32it/s, loss=0.0086, lr=2.05e-05, step=3294] Training: 33%|███▎ | 3295/10000 [45:03<1:21:47, 1.37it/s, loss=0.0086, lr=2.05e-05, step=3294] Training: 33%|███▎ | 3295/10000 [45:03<1:21:47, 1.37it/s, loss=0.0031, lr=2.05e-05, step=3295] Training: 33%|███▎ | 3296/10000 [45:04<1:24:23, 1.32it/s, loss=0.0031, lr=2.05e-05, step=3295] Training: 33%|███▎ | 3296/10000 [45:04<1:24:23, 1.32it/s, loss=0.1096, lr=2.05e-05, step=3296] Training: 33%|███▎ | 3297/10000 [45:05<1:23:26, 1.34it/s, loss=0.1096, lr=2.05e-05, step=3296] Training: 33%|███▎ | 3297/10000 [45:05<1:23:26, 1.34it/s, loss=0.0055, lr=2.05e-05, step=3297] Training: 33%|███▎ | 3298/10000 [45:05<1:22:56, 1.35it/s, loss=0.0055, lr=2.05e-05, step=3297] Training: 33%|███▎ | 3298/10000 [45:05<1:22:56, 1.35it/s, loss=0.0208, lr=2.05e-05, step=3298] Training: 33%|███▎ | 3299/10000 [45:06<1:22:06, 1.36it/s, loss=0.0208, lr=2.05e-05, step=3298] Training: 33%|███▎ | 3299/10000 [45:06<1:22:06, 1.36it/s, loss=0.0058, lr=2.05e-05, step=3299]19:29:39.359 [I] step=3300 loss=0.0088 smoothed_loss=0.0227 lr=2.05e-05 grad_norm=0.5085 step_time=0.6271s data_time=0.1497s it/s=1.287 eta_to_10000=5203.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0365 grad_action_out_proj_arms=0.1838 grad_arm_token_fuse=0.2137 grad_shared_expert=0.5782 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3300/10000 [45:07<1:30:49, 1.23it/s, loss=0.0058, lr=2.05e-05, step=3299] Training: 33%|███▎ | 3300/10000 [45:07<1:30:49, 1.23it/s, loss=0.0088, lr=2.05e-05, step=3300] Training: 33%|███▎ | 3301/10000 [45:08<1:27:45, 1.27it/s, loss=0.0088, lr=2.05e-05, step=3300] Training: 33%|███▎ | 3301/10000 [45:08<1:27:45, 1.27it/s, loss=0.0192, lr=2.05e-05, step=3301] Training: 33%|███▎ | 3302/10000 [45:08<1:20:03, 1.39it/s, loss=0.0192, lr=2.05e-05, step=3301] Training: 33%|███▎ | 3302/10000 [45:08<1:20:03, 1.39it/s, loss=0.0327, lr=2.05e-05, step=3302] Training: 33%|███▎ | 3303/10000 [45:09<1:24:41, 1.32it/s, loss=0.0327, lr=2.05e-05, step=3302] Training: 33%|███▎ | 3303/10000 [45:09<1:24:41, 1.32it/s, loss=0.0055, lr=2.05e-05, step=3303] Training: 33%|███▎ | 3304/10000 [45:10<1:30:32, 1.23it/s, loss=0.0055, lr=2.05e-05, step=3303] Training: 33%|███▎ | 3304/10000 [45:10<1:30:32, 1.23it/s, loss=0.0293, lr=2.05e-05, step=3304] Training: 33%|███▎ | 3305/10000 [45:11<1:30:28, 1.23it/s, loss=0.0293, lr=2.05e-05, step=3304] Training: 33%|███▎ | 3305/10000 [45:11<1:30:28, 1.23it/s, loss=0.0204, lr=2.05e-05, step=3305] Training: 33%|███▎ | 3306/10000 [45:11<1:22:07, 1.36it/s, loss=0.0204, lr=2.05e-05, step=3305] Training: 33%|███▎ | 3306/10000 [45:11<1:22:07, 1.36it/s, loss=0.0115, lr=2.05e-05, step=3306] Training: 33%|███▎ | 3307/10000 [45:12<1:22:31, 1.35it/s, loss=0.0115, lr=2.05e-05, step=3306] Training: 33%|███▎ | 3307/10000 [45:12<1:22:31, 1.35it/s, loss=0.0155, lr=2.05e-05, step=3307] Training: 33%|███▎ | 3308/10000 [45:13<1:21:13, 1.37it/s, loss=0.0155, lr=2.05e-05, step=3307] Training: 33%|███▎ | 3308/10000 [45:13<1:21:13, 1.37it/s, loss=0.0307, lr=2.05e-05, step=3308] Training: 33%|███▎ | 3309/10000 [45:14<1:25:22, 1.31it/s, loss=0.0307, lr=2.05e-05, step=3308] Training: 33%|███▎ | 3309/10000 [45:14<1:25:22, 1.31it/s, loss=0.0165, lr=2.05e-05, step=3309]19:29:46.872 [I] step=3310 loss=0.0122 smoothed_loss=0.0202 lr=2.05e-05 grad_norm=0.5440 step_time=0.5938s data_time=0.1575s it/s=1.331 eta_to_10000=5025.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0303 grad_action_out_proj_arms=0.2099 grad_arm_token_fuse=0.1675 grad_shared_expert=0.4893 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3310/10000 [45:15<1:25:31, 1.30it/s, loss=0.0165, lr=2.05e-05, step=3309] Training: 33%|███▎ | 3310/10000 [45:15<1:25:31, 1.30it/s, loss=0.0122, lr=2.05e-05, step=3310] Training: 33%|███▎ | 3311/10000 [45:15<1:20:30, 1.38it/s, loss=0.0122, lr=2.05e-05, step=3310] Training: 33%|███▎ | 3311/10000 [45:15<1:20:30, 1.38it/s, loss=0.0033, lr=2.05e-05, step=3311] Training: 33%|███▎ | 3312/10000 [45:16<1:34:41, 1.18it/s, loss=0.0033, lr=2.05e-05, step=3311] Training: 33%|███▎ | 3312/10000 [45:16<1:34:41, 1.18it/s, loss=0.0625, lr=2.05e-05, step=3312] Training: 33%|███▎ | 3313/10000 [45:17<1:27:39, 1.27it/s, loss=0.0625, lr=2.05e-05, step=3312] Training: 33%|███▎ | 3313/10000 [45:17<1:27:39, 1.27it/s, loss=0.0346, lr=2.05e-05, step=3313] Training: 33%|███▎ | 3314/10000 [45:18<1:37:21, 1.14it/s, loss=0.0346, lr=2.05e-05, step=3313] Training: 33%|███▎ | 3314/10000 [45:18<1:37:21, 1.14it/s, loss=0.0160, lr=2.05e-05, step=3314] Training: 33%|███▎ | 3315/10000 [45:19<1:34:35, 1.18it/s, loss=0.0160, lr=2.05e-05, step=3314] Training: 33%|███▎ | 3315/10000 [45:19<1:34:35, 1.18it/s, loss=0.0125, lr=2.05e-05, step=3315] Training: 33%|███▎ | 3316/10000 [45:19<1:23:38, 1.33it/s, loss=0.0125, lr=2.05e-05, step=3315] Training: 33%|███▎ | 3316/10000 [45:19<1:23:38, 1.33it/s, loss=0.0122, lr=2.05e-05, step=3316] Training: 33%|███▎ | 3317/10000 [45:20<1:26:18, 1.29it/s, loss=0.0122, lr=2.05e-05, step=3316] Training: 33%|███▎ | 3317/10000 [45:20<1:26:18, 1.29it/s, loss=0.0384, lr=2.05e-05, step=3317] Training: 33%|███▎ | 3318/10000 [45:21<1:22:57, 1.34it/s, loss=0.0384, lr=2.05e-05, step=3317] Training: 33%|███▎ | 3318/10000 [45:21<1:22:57, 1.34it/s, loss=0.0171, lr=2.05e-05, step=3318] Training: 33%|███▎ | 3319/10000 [45:22<1:24:52, 1.31it/s, loss=0.0171, lr=2.05e-05, step=3318] Training: 33%|███▎ | 3319/10000 [45:22<1:24:52, 1.31it/s, loss=0.0022, lr=2.05e-05, step=3319]19:29:54.693 [I] step=3320 loss=0.0216 smoothed_loss=0.0204 lr=2.05e-05 grad_norm=0.5164 step_time=0.6196s data_time=0.1625s it/s=1.279 eta_to_10000=5224.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0181 grad_action_out_proj_arms=0.2122 grad_arm_token_fuse=0.0972 grad_shared_expert=0.4893 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3320/10000 [45:22<1:23:23, 1.34it/s, loss=0.0022, lr=2.05e-05, step=3319] Training: 33%|███▎ | 3320/10000 [45:22<1:23:23, 1.34it/s, loss=0.0216, lr=2.05e-05, step=3320] Training: 33%|███▎ | 3321/10000 [45:23<1:29:56, 1.24it/s, loss=0.0216, lr=2.05e-05, step=3320] Training: 33%|███▎ | 3321/10000 [45:23<1:29:56, 1.24it/s, loss=0.0111, lr=2.05e-05, step=3321] Training: 33%|███▎ | 3322/10000 [45:24<1:21:05, 1.37it/s, loss=0.0111, lr=2.05e-05, step=3321] Training: 33%|███▎ | 3322/10000 [45:24<1:21:05, 1.37it/s, loss=0.0159, lr=2.04e-05, step=3322] Training: 33%|███▎ | 3323/10000 [45:25<1:21:58, 1.36it/s, loss=0.0159, lr=2.04e-05, step=3322] Training: 33%|███▎ | 3323/10000 [45:25<1:21:58, 1.36it/s, loss=0.0091, lr=2.04e-05, step=3323] Training: 33%|███▎ | 3324/10000 [45:25<1:18:24, 1.42it/s, loss=0.0091, lr=2.04e-05, step=3323] Training: 33%|███▎ | 3324/10000 [45:25<1:18:24, 1.42it/s, loss=0.0084, lr=2.04e-05, step=3324] Training: 33%|███▎ | 3325/10000 [45:26<1:20:27, 1.38it/s, loss=0.0084, lr=2.04e-05, step=3324] Training: 33%|███▎ | 3325/10000 [45:26<1:20:27, 1.38it/s, loss=0.0468, lr=2.04e-05, step=3325] Training: 33%|███▎ | 3326/10000 [45:27<1:22:10, 1.35it/s, loss=0.0468, lr=2.04e-05, step=3325] Training: 33%|███▎ | 3326/10000 [45:27<1:22:10, 1.35it/s, loss=0.0156, lr=2.04e-05, step=3326] Training: 33%|███▎ | 3327/10000 [45:28<1:29:47, 1.24it/s, loss=0.0156, lr=2.04e-05, step=3326] Training: 33%|███▎ | 3327/10000 [45:28<1:29:47, 1.24it/s, loss=0.0220, lr=2.04e-05, step=3327] Training: 33%|███▎ | 3328/10000 [45:28<1:27:08, 1.28it/s, loss=0.0220, lr=2.04e-05, step=3327] Training: 33%|███▎ | 3328/10000 [45:28<1:27:08, 1.28it/s, loss=0.0174, lr=2.04e-05, step=3328] Training: 33%|███▎ | 3329/10000 [45:29<1:25:15, 1.30it/s, loss=0.0174, lr=2.04e-05, step=3328] Training: 33%|███▎ | 3329/10000 [45:29<1:25:15, 1.30it/s, loss=0.0231, lr=2.04e-05, step=3329]19:30:02.140 [I] step=3330 loss=0.0078 smoothed_loss=0.0188 lr=2.04e-05 grad_norm=0.4431 step_time=0.5974s data_time=0.1473s it/s=1.343 eta_to_10000=4966.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0199 grad_action_out_proj_arms=0.1742 grad_arm_token_fuse=0.1107 grad_shared_expert=0.4401 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3330/10000 [45:30<1:19:56, 1.39it/s, loss=0.0231, lr=2.04e-05, step=3329] Training: 33%|███▎ | 3330/10000 [45:30<1:19:56, 1.39it/s, loss=0.0078, lr=2.04e-05, step=3330] Training: 33%|███▎ | 3331/10000 [45:30<1:18:12, 1.42it/s, loss=0.0078, lr=2.04e-05, step=3330] Training: 33%|███▎ | 3331/10000 [45:30<1:18:12, 1.42it/s, loss=0.0357, lr=2.04e-05, step=3331] Training: 33%|███▎ | 3332/10000 [45:31<1:16:23, 1.45it/s, loss=0.0357, lr=2.04e-05, step=3331] Training: 33%|███▎ | 3332/10000 [45:31<1:16:23, 1.45it/s, loss=0.0112, lr=2.04e-05, step=3332] Training: 33%|███▎ | 3333/10000 [45:32<1:11:50, 1.55it/s, loss=0.0112, lr=2.04e-05, step=3332] Training: 33%|███▎ | 3333/10000 [45:32<1:11:50, 1.55it/s, loss=0.0111, lr=2.04e-05, step=3333] Training: 33%|███▎ | 3334/10000 [45:32<1:12:17, 1.54it/s, loss=0.0111, lr=2.04e-05, step=3333] Training: 33%|███▎ | 3334/10000 [45:32<1:12:17, 1.54it/s, loss=0.0066, lr=2.04e-05, step=3334] Training: 33%|███▎ | 3335/10000 [45:33<1:09:04, 1.61it/s, loss=0.0066, lr=2.04e-05, step=3334] Training: 33%|███▎ | 3335/10000 [45:33<1:09:04, 1.61it/s, loss=0.0131, lr=2.04e-05, step=3335] Training: 33%|███▎ | 3336/10000 [45:34<1:23:26, 1.33it/s, loss=0.0131, lr=2.04e-05, step=3335] Training: 33%|███▎ | 3336/10000 [45:34<1:23:26, 1.33it/s, loss=0.0242, lr=2.04e-05, step=3336] Training: 33%|███▎ | 3337/10000 [45:35<1:17:47, 1.43it/s, loss=0.0242, lr=2.04e-05, step=3336] Training: 33%|███▎ | 3337/10000 [45:35<1:17:47, 1.43it/s, loss=0.0296, lr=2.04e-05, step=3337] Training: 33%|███▎ | 3338/10000 [45:35<1:14:26, 1.49it/s, loss=0.0296, lr=2.04e-05, step=3337] Training: 33%|███▎ | 3338/10000 [45:35<1:14:26, 1.49it/s, loss=0.0123, lr=2.04e-05, step=3338] Training: 33%|███▎ | 3339/10000 [45:36<1:13:57, 1.50it/s, loss=0.0123, lr=2.04e-05, step=3338] Training: 33%|███▎ | 3339/10000 [45:36<1:13:57, 1.50it/s, loss=0.0109, lr=2.04e-05, step=3339]19:30:08.677 [I] step=3340 loss=0.0422 smoothed_loss=0.0200 lr=2.04e-05 grad_norm=0.4846 step_time=0.5369s data_time=0.1168s it/s=1.530 eta_to_10000=4352.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0211 grad_action_out_proj_arms=0.2363 grad_arm_token_fuse=0.1080 grad_shared_expert=0.7666 (18633:train_pytorch.py:850) + Training: 33%|███▎ | 3340/10000 [45:36<1:10:28, 1.58it/s, loss=0.0109, lr=2.04e-05, step=3339] Training: 33%|███▎ | 3340/10000 [45:36<1:10:28, 1.58it/s, loss=0.0422, lr=2.04e-05, step=3340] Training: 33%|███▎ | 3341/10000 [45:37<1:09:07, 1.61it/s, loss=0.0422, lr=2.04e-05, step=3340] Training: 33%|███▎ | 3341/10000 [45:37<1:09:07, 1.61it/s, loss=0.0176, lr=2.04e-05, step=3341] Training: 33%|███▎ | 3342/10000 [45:38<1:19:16, 1.40it/s, loss=0.0176, lr=2.04e-05, step=3341] Training: 33%|███▎ | 3342/10000 [45:38<1:19:16, 1.40it/s, loss=0.0166, lr=2.04e-05, step=3342] Training: 33%|███▎ | 3343/10000 [45:39<1:26:41, 1.28it/s, loss=0.0166, lr=2.04e-05, step=3342] Training: 33%|███▎ | 3343/10000 [45:39<1:26:41, 1.28it/s, loss=0.0391, lr=2.04e-05, step=3343] Training: 33%|███▎ | 3344/10000 [45:40<1:31:24, 1.21it/s, loss=0.0391, lr=2.04e-05, step=3343] Training: 33%|███▎ | 3344/10000 [45:40<1:31:24, 1.21it/s, loss=0.0084, lr=2.04e-05, step=3344] Training: 33%|███▎ | 3345/10000 [45:40<1:26:09, 1.29it/s, loss=0.0084, lr=2.04e-05, step=3344] Training: 33%|███▎ | 3345/10000 [45:40<1:26:09, 1.29it/s, loss=0.0288, lr=2.04e-05, step=3345] Training: 33%|███▎ | 3346/10000 [45:41<1:18:49, 1.41it/s, loss=0.0288, lr=2.04e-05, step=3345] Training: 33%|███▎ | 3346/10000 [45:41<1:18:49, 1.41it/s, loss=0.0240, lr=2.04e-05, step=3346] Training: 33%|███▎ | 3347/10000 [45:41<1:12:29, 1.53it/s, loss=0.0240, lr=2.04e-05, step=3346] Training: 33%|███▎ | 3347/10000 [45:41<1:12:29, 1.53it/s, loss=0.0101, lr=2.04e-05, step=3347] Training: 33%|███▎ | 3348/10000 [45:42<1:12:51, 1.52it/s, loss=0.0101, lr=2.04e-05, step=3347] Training: 33%|███▎ | 3348/10000 [45:42<1:12:51, 1.52it/s, loss=0.0168, lr=2.04e-05, step=3348] Training: 33%|███▎ | 3349/10000 [45:43<1:19:55, 1.39it/s, loss=0.0168, lr=2.04e-05, step=3348] Training: 33%|███▎ | 3349/10000 [45:43<1:19:55, 1.39it/s, loss=0.0109, lr=2.04e-05, step=3349]19:30:16.249 [I] step=3350 loss=0.0121 smoothed_loss=0.0183 lr=2.04e-05 grad_norm=0.4958 step_time=0.6054s data_time=0.1519s it/s=1.321 eta_to_10000=5035.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0077 grad_action_out_proj_arms=0.1172 grad_arm_token_fuse=0.0404 grad_shared_expert=0.3190 (18633:train_pytorch.py:850) + Training: 34%|███▎ | 3350/10000 [45:44<1:26:10, 1.29it/s, loss=0.0109, lr=2.04e-05, step=3349] Training: 34%|███▎ | 3350/10000 [45:44<1:26:10, 1.29it/s, loss=0.0121, lr=2.04e-05, step=3350] Training: 34%|███▎ | 3351/10000 [45:44<1:16:51, 1.44it/s, loss=0.0121, lr=2.04e-05, step=3350] Training: 34%|███▎ | 3351/10000 [45:44<1:16:51, 1.44it/s, loss=0.0148, lr=2.04e-05, step=3351] Training: 34%|███▎ | 3352/10000 [45:45<1:20:27, 1.38it/s, loss=0.0148, lr=2.04e-05, step=3351] Training: 34%|███▎ | 3352/10000 [45:45<1:20:27, 1.38it/s, loss=0.0153, lr=2.04e-05, step=3352] Training: 34%|███▎ | 3353/10000 [45:46<1:22:22, 1.34it/s, loss=0.0153, lr=2.04e-05, step=3352] Training: 34%|███▎ | 3353/10000 [45:46<1:22:22, 1.34it/s, loss=0.0077, lr=2.04e-05, step=3353] Training: 34%|███▎ | 3354/10000 [45:47<1:21:41, 1.36it/s, loss=0.0077, lr=2.04e-05, step=3353] Training: 34%|███▎ | 3354/10000 [45:47<1:21:41, 1.36it/s, loss=0.0142, lr=2.04e-05, step=3354] Training: 34%|███▎ | 3355/10000 [45:47<1:17:22, 1.43it/s, loss=0.0142, lr=2.04e-05, step=3354] Training: 34%|███▎ | 3355/10000 [45:47<1:17:22, 1.43it/s, loss=0.0388, lr=2.04e-05, step=3355] Training: 34%|███▎ | 3356/10000 [45:48<1:13:15, 1.51it/s, loss=0.0388, lr=2.04e-05, step=3355] Training: 34%|███▎ | 3356/10000 [45:48<1:13:15, 1.51it/s, loss=0.0200, lr=2.03e-05, step=3356] Training: 34%|███▎ | 3357/10000 [45:49<1:24:52, 1.30it/s, loss=0.0200, lr=2.03e-05, step=3356] Training: 34%|███▎ | 3357/10000 [45:49<1:24:52, 1.30it/s, loss=0.0066, lr=2.03e-05, step=3357] Training: 34%|███▎ | 3358/10000 [45:50<1:30:02, 1.23it/s, loss=0.0066, lr=2.03e-05, step=3357] Training: 34%|███▎ | 3358/10000 [45:50<1:30:02, 1.23it/s, loss=0.0140, lr=2.03e-05, step=3358] Training: 34%|███▎ | 3359/10000 [45:51<1:25:10, 1.30it/s, loss=0.0140, lr=2.03e-05, step=3358] Training: 34%|███▎ | 3359/10000 [45:51<1:25:10, 1.30it/s, loss=0.0096, lr=2.03e-05, step=3359]19:30:23.708 [I] step=3360 loss=0.0112 smoothed_loss=0.0159 lr=2.03e-05 grad_norm=0.5111 step_time=0.5631s data_time=0.1827s it/s=1.341 eta_to_10000=4951.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.1601 grad_arm_token_fuse=0.0801 grad_shared_expert=0.3499 (18633:train_pytorch.py:850) + Training: 34%|███▎ | 3360/10000 [45:51<1:28:24, 1.25it/s, loss=0.0096, lr=2.03e-05, step=3359] Training: 34%|███▎ | 3360/10000 [45:51<1:28:24, 1.25it/s, loss=0.0112, lr=2.03e-05, step=3360] Training: 34%|███▎ | 3361/10000 [45:52<1:23:33, 1.32it/s, loss=0.0112, lr=2.03e-05, step=3360] Training: 34%|███▎ | 3361/10000 [45:52<1:23:33, 1.32it/s, loss=0.0182, lr=2.03e-05, step=3361] Training: 34%|███▎ | 3362/10000 [45:53<1:23:27, 1.33it/s, loss=0.0182, lr=2.03e-05, step=3361] Training: 34%|███▎ | 3362/10000 [45:53<1:23:27, 1.33it/s, loss=0.0167, lr=2.03e-05, step=3362] Training: 34%|███▎ | 3363/10000 [45:54<1:26:33, 1.28it/s, loss=0.0167, lr=2.03e-05, step=3362] Training: 34%|███▎ | 3363/10000 [45:54<1:26:33, 1.28it/s, loss=0.0214, lr=2.03e-05, step=3363] Training: 34%|███▎ | 3364/10000 [45:55<1:33:47, 1.18it/s, loss=0.0214, lr=2.03e-05, step=3363] Training: 34%|███▎ | 3364/10000 [45:55<1:33:47, 1.18it/s, loss=0.0092, lr=2.03e-05, step=3364] Training: 34%|███▎ | 3365/10000 [45:55<1:31:58, 1.20it/s, loss=0.0092, lr=2.03e-05, step=3364] Training: 34%|███▎ | 3365/10000 [45:55<1:31:58, 1.20it/s, loss=0.0459, lr=2.03e-05, step=3365] Training: 34%|███▎ | 3366/10000 [45:56<1:28:22, 1.25it/s, loss=0.0459, lr=2.03e-05, step=3365] Training: 34%|███▎ | 3366/10000 [45:56<1:28:22, 1.25it/s, loss=0.0213, lr=2.03e-05, step=3366] Training: 34%|███▎ | 3367/10000 [45:57<1:20:39, 1.37it/s, loss=0.0213, lr=2.03e-05, step=3366] Training: 34%|███▎ | 3367/10000 [45:57<1:20:39, 1.37it/s, loss=0.0436, lr=2.03e-05, step=3367] Training: 34%|███▎ | 3368/10000 [45:58<1:25:42, 1.29it/s, loss=0.0436, lr=2.03e-05, step=3367] Training: 34%|███▎ | 3368/10000 [45:58<1:25:42, 1.29it/s, loss=0.0687, lr=2.03e-05, step=3368] Training: 34%|███▎ | 3369/10000 [45:58<1:29:29, 1.23it/s, loss=0.0687, lr=2.03e-05, step=3368] Training: 34%|███▎ | 3369/10000 [45:58<1:29:29, 1.23it/s, loss=0.0133, lr=2.03e-05, step=3369]19:30:31.403 [I] step=3370 loss=0.0288 smoothed_loss=0.0254 lr=2.03e-05 grad_norm=0.7487 step_time=0.5902s data_time=0.1793s it/s=1.300 eta_to_10000=5100.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0122 grad_action_out_proj_arms=0.1435 grad_arm_token_fuse=0.0631 grad_shared_expert=0.4325 (18633:train_pytorch.py:850) + Training: 34%|███▎ | 3370/10000 [45:59<1:21:57, 1.35it/s, loss=0.0133, lr=2.03e-05, step=3369] Training: 34%|███▎ | 3370/10000 [45:59<1:21:57, 1.35it/s, loss=0.0288, lr=2.03e-05, step=3370] Training: 34%|███▎ | 3371/10000 [46:00<1:25:35, 1.29it/s, loss=0.0288, lr=2.03e-05, step=3370] Training: 34%|███▎ | 3371/10000 [46:00<1:25:35, 1.29it/s, loss=0.0196, lr=2.03e-05, step=3371] Training: 34%|███▎ | 3372/10000 [46:00<1:17:40, 1.42it/s, loss=0.0196, lr=2.03e-05, step=3371] Training: 34%|███▎ | 3372/10000 [46:00<1:17:40, 1.42it/s, loss=0.0215, lr=2.03e-05, step=3372] Training: 34%|███▎ | 3373/10000 [46:01<1:11:40, 1.54it/s, loss=0.0215, lr=2.03e-05, step=3372] Training: 34%|███▎ | 3373/10000 [46:01<1:11:40, 1.54it/s, loss=0.0117, lr=2.03e-05, step=3373] Training: 34%|███▎ | 3374/10000 [46:02<1:14:18, 1.49it/s, loss=0.0117, lr=2.03e-05, step=3373] Training: 34%|███▎ | 3374/10000 [46:02<1:14:18, 1.49it/s, loss=0.0057, lr=2.03e-05, step=3374] Training: 34%|███▍ | 3375/10000 [46:02<1:14:39, 1.48it/s, loss=0.0057, lr=2.03e-05, step=3374] Training: 34%|███▍ | 3375/10000 [46:02<1:14:39, 1.48it/s, loss=0.0153, lr=2.03e-05, step=3375] Training: 34%|███▍ | 3376/10000 [46:03<1:11:55, 1.53it/s, loss=0.0153, lr=2.03e-05, step=3375] Training: 34%|███▍ | 3376/10000 [46:03<1:11:55, 1.53it/s, loss=0.0285, lr=2.03e-05, step=3376] Training: 34%|███▍ | 3377/10000 [46:04<1:13:25, 1.50it/s, loss=0.0285, lr=2.03e-05, step=3376] Training: 34%|███▍ | 3377/10000 [46:04<1:13:25, 1.50it/s, loss=0.0281, lr=2.03e-05, step=3377] Training: 34%|███▍ | 3378/10000 [46:04<1:17:31, 1.42it/s, loss=0.0281, lr=2.03e-05, step=3377] Training: 34%|███▍ | 3378/10000 [46:04<1:17:31, 1.42it/s, loss=0.0234, lr=2.03e-05, step=3378] Training: 34%|███▍ | 3379/10000 [46:05<1:21:05, 1.36it/s, loss=0.0234, lr=2.03e-05, step=3378] Training: 34%|███▍ | 3379/10000 [46:05<1:21:05, 1.36it/s, loss=0.0081, lr=2.03e-05, step=3379]19:30:38.256 [I] step=3380 loss=0.0460 smoothed_loss=0.0235 lr=2.03e-05 grad_norm=0.4451 step_time=0.5721s data_time=0.1132s it/s=1.459 eta_to_10000=4536.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0221 grad_action_out_proj_arms=0.1872 grad_arm_token_fuse=0.1155 grad_shared_expert=0.3657 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3380/10000 [46:06<1:17:55, 1.42it/s, loss=0.0081, lr=2.03e-05, step=3379] Training: 34%|███▍ | 3380/10000 [46:06<1:17:55, 1.42it/s, loss=0.0460, lr=2.03e-05, step=3380] Training: 34%|███▍ | 3381/10000 [46:06<1:12:50, 1.51it/s, loss=0.0460, lr=2.03e-05, step=3380] Training: 34%|███▍ | 3381/10000 [46:06<1:12:50, 1.51it/s, loss=0.0109, lr=2.03e-05, step=3381] Training: 34%|███▍ | 3382/10000 [46:07<1:17:35, 1.42it/s, loss=0.0109, lr=2.03e-05, step=3381] Training: 34%|███▍ | 3382/10000 [46:07<1:17:35, 1.42it/s, loss=0.0166, lr=2.03e-05, step=3382] Training: 34%|███▍ | 3383/10000 [46:08<1:21:45, 1.35it/s, loss=0.0166, lr=2.03e-05, step=3382] Training: 34%|███▍ | 3383/10000 [46:08<1:21:45, 1.35it/s, loss=0.0265, lr=2.03e-05, step=3383] Training: 34%|███▍ | 3384/10000 [46:09<1:23:45, 1.32it/s, loss=0.0265, lr=2.03e-05, step=3383] Training: 34%|███▍ | 3384/10000 [46:09<1:23:45, 1.32it/s, loss=0.0080, lr=2.03e-05, step=3384] Training: 34%|███▍ | 3385/10000 [46:10<1:18:48, 1.40it/s, loss=0.0080, lr=2.03e-05, step=3384] Training: 34%|███▍ | 3385/10000 [46:10<1:18:48, 1.40it/s, loss=0.0138, lr=2.03e-05, step=3385] Training: 34%|███▍ | 3386/10000 [46:11<1:30:34, 1.22it/s, loss=0.0138, lr=2.03e-05, step=3385] Training: 34%|███▍ | 3386/10000 [46:11<1:30:34, 1.22it/s, loss=0.0112, lr=2.03e-05, step=3386] Training: 34%|███▍ | 3387/10000 [46:11<1:25:31, 1.29it/s, loss=0.0112, lr=2.03e-05, step=3386] Training: 34%|███▍ | 3387/10000 [46:11<1:25:31, 1.29it/s, loss=0.0299, lr=2.03e-05, step=3387] Training: 34%|███▍ | 3388/10000 [46:12<1:18:43, 1.40it/s, loss=0.0299, lr=2.03e-05, step=3387] Training: 34%|███▍ | 3388/10000 [46:12<1:18:43, 1.40it/s, loss=0.0163, lr=2.03e-05, step=3388] Training: 34%|███▍ | 3389/10000 [46:13<1:22:24, 1.34it/s, loss=0.0163, lr=2.03e-05, step=3388] Training: 34%|███▍ | 3389/10000 [46:13<1:22:24, 1.34it/s, loss=0.0111, lr=2.02e-05, step=3389]19:30:45.832 [I] step=3390 loss=0.0110 smoothed_loss=0.0182 lr=2.03e-05 grad_norm=0.5251 step_time=0.5987s data_time=0.1589s it/s=1.320 eta_to_10000=5007.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0213 grad_action_out_proj_arms=0.2380 grad_arm_token_fuse=0.1001 grad_shared_expert=0.6710 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3390/10000 [46:14<1:25:25, 1.29it/s, loss=0.0111, lr=2.02e-05, step=3389] Training: 34%|███▍ | 3390/10000 [46:14<1:25:25, 1.29it/s, loss=0.0110, lr=2.02e-05, step=3390] Training: 34%|███▍ | 3391/10000 [46:14<1:23:40, 1.32it/s, loss=0.0110, lr=2.02e-05, step=3390] Training: 34%|███▍ | 3391/10000 [46:14<1:23:40, 1.32it/s, loss=0.0238, lr=2.02e-05, step=3391] Training: 34%|███▍ | 3392/10000 [46:15<1:18:37, 1.40it/s, loss=0.0238, lr=2.02e-05, step=3391] Training: 34%|███▍ | 3392/10000 [46:15<1:18:37, 1.40it/s, loss=0.0112, lr=2.02e-05, step=3392] Training: 34%|███▍ | 3393/10000 [46:16<1:36:58, 1.14it/s, loss=0.0112, lr=2.02e-05, step=3392] Training: 34%|███▍ | 3393/10000 [46:16<1:36:58, 1.14it/s, loss=0.0040, lr=2.02e-05, step=3393] Training: 34%|███▍ | 3394/10000 [46:17<1:33:13, 1.18it/s, loss=0.0040, lr=2.02e-05, step=3393] Training: 34%|███▍ | 3394/10000 [46:17<1:33:13, 1.18it/s, loss=0.0503, lr=2.02e-05, step=3394] Training: 34%|███▍ | 3395/10000 [46:17<1:23:47, 1.31it/s, loss=0.0503, lr=2.02e-05, step=3394] Training: 34%|███▍ | 3395/10000 [46:17<1:23:47, 1.31it/s, loss=0.0094, lr=2.02e-05, step=3395] Training: 34%|███▍ | 3396/10000 [46:18<1:15:48, 1.45it/s, loss=0.0094, lr=2.02e-05, step=3395] Training: 34%|███▍ | 3396/10000 [46:18<1:15:48, 1.45it/s, loss=0.0162, lr=2.02e-05, step=3396] Training: 34%|███▍ | 3397/10000 [46:19<1:18:22, 1.40it/s, loss=0.0162, lr=2.02e-05, step=3396] Training: 34%|███▍ | 3397/10000 [46:19<1:18:22, 1.40it/s, loss=0.0063, lr=2.02e-05, step=3397] Training: 34%|███▍ | 3398/10000 [46:20<1:25:56, 1.28it/s, loss=0.0063, lr=2.02e-05, step=3397] Training: 34%|███▍ | 3398/10000 [46:20<1:25:56, 1.28it/s, loss=0.0384, lr=2.02e-05, step=3398] Training: 34%|███▍ | 3399/10000 [46:20<1:19:58, 1.38it/s, loss=0.0384, lr=2.02e-05, step=3398] Training: 34%|███▍ | 3399/10000 [46:20<1:19:58, 1.38it/s, loss=0.0064, lr=2.02e-05, step=3399]19:30:53.354 [I] step=3400 loss=0.0151 smoothed_loss=0.0179 lr=2.02e-05 grad_norm=0.4966 step_time=0.5991s data_time=0.1532s it/s=1.330 eta_to_10000=4964.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0068 grad_action_out_proj_arms=0.1097 grad_arm_token_fuse=0.0309 grad_shared_expert=0.3332 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3400/10000 [46:21<1:21:12, 1.35it/s, loss=0.0064, lr=2.02e-05, step=3399] Training: 34%|███▍ | 3400/10000 [46:21<1:21:12, 1.35it/s, loss=0.0151, lr=2.02e-05, step=3400] Training: 34%|███▍ | 3401/10000 [46:22<1:27:12, 1.26it/s, loss=0.0151, lr=2.02e-05, step=3400] Training: 34%|███▍ | 3401/10000 [46:22<1:27:12, 1.26it/s, loss=0.0309, lr=2.02e-05, step=3401] Training: 34%|███▍ | 3402/10000 [46:22<1:18:31, 1.40it/s, loss=0.0309, lr=2.02e-05, step=3401] Training: 34%|███▍ | 3402/10000 [46:22<1:18:31, 1.40it/s, loss=0.1210, lr=2.02e-05, step=3402] Training: 34%|███▍ | 3403/10000 [46:23<1:17:37, 1.42it/s, loss=0.1210, lr=2.02e-05, step=3402] Training: 34%|███▍ | 3403/10000 [46:23<1:17:37, 1.42it/s, loss=0.0313, lr=2.02e-05, step=3403] Training: 34%|███▍ | 3404/10000 [46:24<1:10:29, 1.56it/s, loss=0.0313, lr=2.02e-05, step=3403] Training: 34%|███▍ | 3404/10000 [46:24<1:10:29, 1.56it/s, loss=0.0115, lr=2.02e-05, step=3404] Training: 34%|███▍ | 3405/10000 [46:24<1:10:16, 1.56it/s, loss=0.0115, lr=2.02e-05, step=3404] Training: 34%|███▍ | 3405/10000 [46:24<1:10:16, 1.56it/s, loss=0.0124, lr=2.02e-05, step=3405] Training: 34%|███▍ | 3406/10000 [46:25<1:06:58, 1.64it/s, loss=0.0124, lr=2.02e-05, step=3405] Training: 34%|███▍ | 3406/10000 [46:25<1:06:58, 1.64it/s, loss=0.0060, lr=2.02e-05, step=3406] Training: 34%|███▍ | 3407/10000 [46:26<1:21:10, 1.35it/s, loss=0.0060, lr=2.02e-05, step=3406] Training: 34%|███▍ | 3407/10000 [46:26<1:21:10, 1.35it/s, loss=0.0149, lr=2.02e-05, step=3407] Training: 34%|███▍ | 3408/10000 [46:27<1:30:48, 1.21it/s, loss=0.0149, lr=2.02e-05, step=3407] Training: 34%|███▍ | 3408/10000 [46:27<1:30:48, 1.21it/s, loss=0.0132, lr=2.02e-05, step=3408] Training: 34%|███▍ | 3409/10000 [46:28<1:31:02, 1.21it/s, loss=0.0132, lr=2.02e-05, step=3408] Training: 34%|███▍ | 3409/10000 [46:28<1:31:02, 1.21it/s, loss=0.0123, lr=2.02e-05, step=3409]19:31:00.789 [I] step=3410 loss=0.0228 smoothed_loss=0.0214 lr=2.02e-05 grad_norm=0.5274 step_time=0.5870s data_time=0.1563s it/s=1.345 eta_to_10000=4898.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0093 grad_action_out_proj_arms=0.1150 grad_arm_token_fuse=0.0425 grad_shared_expert=0.4133 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3410/10000 [46:28<1:27:38, 1.25it/s, loss=0.0123, lr=2.02e-05, step=3409] Training: 34%|███▍ | 3410/10000 [46:28<1:27:38, 1.25it/s, loss=0.0228, lr=2.02e-05, step=3410] Training: 34%|███▍ | 3411/10000 [46:29<1:26:43, 1.27it/s, loss=0.0228, lr=2.02e-05, step=3410] Training: 34%|███▍ | 3411/10000 [46:29<1:26:43, 1.27it/s, loss=0.0026, lr=2.02e-05, step=3411] Training: 34%|███▍ | 3412/10000 [46:30<1:18:01, 1.41it/s, loss=0.0026, lr=2.02e-05, step=3411] Training: 34%|███▍ | 3412/10000 [46:30<1:18:01, 1.41it/s, loss=0.0163, lr=2.02e-05, step=3412] Training: 34%|███▍ | 3413/10000 [46:30<1:17:38, 1.41it/s, loss=0.0163, lr=2.02e-05, step=3412] Training: 34%|███▍ | 3413/10000 [46:30<1:17:38, 1.41it/s, loss=0.0232, lr=2.02e-05, step=3413] Training: 34%|███▍ | 3414/10000 [46:31<1:22:06, 1.34it/s, loss=0.0232, lr=2.02e-05, step=3413] Training: 34%|███▍ | 3414/10000 [46:31<1:22:06, 1.34it/s, loss=0.0070, lr=2.02e-05, step=3414] Training: 34%|███▍ | 3415/10000 [46:32<1:31:05, 1.20it/s, loss=0.0070, lr=2.02e-05, step=3414] Training: 34%|███▍ | 3415/10000 [46:32<1:31:05, 1.20it/s, loss=0.0058, lr=2.02e-05, step=3415] Training: 34%|███▍ | 3416/10000 [46:33<1:21:26, 1.35it/s, loss=0.0058, lr=2.02e-05, step=3415] Training: 34%|███▍ | 3416/10000 [46:33<1:21:26, 1.35it/s, loss=0.0225, lr=2.02e-05, step=3416] Training: 34%|███▍ | 3417/10000 [46:34<1:32:15, 1.19it/s, loss=0.0225, lr=2.02e-05, step=3416] Training: 34%|███▍ | 3417/10000 [46:34<1:32:15, 1.19it/s, loss=0.0282, lr=2.02e-05, step=3417] Training: 34%|███▍ | 3418/10000 [46:35<1:34:05, 1.17it/s, loss=0.0282, lr=2.02e-05, step=3417] Training: 34%|███▍ | 3418/10000 [46:35<1:34:05, 1.17it/s, loss=0.0238, lr=2.02e-05, step=3418] Training: 34%|███▍ | 3419/10000 [46:35<1:23:03, 1.32it/s, loss=0.0238, lr=2.02e-05, step=3418] Training: 34%|███▍ | 3419/10000 [46:35<1:23:03, 1.32it/s, loss=0.0226, lr=2.02e-05, step=3419]19:31:08.389 [I] step=3420 loss=0.0074 smoothed_loss=0.0183 lr=2.02e-05 grad_norm=0.6094 step_time=0.5854s data_time=0.1746s it/s=1.316 eta_to_10000=5000.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0160 grad_action_out_proj_arms=0.1763 grad_arm_token_fuse=0.0853 grad_shared_expert=0.4239 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3420/10000 [46:36<1:21:32, 1.34it/s, loss=0.0226, lr=2.02e-05, step=3419] Training: 34%|███▍ | 3420/10000 [46:36<1:21:32, 1.34it/s, loss=0.0074, lr=2.02e-05, step=3420] Training: 34%|███▍ | 3421/10000 [46:37<1:25:53, 1.28it/s, loss=0.0074, lr=2.02e-05, step=3420] Training: 34%|███▍ | 3421/10000 [46:37<1:25:53, 1.28it/s, loss=0.0125, lr=2.02e-05, step=3421] Training: 34%|███▍ | 3422/10000 [46:38<1:29:17, 1.23it/s, loss=0.0125, lr=2.02e-05, step=3421] Training: 34%|███▍ | 3422/10000 [46:38<1:29:17, 1.23it/s, loss=0.0116, lr=2.01e-05, step=3422] Training: 34%|███▍ | 3423/10000 [46:38<1:23:00, 1.32it/s, loss=0.0116, lr=2.01e-05, step=3422] Training: 34%|███▍ | 3423/10000 [46:38<1:23:00, 1.32it/s, loss=0.0132, lr=2.01e-05, step=3423] Training: 34%|███▍ | 3424/10000 [46:39<1:28:38, 1.24it/s, loss=0.0132, lr=2.01e-05, step=3423] Training: 34%|███▍ | 3424/10000 [46:39<1:28:38, 1.24it/s, loss=0.0147, lr=2.01e-05, step=3424] Training: 34%|███▍ | 3425/10000 [46:40<1:26:48, 1.26it/s, loss=0.0147, lr=2.01e-05, step=3424] Training: 34%|███▍ | 3425/10000 [46:40<1:26:48, 1.26it/s, loss=0.0390, lr=2.01e-05, step=3425] Training: 34%|███▍ | 3426/10000 [46:41<1:24:01, 1.30it/s, loss=0.0390, lr=2.01e-05, step=3425] Training: 34%|███▍ | 3426/10000 [46:41<1:24:01, 1.30it/s, loss=0.0388, lr=2.01e-05, step=3426] Training: 34%|███▍ | 3427/10000 [46:41<1:14:55, 1.46it/s, loss=0.0388, lr=2.01e-05, step=3426] Training: 34%|███▍ | 3427/10000 [46:41<1:14:55, 1.46it/s, loss=0.0162, lr=2.01e-05, step=3427] Training: 34%|███▍ | 3428/10000 [46:42<1:21:25, 1.35it/s, loss=0.0162, lr=2.01e-05, step=3427] Training: 34%|███▍ | 3428/10000 [46:42<1:21:25, 1.35it/s, loss=0.0471, lr=2.01e-05, step=3428] Training: 34%|███▍ | 3429/10000 [46:43<1:30:26, 1.21it/s, loss=0.0471, lr=2.01e-05, step=3428] Training: 34%|███▍ | 3429/10000 [46:43<1:30:26, 1.21it/s, loss=0.0021, lr=2.01e-05, step=3429]19:31:16.205 [I] step=3430 loss=0.0100 smoothed_loss=0.0198 lr=2.01e-05 grad_norm=0.7561 step_time=0.6430s data_time=0.1385s it/s=1.280 eta_to_10000=5134.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.0866 grad_arm_token_fuse=0.0367 grad_shared_expert=0.4135 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3430/10000 [46:44<1:24:34, 1.29it/s, loss=0.0021, lr=2.01e-05, step=3429] Training: 34%|███▍ | 3430/10000 [46:44<1:24:34, 1.29it/s, loss=0.0100, lr=2.01e-05, step=3430] Training: 34%|███▍ | 3431/10000 [46:44<1:16:58, 1.42it/s, loss=0.0100, lr=2.01e-05, step=3430] Training: 34%|███▍ | 3431/10000 [46:44<1:16:58, 1.42it/s, loss=0.0171, lr=2.01e-05, step=3431] Training: 34%|███▍ | 3432/10000 [46:45<1:18:46, 1.39it/s, loss=0.0171, lr=2.01e-05, step=3431] Training: 34%|███▍ | 3432/10000 [46:45<1:18:46, 1.39it/s, loss=0.0067, lr=2.01e-05, step=3432] Training: 34%|███▍ | 3433/10000 [46:46<1:12:29, 1.51it/s, loss=0.0067, lr=2.01e-05, step=3432] Training: 34%|███▍ | 3433/10000 [46:46<1:12:29, 1.51it/s, loss=0.0190, lr=2.01e-05, step=3433] Training: 34%|███▍ | 3434/10000 [46:46<1:11:25, 1.53it/s, loss=0.0190, lr=2.01e-05, step=3433] Training: 34%|███▍ | 3434/10000 [46:46<1:11:25, 1.53it/s, loss=0.0351, lr=2.01e-05, step=3434] Training: 34%|███▍ | 3435/10000 [46:47<1:13:52, 1.48it/s, loss=0.0351, lr=2.01e-05, step=3434] Training: 34%|███▍ | 3435/10000 [46:47<1:13:52, 1.48it/s, loss=0.0304, lr=2.01e-05, step=3435] Training: 34%|███▍ | 3436/10000 [46:48<1:21:56, 1.34it/s, loss=0.0304, lr=2.01e-05, step=3435] Training: 34%|███▍ | 3436/10000 [46:48<1:21:56, 1.34it/s, loss=0.0031, lr=2.01e-05, step=3436] Training: 34%|███▍ | 3437/10000 [46:49<1:18:40, 1.39it/s, loss=0.0031, lr=2.01e-05, step=3436] Training: 34%|███▍ | 3437/10000 [46:49<1:18:40, 1.39it/s, loss=0.0446, lr=2.01e-05, step=3437] Training: 34%|███▍ | 3438/10000 [46:49<1:13:29, 1.49it/s, loss=0.0446, lr=2.01e-05, step=3437] Training: 34%|███▍ | 3438/10000 [46:49<1:13:29, 1.49it/s, loss=0.0226, lr=2.01e-05, step=3438] Training: 34%|███▍ | 3439/10000 [46:50<1:20:51, 1.35it/s, loss=0.0226, lr=2.01e-05, step=3438] Training: 34%|███▍ | 3439/10000 [46:50<1:20:51, 1.35it/s, loss=0.0282, lr=2.01e-05, step=3439]19:31:23.219 [I] step=3440 loss=0.0124 smoothed_loss=0.0215 lr=2.01e-05 grad_norm=0.5023 step_time=0.5715s data_time=0.1300s it/s=1.426 eta_to_10000=4600.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0136 grad_action_out_proj_arms=0.1376 grad_arm_token_fuse=0.0658 grad_shared_expert=0.4925 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3440/10000 [46:51<1:22:46, 1.32it/s, loss=0.0282, lr=2.01e-05, step=3439] Training: 34%|███▍ | 3440/10000 [46:51<1:22:46, 1.32it/s, loss=0.0124, lr=2.01e-05, step=3440] Training: 34%|███▍ | 3441/10000 [46:52<1:20:08, 1.36it/s, loss=0.0124, lr=2.01e-05, step=3440] Training: 34%|███▍ | 3441/10000 [46:52<1:20:08, 1.36it/s, loss=0.1042, lr=2.01e-05, step=3441] Training: 34%|███▍ | 3442/10000 [46:52<1:18:29, 1.39it/s, loss=0.1042, lr=2.01e-05, step=3441] Training: 34%|███▍ | 3442/10000 [46:52<1:18:29, 1.39it/s, loss=0.0148, lr=2.01e-05, step=3442] Training: 34%|███▍ | 3443/10000 [46:53<1:25:10, 1.28it/s, loss=0.0148, lr=2.01e-05, step=3442] Training: 34%|███▍ | 3443/10000 [46:53<1:25:10, 1.28it/s, loss=0.0048, lr=2.01e-05, step=3443] Training: 34%|███▍ | 3444/10000 [46:54<1:21:38, 1.34it/s, loss=0.0048, lr=2.01e-05, step=3443] Training: 34%|███▍ | 3444/10000 [46:54<1:21:38, 1.34it/s, loss=0.0205, lr=2.01e-05, step=3444] Training: 34%|███▍ | 3445/10000 [46:55<1:18:55, 1.38it/s, loss=0.0205, lr=2.01e-05, step=3444] Training: 34%|███▍ | 3445/10000 [46:55<1:18:55, 1.38it/s, loss=0.0077, lr=2.01e-05, step=3445] Training: 34%|███▍ | 3446/10000 [46:55<1:16:24, 1.43it/s, loss=0.0077, lr=2.01e-05, step=3445] Training: 34%|███▍ | 3446/10000 [46:55<1:16:24, 1.43it/s, loss=0.0207, lr=2.01e-05, step=3446] Training: 34%|███▍ | 3447/10000 [46:56<1:19:56, 1.37it/s, loss=0.0207, lr=2.01e-05, step=3446] Training: 34%|███▍ | 3447/10000 [46:56<1:19:56, 1.37it/s, loss=0.0092, lr=2.01e-05, step=3447] Training: 34%|███▍ | 3448/10000 [46:57<1:15:41, 1.44it/s, loss=0.0092, lr=2.01e-05, step=3447] Training: 34%|███▍ | 3448/10000 [46:57<1:15:41, 1.44it/s, loss=0.0095, lr=2.01e-05, step=3448] Training: 34%|███▍ | 3449/10000 [46:57<1:14:19, 1.47it/s, loss=0.0095, lr=2.01e-05, step=3448] Training: 34%|███▍ | 3449/10000 [46:57<1:14:19, 1.47it/s, loss=0.0072, lr=2.01e-05, step=3449]19:31:30.554 [I] step=3450 loss=0.0130 smoothed_loss=0.0187 lr=2.01e-05 grad_norm=0.4574 step_time=0.5919s data_time=0.1416s it/s=1.363 eta_to_10000=4803.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0073 grad_action_out_proj_arms=0.1047 grad_arm_token_fuse=0.0386 grad_shared_expert=0.3397 (18633:train_pytorch.py:850) + Training: 34%|███▍ | 3450/10000 [46:58<1:25:01, 1.28it/s, loss=0.0072, lr=2.01e-05, step=3449] Training: 34%|███▍ | 3450/10000 [46:58<1:25:01, 1.28it/s, loss=0.0130, lr=2.01e-05, step=3450] Training: 35%|███▍ | 3451/10000 [46:59<1:18:40, 1.39it/s, loss=0.0130, lr=2.01e-05, step=3450] Training: 35%|███▍ | 3451/10000 [46:59<1:18:40, 1.39it/s, loss=0.0153, lr=2.01e-05, step=3451] Training: 35%|███▍ | 3452/10000 [47:00<1:19:59, 1.36it/s, loss=0.0153, lr=2.01e-05, step=3451] Training: 35%|███▍ | 3452/10000 [47:00<1:19:59, 1.36it/s, loss=0.0215, lr=2.01e-05, step=3452] Training: 35%|███▍ | 3453/10000 [47:00<1:18:37, 1.39it/s, loss=0.0215, lr=2.01e-05, step=3452] Training: 35%|███▍ | 3453/10000 [47:00<1:18:37, 1.39it/s, loss=0.0710, lr=2.01e-05, step=3453] Training: 35%|███▍ | 3454/10000 [47:01<1:18:14, 1.39it/s, loss=0.0710, lr=2.01e-05, step=3453] Training: 35%|███▍ | 3454/10000 [47:01<1:18:14, 1.39it/s, loss=0.0044, lr=2.00e-05, step=3454] Training: 35%|███▍ | 3455/10000 [47:02<1:14:13, 1.47it/s, loss=0.0044, lr=2.00e-05, step=3454] Training: 35%|███▍ | 3455/10000 [47:02<1:14:13, 1.47it/s, loss=0.0068, lr=2.00e-05, step=3455] Training: 35%|███▍ | 3456/10000 [47:02<1:13:10, 1.49it/s, loss=0.0068, lr=2.00e-05, step=3455] Training: 35%|███▍ | 3456/10000 [47:02<1:13:10, 1.49it/s, loss=0.0904, lr=2.00e-05, step=3456] Training: 35%|███▍ | 3457/10000 [47:03<1:21:55, 1.33it/s, loss=0.0904, lr=2.00e-05, step=3456] Training: 35%|███▍ | 3457/10000 [47:03<1:21:55, 1.33it/s, loss=0.0217, lr=2.00e-05, step=3457] Training: 35%|███▍ | 3458/10000 [47:04<1:20:45, 1.35it/s, loss=0.0217, lr=2.00e-05, step=3457] Training: 35%|███▍ | 3458/10000 [47:04<1:20:45, 1.35it/s, loss=0.0057, lr=2.00e-05, step=3458] Training: 35%|███▍ | 3459/10000 [47:04<1:13:38, 1.48it/s, loss=0.0057, lr=2.00e-05, step=3458] Training: 35%|███▍ | 3459/10000 [47:04<1:13:38, 1.48it/s, loss=0.0075, lr=2.00e-05, step=3459]19:31:37.277 [I] step=3460 loss=0.0207 smoothed_loss=0.0228 lr=2.00e-05 grad_norm=0.4663 step_time=0.5481s data_time=0.1241s it/s=1.488 eta_to_10000=4395.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0219 grad_action_out_proj_arms=0.2086 grad_arm_token_fuse=0.1137 grad_shared_expert=0.5451 (18633:train_pytorch.py:850) + Training: 35%|███▍ | 3460/10000 [47:05<1:09:37, 1.57it/s, loss=0.0075, lr=2.00e-05, step=3459] Training: 35%|███▍ | 3460/10000 [47:05<1:09:37, 1.57it/s, loss=0.0207, lr=2.00e-05, step=3460] Training: 35%|███▍ | 3461/10000 [47:05<1:05:24, 1.67it/s, loss=0.0207, lr=2.00e-05, step=3460] Training: 35%|███▍ | 3461/10000 [47:05<1:05:24, 1.67it/s, loss=0.0340, lr=2.00e-05, step=3461] Training: 35%|███▍ | 3462/10000 [47:06<1:08:24, 1.59it/s, loss=0.0340, lr=2.00e-05, step=3461] Training: 35%|███▍ | 3462/10000 [47:06<1:08:24, 1.59it/s, loss=0.0125, lr=2.00e-05, step=3462] Training: 35%|███▍ | 3463/10000 [47:07<1:05:04, 1.67it/s, loss=0.0125, lr=2.00e-05, step=3462] Training: 35%|███▍ | 3463/10000 [47:07<1:05:04, 1.67it/s, loss=0.0127, lr=2.00e-05, step=3463] Training: 35%|███▍ | 3464/10000 [47:07<1:08:25, 1.59it/s, loss=0.0127, lr=2.00e-05, step=3463] Training: 35%|███▍ | 3464/10000 [47:07<1:08:25, 1.59it/s, loss=0.0218, lr=2.00e-05, step=3464] Training: 35%|███▍ | 3465/10000 [47:08<1:16:03, 1.43it/s, loss=0.0218, lr=2.00e-05, step=3464] Training: 35%|███▍ | 3465/10000 [47:08<1:16:03, 1.43it/s, loss=0.0377, lr=2.00e-05, step=3465] Training: 35%|███▍ | 3466/10000 [47:09<1:19:51, 1.36it/s, loss=0.0377, lr=2.00e-05, step=3465] Training: 35%|███▍ | 3466/10000 [47:09<1:19:51, 1.36it/s, loss=0.0406, lr=2.00e-05, step=3466] Training: 35%|███▍ | 3467/10000 [47:10<1:13:22, 1.48it/s, loss=0.0406, lr=2.00e-05, step=3466] Training: 35%|███▍ | 3467/10000 [47:10<1:13:22, 1.48it/s, loss=0.0308, lr=2.00e-05, step=3467] Training: 35%|███▍ | 3468/10000 [47:10<1:11:48, 1.52it/s, loss=0.0308, lr=2.00e-05, step=3467] Training: 35%|███▍ | 3468/10000 [47:10<1:11:48, 1.52it/s, loss=0.0122, lr=2.00e-05, step=3468] Training: 35%|███▍ | 3469/10000 [47:11<1:12:35, 1.50it/s, loss=0.0122, lr=2.00e-05, step=3468] Training: 35%|███▍ | 3469/10000 [47:11<1:12:35, 1.50it/s, loss=0.0039, lr=2.00e-05, step=3469]19:31:44.134 [I] step=3470 loss=0.0181 smoothed_loss=0.0218 lr=2.00e-05 grad_norm=0.4805 step_time=0.5744s data_time=0.1113s it/s=1.459 eta_to_10000=4477.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0202 grad_action_out_proj_arms=0.1644 grad_arm_token_fuse=0.1095 grad_shared_expert=0.3851 (18633:train_pytorch.py:850) + Training: 35%|███▍ | 3470/10000 [47:12<1:20:23, 1.35it/s, loss=0.0039, lr=2.00e-05, step=3469] Training: 35%|███▍ | 3470/10000 [47:12<1:20:23, 1.35it/s, loss=0.0181, lr=2.00e-05, step=3470] Training: 35%|███▍ | 3471/10000 [47:13<1:30:05, 1.21it/s, loss=0.0181, lr=2.00e-05, step=3470] Training: 35%|███▍ | 3471/10000 [47:13<1:30:05, 1.21it/s, loss=0.0229, lr=2.00e-05, step=3471] Training: 35%|███▍ | 3472/10000 [47:14<1:32:47, 1.17it/s, loss=0.0229, lr=2.00e-05, step=3471] Training: 35%|███▍ | 3472/10000 [47:14<1:32:47, 1.17it/s, loss=0.0290, lr=2.00e-05, step=3472] Training: 35%|███▍ | 3473/10000 [47:14<1:23:08, 1.31it/s, loss=0.0290, lr=2.00e-05, step=3472] Training: 35%|███▍ | 3473/10000 [47:14<1:23:08, 1.31it/s, loss=0.0123, lr=2.00e-05, step=3473] Training: 35%|███▍ | 3474/10000 [47:15<1:22:18, 1.32it/s, loss=0.0123, lr=2.00e-05, step=3473] Training: 35%|███▍ | 3474/10000 [47:15<1:22:18, 1.32it/s, loss=0.0696, lr=2.00e-05, step=3474] Training: 35%|███▍ | 3475/10000 [47:16<1:23:10, 1.31it/s, loss=0.0696, lr=2.00e-05, step=3474] Training: 35%|███▍ | 3475/10000 [47:16<1:23:10, 1.31it/s, loss=0.0085, lr=2.00e-05, step=3475] Training: 35%|███▍ | 3476/10000 [47:16<1:16:32, 1.42it/s, loss=0.0085, lr=2.00e-05, step=3475] Training: 35%|███▍ | 3476/10000 [47:16<1:16:32, 1.42it/s, loss=0.0251, lr=2.00e-05, step=3476] Training: 35%|███▍ | 3477/10000 [47:17<1:16:45, 1.42it/s, loss=0.0251, lr=2.00e-05, step=3476] Training: 35%|███▍ | 3477/10000 [47:17<1:16:45, 1.42it/s, loss=0.0094, lr=2.00e-05, step=3477] Training: 35%|███▍ | 3478/10000 [47:18<1:22:39, 1.32it/s, loss=0.0094, lr=2.00e-05, step=3477] Training: 35%|███▍ | 3478/10000 [47:18<1:22:39, 1.32it/s, loss=0.0068, lr=2.00e-05, step=3478] Training: 35%|███▍ | 3479/10000 [47:19<1:23:38, 1.30it/s, loss=0.0068, lr=2.00e-05, step=3478] Training: 35%|███▍ | 3479/10000 [47:19<1:23:38, 1.30it/s, loss=0.0571, lr=2.00e-05, step=3479]19:31:51.647 [I] step=3480 loss=0.0564 smoothed_loss=0.0282 lr=2.00e-05 grad_norm=0.5129 step_time=0.6308s data_time=0.1204s it/s=1.331 eta_to_10000=4897.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0262 grad_action_out_proj_arms=0.1830 grad_arm_token_fuse=0.1330 grad_shared_expert=0.4122 (18633:train_pytorch.py:850) + Training: 35%|███▍ | 3480/10000 [47:19<1:15:57, 1.43it/s, loss=0.0571, lr=2.00e-05, step=3479] Training: 35%|███▍ | 3480/10000 [47:19<1:15:57, 1.43it/s, loss=0.0564, lr=2.00e-05, step=3480] Training: 35%|███▍ | 3481/10000 [47:20<1:12:18, 1.50it/s, loss=0.0564, lr=2.00e-05, step=3480] Training: 35%|███▍ | 3481/10000 [47:20<1:12:18, 1.50it/s, loss=0.0428, lr=2.00e-05, step=3481] Training: 35%|███▍ | 3482/10000 [47:21<1:14:39, 1.46it/s, loss=0.0428, lr=2.00e-05, step=3481] Training: 35%|███▍ | 3482/10000 [47:21<1:14:39, 1.46it/s, loss=0.0811, lr=2.00e-05, step=3482] Training: 35%|███▍ | 3483/10000 [47:21<1:13:19, 1.48it/s, loss=0.0811, lr=2.00e-05, step=3482] Training: 35%|███▍ | 3483/10000 [47:21<1:13:19, 1.48it/s, loss=0.0056, lr=2.00e-05, step=3483] Training: 35%|███▍ | 3484/10000 [47:22<1:21:17, 1.34it/s, loss=0.0056, lr=2.00e-05, step=3483] Training: 35%|███▍ | 3484/10000 [47:22<1:21:17, 1.34it/s, loss=0.0048, lr=2.00e-05, step=3484] Training: 35%|███▍ | 3485/10000 [47:23<1:16:05, 1.43it/s, loss=0.0048, lr=2.00e-05, step=3484] Training: 35%|███▍ | 3485/10000 [47:23<1:16:05, 1.43it/s, loss=0.0148, lr=2.00e-05, step=3485] Training: 35%|███▍ | 3486/10000 [47:24<1:18:27, 1.38it/s, loss=0.0148, lr=2.00e-05, step=3485] Training: 35%|███▍ | 3486/10000 [47:24<1:18:27, 1.38it/s, loss=0.0291, lr=1.99e-05, step=3486] Training: 35%|███▍ | 3487/10000 [47:24<1:18:42, 1.38it/s, loss=0.0291, lr=1.99e-05, step=3486] Training: 35%|███▍ | 3487/10000 [47:24<1:18:42, 1.38it/s, loss=0.0334, lr=1.99e-05, step=3487] Training: 35%|███▍ | 3488/10000 [47:25<1:22:06, 1.32it/s, loss=0.0334, lr=1.99e-05, step=3487] Training: 35%|███▍ | 3488/10000 [47:25<1:22:06, 1.32it/s, loss=0.0144, lr=1.99e-05, step=3488] Training: 35%|███▍ | 3489/10000 [47:26<1:21:45, 1.33it/s, loss=0.0144, lr=1.99e-05, step=3488] Training: 35%|███▍ | 3489/10000 [47:26<1:21:45, 1.33it/s, loss=0.0054, lr=1.99e-05, step=3489]19:31:58.906 [I] step=3490 loss=0.0275 smoothed_loss=0.0251 lr=2.00e-05 grad_norm=0.5635 step_time=0.5778s data_time=0.1481s it/s=1.378 eta_to_10000=4725.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0161 grad_action_out_proj_arms=0.1524 grad_arm_token_fuse=0.0887 grad_shared_expert=0.5594 (18633:train_pytorch.py:850) + Training: 35%|███▍ | 3490/10000 [47:27<1:19:55, 1.36it/s, loss=0.0054, lr=1.99e-05, step=3489] Training: 35%|███▍ | 3490/10000 [47:27<1:19:55, 1.36it/s, loss=0.0275, lr=1.99e-05, step=3490] Training: 35%|███▍ | 3491/10000 [47:27<1:19:08, 1.37it/s, loss=0.0275, lr=1.99e-05, step=3490] Training: 35%|███▍ | 3491/10000 [47:27<1:19:08, 1.37it/s, loss=0.0120, lr=1.99e-05, step=3491] Training: 35%|███▍ | 3492/10000 [47:28<1:12:35, 1.49it/s, loss=0.0120, lr=1.99e-05, step=3491] Training: 35%|███▍ | 3492/10000 [47:28<1:12:35, 1.49it/s, loss=0.0335, lr=1.99e-05, step=3492] Training: 35%|███▍ | 3493/10000 [47:29<1:16:45, 1.41it/s, loss=0.0335, lr=1.99e-05, step=3492] Training: 35%|███▍ | 3493/10000 [47:29<1:16:45, 1.41it/s, loss=0.0158, lr=1.99e-05, step=3493] Training: 35%|███▍ | 3494/10000 [47:29<1:17:45, 1.39it/s, loss=0.0158, lr=1.99e-05, step=3493] Training: 35%|███▍ | 3494/10000 [47:29<1:17:45, 1.39it/s, loss=0.0509, lr=1.99e-05, step=3494] Training: 35%|███▍ | 3495/10000 [47:30<1:17:35, 1.40it/s, loss=0.0509, lr=1.99e-05, step=3494] Training: 35%|███▍ | 3495/10000 [47:30<1:17:35, 1.40it/s, loss=0.0342, lr=1.99e-05, step=3495] Training: 35%|███▍ | 3496/10000 [47:31<1:12:09, 1.50it/s, loss=0.0342, lr=1.99e-05, step=3495] Training: 35%|███▍ | 3496/10000 [47:31<1:12:09, 1.50it/s, loss=0.0142, lr=1.99e-05, step=3496] Training: 35%|███▍ | 3497/10000 [47:31<1:08:28, 1.58it/s, loss=0.0142, lr=1.99e-05, step=3496] Training: 35%|███▍ | 3497/10000 [47:31<1:08:28, 1.58it/s, loss=0.0092, lr=1.99e-05, step=3497] Training: 35%|███▍ | 3498/10000 [47:32<1:07:34, 1.60it/s, loss=0.0092, lr=1.99e-05, step=3497] Training: 35%|███▍ | 3498/10000 [47:32<1:07:34, 1.60it/s, loss=0.0262, lr=1.99e-05, step=3498] Training: 35%|███▍ | 3499/10000 [47:32<1:10:50, 1.53it/s, loss=0.0262, lr=1.99e-05, step=3498] Training: 35%|███▍ | 3499/10000 [47:32<1:10:50, 1.53it/s, loss=0.0095, lr=1.99e-05, step=3499]19:32:05.704 [I] step=3500 loss=0.1977 smoothed_loss=0.0405 lr=1.99e-05 grad_norm=0.4995 step_time=0.5724s data_time=0.1073s it/s=1.471 eta_to_10000=4417.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0306 grad_action_out_proj_arms=0.2074 grad_arm_token_fuse=0.1875 grad_shared_expert=0.4800 (18633:train_pytorch.py:850) + Training: 35%|███▌ | 3500/10000 [47:33<1:18:04, 1.39it/s, loss=0.0095, lr=1.99e-05, step=3499] Training: 35%|███▌ | 3500/10000 [47:33<1:18:04, 1.39it/s, loss=0.1977, lr=1.99e-05, step=3500] Training: 35%|███▌ | 3501/10000 [47:34<1:11:59, 1.50it/s, loss=0.1977, lr=1.99e-05, step=3500] Training: 35%|███▌ | 3501/10000 [47:34<1:11:59, 1.50it/s, loss=0.0208, lr=1.99e-05, step=3501] Training: 35%|███▌ | 3502/10000 [47:35<1:13:55, 1.47it/s, loss=0.0208, lr=1.99e-05, step=3501] Training: 35%|███▌ | 3502/10000 [47:35<1:13:55, 1.47it/s, loss=0.0146, lr=1.99e-05, step=3502] Training: 35%|███▌ | 3503/10000 [47:35<1:08:46, 1.57it/s, loss=0.0146, lr=1.99e-05, step=3502] Training: 35%|███▌ | 3503/10000 [47:35<1:08:46, 1.57it/s, loss=0.0092, lr=1.99e-05, step=3503] Training: 35%|███▌ | 3504/10000 [47:36<1:09:59, 1.55it/s, loss=0.0092, lr=1.99e-05, step=3503] Training: 35%|███▌ | 3504/10000 [47:36<1:09:59, 1.55it/s, loss=0.0277, lr=1.99e-05, step=3504] Training: 35%|███▌ | 3505/10000 [47:37<1:13:53, 1.46it/s, loss=0.0277, lr=1.99e-05, step=3504] Training: 35%|███▌ | 3505/10000 [47:37<1:13:53, 1.46it/s, loss=0.0027, lr=1.99e-05, step=3505] Training: 35%|███▌ | 3506/10000 [47:37<1:18:04, 1.39it/s, loss=0.0027, lr=1.99e-05, step=3505] Training: 35%|███▌ | 3506/10000 [47:37<1:18:04, 1.39it/s, loss=0.0059, lr=1.99e-05, step=3506] Training: 35%|███▌ | 3507/10000 [47:38<1:23:44, 1.29it/s, loss=0.0059, lr=1.99e-05, step=3506] Training: 35%|███▌ | 3507/10000 [47:38<1:23:44, 1.29it/s, loss=0.0338, lr=1.99e-05, step=3507] Training: 35%|███▌ | 3508/10000 [47:39<1:21:35, 1.33it/s, loss=0.0338, lr=1.99e-05, step=3507] Training: 35%|███▌ | 3508/10000 [47:39<1:21:35, 1.33it/s, loss=0.0070, lr=1.99e-05, step=3508] Training: 35%|███▌ | 3509/10000 [47:40<1:14:18, 1.46it/s, loss=0.0070, lr=1.99e-05, step=3508] Training: 35%|███▌ | 3509/10000 [47:40<1:14:18, 1.46it/s, loss=0.0098, lr=1.99e-05, step=3509]19:32:12.583 [I] step=3510 loss=0.0296 smoothed_loss=0.0249 lr=1.99e-05 grad_norm=0.5740 step_time=0.5610s data_time=0.1270s it/s=1.454 eta_to_10000=4464.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0156 grad_action_out_proj_arms=0.1407 grad_arm_token_fuse=0.0840 grad_shared_expert=0.4714 (18633:train_pytorch.py:850) + Training: 35%|███▌ | 3510/10000 [47:40<1:15:03, 1.44it/s, loss=0.0098, lr=1.99e-05, step=3509] Training: 35%|███▌ | 3510/10000 [47:40<1:15:03, 1.44it/s, loss=0.0296, lr=1.99e-05, step=3510] Training: 35%|███▌ | 3511/10000 [47:41<1:08:46, 1.57it/s, loss=0.0296, lr=1.99e-05, step=3510] Training: 35%|███▌ | 3511/10000 [47:41<1:08:46, 1.57it/s, loss=0.0033, lr=1.99e-05, step=3511] Training: 35%|███▌ | 3512/10000 [47:41<1:09:50, 1.55it/s, loss=0.0033, lr=1.99e-05, step=3511] Training: 35%|███▌ | 3512/10000 [47:41<1:09:50, 1.55it/s, loss=0.0181, lr=1.99e-05, step=3512] Training: 35%|███▌ | 3513/10000 [47:42<1:04:47, 1.67it/s, loss=0.0181, lr=1.99e-05, step=3512] Training: 35%|███▌ | 3513/10000 [47:42<1:04:47, 1.67it/s, loss=0.0223, lr=1.99e-05, step=3513] Training: 35%|███▌ | 3514/10000 [47:43<1:08:39, 1.57it/s, loss=0.0223, lr=1.99e-05, step=3513] Training: 35%|███▌ | 3514/10000 [47:43<1:08:39, 1.57it/s, loss=0.0523, lr=1.99e-05, step=3514] Training: 35%|███▌ | 3515/10000 [47:43<1:13:39, 1.47it/s, loss=0.0523, lr=1.99e-05, step=3514] Training: 35%|███▌ | 3515/10000 [47:43<1:13:39, 1.47it/s, loss=0.0082, lr=1.99e-05, step=3515] Training: 35%|███▌ | 3516/10000 [47:44<1:09:29, 1.56it/s, loss=0.0082, lr=1.99e-05, step=3515] Training: 35%|███▌ | 3516/10000 [47:44<1:09:29, 1.56it/s, loss=0.0066, lr=1.99e-05, step=3516] Training: 35%|███▌ | 3517/10000 [47:45<1:06:59, 1.61it/s, loss=0.0066, lr=1.99e-05, step=3516] Training: 35%|███▌ | 3517/10000 [47:45<1:06:59, 1.61it/s, loss=0.0155, lr=1.99e-05, step=3517] Training: 35%|███▌ | 3518/10000 [47:45<1:05:31, 1.65it/s, loss=0.0155, lr=1.99e-05, step=3517] Training: 35%|███▌ | 3518/10000 [47:45<1:05:31, 1.65it/s, loss=0.0117, lr=1.99e-05, step=3518] Training: 35%|███▌ | 3519/10000 [47:46<1:07:23, 1.60it/s, loss=0.0117, lr=1.99e-05, step=3518] Training: 35%|███▌ | 3519/10000 [47:46<1:07:23, 1.60it/s, loss=0.0984, lr=1.98e-05, step=3519]19:32:19.051 [I] step=3520 loss=0.0063 smoothed_loss=0.0259 lr=1.99e-05 grad_norm=0.4855 step_time=0.5396s data_time=0.1072s it/s=1.546 eta_to_10000=4190.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1421 grad_arm_token_fuse=0.0936 grad_shared_expert=0.3783 (18633:train_pytorch.py:850) + Training: 35%|███▌ | 3520/10000 [47:47<1:17:38, 1.39it/s, loss=0.0984, lr=1.98e-05, step=3519] Training: 35%|███▌ | 3520/10000 [47:47<1:17:38, 1.39it/s, loss=0.0063, lr=1.98e-05, step=3520] Training: 35%|███▌ | 3521/10000 [47:48<1:21:07, 1.33it/s, loss=0.0063, lr=1.98e-05, step=3520] Training: 35%|███▌ | 3521/10000 [47:48<1:21:07, 1.33it/s, loss=0.0073, lr=1.98e-05, step=3521] Training: 35%|███▌ | 3522/10000 [47:48<1:24:57, 1.27it/s, loss=0.0073, lr=1.98e-05, step=3521] Training: 35%|███▌ | 3522/10000 [47:48<1:24:57, 1.27it/s, loss=0.0371, lr=1.98e-05, step=3522] Training: 35%|███▌ | 3523/10000 [47:49<1:17:26, 1.39it/s, loss=0.0371, lr=1.98e-05, step=3522] Training: 35%|███▌ | 3523/10000 [47:49<1:17:26, 1.39it/s, loss=0.0297, lr=1.98e-05, step=3523] Training: 35%|███▌ | 3524/10000 [47:50<1:11:45, 1.50it/s, loss=0.0297, lr=1.98e-05, step=3523] Training: 35%|███▌ | 3524/10000 [47:50<1:11:45, 1.50it/s, loss=0.0115, lr=1.98e-05, step=3524] Training: 35%|███▌ | 3525/10000 [47:50<1:07:00, 1.61it/s, loss=0.0115, lr=1.98e-05, step=3524] Training: 35%|███▌ | 3525/10000 [47:50<1:07:00, 1.61it/s, loss=0.0575, lr=1.98e-05, step=3525] Training: 35%|███▌ | 3526/10000 [47:51<1:04:59, 1.66it/s, loss=0.0575, lr=1.98e-05, step=3525] Training: 35%|███▌ | 3526/10000 [47:51<1:04:59, 1.66it/s, loss=0.0068, lr=1.98e-05, step=3526] Training: 35%|███▌ | 3527/10000 [47:51<1:07:45, 1.59it/s, loss=0.0068, lr=1.98e-05, step=3526] Training: 35%|███▌ | 3527/10000 [47:51<1:07:45, 1.59it/s, loss=0.0045, lr=1.98e-05, step=3527] Training: 35%|███▌ | 3528/10000 [47:52<1:12:44, 1.48it/s, loss=0.0045, lr=1.98e-05, step=3527] Training: 35%|███▌ | 3528/10000 [47:52<1:12:44, 1.48it/s, loss=0.0246, lr=1.98e-05, step=3528] Training: 35%|███▌ | 3529/10000 [47:53<1:21:17, 1.33it/s, loss=0.0246, lr=1.98e-05, step=3528] Training: 35%|███▌ | 3529/10000 [47:53<1:21:17, 1.33it/s, loss=0.0145, lr=1.98e-05, step=3529]19:32:26.165 [I] step=3530 loss=0.0142 smoothed_loss=0.0218 lr=1.98e-05 grad_norm=0.4946 step_time=0.5977s data_time=0.1137s it/s=1.406 eta_to_10000=4602.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0093 grad_action_out_proj_arms=0.1257 grad_arm_token_fuse=0.0467 grad_shared_expert=0.4021 (18633:train_pytorch.py:850) + Training: 35%|███▌ | 3530/10000 [47:54<1:23:50, 1.29it/s, loss=0.0145, lr=1.98e-05, step=3529] Training: 35%|███▌ | 3530/10000 [47:54<1:23:50, 1.29it/s, loss=0.0142, lr=1.98e-05, step=3530] Training: 35%|███▌ | 3531/10000 [47:54<1:15:49, 1.42it/s, loss=0.0142, lr=1.98e-05, step=3530] Training: 35%|███▌ | 3531/10000 [47:54<1:15:49, 1.42it/s, loss=0.0172, lr=1.98e-05, step=3531] Training: 35%|███▌ | 3532/10000 [47:55<1:16:27, 1.41it/s, loss=0.0172, lr=1.98e-05, step=3531] Training: 35%|███▌ | 3532/10000 [47:55<1:16:27, 1.41it/s, loss=0.0126, lr=1.98e-05, step=3532] Training: 35%|███▌ | 3533/10000 [47:56<1:19:00, 1.36it/s, loss=0.0126, lr=1.98e-05, step=3532] Training: 35%|███▌ | 3533/10000 [47:56<1:19:00, 1.36it/s, loss=0.0209, lr=1.98e-05, step=3533] Training: 35%|███▌ | 3534/10000 [47:57<1:17:24, 1.39it/s, loss=0.0209, lr=1.98e-05, step=3533] Training: 35%|███▌ | 3534/10000 [47:57<1:17:24, 1.39it/s, loss=0.0298, lr=1.98e-05, step=3534] Training: 35%|███▌ | 3535/10000 [47:57<1:16:38, 1.41it/s, loss=0.0298, lr=1.98e-05, step=3534] Training: 35%|███▌ | 3535/10000 [47:57<1:16:38, 1.41it/s, loss=0.0271, lr=1.98e-05, step=3535] Training: 35%|███▌ | 3536/10000 [47:58<1:23:19, 1.29it/s, loss=0.0271, lr=1.98e-05, step=3535] Training: 35%|███▌ | 3536/10000 [47:58<1:23:19, 1.29it/s, loss=0.0297, lr=1.98e-05, step=3536] Training: 35%|███▌ | 3537/10000 [47:59<1:19:12, 1.36it/s, loss=0.0297, lr=1.98e-05, step=3536] Training: 35%|███▌ | 3537/10000 [47:59<1:19:12, 1.36it/s, loss=0.0423, lr=1.98e-05, step=3537] Training: 35%|███▌ | 3538/10000 [47:59<1:16:40, 1.40it/s, loss=0.0423, lr=1.98e-05, step=3537] Training: 35%|███▌ | 3538/10000 [47:59<1:16:40, 1.40it/s, loss=0.0580, lr=1.98e-05, step=3538] Training: 35%|███▌ | 3539/10000 [48:00<1:13:43, 1.46it/s, loss=0.0580, lr=1.98e-05, step=3538] Training: 35%|███▌ | 3539/10000 [48:00<1:13:43, 1.46it/s, loss=0.0093, lr=1.98e-05, step=3539]19:32:33.073 [I] step=3540 loss=0.0161 smoothed_loss=0.0252 lr=1.98e-05 grad_norm=0.4838 step_time=0.5635s data_time=0.1273s it/s=1.448 eta_to_10000=4461.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.1912 grad_arm_token_fuse=0.0712 grad_shared_expert=0.3321 (18633:train_pytorch.py:850) + Training: 35%|███▌ | 3540/10000 [48:01<1:12:23, 1.49it/s, loss=0.0093, lr=1.98e-05, step=3539] Training: 35%|███▌ | 3540/10000 [48:01<1:12:23, 1.49it/s, loss=0.0161, lr=1.98e-05, step=3540] Training: 35%|███▌ | 3541/10000 [48:01<1:10:35, 1.53it/s, loss=0.0161, lr=1.98e-05, step=3540] Training: 35%|███▌ | 3541/10000 [48:01<1:10:35, 1.53it/s, loss=0.0170, lr=1.98e-05, step=3541] Training: 35%|███▌ | 3542/10000 [48:02<1:08:06, 1.58it/s, loss=0.0170, lr=1.98e-05, step=3541] Training: 35%|███▌ | 3542/10000 [48:02<1:08:06, 1.58it/s, loss=0.0038, lr=1.98e-05, step=3542] Training: 35%|███▌ | 3543/10000 [48:03<1:15:13, 1.43it/s, loss=0.0038, lr=1.98e-05, step=3542] Training: 35%|███▌ | 3543/10000 [48:03<1:15:13, 1.43it/s, loss=0.0268, lr=1.98e-05, step=3543] Training: 35%|███▌ | 3544/10000 [48:03<1:14:39, 1.44it/s, loss=0.0268, lr=1.98e-05, step=3543] Training: 35%|███▌ | 3544/10000 [48:03<1:14:39, 1.44it/s, loss=0.0069, lr=1.98e-05, step=3544] Training: 35%|███▌ | 3545/10000 [48:04<1:15:03, 1.43it/s, loss=0.0069, lr=1.98e-05, step=3544] Training: 35%|███▌ | 3545/10000 [48:04<1:15:03, 1.43it/s, loss=0.0062, lr=1.98e-05, step=3545] Training: 35%|███▌ | 3546/10000 [48:05<1:08:10, 1.58it/s, loss=0.0062, lr=1.98e-05, step=3545] Training: 35%|███▌ | 3546/10000 [48:05<1:08:10, 1.58it/s, loss=0.0074, lr=1.98e-05, step=3546] Training: 35%|███▌ | 3547/10000 [48:05<1:07:04, 1.60it/s, loss=0.0074, lr=1.98e-05, step=3546] Training: 35%|███▌ | 3547/10000 [48:05<1:07:04, 1.60it/s, loss=0.0222, lr=1.98e-05, step=3547] Training: 35%|███▌ | 3548/10000 [48:06<1:08:20, 1.57it/s, loss=0.0222, lr=1.98e-05, step=3547] Training: 35%|███▌ | 3548/10000 [48:06<1:08:20, 1.57it/s, loss=0.0224, lr=1.98e-05, step=3548] Training: 35%|███▌ | 3549/10000 [48:07<1:13:12, 1.47it/s, loss=0.0224, lr=1.98e-05, step=3548] Training: 35%|███▌ | 3549/10000 [48:07<1:13:12, 1.47it/s, loss=0.0188, lr=1.98e-05, step=3549]19:32:40.018 [I] step=3550 loss=0.0330 smoothed_loss=0.0205 lr=1.98e-05 grad_norm=0.3967 step_time=0.5749s data_time=0.1196s it/s=1.440 eta_to_10000=4478.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.1317 grad_arm_token_fuse=0.0489 grad_shared_expert=0.3646 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3550/10000 [48:08<1:22:35, 1.30it/s, loss=0.0188, lr=1.98e-05, step=3549] Training: 36%|███▌ | 3550/10000 [48:08<1:22:35, 1.30it/s, loss=0.0330, lr=1.97e-05, step=3550] Training: 36%|███▌ | 3551/10000 [48:08<1:13:55, 1.45it/s, loss=0.0330, lr=1.97e-05, step=3550] Training: 36%|███▌ | 3551/10000 [48:08<1:13:55, 1.45it/s, loss=0.0152, lr=1.97e-05, step=3551] Training: 36%|███▌ | 3552/10000 [48:09<1:15:49, 1.42it/s, loss=0.0152, lr=1.97e-05, step=3551] Training: 36%|███▌ | 3552/10000 [48:09<1:15:49, 1.42it/s, loss=0.0164, lr=1.97e-05, step=3552] Training: 36%|███▌ | 3553/10000 [48:10<1:15:54, 1.42it/s, loss=0.0164, lr=1.97e-05, step=3552] Training: 36%|███▌ | 3553/10000 [48:10<1:15:54, 1.42it/s, loss=0.0227, lr=1.97e-05, step=3553] Training: 36%|███▌ | 3554/10000 [48:10<1:10:08, 1.53it/s, loss=0.0227, lr=1.97e-05, step=3553] Training: 36%|███▌ | 3554/10000 [48:10<1:10:08, 1.53it/s, loss=0.0114, lr=1.97e-05, step=3554] Training: 36%|███▌ | 3555/10000 [48:11<1:06:16, 1.62it/s, loss=0.0114, lr=1.97e-05, step=3554] Training: 36%|███▌ | 3555/10000 [48:11<1:06:16, 1.62it/s, loss=0.0118, lr=1.97e-05, step=3555] Training: 36%|███▌ | 3556/10000 [48:11<1:10:12, 1.53it/s, loss=0.0118, lr=1.97e-05, step=3555] Training: 36%|███▌ | 3556/10000 [48:11<1:10:12, 1.53it/s, loss=0.0224, lr=1.97e-05, step=3556] Training: 36%|███▌ | 3557/10000 [48:12<1:20:56, 1.33it/s, loss=0.0224, lr=1.97e-05, step=3556] Training: 36%|███▌ | 3557/10000 [48:12<1:20:56, 1.33it/s, loss=0.0069, lr=1.97e-05, step=3557] Training: 36%|███▌ | 3558/10000 [48:13<1:13:14, 1.47it/s, loss=0.0069, lr=1.97e-05, step=3557] Training: 36%|███▌ | 3558/10000 [48:13<1:13:14, 1.47it/s, loss=0.0121, lr=1.97e-05, step=3558] Training: 36%|███▌ | 3559/10000 [48:14<1:12:01, 1.49it/s, loss=0.0121, lr=1.97e-05, step=3558] Training: 36%|███▌ | 3559/10000 [48:14<1:12:01, 1.49it/s, loss=0.0091, lr=1.97e-05, step=3559]19:32:46.452 [I] step=3560 loss=0.0231 smoothed_loss=0.0169 lr=1.97e-05 grad_norm=0.4770 step_time=0.5413s data_time=0.1021s it/s=1.555 eta_to_10000=4142.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0216 grad_action_out_proj_arms=0.1530 grad_arm_token_fuse=0.1088 grad_shared_expert=0.4076 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3560/10000 [48:14<1:07:30, 1.59it/s, loss=0.0091, lr=1.97e-05, step=3559] Training: 36%|███▌ | 3560/10000 [48:14<1:07:30, 1.59it/s, loss=0.0231, lr=1.97e-05, step=3560] Training: 36%|███▌ | 3561/10000 [48:15<1:12:19, 1.48it/s, loss=0.0231, lr=1.97e-05, step=3560] Training: 36%|███▌ | 3561/10000 [48:15<1:12:19, 1.48it/s, loss=0.0150, lr=1.97e-05, step=3561] Training: 36%|███▌ | 3562/10000 [48:15<1:09:36, 1.54it/s, loss=0.0150, lr=1.97e-05, step=3561] Training: 36%|███▌ | 3562/10000 [48:15<1:09:36, 1.54it/s, loss=0.0108, lr=1.97e-05, step=3562] Training: 36%|███▌ | 3563/10000 [48:16<1:14:05, 1.45it/s, loss=0.0108, lr=1.97e-05, step=3562] Training: 36%|███▌ | 3563/10000 [48:16<1:14:05, 1.45it/s, loss=0.1224, lr=1.97e-05, step=3563] Training: 36%|███▌ | 3564/10000 [48:17<1:25:11, 1.26it/s, loss=0.1224, lr=1.97e-05, step=3563] Training: 36%|███▌ | 3564/10000 [48:17<1:25:11, 1.26it/s, loss=0.0377, lr=1.97e-05, step=3564] Training: 36%|███▌ | 3565/10000 [48:18<1:25:14, 1.26it/s, loss=0.0377, lr=1.97e-05, step=3564] Training: 36%|███▌ | 3565/10000 [48:18<1:25:14, 1.26it/s, loss=0.0131, lr=1.97e-05, step=3565] Training: 36%|███▌ | 3566/10000 [48:19<1:19:55, 1.34it/s, loss=0.0131, lr=1.97e-05, step=3565] Training: 36%|███▌ | 3566/10000 [48:19<1:19:55, 1.34it/s, loss=0.0093, lr=1.97e-05, step=3566] Training: 36%|███▌ | 3567/10000 [48:20<1:20:27, 1.33it/s, loss=0.0093, lr=1.97e-05, step=3566] Training: 36%|███▌ | 3567/10000 [48:20<1:20:27, 1.33it/s, loss=0.0410, lr=1.97e-05, step=3567] Training: 36%|███▌ | 3568/10000 [48:20<1:16:10, 1.41it/s, loss=0.0410, lr=1.97e-05, step=3567] Training: 36%|███▌ | 3568/10000 [48:20<1:16:10, 1.41it/s, loss=0.0239, lr=1.97e-05, step=3568] Training: 36%|███▌ | 3569/10000 [48:21<1:10:36, 1.52it/s, loss=0.0239, lr=1.97e-05, step=3568] Training: 36%|███▌ | 3569/10000 [48:21<1:10:36, 1.52it/s, loss=0.0102, lr=1.97e-05, step=3569]19:32:53.625 [I] step=3570 loss=0.0169 smoothed_loss=0.0237 lr=1.97e-05 grad_norm=0.5681 step_time=0.5913s data_time=0.1260s it/s=1.394 eta_to_10000=4611.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.1232 grad_arm_token_fuse=0.0598 grad_shared_expert=0.2965 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3570/10000 [48:21<1:09:53, 1.53it/s, loss=0.0102, lr=1.97e-05, step=3569] Training: 36%|███▌ | 3570/10000 [48:21<1:09:53, 1.53it/s, loss=0.0169, lr=1.97e-05, step=3570] Training: 36%|███▌ | 3571/10000 [48:22<1:13:29, 1.46it/s, loss=0.0169, lr=1.97e-05, step=3570] Training: 36%|███▌ | 3571/10000 [48:22<1:13:29, 1.46it/s, loss=0.0036, lr=1.97e-05, step=3571] Training: 36%|███▌ | 3572/10000 [48:23<1:22:52, 1.29it/s, loss=0.0036, lr=1.97e-05, step=3571] Training: 36%|███▌ | 3572/10000 [48:23<1:22:52, 1.29it/s, loss=0.0079, lr=1.97e-05, step=3572] Training: 36%|███▌ | 3573/10000 [48:24<1:26:29, 1.24it/s, loss=0.0079, lr=1.97e-05, step=3572] Training: 36%|███▌ | 3573/10000 [48:24<1:26:29, 1.24it/s, loss=0.0258, lr=1.97e-05, step=3573] Training: 36%|███▌ | 3574/10000 [48:25<1:21:13, 1.32it/s, loss=0.0258, lr=1.97e-05, step=3573] Training: 36%|███▌ | 3574/10000 [48:25<1:21:13, 1.32it/s, loss=0.0231, lr=1.97e-05, step=3574] Training: 36%|███▌ | 3575/10000 [48:25<1:21:51, 1.31it/s, loss=0.0231, lr=1.97e-05, step=3574] Training: 36%|███▌ | 3575/10000 [48:25<1:21:51, 1.31it/s, loss=0.0204, lr=1.97e-05, step=3575] Training: 36%|███▌ | 3576/10000 [48:26<1:13:18, 1.46it/s, loss=0.0204, lr=1.97e-05, step=3575] Training: 36%|███▌ | 3576/10000 [48:26<1:13:18, 1.46it/s, loss=0.0531, lr=1.97e-05, step=3576] Training: 36%|███▌ | 3577/10000 [48:27<1:16:52, 1.39it/s, loss=0.0531, lr=1.97e-05, step=3576] Training: 36%|███▌ | 3577/10000 [48:27<1:16:52, 1.39it/s, loss=0.0237, lr=1.97e-05, step=3577] Training: 36%|███▌ | 3578/10000 [48:27<1:18:56, 1.36it/s, loss=0.0237, lr=1.97e-05, step=3577] Training: 36%|███▌ | 3578/10000 [48:27<1:18:56, 1.36it/s, loss=0.0479, lr=1.97e-05, step=3578] Training: 36%|███▌ | 3579/10000 [48:28<1:24:59, 1.26it/s, loss=0.0479, lr=1.97e-05, step=3578] Training: 36%|███▌ | 3579/10000 [48:28<1:24:59, 1.26it/s, loss=0.0187, lr=1.97e-05, step=3579]19:33:01.211 [I] step=3580 loss=0.0296 smoothed_loss=0.0261 lr=1.97e-05 grad_norm=0.4917 step_time=0.6215s data_time=0.1370s it/s=1.319 eta_to_10000=4868.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.1541 grad_arm_token_fuse=0.0735 grad_shared_expert=0.4130 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3580/10000 [48:29<1:16:28, 1.40it/s, loss=0.0187, lr=1.97e-05, step=3579] Training: 36%|███▌ | 3580/10000 [48:29<1:16:28, 1.40it/s, loss=0.0296, lr=1.97e-05, step=3580] Training: 36%|███▌ | 3581/10000 [48:30<1:16:20, 1.40it/s, loss=0.0296, lr=1.97e-05, step=3580] Training: 36%|███▌ | 3581/10000 [48:30<1:16:20, 1.40it/s, loss=0.0171, lr=1.97e-05, step=3581] Training: 36%|███▌ | 3582/10000 [48:30<1:16:12, 1.40it/s, loss=0.0171, lr=1.97e-05, step=3581] Training: 36%|███▌ | 3582/10000 [48:30<1:16:12, 1.40it/s, loss=0.0288, lr=1.96e-05, step=3582] Training: 36%|███▌ | 3583/10000 [48:31<1:19:38, 1.34it/s, loss=0.0288, lr=1.96e-05, step=3582] Training: 36%|███▌ | 3583/10000 [48:31<1:19:38, 1.34it/s, loss=0.0067, lr=1.96e-05, step=3583] Training: 36%|███▌ | 3584/10000 [48:32<1:18:58, 1.35it/s, loss=0.0067, lr=1.96e-05, step=3583] Training: 36%|███▌ | 3584/10000 [48:32<1:18:58, 1.35it/s, loss=0.0653, lr=1.96e-05, step=3584] Training: 36%|███▌ | 3585/10000 [48:33<1:22:14, 1.30it/s, loss=0.0653, lr=1.96e-05, step=3584] Training: 36%|███▌ | 3585/10000 [48:33<1:22:14, 1.30it/s, loss=0.0246, lr=1.96e-05, step=3585] Training: 36%|███▌ | 3586/10000 [48:34<1:26:26, 1.24it/s, loss=0.0246, lr=1.96e-05, step=3585] Training: 36%|███▌ | 3586/10000 [48:34<1:26:26, 1.24it/s, loss=0.0209, lr=1.96e-05, step=3586] Training: 36%|███▌ | 3587/10000 [48:34<1:19:18, 1.35it/s, loss=0.0209, lr=1.96e-05, step=3586] Training: 36%|███▌ | 3587/10000 [48:34<1:19:18, 1.35it/s, loss=0.0222, lr=1.96e-05, step=3587] Training: 36%|███▌ | 3588/10000 [48:35<1:13:27, 1.45it/s, loss=0.0222, lr=1.96e-05, step=3587] Training: 36%|███▌ | 3588/10000 [48:35<1:13:27, 1.45it/s, loss=0.0359, lr=1.96e-05, step=3588] Training: 36%|███▌ | 3589/10000 [48:36<1:19:28, 1.34it/s, loss=0.0359, lr=1.96e-05, step=3588] Training: 36%|███▌ | 3589/10000 [48:36<1:19:28, 1.34it/s, loss=0.0166, lr=1.96e-05, step=3589]19:33:08.659 [I] step=3590 loss=0.0198 smoothed_loss=0.0256 lr=1.96e-05 grad_norm=0.5846 step_time=0.5885s data_time=0.1563s it/s=1.343 eta_to_10000=4773.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0214 grad_action_out_proj_arms=0.1859 grad_arm_token_fuse=0.1112 grad_shared_expert=0.4917 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3590/10000 [48:36<1:18:42, 1.36it/s, loss=0.0166, lr=1.96e-05, step=3589] Training: 36%|███▌ | 3590/10000 [48:36<1:18:42, 1.36it/s, loss=0.0198, lr=1.96e-05, step=3590] Training: 36%|███▌ | 3591/10000 [48:37<1:16:56, 1.39it/s, loss=0.0198, lr=1.96e-05, step=3590] Training: 36%|███▌ | 3591/10000 [48:37<1:16:56, 1.39it/s, loss=0.0200, lr=1.96e-05, step=3591] Training: 36%|███▌ | 3592/10000 [48:38<1:10:55, 1.51it/s, loss=0.0200, lr=1.96e-05, step=3591] Training: 36%|███▌ | 3592/10000 [48:38<1:10:55, 1.51it/s, loss=0.0383, lr=1.96e-05, step=3592] Training: 36%|███▌ | 3593/10000 [48:38<1:13:49, 1.45it/s, loss=0.0383, lr=1.96e-05, step=3592] Training: 36%|███▌ | 3593/10000 [48:38<1:13:49, 1.45it/s, loss=0.0786, lr=1.96e-05, step=3593] Training: 36%|███▌ | 3594/10000 [48:39<1:10:01, 1.52it/s, loss=0.0786, lr=1.96e-05, step=3593] Training: 36%|███▌ | 3594/10000 [48:39<1:10:01, 1.52it/s, loss=0.0239, lr=1.96e-05, step=3594] Training: 36%|███▌ | 3595/10000 [48:40<1:09:25, 1.54it/s, loss=0.0239, lr=1.96e-05, step=3594] Training: 36%|███▌ | 3595/10000 [48:40<1:09:25, 1.54it/s, loss=0.0163, lr=1.96e-05, step=3595] Training: 36%|███▌ | 3596/10000 [48:40<1:12:30, 1.47it/s, loss=0.0163, lr=1.96e-05, step=3595] Training: 36%|███▌ | 3596/10000 [48:40<1:12:30, 1.47it/s, loss=0.0151, lr=1.96e-05, step=3596] Training: 36%|███▌ | 3597/10000 [48:41<1:12:56, 1.46it/s, loss=0.0151, lr=1.96e-05, step=3596] Training: 36%|███▌ | 3597/10000 [48:41<1:12:56, 1.46it/s, loss=0.0224, lr=1.96e-05, step=3597] Training: 36%|███▌ | 3598/10000 [48:41<1:07:49, 1.57it/s, loss=0.0224, lr=1.96e-05, step=3597] Training: 36%|███▌ | 3598/10000 [48:41<1:07:49, 1.57it/s, loss=0.0161, lr=1.96e-05, step=3598] Training: 36%|███▌ | 3599/10000 [48:42<1:04:06, 1.66it/s, loss=0.0161, lr=1.96e-05, step=3598] Training: 36%|███▌ | 3599/10000 [48:42<1:04:06, 1.66it/s, loss=0.0286, lr=1.96e-05, step=3599]19:33:15.094 [I] step=3600 loss=0.0129 smoothed_loss=0.0251 lr=1.96e-05 grad_norm=0.5740 step_time=0.5494s data_time=0.0940s it/s=1.554 eta_to_10000=4117.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0282 grad_action_out_proj_arms=0.2098 grad_arm_token_fuse=0.1501 grad_shared_expert=0.4786 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3600/10000 [48:43<1:09:29, 1.53it/s, loss=0.0286, lr=1.96e-05, step=3599] Training: 36%|███▌ | 3600/10000 [48:43<1:09:29, 1.53it/s, loss=0.0129, lr=1.96e-05, step=3600] Training: 36%|███▌ | 3601/10000 [48:43<1:04:17, 1.66it/s, loss=0.0129, lr=1.96e-05, step=3600] Training: 36%|███▌ | 3601/10000 [48:43<1:04:17, 1.66it/s, loss=0.0217, lr=1.96e-05, step=3601] Training: 36%|███▌ | 3602/10000 [48:44<1:02:11, 1.71it/s, loss=0.0217, lr=1.96e-05, step=3601] Training: 36%|███▌ | 3602/10000 [48:44<1:02:11, 1.71it/s, loss=0.0122, lr=1.96e-05, step=3602] Training: 36%|███▌ | 3603/10000 [48:45<1:08:38, 1.55it/s, loss=0.0122, lr=1.96e-05, step=3602] Training: 36%|███▌ | 3603/10000 [48:45<1:08:38, 1.55it/s, loss=0.0063, lr=1.96e-05, step=3603] Training: 36%|███▌ | 3604/10000 [48:45<1:08:45, 1.55it/s, loss=0.0063, lr=1.96e-05, step=3603] Training: 36%|███▌ | 3604/10000 [48:45<1:08:45, 1.55it/s, loss=0.0152, lr=1.96e-05, step=3604] Training: 36%|███▌ | 3605/10000 [48:46<1:09:35, 1.53it/s, loss=0.0152, lr=1.96e-05, step=3604] Training: 36%|███▌ | 3605/10000 [48:46<1:09:35, 1.53it/s, loss=0.0099, lr=1.96e-05, step=3605] Training: 36%|███▌ | 3606/10000 [48:47<1:09:50, 1.53it/s, loss=0.0099, lr=1.96e-05, step=3605] Training: 36%|███▌ | 3606/10000 [48:47<1:09:50, 1.53it/s, loss=0.0353, lr=1.96e-05, step=3606] Training: 36%|███▌ | 3607/10000 [48:47<1:13:35, 1.45it/s, loss=0.0353, lr=1.96e-05, step=3606] Training: 36%|███▌ | 3607/10000 [48:47<1:13:35, 1.45it/s, loss=0.0095, lr=1.96e-05, step=3607] Training: 36%|███▌ | 3608/10000 [48:48<1:08:51, 1.55it/s, loss=0.0095, lr=1.96e-05, step=3607] Training: 36%|███▌ | 3608/10000 [48:48<1:08:51, 1.55it/s, loss=0.0129, lr=1.96e-05, step=3608] Training: 36%|███▌ | 3609/10000 [48:49<1:11:53, 1.48it/s, loss=0.0129, lr=1.96e-05, step=3608] Training: 36%|███▌ | 3609/10000 [48:49<1:11:53, 1.48it/s, loss=0.0266, lr=1.96e-05, step=3609]19:33:21.507 [I] step=3610 loss=0.0077 smoothed_loss=0.0190 lr=1.96e-05 grad_norm=0.4358 step_time=0.5295s data_time=0.1119s it/s=1.559 eta_to_10000=4098.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0180 grad_action_out_proj_arms=0.1762 grad_arm_token_fuse=0.0953 grad_shared_expert=0.4759 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3610/10000 [48:49<1:08:20, 1.56it/s, loss=0.0266, lr=1.96e-05, step=3609] Training: 36%|███▌ | 3610/10000 [48:49<1:08:20, 1.56it/s, loss=0.0077, lr=1.96e-05, step=3610] Training: 36%|███▌ | 3611/10000 [48:50<1:08:59, 1.54it/s, loss=0.0077, lr=1.96e-05, step=3610] Training: 36%|███▌ | 3611/10000 [48:50<1:08:59, 1.54it/s, loss=0.0119, lr=1.96e-05, step=3611] Training: 36%|███▌ | 3612/10000 [48:50<1:05:27, 1.63it/s, loss=0.0119, lr=1.96e-05, step=3611] Training: 36%|███▌ | 3612/10000 [48:50<1:05:27, 1.63it/s, loss=0.0137, lr=1.96e-05, step=3612] Training: 36%|███▌ | 3613/10000 [48:51<1:03:38, 1.67it/s, loss=0.0137, lr=1.96e-05, step=3612] Training: 36%|███▌ | 3613/10000 [48:51<1:03:38, 1.67it/s, loss=0.0083, lr=1.96e-05, step=3613] Training: 36%|███▌ | 3614/10000 [48:52<1:10:19, 1.51it/s, loss=0.0083, lr=1.96e-05, step=3613] Training: 36%|███▌ | 3614/10000 [48:52<1:10:19, 1.51it/s, loss=0.0137, lr=1.95e-05, step=3614] Training: 36%|███▌ | 3615/10000 [48:52<1:13:17, 1.45it/s, loss=0.0137, lr=1.95e-05, step=3614] Training: 36%|███▌ | 3615/10000 [48:52<1:13:17, 1.45it/s, loss=0.0133, lr=1.95e-05, step=3615] Training: 36%|███▌ | 3616/10000 [48:53<1:07:23, 1.58it/s, loss=0.0133, lr=1.95e-05, step=3615] Training: 36%|███▌ | 3616/10000 [48:53<1:07:23, 1.58it/s, loss=0.0184, lr=1.95e-05, step=3616] Training: 36%|███▌ | 3617/10000 [48:54<1:07:31, 1.58it/s, loss=0.0184, lr=1.95e-05, step=3616] Training: 36%|███▌ | 3617/10000 [48:54<1:07:31, 1.58it/s, loss=0.0207, lr=1.95e-05, step=3617] Training: 36%|███▌ | 3618/10000 [48:54<1:03:22, 1.68it/s, loss=0.0207, lr=1.95e-05, step=3617] Training: 36%|███▌ | 3618/10000 [48:54<1:03:22, 1.68it/s, loss=0.0060, lr=1.95e-05, step=3618] Training: 36%|███▌ | 3619/10000 [48:55<1:04:55, 1.64it/s, loss=0.0060, lr=1.95e-05, step=3618] Training: 36%|███▌ | 3619/10000 [48:55<1:04:55, 1.64it/s, loss=0.0207, lr=1.95e-05, step=3619]19:33:27.708 [I] step=3620 loss=0.0167 smoothed_loss=0.0163 lr=1.95e-05 grad_norm=0.5419 step_time=0.5425s data_time=0.0775s it/s=1.613 eta_to_10000=3955.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.1153 grad_arm_token_fuse=0.0532 grad_shared_expert=0.3426 (18633:train_pytorch.py:850) + Training: 36%|███▌ | 3620/10000 [48:55<1:04:13, 1.66it/s, loss=0.0207, lr=1.95e-05, step=3619] Training: 36%|███▌ | 3620/10000 [48:55<1:04:13, 1.66it/s, loss=0.0167, lr=1.95e-05, step=3620] Training: 36%|███▌ | 3621/10000 [48:56<1:16:53, 1.38it/s, loss=0.0167, lr=1.95e-05, step=3620] Training: 36%|███▌ | 3621/10000 [48:56<1:16:53, 1.38it/s, loss=0.0327, lr=1.95e-05, step=3621] Training: 36%|███▌ | 3622/10000 [48:57<1:27:19, 1.22it/s, loss=0.0327, lr=1.95e-05, step=3621] Training: 36%|███▌ | 3622/10000 [48:57<1:27:19, 1.22it/s, loss=0.0074, lr=1.95e-05, step=3622] Training: 36%|███▌ | 3623/10000 [48:58<1:19:10, 1.34it/s, loss=0.0074, lr=1.95e-05, step=3622] Training: 36%|███▌ | 3623/10000 [48:58<1:19:10, 1.34it/s, loss=0.0947, lr=1.95e-05, step=3623] Training: 36%|███▌ | 3624/10000 [48:59<1:15:33, 1.41it/s, loss=0.0947, lr=1.95e-05, step=3623] Training: 36%|███▌ | 3624/10000 [48:59<1:15:33, 1.41it/s, loss=0.0173, lr=1.95e-05, step=3624] Training: 36%|███▋ | 3625/10000 [49:00<1:22:49, 1.28it/s, loss=0.0173, lr=1.95e-05, step=3624] Training: 36%|███▋ | 3625/10000 [49:00<1:22:49, 1.28it/s, loss=0.0103, lr=1.95e-05, step=3625] Training: 36%|███▋ | 3626/10000 [49:00<1:18:53, 1.35it/s, loss=0.0103, lr=1.95e-05, step=3625] Training: 36%|███▋ | 3626/10000 [49:00<1:18:53, 1.35it/s, loss=0.0071, lr=1.95e-05, step=3626] Training: 36%|███▋ | 3627/10000 [49:01<1:12:46, 1.46it/s, loss=0.0071, lr=1.95e-05, step=3626] Training: 36%|███▋ | 3627/10000 [49:01<1:12:46, 1.46it/s, loss=0.0294, lr=1.95e-05, step=3627] Training: 36%|███▋ | 3628/10000 [49:01<1:12:25, 1.47it/s, loss=0.0294, lr=1.95e-05, step=3627] Training: 36%|███▋ | 3628/10000 [49:01<1:12:25, 1.47it/s, loss=0.0077, lr=1.95e-05, step=3628] Training: 36%|███▋ | 3629/10000 [49:02<1:14:24, 1.43it/s, loss=0.0077, lr=1.95e-05, step=3628] Training: 36%|███▋ | 3629/10000 [49:02<1:14:24, 1.43it/s, loss=0.0111, lr=1.95e-05, step=3629]19:33:35.286 [I] step=3630 loss=0.0066 smoothed_loss=0.0182 lr=1.95e-05 grad_norm=0.5573 step_time=0.6485s data_time=0.1094s it/s=1.320 eta_to_10000=4827.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0166 grad_action_out_proj_arms=0.1491 grad_arm_token_fuse=0.0762 grad_shared_expert=0.9022 (18633:train_pytorch.py:850) + Training: 36%|███▋ | 3630/10000 [49:03<1:16:28, 1.39it/s, loss=0.0111, lr=1.95e-05, step=3629] Training: 36%|███▋ | 3630/10000 [49:03<1:16:28, 1.39it/s, loss=0.0066, lr=1.95e-05, step=3630] Training: 36%|███▋ | 3631/10000 [49:04<1:15:11, 1.41it/s, loss=0.0066, lr=1.95e-05, step=3630] Training: 36%|███▋ | 3631/10000 [49:04<1:15:11, 1.41it/s, loss=0.0108, lr=1.95e-05, step=3631] Training: 36%|███▋ | 3632/10000 [49:04<1:16:42, 1.38it/s, loss=0.0108, lr=1.95e-05, step=3631] Training: 36%|███▋ | 3632/10000 [49:04<1:16:42, 1.38it/s, loss=0.0212, lr=1.95e-05, step=3632] Training: 36%|███▋ | 3633/10000 [49:05<1:14:28, 1.42it/s, loss=0.0212, lr=1.95e-05, step=3632] Training: 36%|███▋ | 3633/10000 [49:05<1:14:28, 1.42it/s, loss=0.0170, lr=1.95e-05, step=3633] Training: 36%|███▋ | 3634/10000 [49:06<1:24:32, 1.25it/s, loss=0.0170, lr=1.95e-05, step=3633] Training: 36%|███▋ | 3634/10000 [49:06<1:24:32, 1.25it/s, loss=0.0155, lr=1.95e-05, step=3634] Training: 36%|███▋ | 3635/10000 [49:07<1:16:23, 1.39it/s, loss=0.0155, lr=1.95e-05, step=3634] Training: 36%|███▋ | 3635/10000 [49:07<1:16:23, 1.39it/s, loss=0.0250, lr=1.95e-05, step=3635] Training: 36%|███▋ | 3636/10000 [49:07<1:17:32, 1.37it/s, loss=0.0250, lr=1.95e-05, step=3635] Training: 36%|███▋ | 3636/10000 [49:07<1:17:32, 1.37it/s, loss=0.0141, lr=1.95e-05, step=3636] Training: 36%|███▋ | 3637/10000 [49:08<1:11:20, 1.49it/s, loss=0.0141, lr=1.95e-05, step=3636] Training: 36%|███▋ | 3637/10000 [49:08<1:11:20, 1.49it/s, loss=0.0065, lr=1.95e-05, step=3637] Training: 36%|███▋ | 3638/10000 [49:09<1:15:21, 1.41it/s, loss=0.0065, lr=1.95e-05, step=3637] Training: 36%|███▋ | 3638/10000 [49:09<1:15:21, 1.41it/s, loss=0.0162, lr=1.95e-05, step=3638] Training: 36%|███▋ | 3639/10000 [49:09<1:12:53, 1.45it/s, loss=0.0162, lr=1.95e-05, step=3638] Training: 36%|███▋ | 3639/10000 [49:09<1:12:53, 1.45it/s, loss=0.0274, lr=1.95e-05, step=3639]19:33:42.257 [I] step=3640 loss=0.0159 smoothed_loss=0.0176 lr=1.95e-05 grad_norm=0.5065 step_time=0.5534s data_time=0.1436s it/s=1.436 eta_to_10000=4430.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0105 grad_action_out_proj_arms=0.1109 grad_arm_token_fuse=0.0551 grad_shared_expert=0.3667 (18633:train_pytorch.py:850) + Training: 36%|███▋ | 3640/10000 [49:10<1:09:49, 1.52it/s, loss=0.0274, lr=1.95e-05, step=3639] Training: 36%|███▋ | 3640/10000 [49:10<1:09:49, 1.52it/s, loss=0.0159, lr=1.95e-05, step=3640] Training: 36%|███▋ | 3641/10000 [49:11<1:12:07, 1.47it/s, loss=0.0159, lr=1.95e-05, step=3640] Training: 36%|███▋ | 3641/10000 [49:11<1:12:07, 1.47it/s, loss=0.0212, lr=1.95e-05, step=3641] Training: 36%|███▋ | 3642/10000 [49:11<1:13:06, 1.45it/s, loss=0.0212, lr=1.95e-05, step=3641] Training: 36%|███▋ | 3642/10000 [49:11<1:13:06, 1.45it/s, loss=0.0261, lr=1.95e-05, step=3642] Training: 36%|███▋ | 3643/10000 [49:12<1:19:36, 1.33it/s, loss=0.0261, lr=1.95e-05, step=3642] Training: 36%|███▋ | 3643/10000 [49:12<1:19:36, 1.33it/s, loss=0.0064, lr=1.95e-05, step=3643] Training: 36%|███▋ | 3644/10000 [49:13<1:15:10, 1.41it/s, loss=0.0064, lr=1.95e-05, step=3643] Training: 36%|███▋ | 3644/10000 [49:13<1:15:10, 1.41it/s, loss=0.0057, lr=1.95e-05, step=3644] Training: 36%|███▋ | 3645/10000 [49:14<1:13:36, 1.44it/s, loss=0.0057, lr=1.95e-05, step=3644] Training: 36%|███▋ | 3645/10000 [49:14<1:13:36, 1.44it/s, loss=0.0042, lr=1.94e-05, step=3645] Training: 36%|███▋ | 3646/10000 [49:14<1:08:35, 1.54it/s, loss=0.0042, lr=1.94e-05, step=3645] Training: 36%|███▋ | 3646/10000 [49:14<1:08:35, 1.54it/s, loss=0.0045, lr=1.94e-05, step=3646] Training: 36%|███▋ | 3647/10000 [49:15<1:07:56, 1.56it/s, loss=0.0045, lr=1.94e-05, step=3646] Training: 36%|███▋ | 3647/10000 [49:15<1:07:56, 1.56it/s, loss=0.0490, lr=1.94e-05, step=3647] Training: 36%|███▋ | 3648/10000 [49:15<1:04:00, 1.65it/s, loss=0.0490, lr=1.94e-05, step=3647] Training: 36%|███▋ | 3648/10000 [49:15<1:04:00, 1.65it/s, loss=0.0109, lr=1.94e-05, step=3648] Training: 36%|███▋ | 3649/10000 [49:16<1:06:47, 1.58it/s, loss=0.0109, lr=1.94e-05, step=3648] Training: 36%|███▋ | 3649/10000 [49:16<1:06:47, 1.58it/s, loss=0.0189, lr=1.94e-05, step=3649]19:33:49.080 [I] step=3650 loss=0.0184 smoothed_loss=0.0172 lr=1.94e-05 grad_norm=0.4466 step_time=0.5745s data_time=0.1079s it/s=1.466 eta_to_10000=4332.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0139 grad_action_out_proj_arms=0.1293 grad_arm_token_fuse=0.0714 grad_shared_expert=0.4178 (18633:train_pytorch.py:850) + Training: 36%|███▋ | 3650/10000 [49:17<1:13:18, 1.44it/s, loss=0.0189, lr=1.94e-05, step=3649] Training: 36%|███▋ | 3650/10000 [49:17<1:13:18, 1.44it/s, loss=0.0184, lr=1.94e-05, step=3650] Training: 37%|███▋ | 3651/10000 [49:17<1:10:29, 1.50it/s, loss=0.0184, lr=1.94e-05, step=3650] Training: 37%|███▋ | 3651/10000 [49:17<1:10:29, 1.50it/s, loss=0.0117, lr=1.94e-05, step=3651] Training: 37%|███▋ | 3652/10000 [49:18<1:07:31, 1.57it/s, loss=0.0117, lr=1.94e-05, step=3651] Training: 37%|███▋ | 3652/10000 [49:18<1:07:31, 1.57it/s, loss=0.0118, lr=1.94e-05, step=3652] Training: 37%|███▋ | 3653/10000 [49:19<1:08:15, 1.55it/s, loss=0.0118, lr=1.94e-05, step=3652] Training: 37%|███▋ | 3653/10000 [49:19<1:08:15, 1.55it/s, loss=0.0220, lr=1.94e-05, step=3653] Training: 37%|███▋ | 3654/10000 [49:19<1:06:18, 1.60it/s, loss=0.0220, lr=1.94e-05, step=3653] Training: 37%|███▋ | 3654/10000 [49:19<1:06:18, 1.60it/s, loss=0.0307, lr=1.94e-05, step=3654] Training: 37%|███▋ | 3655/10000 [49:20<1:03:58, 1.65it/s, loss=0.0307, lr=1.94e-05, step=3654] Training: 37%|███▋ | 3655/10000 [49:20<1:03:58, 1.65it/s, loss=0.0297, lr=1.94e-05, step=3655] Training: 37%|███▋ | 3656/10000 [49:20<1:01:01, 1.73it/s, loss=0.0297, lr=1.94e-05, step=3655] Training: 37%|███▋ | 3656/10000 [49:20<1:01:01, 1.73it/s, loss=0.0208, lr=1.94e-05, step=3656] Training: 37%|███▋ | 3657/10000 [49:21<1:11:56, 1.47it/s, loss=0.0208, lr=1.94e-05, step=3656] Training: 37%|███▋ | 3657/10000 [49:21<1:11:56, 1.47it/s, loss=0.0061, lr=1.94e-05, step=3657] Training: 37%|███▋ | 3658/10000 [49:22<1:07:44, 1.56it/s, loss=0.0061, lr=1.94e-05, step=3657] Training: 37%|███▋ | 3658/10000 [49:22<1:07:44, 1.56it/s, loss=0.0086, lr=1.94e-05, step=3658] Training: 37%|███▋ | 3659/10000 [49:22<1:07:05, 1.58it/s, loss=0.0086, lr=1.94e-05, step=3658] Training: 37%|███▋ | 3659/10000 [49:22<1:07:05, 1.58it/s, loss=0.0221, lr=1.94e-05, step=3659]19:33:55.274 [I] step=3660 loss=0.0498 smoothed_loss=0.0209 lr=1.94e-05 grad_norm=0.5216 step_time=0.5204s data_time=0.0991s it/s=1.615 eta_to_10000=3926.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0246 grad_action_out_proj_arms=0.1917 grad_arm_token_fuse=0.1282 grad_shared_expert=0.5223 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3660/10000 [49:23<1:06:25, 1.59it/s, loss=0.0221, lr=1.94e-05, step=3659] Training: 37%|███▋ | 3660/10000 [49:23<1:06:25, 1.59it/s, loss=0.0498, lr=1.94e-05, step=3660] Training: 37%|███▋ | 3661/10000 [49:23<1:03:20, 1.67it/s, loss=0.0498, lr=1.94e-05, step=3660] Training: 37%|███▋ | 3661/10000 [49:23<1:03:20, 1.67it/s, loss=0.0062, lr=1.94e-05, step=3661] Training: 37%|███▋ | 3662/10000 [49:24<1:01:17, 1.72it/s, loss=0.0062, lr=1.94e-05, step=3661] Training: 37%|███▋ | 3662/10000 [49:24<1:01:17, 1.72it/s, loss=0.0126, lr=1.94e-05, step=3662] Training: 37%|███▋ | 3663/10000 [49:25<1:00:25, 1.75it/s, loss=0.0126, lr=1.94e-05, step=3662] Training: 37%|███▋ | 3663/10000 [49:25<1:00:25, 1.75it/s, loss=0.0323, lr=1.94e-05, step=3663] Training: 37%|███▋ | 3664/10000 [49:25<1:07:47, 1.56it/s, loss=0.0323, lr=1.94e-05, step=3663] Training: 37%|███▋ | 3664/10000 [49:25<1:07:47, 1.56it/s, loss=0.0071, lr=1.94e-05, step=3664] Training: 37%|███▋ | 3665/10000 [49:26<1:12:39, 1.45it/s, loss=0.0071, lr=1.94e-05, step=3664] Training: 37%|███▋ | 3665/10000 [49:26<1:12:39, 1.45it/s, loss=0.0079, lr=1.94e-05, step=3665] Training: 37%|███▋ | 3666/10000 [49:27<1:08:15, 1.55it/s, loss=0.0079, lr=1.94e-05, step=3665] Training: 37%|███▋ | 3666/10000 [49:27<1:08:15, 1.55it/s, loss=0.0034, lr=1.94e-05, step=3666] Training: 37%|███▋ | 3667/10000 [49:27<1:04:15, 1.64it/s, loss=0.0034, lr=1.94e-05, step=3666] Training: 37%|███▋ | 3667/10000 [49:27<1:04:15, 1.64it/s, loss=0.0177, lr=1.94e-05, step=3667] Training: 37%|███▋ | 3668/10000 [49:28<1:01:50, 1.71it/s, loss=0.0177, lr=1.94e-05, step=3667] Training: 37%|███▋ | 3668/10000 [49:28<1:01:50, 1.71it/s, loss=0.0033, lr=1.94e-05, step=3668] Training: 37%|███▋ | 3669/10000 [49:28<1:00:04, 1.76it/s, loss=0.0033, lr=1.94e-05, step=3668] Training: 37%|███▋ | 3669/10000 [49:28<1:00:04, 1.76it/s, loss=0.0043, lr=1.94e-05, step=3669]19:34:01.259 [I] step=3670 loss=0.0059 smoothed_loss=0.0132 lr=1.94e-05 grad_norm=0.4846 step_time=0.5179s data_time=0.0805s it/s=1.671 eta_to_10000=3787.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0102 grad_action_out_proj_arms=0.1072 grad_arm_token_fuse=0.0471 grad_shared_expert=0.2792 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3670/10000 [49:29<1:02:00, 1.70it/s, loss=0.0043, lr=1.94e-05, step=3669] Training: 37%|███▋ | 3670/10000 [49:29<1:02:00, 1.70it/s, loss=0.0059, lr=1.94e-05, step=3670] Training: 37%|███▋ | 3671/10000 [49:30<1:07:57, 1.55it/s, loss=0.0059, lr=1.94e-05, step=3670] Training: 37%|███▋ | 3671/10000 [49:30<1:07:57, 1.55it/s, loss=0.0114, lr=1.94e-05, step=3671] Training: 37%|███▋ | 3672/10000 [49:30<1:11:29, 1.48it/s, loss=0.0114, lr=1.94e-05, step=3671] Training: 37%|███▋ | 3672/10000 [49:30<1:11:29, 1.48it/s, loss=0.0216, lr=1.94e-05, step=3672] Training: 37%|███▋ | 3673/10000 [49:31<1:08:43, 1.53it/s, loss=0.0216, lr=1.94e-05, step=3672] Training: 37%|███▋ | 3673/10000 [49:31<1:08:43, 1.53it/s, loss=0.0080, lr=1.94e-05, step=3673] Training: 37%|███▋ | 3674/10000 [49:32<1:04:52, 1.63it/s, loss=0.0080, lr=1.94e-05, step=3673] Training: 37%|███▋ | 3674/10000 [49:32<1:04:52, 1.63it/s, loss=0.0086, lr=1.94e-05, step=3674] Training: 37%|███▋ | 3675/10000 [49:32<1:01:28, 1.71it/s, loss=0.0086, lr=1.94e-05, step=3674] Training: 37%|███▋ | 3675/10000 [49:32<1:01:28, 1.71it/s, loss=0.0271, lr=1.94e-05, step=3675] Training: 37%|███▋ | 3676/10000 [49:33<59:32, 1.77it/s, loss=0.0271, lr=1.94e-05, step=3675] Training: 37%|███▋ | 3676/10000 [49:33<59:32, 1.77it/s, loss=0.0390, lr=1.93e-05, step=3676] Training: 37%|███▋ | 3677/10000 [49:33<59:08, 1.78it/s, loss=0.0390, lr=1.93e-05, step=3676] Training: 37%|███▋ | 3677/10000 [49:33<59:08, 1.78it/s, loss=0.0091, lr=1.93e-05, step=3677] Training: 37%|███▋ | 3678/10000 [49:34<1:05:19, 1.61it/s, loss=0.0091, lr=1.93e-05, step=3677] Training: 37%|███▋ | 3678/10000 [49:34<1:05:19, 1.61it/s, loss=0.0057, lr=1.93e-05, step=3678] Training: 37%|███▋ | 3679/10000 [49:35<1:08:42, 1.53it/s, loss=0.0057, lr=1.93e-05, step=3678] Training: 37%|███▋ | 3679/10000 [49:35<1:08:42, 1.53it/s, loss=0.0180, lr=1.93e-05, step=3679]19:34:07.589 [I] step=3680 loss=0.0155 smoothed_loss=0.0153 lr=1.93e-05 grad_norm=0.5026 step_time=0.5542s data_time=0.0788s it/s=1.580 eta_to_10000=4000.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0150 grad_action_out_proj_arms=0.1275 grad_arm_token_fuse=0.0771 grad_shared_expert=0.4339 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3680/10000 [49:35<1:07:19, 1.56it/s, loss=0.0180, lr=1.93e-05, step=3679] Training: 37%|███▋ | 3680/10000 [49:35<1:07:19, 1.56it/s, loss=0.0155, lr=1.93e-05, step=3680] Training: 37%|███▋ | 3681/10000 [49:36<1:03:16, 1.66it/s, loss=0.0155, lr=1.93e-05, step=3680] Training: 37%|███▋ | 3681/10000 [49:36<1:03:16, 1.66it/s, loss=0.0038, lr=1.93e-05, step=3681] Training: 37%|███▋ | 3682/10000 [49:36<1:01:24, 1.71it/s, loss=0.0038, lr=1.93e-05, step=3681] Training: 37%|███▋ | 3682/10000 [49:36<1:01:24, 1.71it/s, loss=0.0072, lr=1.93e-05, step=3682] Training: 37%|███▋ | 3683/10000 [49:37<58:44, 1.79it/s, loss=0.0072, lr=1.93e-05, step=3682] Training: 37%|███▋ | 3683/10000 [49:37<58:44, 1.79it/s, loss=0.0449, lr=1.93e-05, step=3683] Training: 37%|███▋ | 3684/10000 [49:37<58:13, 1.81it/s, loss=0.0449, lr=1.93e-05, step=3683] Training: 37%|███▋ | 3684/10000 [49:37<58:13, 1.81it/s, loss=0.0303, lr=1.93e-05, step=3684] Training: 37%|███▋ | 3685/10000 [49:38<58:21, 1.80it/s, loss=0.0303, lr=1.93e-05, step=3684] Training: 37%|███▋ | 3685/10000 [49:38<58:21, 1.80it/s, loss=0.0092, lr=1.93e-05, step=3685] Training: 37%|███▋ | 3686/10000 [49:39<1:04:57, 1.62it/s, loss=0.0092, lr=1.93e-05, step=3685] Training: 37%|███▋ | 3686/10000 [49:39<1:04:57, 1.62it/s, loss=0.0106, lr=1.93e-05, step=3686] Training: 37%|███▋ | 3687/10000 [49:39<1:07:13, 1.57it/s, loss=0.0106, lr=1.93e-05, step=3686] Training: 37%|███▋ | 3687/10000 [49:39<1:07:13, 1.57it/s, loss=0.0181, lr=1.93e-05, step=3687] Training: 37%|███▋ | 3688/10000 [49:40<1:04:13, 1.64it/s, loss=0.0181, lr=1.93e-05, step=3687] Training: 37%|███▋ | 3688/10000 [49:40<1:04:13, 1.64it/s, loss=0.0115, lr=1.93e-05, step=3688] Training: 37%|███▋ | 3689/10000 [49:40<1:02:50, 1.67it/s, loss=0.0115, lr=1.93e-05, step=3688] Training: 37%|███▋ | 3689/10000 [49:40<1:02:50, 1.67it/s, loss=0.0081, lr=1.93e-05, step=3689]19:34:13.379 [I] step=3690 loss=0.0150 smoothed_loss=0.0153 lr=1.93e-05 grad_norm=0.5147 step_time=0.5008s data_time=0.0781s it/s=1.728 eta_to_10000=3652.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0269 grad_action_out_proj_arms=0.1595 grad_arm_token_fuse=0.1438 grad_shared_expert=0.4751 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3690/10000 [49:41<1:02:04, 1.69it/s, loss=0.0081, lr=1.93e-05, step=3689] Training: 37%|███▋ | 3690/10000 [49:41<1:02:04, 1.69it/s, loss=0.0150, lr=1.93e-05, step=3690] Training: 37%|███▋ | 3691/10000 [49:42<1:00:38, 1.73it/s, loss=0.0150, lr=1.93e-05, step=3690] Training: 37%|███▋ | 3691/10000 [49:42<1:00:38, 1.73it/s, loss=0.0436, lr=1.93e-05, step=3691] Training: 37%|███▋ | 3692/10000 [49:42<59:05, 1.78it/s, loss=0.0436, lr=1.93e-05, step=3691] Training: 37%|███▋ | 3692/10000 [49:42<59:05, 1.78it/s, loss=0.0199, lr=1.93e-05, step=3692] Training: 37%|███▋ | 3693/10000 [49:43<1:08:47, 1.53it/s, loss=0.0199, lr=1.93e-05, step=3692] Training: 37%|███▋ | 3693/10000 [49:43<1:08:47, 1.53it/s, loss=0.0309, lr=1.93e-05, step=3693] Training: 37%|███▋ | 3694/10000 [49:44<1:10:02, 1.50it/s, loss=0.0309, lr=1.93e-05, step=3693] Training: 37%|███▋ | 3694/10000 [49:44<1:10:02, 1.50it/s, loss=0.0197, lr=1.93e-05, step=3694] Training: 37%|███▋ | 3695/10000 [49:44<1:09:06, 1.52it/s, loss=0.0197, lr=1.93e-05, step=3694] Training: 37%|███▋ | 3695/10000 [49:44<1:09:06, 1.52it/s, loss=0.0499, lr=1.93e-05, step=3695] Training: 37%|███▋ | 3696/10000 [49:45<1:11:47, 1.46it/s, loss=0.0499, lr=1.93e-05, step=3695] Training: 37%|███▋ | 3696/10000 [49:45<1:11:47, 1.46it/s, loss=0.0143, lr=1.93e-05, step=3696] Training: 37%|███▋ | 3697/10000 [49:46<1:13:31, 1.43it/s, loss=0.0143, lr=1.93e-05, step=3696] Training: 37%|███▋ | 3697/10000 [49:46<1:13:31, 1.43it/s, loss=0.0517, lr=1.93e-05, step=3697] Training: 37%|███▋ | 3698/10000 [49:47<1:15:06, 1.40it/s, loss=0.0517, lr=1.93e-05, step=3697] Training: 37%|███▋ | 3698/10000 [49:47<1:15:06, 1.40it/s, loss=0.0176, lr=1.93e-05, step=3698] Training: 37%|███▋ | 3699/10000 [49:47<1:16:13, 1.38it/s, loss=0.0176, lr=1.93e-05, step=3698] Training: 37%|███▋ | 3699/10000 [49:47<1:16:13, 1.38it/s, loss=0.0132, lr=1.93e-05, step=3699]19:34:20.409 [I] step=3700 loss=0.0120 smoothed_loss=0.0219 lr=1.93e-05 grad_norm=0.5938 step_time=0.5708s data_time=0.1323s it/s=1.423 eta_to_10000=4428.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0174 grad_action_out_proj_arms=0.1510 grad_arm_token_fuse=0.0887 grad_shared_expert=0.5320 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3700/10000 [49:48<1:17:42, 1.35it/s, loss=0.0132, lr=1.93e-05, step=3699] Training: 37%|███▋ | 3700/10000 [49:48<1:17:42, 1.35it/s, loss=0.0120, lr=1.93e-05, step=3700] Training: 37%|███▋ | 3701/10000 [49:49<1:12:37, 1.45it/s, loss=0.0120, lr=1.93e-05, step=3700] Training: 37%|███▋ | 3701/10000 [49:49<1:12:37, 1.45it/s, loss=0.0403, lr=1.93e-05, step=3701] Training: 37%|███▋ | 3702/10000 [49:49<1:12:54, 1.44it/s, loss=0.0403, lr=1.93e-05, step=3701] Training: 37%|███▋ | 3702/10000 [49:49<1:12:54, 1.44it/s, loss=0.0806, lr=1.93e-05, step=3702] Training: 37%|███▋ | 3703/10000 [49:50<1:10:25, 1.49it/s, loss=0.0806, lr=1.93e-05, step=3702] Training: 37%|███▋ | 3703/10000 [49:50<1:10:25, 1.49it/s, loss=0.0144, lr=1.93e-05, step=3703] Training: 37%|███▋ | 3704/10000 [49:50<1:04:48, 1.62it/s, loss=0.0144, lr=1.93e-05, step=3703] Training: 37%|███▋ | 3704/10000 [49:50<1:04:48, 1.62it/s, loss=0.0282, lr=1.93e-05, step=3704] Training: 37%|███▋ | 3705/10000 [49:51<1:09:27, 1.51it/s, loss=0.0282, lr=1.93e-05, step=3704] Training: 37%|███▋ | 3705/10000 [49:51<1:09:27, 1.51it/s, loss=0.0709, lr=1.93e-05, step=3705] Training: 37%|███▋ | 3706/10000 [49:52<1:08:02, 1.54it/s, loss=0.0709, lr=1.93e-05, step=3705] Training: 37%|███▋ | 3706/10000 [49:52<1:08:02, 1.54it/s, loss=0.0079, lr=1.93e-05, step=3706] Training: 37%|███▋ | 3707/10000 [49:53<1:19:11, 1.32it/s, loss=0.0079, lr=1.93e-05, step=3706] Training: 37%|███▋ | 3707/10000 [49:53<1:19:11, 1.32it/s, loss=0.0188, lr=1.92e-05, step=3707] Training: 37%|███▋ | 3708/10000 [49:55<1:52:16, 1.07s/it, loss=0.0188, lr=1.92e-05, step=3707] Training: 37%|███▋ | 3708/10000 [49:55<1:52:16, 1.07s/it, loss=0.0270, lr=1.92e-05, step=3708] Training: 37%|███▋ | 3709/10000 [49:56<2:10:11, 1.24s/it, loss=0.0270, lr=1.92e-05, step=3708] Training: 37%|███▋ | 3709/10000 [49:56<2:10:11, 1.24s/it, loss=0.0064, lr=1.92e-05, step=3709]19:34:29.723 [I] step=3710 loss=0.0077 smoothed_loss=0.0245 lr=1.93e-05 grad_norm=0.5707 step_time=0.7266s data_time=0.2047s it/s=1.074 eta_to_10000=5857.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0258 grad_action_out_proj_arms=0.1860 grad_arm_token_fuse=0.1269 grad_shared_expert=0.5295 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3710/10000 [49:57<2:08:12, 1.22s/it, loss=0.0064, lr=1.92e-05, step=3709] Training: 37%|███▋ | 3710/10000 [49:58<2:08:12, 1.22s/it, loss=0.0077, lr=1.92e-05, step=3710] Training: 37%|███▋ | 3711/10000 [49:58<1:59:47, 1.14s/it, loss=0.0077, lr=1.92e-05, step=3710] Training: 37%|███▋ | 3711/10000 [49:59<1:59:47, 1.14s/it, loss=0.0215, lr=1.92e-05, step=3711] Training: 37%|███▋ | 3712/10000 [50:00<2:06:54, 1.21s/it, loss=0.0215, lr=1.92e-05, step=3711] Training: 37%|███▋ | 3712/10000 [50:00<2:06:54, 1.21s/it, loss=0.0241, lr=1.92e-05, step=3712] Training: 37%|███▋ | 3713/10000 [50:01<2:10:49, 1.25s/it, loss=0.0241, lr=1.92e-05, step=3712] Training: 37%|███▋ | 3713/10000 [50:01<2:10:49, 1.25s/it, loss=0.0157, lr=1.92e-05, step=3713] Training: 37%|███▋ | 3714/10000 [50:03<2:15:10, 1.29s/it, loss=0.0157, lr=1.92e-05, step=3713] Training: 37%|███▋ | 3714/10000 [50:03<2:15:10, 1.29s/it, loss=0.0032, lr=1.92e-05, step=3714] Training: 37%|███▋ | 3715/10000 [50:04<2:20:37, 1.34s/it, loss=0.0032, lr=1.92e-05, step=3714] Training: 37%|███▋ | 3715/10000 [50:04<2:20:37, 1.34s/it, loss=0.0160, lr=1.92e-05, step=3715] Training: 37%|███▋ | 3716/10000 [50:06<2:36:46, 1.50s/it, loss=0.0160, lr=1.92e-05, step=3715] Training: 37%|███▋ | 3716/10000 [50:06<2:36:46, 1.50s/it, loss=0.0060, lr=1.92e-05, step=3716] Training: 37%|███▋ | 3717/10000 [50:07<2:34:48, 1.48s/it, loss=0.0060, lr=1.92e-05, step=3716] Training: 37%|███▋ | 3717/10000 [50:07<2:34:48, 1.48s/it, loss=0.0387, lr=1.92e-05, step=3717] Training: 37%|███▋ | 3718/10000 [50:11<3:30:00, 2.01s/it, loss=0.0387, lr=1.92e-05, step=3717] Training: 37%|███▋ | 3718/10000 [50:11<3:30:00, 2.01s/it, loss=0.0080, lr=1.92e-05, step=3718] Training: 37%|███▋ | 3719/10000 [50:13<3:58:56, 2.28s/it, loss=0.0080, lr=1.92e-05, step=3718] Training: 37%|███▋ | 3719/10000 [50:13<3:58:56, 2.28s/it, loss=0.0131, lr=1.92e-05, step=3719]19:34:47.575 [I] step=3720 loss=0.0140 smoothed_loss=0.0187 lr=1.92e-05 grad_norm=0.6655 step_time=0.9904s data_time=0.7949s it/s=0.563 eta_to_10000=11154.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.1457 grad_arm_token_fuse=0.0793 grad_shared_expert=0.4085 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3720/10000 [50:15<3:43:56, 2.14s/it, loss=0.0131, lr=1.92e-05, step=3719] Training: 37%|███▋ | 3720/10000 [50:15<3:43:56, 2.14s/it, loss=0.0140, lr=1.92e-05, step=3720] Training: 37%|███▋ | 3721/10000 [50:17<3:29:45, 2.00s/it, loss=0.0140, lr=1.92e-05, step=3720] Training: 37%|███▋ | 3721/10000 [50:17<3:29:45, 2.00s/it, loss=0.0132, lr=1.92e-05, step=3721] Training: 37%|███▋ | 3722/10000 [50:18<3:06:22, 1.78s/it, loss=0.0132, lr=1.92e-05, step=3721] Training: 37%|███▋ | 3722/10000 [50:18<3:06:22, 1.78s/it, loss=0.0059, lr=1.92e-05, step=3722] Training: 37%|███▋ | 3723/10000 [50:20<3:02:34, 1.75s/it, loss=0.0059, lr=1.92e-05, step=3722] Training: 37%|███▋ | 3723/10000 [50:20<3:02:34, 1.75s/it, loss=0.0217, lr=1.92e-05, step=3723] Training: 37%|███▋ | 3724/10000 [50:21<2:56:56, 1.69s/it, loss=0.0217, lr=1.92e-05, step=3723] Training: 37%|███▋ | 3724/10000 [50:21<2:56:56, 1.69s/it, loss=0.0190, lr=1.92e-05, step=3724] Training: 37%|███▋ | 3725/10000 [50:23<2:54:05, 1.66s/it, loss=0.0190, lr=1.92e-05, step=3724] Training: 37%|███▋ | 3725/10000 [50:23<2:54:05, 1.66s/it, loss=0.0225, lr=1.92e-05, step=3725] Training: 37%|███▋ | 3726/10000 [50:24<2:35:10, 1.48s/it, loss=0.0225, lr=1.92e-05, step=3725] Training: 37%|███▋ | 3726/10000 [50:24<2:35:10, 1.48s/it, loss=0.0108, lr=1.92e-05, step=3726] Training: 37%|███▋ | 3727/10000 [50:25<2:18:10, 1.32s/it, loss=0.0108, lr=1.92e-05, step=3726] Training: 37%|███▋ | 3727/10000 [50:25<2:18:10, 1.32s/it, loss=0.0122, lr=1.92e-05, step=3727] Training: 37%|███▋ | 3728/10000 [50:26<2:11:01, 1.25s/it, loss=0.0122, lr=1.92e-05, step=3727] Training: 37%|███▋ | 3728/10000 [50:26<2:11:01, 1.25s/it, loss=0.0065, lr=1.92e-05, step=3728] Training: 37%|███▋ | 3729/10000 [50:28<2:27:57, 1.42s/it, loss=0.0065, lr=1.92e-05, step=3728] Training: 37%|███▋ | 3729/10000 [50:28<2:27:57, 1.42s/it, loss=0.1190, lr=1.92e-05, step=3729]19:35:01.007 [I] step=3730 loss=0.0051 smoothed_loss=0.0240 lr=1.92e-05 grad_norm=0.5209 step_time=0.9184s data_time=0.4248s it/s=0.745 eta_to_10000=8412.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0096 grad_action_out_proj_arms=0.1329 grad_arm_token_fuse=0.0500 grad_shared_expert=0.3340 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3730/10000 [50:29<2:07:35, 1.22s/it, loss=0.1190, lr=1.92e-05, step=3729] Training: 37%|███▋ | 3730/10000 [50:29<2:07:35, 1.22s/it, loss=0.0051, lr=1.92e-05, step=3730] Training: 37%|███▋ | 3731/10000 [50:30<2:06:06, 1.21s/it, loss=0.0051, lr=1.92e-05, step=3730] Training: 37%|███▋ | 3731/10000 [50:30<2:06:06, 1.21s/it, loss=0.0094, lr=1.92e-05, step=3731] Training: 37%|███▋ | 3732/10000 [50:31<2:09:34, 1.24s/it, loss=0.0094, lr=1.92e-05, step=3731] Training: 37%|███▋ | 3732/10000 [50:31<2:09:34, 1.24s/it, loss=0.0242, lr=1.92e-05, step=3732] Training: 37%|███▋ | 3733/10000 [50:33<2:11:41, 1.26s/it, loss=0.0242, lr=1.92e-05, step=3732] Training: 37%|███▋ | 3733/10000 [50:33<2:11:41, 1.26s/it, loss=0.0161, lr=1.92e-05, step=3733] Training: 37%|███▋ | 3734/10000 [50:34<2:21:10, 1.35s/it, loss=0.0161, lr=1.92e-05, step=3733] Training: 37%|███▋ | 3734/10000 [50:34<2:21:10, 1.35s/it, loss=0.0111, lr=1.92e-05, step=3734] Training: 37%|███▋ | 3735/10000 [50:35<2:23:39, 1.38s/it, loss=0.0111, lr=1.92e-05, step=3734] Training: 37%|███▋ | 3735/10000 [50:36<2:23:39, 1.38s/it, loss=0.0186, lr=1.92e-05, step=3735] Training: 37%|███▋ | 3736/10000 [50:37<2:39:28, 1.53s/it, loss=0.0186, lr=1.92e-05, step=3735] Training: 37%|███▋ | 3736/10000 [50:37<2:39:28, 1.53s/it, loss=0.0118, lr=1.92e-05, step=3736] Training: 37%|███▋ | 3737/10000 [50:39<2:40:58, 1.54s/it, loss=0.0118, lr=1.92e-05, step=3736] Training: 37%|███▋ | 3737/10000 [50:39<2:40:58, 1.54s/it, loss=0.0161, lr=1.92e-05, step=3737] Training: 37%|███▋ | 3738/10000 [50:40<2:16:53, 1.31s/it, loss=0.0161, lr=1.92e-05, step=3737] Training: 37%|███▋ | 3738/10000 [50:40<2:16:53, 1.31s/it, loss=0.0146, lr=1.91e-05, step=3738] Training: 37%|███▋ | 3739/10000 [50:41<2:18:55, 1.33s/it, loss=0.0146, lr=1.91e-05, step=3738] Training: 37%|███▋ | 3739/10000 [50:41<2:18:55, 1.33s/it, loss=0.0162, lr=1.91e-05, step=3739]19:35:14.740 [I] step=3740 loss=0.0133 smoothed_loss=0.0182 lr=1.92e-05 grad_norm=0.4999 step_time=0.9420s data_time=0.4312s it/s=0.729 eta_to_10000=8582.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0089 grad_action_out_proj_arms=0.0976 grad_arm_token_fuse=0.0445 grad_shared_expert=0.3919 (18633:train_pytorch.py:850) + Training: 37%|███▋ | 3740/10000 [50:42<2:18:39, 1.33s/it, loss=0.0162, lr=1.91e-05, step=3739] Training: 37%|███▋ | 3740/10000 [50:42<2:18:39, 1.33s/it, loss=0.0133, lr=1.91e-05, step=3740] Training: 37%|███▋ | 3741/10000 [50:44<2:13:31, 1.28s/it, loss=0.0133, lr=1.91e-05, step=3740] Training: 37%|███▋ | 3741/10000 [50:44<2:13:31, 1.28s/it, loss=0.0277, lr=1.91e-05, step=3741] Training: 37%|███▋ | 3742/10000 [50:45<2:10:39, 1.25s/it, loss=0.0277, lr=1.91e-05, step=3741] Training: 37%|███▋ | 3742/10000 [50:45<2:10:39, 1.25s/it, loss=0.0083, lr=1.91e-05, step=3742] Training: 37%|███▋ | 3743/10000 [50:47<2:27:29, 1.41s/it, loss=0.0083, lr=1.91e-05, step=3742] Training: 37%|███▋ | 3743/10000 [50:47<2:27:29, 1.41s/it, loss=0.0382, lr=1.91e-05, step=3743] Training: 37%|███▋ | 3744/10000 [50:48<2:26:34, 1.41s/it, loss=0.0382, lr=1.91e-05, step=3743] Training: 37%|███▋ | 3744/10000 [50:48<2:26:34, 1.41s/it, loss=0.0135, lr=1.91e-05, step=3744] Training: 37%|███▋ | 3745/10000 [50:50<2:35:36, 1.49s/it, loss=0.0135, lr=1.91e-05, step=3744] Training: 37%|███▋ | 3745/10000 [50:50<2:35:36, 1.49s/it, loss=0.1055, lr=1.91e-05, step=3745] Training: 37%|███▋ | 3746/10000 [50:51<2:27:02, 1.41s/it, loss=0.1055, lr=1.91e-05, step=3745] Training: 37%|███▋ | 3746/10000 [50:51<2:27:02, 1.41s/it, loss=0.0088, lr=1.91e-05, step=3746] Training: 37%|███▋ | 3747/10000 [50:52<2:07:51, 1.23s/it, loss=0.0088, lr=1.91e-05, step=3746] Training: 37%|███▋ | 3747/10000 [50:52<2:07:51, 1.23s/it, loss=0.0179, lr=1.91e-05, step=3747] Training: 37%|███▋ | 3748/10000 [50:53<2:10:10, 1.25s/it, loss=0.0179, lr=1.91e-05, step=3747] Training: 37%|███▋ | 3748/10000 [50:53<2:10:10, 1.25s/it, loss=0.0091, lr=1.91e-05, step=3748] Training: 37%|███▋ | 3749/10000 [50:54<2:08:32, 1.23s/it, loss=0.0091, lr=1.91e-05, step=3748] Training: 37%|███▋ | 3749/10000 [50:54<2:08:32, 1.23s/it, loss=0.0221, lr=1.91e-05, step=3749]19:35:27.509 [I] step=3750 loss=0.0246 smoothed_loss=0.0236 lr=1.91e-05 grad_norm=0.4955 step_time=0.8424s data_time=0.4345s it/s=0.784 eta_to_10000=7968.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0140 grad_action_out_proj_arms=0.1117 grad_arm_token_fuse=0.0723 grad_shared_expert=0.3719 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3750/10000 [50:55<2:02:23, 1.18s/it, loss=0.0221, lr=1.91e-05, step=3749] Training: 38%|███▊ | 3750/10000 [50:55<2:02:23, 1.18s/it, loss=0.0246, lr=1.91e-05, step=3750] Training: 38%|███▊ | 3751/10000 [50:57<2:20:27, 1.35s/it, loss=0.0246, lr=1.91e-05, step=3750] Training: 38%|███▊ | 3751/10000 [50:57<2:20:27, 1.35s/it, loss=0.0123, lr=1.91e-05, step=3751] Training: 38%|███▊ | 3752/10000 [50:58<2:15:03, 1.30s/it, loss=0.0123, lr=1.91e-05, step=3751] Training: 38%|███▊ | 3752/10000 [50:58<2:15:03, 1.30s/it, loss=0.0127, lr=1.91e-05, step=3752] Training: 38%|███▊ | 3753/10000 [50:59<1:56:30, 1.12s/it, loss=0.0127, lr=1.91e-05, step=3752] Training: 38%|███▊ | 3753/10000 [50:59<1:56:30, 1.12s/it, loss=0.0668, lr=1.91e-05, step=3753] Training: 38%|███▊ | 3754/10000 [51:01<2:20:34, 1.35s/it, loss=0.0668, lr=1.91e-05, step=3753] Training: 38%|███▊ | 3754/10000 [51:01<2:20:34, 1.35s/it, loss=0.0050, lr=1.91e-05, step=3754] Training: 38%|███▊ | 3755/10000 [51:02<2:19:21, 1.34s/it, loss=0.0050, lr=1.91e-05, step=3754] Training: 38%|███▊ | 3755/10000 [51:02<2:19:21, 1.34s/it, loss=0.0320, lr=1.91e-05, step=3755] Training: 38%|███▊ | 3756/10000 [51:03<2:04:20, 1.19s/it, loss=0.0320, lr=1.91e-05, step=3755] Training: 38%|███▊ | 3756/10000 [51:03<2:04:20, 1.19s/it, loss=0.0052, lr=1.91e-05, step=3756] Training: 38%|███▊ | 3757/10000 [51:04<2:09:38, 1.25s/it, loss=0.0052, lr=1.91e-05, step=3756] Training: 38%|███▊ | 3757/10000 [51:04<2:09:38, 1.25s/it, loss=0.0441, lr=1.91e-05, step=3757] Training: 38%|███▊ | 3758/10000 [51:05<1:52:14, 1.08s/it, loss=0.0441, lr=1.91e-05, step=3757] Training: 38%|███▊ | 3758/10000 [51:05<1:52:14, 1.08s/it, loss=0.0055, lr=1.91e-05, step=3758] Training: 38%|███▊ | 3759/10000 [51:06<1:56:29, 1.12s/it, loss=0.0055, lr=1.91e-05, step=3758] Training: 38%|███▊ | 3759/10000 [51:06<1:56:29, 1.12s/it, loss=0.0059, lr=1.91e-05, step=3759]19:35:39.675 [I] step=3760 loss=0.0065 smoothed_loss=0.0198 lr=1.91e-05 grad_norm=0.5645 step_time=0.7735s data_time=0.4432s it/s=0.824 eta_to_10000=7571.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0260 grad_action_out_proj_arms=0.1407 grad_arm_token_fuse=0.1211 grad_shared_expert=0.7603 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3760/10000 [51:07<1:58:23, 1.14s/it, loss=0.0059, lr=1.91e-05, step=3759] Training: 38%|███▊ | 3760/10000 [51:07<1:58:23, 1.14s/it, loss=0.0065, lr=1.91e-05, step=3760] Training: 38%|███▊ | 3761/10000 [51:08<1:47:32, 1.03s/it, loss=0.0065, lr=1.91e-05, step=3760] Training: 38%|███▊ | 3761/10000 [51:08<1:47:32, 1.03s/it, loss=0.0128, lr=1.91e-05, step=3761] Training: 38%|███▊ | 3762/10000 [51:09<1:31:14, 1.14it/s, loss=0.0128, lr=1.91e-05, step=3761] Training: 38%|███▊ | 3762/10000 [51:09<1:31:14, 1.14it/s, loss=0.0106, lr=1.91e-05, step=3762] Training: 38%|███▊ | 3763/10000 [51:10<1:42:52, 1.01it/s, loss=0.0106, lr=1.91e-05, step=3762] Training: 38%|███▊ | 3763/10000 [51:10<1:42:52, 1.01it/s, loss=0.0189, lr=1.91e-05, step=3763] Training: 38%|███▊ | 3764/10000 [51:11<1:35:41, 1.09it/s, loss=0.0189, lr=1.91e-05, step=3763] Training: 38%|███▊ | 3764/10000 [51:11<1:35:41, 1.09it/s, loss=0.0115, lr=1.91e-05, step=3764] Training: 38%|███▊ | 3765/10000 [51:12<1:36:49, 1.07it/s, loss=0.0115, lr=1.91e-05, step=3764] Training: 38%|███▊ | 3765/10000 [51:12<1:36:49, 1.07it/s, loss=0.0044, lr=1.91e-05, step=3765] Training: 38%|███▊ | 3766/10000 [51:12<1:33:11, 1.11it/s, loss=0.0044, lr=1.91e-05, step=3765] Training: 38%|███▊ | 3766/10000 [51:12<1:33:11, 1.11it/s, loss=0.0191, lr=1.91e-05, step=3766] Training: 38%|███▊ | 3767/10000 [51:14<1:39:50, 1.04it/s, loss=0.0191, lr=1.91e-05, step=3766] Training: 38%|███▊ | 3767/10000 [51:14<1:39:50, 1.04it/s, loss=0.0154, lr=1.91e-05, step=3767] Training: 38%|███▊ | 3768/10000 [51:15<1:54:55, 1.11s/it, loss=0.0154, lr=1.91e-05, step=3767] Training: 38%|███▊ | 3768/10000 [51:15<1:54:55, 1.11s/it, loss=0.0338, lr=1.90e-05, step=3768] Training: 38%|███▊ | 3769/10000 [51:16<2:02:29, 1.18s/it, loss=0.0338, lr=1.90e-05, step=3768] Training: 38%|███▊ | 3769/10000 [51:16<2:02:29, 1.18s/it, loss=0.0155, lr=1.90e-05, step=3769]19:35:50.159 [I] step=3770 loss=0.0026 smoothed_loss=0.0164 lr=1.91e-05 grad_norm=0.4734 step_time=0.7571s data_time=0.2912s it/s=0.955 eta_to_10000=6523.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0108 grad_action_out_proj_arms=0.1022 grad_arm_token_fuse=0.0559 grad_shared_expert=0.4228 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3770/10000 [51:18<2:12:27, 1.28s/it, loss=0.0155, lr=1.90e-05, step=3769] Training: 38%|███▊ | 3770/10000 [51:18<2:12:27, 1.28s/it, loss=0.0026, lr=1.90e-05, step=3770] Training: 38%|███▊ | 3771/10000 [51:20<2:34:26, 1.49s/it, loss=0.0026, lr=1.90e-05, step=3770] Training: 38%|███▊ | 3771/10000 [51:20<2:34:26, 1.49s/it, loss=0.0167, lr=1.90e-05, step=3771] Training: 38%|███▊ | 3772/10000 [51:21<2:35:50, 1.50s/it, loss=0.0167, lr=1.90e-05, step=3771] Training: 38%|███▊ | 3772/10000 [51:21<2:35:50, 1.50s/it, loss=0.0079, lr=1.90e-05, step=3772] Training: 38%|███▊ | 3773/10000 [51:23<2:35:49, 1.50s/it, loss=0.0079, lr=1.90e-05, step=3772] Training: 38%|███▊ | 3773/10000 [51:23<2:35:49, 1.50s/it, loss=0.0149, lr=1.90e-05, step=3773] Training: 38%|███▊ | 3774/10000 [51:24<2:29:46, 1.44s/it, loss=0.0149, lr=1.90e-05, step=3773] Training: 38%|███▊ | 3774/10000 [51:24<2:29:46, 1.44s/it, loss=0.0095, lr=1.90e-05, step=3774] Training: 38%|███▊ | 3775/10000 [51:25<2:25:25, 1.40s/it, loss=0.0095, lr=1.90e-05, step=3774] Training: 38%|███▊ | 3775/10000 [51:25<2:25:25, 1.40s/it, loss=0.0033, lr=1.90e-05, step=3775] Training: 38%|███▊ | 3776/10000 [51:27<2:23:05, 1.38s/it, loss=0.0033, lr=1.90e-05, step=3775] Training: 38%|███▊ | 3776/10000 [51:27<2:23:05, 1.38s/it, loss=0.0061, lr=1.90e-05, step=3776] Training: 38%|███▊ | 3777/10000 [51:28<2:29:48, 1.44s/it, loss=0.0061, lr=1.90e-05, step=3776] Training: 38%|███▊ | 3777/10000 [51:28<2:29:48, 1.44s/it, loss=0.0053, lr=1.90e-05, step=3777] Training: 38%|███▊ | 3778/10000 [51:30<2:30:52, 1.45s/it, loss=0.0053, lr=1.90e-05, step=3777] Training: 38%|███▊ | 3778/10000 [51:30<2:30:52, 1.45s/it, loss=0.0130, lr=1.90e-05, step=3778] Training: 38%|███▊ | 3779/10000 [51:31<2:28:34, 1.43s/it, loss=0.0130, lr=1.90e-05, step=3778] Training: 38%|███▊ | 3779/10000 [51:31<2:28:34, 1.43s/it, loss=0.0197, lr=1.90e-05, step=3779]19:36:04.911 [I] step=3780 loss=0.0169 smoothed_loss=0.0134 lr=1.90e-05 grad_norm=0.4545 step_time=0.9552s data_time=0.5201s it/s=0.679 eta_to_10000=9161.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0162 grad_action_out_proj_arms=0.1345 grad_arm_token_fuse=0.0840 grad_shared_expert=0.5242 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3780/10000 [51:33<2:25:18, 1.40s/it, loss=0.0197, lr=1.90e-05, step=3779] Training: 38%|███▊ | 3780/10000 [51:33<2:25:18, 1.40s/it, loss=0.0169, lr=1.90e-05, step=3780] Training: 38%|███▊ | 3781/10000 [51:34<2:20:03, 1.35s/it, loss=0.0169, lr=1.90e-05, step=3780] Training: 38%|███▊ | 3781/10000 [51:34<2:20:03, 1.35s/it, loss=0.0735, lr=1.90e-05, step=3781] Training: 38%|███▊ | 3782/10000 [51:35<2:12:29, 1.28s/it, loss=0.0735, lr=1.90e-05, step=3781] Training: 38%|███▊ | 3782/10000 [51:35<2:12:29, 1.28s/it, loss=0.0168, lr=1.90e-05, step=3782] Training: 38%|███▊ | 3783/10000 [51:36<2:09:05, 1.25s/it, loss=0.0168, lr=1.90e-05, step=3782] Training: 38%|███▊ | 3783/10000 [51:36<2:09:05, 1.25s/it, loss=0.0419, lr=1.90e-05, step=3783] Training: 38%|███▊ | 3784/10000 [51:37<2:12:41, 1.28s/it, loss=0.0419, lr=1.90e-05, step=3783] Training: 38%|███▊ | 3784/10000 [51:37<2:12:41, 1.28s/it, loss=0.0026, lr=1.90e-05, step=3784] Training: 38%|███▊ | 3785/10000 [51:38<1:48:15, 1.05s/it, loss=0.0026, lr=1.90e-05, step=3784] Training: 38%|███▊ | 3785/10000 [51:38<1:48:15, 1.05s/it, loss=0.0085, lr=1.90e-05, step=3785] Training: 38%|███▊ | 3786/10000 [51:39<1:38:39, 1.05it/s, loss=0.0085, lr=1.90e-05, step=3785] Training: 38%|███▊ | 3786/10000 [51:39<1:38:39, 1.05it/s, loss=0.0191, lr=1.90e-05, step=3786] Training: 38%|███▊ | 3787/10000 [51:39<1:24:52, 1.22it/s, loss=0.0191, lr=1.90e-05, step=3786] Training: 38%|███▊ | 3787/10000 [51:39<1:24:52, 1.22it/s, loss=0.0126, lr=1.90e-05, step=3787] Training: 38%|███▊ | 3788/10000 [51:40<1:14:27, 1.39it/s, loss=0.0126, lr=1.90e-05, step=3787] Training: 38%|███▊ | 3788/10000 [51:40<1:14:27, 1.39it/s, loss=0.0037, lr=1.90e-05, step=3788] Training: 38%|███▊ | 3789/10000 [51:40<1:07:56, 1.52it/s, loss=0.0037, lr=1.90e-05, step=3788] Training: 38%|███▊ | 3789/10000 [51:40<1:07:56, 1.52it/s, loss=0.0179, lr=1.90e-05, step=3789]19:36:13.057 [I] step=3790 loss=0.0185 smoothed_loss=0.0168 lr=1.90e-05 grad_norm=0.5383 step_time=0.6096s data_time=0.2050s it/s=1.230 eta_to_10000=5050.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0147 grad_action_out_proj_arms=0.1406 grad_arm_token_fuse=0.0770 grad_shared_expert=0.6834 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3790/10000 [51:41<1:03:48, 1.62it/s, loss=0.0179, lr=1.90e-05, step=3789] Training: 38%|███▊ | 3790/10000 [51:41<1:03:48, 1.62it/s, loss=0.0185, lr=1.90e-05, step=3790] Training: 38%|███▊ | 3791/10000 [51:41<1:02:22, 1.66it/s, loss=0.0185, lr=1.90e-05, step=3790] Training: 38%|███▊ | 3791/10000 [51:41<1:02:22, 1.66it/s, loss=0.0085, lr=1.90e-05, step=3791] Training: 38%|███▊ | 3792/10000 [51:42<1:04:53, 1.59it/s, loss=0.0085, lr=1.90e-05, step=3791] Training: 38%|███▊ | 3792/10000 [51:42<1:04:53, 1.59it/s, loss=0.0080, lr=1.90e-05, step=3792] Training: 38%|███▊ | 3793/10000 [51:43<1:07:54, 1.52it/s, loss=0.0080, lr=1.90e-05, step=3792] Training: 38%|███▊ | 3793/10000 [51:43<1:07:54, 1.52it/s, loss=0.0050, lr=1.90e-05, step=3793] Training: 38%|███▊ | 3794/10000 [51:43<1:03:31, 1.63it/s, loss=0.0050, lr=1.90e-05, step=3793] Training: 38%|███▊ | 3794/10000 [51:43<1:03:31, 1.63it/s, loss=0.0097, lr=1.90e-05, step=3794] Training: 38%|███▊ | 3795/10000 [51:44<1:04:14, 1.61it/s, loss=0.0097, lr=1.90e-05, step=3794] Training: 38%|███▊ | 3795/10000 [51:44<1:04:14, 1.61it/s, loss=0.0073, lr=1.90e-05, step=3795] Training: 38%|███▊ | 3796/10000 [51:44<1:03:22, 1.63it/s, loss=0.0073, lr=1.90e-05, step=3795] Training: 38%|███▊ | 3796/10000 [51:44<1:03:22, 1.63it/s, loss=0.0055, lr=1.90e-05, step=3796] Training: 38%|███▊ | 3797/10000 [51:45<1:01:15, 1.69it/s, loss=0.0055, lr=1.90e-05, step=3796] Training: 38%|███▊ | 3797/10000 [51:45<1:01:15, 1.69it/s, loss=0.0056, lr=1.90e-05, step=3797] Training: 38%|███▊ | 3798/10000 [51:46<1:00:10, 1.72it/s, loss=0.0056, lr=1.90e-05, step=3797] Training: 38%|███▊ | 3798/10000 [51:46<1:00:10, 1.72it/s, loss=0.0144, lr=1.89e-05, step=3798] Training: 38%|███▊ | 3799/10000 [51:46<1:04:12, 1.61it/s, loss=0.0144, lr=1.89e-05, step=3798] Training: 38%|███▊ | 3799/10000 [51:46<1:04:12, 1.61it/s, loss=0.0185, lr=1.89e-05, step=3799]19:36:19.469 [I] step=3800 loss=0.0266 smoothed_loss=0.0140 lr=1.90e-05 grad_norm=0.4478 step_time=0.5606s data_time=0.0806s it/s=1.560 eta_to_10000=3975.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0173 grad_action_out_proj_arms=0.1533 grad_arm_token_fuse=0.0868 grad_shared_expert=0.7889 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3800/10000 [51:47<1:11:55, 1.44it/s, loss=0.0185, lr=1.89e-05, step=3799] Training: 38%|███▊ | 3800/10000 [51:47<1:11:55, 1.44it/s, loss=0.0266, lr=1.89e-05, step=3800] Training: 38%|███▊ | 3801/10000 [51:48<1:06:11, 1.56it/s, loss=0.0266, lr=1.89e-05, step=3800] Training: 38%|███▊ | 3801/10000 [51:48<1:06:11, 1.56it/s, loss=0.0168, lr=1.89e-05, step=3801] Training: 38%|███▊ | 3802/10000 [51:48<1:01:10, 1.69it/s, loss=0.0168, lr=1.89e-05, step=3801] Training: 38%|███▊ | 3802/10000 [51:48<1:01:10, 1.69it/s, loss=0.0018, lr=1.89e-05, step=3802] Training: 38%|███▊ | 3803/10000 [51:49<1:00:23, 1.71it/s, loss=0.0018, lr=1.89e-05, step=3802] Training: 38%|███▊ | 3803/10000 [51:49<1:00:23, 1.71it/s, loss=0.0163, lr=1.89e-05, step=3803] Training: 38%|███▊ | 3804/10000 [51:49<57:05, 1.81it/s, loss=0.0163, lr=1.89e-05, step=3803] Training: 38%|███▊ | 3804/10000 [51:49<57:05, 1.81it/s, loss=0.0236, lr=1.89e-05, step=3804] Training: 38%|███▊ | 3805/10000 [51:50<55:41, 1.85it/s, loss=0.0236, lr=1.89e-05, step=3804] Training: 38%|███▊ | 3805/10000 [51:50<55:41, 1.85it/s, loss=0.0238, lr=1.89e-05, step=3805] Training: 38%|███▊ | 3806/10000 [51:50<1:00:37, 1.70it/s, loss=0.0238, lr=1.89e-05, step=3805] Training: 38%|███▊ | 3806/10000 [51:50<1:00:37, 1.70it/s, loss=0.0107, lr=1.89e-05, step=3806] Training: 38%|███▊ | 3807/10000 [51:51<1:07:01, 1.54it/s, loss=0.0107, lr=1.89e-05, step=3806] Training: 38%|███▊ | 3807/10000 [51:51<1:07:01, 1.54it/s, loss=0.0342, lr=1.89e-05, step=3807] Training: 38%|███▊ | 3808/10000 [51:52<1:01:59, 1.66it/s, loss=0.0342, lr=1.89e-05, step=3807] Training: 38%|███▊ | 3808/10000 [51:52<1:01:59, 1.66it/s, loss=0.0057, lr=1.89e-05, step=3808] Training: 38%|███▊ | 3809/10000 [51:52<58:31, 1.76it/s, loss=0.0057, lr=1.89e-05, step=3808] Training: 38%|███▊ | 3809/10000 [51:52<58:31, 1.76it/s, loss=0.0073, lr=1.89e-05, step=3809]19:36:24.999 [I] step=3810 loss=0.0065 smoothed_loss=0.0140 lr=1.89e-05 grad_norm=0.4836 step_time=0.4808s data_time=0.0721s it/s=1.809 eta_to_10000=3421.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0120 grad_action_out_proj_arms=0.1148 grad_arm_token_fuse=0.0525 grad_shared_expert=0.3556 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3810/10000 [51:53<56:54, 1.81it/s, loss=0.0073, lr=1.89e-05, step=3809] Training: 38%|███▊ | 3810/10000 [51:53<56:54, 1.81it/s, loss=0.0065, lr=1.89e-05, step=3810] Training: 38%|███▊ | 3811/10000 [51:53<56:12, 1.84it/s, loss=0.0065, lr=1.89e-05, step=3810] Training: 38%|███▊ | 3811/10000 [51:53<56:12, 1.84it/s, loss=0.0127, lr=1.89e-05, step=3811] Training: 38%|███▊ | 3812/10000 [51:54<54:32, 1.89it/s, loss=0.0127, lr=1.89e-05, step=3811] Training: 38%|███▊ | 3812/10000 [51:54<54:32, 1.89it/s, loss=0.0238, lr=1.89e-05, step=3812] Training: 38%|███▊ | 3813/10000 [51:54<56:59, 1.81it/s, loss=0.0238, lr=1.89e-05, step=3812] Training: 38%|███▊ | 3813/10000 [51:54<56:59, 1.81it/s, loss=0.0303, lr=1.89e-05, step=3813] Training: 38%|███▊ | 3814/10000 [51:55<1:04:45, 1.59it/s, loss=0.0303, lr=1.89e-05, step=3813] Training: 38%|███▊ | 3814/10000 [51:55<1:04:45, 1.59it/s, loss=0.0233, lr=1.89e-05, step=3814] Training: 38%|███▊ | 3815/10000 [51:56<1:09:47, 1.48it/s, loss=0.0233, lr=1.89e-05, step=3814] Training: 38%|███▊ | 3815/10000 [51:56<1:09:47, 1.48it/s, loss=0.0070, lr=1.89e-05, step=3815] Training: 38%|███▊ | 3816/10000 [51:56<1:04:09, 1.61it/s, loss=0.0070, lr=1.89e-05, step=3815] Training: 38%|███▊ | 3816/10000 [51:56<1:04:09, 1.61it/s, loss=0.0301, lr=1.89e-05, step=3816] Training: 38%|███▊ | 3817/10000 [51:57<1:00:24, 1.71it/s, loss=0.0301, lr=1.89e-05, step=3816] Training: 38%|███▊ | 3817/10000 [51:57<1:00:24, 1.71it/s, loss=0.0130, lr=1.89e-05, step=3817] Training: 38%|███▊ | 3818/10000 [51:57<57:50, 1.78it/s, loss=0.0130, lr=1.89e-05, step=3817] Training: 38%|███▊ | 3818/10000 [51:57<57:50, 1.78it/s, loss=0.0287, lr=1.89e-05, step=3818] Training: 38%|███▊ | 3819/10000 [51:58<55:48, 1.85it/s, loss=0.0287, lr=1.89e-05, step=3818] Training: 38%|███▊ | 3819/10000 [51:58<55:48, 1.85it/s, loss=0.0179, lr=1.89e-05, step=3819]19:36:30.784 [I] step=3820 loss=0.0201 smoothed_loss=0.0184 lr=1.89e-05 grad_norm=0.5090 step_time=0.5108s data_time=0.0677s it/s=1.729 eta_to_10000=3574.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1181 grad_arm_token_fuse=0.0585 grad_shared_expert=0.3203 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3820/10000 [51:58<56:39, 1.82it/s, loss=0.0179, lr=1.89e-05, step=3819] Training: 38%|███▊ | 3820/10000 [51:58<56:39, 1.82it/s, loss=0.0201, lr=1.89e-05, step=3820] Training: 38%|███▊ | 3821/10000 [51:59<1:08:45, 1.50it/s, loss=0.0201, lr=1.89e-05, step=3820] Training: 38%|███▊ | 3821/10000 [51:59<1:08:45, 1.50it/s, loss=0.0048, lr=1.89e-05, step=3821] Training: 38%|███▊ | 3822/10000 [52:00<1:10:00, 1.47it/s, loss=0.0048, lr=1.89e-05, step=3821] Training: 38%|███▊ | 3822/10000 [52:00<1:10:00, 1.47it/s, loss=0.0400, lr=1.89e-05, step=3822] Training: 38%|███▊ | 3823/10000 [52:01<1:08:07, 1.51it/s, loss=0.0400, lr=1.89e-05, step=3822] Training: 38%|███▊ | 3823/10000 [52:01<1:08:07, 1.51it/s, loss=0.0112, lr=1.89e-05, step=3823] Training: 38%|███▊ | 3824/10000 [52:01<1:05:35, 1.57it/s, loss=0.0112, lr=1.89e-05, step=3823] Training: 38%|███▊ | 3824/10000 [52:01<1:05:35, 1.57it/s, loss=0.0422, lr=1.89e-05, step=3824] Training: 38%|███▊ | 3825/10000 [52:02<1:07:32, 1.52it/s, loss=0.0422, lr=1.89e-05, step=3824] Training: 38%|███▊ | 3825/10000 [52:02<1:07:32, 1.52it/s, loss=0.0122, lr=1.89e-05, step=3825] Training: 38%|███▊ | 3826/10000 [52:03<1:05:22, 1.57it/s, loss=0.0122, lr=1.89e-05, step=3825] Training: 38%|███▊ | 3826/10000 [52:03<1:05:22, 1.57it/s, loss=0.0217, lr=1.89e-05, step=3826] Training: 38%|███▊ | 3827/10000 [52:03<1:10:58, 1.45it/s, loss=0.0217, lr=1.89e-05, step=3826] Training: 38%|███▊ | 3827/10000 [52:03<1:10:58, 1.45it/s, loss=0.0142, lr=1.89e-05, step=3827] Training: 38%|███▊ | 3828/10000 [52:04<1:13:45, 1.39it/s, loss=0.0142, lr=1.89e-05, step=3827] Training: 38%|███▊ | 3828/10000 [52:04<1:13:45, 1.39it/s, loss=0.0049, lr=1.89e-05, step=3828] Training: 38%|███▊ | 3829/10000 [52:05<1:16:44, 1.34it/s, loss=0.0049, lr=1.89e-05, step=3828] Training: 38%|███▊ | 3829/10000 [52:05<1:16:44, 1.34it/s, loss=0.0055, lr=1.88e-05, step=3829]19:36:37.891 [I] step=3830 loss=0.0415 smoothed_loss=0.0193 lr=1.89e-05 grad_norm=0.5542 step_time=0.5902s data_time=0.1205s it/s=1.408 eta_to_10000=4381.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0289 grad_action_out_proj_arms=0.1853 grad_arm_token_fuse=0.1542 grad_shared_expert=0.7543 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3830/10000 [52:06<1:10:47, 1.45it/s, loss=0.0055, lr=1.88e-05, step=3829] Training: 38%|███▊ | 3830/10000 [52:06<1:10:47, 1.45it/s, loss=0.0415, lr=1.88e-05, step=3830] Training: 38%|███▊ | 3831/10000 [52:06<1:11:04, 1.45it/s, loss=0.0415, lr=1.88e-05, step=3830] Training: 38%|███▊ | 3831/10000 [52:06<1:11:04, 1.45it/s, loss=0.0075, lr=1.88e-05, step=3831] Training: 38%|███▊ | 3832/10000 [52:07<1:06:58, 1.54it/s, loss=0.0075, lr=1.88e-05, step=3831] Training: 38%|███▊ | 3832/10000 [52:07<1:06:58, 1.54it/s, loss=0.0269, lr=1.88e-05, step=3832] Training: 38%|███▊ | 3833/10000 [52:07<1:03:52, 1.61it/s, loss=0.0269, lr=1.88e-05, step=3832] Training: 38%|███▊ | 3833/10000 [52:07<1:03:52, 1.61it/s, loss=0.0163, lr=1.88e-05, step=3833] Training: 38%|███▊ | 3834/10000 [52:08<1:03:12, 1.63it/s, loss=0.0163, lr=1.88e-05, step=3833] Training: 38%|███▊ | 3834/10000 [52:08<1:03:12, 1.63it/s, loss=0.0461, lr=1.88e-05, step=3834] Training: 38%|███▊ | 3835/10000 [52:09<1:00:35, 1.70it/s, loss=0.0461, lr=1.88e-05, step=3834] Training: 38%|███▊ | 3835/10000 [52:09<1:00:35, 1.70it/s, loss=0.0030, lr=1.88e-05, step=3835] Training: 38%|███▊ | 3836/10000 [52:09<1:08:56, 1.49it/s, loss=0.0030, lr=1.88e-05, step=3835] Training: 38%|███▊ | 3836/10000 [52:09<1:08:56, 1.49it/s, loss=0.0187, lr=1.88e-05, step=3836] Training: 38%|███▊ | 3837/10000 [52:10<1:10:05, 1.47it/s, loss=0.0187, lr=1.88e-05, step=3836] Training: 38%|███▊ | 3837/10000 [52:10<1:10:05, 1.47it/s, loss=0.0714, lr=1.88e-05, step=3837] Training: 38%|███▊ | 3838/10000 [52:11<1:07:09, 1.53it/s, loss=0.0714, lr=1.88e-05, step=3837] Training: 38%|███▊ | 3838/10000 [52:11<1:07:09, 1.53it/s, loss=0.0104, lr=1.88e-05, step=3838] Training: 38%|███▊ | 3839/10000 [52:11<1:06:46, 1.54it/s, loss=0.0104, lr=1.88e-05, step=3838] Training: 38%|███▊ | 3839/10000 [52:11<1:06:46, 1.54it/s, loss=0.0361, lr=1.88e-05, step=3839]19:36:44.179 [I] step=3840 loss=0.0143 smoothed_loss=0.0235 lr=1.88e-05 grad_norm=0.4907 step_time=0.5287s data_time=0.1002s it/s=1.591 eta_to_10000=3873.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.1110 grad_arm_token_fuse=0.0551 grad_shared_expert=0.3751 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3840/10000 [52:12<1:03:56, 1.61it/s, loss=0.0361, lr=1.88e-05, step=3839] Training: 38%|███▊ | 3840/10000 [52:12<1:03:56, 1.61it/s, loss=0.0143, lr=1.88e-05, step=3840] Training: 38%|███▊ | 3841/10000 [52:12<1:01:52, 1.66it/s, loss=0.0143, lr=1.88e-05, step=3840] Training: 38%|███▊ | 3841/10000 [52:12<1:01:52, 1.66it/s, loss=0.0070, lr=1.88e-05, step=3841] Training: 38%|███▊ | 3842/10000 [52:13<1:02:06, 1.65it/s, loss=0.0070, lr=1.88e-05, step=3841] Training: 38%|███▊ | 3842/10000 [52:13<1:02:06, 1.65it/s, loss=0.0126, lr=1.88e-05, step=3842] Training: 38%|███▊ | 3843/10000 [52:14<1:06:44, 1.54it/s, loss=0.0126, lr=1.88e-05, step=3842] Training: 38%|███▊ | 3843/10000 [52:14<1:06:44, 1.54it/s, loss=0.0065, lr=1.88e-05, step=3843] Training: 38%|███▊ | 3844/10000 [52:15<1:09:40, 1.47it/s, loss=0.0065, lr=1.88e-05, step=3843] Training: 38%|███▊ | 3844/10000 [52:15<1:09:40, 1.47it/s, loss=0.0126, lr=1.88e-05, step=3844] Training: 38%|███▊ | 3845/10000 [52:15<1:09:39, 1.47it/s, loss=0.0126, lr=1.88e-05, step=3844] Training: 38%|███▊ | 3845/10000 [52:15<1:09:39, 1.47it/s, loss=0.0190, lr=1.88e-05, step=3845] Training: 38%|███▊ | 3846/10000 [52:16<1:04:02, 1.60it/s, loss=0.0190, lr=1.88e-05, step=3845] Training: 38%|███▊ | 3846/10000 [52:16<1:04:02, 1.60it/s, loss=0.0052, lr=1.88e-05, step=3846] Training: 38%|███▊ | 3847/10000 [52:16<59:50, 1.71it/s, loss=0.0052, lr=1.88e-05, step=3846] Training: 38%|███▊ | 3847/10000 [52:16<59:50, 1.71it/s, loss=0.0033, lr=1.88e-05, step=3847] Training: 38%|███▊ | 3848/10000 [52:17<1:02:04, 1.65it/s, loss=0.0033, lr=1.88e-05, step=3847] Training: 38%|███▊ | 3848/10000 [52:17<1:02:04, 1.65it/s, loss=0.0125, lr=1.88e-05, step=3848] Training: 38%|███▊ | 3849/10000 [52:17<1:00:46, 1.69it/s, loss=0.0125, lr=1.88e-05, step=3848] Training: 38%|███▊ | 3849/10000 [52:17<1:00:46, 1.69it/s, loss=0.0091, lr=1.88e-05, step=3849]19:36:50.635 [I] step=3850 loss=0.0075 smoothed_loss=0.0143 lr=1.88e-05 grad_norm=0.5058 step_time=0.5435s data_time=0.1022s it/s=1.551 eta_to_10000=3965.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0164 grad_action_out_proj_arms=0.1500 grad_arm_token_fuse=0.0782 grad_shared_expert=0.4467 (18633:train_pytorch.py:850) + Training: 38%|███▊ | 3850/10000 [52:18<1:10:08, 1.46it/s, loss=0.0091, lr=1.88e-05, step=3849] Training: 38%|███▊ | 3850/10000 [52:18<1:10:08, 1.46it/s, loss=0.0075, lr=1.88e-05, step=3850] Training: 39%|███▊ | 3851/10000 [52:19<1:10:28, 1.45it/s, loss=0.0075, lr=1.88e-05, step=3850] Training: 39%|███▊ | 3851/10000 [52:19<1:10:28, 1.45it/s, loss=0.0184, lr=1.88e-05, step=3851] Training: 39%|███▊ | 3852/10000 [52:20<1:16:55, 1.33it/s, loss=0.0184, lr=1.88e-05, step=3851] Training: 39%|███▊ | 3852/10000 [52:20<1:16:55, 1.33it/s, loss=0.0122, lr=1.88e-05, step=3852] Training: 39%|███▊ | 3853/10000 [52:20<1:11:52, 1.43it/s, loss=0.0122, lr=1.88e-05, step=3852] Training: 39%|███▊ | 3853/10000 [52:20<1:11:52, 1.43it/s, loss=0.0432, lr=1.88e-05, step=3853] Training: 39%|███▊ | 3854/10000 [52:21<1:13:03, 1.40it/s, loss=0.0432, lr=1.88e-05, step=3853] Training: 39%|███▊ | 3854/10000 [52:21<1:13:03, 1.40it/s, loss=0.0246, lr=1.88e-05, step=3854] Training: 39%|███▊ | 3855/10000 [52:22<1:12:18, 1.42it/s, loss=0.0246, lr=1.88e-05, step=3854] Training: 39%|███▊ | 3855/10000 [52:22<1:12:18, 1.42it/s, loss=0.0112, lr=1.88e-05, step=3855] Training: 39%|███▊ | 3856/10000 [52:23<1:12:15, 1.42it/s, loss=0.0112, lr=1.88e-05, step=3855] Training: 39%|███▊ | 3856/10000 [52:23<1:12:15, 1.42it/s, loss=0.0401, lr=1.88e-05, step=3856] Training: 39%|███▊ | 3857/10000 [52:23<1:16:51, 1.33it/s, loss=0.0401, lr=1.88e-05, step=3856] Training: 39%|███▊ | 3857/10000 [52:23<1:16:51, 1.33it/s, loss=0.0220, lr=1.88e-05, step=3857] Training: 39%|███▊ | 3858/10000 [52:24<1:14:35, 1.37it/s, loss=0.0220, lr=1.88e-05, step=3857] Training: 39%|███▊ | 3858/10000 [52:24<1:14:35, 1.37it/s, loss=0.0681, lr=1.88e-05, step=3858] Training: 39%|███▊ | 3859/10000 [52:25<1:15:58, 1.35it/s, loss=0.0681, lr=1.88e-05, step=3858] Training: 39%|███▊ | 3859/10000 [52:25<1:15:58, 1.35it/s, loss=0.0087, lr=1.87e-05, step=3859]19:36:57.843 [I] step=3860 loss=0.0179 smoothed_loss=0.0226 lr=1.88e-05 grad_norm=0.5094 step_time=0.5709s data_time=0.1499s it/s=1.388 eta_to_10000=4424.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0184 grad_action_out_proj_arms=0.1243 grad_arm_token_fuse=0.0951 grad_shared_expert=0.4017 (18633:train_pytorch.py:850) + Training: 39%|███▊ | 3860/10000 [52:26<1:11:07, 1.44it/s, loss=0.0087, lr=1.87e-05, step=3859] Training: 39%|███▊ | 3860/10000 [52:26<1:11:07, 1.44it/s, loss=0.0179, lr=1.87e-05, step=3860] Training: 39%|███▊ | 3861/10000 [52:26<1:07:04, 1.53it/s, loss=0.0179, lr=1.87e-05, step=3860] Training: 39%|███▊ | 3861/10000 [52:26<1:07:04, 1.53it/s, loss=0.0118, lr=1.87e-05, step=3861] Training: 39%|███▊ | 3862/10000 [52:27<1:07:17, 1.52it/s, loss=0.0118, lr=1.87e-05, step=3861] Training: 39%|███▊ | 3862/10000 [52:27<1:07:17, 1.52it/s, loss=0.0427, lr=1.87e-05, step=3862] Training: 39%|███▊ | 3863/10000 [52:28<1:10:30, 1.45it/s, loss=0.0427, lr=1.87e-05, step=3862] Training: 39%|███▊ | 3863/10000 [52:28<1:10:30, 1.45it/s, loss=0.0088, lr=1.87e-05, step=3863] Training: 39%|███▊ | 3864/10000 [52:28<1:16:41, 1.33it/s, loss=0.0088, lr=1.87e-05, step=3863] Training: 39%|███▊ | 3864/10000 [52:28<1:16:41, 1.33it/s, loss=0.0151, lr=1.87e-05, step=3864] Training: 39%|███▊ | 3865/10000 [52:29<1:14:45, 1.37it/s, loss=0.0151, lr=1.87e-05, step=3864] Training: 39%|███▊ | 3865/10000 [52:29<1:14:45, 1.37it/s, loss=0.0137, lr=1.87e-05, step=3865] Training: 39%|███▊ | 3866/10000 [52:30<1:16:59, 1.33it/s, loss=0.0137, lr=1.87e-05, step=3865] Training: 39%|███▊ | 3866/10000 [52:30<1:16:59, 1.33it/s, loss=0.0069, lr=1.87e-05, step=3866] Training: 39%|███▊ | 3867/10000 [52:30<1:09:12, 1.48it/s, loss=0.0069, lr=1.87e-05, step=3866] Training: 39%|███▊ | 3867/10000 [52:30<1:09:12, 1.48it/s, loss=0.0185, lr=1.87e-05, step=3867] Training: 39%|███▊ | 3868/10000 [52:31<1:04:18, 1.59it/s, loss=0.0185, lr=1.87e-05, step=3867] Training: 39%|███▊ | 3868/10000 [52:31<1:04:18, 1.59it/s, loss=0.0131, lr=1.87e-05, step=3868] Training: 39%|███▊ | 3869/10000 [52:32<1:07:48, 1.51it/s, loss=0.0131, lr=1.87e-05, step=3868] Training: 39%|███▊ | 3869/10000 [52:32<1:07:48, 1.51it/s, loss=0.0140, lr=1.87e-05, step=3869]19:37:04.685 [I] step=3870 loss=0.0205 smoothed_loss=0.0184 lr=1.87e-05 grad_norm=0.4762 step_time=0.5770s data_time=0.1072s it/s=1.462 eta_to_10000=4193.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0130 grad_action_out_proj_arms=0.0978 grad_arm_token_fuse=0.0673 grad_shared_expert=0.4341 (18633:train_pytorch.py:850) + Training: 39%|███▊ | 3870/10000 [52:32<1:09:21, 1.47it/s, loss=0.0140, lr=1.87e-05, step=3869] Training: 39%|███▊ | 3870/10000 [52:32<1:09:21, 1.47it/s, loss=0.0205, lr=1.87e-05, step=3870] Training: 39%|███▊ | 3871/10000 [52:33<1:13:38, 1.39it/s, loss=0.0205, lr=1.87e-05, step=3870] Training: 39%|███▊ | 3871/10000 [52:33<1:13:38, 1.39it/s, loss=0.0145, lr=1.87e-05, step=3871] Training: 39%|███▊ | 3872/10000 [52:34<1:14:27, 1.37it/s, loss=0.0145, lr=1.87e-05, step=3871] Training: 39%|███▊ | 3872/10000 [52:34<1:14:27, 1.37it/s, loss=0.0054, lr=1.87e-05, step=3872] Training: 39%|███▊ | 3873/10000 [52:35<1:13:56, 1.38it/s, loss=0.0054, lr=1.87e-05, step=3872] Training: 39%|███▊ | 3873/10000 [52:35<1:13:56, 1.38it/s, loss=0.0324, lr=1.87e-05, step=3873] Training: 39%|███▊ | 3874/10000 [52:36<1:19:10, 1.29it/s, loss=0.0324, lr=1.87e-05, step=3873] Training: 39%|███▊ | 3874/10000 [52:36<1:19:10, 1.29it/s, loss=0.0096, lr=1.87e-05, step=3874] Training: 39%|███▉ | 3875/10000 [52:36<1:11:55, 1.42it/s, loss=0.0096, lr=1.87e-05, step=3874] Training: 39%|███▉ | 3875/10000 [52:36<1:11:55, 1.42it/s, loss=0.0064, lr=1.87e-05, step=3875] Training: 39%|███▉ | 3876/10000 [52:37<1:09:29, 1.47it/s, loss=0.0064, lr=1.87e-05, step=3875] Training: 39%|███▉ | 3876/10000 [52:37<1:09:29, 1.47it/s, loss=0.0466, lr=1.87e-05, step=3876] Training: 39%|███▉ | 3877/10000 [52:37<1:04:30, 1.58it/s, loss=0.0466, lr=1.87e-05, step=3876] Training: 39%|███▉ | 3877/10000 [52:37<1:04:30, 1.58it/s, loss=0.0094, lr=1.87e-05, step=3877] Training: 39%|███▉ | 3878/10000 [52:38<1:12:43, 1.40it/s, loss=0.0094, lr=1.87e-05, step=3877] Training: 39%|███▉ | 3878/10000 [52:38<1:12:43, 1.40it/s, loss=0.0223, lr=1.87e-05, step=3878] Training: 39%|███▉ | 3879/10000 [52:39<1:13:46, 1.38it/s, loss=0.0223, lr=1.87e-05, step=3878] Training: 39%|███▉ | 3879/10000 [52:39<1:13:46, 1.38it/s, loss=0.0093, lr=1.87e-05, step=3879]19:37:11.768 [I] step=3880 loss=0.0656 smoothed_loss=0.0226 lr=1.87e-05 grad_norm=0.4720 step_time=0.6031s data_time=0.1052s it/s=1.413 eta_to_10000=4329.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0182 grad_action_out_proj_arms=0.2368 grad_arm_token_fuse=0.0961 grad_shared_expert=0.6105 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3880/10000 [52:39<1:09:09, 1.47it/s, loss=0.0093, lr=1.87e-05, step=3879] Training: 39%|███▉ | 3880/10000 [52:39<1:09:09, 1.47it/s, loss=0.0656, lr=1.87e-05, step=3880] Training: 39%|███▉ | 3881/10000 [52:40<1:13:56, 1.38it/s, loss=0.0656, lr=1.87e-05, step=3880] Training: 39%|███▉ | 3881/10000 [52:40<1:13:56, 1.38it/s, loss=0.0246, lr=1.87e-05, step=3881] Training: 39%|███▉ | 3882/10000 [52:41<1:09:57, 1.46it/s, loss=0.0246, lr=1.87e-05, step=3881] Training: 39%|███▉ | 3882/10000 [52:41<1:09:57, 1.46it/s, loss=0.0212, lr=1.87e-05, step=3882] Training: 39%|███▉ | 3883/10000 [52:42<1:10:18, 1.45it/s, loss=0.0212, lr=1.87e-05, step=3882] Training: 39%|███▉ | 3883/10000 [52:42<1:10:18, 1.45it/s, loss=0.0055, lr=1.87e-05, step=3883] Training: 39%|███▉ | 3884/10000 [52:42<1:05:20, 1.56it/s, loss=0.0055, lr=1.87e-05, step=3883] Training: 39%|███▉ | 3884/10000 [52:42<1:05:20, 1.56it/s, loss=0.0114, lr=1.87e-05, step=3884] Training: 39%|███▉ | 3885/10000 [52:43<1:05:49, 1.55it/s, loss=0.0114, lr=1.87e-05, step=3884] Training: 39%|███▉ | 3885/10000 [52:43<1:05:49, 1.55it/s, loss=0.0108, lr=1.87e-05, step=3885] Training: 39%|███▉ | 3886/10000 [52:44<1:09:04, 1.48it/s, loss=0.0108, lr=1.87e-05, step=3885] Training: 39%|███▉ | 3886/10000 [52:44<1:09:04, 1.48it/s, loss=0.0080, lr=1.87e-05, step=3886] Training: 39%|███▉ | 3887/10000 [52:44<1:07:42, 1.50it/s, loss=0.0080, lr=1.87e-05, step=3886] Training: 39%|███▉ | 3887/10000 [52:44<1:07:42, 1.50it/s, loss=0.0112, lr=1.87e-05, step=3887] Training: 39%|███▉ | 3888/10000 [52:45<1:04:54, 1.57it/s, loss=0.0112, lr=1.87e-05, step=3887] Training: 39%|███▉ | 3888/10000 [52:45<1:04:54, 1.57it/s, loss=0.0186, lr=1.87e-05, step=3888] Training: 39%|███▉ | 3889/10000 [52:45<1:09:14, 1.47it/s, loss=0.0186, lr=1.87e-05, step=3888] Training: 39%|███▉ | 3889/10000 [52:45<1:09:14, 1.47it/s, loss=0.0037, lr=1.86e-05, step=3889]19:37:18.377 [I] step=3890 loss=0.0123 smoothed_loss=0.0157 lr=1.87e-05 grad_norm=0.4578 step_time=0.5626s data_time=0.0984s it/s=1.514 eta_to_10000=4036.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.0944 grad_arm_token_fuse=0.0525 grad_shared_expert=0.3850 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3890/10000 [52:46<1:05:24, 1.56it/s, loss=0.0037, lr=1.86e-05, step=3889] Training: 39%|███▉ | 3890/10000 [52:46<1:05:24, 1.56it/s, loss=0.0123, lr=1.86e-05, step=3890] Training: 39%|███▉ | 3891/10000 [52:47<1:02:58, 1.62it/s, loss=0.0123, lr=1.86e-05, step=3890] Training: 39%|███▉ | 3891/10000 [52:47<1:02:58, 1.62it/s, loss=0.0216, lr=1.86e-05, step=3891] Training: 39%|███▉ | 3892/10000 [52:47<59:21, 1.71it/s, loss=0.0216, lr=1.86e-05, step=3891] Training: 39%|███▉ | 3892/10000 [52:47<59:21, 1.71it/s, loss=0.0199, lr=1.86e-05, step=3892] Training: 39%|███▉ | 3893/10000 [52:48<1:05:14, 1.56it/s, loss=0.0199, lr=1.86e-05, step=3892] Training: 39%|███▉ | 3893/10000 [52:48<1:05:14, 1.56it/s, loss=0.0168, lr=1.86e-05, step=3893] Training: 39%|███▉ | 3894/10000 [52:48<1:03:07, 1.61it/s, loss=0.0168, lr=1.86e-05, step=3893] Training: 39%|███▉ | 3894/10000 [52:48<1:03:07, 1.61it/s, loss=0.0376, lr=1.86e-05, step=3894] Training: 39%|███▉ | 3895/10000 [52:49<1:00:01, 1.70it/s, loss=0.0376, lr=1.86e-05, step=3894] Training: 39%|███▉ | 3895/10000 [52:49<1:00:01, 1.70it/s, loss=0.0042, lr=1.86e-05, step=3895] Training: 39%|███▉ | 3896/10000 [52:49<57:36, 1.77it/s, loss=0.0042, lr=1.86e-05, step=3895] Training: 39%|███▉ | 3896/10000 [52:49<57:36, 1.77it/s, loss=0.0471, lr=1.86e-05, step=3896] Training: 39%|███▉ | 3897/10000 [52:50<1:01:42, 1.65it/s, loss=0.0471, lr=1.86e-05, step=3896] Training: 39%|███▉ | 3897/10000 [52:50<1:01:42, 1.65it/s, loss=0.0270, lr=1.86e-05, step=3897] Training: 39%|███▉ | 3898/10000 [52:51<59:46, 1.70it/s, loss=0.0270, lr=1.86e-05, step=3897] Training: 39%|███▉ | 3898/10000 [52:51<59:46, 1.70it/s, loss=0.0203, lr=1.86e-05, step=3898] Training: 39%|███▉ | 3899/10000 [52:51<1:01:26, 1.65it/s, loss=0.0203, lr=1.86e-05, step=3898] Training: 39%|███▉ | 3899/10000 [52:51<1:01:26, 1.65it/s, loss=0.0088, lr=1.86e-05, step=3899]19:37:24.467 [I] step=3900 loss=0.0089 smoothed_loss=0.0186 lr=1.86e-05 grad_norm=0.4887 step_time=0.5243s data_time=0.0846s it/s=1.642 eta_to_10000=3714.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0065 grad_action_out_proj_arms=0.1095 grad_arm_token_fuse=0.0345 grad_shared_expert=0.3134 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3900/10000 [52:52<1:06:23, 1.53it/s, loss=0.0088, lr=1.86e-05, step=3899] Training: 39%|███▉ | 3900/10000 [52:52<1:06:23, 1.53it/s, loss=0.0089, lr=1.86e-05, step=3900] Training: 39%|███▉ | 3901/10000 [52:53<1:04:23, 1.58it/s, loss=0.0089, lr=1.86e-05, step=3900] Training: 39%|███▉ | 3901/10000 [52:53<1:04:23, 1.58it/s, loss=0.0160, lr=1.86e-05, step=3901] Training: 39%|███▉ | 3902/10000 [52:53<1:02:27, 1.63it/s, loss=0.0160, lr=1.86e-05, step=3901] Training: 39%|███▉ | 3902/10000 [52:53<1:02:27, 1.63it/s, loss=0.0041, lr=1.86e-05, step=3902] Training: 39%|███▉ | 3903/10000 [52:54<1:04:54, 1.57it/s, loss=0.0041, lr=1.86e-05, step=3902] Training: 39%|███▉ | 3903/10000 [52:54<1:04:54, 1.57it/s, loss=0.0332, lr=1.86e-05, step=3903] Training: 39%|███▉ | 3904/10000 [52:55<1:07:59, 1.49it/s, loss=0.0332, lr=1.86e-05, step=3903] Training: 39%|███▉ | 3904/10000 [52:55<1:07:59, 1.49it/s, loss=0.0210, lr=1.86e-05, step=3904] Training: 39%|███▉ | 3905/10000 [52:55<1:05:56, 1.54it/s, loss=0.0210, lr=1.86e-05, step=3904] Training: 39%|███▉ | 3905/10000 [52:55<1:05:56, 1.54it/s, loss=0.0049, lr=1.86e-05, step=3905] Training: 39%|███▉ | 3906/10000 [52:56<1:09:43, 1.46it/s, loss=0.0049, lr=1.86e-05, step=3905] Training: 39%|███▉ | 3906/10000 [52:56<1:09:43, 1.46it/s, loss=0.0077, lr=1.86e-05, step=3906] Training: 39%|███▉ | 3907/10000 [52:57<1:12:04, 1.41it/s, loss=0.0077, lr=1.86e-05, step=3906] Training: 39%|███▉ | 3907/10000 [52:57<1:12:04, 1.41it/s, loss=0.0175, lr=1.86e-05, step=3907] Training: 39%|███▉ | 3908/10000 [52:57<1:05:30, 1.55it/s, loss=0.0175, lr=1.86e-05, step=3907] Training: 39%|███▉ | 3908/10000 [52:57<1:05:30, 1.55it/s, loss=0.0363, lr=1.86e-05, step=3908] Training: 39%|███▉ | 3909/10000 [52:58<1:01:26, 1.65it/s, loss=0.0363, lr=1.86e-05, step=3908] Training: 39%|███▉ | 3909/10000 [52:58<1:01:26, 1.65it/s, loss=0.0123, lr=1.86e-05, step=3909]19:37:30.740 [I] step=3910 loss=0.0130 smoothed_loss=0.0174 lr=1.86e-05 grad_norm=0.5049 step_time=0.5261s data_time=0.1012s it/s=1.596 eta_to_10000=3816.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1621 grad_arm_token_fuse=0.0800 grad_shared_expert=0.5360 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3910/10000 [52:58<59:02, 1.72it/s, loss=0.0123, lr=1.86e-05, step=3909] Training: 39%|███▉ | 3910/10000 [52:58<59:02, 1.72it/s, loss=0.0130, lr=1.86e-05, step=3910] Training: 39%|███▉ | 3911/10000 [52:59<1:00:12, 1.69it/s, loss=0.0130, lr=1.86e-05, step=3910] Training: 39%|███▉ | 3911/10000 [52:59<1:00:12, 1.69it/s, loss=0.0129, lr=1.86e-05, step=3911] Training: 39%|███▉ | 3912/10000 [53:00<1:03:50, 1.59it/s, loss=0.0129, lr=1.86e-05, step=3911] Training: 39%|███▉ | 3912/10000 [53:00<1:03:50, 1.59it/s, loss=0.0178, lr=1.86e-05, step=3912] Training: 39%|███▉ | 3913/10000 [53:00<1:00:04, 1.69it/s, loss=0.0178, lr=1.86e-05, step=3912] Training: 39%|███▉ | 3913/10000 [53:00<1:00:04, 1.69it/s, loss=0.0078, lr=1.86e-05, step=3913] Training: 39%|███▉ | 3914/10000 [53:01<1:03:55, 1.59it/s, loss=0.0078, lr=1.86e-05, step=3913] Training: 39%|███▉ | 3914/10000 [53:01<1:03:55, 1.59it/s, loss=0.0301, lr=1.86e-05, step=3914] Training: 39%|███▉ | 3915/10000 [53:02<1:06:27, 1.53it/s, loss=0.0301, lr=1.86e-05, step=3914] Training: 39%|███▉ | 3915/10000 [53:02<1:06:27, 1.53it/s, loss=0.0721, lr=1.86e-05, step=3915] Training: 39%|███▉ | 3916/10000 [53:02<1:01:38, 1.65it/s, loss=0.0721, lr=1.86e-05, step=3915] Training: 39%|███▉ | 3916/10000 [53:02<1:01:38, 1.65it/s, loss=0.0430, lr=1.86e-05, step=3916] Training: 39%|███▉ | 3917/10000 [53:03<1:00:24, 1.68it/s, loss=0.0430, lr=1.86e-05, step=3916] Training: 39%|███▉ | 3917/10000 [53:03<1:00:24, 1.68it/s, loss=0.0292, lr=1.86e-05, step=3917] Training: 39%|███▉ | 3918/10000 [53:04<1:05:12, 1.55it/s, loss=0.0292, lr=1.86e-05, step=3917] Training: 39%|███▉ | 3918/10000 [53:04<1:05:12, 1.55it/s, loss=0.0130, lr=1.86e-05, step=3918] Training: 39%|███▉ | 3919/10000 [53:04<1:05:29, 1.55it/s, loss=0.0130, lr=1.86e-05, step=3918] Training: 39%|███▉ | 3919/10000 [53:04<1:05:29, 1.55it/s, loss=0.0111, lr=1.85e-05, step=3919]19:37:37.028 [I] step=3920 loss=0.0196 smoothed_loss=0.0225 lr=1.86e-05 grad_norm=0.4606 step_time=0.5492s data_time=0.0796s it/s=1.591 eta_to_10000=3822.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0217 grad_action_out_proj_arms=0.1577 grad_arm_token_fuse=0.1088 grad_shared_expert=0.4484 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3920/10000 [53:05<1:02:23, 1.62it/s, loss=0.0111, lr=1.85e-05, step=3919] Training: 39%|███▉ | 3920/10000 [53:05<1:02:23, 1.62it/s, loss=0.0196, lr=1.85e-05, step=3920] Training: 39%|███▉ | 3921/10000 [53:05<1:05:35, 1.54it/s, loss=0.0196, lr=1.85e-05, step=3920] Training: 39%|███▉ | 3921/10000 [53:05<1:05:35, 1.54it/s, loss=0.0145, lr=1.85e-05, step=3921] Training: 39%|███▉ | 3922/10000 [53:06<1:11:56, 1.41it/s, loss=0.0145, lr=1.85e-05, step=3921] Training: 39%|███▉ | 3922/10000 [53:06<1:11:56, 1.41it/s, loss=0.0078, lr=1.85e-05, step=3922] Training: 39%|███▉ | 3923/10000 [53:07<1:05:32, 1.55it/s, loss=0.0078, lr=1.85e-05, step=3922] Training: 39%|███▉ | 3923/10000 [53:07<1:05:32, 1.55it/s, loss=0.0101, lr=1.85e-05, step=3923] Training: 39%|███▉ | 3924/10000 [53:07<1:01:07, 1.66it/s, loss=0.0101, lr=1.85e-05, step=3923] Training: 39%|███▉ | 3924/10000 [53:07<1:01:07, 1.66it/s, loss=0.0111, lr=1.85e-05, step=3924] Training: 39%|███▉ | 3925/10000 [53:08<1:01:54, 1.64it/s, loss=0.0111, lr=1.85e-05, step=3924] Training: 39%|███▉ | 3925/10000 [53:08<1:01:54, 1.64it/s, loss=0.0118, lr=1.85e-05, step=3925] Training: 39%|███▉ | 3926/10000 [53:08<58:34, 1.73it/s, loss=0.0118, lr=1.85e-05, step=3925] Training: 39%|███▉ | 3926/10000 [53:08<58:34, 1.73it/s, loss=0.0200, lr=1.85e-05, step=3926] Training: 39%|███▉ | 3927/10000 [53:09<1:08:22, 1.48it/s, loss=0.0200, lr=1.85e-05, step=3926] Training: 39%|███▉ | 3927/10000 [53:09<1:08:22, 1.48it/s, loss=0.0204, lr=1.85e-05, step=3927] Training: 39%|███▉ | 3928/10000 [53:10<1:11:08, 1.42it/s, loss=0.0204, lr=1.85e-05, step=3927] Training: 39%|███▉ | 3928/10000 [53:10<1:11:08, 1.42it/s, loss=0.0147, lr=1.85e-05, step=3928] Training: 39%|███▉ | 3929/10000 [53:11<1:11:14, 1.42it/s, loss=0.0147, lr=1.85e-05, step=3928] Training: 39%|███▉ | 3929/10000 [53:11<1:11:14, 1.42it/s, loss=0.0070, lr=1.85e-05, step=3929]19:37:43.655 [I] step=3930 loss=0.0203 smoothed_loss=0.0172 lr=1.85e-05 grad_norm=0.4946 step_time=0.5778s data_time=0.0849s it/s=1.509 eta_to_10000=4022.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0208 grad_action_out_proj_arms=0.2035 grad_arm_token_fuse=0.1147 grad_shared_expert=0.5402 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3930/10000 [53:11<1:06:14, 1.53it/s, loss=0.0070, lr=1.85e-05, step=3929] Training: 39%|███▉ | 3930/10000 [53:11<1:06:14, 1.53it/s, loss=0.0203, lr=1.85e-05, step=3930] Training: 39%|███▉ | 3931/10000 [53:12<1:02:12, 1.63it/s, loss=0.0203, lr=1.85e-05, step=3930] Training: 39%|███▉ | 3931/10000 [53:12<1:02:12, 1.63it/s, loss=0.0161, lr=1.85e-05, step=3931] Training: 39%|███▉ | 3932/10000 [53:12<1:00:17, 1.68it/s, loss=0.0161, lr=1.85e-05, step=3931] Training: 39%|███▉ | 3932/10000 [53:12<1:00:17, 1.68it/s, loss=0.0067, lr=1.85e-05, step=3932] Training: 39%|███▉ | 3933/10000 [53:13<57:35, 1.76it/s, loss=0.0067, lr=1.85e-05, step=3932] Training: 39%|███▉ | 3933/10000 [53:13<57:35, 1.76it/s, loss=0.0117, lr=1.85e-05, step=3933] Training: 39%|███▉ | 3934/10000 [53:14<1:01:54, 1.63it/s, loss=0.0117, lr=1.85e-05, step=3933] Training: 39%|███▉ | 3934/10000 [53:14<1:01:54, 1.63it/s, loss=0.0122, lr=1.85e-05, step=3934] Training: 39%|███▉ | 3935/10000 [53:14<59:15, 1.71it/s, loss=0.0122, lr=1.85e-05, step=3934] Training: 39%|███▉ | 3935/10000 [53:14<59:15, 1.71it/s, loss=0.0268, lr=1.85e-05, step=3935] Training: 39%|███▉ | 3936/10000 [53:15<1:06:14, 1.53it/s, loss=0.0268, lr=1.85e-05, step=3935] Training: 39%|███▉ | 3936/10000 [53:15<1:06:14, 1.53it/s, loss=0.0317, lr=1.85e-05, step=3936] Training: 39%|███▉ | 3937/10000 [53:16<1:05:51, 1.53it/s, loss=0.0317, lr=1.85e-05, step=3936] Training: 39%|███▉ | 3937/10000 [53:16<1:05:51, 1.53it/s, loss=0.0133, lr=1.85e-05, step=3937] Training: 39%|███▉ | 3938/10000 [53:16<1:02:10, 1.63it/s, loss=0.0133, lr=1.85e-05, step=3937] Training: 39%|███▉ | 3938/10000 [53:16<1:02:10, 1.63it/s, loss=0.0073, lr=1.85e-05, step=3938] Training: 39%|███▉ | 3939/10000 [53:17<58:54, 1.71it/s, loss=0.0073, lr=1.85e-05, step=3938] Training: 39%|███▉ | 3939/10000 [53:17<58:54, 1.71it/s, loss=0.0048, lr=1.85e-05, step=3939]19:37:49.555 [I] step=3940 loss=0.0120 smoothed_loss=0.0150 lr=1.85e-05 grad_norm=0.4519 step_time=0.5137s data_time=0.0763s it/s=1.695 eta_to_10000=3574.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.1541 grad_arm_token_fuse=0.0739 grad_shared_expert=0.4194 (18633:train_pytorch.py:850) + Training: 39%|███▉ | 3940/10000 [53:17<58:51, 1.72it/s, loss=0.0048, lr=1.85e-05, step=3939] Training: 39%|███▉ | 3940/10000 [53:17<58:51, 1.72it/s, loss=0.0120, lr=1.85e-05, step=3940] Training: 39%|███▉ | 3941/10000 [53:18<58:05, 1.74it/s, loss=0.0120, lr=1.85e-05, step=3940] Training: 39%|███▉ | 3941/10000 [53:18<58:05, 1.74it/s, loss=0.0091, lr=1.85e-05, step=3941] Training: 39%|███▉ | 3942/10000 [53:19<1:05:27, 1.54it/s, loss=0.0091, lr=1.85e-05, step=3941] Training: 39%|███▉ | 3942/10000 [53:19<1:05:27, 1.54it/s, loss=0.0102, lr=1.85e-05, step=3942] Training: 39%|███▉ | 3943/10000 [53:19<1:08:13, 1.48it/s, loss=0.0102, lr=1.85e-05, step=3942] Training: 39%|███▉ | 3943/10000 [53:19<1:08:13, 1.48it/s, loss=0.0363, lr=1.85e-05, step=3943] Training: 39%|███▉ | 3944/10000 [53:20<1:03:04, 1.60it/s, loss=0.0363, lr=1.85e-05, step=3943] Training: 39%|███▉ | 3944/10000 [53:20<1:03:04, 1.60it/s, loss=0.0176, lr=1.85e-05, step=3944] Training: 39%|███▉ | 3945/10000 [53:20<1:02:23, 1.62it/s, loss=0.0176, lr=1.85e-05, step=3944] Training: 39%|███▉ | 3945/10000 [53:20<1:02:23, 1.62it/s, loss=0.0070, lr=1.85e-05, step=3945] Training: 39%|███▉ | 3946/10000 [53:21<1:05:53, 1.53it/s, loss=0.0070, lr=1.85e-05, step=3945] Training: 39%|███▉ | 3946/10000 [53:21<1:05:53, 1.53it/s, loss=0.0060, lr=1.85e-05, step=3946] Training: 39%|███▉ | 3947/10000 [53:22<1:03:35, 1.59it/s, loss=0.0060, lr=1.85e-05, step=3946] Training: 39%|███▉ | 3947/10000 [53:22<1:03:35, 1.59it/s, loss=0.0219, lr=1.85e-05, step=3947] Training: 39%|███▉ | 3948/10000 [53:22<59:03, 1.71it/s, loss=0.0219, lr=1.85e-05, step=3947] Training: 39%|███▉ | 3948/10000 [53:22<59:03, 1.71it/s, loss=0.0114, lr=1.84e-05, step=3948] Training: 39%|███▉ | 3949/10000 [53:23<1:07:08, 1.50it/s, loss=0.0114, lr=1.84e-05, step=3948] Training: 39%|███▉ | 3949/10000 [53:23<1:07:08, 1.50it/s, loss=0.0332, lr=1.84e-05, step=3949]19:37:56.327 [I] step=3950 loss=0.0072 smoothed_loss=0.0157 lr=1.85e-05 grad_norm=0.5591 step_time=0.5765s data_time=0.1008s it/s=1.477 eta_to_10000=4096.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0282 grad_action_out_proj_arms=0.2086 grad_arm_token_fuse=0.1647 grad_shared_expert=0.4795 (18633:train_pytorch.py:850) + Training: 40%|███▉ | 3950/10000 [53:24<1:14:15, 1.36it/s, loss=0.0332, lr=1.84e-05, step=3949] Training: 40%|███▉ | 3950/10000 [53:24<1:14:15, 1.36it/s, loss=0.0072, lr=1.84e-05, step=3950] Training: 40%|███▉ | 3951/10000 [53:25<1:13:17, 1.38it/s, loss=0.0072, lr=1.84e-05, step=3950] Training: 40%|███▉ | 3951/10000 [53:25<1:13:17, 1.38it/s, loss=0.0182, lr=1.84e-05, step=3951] Training: 40%|███▉ | 3952/10000 [53:25<1:11:12, 1.42it/s, loss=0.0182, lr=1.84e-05, step=3951] Training: 40%|███▉ | 3952/10000 [53:25<1:11:12, 1.42it/s, loss=0.0117, lr=1.84e-05, step=3952] Training: 40%|███▉ | 3953/10000 [53:26<1:10:11, 1.44it/s, loss=0.0117, lr=1.84e-05, step=3952] Training: 40%|███▉ | 3953/10000 [53:26<1:10:11, 1.44it/s, loss=0.0145, lr=1.84e-05, step=3953] Training: 40%|███▉ | 3954/10000 [53:27<1:09:38, 1.45it/s, loss=0.0145, lr=1.84e-05, step=3953] Training: 40%|███▉ | 3954/10000 [53:27<1:09:38, 1.45it/s, loss=0.0069, lr=1.84e-05, step=3954] Training: 40%|███▉ | 3955/10000 [53:27<1:03:42, 1.58it/s, loss=0.0069, lr=1.84e-05, step=3954] Training: 40%|███▉ | 3955/10000 [53:27<1:03:42, 1.58it/s, loss=0.0043, lr=1.84e-05, step=3955] Training: 40%|███▉ | 3956/10000 [53:28<59:52, 1.68it/s, loss=0.0043, lr=1.84e-05, step=3955] Training: 40%|███▉ | 3956/10000 [53:28<59:52, 1.68it/s, loss=0.0089, lr=1.84e-05, step=3956] Training: 40%|███▉ | 3957/10000 [53:29<1:06:54, 1.51it/s, loss=0.0089, lr=1.84e-05, step=3956] Training: 40%|███▉ | 3957/10000 [53:29<1:06:54, 1.51it/s, loss=0.0067, lr=1.84e-05, step=3957] Training: 40%|███▉ | 3958/10000 [53:29<1:01:31, 1.64it/s, loss=0.0067, lr=1.84e-05, step=3957] Training: 40%|███▉ | 3958/10000 [53:29<1:01:31, 1.64it/s, loss=0.0191, lr=1.84e-05, step=3958] Training: 40%|███▉ | 3959/10000 [53:30<59:33, 1.69it/s, loss=0.0191, lr=1.84e-05, step=3958] Training: 40%|███▉ | 3959/10000 [53:30<59:33, 1.69it/s, loss=0.0494, lr=1.84e-05, step=3959]19:38:02.456 [I] step=3960 loss=0.0182 smoothed_loss=0.0169 lr=1.84e-05 grad_norm=0.4802 step_time=0.5210s data_time=0.0918s it/s=1.632 eta_to_10000=3701.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.0838 grad_arm_token_fuse=0.0353 grad_shared_expert=0.6214 (18633:train_pytorch.py:850) + Training: 40%|███▉ | 3960/10000 [53:30<58:21, 1.73it/s, loss=0.0494, lr=1.84e-05, step=3959] Training: 40%|███▉ | 3960/10000 [53:30<58:21, 1.73it/s, loss=0.0182, lr=1.84e-05, step=3960] Training: 40%|███▉ | 3961/10000 [53:31<55:21, 1.82it/s, loss=0.0182, lr=1.84e-05, step=3960] Training: 40%|███▉ | 3961/10000 [53:31<55:21, 1.82it/s, loss=0.0068, lr=1.84e-05, step=3961] Training: 40%|███▉ | 3962/10000 [53:31<53:42, 1.87it/s, loss=0.0068, lr=1.84e-05, step=3961] Training: 40%|███▉ | 3962/10000 [53:31<53:42, 1.87it/s, loss=0.0269, lr=1.84e-05, step=3962] Training: 40%|███▉ | 3963/10000 [53:32<54:01, 1.86it/s, loss=0.0269, lr=1.84e-05, step=3962] Training: 40%|███▉ | 3963/10000 [53:32<54:01, 1.86it/s, loss=0.0169, lr=1.84e-05, step=3963] Training: 40%|███▉ | 3964/10000 [53:33<1:04:49, 1.55it/s, loss=0.0169, lr=1.84e-05, step=3963] Training: 40%|███▉ | 3964/10000 [53:33<1:04:49, 1.55it/s, loss=0.0079, lr=1.84e-05, step=3964] Training: 40%|███▉ | 3965/10000 [53:33<1:09:44, 1.44it/s, loss=0.0079, lr=1.84e-05, step=3964] Training: 40%|███▉ | 3965/10000 [53:33<1:09:44, 1.44it/s, loss=0.0302, lr=1.84e-05, step=3965] Training: 40%|███▉ | 3966/10000 [53:34<1:04:06, 1.57it/s, loss=0.0302, lr=1.84e-05, step=3965] Training: 40%|███▉ | 3966/10000 [53:34<1:04:06, 1.57it/s, loss=0.0258, lr=1.84e-05, step=3966] Training: 40%|███▉ | 3967/10000 [53:34<1:01:17, 1.64it/s, loss=0.0258, lr=1.84e-05, step=3966] Training: 40%|███▉ | 3967/10000 [53:34<1:01:17, 1.64it/s, loss=0.0096, lr=1.84e-05, step=3967] Training: 40%|███▉ | 3968/10000 [53:35<1:02:56, 1.60it/s, loss=0.0096, lr=1.84e-05, step=3967] Training: 40%|███▉ | 3968/10000 [53:35<1:02:56, 1.60it/s, loss=0.0390, lr=1.84e-05, step=3968] Training: 40%|███▉ | 3969/10000 [53:36<1:01:31, 1.63it/s, loss=0.0390, lr=1.84e-05, step=3968] Training: 40%|███▉ | 3969/10000 [53:36<1:01:31, 1.63it/s, loss=0.0074, lr=1.84e-05, step=3969]19:38:08.739 [I] step=3970 loss=0.0139 smoothed_loss=0.0179 lr=1.84e-05 grad_norm=0.5350 step_time=0.5366s data_time=0.0918s it/s=1.592 eta_to_10000=3788.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1256 grad_arm_token_fuse=0.0584 grad_shared_expert=0.5397 (18633:train_pytorch.py:850) + Training: 40%|███▉ | 3970/10000 [53:36<1:06:08, 1.52it/s, loss=0.0074, lr=1.84e-05, step=3969] Training: 40%|███▉ | 3970/10000 [53:36<1:06:08, 1.52it/s, loss=0.0139, lr=1.84e-05, step=3970] Training: 40%|███▉ | 3971/10000 [53:37<1:07:19, 1.49it/s, loss=0.0139, lr=1.84e-05, step=3970] Training: 40%|███▉ | 3971/10000 [53:37<1:07:19, 1.49it/s, loss=0.0316, lr=1.84e-05, step=3971] Training: 40%|███▉ | 3972/10000 [53:38<1:10:46, 1.42it/s, loss=0.0316, lr=1.84e-05, step=3971] Training: 40%|███▉ | 3972/10000 [53:38<1:10:46, 1.42it/s, loss=0.0283, lr=1.84e-05, step=3972] Training: 40%|███▉ | 3973/10000 [53:39<1:12:45, 1.38it/s, loss=0.0283, lr=1.84e-05, step=3972] Training: 40%|███▉ | 3973/10000 [53:39<1:12:45, 1.38it/s, loss=0.0059, lr=1.84e-05, step=3973] Training: 40%|███▉ | 3974/10000 [53:39<1:05:51, 1.53it/s, loss=0.0059, lr=1.84e-05, step=3973] Training: 40%|███▉ | 3974/10000 [53:39<1:05:51, 1.53it/s, loss=0.0099, lr=1.84e-05, step=3974] Training: 40%|███▉ | 3975/10000 [53:40<1:09:18, 1.45it/s, loss=0.0099, lr=1.84e-05, step=3974] Training: 40%|███▉ | 3975/10000 [53:40<1:09:18, 1.45it/s, loss=0.0246, lr=1.84e-05, step=3975] Training: 40%|███▉ | 3976/10000 [53:40<1:04:05, 1.57it/s, loss=0.0246, lr=1.84e-05, step=3975] Training: 40%|███▉ | 3976/10000 [53:40<1:04:05, 1.57it/s, loss=0.0112, lr=1.84e-05, step=3976] Training: 40%|███▉ | 3977/10000 [53:41<1:02:29, 1.61it/s, loss=0.0112, lr=1.84e-05, step=3976] Training: 40%|███▉ | 3977/10000 [53:41<1:02:29, 1.61it/s, loss=0.0182, lr=1.84e-05, step=3977] Training: 40%|███▉ | 3978/10000 [53:42<1:04:06, 1.57it/s, loss=0.0182, lr=1.84e-05, step=3977] Training: 40%|███▉ | 3978/10000 [53:42<1:04:06, 1.57it/s, loss=0.0080, lr=1.83e-05, step=3978] Training: 40%|███▉ | 3979/10000 [53:42<1:07:10, 1.49it/s, loss=0.0080, lr=1.83e-05, step=3978] Training: 40%|███▉ | 3979/10000 [53:42<1:07:10, 1.49it/s, loss=0.0113, lr=1.83e-05, step=3979]19:38:15.338 [I] step=3980 loss=0.0068 smoothed_loss=0.0154 lr=1.84e-05 grad_norm=0.5242 step_time=0.5558s data_time=0.1040s it/s=1.516 eta_to_10000=3971.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0275 grad_action_out_proj_arms=0.1650 grad_arm_token_fuse=0.1419 grad_shared_expert=0.4527 (18633:train_pytorch.py:850) + Training: 40%|███▉ | 3980/10000 [53:43<1:03:47, 1.57it/s, loss=0.0113, lr=1.83e-05, step=3979] Training: 40%|███▉ | 3980/10000 [53:43<1:03:47, 1.57it/s, loss=0.0068, lr=1.83e-05, step=3980] Training: 40%|███▉ | 3981/10000 [53:43<59:16, 1.69it/s, loss=0.0068, lr=1.83e-05, step=3980] Training: 40%|███▉ | 3981/10000 [53:43<59:16, 1.69it/s, loss=0.0185, lr=1.83e-05, step=3981] Training: 40%|███▉ | 3982/10000 [53:44<55:50, 1.80it/s, loss=0.0185, lr=1.83e-05, step=3981] Training: 40%|███▉ | 3982/10000 [53:44<55:50, 1.80it/s, loss=0.0167, lr=1.83e-05, step=3982] Training: 40%|███▉ | 3983/10000 [53:45<56:01, 1.79it/s, loss=0.0167, lr=1.83e-05, step=3982] Training: 40%|███▉ | 3983/10000 [53:45<56:01, 1.79it/s, loss=0.0436, lr=1.83e-05, step=3983] Training: 40%|███▉ | 3984/10000 [53:45<55:01, 1.82it/s, loss=0.0436, lr=1.83e-05, step=3983] Training: 40%|███▉ | 3984/10000 [53:45<55:01, 1.82it/s, loss=0.0061, lr=1.83e-05, step=3984] Training: 40%|███▉ | 3985/10000 [53:46<55:45, 1.80it/s, loss=0.0061, lr=1.83e-05, step=3984] Training: 40%|███▉ | 3985/10000 [53:46<55:45, 1.80it/s, loss=0.0075, lr=1.83e-05, step=3985] Training: 40%|███▉ | 3986/10000 [53:46<1:01:12, 1.64it/s, loss=0.0075, lr=1.83e-05, step=3985] Training: 40%|███▉ | 3986/10000 [53:46<1:01:12, 1.64it/s, loss=0.0062, lr=1.83e-05, step=3986] Training: 40%|███▉ | 3987/10000 [53:47<57:26, 1.74it/s, loss=0.0062, lr=1.83e-05, step=3986] Training: 40%|███▉ | 3987/10000 [53:47<57:26, 1.74it/s, loss=0.0057, lr=1.83e-05, step=3987] Training: 40%|███▉ | 3988/10000 [53:47<54:45, 1.83it/s, loss=0.0057, lr=1.83e-05, step=3987] Training: 40%|███▉ | 3988/10000 [53:47<54:45, 1.83it/s, loss=0.0104, lr=1.83e-05, step=3988] Training: 40%|███▉ | 3989/10000 [53:48<59:15, 1.69it/s, loss=0.0104, lr=1.83e-05, step=3988] Training: 40%|███▉ | 3989/10000 [53:48<59:15, 1.69it/s, loss=0.0136, lr=1.83e-05, step=3989]19:38:20.898 [I] step=3990 loss=0.0364 smoothed_loss=0.0162 lr=1.83e-05 grad_norm=0.4366 step_time=0.4750s data_time=0.0810s it/s=1.799 eta_to_10000=3341.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0192 grad_action_out_proj_arms=0.1610 grad_arm_token_fuse=0.0964 grad_shared_expert=0.6069 (18633:train_pytorch.py:850) + Training: 40%|███▉ | 3990/10000 [53:49<57:26, 1.74it/s, loss=0.0136, lr=1.83e-05, step=3989] Training: 40%|███▉ | 3990/10000 [53:49<57:26, 1.74it/s, loss=0.0364, lr=1.83e-05, step=3990] Training: 40%|███▉ | 3991/10000 [53:49<54:48, 1.83it/s, loss=0.0364, lr=1.83e-05, step=3990] Training: 40%|███▉ | 3991/10000 [53:49<54:48, 1.83it/s, loss=0.0189, lr=1.83e-05, step=3991] Training: 40%|███▉ | 3992/10000 [53:50<54:40, 1.83it/s, loss=0.0189, lr=1.83e-05, step=3991] Training: 40%|███▉ | 3992/10000 [53:50<54:40, 1.83it/s, loss=0.0041, lr=1.83e-05, step=3992] Training: 40%|███▉ | 3993/10000 [53:50<1:02:51, 1.59it/s, loss=0.0041, lr=1.83e-05, step=3992] Training: 40%|███▉ | 3993/10000 [53:50<1:02:51, 1.59it/s, loss=0.0245, lr=1.83e-05, step=3993] Training: 40%|███▉ | 3994/10000 [53:51<1:08:31, 1.46it/s, loss=0.0245, lr=1.83e-05, step=3993] Training: 40%|███▉ | 3994/10000 [53:51<1:08:31, 1.46it/s, loss=0.0522, lr=1.83e-05, step=3994] Training: 40%|███▉ | 3995/10000 [53:52<1:03:05, 1.59it/s, loss=0.0522, lr=1.83e-05, step=3994] Training: 40%|███▉ | 3995/10000 [53:52<1:03:05, 1.59it/s, loss=0.0083, lr=1.83e-05, step=3995] Training: 40%|███▉ | 3996/10000 [53:52<1:02:54, 1.59it/s, loss=0.0083, lr=1.83e-05, step=3995] Training: 40%|███▉ | 3996/10000 [53:52<1:02:54, 1.59it/s, loss=0.0121, lr=1.83e-05, step=3996] Training: 40%|███▉ | 3997/10000 [53:53<1:04:10, 1.56it/s, loss=0.0121, lr=1.83e-05, step=3996] Training: 40%|███▉ | 3997/10000 [53:53<1:04:10, 1.56it/s, loss=0.0215, lr=1.83e-05, step=3997] Training: 40%|███▉ | 3998/10000 [53:54<1:01:40, 1.62it/s, loss=0.0215, lr=1.83e-05, step=3997] Training: 40%|███▉ | 3998/10000 [53:54<1:01:40, 1.62it/s, loss=0.0134, lr=1.83e-05, step=3998] Training: 40%|███▉ | 3999/10000 [53:54<58:02, 1.72it/s, loss=0.0134, lr=1.83e-05, step=3998] Training: 40%|███▉ | 3999/10000 [53:54<58:02, 1.72it/s, loss=0.0059, lr=1.83e-05, step=3999]19:38:27.254 [I] step=4000 loss=0.0278 smoothed_loss=0.0177 lr=1.83e-05 grad_norm=0.4481 step_time=0.5487s data_time=0.0869s it/s=1.574 eta_to_10000=3812.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.1730 grad_arm_token_fuse=0.0796 grad_shared_expert=0.3729 (18633:train_pytorch.py:850) +19:39:53.065 [I] Saved checkpoint at step 4000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/4000 (18633:train_pytorch.py:350) + Training: 40%|████ | 4000/10000 [55:21<44:00:04, 26.40s/it, loss=0.0059, lr=1.83e-05, step=3999] Training: 40%|████ | 4000/10000 [55:21<44:00:04, 26.40s/it, loss=0.0278, lr=1.83e-05, step=4000] Training: 40%|████ | 4001/10000 [55:22<31:12:09, 18.72s/it, loss=0.0278, lr=1.83e-05, step=4000] Training: 40%|████ | 4001/10000 [55:22<31:12:09, 18.72s/it, loss=0.0190, lr=1.83e-05, step=4001] Training: 40%|████ | 4002/10000 [55:22<22:05:34, 13.26s/it, loss=0.0190, lr=1.83e-05, step=4001] Training: 40%|████ | 4002/10000 [55:22<22:05:34, 13.26s/it, loss=0.0063, lr=1.83e-05, step=4002] Training: 40%|████ | 4003/10000 [55:23<15:42:36, 9.43s/it, loss=0.0063, lr=1.83e-05, step=4002] Training: 40%|████ | 4003/10000 [55:23<15:42:36, 9.43s/it, loss=0.0084, lr=1.83e-05, step=4003] Training: 40%|████ | 4004/10000 [55:23<11:15:16, 6.76s/it, loss=0.0084, lr=1.83e-05, step=4003] Training: 40%|████ | 4004/10000 [55:23<11:15:16, 6.76s/it, loss=0.0153, lr=1.83e-05, step=4004] Training: 40%|████ | 4005/10000 [55:24<8:08:00, 4.88s/it, loss=0.0153, lr=1.83e-05, step=4004] Training: 40%|████ | 4005/10000 [55:24<8:08:00, 4.88s/it, loss=0.0765, lr=1.83e-05, step=4005] Training: 40%|████ | 4006/10000 [55:24<5:56:47, 3.57s/it, loss=0.0765, lr=1.83e-05, step=4005] Training: 40%|████ | 4006/10000 [55:24<5:56:47, 3.57s/it, loss=0.0062, lr=1.83e-05, step=4006] Training: 40%|████ | 4007/10000 [55:25<4:33:21, 2.74s/it, loss=0.0062, lr=1.83e-05, step=4006] Training: 40%|████ | 4007/10000 [55:25<4:33:21, 2.74s/it, loss=0.0145, lr=1.82e-05, step=4007] Training: 40%|████ | 4008/10000 [55:26<3:32:22, 2.13s/it, loss=0.0145, lr=1.82e-05, step=4007] Training: 40%|████ | 4008/10000 [55:26<3:32:22, 2.13s/it, loss=0.0178, lr=1.82e-05, step=4008] Training: 40%|████ | 4009/10000 [55:26<2:43:31, 1.64s/it, loss=0.0178, lr=1.82e-05, step=4008] Training: 40%|████ | 4009/10000 [55:26<2:43:31, 1.64s/it, loss=0.0647, lr=1.82e-05, step=4009]19:39:58.978 [I] step=4010 loss=0.0053 smoothed_loss=0.0222 lr=1.83e-05 grad_norm=0.5485 step_time=0.5233s data_time=8.6491s it/s=0.109 eta_to_10000=54941.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.1828 grad_arm_token_fuse=0.0853 grad_shared_expert=0.4442 (18633:train_pytorch.py:850) + Training: 40%|████ | 4010/10000 [55:27<2:11:17, 1.32s/it, loss=0.0647, lr=1.82e-05, step=4009] Training: 40%|████ | 4010/10000 [55:27<2:11:17, 1.32s/it, loss=0.0053, lr=1.82e-05, step=4010] Training: 40%|████ | 4011/10000 [55:27<1:46:36, 1.07s/it, loss=0.0053, lr=1.82e-05, step=4010] Training: 40%|████ | 4011/10000 [55:27<1:46:36, 1.07s/it, loss=0.0108, lr=1.82e-05, step=4011] Training: 40%|████ | 4012/10000 [55:28<1:37:11, 1.03it/s, loss=0.0108, lr=1.82e-05, step=4011] Training: 40%|████ | 4012/10000 [55:28<1:37:11, 1.03it/s, loss=0.0460, lr=1.82e-05, step=4012] Training: 40%|████ | 4013/10000 [55:29<1:26:35, 1.15it/s, loss=0.0460, lr=1.82e-05, step=4012] Training: 40%|████ | 4013/10000 [55:29<1:26:35, 1.15it/s, loss=0.0049, lr=1.82e-05, step=4013] Training: 40%|████ | 4014/10000 [55:29<1:24:03, 1.19it/s, loss=0.0049, lr=1.82e-05, step=4013] Training: 40%|████ | 4014/10000 [55:29<1:24:03, 1.19it/s, loss=0.0119, lr=1.82e-05, step=4014] Training: 40%|████ | 4015/10000 [55:30<1:23:46, 1.19it/s, loss=0.0119, lr=1.82e-05, step=4014] Training: 40%|████ | 4015/10000 [55:30<1:23:46, 1.19it/s, loss=0.0316, lr=1.82e-05, step=4015] Training: 40%|████ | 4016/10000 [55:31<1:13:35, 1.36it/s, loss=0.0316, lr=1.82e-05, step=4015] Training: 40%|████ | 4016/10000 [55:31<1:13:35, 1.36it/s, loss=0.0203, lr=1.82e-05, step=4016] Training: 40%|████ | 4017/10000 [55:31<1:06:31, 1.50it/s, loss=0.0203, lr=1.82e-05, step=4016] Training: 40%|████ | 4017/10000 [55:31<1:06:31, 1.50it/s, loss=0.0148, lr=1.82e-05, step=4017] Training: 40%|████ | 4018/10000 [55:32<1:01:34, 1.62it/s, loss=0.0148, lr=1.82e-05, step=4017] Training: 40%|████ | 4018/10000 [55:32<1:01:34, 1.62it/s, loss=0.0076, lr=1.82e-05, step=4018] Training: 40%|████ | 4019/10000 [55:32<58:07, 1.72it/s, loss=0.0076, lr=1.82e-05, step=4018] Training: 40%|████ | 4019/10000 [55:32<58:07, 1.72it/s, loss=0.0057, lr=1.82e-05, step=4019]19:40:05.005 [I] step=4020 loss=0.0154 smoothed_loss=0.0179 lr=1.82e-05 grad_norm=0.5379 step_time=0.5242s data_time=0.0785s it/s=1.660 eta_to_10000=3603.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0294 grad_action_out_proj_arms=0.1911 grad_arm_token_fuse=0.1562 grad_shared_expert=0.5652 (18633:train_pytorch.py:850) + Training: 40%|████ | 4020/10000 [55:33<56:44, 1.76it/s, loss=0.0057, lr=1.82e-05, step=4019] Training: 40%|████ | 4020/10000 [55:33<56:44, 1.76it/s, loss=0.0154, lr=1.82e-05, step=4020] Training: 40%|████ | 4021/10000 [55:33<1:02:26, 1.60it/s, loss=0.0154, lr=1.82e-05, step=4020] Training: 40%|████ | 4021/10000 [55:33<1:02:26, 1.60it/s, loss=0.0181, lr=1.82e-05, step=4021] Training: 40%|████ | 4022/10000 [55:34<1:07:35, 1.47it/s, loss=0.0181, lr=1.82e-05, step=4021] Training: 40%|████ | 4022/10000 [55:34<1:07:35, 1.47it/s, loss=0.0062, lr=1.82e-05, step=4022] Training: 40%|████ | 4023/10000 [55:35<1:02:34, 1.59it/s, loss=0.0062, lr=1.82e-05, step=4022] Training: 40%|████ | 4023/10000 [55:35<1:02:34, 1.59it/s, loss=0.0107, lr=1.82e-05, step=4023] Training: 40%|████ | 4024/10000 [55:35<1:01:50, 1.61it/s, loss=0.0107, lr=1.82e-05, step=4023] Training: 40%|████ | 4024/10000 [55:35<1:01:50, 1.61it/s, loss=0.1962, lr=1.82e-05, step=4024] Training: 40%|████ | 4025/10000 [55:36<58:24, 1.71it/s, loss=0.1962, lr=1.82e-05, step=4024] Training: 40%|████ | 4025/10000 [55:36<58:24, 1.71it/s, loss=0.0074, lr=1.82e-05, step=4025] Training: 40%|████ | 4026/10000 [55:36<56:07, 1.77it/s, loss=0.0074, lr=1.82e-05, step=4025] Training: 40%|████ | 4026/10000 [55:36<56:07, 1.77it/s, loss=0.0294, lr=1.82e-05, step=4026] Training: 40%|████ | 4027/10000 [55:37<54:26, 1.83it/s, loss=0.0294, lr=1.82e-05, step=4026] Training: 40%|████ | 4027/10000 [55:37<54:26, 1.83it/s, loss=0.0199, lr=1.82e-05, step=4027] Training: 40%|████ | 4028/10000 [55:38<58:58, 1.69it/s, loss=0.0199, lr=1.82e-05, step=4027] Training: 40%|████ | 4028/10000 [55:38<58:58, 1.69it/s, loss=0.0203, lr=1.82e-05, step=4028] Training: 40%|████ | 4029/10000 [55:38<1:04:59, 1.53it/s, loss=0.0203, lr=1.82e-05, step=4028] Training: 40%|████ | 4029/10000 [55:38<1:04:59, 1.53it/s, loss=0.0052, lr=1.82e-05, step=4029]19:40:11.221 [I] step=4030 loss=0.0201 smoothed_loss=0.0261 lr=1.82e-05 grad_norm=0.4950 step_time=0.5520s data_time=0.0696s it/s=1.609 eta_to_10000=3710.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0126 grad_action_out_proj_arms=0.1067 grad_arm_token_fuse=0.0576 grad_shared_expert=0.3247 (18633:train_pytorch.py:850) + Training: 40%|████ | 4030/10000 [55:39<1:01:07, 1.63it/s, loss=0.0052, lr=1.82e-05, step=4029] Training: 40%|████ | 4030/10000 [55:39<1:01:07, 1.63it/s, loss=0.0201, lr=1.82e-05, step=4030] Training: 40%|████ | 4031/10000 [55:39<57:40, 1.72it/s, loss=0.0201, lr=1.82e-05, step=4030] Training: 40%|████ | 4031/10000 [55:39<57:40, 1.72it/s, loss=0.0138, lr=1.82e-05, step=4031] Training: 40%|████ | 4032/10000 [55:40<57:18, 1.74it/s, loss=0.0138, lr=1.82e-05, step=4031] Training: 40%|████ | 4032/10000 [55:40<57:18, 1.74it/s, loss=0.0102, lr=1.82e-05, step=4032] Training: 40%|████ | 4033/10000 [55:40<55:24, 1.79it/s, loss=0.0102, lr=1.82e-05, step=4032] Training: 40%|████ | 4033/10000 [55:40<55:24, 1.79it/s, loss=0.0065, lr=1.82e-05, step=4033] Training: 40%|████ | 4034/10000 [55:41<53:22, 1.86it/s, loss=0.0065, lr=1.82e-05, step=4033] Training: 40%|████ | 4034/10000 [55:41<53:22, 1.86it/s, loss=0.0260, lr=1.82e-05, step=4034] Training: 40%|████ | 4035/10000 [55:41<51:53, 1.92it/s, loss=0.0260, lr=1.82e-05, step=4034] Training: 40%|████ | 4035/10000 [55:41<51:53, 1.92it/s, loss=0.0091, lr=1.82e-05, step=4035] Training: 40%|████ | 4036/10000 [55:42<58:06, 1.71it/s, loss=0.0091, lr=1.82e-05, step=4035] Training: 40%|████ | 4036/10000 [55:42<58:06, 1.71it/s, loss=0.0023, lr=1.81e-05, step=4036] Training: 40%|████ | 4037/10000 [55:43<56:08, 1.77it/s, loss=0.0023, lr=1.81e-05, step=4036] Training: 40%|████ | 4037/10000 [55:43<56:08, 1.77it/s, loss=0.0051, lr=1.81e-05, step=4037] Training: 40%|████ | 4038/10000 [55:43<53:59, 1.84it/s, loss=0.0051, lr=1.81e-05, step=4037] Training: 40%|████ | 4038/10000 [55:43<53:59, 1.84it/s, loss=0.0330, lr=1.81e-05, step=4038] Training: 40%|████ | 4039/10000 [55:44<59:45, 1.66it/s, loss=0.0330, lr=1.81e-05, step=4038] Training: 40%|████ | 4039/10000 [55:44<59:45, 1.66it/s, loss=0.0216, lr=1.81e-05, step=4039]19:40:16.796 [I] step=4040 loss=0.0180 smoothed_loss=0.0192 lr=1.82e-05 grad_norm=0.4443 step_time=0.4838s data_time=0.0738s it/s=1.794 eta_to_10000=3322.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0113 grad_action_out_proj_arms=0.1386 grad_arm_token_fuse=0.0592 grad_shared_expert=0.4986 (18633:train_pytorch.py:850) + Training: 40%|████ | 4040/10000 [55:44<57:52, 1.72it/s, loss=0.0216, lr=1.81e-05, step=4039] Training: 40%|████ | 4040/10000 [55:44<57:52, 1.72it/s, loss=0.0180, lr=1.81e-05, step=4040] Training: 40%|████ | 4041/10000 [55:45<55:35, 1.79it/s, loss=0.0180, lr=1.81e-05, step=4040] Training: 40%|████ | 4041/10000 [55:45<55:35, 1.79it/s, loss=0.0120, lr=1.81e-05, step=4041] Training: 40%|████ | 4042/10000 [55:45<53:57, 1.84it/s, loss=0.0120, lr=1.81e-05, step=4041] Training: 40%|████ | 4042/10000 [55:45<53:57, 1.84it/s, loss=0.0185, lr=1.81e-05, step=4042] Training: 40%|████ | 4043/10000 [55:46<1:00:07, 1.65it/s, loss=0.0185, lr=1.81e-05, step=4042] Training: 40%|████ | 4043/10000 [55:46<1:00:07, 1.65it/s, loss=0.0373, lr=1.81e-05, step=4043] Training: 40%|████ | 4044/10000 [55:47<1:01:41, 1.61it/s, loss=0.0373, lr=1.81e-05, step=4043] Training: 40%|████ | 4044/10000 [55:47<1:01:41, 1.61it/s, loss=0.0117, lr=1.81e-05, step=4044] Training: 40%|████ | 4045/10000 [55:47<58:10, 1.71it/s, loss=0.0117, lr=1.81e-05, step=4044] Training: 40%|████ | 4045/10000 [55:47<58:10, 1.71it/s, loss=0.0166, lr=1.81e-05, step=4045] Training: 40%|████ | 4046/10000 [55:48<55:39, 1.78it/s, loss=0.0166, lr=1.81e-05, step=4045] Training: 40%|████ | 4046/10000 [55:48<55:39, 1.78it/s, loss=0.0109, lr=1.81e-05, step=4046] Training: 40%|████ | 4047/10000 [55:48<55:16, 1.79it/s, loss=0.0109, lr=1.81e-05, step=4046] Training: 40%|████ | 4047/10000 [55:48<55:16, 1.79it/s, loss=0.0018, lr=1.81e-05, step=4047] Training: 40%|████ | 4048/10000 [55:49<53:19, 1.86it/s, loss=0.0018, lr=1.81e-05, step=4047] Training: 40%|████ | 4048/10000 [55:49<53:19, 1.86it/s, loss=0.0145, lr=1.81e-05, step=4048] Training: 40%|████ | 4049/10000 [55:49<52:13, 1.90it/s, loss=0.0145, lr=1.81e-05, step=4048] Training: 40%|████ | 4049/10000 [55:49<52:13, 1.90it/s, loss=0.0114, lr=1.81e-05, step=4049]19:40:22.525 [I] step=4050 loss=0.0242 smoothed_loss=0.0168 lr=1.81e-05 grad_norm=0.4103 step_time=0.5116s data_time=0.0613s it/s=1.746 eta_to_10000=3408.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0068 grad_action_out_proj_arms=0.0900 grad_arm_token_fuse=0.0371 grad_shared_expert=0.4040 (18633:train_pytorch.py:850) + Training: 40%|████ | 4050/10000 [55:50<59:11, 1.68it/s, loss=0.0114, lr=1.81e-05, step=4049] Training: 40%|████ | 4050/10000 [55:50<59:11, 1.68it/s, loss=0.0242, lr=1.81e-05, step=4050] Training: 41%|████ | 4051/10000 [55:51<55:32, 1.79it/s, loss=0.0242, lr=1.81e-05, step=4050] Training: 41%|████ | 4051/10000 [55:51<55:32, 1.79it/s, loss=0.0147, lr=1.81e-05, step=4051] Training: 41%|████ | 4052/10000 [55:51<53:18, 1.86it/s, loss=0.0147, lr=1.81e-05, step=4051] Training: 41%|████ | 4052/10000 [55:51<53:18, 1.86it/s, loss=0.0176, lr=1.81e-05, step=4052] Training: 41%|████ | 4053/10000 [55:52<52:53, 1.87it/s, loss=0.0176, lr=1.81e-05, step=4052] Training: 41%|████ | 4053/10000 [55:52<52:53, 1.87it/s, loss=0.0155, lr=1.81e-05, step=4053] Training: 41%|████ | 4054/10000 [55:52<51:51, 1.91it/s, loss=0.0155, lr=1.81e-05, step=4053] Training: 41%|████ | 4054/10000 [55:52<51:51, 1.91it/s, loss=0.0107, lr=1.81e-05, step=4054] Training: 41%|████ | 4055/10000 [55:53<51:36, 1.92it/s, loss=0.0107, lr=1.81e-05, step=4054] Training: 41%|████ | 4055/10000 [55:53<51:36, 1.92it/s, loss=0.0111, lr=1.81e-05, step=4055] Training: 41%|████ | 4056/10000 [55:53<54:41, 1.81it/s, loss=0.0111, lr=1.81e-05, step=4055] Training: 41%|████ | 4056/10000 [55:53<54:41, 1.81it/s, loss=0.0119, lr=1.81e-05, step=4056] Training: 41%|████ | 4057/10000 [55:54<59:29, 1.66it/s, loss=0.0119, lr=1.81e-05, step=4056] Training: 41%|████ | 4057/10000 [55:54<59:29, 1.66it/s, loss=0.0109, lr=1.81e-05, step=4057] Training: 41%|████ | 4058/10000 [55:55<56:09, 1.76it/s, loss=0.0109, lr=1.81e-05, step=4057] Training: 41%|████ | 4058/10000 [55:55<56:09, 1.76it/s, loss=0.0074, lr=1.81e-05, step=4058] Training: 41%|████ | 4059/10000 [55:55<53:48, 1.84it/s, loss=0.0074, lr=1.81e-05, step=4058] Training: 41%|████ | 4059/10000 [55:55<53:48, 1.84it/s, loss=0.0314, lr=1.81e-05, step=4059]19:40:27.869 [I] step=4060 loss=0.0168 smoothed_loss=0.0158 lr=1.81e-05 grad_norm=0.3952 step_time=0.4687s data_time=0.0658s it/s=1.871 eta_to_10000=3174.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0100 grad_action_out_proj_arms=0.1242 grad_arm_token_fuse=0.0502 grad_shared_expert=0.4295 (18633:train_pytorch.py:850) + Training: 41%|████ | 4060/10000 [55:56<53:25, 1.85it/s, loss=0.0314, lr=1.81e-05, step=4059] Training: 41%|████ | 4060/10000 [55:56<53:25, 1.85it/s, loss=0.0168, lr=1.81e-05, step=4060] Training: 41%|████ | 4061/10000 [55:56<52:16, 1.89it/s, loss=0.0168, lr=1.81e-05, step=4060] Training: 41%|████ | 4061/10000 [55:56<52:16, 1.89it/s, loss=0.0069, lr=1.81e-05, step=4061] Training: 41%|████ | 4062/10000 [55:57<52:01, 1.90it/s, loss=0.0069, lr=1.81e-05, step=4061] Training: 41%|████ | 4062/10000 [55:57<52:01, 1.90it/s, loss=0.0429, lr=1.81e-05, step=4062] Training: 41%|████ | 4063/10000 [55:57<52:24, 1.89it/s, loss=0.0429, lr=1.81e-05, step=4062] Training: 41%|████ | 4063/10000 [55:57<52:24, 1.89it/s, loss=0.0126, lr=1.81e-05, step=4063] Training: 41%|████ | 4064/10000 [55:58<57:47, 1.71it/s, loss=0.0126, lr=1.81e-05, step=4063] Training: 41%|████ | 4064/10000 [55:58<57:47, 1.71it/s, loss=0.0116, lr=1.81e-05, step=4064] Training: 41%|████ | 4065/10000 [55:59<1:06:45, 1.48it/s, loss=0.0116, lr=1.81e-05, step=4064] Training: 41%|████ | 4065/10000 [55:59<1:06:45, 1.48it/s, loss=0.0188, lr=1.81e-05, step=4065] Training: 41%|████ | 4066/10000 [55:59<1:01:31, 1.61it/s, loss=0.0188, lr=1.81e-05, step=4065] Training: 41%|████ | 4066/10000 [55:59<1:01:31, 1.61it/s, loss=0.0353, lr=1.80e-05, step=4066] Training: 41%|████ | 4067/10000 [56:00<57:30, 1.72it/s, loss=0.0353, lr=1.80e-05, step=4066] Training: 41%|████ | 4067/10000 [56:00<57:30, 1.72it/s, loss=0.0041, lr=1.80e-05, step=4067] Training: 41%|████ | 4068/10000 [56:00<54:58, 1.80it/s, loss=0.0041, lr=1.80e-05, step=4067] Training: 41%|████ | 4068/10000 [56:00<54:58, 1.80it/s, loss=0.0298, lr=1.80e-05, step=4068] Training: 41%|████ | 4069/10000 [56:01<59:50, 1.65it/s, loss=0.0298, lr=1.80e-05, step=4068] Training: 41%|████ | 4069/10000 [56:01<59:50, 1.65it/s, loss=0.0083, lr=1.80e-05, step=4069]19:40:33.759 [I] step=4070 loss=0.0108 smoothed_loss=0.0168 lr=1.80e-05 grad_norm=0.4887 step_time=0.5069s data_time=0.0821s it/s=1.698 eta_to_10000=3492.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0165 grad_action_out_proj_arms=0.1459 grad_arm_token_fuse=0.0859 grad_shared_expert=0.3483 (18633:train_pytorch.py:850) + Training: 41%|████ | 4070/10000 [56:01<57:33, 1.72it/s, loss=0.0083, lr=1.80e-05, step=4069] Training: 41%|████ | 4070/10000 [56:01<57:33, 1.72it/s, loss=0.0108, lr=1.80e-05, step=4070] Training: 41%|████ | 4071/10000 [56:02<1:00:22, 1.64it/s, loss=0.0108, lr=1.80e-05, step=4070] Training: 41%|████ | 4071/10000 [56:02<1:00:22, 1.64it/s, loss=0.0073, lr=1.80e-05, step=4071] Training: 41%|████ | 4072/10000 [56:03<1:04:17, 1.54it/s, loss=0.0073, lr=1.80e-05, step=4071] Training: 41%|████ | 4072/10000 [56:03<1:04:17, 1.54it/s, loss=0.0223, lr=1.80e-05, step=4072] Training: 41%|████ | 4073/10000 [56:03<59:20, 1.66it/s, loss=0.0223, lr=1.80e-05, step=4072] Training: 41%|████ | 4073/10000 [56:03<59:20, 1.66it/s, loss=0.0162, lr=1.80e-05, step=4073] Training: 41%|████ | 4074/10000 [56:04<55:58, 1.76it/s, loss=0.0162, lr=1.80e-05, step=4073] Training: 41%|████ | 4074/10000 [56:04<55:58, 1.76it/s, loss=0.0197, lr=1.80e-05, step=4074] Training: 41%|████ | 4075/10000 [56:04<53:44, 1.84it/s, loss=0.0197, lr=1.80e-05, step=4074] Training: 41%|████ | 4075/10000 [56:04<53:44, 1.84it/s, loss=0.0074, lr=1.80e-05, step=4075] Training: 41%|████ | 4076/10000 [56:05<53:11, 1.86it/s, loss=0.0074, lr=1.80e-05, step=4075] Training: 41%|████ | 4076/10000 [56:05<53:11, 1.86it/s, loss=0.0332, lr=1.80e-05, step=4076] Training: 41%|████ | 4077/10000 [56:05<52:36, 1.88it/s, loss=0.0332, lr=1.80e-05, step=4076] Training: 41%|████ | 4077/10000 [56:05<52:36, 1.88it/s, loss=0.0130, lr=1.80e-05, step=4077] Training: 41%|████ | 4078/10000 [56:06<56:57, 1.73it/s, loss=0.0130, lr=1.80e-05, step=4077] Training: 41%|████ | 4078/10000 [56:06<56:57, 1.73it/s, loss=0.0234, lr=1.80e-05, step=4078] Training: 41%|████ | 4079/10000 [56:07<1:00:26, 1.63it/s, loss=0.0234, lr=1.80e-05, step=4078] Training: 41%|████ | 4079/10000 [56:07<1:00:26, 1.63it/s, loss=0.0120, lr=1.80e-05, step=4079]19:40:39.758 [I] step=4080 loss=0.0215 smoothed_loss=0.0176 lr=1.80e-05 grad_norm=0.5152 step_time=0.5356s data_time=0.0643s it/s=1.667 eta_to_10000=3551.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0050 grad_action_out_proj_arms=0.0684 grad_arm_token_fuse=0.0265 grad_shared_expert=0.2937 (18633:train_pytorch.py:850) + Training: 41%|████ | 4080/10000 [56:07<1:02:50, 1.57it/s, loss=0.0120, lr=1.80e-05, step=4079] Training: 41%|████ | 4080/10000 [56:07<1:02:50, 1.57it/s, loss=0.0215, lr=1.80e-05, step=4080] Training: 41%|████ | 4081/10000 [56:08<58:43, 1.68it/s, loss=0.0215, lr=1.80e-05, step=4080] Training: 41%|████ | 4081/10000 [56:08<58:43, 1.68it/s, loss=0.0092, lr=1.80e-05, step=4081] Training: 41%|████ | 4082/10000 [56:08<56:08, 1.76it/s, loss=0.0092, lr=1.80e-05, step=4081] Training: 41%|████ | 4082/10000 [56:08<56:08, 1.76it/s, loss=0.0066, lr=1.80e-05, step=4082] Training: 41%|████ | 4083/10000 [56:09<53:19, 1.85it/s, loss=0.0066, lr=1.80e-05, step=4082] Training: 41%|████ | 4083/10000 [56:09<53:19, 1.85it/s, loss=0.0115, lr=1.80e-05, step=4083] Training: 41%|████ | 4084/10000 [56:09<53:57, 1.83it/s, loss=0.0115, lr=1.80e-05, step=4083] Training: 41%|████ | 4084/10000 [56:09<53:57, 1.83it/s, loss=0.0062, lr=1.80e-05, step=4084] Training: 41%|████ | 4085/10000 [56:10<53:04, 1.86it/s, loss=0.0062, lr=1.80e-05, step=4084] Training: 41%|████ | 4085/10000 [56:10<53:04, 1.86it/s, loss=0.0123, lr=1.80e-05, step=4085] Training: 41%|████ | 4086/10000 [56:11<58:29, 1.69it/s, loss=0.0123, lr=1.80e-05, step=4085] Training: 41%|████ | 4086/10000 [56:11<58:29, 1.69it/s, loss=0.0202, lr=1.80e-05, step=4086] Training: 41%|████ | 4087/10000 [56:11<59:47, 1.65it/s, loss=0.0202, lr=1.80e-05, step=4086] Training: 41%|████ | 4087/10000 [56:11<59:47, 1.65it/s, loss=0.0138, lr=1.80e-05, step=4087] Training: 41%|████ | 4088/10000 [56:12<56:22, 1.75it/s, loss=0.0138, lr=1.80e-05, step=4087] Training: 41%|████ | 4088/10000 [56:12<56:22, 1.75it/s, loss=0.0126, lr=1.80e-05, step=4088] Training: 41%|████ | 4089/10000 [56:12<54:12, 1.82it/s, loss=0.0126, lr=1.80e-05, step=4088] Training: 41%|████ | 4089/10000 [56:12<54:12, 1.82it/s, loss=0.0021, lr=1.80e-05, step=4089]19:40:45.194 [I] step=4090 loss=0.0304 smoothed_loss=0.0150 lr=1.80e-05 grad_norm=0.5281 step_time=0.4770s data_time=0.0666s it/s=1.840 eta_to_10000=3211.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0103 grad_action_out_proj_arms=0.1361 grad_arm_token_fuse=0.0517 grad_shared_expert=0.5351 (18633:train_pytorch.py:850) + Training: 41%|████ | 4090/10000 [56:13<53:26, 1.84it/s, loss=0.0021, lr=1.80e-05, step=4089] Training: 41%|████ | 4090/10000 [56:13<53:26, 1.84it/s, loss=0.0304, lr=1.80e-05, step=4090] Training: 41%|████ | 4091/10000 [56:13<51:42, 1.90it/s, loss=0.0304, lr=1.80e-05, step=4090] Training: 41%|████ | 4091/10000 [56:13<51:42, 1.90it/s, loss=0.0085, lr=1.80e-05, step=4091] Training: 41%|████ | 4092/10000 [56:14<51:04, 1.93it/s, loss=0.0085, lr=1.80e-05, step=4091] Training: 41%|████ | 4092/10000 [56:14<51:04, 1.93it/s, loss=0.1281, lr=1.80e-05, step=4092] Training: 41%|████ | 4093/10000 [56:15<57:58, 1.70it/s, loss=0.1281, lr=1.80e-05, step=4092] Training: 41%|████ | 4093/10000 [56:15<57:58, 1.70it/s, loss=0.0203, lr=1.80e-05, step=4093] Training: 41%|████ | 4094/10000 [56:15<59:48, 1.65it/s, loss=0.0203, lr=1.80e-05, step=4093] Training: 41%|████ | 4094/10000 [56:15<59:48, 1.65it/s, loss=0.0671, lr=1.80e-05, step=4094] Training: 41%|████ | 4095/10000 [56:16<56:21, 1.75it/s, loss=0.0671, lr=1.80e-05, step=4094] Training: 41%|████ | 4095/10000 [56:16<56:21, 1.75it/s, loss=0.0129, lr=1.79e-05, step=4095] Training: 41%|████ | 4096/10000 [56:16<55:43, 1.77it/s, loss=0.0129, lr=1.79e-05, step=4095] Training: 41%|████ | 4096/10000 [56:16<55:43, 1.77it/s, loss=0.0105, lr=1.79e-05, step=4096] Training: 41%|████ | 4097/10000 [56:17<54:03, 1.82it/s, loss=0.0105, lr=1.79e-05, step=4096] Training: 41%|████ | 4097/10000 [56:17<54:03, 1.82it/s, loss=0.0045, lr=1.79e-05, step=4097] Training: 41%|████ | 4098/10000 [56:17<56:40, 1.74it/s, loss=0.0045, lr=1.79e-05, step=4097] Training: 41%|████ | 4098/10000 [56:17<56:40, 1.74it/s, loss=0.0060, lr=1.79e-05, step=4098] Training: 41%|████ | 4099/10000 [56:18<53:59, 1.82it/s, loss=0.0060, lr=1.79e-05, step=4098] Training: 41%|████ | 4099/10000 [56:18<53:59, 1.82it/s, loss=0.0403, lr=1.79e-05, step=4099]19:40:51.018 [I] step=4100 loss=0.0085 smoothed_loss=0.0224 lr=1.79e-05 grad_norm=0.4852 step_time=0.5070s data_time=0.0754s it/s=1.717 eta_to_10000=3435.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0129 grad_action_out_proj_arms=0.1208 grad_arm_token_fuse=0.0643 grad_shared_expert=0.4796 (18633:train_pytorch.py:850) + Training: 41%|████ | 4100/10000 [56:19<1:00:04, 1.64it/s, loss=0.0403, lr=1.79e-05, step=4099] Training: 41%|████ | 4100/10000 [56:19<1:00:04, 1.64it/s, loss=0.0085, lr=1.79e-05, step=4100] Training: 41%|████ | 4101/10000 [56:19<56:30, 1.74it/s, loss=0.0085, lr=1.79e-05, step=4100] Training: 41%|████ | 4101/10000 [56:19<56:30, 1.74it/s, loss=0.0023, lr=1.79e-05, step=4101] Training: 41%|████ | 4102/10000 [56:20<59:13, 1.66it/s, loss=0.0023, lr=1.79e-05, step=4101] Training: 41%|████ | 4102/10000 [56:20<59:13, 1.66it/s, loss=0.0023, lr=1.79e-05, step=4102] Training: 41%|████ | 4103/10000 [56:20<57:07, 1.72it/s, loss=0.0023, lr=1.79e-05, step=4102] Training: 41%|████ | 4103/10000 [56:20<57:07, 1.72it/s, loss=0.0072, lr=1.79e-05, step=4103] Training: 41%|████ | 4104/10000 [56:21<55:40, 1.77it/s, loss=0.0072, lr=1.79e-05, step=4103] Training: 41%|████ | 4104/10000 [56:21<55:40, 1.77it/s, loss=0.0058, lr=1.79e-05, step=4104] Training: 41%|████ | 4105/10000 [56:21<55:46, 1.76it/s, loss=0.0058, lr=1.79e-05, step=4104] Training: 41%|████ | 4105/10000 [56:21<55:46, 1.76it/s, loss=0.0106, lr=1.79e-05, step=4105] Training: 41%|████ | 4106/10000 [56:22<54:55, 1.79it/s, loss=0.0106, lr=1.79e-05, step=4105] Training: 41%|████ | 4106/10000 [56:22<54:55, 1.79it/s, loss=0.0226, lr=1.79e-05, step=4106] Training: 41%|████ | 4107/10000 [56:23<59:41, 1.65it/s, loss=0.0226, lr=1.79e-05, step=4106] Training: 41%|████ | 4107/10000 [56:23<59:41, 1.65it/s, loss=0.0187, lr=1.79e-05, step=4107] Training: 41%|████ | 4108/10000 [56:23<1:01:35, 1.59it/s, loss=0.0187, lr=1.79e-05, step=4107] Training: 41%|████ | 4108/10000 [56:23<1:01:35, 1.59it/s, loss=0.0321, lr=1.79e-05, step=4108] Training: 41%|████ | 4109/10000 [56:24<1:03:20, 1.55it/s, loss=0.0321, lr=1.79e-05, step=4108] Training: 41%|████ | 4109/10000 [56:24<1:03:20, 1.55it/s, loss=0.0127, lr=1.79e-05, step=4109]19:40:56.992 [I] step=4110 loss=0.0145 smoothed_loss=0.0173 lr=1.79e-05 grad_norm=0.4305 step_time=0.5255s data_time=0.0719s it/s=1.674 eta_to_10000=3518.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0128 grad_action_out_proj_arms=0.1644 grad_arm_token_fuse=0.0694 grad_shared_expert=0.3924 (18633:train_pytorch.py:850) + Training: 41%|████ | 4110/10000 [56:25<1:00:53, 1.61it/s, loss=0.0127, lr=1.79e-05, step=4109] Training: 41%|████ | 4110/10000 [56:25<1:00:53, 1.61it/s, loss=0.0145, lr=1.79e-05, step=4110] Training: 41%|████ | 4111/10000 [56:25<57:53, 1.70it/s, loss=0.0145, lr=1.79e-05, step=4110] Training: 41%|████ | 4111/10000 [56:25<57:53, 1.70it/s, loss=0.0184, lr=1.79e-05, step=4111] Training: 41%|████ | 4112/10000 [56:26<55:18, 1.77it/s, loss=0.0184, lr=1.79e-05, step=4111] Training: 41%|████ | 4112/10000 [56:26<55:18, 1.77it/s, loss=0.0337, lr=1.79e-05, step=4112] Training: 41%|████ | 4113/10000 [56:26<52:59, 1.85it/s, loss=0.0337, lr=1.79e-05, step=4112] Training: 41%|████ | 4113/10000 [56:26<52:59, 1.85it/s, loss=0.0088, lr=1.79e-05, step=4113] Training: 41%|████ | 4114/10000 [56:27<56:34, 1.73it/s, loss=0.0088, lr=1.79e-05, step=4113] Training: 41%|████ | 4114/10000 [56:27<56:34, 1.73it/s, loss=0.0047, lr=1.79e-05, step=4114] Training: 41%|████ | 4115/10000 [56:28<1:02:57, 1.56it/s, loss=0.0047, lr=1.79e-05, step=4114] Training: 41%|████ | 4115/10000 [56:28<1:02:57, 1.56it/s, loss=0.0435, lr=1.79e-05, step=4115] Training: 41%|████ | 4116/10000 [56:28<57:58, 1.69it/s, loss=0.0435, lr=1.79e-05, step=4115] Training: 41%|████ | 4116/10000 [56:28<57:58, 1.69it/s, loss=0.0355, lr=1.79e-05, step=4116] Training: 41%|████ | 4117/10000 [56:29<55:39, 1.76it/s, loss=0.0355, lr=1.79e-05, step=4116] Training: 41%|████ | 4117/10000 [56:29<55:39, 1.76it/s, loss=0.0162, lr=1.79e-05, step=4117] Training: 41%|████ | 4118/10000 [56:29<53:16, 1.84it/s, loss=0.0162, lr=1.79e-05, step=4117] Training: 41%|████ | 4118/10000 [56:29<53:16, 1.84it/s, loss=0.0283, lr=1.79e-05, step=4118] Training: 41%|████ | 4119/10000 [56:30<51:43, 1.90it/s, loss=0.0283, lr=1.79e-05, step=4118] Training: 41%|████ | 4119/10000 [56:30<51:43, 1.90it/s, loss=0.0447, lr=1.79e-05, step=4119]19:41:02.445 [I] step=4120 loss=0.0097 smoothed_loss=0.0222 lr=1.79e-05 grad_norm=0.5813 step_time=0.4833s data_time=0.0620s it/s=1.834 eta_to_10000=3205.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1176 grad_arm_token_fuse=0.0468 grad_shared_expert=0.2835 (18633:train_pytorch.py:850) + Training: 41%|████ | 4120/10000 [56:30<51:41, 1.90it/s, loss=0.0447, lr=1.79e-05, step=4119] Training: 41%|████ | 4120/10000 [56:30<51:41, 1.90it/s, loss=0.0097, lr=1.79e-05, step=4120] Training: 41%|████ | 4121/10000 [56:31<55:55, 1.75it/s, loss=0.0097, lr=1.79e-05, step=4120] Training: 41%|████ | 4121/10000 [56:31<55:55, 1.75it/s, loss=0.0106, lr=1.79e-05, step=4121] Training: 41%|████ | 4122/10000 [56:32<1:01:13, 1.60it/s, loss=0.0106, lr=1.79e-05, step=4121] Training: 41%|████ | 4122/10000 [56:32<1:01:13, 1.60it/s, loss=0.0070, lr=1.79e-05, step=4122] Training: 41%|████ | 4123/10000 [56:32<1:01:55, 1.58it/s, loss=0.0070, lr=1.79e-05, step=4122] Training: 41%|████ | 4123/10000 [56:32<1:01:55, 1.58it/s, loss=0.0298, lr=1.78e-05, step=4123] Training: 41%|████ | 4124/10000 [56:33<57:30, 1.70it/s, loss=0.0298, lr=1.78e-05, step=4123] Training: 41%|████ | 4124/10000 [56:33<57:30, 1.70it/s, loss=0.0185, lr=1.78e-05, step=4124] Training: 41%|████▏ | 4125/10000 [56:33<55:18, 1.77it/s, loss=0.0185, lr=1.78e-05, step=4124] Training: 41%|████▏ | 4125/10000 [56:33<55:18, 1.77it/s, loss=0.0117, lr=1.78e-05, step=4125] Training: 41%|████▏ | 4126/10000 [56:34<53:29, 1.83it/s, loss=0.0117, lr=1.78e-05, step=4125] Training: 41%|████▏ | 4126/10000 [56:34<53:29, 1.83it/s, loss=0.0135, lr=1.78e-05, step=4126] Training: 41%|████▏ | 4127/10000 [56:34<52:42, 1.86it/s, loss=0.0135, lr=1.78e-05, step=4126] Training: 41%|████▏ | 4127/10000 [56:34<52:42, 1.86it/s, loss=0.0221, lr=1.78e-05, step=4127] Training: 41%|████▏ | 4128/10000 [56:35<57:04, 1.71it/s, loss=0.0221, lr=1.78e-05, step=4127] Training: 41%|████▏ | 4128/10000 [56:35<57:04, 1.71it/s, loss=0.0066, lr=1.78e-05, step=4128] Training: 41%|████▏ | 4129/10000 [56:36<1:01:03, 1.60it/s, loss=0.0066, lr=1.78e-05, step=4128] Training: 41%|████▏ | 4129/10000 [56:36<1:01:03, 1.60it/s, loss=0.0087, lr=1.78e-05, step=4129]19:41:08.658 [I] step=4130 loss=0.0168 smoothed_loss=0.0171 lr=1.78e-05 grad_norm=0.4504 step_time=0.5585s data_time=0.0628s it/s=1.610 eta_to_10000=3646.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0136 grad_action_out_proj_arms=0.1207 grad_arm_token_fuse=0.0652 grad_shared_expert=0.3941 (18633:train_pytorch.py:850) + Training: 41%|████▏ | 4130/10000 [56:36<1:03:42, 1.54it/s, loss=0.0087, lr=1.78e-05, step=4129] Training: 41%|████▏ | 4130/10000 [56:36<1:03:42, 1.54it/s, loss=0.0168, lr=1.78e-05, step=4130] Training: 41%|████▏ | 4131/10000 [56:37<58:38, 1.67it/s, loss=0.0168, lr=1.78e-05, step=4130] Training: 41%|████▏ | 4131/10000 [56:37<58:38, 1.67it/s, loss=0.0089, lr=1.78e-05, step=4131] Training: 41%|████▏ | 4132/10000 [56:37<55:30, 1.76it/s, loss=0.0089, lr=1.78e-05, step=4131] Training: 41%|████▏ | 4132/10000 [56:37<55:30, 1.76it/s, loss=0.0029, lr=1.78e-05, step=4132] Training: 41%|████▏ | 4133/10000 [56:38<53:40, 1.82it/s, loss=0.0029, lr=1.78e-05, step=4132] Training: 41%|████▏ | 4133/10000 [56:38<53:40, 1.82it/s, loss=0.0339, lr=1.78e-05, step=4133] Training: 41%|████▏ | 4134/10000 [56:38<52:03, 1.88it/s, loss=0.0339, lr=1.78e-05, step=4133] Training: 41%|████▏ | 4134/10000 [56:38<52:03, 1.88it/s, loss=0.0194, lr=1.78e-05, step=4134] Training: 41%|████▏ | 4135/10000 [56:39<51:14, 1.91it/s, loss=0.0194, lr=1.78e-05, step=4134] Training: 41%|████▏ | 4135/10000 [56:39<51:14, 1.91it/s, loss=0.0320, lr=1.78e-05, step=4135] Training: 41%|████▏ | 4136/10000 [56:40<57:57, 1.69it/s, loss=0.0320, lr=1.78e-05, step=4135] Training: 41%|████▏ | 4136/10000 [56:40<57:57, 1.69it/s, loss=0.0131, lr=1.78e-05, step=4136] Training: 41%|████▏ | 4137/10000 [56:40<55:02, 1.78it/s, loss=0.0131, lr=1.78e-05, step=4136] Training: 41%|████▏ | 4137/10000 [56:40<55:02, 1.78it/s, loss=0.0118, lr=1.78e-05, step=4137] Training: 41%|████▏ | 4138/10000 [56:41<57:18, 1.70it/s, loss=0.0118, lr=1.78e-05, step=4137] Training: 41%|████▏ | 4138/10000 [56:41<57:18, 1.70it/s, loss=0.0124, lr=1.78e-05, step=4138] Training: 41%|████▏ | 4139/10000 [56:41<54:22, 1.80it/s, loss=0.0124, lr=1.78e-05, step=4138] Training: 41%|████▏ | 4139/10000 [56:41<54:22, 1.80it/s, loss=0.0057, lr=1.78e-05, step=4139]19:41:14.040 [I] step=4140 loss=0.0357 smoothed_loss=0.0178 lr=1.78e-05 grad_norm=0.4664 step_time=0.4752s data_time=0.0630s it/s=1.858 eta_to_10000=3153.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0280 grad_action_out_proj_arms=0.2227 grad_arm_token_fuse=0.1651 grad_shared_expert=0.5636 (18633:train_pytorch.py:850) + Training: 41%|████▏ | 4140/10000 [56:42<53:37, 1.82it/s, loss=0.0057, lr=1.78e-05, step=4139] Training: 41%|████▏ | 4140/10000 [56:42<53:37, 1.82it/s, loss=0.0357, lr=1.78e-05, step=4140] Training: 41%|████▏ | 4141/10000 [56:42<52:34, 1.86it/s, loss=0.0357, lr=1.78e-05, step=4140] Training: 41%|████▏ | 4141/10000 [56:42<52:34, 1.86it/s, loss=0.0218, lr=1.78e-05, step=4141] Training: 41%|████▏ | 4142/10000 [56:43<51:58, 1.88it/s, loss=0.0218, lr=1.78e-05, step=4141] Training: 41%|████▏ | 4142/10000 [56:43<51:58, 1.88it/s, loss=0.0171, lr=1.78e-05, step=4142] Training: 41%|████▏ | 4143/10000 [56:43<58:06, 1.68it/s, loss=0.0171, lr=1.78e-05, step=4142] Training: 41%|████▏ | 4143/10000 [56:43<58:06, 1.68it/s, loss=0.0105, lr=1.78e-05, step=4143] Training: 41%|████▏ | 4144/10000 [56:44<54:56, 1.78it/s, loss=0.0105, lr=1.78e-05, step=4143] Training: 41%|████▏ | 4144/10000 [56:44<54:56, 1.78it/s, loss=0.0064, lr=1.78e-05, step=4144] Training: 41%|████▏ | 4145/10000 [56:45<56:43, 1.72it/s, loss=0.0064, lr=1.78e-05, step=4144] Training: 41%|████▏ | 4145/10000 [56:45<56:43, 1.72it/s, loss=0.0064, lr=1.78e-05, step=4145] Training: 41%|████▏ | 4146/10000 [56:45<53:29, 1.82it/s, loss=0.0064, lr=1.78e-05, step=4145] Training: 41%|████▏ | 4146/10000 [56:45<53:29, 1.82it/s, loss=0.0126, lr=1.78e-05, step=4146] Training: 41%|████▏ | 4147/10000 [56:46<51:38, 1.89it/s, loss=0.0126, lr=1.78e-05, step=4146] Training: 41%|████▏ | 4147/10000 [56:46<51:38, 1.89it/s, loss=0.0028, lr=1.78e-05, step=4147] Training: 41%|████▏ | 4148/10000 [56:46<50:10, 1.94it/s, loss=0.0028, lr=1.78e-05, step=4147] Training: 41%|████▏ | 4148/10000 [56:46<50:10, 1.94it/s, loss=0.0124, lr=1.78e-05, step=4148] Training: 41%|████▏ | 4149/10000 [56:47<50:56, 1.91it/s, loss=0.0124, lr=1.78e-05, step=4148] Training: 41%|████▏ | 4149/10000 [56:47<50:56, 1.91it/s, loss=0.0170, lr=1.78e-05, step=4149]19:41:19.608 [I] step=4150 loss=0.0228 smoothed_loss=0.0148 lr=1.78e-05 grad_norm=0.4062 step_time=0.4935s data_time=0.0633s it/s=1.796 eta_to_10000=3256.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0189 grad_action_out_proj_arms=0.1047 grad_arm_token_fuse=0.0989 grad_shared_expert=0.3410 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4150/10000 [56:47<56:17, 1.73it/s, loss=0.0170, lr=1.78e-05, step=4149] Training: 42%|████▏ | 4150/10000 [56:47<56:17, 1.73it/s, loss=0.0228, lr=1.78e-05, step=4150] Training: 42%|████▏ | 4151/10000 [56:48<53:56, 1.81it/s, loss=0.0228, lr=1.78e-05, step=4150] Training: 42%|████▏ | 4151/10000 [56:48<53:56, 1.81it/s, loss=0.0112, lr=1.78e-05, step=4151] Training: 42%|████▏ | 4152/10000 [56:48<52:27, 1.86it/s, loss=0.0112, lr=1.78e-05, step=4151] Training: 42%|████▏ | 4152/10000 [56:48<52:27, 1.86it/s, loss=0.0185, lr=1.77e-05, step=4152] Training: 42%|████▏ | 4153/10000 [56:49<55:24, 1.76it/s, loss=0.0185, lr=1.77e-05, step=4152] Training: 42%|████▏ | 4153/10000 [56:49<55:24, 1.76it/s, loss=0.0075, lr=1.77e-05, step=4153] Training: 42%|████▏ | 4154/10000 [56:49<53:12, 1.83it/s, loss=0.0075, lr=1.77e-05, step=4153] Training: 42%|████▏ | 4154/10000 [56:49<53:12, 1.83it/s, loss=0.0448, lr=1.77e-05, step=4154] Training: 42%|████▏ | 4155/10000 [56:50<51:31, 1.89it/s, loss=0.0448, lr=1.77e-05, step=4154] Training: 42%|████▏ | 4155/10000 [56:50<51:31, 1.89it/s, loss=0.0038, lr=1.77e-05, step=4155] Training: 42%|████▏ | 4156/10000 [56:50<50:26, 1.93it/s, loss=0.0038, lr=1.77e-05, step=4155] Training: 42%|████▏ | 4156/10000 [56:50<50:26, 1.93it/s, loss=0.0192, lr=1.77e-05, step=4156] Training: 42%|████▏ | 4157/10000 [56:51<56:44, 1.72it/s, loss=0.0192, lr=1.77e-05, step=4156] Training: 42%|████▏ | 4157/10000 [56:51<56:44, 1.72it/s, loss=0.0102, lr=1.77e-05, step=4157] Training: 42%|████▏ | 4158/10000 [56:52<56:05, 1.74it/s, loss=0.0102, lr=1.77e-05, step=4157] Training: 42%|████▏ | 4158/10000 [56:52<56:05, 1.74it/s, loss=0.0540, lr=1.77e-05, step=4158] Training: 42%|████▏ | 4159/10000 [56:52<53:47, 1.81it/s, loss=0.0540, lr=1.77e-05, step=4158] Training: 42%|████▏ | 4159/10000 [56:52<53:47, 1.81it/s, loss=0.0715, lr=1.77e-05, step=4159]19:41:25.214 [I] step=4160 loss=0.0183 smoothed_loss=0.0240 lr=1.77e-05 grad_norm=0.4834 step_time=0.4909s data_time=0.0697s it/s=1.784 eta_to_10000=3273.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1403 grad_arm_token_fuse=0.0638 grad_shared_expert=0.5958 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4160/10000 [56:53<58:03, 1.68it/s, loss=0.0715, lr=1.77e-05, step=4159] Training: 42%|████▏ | 4160/10000 [56:53<58:03, 1.68it/s, loss=0.0183, lr=1.77e-05, step=4160] Training: 42%|████▏ | 4161/10000 [56:53<55:25, 1.76it/s, loss=0.0183, lr=1.77e-05, step=4160] Training: 42%|████▏ | 4161/10000 [56:53<55:25, 1.76it/s, loss=0.0179, lr=1.77e-05, step=4161] Training: 42%|████▏ | 4162/10000 [56:54<53:48, 1.81it/s, loss=0.0179, lr=1.77e-05, step=4161] Training: 42%|████▏ | 4162/10000 [56:54<53:48, 1.81it/s, loss=0.0969, lr=1.77e-05, step=4162] Training: 42%|████▏ | 4163/10000 [56:54<51:52, 1.88it/s, loss=0.0969, lr=1.77e-05, step=4162] Training: 42%|████▏ | 4163/10000 [56:54<51:52, 1.88it/s, loss=0.0134, lr=1.77e-05, step=4163] Training: 42%|████▏ | 4164/10000 [56:55<56:16, 1.73it/s, loss=0.0134, lr=1.77e-05, step=4163] Training: 42%|████▏ | 4164/10000 [56:55<56:16, 1.73it/s, loss=0.0540, lr=1.77e-05, step=4164] Training: 42%|████▏ | 4165/10000 [56:56<1:02:07, 1.57it/s, loss=0.0540, lr=1.77e-05, step=4164] Training: 42%|████▏ | 4165/10000 [56:56<1:02:07, 1.57it/s, loss=0.0104, lr=1.77e-05, step=4165] Training: 42%|████▏ | 4166/10000 [56:56<59:19, 1.64it/s, loss=0.0104, lr=1.77e-05, step=4165] Training: 42%|████▏ | 4166/10000 [56:56<59:19, 1.64it/s, loss=0.0172, lr=1.77e-05, step=4166] Training: 42%|████▏ | 4167/10000 [56:57<58:29, 1.66it/s, loss=0.0172, lr=1.77e-05, step=4166] Training: 42%|████▏ | 4167/10000 [56:57<58:29, 1.66it/s, loss=0.0045, lr=1.77e-05, step=4167] Training: 42%|████▏ | 4168/10000 [56:58<1:00:01, 1.62it/s, loss=0.0045, lr=1.77e-05, step=4167] Training: 42%|████▏ | 4168/10000 [56:58<1:00:01, 1.62it/s, loss=0.0315, lr=1.77e-05, step=4168] Training: 42%|████▏ | 4169/10000 [56:58<56:50, 1.71it/s, loss=0.0315, lr=1.77e-05, step=4168] Training: 42%|████▏ | 4169/10000 [56:58<56:50, 1.71it/s, loss=0.0151, lr=1.77e-05, step=4169]19:41:30.994 [I] step=4170 loss=0.0342 smoothed_loss=0.0261 lr=1.77e-05 grad_norm=0.5859 step_time=0.5042s data_time=0.0738s it/s=1.730 eta_to_10000=3369.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0183 grad_action_out_proj_arms=0.1746 grad_arm_token_fuse=0.0911 grad_shared_expert=0.5511 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4170/10000 [56:59<54:56, 1.77it/s, loss=0.0151, lr=1.77e-05, step=4169] Training: 42%|████▏ | 4170/10000 [56:59<54:56, 1.77it/s, loss=0.0342, lr=1.77e-05, step=4170] Training: 42%|████▏ | 4171/10000 [56:59<58:06, 1.67it/s, loss=0.0342, lr=1.77e-05, step=4170] Training: 42%|████▏ | 4171/10000 [56:59<58:06, 1.67it/s, loss=0.0174, lr=1.77e-05, step=4171] Training: 42%|████▏ | 4172/10000 [57:00<1:01:54, 1.57it/s, loss=0.0174, lr=1.77e-05, step=4171] Training: 42%|████▏ | 4172/10000 [57:00<1:01:54, 1.57it/s, loss=0.0213, lr=1.77e-05, step=4172] Training: 42%|████▏ | 4173/10000 [57:01<58:18, 1.67it/s, loss=0.0213, lr=1.77e-05, step=4172] Training: 42%|████▏ | 4173/10000 [57:01<58:18, 1.67it/s, loss=0.0065, lr=1.77e-05, step=4173] Training: 42%|████▏ | 4174/10000 [57:01<55:21, 1.75it/s, loss=0.0065, lr=1.77e-05, step=4173] Training: 42%|████▏ | 4174/10000 [57:01<55:21, 1.75it/s, loss=0.0210, lr=1.77e-05, step=4174] Training: 42%|████▏ | 4175/10000 [57:02<1:01:19, 1.58it/s, loss=0.0210, lr=1.77e-05, step=4174] Training: 42%|████▏ | 4175/10000 [57:02<1:01:19, 1.58it/s, loss=0.0062, lr=1.77e-05, step=4175] Training: 42%|████▏ | 4176/10000 [57:03<1:03:37, 1.53it/s, loss=0.0062, lr=1.77e-05, step=4175] Training: 42%|████▏ | 4176/10000 [57:03<1:03:37, 1.53it/s, loss=0.0351, lr=1.77e-05, step=4176] Training: 42%|████▏ | 4177/10000 [57:03<1:00:34, 1.60it/s, loss=0.0351, lr=1.77e-05, step=4176] Training: 42%|████▏ | 4177/10000 [57:03<1:00:34, 1.60it/s, loss=0.0116, lr=1.77e-05, step=4177] Training: 42%|████▏ | 4178/10000 [57:04<1:02:46, 1.55it/s, loss=0.0116, lr=1.77e-05, step=4177] Training: 42%|████▏ | 4178/10000 [57:04<1:02:46, 1.55it/s, loss=0.0445, lr=1.77e-05, step=4178] Training: 42%|████▏ | 4179/10000 [57:05<1:05:11, 1.49it/s, loss=0.0445, lr=1.77e-05, step=4178] Training: 42%|████▏ | 4179/10000 [57:05<1:05:11, 1.49it/s, loss=0.0072, lr=1.77e-05, step=4179]19:41:37.416 [I] step=4180 loss=0.0115 smoothed_loss=0.0210 lr=1.77e-05 grad_norm=0.5103 step_time=0.5544s data_time=0.0878s it/s=1.557 eta_to_10000=3736.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0118 grad_action_out_proj_arms=0.1494 grad_arm_token_fuse=0.0556 grad_shared_expert=0.5573 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4180/10000 [57:05<1:01:15, 1.58it/s, loss=0.0072, lr=1.77e-05, step=4179] Training: 42%|████▏ | 4180/10000 [57:05<1:01:15, 1.58it/s, loss=0.0115, lr=1.77e-05, step=4180] Training: 42%|████▏ | 4181/10000 [57:06<57:34, 1.68it/s, loss=0.0115, lr=1.77e-05, step=4180] Training: 42%|████▏ | 4181/10000 [57:06<57:34, 1.68it/s, loss=0.0371, lr=1.76e-05, step=4181] Training: 42%|████▏ | 4182/10000 [57:06<59:19, 1.63it/s, loss=0.0371, lr=1.76e-05, step=4181] Training: 42%|████▏ | 4182/10000 [57:06<59:19, 1.63it/s, loss=0.0253, lr=1.76e-05, step=4182] Training: 42%|████▏ | 4183/10000 [57:07<56:55, 1.70it/s, loss=0.0253, lr=1.76e-05, step=4182] Training: 42%|████▏ | 4183/10000 [57:07<56:55, 1.70it/s, loss=0.0189, lr=1.76e-05, step=4183] Training: 42%|████▏ | 4184/10000 [57:07<54:40, 1.77it/s, loss=0.0189, lr=1.76e-05, step=4183] Training: 42%|████▏ | 4184/10000 [57:07<54:40, 1.77it/s, loss=0.0092, lr=1.76e-05, step=4184] Training: 42%|████▏ | 4185/10000 [57:08<53:16, 1.82it/s, loss=0.0092, lr=1.76e-05, step=4184] Training: 42%|████▏ | 4185/10000 [57:08<53:16, 1.82it/s, loss=0.0063, lr=1.76e-05, step=4185] Training: 42%|████▏ | 4186/10000 [57:09<1:04:34, 1.50it/s, loss=0.0063, lr=1.76e-05, step=4185] Training: 42%|████▏ | 4186/10000 [57:09<1:04:34, 1.50it/s, loss=0.0153, lr=1.76e-05, step=4186] Training: 42%|████▏ | 4187/10000 [57:09<59:31, 1.63it/s, loss=0.0153, lr=1.76e-05, step=4186] Training: 42%|████▏ | 4187/10000 [57:09<59:31, 1.63it/s, loss=0.0103, lr=1.76e-05, step=4187] Training: 42%|████▏ | 4188/10000 [57:10<59:07, 1.64it/s, loss=0.0103, lr=1.76e-05, step=4187] Training: 42%|████▏ | 4188/10000 [57:10<59:07, 1.64it/s, loss=0.0025, lr=1.76e-05, step=4188] Training: 42%|████▏ | 4189/10000 [57:10<59:06, 1.64it/s, loss=0.0025, lr=1.76e-05, step=4188] Training: 42%|████▏ | 4189/10000 [57:10<59:06, 1.64it/s, loss=0.0096, lr=1.76e-05, step=4189]19:41:43.473 [I] step=4190 loss=0.0238 smoothed_loss=0.0168 lr=1.76e-05 grad_norm=0.4847 step_time=0.5208s data_time=0.0849s it/s=1.651 eta_to_10000=3518.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0110 grad_action_out_proj_arms=0.0975 grad_arm_token_fuse=0.0548 grad_shared_expert=0.4618 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4190/10000 [57:11<1:01:39, 1.57it/s, loss=0.0096, lr=1.76e-05, step=4189] Training: 42%|████▏ | 4190/10000 [57:11<1:01:39, 1.57it/s, loss=0.0238, lr=1.76e-05, step=4190] Training: 42%|████▏ | 4191/10000 [57:12<57:58, 1.67it/s, loss=0.0238, lr=1.76e-05, step=4190] Training: 42%|████▏ | 4191/10000 [57:12<57:58, 1.67it/s, loss=0.0067, lr=1.76e-05, step=4191] Training: 42%|████▏ | 4192/10000 [57:12<54:56, 1.76it/s, loss=0.0067, lr=1.76e-05, step=4191] Training: 42%|████▏ | 4192/10000 [57:12<54:56, 1.76it/s, loss=0.0331, lr=1.76e-05, step=4192] Training: 42%|████▏ | 4193/10000 [57:13<1:04:41, 1.50it/s, loss=0.0331, lr=1.76e-05, step=4192] Training: 42%|████▏ | 4193/10000 [57:13<1:04:41, 1.50it/s, loss=0.0393, lr=1.76e-05, step=4193] Training: 42%|████▏ | 4194/10000 [57:14<1:02:27, 1.55it/s, loss=0.0393, lr=1.76e-05, step=4193] Training: 42%|████▏ | 4194/10000 [57:14<1:02:27, 1.55it/s, loss=0.0135, lr=1.76e-05, step=4194] Training: 42%|████▏ | 4195/10000 [57:14<1:00:46, 1.59it/s, loss=0.0135, lr=1.76e-05, step=4194] Training: 42%|████▏ | 4195/10000 [57:14<1:00:46, 1.59it/s, loss=0.0130, lr=1.76e-05, step=4195] Training: 42%|████▏ | 4196/10000 [57:15<1:08:03, 1.42it/s, loss=0.0130, lr=1.76e-05, step=4195] Training: 42%|████▏ | 4196/10000 [57:15<1:08:03, 1.42it/s, loss=0.0099, lr=1.76e-05, step=4196] Training: 42%|████▏ | 4197/10000 [57:16<1:07:12, 1.44it/s, loss=0.0099, lr=1.76e-05, step=4196] Training: 42%|████▏ | 4197/10000 [57:16<1:07:12, 1.44it/s, loss=0.0249, lr=1.76e-05, step=4197] Training: 42%|████▏ | 4198/10000 [57:17<1:08:39, 1.41it/s, loss=0.0249, lr=1.76e-05, step=4197] Training: 42%|████▏ | 4198/10000 [57:17<1:08:39, 1.41it/s, loss=0.0051, lr=1.76e-05, step=4198] Training: 42%|████▏ | 4199/10000 [57:17<1:05:41, 1.47it/s, loss=0.0051, lr=1.76e-05, step=4198] Training: 42%|████▏ | 4199/10000 [57:17<1:05:41, 1.47it/s, loss=0.0028, lr=1.76e-05, step=4199]19:41:50.302 [I] step=4200 loss=0.0041 smoothed_loss=0.0145 lr=1.76e-05 grad_norm=0.4777 step_time=0.5769s data_time=0.1060s it/s=1.465 eta_to_10000=3960.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0213 grad_action_out_proj_arms=0.1693 grad_arm_token_fuse=0.1079 grad_shared_expert=0.5339 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4200/10000 [57:18<1:10:08, 1.38it/s, loss=0.0028, lr=1.76e-05, step=4199] Training: 42%|████▏ | 4200/10000 [57:18<1:10:08, 1.38it/s, loss=0.0041, lr=1.76e-05, step=4200] Training: 42%|████▏ | 4201/10000 [57:19<1:13:05, 1.32it/s, loss=0.0041, lr=1.76e-05, step=4200] Training: 42%|████▏ | 4201/10000 [57:19<1:13:05, 1.32it/s, loss=0.0221, lr=1.76e-05, step=4201] Training: 42%|████▏ | 4202/10000 [57:19<1:08:51, 1.40it/s, loss=0.0221, lr=1.76e-05, step=4201] Training: 42%|████▏ | 4202/10000 [57:19<1:08:51, 1.40it/s, loss=0.0079, lr=1.76e-05, step=4202] Training: 42%|████▏ | 4203/10000 [57:20<1:06:49, 1.45it/s, loss=0.0079, lr=1.76e-05, step=4202] Training: 42%|████▏ | 4203/10000 [57:20<1:06:49, 1.45it/s, loss=0.0319, lr=1.76e-05, step=4203] Training: 42%|████▏ | 4204/10000 [57:21<1:05:06, 1.48it/s, loss=0.0319, lr=1.76e-05, step=4203] Training: 42%|████▏ | 4204/10000 [57:21<1:05:06, 1.48it/s, loss=0.0127, lr=1.76e-05, step=4204] Training: 42%|████▏ | 4205/10000 [57:21<1:07:32, 1.43it/s, loss=0.0127, lr=1.76e-05, step=4204] Training: 42%|████▏ | 4205/10000 [57:21<1:07:32, 1.43it/s, loss=0.0143, lr=1.76e-05, step=4205] Training: 42%|████▏ | 4206/10000 [57:22<1:05:26, 1.48it/s, loss=0.0143, lr=1.76e-05, step=4205] Training: 42%|████▏ | 4206/10000 [57:22<1:05:26, 1.48it/s, loss=0.0030, lr=1.76e-05, step=4206] Training: 42%|████▏ | 4207/10000 [57:23<1:13:30, 1.31it/s, loss=0.0030, lr=1.76e-05, step=4206] Training: 42%|████▏ | 4207/10000 [57:23<1:13:30, 1.31it/s, loss=0.0101, lr=1.76e-05, step=4207] Training: 42%|████▏ | 4208/10000 [57:24<1:06:58, 1.44it/s, loss=0.0101, lr=1.76e-05, step=4207] Training: 42%|████▏ | 4208/10000 [57:24<1:06:58, 1.44it/s, loss=0.0249, lr=1.76e-05, step=4208] Training: 42%|████▏ | 4209/10000 [57:24<1:02:30, 1.54it/s, loss=0.0249, lr=1.76e-05, step=4208] Training: 42%|████▏ | 4209/10000 [57:24<1:02:30, 1.54it/s, loss=0.0264, lr=1.76e-05, step=4209]19:41:56.960 [I] step=4210 loss=0.0181 smoothed_loss=0.0164 lr=1.76e-05 grad_norm=0.4864 step_time=0.5632s data_time=0.1026s it/s=1.502 eta_to_10000=3854.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0096 grad_action_out_proj_arms=0.1011 grad_arm_token_fuse=0.0492 grad_shared_expert=0.4510 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4210/10000 [57:25<58:57, 1.64it/s, loss=0.0264, lr=1.76e-05, step=4209] Training: 42%|████▏ | 4210/10000 [57:25<58:57, 1.64it/s, loss=0.0181, lr=1.75e-05, step=4210] Training: 42%|████▏ | 4211/10000 [57:25<56:03, 1.72it/s, loss=0.0181, lr=1.75e-05, step=4210] Training: 42%|████▏ | 4211/10000 [57:25<56:03, 1.72it/s, loss=0.0279, lr=1.75e-05, step=4211] Training: 42%|████▏ | 4212/10000 [57:26<57:44, 1.67it/s, loss=0.0279, lr=1.75e-05, step=4211] Training: 42%|████▏ | 4212/10000 [57:26<57:44, 1.67it/s, loss=0.0167, lr=1.75e-05, step=4212] Training: 42%|████▏ | 4213/10000 [57:26<55:32, 1.74it/s, loss=0.0167, lr=1.75e-05, step=4212] Training: 42%|████▏ | 4213/10000 [57:26<55:32, 1.74it/s, loss=0.0084, lr=1.75e-05, step=4213] Training: 42%|████▏ | 4214/10000 [57:27<1:01:46, 1.56it/s, loss=0.0084, lr=1.75e-05, step=4213] Training: 42%|████▏ | 4214/10000 [57:27<1:01:46, 1.56it/s, loss=0.0106, lr=1.75e-05, step=4214] Training: 42%|████▏ | 4215/10000 [57:28<1:10:11, 1.37it/s, loss=0.0106, lr=1.75e-05, step=4214] Training: 42%|████▏ | 4215/10000 [57:28<1:10:11, 1.37it/s, loss=0.0340, lr=1.75e-05, step=4215] Training: 42%|████▏ | 4216/10000 [57:29<1:04:29, 1.49it/s, loss=0.0340, lr=1.75e-05, step=4215] Training: 42%|████▏ | 4216/10000 [57:29<1:04:29, 1.49it/s, loss=0.0070, lr=1.75e-05, step=4216] Training: 42%|████▏ | 4217/10000 [57:29<1:08:41, 1.40it/s, loss=0.0070, lr=1.75e-05, step=4216] Training: 42%|████▏ | 4217/10000 [57:29<1:08:41, 1.40it/s, loss=0.0101, lr=1.75e-05, step=4217] Training: 42%|████▏ | 4218/10000 [57:30<1:02:19, 1.55it/s, loss=0.0101, lr=1.75e-05, step=4217] Training: 42%|████▏ | 4218/10000 [57:30<1:02:19, 1.55it/s, loss=0.0364, lr=1.75e-05, step=4218] Training: 42%|████▏ | 4219/10000 [57:30<57:53, 1.66it/s, loss=0.0364, lr=1.75e-05, step=4218] Training: 42%|████▏ | 4219/10000 [57:30<57:53, 1.66it/s, loss=0.0086, lr=1.75e-05, step=4219]19:42:03.383 [I] step=4220 loss=0.0067 smoothed_loss=0.0161 lr=1.75e-05 grad_norm=0.4645 step_time=0.5478s data_time=0.0946s it/s=1.557 eta_to_10000=3712.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0169 grad_action_out_proj_arms=0.1524 grad_arm_token_fuse=0.0883 grad_shared_expert=0.4469 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4220/10000 [57:31<1:00:33, 1.59it/s, loss=0.0086, lr=1.75e-05, step=4219] Training: 42%|████▏ | 4220/10000 [57:31<1:00:33, 1.59it/s, loss=0.0067, lr=1.75e-05, step=4220] Training: 42%|████▏ | 4221/10000 [57:32<1:02:49, 1.53it/s, loss=0.0067, lr=1.75e-05, step=4220] Training: 42%|████▏ | 4221/10000 [57:32<1:02:49, 1.53it/s, loss=0.0032, lr=1.75e-05, step=4221] Training: 42%|████▏ | 4222/10000 [57:33<1:07:16, 1.43it/s, loss=0.0032, lr=1.75e-05, step=4221] Training: 42%|████▏ | 4222/10000 [57:33<1:07:16, 1.43it/s, loss=0.0062, lr=1.75e-05, step=4222] Training: 42%|████▏ | 4223/10000 [57:33<1:01:33, 1.56it/s, loss=0.0062, lr=1.75e-05, step=4222] Training: 42%|████▏ | 4223/10000 [57:33<1:01:33, 1.56it/s, loss=0.0076, lr=1.75e-05, step=4223] Training: 42%|████▏ | 4224/10000 [57:34<57:10, 1.68it/s, loss=0.0076, lr=1.75e-05, step=4223] Training: 42%|████▏ | 4224/10000 [57:34<57:10, 1.68it/s, loss=0.0079, lr=1.75e-05, step=4224] Training: 42%|████▏ | 4225/10000 [57:34<54:21, 1.77it/s, loss=0.0079, lr=1.75e-05, step=4224] Training: 42%|████▏ | 4225/10000 [57:34<54:21, 1.77it/s, loss=0.0035, lr=1.75e-05, step=4225] Training: 42%|████▏ | 4226/10000 [57:35<51:59, 1.85it/s, loss=0.0035, lr=1.75e-05, step=4225] Training: 42%|████▏ | 4226/10000 [57:35<51:59, 1.85it/s, loss=0.0413, lr=1.75e-05, step=4226] Training: 42%|████▏ | 4227/10000 [57:35<55:27, 1.73it/s, loss=0.0413, lr=1.75e-05, step=4226] Training: 42%|████▏ | 4227/10000 [57:35<55:27, 1.73it/s, loss=0.0261, lr=1.75e-05, step=4227] Training: 42%|████▏ | 4228/10000 [57:36<57:49, 1.66it/s, loss=0.0261, lr=1.75e-05, step=4227] Training: 42%|████▏ | 4228/10000 [57:36<57:49, 1.66it/s, loss=0.0116, lr=1.75e-05, step=4228] Training: 42%|████▏ | 4229/10000 [57:37<1:01:48, 1.56it/s, loss=0.0116, lr=1.75e-05, step=4228] Training: 42%|████▏ | 4229/10000 [57:37<1:01:48, 1.56it/s, loss=0.0261, lr=1.75e-05, step=4229]19:42:09.447 [I] step=4230 loss=0.0128 smoothed_loss=0.0162 lr=1.75e-05 grad_norm=0.4183 step_time=0.5403s data_time=0.0660s it/s=1.649 eta_to_10000=3498.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0164 grad_action_out_proj_arms=0.1538 grad_arm_token_fuse=0.0890 grad_shared_expert=0.3284 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4230/10000 [57:37<58:18, 1.65it/s, loss=0.0261, lr=1.75e-05, step=4229] Training: 42%|████▏ | 4230/10000 [57:37<58:18, 1.65it/s, loss=0.0128, lr=1.75e-05, step=4230] Training: 42%|████▏ | 4231/10000 [57:38<55:05, 1.75it/s, loss=0.0128, lr=1.75e-05, step=4230] Training: 42%|████▏ | 4231/10000 [57:38<55:05, 1.75it/s, loss=0.0615, lr=1.75e-05, step=4231] Training: 42%|████▏ | 4232/10000 [57:38<52:34, 1.83it/s, loss=0.0615, lr=1.75e-05, step=4231] Training: 42%|████▏ | 4232/10000 [57:38<52:34, 1.83it/s, loss=0.0142, lr=1.75e-05, step=4232] Training: 42%|████▏ | 4233/10000 [57:39<50:34, 1.90it/s, loss=0.0142, lr=1.75e-05, step=4232] Training: 42%|████▏ | 4233/10000 [57:39<50:34, 1.90it/s, loss=0.0139, lr=1.75e-05, step=4233] Training: 42%|████▏ | 4234/10000 [57:39<53:57, 1.78it/s, loss=0.0139, lr=1.75e-05, step=4233] Training: 42%|████▏ | 4234/10000 [57:39<53:57, 1.78it/s, loss=0.0111, lr=1.75e-05, step=4234] Training: 42%|████▏ | 4235/10000 [57:40<51:56, 1.85it/s, loss=0.0111, lr=1.75e-05, step=4234] Training: 42%|████▏ | 4235/10000 [57:40<51:56, 1.85it/s, loss=0.0084, lr=1.75e-05, step=4235] Training: 42%|████▏ | 4236/10000 [57:40<57:51, 1.66it/s, loss=0.0084, lr=1.75e-05, step=4235] Training: 42%|████▏ | 4236/10000 [57:40<57:51, 1.66it/s, loss=0.0261, lr=1.75e-05, step=4236] Training: 42%|████▏ | 4237/10000 [57:41<55:35, 1.73it/s, loss=0.0261, lr=1.75e-05, step=4236] Training: 42%|████▏ | 4237/10000 [57:41<55:35, 1.73it/s, loss=0.0058, lr=1.75e-05, step=4237] Training: 42%|████▏ | 4238/10000 [57:41<53:06, 1.81it/s, loss=0.0058, lr=1.75e-05, step=4237] Training: 42%|████▏ | 4238/10000 [57:41<53:06, 1.81it/s, loss=0.0064, lr=1.74e-05, step=4238] Training: 42%|████▏ | 4239/10000 [57:42<52:21, 1.83it/s, loss=0.0064, lr=1.74e-05, step=4238] Training: 42%|████▏ | 4239/10000 [57:42<52:21, 1.83it/s, loss=0.0150, lr=1.74e-05, step=4239]19:42:14.847 [I] step=4240 loss=0.0149 smoothed_loss=0.0159 lr=1.75e-05 grad_norm=0.4279 step_time=0.4737s data_time=0.0662s it/s=1.852 eta_to_10000=3109.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0303 grad_action_out_proj_arms=0.1964 grad_arm_token_fuse=0.1568 grad_shared_expert=0.4887 (18633:train_pytorch.py:850) + Training: 42%|████▏ | 4240/10000 [57:43<51:27, 1.87it/s, loss=0.0150, lr=1.74e-05, step=4239] Training: 42%|████▏ | 4240/10000 [57:43<51:27, 1.87it/s, loss=0.0149, lr=1.74e-05, step=4240] Training: 42%|████▏ | 4241/10000 [57:43<50:16, 1.91it/s, loss=0.0149, lr=1.74e-05, step=4240] Training: 42%|████▏ | 4241/10000 [57:43<50:16, 1.91it/s, loss=0.0077, lr=1.74e-05, step=4241] Training: 42%|████▏ | 4242/10000 [57:44<54:14, 1.77it/s, loss=0.0077, lr=1.74e-05, step=4241] Training: 42%|████▏ | 4242/10000 [57:44<54:14, 1.77it/s, loss=0.0117, lr=1.74e-05, step=4242] Training: 42%|████▏ | 4243/10000 [57:44<59:13, 1.62it/s, loss=0.0117, lr=1.74e-05, step=4242] Training: 42%|████▏ | 4243/10000 [57:44<59:13, 1.62it/s, loss=0.0118, lr=1.74e-05, step=4243] Training: 42%|████▏ | 4244/10000 [57:45<55:38, 1.72it/s, loss=0.0118, lr=1.74e-05, step=4243] Training: 42%|████▏ | 4244/10000 [57:45<55:38, 1.72it/s, loss=0.0092, lr=1.74e-05, step=4244] Training: 42%|████▏ | 4245/10000 [57:45<54:19, 1.77it/s, loss=0.0092, lr=1.74e-05, step=4244] Training: 42%|████▏ | 4245/10000 [57:45<54:19, 1.77it/s, loss=0.0164, lr=1.74e-05, step=4245] Training: 42%|████▏ | 4246/10000 [57:46<57:27, 1.67it/s, loss=0.0164, lr=1.74e-05, step=4245] Training: 42%|████▏ | 4246/10000 [57:46<57:27, 1.67it/s, loss=0.0118, lr=1.74e-05, step=4246] Training: 42%|████▏ | 4247/10000 [57:47<56:15, 1.70it/s, loss=0.0118, lr=1.74e-05, step=4246] Training: 42%|████▏ | 4247/10000 [57:47<56:15, 1.70it/s, loss=0.0085, lr=1.74e-05, step=4247] Training: 42%|████▏ | 4248/10000 [57:47<58:14, 1.65it/s, loss=0.0085, lr=1.74e-05, step=4247] Training: 42%|████▏ | 4248/10000 [57:47<58:14, 1.65it/s, loss=0.0224, lr=1.74e-05, step=4248] Training: 42%|████▏ | 4249/10000 [57:48<55:11, 1.74it/s, loss=0.0224, lr=1.74e-05, step=4248] Training: 42%|████▏ | 4249/10000 [57:48<55:11, 1.74it/s, loss=0.0102, lr=1.74e-05, step=4249]19:42:21.060 [I] step=4250 loss=0.0192 smoothed_loss=0.0144 lr=1.74e-05 grad_norm=0.4727 step_time=0.5377s data_time=0.0836s it/s=1.610 eta_to_10000=3572.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0080 grad_action_out_proj_arms=0.1208 grad_arm_token_fuse=0.0395 grad_shared_expert=0.4261 (18633:train_pytorch.py:850) + Training: 42%|████▎ | 4250/10000 [57:49<1:04:27, 1.49it/s, loss=0.0102, lr=1.74e-05, step=4249] Training: 42%|████▎ | 4250/10000 [57:49<1:04:27, 1.49it/s, loss=0.0192, lr=1.74e-05, step=4250] Training: 43%|████▎ | 4251/10000 [57:49<59:10, 1.62it/s, loss=0.0192, lr=1.74e-05, step=4250] Training: 43%|████▎ | 4251/10000 [57:49<59:10, 1.62it/s, loss=0.0112, lr=1.74e-05, step=4251] Training: 43%|████▎ | 4252/10000 [57:50<55:02, 1.74it/s, loss=0.0112, lr=1.74e-05, step=4251] Training: 43%|████▎ | 4252/10000 [57:50<55:02, 1.74it/s, loss=0.0198, lr=1.74e-05, step=4252] Training: 43%|████▎ | 4253/10000 [57:50<54:25, 1.76it/s, loss=0.0198, lr=1.74e-05, step=4252] Training: 43%|████▎ | 4253/10000 [57:50<54:25, 1.76it/s, loss=0.0056, lr=1.74e-05, step=4253] Training: 43%|████▎ | 4254/10000 [57:51<52:12, 1.83it/s, loss=0.0056, lr=1.74e-05, step=4253] Training: 43%|████▎ | 4254/10000 [57:51<52:12, 1.83it/s, loss=0.0298, lr=1.74e-05, step=4254] Training: 43%|████▎ | 4255/10000 [57:51<55:05, 1.74it/s, loss=0.0298, lr=1.74e-05, step=4254] Training: 43%|████▎ | 4255/10000 [57:51<55:05, 1.74it/s, loss=0.0339, lr=1.74e-05, step=4255] Training: 43%|████▎ | 4256/10000 [57:52<54:55, 1.74it/s, loss=0.0339, lr=1.74e-05, step=4255] Training: 43%|████▎ | 4256/10000 [57:52<54:55, 1.74it/s, loss=0.0082, lr=1.74e-05, step=4256] Training: 43%|████▎ | 4257/10000 [57:53<59:09, 1.62it/s, loss=0.0082, lr=1.74e-05, step=4256] Training: 43%|████▎ | 4257/10000 [57:53<59:09, 1.62it/s, loss=0.0067, lr=1.74e-05, step=4257] Training: 43%|████▎ | 4258/10000 [57:53<55:16, 1.73it/s, loss=0.0067, lr=1.74e-05, step=4257] Training: 43%|████▎ | 4258/10000 [57:53<55:16, 1.73it/s, loss=0.0118, lr=1.74e-05, step=4258] Training: 43%|████▎ | 4259/10000 [57:54<53:02, 1.80it/s, loss=0.0118, lr=1.74e-05, step=4258] Training: 43%|████▎ | 4259/10000 [57:54<53:02, 1.80it/s, loss=0.0119, lr=1.74e-05, step=4259]19:42:26.815 [I] step=4260 loss=0.0119 smoothed_loss=0.0144 lr=1.74e-05 grad_norm=0.4884 step_time=0.4945s data_time=0.0811s it/s=1.738 eta_to_10000=3303.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.0754 grad_arm_token_fuse=0.0440 grad_shared_expert=0.3808 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4260/10000 [57:54<1:00:51, 1.57it/s, loss=0.0119, lr=1.74e-05, step=4259] Training: 43%|████▎ | 4260/10000 [57:54<1:00:51, 1.57it/s, loss=0.0119, lr=1.74e-05, step=4260] Training: 43%|████▎ | 4261/10000 [57:55<56:49, 1.68it/s, loss=0.0119, lr=1.74e-05, step=4260] Training: 43%|████▎ | 4261/10000 [57:55<56:49, 1.68it/s, loss=0.0048, lr=1.74e-05, step=4261] Training: 43%|████▎ | 4262/10000 [57:56<58:08, 1.64it/s, loss=0.0048, lr=1.74e-05, step=4261] Training: 43%|████▎ | 4262/10000 [57:56<58:08, 1.64it/s, loss=0.0090, lr=1.74e-05, step=4262] Training: 43%|████▎ | 4263/10000 [57:56<55:04, 1.74it/s, loss=0.0090, lr=1.74e-05, step=4262] Training: 43%|████▎ | 4263/10000 [57:56<55:04, 1.74it/s, loss=0.0303, lr=1.74e-05, step=4263] Training: 43%|████▎ | 4264/10000 [57:57<58:07, 1.64it/s, loss=0.0303, lr=1.74e-05, step=4263] Training: 43%|████▎ | 4264/10000 [57:57<58:07, 1.64it/s, loss=0.0117, lr=1.74e-05, step=4264] Training: 43%|████▎ | 4265/10000 [57:58<1:01:15, 1.56it/s, loss=0.0117, lr=1.74e-05, step=4264] Training: 43%|████▎ | 4265/10000 [57:58<1:01:15, 1.56it/s, loss=0.0501, lr=1.74e-05, step=4265] Training: 43%|████▎ | 4266/10000 [57:58<1:03:24, 1.51it/s, loss=0.0501, lr=1.74e-05, step=4265] Training: 43%|████▎ | 4266/10000 [57:58<1:03:24, 1.51it/s, loss=0.0084, lr=1.74e-05, step=4266] Training: 43%|████▎ | 4267/10000 [57:59<1:01:07, 1.56it/s, loss=0.0084, lr=1.74e-05, step=4266] Training: 43%|████▎ | 4267/10000 [57:59<1:01:07, 1.56it/s, loss=0.0068, lr=1.73e-05, step=4267] Training: 43%|████▎ | 4268/10000 [57:59<57:49, 1.65it/s, loss=0.0068, lr=1.73e-05, step=4267] Training: 43%|████▎ | 4268/10000 [57:59<57:49, 1.65it/s, loss=0.0055, lr=1.73e-05, step=4268] Training: 43%|████▎ | 4269/10000 [58:00<58:42, 1.63it/s, loss=0.0055, lr=1.73e-05, step=4268] Training: 43%|████▎ | 4269/10000 [58:00<58:42, 1.63it/s, loss=0.0107, lr=1.73e-05, step=4269]19:42:32.849 [I] step=4270 loss=0.0155 smoothed_loss=0.0146 lr=1.74e-05 grad_norm=0.4907 step_time=0.5250s data_time=0.0785s it/s=1.658 eta_to_10000=3456.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0187 grad_action_out_proj_arms=0.1499 grad_arm_token_fuse=0.0909 grad_shared_expert=0.3667 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4270/10000 [58:01<56:23, 1.69it/s, loss=0.0107, lr=1.73e-05, step=4269] Training: 43%|████▎ | 4270/10000 [58:01<56:23, 1.69it/s, loss=0.0155, lr=1.73e-05, step=4270] Training: 43%|████▎ | 4271/10000 [58:01<1:03:36, 1.50it/s, loss=0.0155, lr=1.73e-05, step=4270] Training: 43%|████▎ | 4271/10000 [58:01<1:03:36, 1.50it/s, loss=0.0191, lr=1.73e-05, step=4271] Training: 43%|████▎ | 4272/10000 [58:02<1:04:58, 1.47it/s, loss=0.0191, lr=1.73e-05, step=4271] Training: 43%|████▎ | 4272/10000 [58:02<1:04:58, 1.47it/s, loss=0.0107, lr=1.73e-05, step=4272] Training: 43%|████▎ | 4273/10000 [58:03<59:09, 1.61it/s, loss=0.0107, lr=1.73e-05, step=4272] Training: 43%|████▎ | 4273/10000 [58:03<59:09, 1.61it/s, loss=0.0028, lr=1.73e-05, step=4273] Training: 43%|████▎ | 4274/10000 [58:03<55:04, 1.73it/s, loss=0.0028, lr=1.73e-05, step=4273] Training: 43%|████▎ | 4274/10000 [58:03<55:04, 1.73it/s, loss=0.0199, lr=1.73e-05, step=4274] Training: 43%|████▎ | 4275/10000 [58:04<52:10, 1.83it/s, loss=0.0199, lr=1.73e-05, step=4274] Training: 43%|████▎ | 4275/10000 [58:04<52:10, 1.83it/s, loss=0.0159, lr=1.73e-05, step=4275] Training: 43%|████▎ | 4276/10000 [58:04<54:48, 1.74it/s, loss=0.0159, lr=1.73e-05, step=4275] Training: 43%|████▎ | 4276/10000 [58:04<54:48, 1.74it/s, loss=0.0115, lr=1.73e-05, step=4276] Training: 43%|████▎ | 4277/10000 [58:05<52:16, 1.82it/s, loss=0.0115, lr=1.73e-05, step=4276] Training: 43%|████▎ | 4277/10000 [58:05<52:16, 1.82it/s, loss=0.0042, lr=1.73e-05, step=4277] Training: 43%|████▎ | 4278/10000 [58:05<56:20, 1.69it/s, loss=0.0042, lr=1.73e-05, step=4277] Training: 43%|████▎ | 4278/10000 [58:05<56:20, 1.69it/s, loss=0.0264, lr=1.73e-05, step=4278] Training: 43%|████▎ | 4279/10000 [58:06<59:42, 1.60it/s, loss=0.0264, lr=1.73e-05, step=4278] Training: 43%|████▎ | 4279/10000 [58:06<59:42, 1.60it/s, loss=0.0141, lr=1.73e-05, step=4279]19:42:38.888 [I] step=4280 loss=0.0078 smoothed_loss=0.0137 lr=1.73e-05 grad_norm=0.4088 step_time=0.5346s data_time=0.0693s it/s=1.656 eta_to_10000=3453.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0306 grad_action_out_proj_arms=0.1631 grad_arm_token_fuse=0.1796 grad_shared_expert=0.4660 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4280/10000 [58:07<56:47, 1.68it/s, loss=0.0141, lr=1.73e-05, step=4279] Training: 43%|████▎ | 4280/10000 [58:07<56:47, 1.68it/s, loss=0.0078, lr=1.73e-05, step=4280] Training: 43%|████▎ | 4281/10000 [58:07<53:34, 1.78it/s, loss=0.0078, lr=1.73e-05, step=4280] Training: 43%|████▎ | 4281/10000 [58:07<53:34, 1.78it/s, loss=0.0160, lr=1.73e-05, step=4281] Training: 43%|████▎ | 4282/10000 [58:08<56:34, 1.68it/s, loss=0.0160, lr=1.73e-05, step=4281] Training: 43%|████▎ | 4282/10000 [58:08<56:34, 1.68it/s, loss=0.0035, lr=1.73e-05, step=4282] Training: 43%|████▎ | 4283/10000 [58:08<58:33, 1.63it/s, loss=0.0035, lr=1.73e-05, step=4282] Training: 43%|████▎ | 4283/10000 [58:08<58:33, 1.63it/s, loss=0.0220, lr=1.73e-05, step=4283] Training: 43%|████▎ | 4284/10000 [58:09<1:05:11, 1.46it/s, loss=0.0220, lr=1.73e-05, step=4283] Training: 43%|████▎ | 4284/10000 [58:09<1:05:11, 1.46it/s, loss=0.0295, lr=1.73e-05, step=4284] Training: 43%|████▎ | 4285/10000 [58:10<1:04:51, 1.47it/s, loss=0.0295, lr=1.73e-05, step=4284] Training: 43%|████▎ | 4285/10000 [58:10<1:04:51, 1.47it/s, loss=0.0028, lr=1.73e-05, step=4285] Training: 43%|████▎ | 4286/10000 [58:11<1:12:32, 1.31it/s, loss=0.0028, lr=1.73e-05, step=4285] Training: 43%|████▎ | 4286/10000 [58:11<1:12:32, 1.31it/s, loss=0.0075, lr=1.73e-05, step=4286] Training: 43%|████▎ | 4287/10000 [58:11<1:04:31, 1.48it/s, loss=0.0075, lr=1.73e-05, step=4286] Training: 43%|████▎ | 4287/10000 [58:11<1:04:31, 1.48it/s, loss=0.0101, lr=1.73e-05, step=4287] Training: 43%|████▎ | 4288/10000 [58:12<1:02:02, 1.53it/s, loss=0.0101, lr=1.73e-05, step=4287] Training: 43%|████▎ | 4288/10000 [58:12<1:02:02, 1.53it/s, loss=0.0123, lr=1.73e-05, step=4288] Training: 43%|████▎ | 4289/10000 [58:12<58:20, 1.63it/s, loss=0.0123, lr=1.73e-05, step=4288] Training: 43%|████▎ | 4289/10000 [58:12<58:20, 1.63it/s, loss=0.0078, lr=1.73e-05, step=4289]19:42:45.433 [I] step=4290 loss=0.0107 smoothed_loss=0.0123 lr=1.73e-05 grad_norm=0.5055 step_time=0.5341s data_time=0.1204s it/s=1.528 eta_to_10000=3736.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.1107 grad_arm_token_fuse=0.0394 grad_shared_expert=0.4207 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4290/10000 [58:13<59:49, 1.59it/s, loss=0.0078, lr=1.73e-05, step=4289] Training: 43%|████▎ | 4290/10000 [58:13<59:49, 1.59it/s, loss=0.0107, lr=1.73e-05, step=4290] Training: 43%|████▎ | 4291/10000 [58:14<1:01:06, 1.56it/s, loss=0.0107, lr=1.73e-05, step=4290] Training: 43%|████▎ | 4291/10000 [58:14<1:01:06, 1.56it/s, loss=0.0121, lr=1.73e-05, step=4291] Training: 43%|████▎ | 4292/10000 [58:14<56:35, 1.68it/s, loss=0.0121, lr=1.73e-05, step=4291] Training: 43%|████▎ | 4292/10000 [58:14<56:35, 1.68it/s, loss=0.0064, lr=1.73e-05, step=4292] Training: 43%|████▎ | 4293/10000 [58:15<1:00:10, 1.58it/s, loss=0.0064, lr=1.73e-05, step=4292] Training: 43%|████▎ | 4293/10000 [58:15<1:00:10, 1.58it/s, loss=0.0058, lr=1.73e-05, step=4293] Training: 43%|████▎ | 4294/10000 [58:16<1:00:27, 1.57it/s, loss=0.0058, lr=1.73e-05, step=4293] Training: 43%|████▎ | 4294/10000 [58:16<1:00:27, 1.57it/s, loss=0.0034, lr=1.73e-05, step=4294] Training: 43%|████▎ | 4295/10000 [58:16<56:17, 1.69it/s, loss=0.0034, lr=1.73e-05, step=4294] Training: 43%|████▎ | 4295/10000 [58:16<56:17, 1.69it/s, loss=0.0248, lr=1.72e-05, step=4295] Training: 43%|████▎ | 4296/10000 [58:17<56:27, 1.68it/s, loss=0.0248, lr=1.72e-05, step=4295] Training: 43%|████▎ | 4296/10000 [58:17<56:27, 1.68it/s, loss=0.0610, lr=1.72e-05, step=4296] Training: 43%|████▎ | 4297/10000 [58:18<1:04:17, 1.48it/s, loss=0.0610, lr=1.72e-05, step=4296] Training: 43%|████▎ | 4297/10000 [58:18<1:04:17, 1.48it/s, loss=0.0475, lr=1.72e-05, step=4297] Training: 43%|████▎ | 4298/10000 [58:18<1:06:52, 1.42it/s, loss=0.0475, lr=1.72e-05, step=4297] Training: 43%|████▎ | 4298/10000 [58:18<1:06:52, 1.42it/s, loss=0.0056, lr=1.72e-05, step=4298] Training: 43%|████▎ | 4299/10000 [58:19<1:07:05, 1.42it/s, loss=0.0056, lr=1.72e-05, step=4298] Training: 43%|████▎ | 4299/10000 [58:19<1:07:05, 1.42it/s, loss=0.0252, lr=1.72e-05, step=4299]19:42:52.117 [I] step=4300 loss=0.1242 smoothed_loss=0.0296 lr=1.72e-05 grad_norm=0.4954 step_time=0.5589s data_time=0.1096s it/s=1.496 eta_to_10000=3809.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0306 grad_action_out_proj_arms=0.2490 grad_arm_token_fuse=0.1709 grad_shared_expert=0.5197 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4300/10000 [58:20<1:07:39, 1.40it/s, loss=0.0252, lr=1.72e-05, step=4299] Training: 43%|████▎ | 4300/10000 [58:20<1:07:39, 1.40it/s, loss=0.1242, lr=1.72e-05, step=4300] Training: 43%|████▎ | 4301/10000 [58:20<1:01:24, 1.55it/s, loss=0.1242, lr=1.72e-05, step=4300] Training: 43%|████▎ | 4301/10000 [58:20<1:01:24, 1.55it/s, loss=0.0293, lr=1.72e-05, step=4301] Training: 43%|████▎ | 4302/10000 [58:21<1:05:39, 1.45it/s, loss=0.0293, lr=1.72e-05, step=4301] Training: 43%|████▎ | 4302/10000 [58:21<1:05:39, 1.45it/s, loss=0.0153, lr=1.72e-05, step=4302] Training: 43%|████▎ | 4303/10000 [58:22<1:01:45, 1.54it/s, loss=0.0153, lr=1.72e-05, step=4302] Training: 43%|████▎ | 4303/10000 [58:22<1:01:45, 1.54it/s, loss=0.0103, lr=1.72e-05, step=4303] Training: 43%|████▎ | 4304/10000 [58:22<1:03:45, 1.49it/s, loss=0.0103, lr=1.72e-05, step=4303] Training: 43%|████▎ | 4304/10000 [58:22<1:03:45, 1.49it/s, loss=0.0038, lr=1.72e-05, step=4304] Training: 43%|████▎ | 4305/10000 [58:23<1:03:35, 1.49it/s, loss=0.0038, lr=1.72e-05, step=4304] Training: 43%|████▎ | 4305/10000 [58:23<1:03:35, 1.49it/s, loss=0.0111, lr=1.72e-05, step=4305] Training: 43%|████▎ | 4306/10000 [58:24<1:01:17, 1.55it/s, loss=0.0111, lr=1.72e-05, step=4305] Training: 43%|████▎ | 4306/10000 [58:24<1:01:17, 1.55it/s, loss=0.0322, lr=1.72e-05, step=4306] Training: 43%|████▎ | 4307/10000 [58:24<1:07:46, 1.40it/s, loss=0.0322, lr=1.72e-05, step=4306] Training: 43%|████▎ | 4307/10000 [58:24<1:07:46, 1.40it/s, loss=0.0108, lr=1.72e-05, step=4307] Training: 43%|████▎ | 4308/10000 [58:25<1:10:28, 1.35it/s, loss=0.0108, lr=1.72e-05, step=4307] Training: 43%|████▎ | 4308/10000 [58:25<1:10:28, 1.35it/s, loss=0.0087, lr=1.72e-05, step=4308] Training: 43%|████▎ | 4309/10000 [58:26<1:07:19, 1.41it/s, loss=0.0087, lr=1.72e-05, step=4308] Training: 43%|████▎ | 4309/10000 [58:26<1:07:19, 1.41it/s, loss=0.0052, lr=1.72e-05, step=4309]19:42:59.042 [I] step=4310 loss=0.0022 smoothed_loss=0.0178 lr=1.72e-05 grad_norm=0.5244 step_time=0.5480s data_time=0.1446s it/s=1.444 eta_to_10000=3939.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1330 grad_arm_token_fuse=0.0691 grad_shared_expert=0.3427 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4310/10000 [58:27<1:09:32, 1.36it/s, loss=0.0052, lr=1.72e-05, step=4309] Training: 43%|████▎ | 4310/10000 [58:27<1:09:32, 1.36it/s, loss=0.0022, lr=1.72e-05, step=4310] Training: 43%|████▎ | 4311/10000 [58:27<1:02:41, 1.51it/s, loss=0.0022, lr=1.72e-05, step=4310] Training: 43%|████▎ | 4311/10000 [58:27<1:02:41, 1.51it/s, loss=0.0039, lr=1.72e-05, step=4311] Training: 43%|████▎ | 4312/10000 [58:28<1:01:33, 1.54it/s, loss=0.0039, lr=1.72e-05, step=4311] Training: 43%|████▎ | 4312/10000 [58:28<1:01:33, 1.54it/s, loss=0.0117, lr=1.72e-05, step=4312] Training: 43%|████▎ | 4313/10000 [58:29<1:02:21, 1.52it/s, loss=0.0117, lr=1.72e-05, step=4312] Training: 43%|████▎ | 4313/10000 [58:29<1:02:21, 1.52it/s, loss=0.0088, lr=1.72e-05, step=4313] Training: 43%|████▎ | 4314/10000 [58:29<1:02:34, 1.51it/s, loss=0.0088, lr=1.72e-05, step=4313] Training: 43%|████▎ | 4314/10000 [58:29<1:02:34, 1.51it/s, loss=0.0059, lr=1.72e-05, step=4314] Training: 43%|████▎ | 4315/10000 [58:30<1:14:55, 1.26it/s, loss=0.0059, lr=1.72e-05, step=4314] Training: 43%|████▎ | 4315/10000 [58:30<1:14:55, 1.26it/s, loss=0.0047, lr=1.72e-05, step=4315] Training: 43%|████▎ | 4316/10000 [58:31<1:12:48, 1.30it/s, loss=0.0047, lr=1.72e-05, step=4315] Training: 43%|████▎ | 4316/10000 [58:31<1:12:48, 1.30it/s, loss=0.0181, lr=1.72e-05, step=4316] Training: 43%|████▎ | 4317/10000 [58:32<1:10:47, 1.34it/s, loss=0.0181, lr=1.72e-05, step=4316] Training: 43%|████▎ | 4317/10000 [58:32<1:10:47, 1.34it/s, loss=0.0101, lr=1.72e-05, step=4317] Training: 43%|████▎ | 4318/10000 [58:32<1:03:23, 1.49it/s, loss=0.0101, lr=1.72e-05, step=4317] Training: 43%|████▎ | 4318/10000 [58:32<1:03:23, 1.49it/s, loss=0.0487, lr=1.72e-05, step=4318] Training: 43%|████▎ | 4319/10000 [58:33<58:02, 1.63it/s, loss=0.0487, lr=1.72e-05, step=4318] Training: 43%|████▎ | 4319/10000 [58:33<58:02, 1.63it/s, loss=0.0206, lr=1.72e-05, step=4319]19:43:05.694 [I] step=4320 loss=0.0401 smoothed_loss=0.0196 lr=1.72e-05 grad_norm=0.4836 step_time=0.5565s data_time=0.1086s it/s=1.504 eta_to_10000=3777.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0253 grad_action_out_proj_arms=0.1902 grad_arm_token_fuse=0.1262 grad_shared_expert=0.5798 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4320/10000 [58:33<1:00:54, 1.55it/s, loss=0.0206, lr=1.72e-05, step=4319] Training: 43%|████▎ | 4320/10000 [58:33<1:00:54, 1.55it/s, loss=0.0401, lr=1.72e-05, step=4320] Training: 43%|████▎ | 4321/10000 [58:34<1:06:23, 1.43it/s, loss=0.0401, lr=1.72e-05, step=4320] Training: 43%|████▎ | 4321/10000 [58:34<1:06:23, 1.43it/s, loss=0.0016, lr=1.72e-05, step=4321] Training: 43%|████▎ | 4322/10000 [58:35<1:12:24, 1.31it/s, loss=0.0016, lr=1.72e-05, step=4321] Training: 43%|████▎ | 4322/10000 [58:35<1:12:24, 1.31it/s, loss=0.0018, lr=1.72e-05, step=4322] Training: 43%|████▎ | 4323/10000 [58:36<1:04:13, 1.47it/s, loss=0.0018, lr=1.72e-05, step=4322] Training: 43%|████▎ | 4323/10000 [58:36<1:04:13, 1.47it/s, loss=0.0063, lr=1.71e-05, step=4323] Training: 43%|████▎ | 4324/10000 [58:36<58:46, 1.61it/s, loss=0.0063, lr=1.71e-05, step=4323] Training: 43%|████▎ | 4324/10000 [58:36<58:46, 1.61it/s, loss=0.0436, lr=1.71e-05, step=4324] Training: 43%|████▎ | 4325/10000 [58:37<1:01:51, 1.53it/s, loss=0.0436, lr=1.71e-05, step=4324] Training: 43%|████▎ | 4325/10000 [58:37<1:01:51, 1.53it/s, loss=0.0475, lr=1.71e-05, step=4325] Training: 43%|████▎ | 4326/10000 [58:37<56:57, 1.66it/s, loss=0.0475, lr=1.71e-05, step=4325] Training: 43%|████▎ | 4326/10000 [58:37<56:57, 1.66it/s, loss=0.0394, lr=1.71e-05, step=4326] Training: 43%|████▎ | 4327/10000 [58:38<1:02:12, 1.52it/s, loss=0.0394, lr=1.71e-05, step=4326] Training: 43%|████▎ | 4327/10000 [58:38<1:02:12, 1.52it/s, loss=0.0497, lr=1.71e-05, step=4327] Training: 43%|████▎ | 4328/10000 [58:39<1:04:45, 1.46it/s, loss=0.0497, lr=1.71e-05, step=4327] Training: 43%|████▎ | 4328/10000 [58:39<1:04:45, 1.46it/s, loss=0.0027, lr=1.71e-05, step=4328] Training: 43%|████▎ | 4329/10000 [58:40<1:05:40, 1.44it/s, loss=0.0027, lr=1.71e-05, step=4328] Training: 43%|████▎ | 4329/10000 [58:40<1:05:40, 1.44it/s, loss=0.0062, lr=1.71e-05, step=4329]19:43:12.398 [I] step=4330 loss=0.0189 smoothed_loss=0.0213 lr=1.71e-05 grad_norm=0.4846 step_time=0.5771s data_time=0.0934s it/s=1.492 eta_to_10000=3801.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0112 grad_action_out_proj_arms=0.1011 grad_arm_token_fuse=0.0570 grad_shared_expert=0.4831 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4330/10000 [58:40<1:00:48, 1.55it/s, loss=0.0062, lr=1.71e-05, step=4329] Training: 43%|████▎ | 4330/10000 [58:40<1:00:48, 1.55it/s, loss=0.0189, lr=1.71e-05, step=4330] Training: 43%|████▎ | 4331/10000 [58:41<56:08, 1.68it/s, loss=0.0189, lr=1.71e-05, step=4330] Training: 43%|████▎ | 4331/10000 [58:41<56:08, 1.68it/s, loss=0.0078, lr=1.71e-05, step=4331] Training: 43%|████▎ | 4332/10000 [58:41<53:55, 1.75it/s, loss=0.0078, lr=1.71e-05, step=4331] Training: 43%|████▎ | 4332/10000 [58:41<53:55, 1.75it/s, loss=0.0244, lr=1.71e-05, step=4332] Training: 43%|████▎ | 4333/10000 [58:42<1:00:46, 1.55it/s, loss=0.0244, lr=1.71e-05, step=4332] Training: 43%|████▎ | 4333/10000 [58:42<1:00:46, 1.55it/s, loss=0.0053, lr=1.71e-05, step=4333] Training: 43%|████▎ | 4334/10000 [58:43<1:00:17, 1.57it/s, loss=0.0053, lr=1.71e-05, step=4333] Training: 43%|████▎ | 4334/10000 [58:43<1:00:17, 1.57it/s, loss=0.0062, lr=1.71e-05, step=4334] Training: 43%|████▎ | 4335/10000 [58:43<1:01:09, 1.54it/s, loss=0.0062, lr=1.71e-05, step=4334] Training: 43%|████▎ | 4335/10000 [58:43<1:01:09, 1.54it/s, loss=0.0082, lr=1.71e-05, step=4335] Training: 43%|████▎ | 4336/10000 [58:44<1:09:05, 1.37it/s, loss=0.0082, lr=1.71e-05, step=4335] Training: 43%|████▎ | 4336/10000 [58:44<1:09:05, 1.37it/s, loss=0.0124, lr=1.71e-05, step=4336] Training: 43%|████▎ | 4337/10000 [58:45<1:01:58, 1.52it/s, loss=0.0124, lr=1.71e-05, step=4336] Training: 43%|████▎ | 4337/10000 [58:45<1:01:58, 1.52it/s, loss=0.0077, lr=1.71e-05, step=4337] Training: 43%|████▎ | 4338/10000 [58:45<1:03:49, 1.48it/s, loss=0.0077, lr=1.71e-05, step=4337] Training: 43%|████▎ | 4338/10000 [58:45<1:03:49, 1.48it/s, loss=0.0034, lr=1.71e-05, step=4338] Training: 43%|████▎ | 4339/10000 [58:46<1:14:02, 1.27it/s, loss=0.0034, lr=1.71e-05, step=4338] Training: 43%|████▎ | 4339/10000 [58:46<1:14:02, 1.27it/s, loss=0.0085, lr=1.71e-05, step=4339]19:43:19.338 [I] step=4340 loss=0.0095 smoothed_loss=0.0132 lr=1.71e-05 grad_norm=0.4636 step_time=0.5753s data_time=0.1188s it/s=1.441 eta_to_10000=3927.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0182 grad_action_out_proj_arms=0.1352 grad_arm_token_fuse=0.0990 grad_shared_expert=0.3477 (18633:train_pytorch.py:850) + Training: 43%|████▎ | 4340/10000 [58:47<1:10:39, 1.33it/s, loss=0.0085, lr=1.71e-05, step=4339] Training: 43%|████▎ | 4340/10000 [58:47<1:10:39, 1.33it/s, loss=0.0095, lr=1.71e-05, step=4340] Training: 43%|████▎ | 4341/10000 [58:48<1:15:25, 1.25it/s, loss=0.0095, lr=1.71e-05, step=4340] Training: 43%|████▎ | 4341/10000 [58:48<1:15:25, 1.25it/s, loss=0.0089, lr=1.71e-05, step=4341] Training: 43%|████▎ | 4342/10000 [58:49<1:19:43, 1.18it/s, loss=0.0089, lr=1.71e-05, step=4341] Training: 43%|████▎ | 4342/10000 [58:49<1:19:43, 1.18it/s, loss=0.0060, lr=1.71e-05, step=4342] Training: 43%|████▎ | 4343/10000 [58:50<1:23:52, 1.12it/s, loss=0.0060, lr=1.71e-05, step=4342] Training: 43%|████▎ | 4343/10000 [58:50<1:23:52, 1.12it/s, loss=0.0095, lr=1.71e-05, step=4343] Training: 43%|████▎ | 4344/10000 [58:51<1:27:21, 1.08it/s, loss=0.0095, lr=1.71e-05, step=4343] Training: 43%|████▎ | 4344/10000 [58:51<1:27:21, 1.08it/s, loss=0.0079, lr=1.71e-05, step=4344] Training: 43%|████▎ | 4345/10000 [58:52<1:21:36, 1.15it/s, loss=0.0079, lr=1.71e-05, step=4344] Training: 43%|████▎ | 4345/10000 [58:52<1:21:36, 1.15it/s, loss=0.0098, lr=1.71e-05, step=4345] Training: 43%|████▎ | 4346/10000 [58:52<1:19:44, 1.18it/s, loss=0.0098, lr=1.71e-05, step=4345] Training: 43%|████▎ | 4346/10000 [58:52<1:19:44, 1.18it/s, loss=0.0241, lr=1.71e-05, step=4346] Training: 43%|████▎ | 4347/10000 [58:53<1:12:02, 1.31it/s, loss=0.0241, lr=1.71e-05, step=4346] Training: 43%|████▎ | 4347/10000 [58:53<1:12:02, 1.31it/s, loss=0.0114, lr=1.71e-05, step=4347] Training: 43%|████▎ | 4348/10000 [58:54<1:08:32, 1.37it/s, loss=0.0114, lr=1.71e-05, step=4347] Training: 43%|████▎ | 4348/10000 [58:54<1:08:32, 1.37it/s, loss=0.0236, lr=1.71e-05, step=4348] Training: 43%|████▎ | 4349/10000 [58:54<1:08:40, 1.37it/s, loss=0.0236, lr=1.71e-05, step=4348] Training: 43%|████▎ | 4349/10000 [58:54<1:08:40, 1.37it/s, loss=0.0185, lr=1.71e-05, step=4349]19:43:27.501 [I] step=4350 loss=0.0123 smoothed_loss=0.0139 lr=1.71e-05 grad_norm=0.4402 step_time=0.6485s data_time=0.1678s it/s=1.225 eta_to_10000=4611.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0078 grad_action_out_proj_arms=0.1110 grad_arm_token_fuse=0.0444 grad_shared_expert=0.4434 (18633:train_pytorch.py:850) + Training: 44%|████▎ | 4350/10000 [58:55<1:11:05, 1.32it/s, loss=0.0185, lr=1.71e-05, step=4349] Training: 44%|████▎ | 4350/10000 [58:55<1:11:05, 1.32it/s, loss=0.0123, lr=1.71e-05, step=4350] Training: 44%|████▎ | 4351/10000 [58:56<1:07:22, 1.40it/s, loss=0.0123, lr=1.71e-05, step=4350] Training: 44%|████▎ | 4351/10000 [58:56<1:07:22, 1.40it/s, loss=0.0062, lr=1.70e-05, step=4351] Training: 44%|████▎ | 4352/10000 [58:57<1:07:35, 1.39it/s, loss=0.0062, lr=1.70e-05, step=4351] Training: 44%|████▎ | 4352/10000 [58:57<1:07:35, 1.39it/s, loss=0.0260, lr=1.70e-05, step=4352] Training: 44%|████▎ | 4353/10000 [58:57<1:06:45, 1.41it/s, loss=0.0260, lr=1.70e-05, step=4352] Training: 44%|████▎ | 4353/10000 [58:57<1:06:45, 1.41it/s, loss=0.0060, lr=1.70e-05, step=4353] Training: 44%|████▎ | 4354/10000 [58:58<1:07:06, 1.40it/s, loss=0.0060, lr=1.70e-05, step=4353] Training: 44%|████▎ | 4354/10000 [58:58<1:07:06, 1.40it/s, loss=0.0069, lr=1.70e-05, step=4354] Training: 44%|████▎ | 4355/10000 [58:59<1:11:48, 1.31it/s, loss=0.0069, lr=1.70e-05, step=4354] Training: 44%|████▎ | 4355/10000 [58:59<1:11:48, 1.31it/s, loss=0.0136, lr=1.70e-05, step=4355] Training: 44%|████▎ | 4356/10000 [58:59<1:06:26, 1.42it/s, loss=0.0136, lr=1.70e-05, step=4355] Training: 44%|████▎ | 4356/10000 [58:59<1:06:26, 1.42it/s, loss=0.0030, lr=1.70e-05, step=4356] Training: 44%|████▎ | 4357/10000 [59:00<1:09:05, 1.36it/s, loss=0.0030, lr=1.70e-05, step=4356] Training: 44%|████▎ | 4357/10000 [59:00<1:09:05, 1.36it/s, loss=0.0296, lr=1.70e-05, step=4357] Training: 44%|████▎ | 4358/10000 [59:01<1:05:23, 1.44it/s, loss=0.0296, lr=1.70e-05, step=4357] Training: 44%|████▎ | 4358/10000 [59:01<1:05:23, 1.44it/s, loss=0.0195, lr=1.70e-05, step=4358] Training: 44%|████▎ | 4359/10000 [59:02<1:09:04, 1.36it/s, loss=0.0195, lr=1.70e-05, step=4358] Training: 44%|████▎ | 4359/10000 [59:02<1:09:04, 1.36it/s, loss=0.0615, lr=1.70e-05, step=4359]19:43:34.645 [I] step=4360 loss=0.0112 smoothed_loss=0.0182 lr=1.70e-05 grad_norm=0.4808 step_time=0.5749s data_time=0.1394s it/s=1.400 eta_to_10000=4028.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1457 grad_arm_token_fuse=0.0603 grad_shared_expert=0.5140 (18633:train_pytorch.py:850) + Training: 44%|████▎ | 4360/10000 [59:02<1:08:05, 1.38it/s, loss=0.0615, lr=1.70e-05, step=4359] Training: 44%|████▎ | 4360/10000 [59:02<1:08:05, 1.38it/s, loss=0.0112, lr=1.70e-05, step=4360] Training: 44%|████▎ | 4361/10000 [59:03<1:07:06, 1.40it/s, loss=0.0112, lr=1.70e-05, step=4360] Training: 44%|████▎ | 4361/10000 [59:03<1:07:06, 1.40it/s, loss=0.0092, lr=1.70e-05, step=4361] Training: 44%|████▎ | 4362/10000 [59:04<1:01:44, 1.52it/s, loss=0.0092, lr=1.70e-05, step=4361] Training: 44%|████▎ | 4362/10000 [59:04<1:01:44, 1.52it/s, loss=0.0099, lr=1.70e-05, step=4362] Training: 44%|████▎ | 4363/10000 [59:04<1:01:34, 1.53it/s, loss=0.0099, lr=1.70e-05, step=4362] Training: 44%|████▎ | 4363/10000 [59:04<1:01:34, 1.53it/s, loss=0.0038, lr=1.70e-05, step=4363] Training: 44%|████▎ | 4364/10000 [59:05<1:08:35, 1.37it/s, loss=0.0038, lr=1.70e-05, step=4363] Training: 44%|████▎ | 4364/10000 [59:05<1:08:35, 1.37it/s, loss=0.0133, lr=1.70e-05, step=4364] Training: 44%|████▎ | 4365/10000 [59:06<1:08:44, 1.37it/s, loss=0.0133, lr=1.70e-05, step=4364] Training: 44%|████▎ | 4365/10000 [59:06<1:08:44, 1.37it/s, loss=0.0231, lr=1.70e-05, step=4365] Training: 44%|████▎ | 4366/10000 [59:06<1:05:29, 1.43it/s, loss=0.0231, lr=1.70e-05, step=4365] Training: 44%|████▎ | 4366/10000 [59:06<1:05:29, 1.43it/s, loss=0.0102, lr=1.70e-05, step=4366] Training: 44%|████▎ | 4367/10000 [59:07<1:02:57, 1.49it/s, loss=0.0102, lr=1.70e-05, step=4366] Training: 44%|████▎ | 4367/10000 [59:07<1:02:57, 1.49it/s, loss=0.0109, lr=1.70e-05, step=4367] Training: 44%|████▎ | 4368/10000 [59:08<59:24, 1.58it/s, loss=0.0109, lr=1.70e-05, step=4367] Training: 44%|████▎ | 4368/10000 [59:08<59:24, 1.58it/s, loss=0.0045, lr=1.70e-05, step=4368] Training: 44%|████▎ | 4369/10000 [59:08<1:02:31, 1.50it/s, loss=0.0045, lr=1.70e-05, step=4368] Training: 44%|████▎ | 4369/10000 [59:08<1:02:31, 1.50it/s, loss=0.0062, lr=1.70e-05, step=4369]19:43:41.453 [I] step=4370 loss=0.0211 smoothed_loss=0.0139 lr=1.70e-05 grad_norm=0.4925 step_time=0.5740s data_time=0.1068s it/s=1.469 eta_to_10000=3832.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0241 grad_action_out_proj_arms=0.2179 grad_arm_token_fuse=0.1301 grad_shared_expert=0.5978 (18633:train_pytorch.py:850) + Training: 44%|████▎ | 4370/10000 [59:09<1:05:54, 1.42it/s, loss=0.0062, lr=1.70e-05, step=4369] Training: 44%|████▎ | 4370/10000 [59:09<1:05:54, 1.42it/s, loss=0.0211, lr=1.70e-05, step=4370] Training: 44%|████▎ | 4371/10000 [59:10<1:05:26, 1.43it/s, loss=0.0211, lr=1.70e-05, step=4370] Training: 44%|████▎ | 4371/10000 [59:10<1:05:26, 1.43it/s, loss=0.0134, lr=1.70e-05, step=4371] Training: 44%|████▎ | 4372/10000 [59:11<1:07:50, 1.38it/s, loss=0.0134, lr=1.70e-05, step=4371] Training: 44%|████▎ | 4372/10000 [59:11<1:07:50, 1.38it/s, loss=0.0168, lr=1.70e-05, step=4372] Training: 44%|████▎ | 4373/10000 [59:11<1:02:53, 1.49it/s, loss=0.0168, lr=1.70e-05, step=4372] Training: 44%|████▎ | 4373/10000 [59:11<1:02:53, 1.49it/s, loss=0.0174, lr=1.70e-05, step=4373] Training: 44%|████▎ | 4374/10000 [59:12<1:01:40, 1.52it/s, loss=0.0174, lr=1.70e-05, step=4373] Training: 44%|████▎ | 4374/10000 [59:12<1:01:40, 1.52it/s, loss=0.0200, lr=1.70e-05, step=4374] Training: 44%|████▍ | 4375/10000 [59:12<58:53, 1.59it/s, loss=0.0200, lr=1.70e-05, step=4374] Training: 44%|████▍ | 4375/10000 [59:12<58:53, 1.59it/s, loss=0.0164, lr=1.70e-05, step=4375] Training: 44%|████▍ | 4376/10000 [59:13<1:04:20, 1.46it/s, loss=0.0164, lr=1.70e-05, step=4375] Training: 44%|████▍ | 4376/10000 [59:13<1:04:20, 1.46it/s, loss=0.0669, lr=1.70e-05, step=4376] Training: 44%|████▍ | 4377/10000 [59:14<1:02:40, 1.50it/s, loss=0.0669, lr=1.70e-05, step=4376] Training: 44%|████▍ | 4377/10000 [59:14<1:02:40, 1.50it/s, loss=0.0227, lr=1.70e-05, step=4377] Training: 44%|████▍ | 4378/10000 [59:15<1:10:42, 1.33it/s, loss=0.0227, lr=1.70e-05, step=4377] Training: 44%|████▍ | 4378/10000 [59:15<1:10:42, 1.33it/s, loss=0.0106, lr=1.70e-05, step=4378] Training: 44%|████▍ | 4379/10000 [59:16<1:19:51, 1.17it/s, loss=0.0106, lr=1.70e-05, step=4378] Training: 44%|████▍ | 4379/10000 [59:16<1:19:51, 1.17it/s, loss=0.0116, lr=1.69e-05, step=4379]19:43:48.967 [I] step=4380 loss=0.0120 smoothed_loss=0.0181 lr=1.70e-05 grad_norm=0.5055 step_time=0.6183s data_time=0.1331s it/s=1.331 eta_to_10000=4222.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.0921 grad_arm_token_fuse=0.0532 grad_shared_expert=0.3694 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4380/10000 [59:17<1:19:04, 1.18it/s, loss=0.0116, lr=1.69e-05, step=4379] Training: 44%|████▍ | 4380/10000 [59:17<1:19:04, 1.18it/s, loss=0.0120, lr=1.69e-05, step=4380] Training: 44%|████▍ | 4381/10000 [59:17<1:09:11, 1.35it/s, loss=0.0120, lr=1.69e-05, step=4380] Training: 44%|████▍ | 4381/10000 [59:17<1:09:11, 1.35it/s, loss=0.0036, lr=1.69e-05, step=4381] Training: 44%|████▍ | 4382/10000 [59:18<1:06:50, 1.40it/s, loss=0.0036, lr=1.69e-05, step=4381] Training: 44%|████▍ | 4382/10000 [59:18<1:06:50, 1.40it/s, loss=0.0076, lr=1.69e-05, step=4382] Training: 44%|████▍ | 4383/10000 [59:18<1:01:58, 1.51it/s, loss=0.0076, lr=1.69e-05, step=4382] Training: 44%|████▍ | 4383/10000 [59:18<1:01:58, 1.51it/s, loss=0.0127, lr=1.69e-05, step=4383] Training: 44%|████▍ | 4384/10000 [59:19<1:05:44, 1.42it/s, loss=0.0127, lr=1.69e-05, step=4383] Training: 44%|████▍ | 4384/10000 [59:19<1:05:44, 1.42it/s, loss=0.0083, lr=1.69e-05, step=4384] Training: 44%|████▍ | 4385/10000 [59:20<1:03:34, 1.47it/s, loss=0.0083, lr=1.69e-05, step=4384] Training: 44%|████▍ | 4385/10000 [59:20<1:03:34, 1.47it/s, loss=0.0122, lr=1.69e-05, step=4385] Training: 44%|████▍ | 4386/10000 [59:20<1:04:57, 1.44it/s, loss=0.0122, lr=1.69e-05, step=4385] Training: 44%|████▍ | 4386/10000 [59:20<1:04:57, 1.44it/s, loss=0.0161, lr=1.69e-05, step=4386] Training: 44%|████▍ | 4387/10000 [59:21<1:00:20, 1.55it/s, loss=0.0161, lr=1.69e-05, step=4386] Training: 44%|████▍ | 4387/10000 [59:21<1:00:20, 1.55it/s, loss=0.0025, lr=1.69e-05, step=4387] Training: 44%|████▍ | 4388/10000 [59:22<1:09:50, 1.34it/s, loss=0.0025, lr=1.69e-05, step=4387] Training: 44%|████▍ | 4388/10000 [59:22<1:09:50, 1.34it/s, loss=0.0371, lr=1.69e-05, step=4388] Training: 44%|████▍ | 4389/10000 [59:23<1:07:58, 1.38it/s, loss=0.0371, lr=1.69e-05, step=4388] Training: 44%|████▍ | 4389/10000 [59:23<1:07:58, 1.38it/s, loss=0.0107, lr=1.69e-05, step=4389]19:43:55.523 [I] step=4390 loss=0.0307 smoothed_loss=0.0168 lr=1.69e-05 grad_norm=0.4103 step_time=0.5370s data_time=0.1187s it/s=1.525 eta_to_10000=3677.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0903 grad_arm_token_fuse=0.0430 grad_shared_expert=0.4015 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4390/10000 [59:23<1:02:11, 1.50it/s, loss=0.0107, lr=1.69e-05, step=4389] Training: 44%|████▍ | 4390/10000 [59:23<1:02:11, 1.50it/s, loss=0.0307, lr=1.69e-05, step=4390] Training: 44%|████▍ | 4391/10000 [59:24<1:05:57, 1.42it/s, loss=0.0307, lr=1.69e-05, step=4390] Training: 44%|████▍ | 4391/10000 [59:24<1:05:57, 1.42it/s, loss=0.0118, lr=1.69e-05, step=4391] Training: 44%|████▍ | 4392/10000 [59:25<1:01:46, 1.51it/s, loss=0.0118, lr=1.69e-05, step=4391] Training: 44%|████▍ | 4392/10000 [59:25<1:01:46, 1.51it/s, loss=0.0075, lr=1.69e-05, step=4392] Training: 44%|████▍ | 4393/10000 [59:25<1:05:30, 1.43it/s, loss=0.0075, lr=1.69e-05, step=4392] Training: 44%|████▍ | 4393/10000 [59:25<1:05:30, 1.43it/s, loss=0.0094, lr=1.69e-05, step=4393] Training: 44%|████▍ | 4394/10000 [59:26<1:10:43, 1.32it/s, loss=0.0094, lr=1.69e-05, step=4393] Training: 44%|████▍ | 4394/10000 [59:26<1:10:43, 1.32it/s, loss=0.0112, lr=1.69e-05, step=4394] Training: 44%|████▍ | 4395/10000 [59:27<1:13:41, 1.27it/s, loss=0.0112, lr=1.69e-05, step=4394] Training: 44%|████▍ | 4395/10000 [59:27<1:13:41, 1.27it/s, loss=0.0093, lr=1.69e-05, step=4395] Training: 44%|████▍ | 4396/10000 [59:28<1:20:11, 1.16it/s, loss=0.0093, lr=1.69e-05, step=4395] Training: 44%|████▍ | 4396/10000 [59:28<1:20:11, 1.16it/s, loss=0.0097, lr=1.69e-05, step=4396] Training: 44%|████▍ | 4397/10000 [59:29<1:19:36, 1.17it/s, loss=0.0097, lr=1.69e-05, step=4396] Training: 44%|████▍ | 4397/10000 [59:29<1:19:36, 1.17it/s, loss=0.0107, lr=1.69e-05, step=4397] Training: 44%|████▍ | 4398/10000 [59:30<1:22:59, 1.13it/s, loss=0.0107, lr=1.69e-05, step=4397] Training: 44%|████▍ | 4398/10000 [59:30<1:22:59, 1.13it/s, loss=0.0180, lr=1.69e-05, step=4398] Training: 44%|████▍ | 4399/10000 [59:31<1:24:31, 1.10it/s, loss=0.0180, lr=1.69e-05, step=4398] Training: 44%|████▍ | 4399/10000 [59:31<1:24:31, 1.10it/s, loss=0.0209, lr=1.69e-05, step=4399]19:44:04.308 [I] step=4400 loss=0.0095 smoothed_loss=0.0140 lr=1.69e-05 grad_norm=0.4892 step_time=0.6604s data_time=0.2181s it/s=1.139 eta_to_10000=4918.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0148 grad_action_out_proj_arms=0.1079 grad_arm_token_fuse=0.0727 grad_shared_expert=0.3535 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4400/10000 [59:32<1:30:05, 1.04it/s, loss=0.0209, lr=1.69e-05, step=4399] Training: 44%|████▍ | 4400/10000 [59:32<1:30:05, 1.04it/s, loss=0.0095, lr=1.69e-05, step=4400] Training: 44%|████▍ | 4401/10000 [59:33<1:29:26, 1.04it/s, loss=0.0095, lr=1.69e-05, step=4400] Training: 44%|████▍ | 4401/10000 [59:33<1:29:26, 1.04it/s, loss=0.0025, lr=1.69e-05, step=4401] Training: 44%|████▍ | 4402/10000 [59:34<1:26:46, 1.08it/s, loss=0.0025, lr=1.69e-05, step=4401] Training: 44%|████▍ | 4402/10000 [59:34<1:26:46, 1.08it/s, loss=0.0158, lr=1.69e-05, step=4402] Training: 44%|████▍ | 4403/10000 [59:35<1:29:03, 1.05it/s, loss=0.0158, lr=1.69e-05, step=4402] Training: 44%|████▍ | 4403/10000 [59:35<1:29:03, 1.05it/s, loss=0.0071, lr=1.69e-05, step=4403] Training: 44%|████▍ | 4404/10000 [59:36<1:28:38, 1.05it/s, loss=0.0071, lr=1.69e-05, step=4403] Training: 44%|████▍ | 4404/10000 [59:36<1:28:38, 1.05it/s, loss=0.0083, lr=1.69e-05, step=4404] Training: 44%|████▍ | 4405/10000 [59:37<1:27:14, 1.07it/s, loss=0.0083, lr=1.69e-05, step=4404] Training: 44%|████▍ | 4405/10000 [59:37<1:27:14, 1.07it/s, loss=0.0130, lr=1.69e-05, step=4405] Training: 44%|████▍ | 4406/10000 [59:37<1:24:51, 1.10it/s, loss=0.0130, lr=1.69e-05, step=4405] Training: 44%|████▍ | 4406/10000 [59:37<1:24:51, 1.10it/s, loss=0.0136, lr=1.69e-05, step=4406] Training: 44%|████▍ | 4407/10000 [59:39<1:29:33, 1.04it/s, loss=0.0136, lr=1.69e-05, step=4406] Training: 44%|████▍ | 4407/10000 [59:39<1:29:33, 1.04it/s, loss=0.0369, lr=1.68e-05, step=4407] Training: 44%|████▍ | 4408/10000 [59:40<1:29:11, 1.04it/s, loss=0.0369, lr=1.68e-05, step=4407] Training: 44%|████▍ | 4408/10000 [59:40<1:29:11, 1.04it/s, loss=0.0056, lr=1.68e-05, step=4408] Training: 44%|████▍ | 4409/10000 [59:40<1:15:43, 1.23it/s, loss=0.0056, lr=1.68e-05, step=4408] Training: 44%|████▍ | 4409/10000 [59:40<1:15:43, 1.23it/s, loss=0.0026, lr=1.68e-05, step=4409]19:44:12.854 [I] step=4410 loss=0.0026 smoothed_loss=0.0117 lr=1.69e-05 grad_norm=0.4319 step_time=0.6559s data_time=0.1987s it/s=1.170 eta_to_10000=4776.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1559 grad_arm_token_fuse=0.0682 grad_shared_expert=0.3881 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4410/10000 [59:41<1:07:53, 1.37it/s, loss=0.0026, lr=1.68e-05, step=4409] Training: 44%|████▍ | 4410/10000 [59:41<1:07:53, 1.37it/s, loss=0.0026, lr=1.68e-05, step=4410] Training: 44%|████▍ | 4411/10000 [59:41<1:01:43, 1.51it/s, loss=0.0026, lr=1.68e-05, step=4410] Training: 44%|████▍ | 4411/10000 [59:41<1:01:43, 1.51it/s, loss=0.0158, lr=1.68e-05, step=4411] Training: 44%|████▍ | 4412/10000 [59:42<56:50, 1.64it/s, loss=0.0158, lr=1.68e-05, step=4411] Training: 44%|████▍ | 4412/10000 [59:42<56:50, 1.64it/s, loss=0.0041, lr=1.68e-05, step=4412] Training: 44%|████▍ | 4413/10000 [59:42<57:44, 1.61it/s, loss=0.0041, lr=1.68e-05, step=4412] Training: 44%|████▍ | 4413/10000 [59:42<57:44, 1.61it/s, loss=0.0040, lr=1.68e-05, step=4413] Training: 44%|████▍ | 4414/10000 [59:43<1:08:35, 1.36it/s, loss=0.0040, lr=1.68e-05, step=4413] Training: 44%|████▍ | 4414/10000 [59:43<1:08:35, 1.36it/s, loss=0.0154, lr=1.68e-05, step=4414] Training: 44%|████▍ | 4415/10000 [59:44<1:16:45, 1.21it/s, loss=0.0154, lr=1.68e-05, step=4414] Training: 44%|████▍ | 4415/10000 [59:44<1:16:45, 1.21it/s, loss=0.0674, lr=1.68e-05, step=4415] Training: 44%|████▍ | 4416/10000 [59:45<1:16:20, 1.22it/s, loss=0.0674, lr=1.68e-05, step=4415] Training: 44%|████▍ | 4416/10000 [59:45<1:16:20, 1.22it/s, loss=0.0034, lr=1.68e-05, step=4416] Training: 44%|████▍ | 4417/10000 [59:46<1:10:50, 1.31it/s, loss=0.0034, lr=1.68e-05, step=4416] Training: 44%|████▍ | 4417/10000 [59:46<1:10:50, 1.31it/s, loss=0.0052, lr=1.68e-05, step=4417] Training: 44%|████▍ | 4418/10000 [59:46<1:05:59, 1.41it/s, loss=0.0052, lr=1.68e-05, step=4417] Training: 44%|████▍ | 4418/10000 [59:46<1:05:59, 1.41it/s, loss=0.0256, lr=1.68e-05, step=4418] Training: 44%|████▍ | 4419/10000 [59:47<1:05:50, 1.41it/s, loss=0.0256, lr=1.68e-05, step=4418] Training: 44%|████▍ | 4419/10000 [59:47<1:05:50, 1.41it/s, loss=0.0149, lr=1.68e-05, step=4419]19:44:19.776 [I] step=4420 loss=0.0073 smoothed_loss=0.0146 lr=1.68e-05 grad_norm=0.4410 step_time=0.5564s data_time=0.1358s it/s=1.445 eta_to_10000=3861.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1191 grad_arm_token_fuse=0.0473 grad_shared_expert=0.3218 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4420/10000 [59:47<1:00:30, 1.54it/s, loss=0.0149, lr=1.68e-05, step=4419] Training: 44%|████▍ | 4420/10000 [59:47<1:00:30, 1.54it/s, loss=0.0073, lr=1.68e-05, step=4420] Training: 44%|████▍ | 4421/10000 [59:48<1:05:38, 1.42it/s, loss=0.0073, lr=1.68e-05, step=4420] Training: 44%|████▍ | 4421/10000 [59:48<1:05:38, 1.42it/s, loss=0.0141, lr=1.68e-05, step=4421] Training: 44%|████▍ | 4422/10000 [59:49<1:12:54, 1.27it/s, loss=0.0141, lr=1.68e-05, step=4421] Training: 44%|████▍ | 4422/10000 [59:49<1:12:54, 1.27it/s, loss=0.0022, lr=1.68e-05, step=4422] Training: 44%|████▍ | 4423/10000 [59:50<1:04:50, 1.43it/s, loss=0.0022, lr=1.68e-05, step=4422] Training: 44%|████▍ | 4423/10000 [59:50<1:04:50, 1.43it/s, loss=0.0093, lr=1.68e-05, step=4423] Training: 44%|████▍ | 4424/10000 [59:50<1:02:49, 1.48it/s, loss=0.0093, lr=1.68e-05, step=4423] Training: 44%|████▍ | 4424/10000 [59:50<1:02:49, 1.48it/s, loss=0.0095, lr=1.68e-05, step=4424] Training: 44%|████▍ | 4425/10000 [59:51<57:03, 1.63it/s, loss=0.0095, lr=1.68e-05, step=4424] Training: 44%|████▍ | 4425/10000 [59:51<57:03, 1.63it/s, loss=0.0111, lr=1.68e-05, step=4425] Training: 44%|████▍ | 4426/10000 [59:51<53:18, 1.74it/s, loss=0.0111, lr=1.68e-05, step=4425] Training: 44%|████▍ | 4426/10000 [59:51<53:18, 1.74it/s, loss=0.0070, lr=1.68e-05, step=4426] Training: 44%|████▍ | 4427/10000 [59:52<57:23, 1.62it/s, loss=0.0070, lr=1.68e-05, step=4426] Training: 44%|████▍ | 4427/10000 [59:52<57:23, 1.62it/s, loss=0.0065, lr=1.68e-05, step=4427] Training: 44%|████▍ | 4428/10000 [59:53<1:08:31, 1.36it/s, loss=0.0065, lr=1.68e-05, step=4427] Training: 44%|████▍ | 4428/10000 [59:53<1:08:31, 1.36it/s, loss=0.0133, lr=1.68e-05, step=4428] Training: 44%|████▍ | 4429/10000 [59:54<1:15:38, 1.23it/s, loss=0.0133, lr=1.68e-05, step=4428] Training: 44%|████▍ | 4429/10000 [59:54<1:15:38, 1.23it/s, loss=0.0260, lr=1.68e-05, step=4429]19:44:27.285 [I] step=4430 loss=0.0091 smoothed_loss=0.0126 lr=1.68e-05 grad_norm=0.4072 step_time=0.6030s data_time=0.1479s it/s=1.332 eta_to_10000=4182.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.0949 grad_arm_token_fuse=0.0451 grad_shared_expert=0.3705 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4430/10000 [59:55<1:18:07, 1.19it/s, loss=0.0260, lr=1.68e-05, step=4429] Training: 44%|████▍ | 4430/10000 [59:55<1:18:07, 1.19it/s, loss=0.0091, lr=1.68e-05, step=4430] Training: 44%|████▍ | 4431/10000 [59:56<1:14:18, 1.25it/s, loss=0.0091, lr=1.68e-05, step=4430] Training: 44%|████▍ | 4431/10000 [59:56<1:14:18, 1.25it/s, loss=0.0197, lr=1.68e-05, step=4431] Training: 44%|████▍ | 4432/10000 [59:57<1:16:02, 1.22it/s, loss=0.0197, lr=1.68e-05, step=4431] Training: 44%|████▍ | 4432/10000 [59:57<1:16:02, 1.22it/s, loss=0.0120, lr=1.68e-05, step=4432] Training: 44%|████▍ | 4433/10000 [59:57<1:12:31, 1.28it/s, loss=0.0120, lr=1.68e-05, step=4432] Training: 44%|████▍ | 4433/10000 [59:57<1:12:31, 1.28it/s, loss=0.0154, lr=1.68e-05, step=4433] Training: 44%|████▍ | 4434/10000 [59:58<1:13:10, 1.27it/s, loss=0.0154, lr=1.68e-05, step=4433] Training: 44%|████▍ | 4434/10000 [59:58<1:13:10, 1.27it/s, loss=0.0120, lr=1.68e-05, step=4434] Training: 44%|████▍ | 4435/10000 [59:59<1:11:53, 1.29it/s, loss=0.0120, lr=1.68e-05, step=4434] Training: 44%|████▍ | 4435/10000 [59:59<1:11:53, 1.29it/s, loss=0.0042, lr=1.67e-05, step=4435] Training: 44%|████▍ | 4436/10000 [1:00:00<1:17:30, 1.20it/s, loss=0.0042, lr=1.67e-05, step=4435] Training: 44%|████▍ | 4436/10000 [1:00:00<1:17:30, 1.20it/s, loss=0.0120, lr=1.67e-05, step=4436] Training: 44%|████▍ | 4437/10000 [1:00:01<1:15:19, 1.23it/s, loss=0.0120, lr=1.67e-05, step=4436] Training: 44%|████▍ | 4437/10000 [1:00:01<1:15:19, 1.23it/s, loss=0.0238, lr=1.67e-05, step=4437] Training: 44%|████▍ | 4438/10000 [1:00:01<1:18:28, 1.18it/s, loss=0.0238, lr=1.67e-05, step=4437] Training: 44%|████▍ | 4438/10000 [1:00:01<1:18:28, 1.18it/s, loss=0.0187, lr=1.67e-05, step=4438] Training: 44%|████▍ | 4439/10000 [1:00:02<1:08:17, 1.36it/s, loss=0.0187, lr=1.67e-05, step=4438] Training: 44%|████▍ | 4439/10000 [1:00:02<1:08:17, 1.36it/s, loss=0.0063, lr=1.67e-05, step=4439]19:44:35.082 [I] step=4440 loss=0.0103 smoothed_loss=0.0129 lr=1.67e-05 grad_norm=0.4057 step_time=0.6150s data_time=0.1647s it/s=1.283 eta_to_10000=4334.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0065 grad_action_out_proj_arms=0.0924 grad_arm_token_fuse=0.0296 grad_shared_expert=0.3068 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4440/10000 [1:00:03<1:11:15, 1.30it/s, loss=0.0063, lr=1.67e-05, step=4439] Training: 44%|████▍ | 4440/10000 [1:00:03<1:11:15, 1.30it/s, loss=0.0103, lr=1.67e-05, step=4440] Training: 44%|████▍ | 4441/10000 [1:00:04<1:14:04, 1.25it/s, loss=0.0103, lr=1.67e-05, step=4440] Training: 44%|████▍ | 4441/10000 [1:00:04<1:14:04, 1.25it/s, loss=0.0104, lr=1.67e-05, step=4441] Training: 44%|████▍ | 4442/10000 [1:00:04<1:09:11, 1.34it/s, loss=0.0104, lr=1.67e-05, step=4441] Training: 44%|████▍ | 4442/10000 [1:00:04<1:09:11, 1.34it/s, loss=0.0198, lr=1.67e-05, step=4442] Training: 44%|████▍ | 4443/10000 [1:00:05<1:20:52, 1.15it/s, loss=0.0198, lr=1.67e-05, step=4442] Training: 44%|████▍ | 4443/10000 [1:00:05<1:20:52, 1.15it/s, loss=0.0036, lr=1.67e-05, step=4443] Training: 44%|████▍ | 4444/10000 [1:00:06<1:14:44, 1.24it/s, loss=0.0036, lr=1.67e-05, step=4443] Training: 44%|████▍ | 4444/10000 [1:00:06<1:14:44, 1.24it/s, loss=0.0306, lr=1.67e-05, step=4444] Training: 44%|████▍ | 4445/10000 [1:00:07<1:14:29, 1.24it/s, loss=0.0306, lr=1.67e-05, step=4444] Training: 44%|████▍ | 4445/10000 [1:00:07<1:14:29, 1.24it/s, loss=0.0088, lr=1.67e-05, step=4445] Training: 44%|████▍ | 4446/10000 [1:00:08<1:16:10, 1.22it/s, loss=0.0088, lr=1.67e-05, step=4445] Training: 44%|████▍ | 4446/10000 [1:00:08<1:16:10, 1.22it/s, loss=0.0075, lr=1.67e-05, step=4446] Training: 44%|████▍ | 4447/10000 [1:00:09<1:17:55, 1.19it/s, loss=0.0075, lr=1.67e-05, step=4446] Training: 44%|████▍ | 4447/10000 [1:00:09<1:17:55, 1.19it/s, loss=0.0093, lr=1.67e-05, step=4447] Training: 44%|████▍ | 4448/10000 [1:00:09<1:11:56, 1.29it/s, loss=0.0093, lr=1.67e-05, step=4447] Training: 44%|████▍ | 4448/10000 [1:00:09<1:11:56, 1.29it/s, loss=0.0285, lr=1.67e-05, step=4448] Training: 44%|████▍ | 4449/10000 [1:00:10<1:11:17, 1.30it/s, loss=0.0285, lr=1.67e-05, step=4448] Training: 44%|████▍ | 4449/10000 [1:00:10<1:11:17, 1.30it/s, loss=0.0131, lr=1.67e-05, step=4449]19:44:43.367 [I] step=4450 loss=0.0063 smoothed_loss=0.0134 lr=1.67e-05 grad_norm=0.4093 step_time=0.6369s data_time=0.1916s it/s=1.207 eta_to_10000=4597.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0087 grad_action_out_proj_arms=0.0826 grad_arm_token_fuse=0.0428 grad_shared_expert=0.2996 (18633:train_pytorch.py:850) + Training: 44%|████▍ | 4450/10000 [1:00:11<1:18:40, 1.18it/s, loss=0.0131, lr=1.67e-05, step=4449] Training: 44%|████▍ | 4450/10000 [1:00:11<1:18:40, 1.18it/s, loss=0.0063, lr=1.67e-05, step=4450] Training: 45%|████▍ | 4451/10000 [1:00:12<1:12:29, 1.28it/s, loss=0.0063, lr=1.67e-05, step=4450] Training: 45%|████▍ | 4451/10000 [1:00:12<1:12:29, 1.28it/s, loss=0.0167, lr=1.67e-05, step=4451] Training: 45%|████▍ | 4452/10000 [1:00:13<1:14:18, 1.24it/s, loss=0.0167, lr=1.67e-05, step=4451] Training: 45%|████▍ | 4452/10000 [1:00:13<1:14:18, 1.24it/s, loss=0.0111, lr=1.67e-05, step=4452] Training: 45%|████▍ | 4453/10000 [1:00:13<1:10:40, 1.31it/s, loss=0.0111, lr=1.67e-05, step=4452] Training: 45%|████▍ | 4453/10000 [1:00:13<1:10:40, 1.31it/s, loss=0.0166, lr=1.67e-05, step=4453] Training: 45%|████▍ | 4454/10000 [1:00:14<1:12:40, 1.27it/s, loss=0.0166, lr=1.67e-05, step=4453] Training: 45%|████▍ | 4454/10000 [1:00:14<1:12:40, 1.27it/s, loss=0.0113, lr=1.67e-05, step=4454] Training: 45%|████▍ | 4455/10000 [1:00:15<1:10:57, 1.30it/s, loss=0.0113, lr=1.67e-05, step=4454] Training: 45%|████▍ | 4455/10000 [1:00:15<1:10:57, 1.30it/s, loss=0.0081, lr=1.67e-05, step=4455] Training: 45%|████▍ | 4456/10000 [1:00:16<1:11:31, 1.29it/s, loss=0.0081, lr=1.67e-05, step=4455] Training: 45%|████▍ | 4456/10000 [1:00:16<1:11:31, 1.29it/s, loss=0.0138, lr=1.67e-05, step=4456] Training: 45%|████▍ | 4457/10000 [1:00:17<1:18:30, 1.18it/s, loss=0.0138, lr=1.67e-05, step=4456] Training: 45%|████▍ | 4457/10000 [1:00:17<1:18:30, 1.18it/s, loss=0.0065, lr=1.67e-05, step=4457] Training: 45%|████▍ | 4458/10000 [1:00:17<1:17:12, 1.20it/s, loss=0.0065, lr=1.67e-05, step=4457] Training: 45%|████▍ | 4458/10000 [1:00:17<1:17:12, 1.20it/s, loss=0.0023, lr=1.67e-05, step=4458] Training: 45%|████▍ | 4459/10000 [1:00:18<1:16:44, 1.20it/s, loss=0.0023, lr=1.67e-05, step=4458] Training: 45%|████▍ | 4459/10000 [1:00:18<1:16:44, 1.20it/s, loss=0.0107, lr=1.67e-05, step=4459]19:44:51.635 [I] step=4460 loss=0.0058 smoothed_loss=0.0108 lr=1.67e-05 grad_norm=0.3703 step_time=0.6409s data_time=0.1859s it/s=1.210 eta_to_10000=4580.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0054 grad_action_out_proj_arms=0.0672 grad_arm_token_fuse=0.0276 grad_shared_expert=0.2277 (18633:train_pytorch.py:850) + Training: 45%|████▍ | 4460/10000 [1:00:19<1:24:39, 1.09it/s, loss=0.0107, lr=1.67e-05, step=4459] Training: 45%|████▍ | 4460/10000 [1:00:19<1:24:39, 1.09it/s, loss=0.0058, lr=1.67e-05, step=4460] Training: 45%|████▍ | 4461/10000 [1:00:20<1:19:38, 1.16it/s, loss=0.0058, lr=1.67e-05, step=4460] Training: 45%|████▍ | 4461/10000 [1:00:20<1:19:38, 1.16it/s, loss=0.0045, lr=1.67e-05, step=4461] Training: 45%|████▍ | 4462/10000 [1:00:21<1:22:16, 1.12it/s, loss=0.0045, lr=1.67e-05, step=4461] Training: 45%|████▍ | 4462/10000 [1:00:21<1:22:16, 1.12it/s, loss=0.0052, lr=1.67e-05, step=4462] Training: 45%|████▍ | 4463/10000 [1:00:22<1:21:24, 1.13it/s, loss=0.0052, lr=1.67e-05, step=4462] Training: 45%|████▍ | 4463/10000 [1:00:22<1:21:24, 1.13it/s, loss=0.0164, lr=1.66e-05, step=4463] Training: 45%|████▍ | 4464/10000 [1:00:23<1:24:45, 1.09it/s, loss=0.0164, lr=1.66e-05, step=4463] Training: 45%|████▍ | 4464/10000 [1:00:23<1:24:45, 1.09it/s, loss=0.0134, lr=1.66e-05, step=4464] Training: 45%|████▍ | 4465/10000 [1:00:24<1:30:28, 1.02it/s, loss=0.0134, lr=1.66e-05, step=4464] Training: 45%|████▍ | 4465/10000 [1:00:24<1:30:28, 1.02it/s, loss=0.0020, lr=1.66e-05, step=4465] Training: 45%|████▍ | 4466/10000 [1:00:25<1:25:16, 1.08it/s, loss=0.0020, lr=1.66e-05, step=4465] Training: 45%|████▍ | 4466/10000 [1:00:25<1:25:16, 1.08it/s, loss=0.0568, lr=1.66e-05, step=4466] Training: 45%|████▍ | 4467/10000 [1:00:25<1:16:54, 1.20it/s, loss=0.0568, lr=1.66e-05, step=4466] Training: 45%|████▍ | 4467/10000 [1:00:25<1:16:54, 1.20it/s, loss=0.0091, lr=1.66e-05, step=4467] Training: 45%|████▍ | 4468/10000 [1:00:26<1:17:57, 1.18it/s, loss=0.0091, lr=1.66e-05, step=4467] Training: 45%|████▍ | 4468/10000 [1:00:26<1:17:57, 1.18it/s, loss=0.0118, lr=1.66e-05, step=4468] Training: 45%|████▍ | 4469/10000 [1:00:27<1:08:20, 1.35it/s, loss=0.0118, lr=1.66e-05, step=4468] Training: 45%|████▍ | 4469/10000 [1:00:27<1:08:20, 1.35it/s, loss=0.0080, lr=1.66e-05, step=4469]19:44:59.668 [I] step=4470 loss=0.0149 smoothed_loss=0.0133 lr=1.66e-05 grad_norm=0.5143 step_time=0.6361s data_time=0.1672s it/s=1.245 eta_to_10000=4441.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0278 grad_action_out_proj_arms=0.1789 grad_arm_token_fuse=0.1329 grad_shared_expert=0.4627 (18633:train_pytorch.py:850) + Training: 45%|████▍ | 4470/10000 [1:00:27<1:03:22, 1.45it/s, loss=0.0080, lr=1.66e-05, step=4469] Training: 45%|████▍ | 4470/10000 [1:00:27<1:03:22, 1.45it/s, loss=0.0149, lr=1.66e-05, step=4470] Training: 45%|████▍ | 4471/10000 [1:00:28<1:10:53, 1.30it/s, loss=0.0149, lr=1.66e-05, step=4470] Training: 45%|████▍ | 4471/10000 [1:00:28<1:10:53, 1.30it/s, loss=0.1111, lr=1.66e-05, step=4471] Training: 45%|████▍ | 4472/10000 [1:00:29<1:10:00, 1.32it/s, loss=0.1111, lr=1.66e-05, step=4471] Training: 45%|████▍ | 4472/10000 [1:00:29<1:10:00, 1.32it/s, loss=0.0039, lr=1.66e-05, step=4472] Training: 45%|████▍ | 4473/10000 [1:00:30<1:02:40, 1.47it/s, loss=0.0039, lr=1.66e-05, step=4472] Training: 45%|████▍ | 4473/10000 [1:00:30<1:02:40, 1.47it/s, loss=0.0080, lr=1.66e-05, step=4473] Training: 45%|████▍ | 4474/10000 [1:00:31<1:11:47, 1.28it/s, loss=0.0080, lr=1.66e-05, step=4473] Training: 45%|████▍ | 4474/10000 [1:00:31<1:11:47, 1.28it/s, loss=0.0054, lr=1.66e-05, step=4474] Training: 45%|████▍ | 4475/10000 [1:00:31<1:08:09, 1.35it/s, loss=0.0054, lr=1.66e-05, step=4474] Training: 45%|████▍ | 4475/10000 [1:00:31<1:08:09, 1.35it/s, loss=0.0215, lr=1.66e-05, step=4475] Training: 45%|████▍ | 4476/10000 [1:00:32<1:02:21, 1.48it/s, loss=0.0215, lr=1.66e-05, step=4475] Training: 45%|████▍ | 4476/10000 [1:00:32<1:02:21, 1.48it/s, loss=0.0054, lr=1.66e-05, step=4476] Training: 45%|████▍ | 4477/10000 [1:00:33<1:06:15, 1.39it/s, loss=0.0054, lr=1.66e-05, step=4476] Training: 45%|████▍ | 4477/10000 [1:00:33<1:06:15, 1.39it/s, loss=0.0308, lr=1.66e-05, step=4477] Training: 45%|████▍ | 4478/10000 [1:00:34<1:15:31, 1.22it/s, loss=0.0308, lr=1.66e-05, step=4477] Training: 45%|████▍ | 4478/10000 [1:00:34<1:15:31, 1.22it/s, loss=0.0127, lr=1.66e-05, step=4478] Training: 45%|████▍ | 4479/10000 [1:00:35<1:25:38, 1.07it/s, loss=0.0127, lr=1.66e-05, step=4478] Training: 45%|████▍ | 4479/10000 [1:00:35<1:25:38, 1.07it/s, loss=0.0117, lr=1.66e-05, step=4479]19:45:08.055 [I] step=4480 loss=0.0118 smoothed_loss=0.0169 lr=1.66e-05 grad_norm=0.5111 step_time=0.6752s data_time=0.1635s it/s=1.193 eta_to_10000=4628.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0085 grad_action_out_proj_arms=0.1233 grad_arm_token_fuse=0.0427 grad_shared_expert=0.4176 (18633:train_pytorch.py:850) + Training: 45%|████▍ | 4480/10000 [1:00:36<1:25:55, 1.07it/s, loss=0.0117, lr=1.66e-05, step=4479] Training: 45%|████▍ | 4480/10000 [1:00:36<1:25:55, 1.07it/s, loss=0.0118, lr=1.66e-05, step=4480] Training: 45%|████▍ | 4481/10000 [1:00:36<1:21:03, 1.13it/s, loss=0.0118, lr=1.66e-05, step=4480] Training: 45%|████▍ | 4481/10000 [1:00:36<1:21:03, 1.13it/s, loss=0.0074, lr=1.66e-05, step=4481] Training: 45%|████▍ | 4482/10000 [1:00:37<1:17:39, 1.18it/s, loss=0.0074, lr=1.66e-05, step=4481] Training: 45%|████▍ | 4482/10000 [1:00:37<1:17:39, 1.18it/s, loss=0.0314, lr=1.66e-05, step=4482] Training: 45%|████▍ | 4483/10000 [1:00:38<1:13:56, 1.24it/s, loss=0.0314, lr=1.66e-05, step=4482] Training: 45%|████▍ | 4483/10000 [1:00:38<1:13:56, 1.24it/s, loss=0.0102, lr=1.66e-05, step=4483] Training: 45%|████▍ | 4484/10000 [1:00:38<1:06:18, 1.39it/s, loss=0.0102, lr=1.66e-05, step=4483] Training: 45%|████▍ | 4484/10000 [1:00:38<1:06:18, 1.39it/s, loss=0.0511, lr=1.66e-05, step=4484] Training: 45%|████▍ | 4485/10000 [1:00:39<1:00:44, 1.51it/s, loss=0.0511, lr=1.66e-05, step=4484] Training: 45%|████▍ | 4485/10000 [1:00:39<1:00:44, 1.51it/s, loss=0.0094, lr=1.66e-05, step=4485] Training: 45%|████▍ | 4486/10000 [1:00:40<1:10:22, 1.31it/s, loss=0.0094, lr=1.66e-05, step=4485] Training: 45%|████▍ | 4486/10000 [1:00:40<1:10:22, 1.31it/s, loss=0.0115, lr=1.66e-05, step=4486] Training: 45%|████▍ | 4487/10000 [1:00:41<1:11:49, 1.28it/s, loss=0.0115, lr=1.66e-05, step=4486] Training: 45%|████▍ | 4487/10000 [1:00:41<1:11:49, 1.28it/s, loss=0.0114, lr=1.66e-05, step=4487] Training: 45%|████▍ | 4488/10000 [1:00:41<1:08:21, 1.34it/s, loss=0.0114, lr=1.66e-05, step=4487] Training: 45%|████▍ | 4488/10000 [1:00:41<1:08:21, 1.34it/s, loss=0.0067, lr=1.66e-05, step=4488] Training: 45%|████▍ | 4489/10000 [1:00:42<1:08:15, 1.35it/s, loss=0.0067, lr=1.66e-05, step=4488] Training: 45%|████▍ | 4489/10000 [1:00:42<1:08:15, 1.35it/s, loss=0.0357, lr=1.66e-05, step=4489]19:45:15.122 [I] step=4490 loss=0.0291 smoothed_loss=0.0196 lr=1.66e-05 grad_norm=0.4771 step_time=0.5722s data_time=0.1345s it/s=1.415 eta_to_10000=3893.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0863 grad_arm_token_fuse=0.0368 grad_shared_expert=0.4087 (18633:train_pytorch.py:850) + Training: 45%|████▍ | 4490/10000 [1:00:43<1:03:21, 1.45it/s, loss=0.0357, lr=1.66e-05, step=4489] Training: 45%|████▍ | 4490/10000 [1:00:43<1:03:21, 1.45it/s, loss=0.0291, lr=1.66e-05, step=4490] Training: 45%|████▍ | 4491/10000 [1:00:43<59:07, 1.55it/s, loss=0.0291, lr=1.66e-05, step=4490] Training: 45%|████▍ | 4491/10000 [1:00:43<59:07, 1.55it/s, loss=0.0177, lr=1.65e-05, step=4491] Training: 45%|████▍ | 4492/10000 [1:00:44<1:01:11, 1.50it/s, loss=0.0177, lr=1.65e-05, step=4491] Training: 45%|████▍ | 4492/10000 [1:00:44<1:01:11, 1.50it/s, loss=0.0055, lr=1.65e-05, step=4492] Training: 45%|████▍ | 4493/10000 [1:00:45<1:10:07, 1.31it/s, loss=0.0055, lr=1.65e-05, step=4492] Training: 45%|████▍ | 4493/10000 [1:00:45<1:10:07, 1.31it/s, loss=0.0115, lr=1.65e-05, step=4493] Training: 45%|████▍ | 4494/10000 [1:00:46<1:02:31, 1.47it/s, loss=0.0115, lr=1.65e-05, step=4493] Training: 45%|████▍ | 4494/10000 [1:00:46<1:02:31, 1.47it/s, loss=0.0062, lr=1.65e-05, step=4494] Training: 45%|████▍ | 4495/10000 [1:00:46<1:01:27, 1.49it/s, loss=0.0062, lr=1.65e-05, step=4494] Training: 45%|████▍ | 4495/10000 [1:00:46<1:01:27, 1.49it/s, loss=0.0022, lr=1.65e-05, step=4495] Training: 45%|████▍ | 4496/10000 [1:00:47<56:42, 1.62it/s, loss=0.0022, lr=1.65e-05, step=4495] Training: 45%|████▍ | 4496/10000 [1:00:47<56:42, 1.62it/s, loss=0.0058, lr=1.65e-05, step=4496] Training: 45%|████▍ | 4497/10000 [1:00:47<58:57, 1.56it/s, loss=0.0058, lr=1.65e-05, step=4496] Training: 45%|████▍ | 4497/10000 [1:00:47<58:57, 1.56it/s, loss=0.0431, lr=1.65e-05, step=4497] Training: 45%|████▍ | 4498/10000 [1:00:48<59:59, 1.53it/s, loss=0.0431, lr=1.65e-05, step=4497] Training: 45%|████▍ | 4498/10000 [1:00:48<59:59, 1.53it/s, loss=0.0129, lr=1.65e-05, step=4498] Training: 45%|████▍ | 4499/10000 [1:00:49<55:23, 1.66it/s, loss=0.0129, lr=1.65e-05, step=4498] Training: 45%|████▍ | 4499/10000 [1:00:49<55:23, 1.66it/s, loss=0.0178, lr=1.65e-05, step=4499]19:45:21.680 [I] step=4500 loss=0.0090 smoothed_loss=0.0158 lr=1.65e-05 grad_norm=0.4425 step_time=0.5544s data_time=0.1014s it/s=1.525 eta_to_10000=3606.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.1076 grad_arm_token_fuse=0.0621 grad_shared_expert=0.4395 (18633:train_pytorch.py:850) + Training: 45%|████▌ | 4500/10000 [1:00:49<1:01:07, 1.50it/s, loss=0.0178, lr=1.65e-05, step=4499] Training: 45%|████▌ | 4500/10000 [1:00:49<1:01:07, 1.50it/s, loss=0.0090, lr=1.65e-05, step=4500] Training: 45%|████▌ | 4501/10000 [1:00:50<56:11, 1.63it/s, loss=0.0090, lr=1.65e-05, step=4500] Training: 45%|████▌ | 4501/10000 [1:00:50<56:11, 1.63it/s, loss=0.0109, lr=1.65e-05, step=4501] Training: 45%|████▌ | 4502/10000 [1:00:51<1:01:56, 1.48it/s, loss=0.0109, lr=1.65e-05, step=4501] Training: 45%|████▌ | 4502/10000 [1:00:51<1:01:56, 1.48it/s, loss=0.0092, lr=1.65e-05, step=4502] Training: 45%|████▌ | 4503/10000 [1:00:51<1:00:10, 1.52it/s, loss=0.0092, lr=1.65e-05, step=4502] Training: 45%|████▌ | 4503/10000 [1:00:51<1:00:10, 1.52it/s, loss=0.1522, lr=1.65e-05, step=4503] Training: 45%|████▌ | 4504/10000 [1:00:52<1:01:09, 1.50it/s, loss=0.1522, lr=1.65e-05, step=4503] Training: 45%|████▌ | 4504/10000 [1:00:52<1:01:09, 1.50it/s, loss=0.0216, lr=1.65e-05, step=4504] Training: 45%|████▌ | 4505/10000 [1:00:53<1:00:18, 1.52it/s, loss=0.0216, lr=1.65e-05, step=4504] Training: 45%|████▌ | 4505/10000 [1:00:53<1:00:18, 1.52it/s, loss=0.0034, lr=1.65e-05, step=4505] Training: 45%|████▌ | 4506/10000 [1:00:53<55:55, 1.64it/s, loss=0.0034, lr=1.65e-05, step=4505] Training: 45%|████▌ | 4506/10000 [1:00:53<55:55, 1.64it/s, loss=0.0046, lr=1.65e-05, step=4506] Training: 45%|████▌ | 4507/10000 [1:00:54<59:34, 1.54it/s, loss=0.0046, lr=1.65e-05, step=4506] Training: 45%|████▌ | 4507/10000 [1:00:54<59:34, 1.54it/s, loss=0.0059, lr=1.65e-05, step=4507] Training: 45%|████▌ | 4508/10000 [1:00:54<59:04, 1.55it/s, loss=0.0059, lr=1.65e-05, step=4507] Training: 45%|████▌ | 4508/10000 [1:00:54<59:04, 1.55it/s, loss=0.0262, lr=1.65e-05, step=4508] Training: 45%|████▌ | 4509/10000 [1:00:55<59:26, 1.54it/s, loss=0.0262, lr=1.65e-05, step=4508] Training: 45%|████▌ | 4509/10000 [1:00:55<59:26, 1.54it/s, loss=0.0087, lr=1.65e-05, step=4509]19:45:27.999 [I] step=4510 loss=0.0337 smoothed_loss=0.0220 lr=1.65e-05 grad_norm=0.4676 step_time=0.5202s data_time=0.1118s it/s=1.583 eta_to_10000=3468.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1262 grad_arm_token_fuse=0.0539 grad_shared_expert=0.4451 (18633:train_pytorch.py:850) + Training: 45%|████▌ | 4510/10000 [1:00:56<56:10, 1.63it/s, loss=0.0087, lr=1.65e-05, step=4509] Training: 45%|████▌ | 4510/10000 [1:00:56<56:10, 1.63it/s, loss=0.0337, lr=1.65e-05, step=4510] Training: 45%|████▌ | 4511/10000 [1:00:56<52:46, 1.73it/s, loss=0.0337, lr=1.65e-05, step=4510] Training: 45%|████▌ | 4511/10000 [1:00:56<52:46, 1.73it/s, loss=0.0043, lr=1.65e-05, step=4511] Training: 45%|████▌ | 4512/10000 [1:00:57<50:36, 1.81it/s, loss=0.0043, lr=1.65e-05, step=4511] Training: 45%|████▌ | 4512/10000 [1:00:57<50:36, 1.81it/s, loss=0.0548, lr=1.65e-05, step=4512] Training: 45%|████▌ | 4513/10000 [1:00:57<48:32, 1.88it/s, loss=0.0548, lr=1.65e-05, step=4512] Training: 45%|████▌ | 4513/10000 [1:00:57<48:32, 1.88it/s, loss=0.0043, lr=1.65e-05, step=4513] Training: 45%|████▌ | 4514/10000 [1:00:58<53:21, 1.71it/s, loss=0.0043, lr=1.65e-05, step=4513] Training: 45%|████▌ | 4514/10000 [1:00:58<53:21, 1.71it/s, loss=0.0063, lr=1.65e-05, step=4514] Training: 45%|████▌ | 4515/10000 [1:00:59<1:04:50, 1.41it/s, loss=0.0063, lr=1.65e-05, step=4514] Training: 45%|████▌ | 4515/10000 [1:00:59<1:04:50, 1.41it/s, loss=0.0067, lr=1.65e-05, step=4515] Training: 45%|████▌ | 4516/10000 [1:01:00<1:05:52, 1.39it/s, loss=0.0067, lr=1.65e-05, step=4515] Training: 45%|████▌ | 4516/10000 [1:01:00<1:05:52, 1.39it/s, loss=0.0061, lr=1.65e-05, step=4516] Training: 45%|████▌ | 4517/10000 [1:01:00<59:37, 1.53it/s, loss=0.0061, lr=1.65e-05, step=4516] Training: 45%|████▌ | 4517/10000 [1:01:00<59:37, 1.53it/s, loss=0.0135, lr=1.65e-05, step=4517] Training: 45%|████▌ | 4518/10000 [1:01:01<59:29, 1.54it/s, loss=0.0135, lr=1.65e-05, step=4517] Training: 45%|████▌ | 4518/10000 [1:01:01<59:29, 1.54it/s, loss=0.0138, lr=1.65e-05, step=4518] Training: 45%|████▌ | 4519/10000 [1:01:01<54:53, 1.66it/s, loss=0.0138, lr=1.65e-05, step=4518] Training: 45%|████▌ | 4519/10000 [1:01:01<54:53, 1.66it/s, loss=0.0130, lr=1.64e-05, step=4519]19:45:34.104 [I] step=4520 loss=0.0262 smoothed_loss=0.0174 lr=1.65e-05 grad_norm=0.5135 step_time=0.5261s data_time=0.0844s it/s=1.638 eta_to_10000=3344.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0127 grad_action_out_proj_arms=0.1878 grad_arm_token_fuse=0.0736 grad_shared_expert=0.4307 (18633:train_pytorch.py:850) + Training: 45%|████▌ | 4520/10000 [1:01:02<53:39, 1.70it/s, loss=0.0130, lr=1.64e-05, step=4519] Training: 45%|████▌ | 4520/10000 [1:01:02<53:39, 1.70it/s, loss=0.0262, lr=1.64e-05, step=4520] Training: 45%|████▌ | 4521/10000 [1:01:02<55:24, 1.65it/s, loss=0.0262, lr=1.64e-05, step=4520] Training: 45%|████▌ | 4521/10000 [1:01:02<55:24, 1.65it/s, loss=0.0079, lr=1.64e-05, step=4521] Training: 45%|████▌ | 4522/10000 [1:01:03<1:03:29, 1.44it/s, loss=0.0079, lr=1.64e-05, step=4521] Training: 45%|████▌ | 4522/10000 [1:01:03<1:03:29, 1.44it/s, loss=0.0456, lr=1.64e-05, step=4522] Training: 45%|████▌ | 4523/10000 [1:01:04<57:54, 1.58it/s, loss=0.0456, lr=1.64e-05, step=4522] Training: 45%|████▌ | 4523/10000 [1:01:04<57:54, 1.58it/s, loss=0.0083, lr=1.64e-05, step=4523] Training: 45%|████▌ | 4524/10000 [1:01:04<58:23, 1.56it/s, loss=0.0083, lr=1.64e-05, step=4523] Training: 45%|████▌ | 4524/10000 [1:01:04<58:23, 1.56it/s, loss=0.0174, lr=1.64e-05, step=4524] Training: 45%|████▌ | 4525/10000 [1:01:05<54:18, 1.68it/s, loss=0.0174, lr=1.64e-05, step=4524] Training: 45%|████▌ | 4525/10000 [1:01:05<54:18, 1.68it/s, loss=0.0061, lr=1.64e-05, step=4525] Training: 45%|████▌ | 4526/10000 [1:01:06<53:44, 1.70it/s, loss=0.0061, lr=1.64e-05, step=4525] Training: 45%|████▌ | 4526/10000 [1:01:06<53:44, 1.70it/s, loss=0.0092, lr=1.64e-05, step=4526] Training: 45%|████▌ | 4527/10000 [1:01:06<51:03, 1.79it/s, loss=0.0092, lr=1.64e-05, step=4526] Training: 45%|████▌ | 4527/10000 [1:01:06<51:03, 1.79it/s, loss=0.0064, lr=1.64e-05, step=4527] Training: 45%|████▌ | 4528/10000 [1:01:07<55:43, 1.64it/s, loss=0.0064, lr=1.64e-05, step=4527] Training: 45%|████▌ | 4528/10000 [1:01:07<55:43, 1.64it/s, loss=0.0557, lr=1.64e-05, step=4528] Training: 45%|████▌ | 4529/10000 [1:01:08<1:00:33, 1.51it/s, loss=0.0557, lr=1.64e-05, step=4528] Training: 45%|████▌ | 4529/10000 [1:01:08<1:00:33, 1.51it/s, loss=0.0017, lr=1.64e-05, step=4529]19:45:40.802 [I] step=4530 loss=0.0139 smoothed_loss=0.0172 lr=1.64e-05 grad_norm=0.4543 step_time=0.5793s data_time=0.0905s it/s=1.493 eta_to_10000=3663.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.1312 grad_arm_token_fuse=0.0729 grad_shared_expert=0.3869 (18633:train_pytorch.py:850) + Training: 45%|████▌ | 4530/10000 [1:01:08<1:07:38, 1.35it/s, loss=0.0017, lr=1.64e-05, step=4529] Training: 45%|████▌ | 4530/10000 [1:01:08<1:07:38, 1.35it/s, loss=0.0139, lr=1.64e-05, step=4530] Training: 45%|████▌ | 4531/10000 [1:01:09<1:06:04, 1.38it/s, loss=0.0139, lr=1.64e-05, step=4530] Training: 45%|████▌ | 4531/10000 [1:01:09<1:06:04, 1.38it/s, loss=0.0285, lr=1.64e-05, step=4531] Training: 45%|████▌ | 4532/10000 [1:01:10<59:38, 1.53it/s, loss=0.0285, lr=1.64e-05, step=4531] Training: 45%|████▌ | 4532/10000 [1:01:10<59:38, 1.53it/s, loss=0.0114, lr=1.64e-05, step=4532] Training: 45%|████▌ | 4533/10000 [1:01:10<55:16, 1.65it/s, loss=0.0114, lr=1.64e-05, step=4532] Training: 45%|████▌ | 4533/10000 [1:01:10<55:16, 1.65it/s, loss=0.0079, lr=1.64e-05, step=4533] Training: 45%|████▌ | 4534/10000 [1:01:11<52:02, 1.75it/s, loss=0.0079, lr=1.64e-05, step=4533] Training: 45%|████▌ | 4534/10000 [1:01:11<52:02, 1.75it/s, loss=0.0339, lr=1.64e-05, step=4534] Training: 45%|████▌ | 4535/10000 [1:01:11<49:58, 1.82it/s, loss=0.0339, lr=1.64e-05, step=4534] Training: 45%|████▌ | 4535/10000 [1:01:11<49:58, 1.82it/s, loss=0.0086, lr=1.64e-05, step=4535] Training: 45%|████▌ | 4536/10000 [1:01:12<55:13, 1.65it/s, loss=0.0086, lr=1.64e-05, step=4535] Training: 45%|████▌ | 4536/10000 [1:01:12<55:13, 1.65it/s, loss=0.0027, lr=1.64e-05, step=4536] Training: 45%|████▌ | 4537/10000 [1:01:13<1:02:31, 1.46it/s, loss=0.0027, lr=1.64e-05, step=4536] Training: 45%|████▌ | 4537/10000 [1:01:13<1:02:31, 1.46it/s, loss=0.0208, lr=1.64e-05, step=4537] Training: 45%|████▌ | 4538/10000 [1:01:13<57:03, 1.60it/s, loss=0.0208, lr=1.64e-05, step=4537] Training: 45%|████▌ | 4538/10000 [1:01:13<57:03, 1.60it/s, loss=0.0065, lr=1.64e-05, step=4538] Training: 45%|████▌ | 4539/10000 [1:01:14<59:00, 1.54it/s, loss=0.0065, lr=1.64e-05, step=4538] Training: 45%|████▌ | 4539/10000 [1:01:14<59:00, 1.54it/s, loss=0.0108, lr=1.64e-05, step=4539]19:45:46.778 [I] step=4540 loss=0.0052 smoothed_loss=0.0140 lr=1.64e-05 grad_norm=0.4322 step_time=0.5115s data_time=0.0861s it/s=1.674 eta_to_10000=3262.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.0954 grad_arm_token_fuse=0.0508 grad_shared_expert=0.2833 (18633:train_pytorch.py:850) + Training: 45%|████▌ | 4540/10000 [1:01:14<55:29, 1.64it/s, loss=0.0108, lr=1.64e-05, step=4539] Training: 45%|████▌ | 4540/10000 [1:01:14<55:29, 1.64it/s, loss=0.0052, lr=1.64e-05, step=4540] Training: 45%|████▌ | 4541/10000 [1:01:15<52:28, 1.73it/s, loss=0.0052, lr=1.64e-05, step=4540] Training: 45%|████▌ | 4541/10000 [1:01:15<52:28, 1.73it/s, loss=0.0260, lr=1.64e-05, step=4541] Training: 45%|████▌ | 4542/10000 [1:01:16<53:36, 1.70it/s, loss=0.0260, lr=1.64e-05, step=4541] Training: 45%|████▌ | 4542/10000 [1:01:16<53:36, 1.70it/s, loss=0.0582, lr=1.64e-05, step=4542] Training: 45%|████▌ | 4543/10000 [1:01:16<59:46, 1.52it/s, loss=0.0582, lr=1.64e-05, step=4542] Training: 45%|████▌ | 4543/10000 [1:01:16<59:46, 1.52it/s, loss=0.0179, lr=1.64e-05, step=4543] Training: 45%|████▌ | 4544/10000 [1:01:17<58:07, 1.56it/s, loss=0.0179, lr=1.64e-05, step=4543] Training: 45%|████▌ | 4544/10000 [1:01:17<58:07, 1.56it/s, loss=0.0115, lr=1.64e-05, step=4544] Training: 45%|████▌ | 4545/10000 [1:01:18<1:03:46, 1.43it/s, loss=0.0115, lr=1.64e-05, step=4544] Training: 45%|████▌ | 4545/10000 [1:01:18<1:03:46, 1.43it/s, loss=0.0192, lr=1.64e-05, step=4545] Training: 45%|████▌ | 4546/10000 [1:01:18<58:27, 1.56it/s, loss=0.0192, lr=1.64e-05, step=4545] Training: 45%|████▌ | 4546/10000 [1:01:18<58:27, 1.56it/s, loss=0.0087, lr=1.63e-05, step=4546] Training: 45%|████▌ | 4547/10000 [1:01:19<53:57, 1.68it/s, loss=0.0087, lr=1.63e-05, step=4546] Training: 45%|████▌ | 4547/10000 [1:01:19<53:57, 1.68it/s, loss=0.0115, lr=1.63e-05, step=4547] Training: 45%|████▌ | 4548/10000 [1:01:19<50:46, 1.79it/s, loss=0.0115, lr=1.63e-05, step=4547] Training: 45%|████▌ | 4548/10000 [1:01:19<50:46, 1.79it/s, loss=0.0052, lr=1.63e-05, step=4548] Training: 45%|████▌ | 4549/10000 [1:01:20<49:15, 1.84it/s, loss=0.0052, lr=1.63e-05, step=4548] Training: 45%|████▌ | 4549/10000 [1:01:20<49:15, 1.84it/s, loss=0.0180, lr=1.63e-05, step=4549]19:45:52.879 [I] step=4550 loss=0.0142 smoothed_loss=0.0159 lr=1.64e-05 grad_norm=0.4521 step_time=0.5265s data_time=0.0836s it/s=1.639 eta_to_10000=3324.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0890 grad_arm_token_fuse=0.0408 grad_shared_expert=0.3583 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4550/10000 [1:01:21<55:06, 1.65it/s, loss=0.0180, lr=1.63e-05, step=4549] Training: 46%|████▌ | 4550/10000 [1:01:21<55:06, 1.65it/s, loss=0.0142, lr=1.63e-05, step=4550] Training: 46%|████▌ | 4551/10000 [1:01:21<52:03, 1.74it/s, loss=0.0142, lr=1.63e-05, step=4550] Training: 46%|████▌ | 4551/10000 [1:01:21<52:03, 1.74it/s, loss=0.0145, lr=1.63e-05, step=4551] Training: 46%|████▌ | 4552/10000 [1:01:22<57:33, 1.58it/s, loss=0.0145, lr=1.63e-05, step=4551] Training: 46%|████▌ | 4552/10000 [1:01:22<57:33, 1.58it/s, loss=0.0053, lr=1.63e-05, step=4552] Training: 46%|████▌ | 4553/10000 [1:01:22<53:25, 1.70it/s, loss=0.0053, lr=1.63e-05, step=4552] Training: 46%|████▌ | 4553/10000 [1:01:22<53:25, 1.70it/s, loss=0.0058, lr=1.63e-05, step=4553] Training: 46%|████▌ | 4554/10000 [1:01:23<54:30, 1.67it/s, loss=0.0058, lr=1.63e-05, step=4553] Training: 46%|████▌ | 4554/10000 [1:01:23<54:30, 1.67it/s, loss=0.0059, lr=1.63e-05, step=4554] Training: 46%|████▌ | 4555/10000 [1:01:23<51:18, 1.77it/s, loss=0.0059, lr=1.63e-05, step=4554] Training: 46%|████▌ | 4555/10000 [1:01:23<51:18, 1.77it/s, loss=0.0042, lr=1.63e-05, step=4555] Training: 46%|████▌ | 4556/10000 [1:01:24<57:03, 1.59it/s, loss=0.0042, lr=1.63e-05, step=4555] Training: 46%|████▌ | 4556/10000 [1:01:24<57:03, 1.59it/s, loss=0.0110, lr=1.63e-05, step=4556] Training: 46%|████▌ | 4557/10000 [1:01:25<1:05:31, 1.38it/s, loss=0.0110, lr=1.63e-05, step=4556] Training: 46%|████▌ | 4557/10000 [1:01:25<1:05:31, 1.38it/s, loss=0.0052, lr=1.63e-05, step=4557] Training: 46%|████▌ | 4558/10000 [1:01:26<59:18, 1.53it/s, loss=0.0052, lr=1.63e-05, step=4557] Training: 46%|████▌ | 4558/10000 [1:01:26<59:18, 1.53it/s, loss=0.0166, lr=1.63e-05, step=4558] Training: 46%|████▌ | 4559/10000 [1:01:26<1:01:15, 1.48it/s, loss=0.0166, lr=1.63e-05, step=4558] Training: 46%|████▌ | 4559/10000 [1:01:26<1:01:15, 1.48it/s, loss=0.0069, lr=1.63e-05, step=4559]19:45:59.635 [I] step=4560 loss=0.0417 smoothed_loss=0.0144 lr=1.63e-05 grad_norm=0.3850 step_time=0.5497s data_time=0.1260s it/s=1.480 eta_to_10000=3675.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0180 grad_action_out_proj_arms=0.1682 grad_arm_token_fuse=0.0958 grad_shared_expert=0.4570 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4560/10000 [1:01:27<1:08:49, 1.32it/s, loss=0.0069, lr=1.63e-05, step=4559] Training: 46%|████▌ | 4560/10000 [1:01:27<1:08:49, 1.32it/s, loss=0.0417, lr=1.63e-05, step=4560] Training: 46%|████▌ | 4561/10000 [1:01:28<1:02:47, 1.44it/s, loss=0.0417, lr=1.63e-05, step=4560] Training: 46%|████▌ | 4561/10000 [1:01:28<1:02:47, 1.44it/s, loss=0.0077, lr=1.63e-05, step=4561] Training: 46%|████▌ | 4562/10000 [1:01:29<1:05:40, 1.38it/s, loss=0.0077, lr=1.63e-05, step=4561] Training: 46%|████▌ | 4562/10000 [1:01:29<1:05:40, 1.38it/s, loss=0.0084, lr=1.63e-05, step=4562] Training: 46%|████▌ | 4563/10000 [1:01:29<59:21, 1.53it/s, loss=0.0084, lr=1.63e-05, step=4562] Training: 46%|████▌ | 4563/10000 [1:01:29<59:21, 1.53it/s, loss=0.0052, lr=1.63e-05, step=4563] Training: 46%|████▌ | 4564/10000 [1:01:30<1:08:26, 1.32it/s, loss=0.0052, lr=1.63e-05, step=4563] Training: 46%|████▌ | 4564/10000 [1:01:30<1:08:26, 1.32it/s, loss=0.0082, lr=1.63e-05, step=4564] Training: 46%|████▌ | 4565/10000 [1:01:31<1:18:09, 1.16it/s, loss=0.0082, lr=1.63e-05, step=4564] Training: 46%|████▌ | 4565/10000 [1:01:31<1:18:09, 1.16it/s, loss=0.0089, lr=1.63e-05, step=4565] Training: 46%|████▌ | 4566/10000 [1:01:32<1:09:52, 1.30it/s, loss=0.0089, lr=1.63e-05, step=4565] Training: 46%|████▌ | 4566/10000 [1:01:32<1:09:52, 1.30it/s, loss=0.0300, lr=1.63e-05, step=4566] Training: 46%|████▌ | 4567/10000 [1:01:33<1:11:50, 1.26it/s, loss=0.0300, lr=1.63e-05, step=4566] Training: 46%|████▌ | 4567/10000 [1:01:33<1:11:50, 1.26it/s, loss=0.0094, lr=1.63e-05, step=4567] Training: 46%|████▌ | 4568/10000 [1:01:33<1:05:14, 1.39it/s, loss=0.0094, lr=1.63e-05, step=4567] Training: 46%|████▌ | 4568/10000 [1:01:33<1:05:14, 1.39it/s, loss=0.0111, lr=1.63e-05, step=4568] Training: 46%|████▌ | 4569/10000 [1:01:34<58:48, 1.54it/s, loss=0.0111, lr=1.63e-05, step=4568] Training: 46%|████▌ | 4569/10000 [1:01:34<58:48, 1.54it/s, loss=0.0042, lr=1.63e-05, step=4569]19:46:06.852 [I] step=4570 loss=0.0033 smoothed_loss=0.0111 lr=1.63e-05 grad_norm=0.4749 step_time=0.5923s data_time=0.1294s it/s=1.386 eta_to_10000=3918.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0085 grad_action_out_proj_arms=0.1081 grad_arm_token_fuse=0.0419 grad_shared_expert=0.3158 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4570/10000 [1:01:35<1:04:08, 1.41it/s, loss=0.0042, lr=1.63e-05, step=4569] Training: 46%|████▌ | 4570/10000 [1:01:35<1:04:08, 1.41it/s, loss=0.0033, lr=1.63e-05, step=4570] Training: 46%|████▌ | 4571/10000 [1:01:35<1:02:46, 1.44it/s, loss=0.0033, lr=1.63e-05, step=4570] Training: 46%|████▌ | 4571/10000 [1:01:35<1:02:46, 1.44it/s, loss=0.0093, lr=1.63e-05, step=4571] Training: 46%|████▌ | 4572/10000 [1:01:36<1:10:57, 1.28it/s, loss=0.0093, lr=1.63e-05, step=4571] Training: 46%|████▌ | 4572/10000 [1:01:36<1:10:57, 1.28it/s, loss=0.0049, lr=1.63e-05, step=4572] Training: 46%|████▌ | 4573/10000 [1:01:37<1:03:31, 1.42it/s, loss=0.0049, lr=1.63e-05, step=4572] Training: 46%|████▌ | 4573/10000 [1:01:37<1:03:31, 1.42it/s, loss=0.0033, lr=1.63e-05, step=4573] Training: 46%|████▌ | 4574/10000 [1:01:37<1:04:07, 1.41it/s, loss=0.0033, lr=1.63e-05, step=4573] Training: 46%|████▌ | 4574/10000 [1:01:37<1:04:07, 1.41it/s, loss=0.0141, lr=1.62e-05, step=4574] Training: 46%|████▌ | 4575/10000 [1:01:38<1:04:39, 1.40it/s, loss=0.0141, lr=1.62e-05, step=4574] Training: 46%|████▌ | 4575/10000 [1:01:38<1:04:39, 1.40it/s, loss=0.0040, lr=1.62e-05, step=4575] Training: 46%|████▌ | 4576/10000 [1:01:39<1:05:49, 1.37it/s, loss=0.0040, lr=1.62e-05, step=4575] Training: 46%|████▌ | 4576/10000 [1:01:39<1:05:49, 1.37it/s, loss=0.0158, lr=1.62e-05, step=4576] Training: 46%|████▌ | 4577/10000 [1:01:39<1:00:04, 1.50it/s, loss=0.0158, lr=1.62e-05, step=4576] Training: 46%|████▌ | 4577/10000 [1:01:39<1:00:04, 1.50it/s, loss=0.0029, lr=1.62e-05, step=4577] Training: 46%|████▌ | 4578/10000 [1:01:40<59:58, 1.51it/s, loss=0.0029, lr=1.62e-05, step=4577] Training: 46%|████▌ | 4578/10000 [1:01:40<59:58, 1.51it/s, loss=0.0079, lr=1.62e-05, step=4578] Training: 46%|████▌ | 4579/10000 [1:01:41<1:03:28, 1.42it/s, loss=0.0079, lr=1.62e-05, step=4578] Training: 46%|████▌ | 4579/10000 [1:01:41<1:03:28, 1.42it/s, loss=0.0099, lr=1.62e-05, step=4579]19:46:13.729 [I] step=4580 loss=0.0328 smoothed_loss=0.0117 lr=1.62e-05 grad_norm=0.4758 step_time=0.5911s data_time=0.0965s it/s=1.454 eta_to_10000=3726.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0139 grad_action_out_proj_arms=0.1615 grad_arm_token_fuse=0.0734 grad_shared_expert=0.4145 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4580/10000 [1:01:41<58:43, 1.54it/s, loss=0.0099, lr=1.62e-05, step=4579] Training: 46%|████▌ | 4580/10000 [1:01:41<58:43, 1.54it/s, loss=0.0328, lr=1.62e-05, step=4580] Training: 46%|████▌ | 4581/10000 [1:01:42<1:00:02, 1.50it/s, loss=0.0328, lr=1.62e-05, step=4580] Training: 46%|████▌ | 4581/10000 [1:01:42<1:00:02, 1.50it/s, loss=0.0049, lr=1.62e-05, step=4581] Training: 46%|████▌ | 4582/10000 [1:01:43<59:38, 1.51it/s, loss=0.0049, lr=1.62e-05, step=4581] Training: 46%|████▌ | 4582/10000 [1:01:43<59:38, 1.51it/s, loss=0.0405, lr=1.62e-05, step=4582] Training: 46%|████▌ | 4583/10000 [1:01:43<55:05, 1.64it/s, loss=0.0405, lr=1.62e-05, step=4582] Training: 46%|████▌ | 4583/10000 [1:01:43<55:05, 1.64it/s, loss=0.0127, lr=1.62e-05, step=4583] Training: 46%|████▌ | 4584/10000 [1:01:44<53:32, 1.69it/s, loss=0.0127, lr=1.62e-05, step=4583] Training: 46%|████▌ | 4584/10000 [1:01:44<53:32, 1.69it/s, loss=0.0245, lr=1.62e-05, step=4584] Training: 46%|████▌ | 4585/10000 [1:01:44<50:44, 1.78it/s, loss=0.0245, lr=1.62e-05, step=4584] Training: 46%|████▌ | 4585/10000 [1:01:44<50:44, 1.78it/s, loss=0.0135, lr=1.62e-05, step=4585] Training: 46%|████▌ | 4586/10000 [1:01:45<56:01, 1.61it/s, loss=0.0135, lr=1.62e-05, step=4585] Training: 46%|████▌ | 4586/10000 [1:01:45<56:01, 1.61it/s, loss=0.0172, lr=1.62e-05, step=4586] Training: 46%|████▌ | 4587/10000 [1:01:46<53:28, 1.69it/s, loss=0.0172, lr=1.62e-05, step=4586] Training: 46%|████▌ | 4587/10000 [1:01:46<53:28, 1.69it/s, loss=0.0043, lr=1.62e-05, step=4587] Training: 46%|████▌ | 4588/10000 [1:01:46<1:02:15, 1.45it/s, loss=0.0043, lr=1.62e-05, step=4587] Training: 46%|████▌ | 4588/10000 [1:01:46<1:02:15, 1.45it/s, loss=0.0161, lr=1.62e-05, step=4588] Training: 46%|████▌ | 4589/10000 [1:01:47<56:47, 1.59it/s, loss=0.0161, lr=1.62e-05, step=4588] Training: 46%|████▌ | 4589/10000 [1:01:47<56:47, 1.59it/s, loss=0.0143, lr=1.62e-05, step=4589]19:46:20.041 [I] step=4590 loss=0.0079 smoothed_loss=0.0135 lr=1.62e-05 grad_norm=0.4231 step_time=0.5345s data_time=0.0967s it/s=1.585 eta_to_10000=3414.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0181 grad_action_out_proj_arms=0.1292 grad_arm_token_fuse=0.0850 grad_shared_expert=0.4281 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4590/10000 [1:01:48<59:37, 1.51it/s, loss=0.0143, lr=1.62e-05, step=4589] Training: 46%|████▌ | 4590/10000 [1:01:48<59:37, 1.51it/s, loss=0.0079, lr=1.62e-05, step=4590] Training: 46%|████▌ | 4591/10000 [1:01:48<1:01:58, 1.45it/s, loss=0.0079, lr=1.62e-05, step=4590] Training: 46%|████▌ | 4591/10000 [1:01:48<1:01:58, 1.45it/s, loss=0.0129, lr=1.62e-05, step=4591] Training: 46%|████▌ | 4592/10000 [1:01:49<56:37, 1.59it/s, loss=0.0129, lr=1.62e-05, step=4591] Training: 46%|████▌ | 4592/10000 [1:01:49<56:37, 1.59it/s, loss=0.0513, lr=1.62e-05, step=4592] Training: 46%|████▌ | 4593/10000 [1:01:50<59:50, 1.51it/s, loss=0.0513, lr=1.62e-05, step=4592] Training: 46%|████▌ | 4593/10000 [1:01:50<59:50, 1.51it/s, loss=0.0094, lr=1.62e-05, step=4593] Training: 46%|████▌ | 4594/10000 [1:01:50<57:55, 1.56it/s, loss=0.0094, lr=1.62e-05, step=4593] Training: 46%|████▌ | 4594/10000 [1:01:50<57:55, 1.56it/s, loss=0.0237, lr=1.62e-05, step=4594] Training: 46%|████▌ | 4595/10000 [1:01:51<57:40, 1.56it/s, loss=0.0237, lr=1.62e-05, step=4594] Training: 46%|████▌ | 4595/10000 [1:01:51<57:40, 1.56it/s, loss=0.0062, lr=1.62e-05, step=4595] Training: 46%|████▌ | 4596/10000 [1:01:52<59:31, 1.51it/s, loss=0.0062, lr=1.62e-05, step=4595] Training: 46%|████▌ | 4596/10000 [1:01:52<59:31, 1.51it/s, loss=0.0116, lr=1.62e-05, step=4596] Training: 46%|████▌ | 4597/10000 [1:01:52<55:18, 1.63it/s, loss=0.0116, lr=1.62e-05, step=4596] Training: 46%|████▌ | 4597/10000 [1:01:52<55:18, 1.63it/s, loss=0.0108, lr=1.62e-05, step=4597] Training: 46%|████▌ | 4598/10000 [1:01:53<52:09, 1.73it/s, loss=0.0108, lr=1.62e-05, step=4597] Training: 46%|████▌ | 4598/10000 [1:01:53<52:09, 1.73it/s, loss=0.0070, lr=1.62e-05, step=4598] Training: 46%|████▌ | 4599/10000 [1:01:53<52:07, 1.73it/s, loss=0.0070, lr=1.62e-05, step=4598] Training: 46%|████▌ | 4599/10000 [1:01:53<52:07, 1.73it/s, loss=0.0052, lr=1.62e-05, step=4599]19:46:26.475 [I] step=4600 loss=0.0079 smoothed_loss=0.0129 lr=1.62e-05 grad_norm=0.4875 step_time=0.5486s data_time=0.0948s it/s=1.555 eta_to_10000=3473.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1062 grad_arm_token_fuse=0.0612 grad_shared_expert=0.3775 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4600/10000 [1:01:54<1:01:34, 1.46it/s, loss=0.0052, lr=1.62e-05, step=4599] Training: 46%|████▌ | 4600/10000 [1:01:54<1:01:34, 1.46it/s, loss=0.0079, lr=1.62e-05, step=4600] Training: 46%|████▌ | 4601/10000 [1:01:55<1:01:45, 1.46it/s, loss=0.0079, lr=1.62e-05, step=4600] Training: 46%|████▌ | 4601/10000 [1:01:55<1:01:45, 1.46it/s, loss=0.0121, lr=1.61e-05, step=4601] Training: 46%|████▌ | 4602/10000 [1:01:55<56:28, 1.59it/s, loss=0.0121, lr=1.61e-05, step=4601] Training: 46%|████▌ | 4602/10000 [1:01:55<56:28, 1.59it/s, loss=0.0360, lr=1.61e-05, step=4602] Training: 46%|████▌ | 4603/10000 [1:01:56<57:15, 1.57it/s, loss=0.0360, lr=1.61e-05, step=4602] Training: 46%|████▌ | 4603/10000 [1:01:56<57:15, 1.57it/s, loss=0.0107, lr=1.61e-05, step=4603] Training: 46%|████▌ | 4604/10000 [1:01:57<57:10, 1.57it/s, loss=0.0107, lr=1.61e-05, step=4603] Training: 46%|████▌ | 4604/10000 [1:01:57<57:10, 1.57it/s, loss=0.0230, lr=1.61e-05, step=4604] Training: 46%|████▌ | 4605/10000 [1:01:57<56:51, 1.58it/s, loss=0.0230, lr=1.61e-05, step=4604] Training: 46%|████▌ | 4605/10000 [1:01:57<56:51, 1.58it/s, loss=0.0132, lr=1.61e-05, step=4605] Training: 46%|████▌ | 4606/10000 [1:01:58<52:59, 1.70it/s, loss=0.0132, lr=1.61e-05, step=4605] Training: 46%|████▌ | 4606/10000 [1:01:58<52:59, 1.70it/s, loss=0.0168, lr=1.61e-05, step=4606] Training: 46%|████▌ | 4607/10000 [1:01:59<1:03:10, 1.42it/s, loss=0.0168, lr=1.61e-05, step=4606] Training: 46%|████▌ | 4607/10000 [1:01:59<1:03:10, 1.42it/s, loss=0.0117, lr=1.61e-05, step=4607] Training: 46%|████▌ | 4608/10000 [1:01:59<57:36, 1.56it/s, loss=0.0117, lr=1.61e-05, step=4607] Training: 46%|████▌ | 4608/10000 [1:01:59<57:36, 1.56it/s, loss=0.0091, lr=1.61e-05, step=4608] Training: 46%|████▌ | 4609/10000 [1:02:00<53:16, 1.69it/s, loss=0.0091, lr=1.61e-05, step=4608] Training: 46%|████▌ | 4609/10000 [1:02:00<53:16, 1.69it/s, loss=0.0018, lr=1.61e-05, step=4609]19:46:32.691 [I] step=4610 loss=0.0120 smoothed_loss=0.0131 lr=1.61e-05 grad_norm=0.4203 step_time=0.5248s data_time=0.0968s it/s=1.609 eta_to_10000=3349.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0228 grad_action_out_proj_arms=0.1572 grad_arm_token_fuse=0.1235 grad_shared_expert=0.3692 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4610/10000 [1:02:00<55:42, 1.61it/s, loss=0.0018, lr=1.61e-05, step=4609] Training: 46%|████▌ | 4610/10000 [1:02:00<55:42, 1.61it/s, loss=0.0120, lr=1.61e-05, step=4610] Training: 46%|████▌ | 4611/10000 [1:02:01<55:44, 1.61it/s, loss=0.0120, lr=1.61e-05, step=4610] Training: 46%|████▌ | 4611/10000 [1:02:01<55:44, 1.61it/s, loss=0.0074, lr=1.61e-05, step=4611] Training: 46%|████▌ | 4612/10000 [1:02:01<52:17, 1.72it/s, loss=0.0074, lr=1.61e-05, step=4611] Training: 46%|████▌ | 4612/10000 [1:02:01<52:17, 1.72it/s, loss=0.0129, lr=1.61e-05, step=4612] Training: 46%|████▌ | 4613/10000 [1:02:02<52:23, 1.71it/s, loss=0.0129, lr=1.61e-05, step=4612] Training: 46%|████▌ | 4613/10000 [1:02:02<52:23, 1.71it/s, loss=0.0050, lr=1.61e-05, step=4613] Training: 46%|████▌ | 4614/10000 [1:02:03<54:34, 1.65it/s, loss=0.0050, lr=1.61e-05, step=4613] Training: 46%|████▌ | 4614/10000 [1:02:03<54:34, 1.65it/s, loss=0.0256, lr=1.61e-05, step=4614] Training: 46%|████▌ | 4615/10000 [1:02:04<59:34, 1.51it/s, loss=0.0256, lr=1.61e-05, step=4614] Training: 46%|████▌ | 4615/10000 [1:02:04<59:34, 1.51it/s, loss=0.0187, lr=1.61e-05, step=4615] Training: 46%|████▌ | 4616/10000 [1:02:04<54:55, 1.63it/s, loss=0.0187, lr=1.61e-05, step=4615] Training: 46%|████▌ | 4616/10000 [1:02:04<54:55, 1.63it/s, loss=0.1143, lr=1.61e-05, step=4616] Training: 46%|████▌ | 4617/10000 [1:02:05<58:22, 1.54it/s, loss=0.1143, lr=1.61e-05, step=4616] Training: 46%|████▌ | 4617/10000 [1:02:05<58:22, 1.54it/s, loss=0.0301, lr=1.61e-05, step=4617] Training: 46%|████▌ | 4618/10000 [1:02:05<58:16, 1.54it/s, loss=0.0301, lr=1.61e-05, step=4617] Training: 46%|████▌ | 4618/10000 [1:02:05<58:16, 1.54it/s, loss=0.0057, lr=1.61e-05, step=4618] Training: 46%|████▌ | 4619/10000 [1:02:06<54:22, 1.65it/s, loss=0.0057, lr=1.61e-05, step=4618] Training: 46%|████▌ | 4619/10000 [1:02:06<54:22, 1.65it/s, loss=0.0215, lr=1.61e-05, step=4619]19:46:39.051 [I] step=4620 loss=0.0191 smoothed_loss=0.0221 lr=1.61e-05 grad_norm=0.4678 step_time=0.5415s data_time=0.0946s it/s=1.572 eta_to_10000=3421.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0197 grad_action_out_proj_arms=0.1640 grad_arm_token_fuse=0.1064 grad_shared_expert=0.4498 (18633:train_pytorch.py:850) + Training: 46%|████▌ | 4620/10000 [1:02:07<1:00:00, 1.49it/s, loss=0.0215, lr=1.61e-05, step=4619] Training: 46%|████▌ | 4620/10000 [1:02:07<1:00:00, 1.49it/s, loss=0.0191, lr=1.61e-05, step=4620] Training: 46%|████▌ | 4621/10000 [1:02:07<1:00:07, 1.49it/s, loss=0.0191, lr=1.61e-05, step=4620] Training: 46%|████▌ | 4621/10000 [1:02:07<1:00:07, 1.49it/s, loss=0.0159, lr=1.61e-05, step=4621] Training: 46%|████▌ | 4622/10000 [1:02:08<1:06:30, 1.35it/s, loss=0.0159, lr=1.61e-05, step=4621] Training: 46%|████▌ | 4622/10000 [1:02:08<1:06:30, 1.35it/s, loss=0.0068, lr=1.61e-05, step=4622] Training: 46%|████▌ | 4623/10000 [1:02:09<1:05:08, 1.38it/s, loss=0.0068, lr=1.61e-05, step=4622] Training: 46%|████▌ | 4623/10000 [1:02:09<1:05:08, 1.38it/s, loss=0.0100, lr=1.61e-05, step=4623] Training: 46%|████▌ | 4624/10000 [1:02:10<1:04:34, 1.39it/s, loss=0.0100, lr=1.61e-05, step=4623] Training: 46%|████▌ | 4624/10000 [1:02:10<1:04:34, 1.39it/s, loss=0.0123, lr=1.61e-05, step=4624] Training: 46%|████▋ | 4625/10000 [1:02:11<1:08:10, 1.31it/s, loss=0.0123, lr=1.61e-05, step=4624] Training: 46%|████▋ | 4625/10000 [1:02:11<1:08:10, 1.31it/s, loss=0.0164, lr=1.61e-05, step=4625] Training: 46%|████▋ | 4626/10000 [1:02:11<1:00:42, 1.48it/s, loss=0.0164, lr=1.61e-05, step=4625] Training: 46%|████▋ | 4626/10000 [1:02:11<1:00:42, 1.48it/s, loss=0.0107, lr=1.61e-05, step=4626] Training: 46%|████▋ | 4627/10000 [1:02:12<1:01:41, 1.45it/s, loss=0.0107, lr=1.61e-05, step=4626] Training: 46%|████▋ | 4627/10000 [1:02:12<1:01:41, 1.45it/s, loss=0.0143, lr=1.61e-05, step=4627] Training: 46%|████▋ | 4628/10000 [1:02:12<1:00:45, 1.47it/s, loss=0.0143, lr=1.61e-05, step=4627] Training: 46%|████▋ | 4628/10000 [1:02:12<1:00:45, 1.47it/s, loss=0.0085, lr=1.61e-05, step=4628] Training: 46%|████▋ | 4629/10000 [1:02:13<1:03:49, 1.40it/s, loss=0.0085, lr=1.61e-05, step=4628] Training: 46%|████▋ | 4629/10000 [1:02:13<1:03:49, 1.40it/s, loss=0.0056, lr=1.60e-05, step=4629]19:46:46.064 [I] step=4630 loss=0.0245 smoothed_loss=0.0161 lr=1.61e-05 grad_norm=0.4969 step_time=0.5822s data_time=0.1191s it/s=1.426 eta_to_10000=3765.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0218 grad_action_out_proj_arms=0.1371 grad_arm_token_fuse=0.1184 grad_shared_expert=0.4929 (18633:train_pytorch.py:850) + Training: 46%|████▋ | 4630/10000 [1:02:14<58:55, 1.52it/s, loss=0.0056, lr=1.60e-05, step=4629] Training: 46%|████▋ | 4630/10000 [1:02:14<58:55, 1.52it/s, loss=0.0245, lr=1.60e-05, step=4630] Training: 46%|████▋ | 4631/10000 [1:02:15<1:02:00, 1.44it/s, loss=0.0245, lr=1.60e-05, step=4630] Training: 46%|████▋ | 4631/10000 [1:02:15<1:02:00, 1.44it/s, loss=0.0081, lr=1.60e-05, step=4631] Training: 46%|████▋ | 4632/10000 [1:02:15<58:20, 1.53it/s, loss=0.0081, lr=1.60e-05, step=4631] Training: 46%|████▋ | 4632/10000 [1:02:15<58:20, 1.53it/s, loss=0.0339, lr=1.60e-05, step=4632] Training: 46%|████▋ | 4633/10000 [1:02:16<1:03:41, 1.40it/s, loss=0.0339, lr=1.60e-05, step=4632] Training: 46%|████▋ | 4633/10000 [1:02:16<1:03:41, 1.40it/s, loss=0.0100, lr=1.60e-05, step=4633] Training: 46%|████▋ | 4634/10000 [1:02:16<57:37, 1.55it/s, loss=0.0100, lr=1.60e-05, step=4633] Training: 46%|████▋ | 4634/10000 [1:02:16<57:37, 1.55it/s, loss=0.0230, lr=1.60e-05, step=4634] Training: 46%|████▋ | 4635/10000 [1:02:17<53:31, 1.67it/s, loss=0.0230, lr=1.60e-05, step=4634] Training: 46%|████▋ | 4635/10000 [1:02:17<53:31, 1.67it/s, loss=0.0070, lr=1.60e-05, step=4635] Training: 46%|████▋ | 4636/10000 [1:02:18<56:50, 1.57it/s, loss=0.0070, lr=1.60e-05, step=4635] Training: 46%|████▋ | 4636/10000 [1:02:18<56:50, 1.57it/s, loss=0.0084, lr=1.60e-05, step=4636] Training: 46%|████▋ | 4637/10000 [1:02:18<1:01:43, 1.45it/s, loss=0.0084, lr=1.60e-05, step=4636] Training: 46%|████▋ | 4637/10000 [1:02:18<1:01:43, 1.45it/s, loss=0.0198, lr=1.60e-05, step=4637] Training: 46%|████▋ | 4638/10000 [1:02:19<58:20, 1.53it/s, loss=0.0198, lr=1.60e-05, step=4637] Training: 46%|████▋ | 4638/10000 [1:02:19<58:20, 1.53it/s, loss=0.0148, lr=1.60e-05, step=4638] Training: 46%|████▋ | 4639/10000 [1:02:20<1:01:49, 1.45it/s, loss=0.0148, lr=1.60e-05, step=4638] Training: 46%|████▋ | 4639/10000 [1:02:20<1:01:49, 1.45it/s, loss=0.0108, lr=1.60e-05, step=4639]19:46:53.044 [I] step=4640 loss=0.0016 smoothed_loss=0.0138 lr=1.60e-05 grad_norm=0.4928 step_time=0.5628s data_time=0.1352s it/s=1.433 eta_to_10000=3740.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0173 grad_action_out_proj_arms=0.1133 grad_arm_token_fuse=0.0875 grad_shared_expert=0.7222 (18633:train_pytorch.py:850) + Training: 46%|████▋ | 4640/10000 [1:02:21<1:08:12, 1.31it/s, loss=0.0108, lr=1.60e-05, step=4639] Training: 46%|████▋ | 4640/10000 [1:02:21<1:08:12, 1.31it/s, loss=0.0016, lr=1.60e-05, step=4640] Training: 46%|████▋ | 4641/10000 [1:02:21<1:07:35, 1.32it/s, loss=0.0016, lr=1.60e-05, step=4640] Training: 46%|████▋ | 4641/10000 [1:02:21<1:07:35, 1.32it/s, loss=0.0408, lr=1.60e-05, step=4641] Training: 46%|████▋ | 4642/10000 [1:02:22<1:08:39, 1.30it/s, loss=0.0408, lr=1.60e-05, step=4641] Training: 46%|████▋ | 4642/10000 [1:02:22<1:08:39, 1.30it/s, loss=0.0129, lr=1.60e-05, step=4642] Training: 46%|████▋ | 4643/10000 [1:02:23<1:08:16, 1.31it/s, loss=0.0129, lr=1.60e-05, step=4642] Training: 46%|████▋ | 4643/10000 [1:02:23<1:08:16, 1.31it/s, loss=0.0190, lr=1.60e-05, step=4643] Training: 46%|████▋ | 4644/10000 [1:02:24<1:04:37, 1.38it/s, loss=0.0190, lr=1.60e-05, step=4643] Training: 46%|████▋ | 4644/10000 [1:02:24<1:04:37, 1.38it/s, loss=0.0052, lr=1.60e-05, step=4644] Training: 46%|████▋ | 4645/10000 [1:02:24<1:05:46, 1.36it/s, loss=0.0052, lr=1.60e-05, step=4644] Training: 46%|████▋ | 4645/10000 [1:02:24<1:05:46, 1.36it/s, loss=0.0131, lr=1.60e-05, step=4645] Training: 46%|████▋ | 4646/10000 [1:02:25<1:07:30, 1.32it/s, loss=0.0131, lr=1.60e-05, step=4645] Training: 46%|████▋ | 4646/10000 [1:02:25<1:07:30, 1.32it/s, loss=0.0104, lr=1.60e-05, step=4646] Training: 46%|████▋ | 4647/10000 [1:02:26<1:00:38, 1.47it/s, loss=0.0104, lr=1.60e-05, step=4646] Training: 46%|████▋ | 4647/10000 [1:02:26<1:00:38, 1.47it/s, loss=0.0285, lr=1.60e-05, step=4647] Training: 46%|████▋ | 4648/10000 [1:02:27<1:05:36, 1.36it/s, loss=0.0285, lr=1.60e-05, step=4647] Training: 46%|████▋ | 4648/10000 [1:02:27<1:05:36, 1.36it/s, loss=0.0127, lr=1.60e-05, step=4648] Training: 46%|████▋ | 4649/10000 [1:02:27<59:01, 1.51it/s, loss=0.0127, lr=1.60e-05, step=4648] Training: 46%|████▋ | 4649/10000 [1:02:27<59:01, 1.51it/s, loss=0.0098, lr=1.60e-05, step=4649]19:47:00.306 [I] step=4650 loss=0.0113 smoothed_loss=0.0147 lr=1.60e-05 grad_norm=0.3868 step_time=0.5944s data_time=0.1317s it/s=1.377 eta_to_10000=3884.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.1123 grad_arm_token_fuse=0.0739 grad_shared_expert=0.3747 (18633:train_pytorch.py:850) + Training: 46%|████▋ | 4650/10000 [1:02:28<1:05:44, 1.36it/s, loss=0.0098, lr=1.60e-05, step=4649] Training: 46%|████▋ | 4650/10000 [1:02:28<1:05:44, 1.36it/s, loss=0.0113, lr=1.60e-05, step=4650] Training: 47%|████▋ | 4651/10000 [1:02:29<1:01:17, 1.45it/s, loss=0.0113, lr=1.60e-05, step=4650] Training: 47%|████▋ | 4651/10000 [1:02:29<1:01:17, 1.45it/s, loss=0.0267, lr=1.60e-05, step=4651] Training: 47%|████▋ | 4652/10000 [1:02:29<1:03:41, 1.40it/s, loss=0.0267, lr=1.60e-05, step=4651] Training: 47%|████▋ | 4652/10000 [1:02:29<1:03:41, 1.40it/s, loss=0.0075, lr=1.60e-05, step=4652] Training: 47%|████▋ | 4653/10000 [1:02:30<59:38, 1.49it/s, loss=0.0075, lr=1.60e-05, step=4652] Training: 47%|████▋ | 4653/10000 [1:02:30<59:38, 1.49it/s, loss=0.0276, lr=1.60e-05, step=4653] Training: 47%|████▋ | 4654/10000 [1:02:31<1:03:06, 1.41it/s, loss=0.0276, lr=1.60e-05, step=4653] Training: 47%|████▋ | 4654/10000 [1:02:31<1:03:06, 1.41it/s, loss=0.1228, lr=1.60e-05, step=4654] Training: 47%|████▋ | 4655/10000 [1:02:32<1:09:22, 1.28it/s, loss=0.1228, lr=1.60e-05, step=4654] Training: 47%|████▋ | 4655/10000 [1:02:32<1:09:22, 1.28it/s, loss=0.0180, lr=1.60e-05, step=4655] Training: 47%|████▋ | 4656/10000 [1:02:32<1:04:53, 1.37it/s, loss=0.0180, lr=1.60e-05, step=4655] Training: 47%|████▋ | 4656/10000 [1:02:32<1:04:53, 1.37it/s, loss=0.0128, lr=1.59e-05, step=4656] Training: 47%|████▋ | 4657/10000 [1:02:33<1:09:54, 1.27it/s, loss=0.0128, lr=1.59e-05, step=4656] Training: 47%|████▋ | 4657/10000 [1:02:33<1:09:54, 1.27it/s, loss=0.0211, lr=1.59e-05, step=4657] Training: 47%|████▋ | 4658/10000 [1:02:34<1:12:26, 1.23it/s, loss=0.0211, lr=1.59e-05, step=4657] Training: 47%|████▋ | 4658/10000 [1:02:34<1:12:26, 1.23it/s, loss=0.0096, lr=1.59e-05, step=4658] Training: 47%|████▋ | 4659/10000 [1:02:35<1:04:17, 1.38it/s, loss=0.0096, lr=1.59e-05, step=4658] Training: 47%|████▋ | 4659/10000 [1:02:35<1:04:17, 1.38it/s, loss=0.0291, lr=1.59e-05, step=4659]19:47:07.474 [I] step=4660 loss=0.0067 smoothed_loss=0.0218 lr=1.60e-05 grad_norm=0.5171 step_time=0.5824s data_time=0.1344s it/s=1.395 eta_to_10000=3827.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.1906 grad_arm_token_fuse=0.0621 grad_shared_expert=0.3738 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4660/10000 [1:02:35<1:00:54, 1.46it/s, loss=0.0291, lr=1.59e-05, step=4659] Training: 47%|████▋ | 4660/10000 [1:02:35<1:00:54, 1.46it/s, loss=0.0067, lr=1.59e-05, step=4660] Training: 47%|████▋ | 4661/10000 [1:02:36<57:53, 1.54it/s, loss=0.0067, lr=1.59e-05, step=4660] Training: 47%|████▋ | 4661/10000 [1:02:36<57:53, 1.54it/s, loss=0.0035, lr=1.59e-05, step=4661] Training: 47%|████▋ | 4662/10000 [1:02:37<1:05:48, 1.35it/s, loss=0.0035, lr=1.59e-05, step=4661] Training: 47%|████▋ | 4662/10000 [1:02:37<1:05:48, 1.35it/s, loss=0.0113, lr=1.59e-05, step=4662] Training: 47%|████▋ | 4663/10000 [1:02:37<1:02:50, 1.42it/s, loss=0.0113, lr=1.59e-05, step=4662] Training: 47%|████▋ | 4663/10000 [1:02:37<1:02:50, 1.42it/s, loss=0.0244, lr=1.59e-05, step=4663] Training: 47%|████▋ | 4664/10000 [1:02:38<1:01:45, 1.44it/s, loss=0.0244, lr=1.59e-05, step=4663] Training: 47%|████▋ | 4664/10000 [1:02:38<1:01:45, 1.44it/s, loss=0.0070, lr=1.59e-05, step=4664] Training: 47%|████▋ | 4665/10000 [1:02:39<1:18:34, 1.13it/s, loss=0.0070, lr=1.59e-05, step=4664] Training: 47%|████▋ | 4665/10000 [1:02:39<1:18:34, 1.13it/s, loss=0.0117, lr=1.59e-05, step=4665] Training: 47%|████▋ | 4666/10000 [1:02:40<1:14:58, 1.19it/s, loss=0.0117, lr=1.59e-05, step=4665] Training: 47%|████▋ | 4666/10000 [1:02:40<1:14:58, 1.19it/s, loss=0.0060, lr=1.59e-05, step=4666] Training: 47%|████▋ | 4667/10000 [1:02:41<1:11:38, 1.24it/s, loss=0.0060, lr=1.59e-05, step=4666] Training: 47%|████▋ | 4667/10000 [1:02:41<1:11:38, 1.24it/s, loss=0.0038, lr=1.59e-05, step=4667] Training: 47%|████▋ | 4668/10000 [1:02:41<1:08:24, 1.30it/s, loss=0.0038, lr=1.59e-05, step=4667] Training: 47%|████▋ | 4668/10000 [1:02:41<1:08:24, 1.30it/s, loss=0.0041, lr=1.59e-05, step=4668] Training: 47%|████▋ | 4669/10000 [1:02:42<1:08:12, 1.30it/s, loss=0.0041, lr=1.59e-05, step=4668] Training: 47%|████▋ | 4669/10000 [1:02:42<1:08:12, 1.30it/s, loss=0.0457, lr=1.59e-05, step=4669]19:47:15.383 [I] step=4670 loss=0.0106 smoothed_loss=0.0166 lr=1.59e-05 grad_norm=0.5357 step_time=0.6164s data_time=0.1745s it/s=1.264 eta_to_10000=4215.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0659 grad_arm_token_fuse=0.0355 grad_shared_expert=0.4724 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4670/10000 [1:02:43<1:10:28, 1.26it/s, loss=0.0457, lr=1.59e-05, step=4669] Training: 47%|████▋ | 4670/10000 [1:02:43<1:10:28, 1.26it/s, loss=0.0106, lr=1.59e-05, step=4670] Training: 47%|████▋ | 4671/10000 [1:02:44<1:12:53, 1.22it/s, loss=0.0106, lr=1.59e-05, step=4670] Training: 47%|████▋ | 4671/10000 [1:02:44<1:12:53, 1.22it/s, loss=0.0139, lr=1.59e-05, step=4671] Training: 47%|████▋ | 4672/10000 [1:02:45<1:20:08, 1.11it/s, loss=0.0139, lr=1.59e-05, step=4671] Training: 47%|████▋ | 4672/10000 [1:02:45<1:20:08, 1.11it/s, loss=0.0039, lr=1.59e-05, step=4672] Training: 47%|████▋ | 4673/10000 [1:02:46<1:09:20, 1.28it/s, loss=0.0039, lr=1.59e-05, step=4672] Training: 47%|████▋ | 4673/10000 [1:02:46<1:09:20, 1.28it/s, loss=0.0123, lr=1.59e-05, step=4673] Training: 47%|████▋ | 4674/10000 [1:02:46<1:10:24, 1.26it/s, loss=0.0123, lr=1.59e-05, step=4673] Training: 47%|████▋ | 4674/10000 [1:02:46<1:10:24, 1.26it/s, loss=0.0136, lr=1.59e-05, step=4674] Training: 47%|████▋ | 4675/10000 [1:02:47<1:09:42, 1.27it/s, loss=0.0136, lr=1.59e-05, step=4674] Training: 47%|████▋ | 4675/10000 [1:02:47<1:09:42, 1.27it/s, loss=0.0038, lr=1.59e-05, step=4675] Training: 47%|████▋ | 4676/10000 [1:02:48<1:01:58, 1.43it/s, loss=0.0038, lr=1.59e-05, step=4675] Training: 47%|████▋ | 4676/10000 [1:02:48<1:01:58, 1.43it/s, loss=0.0381, lr=1.59e-05, step=4676] Training: 47%|████▋ | 4677/10000 [1:02:48<56:44, 1.56it/s, loss=0.0381, lr=1.59e-05, step=4676] Training: 47%|████▋ | 4677/10000 [1:02:48<56:44, 1.56it/s, loss=0.0181, lr=1.59e-05, step=4677] Training: 47%|████▋ | 4678/10000 [1:02:49<58:25, 1.52it/s, loss=0.0181, lr=1.59e-05, step=4677] Training: 47%|████▋ | 4678/10000 [1:02:49<58:25, 1.52it/s, loss=0.0244, lr=1.59e-05, step=4678] Training: 47%|████▋ | 4679/10000 [1:02:50<1:00:30, 1.47it/s, loss=0.0244, lr=1.59e-05, step=4678] Training: 47%|████▋ | 4679/10000 [1:02:50<1:00:30, 1.47it/s, loss=0.0069, lr=1.59e-05, step=4679]19:47:22.693 [I] step=4680 loss=0.0082 smoothed_loss=0.0153 lr=1.59e-05 grad_norm=0.4497 step_time=0.6077s data_time=0.1233s it/s=1.368 eta_to_10000=3888.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0098 grad_action_out_proj_arms=0.0902 grad_arm_token_fuse=0.0472 grad_shared_expert=0.2997 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4680/10000 [1:02:50<1:03:49, 1.39it/s, loss=0.0069, lr=1.59e-05, step=4679] Training: 47%|████▋ | 4680/10000 [1:02:50<1:03:49, 1.39it/s, loss=0.0082, lr=1.59e-05, step=4680] Training: 47%|████▋ | 4681/10000 [1:02:51<57:49, 1.53it/s, loss=0.0082, lr=1.59e-05, step=4680] Training: 47%|████▋ | 4681/10000 [1:02:51<57:49, 1.53it/s, loss=0.0056, lr=1.59e-05, step=4681] Training: 47%|████▋ | 4682/10000 [1:02:51<53:59, 1.64it/s, loss=0.0056, lr=1.59e-05, step=4681] Training: 47%|████▋ | 4682/10000 [1:02:51<53:59, 1.64it/s, loss=0.0024, lr=1.59e-05, step=4682] Training: 47%|████▋ | 4683/10000 [1:02:52<57:12, 1.55it/s, loss=0.0024, lr=1.59e-05, step=4682] Training: 47%|████▋ | 4683/10000 [1:02:52<57:12, 1.55it/s, loss=0.0106, lr=1.59e-05, step=4683] Training: 47%|████▋ | 4684/10000 [1:02:53<53:28, 1.66it/s, loss=0.0106, lr=1.59e-05, step=4683] Training: 47%|████▋ | 4684/10000 [1:02:53<53:28, 1.66it/s, loss=0.0119, lr=1.58e-05, step=4684] Training: 47%|████▋ | 4685/10000 [1:02:53<55:36, 1.59it/s, loss=0.0119, lr=1.58e-05, step=4684] Training: 47%|████▋ | 4685/10000 [1:02:53<55:36, 1.59it/s, loss=0.0137, lr=1.58e-05, step=4685] Training: 47%|████▋ | 4686/10000 [1:02:54<1:04:40, 1.37it/s, loss=0.0137, lr=1.58e-05, step=4685] Training: 47%|████▋ | 4686/10000 [1:02:54<1:04:40, 1.37it/s, loss=0.0168, lr=1.58e-05, step=4686] Training: 47%|████▋ | 4687/10000 [1:02:55<58:19, 1.52it/s, loss=0.0168, lr=1.58e-05, step=4686] Training: 47%|████▋ | 4687/10000 [1:02:55<58:19, 1.52it/s, loss=0.0271, lr=1.58e-05, step=4687] Training: 47%|████▋ | 4688/10000 [1:02:55<53:49, 1.64it/s, loss=0.0271, lr=1.58e-05, step=4687] Training: 47%|████▋ | 4688/10000 [1:02:55<53:49, 1.64it/s, loss=0.0117, lr=1.58e-05, step=4688] Training: 47%|████▋ | 4689/10000 [1:02:56<50:48, 1.74it/s, loss=0.0117, lr=1.58e-05, step=4688] Training: 47%|████▋ | 4689/10000 [1:02:56<50:48, 1.74it/s, loss=0.0185, lr=1.58e-05, step=4689]19:47:28.819 [I] step=4690 loss=0.0060 smoothed_loss=0.0139 lr=1.58e-05 grad_norm=0.4438 step_time=0.5166s data_time=0.0960s it/s=1.633 eta_to_10000=3252.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0068 grad_action_out_proj_arms=0.0683 grad_arm_token_fuse=0.0308 grad_shared_expert=0.2135 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4690/10000 [1:02:56<55:38, 1.59it/s, loss=0.0185, lr=1.58e-05, step=4689] Training: 47%|████▋ | 4690/10000 [1:02:56<55:38, 1.59it/s, loss=0.0060, lr=1.58e-05, step=4690] Training: 47%|████▋ | 4691/10000 [1:02:57<51:41, 1.71it/s, loss=0.0060, lr=1.58e-05, step=4690] Training: 47%|████▋ | 4691/10000 [1:02:57<51:41, 1.71it/s, loss=0.0043, lr=1.58e-05, step=4691] Training: 47%|████▋ | 4692/10000 [1:02:58<53:26, 1.66it/s, loss=0.0043, lr=1.58e-05, step=4691] Training: 47%|████▋ | 4692/10000 [1:02:58<53:26, 1.66it/s, loss=0.0381, lr=1.58e-05, step=4692] Training: 47%|████▋ | 4693/10000 [1:02:58<56:50, 1.56it/s, loss=0.0381, lr=1.58e-05, step=4692] Training: 47%|████▋ | 4693/10000 [1:02:58<56:50, 1.56it/s, loss=0.0072, lr=1.58e-05, step=4693] Training: 47%|████▋ | 4694/10000 [1:02:59<52:47, 1.68it/s, loss=0.0072, lr=1.58e-05, step=4693] Training: 47%|████▋ | 4694/10000 [1:02:59<52:47, 1.68it/s, loss=0.0131, lr=1.58e-05, step=4694] Training: 47%|████▋ | 4695/10000 [1:02:59<49:44, 1.78it/s, loss=0.0131, lr=1.58e-05, step=4694] Training: 47%|████▋ | 4695/10000 [1:02:59<49:44, 1.78it/s, loss=0.0029, lr=1.58e-05, step=4695] Training: 47%|████▋ | 4696/10000 [1:03:00<47:29, 1.86it/s, loss=0.0029, lr=1.58e-05, step=4695] Training: 47%|████▋ | 4696/10000 [1:03:00<47:29, 1.86it/s, loss=0.0234, lr=1.58e-05, step=4696] Training: 47%|████▋ | 4697/10000 [1:03:00<46:48, 1.89it/s, loss=0.0234, lr=1.58e-05, step=4696] Training: 47%|████▋ | 4697/10000 [1:03:00<46:48, 1.89it/s, loss=0.0096, lr=1.58e-05, step=4697] Training: 47%|████▋ | 4698/10000 [1:03:01<48:36, 1.82it/s, loss=0.0096, lr=1.58e-05, step=4697] Training: 47%|████▋ | 4698/10000 [1:03:01<48:36, 1.82it/s, loss=0.0202, lr=1.58e-05, step=4698] Training: 47%|████▋ | 4699/10000 [1:03:01<46:49, 1.89it/s, loss=0.0202, lr=1.58e-05, step=4698] Training: 47%|████▋ | 4699/10000 [1:03:01<46:49, 1.89it/s, loss=0.0158, lr=1.58e-05, step=4699]19:47:34.486 [I] step=4700 loss=0.0126 smoothed_loss=0.0144 lr=1.58e-05 grad_norm=0.5223 step_time=0.4992s data_time=0.0675s it/s=1.765 eta_to_10000=3003.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0164 grad_action_out_proj_arms=0.1650 grad_arm_token_fuse=0.0811 grad_shared_expert=0.3610 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4700/10000 [1:03:02<52:55, 1.67it/s, loss=0.0158, lr=1.58e-05, step=4699] Training: 47%|████▋ | 4700/10000 [1:03:02<52:55, 1.67it/s, loss=0.0126, lr=1.58e-05, step=4700] Training: 47%|████▋ | 4701/10000 [1:03:03<50:13, 1.76it/s, loss=0.0126, lr=1.58e-05, step=4700] Training: 47%|████▋ | 4701/10000 [1:03:03<50:13, 1.76it/s, loss=0.0445, lr=1.58e-05, step=4701] Training: 47%|████▋ | 4702/10000 [1:03:03<48:09, 1.83it/s, loss=0.0445, lr=1.58e-05, step=4701] Training: 47%|████▋ | 4702/10000 [1:03:03<48:09, 1.83it/s, loss=0.0100, lr=1.58e-05, step=4702] Training: 47%|████▋ | 4703/10000 [1:03:04<46:43, 1.89it/s, loss=0.0100, lr=1.58e-05, step=4702] Training: 47%|████▋ | 4703/10000 [1:03:04<46:43, 1.89it/s, loss=0.0214, lr=1.58e-05, step=4703] Training: 47%|████▋ | 4704/10000 [1:03:04<46:24, 1.90it/s, loss=0.0214, lr=1.58e-05, step=4703] Training: 47%|████▋ | 4704/10000 [1:03:04<46:24, 1.90it/s, loss=0.0066, lr=1.58e-05, step=4704] Training: 47%|████▋ | 4705/10000 [1:03:05<45:29, 1.94it/s, loss=0.0066, lr=1.58e-05, step=4704] Training: 47%|████▋ | 4705/10000 [1:03:05<45:29, 1.94it/s, loss=0.0337, lr=1.58e-05, step=4705] Training: 47%|████▋ | 4706/10000 [1:03:05<49:52, 1.77it/s, loss=0.0337, lr=1.58e-05, step=4705] Training: 47%|████▋ | 4706/10000 [1:03:05<49:52, 1.77it/s, loss=0.0080, lr=1.58e-05, step=4706] Training: 47%|████▋ | 4707/10000 [1:03:06<54:12, 1.63it/s, loss=0.0080, lr=1.58e-05, step=4706] Training: 47%|████▋ | 4707/10000 [1:03:06<54:12, 1.63it/s, loss=0.0097, lr=1.58e-05, step=4707] Training: 47%|████▋ | 4708/10000 [1:03:07<53:26, 1.65it/s, loss=0.0097, lr=1.58e-05, step=4707] Training: 47%|████▋ | 4708/10000 [1:03:07<53:26, 1.65it/s, loss=0.0140, lr=1.58e-05, step=4708] Training: 47%|████▋ | 4709/10000 [1:03:07<51:14, 1.72it/s, loss=0.0140, lr=1.58e-05, step=4708] Training: 47%|████▋ | 4709/10000 [1:03:07<51:14, 1.72it/s, loss=0.0869, lr=1.58e-05, step=4709]19:47:40.014 [I] step=4710 loss=0.0052 smoothed_loss=0.0212 lr=1.58e-05 grad_norm=0.5550 step_time=0.4760s data_time=0.0768s it/s=1.810 eta_to_10000=2923.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0071 grad_action_out_proj_arms=0.0873 grad_arm_token_fuse=0.0363 grad_shared_expert=0.3036 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4710/10000 [1:03:08<49:34, 1.78it/s, loss=0.0869, lr=1.58e-05, step=4709] Training: 47%|████▋ | 4710/10000 [1:03:08<49:34, 1.78it/s, loss=0.0052, lr=1.58e-05, step=4710] Training: 47%|████▋ | 4711/10000 [1:03:08<47:41, 1.85it/s, loss=0.0052, lr=1.58e-05, step=4710] Training: 47%|████▋ | 4711/10000 [1:03:08<47:41, 1.85it/s, loss=0.0034, lr=1.57e-05, step=4711] Training: 47%|████▋ | 4712/10000 [1:03:09<46:15, 1.91it/s, loss=0.0034, lr=1.57e-05, step=4711] Training: 47%|████▋ | 4712/10000 [1:03:09<46:15, 1.91it/s, loss=0.0058, lr=1.57e-05, step=4712] Training: 47%|████▋ | 4713/10000 [1:03:09<49:12, 1.79it/s, loss=0.0058, lr=1.57e-05, step=4712] Training: 47%|████▋ | 4713/10000 [1:03:09<49:12, 1.79it/s, loss=0.0183, lr=1.57e-05, step=4713] Training: 47%|████▋ | 4714/10000 [1:03:10<52:16, 1.69it/s, loss=0.0183, lr=1.57e-05, step=4713] Training: 47%|████▋ | 4714/10000 [1:03:10<52:16, 1.69it/s, loss=0.0130, lr=1.57e-05, step=4714] Training: 47%|████▋ | 4715/10000 [1:03:11<55:33, 1.59it/s, loss=0.0130, lr=1.57e-05, step=4714] Training: 47%|████▋ | 4715/10000 [1:03:11<55:33, 1.59it/s, loss=0.0104, lr=1.57e-05, step=4715] Training: 47%|████▋ | 4716/10000 [1:03:11<56:28, 1.56it/s, loss=0.0104, lr=1.57e-05, step=4715] Training: 47%|████▋ | 4716/10000 [1:03:11<56:28, 1.56it/s, loss=0.0191, lr=1.57e-05, step=4716] Training: 47%|████▋ | 4717/10000 [1:03:12<52:57, 1.66it/s, loss=0.0191, lr=1.57e-05, step=4716] Training: 47%|████▋ | 4717/10000 [1:03:12<52:57, 1.66it/s, loss=0.0170, lr=1.57e-05, step=4717] Training: 47%|████▋ | 4718/10000 [1:03:12<49:52, 1.76it/s, loss=0.0170, lr=1.57e-05, step=4717] Training: 47%|████▋ | 4718/10000 [1:03:12<49:52, 1.76it/s, loss=0.0073, lr=1.57e-05, step=4718] Training: 47%|████▋ | 4719/10000 [1:03:13<47:49, 1.84it/s, loss=0.0073, lr=1.57e-05, step=4718] Training: 47%|████▋ | 4719/10000 [1:03:13<47:49, 1.84it/s, loss=0.0020, lr=1.57e-05, step=4719]19:47:45.866 [I] step=4720 loss=0.0247 smoothed_loss=0.0157 lr=1.57e-05 grad_norm=0.4606 step_time=0.5071s data_time=0.0781s it/s=1.709 eta_to_10000=3089.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.1457 grad_arm_token_fuse=0.0474 grad_shared_expert=0.3627 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4720/10000 [1:03:14<51:49, 1.70it/s, loss=0.0020, lr=1.57e-05, step=4719] Training: 47%|████▋ | 4720/10000 [1:03:14<51:49, 1.70it/s, loss=0.0247, lr=1.57e-05, step=4720] Training: 47%|████▋ | 4721/10000 [1:03:14<53:04, 1.66it/s, loss=0.0247, lr=1.57e-05, step=4720] Training: 47%|████▋ | 4721/10000 [1:03:14<53:04, 1.66it/s, loss=0.0265, lr=1.57e-05, step=4721] Training: 47%|████▋ | 4722/10000 [1:03:15<1:03:18, 1.39it/s, loss=0.0265, lr=1.57e-05, step=4721] Training: 47%|████▋ | 4722/10000 [1:03:15<1:03:18, 1.39it/s, loss=0.0095, lr=1.57e-05, step=4722] Training: 47%|████▋ | 4723/10000 [1:03:16<57:11, 1.54it/s, loss=0.0095, lr=1.57e-05, step=4722] Training: 47%|████▋ | 4723/10000 [1:03:16<57:11, 1.54it/s, loss=0.0079, lr=1.57e-05, step=4723] Training: 47%|████▋ | 4724/10000 [1:03:16<53:40, 1.64it/s, loss=0.0079, lr=1.57e-05, step=4723] Training: 47%|████▋ | 4724/10000 [1:03:16<53:40, 1.64it/s, loss=0.0088, lr=1.57e-05, step=4724] Training: 47%|████▋ | 4725/10000 [1:03:17<51:00, 1.72it/s, loss=0.0088, lr=1.57e-05, step=4724] Training: 47%|████▋ | 4725/10000 [1:03:17<51:00, 1.72it/s, loss=0.0136, lr=1.57e-05, step=4725] Training: 47%|████▋ | 4726/10000 [1:03:17<48:32, 1.81it/s, loss=0.0136, lr=1.57e-05, step=4725] Training: 47%|████▋ | 4726/10000 [1:03:17<48:32, 1.81it/s, loss=0.0173, lr=1.57e-05, step=4726] Training: 47%|████▋ | 4727/10000 [1:03:18<56:15, 1.56it/s, loss=0.0173, lr=1.57e-05, step=4726] Training: 47%|████▋ | 4727/10000 [1:03:18<56:15, 1.56it/s, loss=0.0070, lr=1.57e-05, step=4727] Training: 47%|████▋ | 4728/10000 [1:03:19<1:10:54, 1.24it/s, loss=0.0070, lr=1.57e-05, step=4727] Training: 47%|████▋ | 4728/10000 [1:03:19<1:10:54, 1.24it/s, loss=0.0043, lr=1.57e-05, step=4728] Training: 47%|████▋ | 4729/10000 [1:03:20<1:13:42, 1.19it/s, loss=0.0043, lr=1.57e-05, step=4728] Training: 47%|████▋ | 4729/10000 [1:03:20<1:13:42, 1.19it/s, loss=0.0105, lr=1.57e-05, step=4729]19:47:53.080 [I] step=4730 loss=0.0238 smoothed_loss=0.0139 lr=1.57e-05 grad_norm=0.4622 step_time=0.6043s data_time=0.1172s it/s=1.386 eta_to_10000=3801.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0127 grad_action_out_proj_arms=0.1151 grad_arm_token_fuse=0.0655 grad_shared_expert=0.6848 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4730/10000 [1:03:21<1:08:09, 1.29it/s, loss=0.0105, lr=1.57e-05, step=4729] Training: 47%|████▋ | 4730/10000 [1:03:21<1:08:09, 1.29it/s, loss=0.0238, lr=1.57e-05, step=4730] Training: 47%|████▋ | 4731/10000 [1:03:21<1:07:13, 1.31it/s, loss=0.0238, lr=1.57e-05, step=4730] Training: 47%|████▋ | 4731/10000 [1:03:21<1:07:13, 1.31it/s, loss=0.0057, lr=1.57e-05, step=4731] Training: 47%|████▋ | 4732/10000 [1:03:22<59:56, 1.46it/s, loss=0.0057, lr=1.57e-05, step=4731] Training: 47%|████▋ | 4732/10000 [1:03:22<59:56, 1.46it/s, loss=0.0117, lr=1.57e-05, step=4732] Training: 47%|████▋ | 4733/10000 [1:03:23<1:03:38, 1.38it/s, loss=0.0117, lr=1.57e-05, step=4732] Training: 47%|████▋ | 4733/10000 [1:03:23<1:03:38, 1.38it/s, loss=0.0050, lr=1.57e-05, step=4733] Training: 47%|████▋ | 4734/10000 [1:03:23<1:01:40, 1.42it/s, loss=0.0050, lr=1.57e-05, step=4733] Training: 47%|████▋ | 4734/10000 [1:03:23<1:01:40, 1.42it/s, loss=0.0134, lr=1.57e-05, step=4734] Training: 47%|████▋ | 4735/10000 [1:03:24<1:01:26, 1.43it/s, loss=0.0134, lr=1.57e-05, step=4734] Training: 47%|████▋ | 4735/10000 [1:03:24<1:01:26, 1.43it/s, loss=0.0057, lr=1.57e-05, step=4735] Training: 47%|████▋ | 4736/10000 [1:03:25<1:07:26, 1.30it/s, loss=0.0057, lr=1.57e-05, step=4735] Training: 47%|████▋ | 4736/10000 [1:03:25<1:07:26, 1.30it/s, loss=0.0039, lr=1.57e-05, step=4736] Training: 47%|████▋ | 4737/10000 [1:03:26<1:05:39, 1.34it/s, loss=0.0039, lr=1.57e-05, step=4736] Training: 47%|████▋ | 4737/10000 [1:03:26<1:05:39, 1.34it/s, loss=0.0105, lr=1.57e-05, step=4737] Training: 47%|████▋ | 4738/10000 [1:03:27<1:18:13, 1.12it/s, loss=0.0105, lr=1.57e-05, step=4737] Training: 47%|████▋ | 4738/10000 [1:03:27<1:18:13, 1.12it/s, loss=0.0116, lr=1.56e-05, step=4738] Training: 47%|████▋ | 4739/10000 [1:03:28<1:10:48, 1.24it/s, loss=0.0116, lr=1.56e-05, step=4738] Training: 47%|████▋ | 4739/10000 [1:03:28<1:10:48, 1.24it/s, loss=0.0288, lr=1.56e-05, step=4739]19:48:00.765 [I] step=4740 loss=0.0125 smoothed_loss=0.0126 lr=1.57e-05 grad_norm=0.4057 step_time=0.6381s data_time=0.1304s it/s=1.301 eta_to_10000=4041.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0082 grad_action_out_proj_arms=0.0977 grad_arm_token_fuse=0.0379 grad_shared_expert=0.2711 (18633:train_pytorch.py:850) + Training: 47%|████▋ | 4740/10000 [1:03:28<1:11:05, 1.23it/s, loss=0.0288, lr=1.56e-05, step=4739] Training: 47%|████▋ | 4740/10000 [1:03:28<1:11:05, 1.23it/s, loss=0.0125, lr=1.56e-05, step=4740] Training: 47%|████▋ | 4741/10000 [1:03:29<1:03:28, 1.38it/s, loss=0.0125, lr=1.56e-05, step=4740] Training: 47%|████▋ | 4741/10000 [1:03:29<1:03:28, 1.38it/s, loss=0.0202, lr=1.56e-05, step=4741] Training: 47%|████▋ | 4742/10000 [1:03:30<1:06:12, 1.32it/s, loss=0.0202, lr=1.56e-05, step=4741] Training: 47%|████▋ | 4742/10000 [1:03:30<1:06:12, 1.32it/s, loss=0.0048, lr=1.56e-05, step=4742] Training: 47%|████▋ | 4743/10000 [1:03:31<1:09:55, 1.25it/s, loss=0.0048, lr=1.56e-05, step=4742] Training: 47%|████▋ | 4743/10000 [1:03:31<1:09:55, 1.25it/s, loss=0.0042, lr=1.56e-05, step=4743] Training: 47%|████▋ | 4744/10000 [1:03:31<1:05:39, 1.33it/s, loss=0.0042, lr=1.56e-05, step=4743] Training: 47%|████▋ | 4744/10000 [1:03:31<1:05:39, 1.33it/s, loss=0.0323, lr=1.56e-05, step=4744] Training: 47%|████▋ | 4745/10000 [1:03:32<59:28, 1.47it/s, loss=0.0323, lr=1.56e-05, step=4744] Training: 47%|████▋ | 4745/10000 [1:03:32<59:28, 1.47it/s, loss=0.0041, lr=1.56e-05, step=4745] Training: 47%|████▋ | 4746/10000 [1:03:32<57:18, 1.53it/s, loss=0.0041, lr=1.56e-05, step=4745] Training: 47%|████▋ | 4746/10000 [1:03:32<57:18, 1.53it/s, loss=0.0987, lr=1.56e-05, step=4746] Training: 47%|████▋ | 4747/10000 [1:03:33<53:09, 1.65it/s, loss=0.0987, lr=1.56e-05, step=4746] Training: 47%|████▋ | 4747/10000 [1:03:33<53:09, 1.65it/s, loss=0.0128, lr=1.56e-05, step=4747] Training: 47%|████▋ | 4748/10000 [1:03:33<50:11, 1.74it/s, loss=0.0128, lr=1.56e-05, step=4747] Training: 47%|████▋ | 4748/10000 [1:03:33<50:11, 1.74it/s, loss=0.0093, lr=1.56e-05, step=4748] Training: 47%|████▋ | 4749/10000 [1:03:34<1:01:13, 1.43it/s, loss=0.0093, lr=1.56e-05, step=4748] Training: 47%|████▋ | 4749/10000 [1:03:34<1:01:13, 1.43it/s, loss=0.0045, lr=1.56e-05, step=4749]19:48:07.793 [I] step=4750 loss=0.0127 smoothed_loss=0.0174 lr=1.56e-05 grad_norm=0.4630 step_time=0.5892s data_time=0.1135s it/s=1.423 eta_to_10000=3688.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0266 grad_action_out_proj_arms=0.2214 grad_arm_token_fuse=0.1425 grad_shared_expert=0.4773 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4750/10000 [1:03:35<1:10:18, 1.24it/s, loss=0.0045, lr=1.56e-05, step=4749] Training: 48%|████▊ | 4750/10000 [1:03:35<1:10:18, 1.24it/s, loss=0.0127, lr=1.56e-05, step=4750] Training: 48%|████▊ | 4751/10000 [1:03:36<1:09:57, 1.25it/s, loss=0.0127, lr=1.56e-05, step=4750] Training: 48%|████▊ | 4751/10000 [1:03:36<1:09:57, 1.25it/s, loss=0.0836, lr=1.56e-05, step=4751] Training: 48%|████▊ | 4752/10000 [1:03:37<1:03:33, 1.38it/s, loss=0.0836, lr=1.56e-05, step=4751] Training: 48%|████▊ | 4752/10000 [1:03:37<1:03:33, 1.38it/s, loss=0.0124, lr=1.56e-05, step=4752] Training: 48%|████▊ | 4753/10000 [1:03:38<1:03:52, 1.37it/s, loss=0.0124, lr=1.56e-05, step=4752] Training: 48%|████▊ | 4753/10000 [1:03:38<1:03:52, 1.37it/s, loss=0.0511, lr=1.56e-05, step=4753] Training: 48%|████▊ | 4754/10000 [1:03:38<1:01:31, 1.42it/s, loss=0.0511, lr=1.56e-05, step=4753] Training: 48%|████▊ | 4754/10000 [1:03:38<1:01:31, 1.42it/s, loss=0.0042, lr=1.56e-05, step=4754] Training: 48%|████▊ | 4755/10000 [1:03:39<55:47, 1.57it/s, loss=0.0042, lr=1.56e-05, step=4754] Training: 48%|████▊ | 4755/10000 [1:03:39<55:47, 1.57it/s, loss=0.0125, lr=1.56e-05, step=4755] Training: 48%|████▊ | 4756/10000 [1:03:39<55:50, 1.56it/s, loss=0.0125, lr=1.56e-05, step=4755] Training: 48%|████▊ | 4756/10000 [1:03:39<55:50, 1.56it/s, loss=0.0090, lr=1.56e-05, step=4756] Training: 48%|████▊ | 4757/10000 [1:03:40<59:10, 1.48it/s, loss=0.0090, lr=1.56e-05, step=4756] Training: 48%|████▊ | 4757/10000 [1:03:40<59:10, 1.48it/s, loss=0.0052, lr=1.56e-05, step=4757] Training: 48%|████▊ | 4758/10000 [1:03:41<58:09, 1.50it/s, loss=0.0052, lr=1.56e-05, step=4757] Training: 48%|████▊ | 4758/10000 [1:03:41<58:09, 1.50it/s, loss=0.0012, lr=1.56e-05, step=4758] Training: 48%|████▊ | 4759/10000 [1:03:41<56:05, 1.56it/s, loss=0.0012, lr=1.56e-05, step=4758] Training: 48%|████▊ | 4759/10000 [1:03:41<56:05, 1.56it/s, loss=0.0090, lr=1.56e-05, step=4759]19:48:14.360 [I] step=4760 loss=0.0215 smoothed_loss=0.0173 lr=1.56e-05 grad_norm=0.4783 step_time=0.5472s data_time=0.1096s it/s=1.523 eta_to_10000=3440.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0076 grad_action_out_proj_arms=0.1012 grad_arm_token_fuse=0.0414 grad_shared_expert=0.4656 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4760/10000 [1:03:42<58:09, 1.50it/s, loss=0.0090, lr=1.56e-05, step=4759] Training: 48%|████▊ | 4760/10000 [1:03:42<58:09, 1.50it/s, loss=0.0215, lr=1.56e-05, step=4760] Training: 48%|████▊ | 4761/10000 [1:03:43<53:24, 1.64it/s, loss=0.0215, lr=1.56e-05, step=4760] Training: 48%|████▊ | 4761/10000 [1:03:43<53:24, 1.64it/s, loss=0.0253, lr=1.56e-05, step=4761] Training: 48%|████▊ | 4762/10000 [1:03:43<50:32, 1.73it/s, loss=0.0253, lr=1.56e-05, step=4761] Training: 48%|████▊ | 4762/10000 [1:03:43<50:32, 1.73it/s, loss=0.0118, lr=1.56e-05, step=4762] Training: 48%|████▊ | 4763/10000 [1:03:44<52:23, 1.67it/s, loss=0.0118, lr=1.56e-05, step=4762] Training: 48%|████▊ | 4763/10000 [1:03:44<52:23, 1.67it/s, loss=0.0294, lr=1.56e-05, step=4763] Training: 48%|████▊ | 4764/10000 [1:03:45<1:09:24, 1.26it/s, loss=0.0294, lr=1.56e-05, step=4763] Training: 48%|████▊ | 4764/10000 [1:03:45<1:09:24, 1.26it/s, loss=0.0069, lr=1.56e-05, step=4764] Training: 48%|████▊ | 4765/10000 [1:03:46<1:19:07, 1.10it/s, loss=0.0069, lr=1.56e-05, step=4764] Training: 48%|████▊ | 4765/10000 [1:03:46<1:19:07, 1.10it/s, loss=0.0044, lr=1.56e-05, step=4765] Training: 48%|████▊ | 4766/10000 [1:03:47<1:14:26, 1.17it/s, loss=0.0044, lr=1.56e-05, step=4765] Training: 48%|████▊ | 4766/10000 [1:03:47<1:14:26, 1.17it/s, loss=0.0038, lr=1.55e-05, step=4766] Training: 48%|████▊ | 4767/10000 [1:03:48<1:12:20, 1.21it/s, loss=0.0038, lr=1.55e-05, step=4766] Training: 48%|████▊ | 4767/10000 [1:03:48<1:12:20, 1.21it/s, loss=0.0797, lr=1.55e-05, step=4767] Training: 48%|████▊ | 4768/10000 [1:03:48<1:08:09, 1.28it/s, loss=0.0797, lr=1.55e-05, step=4767] Training: 48%|████▊ | 4768/10000 [1:03:48<1:08:09, 1.28it/s, loss=0.0207, lr=1.55e-05, step=4768] Training: 48%|████▊ | 4769/10000 [1:03:49<1:04:41, 1.35it/s, loss=0.0207, lr=1.55e-05, step=4768] Training: 48%|████▊ | 4769/10000 [1:03:49<1:04:41, 1.35it/s, loss=0.0180, lr=1.55e-05, step=4769]19:48:21.900 [I] step=4770 loss=0.0080 smoothed_loss=0.0197 lr=1.55e-05 grad_norm=0.5290 step_time=0.5885s data_time=0.1655s it/s=1.326 eta_to_10000=3942.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0216 grad_action_out_proj_arms=0.1561 grad_arm_token_fuse=0.1177 grad_shared_expert=0.7397 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4770/10000 [1:03:50<1:02:36, 1.39it/s, loss=0.0180, lr=1.55e-05, step=4769] Training: 48%|████▊ | 4770/10000 [1:03:50<1:02:36, 1.39it/s, loss=0.0080, lr=1.55e-05, step=4770] Training: 48%|████▊ | 4771/10000 [1:03:50<1:00:42, 1.44it/s, loss=0.0080, lr=1.55e-05, step=4770] Training: 48%|████▊ | 4771/10000 [1:03:50<1:00:42, 1.44it/s, loss=0.0261, lr=1.55e-05, step=4771] Training: 48%|████▊ | 4772/10000 [1:03:51<1:04:51, 1.34it/s, loss=0.0261, lr=1.55e-05, step=4771] Training: 48%|████▊ | 4772/10000 [1:03:51<1:04:51, 1.34it/s, loss=0.0032, lr=1.55e-05, step=4772] Training: 48%|████▊ | 4773/10000 [1:03:52<59:06, 1.47it/s, loss=0.0032, lr=1.55e-05, step=4772] Training: 48%|████▊ | 4773/10000 [1:03:52<59:06, 1.47it/s, loss=0.0047, lr=1.55e-05, step=4773] Training: 48%|████▊ | 4774/10000 [1:03:52<57:08, 1.52it/s, loss=0.0047, lr=1.55e-05, step=4773] Training: 48%|████▊ | 4774/10000 [1:03:52<57:08, 1.52it/s, loss=0.0107, lr=1.55e-05, step=4774] Training: 48%|████▊ | 4775/10000 [1:03:53<56:18, 1.55it/s, loss=0.0107, lr=1.55e-05, step=4774] Training: 48%|████▊ | 4775/10000 [1:03:53<56:18, 1.55it/s, loss=0.0038, lr=1.55e-05, step=4775] Training: 48%|████▊ | 4776/10000 [1:03:53<54:19, 1.60it/s, loss=0.0038, lr=1.55e-05, step=4775] Training: 48%|████▊ | 4776/10000 [1:03:53<54:19, 1.60it/s, loss=0.0056, lr=1.55e-05, step=4776] Training: 48%|████▊ | 4777/10000 [1:03:55<1:06:53, 1.30it/s, loss=0.0056, lr=1.55e-05, step=4776] Training: 48%|████▊ | 4777/10000 [1:03:55<1:06:53, 1.30it/s, loss=0.0072, lr=1.55e-05, step=4777] Training: 48%|████▊ | 4778/10000 [1:03:55<1:05:23, 1.33it/s, loss=0.0072, lr=1.55e-05, step=4777] Training: 48%|████▊ | 4778/10000 [1:03:55<1:05:23, 1.33it/s, loss=0.0358, lr=1.55e-05, step=4778] Training: 48%|████▊ | 4779/10000 [1:03:56<1:11:38, 1.21it/s, loss=0.0358, lr=1.55e-05, step=4778] Training: 48%|████▊ | 4779/10000 [1:03:56<1:11:38, 1.21it/s, loss=0.0206, lr=1.55e-05, step=4779]19:48:29.415 [I] step=4780 loss=0.0061 smoothed_loss=0.0153 lr=1.55e-05 grad_norm=0.4777 step_time=0.6065s data_time=0.1451s it/s=1.331 eta_to_10000=3922.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0147 grad_action_out_proj_arms=0.1383 grad_arm_token_fuse=0.0729 grad_shared_expert=0.3592 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4780/10000 [1:03:57<1:13:09, 1.19it/s, loss=0.0206, lr=1.55e-05, step=4779] Training: 48%|████▊ | 4780/10000 [1:03:57<1:13:09, 1.19it/s, loss=0.0061, lr=1.55e-05, step=4780] Training: 48%|████▊ | 4781/10000 [1:03:58<1:05:54, 1.32it/s, loss=0.0061, lr=1.55e-05, step=4780] Training: 48%|████▊ | 4781/10000 [1:03:58<1:05:54, 1.32it/s, loss=0.0203, lr=1.55e-05, step=4781] Training: 48%|████▊ | 4782/10000 [1:03:58<1:01:53, 1.41it/s, loss=0.0203, lr=1.55e-05, step=4781] Training: 48%|████▊ | 4782/10000 [1:03:58<1:01:53, 1.41it/s, loss=0.0182, lr=1.55e-05, step=4782] Training: 48%|████▊ | 4783/10000 [1:03:59<1:03:26, 1.37it/s, loss=0.0182, lr=1.55e-05, step=4782] Training: 48%|████▊ | 4783/10000 [1:03:59<1:03:26, 1.37it/s, loss=0.0205, lr=1.55e-05, step=4783] Training: 48%|████▊ | 4784/10000 [1:04:00<1:00:37, 1.43it/s, loss=0.0205, lr=1.55e-05, step=4783] Training: 48%|████▊ | 4784/10000 [1:04:00<1:00:37, 1.43it/s, loss=0.0054, lr=1.55e-05, step=4784] Training: 48%|████▊ | 4785/10000 [1:04:00<1:02:06, 1.40it/s, loss=0.0054, lr=1.55e-05, step=4784] Training: 48%|████▊ | 4785/10000 [1:04:00<1:02:06, 1.40it/s, loss=0.0116, lr=1.55e-05, step=4785] Training: 48%|████▊ | 4786/10000 [1:04:01<1:03:05, 1.38it/s, loss=0.0116, lr=1.55e-05, step=4785] Training: 48%|████▊ | 4786/10000 [1:04:01<1:03:05, 1.38it/s, loss=0.0170, lr=1.55e-05, step=4786] Training: 48%|████▊ | 4787/10000 [1:04:02<1:04:18, 1.35it/s, loss=0.0170, lr=1.55e-05, step=4786] Training: 48%|████▊ | 4787/10000 [1:04:02<1:04:18, 1.35it/s, loss=0.0060, lr=1.55e-05, step=4787] Training: 48%|████▊ | 4788/10000 [1:04:02<58:34, 1.48it/s, loss=0.0060, lr=1.55e-05, step=4787] Training: 48%|████▊ | 4788/10000 [1:04:02<58:34, 1.48it/s, loss=0.0064, lr=1.55e-05, step=4788] Training: 48%|████▊ | 4789/10000 [1:04:03<54:12, 1.60it/s, loss=0.0064, lr=1.55e-05, step=4788] Training: 48%|████▊ | 4789/10000 [1:04:03<54:12, 1.60it/s, loss=0.0274, lr=1.55e-05, step=4789]19:48:35.891 [I] step=4790 loss=0.0243 smoothed_loss=0.0158 lr=1.55e-05 grad_norm=0.5400 step_time=0.5345s data_time=0.1130s it/s=1.545 eta_to_10000=3372.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0278 grad_action_out_proj_arms=0.2647 grad_arm_token_fuse=0.1527 grad_shared_expert=0.7109 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4790/10000 [1:04:04<53:42, 1.62it/s, loss=0.0274, lr=1.55e-05, step=4789] Training: 48%|████▊ | 4790/10000 [1:04:04<53:42, 1.62it/s, loss=0.0243, lr=1.55e-05, step=4790] Training: 48%|████▊ | 4791/10000 [1:04:04<50:09, 1.73it/s, loss=0.0243, lr=1.55e-05, step=4790] Training: 48%|████▊ | 4791/10000 [1:04:04<50:09, 1.73it/s, loss=0.0089, lr=1.55e-05, step=4791] Training: 48%|████▊ | 4792/10000 [1:04:05<52:58, 1.64it/s, loss=0.0089, lr=1.55e-05, step=4791] Training: 48%|████▊ | 4792/10000 [1:04:05<52:58, 1.64it/s, loss=0.0208, lr=1.55e-05, step=4792] Training: 48%|████▊ | 4793/10000 [1:04:06<58:32, 1.48it/s, loss=0.0208, lr=1.55e-05, step=4792] Training: 48%|████▊ | 4793/10000 [1:04:06<58:32, 1.48it/s, loss=0.0090, lr=1.54e-05, step=4793] Training: 48%|████▊ | 4794/10000 [1:04:06<58:49, 1.47it/s, loss=0.0090, lr=1.54e-05, step=4793] Training: 48%|████▊ | 4794/10000 [1:04:06<58:49, 1.47it/s, loss=0.0100, lr=1.54e-05, step=4794] Training: 48%|████▊ | 4795/10000 [1:04:07<59:11, 1.47it/s, loss=0.0100, lr=1.54e-05, step=4794] Training: 48%|████▊ | 4795/10000 [1:04:07<59:11, 1.47it/s, loss=0.0169, lr=1.54e-05, step=4795] Training: 48%|████▊ | 4796/10000 [1:04:07<55:50, 1.55it/s, loss=0.0169, lr=1.54e-05, step=4795] Training: 48%|████▊ | 4796/10000 [1:04:07<55:50, 1.55it/s, loss=0.0114, lr=1.54e-05, step=4796] Training: 48%|████▊ | 4797/10000 [1:04:08<53:04, 1.63it/s, loss=0.0114, lr=1.54e-05, step=4796] Training: 48%|████▊ | 4797/10000 [1:04:08<53:04, 1.63it/s, loss=0.0033, lr=1.54e-05, step=4797] Training: 48%|████▊ | 4798/10000 [1:04:09<53:50, 1.61it/s, loss=0.0033, lr=1.54e-05, step=4797] Training: 48%|████▊ | 4798/10000 [1:04:09<53:50, 1.61it/s, loss=0.0370, lr=1.54e-05, step=4798] Training: 48%|████▊ | 4799/10000 [1:04:09<57:09, 1.52it/s, loss=0.0370, lr=1.54e-05, step=4798] Training: 48%|████▊ | 4799/10000 [1:04:09<57:09, 1.52it/s, loss=0.0178, lr=1.54e-05, step=4799]19:48:42.755 [I] step=4800 loss=0.0063 smoothed_loss=0.0149 lr=1.54e-05 grad_norm=0.4466 step_time=0.5558s data_time=0.1306s it/s=1.457 eta_to_10000=3568.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0218 grad_action_out_proj_arms=0.1456 grad_arm_token_fuse=0.1179 grad_shared_expert=0.4484 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4800/10000 [1:04:10<1:06:14, 1.31it/s, loss=0.0178, lr=1.54e-05, step=4799] Training: 48%|████▊ | 4800/10000 [1:04:10<1:06:14, 1.31it/s, loss=0.0063, lr=1.54e-05, step=4800] Training: 48%|████▊ | 4801/10000 [1:04:11<1:05:59, 1.31it/s, loss=0.0063, lr=1.54e-05, step=4800] Training: 48%|████▊ | 4801/10000 [1:04:11<1:05:59, 1.31it/s, loss=0.0026, lr=1.54e-05, step=4801] Training: 48%|████▊ | 4802/10000 [1:04:12<1:05:15, 1.33it/s, loss=0.0026, lr=1.54e-05, step=4801] Training: 48%|████▊ | 4802/10000 [1:04:12<1:05:15, 1.33it/s, loss=0.0029, lr=1.54e-05, step=4802] Training: 48%|████▊ | 4803/10000 [1:04:13<1:07:11, 1.29it/s, loss=0.0029, lr=1.54e-05, step=4802] Training: 48%|████▊ | 4803/10000 [1:04:13<1:07:11, 1.29it/s, loss=0.0123, lr=1.54e-05, step=4803] Training: 48%|████▊ | 4804/10000 [1:04:13<1:01:37, 1.41it/s, loss=0.0123, lr=1.54e-05, step=4803] Training: 48%|████▊ | 4804/10000 [1:04:13<1:01:37, 1.41it/s, loss=0.0175, lr=1.54e-05, step=4804] Training: 48%|████▊ | 4805/10000 [1:04:14<1:02:49, 1.38it/s, loss=0.0175, lr=1.54e-05, step=4804] Training: 48%|████▊ | 4805/10000 [1:04:14<1:02:49, 1.38it/s, loss=0.0127, lr=1.54e-05, step=4805] Training: 48%|████▊ | 4806/10000 [1:04:15<58:51, 1.47it/s, loss=0.0127, lr=1.54e-05, step=4805] Training: 48%|████▊ | 4806/10000 [1:04:15<58:51, 1.47it/s, loss=0.0174, lr=1.54e-05, step=4806] Training: 48%|████▊ | 4807/10000 [1:04:15<1:02:23, 1.39it/s, loss=0.0174, lr=1.54e-05, step=4806] Training: 48%|████▊ | 4807/10000 [1:04:15<1:02:23, 1.39it/s, loss=0.0028, lr=1.54e-05, step=4807] Training: 48%|████▊ | 4808/10000 [1:04:16<1:02:40, 1.38it/s, loss=0.0028, lr=1.54e-05, step=4807] Training: 48%|████▊ | 4808/10000 [1:04:16<1:02:40, 1.38it/s, loss=0.0147, lr=1.54e-05, step=4808] Training: 48%|████▊ | 4809/10000 [1:04:17<59:56, 1.44it/s, loss=0.0147, lr=1.54e-05, step=4808] Training: 48%|████▊ | 4809/10000 [1:04:17<59:56, 1.44it/s, loss=0.0230, lr=1.54e-05, step=4809]19:48:50.066 [I] step=4810 loss=0.0395 smoothed_loss=0.0163 lr=1.54e-05 grad_norm=0.4655 step_time=0.5990s data_time=0.1322s it/s=1.368 eta_to_10000=3794.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0333 grad_action_out_proj_arms=0.1727 grad_arm_token_fuse=0.1735 grad_shared_expert=0.6395 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4810/10000 [1:04:18<1:06:08, 1.31it/s, loss=0.0230, lr=1.54e-05, step=4809] Training: 48%|████▊ | 4810/10000 [1:04:18<1:06:08, 1.31it/s, loss=0.0395, lr=1.54e-05, step=4810] Training: 48%|████▊ | 4811/10000 [1:04:18<1:05:14, 1.33it/s, loss=0.0395, lr=1.54e-05, step=4810] Training: 48%|████▊ | 4811/10000 [1:04:18<1:05:14, 1.33it/s, loss=0.0121, lr=1.54e-05, step=4811] Training: 48%|████▊ | 4812/10000 [1:04:19<1:09:58, 1.24it/s, loss=0.0121, lr=1.54e-05, step=4811] Training: 48%|████▊ | 4812/10000 [1:04:19<1:09:58, 1.24it/s, loss=0.0463, lr=1.54e-05, step=4812] Training: 48%|████▊ | 4813/10000 [1:04:20<1:05:07, 1.33it/s, loss=0.0463, lr=1.54e-05, step=4812] Training: 48%|████▊ | 4813/10000 [1:04:20<1:05:07, 1.33it/s, loss=0.0194, lr=1.54e-05, step=4813] Training: 48%|████▊ | 4814/10000 [1:04:21<1:03:57, 1.35it/s, loss=0.0194, lr=1.54e-05, step=4813] Training: 48%|████▊ | 4814/10000 [1:04:21<1:03:57, 1.35it/s, loss=0.0264, lr=1.54e-05, step=4814] Training: 48%|████▊ | 4815/10000 [1:04:22<1:06:26, 1.30it/s, loss=0.0264, lr=1.54e-05, step=4814] Training: 48%|████▊ | 4815/10000 [1:04:22<1:06:26, 1.30it/s, loss=0.0054, lr=1.54e-05, step=4815] Training: 48%|████▊ | 4816/10000 [1:04:22<1:01:42, 1.40it/s, loss=0.0054, lr=1.54e-05, step=4815] Training: 48%|████▊ | 4816/10000 [1:04:22<1:01:42, 1.40it/s, loss=0.0070, lr=1.54e-05, step=4816] Training: 48%|████▊ | 4817/10000 [1:04:23<1:02:45, 1.38it/s, loss=0.0070, lr=1.54e-05, step=4816] Training: 48%|████▊ | 4817/10000 [1:04:23<1:02:45, 1.38it/s, loss=0.0198, lr=1.54e-05, step=4817] Training: 48%|████▊ | 4818/10000 [1:04:24<1:06:48, 1.29it/s, loss=0.0198, lr=1.54e-05, step=4817] Training: 48%|████▊ | 4818/10000 [1:04:24<1:06:48, 1.29it/s, loss=0.0166, lr=1.54e-05, step=4818] Training: 48%|████▊ | 4819/10000 [1:04:25<1:06:55, 1.29it/s, loss=0.0166, lr=1.54e-05, step=4818] Training: 48%|████▊ | 4819/10000 [1:04:25<1:06:55, 1.29it/s, loss=0.0039, lr=1.54e-05, step=4819]19:48:57.447 [I] step=4820 loss=0.0099 smoothed_loss=0.0154 lr=1.54e-05 grad_norm=0.4734 step_time=0.5890s data_time=0.1491s it/s=1.355 eta_to_10000=3822.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0238 grad_action_out_proj_arms=0.1482 grad_arm_token_fuse=0.1325 grad_shared_expert=0.4909 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4820/10000 [1:04:25<1:00:51, 1.42it/s, loss=0.0039, lr=1.54e-05, step=4819] Training: 48%|████▊ | 4820/10000 [1:04:25<1:00:51, 1.42it/s, loss=0.0099, lr=1.53e-05, step=4820] Training: 48%|████▊ | 4821/10000 [1:04:26<1:00:11, 1.43it/s, loss=0.0099, lr=1.53e-05, step=4820] Training: 48%|████▊ | 4821/10000 [1:04:26<1:00:11, 1.43it/s, loss=0.0184, lr=1.53e-05, step=4821] Training: 48%|████▊ | 4822/10000 [1:04:27<1:06:37, 1.30it/s, loss=0.0184, lr=1.53e-05, step=4821] Training: 48%|████▊ | 4822/10000 [1:04:27<1:06:37, 1.30it/s, loss=0.0084, lr=1.53e-05, step=4822] Training: 48%|████▊ | 4823/10000 [1:04:27<59:28, 1.45it/s, loss=0.0084, lr=1.53e-05, step=4822] Training: 48%|████▊ | 4823/10000 [1:04:27<59:28, 1.45it/s, loss=0.0025, lr=1.53e-05, step=4823] Training: 48%|████▊ | 4824/10000 [1:04:28<54:28, 1.58it/s, loss=0.0025, lr=1.53e-05, step=4823] Training: 48%|████▊ | 4824/10000 [1:04:28<54:28, 1.58it/s, loss=0.0290, lr=1.53e-05, step=4824] Training: 48%|████▊ | 4825/10000 [1:04:29<1:02:55, 1.37it/s, loss=0.0290, lr=1.53e-05, step=4824] Training: 48%|████▊ | 4825/10000 [1:04:29<1:02:55, 1.37it/s, loss=0.0138, lr=1.53e-05, step=4825] Training: 48%|████▊ | 4826/10000 [1:04:29<57:30, 1.50it/s, loss=0.0138, lr=1.53e-05, step=4825] Training: 48%|████▊ | 4826/10000 [1:04:29<57:30, 1.50it/s, loss=0.0135, lr=1.53e-05, step=4826] Training: 48%|████▊ | 4827/10000 [1:04:30<58:31, 1.47it/s, loss=0.0135, lr=1.53e-05, step=4826] Training: 48%|████▊ | 4827/10000 [1:04:30<58:31, 1.47it/s, loss=0.0183, lr=1.53e-05, step=4827] Training: 48%|████▊ | 4828/10000 [1:04:31<1:07:25, 1.28it/s, loss=0.0183, lr=1.53e-05, step=4827] Training: 48%|████▊ | 4828/10000 [1:04:31<1:07:25, 1.28it/s, loss=0.0056, lr=1.53e-05, step=4828] Training: 48%|████▊ | 4829/10000 [1:04:32<1:11:29, 1.21it/s, loss=0.0056, lr=1.53e-05, step=4828] Training: 48%|████▊ | 4829/10000 [1:04:32<1:11:29, 1.21it/s, loss=0.0043, lr=1.53e-05, step=4829]19:49:05.212 [I] step=4830 loss=0.0056 smoothed_loss=0.0125 lr=1.53e-05 grad_norm=0.3925 step_time=0.6288s data_time=0.1477s it/s=1.288 eta_to_10000=4013.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0069 grad_action_out_proj_arms=0.1049 grad_arm_token_fuse=0.0337 grad_shared_expert=0.3722 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4830/10000 [1:04:33<1:15:47, 1.14it/s, loss=0.0043, lr=1.53e-05, step=4829] Training: 48%|████▊ | 4830/10000 [1:04:33<1:15:47, 1.14it/s, loss=0.0056, lr=1.53e-05, step=4830] Training: 48%|████▊ | 4831/10000 [1:04:34<1:14:39, 1.15it/s, loss=0.0056, lr=1.53e-05, step=4830] Training: 48%|████▊ | 4831/10000 [1:04:34<1:14:39, 1.15it/s, loss=0.0098, lr=1.53e-05, step=4831] Training: 48%|████▊ | 4832/10000 [1:04:34<1:09:36, 1.24it/s, loss=0.0098, lr=1.53e-05, step=4831] Training: 48%|████▊ | 4832/10000 [1:04:34<1:09:36, 1.24it/s, loss=0.0111, lr=1.53e-05, step=4832] Training: 48%|████▊ | 4833/10000 [1:04:35<1:05:11, 1.32it/s, loss=0.0111, lr=1.53e-05, step=4832] Training: 48%|████▊ | 4833/10000 [1:04:35<1:05:11, 1.32it/s, loss=0.0098, lr=1.53e-05, step=4833] Training: 48%|████▊ | 4834/10000 [1:04:36<1:05:09, 1.32it/s, loss=0.0098, lr=1.53e-05, step=4833] Training: 48%|████▊ | 4834/10000 [1:04:36<1:05:09, 1.32it/s, loss=0.0068, lr=1.53e-05, step=4834] Training: 48%|████▊ | 4835/10000 [1:04:37<1:12:37, 1.19it/s, loss=0.0068, lr=1.53e-05, step=4834] Training: 48%|████▊ | 4835/10000 [1:04:37<1:12:37, 1.19it/s, loss=0.0314, lr=1.53e-05, step=4835] Training: 48%|████▊ | 4836/10000 [1:04:38<1:23:02, 1.04it/s, loss=0.0314, lr=1.53e-05, step=4835] Training: 48%|████▊ | 4836/10000 [1:04:38<1:23:02, 1.04it/s, loss=0.0122, lr=1.53e-05, step=4836] Training: 48%|████▊ | 4837/10000 [1:04:39<1:19:24, 1.08it/s, loss=0.0122, lr=1.53e-05, step=4836] Training: 48%|████▊ | 4837/10000 [1:04:39<1:19:24, 1.08it/s, loss=0.0073, lr=1.53e-05, step=4837] Training: 48%|████▊ | 4838/10000 [1:04:39<1:10:34, 1.22it/s, loss=0.0073, lr=1.53e-05, step=4837] Training: 48%|████▊ | 4838/10000 [1:04:39<1:10:34, 1.22it/s, loss=0.0253, lr=1.53e-05, step=4838] Training: 48%|████▊ | 4839/10000 [1:04:40<1:06:06, 1.30it/s, loss=0.0253, lr=1.53e-05, step=4838] Training: 48%|████▊ | 4839/10000 [1:04:40<1:06:06, 1.30it/s, loss=0.0082, lr=1.53e-05, step=4839]19:49:13.024 [I] step=4840 loss=0.0113 smoothed_loss=0.0132 lr=1.53e-05 grad_norm=0.4493 step_time=0.6096s data_time=0.1717s it/s=1.280 eta_to_10000=4030.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0164 grad_action_out_proj_arms=0.1163 grad_arm_token_fuse=0.0759 grad_shared_expert=0.4806 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4840/10000 [1:04:41<1:00:47, 1.41it/s, loss=0.0082, lr=1.53e-05, step=4839] Training: 48%|████▊ | 4840/10000 [1:04:41<1:00:47, 1.41it/s, loss=0.0113, lr=1.53e-05, step=4840] Training: 48%|████▊ | 4841/10000 [1:04:41<56:17, 1.53it/s, loss=0.0113, lr=1.53e-05, step=4840] Training: 48%|████▊ | 4841/10000 [1:04:41<56:17, 1.53it/s, loss=0.0025, lr=1.53e-05, step=4841] Training: 48%|████▊ | 4842/10000 [1:04:42<1:01:56, 1.39it/s, loss=0.0025, lr=1.53e-05, step=4841] Training: 48%|████▊ | 4842/10000 [1:04:42<1:01:56, 1.39it/s, loss=0.0113, lr=1.53e-05, step=4842] Training: 48%|████▊ | 4843/10000 [1:04:43<1:09:14, 1.24it/s, loss=0.0113, lr=1.53e-05, step=4842] Training: 48%|████▊ | 4843/10000 [1:04:43<1:09:14, 1.24it/s, loss=0.0056, lr=1.53e-05, step=4843] Training: 48%|████▊ | 4844/10000 [1:04:44<1:10:34, 1.22it/s, loss=0.0056, lr=1.53e-05, step=4843] Training: 48%|████▊ | 4844/10000 [1:04:44<1:10:34, 1.22it/s, loss=0.0223, lr=1.53e-05, step=4844] Training: 48%|████▊ | 4845/10000 [1:04:45<1:08:34, 1.25it/s, loss=0.0223, lr=1.53e-05, step=4844] Training: 48%|████▊ | 4845/10000 [1:04:45<1:08:34, 1.25it/s, loss=0.0150, lr=1.53e-05, step=4845] Training: 48%|████▊ | 4846/10000 [1:04:45<1:00:55, 1.41it/s, loss=0.0150, lr=1.53e-05, step=4845] Training: 48%|████▊ | 4846/10000 [1:04:45<1:00:55, 1.41it/s, loss=0.0327, lr=1.53e-05, step=4846] Training: 48%|████▊ | 4847/10000 [1:04:46<55:30, 1.55it/s, loss=0.0327, lr=1.53e-05, step=4846] Training: 48%|████▊ | 4847/10000 [1:04:46<55:30, 1.55it/s, loss=0.0577, lr=1.52e-05, step=4847] Training: 48%|████▊ | 4848/10000 [1:04:46<59:07, 1.45it/s, loss=0.0577, lr=1.52e-05, step=4847] Training: 48%|████▊ | 4848/10000 [1:04:46<59:07, 1.45it/s, loss=0.0043, lr=1.52e-05, step=4848] Training: 48%|████▊ | 4849/10000 [1:04:47<57:39, 1.49it/s, loss=0.0043, lr=1.52e-05, step=4848] Training: 48%|████▊ | 4849/10000 [1:04:47<57:39, 1.49it/s, loss=0.0091, lr=1.52e-05, step=4849]19:49:20.565 [I] step=4850 loss=0.0252 smoothed_loss=0.0176 lr=1.53e-05 grad_norm=0.4631 step_time=0.6103s data_time=0.1437s it/s=1.326 eta_to_10000=3882.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0234 grad_action_out_proj_arms=0.1871 grad_arm_token_fuse=0.1261 grad_shared_expert=0.5358 (18633:train_pytorch.py:850) + Training: 48%|████▊ | 4850/10000 [1:04:48<1:08:51, 1.25it/s, loss=0.0091, lr=1.52e-05, step=4849] Training: 48%|████▊ | 4850/10000 [1:04:48<1:08:51, 1.25it/s, loss=0.0252, lr=1.52e-05, step=4850] Training: 49%|████▊ | 4851/10000 [1:04:49<1:09:50, 1.23it/s, loss=0.0252, lr=1.52e-05, step=4850] Training: 49%|████▊ | 4851/10000 [1:04:49<1:09:50, 1.23it/s, loss=0.0142, lr=1.52e-05, step=4851] Training: 49%|████▊ | 4852/10000 [1:04:50<1:04:32, 1.33it/s, loss=0.0142, lr=1.52e-05, step=4851] Training: 49%|████▊ | 4852/10000 [1:04:50<1:04:32, 1.33it/s, loss=0.0095, lr=1.52e-05, step=4852] Training: 49%|████▊ | 4853/10000 [1:04:51<1:13:56, 1.16it/s, loss=0.0095, lr=1.52e-05, step=4852] Training: 49%|████▊ | 4853/10000 [1:04:51<1:13:56, 1.16it/s, loss=0.0309, lr=1.52e-05, step=4853] Training: 49%|████▊ | 4854/10000 [1:04:52<1:22:14, 1.04it/s, loss=0.0309, lr=1.52e-05, step=4853] Training: 49%|████▊ | 4854/10000 [1:04:52<1:22:14, 1.04it/s, loss=0.0244, lr=1.52e-05, step=4854] Training: 49%|████▊ | 4855/10000 [1:04:53<1:21:11, 1.06it/s, loss=0.0244, lr=1.52e-05, step=4854] Training: 49%|████▊ | 4855/10000 [1:04:53<1:21:11, 1.06it/s, loss=0.0048, lr=1.52e-05, step=4855] Training: 49%|████▊ | 4856/10000 [1:04:54<1:15:36, 1.13it/s, loss=0.0048, lr=1.52e-05, step=4855] Training: 49%|████▊ | 4856/10000 [1:04:54<1:15:36, 1.13it/s, loss=0.0106, lr=1.52e-05, step=4856] Training: 49%|████▊ | 4857/10000 [1:04:55<1:19:30, 1.08it/s, loss=0.0106, lr=1.52e-05, step=4856] Training: 49%|████▊ | 4857/10000 [1:04:55<1:19:30, 1.08it/s, loss=0.0253, lr=1.52e-05, step=4857] Training: 49%|████▊ | 4858/10000 [1:04:56<1:21:08, 1.06it/s, loss=0.0253, lr=1.52e-05, step=4857] Training: 49%|████▊ | 4858/10000 [1:04:56<1:21:08, 1.06it/s, loss=0.0123, lr=1.52e-05, step=4858] Training: 49%|████▊ | 4859/10000 [1:04:56<1:14:53, 1.14it/s, loss=0.0123, lr=1.52e-05, step=4858] Training: 49%|████▊ | 4859/10000 [1:04:56<1:14:53, 1.14it/s, loss=0.0059, lr=1.52e-05, step=4859]19:49:29.635 [I] step=4860 loss=0.0185 smoothed_loss=0.0161 lr=1.52e-05 grad_norm=0.4854 step_time=0.6539s data_time=0.2531s it/s=1.103 eta_to_10000=4661.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0307 grad_action_out_proj_arms=0.2332 grad_arm_token_fuse=0.1569 grad_shared_expert=0.5534 (18633:train_pytorch.py:850) + Training: 49%|████▊ | 4860/10000 [1:04:57<1:16:31, 1.12it/s, loss=0.0059, lr=1.52e-05, step=4859] Training: 49%|████▊ | 4860/10000 [1:04:57<1:16:31, 1.12it/s, loss=0.0185, lr=1.52e-05, step=4860] Training: 49%|████▊ | 4861/10000 [1:04:58<1:07:28, 1.27it/s, loss=0.0185, lr=1.52e-05, step=4860] Training: 49%|████▊ | 4861/10000 [1:04:58<1:07:28, 1.27it/s, loss=0.0195, lr=1.52e-05, step=4861] Training: 49%|████▊ | 4862/10000 [1:04:58<59:50, 1.43it/s, loss=0.0195, lr=1.52e-05, step=4861] Training: 49%|████▊ | 4862/10000 [1:04:58<59:50, 1.43it/s, loss=0.0092, lr=1.52e-05, step=4862] Training: 49%|████▊ | 4863/10000 [1:04:59<55:30, 1.54it/s, loss=0.0092, lr=1.52e-05, step=4862] Training: 49%|████▊ | 4863/10000 [1:04:59<55:30, 1.54it/s, loss=0.0065, lr=1.52e-05, step=4863] Training: 49%|████▊ | 4864/10000 [1:05:00<56:47, 1.51it/s, loss=0.0065, lr=1.52e-05, step=4863] Training: 49%|████▊ | 4864/10000 [1:05:00<56:47, 1.51it/s, loss=0.0061, lr=1.52e-05, step=4864] Training: 49%|████▊ | 4865/10000 [1:05:00<1:03:21, 1.35it/s, loss=0.0061, lr=1.52e-05, step=4864] Training: 49%|████▊ | 4865/10000 [1:05:00<1:03:21, 1.35it/s, loss=0.0131, lr=1.52e-05, step=4865] Training: 49%|████▊ | 4866/10000 [1:05:01<57:28, 1.49it/s, loss=0.0131, lr=1.52e-05, step=4865] Training: 49%|████▊ | 4866/10000 [1:05:01<57:28, 1.49it/s, loss=0.0319, lr=1.52e-05, step=4866] Training: 49%|████▊ | 4867/10000 [1:05:02<1:00:42, 1.41it/s, loss=0.0319, lr=1.52e-05, step=4866] Training: 49%|████▊ | 4867/10000 [1:05:02<1:00:42, 1.41it/s, loss=0.0172, lr=1.52e-05, step=4867] Training: 49%|████▊ | 4868/10000 [1:05:03<1:00:45, 1.41it/s, loss=0.0172, lr=1.52e-05, step=4867] Training: 49%|████▊ | 4868/10000 [1:05:03<1:00:45, 1.41it/s, loss=0.0058, lr=1.52e-05, step=4868] Training: 49%|████▊ | 4869/10000 [1:05:03<1:02:33, 1.37it/s, loss=0.0058, lr=1.52e-05, step=4868] Training: 49%|████▊ | 4869/10000 [1:05:03<1:02:33, 1.37it/s, loss=0.0130, lr=1.52e-05, step=4869]19:49:36.315 [I] step=4870 loss=0.0235 smoothed_loss=0.0155 lr=1.52e-05 grad_norm=0.4740 step_time=0.5460s data_time=0.1219s it/s=1.497 eta_to_10000=3425.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0073 grad_action_out_proj_arms=0.0901 grad_arm_token_fuse=0.0367 grad_shared_expert=0.3930 (18633:train_pytorch.py:850) + Training: 49%|████▊ | 4870/10000 [1:05:04<1:01:36, 1.39it/s, loss=0.0130, lr=1.52e-05, step=4869] Training: 49%|████▊ | 4870/10000 [1:05:04<1:01:36, 1.39it/s, loss=0.0235, lr=1.52e-05, step=4870] Training: 49%|████▊ | 4871/10000 [1:05:05<1:05:38, 1.30it/s, loss=0.0235, lr=1.52e-05, step=4870] Training: 49%|████▊ | 4871/10000 [1:05:05<1:05:38, 1.30it/s, loss=0.0085, lr=1.52e-05, step=4871] Training: 49%|████▊ | 4872/10000 [1:05:06<1:04:22, 1.33it/s, loss=0.0085, lr=1.52e-05, step=4871] Training: 49%|████▊ | 4872/10000 [1:05:06<1:04:22, 1.33it/s, loss=0.0057, lr=1.52e-05, step=4872] Training: 49%|████▊ | 4873/10000 [1:05:06<57:26, 1.49it/s, loss=0.0057, lr=1.52e-05, step=4872] Training: 49%|████▊ | 4873/10000 [1:05:06<57:26, 1.49it/s, loss=0.0130, lr=1.52e-05, step=4873] Training: 49%|████▊ | 4874/10000 [1:05:07<52:40, 1.62it/s, loss=0.0130, lr=1.52e-05, step=4873] Training: 49%|████▊ | 4874/10000 [1:05:07<52:40, 1.62it/s, loss=0.0087, lr=1.51e-05, step=4874] Training: 49%|████▉ | 4875/10000 [1:05:07<49:15, 1.73it/s, loss=0.0087, lr=1.51e-05, step=4874] Training: 49%|████▉ | 4875/10000 [1:05:07<49:15, 1.73it/s, loss=0.0044, lr=1.51e-05, step=4875] Training: 49%|████▉ | 4876/10000 [1:05:08<51:54, 1.65it/s, loss=0.0044, lr=1.51e-05, step=4875] Training: 49%|████▉ | 4876/10000 [1:05:08<51:54, 1.65it/s, loss=0.0250, lr=1.51e-05, step=4876] Training: 49%|████▉ | 4877/10000 [1:05:08<52:44, 1.62it/s, loss=0.0250, lr=1.51e-05, step=4876] Training: 49%|████▉ | 4877/10000 [1:05:08<52:44, 1.62it/s, loss=0.0112, lr=1.51e-05, step=4877] Training: 49%|████▉ | 4878/10000 [1:05:09<55:32, 1.54it/s, loss=0.0112, lr=1.51e-05, step=4877] Training: 49%|████▉ | 4878/10000 [1:05:09<55:32, 1.54it/s, loss=0.0044, lr=1.51e-05, step=4878] Training: 49%|████▉ | 4879/10000 [1:05:10<1:01:07, 1.40it/s, loss=0.0044, lr=1.51e-05, step=4878] Training: 49%|████▉ | 4879/10000 [1:05:10<1:01:07, 1.40it/s, loss=0.0155, lr=1.51e-05, step=4879]19:49:42.882 [I] step=4880 loss=0.0089 smoothed_loss=0.0124 lr=1.51e-05 grad_norm=0.4783 step_time=0.5645s data_time=0.0922s it/s=1.523 eta_to_10000=3361.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0205 grad_action_out_proj_arms=0.1801 grad_arm_token_fuse=0.1023 grad_shared_expert=0.5084 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4880/10000 [1:05:11<58:05, 1.47it/s, loss=0.0155, lr=1.51e-05, step=4879] Training: 49%|████▉ | 4880/10000 [1:05:11<58:05, 1.47it/s, loss=0.0089, lr=1.51e-05, step=4880] Training: 49%|████▉ | 4881/10000 [1:05:11<54:30, 1.57it/s, loss=0.0089, lr=1.51e-05, step=4880] Training: 49%|████▉ | 4881/10000 [1:05:11<54:30, 1.57it/s, loss=0.0059, lr=1.51e-05, step=4881] Training: 49%|████▉ | 4882/10000 [1:05:12<51:58, 1.64it/s, loss=0.0059, lr=1.51e-05, step=4881] Training: 49%|████▉ | 4882/10000 [1:05:12<51:58, 1.64it/s, loss=0.0106, lr=1.51e-05, step=4882] Training: 49%|████▉ | 4883/10000 [1:05:12<52:53, 1.61it/s, loss=0.0106, lr=1.51e-05, step=4882] Training: 49%|████▉ | 4883/10000 [1:05:12<52:53, 1.61it/s, loss=0.0226, lr=1.51e-05, step=4883] Training: 49%|████▉ | 4884/10000 [1:05:13<57:58, 1.47it/s, loss=0.0226, lr=1.51e-05, step=4883] Training: 49%|████▉ | 4884/10000 [1:05:13<57:58, 1.47it/s, loss=0.0239, lr=1.51e-05, step=4884] Training: 49%|████▉ | 4885/10000 [1:05:14<1:03:06, 1.35it/s, loss=0.0239, lr=1.51e-05, step=4884] Training: 49%|████▉ | 4885/10000 [1:05:14<1:03:06, 1.35it/s, loss=0.0372, lr=1.51e-05, step=4885] Training: 49%|████▉ | 4886/10000 [1:05:15<1:03:09, 1.35it/s, loss=0.0372, lr=1.51e-05, step=4885] Training: 49%|████▉ | 4886/10000 [1:05:15<1:03:09, 1.35it/s, loss=0.0189, lr=1.51e-05, step=4886] Training: 49%|████▉ | 4887/10000 [1:05:15<1:03:36, 1.34it/s, loss=0.0189, lr=1.51e-05, step=4886] Training: 49%|████▉ | 4887/10000 [1:05:15<1:03:36, 1.34it/s, loss=0.0247, lr=1.51e-05, step=4887] Training: 49%|████▉ | 4888/10000 [1:05:16<1:00:53, 1.40it/s, loss=0.0247, lr=1.51e-05, step=4887] Training: 49%|████▉ | 4888/10000 [1:05:16<1:00:53, 1.40it/s, loss=0.0148, lr=1.51e-05, step=4888] Training: 49%|████▉ | 4889/10000 [1:05:17<1:01:11, 1.39it/s, loss=0.0148, lr=1.51e-05, step=4888] Training: 49%|████▉ | 4889/10000 [1:05:17<1:01:11, 1.39it/s, loss=0.0055, lr=1.51e-05, step=4889]19:49:49.690 [I] step=4890 loss=0.0052 smoothed_loss=0.0148 lr=1.51e-05 grad_norm=0.4450 step_time=0.5534s data_time=0.1273s it/s=1.469 eta_to_10000=3478.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0188 grad_action_out_proj_arms=0.1354 grad_arm_token_fuse=0.0971 grad_shared_expert=0.3906 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4890/10000 [1:05:17<55:53, 1.52it/s, loss=0.0055, lr=1.51e-05, step=4889] Training: 49%|████▉ | 4890/10000 [1:05:17<55:53, 1.52it/s, loss=0.0052, lr=1.51e-05, step=4890] Training: 49%|████▉ | 4891/10000 [1:05:18<51:50, 1.64it/s, loss=0.0052, lr=1.51e-05, step=4890] Training: 49%|████▉ | 4891/10000 [1:05:18<51:50, 1.64it/s, loss=0.0055, lr=1.51e-05, step=4891] Training: 49%|████▉ | 4892/10000 [1:05:19<53:25, 1.59it/s, loss=0.0055, lr=1.51e-05, step=4891] Training: 49%|████▉ | 4892/10000 [1:05:19<53:25, 1.59it/s, loss=0.0287, lr=1.51e-05, step=4892] Training: 49%|████▉ | 4893/10000 [1:05:19<56:56, 1.49it/s, loss=0.0287, lr=1.51e-05, step=4892] Training: 49%|████▉ | 4893/10000 [1:05:19<56:56, 1.49it/s, loss=0.0078, lr=1.51e-05, step=4893] Training: 49%|████▉ | 4894/10000 [1:05:20<53:45, 1.58it/s, loss=0.0078, lr=1.51e-05, step=4893] Training: 49%|████▉ | 4894/10000 [1:05:20<53:45, 1.58it/s, loss=0.0330, lr=1.51e-05, step=4894] Training: 49%|████▉ | 4895/10000 [1:05:20<51:18, 1.66it/s, loss=0.0330, lr=1.51e-05, step=4894] Training: 49%|████▉ | 4895/10000 [1:05:20<51:18, 1.66it/s, loss=0.0243, lr=1.51e-05, step=4895] Training: 49%|████▉ | 4896/10000 [1:05:21<54:23, 1.56it/s, loss=0.0243, lr=1.51e-05, step=4895] Training: 49%|████▉ | 4896/10000 [1:05:21<54:23, 1.56it/s, loss=0.0089, lr=1.51e-05, step=4896] Training: 49%|████▉ | 4897/10000 [1:05:22<59:09, 1.44it/s, loss=0.0089, lr=1.51e-05, step=4896] Training: 49%|████▉ | 4897/10000 [1:05:22<59:09, 1.44it/s, loss=0.0115, lr=1.51e-05, step=4897] Training: 49%|████▉ | 4898/10000 [1:05:23<1:00:30, 1.41it/s, loss=0.0115, lr=1.51e-05, step=4897] Training: 49%|████▉ | 4898/10000 [1:05:23<1:00:30, 1.41it/s, loss=0.0042, lr=1.51e-05, step=4898] Training: 49%|████▉ | 4899/10000 [1:05:23<56:59, 1.49it/s, loss=0.0042, lr=1.51e-05, step=4898] Training: 49%|████▉ | 4899/10000 [1:05:23<56:59, 1.49it/s, loss=0.0084, lr=1.51e-05, step=4899]19:49:56.343 [I] step=4900 loss=0.0120 smoothed_loss=0.0139 lr=1.51e-05 grad_norm=0.4514 step_time=0.5582s data_time=0.1070s it/s=1.503 eta_to_10000=3392.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0948 grad_arm_token_fuse=0.0431 grad_shared_expert=0.3239 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4900/10000 [1:05:24<59:20, 1.43it/s, loss=0.0084, lr=1.51e-05, step=4899] Training: 49%|████▉ | 4900/10000 [1:05:24<59:20, 1.43it/s, loss=0.0120, lr=1.51e-05, step=4900] Training: 49%|████▉ | 4901/10000 [1:05:24<53:51, 1.58it/s, loss=0.0120, lr=1.51e-05, step=4900] Training: 49%|████▉ | 4901/10000 [1:05:24<53:51, 1.58it/s, loss=0.0092, lr=1.50e-05, step=4901] Training: 49%|████▉ | 4902/10000 [1:05:25<53:38, 1.58it/s, loss=0.0092, lr=1.50e-05, step=4901] Training: 49%|████▉ | 4902/10000 [1:05:25<53:38, 1.58it/s, loss=0.0117, lr=1.50e-05, step=4902] Training: 49%|████▉ | 4903/10000 [1:05:26<53:01, 1.60it/s, loss=0.0117, lr=1.50e-05, step=4902] Training: 49%|████▉ | 4903/10000 [1:05:26<53:01, 1.60it/s, loss=0.0179, lr=1.50e-05, step=4903] Training: 49%|████▉ | 4904/10000 [1:05:26<49:38, 1.71it/s, loss=0.0179, lr=1.50e-05, step=4903] Training: 49%|████▉ | 4904/10000 [1:05:26<49:38, 1.71it/s, loss=0.0225, lr=1.50e-05, step=4904] Training: 49%|████▉ | 4905/10000 [1:05:27<47:21, 1.79it/s, loss=0.0225, lr=1.50e-05, step=4904] Training: 49%|████▉ | 4905/10000 [1:05:27<47:21, 1.79it/s, loss=0.0139, lr=1.50e-05, step=4905] Training: 49%|████▉ | 4906/10000 [1:05:27<45:26, 1.87it/s, loss=0.0139, lr=1.50e-05, step=4905] Training: 49%|████▉ | 4906/10000 [1:05:27<45:26, 1.87it/s, loss=0.0119, lr=1.50e-05, step=4906] Training: 49%|████▉ | 4907/10000 [1:05:28<51:03, 1.66it/s, loss=0.0119, lr=1.50e-05, step=4906] Training: 49%|████▉ | 4907/10000 [1:05:28<51:03, 1.66it/s, loss=0.0228, lr=1.50e-05, step=4907] Training: 49%|████▉ | 4908/10000 [1:05:28<48:17, 1.76it/s, loss=0.0228, lr=1.50e-05, step=4907] Training: 49%|████▉ | 4908/10000 [1:05:28<48:17, 1.76it/s, loss=0.0175, lr=1.50e-05, step=4908] Training: 49%|████▉ | 4909/10000 [1:05:29<46:38, 1.82it/s, loss=0.0175, lr=1.50e-05, step=4908] Training: 49%|████▉ | 4909/10000 [1:05:29<46:38, 1.82it/s, loss=0.0079, lr=1.50e-05, step=4909]19:50:01.813 [I] step=4910 loss=0.0152 smoothed_loss=0.0147 lr=1.50e-05 grad_norm=0.4941 step_time=0.4765s data_time=0.0705s it/s=1.828 eta_to_10000=2783.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0220 grad_action_out_proj_arms=0.2131 grad_arm_token_fuse=0.1207 grad_shared_expert=0.5698 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4910/10000 [1:05:29<46:07, 1.84it/s, loss=0.0079, lr=1.50e-05, step=4909] Training: 49%|████▉ | 4910/10000 [1:05:29<46:07, 1.84it/s, loss=0.0152, lr=1.50e-05, step=4910] Training: 49%|████▉ | 4911/10000 [1:05:30<44:44, 1.90it/s, loss=0.0152, lr=1.50e-05, step=4910] Training: 49%|████▉ | 4911/10000 [1:05:30<44:44, 1.90it/s, loss=0.0251, lr=1.50e-05, step=4911] Training: 49%|████▉ | 4912/10000 [1:05:30<43:39, 1.94it/s, loss=0.0251, lr=1.50e-05, step=4911] Training: 49%|████▉ | 4912/10000 [1:05:30<43:39, 1.94it/s, loss=0.0514, lr=1.50e-05, step=4912] Training: 49%|████▉ | 4913/10000 [1:05:31<52:08, 1.63it/s, loss=0.0514, lr=1.50e-05, step=4912] Training: 49%|████▉ | 4913/10000 [1:05:31<52:08, 1.63it/s, loss=0.0023, lr=1.50e-05, step=4913] Training: 49%|████▉ | 4914/10000 [1:05:32<57:17, 1.48it/s, loss=0.0023, lr=1.50e-05, step=4913] Training: 49%|████▉ | 4914/10000 [1:05:32<57:17, 1.48it/s, loss=0.0059, lr=1.50e-05, step=4914] Training: 49%|████▉ | 4915/10000 [1:05:33<57:58, 1.46it/s, loss=0.0059, lr=1.50e-05, step=4914] Training: 49%|████▉ | 4915/10000 [1:05:33<57:58, 1.46it/s, loss=0.0071, lr=1.50e-05, step=4915] Training: 49%|████▉ | 4916/10000 [1:05:33<52:34, 1.61it/s, loss=0.0071, lr=1.50e-05, step=4915] Training: 49%|████▉ | 4916/10000 [1:05:33<52:34, 1.61it/s, loss=0.0120, lr=1.50e-05, step=4916] Training: 49%|████▉ | 4917/10000 [1:05:34<49:34, 1.71it/s, loss=0.0120, lr=1.50e-05, step=4916] Training: 49%|████▉ | 4917/10000 [1:05:34<49:34, 1.71it/s, loss=0.0144, lr=1.50e-05, step=4917] Training: 49%|████▉ | 4918/10000 [1:05:34<46:37, 1.82it/s, loss=0.0144, lr=1.50e-05, step=4917] Training: 49%|████▉ | 4918/10000 [1:05:34<46:37, 1.82it/s, loss=0.0038, lr=1.50e-05, step=4918] Training: 49%|████▉ | 4919/10000 [1:05:35<50:55, 1.66it/s, loss=0.0038, lr=1.50e-05, step=4918] Training: 49%|████▉ | 4919/10000 [1:05:35<50:55, 1.66it/s, loss=0.0072, lr=1.50e-05, step=4919]19:50:07.839 [I] step=4920 loss=0.0155 smoothed_loss=0.0135 lr=1.50e-05 grad_norm=0.5434 step_time=0.5088s data_time=0.0937s it/s=1.660 eta_to_10000=3060.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0174 grad_action_out_proj_arms=0.1656 grad_arm_token_fuse=0.0991 grad_shared_expert=0.5532 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4920/10000 [1:05:36<48:45, 1.74it/s, loss=0.0072, lr=1.50e-05, step=4919] Training: 49%|████▉ | 4920/10000 [1:05:36<48:45, 1.74it/s, loss=0.0155, lr=1.50e-05, step=4920] Training: 49%|████▉ | 4921/10000 [1:05:36<52:26, 1.61it/s, loss=0.0155, lr=1.50e-05, step=4920] Training: 49%|████▉ | 4921/10000 [1:05:36<52:26, 1.61it/s, loss=0.0123, lr=1.50e-05, step=4921] Training: 49%|████▉ | 4922/10000 [1:05:37<59:55, 1.41it/s, loss=0.0123, lr=1.50e-05, step=4921] Training: 49%|████▉ | 4922/10000 [1:05:37<59:55, 1.41it/s, loss=0.0100, lr=1.50e-05, step=4922] Training: 49%|████▉ | 4923/10000 [1:05:38<54:25, 1.55it/s, loss=0.0100, lr=1.50e-05, step=4922] Training: 49%|████▉ | 4923/10000 [1:05:38<54:25, 1.55it/s, loss=0.0202, lr=1.50e-05, step=4923] Training: 49%|████▉ | 4924/10000 [1:05:38<50:35, 1.67it/s, loss=0.0202, lr=1.50e-05, step=4923] Training: 49%|████▉ | 4924/10000 [1:05:38<50:35, 1.67it/s, loss=0.0088, lr=1.50e-05, step=4924] Training: 49%|████▉ | 4925/10000 [1:05:39<50:43, 1.67it/s, loss=0.0088, lr=1.50e-05, step=4924] Training: 49%|████▉ | 4925/10000 [1:05:39<50:43, 1.67it/s, loss=0.0069, lr=1.50e-05, step=4925] Training: 49%|████▉ | 4926/10000 [1:05:39<48:16, 1.75it/s, loss=0.0069, lr=1.50e-05, step=4925] Training: 49%|████▉ | 4926/10000 [1:05:39<48:16, 1.75it/s, loss=0.0069, lr=1.50e-05, step=4926] Training: 49%|████▉ | 4927/10000 [1:05:40<50:13, 1.68it/s, loss=0.0069, lr=1.50e-05, step=4926] Training: 49%|████▉ | 4927/10000 [1:05:40<50:13, 1.68it/s, loss=0.0160, lr=1.50e-05, step=4927] Training: 49%|████▉ | 4928/10000 [1:05:41<52:38, 1.61it/s, loss=0.0160, lr=1.50e-05, step=4927] Training: 49%|████▉ | 4928/10000 [1:05:41<52:38, 1.61it/s, loss=0.0262, lr=1.49e-05, step=4928] Training: 49%|████▉ | 4929/10000 [1:05:42<1:02:51, 1.34it/s, loss=0.0262, lr=1.49e-05, step=4928] Training: 49%|████▉ | 4929/10000 [1:05:42<1:02:51, 1.34it/s, loss=0.0057, lr=1.49e-05, step=4929]19:50:14.524 [I] step=4930 loss=0.0122 smoothed_loss=0.0129 lr=1.50e-05 grad_norm=0.4820 step_time=0.5506s data_time=0.1179s it/s=1.496 eta_to_10000=3389.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0235 grad_action_out_proj_arms=0.1188 grad_arm_token_fuse=0.1222 grad_shared_expert=0.4422 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4930/10000 [1:05:42<59:02, 1.43it/s, loss=0.0057, lr=1.49e-05, step=4929] Training: 49%|████▉ | 4930/10000 [1:05:42<59:02, 1.43it/s, loss=0.0122, lr=1.49e-05, step=4930] Training: 49%|████▉ | 4931/10000 [1:05:43<53:44, 1.57it/s, loss=0.0122, lr=1.49e-05, step=4930] Training: 49%|████▉ | 4931/10000 [1:05:43<53:44, 1.57it/s, loss=0.0424, lr=1.49e-05, step=4931] Training: 49%|████▉ | 4932/10000 [1:05:43<49:32, 1.70it/s, loss=0.0424, lr=1.49e-05, step=4931] Training: 49%|████▉ | 4932/10000 [1:05:43<49:32, 1.70it/s, loss=0.0073, lr=1.49e-05, step=4932] Training: 49%|████▉ | 4933/10000 [1:05:44<54:27, 1.55it/s, loss=0.0073, lr=1.49e-05, step=4932] Training: 49%|████▉ | 4933/10000 [1:05:44<54:27, 1.55it/s, loss=0.0089, lr=1.49e-05, step=4933] Training: 49%|████▉ | 4934/10000 [1:05:45<53:59, 1.56it/s, loss=0.0089, lr=1.49e-05, step=4933] Training: 49%|████▉ | 4934/10000 [1:05:45<53:59, 1.56it/s, loss=0.0099, lr=1.49e-05, step=4934] Training: 49%|████▉ | 4935/10000 [1:05:45<54:01, 1.56it/s, loss=0.0099, lr=1.49e-05, step=4934] Training: 49%|████▉ | 4935/10000 [1:05:45<54:01, 1.56it/s, loss=0.0264, lr=1.49e-05, step=4935] Training: 49%|████▉ | 4936/10000 [1:05:46<56:44, 1.49it/s, loss=0.0264, lr=1.49e-05, step=4935] Training: 49%|████▉ | 4936/10000 [1:05:46<56:44, 1.49it/s, loss=0.0140, lr=1.49e-05, step=4936] Training: 49%|████▉ | 4937/10000 [1:05:46<51:47, 1.63it/s, loss=0.0140, lr=1.49e-05, step=4936] Training: 49%|████▉ | 4937/10000 [1:05:46<51:47, 1.63it/s, loss=0.0262, lr=1.49e-05, step=4937] Training: 49%|████▉ | 4938/10000 [1:05:47<49:12, 1.71it/s, loss=0.0262, lr=1.49e-05, step=4937] Training: 49%|████▉ | 4938/10000 [1:05:47<49:12, 1.71it/s, loss=0.0050, lr=1.49e-05, step=4938] Training: 49%|████▉ | 4939/10000 [1:05:47<48:26, 1.74it/s, loss=0.0050, lr=1.49e-05, step=4938] Training: 49%|████▉ | 4939/10000 [1:05:47<48:26, 1.74it/s, loss=0.0473, lr=1.49e-05, step=4939]19:50:20.489 [I] step=4940 loss=0.0063 smoothed_loss=0.0171 lr=1.49e-05 grad_norm=0.4619 step_time=0.5009s data_time=0.0956s it/s=1.677 eta_to_10000=3017.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0158 grad_action_out_proj_arms=0.1212 grad_arm_token_fuse=0.0808 grad_shared_expert=0.3971 (18633:train_pytorch.py:850) + Training: 49%|████▉ | 4940/10000 [1:05:48<50:43, 1.66it/s, loss=0.0473, lr=1.49e-05, step=4939] Training: 49%|████▉ | 4940/10000 [1:05:48<50:43, 1.66it/s, loss=0.0063, lr=1.49e-05, step=4940] Training: 49%|████▉ | 4941/10000 [1:05:49<49:23, 1.71it/s, loss=0.0063, lr=1.49e-05, step=4940] Training: 49%|████▉ | 4941/10000 [1:05:49<49:23, 1.71it/s, loss=0.0031, lr=1.49e-05, step=4941] Training: 49%|████▉ | 4942/10000 [1:05:49<46:40, 1.81it/s, loss=0.0031, lr=1.49e-05, step=4941] Training: 49%|████▉ | 4942/10000 [1:05:49<46:40, 1.81it/s, loss=0.0043, lr=1.49e-05, step=4942] Training: 49%|████▉ | 4943/10000 [1:05:50<52:38, 1.60it/s, loss=0.0043, lr=1.49e-05, step=4942] Training: 49%|████▉ | 4943/10000 [1:05:50<52:38, 1.60it/s, loss=0.0100, lr=1.49e-05, step=4943] Training: 49%|████▉ | 4944/10000 [1:05:50<49:53, 1.69it/s, loss=0.0100, lr=1.49e-05, step=4943] Training: 49%|████▉ | 4944/10000 [1:05:50<49:53, 1.69it/s, loss=0.0961, lr=1.49e-05, step=4944] Training: 49%|████▉ | 4945/10000 [1:05:51<47:09, 1.79it/s, loss=0.0961, lr=1.49e-05, step=4944] Training: 49%|████▉ | 4945/10000 [1:05:51<47:09, 1.79it/s, loss=0.0047, lr=1.49e-05, step=4945] Training: 49%|████▉ | 4946/10000 [1:05:51<45:28, 1.85it/s, loss=0.0047, lr=1.49e-05, step=4945] Training: 49%|████▉ | 4946/10000 [1:05:51<45:28, 1.85it/s, loss=0.0169, lr=1.49e-05, step=4946] Training: 49%|████▉ | 4947/10000 [1:05:52<44:53, 1.88it/s, loss=0.0169, lr=1.49e-05, step=4946] Training: 49%|████▉ | 4947/10000 [1:05:52<44:53, 1.88it/s, loss=0.0042, lr=1.49e-05, step=4947] Training: 49%|████▉ | 4948/10000 [1:05:53<48:44, 1.73it/s, loss=0.0042, lr=1.49e-05, step=4947] Training: 49%|████▉ | 4948/10000 [1:05:53<48:44, 1.73it/s, loss=0.0033, lr=1.49e-05, step=4948] Training: 49%|████▉ | 4949/10000 [1:05:53<46:29, 1.81it/s, loss=0.0033, lr=1.49e-05, step=4948] Training: 49%|████▉ | 4949/10000 [1:05:53<46:29, 1.81it/s, loss=0.0093, lr=1.49e-05, step=4949]19:50:26.255 [I] step=4950 loss=0.0098 smoothed_loss=0.0156 lr=1.49e-05 grad_norm=0.4522 step_time=0.5022s data_time=0.0744s it/s=1.735 eta_to_10000=2911.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0340 grad_action_out_proj_arms=0.2454 grad_arm_token_fuse=0.1684 grad_shared_expert=0.6480 (18633:train_pytorch.py:850) + Training: 50%|████▉ | 4950/10000 [1:05:54<51:45, 1.63it/s, loss=0.0093, lr=1.49e-05, step=4949] Training: 50%|████▉ | 4950/10000 [1:05:54<51:45, 1.63it/s, loss=0.0098, lr=1.49e-05, step=4950] Training: 50%|████▉ | 4951/10000 [1:05:54<48:31, 1.73it/s, loss=0.0098, lr=1.49e-05, step=4950] Training: 50%|████▉ | 4951/10000 [1:05:54<48:31, 1.73it/s, loss=0.0094, lr=1.49e-05, step=4951] Training: 50%|████▉ | 4952/10000 [1:05:55<46:47, 1.80it/s, loss=0.0094, lr=1.49e-05, step=4951] Training: 50%|████▉ | 4952/10000 [1:05:55<46:47, 1.80it/s, loss=0.0065, lr=1.49e-05, step=4952] Training: 50%|████▉ | 4953/10000 [1:05:55<45:16, 1.86it/s, loss=0.0065, lr=1.49e-05, step=4952] Training: 50%|████▉ | 4953/10000 [1:05:55<45:16, 1.86it/s, loss=0.0054, lr=1.49e-05, step=4953] Training: 50%|████▉ | 4954/10000 [1:05:56<44:17, 1.90it/s, loss=0.0054, lr=1.49e-05, step=4953] Training: 50%|████▉ | 4954/10000 [1:05:56<44:17, 1.90it/s, loss=0.0226, lr=1.49e-05, step=4954] Training: 50%|████▉ | 4955/10000 [1:05:56<43:33, 1.93it/s, loss=0.0226, lr=1.49e-05, step=4954] Training: 50%|████▉ | 4955/10000 [1:05:56<43:33, 1.93it/s, loss=0.0143, lr=1.48e-05, step=4955] Training: 50%|████▉ | 4956/10000 [1:05:57<47:20, 1.78it/s, loss=0.0143, lr=1.48e-05, step=4955] Training: 50%|████▉ | 4956/10000 [1:05:57<47:20, 1.78it/s, loss=0.0089, lr=1.48e-05, step=4956] Training: 50%|████▉ | 4957/10000 [1:05:58<54:27, 1.54it/s, loss=0.0089, lr=1.48e-05, step=4956] Training: 50%|████▉ | 4957/10000 [1:05:58<54:27, 1.54it/s, loss=0.0105, lr=1.48e-05, step=4957] Training: 50%|████▉ | 4958/10000 [1:05:59<54:26, 1.54it/s, loss=0.0105, lr=1.48e-05, step=4957] Training: 50%|████▉ | 4958/10000 [1:05:59<54:26, 1.54it/s, loss=0.0051, lr=1.48e-05, step=4958] Training: 50%|████▉ | 4959/10000 [1:05:59<50:57, 1.65it/s, loss=0.0051, lr=1.48e-05, step=4958] Training: 50%|████▉ | 4959/10000 [1:05:59<50:57, 1.65it/s, loss=0.0144, lr=1.48e-05, step=4959]19:50:31.963 [I] step=4960 loss=0.0479 smoothed_loss=0.0162 lr=1.48e-05 grad_norm=0.4660 step_time=0.4979s data_time=0.0729s it/s=1.752 eta_to_10000=2876.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0227 grad_action_out_proj_arms=0.1413 grad_arm_token_fuse=0.1231 grad_shared_expert=0.3280 (18633:train_pytorch.py:850) + Training: 50%|████▉ | 4960/10000 [1:06:00<49:27, 1.70it/s, loss=0.0144, lr=1.48e-05, step=4959] Training: 50%|████▉ | 4960/10000 [1:06:00<49:27, 1.70it/s, loss=0.0479, lr=1.48e-05, step=4960] Training: 50%|████▉ | 4961/10000 [1:06:00<50:06, 1.68it/s, loss=0.0479, lr=1.48e-05, step=4960] Training: 50%|████▉ | 4961/10000 [1:06:00<50:06, 1.68it/s, loss=0.0087, lr=1.48e-05, step=4961] Training: 50%|████▉ | 4962/10000 [1:06:01<48:51, 1.72it/s, loss=0.0087, lr=1.48e-05, step=4961] Training: 50%|████▉ | 4962/10000 [1:06:01<48:51, 1.72it/s, loss=0.0374, lr=1.48e-05, step=4962] Training: 50%|████▉ | 4963/10000 [1:06:01<46:54, 1.79it/s, loss=0.0374, lr=1.48e-05, step=4962] Training: 50%|████▉ | 4963/10000 [1:06:01<46:54, 1.79it/s, loss=0.0026, lr=1.48e-05, step=4963] Training: 50%|████▉ | 4964/10000 [1:06:02<49:33, 1.69it/s, loss=0.0026, lr=1.48e-05, step=4963] Training: 50%|████▉ | 4964/10000 [1:06:02<49:33, 1.69it/s, loss=0.0095, lr=1.48e-05, step=4964] Training: 50%|████▉ | 4965/10000 [1:06:03<53:58, 1.55it/s, loss=0.0095, lr=1.48e-05, step=4964] Training: 50%|████▉ | 4965/10000 [1:06:03<53:58, 1.55it/s, loss=0.0044, lr=1.48e-05, step=4965] Training: 50%|████▉ | 4966/10000 [1:06:03<49:52, 1.68it/s, loss=0.0044, lr=1.48e-05, step=4965] Training: 50%|████▉ | 4966/10000 [1:06:03<49:52, 1.68it/s, loss=0.0078, lr=1.48e-05, step=4966] Training: 50%|████▉ | 4967/10000 [1:06:04<48:20, 1.74it/s, loss=0.0078, lr=1.48e-05, step=4966] Training: 50%|████▉ | 4967/10000 [1:06:04<48:20, 1.74it/s, loss=0.0295, lr=1.48e-05, step=4967] Training: 50%|████▉ | 4968/10000 [1:06:04<47:16, 1.77it/s, loss=0.0295, lr=1.48e-05, step=4967] Training: 50%|████▉ | 4968/10000 [1:06:04<47:16, 1.77it/s, loss=0.0026, lr=1.48e-05, step=4968] Training: 50%|████▉ | 4969/10000 [1:06:05<46:02, 1.82it/s, loss=0.0026, lr=1.48e-05, step=4968] Training: 50%|████▉ | 4969/10000 [1:06:05<46:02, 1.82it/s, loss=0.0037, lr=1.48e-05, step=4969]19:50:37.652 [I] step=4970 loss=0.0122 smoothed_loss=0.0129 lr=1.48e-05 grad_norm=0.3873 step_time=0.4916s data_time=0.0773s it/s=1.758 eta_to_10000=2860.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.1030 grad_arm_token_fuse=0.0336 grad_shared_expert=0.2945 (18633:train_pytorch.py:850) + Training: 50%|████▉ | 4970/10000 [1:06:05<45:27, 1.84it/s, loss=0.0037, lr=1.48e-05, step=4969] Training: 50%|████▉ | 4970/10000 [1:06:05<45:27, 1.84it/s, loss=0.0122, lr=1.48e-05, step=4970] Training: 50%|████▉ | 4971/10000 [1:06:06<49:42, 1.69it/s, loss=0.0122, lr=1.48e-05, step=4970] Training: 50%|████▉ | 4971/10000 [1:06:06<49:42, 1.69it/s, loss=0.0481, lr=1.48e-05, step=4971] Training: 50%|████▉ | 4972/10000 [1:06:07<52:28, 1.60it/s, loss=0.0481, lr=1.48e-05, step=4971] Training: 50%|████▉ | 4972/10000 [1:06:07<52:28, 1.60it/s, loss=0.0028, lr=1.48e-05, step=4972] Training: 50%|████▉ | 4973/10000 [1:06:07<52:53, 1.58it/s, loss=0.0028, lr=1.48e-05, step=4972] Training: 50%|████▉ | 4973/10000 [1:06:07<52:53, 1.58it/s, loss=0.0284, lr=1.48e-05, step=4973] Training: 50%|████▉ | 4974/10000 [1:06:08<49:07, 1.70it/s, loss=0.0284, lr=1.48e-05, step=4973] Training: 50%|████▉ | 4974/10000 [1:06:08<49:07, 1.70it/s, loss=0.0098, lr=1.48e-05, step=4974] Training: 50%|████▉ | 4975/10000 [1:06:08<46:53, 1.79it/s, loss=0.0098, lr=1.48e-05, step=4974] Training: 50%|████▉ | 4975/10000 [1:06:08<46:53, 1.79it/s, loss=0.0123, lr=1.48e-05, step=4975] Training: 50%|████▉ | 4976/10000 [1:06:09<45:49, 1.83it/s, loss=0.0123, lr=1.48e-05, step=4975] Training: 50%|████▉ | 4976/10000 [1:06:09<45:49, 1.83it/s, loss=0.0108, lr=1.48e-05, step=4976] Training: 50%|████▉ | 4977/10000 [1:06:10<49:42, 1.68it/s, loss=0.0108, lr=1.48e-05, step=4976] Training: 50%|████▉ | 4977/10000 [1:06:10<49:42, 1.68it/s, loss=0.0076, lr=1.48e-05, step=4977] Training: 50%|████▉ | 4978/10000 [1:06:10<52:43, 1.59it/s, loss=0.0076, lr=1.48e-05, step=4977] Training: 50%|████▉ | 4978/10000 [1:06:10<52:43, 1.59it/s, loss=0.0090, lr=1.48e-05, step=4978] Training: 50%|████▉ | 4979/10000 [1:06:11<55:22, 1.51it/s, loss=0.0090, lr=1.48e-05, step=4978] Training: 50%|████▉ | 4979/10000 [1:06:11<55:22, 1.51it/s, loss=0.0053, lr=1.48e-05, step=4979]19:50:44.320 [I] step=4980 loss=0.0261 smoothed_loss=0.0142 lr=1.48e-05 grad_norm=0.4363 step_time=0.5798s data_time=0.0869s it/s=1.500 eta_to_10000=3346.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0158 grad_action_out_proj_arms=0.1824 grad_arm_token_fuse=0.0826 grad_shared_expert=0.5360 (18633:train_pytorch.py:850) + Training: 50%|████▉ | 4980/10000 [1:06:12<1:02:49, 1.33it/s, loss=0.0053, lr=1.48e-05, step=4979] Training: 50%|████▉ | 4980/10000 [1:06:12<1:02:49, 1.33it/s, loss=0.0261, lr=1.48e-05, step=4980] Training: 50%|████▉ | 4981/10000 [1:06:12<56:26, 1.48it/s, loss=0.0261, lr=1.48e-05, step=4980] Training: 50%|████▉ | 4981/10000 [1:06:12<56:26, 1.48it/s, loss=0.2125, lr=1.48e-05, step=4981] Training: 50%|████▉ | 4982/10000 [1:06:13<51:31, 1.62it/s, loss=0.2125, lr=1.48e-05, step=4981] Training: 50%|████▉ | 4982/10000 [1:06:13<51:31, 1.62it/s, loss=0.0074, lr=1.47e-05, step=4982] Training: 50%|████▉ | 4983/10000 [1:06:13<48:59, 1.71it/s, loss=0.0074, lr=1.47e-05, step=4982] Training: 50%|████▉ | 4983/10000 [1:06:13<48:59, 1.71it/s, loss=0.0259, lr=1.47e-05, step=4983] Training: 50%|████▉ | 4984/10000 [1:06:14<46:32, 1.80it/s, loss=0.0259, lr=1.47e-05, step=4983] Training: 50%|████▉ | 4984/10000 [1:06:14<46:32, 1.80it/s, loss=0.0016, lr=1.47e-05, step=4984] Training: 50%|████▉ | 4985/10000 [1:06:14<44:25, 1.88it/s, loss=0.0016, lr=1.47e-05, step=4984] Training: 50%|████▉ | 4985/10000 [1:06:14<44:25, 1.88it/s, loss=0.0180, lr=1.47e-05, step=4985] Training: 50%|████▉ | 4986/10000 [1:06:15<52:43, 1.59it/s, loss=0.0180, lr=1.47e-05, step=4985] Training: 50%|████▉ | 4986/10000 [1:06:15<52:43, 1.59it/s, loss=0.0197, lr=1.47e-05, step=4986] Training: 50%|████▉ | 4987/10000 [1:06:16<48:57, 1.71it/s, loss=0.0197, lr=1.47e-05, step=4986] Training: 50%|████▉ | 4987/10000 [1:06:16<48:57, 1.71it/s, loss=0.0123, lr=1.47e-05, step=4987] Training: 50%|████▉ | 4988/10000 [1:06:17<55:50, 1.50it/s, loss=0.0123, lr=1.47e-05, step=4987] Training: 50%|████▉ | 4988/10000 [1:06:17<55:50, 1.50it/s, loss=0.0173, lr=1.47e-05, step=4988] Training: 50%|████▉ | 4989/10000 [1:06:17<51:14, 1.63it/s, loss=0.0173, lr=1.47e-05, step=4988] Training: 50%|████▉ | 4989/10000 [1:06:17<51:14, 1.63it/s, loss=0.0141, lr=1.47e-05, step=4989]19:50:49.974 [I] step=4990 loss=0.0033 smoothed_loss=0.0211 lr=1.47e-05 grad_norm=0.5321 step_time=0.4798s data_time=0.0856s it/s=1.769 eta_to_10000=2832.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0146 grad_action_out_proj_arms=0.1254 grad_arm_token_fuse=0.0779 grad_shared_expert=0.3586 (18633:train_pytorch.py:850) + Training: 50%|████▉ | 4990/10000 [1:06:18<48:39, 1.72it/s, loss=0.0141, lr=1.47e-05, step=4989] Training: 50%|████▉ | 4990/10000 [1:06:18<48:39, 1.72it/s, loss=0.0033, lr=1.47e-05, step=4990] Training: 50%|████▉ | 4991/10000 [1:06:18<46:40, 1.79it/s, loss=0.0033, lr=1.47e-05, step=4990] Training: 50%|████▉ | 4991/10000 [1:06:18<46:40, 1.79it/s, loss=0.0154, lr=1.47e-05, step=4991] Training: 50%|████▉ | 4992/10000 [1:06:19<45:33, 1.83it/s, loss=0.0154, lr=1.47e-05, step=4991] Training: 50%|████▉ | 4992/10000 [1:06:19<45:33, 1.83it/s, loss=0.0411, lr=1.47e-05, step=4992] Training: 50%|████▉ | 4993/10000 [1:06:19<50:20, 1.66it/s, loss=0.0411, lr=1.47e-05, step=4992] Training: 50%|████▉ | 4993/10000 [1:06:19<50:20, 1.66it/s, loss=0.0044, lr=1.47e-05, step=4993] Training: 50%|████▉ | 4994/10000 [1:06:20<47:44, 1.75it/s, loss=0.0044, lr=1.47e-05, step=4993] Training: 50%|████▉ | 4994/10000 [1:06:20<47:44, 1.75it/s, loss=0.0117, lr=1.47e-05, step=4994] Training: 50%|████▉ | 4995/10000 [1:06:21<49:25, 1.69it/s, loss=0.0117, lr=1.47e-05, step=4994] Training: 50%|████▉ | 4995/10000 [1:06:21<49:25, 1.69it/s, loss=0.0103, lr=1.47e-05, step=4995] Training: 50%|████▉ | 4996/10000 [1:06:21<49:18, 1.69it/s, loss=0.0103, lr=1.47e-05, step=4995] Training: 50%|████▉ | 4996/10000 [1:06:21<49:18, 1.69it/s, loss=0.0051, lr=1.47e-05, step=4996] Training: 50%|████▉ | 4997/10000 [1:06:22<47:13, 1.77it/s, loss=0.0051, lr=1.47e-05, step=4996] Training: 50%|████▉ | 4997/10000 [1:06:22<47:13, 1.77it/s, loss=0.0191, lr=1.47e-05, step=4997] Training: 50%|████▉ | 4998/10000 [1:06:22<44:56, 1.85it/s, loss=0.0191, lr=1.47e-05, step=4997] Training: 50%|████▉ | 4998/10000 [1:06:22<44:56, 1.85it/s, loss=0.0037, lr=1.47e-05, step=4998] Training: 50%|████▉ | 4999/10000 [1:06:23<44:33, 1.87it/s, loss=0.0037, lr=1.47e-05, step=4998] Training: 50%|████▉ | 4999/10000 [1:06:23<44:33, 1.87it/s, loss=0.0249, lr=1.47e-05, step=4999]19:50:55.815 [I] step=5000 loss=0.0043 smoothed_loss=0.0159 lr=1.47e-05 grad_norm=0.4850 step_time=0.5183s data_time=0.0658s it/s=1.712 eta_to_10000=2920.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0105 grad_action_out_proj_arms=0.1454 grad_arm_token_fuse=0.0568 grad_shared_expert=1.0533 (18633:train_pytorch.py:850) +19:52:11.616 [I] Saved checkpoint at step 5000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000 (18633:train_pytorch.py:350) + Training: 50%|█████ | 5000/10000 [1:07:39<32:27:27, 23.37s/it, loss=0.0249, lr=1.47e-05, step=4999] Training: 50%|█████ | 5000/10000 [1:07:39<32:27:27, 23.37s/it, loss=0.0043, lr=1.47e-05, step=5000] Training: 50%|█████ | 5001/10000 [1:07:40<22:58:38, 16.55s/it, loss=0.0043, lr=1.47e-05, step=5000] Training: 50%|█████ | 5001/10000 [1:07:40<22:58:38, 16.55s/it, loss=0.0186, lr=1.47e-05, step=5001] Training: 50%|█████ | 5002/10000 [1:07:41<16:21:13, 11.78s/it, loss=0.0186, lr=1.47e-05, step=5001] Training: 50%|█████ | 5002/10000 [1:07:41<16:21:13, 11.78s/it, loss=0.0033, lr=1.47e-05, step=5002] Training: 50%|█████ | 5003/10000 [1:07:41<11:46:15, 8.48s/it, loss=0.0033, lr=1.47e-05, step=5002] Training: 50%|█████ | 5003/10000 [1:07:41<11:46:15, 8.48s/it, loss=0.0166, lr=1.47e-05, step=5003] Training: 50%|█████ | 5004/10000 [1:07:42<8:26:33, 6.08s/it, loss=0.0166, lr=1.47e-05, step=5003] Training: 50%|█████ | 5004/10000 [1:07:42<8:26:33, 6.08s/it, loss=0.0128, lr=1.47e-05, step=5004] Training: 50%|█████ | 5005/10000 [1:07:42<6:08:05, 4.42s/it, loss=0.0128, lr=1.47e-05, step=5004] Training: 50%|█████ | 5005/10000 [1:07:42<6:08:05, 4.42s/it, loss=0.0032, lr=1.47e-05, step=5005] Training: 50%|█████ | 5006/10000 [1:07:43<4:30:38, 3.25s/it, loss=0.0032, lr=1.47e-05, step=5005] Training: 50%|█████ | 5006/10000 [1:07:43<4:30:38, 3.25s/it, loss=0.0090, lr=1.47e-05, step=5006] Training: 50%|█████ | 5007/10000 [1:07:44<3:30:46, 2.53s/it, loss=0.0090, lr=1.47e-05, step=5006] Training: 50%|█████ | 5007/10000 [1:07:44<3:30:46, 2.53s/it, loss=0.0358, lr=1.47e-05, step=5007] Training: 50%|█████ | 5008/10000 [1:07:44<2:39:53, 1.92s/it, loss=0.0358, lr=1.47e-05, step=5007] Training: 50%|█████ | 5008/10000 [1:07:44<2:39:53, 1.92s/it, loss=0.0094, lr=1.47e-05, step=5008] Training: 50%|█████ | 5009/10000 [1:07:45<2:08:18, 1.54s/it, loss=0.0094, lr=1.47e-05, step=5008] Training: 50%|█████ | 5009/10000 [1:07:45<2:08:18, 1.54s/it, loss=0.0042, lr=1.46e-05, step=5009]19:52:17.774 [I] step=5010 loss=0.0166 smoothed_loss=0.0141 lr=1.47e-05 grad_norm=0.4803 step_time=0.5275s data_time=7.6684s it/s=0.122 eta_to_10000=40896.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1185 grad_arm_token_fuse=0.0891 grad_shared_expert=0.3338 (18633:train_pytorch.py:850) + Training: 50%|█████ | 5010/10000 [1:07:45<1:42:55, 1.24s/it, loss=0.0042, lr=1.46e-05, step=5009] Training: 50%|█████ | 5010/10000 [1:07:45<1:42:55, 1.24s/it, loss=0.0166, lr=1.46e-05, step=5010] Training: 50%|█████ | 5011/10000 [1:07:46<1:24:06, 1.01s/it, loss=0.0166, lr=1.46e-05, step=5010] Training: 50%|█████ | 5011/10000 [1:07:46<1:24:06, 1.01s/it, loss=0.0023, lr=1.46e-05, step=5011] Training: 50%|█████ | 5012/10000 [1:07:46<1:10:50, 1.17it/s, loss=0.0023, lr=1.46e-05, step=5011] Training: 50%|█████ | 5012/10000 [1:07:46<1:10:50, 1.17it/s, loss=0.0037, lr=1.46e-05, step=5012] Training: 50%|█████ | 5013/10000 [1:07:47<1:01:35, 1.35it/s, loss=0.0037, lr=1.46e-05, step=5012] Training: 50%|█████ | 5013/10000 [1:07:47<1:01:35, 1.35it/s, loss=0.0263, lr=1.46e-05, step=5013] Training: 50%|█████ | 5014/10000 [1:07:48<59:27, 1.40it/s, loss=0.0263, lr=1.46e-05, step=5013] Training: 50%|█████ | 5014/10000 [1:07:48<59:27, 1.40it/s, loss=0.0032, lr=1.46e-05, step=5014] Training: 50%|█████ | 5015/10000 [1:07:48<1:00:12, 1.38it/s, loss=0.0032, lr=1.46e-05, step=5014] Training: 50%|█████ | 5015/10000 [1:07:48<1:00:12, 1.38it/s, loss=0.0080, lr=1.46e-05, step=5015] Training: 50%|█████ | 5016/10000 [1:07:49<55:38, 1.49it/s, loss=0.0080, lr=1.46e-05, step=5015] Training: 50%|█████ | 5016/10000 [1:07:49<55:38, 1.49it/s, loss=0.0044, lr=1.46e-05, step=5016] Training: 50%|█████ | 5017/10000 [1:07:49<55:19, 1.50it/s, loss=0.0044, lr=1.46e-05, step=5016] Training: 50%|█████ | 5017/10000 [1:07:49<55:19, 1.50it/s, loss=0.0070, lr=1.46e-05, step=5017] Training: 50%|█████ | 5018/10000 [1:07:50<50:29, 1.64it/s, loss=0.0070, lr=1.46e-05, step=5017] Training: 50%|█████ | 5018/10000 [1:07:50<50:29, 1.64it/s, loss=0.0033, lr=1.46e-05, step=5018] Training: 50%|█████ | 5019/10000 [1:07:50<47:53, 1.73it/s, loss=0.0033, lr=1.46e-05, step=5018] Training: 50%|█████ | 5019/10000 [1:07:50<47:53, 1.73it/s, loss=0.0165, lr=1.46e-05, step=5019]19:52:23.323 [I] step=5020 loss=0.0053 smoothed_loss=0.0101 lr=1.46e-05 grad_norm=0.4017 step_time=0.4926s data_time=0.0623s it/s=1.803 eta_to_10000=2762.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1345 grad_arm_token_fuse=0.0705 grad_shared_expert=0.3269 (18633:train_pytorch.py:850) + Training: 50%|█████ | 5020/10000 [1:07:51<46:34, 1.78it/s, loss=0.0165, lr=1.46e-05, step=5019] Training: 50%|█████ | 5020/10000 [1:07:51<46:34, 1.78it/s, loss=0.0053, lr=1.46e-05, step=5020] Training: 50%|█████ | 5021/10000 [1:07:52<49:08, 1.69it/s, loss=0.0053, lr=1.46e-05, step=5020] Training: 50%|█████ | 5021/10000 [1:07:52<49:08, 1.69it/s, loss=0.0208, lr=1.46e-05, step=5021] Training: 50%|█████ | 5022/10000 [1:07:52<52:23, 1.58it/s, loss=0.0208, lr=1.46e-05, step=5021] Training: 50%|█████ | 5022/10000 [1:07:52<52:23, 1.58it/s, loss=0.0042, lr=1.46e-05, step=5022] Training: 50%|█████ | 5023/10000 [1:07:53<49:31, 1.67it/s, loss=0.0042, lr=1.46e-05, step=5022] Training: 50%|█████ | 5023/10000 [1:07:53<49:31, 1.67it/s, loss=0.0173, lr=1.46e-05, step=5023] Training: 50%|█████ | 5024/10000 [1:07:54<51:07, 1.62it/s, loss=0.0173, lr=1.46e-05, step=5023] Training: 50%|█████ | 5024/10000 [1:07:54<51:07, 1.62it/s, loss=0.0090, lr=1.46e-05, step=5024] Training: 50%|█████ | 5025/10000 [1:07:54<47:42, 1.74it/s, loss=0.0090, lr=1.46e-05, step=5024] Training: 50%|█████ | 5025/10000 [1:07:54<47:42, 1.74it/s, loss=0.0106, lr=1.46e-05, step=5025] Training: 50%|█████ | 5026/10000 [1:07:55<45:21, 1.83it/s, loss=0.0106, lr=1.46e-05, step=5025] Training: 50%|█████ | 5026/10000 [1:07:55<45:21, 1.83it/s, loss=0.0030, lr=1.46e-05, step=5026] Training: 50%|█████ | 5027/10000 [1:07:55<43:47, 1.89it/s, loss=0.0030, lr=1.46e-05, step=5026] Training: 50%|█████ | 5027/10000 [1:07:55<43:47, 1.89it/s, loss=0.0061, lr=1.46e-05, step=5027] Training: 50%|█████ | 5028/10000 [1:07:56<47:04, 1.76it/s, loss=0.0061, lr=1.46e-05, step=5027] Training: 50%|█████ | 5028/10000 [1:07:56<47:04, 1.76it/s, loss=0.0048, lr=1.46e-05, step=5028] Training: 50%|█████ | 5029/10000 [1:07:56<50:20, 1.65it/s, loss=0.0048, lr=1.46e-05, step=5028] Training: 50%|█████ | 5029/10000 [1:07:56<50:20, 1.65it/s, loss=0.0332, lr=1.46e-05, step=5029]19:52:29.218 [I] step=5030 loss=0.0050 smoothed_loss=0.0110 lr=1.46e-05 grad_norm=0.4569 step_time=0.5277s data_time=0.0619s it/s=1.696 eta_to_10000=2929.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.1080 grad_arm_token_fuse=0.0629 grad_shared_expert=0.3817 (18633:train_pytorch.py:850) + Training: 50%|█████ | 5030/10000 [1:07:57<48:14, 1.72it/s, loss=0.0332, lr=1.46e-05, step=5029] Training: 50%|█████ | 5030/10000 [1:07:57<48:14, 1.72it/s, loss=0.0050, lr=1.46e-05, step=5030] Training: 50%|█████ | 5031/10000 [1:07:58<49:35, 1.67it/s, loss=0.0050, lr=1.46e-05, step=5030] Training: 50%|█████ | 5031/10000 [1:07:58<49:35, 1.67it/s, loss=0.0183, lr=1.46e-05, step=5031] Training: 50%|█████ | 5032/10000 [1:07:58<47:09, 1.76it/s, loss=0.0183, lr=1.46e-05, step=5031] Training: 50%|█████ | 5032/10000 [1:07:58<47:09, 1.76it/s, loss=0.0276, lr=1.46e-05, step=5032] Training: 50%|█████ | 5033/10000 [1:07:59<44:57, 1.84it/s, loss=0.0276, lr=1.46e-05, step=5032] Training: 50%|█████ | 5033/10000 [1:07:59<44:57, 1.84it/s, loss=0.0116, lr=1.46e-05, step=5033] Training: 50%|█████ | 5034/10000 [1:07:59<43:34, 1.90it/s, loss=0.0116, lr=1.46e-05, step=5033] Training: 50%|█████ | 5034/10000 [1:07:59<43:34, 1.90it/s, loss=0.0107, lr=1.46e-05, step=5034] Training: 50%|█████ | 5035/10000 [1:07:59<42:25, 1.95it/s, loss=0.0107, lr=1.46e-05, step=5034] Training: 50%|█████ | 5035/10000 [1:07:59<42:25, 1.95it/s, loss=0.0158, lr=1.46e-05, step=5035] Training: 50%|█████ | 5036/10000 [1:08:00<47:43, 1.73it/s, loss=0.0158, lr=1.46e-05, step=5035] Training: 50%|█████ | 5036/10000 [1:08:00<47:43, 1.73it/s, loss=0.0097, lr=1.45e-05, step=5036] Training: 50%|█████ | 5037/10000 [1:08:01<45:53, 1.80it/s, loss=0.0097, lr=1.45e-05, step=5036] Training: 50%|█████ | 5037/10000 [1:08:01<45:53, 1.80it/s, loss=0.0109, lr=1.45e-05, step=5037] Training: 50%|█████ | 5038/10000 [1:08:01<44:07, 1.87it/s, loss=0.0109, lr=1.45e-05, step=5037] Training: 50%|█████ | 5038/10000 [1:08:01<44:07, 1.87it/s, loss=0.0079, lr=1.45e-05, step=5038] Training: 50%|█████ | 5039/10000 [1:08:02<48:00, 1.72it/s, loss=0.0079, lr=1.45e-05, step=5038] Training: 50%|█████ | 5039/10000 [1:08:02<48:00, 1.72it/s, loss=0.0046, lr=1.45e-05, step=5039]19:52:34.743 [I] step=5040 loss=0.0065 smoothed_loss=0.0109 lr=1.46e-05 grad_norm=0.6825 step_time=0.4892s data_time=0.0633s it/s=1.810 eta_to_10000=2739.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0291 grad_action_out_proj_arms=0.2425 grad_arm_token_fuse=0.1705 grad_shared_expert=0.5690 (18633:train_pytorch.py:850) + Training: 50%|█████ | 5040/10000 [1:08:02<46:48, 1.77it/s, loss=0.0046, lr=1.45e-05, step=5039] Training: 50%|█████ | 5040/10000 [1:08:02<46:48, 1.77it/s, loss=0.0065, lr=1.45e-05, step=5040] Training: 50%|█████ | 5041/10000 [1:08:03<45:21, 1.82it/s, loss=0.0065, lr=1.45e-05, step=5040] Training: 50%|█████ | 5041/10000 [1:08:03<45:21, 1.82it/s, loss=0.0030, lr=1.45e-05, step=5041] Training: 50%|█████ | 5042/10000 [1:08:03<43:53, 1.88it/s, loss=0.0030, lr=1.45e-05, step=5041] Training: 50%|█████ | 5042/10000 [1:08:03<43:53, 1.88it/s, loss=0.0105, lr=1.45e-05, step=5042] Training: 50%|█████ | 5043/10000 [1:08:04<49:54, 1.66it/s, loss=0.0105, lr=1.45e-05, step=5042] Training: 50%|█████ | 5043/10000 [1:08:04<49:54, 1.66it/s, loss=0.0064, lr=1.45e-05, step=5043] Training: 50%|█████ | 5044/10000 [1:08:05<46:46, 1.77it/s, loss=0.0064, lr=1.45e-05, step=5043] Training: 50%|█████ | 5044/10000 [1:08:05<46:46, 1.77it/s, loss=0.0031, lr=1.45e-05, step=5044] Training: 50%|█████ | 5045/10000 [1:08:05<47:12, 1.75it/s, loss=0.0031, lr=1.45e-05, step=5044] Training: 50%|█████ | 5045/10000 [1:08:05<47:12, 1.75it/s, loss=0.0382, lr=1.45e-05, step=5045] Training: 50%|█████ | 5046/10000 [1:08:06<49:07, 1.68it/s, loss=0.0382, lr=1.45e-05, step=5045] Training: 50%|█████ | 5046/10000 [1:08:06<49:07, 1.68it/s, loss=0.0128, lr=1.45e-05, step=5046] Training: 50%|█████ | 5047/10000 [1:08:06<46:29, 1.78it/s, loss=0.0128, lr=1.45e-05, step=5046] Training: 50%|█████ | 5047/10000 [1:08:06<46:29, 1.78it/s, loss=0.0170, lr=1.45e-05, step=5047] Training: 50%|█████ | 5048/10000 [1:08:07<44:33, 1.85it/s, loss=0.0170, lr=1.45e-05, step=5047] Training: 50%|█████ | 5048/10000 [1:08:07<44:33, 1.85it/s, loss=0.0022, lr=1.45e-05, step=5048] Training: 50%|█████ | 5049/10000 [1:08:07<43:08, 1.91it/s, loss=0.0022, lr=1.45e-05, step=5048] Training: 50%|█████ | 5049/10000 [1:08:07<43:08, 1.91it/s, loss=0.0162, lr=1.45e-05, step=5049]19:52:40.524 [I] step=5050 loss=0.0128 smoothed_loss=0.0121 lr=1.45e-05 grad_norm=0.4927 step_time=0.5085s data_time=0.0696s it/s=1.730 eta_to_10000=2861.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0146 grad_action_out_proj_arms=0.1488 grad_arm_token_fuse=0.0782 grad_shared_expert=0.4246 (18633:train_pytorch.py:850) + Training: 50%|█████ | 5050/10000 [1:08:08<50:57, 1.62it/s, loss=0.0162, lr=1.45e-05, step=5049] Training: 50%|█████ | 5050/10000 [1:08:08<50:57, 1.62it/s, loss=0.0128, lr=1.45e-05, step=5050] Training: 51%|█████ | 5051/10000 [1:08:09<47:34, 1.73it/s, loss=0.0128, lr=1.45e-05, step=5050] Training: 51%|█████ | 5051/10000 [1:08:09<47:34, 1.73it/s, loss=0.0102, lr=1.45e-05, step=5051] Training: 51%|█████ | 5052/10000 [1:08:09<45:15, 1.82it/s, loss=0.0102, lr=1.45e-05, step=5051] Training: 51%|█████ | 5052/10000 [1:08:09<45:15, 1.82it/s, loss=0.0029, lr=1.45e-05, step=5052] Training: 51%|█████ | 5053/10000 [1:08:10<47:24, 1.74it/s, loss=0.0029, lr=1.45e-05, step=5052] Training: 51%|█████ | 5053/10000 [1:08:10<47:24, 1.74it/s, loss=0.0011, lr=1.45e-05, step=5053] Training: 51%|█████ | 5054/10000 [1:08:10<49:52, 1.65it/s, loss=0.0011, lr=1.45e-05, step=5053] Training: 51%|█████ | 5054/10000 [1:08:10<49:52, 1.65it/s, loss=0.0104, lr=1.45e-05, step=5054] Training: 51%|█████ | 5055/10000 [1:08:11<47:31, 1.73it/s, loss=0.0104, lr=1.45e-05, step=5054] Training: 51%|█████ | 5055/10000 [1:08:11<47:31, 1.73it/s, loss=0.0124, lr=1.45e-05, step=5055] Training: 51%|█████ | 5056/10000 [1:08:11<45:19, 1.82it/s, loss=0.0124, lr=1.45e-05, step=5055] Training: 51%|█████ | 5056/10000 [1:08:11<45:19, 1.82it/s, loss=0.0083, lr=1.45e-05, step=5056] Training: 51%|█████ | 5057/10000 [1:08:12<49:14, 1.67it/s, loss=0.0083, lr=1.45e-05, step=5056] Training: 51%|█████ | 5057/10000 [1:08:12<49:14, 1.67it/s, loss=0.0153, lr=1.45e-05, step=5057] Training: 51%|█████ | 5058/10000 [1:08:13<46:40, 1.76it/s, loss=0.0153, lr=1.45e-05, step=5057] Training: 51%|█████ | 5058/10000 [1:08:13<46:40, 1.76it/s, loss=0.0096, lr=1.45e-05, step=5058] Training: 51%|█████ | 5059/10000 [1:08:13<44:26, 1.85it/s, loss=0.0096, lr=1.45e-05, step=5058] Training: 51%|█████ | 5059/10000 [1:08:13<44:26, 1.85it/s, loss=0.0205, lr=1.45e-05, step=5059]19:52:46.048 [I] step=5060 loss=0.0138 smoothed_loss=0.0117 lr=1.45e-05 grad_norm=0.4766 step_time=0.4785s data_time=0.0741s it/s=1.810 eta_to_10000=2728.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0066 grad_action_out_proj_arms=0.0789 grad_arm_token_fuse=0.0337 grad_shared_expert=0.2994 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5060/10000 [1:08:14<45:11, 1.82it/s, loss=0.0205, lr=1.45e-05, step=5059] Training: 51%|█████ | 5060/10000 [1:08:14<45:11, 1.82it/s, loss=0.0138, lr=1.45e-05, step=5060] Training: 51%|█████ | 5061/10000 [1:08:15<53:01, 1.55it/s, loss=0.0138, lr=1.45e-05, step=5060] Training: 51%|█████ | 5061/10000 [1:08:15<53:01, 1.55it/s, loss=0.0134, lr=1.45e-05, step=5061] Training: 51%|█████ | 5062/10000 [1:08:15<49:30, 1.66it/s, loss=0.0134, lr=1.45e-05, step=5061] Training: 51%|█████ | 5062/10000 [1:08:15<49:30, 1.66it/s, loss=0.0045, lr=1.45e-05, step=5062] Training: 51%|█████ | 5063/10000 [1:08:16<47:27, 1.73it/s, loss=0.0045, lr=1.45e-05, step=5062] Training: 51%|█████ | 5063/10000 [1:08:16<47:27, 1.73it/s, loss=0.0050, lr=1.44e-05, step=5063] Training: 51%|█████ | 5064/10000 [1:08:16<49:53, 1.65it/s, loss=0.0050, lr=1.44e-05, step=5063] Training: 51%|█████ | 5064/10000 [1:08:16<49:53, 1.65it/s, loss=0.0197, lr=1.44e-05, step=5064] Training: 51%|█████ | 5065/10000 [1:08:17<52:14, 1.57it/s, loss=0.0197, lr=1.44e-05, step=5064] Training: 51%|█████ | 5065/10000 [1:08:17<52:14, 1.57it/s, loss=0.0229, lr=1.44e-05, step=5065] Training: 51%|█████ | 5066/10000 [1:08:17<48:46, 1.69it/s, loss=0.0229, lr=1.44e-05, step=5065] Training: 51%|█████ | 5066/10000 [1:08:17<48:46, 1.69it/s, loss=0.0079, lr=1.44e-05, step=5066] Training: 51%|█████ | 5067/10000 [1:08:18<46:42, 1.76it/s, loss=0.0079, lr=1.44e-05, step=5066] Training: 51%|█████ | 5067/10000 [1:08:18<46:42, 1.76it/s, loss=0.0477, lr=1.44e-05, step=5067] Training: 51%|█████ | 5068/10000 [1:08:18<44:37, 1.84it/s, loss=0.0477, lr=1.44e-05, step=5067] Training: 51%|█████ | 5068/10000 [1:08:18<44:37, 1.84it/s, loss=0.0549, lr=1.44e-05, step=5068] Training: 51%|█████ | 5069/10000 [1:08:19<47:44, 1.72it/s, loss=0.0549, lr=1.44e-05, step=5068] Training: 51%|█████ | 5069/10000 [1:08:19<47:44, 1.72it/s, loss=0.0042, lr=1.44e-05, step=5069]19:52:52.012 [I] step=5070 loss=0.0062 smoothed_loss=0.0169 lr=1.44e-05 grad_norm=0.5921 step_time=0.5274s data_time=0.0688s it/s=1.677 eta_to_10000=2939.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0085 grad_action_out_proj_arms=0.0814 grad_arm_token_fuse=0.0421 grad_shared_expert=0.3189 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5070/10000 [1:08:20<46:43, 1.76it/s, loss=0.0042, lr=1.44e-05, step=5069] Training: 51%|█████ | 5070/10000 [1:08:20<46:43, 1.76it/s, loss=0.0062, lr=1.44e-05, step=5070] Training: 51%|█████ | 5071/10000 [1:08:20<49:35, 1.66it/s, loss=0.0062, lr=1.44e-05, step=5070] Training: 51%|█████ | 5071/10000 [1:08:20<49:35, 1.66it/s, loss=0.0062, lr=1.44e-05, step=5071] Training: 51%|█████ | 5072/10000 [1:08:21<54:45, 1.50it/s, loss=0.0062, lr=1.44e-05, step=5071] Training: 51%|█████ | 5072/10000 [1:08:21<54:45, 1.50it/s, loss=0.0018, lr=1.44e-05, step=5072] Training: 51%|█████ | 5073/10000 [1:08:22<50:54, 1.61it/s, loss=0.0018, lr=1.44e-05, step=5072] Training: 51%|█████ | 5073/10000 [1:08:22<50:54, 1.61it/s, loss=0.1002, lr=1.44e-05, step=5073] Training: 51%|█████ | 5074/10000 [1:08:22<47:50, 1.72it/s, loss=0.1002, lr=1.44e-05, step=5073] Training: 51%|█████ | 5074/10000 [1:08:22<47:50, 1.72it/s, loss=0.0072, lr=1.44e-05, step=5074] Training: 51%|█████ | 5075/10000 [1:08:23<45:30, 1.80it/s, loss=0.0072, lr=1.44e-05, step=5074] Training: 51%|█████ | 5075/10000 [1:08:23<45:30, 1.80it/s, loss=0.0307, lr=1.44e-05, step=5075] Training: 51%|█████ | 5076/10000 [1:08:23<50:26, 1.63it/s, loss=0.0307, lr=1.44e-05, step=5075] Training: 51%|█████ | 5076/10000 [1:08:23<50:26, 1.63it/s, loss=0.0067, lr=1.44e-05, step=5076] Training: 51%|█████ | 5077/10000 [1:08:24<47:02, 1.74it/s, loss=0.0067, lr=1.44e-05, step=5076] Training: 51%|█████ | 5077/10000 [1:08:24<47:02, 1.74it/s, loss=0.0069, lr=1.44e-05, step=5077] Training: 51%|█████ | 5078/10000 [1:08:25<49:13, 1.67it/s, loss=0.0069, lr=1.44e-05, step=5077] Training: 51%|█████ | 5078/10000 [1:08:25<49:13, 1.67it/s, loss=0.0068, lr=1.44e-05, step=5078] Training: 51%|█████ | 5079/10000 [1:08:25<53:50, 1.52it/s, loss=0.0068, lr=1.44e-05, step=5078] Training: 51%|█████ | 5079/10000 [1:08:25<53:50, 1.52it/s, loss=0.0069, lr=1.44e-05, step=5079]19:52:58.218 [I] step=5080 loss=0.0157 smoothed_loss=0.0169 lr=1.44e-05 grad_norm=0.4459 step_time=0.5587s data_time=0.0619s it/s=1.612 eta_to_10000=3053.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0177 grad_action_out_proj_arms=0.1439 grad_arm_token_fuse=0.0962 grad_shared_expert=0.4195 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5080/10000 [1:08:26<50:41, 1.62it/s, loss=0.0069, lr=1.44e-05, step=5079] Training: 51%|█████ | 5080/10000 [1:08:26<50:41, 1.62it/s, loss=0.0157, lr=1.44e-05, step=5080] Training: 51%|█████ | 5081/10000 [1:08:26<47:49, 1.71it/s, loss=0.0157, lr=1.44e-05, step=5080] Training: 51%|█████ | 5081/10000 [1:08:26<47:49, 1.71it/s, loss=0.0050, lr=1.44e-05, step=5081] Training: 51%|█████ | 5082/10000 [1:08:27<45:28, 1.80it/s, loss=0.0050, lr=1.44e-05, step=5081] Training: 51%|█████ | 5082/10000 [1:08:27<45:28, 1.80it/s, loss=0.0023, lr=1.44e-05, step=5082] Training: 51%|█████ | 5083/10000 [1:08:28<47:21, 1.73it/s, loss=0.0023, lr=1.44e-05, step=5082] Training: 51%|█████ | 5083/10000 [1:08:28<47:21, 1.73it/s, loss=0.0333, lr=1.44e-05, step=5083] Training: 51%|█████ | 5084/10000 [1:08:28<45:16, 1.81it/s, loss=0.0333, lr=1.44e-05, step=5083] Training: 51%|█████ | 5084/10000 [1:08:28<45:16, 1.81it/s, loss=0.0063, lr=1.44e-05, step=5084] Training: 51%|█████ | 5085/10000 [1:08:29<46:01, 1.78it/s, loss=0.0063, lr=1.44e-05, step=5084] Training: 51%|█████ | 5085/10000 [1:08:29<46:01, 1.78it/s, loss=0.0105, lr=1.44e-05, step=5085] Training: 51%|█████ | 5086/10000 [1:08:29<52:52, 1.55it/s, loss=0.0105, lr=1.44e-05, step=5085] Training: 51%|█████ | 5086/10000 [1:08:29<52:52, 1.55it/s, loss=0.0113, lr=1.44e-05, step=5086] Training: 51%|█████ | 5087/10000 [1:08:30<52:03, 1.57it/s, loss=0.0113, lr=1.44e-05, step=5086] Training: 51%|█████ | 5087/10000 [1:08:30<52:03, 1.57it/s, loss=0.0025, lr=1.44e-05, step=5087] Training: 51%|█████ | 5088/10000 [1:08:31<49:30, 1.65it/s, loss=0.0025, lr=1.44e-05, step=5087] Training: 51%|█████ | 5088/10000 [1:08:31<49:30, 1.65it/s, loss=0.0056, lr=1.44e-05, step=5088] Training: 51%|█████ | 5089/10000 [1:08:31<46:30, 1.76it/s, loss=0.0056, lr=1.44e-05, step=5088] Training: 51%|█████ | 5089/10000 [1:08:31<46:30, 1.76it/s, loss=0.0103, lr=1.44e-05, step=5089]19:53:03.931 [I] step=5090 loss=0.0103 smoothed_loss=0.0121 lr=1.44e-05 grad_norm=0.4865 step_time=0.4947s data_time=0.0766s it/s=1.751 eta_to_10000=2804.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0135 grad_action_out_proj_arms=0.1443 grad_arm_token_fuse=0.0716 grad_shared_expert=0.4298 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5090/10000 [1:08:32<45:55, 1.78it/s, loss=0.0103, lr=1.44e-05, step=5089] Training: 51%|█████ | 5090/10000 [1:08:32<45:55, 1.78it/s, loss=0.0103, lr=1.43e-05, step=5090] Training: 51%|█████ | 5091/10000 [1:08:32<48:32, 1.69it/s, loss=0.0103, lr=1.43e-05, step=5090] Training: 51%|█████ | 5091/10000 [1:08:32<48:32, 1.69it/s, loss=0.0162, lr=1.43e-05, step=5091] Training: 51%|█████ | 5092/10000 [1:08:33<45:42, 1.79it/s, loss=0.0162, lr=1.43e-05, step=5091] Training: 51%|█████ | 5092/10000 [1:08:33<45:42, 1.79it/s, loss=0.0032, lr=1.43e-05, step=5092] Training: 51%|█████ | 5093/10000 [1:08:34<52:15, 1.57it/s, loss=0.0032, lr=1.43e-05, step=5092] Training: 51%|█████ | 5093/10000 [1:08:34<52:15, 1.57it/s, loss=0.0064, lr=1.43e-05, step=5093] Training: 51%|█████ | 5094/10000 [1:08:34<50:48, 1.61it/s, loss=0.0064, lr=1.43e-05, step=5093] Training: 51%|█████ | 5094/10000 [1:08:34<50:48, 1.61it/s, loss=0.0298, lr=1.43e-05, step=5094] Training: 51%|█████ | 5095/10000 [1:08:35<57:04, 1.43it/s, loss=0.0298, lr=1.43e-05, step=5094] Training: 51%|█████ | 5095/10000 [1:08:35<57:04, 1.43it/s, loss=0.0299, lr=1.43e-05, step=5095] Training: 51%|█████ | 5096/10000 [1:08:36<58:09, 1.41it/s, loss=0.0299, lr=1.43e-05, step=5095] Training: 51%|█████ | 5096/10000 [1:08:36<58:09, 1.41it/s, loss=0.0042, lr=1.43e-05, step=5096] Training: 51%|█████ | 5097/10000 [1:08:36<53:17, 1.53it/s, loss=0.0042, lr=1.43e-05, step=5096] Training: 51%|█████ | 5097/10000 [1:08:36<53:17, 1.53it/s, loss=0.0015, lr=1.43e-05, step=5097] Training: 51%|█████ | 5098/10000 [1:08:37<53:34, 1.53it/s, loss=0.0015, lr=1.43e-05, step=5097] Training: 51%|█████ | 5098/10000 [1:08:37<53:34, 1.53it/s, loss=0.0238, lr=1.43e-05, step=5098] Training: 51%|█████ | 5099/10000 [1:08:38<54:06, 1.51it/s, loss=0.0238, lr=1.43e-05, step=5098] Training: 51%|█████ | 5099/10000 [1:08:38<54:06, 1.51it/s, loss=0.0025, lr=1.43e-05, step=5099]19:53:10.824 [I] step=5100 loss=0.0017 smoothed_loss=0.0113 lr=1.43e-05 grad_norm=0.4242 step_time=0.5944s data_time=0.0950s it/s=1.451 eta_to_10000=3377.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.0899 grad_arm_token_fuse=0.0428 grad_shared_expert=0.3084 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5100/10000 [1:08:38<59:05, 1.38it/s, loss=0.0025, lr=1.43e-05, step=5099] Training: 51%|█████ | 5100/10000 [1:08:38<59:05, 1.38it/s, loss=0.0017, lr=1.43e-05, step=5100] Training: 51%|█████ | 5101/10000 [1:08:39<58:05, 1.41it/s, loss=0.0017, lr=1.43e-05, step=5100] Training: 51%|█████ | 5101/10000 [1:08:39<58:05, 1.41it/s, loss=0.0339, lr=1.43e-05, step=5101] Training: 51%|█████ | 5102/10000 [1:08:40<52:51, 1.54it/s, loss=0.0339, lr=1.43e-05, step=5101] Training: 51%|█████ | 5102/10000 [1:08:40<52:51, 1.54it/s, loss=0.0049, lr=1.43e-05, step=5102] Training: 51%|█████ | 5103/10000 [1:08:40<49:41, 1.64it/s, loss=0.0049, lr=1.43e-05, step=5102] Training: 51%|█████ | 5103/10000 [1:08:40<49:41, 1.64it/s, loss=0.0079, lr=1.43e-05, step=5103] Training: 51%|█████ | 5104/10000 [1:08:41<51:28, 1.59it/s, loss=0.0079, lr=1.43e-05, step=5103] Training: 51%|█████ | 5104/10000 [1:08:41<51:28, 1.59it/s, loss=0.0660, lr=1.43e-05, step=5104] Training: 51%|█████ | 5105/10000 [1:08:41<48:40, 1.68it/s, loss=0.0660, lr=1.43e-05, step=5104] Training: 51%|█████ | 5105/10000 [1:08:41<48:40, 1.68it/s, loss=0.0190, lr=1.43e-05, step=5105] Training: 51%|█████ | 5106/10000 [1:08:42<51:33, 1.58it/s, loss=0.0190, lr=1.43e-05, step=5105] Training: 51%|█████ | 5106/10000 [1:08:42<51:33, 1.58it/s, loss=0.0107, lr=1.43e-05, step=5106] Training: 51%|█████ | 5107/10000 [1:08:43<54:57, 1.48it/s, loss=0.0107, lr=1.43e-05, step=5106] Training: 51%|█████ | 5107/10000 [1:08:43<54:57, 1.48it/s, loss=0.0152, lr=1.43e-05, step=5107] Training: 51%|█████ | 5108/10000 [1:08:44<54:57, 1.48it/s, loss=0.0152, lr=1.43e-05, step=5107] Training: 51%|█████ | 5108/10000 [1:08:44<54:57, 1.48it/s, loss=0.0154, lr=1.43e-05, step=5108] Training: 51%|█████ | 5109/10000 [1:08:44<50:33, 1.61it/s, loss=0.0154, lr=1.43e-05, step=5108] Training: 51%|█████ | 5109/10000 [1:08:44<50:33, 1.61it/s, loss=0.0120, lr=1.43e-05, step=5109]19:53:17.098 [I] step=5110 loss=0.0085 smoothed_loss=0.0155 lr=1.43e-05 grad_norm=0.4784 step_time=0.5318s data_time=0.0956s it/s=1.594 eta_to_10000=3067.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0069 grad_action_out_proj_arms=0.0916 grad_arm_token_fuse=0.0350 grad_shared_expert=0.3694 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5110/10000 [1:08:45<52:57, 1.54it/s, loss=0.0120, lr=1.43e-05, step=5109] Training: 51%|█████ | 5110/10000 [1:08:45<52:57, 1.54it/s, loss=0.0085, lr=1.43e-05, step=5110] Training: 51%|█████ | 5111/10000 [1:08:45<48:43, 1.67it/s, loss=0.0085, lr=1.43e-05, step=5110] Training: 51%|█████ | 5111/10000 [1:08:45<48:43, 1.67it/s, loss=0.0139, lr=1.43e-05, step=5111] Training: 51%|█████ | 5112/10000 [1:08:46<46:37, 1.75it/s, loss=0.0139, lr=1.43e-05, step=5111] Training: 51%|█████ | 5112/10000 [1:08:46<46:37, 1.75it/s, loss=0.0071, lr=1.43e-05, step=5112] Training: 51%|█████ | 5113/10000 [1:08:47<53:02, 1.54it/s, loss=0.0071, lr=1.43e-05, step=5112] Training: 51%|█████ | 5113/10000 [1:08:47<53:02, 1.54it/s, loss=0.0048, lr=1.43e-05, step=5113] Training: 51%|█████ | 5114/10000 [1:08:47<55:00, 1.48it/s, loss=0.0048, lr=1.43e-05, step=5113] Training: 51%|█████ | 5114/10000 [1:08:47<55:00, 1.48it/s, loss=0.0092, lr=1.43e-05, step=5114] Training: 51%|█████ | 5115/10000 [1:08:48<56:55, 1.43it/s, loss=0.0092, lr=1.43e-05, step=5114] Training: 51%|█████ | 5115/10000 [1:08:48<56:55, 1.43it/s, loss=0.0388, lr=1.43e-05, step=5115] Training: 51%|█████ | 5116/10000 [1:08:49<56:03, 1.45it/s, loss=0.0388, lr=1.43e-05, step=5115] Training: 51%|█████ | 5116/10000 [1:08:49<56:03, 1.45it/s, loss=0.0159, lr=1.43e-05, step=5116] Training: 51%|█████ | 5117/10000 [1:08:49<50:59, 1.60it/s, loss=0.0159, lr=1.43e-05, step=5116] Training: 51%|█████ | 5117/10000 [1:08:49<50:59, 1.60it/s, loss=0.0143, lr=1.42e-05, step=5117] Training: 51%|█████ | 5118/10000 [1:08:50<48:25, 1.68it/s, loss=0.0143, lr=1.42e-05, step=5117] Training: 51%|█████ | 5118/10000 [1:08:50<48:25, 1.68it/s, loss=0.0175, lr=1.42e-05, step=5118] Training: 51%|█████ | 5119/10000 [1:08:50<46:44, 1.74it/s, loss=0.0175, lr=1.42e-05, step=5118] Training: 51%|█████ | 5119/10000 [1:08:50<46:44, 1.74it/s, loss=0.0188, lr=1.42e-05, step=5119]19:53:23.190 [I] step=5120 loss=0.0029 smoothed_loss=0.0147 lr=1.43e-05 grad_norm=0.4556 step_time=0.5305s data_time=0.0788s it/s=1.642 eta_to_10000=2972.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.1078 grad_arm_token_fuse=0.0616 grad_shared_expert=0.2550 (18633:train_pytorch.py:850) + Training: 51%|█████ | 5120/10000 [1:08:51<47:01, 1.73it/s, loss=0.0188, lr=1.42e-05, step=5119] Training: 51%|█████ | 5120/10000 [1:08:51<47:01, 1.73it/s, loss=0.0029, lr=1.42e-05, step=5120] Training: 51%|█████ | 5121/10000 [1:08:52<50:43, 1.60it/s, loss=0.0029, lr=1.42e-05, step=5120] Training: 51%|█████ | 5121/10000 [1:08:52<50:43, 1.60it/s, loss=0.0024, lr=1.42e-05, step=5121] Training: 51%|█████ | 5122/10000 [1:08:52<54:34, 1.49it/s, loss=0.0024, lr=1.42e-05, step=5121] Training: 51%|█████ | 5122/10000 [1:08:52<54:34, 1.49it/s, loss=0.0103, lr=1.42e-05, step=5122] Training: 51%|█████ | 5123/10000 [1:08:53<50:25, 1.61it/s, loss=0.0103, lr=1.42e-05, step=5122] Training: 51%|█████ | 5123/10000 [1:08:53<50:25, 1.61it/s, loss=0.0095, lr=1.42e-05, step=5123] Training: 51%|█████ | 5124/10000 [1:08:53<47:15, 1.72it/s, loss=0.0095, lr=1.42e-05, step=5123] Training: 51%|█████ | 5124/10000 [1:08:53<47:15, 1.72it/s, loss=0.0144, lr=1.42e-05, step=5124] Training: 51%|█████▏ | 5125/10000 [1:08:54<45:21, 1.79it/s, loss=0.0144, lr=1.42e-05, step=5124] Training: 51%|█████▏ | 5125/10000 [1:08:54<45:21, 1.79it/s, loss=0.0086, lr=1.42e-05, step=5125] Training: 51%|█████▏ | 5126/10000 [1:08:54<43:50, 1.85it/s, loss=0.0086, lr=1.42e-05, step=5125] Training: 51%|█████▏ | 5126/10000 [1:08:54<43:50, 1.85it/s, loss=0.0052, lr=1.42e-05, step=5126] Training: 51%|█████▏ | 5127/10000 [1:08:55<43:22, 1.87it/s, loss=0.0052, lr=1.42e-05, step=5126] Training: 51%|█████▏ | 5127/10000 [1:08:55<43:22, 1.87it/s, loss=0.0038, lr=1.42e-05, step=5127] Training: 51%|█████▏ | 5128/10000 [1:08:56<48:32, 1.67it/s, loss=0.0038, lr=1.42e-05, step=5127] Training: 51%|█████▏ | 5128/10000 [1:08:56<48:32, 1.67it/s, loss=0.0133, lr=1.42e-05, step=5128] Training: 51%|█████▏ | 5129/10000 [1:08:56<51:59, 1.56it/s, loss=0.0133, lr=1.42e-05, step=5128] Training: 51%|█████▏ | 5129/10000 [1:08:56<51:59, 1.56it/s, loss=0.0261, lr=1.42e-05, step=5129]19:53:29.263 [I] step=5130 loss=0.0173 smoothed_loss=0.0132 lr=1.42e-05 grad_norm=0.4076 step_time=0.5487s data_time=0.0586s it/s=1.647 eta_to_10000=2956.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0112 grad_action_out_proj_arms=0.1184 grad_arm_token_fuse=0.0615 grad_shared_expert=0.3449 (18633:train_pytorch.py:850) + Training: 51%|█████▏ | 5130/10000 [1:08:57<50:01, 1.62it/s, loss=0.0261, lr=1.42e-05, step=5129] Training: 51%|█████▏ | 5130/10000 [1:08:57<50:01, 1.62it/s, loss=0.0173, lr=1.42e-05, step=5130] Training: 51%|█████▏ | 5131/10000 [1:08:57<47:00, 1.73it/s, loss=0.0173, lr=1.42e-05, step=5130] Training: 51%|█████▏ | 5131/10000 [1:08:57<47:00, 1.73it/s, loss=0.0195, lr=1.42e-05, step=5131] Training: 51%|█████▏ | 5132/10000 [1:08:58<45:10, 1.80it/s, loss=0.0195, lr=1.42e-05, step=5131] Training: 51%|█████▏ | 5132/10000 [1:08:58<45:10, 1.80it/s, loss=0.0084, lr=1.42e-05, step=5132] Training: 51%|█████▏ | 5133/10000 [1:08:58<44:04, 1.84it/s, loss=0.0084, lr=1.42e-05, step=5132] Training: 51%|█████▏ | 5133/10000 [1:08:58<44:04, 1.84it/s, loss=0.0095, lr=1.42e-05, step=5133] Training: 51%|█████▏ | 5134/10000 [1:08:59<42:41, 1.90it/s, loss=0.0095, lr=1.42e-05, step=5133] Training: 51%|█████▏ | 5134/10000 [1:08:59<42:41, 1.90it/s, loss=0.0100, lr=1.42e-05, step=5134] Training: 51%|█████▏ | 5135/10000 [1:09:00<46:14, 1.75it/s, loss=0.0100, lr=1.42e-05, step=5134] Training: 51%|█████▏ | 5135/10000 [1:09:00<46:14, 1.75it/s, loss=0.0292, lr=1.42e-05, step=5135] Training: 51%|█████▏ | 5136/10000 [1:09:00<50:25, 1.61it/s, loss=0.0292, lr=1.42e-05, step=5135] Training: 51%|█████▏ | 5136/10000 [1:09:00<50:25, 1.61it/s, loss=0.0047, lr=1.42e-05, step=5136] Training: 51%|█████▏ | 5137/10000 [1:09:01<47:38, 1.70it/s, loss=0.0047, lr=1.42e-05, step=5136] Training: 51%|█████▏ | 5137/10000 [1:09:01<47:38, 1.70it/s, loss=0.0241, lr=1.42e-05, step=5137] Training: 51%|█████▏ | 5138/10000 [1:09:01<45:39, 1.77it/s, loss=0.0241, lr=1.42e-05, step=5137] Training: 51%|█████▏ | 5138/10000 [1:09:01<45:39, 1.77it/s, loss=0.0133, lr=1.42e-05, step=5138] Training: 51%|█████▏ | 5139/10000 [1:09:02<44:09, 1.83it/s, loss=0.0133, lr=1.42e-05, step=5138] Training: 51%|█████▏ | 5139/10000 [1:09:02<44:09, 1.83it/s, loss=0.0051, lr=1.42e-05, step=5139]19:53:34.733 [I] step=5140 loss=0.0151 smoothed_loss=0.0135 lr=1.42e-05 grad_norm=0.3875 step_time=0.4872s data_time=0.0599s it/s=1.829 eta_to_10000=2657.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0122 grad_action_out_proj_arms=0.1056 grad_arm_token_fuse=0.0633 grad_shared_expert=0.4365 (18633:train_pytorch.py:850) + Training: 51%|█████▏ | 5140/10000 [1:09:02<44:03, 1.84it/s, loss=0.0051, lr=1.42e-05, step=5139] Training: 51%|█████▏ | 5140/10000 [1:09:02<44:03, 1.84it/s, loss=0.0151, lr=1.42e-05, step=5140] Training: 51%|█████▏ | 5141/10000 [1:09:03<43:27, 1.86it/s, loss=0.0151, lr=1.42e-05, step=5140] Training: 51%|█████▏ | 5141/10000 [1:09:03<43:27, 1.86it/s, loss=0.0134, lr=1.42e-05, step=5141] Training: 51%|█████▏ | 5142/10000 [1:09:04<46:41, 1.73it/s, loss=0.0134, lr=1.42e-05, step=5141] Training: 51%|█████▏ | 5142/10000 [1:09:04<46:41, 1.73it/s, loss=0.0124, lr=1.42e-05, step=5142] Training: 51%|█████▏ | 5143/10000 [1:09:04<52:29, 1.54it/s, loss=0.0124, lr=1.42e-05, step=5142] Training: 51%|█████▏ | 5143/10000 [1:09:04<52:29, 1.54it/s, loss=0.0249, lr=1.42e-05, step=5143] Training: 51%|█████▏ | 5144/10000 [1:09:05<48:56, 1.65it/s, loss=0.0249, lr=1.42e-05, step=5143] Training: 51%|█████▏ | 5144/10000 [1:09:05<48:56, 1.65it/s, loss=0.0237, lr=1.41e-05, step=5144] Training: 51%|█████▏ | 5145/10000 [1:09:05<46:20, 1.75it/s, loss=0.0237, lr=1.41e-05, step=5144] Training: 51%|█████▏ | 5145/10000 [1:09:05<46:20, 1.75it/s, loss=0.0091, lr=1.41e-05, step=5145] Training: 51%|█████▏ | 5146/10000 [1:09:06<44:11, 1.83it/s, loss=0.0091, lr=1.41e-05, step=5145] Training: 51%|█████▏ | 5146/10000 [1:09:06<44:11, 1.83it/s, loss=0.0098, lr=1.41e-05, step=5146] Training: 51%|█████▏ | 5147/10000 [1:09:06<44:22, 1.82it/s, loss=0.0098, lr=1.41e-05, step=5146] Training: 51%|█████▏ | 5147/10000 [1:09:06<44:22, 1.82it/s, loss=0.0253, lr=1.41e-05, step=5147] Training: 51%|█████▏ | 5148/10000 [1:09:07<42:37, 1.90it/s, loss=0.0253, lr=1.41e-05, step=5147] Training: 51%|█████▏ | 5148/10000 [1:09:07<42:37, 1.90it/s, loss=0.0110, lr=1.41e-05, step=5148] Training: 51%|█████▏ | 5149/10000 [1:09:08<45:41, 1.77it/s, loss=0.0110, lr=1.41e-05, step=5148] Training: 51%|█████▏ | 5149/10000 [1:09:08<45:41, 1.77it/s, loss=0.0249, lr=1.41e-05, step=5149]19:53:40.712 [I] step=5150 loss=0.0239 smoothed_loss=0.0168 lr=1.41e-05 grad_norm=0.4609 step_time=0.5341s data_time=0.0637s it/s=1.673 eta_to_10000=2899.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0082 grad_action_out_proj_arms=0.1034 grad_arm_token_fuse=0.0451 grad_shared_expert=0.2669 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5150/10000 [1:09:08<51:27, 1.57it/s, loss=0.0249, lr=1.41e-05, step=5149] Training: 52%|█████▏ | 5150/10000 [1:09:08<51:27, 1.57it/s, loss=0.0239, lr=1.41e-05, step=5150] Training: 52%|█████▏ | 5151/10000 [1:09:09<48:21, 1.67it/s, loss=0.0239, lr=1.41e-05, step=5150] Training: 52%|█████▏ | 5151/10000 [1:09:09<48:21, 1.67it/s, loss=0.0123, lr=1.41e-05, step=5151] Training: 52%|█████▏ | 5152/10000 [1:09:09<46:09, 1.75it/s, loss=0.0123, lr=1.41e-05, step=5151] Training: 52%|█████▏ | 5152/10000 [1:09:09<46:09, 1.75it/s, loss=0.0119, lr=1.41e-05, step=5152] Training: 52%|█████▏ | 5153/10000 [1:09:10<44:19, 1.82it/s, loss=0.0119, lr=1.41e-05, step=5152] Training: 52%|█████▏ | 5153/10000 [1:09:10<44:19, 1.82it/s, loss=0.0084, lr=1.41e-05, step=5153] Training: 52%|█████▏ | 5154/10000 [1:09:10<44:15, 1.82it/s, loss=0.0084, lr=1.41e-05, step=5153] Training: 52%|█████▏ | 5154/10000 [1:09:10<44:15, 1.82it/s, loss=0.0100, lr=1.41e-05, step=5154] Training: 52%|█████▏ | 5155/10000 [1:09:11<42:36, 1.89it/s, loss=0.0100, lr=1.41e-05, step=5154] Training: 52%|█████▏ | 5155/10000 [1:09:11<42:36, 1.89it/s, loss=0.0641, lr=1.41e-05, step=5155] Training: 52%|█████▏ | 5156/10000 [1:09:12<45:29, 1.77it/s, loss=0.0641, lr=1.41e-05, step=5155] Training: 52%|█████▏ | 5156/10000 [1:09:12<45:29, 1.77it/s, loss=0.0022, lr=1.41e-05, step=5156] Training: 52%|█████▏ | 5157/10000 [1:09:12<49:18, 1.64it/s, loss=0.0022, lr=1.41e-05, step=5156] Training: 52%|█████▏ | 5157/10000 [1:09:12<49:18, 1.64it/s, loss=0.0072, lr=1.41e-05, step=5157] Training: 52%|█████▏ | 5158/10000 [1:09:13<46:33, 1.73it/s, loss=0.0072, lr=1.41e-05, step=5157] Training: 52%|█████▏ | 5158/10000 [1:09:13<46:33, 1.73it/s, loss=0.0154, lr=1.41e-05, step=5158] Training: 52%|█████▏ | 5159/10000 [1:09:13<45:16, 1.78it/s, loss=0.0154, lr=1.41e-05, step=5158] Training: 52%|█████▏ | 5159/10000 [1:09:13<45:16, 1.78it/s, loss=0.0183, lr=1.41e-05, step=5159]19:53:46.181 [I] step=5160 loss=0.0079 smoothed_loss=0.0159 lr=1.41e-05 grad_norm=0.5088 step_time=0.4840s data_time=0.0629s it/s=1.829 eta_to_10000=2646.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0186 grad_action_out_proj_arms=0.1910 grad_arm_token_fuse=0.1007 grad_shared_expert=0.7121 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5160/10000 [1:09:14<44:43, 1.80it/s, loss=0.0183, lr=1.41e-05, step=5159] Training: 52%|█████▏ | 5160/10000 [1:09:14<44:43, 1.80it/s, loss=0.0079, lr=1.41e-05, step=5160] Training: 52%|█████▏ | 5161/10000 [1:09:14<43:28, 1.85it/s, loss=0.0079, lr=1.41e-05, step=5160] Training: 52%|█████▏ | 5161/10000 [1:09:14<43:28, 1.85it/s, loss=0.0230, lr=1.41e-05, step=5161] Training: 52%|█████▏ | 5162/10000 [1:09:15<42:06, 1.91it/s, loss=0.0230, lr=1.41e-05, step=5161] Training: 52%|█████▏ | 5162/10000 [1:09:15<42:06, 1.91it/s, loss=0.0177, lr=1.41e-05, step=5162] Training: 52%|█████▏ | 5163/10000 [1:09:16<45:34, 1.77it/s, loss=0.0177, lr=1.41e-05, step=5162] Training: 52%|█████▏ | 5163/10000 [1:09:16<45:34, 1.77it/s, loss=0.0237, lr=1.41e-05, step=5163] Training: 52%|█████▏ | 5164/10000 [1:09:16<49:14, 1.64it/s, loss=0.0237, lr=1.41e-05, step=5163] Training: 52%|█████▏ | 5164/10000 [1:09:16<49:14, 1.64it/s, loss=0.0273, lr=1.41e-05, step=5164] Training: 52%|█████▏ | 5165/10000 [1:09:17<54:21, 1.48it/s, loss=0.0273, lr=1.41e-05, step=5164] Training: 52%|█████▏ | 5165/10000 [1:09:17<54:21, 1.48it/s, loss=0.0073, lr=1.41e-05, step=5165] Training: 52%|█████▏ | 5166/10000 [1:09:18<49:46, 1.62it/s, loss=0.0073, lr=1.41e-05, step=5165] Training: 52%|█████▏ | 5166/10000 [1:09:18<49:46, 1.62it/s, loss=0.0094, lr=1.41e-05, step=5166] Training: 52%|█████▏ | 5167/10000 [1:09:18<47:04, 1.71it/s, loss=0.0094, lr=1.41e-05, step=5166] Training: 52%|█████▏ | 5167/10000 [1:09:18<47:04, 1.71it/s, loss=0.0094, lr=1.41e-05, step=5167] Training: 52%|█████▏ | 5168/10000 [1:09:19<44:58, 1.79it/s, loss=0.0094, lr=1.41e-05, step=5167] Training: 52%|█████▏ | 5168/10000 [1:09:19<44:58, 1.79it/s, loss=0.0048, lr=1.41e-05, step=5168] Training: 52%|█████▏ | 5169/10000 [1:09:19<43:26, 1.85it/s, loss=0.0048, lr=1.41e-05, step=5168] Training: 52%|█████▏ | 5169/10000 [1:09:19<43:26, 1.85it/s, loss=0.0179, lr=1.41e-05, step=5169]19:53:52.085 [I] step=5170 loss=0.0094 smoothed_loss=0.0144 lr=1.41e-05 grad_norm=0.4576 step_time=0.5287s data_time=0.0617s it/s=1.694 eta_to_10000=2851.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0326 grad_action_out_proj_arms=0.1736 grad_arm_token_fuse=0.1671 grad_shared_expert=0.4935 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5170/10000 [1:09:20<47:56, 1.68it/s, loss=0.0179, lr=1.41e-05, step=5169] Training: 52%|█████▏ | 5170/10000 [1:09:20<47:56, 1.68it/s, loss=0.0094, lr=1.41e-05, step=5170] Training: 52%|█████▏ | 5171/10000 [1:09:20<50:07, 1.61it/s, loss=0.0094, lr=1.41e-05, step=5170] Training: 52%|█████▏ | 5171/10000 [1:09:20<50:07, 1.61it/s, loss=0.0050, lr=1.40e-05, step=5171] Training: 52%|█████▏ | 5172/10000 [1:09:21<53:15, 1.51it/s, loss=0.0050, lr=1.40e-05, step=5171] Training: 52%|█████▏ | 5172/10000 [1:09:21<53:15, 1.51it/s, loss=0.0079, lr=1.40e-05, step=5172] Training: 52%|█████▏ | 5173/10000 [1:09:22<49:08, 1.64it/s, loss=0.0079, lr=1.40e-05, step=5172] Training: 52%|█████▏ | 5173/10000 [1:09:22<49:08, 1.64it/s, loss=0.0148, lr=1.40e-05, step=5173] Training: 52%|█████▏ | 5174/10000 [1:09:22<46:26, 1.73it/s, loss=0.0148, lr=1.40e-05, step=5173] Training: 52%|█████▏ | 5174/10000 [1:09:22<46:26, 1.73it/s, loss=0.0087, lr=1.40e-05, step=5174] Training: 52%|█████▏ | 5175/10000 [1:09:23<44:50, 1.79it/s, loss=0.0087, lr=1.40e-05, step=5174] Training: 52%|█████▏ | 5175/10000 [1:09:23<44:50, 1.79it/s, loss=0.0509, lr=1.40e-05, step=5175] Training: 52%|█████▏ | 5176/10000 [1:09:23<43:10, 1.86it/s, loss=0.0509, lr=1.40e-05, step=5175] Training: 52%|█████▏ | 5176/10000 [1:09:23<43:10, 1.86it/s, loss=0.0149, lr=1.40e-05, step=5176] Training: 52%|█████▏ | 5177/10000 [1:09:24<47:30, 1.69it/s, loss=0.0149, lr=1.40e-05, step=5176] Training: 52%|█████▏ | 5177/10000 [1:09:24<47:30, 1.69it/s, loss=0.0064, lr=1.40e-05, step=5177] Training: 52%|█████▏ | 5178/10000 [1:09:25<51:20, 1.57it/s, loss=0.0064, lr=1.40e-05, step=5177] Training: 52%|█████▏ | 5178/10000 [1:09:25<51:20, 1.57it/s, loss=0.0089, lr=1.40e-05, step=5178] Training: 52%|█████▏ | 5179/10000 [1:09:25<54:33, 1.47it/s, loss=0.0089, lr=1.40e-05, step=5178] Training: 52%|█████▏ | 5179/10000 [1:09:25<54:33, 1.47it/s, loss=0.0050, lr=1.40e-05, step=5179]19:53:58.286 [I] step=5180 loss=0.0067 smoothed_loss=0.0130 lr=1.40e-05 grad_norm=0.4678 step_time=0.5520s data_time=0.0681s it/s=1.613 eta_to_10000=2988.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0076 grad_action_out_proj_arms=0.0843 grad_arm_token_fuse=0.0390 grad_shared_expert=0.3995 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5180/10000 [1:09:26<50:56, 1.58it/s, loss=0.0050, lr=1.40e-05, step=5179] Training: 52%|█████▏ | 5180/10000 [1:09:26<50:56, 1.58it/s, loss=0.0067, lr=1.40e-05, step=5180] Training: 52%|█████▏ | 5181/10000 [1:09:26<47:29, 1.69it/s, loss=0.0067, lr=1.40e-05, step=5180] Training: 52%|█████▏ | 5181/10000 [1:09:26<47:29, 1.69it/s, loss=0.0067, lr=1.40e-05, step=5181] Training: 52%|█████▏ | 5182/10000 [1:09:27<44:57, 1.79it/s, loss=0.0067, lr=1.40e-05, step=5181] Training: 52%|█████▏ | 5182/10000 [1:09:27<44:57, 1.79it/s, loss=0.0210, lr=1.40e-05, step=5182] Training: 52%|█████▏ | 5183/10000 [1:09:27<44:00, 1.82it/s, loss=0.0210, lr=1.40e-05, step=5182] Training: 52%|█████▏ | 5183/10000 [1:09:27<44:00, 1.82it/s, loss=0.0027, lr=1.40e-05, step=5183] Training: 52%|█████▏ | 5184/10000 [1:09:28<42:22, 1.89it/s, loss=0.0027, lr=1.40e-05, step=5183] Training: 52%|█████▏ | 5184/10000 [1:09:28<42:22, 1.89it/s, loss=0.0053, lr=1.40e-05, step=5184] Training: 52%|█████▏ | 5185/10000 [1:09:29<45:21, 1.77it/s, loss=0.0053, lr=1.40e-05, step=5184] Training: 52%|█████▏ | 5185/10000 [1:09:29<45:21, 1.77it/s, loss=0.0043, lr=1.40e-05, step=5185] Training: 52%|█████▏ | 5186/10000 [1:09:29<49:21, 1.63it/s, loss=0.0043, lr=1.40e-05, step=5185] Training: 52%|█████▏ | 5186/10000 [1:09:29<49:21, 1.63it/s, loss=0.0098, lr=1.40e-05, step=5186] Training: 52%|█████▏ | 5187/10000 [1:09:30<46:05, 1.74it/s, loss=0.0098, lr=1.40e-05, step=5186] Training: 52%|█████▏ | 5187/10000 [1:09:30<46:05, 1.74it/s, loss=0.0092, lr=1.40e-05, step=5187] Training: 52%|█████▏ | 5188/10000 [1:09:30<44:14, 1.81it/s, loss=0.0092, lr=1.40e-05, step=5187] Training: 52%|█████▏ | 5188/10000 [1:09:30<44:14, 1.81it/s, loss=0.0082, lr=1.40e-05, step=5188] Training: 52%|█████▏ | 5189/10000 [1:09:31<42:41, 1.88it/s, loss=0.0082, lr=1.40e-05, step=5188] Training: 52%|█████▏ | 5189/10000 [1:09:31<42:41, 1.88it/s, loss=0.0113, lr=1.40e-05, step=5189]19:54:03.664 [I] step=5190 loss=0.0055 smoothed_loss=0.0099 lr=1.40e-05 grad_norm=0.4773 step_time=0.4759s data_time=0.0619s it/s=1.860 eta_to_10000=2586.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.1129 grad_arm_token_fuse=0.0587 grad_shared_expert=0.5580 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5190/10000 [1:09:31<43:06, 1.86it/s, loss=0.0113, lr=1.40e-05, step=5189] Training: 52%|█████▏ | 5190/10000 [1:09:31<43:06, 1.86it/s, loss=0.0055, lr=1.40e-05, step=5190] Training: 52%|█████▏ | 5191/10000 [1:09:32<45:36, 1.76it/s, loss=0.0055, lr=1.40e-05, step=5190] Training: 52%|█████▏ | 5191/10000 [1:09:32<45:36, 1.76it/s, loss=0.0139, lr=1.40e-05, step=5191] Training: 52%|█████▏ | 5192/10000 [1:09:32<43:38, 1.84it/s, loss=0.0139, lr=1.40e-05, step=5191] Training: 52%|█████▏ | 5192/10000 [1:09:32<43:38, 1.84it/s, loss=0.0139, lr=1.40e-05, step=5192] Training: 52%|█████▏ | 5193/10000 [1:09:33<49:26, 1.62it/s, loss=0.0139, lr=1.40e-05, step=5192] Training: 52%|█████▏ | 5193/10000 [1:09:33<49:26, 1.62it/s, loss=0.0014, lr=1.40e-05, step=5193] Training: 52%|█████▏ | 5194/10000 [1:09:34<49:05, 1.63it/s, loss=0.0014, lr=1.40e-05, step=5193] Training: 52%|█████▏ | 5194/10000 [1:09:34<49:05, 1.63it/s, loss=0.0043, lr=1.40e-05, step=5194] Training: 52%|█████▏ | 5195/10000 [1:09:34<45:54, 1.74it/s, loss=0.0043, lr=1.40e-05, step=5194] Training: 52%|█████▏ | 5195/10000 [1:09:34<45:54, 1.74it/s, loss=0.0083, lr=1.40e-05, step=5195] Training: 52%|█████▏ | 5196/10000 [1:09:35<44:34, 1.80it/s, loss=0.0083, lr=1.40e-05, step=5195] Training: 52%|█████▏ | 5196/10000 [1:09:35<44:34, 1.80it/s, loss=0.0346, lr=1.40e-05, step=5196] Training: 52%|█████▏ | 5197/10000 [1:09:35<42:54, 1.87it/s, loss=0.0346, lr=1.40e-05, step=5196] Training: 52%|█████▏ | 5197/10000 [1:09:35<42:54, 1.87it/s, loss=0.0316, lr=1.40e-05, step=5197] Training: 52%|█████▏ | 5198/10000 [1:09:36<46:00, 1.74it/s, loss=0.0316, lr=1.40e-05, step=5197] Training: 52%|█████▏ | 5198/10000 [1:09:36<46:00, 1.74it/s, loss=0.0159, lr=1.39e-05, step=5198] Training: 52%|█████▏ | 5199/10000 [1:09:37<44:43, 1.79it/s, loss=0.0159, lr=1.39e-05, step=5198] Training: 52%|█████▏ | 5199/10000 [1:09:37<44:43, 1.79it/s, loss=0.0132, lr=1.39e-05, step=5199]19:54:09.604 [I] step=5200 loss=0.0138 smoothed_loss=0.0138 lr=1.40e-05 grad_norm=0.4301 step_time=0.5225s data_time=0.0715s it/s=1.684 eta_to_10000=2850.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0069 grad_action_out_proj_arms=0.0856 grad_arm_token_fuse=0.0348 grad_shared_expert=0.5473 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5200/10000 [1:09:37<49:12, 1.63it/s, loss=0.0132, lr=1.39e-05, step=5199] Training: 52%|█████▏ | 5200/10000 [1:09:37<49:12, 1.63it/s, loss=0.0138, lr=1.39e-05, step=5200] Training: 52%|█████▏ | 5201/10000 [1:09:38<46:06, 1.73it/s, loss=0.0138, lr=1.39e-05, step=5200] Training: 52%|█████▏ | 5201/10000 [1:09:38<46:06, 1.73it/s, loss=0.0019, lr=1.39e-05, step=5201] Training: 52%|█████▏ | 5202/10000 [1:09:38<44:31, 1.80it/s, loss=0.0019, lr=1.39e-05, step=5201] Training: 52%|█████▏ | 5202/10000 [1:09:38<44:31, 1.80it/s, loss=0.0028, lr=1.39e-05, step=5202] Training: 52%|█████▏ | 5203/10000 [1:09:39<42:40, 1.87it/s, loss=0.0028, lr=1.39e-05, step=5202] Training: 52%|█████▏ | 5203/10000 [1:09:39<42:40, 1.87it/s, loss=0.0030, lr=1.39e-05, step=5203] Training: 52%|█████▏ | 5204/10000 [1:09:39<41:27, 1.93it/s, loss=0.0030, lr=1.39e-05, step=5203] Training: 52%|█████▏ | 5204/10000 [1:09:39<41:27, 1.93it/s, loss=0.0117, lr=1.39e-05, step=5204] Training: 52%|█████▏ | 5205/10000 [1:09:40<44:24, 1.80it/s, loss=0.0117, lr=1.39e-05, step=5204] Training: 52%|█████▏ | 5205/10000 [1:09:40<44:24, 1.80it/s, loss=0.0035, lr=1.39e-05, step=5205] Training: 52%|█████▏ | 5206/10000 [1:09:40<42:27, 1.88it/s, loss=0.0035, lr=1.39e-05, step=5205] Training: 52%|█████▏ | 5206/10000 [1:09:40<42:27, 1.88it/s, loss=0.0084, lr=1.39e-05, step=5206] Training: 52%|█████▏ | 5207/10000 [1:09:41<47:12, 1.69it/s, loss=0.0084, lr=1.39e-05, step=5206] Training: 52%|█████▏ | 5207/10000 [1:09:41<47:12, 1.69it/s, loss=0.0043, lr=1.39e-05, step=5207] Training: 52%|█████▏ | 5208/10000 [1:09:42<47:51, 1.67it/s, loss=0.0043, lr=1.39e-05, step=5207] Training: 52%|█████▏ | 5208/10000 [1:09:42<47:51, 1.67it/s, loss=0.0089, lr=1.39e-05, step=5208] Training: 52%|█████▏ | 5209/10000 [1:09:42<45:02, 1.77it/s, loss=0.0089, lr=1.39e-05, step=5208] Training: 52%|█████▏ | 5209/10000 [1:09:42<45:02, 1.77it/s, loss=0.0277, lr=1.39e-05, step=5209]19:54:15.042 [I] step=5210 loss=0.0247 smoothed_loss=0.0125 lr=1.39e-05 grad_norm=0.4910 step_time=0.4777s data_time=0.0661s it/s=1.839 eta_to_10000=2604.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0329 grad_action_out_proj_arms=0.1662 grad_arm_token_fuse=0.1724 grad_shared_expert=0.4763 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5210/10000 [1:09:43<44:12, 1.81it/s, loss=0.0277, lr=1.39e-05, step=5209] Training: 52%|█████▏ | 5210/10000 [1:09:43<44:12, 1.81it/s, loss=0.0247, lr=1.39e-05, step=5210] Training: 52%|█████▏ | 5211/10000 [1:09:43<43:01, 1.86it/s, loss=0.0247, lr=1.39e-05, step=5210] Training: 52%|█████▏ | 5211/10000 [1:09:43<43:01, 1.86it/s, loss=0.0171, lr=1.39e-05, step=5211] Training: 52%|█████▏ | 5212/10000 [1:09:44<45:45, 1.74it/s, loss=0.0171, lr=1.39e-05, step=5211] Training: 52%|█████▏ | 5212/10000 [1:09:44<45:45, 1.74it/s, loss=0.0118, lr=1.39e-05, step=5212] Training: 52%|█████▏ | 5213/10000 [1:09:44<43:41, 1.83it/s, loss=0.0118, lr=1.39e-05, step=5212] Training: 52%|█████▏ | 5213/10000 [1:09:44<43:41, 1.83it/s, loss=0.0029, lr=1.39e-05, step=5213] Training: 52%|█████▏ | 5214/10000 [1:09:45<47:06, 1.69it/s, loss=0.0029, lr=1.39e-05, step=5213] Training: 52%|█████▏ | 5214/10000 [1:09:45<47:06, 1.69it/s, loss=0.0317, lr=1.39e-05, step=5214] Training: 52%|█████▏ | 5215/10000 [1:09:46<50:49, 1.57it/s, loss=0.0317, lr=1.39e-05, step=5214] Training: 52%|█████▏ | 5215/10000 [1:09:46<50:49, 1.57it/s, loss=0.0168, lr=1.39e-05, step=5215] Training: 52%|█████▏ | 5216/10000 [1:09:46<47:25, 1.68it/s, loss=0.0168, lr=1.39e-05, step=5215] Training: 52%|█████▏ | 5216/10000 [1:09:46<47:25, 1.68it/s, loss=0.0024, lr=1.39e-05, step=5216] Training: 52%|█████▏ | 5217/10000 [1:09:47<44:55, 1.77it/s, loss=0.0024, lr=1.39e-05, step=5216] Training: 52%|█████▏ | 5217/10000 [1:09:47<44:55, 1.77it/s, loss=0.0481, lr=1.39e-05, step=5217] Training: 52%|█████▏ | 5218/10000 [1:09:47<44:15, 1.80it/s, loss=0.0481, lr=1.39e-05, step=5217] Training: 52%|█████▏ | 5218/10000 [1:09:47<44:15, 1.80it/s, loss=0.0213, lr=1.39e-05, step=5218] Training: 52%|█████▏ | 5219/10000 [1:09:48<47:23, 1.68it/s, loss=0.0213, lr=1.39e-05, step=5218] Training: 52%|█████▏ | 5219/10000 [1:09:48<47:23, 1.68it/s, loss=0.0535, lr=1.39e-05, step=5219]19:54:20.864 [I] step=5220 loss=0.0125 smoothed_loss=0.0198 lr=1.39e-05 grad_norm=0.4233 step_time=0.5180s data_time=0.0642s it/s=1.718 eta_to_10000=2782.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0347 grad_action_out_proj_arms=0.1666 grad_arm_token_fuse=0.1804 grad_shared_expert=0.5635 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5220/10000 [1:09:49<45:49, 1.74it/s, loss=0.0535, lr=1.39e-05, step=5219] Training: 52%|█████▏ | 5220/10000 [1:09:49<45:49, 1.74it/s, loss=0.0125, lr=1.39e-05, step=5220] Training: 52%|█████▏ | 5221/10000 [1:09:49<48:10, 1.65it/s, loss=0.0125, lr=1.39e-05, step=5220] Training: 52%|█████▏ | 5221/10000 [1:09:49<48:10, 1.65it/s, loss=0.0107, lr=1.39e-05, step=5221] Training: 52%|█████▏ | 5222/10000 [1:09:50<53:20, 1.49it/s, loss=0.0107, lr=1.39e-05, step=5221] Training: 52%|█████▏ | 5222/10000 [1:09:50<53:20, 1.49it/s, loss=0.0212, lr=1.39e-05, step=5222] Training: 52%|█████▏ | 5223/10000 [1:09:51<50:04, 1.59it/s, loss=0.0212, lr=1.39e-05, step=5222] Training: 52%|█████▏ | 5223/10000 [1:09:51<50:04, 1.59it/s, loss=0.0071, lr=1.39e-05, step=5223] Training: 52%|█████▏ | 5224/10000 [1:09:51<46:44, 1.70it/s, loss=0.0071, lr=1.39e-05, step=5223] Training: 52%|█████▏ | 5224/10000 [1:09:51<46:44, 1.70it/s, loss=0.0113, lr=1.39e-05, step=5224] Training: 52%|█████▏ | 5225/10000 [1:09:52<44:15, 1.80it/s, loss=0.0113, lr=1.39e-05, step=5224] Training: 52%|█████▏ | 5225/10000 [1:09:52<44:15, 1.80it/s, loss=0.0082, lr=1.38e-05, step=5225] Training: 52%|█████▏ | 5226/10000 [1:09:52<46:50, 1.70it/s, loss=0.0082, lr=1.38e-05, step=5225] Training: 52%|█████▏ | 5226/10000 [1:09:52<46:50, 1.70it/s, loss=0.0103, lr=1.38e-05, step=5226] Training: 52%|█████▏ | 5227/10000 [1:09:53<44:36, 1.78it/s, loss=0.0103, lr=1.38e-05, step=5226] Training: 52%|█████▏ | 5227/10000 [1:09:53<44:36, 1.78it/s, loss=0.0092, lr=1.38e-05, step=5227] Training: 52%|█████▏ | 5228/10000 [1:09:53<47:22, 1.68it/s, loss=0.0092, lr=1.38e-05, step=5227] Training: 52%|█████▏ | 5228/10000 [1:09:53<47:22, 1.68it/s, loss=0.0044, lr=1.38e-05, step=5228] Training: 52%|█████▏ | 5229/10000 [1:09:54<50:37, 1.57it/s, loss=0.0044, lr=1.38e-05, step=5228] Training: 52%|█████▏ | 5229/10000 [1:09:54<50:37, 1.57it/s, loss=0.0110, lr=1.38e-05, step=5229]19:54:26.958 [I] step=5230 loss=0.0303 smoothed_loss=0.0154 lr=1.38e-05 grad_norm=0.4487 step_time=0.5466s data_time=0.0628s it/s=1.641 eta_to_10000=2906.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0161 grad_action_out_proj_arms=0.1727 grad_arm_token_fuse=0.0858 grad_shared_expert=0.3647 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5230/10000 [1:09:55<47:53, 1.66it/s, loss=0.0110, lr=1.38e-05, step=5229] Training: 52%|█████▏ | 5230/10000 [1:09:55<47:53, 1.66it/s, loss=0.0303, lr=1.38e-05, step=5230] Training: 52%|█████▏ | 5231/10000 [1:09:55<45:42, 1.74it/s, loss=0.0303, lr=1.38e-05, step=5230] Training: 52%|█████▏ | 5231/10000 [1:09:55<45:42, 1.74it/s, loss=0.0133, lr=1.38e-05, step=5231] Training: 52%|█████▏ | 5232/10000 [1:09:56<43:50, 1.81it/s, loss=0.0133, lr=1.38e-05, step=5231] Training: 52%|█████▏ | 5232/10000 [1:09:56<43:50, 1.81it/s, loss=0.0270, lr=1.38e-05, step=5232] Training: 52%|█████▏ | 5233/10000 [1:09:56<46:25, 1.71it/s, loss=0.0270, lr=1.38e-05, step=5232] Training: 52%|█████▏ | 5233/10000 [1:09:56<46:25, 1.71it/s, loss=0.0063, lr=1.38e-05, step=5233] Training: 52%|█████▏ | 5234/10000 [1:09:57<43:49, 1.81it/s, loss=0.0063, lr=1.38e-05, step=5233] Training: 52%|█████▏ | 5234/10000 [1:09:57<43:49, 1.81it/s, loss=0.0074, lr=1.38e-05, step=5234] Training: 52%|█████▏ | 5235/10000 [1:09:57<42:09, 1.88it/s, loss=0.0074, lr=1.38e-05, step=5234] Training: 52%|█████▏ | 5235/10000 [1:09:57<42:09, 1.88it/s, loss=0.0055, lr=1.38e-05, step=5235] Training: 52%|█████▏ | 5236/10000 [1:09:58<47:15, 1.68it/s, loss=0.0055, lr=1.38e-05, step=5235] Training: 52%|█████▏ | 5236/10000 [1:09:58<47:15, 1.68it/s, loss=0.0064, lr=1.38e-05, step=5236] Training: 52%|█████▏ | 5237/10000 [1:09:58<44:41, 1.78it/s, loss=0.0064, lr=1.38e-05, step=5236] Training: 52%|█████▏ | 5237/10000 [1:09:58<44:41, 1.78it/s, loss=0.0172, lr=1.38e-05, step=5237] Training: 52%|█████▏ | 5238/10000 [1:09:59<42:52, 1.85it/s, loss=0.0172, lr=1.38e-05, step=5237] Training: 52%|█████▏ | 5238/10000 [1:09:59<42:52, 1.85it/s, loss=0.0040, lr=1.38e-05, step=5238] Training: 52%|█████▏ | 5239/10000 [1:09:59<42:24, 1.87it/s, loss=0.0040, lr=1.38e-05, step=5238] Training: 52%|█████▏ | 5239/10000 [1:09:59<42:24, 1.87it/s, loss=0.0029, lr=1.38e-05, step=5239]19:54:32.352 [I] step=5240 loss=0.0099 smoothed_loss=0.0113 lr=1.38e-05 grad_norm=0.4021 step_time=0.4772s data_time=0.0623s it/s=1.854 eta_to_10000=2567.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0118 grad_action_out_proj_arms=0.1208 grad_arm_token_fuse=0.0591 grad_shared_expert=0.6181 (18633:train_pytorch.py:850) + Training: 52%|█████▏ | 5240/10000 [1:10:00<42:11, 1.88it/s, loss=0.0029, lr=1.38e-05, step=5239] Training: 52%|█████▏ | 5240/10000 [1:10:00<42:11, 1.88it/s, loss=0.0099, lr=1.38e-05, step=5240] Training: 52%|█████▏ | 5241/10000 [1:10:01<45:41, 1.74it/s, loss=0.0099, lr=1.38e-05, step=5240] Training: 52%|█████▏ | 5241/10000 [1:10:01<45:41, 1.74it/s, loss=0.0351, lr=1.38e-05, step=5241] Training: 52%|█████▏ | 5242/10000 [1:10:01<44:40, 1.78it/s, loss=0.0351, lr=1.38e-05, step=5241] Training: 52%|█████▏ | 5242/10000 [1:10:01<44:40, 1.78it/s, loss=0.0628, lr=1.38e-05, step=5242] Training: 52%|█████▏ | 5243/10000 [1:10:02<49:15, 1.61it/s, loss=0.0628, lr=1.38e-05, step=5242] Training: 52%|█████▏ | 5243/10000 [1:10:02<49:15, 1.61it/s, loss=0.0260, lr=1.38e-05, step=5243] Training: 52%|█████▏ | 5244/10000 [1:10:02<45:51, 1.73it/s, loss=0.0260, lr=1.38e-05, step=5243] Training: 52%|█████▏ | 5244/10000 [1:10:02<45:51, 1.73it/s, loss=0.0193, lr=1.38e-05, step=5244] Training: 52%|█████▏ | 5245/10000 [1:10:03<43:56, 1.80it/s, loss=0.0193, lr=1.38e-05, step=5244] Training: 52%|█████▏ | 5245/10000 [1:10:03<43:56, 1.80it/s, loss=0.0200, lr=1.38e-05, step=5245] Training: 52%|█████▏ | 5246/10000 [1:10:03<42:34, 1.86it/s, loss=0.0200, lr=1.38e-05, step=5245] Training: 52%|█████▏ | 5246/10000 [1:10:03<42:34, 1.86it/s, loss=0.0123, lr=1.38e-05, step=5246] Training: 52%|█████▏ | 5247/10000 [1:10:04<42:09, 1.88it/s, loss=0.0123, lr=1.38e-05, step=5246] Training: 52%|█████▏ | 5247/10000 [1:10:04<42:09, 1.88it/s, loss=0.0292, lr=1.38e-05, step=5247] Training: 52%|█████▏ | 5248/10000 [1:10:04<41:19, 1.92it/s, loss=0.0292, lr=1.38e-05, step=5247] Training: 52%|█████▏ | 5248/10000 [1:10:04<41:19, 1.92it/s, loss=0.0700, lr=1.38e-05, step=5248] Training: 52%|█████▏ | 5249/10000 [1:10:05<44:54, 1.76it/s, loss=0.0700, lr=1.38e-05, step=5248] Training: 52%|█████▏ | 5249/10000 [1:10:05<44:54, 1.76it/s, loss=0.0040, lr=1.38e-05, step=5249]19:54:38.258 [I] step=5250 loss=0.0137 smoothed_loss=0.0218 lr=1.38e-05 grad_norm=0.5252 step_time=0.5215s data_time=0.0690s it/s=1.694 eta_to_10000=2804.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.1670 grad_arm_token_fuse=0.0788 grad_shared_expert=0.5076 (18633:train_pytorch.py:850) + Training: 52%|█████▎ | 5250/10000 [1:10:06<49:44, 1.59it/s, loss=0.0040, lr=1.38e-05, step=5249] Training: 52%|█████▎ | 5250/10000 [1:10:06<49:44, 1.59it/s, loss=0.0137, lr=1.38e-05, step=5250] Training: 53%|█████▎ | 5251/10000 [1:10:06<46:09, 1.71it/s, loss=0.0137, lr=1.38e-05, step=5250] Training: 53%|█████▎ | 5251/10000 [1:10:06<46:09, 1.71it/s, loss=0.0093, lr=1.38e-05, step=5251] Training: 53%|█████▎ | 5252/10000 [1:10:07<44:09, 1.79it/s, loss=0.0093, lr=1.38e-05, step=5251] Training: 53%|█████▎ | 5252/10000 [1:10:07<44:09, 1.79it/s, loss=0.0126, lr=1.37e-05, step=5252] Training: 53%|█████▎ | 5253/10000 [1:10:07<43:15, 1.83it/s, loss=0.0126, lr=1.37e-05, step=5252] Training: 53%|█████▎ | 5253/10000 [1:10:07<43:15, 1.83it/s, loss=0.0064, lr=1.37e-05, step=5253] Training: 53%|█████▎ | 5254/10000 [1:10:08<41:36, 1.90it/s, loss=0.0064, lr=1.37e-05, step=5253] Training: 53%|█████▎ | 5254/10000 [1:10:08<41:36, 1.90it/s, loss=0.0062, lr=1.37e-05, step=5254] Training: 53%|█████▎ | 5255/10000 [1:10:08<42:05, 1.88it/s, loss=0.0062, lr=1.37e-05, step=5254] Training: 53%|█████▎ | 5255/10000 [1:10:08<42:05, 1.88it/s, loss=0.0139, lr=1.37e-05, step=5255] Training: 53%|█████▎ | 5256/10000 [1:10:09<45:18, 1.75it/s, loss=0.0139, lr=1.37e-05, step=5255] Training: 53%|█████▎ | 5256/10000 [1:10:09<45:18, 1.75it/s, loss=0.0092, lr=1.37e-05, step=5256] Training: 53%|█████▎ | 5257/10000 [1:10:10<49:32, 1.60it/s, loss=0.0092, lr=1.37e-05, step=5256] Training: 53%|█████▎ | 5257/10000 [1:10:10<49:32, 1.60it/s, loss=0.0744, lr=1.37e-05, step=5257] Training: 53%|█████▎ | 5258/10000 [1:10:10<46:42, 1.69it/s, loss=0.0744, lr=1.37e-05, step=5257] Training: 53%|█████▎ | 5258/10000 [1:10:10<46:42, 1.69it/s, loss=0.0158, lr=1.37e-05, step=5258] Training: 53%|█████▎ | 5259/10000 [1:10:11<44:26, 1.78it/s, loss=0.0158, lr=1.37e-05, step=5258] Training: 53%|█████▎ | 5259/10000 [1:10:11<44:26, 1.78it/s, loss=0.0099, lr=1.37e-05, step=5259]19:54:43.732 [I] step=5260 loss=0.0012 smoothed_loss=0.0183 lr=1.37e-05 grad_norm=0.4492 step_time=0.4774s data_time=0.0700s it/s=1.827 eta_to_10000=2594.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0257 grad_action_out_proj_arms=0.1336 grad_arm_token_fuse=0.1339 grad_shared_expert=0.3515 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5260/10000 [1:10:11<43:38, 1.81it/s, loss=0.0099, lr=1.37e-05, step=5259] Training: 53%|█████▎ | 5260/10000 [1:10:11<43:38, 1.81it/s, loss=0.0012, lr=1.37e-05, step=5260] Training: 53%|█████▎ | 5261/10000 [1:10:12<42:15, 1.87it/s, loss=0.0012, lr=1.37e-05, step=5260] Training: 53%|█████▎ | 5261/10000 [1:10:12<42:15, 1.87it/s, loss=0.0110, lr=1.37e-05, step=5261] Training: 53%|█████▎ | 5262/10000 [1:10:12<41:36, 1.90it/s, loss=0.0110, lr=1.37e-05, step=5261] Training: 53%|█████▎ | 5262/10000 [1:10:12<41:36, 1.90it/s, loss=0.0113, lr=1.37e-05, step=5262] Training: 53%|█████▎ | 5263/10000 [1:10:13<44:23, 1.78it/s, loss=0.0113, lr=1.37e-05, step=5262] Training: 53%|█████▎ | 5263/10000 [1:10:13<44:23, 1.78it/s, loss=0.0906, lr=1.37e-05, step=5263] Training: 53%|█████▎ | 5264/10000 [1:10:14<47:03, 1.68it/s, loss=0.0906, lr=1.37e-05, step=5263] Training: 53%|█████▎ | 5264/10000 [1:10:14<47:03, 1.68it/s, loss=0.0056, lr=1.37e-05, step=5264] Training: 53%|█████▎ | 5265/10000 [1:10:14<50:13, 1.57it/s, loss=0.0056, lr=1.37e-05, step=5264] Training: 53%|█████▎ | 5265/10000 [1:10:14<50:13, 1.57it/s, loss=0.0049, lr=1.37e-05, step=5265] Training: 53%|█████▎ | 5266/10000 [1:10:15<47:19, 1.67it/s, loss=0.0049, lr=1.37e-05, step=5265] Training: 53%|█████▎ | 5266/10000 [1:10:15<47:19, 1.67it/s, loss=0.0106, lr=1.37e-05, step=5266] Training: 53%|█████▎ | 5267/10000 [1:10:15<44:31, 1.77it/s, loss=0.0106, lr=1.37e-05, step=5266] Training: 53%|█████▎ | 5267/10000 [1:10:15<44:31, 1.77it/s, loss=0.0093, lr=1.37e-05, step=5267] Training: 53%|█████▎ | 5268/10000 [1:10:16<43:04, 1.83it/s, loss=0.0093, lr=1.37e-05, step=5267] Training: 53%|█████▎ | 5268/10000 [1:10:16<43:04, 1.83it/s, loss=0.0320, lr=1.37e-05, step=5268] Training: 53%|█████▎ | 5269/10000 [1:10:16<42:03, 1.87it/s, loss=0.0320, lr=1.37e-05, step=5268] Training: 53%|█████▎ | 5269/10000 [1:10:16<42:03, 1.87it/s, loss=0.0038, lr=1.37e-05, step=5269]19:54:49.323 [I] step=5270 loss=0.0155 smoothed_loss=0.0181 lr=1.37e-05 grad_norm=0.4671 step_time=0.4938s data_time=0.0653s it/s=1.789 eta_to_10000=2644.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0098 grad_action_out_proj_arms=0.0970 grad_arm_token_fuse=0.0519 grad_shared_expert=0.4813 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5270/10000 [1:10:17<42:05, 1.87it/s, loss=0.0038, lr=1.37e-05, step=5269] Training: 53%|█████▎ | 5270/10000 [1:10:17<42:05, 1.87it/s, loss=0.0155, lr=1.37e-05, step=5270] Training: 53%|█████▎ | 5271/10000 [1:10:18<45:40, 1.73it/s, loss=0.0155, lr=1.37e-05, step=5270] Training: 53%|█████▎ | 5271/10000 [1:10:18<45:40, 1.73it/s, loss=0.0117, lr=1.37e-05, step=5271] Training: 53%|█████▎ | 5272/10000 [1:10:18<49:55, 1.58it/s, loss=0.0117, lr=1.37e-05, step=5271] Training: 53%|█████▎ | 5272/10000 [1:10:18<49:55, 1.58it/s, loss=0.0069, lr=1.37e-05, step=5272] Training: 53%|█████▎ | 5273/10000 [1:10:19<46:20, 1.70it/s, loss=0.0069, lr=1.37e-05, step=5272] Training: 53%|█████▎ | 5273/10000 [1:10:19<46:20, 1.70it/s, loss=0.0105, lr=1.37e-05, step=5273] Training: 53%|█████▎ | 5274/10000 [1:10:19<43:56, 1.79it/s, loss=0.0105, lr=1.37e-05, step=5273] Training: 53%|█████▎ | 5274/10000 [1:10:19<43:56, 1.79it/s, loss=0.0202, lr=1.37e-05, step=5274] Training: 53%|█████▎ | 5275/10000 [1:10:20<42:36, 1.85it/s, loss=0.0202, lr=1.37e-05, step=5274] Training: 53%|█████▎ | 5275/10000 [1:10:20<42:36, 1.85it/s, loss=0.0025, lr=1.37e-05, step=5275] Training: 53%|█████▎ | 5276/10000 [1:10:20<41:40, 1.89it/s, loss=0.0025, lr=1.37e-05, step=5275] Training: 53%|█████▎ | 5276/10000 [1:10:20<41:40, 1.89it/s, loss=0.0059, lr=1.37e-05, step=5276] Training: 53%|█████▎ | 5277/10000 [1:10:21<41:44, 1.89it/s, loss=0.0059, lr=1.37e-05, step=5276] Training: 53%|█████▎ | 5277/10000 [1:10:21<41:44, 1.89it/s, loss=0.0027, lr=1.37e-05, step=5277] Training: 53%|█████▎ | 5278/10000 [1:10:22<45:38, 1.72it/s, loss=0.0027, lr=1.37e-05, step=5277] Training: 53%|█████▎ | 5278/10000 [1:10:22<45:38, 1.72it/s, loss=0.0036, lr=1.36e-05, step=5278] Training: 53%|█████▎ | 5279/10000 [1:10:22<48:50, 1.61it/s, loss=0.0036, lr=1.36e-05, step=5278] Training: 53%|█████▎ | 5279/10000 [1:10:22<48:50, 1.61it/s, loss=0.0187, lr=1.36e-05, step=5279]19:54:55.226 [I] step=5280 loss=0.0281 smoothed_loss=0.0141 lr=1.37e-05 grad_norm=0.4226 step_time=0.5228s data_time=0.0675s it/s=1.695 eta_to_10000=2785.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0252 grad_action_out_proj_arms=0.2255 grad_arm_token_fuse=0.1299 grad_shared_expert=0.6235 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5280/10000 [1:10:23<46:54, 1.68it/s, loss=0.0187, lr=1.36e-05, step=5279] Training: 53%|█████▎ | 5280/10000 [1:10:23<46:54, 1.68it/s, loss=0.0281, lr=1.36e-05, step=5280] Training: 53%|█████▎ | 5281/10000 [1:10:23<44:46, 1.76it/s, loss=0.0281, lr=1.36e-05, step=5280] Training: 53%|█████▎ | 5281/10000 [1:10:23<44:46, 1.76it/s, loss=0.0161, lr=1.36e-05, step=5281] Training: 53%|█████▎ | 5282/10000 [1:10:24<42:51, 1.83it/s, loss=0.0161, lr=1.36e-05, step=5281] Training: 53%|█████▎ | 5282/10000 [1:10:24<42:51, 1.83it/s, loss=0.0057, lr=1.36e-05, step=5282] Training: 53%|█████▎ | 5283/10000 [1:10:24<42:09, 1.87it/s, loss=0.0057, lr=1.36e-05, step=5282] Training: 53%|█████▎ | 5283/10000 [1:10:24<42:09, 1.87it/s, loss=0.0108, lr=1.36e-05, step=5283] Training: 53%|█████▎ | 5284/10000 [1:10:25<41:15, 1.91it/s, loss=0.0108, lr=1.36e-05, step=5283] Training: 53%|█████▎ | 5284/10000 [1:10:25<41:15, 1.91it/s, loss=0.0086, lr=1.36e-05, step=5284] Training: 53%|█████▎ | 5285/10000 [1:10:25<40:42, 1.93it/s, loss=0.0086, lr=1.36e-05, step=5284] Training: 53%|█████▎ | 5285/10000 [1:10:25<40:42, 1.93it/s, loss=0.0139, lr=1.36e-05, step=5285] Training: 53%|█████▎ | 5286/10000 [1:10:26<46:15, 1.70it/s, loss=0.0139, lr=1.36e-05, step=5285] Training: 53%|█████▎ | 5286/10000 [1:10:26<46:15, 1.70it/s, loss=0.0149, lr=1.36e-05, step=5286] Training: 53%|█████▎ | 5287/10000 [1:10:27<45:25, 1.73it/s, loss=0.0149, lr=1.36e-05, step=5286] Training: 53%|█████▎ | 5287/10000 [1:10:27<45:25, 1.73it/s, loss=0.0089, lr=1.36e-05, step=5287] Training: 53%|█████▎ | 5288/10000 [1:10:27<44:48, 1.75it/s, loss=0.0089, lr=1.36e-05, step=5287] Training: 53%|█████▎ | 5288/10000 [1:10:27<44:48, 1.75it/s, loss=0.0047, lr=1.36e-05, step=5288] Training: 53%|█████▎ | 5289/10000 [1:10:28<44:29, 1.76it/s, loss=0.0047, lr=1.36e-05, step=5288] Training: 53%|█████▎ | 5289/10000 [1:10:28<44:29, 1.76it/s, loss=0.0083, lr=1.36e-05, step=5289]19:55:00.745 [I] step=5290 loss=0.0032 smoothed_loss=0.0107 lr=1.36e-05 grad_norm=0.4777 step_time=0.4884s data_time=0.0637s it/s=1.812 eta_to_10000=2598.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.1072 grad_arm_token_fuse=0.0478 grad_shared_expert=0.3140 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5290/10000 [1:10:28<45:01, 1.74it/s, loss=0.0083, lr=1.36e-05, step=5289] Training: 53%|█████▎ | 5290/10000 [1:10:28<45:01, 1.74it/s, loss=0.0032, lr=1.36e-05, step=5290] Training: 53%|█████▎ | 5291/10000 [1:10:29<44:09, 1.78it/s, loss=0.0032, lr=1.36e-05, step=5290] Training: 53%|█████▎ | 5291/10000 [1:10:29<44:09, 1.78it/s, loss=0.0047, lr=1.36e-05, step=5291] Training: 53%|█████▎ | 5292/10000 [1:10:29<42:19, 1.85it/s, loss=0.0047, lr=1.36e-05, step=5291] Training: 53%|█████▎ | 5292/10000 [1:10:29<42:19, 1.85it/s, loss=0.0057, lr=1.36e-05, step=5292] Training: 53%|█████▎ | 5293/10000 [1:10:30<47:11, 1.66it/s, loss=0.0057, lr=1.36e-05, step=5292] Training: 53%|█████▎ | 5293/10000 [1:10:30<47:11, 1.66it/s, loss=0.0021, lr=1.36e-05, step=5293] Training: 53%|█████▎ | 5294/10000 [1:10:31<44:31, 1.76it/s, loss=0.0021, lr=1.36e-05, step=5293] Training: 53%|█████▎ | 5294/10000 [1:10:31<44:31, 1.76it/s, loss=0.0072, lr=1.36e-05, step=5294] Training: 53%|█████▎ | 5295/10000 [1:10:31<43:16, 1.81it/s, loss=0.0072, lr=1.36e-05, step=5294] Training: 53%|█████▎ | 5295/10000 [1:10:31<43:16, 1.81it/s, loss=0.0126, lr=1.36e-05, step=5295] Training: 53%|█████▎ | 5296/10000 [1:10:32<42:25, 1.85it/s, loss=0.0126, lr=1.36e-05, step=5295] Training: 53%|█████▎ | 5296/10000 [1:10:32<42:25, 1.85it/s, loss=0.0181, lr=1.36e-05, step=5296] Training: 53%|█████▎ | 5297/10000 [1:10:32<41:43, 1.88it/s, loss=0.0181, lr=1.36e-05, step=5296] Training: 53%|█████▎ | 5297/10000 [1:10:32<41:43, 1.88it/s, loss=0.0026, lr=1.36e-05, step=5297] Training: 53%|█████▎ | 5298/10000 [1:10:33<41:08, 1.90it/s, loss=0.0026, lr=1.36e-05, step=5297] Training: 53%|█████▎ | 5298/10000 [1:10:33<41:08, 1.90it/s, loss=0.0189, lr=1.36e-05, step=5298] Training: 53%|█████▎ | 5299/10000 [1:10:33<42:08, 1.86it/s, loss=0.0189, lr=1.36e-05, step=5298] Training: 53%|█████▎ | 5299/10000 [1:10:33<42:08, 1.86it/s, loss=0.0093, lr=1.36e-05, step=5299]19:55:06.414 [I] step=5300 loss=0.0084 smoothed_loss=0.0100 lr=1.36e-05 grad_norm=0.4797 step_time=0.5001s data_time=0.0666s it/s=1.764 eta_to_10000=2664.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0234 grad_action_out_proj_arms=0.1639 grad_arm_token_fuse=0.1178 grad_shared_expert=0.5322 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5300/10000 [1:10:34<48:09, 1.63it/s, loss=0.0093, lr=1.36e-05, step=5299] Training: 53%|█████▎ | 5300/10000 [1:10:34<48:09, 1.63it/s, loss=0.0084, lr=1.36e-05, step=5300] Training: 53%|█████▎ | 5301/10000 [1:10:35<48:59, 1.60it/s, loss=0.0084, lr=1.36e-05, step=5300] Training: 53%|█████▎ | 5301/10000 [1:10:35<48:59, 1.60it/s, loss=0.0079, lr=1.36e-05, step=5301] Training: 53%|█████▎ | 5302/10000 [1:10:35<46:13, 1.69it/s, loss=0.0079, lr=1.36e-05, step=5301] Training: 53%|█████▎ | 5302/10000 [1:10:35<46:13, 1.69it/s, loss=0.0059, lr=1.36e-05, step=5302] Training: 53%|█████▎ | 5303/10000 [1:10:36<44:28, 1.76it/s, loss=0.0059, lr=1.36e-05, step=5302] Training: 53%|█████▎ | 5303/10000 [1:10:36<44:28, 1.76it/s, loss=0.0129, lr=1.36e-05, step=5303] Training: 53%|█████▎ | 5304/10000 [1:10:36<43:23, 1.80it/s, loss=0.0129, lr=1.36e-05, step=5303] Training: 53%|█████▎ | 5304/10000 [1:10:36<43:23, 1.80it/s, loss=0.0070, lr=1.36e-05, step=5304] Training: 53%|█████▎ | 5305/10000 [1:10:37<42:24, 1.85it/s, loss=0.0070, lr=1.36e-05, step=5304] Training: 53%|█████▎ | 5305/10000 [1:10:37<42:24, 1.85it/s, loss=0.0176, lr=1.35e-05, step=5305] Training: 53%|█████▎ | 5306/10000 [1:10:37<41:08, 1.90it/s, loss=0.0176, lr=1.35e-05, step=5305] Training: 53%|█████▎ | 5306/10000 [1:10:37<41:08, 1.90it/s, loss=0.0049, lr=1.35e-05, step=5306] Training: 53%|█████▎ | 5307/10000 [1:10:38<46:30, 1.68it/s, loss=0.0049, lr=1.35e-05, step=5306] Training: 53%|█████▎ | 5307/10000 [1:10:38<46:30, 1.68it/s, loss=0.0041, lr=1.35e-05, step=5307] Training: 53%|█████▎ | 5308/10000 [1:10:39<47:47, 1.64it/s, loss=0.0041, lr=1.35e-05, step=5307] Training: 53%|█████▎ | 5308/10000 [1:10:39<47:47, 1.64it/s, loss=0.0075, lr=1.35e-05, step=5308] Training: 53%|█████▎ | 5309/10000 [1:10:39<46:22, 1.69it/s, loss=0.0075, lr=1.35e-05, step=5308] Training: 53%|█████▎ | 5309/10000 [1:10:39<46:22, 1.69it/s, loss=0.0124, lr=1.35e-05, step=5309]19:55:12.171 [I] step=5310 loss=0.0046 smoothed_loss=0.0089 lr=1.35e-05 grad_norm=0.4482 step_time=0.5100s data_time=0.0658s it/s=1.738 eta_to_10000=2699.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0100 grad_action_out_proj_arms=0.0906 grad_arm_token_fuse=0.0507 grad_shared_expert=0.3776 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5310/10000 [1:10:40<46:33, 1.68it/s, loss=0.0124, lr=1.35e-05, step=5309] Training: 53%|█████▎ | 5310/10000 [1:10:40<46:33, 1.68it/s, loss=0.0046, lr=1.35e-05, step=5310] Training: 53%|█████▎ | 5311/10000 [1:10:40<44:22, 1.76it/s, loss=0.0046, lr=1.35e-05, step=5310] Training: 53%|█████▎ | 5311/10000 [1:10:40<44:22, 1.76it/s, loss=0.0112, lr=1.35e-05, step=5311] Training: 53%|█████▎ | 5312/10000 [1:10:41<42:54, 1.82it/s, loss=0.0112, lr=1.35e-05, step=5311] Training: 53%|█████▎ | 5312/10000 [1:10:41<42:54, 1.82it/s, loss=0.0388, lr=1.35e-05, step=5312] Training: 53%|█████▎ | 5313/10000 [1:10:41<41:47, 1.87it/s, loss=0.0388, lr=1.35e-05, step=5312] Training: 53%|█████▎ | 5313/10000 [1:10:41<41:47, 1.87it/s, loss=0.0103, lr=1.35e-05, step=5313] Training: 53%|█████▎ | 5314/10000 [1:10:42<44:52, 1.74it/s, loss=0.0103, lr=1.35e-05, step=5313] Training: 53%|█████▎ | 5314/10000 [1:10:42<44:52, 1.74it/s, loss=0.0336, lr=1.35e-05, step=5314] Training: 53%|█████▎ | 5315/10000 [1:10:43<48:48, 1.60it/s, loss=0.0336, lr=1.35e-05, step=5314] Training: 53%|█████▎ | 5315/10000 [1:10:43<48:48, 1.60it/s, loss=0.0076, lr=1.35e-05, step=5315] Training: 53%|█████▎ | 5316/10000 [1:10:43<45:41, 1.71it/s, loss=0.0076, lr=1.35e-05, step=5315] Training: 53%|█████▎ | 5316/10000 [1:10:43<45:41, 1.71it/s, loss=0.0022, lr=1.35e-05, step=5316] Training: 53%|█████▎ | 5317/10000 [1:10:44<43:34, 1.79it/s, loss=0.0022, lr=1.35e-05, step=5316] Training: 53%|█████▎ | 5317/10000 [1:10:44<43:34, 1.79it/s, loss=0.0096, lr=1.35e-05, step=5317] Training: 53%|█████▎ | 5318/10000 [1:10:44<42:05, 1.85it/s, loss=0.0096, lr=1.35e-05, step=5317] Training: 53%|█████▎ | 5318/10000 [1:10:44<42:05, 1.85it/s, loss=0.0155, lr=1.35e-05, step=5318] Training: 53%|█████▎ | 5319/10000 [1:10:45<41:24, 1.88it/s, loss=0.0155, lr=1.35e-05, step=5318] Training: 53%|█████▎ | 5319/10000 [1:10:45<41:24, 1.88it/s, loss=0.0238, lr=1.35e-05, step=5319]19:55:17.623 [I] step=5320 loss=0.0167 smoothed_loss=0.0138 lr=1.35e-05 grad_norm=0.5608 step_time=0.4762s data_time=0.0690s it/s=1.834 eta_to_10000=2551.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0163 grad_action_out_proj_arms=0.1290 grad_arm_token_fuse=0.0860 grad_shared_expert=0.6403 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5320/10000 [1:10:45<41:34, 1.88it/s, loss=0.0238, lr=1.35e-05, step=5319] Training: 53%|█████▎ | 5320/10000 [1:10:45<41:34, 1.88it/s, loss=0.0167, lr=1.35e-05, step=5320] Training: 53%|█████▎ | 5321/10000 [1:10:46<44:43, 1.74it/s, loss=0.0167, lr=1.35e-05, step=5320] Training: 53%|█████▎ | 5321/10000 [1:10:46<44:43, 1.74it/s, loss=0.0054, lr=1.35e-05, step=5321] Training: 53%|█████▎ | 5322/10000 [1:10:47<48:35, 1.60it/s, loss=0.0054, lr=1.35e-05, step=5321] Training: 53%|█████▎ | 5322/10000 [1:10:47<48:35, 1.60it/s, loss=0.0099, lr=1.35e-05, step=5322] Training: 53%|█████▎ | 5323/10000 [1:10:47<46:42, 1.67it/s, loss=0.0099, lr=1.35e-05, step=5322] Training: 53%|█████▎ | 5323/10000 [1:10:47<46:42, 1.67it/s, loss=0.0427, lr=1.35e-05, step=5323] Training: 53%|█████▎ | 5324/10000 [1:10:48<45:29, 1.71it/s, loss=0.0427, lr=1.35e-05, step=5323] Training: 53%|█████▎ | 5324/10000 [1:10:48<45:29, 1.71it/s, loss=0.0082, lr=1.35e-05, step=5324] Training: 53%|█████▎ | 5325/10000 [1:10:48<46:46, 1.67it/s, loss=0.0082, lr=1.35e-05, step=5324] Training: 53%|█████▎ | 5325/10000 [1:10:48<46:46, 1.67it/s, loss=0.0056, lr=1.35e-05, step=5325] Training: 53%|█████▎ | 5326/10000 [1:10:49<44:52, 1.74it/s, loss=0.0056, lr=1.35e-05, step=5325] Training: 53%|█████▎ | 5326/10000 [1:10:49<44:52, 1.74it/s, loss=0.0074, lr=1.35e-05, step=5326] Training: 53%|█████▎ | 5327/10000 [1:10:50<44:20, 1.76it/s, loss=0.0074, lr=1.35e-05, step=5326] Training: 53%|█████▎ | 5327/10000 [1:10:50<44:20, 1.76it/s, loss=0.0130, lr=1.35e-05, step=5327] Training: 53%|█████▎ | 5328/10000 [1:10:50<47:20, 1.64it/s, loss=0.0130, lr=1.35e-05, step=5327] Training: 53%|█████▎ | 5328/10000 [1:10:50<47:20, 1.64it/s, loss=0.0085, lr=1.35e-05, step=5328] Training: 53%|█████▎ | 5329/10000 [1:10:51<51:06, 1.52it/s, loss=0.0085, lr=1.35e-05, step=5328] Training: 53%|█████▎ | 5329/10000 [1:10:51<51:06, 1.52it/s, loss=0.0024, lr=1.35e-05, step=5329]19:55:23.835 [I] step=5330 loss=0.0027 smoothed_loss=0.0109 lr=1.35e-05 grad_norm=0.4413 step_time=0.5360s data_time=0.0852s it/s=1.610 eta_to_10000=2900.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.0816 grad_arm_token_fuse=0.0414 grad_shared_expert=0.2829 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5330/10000 [1:10:52<48:13, 1.61it/s, loss=0.0024, lr=1.35e-05, step=5329] Training: 53%|█████▎ | 5330/10000 [1:10:52<48:13, 1.61it/s, loss=0.0027, lr=1.35e-05, step=5330] Training: 53%|█████▎ | 5331/10000 [1:10:52<45:35, 1.71it/s, loss=0.0027, lr=1.35e-05, step=5330] Training: 53%|█████▎ | 5331/10000 [1:10:52<45:35, 1.71it/s, loss=0.0183, lr=1.35e-05, step=5331] Training: 53%|█████▎ | 5332/10000 [1:10:52<43:14, 1.80it/s, loss=0.0183, lr=1.35e-05, step=5331] Training: 53%|█████▎ | 5332/10000 [1:10:52<43:14, 1.80it/s, loss=0.0175, lr=1.34e-05, step=5332] Training: 53%|█████▎ | 5333/10000 [1:10:53<42:14, 1.84it/s, loss=0.0175, lr=1.34e-05, step=5332] Training: 53%|█████▎ | 5333/10000 [1:10:53<42:14, 1.84it/s, loss=0.0406, lr=1.34e-05, step=5333] Training: 53%|█████▎ | 5334/10000 [1:10:54<41:14, 1.89it/s, loss=0.0406, lr=1.34e-05, step=5333] Training: 53%|█████▎ | 5334/10000 [1:10:54<41:14, 1.89it/s, loss=0.0070, lr=1.34e-05, step=5334] Training: 53%|█████▎ | 5335/10000 [1:10:54<40:31, 1.92it/s, loss=0.0070, lr=1.34e-05, step=5334] Training: 53%|█████▎ | 5335/10000 [1:10:54<40:31, 1.92it/s, loss=0.1001, lr=1.34e-05, step=5335] Training: 53%|█████▎ | 5336/10000 [1:10:55<49:33, 1.57it/s, loss=0.1001, lr=1.34e-05, step=5335] Training: 53%|█████▎ | 5336/10000 [1:10:55<49:33, 1.57it/s, loss=0.0058, lr=1.34e-05, step=5336] Training: 53%|█████▎ | 5337/10000 [1:10:55<46:34, 1.67it/s, loss=0.0058, lr=1.34e-05, step=5336] Training: 53%|█████▎ | 5337/10000 [1:10:55<46:34, 1.67it/s, loss=0.0253, lr=1.34e-05, step=5337] Training: 53%|█████▎ | 5338/10000 [1:10:56<44:12, 1.76it/s, loss=0.0253, lr=1.34e-05, step=5337] Training: 53%|█████▎ | 5338/10000 [1:10:56<44:12, 1.76it/s, loss=0.0116, lr=1.34e-05, step=5338] Training: 53%|█████▎ | 5339/10000 [1:10:56<42:21, 1.83it/s, loss=0.0116, lr=1.34e-05, step=5338] Training: 53%|█████▎ | 5339/10000 [1:10:56<42:21, 1.83it/s, loss=0.0181, lr=1.34e-05, step=5339]19:55:29.288 [I] step=5340 loss=0.0152 smoothed_loss=0.0198 lr=1.34e-05 grad_norm=0.4887 step_time=0.4792s data_time=0.0662s it/s=1.834 eta_to_10000=2540.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0099 grad_action_out_proj_arms=0.1084 grad_arm_token_fuse=0.0485 grad_shared_expert=0.3125 (18633:train_pytorch.py:850) + Training: 53%|█████▎ | 5340/10000 [1:10:57<42:12, 1.84it/s, loss=0.0181, lr=1.34e-05, step=5339] Training: 53%|█████▎ | 5340/10000 [1:10:57<42:12, 1.84it/s, loss=0.0152, lr=1.34e-05, step=5340] Training: 53%|█████▎ | 5341/10000 [1:10:57<41:57, 1.85it/s, loss=0.0152, lr=1.34e-05, step=5340] Training: 53%|█████▎ | 5341/10000 [1:10:57<41:57, 1.85it/s, loss=0.0111, lr=1.34e-05, step=5341] Training: 53%|█████▎ | 5342/10000 [1:10:58<40:56, 1.90it/s, loss=0.0111, lr=1.34e-05, step=5341] Training: 53%|█████▎ | 5342/10000 [1:10:58<40:56, 1.90it/s, loss=0.0187, lr=1.34e-05, step=5342] Training: 53%|█████▎ | 5343/10000 [1:10:59<46:13, 1.68it/s, loss=0.0187, lr=1.34e-05, step=5342] Training: 53%|█████▎ | 5343/10000 [1:10:59<46:13, 1.68it/s, loss=0.0339, lr=1.34e-05, step=5343] Training: 53%|█████▎ | 5344/10000 [1:10:59<44:28, 1.74it/s, loss=0.0339, lr=1.34e-05, step=5343] Training: 53%|█████▎ | 5344/10000 [1:10:59<44:28, 1.74it/s, loss=0.0157, lr=1.34e-05, step=5344] Training: 53%|█████▎ | 5345/10000 [1:11:00<42:51, 1.81it/s, loss=0.0157, lr=1.34e-05, step=5344] Training: 53%|█████▎ | 5345/10000 [1:11:00<42:51, 1.81it/s, loss=0.0052, lr=1.34e-05, step=5345] Training: 53%|█████▎ | 5346/10000 [1:11:00<41:34, 1.87it/s, loss=0.0052, lr=1.34e-05, step=5345] Training: 53%|█████▎ | 5346/10000 [1:11:00<41:34, 1.87it/s, loss=0.0099, lr=1.34e-05, step=5346] Training: 53%|█████▎ | 5347/10000 [1:11:01<40:31, 1.91it/s, loss=0.0099, lr=1.34e-05, step=5346] Training: 53%|█████▎ | 5347/10000 [1:11:01<40:31, 1.91it/s, loss=0.0113, lr=1.34e-05, step=5347] Training: 53%|█████▎ | 5348/10000 [1:11:01<39:41, 1.95it/s, loss=0.0113, lr=1.34e-05, step=5347] Training: 53%|█████▎ | 5348/10000 [1:11:01<39:41, 1.95it/s, loss=0.0158, lr=1.34e-05, step=5348] Training: 53%|█████▎ | 5349/10000 [1:11:02<39:27, 1.96it/s, loss=0.0158, lr=1.34e-05, step=5348] Training: 53%|█████▎ | 5349/10000 [1:11:02<39:27, 1.96it/s, loss=0.0114, lr=1.34e-05, step=5349]19:55:34.861 [I] step=5350 loss=0.0049 smoothed_loss=0.0152 lr=1.34e-05 grad_norm=0.4995 step_time=0.4919s data_time=0.0655s it/s=1.795 eta_to_10000=2590.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.1081 grad_arm_token_fuse=0.0584 grad_shared_expert=0.4060 (18633:train_pytorch.py:850) + Training: 54%|█████▎ | 5350/10000 [1:11:03<45:51, 1.69it/s, loss=0.0114, lr=1.34e-05, step=5349] Training: 54%|█████▎ | 5350/10000 [1:11:03<45:51, 1.69it/s, loss=0.0049, lr=1.34e-05, step=5350] Training: 54%|█████▎ | 5351/10000 [1:11:03<47:38, 1.63it/s, loss=0.0049, lr=1.34e-05, step=5350] Training: 54%|█████▎ | 5351/10000 [1:11:03<47:38, 1.63it/s, loss=0.0586, lr=1.34e-05, step=5351] Training: 54%|█████▎ | 5352/10000 [1:11:04<46:16, 1.67it/s, loss=0.0586, lr=1.34e-05, step=5351] Training: 54%|█████▎ | 5352/10000 [1:11:04<46:16, 1.67it/s, loss=0.0085, lr=1.34e-05, step=5352] Training: 54%|█████▎ | 5353/10000 [1:11:05<49:52, 1.55it/s, loss=0.0085, lr=1.34e-05, step=5352] Training: 54%|█████▎ | 5353/10000 [1:11:05<49:52, 1.55it/s, loss=0.0014, lr=1.34e-05, step=5353] Training: 54%|█████▎ | 5354/10000 [1:11:05<47:36, 1.63it/s, loss=0.0014, lr=1.34e-05, step=5353] Training: 54%|█████▎ | 5354/10000 [1:11:05<47:36, 1.63it/s, loss=0.0251, lr=1.34e-05, step=5354] Training: 54%|█████▎ | 5355/10000 [1:11:06<46:10, 1.68it/s, loss=0.0251, lr=1.34e-05, step=5354] Training: 54%|█████▎ | 5355/10000 [1:11:06<46:10, 1.68it/s, loss=0.0103, lr=1.34e-05, step=5355] Training: 54%|█████▎ | 5356/10000 [1:11:06<45:13, 1.71it/s, loss=0.0103, lr=1.34e-05, step=5355] Training: 54%|█████▎ | 5356/10000 [1:11:06<45:13, 1.71it/s, loss=0.0295, lr=1.34e-05, step=5356] Training: 54%|█████▎ | 5357/10000 [1:11:07<49:20, 1.57it/s, loss=0.0295, lr=1.34e-05, step=5356] Training: 54%|█████▎ | 5357/10000 [1:11:07<49:20, 1.57it/s, loss=0.0088, lr=1.34e-05, step=5357] Training: 54%|█████▎ | 5358/10000 [1:11:08<51:04, 1.51it/s, loss=0.0088, lr=1.34e-05, step=5357] Training: 54%|█████▎ | 5358/10000 [1:11:08<51:04, 1.51it/s, loss=0.0203, lr=1.34e-05, step=5358] Training: 54%|█████▎ | 5359/10000 [1:11:08<48:19, 1.60it/s, loss=0.0203, lr=1.34e-05, step=5358] Training: 54%|█████▎ | 5359/10000 [1:11:08<48:19, 1.60it/s, loss=0.0126, lr=1.33e-05, step=5359]19:55:41.037 [I] step=5360 loss=0.0036 smoothed_loss=0.0156 lr=1.34e-05 grad_norm=0.4299 step_time=0.5448s data_time=0.0727s it/s=1.619 eta_to_10000=2865.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0130 grad_action_out_proj_arms=0.1130 grad_arm_token_fuse=0.0718 grad_shared_expert=0.2874 (18633:train_pytorch.py:850) + Training: 54%|█████▎ | 5360/10000 [1:11:09<45:58, 1.68it/s, loss=0.0126, lr=1.33e-05, step=5359] Training: 54%|█████▎ | 5360/10000 [1:11:09<45:58, 1.68it/s, loss=0.0036, lr=1.33e-05, step=5360] Training: 54%|█████▎ | 5361/10000 [1:11:09<43:48, 1.76it/s, loss=0.0036, lr=1.33e-05, step=5360] Training: 54%|█████▎ | 5361/10000 [1:11:09<43:48, 1.76it/s, loss=0.0096, lr=1.33e-05, step=5361] Training: 54%|█████▎ | 5362/10000 [1:11:10<42:20, 1.83it/s, loss=0.0096, lr=1.33e-05, step=5361] Training: 54%|█████▎ | 5362/10000 [1:11:10<42:20, 1.83it/s, loss=0.0346, lr=1.33e-05, step=5362] Training: 54%|█████▎ | 5363/10000 [1:11:10<40:47, 1.89it/s, loss=0.0346, lr=1.33e-05, step=5362] Training: 54%|█████▎ | 5363/10000 [1:11:10<40:47, 1.89it/s, loss=0.0266, lr=1.33e-05, step=5363] Training: 54%|█████▎ | 5364/10000 [1:11:11<43:57, 1.76it/s, loss=0.0266, lr=1.33e-05, step=5363] Training: 54%|█████▎ | 5364/10000 [1:11:11<43:57, 1.76it/s, loss=0.0055, lr=1.33e-05, step=5364] Training: 54%|█████▎ | 5365/10000 [1:11:12<47:42, 1.62it/s, loss=0.0055, lr=1.33e-05, step=5364] Training: 54%|█████▎ | 5365/10000 [1:11:12<47:42, 1.62it/s, loss=0.0082, lr=1.33e-05, step=5365] Training: 54%|█████▎ | 5366/10000 [1:11:12<48:27, 1.59it/s, loss=0.0082, lr=1.33e-05, step=5365] Training: 54%|█████▎ | 5366/10000 [1:11:12<48:27, 1.59it/s, loss=0.0055, lr=1.33e-05, step=5366] Training: 54%|█████▎ | 5367/10000 [1:11:13<47:18, 1.63it/s, loss=0.0055, lr=1.33e-05, step=5366] Training: 54%|█████▎ | 5367/10000 [1:11:13<47:18, 1.63it/s, loss=0.0028, lr=1.33e-05, step=5367] Training: 54%|█████▎ | 5368/10000 [1:11:13<46:03, 1.68it/s, loss=0.0028, lr=1.33e-05, step=5367] Training: 54%|█████▎ | 5368/10000 [1:11:13<46:03, 1.68it/s, loss=0.0023, lr=1.33e-05, step=5368] Training: 54%|█████▎ | 5369/10000 [1:11:14<45:19, 1.70it/s, loss=0.0023, lr=1.33e-05, step=5368] Training: 54%|█████▎ | 5369/10000 [1:11:14<45:19, 1.70it/s, loss=0.0018, lr=1.33e-05, step=5369]19:55:46.862 [I] step=5370 loss=0.0020 smoothed_loss=0.0105 lr=1.33e-05 grad_norm=0.4003 step_time=0.5167s data_time=0.0660s it/s=1.717 eta_to_10000=2696.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0061 grad_action_out_proj_arms=0.0981 grad_arm_token_fuse=0.0294 grad_shared_expert=0.3222 (18633:train_pytorch.py:850) + Training: 54%|█████▎ | 5370/10000 [1:11:15<45:23, 1.70it/s, loss=0.0018, lr=1.33e-05, step=5369] Training: 54%|█████▎ | 5370/10000 [1:11:15<45:23, 1.70it/s, loss=0.0020, lr=1.33e-05, step=5370] Training: 54%|█████▎ | 5371/10000 [1:11:15<47:23, 1.63it/s, loss=0.0020, lr=1.33e-05, step=5370] Training: 54%|█████▎ | 5371/10000 [1:11:15<47:23, 1.63it/s, loss=0.0031, lr=1.33e-05, step=5371] Training: 54%|█████▎ | 5372/10000 [1:11:16<50:43, 1.52it/s, loss=0.0031, lr=1.33e-05, step=5371] Training: 54%|█████▎ | 5372/10000 [1:11:16<50:43, 1.52it/s, loss=0.0064, lr=1.33e-05, step=5372] Training: 54%|█████▎ | 5373/10000 [1:11:17<50:43, 1.52it/s, loss=0.0064, lr=1.33e-05, step=5372] Training: 54%|█████▎ | 5373/10000 [1:11:17<50:43, 1.52it/s, loss=0.0813, lr=1.33e-05, step=5373] Training: 54%|█████▎ | 5374/10000 [1:11:17<49:16, 1.56it/s, loss=0.0813, lr=1.33e-05, step=5373] Training: 54%|█████▎ | 5374/10000 [1:11:17<49:16, 1.56it/s, loss=0.0061, lr=1.33e-05, step=5374] Training: 54%|█████▍ | 5375/10000 [1:11:18<45:56, 1.68it/s, loss=0.0061, lr=1.33e-05, step=5374] Training: 54%|█████▍ | 5375/10000 [1:11:18<45:56, 1.68it/s, loss=0.0247, lr=1.33e-05, step=5375] Training: 54%|█████▍ | 5376/10000 [1:11:18<43:34, 1.77it/s, loss=0.0247, lr=1.33e-05, step=5375] Training: 54%|█████▍ | 5376/10000 [1:11:18<43:34, 1.77it/s, loss=0.0097, lr=1.33e-05, step=5376] Training: 54%|█████▍ | 5377/10000 [1:11:19<41:30, 1.86it/s, loss=0.0097, lr=1.33e-05, step=5376] Training: 54%|█████▍ | 5377/10000 [1:11:19<41:30, 1.86it/s, loss=0.0171, lr=1.33e-05, step=5377] Training: 54%|█████▍ | 5378/10000 [1:11:19<44:22, 1.74it/s, loss=0.0171, lr=1.33e-05, step=5377] Training: 54%|█████▍ | 5378/10000 [1:11:19<44:22, 1.74it/s, loss=0.0032, lr=1.33e-05, step=5378] Training: 54%|█████▍ | 5379/10000 [1:11:20<47:42, 1.61it/s, loss=0.0032, lr=1.33e-05, step=5378] Training: 54%|█████▍ | 5379/10000 [1:11:20<47:42, 1.61it/s, loss=0.0317, lr=1.33e-05, step=5379]19:55:52.949 [I] step=5380 loss=0.0115 smoothed_loss=0.0159 lr=1.33e-05 grad_norm=0.4653 step_time=0.5413s data_time=0.0673s it/s=1.643 eta_to_10000=2811.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0240 grad_action_out_proj_arms=0.1844 grad_arm_token_fuse=0.1313 grad_shared_expert=0.4737 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5380/10000 [1:11:21<46:04, 1.67it/s, loss=0.0317, lr=1.33e-05, step=5379] Training: 54%|█████▍ | 5380/10000 [1:11:21<46:04, 1.67it/s, loss=0.0115, lr=1.33e-05, step=5380] Training: 54%|█████▍ | 5381/10000 [1:11:21<47:48, 1.61it/s, loss=0.0115, lr=1.33e-05, step=5380] Training: 54%|█████▍ | 5381/10000 [1:11:21<47:48, 1.61it/s, loss=0.1178, lr=1.33e-05, step=5381] Training: 54%|█████▍ | 5382/10000 [1:11:22<45:36, 1.69it/s, loss=0.1178, lr=1.33e-05, step=5381] Training: 54%|█████▍ | 5382/10000 [1:11:22<45:36, 1.69it/s, loss=0.0037, lr=1.33e-05, step=5382] Training: 54%|█████▍ | 5383/10000 [1:11:22<43:27, 1.77it/s, loss=0.0037, lr=1.33e-05, step=5382] Training: 54%|█████▍ | 5383/10000 [1:11:22<43:27, 1.77it/s, loss=0.0101, lr=1.33e-05, step=5383] Training: 54%|█████▍ | 5384/10000 [1:11:23<42:30, 1.81it/s, loss=0.0101, lr=1.33e-05, step=5383] Training: 54%|█████▍ | 5384/10000 [1:11:23<42:30, 1.81it/s, loss=0.0191, lr=1.33e-05, step=5384] Training: 54%|█████▍ | 5385/10000 [1:11:23<40:55, 1.88it/s, loss=0.0191, lr=1.33e-05, step=5384] Training: 54%|█████▍ | 5385/10000 [1:11:23<40:55, 1.88it/s, loss=0.0132, lr=1.33e-05, step=5385] Training: 54%|█████▍ | 5386/10000 [1:11:24<45:52, 1.68it/s, loss=0.0132, lr=1.33e-05, step=5385] Training: 54%|█████▍ | 5386/10000 [1:11:24<45:52, 1.68it/s, loss=0.0096, lr=1.32e-05, step=5386] Training: 54%|█████▍ | 5387/10000 [1:11:25<44:55, 1.71it/s, loss=0.0096, lr=1.32e-05, step=5386] Training: 54%|█████▍ | 5387/10000 [1:11:25<44:55, 1.71it/s, loss=0.0026, lr=1.32e-05, step=5387] Training: 54%|█████▍ | 5388/10000 [1:11:25<46:34, 1.65it/s, loss=0.0026, lr=1.32e-05, step=5387] Training: 54%|█████▍ | 5388/10000 [1:11:25<46:34, 1.65it/s, loss=0.0192, lr=1.32e-05, step=5388] Training: 54%|█████▍ | 5389/10000 [1:11:26<44:55, 1.71it/s, loss=0.0192, lr=1.32e-05, step=5388] Training: 54%|█████▍ | 5389/10000 [1:11:26<44:55, 1.71it/s, loss=0.0028, lr=1.32e-05, step=5389]19:55:58.665 [I] step=5390 loss=0.0050 smoothed_loss=0.0157 lr=1.32e-05 grad_norm=0.4412 step_time=0.5061s data_time=0.0654s it/s=1.750 eta_to_10000=2634.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0092 grad_action_out_proj_arms=0.0853 grad_arm_token_fuse=0.0444 grad_shared_expert=0.3891 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5390/10000 [1:11:26<43:16, 1.78it/s, loss=0.0028, lr=1.32e-05, step=5389] Training: 54%|█████▍ | 5390/10000 [1:11:26<43:16, 1.78it/s, loss=0.0050, lr=1.32e-05, step=5390] Training: 54%|█████▍ | 5391/10000 [1:11:27<41:28, 1.85it/s, loss=0.0050, lr=1.32e-05, step=5390] Training: 54%|█████▍ | 5391/10000 [1:11:27<41:28, 1.85it/s, loss=0.0029, lr=1.32e-05, step=5391] Training: 54%|█████▍ | 5392/10000 [1:11:27<40:27, 1.90it/s, loss=0.0029, lr=1.32e-05, step=5391] Training: 54%|█████▍ | 5392/10000 [1:11:27<40:27, 1.90it/s, loss=0.0205, lr=1.32e-05, step=5392] Training: 54%|█████▍ | 5393/10000 [1:11:28<45:49, 1.68it/s, loss=0.0205, lr=1.32e-05, step=5392] Training: 54%|█████▍ | 5393/10000 [1:11:28<45:49, 1.68it/s, loss=0.0115, lr=1.32e-05, step=5393] Training: 54%|█████▍ | 5394/10000 [1:11:29<43:26, 1.77it/s, loss=0.0115, lr=1.32e-05, step=5393] Training: 54%|█████▍ | 5394/10000 [1:11:29<43:26, 1.77it/s, loss=0.0048, lr=1.32e-05, step=5394] Training: 54%|█████▍ | 5395/10000 [1:11:29<46:09, 1.66it/s, loss=0.0048, lr=1.32e-05, step=5394] Training: 54%|█████▍ | 5395/10000 [1:11:29<46:09, 1.66it/s, loss=0.0222, lr=1.32e-05, step=5395] Training: 54%|█████▍ | 5396/10000 [1:11:30<43:54, 1.75it/s, loss=0.0222, lr=1.32e-05, step=5395] Training: 54%|█████▍ | 5396/10000 [1:11:30<43:54, 1.75it/s, loss=0.0025, lr=1.32e-05, step=5396] Training: 54%|█████▍ | 5397/10000 [1:11:30<41:48, 1.83it/s, loss=0.0025, lr=1.32e-05, step=5396] Training: 54%|█████▍ | 5397/10000 [1:11:30<41:48, 1.83it/s, loss=0.0081, lr=1.32e-05, step=5397] Training: 54%|█████▍ | 5398/10000 [1:11:31<40:45, 1.88it/s, loss=0.0081, lr=1.32e-05, step=5397] Training: 54%|█████▍ | 5398/10000 [1:11:31<40:45, 1.88it/s, loss=0.0081, lr=1.32e-05, step=5398] Training: 54%|█████▍ | 5399/10000 [1:11:31<40:57, 1.87it/s, loss=0.0081, lr=1.32e-05, step=5398] Training: 54%|█████▍ | 5399/10000 [1:11:31<40:57, 1.87it/s, loss=0.0175, lr=1.32e-05, step=5399]19:56:04.443 [I] step=5400 loss=0.0133 smoothed_loss=0.0129 lr=1.32e-05 grad_norm=0.4885 step_time=0.5135s data_time=0.0644s it/s=1.731 eta_to_10000=2657.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1068 grad_arm_token_fuse=0.0540 grad_shared_expert=0.3084 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5400/10000 [1:11:32<47:48, 1.60it/s, loss=0.0175, lr=1.32e-05, step=5399] Training: 54%|█████▍ | 5400/10000 [1:11:32<47:48, 1.60it/s, loss=0.0133, lr=1.32e-05, step=5400] Training: 54%|█████▍ | 5401/10000 [1:11:33<44:57, 1.71it/s, loss=0.0133, lr=1.32e-05, step=5400] Training: 54%|█████▍ | 5401/10000 [1:11:33<44:57, 1.71it/s, loss=0.0286, lr=1.32e-05, step=5401] Training: 54%|█████▍ | 5402/10000 [1:11:33<46:46, 1.64it/s, loss=0.0286, lr=1.32e-05, step=5401] Training: 54%|█████▍ | 5402/10000 [1:11:33<46:46, 1.64it/s, loss=0.0027, lr=1.32e-05, step=5402] Training: 54%|█████▍ | 5403/10000 [1:11:34<44:12, 1.73it/s, loss=0.0027, lr=1.32e-05, step=5402] Training: 54%|█████▍ | 5403/10000 [1:11:34<44:12, 1.73it/s, loss=0.0056, lr=1.32e-05, step=5403] Training: 54%|█████▍ | 5404/10000 [1:11:34<42:17, 1.81it/s, loss=0.0056, lr=1.32e-05, step=5403] Training: 54%|█████▍ | 5404/10000 [1:11:34<42:17, 1.81it/s, loss=0.0061, lr=1.32e-05, step=5404] Training: 54%|█████▍ | 5405/10000 [1:11:35<40:39, 1.88it/s, loss=0.0061, lr=1.32e-05, step=5404] Training: 54%|█████▍ | 5405/10000 [1:11:35<40:39, 1.88it/s, loss=0.0088, lr=1.32e-05, step=5405] Training: 54%|█████▍ | 5406/10000 [1:11:35<40:09, 1.91it/s, loss=0.0088, lr=1.32e-05, step=5405] Training: 54%|█████▍ | 5406/10000 [1:11:35<40:09, 1.91it/s, loss=0.0227, lr=1.32e-05, step=5406] Training: 54%|█████▍ | 5407/10000 [1:11:36<46:00, 1.66it/s, loss=0.0227, lr=1.32e-05, step=5406] Training: 54%|█████▍ | 5407/10000 [1:11:36<46:00, 1.66it/s, loss=0.0102, lr=1.32e-05, step=5407] Training: 54%|█████▍ | 5408/10000 [1:11:37<43:40, 1.75it/s, loss=0.0102, lr=1.32e-05, step=5407] Training: 54%|█████▍ | 5408/10000 [1:11:37<43:40, 1.75it/s, loss=0.0055, lr=1.32e-05, step=5408] Training: 54%|█████▍ | 5409/10000 [1:11:37<45:20, 1.69it/s, loss=0.0055, lr=1.32e-05, step=5408] Training: 54%|█████▍ | 5409/10000 [1:11:37<45:20, 1.69it/s, loss=0.0074, lr=1.32e-05, step=5409]19:56:10.043 [I] step=5410 loss=0.0074 smoothed_loss=0.0109 lr=1.32e-05 grad_norm=0.4449 step_time=0.4940s data_time=0.0661s it/s=1.786 eta_to_10000=2570.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0036 grad_action_out_proj_arms=0.0527 grad_arm_token_fuse=0.0183 grad_shared_expert=0.1962 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5410/10000 [1:11:38<43:50, 1.75it/s, loss=0.0074, lr=1.32e-05, step=5409] Training: 54%|█████▍ | 5410/10000 [1:11:38<43:50, 1.75it/s, loss=0.0074, lr=1.32e-05, step=5410] Training: 54%|█████▍ | 5411/10000 [1:11:38<42:06, 1.82it/s, loss=0.0074, lr=1.32e-05, step=5410] Training: 54%|█████▍ | 5411/10000 [1:11:38<42:06, 1.82it/s, loss=0.0095, lr=1.32e-05, step=5411] Training: 54%|█████▍ | 5412/10000 [1:11:39<40:40, 1.88it/s, loss=0.0095, lr=1.32e-05, step=5411] Training: 54%|█████▍ | 5412/10000 [1:11:39<40:40, 1.88it/s, loss=0.0101, lr=1.32e-05, step=5412] Training: 54%|█████▍ | 5413/10000 [1:11:39<39:55, 1.91it/s, loss=0.0101, lr=1.32e-05, step=5412] Training: 54%|█████▍ | 5413/10000 [1:11:39<39:55, 1.91it/s, loss=0.0028, lr=1.31e-05, step=5413] Training: 54%|█████▍ | 5414/10000 [1:11:40<44:10, 1.73it/s, loss=0.0028, lr=1.31e-05, step=5413] Training: 54%|█████▍ | 5414/10000 [1:11:40<44:10, 1.73it/s, loss=0.0095, lr=1.31e-05, step=5414] Training: 54%|█████▍ | 5415/10000 [1:11:41<47:10, 1.62it/s, loss=0.0095, lr=1.31e-05, step=5414] Training: 54%|█████▍ | 5415/10000 [1:11:41<47:10, 1.62it/s, loss=0.0136, lr=1.31e-05, step=5415] Training: 54%|█████▍ | 5416/10000 [1:11:41<48:01, 1.59it/s, loss=0.0136, lr=1.31e-05, step=5415] Training: 54%|█████▍ | 5416/10000 [1:11:41<48:01, 1.59it/s, loss=0.0106, lr=1.31e-05, step=5416] Training: 54%|█████▍ | 5417/10000 [1:11:42<45:16, 1.69it/s, loss=0.0106, lr=1.31e-05, step=5416] Training: 54%|█████▍ | 5417/10000 [1:11:42<45:16, 1.69it/s, loss=0.0108, lr=1.31e-05, step=5417] Training: 54%|█████▍ | 5418/10000 [1:11:42<43:13, 1.77it/s, loss=0.0108, lr=1.31e-05, step=5417] Training: 54%|█████▍ | 5418/10000 [1:11:42<43:13, 1.77it/s, loss=0.0226, lr=1.31e-05, step=5418] Training: 54%|█████▍ | 5419/10000 [1:11:43<42:03, 1.82it/s, loss=0.0226, lr=1.31e-05, step=5418] Training: 54%|█████▍ | 5419/10000 [1:11:43<42:03, 1.82it/s, loss=0.0100, lr=1.31e-05, step=5419]19:56:15.655 [I] step=5420 loss=0.0143 smoothed_loss=0.0117 lr=1.31e-05 grad_norm=0.4037 step_time=0.4978s data_time=0.0634s it/s=1.782 eta_to_10000=2569.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0063 grad_action_out_proj_arms=0.0941 grad_arm_token_fuse=0.0325 grad_shared_expert=0.3491 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5420/10000 [1:11:43<41:27, 1.84it/s, loss=0.0100, lr=1.31e-05, step=5419] Training: 54%|█████▍ | 5420/10000 [1:11:43<41:27, 1.84it/s, loss=0.0143, lr=1.31e-05, step=5420] Training: 54%|█████▍ | 5421/10000 [1:11:44<45:49, 1.67it/s, loss=0.0143, lr=1.31e-05, step=5420] Training: 54%|█████▍ | 5421/10000 [1:11:44<45:49, 1.67it/s, loss=0.0074, lr=1.31e-05, step=5421] Training: 54%|█████▍ | 5422/10000 [1:11:45<47:47, 1.60it/s, loss=0.0074, lr=1.31e-05, step=5421] Training: 54%|█████▍ | 5422/10000 [1:11:45<47:47, 1.60it/s, loss=0.0019, lr=1.31e-05, step=5422] Training: 54%|█████▍ | 5423/10000 [1:11:45<49:21, 1.55it/s, loss=0.0019, lr=1.31e-05, step=5422] Training: 54%|█████▍ | 5423/10000 [1:11:45<49:21, 1.55it/s, loss=0.0065, lr=1.31e-05, step=5423] Training: 54%|█████▍ | 5424/10000 [1:11:46<45:49, 1.66it/s, loss=0.0065, lr=1.31e-05, step=5423] Training: 54%|█████▍ | 5424/10000 [1:11:46<45:49, 1.66it/s, loss=0.0202, lr=1.31e-05, step=5424] Training: 54%|█████▍ | 5425/10000 [1:11:46<43:40, 1.75it/s, loss=0.0202, lr=1.31e-05, step=5424] Training: 54%|█████▍ | 5425/10000 [1:11:46<43:40, 1.75it/s, loss=0.0050, lr=1.31e-05, step=5425] Training: 54%|█████▍ | 5426/10000 [1:11:47<41:48, 1.82it/s, loss=0.0050, lr=1.31e-05, step=5425] Training: 54%|█████▍ | 5426/10000 [1:11:47<41:48, 1.82it/s, loss=0.0329, lr=1.31e-05, step=5426] Training: 54%|█████▍ | 5427/10000 [1:11:47<40:33, 1.88it/s, loss=0.0329, lr=1.31e-05, step=5426] Training: 54%|█████▍ | 5427/10000 [1:11:47<40:33, 1.88it/s, loss=0.0037, lr=1.31e-05, step=5427] Training: 54%|█████▍ | 5428/10000 [1:11:48<44:33, 1.71it/s, loss=0.0037, lr=1.31e-05, step=5427] Training: 54%|█████▍ | 5428/10000 [1:11:48<44:33, 1.71it/s, loss=0.0090, lr=1.31e-05, step=5428] Training: 54%|█████▍ | 5429/10000 [1:11:49<47:43, 1.60it/s, loss=0.0090, lr=1.31e-05, step=5428] Training: 54%|█████▍ | 5429/10000 [1:11:49<47:43, 1.60it/s, loss=0.0016, lr=1.31e-05, step=5429]19:56:21.718 [I] step=5430 loss=0.0182 smoothed_loss=0.0112 lr=1.31e-05 grad_norm=0.4656 step_time=0.5411s data_time=0.0652s it/s=1.650 eta_to_10000=2770.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0168 grad_action_out_proj_arms=0.1452 grad_arm_token_fuse=0.0865 grad_shared_expert=0.4526 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5430/10000 [1:11:49<45:29, 1.67it/s, loss=0.0016, lr=1.31e-05, step=5429] Training: 54%|█████▍ | 5430/10000 [1:11:49<45:29, 1.67it/s, loss=0.0182, lr=1.31e-05, step=5430] Training: 54%|█████▍ | 5431/10000 [1:11:50<47:14, 1.61it/s, loss=0.0182, lr=1.31e-05, step=5430] Training: 54%|█████▍ | 5431/10000 [1:11:50<47:14, 1.61it/s, loss=0.0186, lr=1.31e-05, step=5431] Training: 54%|█████▍ | 5432/10000 [1:11:51<44:43, 1.70it/s, loss=0.0186, lr=1.31e-05, step=5431] Training: 54%|█████▍ | 5432/10000 [1:11:51<44:43, 1.70it/s, loss=0.0071, lr=1.31e-05, step=5432] Training: 54%|█████▍ | 5433/10000 [1:11:51<42:30, 1.79it/s, loss=0.0071, lr=1.31e-05, step=5432] Training: 54%|█████▍ | 5433/10000 [1:11:51<42:30, 1.79it/s, loss=0.0391, lr=1.31e-05, step=5433] Training: 54%|█████▍ | 5434/10000 [1:11:52<41:15, 1.84it/s, loss=0.0391, lr=1.31e-05, step=5433] Training: 54%|█████▍ | 5434/10000 [1:11:52<41:15, 1.84it/s, loss=0.0310, lr=1.31e-05, step=5434] Training: 54%|█████▍ | 5435/10000 [1:11:52<40:04, 1.90it/s, loss=0.0310, lr=1.31e-05, step=5434] Training: 54%|█████▍ | 5435/10000 [1:11:52<40:04, 1.90it/s, loss=0.0015, lr=1.31e-05, step=5435] Training: 54%|█████▍ | 5436/10000 [1:11:53<44:20, 1.72it/s, loss=0.0015, lr=1.31e-05, step=5435] Training: 54%|█████▍ | 5436/10000 [1:11:53<44:20, 1.72it/s, loss=0.0060, lr=1.31e-05, step=5436] Training: 54%|█████▍ | 5437/10000 [1:11:53<42:26, 1.79it/s, loss=0.0060, lr=1.31e-05, step=5436] Training: 54%|█████▍ | 5437/10000 [1:11:53<42:26, 1.79it/s, loss=0.0068, lr=1.31e-05, step=5437] Training: 54%|█████▍ | 5438/10000 [1:11:54<45:22, 1.68it/s, loss=0.0068, lr=1.31e-05, step=5437] Training: 54%|█████▍ | 5438/10000 [1:11:54<45:22, 1.68it/s, loss=0.0097, lr=1.31e-05, step=5438] Training: 54%|█████▍ | 5439/10000 [1:11:55<44:50, 1.70it/s, loss=0.0097, lr=1.31e-05, step=5438] Training: 54%|█████▍ | 5439/10000 [1:11:55<44:50, 1.70it/s, loss=0.0250, lr=1.31e-05, step=5439]19:56:27.399 [I] step=5440 loss=0.0044 smoothed_loss=0.0129 lr=1.31e-05 grad_norm=0.4721 step_time=0.5040s data_time=0.0641s it/s=1.761 eta_to_10000=2589.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.0937 grad_arm_token_fuse=0.0493 grad_shared_expert=0.4096 (18633:train_pytorch.py:850) + Training: 54%|█████▍ | 5440/10000 [1:11:55<43:35, 1.74it/s, loss=0.0250, lr=1.31e-05, step=5439] Training: 54%|█████▍ | 5440/10000 [1:11:55<43:35, 1.74it/s, loss=0.0044, lr=1.30e-05, step=5440] Training: 54%|█████▍ | 5441/10000 [1:11:56<41:43, 1.82it/s, loss=0.0044, lr=1.30e-05, step=5440] Training: 54%|█████▍ | 5441/10000 [1:11:56<41:43, 1.82it/s, loss=0.0010, lr=1.30e-05, step=5441] Training: 54%|█████▍ | 5442/10000 [1:11:56<40:34, 1.87it/s, loss=0.0010, lr=1.30e-05, step=5441] Training: 54%|█████▍ | 5442/10000 [1:11:56<40:34, 1.87it/s, loss=0.0163, lr=1.30e-05, step=5442] Training: 54%|█████▍ | 5443/10000 [1:11:57<44:58, 1.69it/s, loss=0.0163, lr=1.30e-05, step=5442] Training: 54%|█████▍ | 5443/10000 [1:11:57<44:58, 1.69it/s, loss=0.0028, lr=1.30e-05, step=5443] Training: 54%|█████▍ | 5444/10000 [1:11:57<42:35, 1.78it/s, loss=0.0028, lr=1.30e-05, step=5443] Training: 54%|█████▍ | 5444/10000 [1:11:57<42:35, 1.78it/s, loss=0.0044, lr=1.30e-05, step=5444] Training: 54%|█████▍ | 5445/10000 [1:11:58<40:37, 1.87it/s, loss=0.0044, lr=1.30e-05, step=5444] Training: 54%|█████▍ | 5445/10000 [1:11:58<40:37, 1.87it/s, loss=0.0078, lr=1.30e-05, step=5445] Training: 54%|█████▍ | 5446/10000 [1:11:59<45:36, 1.66it/s, loss=0.0078, lr=1.30e-05, step=5445] Training: 54%|█████▍ | 5446/10000 [1:11:59<45:36, 1.66it/s, loss=0.0168, lr=1.30e-05, step=5446] Training: 54%|█████▍ | 5447/10000 [1:11:59<43:19, 1.75it/s, loss=0.0168, lr=1.30e-05, step=5446] Training: 54%|█████▍ | 5447/10000 [1:11:59<43:19, 1.75it/s, loss=0.0081, lr=1.30e-05, step=5447] Training: 54%|█████▍ | 5448/10000 [1:12:00<41:44, 1.82it/s, loss=0.0081, lr=1.30e-05, step=5447] Training: 54%|█████▍ | 5448/10000 [1:12:00<41:44, 1.82it/s, loss=0.0166, lr=1.30e-05, step=5448] Training: 54%|█████▍ | 5449/10000 [1:12:00<40:20, 1.88it/s, loss=0.0166, lr=1.30e-05, step=5448] Training: 54%|█████▍ | 5449/10000 [1:12:00<40:20, 1.88it/s, loss=0.0058, lr=1.30e-05, step=5449]19:56:33.122 [I] step=5450 loss=0.0052 smoothed_loss=0.0102 lr=1.30e-05 grad_norm=0.5410 step_time=0.5057s data_time=0.0667s it/s=1.747 eta_to_10000=2603.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0148 grad_action_out_proj_arms=0.1554 grad_arm_token_fuse=0.0758 grad_shared_expert=0.5638 (18633:train_pytorch.py:850) + Training: 55%|█████▍ | 5450/10000 [1:12:01<46:19, 1.64it/s, loss=0.0058, lr=1.30e-05, step=5449] Training: 55%|█████▍ | 5450/10000 [1:12:01<46:19, 1.64it/s, loss=0.0052, lr=1.30e-05, step=5450] Training: 55%|█████▍ | 5451/10000 [1:12:01<43:56, 1.73it/s, loss=0.0052, lr=1.30e-05, step=5450] Training: 55%|█████▍ | 5451/10000 [1:12:01<43:56, 1.73it/s, loss=0.0178, lr=1.30e-05, step=5451] Training: 55%|█████▍ | 5452/10000 [1:12:02<41:59, 1.80it/s, loss=0.0178, lr=1.30e-05, step=5451] Training: 55%|█████▍ | 5452/10000 [1:12:02<41:59, 1.80it/s, loss=0.0161, lr=1.30e-05, step=5452] Training: 55%|█████▍ | 5453/10000 [1:12:02<44:18, 1.71it/s, loss=0.0161, lr=1.30e-05, step=5452] Training: 55%|█████▍ | 5453/10000 [1:12:02<44:18, 1.71it/s, loss=0.0031, lr=1.30e-05, step=5453] Training: 55%|█████▍ | 5454/10000 [1:12:03<42:43, 1.77it/s, loss=0.0031, lr=1.30e-05, step=5453] Training: 55%|█████▍ | 5454/10000 [1:12:03<42:43, 1.77it/s, loss=0.0059, lr=1.30e-05, step=5454] Training: 55%|█████▍ | 5455/10000 [1:12:04<44:19, 1.71it/s, loss=0.0059, lr=1.30e-05, step=5454] Training: 55%|█████▍ | 5455/10000 [1:12:04<44:19, 1.71it/s, loss=0.0046, lr=1.30e-05, step=5455] Training: 55%|█████▍ | 5456/10000 [1:12:04<42:46, 1.77it/s, loss=0.0046, lr=1.30e-05, step=5455] Training: 55%|█████▍ | 5456/10000 [1:12:04<42:46, 1.77it/s, loss=0.0573, lr=1.30e-05, step=5456] Training: 55%|█████▍ | 5457/10000 [1:12:05<47:14, 1.60it/s, loss=0.0573, lr=1.30e-05, step=5456] Training: 55%|█████▍ | 5457/10000 [1:12:05<47:14, 1.60it/s, loss=0.0025, lr=1.30e-05, step=5457] Training: 55%|█████▍ | 5458/10000 [1:12:05<44:18, 1.71it/s, loss=0.0025, lr=1.30e-05, step=5457] Training: 55%|█████▍ | 5458/10000 [1:12:05<44:18, 1.71it/s, loss=0.0060, lr=1.30e-05, step=5458] Training: 55%|█████▍ | 5459/10000 [1:12:06<42:51, 1.77it/s, loss=0.0060, lr=1.30e-05, step=5458] Training: 55%|█████▍ | 5459/10000 [1:12:06<42:51, 1.77it/s, loss=0.0386, lr=1.30e-05, step=5459]19:56:38.822 [I] step=5460 loss=0.0025 smoothed_loss=0.0138 lr=1.30e-05 grad_norm=0.4311 step_time=0.4919s data_time=0.0781s it/s=1.755 eta_to_10000=2587.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0182 grad_action_out_proj_arms=0.1774 grad_arm_token_fuse=0.0914 grad_shared_expert=0.4294 (18633:train_pytorch.py:850) + Training: 55%|█████▍ | 5460/10000 [1:12:06<43:31, 1.74it/s, loss=0.0386, lr=1.30e-05, step=5459] Training: 55%|█████▍ | 5460/10000 [1:12:06<43:31, 1.74it/s, loss=0.0025, lr=1.30e-05, step=5460] Training: 55%|█████▍ | 5461/10000 [1:12:07<45:15, 1.67it/s, loss=0.0025, lr=1.30e-05, step=5460] Training: 55%|█████▍ | 5461/10000 [1:12:07<45:15, 1.67it/s, loss=0.0121, lr=1.30e-05, step=5461] Training: 55%|█████▍ | 5462/10000 [1:12:08<42:44, 1.77it/s, loss=0.0121, lr=1.30e-05, step=5461] Training: 55%|█████▍ | 5462/10000 [1:12:08<42:44, 1.77it/s, loss=0.0101, lr=1.30e-05, step=5462] Training: 55%|█████▍ | 5463/10000 [1:12:08<40:56, 1.85it/s, loss=0.0101, lr=1.30e-05, step=5462] Training: 55%|█████▍ | 5463/10000 [1:12:08<40:56, 1.85it/s, loss=0.0113, lr=1.30e-05, step=5463] Training: 55%|█████▍ | 5464/10000 [1:12:09<44:35, 1.70it/s, loss=0.0113, lr=1.30e-05, step=5463] Training: 55%|█████▍ | 5464/10000 [1:12:09<44:35, 1.70it/s, loss=0.0071, lr=1.30e-05, step=5464] Training: 55%|█████▍ | 5465/10000 [1:12:10<47:12, 1.60it/s, loss=0.0071, lr=1.30e-05, step=5464] Training: 55%|█████▍ | 5465/10000 [1:12:10<47:12, 1.60it/s, loss=0.0330, lr=1.30e-05, step=5465] Training: 55%|█████▍ | 5466/10000 [1:12:10<44:18, 1.71it/s, loss=0.0330, lr=1.30e-05, step=5465] Training: 55%|█████▍ | 5466/10000 [1:12:10<44:18, 1.71it/s, loss=0.0107, lr=1.30e-05, step=5466] Training: 55%|█████▍ | 5467/10000 [1:12:11<42:31, 1.78it/s, loss=0.0107, lr=1.30e-05, step=5466] Training: 55%|█████▍ | 5467/10000 [1:12:11<42:31, 1.78it/s, loss=0.0071, lr=1.29e-05, step=5467] Training: 55%|█████▍ | 5468/10000 [1:12:11<45:02, 1.68it/s, loss=0.0071, lr=1.29e-05, step=5467] Training: 55%|█████▍ | 5468/10000 [1:12:11<45:02, 1.68it/s, loss=0.0183, lr=1.29e-05, step=5468] Training: 55%|█████▍ | 5469/10000 [1:12:12<43:02, 1.75it/s, loss=0.0183, lr=1.29e-05, step=5468] Training: 55%|█████▍ | 5469/10000 [1:12:12<43:02, 1.75it/s, loss=0.0023, lr=1.29e-05, step=5469]19:56:44.558 [I] step=5470 loss=0.0198 smoothed_loss=0.0135 lr=1.30e-05 grad_norm=0.4966 step_time=0.5103s data_time=0.0633s it/s=1.744 eta_to_10000=2598.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0384 grad_action_out_proj_arms=0.2529 grad_arm_token_fuse=0.2322 grad_shared_expert=0.6447 (18633:train_pytorch.py:850) + Training: 55%|█████▍ | 5470/10000 [1:12:12<41:45, 1.81it/s, loss=0.0023, lr=1.29e-05, step=5469] Training: 55%|█████▍ | 5470/10000 [1:12:12<41:45, 1.81it/s, loss=0.0198, lr=1.29e-05, step=5470] Training: 55%|█████▍ | 5471/10000 [1:12:13<44:33, 1.69it/s, loss=0.0198, lr=1.29e-05, step=5470] Training: 55%|█████▍ | 5471/10000 [1:12:13<44:33, 1.69it/s, loss=0.0089, lr=1.29e-05, step=5471] Training: 55%|█████▍ | 5472/10000 [1:12:14<46:52, 1.61it/s, loss=0.0089, lr=1.29e-05, step=5471] Training: 55%|█████▍ | 5472/10000 [1:12:14<46:52, 1.61it/s, loss=0.0059, lr=1.29e-05, step=5472] Training: 55%|█████▍ | 5473/10000 [1:12:14<44:15, 1.70it/s, loss=0.0059, lr=1.29e-05, step=5472] Training: 55%|█████▍ | 5473/10000 [1:12:14<44:15, 1.70it/s, loss=0.0136, lr=1.29e-05, step=5473] Training: 55%|█████▍ | 5474/10000 [1:12:15<42:22, 1.78it/s, loss=0.0136, lr=1.29e-05, step=5473] Training: 55%|█████▍ | 5474/10000 [1:12:15<42:22, 1.78it/s, loss=0.0473, lr=1.29e-05, step=5474] Training: 55%|█████▍ | 5475/10000 [1:12:15<45:22, 1.66it/s, loss=0.0473, lr=1.29e-05, step=5474] Training: 55%|█████▍ | 5475/10000 [1:12:15<45:22, 1.66it/s, loss=0.0129, lr=1.29e-05, step=5475] Training: 55%|█████▍ | 5476/10000 [1:12:16<42:46, 1.76it/s, loss=0.0129, lr=1.29e-05, step=5475] Training: 55%|█████▍ | 5476/10000 [1:12:16<42:46, 1.76it/s, loss=0.0178, lr=1.29e-05, step=5476] Training: 55%|█████▍ | 5477/10000 [1:12:16<45:07, 1.67it/s, loss=0.0178, lr=1.29e-05, step=5476] Training: 55%|█████▍ | 5477/10000 [1:12:16<45:07, 1.67it/s, loss=0.0180, lr=1.29e-05, step=5477] Training: 55%|█████▍ | 5478/10000 [1:12:17<47:15, 1.59it/s, loss=0.0180, lr=1.29e-05, step=5477] Training: 55%|█████▍ | 5478/10000 [1:12:17<47:15, 1.59it/s, loss=0.0018, lr=1.29e-05, step=5478] Training: 55%|█████▍ | 5479/10000 [1:12:18<49:10, 1.53it/s, loss=0.0018, lr=1.29e-05, step=5478] Training: 55%|█████▍ | 5479/10000 [1:12:18<49:10, 1.53it/s, loss=0.0414, lr=1.29e-05, step=5479]19:56:50.891 [I] step=5480 loss=0.0053 smoothed_loss=0.0161 lr=1.29e-05 grad_norm=0.4989 step_time=0.5449s data_time=0.0884s it/s=1.579 eta_to_10000=2861.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0128 grad_action_out_proj_arms=0.1097 grad_arm_token_fuse=0.0636 grad_shared_expert=0.3542 (18633:train_pytorch.py:850) + Training: 55%|█████▍ | 5480/10000 [1:12:19<50:05, 1.50it/s, loss=0.0414, lr=1.29e-05, step=5479] Training: 55%|█████▍ | 5480/10000 [1:12:19<50:05, 1.50it/s, loss=0.0053, lr=1.29e-05, step=5480] Training: 55%|█████▍ | 5481/10000 [1:12:19<46:06, 1.63it/s, loss=0.0053, lr=1.29e-05, step=5480] Training: 55%|█████▍ | 5481/10000 [1:12:19<46:06, 1.63it/s, loss=0.0060, lr=1.29e-05, step=5481] Training: 55%|█████▍ | 5482/10000 [1:12:20<47:25, 1.59it/s, loss=0.0060, lr=1.29e-05, step=5481] Training: 55%|█████▍ | 5482/10000 [1:12:20<47:25, 1.59it/s, loss=0.0115, lr=1.29e-05, step=5482] Training: 55%|█████▍ | 5483/10000 [1:12:20<44:48, 1.68it/s, loss=0.0115, lr=1.29e-05, step=5482] Training: 55%|█████▍ | 5483/10000 [1:12:20<44:48, 1.68it/s, loss=0.0027, lr=1.29e-05, step=5483] Training: 55%|█████▍ | 5484/10000 [1:12:21<46:05, 1.63it/s, loss=0.0027, lr=1.29e-05, step=5483] Training: 55%|█████▍ | 5484/10000 [1:12:21<46:05, 1.63it/s, loss=0.0096, lr=1.29e-05, step=5484] Training: 55%|█████▍ | 5485/10000 [1:12:22<49:07, 1.53it/s, loss=0.0096, lr=1.29e-05, step=5484] Training: 55%|█████▍ | 5485/10000 [1:12:22<49:07, 1.53it/s, loss=0.0180, lr=1.29e-05, step=5485] Training: 55%|█████▍ | 5486/10000 [1:12:22<52:30, 1.43it/s, loss=0.0180, lr=1.29e-05, step=5485] Training: 55%|█████▍ | 5486/10000 [1:12:22<52:30, 1.43it/s, loss=0.0165, lr=1.29e-05, step=5486] Training: 55%|█████▍ | 5487/10000 [1:12:23<53:11, 1.41it/s, loss=0.0165, lr=1.29e-05, step=5486] Training: 55%|█████▍ | 5487/10000 [1:12:23<53:11, 1.41it/s, loss=0.0216, lr=1.29e-05, step=5487] Training: 55%|█████▍ | 5488/10000 [1:12:24<52:49, 1.42it/s, loss=0.0216, lr=1.29e-05, step=5487] Training: 55%|█████▍ | 5488/10000 [1:12:24<52:49, 1.42it/s, loss=0.0252, lr=1.29e-05, step=5488] Training: 55%|█████▍ | 5489/10000 [1:12:25<53:35, 1.40it/s, loss=0.0252, lr=1.29e-05, step=5488] Training: 55%|█████▍ | 5489/10000 [1:12:25<53:35, 1.40it/s, loss=0.0067, lr=1.29e-05, step=5489]19:56:57.660 [I] step=5490 loss=0.0042 smoothed_loss=0.0138 lr=1.29e-05 grad_norm=0.4332 step_time=0.5552s data_time=0.1218s it/s=1.477 eta_to_10000=3052.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0052 grad_action_out_proj_arms=0.0604 grad_arm_token_fuse=0.0233 grad_shared_expert=0.1753 (18633:train_pytorch.py:850) + Training: 55%|█████▍ | 5490/10000 [1:12:25<54:05, 1.39it/s, loss=0.0067, lr=1.29e-05, step=5489] Training: 55%|█████▍ | 5490/10000 [1:12:25<54:05, 1.39it/s, loss=0.0042, lr=1.29e-05, step=5490] Training: 55%|█████▍ | 5491/10000 [1:12:26<53:33, 1.40it/s, loss=0.0042, lr=1.29e-05, step=5490] Training: 55%|█████▍ | 5491/10000 [1:12:26<53:33, 1.40it/s, loss=0.0095, lr=1.29e-05, step=5491] Training: 55%|█████▍ | 5492/10000 [1:12:27<54:50, 1.37it/s, loss=0.0095, lr=1.29e-05, step=5491] Training: 55%|█████▍ | 5492/10000 [1:12:27<54:50, 1.37it/s, loss=0.0064, lr=1.29e-05, step=5492] Training: 55%|█████▍ | 5493/10000 [1:12:28<54:53, 1.37it/s, loss=0.0064, lr=1.29e-05, step=5492] Training: 55%|█████▍ | 5493/10000 [1:12:28<54:53, 1.37it/s, loss=0.0046, lr=1.29e-05, step=5493] Training: 55%|█████▍ | 5494/10000 [1:12:28<55:39, 1.35it/s, loss=0.0046, lr=1.29e-05, step=5493] Training: 55%|█████▍ | 5494/10000 [1:12:28<55:39, 1.35it/s, loss=0.0185, lr=1.28e-05, step=5494] Training: 55%|█████▍ | 5495/10000 [1:12:29<53:51, 1.39it/s, loss=0.0185, lr=1.28e-05, step=5494] Training: 55%|█████▍ | 5495/10000 [1:12:29<53:51, 1.39it/s, loss=0.0120, lr=1.28e-05, step=5495] Training: 55%|█████▍ | 5496/10000 [1:12:30<59:28, 1.26it/s, loss=0.0120, lr=1.28e-05, step=5495] Training: 55%|█████▍ | 5496/10000 [1:12:30<59:28, 1.26it/s, loss=0.0040, lr=1.28e-05, step=5496] Training: 55%|█████▍ | 5497/10000 [1:12:31<57:24, 1.31it/s, loss=0.0040, lr=1.28e-05, step=5496] Training: 55%|█████▍ | 5497/10000 [1:12:31<57:24, 1.31it/s, loss=0.0130, lr=1.28e-05, step=5497] Training: 55%|█████▍ | 5498/10000 [1:12:31<51:25, 1.46it/s, loss=0.0130, lr=1.28e-05, step=5497] Training: 55%|█████▍ | 5498/10000 [1:12:31<51:25, 1.46it/s, loss=0.0134, lr=1.28e-05, step=5498] Training: 55%|█████▍ | 5499/10000 [1:12:32<50:41, 1.48it/s, loss=0.0134, lr=1.28e-05, step=5498] Training: 55%|█████▍ | 5499/10000 [1:12:32<50:41, 1.48it/s, loss=0.0141, lr=1.28e-05, step=5499]19:57:04.821 [I] step=5500 loss=0.0041 smoothed_loss=0.0113 lr=1.28e-05 grad_norm=0.3669 step_time=0.5770s data_time=0.1391s it/s=1.397 eta_to_10000=3221.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.0906 grad_arm_token_fuse=0.0362 grad_shared_expert=0.3890 (18633:train_pytorch.py:850) + Training: 55%|█████▌ | 5500/10000 [1:12:32<51:30, 1.46it/s, loss=0.0141, lr=1.28e-05, step=5499] Training: 55%|█████▌ | 5500/10000 [1:12:32<51:30, 1.46it/s, loss=0.0041, lr=1.28e-05, step=5500] Training: 55%|█████▌ | 5501/10000 [1:12:33<50:08, 1.50it/s, loss=0.0041, lr=1.28e-05, step=5500] Training: 55%|█████▌ | 5501/10000 [1:12:33<50:08, 1.50it/s, loss=0.0085, lr=1.28e-05, step=5501] Training: 55%|█████▌ | 5502/10000 [1:12:34<51:02, 1.47it/s, loss=0.0085, lr=1.28e-05, step=5501] Training: 55%|█████▌ | 5502/10000 [1:12:34<51:02, 1.47it/s, loss=0.0088, lr=1.28e-05, step=5502] Training: 55%|█████▌ | 5503/10000 [1:12:35<56:34, 1.32it/s, loss=0.0088, lr=1.28e-05, step=5502] Training: 55%|█████▌ | 5503/10000 [1:12:35<56:34, 1.32it/s, loss=0.0064, lr=1.28e-05, step=5503] Training: 55%|█████▌ | 5504/10000 [1:12:35<51:05, 1.47it/s, loss=0.0064, lr=1.28e-05, step=5503] Training: 55%|█████▌ | 5504/10000 [1:12:35<51:05, 1.47it/s, loss=0.0104, lr=1.28e-05, step=5504] Training: 55%|█████▌ | 5505/10000 [1:12:36<55:22, 1.35it/s, loss=0.0104, lr=1.28e-05, step=5504] Training: 55%|█████▌ | 5505/10000 [1:12:36<55:22, 1.35it/s, loss=0.0068, lr=1.28e-05, step=5505] Training: 55%|█████▌ | 5506/10000 [1:12:37<55:50, 1.34it/s, loss=0.0068, lr=1.28e-05, step=5505] Training: 55%|█████▌ | 5506/10000 [1:12:37<55:50, 1.34it/s, loss=0.0064, lr=1.28e-05, step=5506] Training: 55%|█████▌ | 5507/10000 [1:12:38<57:47, 1.30it/s, loss=0.0064, lr=1.28e-05, step=5506] Training: 55%|█████▌ | 5507/10000 [1:12:38<57:47, 1.30it/s, loss=0.0042, lr=1.28e-05, step=5507] Training: 55%|█████▌ | 5508/10000 [1:12:38<57:37, 1.30it/s, loss=0.0042, lr=1.28e-05, step=5507] Training: 55%|█████▌ | 5508/10000 [1:12:38<57:37, 1.30it/s, loss=0.0151, lr=1.28e-05, step=5508] Training: 55%|█████▌ | 5509/10000 [1:12:39<57:37, 1.30it/s, loss=0.0151, lr=1.28e-05, step=5508] Training: 55%|█████▌ | 5509/10000 [1:12:39<57:37, 1.30it/s, loss=0.0132, lr=1.28e-05, step=5509]19:57:12.213 [I] step=5510 loss=0.0355 smoothed_loss=0.0126 lr=1.28e-05 grad_norm=0.5534 step_time=0.6090s data_time=0.1303s it/s=1.353 eta_to_10000=3318.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0082 grad_action_out_proj_arms=0.0872 grad_arm_token_fuse=0.0416 grad_shared_expert=0.6273 (18633:train_pytorch.py:850) + Training: 55%|█████▌ | 5510/10000 [1:12:40<54:10, 1.38it/s, loss=0.0132, lr=1.28e-05, step=5509] Training: 55%|█████▌ | 5510/10000 [1:12:40<54:10, 1.38it/s, loss=0.0355, lr=1.28e-05, step=5510] Training: 55%|█████▌ | 5511/10000 [1:12:41<54:41, 1.37it/s, loss=0.0355, lr=1.28e-05, step=5510] Training: 55%|█████▌ | 5511/10000 [1:12:41<54:41, 1.37it/s, loss=0.0029, lr=1.28e-05, step=5511] Training: 55%|█████▌ | 5512/10000 [1:12:41<53:44, 1.39it/s, loss=0.0029, lr=1.28e-05, step=5511] Training: 55%|█████▌ | 5512/10000 [1:12:41<53:44, 1.39it/s, loss=0.0267, lr=1.28e-05, step=5512] Training: 55%|█████▌ | 5513/10000 [1:12:42<55:59, 1.34it/s, loss=0.0267, lr=1.28e-05, step=5512] Training: 55%|█████▌ | 5513/10000 [1:12:42<55:59, 1.34it/s, loss=0.0041, lr=1.28e-05, step=5513] Training: 55%|█████▌ | 5514/10000 [1:12:43<1:02:22, 1.20it/s, loss=0.0041, lr=1.28e-05, step=5513] Training: 55%|█████▌ | 5514/10000 [1:12:43<1:02:22, 1.20it/s, loss=0.0175, lr=1.28e-05, step=5514] Training: 55%|█████▌ | 5515/10000 [1:12:44<1:08:35, 1.09it/s, loss=0.0175, lr=1.28e-05, step=5514] Training: 55%|█████▌ | 5515/10000 [1:12:44<1:08:35, 1.09it/s, loss=0.0056, lr=1.28e-05, step=5515] Training: 55%|█████▌ | 5516/10000 [1:12:45<59:17, 1.26it/s, loss=0.0056, lr=1.28e-05, step=5515] Training: 55%|█████▌ | 5516/10000 [1:12:45<59:17, 1.26it/s, loss=0.0052, lr=1.28e-05, step=5516] Training: 55%|█████▌ | 5517/10000 [1:12:45<57:14, 1.31it/s, loss=0.0052, lr=1.28e-05, step=5516] Training: 55%|█████▌ | 5517/10000 [1:12:45<57:14, 1.31it/s, loss=0.0141, lr=1.28e-05, step=5517] Training: 55%|█████▌ | 5518/10000 [1:12:46<58:09, 1.28it/s, loss=0.0141, lr=1.28e-05, step=5517] Training: 55%|█████▌ | 5518/10000 [1:12:46<58:09, 1.28it/s, loss=0.0102, lr=1.28e-05, step=5518] Training: 55%|█████▌ | 5519/10000 [1:12:47<51:39, 1.45it/s, loss=0.0102, lr=1.28e-05, step=5518] Training: 55%|█████▌ | 5519/10000 [1:12:47<51:39, 1.45it/s, loss=0.0093, lr=1.28e-05, step=5519]19:57:19.638 [I] step=5520 loss=0.0075 smoothed_loss=0.0109 lr=1.28e-05 grad_norm=0.4629 step_time=0.6131s data_time=0.1295s it/s=1.347 eta_to_10000=3326.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0089 grad_action_out_proj_arms=0.0793 grad_arm_token_fuse=0.0436 grad_shared_expert=0.2225 (18633:train_pytorch.py:850) + Training: 55%|█████▌ | 5520/10000 [1:12:47<47:49, 1.56it/s, loss=0.0093, lr=1.28e-05, step=5519] Training: 55%|█████▌ | 5520/10000 [1:12:47<47:49, 1.56it/s, loss=0.0075, lr=1.28e-05, step=5520] Training: 55%|█████▌ | 5521/10000 [1:12:48<48:24, 1.54it/s, loss=0.0075, lr=1.28e-05, step=5520] Training: 55%|█████▌ | 5521/10000 [1:12:48<48:24, 1.54it/s, loss=0.0317, lr=1.27e-05, step=5521] Training: 55%|█████▌ | 5522/10000 [1:12:49<59:47, 1.25it/s, loss=0.0317, lr=1.27e-05, step=5521] Training: 55%|█████▌ | 5522/10000 [1:12:49<59:47, 1.25it/s, loss=0.0099, lr=1.27e-05, step=5522] Training: 55%|█████▌ | 5523/10000 [1:12:50<1:00:52, 1.23it/s, loss=0.0099, lr=1.27e-05, step=5522] Training: 55%|█████▌ | 5523/10000 [1:12:50<1:00:52, 1.23it/s, loss=0.0167, lr=1.27e-05, step=5523] Training: 55%|█████▌ | 5524/10000 [1:12:51<59:24, 1.26it/s, loss=0.0167, lr=1.27e-05, step=5523] Training: 55%|█████▌ | 5524/10000 [1:12:51<59:24, 1.26it/s, loss=0.0026, lr=1.27e-05, step=5524] Training: 55%|█████▌ | 5525/10000 [1:12:52<1:00:39, 1.23it/s, loss=0.0026, lr=1.27e-05, step=5524] Training: 55%|█████▌ | 5525/10000 [1:12:52<1:00:39, 1.23it/s, loss=0.0055, lr=1.27e-05, step=5525] Training: 55%|█████▌ | 5526/10000 [1:12:53<1:04:42, 1.15it/s, loss=0.0055, lr=1.27e-05, step=5525] Training: 55%|█████▌ | 5526/10000 [1:12:53<1:04:42, 1.15it/s, loss=0.0079, lr=1.27e-05, step=5526] Training: 55%|█████▌ | 5527/10000 [1:12:53<56:55, 1.31it/s, loss=0.0079, lr=1.27e-05, step=5526] Training: 55%|█████▌ | 5527/10000 [1:12:53<56:55, 1.31it/s, loss=0.0035, lr=1.27e-05, step=5527] Training: 55%|█████▌ | 5528/10000 [1:12:54<54:45, 1.36it/s, loss=0.0035, lr=1.27e-05, step=5527] Training: 55%|█████▌ | 5528/10000 [1:12:54<54:45, 1.36it/s, loss=0.0036, lr=1.27e-05, step=5528] Training: 55%|█████▌ | 5529/10000 [1:12:55<57:10, 1.30it/s, loss=0.0036, lr=1.27e-05, step=5528] Training: 55%|█████▌ | 5529/10000 [1:12:55<57:10, 1.30it/s, loss=0.0149, lr=1.27e-05, step=5529]19:57:27.467 [I] step=5530 loss=0.0194 smoothed_loss=0.0111 lr=1.27e-05 grad_norm=0.6269 step_time=0.6332s data_time=0.1497s it/s=1.278 eta_to_10000=3498.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0153 grad_action_out_proj_arms=0.1737 grad_arm_token_fuse=0.0775 grad_shared_expert=0.6493 (18633:train_pytorch.py:850) + Training: 55%|█████▌ | 5530/10000 [1:12:55<51:44, 1.44it/s, loss=0.0149, lr=1.27e-05, step=5529] Training: 55%|█████▌ | 5530/10000 [1:12:55<51:44, 1.44it/s, loss=0.0194, lr=1.27e-05, step=5530] Training: 55%|█████▌ | 5531/10000 [1:12:56<51:16, 1.45it/s, loss=0.0194, lr=1.27e-05, step=5530] Training: 55%|█████▌ | 5531/10000 [1:12:56<51:16, 1.45it/s, loss=0.0177, lr=1.27e-05, step=5531] Training: 55%|█████▌ | 5532/10000 [1:12:56<47:16, 1.58it/s, loss=0.0177, lr=1.27e-05, step=5531] Training: 55%|█████▌ | 5532/10000 [1:12:56<47:16, 1.58it/s, loss=0.0073, lr=1.27e-05, step=5532] Training: 55%|█████▌ | 5533/10000 [1:12:57<47:40, 1.56it/s, loss=0.0073, lr=1.27e-05, step=5532] Training: 55%|█████▌ | 5533/10000 [1:12:57<47:40, 1.56it/s, loss=0.0060, lr=1.27e-05, step=5533] Training: 55%|█████▌ | 5534/10000 [1:12:58<49:23, 1.51it/s, loss=0.0060, lr=1.27e-05, step=5533] Training: 55%|█████▌ | 5534/10000 [1:12:58<49:23, 1.51it/s, loss=0.0020, lr=1.27e-05, step=5534] Training: 55%|█████▌ | 5535/10000 [1:12:58<45:43, 1.63it/s, loss=0.0020, lr=1.27e-05, step=5534] Training: 55%|█████▌ | 5535/10000 [1:12:58<45:43, 1.63it/s, loss=0.0705, lr=1.27e-05, step=5535] Training: 55%|█████▌ | 5536/10000 [1:12:59<55:43, 1.34it/s, loss=0.0705, lr=1.27e-05, step=5535] Training: 55%|█████▌ | 5536/10000 [1:12:59<55:43, 1.34it/s, loss=0.0090, lr=1.27e-05, step=5536] Training: 55%|█████▌ | 5537/10000 [1:13:00<49:53, 1.49it/s, loss=0.0090, lr=1.27e-05, step=5536] Training: 55%|█████▌ | 5537/10000 [1:13:00<49:53, 1.49it/s, loss=0.0355, lr=1.27e-05, step=5537] Training: 55%|█████▌ | 5538/10000 [1:13:00<47:26, 1.57it/s, loss=0.0355, lr=1.27e-05, step=5537] Training: 55%|█████▌ | 5538/10000 [1:13:00<47:26, 1.57it/s, loss=0.0362, lr=1.27e-05, step=5538] Training: 55%|█████▌ | 5539/10000 [1:13:01<49:36, 1.50it/s, loss=0.0362, lr=1.27e-05, step=5538] Training: 55%|█████▌ | 5539/10000 [1:13:01<49:36, 1.50it/s, loss=0.0115, lr=1.27e-05, step=5539]19:57:33.920 [I] step=5540 loss=0.0029 smoothed_loss=0.0168 lr=1.27e-05 grad_norm=0.5353 step_time=0.5269s data_time=0.1183s it/s=1.550 eta_to_10000=2877.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0147 grad_action_out_proj_arms=0.0979 grad_arm_token_fuse=0.0712 grad_shared_expert=0.3811 (18633:train_pytorch.py:850) + Training: 55%|█████▌ | 5540/10000 [1:13:02<46:57, 1.58it/s, loss=0.0115, lr=1.27e-05, step=5539] Training: 55%|█████▌ | 5540/10000 [1:13:02<46:57, 1.58it/s, loss=0.0029, lr=1.27e-05, step=5540] Training: 55%|█████▌ | 5541/10000 [1:13:03<55:52, 1.33it/s, loss=0.0029, lr=1.27e-05, step=5540] Training: 55%|█████▌ | 5541/10000 [1:13:03<55:52, 1.33it/s, loss=0.0269, lr=1.27e-05, step=5541] Training: 55%|█████▌ | 5542/10000 [1:13:03<52:18, 1.42it/s, loss=0.0269, lr=1.27e-05, step=5541] Training: 55%|█████▌ | 5542/10000 [1:13:03<52:18, 1.42it/s, loss=0.0012, lr=1.27e-05, step=5542] Training: 55%|█████▌ | 5543/10000 [1:13:04<1:00:11, 1.23it/s, loss=0.0012, lr=1.27e-05, step=5542] Training: 55%|█████▌ | 5543/10000 [1:13:04<1:00:11, 1.23it/s, loss=0.0179, lr=1.27e-05, step=5543] Training: 55%|█████▌ | 5544/10000 [1:13:05<59:33, 1.25it/s, loss=0.0179, lr=1.27e-05, step=5543] Training: 55%|█████▌ | 5544/10000 [1:13:05<59:33, 1.25it/s, loss=0.0164, lr=1.27e-05, step=5544] Training: 55%|█████▌ | 5545/10000 [1:13:06<59:23, 1.25it/s, loss=0.0164, lr=1.27e-05, step=5544] Training: 55%|█████▌ | 5545/10000 [1:13:06<59:23, 1.25it/s, loss=0.0170, lr=1.27e-05, step=5545] Training: 55%|█████▌ | 5546/10000 [1:13:07<1:00:42, 1.22it/s, loss=0.0170, lr=1.27e-05, step=5545] Training: 55%|█████▌ | 5546/10000 [1:13:07<1:00:42, 1.22it/s, loss=0.0046, lr=1.27e-05, step=5546] Training: 55%|█████▌ | 5547/10000 [1:13:08<1:01:01, 1.22it/s, loss=0.0046, lr=1.27e-05, step=5546] Training: 55%|█████▌ | 5547/10000 [1:13:08<1:01:01, 1.22it/s, loss=0.0030, lr=1.27e-05, step=5547] Training: 55%|█████▌ | 5548/10000 [1:13:08<1:02:46, 1.18it/s, loss=0.0030, lr=1.27e-05, step=5547] Training: 55%|█████▌ | 5548/10000 [1:13:08<1:02:46, 1.18it/s, loss=0.0038, lr=1.26e-05, step=5548] Training: 55%|█████▌ | 5549/10000 [1:13:09<57:33, 1.29it/s, loss=0.0038, lr=1.26e-05, step=5548] Training: 55%|█████▌ | 5549/10000 [1:13:09<57:33, 1.29it/s, loss=0.0048, lr=1.26e-05, step=5549]19:57:42.149 [I] step=5550 loss=0.0018 smoothed_loss=0.0112 lr=1.27e-05 grad_norm=0.4497 step_time=0.6437s data_time=0.1793s it/s=1.215 eta_to_10000=3661.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0047 grad_action_out_proj_arms=0.0611 grad_arm_token_fuse=0.0247 grad_shared_expert=0.1649 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5550/10000 [1:13:10<57:15, 1.30it/s, loss=0.0048, lr=1.26e-05, step=5549] Training: 56%|█████▌ | 5550/10000 [1:13:10<57:15, 1.30it/s, loss=0.0018, lr=1.26e-05, step=5550] Training: 56%|█████▌ | 5551/10000 [1:13:10<50:45, 1.46it/s, loss=0.0018, lr=1.26e-05, step=5550] Training: 56%|█████▌ | 5551/10000 [1:13:10<50:45, 1.46it/s, loss=0.0035, lr=1.26e-05, step=5551] Training: 56%|█████▌ | 5552/10000 [1:13:11<53:56, 1.37it/s, loss=0.0035, lr=1.26e-05, step=5551] Training: 56%|█████▌ | 5552/10000 [1:13:11<53:56, 1.37it/s, loss=0.0548, lr=1.26e-05, step=5552] Training: 56%|█████▌ | 5553/10000 [1:13:12<53:50, 1.38it/s, loss=0.0548, lr=1.26e-05, step=5552] Training: 56%|█████▌ | 5553/10000 [1:13:12<53:50, 1.38it/s, loss=0.0069, lr=1.26e-05, step=5553] Training: 56%|█████▌ | 5554/10000 [1:13:12<48:52, 1.52it/s, loss=0.0069, lr=1.26e-05, step=5553] Training: 56%|█████▌ | 5554/10000 [1:13:12<48:52, 1.52it/s, loss=0.0190, lr=1.26e-05, step=5554] Training: 56%|█████▌ | 5555/10000 [1:13:13<54:40, 1.36it/s, loss=0.0190, lr=1.26e-05, step=5554] Training: 56%|█████▌ | 5555/10000 [1:13:13<54:40, 1.36it/s, loss=0.0072, lr=1.26e-05, step=5555] Training: 56%|█████▌ | 5556/10000 [1:13:14<48:55, 1.51it/s, loss=0.0072, lr=1.26e-05, step=5555] Training: 56%|█████▌ | 5556/10000 [1:13:14<48:55, 1.51it/s, loss=0.0094, lr=1.26e-05, step=5556] Training: 56%|█████▌ | 5557/10000 [1:13:15<54:25, 1.36it/s, loss=0.0094, lr=1.26e-05, step=5556] Training: 56%|█████▌ | 5557/10000 [1:13:15<54:25, 1.36it/s, loss=0.0195, lr=1.26e-05, step=5557] Training: 56%|█████▌ | 5558/10000 [1:13:15<49:33, 1.49it/s, loss=0.0195, lr=1.26e-05, step=5557] Training: 56%|█████▌ | 5558/10000 [1:13:15<49:33, 1.49it/s, loss=0.0085, lr=1.26e-05, step=5558] Training: 56%|█████▌ | 5559/10000 [1:13:16<48:27, 1.53it/s, loss=0.0085, lr=1.26e-05, step=5558] Training: 56%|█████▌ | 5559/10000 [1:13:16<48:27, 1.53it/s, loss=0.0137, lr=1.26e-05, step=5559]19:57:48.847 [I] step=5560 loss=0.0017 smoothed_loss=0.0123 lr=1.26e-05 grad_norm=0.4623 step_time=0.5327s data_time=0.1371s it/s=1.493 eta_to_10000=2973.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0126 grad_action_out_proj_arms=0.0992 grad_arm_token_fuse=0.0641 grad_shared_expert=0.3364 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5560/10000 [1:13:17<49:48, 1.49it/s, loss=0.0137, lr=1.26e-05, step=5559] Training: 56%|█████▌ | 5560/10000 [1:13:17<49:48, 1.49it/s, loss=0.0017, lr=1.26e-05, step=5560] Training: 56%|█████▌ | 5561/10000 [1:13:17<50:31, 1.46it/s, loss=0.0017, lr=1.26e-05, step=5560] Training: 56%|█████▌ | 5561/10000 [1:13:17<50:31, 1.46it/s, loss=0.0046, lr=1.26e-05, step=5561] Training: 56%|█████▌ | 5562/10000 [1:13:18<49:48, 1.49it/s, loss=0.0046, lr=1.26e-05, step=5561] Training: 56%|█████▌ | 5562/10000 [1:13:18<49:48, 1.49it/s, loss=0.0079, lr=1.26e-05, step=5562] Training: 56%|█████▌ | 5563/10000 [1:13:18<46:26, 1.59it/s, loss=0.0079, lr=1.26e-05, step=5562] Training: 56%|█████▌ | 5563/10000 [1:13:18<46:26, 1.59it/s, loss=0.0038, lr=1.26e-05, step=5563] Training: 56%|█████▌ | 5564/10000 [1:13:19<50:54, 1.45it/s, loss=0.0038, lr=1.26e-05, step=5563] Training: 56%|█████▌ | 5564/10000 [1:13:19<50:54, 1.45it/s, loss=0.0108, lr=1.26e-05, step=5564] Training: 56%|█████▌ | 5565/10000 [1:13:20<1:00:23, 1.22it/s, loss=0.0108, lr=1.26e-05, step=5564] Training: 56%|█████▌ | 5565/10000 [1:13:20<1:00:23, 1.22it/s, loss=0.0185, lr=1.26e-05, step=5565] Training: 56%|█████▌ | 5566/10000 [1:13:21<54:00, 1.37it/s, loss=0.0185, lr=1.26e-05, step=5565] Training: 56%|█████▌ | 5566/10000 [1:13:21<54:00, 1.37it/s, loss=0.0228, lr=1.26e-05, step=5566] Training: 56%|█████▌ | 5567/10000 [1:13:22<55:13, 1.34it/s, loss=0.0228, lr=1.26e-05, step=5566] Training: 56%|█████▌ | 5567/10000 [1:13:22<55:13, 1.34it/s, loss=0.0018, lr=1.26e-05, step=5567] Training: 56%|█████▌ | 5568/10000 [1:13:22<50:05, 1.47it/s, loss=0.0018, lr=1.26e-05, step=5567] Training: 56%|█████▌ | 5568/10000 [1:13:22<50:05, 1.47it/s, loss=0.0164, lr=1.26e-05, step=5568] Training: 56%|█████▌ | 5569/10000 [1:13:23<53:06, 1.39it/s, loss=0.0164, lr=1.26e-05, step=5568] Training: 56%|█████▌ | 5569/10000 [1:13:23<53:06, 1.39it/s, loss=0.0060, lr=1.26e-05, step=5569]19:57:56.266 [I] step=5570 loss=0.0034 smoothed_loss=0.0105 lr=1.26e-05 grad_norm=0.4741 step_time=0.5857s data_time=0.1562s it/s=1.348 eta_to_10000=3286.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.0621 grad_arm_token_fuse=0.0344 grad_shared_expert=0.4023 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5570/10000 [1:13:24<58:07, 1.27it/s, loss=0.0060, lr=1.26e-05, step=5569] Training: 56%|█████▌ | 5570/10000 [1:13:24<58:07, 1.27it/s, loss=0.0034, lr=1.26e-05, step=5570] Training: 56%|█████▌ | 5571/10000 [1:13:25<1:01:51, 1.19it/s, loss=0.0034, lr=1.26e-05, step=5570] Training: 56%|█████▌ | 5571/10000 [1:13:25<1:01:51, 1.19it/s, loss=0.0185, lr=1.26e-05, step=5571] Training: 56%|█████▌ | 5572/10000 [1:13:26<1:03:57, 1.15it/s, loss=0.0185, lr=1.26e-05, step=5571] Training: 56%|█████▌ | 5572/10000 [1:13:26<1:03:57, 1.15it/s, loss=0.0046, lr=1.26e-05, step=5572] Training: 56%|█████▌ | 5573/10000 [1:13:27<1:01:01, 1.21it/s, loss=0.0046, lr=1.26e-05, step=5572] Training: 56%|█████▌ | 5573/10000 [1:13:27<1:01:01, 1.21it/s, loss=0.0078, lr=1.26e-05, step=5573] Training: 56%|█████▌ | 5574/10000 [1:13:27<53:36, 1.38it/s, loss=0.0078, lr=1.26e-05, step=5573] Training: 56%|█████▌ | 5574/10000 [1:13:27<53:36, 1.38it/s, loss=0.0201, lr=1.26e-05, step=5574] Training: 56%|█████▌ | 5575/10000 [1:13:28<56:42, 1.30it/s, loss=0.0201, lr=1.26e-05, step=5574] Training: 56%|█████▌ | 5575/10000 [1:13:28<56:42, 1.30it/s, loss=0.0040, lr=1.25e-05, step=5575] Training: 56%|█████▌ | 5576/10000 [1:13:29<1:00:55, 1.21it/s, loss=0.0040, lr=1.25e-05, step=5575] Training: 56%|█████▌ | 5576/10000 [1:13:29<1:00:55, 1.21it/s, loss=0.0066, lr=1.25e-05, step=5576] Training: 56%|█████▌ | 5577/10000 [1:13:30<1:02:47, 1.17it/s, loss=0.0066, lr=1.25e-05, step=5576] Training: 56%|█████▌ | 5577/10000 [1:13:30<1:02:47, 1.17it/s, loss=0.0156, lr=1.25e-05, step=5577] Training: 56%|█████▌ | 5578/10000 [1:13:31<1:07:07, 1.10it/s, loss=0.0156, lr=1.25e-05, step=5577] Training: 56%|█████▌ | 5578/10000 [1:13:31<1:07:07, 1.10it/s, loss=0.0289, lr=1.25e-05, step=5578] Training: 56%|█████▌ | 5579/10000 [1:13:32<1:03:45, 1.16it/s, loss=0.0289, lr=1.25e-05, step=5578] Training: 56%|█████▌ | 5579/10000 [1:13:32<1:03:45, 1.16it/s, loss=0.0072, lr=1.25e-05, step=5579]19:58:04.699 [I] step=5580 loss=0.0041 smoothed_loss=0.0112 lr=1.25e-05 grad_norm=0.4585 step_time=0.6579s data_time=0.1855s it/s=1.186 eta_to_10000=3727.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0188 grad_action_out_proj_arms=0.1398 grad_arm_token_fuse=0.0979 grad_shared_expert=0.4290 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5580/10000 [1:13:32<1:01:37, 1.20it/s, loss=0.0072, lr=1.25e-05, step=5579] Training: 56%|█████▌ | 5580/10000 [1:13:32<1:01:37, 1.20it/s, loss=0.0041, lr=1.25e-05, step=5580] Training: 56%|█████▌ | 5581/10000 [1:13:33<1:00:26, 1.22it/s, loss=0.0041, lr=1.25e-05, step=5580] Training: 56%|█████▌ | 5581/10000 [1:13:33<1:00:26, 1.22it/s, loss=0.0073, lr=1.25e-05, step=5581] Training: 56%|█████▌ | 5582/10000 [1:13:34<53:54, 1.37it/s, loss=0.0073, lr=1.25e-05, step=5581] Training: 56%|█████▌ | 5582/10000 [1:13:34<53:54, 1.37it/s, loss=0.0136, lr=1.25e-05, step=5582] Training: 56%|█████▌ | 5583/10000 [1:13:34<52:29, 1.40it/s, loss=0.0136, lr=1.25e-05, step=5582] Training: 56%|█████▌ | 5583/10000 [1:13:34<52:29, 1.40it/s, loss=0.0109, lr=1.25e-05, step=5583] Training: 56%|█████▌ | 5584/10000 [1:13:35<47:50, 1.54it/s, loss=0.0109, lr=1.25e-05, step=5583] Training: 56%|█████▌ | 5584/10000 [1:13:35<47:50, 1.54it/s, loss=0.0100, lr=1.25e-05, step=5584] Training: 56%|█████▌ | 5585/10000 [1:13:35<44:52, 1.64it/s, loss=0.0100, lr=1.25e-05, step=5584] Training: 56%|█████▌ | 5585/10000 [1:13:35<44:52, 1.64it/s, loss=0.0036, lr=1.25e-05, step=5585] Training: 56%|█████▌ | 5586/10000 [1:13:36<52:48, 1.39it/s, loss=0.0036, lr=1.25e-05, step=5585] Training: 56%|█████▌ | 5586/10000 [1:13:36<52:48, 1.39it/s, loss=0.0287, lr=1.25e-05, step=5586] Training: 56%|█████▌ | 5587/10000 [1:13:37<48:18, 1.52it/s, loss=0.0287, lr=1.25e-05, step=5586] Training: 56%|█████▌ | 5587/10000 [1:13:37<48:18, 1.52it/s, loss=0.0258, lr=1.25e-05, step=5587] Training: 56%|█████▌ | 5588/10000 [1:13:37<46:49, 1.57it/s, loss=0.0258, lr=1.25e-05, step=5587] Training: 56%|█████▌ | 5588/10000 [1:13:37<46:49, 1.57it/s, loss=0.0027, lr=1.25e-05, step=5588] Training: 56%|█████▌ | 5589/10000 [1:13:38<45:19, 1.62it/s, loss=0.0027, lr=1.25e-05, step=5588] Training: 56%|█████▌ | 5589/10000 [1:13:38<45:19, 1.62it/s, loss=0.0049, lr=1.25e-05, step=5589]19:58:10.990 [I] step=5590 loss=0.0189 smoothed_loss=0.0123 lr=1.25e-05 grad_norm=0.5136 step_time=0.5263s data_time=0.1028s it/s=1.590 eta_to_10000=2773.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.1079 grad_arm_token_fuse=0.0498 grad_shared_expert=0.3615 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5590/10000 [1:13:39<46:02, 1.60it/s, loss=0.0049, lr=1.25e-05, step=5589] Training: 56%|█████▌ | 5590/10000 [1:13:39<46:02, 1.60it/s, loss=0.0189, lr=1.25e-05, step=5590] Training: 56%|█████▌ | 5591/10000 [1:13:40<53:21, 1.38it/s, loss=0.0189, lr=1.25e-05, step=5590] Training: 56%|█████▌ | 5591/10000 [1:13:40<53:21, 1.38it/s, loss=0.0087, lr=1.25e-05, step=5591] Training: 56%|█████▌ | 5592/10000 [1:13:40<50:57, 1.44it/s, loss=0.0087, lr=1.25e-05, step=5591] Training: 56%|█████▌ | 5592/10000 [1:13:40<50:57, 1.44it/s, loss=0.0186, lr=1.25e-05, step=5592] Training: 56%|█████▌ | 5593/10000 [1:13:41<58:29, 1.26it/s, loss=0.0186, lr=1.25e-05, step=5592] Training: 56%|█████▌ | 5593/10000 [1:13:41<58:29, 1.26it/s, loss=0.0018, lr=1.25e-05, step=5593] Training: 56%|█████▌ | 5594/10000 [1:13:42<55:14, 1.33it/s, loss=0.0018, lr=1.25e-05, step=5593] Training: 56%|█████▌ | 5594/10000 [1:13:42<55:14, 1.33it/s, loss=0.0322, lr=1.25e-05, step=5594] Training: 56%|█████▌ | 5595/10000 [1:13:42<49:15, 1.49it/s, loss=0.0322, lr=1.25e-05, step=5594] Training: 56%|█████▌ | 5595/10000 [1:13:42<49:15, 1.49it/s, loss=0.0034, lr=1.25e-05, step=5595] Training: 56%|█████▌ | 5596/10000 [1:13:43<45:17, 1.62it/s, loss=0.0034, lr=1.25e-05, step=5595] Training: 56%|█████▌ | 5596/10000 [1:13:43<45:17, 1.62it/s, loss=0.0046, lr=1.25e-05, step=5596] Training: 56%|█████▌ | 5597/10000 [1:13:43<42:18, 1.73it/s, loss=0.0046, lr=1.25e-05, step=5596] Training: 56%|█████▌ | 5597/10000 [1:13:43<42:18, 1.73it/s, loss=0.0119, lr=1.25e-05, step=5597] Training: 56%|█████▌ | 5598/10000 [1:13:44<44:02, 1.67it/s, loss=0.0119, lr=1.25e-05, step=5597] Training: 56%|█████▌ | 5598/10000 [1:13:44<44:02, 1.67it/s, loss=0.0053, lr=1.25e-05, step=5598] Training: 56%|█████▌ | 5599/10000 [1:13:45<41:59, 1.75it/s, loss=0.0053, lr=1.25e-05, step=5598] Training: 56%|█████▌ | 5599/10000 [1:13:45<41:59, 1.75it/s, loss=0.0094, lr=1.25e-05, step=5599]19:58:17.595 [I] step=5600 loss=0.0155 smoothed_loss=0.0114 lr=1.25e-05 grad_norm=0.5031 step_time=0.5583s data_time=0.1023s it/s=1.514 eta_to_10000=2906.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0356 grad_action_out_proj_arms=0.2242 grad_arm_token_fuse=0.1878 grad_shared_expert=0.5690 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5600/10000 [1:13:45<45:21, 1.62it/s, loss=0.0094, lr=1.25e-05, step=5599] Training: 56%|█████▌ | 5600/10000 [1:13:45<45:21, 1.62it/s, loss=0.0155, lr=1.25e-05, step=5600] Training: 56%|█████▌ | 5601/10000 [1:13:46<42:52, 1.71it/s, loss=0.0155, lr=1.25e-05, step=5600] Training: 56%|█████▌ | 5601/10000 [1:13:46<42:52, 1.71it/s, loss=0.0397, lr=1.25e-05, step=5601] Training: 56%|█████▌ | 5602/10000 [1:13:47<47:18, 1.55it/s, loss=0.0397, lr=1.25e-05, step=5601] Training: 56%|█████▌ | 5602/10000 [1:13:47<47:18, 1.55it/s, loss=0.0044, lr=1.24e-05, step=5602] Training: 56%|█████▌ | 5603/10000 [1:13:47<49:08, 1.49it/s, loss=0.0044, lr=1.24e-05, step=5602] Training: 56%|█████▌ | 5603/10000 [1:13:47<49:08, 1.49it/s, loss=0.0069, lr=1.24e-05, step=5603] Training: 56%|█████▌ | 5604/10000 [1:13:48<46:07, 1.59it/s, loss=0.0069, lr=1.24e-05, step=5603] Training: 56%|█████▌ | 5604/10000 [1:13:48<46:07, 1.59it/s, loss=0.0072, lr=1.24e-05, step=5604] Training: 56%|█████▌ | 5605/10000 [1:13:49<52:31, 1.39it/s, loss=0.0072, lr=1.24e-05, step=5604] Training: 56%|█████▌ | 5605/10000 [1:13:49<52:31, 1.39it/s, loss=0.0247, lr=1.24e-05, step=5605] Training: 56%|█████▌ | 5606/10000 [1:13:49<51:02, 1.43it/s, loss=0.0247, lr=1.24e-05, step=5605] Training: 56%|█████▌ | 5606/10000 [1:13:49<51:02, 1.43it/s, loss=0.0105, lr=1.24e-05, step=5606] Training: 56%|█████▌ | 5607/10000 [1:13:50<51:28, 1.42it/s, loss=0.0105, lr=1.24e-05, step=5606] Training: 56%|█████▌ | 5607/10000 [1:13:50<51:28, 1.42it/s, loss=0.0020, lr=1.24e-05, step=5607] Training: 56%|█████▌ | 5608/10000 [1:13:51<53:24, 1.37it/s, loss=0.0020, lr=1.24e-05, step=5607] Training: 56%|█████▌ | 5608/10000 [1:13:51<53:24, 1.37it/s, loss=0.0111, lr=1.24e-05, step=5608] Training: 56%|█████▌ | 5609/10000 [1:13:52<53:31, 1.37it/s, loss=0.0111, lr=1.24e-05, step=5608] Training: 56%|█████▌ | 5609/10000 [1:13:52<53:31, 1.37it/s, loss=0.0171, lr=1.24e-05, step=5609]19:58:24.810 [I] step=5610 loss=0.0092 smoothed_loss=0.0121 lr=1.24e-05 grad_norm=0.4149 step_time=0.5675s data_time=0.1541s it/s=1.386 eta_to_10000=3167.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.0729 grad_arm_token_fuse=0.0358 grad_shared_expert=0.2608 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5610/10000 [1:13:52<55:58, 1.31it/s, loss=0.0171, lr=1.24e-05, step=5609] Training: 56%|█████▌ | 5610/10000 [1:13:52<55:58, 1.31it/s, loss=0.0092, lr=1.24e-05, step=5610] Training: 56%|█████▌ | 5611/10000 [1:13:53<57:53, 1.26it/s, loss=0.0092, lr=1.24e-05, step=5610] Training: 56%|█████▌ | 5611/10000 [1:13:53<57:53, 1.26it/s, loss=0.0041, lr=1.24e-05, step=5611] Training: 56%|█████▌ | 5612/10000 [1:13:54<55:52, 1.31it/s, loss=0.0041, lr=1.24e-05, step=5611] Training: 56%|█████▌ | 5612/10000 [1:13:54<55:52, 1.31it/s, loss=0.0100, lr=1.24e-05, step=5612] Training: 56%|█████▌ | 5613/10000 [1:13:55<57:22, 1.27it/s, loss=0.0100, lr=1.24e-05, step=5612] Training: 56%|█████▌ | 5613/10000 [1:13:55<57:22, 1.27it/s, loss=0.0138, lr=1.24e-05, step=5613] Training: 56%|█████▌ | 5614/10000 [1:13:56<54:35, 1.34it/s, loss=0.0138, lr=1.24e-05, step=5613] Training: 56%|█████▌ | 5614/10000 [1:13:56<54:35, 1.34it/s, loss=0.0065, lr=1.24e-05, step=5614] Training: 56%|█████▌ | 5615/10000 [1:13:56<54:07, 1.35it/s, loss=0.0065, lr=1.24e-05, step=5614] Training: 56%|█████▌ | 5615/10000 [1:13:56<54:07, 1.35it/s, loss=0.0085, lr=1.24e-05, step=5615] Training: 56%|█████▌ | 5616/10000 [1:13:57<48:38, 1.50it/s, loss=0.0085, lr=1.24e-05, step=5615] Training: 56%|█████▌ | 5616/10000 [1:13:57<48:38, 1.50it/s, loss=0.0073, lr=1.24e-05, step=5616] Training: 56%|█████▌ | 5617/10000 [1:13:58<52:15, 1.40it/s, loss=0.0073, lr=1.24e-05, step=5616] Training: 56%|█████▌ | 5617/10000 [1:13:58<52:15, 1.40it/s, loss=0.0449, lr=1.24e-05, step=5617] Training: 56%|█████▌ | 5618/10000 [1:13:58<47:05, 1.55it/s, loss=0.0449, lr=1.24e-05, step=5617] Training: 56%|█████▌ | 5618/10000 [1:13:58<47:05, 1.55it/s, loss=0.0255, lr=1.24e-05, step=5618] Training: 56%|█████▌ | 5619/10000 [1:13:59<43:46, 1.67it/s, loss=0.0255, lr=1.24e-05, step=5618] Training: 56%|█████▌ | 5619/10000 [1:13:59<43:46, 1.67it/s, loss=0.0203, lr=1.24e-05, step=5619]19:58:31.749 [I] step=5620 loss=0.0128 smoothed_loss=0.0152 lr=1.24e-05 grad_norm=0.4405 step_time=0.5538s data_time=0.1401s it/s=1.441 eta_to_10000=3038.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.0828 grad_arm_token_fuse=0.0401 grad_shared_expert=0.2887 (18633:train_pytorch.py:850) + Training: 56%|█████▌ | 5620/10000 [1:13:59<49:43, 1.47it/s, loss=0.0203, lr=1.24e-05, step=5619] Training: 56%|█████▌ | 5620/10000 [1:13:59<49:43, 1.47it/s, loss=0.0128, lr=1.24e-05, step=5620] Training: 56%|█████▌ | 5621/10000 [1:14:00<54:38, 1.34it/s, loss=0.0128, lr=1.24e-05, step=5620] Training: 56%|█████▌ | 5621/10000 [1:14:00<54:38, 1.34it/s, loss=0.0165, lr=1.24e-05, step=5621] Training: 56%|█████▌ | 5622/10000 [1:14:01<1:02:58, 1.16it/s, loss=0.0165, lr=1.24e-05, step=5621] Training: 56%|█████▌ | 5622/10000 [1:14:01<1:02:58, 1.16it/s, loss=0.0049, lr=1.24e-05, step=5622] Training: 56%|█████▌ | 5623/10000 [1:14:02<54:59, 1.33it/s, loss=0.0049, lr=1.24e-05, step=5622] Training: 56%|█████▌ | 5623/10000 [1:14:02<54:59, 1.33it/s, loss=0.0048, lr=1.24e-05, step=5623] Training: 56%|█████▌ | 5624/10000 [1:14:03<53:58, 1.35it/s, loss=0.0048, lr=1.24e-05, step=5623] Training: 56%|█████▌ | 5624/10000 [1:14:03<53:58, 1.35it/s, loss=0.0128, lr=1.24e-05, step=5624] Training: 56%|█████▋ | 5625/10000 [1:14:03<55:11, 1.32it/s, loss=0.0128, lr=1.24e-05, step=5624] Training: 56%|█████▋ | 5625/10000 [1:14:03<55:11, 1.32it/s, loss=0.0056, lr=1.24e-05, step=5625] Training: 56%|█████▋ | 5626/10000 [1:14:04<54:35, 1.34it/s, loss=0.0056, lr=1.24e-05, step=5625] Training: 56%|█████▋ | 5626/10000 [1:14:04<54:35, 1.34it/s, loss=0.0447, lr=1.24e-05, step=5626] Training: 56%|█████▋ | 5627/10000 [1:14:05<49:07, 1.48it/s, loss=0.0447, lr=1.24e-05, step=5626] Training: 56%|█████▋ | 5627/10000 [1:14:05<49:07, 1.48it/s, loss=0.0029, lr=1.24e-05, step=5627] Training: 56%|█████▋ | 5628/10000 [1:14:06<56:28, 1.29it/s, loss=0.0029, lr=1.24e-05, step=5627] Training: 56%|█████▋ | 5628/10000 [1:14:06<56:28, 1.29it/s, loss=0.0091, lr=1.24e-05, step=5628] Training: 56%|█████▋ | 5629/10000 [1:14:07<1:00:39, 1.20it/s, loss=0.0091, lr=1.24e-05, step=5628] Training: 56%|█████▋ | 5629/10000 [1:14:07<1:00:39, 1.20it/s, loss=0.0093, lr=1.23e-05, step=5629]19:58:39.580 [I] step=5630 loss=0.0149 smoothed_loss=0.0136 lr=1.24e-05 grad_norm=0.4043 step_time=0.6099s data_time=0.1732s it/s=1.277 eta_to_10000=3421.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0153 grad_action_out_proj_arms=0.1507 grad_arm_token_fuse=0.0819 grad_shared_expert=0.4267 (18633:train_pytorch.py:850) + Training: 56%|█████▋ | 5630/10000 [1:14:07<55:12, 1.32it/s, loss=0.0093, lr=1.23e-05, step=5629] Training: 56%|█████▋ | 5630/10000 [1:14:07<55:12, 1.32it/s, loss=0.0149, lr=1.23e-05, step=5630] Training: 56%|█████▋ | 5631/10000 [1:14:08<54:19, 1.34it/s, loss=0.0149, lr=1.23e-05, step=5630] Training: 56%|█████▋ | 5631/10000 [1:14:08<54:19, 1.34it/s, loss=0.0043, lr=1.23e-05, step=5631] Training: 56%|█████▋ | 5632/10000 [1:14:09<49:47, 1.46it/s, loss=0.0043, lr=1.23e-05, step=5631] Training: 56%|█████▋ | 5632/10000 [1:14:09<49:47, 1.46it/s, loss=0.0020, lr=1.23e-05, step=5632] Training: 56%|█████▋ | 5633/10000 [1:14:09<45:29, 1.60it/s, loss=0.0020, lr=1.23e-05, step=5632] Training: 56%|█████▋ | 5633/10000 [1:14:09<45:29, 1.60it/s, loss=0.0123, lr=1.23e-05, step=5633] Training: 56%|█████▋ | 5634/10000 [1:14:09<42:10, 1.73it/s, loss=0.0123, lr=1.23e-05, step=5633] Training: 56%|█████▋ | 5634/10000 [1:14:09<42:10, 1.73it/s, loss=0.0181, lr=1.23e-05, step=5634] Training: 56%|█████▋ | 5635/10000 [1:14:10<44:04, 1.65it/s, loss=0.0181, lr=1.23e-05, step=5634] Training: 56%|█████▋ | 5635/10000 [1:14:10<44:04, 1.65it/s, loss=0.0257, lr=1.23e-05, step=5635] Training: 56%|█████▋ | 5636/10000 [1:14:11<54:31, 1.33it/s, loss=0.0257, lr=1.23e-05, step=5635] Training: 56%|█████▋ | 5636/10000 [1:14:11<54:31, 1.33it/s, loss=0.0035, lr=1.23e-05, step=5636] Training: 56%|█████▋ | 5637/10000 [1:14:12<52:27, 1.39it/s, loss=0.0035, lr=1.23e-05, step=5636] Training: 56%|█████▋ | 5637/10000 [1:14:12<52:27, 1.39it/s, loss=0.0111, lr=1.23e-05, step=5637] Training: 56%|█████▋ | 5638/10000 [1:14:13<54:16, 1.34it/s, loss=0.0111, lr=1.23e-05, step=5637] Training: 56%|█████▋ | 5638/10000 [1:14:13<54:16, 1.34it/s, loss=0.0587, lr=1.23e-05, step=5638] Training: 56%|█████▋ | 5639/10000 [1:14:13<50:30, 1.44it/s, loss=0.0587, lr=1.23e-05, step=5638] Training: 56%|█████▋ | 5639/10000 [1:14:13<50:30, 1.44it/s, loss=0.0063, lr=1.23e-05, step=5639]19:58:46.329 [I] step=5640 loss=0.0050 smoothed_loss=0.0149 lr=1.23e-05 grad_norm=0.3911 step_time=0.5360s data_time=0.1389s it/s=1.482 eta_to_10000=2942.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0960 grad_arm_token_fuse=0.0425 grad_shared_expert=0.4451 (18633:train_pytorch.py:850) + Training: 56%|█████▋ | 5640/10000 [1:14:14<51:38, 1.41it/s, loss=0.0063, lr=1.23e-05, step=5639] Training: 56%|█████▋ | 5640/10000 [1:14:14<51:38, 1.41it/s, loss=0.0050, lr=1.23e-05, step=5640] Training: 56%|█████▋ | 5641/10000 [1:14:15<50:37, 1.44it/s, loss=0.0050, lr=1.23e-05, step=5640] Training: 56%|█████▋ | 5641/10000 [1:14:15<50:37, 1.44it/s, loss=0.0079, lr=1.23e-05, step=5641] Training: 56%|█████▋ | 5642/10000 [1:14:16<56:36, 1.28it/s, loss=0.0079, lr=1.23e-05, step=5641] Training: 56%|█████▋ | 5642/10000 [1:14:16<56:36, 1.28it/s, loss=0.0102, lr=1.23e-05, step=5642] Training: 56%|█████▋ | 5643/10000 [1:14:16<56:30, 1.28it/s, loss=0.0102, lr=1.23e-05, step=5642] Training: 56%|█████▋ | 5643/10000 [1:14:16<56:30, 1.28it/s, loss=0.0085, lr=1.23e-05, step=5643] Training: 56%|█████▋ | 5644/10000 [1:14:17<51:16, 1.42it/s, loss=0.0085, lr=1.23e-05, step=5643] Training: 56%|█████▋ | 5644/10000 [1:14:17<51:16, 1.42it/s, loss=0.0357, lr=1.23e-05, step=5644] Training: 56%|█████▋ | 5645/10000 [1:14:18<56:20, 1.29it/s, loss=0.0357, lr=1.23e-05, step=5644] Training: 56%|█████▋ | 5645/10000 [1:14:18<56:20, 1.29it/s, loss=0.0127, lr=1.23e-05, step=5645] Training: 56%|█████▋ | 5646/10000 [1:14:19<57:12, 1.27it/s, loss=0.0127, lr=1.23e-05, step=5645] Training: 56%|█████▋ | 5646/10000 [1:14:19<57:12, 1.27it/s, loss=0.0036, lr=1.23e-05, step=5646] Training: 56%|█████▋ | 5647/10000 [1:14:20<57:51, 1.25it/s, loss=0.0036, lr=1.23e-05, step=5646] Training: 56%|█████▋ | 5647/10000 [1:14:20<57:51, 1.25it/s, loss=0.0123, lr=1.23e-05, step=5647] Training: 56%|█████▋ | 5648/10000 [1:14:20<51:24, 1.41it/s, loss=0.0123, lr=1.23e-05, step=5647] Training: 56%|█████▋ | 5648/10000 [1:14:20<51:24, 1.41it/s, loss=0.0143, lr=1.23e-05, step=5648] Training: 56%|█████▋ | 5649/10000 [1:14:21<56:59, 1.27it/s, loss=0.0143, lr=1.23e-05, step=5648] Training: 56%|█████▋ | 5649/10000 [1:14:21<56:59, 1.27it/s, loss=0.0101, lr=1.23e-05, step=5649]19:58:54.255 [I] step=5650 loss=0.0061 smoothed_loss=0.0128 lr=1.23e-05 grad_norm=0.4018 step_time=0.6389s data_time=0.1536s it/s=1.263 eta_to_10000=3445.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.0953 grad_arm_token_fuse=0.0430 grad_shared_expert=0.2912 (18633:train_pytorch.py:850) + Training: 56%|█████▋ | 5650/10000 [1:14:22<1:00:04, 1.21it/s, loss=0.0101, lr=1.23e-05, step=5649] Training: 56%|█████▋ | 5650/10000 [1:14:22<1:00:04, 1.21it/s, loss=0.0061, lr=1.23e-05, step=5650] Training: 57%|█████▋ | 5651/10000 [1:14:23<59:20, 1.22it/s, loss=0.0061, lr=1.23e-05, step=5650] Training: 57%|█████▋ | 5651/10000 [1:14:23<59:20, 1.22it/s, loss=0.0140, lr=1.23e-05, step=5651] Training: 57%|█████▋ | 5652/10000 [1:14:23<52:30, 1.38it/s, loss=0.0140, lr=1.23e-05, step=5651] Training: 57%|█████▋ | 5652/10000 [1:14:23<52:30, 1.38it/s, loss=0.0073, lr=1.23e-05, step=5652] Training: 57%|█████▋ | 5653/10000 [1:14:24<48:35, 1.49it/s, loss=0.0073, lr=1.23e-05, step=5652] Training: 57%|█████▋ | 5653/10000 [1:14:24<48:35, 1.49it/s, loss=0.0055, lr=1.23e-05, step=5653] Training: 57%|█████▋ | 5654/10000 [1:14:25<50:37, 1.43it/s, loss=0.0055, lr=1.23e-05, step=5653] Training: 57%|█████▋ | 5654/10000 [1:14:25<50:37, 1.43it/s, loss=0.0501, lr=1.23e-05, step=5654] Training: 57%|█████▋ | 5655/10000 [1:14:25<46:07, 1.57it/s, loss=0.0501, lr=1.23e-05, step=5654] Training: 57%|█████▋ | 5655/10000 [1:14:25<46:07, 1.57it/s, loss=0.0111, lr=1.23e-05, step=5655] Training: 57%|█████▋ | 5656/10000 [1:14:26<47:45, 1.52it/s, loss=0.0111, lr=1.23e-05, step=5655] Training: 57%|█████▋ | 5656/10000 [1:14:26<47:45, 1.52it/s, loss=0.0342, lr=1.22e-05, step=5656] Training: 57%|█████▋ | 5657/10000 [1:14:27<52:31, 1.38it/s, loss=0.0342, lr=1.22e-05, step=5656] Training: 57%|█████▋ | 5657/10000 [1:14:27<52:31, 1.38it/s, loss=0.0129, lr=1.22e-05, step=5657] Training: 57%|█████▋ | 5658/10000 [1:14:28<57:00, 1.27it/s, loss=0.0129, lr=1.22e-05, step=5657] Training: 57%|█████▋ | 5658/10000 [1:14:28<57:00, 1.27it/s, loss=0.0035, lr=1.22e-05, step=5658] Training: 57%|█████▋ | 5659/10000 [1:14:28<55:55, 1.29it/s, loss=0.0035, lr=1.22e-05, step=5658] Training: 57%|█████▋ | 5659/10000 [1:14:28<55:55, 1.29it/s, loss=0.0036, lr=1.22e-05, step=5659]19:59:01.486 [I] step=5660 loss=0.0090 smoothed_loss=0.0136 lr=1.22e-05 grad_norm=0.4749 step_time=0.5707s data_time=0.1524s it/s=1.383 eta_to_10000=3138.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0062 grad_action_out_proj_arms=0.0649 grad_arm_token_fuse=0.0321 grad_shared_expert=0.3317 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5660/10000 [1:14:29<57:55, 1.25it/s, loss=0.0036, lr=1.22e-05, step=5659] Training: 57%|█████▋ | 5660/10000 [1:14:29<57:55, 1.25it/s, loss=0.0090, lr=1.22e-05, step=5660] Training: 57%|█████▋ | 5661/10000 [1:14:30<57:20, 1.26it/s, loss=0.0090, lr=1.22e-05, step=5660] Training: 57%|█████▋ | 5661/10000 [1:14:30<57:20, 1.26it/s, loss=0.0026, lr=1.22e-05, step=5661] Training: 57%|█████▋ | 5662/10000 [1:14:31<58:54, 1.23it/s, loss=0.0026, lr=1.22e-05, step=5661] Training: 57%|█████▋ | 5662/10000 [1:14:31<58:54, 1.23it/s, loss=0.0037, lr=1.22e-05, step=5662] Training: 57%|█████▋ | 5663/10000 [1:14:32<57:22, 1.26it/s, loss=0.0037, lr=1.22e-05, step=5662] Training: 57%|█████▋ | 5663/10000 [1:14:32<57:22, 1.26it/s, loss=0.0184, lr=1.22e-05, step=5663] Training: 57%|█████▋ | 5664/10000 [1:14:33<1:03:03, 1.15it/s, loss=0.0184, lr=1.22e-05, step=5663] Training: 57%|█████▋ | 5664/10000 [1:14:33<1:03:03, 1.15it/s, loss=0.0546, lr=1.22e-05, step=5664] Training: 57%|█████▋ | 5665/10000 [1:14:34<1:07:03, 1.08it/s, loss=0.0546, lr=1.22e-05, step=5664] Training: 57%|█████▋ | 5665/10000 [1:14:34<1:07:03, 1.08it/s, loss=0.0020, lr=1.22e-05, step=5665] Training: 57%|█████▋ | 5666/10000 [1:14:34<1:00:15, 1.20it/s, loss=0.0020, lr=1.22e-05, step=5665] Training: 57%|█████▋ | 5666/10000 [1:14:34<1:00:15, 1.20it/s, loss=0.0030, lr=1.22e-05, step=5666] Training: 57%|█████▋ | 5667/10000 [1:14:35<57:47, 1.25it/s, loss=0.0030, lr=1.22e-05, step=5666] Training: 57%|█████▋ | 5667/10000 [1:14:35<57:47, 1.25it/s, loss=0.0023, lr=1.22e-05, step=5667] Training: 57%|█████▋ | 5668/10000 [1:14:36<56:39, 1.27it/s, loss=0.0023, lr=1.22e-05, step=5667] Training: 57%|█████▋ | 5668/10000 [1:14:36<56:39, 1.27it/s, loss=0.0127, lr=1.22e-05, step=5668] Training: 57%|█████▋ | 5669/10000 [1:14:36<50:56, 1.42it/s, loss=0.0127, lr=1.22e-05, step=5668] Training: 57%|█████▋ | 5669/10000 [1:14:36<50:56, 1.42it/s, loss=0.0040, lr=1.22e-05, step=5669]19:59:09.468 [I] step=5670 loss=0.0117 smoothed_loss=0.0118 lr=1.22e-05 grad_norm=0.4513 step_time=0.6223s data_time=0.1759s it/s=1.253 eta_to_10000=3455.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0208 grad_action_out_proj_arms=0.1788 grad_arm_token_fuse=0.0981 grad_shared_expert=0.5621 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5670/10000 [1:14:37<54:36, 1.32it/s, loss=0.0040, lr=1.22e-05, step=5669] Training: 57%|█████▋ | 5670/10000 [1:14:37<54:36, 1.32it/s, loss=0.0117, lr=1.22e-05, step=5670] Training: 57%|█████▋ | 5671/10000 [1:14:38<52:46, 1.37it/s, loss=0.0117, lr=1.22e-05, step=5670] Training: 57%|█████▋ | 5671/10000 [1:14:38<52:46, 1.37it/s, loss=0.0053, lr=1.22e-05, step=5671] Training: 57%|█████▋ | 5672/10000 [1:14:39<57:59, 1.24it/s, loss=0.0053, lr=1.22e-05, step=5671] Training: 57%|█████▋ | 5672/10000 [1:14:39<57:59, 1.24it/s, loss=0.0057, lr=1.22e-05, step=5672] Training: 57%|█████▋ | 5673/10000 [1:14:39<51:43, 1.39it/s, loss=0.0057, lr=1.22e-05, step=5672] Training: 57%|█████▋ | 5673/10000 [1:14:39<51:43, 1.39it/s, loss=0.0142, lr=1.22e-05, step=5673] Training: 57%|█████▋ | 5674/10000 [1:14:40<47:45, 1.51it/s, loss=0.0142, lr=1.22e-05, step=5673] Training: 57%|█████▋ | 5674/10000 [1:14:40<47:45, 1.51it/s, loss=0.0286, lr=1.22e-05, step=5674] Training: 57%|█████▋ | 5675/10000 [1:14:40<47:37, 1.51it/s, loss=0.0286, lr=1.22e-05, step=5674] Training: 57%|█████▋ | 5675/10000 [1:14:40<47:37, 1.51it/s, loss=0.0053, lr=1.22e-05, step=5675] Training: 57%|█████▋ | 5676/10000 [1:14:41<44:04, 1.63it/s, loss=0.0053, lr=1.22e-05, step=5675] Training: 57%|█████▋ | 5676/10000 [1:14:41<44:04, 1.63it/s, loss=0.0039, lr=1.22e-05, step=5676] Training: 57%|█████▋ | 5677/10000 [1:14:42<42:13, 1.71it/s, loss=0.0039, lr=1.22e-05, step=5676] Training: 57%|█████▋ | 5677/10000 [1:14:42<42:13, 1.71it/s, loss=0.0087, lr=1.22e-05, step=5677] Training: 57%|█████▋ | 5678/10000 [1:14:42<46:06, 1.56it/s, loss=0.0087, lr=1.22e-05, step=5677] Training: 57%|█████▋ | 5678/10000 [1:14:42<46:06, 1.56it/s, loss=0.0122, lr=1.22e-05, step=5678] Training: 57%|█████▋ | 5679/10000 [1:14:43<55:06, 1.31it/s, loss=0.0122, lr=1.22e-05, step=5678] Training: 57%|█████▋ | 5679/10000 [1:14:43<55:06, 1.31it/s, loss=0.0451, lr=1.22e-05, step=5679]19:59:16.511 [I] step=5680 loss=0.0074 smoothed_loss=0.0138 lr=1.22e-05 grad_norm=0.4887 step_time=0.5735s data_time=0.1309s it/s=1.420 eta_to_10000=3042.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0804 grad_arm_token_fuse=0.0375 grad_shared_expert=0.3263 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5680/10000 [1:14:44<56:50, 1.27it/s, loss=0.0451, lr=1.22e-05, step=5679] Training: 57%|█████▋ | 5680/10000 [1:14:44<56:50, 1.27it/s, loss=0.0074, lr=1.22e-05, step=5680] Training: 57%|█████▋ | 5681/10000 [1:14:45<51:51, 1.39it/s, loss=0.0074, lr=1.22e-05, step=5680] Training: 57%|█████▋ | 5681/10000 [1:14:45<51:51, 1.39it/s, loss=0.0068, lr=1.22e-05, step=5681] Training: 57%|█████▋ | 5682/10000 [1:14:46<54:19, 1.32it/s, loss=0.0068, lr=1.22e-05, step=5681] Training: 57%|█████▋ | 5682/10000 [1:14:46<54:19, 1.32it/s, loss=0.0170, lr=1.22e-05, step=5682] Training: 57%|█████▋ | 5683/10000 [1:14:46<56:12, 1.28it/s, loss=0.0170, lr=1.22e-05, step=5682] Training: 57%|█████▋ | 5683/10000 [1:14:46<56:12, 1.28it/s, loss=0.0033, lr=1.21e-05, step=5683] Training: 57%|█████▋ | 5684/10000 [1:14:47<49:48, 1.44it/s, loss=0.0033, lr=1.21e-05, step=5683] Training: 57%|█████▋ | 5684/10000 [1:14:47<49:48, 1.44it/s, loss=0.0265, lr=1.21e-05, step=5684] Training: 57%|█████▋ | 5685/10000 [1:14:48<51:25, 1.40it/s, loss=0.0265, lr=1.21e-05, step=5684] Training: 57%|█████▋ | 5685/10000 [1:14:48<51:25, 1.40it/s, loss=0.0084, lr=1.21e-05, step=5685] Training: 57%|█████▋ | 5686/10000 [1:14:48<51:19, 1.40it/s, loss=0.0084, lr=1.21e-05, step=5685] Training: 57%|█████▋ | 5686/10000 [1:14:48<51:19, 1.40it/s, loss=0.0076, lr=1.21e-05, step=5686] Training: 57%|█████▋ | 5687/10000 [1:14:49<46:36, 1.54it/s, loss=0.0076, lr=1.21e-05, step=5686] Training: 57%|█████▋ | 5687/10000 [1:14:49<46:36, 1.54it/s, loss=0.0108, lr=1.21e-05, step=5687] Training: 57%|█████▋ | 5688/10000 [1:14:50<49:19, 1.46it/s, loss=0.0108, lr=1.21e-05, step=5687] Training: 57%|█████▋ | 5688/10000 [1:14:50<49:19, 1.46it/s, loss=0.0368, lr=1.21e-05, step=5688] Training: 57%|█████▋ | 5689/10000 [1:14:50<49:10, 1.46it/s, loss=0.0368, lr=1.21e-05, step=5688] Training: 57%|█████▋ | 5689/10000 [1:14:50<49:10, 1.46it/s, loss=0.0151, lr=1.21e-05, step=5689]19:59:23.243 [I] step=5690 loss=0.0075 smoothed_loss=0.0142 lr=1.21e-05 grad_norm=0.4556 step_time=0.5406s data_time=0.1326s it/s=1.486 eta_to_10000=2900.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0099 grad_action_out_proj_arms=0.0944 grad_arm_token_fuse=0.0533 grad_shared_expert=0.5153 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5690/10000 [1:14:51<46:53, 1.53it/s, loss=0.0151, lr=1.21e-05, step=5689] Training: 57%|█████▋ | 5690/10000 [1:14:51<46:53, 1.53it/s, loss=0.0075, lr=1.21e-05, step=5690] Training: 57%|█████▋ | 5691/10000 [1:14:52<52:18, 1.37it/s, loss=0.0075, lr=1.21e-05, step=5690] Training: 57%|█████▋ | 5691/10000 [1:14:52<52:18, 1.37it/s, loss=0.0047, lr=1.21e-05, step=5691] Training: 57%|█████▋ | 5692/10000 [1:14:52<47:07, 1.52it/s, loss=0.0047, lr=1.21e-05, step=5691] Training: 57%|█████▋ | 5692/10000 [1:14:52<47:07, 1.52it/s, loss=0.0028, lr=1.21e-05, step=5692] Training: 57%|█████▋ | 5693/10000 [1:14:53<53:14, 1.35it/s, loss=0.0028, lr=1.21e-05, step=5692] Training: 57%|█████▋ | 5693/10000 [1:14:53<53:14, 1.35it/s, loss=0.0058, lr=1.21e-05, step=5693] Training: 57%|█████▋ | 5694/10000 [1:14:54<51:03, 1.41it/s, loss=0.0058, lr=1.21e-05, step=5693] Training: 57%|█████▋ | 5694/10000 [1:14:54<51:03, 1.41it/s, loss=0.0081, lr=1.21e-05, step=5694] Training: 57%|█████▋ | 5695/10000 [1:14:55<52:53, 1.36it/s, loss=0.0081, lr=1.21e-05, step=5694] Training: 57%|█████▋ | 5695/10000 [1:14:55<52:53, 1.36it/s, loss=0.0062, lr=1.21e-05, step=5695] Training: 57%|█████▋ | 5696/10000 [1:14:55<52:32, 1.37it/s, loss=0.0062, lr=1.21e-05, step=5695] Training: 57%|█████▋ | 5696/10000 [1:14:55<52:32, 1.37it/s, loss=0.0141, lr=1.21e-05, step=5696] Training: 57%|█████▋ | 5697/10000 [1:14:56<47:02, 1.52it/s, loss=0.0141, lr=1.21e-05, step=5696] Training: 57%|█████▋ | 5697/10000 [1:14:56<47:02, 1.52it/s, loss=0.0041, lr=1.21e-05, step=5697] Training: 57%|█████▋ | 5698/10000 [1:14:56<43:21, 1.65it/s, loss=0.0041, lr=1.21e-05, step=5697] Training: 57%|█████▋ | 5698/10000 [1:14:56<43:21, 1.65it/s, loss=0.0108, lr=1.21e-05, step=5698] Training: 57%|█████▋ | 5699/10000 [1:14:57<46:00, 1.56it/s, loss=0.0108, lr=1.21e-05, step=5698] Training: 57%|█████▋ | 5699/10000 [1:14:57<46:00, 1.56it/s, loss=0.0086, lr=1.21e-05, step=5699]19:59:30.155 [I] step=5700 loss=0.1304 smoothed_loss=0.0222 lr=1.21e-05 grad_norm=0.6165 step_time=0.5633s data_time=0.1280s it/s=1.447 eta_to_10000=2972.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0317 grad_action_out_proj_arms=0.1917 grad_arm_token_fuse=0.1618 grad_shared_expert=1.8588 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5700/10000 [1:14:58<47:51, 1.50it/s, loss=0.0086, lr=1.21e-05, step=5699] Training: 57%|█████▋ | 5700/10000 [1:14:58<47:51, 1.50it/s, loss=0.1304, lr=1.21e-05, step=5700] Training: 57%|█████▋ | 5701/10000 [1:14:59<54:30, 1.31it/s, loss=0.1304, lr=1.21e-05, step=5700] Training: 57%|█████▋ | 5701/10000 [1:14:59<54:30, 1.31it/s, loss=0.0048, lr=1.21e-05, step=5701] Training: 57%|█████▋ | 5702/10000 [1:15:00<54:48, 1.31it/s, loss=0.0048, lr=1.21e-05, step=5701] Training: 57%|█████▋ | 5702/10000 [1:15:00<54:48, 1.31it/s, loss=0.0150, lr=1.21e-05, step=5702] Training: 57%|█████▋ | 5703/10000 [1:15:00<48:53, 1.46it/s, loss=0.0150, lr=1.21e-05, step=5702] Training: 57%|█████▋ | 5703/10000 [1:15:00<48:53, 1.46it/s, loss=0.0026, lr=1.21e-05, step=5703] Training: 57%|█████▋ | 5704/10000 [1:15:01<44:47, 1.60it/s, loss=0.0026, lr=1.21e-05, step=5703] Training: 57%|█████▋ | 5704/10000 [1:15:01<44:47, 1.60it/s, loss=0.0064, lr=1.21e-05, step=5704] Training: 57%|█████▋ | 5705/10000 [1:15:01<41:52, 1.71it/s, loss=0.0064, lr=1.21e-05, step=5704] Training: 57%|█████▋ | 5705/10000 [1:15:01<41:52, 1.71it/s, loss=0.0052, lr=1.21e-05, step=5705] Training: 57%|█████▋ | 5706/10000 [1:15:02<44:27, 1.61it/s, loss=0.0052, lr=1.21e-05, step=5705] Training: 57%|█████▋ | 5706/10000 [1:15:02<44:27, 1.61it/s, loss=0.0123, lr=1.21e-05, step=5706] Training: 57%|█████▋ | 5707/10000 [1:15:03<51:37, 1.39it/s, loss=0.0123, lr=1.21e-05, step=5706] Training: 57%|█████▋ | 5707/10000 [1:15:03<51:37, 1.39it/s, loss=0.0059, lr=1.21e-05, step=5707] Training: 57%|█████▋ | 5708/10000 [1:15:03<50:43, 1.41it/s, loss=0.0059, lr=1.21e-05, step=5707] Training: 57%|█████▋ | 5708/10000 [1:15:03<50:43, 1.41it/s, loss=0.0030, lr=1.21e-05, step=5708] Training: 57%|█████▋ | 5709/10000 [1:15:04<45:54, 1.56it/s, loss=0.0030, lr=1.21e-05, step=5708] Training: 57%|█████▋ | 5709/10000 [1:15:04<45:54, 1.56it/s, loss=0.0184, lr=1.21e-05, step=5709]19:59:36.802 [I] step=5710 loss=0.0382 smoothed_loss=0.0163 lr=1.21e-05 grad_norm=0.4590 step_time=0.5485s data_time=0.1161s it/s=1.505 eta_to_10000=2851.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0281 grad_action_out_proj_arms=0.1654 grad_arm_token_fuse=0.1418 grad_shared_expert=0.6204 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5710/10000 [1:15:04<44:52, 1.59it/s, loss=0.0184, lr=1.21e-05, step=5709] Training: 57%|█████▋ | 5710/10000 [1:15:04<44:52, 1.59it/s, loss=0.0382, lr=1.20e-05, step=5710] Training: 57%|█████▋ | 5711/10000 [1:15:05<41:45, 1.71it/s, loss=0.0382, lr=1.20e-05, step=5710] Training: 57%|█████▋ | 5711/10000 [1:15:05<41:45, 1.71it/s, loss=0.0290, lr=1.20e-05, step=5711] Training: 57%|█████▋ | 5712/10000 [1:15:05<39:52, 1.79it/s, loss=0.0290, lr=1.20e-05, step=5711] Training: 57%|█████▋ | 5712/10000 [1:15:05<39:52, 1.79it/s, loss=0.0018, lr=1.20e-05, step=5712] Training: 57%|█████▋ | 5713/10000 [1:15:06<42:49, 1.67it/s, loss=0.0018, lr=1.20e-05, step=5712] Training: 57%|█████▋ | 5713/10000 [1:15:06<42:49, 1.67it/s, loss=0.0064, lr=1.20e-05, step=5713] Training: 57%|█████▋ | 5714/10000 [1:15:07<44:10, 1.62it/s, loss=0.0064, lr=1.20e-05, step=5713] Training: 57%|█████▋ | 5714/10000 [1:15:07<44:10, 1.62it/s, loss=0.0041, lr=1.20e-05, step=5714] Training: 57%|█████▋ | 5715/10000 [1:15:07<45:38, 1.56it/s, loss=0.0041, lr=1.20e-05, step=5714] Training: 57%|█████▋ | 5715/10000 [1:15:07<45:38, 1.56it/s, loss=0.0122, lr=1.20e-05, step=5715] Training: 57%|█████▋ | 5716/10000 [1:15:08<43:33, 1.64it/s, loss=0.0122, lr=1.20e-05, step=5715] Training: 57%|█████▋ | 5716/10000 [1:15:08<43:33, 1.64it/s, loss=0.0094, lr=1.20e-05, step=5716] Training: 57%|█████▋ | 5717/10000 [1:15:09<41:10, 1.73it/s, loss=0.0094, lr=1.20e-05, step=5716] Training: 57%|█████▋ | 5717/10000 [1:15:09<41:10, 1.73it/s, loss=0.0243, lr=1.20e-05, step=5717] Training: 57%|█████▋ | 5718/10000 [1:15:09<45:16, 1.58it/s, loss=0.0243, lr=1.20e-05, step=5717] Training: 57%|█████▋ | 5718/10000 [1:15:09<45:16, 1.58it/s, loss=0.0054, lr=1.20e-05, step=5718] Training: 57%|█████▋ | 5719/10000 [1:15:10<42:10, 1.69it/s, loss=0.0054, lr=1.20e-05, step=5718] Training: 57%|█████▋ | 5719/10000 [1:15:10<42:10, 1.69it/s, loss=0.0384, lr=1.20e-05, step=5719]19:59:42.649 [I] step=5720 loss=0.0139 smoothed_loss=0.0158 lr=1.20e-05 grad_norm=0.5532 step_time=0.5001s data_time=0.0846s it/s=1.711 eta_to_10000=2502.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0201 grad_action_out_proj_arms=0.1538 grad_arm_token_fuse=0.1062 grad_shared_expert=0.6302 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5720/10000 [1:15:10<40:39, 1.75it/s, loss=0.0384, lr=1.20e-05, step=5719] Training: 57%|█████▋ | 5720/10000 [1:15:10<40:39, 1.75it/s, loss=0.0139, lr=1.20e-05, step=5720] Training: 57%|█████▋ | 5721/10000 [1:15:11<45:23, 1.57it/s, loss=0.0139, lr=1.20e-05, step=5720] Training: 57%|█████▋ | 5721/10000 [1:15:11<45:23, 1.57it/s, loss=0.0087, lr=1.20e-05, step=5721] Training: 57%|█████▋ | 5722/10000 [1:15:12<46:17, 1.54it/s, loss=0.0087, lr=1.20e-05, step=5721] Training: 57%|█████▋ | 5722/10000 [1:15:12<46:17, 1.54it/s, loss=0.0346, lr=1.20e-05, step=5722] Training: 57%|█████▋ | 5723/10000 [1:15:12<42:30, 1.68it/s, loss=0.0346, lr=1.20e-05, step=5722] Training: 57%|█████▋ | 5723/10000 [1:15:12<42:30, 1.68it/s, loss=0.0091, lr=1.20e-05, step=5723] Training: 57%|█████▋ | 5724/10000 [1:15:13<40:33, 1.76it/s, loss=0.0091, lr=1.20e-05, step=5723] Training: 57%|█████▋ | 5724/10000 [1:15:13<40:33, 1.76it/s, loss=0.0111, lr=1.20e-05, step=5724] Training: 57%|█████▋ | 5725/10000 [1:15:13<38:52, 1.83it/s, loss=0.0111, lr=1.20e-05, step=5724] Training: 57%|█████▋ | 5725/10000 [1:15:13<38:52, 1.83it/s, loss=0.0057, lr=1.20e-05, step=5725] Training: 57%|█████▋ | 5726/10000 [1:15:14<40:16, 1.77it/s, loss=0.0057, lr=1.20e-05, step=5725] Training: 57%|█████▋ | 5726/10000 [1:15:14<40:16, 1.77it/s, loss=0.0030, lr=1.20e-05, step=5726] Training: 57%|█████▋ | 5727/10000 [1:15:15<43:52, 1.62it/s, loss=0.0030, lr=1.20e-05, step=5726] Training: 57%|█████▋ | 5727/10000 [1:15:15<43:52, 1.62it/s, loss=0.0031, lr=1.20e-05, step=5727] Training: 57%|█████▋ | 5728/10000 [1:15:16<50:48, 1.40it/s, loss=0.0031, lr=1.20e-05, step=5727] Training: 57%|█████▋ | 5728/10000 [1:15:16<50:48, 1.40it/s, loss=0.0509, lr=1.20e-05, step=5728] Training: 57%|█████▋ | 5729/10000 [1:15:16<50:22, 1.41it/s, loss=0.0509, lr=1.20e-05, step=5728] Training: 57%|█████▋ | 5729/10000 [1:15:16<50:22, 1.41it/s, loss=0.0849, lr=1.20e-05, step=5729]19:59:49.295 [I] step=5730 loss=0.0330 smoothed_loss=0.0242 lr=1.20e-05 grad_norm=0.4848 step_time=0.5634s data_time=0.1011s it/s=1.505 eta_to_10000=2837.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0250 grad_action_out_proj_arms=0.2162 grad_arm_token_fuse=0.1318 grad_shared_expert=0.6630 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5730/10000 [1:15:17<50:43, 1.40it/s, loss=0.0849, lr=1.20e-05, step=5729] Training: 57%|█████▋ | 5730/10000 [1:15:17<50:43, 1.40it/s, loss=0.0330, lr=1.20e-05, step=5730] Training: 57%|█████▋ | 5731/10000 [1:15:17<45:38, 1.56it/s, loss=0.0330, lr=1.20e-05, step=5730] Training: 57%|█████▋ | 5731/10000 [1:15:17<45:38, 1.56it/s, loss=0.0173, lr=1.20e-05, step=5731] Training: 57%|█████▋ | 5732/10000 [1:15:18<43:05, 1.65it/s, loss=0.0173, lr=1.20e-05, step=5731] Training: 57%|█████▋ | 5732/10000 [1:15:18<43:05, 1.65it/s, loss=0.0044, lr=1.20e-05, step=5732] Training: 57%|█████▋ | 5733/10000 [1:15:18<40:28, 1.76it/s, loss=0.0044, lr=1.20e-05, step=5732] Training: 57%|█████▋ | 5733/10000 [1:15:18<40:28, 1.76it/s, loss=0.0136, lr=1.20e-05, step=5733] Training: 57%|█████▋ | 5734/10000 [1:15:19<40:08, 1.77it/s, loss=0.0136, lr=1.20e-05, step=5733] Training: 57%|█████▋ | 5734/10000 [1:15:19<40:08, 1.77it/s, loss=0.0126, lr=1.20e-05, step=5734] Training: 57%|█████▋ | 5735/10000 [1:15:20<43:23, 1.64it/s, loss=0.0126, lr=1.20e-05, step=5734] Training: 57%|█████▋ | 5735/10000 [1:15:20<43:23, 1.64it/s, loss=0.0093, lr=1.20e-05, step=5735] Training: 57%|█████▋ | 5736/10000 [1:15:21<56:11, 1.26it/s, loss=0.0093, lr=1.20e-05, step=5735] Training: 57%|█████▋ | 5736/10000 [1:15:21<56:11, 1.26it/s, loss=0.0083, lr=1.20e-05, step=5736] Training: 57%|█████▋ | 5737/10000 [1:15:21<49:58, 1.42it/s, loss=0.0083, lr=1.20e-05, step=5736] Training: 57%|█████▋ | 5737/10000 [1:15:21<49:58, 1.42it/s, loss=0.0086, lr=1.19e-05, step=5737] Training: 57%|█████▋ | 5738/10000 [1:15:22<48:51, 1.45it/s, loss=0.0086, lr=1.19e-05, step=5737] Training: 57%|█████▋ | 5738/10000 [1:15:22<48:51, 1.45it/s, loss=0.0194, lr=1.19e-05, step=5738] Training: 57%|█████▋ | 5739/10000 [1:15:23<44:28, 1.60it/s, loss=0.0194, lr=1.19e-05, step=5738] Training: 57%|█████▋ | 5739/10000 [1:15:23<44:28, 1.60it/s, loss=0.0039, lr=1.19e-05, step=5739]19:59:55.515 [I] step=5740 loss=0.0147 smoothed_loss=0.0157 lr=1.20e-05 grad_norm=0.4321 step_time=0.5268s data_time=0.0952s it/s=1.608 eta_to_10000=2649.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0125 grad_action_out_proj_arms=0.1042 grad_arm_token_fuse=0.0620 grad_shared_expert=0.5631 (18633:train_pytorch.py:850) + Training: 57%|█████▋ | 5740/10000 [1:15:23<44:23, 1.60it/s, loss=0.0039, lr=1.19e-05, step=5739] Training: 57%|█████▋ | 5740/10000 [1:15:23<44:23, 1.60it/s, loss=0.0147, lr=1.19e-05, step=5740] Training: 57%|█████▋ | 5741/10000 [1:15:24<46:45, 1.52it/s, loss=0.0147, lr=1.19e-05, step=5740] Training: 57%|█████▋ | 5741/10000 [1:15:24<46:45, 1.52it/s, loss=0.0026, lr=1.19e-05, step=5741] Training: 57%|█████▋ | 5742/10000 [1:15:24<43:01, 1.65it/s, loss=0.0026, lr=1.19e-05, step=5741] Training: 57%|█████▋ | 5742/10000 [1:15:24<43:01, 1.65it/s, loss=0.0087, lr=1.19e-05, step=5742] Training: 57%|█████▋ | 5743/10000 [1:15:25<50:56, 1.39it/s, loss=0.0087, lr=1.19e-05, step=5742] Training: 57%|█████▋ | 5743/10000 [1:15:25<50:56, 1.39it/s, loss=0.0095, lr=1.19e-05, step=5743] Training: 57%|█████▋ | 5744/10000 [1:15:26<46:00, 1.54it/s, loss=0.0095, lr=1.19e-05, step=5743] Training: 57%|█████▋ | 5744/10000 [1:15:26<46:00, 1.54it/s, loss=0.0015, lr=1.19e-05, step=5744] Training: 57%|█████▋ | 5745/10000 [1:15:26<42:43, 1.66it/s, loss=0.0015, lr=1.19e-05, step=5744] Training: 57%|█████▋ | 5745/10000 [1:15:26<42:43, 1.66it/s, loss=0.0044, lr=1.19e-05, step=5745] Training: 57%|█████▋ | 5746/10000 [1:15:27<44:26, 1.60it/s, loss=0.0044, lr=1.19e-05, step=5745] Training: 57%|█████▋ | 5746/10000 [1:15:27<44:26, 1.60it/s, loss=0.0068, lr=1.19e-05, step=5746] Training: 57%|█████▋ | 5747/10000 [1:15:28<41:29, 1.71it/s, loss=0.0068, lr=1.19e-05, step=5746] Training: 57%|█████▋ | 5747/10000 [1:15:28<41:29, 1.71it/s, loss=0.0055, lr=1.19e-05, step=5747] Training: 57%|█████▋ | 5748/10000 [1:15:28<45:37, 1.55it/s, loss=0.0055, lr=1.19e-05, step=5747] Training: 57%|█████▋ | 5748/10000 [1:15:28<45:37, 1.55it/s, loss=0.0121, lr=1.19e-05, step=5748] Training: 57%|█████▋ | 5749/10000 [1:15:29<42:34, 1.66it/s, loss=0.0121, lr=1.19e-05, step=5748] Training: 57%|█████▋ | 5749/10000 [1:15:29<42:34, 1.66it/s, loss=0.0108, lr=1.19e-05, step=5749]20:00:01.909 [I] step=5750 loss=0.0046 smoothed_loss=0.0100 lr=1.19e-05 grad_norm=0.3840 step_time=0.5256s data_time=0.1137s it/s=1.564 eta_to_10000=2716.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0203 grad_action_out_proj_arms=0.1109 grad_arm_token_fuse=0.1026 grad_shared_expert=0.6585 (18633:train_pytorch.py:850) + Training: 57%|█████▊ | 5750/10000 [1:15:30<45:54, 1.54it/s, loss=0.0108, lr=1.19e-05, step=5749] Training: 57%|█████▊ | 5750/10000 [1:15:30<45:54, 1.54it/s, loss=0.0046, lr=1.19e-05, step=5750] Training: 58%|█████▊ | 5751/10000 [1:15:30<46:34, 1.52it/s, loss=0.0046, lr=1.19e-05, step=5750] Training: 58%|█████▊ | 5751/10000 [1:15:30<46:34, 1.52it/s, loss=0.0083, lr=1.19e-05, step=5751] Training: 58%|█████▊ | 5752/10000 [1:15:31<42:39, 1.66it/s, loss=0.0083, lr=1.19e-05, step=5751] Training: 58%|█████▊ | 5752/10000 [1:15:31<42:39, 1.66it/s, loss=0.0189, lr=1.19e-05, step=5752] Training: 58%|█████▊ | 5753/10000 [1:15:31<40:09, 1.76it/s, loss=0.0189, lr=1.19e-05, step=5752] Training: 58%|█████▊ | 5753/10000 [1:15:31<40:09, 1.76it/s, loss=0.0031, lr=1.19e-05, step=5753] Training: 58%|█████▊ | 5754/10000 [1:15:32<38:10, 1.85it/s, loss=0.0031, lr=1.19e-05, step=5753] Training: 58%|█████▊ | 5754/10000 [1:15:32<38:10, 1.85it/s, loss=0.0143, lr=1.19e-05, step=5754] Training: 58%|█████▊ | 5755/10000 [1:15:32<38:32, 1.84it/s, loss=0.0143, lr=1.19e-05, step=5754] Training: 58%|█████▊ | 5755/10000 [1:15:32<38:32, 1.84it/s, loss=0.0105, lr=1.19e-05, step=5755] Training: 58%|█████▊ | 5756/10000 [1:15:33<37:39, 1.88it/s, loss=0.0105, lr=1.19e-05, step=5755] Training: 58%|█████▊ | 5756/10000 [1:15:33<37:39, 1.88it/s, loss=0.0021, lr=1.19e-05, step=5756] Training: 58%|█████▊ | 5757/10000 [1:15:34<48:23, 1.46it/s, loss=0.0021, lr=1.19e-05, step=5756] Training: 58%|█████▊ | 5757/10000 [1:15:34<48:23, 1.46it/s, loss=0.0047, lr=1.19e-05, step=5757] Training: 58%|█████▊ | 5758/10000 [1:15:35<49:24, 1.43it/s, loss=0.0047, lr=1.19e-05, step=5757] Training: 58%|█████▊ | 5758/10000 [1:15:35<49:24, 1.43it/s, loss=0.0183, lr=1.19e-05, step=5758] Training: 58%|█████▊ | 5759/10000 [1:15:35<44:40, 1.58it/s, loss=0.0183, lr=1.19e-05, step=5758] Training: 58%|█████▊ | 5759/10000 [1:15:35<44:40, 1.58it/s, loss=0.0160, lr=1.19e-05, step=5759]20:00:07.858 [I] step=5760 loss=0.0127 smoothed_loss=0.0108 lr=1.19e-05 grad_norm=0.3529 step_time=0.5067s data_time=0.0881s it/s=1.681 eta_to_10000=2522.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0102 grad_action_out_proj_arms=0.1216 grad_arm_token_fuse=0.0536 grad_shared_expert=0.5413 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5760/10000 [1:15:36<42:26, 1.66it/s, loss=0.0160, lr=1.19e-05, step=5759] Training: 58%|█████▊ | 5760/10000 [1:15:36<42:26, 1.66it/s, loss=0.0127, lr=1.19e-05, step=5760] Training: 58%|█████▊ | 5761/10000 [1:15:36<40:03, 1.76it/s, loss=0.0127, lr=1.19e-05, step=5760] Training: 58%|█████▊ | 5761/10000 [1:15:36<40:03, 1.76it/s, loss=0.0057, lr=1.19e-05, step=5761] Training: 58%|█████▊ | 5762/10000 [1:15:37<43:17, 1.63it/s, loss=0.0057, lr=1.19e-05, step=5761] Training: 58%|█████▊ | 5762/10000 [1:15:37<43:17, 1.63it/s, loss=0.0333, lr=1.19e-05, step=5762] Training: 58%|█████▊ | 5763/10000 [1:15:37<41:13, 1.71it/s, loss=0.0333, lr=1.19e-05, step=5762] Training: 58%|█████▊ | 5763/10000 [1:15:37<41:13, 1.71it/s, loss=0.0036, lr=1.19e-05, step=5763] Training: 58%|█████▊ | 5764/10000 [1:15:38<42:52, 1.65it/s, loss=0.0036, lr=1.19e-05, step=5763] Training: 58%|█████▊ | 5764/10000 [1:15:38<42:52, 1.65it/s, loss=0.0091, lr=1.19e-05, step=5764] Training: 58%|█████▊ | 5765/10000 [1:15:39<44:38, 1.58it/s, loss=0.0091, lr=1.19e-05, step=5764] Training: 58%|█████▊ | 5765/10000 [1:15:39<44:38, 1.58it/s, loss=0.0043, lr=1.18e-05, step=5765] Training: 58%|█████▊ | 5766/10000 [1:15:40<51:20, 1.37it/s, loss=0.0043, lr=1.18e-05, step=5765] Training: 58%|█████▊ | 5766/10000 [1:15:40<51:20, 1.37it/s, loss=0.0220, lr=1.18e-05, step=5766] Training: 58%|█████▊ | 5767/10000 [1:15:40<46:44, 1.51it/s, loss=0.0220, lr=1.18e-05, step=5766] Training: 58%|█████▊ | 5767/10000 [1:15:40<46:44, 1.51it/s, loss=0.0165, lr=1.18e-05, step=5767] Training: 58%|█████▊ | 5768/10000 [1:15:41<45:56, 1.54it/s, loss=0.0165, lr=1.18e-05, step=5767] Training: 58%|█████▊ | 5768/10000 [1:15:41<45:56, 1.54it/s, loss=0.0064, lr=1.18e-05, step=5768] Training: 58%|█████▊ | 5769/10000 [1:15:41<42:34, 1.66it/s, loss=0.0064, lr=1.18e-05, step=5768] Training: 58%|█████▊ | 5769/10000 [1:15:41<42:34, 1.66it/s, loss=0.0049, lr=1.18e-05, step=5769]20:00:14.041 [I] step=5770 loss=0.1780 smoothed_loss=0.0277 lr=1.18e-05 grad_norm=0.4514 step_time=0.5171s data_time=0.1012s it/s=1.618 eta_to_10000=2615.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0301 grad_action_out_proj_arms=0.2025 grad_arm_token_fuse=0.1837 grad_shared_expert=0.5882 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5770/10000 [1:15:42<40:56, 1.72it/s, loss=0.0049, lr=1.18e-05, step=5769] Training: 58%|█████▊ | 5770/10000 [1:15:42<40:56, 1.72it/s, loss=0.1780, lr=1.18e-05, step=5770] Training: 58%|█████▊ | 5771/10000 [1:15:43<46:00, 1.53it/s, loss=0.1780, lr=1.18e-05, step=5770] Training: 58%|█████▊ | 5771/10000 [1:15:43<46:00, 1.53it/s, loss=0.0091, lr=1.18e-05, step=5771] Training: 58%|█████▊ | 5772/10000 [1:15:43<46:26, 1.52it/s, loss=0.0091, lr=1.18e-05, step=5771] Training: 58%|█████▊ | 5772/10000 [1:15:43<46:26, 1.52it/s, loss=0.0131, lr=1.18e-05, step=5772] Training: 58%|█████▊ | 5773/10000 [1:15:44<45:56, 1.53it/s, loss=0.0131, lr=1.18e-05, step=5772] Training: 58%|█████▊ | 5773/10000 [1:15:44<45:56, 1.53it/s, loss=0.0145, lr=1.18e-05, step=5773] Training: 58%|█████▊ | 5774/10000 [1:15:44<42:13, 1.67it/s, loss=0.0145, lr=1.18e-05, step=5773] Training: 58%|█████▊ | 5774/10000 [1:15:44<42:13, 1.67it/s, loss=0.0370, lr=1.18e-05, step=5774] Training: 58%|█████▊ | 5775/10000 [1:15:45<39:54, 1.76it/s, loss=0.0370, lr=1.18e-05, step=5774] Training: 58%|█████▊ | 5775/10000 [1:15:45<39:54, 1.76it/s, loss=0.0214, lr=1.18e-05, step=5775] Training: 58%|█████▊ | 5776/10000 [1:15:46<43:00, 1.64it/s, loss=0.0214, lr=1.18e-05, step=5775] Training: 58%|█████▊ | 5776/10000 [1:15:46<43:00, 1.64it/s, loss=0.0015, lr=1.18e-05, step=5776] Training: 58%|█████▊ | 5777/10000 [1:15:46<40:12, 1.75it/s, loss=0.0015, lr=1.18e-05, step=5776] Training: 58%|█████▊ | 5777/10000 [1:15:46<40:12, 1.75it/s, loss=0.0067, lr=1.18e-05, step=5777] Training: 58%|█████▊ | 5778/10000 [1:15:47<42:05, 1.67it/s, loss=0.0067, lr=1.18e-05, step=5777] Training: 58%|█████▊ | 5778/10000 [1:15:47<42:05, 1.67it/s, loss=0.0421, lr=1.18e-05, step=5778] Training: 58%|█████▊ | 5779/10000 [1:15:48<49:33, 1.42it/s, loss=0.0421, lr=1.18e-05, step=5778] Training: 58%|█████▊ | 5779/10000 [1:15:48<49:33, 1.42it/s, loss=0.0267, lr=1.18e-05, step=5779]20:00:20.472 [I] step=5780 loss=0.0150 smoothed_loss=0.0224 lr=1.18e-05 grad_norm=0.5074 step_time=0.5575s data_time=0.0857s it/s=1.555 eta_to_10000=2713.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0335 grad_action_out_proj_arms=0.1449 grad_arm_token_fuse=0.1759 grad_shared_expert=0.5257 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5780/10000 [1:15:48<45:49, 1.54it/s, loss=0.0267, lr=1.18e-05, step=5779] Training: 58%|█████▊ | 5780/10000 [1:15:48<45:49, 1.54it/s, loss=0.0150, lr=1.18e-05, step=5780] Training: 58%|█████▊ | 5781/10000 [1:15:49<47:52, 1.47it/s, loss=0.0150, lr=1.18e-05, step=5780] Training: 58%|█████▊ | 5781/10000 [1:15:49<47:52, 1.47it/s, loss=0.0023, lr=1.18e-05, step=5781] Training: 58%|█████▊ | 5782/10000 [1:15:50<47:08, 1.49it/s, loss=0.0023, lr=1.18e-05, step=5781] Training: 58%|█████▊ | 5782/10000 [1:15:50<47:08, 1.49it/s, loss=0.0060, lr=1.18e-05, step=5782] Training: 58%|█████▊ | 5783/10000 [1:15:50<43:39, 1.61it/s, loss=0.0060, lr=1.18e-05, step=5782] Training: 58%|█████▊ | 5783/10000 [1:15:50<43:39, 1.61it/s, loss=0.0019, lr=1.18e-05, step=5783] Training: 58%|█████▊ | 5784/10000 [1:15:51<40:58, 1.71it/s, loss=0.0019, lr=1.18e-05, step=5783] Training: 58%|█████▊ | 5784/10000 [1:15:51<40:58, 1.71it/s, loss=0.0035, lr=1.18e-05, step=5784] Training: 58%|█████▊ | 5785/10000 [1:15:51<38:56, 1.80it/s, loss=0.0035, lr=1.18e-05, step=5784] Training: 58%|█████▊ | 5785/10000 [1:15:51<38:56, 1.80it/s, loss=0.0040, lr=1.18e-05, step=5785] Training: 58%|█████▊ | 5786/10000 [1:15:52<43:14, 1.62it/s, loss=0.0040, lr=1.18e-05, step=5785] Training: 58%|█████▊ | 5786/10000 [1:15:52<43:14, 1.62it/s, loss=0.0230, lr=1.18e-05, step=5786] Training: 58%|█████▊ | 5787/10000 [1:15:52<40:34, 1.73it/s, loss=0.0230, lr=1.18e-05, step=5786] Training: 58%|█████▊ | 5787/10000 [1:15:52<40:34, 1.73it/s, loss=0.0046, lr=1.18e-05, step=5787] Training: 58%|█████▊ | 5788/10000 [1:15:53<41:39, 1.68it/s, loss=0.0046, lr=1.18e-05, step=5787] Training: 58%|█████▊ | 5788/10000 [1:15:53<41:39, 1.68it/s, loss=0.0057, lr=1.18e-05, step=5788] Training: 58%|█████▊ | 5789/10000 [1:15:53<39:24, 1.78it/s, loss=0.0057, lr=1.18e-05, step=5788] Training: 58%|█████▊ | 5789/10000 [1:15:53<39:24, 1.78it/s, loss=0.0354, lr=1.18e-05, step=5789]20:00:26.235 [I] step=5790 loss=0.0018 smoothed_loss=0.0144 lr=1.18e-05 grad_norm=0.3740 step_time=0.4997s data_time=0.0767s it/s=1.735 eta_to_10000=2426.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.1089 grad_arm_token_fuse=0.0526 grad_shared_expert=0.5261 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5790/10000 [1:15:54<38:24, 1.83it/s, loss=0.0354, lr=1.18e-05, step=5789] Training: 58%|█████▊ | 5790/10000 [1:15:54<38:24, 1.83it/s, loss=0.0018, lr=1.18e-05, step=5790] Training: 58%|█████▊ | 5791/10000 [1:15:54<37:08, 1.89it/s, loss=0.0018, lr=1.18e-05, step=5790] Training: 58%|█████▊ | 5791/10000 [1:15:54<37:08, 1.89it/s, loss=0.0169, lr=1.18e-05, step=5791] Training: 58%|█████▊ | 5792/10000 [1:15:55<36:16, 1.93it/s, loss=0.0169, lr=1.18e-05, step=5791] Training: 58%|█████▊ | 5792/10000 [1:15:55<36:16, 1.93it/s, loss=0.0138, lr=1.17e-05, step=5792] Training: 58%|█████▊ | 5793/10000 [1:15:56<40:51, 1.72it/s, loss=0.0138, lr=1.17e-05, step=5792] Training: 58%|█████▊ | 5793/10000 [1:15:56<40:51, 1.72it/s, loss=0.0025, lr=1.17e-05, step=5793] Training: 58%|█████▊ | 5794/10000 [1:15:56<39:28, 1.78it/s, loss=0.0025, lr=1.17e-05, step=5793] Training: 58%|█████▊ | 5794/10000 [1:15:56<39:28, 1.78it/s, loss=0.0087, lr=1.17e-05, step=5794] Training: 58%|█████▊ | 5795/10000 [1:15:57<40:56, 1.71it/s, loss=0.0087, lr=1.17e-05, step=5794] Training: 58%|█████▊ | 5795/10000 [1:15:57<40:56, 1.71it/s, loss=0.0034, lr=1.17e-05, step=5795] Training: 58%|█████▊ | 5796/10000 [1:15:57<41:20, 1.69it/s, loss=0.0034, lr=1.17e-05, step=5795] Training: 58%|█████▊ | 5796/10000 [1:15:57<41:20, 1.69it/s, loss=0.0048, lr=1.17e-05, step=5796] Training: 58%|█████▊ | 5797/10000 [1:15:58<39:04, 1.79it/s, loss=0.0048, lr=1.17e-05, step=5796] Training: 58%|█████▊ | 5797/10000 [1:15:58<39:04, 1.79it/s, loss=0.0129, lr=1.17e-05, step=5797] Training: 58%|█████▊ | 5798/10000 [1:15:59<42:15, 1.66it/s, loss=0.0129, lr=1.17e-05, step=5797] Training: 58%|█████▊ | 5798/10000 [1:15:59<42:15, 1.66it/s, loss=0.0087, lr=1.17e-05, step=5798] Training: 58%|█████▊ | 5799/10000 [1:15:59<46:13, 1.51it/s, loss=0.0087, lr=1.17e-05, step=5798] Training: 58%|█████▊ | 5799/10000 [1:15:59<46:13, 1.51it/s, loss=0.0057, lr=1.17e-05, step=5799]20:00:32.639 [I] step=5800 loss=0.0054 smoothed_loss=0.0101 lr=1.17e-05 grad_norm=0.4210 step_time=0.5296s data_time=0.1107s it/s=1.562 eta_to_10000=2688.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0089 grad_action_out_proj_arms=0.1173 grad_arm_token_fuse=0.0450 grad_shared_expert=0.3473 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5800/10000 [1:16:00<52:20, 1.34it/s, loss=0.0057, lr=1.17e-05, step=5799] Training: 58%|█████▊ | 5800/10000 [1:16:00<52:20, 1.34it/s, loss=0.0054, lr=1.17e-05, step=5800] Training: 58%|█████▊ | 5801/10000 [1:16:01<46:50, 1.49it/s, loss=0.0054, lr=1.17e-05, step=5800] Training: 58%|█████▊ | 5801/10000 [1:16:01<46:50, 1.49it/s, loss=0.0070, lr=1.17e-05, step=5801] Training: 58%|█████▊ | 5802/10000 [1:16:01<46:36, 1.50it/s, loss=0.0070, lr=1.17e-05, step=5801] Training: 58%|█████▊ | 5802/10000 [1:16:01<46:36, 1.50it/s, loss=0.0139, lr=1.17e-05, step=5802] Training: 58%|█████▊ | 5803/10000 [1:16:02<45:38, 1.53it/s, loss=0.0139, lr=1.17e-05, step=5802] Training: 58%|█████▊ | 5803/10000 [1:16:02<45:38, 1.53it/s, loss=0.0065, lr=1.17e-05, step=5803] Training: 58%|█████▊ | 5804/10000 [1:16:03<41:55, 1.67it/s, loss=0.0065, lr=1.17e-05, step=5803] Training: 58%|█████▊ | 5804/10000 [1:16:03<41:55, 1.67it/s, loss=0.0192, lr=1.17e-05, step=5804] Training: 58%|█████▊ | 5805/10000 [1:16:03<40:36, 1.72it/s, loss=0.0192, lr=1.17e-05, step=5804] Training: 58%|█████▊ | 5805/10000 [1:16:03<40:36, 1.72it/s, loss=0.0031, lr=1.17e-05, step=5805] Training: 58%|█████▊ | 5806/10000 [1:16:04<38:27, 1.82it/s, loss=0.0031, lr=1.17e-05, step=5805] Training: 58%|█████▊ | 5806/10000 [1:16:04<38:27, 1.82it/s, loss=0.0160, lr=1.17e-05, step=5806] Training: 58%|█████▊ | 5807/10000 [1:16:04<44:29, 1.57it/s, loss=0.0160, lr=1.17e-05, step=5806] Training: 58%|█████▊ | 5807/10000 [1:16:04<44:29, 1.57it/s, loss=0.0305, lr=1.17e-05, step=5807] Training: 58%|█████▊ | 5808/10000 [1:16:05<41:37, 1.68it/s, loss=0.0305, lr=1.17e-05, step=5807] Training: 58%|█████▊ | 5808/10000 [1:16:05<41:37, 1.68it/s, loss=0.0120, lr=1.17e-05, step=5808] Training: 58%|█████▊ | 5809/10000 [1:16:06<42:50, 1.63it/s, loss=0.0120, lr=1.17e-05, step=5808] Training: 58%|█████▊ | 5809/10000 [1:16:06<42:50, 1.63it/s, loss=0.0158, lr=1.17e-05, step=5809]20:00:38.403 [I] step=5810 loss=0.0172 smoothed_loss=0.0133 lr=1.17e-05 grad_norm=0.4278 step_time=0.5007s data_time=0.0757s it/s=1.735 eta_to_10000=2415.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0103 grad_action_out_proj_arms=0.1039 grad_arm_token_fuse=0.0514 grad_shared_expert=0.3280 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5810/10000 [1:16:06<40:45, 1.71it/s, loss=0.0158, lr=1.17e-05, step=5809] Training: 58%|█████▊ | 5810/10000 [1:16:06<40:45, 1.71it/s, loss=0.0172, lr=1.17e-05, step=5810] Training: 58%|█████▊ | 5811/10000 [1:16:07<38:35, 1.81it/s, loss=0.0172, lr=1.17e-05, step=5810] Training: 58%|█████▊ | 5811/10000 [1:16:07<38:35, 1.81it/s, loss=0.0035, lr=1.17e-05, step=5811] Training: 58%|█████▊ | 5812/10000 [1:16:07<39:43, 1.76it/s, loss=0.0035, lr=1.17e-05, step=5811] Training: 58%|█████▊ | 5812/10000 [1:16:07<39:43, 1.76it/s, loss=0.0039, lr=1.17e-05, step=5812] Training: 58%|█████▊ | 5813/10000 [1:16:08<38:06, 1.83it/s, loss=0.0039, lr=1.17e-05, step=5812] Training: 58%|█████▊ | 5813/10000 [1:16:08<38:06, 1.83it/s, loss=0.0013, lr=1.17e-05, step=5813] Training: 58%|█████▊ | 5814/10000 [1:16:08<40:16, 1.73it/s, loss=0.0013, lr=1.17e-05, step=5813] Training: 58%|█████▊ | 5814/10000 [1:16:08<40:16, 1.73it/s, loss=0.0502, lr=1.17e-05, step=5814] Training: 58%|█████▊ | 5815/10000 [1:16:09<43:06, 1.62it/s, loss=0.0502, lr=1.17e-05, step=5814] Training: 58%|█████▊ | 5815/10000 [1:16:09<43:06, 1.62it/s, loss=0.0151, lr=1.17e-05, step=5815] Training: 58%|█████▊ | 5816/10000 [1:16:10<43:01, 1.62it/s, loss=0.0151, lr=1.17e-05, step=5815] Training: 58%|█████▊ | 5816/10000 [1:16:10<43:01, 1.62it/s, loss=0.0046, lr=1.17e-05, step=5816] Training: 58%|█████▊ | 5817/10000 [1:16:10<42:11, 1.65it/s, loss=0.0046, lr=1.17e-05, step=5816] Training: 58%|█████▊ | 5817/10000 [1:16:10<42:11, 1.65it/s, loss=0.0045, lr=1.17e-05, step=5817] Training: 58%|█████▊ | 5818/10000 [1:16:11<40:29, 1.72it/s, loss=0.0045, lr=1.17e-05, step=5817] Training: 58%|█████▊ | 5818/10000 [1:16:11<40:29, 1.72it/s, loss=0.0058, lr=1.17e-05, step=5818] Training: 58%|█████▊ | 5819/10000 [1:16:11<38:25, 1.81it/s, loss=0.0058, lr=1.17e-05, step=5818] Training: 58%|█████▊ | 5819/10000 [1:16:11<38:25, 1.81it/s, loss=0.0050, lr=1.16e-05, step=5819]20:00:44.052 [I] step=5820 loss=0.0079 smoothed_loss=0.0109 lr=1.17e-05 grad_norm=0.4509 step_time=0.4990s data_time=0.0658s it/s=1.771 eta_to_10000=2360.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0133 grad_action_out_proj_arms=0.1119 grad_arm_token_fuse=0.0658 grad_shared_expert=0.2962 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5820/10000 [1:16:12<37:27, 1.86it/s, loss=0.0050, lr=1.16e-05, step=5819] Training: 58%|█████▊ | 5820/10000 [1:16:12<37:27, 1.86it/s, loss=0.0079, lr=1.16e-05, step=5820] Training: 58%|█████▊ | 5821/10000 [1:16:12<39:52, 1.75it/s, loss=0.0079, lr=1.16e-05, step=5820] Training: 58%|█████▊ | 5821/10000 [1:16:12<39:52, 1.75it/s, loss=0.0662, lr=1.16e-05, step=5821] Training: 58%|█████▊ | 5822/10000 [1:16:13<42:02, 1.66it/s, loss=0.0662, lr=1.16e-05, step=5821] Training: 58%|█████▊ | 5822/10000 [1:16:13<42:02, 1.66it/s, loss=0.0218, lr=1.16e-05, step=5822] Training: 58%|█████▊ | 5823/10000 [1:16:14<42:54, 1.62it/s, loss=0.0218, lr=1.16e-05, step=5822] Training: 58%|█████▊ | 5823/10000 [1:16:14<42:54, 1.62it/s, loss=0.0035, lr=1.16e-05, step=5823] Training: 58%|█████▊ | 5824/10000 [1:16:14<39:59, 1.74it/s, loss=0.0035, lr=1.16e-05, step=5823] Training: 58%|█████▊ | 5824/10000 [1:16:14<39:59, 1.74it/s, loss=0.0051, lr=1.16e-05, step=5824] Training: 58%|█████▊ | 5825/10000 [1:16:15<38:11, 1.82it/s, loss=0.0051, lr=1.16e-05, step=5824] Training: 58%|█████▊ | 5825/10000 [1:16:15<38:11, 1.82it/s, loss=0.0094, lr=1.16e-05, step=5825] Training: 58%|█████▊ | 5826/10000 [1:16:15<36:48, 1.89it/s, loss=0.0094, lr=1.16e-05, step=5825] Training: 58%|█████▊ | 5826/10000 [1:16:15<36:48, 1.89it/s, loss=0.0064, lr=1.16e-05, step=5826] Training: 58%|█████▊ | 5827/10000 [1:16:16<35:50, 1.94it/s, loss=0.0064, lr=1.16e-05, step=5826] Training: 58%|█████▊ | 5827/10000 [1:16:16<35:50, 1.94it/s, loss=0.0072, lr=1.16e-05, step=5827] Training: 58%|█████▊ | 5828/10000 [1:16:16<40:08, 1.73it/s, loss=0.0072, lr=1.16e-05, step=5827] Training: 58%|█████▊ | 5828/10000 [1:16:16<40:08, 1.73it/s, loss=0.0056, lr=1.16e-05, step=5828] Training: 58%|█████▊ | 5829/10000 [1:16:17<48:06, 1.45it/s, loss=0.0056, lr=1.16e-05, step=5828] Training: 58%|█████▊ | 5829/10000 [1:16:17<48:06, 1.45it/s, loss=0.0174, lr=1.16e-05, step=5829]20:00:50.170 [I] step=5830 loss=0.0235 smoothed_loss=0.0136 lr=1.16e-05 grad_norm=0.3992 step_time=0.5262s data_time=0.0857s it/s=1.635 eta_to_10000=2551.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0146 grad_action_out_proj_arms=0.1526 grad_arm_token_fuse=0.0739 grad_shared_expert=0.4438 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5830/10000 [1:16:18<44:42, 1.55it/s, loss=0.0174, lr=1.16e-05, step=5829] Training: 58%|█████▊ | 5830/10000 [1:16:18<44:42, 1.55it/s, loss=0.0235, lr=1.16e-05, step=5830] Training: 58%|█████▊ | 5831/10000 [1:16:18<44:57, 1.55it/s, loss=0.0235, lr=1.16e-05, step=5830] Training: 58%|█████▊ | 5831/10000 [1:16:18<44:57, 1.55it/s, loss=0.0128, lr=1.16e-05, step=5831] Training: 58%|█████▊ | 5832/10000 [1:16:19<41:33, 1.67it/s, loss=0.0128, lr=1.16e-05, step=5831] Training: 58%|█████▊ | 5832/10000 [1:16:19<41:33, 1.67it/s, loss=0.0216, lr=1.16e-05, step=5832] Training: 58%|█████▊ | 5833/10000 [1:16:19<39:09, 1.77it/s, loss=0.0216, lr=1.16e-05, step=5832] Training: 58%|█████▊ | 5833/10000 [1:16:19<39:09, 1.77it/s, loss=0.0217, lr=1.16e-05, step=5833] Training: 58%|█████▊ | 5834/10000 [1:16:20<37:25, 1.85it/s, loss=0.0217, lr=1.16e-05, step=5833] Training: 58%|█████▊ | 5834/10000 [1:16:20<37:25, 1.85it/s, loss=0.0057, lr=1.16e-05, step=5834] Training: 58%|█████▊ | 5835/10000 [1:16:20<36:26, 1.91it/s, loss=0.0057, lr=1.16e-05, step=5834] Training: 58%|█████▊ | 5835/10000 [1:16:20<36:26, 1.91it/s, loss=0.0165, lr=1.16e-05, step=5835] Training: 58%|█████▊ | 5836/10000 [1:16:21<40:28, 1.71it/s, loss=0.0165, lr=1.16e-05, step=5835] Training: 58%|█████▊ | 5836/10000 [1:16:21<40:28, 1.71it/s, loss=0.0132, lr=1.16e-05, step=5836] Training: 58%|█████▊ | 5837/10000 [1:16:22<44:51, 1.55it/s, loss=0.0132, lr=1.16e-05, step=5836] Training: 58%|█████▊ | 5837/10000 [1:16:22<44:51, 1.55it/s, loss=0.0086, lr=1.16e-05, step=5837] Training: 58%|█████▊ | 5838/10000 [1:16:23<44:55, 1.54it/s, loss=0.0086, lr=1.16e-05, step=5837] Training: 58%|█████▊ | 5838/10000 [1:16:23<44:55, 1.54it/s, loss=0.0121, lr=1.16e-05, step=5838] Training: 58%|█████▊ | 5839/10000 [1:16:23<41:53, 1.66it/s, loss=0.0121, lr=1.16e-05, step=5838] Training: 58%|█████▊ | 5839/10000 [1:16:23<41:53, 1.66it/s, loss=0.0037, lr=1.16e-05, step=5839]20:00:55.958 [I] step=5840 loss=0.0153 smoothed_loss=0.0128 lr=1.16e-05 grad_norm=0.4492 step_time=0.4988s data_time=0.0800s it/s=1.728 eta_to_10000=2407.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0160 grad_action_out_proj_arms=0.1334 grad_arm_token_fuse=0.0821 grad_shared_expert=0.4558 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5840/10000 [1:16:24<40:13, 1.72it/s, loss=0.0037, lr=1.16e-05, step=5839] Training: 58%|█████▊ | 5840/10000 [1:16:24<40:13, 1.72it/s, loss=0.0153, lr=1.16e-05, step=5840] Training: 58%|█████▊ | 5841/10000 [1:16:24<44:54, 1.54it/s, loss=0.0153, lr=1.16e-05, step=5840] Training: 58%|█████▊ | 5841/10000 [1:16:24<44:54, 1.54it/s, loss=0.0016, lr=1.16e-05, step=5841] Training: 58%|█████▊ | 5842/10000 [1:16:25<41:26, 1.67it/s, loss=0.0016, lr=1.16e-05, step=5841] Training: 58%|█████▊ | 5842/10000 [1:16:25<41:26, 1.67it/s, loss=0.0185, lr=1.16e-05, step=5842] Training: 58%|█████▊ | 5843/10000 [1:16:26<44:07, 1.57it/s, loss=0.0185, lr=1.16e-05, step=5842] Training: 58%|█████▊ | 5843/10000 [1:16:26<44:07, 1.57it/s, loss=0.0027, lr=1.16e-05, step=5843] Training: 58%|█████▊ | 5844/10000 [1:16:27<50:09, 1.38it/s, loss=0.0027, lr=1.16e-05, step=5843] Training: 58%|█████▊ | 5844/10000 [1:16:27<50:09, 1.38it/s, loss=0.0116, lr=1.16e-05, step=5844] Training: 58%|█████▊ | 5845/10000 [1:16:27<51:14, 1.35it/s, loss=0.0116, lr=1.16e-05, step=5844] Training: 58%|█████▊ | 5845/10000 [1:16:27<51:14, 1.35it/s, loss=0.0040, lr=1.16e-05, step=5845] Training: 58%|█████▊ | 5846/10000 [1:16:28<56:21, 1.23it/s, loss=0.0040, lr=1.16e-05, step=5845] Training: 58%|█████▊ | 5846/10000 [1:16:28<56:21, 1.23it/s, loss=0.0179, lr=1.16e-05, step=5846] Training: 58%|█████▊ | 5847/10000 [1:16:29<49:52, 1.39it/s, loss=0.0179, lr=1.16e-05, step=5846] Training: 58%|█████▊ | 5847/10000 [1:16:29<49:52, 1.39it/s, loss=0.0224, lr=1.15e-05, step=5847] Training: 58%|█████▊ | 5848/10000 [1:16:29<45:11, 1.53it/s, loss=0.0224, lr=1.15e-05, step=5847] Training: 58%|█████▊ | 5848/10000 [1:16:29<45:11, 1.53it/s, loss=0.0065, lr=1.15e-05, step=5848] Training: 58%|█████▊ | 5849/10000 [1:16:30<41:51, 1.65it/s, loss=0.0065, lr=1.15e-05, step=5848] Training: 58%|█████▊ | 5849/10000 [1:16:30<41:51, 1.65it/s, loss=0.0110, lr=1.15e-05, step=5849]20:01:02.924 [I] step=5850 loss=0.0042 smoothed_loss=0.0110 lr=1.16e-05 grad_norm=0.4350 step_time=0.5634s data_time=0.1331s it/s=1.436 eta_to_10000=2890.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1256 grad_arm_token_fuse=0.0766 grad_shared_expert=0.4253 (18633:train_pytorch.py:850) + Training: 58%|█████▊ | 5850/10000 [1:16:31<45:13, 1.53it/s, loss=0.0110, lr=1.15e-05, step=5849] Training: 58%|█████▊ | 5850/10000 [1:16:31<45:13, 1.53it/s, loss=0.0042, lr=1.15e-05, step=5850] Training: 59%|█████▊ | 5851/10000 [1:16:31<41:54, 1.65it/s, loss=0.0042, lr=1.15e-05, step=5850] Training: 59%|█████▊ | 5851/10000 [1:16:31<41:54, 1.65it/s, loss=0.0076, lr=1.15e-05, step=5851] Training: 59%|█████▊ | 5852/10000 [1:16:32<39:26, 1.75it/s, loss=0.0076, lr=1.15e-05, step=5851] Training: 59%|█████▊ | 5852/10000 [1:16:32<39:26, 1.75it/s, loss=0.0197, lr=1.15e-05, step=5852] Training: 59%|█████▊ | 5853/10000 [1:16:32<44:06, 1.57it/s, loss=0.0197, lr=1.15e-05, step=5852] Training: 59%|█████▊ | 5853/10000 [1:16:32<44:06, 1.57it/s, loss=0.0017, lr=1.15e-05, step=5853] Training: 59%|█████▊ | 5854/10000 [1:16:33<44:20, 1.56it/s, loss=0.0017, lr=1.15e-05, step=5853] Training: 59%|█████▊ | 5854/10000 [1:16:33<44:20, 1.56it/s, loss=0.0025, lr=1.15e-05, step=5854] Training: 59%|█████▊ | 5855/10000 [1:16:34<41:25, 1.67it/s, loss=0.0025, lr=1.15e-05, step=5854] Training: 59%|█████▊ | 5855/10000 [1:16:34<41:25, 1.67it/s, loss=0.0050, lr=1.15e-05, step=5855] Training: 59%|█████▊ | 5856/10000 [1:16:34<39:49, 1.73it/s, loss=0.0050, lr=1.15e-05, step=5855] Training: 59%|█████▊ | 5856/10000 [1:16:34<39:49, 1.73it/s, loss=0.0103, lr=1.15e-05, step=5856] Training: 59%|█████▊ | 5857/10000 [1:16:35<42:23, 1.63it/s, loss=0.0103, lr=1.15e-05, step=5856] Training: 59%|█████▊ | 5857/10000 [1:16:35<42:23, 1.63it/s, loss=0.0183, lr=1.15e-05, step=5857] Training: 59%|█████▊ | 5858/10000 [1:16:35<40:18, 1.71it/s, loss=0.0183, lr=1.15e-05, step=5857] Training: 59%|█████▊ | 5858/10000 [1:16:35<40:18, 1.71it/s, loss=0.0031, lr=1.15e-05, step=5858] Training: 59%|█████▊ | 5859/10000 [1:16:36<38:29, 1.79it/s, loss=0.0031, lr=1.15e-05, step=5858] Training: 59%|█████▊ | 5859/10000 [1:16:36<38:29, 1.79it/s, loss=0.0056, lr=1.15e-05, step=5859]20:01:08.616 [I] step=5860 loss=0.0168 smoothed_loss=0.0100 lr=1.15e-05 grad_norm=0.4619 step_time=0.4850s data_time=0.0842s it/s=1.757 eta_to_10000=2356.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0247 grad_action_out_proj_arms=0.1798 grad_arm_token_fuse=0.1276 grad_shared_expert=0.6864 (18633:train_pytorch.py:850) + Training: 59%|█████▊ | 5860/10000 [1:16:36<37:51, 1.82it/s, loss=0.0056, lr=1.15e-05, step=5859] Training: 59%|█████▊ | 5860/10000 [1:16:36<37:51, 1.82it/s, loss=0.0168, lr=1.15e-05, step=5860] Training: 59%|█████▊ | 5861/10000 [1:16:37<45:41, 1.51it/s, loss=0.0168, lr=1.15e-05, step=5860] Training: 59%|█████▊ | 5861/10000 [1:16:37<45:41, 1.51it/s, loss=0.0223, lr=1.15e-05, step=5861] Training: 59%|█████▊ | 5862/10000 [1:16:38<42:45, 1.61it/s, loss=0.0223, lr=1.15e-05, step=5861] Training: 59%|█████▊ | 5862/10000 [1:16:38<42:45, 1.61it/s, loss=0.0052, lr=1.15e-05, step=5862] Training: 59%|█████▊ | 5863/10000 [1:16:38<40:17, 1.71it/s, loss=0.0052, lr=1.15e-05, step=5862] Training: 59%|█████▊ | 5863/10000 [1:16:38<40:17, 1.71it/s, loss=0.0211, lr=1.15e-05, step=5863] Training: 59%|█████▊ | 5864/10000 [1:16:39<47:29, 1.45it/s, loss=0.0211, lr=1.15e-05, step=5863] Training: 59%|█████▊ | 5864/10000 [1:16:39<47:29, 1.45it/s, loss=0.0041, lr=1.15e-05, step=5864] Training: 59%|█████▊ | 5865/10000 [1:16:40<48:47, 1.41it/s, loss=0.0041, lr=1.15e-05, step=5864] Training: 59%|█████▊ | 5865/10000 [1:16:40<48:47, 1.41it/s, loss=0.0256, lr=1.15e-05, step=5865] Training: 59%|█████▊ | 5866/10000 [1:16:40<45:05, 1.53it/s, loss=0.0256, lr=1.15e-05, step=5865] Training: 59%|█████▊ | 5866/10000 [1:16:40<45:05, 1.53it/s, loss=0.0059, lr=1.15e-05, step=5866] Training: 59%|█████▊ | 5867/10000 [1:16:41<41:56, 1.64it/s, loss=0.0059, lr=1.15e-05, step=5866] Training: 59%|█████▊ | 5867/10000 [1:16:41<41:56, 1.64it/s, loss=0.0019, lr=1.15e-05, step=5867] Training: 59%|█████▊ | 5868/10000 [1:16:42<43:15, 1.59it/s, loss=0.0019, lr=1.15e-05, step=5867] Training: 59%|█████▊ | 5868/10000 [1:16:42<43:15, 1.59it/s, loss=0.0048, lr=1.15e-05, step=5868] Training: 59%|█████▊ | 5869/10000 [1:16:42<41:58, 1.64it/s, loss=0.0048, lr=1.15e-05, step=5868] Training: 59%|█████▊ | 5869/10000 [1:16:42<41:58, 1.64it/s, loss=0.0206, lr=1.15e-05, step=5869]20:01:15.050 [I] step=5870 loss=0.0122 smoothed_loss=0.0113 lr=1.15e-05 grad_norm=0.4414 step_time=0.5485s data_time=0.0949s it/s=1.554 eta_to_10000=2657.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0212 grad_action_out_proj_arms=0.1664 grad_arm_token_fuse=0.1207 grad_shared_expert=0.3676 (18633:train_pytorch.py:850) + Training: 59%|█████▊ | 5870/10000 [1:16:43<40:15, 1.71it/s, loss=0.0206, lr=1.15e-05, step=5869] Training: 59%|█████▊ | 5870/10000 [1:16:43<40:15, 1.71it/s, loss=0.0122, lr=1.15e-05, step=5870] Training: 59%|█████▊ | 5871/10000 [1:16:43<41:36, 1.65it/s, loss=0.0122, lr=1.15e-05, step=5870] Training: 59%|█████▊ | 5871/10000 [1:16:43<41:36, 1.65it/s, loss=0.0020, lr=1.15e-05, step=5871] Training: 59%|█████▊ | 5872/10000 [1:16:44<45:02, 1.53it/s, loss=0.0020, lr=1.15e-05, step=5871] Training: 59%|█████▊ | 5872/10000 [1:16:44<45:02, 1.53it/s, loss=0.0442, lr=1.15e-05, step=5872] Training: 59%|█████▊ | 5873/10000 [1:16:45<41:39, 1.65it/s, loss=0.0442, lr=1.15e-05, step=5872] Training: 59%|█████▊ | 5873/10000 [1:16:45<41:39, 1.65it/s, loss=0.0175, lr=1.15e-05, step=5873] Training: 59%|█████▊ | 5874/10000 [1:16:45<39:16, 1.75it/s, loss=0.0175, lr=1.15e-05, step=5873] Training: 59%|█████▊ | 5874/10000 [1:16:45<39:16, 1.75it/s, loss=0.0077, lr=1.14e-05, step=5874] Training: 59%|█████▉ | 5875/10000 [1:16:46<41:01, 1.68it/s, loss=0.0077, lr=1.14e-05, step=5874] Training: 59%|█████▉ | 5875/10000 [1:16:46<41:01, 1.68it/s, loss=0.0032, lr=1.14e-05, step=5875] Training: 59%|█████▉ | 5876/10000 [1:16:46<40:52, 1.68it/s, loss=0.0032, lr=1.14e-05, step=5875] Training: 59%|█████▉ | 5876/10000 [1:16:46<40:52, 1.68it/s, loss=0.0022, lr=1.14e-05, step=5876] Training: 59%|█████▉ | 5877/10000 [1:16:47<38:51, 1.77it/s, loss=0.0022, lr=1.14e-05, step=5876] Training: 59%|█████▉ | 5877/10000 [1:16:47<38:51, 1.77it/s, loss=0.0208, lr=1.14e-05, step=5877] Training: 59%|█████▉ | 5878/10000 [1:16:48<44:50, 1.53it/s, loss=0.0208, lr=1.14e-05, step=5877] Training: 59%|█████▉ | 5878/10000 [1:16:48<44:50, 1.53it/s, loss=0.0727, lr=1.14e-05, step=5878] Training: 59%|█████▉ | 5879/10000 [1:16:48<47:10, 1.46it/s, loss=0.0727, lr=1.14e-05, step=5878] Training: 59%|█████▉ | 5879/10000 [1:16:48<47:10, 1.46it/s, loss=0.0044, lr=1.14e-05, step=5879]20:01:21.647 [I] step=5880 loss=0.0250 smoothed_loss=0.0178 lr=1.14e-05 grad_norm=0.4537 step_time=0.5678s data_time=0.0919s it/s=1.516 eta_to_10000=2717.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.1127 grad_arm_token_fuse=0.0564 grad_shared_expert=0.6658 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5880/10000 [1:16:49<50:03, 1.37it/s, loss=0.0044, lr=1.14e-05, step=5879] Training: 59%|█████▉ | 5880/10000 [1:16:49<50:03, 1.37it/s, loss=0.0250, lr=1.14e-05, step=5880] Training: 59%|█████▉ | 5881/10000 [1:16:50<45:07, 1.52it/s, loss=0.0250, lr=1.14e-05, step=5880] Training: 59%|█████▉ | 5881/10000 [1:16:50<45:07, 1.52it/s, loss=0.0033, lr=1.14e-05, step=5881] Training: 59%|█████▉ | 5882/10000 [1:16:51<47:59, 1.43it/s, loss=0.0033, lr=1.14e-05, step=5881] Training: 59%|█████▉ | 5882/10000 [1:16:51<47:59, 1.43it/s, loss=0.0035, lr=1.14e-05, step=5882] Training: 59%|█████▉ | 5883/10000 [1:16:51<43:44, 1.57it/s, loss=0.0035, lr=1.14e-05, step=5882] Training: 59%|█████▉ | 5883/10000 [1:16:51<43:44, 1.57it/s, loss=0.0146, lr=1.14e-05, step=5883] Training: 59%|█████▉ | 5884/10000 [1:16:52<42:04, 1.63it/s, loss=0.0146, lr=1.14e-05, step=5883] Training: 59%|█████▉ | 5884/10000 [1:16:52<42:04, 1.63it/s, loss=0.0210, lr=1.14e-05, step=5884] Training: 59%|█████▉ | 5885/10000 [1:16:52<42:48, 1.60it/s, loss=0.0210, lr=1.14e-05, step=5884] Training: 59%|█████▉ | 5885/10000 [1:16:52<42:48, 1.60it/s, loss=0.0074, lr=1.14e-05, step=5885] Training: 59%|█████▉ | 5886/10000 [1:16:53<45:00, 1.52it/s, loss=0.0074, lr=1.14e-05, step=5885] Training: 59%|█████▉ | 5886/10000 [1:16:53<45:00, 1.52it/s, loss=0.0017, lr=1.14e-05, step=5886] Training: 59%|█████▉ | 5887/10000 [1:16:54<41:52, 1.64it/s, loss=0.0017, lr=1.14e-05, step=5886] Training: 59%|█████▉ | 5887/10000 [1:16:54<41:52, 1.64it/s, loss=0.0022, lr=1.14e-05, step=5887] Training: 59%|█████▉ | 5888/10000 [1:16:54<39:51, 1.72it/s, loss=0.0022, lr=1.14e-05, step=5887] Training: 59%|█████▉ | 5888/10000 [1:16:54<39:51, 1.72it/s, loss=0.0125, lr=1.14e-05, step=5888] Training: 59%|█████▉ | 5889/10000 [1:16:55<40:52, 1.68it/s, loss=0.0125, lr=1.14e-05, step=5888] Training: 59%|█████▉ | 5889/10000 [1:16:55<40:52, 1.68it/s, loss=0.0072, lr=1.14e-05, step=5889]20:01:27.552 [I] step=5890 loss=0.0038 smoothed_loss=0.0110 lr=1.14e-05 grad_norm=0.4482 step_time=0.5097s data_time=0.0808s it/s=1.694 eta_to_10000=2426.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0139 grad_action_out_proj_arms=0.1127 grad_arm_token_fuse=0.0703 grad_shared_expert=0.3670 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5890/10000 [1:16:55<39:38, 1.73it/s, loss=0.0072, lr=1.14e-05, step=5889] Training: 59%|█████▉ | 5890/10000 [1:16:55<39:38, 1.73it/s, loss=0.0038, lr=1.14e-05, step=5890] Training: 59%|█████▉ | 5891/10000 [1:16:56<37:55, 1.81it/s, loss=0.0038, lr=1.14e-05, step=5890] Training: 59%|█████▉ | 5891/10000 [1:16:56<37:55, 1.81it/s, loss=0.0351, lr=1.14e-05, step=5891] Training: 59%|█████▉ | 5892/10000 [1:16:56<36:34, 1.87it/s, loss=0.0351, lr=1.14e-05, step=5891] Training: 59%|█████▉ | 5892/10000 [1:16:56<36:34, 1.87it/s, loss=0.0045, lr=1.14e-05, step=5892] Training: 59%|█████▉ | 5893/10000 [1:16:57<40:36, 1.69it/s, loss=0.0045, lr=1.14e-05, step=5892] Training: 59%|█████▉ | 5893/10000 [1:16:57<40:36, 1.69it/s, loss=0.0112, lr=1.14e-05, step=5893] Training: 59%|█████▉ | 5894/10000 [1:16:57<38:53, 1.76it/s, loss=0.0112, lr=1.14e-05, step=5893] Training: 59%|█████▉ | 5894/10000 [1:16:57<38:53, 1.76it/s, loss=0.0016, lr=1.14e-05, step=5894] Training: 59%|█████▉ | 5895/10000 [1:16:58<37:31, 1.82it/s, loss=0.0016, lr=1.14e-05, step=5894] Training: 59%|█████▉ | 5895/10000 [1:16:58<37:31, 1.82it/s, loss=0.0094, lr=1.14e-05, step=5895] Training: 59%|█████▉ | 5896/10000 [1:16:59<39:48, 1.72it/s, loss=0.0094, lr=1.14e-05, step=5895] Training: 59%|█████▉ | 5896/10000 [1:16:59<39:48, 1.72it/s, loss=0.0120, lr=1.14e-05, step=5896] Training: 59%|█████▉ | 5897/10000 [1:16:59<38:18, 1.79it/s, loss=0.0120, lr=1.14e-05, step=5896] Training: 59%|█████▉ | 5897/10000 [1:16:59<38:18, 1.79it/s, loss=0.0412, lr=1.14e-05, step=5897] Training: 59%|█████▉ | 5898/10000 [1:17:00<37:05, 1.84it/s, loss=0.0412, lr=1.14e-05, step=5897] Training: 59%|█████▉ | 5898/10000 [1:17:00<37:05, 1.84it/s, loss=0.0066, lr=1.14e-05, step=5898] Training: 59%|█████▉ | 5899/10000 [1:17:00<36:08, 1.89it/s, loss=0.0066, lr=1.14e-05, step=5898] Training: 59%|█████▉ | 5899/10000 [1:17:00<36:08, 1.89it/s, loss=0.0028, lr=1.14e-05, step=5899]20:01:33.189 [I] step=5900 loss=0.0112 smoothed_loss=0.0123 lr=1.14e-05 grad_norm=0.5233 step_time=0.4995s data_time=0.0641s it/s=1.775 eta_to_10000=2310.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0131 grad_action_out_proj_arms=0.1165 grad_arm_token_fuse=0.0642 grad_shared_expert=0.3475 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5900/10000 [1:17:01<40:29, 1.69it/s, loss=0.0028, lr=1.14e-05, step=5899] Training: 59%|█████▉ | 5900/10000 [1:17:01<40:29, 1.69it/s, loss=0.0112, lr=1.14e-05, step=5900] Training: 59%|█████▉ | 5901/10000 [1:17:01<39:17, 1.74it/s, loss=0.0112, lr=1.14e-05, step=5900] Training: 59%|█████▉ | 5901/10000 [1:17:01<39:17, 1.74it/s, loss=0.0059, lr=1.14e-05, step=5901] Training: 59%|█████▉ | 5902/10000 [1:17:02<37:38, 1.81it/s, loss=0.0059, lr=1.14e-05, step=5901] Training: 59%|█████▉ | 5902/10000 [1:17:02<37:38, 1.81it/s, loss=0.0103, lr=1.13e-05, step=5902] Training: 59%|█████▉ | 5903/10000 [1:17:03<39:46, 1.72it/s, loss=0.0103, lr=1.13e-05, step=5902] Training: 59%|█████▉ | 5903/10000 [1:17:03<39:46, 1.72it/s, loss=0.0147, lr=1.13e-05, step=5903] Training: 59%|█████▉ | 5904/10000 [1:17:03<38:19, 1.78it/s, loss=0.0147, lr=1.13e-05, step=5903] Training: 59%|█████▉ | 5904/10000 [1:17:03<38:19, 1.78it/s, loss=0.0044, lr=1.13e-05, step=5904] Training: 59%|█████▉ | 5905/10000 [1:17:04<36:50, 1.85it/s, loss=0.0044, lr=1.13e-05, step=5904] Training: 59%|█████▉ | 5905/10000 [1:17:04<36:50, 1.85it/s, loss=0.0050, lr=1.13e-05, step=5905] Training: 59%|█████▉ | 5906/10000 [1:17:04<35:45, 1.91it/s, loss=0.0050, lr=1.13e-05, step=5905] Training: 59%|█████▉ | 5906/10000 [1:17:04<35:45, 1.91it/s, loss=0.0148, lr=1.13e-05, step=5906] Training: 59%|█████▉ | 5907/10000 [1:17:05<42:32, 1.60it/s, loss=0.0148, lr=1.13e-05, step=5906] Training: 59%|█████▉ | 5907/10000 [1:17:05<42:32, 1.60it/s, loss=0.0358, lr=1.13e-05, step=5907] Training: 59%|█████▉ | 5908/10000 [1:17:05<39:26, 1.73it/s, loss=0.0358, lr=1.13e-05, step=5907] Training: 59%|█████▉ | 5908/10000 [1:17:05<39:26, 1.73it/s, loss=0.0345, lr=1.13e-05, step=5908] Training: 59%|█████▉ | 5909/10000 [1:17:06<37:34, 1.81it/s, loss=0.0345, lr=1.13e-05, step=5908] Training: 59%|█████▉ | 5909/10000 [1:17:06<37:34, 1.81it/s, loss=0.0019, lr=1.13e-05, step=5909]20:01:38.689 [I] step=5910 loss=0.0152 smoothed_loss=0.0143 lr=1.13e-05 grad_norm=0.4740 step_time=0.4807s data_time=0.0693s it/s=1.819 eta_to_10000=2249.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0151 grad_action_out_proj_arms=0.1285 grad_arm_token_fuse=0.0792 grad_shared_expert=0.4103 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5910/10000 [1:17:06<36:42, 1.86it/s, loss=0.0019, lr=1.13e-05, step=5909] Training: 59%|█████▉ | 5910/10000 [1:17:06<36:42, 1.86it/s, loss=0.0152, lr=1.13e-05, step=5910] Training: 59%|█████▉ | 5911/10000 [1:17:07<39:08, 1.74it/s, loss=0.0152, lr=1.13e-05, step=5910] Training: 59%|█████▉ | 5911/10000 [1:17:07<39:08, 1.74it/s, loss=0.0136, lr=1.13e-05, step=5911] Training: 59%|█████▉ | 5912/10000 [1:17:07<37:13, 1.83it/s, loss=0.0136, lr=1.13e-05, step=5911] Training: 59%|█████▉ | 5912/10000 [1:17:07<37:13, 1.83it/s, loss=0.0136, lr=1.13e-05, step=5912] Training: 59%|█████▉ | 5913/10000 [1:17:08<35:54, 1.90it/s, loss=0.0136, lr=1.13e-05, step=5912] Training: 59%|█████▉ | 5913/10000 [1:17:08<35:54, 1.90it/s, loss=0.0129, lr=1.13e-05, step=5913] Training: 59%|█████▉ | 5914/10000 [1:17:09<38:31, 1.77it/s, loss=0.0129, lr=1.13e-05, step=5913] Training: 59%|█████▉ | 5914/10000 [1:17:09<38:31, 1.77it/s, loss=0.0275, lr=1.13e-05, step=5914] Training: 59%|█████▉ | 5915/10000 [1:17:09<41:22, 1.65it/s, loss=0.0275, lr=1.13e-05, step=5914] Training: 59%|█████▉ | 5915/10000 [1:17:09<41:22, 1.65it/s, loss=0.0015, lr=1.13e-05, step=5915] Training: 59%|█████▉ | 5916/10000 [1:17:10<39:14, 1.73it/s, loss=0.0015, lr=1.13e-05, step=5915] Training: 59%|█████▉ | 5916/10000 [1:17:10<39:14, 1.73it/s, loss=0.0039, lr=1.13e-05, step=5916] Training: 59%|█████▉ | 5917/10000 [1:17:10<40:21, 1.69it/s, loss=0.0039, lr=1.13e-05, step=5916] Training: 59%|█████▉ | 5917/10000 [1:17:10<40:21, 1.69it/s, loss=0.0369, lr=1.13e-05, step=5917] Training: 59%|█████▉ | 5918/10000 [1:17:11<41:57, 1.62it/s, loss=0.0369, lr=1.13e-05, step=5917] Training: 59%|█████▉ | 5918/10000 [1:17:11<41:57, 1.62it/s, loss=0.0092, lr=1.13e-05, step=5918] Training: 59%|█████▉ | 5919/10000 [1:17:12<39:53, 1.70it/s, loss=0.0092, lr=1.13e-05, step=5918] Training: 59%|█████▉ | 5919/10000 [1:17:12<39:53, 1.70it/s, loss=0.0323, lr=1.13e-05, step=5919]20:01:44.530 [I] step=5920 loss=0.0197 smoothed_loss=0.0168 lr=1.13e-05 grad_norm=0.4990 step_time=0.5064s data_time=0.0777s it/s=1.712 eta_to_10000=2382.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0110 grad_action_out_proj_arms=0.1259 grad_arm_token_fuse=0.0592 grad_shared_expert=0.6252 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5920/10000 [1:17:12<38:50, 1.75it/s, loss=0.0323, lr=1.13e-05, step=5919] Training: 59%|█████▉ | 5920/10000 [1:17:12<38:50, 1.75it/s, loss=0.0197, lr=1.13e-05, step=5920] Training: 59%|█████▉ | 5921/10000 [1:17:13<41:01, 1.66it/s, loss=0.0197, lr=1.13e-05, step=5920] Training: 59%|█████▉ | 5921/10000 [1:17:13<41:01, 1.66it/s, loss=0.0147, lr=1.13e-05, step=5921] Training: 59%|█████▉ | 5922/10000 [1:17:14<42:36, 1.60it/s, loss=0.0147, lr=1.13e-05, step=5921] Training: 59%|█████▉ | 5922/10000 [1:17:14<42:36, 1.60it/s, loss=0.0016, lr=1.13e-05, step=5922] Training: 59%|█████▉ | 5923/10000 [1:17:14<45:22, 1.50it/s, loss=0.0016, lr=1.13e-05, step=5922] Training: 59%|█████▉ | 5923/10000 [1:17:14<45:22, 1.50it/s, loss=0.0083, lr=1.13e-05, step=5923] Training: 59%|█████▉ | 5924/10000 [1:17:15<42:36, 1.59it/s, loss=0.0083, lr=1.13e-05, step=5923] Training: 59%|█████▉ | 5924/10000 [1:17:15<42:36, 1.59it/s, loss=0.0242, lr=1.13e-05, step=5924] Training: 59%|█████▉ | 5925/10000 [1:17:15<39:40, 1.71it/s, loss=0.0242, lr=1.13e-05, step=5924] Training: 59%|█████▉ | 5925/10000 [1:17:15<39:40, 1.71it/s, loss=0.0108, lr=1.13e-05, step=5925] Training: 59%|█████▉ | 5926/10000 [1:17:16<41:03, 1.65it/s, loss=0.0108, lr=1.13e-05, step=5925] Training: 59%|█████▉ | 5926/10000 [1:17:16<41:03, 1.65it/s, loss=0.0230, lr=1.13e-05, step=5926] Training: 59%|█████▉ | 5927/10000 [1:17:17<39:49, 1.70it/s, loss=0.0230, lr=1.13e-05, step=5926] Training: 59%|█████▉ | 5927/10000 [1:17:17<39:49, 1.70it/s, loss=0.0056, lr=1.13e-05, step=5927] Training: 59%|█████▉ | 5928/10000 [1:17:17<41:51, 1.62it/s, loss=0.0056, lr=1.13e-05, step=5927] Training: 59%|█████▉ | 5928/10000 [1:17:17<41:51, 1.62it/s, loss=0.0041, lr=1.13e-05, step=5928] Training: 59%|█████▉ | 5929/10000 [1:17:18<43:28, 1.56it/s, loss=0.0041, lr=1.13e-05, step=5928] Training: 59%|█████▉ | 5929/10000 [1:17:18<43:28, 1.56it/s, loss=0.0084, lr=1.12e-05, step=5929]20:01:50.779 [I] step=5930 loss=0.0199 smoothed_loss=0.0138 lr=1.13e-05 grad_norm=0.3877 step_time=0.5413s data_time=0.0836s it/s=1.600 eta_to_10000=2543.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0095 grad_action_out_proj_arms=0.0965 grad_arm_token_fuse=0.0475 grad_shared_expert=0.2654 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5930/10000 [1:17:18<41:11, 1.65it/s, loss=0.0084, lr=1.12e-05, step=5929] Training: 59%|█████▉ | 5930/10000 [1:17:18<41:11, 1.65it/s, loss=0.0199, lr=1.12e-05, step=5930] Training: 59%|█████▉ | 5931/10000 [1:17:19<39:22, 1.72it/s, loss=0.0199, lr=1.12e-05, step=5930] Training: 59%|█████▉ | 5931/10000 [1:17:19<39:22, 1.72it/s, loss=0.0098, lr=1.12e-05, step=5931] Training: 59%|█████▉ | 5932/10000 [1:17:20<41:54, 1.62it/s, loss=0.0098, lr=1.12e-05, step=5931] Training: 59%|█████▉ | 5932/10000 [1:17:20<41:54, 1.62it/s, loss=0.0057, lr=1.12e-05, step=5932] Training: 59%|█████▉ | 5933/10000 [1:17:20<42:42, 1.59it/s, loss=0.0057, lr=1.12e-05, step=5932] Training: 59%|█████▉ | 5933/10000 [1:17:20<42:42, 1.59it/s, loss=0.0036, lr=1.12e-05, step=5933] Training: 59%|█████▉ | 5934/10000 [1:17:21<39:34, 1.71it/s, loss=0.0036, lr=1.12e-05, step=5933] Training: 59%|█████▉ | 5934/10000 [1:17:21<39:34, 1.71it/s, loss=0.0031, lr=1.12e-05, step=5934] Training: 59%|█████▉ | 5935/10000 [1:17:21<37:42, 1.80it/s, loss=0.0031, lr=1.12e-05, step=5934] Training: 59%|█████▉ | 5935/10000 [1:17:21<37:42, 1.80it/s, loss=0.0092, lr=1.12e-05, step=5935] Training: 59%|█████▉ | 5936/10000 [1:17:22<44:49, 1.51it/s, loss=0.0092, lr=1.12e-05, step=5935] Training: 59%|█████▉ | 5936/10000 [1:17:22<44:49, 1.51it/s, loss=0.0040, lr=1.12e-05, step=5936] Training: 59%|█████▉ | 5937/10000 [1:17:23<41:28, 1.63it/s, loss=0.0040, lr=1.12e-05, step=5936] Training: 59%|█████▉ | 5937/10000 [1:17:23<41:28, 1.63it/s, loss=0.0102, lr=1.12e-05, step=5937] Training: 59%|█████▉ | 5938/10000 [1:17:23<38:54, 1.74it/s, loss=0.0102, lr=1.12e-05, step=5937] Training: 59%|█████▉ | 5938/10000 [1:17:23<38:54, 1.74it/s, loss=0.0027, lr=1.12e-05, step=5938] Training: 59%|█████▉ | 5939/10000 [1:17:24<37:01, 1.83it/s, loss=0.0027, lr=1.12e-05, step=5938] Training: 59%|█████▉ | 5939/10000 [1:17:24<37:01, 1.83it/s, loss=0.0047, lr=1.12e-05, step=5939]20:01:56.531 [I] step=5940 loss=0.0100 smoothed_loss=0.0090 lr=1.12e-05 grad_norm=0.4160 step_time=0.4961s data_time=0.0790s it/s=1.739 eta_to_10000=2334.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.0835 grad_arm_token_fuse=0.0394 grad_shared_expert=0.3308 (18633:train_pytorch.py:850) + Training: 59%|█████▉ | 5940/10000 [1:17:24<36:35, 1.85it/s, loss=0.0047, lr=1.12e-05, step=5939] Training: 59%|█████▉ | 5940/10000 [1:17:24<36:35, 1.85it/s, loss=0.0100, lr=1.12e-05, step=5940] Training: 59%|█████▉ | 5941/10000 [1:17:25<39:01, 1.73it/s, loss=0.0100, lr=1.12e-05, step=5940] Training: 59%|█████▉ | 5941/10000 [1:17:25<39:01, 1.73it/s, loss=0.0077, lr=1.12e-05, step=5941] Training: 59%|█████▉ | 5942/10000 [1:17:25<37:12, 1.82it/s, loss=0.0077, lr=1.12e-05, step=5941] Training: 59%|█████▉ | 5942/10000 [1:17:25<37:12, 1.82it/s, loss=0.0119, lr=1.12e-05, step=5942] Training: 59%|█████▉ | 5943/10000 [1:17:26<47:35, 1.42it/s, loss=0.0119, lr=1.12e-05, step=5942] Training: 59%|█████▉ | 5943/10000 [1:17:26<47:35, 1.42it/s, loss=0.0070, lr=1.12e-05, step=5943] Training: 59%|█████▉ | 5944/10000 [1:17:27<43:54, 1.54it/s, loss=0.0070, lr=1.12e-05, step=5943] Training: 59%|█████▉ | 5944/10000 [1:17:27<43:54, 1.54it/s, loss=0.0054, lr=1.12e-05, step=5944] Training: 59%|█████▉ | 5945/10000 [1:17:27<40:20, 1.68it/s, loss=0.0054, lr=1.12e-05, step=5944] Training: 59%|█████▉ | 5945/10000 [1:17:27<40:20, 1.68it/s, loss=0.0057, lr=1.12e-05, step=5945] Training: 59%|█████▉ | 5946/10000 [1:17:28<38:05, 1.77it/s, loss=0.0057, lr=1.12e-05, step=5945] Training: 59%|█████▉ | 5946/10000 [1:17:28<38:05, 1.77it/s, loss=0.0178, lr=1.12e-05, step=5946] Training: 59%|█████▉ | 5947/10000 [1:17:29<41:09, 1.64it/s, loss=0.0178, lr=1.12e-05, step=5946] Training: 59%|█████▉ | 5947/10000 [1:17:29<41:09, 1.64it/s, loss=0.0024, lr=1.12e-05, step=5947] Training: 59%|█████▉ | 5948/10000 [1:17:29<42:13, 1.60it/s, loss=0.0024, lr=1.12e-05, step=5947] Training: 59%|█████▉ | 5948/10000 [1:17:29<42:13, 1.60it/s, loss=0.0360, lr=1.12e-05, step=5948] Training: 59%|█████▉ | 5949/10000 [1:17:30<40:11, 1.68it/s, loss=0.0360, lr=1.12e-05, step=5948] Training: 59%|█████▉ | 5949/10000 [1:17:30<40:11, 1.68it/s, loss=0.0089, lr=1.12e-05, step=5949]20:02:02.878 [I] step=5950 loss=0.0024 smoothed_loss=0.0102 lr=1.12e-05 grad_norm=0.4281 step_time=0.5411s data_time=0.0936s it/s=1.576 eta_to_10000=2570.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0057 grad_action_out_proj_arms=0.0651 grad_arm_token_fuse=0.0283 grad_shared_expert=0.1940 (18633:train_pytorch.py:850) + Training: 60%|█████▉ | 5950/10000 [1:17:31<43:17, 1.56it/s, loss=0.0089, lr=1.12e-05, step=5949] Training: 60%|█████▉ | 5950/10000 [1:17:31<43:17, 1.56it/s, loss=0.0024, lr=1.12e-05, step=5950] Training: 60%|█████▉ | 5951/10000 [1:17:31<42:32, 1.59it/s, loss=0.0024, lr=1.12e-05, step=5950] Training: 60%|█████▉ | 5951/10000 [1:17:31<42:32, 1.59it/s, loss=0.0126, lr=1.12e-05, step=5951] Training: 60%|█████▉ | 5952/10000 [1:17:32<39:30, 1.71it/s, loss=0.0126, lr=1.12e-05, step=5951] Training: 60%|█████▉ | 5952/10000 [1:17:32<39:30, 1.71it/s, loss=0.0105, lr=1.12e-05, step=5952] Training: 60%|█████▉ | 5953/10000 [1:17:32<44:36, 1.51it/s, loss=0.0105, lr=1.12e-05, step=5952] Training: 60%|█████▉ | 5953/10000 [1:17:32<44:36, 1.51it/s, loss=0.0164, lr=1.12e-05, step=5953] Training: 60%|█████▉ | 5954/10000 [1:17:33<40:57, 1.65it/s, loss=0.0164, lr=1.12e-05, step=5953] Training: 60%|█████▉ | 5954/10000 [1:17:33<40:57, 1.65it/s, loss=0.0105, lr=1.12e-05, step=5954] Training: 60%|█████▉ | 5955/10000 [1:17:34<48:05, 1.40it/s, loss=0.0105, lr=1.12e-05, step=5954] Training: 60%|█████▉ | 5955/10000 [1:17:34<48:05, 1.40it/s, loss=0.0162, lr=1.12e-05, step=5955] Training: 60%|█████▉ | 5956/10000 [1:17:35<52:49, 1.28it/s, loss=0.0162, lr=1.12e-05, step=5955] Training: 60%|█████▉ | 5956/10000 [1:17:35<52:49, 1.28it/s, loss=0.0112, lr=1.12e-05, step=5956] Training: 60%|█████▉ | 5957/10000 [1:17:36<52:33, 1.28it/s, loss=0.0112, lr=1.12e-05, step=5956] Training: 60%|█████▉ | 5957/10000 [1:17:36<52:33, 1.28it/s, loss=0.0214, lr=1.11e-05, step=5957] Training: 60%|█████▉ | 5958/10000 [1:17:36<46:42, 1.44it/s, loss=0.0214, lr=1.11e-05, step=5957] Training: 60%|█████▉ | 5958/10000 [1:17:36<46:42, 1.44it/s, loss=0.0075, lr=1.11e-05, step=5958] Training: 60%|█████▉ | 5959/10000 [1:17:37<44:25, 1.52it/s, loss=0.0075, lr=1.11e-05, step=5958] Training: 60%|█████▉ | 5959/10000 [1:17:37<44:25, 1.52it/s, loss=0.0091, lr=1.11e-05, step=5959]20:02:09.842 [I] step=5960 loss=0.0186 smoothed_loss=0.0124 lr=1.12e-05 grad_norm=0.4240 step_time=0.5644s data_time=0.1320s it/s=1.436 eta_to_10000=2813.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0082 grad_action_out_proj_arms=0.0978 grad_arm_token_fuse=0.0412 grad_shared_expert=0.3067 (18633:train_pytorch.py:850) + Training: 60%|█████▉ | 5960/10000 [1:17:38<47:22, 1.42it/s, loss=0.0091, lr=1.11e-05, step=5959] Training: 60%|█████▉ | 5960/10000 [1:17:38<47:22, 1.42it/s, loss=0.0186, lr=1.11e-05, step=5960] Training: 60%|█████▉ | 5961/10000 [1:17:38<43:59, 1.53it/s, loss=0.0186, lr=1.11e-05, step=5960] Training: 60%|█████▉ | 5961/10000 [1:17:38<43:59, 1.53it/s, loss=0.0168, lr=1.11e-05, step=5961] Training: 60%|█████▉ | 5962/10000 [1:17:39<49:30, 1.36it/s, loss=0.0168, lr=1.11e-05, step=5961] Training: 60%|█████▉ | 5962/10000 [1:17:39<49:30, 1.36it/s, loss=0.0101, lr=1.11e-05, step=5962] Training: 60%|█████▉ | 5963/10000 [1:17:40<50:09, 1.34it/s, loss=0.0101, lr=1.11e-05, step=5962] Training: 60%|█████▉ | 5963/10000 [1:17:40<50:09, 1.34it/s, loss=0.0018, lr=1.11e-05, step=5963] Training: 60%|█████▉ | 5964/10000 [1:17:41<54:35, 1.23it/s, loss=0.0018, lr=1.11e-05, step=5963] Training: 60%|█████▉ | 5964/10000 [1:17:41<54:35, 1.23it/s, loss=0.0202, lr=1.11e-05, step=5964] Training: 60%|█████▉ | 5965/10000 [1:17:42<56:19, 1.19it/s, loss=0.0202, lr=1.11e-05, step=5964] Training: 60%|█████▉ | 5965/10000 [1:17:42<56:19, 1.19it/s, loss=0.0370, lr=1.11e-05, step=5965] Training: 60%|█████▉ | 5966/10000 [1:17:42<54:28, 1.23it/s, loss=0.0370, lr=1.11e-05, step=5965] Training: 60%|█████▉ | 5966/10000 [1:17:42<54:28, 1.23it/s, loss=0.0067, lr=1.11e-05, step=5966] Training: 60%|█████▉ | 5967/10000 [1:17:43<50:52, 1.32it/s, loss=0.0067, lr=1.11e-05, step=5966] Training: 60%|█████▉ | 5967/10000 [1:17:43<50:52, 1.32it/s, loss=0.0020, lr=1.11e-05, step=5967] Training: 60%|█████▉ | 5968/10000 [1:17:44<52:04, 1.29it/s, loss=0.0020, lr=1.11e-05, step=5967] Training: 60%|█████▉ | 5968/10000 [1:17:44<52:04, 1.29it/s, loss=0.0127, lr=1.11e-05, step=5968] Training: 60%|█████▉ | 5969/10000 [1:17:45<53:34, 1.25it/s, loss=0.0127, lr=1.11e-05, step=5968] Training: 60%|█████▉ | 5969/10000 [1:17:45<53:34, 1.25it/s, loss=0.0077, lr=1.11e-05, step=5969]20:02:17.498 [I] step=5970 loss=0.0148 smoothed_loss=0.0125 lr=1.11e-05 grad_norm=0.4010 step_time=0.6142s data_time=0.1514s it/s=1.306 eta_to_10000=3085.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0232 grad_action_out_proj_arms=0.1366 grad_arm_token_fuse=0.1227 grad_shared_expert=0.3203 (18633:train_pytorch.py:850) + Training: 60%|█████▉ | 5970/10000 [1:17:45<47:52, 1.40it/s, loss=0.0077, lr=1.11e-05, step=5969] Training: 60%|█████▉ | 5970/10000 [1:17:45<47:52, 1.40it/s, loss=0.0148, lr=1.11e-05, step=5970] Training: 60%|█████▉ | 5971/10000 [1:17:46<46:29, 1.44it/s, loss=0.0148, lr=1.11e-05, step=5970] Training: 60%|█████▉ | 5971/10000 [1:17:46<46:29, 1.44it/s, loss=0.0047, lr=1.11e-05, step=5971] Training: 60%|█████▉ | 5972/10000 [1:17:47<52:54, 1.27it/s, loss=0.0047, lr=1.11e-05, step=5971] Training: 60%|█████▉ | 5972/10000 [1:17:47<52:54, 1.27it/s, loss=0.0127, lr=1.11e-05, step=5972] Training: 60%|█████▉ | 5973/10000 [1:17:47<46:43, 1.44it/s, loss=0.0127, lr=1.11e-05, step=5972] Training: 60%|█████▉ | 5973/10000 [1:17:47<46:43, 1.44it/s, loss=0.0082, lr=1.11e-05, step=5973] Training: 60%|█████▉ | 5974/10000 [1:17:48<43:29, 1.54it/s, loss=0.0082, lr=1.11e-05, step=5973] Training: 60%|█████▉ | 5974/10000 [1:17:48<43:29, 1.54it/s, loss=0.0048, lr=1.11e-05, step=5974] Training: 60%|█████▉ | 5975/10000 [1:17:49<44:26, 1.51it/s, loss=0.0048, lr=1.11e-05, step=5974] Training: 60%|█████▉ | 5975/10000 [1:17:49<44:26, 1.51it/s, loss=0.0739, lr=1.11e-05, step=5975] Training: 60%|█████▉ | 5976/10000 [1:17:49<46:49, 1.43it/s, loss=0.0739, lr=1.11e-05, step=5975] Training: 60%|█████▉ | 5976/10000 [1:17:49<46:49, 1.43it/s, loss=0.0031, lr=1.11e-05, step=5976] Training: 60%|█████▉ | 5977/10000 [1:17:50<43:53, 1.53it/s, loss=0.0031, lr=1.11e-05, step=5976] Training: 60%|█████▉ | 5977/10000 [1:17:50<43:53, 1.53it/s, loss=0.0097, lr=1.11e-05, step=5977] Training: 60%|█████▉ | 5978/10000 [1:17:51<45:23, 1.48it/s, loss=0.0097, lr=1.11e-05, step=5977] Training: 60%|█████▉ | 5978/10000 [1:17:51<45:23, 1.48it/s, loss=0.0147, lr=1.11e-05, step=5978] Training: 60%|█████▉ | 5979/10000 [1:17:51<45:40, 1.47it/s, loss=0.0147, lr=1.11e-05, step=5978] Training: 60%|█████▉ | 5979/10000 [1:17:51<45:40, 1.47it/s, loss=0.0062, lr=1.11e-05, step=5979]20:02:24.376 [I] step=5980 loss=0.0061 smoothed_loss=0.0134 lr=1.11e-05 grad_norm=0.4154 step_time=0.5680s data_time=0.1198s it/s=1.454 eta_to_10000=2764.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0046 grad_action_out_proj_arms=0.0505 grad_arm_token_fuse=0.0240 grad_shared_expert=0.2336 (18633:train_pytorch.py:850) + Training: 60%|█████▉ | 5980/10000 [1:17:52<47:05, 1.42it/s, loss=0.0062, lr=1.11e-05, step=5979] Training: 60%|█████▉ | 5980/10000 [1:17:52<47:05, 1.42it/s, loss=0.0061, lr=1.11e-05, step=5980] Training: 60%|█████▉ | 5981/10000 [1:17:53<42:41, 1.57it/s, loss=0.0061, lr=1.11e-05, step=5980] Training: 60%|█████▉ | 5981/10000 [1:17:53<42:41, 1.57it/s, loss=0.0086, lr=1.11e-05, step=5981] Training: 60%|█████▉ | 5982/10000 [1:17:53<44:14, 1.51it/s, loss=0.0086, lr=1.11e-05, step=5981] Training: 60%|█████▉ | 5982/10000 [1:17:53<44:14, 1.51it/s, loss=0.0072, lr=1.11e-05, step=5982] Training: 60%|█████▉ | 5983/10000 [1:17:54<48:20, 1.38it/s, loss=0.0072, lr=1.11e-05, step=5982] Training: 60%|█████▉ | 5983/10000 [1:17:54<48:20, 1.38it/s, loss=0.0182, lr=1.11e-05, step=5983] Training: 60%|█████▉ | 5984/10000 [1:17:55<45:26, 1.47it/s, loss=0.0182, lr=1.11e-05, step=5983] Training: 60%|█████▉ | 5984/10000 [1:17:55<45:26, 1.47it/s, loss=0.0090, lr=1.10e-05, step=5984] Training: 60%|█████▉ | 5985/10000 [1:17:55<41:20, 1.62it/s, loss=0.0090, lr=1.10e-05, step=5984] Training: 60%|█████▉ | 5985/10000 [1:17:55<41:20, 1.62it/s, loss=0.0264, lr=1.10e-05, step=5985] Training: 60%|█████▉ | 5986/10000 [1:17:56<42:50, 1.56it/s, loss=0.0264, lr=1.10e-05, step=5985] Training: 60%|█████▉ | 5986/10000 [1:17:56<42:50, 1.56it/s, loss=0.0071, lr=1.10e-05, step=5986] Training: 60%|█████▉ | 5987/10000 [1:17:56<39:52, 1.68it/s, loss=0.0071, lr=1.10e-05, step=5986] Training: 60%|█████▉ | 5987/10000 [1:17:56<39:52, 1.68it/s, loss=0.0179, lr=1.10e-05, step=5987] Training: 60%|█████▉ | 5988/10000 [1:17:57<37:53, 1.77it/s, loss=0.0179, lr=1.10e-05, step=5987] Training: 60%|█████▉ | 5988/10000 [1:17:57<37:53, 1.77it/s, loss=0.0128, lr=1.10e-05, step=5988] Training: 60%|█████▉ | 5989/10000 [1:17:57<36:26, 1.83it/s, loss=0.0128, lr=1.10e-05, step=5988] Training: 60%|█████▉ | 5989/10000 [1:17:57<36:26, 1.83it/s, loss=0.0177, lr=1.10e-05, step=5989]20:02:30.201 [I] step=5990 loss=0.0933 smoothed_loss=0.0220 lr=1.10e-05 grad_norm=0.4634 step_time=0.4884s data_time=0.0941s it/s=1.717 eta_to_10000=2335.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0333 grad_action_out_proj_arms=0.2104 grad_arm_token_fuse=0.1785 grad_shared_expert=0.6199 (18633:train_pytorch.py:850) + Training: 60%|█████▉ | 5990/10000 [1:17:58<36:04, 1.85it/s, loss=0.0177, lr=1.10e-05, step=5989] Training: 60%|█████▉ | 5990/10000 [1:17:58<36:04, 1.85it/s, loss=0.0933, lr=1.10e-05, step=5990] Training: 60%|█████▉ | 5991/10000 [1:17:59<38:19, 1.74it/s, loss=0.0933, lr=1.10e-05, step=5990] Training: 60%|█████▉ | 5991/10000 [1:17:59<38:19, 1.74it/s, loss=0.0095, lr=1.10e-05, step=5991] Training: 60%|█████▉ | 5992/10000 [1:17:59<36:52, 1.81it/s, loss=0.0095, lr=1.10e-05, step=5991] Training: 60%|█████▉ | 5992/10000 [1:17:59<36:52, 1.81it/s, loss=0.0067, lr=1.10e-05, step=5992] Training: 60%|█████▉ | 5993/10000 [1:18:00<46:15, 1.44it/s, loss=0.0067, lr=1.10e-05, step=5992] Training: 60%|█████▉ | 5993/10000 [1:18:00<46:15, 1.44it/s, loss=0.0171, lr=1.10e-05, step=5993] Training: 60%|█████▉ | 5994/10000 [1:18:01<42:13, 1.58it/s, loss=0.0171, lr=1.10e-05, step=5993] Training: 60%|█████▉ | 5994/10000 [1:18:01<42:13, 1.58it/s, loss=0.0270, lr=1.10e-05, step=5994] Training: 60%|█████▉ | 5995/10000 [1:18:01<39:07, 1.71it/s, loss=0.0270, lr=1.10e-05, step=5994] Training: 60%|█████▉ | 5995/10000 [1:18:01<39:07, 1.71it/s, loss=0.0165, lr=1.10e-05, step=5995] Training: 60%|█████▉ | 5996/10000 [1:18:02<43:08, 1.55it/s, loss=0.0165, lr=1.10e-05, step=5995] Training: 60%|█████▉ | 5996/10000 [1:18:02<43:08, 1.55it/s, loss=0.0094, lr=1.10e-05, step=5996] Training: 60%|█████▉ | 5997/10000 [1:18:02<44:03, 1.51it/s, loss=0.0094, lr=1.10e-05, step=5996] Training: 60%|█████▉ | 5997/10000 [1:18:02<44:03, 1.51it/s, loss=0.0297, lr=1.10e-05, step=5997] Training: 60%|█████▉ | 5998/10000 [1:18:03<49:10, 1.36it/s, loss=0.0297, lr=1.10e-05, step=5997] Training: 60%|█████▉ | 5998/10000 [1:18:03<49:10, 1.36it/s, loss=0.0092, lr=1.10e-05, step=5998] Training: 60%|█████▉ | 5999/10000 [1:18:04<49:51, 1.34it/s, loss=0.0092, lr=1.10e-05, step=5998] Training: 60%|█████▉ | 5999/10000 [1:18:04<49:51, 1.34it/s, loss=0.0049, lr=1.10e-05, step=5999]20:02:37.515 [I] step=6000 loss=0.0115 smoothed_loss=0.0167 lr=1.10e-05 grad_norm=0.4519 step_time=0.5962s data_time=0.1353s it/s=1.367 eta_to_10000=2925.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0126 grad_action_out_proj_arms=0.1073 grad_arm_token_fuse=0.0742 grad_shared_expert=0.5833 (18633:train_pytorch.py:850) +20:04:56.835 [I] Saved checkpoint at step 6000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/6000 (18633:train_pytorch.py:350) + Training: 60%|██████ | 6000/10000 [1:20:25<47:21:22, 42.62s/it, loss=0.0049, lr=1.10e-05, step=5999] Training: 60%|██████ | 6000/10000 [1:20:25<47:21:22, 42.62s/it, loss=0.0115, lr=1.10e-05, step=6000] Training: 60%|██████ | 6001/10000 [1:20:25<33:26:27, 30.10s/it, loss=0.0115, lr=1.10e-05, step=6000] Training: 60%|██████ | 6001/10000 [1:20:25<33:26:27, 30.10s/it, loss=0.0108, lr=1.10e-05, step=6001] Training: 60%|██████ | 6002/10000 [1:20:26<23:36:12, 21.25s/it, loss=0.0108, lr=1.10e-05, step=6001] Training: 60%|██████ | 6002/10000 [1:20:26<23:36:12, 21.25s/it, loss=0.0017, lr=1.10e-05, step=6002] Training: 60%|██████ | 6003/10000 [1:20:27<16:41:07, 15.03s/it, loss=0.0017, lr=1.10e-05, step=6002] Training: 60%|██████ | 6003/10000 [1:20:27<16:41:07, 15.03s/it, loss=0.0056, lr=1.10e-05, step=6003] Training: 60%|██████ | 6004/10000 [1:20:27<11:52:08, 10.69s/it, loss=0.0056, lr=1.10e-05, step=6003] Training: 60%|██████ | 6004/10000 [1:20:27<11:52:08, 10.69s/it, loss=0.0016, lr=1.10e-05, step=6004] Training: 60%|██████ | 6005/10000 [1:20:28<8:40:02, 7.81s/it, loss=0.0016, lr=1.10e-05, step=6004] Training: 60%|██████ | 6005/10000 [1:20:28<8:40:02, 7.81s/it, loss=0.0051, lr=1.10e-05, step=6005] Training: 60%|██████ | 6006/10000 [1:20:29<6:22:21, 5.74s/it, loss=0.0051, lr=1.10e-05, step=6005] Training: 60%|██████ | 6006/10000 [1:20:29<6:22:21, 5.74s/it, loss=0.0048, lr=1.10e-05, step=6006] Training: 60%|██████ | 6007/10000 [1:20:30<4:44:50, 4.28s/it, loss=0.0048, lr=1.10e-05, step=6006] Training: 60%|██████ | 6007/10000 [1:20:30<4:44:50, 4.28s/it, loss=0.0372, lr=1.10e-05, step=6007] Training: 60%|██████ | 6008/10000 [1:20:31<3:32:38, 3.20s/it, loss=0.0372, lr=1.10e-05, step=6007] Training: 60%|██████ | 6008/10000 [1:20:31<3:32:38, 3.20s/it, loss=0.0374, lr=1.10e-05, step=6008] Training: 60%|██████ | 6009/10000 [1:20:31<2:44:28, 2.47s/it, loss=0.0374, lr=1.10e-05, step=6008] Training: 60%|██████ | 6009/10000 [1:20:31<2:44:28, 2.47s/it, loss=0.0042, lr=1.10e-05, step=6009]20:05:04.812 [I] step=6010 loss=0.0034 smoothed_loss=0.0137 lr=1.10e-05 grad_norm=0.4221 step_time=0.6030s data_time=14.1266s it/s=0.068 eta_to_10000=58770.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.0746 grad_arm_token_fuse=0.0424 grad_shared_expert=0.2604 (18633:train_pytorch.py:850) + Training: 60%|██████ | 6010/10000 [1:20:32<2:16:26, 2.05s/it, loss=0.0042, lr=1.10e-05, step=6009] Training: 60%|██████ | 6010/10000 [1:20:32<2:16:26, 2.05s/it, loss=0.0034, lr=1.10e-05, step=6010] Training: 60%|██████ | 6011/10000 [1:20:33<1:45:03, 1.58s/it, loss=0.0034, lr=1.10e-05, step=6010] Training: 60%|██████ | 6011/10000 [1:20:33<1:45:03, 1.58s/it, loss=0.0030, lr=1.10e-05, step=6011] Training: 60%|██████ | 6012/10000 [1:20:34<1:30:16, 1.36s/it, loss=0.0030, lr=1.10e-05, step=6011] Training: 60%|██████ | 6012/10000 [1:20:34<1:30:16, 1.36s/it, loss=0.0039, lr=1.09e-05, step=6012] Training: 60%|██████ | 6013/10000 [1:20:34<1:15:58, 1.14s/it, loss=0.0039, lr=1.09e-05, step=6012] Training: 60%|██████ | 6013/10000 [1:20:34<1:15:58, 1.14s/it, loss=0.0078, lr=1.09e-05, step=6013] Training: 60%|██████ | 6014/10000 [1:20:35<1:09:11, 1.04s/it, loss=0.0078, lr=1.09e-05, step=6013] Training: 60%|██████ | 6014/10000 [1:20:35<1:09:11, 1.04s/it, loss=0.0059, lr=1.09e-05, step=6014] Training: 60%|██████ | 6015/10000 [1:20:36<1:10:22, 1.06s/it, loss=0.0059, lr=1.09e-05, step=6014] Training: 60%|██████ | 6015/10000 [1:20:36<1:10:22, 1.06s/it, loss=0.0150, lr=1.09e-05, step=6015] Training: 60%|██████ | 6016/10000 [1:20:37<1:09:39, 1.05s/it, loss=0.0150, lr=1.09e-05, step=6015] Training: 60%|██████ | 6016/10000 [1:20:37<1:09:39, 1.05s/it, loss=0.0021, lr=1.09e-05, step=6016] Training: 60%|██████ | 6017/10000 [1:20:38<1:01:13, 1.08it/s, loss=0.0021, lr=1.09e-05, step=6016] Training: 60%|██████ | 6017/10000 [1:20:38<1:01:13, 1.08it/s, loss=0.0033, lr=1.09e-05, step=6017] Training: 60%|██████ | 6018/10000 [1:20:39<57:40, 1.15it/s, loss=0.0033, lr=1.09e-05, step=6017] Training: 60%|██████ | 6018/10000 [1:20:39<57:40, 1.15it/s, loss=0.0087, lr=1.09e-05, step=6018] Training: 60%|██████ | 6019/10000 [1:20:40<57:45, 1.15it/s, loss=0.0087, lr=1.09e-05, step=6018] Training: 60%|██████ | 6019/10000 [1:20:40<57:45, 1.15it/s, loss=0.0137, lr=1.09e-05, step=6019]20:05:12.720 [I] step=6020 loss=0.0076 smoothed_loss=0.0097 lr=1.09e-05 grad_norm=0.4440 step_time=0.6410s data_time=0.1499s it/s=1.265 eta_to_10000=3147.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0187 grad_action_out_proj_arms=0.1258 grad_arm_token_fuse=0.0933 grad_shared_expert=0.5497 (18633:train_pytorch.py:850) + Training: 60%|██████ | 6020/10000 [1:20:40<55:45, 1.19it/s, loss=0.0137, lr=1.09e-05, step=6019] Training: 60%|██████ | 6020/10000 [1:20:40<55:45, 1.19it/s, loss=0.0076, lr=1.09e-05, step=6020] Training: 60%|██████ | 6021/10000 [1:20:41<53:06, 1.25it/s, loss=0.0076, lr=1.09e-05, step=6020] Training: 60%|██████ | 6021/10000 [1:20:41<53:06, 1.25it/s, loss=0.0039, lr=1.09e-05, step=6021] Training: 60%|██████ | 6022/10000 [1:20:42<53:42, 1.23it/s, loss=0.0039, lr=1.09e-05, step=6021] Training: 60%|██████ | 6022/10000 [1:20:42<53:42, 1.23it/s, loss=0.0606, lr=1.09e-05, step=6022] Training: 60%|██████ | 6023/10000 [1:20:43<52:12, 1.27it/s, loss=0.0606, lr=1.09e-05, step=6022] Training: 60%|██████ | 6023/10000 [1:20:43<52:12, 1.27it/s, loss=0.0110, lr=1.09e-05, step=6023] Training: 60%|██████ | 6024/10000 [1:20:44<53:56, 1.23it/s, loss=0.0110, lr=1.09e-05, step=6023] Training: 60%|██████ | 6024/10000 [1:20:44<53:56, 1.23it/s, loss=0.0038, lr=1.09e-05, step=6024] Training: 60%|██████ | 6025/10000 [1:20:44<50:17, 1.32it/s, loss=0.0038, lr=1.09e-05, step=6024] Training: 60%|██████ | 6025/10000 [1:20:44<50:17, 1.32it/s, loss=0.0238, lr=1.09e-05, step=6025] Training: 60%|██████ | 6026/10000 [1:20:45<55:43, 1.19it/s, loss=0.0238, lr=1.09e-05, step=6025] Training: 60%|██████ | 6026/10000 [1:20:45<55:43, 1.19it/s, loss=0.0059, lr=1.09e-05, step=6026] Training: 60%|██████ | 6027/10000 [1:20:46<50:15, 1.32it/s, loss=0.0059, lr=1.09e-05, step=6026] Training: 60%|██████ | 6027/10000 [1:20:46<50:15, 1.32it/s, loss=0.0216, lr=1.09e-05, step=6027] Training: 60%|██████ | 6028/10000 [1:20:46<49:07, 1.35it/s, loss=0.0216, lr=1.09e-05, step=6027] Training: 60%|██████ | 6028/10000 [1:20:46<49:07, 1.35it/s, loss=0.0063, lr=1.09e-05, step=6028] Training: 60%|██████ | 6029/10000 [1:20:48<54:53, 1.21it/s, loss=0.0063, lr=1.09e-05, step=6028] Training: 60%|██████ | 6029/10000 [1:20:48<54:53, 1.21it/s, loss=0.0045, lr=1.09e-05, step=6029]20:05:20.834 [I] step=6030 loss=0.0083 smoothed_loss=0.0120 lr=1.09e-05 grad_norm=0.4946 step_time=0.6617s data_time=0.1496s it/s=1.233 eta_to_10000=3220.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.0851 grad_arm_token_fuse=0.0423 grad_shared_expert=0.3548 (18633:train_pytorch.py:850) + Training: 60%|██████ | 6030/10000 [1:20:49<58:11, 1.14it/s, loss=0.0045, lr=1.09e-05, step=6029] Training: 60%|██████ | 6030/10000 [1:20:49<58:11, 1.14it/s, loss=0.0083, lr=1.09e-05, step=6030] Training: 60%|██████ | 6031/10000 [1:20:51<1:27:48, 1.33s/it, loss=0.0083, lr=1.09e-05, step=6030] Training: 60%|██████ | 6031/10000 [1:20:51<1:27:48, 1.33s/it, loss=0.0278, lr=1.09e-05, step=6031] Training: 60%|██████ | 6032/10000 [1:20:51<1:13:45, 1.12s/it, loss=0.0278, lr=1.09e-05, step=6031] Training: 60%|██████ | 6032/10000 [1:20:51<1:13:45, 1.12s/it, loss=0.0043, lr=1.09e-05, step=6032] Training: 60%|██████ | 6033/10000 [1:20:52<1:09:21, 1.05s/it, loss=0.0043, lr=1.09e-05, step=6032] Training: 60%|██████ | 6033/10000 [1:20:52<1:09:21, 1.05s/it, loss=0.0029, lr=1.09e-05, step=6033] Training: 60%|██████ | 6034/10000 [1:20:53<1:03:57, 1.03it/s, loss=0.0029, lr=1.09e-05, step=6033] Training: 60%|██████ | 6034/10000 [1:20:53<1:03:57, 1.03it/s, loss=0.0080, lr=1.09e-05, step=6034] Training: 60%|██████ | 6035/10000 [1:20:54<59:59, 1.10it/s, loss=0.0080, lr=1.09e-05, step=6034] Training: 60%|██████ | 6035/10000 [1:20:54<59:59, 1.10it/s, loss=0.0220, lr=1.09e-05, step=6035] Training: 60%|██████ | 6036/10000 [1:20:55<56:30, 1.17it/s, loss=0.0220, lr=1.09e-05, step=6035] Training: 60%|██████ | 6036/10000 [1:20:55<56:30, 1.17it/s, loss=0.0205, lr=1.09e-05, step=6036] Training: 60%|██████ | 6037/10000 [1:20:55<51:24, 1.29it/s, loss=0.0205, lr=1.09e-05, step=6036] Training: 60%|██████ | 6037/10000 [1:20:55<51:24, 1.29it/s, loss=0.0015, lr=1.09e-05, step=6037] Training: 60%|██████ | 6038/10000 [1:20:56<48:34, 1.36it/s, loss=0.0015, lr=1.09e-05, step=6037] Training: 60%|██████ | 6038/10000 [1:20:56<48:34, 1.36it/s, loss=0.0037, lr=1.09e-05, step=6038] Training: 60%|██████ | 6039/10000 [1:20:57<51:43, 1.28it/s, loss=0.0037, lr=1.09e-05, step=6038] Training: 60%|██████ | 6039/10000 [1:20:57<51:43, 1.28it/s, loss=0.0226, lr=1.09e-05, step=6039]20:05:29.766 [I] step=6040 loss=0.0083 smoothed_loss=0.0119 lr=1.09e-05 grad_norm=0.4886 step_time=0.6053s data_time=0.2879s it/s=1.120 eta_to_10000=3536.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0123 grad_action_out_proj_arms=0.1147 grad_arm_token_fuse=0.0647 grad_shared_expert=0.3145 (18633:train_pytorch.py:850) + Training: 60%|██████ | 6040/10000 [1:20:57<48:47, 1.35it/s, loss=0.0226, lr=1.09e-05, step=6039] Training: 60%|██████ | 6040/10000 [1:20:57<48:47, 1.35it/s, loss=0.0083, lr=1.08e-05, step=6040] Training: 60%|██████ | 6041/10000 [1:20:58<49:43, 1.33it/s, loss=0.0083, lr=1.08e-05, step=6040] Training: 60%|██████ | 6041/10000 [1:20:58<49:43, 1.33it/s, loss=0.0058, lr=1.08e-05, step=6041] Training: 60%|██████ | 6042/10000 [1:20:59<50:20, 1.31it/s, loss=0.0058, lr=1.08e-05, step=6041] Training: 60%|██████ | 6042/10000 [1:20:59<50:20, 1.31it/s, loss=0.0047, lr=1.08e-05, step=6042] Training: 60%|██████ | 6043/10000 [1:21:00<53:45, 1.23it/s, loss=0.0047, lr=1.08e-05, step=6042] Training: 60%|██████ | 6043/10000 [1:21:00<53:45, 1.23it/s, loss=0.0051, lr=1.08e-05, step=6043] Training: 60%|██████ | 6044/10000 [1:21:01<48:39, 1.36it/s, loss=0.0051, lr=1.08e-05, step=6043] Training: 60%|██████ | 6044/10000 [1:21:01<48:39, 1.36it/s, loss=0.0026, lr=1.08e-05, step=6044] Training: 60%|██████ | 6045/10000 [1:21:01<47:17, 1.39it/s, loss=0.0026, lr=1.08e-05, step=6044] Training: 60%|██████ | 6045/10000 [1:21:01<47:17, 1.39it/s, loss=0.0083, lr=1.08e-05, step=6045] Training: 60%|██████ | 6046/10000 [1:21:02<45:10, 1.46it/s, loss=0.0083, lr=1.08e-05, step=6045] Training: 60%|██████ | 6046/10000 [1:21:02<45:10, 1.46it/s, loss=0.0071, lr=1.08e-05, step=6046] Training: 60%|██████ | 6047/10000 [1:21:03<46:33, 1.42it/s, loss=0.0071, lr=1.08e-05, step=6046] Training: 60%|██████ | 6047/10000 [1:21:03<46:33, 1.42it/s, loss=0.0043, lr=1.08e-05, step=6047] Training: 60%|██████ | 6048/10000 [1:21:03<49:38, 1.33it/s, loss=0.0043, lr=1.08e-05, step=6047] Training: 60%|██████ | 6048/10000 [1:21:03<49:38, 1.33it/s, loss=0.0085, lr=1.08e-05, step=6048] Training: 60%|██████ | 6049/10000 [1:21:04<52:42, 1.25it/s, loss=0.0085, lr=1.08e-05, step=6048] Training: 60%|██████ | 6049/10000 [1:21:04<52:42, 1.25it/s, loss=0.0046, lr=1.08e-05, step=6049]20:05:37.739 [I] step=6050 loss=0.0069 smoothed_loss=0.0080 lr=1.08e-05 grad_norm=0.3597 step_time=0.6473s data_time=0.1501s it/s=1.254 eta_to_10000=3149.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.0835 grad_arm_token_fuse=0.0360 grad_shared_expert=0.2448 (18633:train_pytorch.py:850) + Training: 60%|██████ | 6050/10000 [1:21:05<58:32, 1.12it/s, loss=0.0046, lr=1.08e-05, step=6049] Training: 60%|██████ | 6050/10000 [1:21:05<58:32, 1.12it/s, loss=0.0069, lr=1.08e-05, step=6050] Training: 61%|██████ | 6051/10000 [1:21:06<50:57, 1.29it/s, loss=0.0069, lr=1.08e-05, step=6050] Training: 61%|██████ | 6051/10000 [1:21:06<50:57, 1.29it/s, loss=0.0046, lr=1.08e-05, step=6051] Training: 61%|██████ | 6052/10000 [1:21:07<48:47, 1.35it/s, loss=0.0046, lr=1.08e-05, step=6051] Training: 61%|██████ | 6052/10000 [1:21:07<48:47, 1.35it/s, loss=0.0046, lr=1.08e-05, step=6052] Training: 61%|██████ | 6053/10000 [1:21:07<43:55, 1.50it/s, loss=0.0046, lr=1.08e-05, step=6052] Training: 61%|██████ | 6053/10000 [1:21:07<43:55, 1.50it/s, loss=0.0118, lr=1.08e-05, step=6053] Training: 61%|██████ | 6054/10000 [1:21:08<47:41, 1.38it/s, loss=0.0118, lr=1.08e-05, step=6053] Training: 61%|██████ | 6054/10000 [1:21:08<47:41, 1.38it/s, loss=0.0022, lr=1.08e-05, step=6054] Training: 61%|██████ | 6055/10000 [1:21:09<49:52, 1.32it/s, loss=0.0022, lr=1.08e-05, step=6054] Training: 61%|██████ | 6055/10000 [1:21:09<49:52, 1.32it/s, loss=0.0036, lr=1.08e-05, step=6055] Training: 61%|██████ | 6056/10000 [1:21:10<54:16, 1.21it/s, loss=0.0036, lr=1.08e-05, step=6055] Training: 61%|██████ | 6056/10000 [1:21:10<54:16, 1.21it/s, loss=0.0136, lr=1.08e-05, step=6056] Training: 61%|██████ | 6057/10000 [1:21:11<55:08, 1.19it/s, loss=0.0136, lr=1.08e-05, step=6056] Training: 61%|██████ | 6057/10000 [1:21:11<55:08, 1.19it/s, loss=0.0148, lr=1.08e-05, step=6057] Training: 61%|██████ | 6058/10000 [1:21:12<57:46, 1.14it/s, loss=0.0148, lr=1.08e-05, step=6057] Training: 61%|██████ | 6058/10000 [1:21:12<57:46, 1.14it/s, loss=0.0223, lr=1.08e-05, step=6058] Training: 61%|██████ | 6059/10000 [1:21:12<54:24, 1.21it/s, loss=0.0223, lr=1.08e-05, step=6058] Training: 61%|██████ | 6059/10000 [1:21:12<54:24, 1.21it/s, loss=0.0029, lr=1.08e-05, step=6059]20:05:45.500 [I] step=6060 loss=0.0101 smoothed_loss=0.0091 lr=1.08e-05 grad_norm=0.3581 step_time=0.5958s data_time=0.1803s it/s=1.289 eta_to_10000=3057.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0113 grad_action_out_proj_arms=0.0942 grad_arm_token_fuse=0.0577 grad_shared_expert=0.2608 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6060/10000 [1:21:13<55:05, 1.19it/s, loss=0.0029, lr=1.08e-05, step=6059] Training: 61%|██████ | 6060/10000 [1:21:13<55:05, 1.19it/s, loss=0.0101, lr=1.08e-05, step=6060] Training: 61%|██████ | 6061/10000 [1:21:14<51:22, 1.28it/s, loss=0.0101, lr=1.08e-05, step=6060] Training: 61%|██████ | 6061/10000 [1:21:14<51:22, 1.28it/s, loss=0.0267, lr=1.08e-05, step=6061] Training: 61%|██████ | 6062/10000 [1:21:15<59:25, 1.10it/s, loss=0.0267, lr=1.08e-05, step=6061] Training: 61%|██████ | 6062/10000 [1:21:15<59:25, 1.10it/s, loss=0.0200, lr=1.08e-05, step=6062] Training: 61%|██████ | 6063/10000 [1:21:16<1:01:51, 1.06it/s, loss=0.0200, lr=1.08e-05, step=6062] Training: 61%|██████ | 6063/10000 [1:21:16<1:01:51, 1.06it/s, loss=0.0061, lr=1.08e-05, step=6063] Training: 61%|██████ | 6064/10000 [1:21:17<1:02:06, 1.06it/s, loss=0.0061, lr=1.08e-05, step=6063] Training: 61%|██████ | 6064/10000 [1:21:17<1:02:06, 1.06it/s, loss=0.0041, lr=1.08e-05, step=6064] Training: 61%|██████ | 6065/10000 [1:21:18<1:04:25, 1.02it/s, loss=0.0041, lr=1.08e-05, step=6064] Training: 61%|██████ | 6065/10000 [1:21:18<1:04:25, 1.02it/s, loss=0.0050, lr=1.08e-05, step=6065] Training: 61%|██████ | 6066/10000 [1:21:19<1:02:57, 1.04it/s, loss=0.0050, lr=1.08e-05, step=6065] Training: 61%|██████ | 6066/10000 [1:21:19<1:02:57, 1.04it/s, loss=0.0099, lr=1.08e-05, step=6066] Training: 61%|██████ | 6067/10000 [1:21:20<55:38, 1.18it/s, loss=0.0099, lr=1.08e-05, step=6066] Training: 61%|██████ | 6067/10000 [1:21:20<55:38, 1.18it/s, loss=0.0061, lr=1.08e-05, step=6067] Training: 61%|██████ | 6068/10000 [1:21:20<55:53, 1.17it/s, loss=0.0061, lr=1.08e-05, step=6067] Training: 61%|██████ | 6068/10000 [1:21:20<55:53, 1.17it/s, loss=0.0043, lr=1.07e-05, step=6068] Training: 61%|██████ | 6069/10000 [1:21:21<59:15, 1.11it/s, loss=0.0043, lr=1.07e-05, step=6068] Training: 61%|██████ | 6069/10000 [1:21:21<59:15, 1.11it/s, loss=0.0069, lr=1.07e-05, step=6069]20:05:54.383 [I] step=6070 loss=0.0069 smoothed_loss=0.0086 lr=1.08e-05 grad_norm=0.4202 step_time=0.6454s data_time=0.2429s it/s=1.126 eta_to_10000=3490.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0071 grad_action_out_proj_arms=0.0877 grad_arm_token_fuse=0.0350 grad_shared_expert=0.2397 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6070/10000 [1:21:22<53:19, 1.23it/s, loss=0.0069, lr=1.07e-05, step=6069] Training: 61%|██████ | 6070/10000 [1:21:22<53:19, 1.23it/s, loss=0.0069, lr=1.07e-05, step=6070] Training: 61%|██████ | 6071/10000 [1:21:23<1:01:39, 1.06it/s, loss=0.0069, lr=1.07e-05, step=6070] Training: 61%|██████ | 6071/10000 [1:21:23<1:01:39, 1.06it/s, loss=0.0247, lr=1.07e-05, step=6071] Training: 61%|██████ | 6072/10000 [1:21:24<58:25, 1.12it/s, loss=0.0247, lr=1.07e-05, step=6071] Training: 61%|██████ | 6072/10000 [1:21:24<58:25, 1.12it/s, loss=0.0135, lr=1.07e-05, step=6072] Training: 61%|██████ | 6073/10000 [1:21:25<52:27, 1.25it/s, loss=0.0135, lr=1.07e-05, step=6072] Training: 61%|██████ | 6073/10000 [1:21:25<52:27, 1.25it/s, loss=0.0105, lr=1.07e-05, step=6073] Training: 61%|██████ | 6074/10000 [1:21:25<50:50, 1.29it/s, loss=0.0105, lr=1.07e-05, step=6073] Training: 61%|██████ | 6074/10000 [1:21:25<50:50, 1.29it/s, loss=0.0389, lr=1.07e-05, step=6074] Training: 61%|██████ | 6075/10000 [1:21:26<47:20, 1.38it/s, loss=0.0389, lr=1.07e-05, step=6074] Training: 61%|██████ | 6075/10000 [1:21:26<47:20, 1.38it/s, loss=0.0097, lr=1.07e-05, step=6075] Training: 61%|██████ | 6076/10000 [1:21:27<50:44, 1.29it/s, loss=0.0097, lr=1.07e-05, step=6075] Training: 61%|██████ | 6076/10000 [1:21:27<50:44, 1.29it/s, loss=0.0651, lr=1.07e-05, step=6076] Training: 61%|██████ | 6077/10000 [1:21:28<48:00, 1.36it/s, loss=0.0651, lr=1.07e-05, step=6076] Training: 61%|██████ | 6077/10000 [1:21:28<48:00, 1.36it/s, loss=0.0023, lr=1.07e-05, step=6077] Training: 61%|██████ | 6078/10000 [1:21:28<51:16, 1.27it/s, loss=0.0023, lr=1.07e-05, step=6077] Training: 61%|██████ | 6078/10000 [1:21:28<51:16, 1.27it/s, loss=0.0129, lr=1.07e-05, step=6078] Training: 61%|██████ | 6079/10000 [1:21:29<56:10, 1.16it/s, loss=0.0129, lr=1.07e-05, step=6078] Training: 61%|██████ | 6079/10000 [1:21:29<56:10, 1.16it/s, loss=0.0078, lr=1.07e-05, step=6079]20:06:02.689 [I] step=6080 loss=0.0028 smoothed_loss=0.0141 lr=1.07e-05 grad_norm=0.4315 step_time=0.6760s data_time=0.1547s it/s=1.204 eta_to_10000=3255.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0059 grad_action_out_proj_arms=0.0689 grad_arm_token_fuse=0.0282 grad_shared_expert=0.3349 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6080/10000 [1:21:30<57:08, 1.14it/s, loss=0.0078, lr=1.07e-05, step=6079] Training: 61%|██████ | 6080/10000 [1:21:30<57:08, 1.14it/s, loss=0.0028, lr=1.07e-05, step=6080] Training: 61%|██████ | 6081/10000 [1:21:31<56:00, 1.17it/s, loss=0.0028, lr=1.07e-05, step=6080] Training: 61%|██████ | 6081/10000 [1:21:31<56:00, 1.17it/s, loss=0.0085, lr=1.07e-05, step=6081] Training: 61%|██████ | 6082/10000 [1:21:32<57:58, 1.13it/s, loss=0.0085, lr=1.07e-05, step=6081] Training: 61%|██████ | 6082/10000 [1:21:32<57:58, 1.13it/s, loss=0.0080, lr=1.07e-05, step=6082] Training: 61%|██████ | 6083/10000 [1:21:33<54:28, 1.20it/s, loss=0.0080, lr=1.07e-05, step=6082] Training: 61%|██████ | 6083/10000 [1:21:33<54:28, 1.20it/s, loss=0.0114, lr=1.07e-05, step=6083] Training: 61%|██████ | 6084/10000 [1:21:34<58:48, 1.11it/s, loss=0.0114, lr=1.07e-05, step=6083] Training: 61%|██████ | 6084/10000 [1:21:34<58:48, 1.11it/s, loss=0.0148, lr=1.07e-05, step=6084] Training: 61%|██████ | 6085/10000 [1:21:35<59:17, 1.10it/s, loss=0.0148, lr=1.07e-05, step=6084] Training: 61%|██████ | 6085/10000 [1:21:35<59:17, 1.10it/s, loss=0.0074, lr=1.07e-05, step=6085] Training: 61%|██████ | 6086/10000 [1:21:36<57:23, 1.14it/s, loss=0.0074, lr=1.07e-05, step=6085] Training: 61%|██████ | 6086/10000 [1:21:36<57:23, 1.14it/s, loss=0.0099, lr=1.07e-05, step=6086] Training: 61%|██████ | 6087/10000 [1:21:37<57:12, 1.14it/s, loss=0.0099, lr=1.07e-05, step=6086] Training: 61%|██████ | 6087/10000 [1:21:37<57:12, 1.14it/s, loss=0.0140, lr=1.07e-05, step=6087] Training: 61%|██████ | 6088/10000 [1:21:37<50:48, 1.28it/s, loss=0.0140, lr=1.07e-05, step=6087] Training: 61%|██████ | 6088/10000 [1:21:37<50:48, 1.28it/s, loss=0.0093, lr=1.07e-05, step=6088] Training: 61%|██████ | 6089/10000 [1:21:38<49:15, 1.32it/s, loss=0.0093, lr=1.07e-05, step=6088] Training: 61%|██████ | 6089/10000 [1:21:38<49:15, 1.32it/s, loss=0.0016, lr=1.07e-05, step=6089]20:06:11.117 [I] step=6090 loss=0.0026 smoothed_loss=0.0102 lr=1.07e-05 grad_norm=0.4171 step_time=0.6782s data_time=0.1646s it/s=1.187 eta_to_10000=3295.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0198 grad_action_out_proj_arms=0.1935 grad_arm_token_fuse=0.1065 grad_shared_expert=0.3992 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6090/10000 [1:21:39<54:29, 1.20it/s, loss=0.0016, lr=1.07e-05, step=6089] Training: 61%|██████ | 6090/10000 [1:21:39<54:29, 1.20it/s, loss=0.0026, lr=1.07e-05, step=6090] Training: 61%|██████ | 6091/10000 [1:21:39<51:21, 1.27it/s, loss=0.0026, lr=1.07e-05, step=6090] Training: 61%|██████ | 6091/10000 [1:21:39<51:21, 1.27it/s, loss=0.0580, lr=1.07e-05, step=6091] Training: 61%|██████ | 6092/10000 [1:21:40<48:35, 1.34it/s, loss=0.0580, lr=1.07e-05, step=6091] Training: 61%|██████ | 6092/10000 [1:21:40<48:35, 1.34it/s, loss=0.0046, lr=1.07e-05, step=6092] Training: 61%|██████ | 6093/10000 [1:21:41<55:43, 1.17it/s, loss=0.0046, lr=1.07e-05, step=6092] Training: 61%|██████ | 6093/10000 [1:21:41<55:43, 1.17it/s, loss=0.0203, lr=1.07e-05, step=6093] Training: 61%|██████ | 6094/10000 [1:21:42<55:37, 1.17it/s, loss=0.0203, lr=1.07e-05, step=6093] Training: 61%|██████ | 6094/10000 [1:21:42<55:37, 1.17it/s, loss=0.0077, lr=1.07e-05, step=6094] Training: 61%|██████ | 6095/10000 [1:21:43<55:48, 1.17it/s, loss=0.0077, lr=1.07e-05, step=6094] Training: 61%|██████ | 6095/10000 [1:21:43<55:48, 1.17it/s, loss=0.0501, lr=1.07e-05, step=6095] Training: 61%|██████ | 6096/10000 [1:21:44<55:10, 1.18it/s, loss=0.0501, lr=1.07e-05, step=6095] Training: 61%|██████ | 6096/10000 [1:21:44<55:10, 1.18it/s, loss=0.0056, lr=1.06e-05, step=6096] Training: 61%|██████ | 6097/10000 [1:21:45<58:41, 1.11it/s, loss=0.0056, lr=1.06e-05, step=6096] Training: 61%|██████ | 6097/10000 [1:21:45<58:41, 1.11it/s, loss=0.0237, lr=1.06e-05, step=6097] Training: 61%|██████ | 6098/10000 [1:21:45<53:23, 1.22it/s, loss=0.0237, lr=1.06e-05, step=6097] Training: 61%|██████ | 6098/10000 [1:21:45<53:23, 1.22it/s, loss=0.0074, lr=1.06e-05, step=6098] Training: 61%|██████ | 6099/10000 [1:21:46<47:17, 1.37it/s, loss=0.0074, lr=1.06e-05, step=6098] Training: 61%|██████ | 6099/10000 [1:21:46<47:17, 1.37it/s, loss=0.0083, lr=1.06e-05, step=6099]20:06:19.507 [I] step=6100 loss=0.0123 smoothed_loss=0.0150 lr=1.06e-05 grad_norm=0.5128 step_time=0.6603s data_time=0.1787s it/s=1.192 eta_to_10000=3271.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.0953 grad_arm_token_fuse=0.0509 grad_shared_expert=0.5014 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6100/10000 [1:21:47<57:20, 1.13it/s, loss=0.0083, lr=1.06e-05, step=6099] Training: 61%|██████ | 6100/10000 [1:21:47<57:20, 1.13it/s, loss=0.0123, lr=1.06e-05, step=6100] Training: 61%|██████ | 6101/10000 [1:21:48<1:00:24, 1.08it/s, loss=0.0123, lr=1.06e-05, step=6100] Training: 61%|██████ | 6101/10000 [1:21:48<1:00:24, 1.08it/s, loss=0.0045, lr=1.06e-05, step=6101] Training: 61%|██████ | 6102/10000 [1:21:49<58:40, 1.11it/s, loss=0.0045, lr=1.06e-05, step=6101] Training: 61%|██████ | 6102/10000 [1:21:49<58:40, 1.11it/s, loss=0.0032, lr=1.06e-05, step=6102] Training: 61%|██████ | 6103/10000 [1:21:50<56:28, 1.15it/s, loss=0.0032, lr=1.06e-05, step=6102] Training: 61%|██████ | 6103/10000 [1:21:50<56:28, 1.15it/s, loss=0.0169, lr=1.06e-05, step=6103] Training: 61%|██████ | 6104/10000 [1:21:51<59:37, 1.09it/s, loss=0.0169, lr=1.06e-05, step=6103] Training: 61%|██████ | 6104/10000 [1:21:51<59:37, 1.09it/s, loss=0.0195, lr=1.06e-05, step=6104] Training: 61%|██████ | 6105/10000 [1:21:52<55:15, 1.17it/s, loss=0.0195, lr=1.06e-05, step=6104] Training: 61%|██████ | 6105/10000 [1:21:52<55:15, 1.17it/s, loss=0.0156, lr=1.06e-05, step=6105] Training: 61%|██████ | 6106/10000 [1:21:52<55:17, 1.17it/s, loss=0.0156, lr=1.06e-05, step=6105] Training: 61%|██████ | 6106/10000 [1:21:52<55:17, 1.17it/s, loss=0.0088, lr=1.06e-05, step=6106] Training: 61%|██████ | 6107/10000 [1:21:53<53:07, 1.22it/s, loss=0.0088, lr=1.06e-05, step=6106] Training: 61%|██████ | 6107/10000 [1:21:53<53:07, 1.22it/s, loss=0.0125, lr=1.06e-05, step=6107] Training: 61%|██████ | 6108/10000 [1:21:54<53:33, 1.21it/s, loss=0.0125, lr=1.06e-05, step=6107] Training: 61%|██████ | 6108/10000 [1:21:54<53:33, 1.21it/s, loss=0.0059, lr=1.06e-05, step=6108] Training: 61%|██████ | 6109/10000 [1:21:55<53:38, 1.21it/s, loss=0.0059, lr=1.06e-05, step=6108] Training: 61%|██████ | 6109/10000 [1:21:55<53:38, 1.21it/s, loss=0.0320, lr=1.06e-05, step=6109]20:06:28.112 [I] step=6110 loss=0.0078 smoothed_loss=0.0139 lr=1.06e-05 grad_norm=0.4594 step_time=0.6473s data_time=0.2132s it/s=1.162 eta_to_10000=3347.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1862 grad_arm_token_fuse=0.0773 grad_shared_expert=0.4667 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6110/10000 [1:21:56<55:50, 1.16it/s, loss=0.0320, lr=1.06e-05, step=6109] Training: 61%|██████ | 6110/10000 [1:21:56<55:50, 1.16it/s, loss=0.0078, lr=1.06e-05, step=6110] Training: 61%|██████ | 6111/10000 [1:21:57<55:42, 1.16it/s, loss=0.0078, lr=1.06e-05, step=6110] Training: 61%|██████ | 6111/10000 [1:21:57<55:42, 1.16it/s, loss=0.0056, lr=1.06e-05, step=6111] Training: 61%|██████ | 6112/10000 [1:21:57<54:46, 1.18it/s, loss=0.0056, lr=1.06e-05, step=6111] Training: 61%|██████ | 6112/10000 [1:21:57<54:46, 1.18it/s, loss=0.0253, lr=1.06e-05, step=6112] Training: 61%|██████ | 6113/10000 [1:21:58<50:57, 1.27it/s, loss=0.0253, lr=1.06e-05, step=6112] Training: 61%|██████ | 6113/10000 [1:21:58<50:57, 1.27it/s, loss=0.0060, lr=1.06e-05, step=6113] Training: 61%|██████ | 6114/10000 [1:21:59<49:09, 1.32it/s, loss=0.0060, lr=1.06e-05, step=6113] Training: 61%|██████ | 6114/10000 [1:21:59<49:09, 1.32it/s, loss=0.0080, lr=1.06e-05, step=6114] Training: 61%|██████ | 6115/10000 [1:22:00<55:54, 1.16it/s, loss=0.0080, lr=1.06e-05, step=6114] Training: 61%|██████ | 6115/10000 [1:22:00<55:54, 1.16it/s, loss=0.0042, lr=1.06e-05, step=6115] Training: 61%|██████ | 6116/10000 [1:22:01<54:24, 1.19it/s, loss=0.0042, lr=1.06e-05, step=6115] Training: 61%|██████ | 6116/10000 [1:22:01<54:24, 1.19it/s, loss=0.0372, lr=1.06e-05, step=6116] Training: 61%|██████ | 6117/10000 [1:22:01<53:25, 1.21it/s, loss=0.0372, lr=1.06e-05, step=6116] Training: 61%|██████ | 6117/10000 [1:22:01<53:25, 1.21it/s, loss=0.0148, lr=1.06e-05, step=6117] Training: 61%|██████ | 6118/10000 [1:22:02<50:20, 1.29it/s, loss=0.0148, lr=1.06e-05, step=6117] Training: 61%|██████ | 6118/10000 [1:22:02<50:20, 1.29it/s, loss=0.0094, lr=1.06e-05, step=6118] Training: 61%|██████ | 6119/10000 [1:22:03<49:11, 1.31it/s, loss=0.0094, lr=1.06e-05, step=6118] Training: 61%|██████ | 6119/10000 [1:22:03<49:11, 1.31it/s, loss=0.0057, lr=1.06e-05, step=6119]20:06:36.310 [I] step=6120 loss=0.0162 smoothed_loss=0.0135 lr=1.06e-05 grad_norm=0.4634 step_time=0.6322s data_time=0.1876s it/s=1.221 eta_to_10000=3178.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0116 grad_action_out_proj_arms=0.1383 grad_arm_token_fuse=0.0542 grad_shared_expert=0.4328 (18633:train_pytorch.py:850) + Training: 61%|██████ | 6120/10000 [1:22:04<55:58, 1.16it/s, loss=0.0057, lr=1.06e-05, step=6119] Training: 61%|██████ | 6120/10000 [1:22:04<55:58, 1.16it/s, loss=0.0162, lr=1.06e-05, step=6120] Training: 61%|██████ | 6121/10000 [1:22:05<53:16, 1.21it/s, loss=0.0162, lr=1.06e-05, step=6120] Training: 61%|██████ | 6121/10000 [1:22:05<53:16, 1.21it/s, loss=0.0189, lr=1.06e-05, step=6121] Training: 61%|██████ | 6122/10000 [1:22:06<55:54, 1.16it/s, loss=0.0189, lr=1.06e-05, step=6121] Training: 61%|██████ | 6122/10000 [1:22:06<55:54, 1.16it/s, loss=0.0074, lr=1.06e-05, step=6122] Training: 61%|██████ | 6123/10000 [1:22:06<52:59, 1.22it/s, loss=0.0074, lr=1.06e-05, step=6122] Training: 61%|██████ | 6123/10000 [1:22:06<52:59, 1.22it/s, loss=0.0030, lr=1.06e-05, step=6123] Training: 61%|██████ | 6124/10000 [1:22:07<51:38, 1.25it/s, loss=0.0030, lr=1.06e-05, step=6123] Training: 61%|██████ | 6124/10000 [1:22:07<51:38, 1.25it/s, loss=0.0237, lr=1.05e-05, step=6124] Training: 61%|██████▏ | 6125/10000 [1:22:08<54:58, 1.17it/s, loss=0.0237, lr=1.05e-05, step=6124] Training: 61%|██████▏ | 6125/10000 [1:22:08<54:58, 1.17it/s, loss=0.0158, lr=1.05e-05, step=6125] Training: 61%|██████▏ | 6126/10000 [1:22:09<54:38, 1.18it/s, loss=0.0158, lr=1.05e-05, step=6125] Training: 61%|██████▏ | 6126/10000 [1:22:09<54:38, 1.18it/s, loss=0.0085, lr=1.05e-05, step=6126] Training: 61%|██████▏ | 6127/10000 [1:22:10<58:16, 1.11it/s, loss=0.0085, lr=1.05e-05, step=6126] Training: 61%|██████▏ | 6127/10000 [1:22:10<58:16, 1.11it/s, loss=0.0085, lr=1.05e-05, step=6127] Training: 61%|██████▏ | 6128/10000 [1:22:11<58:09, 1.11it/s, loss=0.0085, lr=1.05e-05, step=6127] Training: 61%|██████▏ | 6128/10000 [1:22:11<58:09, 1.11it/s, loss=0.0183, lr=1.05e-05, step=6128] Training: 61%|██████▏ | 6129/10000 [1:22:12<59:05, 1.09it/s, loss=0.0183, lr=1.05e-05, step=6128] Training: 61%|██████▏ | 6129/10000 [1:22:12<59:05, 1.09it/s, loss=0.0052, lr=1.05e-05, step=6129]20:06:45.004 [I] step=6130 loss=0.0063 smoothed_loss=0.0119 lr=1.05e-05 grad_norm=0.4572 step_time=0.6861s data_time=0.1833s it/s=1.150 eta_to_10000=3364.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.1308 grad_arm_token_fuse=0.0411 grad_shared_expert=0.4792 (18633:train_pytorch.py:850) + Training: 61%|██████▏ | 6130/10000 [1:22:13<57:48, 1.12it/s, loss=0.0052, lr=1.05e-05, step=6129] Training: 61%|██████▏ | 6130/10000 [1:22:13<57:48, 1.12it/s, loss=0.0063, lr=1.05e-05, step=6130] Training: 61%|██████▏ | 6131/10000 [1:22:13<53:10, 1.21it/s, loss=0.0063, lr=1.05e-05, step=6130] Training: 61%|██████▏ | 6131/10000 [1:22:13<53:10, 1.21it/s, loss=0.0059, lr=1.05e-05, step=6131] Training: 61%|██████▏ | 6132/10000 [1:22:14<50:27, 1.28it/s, loss=0.0059, lr=1.05e-05, step=6131] Training: 61%|██████▏ | 6132/10000 [1:22:14<50:27, 1.28it/s, loss=0.0572, lr=1.05e-05, step=6132] Training: 61%|██████▏ | 6133/10000 [1:22:15<50:28, 1.28it/s, loss=0.0572, lr=1.05e-05, step=6132] Training: 61%|██████▏ | 6133/10000 [1:22:15<50:28, 1.28it/s, loss=0.0237, lr=1.05e-05, step=6133] Training: 61%|██████▏ | 6134/10000 [1:22:15<48:02, 1.34it/s, loss=0.0237, lr=1.05e-05, step=6133] Training: 61%|██████▏ | 6134/10000 [1:22:15<48:02, 1.34it/s, loss=0.0094, lr=1.05e-05, step=6134] Training: 61%|██████▏ | 6135/10000 [1:22:16<45:34, 1.41it/s, loss=0.0094, lr=1.05e-05, step=6134] Training: 61%|██████▏ | 6135/10000 [1:22:16<45:34, 1.41it/s, loss=0.0195, lr=1.05e-05, step=6135] Training: 61%|██████▏ | 6136/10000 [1:22:17<47:30, 1.36it/s, loss=0.0195, lr=1.05e-05, step=6135] Training: 61%|██████▏ | 6136/10000 [1:22:17<47:30, 1.36it/s, loss=0.0557, lr=1.05e-05, step=6136] Training: 61%|██████▏ | 6137/10000 [1:22:18<47:27, 1.36it/s, loss=0.0557, lr=1.05e-05, step=6136] Training: 61%|██████▏ | 6137/10000 [1:22:18<47:27, 1.36it/s, loss=0.0216, lr=1.05e-05, step=6137] Training: 61%|██████▏ | 6138/10000 [1:22:18<48:08, 1.34it/s, loss=0.0216, lr=1.05e-05, step=6137] Training: 61%|██████▏ | 6138/10000 [1:22:18<48:08, 1.34it/s, loss=0.0031, lr=1.05e-05, step=6138] Training: 61%|██████▏ | 6139/10000 [1:22:19<54:17, 1.19it/s, loss=0.0031, lr=1.05e-05, step=6138] Training: 61%|██████▏ | 6139/10000 [1:22:19<54:17, 1.19it/s, loss=0.0096, lr=1.05e-05, step=6139]20:06:52.954 [I] step=6140 loss=0.0042 smoothed_loss=0.0164 lr=1.05e-05 grad_norm=0.4438 step_time=0.6080s data_time=0.1870s it/s=1.258 eta_to_10000=3068.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0043 grad_action_out_proj_arms=0.0620 grad_arm_token_fuse=0.0228 grad_shared_expert=0.1757 (18633:train_pytorch.py:850) + Training: 61%|██████▏ | 6140/10000 [1:22:21<1:00:26, 1.06it/s, loss=0.0096, lr=1.05e-05, step=6139] Training: 61%|██████▏ | 6140/10000 [1:22:21<1:00:26, 1.06it/s, loss=0.0042, lr=1.05e-05, step=6140] Training: 61%|██████▏ | 6141/10000 [1:22:21<51:37, 1.25it/s, loss=0.0042, lr=1.05e-05, step=6140] Training: 61%|██████▏ | 6141/10000 [1:22:21<51:37, 1.25it/s, loss=0.0055, lr=1.05e-05, step=6141] Training: 61%|██████▏ | 6142/10000 [1:22:22<48:47, 1.32it/s, loss=0.0055, lr=1.05e-05, step=6141] Training: 61%|██████▏ | 6142/10000 [1:22:22<48:47, 1.32it/s, loss=0.0089, lr=1.05e-05, step=6142] Training: 61%|██████▏ | 6143/10000 [1:22:23<58:14, 1.10it/s, loss=0.0089, lr=1.05e-05, step=6142] Training: 61%|██████▏ | 6143/10000 [1:22:23<58:14, 1.10it/s, loss=0.0099, lr=1.05e-05, step=6143] Training: 61%|██████▏ | 6144/10000 [1:22:24<58:21, 1.10it/s, loss=0.0099, lr=1.05e-05, step=6143] Training: 61%|██████▏ | 6144/10000 [1:22:24<58:21, 1.10it/s, loss=0.0047, lr=1.05e-05, step=6144] Training: 61%|██████▏ | 6145/10000 [1:22:25<58:01, 1.11it/s, loss=0.0047, lr=1.05e-05, step=6144] Training: 61%|██████▏ | 6145/10000 [1:22:25<58:01, 1.11it/s, loss=0.0153, lr=1.05e-05, step=6145] Training: 61%|██████▏ | 6146/10000 [1:22:26<54:45, 1.17it/s, loss=0.0153, lr=1.05e-05, step=6145] Training: 61%|██████▏ | 6146/10000 [1:22:26<54:45, 1.17it/s, loss=0.0086, lr=1.05e-05, step=6146] Training: 61%|██████▏ | 6147/10000 [1:22:26<55:19, 1.16it/s, loss=0.0086, lr=1.05e-05, step=6146] Training: 61%|██████▏ | 6147/10000 [1:22:26<55:19, 1.16it/s, loss=0.0064, lr=1.05e-05, step=6147] Training: 61%|██████▏ | 6148/10000 [1:22:27<51:29, 1.25it/s, loss=0.0064, lr=1.05e-05, step=6147] Training: 61%|██████▏ | 6148/10000 [1:22:27<51:29, 1.25it/s, loss=0.0058, lr=1.05e-05, step=6148] Training: 61%|██████▏ | 6149/10000 [1:22:28<47:21, 1.36it/s, loss=0.0058, lr=1.05e-05, step=6148] Training: 61%|██████▏ | 6149/10000 [1:22:28<47:21, 1.36it/s, loss=0.0083, lr=1.05e-05, step=6149]20:07:00.974 [I] step=6150 loss=0.0050 smoothed_loss=0.0107 lr=1.05e-05 grad_norm=0.4536 step_time=0.6234s data_time=0.1786s it/s=1.247 eta_to_10000=3087.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0658 grad_arm_token_fuse=0.0253 grad_shared_expert=0.2677 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6150/10000 [1:22:29<51:34, 1.24it/s, loss=0.0083, lr=1.05e-05, step=6149] Training: 62%|██████▏ | 6150/10000 [1:22:29<51:34, 1.24it/s, loss=0.0050, lr=1.05e-05, step=6150] Training: 62%|██████▏ | 6151/10000 [1:22:30<54:09, 1.18it/s, loss=0.0050, lr=1.05e-05, step=6150] Training: 62%|██████▏ | 6151/10000 [1:22:30<54:09, 1.18it/s, loss=0.0032, lr=1.05e-05, step=6151] Training: 62%|██████▏ | 6152/10000 [1:22:30<50:21, 1.27it/s, loss=0.0032, lr=1.05e-05, step=6151] Training: 62%|██████▏ | 6152/10000 [1:22:30<50:21, 1.27it/s, loss=0.0103, lr=1.04e-05, step=6152] Training: 62%|██████▏ | 6153/10000 [1:22:31<52:24, 1.22it/s, loss=0.0103, lr=1.04e-05, step=6152] Training: 62%|██████▏ | 6153/10000 [1:22:31<52:24, 1.22it/s, loss=0.0032, lr=1.04e-05, step=6153] Training: 62%|██████▏ | 6154/10000 [1:22:32<54:07, 1.18it/s, loss=0.0032, lr=1.04e-05, step=6153] Training: 62%|██████▏ | 6154/10000 [1:22:32<54:07, 1.18it/s, loss=0.0229, lr=1.04e-05, step=6154] Training: 62%|██████▏ | 6155/10000 [1:22:33<59:05, 1.08it/s, loss=0.0229, lr=1.04e-05, step=6154] Training: 62%|██████▏ | 6155/10000 [1:22:33<59:05, 1.08it/s, loss=0.0099, lr=1.04e-05, step=6155] Training: 62%|██████▏ | 6156/10000 [1:22:34<1:00:57, 1.05it/s, loss=0.0099, lr=1.04e-05, step=6155] Training: 62%|██████▏ | 6156/10000 [1:22:34<1:00:57, 1.05it/s, loss=0.0496, lr=1.04e-05, step=6156] Training: 62%|██████▏ | 6157/10000 [1:22:35<1:07:45, 1.06s/it, loss=0.0496, lr=1.04e-05, step=6156] Training: 62%|██████▏ | 6157/10000 [1:22:35<1:07:45, 1.06s/it, loss=0.0020, lr=1.04e-05, step=6157] Training: 62%|██████▏ | 6158/10000 [1:22:36<1:00:22, 1.06it/s, loss=0.0020, lr=1.04e-05, step=6157] Training: 62%|██████▏ | 6158/10000 [1:22:36<1:00:22, 1.06it/s, loss=0.0083, lr=1.04e-05, step=6158] Training: 62%|██████▏ | 6159/10000 [1:22:37<53:59, 1.19it/s, loss=0.0083, lr=1.04e-05, step=6158] Training: 62%|██████▏ | 6159/10000 [1:22:37<53:59, 1.19it/s, loss=0.0056, lr=1.04e-05, step=6159]20:07:09.894 [I] step=6160 loss=0.0149 smoothed_loss=0.0123 lr=1.04e-05 grad_norm=0.4325 step_time=0.6517s data_time=0.2405s it/s=1.121 eta_to_10000=3424.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0897 grad_arm_token_fuse=0.0373 grad_shared_expert=0.3066 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6160/10000 [1:22:38<53:30, 1.20it/s, loss=0.0056, lr=1.04e-05, step=6159] Training: 62%|██████▏ | 6160/10000 [1:22:38<53:30, 1.20it/s, loss=0.0149, lr=1.04e-05, step=6160] Training: 62%|██████▏ | 6161/10000 [1:22:38<51:23, 1.24it/s, loss=0.0149, lr=1.04e-05, step=6160] Training: 62%|██████▏ | 6161/10000 [1:22:38<51:23, 1.24it/s, loss=0.0258, lr=1.04e-05, step=6161] Training: 62%|██████▏ | 6162/10000 [1:22:39<52:46, 1.21it/s, loss=0.0258, lr=1.04e-05, step=6161] Training: 62%|██████▏ | 6162/10000 [1:22:39<52:46, 1.21it/s, loss=0.0736, lr=1.04e-05, step=6162] Training: 62%|██████▏ | 6163/10000 [1:22:40<48:39, 1.31it/s, loss=0.0736, lr=1.04e-05, step=6162] Training: 62%|██████▏ | 6163/10000 [1:22:40<48:39, 1.31it/s, loss=0.0053, lr=1.04e-05, step=6163] Training: 62%|██████▏ | 6164/10000 [1:22:41<53:00, 1.21it/s, loss=0.0053, lr=1.04e-05, step=6163] Training: 62%|██████▏ | 6164/10000 [1:22:41<53:00, 1.21it/s, loss=0.0054, lr=1.04e-05, step=6164] Training: 62%|██████▏ | 6165/10000 [1:22:42<54:02, 1.18it/s, loss=0.0054, lr=1.04e-05, step=6164] Training: 62%|██████▏ | 6165/10000 [1:22:42<54:02, 1.18it/s, loss=0.0057, lr=1.04e-05, step=6165] Training: 62%|██████▏ | 6166/10000 [1:22:42<50:50, 1.26it/s, loss=0.0057, lr=1.04e-05, step=6165] Training: 62%|██████▏ | 6166/10000 [1:22:42<50:50, 1.26it/s, loss=0.0024, lr=1.04e-05, step=6166] Training: 62%|██████▏ | 6167/10000 [1:22:43<50:25, 1.27it/s, loss=0.0024, lr=1.04e-05, step=6166] Training: 62%|██████▏ | 6167/10000 [1:22:43<50:25, 1.27it/s, loss=0.0040, lr=1.04e-05, step=6167] Training: 62%|██████▏ | 6168/10000 [1:22:44<48:05, 1.33it/s, loss=0.0040, lr=1.04e-05, step=6167] Training: 62%|██████▏ | 6168/10000 [1:22:44<48:05, 1.33it/s, loss=0.0187, lr=1.04e-05, step=6168] Training: 62%|██████▏ | 6169/10000 [1:22:44<43:47, 1.46it/s, loss=0.0187, lr=1.04e-05, step=6168] Training: 62%|██████▏ | 6169/10000 [1:22:44<43:47, 1.46it/s, loss=0.0158, lr=1.04e-05, step=6169]20:07:17.563 [I] step=6170 loss=0.0053 smoothed_loss=0.0132 lr=1.04e-05 grad_norm=0.4932 step_time=0.6245s data_time=0.1422s it/s=1.304 eta_to_10000=2936.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0414 grad_action_out_proj_arms=0.1419 grad_arm_token_fuse=0.2162 grad_shared_expert=0.6256 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6170/10000 [1:22:45<48:29, 1.32it/s, loss=0.0158, lr=1.04e-05, step=6169] Training: 62%|██████▏ | 6170/10000 [1:22:45<48:29, 1.32it/s, loss=0.0053, lr=1.04e-05, step=6170] Training: 62%|██████▏ | 6171/10000 [1:22:46<51:45, 1.23it/s, loss=0.0053, lr=1.04e-05, step=6170] Training: 62%|██████▏ | 6171/10000 [1:22:46<51:45, 1.23it/s, loss=0.0065, lr=1.04e-05, step=6171] Training: 62%|██████▏ | 6172/10000 [1:22:47<52:21, 1.22it/s, loss=0.0065, lr=1.04e-05, step=6171] Training: 62%|██████▏ | 6172/10000 [1:22:47<52:21, 1.22it/s, loss=0.0118, lr=1.04e-05, step=6172] Training: 62%|██████▏ | 6173/10000 [1:22:48<51:55, 1.23it/s, loss=0.0118, lr=1.04e-05, step=6172] Training: 62%|██████▏ | 6173/10000 [1:22:48<51:55, 1.23it/s, loss=0.0045, lr=1.04e-05, step=6173] Training: 62%|██████▏ | 6174/10000 [1:22:49<52:35, 1.21it/s, loss=0.0045, lr=1.04e-05, step=6173] Training: 62%|██████▏ | 6174/10000 [1:22:49<52:35, 1.21it/s, loss=0.0066, lr=1.04e-05, step=6174] Training: 62%|██████▏ | 6175/10000 [1:22:49<51:03, 1.25it/s, loss=0.0066, lr=1.04e-05, step=6174] Training: 62%|██████▏ | 6175/10000 [1:22:49<51:03, 1.25it/s, loss=0.0216, lr=1.04e-05, step=6175] Training: 62%|██████▏ | 6176/10000 [1:22:50<46:53, 1.36it/s, loss=0.0216, lr=1.04e-05, step=6175] Training: 62%|██████▏ | 6176/10000 [1:22:50<46:53, 1.36it/s, loss=0.0035, lr=1.04e-05, step=6176] Training: 62%|██████▏ | 6177/10000 [1:22:51<53:54, 1.18it/s, loss=0.0035, lr=1.04e-05, step=6176] Training: 62%|██████▏ | 6177/10000 [1:22:51<53:54, 1.18it/s, loss=0.0043, lr=1.04e-05, step=6177] Training: 62%|██████▏ | 6178/10000 [1:22:52<58:38, 1.09it/s, loss=0.0043, lr=1.04e-05, step=6177] Training: 62%|██████▏ | 6178/10000 [1:22:52<58:38, 1.09it/s, loss=0.0155, lr=1.04e-05, step=6178] Training: 62%|██████▏ | 6179/10000 [1:22:53<59:35, 1.07it/s, loss=0.0155, lr=1.04e-05, step=6178] Training: 62%|██████▏ | 6179/10000 [1:22:53<59:35, 1.07it/s, loss=0.0196, lr=1.04e-05, step=6179]20:07:26.338 [I] step=6180 loss=0.0088 smoothed_loss=0.0116 lr=1.04e-05 grad_norm=0.3828 step_time=0.7048s data_time=0.1727s it/s=1.140 eta_to_10000=3351.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0050 grad_action_out_proj_arms=0.0492 grad_arm_token_fuse=0.0231 grad_shared_expert=0.1350 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6180/10000 [1:22:54<58:01, 1.10it/s, loss=0.0196, lr=1.04e-05, step=6179] Training: 62%|██████▏ | 6180/10000 [1:22:54<58:01, 1.10it/s, loss=0.0088, lr=1.03e-05, step=6180] Training: 62%|██████▏ | 6181/10000 [1:22:55<50:12, 1.27it/s, loss=0.0088, lr=1.03e-05, step=6180] Training: 62%|██████▏ | 6181/10000 [1:22:55<50:12, 1.27it/s, loss=0.0017, lr=1.03e-05, step=6181] Training: 62%|██████▏ | 6182/10000 [1:22:55<47:55, 1.33it/s, loss=0.0017, lr=1.03e-05, step=6181] Training: 62%|██████▏ | 6182/10000 [1:22:55<47:55, 1.33it/s, loss=0.0098, lr=1.03e-05, step=6182] Training: 62%|██████▏ | 6183/10000 [1:22:56<42:55, 1.48it/s, loss=0.0098, lr=1.03e-05, step=6182] Training: 62%|██████▏ | 6183/10000 [1:22:56<42:55, 1.48it/s, loss=0.0029, lr=1.03e-05, step=6183] Training: 62%|██████▏ | 6184/10000 [1:22:56<44:41, 1.42it/s, loss=0.0029, lr=1.03e-05, step=6183] Training: 62%|██████▏ | 6184/10000 [1:22:56<44:41, 1.42it/s, loss=0.0039, lr=1.03e-05, step=6184] Training: 62%|██████▏ | 6185/10000 [1:22:57<41:05, 1.55it/s, loss=0.0039, lr=1.03e-05, step=6184] Training: 62%|██████▏ | 6185/10000 [1:22:57<41:05, 1.55it/s, loss=0.0040, lr=1.03e-05, step=6185] Training: 62%|██████▏ | 6186/10000 [1:22:58<43:44, 1.45it/s, loss=0.0040, lr=1.03e-05, step=6185] Training: 62%|██████▏ | 6186/10000 [1:22:58<43:44, 1.45it/s, loss=0.0123, lr=1.03e-05, step=6186] Training: 62%|██████▏ | 6187/10000 [1:22:59<45:28, 1.40it/s, loss=0.0123, lr=1.03e-05, step=6186] Training: 62%|██████▏ | 6187/10000 [1:22:59<45:28, 1.40it/s, loss=0.0038, lr=1.03e-05, step=6187] Training: 62%|██████▏ | 6188/10000 [1:22:59<47:35, 1.33it/s, loss=0.0038, lr=1.03e-05, step=6187] Training: 62%|██████▏ | 6188/10000 [1:22:59<47:35, 1.33it/s, loss=0.0058, lr=1.03e-05, step=6188] Training: 62%|██████▏ | 6189/10000 [1:23:00<47:38, 1.33it/s, loss=0.0058, lr=1.03e-05, step=6188] Training: 62%|██████▏ | 6189/10000 [1:23:00<47:38, 1.33it/s, loss=0.0017, lr=1.03e-05, step=6189]20:07:33.109 [I] step=6190 loss=0.0039 smoothed_loss=0.0072 lr=1.03e-05 grad_norm=0.4207 step_time=0.5696s data_time=0.1075s it/s=1.477 eta_to_10000=2579.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.1085 grad_arm_token_fuse=0.0442 grad_shared_expert=0.8573 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6190/10000 [1:23:01<46:17, 1.37it/s, loss=0.0017, lr=1.03e-05, step=6189] Training: 62%|██████▏ | 6190/10000 [1:23:01<46:17, 1.37it/s, loss=0.0039, lr=1.03e-05, step=6190] Training: 62%|██████▏ | 6191/10000 [1:23:02<46:55, 1.35it/s, loss=0.0039, lr=1.03e-05, step=6190] Training: 62%|██████▏ | 6191/10000 [1:23:02<46:55, 1.35it/s, loss=0.0239, lr=1.03e-05, step=6191] Training: 62%|██████▏ | 6192/10000 [1:23:03<55:07, 1.15it/s, loss=0.0239, lr=1.03e-05, step=6191] Training: 62%|██████▏ | 6192/10000 [1:23:03<55:07, 1.15it/s, loss=0.0175, lr=1.03e-05, step=6192] Training: 62%|██████▏ | 6193/10000 [1:23:04<57:37, 1.10it/s, loss=0.0175, lr=1.03e-05, step=6192] Training: 62%|██████▏ | 6193/10000 [1:23:04<57:37, 1.10it/s, loss=0.0175, lr=1.03e-05, step=6193] Training: 62%|██████▏ | 6194/10000 [1:23:04<51:45, 1.23it/s, loss=0.0175, lr=1.03e-05, step=6193] Training: 62%|██████▏ | 6194/10000 [1:23:04<51:45, 1.23it/s, loss=0.0096, lr=1.03e-05, step=6194] Training: 62%|██████▏ | 6195/10000 [1:23:05<50:50, 1.25it/s, loss=0.0096, lr=1.03e-05, step=6194] Training: 62%|██████▏ | 6195/10000 [1:23:05<50:50, 1.25it/s, loss=0.0101, lr=1.03e-05, step=6195] Training: 62%|██████▏ | 6196/10000 [1:23:06<48:03, 1.32it/s, loss=0.0101, lr=1.03e-05, step=6195] Training: 62%|██████▏ | 6196/10000 [1:23:06<48:03, 1.32it/s, loss=0.0151, lr=1.03e-05, step=6196] Training: 62%|██████▏ | 6197/10000 [1:23:06<42:46, 1.48it/s, loss=0.0151, lr=1.03e-05, step=6196] Training: 62%|██████▏ | 6197/10000 [1:23:06<42:46, 1.48it/s, loss=0.0054, lr=1.03e-05, step=6197] Training: 62%|██████▏ | 6198/10000 [1:23:07<45:50, 1.38it/s, loss=0.0054, lr=1.03e-05, step=6197] Training: 62%|██████▏ | 6198/10000 [1:23:07<45:50, 1.38it/s, loss=0.0219, lr=1.03e-05, step=6198] Training: 62%|██████▏ | 6199/10000 [1:23:08<47:02, 1.35it/s, loss=0.0219, lr=1.03e-05, step=6198] Training: 62%|██████▏ | 6199/10000 [1:23:08<47:02, 1.35it/s, loss=0.0025, lr=1.03e-05, step=6199]20:07:41.075 [I] step=6200 loss=0.0097 smoothed_loss=0.0105 lr=1.03e-05 grad_norm=0.5051 step_time=0.6386s data_time=0.1579s it/s=1.256 eta_to_10000=3026.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0347 grad_action_out_proj_arms=0.2070 grad_arm_token_fuse=0.2116 grad_shared_expert=0.5694 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6200/10000 [1:23:09<50:02, 1.27it/s, loss=0.0025, lr=1.03e-05, step=6199] Training: 62%|██████▏ | 6200/10000 [1:23:09<50:02, 1.27it/s, loss=0.0097, lr=1.03e-05, step=6200] Training: 62%|██████▏ | 6201/10000 [1:23:10<51:06, 1.24it/s, loss=0.0097, lr=1.03e-05, step=6200] Training: 62%|██████▏ | 6201/10000 [1:23:10<51:06, 1.24it/s, loss=0.0020, lr=1.03e-05, step=6201] Training: 62%|██████▏ | 6202/10000 [1:23:10<48:10, 1.31it/s, loss=0.0020, lr=1.03e-05, step=6201] Training: 62%|██████▏ | 6202/10000 [1:23:10<48:10, 1.31it/s, loss=0.0083, lr=1.03e-05, step=6202] Training: 62%|██████▏ | 6203/10000 [1:23:11<44:05, 1.44it/s, loss=0.0083, lr=1.03e-05, step=6202] Training: 62%|██████▏ | 6203/10000 [1:23:11<44:05, 1.44it/s, loss=0.0013, lr=1.03e-05, step=6203] Training: 62%|██████▏ | 6204/10000 [1:23:12<47:11, 1.34it/s, loss=0.0013, lr=1.03e-05, step=6203] Training: 62%|██████▏ | 6204/10000 [1:23:12<47:11, 1.34it/s, loss=0.0072, lr=1.03e-05, step=6204] Training: 62%|██████▏ | 6205/10000 [1:23:13<51:37, 1.23it/s, loss=0.0072, lr=1.03e-05, step=6204] Training: 62%|██████▏ | 6205/10000 [1:23:13<51:37, 1.23it/s, loss=0.0100, lr=1.03e-05, step=6205] Training: 62%|██████▏ | 6206/10000 [1:23:13<49:58, 1.27it/s, loss=0.0100, lr=1.03e-05, step=6205] Training: 62%|██████▏ | 6206/10000 [1:23:13<49:58, 1.27it/s, loss=0.0043, lr=1.03e-05, step=6206] Training: 62%|██████▏ | 6207/10000 [1:23:14<54:32, 1.16it/s, loss=0.0043, lr=1.03e-05, step=6206] Training: 62%|██████▏ | 6207/10000 [1:23:14<54:32, 1.16it/s, loss=0.0035, lr=1.03e-05, step=6207] Training: 62%|██████▏ | 6208/10000 [1:23:15<47:28, 1.33it/s, loss=0.0035, lr=1.03e-05, step=6207] Training: 62%|██████▏ | 6208/10000 [1:23:15<47:28, 1.33it/s, loss=0.0111, lr=1.02e-05, step=6208] Training: 62%|██████▏ | 6209/10000 [1:23:16<46:29, 1.36it/s, loss=0.0111, lr=1.02e-05, step=6208] Training: 62%|██████▏ | 6209/10000 [1:23:16<46:29, 1.36it/s, loss=0.0175, lr=1.02e-05, step=6209]20:07:48.444 [I] step=6210 loss=0.0135 smoothed_loss=0.0095 lr=1.03e-05 grad_norm=0.4292 step_time=0.5849s data_time=0.1520s it/s=1.357 eta_to_10000=2792.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.0869 grad_arm_token_fuse=0.0581 grad_shared_expert=0.2236 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6210/10000 [1:23:16<42:33, 1.48it/s, loss=0.0175, lr=1.02e-05, step=6209] Training: 62%|██████▏ | 6210/10000 [1:23:16<42:33, 1.48it/s, loss=0.0135, lr=1.02e-05, step=6210] Training: 62%|██████▏ | 6211/10000 [1:23:17<38:55, 1.62it/s, loss=0.0135, lr=1.02e-05, step=6210] Training: 62%|██████▏ | 6211/10000 [1:23:17<38:55, 1.62it/s, loss=0.0183, lr=1.02e-05, step=6211] Training: 62%|██████▏ | 6212/10000 [1:23:17<43:38, 1.45it/s, loss=0.0183, lr=1.02e-05, step=6211] Training: 62%|██████▏ | 6212/10000 [1:23:17<43:38, 1.45it/s, loss=0.0018, lr=1.02e-05, step=6212] Training: 62%|██████▏ | 6213/10000 [1:23:18<39:27, 1.60it/s, loss=0.0018, lr=1.02e-05, step=6212] Training: 62%|██████▏ | 6213/10000 [1:23:18<39:27, 1.60it/s, loss=0.0232, lr=1.02e-05, step=6213] Training: 62%|██████▏ | 6214/10000 [1:23:19<46:38, 1.35it/s, loss=0.0232, lr=1.02e-05, step=6213] Training: 62%|██████▏ | 6214/10000 [1:23:19<46:38, 1.35it/s, loss=0.0510, lr=1.02e-05, step=6214] Training: 62%|██████▏ | 6215/10000 [1:23:20<46:08, 1.37it/s, loss=0.0510, lr=1.02e-05, step=6214] Training: 62%|██████▏ | 6215/10000 [1:23:20<46:08, 1.37it/s, loss=0.0076, lr=1.02e-05, step=6215] Training: 62%|██████▏ | 6216/10000 [1:23:20<47:35, 1.32it/s, loss=0.0076, lr=1.02e-05, step=6215] Training: 62%|██████▏ | 6216/10000 [1:23:20<47:35, 1.32it/s, loss=0.0090, lr=1.02e-05, step=6216] Training: 62%|██████▏ | 6217/10000 [1:23:21<50:16, 1.25it/s, loss=0.0090, lr=1.02e-05, step=6216] Training: 62%|██████▏ | 6217/10000 [1:23:21<50:16, 1.25it/s, loss=0.0238, lr=1.02e-05, step=6217] Training: 62%|██████▏ | 6218/10000 [1:23:22<44:35, 1.41it/s, loss=0.0238, lr=1.02e-05, step=6217] Training: 62%|██████▏ | 6218/10000 [1:23:22<44:35, 1.41it/s, loss=0.0079, lr=1.02e-05, step=6218] Training: 62%|██████▏ | 6219/10000 [1:23:22<43:02, 1.46it/s, loss=0.0079, lr=1.02e-05, step=6218] Training: 62%|██████▏ | 6219/10000 [1:23:22<43:02, 1.46it/s, loss=0.0090, lr=1.02e-05, step=6219]20:07:55.599 [I] step=6220 loss=0.0352 smoothed_loss=0.0157 lr=1.02e-05 grad_norm=0.5178 step_time=0.5701s data_time=0.1454s it/s=1.398 eta_to_10000=2704.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0268 grad_action_out_proj_arms=0.1337 grad_arm_token_fuse=0.1434 grad_shared_expert=0.4050 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6220/10000 [1:23:23<44:59, 1.40it/s, loss=0.0090, lr=1.02e-05, step=6219] Training: 62%|██████▏ | 6220/10000 [1:23:23<44:59, 1.40it/s, loss=0.0352, lr=1.02e-05, step=6220] Training: 62%|██████▏ | 6221/10000 [1:23:24<43:41, 1.44it/s, loss=0.0352, lr=1.02e-05, step=6220] Training: 62%|██████▏ | 6221/10000 [1:23:24<43:41, 1.44it/s, loss=0.0040, lr=1.02e-05, step=6221] Training: 62%|██████▏ | 6222/10000 [1:23:25<47:10, 1.33it/s, loss=0.0040, lr=1.02e-05, step=6221] Training: 62%|██████▏ | 6222/10000 [1:23:25<47:10, 1.33it/s, loss=0.0200, lr=1.02e-05, step=6222] Training: 62%|██████▏ | 6223/10000 [1:23:25<43:15, 1.46it/s, loss=0.0200, lr=1.02e-05, step=6222] Training: 62%|██████▏ | 6223/10000 [1:23:25<43:15, 1.46it/s, loss=0.0045, lr=1.02e-05, step=6223] Training: 62%|██████▏ | 6224/10000 [1:23:26<40:16, 1.56it/s, loss=0.0045, lr=1.02e-05, step=6223] Training: 62%|██████▏ | 6224/10000 [1:23:26<40:16, 1.56it/s, loss=0.0080, lr=1.02e-05, step=6224] Training: 62%|██████▏ | 6225/10000 [1:23:27<43:42, 1.44it/s, loss=0.0080, lr=1.02e-05, step=6224] Training: 62%|██████▏ | 6225/10000 [1:23:27<43:42, 1.44it/s, loss=0.0049, lr=1.02e-05, step=6225] Training: 62%|██████▏ | 6226/10000 [1:23:27<43:20, 1.45it/s, loss=0.0049, lr=1.02e-05, step=6225] Training: 62%|██████▏ | 6226/10000 [1:23:27<43:20, 1.45it/s, loss=0.0063, lr=1.02e-05, step=6226] Training: 62%|██████▏ | 6227/10000 [1:23:28<39:25, 1.60it/s, loss=0.0063, lr=1.02e-05, step=6226] Training: 62%|██████▏ | 6227/10000 [1:23:28<39:25, 1.60it/s, loss=0.0070, lr=1.02e-05, step=6227] Training: 62%|██████▏ | 6228/10000 [1:23:28<39:36, 1.59it/s, loss=0.0070, lr=1.02e-05, step=6227] Training: 62%|██████▏ | 6228/10000 [1:23:28<39:36, 1.59it/s, loss=0.0083, lr=1.02e-05, step=6228] Training: 62%|██████▏ | 6229/10000 [1:23:29<41:36, 1.51it/s, loss=0.0083, lr=1.02e-05, step=6228] Training: 62%|██████▏ | 6229/10000 [1:23:29<41:36, 1.51it/s, loss=0.0033, lr=1.02e-05, step=6229]20:08:02.355 [I] step=6230 loss=0.0078 smoothed_loss=0.0101 lr=1.02e-05 grad_norm=0.4147 step_time=0.5781s data_time=0.0976s it/s=1.480 eta_to_10000=2546.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0112 grad_action_out_proj_arms=0.0850 grad_arm_token_fuse=0.0580 grad_shared_expert=0.3223 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6230/10000 [1:23:30<44:17, 1.42it/s, loss=0.0033, lr=1.02e-05, step=6229] Training: 62%|██████▏ | 6230/10000 [1:23:30<44:17, 1.42it/s, loss=0.0078, lr=1.02e-05, step=6230] Training: 62%|██████▏ | 6231/10000 [1:23:31<44:00, 1.43it/s, loss=0.0078, lr=1.02e-05, step=6230] Training: 62%|██████▏ | 6231/10000 [1:23:31<44:00, 1.43it/s, loss=0.0141, lr=1.02e-05, step=6231] Training: 62%|██████▏ | 6232/10000 [1:23:31<40:11, 1.56it/s, loss=0.0141, lr=1.02e-05, step=6231] Training: 62%|██████▏ | 6232/10000 [1:23:31<40:11, 1.56it/s, loss=0.0171, lr=1.02e-05, step=6232] Training: 62%|██████▏ | 6233/10000 [1:23:32<40:51, 1.54it/s, loss=0.0171, lr=1.02e-05, step=6232] Training: 62%|██████▏ | 6233/10000 [1:23:32<40:51, 1.54it/s, loss=0.0130, lr=1.02e-05, step=6233] Training: 62%|██████▏ | 6234/10000 [1:23:32<38:08, 1.65it/s, loss=0.0130, lr=1.02e-05, step=6233] Training: 62%|██████▏ | 6234/10000 [1:23:32<38:08, 1.65it/s, loss=0.0053, lr=1.02e-05, step=6234] Training: 62%|██████▏ | 6235/10000 [1:23:33<35:59, 1.74it/s, loss=0.0053, lr=1.02e-05, step=6234] Training: 62%|██████▏ | 6235/10000 [1:23:33<35:59, 1.74it/s, loss=0.0184, lr=1.02e-05, step=6235] Training: 62%|██████▏ | 6236/10000 [1:23:34<46:27, 1.35it/s, loss=0.0184, lr=1.02e-05, step=6235] Training: 62%|██████▏ | 6236/10000 [1:23:34<46:27, 1.35it/s, loss=0.0306, lr=1.01e-05, step=6236] Training: 62%|██████▏ | 6237/10000 [1:23:35<45:00, 1.39it/s, loss=0.0306, lr=1.01e-05, step=6236] Training: 62%|██████▏ | 6237/10000 [1:23:35<45:00, 1.39it/s, loss=0.0257, lr=1.01e-05, step=6237] Training: 62%|██████▏ | 6238/10000 [1:23:35<43:18, 1.45it/s, loss=0.0257, lr=1.01e-05, step=6237] Training: 62%|██████▏ | 6238/10000 [1:23:35<43:18, 1.45it/s, loss=0.0035, lr=1.01e-05, step=6238] Training: 62%|██████▏ | 6239/10000 [1:23:36<39:49, 1.57it/s, loss=0.0035, lr=1.01e-05, step=6238] Training: 62%|██████▏ | 6239/10000 [1:23:36<39:49, 1.57it/s, loss=0.0050, lr=1.01e-05, step=6239]20:08:09.103 [I] step=6240 loss=0.0328 smoothed_loss=0.0147 lr=1.02e-05 grad_norm=0.4697 step_time=0.5465s data_time=0.1282s it/s=1.482 eta_to_10000=2536.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0303 grad_action_out_proj_arms=0.1780 grad_arm_token_fuse=0.1669 grad_shared_expert=0.3687 (18633:train_pytorch.py:850) + Training: 62%|██████▏ | 6240/10000 [1:23:37<45:47, 1.37it/s, loss=0.0050, lr=1.01e-05, step=6239] Training: 62%|██████▏ | 6240/10000 [1:23:37<45:47, 1.37it/s, loss=0.0328, lr=1.01e-05, step=6240] Training: 62%|██████▏ | 6241/10000 [1:23:38<47:48, 1.31it/s, loss=0.0328, lr=1.01e-05, step=6240] Training: 62%|██████▏ | 6241/10000 [1:23:38<47:48, 1.31it/s, loss=0.0098, lr=1.01e-05, step=6241] Training: 62%|██████▏ | 6242/10000 [1:23:38<48:00, 1.30it/s, loss=0.0098, lr=1.01e-05, step=6241] Training: 62%|██████▏ | 6242/10000 [1:23:38<48:00, 1.30it/s, loss=0.0226, lr=1.01e-05, step=6242] Training: 62%|██████▏ | 6243/10000 [1:23:39<52:42, 1.19it/s, loss=0.0226, lr=1.01e-05, step=6242] Training: 62%|██████▏ | 6243/10000 [1:23:39<52:42, 1.19it/s, loss=0.0031, lr=1.01e-05, step=6243] Training: 62%|██████▏ | 6244/10000 [1:23:40<48:57, 1.28it/s, loss=0.0031, lr=1.01e-05, step=6243] Training: 62%|██████▏ | 6244/10000 [1:23:40<48:57, 1.28it/s, loss=0.0026, lr=1.01e-05, step=6244] Training: 62%|██████▏ | 6245/10000 [1:23:41<51:31, 1.21it/s, loss=0.0026, lr=1.01e-05, step=6244] Training: 62%|██████▏ | 6245/10000 [1:23:41<51:31, 1.21it/s, loss=0.0037, lr=1.01e-05, step=6245] Training: 62%|██████▏ | 6246/10000 [1:23:41<45:40, 1.37it/s, loss=0.0037, lr=1.01e-05, step=6245] Training: 62%|██████▏ | 6246/10000 [1:23:41<45:40, 1.37it/s, loss=0.0132, lr=1.01e-05, step=6246] Training: 62%|██████▏ | 6247/10000 [1:23:42<47:53, 1.31it/s, loss=0.0132, lr=1.01e-05, step=6246] Training: 62%|██████▏ | 6247/10000 [1:23:42<47:53, 1.31it/s, loss=0.0197, lr=1.01e-05, step=6247] Training: 62%|██████▏ | 6248/10000 [1:23:43<48:53, 1.28it/s, loss=0.0197, lr=1.01e-05, step=6247] Training: 62%|██████▏ | 6248/10000 [1:23:43<48:53, 1.28it/s, loss=0.0081, lr=1.01e-05, step=6248] Training: 62%|██████▏ | 6249/10000 [1:23:44<49:27, 1.26it/s, loss=0.0081, lr=1.01e-05, step=6248] Training: 62%|██████▏ | 6249/10000 [1:23:44<49:27, 1.26it/s, loss=0.0430, lr=1.01e-05, step=6249]20:08:17.067 [I] step=6250 loss=0.0081 smoothed_loss=0.0146 lr=1.01e-05 grad_norm=0.3919 step_time=0.6320s data_time=0.1644s it/s=1.256 eta_to_10000=2985.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0096 grad_action_out_proj_arms=0.1090 grad_arm_token_fuse=0.0504 grad_shared_expert=0.3675 (18633:train_pytorch.py:850) + Training: 62%|██████▎ | 6250/10000 [1:23:45<49:11, 1.27it/s, loss=0.0430, lr=1.01e-05, step=6249] Training: 62%|██████▎ | 6250/10000 [1:23:45<49:11, 1.27it/s, loss=0.0081, lr=1.01e-05, step=6250] Training: 63%|██████▎ | 6251/10000 [1:23:45<47:39, 1.31it/s, loss=0.0081, lr=1.01e-05, step=6250] Training: 63%|██████▎ | 6251/10000 [1:23:45<47:39, 1.31it/s, loss=0.0044, lr=1.01e-05, step=6251] Training: 63%|██████▎ | 6252/10000 [1:23:46<49:59, 1.25it/s, loss=0.0044, lr=1.01e-05, step=6251] Training: 63%|██████▎ | 6252/10000 [1:23:46<49:59, 1.25it/s, loss=0.0076, lr=1.01e-05, step=6252] Training: 63%|██████▎ | 6253/10000 [1:23:47<44:10, 1.41it/s, loss=0.0076, lr=1.01e-05, step=6252] Training: 63%|██████▎ | 6253/10000 [1:23:47<44:10, 1.41it/s, loss=0.0055, lr=1.01e-05, step=6253] Training: 63%|██████▎ | 6254/10000 [1:23:48<47:01, 1.33it/s, loss=0.0055, lr=1.01e-05, step=6253] Training: 63%|██████▎ | 6254/10000 [1:23:48<47:01, 1.33it/s, loss=0.0043, lr=1.01e-05, step=6254] Training: 63%|██████▎ | 6255/10000 [1:23:49<50:55, 1.23it/s, loss=0.0043, lr=1.01e-05, step=6254] Training: 63%|██████▎ | 6255/10000 [1:23:49<50:55, 1.23it/s, loss=0.0141, lr=1.01e-05, step=6255] Training: 63%|██████▎ | 6256/10000 [1:23:49<48:19, 1.29it/s, loss=0.0141, lr=1.01e-05, step=6255] Training: 63%|██████▎ | 6256/10000 [1:23:49<48:19, 1.29it/s, loss=0.0056, lr=1.01e-05, step=6256] Training: 63%|██████▎ | 6257/10000 [1:23:50<55:32, 1.12it/s, loss=0.0056, lr=1.01e-05, step=6256] Training: 63%|██████▎ | 6257/10000 [1:23:50<55:32, 1.12it/s, loss=0.0100, lr=1.01e-05, step=6257] Training: 63%|██████▎ | 6258/10000 [1:23:51<54:22, 1.15it/s, loss=0.0100, lr=1.01e-05, step=6257] Training: 63%|██████▎ | 6258/10000 [1:23:51<54:22, 1.15it/s, loss=0.0066, lr=1.01e-05, step=6258] Training: 63%|██████▎ | 6259/10000 [1:23:52<54:38, 1.14it/s, loss=0.0066, lr=1.01e-05, step=6258] Training: 63%|██████▎ | 6259/10000 [1:23:52<54:38, 1.14it/s, loss=0.0334, lr=1.01e-05, step=6259]20:08:25.384 [I] step=6260 loss=0.0040 smoothed_loss=0.0120 lr=1.01e-05 grad_norm=0.4130 step_time=0.6326s data_time=0.1991s it/s=1.202 eta_to_10000=3110.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0108 grad_action_out_proj_arms=0.1005 grad_arm_token_fuse=0.0598 grad_shared_expert=0.3259 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6260/10000 [1:23:53<54:15, 1.15it/s, loss=0.0334, lr=1.01e-05, step=6259] Training: 63%|██████▎ | 6260/10000 [1:23:53<54:15, 1.15it/s, loss=0.0040, lr=1.01e-05, step=6260] Training: 63%|██████▎ | 6261/10000 [1:23:54<54:33, 1.14it/s, loss=0.0040, lr=1.01e-05, step=6260] Training: 63%|██████▎ | 6261/10000 [1:23:54<54:33, 1.14it/s, loss=0.0040, lr=1.01e-05, step=6261] Training: 63%|██████▎ | 6262/10000 [1:23:54<46:56, 1.33it/s, loss=0.0040, lr=1.01e-05, step=6261] Training: 63%|██████▎ | 6262/10000 [1:23:54<46:56, 1.33it/s, loss=0.0035, lr=1.01e-05, step=6262] Training: 63%|██████▎ | 6263/10000 [1:23:55<45:52, 1.36it/s, loss=0.0035, lr=1.01e-05, step=6262] Training: 63%|██████▎ | 6263/10000 [1:23:55<45:52, 1.36it/s, loss=0.0089, lr=1.01e-05, step=6263] Training: 63%|██████▎ | 6264/10000 [1:23:56<44:12, 1.41it/s, loss=0.0089, lr=1.01e-05, step=6263] Training: 63%|██████▎ | 6264/10000 [1:23:56<44:12, 1.41it/s, loss=0.0035, lr=1.01e-05, step=6264] Training: 63%|██████▎ | 6265/10000 [1:23:56<43:43, 1.42it/s, loss=0.0035, lr=1.01e-05, step=6264] Training: 63%|██████▎ | 6265/10000 [1:23:56<43:43, 1.42it/s, loss=0.0104, lr=1.00e-05, step=6265] Training: 63%|██████▎ | 6266/10000 [1:23:57<39:45, 1.57it/s, loss=0.0104, lr=1.00e-05, step=6265] Training: 63%|██████▎ | 6266/10000 [1:23:57<39:45, 1.57it/s, loss=0.0082, lr=1.00e-05, step=6266] Training: 63%|██████▎ | 6267/10000 [1:23:57<37:03, 1.68it/s, loss=0.0082, lr=1.00e-05, step=6266] Training: 63%|██████▎ | 6267/10000 [1:23:57<37:03, 1.68it/s, loss=0.0039, lr=1.00e-05, step=6267] Training: 63%|██████▎ | 6268/10000 [1:23:58<38:08, 1.63it/s, loss=0.0039, lr=1.00e-05, step=6267] Training: 63%|██████▎ | 6268/10000 [1:23:58<38:08, 1.63it/s, loss=0.0125, lr=1.00e-05, step=6268] Training: 63%|██████▎ | 6269/10000 [1:23:59<36:08, 1.72it/s, loss=0.0125, lr=1.00e-05, step=6268] Training: 63%|██████▎ | 6269/10000 [1:23:59<36:08, 1.72it/s, loss=0.0054, lr=1.00e-05, step=6269]20:08:31.714 [I] step=6270 loss=0.0028 smoothed_loss=0.0083 lr=1.00e-05 grad_norm=0.4151 step_time=0.5251s data_time=0.1079s it/s=1.580 eta_to_10000=2360.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0061 grad_action_out_proj_arms=0.0715 grad_arm_token_fuse=0.0315 grad_shared_expert=0.4373 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6270/10000 [1:23:59<40:10, 1.55it/s, loss=0.0054, lr=1.00e-05, step=6269] Training: 63%|██████▎ | 6270/10000 [1:23:59<40:10, 1.55it/s, loss=0.0028, lr=1.00e-05, step=6270] Training: 63%|██████▎ | 6271/10000 [1:24:00<41:50, 1.49it/s, loss=0.0028, lr=1.00e-05, step=6270] Training: 63%|██████▎ | 6271/10000 [1:24:00<41:50, 1.49it/s, loss=0.0060, lr=1.00e-05, step=6271] Training: 63%|██████▎ | 6272/10000 [1:24:01<41:58, 1.48it/s, loss=0.0060, lr=1.00e-05, step=6271] Training: 63%|██████▎ | 6272/10000 [1:24:01<41:58, 1.48it/s, loss=0.0135, lr=1.00e-05, step=6272] Training: 63%|██████▎ | 6273/10000 [1:24:02<43:29, 1.43it/s, loss=0.0135, lr=1.00e-05, step=6272] Training: 63%|██████▎ | 6273/10000 [1:24:02<43:29, 1.43it/s, loss=0.0043, lr=1.00e-05, step=6273] Training: 63%|██████▎ | 6274/10000 [1:24:02<43:39, 1.42it/s, loss=0.0043, lr=1.00e-05, step=6273] Training: 63%|██████▎ | 6274/10000 [1:24:02<43:39, 1.42it/s, loss=0.0193, lr=1.00e-05, step=6274] Training: 63%|██████▎ | 6275/10000 [1:24:03<49:23, 1.26it/s, loss=0.0193, lr=1.00e-05, step=6274] Training: 63%|██████▎ | 6275/10000 [1:24:03<49:23, 1.26it/s, loss=0.0017, lr=1.00e-05, step=6275] Training: 63%|██████▎ | 6276/10000 [1:24:04<43:34, 1.42it/s, loss=0.0017, lr=1.00e-05, step=6275] Training: 63%|██████▎ | 6276/10000 [1:24:04<43:34, 1.42it/s, loss=0.0031, lr=1.00e-05, step=6276] Training: 63%|██████▎ | 6277/10000 [1:24:04<40:01, 1.55it/s, loss=0.0031, lr=1.00e-05, step=6276] Training: 63%|██████▎ | 6277/10000 [1:24:04<40:01, 1.55it/s, loss=0.0068, lr=1.00e-05, step=6277] Training: 63%|██████▎ | 6278/10000 [1:24:05<40:36, 1.53it/s, loss=0.0068, lr=1.00e-05, step=6277] Training: 63%|██████▎ | 6278/10000 [1:24:05<40:36, 1.53it/s, loss=0.0065, lr=1.00e-05, step=6278] Training: 63%|██████▎ | 6279/10000 [1:24:06<41:13, 1.50it/s, loss=0.0065, lr=1.00e-05, step=6278] Training: 63%|██████▎ | 6279/10000 [1:24:06<41:13, 1.50it/s, loss=0.0417, lr=1.00e-05, step=6279]20:08:38.503 [I] step=6280 loss=0.0132 smoothed_loss=0.0113 lr=1.00e-05 grad_norm=0.4040 step_time=0.5683s data_time=0.1106s it/s=1.473 eta_to_10000=2525.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0063 grad_action_out_proj_arms=0.0614 grad_arm_token_fuse=0.0330 grad_shared_expert=0.4125 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6280/10000 [1:24:06<38:45, 1.60it/s, loss=0.0417, lr=1.00e-05, step=6279] Training: 63%|██████▎ | 6280/10000 [1:24:06<38:45, 1.60it/s, loss=0.0132, lr=1.00e-05, step=6280] Training: 63%|██████▎ | 6281/10000 [1:24:07<36:06, 1.72it/s, loss=0.0132, lr=1.00e-05, step=6280] Training: 63%|██████▎ | 6281/10000 [1:24:07<36:06, 1.72it/s, loss=0.0123, lr=9.99e-06, step=6281] Training: 63%|██████▎ | 6282/10000 [1:24:07<34:27, 1.80it/s, loss=0.0123, lr=9.99e-06, step=6281] Training: 63%|██████▎ | 6282/10000 [1:24:07<34:27, 1.80it/s, loss=0.0137, lr=9.99e-06, step=6282] Training: 63%|██████▎ | 6283/10000 [1:24:08<37:27, 1.65it/s, loss=0.0137, lr=9.99e-06, step=6282] Training: 63%|██████▎ | 6283/10000 [1:24:08<37:27, 1.65it/s, loss=0.0046, lr=9.98e-06, step=6283] Training: 63%|██████▎ | 6284/10000 [1:24:08<35:01, 1.77it/s, loss=0.0046, lr=9.98e-06, step=6283] Training: 63%|██████▎ | 6284/10000 [1:24:08<35:01, 1.77it/s, loss=0.0026, lr=9.98e-06, step=6284] Training: 63%|██████▎ | 6285/10000 [1:24:09<35:27, 1.75it/s, loss=0.0026, lr=9.98e-06, step=6284] Training: 63%|██████▎ | 6285/10000 [1:24:09<35:27, 1.75it/s, loss=0.0045, lr=9.98e-06, step=6285] Training: 63%|██████▎ | 6286/10000 [1:24:10<37:23, 1.66it/s, loss=0.0045, lr=9.98e-06, step=6285] Training: 63%|██████▎ | 6286/10000 [1:24:10<37:23, 1.66it/s, loss=0.0023, lr=9.97e-06, step=6286] Training: 63%|██████▎ | 6287/10000 [1:24:10<37:09, 1.67it/s, loss=0.0023, lr=9.97e-06, step=6286] Training: 63%|██████▎ | 6287/10000 [1:24:10<37:09, 1.67it/s, loss=0.0066, lr=9.97e-06, step=6287] Training: 63%|██████▎ | 6288/10000 [1:24:11<40:05, 1.54it/s, loss=0.0066, lr=9.97e-06, step=6287] Training: 63%|██████▎ | 6288/10000 [1:24:11<40:05, 1.54it/s, loss=0.0119, lr=9.97e-06, step=6288] Training: 63%|██████▎ | 6289/10000 [1:24:11<37:08, 1.67it/s, loss=0.0119, lr=9.97e-06, step=6288] Training: 63%|██████▎ | 6289/10000 [1:24:11<37:08, 1.67it/s, loss=0.0088, lr=9.96e-06, step=6289]20:08:44.458 [I] step=6290 loss=0.0012 smoothed_loss=0.0082 lr=9.98e-06 grad_norm=0.4017 step_time=0.5232s data_time=0.0723s it/s=1.680 eta_to_10000=2208.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0065 grad_action_out_proj_arms=0.0618 grad_arm_token_fuse=0.0326 grad_shared_expert=0.2433 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6290/10000 [1:24:12<38:33, 1.60it/s, loss=0.0088, lr=9.96e-06, step=6289] Training: 63%|██████▎ | 6290/10000 [1:24:12<38:33, 1.60it/s, loss=0.0012, lr=9.96e-06, step=6290] Training: 63%|██████▎ | 6291/10000 [1:24:13<35:57, 1.72it/s, loss=0.0012, lr=9.96e-06, step=6290] Training: 63%|██████▎ | 6291/10000 [1:24:13<35:57, 1.72it/s, loss=0.0192, lr=9.96e-06, step=6291] Training: 63%|██████▎ | 6292/10000 [1:24:13<34:22, 1.80it/s, loss=0.0192, lr=9.96e-06, step=6291] Training: 63%|██████▎ | 6292/10000 [1:24:13<34:22, 1.80it/s, loss=0.0104, lr=9.95e-06, step=6292] Training: 63%|██████▎ | 6293/10000 [1:24:14<37:16, 1.66it/s, loss=0.0104, lr=9.95e-06, step=6292] Training: 63%|██████▎ | 6293/10000 [1:24:14<37:16, 1.66it/s, loss=0.0056, lr=9.95e-06, step=6293] Training: 63%|██████▎ | 6294/10000 [1:24:14<34:58, 1.77it/s, loss=0.0056, lr=9.95e-06, step=6293] Training: 63%|██████▎ | 6294/10000 [1:24:14<34:58, 1.77it/s, loss=0.0119, lr=9.95e-06, step=6294] Training: 63%|██████▎ | 6295/10000 [1:24:15<34:17, 1.80it/s, loss=0.0119, lr=9.95e-06, step=6294] Training: 63%|██████▎ | 6295/10000 [1:24:15<34:17, 1.80it/s, loss=0.0170, lr=9.94e-06, step=6295] Training: 63%|██████▎ | 6296/10000 [1:24:15<32:54, 1.88it/s, loss=0.0170, lr=9.94e-06, step=6295] Training: 63%|██████▎ | 6296/10000 [1:24:15<32:54, 1.88it/s, loss=0.0050, lr=9.94e-06, step=6296] Training: 63%|██████▎ | 6297/10000 [1:24:16<35:18, 1.75it/s, loss=0.0050, lr=9.94e-06, step=6296] Training: 63%|██████▎ | 6297/10000 [1:24:16<35:18, 1.75it/s, loss=0.0142, lr=9.94e-06, step=6297] Training: 63%|██████▎ | 6298/10000 [1:24:17<36:53, 1.67it/s, loss=0.0142, lr=9.94e-06, step=6297] Training: 63%|██████▎ | 6298/10000 [1:24:17<36:53, 1.67it/s, loss=0.0018, lr=9.93e-06, step=6298] Training: 63%|██████▎ | 6299/10000 [1:24:17<34:49, 1.77it/s, loss=0.0018, lr=9.93e-06, step=6298] Training: 63%|██████▎ | 6299/10000 [1:24:17<34:49, 1.77it/s, loss=0.0119, lr=9.93e-06, step=6299]20:08:50.184 [I] step=6300 loss=0.0074 smoothed_loss=0.0093 lr=9.94e-06 grad_norm=0.4376 step_time=0.4945s data_time=0.0781s it/s=1.747 eta_to_10000=2118.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0065 grad_action_out_proj_arms=0.0711 grad_arm_token_fuse=0.0314 grad_shared_expert=0.2545 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6300/10000 [1:24:18<37:55, 1.63it/s, loss=0.0119, lr=9.93e-06, step=6299] Training: 63%|██████▎ | 6300/10000 [1:24:18<37:55, 1.63it/s, loss=0.0074, lr=9.93e-06, step=6300] Training: 63%|██████▎ | 6301/10000 [1:24:19<42:58, 1.43it/s, loss=0.0074, lr=9.93e-06, step=6300] Training: 63%|██████▎ | 6301/10000 [1:24:19<42:58, 1.43it/s, loss=0.0062, lr=9.92e-06, step=6301] Training: 63%|██████▎ | 6302/10000 [1:24:19<39:07, 1.57it/s, loss=0.0062, lr=9.92e-06, step=6301] Training: 63%|██████▎ | 6302/10000 [1:24:19<39:07, 1.57it/s, loss=0.0217, lr=9.92e-06, step=6302] Training: 63%|██████▎ | 6303/10000 [1:24:20<37:43, 1.63it/s, loss=0.0217, lr=9.92e-06, step=6302] Training: 63%|██████▎ | 6303/10000 [1:24:20<37:43, 1.63it/s, loss=0.0159, lr=9.91e-06, step=6303] Training: 63%|██████▎ | 6304/10000 [1:24:20<38:27, 1.60it/s, loss=0.0159, lr=9.91e-06, step=6303] Training: 63%|██████▎ | 6304/10000 [1:24:20<38:27, 1.60it/s, loss=0.0048, lr=9.91e-06, step=6304] Training: 63%|██████▎ | 6305/10000 [1:24:21<45:53, 1.34it/s, loss=0.0048, lr=9.91e-06, step=6304] Training: 63%|██████▎ | 6305/10000 [1:24:21<45:53, 1.34it/s, loss=0.0095, lr=9.91e-06, step=6305] Training: 63%|██████▎ | 6306/10000 [1:24:22<43:09, 1.43it/s, loss=0.0095, lr=9.91e-06, step=6305] Training: 63%|██████▎ | 6306/10000 [1:24:22<43:09, 1.43it/s, loss=0.0085, lr=9.90e-06, step=6306] Training: 63%|██████▎ | 6307/10000 [1:24:23<44:55, 1.37it/s, loss=0.0085, lr=9.90e-06, step=6306] Training: 63%|██████▎ | 6307/10000 [1:24:23<44:55, 1.37it/s, loss=0.0115, lr=9.90e-06, step=6307] Training: 63%|██████▎ | 6308/10000 [1:24:23<42:13, 1.46it/s, loss=0.0115, lr=9.90e-06, step=6307] Training: 63%|██████▎ | 6308/10000 [1:24:23<42:13, 1.46it/s, loss=0.0038, lr=9.90e-06, step=6308] Training: 63%|██████▎ | 6309/10000 [1:24:24<42:36, 1.44it/s, loss=0.0038, lr=9.90e-06, step=6308] Training: 63%|██████▎ | 6309/10000 [1:24:24<42:36, 1.44it/s, loss=0.0045, lr=9.89e-06, step=6309]20:08:57.132 [I] step=6310 loss=0.0078 smoothed_loss=0.0089 lr=9.91e-06 grad_norm=0.4378 step_time=0.5590s data_time=0.1357s it/s=1.439 eta_to_10000=2563.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0221 grad_action_out_proj_arms=0.1591 grad_arm_token_fuse=0.1283 grad_shared_expert=0.4825 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6310/10000 [1:24:25<41:41, 1.48it/s, loss=0.0045, lr=9.89e-06, step=6309] Training: 63%|██████▎ | 6310/10000 [1:24:25<41:41, 1.48it/s, loss=0.0078, lr=9.89e-06, step=6310] Training: 63%|██████▎ | 6311/10000 [1:24:25<39:26, 1.56it/s, loss=0.0078, lr=9.89e-06, step=6310] Training: 63%|██████▎ | 6311/10000 [1:24:25<39:26, 1.56it/s, loss=0.0032, lr=9.89e-06, step=6311] Training: 63%|██████▎ | 6312/10000 [1:24:26<40:15, 1.53it/s, loss=0.0032, lr=9.89e-06, step=6311] Training: 63%|██████▎ | 6312/10000 [1:24:26<40:15, 1.53it/s, loss=0.0049, lr=9.88e-06, step=6312] Training: 63%|██████▎ | 6313/10000 [1:24:27<43:35, 1.41it/s, loss=0.0049, lr=9.88e-06, step=6312] Training: 63%|██████▎ | 6313/10000 [1:24:27<43:35, 1.41it/s, loss=0.0014, lr=9.88e-06, step=6313] Training: 63%|██████▎ | 6314/10000 [1:24:28<42:23, 1.45it/s, loss=0.0014, lr=9.88e-06, step=6313] Training: 63%|██████▎ | 6314/10000 [1:24:28<42:23, 1.45it/s, loss=0.0100, lr=9.88e-06, step=6314] Training: 63%|██████▎ | 6315/10000 [1:24:28<43:28, 1.41it/s, loss=0.0100, lr=9.88e-06, step=6314] Training: 63%|██████▎ | 6315/10000 [1:24:28<43:28, 1.41it/s, loss=0.0141, lr=9.87e-06, step=6315] Training: 63%|██████▎ | 6316/10000 [1:24:29<39:16, 1.56it/s, loss=0.0141, lr=9.87e-06, step=6315] Training: 63%|██████▎ | 6316/10000 [1:24:29<39:16, 1.56it/s, loss=0.0103, lr=9.87e-06, step=6316] Training: 63%|██████▎ | 6317/10000 [1:24:29<36:16, 1.69it/s, loss=0.0103, lr=9.87e-06, step=6316] Training: 63%|██████▎ | 6317/10000 [1:24:29<36:16, 1.69it/s, loss=0.0051, lr=9.87e-06, step=6317] Training: 63%|██████▎ | 6318/10000 [1:24:30<34:17, 1.79it/s, loss=0.0051, lr=9.87e-06, step=6317] Training: 63%|██████▎ | 6318/10000 [1:24:30<34:17, 1.79it/s, loss=0.0034, lr=9.86e-06, step=6318] Training: 63%|██████▎ | 6319/10000 [1:24:30<33:18, 1.84it/s, loss=0.0034, lr=9.86e-06, step=6318] Training: 63%|██████▎ | 6319/10000 [1:24:30<33:18, 1.84it/s, loss=0.0027, lr=9.86e-06, step=6319]20:09:03.238 [I] step=6320 loss=0.0254 smoothed_loss=0.0090 lr=9.87e-06 grad_norm=0.4458 step_time=0.5420s data_time=0.0687s it/s=1.638 eta_to_10000=2246.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0077 grad_action_out_proj_arms=0.0773 grad_arm_token_fuse=0.0406 grad_shared_expert=0.3770 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6320/10000 [1:24:31<35:54, 1.71it/s, loss=0.0027, lr=9.86e-06, step=6319] Training: 63%|██████▎ | 6320/10000 [1:24:31<35:54, 1.71it/s, loss=0.0254, lr=9.86e-06, step=6320] Training: 63%|██████▎ | 6321/10000 [1:24:32<37:20, 1.64it/s, loss=0.0254, lr=9.86e-06, step=6320] Training: 63%|██████▎ | 6321/10000 [1:24:32<37:20, 1.64it/s, loss=0.0059, lr=9.85e-06, step=6321] Training: 63%|██████▎ | 6322/10000 [1:24:32<39:09, 1.57it/s, loss=0.0059, lr=9.85e-06, step=6321] Training: 63%|██████▎ | 6322/10000 [1:24:32<39:09, 1.57it/s, loss=0.0072, lr=9.85e-06, step=6322] Training: 63%|██████▎ | 6323/10000 [1:24:33<36:10, 1.69it/s, loss=0.0072, lr=9.85e-06, step=6322] Training: 63%|██████▎ | 6323/10000 [1:24:33<36:10, 1.69it/s, loss=0.0011, lr=9.84e-06, step=6323] Training: 63%|██████▎ | 6324/10000 [1:24:33<35:21, 1.73it/s, loss=0.0011, lr=9.84e-06, step=6323] Training: 63%|██████▎ | 6324/10000 [1:24:33<35:21, 1.73it/s, loss=0.0162, lr=9.84e-06, step=6324] Training: 63%|██████▎ | 6325/10000 [1:24:34<33:19, 1.84it/s, loss=0.0162, lr=9.84e-06, step=6324] Training: 63%|██████▎ | 6325/10000 [1:24:34<33:19, 1.84it/s, loss=0.0419, lr=9.84e-06, step=6325] Training: 63%|██████▎ | 6326/10000 [1:24:34<32:13, 1.90it/s, loss=0.0419, lr=9.84e-06, step=6325] Training: 63%|██████▎ | 6326/10000 [1:24:34<32:13, 1.90it/s, loss=0.0165, lr=9.83e-06, step=6326] Training: 63%|██████▎ | 6327/10000 [1:24:35<31:37, 1.94it/s, loss=0.0165, lr=9.83e-06, step=6326] Training: 63%|██████▎ | 6327/10000 [1:24:35<31:37, 1.94it/s, loss=0.0120, lr=9.83e-06, step=6327] Training: 63%|██████▎ | 6328/10000 [1:24:35<34:23, 1.78it/s, loss=0.0120, lr=9.83e-06, step=6327] Training: 63%|██████▎ | 6328/10000 [1:24:35<34:23, 1.78it/s, loss=0.0048, lr=9.83e-06, step=6328] Training: 63%|██████▎ | 6329/10000 [1:24:36<36:34, 1.67it/s, loss=0.0048, lr=9.83e-06, step=6328] Training: 63%|██████▎ | 6329/10000 [1:24:36<36:34, 1.67it/s, loss=0.0090, lr=9.82e-06, step=6329]20:09:08.978 [I] step=6330 loss=0.0032 smoothed_loss=0.0105 lr=9.84e-06 grad_norm=0.4618 step_time=0.5098s data_time=0.0642s it/s=1.742 eta_to_10000=2106.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0068 grad_action_out_proj_arms=0.0700 grad_arm_token_fuse=0.0319 grad_shared_expert=0.3514 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6330/10000 [1:24:37<35:40, 1.71it/s, loss=0.0090, lr=9.82e-06, step=6329] Training: 63%|██████▎ | 6330/10000 [1:24:37<35:40, 1.71it/s, loss=0.0032, lr=9.82e-06, step=6330] Training: 63%|██████▎ | 6331/10000 [1:24:37<33:35, 1.82it/s, loss=0.0032, lr=9.82e-06, step=6330] Training: 63%|██████▎ | 6331/10000 [1:24:37<33:35, 1.82it/s, loss=0.0132, lr=9.82e-06, step=6331] Training: 63%|██████▎ | 6332/10000 [1:24:38<32:27, 1.88it/s, loss=0.0132, lr=9.82e-06, step=6331] Training: 63%|██████▎ | 6332/10000 [1:24:38<32:27, 1.88it/s, loss=0.0073, lr=9.81e-06, step=6332] Training: 63%|██████▎ | 6333/10000 [1:24:38<31:28, 1.94it/s, loss=0.0073, lr=9.81e-06, step=6332] Training: 63%|██████▎ | 6333/10000 [1:24:38<31:28, 1.94it/s, loss=0.0030, lr=9.81e-06, step=6333] Training: 63%|██████▎ | 6334/10000 [1:24:39<30:36, 2.00it/s, loss=0.0030, lr=9.81e-06, step=6333] Training: 63%|██████▎ | 6334/10000 [1:24:39<30:36, 2.00it/s, loss=0.0144, lr=9.81e-06, step=6334] Training: 63%|██████▎ | 6335/10000 [1:24:39<33:31, 1.82it/s, loss=0.0144, lr=9.81e-06, step=6334] Training: 63%|██████▎ | 6335/10000 [1:24:39<33:31, 1.82it/s, loss=0.0069, lr=9.80e-06, step=6335] Training: 63%|██████▎ | 6336/10000 [1:24:40<35:56, 1.70it/s, loss=0.0069, lr=9.80e-06, step=6335] Training: 63%|██████▎ | 6336/10000 [1:24:40<35:56, 1.70it/s, loss=0.0044, lr=9.80e-06, step=6336] Training: 63%|██████▎ | 6337/10000 [1:24:40<34:00, 1.80it/s, loss=0.0044, lr=9.80e-06, step=6336] Training: 63%|██████▎ | 6337/10000 [1:24:40<34:00, 1.80it/s, loss=0.0039, lr=9.80e-06, step=6337] Training: 63%|██████▎ | 6338/10000 [1:24:41<32:30, 1.88it/s, loss=0.0039, lr=9.80e-06, step=6337] Training: 63%|██████▎ | 6338/10000 [1:24:41<32:30, 1.88it/s, loss=0.0170, lr=9.79e-06, step=6338] Training: 63%|██████▎ | 6339/10000 [1:24:41<31:21, 1.95it/s, loss=0.0170, lr=9.79e-06, step=6338] Training: 63%|██████▎ | 6339/10000 [1:24:41<31:21, 1.95it/s, loss=0.0044, lr=9.79e-06, step=6339]20:09:14.162 [I] step=6340 loss=0.0142 smoothed_loss=0.0096 lr=9.80e-06 grad_norm=0.3913 step_time=0.4584s data_time=0.0601s it/s=1.929 eta_to_10000=1897.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0069 grad_action_out_proj_arms=0.0714 grad_arm_token_fuse=0.0316 grad_shared_expert=0.2555 (18633:train_pytorch.py:850) + Training: 63%|██████▎ | 6340/10000 [1:24:42<31:15, 1.95it/s, loss=0.0044, lr=9.79e-06, step=6339] Training: 63%|██████▎ | 6340/10000 [1:24:42<31:15, 1.95it/s, loss=0.0142, lr=9.79e-06, step=6340] Training: 63%|██████▎ | 6341/10000 [1:24:42<30:42, 1.99it/s, loss=0.0142, lr=9.79e-06, step=6340] Training: 63%|██████▎ | 6341/10000 [1:24:42<30:42, 1.99it/s, loss=0.0056, lr=9.78e-06, step=6341] Training: 63%|██████▎ | 6342/10000 [1:24:43<32:59, 1.85it/s, loss=0.0056, lr=9.78e-06, step=6341] Training: 63%|██████▎ | 6342/10000 [1:24:43<32:59, 1.85it/s, loss=0.0077, lr=9.78e-06, step=6342] Training: 63%|██████▎ | 6343/10000 [1:24:44<35:56, 1.70it/s, loss=0.0077, lr=9.78e-06, step=6342] Training: 63%|██████▎ | 6343/10000 [1:24:44<35:56, 1.70it/s, loss=0.0144, lr=9.78e-06, step=6343] Training: 63%|██████▎ | 6344/10000 [1:24:44<34:01, 1.79it/s, loss=0.0144, lr=9.78e-06, step=6343] Training: 63%|██████▎ | 6344/10000 [1:24:44<34:01, 1.79it/s, loss=0.0109, lr=9.77e-06, step=6344] Training: 63%|██████▎ | 6345/10000 [1:24:45<32:32, 1.87it/s, loss=0.0109, lr=9.77e-06, step=6344] Training: 63%|██████▎ | 6345/10000 [1:24:45<32:32, 1.87it/s, loss=0.0200, lr=9.77e-06, step=6345] Training: 63%|██████▎ | 6346/10000 [1:24:45<31:27, 1.94it/s, loss=0.0200, lr=9.77e-06, step=6345] Training: 63%|██████▎ | 6346/10000 [1:24:45<31:27, 1.94it/s, loss=0.0097, lr=9.76e-06, step=6346] Training: 63%|██████▎ | 6347/10000 [1:24:46<30:31, 1.99it/s, loss=0.0097, lr=9.76e-06, step=6346] Training: 63%|██████▎ | 6347/10000 [1:24:46<30:31, 1.99it/s, loss=0.0225, lr=9.76e-06, step=6347] Training: 63%|██████▎ | 6348/10000 [1:24:46<30:08, 2.02it/s, loss=0.0225, lr=9.76e-06, step=6347] Training: 63%|██████▎ | 6348/10000 [1:24:46<30:08, 2.02it/s, loss=0.0047, lr=9.76e-06, step=6348] Training: 63%|██████▎ | 6349/10000 [1:24:47<29:37, 2.05it/s, loss=0.0047, lr=9.76e-06, step=6348] Training: 63%|██████▎ | 6349/10000 [1:24:47<29:37, 2.05it/s, loss=0.0125, lr=9.75e-06, step=6349]20:09:19.546 [I] step=6350 loss=0.0119 smoothed_loss=0.0113 lr=9.77e-06 grad_norm=0.4018 step_time=0.4808s data_time=0.0576s it/s=1.858 eta_to_10000=1964.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0163 grad_action_out_proj_arms=0.1423 grad_arm_token_fuse=0.0898 grad_shared_expert=0.4514 (18633:train_pytorch.py:850) + Training: 64%|██████▎ | 6350/10000 [1:24:47<33:48, 1.80it/s, loss=0.0125, lr=9.75e-06, step=6349] Training: 64%|██████▎ | 6350/10000 [1:24:47<33:48, 1.80it/s, loss=0.0119, lr=9.75e-06, step=6350] Training: 64%|██████▎ | 6351/10000 [1:24:48<36:09, 1.68it/s, loss=0.0119, lr=9.75e-06, step=6350] Training: 64%|██████▎ | 6351/10000 [1:24:48<36:09, 1.68it/s, loss=0.0273, lr=9.75e-06, step=6351] Training: 64%|██████▎ | 6352/10000 [1:24:48<34:22, 1.77it/s, loss=0.0273, lr=9.75e-06, step=6351] Training: 64%|██████▎ | 6352/10000 [1:24:48<34:22, 1.77it/s, loss=0.0821, lr=9.74e-06, step=6352] Training: 64%|██████▎ | 6353/10000 [1:24:49<34:11, 1.78it/s, loss=0.0821, lr=9.74e-06, step=6352] Training: 64%|██████▎ | 6353/10000 [1:24:49<34:11, 1.78it/s, loss=0.0082, lr=9.74e-06, step=6353] Training: 64%|██████▎ | 6354/10000 [1:24:49<32:47, 1.85it/s, loss=0.0082, lr=9.74e-06, step=6353] Training: 64%|██████▎ | 6354/10000 [1:24:49<32:47, 1.85it/s, loss=0.0101, lr=9.74e-06, step=6354] Training: 64%|██████▎ | 6355/10000 [1:24:50<31:44, 1.91it/s, loss=0.0101, lr=9.74e-06, step=6354] Training: 64%|██████▎ | 6355/10000 [1:24:50<31:44, 1.91it/s, loss=0.0295, lr=9.73e-06, step=6355] Training: 64%|██████▎ | 6356/10000 [1:24:50<30:57, 1.96it/s, loss=0.0295, lr=9.73e-06, step=6355] Training: 64%|██████▎ | 6356/10000 [1:24:50<30:57, 1.96it/s, loss=0.0507, lr=9.73e-06, step=6356] Training: 64%|██████▎ | 6357/10000 [1:24:51<38:45, 1.57it/s, loss=0.0507, lr=9.73e-06, step=6356] Training: 64%|██████▎ | 6357/10000 [1:24:51<38:45, 1.57it/s, loss=0.0018, lr=9.73e-06, step=6357] Training: 64%|██████▎ | 6358/10000 [1:24:52<36:06, 1.68it/s, loss=0.0018, lr=9.73e-06, step=6357] Training: 64%|██████▎ | 6358/10000 [1:24:52<36:06, 1.68it/s, loss=0.0017, lr=9.72e-06, step=6358] Training: 64%|██████▎ | 6359/10000 [1:24:52<34:39, 1.75it/s, loss=0.0017, lr=9.72e-06, step=6358] Training: 64%|██████▎ | 6359/10000 [1:24:52<34:39, 1.75it/s, loss=0.0142, lr=9.72e-06, step=6359]20:09:25.195 [I] step=6360 loss=0.0021 smoothed_loss=0.0163 lr=9.73e-06 grad_norm=0.4763 step_time=0.4792s data_time=0.0856s it/s=1.771 eta_to_10000=2055.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0160 grad_action_out_proj_arms=0.1566 grad_arm_token_fuse=0.0788 grad_shared_expert=0.5127 (18633:train_pytorch.py:850) + Training: 64%|██████▎ | 6360/10000 [1:24:53<33:37, 1.80it/s, loss=0.0142, lr=9.72e-06, step=6359] Training: 64%|██████▎ | 6360/10000 [1:24:53<33:37, 1.80it/s, loss=0.0021, lr=9.72e-06, step=6360] Training: 64%|██████▎ | 6361/10000 [1:24:53<32:29, 1.87it/s, loss=0.0021, lr=9.72e-06, step=6360] Training: 64%|██████▎ | 6361/10000 [1:24:53<32:29, 1.87it/s, loss=0.0052, lr=9.71e-06, step=6361] Training: 64%|██████▎ | 6362/10000 [1:24:54<31:23, 1.93it/s, loss=0.0052, lr=9.71e-06, step=6361] Training: 64%|██████▎ | 6362/10000 [1:24:54<31:23, 1.93it/s, loss=0.0033, lr=9.71e-06, step=6362] Training: 64%|██████▎ | 6363/10000 [1:24:54<32:30, 1.86it/s, loss=0.0033, lr=9.71e-06, step=6362] Training: 64%|██████▎ | 6363/10000 [1:24:54<32:30, 1.86it/s, loss=0.0060, lr=9.71e-06, step=6363] Training: 64%|██████▎ | 6364/10000 [1:24:55<36:07, 1.68it/s, loss=0.0060, lr=9.71e-06, step=6363] Training: 64%|██████▎ | 6364/10000 [1:24:55<36:07, 1.68it/s, loss=0.0210, lr=9.70e-06, step=6364] Training: 64%|██████▎ | 6365/10000 [1:24:56<37:28, 1.62it/s, loss=0.0210, lr=9.70e-06, step=6364] Training: 64%|██████▎ | 6365/10000 [1:24:56<37:28, 1.62it/s, loss=0.0139, lr=9.70e-06, step=6365] Training: 64%|██████▎ | 6366/10000 [1:24:56<34:51, 1.74it/s, loss=0.0139, lr=9.70e-06, step=6365] Training: 64%|██████▎ | 6366/10000 [1:24:56<34:51, 1.74it/s, loss=0.0030, lr=9.70e-06, step=6366] Training: 64%|██████▎ | 6367/10000 [1:24:57<37:57, 1.60it/s, loss=0.0030, lr=9.70e-06, step=6366] Training: 64%|██████▎ | 6367/10000 [1:24:57<37:57, 1.60it/s, loss=0.0078, lr=9.69e-06, step=6367] Training: 64%|██████▎ | 6368/10000 [1:24:58<39:52, 1.52it/s, loss=0.0078, lr=9.69e-06, step=6367] Training: 64%|██████▎ | 6368/10000 [1:24:58<39:52, 1.52it/s, loss=0.0061, lr=9.69e-06, step=6368] Training: 64%|██████▎ | 6369/10000 [1:24:58<36:46, 1.65it/s, loss=0.0061, lr=9.69e-06, step=6368] Training: 64%|██████▎ | 6369/10000 [1:24:58<36:46, 1.65it/s, loss=0.0024, lr=9.68e-06, step=6369]20:09:31.114 [I] step=6370 loss=0.0111 smoothed_loss=0.0108 lr=9.70e-06 grad_norm=0.3767 step_time=0.4968s data_time=0.0951s it/s=1.690 eta_to_10000=2148.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0078 grad_action_out_proj_arms=0.0889 grad_arm_token_fuse=0.0368 grad_shared_expert=0.3595 (18633:train_pytorch.py:850) + Training: 64%|██████▎ | 6370/10000 [1:24:59<35:10, 1.72it/s, loss=0.0024, lr=9.68e-06, step=6369] Training: 64%|██████▎ | 6370/10000 [1:24:59<35:10, 1.72it/s, loss=0.0111, lr=9.68e-06, step=6370] Training: 64%|██████▎ | 6371/10000 [1:25:00<40:10, 1.51it/s, loss=0.0111, lr=9.68e-06, step=6370] Training: 64%|██████▎ | 6371/10000 [1:25:00<40:10, 1.51it/s, loss=0.0017, lr=9.68e-06, step=6371] Training: 64%|██████▎ | 6372/10000 [1:25:00<40:46, 1.48it/s, loss=0.0017, lr=9.68e-06, step=6371] Training: 64%|██████▎ | 6372/10000 [1:25:00<40:46, 1.48it/s, loss=0.0270, lr=9.67e-06, step=6372] Training: 64%|██████▎ | 6373/10000 [1:25:01<37:38, 1.61it/s, loss=0.0270, lr=9.67e-06, step=6372] Training: 64%|██████▎ | 6373/10000 [1:25:01<37:38, 1.61it/s, loss=0.0234, lr=9.67e-06, step=6373] Training: 64%|██████▎ | 6374/10000 [1:25:01<37:29, 1.61it/s, loss=0.0234, lr=9.67e-06, step=6373] Training: 64%|██████▎ | 6374/10000 [1:25:01<37:29, 1.61it/s, loss=0.0124, lr=9.67e-06, step=6374] Training: 64%|██████▍ | 6375/10000 [1:25:02<35:23, 1.71it/s, loss=0.0124, lr=9.67e-06, step=6374] Training: 64%|██████▍ | 6375/10000 [1:25:02<35:23, 1.71it/s, loss=0.0138, lr=9.66e-06, step=6375] Training: 64%|██████▍ | 6376/10000 [1:25:02<33:28, 1.80it/s, loss=0.0138, lr=9.66e-06, step=6375] Training: 64%|██████▍ | 6376/10000 [1:25:02<33:28, 1.80it/s, loss=0.0236, lr=9.66e-06, step=6376] Training: 64%|██████▍ | 6377/10000 [1:25:03<35:23, 1.71it/s, loss=0.0236, lr=9.66e-06, step=6376] Training: 64%|██████▍ | 6377/10000 [1:25:03<35:23, 1.71it/s, loss=0.0103, lr=9.66e-06, step=6377] Training: 64%|██████▍ | 6378/10000 [1:25:04<36:56, 1.63it/s, loss=0.0103, lr=9.66e-06, step=6377] Training: 64%|██████▍ | 6378/10000 [1:25:04<36:56, 1.63it/s, loss=0.0076, lr=9.65e-06, step=6378] Training: 64%|██████▍ | 6379/10000 [1:25:04<38:13, 1.58it/s, loss=0.0076, lr=9.65e-06, step=6378] Training: 64%|██████▍ | 6379/10000 [1:25:04<38:13, 1.58it/s, loss=0.0156, lr=9.65e-06, step=6379]20:09:37.311 [I] step=6380 loss=0.0047 smoothed_loss=0.0124 lr=9.66e-06 grad_norm=0.4795 step_time=0.5321s data_time=0.0876s it/s=1.614 eta_to_10000=2243.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0062 grad_action_out_proj_arms=0.0746 grad_arm_token_fuse=0.0321 grad_shared_expert=0.2276 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6380/10000 [1:25:05<36:12, 1.67it/s, loss=0.0156, lr=9.65e-06, step=6379] Training: 64%|██████▍ | 6380/10000 [1:25:05<36:12, 1.67it/s, loss=0.0047, lr=9.65e-06, step=6380] Training: 64%|██████▍ | 6381/10000 [1:25:05<33:48, 1.78it/s, loss=0.0047, lr=9.65e-06, step=6380] Training: 64%|██████▍ | 6381/10000 [1:25:05<33:48, 1.78it/s, loss=0.0043, lr=9.64e-06, step=6381] Training: 64%|██████▍ | 6382/10000 [1:25:06<32:20, 1.86it/s, loss=0.0043, lr=9.64e-06, step=6381] Training: 64%|██████▍ | 6382/10000 [1:25:06<32:20, 1.86it/s, loss=0.0020, lr=9.64e-06, step=6382] Training: 64%|██████▍ | 6383/10000 [1:25:06<32:07, 1.88it/s, loss=0.0020, lr=9.64e-06, step=6382] Training: 64%|██████▍ | 6383/10000 [1:25:06<32:07, 1.88it/s, loss=0.0374, lr=9.64e-06, step=6383] Training: 64%|██████▍ | 6384/10000 [1:25:07<31:31, 1.91it/s, loss=0.0374, lr=9.64e-06, step=6383] Training: 64%|██████▍ | 6384/10000 [1:25:07<31:31, 1.91it/s, loss=0.0075, lr=9.63e-06, step=6384] Training: 64%|██████▍ | 6385/10000 [1:25:08<34:00, 1.77it/s, loss=0.0075, lr=9.63e-06, step=6384] Training: 64%|██████▍ | 6385/10000 [1:25:08<34:00, 1.77it/s, loss=0.0063, lr=9.63e-06, step=6385] Training: 64%|██████▍ | 6386/10000 [1:25:08<36:21, 1.66it/s, loss=0.0063, lr=9.63e-06, step=6385] Training: 64%|██████▍ | 6386/10000 [1:25:08<36:21, 1.66it/s, loss=0.0073, lr=9.63e-06, step=6386] Training: 64%|██████▍ | 6387/10000 [1:25:09<34:33, 1.74it/s, loss=0.0073, lr=9.63e-06, step=6386] Training: 64%|██████▍ | 6387/10000 [1:25:09<34:33, 1.74it/s, loss=0.0019, lr=9.62e-06, step=6387] Training: 64%|██████▍ | 6388/10000 [1:25:09<32:54, 1.83it/s, loss=0.0019, lr=9.62e-06, step=6387] Training: 64%|██████▍ | 6388/10000 [1:25:09<32:54, 1.83it/s, loss=0.0081, lr=9.62e-06, step=6388] Training: 64%|██████▍ | 6389/10000 [1:25:10<33:18, 1.81it/s, loss=0.0081, lr=9.62e-06, step=6388] Training: 64%|██████▍ | 6389/10000 [1:25:10<33:18, 1.81it/s, loss=0.0276, lr=9.62e-06, step=6389]20:09:42.762 [I] step=6390 loss=0.0268 smoothed_loss=0.0136 lr=9.63e-06 grad_norm=0.4383 step_time=0.4745s data_time=0.0706s it/s=1.835 eta_to_10000=1967.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0143 grad_action_out_proj_arms=0.1155 grad_arm_token_fuse=0.0662 grad_shared_expert=0.5400 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6390/10000 [1:25:10<33:30, 1.80it/s, loss=0.0276, lr=9.62e-06, step=6389] Training: 64%|██████▍ | 6390/10000 [1:25:10<33:30, 1.80it/s, loss=0.0268, lr=9.61e-06, step=6390] Training: 64%|██████▍ | 6391/10000 [1:25:11<32:18, 1.86it/s, loss=0.0268, lr=9.61e-06, step=6390] Training: 64%|██████▍ | 6391/10000 [1:25:11<32:18, 1.86it/s, loss=0.0012, lr=9.61e-06, step=6391] Training: 64%|██████▍ | 6392/10000 [1:25:12<34:14, 1.76it/s, loss=0.0012, lr=9.61e-06, step=6391] Training: 64%|██████▍ | 6392/10000 [1:25:12<34:14, 1.76it/s, loss=0.0086, lr=9.61e-06, step=6392] Training: 64%|██████▍ | 6393/10000 [1:25:12<36:55, 1.63it/s, loss=0.0086, lr=9.61e-06, step=6392] Training: 64%|██████▍ | 6393/10000 [1:25:12<36:55, 1.63it/s, loss=0.0022, lr=9.60e-06, step=6393] Training: 64%|██████▍ | 6394/10000 [1:25:13<34:50, 1.73it/s, loss=0.0022, lr=9.60e-06, step=6393] Training: 64%|██████▍ | 6394/10000 [1:25:13<34:50, 1.73it/s, loss=0.0133, lr=9.60e-06, step=6394] Training: 64%|██████▍ | 6395/10000 [1:25:13<33:20, 1.80it/s, loss=0.0133, lr=9.60e-06, step=6394] Training: 64%|██████▍ | 6395/10000 [1:25:13<33:20, 1.80it/s, loss=0.0046, lr=9.59e-06, step=6395] Training: 64%|██████▍ | 6396/10000 [1:25:14<32:02, 1.87it/s, loss=0.0046, lr=9.59e-06, step=6395] Training: 64%|██████▍ | 6396/10000 [1:25:14<32:02, 1.87it/s, loss=0.0079, lr=9.59e-06, step=6396] Training: 64%|██████▍ | 6397/10000 [1:25:14<32:04, 1.87it/s, loss=0.0079, lr=9.59e-06, step=6396] Training: 64%|██████▍ | 6397/10000 [1:25:14<32:04, 1.87it/s, loss=0.0109, lr=9.59e-06, step=6397] Training: 64%|██████▍ | 6398/10000 [1:25:15<31:14, 1.92it/s, loss=0.0109, lr=9.59e-06, step=6397] Training: 64%|██████▍ | 6398/10000 [1:25:15<31:14, 1.92it/s, loss=0.0100, lr=9.58e-06, step=6398] Training: 64%|██████▍ | 6399/10000 [1:25:15<33:47, 1.78it/s, loss=0.0100, lr=9.58e-06, step=6398] Training: 64%|██████▍ | 6399/10000 [1:25:15<33:47, 1.78it/s, loss=0.0475, lr=9.58e-06, step=6399]20:09:48.516 [I] step=6400 loss=0.0170 smoothed_loss=0.0143 lr=9.59e-06 grad_norm=0.4978 step_time=0.5097s data_time=0.0657s it/s=1.738 eta_to_10000=2071.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0206 grad_action_out_proj_arms=0.1446 grad_arm_token_fuse=0.1067 grad_shared_expert=0.4757 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6400/10000 [1:25:16<36:50, 1.63it/s, loss=0.0475, lr=9.58e-06, step=6399] Training: 64%|██████▍ | 6400/10000 [1:25:16<36:50, 1.63it/s, loss=0.0170, lr=9.58e-06, step=6400] Training: 64%|██████▍ | 6401/10000 [1:25:17<38:37, 1.55it/s, loss=0.0170, lr=9.58e-06, step=6400] Training: 64%|██████▍ | 6401/10000 [1:25:17<38:37, 1.55it/s, loss=0.0049, lr=9.57e-06, step=6401] Training: 64%|██████▍ | 6402/10000 [1:25:17<35:59, 1.67it/s, loss=0.0049, lr=9.57e-06, step=6401] Training: 64%|██████▍ | 6402/10000 [1:25:17<35:59, 1.67it/s, loss=0.0130, lr=9.57e-06, step=6402] Training: 64%|██████▍ | 6403/10000 [1:25:18<33:51, 1.77it/s, loss=0.0130, lr=9.57e-06, step=6402] Training: 64%|██████▍ | 6403/10000 [1:25:18<33:51, 1.77it/s, loss=0.0095, lr=9.57e-06, step=6403] Training: 64%|██████▍ | 6404/10000 [1:25:18<32:41, 1.83it/s, loss=0.0095, lr=9.57e-06, step=6403] Training: 64%|██████▍ | 6404/10000 [1:25:18<32:41, 1.83it/s, loss=0.0055, lr=9.56e-06, step=6404] Training: 64%|██████▍ | 6405/10000 [1:25:19<31:16, 1.92it/s, loss=0.0055, lr=9.56e-06, step=6404] Training: 64%|██████▍ | 6405/10000 [1:25:19<31:16, 1.92it/s, loss=0.0049, lr=9.56e-06, step=6405] Training: 64%|██████▍ | 6406/10000 [1:25:19<33:17, 1.80it/s, loss=0.0049, lr=9.56e-06, step=6405] Training: 64%|██████▍ | 6406/10000 [1:25:19<33:17, 1.80it/s, loss=0.0115, lr=9.56e-06, step=6406] Training: 64%|██████▍ | 6407/10000 [1:25:20<35:17, 1.70it/s, loss=0.0115, lr=9.56e-06, step=6406] Training: 64%|██████▍ | 6407/10000 [1:25:20<35:17, 1.70it/s, loss=0.0199, lr=9.55e-06, step=6407] Training: 64%|██████▍ | 6408/10000 [1:25:21<33:25, 1.79it/s, loss=0.0199, lr=9.55e-06, step=6407] Training: 64%|██████▍ | 6408/10000 [1:25:21<33:25, 1.79it/s, loss=0.0095, lr=9.55e-06, step=6408] Training: 64%|██████▍ | 6409/10000 [1:25:21<33:37, 1.78it/s, loss=0.0095, lr=9.55e-06, step=6408] Training: 64%|██████▍ | 6409/10000 [1:25:21<33:37, 1.78it/s, loss=0.0055, lr=9.55e-06, step=6409]20:09:54.133 [I] step=6410 loss=0.0036 smoothed_loss=0.0106 lr=9.56e-06 grad_norm=0.3714 step_time=0.4813s data_time=0.0805s it/s=1.780 eta_to_10000=2016.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0076 grad_action_out_proj_arms=0.0889 grad_arm_token_fuse=0.0371 grad_shared_expert=0.3190 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6410/10000 [1:25:22<34:15, 1.75it/s, loss=0.0055, lr=9.55e-06, step=6409] Training: 64%|██████▍ | 6410/10000 [1:25:22<34:15, 1.75it/s, loss=0.0036, lr=9.54e-06, step=6410] Training: 64%|██████▍ | 6411/10000 [1:25:22<33:27, 1.79it/s, loss=0.0036, lr=9.54e-06, step=6410] Training: 64%|██████▍ | 6411/10000 [1:25:22<33:27, 1.79it/s, loss=0.0080, lr=9.54e-06, step=6411] Training: 64%|██████▍ | 6412/10000 [1:25:23<32:58, 1.81it/s, loss=0.0080, lr=9.54e-06, step=6411] Training: 64%|██████▍ | 6412/10000 [1:25:23<32:58, 1.81it/s, loss=0.0108, lr=9.54e-06, step=6412] Training: 64%|██████▍ | 6413/10000 [1:25:24<34:56, 1.71it/s, loss=0.0108, lr=9.54e-06, step=6412] Training: 64%|██████▍ | 6413/10000 [1:25:24<34:56, 1.71it/s, loss=0.0074, lr=9.53e-06, step=6413] Training: 64%|██████▍ | 6414/10000 [1:25:24<36:33, 1.64it/s, loss=0.0074, lr=9.53e-06, step=6413] Training: 64%|██████▍ | 6414/10000 [1:25:24<36:33, 1.64it/s, loss=0.0124, lr=9.53e-06, step=6414] Training: 64%|██████▍ | 6415/10000 [1:25:25<37:37, 1.59it/s, loss=0.0124, lr=9.53e-06, step=6414] Training: 64%|██████▍ | 6415/10000 [1:25:25<37:37, 1.59it/s, loss=0.0077, lr=9.53e-06, step=6415] Training: 64%|██████▍ | 6416/10000 [1:25:25<34:53, 1.71it/s, loss=0.0077, lr=9.53e-06, step=6415] Training: 64%|██████▍ | 6416/10000 [1:25:25<34:53, 1.71it/s, loss=0.0116, lr=9.52e-06, step=6416] Training: 64%|██████▍ | 6417/10000 [1:25:26<34:09, 1.75it/s, loss=0.0116, lr=9.52e-06, step=6416] Training: 64%|██████▍ | 6417/10000 [1:25:26<34:09, 1.75it/s, loss=0.0038, lr=9.52e-06, step=6417] Training: 64%|██████▍ | 6418/10000 [1:25:26<32:51, 1.82it/s, loss=0.0038, lr=9.52e-06, step=6417] Training: 64%|██████▍ | 6418/10000 [1:25:26<32:51, 1.82it/s, loss=0.0133, lr=9.52e-06, step=6418] Training: 64%|██████▍ | 6419/10000 [1:25:27<31:50, 1.87it/s, loss=0.0133, lr=9.52e-06, step=6418] Training: 64%|██████▍ | 6419/10000 [1:25:27<31:50, 1.87it/s, loss=0.0082, lr=9.51e-06, step=6419]20:10:00.014 [I] step=6420 loss=0.0079 smoothed_loss=0.0096 lr=9.52e-06 grad_norm=0.4137 step_time=0.5167s data_time=0.0713s it/s=1.701 eta_to_10000=2104.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.0754 grad_arm_token_fuse=0.0438 grad_shared_expert=0.3232 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6420/10000 [1:25:28<36:31, 1.63it/s, loss=0.0082, lr=9.51e-06, step=6419] Training: 64%|██████▍ | 6420/10000 [1:25:28<36:31, 1.63it/s, loss=0.0079, lr=9.51e-06, step=6420] Training: 64%|██████▍ | 6421/10000 [1:25:28<38:19, 1.56it/s, loss=0.0079, lr=9.51e-06, step=6420] Training: 64%|██████▍ | 6421/10000 [1:25:28<38:19, 1.56it/s, loss=0.0059, lr=9.51e-06, step=6421] Training: 64%|██████▍ | 6422/10000 [1:25:29<41:15, 1.45it/s, loss=0.0059, lr=9.51e-06, step=6421] Training: 64%|██████▍ | 6422/10000 [1:25:29<41:15, 1.45it/s, loss=0.0109, lr=9.50e-06, step=6422] Training: 64%|██████▍ | 6423/10000 [1:25:30<37:23, 1.59it/s, loss=0.0109, lr=9.50e-06, step=6422] Training: 64%|██████▍ | 6423/10000 [1:25:30<37:23, 1.59it/s, loss=0.0277, lr=9.50e-06, step=6423] Training: 64%|██████▍ | 6424/10000 [1:25:30<36:19, 1.64it/s, loss=0.0277, lr=9.50e-06, step=6423] Training: 64%|██████▍ | 6424/10000 [1:25:30<36:19, 1.64it/s, loss=0.0039, lr=9.49e-06, step=6424] Training: 64%|██████▍ | 6425/10000 [1:25:31<34:39, 1.72it/s, loss=0.0039, lr=9.49e-06, step=6424] Training: 64%|██████▍ | 6425/10000 [1:25:31<34:39, 1.72it/s, loss=0.0143, lr=9.49e-06, step=6425] Training: 64%|██████▍ | 6426/10000 [1:25:31<33:06, 1.80it/s, loss=0.0143, lr=9.49e-06, step=6425] Training: 64%|██████▍ | 6426/10000 [1:25:31<33:06, 1.80it/s, loss=0.0043, lr=9.49e-06, step=6426] Training: 64%|██████▍ | 6427/10000 [1:25:32<34:44, 1.71it/s, loss=0.0043, lr=9.49e-06, step=6426] Training: 64%|██████▍ | 6427/10000 [1:25:32<34:44, 1.71it/s, loss=0.0066, lr=9.48e-06, step=6427] Training: 64%|██████▍ | 6428/10000 [1:25:33<35:44, 1.67it/s, loss=0.0066, lr=9.48e-06, step=6427] Training: 64%|██████▍ | 6428/10000 [1:25:33<35:44, 1.67it/s, loss=0.0263, lr=9.48e-06, step=6428] Training: 64%|██████▍ | 6429/10000 [1:25:33<37:16, 1.60it/s, loss=0.0263, lr=9.48e-06, step=6428] Training: 64%|██████▍ | 6429/10000 [1:25:33<37:16, 1.60it/s, loss=0.0250, lr=9.48e-06, step=6429]20:10:06.106 [I] step=6430 loss=0.0041 smoothed_loss=0.0120 lr=9.49e-06 grad_norm=0.4617 step_time=0.5332s data_time=0.0760s it/s=1.642 eta_to_10000=2174.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0214 grad_action_out_proj_arms=0.1324 grad_arm_token_fuse=0.1118 grad_shared_expert=0.4792 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6430/10000 [1:25:34<35:43, 1.67it/s, loss=0.0250, lr=9.48e-06, step=6429] Training: 64%|██████▍ | 6430/10000 [1:25:34<35:43, 1.67it/s, loss=0.0041, lr=9.47e-06, step=6430] Training: 64%|██████▍ | 6431/10000 [1:25:34<34:04, 1.75it/s, loss=0.0041, lr=9.47e-06, step=6430] Training: 64%|██████▍ | 6431/10000 [1:25:34<34:04, 1.75it/s, loss=0.0027, lr=9.47e-06, step=6431] Training: 64%|██████▍ | 6432/10000 [1:25:35<32:47, 1.81it/s, loss=0.0027, lr=9.47e-06, step=6431] Training: 64%|██████▍ | 6432/10000 [1:25:35<32:47, 1.81it/s, loss=0.0102, lr=9.47e-06, step=6432] Training: 64%|██████▍ | 6433/10000 [1:25:35<31:48, 1.87it/s, loss=0.0102, lr=9.47e-06, step=6432] Training: 64%|██████▍ | 6433/10000 [1:25:35<31:48, 1.87it/s, loss=0.0034, lr=9.46e-06, step=6433] Training: 64%|██████▍ | 6434/10000 [1:25:36<33:55, 1.75it/s, loss=0.0034, lr=9.46e-06, step=6433] Training: 64%|██████▍ | 6434/10000 [1:25:36<33:55, 1.75it/s, loss=0.0025, lr=9.46e-06, step=6434] Training: 64%|██████▍ | 6435/10000 [1:25:36<32:18, 1.84it/s, loss=0.0025, lr=9.46e-06, step=6434] Training: 64%|██████▍ | 6435/10000 [1:25:36<32:18, 1.84it/s, loss=0.0036, lr=9.46e-06, step=6435] Training: 64%|██████▍ | 6436/10000 [1:25:37<34:52, 1.70it/s, loss=0.0036, lr=9.46e-06, step=6435] Training: 64%|██████▍ | 6436/10000 [1:25:37<34:52, 1.70it/s, loss=0.0079, lr=9.45e-06, step=6436] Training: 64%|██████▍ | 6437/10000 [1:25:38<33:16, 1.78it/s, loss=0.0079, lr=9.45e-06, step=6436] Training: 64%|██████▍ | 6437/10000 [1:25:38<33:16, 1.78it/s, loss=0.0242, lr=9.45e-06, step=6437] Training: 64%|██████▍ | 6438/10000 [1:25:38<32:15, 1.84it/s, loss=0.0242, lr=9.45e-06, step=6437] Training: 64%|██████▍ | 6438/10000 [1:25:38<32:15, 1.84it/s, loss=0.0049, lr=9.45e-06, step=6438] Training: 64%|██████▍ | 6439/10000 [1:25:39<31:42, 1.87it/s, loss=0.0049, lr=9.45e-06, step=6438] Training: 64%|██████▍ | 6439/10000 [1:25:39<31:42, 1.87it/s, loss=0.0128, lr=9.44e-06, step=6439]20:10:11.477 [I] step=6440 loss=0.0100 smoothed_loss=0.0100 lr=9.46e-06 grad_norm=0.4199 step_time=0.4718s data_time=0.0654s it/s=1.862 eta_to_10000=1911.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0653 grad_arm_token_fuse=0.0266 grad_shared_expert=0.2180 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6440/10000 [1:25:39<31:34, 1.88it/s, loss=0.0128, lr=9.44e-06, step=6439] Training: 64%|██████▍ | 6440/10000 [1:25:39<31:34, 1.88it/s, loss=0.0100, lr=9.44e-06, step=6440] Training: 64%|██████▍ | 6441/10000 [1:25:40<33:48, 1.75it/s, loss=0.0100, lr=9.44e-06, step=6440] Training: 64%|██████▍ | 6441/10000 [1:25:40<33:48, 1.75it/s, loss=0.0277, lr=9.44e-06, step=6441] Training: 64%|██████▍ | 6442/10000 [1:25:40<33:53, 1.75it/s, loss=0.0277, lr=9.44e-06, step=6441] Training: 64%|██████▍ | 6442/10000 [1:25:40<33:53, 1.75it/s, loss=0.0016, lr=9.43e-06, step=6442] Training: 64%|██████▍ | 6443/10000 [1:25:41<36:22, 1.63it/s, loss=0.0016, lr=9.43e-06, step=6442] Training: 64%|██████▍ | 6443/10000 [1:25:41<36:22, 1.63it/s, loss=0.0052, lr=9.43e-06, step=6443] Training: 64%|██████▍ | 6444/10000 [1:25:42<34:07, 1.74it/s, loss=0.0052, lr=9.43e-06, step=6443] Training: 64%|██████▍ | 6444/10000 [1:25:42<34:07, 1.74it/s, loss=0.0096, lr=9.43e-06, step=6444] Training: 64%|██████▍ | 6445/10000 [1:25:42<32:42, 1.81it/s, loss=0.0096, lr=9.43e-06, step=6444] Training: 64%|██████▍ | 6445/10000 [1:25:42<32:42, 1.81it/s, loss=0.0099, lr=9.42e-06, step=6445] Training: 64%|██████▍ | 6446/10000 [1:25:43<31:42, 1.87it/s, loss=0.0099, lr=9.42e-06, step=6445] Training: 64%|██████▍ | 6446/10000 [1:25:43<31:42, 1.87it/s, loss=0.0242, lr=9.42e-06, step=6446] Training: 64%|██████▍ | 6447/10000 [1:25:43<31:29, 1.88it/s, loss=0.0242, lr=9.42e-06, step=6446] Training: 64%|██████▍ | 6447/10000 [1:25:43<31:29, 1.88it/s, loss=0.0143, lr=9.42e-06, step=6447] Training: 64%|██████▍ | 6448/10000 [1:25:44<36:41, 1.61it/s, loss=0.0143, lr=9.42e-06, step=6447] Training: 64%|██████▍ | 6448/10000 [1:25:44<36:41, 1.61it/s, loss=0.0127, lr=9.41e-06, step=6448] Training: 64%|██████▍ | 6449/10000 [1:25:45<37:29, 1.58it/s, loss=0.0127, lr=9.41e-06, step=6448] Training: 64%|██████▍ | 6449/10000 [1:25:45<37:29, 1.58it/s, loss=0.0145, lr=9.41e-06, step=6449]20:10:17.654 [I] step=6450 loss=0.0131 smoothed_loss=0.0123 lr=9.42e-06 grad_norm=0.4149 step_time=0.5321s data_time=0.0855s it/s=1.619 eta_to_10000=2192.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0058 grad_action_out_proj_arms=0.0642 grad_arm_token_fuse=0.0313 grad_shared_expert=0.2583 (18633:train_pytorch.py:850) + Training: 64%|██████▍ | 6450/10000 [1:25:45<39:18, 1.51it/s, loss=0.0145, lr=9.41e-06, step=6449] Training: 64%|██████▍ | 6450/10000 [1:25:45<39:18, 1.51it/s, loss=0.0131, lr=9.41e-06, step=6450] Training: 65%|██████▍ | 6451/10000 [1:25:46<36:12, 1.63it/s, loss=0.0131, lr=9.41e-06, step=6450] Training: 65%|██████▍ | 6451/10000 [1:25:46<36:12, 1.63it/s, loss=0.0063, lr=9.40e-06, step=6451] Training: 65%|██████▍ | 6452/10000 [1:25:46<33:58, 1.74it/s, loss=0.0063, lr=9.40e-06, step=6451] Training: 65%|██████▍ | 6452/10000 [1:25:46<33:58, 1.74it/s, loss=0.0212, lr=9.40e-06, step=6452] Training: 65%|██████▍ | 6453/10000 [1:25:47<32:05, 1.84it/s, loss=0.0212, lr=9.40e-06, step=6452] Training: 65%|██████▍ | 6453/10000 [1:25:47<32:05, 1.84it/s, loss=0.0056, lr=9.40e-06, step=6453] Training: 65%|██████▍ | 6454/10000 [1:25:47<30:47, 1.92it/s, loss=0.0056, lr=9.40e-06, step=6453] Training: 65%|██████▍ | 6454/10000 [1:25:47<30:47, 1.92it/s, loss=0.0388, lr=9.39e-06, step=6454] Training: 65%|██████▍ | 6455/10000 [1:25:48<32:52, 1.80it/s, loss=0.0388, lr=9.39e-06, step=6454] Training: 65%|██████▍ | 6455/10000 [1:25:48<32:52, 1.80it/s, loss=0.0042, lr=9.39e-06, step=6455] Training: 65%|██████▍ | 6456/10000 [1:25:49<34:54, 1.69it/s, loss=0.0042, lr=9.39e-06, step=6455] Training: 65%|██████▍ | 6456/10000 [1:25:49<34:54, 1.69it/s, loss=0.0107, lr=9.38e-06, step=6456] Training: 65%|██████▍ | 6457/10000 [1:25:49<36:35, 1.61it/s, loss=0.0107, lr=9.38e-06, step=6456] Training: 65%|██████▍ | 6457/10000 [1:25:49<36:35, 1.61it/s, loss=0.0091, lr=9.38e-06, step=6457] Training: 65%|██████▍ | 6458/10000 [1:25:50<41:25, 1.43it/s, loss=0.0091, lr=9.38e-06, step=6457] Training: 65%|██████▍ | 6458/10000 [1:25:50<41:25, 1.43it/s, loss=0.0029, lr=9.38e-06, step=6458] Training: 65%|██████▍ | 6459/10000 [1:25:51<41:29, 1.42it/s, loss=0.0029, lr=9.38e-06, step=6458] Training: 65%|██████▍ | 6459/10000 [1:25:51<41:29, 1.42it/s, loss=0.0035, lr=9.37e-06, step=6459]20:10:23.691 [I] step=6460 loss=0.0311 smoothed_loss=0.0130 lr=9.39e-06 grad_norm=0.4941 step_time=0.5171s data_time=0.0866s it/s=1.657 eta_to_10000=2136.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0100 grad_action_out_proj_arms=0.1230 grad_arm_token_fuse=0.0493 grad_shared_expert=0.4672 (18633:train_pytorch.py:850) + Training: 65%|██████▍ | 6460/10000 [1:25:51<38:18, 1.54it/s, loss=0.0035, lr=9.37e-06, step=6459] Training: 65%|██████▍ | 6460/10000 [1:25:51<38:18, 1.54it/s, loss=0.0311, lr=9.37e-06, step=6460] Training: 65%|██████▍ | 6461/10000 [1:25:52<39:14, 1.50it/s, loss=0.0311, lr=9.37e-06, step=6460] Training: 65%|██████▍ | 6461/10000 [1:25:52<39:14, 1.50it/s, loss=0.0033, lr=9.37e-06, step=6461] Training: 65%|██████▍ | 6462/10000 [1:25:53<40:01, 1.47it/s, loss=0.0033, lr=9.37e-06, step=6461] Training: 65%|██████▍ | 6462/10000 [1:25:53<40:01, 1.47it/s, loss=0.0451, lr=9.36e-06, step=6462] Training: 65%|██████▍ | 6463/10000 [1:25:54<42:11, 1.40it/s, loss=0.0451, lr=9.36e-06, step=6462] Training: 65%|██████▍ | 6463/10000 [1:25:54<42:11, 1.40it/s, loss=0.0033, lr=9.36e-06, step=6463] Training: 65%|██████▍ | 6464/10000 [1:25:54<43:10, 1.36it/s, loss=0.0033, lr=9.36e-06, step=6463] Training: 65%|██████▍ | 6464/10000 [1:25:54<43:10, 1.36it/s, loss=0.0048, lr=9.36e-06, step=6464] Training: 65%|██████▍ | 6465/10000 [1:25:55<47:48, 1.23it/s, loss=0.0048, lr=9.36e-06, step=6464] Training: 65%|██████▍ | 6465/10000 [1:25:55<47:48, 1.23it/s, loss=0.0523, lr=9.35e-06, step=6465] Training: 65%|██████▍ | 6466/10000 [1:25:56<42:15, 1.39it/s, loss=0.0523, lr=9.35e-06, step=6465] Training: 65%|██████▍ | 6466/10000 [1:25:56<42:15, 1.39it/s, loss=0.0054, lr=9.35e-06, step=6466] Training: 65%|██████▍ | 6467/10000 [1:25:56<38:11, 1.54it/s, loss=0.0054, lr=9.35e-06, step=6466] Training: 65%|██████▍ | 6467/10000 [1:25:56<38:11, 1.54it/s, loss=0.0059, lr=9.35e-06, step=6467] Training: 65%|██████▍ | 6468/10000 [1:25:57<38:44, 1.52it/s, loss=0.0059, lr=9.35e-06, step=6467] Training: 65%|██████▍ | 6468/10000 [1:25:57<38:44, 1.52it/s, loss=0.0040, lr=9.34e-06, step=6468] Training: 65%|██████▍ | 6469/10000 [1:25:58<37:25, 1.57it/s, loss=0.0040, lr=9.34e-06, step=6468] Training: 65%|██████▍ | 6469/10000 [1:25:58<37:25, 1.57it/s, loss=0.0039, lr=9.34e-06, step=6469]20:10:30.753 [I] step=6470 loss=0.0024 smoothed_loss=0.0118 lr=9.35e-06 grad_norm=0.4423 step_time=0.5723s data_time=0.1340s it/s=1.416 eta_to_10000=2492.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0091 grad_action_out_proj_arms=0.0911 grad_arm_token_fuse=0.0433 grad_shared_expert=0.3209 (18633:train_pytorch.py:850) + Training: 65%|██████▍ | 6470/10000 [1:25:58<40:50, 1.44it/s, loss=0.0039, lr=9.34e-06, step=6469] Training: 65%|██████▍ | 6470/10000 [1:25:58<40:50, 1.44it/s, loss=0.0024, lr=9.34e-06, step=6470] Training: 65%|██████▍ | 6471/10000 [1:25:59<39:39, 1.48it/s, loss=0.0024, lr=9.34e-06, step=6470] Training: 65%|██████▍ | 6471/10000 [1:25:59<39:39, 1.48it/s, loss=0.0028, lr=9.33e-06, step=6471] Training: 65%|██████▍ | 6472/10000 [1:26:00<39:30, 1.49it/s, loss=0.0028, lr=9.33e-06, step=6471] Training: 65%|██████▍ | 6472/10000 [1:26:00<39:30, 1.49it/s, loss=0.0050, lr=9.33e-06, step=6472] Training: 65%|██████▍ | 6473/10000 [1:26:00<36:24, 1.61it/s, loss=0.0050, lr=9.33e-06, step=6472] Training: 65%|██████▍ | 6473/10000 [1:26:00<36:24, 1.61it/s, loss=0.0031, lr=9.33e-06, step=6473] Training: 65%|██████▍ | 6474/10000 [1:26:01<34:22, 1.71it/s, loss=0.0031, lr=9.33e-06, step=6473] Training: 65%|██████▍ | 6474/10000 [1:26:01<34:22, 1.71it/s, loss=0.0023, lr=9.32e-06, step=6474] Training: 65%|██████▍ | 6475/10000 [1:26:01<32:44, 1.79it/s, loss=0.0023, lr=9.32e-06, step=6474] Training: 65%|██████▍ | 6475/10000 [1:26:01<32:44, 1.79it/s, loss=0.0031, lr=9.32e-06, step=6475] Training: 65%|██████▍ | 6476/10000 [1:26:02<31:44, 1.85it/s, loss=0.0031, lr=9.32e-06, step=6475] Training: 65%|██████▍ | 6476/10000 [1:26:02<31:44, 1.85it/s, loss=0.0114, lr=9.32e-06, step=6476] Training: 65%|██████▍ | 6477/10000 [1:26:02<36:02, 1.63it/s, loss=0.0114, lr=9.32e-06, step=6476] Training: 65%|██████▍ | 6477/10000 [1:26:02<36:02, 1.63it/s, loss=0.0150, lr=9.31e-06, step=6477] Training: 65%|██████▍ | 6478/10000 [1:26:03<39:44, 1.48it/s, loss=0.0150, lr=9.31e-06, step=6477] Training: 65%|██████▍ | 6478/10000 [1:26:03<39:44, 1.48it/s, loss=0.0189, lr=9.31e-06, step=6478] Training: 65%|██████▍ | 6479/10000 [1:26:04<39:25, 1.49it/s, loss=0.0189, lr=9.31e-06, step=6478] Training: 65%|██████▍ | 6479/10000 [1:26:04<39:25, 1.49it/s, loss=0.0064, lr=9.31e-06, step=6479]20:10:36.818 [I] step=6480 loss=0.0047 smoothed_loss=0.0093 lr=9.32e-06 grad_norm=0.4464 step_time=0.5190s data_time=0.0875s it/s=1.649 eta_to_10000=2134.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0040 grad_action_out_proj_arms=0.0549 grad_arm_token_fuse=0.0201 grad_shared_expert=0.1935 (18633:train_pytorch.py:850) + Training: 65%|██████▍ | 6480/10000 [1:26:04<36:29, 1.61it/s, loss=0.0064, lr=9.31e-06, step=6479] Training: 65%|██████▍ | 6480/10000 [1:26:04<36:29, 1.61it/s, loss=0.0047, lr=9.30e-06, step=6480] Training: 65%|██████▍ | 6481/10000 [1:26:05<34:04, 1.72it/s, loss=0.0047, lr=9.30e-06, step=6480] Training: 65%|██████▍ | 6481/10000 [1:26:05<34:04, 1.72it/s, loss=0.0057, lr=9.30e-06, step=6481] Training: 65%|██████▍ | 6482/10000 [1:26:05<32:29, 1.80it/s, loss=0.0057, lr=9.30e-06, step=6481] Training: 65%|██████▍ | 6482/10000 [1:26:05<32:29, 1.80it/s, loss=0.0089, lr=9.30e-06, step=6482] Training: 65%|██████▍ | 6483/10000 [1:26:06<31:35, 1.86it/s, loss=0.0089, lr=9.30e-06, step=6482] Training: 65%|██████▍ | 6483/10000 [1:26:06<31:35, 1.86it/s, loss=0.0053, lr=9.29e-06, step=6483] Training: 65%|██████▍ | 6484/10000 [1:26:07<33:46, 1.74it/s, loss=0.0053, lr=9.29e-06, step=6483] Training: 65%|██████▍ | 6484/10000 [1:26:07<33:46, 1.74it/s, loss=0.0043, lr=9.29e-06, step=6484] Training: 65%|██████▍ | 6485/10000 [1:26:07<34:56, 1.68it/s, loss=0.0043, lr=9.29e-06, step=6484] Training: 65%|██████▍ | 6485/10000 [1:26:07<34:56, 1.68it/s, loss=0.0105, lr=9.29e-06, step=6485] Training: 65%|██████▍ | 6486/10000 [1:26:08<36:28, 1.61it/s, loss=0.0105, lr=9.29e-06, step=6485] Training: 65%|██████▍ | 6486/10000 [1:26:08<36:28, 1.61it/s, loss=0.0135, lr=9.28e-06, step=6486] Training: 65%|██████▍ | 6487/10000 [1:26:09<36:06, 1.62it/s, loss=0.0135, lr=9.28e-06, step=6486] Training: 65%|██████▍ | 6487/10000 [1:26:09<36:06, 1.62it/s, loss=0.0378, lr=9.28e-06, step=6487] Training: 65%|██████▍ | 6488/10000 [1:26:09<33:44, 1.73it/s, loss=0.0378, lr=9.28e-06, step=6487] Training: 65%|██████▍ | 6488/10000 [1:26:09<33:44, 1.73it/s, loss=0.0038, lr=9.28e-06, step=6488] Training: 65%|██████▍ | 6489/10000 [1:26:10<36:24, 1.61it/s, loss=0.0038, lr=9.28e-06, step=6488] Training: 65%|██████▍ | 6489/10000 [1:26:10<36:24, 1.61it/s, loss=0.0112, lr=9.27e-06, step=6489]20:10:42.661 [I] step=6490 loss=0.0037 smoothed_loss=0.0103 lr=9.28e-06 grad_norm=0.4053 step_time=0.4879s data_time=0.0964s it/s=1.712 eta_to_10000=2050.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0035 grad_action_out_proj_arms=0.0491 grad_arm_token_fuse=0.0177 grad_shared_expert=0.2140 (18633:train_pytorch.py:850) + Training: 65%|██████▍ | 6490/10000 [1:26:10<35:16, 1.66it/s, loss=0.0112, lr=9.27e-06, step=6489] Training: 65%|██████▍ | 6490/10000 [1:26:10<35:16, 1.66it/s, loss=0.0037, lr=9.27e-06, step=6490] Training: 65%|██████▍ | 6491/10000 [1:26:11<34:14, 1.71it/s, loss=0.0037, lr=9.27e-06, step=6490] Training: 65%|██████▍ | 6491/10000 [1:26:11<34:14, 1.71it/s, loss=0.0020, lr=9.27e-06, step=6491] Training: 65%|██████▍ | 6492/10000 [1:26:12<35:19, 1.66it/s, loss=0.0020, lr=9.27e-06, step=6491] Training: 65%|██████▍ | 6492/10000 [1:26:12<35:19, 1.66it/s, loss=0.0218, lr=9.26e-06, step=6492] Training: 65%|██████▍ | 6493/10000 [1:26:12<36:54, 1.58it/s, loss=0.0218, lr=9.26e-06, step=6492] Training: 65%|██████▍ | 6493/10000 [1:26:12<36:54, 1.58it/s, loss=0.0098, lr=9.26e-06, step=6493] Training: 65%|██████▍ | 6494/10000 [1:26:13<34:22, 1.70it/s, loss=0.0098, lr=9.26e-06, step=6493] Training: 65%|██████▍ | 6494/10000 [1:26:13<34:22, 1.70it/s, loss=0.0400, lr=9.25e-06, step=6494] Training: 65%|██████▍ | 6495/10000 [1:26:13<37:00, 1.58it/s, loss=0.0400, lr=9.25e-06, step=6494] Training: 65%|██████▍ | 6495/10000 [1:26:13<37:00, 1.58it/s, loss=0.0256, lr=9.25e-06, step=6495] Training: 65%|██████▍ | 6496/10000 [1:26:14<34:37, 1.69it/s, loss=0.0256, lr=9.25e-06, step=6495] Training: 65%|██████▍ | 6496/10000 [1:26:14<34:37, 1.69it/s, loss=0.0126, lr=9.25e-06, step=6496] Training: 65%|██████▍ | 6497/10000 [1:26:14<33:22, 1.75it/s, loss=0.0126, lr=9.25e-06, step=6496] Training: 65%|██████▍ | 6497/10000 [1:26:14<33:22, 1.75it/s, loss=0.0274, lr=9.24e-06, step=6497] Training: 65%|██████▍ | 6498/10000 [1:26:15<36:42, 1.59it/s, loss=0.0274, lr=9.24e-06, step=6497] Training: 65%|██████▍ | 6498/10000 [1:26:15<36:42, 1.59it/s, loss=0.0075, lr=9.24e-06, step=6498] Training: 65%|██████▍ | 6499/10000 [1:26:16<36:51, 1.58it/s, loss=0.0075, lr=9.24e-06, step=6498] Training: 65%|██████▍ | 6499/10000 [1:26:16<36:51, 1.58it/s, loss=0.0072, lr=9.24e-06, step=6499]20:10:48.919 [I] step=6500 loss=0.0156 smoothed_loss=0.0144 lr=9.25e-06 grad_norm=0.4664 step_time=0.5340s data_time=0.0918s it/s=1.598 eta_to_10000=2189.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.0778 grad_arm_token_fuse=0.0523 grad_shared_expert=0.4955 (18633:train_pytorch.py:850) + Training: 65%|██████▌ | 6500/10000 [1:26:17<38:26, 1.52it/s, loss=0.0072, lr=9.24e-06, step=6499] Training: 65%|██████▌ | 6500/10000 [1:26:17<38:26, 1.52it/s, loss=0.0156, lr=9.23e-06, step=6500] Training: 65%|██████▌ | 6501/10000 [1:26:17<38:57, 1.50it/s, loss=0.0156, lr=9.23e-06, step=6500] Training: 65%|██████▌ | 6501/10000 [1:26:17<38:57, 1.50it/s, loss=0.0131, lr=9.23e-06, step=6501] Training: 65%|██████▌ | 6502/10000 [1:26:18<39:32, 1.47it/s, loss=0.0131, lr=9.23e-06, step=6501] Training: 65%|██████▌ | 6502/10000 [1:26:18<39:32, 1.47it/s, loss=0.0269, lr=9.23e-06, step=6502] Training: 65%|██████▌ | 6503/10000 [1:26:19<37:54, 1.54it/s, loss=0.0269, lr=9.23e-06, step=6502] Training: 65%|██████▌ | 6503/10000 [1:26:19<37:54, 1.54it/s, loss=0.0102, lr=9.22e-06, step=6503] Training: 65%|██████▌ | 6504/10000 [1:26:19<34:56, 1.67it/s, loss=0.0102, lr=9.22e-06, step=6503] Training: 65%|██████▌ | 6504/10000 [1:26:19<34:56, 1.67it/s, loss=0.0077, lr=9.22e-06, step=6504] Training: 65%|██████▌ | 6505/10000 [1:26:20<33:43, 1.73it/s, loss=0.0077, lr=9.22e-06, step=6504] Training: 65%|██████▌ | 6505/10000 [1:26:20<33:43, 1.73it/s, loss=0.0254, lr=9.22e-06, step=6505] Training: 65%|██████▌ | 6506/10000 [1:26:20<34:30, 1.69it/s, loss=0.0254, lr=9.22e-06, step=6505] Training: 65%|██████▌ | 6506/10000 [1:26:20<34:30, 1.69it/s, loss=0.0094, lr=9.21e-06, step=6506] Training: 65%|██████▌ | 6507/10000 [1:26:21<39:42, 1.47it/s, loss=0.0094, lr=9.21e-06, step=6506] Training: 65%|██████▌ | 6507/10000 [1:26:21<39:42, 1.47it/s, loss=0.0010, lr=9.21e-06, step=6507] Training: 65%|██████▌ | 6508/10000 [1:26:22<41:11, 1.41it/s, loss=0.0010, lr=9.21e-06, step=6507] Training: 65%|██████▌ | 6508/10000 [1:26:22<41:11, 1.41it/s, loss=0.0041, lr=9.21e-06, step=6508] Training: 65%|██████▌ | 6509/10000 [1:26:23<42:06, 1.38it/s, loss=0.0041, lr=9.21e-06, step=6508] Training: 65%|██████▌ | 6509/10000 [1:26:23<42:06, 1.38it/s, loss=0.0039, lr=9.20e-06, step=6509]20:10:55.474 [I] step=6510 loss=0.0066 smoothed_loss=0.0111 lr=9.22e-06 grad_norm=0.5015 step_time=0.5421s data_time=0.1133s it/s=1.526 eta_to_10000=2287.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0185 grad_action_out_proj_arms=0.1837 grad_arm_token_fuse=0.0939 grad_shared_expert=0.5098 (18633:train_pytorch.py:850) + Training: 65%|██████▌ | 6510/10000 [1:26:23<38:34, 1.51it/s, loss=0.0039, lr=9.20e-06, step=6509] Training: 65%|██████▌ | 6510/10000 [1:26:23<38:34, 1.51it/s, loss=0.0066, lr=9.20e-06, step=6510] Training: 65%|██████▌ | 6511/10000 [1:26:24<39:27, 1.47it/s, loss=0.0066, lr=9.20e-06, step=6510] Training: 65%|██████▌ | 6511/10000 [1:26:24<39:27, 1.47it/s, loss=0.0020, lr=9.20e-06, step=6511] Training: 65%|██████▌ | 6512/10000 [1:26:25<41:18, 1.41it/s, loss=0.0020, lr=9.20e-06, step=6511] Training: 65%|██████▌ | 6512/10000 [1:26:25<41:18, 1.41it/s, loss=0.0023, lr=9.19e-06, step=6512] Training: 65%|██████▌ | 6513/10000 [1:26:25<37:43, 1.54it/s, loss=0.0023, lr=9.19e-06, step=6512] Training: 65%|██████▌ | 6513/10000 [1:26:25<37:43, 1.54it/s, loss=0.0047, lr=9.19e-06, step=6513] Training: 65%|██████▌ | 6514/10000 [1:26:26<38:04, 1.53it/s, loss=0.0047, lr=9.19e-06, step=6513] Training: 65%|██████▌ | 6514/10000 [1:26:26<38:04, 1.53it/s, loss=0.0084, lr=9.19e-06, step=6514] Training: 65%|██████▌ | 6515/10000 [1:26:27<40:37, 1.43it/s, loss=0.0084, lr=9.19e-06, step=6514] Training: 65%|██████▌ | 6515/10000 [1:26:27<40:37, 1.43it/s, loss=0.0057, lr=9.18e-06, step=6515] Training: 65%|██████▌ | 6516/10000 [1:26:27<37:11, 1.56it/s, loss=0.0057, lr=9.18e-06, step=6515] Training: 65%|██████▌ | 6516/10000 [1:26:27<37:11, 1.56it/s, loss=0.0115, lr=9.18e-06, step=6516] Training: 65%|██████▌ | 6517/10000 [1:26:28<35:21, 1.64it/s, loss=0.0115, lr=9.18e-06, step=6516] Training: 65%|██████▌ | 6517/10000 [1:26:28<35:21, 1.64it/s, loss=0.0138, lr=9.18e-06, step=6517] Training: 65%|██████▌ | 6518/10000 [1:26:28<36:30, 1.59it/s, loss=0.0138, lr=9.18e-06, step=6517] Training: 65%|██████▌ | 6518/10000 [1:26:28<36:30, 1.59it/s, loss=0.0025, lr=9.17e-06, step=6518] Training: 65%|██████▌ | 6519/10000 [1:26:29<38:32, 1.51it/s, loss=0.0025, lr=9.17e-06, step=6518] Training: 65%|██████▌ | 6519/10000 [1:26:29<38:32, 1.51it/s, loss=0.0026, lr=9.17e-06, step=6519]20:11:01.936 [I] step=6520 loss=0.0089 smoothed_loss=0.0081 lr=9.18e-06 grad_norm=0.4426 step_time=0.5357s data_time=0.1105s it/s=1.548 eta_to_10000=2248.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0091 grad_action_out_proj_arms=0.1048 grad_arm_token_fuse=0.0473 grad_shared_expert=0.3970 (18633:train_pytorch.py:850) + Training: 65%|██████▌ | 6520/10000 [1:26:30<36:04, 1.61it/s, loss=0.0026, lr=9.17e-06, step=6519] Training: 65%|██████▌ | 6520/10000 [1:26:30<36:04, 1.61it/s, loss=0.0089, lr=9.17e-06, step=6520] Training: 65%|██████▌ | 6521/10000 [1:26:30<38:23, 1.51it/s, loss=0.0089, lr=9.17e-06, step=6520] Training: 65%|██████▌ | 6521/10000 [1:26:30<38:23, 1.51it/s, loss=0.0032, lr=9.16e-06, step=6521] Training: 65%|██████▌ | 6522/10000 [1:26:31<40:57, 1.42it/s, loss=0.0032, lr=9.16e-06, step=6521] Training: 65%|██████▌ | 6522/10000 [1:26:31<40:57, 1.42it/s, loss=0.0035, lr=9.16e-06, step=6522] Training: 65%|██████▌ | 6523/10000 [1:26:32<42:08, 1.37it/s, loss=0.0035, lr=9.16e-06, step=6522] Training: 65%|██████▌ | 6523/10000 [1:26:32<42:08, 1.37it/s, loss=0.0121, lr=9.16e-06, step=6523] Training: 65%|██████▌ | 6524/10000 [1:26:32<38:41, 1.50it/s, loss=0.0121, lr=9.16e-06, step=6523] Training: 65%|██████▌ | 6524/10000 [1:26:32<38:41, 1.50it/s, loss=0.0420, lr=9.15e-06, step=6524] Training: 65%|██████▌ | 6525/10000 [1:26:33<37:31, 1.54it/s, loss=0.0420, lr=9.15e-06, step=6524] Training: 65%|██████▌ | 6525/10000 [1:26:33<37:31, 1.54it/s, loss=0.0071, lr=9.15e-06, step=6525] Training: 65%|██████▌ | 6526/10000 [1:26:34<34:51, 1.66it/s, loss=0.0071, lr=9.15e-06, step=6525] Training: 65%|██████▌ | 6526/10000 [1:26:34<34:51, 1.66it/s, loss=0.0180, lr=9.15e-06, step=6526] Training: 65%|██████▌ | 6527/10000 [1:26:34<33:01, 1.75it/s, loss=0.0180, lr=9.15e-06, step=6526] Training: 65%|██████▌ | 6527/10000 [1:26:34<33:01, 1.75it/s, loss=0.0014, lr=9.14e-06, step=6527] Training: 65%|██████▌ | 6528/10000 [1:26:35<36:54, 1.57it/s, loss=0.0014, lr=9.14e-06, step=6527] Training: 65%|██████▌ | 6528/10000 [1:26:35<36:54, 1.57it/s, loss=0.0320, lr=9.14e-06, step=6528] Training: 65%|██████▌ | 6529/10000 [1:26:36<42:16, 1.37it/s, loss=0.0320, lr=9.14e-06, step=6528] Training: 65%|██████▌ | 6529/10000 [1:26:36<42:16, 1.37it/s, loss=0.0102, lr=9.14e-06, step=6529]20:11:08.842 [I] step=6530 loss=0.0132 smoothed_loss=0.0125 lr=9.15e-06 grad_norm=0.6256 step_time=0.5870s data_time=0.1036s it/s=1.448 eta_to_10000=2396.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0161 grad_action_out_proj_arms=0.1543 grad_arm_token_fuse=0.0819 grad_shared_expert=2.1628 (18633:train_pytorch.py:850) + Training: 65%|██████▌ | 6530/10000 [1:26:37<41:43, 1.39it/s, loss=0.0102, lr=9.14e-06, step=6529] Training: 65%|██████▌ | 6530/10000 [1:26:37<41:43, 1.39it/s, loss=0.0132, lr=9.13e-06, step=6530] Training: 65%|██████▌ | 6531/10000 [1:26:37<39:14, 1.47it/s, loss=0.0132, lr=9.13e-06, step=6530] Training: 65%|██████▌ | 6531/10000 [1:26:37<39:14, 1.47it/s, loss=0.0097, lr=9.13e-06, step=6531] Training: 65%|██████▌ | 6532/10000 [1:26:38<35:55, 1.61it/s, loss=0.0097, lr=9.13e-06, step=6531] Training: 65%|██████▌ | 6532/10000 [1:26:38<35:55, 1.61it/s, loss=0.0119, lr=9.13e-06, step=6532] Training: 65%|██████▌ | 6533/10000 [1:26:38<36:34, 1.58it/s, loss=0.0119, lr=9.13e-06, step=6532] Training: 65%|██████▌ | 6533/10000 [1:26:38<36:34, 1.58it/s, loss=0.0140, lr=9.12e-06, step=6533] Training: 65%|██████▌ | 6534/10000 [1:26:39<34:03, 1.70it/s, loss=0.0140, lr=9.12e-06, step=6533] Training: 65%|██████▌ | 6534/10000 [1:26:39<34:03, 1.70it/s, loss=0.0074, lr=9.12e-06, step=6534] Training: 65%|██████▌ | 6535/10000 [1:26:39<36:25, 1.59it/s, loss=0.0074, lr=9.12e-06, step=6534] Training: 65%|██████▌ | 6535/10000 [1:26:39<36:25, 1.59it/s, loss=0.0117, lr=9.12e-06, step=6535] Training: 65%|██████▌ | 6536/10000 [1:26:40<41:37, 1.39it/s, loss=0.0117, lr=9.12e-06, step=6535] Training: 65%|██████▌ | 6536/10000 [1:26:40<41:37, 1.39it/s, loss=0.0090, lr=9.11e-06, step=6536] Training: 65%|██████▌ | 6537/10000 [1:26:41<42:59, 1.34it/s, loss=0.0090, lr=9.11e-06, step=6536] Training: 65%|██████▌ | 6537/10000 [1:26:41<42:59, 1.34it/s, loss=0.0127, lr=9.11e-06, step=6537] Training: 65%|██████▌ | 6538/10000 [1:26:42<39:41, 1.45it/s, loss=0.0127, lr=9.11e-06, step=6537] Training: 65%|██████▌ | 6538/10000 [1:26:42<39:41, 1.45it/s, loss=0.0053, lr=9.11e-06, step=6538] Training: 65%|██████▌ | 6539/10000 [1:26:42<36:12, 1.59it/s, loss=0.0053, lr=9.11e-06, step=6538] Training: 65%|██████▌ | 6539/10000 [1:26:42<36:12, 1.59it/s, loss=0.0032, lr=9.10e-06, step=6539]20:11:15.236 [I] step=6540 loss=0.0084 smoothed_loss=0.0101 lr=9.11e-06 grad_norm=0.4234 step_time=0.5302s data_time=0.1092s it/s=1.564 eta_to_10000=2211.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0711 grad_arm_token_fuse=0.0462 grad_shared_expert=0.3335 (18633:train_pytorch.py:850) + Training: 65%|██████▌ | 6540/10000 [1:26:43<37:03, 1.56it/s, loss=0.0032, lr=9.10e-06, step=6539] Training: 65%|██████▌ | 6540/10000 [1:26:43<37:03, 1.56it/s, loss=0.0084, lr=9.10e-06, step=6540] Training: 65%|██████▌ | 6541/10000 [1:26:43<34:47, 1.66it/s, loss=0.0084, lr=9.10e-06, step=6540] Training: 65%|██████▌ | 6541/10000 [1:26:43<34:47, 1.66it/s, loss=0.0039, lr=9.10e-06, step=6541] Training: 65%|██████▌ | 6542/10000 [1:26:44<35:38, 1.62it/s, loss=0.0039, lr=9.10e-06, step=6541] Training: 65%|██████▌ | 6542/10000 [1:26:44<35:38, 1.62it/s, loss=0.0038, lr=9.09e-06, step=6542] Training: 65%|██████▌ | 6543/10000 [1:26:45<39:39, 1.45it/s, loss=0.0038, lr=9.09e-06, step=6542] Training: 65%|██████▌ | 6543/10000 [1:26:45<39:39, 1.45it/s, loss=0.0363, lr=9.09e-06, step=6543] Training: 65%|██████▌ | 6544/10000 [1:26:46<38:49, 1.48it/s, loss=0.0363, lr=9.09e-06, step=6543] Training: 65%|██████▌ | 6544/10000 [1:26:46<38:49, 1.48it/s, loss=0.0098, lr=9.08e-06, step=6544] Training: 65%|██████▌ | 6545/10000 [1:26:46<38:38, 1.49it/s, loss=0.0098, lr=9.08e-06, step=6544] Training: 65%|██████▌ | 6545/10000 [1:26:46<38:38, 1.49it/s, loss=0.0085, lr=9.08e-06, step=6545] Training: 65%|██████▌ | 6546/10000 [1:26:47<35:50, 1.61it/s, loss=0.0085, lr=9.08e-06, step=6545] Training: 65%|██████▌ | 6546/10000 [1:26:47<35:50, 1.61it/s, loss=0.0057, lr=9.08e-06, step=6546] Training: 65%|██████▌ | 6547/10000 [1:26:47<37:30, 1.53it/s, loss=0.0057, lr=9.08e-06, step=6546] Training: 65%|██████▌ | 6547/10000 [1:26:47<37:30, 1.53it/s, loss=0.0017, lr=9.07e-06, step=6547] Training: 65%|██████▌ | 6548/10000 [1:26:48<35:39, 1.61it/s, loss=0.0017, lr=9.07e-06, step=6547] Training: 65%|██████▌ | 6548/10000 [1:26:48<35:39, 1.61it/s, loss=0.0057, lr=9.07e-06, step=6548] Training: 65%|██████▌ | 6549/10000 [1:26:49<36:07, 1.59it/s, loss=0.0057, lr=9.07e-06, step=6548] Training: 65%|██████▌ | 6549/10000 [1:26:49<36:07, 1.59it/s, loss=0.0122, lr=9.07e-06, step=6549]20:11:21.710 [I] step=6550 loss=0.0098 smoothed_loss=0.0096 lr=9.08e-06 grad_norm=0.5171 step_time=0.5408s data_time=0.1066s it/s=1.545 eta_to_10000=2233.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0103 grad_action_out_proj_arms=0.1193 grad_arm_token_fuse=0.0552 grad_shared_expert=0.3320 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6550/10000 [1:26:49<37:52, 1.52it/s, loss=0.0122, lr=9.07e-06, step=6549] Training: 66%|██████▌ | 6550/10000 [1:26:49<37:52, 1.52it/s, loss=0.0098, lr=9.06e-06, step=6550] Training: 66%|██████▌ | 6551/10000 [1:26:50<35:22, 1.63it/s, loss=0.0098, lr=9.06e-06, step=6550] Training: 66%|██████▌ | 6551/10000 [1:26:50<35:22, 1.63it/s, loss=0.0133, lr=9.06e-06, step=6551] Training: 66%|██████▌ | 6552/10000 [1:26:51<38:02, 1.51it/s, loss=0.0133, lr=9.06e-06, step=6551] Training: 66%|██████▌ | 6552/10000 [1:26:51<38:02, 1.51it/s, loss=0.0032, lr=9.06e-06, step=6552] Training: 66%|██████▌ | 6553/10000 [1:26:51<35:31, 1.62it/s, loss=0.0032, lr=9.06e-06, step=6552] Training: 66%|██████▌ | 6553/10000 [1:26:51<35:31, 1.62it/s, loss=0.0103, lr=9.05e-06, step=6553] Training: 66%|██████▌ | 6554/10000 [1:26:52<35:06, 1.64it/s, loss=0.0103, lr=9.05e-06, step=6553] Training: 66%|██████▌ | 6554/10000 [1:26:52<35:06, 1.64it/s, loss=0.0149, lr=9.05e-06, step=6554] Training: 66%|██████▌ | 6555/10000 [1:26:52<36:27, 1.57it/s, loss=0.0149, lr=9.05e-06, step=6554] Training: 66%|██████▌ | 6555/10000 [1:26:52<36:27, 1.57it/s, loss=0.0035, lr=9.05e-06, step=6555] Training: 66%|██████▌ | 6556/10000 [1:26:53<36:57, 1.55it/s, loss=0.0035, lr=9.05e-06, step=6555] Training: 66%|██████▌ | 6556/10000 [1:26:53<36:57, 1.55it/s, loss=0.0115, lr=9.04e-06, step=6556] Training: 66%|██████▌ | 6557/10000 [1:26:54<38:25, 1.49it/s, loss=0.0115, lr=9.04e-06, step=6556] Training: 66%|██████▌ | 6557/10000 [1:26:54<38:25, 1.49it/s, loss=0.0011, lr=9.04e-06, step=6557] Training: 66%|██████▌ | 6558/10000 [1:26:55<42:41, 1.34it/s, loss=0.0011, lr=9.04e-06, step=6557] Training: 66%|██████▌ | 6558/10000 [1:26:55<42:41, 1.34it/s, loss=0.0279, lr=9.04e-06, step=6558] Training: 66%|██████▌ | 6559/10000 [1:26:55<39:14, 1.46it/s, loss=0.0279, lr=9.04e-06, step=6558] Training: 66%|██████▌ | 6559/10000 [1:26:55<39:14, 1.46it/s, loss=0.0221, lr=9.03e-06, step=6559]20:11:28.166 [I] step=6560 loss=0.0026 smoothed_loss=0.0108 lr=9.05e-06 grad_norm=0.4602 step_time=0.5437s data_time=0.1019s it/s=1.549 eta_to_10000=2220.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0561 grad_arm_token_fuse=0.0235 grad_shared_expert=0.1841 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6560/10000 [1:26:56<36:16, 1.58it/s, loss=0.0221, lr=9.03e-06, step=6559] Training: 66%|██████▌ | 6560/10000 [1:26:56<36:16, 1.58it/s, loss=0.0026, lr=9.03e-06, step=6560] Training: 66%|██████▌ | 6561/10000 [1:26:57<37:46, 1.52it/s, loss=0.0026, lr=9.03e-06, step=6560] Training: 66%|██████▌ | 6561/10000 [1:26:57<37:46, 1.52it/s, loss=0.2612, lr=9.03e-06, step=6561] Training: 66%|██████▌ | 6562/10000 [1:26:57<34:49, 1.65it/s, loss=0.2612, lr=9.03e-06, step=6561] Training: 66%|██████▌ | 6562/10000 [1:26:57<34:49, 1.65it/s, loss=0.0144, lr=9.02e-06, step=6562] Training: 66%|██████▌ | 6563/10000 [1:26:58<34:22, 1.67it/s, loss=0.0144, lr=9.02e-06, step=6562] Training: 66%|██████▌ | 6563/10000 [1:26:58<34:22, 1.67it/s, loss=0.0031, lr=9.02e-06, step=6563] Training: 66%|██████▌ | 6564/10000 [1:26:58<34:59, 1.64it/s, loss=0.0031, lr=9.02e-06, step=6563] Training: 66%|██████▌ | 6564/10000 [1:26:58<34:59, 1.64it/s, loss=0.0150, lr=9.02e-06, step=6564] Training: 66%|██████▌ | 6565/10000 [1:26:59<40:32, 1.41it/s, loss=0.0150, lr=9.02e-06, step=6564] Training: 66%|██████▌ | 6565/10000 [1:26:59<40:32, 1.41it/s, loss=0.0107, lr=9.01e-06, step=6565] Training: 66%|██████▌ | 6566/10000 [1:27:00<39:28, 1.45it/s, loss=0.0107, lr=9.01e-06, step=6565] Training: 66%|██████▌ | 6566/10000 [1:27:00<39:28, 1.45it/s, loss=0.0038, lr=9.01e-06, step=6566] Training: 66%|██████▌ | 6567/10000 [1:27:00<37:24, 1.53it/s, loss=0.0038, lr=9.01e-06, step=6566] Training: 66%|██████▌ | 6567/10000 [1:27:00<37:24, 1.53it/s, loss=0.0108, lr=9.01e-06, step=6567] Training: 66%|██████▌ | 6568/10000 [1:27:01<36:13, 1.58it/s, loss=0.0108, lr=9.01e-06, step=6567] Training: 66%|██████▌ | 6568/10000 [1:27:01<36:13, 1.58it/s, loss=0.0061, lr=9.00e-06, step=6568] Training: 66%|██████▌ | 6569/10000 [1:27:02<37:50, 1.51it/s, loss=0.0061, lr=9.00e-06, step=6568] Training: 66%|██████▌ | 6569/10000 [1:27:02<37:50, 1.51it/s, loss=0.0295, lr=9.00e-06, step=6569]20:11:34.718 [I] step=6570 loss=0.0405 smoothed_loss=0.0243 lr=9.01e-06 grad_norm=0.4781 step_time=0.5455s data_time=0.1097s it/s=1.526 eta_to_10000=2247.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0251 grad_action_out_proj_arms=0.1592 grad_arm_token_fuse=0.1305 grad_shared_expert=0.5412 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6570/10000 [1:27:02<37:48, 1.51it/s, loss=0.0295, lr=9.00e-06, step=6569] Training: 66%|██████▌ | 6570/10000 [1:27:02<37:48, 1.51it/s, loss=0.0405, lr=9.00e-06, step=6570] Training: 66%|██████▌ | 6571/10000 [1:27:03<37:26, 1.53it/s, loss=0.0405, lr=9.00e-06, step=6570] Training: 66%|██████▌ | 6571/10000 [1:27:03<37:26, 1.53it/s, loss=0.0293, lr=8.99e-06, step=6571] Training: 66%|██████▌ | 6572/10000 [1:27:04<41:43, 1.37it/s, loss=0.0293, lr=8.99e-06, step=6571] Training: 66%|██████▌ | 6572/10000 [1:27:04<41:43, 1.37it/s, loss=0.0339, lr=8.99e-06, step=6572] Training: 66%|██████▌ | 6573/10000 [1:27:04<38:03, 1.50it/s, loss=0.0339, lr=8.99e-06, step=6572] Training: 66%|██████▌ | 6573/10000 [1:27:04<38:03, 1.50it/s, loss=0.0084, lr=8.99e-06, step=6573] Training: 66%|██████▌ | 6574/10000 [1:27:05<35:08, 1.63it/s, loss=0.0084, lr=8.99e-06, step=6573] Training: 66%|██████▌ | 6574/10000 [1:27:05<35:08, 1.63it/s, loss=0.0031, lr=8.98e-06, step=6574] Training: 66%|██████▌ | 6575/10000 [1:27:05<33:31, 1.70it/s, loss=0.0031, lr=8.98e-06, step=6574] Training: 66%|██████▌ | 6575/10000 [1:27:05<33:31, 1.70it/s, loss=0.0060, lr=8.98e-06, step=6575] Training: 66%|██████▌ | 6576/10000 [1:27:06<31:43, 1.80it/s, loss=0.0060, lr=8.98e-06, step=6575] Training: 66%|██████▌ | 6576/10000 [1:27:06<31:43, 1.80it/s, loss=0.0137, lr=8.98e-06, step=6576] Training: 66%|██████▌ | 6577/10000 [1:27:07<34:05, 1.67it/s, loss=0.0137, lr=8.98e-06, step=6576] Training: 66%|██████▌ | 6577/10000 [1:27:07<34:05, 1.67it/s, loss=0.0067, lr=8.97e-06, step=6577] Training: 66%|██████▌ | 6578/10000 [1:27:07<35:08, 1.62it/s, loss=0.0067, lr=8.97e-06, step=6577] Training: 66%|██████▌ | 6578/10000 [1:27:07<35:08, 1.62it/s, loss=0.0017, lr=8.97e-06, step=6578] Training: 66%|██████▌ | 6579/10000 [1:27:08<36:19, 1.57it/s, loss=0.0017, lr=8.97e-06, step=6578] Training: 66%|██████▌ | 6579/10000 [1:27:08<36:19, 1.57it/s, loss=0.0195, lr=8.97e-06, step=6579]20:11:40.840 [I] step=6580 loss=0.0051 smoothed_loss=0.0158 lr=8.98e-06 grad_norm=0.5445 step_time=0.5240s data_time=0.0882s it/s=1.634 eta_to_10000=2093.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0077 grad_action_out_proj_arms=0.1002 grad_arm_token_fuse=0.0396 grad_shared_expert=0.3478 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6580/10000 [1:27:09<34:15, 1.66it/s, loss=0.0195, lr=8.97e-06, step=6579] Training: 66%|██████▌ | 6580/10000 [1:27:09<34:15, 1.66it/s, loss=0.0051, lr=8.96e-06, step=6580] Training: 66%|██████▌ | 6581/10000 [1:27:09<32:54, 1.73it/s, loss=0.0051, lr=8.96e-06, step=6580] Training: 66%|██████▌ | 6581/10000 [1:27:09<32:54, 1.73it/s, loss=0.0020, lr=8.96e-06, step=6581] Training: 66%|██████▌ | 6582/10000 [1:27:10<34:35, 1.65it/s, loss=0.0020, lr=8.96e-06, step=6581] Training: 66%|██████▌ | 6582/10000 [1:27:10<34:35, 1.65it/s, loss=0.0115, lr=8.96e-06, step=6582] Training: 66%|██████▌ | 6583/10000 [1:27:10<32:41, 1.74it/s, loss=0.0115, lr=8.96e-06, step=6582] Training: 66%|██████▌ | 6583/10000 [1:27:10<32:41, 1.74it/s, loss=0.0056, lr=8.95e-06, step=6583] Training: 66%|██████▌ | 6584/10000 [1:27:11<31:17, 1.82it/s, loss=0.0056, lr=8.95e-06, step=6583] Training: 66%|██████▌ | 6584/10000 [1:27:11<31:17, 1.82it/s, loss=0.0029, lr=8.95e-06, step=6584] Training: 66%|██████▌ | 6585/10000 [1:27:11<31:33, 1.80it/s, loss=0.0029, lr=8.95e-06, step=6584] Training: 66%|██████▌ | 6585/10000 [1:27:11<31:33, 1.80it/s, loss=0.0030, lr=8.95e-06, step=6585] Training: 66%|██████▌ | 6586/10000 [1:27:12<33:58, 1.68it/s, loss=0.0030, lr=8.95e-06, step=6585] Training: 66%|██████▌ | 6586/10000 [1:27:12<33:58, 1.68it/s, loss=0.0077, lr=8.94e-06, step=6586] Training: 66%|██████▌ | 6587/10000 [1:27:13<33:35, 1.69it/s, loss=0.0077, lr=8.94e-06, step=6586] Training: 66%|██████▌ | 6587/10000 [1:27:13<33:35, 1.69it/s, loss=0.0016, lr=8.94e-06, step=6587] Training: 66%|██████▌ | 6588/10000 [1:27:13<35:39, 1.60it/s, loss=0.0016, lr=8.94e-06, step=6587] Training: 66%|██████▌ | 6588/10000 [1:27:13<35:39, 1.60it/s, loss=0.0059, lr=8.94e-06, step=6588] Training: 66%|██████▌ | 6589/10000 [1:27:14<33:20, 1.71it/s, loss=0.0059, lr=8.94e-06, step=6588] Training: 66%|██████▌ | 6589/10000 [1:27:14<33:20, 1.71it/s, loss=0.0104, lr=8.93e-06, step=6589]20:11:46.588 [I] step=6590 loss=0.0184 smoothed_loss=0.0106 lr=8.94e-06 grad_norm=0.4355 step_time=0.4862s data_time=0.0885s it/s=1.740 eta_to_10000=1959.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0134 grad_action_out_proj_arms=0.1382 grad_arm_token_fuse=0.0681 grad_shared_expert=0.4859 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6590/10000 [1:27:14<32:09, 1.77it/s, loss=0.0104, lr=8.93e-06, step=6589] Training: 66%|██████▌ | 6590/10000 [1:27:14<32:09, 1.77it/s, loss=0.0184, lr=8.93e-06, step=6590] Training: 66%|██████▌ | 6591/10000 [1:27:15<31:23, 1.81it/s, loss=0.0184, lr=8.93e-06, step=6590] Training: 66%|██████▌ | 6591/10000 [1:27:15<31:23, 1.81it/s, loss=0.0045, lr=8.93e-06, step=6591] Training: 66%|██████▌ | 6592/10000 [1:27:15<30:21, 1.87it/s, loss=0.0045, lr=8.93e-06, step=6591] Training: 66%|██████▌ | 6592/10000 [1:27:15<30:21, 1.87it/s, loss=0.0058, lr=8.92e-06, step=6592] Training: 66%|██████▌ | 6593/10000 [1:27:16<35:51, 1.58it/s, loss=0.0058, lr=8.92e-06, step=6592] Training: 66%|██████▌ | 6593/10000 [1:27:16<35:51, 1.58it/s, loss=0.0118, lr=8.92e-06, step=6593] Training: 66%|██████▌ | 6594/10000 [1:27:17<34:12, 1.66it/s, loss=0.0118, lr=8.92e-06, step=6593] Training: 66%|██████▌ | 6594/10000 [1:27:17<34:12, 1.66it/s, loss=0.0032, lr=8.92e-06, step=6594] Training: 66%|██████▌ | 6595/10000 [1:27:17<33:06, 1.71it/s, loss=0.0032, lr=8.92e-06, step=6594] Training: 66%|██████▌ | 6595/10000 [1:27:17<33:06, 1.71it/s, loss=0.0149, lr=8.91e-06, step=6595] Training: 66%|██████▌ | 6596/10000 [1:27:18<34:16, 1.66it/s, loss=0.0149, lr=8.91e-06, step=6595] Training: 66%|██████▌ | 6596/10000 [1:27:18<34:16, 1.66it/s, loss=0.0097, lr=8.91e-06, step=6596] Training: 66%|██████▌ | 6597/10000 [1:27:18<34:00, 1.67it/s, loss=0.0097, lr=8.91e-06, step=6596] Training: 66%|██████▌ | 6597/10000 [1:27:18<34:00, 1.67it/s, loss=0.0105, lr=8.91e-06, step=6597] Training: 66%|██████▌ | 6598/10000 [1:27:19<32:22, 1.75it/s, loss=0.0105, lr=8.91e-06, step=6597] Training: 66%|██████▌ | 6598/10000 [1:27:19<32:22, 1.75it/s, loss=0.0089, lr=8.90e-06, step=6598] Training: 66%|██████▌ | 6599/10000 [1:27:19<31:27, 1.80it/s, loss=0.0089, lr=8.90e-06, step=6598] Training: 66%|██████▌ | 6599/10000 [1:27:19<31:27, 1.80it/s, loss=0.1231, lr=8.90e-06, step=6599]20:11:52.637 [I] step=6600 loss=0.0070 smoothed_loss=0.0196 lr=8.91e-06 grad_norm=0.4444 step_time=0.5230s data_time=0.0819s it/s=1.653 eta_to_10000=2056.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0048 grad_action_out_proj_arms=0.0583 grad_arm_token_fuse=0.0231 grad_shared_expert=0.2568 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6600/10000 [1:27:20<36:18, 1.56it/s, loss=0.1231, lr=8.90e-06, step=6599] Training: 66%|██████▌ | 6600/10000 [1:27:20<36:18, 1.56it/s, loss=0.0070, lr=8.90e-06, step=6600] Training: 66%|██████▌ | 6601/10000 [1:27:21<33:52, 1.67it/s, loss=0.0070, lr=8.90e-06, step=6600] Training: 66%|██████▌ | 6601/10000 [1:27:21<33:52, 1.67it/s, loss=0.0482, lr=8.89e-06, step=6601] Training: 66%|██████▌ | 6602/10000 [1:27:21<34:36, 1.64it/s, loss=0.0482, lr=8.89e-06, step=6601] Training: 66%|██████▌ | 6602/10000 [1:27:21<34:36, 1.64it/s, loss=0.0042, lr=8.89e-06, step=6602] Training: 66%|██████▌ | 6603/10000 [1:27:22<33:13, 1.70it/s, loss=0.0042, lr=8.89e-06, step=6602] Training: 66%|██████▌ | 6603/10000 [1:27:22<33:13, 1.70it/s, loss=0.0017, lr=8.89e-06, step=6603] Training: 66%|██████▌ | 6604/10000 [1:27:22<31:51, 1.78it/s, loss=0.0017, lr=8.89e-06, step=6603] Training: 66%|██████▌ | 6604/10000 [1:27:22<31:51, 1.78it/s, loss=0.0119, lr=8.88e-06, step=6604] Training: 66%|██████▌ | 6605/10000 [1:27:23<30:47, 1.84it/s, loss=0.0119, lr=8.88e-06, step=6604] Training: 66%|██████▌ | 6605/10000 [1:27:23<30:47, 1.84it/s, loss=0.0073, lr=8.88e-06, step=6605] Training: 66%|██████▌ | 6606/10000 [1:27:24<31:04, 1.82it/s, loss=0.0073, lr=8.88e-06, step=6605] Training: 66%|██████▌ | 6606/10000 [1:27:24<31:04, 1.82it/s, loss=0.0139, lr=8.88e-06, step=6606] Training: 66%|██████▌ | 6607/10000 [1:27:24<33:23, 1.69it/s, loss=0.0139, lr=8.88e-06, step=6606] Training: 66%|██████▌ | 6607/10000 [1:27:24<33:23, 1.69it/s, loss=0.0091, lr=8.87e-06, step=6607] Training: 66%|██████▌ | 6608/10000 [1:27:25<32:42, 1.73it/s, loss=0.0091, lr=8.87e-06, step=6607] Training: 66%|██████▌ | 6608/10000 [1:27:25<32:42, 1.73it/s, loss=0.0032, lr=8.87e-06, step=6608] Training: 66%|██████▌ | 6609/10000 [1:27:25<32:57, 1.71it/s, loss=0.0032, lr=8.87e-06, step=6608] Training: 66%|██████▌ | 6609/10000 [1:27:25<32:57, 1.71it/s, loss=0.0038, lr=8.87e-06, step=6609]20:11:58.246 [I] step=6610 loss=0.0084 smoothed_loss=0.0130 lr=8.88e-06 grad_norm=0.4561 step_time=0.4788s data_time=0.0820s it/s=1.783 eta_to_10000=1900.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.0905 grad_arm_token_fuse=0.0366 grad_shared_expert=0.2707 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6610/10000 [1:27:26<32:10, 1.76it/s, loss=0.0038, lr=8.87e-06, step=6609] Training: 66%|██████▌ | 6610/10000 [1:27:26<32:10, 1.76it/s, loss=0.0084, lr=8.86e-06, step=6610] Training: 66%|██████▌ | 6611/10000 [1:27:26<30:52, 1.83it/s, loss=0.0084, lr=8.86e-06, step=6610] Training: 66%|██████▌ | 6611/10000 [1:27:26<30:52, 1.83it/s, loss=0.0055, lr=8.86e-06, step=6611] Training: 66%|██████▌ | 6612/10000 [1:27:27<29:56, 1.89it/s, loss=0.0055, lr=8.86e-06, step=6611] Training: 66%|██████▌ | 6612/10000 [1:27:27<29:56, 1.89it/s, loss=0.0125, lr=8.86e-06, step=6612] Training: 66%|██████▌ | 6613/10000 [1:27:28<32:40, 1.73it/s, loss=0.0125, lr=8.86e-06, step=6612] Training: 66%|██████▌ | 6613/10000 [1:27:28<32:40, 1.73it/s, loss=0.0042, lr=8.85e-06, step=6613] Training: 66%|██████▌ | 6614/10000 [1:27:28<34:26, 1.64it/s, loss=0.0042, lr=8.85e-06, step=6613] Training: 66%|██████▌ | 6614/10000 [1:27:28<34:26, 1.64it/s, loss=0.0063, lr=8.85e-06, step=6614] Training: 66%|██████▌ | 6615/10000 [1:27:29<35:19, 1.60it/s, loss=0.0063, lr=8.85e-06, step=6614] Training: 66%|██████▌ | 6615/10000 [1:27:29<35:19, 1.60it/s, loss=0.0033, lr=8.85e-06, step=6615] Training: 66%|██████▌ | 6616/10000 [1:27:29<33:03, 1.71it/s, loss=0.0033, lr=8.85e-06, step=6615] Training: 66%|██████▌ | 6616/10000 [1:27:29<33:03, 1.71it/s, loss=0.0035, lr=8.84e-06, step=6616] Training: 66%|██████▌ | 6617/10000 [1:27:30<31:30, 1.79it/s, loss=0.0035, lr=8.84e-06, step=6616] Training: 66%|██████▌ | 6617/10000 [1:27:30<31:30, 1.79it/s, loss=0.0037, lr=8.84e-06, step=6617] Training: 66%|██████▌ | 6618/10000 [1:27:30<31:10, 1.81it/s, loss=0.0037, lr=8.84e-06, step=6617] Training: 66%|██████▌ | 6618/10000 [1:27:30<31:10, 1.81it/s, loss=0.0046, lr=8.84e-06, step=6618] Training: 66%|██████▌ | 6619/10000 [1:27:31<33:38, 1.67it/s, loss=0.0046, lr=8.84e-06, step=6618] Training: 66%|██████▌ | 6619/10000 [1:27:31<33:38, 1.67it/s, loss=0.0123, lr=8.83e-06, step=6619]20:12:04.039 [I] step=6620 loss=0.0022 smoothed_loss=0.0082 lr=8.84e-06 grad_norm=0.3966 step_time=0.4982s data_time=0.0812s it/s=1.726 eta_to_10000=1958.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0676 grad_arm_token_fuse=0.0260 grad_shared_expert=0.2056 (18633:train_pytorch.py:850) + Training: 66%|██████▌ | 6620/10000 [1:27:32<32:43, 1.72it/s, loss=0.0123, lr=8.83e-06, step=6619] Training: 66%|██████▌ | 6620/10000 [1:27:32<32:43, 1.72it/s, loss=0.0022, lr=8.83e-06, step=6620] Training: 66%|██████▌ | 6621/10000 [1:27:32<35:50, 1.57it/s, loss=0.0022, lr=8.83e-06, step=6620] Training: 66%|██████▌ | 6621/10000 [1:27:32<35:50, 1.57it/s, loss=0.0082, lr=8.83e-06, step=6621] Training: 66%|██████▌ | 6622/10000 [1:27:33<36:33, 1.54it/s, loss=0.0082, lr=8.83e-06, step=6621] Training: 66%|██████▌ | 6622/10000 [1:27:33<36:33, 1.54it/s, loss=0.0066, lr=8.82e-06, step=6622] Training: 66%|██████▌ | 6623/10000 [1:27:34<36:11, 1.56it/s, loss=0.0066, lr=8.82e-06, step=6622] Training: 66%|██████▌ | 6623/10000 [1:27:34<36:11, 1.56it/s, loss=0.0089, lr=8.82e-06, step=6623] Training: 66%|██████▌ | 6624/10000 [1:27:34<33:49, 1.66it/s, loss=0.0089, lr=8.82e-06, step=6623] Training: 66%|██████▌ | 6624/10000 [1:27:34<33:49, 1.66it/s, loss=0.0018, lr=8.82e-06, step=6624] Training: 66%|██████▋ | 6625/10000 [1:27:35<32:05, 1.75it/s, loss=0.0018, lr=8.82e-06, step=6624] Training: 66%|██████▋ | 6625/10000 [1:27:35<32:05, 1.75it/s, loss=0.0105, lr=8.81e-06, step=6625] Training: 66%|██████▋ | 6626/10000 [1:27:35<30:47, 1.83it/s, loss=0.0105, lr=8.81e-06, step=6625] Training: 66%|██████▋ | 6626/10000 [1:27:35<30:47, 1.83it/s, loss=0.0201, lr=8.81e-06, step=6626] Training: 66%|██████▋ | 6627/10000 [1:27:36<32:55, 1.71it/s, loss=0.0201, lr=8.81e-06, step=6626] Training: 66%|██████▋ | 6627/10000 [1:27:36<32:55, 1.71it/s, loss=0.0140, lr=8.81e-06, step=6627] Training: 66%|██████▋ | 6628/10000 [1:27:37<39:21, 1.43it/s, loss=0.0140, lr=8.81e-06, step=6627] Training: 66%|██████▋ | 6628/10000 [1:27:37<39:21, 1.43it/s, loss=0.0088, lr=8.80e-06, step=6628] Training: 66%|██████▋ | 6629/10000 [1:27:38<38:53, 1.44it/s, loss=0.0088, lr=8.80e-06, step=6628] Training: 66%|██████▋ | 6629/10000 [1:27:38<38:53, 1.44it/s, loss=0.0065, lr=8.80e-06, step=6629]20:12:10.460 [I] step=6630 loss=0.0168 smoothed_loss=0.0099 lr=8.81e-06 grad_norm=0.4624 step_time=0.5508s data_time=0.0913s it/s=1.558 eta_to_10000=2163.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0246 grad_action_out_proj_arms=0.1857 grad_arm_token_fuse=0.1242 grad_shared_expert=0.6058 (18633:train_pytorch.py:850) + Training: 66%|██████▋ | 6630/10000 [1:27:38<36:13, 1.55it/s, loss=0.0065, lr=8.80e-06, step=6629] Training: 66%|██████▋ | 6630/10000 [1:27:38<36:13, 1.55it/s, loss=0.0168, lr=8.80e-06, step=6630] Training: 66%|██████▋ | 6631/10000 [1:27:39<34:34, 1.62it/s, loss=0.0168, lr=8.80e-06, step=6630] Training: 66%|██████▋ | 6631/10000 [1:27:39<34:34, 1.62it/s, loss=0.0055, lr=8.79e-06, step=6631] Training: 66%|██████▋ | 6632/10000 [1:27:39<34:45, 1.62it/s, loss=0.0055, lr=8.79e-06, step=6631] Training: 66%|██████▋ | 6632/10000 [1:27:39<34:45, 1.62it/s, loss=0.0044, lr=8.79e-06, step=6632] Training: 66%|██████▋ | 6633/10000 [1:27:40<32:49, 1.71it/s, loss=0.0044, lr=8.79e-06, step=6632] Training: 66%|██████▋ | 6633/10000 [1:27:40<32:49, 1.71it/s, loss=0.0125, lr=8.79e-06, step=6633] Training: 66%|██████▋ | 6634/10000 [1:27:40<31:37, 1.77it/s, loss=0.0125, lr=8.79e-06, step=6633] Training: 66%|██████▋ | 6634/10000 [1:27:40<31:37, 1.77it/s, loss=0.0074, lr=8.78e-06, step=6634] Training: 66%|██████▋ | 6635/10000 [1:27:41<33:52, 1.66it/s, loss=0.0074, lr=8.78e-06, step=6634] Training: 66%|██████▋ | 6635/10000 [1:27:41<33:52, 1.66it/s, loss=0.0035, lr=8.78e-06, step=6635] Training: 66%|██████▋ | 6636/10000 [1:27:42<35:58, 1.56it/s, loss=0.0035, lr=8.78e-06, step=6635] Training: 66%|██████▋ | 6636/10000 [1:27:42<35:58, 1.56it/s, loss=0.0050, lr=8.78e-06, step=6636] Training: 66%|██████▋ | 6637/10000 [1:27:42<33:35, 1.67it/s, loss=0.0050, lr=8.78e-06, step=6636] Training: 66%|██████▋ | 6637/10000 [1:27:42<33:35, 1.67it/s, loss=0.0045, lr=8.77e-06, step=6637] Training: 66%|██████▋ | 6638/10000 [1:27:43<35:56, 1.56it/s, loss=0.0045, lr=8.77e-06, step=6637] Training: 66%|██████▋ | 6638/10000 [1:27:43<35:56, 1.56it/s, loss=0.0031, lr=8.77e-06, step=6638] Training: 66%|██████▋ | 6639/10000 [1:27:44<33:50, 1.66it/s, loss=0.0031, lr=8.77e-06, step=6638] Training: 66%|██████▋ | 6639/10000 [1:27:44<33:50, 1.66it/s, loss=0.0221, lr=8.77e-06, step=6639]20:12:16.398 [I] step=6640 loss=0.0097 smoothed_loss=0.0089 lr=8.78e-06 grad_norm=0.4746 step_time=0.5042s data_time=0.0895s it/s=1.684 eta_to_10000=1994.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0130 grad_action_out_proj_arms=0.0988 grad_arm_token_fuse=0.0740 grad_shared_expert=0.3972 (18633:train_pytorch.py:850) + Training: 66%|██████▋ | 6640/10000 [1:27:44<33:03, 1.69it/s, loss=0.0221, lr=8.77e-06, step=6639] Training: 66%|██████▋ | 6640/10000 [1:27:44<33:03, 1.69it/s, loss=0.0097, lr=8.76e-06, step=6640] Training: 66%|██████▋ | 6641/10000 [1:27:45<31:50, 1.76it/s, loss=0.0097, lr=8.76e-06, step=6640] Training: 66%|██████▋ | 6641/10000 [1:27:45<31:50, 1.76it/s, loss=0.0418, lr=8.76e-06, step=6641] Training: 66%|██████▋ | 6642/10000 [1:27:45<32:56, 1.70it/s, loss=0.0418, lr=8.76e-06, step=6641] Training: 66%|██████▋ | 6642/10000 [1:27:45<32:56, 1.70it/s, loss=0.0027, lr=8.76e-06, step=6642] Training: 66%|██████▋ | 6643/10000 [1:27:46<39:26, 1.42it/s, loss=0.0027, lr=8.76e-06, step=6642] Training: 66%|██████▋ | 6643/10000 [1:27:46<39:26, 1.42it/s, loss=0.0079, lr=8.75e-06, step=6643] Training: 66%|██████▋ | 6644/10000 [1:27:47<40:40, 1.38it/s, loss=0.0079, lr=8.75e-06, step=6643] Training: 66%|██████▋ | 6644/10000 [1:27:47<40:40, 1.38it/s, loss=0.0089, lr=8.75e-06, step=6644] Training: 66%|██████▋ | 6645/10000 [1:27:48<40:51, 1.37it/s, loss=0.0089, lr=8.75e-06, step=6644] Training: 66%|██████▋ | 6645/10000 [1:27:48<40:51, 1.37it/s, loss=0.0321, lr=8.75e-06, step=6645] Training: 66%|██████▋ | 6646/10000 [1:27:48<36:50, 1.52it/s, loss=0.0321, lr=8.75e-06, step=6645] Training: 66%|██████▋ | 6646/10000 [1:27:48<36:50, 1.52it/s, loss=0.0113, lr=8.74e-06, step=6646] Training: 66%|██████▋ | 6647/10000 [1:27:49<34:13, 1.63it/s, loss=0.0113, lr=8.74e-06, step=6646] Training: 66%|██████▋ | 6647/10000 [1:27:49<34:13, 1.63it/s, loss=0.0007, lr=8.74e-06, step=6647] Training: 66%|██████▋ | 6648/10000 [1:27:49<32:14, 1.73it/s, loss=0.0007, lr=8.74e-06, step=6647] Training: 66%|██████▋ | 6648/10000 [1:27:49<32:14, 1.73it/s, loss=0.0109, lr=8.74e-06, step=6648] Training: 66%|██████▋ | 6649/10000 [1:27:50<34:18, 1.63it/s, loss=0.0109, lr=8.74e-06, step=6648] Training: 66%|██████▋ | 6649/10000 [1:27:50<34:18, 1.63it/s, loss=0.0069, lr=8.73e-06, step=6649]20:12:23.173 [I] step=6650 loss=0.0038 smoothed_loss=0.0103 lr=8.74e-06 grad_norm=0.4623 step_time=0.5481s data_time=0.1294s it/s=1.476 eta_to_10000=2269.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0169 grad_action_out_proj_arms=0.1104 grad_arm_token_fuse=0.0909 grad_shared_expert=0.3287 (18633:train_pytorch.py:850) + Training: 66%|██████▋ | 6650/10000 [1:27:51<39:42, 1.41it/s, loss=0.0069, lr=8.73e-06, step=6649] Training: 66%|██████▋ | 6650/10000 [1:27:51<39:42, 1.41it/s, loss=0.0038, lr=8.73e-06, step=6650] Training: 67%|██████▋ | 6651/10000 [1:27:52<39:23, 1.42it/s, loss=0.0038, lr=8.73e-06, step=6650] Training: 67%|██████▋ | 6651/10000 [1:27:52<39:23, 1.42it/s, loss=0.0094, lr=8.73e-06, step=6651] Training: 67%|██████▋ | 6652/10000 [1:27:52<38:18, 1.46it/s, loss=0.0094, lr=8.73e-06, step=6651] Training: 67%|██████▋ | 6652/10000 [1:27:52<38:18, 1.46it/s, loss=0.0050, lr=8.72e-06, step=6652] Training: 67%|██████▋ | 6653/10000 [1:27:53<38:37, 1.44it/s, loss=0.0050, lr=8.72e-06, step=6652] Training: 67%|██████▋ | 6653/10000 [1:27:53<38:37, 1.44it/s, loss=0.0065, lr=8.72e-06, step=6653] Training: 67%|██████▋ | 6654/10000 [1:27:53<35:39, 1.56it/s, loss=0.0065, lr=8.72e-06, step=6653] Training: 67%|██████▋ | 6654/10000 [1:27:53<35:39, 1.56it/s, loss=0.0078, lr=8.72e-06, step=6654] Training: 67%|██████▋ | 6655/10000 [1:27:54<33:11, 1.68it/s, loss=0.0078, lr=8.72e-06, step=6654] Training: 67%|██████▋ | 6655/10000 [1:27:54<33:11, 1.68it/s, loss=0.0224, lr=8.71e-06, step=6655] Training: 67%|██████▋ | 6656/10000 [1:27:55<33:24, 1.67it/s, loss=0.0224, lr=8.71e-06, step=6655] Training: 67%|██████▋ | 6656/10000 [1:27:55<33:24, 1.67it/s, loss=0.0208, lr=8.71e-06, step=6656] Training: 67%|██████▋ | 6657/10000 [1:27:55<35:08, 1.59it/s, loss=0.0208, lr=8.71e-06, step=6656] Training: 67%|██████▋ | 6657/10000 [1:27:55<35:08, 1.59it/s, loss=0.0039, lr=8.71e-06, step=6657] Training: 67%|██████▋ | 6658/10000 [1:27:56<35:31, 1.57it/s, loss=0.0039, lr=8.71e-06, step=6657] Training: 67%|██████▋ | 6658/10000 [1:27:56<35:31, 1.57it/s, loss=0.0039, lr=8.70e-06, step=6658] Training: 67%|██████▋ | 6659/10000 [1:27:56<33:15, 1.67it/s, loss=0.0039, lr=8.70e-06, step=6658] Training: 67%|██████▋ | 6659/10000 [1:27:56<33:15, 1.67it/s, loss=0.0137, lr=8.70e-06, step=6659]20:12:29.218 [I] step=6660 loss=0.0028 smoothed_loss=0.0097 lr=8.71e-06 grad_norm=0.4349 step_time=0.5074s data_time=0.0971s it/s=1.655 eta_to_10000=2018.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0198 grad_action_out_proj_arms=0.1529 grad_arm_token_fuse=0.1042 grad_shared_expert=0.4178 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6660/10000 [1:27:57<32:03, 1.74it/s, loss=0.0137, lr=8.70e-06, step=6659] Training: 67%|██████▋ | 6660/10000 [1:27:57<32:03, 1.74it/s, loss=0.0028, lr=8.70e-06, step=6660] Training: 67%|██████▋ | 6661/10000 [1:27:57<32:02, 1.74it/s, loss=0.0028, lr=8.70e-06, step=6660] Training: 67%|██████▋ | 6661/10000 [1:27:57<32:02, 1.74it/s, loss=0.0078, lr=8.69e-06, step=6661] Training: 67%|██████▋ | 6662/10000 [1:27:58<31:09, 1.79it/s, loss=0.0078, lr=8.69e-06, step=6661] Training: 67%|██████▋ | 6662/10000 [1:27:58<31:09, 1.79it/s, loss=0.0063, lr=8.69e-06, step=6662] Training: 67%|██████▋ | 6663/10000 [1:27:59<30:58, 1.80it/s, loss=0.0063, lr=8.69e-06, step=6662] Training: 67%|██████▋ | 6663/10000 [1:27:59<30:58, 1.80it/s, loss=0.0027, lr=8.69e-06, step=6663] Training: 67%|██████▋ | 6664/10000 [1:27:59<34:29, 1.61it/s, loss=0.0027, lr=8.69e-06, step=6663] Training: 67%|██████▋ | 6664/10000 [1:27:59<34:29, 1.61it/s, loss=0.0121, lr=8.68e-06, step=6664] Training: 67%|██████▋ | 6665/10000 [1:28:00<38:21, 1.45it/s, loss=0.0121, lr=8.68e-06, step=6664] Training: 67%|██████▋ | 6665/10000 [1:28:00<38:21, 1.45it/s, loss=0.0075, lr=8.68e-06, step=6665] Training: 67%|██████▋ | 6666/10000 [1:28:01<38:14, 1.45it/s, loss=0.0075, lr=8.68e-06, step=6665] Training: 67%|██████▋ | 6666/10000 [1:28:01<38:14, 1.45it/s, loss=0.0039, lr=8.68e-06, step=6666] Training: 67%|██████▋ | 6667/10000 [1:28:01<37:09, 1.49it/s, loss=0.0039, lr=8.68e-06, step=6666] Training: 67%|██████▋ | 6667/10000 [1:28:01<37:09, 1.49it/s, loss=0.0080, lr=8.67e-06, step=6667] Training: 67%|██████▋ | 6668/10000 [1:28:02<38:05, 1.46it/s, loss=0.0080, lr=8.67e-06, step=6667] Training: 67%|██████▋ | 6668/10000 [1:28:02<38:05, 1.46it/s, loss=0.0030, lr=8.67e-06, step=6668] Training: 67%|██████▋ | 6669/10000 [1:28:03<38:06, 1.46it/s, loss=0.0030, lr=8.67e-06, step=6668] Training: 67%|██████▋ | 6669/10000 [1:28:03<38:06, 1.46it/s, loss=0.0080, lr=8.67e-06, step=6669]20:12:35.731 [I] step=6670 loss=0.0131 smoothed_loss=0.0083 lr=8.68e-06 grad_norm=0.4574 step_time=0.5405s data_time=0.1107s it/s=1.536 eta_to_10000=2168.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0777 grad_arm_token_fuse=0.0385 grad_shared_expert=0.5123 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6670/10000 [1:28:03<35:20, 1.57it/s, loss=0.0080, lr=8.67e-06, step=6669] Training: 67%|██████▋ | 6670/10000 [1:28:03<35:20, 1.57it/s, loss=0.0131, lr=8.66e-06, step=6670] Training: 67%|██████▋ | 6671/10000 [1:28:04<35:15, 1.57it/s, loss=0.0131, lr=8.66e-06, step=6670] Training: 67%|██████▋ | 6671/10000 [1:28:04<35:15, 1.57it/s, loss=0.0107, lr=8.66e-06, step=6671] Training: 67%|██████▋ | 6672/10000 [1:28:05<36:12, 1.53it/s, loss=0.0107, lr=8.66e-06, step=6671] Training: 67%|██████▋ | 6672/10000 [1:28:05<36:12, 1.53it/s, loss=0.0031, lr=8.66e-06, step=6672] Training: 67%|██████▋ | 6673/10000 [1:28:05<34:31, 1.61it/s, loss=0.0031, lr=8.66e-06, step=6672] Training: 67%|██████▋ | 6673/10000 [1:28:05<34:31, 1.61it/s, loss=0.0025, lr=8.65e-06, step=6673] Training: 67%|██████▋ | 6674/10000 [1:28:06<36:08, 1.53it/s, loss=0.0025, lr=8.65e-06, step=6673] Training: 67%|██████▋ | 6674/10000 [1:28:06<36:08, 1.53it/s, loss=0.0096, lr=8.65e-06, step=6674] Training: 67%|██████▋ | 6675/10000 [1:28:06<33:35, 1.65it/s, loss=0.0096, lr=8.65e-06, step=6674] Training: 67%|██████▋ | 6675/10000 [1:28:07<33:35, 1.65it/s, loss=0.0074, lr=8.65e-06, step=6675] Training: 67%|██████▋ | 6676/10000 [1:28:07<33:12, 1.67it/s, loss=0.0074, lr=8.65e-06, step=6675] Training: 67%|██████▋ | 6676/10000 [1:28:07<33:12, 1.67it/s, loss=0.0111, lr=8.64e-06, step=6676] Training: 67%|██████▋ | 6677/10000 [1:28:08<32:29, 1.70it/s, loss=0.0111, lr=8.64e-06, step=6676] Training: 67%|██████▋ | 6677/10000 [1:28:08<32:29, 1.70it/s, loss=0.0092, lr=8.64e-06, step=6677] Training: 67%|██████▋ | 6678/10000 [1:28:08<33:54, 1.63it/s, loss=0.0092, lr=8.64e-06, step=6677] Training: 67%|██████▋ | 6678/10000 [1:28:08<33:54, 1.63it/s, loss=0.0170, lr=8.64e-06, step=6678] Training: 67%|██████▋ | 6679/10000 [1:28:09<35:26, 1.56it/s, loss=0.0170, lr=8.64e-06, step=6678] Training: 67%|██████▋ | 6679/10000 [1:28:09<35:26, 1.56it/s, loss=0.0024, lr=8.63e-06, step=6679]20:12:42.030 [I] step=6680 loss=0.0063 smoothed_loss=0.0081 lr=8.64e-06 grad_norm=0.4710 step_time=0.5460s data_time=0.0839s it/s=1.588 eta_to_10000=2091.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0215 grad_action_out_proj_arms=0.1658 grad_arm_token_fuse=0.1085 grad_shared_expert=0.7250 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6680/10000 [1:28:10<36:08, 1.53it/s, loss=0.0024, lr=8.63e-06, step=6679] Training: 67%|██████▋ | 6680/10000 [1:28:10<36:08, 1.53it/s, loss=0.0063, lr=8.63e-06, step=6680] Training: 67%|██████▋ | 6681/10000 [1:28:10<35:48, 1.54it/s, loss=0.0063, lr=8.63e-06, step=6680] Training: 67%|██████▋ | 6681/10000 [1:28:10<35:48, 1.54it/s, loss=0.0050, lr=8.63e-06, step=6681] Training: 67%|██████▋ | 6682/10000 [1:28:11<34:30, 1.60it/s, loss=0.0050, lr=8.63e-06, step=6681] Training: 67%|██████▋ | 6682/10000 [1:28:11<34:30, 1.60it/s, loss=0.0027, lr=8.62e-06, step=6682] Training: 67%|██████▋ | 6683/10000 [1:28:11<32:19, 1.71it/s, loss=0.0027, lr=8.62e-06, step=6682] Training: 67%|██████▋ | 6683/10000 [1:28:11<32:19, 1.71it/s, loss=0.0094, lr=8.62e-06, step=6683] Training: 67%|██████▋ | 6684/10000 [1:28:12<31:16, 1.77it/s, loss=0.0094, lr=8.62e-06, step=6683] Training: 67%|██████▋ | 6684/10000 [1:28:12<31:16, 1.77it/s, loss=0.0560, lr=8.62e-06, step=6684] Training: 67%|██████▋ | 6685/10000 [1:28:12<30:55, 1.79it/s, loss=0.0560, lr=8.62e-06, step=6684] Training: 67%|██████▋ | 6685/10000 [1:28:12<30:55, 1.79it/s, loss=0.0014, lr=8.61e-06, step=6685] Training: 67%|██████▋ | 6686/10000 [1:28:13<33:12, 1.66it/s, loss=0.0014, lr=8.61e-06, step=6685] Training: 67%|██████▋ | 6686/10000 [1:28:13<33:12, 1.66it/s, loss=0.0022, lr=8.61e-06, step=6686] Training: 67%|██████▋ | 6687/10000 [1:28:14<31:34, 1.75it/s, loss=0.0022, lr=8.61e-06, step=6686] Training: 67%|██████▋ | 6687/10000 [1:28:14<31:34, 1.75it/s, loss=0.0088, lr=8.61e-06, step=6687] Training: 67%|██████▋ | 6688/10000 [1:28:14<35:52, 1.54it/s, loss=0.0088, lr=8.61e-06, step=6687] Training: 67%|██████▋ | 6688/10000 [1:28:14<35:52, 1.54it/s, loss=0.0309, lr=8.60e-06, step=6688] Training: 67%|██████▋ | 6689/10000 [1:28:15<33:45, 1.63it/s, loss=0.0309, lr=8.60e-06, step=6688] Training: 67%|██████▋ | 6689/10000 [1:28:15<33:45, 1.63it/s, loss=0.0112, lr=8.60e-06, step=6689]20:12:47.894 [I] step=6690 loss=0.0058 smoothed_loss=0.0115 lr=8.61e-06 grad_norm=0.4814 step_time=0.5053s data_time=0.0812s it/s=1.706 eta_to_10000=1940.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.1204 grad_arm_token_fuse=0.0686 grad_shared_expert=0.8130 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6690/10000 [1:28:16<32:37, 1.69it/s, loss=0.0112, lr=8.60e-06, step=6689] Training: 67%|██████▋ | 6690/10000 [1:28:16<32:37, 1.69it/s, loss=0.0058, lr=8.60e-06, step=6690] Training: 67%|██████▋ | 6691/10000 [1:28:16<31:12, 1.77it/s, loss=0.0058, lr=8.60e-06, step=6690] Training: 67%|██████▋ | 6691/10000 [1:28:16<31:12, 1.77it/s, loss=0.0023, lr=8.59e-06, step=6691] Training: 67%|██████▋ | 6692/10000 [1:28:17<30:51, 1.79it/s, loss=0.0023, lr=8.59e-06, step=6691] Training: 67%|██████▋ | 6692/10000 [1:28:17<30:51, 1.79it/s, loss=0.0183, lr=8.59e-06, step=6692] Training: 67%|██████▋ | 6693/10000 [1:28:17<33:38, 1.64it/s, loss=0.0183, lr=8.59e-06, step=6692] Training: 67%|██████▋ | 6693/10000 [1:28:17<33:38, 1.64it/s, loss=0.0036, lr=8.59e-06, step=6693] Training: 67%|██████▋ | 6694/10000 [1:28:18<32:36, 1.69it/s, loss=0.0036, lr=8.59e-06, step=6693] Training: 67%|██████▋ | 6694/10000 [1:28:18<32:36, 1.69it/s, loss=0.0017, lr=8.58e-06, step=6694] Training: 67%|██████▋ | 6695/10000 [1:28:19<34:24, 1.60it/s, loss=0.0017, lr=8.58e-06, step=6694] Training: 67%|██████▋ | 6695/10000 [1:28:19<34:24, 1.60it/s, loss=0.0078, lr=8.58e-06, step=6695] Training: 67%|██████▋ | 6696/10000 [1:28:19<32:13, 1.71it/s, loss=0.0078, lr=8.58e-06, step=6695] Training: 67%|██████▋ | 6696/10000 [1:28:19<32:13, 1.71it/s, loss=0.0016, lr=8.58e-06, step=6696] Training: 67%|██████▋ | 6697/10000 [1:28:20<32:09, 1.71it/s, loss=0.0016, lr=8.58e-06, step=6696] Training: 67%|██████▋ | 6697/10000 [1:28:20<32:09, 1.71it/s, loss=0.0170, lr=8.57e-06, step=6697] Training: 67%|██████▋ | 6698/10000 [1:28:20<31:09, 1.77it/s, loss=0.0170, lr=8.57e-06, step=6697] Training: 67%|██████▋ | 6698/10000 [1:28:20<31:09, 1.77it/s, loss=0.0060, lr=8.57e-06, step=6698] Training: 67%|██████▋ | 6699/10000 [1:28:21<30:48, 1.79it/s, loss=0.0060, lr=8.57e-06, step=6698] Training: 67%|██████▋ | 6699/10000 [1:28:21<30:48, 1.79it/s, loss=0.0007, lr=8.57e-06, step=6699]20:12:53.874 [I] step=6700 loss=0.0127 smoothed_loss=0.0088 lr=8.58e-06 grad_norm=0.4774 step_time=0.5232s data_time=0.0748s it/s=1.673 eta_to_10000=1972.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0363 grad_action_out_proj_arms=0.2019 grad_arm_token_fuse=0.1818 grad_shared_expert=0.4763 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6700/10000 [1:28:22<34:50, 1.58it/s, loss=0.0007, lr=8.57e-06, step=6699] Training: 67%|██████▋ | 6700/10000 [1:28:22<34:50, 1.58it/s, loss=0.0127, lr=8.56e-06, step=6700] Training: 67%|██████▋ | 6701/10000 [1:28:22<32:31, 1.69it/s, loss=0.0127, lr=8.56e-06, step=6700] Training: 67%|██████▋ | 6701/10000 [1:28:22<32:31, 1.69it/s, loss=0.0225, lr=8.56e-06, step=6701] Training: 67%|██████▋ | 6702/10000 [1:28:23<33:16, 1.65it/s, loss=0.0225, lr=8.56e-06, step=6701] Training: 67%|██████▋ | 6702/10000 [1:28:23<33:16, 1.65it/s, loss=0.0253, lr=8.56e-06, step=6702] Training: 67%|██████▋ | 6703/10000 [1:28:23<32:14, 1.70it/s, loss=0.0253, lr=8.56e-06, step=6702] Training: 67%|██████▋ | 6703/10000 [1:28:23<32:14, 1.70it/s, loss=0.0162, lr=8.55e-06, step=6703] Training: 67%|██████▋ | 6704/10000 [1:28:24<30:57, 1.77it/s, loss=0.0162, lr=8.55e-06, step=6703] Training: 67%|██████▋ | 6704/10000 [1:28:24<30:57, 1.77it/s, loss=0.0076, lr=8.55e-06, step=6704] Training: 67%|██████▋ | 6705/10000 [1:28:24<29:55, 1.84it/s, loss=0.0076, lr=8.55e-06, step=6704] Training: 67%|██████▋ | 6705/10000 [1:28:24<29:55, 1.84it/s, loss=0.0049, lr=8.55e-06, step=6705] Training: 67%|██████▋ | 6706/10000 [1:28:25<30:07, 1.82it/s, loss=0.0049, lr=8.55e-06, step=6705] Training: 67%|██████▋ | 6706/10000 [1:28:25<30:07, 1.82it/s, loss=0.0028, lr=8.54e-06, step=6706] Training: 67%|██████▋ | 6707/10000 [1:28:25<32:39, 1.68it/s, loss=0.0028, lr=8.54e-06, step=6706] Training: 67%|██████▋ | 6707/10000 [1:28:25<32:39, 1.68it/s, loss=0.0111, lr=8.54e-06, step=6707] Training: 67%|██████▋ | 6708/10000 [1:28:26<31:00, 1.77it/s, loss=0.0111, lr=8.54e-06, step=6707] Training: 67%|██████▋ | 6708/10000 [1:28:26<31:00, 1.77it/s, loss=0.0035, lr=8.54e-06, step=6708] Training: 67%|██████▋ | 6709/10000 [1:28:26<29:50, 1.84it/s, loss=0.0035, lr=8.54e-06, step=6708] Training: 67%|██████▋ | 6709/10000 [1:28:26<29:50, 1.84it/s, loss=0.0035, lr=8.53e-06, step=6709]20:12:59.497 [I] step=6710 loss=0.0159 smoothed_loss=0.0097 lr=8.55e-06 grad_norm=0.4082 step_time=0.4970s data_time=0.0654s it/s=1.778 eta_to_10000=1850.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.1088 grad_arm_token_fuse=0.0436 grad_shared_expert=0.4644 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6710/10000 [1:28:27<32:11, 1.70it/s, loss=0.0035, lr=8.53e-06, step=6709] Training: 67%|██████▋ | 6710/10000 [1:28:27<32:11, 1.70it/s, loss=0.0159, lr=8.53e-06, step=6710] Training: 67%|██████▋ | 6711/10000 [1:28:28<30:49, 1.78it/s, loss=0.0159, lr=8.53e-06, step=6710] Training: 67%|██████▋ | 6711/10000 [1:28:28<30:49, 1.78it/s, loss=0.0184, lr=8.53e-06, step=6711] Training: 67%|██████▋ | 6712/10000 [1:28:28<30:33, 1.79it/s, loss=0.0184, lr=8.53e-06, step=6711] Training: 67%|██████▋ | 6712/10000 [1:28:28<30:33, 1.79it/s, loss=0.0313, lr=8.52e-06, step=6712] Training: 67%|██████▋ | 6713/10000 [1:28:29<29:37, 1.85it/s, loss=0.0313, lr=8.52e-06, step=6712] Training: 67%|██████▋ | 6713/10000 [1:28:29<29:37, 1.85it/s, loss=0.0111, lr=8.52e-06, step=6713] Training: 67%|██████▋ | 6714/10000 [1:28:29<31:45, 1.72it/s, loss=0.0111, lr=8.52e-06, step=6713] Training: 67%|██████▋ | 6714/10000 [1:28:29<31:45, 1.72it/s, loss=0.0091, lr=8.52e-06, step=6714] Training: 67%|██████▋ | 6715/10000 [1:28:30<35:01, 1.56it/s, loss=0.0091, lr=8.52e-06, step=6714] Training: 67%|██████▋ | 6715/10000 [1:28:30<35:01, 1.56it/s, loss=0.0025, lr=8.51e-06, step=6715] Training: 67%|██████▋ | 6716/10000 [1:28:31<32:34, 1.68it/s, loss=0.0025, lr=8.51e-06, step=6715] Training: 67%|██████▋ | 6716/10000 [1:28:31<32:34, 1.68it/s, loss=0.0344, lr=8.51e-06, step=6716] Training: 67%|██████▋ | 6717/10000 [1:28:31<33:26, 1.64it/s, loss=0.0344, lr=8.51e-06, step=6716] Training: 67%|██████▋ | 6717/10000 [1:28:31<33:26, 1.64it/s, loss=0.0044, lr=8.51e-06, step=6717] Training: 67%|██████▋ | 6718/10000 [1:28:32<33:17, 1.64it/s, loss=0.0044, lr=8.51e-06, step=6717] Training: 67%|██████▋ | 6718/10000 [1:28:32<33:17, 1.64it/s, loss=0.0034, lr=8.50e-06, step=6718] Training: 67%|██████▋ | 6719/10000 [1:28:32<31:33, 1.73it/s, loss=0.0034, lr=8.50e-06, step=6718] Training: 67%|██████▋ | 6719/10000 [1:28:32<31:33, 1.73it/s, loss=0.0162, lr=8.50e-06, step=6719]20:13:05.269 [I] step=6720 loss=0.0021 smoothed_loss=0.0111 lr=8.51e-06 grad_norm=0.4422 step_time=0.5081s data_time=0.0690s it/s=1.733 eta_to_10000=1892.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0085 grad_action_out_proj_arms=0.0778 grad_arm_token_fuse=0.0444 grad_shared_expert=0.5251 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6720/10000 [1:28:33<30:38, 1.78it/s, loss=0.0162, lr=8.50e-06, step=6719] Training: 67%|██████▋ | 6720/10000 [1:28:33<30:38, 1.78it/s, loss=0.0021, lr=8.50e-06, step=6720] Training: 67%|██████▋ | 6721/10000 [1:28:34<32:33, 1.68it/s, loss=0.0021, lr=8.50e-06, step=6720] Training: 67%|██████▋ | 6721/10000 [1:28:34<32:33, 1.68it/s, loss=0.0100, lr=8.49e-06, step=6721] Training: 67%|██████▋ | 6722/10000 [1:28:35<38:10, 1.43it/s, loss=0.0100, lr=8.49e-06, step=6721] Training: 67%|██████▋ | 6722/10000 [1:28:35<38:10, 1.43it/s, loss=0.0063, lr=8.49e-06, step=6722] Training: 67%|██████▋ | 6723/10000 [1:28:35<34:50, 1.57it/s, loss=0.0063, lr=8.49e-06, step=6722] Training: 67%|██████▋ | 6723/10000 [1:28:35<34:50, 1.57it/s, loss=0.0092, lr=8.49e-06, step=6723] Training: 67%|██████▋ | 6724/10000 [1:28:36<35:01, 1.56it/s, loss=0.0092, lr=8.49e-06, step=6723] Training: 67%|██████▋ | 6724/10000 [1:28:36<35:01, 1.56it/s, loss=0.0127, lr=8.48e-06, step=6724] Training: 67%|██████▋ | 6725/10000 [1:28:36<32:22, 1.69it/s, loss=0.0127, lr=8.48e-06, step=6724] Training: 67%|██████▋ | 6725/10000 [1:28:36<32:22, 1.69it/s, loss=0.0061, lr=8.48e-06, step=6725] Training: 67%|██████▋ | 6726/10000 [1:28:37<31:00, 1.76it/s, loss=0.0061, lr=8.48e-06, step=6725] Training: 67%|██████▋ | 6726/10000 [1:28:37<31:00, 1.76it/s, loss=0.0039, lr=8.48e-06, step=6726] Training: 67%|██████▋ | 6727/10000 [1:28:37<30:06, 1.81it/s, loss=0.0039, lr=8.48e-06, step=6726] Training: 67%|██████▋ | 6727/10000 [1:28:37<30:06, 1.81it/s, loss=0.0124, lr=8.47e-06, step=6727] Training: 67%|██████▋ | 6728/10000 [1:28:38<31:57, 1.71it/s, loss=0.0124, lr=8.47e-06, step=6727] Training: 67%|██████▋ | 6728/10000 [1:28:38<31:57, 1.71it/s, loss=0.0082, lr=8.47e-06, step=6728] Training: 67%|██████▋ | 6729/10000 [1:28:39<35:57, 1.52it/s, loss=0.0082, lr=8.47e-06, step=6728] Training: 67%|██████▋ | 6729/10000 [1:28:39<35:57, 1.52it/s, loss=0.0155, lr=8.47e-06, step=6729]20:13:11.593 [I] step=6730 loss=0.0052 smoothed_loss=0.0098 lr=8.48e-06 grad_norm=0.4007 step_time=0.5648s data_time=0.0676s it/s=1.581 eta_to_10000=2067.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0080 grad_action_out_proj_arms=0.0705 grad_arm_token_fuse=0.0414 grad_shared_expert=0.2591 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6730/10000 [1:28:39<34:19, 1.59it/s, loss=0.0155, lr=8.47e-06, step=6729] Training: 67%|██████▋ | 6730/10000 [1:28:39<34:19, 1.59it/s, loss=0.0052, lr=8.46e-06, step=6730] Training: 67%|██████▋ | 6731/10000 [1:28:40<31:57, 1.70it/s, loss=0.0052, lr=8.46e-06, step=6730] Training: 67%|██████▋ | 6731/10000 [1:28:40<31:57, 1.70it/s, loss=0.0153, lr=8.46e-06, step=6731] Training: 67%|██████▋ | 6732/10000 [1:28:40<33:37, 1.62it/s, loss=0.0153, lr=8.46e-06, step=6731] Training: 67%|██████▋ | 6732/10000 [1:28:40<33:37, 1.62it/s, loss=0.0046, lr=8.46e-06, step=6732] Training: 67%|██████▋ | 6733/10000 [1:28:41<31:35, 1.72it/s, loss=0.0046, lr=8.46e-06, step=6732] Training: 67%|██████▋ | 6733/10000 [1:28:41<31:35, 1.72it/s, loss=0.0026, lr=8.45e-06, step=6733] Training: 67%|██████▋ | 6734/10000 [1:28:41<30:02, 1.81it/s, loss=0.0026, lr=8.45e-06, step=6733] Training: 67%|██████▋ | 6734/10000 [1:28:41<30:02, 1.81it/s, loss=0.0549, lr=8.45e-06, step=6734] Training: 67%|██████▋ | 6735/10000 [1:28:42<29:57, 1.82it/s, loss=0.0549, lr=8.45e-06, step=6734] Training: 67%|██████▋ | 6735/10000 [1:28:42<29:57, 1.82it/s, loss=0.0083, lr=8.45e-06, step=6735] Training: 67%|██████▋ | 6736/10000 [1:28:43<32:36, 1.67it/s, loss=0.0083, lr=8.45e-06, step=6735] Training: 67%|██████▋ | 6736/10000 [1:28:43<32:36, 1.67it/s, loss=0.0081, lr=8.44e-06, step=6736] Training: 67%|██████▋ | 6737/10000 [1:28:43<31:22, 1.73it/s, loss=0.0081, lr=8.44e-06, step=6736] Training: 67%|██████▋ | 6737/10000 [1:28:43<31:22, 1.73it/s, loss=0.0108, lr=8.44e-06, step=6737] Training: 67%|██████▋ | 6738/10000 [1:28:44<33:44, 1.61it/s, loss=0.0108, lr=8.44e-06, step=6737] Training: 67%|██████▋ | 6738/10000 [1:28:44<33:44, 1.61it/s, loss=0.0424, lr=8.44e-06, step=6738] Training: 67%|██████▋ | 6739/10000 [1:28:45<35:22, 1.54it/s, loss=0.0424, lr=8.44e-06, step=6738] Training: 67%|██████▋ | 6739/10000 [1:28:45<35:22, 1.54it/s, loss=0.0104, lr=8.43e-06, step=6739]20:13:17.528 [I] step=6740 loss=0.0159 smoothed_loss=0.0150 lr=8.45e-06 grad_norm=0.5282 step_time=0.5116s data_time=0.0819s it/s=1.685 eta_to_10000=1934.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0251 grad_action_out_proj_arms=0.1130 grad_arm_token_fuse=0.1272 grad_shared_expert=0.3545 (18633:train_pytorch.py:850) + Training: 67%|██████▋ | 6740/10000 [1:28:45<33:44, 1.61it/s, loss=0.0104, lr=8.43e-06, step=6739] Training: 67%|██████▋ | 6740/10000 [1:28:45<33:44, 1.61it/s, loss=0.0159, lr=8.43e-06, step=6740] Training: 67%|██████▋ | 6741/10000 [1:28:46<31:35, 1.72it/s, loss=0.0159, lr=8.43e-06, step=6740] Training: 67%|██████▋ | 6741/10000 [1:28:46<31:35, 1.72it/s, loss=0.0113, lr=8.43e-06, step=6741] Training: 67%|██████▋ | 6742/10000 [1:28:46<30:01, 1.81it/s, loss=0.0113, lr=8.43e-06, step=6741] Training: 67%|██████▋ | 6742/10000 [1:28:46<30:01, 1.81it/s, loss=0.0281, lr=8.43e-06, step=6742] Training: 67%|██████▋ | 6743/10000 [1:28:47<34:47, 1.56it/s, loss=0.0281, lr=8.43e-06, step=6742] Training: 67%|██████▋ | 6743/10000 [1:28:47<34:47, 1.56it/s, loss=0.0078, lr=8.42e-06, step=6743] Training: 67%|██████▋ | 6744/10000 [1:28:48<33:25, 1.62it/s, loss=0.0078, lr=8.42e-06, step=6743] Training: 67%|██████▋ | 6744/10000 [1:28:48<33:25, 1.62it/s, loss=0.0084, lr=8.42e-06, step=6744] Training: 67%|██████▋ | 6745/10000 [1:28:48<34:07, 1.59it/s, loss=0.0084, lr=8.42e-06, step=6744] Training: 67%|██████▋ | 6745/10000 [1:28:48<34:07, 1.59it/s, loss=0.0040, lr=8.42e-06, step=6745] Training: 67%|██████▋ | 6746/10000 [1:28:49<32:07, 1.69it/s, loss=0.0040, lr=8.42e-06, step=6745] Training: 67%|██████▋ | 6746/10000 [1:28:49<32:07, 1.69it/s, loss=0.0071, lr=8.41e-06, step=6746] Training: 67%|██████▋ | 6747/10000 [1:28:49<32:31, 1.67it/s, loss=0.0071, lr=8.41e-06, step=6746] Training: 67%|██████▋ | 6747/10000 [1:28:49<32:31, 1.67it/s, loss=0.0065, lr=8.41e-06, step=6747] Training: 67%|██████▋ | 6748/10000 [1:28:50<30:47, 1.76it/s, loss=0.0065, lr=8.41e-06, step=6747] Training: 67%|██████▋ | 6748/10000 [1:28:50<30:47, 1.76it/s, loss=0.0116, lr=8.41e-06, step=6748] Training: 67%|██████▋ | 6749/10000 [1:28:50<29:30, 1.84it/s, loss=0.0116, lr=8.41e-06, step=6748] Training: 67%|██████▋ | 6749/10000 [1:28:50<29:30, 1.84it/s, loss=0.0145, lr=8.40e-06, step=6749]20:13:23.503 [I] step=6750 loss=0.0017 smoothed_loss=0.0113 lr=8.41e-06 grad_norm=0.6342 step_time=0.5149s data_time=0.0826s it/s=1.674 eta_to_10000=1941.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0166 grad_action_out_proj_arms=0.1336 grad_arm_token_fuse=0.0851 grad_shared_expert=0.3396 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6750/10000 [1:28:51<34:05, 1.59it/s, loss=0.0145, lr=8.40e-06, step=6749] Training: 68%|██████▊ | 6750/10000 [1:28:51<34:05, 1.59it/s, loss=0.0017, lr=8.40e-06, step=6750] Training: 68%|██████▊ | 6751/10000 [1:28:52<31:42, 1.71it/s, loss=0.0017, lr=8.40e-06, step=6750] Training: 68%|██████▊ | 6751/10000 [1:28:52<31:42, 1.71it/s, loss=0.0064, lr=8.40e-06, step=6751] Training: 68%|██████▊ | 6752/10000 [1:28:52<33:04, 1.64it/s, loss=0.0064, lr=8.40e-06, step=6751] Training: 68%|██████▊ | 6752/10000 [1:28:52<33:04, 1.64it/s, loss=0.0209, lr=8.39e-06, step=6752] Training: 68%|██████▊ | 6753/10000 [1:28:53<31:36, 1.71it/s, loss=0.0209, lr=8.39e-06, step=6752] Training: 68%|██████▊ | 6753/10000 [1:28:53<31:36, 1.71it/s, loss=0.0060, lr=8.39e-06, step=6753] Training: 68%|██████▊ | 6754/10000 [1:28:53<30:42, 1.76it/s, loss=0.0060, lr=8.39e-06, step=6753] Training: 68%|██████▊ | 6754/10000 [1:28:53<30:42, 1.76it/s, loss=0.0055, lr=8.39e-06, step=6754] Training: 68%|██████▊ | 6755/10000 [1:28:54<29:25, 1.84it/s, loss=0.0055, lr=8.39e-06, step=6754] Training: 68%|██████▊ | 6755/10000 [1:28:54<29:25, 1.84it/s, loss=0.0216, lr=8.38e-06, step=6755] Training: 68%|██████▊ | 6756/10000 [1:28:54<30:20, 1.78it/s, loss=0.0216, lr=8.38e-06, step=6755] Training: 68%|██████▊ | 6756/10000 [1:28:54<30:20, 1.78it/s, loss=0.0054, lr=8.38e-06, step=6756] Training: 68%|██████▊ | 6757/10000 [1:28:55<32:28, 1.66it/s, loss=0.0054, lr=8.38e-06, step=6756] Training: 68%|██████▊ | 6757/10000 [1:28:55<32:28, 1.66it/s, loss=0.0051, lr=8.38e-06, step=6757] Training: 68%|██████▊ | 6758/10000 [1:28:56<30:53, 1.75it/s, loss=0.0051, lr=8.38e-06, step=6757] Training: 68%|██████▊ | 6758/10000 [1:28:56<30:53, 1.75it/s, loss=0.0039, lr=8.37e-06, step=6758] Training: 68%|██████▊ | 6759/10000 [1:28:56<34:01, 1.59it/s, loss=0.0039, lr=8.37e-06, step=6758] Training: 68%|██████▊ | 6759/10000 [1:28:56<34:01, 1.59it/s, loss=0.0055, lr=8.37e-06, step=6759]20:13:29.309 [I] step=6760 loss=0.0088 smoothed_loss=0.0094 lr=8.38e-06 grad_norm=0.4380 step_time=0.5037s data_time=0.0770s it/s=1.723 eta_to_10000=1881.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0094 grad_action_out_proj_arms=0.1591 grad_arm_token_fuse=0.0479 grad_shared_expert=0.4687 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6760/10000 [1:28:57<32:42, 1.65it/s, loss=0.0055, lr=8.37e-06, step=6759] Training: 68%|██████▊ | 6760/10000 [1:28:57<32:42, 1.65it/s, loss=0.0088, lr=8.37e-06, step=6760] Training: 68%|██████▊ | 6761/10000 [1:28:57<30:51, 1.75it/s, loss=0.0088, lr=8.37e-06, step=6760] Training: 68%|██████▊ | 6761/10000 [1:28:57<30:51, 1.75it/s, loss=0.0020, lr=8.36e-06, step=6761] Training: 68%|██████▊ | 6762/10000 [1:28:58<30:11, 1.79it/s, loss=0.0020, lr=8.36e-06, step=6761] Training: 68%|██████▊ | 6762/10000 [1:28:58<30:11, 1.79it/s, loss=0.0100, lr=8.36e-06, step=6762] Training: 68%|██████▊ | 6763/10000 [1:28:59<31:33, 1.71it/s, loss=0.0100, lr=8.36e-06, step=6762] Training: 68%|██████▊ | 6763/10000 [1:28:59<31:33, 1.71it/s, loss=0.0145, lr=8.36e-06, step=6763] Training: 68%|██████▊ | 6764/10000 [1:28:59<33:43, 1.60it/s, loss=0.0145, lr=8.36e-06, step=6763] Training: 68%|██████▊ | 6764/10000 [1:28:59<33:43, 1.60it/s, loss=0.0079, lr=8.35e-06, step=6764] Training: 68%|██████▊ | 6765/10000 [1:29:00<35:54, 1.50it/s, loss=0.0079, lr=8.35e-06, step=6764] Training: 68%|██████▊ | 6765/10000 [1:29:00<35:54, 1.50it/s, loss=0.0060, lr=8.35e-06, step=6765] Training: 68%|██████▊ | 6766/10000 [1:29:01<36:10, 1.49it/s, loss=0.0060, lr=8.35e-06, step=6765] Training: 68%|██████▊ | 6766/10000 [1:29:01<36:10, 1.49it/s, loss=0.0189, lr=8.35e-06, step=6766] Training: 68%|██████▊ | 6767/10000 [1:29:01<33:50, 1.59it/s, loss=0.0189, lr=8.35e-06, step=6766] Training: 68%|██████▊ | 6767/10000 [1:29:01<33:50, 1.59it/s, loss=0.0143, lr=8.34e-06, step=6767] Training: 68%|██████▊ | 6768/10000 [1:29:02<31:34, 1.71it/s, loss=0.0143, lr=8.34e-06, step=6767] Training: 68%|██████▊ | 6768/10000 [1:29:02<31:34, 1.71it/s, loss=0.0204, lr=8.34e-06, step=6768] Training: 68%|██████▊ | 6769/10000 [1:29:02<30:19, 1.78it/s, loss=0.0204, lr=8.34e-06, step=6768] Training: 68%|██████▊ | 6769/10000 [1:29:02<30:19, 1.78it/s, loss=0.0400, lr=8.34e-06, step=6769]20:13:35.261 [I] step=6770 loss=0.0253 smoothed_loss=0.0153 lr=8.35e-06 grad_norm=0.5013 step_time=0.5140s data_time=0.0812s it/s=1.680 eta_to_10000=1922.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0076 grad_action_out_proj_arms=0.1115 grad_arm_token_fuse=0.0432 grad_shared_expert=0.6409 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6770/10000 [1:29:03<30:50, 1.75it/s, loss=0.0400, lr=8.34e-06, step=6769] Training: 68%|██████▊ | 6770/10000 [1:29:03<30:50, 1.75it/s, loss=0.0253, lr=8.33e-06, step=6770] Training: 68%|██████▊ | 6771/10000 [1:29:04<32:07, 1.68it/s, loss=0.0253, lr=8.33e-06, step=6770] Training: 68%|██████▊ | 6771/10000 [1:29:04<32:07, 1.68it/s, loss=0.0072, lr=8.33e-06, step=6771] Training: 68%|██████▊ | 6772/10000 [1:29:04<35:42, 1.51it/s, loss=0.0072, lr=8.33e-06, step=6771] Training: 68%|██████▊ | 6772/10000 [1:29:04<35:42, 1.51it/s, loss=0.0267, lr=8.33e-06, step=6772] Training: 68%|██████▊ | 6773/10000 [1:29:05<38:04, 1.41it/s, loss=0.0267, lr=8.33e-06, step=6772] Training: 68%|██████▊ | 6773/10000 [1:29:05<38:04, 1.41it/s, loss=0.0022, lr=8.32e-06, step=6773] Training: 68%|██████▊ | 6774/10000 [1:29:06<34:49, 1.54it/s, loss=0.0022, lr=8.32e-06, step=6773] Training: 68%|██████▊ | 6774/10000 [1:29:06<34:49, 1.54it/s, loss=0.0245, lr=8.32e-06, step=6774] Training: 68%|██████▊ | 6775/10000 [1:29:06<32:45, 1.64it/s, loss=0.0245, lr=8.32e-06, step=6774] Training: 68%|██████▊ | 6775/10000 [1:29:06<32:45, 1.64it/s, loss=0.0227, lr=8.32e-06, step=6775] Training: 68%|██████▊ | 6776/10000 [1:29:07<33:36, 1.60it/s, loss=0.0227, lr=8.32e-06, step=6775] Training: 68%|██████▊ | 6776/10000 [1:29:07<33:36, 1.60it/s, loss=0.0054, lr=8.31e-06, step=6776] Training: 68%|██████▊ | 6777/10000 [1:29:07<32:06, 1.67it/s, loss=0.0054, lr=8.31e-06, step=6776] Training: 68%|██████▊ | 6777/10000 [1:29:07<32:06, 1.67it/s, loss=0.0013, lr=8.31e-06, step=6777] Training: 68%|██████▊ | 6778/10000 [1:29:08<34:18, 1.57it/s, loss=0.0013, lr=8.31e-06, step=6777] Training: 68%|██████▊ | 6778/10000 [1:29:08<34:18, 1.57it/s, loss=0.0026, lr=8.31e-06, step=6778] Training: 68%|██████▊ | 6779/10000 [1:29:09<36:09, 1.48it/s, loss=0.0026, lr=8.31e-06, step=6778] Training: 68%|██████▊ | 6779/10000 [1:29:09<36:09, 1.48it/s, loss=0.0053, lr=8.30e-06, step=6779]20:13:42.026 [I] step=6780 loss=0.0434 smoothed_loss=0.0150 lr=8.32e-06 grad_norm=0.4335 step_time=0.5855s data_time=0.0910s it/s=1.478 eta_to_10000=2178.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0210 grad_action_out_proj_arms=0.1611 grad_arm_token_fuse=0.1125 grad_shared_expert=0.4475 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6780/10000 [1:29:10<37:42, 1.42it/s, loss=0.0053, lr=8.30e-06, step=6779] Training: 68%|██████▊ | 6780/10000 [1:29:10<37:42, 1.42it/s, loss=0.0434, lr=8.30e-06, step=6780] Training: 68%|██████▊ | 6781/10000 [1:29:10<34:25, 1.56it/s, loss=0.0434, lr=8.30e-06, step=6780] Training: 68%|██████▊ | 6781/10000 [1:29:10<34:25, 1.56it/s, loss=0.0273, lr=8.30e-06, step=6781] Training: 68%|██████▊ | 6782/10000 [1:29:11<31:59, 1.68it/s, loss=0.0273, lr=8.30e-06, step=6781] Training: 68%|██████▊ | 6782/10000 [1:29:11<31:59, 1.68it/s, loss=0.0202, lr=8.29e-06, step=6782] Training: 68%|██████▊ | 6783/10000 [1:29:11<30:36, 1.75it/s, loss=0.0202, lr=8.29e-06, step=6782] Training: 68%|██████▊ | 6783/10000 [1:29:11<30:36, 1.75it/s, loss=0.0264, lr=8.29e-06, step=6783] Training: 68%|██████▊ | 6784/10000 [1:29:12<29:23, 1.82it/s, loss=0.0264, lr=8.29e-06, step=6783] Training: 68%|██████▊ | 6784/10000 [1:29:12<29:23, 1.82it/s, loss=0.0024, lr=8.29e-06, step=6784] Training: 68%|██████▊ | 6785/10000 [1:29:12<32:12, 1.66it/s, loss=0.0024, lr=8.29e-06, step=6784] Training: 68%|██████▊ | 6785/10000 [1:29:12<32:12, 1.66it/s, loss=0.0075, lr=8.28e-06, step=6785] Training: 68%|██████▊ | 6786/10000 [1:29:13<35:14, 1.52it/s, loss=0.0075, lr=8.28e-06, step=6785] Training: 68%|██████▊ | 6786/10000 [1:29:13<35:14, 1.52it/s, loss=0.0251, lr=8.28e-06, step=6786] Training: 68%|██████▊ | 6787/10000 [1:29:14<35:26, 1.51it/s, loss=0.0251, lr=8.28e-06, step=6786] Training: 68%|██████▊ | 6787/10000 [1:29:14<35:26, 1.51it/s, loss=0.0009, lr=8.28e-06, step=6787] Training: 68%|██████▊ | 6788/10000 [1:29:15<36:00, 1.49it/s, loss=0.0009, lr=8.28e-06, step=6787] Training: 68%|██████▊ | 6788/10000 [1:29:15<36:00, 1.49it/s, loss=0.0021, lr=8.27e-06, step=6788] Training: 68%|██████▊ | 6789/10000 [1:29:15<33:15, 1.61it/s, loss=0.0021, lr=8.27e-06, step=6788] Training: 68%|██████▊ | 6789/10000 [1:29:15<33:15, 1.61it/s, loss=0.0262, lr=8.27e-06, step=6789]20:13:47.937 [I] step=6790 loss=0.0137 smoothed_loss=0.0146 lr=8.28e-06 grad_norm=0.5102 step_time=0.5030s data_time=0.0880s it/s=1.692 eta_to_10000=1897.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0150 grad_action_out_proj_arms=0.0933 grad_arm_token_fuse=0.0783 grad_shared_expert=0.4340 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6790/10000 [1:29:16<31:44, 1.69it/s, loss=0.0262, lr=8.27e-06, step=6789] Training: 68%|██████▊ | 6790/10000 [1:29:16<31:44, 1.69it/s, loss=0.0137, lr=8.27e-06, step=6790] Training: 68%|██████▊ | 6791/10000 [1:29:16<31:46, 1.68it/s, loss=0.0137, lr=8.27e-06, step=6790] Training: 68%|██████▊ | 6791/10000 [1:29:16<31:46, 1.68it/s, loss=0.0042, lr=8.27e-06, step=6791] Training: 68%|██████▊ | 6792/10000 [1:29:17<30:20, 1.76it/s, loss=0.0042, lr=8.27e-06, step=6791] Training: 68%|██████▊ | 6792/10000 [1:29:17<30:20, 1.76it/s, loss=0.0117, lr=8.26e-06, step=6792] Training: 68%|██████▊ | 6793/10000 [1:29:17<33:28, 1.60it/s, loss=0.0117, lr=8.26e-06, step=6792] Training: 68%|██████▊ | 6793/10000 [1:29:17<33:28, 1.60it/s, loss=0.0039, lr=8.26e-06, step=6793] Training: 68%|██████▊ | 6794/10000 [1:29:18<33:55, 1.57it/s, loss=0.0039, lr=8.26e-06, step=6793] Training: 68%|██████▊ | 6794/10000 [1:29:18<33:55, 1.57it/s, loss=0.0142, lr=8.26e-06, step=6794] Training: 68%|██████▊ | 6795/10000 [1:29:19<31:27, 1.70it/s, loss=0.0142, lr=8.26e-06, step=6794] Training: 68%|██████▊ | 6795/10000 [1:29:19<31:27, 1.70it/s, loss=0.0026, lr=8.25e-06, step=6795] Training: 68%|██████▊ | 6796/10000 [1:29:19<30:06, 1.77it/s, loss=0.0026, lr=8.25e-06, step=6795] Training: 68%|██████▊ | 6796/10000 [1:29:19<30:06, 1.77it/s, loss=0.0036, lr=8.25e-06, step=6796] Training: 68%|██████▊ | 6797/10000 [1:29:20<28:52, 1.85it/s, loss=0.0036, lr=8.25e-06, step=6796] Training: 68%|██████▊ | 6797/10000 [1:29:20<28:52, 1.85it/s, loss=0.0117, lr=8.25e-06, step=6797] Training: 68%|██████▊ | 6798/10000 [1:29:20<30:02, 1.78it/s, loss=0.0117, lr=8.25e-06, step=6797] Training: 68%|██████▊ | 6798/10000 [1:29:20<30:02, 1.78it/s, loss=0.0090, lr=8.24e-06, step=6798] Training: 68%|██████▊ | 6799/10000 [1:29:21<29:45, 1.79it/s, loss=0.0090, lr=8.24e-06, step=6798] Training: 68%|██████▊ | 6799/10000 [1:29:21<29:45, 1.79it/s, loss=0.0052, lr=8.24e-06, step=6799]20:13:53.845 [I] step=6800 loss=0.0082 smoothed_loss=0.0099 lr=8.25e-06 grad_norm=0.4941 step_time=0.5102s data_time=0.0806s it/s=1.693 eta_to_10000=1890.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0534 grad_action_out_proj_arms=0.2405 grad_arm_token_fuse=0.2738 grad_shared_expert=0.6699 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6800/10000 [1:29:22<32:53, 1.62it/s, loss=0.0052, lr=8.24e-06, step=6799] Training: 68%|██████▊ | 6800/10000 [1:29:22<32:53, 1.62it/s, loss=0.0082, lr=8.24e-06, step=6800] Training: 68%|██████▊ | 6801/10000 [1:29:22<34:09, 1.56it/s, loss=0.0082, lr=8.24e-06, step=6800] Training: 68%|██████▊ | 6801/10000 [1:29:22<34:09, 1.56it/s, loss=0.0043, lr=8.23e-06, step=6801] Training: 68%|██████▊ | 6802/10000 [1:29:23<36:08, 1.47it/s, loss=0.0043, lr=8.23e-06, step=6801] Training: 68%|██████▊ | 6802/10000 [1:29:23<36:08, 1.47it/s, loss=0.0054, lr=8.23e-06, step=6802] Training: 68%|██████▊ | 6803/10000 [1:29:24<33:50, 1.57it/s, loss=0.0054, lr=8.23e-06, step=6802] Training: 68%|██████▊ | 6803/10000 [1:29:24<33:50, 1.57it/s, loss=0.0083, lr=8.23e-06, step=6803] Training: 68%|██████▊ | 6804/10000 [1:29:24<31:41, 1.68it/s, loss=0.0083, lr=8.23e-06, step=6803] Training: 68%|██████▊ | 6804/10000 [1:29:24<31:41, 1.68it/s, loss=0.0210, lr=8.22e-06, step=6804] Training: 68%|██████▊ | 6805/10000 [1:29:25<32:15, 1.65it/s, loss=0.0210, lr=8.22e-06, step=6804] Training: 68%|██████▊ | 6805/10000 [1:29:25<32:15, 1.65it/s, loss=0.0050, lr=8.22e-06, step=6805] Training: 68%|██████▊ | 6806/10000 [1:29:25<30:44, 1.73it/s, loss=0.0050, lr=8.22e-06, step=6805] Training: 68%|██████▊ | 6806/10000 [1:29:25<30:44, 1.73it/s, loss=0.0272, lr=8.22e-06, step=6806] Training: 68%|██████▊ | 6807/10000 [1:29:26<33:14, 1.60it/s, loss=0.0272, lr=8.22e-06, step=6806] Training: 68%|██████▊ | 6807/10000 [1:29:26<33:14, 1.60it/s, loss=0.0017, lr=8.21e-06, step=6807] Training: 68%|██████▊ | 6808/10000 [1:29:27<36:18, 1.47it/s, loss=0.0017, lr=8.21e-06, step=6807] Training: 68%|██████▊ | 6808/10000 [1:29:27<36:18, 1.47it/s, loss=0.0045, lr=8.21e-06, step=6808] Training: 68%|██████▊ | 6809/10000 [1:29:27<34:52, 1.53it/s, loss=0.0045, lr=8.21e-06, step=6808] Training: 68%|██████▊ | 6809/10000 [1:29:27<34:52, 1.53it/s, loss=0.0082, lr=8.21e-06, step=6809]20:14:00.187 [I] step=6810 loss=0.0090 smoothed_loss=0.0096 lr=8.22e-06 grad_norm=0.4745 step_time=0.5325s data_time=0.1017s it/s=1.577 eta_to_10000=2022.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0208 grad_action_out_proj_arms=0.1308 grad_arm_token_fuse=0.1049 grad_shared_expert=0.6356 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6810/10000 [1:29:28<33:15, 1.60it/s, loss=0.0082, lr=8.21e-06, step=6809] Training: 68%|██████▊ | 6810/10000 [1:29:28<33:15, 1.60it/s, loss=0.0090, lr=8.20e-06, step=6810] Training: 68%|██████▊ | 6811/10000 [1:29:28<32:24, 1.64it/s, loss=0.0090, lr=8.20e-06, step=6810] Training: 68%|██████▊ | 6811/10000 [1:29:28<32:24, 1.64it/s, loss=0.0211, lr=8.20e-06, step=6811] Training: 68%|██████▊ | 6812/10000 [1:29:29<30:47, 1.73it/s, loss=0.0211, lr=8.20e-06, step=6811] Training: 68%|██████▊ | 6812/10000 [1:29:29<30:47, 1.73it/s, loss=0.0106, lr=8.20e-06, step=6812] Training: 68%|██████▊ | 6813/10000 [1:29:30<31:26, 1.69it/s, loss=0.0106, lr=8.20e-06, step=6812] Training: 68%|██████▊ | 6813/10000 [1:29:30<31:26, 1.69it/s, loss=0.0179, lr=8.19e-06, step=6813] Training: 68%|██████▊ | 6814/10000 [1:29:30<32:20, 1.64it/s, loss=0.0179, lr=8.19e-06, step=6813] Training: 68%|██████▊ | 6814/10000 [1:29:30<32:20, 1.64it/s, loss=0.0123, lr=8.19e-06, step=6814] Training: 68%|██████▊ | 6815/10000 [1:29:31<36:22, 1.46it/s, loss=0.0123, lr=8.19e-06, step=6814] Training: 68%|██████▊ | 6815/10000 [1:29:31<36:22, 1.46it/s, loss=0.0048, lr=8.19e-06, step=6815] Training: 68%|██████▊ | 6816/10000 [1:29:32<39:06, 1.36it/s, loss=0.0048, lr=8.19e-06, step=6815] Training: 68%|██████▊ | 6816/10000 [1:29:32<39:06, 1.36it/s, loss=0.0025, lr=8.18e-06, step=6816] Training: 68%|██████▊ | 6817/10000 [1:29:32<35:15, 1.50it/s, loss=0.0025, lr=8.18e-06, step=6816] Training: 68%|██████▊ | 6817/10000 [1:29:32<35:15, 1.50it/s, loss=0.0137, lr=8.18e-06, step=6817] Training: 68%|██████▊ | 6818/10000 [1:29:33<36:39, 1.45it/s, loss=0.0137, lr=8.18e-06, step=6817] Training: 68%|██████▊ | 6818/10000 [1:29:33<36:39, 1.45it/s, loss=0.0010, lr=8.18e-06, step=6818] Training: 68%|██████▊ | 6819/10000 [1:29:34<33:23, 1.59it/s, loss=0.0010, lr=8.18e-06, step=6818] Training: 68%|██████▊ | 6819/10000 [1:29:34<33:23, 1.59it/s, loss=0.0071, lr=8.17e-06, step=6819]20:14:06.530 [I] step=6820 loss=0.0107 smoothed_loss=0.0094 lr=8.19e-06 grad_norm=0.3717 step_time=0.5454s data_time=0.0889s it/s=1.577 eta_to_10000=2016.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0057 grad_action_out_proj_arms=0.0748 grad_arm_token_fuse=0.0286 grad_shared_expert=0.3363 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6820/10000 [1:29:34<31:51, 1.66it/s, loss=0.0071, lr=8.17e-06, step=6819] Training: 68%|██████▊ | 6820/10000 [1:29:34<31:51, 1.66it/s, loss=0.0107, lr=8.17e-06, step=6820] Training: 68%|██████▊ | 6821/10000 [1:29:35<32:16, 1.64it/s, loss=0.0107, lr=8.17e-06, step=6820] Training: 68%|██████▊ | 6821/10000 [1:29:35<32:16, 1.64it/s, loss=0.0082, lr=8.17e-06, step=6821] Training: 68%|██████▊ | 6822/10000 [1:29:36<33:54, 1.56it/s, loss=0.0082, lr=8.17e-06, step=6821] Training: 68%|██████▊ | 6822/10000 [1:29:36<33:54, 1.56it/s, loss=0.0027, lr=8.16e-06, step=6822] Training: 68%|██████▊ | 6823/10000 [1:29:36<37:37, 1.41it/s, loss=0.0027, lr=8.16e-06, step=6822] Training: 68%|██████▊ | 6823/10000 [1:29:36<37:37, 1.41it/s, loss=0.0329, lr=8.16e-06, step=6823] Training: 68%|██████▊ | 6824/10000 [1:29:37<34:09, 1.55it/s, loss=0.0329, lr=8.16e-06, step=6823] Training: 68%|██████▊ | 6824/10000 [1:29:37<34:09, 1.55it/s, loss=0.0111, lr=8.16e-06, step=6824] Training: 68%|██████▊ | 6825/10000 [1:29:37<31:51, 1.66it/s, loss=0.0111, lr=8.16e-06, step=6824] Training: 68%|██████▊ | 6825/10000 [1:29:37<31:51, 1.66it/s, loss=0.0217, lr=8.16e-06, step=6825] Training: 68%|██████▊ | 6826/10000 [1:29:38<34:19, 1.54it/s, loss=0.0217, lr=8.16e-06, step=6825] Training: 68%|██████▊ | 6826/10000 [1:29:38<34:19, 1.54it/s, loss=0.0233, lr=8.15e-06, step=6826] Training: 68%|██████▊ | 6827/10000 [1:29:39<33:28, 1.58it/s, loss=0.0233, lr=8.15e-06, step=6826] Training: 68%|██████▊ | 6827/10000 [1:29:39<33:28, 1.58it/s, loss=0.0069, lr=8.15e-06, step=6827] Training: 68%|██████▊ | 6828/10000 [1:29:40<36:06, 1.46it/s, loss=0.0069, lr=8.15e-06, step=6827] Training: 68%|██████▊ | 6828/10000 [1:29:40<36:06, 1.46it/s, loss=0.0094, lr=8.15e-06, step=6828] Training: 68%|██████▊ | 6829/10000 [1:29:40<37:24, 1.41it/s, loss=0.0094, lr=8.15e-06, step=6828] Training: 68%|██████▊ | 6829/10000 [1:29:40<37:24, 1.41it/s, loss=0.0068, lr=8.14e-06, step=6829]20:14:13.339 [I] step=6830 loss=0.0030 smoothed_loss=0.0109 lr=8.15e-06 grad_norm=0.4488 step_time=0.5859s data_time=0.0950s it/s=1.469 eta_to_10000=2158.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0053 grad_action_out_proj_arms=0.0732 grad_arm_token_fuse=0.0280 grad_shared_expert=0.2529 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6830/10000 [1:29:41<36:58, 1.43it/s, loss=0.0068, lr=8.14e-06, step=6829] Training: 68%|██████▊ | 6830/10000 [1:29:41<36:58, 1.43it/s, loss=0.0030, lr=8.14e-06, step=6830] Training: 68%|██████▊ | 6831/10000 [1:29:41<33:36, 1.57it/s, loss=0.0030, lr=8.14e-06, step=6830] Training: 68%|██████▊ | 6831/10000 [1:29:41<33:36, 1.57it/s, loss=0.0113, lr=8.14e-06, step=6831] Training: 68%|██████▊ | 6832/10000 [1:29:42<31:18, 1.69it/s, loss=0.0113, lr=8.14e-06, step=6831] Training: 68%|██████▊ | 6832/10000 [1:29:42<31:18, 1.69it/s, loss=0.0019, lr=8.13e-06, step=6832] Training: 68%|██████▊ | 6833/10000 [1:29:43<30:37, 1.72it/s, loss=0.0019, lr=8.13e-06, step=6832] Training: 68%|██████▊ | 6833/10000 [1:29:43<30:37, 1.72it/s, loss=0.0024, lr=8.13e-06, step=6833] Training: 68%|██████▊ | 6834/10000 [1:29:43<30:55, 1.71it/s, loss=0.0024, lr=8.13e-06, step=6833] Training: 68%|██████▊ | 6834/10000 [1:29:43<30:55, 1.71it/s, loss=0.0027, lr=8.13e-06, step=6834] Training: 68%|██████▊ | 6835/10000 [1:29:44<30:20, 1.74it/s, loss=0.0027, lr=8.13e-06, step=6834] Training: 68%|██████▊ | 6835/10000 [1:29:44<30:20, 1.74it/s, loss=0.0042, lr=8.12e-06, step=6835] Training: 68%|██████▊ | 6836/10000 [1:29:44<32:35, 1.62it/s, loss=0.0042, lr=8.12e-06, step=6835] Training: 68%|██████▊ | 6836/10000 [1:29:44<32:35, 1.62it/s, loss=0.0013, lr=8.12e-06, step=6836] Training: 68%|██████▊ | 6837/10000 [1:29:45<32:11, 1.64it/s, loss=0.0013, lr=8.12e-06, step=6836] Training: 68%|██████▊ | 6837/10000 [1:29:45<32:11, 1.64it/s, loss=0.0254, lr=8.12e-06, step=6837] Training: 68%|██████▊ | 6838/10000 [1:29:46<33:03, 1.59it/s, loss=0.0254, lr=8.12e-06, step=6837] Training: 68%|██████▊ | 6838/10000 [1:29:46<33:03, 1.59it/s, loss=0.0168, lr=8.11e-06, step=6838] Training: 68%|██████▊ | 6839/10000 [1:29:46<30:47, 1.71it/s, loss=0.0168, lr=8.11e-06, step=6838] Training: 68%|██████▊ | 6839/10000 [1:29:46<30:47, 1.71it/s, loss=0.0039, lr=8.11e-06, step=6839]20:14:19.144 [I] step=6840 loss=0.0166 smoothed_loss=0.0101 lr=8.12e-06 grad_norm=0.4312 step_time=0.5043s data_time=0.0762s it/s=1.723 eta_to_10000=1834.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0045 grad_action_out_proj_arms=0.0580 grad_arm_token_fuse=0.0229 grad_shared_expert=0.3524 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6840/10000 [1:29:47<32:01, 1.64it/s, loss=0.0039, lr=8.11e-06, step=6839] Training: 68%|██████▊ | 6840/10000 [1:29:47<32:01, 1.64it/s, loss=0.0166, lr=8.11e-06, step=6840] Training: 68%|██████▊ | 6841/10000 [1:29:47<30:27, 1.73it/s, loss=0.0166, lr=8.11e-06, step=6840] Training: 68%|██████▊ | 6841/10000 [1:29:47<30:27, 1.73it/s, loss=0.0017, lr=8.10e-06, step=6841] Training: 68%|██████▊ | 6842/10000 [1:29:48<29:20, 1.79it/s, loss=0.0017, lr=8.10e-06, step=6841] Training: 68%|██████▊ | 6842/10000 [1:29:48<29:20, 1.79it/s, loss=0.0112, lr=8.10e-06, step=6842] Training: 68%|██████▊ | 6843/10000 [1:29:49<32:17, 1.63it/s, loss=0.0112, lr=8.10e-06, step=6842] Training: 68%|██████▊ | 6843/10000 [1:29:49<32:17, 1.63it/s, loss=0.0053, lr=8.10e-06, step=6843] Training: 68%|██████▊ | 6844/10000 [1:29:49<30:23, 1.73it/s, loss=0.0053, lr=8.10e-06, step=6843] Training: 68%|██████▊ | 6844/10000 [1:29:49<30:23, 1.73it/s, loss=0.0036, lr=8.09e-06, step=6844] Training: 68%|██████▊ | 6845/10000 [1:29:50<31:18, 1.68it/s, loss=0.0036, lr=8.09e-06, step=6844] Training: 68%|██████▊ | 6845/10000 [1:29:50<31:18, 1.68it/s, loss=0.0055, lr=8.09e-06, step=6845] Training: 68%|██████▊ | 6846/10000 [1:29:50<29:44, 1.77it/s, loss=0.0055, lr=8.09e-06, step=6845] Training: 68%|██████▊ | 6846/10000 [1:29:50<29:44, 1.77it/s, loss=0.0039, lr=8.09e-06, step=6846] Training: 68%|██████▊ | 6847/10000 [1:29:51<28:29, 1.84it/s, loss=0.0039, lr=8.09e-06, step=6846] Training: 68%|██████▊ | 6847/10000 [1:29:51<28:29, 1.84it/s, loss=0.0090, lr=8.08e-06, step=6847] Training: 68%|██████▊ | 6848/10000 [1:29:51<27:32, 1.91it/s, loss=0.0090, lr=8.08e-06, step=6847] Training: 68%|██████▊ | 6848/10000 [1:29:51<27:32, 1.91it/s, loss=0.0018, lr=8.08e-06, step=6848] Training: 68%|██████▊ | 6849/10000 [1:29:52<27:37, 1.90it/s, loss=0.0018, lr=8.08e-06, step=6848] Training: 68%|██████▊ | 6849/10000 [1:29:52<27:37, 1.90it/s, loss=0.0243, lr=8.08e-06, step=6849]20:14:24.834 [I] step=6850 loss=0.0101 smoothed_loss=0.0091 lr=8.09e-06 grad_norm=0.4283 step_time=0.4966s data_time=0.0724s it/s=1.758 eta_to_10000=1791.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0154 grad_action_out_proj_arms=0.1467 grad_arm_token_fuse=0.0828 grad_shared_expert=0.4866 (18633:train_pytorch.py:850) + Training: 68%|██████▊ | 6850/10000 [1:29:53<31:55, 1.64it/s, loss=0.0243, lr=8.08e-06, step=6849] Training: 68%|██████▊ | 6850/10000 [1:29:53<31:55, 1.64it/s, loss=0.0101, lr=8.07e-06, step=6850] Training: 69%|██████▊ | 6851/10000 [1:29:53<29:52, 1.76it/s, loss=0.0101, lr=8.07e-06, step=6850] Training: 69%|██████▊ | 6851/10000 [1:29:53<29:52, 1.76it/s, loss=0.0079, lr=8.07e-06, step=6851] Training: 69%|██████▊ | 6852/10000 [1:29:54<30:59, 1.69it/s, loss=0.0079, lr=8.07e-06, step=6851] Training: 69%|██████▊ | 6852/10000 [1:29:54<30:59, 1.69it/s, loss=0.0104, lr=8.07e-06, step=6852] Training: 69%|██████▊ | 6853/10000 [1:29:54<29:55, 1.75it/s, loss=0.0104, lr=8.07e-06, step=6852] Training: 69%|██████▊ | 6853/10000 [1:29:54<29:55, 1.75it/s, loss=0.0204, lr=8.06e-06, step=6853] Training: 69%|██████▊ | 6854/10000 [1:29:55<30:42, 1.71it/s, loss=0.0204, lr=8.06e-06, step=6853] Training: 69%|██████▊ | 6854/10000 [1:29:55<30:42, 1.71it/s, loss=0.0093, lr=8.06e-06, step=6854] Training: 69%|██████▊ | 6855/10000 [1:29:55<30:25, 1.72it/s, loss=0.0093, lr=8.06e-06, step=6854] Training: 69%|██████▊ | 6855/10000 [1:29:55<30:25, 1.72it/s, loss=0.0041, lr=8.06e-06, step=6855] Training: 69%|██████▊ | 6856/10000 [1:29:56<29:52, 1.75it/s, loss=0.0041, lr=8.06e-06, step=6855] Training: 69%|██████▊ | 6856/10000 [1:29:56<29:52, 1.75it/s, loss=0.0447, lr=8.06e-06, step=6856] Training: 69%|██████▊ | 6857/10000 [1:29:57<32:03, 1.63it/s, loss=0.0447, lr=8.06e-06, step=6856] Training: 69%|██████▊ | 6857/10000 [1:29:57<32:03, 1.63it/s, loss=0.0029, lr=8.05e-06, step=6857] Training: 69%|██████▊ | 6858/10000 [1:29:57<31:07, 1.68it/s, loss=0.0029, lr=8.05e-06, step=6857] Training: 69%|██████▊ | 6858/10000 [1:29:57<31:07, 1.68it/s, loss=0.0084, lr=8.05e-06, step=6858] Training: 69%|██████▊ | 6859/10000 [1:29:58<29:33, 1.77it/s, loss=0.0084, lr=8.05e-06, step=6858] Training: 69%|██████▊ | 6859/10000 [1:29:58<29:33, 1.77it/s, loss=0.0017, lr=8.05e-06, step=6859]20:14:30.646 [I] step=6860 loss=0.0105 smoothed_loss=0.0107 lr=8.06e-06 grad_norm=0.4437 step_time=0.5071s data_time=0.0741s it/s=1.721 eta_to_10000=1824.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0046 grad_action_out_proj_arms=0.0619 grad_arm_token_fuse=0.0226 grad_shared_expert=0.2648 (18633:train_pytorch.py:850) + Training: 69%|██████▊ | 6860/10000 [1:29:58<31:18, 1.67it/s, loss=0.0017, lr=8.05e-06, step=6859] Training: 69%|██████▊ | 6860/10000 [1:29:58<31:18, 1.67it/s, loss=0.0105, lr=8.04e-06, step=6860] Training: 69%|██████▊ | 6861/10000 [1:29:59<29:38, 1.77it/s, loss=0.0105, lr=8.04e-06, step=6860] Training: 69%|██████▊ | 6861/10000 [1:29:59<29:38, 1.77it/s, loss=0.0072, lr=8.04e-06, step=6861] Training: 69%|██████▊ | 6862/10000 [1:29:59<29:27, 1.77it/s, loss=0.0072, lr=8.04e-06, step=6861] Training: 69%|██████▊ | 6862/10000 [1:29:59<29:27, 1.77it/s, loss=0.0021, lr=8.04e-06, step=6862] Training: 69%|██████▊ | 6863/10000 [1:30:00<28:41, 1.82it/s, loss=0.0021, lr=8.04e-06, step=6862] Training: 69%|██████▊ | 6863/10000 [1:30:00<28:41, 1.82it/s, loss=0.0536, lr=8.03e-06, step=6863] Training: 69%|██████▊ | 6864/10000 [1:30:01<30:17, 1.73it/s, loss=0.0536, lr=8.03e-06, step=6863] Training: 69%|██████▊ | 6864/10000 [1:30:01<30:17, 1.73it/s, loss=0.0061, lr=8.03e-06, step=6864] Training: 69%|██████▊ | 6865/10000 [1:30:01<34:37, 1.51it/s, loss=0.0061, lr=8.03e-06, step=6864] Training: 69%|██████▊ | 6865/10000 [1:30:01<34:37, 1.51it/s, loss=0.0010, lr=8.03e-06, step=6865] Training: 69%|██████▊ | 6866/10000 [1:30:02<32:22, 1.61it/s, loss=0.0010, lr=8.03e-06, step=6865] Training: 69%|██████▊ | 6866/10000 [1:30:02<32:22, 1.61it/s, loss=0.0021, lr=8.02e-06, step=6866] Training: 69%|██████▊ | 6867/10000 [1:30:02<30:42, 1.70it/s, loss=0.0021, lr=8.02e-06, step=6866] Training: 69%|██████▊ | 6867/10000 [1:30:02<30:42, 1.70it/s, loss=0.0056, lr=8.02e-06, step=6867] Training: 69%|██████▊ | 6868/10000 [1:30:03<33:45, 1.55it/s, loss=0.0056, lr=8.02e-06, step=6867] Training: 69%|██████▊ | 6868/10000 [1:30:03<33:45, 1.55it/s, loss=0.0059, lr=8.02e-06, step=6868] Training: 69%|██████▊ | 6869/10000 [1:30:04<31:30, 1.66it/s, loss=0.0059, lr=8.02e-06, step=6868] Training: 69%|██████▊ | 6869/10000 [1:30:04<31:30, 1.66it/s, loss=0.0017, lr=8.01e-06, step=6869]20:14:36.614 [I] step=6870 loss=0.0056 smoothed_loss=0.0088 lr=8.02e-06 grad_norm=0.4032 step_time=0.5158s data_time=0.0811s it/s=1.676 eta_to_10000=1867.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0234 grad_action_out_proj_arms=0.1154 grad_arm_token_fuse=0.1245 grad_shared_expert=0.3313 (18633:train_pytorch.py:850) + Training: 69%|██████▊ | 6870/10000 [1:30:04<31:03, 1.68it/s, loss=0.0017, lr=8.01e-06, step=6869] Training: 69%|██████▊ | 6870/10000 [1:30:04<31:03, 1.68it/s, loss=0.0056, lr=8.01e-06, step=6870] Training: 69%|██████▊ | 6871/10000 [1:30:05<31:39, 1.65it/s, loss=0.0056, lr=8.01e-06, step=6870] Training: 69%|██████▊ | 6871/10000 [1:30:05<31:39, 1.65it/s, loss=0.0018, lr=8.01e-06, step=6871] Training: 69%|██████▊ | 6872/10000 [1:30:06<34:06, 1.53it/s, loss=0.0018, lr=8.01e-06, step=6871] Training: 69%|██████▊ | 6872/10000 [1:30:06<34:06, 1.53it/s, loss=0.0019, lr=8.00e-06, step=6872] Training: 69%|██████▊ | 6873/10000 [1:30:06<32:12, 1.62it/s, loss=0.0019, lr=8.00e-06, step=6872] Training: 69%|██████▊ | 6873/10000 [1:30:06<32:12, 1.62it/s, loss=0.0395, lr=8.00e-06, step=6873] Training: 69%|██████▊ | 6874/10000 [1:30:07<35:54, 1.45it/s, loss=0.0395, lr=8.00e-06, step=6873] Training: 69%|██████▊ | 6874/10000 [1:30:07<35:54, 1.45it/s, loss=0.0797, lr=8.00e-06, step=6874] Training: 69%|██████▉ | 6875/10000 [1:30:08<35:23, 1.47it/s, loss=0.0797, lr=8.00e-06, step=6874] Training: 69%|██████▉ | 6875/10000 [1:30:08<35:23, 1.47it/s, loss=0.0046, lr=7.99e-06, step=6875] Training: 69%|██████▉ | 6876/10000 [1:30:08<34:51, 1.49it/s, loss=0.0046, lr=7.99e-06, step=6875] Training: 69%|██████▉ | 6876/10000 [1:30:08<34:51, 1.49it/s, loss=0.0309, lr=7.99e-06, step=6876] Training: 69%|██████▉ | 6877/10000 [1:30:09<32:07, 1.62it/s, loss=0.0309, lr=7.99e-06, step=6876] Training: 69%|██████▉ | 6877/10000 [1:30:09<32:07, 1.62it/s, loss=0.0014, lr=7.99e-06, step=6877] Training: 69%|██████▉ | 6878/10000 [1:30:10<32:59, 1.58it/s, loss=0.0014, lr=7.99e-06, step=6877] Training: 69%|██████▉ | 6878/10000 [1:30:10<32:59, 1.58it/s, loss=0.0028, lr=7.98e-06, step=6878] Training: 69%|██████▉ | 6879/10000 [1:30:10<35:06, 1.48it/s, loss=0.0028, lr=7.98e-06, step=6878] Training: 69%|██████▉ | 6879/10000 [1:30:10<35:06, 1.48it/s, loss=0.0246, lr=7.98e-06, step=6879]20:14:43.225 [I] step=6880 loss=0.0059 smoothed_loss=0.0148 lr=7.99e-06 grad_norm=0.4708 step_time=0.5569s data_time=0.1043s it/s=1.513 eta_to_10000=2062.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0206 grad_action_out_proj_arms=0.1255 grad_arm_token_fuse=0.1029 grad_shared_expert=0.4564 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6880/10000 [1:30:11<33:38, 1.55it/s, loss=0.0246, lr=7.98e-06, step=6879] Training: 69%|██████▉ | 6880/10000 [1:30:11<33:38, 1.55it/s, loss=0.0059, lr=7.98e-06, step=6880] Training: 69%|██████▉ | 6881/10000 [1:30:11<31:09, 1.67it/s, loss=0.0059, lr=7.98e-06, step=6880] Training: 69%|██████▉ | 6881/10000 [1:30:11<31:09, 1.67it/s, loss=0.0051, lr=7.98e-06, step=6881] Training: 69%|██████▉ | 6882/10000 [1:30:12<32:15, 1.61it/s, loss=0.0051, lr=7.98e-06, step=6881] Training: 69%|██████▉ | 6882/10000 [1:30:12<32:15, 1.61it/s, loss=0.0124, lr=7.97e-06, step=6882] Training: 69%|██████▉ | 6883/10000 [1:30:13<30:23, 1.71it/s, loss=0.0124, lr=7.97e-06, step=6882] Training: 69%|██████▉ | 6883/10000 [1:30:13<30:23, 1.71it/s, loss=0.0042, lr=7.97e-06, step=6883] Training: 69%|██████▉ | 6884/10000 [1:30:13<29:06, 1.78it/s, loss=0.0042, lr=7.97e-06, step=6883] Training: 69%|██████▉ | 6884/10000 [1:30:13<29:06, 1.78it/s, loss=0.0136, lr=7.97e-06, step=6884] Training: 69%|██████▉ | 6885/10000 [1:30:14<29:13, 1.78it/s, loss=0.0136, lr=7.97e-06, step=6884] Training: 69%|██████▉ | 6885/10000 [1:30:14<29:13, 1.78it/s, loss=0.0032, lr=7.96e-06, step=6885] Training: 69%|██████▉ | 6886/10000 [1:30:14<31:33, 1.64it/s, loss=0.0032, lr=7.96e-06, step=6885] Training: 69%|██████▉ | 6886/10000 [1:30:14<31:33, 1.64it/s, loss=0.0017, lr=7.96e-06, step=6886] Training: 69%|██████▉ | 6887/10000 [1:30:15<30:27, 1.70it/s, loss=0.0017, lr=7.96e-06, step=6886] Training: 69%|██████▉ | 6887/10000 [1:30:15<30:27, 1.70it/s, loss=0.0023, lr=7.96e-06, step=6887] Training: 69%|██████▉ | 6888/10000 [1:30:16<31:50, 1.63it/s, loss=0.0023, lr=7.96e-06, step=6887] Training: 69%|██████▉ | 6888/10000 [1:30:16<31:50, 1.63it/s, loss=0.0016, lr=7.95e-06, step=6888] Training: 69%|██████▉ | 6889/10000 [1:30:16<30:01, 1.73it/s, loss=0.0016, lr=7.95e-06, step=6888] Training: 69%|██████▉ | 6889/10000 [1:30:16<30:01, 1.73it/s, loss=0.0108, lr=7.95e-06, step=6889]20:14:49.090 [I] step=6890 loss=0.0431 smoothed_loss=0.0127 lr=7.96e-06 grad_norm=0.4821 step_time=0.5094s data_time=0.0770s it/s=1.706 eta_to_10000=1823.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0154 grad_action_out_proj_arms=0.1155 grad_arm_token_fuse=0.0793 grad_shared_expert=0.6058 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6890/10000 [1:30:17<31:59, 1.62it/s, loss=0.0108, lr=7.95e-06, step=6889] Training: 69%|██████▉ | 6890/10000 [1:30:17<31:59, 1.62it/s, loss=0.0431, lr=7.95e-06, step=6890] Training: 69%|██████▉ | 6891/10000 [1:30:17<30:44, 1.69it/s, loss=0.0431, lr=7.95e-06, step=6890] Training: 69%|██████▉ | 6891/10000 [1:30:17<30:44, 1.69it/s, loss=0.0013, lr=7.94e-06, step=6891] Training: 69%|██████▉ | 6892/10000 [1:30:18<29:02, 1.78it/s, loss=0.0013, lr=7.94e-06, step=6891] Training: 69%|██████▉ | 6892/10000 [1:30:18<29:02, 1.78it/s, loss=0.0113, lr=7.94e-06, step=6892] Training: 69%|██████▉ | 6893/10000 [1:30:19<31:54, 1.62it/s, loss=0.0113, lr=7.94e-06, step=6892] Training: 69%|██████▉ | 6893/10000 [1:30:19<31:54, 1.62it/s, loss=0.0061, lr=7.94e-06, step=6893] Training: 69%|██████▉ | 6894/10000 [1:30:19<31:12, 1.66it/s, loss=0.0061, lr=7.94e-06, step=6893] Training: 69%|██████▉ | 6894/10000 [1:30:19<31:12, 1.66it/s, loss=0.0096, lr=7.93e-06, step=6894] Training: 69%|██████▉ | 6895/10000 [1:30:20<29:38, 1.75it/s, loss=0.0096, lr=7.93e-06, step=6894] Training: 69%|██████▉ | 6895/10000 [1:30:20<29:38, 1.75it/s, loss=0.0034, lr=7.93e-06, step=6895] Training: 69%|██████▉ | 6896/10000 [1:30:20<29:30, 1.75it/s, loss=0.0034, lr=7.93e-06, step=6895] Training: 69%|██████▉ | 6896/10000 [1:30:20<29:30, 1.75it/s, loss=0.0020, lr=7.93e-06, step=6896] Training: 69%|██████▉ | 6897/10000 [1:30:21<30:48, 1.68it/s, loss=0.0020, lr=7.93e-06, step=6896] Training: 69%|██████▉ | 6897/10000 [1:30:21<30:48, 1.68it/s, loss=0.0064, lr=7.92e-06, step=6897] Training: 69%|██████▉ | 6898/10000 [1:30:21<29:53, 1.73it/s, loss=0.0064, lr=7.92e-06, step=6897] Training: 69%|██████▉ | 6898/10000 [1:30:21<29:53, 1.73it/s, loss=0.0034, lr=7.92e-06, step=6898] Training: 69%|██████▉ | 6899/10000 [1:30:22<29:01, 1.78it/s, loss=0.0034, lr=7.92e-06, step=6898] Training: 69%|██████▉ | 6899/10000 [1:30:22<29:01, 1.78it/s, loss=0.0105, lr=7.92e-06, step=6899]20:14:54.922 [I] step=6900 loss=0.0183 smoothed_loss=0.0096 lr=7.93e-06 grad_norm=0.4476 step_time=0.5133s data_time=0.0699s it/s=1.715 eta_to_10000=1807.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.1270 grad_arm_token_fuse=0.0549 grad_shared_expert=0.5886 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6900/10000 [1:30:23<31:20, 1.65it/s, loss=0.0105, lr=7.92e-06, step=6899] Training: 69%|██████▉ | 6900/10000 [1:30:23<31:20, 1.65it/s, loss=0.0183, lr=7.91e-06, step=6900] Training: 69%|██████▉ | 6901/10000 [1:30:23<29:33, 1.75it/s, loss=0.0183, lr=7.91e-06, step=6900] Training: 69%|██████▉ | 6901/10000 [1:30:23<29:33, 1.75it/s, loss=0.0016, lr=7.91e-06, step=6901] Training: 69%|██████▉ | 6902/10000 [1:30:24<28:15, 1.83it/s, loss=0.0016, lr=7.91e-06, step=6901] Training: 69%|██████▉ | 6902/10000 [1:30:24<28:15, 1.83it/s, loss=0.0064, lr=7.91e-06, step=6902] Training: 69%|██████▉ | 6903/10000 [1:30:24<28:15, 1.83it/s, loss=0.0064, lr=7.91e-06, step=6902] Training: 69%|██████▉ | 6903/10000 [1:30:24<28:15, 1.83it/s, loss=0.0076, lr=7.91e-06, step=6903] Training: 69%|██████▉ | 6904/10000 [1:30:25<30:15, 1.71it/s, loss=0.0076, lr=7.91e-06, step=6903] Training: 69%|██████▉ | 6904/10000 [1:30:25<30:15, 1.71it/s, loss=0.0054, lr=7.90e-06, step=6904] Training: 69%|██████▉ | 6905/10000 [1:30:25<28:46, 1.79it/s, loss=0.0054, lr=7.90e-06, step=6904] Training: 69%|██████▉ | 6905/10000 [1:30:25<28:46, 1.79it/s, loss=0.0170, lr=7.90e-06, step=6905] Training: 69%|██████▉ | 6906/10000 [1:30:26<28:35, 1.80it/s, loss=0.0170, lr=7.90e-06, step=6905] Training: 69%|██████▉ | 6906/10000 [1:30:26<28:35, 1.80it/s, loss=0.0126, lr=7.90e-06, step=6906] Training: 69%|██████▉ | 6907/10000 [1:30:27<32:01, 1.61it/s, loss=0.0126, lr=7.90e-06, step=6906] Training: 69%|██████▉ | 6907/10000 [1:30:27<32:01, 1.61it/s, loss=0.1405, lr=7.89e-06, step=6907] Training: 69%|██████▉ | 6908/10000 [1:30:27<30:18, 1.70it/s, loss=0.1405, lr=7.89e-06, step=6907] Training: 69%|██████▉ | 6908/10000 [1:30:27<30:18, 1.70it/s, loss=0.0016, lr=7.89e-06, step=6908] Training: 69%|██████▉ | 6909/10000 [1:30:28<31:12, 1.65it/s, loss=0.0016, lr=7.89e-06, step=6908] Training: 69%|██████▉ | 6909/10000 [1:30:28<31:12, 1.65it/s, loss=0.0234, lr=7.89e-06, step=6909]20:15:00.632 [I] step=6910 loss=0.0016 smoothed_loss=0.0188 lr=7.90e-06 grad_norm=0.5331 step_time=0.4968s data_time=0.0742s it/s=1.752 eta_to_10000=1763.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0103 grad_action_out_proj_arms=0.0828 grad_arm_token_fuse=0.0517 grad_shared_expert=0.5199 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6910/10000 [1:30:28<30:01, 1.72it/s, loss=0.0234, lr=7.89e-06, step=6909] Training: 69%|██████▉ | 6910/10000 [1:30:28<30:01, 1.72it/s, loss=0.0016, lr=7.88e-06, step=6910] Training: 69%|██████▉ | 6911/10000 [1:30:29<28:52, 1.78it/s, loss=0.0016, lr=7.88e-06, step=6910] Training: 69%|██████▉ | 6911/10000 [1:30:29<28:52, 1.78it/s, loss=0.0049, lr=7.88e-06, step=6911] Training: 69%|██████▉ | 6912/10000 [1:30:29<30:29, 1.69it/s, loss=0.0049, lr=7.88e-06, step=6911] Training: 69%|██████▉ | 6912/10000 [1:30:29<30:29, 1.69it/s, loss=0.0028, lr=7.88e-06, step=6912] Training: 69%|██████▉ | 6913/10000 [1:30:30<30:00, 1.71it/s, loss=0.0028, lr=7.88e-06, step=6912] Training: 69%|██████▉ | 6913/10000 [1:30:30<30:00, 1.71it/s, loss=0.0092, lr=7.87e-06, step=6913] Training: 69%|██████▉ | 6914/10000 [1:30:31<31:17, 1.64it/s, loss=0.0092, lr=7.87e-06, step=6913] Training: 69%|██████▉ | 6914/10000 [1:30:31<31:17, 1.64it/s, loss=0.0151, lr=7.87e-06, step=6914] Training: 69%|██████▉ | 6915/10000 [1:30:32<34:18, 1.50it/s, loss=0.0151, lr=7.87e-06, step=6914] Training: 69%|██████▉ | 6915/10000 [1:30:32<34:18, 1.50it/s, loss=0.0036, lr=7.87e-06, step=6915] Training: 69%|██████▉ | 6916/10000 [1:30:32<31:44, 1.62it/s, loss=0.0036, lr=7.87e-06, step=6915] Training: 69%|██████▉ | 6916/10000 [1:30:32<31:44, 1.62it/s, loss=0.0368, lr=7.86e-06, step=6916] Training: 69%|██████▉ | 6917/10000 [1:30:33<30:03, 1.71it/s, loss=0.0368, lr=7.86e-06, step=6916] Training: 69%|██████▉ | 6917/10000 [1:30:33<30:03, 1.71it/s, loss=0.0228, lr=7.86e-06, step=6917] Training: 69%|██████▉ | 6918/10000 [1:30:33<32:14, 1.59it/s, loss=0.0228, lr=7.86e-06, step=6917] Training: 69%|██████▉ | 6918/10000 [1:30:33<32:14, 1.59it/s, loss=0.0134, lr=7.86e-06, step=6918] Training: 69%|██████▉ | 6919/10000 [1:30:34<30:02, 1.71it/s, loss=0.0134, lr=7.86e-06, step=6918] Training: 69%|██████▉ | 6919/10000 [1:30:34<30:02, 1.71it/s, loss=0.0188, lr=7.85e-06, step=6919]20:15:06.623 [I] step=6920 loss=0.0016 smoothed_loss=0.0153 lr=7.87e-06 grad_norm=0.4676 step_time=0.5267s data_time=0.0724s it/s=1.669 eta_to_10000=1845.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0195 grad_action_out_proj_arms=0.1111 grad_arm_token_fuse=0.0988 grad_shared_expert=0.3730 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6920/10000 [1:30:34<29:38, 1.73it/s, loss=0.0188, lr=7.85e-06, step=6919] Training: 69%|██████▉ | 6920/10000 [1:30:34<29:38, 1.73it/s, loss=0.0016, lr=7.85e-06, step=6920] Training: 69%|██████▉ | 6921/10000 [1:30:35<30:41, 1.67it/s, loss=0.0016, lr=7.85e-06, step=6920] Training: 69%|██████▉ | 6921/10000 [1:30:35<30:41, 1.67it/s, loss=0.0126, lr=7.85e-06, step=6921] Training: 69%|██████▉ | 6922/10000 [1:30:36<34:13, 1.50it/s, loss=0.0126, lr=7.85e-06, step=6921] Training: 69%|██████▉ | 6922/10000 [1:30:36<34:13, 1.50it/s, loss=0.0091, lr=7.84e-06, step=6922] Training: 69%|██████▉ | 6923/10000 [1:30:36<33:27, 1.53it/s, loss=0.0091, lr=7.84e-06, step=6922] Training: 69%|██████▉ | 6923/10000 [1:30:36<33:27, 1.53it/s, loss=0.0013, lr=7.84e-06, step=6923] Training: 69%|██████▉ | 6924/10000 [1:30:37<32:28, 1.58it/s, loss=0.0013, lr=7.84e-06, step=6923] Training: 69%|██████▉ | 6924/10000 [1:30:37<32:28, 1.58it/s, loss=0.0115, lr=7.84e-06, step=6924] Training: 69%|██████▉ | 6925/10000 [1:30:38<32:45, 1.56it/s, loss=0.0115, lr=7.84e-06, step=6924] Training: 69%|██████▉ | 6925/10000 [1:30:38<32:45, 1.56it/s, loss=0.0278, lr=7.84e-06, step=6925] Training: 69%|██████▉ | 6926/10000 [1:30:38<30:48, 1.66it/s, loss=0.0278, lr=7.84e-06, step=6925] Training: 69%|██████▉ | 6926/10000 [1:30:38<30:48, 1.66it/s, loss=0.0068, lr=7.83e-06, step=6926] Training: 69%|██████▉ | 6927/10000 [1:30:39<30:33, 1.68it/s, loss=0.0068, lr=7.83e-06, step=6926] Training: 69%|██████▉ | 6927/10000 [1:30:39<30:33, 1.68it/s, loss=0.0224, lr=7.83e-06, step=6927] Training: 69%|██████▉ | 6928/10000 [1:30:39<31:07, 1.64it/s, loss=0.0224, lr=7.83e-06, step=6927] Training: 69%|██████▉ | 6928/10000 [1:30:39<31:07, 1.64it/s, loss=0.0101, lr=7.83e-06, step=6928] Training: 69%|██████▉ | 6929/10000 [1:30:40<35:56, 1.42it/s, loss=0.0101, lr=7.83e-06, step=6928] Training: 69%|██████▉ | 6929/10000 [1:30:40<35:56, 1.42it/s, loss=0.0553, lr=7.82e-06, step=6929]20:15:13.147 [I] step=6930 loss=0.0040 smoothed_loss=0.0168 lr=7.83e-06 grad_norm=0.5224 step_time=0.5647s data_time=0.0877s it/s=1.533 eta_to_10000=2002.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0175 grad_action_out_proj_arms=0.1445 grad_arm_token_fuse=0.0895 grad_shared_expert=0.4690 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6930/10000 [1:30:41<33:20, 1.53it/s, loss=0.0553, lr=7.82e-06, step=6929] Training: 69%|██████▉ | 6930/10000 [1:30:41<33:20, 1.53it/s, loss=0.0040, lr=7.82e-06, step=6930] Training: 69%|██████▉ | 6931/10000 [1:30:41<31:44, 1.61it/s, loss=0.0040, lr=7.82e-06, step=6930] Training: 69%|██████▉ | 6931/10000 [1:30:41<31:44, 1.61it/s, loss=0.0155, lr=7.82e-06, step=6931] Training: 69%|██████▉ | 6932/10000 [1:30:42<31:58, 1.60it/s, loss=0.0155, lr=7.82e-06, step=6931] Training: 69%|██████▉ | 6932/10000 [1:30:42<31:58, 1.60it/s, loss=0.0059, lr=7.81e-06, step=6932] Training: 69%|██████▉ | 6933/10000 [1:30:43<30:51, 1.66it/s, loss=0.0059, lr=7.81e-06, step=6932] Training: 69%|██████▉ | 6933/10000 [1:30:43<30:51, 1.66it/s, loss=0.0119, lr=7.81e-06, step=6933] Training: 69%|██████▉ | 6934/10000 [1:30:43<29:16, 1.75it/s, loss=0.0119, lr=7.81e-06, step=6933] Training: 69%|██████▉ | 6934/10000 [1:30:43<29:16, 1.75it/s, loss=0.0152, lr=7.81e-06, step=6934] Training: 69%|██████▉ | 6935/10000 [1:30:44<28:54, 1.77it/s, loss=0.0152, lr=7.81e-06, step=6934] Training: 69%|██████▉ | 6935/10000 [1:30:44<28:54, 1.77it/s, loss=0.0280, lr=7.80e-06, step=6935] Training: 69%|██████▉ | 6936/10000 [1:30:44<31:06, 1.64it/s, loss=0.0280, lr=7.80e-06, step=6935] Training: 69%|██████▉ | 6936/10000 [1:30:44<31:06, 1.64it/s, loss=0.0034, lr=7.80e-06, step=6936] Training: 69%|██████▉ | 6937/10000 [1:30:45<31:12, 1.64it/s, loss=0.0034, lr=7.80e-06, step=6936] Training: 69%|██████▉ | 6937/10000 [1:30:45<31:12, 1.64it/s, loss=0.0222, lr=7.80e-06, step=6937] Training: 69%|██████▉ | 6938/10000 [1:30:45<29:47, 1.71it/s, loss=0.0222, lr=7.80e-06, step=6937] Training: 69%|██████▉ | 6938/10000 [1:30:45<29:47, 1.71it/s, loss=0.0245, lr=7.79e-06, step=6938] Training: 69%|██████▉ | 6939/10000 [1:30:46<32:14, 1.58it/s, loss=0.0245, lr=7.79e-06, step=6938] Training: 69%|██████▉ | 6939/10000 [1:30:46<32:14, 1.58it/s, loss=0.0049, lr=7.79e-06, step=6939]20:15:19.297 [I] step=6940 loss=0.0109 smoothed_loss=0.0151 lr=7.80e-06 grad_norm=0.4974 step_time=0.5233s data_time=0.0917s it/s=1.626 eta_to_10000=1881.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0134 grad_action_out_proj_arms=0.1054 grad_arm_token_fuse=0.0703 grad_shared_expert=0.3930 (18633:train_pytorch.py:850) + Training: 69%|██████▉ | 6940/10000 [1:30:47<34:21, 1.48it/s, loss=0.0049, lr=7.79e-06, step=6939] Training: 69%|██████▉ | 6940/10000 [1:30:47<34:21, 1.48it/s, loss=0.0109, lr=7.79e-06, step=6940] Training: 69%|██████▉ | 6941/10000 [1:30:47<32:05, 1.59it/s, loss=0.0109, lr=7.79e-06, step=6940] Training: 69%|██████▉ | 6941/10000 [1:30:47<32:05, 1.59it/s, loss=0.0091, lr=7.78e-06, step=6941] Training: 69%|██████▉ | 6942/10000 [1:30:48<30:04, 1.69it/s, loss=0.0091, lr=7.78e-06, step=6941] Training: 69%|██████▉ | 6942/10000 [1:30:48<30:04, 1.69it/s, loss=0.0035, lr=7.78e-06, step=6942] Training: 69%|██████▉ | 6943/10000 [1:30:49<32:30, 1.57it/s, loss=0.0035, lr=7.78e-06, step=6942] Training: 69%|██████▉ | 6943/10000 [1:30:49<32:30, 1.57it/s, loss=0.0304, lr=7.78e-06, step=6943] Training: 69%|██████▉ | 6944/10000 [1:30:49<30:26, 1.67it/s, loss=0.0304, lr=7.78e-06, step=6943] Training: 69%|██████▉ | 6944/10000 [1:30:49<30:26, 1.67it/s, loss=0.0053, lr=7.78e-06, step=6944] Training: 69%|██████▉ | 6945/10000 [1:30:50<30:56, 1.65it/s, loss=0.0053, lr=7.78e-06, step=6944] Training: 69%|██████▉ | 6945/10000 [1:30:50<30:56, 1.65it/s, loss=0.0138, lr=7.77e-06, step=6945] Training: 69%|██████▉ | 6946/10000 [1:30:51<31:43, 1.60it/s, loss=0.0138, lr=7.77e-06, step=6945] Training: 69%|██████▉ | 6946/10000 [1:30:51<31:43, 1.60it/s, loss=0.0062, lr=7.77e-06, step=6946] Training: 69%|██████▉ | 6947/10000 [1:30:51<29:49, 1.71it/s, loss=0.0062, lr=7.77e-06, step=6946] Training: 69%|██████▉ | 6947/10000 [1:30:51<29:49, 1.71it/s, loss=0.0334, lr=7.77e-06, step=6947] Training: 69%|██████▉ | 6948/10000 [1:30:52<28:17, 1.80it/s, loss=0.0334, lr=7.77e-06, step=6947] Training: 69%|██████▉ | 6948/10000 [1:30:52<28:17, 1.80it/s, loss=0.0017, lr=7.76e-06, step=6948] Training: 69%|██████▉ | 6949/10000 [1:30:52<27:25, 1.85it/s, loss=0.0017, lr=7.76e-06, step=6948] Training: 69%|██████▉ | 6949/10000 [1:30:52<27:25, 1.85it/s, loss=0.0050, lr=7.76e-06, step=6949]20:15:25.226 [I] step=6950 loss=0.0069 smoothed_loss=0.0124 lr=7.77e-06 grad_norm=0.5440 step_time=0.5172s data_time=0.0757s it/s=1.687 eta_to_10000=1808.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0264 grad_action_out_proj_arms=0.1443 grad_arm_token_fuse=0.1318 grad_shared_expert=0.4525 (18633:train_pytorch.py:850) + Training: 70%|██████▉ | 6950/10000 [1:30:53<32:31, 1.56it/s, loss=0.0050, lr=7.76e-06, step=6949] Training: 70%|██████▉ | 6950/10000 [1:30:53<32:31, 1.56it/s, loss=0.0069, lr=7.76e-06, step=6950] Training: 70%|██████▉ | 6951/10000 [1:30:53<30:19, 1.68it/s, loss=0.0069, lr=7.76e-06, step=6950] Training: 70%|██████▉ | 6951/10000 [1:30:53<30:19, 1.68it/s, loss=0.0095, lr=7.75e-06, step=6951] Training: 70%|██████▉ | 6952/10000 [1:30:54<28:35, 1.78it/s, loss=0.0095, lr=7.75e-06, step=6951] Training: 70%|██████▉ | 6952/10000 [1:30:54<28:35, 1.78it/s, loss=0.0099, lr=7.75e-06, step=6952] Training: 70%|██████▉ | 6953/10000 [1:30:55<32:02, 1.58it/s, loss=0.0099, lr=7.75e-06, step=6952] Training: 70%|██████▉ | 6953/10000 [1:30:55<32:02, 1.58it/s, loss=0.0033, lr=7.75e-06, step=6953] Training: 70%|██████▉ | 6954/10000 [1:30:55<29:48, 1.70it/s, loss=0.0033, lr=7.75e-06, step=6953] Training: 70%|██████▉ | 6954/10000 [1:30:55<29:48, 1.70it/s, loss=0.0053, lr=7.74e-06, step=6954] Training: 70%|██████▉ | 6955/10000 [1:30:56<28:29, 1.78it/s, loss=0.0053, lr=7.74e-06, step=6954] Training: 70%|██████▉ | 6955/10000 [1:30:56<28:29, 1.78it/s, loss=0.0094, lr=7.74e-06, step=6955] Training: 70%|██████▉ | 6956/10000 [1:30:56<27:56, 1.82it/s, loss=0.0094, lr=7.74e-06, step=6955] Training: 70%|██████▉ | 6956/10000 [1:30:56<27:56, 1.82it/s, loss=0.0641, lr=7.74e-06, step=6956] Training: 70%|██████▉ | 6957/10000 [1:30:57<31:04, 1.63it/s, loss=0.0641, lr=7.74e-06, step=6956] Training: 70%|██████▉ | 6957/10000 [1:30:57<31:04, 1.63it/s, loss=0.0301, lr=7.73e-06, step=6957] Training: 70%|██████▉ | 6958/10000 [1:30:57<29:43, 1.71it/s, loss=0.0301, lr=7.73e-06, step=6957] Training: 70%|██████▉ | 6958/10000 [1:30:57<29:43, 1.71it/s, loss=0.0305, lr=7.73e-06, step=6958] Training: 70%|██████▉ | 6959/10000 [1:30:58<28:46, 1.76it/s, loss=0.0305, lr=7.73e-06, step=6958] Training: 70%|██████▉ | 6959/10000 [1:30:58<28:46, 1.76it/s, loss=0.0013, lr=7.73e-06, step=6959]20:15:31.076 [I] step=6960 loss=0.0409 smoothed_loss=0.0192 lr=7.74e-06 grad_norm=0.4605 step_time=0.5042s data_time=0.0807s it/s=1.710 eta_to_10000=1777.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0347 grad_action_out_proj_arms=0.1589 grad_arm_token_fuse=0.1817 grad_shared_expert=0.5989 (18633:train_pytorch.py:850) + Training: 70%|██████▉ | 6960/10000 [1:30:59<31:42, 1.60it/s, loss=0.0013, lr=7.73e-06, step=6959] Training: 70%|██████▉ | 6960/10000 [1:30:59<31:42, 1.60it/s, loss=0.0409, lr=7.73e-06, step=6960] Training: 70%|██████▉ | 6961/10000 [1:30:59<29:33, 1.71it/s, loss=0.0409, lr=7.73e-06, step=6960] Training: 70%|██████▉ | 6961/10000 [1:30:59<29:33, 1.71it/s, loss=0.0018, lr=7.72e-06, step=6961] Training: 70%|██████▉ | 6962/10000 [1:31:00<30:16, 1.67it/s, loss=0.0018, lr=7.72e-06, step=6961] Training: 70%|██████▉ | 6962/10000 [1:31:00<30:16, 1.67it/s, loss=0.0014, lr=7.72e-06, step=6962] Training: 70%|██████▉ | 6963/10000 [1:31:00<29:27, 1.72it/s, loss=0.0014, lr=7.72e-06, step=6962] Training: 70%|██████▉ | 6963/10000 [1:31:00<29:27, 1.72it/s, loss=0.0087, lr=7.72e-06, step=6963] Training: 70%|██████▉ | 6964/10000 [1:31:01<30:55, 1.64it/s, loss=0.0087, lr=7.72e-06, step=6963] Training: 70%|██████▉ | 6964/10000 [1:31:01<30:55, 1.64it/s, loss=0.0039, lr=7.71e-06, step=6964] Training: 70%|██████▉ | 6965/10000 [1:31:02<34:45, 1.46it/s, loss=0.0039, lr=7.71e-06, step=6964] Training: 70%|██████▉ | 6965/10000 [1:31:02<34:45, 1.46it/s, loss=0.0046, lr=7.71e-06, step=6965] Training: 70%|██████▉ | 6966/10000 [1:31:03<33:18, 1.52it/s, loss=0.0046, lr=7.71e-06, step=6965] Training: 70%|██████▉ | 6966/10000 [1:31:03<33:18, 1.52it/s, loss=0.0079, lr=7.71e-06, step=6966] Training: 70%|██████▉ | 6967/10000 [1:31:03<34:24, 1.47it/s, loss=0.0079, lr=7.71e-06, step=6966] Training: 70%|██████▉ | 6967/10000 [1:31:03<34:24, 1.47it/s, loss=0.0048, lr=7.70e-06, step=6967] Training: 70%|██████▉ | 6968/10000 [1:31:04<31:30, 1.60it/s, loss=0.0048, lr=7.70e-06, step=6967] Training: 70%|██████▉ | 6968/10000 [1:31:04<31:30, 1.60it/s, loss=0.0036, lr=7.70e-06, step=6968] Training: 70%|██████▉ | 6969/10000 [1:31:04<29:38, 1.70it/s, loss=0.0036, lr=7.70e-06, step=6968] Training: 70%|██████▉ | 6969/10000 [1:31:04<29:38, 1.70it/s, loss=0.0239, lr=7.70e-06, step=6969]20:15:37.141 [I] step=6970 loss=0.0420 smoothed_loss=0.0152 lr=7.71e-06 grad_norm=0.5004 step_time=0.5206s data_time=0.0860s it/s=1.649 eta_to_10000=1837.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0530 grad_action_out_proj_arms=0.2616 grad_arm_token_fuse=0.2855 grad_shared_expert=0.9944 (18633:train_pytorch.py:850) + Training: 70%|██████▉ | 6970/10000 [1:31:05<28:59, 1.74it/s, loss=0.0239, lr=7.70e-06, step=6969] Training: 70%|██████▉ | 6970/10000 [1:31:05<28:59, 1.74it/s, loss=0.0420, lr=7.69e-06, step=6970] Training: 70%|██████▉ | 6971/10000 [1:31:05<30:03, 1.68it/s, loss=0.0420, lr=7.69e-06, step=6970] Training: 70%|██████▉ | 6971/10000 [1:31:05<30:03, 1.68it/s, loss=0.0381, lr=7.69e-06, step=6971] Training: 70%|██████▉ | 6972/10000 [1:31:06<32:23, 1.56it/s, loss=0.0381, lr=7.69e-06, step=6971] Training: 70%|██████▉ | 6972/10000 [1:31:06<32:23, 1.56it/s, loss=0.0013, lr=7.69e-06, step=6972] Training: 70%|██████▉ | 6973/10000 [1:31:07<31:27, 1.60it/s, loss=0.0013, lr=7.69e-06, step=6972] Training: 70%|██████▉ | 6973/10000 [1:31:07<31:27, 1.60it/s, loss=0.0105, lr=7.68e-06, step=6973] Training: 70%|██████▉ | 6974/10000 [1:31:07<32:07, 1.57it/s, loss=0.0105, lr=7.68e-06, step=6973] Training: 70%|██████▉ | 6974/10000 [1:31:07<32:07, 1.57it/s, loss=0.0064, lr=7.68e-06, step=6974] Training: 70%|██████▉ | 6975/10000 [1:31:08<29:57, 1.68it/s, loss=0.0064, lr=7.68e-06, step=6974] Training: 70%|██████▉ | 6975/10000 [1:31:08<29:57, 1.68it/s, loss=0.0192, lr=7.68e-06, step=6975] Training: 70%|██████▉ | 6976/10000 [1:31:08<28:28, 1.77it/s, loss=0.0192, lr=7.68e-06, step=6975] Training: 70%|██████▉ | 6976/10000 [1:31:08<28:28, 1.77it/s, loss=0.0086, lr=7.67e-06, step=6976] Training: 70%|██████▉ | 6977/10000 [1:31:09<28:42, 1.76it/s, loss=0.0086, lr=7.67e-06, step=6976] Training: 70%|██████▉ | 6977/10000 [1:31:09<28:42, 1.76it/s, loss=0.0040, lr=7.67e-06, step=6977] Training: 70%|██████▉ | 6978/10000 [1:31:10<30:21, 1.66it/s, loss=0.0040, lr=7.67e-06, step=6977] Training: 70%|██████▉ | 6978/10000 [1:31:10<30:21, 1.66it/s, loss=0.0175, lr=7.67e-06, step=6978] Training: 70%|██████▉ | 6979/10000 [1:31:11<33:14, 1.51it/s, loss=0.0175, lr=7.67e-06, step=6978] Training: 70%|██████▉ | 6979/10000 [1:31:11<33:14, 1.51it/s, loss=0.0040, lr=7.67e-06, step=6979]20:15:43.495 [I] step=6980 loss=0.0082 smoothed_loss=0.0123 lr=7.68e-06 grad_norm=0.4760 step_time=0.5588s data_time=0.0766s it/s=1.574 eta_to_10000=1918.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0280 grad_action_out_proj_arms=0.1177 grad_arm_token_fuse=0.1466 grad_shared_expert=0.4120 (18633:train_pytorch.py:850) + Training: 70%|██████▉ | 6980/10000 [1:31:11<33:17, 1.51it/s, loss=0.0040, lr=7.67e-06, step=6979] Training: 70%|██████▉ | 6980/10000 [1:31:11<33:17, 1.51it/s, loss=0.0082, lr=7.66e-06, step=6980] Training: 70%|██████▉ | 6981/10000 [1:31:12<33:54, 1.48it/s, loss=0.0082, lr=7.66e-06, step=6980] Training: 70%|██████▉ | 6981/10000 [1:31:12<33:54, 1.48it/s, loss=0.0029, lr=7.66e-06, step=6981] Training: 70%|██████▉ | 6982/10000 [1:31:12<31:35, 1.59it/s, loss=0.0029, lr=7.66e-06, step=6981] Training: 70%|██████▉ | 6982/10000 [1:31:12<31:35, 1.59it/s, loss=0.0013, lr=7.66e-06, step=6982] Training: 70%|██████▉ | 6983/10000 [1:31:13<29:53, 1.68it/s, loss=0.0013, lr=7.66e-06, step=6982] Training: 70%|██████▉ | 6983/10000 [1:31:13<29:53, 1.68it/s, loss=0.0037, lr=7.65e-06, step=6983] Training: 70%|██████▉ | 6984/10000 [1:31:14<30:20, 1.66it/s, loss=0.0037, lr=7.65e-06, step=6983] Training: 70%|██████▉ | 6984/10000 [1:31:14<30:20, 1.66it/s, loss=0.0010, lr=7.65e-06, step=6984] Training: 70%|██████▉ | 6985/10000 [1:31:14<29:11, 1.72it/s, loss=0.0010, lr=7.65e-06, step=6984] Training: 70%|██████▉ | 6985/10000 [1:31:14<29:11, 1.72it/s, loss=0.0018, lr=7.65e-06, step=6985] Training: 70%|██████▉ | 6986/10000 [1:31:15<34:10, 1.47it/s, loss=0.0018, lr=7.65e-06, step=6985] Training: 70%|██████▉ | 6986/10000 [1:31:15<34:10, 1.47it/s, loss=0.0054, lr=7.64e-06, step=6986] Training: 70%|██████▉ | 6987/10000 [1:31:15<31:47, 1.58it/s, loss=0.0054, lr=7.64e-06, step=6986] Training: 70%|██████▉ | 6987/10000 [1:31:15<31:47, 1.58it/s, loss=0.0210, lr=7.64e-06, step=6987] Training: 70%|██████▉ | 6988/10000 [1:31:16<32:53, 1.53it/s, loss=0.0210, lr=7.64e-06, step=6987] Training: 70%|██████▉ | 6988/10000 [1:31:16<32:53, 1.53it/s, loss=0.0176, lr=7.64e-06, step=6988] Training: 70%|██████▉ | 6989/10000 [1:31:17<30:23, 1.65it/s, loss=0.0176, lr=7.64e-06, step=6988] Training: 70%|██████▉ | 6989/10000 [1:31:17<30:23, 1.65it/s, loss=0.0106, lr=7.63e-06, step=6989]20:15:49.750 [I] step=6990 loss=0.0014 smoothed_loss=0.0092 lr=7.65e-06 grad_norm=0.4371 step_time=0.5372s data_time=0.0883s it/s=1.599 eta_to_10000=1882.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0099 grad_action_out_proj_arms=0.0877 grad_arm_token_fuse=0.0541 grad_shared_expert=0.3356 (18633:train_pytorch.py:850) + Training: 70%|██████▉ | 6990/10000 [1:31:17<32:14, 1.56it/s, loss=0.0106, lr=7.63e-06, step=6989] Training: 70%|██████▉ | 6990/10000 [1:31:17<32:14, 1.56it/s, loss=0.0014, lr=7.63e-06, step=6990] Training: 70%|██████▉ | 6991/10000 [1:31:18<29:53, 1.68it/s, loss=0.0014, lr=7.63e-06, step=6990] Training: 70%|██████▉ | 6991/10000 [1:31:18<29:53, 1.68it/s, loss=0.0131, lr=7.63e-06, step=6991] Training: 70%|██████▉ | 6992/10000 [1:31:18<28:58, 1.73it/s, loss=0.0131, lr=7.63e-06, step=6991] Training: 70%|██████▉ | 6992/10000 [1:31:18<28:58, 1.73it/s, loss=0.0087, lr=7.62e-06, step=6992] Training: 70%|██████▉ | 6993/10000 [1:31:19<35:00, 1.43it/s, loss=0.0087, lr=7.62e-06, step=6992] Training: 70%|██████▉ | 6993/10000 [1:31:19<35:00, 1.43it/s, loss=0.0170, lr=7.62e-06, step=6993] Training: 70%|██████▉ | 6994/10000 [1:31:20<35:19, 1.42it/s, loss=0.0170, lr=7.62e-06, step=6993] Training: 70%|██████▉ | 6994/10000 [1:31:20<35:19, 1.42it/s, loss=0.0013, lr=7.62e-06, step=6994] Training: 70%|██████▉ | 6995/10000 [1:31:21<37:31, 1.33it/s, loss=0.0013, lr=7.62e-06, step=6994] Training: 70%|██████▉ | 6995/10000 [1:31:21<37:31, 1.33it/s, loss=0.0215, lr=7.62e-06, step=6995] Training: 70%|██████▉ | 6996/10000 [1:31:22<34:39, 1.44it/s, loss=0.0215, lr=7.62e-06, step=6995] Training: 70%|██████▉ | 6996/10000 [1:31:22<34:39, 1.44it/s, loss=0.0064, lr=7.61e-06, step=6996] Training: 70%|██████▉ | 6997/10000 [1:31:22<32:16, 1.55it/s, loss=0.0064, lr=7.61e-06, step=6996] Training: 70%|██████▉ | 6997/10000 [1:31:22<32:16, 1.55it/s, loss=0.0081, lr=7.61e-06, step=6997] Training: 70%|██████▉ | 6998/10000 [1:31:23<30:08, 1.66it/s, loss=0.0081, lr=7.61e-06, step=6997] Training: 70%|██████▉ | 6998/10000 [1:31:23<30:08, 1.66it/s, loss=0.0057, lr=7.61e-06, step=6998] Training: 70%|██████▉ | 6999/10000 [1:31:23<28:52, 1.73it/s, loss=0.0057, lr=7.61e-06, step=6998] Training: 70%|██████▉ | 6999/10000 [1:31:23<28:52, 1.73it/s, loss=0.0079, lr=7.60e-06, step=6999]20:15:56.182 [I] step=7000 loss=0.0142 smoothed_loss=0.0098 lr=7.61e-06 grad_norm=0.4622 step_time=0.5458s data_time=0.0974s it/s=1.555 eta_to_10000=1929.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.1070 grad_arm_token_fuse=0.0664 grad_shared_expert=0.4286 (18633:train_pytorch.py:850) +20:17:25.392 [I] Saved checkpoint at step 7000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/7000 (18633:train_pytorch.py:350) + Training: 70%|███████ | 7000/10000 [1:32:53<22:49:33, 27.39s/it, loss=0.0079, lr=7.60e-06, step=6999] Training: 70%|███████ | 7000/10000 [1:32:53<22:49:33, 27.39s/it, loss=0.0142, lr=7.60e-06, step=7000] Training: 70%|███████ | 7001/10000 [1:32:54<16:09:11, 19.39s/it, loss=0.0142, lr=7.60e-06, step=7000] Training: 70%|███████ | 7001/10000 [1:32:54<16:09:11, 19.39s/it, loss=0.0039, lr=7.60e-06, step=7001] Training: 70%|███████ | 7002/10000 [1:32:54<11:26:13, 13.73s/it, loss=0.0039, lr=7.60e-06, step=7001] Training: 70%|███████ | 7002/10000 [1:32:54<11:26:13, 13.73s/it, loss=0.0051, lr=7.59e-06, step=7002] Training: 70%|███████ | 7003/10000 [1:32:55<8:15:19, 9.92s/it, loss=0.0051, lr=7.59e-06, step=7002] Training: 70%|███████ | 7003/10000 [1:32:55<8:15:19, 9.92s/it, loss=0.0014, lr=7.59e-06, step=7003] Training: 70%|███████ | 7004/10000 [1:32:56<5:56:45, 7.14s/it, loss=0.0014, lr=7.59e-06, step=7003] Training: 70%|███████ | 7004/10000 [1:32:56<5:56:45, 7.14s/it, loss=0.0294, lr=7.59e-06, step=7004] Training: 70%|███████ | 7005/10000 [1:32:57<4:18:03, 5.17s/it, loss=0.0294, lr=7.59e-06, step=7004] Training: 70%|███████ | 7005/10000 [1:32:57<4:18:03, 5.17s/it, loss=0.0100, lr=7.58e-06, step=7005] Training: 70%|███████ | 7006/10000 [1:32:57<3:10:49, 3.82s/it, loss=0.0100, lr=7.58e-06, step=7005] Training: 70%|███████ | 7006/10000 [1:32:57<3:10:49, 3.82s/it, loss=0.0048, lr=7.58e-06, step=7006] Training: 70%|███████ | 7007/10000 [1:32:58<2:25:27, 2.92s/it, loss=0.0048, lr=7.58e-06, step=7006] Training: 70%|███████ | 7007/10000 [1:32:58<2:25:27, 2.92s/it, loss=0.0033, lr=7.58e-06, step=7007] Training: 70%|███████ | 7008/10000 [1:32:59<1:54:40, 2.30s/it, loss=0.0033, lr=7.58e-06, step=7007] Training: 70%|███████ | 7008/10000 [1:32:59<1:54:40, 2.30s/it, loss=0.0087, lr=7.58e-06, step=7008] Training: 70%|███████ | 7009/10000 [1:33:00<1:31:39, 1.84s/it, loss=0.0087, lr=7.58e-06, step=7008] Training: 70%|███████ | 7009/10000 [1:33:00<1:31:39, 1.84s/it, loss=0.0072, lr=7.57e-06, step=7009]20:17:32.902 [I] step=7010 loss=0.0063 smoothed_loss=0.0086 lr=7.58e-06 grad_norm=0.3949 step_time=0.6039s data_time=9.0682s it/s=0.103 eta_to_10000=28919.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0608 grad_arm_token_fuse=0.0283 grad_shared_expert=0.2103 (18633:train_pytorch.py:850) + Training: 70%|███████ | 7010/10000 [1:33:01<1:17:30, 1.56s/it, loss=0.0072, lr=7.57e-06, step=7009] Training: 70%|███████ | 7010/10000 [1:33:01<1:17:30, 1.56s/it, loss=0.0063, lr=7.57e-06, step=7010] Training: 70%|███████ | 7011/10000 [1:33:01<1:03:49, 1.28s/it, loss=0.0063, lr=7.57e-06, step=7010] Training: 70%|███████ | 7011/10000 [1:33:01<1:03:49, 1.28s/it, loss=0.0061, lr=7.57e-06, step=7011] Training: 70%|███████ | 7012/10000 [1:33:02<52:37, 1.06s/it, loss=0.0061, lr=7.57e-06, step=7011] Training: 70%|███████ | 7012/10000 [1:33:02<52:37, 1.06s/it, loss=0.0147, lr=7.56e-06, step=7012] Training: 70%|███████ | 7013/10000 [1:33:02<47:59, 1.04it/s, loss=0.0147, lr=7.56e-06, step=7012] Training: 70%|███████ | 7013/10000 [1:33:02<47:59, 1.04it/s, loss=0.0089, lr=7.56e-06, step=7013] Training: 70%|███████ | 7014/10000 [1:33:03<48:24, 1.03it/s, loss=0.0089, lr=7.56e-06, step=7013] Training: 70%|███████ | 7014/10000 [1:33:03<48:24, 1.03it/s, loss=0.0143, lr=7.56e-06, step=7014] Training: 70%|███████ | 7015/10000 [1:33:05<49:38, 1.00it/s, loss=0.0143, lr=7.56e-06, step=7014] Training: 70%|███████ | 7015/10000 [1:33:05<49:38, 1.00it/s, loss=0.0064, lr=7.55e-06, step=7015] Training: 70%|███████ | 7016/10000 [1:33:05<42:14, 1.18it/s, loss=0.0064, lr=7.55e-06, step=7015] Training: 70%|███████ | 7016/10000 [1:33:05<42:14, 1.18it/s, loss=0.0021, lr=7.55e-06, step=7016] Training: 70%|███████ | 7017/10000 [1:33:06<38:37, 1.29it/s, loss=0.0021, lr=7.55e-06, step=7016] Training: 70%|███████ | 7017/10000 [1:33:06<38:37, 1.29it/s, loss=0.0052, lr=7.55e-06, step=7017] Training: 70%|███████ | 7018/10000 [1:33:06<36:37, 1.36it/s, loss=0.0052, lr=7.55e-06, step=7017] Training: 70%|███████ | 7018/10000 [1:33:06<36:37, 1.36it/s, loss=0.0130, lr=7.54e-06, step=7018] Training: 70%|███████ | 7019/10000 [1:33:07<38:07, 1.30it/s, loss=0.0130, lr=7.54e-06, step=7018] Training: 70%|███████ | 7019/10000 [1:33:07<38:07, 1.30it/s, loss=0.0062, lr=7.54e-06, step=7019]20:17:39.997 [I] step=7020 loss=0.0061 smoothed_loss=0.0082 lr=7.55e-06 grad_norm=0.4196 step_time=0.5728s data_time=0.1367s it/s=1.410 eta_to_10000=2113.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0105 grad_action_out_proj_arms=0.1118 grad_arm_token_fuse=0.0505 grad_shared_expert=0.3837 (18633:train_pytorch.py:850) + Training: 70%|███████ | 7020/10000 [1:33:08<34:35, 1.44it/s, loss=0.0062, lr=7.54e-06, step=7019] Training: 70%|███████ | 7020/10000 [1:33:08<34:35, 1.44it/s, loss=0.0061, lr=7.54e-06, step=7020] Training: 70%|███████ | 7021/10000 [1:33:09<38:34, 1.29it/s, loss=0.0061, lr=7.54e-06, step=7020] Training: 70%|███████ | 7021/10000 [1:33:09<38:34, 1.29it/s, loss=0.0076, lr=7.53e-06, step=7021] Training: 70%|███████ | 7022/10000 [1:33:09<37:02, 1.34it/s, loss=0.0076, lr=7.53e-06, step=7021] Training: 70%|███████ | 7022/10000 [1:33:09<37:02, 1.34it/s, loss=0.0134, lr=7.53e-06, step=7022] Training: 70%|███████ | 7023/10000 [1:33:10<33:01, 1.50it/s, loss=0.0134, lr=7.53e-06, step=7022] Training: 70%|███████ | 7023/10000 [1:33:10<33:01, 1.50it/s, loss=0.0022, lr=7.53e-06, step=7023] Training: 70%|███████ | 7024/10000 [1:33:10<33:15, 1.49it/s, loss=0.0022, lr=7.53e-06, step=7023] Training: 70%|███████ | 7024/10000 [1:33:10<33:15, 1.49it/s, loss=0.0096, lr=7.53e-06, step=7024] Training: 70%|███████ | 7025/10000 [1:33:11<35:47, 1.39it/s, loss=0.0096, lr=7.53e-06, step=7024] Training: 70%|███████ | 7025/10000 [1:33:11<35:47, 1.39it/s, loss=0.0169, lr=7.52e-06, step=7025] Training: 70%|███████ | 7026/10000 [1:33:12<37:37, 1.32it/s, loss=0.0169, lr=7.52e-06, step=7025] Training: 70%|███████ | 7026/10000 [1:33:12<37:37, 1.32it/s, loss=0.0055, lr=7.52e-06, step=7026] Training: 70%|███████ | 7027/10000 [1:33:13<38:36, 1.28it/s, loss=0.0055, lr=7.52e-06, step=7026] Training: 70%|███████ | 7027/10000 [1:33:13<38:36, 1.28it/s, loss=0.0078, lr=7.52e-06, step=7027] Training: 70%|███████ | 7028/10000 [1:33:14<38:27, 1.29it/s, loss=0.0078, lr=7.52e-06, step=7027] Training: 70%|███████ | 7028/10000 [1:33:14<38:27, 1.29it/s, loss=0.0171, lr=7.51e-06, step=7028] Training: 70%|███████ | 7029/10000 [1:33:15<40:44, 1.22it/s, loss=0.0171, lr=7.51e-06, step=7028] Training: 70%|███████ | 7029/10000 [1:33:15<40:44, 1.22it/s, loss=0.0039, lr=7.51e-06, step=7029]20:17:47.804 [I] step=7030 loss=0.0038 smoothed_loss=0.0084 lr=7.52e-06 grad_norm=0.3948 step_time=0.6301s data_time=0.1506s it/s=1.281 eta_to_10000=2318.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0054 grad_action_out_proj_arms=0.0653 grad_arm_token_fuse=0.0293 grad_shared_expert=0.3595 (18633:train_pytorch.py:850) + Training: 70%|███████ | 7030/10000 [1:33:15<40:17, 1.23it/s, loss=0.0039, lr=7.51e-06, step=7029] Training: 70%|███████ | 7030/10000 [1:33:15<40:17, 1.23it/s, loss=0.0038, lr=7.51e-06, step=7030] Training: 70%|███████ | 7031/10000 [1:33:16<35:46, 1.38it/s, loss=0.0038, lr=7.51e-06, step=7030] Training: 70%|███████ | 7031/10000 [1:33:16<35:46, 1.38it/s, loss=0.0050, lr=7.50e-06, step=7031] Training: 70%|███████ | 7032/10000 [1:33:17<34:19, 1.44it/s, loss=0.0050, lr=7.50e-06, step=7031] Training: 70%|███████ | 7032/10000 [1:33:17<34:19, 1.44it/s, loss=0.0045, lr=7.50e-06, step=7032] Training: 70%|███████ | 7033/10000 [1:33:18<37:50, 1.31it/s, loss=0.0045, lr=7.50e-06, step=7032] Training: 70%|███████ | 7033/10000 [1:33:18<37:50, 1.31it/s, loss=0.0107, lr=7.50e-06, step=7033] Training: 70%|███████ | 7034/10000 [1:33:18<39:09, 1.26it/s, loss=0.0107, lr=7.50e-06, step=7033] Training: 70%|███████ | 7034/10000 [1:33:18<39:09, 1.26it/s, loss=0.0025, lr=7.49e-06, step=7034] Training: 70%|███████ | 7035/10000 [1:33:19<35:25, 1.39it/s, loss=0.0025, lr=7.49e-06, step=7034] Training: 70%|███████ | 7035/10000 [1:33:19<35:25, 1.39it/s, loss=0.0041, lr=7.49e-06, step=7035] Training: 70%|███████ | 7036/10000 [1:33:20<36:16, 1.36it/s, loss=0.0041, lr=7.49e-06, step=7035] Training: 70%|███████ | 7036/10000 [1:33:20<36:16, 1.36it/s, loss=0.0107, lr=7.49e-06, step=7036] Training: 70%|███████ | 7037/10000 [1:33:20<36:13, 1.36it/s, loss=0.0107, lr=7.49e-06, step=7036] Training: 70%|███████ | 7037/10000 [1:33:20<36:13, 1.36it/s, loss=0.0028, lr=7.49e-06, step=7037] Training: 70%|███████ | 7038/10000 [1:33:21<33:30, 1.47it/s, loss=0.0028, lr=7.49e-06, step=7037] Training: 70%|███████ | 7038/10000 [1:33:21<33:30, 1.47it/s, loss=0.0051, lr=7.48e-06, step=7038] Training: 70%|███████ | 7039/10000 [1:33:22<31:45, 1.55it/s, loss=0.0051, lr=7.48e-06, step=7038] Training: 70%|███████ | 7039/10000 [1:33:22<31:45, 1.55it/s, loss=0.0021, lr=7.48e-06, step=7039]20:17:54.664 [I] step=7040 loss=0.0020 smoothed_loss=0.0059 lr=7.49e-06 grad_norm=0.3695 step_time=0.5557s data_time=0.1304s it/s=1.458 eta_to_10000=2030.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0154 grad_action_out_proj_arms=0.1333 grad_arm_token_fuse=0.0829 grad_shared_expert=0.5313 (18633:train_pytorch.py:850) + Training: 70%|███████ | 7040/10000 [1:33:22<33:42, 1.46it/s, loss=0.0021, lr=7.48e-06, step=7039] Training: 70%|███████ | 7040/10000 [1:33:22<33:42, 1.46it/s, loss=0.0020, lr=7.48e-06, step=7040] Training: 70%|███████ | 7041/10000 [1:33:23<37:12, 1.33it/s, loss=0.0020, lr=7.48e-06, step=7040] Training: 70%|███████ | 7041/10000 [1:33:23<37:12, 1.33it/s, loss=0.0100, lr=7.47e-06, step=7041] Training: 70%|███████ | 7042/10000 [1:33:24<33:39, 1.46it/s, loss=0.0100, lr=7.47e-06, step=7041] Training: 70%|███████ | 7042/10000 [1:33:24<33:39, 1.46it/s, loss=0.0071, lr=7.47e-06, step=7042] Training: 70%|███████ | 7043/10000 [1:33:25<36:38, 1.35it/s, loss=0.0071, lr=7.47e-06, step=7042] Training: 70%|███████ | 7043/10000 [1:33:25<36:38, 1.35it/s, loss=0.0034, lr=7.47e-06, step=7043] Training: 70%|███████ | 7044/10000 [1:33:25<37:15, 1.32it/s, loss=0.0034, lr=7.47e-06, step=7043] Training: 70%|███████ | 7044/10000 [1:33:25<37:15, 1.32it/s, loss=0.0029, lr=7.46e-06, step=7044] Training: 70%|███████ | 7045/10000 [1:33:26<33:27, 1.47it/s, loss=0.0029, lr=7.46e-06, step=7044] Training: 70%|███████ | 7045/10000 [1:33:26<33:27, 1.47it/s, loss=0.0029, lr=7.46e-06, step=7045] Training: 70%|███████ | 7046/10000 [1:33:26<30:35, 1.61it/s, loss=0.0029, lr=7.46e-06, step=7045] Training: 70%|███████ | 7046/10000 [1:33:26<30:35, 1.61it/s, loss=0.0192, lr=7.46e-06, step=7046] Training: 70%|███████ | 7047/10000 [1:33:27<33:01, 1.49it/s, loss=0.0192, lr=7.46e-06, step=7046] Training: 70%|███████ | 7047/10000 [1:33:27<33:01, 1.49it/s, loss=0.0113, lr=7.45e-06, step=7047] Training: 70%|███████ | 7048/10000 [1:33:28<33:51, 1.45it/s, loss=0.0113, lr=7.45e-06, step=7047] Training: 70%|███████ | 7048/10000 [1:33:28<33:51, 1.45it/s, loss=0.0051, lr=7.45e-06, step=7048] Training: 70%|███████ | 7049/10000 [1:33:28<31:04, 1.58it/s, loss=0.0051, lr=7.45e-06, step=7048] Training: 70%|███████ | 7049/10000 [1:33:28<31:04, 1.58it/s, loss=0.0117, lr=7.45e-06, step=7049]20:18:01.762 [I] step=7050 loss=0.0637 smoothed_loss=0.0131 lr=7.46e-06 grad_norm=0.4684 step_time=0.5875s data_time=0.1223s it/s=1.409 eta_to_10000=2093.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0252 grad_action_out_proj_arms=0.1434 grad_arm_token_fuse=0.1287 grad_shared_expert=0.5189 (18633:train_pytorch.py:850) + Training: 70%|███████ | 7050/10000 [1:33:29<36:20, 1.35it/s, loss=0.0117, lr=7.45e-06, step=7049] Training: 70%|███████ | 7050/10000 [1:33:29<36:20, 1.35it/s, loss=0.0637, lr=7.45e-06, step=7050] Training: 71%|███████ | 7051/10000 [1:33:30<32:54, 1.49it/s, loss=0.0637, lr=7.45e-06, step=7050] Training: 71%|███████ | 7051/10000 [1:33:30<32:54, 1.49it/s, loss=0.0026, lr=7.44e-06, step=7051] Training: 71%|███████ | 7052/10000 [1:33:30<30:54, 1.59it/s, loss=0.0026, lr=7.44e-06, step=7051] Training: 71%|███████ | 7052/10000 [1:33:30<30:54, 1.59it/s, loss=0.0046, lr=7.44e-06, step=7052] Training: 71%|███████ | 7053/10000 [1:33:31<28:46, 1.71it/s, loss=0.0046, lr=7.44e-06, step=7052] Training: 71%|███████ | 7053/10000 [1:33:31<28:46, 1.71it/s, loss=0.0059, lr=7.44e-06, step=7053] Training: 71%|███████ | 7054/10000 [1:33:32<30:42, 1.60it/s, loss=0.0059, lr=7.44e-06, step=7053] Training: 71%|███████ | 7054/10000 [1:33:32<30:42, 1.60it/s, loss=0.0021, lr=7.43e-06, step=7054] Training: 71%|███████ | 7055/10000 [1:33:33<34:17, 1.43it/s, loss=0.0021, lr=7.43e-06, step=7054] Training: 71%|███████ | 7055/10000 [1:33:33<34:17, 1.43it/s, loss=0.0100, lr=7.43e-06, step=7055] Training: 71%|███████ | 7056/10000 [1:33:33<36:01, 1.36it/s, loss=0.0100, lr=7.43e-06, step=7055] Training: 71%|███████ | 7056/10000 [1:33:33<36:01, 1.36it/s, loss=0.0045, lr=7.43e-06, step=7056] Training: 71%|███████ | 7057/10000 [1:33:34<40:18, 1.22it/s, loss=0.0045, lr=7.43e-06, step=7056] Training: 71%|███████ | 7057/10000 [1:33:34<40:18, 1.22it/s, loss=0.0135, lr=7.42e-06, step=7057] Training: 71%|███████ | 7058/10000 [1:33:35<39:26, 1.24it/s, loss=0.0135, lr=7.42e-06, step=7057] Training: 71%|███████ | 7058/10000 [1:33:35<39:26, 1.24it/s, loss=0.0088, lr=7.42e-06, step=7058] Training: 71%|███████ | 7059/10000 [1:33:36<37:54, 1.29it/s, loss=0.0088, lr=7.42e-06, step=7058] Training: 71%|███████ | 7059/10000 [1:33:36<37:54, 1.29it/s, loss=0.0058, lr=7.42e-06, step=7059]20:18:08.918 [I] step=7060 loss=0.0026 smoothed_loss=0.0086 lr=7.43e-06 grad_norm=0.3632 step_time=0.5698s data_time=0.1458s it/s=1.398 eta_to_10000=2103.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0250 grad_action_out_proj_arms=0.1997 grad_arm_token_fuse=0.1358 grad_shared_expert=0.5619 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7060/10000 [1:33:37<37:18, 1.31it/s, loss=0.0058, lr=7.42e-06, step=7059] Training: 71%|███████ | 7060/10000 [1:33:37<37:18, 1.31it/s, loss=0.0026, lr=7.41e-06, step=7060] Training: 71%|███████ | 7061/10000 [1:33:37<35:47, 1.37it/s, loss=0.0026, lr=7.41e-06, step=7060] Training: 71%|███████ | 7061/10000 [1:33:37<35:47, 1.37it/s, loss=0.0105, lr=7.41e-06, step=7061] Training: 71%|███████ | 7062/10000 [1:33:38<36:36, 1.34it/s, loss=0.0105, lr=7.41e-06, step=7061] Training: 71%|███████ | 7062/10000 [1:33:38<36:36, 1.34it/s, loss=0.0052, lr=7.41e-06, step=7062] Training: 71%|███████ | 7063/10000 [1:33:39<44:09, 1.11it/s, loss=0.0052, lr=7.41e-06, step=7062] Training: 71%|███████ | 7063/10000 [1:33:39<44:09, 1.11it/s, loss=0.0017, lr=7.41e-06, step=7063] Training: 71%|███████ | 7064/10000 [1:33:40<43:24, 1.13it/s, loss=0.0017, lr=7.41e-06, step=7063] Training: 71%|███████ | 7064/10000 [1:33:40<43:24, 1.13it/s, loss=0.0083, lr=7.40e-06, step=7064] Training: 71%|███████ | 7065/10000 [1:33:41<42:50, 1.14it/s, loss=0.0083, lr=7.40e-06, step=7064] Training: 71%|███████ | 7065/10000 [1:33:41<42:50, 1.14it/s, loss=0.0020, lr=7.40e-06, step=7065] Training: 71%|███████ | 7066/10000 [1:33:42<41:19, 1.18it/s, loss=0.0020, lr=7.40e-06, step=7065] Training: 71%|███████ | 7066/10000 [1:33:42<41:19, 1.18it/s, loss=0.0031, lr=7.40e-06, step=7066] Training: 71%|███████ | 7067/10000 [1:33:42<37:06, 1.32it/s, loss=0.0031, lr=7.40e-06, step=7066] Training: 71%|███████ | 7067/10000 [1:33:42<37:06, 1.32it/s, loss=0.0023, lr=7.39e-06, step=7067] Training: 71%|███████ | 7068/10000 [1:33:43<37:42, 1.30it/s, loss=0.0023, lr=7.39e-06, step=7067] Training: 71%|███████ | 7068/10000 [1:33:43<37:42, 1.30it/s, loss=0.0410, lr=7.39e-06, step=7068] Training: 71%|███████ | 7069/10000 [1:33:44<38:58, 1.25it/s, loss=0.0410, lr=7.39e-06, step=7068] Training: 71%|███████ | 7069/10000 [1:33:44<38:58, 1.25it/s, loss=0.0299, lr=7.39e-06, step=7069]20:18:16.975 [I] step=7070 loss=0.0064 smoothed_loss=0.0113 lr=7.40e-06 grad_norm=0.5412 step_time=0.5962s data_time=0.2096s it/s=1.241 eta_to_10000=2360.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0279 grad_action_out_proj_arms=0.1808 grad_arm_token_fuse=0.1435 grad_shared_expert=0.7242 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7070/10000 [1:33:45<36:52, 1.32it/s, loss=0.0299, lr=7.39e-06, step=7069] Training: 71%|███████ | 7070/10000 [1:33:45<36:52, 1.32it/s, loss=0.0064, lr=7.38e-06, step=7070] Training: 71%|███████ | 7071/10000 [1:33:46<39:12, 1.25it/s, loss=0.0064, lr=7.38e-06, step=7070] Training: 71%|███████ | 7071/10000 [1:33:46<39:12, 1.25it/s, loss=0.0013, lr=7.38e-06, step=7071] Training: 71%|███████ | 7072/10000 [1:33:47<43:23, 1.12it/s, loss=0.0013, lr=7.38e-06, step=7071] Training: 71%|███████ | 7072/10000 [1:33:47<43:23, 1.12it/s, loss=0.0133, lr=7.38e-06, step=7072] Training: 71%|███████ | 7073/10000 [1:33:48<42:49, 1.14it/s, loss=0.0133, lr=7.38e-06, step=7072] Training: 71%|███████ | 7073/10000 [1:33:48<42:49, 1.14it/s, loss=0.0164, lr=7.37e-06, step=7073] Training: 71%|███████ | 7074/10000 [1:33:48<41:22, 1.18it/s, loss=0.0164, lr=7.37e-06, step=7073] Training: 71%|███████ | 7074/10000 [1:33:48<41:22, 1.18it/s, loss=0.0042, lr=7.37e-06, step=7074] Training: 71%|███████ | 7075/10000 [1:33:49<38:01, 1.28it/s, loss=0.0042, lr=7.37e-06, step=7074] Training: 71%|███████ | 7075/10000 [1:33:49<38:01, 1.28it/s, loss=0.0117, lr=7.37e-06, step=7075] Training: 71%|███████ | 7076/10000 [1:33:50<37:39, 1.29it/s, loss=0.0117, lr=7.37e-06, step=7075] Training: 71%|███████ | 7076/10000 [1:33:50<37:39, 1.29it/s, loss=0.0070, lr=7.37e-06, step=7076] Training: 71%|███████ | 7077/10000 [1:33:51<39:06, 1.25it/s, loss=0.0070, lr=7.37e-06, step=7076] Training: 71%|███████ | 7077/10000 [1:33:51<39:06, 1.25it/s, loss=0.0020, lr=7.36e-06, step=7077] Training: 71%|███████ | 7078/10000 [1:33:51<39:31, 1.23it/s, loss=0.0020, lr=7.36e-06, step=7077] Training: 71%|███████ | 7078/10000 [1:33:51<39:31, 1.23it/s, loss=0.0051, lr=7.36e-06, step=7078] Training: 71%|███████ | 7079/10000 [1:33:52<37:12, 1.31it/s, loss=0.0051, lr=7.36e-06, step=7078] Training: 71%|███████ | 7079/10000 [1:33:52<37:12, 1.31it/s, loss=0.0105, lr=7.36e-06, step=7079]20:18:25.304 [I] step=7080 loss=0.0175 smoothed_loss=0.0100 lr=7.37e-06 grad_norm=0.4154 step_time=0.6616s data_time=0.1712s it/s=1.201 eta_to_10000=2431.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.1240 grad_arm_token_fuse=0.0535 grad_shared_expert=0.3029 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7080/10000 [1:33:53<40:00, 1.22it/s, loss=0.0105, lr=7.36e-06, step=7079] Training: 71%|███████ | 7080/10000 [1:33:53<40:00, 1.22it/s, loss=0.0175, lr=7.35e-06, step=7080] Training: 71%|███████ | 7081/10000 [1:33:54<35:51, 1.36it/s, loss=0.0175, lr=7.35e-06, step=7080] Training: 71%|███████ | 7081/10000 [1:33:54<35:51, 1.36it/s, loss=0.0046, lr=7.35e-06, step=7081] Training: 71%|███████ | 7082/10000 [1:33:54<32:38, 1.49it/s, loss=0.0046, lr=7.35e-06, step=7081] Training: 71%|███████ | 7082/10000 [1:33:54<32:38, 1.49it/s, loss=0.0248, lr=7.35e-06, step=7082] Training: 71%|███████ | 7083/10000 [1:33:55<34:19, 1.42it/s, loss=0.0248, lr=7.35e-06, step=7082] Training: 71%|███████ | 7083/10000 [1:33:55<34:19, 1.42it/s, loss=0.0099, lr=7.34e-06, step=7083] Training: 71%|███████ | 7084/10000 [1:33:56<35:36, 1.36it/s, loss=0.0099, lr=7.34e-06, step=7083] Training: 71%|███████ | 7084/10000 [1:33:56<35:36, 1.36it/s, loss=0.0189, lr=7.34e-06, step=7084] Training: 71%|███████ | 7085/10000 [1:33:57<38:58, 1.25it/s, loss=0.0189, lr=7.34e-06, step=7084] Training: 71%|███████ | 7085/10000 [1:33:57<38:58, 1.25it/s, loss=0.0040, lr=7.34e-06, step=7085] Training: 71%|███████ | 7086/10000 [1:33:57<37:29, 1.30it/s, loss=0.0040, lr=7.34e-06, step=7085] Training: 71%|███████ | 7086/10000 [1:33:57<37:29, 1.30it/s, loss=0.0168, lr=7.33e-06, step=7086] Training: 71%|███████ | 7087/10000 [1:33:58<35:54, 1.35it/s, loss=0.0168, lr=7.33e-06, step=7086] Training: 71%|███████ | 7087/10000 [1:33:58<35:54, 1.35it/s, loss=0.0070, lr=7.33e-06, step=7087] Training: 71%|███████ | 7088/10000 [1:33:59<36:25, 1.33it/s, loss=0.0070, lr=7.33e-06, step=7087] Training: 71%|███████ | 7088/10000 [1:33:59<36:25, 1.33it/s, loss=0.0207, lr=7.33e-06, step=7088] Training: 71%|███████ | 7089/10000 [1:33:59<32:46, 1.48it/s, loss=0.0207, lr=7.33e-06, step=7088] Training: 71%|███████ | 7089/10000 [1:33:59<32:46, 1.48it/s, loss=0.0127, lr=7.33e-06, step=7089]20:18:32.066 [I] step=7090 loss=0.0019 smoothed_loss=0.0111 lr=7.34e-06 grad_norm=0.4050 step_time=0.5543s data_time=0.1219s it/s=1.479 eta_to_10000=1967.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0122 grad_action_out_proj_arms=0.1227 grad_arm_token_fuse=0.0696 grad_shared_expert=0.4745 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7090/10000 [1:34:00<30:27, 1.59it/s, loss=0.0127, lr=7.33e-06, step=7089] Training: 71%|███████ | 7090/10000 [1:34:00<30:27, 1.59it/s, loss=0.0019, lr=7.32e-06, step=7090] Training: 71%|███████ | 7091/10000 [1:34:00<29:13, 1.66it/s, loss=0.0019, lr=7.32e-06, step=7090] Training: 71%|███████ | 7091/10000 [1:34:00<29:13, 1.66it/s, loss=0.0650, lr=7.32e-06, step=7091] Training: 71%|███████ | 7092/10000 [1:34:01<30:05, 1.61it/s, loss=0.0650, lr=7.32e-06, step=7091] Training: 71%|███████ | 7092/10000 [1:34:01<30:05, 1.61it/s, loss=0.0126, lr=7.32e-06, step=7092] Training: 71%|███████ | 7093/10000 [1:34:02<34:52, 1.39it/s, loss=0.0126, lr=7.32e-06, step=7092] Training: 71%|███████ | 7093/10000 [1:34:02<34:52, 1.39it/s, loss=0.0077, lr=7.31e-06, step=7093] Training: 71%|███████ | 7094/10000 [1:34:03<33:22, 1.45it/s, loss=0.0077, lr=7.31e-06, step=7093] Training: 71%|███████ | 7094/10000 [1:34:03<33:22, 1.45it/s, loss=0.0029, lr=7.31e-06, step=7094] Training: 71%|███████ | 7095/10000 [1:34:03<34:12, 1.42it/s, loss=0.0029, lr=7.31e-06, step=7094] Training: 71%|███████ | 7095/10000 [1:34:03<34:12, 1.42it/s, loss=0.0055, lr=7.31e-06, step=7095] Training: 71%|███████ | 7096/10000 [1:34:04<31:14, 1.55it/s, loss=0.0055, lr=7.31e-06, step=7095] Training: 71%|███████ | 7096/10000 [1:34:04<31:14, 1.55it/s, loss=0.0185, lr=7.30e-06, step=7096] Training: 71%|███████ | 7097/10000 [1:34:04<29:45, 1.63it/s, loss=0.0185, lr=7.30e-06, step=7096] Training: 71%|███████ | 7097/10000 [1:34:04<29:45, 1.63it/s, loss=0.0053, lr=7.30e-06, step=7097] Training: 71%|███████ | 7098/10000 [1:34:05<31:43, 1.52it/s, loss=0.0053, lr=7.30e-06, step=7097] Training: 71%|███████ | 7098/10000 [1:34:05<31:43, 1.52it/s, loss=0.0030, lr=7.30e-06, step=7098] Training: 71%|███████ | 7099/10000 [1:34:06<34:29, 1.40it/s, loss=0.0030, lr=7.30e-06, step=7098] Training: 71%|███████ | 7099/10000 [1:34:06<34:29, 1.40it/s, loss=0.0021, lr=7.29e-06, step=7099]20:18:39.218 [I] step=7100 loss=0.0116 smoothed_loss=0.0110 lr=7.31e-06 grad_norm=0.3731 step_time=0.5826s data_time=0.1326s it/s=1.398 eta_to_10000=2073.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0128 grad_action_out_proj_arms=0.0874 grad_arm_token_fuse=0.0658 grad_shared_expert=0.3774 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7100/10000 [1:34:07<38:24, 1.26it/s, loss=0.0021, lr=7.29e-06, step=7099] Training: 71%|███████ | 7100/10000 [1:34:07<38:24, 1.26it/s, loss=0.0116, lr=7.29e-06, step=7100] Training: 71%|███████ | 7101/10000 [1:34:08<39:27, 1.22it/s, loss=0.0116, lr=7.29e-06, step=7100] Training: 71%|███████ | 7101/10000 [1:34:08<39:27, 1.22it/s, loss=0.0013, lr=7.29e-06, step=7101] Training: 71%|███████ | 7102/10000 [1:34:08<36:15, 1.33it/s, loss=0.0013, lr=7.29e-06, step=7101] Training: 71%|███████ | 7102/10000 [1:34:08<36:15, 1.33it/s, loss=0.0075, lr=7.29e-06, step=7102] Training: 71%|███████ | 7103/10000 [1:34:09<32:55, 1.47it/s, loss=0.0075, lr=7.29e-06, step=7102] Training: 71%|███████ | 7103/10000 [1:34:09<32:55, 1.47it/s, loss=0.0085, lr=7.28e-06, step=7103] Training: 71%|███████ | 7104/10000 [1:34:09<31:58, 1.51it/s, loss=0.0085, lr=7.28e-06, step=7103] Training: 71%|███████ | 7104/10000 [1:34:09<31:58, 1.51it/s, loss=0.0071, lr=7.28e-06, step=7104] Training: 71%|███████ | 7105/10000 [1:34:10<30:07, 1.60it/s, loss=0.0071, lr=7.28e-06, step=7104] Training: 71%|███████ | 7105/10000 [1:34:10<30:07, 1.60it/s, loss=0.0034, lr=7.28e-06, step=7105] Training: 71%|███████ | 7106/10000 [1:34:11<31:38, 1.52it/s, loss=0.0034, lr=7.28e-06, step=7105] Training: 71%|███████ | 7106/10000 [1:34:11<31:38, 1.52it/s, loss=0.0175, lr=7.27e-06, step=7106] Training: 71%|███████ | 7107/10000 [1:34:11<32:37, 1.48it/s, loss=0.0175, lr=7.27e-06, step=7106] Training: 71%|███████ | 7107/10000 [1:34:11<32:37, 1.48it/s, loss=0.0048, lr=7.27e-06, step=7107] Training: 71%|███████ | 7108/10000 [1:34:12<34:09, 1.41it/s, loss=0.0048, lr=7.27e-06, step=7107] Training: 71%|███████ | 7108/10000 [1:34:12<34:09, 1.41it/s, loss=0.0177, lr=7.27e-06, step=7108] Training: 71%|███████ | 7109/10000 [1:34:13<31:42, 1.52it/s, loss=0.0177, lr=7.27e-06, step=7108] Training: 71%|███████ | 7109/10000 [1:34:13<31:42, 1.52it/s, loss=0.0028, lr=7.26e-06, step=7109]20:18:45.763 [I] step=7110 loss=0.0151 smoothed_loss=0.0099 lr=7.28e-06 grad_norm=0.4004 step_time=0.5489s data_time=0.1056s it/s=1.528 eta_to_10000=1891.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0318 grad_action_out_proj_arms=0.1877 grad_arm_token_fuse=0.1709 grad_shared_expert=0.6247 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7110/10000 [1:34:13<31:15, 1.54it/s, loss=0.0028, lr=7.26e-06, step=7109] Training: 71%|███████ | 7110/10000 [1:34:13<31:15, 1.54it/s, loss=0.0151, lr=7.26e-06, step=7110] Training: 71%|███████ | 7111/10000 [1:34:14<29:07, 1.65it/s, loss=0.0151, lr=7.26e-06, step=7110] Training: 71%|███████ | 7111/10000 [1:34:14<29:07, 1.65it/s, loss=0.0051, lr=7.26e-06, step=7111] Training: 71%|███████ | 7112/10000 [1:34:14<27:27, 1.75it/s, loss=0.0051, lr=7.26e-06, step=7111] Training: 71%|███████ | 7112/10000 [1:34:14<27:27, 1.75it/s, loss=0.0022, lr=7.26e-06, step=7112] Training: 71%|███████ | 7113/10000 [1:34:15<28:35, 1.68it/s, loss=0.0022, lr=7.26e-06, step=7112] Training: 71%|███████ | 7113/10000 [1:34:15<28:35, 1.68it/s, loss=0.0101, lr=7.25e-06, step=7113] Training: 71%|███████ | 7114/10000 [1:34:16<32:30, 1.48it/s, loss=0.0101, lr=7.25e-06, step=7113] Training: 71%|███████ | 7114/10000 [1:34:16<32:30, 1.48it/s, loss=0.0288, lr=7.25e-06, step=7114] Training: 71%|███████ | 7115/10000 [1:34:17<33:19, 1.44it/s, loss=0.0288, lr=7.25e-06, step=7114] Training: 71%|███████ | 7115/10000 [1:34:17<33:19, 1.44it/s, loss=0.0099, lr=7.25e-06, step=7115] Training: 71%|███████ | 7116/10000 [1:34:17<30:14, 1.59it/s, loss=0.0099, lr=7.25e-06, step=7115] Training: 71%|███████ | 7116/10000 [1:34:17<30:14, 1.59it/s, loss=0.0251, lr=7.24e-06, step=7116] Training: 71%|███████ | 7117/10000 [1:34:18<29:07, 1.65it/s, loss=0.0251, lr=7.24e-06, step=7116] Training: 71%|███████ | 7117/10000 [1:34:18<29:07, 1.65it/s, loss=0.0056, lr=7.24e-06, step=7117] Training: 71%|███████ | 7118/10000 [1:34:18<31:22, 1.53it/s, loss=0.0056, lr=7.24e-06, step=7117] Training: 71%|███████ | 7118/10000 [1:34:18<31:22, 1.53it/s, loss=0.0154, lr=7.24e-06, step=7118] Training: 71%|███████ | 7119/10000 [1:34:19<33:54, 1.42it/s, loss=0.0154, lr=7.24e-06, step=7118] Training: 71%|███████ | 7119/10000 [1:34:19<33:54, 1.42it/s, loss=0.0046, lr=7.23e-06, step=7119]20:18:52.200 [I] step=7120 loss=0.0037 smoothed_loss=0.0104 lr=7.24e-06 grad_norm=0.4313 step_time=0.5364s data_time=0.1073s it/s=1.554 eta_to_10000=1853.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0335 grad_action_out_proj_arms=0.1835 grad_arm_token_fuse=0.1777 grad_shared_expert=0.5702 (18633:train_pytorch.py:850) + Training: 71%|███████ | 7120/10000 [1:34:20<31:56, 1.50it/s, loss=0.0046, lr=7.23e-06, step=7119] Training: 71%|███████ | 7120/10000 [1:34:20<31:56, 1.50it/s, loss=0.0037, lr=7.23e-06, step=7120] Training: 71%|███████ | 7121/10000 [1:34:21<37:20, 1.29it/s, loss=0.0037, lr=7.23e-06, step=7120] Training: 71%|███████ | 7121/10000 [1:34:21<37:20, 1.29it/s, loss=0.0137, lr=7.23e-06, step=7121] Training: 71%|███████ | 7122/10000 [1:34:22<37:16, 1.29it/s, loss=0.0137, lr=7.23e-06, step=7121] Training: 71%|███████ | 7122/10000 [1:34:22<37:16, 1.29it/s, loss=0.0094, lr=7.23e-06, step=7122] Training: 71%|███████ | 7123/10000 [1:34:22<37:14, 1.29it/s, loss=0.0094, lr=7.23e-06, step=7122] Training: 71%|███████ | 7123/10000 [1:34:22<37:14, 1.29it/s, loss=0.0114, lr=7.22e-06, step=7123] Training: 71%|███████ | 7124/10000 [1:34:23<32:55, 1.46it/s, loss=0.0114, lr=7.22e-06, step=7123] Training: 71%|███████ | 7124/10000 [1:34:23<32:55, 1.46it/s, loss=0.0096, lr=7.22e-06, step=7124] Training: 71%|███████▏ | 7125/10000 [1:34:23<29:51, 1.60it/s, loss=0.0096, lr=7.22e-06, step=7124] Training: 71%|███████▏ | 7125/10000 [1:34:23<29:51, 1.60it/s, loss=0.0086, lr=7.22e-06, step=7125] Training: 71%|███████▏ | 7126/10000 [1:34:24<27:54, 1.72it/s, loss=0.0086, lr=7.22e-06, step=7125] Training: 71%|███████▏ | 7126/10000 [1:34:24<27:54, 1.72it/s, loss=0.0016, lr=7.21e-06, step=7126] Training: 71%|███████▏ | 7127/10000 [1:34:24<26:29, 1.81it/s, loss=0.0016, lr=7.21e-06, step=7126] Training: 71%|███████▏ | 7127/10000 [1:34:24<26:29, 1.81it/s, loss=0.0048, lr=7.21e-06, step=7127] Training: 71%|███████▏ | 7128/10000 [1:34:25<29:36, 1.62it/s, loss=0.0048, lr=7.21e-06, step=7127] Training: 71%|███████▏ | 7128/10000 [1:34:25<29:36, 1.62it/s, loss=0.0122, lr=7.21e-06, step=7128] Training: 71%|███████▏ | 7129/10000 [1:34:26<37:05, 1.29it/s, loss=0.0122, lr=7.21e-06, step=7128] Training: 71%|███████▏ | 7129/10000 [1:34:26<37:05, 1.29it/s, loss=0.0131, lr=7.20e-06, step=7129]20:18:59.261 [I] step=7130 loss=0.0069 smoothed_loss=0.0094 lr=7.21e-06 grad_norm=0.5973 step_time=0.5837s data_time=0.1224s it/s=1.416 eta_to_10000=2026.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.0944 grad_arm_token_fuse=0.0533 grad_shared_expert=0.3636 (18633:train_pytorch.py:850) + Training: 71%|███████▏ | 7130/10000 [1:34:27<35:02, 1.36it/s, loss=0.0131, lr=7.20e-06, step=7129] Training: 71%|███████▏ | 7130/10000 [1:34:27<35:02, 1.36it/s, loss=0.0069, lr=7.20e-06, step=7130] Training: 71%|███████▏ | 7131/10000 [1:34:27<31:23, 1.52it/s, loss=0.0069, lr=7.20e-06, step=7130] Training: 71%|███████▏ | 7131/10000 [1:34:27<31:23, 1.52it/s, loss=0.0098, lr=7.20e-06, step=7131] Training: 71%|███████▏ | 7132/10000 [1:34:28<32:21, 1.48it/s, loss=0.0098, lr=7.20e-06, step=7131] Training: 71%|███████▏ | 7132/10000 [1:34:28<32:21, 1.48it/s, loss=0.0126, lr=7.19e-06, step=7132] Training: 71%|███████▏ | 7133/10000 [1:34:29<29:59, 1.59it/s, loss=0.0126, lr=7.19e-06, step=7132] Training: 71%|███████▏ | 7133/10000 [1:34:29<29:59, 1.59it/s, loss=0.0060, lr=7.19e-06, step=7133] Training: 71%|███████▏ | 7134/10000 [1:34:29<28:50, 1.66it/s, loss=0.0060, lr=7.19e-06, step=7133] Training: 71%|███████▏ | 7134/10000 [1:34:29<28:50, 1.66it/s, loss=0.0091, lr=7.19e-06, step=7134] Training: 71%|███████▏ | 7135/10000 [1:34:30<30:24, 1.57it/s, loss=0.0091, lr=7.19e-06, step=7134] Training: 71%|███████▏ | 7135/10000 [1:34:30<30:24, 1.57it/s, loss=0.0133, lr=7.19e-06, step=7135] Training: 71%|███████▏ | 7136/10000 [1:34:31<33:44, 1.41it/s, loss=0.0133, lr=7.19e-06, step=7135] Training: 71%|███████▏ | 7136/10000 [1:34:31<33:44, 1.41it/s, loss=0.0011, lr=7.18e-06, step=7136] Training: 71%|███████▏ | 7137/10000 [1:34:32<33:55, 1.41it/s, loss=0.0011, lr=7.18e-06, step=7136] Training: 71%|███████▏ | 7137/10000 [1:34:32<33:55, 1.41it/s, loss=0.0123, lr=7.18e-06, step=7137] Training: 71%|███████▏ | 7138/10000 [1:34:32<31:12, 1.53it/s, loss=0.0123, lr=7.18e-06, step=7137] Training: 71%|███████▏ | 7138/10000 [1:34:32<31:12, 1.53it/s, loss=0.0029, lr=7.18e-06, step=7138] Training: 71%|███████▏ | 7139/10000 [1:34:33<31:14, 1.53it/s, loss=0.0029, lr=7.18e-06, step=7138] Training: 71%|███████▏ | 7139/10000 [1:34:33<31:14, 1.53it/s, loss=0.0105, lr=7.17e-06, step=7139]20:19:05.708 [I] step=7140 loss=0.0031 smoothed_loss=0.0082 lr=7.18e-06 grad_norm=0.3615 step_time=0.5166s data_time=0.1281s it/s=1.551 eta_to_10000=1843.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.0777 grad_arm_token_fuse=0.0429 grad_shared_expert=0.2910 (18633:train_pytorch.py:850) + Training: 71%|███████▏ | 7140/10000 [1:34:33<31:51, 1.50it/s, loss=0.0105, lr=7.17e-06, step=7139] Training: 71%|███████▏ | 7140/10000 [1:34:33<31:51, 1.50it/s, loss=0.0031, lr=7.17e-06, step=7140] Training: 71%|███████▏ | 7141/10000 [1:34:34<33:42, 1.41it/s, loss=0.0031, lr=7.17e-06, step=7140] Training: 71%|███████▏ | 7141/10000 [1:34:34<33:42, 1.41it/s, loss=0.0044, lr=7.17e-06, step=7141] Training: 71%|███████▏ | 7142/10000 [1:34:35<33:04, 1.44it/s, loss=0.0044, lr=7.17e-06, step=7141] Training: 71%|███████▏ | 7142/10000 [1:34:35<33:04, 1.44it/s, loss=0.0171, lr=7.16e-06, step=7142] Training: 71%|███████▏ | 7143/10000 [1:34:36<38:09, 1.25it/s, loss=0.0171, lr=7.16e-06, step=7142] Training: 71%|███████▏ | 7143/10000 [1:34:36<38:09, 1.25it/s, loss=0.0011, lr=7.16e-06, step=7143] Training: 71%|███████▏ | 7144/10000 [1:34:37<38:35, 1.23it/s, loss=0.0011, lr=7.16e-06, step=7143] Training: 71%|███████▏ | 7144/10000 [1:34:37<38:35, 1.23it/s, loss=0.0021, lr=7.16e-06, step=7144] Training: 71%|███████▏ | 7145/10000 [1:34:38<38:08, 1.25it/s, loss=0.0021, lr=7.16e-06, step=7144] Training: 71%|███████▏ | 7145/10000 [1:34:38<38:08, 1.25it/s, loss=0.0565, lr=7.16e-06, step=7145] Training: 71%|███████▏ | 7146/10000 [1:34:38<36:32, 1.30it/s, loss=0.0565, lr=7.16e-06, step=7145] Training: 71%|███████▏ | 7146/10000 [1:34:38<36:32, 1.30it/s, loss=0.0216, lr=7.15e-06, step=7146] Training: 71%|███████▏ | 7147/10000 [1:34:39<36:49, 1.29it/s, loss=0.0216, lr=7.15e-06, step=7146] Training: 71%|███████▏ | 7147/10000 [1:34:39<36:49, 1.29it/s, loss=0.0305, lr=7.15e-06, step=7147] Training: 71%|███████▏ | 7148/10000 [1:34:40<33:55, 1.40it/s, loss=0.0305, lr=7.15e-06, step=7147] Training: 71%|███████▏ | 7148/10000 [1:34:40<33:55, 1.40it/s, loss=0.0029, lr=7.15e-06, step=7148] Training: 71%|███████▏ | 7149/10000 [1:34:40<30:51, 1.54it/s, loss=0.0029, lr=7.15e-06, step=7148] Training: 71%|███████▏ | 7149/10000 [1:34:40<30:51, 1.54it/s, loss=0.0071, lr=7.14e-06, step=7149]20:19:13.395 [I] step=7150 loss=0.0829 smoothed_loss=0.0201 lr=7.15e-06 grad_norm=0.4455 step_time=0.6005s data_time=0.1682s it/s=1.301 eta_to_10000=2190.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0357 grad_action_out_proj_arms=0.1952 grad_arm_token_fuse=0.2012 grad_shared_expert=0.5366 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7150/10000 [1:34:41<35:59, 1.32it/s, loss=0.0071, lr=7.14e-06, step=7149] Training: 72%|███████▏ | 7150/10000 [1:34:41<35:59, 1.32it/s, loss=0.0829, lr=7.14e-06, step=7150] Training: 72%|███████▏ | 7151/10000 [1:34:42<34:16, 1.39it/s, loss=0.0829, lr=7.14e-06, step=7150] Training: 72%|███████▏ | 7151/10000 [1:34:42<34:16, 1.39it/s, loss=0.0106, lr=7.14e-06, step=7151] Training: 72%|███████▏ | 7152/10000 [1:34:42<30:53, 1.54it/s, loss=0.0106, lr=7.14e-06, step=7151] Training: 72%|███████▏ | 7152/10000 [1:34:42<30:53, 1.54it/s, loss=0.0982, lr=7.13e-06, step=7152] Training: 72%|███████▏ | 7153/10000 [1:34:43<32:29, 1.46it/s, loss=0.0982, lr=7.13e-06, step=7152] Training: 72%|███████▏ | 7153/10000 [1:34:43<32:29, 1.46it/s, loss=0.0037, lr=7.13e-06, step=7153] Training: 72%|███████▏ | 7154/10000 [1:34:43<29:38, 1.60it/s, loss=0.0037, lr=7.13e-06, step=7153] Training: 72%|███████▏ | 7154/10000 [1:34:43<29:38, 1.60it/s, loss=0.0009, lr=7.13e-06, step=7154] Training: 72%|███████▏ | 7155/10000 [1:34:44<30:10, 1.57it/s, loss=0.0009, lr=7.13e-06, step=7154] Training: 72%|███████▏ | 7155/10000 [1:34:44<30:10, 1.57it/s, loss=0.0050, lr=7.13e-06, step=7155] Training: 72%|███████▏ | 7156/10000 [1:34:45<32:10, 1.47it/s, loss=0.0050, lr=7.13e-06, step=7155] Training: 72%|███████▏ | 7156/10000 [1:34:45<32:10, 1.47it/s, loss=0.0391, lr=7.12e-06, step=7156] Training: 72%|███████▏ | 7157/10000 [1:34:46<34:18, 1.38it/s, loss=0.0391, lr=7.12e-06, step=7156] Training: 72%|███████▏ | 7157/10000 [1:34:46<34:18, 1.38it/s, loss=0.0045, lr=7.12e-06, step=7157] Training: 72%|███████▏ | 7158/10000 [1:34:47<37:50, 1.25it/s, loss=0.0045, lr=7.12e-06, step=7157] Training: 72%|███████▏ | 7158/10000 [1:34:47<37:50, 1.25it/s, loss=0.0166, lr=7.12e-06, step=7158] Training: 72%|███████▏ | 7159/10000 [1:34:48<38:31, 1.23it/s, loss=0.0166, lr=7.12e-06, step=7158] Training: 72%|███████▏ | 7159/10000 [1:34:48<38:31, 1.23it/s, loss=0.0124, lr=7.11e-06, step=7159]20:19:20.438 [I] step=7160 loss=0.0131 smoothed_loss=0.0188 lr=7.12e-06 grad_norm=0.4185 step_time=0.5715s data_time=0.1329s it/s=1.420 eta_to_10000=2000.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0289 grad_action_out_proj_arms=0.1697 grad_arm_token_fuse=0.1589 grad_shared_expert=0.4294 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7160/10000 [1:34:48<35:10, 1.35it/s, loss=0.0124, lr=7.11e-06, step=7159] Training: 72%|███████▏ | 7160/10000 [1:34:48<35:10, 1.35it/s, loss=0.0131, lr=7.11e-06, step=7160] Training: 72%|███████▏ | 7161/10000 [1:34:49<34:09, 1.39it/s, loss=0.0131, lr=7.11e-06, step=7160] Training: 72%|███████▏ | 7161/10000 [1:34:49<34:09, 1.39it/s, loss=0.0009, lr=7.11e-06, step=7161] Training: 72%|███████▏ | 7162/10000 [1:34:49<31:49, 1.49it/s, loss=0.0009, lr=7.11e-06, step=7161] Training: 72%|███████▏ | 7162/10000 [1:34:49<31:49, 1.49it/s, loss=0.0225, lr=7.10e-06, step=7162] Training: 72%|███████▏ | 7163/10000 [1:34:50<32:02, 1.48it/s, loss=0.0225, lr=7.10e-06, step=7162] Training: 72%|███████▏ | 7163/10000 [1:34:50<32:02, 1.48it/s, loss=0.0020, lr=7.10e-06, step=7163] Training: 72%|███████▏ | 7164/10000 [1:34:51<35:36, 1.33it/s, loss=0.0020, lr=7.10e-06, step=7163] Training: 72%|███████▏ | 7164/10000 [1:34:51<35:36, 1.33it/s, loss=0.0061, lr=7.10e-06, step=7164] Training: 72%|███████▏ | 7165/10000 [1:34:52<39:43, 1.19it/s, loss=0.0061, lr=7.10e-06, step=7164] Training: 72%|███████▏ | 7165/10000 [1:34:52<39:43, 1.19it/s, loss=0.0027, lr=7.10e-06, step=7165] Training: 72%|███████▏ | 7166/10000 [1:34:52<34:31, 1.37it/s, loss=0.0027, lr=7.10e-06, step=7165] Training: 72%|███████▏ | 7166/10000 [1:34:52<34:31, 1.37it/s, loss=0.0034, lr=7.09e-06, step=7166] Training: 72%|███████▏ | 7167/10000 [1:34:53<34:04, 1.39it/s, loss=0.0034, lr=7.09e-06, step=7166] Training: 72%|███████▏ | 7167/10000 [1:34:53<34:04, 1.39it/s, loss=0.0038, lr=7.09e-06, step=7167] Training: 72%|███████▏ | 7168/10000 [1:34:54<34:08, 1.38it/s, loss=0.0038, lr=7.09e-06, step=7167] Training: 72%|███████▏ | 7168/10000 [1:34:54<34:08, 1.38it/s, loss=0.0062, lr=7.09e-06, step=7168] Training: 72%|███████▏ | 7169/10000 [1:34:55<34:03, 1.39it/s, loss=0.0062, lr=7.09e-06, step=7168] Training: 72%|███████▏ | 7169/10000 [1:34:55<34:03, 1.39it/s, loss=0.0229, lr=7.08e-06, step=7169]20:19:27.737 [I] step=7170 loss=0.0050 smoothed_loss=0.0117 lr=7.09e-06 grad_norm=0.3943 step_time=0.5851s data_time=0.1448s it/s=1.370 eta_to_10000=2065.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.0809 grad_arm_token_fuse=0.0520 grad_shared_expert=0.3102 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7170/10000 [1:34:55<34:56, 1.35it/s, loss=0.0229, lr=7.08e-06, step=7169] Training: 72%|███████▏ | 7170/10000 [1:34:55<34:56, 1.35it/s, loss=0.0050, lr=7.08e-06, step=7170] Training: 72%|███████▏ | 7171/10000 [1:34:56<36:17, 1.30it/s, loss=0.0050, lr=7.08e-06, step=7170] Training: 72%|███████▏ | 7171/10000 [1:34:56<36:17, 1.30it/s, loss=0.0039, lr=7.08e-06, step=7171] Training: 72%|███████▏ | 7172/10000 [1:34:57<39:49, 1.18it/s, loss=0.0039, lr=7.08e-06, step=7171] Training: 72%|███████▏ | 7172/10000 [1:34:57<39:49, 1.18it/s, loss=0.0014, lr=7.07e-06, step=7172] Training: 72%|███████▏ | 7173/10000 [1:34:58<35:57, 1.31it/s, loss=0.0014, lr=7.07e-06, step=7172] Training: 72%|███████▏ | 7173/10000 [1:34:58<35:57, 1.31it/s, loss=0.0120, lr=7.07e-06, step=7173] Training: 72%|███████▏ | 7174/10000 [1:34:59<34:57, 1.35it/s, loss=0.0120, lr=7.07e-06, step=7173] Training: 72%|███████▏ | 7174/10000 [1:34:59<34:57, 1.35it/s, loss=0.0033, lr=7.07e-06, step=7174] Training: 72%|███████▏ | 7175/10000 [1:34:59<35:10, 1.34it/s, loss=0.0033, lr=7.07e-06, step=7174] Training: 72%|███████▏ | 7175/10000 [1:34:59<35:10, 1.34it/s, loss=0.0073, lr=7.07e-06, step=7175] Training: 72%|███████▏ | 7176/10000 [1:35:00<35:28, 1.33it/s, loss=0.0073, lr=7.07e-06, step=7175] Training: 72%|███████▏ | 7176/10000 [1:35:00<35:28, 1.33it/s, loss=0.0075, lr=7.06e-06, step=7176] Training: 72%|███████▏ | 7177/10000 [1:35:01<33:45, 1.39it/s, loss=0.0075, lr=7.06e-06, step=7176] Training: 72%|███████▏ | 7177/10000 [1:35:01<33:45, 1.39it/s, loss=0.0053, lr=7.06e-06, step=7177] Training: 72%|███████▏ | 7178/10000 [1:35:01<32:57, 1.43it/s, loss=0.0053, lr=7.06e-06, step=7177] Training: 72%|███████▏ | 7178/10000 [1:35:01<32:57, 1.43it/s, loss=0.0077, lr=7.06e-06, step=7178] Training: 72%|███████▏ | 7179/10000 [1:35:02<32:45, 1.44it/s, loss=0.0077, lr=7.06e-06, step=7178] Training: 72%|███████▏ | 7179/10000 [1:35:02<32:45, 1.44it/s, loss=0.0078, lr=7.05e-06, step=7179]20:19:35.030 [I] step=7180 loss=0.0237 smoothed_loss=0.0100 lr=7.06e-06 grad_norm=0.5284 step_time=0.5998s data_time=0.1294s it/s=1.372 eta_to_10000=2056.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0396 grad_action_out_proj_arms=0.1527 grad_arm_token_fuse=0.2096 grad_shared_expert=0.6734 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7180/10000 [1:35:03<32:12, 1.46it/s, loss=0.0078, lr=7.05e-06, step=7179] Training: 72%|███████▏ | 7180/10000 [1:35:03<32:12, 1.46it/s, loss=0.0237, lr=7.05e-06, step=7180] Training: 72%|███████▏ | 7181/10000 [1:35:03<32:59, 1.42it/s, loss=0.0237, lr=7.05e-06, step=7180] Training: 72%|███████▏ | 7181/10000 [1:35:03<32:59, 1.42it/s, loss=0.0268, lr=7.05e-06, step=7181] Training: 72%|███████▏ | 7182/10000 [1:35:04<34:03, 1.38it/s, loss=0.0268, lr=7.05e-06, step=7181] Training: 72%|███████▏ | 7182/10000 [1:35:04<34:03, 1.38it/s, loss=0.0044, lr=7.04e-06, step=7182] Training: 72%|███████▏ | 7183/10000 [1:35:05<35:31, 1.32it/s, loss=0.0044, lr=7.04e-06, step=7182] Training: 72%|███████▏ | 7183/10000 [1:35:05<35:31, 1.32it/s, loss=0.0105, lr=7.04e-06, step=7183] Training: 72%|███████▏ | 7184/10000 [1:35:06<36:31, 1.29it/s, loss=0.0105, lr=7.04e-06, step=7183] Training: 72%|███████▏ | 7184/10000 [1:35:06<36:31, 1.29it/s, loss=0.0124, lr=7.04e-06, step=7184] Training: 72%|███████▏ | 7185/10000 [1:35:06<33:03, 1.42it/s, loss=0.0124, lr=7.04e-06, step=7184] Training: 72%|███████▏ | 7185/10000 [1:35:06<33:03, 1.42it/s, loss=0.0096, lr=7.04e-06, step=7185] Training: 72%|███████▏ | 7186/10000 [1:35:08<38:40, 1.21it/s, loss=0.0096, lr=7.04e-06, step=7185] Training: 72%|███████▏ | 7186/10000 [1:35:08<38:40, 1.21it/s, loss=0.0220, lr=7.03e-06, step=7186] Training: 72%|███████▏ | 7187/10000 [1:35:08<37:49, 1.24it/s, loss=0.0220, lr=7.03e-06, step=7186] Training: 72%|███████▏ | 7187/10000 [1:35:08<37:49, 1.24it/s, loss=0.0132, lr=7.03e-06, step=7187] Training: 72%|███████▏ | 7188/10000 [1:35:09<38:01, 1.23it/s, loss=0.0132, lr=7.03e-06, step=7187] Training: 72%|███████▏ | 7188/10000 [1:35:09<38:01, 1.23it/s, loss=0.0139, lr=7.03e-06, step=7188] Training: 72%|███████▏ | 7189/10000 [1:35:10<37:27, 1.25it/s, loss=0.0139, lr=7.03e-06, step=7188] Training: 72%|███████▏ | 7189/10000 [1:35:10<37:27, 1.25it/s, loss=0.0015, lr=7.02e-06, step=7189]20:19:43.128 [I] step=7190 loss=0.0086 smoothed_loss=0.0110 lr=7.03e-06 grad_norm=0.4407 step_time=0.6289s data_time=0.1810s it/s=1.235 eta_to_10000=2275.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.0806 grad_arm_token_fuse=0.0529 grad_shared_expert=0.5773 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7190/10000 [1:35:11<39:10, 1.20it/s, loss=0.0015, lr=7.02e-06, step=7189] Training: 72%|███████▏ | 7190/10000 [1:35:11<39:10, 1.20it/s, loss=0.0086, lr=7.02e-06, step=7190] Training: 72%|███████▏ | 7191/10000 [1:35:12<37:55, 1.23it/s, loss=0.0086, lr=7.02e-06, step=7190] Training: 72%|███████▏ | 7191/10000 [1:35:12<37:55, 1.23it/s, loss=0.0067, lr=7.02e-06, step=7191] Training: 72%|███████▏ | 7192/10000 [1:35:12<33:29, 1.40it/s, loss=0.0067, lr=7.02e-06, step=7191] Training: 72%|███████▏ | 7192/10000 [1:35:12<33:29, 1.40it/s, loss=0.0115, lr=7.01e-06, step=7192] Training: 72%|███████▏ | 7193/10000 [1:35:13<39:08, 1.20it/s, loss=0.0115, lr=7.01e-06, step=7192] Training: 72%|███████▏ | 7193/10000 [1:35:13<39:08, 1.20it/s, loss=0.0190, lr=7.01e-06, step=7193] Training: 72%|███████▏ | 7194/10000 [1:35:14<39:03, 1.20it/s, loss=0.0190, lr=7.01e-06, step=7193] Training: 72%|███████▏ | 7194/10000 [1:35:14<39:03, 1.20it/s, loss=0.0046, lr=7.01e-06, step=7194] Training: 72%|███████▏ | 7195/10000 [1:35:14<34:23, 1.36it/s, loss=0.0046, lr=7.01e-06, step=7194] Training: 72%|███████▏ | 7195/10000 [1:35:14<34:23, 1.36it/s, loss=0.0223, lr=7.01e-06, step=7195] Training: 72%|███████▏ | 7196/10000 [1:35:15<36:26, 1.28it/s, loss=0.0223, lr=7.01e-06, step=7195] Training: 72%|███████▏ | 7196/10000 [1:35:15<36:26, 1.28it/s, loss=0.0105, lr=7.00e-06, step=7196] Training: 72%|███████▏ | 7197/10000 [1:35:16<36:19, 1.29it/s, loss=0.0105, lr=7.00e-06, step=7196] Training: 72%|███████▏ | 7197/10000 [1:35:16<36:19, 1.29it/s, loss=0.0136, lr=7.00e-06, step=7197] Training: 72%|███████▏ | 7198/10000 [1:35:17<35:41, 1.31it/s, loss=0.0136, lr=7.00e-06, step=7197] Training: 72%|███████▏ | 7198/10000 [1:35:17<35:41, 1.31it/s, loss=0.0029, lr=7.00e-06, step=7198] Training: 72%|███████▏ | 7199/10000 [1:35:18<37:16, 1.25it/s, loss=0.0029, lr=7.00e-06, step=7198] Training: 72%|███████▏ | 7199/10000 [1:35:18<37:16, 1.25it/s, loss=0.0074, lr=6.99e-06, step=7199]20:19:50.828 [I] step=7200 loss=0.0157 smoothed_loss=0.0112 lr=7.00e-06 grad_norm=0.4446 step_time=0.5932s data_time=0.1769s it/s=1.299 eta_to_10000=2155.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0946 grad_arm_token_fuse=0.0285 grad_shared_expert=0.3147 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7200/10000 [1:35:18<36:23, 1.28it/s, loss=0.0074, lr=6.99e-06, step=7199] Training: 72%|███████▏ | 7200/10000 [1:35:19<36:23, 1.28it/s, loss=0.0157, lr=6.99e-06, step=7200] Training: 72%|███████▏ | 7201/10000 [1:35:19<39:08, 1.19it/s, loss=0.0157, lr=6.99e-06, step=7200] Training: 72%|███████▏ | 7201/10000 [1:35:19<39:08, 1.19it/s, loss=0.0012, lr=6.99e-06, step=7201] Training: 72%|███████▏ | 7202/10000 [1:35:20<37:40, 1.24it/s, loss=0.0012, lr=6.99e-06, step=7201] Training: 72%|███████▏ | 7202/10000 [1:35:20<37:40, 1.24it/s, loss=0.0111, lr=6.98e-06, step=7202] Training: 72%|███████▏ | 7203/10000 [1:35:21<35:38, 1.31it/s, loss=0.0111, lr=6.98e-06, step=7202] Training: 72%|███████▏ | 7203/10000 [1:35:21<35:38, 1.31it/s, loss=0.0415, lr=6.98e-06, step=7203] Training: 72%|███████▏ | 7204/10000 [1:35:22<35:04, 1.33it/s, loss=0.0415, lr=6.98e-06, step=7203] Training: 72%|███████▏ | 7204/10000 [1:35:22<35:04, 1.33it/s, loss=0.0101, lr=6.98e-06, step=7204] Training: 72%|███████▏ | 7205/10000 [1:35:22<35:13, 1.32it/s, loss=0.0101, lr=6.98e-06, step=7204] Training: 72%|███████▏ | 7205/10000 [1:35:22<35:13, 1.32it/s, loss=0.0030, lr=6.98e-06, step=7205] Training: 72%|███████▏ | 7206/10000 [1:35:23<35:59, 1.29it/s, loss=0.0030, lr=6.98e-06, step=7205] Training: 72%|███████▏ | 7206/10000 [1:35:23<35:59, 1.29it/s, loss=0.0040, lr=6.97e-06, step=7206] Training: 72%|███████▏ | 7207/10000 [1:35:24<38:57, 1.19it/s, loss=0.0040, lr=6.97e-06, step=7206] Training: 72%|███████▏ | 7207/10000 [1:35:24<38:57, 1.19it/s, loss=0.0035, lr=6.97e-06, step=7207] Training: 72%|███████▏ | 7208/10000 [1:35:25<36:15, 1.28it/s, loss=0.0035, lr=6.97e-06, step=7207] Training: 72%|███████▏ | 7208/10000 [1:35:25<36:15, 1.28it/s, loss=0.0066, lr=6.97e-06, step=7208] Training: 72%|███████▏ | 7209/10000 [1:35:26<37:30, 1.24it/s, loss=0.0066, lr=6.97e-06, step=7208] Training: 72%|███████▏ | 7209/10000 [1:35:26<37:30, 1.24it/s, loss=0.0137, lr=6.96e-06, step=7209]20:19:58.737 [I] step=7210 loss=0.0157 smoothed_loss=0.0110 lr=6.97e-06 grad_norm=0.4660 step_time=0.6213s data_time=0.1696s it/s=1.265 eta_to_10000=2206.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.1027 grad_arm_token_fuse=0.0668 grad_shared_expert=0.4112 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7210/10000 [1:35:26<36:26, 1.28it/s, loss=0.0137, lr=6.96e-06, step=7209] Training: 72%|███████▏ | 7210/10000 [1:35:26<36:26, 1.28it/s, loss=0.0157, lr=6.96e-06, step=7210] Training: 72%|███████▏ | 7211/10000 [1:35:27<33:12, 1.40it/s, loss=0.0157, lr=6.96e-06, step=7210] Training: 72%|███████▏ | 7211/10000 [1:35:27<33:12, 1.40it/s, loss=0.0042, lr=6.96e-06, step=7211] Training: 72%|███████▏ | 7212/10000 [1:35:28<32:27, 1.43it/s, loss=0.0042, lr=6.96e-06, step=7211] Training: 72%|███████▏ | 7212/10000 [1:35:28<32:27, 1.43it/s, loss=0.0138, lr=6.96e-06, step=7212] Training: 72%|███████▏ | 7213/10000 [1:35:28<33:23, 1.39it/s, loss=0.0138, lr=6.96e-06, step=7212] Training: 72%|███████▏ | 7213/10000 [1:35:28<33:23, 1.39it/s, loss=0.0100, lr=6.95e-06, step=7213] Training: 72%|███████▏ | 7214/10000 [1:35:29<35:54, 1.29it/s, loss=0.0100, lr=6.95e-06, step=7213] Training: 72%|███████▏ | 7214/10000 [1:35:29<35:54, 1.29it/s, loss=0.0209, lr=6.95e-06, step=7214] Training: 72%|███████▏ | 7215/10000 [1:35:30<39:37, 1.17it/s, loss=0.0209, lr=6.95e-06, step=7214] Training: 72%|███████▏ | 7215/10000 [1:35:30<39:37, 1.17it/s, loss=0.0200, lr=6.95e-06, step=7215] Training: 72%|███████▏ | 7216/10000 [1:35:31<37:43, 1.23it/s, loss=0.0200, lr=6.95e-06, step=7215] Training: 72%|███████▏ | 7216/10000 [1:35:31<37:43, 1.23it/s, loss=0.0056, lr=6.94e-06, step=7216] Training: 72%|███████▏ | 7217/10000 [1:35:32<34:35, 1.34it/s, loss=0.0056, lr=6.94e-06, step=7216] Training: 72%|███████▏ | 7217/10000 [1:35:32<34:35, 1.34it/s, loss=0.0047, lr=6.94e-06, step=7217] Training: 72%|███████▏ | 7218/10000 [1:35:32<31:53, 1.45it/s, loss=0.0047, lr=6.94e-06, step=7217] Training: 72%|███████▏ | 7218/10000 [1:35:32<31:53, 1.45it/s, loss=0.0158, lr=6.94e-06, step=7218] Training: 72%|███████▏ | 7219/10000 [1:35:33<35:16, 1.31it/s, loss=0.0158, lr=6.94e-06, step=7218] Training: 72%|███████▏ | 7219/10000 [1:35:33<35:16, 1.31it/s, loss=0.0074, lr=6.93e-06, step=7219]20:20:06.311 [I] step=7220 loss=0.0130 smoothed_loss=0.0113 lr=6.94e-06 grad_norm=0.4250 step_time=0.5948s data_time=0.1625s it/s=1.321 eta_to_10000=2105.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0062 grad_action_out_proj_arms=0.0751 grad_arm_token_fuse=0.0317 grad_shared_expert=0.3314 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7220/10000 [1:35:34<36:38, 1.26it/s, loss=0.0074, lr=6.93e-06, step=7219] Training: 72%|███████▏ | 7220/10000 [1:35:34<36:38, 1.26it/s, loss=0.0130, lr=6.93e-06, step=7220] Training: 72%|███████▏ | 7221/10000 [1:35:35<39:50, 1.16it/s, loss=0.0130, lr=6.93e-06, step=7220] Training: 72%|███████▏ | 7221/10000 [1:35:35<39:50, 1.16it/s, loss=0.0045, lr=6.93e-06, step=7221] Training: 72%|███████▏ | 7222/10000 [1:35:36<43:37, 1.06it/s, loss=0.0045, lr=6.93e-06, step=7221] Training: 72%|███████▏ | 7222/10000 [1:35:36<43:37, 1.06it/s, loss=0.0034, lr=6.93e-06, step=7222] Training: 72%|███████▏ | 7223/10000 [1:35:37<42:34, 1.09it/s, loss=0.0034, lr=6.93e-06, step=7222] Training: 72%|███████▏ | 7223/10000 [1:35:37<42:34, 1.09it/s, loss=0.0214, lr=6.92e-06, step=7223] Training: 72%|███████▏ | 7224/10000 [1:35:38<44:04, 1.05it/s, loss=0.0214, lr=6.92e-06, step=7223] Training: 72%|███████▏ | 7224/10000 [1:35:38<44:04, 1.05it/s, loss=0.0010, lr=6.92e-06, step=7224] Training: 72%|███████▏ | 7225/10000 [1:35:39<44:31, 1.04it/s, loss=0.0010, lr=6.92e-06, step=7224] Training: 72%|███████▏ | 7225/10000 [1:35:39<44:31, 1.04it/s, loss=0.0048, lr=6.92e-06, step=7225] Training: 72%|███████▏ | 7226/10000 [1:35:40<42:53, 1.08it/s, loss=0.0048, lr=6.92e-06, step=7225] Training: 72%|███████▏ | 7226/10000 [1:35:40<42:53, 1.08it/s, loss=0.0021, lr=6.91e-06, step=7226] Training: 72%|███████▏ | 7227/10000 [1:35:41<45:41, 1.01it/s, loss=0.0021, lr=6.91e-06, step=7226] Training: 72%|███████▏ | 7227/10000 [1:35:41<45:41, 1.01it/s, loss=0.0077, lr=6.91e-06, step=7227] Training: 72%|███████▏ | 7228/10000 [1:35:42<47:08, 1.02s/it, loss=0.0077, lr=6.91e-06, step=7227] Training: 72%|███████▏ | 7228/10000 [1:35:42<47:08, 1.02s/it, loss=0.0041, lr=6.91e-06, step=7228] Training: 72%|███████▏ | 7229/10000 [1:35:43<51:45, 1.12s/it, loss=0.0041, lr=6.91e-06, step=7228] Training: 72%|███████▏ | 7229/10000 [1:35:43<51:45, 1.12s/it, loss=0.0018, lr=6.90e-06, step=7229]20:20:16.862 [I] step=7230 loss=0.0115 smoothed_loss=0.0080 lr=6.92e-06 grad_norm=0.3271 step_time=0.7635s data_time=0.2916s it/s=0.948 eta_to_10000=2922.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.0913 grad_arm_token_fuse=0.0663 grad_shared_expert=0.2771 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7230/10000 [1:35:45<51:15, 1.11s/it, loss=0.0018, lr=6.90e-06, step=7229] Training: 72%|███████▏ | 7230/10000 [1:35:45<51:15, 1.11s/it, loss=0.0115, lr=6.90e-06, step=7230] Training: 72%|███████▏ | 7231/10000 [1:35:45<47:48, 1.04s/it, loss=0.0115, lr=6.90e-06, step=7230] Training: 72%|███████▏ | 7231/10000 [1:35:45<47:48, 1.04s/it, loss=0.0072, lr=6.90e-06, step=7231] Training: 72%|███████▏ | 7232/10000 [1:35:46<46:44, 1.01s/it, loss=0.0072, lr=6.90e-06, step=7231] Training: 72%|███████▏ | 7232/10000 [1:35:46<46:44, 1.01s/it, loss=0.0150, lr=6.90e-06, step=7232] Training: 72%|███████▏ | 7233/10000 [1:35:47<44:55, 1.03it/s, loss=0.0150, lr=6.90e-06, step=7232] Training: 72%|███████▏ | 7233/10000 [1:35:47<44:55, 1.03it/s, loss=0.0058, lr=6.89e-06, step=7233] Training: 72%|███████▏ | 7234/10000 [1:35:48<41:41, 1.11it/s, loss=0.0058, lr=6.89e-06, step=7233] Training: 72%|███████▏ | 7234/10000 [1:35:48<41:41, 1.11it/s, loss=0.0050, lr=6.89e-06, step=7234] Training: 72%|███████▏ | 7235/10000 [1:35:49<40:18, 1.14it/s, loss=0.0050, lr=6.89e-06, step=7234] Training: 72%|███████▏ | 7235/10000 [1:35:49<40:18, 1.14it/s, loss=0.0008, lr=6.89e-06, step=7235] Training: 72%|███████▏ | 7236/10000 [1:35:49<37:37, 1.22it/s, loss=0.0008, lr=6.89e-06, step=7235] Training: 72%|███████▏ | 7236/10000 [1:35:49<37:37, 1.22it/s, loss=0.0066, lr=6.88e-06, step=7236] Training: 72%|███████▏ | 7237/10000 [1:35:50<32:46, 1.41it/s, loss=0.0066, lr=6.88e-06, step=7236] Training: 72%|███████▏ | 7237/10000 [1:35:50<32:46, 1.41it/s, loss=0.0046, lr=6.88e-06, step=7237] Training: 72%|███████▏ | 7238/10000 [1:35:51<36:05, 1.28it/s, loss=0.0046, lr=6.88e-06, step=7237] Training: 72%|███████▏ | 7238/10000 [1:35:51<36:05, 1.28it/s, loss=0.0206, lr=6.88e-06, step=7238] Training: 72%|███████▏ | 7239/10000 [1:35:52<35:30, 1.30it/s, loss=0.0206, lr=6.88e-06, step=7238] Training: 72%|███████▏ | 7239/10000 [1:35:52<35:30, 1.30it/s, loss=0.0051, lr=6.88e-06, step=7239]20:20:24.838 [I] step=7240 loss=0.0049 smoothed_loss=0.0077 lr=6.89e-06 grad_norm=0.4429 step_time=0.6192s data_time=0.1784s it/s=1.254 eta_to_10000=2201.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0085 grad_action_out_proj_arms=0.0676 grad_arm_token_fuse=0.0417 grad_shared_expert=0.2513 (18633:train_pytorch.py:850) + Training: 72%|███████▏ | 7240/10000 [1:35:53<36:59, 1.24it/s, loss=0.0051, lr=6.88e-06, step=7239] Training: 72%|███████▏ | 7240/10000 [1:35:53<36:59, 1.24it/s, loss=0.0049, lr=6.87e-06, step=7240] Training: 72%|███████▏ | 7241/10000 [1:35:53<33:58, 1.35it/s, loss=0.0049, lr=6.87e-06, step=7240] Training: 72%|███████▏ | 7241/10000 [1:35:53<33:58, 1.35it/s, loss=0.0161, lr=6.87e-06, step=7241] Training: 72%|███████▏ | 7242/10000 [1:35:54<34:53, 1.32it/s, loss=0.0161, lr=6.87e-06, step=7241] Training: 72%|███████▏ | 7242/10000 [1:35:54<34:53, 1.32it/s, loss=0.0558, lr=6.87e-06, step=7242] Training: 72%|███████▏ | 7243/10000 [1:35:55<37:54, 1.21it/s, loss=0.0558, lr=6.87e-06, step=7242] Training: 72%|███████▏ | 7243/10000 [1:35:55<37:54, 1.21it/s, loss=0.0027, lr=6.86e-06, step=7243] Training: 72%|███████▏ | 7244/10000 [1:35:56<37:57, 1.21it/s, loss=0.0027, lr=6.86e-06, step=7243] Training: 72%|███████▏ | 7244/10000 [1:35:56<37:57, 1.21it/s, loss=0.0017, lr=6.86e-06, step=7244] Training: 72%|███████▏ | 7245/10000 [1:35:57<40:49, 1.12it/s, loss=0.0017, lr=6.86e-06, step=7244] Training: 72%|███████▏ | 7245/10000 [1:35:57<40:49, 1.12it/s, loss=0.0094, lr=6.86e-06, step=7245] Training: 72%|███████▏ | 7246/10000 [1:35:58<42:33, 1.08it/s, loss=0.0094, lr=6.86e-06, step=7245] Training: 72%|███████▏ | 7246/10000 [1:35:58<42:33, 1.08it/s, loss=0.0033, lr=6.85e-06, step=7246] Training: 72%|███████▏ | 7247/10000 [1:35:58<39:17, 1.17it/s, loss=0.0033, lr=6.85e-06, step=7246] Training: 72%|███████▏ | 7247/10000 [1:35:58<39:17, 1.17it/s, loss=0.0065, lr=6.85e-06, step=7247] Training: 72%|███████▏ | 7248/10000 [1:35:59<39:18, 1.17it/s, loss=0.0065, lr=6.85e-06, step=7247] Training: 72%|███████▏ | 7248/10000 [1:35:59<39:18, 1.17it/s, loss=0.0186, lr=6.85e-06, step=7248] Training: 72%|███████▏ | 7249/10000 [1:36:00<35:01, 1.31it/s, loss=0.0186, lr=6.85e-06, step=7248] Training: 72%|███████▏ | 7249/10000 [1:36:00<35:01, 1.31it/s, loss=0.0103, lr=6.85e-06, step=7249]20:20:33.334 [I] step=7250 loss=0.0131 smoothed_loss=0.0109 lr=6.86e-06 grad_norm=0.4481 step_time=0.6401s data_time=0.2094s it/s=1.177 eta_to_10000=2336.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0320 grad_action_out_proj_arms=0.1535 grad_arm_token_fuse=0.1651 grad_shared_expert=0.5751 (18633:train_pytorch.py:850) + Training: 72%|███████▎ | 7250/10000 [1:36:01<40:17, 1.14it/s, loss=0.0103, lr=6.85e-06, step=7249] Training: 72%|███████▎ | 7250/10000 [1:36:01<40:17, 1.14it/s, loss=0.0131, lr=6.84e-06, step=7250] Training: 73%|███████▎ | 7251/10000 [1:36:02<39:07, 1.17it/s, loss=0.0131, lr=6.84e-06, step=7250] Training: 73%|███████▎ | 7251/10000 [1:36:02<39:07, 1.17it/s, loss=0.0015, lr=6.84e-06, step=7251] Training: 73%|███████▎ | 7252/10000 [1:36:03<40:50, 1.12it/s, loss=0.0015, lr=6.84e-06, step=7251] Training: 73%|███████▎ | 7252/10000 [1:36:03<40:50, 1.12it/s, loss=0.0093, lr=6.84e-06, step=7252] Training: 73%|███████▎ | 7253/10000 [1:36:04<40:50, 1.12it/s, loss=0.0093, lr=6.84e-06, step=7252] Training: 73%|███████▎ | 7253/10000 [1:36:04<40:50, 1.12it/s, loss=0.0058, lr=6.83e-06, step=7253] Training: 73%|███████▎ | 7254/10000 [1:36:04<39:47, 1.15it/s, loss=0.0058, lr=6.83e-06, step=7253] Training: 73%|███████▎ | 7254/10000 [1:36:04<39:47, 1.15it/s, loss=0.0162, lr=6.83e-06, step=7254] Training: 73%|███████▎ | 7255/10000 [1:36:05<38:34, 1.19it/s, loss=0.0162, lr=6.83e-06, step=7254] Training: 73%|███████▎ | 7255/10000 [1:36:05<38:34, 1.19it/s, loss=0.0039, lr=6.83e-06, step=7255] Training: 73%|███████▎ | 7256/10000 [1:36:06<38:07, 1.20it/s, loss=0.0039, lr=6.83e-06, step=7255] Training: 73%|███████▎ | 7256/10000 [1:36:06<38:07, 1.20it/s, loss=0.0030, lr=6.83e-06, step=7256] Training: 73%|███████▎ | 7257/10000 [1:36:07<39:41, 1.15it/s, loss=0.0030, lr=6.83e-06, step=7256] Training: 73%|███████▎ | 7257/10000 [1:36:07<39:41, 1.15it/s, loss=0.0033, lr=6.82e-06, step=7257] Training: 73%|███████▎ | 7258/10000 [1:36:08<38:16, 1.19it/s, loss=0.0033, lr=6.82e-06, step=7257] Training: 73%|███████▎ | 7258/10000 [1:36:08<38:16, 1.19it/s, loss=0.0043, lr=6.82e-06, step=7258] Training: 73%|███████▎ | 7259/10000 [1:36:09<40:26, 1.13it/s, loss=0.0043, lr=6.82e-06, step=7258] Training: 73%|███████▎ | 7259/10000 [1:36:09<40:26, 1.13it/s, loss=0.0051, lr=6.82e-06, step=7259]20:20:41.660 [I] step=7260 loss=0.0017 smoothed_loss=0.0070 lr=6.83e-06 grad_norm=0.3867 step_time=0.6432s data_time=0.1895s it/s=1.201 eta_to_10000=2281.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0100 grad_action_out_proj_arms=0.0938 grad_arm_token_fuse=0.0510 grad_shared_expert=0.4615 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7260/10000 [1:36:09<35:38, 1.28it/s, loss=0.0051, lr=6.82e-06, step=7259] Training: 73%|███████▎ | 7260/10000 [1:36:09<35:38, 1.28it/s, loss=0.0017, lr=6.81e-06, step=7260] Training: 73%|███████▎ | 7261/10000 [1:36:10<32:23, 1.41it/s, loss=0.0017, lr=6.81e-06, step=7260] Training: 73%|███████▎ | 7261/10000 [1:36:10<32:23, 1.41it/s, loss=0.0036, lr=6.81e-06, step=7261] Training: 73%|███████▎ | 7262/10000 [1:36:10<29:58, 1.52it/s, loss=0.0036, lr=6.81e-06, step=7261] Training: 73%|███████▎ | 7262/10000 [1:36:10<29:58, 1.52it/s, loss=0.0129, lr=6.81e-06, step=7262] Training: 73%|███████▎ | 7263/10000 [1:36:11<30:21, 1.50it/s, loss=0.0129, lr=6.81e-06, step=7262] Training: 73%|███████▎ | 7263/10000 [1:36:11<30:21, 1.50it/s, loss=0.0030, lr=6.80e-06, step=7263] Training: 73%|███████▎ | 7264/10000 [1:36:12<34:09, 1.33it/s, loss=0.0030, lr=6.80e-06, step=7263] Training: 73%|███████▎ | 7264/10000 [1:36:12<34:09, 1.33it/s, loss=0.0042, lr=6.80e-06, step=7264] Training: 73%|███████▎ | 7265/10000 [1:36:13<33:52, 1.35it/s, loss=0.0042, lr=6.80e-06, step=7264] Training: 73%|███████▎ | 7265/10000 [1:36:13<33:52, 1.35it/s, loss=0.0147, lr=6.80e-06, step=7265] Training: 73%|███████▎ | 7266/10000 [1:36:14<38:14, 1.19it/s, loss=0.0147, lr=6.80e-06, step=7265] Training: 73%|███████▎ | 7266/10000 [1:36:14<38:14, 1.19it/s, loss=0.0051, lr=6.80e-06, step=7266] Training: 73%|███████▎ | 7267/10000 [1:36:15<38:11, 1.19it/s, loss=0.0051, lr=6.80e-06, step=7266] Training: 73%|███████▎ | 7267/10000 [1:36:15<38:11, 1.19it/s, loss=0.0157, lr=6.79e-06, step=7267] Training: 73%|███████▎ | 7268/10000 [1:36:15<34:20, 1.33it/s, loss=0.0157, lr=6.79e-06, step=7267] Training: 73%|███████▎ | 7268/10000 [1:36:15<34:20, 1.33it/s, loss=0.0026, lr=6.79e-06, step=7268] Training: 73%|███████▎ | 7269/10000 [1:36:16<31:05, 1.46it/s, loss=0.0026, lr=6.79e-06, step=7268] Training: 73%|███████▎ | 7269/10000 [1:36:16<31:05, 1.46it/s, loss=0.0083, lr=6.79e-06, step=7269]20:20:48.627 [I] step=7270 loss=0.0140 smoothed_loss=0.0082 lr=6.80e-06 grad_norm=0.3797 step_time=0.5651s data_time=0.1315s it/s=1.436 eta_to_10000=1901.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0094 grad_action_out_proj_arms=0.0773 grad_arm_token_fuse=0.0478 grad_shared_expert=0.3073 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7270/10000 [1:36:16<29:19, 1.55it/s, loss=0.0083, lr=6.79e-06, step=7269] Training: 73%|███████▎ | 7270/10000 [1:36:16<29:19, 1.55it/s, loss=0.0140, lr=6.78e-06, step=7270] Training: 73%|███████▎ | 7271/10000 [1:36:17<34:42, 1.31it/s, loss=0.0140, lr=6.78e-06, step=7270] Training: 73%|███████▎ | 7271/10000 [1:36:17<34:42, 1.31it/s, loss=0.0193, lr=6.78e-06, step=7271] Training: 73%|███████▎ | 7272/10000 [1:36:19<40:47, 1.11it/s, loss=0.0193, lr=6.78e-06, step=7271] Training: 73%|███████▎ | 7272/10000 [1:36:19<40:47, 1.11it/s, loss=0.0217, lr=6.78e-06, step=7272] Training: 73%|███████▎ | 7273/10000 [1:36:19<41:30, 1.10it/s, loss=0.0217, lr=6.78e-06, step=7272] Training: 73%|███████▎ | 7273/10000 [1:36:19<41:30, 1.10it/s, loss=0.0061, lr=6.78e-06, step=7273] Training: 73%|███████▎ | 7274/10000 [1:36:20<41:48, 1.09it/s, loss=0.0061, lr=6.78e-06, step=7273] Training: 73%|███████▎ | 7274/10000 [1:36:20<41:48, 1.09it/s, loss=0.0115, lr=6.77e-06, step=7274] Training: 73%|███████▎ | 7275/10000 [1:36:21<40:10, 1.13it/s, loss=0.0115, lr=6.77e-06, step=7274] Training: 73%|███████▎ | 7275/10000 [1:36:21<40:10, 1.13it/s, loss=0.0093, lr=6.77e-06, step=7275] Training: 73%|███████▎ | 7276/10000 [1:36:22<35:01, 1.30it/s, loss=0.0093, lr=6.77e-06, step=7275] Training: 73%|███████▎ | 7276/10000 [1:36:22<35:01, 1.30it/s, loss=0.0052, lr=6.77e-06, step=7276] Training: 73%|███████▎ | 7277/10000 [1:36:22<30:59, 1.46it/s, loss=0.0052, lr=6.77e-06, step=7276] Training: 73%|███████▎ | 7277/10000 [1:36:22<30:59, 1.46it/s, loss=0.0055, lr=6.76e-06, step=7277] Training: 73%|███████▎ | 7278/10000 [1:36:23<34:27, 1.32it/s, loss=0.0055, lr=6.76e-06, step=7277] Training: 73%|███████▎ | 7278/10000 [1:36:23<34:27, 1.32it/s, loss=0.0262, lr=6.76e-06, step=7278] Training: 73%|███████▎ | 7279/10000 [1:36:24<38:00, 1.19it/s, loss=0.0262, lr=6.76e-06, step=7278] Training: 73%|███████▎ | 7279/10000 [1:36:24<38:00, 1.19it/s, loss=0.0037, lr=6.76e-06, step=7279]20:20:57.280 [I] step=7280 loss=0.0070 smoothed_loss=0.0099 lr=6.77e-06 grad_norm=0.4625 step_time=0.6662s data_time=0.1991s it/s=1.156 eta_to_10000=2353.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0131 grad_action_out_proj_arms=0.0982 grad_arm_token_fuse=0.0605 grad_shared_expert=0.3183 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7280/10000 [1:36:25<37:08, 1.22it/s, loss=0.0037, lr=6.76e-06, step=7279] Training: 73%|███████▎ | 7280/10000 [1:36:25<37:08, 1.22it/s, loss=0.0070, lr=6.76e-06, step=7280] Training: 73%|███████▎ | 7281/10000 [1:36:26<36:49, 1.23it/s, loss=0.0070, lr=6.76e-06, step=7280] Training: 73%|███████▎ | 7281/10000 [1:36:26<36:49, 1.23it/s, loss=0.0058, lr=6.75e-06, step=7281] Training: 73%|███████▎ | 7282/10000 [1:36:27<36:19, 1.25it/s, loss=0.0058, lr=6.75e-06, step=7281] Training: 73%|███████▎ | 7282/10000 [1:36:27<36:19, 1.25it/s, loss=0.0073, lr=6.75e-06, step=7282] Training: 73%|███████▎ | 7283/10000 [1:36:27<35:48, 1.26it/s, loss=0.0073, lr=6.75e-06, step=7282] Training: 73%|███████▎ | 7283/10000 [1:36:27<35:48, 1.26it/s, loss=0.0043, lr=6.75e-06, step=7283] Training: 73%|███████▎ | 7284/10000 [1:36:28<32:00, 1.41it/s, loss=0.0043, lr=6.75e-06, step=7283] Training: 73%|███████▎ | 7284/10000 [1:36:28<32:00, 1.41it/s, loss=0.0109, lr=6.74e-06, step=7284] Training: 73%|███████▎ | 7285/10000 [1:36:29<33:16, 1.36it/s, loss=0.0109, lr=6.74e-06, step=7284] Training: 73%|███████▎ | 7285/10000 [1:36:29<33:16, 1.36it/s, loss=0.0066, lr=6.74e-06, step=7285] Training: 73%|███████▎ | 7286/10000 [1:36:29<34:08, 1.32it/s, loss=0.0066, lr=6.74e-06, step=7285] Training: 73%|███████▎ | 7286/10000 [1:36:29<34:08, 1.32it/s, loss=0.0109, lr=6.74e-06, step=7286] Training: 73%|███████▎ | 7287/10000 [1:36:30<30:26, 1.49it/s, loss=0.0109, lr=6.74e-06, step=7286] Training: 73%|███████▎ | 7287/10000 [1:36:30<30:26, 1.49it/s, loss=0.0098, lr=6.73e-06, step=7287] Training: 73%|███████▎ | 7288/10000 [1:36:31<36:14, 1.25it/s, loss=0.0098, lr=6.73e-06, step=7287] Training: 73%|███████▎ | 7288/10000 [1:36:31<36:14, 1.25it/s, loss=0.0108, lr=6.73e-06, step=7288] Training: 73%|███████▎ | 7289/10000 [1:36:31<32:05, 1.41it/s, loss=0.0108, lr=6.73e-06, step=7288] Training: 73%|███████▎ | 7289/10000 [1:36:31<32:05, 1.41it/s, loss=0.0027, lr=6.73e-06, step=7289]20:21:04.427 [I] step=7290 loss=0.0055 smoothed_loss=0.0083 lr=6.74e-06 grad_norm=0.4991 step_time=0.5777s data_time=0.1371s it/s=1.400 eta_to_10000=1935.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0053 grad_action_out_proj_arms=0.0647 grad_arm_token_fuse=0.0275 grad_shared_expert=0.2602 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7290/10000 [1:36:32<30:43, 1.47it/s, loss=0.0027, lr=6.73e-06, step=7289] Training: 73%|███████▎ | 7290/10000 [1:36:32<30:43, 1.47it/s, loss=0.0055, lr=6.73e-06, step=7290] Training: 73%|███████▎ | 7291/10000 [1:36:33<28:02, 1.61it/s, loss=0.0055, lr=6.73e-06, step=7290] Training: 73%|███████▎ | 7291/10000 [1:36:33<28:02, 1.61it/s, loss=0.0088, lr=6.72e-06, step=7291] Training: 73%|███████▎ | 7292/10000 [1:36:33<28:44, 1.57it/s, loss=0.0088, lr=6.72e-06, step=7291] Training: 73%|███████▎ | 7292/10000 [1:36:33<28:44, 1.57it/s, loss=0.0048, lr=6.72e-06, step=7292] Training: 73%|███████▎ | 7293/10000 [1:36:34<33:41, 1.34it/s, loss=0.0048, lr=6.72e-06, step=7292] Training: 73%|███████▎ | 7293/10000 [1:36:34<33:41, 1.34it/s, loss=0.0071, lr=6.72e-06, step=7293] Training: 73%|███████▎ | 7294/10000 [1:36:35<30:18, 1.49it/s, loss=0.0071, lr=6.72e-06, step=7293] Training: 73%|███████▎ | 7294/10000 [1:36:35<30:18, 1.49it/s, loss=0.0555, lr=6.71e-06, step=7294] Training: 73%|███████▎ | 7295/10000 [1:36:35<29:38, 1.52it/s, loss=0.0555, lr=6.71e-06, step=7294] Training: 73%|███████▎ | 7295/10000 [1:36:35<29:38, 1.52it/s, loss=0.0779, lr=6.71e-06, step=7295] Training: 73%|███████▎ | 7296/10000 [1:36:36<29:44, 1.52it/s, loss=0.0779, lr=6.71e-06, step=7295] Training: 73%|███████▎ | 7296/10000 [1:36:36<29:44, 1.52it/s, loss=0.0066, lr=6.71e-06, step=7296] Training: 73%|███████▎ | 7297/10000 [1:36:37<33:07, 1.36it/s, loss=0.0066, lr=6.71e-06, step=7296] Training: 73%|███████▎ | 7297/10000 [1:36:37<33:07, 1.36it/s, loss=0.0031, lr=6.71e-06, step=7297] Training: 73%|███████▎ | 7298/10000 [1:36:38<30:32, 1.47it/s, loss=0.0031, lr=6.71e-06, step=7297] Training: 73%|███████▎ | 7298/10000 [1:36:38<30:32, 1.47it/s, loss=0.0015, lr=6.70e-06, step=7298] Training: 73%|███████▎ | 7299/10000 [1:36:38<28:25, 1.58it/s, loss=0.0015, lr=6.70e-06, step=7298] Training: 73%|███████▎ | 7299/10000 [1:36:38<28:25, 1.58it/s, loss=0.0035, lr=6.70e-06, step=7299]20:21:11.458 [I] step=7300 loss=0.0259 smoothed_loss=0.0150 lr=6.71e-06 grad_norm=0.4550 step_time=0.5600s data_time=0.1431s it/s=1.422 eta_to_10000=1898.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0225 grad_action_out_proj_arms=0.1237 grad_arm_token_fuse=0.1142 grad_shared_expert=0.4718 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7300/10000 [1:36:39<34:48, 1.29it/s, loss=0.0035, lr=6.70e-06, step=7299] Training: 73%|███████▎ | 7300/10000 [1:36:39<34:48, 1.29it/s, loss=0.0259, lr=6.70e-06, step=7300] Training: 73%|███████▎ | 7301/10000 [1:36:40<30:47, 1.46it/s, loss=0.0259, lr=6.70e-06, step=7300] Training: 73%|███████▎ | 7301/10000 [1:36:40<30:47, 1.46it/s, loss=0.0060, lr=6.69e-06, step=7301] Training: 73%|███████▎ | 7302/10000 [1:36:40<28:18, 1.59it/s, loss=0.0060, lr=6.69e-06, step=7301] Training: 73%|███████▎ | 7302/10000 [1:36:40<28:18, 1.59it/s, loss=0.0071, lr=6.69e-06, step=7302] Training: 73%|███████▎ | 7303/10000 [1:36:41<34:29, 1.30it/s, loss=0.0071, lr=6.69e-06, step=7302] Training: 73%|███████▎ | 7303/10000 [1:36:41<34:29, 1.30it/s, loss=0.0092, lr=6.69e-06, step=7303] Training: 73%|███████▎ | 7304/10000 [1:36:42<33:17, 1.35it/s, loss=0.0092, lr=6.69e-06, step=7303] Training: 73%|███████▎ | 7304/10000 [1:36:42<33:17, 1.35it/s, loss=0.0317, lr=6.69e-06, step=7304] Training: 73%|███████▎ | 7305/10000 [1:36:43<32:21, 1.39it/s, loss=0.0317, lr=6.69e-06, step=7304] Training: 73%|███████▎ | 7305/10000 [1:36:43<32:21, 1.39it/s, loss=0.0069, lr=6.68e-06, step=7305] Training: 73%|███████▎ | 7306/10000 [1:36:43<35:14, 1.27it/s, loss=0.0069, lr=6.68e-06, step=7305] Training: 73%|███████▎ | 7306/10000 [1:36:43<35:14, 1.27it/s, loss=0.0061, lr=6.68e-06, step=7306] Training: 73%|███████▎ | 7307/10000 [1:36:44<36:06, 1.24it/s, loss=0.0061, lr=6.68e-06, step=7306] Training: 73%|███████▎ | 7307/10000 [1:36:44<36:06, 1.24it/s, loss=0.0017, lr=6.68e-06, step=7307] Training: 73%|███████▎ | 7308/10000 [1:36:45<38:00, 1.18it/s, loss=0.0017, lr=6.68e-06, step=7307] Training: 73%|███████▎ | 7308/10000 [1:36:45<38:00, 1.18it/s, loss=0.0036, lr=6.67e-06, step=7308] Training: 73%|███████▎ | 7309/10000 [1:36:46<33:14, 1.35it/s, loss=0.0036, lr=6.67e-06, step=7308] Training: 73%|███████▎ | 7309/10000 [1:36:46<33:14, 1.35it/s, loss=0.0045, lr=6.67e-06, step=7309]20:21:18.766 [I] step=7310 loss=0.0030 smoothed_loss=0.0098 lr=6.68e-06 grad_norm=0.4271 step_time=0.5679s data_time=0.1629s it/s=1.369 eta_to_10000=1965.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0273 grad_action_out_proj_arms=0.1288 grad_arm_token_fuse=0.1379 grad_shared_expert=0.5333 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7310/10000 [1:36:46<32:10, 1.39it/s, loss=0.0045, lr=6.67e-06, step=7309] Training: 73%|███████▎ | 7310/10000 [1:36:46<32:10, 1.39it/s, loss=0.0030, lr=6.67e-06, step=7310] Training: 73%|███████▎ | 7311/10000 [1:36:47<35:50, 1.25it/s, loss=0.0030, lr=6.67e-06, step=7310] Training: 73%|███████▎ | 7311/10000 [1:36:47<35:50, 1.25it/s, loss=0.0129, lr=6.67e-06, step=7311] Training: 73%|███████▎ | 7312/10000 [1:36:48<31:30, 1.42it/s, loss=0.0129, lr=6.67e-06, step=7311] Training: 73%|███████▎ | 7312/10000 [1:36:48<31:30, 1.42it/s, loss=0.0072, lr=6.66e-06, step=7312] Training: 73%|███████▎ | 7313/10000 [1:36:49<33:24, 1.34it/s, loss=0.0072, lr=6.66e-06, step=7312] Training: 73%|███████▎ | 7313/10000 [1:36:49<33:24, 1.34it/s, loss=0.0061, lr=6.66e-06, step=7313] Training: 73%|███████▎ | 7314/10000 [1:36:49<33:22, 1.34it/s, loss=0.0061, lr=6.66e-06, step=7313] Training: 73%|███████▎ | 7314/10000 [1:36:49<33:22, 1.34it/s, loss=0.0016, lr=6.66e-06, step=7314] Training: 73%|███████▎ | 7315/10000 [1:36:50<32:25, 1.38it/s, loss=0.0016, lr=6.66e-06, step=7314] Training: 73%|███████▎ | 7315/10000 [1:36:50<32:25, 1.38it/s, loss=0.0028, lr=6.65e-06, step=7315] Training: 73%|███████▎ | 7316/10000 [1:36:51<34:24, 1.30it/s, loss=0.0028, lr=6.65e-06, step=7315] Training: 73%|███████▎ | 7316/10000 [1:36:51<34:24, 1.30it/s, loss=0.0097, lr=6.65e-06, step=7316] Training: 73%|███████▎ | 7317/10000 [1:36:52<35:51, 1.25it/s, loss=0.0097, lr=6.65e-06, step=7316] Training: 73%|███████▎ | 7317/10000 [1:36:52<35:51, 1.25it/s, loss=0.0043, lr=6.65e-06, step=7317] Training: 73%|███████▎ | 7318/10000 [1:36:53<33:55, 1.32it/s, loss=0.0043, lr=6.65e-06, step=7317] Training: 73%|███████▎ | 7318/10000 [1:36:53<33:55, 1.32it/s, loss=0.0483, lr=6.65e-06, step=7318] Training: 73%|███████▎ | 7319/10000 [1:36:53<30:14, 1.48it/s, loss=0.0483, lr=6.65e-06, step=7318] Training: 73%|███████▎ | 7319/10000 [1:36:53<30:14, 1.48it/s, loss=0.0031, lr=6.64e-06, step=7319]20:21:26.160 [I] step=7320 loss=0.0009 smoothed_loss=0.0100 lr=6.65e-06 grad_norm=0.4181 step_time=0.5968s data_time=0.1426s it/s=1.353 eta_to_10000=1981.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0153 grad_action_out_proj_arms=0.1244 grad_arm_token_fuse=0.0750 grad_shared_expert=0.6287 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7320/10000 [1:36:54<31:24, 1.42it/s, loss=0.0031, lr=6.64e-06, step=7319] Training: 73%|███████▎ | 7320/10000 [1:36:54<31:24, 1.42it/s, loss=0.0009, lr=6.64e-06, step=7320] Training: 73%|███████▎ | 7321/10000 [1:36:55<34:32, 1.29it/s, loss=0.0009, lr=6.64e-06, step=7320] Training: 73%|███████▎ | 7321/10000 [1:36:55<34:32, 1.29it/s, loss=0.0105, lr=6.64e-06, step=7321] Training: 73%|███████▎ | 7322/10000 [1:36:55<33:19, 1.34it/s, loss=0.0105, lr=6.64e-06, step=7321] Training: 73%|███████▎ | 7322/10000 [1:36:55<33:19, 1.34it/s, loss=0.0106, lr=6.63e-06, step=7322] Training: 73%|███████▎ | 7323/10000 [1:36:56<35:02, 1.27it/s, loss=0.0106, lr=6.63e-06, step=7322] Training: 73%|███████▎ | 7323/10000 [1:36:56<35:02, 1.27it/s, loss=0.0050, lr=6.63e-06, step=7323] Training: 73%|███████▎ | 7324/10000 [1:36:57<33:52, 1.32it/s, loss=0.0050, lr=6.63e-06, step=7323] Training: 73%|███████▎ | 7324/10000 [1:36:57<33:52, 1.32it/s, loss=0.0017, lr=6.63e-06, step=7324] Training: 73%|███████▎ | 7325/10000 [1:36:58<37:36, 1.19it/s, loss=0.0017, lr=6.63e-06, step=7324] Training: 73%|███████▎ | 7325/10000 [1:36:58<37:36, 1.19it/s, loss=0.0011, lr=6.62e-06, step=7325] Training: 73%|███████▎ | 7326/10000 [1:36:59<35:11, 1.27it/s, loss=0.0011, lr=6.62e-06, step=7325] Training: 73%|███████▎ | 7326/10000 [1:36:59<35:11, 1.27it/s, loss=0.0053, lr=6.62e-06, step=7326] Training: 73%|███████▎ | 7327/10000 [1:37:00<35:27, 1.26it/s, loss=0.0053, lr=6.62e-06, step=7326] Training: 73%|███████▎ | 7327/10000 [1:37:00<35:27, 1.26it/s, loss=0.0069, lr=6.62e-06, step=7327] Training: 73%|███████▎ | 7328/10000 [1:37:00<33:50, 1.32it/s, loss=0.0069, lr=6.62e-06, step=7327] Training: 73%|███████▎ | 7328/10000 [1:37:00<33:50, 1.32it/s, loss=0.0153, lr=6.62e-06, step=7328] Training: 73%|███████▎ | 7329/10000 [1:37:01<37:04, 1.20it/s, loss=0.0153, lr=6.62e-06, step=7328] Training: 73%|███████▎ | 7329/10000 [1:37:01<37:04, 1.20it/s, loss=0.0104, lr=6.61e-06, step=7329]20:21:34.469 [I] step=7330 loss=0.0138 smoothed_loss=0.0091 lr=6.62e-06 grad_norm=0.4209 step_time=0.6454s data_time=0.1856s it/s=1.204 eta_to_10000=2218.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.1012 grad_arm_token_fuse=0.0558 grad_shared_expert=0.4207 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7330/10000 [1:37:02<38:13, 1.16it/s, loss=0.0104, lr=6.61e-06, step=7329] Training: 73%|███████▎ | 7330/10000 [1:37:02<38:13, 1.16it/s, loss=0.0138, lr=6.61e-06, step=7330] Training: 73%|███████▎ | 7331/10000 [1:37:03<34:21, 1.29it/s, loss=0.0138, lr=6.61e-06, step=7330] Training: 73%|███████▎ | 7331/10000 [1:37:03<34:21, 1.29it/s, loss=0.0195, lr=6.61e-06, step=7331] Training: 73%|███████▎ | 7332/10000 [1:37:03<32:41, 1.36it/s, loss=0.0195, lr=6.61e-06, step=7331] Training: 73%|███████▎ | 7332/10000 [1:37:03<32:41, 1.36it/s, loss=0.0143, lr=6.60e-06, step=7332] Training: 73%|███████▎ | 7333/10000 [1:37:04<33:30, 1.33it/s, loss=0.0143, lr=6.60e-06, step=7332] Training: 73%|███████▎ | 7333/10000 [1:37:04<33:30, 1.33it/s, loss=0.0023, lr=6.60e-06, step=7333] Training: 73%|███████▎ | 7334/10000 [1:37:05<36:43, 1.21it/s, loss=0.0023, lr=6.60e-06, step=7333] Training: 73%|███████▎ | 7334/10000 [1:37:05<36:43, 1.21it/s, loss=0.0133, lr=6.60e-06, step=7334] Training: 73%|███████▎ | 7335/10000 [1:37:06<32:04, 1.38it/s, loss=0.0133, lr=6.60e-06, step=7334] Training: 73%|███████▎ | 7335/10000 [1:37:06<32:04, 1.38it/s, loss=0.0010, lr=6.60e-06, step=7335] Training: 73%|███████▎ | 7336/10000 [1:37:06<31:28, 1.41it/s, loss=0.0010, lr=6.60e-06, step=7335] Training: 73%|███████▎ | 7336/10000 [1:37:06<31:28, 1.41it/s, loss=0.0089, lr=6.59e-06, step=7336] Training: 73%|███████▎ | 7337/10000 [1:37:07<35:12, 1.26it/s, loss=0.0089, lr=6.59e-06, step=7336] Training: 73%|███████▎ | 7337/10000 [1:37:07<35:12, 1.26it/s, loss=0.0132, lr=6.59e-06, step=7337] Training: 73%|███████▎ | 7338/10000 [1:37:08<36:42, 1.21it/s, loss=0.0132, lr=6.59e-06, step=7337] Training: 73%|███████▎ | 7338/10000 [1:37:08<36:42, 1.21it/s, loss=0.0497, lr=6.59e-06, step=7338] Training: 73%|███████▎ | 7339/10000 [1:37:09<38:54, 1.14it/s, loss=0.0497, lr=6.59e-06, step=7338] Training: 73%|███████▎ | 7339/10000 [1:37:09<38:54, 1.14it/s, loss=0.0023, lr=6.58e-06, step=7339]20:21:42.200 [I] step=7340 loss=0.0108 smoothed_loss=0.0123 lr=6.59e-06 grad_norm=0.4342 step_time=0.5973s data_time=0.1758s it/s=1.294 eta_to_10000=2056.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0080 grad_action_out_proj_arms=0.0749 grad_arm_token_fuse=0.0395 grad_shared_expert=0.2715 (18633:train_pytorch.py:850) + Training: 73%|███████▎ | 7340/10000 [1:37:10<36:08, 1.23it/s, loss=0.0023, lr=6.58e-06, step=7339] Training: 73%|███████▎ | 7340/10000 [1:37:10<36:08, 1.23it/s, loss=0.0108, lr=6.58e-06, step=7340] Training: 73%|███████▎ | 7341/10000 [1:37:10<31:45, 1.40it/s, loss=0.0108, lr=6.58e-06, step=7340] Training: 73%|███████▎ | 7341/10000 [1:37:10<31:45, 1.40it/s, loss=0.0051, lr=6.58e-06, step=7341] Training: 73%|███████▎ | 7342/10000 [1:37:11<32:55, 1.35it/s, loss=0.0051, lr=6.58e-06, step=7341] Training: 73%|███████▎ | 7342/10000 [1:37:11<32:55, 1.35it/s, loss=0.0045, lr=6.58e-06, step=7342] Training: 73%|███████▎ | 7343/10000 [1:37:12<34:24, 1.29it/s, loss=0.0045, lr=6.58e-06, step=7342] Training: 73%|███████▎ | 7343/10000 [1:37:12<34:24, 1.29it/s, loss=0.0037, lr=6.57e-06, step=7343] Training: 73%|███████▎ | 7344/10000 [1:37:13<30:30, 1.45it/s, loss=0.0037, lr=6.57e-06, step=7343] Training: 73%|███████▎ | 7344/10000 [1:37:13<30:30, 1.45it/s, loss=0.0175, lr=6.57e-06, step=7344] Training: 73%|███████▎ | 7345/10000 [1:37:13<32:24, 1.37it/s, loss=0.0175, lr=6.57e-06, step=7344] Training: 73%|███████▎ | 7345/10000 [1:37:13<32:24, 1.37it/s, loss=0.0081, lr=6.57e-06, step=7345] Training: 73%|███████▎ | 7346/10000 [1:37:14<31:02, 1.43it/s, loss=0.0081, lr=6.57e-06, step=7345] Training: 73%|███████▎ | 7346/10000 [1:37:14<31:02, 1.43it/s, loss=0.0014, lr=6.56e-06, step=7346] Training: 73%|███████▎ | 7347/10000 [1:37:15<29:30, 1.50it/s, loss=0.0014, lr=6.56e-06, step=7346] Training: 73%|███████▎ | 7347/10000 [1:37:15<29:30, 1.50it/s, loss=0.0028, lr=6.56e-06, step=7347] Training: 73%|███████▎ | 7348/10000 [1:37:15<27:05, 1.63it/s, loss=0.0028, lr=6.56e-06, step=7347] Training: 73%|███████▎ | 7348/10000 [1:37:15<27:05, 1.63it/s, loss=0.1728, lr=6.56e-06, step=7348] Training: 73%|███████▎ | 7349/10000 [1:37:16<30:09, 1.47it/s, loss=0.1728, lr=6.56e-06, step=7348] Training: 73%|███████▎ | 7349/10000 [1:37:16<30:09, 1.47it/s, loss=0.0172, lr=6.56e-06, step=7349]20:21:48.925 [I] step=7350 loss=0.0024 smoothed_loss=0.0224 lr=6.57e-06 grad_norm=0.3917 step_time=0.5562s data_time=0.1163s it/s=1.487 eta_to_10000=1781.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0087 grad_action_out_proj_arms=0.0888 grad_arm_token_fuse=0.0439 grad_shared_expert=0.3069 (18633:train_pytorch.py:850) + Training: 74%|███████▎ | 7350/10000 [1:37:17<30:32, 1.45it/s, loss=0.0172, lr=6.56e-06, step=7349] Training: 74%|███████▎ | 7350/10000 [1:37:17<30:32, 1.45it/s, loss=0.0024, lr=6.55e-06, step=7350] Training: 74%|███████▎ | 7351/10000 [1:37:17<30:19, 1.46it/s, loss=0.0024, lr=6.55e-06, step=7350] Training: 74%|███████▎ | 7351/10000 [1:37:17<30:19, 1.46it/s, loss=0.0043, lr=6.55e-06, step=7351] Training: 74%|███████▎ | 7352/10000 [1:37:18<32:04, 1.38it/s, loss=0.0043, lr=6.55e-06, step=7351] Training: 74%|███████▎ | 7352/10000 [1:37:18<32:04, 1.38it/s, loss=0.0693, lr=6.55e-06, step=7352] Training: 74%|███████▎ | 7353/10000 [1:37:19<31:02, 1.42it/s, loss=0.0693, lr=6.55e-06, step=7352] Training: 74%|███████▎ | 7353/10000 [1:37:19<31:02, 1.42it/s, loss=0.0162, lr=6.54e-06, step=7353] Training: 74%|███████▎ | 7354/10000 [1:37:20<32:20, 1.36it/s, loss=0.0162, lr=6.54e-06, step=7353] Training: 74%|███████▎ | 7354/10000 [1:37:20<32:20, 1.36it/s, loss=0.0063, lr=6.54e-06, step=7354] Training: 74%|███████▎ | 7355/10000 [1:37:21<35:41, 1.24it/s, loss=0.0063, lr=6.54e-06, step=7354] Training: 74%|███████▎ | 7355/10000 [1:37:21<35:41, 1.24it/s, loss=0.0026, lr=6.54e-06, step=7355] Training: 74%|███████▎ | 7356/10000 [1:37:21<36:30, 1.21it/s, loss=0.0026, lr=6.54e-06, step=7355] Training: 74%|███████▎ | 7356/10000 [1:37:21<36:30, 1.21it/s, loss=0.0088, lr=6.54e-06, step=7356] Training: 74%|███████▎ | 7357/10000 [1:37:23<41:53, 1.05it/s, loss=0.0088, lr=6.54e-06, step=7356] Training: 74%|███████▎ | 7357/10000 [1:37:23<41:53, 1.05it/s, loss=0.0016, lr=6.53e-06, step=7357] Training: 74%|███████▎ | 7358/10000 [1:37:24<42:33, 1.03it/s, loss=0.0016, lr=6.53e-06, step=7357] Training: 74%|███████▎ | 7358/10000 [1:37:24<42:33, 1.03it/s, loss=0.0227, lr=6.53e-06, step=7358] Training: 74%|███████▎ | 7359/10000 [1:37:24<40:15, 1.09it/s, loss=0.0227, lr=6.53e-06, step=7358] Training: 74%|███████▎ | 7359/10000 [1:37:24<40:15, 1.09it/s, loss=0.0514, lr=6.53e-06, step=7359]20:21:57.290 [I] step=7360 loss=0.0392 smoothed_loss=0.0233 lr=6.54e-06 grad_norm=0.4593 step_time=0.6456s data_time=0.1908s it/s=1.196 eta_to_10000=2207.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0189 grad_action_out_proj_arms=0.1721 grad_arm_token_fuse=0.0981 grad_shared_expert=0.5334 (18633:train_pytorch.py:850) + Training: 74%|███████▎ | 7360/10000 [1:37:25<35:04, 1.25it/s, loss=0.0514, lr=6.53e-06, step=7359] Training: 74%|███████▎ | 7360/10000 [1:37:25<35:04, 1.25it/s, loss=0.0392, lr=6.52e-06, step=7360] Training: 74%|███████▎ | 7361/10000 [1:37:26<34:33, 1.27it/s, loss=0.0392, lr=6.52e-06, step=7360] Training: 74%|███████▎ | 7361/10000 [1:37:26<34:33, 1.27it/s, loss=0.0070, lr=6.52e-06, step=7361] Training: 74%|███████▎ | 7362/10000 [1:37:26<32:04, 1.37it/s, loss=0.0070, lr=6.52e-06, step=7361] Training: 74%|███████▎ | 7362/10000 [1:37:26<32:04, 1.37it/s, loss=0.0135, lr=6.52e-06, step=7362] Training: 74%|███████▎ | 7363/10000 [1:37:27<31:34, 1.39it/s, loss=0.0135, lr=6.52e-06, step=7362] Training: 74%|███████▎ | 7363/10000 [1:37:27<31:34, 1.39it/s, loss=0.0028, lr=6.52e-06, step=7363] Training: 74%|███████▎ | 7364/10000 [1:37:28<31:27, 1.40it/s, loss=0.0028, lr=6.52e-06, step=7363] Training: 74%|███████▎ | 7364/10000 [1:37:28<31:27, 1.40it/s, loss=0.0658, lr=6.51e-06, step=7364] Training: 74%|███████▎ | 7365/10000 [1:37:28<31:00, 1.42it/s, loss=0.0658, lr=6.51e-06, step=7364] Training: 74%|███████▎ | 7365/10000 [1:37:28<31:00, 1.42it/s, loss=0.0241, lr=6.51e-06, step=7365] Training: 74%|███████▎ | 7366/10000 [1:37:29<33:08, 1.32it/s, loss=0.0241, lr=6.51e-06, step=7365] Training: 74%|███████▎ | 7366/10000 [1:37:29<33:08, 1.32it/s, loss=0.0111, lr=6.51e-06, step=7366] Training: 74%|███████▎ | 7367/10000 [1:37:30<33:52, 1.30it/s, loss=0.0111, lr=6.51e-06, step=7366] Training: 74%|███████▎ | 7367/10000 [1:37:30<33:52, 1.30it/s, loss=0.0293, lr=6.50e-06, step=7367] Training: 74%|███████▎ | 7368/10000 [1:37:31<37:14, 1.18it/s, loss=0.0293, lr=6.50e-06, step=7367] Training: 74%|███████▎ | 7368/10000 [1:37:31<37:14, 1.18it/s, loss=0.0039, lr=6.50e-06, step=7368] Training: 74%|███████▎ | 7369/10000 [1:37:32<32:29, 1.35it/s, loss=0.0039, lr=6.50e-06, step=7368] Training: 74%|███████▎ | 7369/10000 [1:37:32<32:29, 1.35it/s, loss=0.0174, lr=6.50e-06, step=7369]20:22:04.912 [I] step=7370 loss=0.0020 smoothed_loss=0.0190 lr=6.51e-06 grad_norm=0.4492 step_time=0.6224s data_time=0.1399s it/s=1.312 eta_to_10000=2004.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0069 grad_action_out_proj_arms=0.0923 grad_arm_token_fuse=0.0372 grad_shared_expert=0.3734 (18633:train_pytorch.py:850) + Training: 74%|███████▎ | 7370/10000 [1:37:33<35:37, 1.23it/s, loss=0.0174, lr=6.50e-06, step=7369] Training: 74%|███████▎ | 7370/10000 [1:37:33<35:37, 1.23it/s, loss=0.0020, lr=6.50e-06, step=7370] Training: 74%|███████▎ | 7371/10000 [1:37:34<37:10, 1.18it/s, loss=0.0020, lr=6.50e-06, step=7370] Training: 74%|███████▎ | 7371/10000 [1:37:34<37:10, 1.18it/s, loss=0.0055, lr=6.49e-06, step=7371] Training: 74%|███████▎ | 7372/10000 [1:37:34<35:07, 1.25it/s, loss=0.0055, lr=6.49e-06, step=7371] Training: 74%|███████▎ | 7372/10000 [1:37:34<35:07, 1.25it/s, loss=0.0092, lr=6.49e-06, step=7372] Training: 74%|███████▎ | 7373/10000 [1:37:35<33:05, 1.32it/s, loss=0.0092, lr=6.49e-06, step=7372] Training: 74%|███████▎ | 7373/10000 [1:37:35<33:05, 1.32it/s, loss=0.0029, lr=6.49e-06, step=7373] Training: 74%|███████▎ | 7374/10000 [1:37:36<35:15, 1.24it/s, loss=0.0029, lr=6.49e-06, step=7373] Training: 74%|███████▎ | 7374/10000 [1:37:36<35:15, 1.24it/s, loss=0.0116, lr=6.48e-06, step=7374] Training: 74%|███████▍ | 7375/10000 [1:37:37<37:40, 1.16it/s, loss=0.0116, lr=6.48e-06, step=7374] Training: 74%|███████▍ | 7375/10000 [1:37:37<37:40, 1.16it/s, loss=0.0040, lr=6.48e-06, step=7375] Training: 74%|███████▍ | 7376/10000 [1:37:37<34:54, 1.25it/s, loss=0.0040, lr=6.48e-06, step=7375] Training: 74%|███████▍ | 7376/10000 [1:37:37<34:54, 1.25it/s, loss=0.0056, lr=6.48e-06, step=7376] Training: 74%|███████▍ | 7377/10000 [1:37:38<35:50, 1.22it/s, loss=0.0056, lr=6.48e-06, step=7376] Training: 74%|███████▍ | 7377/10000 [1:37:38<35:50, 1.22it/s, loss=0.0050, lr=6.48e-06, step=7377] Training: 74%|███████▍ | 7378/10000 [1:37:39<33:48, 1.29it/s, loss=0.0050, lr=6.48e-06, step=7377] Training: 74%|███████▍ | 7378/10000 [1:37:39<33:48, 1.29it/s, loss=0.0043, lr=6.47e-06, step=7378] Training: 74%|███████▍ | 7379/10000 [1:37:40<32:58, 1.32it/s, loss=0.0043, lr=6.47e-06, step=7378] Training: 74%|███████▍ | 7379/10000 [1:37:40<32:58, 1.32it/s, loss=0.0031, lr=6.47e-06, step=7379]20:22:12.764 [I] step=7380 loss=0.0050 smoothed_loss=0.0101 lr=6.48e-06 grad_norm=0.4219 step_time=0.6277s data_time=0.1575s it/s=1.274 eta_to_10000=2056.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.0827 grad_arm_token_fuse=0.0334 grad_shared_expert=0.4587 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7380/10000 [1:37:40<33:08, 1.32it/s, loss=0.0031, lr=6.47e-06, step=7379] Training: 74%|███████▍ | 7380/10000 [1:37:40<33:08, 1.32it/s, loss=0.0050, lr=6.47e-06, step=7380] Training: 74%|███████▍ | 7381/10000 [1:37:41<33:37, 1.30it/s, loss=0.0050, lr=6.47e-06, step=7380] Training: 74%|███████▍ | 7381/10000 [1:37:41<33:37, 1.30it/s, loss=0.0031, lr=6.46e-06, step=7381] Training: 74%|███████▍ | 7382/10000 [1:37:42<30:19, 1.44it/s, loss=0.0031, lr=6.46e-06, step=7381] Training: 74%|███████▍ | 7382/10000 [1:37:42<30:19, 1.44it/s, loss=0.0042, lr=6.46e-06, step=7382] Training: 74%|███████▍ | 7383/10000 [1:37:43<32:40, 1.34it/s, loss=0.0042, lr=6.46e-06, step=7382] Training: 74%|███████▍ | 7383/10000 [1:37:43<32:40, 1.34it/s, loss=0.0056, lr=6.46e-06, step=7383] Training: 74%|███████▍ | 7384/10000 [1:37:43<30:14, 1.44it/s, loss=0.0056, lr=6.46e-06, step=7383] Training: 74%|███████▍ | 7384/10000 [1:37:43<30:14, 1.44it/s, loss=0.0031, lr=6.46e-06, step=7384] Training: 74%|███████▍ | 7385/10000 [1:37:44<27:56, 1.56it/s, loss=0.0031, lr=6.46e-06, step=7384] Training: 74%|███████▍ | 7385/10000 [1:37:44<27:56, 1.56it/s, loss=0.0023, lr=6.45e-06, step=7385] Training: 74%|███████▍ | 7386/10000 [1:37:45<33:31, 1.30it/s, loss=0.0023, lr=6.45e-06, step=7385] Training: 74%|███████▍ | 7386/10000 [1:37:45<33:31, 1.30it/s, loss=0.0028, lr=6.45e-06, step=7386] Training: 74%|███████▍ | 7387/10000 [1:37:46<34:43, 1.25it/s, loss=0.0028, lr=6.45e-06, step=7386] Training: 74%|███████▍ | 7387/10000 [1:37:46<34:43, 1.25it/s, loss=0.0028, lr=6.45e-06, step=7387] Training: 74%|███████▍ | 7388/10000 [1:37:46<35:14, 1.24it/s, loss=0.0028, lr=6.45e-06, step=7387] Training: 74%|███████▍ | 7388/10000 [1:37:46<35:14, 1.24it/s, loss=0.0023, lr=6.45e-06, step=7388] Training: 74%|███████▍ | 7389/10000 [1:37:47<35:18, 1.23it/s, loss=0.0023, lr=6.45e-06, step=7388] Training: 74%|███████▍ | 7389/10000 [1:37:47<35:18, 1.23it/s, loss=0.0252, lr=6.44e-06, step=7389]20:22:20.521 [I] step=7390 loss=0.0069 smoothed_loss=0.0079 lr=6.45e-06 grad_norm=0.3549 step_time=0.6033s data_time=0.1723s it/s=1.289 eta_to_10000=2024.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0115 grad_action_out_proj_arms=0.1096 grad_arm_token_fuse=0.0585 grad_shared_expert=0.5279 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7390/10000 [1:37:48<36:25, 1.19it/s, loss=0.0252, lr=6.44e-06, step=7389] Training: 74%|███████▍ | 7390/10000 [1:37:48<36:25, 1.19it/s, loss=0.0069, lr=6.44e-06, step=7390] Training: 74%|███████▍ | 7391/10000 [1:37:49<34:26, 1.26it/s, loss=0.0069, lr=6.44e-06, step=7390] Training: 74%|███████▍ | 7391/10000 [1:37:49<34:26, 1.26it/s, loss=0.0056, lr=6.44e-06, step=7391] Training: 74%|███████▍ | 7392/10000 [1:37:50<33:49, 1.28it/s, loss=0.0056, lr=6.44e-06, step=7391] Training: 74%|███████▍ | 7392/10000 [1:37:50<33:49, 1.28it/s, loss=0.0644, lr=6.43e-06, step=7392] Training: 74%|███████▍ | 7393/10000 [1:37:51<36:27, 1.19it/s, loss=0.0644, lr=6.43e-06, step=7392] Training: 74%|███████▍ | 7393/10000 [1:37:51<36:27, 1.19it/s, loss=0.0034, lr=6.43e-06, step=7393] Training: 74%|███████▍ | 7394/10000 [1:37:52<37:36, 1.15it/s, loss=0.0034, lr=6.43e-06, step=7393] Training: 74%|███████▍ | 7394/10000 [1:37:52<37:36, 1.15it/s, loss=0.0496, lr=6.43e-06, step=7394] Training: 74%|███████▍ | 7395/10000 [1:37:52<33:14, 1.31it/s, loss=0.0496, lr=6.43e-06, step=7394] Training: 74%|███████▍ | 7395/10000 [1:37:52<33:14, 1.31it/s, loss=0.0154, lr=6.43e-06, step=7395] Training: 74%|███████▍ | 7396/10000 [1:37:53<29:34, 1.47it/s, loss=0.0154, lr=6.43e-06, step=7395] Training: 74%|███████▍ | 7396/10000 [1:37:53<29:34, 1.47it/s, loss=0.0033, lr=6.42e-06, step=7396] Training: 74%|███████▍ | 7397/10000 [1:37:53<27:03, 1.60it/s, loss=0.0033, lr=6.42e-06, step=7396] Training: 74%|███████▍ | 7397/10000 [1:37:53<27:03, 1.60it/s, loss=0.0113, lr=6.42e-06, step=7397] Training: 74%|███████▍ | 7398/10000 [1:37:54<32:46, 1.32it/s, loss=0.0113, lr=6.42e-06, step=7397] Training: 74%|███████▍ | 7398/10000 [1:37:54<32:46, 1.32it/s, loss=0.0428, lr=6.42e-06, step=7398] Training: 74%|███████▍ | 7399/10000 [1:37:55<31:09, 1.39it/s, loss=0.0428, lr=6.42e-06, step=7398] Training: 74%|███████▍ | 7399/10000 [1:37:55<31:09, 1.39it/s, loss=0.0097, lr=6.41e-06, step=7399]20:22:27.795 [I] step=7400 loss=0.0059 smoothed_loss=0.0154 lr=6.42e-06 grad_norm=0.4728 step_time=0.5854s data_time=0.1421s it/s=1.375 eta_to_10000=1891.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0712 grad_arm_token_fuse=0.0394 grad_shared_expert=0.2541 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7400/10000 [1:37:55<31:17, 1.38it/s, loss=0.0097, lr=6.41e-06, step=7399] Training: 74%|███████▍ | 7400/10000 [1:37:55<31:17, 1.38it/s, loss=0.0059, lr=6.41e-06, step=7400] Training: 74%|███████▍ | 7401/10000 [1:37:56<32:27, 1.33it/s, loss=0.0059, lr=6.41e-06, step=7400] Training: 74%|███████▍ | 7401/10000 [1:37:56<32:27, 1.33it/s, loss=0.0029, lr=6.41e-06, step=7401] Training: 74%|███████▍ | 7402/10000 [1:37:57<33:47, 1.28it/s, loss=0.0029, lr=6.41e-06, step=7401] Training: 74%|███████▍ | 7402/10000 [1:37:57<33:47, 1.28it/s, loss=0.0508, lr=6.41e-06, step=7402] Training: 74%|███████▍ | 7403/10000 [1:37:58<35:18, 1.23it/s, loss=0.0508, lr=6.41e-06, step=7402] Training: 74%|███████▍ | 7403/10000 [1:37:58<35:18, 1.23it/s, loss=0.0233, lr=6.40e-06, step=7403] Training: 74%|███████▍ | 7404/10000 [1:37:59<34:50, 1.24it/s, loss=0.0233, lr=6.40e-06, step=7403] Training: 74%|███████▍ | 7404/10000 [1:37:59<34:50, 1.24it/s, loss=0.0025, lr=6.40e-06, step=7404] Training: 74%|███████▍ | 7405/10000 [1:37:59<32:49, 1.32it/s, loss=0.0025, lr=6.40e-06, step=7404] Training: 74%|███████▍ | 7405/10000 [1:37:59<32:49, 1.32it/s, loss=0.0024, lr=6.40e-06, step=7405] Training: 74%|███████▍ | 7406/10000 [1:38:00<33:43, 1.28it/s, loss=0.0024, lr=6.40e-06, step=7405] Training: 74%|███████▍ | 7406/10000 [1:38:00<33:43, 1.28it/s, loss=0.0198, lr=6.39e-06, step=7406] Training: 74%|███████▍ | 7407/10000 [1:38:01<33:57, 1.27it/s, loss=0.0198, lr=6.39e-06, step=7406] Training: 74%|███████▍ | 7407/10000 [1:38:01<33:57, 1.27it/s, loss=0.0047, lr=6.39e-06, step=7407] Training: 74%|███████▍ | 7408/10000 [1:38:02<30:01, 1.44it/s, loss=0.0047, lr=6.39e-06, step=7407] Training: 74%|███████▍ | 7408/10000 [1:38:02<30:01, 1.44it/s, loss=0.1525, lr=6.39e-06, step=7408] Training: 74%|███████▍ | 7409/10000 [1:38:02<27:31, 1.57it/s, loss=0.1525, lr=6.39e-06, step=7408] Training: 74%|███████▍ | 7409/10000 [1:38:02<27:31, 1.57it/s, loss=0.0020, lr=6.39e-06, step=7409]20:22:35.093 [I] step=7410 loss=0.0019 smoothed_loss=0.0234 lr=6.40e-06 grad_norm=0.4582 step_time=0.5950s data_time=0.1348s it/s=1.371 eta_to_10000=1889.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.0992 grad_arm_token_fuse=0.0583 grad_shared_expert=0.3412 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7410/10000 [1:38:03<28:09, 1.53it/s, loss=0.0020, lr=6.39e-06, step=7409] Training: 74%|███████▍ | 7410/10000 [1:38:03<28:09, 1.53it/s, loss=0.0019, lr=6.38e-06, step=7410] Training: 74%|███████▍ | 7411/10000 [1:38:03<27:14, 1.58it/s, loss=0.0019, lr=6.38e-06, step=7410] Training: 74%|███████▍ | 7411/10000 [1:38:03<27:14, 1.58it/s, loss=0.0120, lr=6.38e-06, step=7411] Training: 74%|███████▍ | 7412/10000 [1:38:04<29:49, 1.45it/s, loss=0.0120, lr=6.38e-06, step=7411] Training: 74%|███████▍ | 7412/10000 [1:38:04<29:49, 1.45it/s, loss=0.0092, lr=6.38e-06, step=7412] Training: 74%|███████▍ | 7413/10000 [1:38:05<29:07, 1.48it/s, loss=0.0092, lr=6.38e-06, step=7412] Training: 74%|███████▍ | 7413/10000 [1:38:05<29:07, 1.48it/s, loss=0.0019, lr=6.37e-06, step=7413] Training: 74%|███████▍ | 7414/10000 [1:38:05<29:03, 1.48it/s, loss=0.0019, lr=6.37e-06, step=7413] Training: 74%|███████▍ | 7414/10000 [1:38:05<29:03, 1.48it/s, loss=0.0040, lr=6.37e-06, step=7414] Training: 74%|███████▍ | 7415/10000 [1:38:06<30:00, 1.44it/s, loss=0.0040, lr=6.37e-06, step=7414] Training: 74%|███████▍ | 7415/10000 [1:38:06<30:00, 1.44it/s, loss=0.0039, lr=6.37e-06, step=7415] Training: 74%|███████▍ | 7416/10000 [1:38:07<30:14, 1.42it/s, loss=0.0039, lr=6.37e-06, step=7415] Training: 74%|███████▍ | 7416/10000 [1:38:07<30:14, 1.42it/s, loss=0.0067, lr=6.37e-06, step=7416] Training: 74%|███████▍ | 7417/10000 [1:38:07<27:53, 1.54it/s, loss=0.0067, lr=6.37e-06, step=7416] Training: 74%|███████▍ | 7417/10000 [1:38:07<27:53, 1.54it/s, loss=0.0032, lr=6.36e-06, step=7417] Training: 74%|███████▍ | 7418/10000 [1:38:08<31:03, 1.39it/s, loss=0.0032, lr=6.36e-06, step=7417] Training: 74%|███████▍ | 7418/10000 [1:38:08<31:03, 1.39it/s, loss=0.0049, lr=6.36e-06, step=7418] Training: 74%|███████▍ | 7419/10000 [1:38:09<30:59, 1.39it/s, loss=0.0049, lr=6.36e-06, step=7418] Training: 74%|███████▍ | 7419/10000 [1:38:09<30:59, 1.39it/s, loss=0.0056, lr=6.36e-06, step=7419]20:22:42.110 [I] step=7420 loss=0.0034 smoothed_loss=0.0115 lr=6.37e-06 grad_norm=0.4320 step_time=0.5809s data_time=0.1209s it/s=1.425 eta_to_10000=1810.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0271 grad_action_out_proj_arms=0.1533 grad_arm_token_fuse=0.1356 grad_shared_expert=0.4476 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7420/10000 [1:38:10<30:41, 1.40it/s, loss=0.0056, lr=6.36e-06, step=7419] Training: 74%|███████▍ | 7420/10000 [1:38:10<30:41, 1.40it/s, loss=0.0034, lr=6.35e-06, step=7420] Training: 74%|███████▍ | 7421/10000 [1:38:11<32:02, 1.34it/s, loss=0.0034, lr=6.35e-06, step=7420] Training: 74%|███████▍ | 7421/10000 [1:38:11<32:02, 1.34it/s, loss=0.0060, lr=6.35e-06, step=7421] Training: 74%|███████▍ | 7422/10000 [1:38:12<35:17, 1.22it/s, loss=0.0060, lr=6.35e-06, step=7421] Training: 74%|███████▍ | 7422/10000 [1:38:12<35:17, 1.22it/s, loss=0.0081, lr=6.35e-06, step=7422] Training: 74%|███████▍ | 7423/10000 [1:38:12<34:14, 1.25it/s, loss=0.0081, lr=6.35e-06, step=7422] Training: 74%|███████▍ | 7423/10000 [1:38:12<34:14, 1.25it/s, loss=0.0115, lr=6.35e-06, step=7423] Training: 74%|███████▍ | 7424/10000 [1:38:13<30:13, 1.42it/s, loss=0.0115, lr=6.35e-06, step=7423] Training: 74%|███████▍ | 7424/10000 [1:38:13<30:13, 1.42it/s, loss=0.0074, lr=6.34e-06, step=7424] Training: 74%|███████▍ | 7425/10000 [1:38:13<27:56, 1.54it/s, loss=0.0074, lr=6.34e-06, step=7424] Training: 74%|███████▍ | 7425/10000 [1:38:13<27:56, 1.54it/s, loss=0.0159, lr=6.34e-06, step=7425] Training: 74%|███████▍ | 7426/10000 [1:38:14<27:50, 1.54it/s, loss=0.0159, lr=6.34e-06, step=7425] Training: 74%|███████▍ | 7426/10000 [1:38:14<27:50, 1.54it/s, loss=0.0019, lr=6.34e-06, step=7426] Training: 74%|███████▍ | 7427/10000 [1:38:14<25:48, 1.66it/s, loss=0.0019, lr=6.34e-06, step=7426] Training: 74%|███████▍ | 7427/10000 [1:38:14<25:48, 1.66it/s, loss=0.0069, lr=6.34e-06, step=7427] Training: 74%|███████▍ | 7428/10000 [1:38:15<26:33, 1.61it/s, loss=0.0069, lr=6.34e-06, step=7427] Training: 74%|███████▍ | 7428/10000 [1:38:15<26:33, 1.61it/s, loss=0.0102, lr=6.33e-06, step=7428] Training: 74%|███████▍ | 7429/10000 [1:38:16<30:34, 1.40it/s, loss=0.0102, lr=6.33e-06, step=7428] Training: 74%|███████▍ | 7429/10000 [1:38:16<30:34, 1.40it/s, loss=0.0138, lr=6.33e-06, step=7429]20:22:48.964 [I] step=7430 loss=0.0136 smoothed_loss=0.0105 lr=6.34e-06 grad_norm=0.4958 step_time=0.5702s data_time=0.1152s it/s=1.459 eta_to_10000=1761.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0172 grad_action_out_proj_arms=0.1616 grad_arm_token_fuse=0.0875 grad_shared_expert=0.4601 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7430/10000 [1:38:17<28:28, 1.50it/s, loss=0.0138, lr=6.33e-06, step=7429] Training: 74%|███████▍ | 7430/10000 [1:38:17<28:28, 1.50it/s, loss=0.0136, lr=6.33e-06, step=7430] Training: 74%|███████▍ | 7431/10000 [1:38:17<28:22, 1.51it/s, loss=0.0136, lr=6.33e-06, step=7430] Training: 74%|███████▍ | 7431/10000 [1:38:17<28:22, 1.51it/s, loss=0.0029, lr=6.32e-06, step=7431] Training: 74%|███████▍ | 7432/10000 [1:38:18<26:18, 1.63it/s, loss=0.0029, lr=6.32e-06, step=7431] Training: 74%|███████▍ | 7432/10000 [1:38:18<26:18, 1.63it/s, loss=0.0019, lr=6.32e-06, step=7432] Training: 74%|███████▍ | 7433/10000 [1:38:19<29:39, 1.44it/s, loss=0.0019, lr=6.32e-06, step=7432] Training: 74%|███████▍ | 7433/10000 [1:38:19<29:39, 1.44it/s, loss=0.0117, lr=6.32e-06, step=7433] Training: 74%|███████▍ | 7434/10000 [1:38:19<27:45, 1.54it/s, loss=0.0117, lr=6.32e-06, step=7433] Training: 74%|███████▍ | 7434/10000 [1:38:19<27:45, 1.54it/s, loss=0.0038, lr=6.32e-06, step=7434] Training: 74%|███████▍ | 7435/10000 [1:38:20<27:59, 1.53it/s, loss=0.0038, lr=6.32e-06, step=7434] Training: 74%|███████▍ | 7435/10000 [1:38:20<27:59, 1.53it/s, loss=0.0030, lr=6.31e-06, step=7435] Training: 74%|███████▍ | 7436/10000 [1:38:21<33:24, 1.28it/s, loss=0.0030, lr=6.31e-06, step=7435] Training: 74%|███████▍ | 7436/10000 [1:38:21<33:24, 1.28it/s, loss=0.0037, lr=6.31e-06, step=7436] Training: 74%|███████▍ | 7437/10000 [1:38:22<33:06, 1.29it/s, loss=0.0037, lr=6.31e-06, step=7436] Training: 74%|███████▍ | 7437/10000 [1:38:22<33:06, 1.29it/s, loss=0.0026, lr=6.31e-06, step=7437] Training: 74%|███████▍ | 7438/10000 [1:38:22<33:03, 1.29it/s, loss=0.0026, lr=6.31e-06, step=7437] Training: 74%|███████▍ | 7438/10000 [1:38:22<33:03, 1.29it/s, loss=0.0484, lr=6.30e-06, step=7438] Training: 74%|███████▍ | 7439/10000 [1:38:23<30:56, 1.38it/s, loss=0.0484, lr=6.30e-06, step=7438] Training: 74%|███████▍ | 7439/10000 [1:38:23<30:56, 1.38it/s, loss=0.0032, lr=6.30e-06, step=7439]20:22:56.208 [I] step=7440 loss=0.0056 smoothed_loss=0.0100 lr=6.31e-06 grad_norm=0.3417 step_time=0.5842s data_time=0.1402s it/s=1.381 eta_to_10000=1854.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0294 grad_action_out_proj_arms=0.1781 grad_arm_token_fuse=0.1643 grad_shared_expert=0.4684 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7440/10000 [1:38:24<31:32, 1.35it/s, loss=0.0032, lr=6.30e-06, step=7439] Training: 74%|███████▍ | 7440/10000 [1:38:24<31:32, 1.35it/s, loss=0.0056, lr=6.30e-06, step=7440] Training: 74%|███████▍ | 7441/10000 [1:38:25<30:59, 1.38it/s, loss=0.0056, lr=6.30e-06, step=7440] Training: 74%|███████▍ | 7441/10000 [1:38:25<30:59, 1.38it/s, loss=0.0051, lr=6.30e-06, step=7441] Training: 74%|███████▍ | 7442/10000 [1:38:25<28:04, 1.52it/s, loss=0.0051, lr=6.30e-06, step=7441] Training: 74%|███████▍ | 7442/10000 [1:38:25<28:04, 1.52it/s, loss=0.0073, lr=6.29e-06, step=7442] Training: 74%|███████▍ | 7443/10000 [1:38:26<28:58, 1.47it/s, loss=0.0073, lr=6.29e-06, step=7442] Training: 74%|███████▍ | 7443/10000 [1:38:26<28:58, 1.47it/s, loss=0.0138, lr=6.29e-06, step=7443] Training: 74%|███████▍ | 7444/10000 [1:38:26<26:41, 1.60it/s, loss=0.0138, lr=6.29e-06, step=7443] Training: 74%|███████▍ | 7444/10000 [1:38:26<26:41, 1.60it/s, loss=0.0040, lr=6.29e-06, step=7444] Training: 74%|███████▍ | 7445/10000 [1:38:27<30:07, 1.41it/s, loss=0.0040, lr=6.29e-06, step=7444] Training: 74%|███████▍ | 7445/10000 [1:38:27<30:07, 1.41it/s, loss=0.0037, lr=6.29e-06, step=7445] Training: 74%|███████▍ | 7446/10000 [1:38:28<27:22, 1.55it/s, loss=0.0037, lr=6.29e-06, step=7445] Training: 74%|███████▍ | 7446/10000 [1:38:28<27:22, 1.55it/s, loss=0.0078, lr=6.28e-06, step=7446] Training: 74%|███████▍ | 7447/10000 [1:38:28<28:26, 1.50it/s, loss=0.0078, lr=6.28e-06, step=7446] Training: 74%|███████▍ | 7447/10000 [1:38:28<28:26, 1.50it/s, loss=0.0026, lr=6.28e-06, step=7447] Training: 74%|███████▍ | 7448/10000 [1:38:29<30:54, 1.38it/s, loss=0.0026, lr=6.28e-06, step=7447] Training: 74%|███████▍ | 7448/10000 [1:38:29<30:54, 1.38it/s, loss=0.0056, lr=6.28e-06, step=7448] Training: 74%|███████▍ | 7449/10000 [1:38:30<27:56, 1.52it/s, loss=0.0056, lr=6.28e-06, step=7448] Training: 74%|███████▍ | 7449/10000 [1:38:30<27:56, 1.52it/s, loss=0.0024, lr=6.27e-06, step=7449]20:23:02.837 [I] step=7450 loss=0.0033 smoothed_loss=0.0068 lr=6.28e-06 grad_norm=0.3444 step_time=0.5566s data_time=0.1063s it/s=1.509 eta_to_10000=1690.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0122 grad_action_out_proj_arms=0.1168 grad_arm_token_fuse=0.0657 grad_shared_expert=0.3007 (18633:train_pytorch.py:850) + Training: 74%|███████▍ | 7450/10000 [1:38:31<28:48, 1.48it/s, loss=0.0024, lr=6.27e-06, step=7449] Training: 74%|███████▍ | 7450/10000 [1:38:31<28:48, 1.48it/s, loss=0.0033, lr=6.27e-06, step=7450] Training: 75%|███████▍ | 7451/10000 [1:38:31<26:27, 1.61it/s, loss=0.0033, lr=6.27e-06, step=7450] Training: 75%|███████▍ | 7451/10000 [1:38:31<26:27, 1.61it/s, loss=0.0051, lr=6.27e-06, step=7451] Training: 75%|███████▍ | 7452/10000 [1:38:32<28:51, 1.47it/s, loss=0.0051, lr=6.27e-06, step=7451] Training: 75%|███████▍ | 7452/10000 [1:38:32<28:51, 1.47it/s, loss=0.0058, lr=6.27e-06, step=7452] Training: 75%|███████▍ | 7453/10000 [1:38:32<27:19, 1.55it/s, loss=0.0058, lr=6.27e-06, step=7452] Training: 75%|███████▍ | 7453/10000 [1:38:32<27:19, 1.55it/s, loss=0.0049, lr=6.26e-06, step=7453] Training: 75%|███████▍ | 7454/10000 [1:38:33<25:24, 1.67it/s, loss=0.0049, lr=6.26e-06, step=7453] Training: 75%|███████▍ | 7454/10000 [1:38:33<25:24, 1.67it/s, loss=0.0036, lr=6.26e-06, step=7454] Training: 75%|███████▍ | 7455/10000 [1:38:33<24:05, 1.76it/s, loss=0.0036, lr=6.26e-06, step=7454] Training: 75%|███████▍ | 7455/10000 [1:38:33<24:05, 1.76it/s, loss=0.0019, lr=6.26e-06, step=7455] Training: 75%|███████▍ | 7456/10000 [1:38:34<27:33, 1.54it/s, loss=0.0019, lr=6.26e-06, step=7455] Training: 75%|███████▍ | 7456/10000 [1:38:34<27:33, 1.54it/s, loss=0.0032, lr=6.25e-06, step=7456] Training: 75%|███████▍ | 7457/10000 [1:38:35<31:19, 1.35it/s, loss=0.0032, lr=6.25e-06, step=7456] Training: 75%|███████▍ | 7457/10000 [1:38:35<31:19, 1.35it/s, loss=0.0279, lr=6.25e-06, step=7457] Training: 75%|███████▍ | 7458/10000 [1:38:36<31:43, 1.34it/s, loss=0.0279, lr=6.25e-06, step=7457] Training: 75%|███████▍ | 7458/10000 [1:38:36<31:43, 1.34it/s, loss=0.0053, lr=6.25e-06, step=7458] Training: 75%|███████▍ | 7459/10000 [1:38:37<31:13, 1.36it/s, loss=0.0053, lr=6.25e-06, step=7458] Training: 75%|███████▍ | 7459/10000 [1:38:37<31:13, 1.36it/s, loss=0.0217, lr=6.25e-06, step=7459]20:23:09.743 [I] step=7460 loss=0.0034 smoothed_loss=0.0083 lr=6.26e-06 grad_norm=0.4190 step_time=0.5603s data_time=0.1303s it/s=1.448 eta_to_10000=1753.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0073 grad_action_out_proj_arms=0.0854 grad_arm_token_fuse=0.0359 grad_shared_expert=0.2317 (18633:train_pytorch.py:850) + Training: 75%|███████▍ | 7460/10000 [1:38:37<31:44, 1.33it/s, loss=0.0217, lr=6.25e-06, step=7459] Training: 75%|███████▍ | 7460/10000 [1:38:37<31:44, 1.33it/s, loss=0.0034, lr=6.24e-06, step=7460] Training: 75%|███████▍ | 7461/10000 [1:38:38<31:31, 1.34it/s, loss=0.0034, lr=6.24e-06, step=7460] Training: 75%|███████▍ | 7461/10000 [1:38:38<31:31, 1.34it/s, loss=0.0013, lr=6.24e-06, step=7461] Training: 75%|███████▍ | 7462/10000 [1:38:39<28:23, 1.49it/s, loss=0.0013, lr=6.24e-06, step=7461] Training: 75%|███████▍ | 7462/10000 [1:38:39<28:23, 1.49it/s, loss=0.0022, lr=6.24e-06, step=7462] Training: 75%|███████▍ | 7463/10000 [1:38:39<30:33, 1.38it/s, loss=0.0022, lr=6.24e-06, step=7462] Training: 75%|███████▍ | 7463/10000 [1:38:39<30:33, 1.38it/s, loss=0.0083, lr=6.24e-06, step=7463] Training: 75%|███████▍ | 7464/10000 [1:38:40<32:42, 1.29it/s, loss=0.0083, lr=6.24e-06, step=7463] Training: 75%|███████▍ | 7464/10000 [1:38:40<32:42, 1.29it/s, loss=0.0108, lr=6.23e-06, step=7464] Training: 75%|███████▍ | 7465/10000 [1:38:41<31:58, 1.32it/s, loss=0.0108, lr=6.23e-06, step=7464] Training: 75%|███████▍ | 7465/10000 [1:38:41<31:58, 1.32it/s, loss=0.0037, lr=6.23e-06, step=7465] Training: 75%|███████▍ | 7466/10000 [1:38:42<28:43, 1.47it/s, loss=0.0037, lr=6.23e-06, step=7465] Training: 75%|███████▍ | 7466/10000 [1:38:42<28:43, 1.47it/s, loss=0.0116, lr=6.23e-06, step=7466] Training: 75%|███████▍ | 7467/10000 [1:38:42<26:38, 1.58it/s, loss=0.0116, lr=6.23e-06, step=7466] Training: 75%|███████▍ | 7467/10000 [1:38:42<26:38, 1.58it/s, loss=0.0111, lr=6.22e-06, step=7467] Training: 75%|███████▍ | 7468/10000 [1:38:43<27:01, 1.56it/s, loss=0.0111, lr=6.22e-06, step=7467] Training: 75%|███████▍ | 7468/10000 [1:38:43<27:01, 1.56it/s, loss=0.0038, lr=6.22e-06, step=7468] Training: 75%|███████▍ | 7469/10000 [1:38:43<26:31, 1.59it/s, loss=0.0038, lr=6.22e-06, step=7468] Training: 75%|███████▍ | 7469/10000 [1:38:43<26:31, 1.59it/s, loss=0.0082, lr=6.22e-06, step=7469]20:23:16.272 [I] step=7470 loss=0.0334 smoothed_loss=0.0102 lr=6.23e-06 grad_norm=0.4182 step_time=0.5417s data_time=0.1112s it/s=1.532 eta_to_10000=1651.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0128 grad_action_out_proj_arms=0.1768 grad_arm_token_fuse=0.0708 grad_shared_expert=0.4446 (18633:train_pytorch.py:850) + Training: 75%|███████▍ | 7470/10000 [1:38:44<25:39, 1.64it/s, loss=0.0082, lr=6.22e-06, step=7469] Training: 75%|███████▍ | 7470/10000 [1:38:44<25:39, 1.64it/s, loss=0.0334, lr=6.22e-06, step=7470] Training: 75%|███████▍ | 7471/10000 [1:38:45<26:00, 1.62it/s, loss=0.0334, lr=6.22e-06, step=7470] Training: 75%|███████▍ | 7471/10000 [1:38:45<26:00, 1.62it/s, loss=0.0081, lr=6.21e-06, step=7471] Training: 75%|███████▍ | 7472/10000 [1:38:45<29:21, 1.44it/s, loss=0.0081, lr=6.21e-06, step=7471] Training: 75%|███████▍ | 7472/10000 [1:38:45<29:21, 1.44it/s, loss=0.0280, lr=6.21e-06, step=7472] Training: 75%|███████▍ | 7473/10000 [1:38:46<29:15, 1.44it/s, loss=0.0280, lr=6.21e-06, step=7472] Training: 75%|███████▍ | 7473/10000 [1:38:46<29:15, 1.44it/s, loss=0.0010, lr=6.21e-06, step=7473] Training: 75%|███████▍ | 7474/10000 [1:38:47<26:39, 1.58it/s, loss=0.0010, lr=6.21e-06, step=7473] Training: 75%|███████▍ | 7474/10000 [1:38:47<26:39, 1.58it/s, loss=0.0096, lr=6.20e-06, step=7474] Training: 75%|███████▍ | 7475/10000 [1:38:47<25:15, 1.67it/s, loss=0.0096, lr=6.20e-06, step=7474] Training: 75%|███████▍ | 7475/10000 [1:38:47<25:15, 1.67it/s, loss=0.0877, lr=6.20e-06, step=7475] Training: 75%|███████▍ | 7476/10000 [1:38:48<24:39, 1.71it/s, loss=0.0877, lr=6.20e-06, step=7475] Training: 75%|███████▍ | 7476/10000 [1:38:48<24:39, 1.71it/s, loss=0.0080, lr=6.20e-06, step=7476] Training: 75%|███████▍ | 7477/10000 [1:38:48<23:16, 1.81it/s, loss=0.0080, lr=6.20e-06, step=7476] Training: 75%|███████▍ | 7477/10000 [1:38:48<23:16, 1.81it/s, loss=0.0016, lr=6.20e-06, step=7477] Training: 75%|███████▍ | 7478/10000 [1:38:49<25:58, 1.62it/s, loss=0.0016, lr=6.20e-06, step=7477] Training: 75%|███████▍ | 7478/10000 [1:38:49<25:58, 1.62it/s, loss=0.0441, lr=6.19e-06, step=7478] Training: 75%|███████▍ | 7479/10000 [1:38:50<28:31, 1.47it/s, loss=0.0441, lr=6.19e-06, step=7478] Training: 75%|███████▍ | 7479/10000 [1:38:50<28:31, 1.47it/s, loss=0.0638, lr=6.19e-06, step=7479]20:23:22.651 [I] step=7480 loss=0.0068 smoothed_loss=0.0214 lr=6.20e-06 grad_norm=0.5114 step_time=0.5402s data_time=0.0977s it/s=1.568 eta_to_10000=1607.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1382 grad_arm_token_fuse=0.0839 grad_shared_expert=0.5422 (18633:train_pytorch.py:850) + Training: 75%|███████▍ | 7480/10000 [1:38:50<26:42, 1.57it/s, loss=0.0638, lr=6.19e-06, step=7479] Training: 75%|███████▍ | 7480/10000 [1:38:50<26:42, 1.57it/s, loss=0.0068, lr=6.19e-06, step=7480] Training: 75%|███████▍ | 7481/10000 [1:38:51<27:04, 1.55it/s, loss=0.0068, lr=6.19e-06, step=7480] Training: 75%|███████▍ | 7481/10000 [1:38:51<27:04, 1.55it/s, loss=0.0008, lr=6.19e-06, step=7481] Training: 75%|███████▍ | 7482/10000 [1:38:51<25:15, 1.66it/s, loss=0.0008, lr=6.19e-06, step=7481] Training: 75%|███████▍ | 7482/10000 [1:38:51<25:15, 1.66it/s, loss=0.0038, lr=6.18e-06, step=7482] Training: 75%|███████▍ | 7483/10000 [1:38:52<23:49, 1.76it/s, loss=0.0038, lr=6.18e-06, step=7482] Training: 75%|███████▍ | 7483/10000 [1:38:52<23:49, 1.76it/s, loss=0.0108, lr=6.18e-06, step=7483] Training: 75%|███████▍ | 7484/10000 [1:38:52<23:11, 1.81it/s, loss=0.0108, lr=6.18e-06, step=7483] Training: 75%|███████▍ | 7484/10000 [1:38:52<23:11, 1.81it/s, loss=0.0017, lr=6.18e-06, step=7484] Training: 75%|███████▍ | 7485/10000 [1:38:53<24:14, 1.73it/s, loss=0.0017, lr=6.18e-06, step=7484] Training: 75%|███████▍ | 7485/10000 [1:38:53<24:14, 1.73it/s, loss=0.0069, lr=6.17e-06, step=7485] Training: 75%|███████▍ | 7486/10000 [1:38:54<26:43, 1.57it/s, loss=0.0069, lr=6.17e-06, step=7485] Training: 75%|███████▍ | 7486/10000 [1:38:54<26:43, 1.57it/s, loss=0.0151, lr=6.17e-06, step=7486] Training: 75%|███████▍ | 7487/10000 [1:38:55<26:25, 1.58it/s, loss=0.0151, lr=6.17e-06, step=7486] Training: 75%|███████▍ | 7487/10000 [1:38:55<26:25, 1.58it/s, loss=0.0162, lr=6.17e-06, step=7487] Training: 75%|███████▍ | 7488/10000 [1:38:55<24:51, 1.68it/s, loss=0.0162, lr=6.17e-06, step=7487] Training: 75%|███████▍ | 7488/10000 [1:38:55<24:51, 1.68it/s, loss=0.0039, lr=6.17e-06, step=7488] Training: 75%|███████▍ | 7489/10000 [1:38:56<28:28, 1.47it/s, loss=0.0039, lr=6.17e-06, step=7488] Training: 75%|███████▍ | 7489/10000 [1:38:56<28:28, 1.47it/s, loss=0.0023, lr=6.16e-06, step=7489]20:23:28.795 [I] step=7490 loss=0.0688 smoothed_loss=0.0182 lr=6.17e-06 grad_norm=0.4321 step_time=0.5063s data_time=0.1081s it/s=1.628 eta_to_10000=1542.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0378 grad_action_out_proj_arms=0.1594 grad_arm_token_fuse=0.2008 grad_shared_expert=0.4793 (18633:train_pytorch.py:850) + Training: 75%|███████▍ | 7490/10000 [1:38:56<26:51, 1.56it/s, loss=0.0023, lr=6.16e-06, step=7489] Training: 75%|███████▍ | 7490/10000 [1:38:56<26:51, 1.56it/s, loss=0.0688, lr=6.16e-06, step=7490] Training: 75%|███████▍ | 7491/10000 [1:38:57<24:48, 1.69it/s, loss=0.0688, lr=6.16e-06, step=7490] Training: 75%|███████▍ | 7491/10000 [1:38:57<24:48, 1.69it/s, loss=0.0029, lr=6.16e-06, step=7491] Training: 75%|███████▍ | 7492/10000 [1:38:58<27:06, 1.54it/s, loss=0.0029, lr=6.16e-06, step=7491] Training: 75%|███████▍ | 7492/10000 [1:38:58<27:06, 1.54it/s, loss=0.0020, lr=6.16e-06, step=7492] Training: 75%|███████▍ | 7493/10000 [1:38:59<32:05, 1.30it/s, loss=0.0020, lr=6.16e-06, step=7492] Training: 75%|███████▍ | 7493/10000 [1:38:59<32:05, 1.30it/s, loss=0.0481, lr=6.15e-06, step=7493] Training: 75%|███████▍ | 7494/10000 [1:39:00<33:41, 1.24it/s, loss=0.0481, lr=6.15e-06, step=7493] Training: 75%|███████▍ | 7494/10000 [1:39:00<33:41, 1.24it/s, loss=0.0703, lr=6.15e-06, step=7494] Training: 75%|███████▍ | 7495/10000 [1:39:00<30:16, 1.38it/s, loss=0.0703, lr=6.15e-06, step=7494] Training: 75%|███████▍ | 7495/10000 [1:39:00<30:16, 1.38it/s, loss=0.0027, lr=6.15e-06, step=7495] Training: 75%|███████▍ | 7496/10000 [1:39:01<31:42, 1.32it/s, loss=0.0027, lr=6.15e-06, step=7495] Training: 75%|███████▍ | 7496/10000 [1:39:01<31:42, 1.32it/s, loss=0.0050, lr=6.14e-06, step=7496] Training: 75%|███████▍ | 7497/10000 [1:39:02<32:59, 1.26it/s, loss=0.0050, lr=6.14e-06, step=7496] Training: 75%|███████▍ | 7497/10000 [1:39:02<32:59, 1.26it/s, loss=0.0027, lr=6.14e-06, step=7497] Training: 75%|███████▍ | 7498/10000 [1:39:02<29:12, 1.43it/s, loss=0.0027, lr=6.14e-06, step=7497] Training: 75%|███████▍ | 7498/10000 [1:39:02<29:12, 1.43it/s, loss=0.0012, lr=6.14e-06, step=7498] Training: 75%|███████▍ | 7499/10000 [1:39:03<31:28, 1.32it/s, loss=0.0012, lr=6.14e-06, step=7498] Training: 75%|███████▍ | 7499/10000 [1:39:03<31:28, 1.32it/s, loss=0.0016, lr=6.14e-06, step=7499]20:23:36.321 [I] step=7500 loss=0.0058 smoothed_loss=0.0141 lr=6.15e-06 grad_norm=0.4019 step_time=0.5971s data_time=0.1554s it/s=1.329 eta_to_10000=1881.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.1131 grad_arm_token_fuse=0.0751 grad_shared_expert=0.4870 (18633:train_pytorch.py:850) + Training: 75%|███████▌ | 7500/10000 [1:39:04<30:55, 1.35it/s, loss=0.0016, lr=6.14e-06, step=7499] Training: 75%|███████▌ | 7500/10000 [1:39:04<30:55, 1.35it/s, loss=0.0058, lr=6.13e-06, step=7500] Training: 75%|███████▌ | 7501/10000 [1:39:05<29:02, 1.43it/s, loss=0.0058, lr=6.13e-06, step=7500] Training: 75%|███████▌ | 7501/10000 [1:39:05<29:02, 1.43it/s, loss=0.0027, lr=6.13e-06, step=7501] Training: 75%|███████▌ | 7502/10000 [1:39:05<30:05, 1.38it/s, loss=0.0027, lr=6.13e-06, step=7501] Training: 75%|███████▌ | 7502/10000 [1:39:05<30:05, 1.38it/s, loss=0.0028, lr=6.13e-06, step=7502] Training: 75%|███████▌ | 7503/10000 [1:39:06<31:20, 1.33it/s, loss=0.0028, lr=6.13e-06, step=7502] Training: 75%|███████▌ | 7503/10000 [1:39:06<31:20, 1.33it/s, loss=0.0029, lr=6.13e-06, step=7503] Training: 75%|███████▌ | 7504/10000 [1:39:07<33:21, 1.25it/s, loss=0.0029, lr=6.13e-06, step=7503] Training: 75%|███████▌ | 7504/10000 [1:39:07<33:21, 1.25it/s, loss=0.0069, lr=6.12e-06, step=7504] Training: 75%|███████▌ | 7505/10000 [1:39:08<31:44, 1.31it/s, loss=0.0069, lr=6.12e-06, step=7504] Training: 75%|███████▌ | 7505/10000 [1:39:08<31:44, 1.31it/s, loss=0.0079, lr=6.12e-06, step=7505] Training: 75%|███████▌ | 7506/10000 [1:39:09<32:21, 1.28it/s, loss=0.0079, lr=6.12e-06, step=7505] Training: 75%|███████▌ | 7506/10000 [1:39:09<32:21, 1.28it/s, loss=0.0144, lr=6.12e-06, step=7506] Training: 75%|███████▌ | 7507/10000 [1:39:09<31:25, 1.32it/s, loss=0.0144, lr=6.12e-06, step=7506] Training: 75%|███████▌ | 7507/10000 [1:39:09<31:25, 1.32it/s, loss=0.0069, lr=6.11e-06, step=7507] Training: 75%|███████▌ | 7508/10000 [1:39:10<29:52, 1.39it/s, loss=0.0069, lr=6.11e-06, step=7507] Training: 75%|███████▌ | 7508/10000 [1:39:10<29:52, 1.39it/s, loss=0.0109, lr=6.11e-06, step=7508] Training: 75%|███████▌ | 7509/10000 [1:39:11<30:02, 1.38it/s, loss=0.0109, lr=6.11e-06, step=7508] Training: 75%|███████▌ | 7509/10000 [1:39:11<30:02, 1.38it/s, loss=0.0014, lr=6.11e-06, step=7509]20:23:43.763 [I] step=7510 loss=0.0033 smoothed_loss=0.0089 lr=6.12e-06 grad_norm=0.4053 step_time=0.5975s data_time=0.1467s it/s=1.344 eta_to_10000=1852.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0153 grad_action_out_proj_arms=0.0994 grad_arm_token_fuse=0.0832 grad_shared_expert=0.3729 (18633:train_pytorch.py:850) + Training: 75%|███████▌ | 7510/10000 [1:39:11<30:35, 1.36it/s, loss=0.0014, lr=6.11e-06, step=7509] Training: 75%|███████▌ | 7510/10000 [1:39:11<30:35, 1.36it/s, loss=0.0033, lr=6.11e-06, step=7510] Training: 75%|███████▌ | 7511/10000 [1:39:12<30:46, 1.35it/s, loss=0.0033, lr=6.11e-06, step=7510] Training: 75%|███████▌ | 7511/10000 [1:39:12<30:46, 1.35it/s, loss=0.0038, lr=6.10e-06, step=7511] Training: 75%|███████▌ | 7512/10000 [1:39:13<32:54, 1.26it/s, loss=0.0038, lr=6.10e-06, step=7511] Training: 75%|███████▌ | 7512/10000 [1:39:13<32:54, 1.26it/s, loss=0.0451, lr=6.10e-06, step=7512] Training: 75%|███████▌ | 7513/10000 [1:39:14<33:08, 1.25it/s, loss=0.0451, lr=6.10e-06, step=7512] Training: 75%|███████▌ | 7513/10000 [1:39:14<33:08, 1.25it/s, loss=0.0038, lr=6.10e-06, step=7513] Training: 75%|███████▌ | 7514/10000 [1:39:15<32:12, 1.29it/s, loss=0.0038, lr=6.10e-06, step=7513] Training: 75%|███████▌ | 7514/10000 [1:39:15<32:12, 1.29it/s, loss=0.0068, lr=6.10e-06, step=7514] Training: 75%|███████▌ | 7515/10000 [1:39:15<31:14, 1.33it/s, loss=0.0068, lr=6.10e-06, step=7514] Training: 75%|███████▌ | 7515/10000 [1:39:15<31:14, 1.33it/s, loss=0.0350, lr=6.09e-06, step=7515] Training: 75%|███████▌ | 7516/10000 [1:39:16<33:41, 1.23it/s, loss=0.0350, lr=6.09e-06, step=7515] Training: 75%|███████▌ | 7516/10000 [1:39:16<33:41, 1.23it/s, loss=0.0015, lr=6.09e-06, step=7516] Training: 75%|███████▌ | 7517/10000 [1:39:17<36:05, 1.15it/s, loss=0.0015, lr=6.09e-06, step=7516] Training: 75%|███████▌ | 7517/10000 [1:39:17<36:05, 1.15it/s, loss=0.0056, lr=6.09e-06, step=7517] Training: 75%|███████▌ | 7518/10000 [1:39:18<36:35, 1.13it/s, loss=0.0056, lr=6.09e-06, step=7517] Training: 75%|███████▌ | 7518/10000 [1:39:18<36:35, 1.13it/s, loss=0.0167, lr=6.08e-06, step=7518] Training: 75%|███████▌ | 7519/10000 [1:39:19<34:35, 1.20it/s, loss=0.0167, lr=6.08e-06, step=7518] Training: 75%|███████▌ | 7519/10000 [1:39:19<34:35, 1.20it/s, loss=0.0084, lr=6.08e-06, step=7519]20:23:51.769 [I] step=7520 loss=0.0193 smoothed_loss=0.0123 lr=6.09e-06 grad_norm=0.4642 step_time=0.6284s data_time=0.1722s it/s=1.249 eta_to_10000=1985.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1007 grad_arm_token_fuse=0.0752 grad_shared_expert=0.4445 (18633:train_pytorch.py:850) + Training: 75%|███████▌ | 7520/10000 [1:39:19<30:24, 1.36it/s, loss=0.0084, lr=6.08e-06, step=7519] Training: 75%|███████▌ | 7520/10000 [1:39:19<30:24, 1.36it/s, loss=0.0193, lr=6.08e-06, step=7520] Training: 75%|███████▌ | 7521/10000 [1:39:20<29:27, 1.40it/s, loss=0.0193, lr=6.08e-06, step=7520] Training: 75%|███████▌ | 7521/10000 [1:39:20<29:27, 1.40it/s, loss=0.0144, lr=6.08e-06, step=7521] Training: 75%|███████▌ | 7522/10000 [1:39:21<30:27, 1.36it/s, loss=0.0144, lr=6.08e-06, step=7521] Training: 75%|███████▌ | 7522/10000 [1:39:21<30:27, 1.36it/s, loss=0.0025, lr=6.07e-06, step=7522] Training: 75%|███████▌ | 7523/10000 [1:39:22<30:59, 1.33it/s, loss=0.0025, lr=6.07e-06, step=7522] Training: 75%|███████▌ | 7523/10000 [1:39:22<30:59, 1.33it/s, loss=0.0092, lr=6.07e-06, step=7523] Training: 75%|███████▌ | 7524/10000 [1:39:22<31:00, 1.33it/s, loss=0.0092, lr=6.07e-06, step=7523] Training: 75%|███████▌ | 7524/10000 [1:39:22<31:00, 1.33it/s, loss=0.0356, lr=6.07e-06, step=7524] Training: 75%|███████▌ | 7525/10000 [1:39:23<32:16, 1.28it/s, loss=0.0356, lr=6.07e-06, step=7524] Training: 75%|███████▌ | 7525/10000 [1:39:23<32:16, 1.28it/s, loss=0.0023, lr=6.07e-06, step=7525] Training: 75%|███████▌ | 7526/10000 [1:39:24<33:05, 1.25it/s, loss=0.0023, lr=6.07e-06, step=7525] Training: 75%|███████▌ | 7526/10000 [1:39:24<33:05, 1.25it/s, loss=0.0022, lr=6.06e-06, step=7526] Training: 75%|███████▌ | 7527/10000 [1:39:25<34:48, 1.18it/s, loss=0.0022, lr=6.06e-06, step=7526] Training: 75%|███████▌ | 7527/10000 [1:39:25<34:48, 1.18it/s, loss=0.0035, lr=6.06e-06, step=7527] Training: 75%|███████▌ | 7528/10000 [1:39:26<38:28, 1.07it/s, loss=0.0035, lr=6.06e-06, step=7527] Training: 75%|███████▌ | 7528/10000 [1:39:26<38:28, 1.07it/s, loss=0.0048, lr=6.06e-06, step=7528] Training: 75%|███████▌ | 7529/10000 [1:39:27<39:16, 1.05it/s, loss=0.0048, lr=6.06e-06, step=7528] Training: 75%|███████▌ | 7529/10000 [1:39:27<39:16, 1.05it/s, loss=0.0039, lr=6.05e-06, step=7529]20:24:00.426 [I] step=7530 loss=0.0113 smoothed_loss=0.0097 lr=6.06e-06 grad_norm=0.4712 step_time=0.6707s data_time=0.1950s it/s=1.155 eta_to_10000=2138.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.1063 grad_arm_token_fuse=0.0629 grad_shared_expert=0.3580 (18633:train_pytorch.py:850) + Training: 75%|███████▌ | 7530/10000 [1:39:28<38:20, 1.07it/s, loss=0.0039, lr=6.05e-06, step=7529] Training: 75%|███████▌ | 7530/10000 [1:39:28<38:20, 1.07it/s, loss=0.0113, lr=6.05e-06, step=7530] Training: 75%|███████▌ | 7531/10000 [1:39:29<37:49, 1.09it/s, loss=0.0113, lr=6.05e-06, step=7530] Training: 75%|███████▌ | 7531/10000 [1:39:29<37:49, 1.09it/s, loss=0.0248, lr=6.05e-06, step=7531] Training: 75%|███████▌ | 7532/10000 [1:39:30<37:52, 1.09it/s, loss=0.0248, lr=6.05e-06, step=7531] Training: 75%|███████▌ | 7532/10000 [1:39:30<37:52, 1.09it/s, loss=0.0042, lr=6.05e-06, step=7532] Training: 75%|███████▌ | 7533/10000 [1:39:31<35:48, 1.15it/s, loss=0.0042, lr=6.05e-06, step=7532] Training: 75%|███████▌ | 7533/10000 [1:39:31<35:48, 1.15it/s, loss=0.0073, lr=6.04e-06, step=7533] Training: 75%|███████▌ | 7534/10000 [1:39:32<37:15, 1.10it/s, loss=0.0073, lr=6.04e-06, step=7533] Training: 75%|███████▌ | 7534/10000 [1:39:32<37:15, 1.10it/s, loss=0.0056, lr=6.04e-06, step=7534] Training: 75%|███████▌ | 7535/10000 [1:39:32<35:56, 1.14it/s, loss=0.0056, lr=6.04e-06, step=7534] Training: 75%|███████▌ | 7535/10000 [1:39:32<35:56, 1.14it/s, loss=0.0045, lr=6.04e-06, step=7535] Training: 75%|███████▌ | 7536/10000 [1:39:33<36:44, 1.12it/s, loss=0.0045, lr=6.04e-06, step=7535] Training: 75%|███████▌ | 7536/10000 [1:39:33<36:44, 1.12it/s, loss=0.0029, lr=6.04e-06, step=7536] Training: 75%|███████▌ | 7537/10000 [1:39:34<35:54, 1.14it/s, loss=0.0029, lr=6.04e-06, step=7536] Training: 75%|███████▌ | 7537/10000 [1:39:34<35:54, 1.14it/s, loss=0.0029, lr=6.03e-06, step=7537] Training: 75%|███████▌ | 7538/10000 [1:39:35<31:13, 1.31it/s, loss=0.0029, lr=6.03e-06, step=7537] Training: 75%|███████▌ | 7538/10000 [1:39:35<31:13, 1.31it/s, loss=0.0046, lr=6.03e-06, step=7538] Training: 75%|███████▌ | 7539/10000 [1:39:36<31:28, 1.30it/s, loss=0.0046, lr=6.03e-06, step=7538] Training: 75%|███████▌ | 7539/10000 [1:39:36<31:28, 1.30it/s, loss=0.0029, lr=6.03e-06, step=7539]20:24:08.685 [I] step=7540 loss=0.0226 smoothed_loss=0.0087 lr=6.04e-06 grad_norm=0.4313 step_time=0.6214s data_time=0.2044s it/s=1.211 eta_to_10000=2031.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0243 grad_action_out_proj_arms=0.1999 grad_arm_token_fuse=0.1317 grad_shared_expert=0.6251 (18633:train_pytorch.py:850) + Training: 75%|███████▌ | 7540/10000 [1:39:36<32:28, 1.26it/s, loss=0.0029, lr=6.03e-06, step=7539] Training: 75%|███████▌ | 7540/10000 [1:39:36<32:28, 1.26it/s, loss=0.0226, lr=6.02e-06, step=7540] Training: 75%|███████▌ | 7541/10000 [1:39:37<28:42, 1.43it/s, loss=0.0226, lr=6.02e-06, step=7540] Training: 75%|███████▌ | 7541/10000 [1:39:37<28:42, 1.43it/s, loss=0.0021, lr=6.02e-06, step=7541] Training: 75%|███████▌ | 7542/10000 [1:39:38<29:37, 1.38it/s, loss=0.0021, lr=6.02e-06, step=7541] Training: 75%|███████▌ | 7542/10000 [1:39:38<29:37, 1.38it/s, loss=0.0076, lr=6.02e-06, step=7542] Training: 75%|███████▌ | 7543/10000 [1:39:38<29:35, 1.38it/s, loss=0.0076, lr=6.02e-06, step=7542] Training: 75%|███████▌ | 7543/10000 [1:39:38<29:35, 1.38it/s, loss=0.0063, lr=6.02e-06, step=7543] Training: 75%|███████▌ | 7544/10000 [1:39:39<26:34, 1.54it/s, loss=0.0063, lr=6.02e-06, step=7543] Training: 75%|███████▌ | 7544/10000 [1:39:39<26:34, 1.54it/s, loss=0.0043, lr=6.01e-06, step=7544] Training: 75%|███████▌ | 7545/10000 [1:39:40<28:31, 1.43it/s, loss=0.0043, lr=6.01e-06, step=7544] Training: 75%|███████▌ | 7545/10000 [1:39:40<28:31, 1.43it/s, loss=0.0533, lr=6.01e-06, step=7545] Training: 75%|███████▌ | 7546/10000 [1:39:40<28:03, 1.46it/s, loss=0.0533, lr=6.01e-06, step=7545] Training: 75%|███████▌ | 7546/10000 [1:39:40<28:03, 1.46it/s, loss=0.0127, lr=6.01e-06, step=7546] Training: 75%|███████▌ | 7547/10000 [1:39:41<25:26, 1.61it/s, loss=0.0127, lr=6.01e-06, step=7546] Training: 75%|███████▌ | 7547/10000 [1:39:41<25:26, 1.61it/s, loss=0.0130, lr=6.01e-06, step=7547] Training: 75%|███████▌ | 7548/10000 [1:39:41<25:37, 1.59it/s, loss=0.0130, lr=6.01e-06, step=7547] Training: 75%|███████▌ | 7548/10000 [1:39:41<25:37, 1.59it/s, loss=0.0042, lr=6.00e-06, step=7548] Training: 75%|███████▌ | 7549/10000 [1:39:42<24:58, 1.64it/s, loss=0.0042, lr=6.00e-06, step=7548] Training: 75%|███████▌ | 7549/10000 [1:39:42<24:58, 1.64it/s, loss=0.0133, lr=6.00e-06, step=7549]20:24:15.235 [I] step=7550 loss=0.0060 smoothed_loss=0.0110 lr=6.01e-06 grad_norm=0.4780 step_time=0.5320s data_time=0.1231s it/s=1.527 eta_to_10000=1604.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0094 grad_action_out_proj_arms=0.0834 grad_arm_token_fuse=0.0504 grad_shared_expert=0.3842 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7550/10000 [1:39:43<28:54, 1.41it/s, loss=0.0133, lr=6.00e-06, step=7549] Training: 76%|███████▌ | 7550/10000 [1:39:43<28:54, 1.41it/s, loss=0.0060, lr=6.00e-06, step=7550] Training: 76%|███████▌ | 7551/10000 [1:39:44<28:02, 1.46it/s, loss=0.0060, lr=6.00e-06, step=7550] Training: 76%|███████▌ | 7551/10000 [1:39:44<28:02, 1.46it/s, loss=0.0034, lr=5.99e-06, step=7551] Training: 76%|███████▌ | 7552/10000 [1:39:44<30:52, 1.32it/s, loss=0.0034, lr=5.99e-06, step=7551] Training: 76%|███████▌ | 7552/10000 [1:39:44<30:52, 1.32it/s, loss=0.0059, lr=5.99e-06, step=7552] Training: 76%|███████▌ | 7553/10000 [1:39:45<32:33, 1.25it/s, loss=0.0059, lr=5.99e-06, step=7552] Training: 76%|███████▌ | 7553/10000 [1:39:45<32:33, 1.25it/s, loss=0.0017, lr=5.99e-06, step=7553] Training: 76%|███████▌ | 7554/10000 [1:39:46<32:04, 1.27it/s, loss=0.0017, lr=5.99e-06, step=7553] Training: 76%|███████▌ | 7554/10000 [1:39:46<32:04, 1.27it/s, loss=0.0229, lr=5.99e-06, step=7554] Training: 76%|███████▌ | 7555/10000 [1:39:47<29:38, 1.37it/s, loss=0.0229, lr=5.99e-06, step=7554] Training: 76%|███████▌ | 7555/10000 [1:39:47<29:38, 1.37it/s, loss=0.0076, lr=5.98e-06, step=7555] Training: 76%|███████▌ | 7556/10000 [1:39:48<30:53, 1.32it/s, loss=0.0076, lr=5.98e-06, step=7555] Training: 76%|███████▌ | 7556/10000 [1:39:48<30:53, 1.32it/s, loss=0.0062, lr=5.98e-06, step=7556] Training: 76%|███████▌ | 7557/10000 [1:39:49<34:14, 1.19it/s, loss=0.0062, lr=5.98e-06, step=7556] Training: 76%|███████▌ | 7557/10000 [1:39:49<34:14, 1.19it/s, loss=0.0023, lr=5.98e-06, step=7557] Training: 76%|███████▌ | 7558/10000 [1:39:50<37:16, 1.09it/s, loss=0.0023, lr=5.98e-06, step=7557] Training: 76%|███████▌ | 7558/10000 [1:39:50<37:16, 1.09it/s, loss=0.0039, lr=5.98e-06, step=7558] Training: 76%|███████▌ | 7559/10000 [1:39:51<36:33, 1.11it/s, loss=0.0039, lr=5.98e-06, step=7558] Training: 76%|███████▌ | 7559/10000 [1:39:51<36:33, 1.11it/s, loss=0.0090, lr=5.97e-06, step=7559]20:24:23.731 [I] step=7560 loss=0.0035 smoothed_loss=0.0080 lr=5.98e-06 grad_norm=0.4119 step_time=0.6609s data_time=0.1887s it/s=1.177 eta_to_10000=2072.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0173 grad_action_out_proj_arms=0.0987 grad_arm_token_fuse=0.0870 grad_shared_expert=0.4064 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7560/10000 [1:39:51<36:19, 1.12it/s, loss=0.0090, lr=5.97e-06, step=7559] Training: 76%|███████▌ | 7560/10000 [1:39:51<36:19, 1.12it/s, loss=0.0035, lr=5.97e-06, step=7560] Training: 76%|███████▌ | 7561/10000 [1:39:52<36:13, 1.12it/s, loss=0.0035, lr=5.97e-06, step=7560] Training: 76%|███████▌ | 7561/10000 [1:39:52<36:13, 1.12it/s, loss=0.0057, lr=5.97e-06, step=7561] Training: 76%|███████▌ | 7562/10000 [1:39:53<34:46, 1.17it/s, loss=0.0057, lr=5.97e-06, step=7561] Training: 76%|███████▌ | 7562/10000 [1:39:53<34:46, 1.17it/s, loss=0.0199, lr=5.97e-06, step=7562] Training: 76%|███████▌ | 7563/10000 [1:39:54<34:38, 1.17it/s, loss=0.0199, lr=5.97e-06, step=7562] Training: 76%|███████▌ | 7563/10000 [1:39:54<34:38, 1.17it/s, loss=0.0038, lr=5.96e-06, step=7563] Training: 76%|███████▌ | 7564/10000 [1:39:55<34:55, 1.16it/s, loss=0.0038, lr=5.96e-06, step=7563] Training: 76%|███████▌ | 7564/10000 [1:39:55<34:55, 1.16it/s, loss=0.0091, lr=5.96e-06, step=7564] Training: 76%|███████▌ | 7565/10000 [1:39:56<36:29, 1.11it/s, loss=0.0091, lr=5.96e-06, step=7564] Training: 76%|███████▌ | 7565/10000 [1:39:56<36:29, 1.11it/s, loss=0.0054, lr=5.96e-06, step=7565] Training: 76%|███████▌ | 7566/10000 [1:39:57<34:21, 1.18it/s, loss=0.0054, lr=5.96e-06, step=7565] Training: 76%|███████▌ | 7566/10000 [1:39:57<34:21, 1.18it/s, loss=0.0165, lr=5.95e-06, step=7566] Training: 76%|███████▌ | 7567/10000 [1:39:57<33:03, 1.23it/s, loss=0.0165, lr=5.95e-06, step=7566] Training: 76%|███████▌ | 7567/10000 [1:39:57<33:03, 1.23it/s, loss=0.0123, lr=5.95e-06, step=7567] Training: 76%|███████▌ | 7568/10000 [1:39:58<29:02, 1.40it/s, loss=0.0123, lr=5.95e-06, step=7567] Training: 76%|███████▌ | 7568/10000 [1:39:58<29:02, 1.40it/s, loss=0.0077, lr=5.95e-06, step=7568] Training: 76%|███████▌ | 7569/10000 [1:39:59<31:00, 1.31it/s, loss=0.0077, lr=5.95e-06, step=7568] Training: 76%|███████▌ | 7569/10000 [1:39:59<31:00, 1.31it/s, loss=0.0329, lr=5.95e-06, step=7569]20:24:31.538 [I] step=7570 loss=0.0046 smoothed_loss=0.0109 lr=5.96e-06 grad_norm=0.4475 step_time=0.5943s data_time=0.1864s it/s=1.281 eta_to_10000=1896.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0173 grad_action_out_proj_arms=0.1255 grad_arm_token_fuse=0.0967 grad_shared_expert=0.4002 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7570/10000 [1:39:59<29:00, 1.40it/s, loss=0.0329, lr=5.95e-06, step=7569] Training: 76%|███████▌ | 7570/10000 [1:39:59<29:00, 1.40it/s, loss=0.0046, lr=5.94e-06, step=7570] Training: 76%|███████▌ | 7571/10000 [1:40:00<30:01, 1.35it/s, loss=0.0046, lr=5.94e-06, step=7570] Training: 76%|███████▌ | 7571/10000 [1:40:00<30:01, 1.35it/s, loss=0.0130, lr=5.94e-06, step=7571] Training: 76%|███████▌ | 7572/10000 [1:40:01<31:08, 1.30it/s, loss=0.0130, lr=5.94e-06, step=7571] Training: 76%|███████▌ | 7572/10000 [1:40:01<31:08, 1.30it/s, loss=0.0036, lr=5.94e-06, step=7572] Training: 76%|███████▌ | 7573/10000 [1:40:01<27:43, 1.46it/s, loss=0.0036, lr=5.94e-06, step=7572] Training: 76%|███████▌ | 7573/10000 [1:40:01<27:43, 1.46it/s, loss=0.0469, lr=5.94e-06, step=7573] Training: 76%|███████▌ | 7574/10000 [1:40:02<29:32, 1.37it/s, loss=0.0469, lr=5.94e-06, step=7573] Training: 76%|███████▌ | 7574/10000 [1:40:02<29:32, 1.37it/s, loss=0.0028, lr=5.93e-06, step=7574] Training: 76%|███████▌ | 7575/10000 [1:40:03<27:43, 1.46it/s, loss=0.0028, lr=5.93e-06, step=7574] Training: 76%|███████▌ | 7575/10000 [1:40:03<27:43, 1.46it/s, loss=0.0071, lr=5.93e-06, step=7575] Training: 76%|███████▌ | 7576/10000 [1:40:03<25:10, 1.61it/s, loss=0.0071, lr=5.93e-06, step=7575] Training: 76%|███████▌ | 7576/10000 [1:40:03<25:10, 1.61it/s, loss=0.0053, lr=5.93e-06, step=7576] Training: 76%|███████▌ | 7577/10000 [1:40:04<23:38, 1.71it/s, loss=0.0053, lr=5.93e-06, step=7576] Training: 76%|███████▌ | 7577/10000 [1:40:04<23:38, 1.71it/s, loss=0.0112, lr=5.92e-06, step=7577] Training: 76%|███████▌ | 7578/10000 [1:40:04<24:36, 1.64it/s, loss=0.0112, lr=5.92e-06, step=7577] Training: 76%|███████▌ | 7578/10000 [1:40:04<24:36, 1.64it/s, loss=0.0959, lr=5.92e-06, step=7578] Training: 76%|███████▌ | 7579/10000 [1:40:05<25:47, 1.56it/s, loss=0.0959, lr=5.92e-06, step=7578] Training: 76%|███████▌ | 7579/10000 [1:40:05<25:47, 1.56it/s, loss=0.0057, lr=5.92e-06, step=7579]20:24:38.293 [I] step=7580 loss=0.0029 smoothed_loss=0.0170 lr=5.93e-06 grad_norm=0.4139 step_time=0.5607s data_time=0.1148s it/s=1.481 eta_to_10000=1634.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0071 grad_action_out_proj_arms=0.0704 grad_arm_token_fuse=0.0373 grad_shared_expert=0.3026 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7580/10000 [1:40:06<28:30, 1.41it/s, loss=0.0057, lr=5.92e-06, step=7579] Training: 76%|███████▌ | 7580/10000 [1:40:06<28:30, 1.41it/s, loss=0.0029, lr=5.92e-06, step=7580] Training: 76%|███████▌ | 7581/10000 [1:40:07<29:19, 1.37it/s, loss=0.0029, lr=5.92e-06, step=7580] Training: 76%|███████▌ | 7581/10000 [1:40:07<29:19, 1.37it/s, loss=0.0021, lr=5.91e-06, step=7581] Training: 76%|███████▌ | 7582/10000 [1:40:08<30:55, 1.30it/s, loss=0.0021, lr=5.91e-06, step=7581] Training: 76%|███████▌ | 7582/10000 [1:40:08<30:55, 1.30it/s, loss=0.0131, lr=5.91e-06, step=7582] Training: 76%|███████▌ | 7583/10000 [1:40:08<28:16, 1.42it/s, loss=0.0131, lr=5.91e-06, step=7582] Training: 76%|███████▌ | 7583/10000 [1:40:08<28:16, 1.42it/s, loss=0.0102, lr=5.91e-06, step=7583] Training: 76%|███████▌ | 7584/10000 [1:40:09<29:57, 1.34it/s, loss=0.0102, lr=5.91e-06, step=7583] Training: 76%|███████▌ | 7584/10000 [1:40:09<29:57, 1.34it/s, loss=0.0082, lr=5.91e-06, step=7584] Training: 76%|███████▌ | 7585/10000 [1:40:10<31:19, 1.28it/s, loss=0.0082, lr=5.91e-06, step=7584] Training: 76%|███████▌ | 7585/10000 [1:40:10<31:19, 1.28it/s, loss=0.0158, lr=5.90e-06, step=7585] Training: 76%|███████▌ | 7586/10000 [1:40:11<34:06, 1.18it/s, loss=0.0158, lr=5.90e-06, step=7585] Training: 76%|███████▌ | 7586/10000 [1:40:11<34:06, 1.18it/s, loss=0.0101, lr=5.90e-06, step=7586] Training: 76%|███████▌ | 7587/10000 [1:40:11<29:55, 1.34it/s, loss=0.0101, lr=5.90e-06, step=7586] Training: 76%|███████▌ | 7587/10000 [1:40:11<29:55, 1.34it/s, loss=0.0069, lr=5.90e-06, step=7587] Training: 76%|███████▌ | 7588/10000 [1:40:12<26:47, 1.50it/s, loss=0.0069, lr=5.90e-06, step=7587] Training: 76%|███████▌ | 7588/10000 [1:40:12<26:47, 1.50it/s, loss=0.0077, lr=5.90e-06, step=7588] Training: 76%|███████▌ | 7589/10000 [1:40:13<30:18, 1.33it/s, loss=0.0077, lr=5.90e-06, step=7588] Training: 76%|███████▌ | 7589/10000 [1:40:13<30:18, 1.33it/s, loss=0.0032, lr=5.89e-06, step=7589]20:24:46.024 [I] step=7590 loss=0.0079 smoothed_loss=0.0113 lr=5.90e-06 grad_norm=0.3748 step_time=0.6098s data_time=0.1633s it/s=1.294 eta_to_10000=1862.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0214 grad_action_out_proj_arms=0.1132 grad_arm_token_fuse=0.1137 grad_shared_expert=0.4573 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7590/10000 [1:40:14<31:54, 1.26it/s, loss=0.0032, lr=5.89e-06, step=7589] Training: 76%|███████▌ | 7590/10000 [1:40:14<31:54, 1.26it/s, loss=0.0079, lr=5.89e-06, step=7590] Training: 76%|███████▌ | 7591/10000 [1:40:14<31:44, 1.26it/s, loss=0.0079, lr=5.89e-06, step=7590] Training: 76%|███████▌ | 7591/10000 [1:40:14<31:44, 1.26it/s, loss=0.0083, lr=5.89e-06, step=7591] Training: 76%|███████▌ | 7592/10000 [1:40:15<27:55, 1.44it/s, loss=0.0083, lr=5.89e-06, step=7591] Training: 76%|███████▌ | 7592/10000 [1:40:15<27:55, 1.44it/s, loss=0.0145, lr=5.88e-06, step=7592] Training: 76%|███████▌ | 7593/10000 [1:40:16<33:00, 1.22it/s, loss=0.0145, lr=5.88e-06, step=7592] Training: 76%|███████▌ | 7593/10000 [1:40:16<33:00, 1.22it/s, loss=0.0069, lr=5.88e-06, step=7593] Training: 76%|███████▌ | 7594/10000 [1:40:17<34:51, 1.15it/s, loss=0.0069, lr=5.88e-06, step=7593] Training: 76%|███████▌ | 7594/10000 [1:40:17<34:51, 1.15it/s, loss=0.0053, lr=5.88e-06, step=7594] Training: 76%|███████▌ | 7595/10000 [1:40:18<32:00, 1.25it/s, loss=0.0053, lr=5.88e-06, step=7594] Training: 76%|███████▌ | 7595/10000 [1:40:18<32:00, 1.25it/s, loss=0.0048, lr=5.88e-06, step=7595] Training: 76%|███████▌ | 7596/10000 [1:40:19<32:50, 1.22it/s, loss=0.0048, lr=5.88e-06, step=7595] Training: 76%|███████▌ | 7596/10000 [1:40:19<32:50, 1.22it/s, loss=0.0017, lr=5.87e-06, step=7596] Training: 76%|███████▌ | 7597/10000 [1:40:19<29:11, 1.37it/s, loss=0.0017, lr=5.87e-06, step=7596] Training: 76%|███████▌ | 7597/10000 [1:40:19<29:11, 1.37it/s, loss=0.0084, lr=5.87e-06, step=7597] Training: 76%|███████▌ | 7598/10000 [1:40:20<29:50, 1.34it/s, loss=0.0084, lr=5.87e-06, step=7597] Training: 76%|███████▌ | 7598/10000 [1:40:20<29:50, 1.34it/s, loss=0.0013, lr=5.87e-06, step=7598] Training: 76%|███████▌ | 7599/10000 [1:40:21<29:14, 1.37it/s, loss=0.0013, lr=5.87e-06, step=7598] Training: 76%|███████▌ | 7599/10000 [1:40:21<29:14, 1.37it/s, loss=0.0234, lr=5.87e-06, step=7599]20:24:53.923 [I] step=7600 loss=0.0057 smoothed_loss=0.0093 lr=5.88e-06 grad_norm=0.4293 step_time=0.6245s data_time=0.1651s it/s=1.267 eta_to_10000=1894.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.0741 grad_arm_token_fuse=0.0340 grad_shared_expert=0.4352 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7600/10000 [1:40:22<33:05, 1.21it/s, loss=0.0234, lr=5.87e-06, step=7599] Training: 76%|███████▌ | 7600/10000 [1:40:22<33:05, 1.21it/s, loss=0.0057, lr=5.86e-06, step=7600] Training: 76%|███████▌ | 7601/10000 [1:40:23<36:02, 1.11it/s, loss=0.0057, lr=5.86e-06, step=7600] Training: 76%|███████▌ | 7601/10000 [1:40:23<36:02, 1.11it/s, loss=0.0049, lr=5.86e-06, step=7601] Training: 76%|███████▌ | 7602/10000 [1:40:24<37:42, 1.06it/s, loss=0.0049, lr=5.86e-06, step=7601] Training: 76%|███████▌ | 7602/10000 [1:40:24<37:42, 1.06it/s, loss=0.0104, lr=5.86e-06, step=7602] Training: 76%|███████▌ | 7603/10000 [1:40:25<36:26, 1.10it/s, loss=0.0104, lr=5.86e-06, step=7602] Training: 76%|███████▌ | 7603/10000 [1:40:25<36:26, 1.10it/s, loss=0.0023, lr=5.86e-06, step=7603] Training: 76%|███████▌ | 7604/10000 [1:40:25<35:40, 1.12it/s, loss=0.0023, lr=5.86e-06, step=7603] Training: 76%|███████▌ | 7604/10000 [1:40:25<35:40, 1.12it/s, loss=0.0052, lr=5.85e-06, step=7604] Training: 76%|███████▌ | 7605/10000 [1:40:26<35:56, 1.11it/s, loss=0.0052, lr=5.85e-06, step=7604] Training: 76%|███████▌ | 7605/10000 [1:40:26<35:56, 1.11it/s, loss=0.0124, lr=5.85e-06, step=7605] Training: 76%|███████▌ | 7606/10000 [1:40:27<36:48, 1.08it/s, loss=0.0124, lr=5.85e-06, step=7605] Training: 76%|███████▌ | 7606/10000 [1:40:27<36:48, 1.08it/s, loss=0.0058, lr=5.85e-06, step=7606] Training: 76%|███████▌ | 7607/10000 [1:40:28<37:21, 1.07it/s, loss=0.0058, lr=5.85e-06, step=7606] Training: 76%|███████▌ | 7607/10000 [1:40:28<37:21, 1.07it/s, loss=0.0042, lr=5.85e-06, step=7607] Training: 76%|███████▌ | 7608/10000 [1:40:29<38:29, 1.04it/s, loss=0.0042, lr=5.85e-06, step=7607] Training: 76%|███████▌ | 7608/10000 [1:40:29<38:29, 1.04it/s, loss=0.0116, lr=5.84e-06, step=7608] Training: 76%|███████▌ | 7609/10000 [1:40:30<36:01, 1.11it/s, loss=0.0116, lr=5.84e-06, step=7608] Training: 76%|███████▌ | 7609/10000 [1:40:30<36:01, 1.11it/s, loss=0.0095, lr=5.84e-06, step=7609]20:25:03.285 [I] step=7610 loss=0.0043 smoothed_loss=0.0079 lr=5.85e-06 grad_norm=0.3868 step_time=0.7012s data_time=0.2353s it/s=1.069 eta_to_10000=2235.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0130 grad_action_out_proj_arms=0.0915 grad_arm_token_fuse=0.0612 grad_shared_expert=0.3344 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7610/10000 [1:40:31<35:55, 1.11it/s, loss=0.0095, lr=5.84e-06, step=7609] Training: 76%|███████▌ | 7610/10000 [1:40:31<35:55, 1.11it/s, loss=0.0043, lr=5.84e-06, step=7610] Training: 76%|███████▌ | 7611/10000 [1:40:32<36:10, 1.10it/s, loss=0.0043, lr=5.84e-06, step=7610] Training: 76%|███████▌ | 7611/10000 [1:40:32<36:10, 1.10it/s, loss=0.0104, lr=5.83e-06, step=7611] Training: 76%|███████▌ | 7612/10000 [1:40:33<36:50, 1.08it/s, loss=0.0104, lr=5.83e-06, step=7611] Training: 76%|███████▌ | 7612/10000 [1:40:33<36:50, 1.08it/s, loss=0.0039, lr=5.83e-06, step=7612] Training: 76%|███████▌ | 7613/10000 [1:40:33<31:42, 1.25it/s, loss=0.0039, lr=5.83e-06, step=7612] Training: 76%|███████▌ | 7613/10000 [1:40:33<31:42, 1.25it/s, loss=0.0088, lr=5.83e-06, step=7613] Training: 76%|███████▌ | 7614/10000 [1:40:34<32:28, 1.22it/s, loss=0.0088, lr=5.83e-06, step=7613] Training: 76%|███████▌ | 7614/10000 [1:40:34<32:28, 1.22it/s, loss=0.0042, lr=5.83e-06, step=7614] Training: 76%|███████▌ | 7615/10000 [1:40:35<33:55, 1.17it/s, loss=0.0042, lr=5.83e-06, step=7614] Training: 76%|███████▌ | 7615/10000 [1:40:35<33:55, 1.17it/s, loss=0.0029, lr=5.82e-06, step=7615] Training: 76%|███████▌ | 7616/10000 [1:40:36<36:03, 1.10it/s, loss=0.0029, lr=5.82e-06, step=7615] Training: 76%|███████▌ | 7616/10000 [1:40:36<36:03, 1.10it/s, loss=0.0049, lr=5.82e-06, step=7616] Training: 76%|███████▌ | 7617/10000 [1:40:37<35:21, 1.12it/s, loss=0.0049, lr=5.82e-06, step=7616] Training: 76%|███████▌ | 7617/10000 [1:40:37<35:21, 1.12it/s, loss=0.0064, lr=5.82e-06, step=7617] Training: 76%|███████▌ | 7618/10000 [1:40:38<30:39, 1.29it/s, loss=0.0064, lr=5.82e-06, step=7617] Training: 76%|███████▌ | 7618/10000 [1:40:38<30:39, 1.29it/s, loss=0.0074, lr=5.82e-06, step=7618] Training: 76%|███████▌ | 7619/10000 [1:40:38<27:12, 1.46it/s, loss=0.0074, lr=5.82e-06, step=7618] Training: 76%|███████▌ | 7619/10000 [1:40:38<27:12, 1.46it/s, loss=0.0267, lr=5.81e-06, step=7619]20:25:11.146 [I] step=7620 loss=0.0022 smoothed_loss=0.0081 lr=5.82e-06 grad_norm=0.4257 step_time=0.6128s data_time=0.1733s it/s=1.272 eta_to_10000=1870.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0271 grad_action_out_proj_arms=0.1534 grad_arm_token_fuse=0.1356 grad_shared_expert=0.4250 (18633:train_pytorch.py:850) + Training: 76%|███████▌ | 7620/10000 [1:40:39<28:39, 1.38it/s, loss=0.0267, lr=5.81e-06, step=7619] Training: 76%|███████▌ | 7620/10000 [1:40:39<28:39, 1.38it/s, loss=0.0022, lr=5.81e-06, step=7620] Training: 76%|███████▌ | 7621/10000 [1:40:39<27:29, 1.44it/s, loss=0.0022, lr=5.81e-06, step=7620] Training: 76%|███████▌ | 7621/10000 [1:40:39<27:29, 1.44it/s, loss=0.0115, lr=5.81e-06, step=7621] Training: 76%|███████▌ | 7622/10000 [1:40:40<29:34, 1.34it/s, loss=0.0115, lr=5.81e-06, step=7621] Training: 76%|███████▌ | 7622/10000 [1:40:40<29:34, 1.34it/s, loss=0.0048, lr=5.81e-06, step=7622] Training: 76%|███████▌ | 7623/10000 [1:40:41<32:37, 1.21it/s, loss=0.0048, lr=5.81e-06, step=7622] Training: 76%|███████▌ | 7623/10000 [1:40:41<32:37, 1.21it/s, loss=0.0072, lr=5.80e-06, step=7623] Training: 76%|███████▌ | 7624/10000 [1:40:42<32:48, 1.21it/s, loss=0.0072, lr=5.80e-06, step=7623] Training: 76%|███████▌ | 7624/10000 [1:40:42<32:48, 1.21it/s, loss=0.0059, lr=5.80e-06, step=7624] Training: 76%|███████▋ | 7625/10000 [1:40:43<28:40, 1.38it/s, loss=0.0059, lr=5.80e-06, step=7624] Training: 76%|███████▋ | 7625/10000 [1:40:43<28:40, 1.38it/s, loss=0.0093, lr=5.80e-06, step=7625] Training: 76%|███████▋ | 7626/10000 [1:40:44<32:00, 1.24it/s, loss=0.0093, lr=5.80e-06, step=7625] Training: 76%|███████▋ | 7626/10000 [1:40:44<32:00, 1.24it/s, loss=0.0063, lr=5.80e-06, step=7626] Training: 76%|███████▋ | 7627/10000 [1:40:44<29:48, 1.33it/s, loss=0.0063, lr=5.80e-06, step=7626] Training: 76%|███████▋ | 7627/10000 [1:40:44<29:48, 1.33it/s, loss=0.0072, lr=5.79e-06, step=7627] Training: 76%|███████▋ | 7628/10000 [1:40:45<31:28, 1.26it/s, loss=0.0072, lr=5.79e-06, step=7627] Training: 76%|███████▋ | 7628/10000 [1:40:45<31:28, 1.26it/s, loss=0.0042, lr=5.79e-06, step=7628] Training: 76%|███████▋ | 7629/10000 [1:40:46<30:13, 1.31it/s, loss=0.0042, lr=5.79e-06, step=7628] Training: 76%|███████▋ | 7629/10000 [1:40:46<30:13, 1.31it/s, loss=0.0031, lr=5.79e-06, step=7629]20:25:18.910 [I] step=7630 loss=0.0141 smoothed_loss=0.0077 lr=5.80e-06 grad_norm=0.4156 step_time=0.6261s data_time=0.1503s it/s=1.288 eta_to_10000=1839.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.0757 grad_arm_token_fuse=0.0325 grad_shared_expert=0.3771 (18633:train_pytorch.py:850) + Training: 76%|███████▋ | 7630/10000 [1:40:47<29:44, 1.33it/s, loss=0.0031, lr=5.79e-06, step=7629] Training: 76%|███████▋ | 7630/10000 [1:40:47<29:44, 1.33it/s, loss=0.0141, lr=5.78e-06, step=7630] Training: 76%|███████▋ | 7631/10000 [1:40:47<28:20, 1.39it/s, loss=0.0141, lr=5.78e-06, step=7630] Training: 76%|███████▋ | 7631/10000 [1:40:47<28:20, 1.39it/s, loss=0.0009, lr=5.78e-06, step=7631] Training: 76%|███████▋ | 7632/10000 [1:40:48<30:23, 1.30it/s, loss=0.0009, lr=5.78e-06, step=7631] Training: 76%|███████▋ | 7632/10000 [1:40:48<30:23, 1.30it/s, loss=0.0049, lr=5.78e-06, step=7632] Training: 76%|███████▋ | 7633/10000 [1:40:49<31:27, 1.25it/s, loss=0.0049, lr=5.78e-06, step=7632] Training: 76%|███████▋ | 7633/10000 [1:40:49<31:27, 1.25it/s, loss=0.0124, lr=5.78e-06, step=7633] Training: 76%|███████▋ | 7634/10000 [1:40:50<33:44, 1.17it/s, loss=0.0124, lr=5.78e-06, step=7633] Training: 76%|███████▋ | 7634/10000 [1:40:50<33:44, 1.17it/s, loss=0.0048, lr=5.77e-06, step=7634] Training: 76%|███████▋ | 7635/10000 [1:40:51<32:52, 1.20it/s, loss=0.0048, lr=5.77e-06, step=7634] Training: 76%|███████▋ | 7635/10000 [1:40:51<32:52, 1.20it/s, loss=0.0245, lr=5.77e-06, step=7635] Training: 76%|███████▋ | 7636/10000 [1:40:52<32:29, 1.21it/s, loss=0.0245, lr=5.77e-06, step=7635] Training: 76%|███████▋ | 7636/10000 [1:40:52<32:29, 1.21it/s, loss=0.0042, lr=5.77e-06, step=7636] Training: 76%|███████▋ | 7637/10000 [1:40:52<33:10, 1.19it/s, loss=0.0042, lr=5.77e-06, step=7636] Training: 76%|███████▋ | 7637/10000 [1:40:52<33:10, 1.19it/s, loss=0.0226, lr=5.77e-06, step=7637] Training: 76%|███████▋ | 7638/10000 [1:40:53<34:16, 1.15it/s, loss=0.0226, lr=5.77e-06, step=7637] Training: 76%|███████▋ | 7638/10000 [1:40:53<34:16, 1.15it/s, loss=0.0021, lr=5.76e-06, step=7638] Training: 76%|███████▋ | 7639/10000 [1:40:54<34:22, 1.14it/s, loss=0.0021, lr=5.76e-06, step=7638] Training: 76%|███████▋ | 7639/10000 [1:40:54<34:22, 1.14it/s, loss=0.0109, lr=5.76e-06, step=7639]20:25:27.605 [I] step=7640 loss=0.0033 smoothed_loss=0.0086 lr=5.77e-06 grad_norm=0.4250 step_time=0.6880s data_time=0.1815s it/s=1.150 eta_to_10000=2051.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0198 grad_action_out_proj_arms=0.1756 grad_arm_token_fuse=0.1094 grad_shared_expert=0.4095 (18633:train_pytorch.py:850) + Training: 76%|███████▋ | 7640/10000 [1:40:55<36:11, 1.09it/s, loss=0.0109, lr=5.76e-06, step=7639] Training: 76%|███████▋ | 7640/10000 [1:40:55<36:11, 1.09it/s, loss=0.0033, lr=5.76e-06, step=7640] Training: 76%|███████▋ | 7641/10000 [1:40:56<35:17, 1.11it/s, loss=0.0033, lr=5.76e-06, step=7640] Training: 76%|███████▋ | 7641/10000 [1:40:56<35:17, 1.11it/s, loss=0.0034, lr=5.76e-06, step=7641] Training: 76%|███████▋ | 7642/10000 [1:40:57<35:11, 1.12it/s, loss=0.0034, lr=5.76e-06, step=7641] Training: 76%|███████▋ | 7642/10000 [1:40:57<35:11, 1.12it/s, loss=0.0046, lr=5.75e-06, step=7642] Training: 76%|███████▋ | 7643/10000 [1:40:58<38:36, 1.02it/s, loss=0.0046, lr=5.75e-06, step=7642] Training: 76%|███████▋ | 7643/10000 [1:40:58<38:36, 1.02it/s, loss=0.0059, lr=5.75e-06, step=7643] Training: 76%|███████▋ | 7644/10000 [1:40:59<38:57, 1.01it/s, loss=0.0059, lr=5.75e-06, step=7643] Training: 76%|███████▋ | 7644/10000 [1:40:59<38:57, 1.01it/s, loss=0.0079, lr=5.75e-06, step=7644] Training: 76%|███████▋ | 7645/10000 [1:41:00<38:52, 1.01it/s, loss=0.0079, lr=5.75e-06, step=7644] Training: 76%|███████▋ | 7645/10000 [1:41:00<38:52, 1.01it/s, loss=0.0021, lr=5.75e-06, step=7645] Training: 76%|███████▋ | 7646/10000 [1:41:01<38:05, 1.03it/s, loss=0.0021, lr=5.75e-06, step=7645] Training: 76%|███████▋ | 7646/10000 [1:41:01<38:05, 1.03it/s, loss=0.0071, lr=5.74e-06, step=7646] Training: 76%|███████▋ | 7647/10000 [1:41:02<35:41, 1.10it/s, loss=0.0071, lr=5.74e-06, step=7646] Training: 76%|███████▋ | 7647/10000 [1:41:02<35:41, 1.10it/s, loss=0.0029, lr=5.74e-06, step=7647] Training: 76%|███████▋ | 7648/10000 [1:41:02<30:30, 1.28it/s, loss=0.0029, lr=5.74e-06, step=7647] Training: 76%|███████▋ | 7648/10000 [1:41:02<30:30, 1.28it/s, loss=0.0012, lr=5.74e-06, step=7648] Training: 76%|███████▋ | 7649/10000 [1:41:03<27:31, 1.42it/s, loss=0.0012, lr=5.74e-06, step=7648] Training: 76%|███████▋ | 7649/10000 [1:41:03<27:31, 1.42it/s, loss=0.0058, lr=5.73e-06, step=7649]20:25:36.114 [I] step=7650 loss=0.0021 smoothed_loss=0.0057 lr=5.74e-06 grad_norm=0.3593 step_time=0.6504s data_time=0.2005s it/s=1.175 eta_to_10000=1999.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0045 grad_action_out_proj_arms=0.0594 grad_arm_token_fuse=0.0251 grad_shared_expert=0.3975 (18633:train_pytorch.py:850) + Training: 76%|███████▋ | 7650/10000 [1:41:04<29:46, 1.32it/s, loss=0.0058, lr=5.73e-06, step=7649] Training: 76%|███████▋ | 7650/10000 [1:41:04<29:46, 1.32it/s, loss=0.0021, lr=5.73e-06, step=7650] Training: 77%|███████▋ | 7651/10000 [1:41:05<30:45, 1.27it/s, loss=0.0021, lr=5.73e-06, step=7650] Training: 77%|███████▋ | 7651/10000 [1:41:05<30:45, 1.27it/s, loss=0.0149, lr=5.73e-06, step=7651] Training: 77%|███████▋ | 7652/10000 [1:41:06<33:09, 1.18it/s, loss=0.0149, lr=5.73e-06, step=7651] Training: 77%|███████▋ | 7652/10000 [1:41:06<33:09, 1.18it/s, loss=0.0157, lr=5.73e-06, step=7652] Training: 77%|███████▋ | 7653/10000 [1:41:07<34:20, 1.14it/s, loss=0.0157, lr=5.73e-06, step=7652] Training: 77%|███████▋ | 7653/10000 [1:41:07<34:20, 1.14it/s, loss=0.0020, lr=5.72e-06, step=7653] Training: 77%|███████▋ | 7654/10000 [1:41:07<34:42, 1.13it/s, loss=0.0020, lr=5.72e-06, step=7653] Training: 77%|███████▋ | 7654/10000 [1:41:07<34:42, 1.13it/s, loss=0.0015, lr=5.72e-06, step=7654] Training: 77%|███████▋ | 7655/10000 [1:41:08<34:28, 1.13it/s, loss=0.0015, lr=5.72e-06, step=7654] Training: 77%|███████▋ | 7655/10000 [1:41:08<34:28, 1.13it/s, loss=0.0171, lr=5.72e-06, step=7655] Training: 77%|███████▋ | 7656/10000 [1:41:09<32:49, 1.19it/s, loss=0.0171, lr=5.72e-06, step=7655] Training: 77%|███████▋ | 7656/10000 [1:41:09<32:49, 1.19it/s, loss=0.0073, lr=5.72e-06, step=7656] Training: 77%|███████▋ | 7657/10000 [1:41:10<35:32, 1.10it/s, loss=0.0073, lr=5.72e-06, step=7656] Training: 77%|███████▋ | 7657/10000 [1:41:10<35:32, 1.10it/s, loss=0.0168, lr=5.71e-06, step=7657] Training: 77%|███████▋ | 7658/10000 [1:41:11<36:14, 1.08it/s, loss=0.0168, lr=5.71e-06, step=7657] Training: 77%|███████▋ | 7658/10000 [1:41:11<36:14, 1.08it/s, loss=0.0050, lr=5.71e-06, step=7658] Training: 77%|███████▋ | 7659/10000 [1:41:12<36:07, 1.08it/s, loss=0.0050, lr=5.71e-06, step=7658] Training: 77%|███████▋ | 7659/10000 [1:41:12<36:07, 1.08it/s, loss=0.0090, lr=5.71e-06, step=7659]20:25:45.369 [I] step=7660 loss=0.0055 smoothed_loss=0.0079 lr=5.72e-06 grad_norm=0.4248 step_time=0.6936s data_time=0.2319s it/s=1.081 eta_to_10000=2165.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.0782 grad_arm_token_fuse=0.0524 grad_shared_expert=0.3407 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7660/10000 [1:41:13<36:47, 1.06it/s, loss=0.0090, lr=5.71e-06, step=7659] Training: 77%|███████▋ | 7660/10000 [1:41:13<36:47, 1.06it/s, loss=0.0055, lr=5.71e-06, step=7660] Training: 77%|███████▋ | 7661/10000 [1:41:14<31:27, 1.24it/s, loss=0.0055, lr=5.71e-06, step=7660] Training: 77%|███████▋ | 7661/10000 [1:41:14<31:27, 1.24it/s, loss=0.0026, lr=5.70e-06, step=7661] Training: 77%|███████▋ | 7662/10000 [1:41:14<29:23, 1.33it/s, loss=0.0026, lr=5.70e-06, step=7661] Training: 77%|███████▋ | 7662/10000 [1:41:14<29:23, 1.33it/s, loss=0.0144, lr=5.70e-06, step=7662] Training: 77%|███████▋ | 7663/10000 [1:41:15<26:03, 1.49it/s, loss=0.0144, lr=5.70e-06, step=7662] Training: 77%|███████▋ | 7663/10000 [1:41:15<26:03, 1.49it/s, loss=0.0033, lr=5.70e-06, step=7663] Training: 77%|███████▋ | 7664/10000 [1:41:15<25:42, 1.51it/s, loss=0.0033, lr=5.70e-06, step=7663] Training: 77%|███████▋ | 7664/10000 [1:41:15<25:42, 1.51it/s, loss=0.0123, lr=5.70e-06, step=7664] Training: 77%|███████▋ | 7665/10000 [1:41:16<26:10, 1.49it/s, loss=0.0123, lr=5.70e-06, step=7664] Training: 77%|███████▋ | 7665/10000 [1:41:16<26:10, 1.49it/s, loss=0.0142, lr=5.69e-06, step=7665] Training: 77%|███████▋ | 7666/10000 [1:41:17<28:15, 1.38it/s, loss=0.0142, lr=5.69e-06, step=7665] Training: 77%|███████▋ | 7666/10000 [1:41:17<28:15, 1.38it/s, loss=0.0041, lr=5.69e-06, step=7666] Training: 77%|███████▋ | 7667/10000 [1:41:17<25:23, 1.53it/s, loss=0.0041, lr=5.69e-06, step=7666] Training: 77%|███████▋ | 7667/10000 [1:41:17<25:23, 1.53it/s, loss=0.0012, lr=5.69e-06, step=7667] Training: 77%|███████▋ | 7668/10000 [1:41:18<27:13, 1.43it/s, loss=0.0012, lr=5.69e-06, step=7667] Training: 77%|███████▋ | 7668/10000 [1:41:18<27:13, 1.43it/s, loss=0.0051, lr=5.69e-06, step=7668] Training: 77%|███████▋ | 7669/10000 [1:41:19<29:28, 1.32it/s, loss=0.0051, lr=5.69e-06, step=7668] Training: 77%|███████▋ | 7669/10000 [1:41:19<29:28, 1.32it/s, loss=0.0035, lr=5.68e-06, step=7669]20:25:51.841 [I] step=7670 loss=0.0139 smoothed_loss=0.0076 lr=5.69e-06 grad_norm=0.4551 step_time=0.5400s data_time=0.1072s it/s=1.546 eta_to_10000=1507.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0115 grad_action_out_proj_arms=0.0818 grad_arm_token_fuse=0.0525 grad_shared_expert=0.3441 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7670/10000 [1:41:20<26:26, 1.47it/s, loss=0.0035, lr=5.68e-06, step=7669] Training: 77%|███████▋ | 7670/10000 [1:41:20<26:26, 1.47it/s, loss=0.0139, lr=5.68e-06, step=7670] Training: 77%|███████▋ | 7671/10000 [1:41:20<25:52, 1.50it/s, loss=0.0139, lr=5.68e-06, step=7670] Training: 77%|███████▋ | 7671/10000 [1:41:20<25:52, 1.50it/s, loss=0.0092, lr=5.68e-06, step=7671] Training: 77%|███████▋ | 7672/10000 [1:41:21<29:07, 1.33it/s, loss=0.0092, lr=5.68e-06, step=7671] Training: 77%|███████▋ | 7672/10000 [1:41:21<29:07, 1.33it/s, loss=0.0042, lr=5.67e-06, step=7672] Training: 77%|███████▋ | 7673/10000 [1:41:22<32:03, 1.21it/s, loss=0.0042, lr=5.67e-06, step=7672] Training: 77%|███████▋ | 7673/10000 [1:41:22<32:03, 1.21it/s, loss=0.0428, lr=5.67e-06, step=7673] Training: 77%|███████▋ | 7674/10000 [1:41:23<30:02, 1.29it/s, loss=0.0428, lr=5.67e-06, step=7673] Training: 77%|███████▋ | 7674/10000 [1:41:23<30:02, 1.29it/s, loss=0.0020, lr=5.67e-06, step=7674] Training: 77%|███████▋ | 7675/10000 [1:41:24<29:58, 1.29it/s, loss=0.0020, lr=5.67e-06, step=7674] Training: 77%|███████▋ | 7675/10000 [1:41:24<29:58, 1.29it/s, loss=0.0324, lr=5.67e-06, step=7675] Training: 77%|███████▋ | 7676/10000 [1:41:25<32:33, 1.19it/s, loss=0.0324, lr=5.67e-06, step=7675] Training: 77%|███████▋ | 7676/10000 [1:41:25<32:33, 1.19it/s, loss=0.0355, lr=5.66e-06, step=7676] Training: 77%|███████▋ | 7677/10000 [1:41:25<34:07, 1.13it/s, loss=0.0355, lr=5.66e-06, step=7676] Training: 77%|███████▋ | 7677/10000 [1:41:25<34:07, 1.13it/s, loss=0.0047, lr=5.66e-06, step=7677] Training: 77%|███████▋ | 7678/10000 [1:41:26<35:16, 1.10it/s, loss=0.0047, lr=5.66e-06, step=7677] Training: 77%|███████▋ | 7678/10000 [1:41:26<35:16, 1.10it/s, loss=0.0170, lr=5.66e-06, step=7678] Training: 77%|███████▋ | 7679/10000 [1:41:27<32:53, 1.18it/s, loss=0.0170, lr=5.66e-06, step=7678] Training: 77%|███████▋ | 7679/10000 [1:41:27<32:53, 1.18it/s, loss=0.0064, lr=5.66e-06, step=7679]20:26:00.224 [I] step=7680 loss=0.0129 smoothed_loss=0.0132 lr=5.67e-06 grad_norm=0.4243 step_time=0.6685s data_time=0.1699s it/s=1.193 eta_to_10000=1944.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0078 grad_action_out_proj_arms=0.0832 grad_arm_token_fuse=0.0423 grad_shared_expert=0.3415 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7680/10000 [1:41:28<31:17, 1.24it/s, loss=0.0064, lr=5.66e-06, step=7679] Training: 77%|███████▋ | 7680/10000 [1:41:28<31:17, 1.24it/s, loss=0.0129, lr=5.65e-06, step=7680] Training: 77%|███████▋ | 7681/10000 [1:41:29<30:39, 1.26it/s, loss=0.0129, lr=5.65e-06, step=7680] Training: 77%|███████▋ | 7681/10000 [1:41:29<30:39, 1.26it/s, loss=0.0067, lr=5.65e-06, step=7681] Training: 77%|███████▋ | 7682/10000 [1:41:30<31:36, 1.22it/s, loss=0.0067, lr=5.65e-06, step=7681] Training: 77%|███████▋ | 7682/10000 [1:41:30<31:36, 1.22it/s, loss=0.0032, lr=5.65e-06, step=7682] Training: 77%|███████▋ | 7683/10000 [1:41:30<31:45, 1.22it/s, loss=0.0032, lr=5.65e-06, step=7682] Training: 77%|███████▋ | 7683/10000 [1:41:30<31:45, 1.22it/s, loss=0.0007, lr=5.65e-06, step=7683] Training: 77%|███████▋ | 7684/10000 [1:41:31<30:06, 1.28it/s, loss=0.0007, lr=5.65e-06, step=7683] Training: 77%|███████▋ | 7684/10000 [1:41:31<30:06, 1.28it/s, loss=0.0103, lr=5.64e-06, step=7684] Training: 77%|███████▋ | 7685/10000 [1:41:32<29:03, 1.33it/s, loss=0.0103, lr=5.64e-06, step=7684] Training: 77%|███████▋ | 7685/10000 [1:41:32<29:03, 1.33it/s, loss=0.0057, lr=5.64e-06, step=7685] Training: 77%|███████▋ | 7686/10000 [1:41:33<32:18, 1.19it/s, loss=0.0057, lr=5.64e-06, step=7685] Training: 77%|███████▋ | 7686/10000 [1:41:33<32:18, 1.19it/s, loss=0.0019, lr=5.64e-06, step=7686] Training: 77%|███████▋ | 7687/10000 [1:41:34<32:46, 1.18it/s, loss=0.0019, lr=5.64e-06, step=7686] Training: 77%|███████▋ | 7687/10000 [1:41:34<32:46, 1.18it/s, loss=0.0129, lr=5.64e-06, step=7687] Training: 77%|███████▋ | 7688/10000 [1:41:34<30:43, 1.25it/s, loss=0.0129, lr=5.64e-06, step=7687] Training: 77%|███████▋ | 7688/10000 [1:41:34<30:43, 1.25it/s, loss=0.0100, lr=5.63e-06, step=7688] Training: 77%|███████▋ | 7689/10000 [1:41:35<31:13, 1.23it/s, loss=0.0100, lr=5.63e-06, step=7688] Training: 77%|███████▋ | 7689/10000 [1:41:35<31:13, 1.23it/s, loss=0.0186, lr=5.63e-06, step=7689]20:26:08.489 [I] step=7690 loss=0.0071 smoothed_loss=0.0102 lr=5.64e-06 grad_norm=0.4304 step_time=0.6388s data_time=0.1876s it/s=1.210 eta_to_10000=1908.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0125 grad_action_out_proj_arms=0.1145 grad_arm_token_fuse=0.0676 grad_shared_expert=0.3931 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7690/10000 [1:41:36<33:22, 1.15it/s, loss=0.0186, lr=5.63e-06, step=7689] Training: 77%|███████▋ | 7690/10000 [1:41:36<33:22, 1.15it/s, loss=0.0071, lr=5.63e-06, step=7690] Training: 77%|███████▋ | 7691/10000 [1:41:37<33:51, 1.14it/s, loss=0.0071, lr=5.63e-06, step=7690] Training: 77%|███████▋ | 7691/10000 [1:41:37<33:51, 1.14it/s, loss=0.0115, lr=5.63e-06, step=7691] Training: 77%|███████▋ | 7692/10000 [1:41:38<34:53, 1.10it/s, loss=0.0115, lr=5.63e-06, step=7691] Training: 77%|███████▋ | 7692/10000 [1:41:38<34:53, 1.10it/s, loss=0.0025, lr=5.62e-06, step=7692] Training: 77%|███████▋ | 7693/10000 [1:41:39<37:39, 1.02it/s, loss=0.0025, lr=5.62e-06, step=7692] Training: 77%|███████▋ | 7693/10000 [1:41:39<37:39, 1.02it/s, loss=0.0081, lr=5.62e-06, step=7693] Training: 77%|███████▋ | 7694/10000 [1:41:40<34:18, 1.12it/s, loss=0.0081, lr=5.62e-06, step=7693] Training: 77%|███████▋ | 7694/10000 [1:41:40<34:18, 1.12it/s, loss=0.0034, lr=5.62e-06, step=7694] Training: 77%|███████▋ | 7695/10000 [1:41:41<32:49, 1.17it/s, loss=0.0034, lr=5.62e-06, step=7694] Training: 77%|███████▋ | 7695/10000 [1:41:41<32:49, 1.17it/s, loss=0.0026, lr=5.62e-06, step=7695] Training: 77%|███████▋ | 7696/10000 [1:41:42<36:10, 1.06it/s, loss=0.0026, lr=5.62e-06, step=7695] Training: 77%|███████▋ | 7696/10000 [1:41:42<36:10, 1.06it/s, loss=0.0134, lr=5.61e-06, step=7696] Training: 77%|███████▋ | 7697/10000 [1:41:42<33:26, 1.15it/s, loss=0.0134, lr=5.61e-06, step=7696] Training: 77%|███████▋ | 7697/10000 [1:41:42<33:26, 1.15it/s, loss=0.0066, lr=5.61e-06, step=7697] Training: 77%|███████▋ | 7698/10000 [1:41:43<32:45, 1.17it/s, loss=0.0066, lr=5.61e-06, step=7697] Training: 77%|███████▋ | 7698/10000 [1:41:43<32:45, 1.17it/s, loss=0.0448, lr=5.61e-06, step=7698] Training: 77%|███████▋ | 7699/10000 [1:41:44<29:26, 1.30it/s, loss=0.0448, lr=5.61e-06, step=7698] Training: 77%|███████▋ | 7699/10000 [1:41:44<29:26, 1.30it/s, loss=0.0018, lr=5.61e-06, step=7699]20:26:16.951 [I] step=7700 loss=0.0136 smoothed_loss=0.0113 lr=5.61e-06 grad_norm=0.4241 step_time=0.6454s data_time=0.2008s it/s=1.182 eta_to_10000=1946.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0050 grad_action_out_proj_arms=0.0644 grad_arm_token_fuse=0.0253 grad_shared_expert=0.2272 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7700/10000 [1:41:45<29:10, 1.31it/s, loss=0.0018, lr=5.61e-06, step=7699] Training: 77%|███████▋ | 7700/10000 [1:41:45<29:10, 1.31it/s, loss=0.0136, lr=5.60e-06, step=7700] Training: 77%|███████▋ | 7701/10000 [1:41:45<29:32, 1.30it/s, loss=0.0136, lr=5.60e-06, step=7700] Training: 77%|███████▋ | 7701/10000 [1:41:45<29:32, 1.30it/s, loss=0.0070, lr=5.60e-06, step=7701] Training: 77%|███████▋ | 7702/10000 [1:41:46<27:50, 1.38it/s, loss=0.0070, lr=5.60e-06, step=7701] Training: 77%|███████▋ | 7702/10000 [1:41:46<27:50, 1.38it/s, loss=0.0048, lr=5.60e-06, step=7702] Training: 77%|███████▋ | 7703/10000 [1:41:47<30:46, 1.24it/s, loss=0.0048, lr=5.60e-06, step=7702] Training: 77%|███████▋ | 7703/10000 [1:41:47<30:46, 1.24it/s, loss=0.0106, lr=5.60e-06, step=7703] Training: 77%|███████▋ | 7704/10000 [1:41:48<27:00, 1.42it/s, loss=0.0106, lr=5.60e-06, step=7703] Training: 77%|███████▋ | 7704/10000 [1:41:48<27:00, 1.42it/s, loss=0.0019, lr=5.59e-06, step=7704] Training: 77%|███████▋ | 7705/10000 [1:41:48<27:30, 1.39it/s, loss=0.0019, lr=5.59e-06, step=7704] Training: 77%|███████▋ | 7705/10000 [1:41:48<27:30, 1.39it/s, loss=0.0152, lr=5.59e-06, step=7705] Training: 77%|███████▋ | 7706/10000 [1:41:49<29:07, 1.31it/s, loss=0.0152, lr=5.59e-06, step=7705] Training: 77%|███████▋ | 7706/10000 [1:41:49<29:07, 1.31it/s, loss=0.0046, lr=5.59e-06, step=7706] Training: 77%|███████▋ | 7707/10000 [1:41:50<33:41, 1.13it/s, loss=0.0046, lr=5.59e-06, step=7706] Training: 77%|███████▋ | 7707/10000 [1:41:50<33:41, 1.13it/s, loss=0.0076, lr=5.58e-06, step=7707] Training: 77%|███████▋ | 7708/10000 [1:41:51<34:42, 1.10it/s, loss=0.0076, lr=5.58e-06, step=7707] Training: 77%|███████▋ | 7708/10000 [1:41:51<34:42, 1.10it/s, loss=0.0082, lr=5.58e-06, step=7708] Training: 77%|███████▋ | 7709/10000 [1:41:52<33:52, 1.13it/s, loss=0.0082, lr=5.58e-06, step=7708] Training: 77%|███████▋ | 7709/10000 [1:41:52<33:52, 1.13it/s, loss=0.0030, lr=5.58e-06, step=7709]20:26:25.111 [I] step=7710 loss=0.0077 smoothed_loss=0.0085 lr=5.59e-06 grad_norm=0.4466 step_time=0.6203s data_time=0.1957s it/s=1.226 eta_to_10000=1868.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0584 grad_arm_token_fuse=0.0264 grad_shared_expert=0.2051 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7710/10000 [1:41:53<31:42, 1.20it/s, loss=0.0030, lr=5.58e-06, step=7709] Training: 77%|███████▋ | 7710/10000 [1:41:53<31:42, 1.20it/s, loss=0.0077, lr=5.58e-06, step=7710] Training: 77%|███████▋ | 7711/10000 [1:41:53<29:58, 1.27it/s, loss=0.0077, lr=5.58e-06, step=7710] Training: 77%|███████▋ | 7711/10000 [1:41:53<29:58, 1.27it/s, loss=0.0104, lr=5.57e-06, step=7711] Training: 77%|███████▋ | 7712/10000 [1:41:54<30:14, 1.26it/s, loss=0.0104, lr=5.57e-06, step=7711] Training: 77%|███████▋ | 7712/10000 [1:41:54<30:14, 1.26it/s, loss=0.0092, lr=5.57e-06, step=7712] Training: 77%|███████▋ | 7713/10000 [1:41:55<29:24, 1.30it/s, loss=0.0092, lr=5.57e-06, step=7712] Training: 77%|███████▋ | 7713/10000 [1:41:55<29:24, 1.30it/s, loss=0.0104, lr=5.57e-06, step=7713] Training: 77%|███████▋ | 7714/10000 [1:41:56<31:48, 1.20it/s, loss=0.0104, lr=5.57e-06, step=7713] Training: 77%|███████▋ | 7714/10000 [1:41:56<31:48, 1.20it/s, loss=0.0211, lr=5.57e-06, step=7714] Training: 77%|███████▋ | 7715/10000 [1:41:57<35:15, 1.08it/s, loss=0.0211, lr=5.57e-06, step=7714] Training: 77%|███████▋ | 7715/10000 [1:41:57<35:15, 1.08it/s, loss=0.0067, lr=5.56e-06, step=7715] Training: 77%|███████▋ | 7716/10000 [1:41:58<33:14, 1.15it/s, loss=0.0067, lr=5.56e-06, step=7715] Training: 77%|███████▋ | 7716/10000 [1:41:58<33:14, 1.15it/s, loss=0.0066, lr=5.56e-06, step=7716] Training: 77%|███████▋ | 7717/10000 [1:41:59<30:49, 1.23it/s, loss=0.0066, lr=5.56e-06, step=7716] Training: 77%|███████▋ | 7717/10000 [1:41:59<30:49, 1.23it/s, loss=0.0036, lr=5.56e-06, step=7717] Training: 77%|███████▋ | 7718/10000 [1:42:00<32:42, 1.16it/s, loss=0.0036, lr=5.56e-06, step=7717] Training: 77%|███████▋ | 7718/10000 [1:42:00<32:42, 1.16it/s, loss=0.0059, lr=5.56e-06, step=7718] Training: 77%|███████▋ | 7719/10000 [1:42:00<28:25, 1.34it/s, loss=0.0059, lr=5.56e-06, step=7718] Training: 77%|███████▋ | 7719/10000 [1:42:00<28:25, 1.34it/s, loss=0.0150, lr=5.55e-06, step=7719]20:26:32.848 [I] step=7720 loss=0.0131 smoothed_loss=0.0096 lr=5.56e-06 grad_norm=0.3919 step_time=0.6065s data_time=0.1672s it/s=1.293 eta_to_10000=1763.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0110 grad_action_out_proj_arms=0.0723 grad_arm_token_fuse=0.0556 grad_shared_expert=0.3421 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7720/10000 [1:42:01<25:55, 1.47it/s, loss=0.0150, lr=5.55e-06, step=7719] Training: 77%|███████▋ | 7720/10000 [1:42:01<25:55, 1.47it/s, loss=0.0131, lr=5.55e-06, step=7720] Training: 77%|███████▋ | 7721/10000 [1:42:01<27:16, 1.39it/s, loss=0.0131, lr=5.55e-06, step=7720] Training: 77%|███████▋ | 7721/10000 [1:42:01<27:16, 1.39it/s, loss=0.0122, lr=5.55e-06, step=7721] Training: 77%|███████▋ | 7722/10000 [1:42:02<30:40, 1.24it/s, loss=0.0122, lr=5.55e-06, step=7721] Training: 77%|███████▋ | 7722/10000 [1:42:02<30:40, 1.24it/s, loss=0.0133, lr=5.55e-06, step=7722] Training: 77%|███████▋ | 7723/10000 [1:42:03<31:19, 1.21it/s, loss=0.0133, lr=5.55e-06, step=7722] Training: 77%|███████▋ | 7723/10000 [1:42:03<31:19, 1.21it/s, loss=0.0323, lr=5.54e-06, step=7723] Training: 77%|███████▋ | 7724/10000 [1:42:04<27:42, 1.37it/s, loss=0.0323, lr=5.54e-06, step=7723] Training: 77%|███████▋ | 7724/10000 [1:42:04<27:42, 1.37it/s, loss=0.0017, lr=5.54e-06, step=7724] Training: 77%|███████▋ | 7725/10000 [1:42:04<26:50, 1.41it/s, loss=0.0017, lr=5.54e-06, step=7724] Training: 77%|███████▋ | 7725/10000 [1:42:04<26:50, 1.41it/s, loss=0.0230, lr=5.54e-06, step=7725] Training: 77%|███████▋ | 7726/10000 [1:42:05<24:16, 1.56it/s, loss=0.0230, lr=5.54e-06, step=7725] Training: 77%|███████▋ | 7726/10000 [1:42:05<24:16, 1.56it/s, loss=0.0154, lr=5.54e-06, step=7726] Training: 77%|███████▋ | 7727/10000 [1:42:06<26:06, 1.45it/s, loss=0.0154, lr=5.54e-06, step=7726] Training: 77%|███████▋ | 7727/10000 [1:42:06<26:06, 1.45it/s, loss=0.0041, lr=5.53e-06, step=7727] Training: 77%|███████▋ | 7728/10000 [1:42:07<29:32, 1.28it/s, loss=0.0041, lr=5.53e-06, step=7727] Training: 77%|███████▋ | 7728/10000 [1:42:07<29:32, 1.28it/s, loss=0.0248, lr=5.53e-06, step=7728] Training: 77%|███████▋ | 7729/10000 [1:42:08<33:40, 1.12it/s, loss=0.0248, lr=5.53e-06, step=7728] Training: 77%|███████▋ | 7729/10000 [1:42:08<33:40, 1.12it/s, loss=0.0102, lr=5.53e-06, step=7729]20:26:40.651 [I] step=7730 loss=0.0089 smoothed_loss=0.0125 lr=5.54e-06 grad_norm=0.4142 step_time=0.6223s data_time=0.1579s it/s=1.282 eta_to_10000=1771.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0087 grad_action_out_proj_arms=0.1004 grad_arm_token_fuse=0.0423 grad_shared_expert=0.4344 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7730/10000 [1:42:08<29:34, 1.28it/s, loss=0.0102, lr=5.53e-06, step=7729] Training: 77%|███████▋ | 7730/10000 [1:42:08<29:34, 1.28it/s, loss=0.0089, lr=5.53e-06, step=7730] Training: 77%|███████▋ | 7731/10000 [1:42:09<26:12, 1.44it/s, loss=0.0089, lr=5.53e-06, step=7730] Training: 77%|███████▋ | 7731/10000 [1:42:09<26:12, 1.44it/s, loss=0.0022, lr=5.52e-06, step=7731] Training: 77%|███████▋ | 7732/10000 [1:42:10<29:04, 1.30it/s, loss=0.0022, lr=5.52e-06, step=7731] Training: 77%|███████▋ | 7732/10000 [1:42:10<29:04, 1.30it/s, loss=0.0029, lr=5.52e-06, step=7732] Training: 77%|███████▋ | 7733/10000 [1:42:11<29:53, 1.26it/s, loss=0.0029, lr=5.52e-06, step=7732] Training: 77%|███████▋ | 7733/10000 [1:42:11<29:53, 1.26it/s, loss=0.0021, lr=5.52e-06, step=7733] Training: 77%|███████▋ | 7734/10000 [1:42:11<28:09, 1.34it/s, loss=0.0021, lr=5.52e-06, step=7733] Training: 77%|███████▋ | 7734/10000 [1:42:11<28:09, 1.34it/s, loss=0.0110, lr=5.52e-06, step=7734] Training: 77%|███████▋ | 7735/10000 [1:42:12<25:36, 1.47it/s, loss=0.0110, lr=5.52e-06, step=7734] Training: 77%|███████▋ | 7735/10000 [1:42:12<25:36, 1.47it/s, loss=0.0200, lr=5.51e-06, step=7735] Training: 77%|███████▋ | 7736/10000 [1:42:13<30:01, 1.26it/s, loss=0.0200, lr=5.51e-06, step=7735] Training: 77%|███████▋ | 7736/10000 [1:42:13<30:01, 1.26it/s, loss=0.0123, lr=5.51e-06, step=7736] Training: 77%|███████▋ | 7737/10000 [1:42:14<32:04, 1.18it/s, loss=0.0123, lr=5.51e-06, step=7736] Training: 77%|███████▋ | 7737/10000 [1:42:14<32:04, 1.18it/s, loss=0.0021, lr=5.51e-06, step=7737] Training: 77%|███████▋ | 7738/10000 [1:42:15<31:33, 1.19it/s, loss=0.0021, lr=5.51e-06, step=7737] Training: 77%|███████▋ | 7738/10000 [1:42:15<31:33, 1.19it/s, loss=0.0146, lr=5.51e-06, step=7738] Training: 77%|███████▋ | 7739/10000 [1:42:16<32:52, 1.15it/s, loss=0.0146, lr=5.51e-06, step=7738] Training: 77%|███████▋ | 7739/10000 [1:42:16<32:52, 1.15it/s, loss=0.0039, lr=5.50e-06, step=7739]20:26:48.799 [I] step=7740 loss=0.0034 smoothed_loss=0.0093 lr=5.51e-06 grad_norm=0.3496 step_time=0.6353s data_time=0.1795s it/s=1.227 eta_to_10000=1841.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0063 grad_action_out_proj_arms=0.0765 grad_arm_token_fuse=0.0333 grad_shared_expert=0.3235 (18633:train_pytorch.py:850) + Training: 77%|███████▋ | 7740/10000 [1:42:16<33:12, 1.13it/s, loss=0.0039, lr=5.50e-06, step=7739] Training: 77%|███████▋ | 7740/10000 [1:42:16<33:12, 1.13it/s, loss=0.0034, lr=5.50e-06, step=7740] Training: 77%|███████▋ | 7741/10000 [1:42:17<33:23, 1.13it/s, loss=0.0034, lr=5.50e-06, step=7740] Training: 77%|███████▋ | 7741/10000 [1:42:17<33:23, 1.13it/s, loss=0.0032, lr=5.50e-06, step=7741] Training: 77%|███████▋ | 7742/10000 [1:42:18<30:35, 1.23it/s, loss=0.0032, lr=5.50e-06, step=7741] Training: 77%|███████▋ | 7742/10000 [1:42:18<30:35, 1.23it/s, loss=0.0030, lr=5.50e-06, step=7742] Training: 77%|███████▋ | 7743/10000 [1:42:19<33:40, 1.12it/s, loss=0.0030, lr=5.50e-06, step=7742] Training: 77%|███████▋ | 7743/10000 [1:42:19<33:40, 1.12it/s, loss=0.0014, lr=5.49e-06, step=7743] Training: 77%|███████▋ | 7744/10000 [1:42:20<32:26, 1.16it/s, loss=0.0014, lr=5.49e-06, step=7743] Training: 77%|███████▋ | 7744/10000 [1:42:20<32:26, 1.16it/s, loss=0.0123, lr=5.49e-06, step=7744] Training: 77%|███████▋ | 7745/10000 [1:42:21<32:42, 1.15it/s, loss=0.0123, lr=5.49e-06, step=7744] Training: 77%|███████▋ | 7745/10000 [1:42:21<32:42, 1.15it/s, loss=0.0020, lr=5.49e-06, step=7745] Training: 77%|███████▋ | 7746/10000 [1:42:22<35:14, 1.07it/s, loss=0.0020, lr=5.49e-06, step=7745] Training: 77%|███████▋ | 7746/10000 [1:42:22<35:14, 1.07it/s, loss=0.0086, lr=5.49e-06, step=7746] Training: 77%|███████▋ | 7747/10000 [1:42:22<30:59, 1.21it/s, loss=0.0086, lr=5.49e-06, step=7746] Training: 77%|███████▋ | 7747/10000 [1:42:22<30:59, 1.21it/s, loss=0.0010, lr=5.48e-06, step=7747] Training: 77%|███████▋ | 7748/10000 [1:42:23<27:41, 1.36it/s, loss=0.0010, lr=5.48e-06, step=7747] Training: 77%|███████▋ | 7748/10000 [1:42:23<27:41, 1.36it/s, loss=0.0247, lr=5.48e-06, step=7748] Training: 77%|███████▋ | 7749/10000 [1:42:24<26:48, 1.40it/s, loss=0.0247, lr=5.48e-06, step=7748] Training: 77%|███████▋ | 7749/10000 [1:42:24<26:48, 1.40it/s, loss=0.0067, lr=5.48e-06, step=7749]20:26:57.036 [I] step=7750 loss=0.0055 smoothed_loss=0.0081 lr=5.49e-06 grad_norm=0.4049 step_time=0.6338s data_time=0.1899s it/s=1.214 eta_to_10000=1853.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0061 grad_action_out_proj_arms=0.0699 grad_arm_token_fuse=0.0329 grad_shared_expert=0.2103 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7750/10000 [1:42:25<30:55, 1.21it/s, loss=0.0067, lr=5.48e-06, step=7749] Training: 78%|███████▊ | 7750/10000 [1:42:25<30:55, 1.21it/s, loss=0.0055, lr=5.48e-06, step=7750] Training: 78%|███████▊ | 7751/10000 [1:42:25<27:15, 1.37it/s, loss=0.0055, lr=5.48e-06, step=7750] Training: 78%|███████▊ | 7751/10000 [1:42:25<27:15, 1.37it/s, loss=0.0036, lr=5.47e-06, step=7751] Training: 78%|███████▊ | 7752/10000 [1:42:26<24:35, 1.52it/s, loss=0.0036, lr=5.47e-06, step=7751] Training: 78%|███████▊ | 7752/10000 [1:42:26<24:35, 1.52it/s, loss=0.0233, lr=5.47e-06, step=7752] Training: 78%|███████▊ | 7753/10000 [1:42:27<28:50, 1.30it/s, loss=0.0233, lr=5.47e-06, step=7752] Training: 78%|███████▊ | 7753/10000 [1:42:27<28:50, 1.30it/s, loss=0.0135, lr=5.47e-06, step=7753] Training: 78%|███████▊ | 7754/10000 [1:42:28<29:14, 1.28it/s, loss=0.0135, lr=5.47e-06, step=7753] Training: 78%|███████▊ | 7754/10000 [1:42:28<29:14, 1.28it/s, loss=0.0070, lr=5.47e-06, step=7754] Training: 78%|███████▊ | 7755/10000 [1:42:28<30:04, 1.24it/s, loss=0.0070, lr=5.47e-06, step=7754] Training: 78%|███████▊ | 7755/10000 [1:42:28<30:04, 1.24it/s, loss=0.0022, lr=5.46e-06, step=7755] Training: 78%|███████▊ | 7756/10000 [1:42:29<29:20, 1.27it/s, loss=0.0022, lr=5.46e-06, step=7755] Training: 78%|███████▊ | 7756/10000 [1:42:29<29:20, 1.27it/s, loss=0.0019, lr=5.46e-06, step=7756] Training: 78%|███████▊ | 7757/10000 [1:42:30<30:14, 1.24it/s, loss=0.0019, lr=5.46e-06, step=7756] Training: 78%|███████▊ | 7757/10000 [1:42:30<30:14, 1.24it/s, loss=0.0096, lr=5.46e-06, step=7757] Training: 78%|███████▊ | 7758/10000 [1:42:31<30:55, 1.21it/s, loss=0.0096, lr=5.46e-06, step=7757] Training: 78%|███████▊ | 7758/10000 [1:42:31<30:55, 1.21it/s, loss=0.0387, lr=5.46e-06, step=7758] Training: 78%|███████▊ | 7759/10000 [1:42:32<31:12, 1.20it/s, loss=0.0387, lr=5.46e-06, step=7758] Training: 78%|███████▊ | 7759/10000 [1:42:32<31:12, 1.20it/s, loss=0.0071, lr=5.45e-06, step=7759]20:27:05.091 [I] step=7760 loss=0.0169 smoothed_loss=0.0114 lr=5.46e-06 grad_norm=0.4765 step_time=0.6283s data_time=0.1773s it/s=1.241 eta_to_10000=1804.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0280 grad_action_out_proj_arms=0.1291 grad_arm_token_fuse=0.1416 grad_shared_expert=0.8995 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7760/10000 [1:42:33<33:26, 1.12it/s, loss=0.0071, lr=5.45e-06, step=7759] Training: 78%|███████▊ | 7760/10000 [1:42:33<33:26, 1.12it/s, loss=0.0169, lr=5.45e-06, step=7760] Training: 78%|███████▊ | 7761/10000 [1:42:34<31:51, 1.17it/s, loss=0.0169, lr=5.45e-06, step=7760] Training: 78%|███████▊ | 7761/10000 [1:42:34<31:51, 1.17it/s, loss=0.0018, lr=5.45e-06, step=7761] Training: 78%|███████▊ | 7762/10000 [1:42:34<27:45, 1.34it/s, loss=0.0018, lr=5.45e-06, step=7761] Training: 78%|███████▊ | 7762/10000 [1:42:34<27:45, 1.34it/s, loss=0.0056, lr=5.45e-06, step=7762] Training: 78%|███████▊ | 7763/10000 [1:42:35<27:48, 1.34it/s, loss=0.0056, lr=5.45e-06, step=7762] Training: 78%|███████▊ | 7763/10000 [1:42:35<27:48, 1.34it/s, loss=0.0036, lr=5.44e-06, step=7763] Training: 78%|███████▊ | 7764/10000 [1:42:35<26:41, 1.40it/s, loss=0.0036, lr=5.44e-06, step=7763] Training: 78%|███████▊ | 7764/10000 [1:42:35<26:41, 1.40it/s, loss=0.0268, lr=5.44e-06, step=7764] Training: 78%|███████▊ | 7765/10000 [1:42:37<32:24, 1.15it/s, loss=0.0268, lr=5.44e-06, step=7764] Training: 78%|███████▊ | 7765/10000 [1:42:37<32:24, 1.15it/s, loss=0.0035, lr=5.44e-06, step=7765] Training: 78%|███████▊ | 7766/10000 [1:42:38<35:17, 1.06it/s, loss=0.0035, lr=5.44e-06, step=7765] Training: 78%|███████▊ | 7766/10000 [1:42:38<35:17, 1.06it/s, loss=0.0063, lr=5.44e-06, step=7766] Training: 78%|███████▊ | 7767/10000 [1:42:39<34:11, 1.09it/s, loss=0.0063, lr=5.44e-06, step=7766] Training: 78%|███████▊ | 7767/10000 [1:42:39<34:11, 1.09it/s, loss=0.0068, lr=5.43e-06, step=7767] Training: 78%|███████▊ | 7768/10000 [1:42:40<35:25, 1.05it/s, loss=0.0068, lr=5.43e-06, step=7767] Training: 78%|███████▊ | 7768/10000 [1:42:40<35:25, 1.05it/s, loss=0.0114, lr=5.43e-06, step=7768] Training: 78%|███████▊ | 7769/10000 [1:42:41<35:03, 1.06it/s, loss=0.0114, lr=5.43e-06, step=7768] Training: 78%|███████▊ | 7769/10000 [1:42:41<35:03, 1.06it/s, loss=0.0058, lr=5.43e-06, step=7769]20:27:13.882 [I] step=7770 loss=0.0040 smoothed_loss=0.0088 lr=5.44e-06 grad_norm=0.3585 step_time=0.6794s data_time=0.1997s it/s=1.138 eta_to_10000=1960.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0688 grad_arm_token_fuse=0.0295 grad_shared_expert=0.2648 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7770/10000 [1:42:42<35:32, 1.05it/s, loss=0.0058, lr=5.43e-06, step=7769] Training: 78%|███████▊ | 7770/10000 [1:42:42<35:32, 1.05it/s, loss=0.0040, lr=5.43e-06, step=7770] Training: 78%|███████▊ | 7771/10000 [1:42:43<36:44, 1.01it/s, loss=0.0040, lr=5.43e-06, step=7770] Training: 78%|███████▊ | 7771/10000 [1:42:43<36:44, 1.01it/s, loss=0.0033, lr=5.42e-06, step=7771] Training: 78%|███████▊ | 7772/10000 [1:42:44<40:40, 1.10s/it, loss=0.0033, lr=5.42e-06, step=7771] Training: 78%|███████▊ | 7772/10000 [1:42:44<40:40, 1.10s/it, loss=0.0114, lr=5.42e-06, step=7772] Training: 78%|███████▊ | 7773/10000 [1:42:45<40:37, 1.09s/it, loss=0.0114, lr=5.42e-06, step=7772] Training: 78%|███████▊ | 7773/10000 [1:42:45<40:37, 1.09s/it, loss=0.0487, lr=5.42e-06, step=7773] Training: 78%|███████▊ | 7774/10000 [1:42:46<38:48, 1.05s/it, loss=0.0487, lr=5.42e-06, step=7773] Training: 78%|███████▊ | 7774/10000 [1:42:46<38:48, 1.05s/it, loss=0.0036, lr=5.42e-06, step=7774] Training: 78%|███████▊ | 7775/10000 [1:42:47<38:22, 1.03s/it, loss=0.0036, lr=5.42e-06, step=7774] Training: 78%|███████▊ | 7775/10000 [1:42:47<38:22, 1.03s/it, loss=0.0010, lr=5.41e-06, step=7775] Training: 78%|███████▊ | 7776/10000 [1:42:48<39:05, 1.05s/it, loss=0.0010, lr=5.41e-06, step=7775] Training: 78%|███████▊ | 7776/10000 [1:42:48<39:05, 1.05s/it, loss=0.0061, lr=5.41e-06, step=7776] Training: 78%|███████▊ | 7777/10000 [1:42:49<38:15, 1.03s/it, loss=0.0061, lr=5.41e-06, step=7776] Training: 78%|███████▊ | 7777/10000 [1:42:49<38:15, 1.03s/it, loss=0.0079, lr=5.41e-06, step=7777] Training: 78%|███████▊ | 7778/10000 [1:42:50<35:38, 1.04it/s, loss=0.0079, lr=5.41e-06, step=7777] Training: 78%|███████▊ | 7778/10000 [1:42:50<35:38, 1.04it/s, loss=0.0019, lr=5.41e-06, step=7778] Training: 78%|███████▊ | 7779/10000 [1:42:51<36:17, 1.02it/s, loss=0.0019, lr=5.41e-06, step=7778] Training: 78%|███████▊ | 7779/10000 [1:42:51<36:17, 1.02it/s, loss=0.0043, lr=5.40e-06, step=7779]20:27:23.951 [I] step=7780 loss=0.0076 smoothed_loss=0.0086 lr=5.41e-06 grad_norm=0.4667 step_time=0.7647s data_time=0.2423s it/s=0.993 eta_to_10000=2235.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0029 grad_action_out_proj_arms=0.0509 grad_arm_token_fuse=0.0155 grad_shared_expert=0.2157 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7780/10000 [1:42:52<33:25, 1.11it/s, loss=0.0043, lr=5.40e-06, step=7779] Training: 78%|███████▊ | 7780/10000 [1:42:52<33:25, 1.11it/s, loss=0.0076, lr=5.40e-06, step=7780] Training: 78%|███████▊ | 7781/10000 [1:42:52<28:43, 1.29it/s, loss=0.0076, lr=5.40e-06, step=7780] Training: 78%|███████▊ | 7781/10000 [1:42:52<28:43, 1.29it/s, loss=0.0028, lr=5.40e-06, step=7781] Training: 78%|███████▊ | 7782/10000 [1:42:53<25:32, 1.45it/s, loss=0.0028, lr=5.40e-06, step=7781] Training: 78%|███████▊ | 7782/10000 [1:42:53<25:32, 1.45it/s, loss=0.0039, lr=5.40e-06, step=7782] Training: 78%|███████▊ | 7783/10000 [1:42:54<29:27, 1.25it/s, loss=0.0039, lr=5.40e-06, step=7782] Training: 78%|███████▊ | 7783/10000 [1:42:54<29:27, 1.25it/s, loss=0.0007, lr=5.39e-06, step=7783] Training: 78%|███████▊ | 7784/10000 [1:42:55<30:37, 1.21it/s, loss=0.0007, lr=5.39e-06, step=7783] Training: 78%|███████▊ | 7784/10000 [1:42:55<30:37, 1.21it/s, loss=0.0014, lr=5.39e-06, step=7784] Training: 78%|███████▊ | 7785/10000 [1:42:55<28:39, 1.29it/s, loss=0.0014, lr=5.39e-06, step=7784] Training: 78%|███████▊ | 7785/10000 [1:42:55<28:39, 1.29it/s, loss=0.0026, lr=5.39e-06, step=7785] Training: 78%|███████▊ | 7786/10000 [1:42:56<33:27, 1.10it/s, loss=0.0026, lr=5.39e-06, step=7785] Training: 78%|███████▊ | 7786/10000 [1:42:56<33:27, 1.10it/s, loss=0.0032, lr=5.39e-06, step=7786] Training: 78%|███████▊ | 7787/10000 [1:42:57<32:07, 1.15it/s, loss=0.0032, lr=5.39e-06, step=7786] Training: 78%|███████▊ | 7787/10000 [1:42:57<32:07, 1.15it/s, loss=0.0026, lr=5.38e-06, step=7787] Training: 78%|███████▊ | 7788/10000 [1:42:58<28:39, 1.29it/s, loss=0.0026, lr=5.38e-06, step=7787] Training: 78%|███████▊ | 7788/10000 [1:42:58<28:39, 1.29it/s, loss=0.0238, lr=5.38e-06, step=7788] Training: 78%|███████▊ | 7789/10000 [1:42:59<29:47, 1.24it/s, loss=0.0238, lr=5.38e-06, step=7788] Training: 78%|███████▊ | 7789/10000 [1:42:59<29:47, 1.24it/s, loss=0.0028, lr=5.38e-06, step=7789]20:27:31.483 [I] step=7790 loss=0.0082 smoothed_loss=0.0069 lr=5.39e-06 grad_norm=0.4828 step_time=0.5863s data_time=0.1668s it/s=1.328 eta_to_10000=1664.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0256 grad_action_out_proj_arms=0.1607 grad_arm_token_fuse=0.1252 grad_shared_expert=0.5766 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7790/10000 [1:42:59<26:34, 1.39it/s, loss=0.0028, lr=5.38e-06, step=7789] Training: 78%|███████▊ | 7790/10000 [1:42:59<26:34, 1.39it/s, loss=0.0082, lr=5.38e-06, step=7790] Training: 78%|███████▊ | 7791/10000 [1:43:00<26:22, 1.40it/s, loss=0.0082, lr=5.38e-06, step=7790] Training: 78%|███████▊ | 7791/10000 [1:43:00<26:22, 1.40it/s, loss=0.0138, lr=5.37e-06, step=7791] Training: 78%|███████▊ | 7792/10000 [1:43:01<25:45, 1.43it/s, loss=0.0138, lr=5.37e-06, step=7791] Training: 78%|███████▊ | 7792/10000 [1:43:01<25:45, 1.43it/s, loss=0.0037, lr=5.37e-06, step=7792] Training: 78%|███████▊ | 7793/10000 [1:43:02<29:14, 1.26it/s, loss=0.0037, lr=5.37e-06, step=7792] Training: 78%|███████▊ | 7793/10000 [1:43:02<29:14, 1.26it/s, loss=0.0024, lr=5.37e-06, step=7793] Training: 78%|███████▊ | 7794/10000 [1:43:03<31:50, 1.15it/s, loss=0.0024, lr=5.37e-06, step=7793] Training: 78%|███████▊ | 7794/10000 [1:43:03<31:50, 1.15it/s, loss=0.0097, lr=5.37e-06, step=7794] Training: 78%|███████▊ | 7795/10000 [1:43:04<33:32, 1.10it/s, loss=0.0097, lr=5.37e-06, step=7794] Training: 78%|███████▊ | 7795/10000 [1:43:04<33:32, 1.10it/s, loss=0.0023, lr=5.36e-06, step=7795] Training: 78%|███████▊ | 7796/10000 [1:43:04<32:28, 1.13it/s, loss=0.0023, lr=5.36e-06, step=7795] Training: 78%|███████▊ | 7796/10000 [1:43:04<32:28, 1.13it/s, loss=0.0085, lr=5.36e-06, step=7796] Training: 78%|███████▊ | 7797/10000 [1:43:05<33:48, 1.09it/s, loss=0.0085, lr=5.36e-06, step=7796] Training: 78%|███████▊ | 7797/10000 [1:43:05<33:48, 1.09it/s, loss=0.0096, lr=5.36e-06, step=7797] Training: 78%|███████▊ | 7798/10000 [1:43:06<35:21, 1.04it/s, loss=0.0096, lr=5.36e-06, step=7797] Training: 78%|███████▊ | 7798/10000 [1:43:06<35:21, 1.04it/s, loss=0.0049, lr=5.36e-06, step=7798] Training: 78%|███████▊ | 7799/10000 [1:43:08<36:06, 1.02it/s, loss=0.0049, lr=5.36e-06, step=7798] Training: 78%|███████▊ | 7799/10000 [1:43:08<36:06, 1.02it/s, loss=0.0059, lr=5.35e-06, step=7799]20:27:41.106 [I] step=7800 loss=0.0056 smoothed_loss=0.0066 lr=5.36e-06 grad_norm=0.3537 step_time=0.7004s data_time=0.2619s it/s=1.039 eta_to_10000=2116.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0046 grad_action_out_proj_arms=0.0529 grad_arm_token_fuse=0.0222 grad_shared_expert=0.2623 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7800/10000 [1:43:09<39:12, 1.07s/it, loss=0.0059, lr=5.35e-06, step=7799] Training: 78%|███████▊ | 7800/10000 [1:43:09<39:12, 1.07s/it, loss=0.0056, lr=5.35e-06, step=7800] Training: 78%|███████▊ | 7801/10000 [1:43:10<37:30, 1.02s/it, loss=0.0056, lr=5.35e-06, step=7800] Training: 78%|███████▊ | 7801/10000 [1:43:10<37:30, 1.02s/it, loss=0.0034, lr=5.35e-06, step=7801] Training: 78%|███████▊ | 7802/10000 [1:43:10<34:52, 1.05it/s, loss=0.0034, lr=5.35e-06, step=7801] Training: 78%|███████▊ | 7802/10000 [1:43:10<34:52, 1.05it/s, loss=0.0187, lr=5.35e-06, step=7802] Training: 78%|███████▊ | 7803/10000 [1:43:11<31:39, 1.16it/s, loss=0.0187, lr=5.35e-06, step=7802] Training: 78%|███████▊ | 7803/10000 [1:43:11<31:39, 1.16it/s, loss=0.0104, lr=5.34e-06, step=7803] Training: 78%|███████▊ | 7804/10000 [1:43:12<28:17, 1.29it/s, loss=0.0104, lr=5.34e-06, step=7803] Training: 78%|███████▊ | 7804/10000 [1:43:12<28:17, 1.29it/s, loss=0.0889, lr=5.34e-06, step=7804] Training: 78%|███████▊ | 7805/10000 [1:43:12<27:08, 1.35it/s, loss=0.0889, lr=5.34e-06, step=7804] Training: 78%|███████▊ | 7805/10000 [1:43:12<27:08, 1.35it/s, loss=0.0088, lr=5.34e-06, step=7805] Training: 78%|███████▊ | 7806/10000 [1:43:13<27:58, 1.31it/s, loss=0.0088, lr=5.34e-06, step=7805] Training: 78%|███████▊ | 7806/10000 [1:43:13<27:58, 1.31it/s, loss=0.0022, lr=5.34e-06, step=7806] Training: 78%|███████▊ | 7807/10000 [1:43:14<30:29, 1.20it/s, loss=0.0022, lr=5.34e-06, step=7806] Training: 78%|███████▊ | 7807/10000 [1:43:14<30:29, 1.20it/s, loss=0.0065, lr=5.33e-06, step=7807] Training: 78%|███████▊ | 7808/10000 [1:43:15<31:26, 1.16it/s, loss=0.0065, lr=5.33e-06, step=7807] Training: 78%|███████▊ | 7808/10000 [1:43:15<31:26, 1.16it/s, loss=0.0195, lr=5.33e-06, step=7808] Training: 78%|███████▊ | 7809/10000 [1:43:16<27:47, 1.31it/s, loss=0.0195, lr=5.33e-06, step=7808] Training: 78%|███████▊ | 7809/10000 [1:43:16<27:47, 1.31it/s, loss=0.0050, lr=5.33e-06, step=7809]20:27:49.082 [I] step=7810 loss=0.0145 smoothed_loss=0.0131 lr=5.34e-06 grad_norm=0.4647 step_time=0.5981s data_time=0.1995s it/s=1.254 eta_to_10000=1746.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0168 grad_action_out_proj_arms=0.1091 grad_arm_token_fuse=0.0866 grad_shared_expert=0.4066 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7810/10000 [1:43:17<31:41, 1.15it/s, loss=0.0050, lr=5.33e-06, step=7809] Training: 78%|███████▊ | 7810/10000 [1:43:17<31:41, 1.15it/s, loss=0.0145, lr=5.33e-06, step=7810] Training: 78%|███████▊ | 7811/10000 [1:43:18<34:16, 1.06it/s, loss=0.0145, lr=5.33e-06, step=7810] Training: 78%|███████▊ | 7811/10000 [1:43:18<34:16, 1.06it/s, loss=0.0106, lr=5.32e-06, step=7811] Training: 78%|███████▊ | 7812/10000 [1:43:19<37:12, 1.02s/it, loss=0.0106, lr=5.32e-06, step=7811] Training: 78%|███████▊ | 7812/10000 [1:43:19<37:12, 1.02s/it, loss=0.0067, lr=5.32e-06, step=7812] Training: 78%|███████▊ | 7813/10000 [1:43:20<37:45, 1.04s/it, loss=0.0067, lr=5.32e-06, step=7812] Training: 78%|███████▊ | 7813/10000 [1:43:20<37:45, 1.04s/it, loss=0.0035, lr=5.32e-06, step=7813] Training: 78%|███████▊ | 7814/10000 [1:43:21<35:34, 1.02it/s, loss=0.0035, lr=5.32e-06, step=7813] Training: 78%|███████▊ | 7814/10000 [1:43:21<35:34, 1.02it/s, loss=0.0049, lr=5.32e-06, step=7814] Training: 78%|███████▊ | 7815/10000 [1:43:22<33:14, 1.10it/s, loss=0.0049, lr=5.32e-06, step=7814] Training: 78%|███████▊ | 7815/10000 [1:43:22<33:14, 1.10it/s, loss=0.0029, lr=5.31e-06, step=7815] Training: 78%|███████▊ | 7816/10000 [1:43:23<33:00, 1.10it/s, loss=0.0029, lr=5.31e-06, step=7815] Training: 78%|███████▊ | 7816/10000 [1:43:23<33:00, 1.10it/s, loss=0.0103, lr=5.31e-06, step=7816] Training: 78%|███████▊ | 7817/10000 [1:43:23<29:12, 1.25it/s, loss=0.0103, lr=5.31e-06, step=7816] Training: 78%|███████▊ | 7817/10000 [1:43:23<29:12, 1.25it/s, loss=0.0073, lr=5.31e-06, step=7817] Training: 78%|███████▊ | 7818/10000 [1:43:24<28:10, 1.29it/s, loss=0.0073, lr=5.31e-06, step=7817] Training: 78%|███████▊ | 7818/10000 [1:43:24<28:10, 1.29it/s, loss=0.0173, lr=5.31e-06, step=7818] Training: 78%|███████▊ | 7819/10000 [1:43:25<30:41, 1.18it/s, loss=0.0173, lr=5.31e-06, step=7818] Training: 78%|███████▊ | 7819/10000 [1:43:25<30:41, 1.18it/s, loss=0.0411, lr=5.30e-06, step=7819]20:27:58.326 [I] step=7820 loss=0.0027 smoothed_loss=0.0124 lr=5.31e-06 grad_norm=0.3984 step_time=0.6907s data_time=0.2337s it/s=1.082 eta_to_10000=2015.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0038 grad_action_out_proj_arms=0.0455 grad_arm_token_fuse=0.0205 grad_shared_expert=0.1912 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7820/10000 [1:43:26<33:18, 1.09it/s, loss=0.0411, lr=5.30e-06, step=7819] Training: 78%|███████▊ | 7820/10000 [1:43:26<33:18, 1.09it/s, loss=0.0027, lr=5.30e-06, step=7820] Training: 78%|███████▊ | 7821/10000 [1:43:27<37:29, 1.03s/it, loss=0.0027, lr=5.30e-06, step=7820] Training: 78%|███████▊ | 7821/10000 [1:43:27<37:29, 1.03s/it, loss=0.0094, lr=5.30e-06, step=7821] Training: 78%|███████▊ | 7822/10000 [1:43:29<40:09, 1.11s/it, loss=0.0094, lr=5.30e-06, step=7821] Training: 78%|███████▊ | 7822/10000 [1:43:29<40:09, 1.11s/it, loss=0.0038, lr=5.30e-06, step=7822] Training: 78%|███████▊ | 7823/10000 [1:43:29<33:53, 1.07it/s, loss=0.0038, lr=5.30e-06, step=7822] Training: 78%|███████▊ | 7823/10000 [1:43:29<33:53, 1.07it/s, loss=0.0043, lr=5.29e-06, step=7823] Training: 78%|███████▊ | 7824/10000 [1:43:30<28:55, 1.25it/s, loss=0.0043, lr=5.29e-06, step=7823] Training: 78%|███████▊ | 7824/10000 [1:43:30<28:55, 1.25it/s, loss=0.0133, lr=5.29e-06, step=7824] Training: 78%|███████▊ | 7825/10000 [1:43:30<28:40, 1.26it/s, loss=0.0133, lr=5.29e-06, step=7824] Training: 78%|███████▊ | 7825/10000 [1:43:30<28:40, 1.26it/s, loss=0.0194, lr=5.29e-06, step=7825] Training: 78%|███████▊ | 7826/10000 [1:43:31<27:58, 1.30it/s, loss=0.0194, lr=5.29e-06, step=7825] Training: 78%|███████▊ | 7826/10000 [1:43:31<27:58, 1.30it/s, loss=0.0173, lr=5.29e-06, step=7826] Training: 78%|███████▊ | 7827/10000 [1:43:32<28:47, 1.26it/s, loss=0.0173, lr=5.29e-06, step=7826] Training: 78%|███████▊ | 7827/10000 [1:43:32<28:47, 1.26it/s, loss=0.0129, lr=5.28e-06, step=7827] Training: 78%|███████▊ | 7828/10000 [1:43:33<33:37, 1.08it/s, loss=0.0129, lr=5.28e-06, step=7827] Training: 78%|███████▊ | 7828/10000 [1:43:33<33:37, 1.08it/s, loss=0.0049, lr=5.28e-06, step=7828] Training: 78%|███████▊ | 7829/10000 [1:43:34<34:39, 1.04it/s, loss=0.0049, lr=5.28e-06, step=7828] Training: 78%|███████▊ | 7829/10000 [1:43:34<34:39, 1.04it/s, loss=0.0049, lr=5.28e-06, step=7829]20:28:07.176 [I] step=7830 loss=0.0131 smoothed_loss=0.0111 lr=5.29e-06 grad_norm=0.4729 step_time=0.6735s data_time=0.2115s it/s=1.130 eta_to_10000=1920.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0297 grad_action_out_proj_arms=0.1837 grad_arm_token_fuse=0.1531 grad_shared_expert=0.6832 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7830/10000 [1:43:35<31:11, 1.16it/s, loss=0.0049, lr=5.28e-06, step=7829] Training: 78%|███████▊ | 7830/10000 [1:43:35<31:11, 1.16it/s, loss=0.0131, lr=5.28e-06, step=7830] Training: 78%|███████▊ | 7831/10000 [1:43:36<29:57, 1.21it/s, loss=0.0131, lr=5.28e-06, step=7830] Training: 78%|███████▊ | 7831/10000 [1:43:36<29:57, 1.21it/s, loss=0.0085, lr=5.27e-06, step=7831] Training: 78%|███████▊ | 7832/10000 [1:43:37<31:08, 1.16it/s, loss=0.0085, lr=5.27e-06, step=7831] Training: 78%|███████▊ | 7832/10000 [1:43:37<31:08, 1.16it/s, loss=0.0028, lr=5.27e-06, step=7832] Training: 78%|███████▊ | 7833/10000 [1:43:38<34:18, 1.05it/s, loss=0.0028, lr=5.27e-06, step=7832] Training: 78%|███████▊ | 7833/10000 [1:43:38<34:18, 1.05it/s, loss=0.0046, lr=5.27e-06, step=7833] Training: 78%|███████▊ | 7834/10000 [1:43:39<35:14, 1.02it/s, loss=0.0046, lr=5.27e-06, step=7833] Training: 78%|███████▊ | 7834/10000 [1:43:39<35:14, 1.02it/s, loss=0.0036, lr=5.27e-06, step=7834] Training: 78%|███████▊ | 7835/10000 [1:43:40<33:52, 1.07it/s, loss=0.0036, lr=5.27e-06, step=7834] Training: 78%|███████▊ | 7835/10000 [1:43:40<33:52, 1.07it/s, loss=0.0029, lr=5.26e-06, step=7835] Training: 78%|███████▊ | 7836/10000 [1:43:40<32:27, 1.11it/s, loss=0.0029, lr=5.26e-06, step=7835] Training: 78%|███████▊ | 7836/10000 [1:43:40<32:27, 1.11it/s, loss=0.0107, lr=5.26e-06, step=7836] Training: 78%|███████▊ | 7837/10000 [1:43:41<32:01, 1.13it/s, loss=0.0107, lr=5.26e-06, step=7836] Training: 78%|███████▊ | 7837/10000 [1:43:41<32:01, 1.13it/s, loss=0.0025, lr=5.26e-06, step=7837] Training: 78%|███████▊ | 7838/10000 [1:43:42<32:20, 1.11it/s, loss=0.0025, lr=5.26e-06, step=7837] Training: 78%|███████▊ | 7838/10000 [1:43:42<32:20, 1.11it/s, loss=0.0037, lr=5.26e-06, step=7838] Training: 78%|███████▊ | 7839/10000 [1:43:43<33:26, 1.08it/s, loss=0.0037, lr=5.26e-06, step=7838] Training: 78%|███████▊ | 7839/10000 [1:43:43<33:26, 1.08it/s, loss=0.0123, lr=5.25e-06, step=7839]20:28:16.564 [I] step=7840 loss=0.0083 smoothed_loss=0.0080 lr=5.26e-06 grad_norm=0.3267 step_time=0.7026s data_time=0.2362s it/s=1.065 eta_to_10000=2027.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0079 grad_action_out_proj_arms=0.0695 grad_arm_token_fuse=0.0404 grad_shared_expert=0.2247 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7840/10000 [1:43:44<34:53, 1.03it/s, loss=0.0123, lr=5.25e-06, step=7839] Training: 78%|███████▊ | 7840/10000 [1:43:44<34:53, 1.03it/s, loss=0.0083, lr=5.25e-06, step=7840] Training: 78%|███████▊ | 7841/10000 [1:43:45<36:06, 1.00s/it, loss=0.0083, lr=5.25e-06, step=7840] Training: 78%|███████▊ | 7841/10000 [1:43:45<36:06, 1.00s/it, loss=0.0190, lr=5.25e-06, step=7841] Training: 78%|███████▊ | 7842/10000 [1:43:46<36:22, 1.01s/it, loss=0.0190, lr=5.25e-06, step=7841] Training: 78%|███████▊ | 7842/10000 [1:43:46<36:22, 1.01s/it, loss=0.0485, lr=5.25e-06, step=7842] Training: 78%|███████▊ | 7843/10000 [1:43:48<38:56, 1.08s/it, loss=0.0485, lr=5.25e-06, step=7842] Training: 78%|███████▊ | 7843/10000 [1:43:48<38:56, 1.08s/it, loss=0.0019, lr=5.25e-06, step=7843] Training: 78%|███████▊ | 7844/10000 [1:43:49<37:21, 1.04s/it, loss=0.0019, lr=5.25e-06, step=7843] Training: 78%|███████▊ | 7844/10000 [1:43:49<37:21, 1.04s/it, loss=0.0048, lr=5.24e-06, step=7844] Training: 78%|███████▊ | 7845/10000 [1:43:49<34:56, 1.03it/s, loss=0.0048, lr=5.24e-06, step=7844] Training: 78%|███████▊ | 7845/10000 [1:43:49<34:56, 1.03it/s, loss=0.0034, lr=5.24e-06, step=7845] Training: 78%|███████▊ | 7846/10000 [1:43:50<36:04, 1.00s/it, loss=0.0034, lr=5.24e-06, step=7845] Training: 78%|███████▊ | 7846/10000 [1:43:50<36:04, 1.00s/it, loss=0.0035, lr=5.24e-06, step=7846] Training: 78%|███████▊ | 7847/10000 [1:43:51<36:12, 1.01s/it, loss=0.0035, lr=5.24e-06, step=7846] Training: 78%|███████▊ | 7847/10000 [1:43:51<36:12, 1.01s/it, loss=0.0133, lr=5.24e-06, step=7847] Training: 78%|███████▊ | 7848/10000 [1:43:53<38:04, 1.06s/it, loss=0.0133, lr=5.24e-06, step=7847] Training: 78%|███████▊ | 7848/10000 [1:43:53<38:04, 1.06s/it, loss=0.0017, lr=5.23e-06, step=7848] Training: 78%|███████▊ | 7849/10000 [1:43:54<39:30, 1.10s/it, loss=0.0017, lr=5.23e-06, step=7848] Training: 78%|███████▊ | 7849/10000 [1:43:54<39:30, 1.10s/it, loss=0.0009, lr=5.23e-06, step=7849]20:28:27.493 [I] step=7850 loss=0.0077 smoothed_loss=0.0084 lr=5.24e-06 grad_norm=0.5040 step_time=0.7961s data_time=0.2968s it/s=0.915 eta_to_10000=2349.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0405 grad_action_out_proj_arms=0.1945 grad_arm_token_fuse=0.2398 grad_shared_expert=0.5375 (18633:train_pytorch.py:850) + Training: 78%|███████▊ | 7850/10000 [1:43:55<41:56, 1.17s/it, loss=0.0009, lr=5.23e-06, step=7849] Training: 78%|███████▊ | 7850/10000 [1:43:55<41:56, 1.17s/it, loss=0.0077, lr=5.23e-06, step=7850] Training: 79%|███████▊ | 7851/10000 [1:43:56<38:44, 1.08s/it, loss=0.0077, lr=5.23e-06, step=7850] Training: 79%|███████▊ | 7851/10000 [1:43:56<38:44, 1.08s/it, loss=0.0289, lr=5.23e-06, step=7851] Training: 79%|███████▊ | 7852/10000 [1:43:57<38:50, 1.08s/it, loss=0.0289, lr=5.23e-06, step=7851] Training: 79%|███████▊ | 7852/10000 [1:43:57<38:50, 1.08s/it, loss=0.0137, lr=5.22e-06, step=7852] Training: 79%|███████▊ | 7853/10000 [1:43:58<38:07, 1.07s/it, loss=0.0137, lr=5.22e-06, step=7852] Training: 79%|███████▊ | 7853/10000 [1:43:58<38:07, 1.07s/it, loss=0.0092, lr=5.22e-06, step=7853] Training: 79%|███████▊ | 7854/10000 [1:43:59<38:31, 1.08s/it, loss=0.0092, lr=5.22e-06, step=7853] Training: 79%|███████▊ | 7854/10000 [1:43:59<38:31, 1.08s/it, loss=0.0046, lr=5.22e-06, step=7854] Training: 79%|███████▊ | 7855/10000 [1:44:00<37:00, 1.04s/it, loss=0.0046, lr=5.22e-06, step=7854] Training: 79%|███████▊ | 7855/10000 [1:44:00<37:00, 1.04s/it, loss=0.0033, lr=5.22e-06, step=7855] Training: 79%|███████▊ | 7856/10000 [1:44:01<38:22, 1.07s/it, loss=0.0033, lr=5.22e-06, step=7855] Training: 79%|███████▊ | 7856/10000 [1:44:01<38:22, 1.07s/it, loss=0.0036, lr=5.21e-06, step=7856] Training: 79%|███████▊ | 7857/10000 [1:44:03<39:16, 1.10s/it, loss=0.0036, lr=5.21e-06, step=7856] Training: 79%|███████▊ | 7857/10000 [1:44:03<39:16, 1.10s/it, loss=0.0107, lr=5.21e-06, step=7857] Training: 79%|███████▊ | 7858/10000 [1:44:04<39:00, 1.09s/it, loss=0.0107, lr=5.21e-06, step=7857] Training: 79%|███████▊ | 7858/10000 [1:44:04<39:00, 1.09s/it, loss=0.0074, lr=5.21e-06, step=7858] Training: 79%|███████▊ | 7859/10000 [1:44:05<39:54, 1.12s/it, loss=0.0074, lr=5.21e-06, step=7858] Training: 79%|███████▊ | 7859/10000 [1:44:05<39:54, 1.12s/it, loss=0.0076, lr=5.21e-06, step=7859]20:28:38.258 [I] step=7860 loss=0.0075 smoothed_loss=0.0086 lr=5.21e-06 grad_norm=0.4110 step_time=0.7543s data_time=0.3222s it/s=0.929 eta_to_10000=2303.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0188 grad_action_out_proj_arms=0.1275 grad_arm_token_fuse=0.1004 grad_shared_expert=0.4666 (18633:train_pytorch.py:850) + Training: 79%|███████▊ | 7860/10000 [1:44:06<40:18, 1.13s/it, loss=0.0076, lr=5.21e-06, step=7859] Training: 79%|███████▊ | 7860/10000 [1:44:06<40:18, 1.13s/it, loss=0.0075, lr=5.20e-06, step=7860] Training: 79%|███████▊ | 7861/10000 [1:44:07<39:47, 1.12s/it, loss=0.0075, lr=5.20e-06, step=7860] Training: 79%|███████▊ | 7861/10000 [1:44:07<39:47, 1.12s/it, loss=0.0043, lr=5.20e-06, step=7861] Training: 79%|███████▊ | 7862/10000 [1:44:08<37:18, 1.05s/it, loss=0.0043, lr=5.20e-06, step=7861] Training: 79%|███████▊ | 7862/10000 [1:44:08<37:18, 1.05s/it, loss=0.0013, lr=5.20e-06, step=7862] Training: 79%|███████▊ | 7863/10000 [1:44:09<32:48, 1.09it/s, loss=0.0013, lr=5.20e-06, step=7862] Training: 79%|███████▊ | 7863/10000 [1:44:09<32:48, 1.09it/s, loss=0.0099, lr=5.20e-06, step=7863] Training: 79%|███████▊ | 7864/10000 [1:44:09<29:45, 1.20it/s, loss=0.0099, lr=5.20e-06, step=7863] Training: 79%|███████▊ | 7864/10000 [1:44:09<29:45, 1.20it/s, loss=0.0125, lr=5.19e-06, step=7864] Training: 79%|███████▊ | 7865/10000 [1:44:10<31:26, 1.13it/s, loss=0.0125, lr=5.19e-06, step=7864] Training: 79%|███████▊ | 7865/10000 [1:44:10<31:26, 1.13it/s, loss=0.0103, lr=5.19e-06, step=7865] Training: 79%|███████▊ | 7866/10000 [1:44:11<29:04, 1.22it/s, loss=0.0103, lr=5.19e-06, step=7865] Training: 79%|███████▊ | 7866/10000 [1:44:11<29:04, 1.22it/s, loss=0.0132, lr=5.19e-06, step=7866] Training: 79%|███████▊ | 7867/10000 [1:44:12<30:38, 1.16it/s, loss=0.0132, lr=5.19e-06, step=7866] Training: 79%|███████▊ | 7867/10000 [1:44:12<30:38, 1.16it/s, loss=0.0151, lr=5.19e-06, step=7867] Training: 79%|███████▊ | 7868/10000 [1:44:13<32:09, 1.10it/s, loss=0.0151, lr=5.19e-06, step=7867] Training: 79%|███████▊ | 7868/10000 [1:44:13<32:09, 1.10it/s, loss=0.0090, lr=5.18e-06, step=7868] Training: 79%|███████▊ | 7869/10000 [1:44:14<32:42, 1.09it/s, loss=0.0090, lr=5.18e-06, step=7868] Training: 79%|███████▊ | 7869/10000 [1:44:14<32:42, 1.09it/s, loss=0.0099, lr=5.18e-06, step=7869]20:28:46.975 [I] step=7870 loss=0.0012 smoothed_loss=0.0087 lr=5.19e-06 grad_norm=0.4836 step_time=0.6510s data_time=0.2207s it/s=1.147 eta_to_10000=1856.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.1373 grad_arm_token_fuse=0.0543 grad_shared_expert=0.3504 (18633:train_pytorch.py:850) + Training: 79%|███████▊ | 7870/10000 [1:44:15<32:25, 1.09it/s, loss=0.0099, lr=5.18e-06, step=7869] Training: 79%|███████▊ | 7870/10000 [1:44:15<32:25, 1.09it/s, loss=0.0012, lr=5.18e-06, step=7870] Training: 79%|███████▊ | 7871/10000 [1:44:16<31:56, 1.11it/s, loss=0.0012, lr=5.18e-06, step=7870] Training: 79%|███████▊ | 7871/10000 [1:44:16<31:56, 1.11it/s, loss=0.0248, lr=5.18e-06, step=7871] Training: 79%|███████▊ | 7872/10000 [1:44:17<34:05, 1.04it/s, loss=0.0248, lr=5.18e-06, step=7871] Training: 79%|███████▊ | 7872/10000 [1:44:17<34:05, 1.04it/s, loss=0.0069, lr=5.17e-06, step=7872] Training: 79%|███████▊ | 7873/10000 [1:44:17<33:09, 1.07it/s, loss=0.0069, lr=5.17e-06, step=7872] Training: 79%|███████▊ | 7873/10000 [1:44:17<33:09, 1.07it/s, loss=0.0021, lr=5.17e-06, step=7873] Training: 79%|███████▊ | 7874/10000 [1:44:18<32:28, 1.09it/s, loss=0.0021, lr=5.17e-06, step=7873] Training: 79%|███████▊ | 7874/10000 [1:44:18<32:28, 1.09it/s, loss=0.0071, lr=5.17e-06, step=7874] Training: 79%|███████▉ | 7875/10000 [1:44:19<32:40, 1.08it/s, loss=0.0071, lr=5.17e-06, step=7874] Training: 79%|███████▉ | 7875/10000 [1:44:19<32:40, 1.08it/s, loss=0.0048, lr=5.17e-06, step=7875] Training: 79%|███████▉ | 7876/10000 [1:44:20<35:05, 1.01it/s, loss=0.0048, lr=5.17e-06, step=7875] Training: 79%|███████▉ | 7876/10000 [1:44:20<35:05, 1.01it/s, loss=0.0054, lr=5.17e-06, step=7876] Training: 79%|███████▉ | 7877/10000 [1:44:22<37:14, 1.05s/it, loss=0.0054, lr=5.17e-06, step=7876] Training: 79%|███████▉ | 7877/10000 [1:44:22<37:14, 1.05s/it, loss=0.0089, lr=5.16e-06, step=7877] Training: 79%|███████▉ | 7878/10000 [1:44:23<39:22, 1.11s/it, loss=0.0089, lr=5.16e-06, step=7877] Training: 79%|███████▉ | 7878/10000 [1:44:23<39:22, 1.11s/it, loss=0.0064, lr=5.16e-06, step=7878] Training: 79%|███████▉ | 7879/10000 [1:44:24<40:31, 1.15s/it, loss=0.0064, lr=5.16e-06, step=7878] Training: 79%|███████▉ | 7879/10000 [1:44:24<40:31, 1.15s/it, loss=0.0022, lr=5.16e-06, step=7879]20:28:57.521 [I] step=7880 loss=0.0013 smoothed_loss=0.0069 lr=5.17e-06 grad_norm=0.4102 step_time=0.7554s data_time=0.2993s it/s=0.948 eta_to_10000=2235.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0669 grad_arm_token_fuse=0.0250 grad_shared_expert=0.2975 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7880/10000 [1:44:25<39:37, 1.12s/it, loss=0.0022, lr=5.16e-06, step=7879] Training: 79%|███████▉ | 7880/10000 [1:44:25<39:37, 1.12s/it, loss=0.0013, lr=5.16e-06, step=7880] Training: 79%|███████▉ | 7881/10000 [1:44:26<39:05, 1.11s/it, loss=0.0013, lr=5.16e-06, step=7880] Training: 79%|███████▉ | 7881/10000 [1:44:26<39:05, 1.11s/it, loss=0.0029, lr=5.15e-06, step=7881] Training: 79%|███████▉ | 7882/10000 [1:44:27<39:00, 1.11s/it, loss=0.0029, lr=5.15e-06, step=7881] Training: 79%|███████▉ | 7882/10000 [1:44:27<39:00, 1.11s/it, loss=0.0036, lr=5.15e-06, step=7882] Training: 79%|███████▉ | 7883/10000 [1:44:28<38:14, 1.08s/it, loss=0.0036, lr=5.15e-06, step=7882] Training: 79%|███████▉ | 7883/10000 [1:44:28<38:14, 1.08s/it, loss=0.0031, lr=5.15e-06, step=7883] Training: 79%|███████▉ | 7884/10000 [1:44:29<37:48, 1.07s/it, loss=0.0031, lr=5.15e-06, step=7883] Training: 79%|███████▉ | 7884/10000 [1:44:29<37:48, 1.07s/it, loss=0.0012, lr=5.15e-06, step=7884] Training: 79%|███████▉ | 7885/10000 [1:44:31<38:49, 1.10s/it, loss=0.0012, lr=5.15e-06, step=7884] Training: 79%|███████▉ | 7885/10000 [1:44:31<38:49, 1.10s/it, loss=0.0320, lr=5.14e-06, step=7885] Training: 79%|███████▉ | 7886/10000 [1:44:32<40:32, 1.15s/it, loss=0.0320, lr=5.14e-06, step=7885] Training: 79%|███████▉ | 7886/10000 [1:44:32<40:32, 1.15s/it, loss=0.0015, lr=5.14e-06, step=7886] Training: 79%|███████▉ | 7887/10000 [1:44:33<39:08, 1.11s/it, loss=0.0015, lr=5.14e-06, step=7886] Training: 79%|███████▉ | 7887/10000 [1:44:33<39:08, 1.11s/it, loss=0.0470, lr=5.14e-06, step=7887] Training: 79%|███████▉ | 7888/10000 [1:44:34<36:47, 1.05s/it, loss=0.0470, lr=5.14e-06, step=7887] Training: 79%|███████▉ | 7888/10000 [1:44:34<36:47, 1.05s/it, loss=0.0056, lr=5.14e-06, step=7888] Training: 79%|███████▉ | 7889/10000 [1:44:35<37:07, 1.06s/it, loss=0.0056, lr=5.14e-06, step=7888] Training: 79%|███████▉ | 7889/10000 [1:44:35<37:07, 1.06s/it, loss=0.0040, lr=5.13e-06, step=7889]20:29:08.310 [I] step=7890 loss=0.0063 smoothed_loss=0.0097 lr=5.14e-06 grad_norm=0.4723 step_time=0.7343s data_time=0.3446s it/s=0.927 eta_to_10000=2276.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0111 grad_action_out_proj_arms=0.1083 grad_arm_token_fuse=0.0542 grad_shared_expert=0.5491 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7890/10000 [1:44:36<37:41, 1.07s/it, loss=0.0040, lr=5.13e-06, step=7889] Training: 79%|███████▉ | 7890/10000 [1:44:36<37:41, 1.07s/it, loss=0.0063, lr=5.13e-06, step=7890] Training: 79%|███████▉ | 7891/10000 [1:44:37<36:42, 1.04s/it, loss=0.0063, lr=5.13e-06, step=7890] Training: 79%|███████▉ | 7891/10000 [1:44:37<36:42, 1.04s/it, loss=0.0126, lr=5.13e-06, step=7891] Training: 79%|███████▉ | 7892/10000 [1:44:38<37:52, 1.08s/it, loss=0.0126, lr=5.13e-06, step=7891] Training: 79%|███████▉ | 7892/10000 [1:44:38<37:52, 1.08s/it, loss=0.0010, lr=5.13e-06, step=7892] Training: 79%|███████▉ | 7893/10000 [1:44:39<40:02, 1.14s/it, loss=0.0010, lr=5.13e-06, step=7892] Training: 79%|███████▉ | 7893/10000 [1:44:39<40:02, 1.14s/it, loss=0.0150, lr=5.12e-06, step=7893] Training: 79%|███████▉ | 7894/10000 [1:44:40<37:12, 1.06s/it, loss=0.0150, lr=5.12e-06, step=7893] Training: 79%|███████▉ | 7894/10000 [1:44:40<37:12, 1.06s/it, loss=0.0034, lr=5.12e-06, step=7894] Training: 79%|███████▉ | 7895/10000 [1:44:41<34:26, 1.02it/s, loss=0.0034, lr=5.12e-06, step=7894] Training: 79%|███████▉ | 7895/10000 [1:44:41<34:26, 1.02it/s, loss=0.0029, lr=5.12e-06, step=7895] Training: 79%|███████▉ | 7896/10000 [1:44:42<32:24, 1.08it/s, loss=0.0029, lr=5.12e-06, step=7895] Training: 79%|███████▉ | 7896/10000 [1:44:42<32:24, 1.08it/s, loss=0.0116, lr=5.12e-06, step=7896] Training: 79%|███████▉ | 7897/10000 [1:44:42<27:53, 1.26it/s, loss=0.0116, lr=5.12e-06, step=7896] Training: 79%|███████▉ | 7897/10000 [1:44:42<27:53, 1.26it/s, loss=0.0070, lr=5.12e-06, step=7897] Training: 79%|███████▉ | 7898/10000 [1:44:43<25:06, 1.40it/s, loss=0.0070, lr=5.12e-06, step=7897] Training: 79%|███████▉ | 7898/10000 [1:44:43<25:06, 1.40it/s, loss=0.0121, lr=5.11e-06, step=7898] Training: 79%|███████▉ | 7899/10000 [1:44:44<27:15, 1.28it/s, loss=0.0121, lr=5.11e-06, step=7898] Training: 79%|███████▉ | 7899/10000 [1:44:44<27:15, 1.28it/s, loss=0.0052, lr=5.11e-06, step=7899]20:29:16.848 [I] step=7900 loss=0.0025 smoothed_loss=0.0080 lr=5.12e-06 grad_norm=0.3879 step_time=0.6730s data_time=0.1807s it/s=1.171 eta_to_10000=1792.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.0571 grad_arm_token_fuse=0.0394 grad_shared_expert=0.3674 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7900/10000 [1:44:45<26:27, 1.32it/s, loss=0.0052, lr=5.11e-06, step=7899] Training: 79%|███████▉ | 7900/10000 [1:44:45<26:27, 1.32it/s, loss=0.0025, lr=5.11e-06, step=7900] Training: 79%|███████▉ | 7901/10000 [1:44:45<27:14, 1.28it/s, loss=0.0025, lr=5.11e-06, step=7900] Training: 79%|███████▉ | 7901/10000 [1:44:45<27:14, 1.28it/s, loss=0.0158, lr=5.11e-06, step=7901] Training: 79%|███████▉ | 7902/10000 [1:44:46<24:15, 1.44it/s, loss=0.0158, lr=5.11e-06, step=7901] Training: 79%|███████▉ | 7902/10000 [1:44:46<24:15, 1.44it/s, loss=0.0034, lr=5.10e-06, step=7902] Training: 79%|███████▉ | 7903/10000 [1:44:46<23:19, 1.50it/s, loss=0.0034, lr=5.10e-06, step=7902] Training: 79%|███████▉ | 7903/10000 [1:44:46<23:19, 1.50it/s, loss=0.0061, lr=5.10e-06, step=7903] Training: 79%|███████▉ | 7904/10000 [1:44:47<25:28, 1.37it/s, loss=0.0061, lr=5.10e-06, step=7903] Training: 79%|███████▉ | 7904/10000 [1:44:47<25:28, 1.37it/s, loss=0.0076, lr=5.10e-06, step=7904] Training: 79%|███████▉ | 7905/10000 [1:44:48<27:44, 1.26it/s, loss=0.0076, lr=5.10e-06, step=7904] Training: 79%|███████▉ | 7905/10000 [1:44:48<27:44, 1.26it/s, loss=0.0435, lr=5.10e-06, step=7905] Training: 79%|███████▉ | 7906/10000 [1:44:49<29:42, 1.18it/s, loss=0.0435, lr=5.10e-06, step=7905] Training: 79%|███████▉ | 7906/10000 [1:44:49<29:42, 1.18it/s, loss=0.0352, lr=5.09e-06, step=7906] Training: 79%|███████▉ | 7907/10000 [1:44:50<29:46, 1.17it/s, loss=0.0352, lr=5.09e-06, step=7906] Training: 79%|███████▉ | 7907/10000 [1:44:50<29:46, 1.17it/s, loss=0.0075, lr=5.09e-06, step=7907] Training: 79%|███████▉ | 7908/10000 [1:44:51<26:34, 1.31it/s, loss=0.0075, lr=5.09e-06, step=7907] Training: 79%|███████▉ | 7908/10000 [1:44:51<26:34, 1.31it/s, loss=0.0050, lr=5.09e-06, step=7908] Training: 79%|███████▉ | 7909/10000 [1:44:52<28:00, 1.24it/s, loss=0.0050, lr=5.09e-06, step=7908] Training: 79%|███████▉ | 7909/10000 [1:44:52<28:00, 1.24it/s, loss=0.0080, lr=5.09e-06, step=7909]20:29:24.860 [I] step=7910 loss=0.0045 smoothed_loss=0.0112 lr=5.09e-06 grad_norm=0.4573 step_time=0.6136s data_time=0.1876s it/s=1.248 eta_to_10000=1674.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0035 grad_action_out_proj_arms=0.0523 grad_arm_token_fuse=0.0174 grad_shared_expert=0.2571 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7910/10000 [1:44:53<29:41, 1.17it/s, loss=0.0080, lr=5.09e-06, step=7909] Training: 79%|███████▉ | 7910/10000 [1:44:53<29:41, 1.17it/s, loss=0.0045, lr=5.08e-06, step=7910] Training: 79%|███████▉ | 7911/10000 [1:44:53<29:18, 1.19it/s, loss=0.0045, lr=5.08e-06, step=7910] Training: 79%|███████▉ | 7911/10000 [1:44:53<29:18, 1.19it/s, loss=0.0106, lr=5.08e-06, step=7911] Training: 79%|███████▉ | 7912/10000 [1:44:54<27:40, 1.26it/s, loss=0.0106, lr=5.08e-06, step=7911] Training: 79%|███████▉ | 7912/10000 [1:44:54<27:40, 1.26it/s, loss=0.0013, lr=5.08e-06, step=7912] Training: 79%|███████▉ | 7913/10000 [1:44:55<26:19, 1.32it/s, loss=0.0013, lr=5.08e-06, step=7912] Training: 79%|███████▉ | 7913/10000 [1:44:55<26:19, 1.32it/s, loss=0.0030, lr=5.08e-06, step=7913] Training: 79%|███████▉ | 7914/10000 [1:44:56<27:34, 1.26it/s, loss=0.0030, lr=5.08e-06, step=7913] Training: 79%|███████▉ | 7914/10000 [1:44:56<27:34, 1.26it/s, loss=0.0342, lr=5.07e-06, step=7914] Training: 79%|███████▉ | 7915/10000 [1:44:57<29:52, 1.16it/s, loss=0.0342, lr=5.07e-06, step=7914] Training: 79%|███████▉ | 7915/10000 [1:44:57<29:52, 1.16it/s, loss=0.0014, lr=5.07e-06, step=7915] Training: 79%|███████▉ | 7916/10000 [1:44:57<29:54, 1.16it/s, loss=0.0014, lr=5.07e-06, step=7915] Training: 79%|███████▉ | 7916/10000 [1:44:57<29:54, 1.16it/s, loss=0.0131, lr=5.07e-06, step=7916] Training: 79%|███████▉ | 7917/10000 [1:44:58<29:01, 1.20it/s, loss=0.0131, lr=5.07e-06, step=7916] Training: 79%|███████▉ | 7917/10000 [1:44:58<29:01, 1.20it/s, loss=0.0017, lr=5.07e-06, step=7917] Training: 79%|███████▉ | 7918/10000 [1:44:59<28:15, 1.23it/s, loss=0.0017, lr=5.07e-06, step=7917] Training: 79%|███████▉ | 7918/10000 [1:44:59<28:15, 1.23it/s, loss=0.0128, lr=5.07e-06, step=7918] Training: 79%|███████▉ | 7919/10000 [1:45:00<26:31, 1.31it/s, loss=0.0128, lr=5.07e-06, step=7918] Training: 79%|███████▉ | 7919/10000 [1:45:00<26:31, 1.31it/s, loss=0.0084, lr=5.06e-06, step=7919]20:29:32.541 [I] step=7920 loss=0.0370 smoothed_loss=0.0129 lr=5.07e-06 grad_norm=0.5175 step_time=0.6015s data_time=0.1666s it/s=1.302 eta_to_10000=1597.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0155 grad_action_out_proj_arms=0.1070 grad_arm_token_fuse=0.0784 grad_shared_expert=0.4697 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7920/10000 [1:45:00<24:24, 1.42it/s, loss=0.0084, lr=5.06e-06, step=7919] Training: 79%|███████▉ | 7920/10000 [1:45:00<24:24, 1.42it/s, loss=0.0370, lr=5.06e-06, step=7920] Training: 79%|███████▉ | 7921/10000 [1:45:01<28:06, 1.23it/s, loss=0.0370, lr=5.06e-06, step=7920] Training: 79%|███████▉ | 7921/10000 [1:45:01<28:06, 1.23it/s, loss=0.0112, lr=5.06e-06, step=7921] Training: 79%|███████▉ | 7922/10000 [1:45:02<32:12, 1.08it/s, loss=0.0112, lr=5.06e-06, step=7921] Training: 79%|███████▉ | 7922/10000 [1:45:02<32:12, 1.08it/s, loss=0.0110, lr=5.06e-06, step=7922] Training: 79%|███████▉ | 7923/10000 [1:45:04<34:33, 1.00it/s, loss=0.0110, lr=5.06e-06, step=7922] Training: 79%|███████▉ | 7923/10000 [1:45:04<34:33, 1.00it/s, loss=0.0059, lr=5.05e-06, step=7923] Training: 79%|███████▉ | 7924/10000 [1:45:05<35:25, 1.02s/it, loss=0.0059, lr=5.05e-06, step=7923] Training: 79%|███████▉ | 7924/10000 [1:45:05<35:25, 1.02s/it, loss=0.0083, lr=5.05e-06, step=7924] Training: 79%|███████▉ | 7925/10000 [1:45:05<30:23, 1.14it/s, loss=0.0083, lr=5.05e-06, step=7924] Training: 79%|███████▉ | 7925/10000 [1:45:05<30:23, 1.14it/s, loss=0.0147, lr=5.05e-06, step=7925] Training: 79%|███████▉ | 7926/10000 [1:45:06<26:17, 1.31it/s, loss=0.0147, lr=5.05e-06, step=7925] Training: 79%|███████▉ | 7926/10000 [1:45:06<26:17, 1.31it/s, loss=0.0158, lr=5.05e-06, step=7926] Training: 79%|███████▉ | 7927/10000 [1:45:07<26:20, 1.31it/s, loss=0.0158, lr=5.05e-06, step=7926] Training: 79%|███████▉ | 7927/10000 [1:45:07<26:20, 1.31it/s, loss=0.0116, lr=5.04e-06, step=7927] Training: 79%|███████▉ | 7928/10000 [1:45:07<25:26, 1.36it/s, loss=0.0116, lr=5.04e-06, step=7927] Training: 79%|███████▉ | 7928/10000 [1:45:07<25:26, 1.36it/s, loss=0.0161, lr=5.04e-06, step=7928] Training: 79%|███████▉ | 7929/10000 [1:45:08<24:54, 1.39it/s, loss=0.0161, lr=5.04e-06, step=7928] Training: 79%|███████▉ | 7929/10000 [1:45:08<24:54, 1.39it/s, loss=0.0557, lr=5.04e-06, step=7929]20:29:41.099 [I] step=7930 loss=0.0046 smoothed_loss=0.0157 lr=5.05e-06 grad_norm=0.4057 step_time=0.6474s data_time=0.2084s it/s=1.169 eta_to_10000=1771.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0135 grad_action_out_proj_arms=0.0887 grad_arm_token_fuse=0.0689 grad_shared_expert=0.2632 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7930/10000 [1:45:09<26:40, 1.29it/s, loss=0.0557, lr=5.04e-06, step=7929] Training: 79%|███████▉ | 7930/10000 [1:45:09<26:40, 1.29it/s, loss=0.0046, lr=5.04e-06, step=7930] Training: 79%|███████▉ | 7931/10000 [1:45:10<31:37, 1.09it/s, loss=0.0046, lr=5.04e-06, step=7930] Training: 79%|███████▉ | 7931/10000 [1:45:10<31:37, 1.09it/s, loss=0.0017, lr=5.03e-06, step=7931] Training: 79%|███████▉ | 7932/10000 [1:45:11<34:40, 1.01s/it, loss=0.0017, lr=5.03e-06, step=7931] Training: 79%|███████▉ | 7932/10000 [1:45:11<34:40, 1.01s/it, loss=0.0051, lr=5.03e-06, step=7932] Training: 79%|███████▉ | 7933/10000 [1:45:13<37:25, 1.09s/it, loss=0.0051, lr=5.03e-06, step=7932] Training: 79%|███████▉ | 7933/10000 [1:45:13<37:25, 1.09s/it, loss=0.0120, lr=5.03e-06, step=7933] Training: 79%|███████▉ | 7934/10000 [1:45:14<36:47, 1.07s/it, loss=0.0120, lr=5.03e-06, step=7933] Training: 79%|███████▉ | 7934/10000 [1:45:14<36:47, 1.07s/it, loss=0.0324, lr=5.03e-06, step=7934] Training: 79%|███████▉ | 7935/10000 [1:45:14<34:48, 1.01s/it, loss=0.0324, lr=5.03e-06, step=7934] Training: 79%|███████▉ | 7935/10000 [1:45:14<34:48, 1.01s/it, loss=0.0134, lr=5.03e-06, step=7935] Training: 79%|███████▉ | 7936/10000 [1:45:16<39:16, 1.14s/it, loss=0.0134, lr=5.03e-06, step=7935] Training: 79%|███████▉ | 7936/10000 [1:45:16<39:16, 1.14s/it, loss=0.0048, lr=5.02e-06, step=7936] Training: 79%|███████▉ | 7937/10000 [1:45:17<39:37, 1.15s/it, loss=0.0048, lr=5.02e-06, step=7936] Training: 79%|███████▉ | 7937/10000 [1:45:17<39:37, 1.15s/it, loss=0.0026, lr=5.02e-06, step=7937] Training: 79%|███████▉ | 7938/10000 [1:45:18<37:34, 1.09s/it, loss=0.0026, lr=5.02e-06, step=7937] Training: 79%|███████▉ | 7938/10000 [1:45:18<37:34, 1.09s/it, loss=0.0330, lr=5.02e-06, step=7938] Training: 79%|███████▉ | 7939/10000 [1:45:19<36:49, 1.07s/it, loss=0.0330, lr=5.02e-06, step=7938] Training: 79%|███████▉ | 7939/10000 [1:45:19<36:49, 1.07s/it, loss=0.0197, lr=5.02e-06, step=7939]20:29:52.420 [I] step=7940 loss=0.0187 smoothed_loss=0.0156 lr=5.02e-06 grad_norm=0.4310 step_time=0.7447s data_time=0.3874s it/s=0.883 eta_to_10000=2331.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0191 grad_action_out_proj_arms=0.1329 grad_arm_token_fuse=0.0995 grad_shared_expert=0.4691 (18633:train_pytorch.py:850) + Training: 79%|███████▉ | 7940/10000 [1:45:20<36:49, 1.07s/it, loss=0.0197, lr=5.02e-06, step=7939] Training: 79%|███████▉ | 7940/10000 [1:45:20<36:49, 1.07s/it, loss=0.0187, lr=5.01e-06, step=7940] Training: 79%|███████▉ | 7941/10000 [1:45:21<36:03, 1.05s/it, loss=0.0187, lr=5.01e-06, step=7940] Training: 79%|███████▉ | 7941/10000 [1:45:21<36:03, 1.05s/it, loss=0.0094, lr=5.01e-06, step=7941] Training: 79%|███████▉ | 7942/10000 [1:45:22<35:08, 1.02s/it, loss=0.0094, lr=5.01e-06, step=7941] Training: 79%|███████▉ | 7942/10000 [1:45:22<35:08, 1.02s/it, loss=0.0029, lr=5.01e-06, step=7942] Training: 79%|███████▉ | 7943/10000 [1:45:23<35:42, 1.04s/it, loss=0.0029, lr=5.01e-06, step=7942] Training: 79%|███████▉ | 7943/10000 [1:45:23<35:42, 1.04s/it, loss=0.0227, lr=5.01e-06, step=7943] Training: 79%|███████▉ | 7944/10000 [1:45:24<35:34, 1.04s/it, loss=0.0227, lr=5.01e-06, step=7943] Training: 79%|███████▉ | 7944/10000 [1:45:24<35:34, 1.04s/it, loss=0.0060, lr=5.00e-06, step=7944] Training: 79%|███████▉ | 7945/10000 [1:45:25<35:03, 1.02s/it, loss=0.0060, lr=5.00e-06, step=7944] Training: 79%|███████▉ | 7945/10000 [1:45:25<35:03, 1.02s/it, loss=0.0019, lr=5.00e-06, step=7945] Training: 79%|███████▉ | 7946/10000 [1:45:26<29:52, 1.15it/s, loss=0.0019, lr=5.00e-06, step=7945] Training: 79%|███████▉ | 7946/10000 [1:45:26<29:52, 1.15it/s, loss=0.0011, lr=5.00e-06, step=7946] Training: 79%|███████▉ | 7947/10000 [1:45:26<25:51, 1.32it/s, loss=0.0011, lr=5.00e-06, step=7946] Training: 79%|███████▉ | 7947/10000 [1:45:26<25:51, 1.32it/s, loss=0.0040, lr=5.00e-06, step=7947] Training: 79%|███████▉ | 7948/10000 [1:45:27<23:11, 1.47it/s, loss=0.0040, lr=5.00e-06, step=7947] Training: 79%|███████▉ | 7948/10000 [1:45:27<23:11, 1.47it/s, loss=0.0070, lr=4.99e-06, step=7948] Training: 79%|███████▉ | 7949/10000 [1:45:27<22:06, 1.55it/s, loss=0.0070, lr=4.99e-06, step=7948] Training: 79%|███████▉ | 7949/10000 [1:45:27<22:06, 1.55it/s, loss=0.0052, lr=4.99e-06, step=7949]20:30:00.634 [I] step=7950 loss=0.0073 smoothed_loss=0.0096 lr=5.00e-06 grad_norm=0.4117 step_time=0.6250s data_time=0.1964s it/s=1.218 eta_to_10000=1683.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0143 grad_action_out_proj_arms=0.1271 grad_arm_token_fuse=0.0781 grad_shared_expert=0.4073 (18633:train_pytorch.py:850) + Training: 80%|███████▉ | 7950/10000 [1:45:28<26:28, 1.29it/s, loss=0.0052, lr=4.99e-06, step=7949] Training: 80%|███████▉ | 7950/10000 [1:45:28<26:28, 1.29it/s, loss=0.0073, lr=4.99e-06, step=7950] Training: 80%|███████▉ | 7951/10000 [1:45:29<29:58, 1.14it/s, loss=0.0073, lr=4.99e-06, step=7950] Training: 80%|███████▉ | 7951/10000 [1:45:29<29:58, 1.14it/s, loss=0.0170, lr=4.99e-06, step=7951] Training: 80%|███████▉ | 7952/10000 [1:45:30<31:05, 1.10it/s, loss=0.0170, lr=4.99e-06, step=7951] Training: 80%|███████▉ | 7952/10000 [1:45:30<31:05, 1.10it/s, loss=0.0035, lr=4.99e-06, step=7952] Training: 80%|███████▉ | 7953/10000 [1:45:31<30:36, 1.11it/s, loss=0.0035, lr=4.99e-06, step=7952] Training: 80%|███████▉ | 7953/10000 [1:45:31<30:36, 1.11it/s, loss=0.0038, lr=4.98e-06, step=7953] Training: 80%|███████▉ | 7954/10000 [1:45:32<31:47, 1.07it/s, loss=0.0038, lr=4.98e-06, step=7953] Training: 80%|███████▉ | 7954/10000 [1:45:32<31:47, 1.07it/s, loss=0.0010, lr=4.98e-06, step=7954] Training: 80%|███████▉ | 7955/10000 [1:45:33<34:06, 1.00s/it, loss=0.0010, lr=4.98e-06, step=7954] Training: 80%|███████▉ | 7955/10000 [1:45:33<34:06, 1.00s/it, loss=0.0080, lr=4.98e-06, step=7955] Training: 80%|███████▉ | 7956/10000 [1:45:34<32:54, 1.03it/s, loss=0.0080, lr=4.98e-06, step=7955] Training: 80%|███████▉ | 7956/10000 [1:45:34<32:54, 1.03it/s, loss=0.0024, lr=4.98e-06, step=7956] Training: 80%|███████▉ | 7957/10000 [1:45:35<33:57, 1.00it/s, loss=0.0024, lr=4.98e-06, step=7956] Training: 80%|███████▉ | 7957/10000 [1:45:35<33:57, 1.00it/s, loss=0.0599, lr=4.97e-06, step=7957] Training: 80%|███████▉ | 7958/10000 [1:45:36<31:33, 1.08it/s, loss=0.0599, lr=4.97e-06, step=7957] Training: 80%|███████▉ | 7958/10000 [1:45:36<31:33, 1.08it/s, loss=0.0041, lr=4.97e-06, step=7958] Training: 80%|███████▉ | 7959/10000 [1:45:37<30:08, 1.13it/s, loss=0.0041, lr=4.97e-06, step=7958] Training: 80%|███████▉ | 7959/10000 [1:45:37<30:08, 1.13it/s, loss=0.0103, lr=4.97e-06, step=7959]20:30:10.236 [I] step=7960 loss=0.0030 smoothed_loss=0.0109 lr=4.98e-06 grad_norm=0.4042 step_time=0.6897s data_time=0.2705s it/s=1.042 eta_to_10000=1958.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0411 grad_action_out_proj_arms=0.1569 grad_arm_token_fuse=0.2184 grad_shared_expert=0.4863 (18633:train_pytorch.py:850) + Training: 80%|███████▉ | 7960/10000 [1:45:38<30:45, 1.11it/s, loss=0.0103, lr=4.97e-06, step=7959] Training: 80%|███████▉ | 7960/10000 [1:45:38<30:45, 1.11it/s, loss=0.0030, lr=4.97e-06, step=7960] Training: 80%|███████▉ | 7961/10000 [1:45:39<30:54, 1.10it/s, loss=0.0030, lr=4.97e-06, step=7960] Training: 80%|███████▉ | 7961/10000 [1:45:39<30:54, 1.10it/s, loss=0.0047, lr=4.96e-06, step=7961] Training: 80%|███████▉ | 7962/10000 [1:45:40<30:55, 1.10it/s, loss=0.0047, lr=4.96e-06, step=7961] Training: 80%|███████▉ | 7962/10000 [1:45:40<30:55, 1.10it/s, loss=0.0272, lr=4.96e-06, step=7962] Training: 80%|███████▉ | 7963/10000 [1:45:40<26:21, 1.29it/s, loss=0.0272, lr=4.96e-06, step=7962] Training: 80%|███████▉ | 7963/10000 [1:45:40<26:21, 1.29it/s, loss=0.1140, lr=4.96e-06, step=7963] Training: 80%|███████▉ | 7964/10000 [1:45:41<24:56, 1.36it/s, loss=0.1140, lr=4.96e-06, step=7963] Training: 80%|███████▉ | 7964/10000 [1:45:41<24:56, 1.36it/s, loss=0.0090, lr=4.96e-06, step=7964] Training: 80%|███████▉ | 7965/10000 [1:45:42<27:51, 1.22it/s, loss=0.0090, lr=4.96e-06, step=7964] Training: 80%|███████▉ | 7965/10000 [1:45:42<27:51, 1.22it/s, loss=0.0026, lr=4.96e-06, step=7965] Training: 80%|███████▉ | 7966/10000 [1:45:43<29:40, 1.14it/s, loss=0.0026, lr=4.96e-06, step=7965] Training: 80%|███████▉ | 7966/10000 [1:45:43<29:40, 1.14it/s, loss=0.0079, lr=4.95e-06, step=7966] Training: 80%|███████▉ | 7967/10000 [1:45:43<25:33, 1.33it/s, loss=0.0079, lr=4.95e-06, step=7966] Training: 80%|███████▉ | 7967/10000 [1:45:43<25:33, 1.33it/s, loss=0.0253, lr=4.95e-06, step=7967] Training: 80%|███████▉ | 7968/10000 [1:45:44<23:48, 1.42it/s, loss=0.0253, lr=4.95e-06, step=7967] Training: 80%|███████▉ | 7968/10000 [1:45:44<23:48, 1.42it/s, loss=0.0010, lr=4.95e-06, step=7968] Training: 80%|███████▉ | 7969/10000 [1:45:45<25:34, 1.32it/s, loss=0.0010, lr=4.95e-06, step=7968] Training: 80%|███████▉ | 7969/10000 [1:45:45<25:34, 1.32it/s, loss=0.0184, lr=4.95e-06, step=7969]20:30:18.147 [I] step=7970 loss=0.0137 smoothed_loss=0.0167 lr=4.95e-06 grad_norm=0.4116 step_time=0.5999s data_time=0.1912s it/s=1.264 eta_to_10000=1605.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0142 grad_action_out_proj_arms=0.0858 grad_arm_token_fuse=0.0756 grad_shared_expert=0.2382 (18633:train_pytorch.py:850) + Training: 80%|███████▉ | 7970/10000 [1:45:46<28:13, 1.20it/s, loss=0.0184, lr=4.95e-06, step=7969] Training: 80%|███████▉ | 7970/10000 [1:45:46<28:13, 1.20it/s, loss=0.0137, lr=4.94e-06, step=7970] Training: 80%|███████▉ | 7971/10000 [1:45:47<33:06, 1.02it/s, loss=0.0137, lr=4.94e-06, step=7970] Training: 80%|███████▉ | 7971/10000 [1:45:47<33:06, 1.02it/s, loss=0.0637, lr=4.94e-06, step=7971] Training: 80%|███████▉ | 7972/10000 [1:45:48<35:03, 1.04s/it, loss=0.0637, lr=4.94e-06, step=7971] Training: 80%|███████▉ | 7972/10000 [1:45:48<35:03, 1.04s/it, loss=0.0026, lr=4.94e-06, step=7972] Training: 80%|███████▉ | 7973/10000 [1:45:49<34:25, 1.02s/it, loss=0.0026, lr=4.94e-06, step=7972] Training: 80%|███████▉ | 7973/10000 [1:45:49<34:25, 1.02s/it, loss=0.0027, lr=4.94e-06, step=7973] Training: 80%|███████▉ | 7974/10000 [1:45:50<29:53, 1.13it/s, loss=0.0027, lr=4.94e-06, step=7973] Training: 80%|███████▉ | 7974/10000 [1:45:50<29:53, 1.13it/s, loss=0.0354, lr=4.93e-06, step=7974] Training: 80%|███████▉ | 7975/10000 [1:45:51<28:37, 1.18it/s, loss=0.0354, lr=4.93e-06, step=7974] Training: 80%|███████▉ | 7975/10000 [1:45:51<28:37, 1.18it/s, loss=0.0092, lr=4.93e-06, step=7975] Training: 80%|███████▉ | 7976/10000 [1:45:52<30:01, 1.12it/s, loss=0.0092, lr=4.93e-06, step=7975] Training: 80%|███████▉ | 7976/10000 [1:45:52<30:01, 1.12it/s, loss=0.0037, lr=4.93e-06, step=7976] Training: 80%|███████▉ | 7977/10000 [1:45:52<26:25, 1.28it/s, loss=0.0037, lr=4.93e-06, step=7976] Training: 80%|███████▉ | 7977/10000 [1:45:52<26:25, 1.28it/s, loss=0.0191, lr=4.93e-06, step=7977] Training: 80%|███████▉ | 7978/10000 [1:45:53<28:38, 1.18it/s, loss=0.0191, lr=4.93e-06, step=7977] Training: 80%|███████▉ | 7978/10000 [1:45:53<28:38, 1.18it/s, loss=0.0073, lr=4.92e-06, step=7978] Training: 80%|███████▉ | 7979/10000 [1:45:54<32:11, 1.05it/s, loss=0.0073, lr=4.92e-06, step=7978] Training: 80%|███████▉ | 7979/10000 [1:45:54<32:11, 1.05it/s, loss=0.0038, lr=4.92e-06, step=7979]20:30:27.643 [I] step=7980 loss=0.0054 smoothed_loss=0.0141 lr=4.93e-06 grad_norm=0.4779 step_time=0.6953s data_time=0.2542s it/s=1.053 eta_to_10000=1918.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0120 grad_action_out_proj_arms=0.0844 grad_arm_token_fuse=0.0606 grad_shared_expert=0.5605 (18633:train_pytorch.py:850) + Training: 80%|███████▉ | 7980/10000 [1:45:55<32:15, 1.04it/s, loss=0.0038, lr=4.92e-06, step=7979] Training: 80%|███████▉ | 7980/10000 [1:45:55<32:15, 1.04it/s, loss=0.0054, lr=4.92e-06, step=7980] Training: 80%|███████▉ | 7981/10000 [1:45:56<30:35, 1.10it/s, loss=0.0054, lr=4.92e-06, step=7980] Training: 80%|███████▉ | 7981/10000 [1:45:56<30:35, 1.10it/s, loss=0.0062, lr=4.92e-06, step=7981] Training: 80%|███████▉ | 7982/10000 [1:45:57<28:37, 1.18it/s, loss=0.0062, lr=4.92e-06, step=7981] Training: 80%|███████▉ | 7982/10000 [1:45:57<28:37, 1.18it/s, loss=0.0064, lr=4.92e-06, step=7982] Training: 80%|███████▉ | 7983/10000 [1:45:58<29:47, 1.13it/s, loss=0.0064, lr=4.92e-06, step=7982] Training: 80%|███████▉ | 7983/10000 [1:45:58<29:47, 1.13it/s, loss=0.0034, lr=4.91e-06, step=7983] Training: 80%|███████▉ | 7984/10000 [1:45:59<29:02, 1.16it/s, loss=0.0034, lr=4.91e-06, step=7983] Training: 80%|███████▉ | 7984/10000 [1:45:59<29:02, 1.16it/s, loss=0.0052, lr=4.91e-06, step=7984] Training: 80%|███████▉ | 7985/10000 [1:46:00<31:05, 1.08it/s, loss=0.0052, lr=4.91e-06, step=7984] Training: 80%|███████▉ | 7985/10000 [1:46:00<31:05, 1.08it/s, loss=0.0012, lr=4.91e-06, step=7985] Training: 80%|███████▉ | 7986/10000 [1:46:01<35:18, 1.05s/it, loss=0.0012, lr=4.91e-06, step=7985] Training: 80%|███████▉ | 7986/10000 [1:46:01<35:18, 1.05s/it, loss=0.0075, lr=4.91e-06, step=7986] Training: 80%|███████▉ | 7987/10000 [1:46:02<34:16, 1.02s/it, loss=0.0075, lr=4.91e-06, step=7986] Training: 80%|███████▉ | 7987/10000 [1:46:02<34:16, 1.02s/it, loss=0.0035, lr=4.90e-06, step=7987] Training: 80%|███████▉ | 7988/10000 [1:46:03<33:56, 1.01s/it, loss=0.0035, lr=4.90e-06, step=7987] Training: 80%|███████▉ | 7988/10000 [1:46:03<33:56, 1.01s/it, loss=0.0049, lr=4.90e-06, step=7988] Training: 80%|███████▉ | 7989/10000 [1:46:04<31:51, 1.05it/s, loss=0.0049, lr=4.90e-06, step=7988] Training: 80%|███████▉ | 7989/10000 [1:46:04<31:51, 1.05it/s, loss=0.0119, lr=4.90e-06, step=7989]20:30:36.879 [I] step=7990 loss=0.0029 smoothed_loss=0.0084 lr=4.91e-06 grad_norm=0.3800 step_time=0.6731s data_time=0.2505s it/s=1.083 eta_to_10000=1856.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0063 grad_action_out_proj_arms=0.0563 grad_arm_token_fuse=0.0314 grad_shared_expert=0.1535 (18633:train_pytorch.py:850) + Training: 80%|███████▉ | 7990/10000 [1:46:05<30:08, 1.11it/s, loss=0.0119, lr=4.90e-06, step=7989] Training: 80%|███████▉ | 7990/10000 [1:46:05<30:08, 1.11it/s, loss=0.0029, lr=4.90e-06, step=7990] Training: 80%|███████▉ | 7991/10000 [1:46:05<29:38, 1.13it/s, loss=0.0029, lr=4.90e-06, step=7990] Training: 80%|███████▉ | 7991/10000 [1:46:05<29:38, 1.13it/s, loss=0.0043, lr=4.90e-06, step=7991] Training: 80%|███████▉ | 7992/10000 [1:46:06<29:07, 1.15it/s, loss=0.0043, lr=4.90e-06, step=7991] Training: 80%|███████▉ | 7992/10000 [1:46:06<29:07, 1.15it/s, loss=0.0071, lr=4.89e-06, step=7992] Training: 80%|███████▉ | 7993/10000 [1:46:07<30:16, 1.10it/s, loss=0.0071, lr=4.89e-06, step=7992] Training: 80%|███████▉ | 7993/10000 [1:46:07<30:16, 1.10it/s, loss=0.0071, lr=4.89e-06, step=7993] Training: 80%|███████▉ | 7994/10000 [1:46:08<25:56, 1.29it/s, loss=0.0071, lr=4.89e-06, step=7993] Training: 80%|███████▉ | 7994/10000 [1:46:08<25:56, 1.29it/s, loss=0.0281, lr=4.89e-06, step=7994] Training: 80%|███████▉ | 7995/10000 [1:46:08<23:11, 1.44it/s, loss=0.0281, lr=4.89e-06, step=7994] Training: 80%|███████▉ | 7995/10000 [1:46:08<23:11, 1.44it/s, loss=0.0014, lr=4.89e-06, step=7995] Training: 80%|███████▉ | 7996/10000 [1:46:09<23:36, 1.42it/s, loss=0.0014, lr=4.89e-06, step=7995] Training: 80%|███████▉ | 7996/10000 [1:46:09<23:36, 1.42it/s, loss=0.0110, lr=4.88e-06, step=7996] Training: 80%|███████▉ | 7997/10000 [1:46:10<23:53, 1.40it/s, loss=0.0110, lr=4.88e-06, step=7996] Training: 80%|███████▉ | 7997/10000 [1:46:10<23:53, 1.40it/s, loss=0.0097, lr=4.88e-06, step=7997] Training: 80%|███████▉ | 7998/10000 [1:46:11<25:20, 1.32it/s, loss=0.0097, lr=4.88e-06, step=7997] Training: 80%|███████▉ | 7998/10000 [1:46:11<25:20, 1.32it/s, loss=0.0009, lr=4.88e-06, step=7998] Training: 80%|███████▉ | 7999/10000 [1:46:11<24:09, 1.38it/s, loss=0.0009, lr=4.88e-06, step=7998] Training: 80%|███████▉ | 7999/10000 [1:46:11<24:09, 1.38it/s, loss=0.0260, lr=4.88e-06, step=7999]20:30:44.380 [I] step=8000 loss=0.0058 smoothed_loss=0.0097 lr=4.88e-06 grad_norm=0.3771 step_time=0.5868s data_time=0.1633s it/s=1.333 eta_to_10000=1500.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0052 grad_action_out_proj_arms=0.0690 grad_arm_token_fuse=0.0264 grad_shared_expert=0.2813 (18633:train_pytorch.py:850) +20:33:46.138 [I] Saved checkpoint at step 8000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/8000 (18633:train_pytorch.py:350) + Training: 80%|████████ | 8000/10000 [1:49:14<30:43:12, 55.30s/it, loss=0.0260, lr=4.88e-06, step=7999] Training: 80%|████████ | 8000/10000 [1:49:14<30:43:12, 55.30s/it, loss=0.0058, lr=4.87e-06, step=8000] Training: 80%|████████ | 8001/10000 [1:49:14<21:36:22, 38.91s/it, loss=0.0058, lr=4.87e-06, step=8000] Training: 80%|████████ | 8001/10000 [1:49:14<21:36:22, 38.91s/it, loss=0.0108, lr=4.87e-06, step=8001] Training: 80%|████████ | 8002/10000 [1:49:15<15:15:04, 27.48s/it, loss=0.0108, lr=4.87e-06, step=8001] Training: 80%|████████ | 8002/10000 [1:49:15<15:15:04, 27.48s/it, loss=0.0122, lr=4.87e-06, step=8002] Training: 80%|████████ | 8003/10000 [1:49:16<10:49:48, 19.52s/it, loss=0.0122, lr=4.87e-06, step=8002] Training: 80%|████████ | 8003/10000 [1:49:16<10:49:48, 19.52s/it, loss=0.0014, lr=4.87e-06, step=8003] Training: 80%|████████ | 8004/10000 [1:49:17<7:42:35, 13.91s/it, loss=0.0014, lr=4.87e-06, step=8003] Training: 80%|████████ | 8004/10000 [1:49:17<7:42:35, 13.91s/it, loss=0.0013, lr=4.87e-06, step=8004] Training: 80%|████████ | 8005/10000 [1:49:18<5:33:11, 10.02s/it, loss=0.0013, lr=4.87e-06, step=8004] Training: 80%|████████ | 8005/10000 [1:49:18<5:33:11, 10.02s/it, loss=0.0139, lr=4.86e-06, step=8005] Training: 80%|████████ | 8006/10000 [1:49:19<4:04:55, 7.37s/it, loss=0.0139, lr=4.86e-06, step=8005] Training: 80%|████████ | 8006/10000 [1:49:19<4:04:55, 7.37s/it, loss=0.0082, lr=4.86e-06, step=8006] Training: 80%|████████ | 8007/10000 [1:49:20<2:58:33, 5.38s/it, loss=0.0082, lr=4.86e-06, step=8006] Training: 80%|████████ | 8007/10000 [1:49:20<2:58:33, 5.38s/it, loss=0.0042, lr=4.86e-06, step=8007] Training: 80%|████████ | 8008/10000 [1:49:21<2:12:42, 4.00s/it, loss=0.0042, lr=4.86e-06, step=8007] Training: 80%|████████ | 8008/10000 [1:49:21<2:12:42, 4.00s/it, loss=0.0107, lr=4.86e-06, step=8008] Training: 80%|████████ | 8009/10000 [1:49:21<1:38:53, 2.98s/it, loss=0.0107, lr=4.86e-06, step=8008] Training: 80%|████████ | 8009/10000 [1:49:21<1:38:53, 2.98s/it, loss=0.0420, lr=4.85e-06, step=8009]20:33:54.229 [I] step=8010 loss=0.0105 smoothed_loss=0.0118 lr=4.86e-06 grad_norm=0.4309 step_time=0.6118s data_time=18.3731s it/s=0.053 eta_to_10000=37779.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0138 grad_action_out_proj_arms=0.1298 grad_arm_token_fuse=0.0779 grad_shared_expert=0.4023 (18633:train_pytorch.py:850) + Training: 80%|████████ | 8010/10000 [1:49:22<1:15:08, 2.27s/it, loss=0.0420, lr=4.85e-06, step=8009] Training: 80%|████████ | 8010/10000 [1:49:22<1:15:08, 2.27s/it, loss=0.0105, lr=4.85e-06, step=8010] Training: 80%|████████ | 8011/10000 [1:49:23<1:01:28, 1.85s/it, loss=0.0105, lr=4.85e-06, step=8010] Training: 80%|████████ | 8011/10000 [1:49:23<1:01:28, 1.85s/it, loss=0.0013, lr=4.85e-06, step=8011] Training: 80%|████████ | 8012/10000 [1:49:23<49:53, 1.51s/it, loss=0.0013, lr=4.85e-06, step=8011] Training: 80%|████████ | 8012/10000 [1:49:23<49:53, 1.51s/it, loss=0.0030, lr=4.85e-06, step=8012] Training: 80%|████████ | 8013/10000 [1:49:24<43:37, 1.32s/it, loss=0.0030, lr=4.85e-06, step=8012] Training: 80%|████████ | 8013/10000 [1:49:24<43:37, 1.32s/it, loss=0.0031, lr=4.84e-06, step=8013] Training: 80%|████████ | 8014/10000 [1:49:25<39:22, 1.19s/it, loss=0.0031, lr=4.84e-06, step=8013] Training: 80%|████████ | 8014/10000 [1:49:25<39:22, 1.19s/it, loss=0.0110, lr=4.84e-06, step=8014] Training: 80%|████████ | 8015/10000 [1:49:26<36:31, 1.10s/it, loss=0.0110, lr=4.84e-06, step=8014] Training: 80%|████████ | 8015/10000 [1:49:26<36:31, 1.10s/it, loss=0.0020, lr=4.84e-06, step=8015] Training: 80%|████████ | 8016/10000 [1:49:27<35:57, 1.09s/it, loss=0.0020, lr=4.84e-06, step=8015] Training: 80%|████████ | 8016/10000 [1:49:27<35:57, 1.09s/it, loss=0.0020, lr=4.84e-06, step=8016] Training: 80%|████████ | 8017/10000 [1:49:28<30:04, 1.10it/s, loss=0.0020, lr=4.84e-06, step=8016] Training: 80%|████████ | 8017/10000 [1:49:28<30:04, 1.10it/s, loss=0.0165, lr=4.84e-06, step=8017] Training: 80%|████████ | 8018/10000 [1:49:28<26:22, 1.25it/s, loss=0.0165, lr=4.84e-06, step=8017] Training: 80%|████████ | 8018/10000 [1:49:28<26:22, 1.25it/s, loss=0.0081, lr=4.83e-06, step=8018] Training: 80%|████████ | 8019/10000 [1:49:29<23:52, 1.38it/s, loss=0.0081, lr=4.83e-06, step=8018] Training: 80%|████████ | 8019/10000 [1:49:29<23:52, 1.38it/s, loss=0.0165, lr=4.83e-06, step=8019]20:34:02.216 [I] step=8020 loss=0.0112 smoothed_loss=0.0097 lr=4.84e-06 grad_norm=0.4453 step_time=0.6059s data_time=0.1927s it/s=1.252 eta_to_10000=1581.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.0987 grad_arm_token_fuse=0.0554 grad_shared_expert=0.5058 (18633:train_pytorch.py:850) + Training: 80%|████████ | 8020/10000 [1:49:30<27:34, 1.20it/s, loss=0.0165, lr=4.83e-06, step=8019] Training: 80%|████████ | 8020/10000 [1:49:30<27:34, 1.20it/s, loss=0.0112, lr=4.83e-06, step=8020] Training: 80%|████████ | 8021/10000 [1:49:31<27:31, 1.20it/s, loss=0.0112, lr=4.83e-06, step=8020] Training: 80%|████████ | 8021/10000 [1:49:31<27:31, 1.20it/s, loss=0.0351, lr=4.83e-06, step=8021] Training: 80%|████████ | 8022/10000 [1:49:32<28:09, 1.17it/s, loss=0.0351, lr=4.83e-06, step=8021] Training: 80%|████████ | 8022/10000 [1:49:32<28:09, 1.17it/s, loss=0.0163, lr=4.82e-06, step=8022] Training: 80%|████████ | 8023/10000 [1:49:32<24:49, 1.33it/s, loss=0.0163, lr=4.82e-06, step=8022] Training: 80%|████████ | 8023/10000 [1:49:32<24:49, 1.33it/s, loss=0.0165, lr=4.82e-06, step=8023] Training: 80%|████████ | 8024/10000 [1:49:33<23:50, 1.38it/s, loss=0.0165, lr=4.82e-06, step=8023] Training: 80%|████████ | 8024/10000 [1:49:33<23:50, 1.38it/s, loss=0.0085, lr=4.82e-06, step=8024] Training: 80%|████████ | 8025/10000 [1:49:33<23:06, 1.42it/s, loss=0.0085, lr=4.82e-06, step=8024] Training: 80%|████████ | 8025/10000 [1:49:33<23:06, 1.42it/s, loss=0.0037, lr=4.82e-06, step=8025] Training: 80%|████████ | 8026/10000 [1:49:34<24:22, 1.35it/s, loss=0.0037, lr=4.82e-06, step=8025] Training: 80%|████████ | 8026/10000 [1:49:34<24:22, 1.35it/s, loss=0.0283, lr=4.82e-06, step=8026] Training: 80%|████████ | 8027/10000 [1:49:35<25:01, 1.31it/s, loss=0.0283, lr=4.82e-06, step=8026] Training: 80%|████████ | 8027/10000 [1:49:35<25:01, 1.31it/s, loss=0.0087, lr=4.81e-06, step=8027] Training: 80%|████████ | 8028/10000 [1:49:36<30:38, 1.07it/s, loss=0.0087, lr=4.81e-06, step=8027] Training: 80%|████████ | 8028/10000 [1:49:36<30:38, 1.07it/s, loss=0.0020, lr=4.81e-06, step=8028] Training: 80%|████████ | 8029/10000 [1:49:38<32:07, 1.02it/s, loss=0.0020, lr=4.81e-06, step=8028] Training: 80%|████████ | 8029/10000 [1:49:38<32:07, 1.02it/s, loss=0.0032, lr=4.81e-06, step=8029]20:34:10.745 [I] step=8030 loss=0.0028 smoothed_loss=0.0101 lr=4.82e-06 grad_norm=0.4434 step_time=0.6413s data_time=0.2116s it/s=1.173 eta_to_10000=1679.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0069 grad_action_out_proj_arms=0.0806 grad_arm_token_fuse=0.0349 grad_shared_expert=0.2460 (18633:train_pytorch.py:850) + Training: 80%|████████ | 8030/10000 [1:49:38<31:29, 1.04it/s, loss=0.0032, lr=4.81e-06, step=8029] Training: 80%|████████ | 8030/10000 [1:49:38<31:29, 1.04it/s, loss=0.0028, lr=4.81e-06, step=8030] Training: 80%|████████ | 8031/10000 [1:49:39<27:34, 1.19it/s, loss=0.0028, lr=4.81e-06, step=8030] Training: 80%|████████ | 8031/10000 [1:49:39<27:34, 1.19it/s, loss=0.0073, lr=4.80e-06, step=8031] Training: 80%|████████ | 8032/10000 [1:49:40<24:28, 1.34it/s, loss=0.0073, lr=4.80e-06, step=8031] Training: 80%|████████ | 8032/10000 [1:49:40<24:28, 1.34it/s, loss=0.0052, lr=4.80e-06, step=8032] Training: 80%|████████ | 8033/10000 [1:49:40<23:23, 1.40it/s, loss=0.0052, lr=4.80e-06, step=8032] Training: 80%|████████ | 8033/10000 [1:49:40<23:23, 1.40it/s, loss=0.0016, lr=4.80e-06, step=8033] Training: 80%|████████ | 8034/10000 [1:49:41<21:12, 1.54it/s, loss=0.0016, lr=4.80e-06, step=8033] Training: 80%|████████ | 8034/10000 [1:49:41<21:12, 1.54it/s, loss=0.0073, lr=4.80e-06, step=8034] Training: 80%|████████ | 8035/10000 [1:49:42<25:29, 1.28it/s, loss=0.0073, lr=4.80e-06, step=8034] Training: 80%|████████ | 8035/10000 [1:49:42<25:29, 1.28it/s, loss=0.0161, lr=4.80e-06, step=8035] Training: 80%|████████ | 8036/10000 [1:49:43<28:42, 1.14it/s, loss=0.0161, lr=4.80e-06, step=8035] Training: 80%|████████ | 8036/10000 [1:49:43<28:42, 1.14it/s, loss=0.0017, lr=4.79e-06, step=8036] Training: 80%|████████ | 8037/10000 [1:49:43<26:31, 1.23it/s, loss=0.0017, lr=4.79e-06, step=8036] Training: 80%|████████ | 8037/10000 [1:49:43<26:31, 1.23it/s, loss=0.0021, lr=4.79e-06, step=8037] Training: 80%|████████ | 8038/10000 [1:49:44<27:16, 1.20it/s, loss=0.0021, lr=4.79e-06, step=8037] Training: 80%|████████ | 8038/10000 [1:49:44<27:16, 1.20it/s, loss=0.0154, lr=4.79e-06, step=8038] Training: 80%|████████ | 8039/10000 [1:49:45<27:26, 1.19it/s, loss=0.0154, lr=4.79e-06, step=8038] Training: 80%|████████ | 8039/10000 [1:49:45<27:26, 1.19it/s, loss=0.0305, lr=4.79e-06, step=8039]20:34:18.383 [I] step=8040 loss=0.0031 smoothed_loss=0.0100 lr=4.79e-06 grad_norm=0.3729 step_time=0.5693s data_time=0.1945s it/s=1.309 eta_to_10000=1496.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.0767 grad_arm_token_fuse=0.0295 grad_shared_expert=0.2016 (18633:train_pytorch.py:850) + Training: 80%|████████ | 8040/10000 [1:49:46<27:24, 1.19it/s, loss=0.0305, lr=4.79e-06, step=8039] Training: 80%|████████ | 8040/10000 [1:49:46<27:24, 1.19it/s, loss=0.0031, lr=4.78e-06, step=8040] Training: 80%|████████ | 8041/10000 [1:49:47<26:13, 1.25it/s, loss=0.0031, lr=4.78e-06, step=8040] Training: 80%|████████ | 8041/10000 [1:49:47<26:13, 1.25it/s, loss=0.0094, lr=4.78e-06, step=8041] Training: 80%|████████ | 8042/10000 [1:49:48<29:42, 1.10it/s, loss=0.0094, lr=4.78e-06, step=8041] Training: 80%|████████ | 8042/10000 [1:49:48<29:42, 1.10it/s, loss=0.0172, lr=4.78e-06, step=8042] Training: 80%|████████ | 8043/10000 [1:49:49<31:55, 1.02it/s, loss=0.0172, lr=4.78e-06, step=8042] Training: 80%|████████ | 8043/10000 [1:49:49<31:55, 1.02it/s, loss=0.0062, lr=4.78e-06, step=8043] Training: 80%|████████ | 8044/10000 [1:49:50<32:43, 1.00s/it, loss=0.0062, lr=4.78e-06, step=8043] Training: 80%|████████ | 8044/10000 [1:49:50<32:43, 1.00s/it, loss=0.0058, lr=4.77e-06, step=8044] Training: 80%|████████ | 8045/10000 [1:49:51<31:30, 1.03it/s, loss=0.0058, lr=4.77e-06, step=8044] Training: 80%|████████ | 8045/10000 [1:49:51<31:30, 1.03it/s, loss=0.0057, lr=4.77e-06, step=8045] Training: 80%|████████ | 8046/10000 [1:49:52<31:59, 1.02it/s, loss=0.0057, lr=4.77e-06, step=8045] Training: 80%|████████ | 8046/10000 [1:49:52<31:59, 1.02it/s, loss=0.0050, lr=4.77e-06, step=8046] Training: 80%|████████ | 8047/10000 [1:49:53<29:23, 1.11it/s, loss=0.0050, lr=4.77e-06, step=8046] Training: 80%|████████ | 8047/10000 [1:49:53<29:23, 1.11it/s, loss=0.0048, lr=4.77e-06, step=8047] Training: 80%|████████ | 8048/10000 [1:49:53<25:13, 1.29it/s, loss=0.0048, lr=4.77e-06, step=8047] Training: 80%|████████ | 8048/10000 [1:49:53<25:13, 1.29it/s, loss=0.0153, lr=4.77e-06, step=8048] Training: 80%|████████ | 8049/10000 [1:49:54<26:14, 1.24it/s, loss=0.0153, lr=4.77e-06, step=8048] Training: 80%|████████ | 8049/10000 [1:49:54<26:14, 1.24it/s, loss=0.0072, lr=4.76e-06, step=8049]20:34:27.469 [I] step=8050 loss=0.0035 smoothed_loss=0.0085 lr=4.77e-06 grad_norm=0.4757 step_time=0.6693s data_time=0.2393s it/s=1.102 eta_to_10000=1769.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0045 grad_action_out_proj_arms=0.0447 grad_arm_token_fuse=0.0249 grad_shared_expert=0.1938 (18633:train_pytorch.py:850) + Training: 80%|████████ | 8050/10000 [1:49:55<28:18, 1.15it/s, loss=0.0072, lr=4.76e-06, step=8049] Training: 80%|████████ | 8050/10000 [1:49:55<28:18, 1.15it/s, loss=0.0035, lr=4.76e-06, step=8050] Training: 81%|████████ | 8051/10000 [1:49:56<24:24, 1.33it/s, loss=0.0035, lr=4.76e-06, step=8050] Training: 81%|████████ | 8051/10000 [1:49:56<24:24, 1.33it/s, loss=0.0053, lr=4.76e-06, step=8051] Training: 81%|████████ | 8052/10000 [1:49:56<22:31, 1.44it/s, loss=0.0053, lr=4.76e-06, step=8051] Training: 81%|████████ | 8052/10000 [1:49:56<22:31, 1.44it/s, loss=0.0040, lr=4.76e-06, step=8052] Training: 81%|████████ | 8053/10000 [1:49:57<20:41, 1.57it/s, loss=0.0040, lr=4.76e-06, step=8052] Training: 81%|████████ | 8053/10000 [1:49:57<20:41, 1.57it/s, loss=0.0033, lr=4.75e-06, step=8053] Training: 81%|████████ | 8054/10000 [1:49:58<22:44, 1.43it/s, loss=0.0033, lr=4.75e-06, step=8053] Training: 81%|████████ | 8054/10000 [1:49:58<22:44, 1.43it/s, loss=0.0092, lr=4.75e-06, step=8054] Training: 81%|████████ | 8055/10000 [1:49:58<21:20, 1.52it/s, loss=0.0092, lr=4.75e-06, step=8054] Training: 81%|████████ | 8055/10000 [1:49:58<21:20, 1.52it/s, loss=0.0035, lr=4.75e-06, step=8055] Training: 81%|████████ | 8056/10000 [1:49:59<22:12, 1.46it/s, loss=0.0035, lr=4.75e-06, step=8055] Training: 81%|████████ | 8056/10000 [1:49:59<22:12, 1.46it/s, loss=0.0083, lr=4.75e-06, step=8056] Training: 81%|████████ | 8057/10000 [1:50:00<23:16, 1.39it/s, loss=0.0083, lr=4.75e-06, step=8056] Training: 81%|████████ | 8057/10000 [1:50:00<23:16, 1.39it/s, loss=0.0125, lr=4.75e-06, step=8057] Training: 81%|████████ | 8058/10000 [1:50:00<23:18, 1.39it/s, loss=0.0125, lr=4.75e-06, step=8057] Training: 81%|████████ | 8058/10000 [1:50:00<23:18, 1.39it/s, loss=0.0247, lr=4.74e-06, step=8058] Training: 81%|████████ | 8059/10000 [1:50:01<25:07, 1.29it/s, loss=0.0247, lr=4.74e-06, step=8058] Training: 81%|████████ | 8059/10000 [1:50:01<25:07, 1.29it/s, loss=0.0026, lr=4.74e-06, step=8059]20:34:34.117 [I] step=8060 loss=0.0079 smoothed_loss=0.0087 lr=4.75e-06 grad_norm=0.3577 step_time=0.5352s data_time=0.1297s it/s=1.504 eta_to_10000=1289.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0586 grad_arm_token_fuse=0.0365 grad_shared_expert=0.2759 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8060/10000 [1:50:02<22:40, 1.43it/s, loss=0.0026, lr=4.74e-06, step=8059] Training: 81%|████████ | 8060/10000 [1:50:02<22:40, 1.43it/s, loss=0.0079, lr=4.74e-06, step=8060] Training: 81%|████████ | 8061/10000 [1:50:02<22:25, 1.44it/s, loss=0.0079, lr=4.74e-06, step=8060] Training: 81%|████████ | 8061/10000 [1:50:02<22:25, 1.44it/s, loss=0.0146, lr=4.74e-06, step=8061] Training: 81%|████████ | 8062/10000 [1:50:03<23:42, 1.36it/s, loss=0.0146, lr=4.74e-06, step=8061] Training: 81%|████████ | 8062/10000 [1:50:03<23:42, 1.36it/s, loss=0.0211, lr=4.73e-06, step=8062] Training: 81%|████████ | 8063/10000 [1:50:04<22:21, 1.44it/s, loss=0.0211, lr=4.73e-06, step=8062] Training: 81%|████████ | 8063/10000 [1:50:04<22:21, 1.44it/s, loss=0.0034, lr=4.73e-06, step=8063] Training: 81%|████████ | 8064/10000 [1:50:05<21:58, 1.47it/s, loss=0.0034, lr=4.73e-06, step=8063] Training: 81%|████████ | 8064/10000 [1:50:05<21:58, 1.47it/s, loss=0.0025, lr=4.73e-06, step=8064] Training: 81%|████████ | 8065/10000 [1:50:05<24:14, 1.33it/s, loss=0.0025, lr=4.73e-06, step=8064] Training: 81%|████████ | 8065/10000 [1:50:05<24:14, 1.33it/s, loss=0.0060, lr=4.73e-06, step=8065] Training: 81%|████████ | 8066/10000 [1:50:06<21:35, 1.49it/s, loss=0.0060, lr=4.73e-06, step=8065] Training: 81%|████████ | 8066/10000 [1:50:06<21:35, 1.49it/s, loss=0.0084, lr=4.73e-06, step=8066] Training: 81%|████████ | 8067/10000 [1:50:06<20:17, 1.59it/s, loss=0.0084, lr=4.73e-06, step=8066] Training: 81%|████████ | 8067/10000 [1:50:06<20:17, 1.59it/s, loss=0.0246, lr=4.72e-06, step=8067] Training: 81%|████████ | 8068/10000 [1:50:07<20:07, 1.60it/s, loss=0.0246, lr=4.72e-06, step=8067] Training: 81%|████████ | 8068/10000 [1:50:07<20:07, 1.60it/s, loss=0.0008, lr=4.72e-06, step=8068] Training: 81%|████████ | 8069/10000 [1:50:08<18:49, 1.71it/s, loss=0.0008, lr=4.72e-06, step=8068] Training: 81%|████████ | 8069/10000 [1:50:08<18:49, 1.71it/s, loss=0.0026, lr=4.72e-06, step=8069]20:34:40.440 [I] step=8070 loss=0.0223 smoothed_loss=0.0100 lr=4.73e-06 grad_norm=0.4949 step_time=0.5198s data_time=0.1125s it/s=1.582 eta_to_10000=1220.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.1248 grad_arm_token_fuse=0.0722 grad_shared_expert=0.4706 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8070/10000 [1:50:08<18:18, 1.76it/s, loss=0.0026, lr=4.72e-06, step=8069] Training: 81%|████████ | 8070/10000 [1:50:08<18:18, 1.76it/s, loss=0.0223, lr=4.72e-06, step=8070] Training: 81%|████████ | 8071/10000 [1:50:09<19:11, 1.67it/s, loss=0.0223, lr=4.72e-06, step=8070] Training: 81%|████████ | 8071/10000 [1:50:09<19:11, 1.67it/s, loss=0.0178, lr=4.71e-06, step=8071] Training: 81%|████████ | 8072/10000 [1:50:10<21:30, 1.49it/s, loss=0.0178, lr=4.71e-06, step=8071] Training: 81%|████████ | 8072/10000 [1:50:10<21:30, 1.49it/s, loss=0.0311, lr=4.71e-06, step=8072] Training: 81%|████████ | 8073/10000 [1:50:10<20:03, 1.60it/s, loss=0.0311, lr=4.71e-06, step=8072] Training: 81%|████████ | 8073/10000 [1:50:10<20:03, 1.60it/s, loss=0.0085, lr=4.71e-06, step=8073] Training: 81%|████████ | 8074/10000 [1:50:11<18:59, 1.69it/s, loss=0.0085, lr=4.71e-06, step=8073] Training: 81%|████████ | 8074/10000 [1:50:11<18:59, 1.69it/s, loss=0.0067, lr=4.71e-06, step=8074] Training: 81%|████████ | 8075/10000 [1:50:11<21:28, 1.49it/s, loss=0.0067, lr=4.71e-06, step=8074] Training: 81%|████████ | 8075/10000 [1:50:11<21:28, 1.49it/s, loss=0.0112, lr=4.71e-06, step=8075] Training: 81%|████████ | 8076/10000 [1:50:12<20:55, 1.53it/s, loss=0.0112, lr=4.71e-06, step=8075] Training: 81%|████████ | 8076/10000 [1:50:12<20:55, 1.53it/s, loss=0.0084, lr=4.70e-06, step=8076] Training: 81%|████████ | 8077/10000 [1:50:13<22:49, 1.40it/s, loss=0.0084, lr=4.70e-06, step=8076] Training: 81%|████████ | 8077/10000 [1:50:13<22:49, 1.40it/s, loss=0.0059, lr=4.70e-06, step=8077] Training: 81%|████████ | 8078/10000 [1:50:14<23:42, 1.35it/s, loss=0.0059, lr=4.70e-06, step=8077] Training: 81%|████████ | 8078/10000 [1:50:14<23:42, 1.35it/s, loss=0.0030, lr=4.70e-06, step=8078] Training: 81%|████████ | 8079/10000 [1:50:15<25:21, 1.26it/s, loss=0.0030, lr=4.70e-06, step=8078] Training: 81%|████████ | 8079/10000 [1:50:15<25:21, 1.26it/s, loss=0.0075, lr=4.70e-06, step=8079]20:34:47.753 [I] step=8080 loss=0.0068 smoothed_loss=0.0095 lr=4.70e-06 grad_norm=0.4542 step_time=0.5934s data_time=0.1379s it/s=1.368 eta_to_10000=1403.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0192 grad_action_out_proj_arms=0.1483 grad_arm_token_fuse=0.1042 grad_shared_expert=0.3982 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8080/10000 [1:50:15<24:52, 1.29it/s, loss=0.0075, lr=4.70e-06, step=8079] Training: 81%|████████ | 8080/10000 [1:50:15<24:52, 1.29it/s, loss=0.0068, lr=4.69e-06, step=8080] Training: 81%|████████ | 8081/10000 [1:50:16<22:11, 1.44it/s, loss=0.0068, lr=4.69e-06, step=8080] Training: 81%|████████ | 8081/10000 [1:50:16<22:11, 1.44it/s, loss=0.0055, lr=4.69e-06, step=8081] Training: 81%|████████ | 8082/10000 [1:50:16<20:20, 1.57it/s, loss=0.0055, lr=4.69e-06, step=8081] Training: 81%|████████ | 8082/10000 [1:50:16<20:20, 1.57it/s, loss=0.0083, lr=4.69e-06, step=8082] Training: 81%|████████ | 8083/10000 [1:50:17<19:47, 1.61it/s, loss=0.0083, lr=4.69e-06, step=8082] Training: 81%|████████ | 8083/10000 [1:50:17<19:47, 1.61it/s, loss=0.0229, lr=4.69e-06, step=8083] Training: 81%|████████ | 8084/10000 [1:50:18<19:14, 1.66it/s, loss=0.0229, lr=4.69e-06, step=8083] Training: 81%|████████ | 8084/10000 [1:50:18<19:14, 1.66it/s, loss=0.0039, lr=4.69e-06, step=8084] Training: 81%|████████ | 8085/10000 [1:50:18<18:47, 1.70it/s, loss=0.0039, lr=4.69e-06, step=8084] Training: 81%|████████ | 8085/10000 [1:50:18<18:47, 1.70it/s, loss=0.0018, lr=4.68e-06, step=8085] Training: 81%|████████ | 8086/10000 [1:50:19<19:49, 1.61it/s, loss=0.0018, lr=4.68e-06, step=8085] Training: 81%|████████ | 8086/10000 [1:50:19<19:49, 1.61it/s, loss=0.0147, lr=4.68e-06, step=8086] Training: 81%|████████ | 8087/10000 [1:50:20<21:01, 1.52it/s, loss=0.0147, lr=4.68e-06, step=8086] Training: 81%|████████ | 8087/10000 [1:50:20<21:01, 1.52it/s, loss=0.0017, lr=4.68e-06, step=8087] Training: 81%|████████ | 8088/10000 [1:50:20<22:53, 1.39it/s, loss=0.0017, lr=4.68e-06, step=8087] Training: 81%|████████ | 8088/10000 [1:50:20<22:53, 1.39it/s, loss=0.0051, lr=4.68e-06, step=8088] Training: 81%|████████ | 8089/10000 [1:50:21<21:30, 1.48it/s, loss=0.0051, lr=4.68e-06, step=8088] Training: 81%|████████ | 8089/10000 [1:50:21<21:30, 1.48it/s, loss=0.0030, lr=4.67e-06, step=8089]20:34:53.850 [I] step=8090 loss=0.0153 smoothed_loss=0.0086 lr=4.68e-06 grad_norm=0.4830 step_time=0.5100s data_time=0.0997s it/s=1.640 eta_to_10000=1164.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0078 grad_action_out_proj_arms=0.0718 grad_arm_token_fuse=0.0419 grad_shared_expert=0.6816 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8090/10000 [1:50:22<20:00, 1.59it/s, loss=0.0030, lr=4.67e-06, step=8089] Training: 81%|████████ | 8090/10000 [1:50:22<20:00, 1.59it/s, loss=0.0153, lr=4.67e-06, step=8090] Training: 81%|████████ | 8091/10000 [1:50:22<18:47, 1.69it/s, loss=0.0153, lr=4.67e-06, step=8090] Training: 81%|████████ | 8091/10000 [1:50:22<18:47, 1.69it/s, loss=0.0126, lr=4.67e-06, step=8091] Training: 81%|████████ | 8092/10000 [1:50:23<20:37, 1.54it/s, loss=0.0126, lr=4.67e-06, step=8091] Training: 81%|████████ | 8092/10000 [1:50:23<20:37, 1.54it/s, loss=0.0021, lr=4.67e-06, step=8092] Training: 81%|████████ | 8093/10000 [1:50:24<21:29, 1.48it/s, loss=0.0021, lr=4.67e-06, step=8092] Training: 81%|████████ | 8093/10000 [1:50:24<21:29, 1.48it/s, loss=0.0036, lr=4.67e-06, step=8093] Training: 81%|████████ | 8094/10000 [1:50:24<22:51, 1.39it/s, loss=0.0036, lr=4.67e-06, step=8093] Training: 81%|████████ | 8094/10000 [1:50:24<22:51, 1.39it/s, loss=0.0045, lr=4.66e-06, step=8094] Training: 81%|████████ | 8095/10000 [1:50:25<21:10, 1.50it/s, loss=0.0045, lr=4.66e-06, step=8094] Training: 81%|████████ | 8095/10000 [1:50:25<21:10, 1.50it/s, loss=0.0051, lr=4.66e-06, step=8095] Training: 81%|████████ | 8096/10000 [1:50:26<20:46, 1.53it/s, loss=0.0051, lr=4.66e-06, step=8095] Training: 81%|████████ | 8096/10000 [1:50:26<20:46, 1.53it/s, loss=0.0057, lr=4.66e-06, step=8096] Training: 81%|████████ | 8097/10000 [1:50:26<20:29, 1.55it/s, loss=0.0057, lr=4.66e-06, step=8096] Training: 81%|████████ | 8097/10000 [1:50:26<20:29, 1.55it/s, loss=0.0045, lr=4.66e-06, step=8097] Training: 81%|████████ | 8098/10000 [1:50:27<19:19, 1.64it/s, loss=0.0045, lr=4.66e-06, step=8097] Training: 81%|████████ | 8098/10000 [1:50:27<19:19, 1.64it/s, loss=0.0072, lr=4.66e-06, step=8098] Training: 81%|████████ | 8099/10000 [1:50:27<20:07, 1.57it/s, loss=0.0072, lr=4.66e-06, step=8098] Training: 81%|████████ | 8099/10000 [1:50:27<20:07, 1.57it/s, loss=0.0032, lr=4.65e-06, step=8099]20:35:00.463 [I] step=8100 loss=0.0089 smoothed_loss=0.0067 lr=4.66e-06 grad_norm=0.3695 step_time=0.5479s data_time=0.1133s it/s=1.512 eta_to_10000=1256.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0247 grad_action_out_proj_arms=0.1946 grad_arm_token_fuse=0.1382 grad_shared_expert=0.4772 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8100/10000 [1:50:28<21:11, 1.49it/s, loss=0.0032, lr=4.65e-06, step=8099] Training: 81%|████████ | 8100/10000 [1:50:28<21:11, 1.49it/s, loss=0.0089, lr=4.65e-06, step=8100] Training: 81%|████████ | 8101/10000 [1:50:29<21:39, 1.46it/s, loss=0.0089, lr=4.65e-06, step=8100] Training: 81%|████████ | 8101/10000 [1:50:29<21:39, 1.46it/s, loss=0.0342, lr=4.65e-06, step=8101] Training: 81%|████████ | 8102/10000 [1:50:30<21:20, 1.48it/s, loss=0.0342, lr=4.65e-06, step=8101] Training: 81%|████████ | 8102/10000 [1:50:30<21:20, 1.48it/s, loss=0.0040, lr=4.65e-06, step=8102] Training: 81%|████████ | 8103/10000 [1:50:30<23:32, 1.34it/s, loss=0.0040, lr=4.65e-06, step=8102] Training: 81%|████████ | 8103/10000 [1:50:30<23:32, 1.34it/s, loss=0.0039, lr=4.64e-06, step=8103] Training: 81%|████████ | 8104/10000 [1:50:31<21:18, 1.48it/s, loss=0.0039, lr=4.64e-06, step=8103] Training: 81%|████████ | 8104/10000 [1:50:31<21:18, 1.48it/s, loss=0.0036, lr=4.64e-06, step=8104] Training: 81%|████████ | 8105/10000 [1:50:32<23:42, 1.33it/s, loss=0.0036, lr=4.64e-06, step=8104] Training: 81%|████████ | 8105/10000 [1:50:32<23:42, 1.33it/s, loss=0.0026, lr=4.64e-06, step=8105] Training: 81%|████████ | 8106/10000 [1:50:32<22:36, 1.40it/s, loss=0.0026, lr=4.64e-06, step=8105] Training: 81%|████████ | 8106/10000 [1:50:32<22:36, 1.40it/s, loss=0.0017, lr=4.64e-06, step=8106] Training: 81%|████████ | 8107/10000 [1:50:33<22:21, 1.41it/s, loss=0.0017, lr=4.64e-06, step=8106] Training: 81%|████████ | 8107/10000 [1:50:33<22:21, 1.41it/s, loss=0.0076, lr=4.64e-06, step=8107] Training: 81%|████████ | 8108/10000 [1:50:34<22:12, 1.42it/s, loss=0.0076, lr=4.64e-06, step=8107] Training: 81%|████████ | 8108/10000 [1:50:34<22:12, 1.42it/s, loss=0.0105, lr=4.63e-06, step=8108] Training: 81%|████████ | 8109/10000 [1:50:34<20:44, 1.52it/s, loss=0.0105, lr=4.63e-06, step=8108] Training: 81%|████████ | 8109/10000 [1:50:34<20:44, 1.52it/s, loss=0.0074, lr=4.63e-06, step=8109]20:35:07.413 [I] step=8110 loss=0.0082 smoothed_loss=0.0074 lr=4.64e-06 grad_norm=0.4415 step_time=0.5579s data_time=0.1371s it/s=1.439 eta_to_10000=1313.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0076 grad_action_out_proj_arms=0.0627 grad_arm_token_fuse=0.0448 grad_shared_expert=0.4333 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8110/10000 [1:50:35<20:45, 1.52it/s, loss=0.0074, lr=4.63e-06, step=8109] Training: 81%|████████ | 8110/10000 [1:50:35<20:45, 1.52it/s, loss=0.0082, lr=4.63e-06, step=8110] Training: 81%|████████ | 8111/10000 [1:50:36<19:37, 1.60it/s, loss=0.0082, lr=4.63e-06, step=8110] Training: 81%|████████ | 8111/10000 [1:50:36<19:37, 1.60it/s, loss=0.0166, lr=4.63e-06, step=8111] Training: 81%|████████ | 8112/10000 [1:50:36<19:06, 1.65it/s, loss=0.0166, lr=4.63e-06, step=8111] Training: 81%|████████ | 8112/10000 [1:50:36<19:06, 1.65it/s, loss=0.0058, lr=4.62e-06, step=8112] Training: 81%|████████ | 8113/10000 [1:50:37<18:02, 1.74it/s, loss=0.0058, lr=4.62e-06, step=8112] Training: 81%|████████ | 8113/10000 [1:50:37<18:02, 1.74it/s, loss=0.0031, lr=4.62e-06, step=8113] Training: 81%|████████ | 8114/10000 [1:50:37<18:41, 1.68it/s, loss=0.0031, lr=4.62e-06, step=8113] Training: 81%|████████ | 8114/10000 [1:50:37<18:41, 1.68it/s, loss=0.0477, lr=4.62e-06, step=8114] Training: 81%|████████ | 8115/10000 [1:50:38<20:27, 1.54it/s, loss=0.0477, lr=4.62e-06, step=8114] Training: 81%|████████ | 8115/10000 [1:50:38<20:27, 1.54it/s, loss=0.0060, lr=4.62e-06, step=8115] Training: 81%|████████ | 8116/10000 [1:50:39<18:51, 1.66it/s, loss=0.0060, lr=4.62e-06, step=8115] Training: 81%|████████ | 8116/10000 [1:50:39<18:51, 1.66it/s, loss=0.0011, lr=4.62e-06, step=8116] Training: 81%|████████ | 8117/10000 [1:50:39<21:21, 1.47it/s, loss=0.0011, lr=4.62e-06, step=8116] Training: 81%|████████ | 8117/10000 [1:50:39<21:21, 1.47it/s, loss=0.0140, lr=4.61e-06, step=8117] Training: 81%|████████ | 8118/10000 [1:50:40<19:44, 1.59it/s, loss=0.0140, lr=4.61e-06, step=8117] Training: 81%|████████ | 8118/10000 [1:50:40<19:44, 1.59it/s, loss=0.0165, lr=4.61e-06, step=8118] Training: 81%|████████ | 8119/10000 [1:50:41<18:44, 1.67it/s, loss=0.0165, lr=4.61e-06, step=8118] Training: 81%|████████ | 8119/10000 [1:50:41<18:44, 1.67it/s, loss=0.0080, lr=4.61e-06, step=8119]20:35:13.365 [I] step=8120 loss=0.0037 smoothed_loss=0.0100 lr=4.62e-06 grad_norm=0.3897 step_time=0.5042s data_time=0.0910s it/s=1.680 eta_to_10000=1118.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0543 grad_arm_token_fuse=0.0235 grad_shared_expert=0.1683 (18633:train_pytorch.py:850) + Training: 81%|████████ | 8120/10000 [1:50:41<18:08, 1.73it/s, loss=0.0080, lr=4.61e-06, step=8119] Training: 81%|████████ | 8120/10000 [1:50:41<18:08, 1.73it/s, loss=0.0037, lr=4.61e-06, step=8120] Training: 81%|████████ | 8121/10000 [1:50:42<18:54, 1.66it/s, loss=0.0037, lr=4.61e-06, step=8120] Training: 81%|████████ | 8121/10000 [1:50:42<18:54, 1.66it/s, loss=0.0031, lr=4.61e-06, step=8121] Training: 81%|████████ | 8122/10000 [1:50:43<23:31, 1.33it/s, loss=0.0031, lr=4.61e-06, step=8121] Training: 81%|████████ | 8122/10000 [1:50:43<23:31, 1.33it/s, loss=0.0046, lr=4.60e-06, step=8122] Training: 81%|████████ | 8123/10000 [1:50:44<24:44, 1.26it/s, loss=0.0046, lr=4.60e-06, step=8122] Training: 81%|████████ | 8123/10000 [1:50:44<24:44, 1.26it/s, loss=0.0034, lr=4.60e-06, step=8123] Training: 81%|████████ | 8124/10000 [1:50:44<22:14, 1.41it/s, loss=0.0034, lr=4.60e-06, step=8123] Training: 81%|████████ | 8124/10000 [1:50:44<22:14, 1.41it/s, loss=0.0161, lr=4.60e-06, step=8124] Training: 81%|████████▏ | 8125/10000 [1:50:45<20:41, 1.51it/s, loss=0.0161, lr=4.60e-06, step=8124] Training: 81%|████████▏ | 8125/10000 [1:50:45<20:41, 1.51it/s, loss=0.0084, lr=4.60e-06, step=8125] Training: 81%|████████▏ | 8126/10000 [1:50:45<19:36, 1.59it/s, loss=0.0084, lr=4.60e-06, step=8125] Training: 81%|████████▏ | 8126/10000 [1:50:45<19:36, 1.59it/s, loss=0.0132, lr=4.59e-06, step=8126] Training: 81%|████████▏ | 8127/10000 [1:50:46<19:11, 1.63it/s, loss=0.0132, lr=4.59e-06, step=8126] Training: 81%|████████▏ | 8127/10000 [1:50:46<19:11, 1.63it/s, loss=0.0431, lr=4.59e-06, step=8127] Training: 81%|████████▏ | 8128/10000 [1:50:47<20:31, 1.52it/s, loss=0.0431, lr=4.59e-06, step=8127] Training: 81%|████████▏ | 8128/10000 [1:50:47<20:31, 1.52it/s, loss=0.0150, lr=4.59e-06, step=8128] Training: 81%|████████▏ | 8129/10000 [1:50:48<24:12, 1.29it/s, loss=0.0150, lr=4.59e-06, step=8128] Training: 81%|████████▏ | 8129/10000 [1:50:48<24:12, 1.29it/s, loss=0.0494, lr=4.59e-06, step=8129]20:35:20.709 [I] step=8130 loss=0.0086 smoothed_loss=0.0158 lr=4.60e-06 grad_norm=0.5038 step_time=0.6149s data_time=0.1195s it/s=1.362 eta_to_10000=1373.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0452 grad_action_out_proj_arms=0.2309 grad_arm_token_fuse=0.2599 grad_shared_expert=0.6653 (18633:train_pytorch.py:850) + Training: 81%|████████▏ | 8130/10000 [1:50:48<23:22, 1.33it/s, loss=0.0494, lr=4.59e-06, step=8129] Training: 81%|████████▏ | 8130/10000 [1:50:48<23:22, 1.33it/s, loss=0.0086, lr=4.59e-06, step=8130] Training: 81%|████████▏ | 8131/10000 [1:50:49<24:32, 1.27it/s, loss=0.0086, lr=4.59e-06, step=8130] Training: 81%|████████▏ | 8131/10000 [1:50:49<24:32, 1.27it/s, loss=0.0667, lr=4.58e-06, step=8131] Training: 81%|████████▏ | 8132/10000 [1:50:50<24:39, 1.26it/s, loss=0.0667, lr=4.58e-06, step=8131] Training: 81%|████████▏ | 8132/10000 [1:50:50<24:39, 1.26it/s, loss=0.0086, lr=4.58e-06, step=8132] Training: 81%|████████▏ | 8133/10000 [1:50:51<22:34, 1.38it/s, loss=0.0086, lr=4.58e-06, step=8132] Training: 81%|████████▏ | 8133/10000 [1:50:51<22:34, 1.38it/s, loss=0.0053, lr=4.58e-06, step=8133] Training: 81%|████████▏ | 8134/10000 [1:50:51<23:33, 1.32it/s, loss=0.0053, lr=4.58e-06, step=8133] Training: 81%|████████▏ | 8134/10000 [1:50:51<23:33, 1.32it/s, loss=0.0071, lr=4.58e-06, step=8134] Training: 81%|████████▏ | 8135/10000 [1:50:52<21:25, 1.45it/s, loss=0.0071, lr=4.58e-06, step=8134] Training: 81%|████████▏ | 8135/10000 [1:50:52<21:25, 1.45it/s, loss=0.0065, lr=4.57e-06, step=8135] Training: 81%|████████▏ | 8136/10000 [1:50:53<24:25, 1.27it/s, loss=0.0065, lr=4.57e-06, step=8135] Training: 81%|████████▏ | 8136/10000 [1:50:53<24:25, 1.27it/s, loss=0.0024, lr=4.57e-06, step=8136] Training: 81%|████████▏ | 8137/10000 [1:50:54<22:06, 1.40it/s, loss=0.0024, lr=4.57e-06, step=8136] Training: 81%|████████▏ | 8137/10000 [1:50:54<22:06, 1.40it/s, loss=0.0129, lr=4.57e-06, step=8137] Training: 81%|████████▏ | 8138/10000 [1:50:54<22:20, 1.39it/s, loss=0.0129, lr=4.57e-06, step=8137] Training: 81%|████████▏ | 8138/10000 [1:50:54<22:20, 1.39it/s, loss=0.0192, lr=4.57e-06, step=8138] Training: 81%|████████▏ | 8139/10000 [1:50:55<22:05, 1.40it/s, loss=0.0192, lr=4.57e-06, step=8138] Training: 81%|████████▏ | 8139/10000 [1:50:55<22:05, 1.40it/s, loss=0.0073, lr=4.57e-06, step=8139]20:35:28.076 [I] step=8140 loss=0.0017 smoothed_loss=0.0130 lr=4.57e-06 grad_norm=0.4172 step_time=0.5681s data_time=0.1685s it/s=1.358 eta_to_10000=1370.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0037 grad_action_out_proj_arms=0.0467 grad_arm_token_fuse=0.0199 grad_shared_expert=0.2264 (18633:train_pytorch.py:850) + Training: 81%|████████▏ | 8140/10000 [1:50:56<22:37, 1.37it/s, loss=0.0073, lr=4.57e-06, step=8139] Training: 81%|████████▏ | 8140/10000 [1:50:56<22:37, 1.37it/s, loss=0.0017, lr=4.56e-06, step=8140] Training: 81%|████████▏ | 8141/10000 [1:50:56<22:12, 1.39it/s, loss=0.0017, lr=4.56e-06, step=8140] Training: 81%|████████▏ | 8141/10000 [1:50:56<22:12, 1.39it/s, loss=0.0018, lr=4.56e-06, step=8141] Training: 81%|████████▏ | 8142/10000 [1:50:57<20:08, 1.54it/s, loss=0.0018, lr=4.56e-06, step=8141] Training: 81%|████████▏ | 8142/10000 [1:50:57<20:08, 1.54it/s, loss=0.0017, lr=4.56e-06, step=8142] Training: 81%|████████▏ | 8143/10000 [1:50:58<22:01, 1.41it/s, loss=0.0017, lr=4.56e-06, step=8142] Training: 81%|████████▏ | 8143/10000 [1:50:58<22:01, 1.41it/s, loss=0.0128, lr=4.56e-06, step=8143] Training: 81%|████████▏ | 8144/10000 [1:50:58<20:32, 1.51it/s, loss=0.0128, lr=4.56e-06, step=8143] Training: 81%|████████▏ | 8144/10000 [1:50:58<20:32, 1.51it/s, loss=0.0032, lr=4.56e-06, step=8144] Training: 81%|████████▏ | 8145/10000 [1:50:59<24:57, 1.24it/s, loss=0.0032, lr=4.56e-06, step=8144] Training: 81%|████████▏ | 8145/10000 [1:50:59<24:57, 1.24it/s, loss=0.0116, lr=4.55e-06, step=8145] Training: 81%|████████▏ | 8146/10000 [1:51:00<24:09, 1.28it/s, loss=0.0116, lr=4.55e-06, step=8145] Training: 81%|████████▏ | 8146/10000 [1:51:00<24:09, 1.28it/s, loss=0.0088, lr=4.55e-06, step=8146] Training: 81%|████████▏ | 8147/10000 [1:51:01<22:29, 1.37it/s, loss=0.0088, lr=4.55e-06, step=8146] Training: 81%|████████▏ | 8147/10000 [1:51:01<22:29, 1.37it/s, loss=0.0039, lr=4.55e-06, step=8147] Training: 81%|████████▏ | 8148/10000 [1:51:02<23:14, 1.33it/s, loss=0.0039, lr=4.55e-06, step=8147] Training: 81%|████████▏ | 8148/10000 [1:51:02<23:14, 1.33it/s, loss=0.0135, lr=4.55e-06, step=8148] Training: 81%|████████▏ | 8149/10000 [1:51:02<21:34, 1.43it/s, loss=0.0135, lr=4.55e-06, step=8148] Training: 81%|████████▏ | 8149/10000 [1:51:02<21:34, 1.43it/s, loss=0.0018, lr=4.54e-06, step=8149]20:35:35.574 [I] step=8150 loss=0.0096 smoothed_loss=0.0092 lr=4.55e-06 grad_norm=0.3270 step_time=0.5835s data_time=0.1663s it/s=1.334 eta_to_10000=1387.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0577 grad_arm_token_fuse=0.0294 grad_shared_expert=0.2588 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8150/10000 [1:51:03<24:52, 1.24it/s, loss=0.0018, lr=4.54e-06, step=8149] Training: 82%|████████▏ | 8150/10000 [1:51:03<24:52, 1.24it/s, loss=0.0096, lr=4.54e-06, step=8150] Training: 82%|████████▏ | 8151/10000 [1:51:04<22:40, 1.36it/s, loss=0.0096, lr=4.54e-06, step=8150] Training: 82%|████████▏ | 8151/10000 [1:51:04<22:40, 1.36it/s, loss=0.0017, lr=4.54e-06, step=8151] Training: 82%|████████▏ | 8152/10000 [1:51:05<24:48, 1.24it/s, loss=0.0017, lr=4.54e-06, step=8151] Training: 82%|████████▏ | 8152/10000 [1:51:05<24:48, 1.24it/s, loss=0.0031, lr=4.54e-06, step=8152] Training: 82%|████████▏ | 8153/10000 [1:51:06<26:23, 1.17it/s, loss=0.0031, lr=4.54e-06, step=8152] Training: 82%|████████▏ | 8153/10000 [1:51:06<26:23, 1.17it/s, loss=0.0051, lr=4.54e-06, step=8153] Training: 82%|████████▏ | 8154/10000 [1:51:06<25:08, 1.22it/s, loss=0.0051, lr=4.54e-06, step=8153] Training: 82%|████████▏ | 8154/10000 [1:51:06<25:08, 1.22it/s, loss=0.0012, lr=4.53e-06, step=8154] Training: 82%|████████▏ | 8155/10000 [1:51:07<23:07, 1.33it/s, loss=0.0012, lr=4.53e-06, step=8154] Training: 82%|████████▏ | 8155/10000 [1:51:07<23:07, 1.33it/s, loss=0.0017, lr=4.53e-06, step=8155] Training: 82%|████████▏ | 8156/10000 [1:51:08<20:48, 1.48it/s, loss=0.0017, lr=4.53e-06, step=8155] Training: 82%|████████▏ | 8156/10000 [1:51:08<20:48, 1.48it/s, loss=0.0017, lr=4.53e-06, step=8156] Training: 82%|████████▏ | 8157/10000 [1:51:08<21:01, 1.46it/s, loss=0.0017, lr=4.53e-06, step=8156] Training: 82%|████████▏ | 8157/10000 [1:51:08<21:01, 1.46it/s, loss=0.0035, lr=4.53e-06, step=8157] Training: 82%|████████▏ | 8158/10000 [1:51:09<21:39, 1.42it/s, loss=0.0035, lr=4.53e-06, step=8157] Training: 82%|████████▏ | 8158/10000 [1:51:09<21:39, 1.42it/s, loss=0.0059, lr=4.53e-06, step=8158] Training: 82%|████████▏ | 8159/10000 [1:51:10<19:37, 1.56it/s, loss=0.0059, lr=4.53e-06, step=8158] Training: 82%|████████▏ | 8159/10000 [1:51:10<19:37, 1.56it/s, loss=0.0174, lr=4.52e-06, step=8159]20:35:42.580 [I] step=8160 loss=0.0070 smoothed_loss=0.0069 lr=4.53e-06 grad_norm=0.3692 step_time=0.5591s data_time=0.1415s it/s=1.428 eta_to_10000=1288.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0206 grad_action_out_proj_arms=0.1774 grad_arm_token_fuse=0.1090 grad_shared_expert=0.5145 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8160/10000 [1:51:10<20:21, 1.51it/s, loss=0.0174, lr=4.52e-06, step=8159] Training: 82%|████████▏ | 8160/10000 [1:51:10<20:21, 1.51it/s, loss=0.0070, lr=4.52e-06, step=8160] Training: 82%|████████▏ | 8161/10000 [1:51:11<18:41, 1.64it/s, loss=0.0070, lr=4.52e-06, step=8160] Training: 82%|████████▏ | 8161/10000 [1:51:11<18:41, 1.64it/s, loss=0.0051, lr=4.52e-06, step=8161] Training: 82%|████████▏ | 8162/10000 [1:51:11<19:32, 1.57it/s, loss=0.0051, lr=4.52e-06, step=8161] Training: 82%|████████▏ | 8162/10000 [1:51:11<19:32, 1.57it/s, loss=0.0033, lr=4.52e-06, step=8162] Training: 82%|████████▏ | 8163/10000 [1:51:12<18:31, 1.65it/s, loss=0.0033, lr=4.52e-06, step=8162] Training: 82%|████████▏ | 8163/10000 [1:51:12<18:31, 1.65it/s, loss=0.0516, lr=4.51e-06, step=8163] Training: 82%|████████▏ | 8164/10000 [1:51:13<18:59, 1.61it/s, loss=0.0516, lr=4.51e-06, step=8163] Training: 82%|████████▏ | 8164/10000 [1:51:13<18:59, 1.61it/s, loss=0.0036, lr=4.51e-06, step=8164] Training: 82%|████████▏ | 8165/10000 [1:51:13<20:09, 1.52it/s, loss=0.0036, lr=4.51e-06, step=8164] Training: 82%|████████▏ | 8165/10000 [1:51:13<20:09, 1.52it/s, loss=0.0042, lr=4.51e-06, step=8165] Training: 82%|████████▏ | 8166/10000 [1:51:14<18:34, 1.65it/s, loss=0.0042, lr=4.51e-06, step=8165] Training: 82%|████████▏ | 8166/10000 [1:51:14<18:34, 1.65it/s, loss=0.0022, lr=4.51e-06, step=8166] Training: 82%|████████▏ | 8167/10000 [1:51:14<18:03, 1.69it/s, loss=0.0022, lr=4.51e-06, step=8166] Training: 82%|████████▏ | 8167/10000 [1:51:14<18:03, 1.69it/s, loss=0.0061, lr=4.51e-06, step=8167] Training: 82%|████████▏ | 8168/10000 [1:51:15<21:32, 1.42it/s, loss=0.0061, lr=4.51e-06, step=8167] Training: 82%|████████▏ | 8168/10000 [1:51:15<21:32, 1.42it/s, loss=0.0028, lr=4.50e-06, step=8168] Training: 82%|████████▏ | 8169/10000 [1:51:16<19:30, 1.56it/s, loss=0.0028, lr=4.50e-06, step=8168] Training: 82%|████████▏ | 8169/10000 [1:51:16<19:30, 1.56it/s, loss=0.0026, lr=4.50e-06, step=8169]20:35:49.046 [I] step=8170 loss=0.0061 smoothed_loss=0.0073 lr=4.51e-06 grad_norm=0.4206 step_time=0.5309s data_time=0.1157s it/s=1.547 eta_to_10000=1183.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0041 grad_action_out_proj_arms=0.0513 grad_arm_token_fuse=0.0215 grad_shared_expert=0.2099 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8170/10000 [1:51:17<21:24, 1.42it/s, loss=0.0026, lr=4.50e-06, step=8169] Training: 82%|████████▏ | 8170/10000 [1:51:17<21:24, 1.42it/s, loss=0.0061, lr=4.50e-06, step=8170] Training: 82%|████████▏ | 8171/10000 [1:51:18<23:43, 1.28it/s, loss=0.0061, lr=4.50e-06, step=8170] Training: 82%|████████▏ | 8171/10000 [1:51:18<23:43, 1.28it/s, loss=0.0068, lr=4.50e-06, step=8171] Training: 82%|████████▏ | 8172/10000 [1:51:19<25:06, 1.21it/s, loss=0.0068, lr=4.50e-06, step=8171] Training: 82%|████████▏ | 8172/10000 [1:51:19<25:06, 1.21it/s, loss=0.0114, lr=4.50e-06, step=8172] Training: 82%|████████▏ | 8173/10000 [1:51:19<22:19, 1.36it/s, loss=0.0114, lr=4.50e-06, step=8172] Training: 82%|████████▏ | 8173/10000 [1:51:19<22:19, 1.36it/s, loss=0.0173, lr=4.49e-06, step=8173] Training: 82%|████████▏ | 8174/10000 [1:51:20<20:06, 1.51it/s, loss=0.0173, lr=4.49e-06, step=8173] Training: 82%|████████▏ | 8174/10000 [1:51:20<20:06, 1.51it/s, loss=0.0107, lr=4.49e-06, step=8174] Training: 82%|████████▏ | 8175/10000 [1:51:21<22:09, 1.37it/s, loss=0.0107, lr=4.49e-06, step=8174] Training: 82%|████████▏ | 8175/10000 [1:51:21<22:09, 1.37it/s, loss=0.0201, lr=4.49e-06, step=8175] Training: 82%|████████▏ | 8176/10000 [1:51:21<23:21, 1.30it/s, loss=0.0201, lr=4.49e-06, step=8175] Training: 82%|████████▏ | 8176/10000 [1:51:21<23:21, 1.30it/s, loss=0.0067, lr=4.49e-06, step=8176] Training: 82%|████████▏ | 8177/10000 [1:51:22<21:35, 1.41it/s, loss=0.0067, lr=4.49e-06, step=8176] Training: 82%|████████▏ | 8177/10000 [1:51:22<21:35, 1.41it/s, loss=0.0251, lr=4.49e-06, step=8177] Training: 82%|████████▏ | 8178/10000 [1:51:23<24:09, 1.26it/s, loss=0.0251, lr=4.49e-06, step=8177] Training: 82%|████████▏ | 8178/10000 [1:51:23<24:09, 1.26it/s, loss=0.0202, lr=4.48e-06, step=8178] Training: 82%|████████▏ | 8179/10000 [1:51:24<26:53, 1.13it/s, loss=0.0202, lr=4.48e-06, step=8178] Training: 82%|████████▏ | 8179/10000 [1:51:24<26:53, 1.13it/s, loss=0.0010, lr=4.48e-06, step=8179]20:35:57.154 [I] step=8180 loss=0.0195 smoothed_loss=0.0118 lr=4.49e-06 grad_norm=0.5478 step_time=0.6453s data_time=0.1655s it/s=1.233 eta_to_10000=1475.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0216 grad_action_out_proj_arms=0.1245 grad_arm_token_fuse=0.1140 grad_shared_expert=0.6774 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8180/10000 [1:51:25<26:01, 1.17it/s, loss=0.0010, lr=4.48e-06, step=8179] Training: 82%|████████▏ | 8180/10000 [1:51:25<26:01, 1.17it/s, loss=0.0195, lr=4.48e-06, step=8180] Training: 82%|████████▏ | 8181/10000 [1:51:26<24:20, 1.25it/s, loss=0.0195, lr=4.48e-06, step=8180] Training: 82%|████████▏ | 8181/10000 [1:51:26<24:20, 1.25it/s, loss=0.0022, lr=4.48e-06, step=8181] Training: 82%|████████▏ | 8182/10000 [1:51:26<25:03, 1.21it/s, loss=0.0022, lr=4.48e-06, step=8181] Training: 82%|████████▏ | 8182/10000 [1:51:26<25:03, 1.21it/s, loss=0.0115, lr=4.47e-06, step=8182] Training: 82%|████████▏ | 8183/10000 [1:51:27<22:13, 1.36it/s, loss=0.0115, lr=4.47e-06, step=8182] Training: 82%|████████▏ | 8183/10000 [1:51:27<22:13, 1.36it/s, loss=0.0033, lr=4.47e-06, step=8183] Training: 82%|████████▏ | 8184/10000 [1:51:28<21:40, 1.40it/s, loss=0.0033, lr=4.47e-06, step=8183] Training: 82%|████████▏ | 8184/10000 [1:51:28<21:40, 1.40it/s, loss=0.0037, lr=4.47e-06, step=8184] Training: 82%|████████▏ | 8185/10000 [1:51:28<20:23, 1.48it/s, loss=0.0037, lr=4.47e-06, step=8184] Training: 82%|████████▏ | 8185/10000 [1:51:28<20:23, 1.48it/s, loss=0.0153, lr=4.47e-06, step=8185] Training: 82%|████████▏ | 8186/10000 [1:51:29<22:13, 1.36it/s, loss=0.0153, lr=4.47e-06, step=8185] Training: 82%|████████▏ | 8186/10000 [1:51:29<22:13, 1.36it/s, loss=0.0031, lr=4.47e-06, step=8186] Training: 82%|████████▏ | 8187/10000 [1:51:30<20:39, 1.46it/s, loss=0.0031, lr=4.47e-06, step=8186] Training: 82%|████████▏ | 8187/10000 [1:51:30<20:39, 1.46it/s, loss=0.0074, lr=4.46e-06, step=8187] Training: 82%|████████▏ | 8188/10000 [1:51:30<18:40, 1.62it/s, loss=0.0074, lr=4.46e-06, step=8187] Training: 82%|████████▏ | 8188/10000 [1:51:30<18:40, 1.62it/s, loss=0.0056, lr=4.46e-06, step=8188] Training: 82%|████████▏ | 8189/10000 [1:51:31<17:48, 1.70it/s, loss=0.0056, lr=4.46e-06, step=8188] Training: 82%|████████▏ | 8189/10000 [1:51:31<17:48, 1.70it/s, loss=0.0057, lr=4.46e-06, step=8189]20:36:03.874 [I] step=8190 loss=0.0042 smoothed_loss=0.0081 lr=4.47e-06 grad_norm=0.4320 step_time=0.5572s data_time=0.1147s it/s=1.489 eta_to_10000=1215.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0051 grad_action_out_proj_arms=0.0694 grad_arm_token_fuse=0.0269 grad_shared_expert=0.2673 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8190/10000 [1:51:32<21:09, 1.43it/s, loss=0.0057, lr=4.46e-06, step=8189] Training: 82%|████████▏ | 8190/10000 [1:51:32<21:09, 1.43it/s, loss=0.0042, lr=4.46e-06, step=8190] Training: 82%|████████▏ | 8191/10000 [1:51:32<20:00, 1.51it/s, loss=0.0042, lr=4.46e-06, step=8190] Training: 82%|████████▏ | 8191/10000 [1:51:32<20:00, 1.51it/s, loss=0.0070, lr=4.46e-06, step=8191] Training: 82%|████████▏ | 8192/10000 [1:51:33<18:22, 1.64it/s, loss=0.0070, lr=4.46e-06, step=8191] Training: 82%|████████▏ | 8192/10000 [1:51:33<18:22, 1.64it/s, loss=0.0098, lr=4.45e-06, step=8192] Training: 82%|████████▏ | 8193/10000 [1:51:33<19:10, 1.57it/s, loss=0.0098, lr=4.45e-06, step=8192] Training: 82%|████████▏ | 8193/10000 [1:51:33<19:10, 1.57it/s, loss=0.0265, lr=4.45e-06, step=8193] Training: 82%|████████▏ | 8194/10000 [1:51:34<17:52, 1.68it/s, loss=0.0265, lr=4.45e-06, step=8193] Training: 82%|████████▏ | 8194/10000 [1:51:34<17:52, 1.68it/s, loss=0.0838, lr=4.45e-06, step=8194] Training: 82%|████████▏ | 8195/10000 [1:51:34<18:47, 1.60it/s, loss=0.0838, lr=4.45e-06, step=8194] Training: 82%|████████▏ | 8195/10000 [1:51:34<18:47, 1.60it/s, loss=0.0078, lr=4.45e-06, step=8195] Training: 82%|████████▏ | 8196/10000 [1:51:35<19:10, 1.57it/s, loss=0.0078, lr=4.45e-06, step=8195] Training: 82%|████████▏ | 8196/10000 [1:51:35<19:10, 1.57it/s, loss=0.0079, lr=4.45e-06, step=8196] Training: 82%|████████▏ | 8197/10000 [1:51:36<18:52, 1.59it/s, loss=0.0079, lr=4.45e-06, step=8196] Training: 82%|████████▏ | 8197/10000 [1:51:36<18:52, 1.59it/s, loss=0.0134, lr=4.44e-06, step=8197] Training: 82%|████████▏ | 8198/10000 [1:51:36<19:29, 1.54it/s, loss=0.0134, lr=4.44e-06, step=8197] Training: 82%|████████▏ | 8198/10000 [1:51:36<19:29, 1.54it/s, loss=0.0173, lr=4.44e-06, step=8198] Training: 82%|████████▏ | 8199/10000 [1:51:37<17:50, 1.68it/s, loss=0.0173, lr=4.44e-06, step=8198] Training: 82%|████████▏ | 8199/10000 [1:51:37<17:50, 1.68it/s, loss=0.0007, lr=4.44e-06, step=8199]20:36:09.973 [I] step=8200 loss=0.0104 smoothed_loss=0.0137 lr=4.45e-06 grad_norm=0.4402 step_time=0.5244s data_time=0.0856s it/s=1.640 eta_to_10000=1097.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0731 grad_arm_token_fuse=0.0365 grad_shared_expert=0.3645 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8200/10000 [1:51:38<18:53, 1.59it/s, loss=0.0007, lr=4.44e-06, step=8199] Training: 82%|████████▏ | 8200/10000 [1:51:38<18:53, 1.59it/s, loss=0.0104, lr=4.44e-06, step=8200] Training: 82%|████████▏ | 8201/10000 [1:51:38<17:33, 1.71it/s, loss=0.0104, lr=4.44e-06, step=8200] Training: 82%|████████▏ | 8201/10000 [1:51:38<17:33, 1.71it/s, loss=0.0384, lr=4.43e-06, step=8201] Training: 82%|████████▏ | 8202/10000 [1:51:39<16:35, 1.81it/s, loss=0.0384, lr=4.43e-06, step=8201] Training: 82%|████████▏ | 8202/10000 [1:51:39<16:35, 1.81it/s, loss=0.0008, lr=4.43e-06, step=8202] Training: 82%|████████▏ | 8203/10000 [1:51:39<17:44, 1.69it/s, loss=0.0008, lr=4.43e-06, step=8202] Training: 82%|████████▏ | 8203/10000 [1:51:39<17:44, 1.69it/s, loss=0.0084, lr=4.43e-06, step=8203] Training: 82%|████████▏ | 8204/10000 [1:51:40<17:10, 1.74it/s, loss=0.0084, lr=4.43e-06, step=8203] Training: 82%|████████▏ | 8204/10000 [1:51:40<17:10, 1.74it/s, loss=0.0053, lr=4.43e-06, step=8204] Training: 82%|████████▏ | 8205/10000 [1:51:40<17:48, 1.68it/s, loss=0.0053, lr=4.43e-06, step=8204] Training: 82%|████████▏ | 8205/10000 [1:51:40<17:48, 1.68it/s, loss=0.0090, lr=4.43e-06, step=8205] Training: 82%|████████▏ | 8206/10000 [1:51:41<17:57, 1.67it/s, loss=0.0090, lr=4.43e-06, step=8205] Training: 82%|████████▏ | 8206/10000 [1:51:41<17:57, 1.67it/s, loss=0.0010, lr=4.42e-06, step=8206] Training: 82%|████████▏ | 8207/10000 [1:51:42<18:28, 1.62it/s, loss=0.0010, lr=4.42e-06, step=8206] Training: 82%|████████▏ | 8207/10000 [1:51:42<18:28, 1.62it/s, loss=0.0028, lr=4.42e-06, step=8207] Training: 82%|████████▏ | 8208/10000 [1:51:42<18:29, 1.62it/s, loss=0.0028, lr=4.42e-06, step=8207] Training: 82%|████████▏ | 8208/10000 [1:51:42<18:29, 1.62it/s, loss=0.0049, lr=4.42e-06, step=8208] Training: 82%|████████▏ | 8209/10000 [1:51:43<18:29, 1.61it/s, loss=0.0049, lr=4.42e-06, step=8208] Training: 82%|████████▏ | 8209/10000 [1:51:43<18:29, 1.61it/s, loss=0.0063, lr=4.42e-06, step=8209]20:36:15.943 [I] step=8210 loss=0.0040 smoothed_loss=0.0091 lr=4.43e-06 grad_norm=0.4173 step_time=0.5171s data_time=0.0799s it/s=1.675 eta_to_10000=1068.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0907 grad_arm_token_fuse=0.0408 grad_shared_expert=0.3357 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8210/10000 [1:51:44<18:37, 1.60it/s, loss=0.0063, lr=4.42e-06, step=8209] Training: 82%|████████▏ | 8210/10000 [1:51:44<18:37, 1.60it/s, loss=0.0040, lr=4.42e-06, step=8210] Training: 82%|████████▏ | 8211/10000 [1:51:44<18:13, 1.64it/s, loss=0.0040, lr=4.42e-06, step=8210] Training: 82%|████████▏ | 8211/10000 [1:51:44<18:13, 1.64it/s, loss=0.0008, lr=4.41e-06, step=8211] Training: 82%|████████▏ | 8212/10000 [1:51:45<18:25, 1.62it/s, loss=0.0008, lr=4.41e-06, step=8211] Training: 82%|████████▏ | 8212/10000 [1:51:45<18:25, 1.62it/s, loss=0.0121, lr=4.41e-06, step=8212] Training: 82%|████████▏ | 8213/10000 [1:51:45<17:22, 1.71it/s, loss=0.0121, lr=4.41e-06, step=8212] Training: 82%|████████▏ | 8213/10000 [1:51:45<17:22, 1.71it/s, loss=0.0091, lr=4.41e-06, step=8213] Training: 82%|████████▏ | 8214/10000 [1:51:46<17:46, 1.67it/s, loss=0.0091, lr=4.41e-06, step=8213] Training: 82%|████████▏ | 8214/10000 [1:51:46<17:46, 1.67it/s, loss=0.0013, lr=4.41e-06, step=8214] Training: 82%|████████▏ | 8215/10000 [1:51:47<18:27, 1.61it/s, loss=0.0013, lr=4.41e-06, step=8214] Training: 82%|████████▏ | 8215/10000 [1:51:47<18:27, 1.61it/s, loss=0.0087, lr=4.41e-06, step=8215] Training: 82%|████████▏ | 8216/10000 [1:51:47<17:11, 1.73it/s, loss=0.0087, lr=4.41e-06, step=8215] Training: 82%|████████▏ | 8216/10000 [1:51:47<17:11, 1.73it/s, loss=0.0049, lr=4.40e-06, step=8216] Training: 82%|████████▏ | 8217/10000 [1:51:48<16:25, 1.81it/s, loss=0.0049, lr=4.40e-06, step=8216] Training: 82%|████████▏ | 8217/10000 [1:51:48<16:25, 1.81it/s, loss=0.0064, lr=4.40e-06, step=8217] Training: 82%|████████▏ | 8218/10000 [1:51:48<16:01, 1.85it/s, loss=0.0064, lr=4.40e-06, step=8217] Training: 82%|████████▏ | 8218/10000 [1:51:48<16:01, 1.85it/s, loss=0.0017, lr=4.40e-06, step=8218] Training: 82%|████████▏ | 8219/10000 [1:51:49<16:23, 1.81it/s, loss=0.0017, lr=4.40e-06, step=8218] Training: 82%|████████▏ | 8219/10000 [1:51:49<16:23, 1.81it/s, loss=0.0117, lr=4.40e-06, step=8219]20:36:21.711 [I] step=8220 loss=0.0194 smoothed_loss=0.0087 lr=4.40e-06 grad_norm=0.4105 step_time=0.5061s data_time=0.0707s it/s=1.734 eta_to_10000=1026.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0154 grad_action_out_proj_arms=0.1703 grad_arm_token_fuse=0.0804 grad_shared_expert=0.7226 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8220/10000 [1:51:49<17:32, 1.69it/s, loss=0.0117, lr=4.40e-06, step=8219] Training: 82%|████████▏ | 8220/10000 [1:51:49<17:32, 1.69it/s, loss=0.0194, lr=4.40e-06, step=8220] Training: 82%|████████▏ | 8221/10000 [1:51:50<18:32, 1.60it/s, loss=0.0194, lr=4.40e-06, step=8220] Training: 82%|████████▏ | 8221/10000 [1:51:50<18:32, 1.60it/s, loss=0.0054, lr=4.39e-06, step=8221] Training: 82%|████████▏ | 8222/10000 [1:51:51<19:11, 1.54it/s, loss=0.0054, lr=4.39e-06, step=8221] Training: 82%|████████▏ | 8222/10000 [1:51:51<19:11, 1.54it/s, loss=0.0197, lr=4.39e-06, step=8222] Training: 82%|████████▏ | 8223/10000 [1:51:51<17:35, 1.68it/s, loss=0.0197, lr=4.39e-06, step=8222] Training: 82%|████████▏ | 8223/10000 [1:51:51<17:35, 1.68it/s, loss=0.0074, lr=4.39e-06, step=8223] Training: 82%|████████▏ | 8224/10000 [1:51:52<16:31, 1.79it/s, loss=0.0074, lr=4.39e-06, step=8223] Training: 82%|████████▏ | 8224/10000 [1:51:52<16:31, 1.79it/s, loss=0.0043, lr=4.39e-06, step=8224] Training: 82%|████████▏ | 8225/10000 [1:51:52<17:50, 1.66it/s, loss=0.0043, lr=4.39e-06, step=8224] Training: 82%|████████▏ | 8225/10000 [1:51:52<17:50, 1.66it/s, loss=0.0089, lr=4.39e-06, step=8225] Training: 82%|████████▏ | 8226/10000 [1:51:53<16:40, 1.77it/s, loss=0.0089, lr=4.39e-06, step=8225] Training: 82%|████████▏ | 8226/10000 [1:51:53<16:40, 1.77it/s, loss=0.0108, lr=4.38e-06, step=8226] Training: 82%|████████▏ | 8227/10000 [1:51:54<17:30, 1.69it/s, loss=0.0108, lr=4.38e-06, step=8226] Training: 82%|████████▏ | 8227/10000 [1:51:54<17:30, 1.69it/s, loss=0.0210, lr=4.38e-06, step=8227] Training: 82%|████████▏ | 8228/10000 [1:51:54<18:04, 1.63it/s, loss=0.0210, lr=4.38e-06, step=8227] Training: 82%|████████▏ | 8228/10000 [1:51:54<18:04, 1.63it/s, loss=0.0070, lr=4.38e-06, step=8228] Training: 82%|████████▏ | 8229/10000 [1:51:55<19:16, 1.53it/s, loss=0.0070, lr=4.38e-06, step=8228] Training: 82%|████████▏ | 8229/10000 [1:51:55<19:16, 1.53it/s, loss=0.0022, lr=4.38e-06, step=8229]20:36:27.821 [I] step=8230 loss=0.0038 smoothed_loss=0.0086 lr=4.38e-06 grad_norm=0.4568 step_time=0.5395s data_time=0.0715s it/s=1.637 eta_to_10000=1081.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0075 grad_action_out_proj_arms=0.0812 grad_arm_token_fuse=0.0402 grad_shared_expert=0.2119 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8230/10000 [1:51:55<18:02, 1.63it/s, loss=0.0022, lr=4.38e-06, step=8229] Training: 82%|████████▏ | 8230/10000 [1:51:55<18:02, 1.63it/s, loss=0.0038, lr=4.37e-06, step=8230] Training: 82%|████████▏ | 8231/10000 [1:51:56<17:03, 1.73it/s, loss=0.0038, lr=4.37e-06, step=8230] Training: 82%|████████▏ | 8231/10000 [1:51:56<17:03, 1.73it/s, loss=0.0063, lr=4.37e-06, step=8231] Training: 82%|████████▏ | 8232/10000 [1:51:57<17:58, 1.64it/s, loss=0.0063, lr=4.37e-06, step=8231] Training: 82%|████████▏ | 8232/10000 [1:51:57<17:58, 1.64it/s, loss=0.0035, lr=4.37e-06, step=8232] Training: 82%|████████▏ | 8233/10000 [1:51:57<16:55, 1.74it/s, loss=0.0035, lr=4.37e-06, step=8232] Training: 82%|████████▏ | 8233/10000 [1:51:57<16:55, 1.74it/s, loss=0.0103, lr=4.37e-06, step=8233] Training: 82%|████████▏ | 8234/10000 [1:51:58<17:43, 1.66it/s, loss=0.0103, lr=4.37e-06, step=8233] Training: 82%|████████▏ | 8234/10000 [1:51:58<17:43, 1.66it/s, loss=0.0175, lr=4.37e-06, step=8234] Training: 82%|████████▏ | 8235/10000 [1:51:58<16:42, 1.76it/s, loss=0.0175, lr=4.37e-06, step=8234] Training: 82%|████████▏ | 8235/10000 [1:51:58<16:42, 1.76it/s, loss=0.0024, lr=4.36e-06, step=8235] Training: 82%|████████▏ | 8236/10000 [1:51:59<17:39, 1.66it/s, loss=0.0024, lr=4.36e-06, step=8235] Training: 82%|████████▏ | 8236/10000 [1:51:59<17:39, 1.66it/s, loss=0.0097, lr=4.36e-06, step=8236] Training: 82%|████████▏ | 8237/10000 [1:51:59<16:37, 1.77it/s, loss=0.0097, lr=4.36e-06, step=8236] Training: 82%|████████▏ | 8237/10000 [1:51:59<16:37, 1.77it/s, loss=0.0141, lr=4.36e-06, step=8237] Training: 82%|████████▏ | 8238/10000 [1:52:00<15:46, 1.86it/s, loss=0.0141, lr=4.36e-06, step=8237] Training: 82%|████████▏ | 8238/10000 [1:52:00<15:46, 1.86it/s, loss=0.0108, lr=4.36e-06, step=8238] Training: 82%|████████▏ | 8239/10000 [1:52:00<15:10, 1.93it/s, loss=0.0108, lr=4.36e-06, step=8238] Training: 82%|████████▏ | 8239/10000 [1:52:00<15:10, 1.93it/s, loss=0.0053, lr=4.36e-06, step=8239]20:36:33.250 [I] step=8240 loss=0.0031 smoothed_loss=0.0083 lr=4.36e-06 grad_norm=0.3896 step_time=0.4752s data_time=0.0677s it/s=1.842 eta_to_10000=955.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0081 grad_action_out_proj_arms=0.0751 grad_arm_token_fuse=0.0420 grad_shared_expert=0.2647 (18633:train_pytorch.py:850) + Training: 82%|████████▏ | 8240/10000 [1:52:01<14:59, 1.96it/s, loss=0.0053, lr=4.36e-06, step=8239] Training: 82%|████████▏ | 8240/10000 [1:52:01<14:59, 1.96it/s, loss=0.0031, lr=4.35e-06, step=8240] Training: 82%|████████▏ | 8241/10000 [1:52:01<14:42, 1.99it/s, loss=0.0031, lr=4.35e-06, step=8240] Training: 82%|████████▏ | 8241/10000 [1:52:01<14:42, 1.99it/s, loss=0.0032, lr=4.35e-06, step=8241] Training: 82%|████████▏ | 8242/10000 [1:52:02<16:10, 1.81it/s, loss=0.0032, lr=4.35e-06, step=8241] Training: 82%|████████▏ | 8242/10000 [1:52:02<16:10, 1.81it/s, loss=0.0040, lr=4.35e-06, step=8242] Training: 82%|████████▏ | 8243/10000 [1:52:03<17:25, 1.68it/s, loss=0.0040, lr=4.35e-06, step=8242] Training: 82%|████████▏ | 8243/10000 [1:52:03<17:25, 1.68it/s, loss=0.0024, lr=4.35e-06, step=8243] Training: 82%|████████▏ | 8244/10000 [1:52:03<16:29, 1.77it/s, loss=0.0024, lr=4.35e-06, step=8243] Training: 82%|████████▏ | 8244/10000 [1:52:03<16:29, 1.77it/s, loss=0.0105, lr=4.35e-06, step=8244] Training: 82%|████████▏ | 8245/10000 [1:52:04<16:23, 1.78it/s, loss=0.0105, lr=4.35e-06, step=8244] Training: 82%|████████▏ | 8245/10000 [1:52:04<16:23, 1.78it/s, loss=0.0043, lr=4.34e-06, step=8245] Training: 82%|████████▏ | 8246/10000 [1:52:04<15:39, 1.87it/s, loss=0.0043, lr=4.34e-06, step=8245] Training: 82%|████████▏ | 8246/10000 [1:52:04<15:39, 1.87it/s, loss=0.0302, lr=4.34e-06, step=8246] Training: 82%|████████▏ | 8247/10000 [1:52:05<15:09, 1.93it/s, loss=0.0302, lr=4.34e-06, step=8246] Training: 82%|████████▏ | 8247/10000 [1:52:05<15:09, 1.93it/s, loss=0.0075, lr=4.34e-06, step=8247] Training: 82%|████████▏ | 8248/10000 [1:52:05<16:18, 1.79it/s, loss=0.0075, lr=4.34e-06, step=8247] Training: 82%|████████▏ | 8248/10000 [1:52:05<16:18, 1.79it/s, loss=0.1072, lr=4.34e-06, step=8248] Training: 82%|████████▏ | 8249/10000 [1:52:06<15:35, 1.87it/s, loss=0.1072, lr=4.34e-06, step=8248] Training: 82%|████████▏ | 8249/10000 [1:52:06<15:35, 1.87it/s, loss=0.0116, lr=4.34e-06, step=8249]20:36:38.917 [I] step=8250 loss=0.0035 smoothed_loss=0.0167 lr=4.34e-06 grad_norm=0.3837 step_time=0.5065s data_time=0.0602s it/s=1.765 eta_to_10000=991.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0146 grad_action_out_proj_arms=0.1121 grad_arm_token_fuse=0.0776 grad_shared_expert=0.3619 (18633:train_pytorch.py:850) + Training: 82%|████████▎ | 8250/10000 [1:52:07<16:58, 1.72it/s, loss=0.0116, lr=4.34e-06, step=8249] Training: 82%|████████▎ | 8250/10000 [1:52:07<16:58, 1.72it/s, loss=0.0035, lr=4.33e-06, step=8250] Training: 83%|████████▎ | 8251/10000 [1:52:07<16:16, 1.79it/s, loss=0.0035, lr=4.33e-06, step=8250] Training: 83%|████████▎ | 8251/10000 [1:52:07<16:16, 1.79it/s, loss=0.0035, lr=4.33e-06, step=8251] Training: 83%|████████▎ | 8252/10000 [1:52:08<15:47, 1.84it/s, loss=0.0035, lr=4.33e-06, step=8251] Training: 83%|████████▎ | 8252/10000 [1:52:08<15:47, 1.84it/s, loss=0.0069, lr=4.33e-06, step=8252] Training: 83%|████████▎ | 8253/10000 [1:52:08<15:16, 1.91it/s, loss=0.0069, lr=4.33e-06, step=8252] Training: 83%|████████▎ | 8253/10000 [1:52:08<15:16, 1.91it/s, loss=0.0046, lr=4.33e-06, step=8253] Training: 83%|████████▎ | 8254/10000 [1:52:09<14:52, 1.96it/s, loss=0.0046, lr=4.33e-06, step=8253] Training: 83%|████████▎ | 8254/10000 [1:52:09<14:52, 1.96it/s, loss=0.0040, lr=4.33e-06, step=8254] Training: 83%|████████▎ | 8255/10000 [1:52:09<16:06, 1.81it/s, loss=0.0040, lr=4.33e-06, step=8254] Training: 83%|████████▎ | 8255/10000 [1:52:09<16:06, 1.81it/s, loss=0.0073, lr=4.32e-06, step=8255] Training: 83%|████████▎ | 8256/10000 [1:52:10<15:53, 1.83it/s, loss=0.0073, lr=4.32e-06, step=8255] Training: 83%|████████▎ | 8256/10000 [1:52:10<15:53, 1.83it/s, loss=0.0201, lr=4.32e-06, step=8256] Training: 83%|████████▎ | 8257/10000 [1:52:11<18:45, 1.55it/s, loss=0.0201, lr=4.32e-06, step=8256] Training: 83%|████████▎ | 8257/10000 [1:52:11<18:45, 1.55it/s, loss=0.0038, lr=4.32e-06, step=8257] Training: 83%|████████▎ | 8258/10000 [1:52:11<17:23, 1.67it/s, loss=0.0038, lr=4.32e-06, step=8257] Training: 83%|████████▎ | 8258/10000 [1:52:11<17:23, 1.67it/s, loss=0.0081, lr=4.32e-06, step=8258] Training: 83%|████████▎ | 8259/10000 [1:52:12<16:26, 1.77it/s, loss=0.0081, lr=4.32e-06, step=8258] Training: 83%|████████▎ | 8259/10000 [1:52:12<16:26, 1.77it/s, loss=0.0066, lr=4.32e-06, step=8259]20:36:44.441 [I] step=8260 loss=0.0041 smoothed_loss=0.0104 lr=4.32e-06 grad_norm=0.4015 step_time=0.4800s data_time=0.0723s it/s=1.811 eta_to_10000=961.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0095 grad_action_out_proj_arms=0.1489 grad_arm_token_fuse=0.0540 grad_shared_expert=0.3344 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8260/10000 [1:52:12<15:56, 1.82it/s, loss=0.0066, lr=4.32e-06, step=8259] Training: 83%|████████▎ | 8260/10000 [1:52:12<15:56, 1.82it/s, loss=0.0041, lr=4.31e-06, step=8260] Training: 83%|████████▎ | 8261/10000 [1:52:13<15:31, 1.87it/s, loss=0.0041, lr=4.31e-06, step=8260] Training: 83%|████████▎ | 8261/10000 [1:52:13<15:31, 1.87it/s, loss=0.0223, lr=4.31e-06, step=8261] Training: 83%|████████▎ | 8262/10000 [1:52:13<16:18, 1.78it/s, loss=0.0223, lr=4.31e-06, step=8261] Training: 83%|████████▎ | 8262/10000 [1:52:13<16:18, 1.78it/s, loss=0.0103, lr=4.31e-06, step=8262] Training: 83%|████████▎ | 8263/10000 [1:52:14<15:29, 1.87it/s, loss=0.0103, lr=4.31e-06, step=8262] Training: 83%|████████▎ | 8263/10000 [1:52:14<15:29, 1.87it/s, loss=0.0051, lr=4.31e-06, step=8263] Training: 83%|████████▎ | 8264/10000 [1:52:15<17:52, 1.62it/s, loss=0.0051, lr=4.31e-06, step=8263] Training: 83%|████████▎ | 8264/10000 [1:52:15<17:52, 1.62it/s, loss=0.0513, lr=4.31e-06, step=8264] Training: 83%|████████▎ | 8265/10000 [1:52:15<18:42, 1.55it/s, loss=0.0513, lr=4.31e-06, step=8264] Training: 83%|████████▎ | 8265/10000 [1:52:15<18:42, 1.55it/s, loss=0.0069, lr=4.30e-06, step=8265] Training: 83%|████████▎ | 8266/10000 [1:52:16<17:20, 1.67it/s, loss=0.0069, lr=4.30e-06, step=8265] Training: 83%|████████▎ | 8266/10000 [1:52:16<17:20, 1.67it/s, loss=0.0405, lr=4.30e-06, step=8266] Training: 83%|████████▎ | 8267/10000 [1:52:16<16:19, 1.77it/s, loss=0.0405, lr=4.30e-06, step=8266] Training: 83%|████████▎ | 8267/10000 [1:52:16<16:19, 1.77it/s, loss=0.0046, lr=4.30e-06, step=8267] Training: 83%|████████▎ | 8268/10000 [1:52:17<15:27, 1.87it/s, loss=0.0046, lr=4.30e-06, step=8267] Training: 83%|████████▎ | 8268/10000 [1:52:17<15:27, 1.87it/s, loss=0.0089, lr=4.30e-06, step=8268] Training: 83%|████████▎ | 8269/10000 [1:52:17<16:20, 1.77it/s, loss=0.0089, lr=4.30e-06, step=8268] Training: 83%|████████▎ | 8269/10000 [1:52:17<16:20, 1.77it/s, loss=0.0049, lr=4.30e-06, step=8269]20:36:50.204 [I] step=8270 loss=0.0047 smoothed_loss=0.0129 lr=4.30e-06 grad_norm=0.4335 step_time=0.5055s data_time=0.0709s it/s=1.735 eta_to_10000=997.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0136 grad_action_out_proj_arms=0.0965 grad_arm_token_fuse=0.0720 grad_shared_expert=0.4575 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8270/10000 [1:52:18<16:16, 1.77it/s, loss=0.0049, lr=4.30e-06, step=8269] Training: 83%|████████▎ | 8270/10000 [1:52:18<16:16, 1.77it/s, loss=0.0047, lr=4.29e-06, step=8270] Training: 83%|████████▎ | 8271/10000 [1:52:19<17:14, 1.67it/s, loss=0.0047, lr=4.29e-06, step=8270] Training: 83%|████████▎ | 8271/10000 [1:52:19<17:14, 1.67it/s, loss=0.0054, lr=4.29e-06, step=8271] Training: 83%|████████▎ | 8272/10000 [1:52:19<17:50, 1.61it/s, loss=0.0054, lr=4.29e-06, step=8271] Training: 83%|████████▎ | 8272/10000 [1:52:19<17:50, 1.61it/s, loss=0.0054, lr=4.29e-06, step=8272] Training: 83%|████████▎ | 8273/10000 [1:52:20<16:36, 1.73it/s, loss=0.0054, lr=4.29e-06, step=8272] Training: 83%|████████▎ | 8273/10000 [1:52:20<16:36, 1.73it/s, loss=0.0031, lr=4.29e-06, step=8273] Training: 83%|████████▎ | 8274/10000 [1:52:20<15:42, 1.83it/s, loss=0.0031, lr=4.29e-06, step=8273] Training: 83%|████████▎ | 8274/10000 [1:52:20<15:42, 1.83it/s, loss=0.0239, lr=4.29e-06, step=8274] Training: 83%|████████▎ | 8275/10000 [1:52:21<15:10, 1.89it/s, loss=0.0239, lr=4.29e-06, step=8274] Training: 83%|████████▎ | 8275/10000 [1:52:21<15:10, 1.89it/s, loss=0.0024, lr=4.28e-06, step=8275] Training: 83%|████████▎ | 8276/10000 [1:52:21<16:10, 1.78it/s, loss=0.0024, lr=4.28e-06, step=8275] Training: 83%|████████▎ | 8276/10000 [1:52:21<16:10, 1.78it/s, loss=0.0014, lr=4.28e-06, step=8276] Training: 83%|████████▎ | 8277/10000 [1:52:22<15:33, 1.85it/s, loss=0.0014, lr=4.28e-06, step=8276] Training: 83%|████████▎ | 8277/10000 [1:52:22<15:33, 1.85it/s, loss=0.0033, lr=4.28e-06, step=8277] Training: 83%|████████▎ | 8278/10000 [1:52:22<16:40, 1.72it/s, loss=0.0033, lr=4.28e-06, step=8277] Training: 83%|████████▎ | 8278/10000 [1:52:22<16:40, 1.72it/s, loss=0.0054, lr=4.28e-06, step=8278] Training: 83%|████████▎ | 8279/10000 [1:52:23<18:03, 1.59it/s, loss=0.0054, lr=4.28e-06, step=8278] Training: 83%|████████▎ | 8279/10000 [1:52:23<18:03, 1.59it/s, loss=0.0017, lr=4.28e-06, step=8279]20:36:56.076 [I] step=8280 loss=0.0021 smoothed_loss=0.0076 lr=4.28e-06 grad_norm=0.3813 step_time=0.5261s data_time=0.0611s it/s=1.703 eta_to_10000=1009.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0044 grad_action_out_proj_arms=0.0521 grad_arm_token_fuse=0.0196 grad_shared_expert=0.2249 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8280/10000 [1:52:24<17:14, 1.66it/s, loss=0.0017, lr=4.28e-06, step=8279] Training: 83%|████████▎ | 8280/10000 [1:52:24<17:14, 1.66it/s, loss=0.0021, lr=4.27e-06, step=8280] Training: 83%|████████▎ | 8281/10000 [1:52:24<16:15, 1.76it/s, loss=0.0021, lr=4.27e-06, step=8280] Training: 83%|████████▎ | 8281/10000 [1:52:24<16:15, 1.76it/s, loss=0.0021, lr=4.27e-06, step=8281] Training: 83%|████████▎ | 8282/10000 [1:52:25<15:30, 1.85it/s, loss=0.0021, lr=4.27e-06, step=8281] Training: 83%|████████▎ | 8282/10000 [1:52:25<15:30, 1.85it/s, loss=0.0114, lr=4.27e-06, step=8282] Training: 83%|████████▎ | 8283/10000 [1:52:25<15:02, 1.90it/s, loss=0.0114, lr=4.27e-06, step=8282] Training: 83%|████████▎ | 8283/10000 [1:52:25<15:02, 1.90it/s, loss=0.0038, lr=4.27e-06, step=8283] Training: 83%|████████▎ | 8284/10000 [1:52:26<16:02, 1.78it/s, loss=0.0038, lr=4.27e-06, step=8283] Training: 83%|████████▎ | 8284/10000 [1:52:26<16:02, 1.78it/s, loss=0.0008, lr=4.27e-06, step=8284] Training: 83%|████████▎ | 8285/10000 [1:52:26<15:26, 1.85it/s, loss=0.0008, lr=4.27e-06, step=8284] Training: 83%|████████▎ | 8285/10000 [1:52:26<15:26, 1.85it/s, loss=0.0035, lr=4.26e-06, step=8285] Training: 83%|████████▎ | 8286/10000 [1:52:27<16:53, 1.69it/s, loss=0.0035, lr=4.26e-06, step=8285] Training: 83%|████████▎ | 8286/10000 [1:52:27<16:53, 1.69it/s, loss=0.0047, lr=4.26e-06, step=8286] Training: 83%|████████▎ | 8287/10000 [1:52:28<15:57, 1.79it/s, loss=0.0047, lr=4.26e-06, step=8286] Training: 83%|████████▎ | 8287/10000 [1:52:28<15:57, 1.79it/s, loss=0.0030, lr=4.26e-06, step=8287] Training: 83%|████████▎ | 8288/10000 [1:52:28<15:08, 1.88it/s, loss=0.0030, lr=4.26e-06, step=8287] Training: 83%|████████▎ | 8288/10000 [1:52:28<15:08, 1.88it/s, loss=0.0009, lr=4.26e-06, step=8288] Training: 83%|████████▎ | 8289/10000 [1:52:28<14:47, 1.93it/s, loss=0.0009, lr=4.26e-06, step=8288] Training: 83%|████████▎ | 8289/10000 [1:52:28<14:47, 1.93it/s, loss=0.0031, lr=4.26e-06, step=8289]20:37:01.482 [I] step=8290 loss=0.0016 smoothed_loss=0.0047 lr=4.26e-06 grad_norm=0.3508 step_time=0.4813s data_time=0.0593s it/s=1.850 eta_to_10000=924.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0059 grad_action_out_proj_arms=0.0544 grad_arm_token_fuse=0.0281 grad_shared_expert=0.1998 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8290/10000 [1:52:29<16:01, 1.78it/s, loss=0.0031, lr=4.26e-06, step=8289] Training: 83%|████████▎ | 8290/10000 [1:52:29<16:01, 1.78it/s, loss=0.0016, lr=4.25e-06, step=8290] Training: 83%|████████▎ | 8291/10000 [1:52:30<15:21, 1.85it/s, loss=0.0016, lr=4.25e-06, step=8290] Training: 83%|████████▎ | 8291/10000 [1:52:30<15:21, 1.85it/s, loss=0.0422, lr=4.25e-06, step=8291] Training: 83%|████████▎ | 8292/10000 [1:52:30<14:51, 1.92it/s, loss=0.0422, lr=4.25e-06, step=8291] Training: 83%|████████▎ | 8292/10000 [1:52:30<14:51, 1.92it/s, loss=0.0074, lr=4.25e-06, step=8292] Training: 83%|████████▎ | 8293/10000 [1:52:31<16:24, 1.73it/s, loss=0.0074, lr=4.25e-06, step=8292] Training: 83%|████████▎ | 8293/10000 [1:52:31<16:24, 1.73it/s, loss=0.0065, lr=4.25e-06, step=8293] Training: 83%|████████▎ | 8294/10000 [1:52:31<15:41, 1.81it/s, loss=0.0065, lr=4.25e-06, step=8293] Training: 83%|████████▎ | 8294/10000 [1:52:31<15:41, 1.81it/s, loss=0.0135, lr=4.25e-06, step=8294] Training: 83%|████████▎ | 8295/10000 [1:52:32<15:12, 1.87it/s, loss=0.0135, lr=4.25e-06, step=8294] Training: 83%|████████▎ | 8295/10000 [1:52:32<15:12, 1.87it/s, loss=0.0165, lr=4.24e-06, step=8295] Training: 83%|████████▎ | 8296/10000 [1:52:32<15:01, 1.89it/s, loss=0.0165, lr=4.24e-06, step=8295] Training: 83%|████████▎ | 8296/10000 [1:52:32<15:01, 1.89it/s, loss=0.0101, lr=4.24e-06, step=8296] Training: 83%|████████▎ | 8297/10000 [1:52:33<16:06, 1.76it/s, loss=0.0101, lr=4.24e-06, step=8296] Training: 83%|████████▎ | 8297/10000 [1:52:33<16:06, 1.76it/s, loss=0.0110, lr=4.24e-06, step=8297] Training: 83%|████████▎ | 8298/10000 [1:52:33<15:18, 1.85it/s, loss=0.0110, lr=4.24e-06, step=8297] Training: 83%|████████▎ | 8298/10000 [1:52:33<15:18, 1.85it/s, loss=0.0103, lr=4.24e-06, step=8298] Training: 83%|████████▎ | 8299/10000 [1:52:34<15:27, 1.83it/s, loss=0.0103, lr=4.24e-06, step=8298] Training: 83%|████████▎ | 8299/10000 [1:52:34<15:27, 1.83it/s, loss=0.0074, lr=4.24e-06, step=8299]20:37:07.393 [I] step=8300 loss=0.0565 smoothed_loss=0.0142 lr=4.24e-06 grad_norm=0.4685 step_time=0.5199s data_time=0.0712s it/s=1.692 eta_to_10000=1004.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0258 grad_action_out_proj_arms=0.1741 grad_arm_token_fuse=0.1327 grad_shared_expert=0.4978 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8300/10000 [1:52:35<19:41, 1.44it/s, loss=0.0074, lr=4.24e-06, step=8299] Training: 83%|████████▎ | 8300/10000 [1:52:35<19:41, 1.44it/s, loss=0.0565, lr=4.23e-06, step=8300] Training: 83%|████████▎ | 8301/10000 [1:52:36<18:14, 1.55it/s, loss=0.0565, lr=4.23e-06, step=8300] Training: 83%|████████▎ | 8301/10000 [1:52:36<18:14, 1.55it/s, loss=0.0017, lr=4.23e-06, step=8301] Training: 83%|████████▎ | 8302/10000 [1:52:36<16:55, 1.67it/s, loss=0.0017, lr=4.23e-06, step=8301] Training: 83%|████████▎ | 8302/10000 [1:52:36<16:55, 1.67it/s, loss=0.0022, lr=4.23e-06, step=8302] Training: 83%|████████▎ | 8303/10000 [1:52:37<15:55, 1.78it/s, loss=0.0022, lr=4.23e-06, step=8302] Training: 83%|████████▎ | 8303/10000 [1:52:37<15:55, 1.78it/s, loss=0.0074, lr=4.23e-06, step=8303] Training: 83%|████████▎ | 8304/10000 [1:52:37<16:09, 1.75it/s, loss=0.0074, lr=4.23e-06, step=8303] Training: 83%|████████▎ | 8304/10000 [1:52:37<16:09, 1.75it/s, loss=0.0577, lr=4.23e-06, step=8304] Training: 83%|████████▎ | 8305/10000 [1:52:38<18:06, 1.56it/s, loss=0.0577, lr=4.23e-06, step=8304] Training: 83%|████████▎ | 8305/10000 [1:52:38<18:06, 1.56it/s, loss=0.0049, lr=4.22e-06, step=8305] Training: 83%|████████▎ | 8306/10000 [1:52:38<16:52, 1.67it/s, loss=0.0049, lr=4.22e-06, step=8305] Training: 83%|████████▎ | 8306/10000 [1:52:38<16:52, 1.67it/s, loss=0.0232, lr=4.22e-06, step=8306] Training: 83%|████████▎ | 8307/10000 [1:52:39<18:18, 1.54it/s, loss=0.0232, lr=4.22e-06, step=8306] Training: 83%|████████▎ | 8307/10000 [1:52:39<18:18, 1.54it/s, loss=0.0389, lr=4.22e-06, step=8307] Training: 83%|████████▎ | 8308/10000 [1:52:40<19:06, 1.48it/s, loss=0.0389, lr=4.22e-06, step=8307] Training: 83%|████████▎ | 8308/10000 [1:52:40<19:06, 1.48it/s, loss=0.0044, lr=4.22e-06, step=8308] Training: 83%|████████▎ | 8309/10000 [1:52:41<18:08, 1.55it/s, loss=0.0044, lr=4.22e-06, step=8308] Training: 83%|████████▎ | 8309/10000 [1:52:41<18:08, 1.55it/s, loss=0.0012, lr=4.22e-06, step=8309]20:37:13.460 [I] step=8310 loss=0.0073 smoothed_loss=0.0144 lr=4.22e-06 grad_norm=0.3912 step_time=0.5258s data_time=0.0809s it/s=1.649 eta_to_10000=1025.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.0760 grad_arm_token_fuse=0.0358 grad_shared_expert=0.2565 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8310/10000 [1:52:41<17:46, 1.59it/s, loss=0.0012, lr=4.22e-06, step=8309] Training: 83%|████████▎ | 8310/10000 [1:52:41<17:46, 1.59it/s, loss=0.0073, lr=4.21e-06, step=8310] Training: 83%|████████▎ | 8311/10000 [1:52:42<16:39, 1.69it/s, loss=0.0073, lr=4.21e-06, step=8310] Training: 83%|████████▎ | 8311/10000 [1:52:42<16:39, 1.69it/s, loss=0.0047, lr=4.21e-06, step=8311] Training: 83%|████████▎ | 8312/10000 [1:52:42<17:28, 1.61it/s, loss=0.0047, lr=4.21e-06, step=8311] Training: 83%|████████▎ | 8312/10000 [1:52:42<17:28, 1.61it/s, loss=0.0018, lr=4.21e-06, step=8312] Training: 83%|████████▎ | 8313/10000 [1:52:43<18:34, 1.51it/s, loss=0.0018, lr=4.21e-06, step=8312] Training: 83%|████████▎ | 8313/10000 [1:52:43<18:34, 1.51it/s, loss=0.0052, lr=4.21e-06, step=8313] Training: 83%|████████▎ | 8314/10000 [1:52:44<18:39, 1.51it/s, loss=0.0052, lr=4.21e-06, step=8313] Training: 83%|████████▎ | 8314/10000 [1:52:44<18:39, 1.51it/s, loss=0.0045, lr=4.21e-06, step=8314] Training: 83%|████████▎ | 8315/10000 [1:52:44<19:17, 1.46it/s, loss=0.0045, lr=4.21e-06, step=8314] Training: 83%|████████▎ | 8315/10000 [1:52:44<19:17, 1.46it/s, loss=0.0035, lr=4.20e-06, step=8315] Training: 83%|████████▎ | 8316/10000 [1:52:45<18:24, 1.52it/s, loss=0.0035, lr=4.20e-06, step=8315] Training: 83%|████████▎ | 8316/10000 [1:52:45<18:24, 1.52it/s, loss=0.0230, lr=4.20e-06, step=8316] Training: 83%|████████▎ | 8317/10000 [1:52:46<18:31, 1.51it/s, loss=0.0230, lr=4.20e-06, step=8316] Training: 83%|████████▎ | 8317/10000 [1:52:46<18:31, 1.51it/s, loss=0.0091, lr=4.20e-06, step=8317] Training: 83%|████████▎ | 8318/10000 [1:52:46<17:17, 1.62it/s, loss=0.0091, lr=4.20e-06, step=8317] Training: 83%|████████▎ | 8318/10000 [1:52:46<17:17, 1.62it/s, loss=0.0042, lr=4.20e-06, step=8318] Training: 83%|████████▎ | 8319/10000 [1:52:47<16:34, 1.69it/s, loss=0.0042, lr=4.20e-06, step=8318] Training: 83%|████████▎ | 8319/10000 [1:52:47<16:34, 1.69it/s, loss=0.0028, lr=4.20e-06, step=8319]20:37:19.855 [I] step=8320 loss=0.0251 smoothed_loss=0.0112 lr=4.20e-06 grad_norm=0.3516 step_time=0.5447s data_time=0.0948s it/s=1.564 eta_to_10000=1074.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0123 grad_action_out_proj_arms=0.1011 grad_arm_token_fuse=0.0624 grad_shared_expert=0.5526 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8320/10000 [1:52:48<17:47, 1.57it/s, loss=0.0028, lr=4.20e-06, step=8319] Training: 83%|████████▎ | 8320/10000 [1:52:48<17:47, 1.57it/s, loss=0.0251, lr=4.19e-06, step=8320] Training: 83%|████████▎ | 8321/10000 [1:52:48<17:59, 1.55it/s, loss=0.0251, lr=4.19e-06, step=8320] Training: 83%|████████▎ | 8321/10000 [1:52:48<17:59, 1.55it/s, loss=0.0089, lr=4.19e-06, step=8321] Training: 83%|████████▎ | 8322/10000 [1:52:49<18:51, 1.48it/s, loss=0.0089, lr=4.19e-06, step=8321] Training: 83%|████████▎ | 8322/10000 [1:52:49<18:51, 1.48it/s, loss=0.0234, lr=4.19e-06, step=8322] Training: 83%|████████▎ | 8323/10000 [1:52:50<18:57, 1.47it/s, loss=0.0234, lr=4.19e-06, step=8322] Training: 83%|████████▎ | 8323/10000 [1:52:50<18:57, 1.47it/s, loss=0.0061, lr=4.19e-06, step=8323] Training: 83%|████████▎ | 8324/10000 [1:52:50<19:31, 1.43it/s, loss=0.0061, lr=4.19e-06, step=8323] Training: 83%|████████▎ | 8324/10000 [1:52:50<19:31, 1.43it/s, loss=0.0019, lr=4.19e-06, step=8324] Training: 83%|████████▎ | 8325/10000 [1:52:51<18:49, 1.48it/s, loss=0.0019, lr=4.19e-06, step=8324] Training: 83%|████████▎ | 8325/10000 [1:52:51<18:49, 1.48it/s, loss=0.0026, lr=4.18e-06, step=8325] Training: 83%|████████▎ | 8326/10000 [1:52:51<17:24, 1.60it/s, loss=0.0026, lr=4.18e-06, step=8325] Training: 83%|████████▎ | 8326/10000 [1:52:51<17:24, 1.60it/s, loss=0.0040, lr=4.18e-06, step=8326] Training: 83%|████████▎ | 8327/10000 [1:52:52<17:40, 1.58it/s, loss=0.0040, lr=4.18e-06, step=8326] Training: 83%|████████▎ | 8327/10000 [1:52:52<17:40, 1.58it/s, loss=0.0034, lr=4.18e-06, step=8327] Training: 83%|████████▎ | 8328/10000 [1:52:53<17:44, 1.57it/s, loss=0.0034, lr=4.18e-06, step=8327] Training: 83%|████████▎ | 8328/10000 [1:52:53<17:44, 1.57it/s, loss=0.0194, lr=4.18e-06, step=8328] Training: 83%|████████▎ | 8329/10000 [1:52:53<18:17, 1.52it/s, loss=0.0194, lr=4.18e-06, step=8328] Training: 83%|████████▎ | 8329/10000 [1:52:53<18:17, 1.52it/s, loss=0.0116, lr=4.18e-06, step=8329]20:37:26.363 [I] step=8330 loss=0.0062 smoothed_loss=0.0096 lr=4.18e-06 grad_norm=0.4463 step_time=0.5561s data_time=0.0948s it/s=1.537 eta_to_10000=1086.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0628 grad_arm_token_fuse=0.0274 grad_shared_expert=0.2531 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8330/10000 [1:52:54<17:17, 1.61it/s, loss=0.0116, lr=4.18e-06, step=8329] Training: 83%|████████▎ | 8330/10000 [1:52:54<17:17, 1.61it/s, loss=0.0062, lr=4.17e-06, step=8330] Training: 83%|████████▎ | 8331/10000 [1:52:55<16:50, 1.65it/s, loss=0.0062, lr=4.17e-06, step=8330] Training: 83%|████████▎ | 8331/10000 [1:52:55<16:50, 1.65it/s, loss=0.0146, lr=4.17e-06, step=8331] Training: 83%|████████▎ | 8332/10000 [1:52:55<15:59, 1.74it/s, loss=0.0146, lr=4.17e-06, step=8331] Training: 83%|████████▎ | 8332/10000 [1:52:55<15:59, 1.74it/s, loss=0.0280, lr=4.17e-06, step=8332] Training: 83%|████████▎ | 8333/10000 [1:52:56<15:34, 1.78it/s, loss=0.0280, lr=4.17e-06, step=8332] Training: 83%|████████▎ | 8333/10000 [1:52:56<15:34, 1.78it/s, loss=0.0077, lr=4.17e-06, step=8333] Training: 83%|████████▎ | 8334/10000 [1:52:56<16:53, 1.64it/s, loss=0.0077, lr=4.17e-06, step=8333] Training: 83%|████████▎ | 8334/10000 [1:52:56<16:53, 1.64it/s, loss=0.0151, lr=4.17e-06, step=8334] Training: 83%|████████▎ | 8335/10000 [1:52:57<16:41, 1.66it/s, loss=0.0151, lr=4.17e-06, step=8334] Training: 83%|████████▎ | 8335/10000 [1:52:57<16:41, 1.66it/s, loss=0.0280, lr=4.16e-06, step=8335] Training: 83%|████████▎ | 8336/10000 [1:52:58<18:03, 1.54it/s, loss=0.0280, lr=4.16e-06, step=8335] Training: 83%|████████▎ | 8336/10000 [1:52:58<18:03, 1.54it/s, loss=0.0034, lr=4.16e-06, step=8336] Training: 83%|████████▎ | 8337/10000 [1:52:58<17:25, 1.59it/s, loss=0.0034, lr=4.16e-06, step=8336] Training: 83%|████████▎ | 8337/10000 [1:52:58<17:25, 1.59it/s, loss=0.0025, lr=4.16e-06, step=8337] Training: 83%|████████▎ | 8338/10000 [1:52:59<18:20, 1.51it/s, loss=0.0025, lr=4.16e-06, step=8337] Training: 83%|████████▎ | 8338/10000 [1:52:59<18:20, 1.51it/s, loss=0.0043, lr=4.16e-06, step=8338] Training: 83%|████████▎ | 8339/10000 [1:53:00<17:19, 1.60it/s, loss=0.0043, lr=4.16e-06, step=8338] Training: 83%|████████▎ | 8339/10000 [1:53:00<17:19, 1.60it/s, loss=0.0013, lr=4.16e-06, step=8339]20:37:32.418 [I] step=8340 loss=0.0083 smoothed_loss=0.0096 lr=4.16e-06 grad_norm=0.4656 step_time=0.5118s data_time=0.0937s it/s=1.652 eta_to_10000=1004.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.1455 grad_arm_token_fuse=0.0572 grad_shared_expert=0.4118 (18633:train_pytorch.py:850) + Training: 83%|████████▎ | 8340/10000 [1:53:00<16:28, 1.68it/s, loss=0.0013, lr=4.16e-06, step=8339] Training: 83%|████████▎ | 8340/10000 [1:53:00<16:28, 1.68it/s, loss=0.0083, lr=4.15e-06, step=8340] Training: 83%|████████▎ | 8341/10000 [1:53:01<15:44, 1.76it/s, loss=0.0083, lr=4.15e-06, step=8340] Training: 83%|████████▎ | 8341/10000 [1:53:01<15:44, 1.76it/s, loss=0.0033, lr=4.15e-06, step=8341] Training: 83%|████████▎ | 8342/10000 [1:53:01<17:40, 1.56it/s, loss=0.0033, lr=4.15e-06, step=8341] Training: 83%|████████▎ | 8342/10000 [1:53:01<17:40, 1.56it/s, loss=0.0065, lr=4.15e-06, step=8342] Training: 83%|████████▎ | 8343/10000 [1:53:02<18:44, 1.47it/s, loss=0.0065, lr=4.15e-06, step=8342] Training: 83%|████████▎ | 8343/10000 [1:53:02<18:44, 1.47it/s, loss=0.0262, lr=4.15e-06, step=8343] Training: 83%|████████▎ | 8344/10000 [1:53:03<18:53, 1.46it/s, loss=0.0262, lr=4.15e-06, step=8343] Training: 83%|████████▎ | 8344/10000 [1:53:03<18:53, 1.46it/s, loss=0.0012, lr=4.15e-06, step=8344] Training: 83%|████████▎ | 8345/10000 [1:53:04<19:13, 1.44it/s, loss=0.0012, lr=4.15e-06, step=8344] Training: 83%|████████▎ | 8345/10000 [1:53:04<19:13, 1.44it/s, loss=0.0108, lr=4.15e-06, step=8345] Training: 83%|████████▎ | 8346/10000 [1:53:04<17:35, 1.57it/s, loss=0.0108, lr=4.15e-06, step=8345] Training: 83%|████████▎ | 8346/10000 [1:53:04<17:35, 1.57it/s, loss=0.0410, lr=4.14e-06, step=8346] Training: 83%|████████▎ | 8347/10000 [1:53:05<17:33, 1.57it/s, loss=0.0410, lr=4.14e-06, step=8346] Training: 83%|████████▎ | 8347/10000 [1:53:05<17:33, 1.57it/s, loss=0.0031, lr=4.14e-06, step=8347] Training: 83%|████████▎ | 8348/10000 [1:53:05<17:57, 1.53it/s, loss=0.0031, lr=4.14e-06, step=8347] Training: 83%|████████▎ | 8348/10000 [1:53:05<17:57, 1.53it/s, loss=0.0064, lr=4.14e-06, step=8348] Training: 83%|████████▎ | 8349/10000 [1:53:06<16:55, 1.63it/s, loss=0.0064, lr=4.14e-06, step=8348] Training: 83%|████████▎ | 8349/10000 [1:53:06<16:55, 1.63it/s, loss=0.0084, lr=4.14e-06, step=8349]20:37:39.052 [I] step=8350 loss=0.0097 smoothed_loss=0.0109 lr=4.14e-06 grad_norm=0.3877 step_time=0.5689s data_time=0.0946s it/s=1.507 eta_to_10000=1094.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0848 grad_arm_token_fuse=0.0375 grad_shared_expert=0.2870 (18633:train_pytorch.py:850) + Training: 84%|████████▎ | 8350/10000 [1:53:07<18:15, 1.51it/s, loss=0.0084, lr=4.14e-06, step=8349] Training: 84%|████████▎ | 8350/10000 [1:53:07<18:15, 1.51it/s, loss=0.0097, lr=4.14e-06, step=8350] Training: 84%|████████▎ | 8351/10000 [1:53:07<16:41, 1.65it/s, loss=0.0097, lr=4.14e-06, step=8350] Training: 84%|████████▎ | 8351/10000 [1:53:07<16:41, 1.65it/s, loss=0.0034, lr=4.13e-06, step=8351] Training: 84%|████████▎ | 8352/10000 [1:53:08<15:54, 1.73it/s, loss=0.0034, lr=4.13e-06, step=8351] Training: 84%|████████▎ | 8352/10000 [1:53:08<15:54, 1.73it/s, loss=0.0094, lr=4.13e-06, step=8352] Training: 84%|████████▎ | 8353/10000 [1:53:08<16:57, 1.62it/s, loss=0.0094, lr=4.13e-06, step=8352] Training: 84%|████████▎ | 8353/10000 [1:53:08<16:57, 1.62it/s, loss=0.0077, lr=4.13e-06, step=8353] Training: 84%|████████▎ | 8354/10000 [1:53:09<15:52, 1.73it/s, loss=0.0077, lr=4.13e-06, step=8353] Training: 84%|████████▎ | 8354/10000 [1:53:09<15:52, 1.73it/s, loss=0.0091, lr=4.13e-06, step=8354] Training: 84%|████████▎ | 8355/10000 [1:53:10<16:32, 1.66it/s, loss=0.0091, lr=4.13e-06, step=8354] Training: 84%|████████▎ | 8355/10000 [1:53:10<16:32, 1.66it/s, loss=0.0025, lr=4.13e-06, step=8355] Training: 84%|████████▎ | 8356/10000 [1:53:10<15:47, 1.73it/s, loss=0.0025, lr=4.13e-06, step=8355] Training: 84%|████████▎ | 8356/10000 [1:53:10<15:47, 1.73it/s, loss=0.0044, lr=4.12e-06, step=8356] Training: 84%|████████▎ | 8357/10000 [1:53:11<17:05, 1.60it/s, loss=0.0044, lr=4.12e-06, step=8356] Training: 84%|████████▎ | 8357/10000 [1:53:11<17:05, 1.60it/s, loss=0.0069, lr=4.12e-06, step=8357] Training: 84%|████████▎ | 8358/10000 [1:53:11<15:56, 1.72it/s, loss=0.0069, lr=4.12e-06, step=8357] Training: 84%|████████▎ | 8358/10000 [1:53:11<15:56, 1.72it/s, loss=0.0269, lr=4.12e-06, step=8358] Training: 84%|████████▎ | 8359/10000 [1:53:12<15:08, 1.81it/s, loss=0.0269, lr=4.12e-06, step=8358] Training: 84%|████████▎ | 8359/10000 [1:53:12<15:08, 1.81it/s, loss=0.0157, lr=4.12e-06, step=8359]20:37:44.659 [I] step=8360 loss=0.0055 smoothed_loss=0.0103 lr=4.12e-06 grad_norm=0.4472 step_time=0.4880s data_time=0.0726s it/s=1.784 eta_to_10000=919.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0052 grad_action_out_proj_arms=0.0642 grad_arm_token_fuse=0.0239 grad_shared_expert=0.2153 (18633:train_pytorch.py:850) + Training: 84%|████████▎ | 8360/10000 [1:53:12<15:01, 1.82it/s, loss=0.0157, lr=4.12e-06, step=8359] Training: 84%|████████▎ | 8360/10000 [1:53:12<15:01, 1.82it/s, loss=0.0055, lr=4.12e-06, step=8360] Training: 84%|████████▎ | 8361/10000 [1:53:13<16:03, 1.70it/s, loss=0.0055, lr=4.12e-06, step=8360] Training: 84%|████████▎ | 8361/10000 [1:53:13<16:03, 1.70it/s, loss=0.0079, lr=4.11e-06, step=8361] Training: 84%|████████▎ | 8362/10000 [1:53:13<15:09, 1.80it/s, loss=0.0079, lr=4.11e-06, step=8361] Training: 84%|████████▎ | 8362/10000 [1:53:13<15:09, 1.80it/s, loss=0.0052, lr=4.11e-06, step=8362] Training: 84%|████████▎ | 8363/10000 [1:53:14<16:06, 1.69it/s, loss=0.0052, lr=4.11e-06, step=8362] Training: 84%|████████▎ | 8363/10000 [1:53:14<16:06, 1.69it/s, loss=0.0046, lr=4.11e-06, step=8363] Training: 84%|████████▎ | 8364/10000 [1:53:15<16:44, 1.63it/s, loss=0.0046, lr=4.11e-06, step=8363] Training: 84%|████████▎ | 8364/10000 [1:53:15<16:44, 1.63it/s, loss=0.0127, lr=4.11e-06, step=8364] Training: 84%|████████▎ | 8365/10000 [1:53:16<17:44, 1.54it/s, loss=0.0127, lr=4.11e-06, step=8364] Training: 84%|████████▎ | 8365/10000 [1:53:16<17:44, 1.54it/s, loss=0.0433, lr=4.11e-06, step=8365] Training: 84%|████████▎ | 8366/10000 [1:53:16<16:27, 1.65it/s, loss=0.0433, lr=4.11e-06, step=8365] Training: 84%|████████▎ | 8366/10000 [1:53:16<16:27, 1.65it/s, loss=0.0327, lr=4.10e-06, step=8366] Training: 84%|████████▎ | 8367/10000 [1:53:17<15:44, 1.73it/s, loss=0.0327, lr=4.10e-06, step=8366] Training: 84%|████████▎ | 8367/10000 [1:53:17<15:44, 1.73it/s, loss=0.0022, lr=4.10e-06, step=8367] Training: 84%|████████▎ | 8368/10000 [1:53:17<16:22, 1.66it/s, loss=0.0022, lr=4.10e-06, step=8367] Training: 84%|████████▎ | 8368/10000 [1:53:17<16:22, 1.66it/s, loss=0.0042, lr=4.10e-06, step=8368] Training: 84%|████████▎ | 8369/10000 [1:53:18<15:44, 1.73it/s, loss=0.0042, lr=4.10e-06, step=8368] Training: 84%|████████▎ | 8369/10000 [1:53:18<15:44, 1.73it/s, loss=0.0099, lr=4.10e-06, step=8369]20:37:51.092 [I] step=8370 loss=0.0051 smoothed_loss=0.0116 lr=4.11e-06 grad_norm=0.3898 step_time=0.5578s data_time=0.0856s it/s=1.555 eta_to_10000=1048.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.0860 grad_arm_token_fuse=0.0445 grad_shared_expert=0.3610 (18633:train_pytorch.py:850) + Training: 84%|████████▎ | 8370/10000 [1:53:19<19:10, 1.42it/s, loss=0.0099, lr=4.10e-06, step=8369] Training: 84%|████████▎ | 8370/10000 [1:53:19<19:10, 1.42it/s, loss=0.0051, lr=4.10e-06, step=8370] Training: 84%|████████▎ | 8371/10000 [1:53:19<19:00, 1.43it/s, loss=0.0051, lr=4.10e-06, step=8370] Training: 84%|████████▎ | 8371/10000 [1:53:19<19:00, 1.43it/s, loss=0.0015, lr=4.10e-06, step=8371] Training: 84%|████████▎ | 8372/10000 [1:53:20<19:09, 1.42it/s, loss=0.0015, lr=4.10e-06, step=8371] Training: 84%|████████▎ | 8372/10000 [1:53:20<19:09, 1.42it/s, loss=0.0062, lr=4.09e-06, step=8372] Training: 84%|████████▎ | 8373/10000 [1:53:21<17:25, 1.56it/s, loss=0.0062, lr=4.09e-06, step=8372] Training: 84%|████████▎ | 8373/10000 [1:53:21<17:25, 1.56it/s, loss=0.0014, lr=4.09e-06, step=8373] Training: 84%|████████▎ | 8374/10000 [1:53:21<16:11, 1.67it/s, loss=0.0014, lr=4.09e-06, step=8373] Training: 84%|████████▎ | 8374/10000 [1:53:21<16:11, 1.67it/s, loss=0.0333, lr=4.09e-06, step=8374] Training: 84%|████████▍ | 8375/10000 [1:53:22<16:02, 1.69it/s, loss=0.0333, lr=4.09e-06, step=8374] Training: 84%|████████▍ | 8375/10000 [1:53:22<16:02, 1.69it/s, loss=0.0207, lr=4.09e-06, step=8375] Training: 84%|████████▍ | 8376/10000 [1:53:22<16:22, 1.65it/s, loss=0.0207, lr=4.09e-06, step=8375] Training: 84%|████████▍ | 8376/10000 [1:53:22<16:22, 1.65it/s, loss=0.0055, lr=4.09e-06, step=8376] Training: 84%|████████▍ | 8377/10000 [1:53:23<16:08, 1.68it/s, loss=0.0055, lr=4.09e-06, step=8376] Training: 84%|████████▍ | 8377/10000 [1:53:23<16:08, 1.68it/s, loss=0.0039, lr=4.08e-06, step=8377] Training: 84%|████████▍ | 8378/10000 [1:53:24<18:55, 1.43it/s, loss=0.0039, lr=4.08e-06, step=8377] Training: 84%|████████▍ | 8378/10000 [1:53:24<18:55, 1.43it/s, loss=0.0062, lr=4.08e-06, step=8378] Training: 84%|████████▍ | 8379/10000 [1:53:25<21:12, 1.27it/s, loss=0.0062, lr=4.08e-06, step=8378] Training: 84%|████████▍ | 8379/10000 [1:53:25<21:12, 1.27it/s, loss=0.0101, lr=4.08e-06, step=8379]20:37:57.726 [I] step=8380 loss=0.0053 smoothed_loss=0.0100 lr=4.09e-06 grad_norm=0.3867 step_time=0.5682s data_time=0.0952s it/s=1.508 eta_to_10000=1074.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0150 grad_action_out_proj_arms=0.1103 grad_arm_token_fuse=0.0783 grad_shared_expert=0.2994 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8380/10000 [1:53:25<19:04, 1.41it/s, loss=0.0101, lr=4.08e-06, step=8379] Training: 84%|████████▍ | 8380/10000 [1:53:25<19:04, 1.41it/s, loss=0.0053, lr=4.08e-06, step=8380] Training: 84%|████████▍ | 8381/10000 [1:53:26<18:43, 1.44it/s, loss=0.0053, lr=4.08e-06, step=8380] Training: 84%|████████▍ | 8381/10000 [1:53:26<18:43, 1.44it/s, loss=0.0014, lr=4.08e-06, step=8381] Training: 84%|████████▍ | 8382/10000 [1:53:27<18:25, 1.46it/s, loss=0.0014, lr=4.08e-06, step=8381] Training: 84%|████████▍ | 8382/10000 [1:53:27<18:25, 1.46it/s, loss=0.0038, lr=4.07e-06, step=8382] Training: 84%|████████▍ | 8383/10000 [1:53:27<18:36, 1.45it/s, loss=0.0038, lr=4.07e-06, step=8382] Training: 84%|████████▍ | 8383/10000 [1:53:27<18:36, 1.45it/s, loss=0.0109, lr=4.07e-06, step=8383] Training: 84%|████████▍ | 8384/10000 [1:53:28<17:12, 1.56it/s, loss=0.0109, lr=4.07e-06, step=8383] Training: 84%|████████▍ | 8384/10000 [1:53:28<17:12, 1.56it/s, loss=0.0021, lr=4.07e-06, step=8384] Training: 84%|████████▍ | 8385/10000 [1:53:29<19:43, 1.36it/s, loss=0.0021, lr=4.07e-06, step=8384] Training: 84%|████████▍ | 8385/10000 [1:53:29<19:43, 1.36it/s, loss=0.0116, lr=4.07e-06, step=8385] Training: 84%|████████▍ | 8386/10000 [1:53:30<19:38, 1.37it/s, loss=0.0116, lr=4.07e-06, step=8385] Training: 84%|████████▍ | 8386/10000 [1:53:30<19:38, 1.37it/s, loss=0.0064, lr=4.07e-06, step=8386] Training: 84%|████████▍ | 8387/10000 [1:53:30<19:09, 1.40it/s, loss=0.0064, lr=4.07e-06, step=8386] Training: 84%|████████▍ | 8387/10000 [1:53:30<19:09, 1.40it/s, loss=0.0232, lr=4.06e-06, step=8387] Training: 84%|████████▍ | 8388/10000 [1:53:31<17:31, 1.53it/s, loss=0.0232, lr=4.06e-06, step=8387] Training: 84%|████████▍ | 8388/10000 [1:53:31<17:31, 1.53it/s, loss=0.0489, lr=4.06e-06, step=8388] Training: 84%|████████▍ | 8389/10000 [1:53:31<16:28, 1.63it/s, loss=0.0489, lr=4.06e-06, step=8388] Training: 84%|████████▍ | 8389/10000 [1:53:31<16:28, 1.63it/s, loss=0.0262, lr=4.06e-06, step=8389]20:38:04.262 [I] step=8390 loss=0.0015 smoothed_loss=0.0136 lr=4.07e-06 grad_norm=0.5097 step_time=0.5429s data_time=0.1107s it/s=1.530 eta_to_10000=1052.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0215 grad_action_out_proj_arms=0.1188 grad_arm_token_fuse=0.1087 grad_shared_expert=0.5643 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8390/10000 [1:53:32<16:24, 1.64it/s, loss=0.0262, lr=4.06e-06, step=8389] Training: 84%|████████▍ | 8390/10000 [1:53:32<16:24, 1.64it/s, loss=0.0015, lr=4.06e-06, step=8390] Training: 84%|████████▍ | 8391/10000 [1:53:32<15:29, 1.73it/s, loss=0.0015, lr=4.06e-06, step=8390] Training: 84%|████████▍ | 8391/10000 [1:53:32<15:29, 1.73it/s, loss=0.0168, lr=4.06e-06, step=8391] Training: 84%|████████▍ | 8392/10000 [1:53:33<15:15, 1.76it/s, loss=0.0168, lr=4.06e-06, step=8391] Training: 84%|████████▍ | 8392/10000 [1:53:33<15:15, 1.76it/s, loss=0.0021, lr=4.06e-06, step=8392] Training: 84%|████████▍ | 8393/10000 [1:53:34<18:40, 1.43it/s, loss=0.0021, lr=4.06e-06, step=8392] Training: 84%|████████▍ | 8393/10000 [1:53:34<18:40, 1.43it/s, loss=0.0075, lr=4.05e-06, step=8393] Training: 84%|████████▍ | 8394/10000 [1:53:35<18:51, 1.42it/s, loss=0.0075, lr=4.05e-06, step=8393] Training: 84%|████████▍ | 8394/10000 [1:53:35<18:51, 1.42it/s, loss=0.0061, lr=4.05e-06, step=8394] Training: 84%|████████▍ | 8395/10000 [1:53:35<17:19, 1.54it/s, loss=0.0061, lr=4.05e-06, step=8394] Training: 84%|████████▍ | 8395/10000 [1:53:35<17:19, 1.54it/s, loss=0.0055, lr=4.05e-06, step=8395] Training: 84%|████████▍ | 8396/10000 [1:53:36<18:01, 1.48it/s, loss=0.0055, lr=4.05e-06, step=8395] Training: 84%|████████▍ | 8396/10000 [1:53:36<18:01, 1.48it/s, loss=0.0036, lr=4.05e-06, step=8396] Training: 84%|████████▍ | 8397/10000 [1:53:37<17:01, 1.57it/s, loss=0.0036, lr=4.05e-06, step=8396] Training: 84%|████████▍ | 8397/10000 [1:53:37<17:01, 1.57it/s, loss=0.0193, lr=4.05e-06, step=8397] Training: 84%|████████▍ | 8398/10000 [1:53:37<16:36, 1.61it/s, loss=0.0193, lr=4.05e-06, step=8397] Training: 84%|████████▍ | 8398/10000 [1:53:37<16:36, 1.61it/s, loss=0.0065, lr=4.04e-06, step=8398] Training: 84%|████████▍ | 8399/10000 [1:53:38<16:40, 1.60it/s, loss=0.0065, lr=4.04e-06, step=8398] Training: 84%|████████▍ | 8399/10000 [1:53:38<16:40, 1.60it/s, loss=0.0064, lr=4.04e-06, step=8399]20:38:10.843 [I] step=8400 loss=0.0026 smoothed_loss=0.0095 lr=4.05e-06 grad_norm=0.4477 step_time=0.5492s data_time=0.1089s it/s=1.520 eta_to_10000=1052.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.0824 grad_arm_token_fuse=0.0343 grad_shared_expert=0.4340 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8400/10000 [1:53:39<18:00, 1.48it/s, loss=0.0064, lr=4.04e-06, step=8399] Training: 84%|████████▍ | 8400/10000 [1:53:39<18:00, 1.48it/s, loss=0.0026, lr=4.04e-06, step=8400] Training: 84%|████████▍ | 8401/10000 [1:53:39<17:20, 1.54it/s, loss=0.0026, lr=4.04e-06, step=8400] Training: 84%|████████▍ | 8401/10000 [1:53:39<17:20, 1.54it/s, loss=0.0036, lr=4.04e-06, step=8401] Training: 84%|████████▍ | 8402/10000 [1:53:40<16:46, 1.59it/s, loss=0.0036, lr=4.04e-06, step=8401] Training: 84%|████████▍ | 8402/10000 [1:53:40<16:46, 1.59it/s, loss=0.0026, lr=4.04e-06, step=8402] Training: 84%|████████▍ | 8403/10000 [1:53:40<16:29, 1.61it/s, loss=0.0026, lr=4.04e-06, step=8402] Training: 84%|████████▍ | 8403/10000 [1:53:40<16:29, 1.61it/s, loss=0.0330, lr=4.03e-06, step=8403] Training: 84%|████████▍ | 8404/10000 [1:53:41<17:28, 1.52it/s, loss=0.0330, lr=4.03e-06, step=8403] Training: 84%|████████▍ | 8404/10000 [1:53:41<17:28, 1.52it/s, loss=0.0011, lr=4.03e-06, step=8404] Training: 84%|████████▍ | 8405/10000 [1:53:42<20:28, 1.30it/s, loss=0.0011, lr=4.03e-06, step=8404] Training: 84%|████████▍ | 8405/10000 [1:53:42<20:28, 1.30it/s, loss=0.0151, lr=4.03e-06, step=8405] Training: 84%|████████▍ | 8406/10000 [1:53:43<20:28, 1.30it/s, loss=0.0151, lr=4.03e-06, step=8405] Training: 84%|████████▍ | 8406/10000 [1:53:43<20:28, 1.30it/s, loss=0.0085, lr=4.03e-06, step=8406] Training: 84%|████████▍ | 8407/10000 [1:53:44<22:49, 1.16it/s, loss=0.0085, lr=4.03e-06, step=8406] Training: 84%|████████▍ | 8407/10000 [1:53:44<22:49, 1.16it/s, loss=0.0113, lr=4.03e-06, step=8407] Training: 84%|████████▍ | 8408/10000 [1:53:45<21:35, 1.23it/s, loss=0.0113, lr=4.03e-06, step=8407] Training: 84%|████████▍ | 8408/10000 [1:53:45<21:35, 1.23it/s, loss=0.0021, lr=4.03e-06, step=8408] Training: 84%|████████▍ | 8409/10000 [1:53:45<20:37, 1.29it/s, loss=0.0021, lr=4.03e-06, step=8408] Training: 84%|████████▍ | 8409/10000 [1:53:45<20:37, 1.29it/s, loss=0.0160, lr=4.02e-06, step=8409]20:38:18.667 [I] step=8410 loss=0.0022 smoothed_loss=0.0093 lr=4.03e-06 grad_norm=0.4153 step_time=0.5938s data_time=0.1885s it/s=1.278 eta_to_10000=1243.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0135 grad_action_out_proj_arms=0.0984 grad_arm_token_fuse=0.0666 grad_shared_expert=0.3394 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8410/10000 [1:53:46<22:41, 1.17it/s, loss=0.0160, lr=4.02e-06, step=8409] Training: 84%|████████▍ | 8410/10000 [1:53:46<22:41, 1.17it/s, loss=0.0022, lr=4.02e-06, step=8410] Training: 84%|████████▍ | 8411/10000 [1:53:47<21:09, 1.25it/s, loss=0.0022, lr=4.02e-06, step=8410] Training: 84%|████████▍ | 8411/10000 [1:53:47<21:09, 1.25it/s, loss=0.0078, lr=4.02e-06, step=8411] Training: 84%|████████▍ | 8412/10000 [1:53:48<21:51, 1.21it/s, loss=0.0078, lr=4.02e-06, step=8411] Training: 84%|████████▍ | 8412/10000 [1:53:48<21:51, 1.21it/s, loss=0.0008, lr=4.02e-06, step=8412] Training: 84%|████████▍ | 8413/10000 [1:53:49<21:49, 1.21it/s, loss=0.0008, lr=4.02e-06, step=8412] Training: 84%|████████▍ | 8413/10000 [1:53:49<21:49, 1.21it/s, loss=0.0256, lr=4.02e-06, step=8413] Training: 84%|████████▍ | 8414/10000 [1:53:50<25:27, 1.04it/s, loss=0.0256, lr=4.02e-06, step=8413] Training: 84%|████████▍ | 8414/10000 [1:53:50<25:27, 1.04it/s, loss=0.0091, lr=4.01e-06, step=8414] Training: 84%|████████▍ | 8415/10000 [1:53:51<27:31, 1.04s/it, loss=0.0091, lr=4.01e-06, step=8414] Training: 84%|████████▍ | 8415/10000 [1:53:51<27:31, 1.04s/it, loss=0.0285, lr=4.01e-06, step=8415] Training: 84%|████████▍ | 8416/10000 [1:53:52<26:39, 1.01s/it, loss=0.0285, lr=4.01e-06, step=8415] Training: 84%|████████▍ | 8416/10000 [1:53:52<26:39, 1.01s/it, loss=0.0109, lr=4.01e-06, step=8416] Training: 84%|████████▍ | 8417/10000 [1:53:53<24:43, 1.07it/s, loss=0.0109, lr=4.01e-06, step=8416] Training: 84%|████████▍ | 8417/10000 [1:53:53<24:43, 1.07it/s, loss=0.0048, lr=4.01e-06, step=8417] Training: 84%|████████▍ | 8418/10000 [1:53:53<21:25, 1.23it/s, loss=0.0048, lr=4.01e-06, step=8417] Training: 84%|████████▍ | 8418/10000 [1:53:53<21:25, 1.23it/s, loss=0.0125, lr=4.01e-06, step=8418] Training: 84%|████████▍ | 8419/10000 [1:53:54<22:25, 1.17it/s, loss=0.0125, lr=4.01e-06, step=8418] Training: 84%|████████▍ | 8419/10000 [1:53:54<22:25, 1.17it/s, loss=0.0061, lr=4.00e-06, step=8419]20:38:27.410 [I] step=8420 loss=0.0066 smoothed_loss=0.0103 lr=4.01e-06 grad_norm=0.4226 step_time=0.6661s data_time=0.2083s it/s=1.144 eta_to_10000=1380.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0057 grad_action_out_proj_arms=0.0593 grad_arm_token_fuse=0.0309 grad_shared_expert=0.3302 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8420/10000 [1:53:55<21:06, 1.25it/s, loss=0.0061, lr=4.00e-06, step=8419] Training: 84%|████████▍ | 8420/10000 [1:53:55<21:06, 1.25it/s, loss=0.0066, lr=4.00e-06, step=8420] Training: 84%|████████▍ | 8421/10000 [1:53:56<21:01, 1.25it/s, loss=0.0066, lr=4.00e-06, step=8420] Training: 84%|████████▍ | 8421/10000 [1:53:56<21:01, 1.25it/s, loss=0.0044, lr=4.00e-06, step=8421] Training: 84%|████████▍ | 8422/10000 [1:53:57<22:51, 1.15it/s, loss=0.0044, lr=4.00e-06, step=8421] Training: 84%|████████▍ | 8422/10000 [1:53:57<22:51, 1.15it/s, loss=0.0036, lr=4.00e-06, step=8422] Training: 84%|████████▍ | 8423/10000 [1:53:58<24:55, 1.05it/s, loss=0.0036, lr=4.00e-06, step=8422] Training: 84%|████████▍ | 8423/10000 [1:53:58<24:55, 1.05it/s, loss=0.0026, lr=4.00e-06, step=8423] Training: 84%|████████▍ | 8424/10000 [1:53:59<25:07, 1.05it/s, loss=0.0026, lr=4.00e-06, step=8423] Training: 84%|████████▍ | 8424/10000 [1:53:59<25:07, 1.05it/s, loss=0.0752, lr=4.00e-06, step=8424] Training: 84%|████████▍ | 8425/10000 [1:54:00<23:46, 1.10it/s, loss=0.0752, lr=4.00e-06, step=8424] Training: 84%|████████▍ | 8425/10000 [1:54:00<23:46, 1.10it/s, loss=0.0110, lr=3.99e-06, step=8425] Training: 84%|████████▍ | 8426/10000 [1:54:00<21:06, 1.24it/s, loss=0.0110, lr=3.99e-06, step=8425] Training: 84%|████████▍ | 8426/10000 [1:54:00<21:06, 1.24it/s, loss=0.0085, lr=3.99e-06, step=8426] Training: 84%|████████▍ | 8427/10000 [1:54:01<20:29, 1.28it/s, loss=0.0085, lr=3.99e-06, step=8426] Training: 84%|████████▍ | 8427/10000 [1:54:01<20:29, 1.28it/s, loss=0.0010, lr=3.99e-06, step=8427] Training: 84%|████████▍ | 8428/10000 [1:54:02<21:07, 1.24it/s, loss=0.0010, lr=3.99e-06, step=8427] Training: 84%|████████▍ | 8428/10000 [1:54:02<21:07, 1.24it/s, loss=0.0024, lr=3.99e-06, step=8428] Training: 84%|████████▍ | 8429/10000 [1:54:03<21:43, 1.21it/s, loss=0.0024, lr=3.99e-06, step=8428] Training: 84%|████████▍ | 8429/10000 [1:54:03<21:43, 1.21it/s, loss=0.0074, lr=3.99e-06, step=8429]20:38:36.361 [I] step=8430 loss=0.0521 smoothed_loss=0.0154 lr=3.99e-06 grad_norm=0.4794 step_time=0.6747s data_time=0.2204s it/s=1.117 eta_to_10000=1405.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0274 grad_action_out_proj_arms=0.1644 grad_arm_token_fuse=0.1468 grad_shared_expert=0.4235 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8430/10000 [1:54:04<24:28, 1.07it/s, loss=0.0074, lr=3.99e-06, step=8429] Training: 84%|████████▍ | 8430/10000 [1:54:04<24:28, 1.07it/s, loss=0.0521, lr=3.98e-06, step=8430] Training: 84%|████████▍ | 8431/10000 [1:54:05<22:00, 1.19it/s, loss=0.0521, lr=3.98e-06, step=8430] Training: 84%|████████▍ | 8431/10000 [1:54:05<22:00, 1.19it/s, loss=0.0219, lr=3.98e-06, step=8431] Training: 84%|████████▍ | 8432/10000 [1:54:05<21:24, 1.22it/s, loss=0.0219, lr=3.98e-06, step=8431] Training: 84%|████████▍ | 8432/10000 [1:54:05<21:24, 1.22it/s, loss=0.0058, lr=3.98e-06, step=8432] Training: 84%|████████▍ | 8433/10000 [1:54:06<20:39, 1.26it/s, loss=0.0058, lr=3.98e-06, step=8432] Training: 84%|████████▍ | 8433/10000 [1:54:06<20:39, 1.26it/s, loss=0.0027, lr=3.98e-06, step=8433] Training: 84%|████████▍ | 8434/10000 [1:54:07<19:47, 1.32it/s, loss=0.0027, lr=3.98e-06, step=8433] Training: 84%|████████▍ | 8434/10000 [1:54:07<19:47, 1.32it/s, loss=0.0064, lr=3.98e-06, step=8434] Training: 84%|████████▍ | 8435/10000 [1:54:08<20:09, 1.29it/s, loss=0.0064, lr=3.98e-06, step=8434] Training: 84%|████████▍ | 8435/10000 [1:54:08<20:09, 1.29it/s, loss=0.0166, lr=3.98e-06, step=8435] Training: 84%|████████▍ | 8436/10000 [1:54:08<20:08, 1.29it/s, loss=0.0166, lr=3.98e-06, step=8435] Training: 84%|████████▍ | 8436/10000 [1:54:08<20:08, 1.29it/s, loss=0.0075, lr=3.97e-06, step=8436] Training: 84%|████████▍ | 8437/10000 [1:54:09<18:37, 1.40it/s, loss=0.0075, lr=3.97e-06, step=8436] Training: 84%|████████▍ | 8437/10000 [1:54:09<18:37, 1.40it/s, loss=0.0038, lr=3.97e-06, step=8437] Training: 84%|████████▍ | 8438/10000 [1:54:10<18:51, 1.38it/s, loss=0.0038, lr=3.97e-06, step=8437] Training: 84%|████████▍ | 8438/10000 [1:54:10<18:51, 1.38it/s, loss=0.0190, lr=3.97e-06, step=8438] Training: 84%|████████▍ | 8439/10000 [1:54:10<17:02, 1.53it/s, loss=0.0190, lr=3.97e-06, step=8438] Training: 84%|████████▍ | 8439/10000 [1:54:10<17:02, 1.53it/s, loss=0.0087, lr=3.97e-06, step=8439]20:38:43.109 [I] step=8440 loss=0.0037 smoothed_loss=0.0114 lr=3.97e-06 grad_norm=0.3781 step_time=0.5587s data_time=0.1162s it/s=1.482 eta_to_10000=1052.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0044 grad_action_out_proj_arms=0.0368 grad_arm_token_fuse=0.0254 grad_shared_expert=0.3554 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8440/10000 [1:54:11<16:13, 1.60it/s, loss=0.0087, lr=3.97e-06, step=8439] Training: 84%|████████▍ | 8440/10000 [1:54:11<16:13, 1.60it/s, loss=0.0037, lr=3.97e-06, step=8440] Training: 84%|████████▍ | 8441/10000 [1:54:11<14:58, 1.73it/s, loss=0.0037, lr=3.97e-06, step=8440] Training: 84%|████████▍ | 8441/10000 [1:54:11<14:58, 1.73it/s, loss=0.0284, lr=3.96e-06, step=8441] Training: 84%|████████▍ | 8442/10000 [1:54:12<14:16, 1.82it/s, loss=0.0284, lr=3.96e-06, step=8441] Training: 84%|████████▍ | 8442/10000 [1:54:12<14:16, 1.82it/s, loss=0.0031, lr=3.96e-06, step=8442] Training: 84%|████████▍ | 8443/10000 [1:54:13<15:59, 1.62it/s, loss=0.0031, lr=3.96e-06, step=8442] Training: 84%|████████▍ | 8443/10000 [1:54:13<15:59, 1.62it/s, loss=0.0075, lr=3.96e-06, step=8443] Training: 84%|████████▍ | 8444/10000 [1:54:13<15:07, 1.71it/s, loss=0.0075, lr=3.96e-06, step=8443] Training: 84%|████████▍ | 8444/10000 [1:54:13<15:07, 1.71it/s, loss=0.0608, lr=3.96e-06, step=8444] Training: 84%|████████▍ | 8445/10000 [1:54:14<15:46, 1.64it/s, loss=0.0608, lr=3.96e-06, step=8444] Training: 84%|████████▍ | 8445/10000 [1:54:14<15:46, 1.64it/s, loss=0.0090, lr=3.96e-06, step=8445] Training: 84%|████████▍ | 8446/10000 [1:54:14<15:32, 1.67it/s, loss=0.0090, lr=3.96e-06, step=8445] Training: 84%|████████▍ | 8446/10000 [1:54:14<15:32, 1.67it/s, loss=0.0069, lr=3.95e-06, step=8446] Training: 84%|████████▍ | 8447/10000 [1:54:15<14:44, 1.76it/s, loss=0.0069, lr=3.95e-06, step=8446] Training: 84%|████████▍ | 8447/10000 [1:54:15<14:44, 1.76it/s, loss=0.0050, lr=3.95e-06, step=8447] Training: 84%|████████▍ | 8448/10000 [1:54:15<14:35, 1.77it/s, loss=0.0050, lr=3.95e-06, step=8447] Training: 84%|████████▍ | 8448/10000 [1:54:15<14:35, 1.77it/s, loss=0.0022, lr=3.95e-06, step=8448] Training: 84%|████████▍ | 8449/10000 [1:54:16<14:20, 1.80it/s, loss=0.0022, lr=3.95e-06, step=8448] Training: 84%|████████▍ | 8449/10000 [1:54:16<14:20, 1.80it/s, loss=0.0109, lr=3.95e-06, step=8449]20:38:49.026 [I] step=8450 loss=0.0242 smoothed_loss=0.0137 lr=3.96e-06 grad_norm=0.4392 step_time=0.5122s data_time=0.0795s it/s=1.690 eta_to_10000=916.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0135 grad_action_out_proj_arms=0.1761 grad_arm_token_fuse=0.0712 grad_shared_expert=0.4269 (18633:train_pytorch.py:850) + Training: 84%|████████▍ | 8450/10000 [1:54:17<16:39, 1.55it/s, loss=0.0109, lr=3.95e-06, step=8449] Training: 84%|████████▍ | 8450/10000 [1:54:17<16:39, 1.55it/s, loss=0.0242, lr=3.95e-06, step=8450] Training: 85%|████████▍ | 8451/10000 [1:54:17<16:00, 1.61it/s, loss=0.0242, lr=3.95e-06, step=8450] Training: 85%|████████▍ | 8451/10000 [1:54:17<16:00, 1.61it/s, loss=0.0120, lr=3.95e-06, step=8451] Training: 85%|████████▍ | 8452/10000 [1:54:18<16:27, 1.57it/s, loss=0.0120, lr=3.95e-06, step=8451] Training: 85%|████████▍ | 8452/10000 [1:54:18<16:27, 1.57it/s, loss=0.0052, lr=3.94e-06, step=8452] Training: 85%|████████▍ | 8453/10000 [1:54:19<15:53, 1.62it/s, loss=0.0052, lr=3.94e-06, step=8452] Training: 85%|████████▍ | 8453/10000 [1:54:19<15:53, 1.62it/s, loss=0.0688, lr=3.94e-06, step=8453] Training: 85%|████████▍ | 8454/10000 [1:54:19<15:40, 1.64it/s, loss=0.0688, lr=3.94e-06, step=8453] Training: 85%|████████▍ | 8454/10000 [1:54:19<15:40, 1.64it/s, loss=0.0079, lr=3.94e-06, step=8454] Training: 85%|████████▍ | 8455/10000 [1:54:20<14:40, 1.76it/s, loss=0.0079, lr=3.94e-06, step=8454] Training: 85%|████████▍ | 8455/10000 [1:54:20<14:40, 1.76it/s, loss=0.0064, lr=3.94e-06, step=8455] Training: 85%|████████▍ | 8456/10000 [1:54:20<15:17, 1.68it/s, loss=0.0064, lr=3.94e-06, step=8455] Training: 85%|████████▍ | 8456/10000 [1:54:20<15:17, 1.68it/s, loss=0.0103, lr=3.94e-06, step=8456] Training: 85%|████████▍ | 8457/10000 [1:54:21<16:01, 1.61it/s, loss=0.0103, lr=3.94e-06, step=8456] Training: 85%|████████▍ | 8457/10000 [1:54:21<16:01, 1.61it/s, loss=0.0021, lr=3.93e-06, step=8457] Training: 85%|████████▍ | 8458/10000 [1:54:21<14:56, 1.72it/s, loss=0.0021, lr=3.93e-06, step=8457] Training: 85%|████████▍ | 8458/10000 [1:54:21<14:56, 1.72it/s, loss=0.0024, lr=3.93e-06, step=8458] Training: 85%|████████▍ | 8459/10000 [1:54:22<15:18, 1.68it/s, loss=0.0024, lr=3.93e-06, step=8458] Training: 85%|████████▍ | 8459/10000 [1:54:22<15:18, 1.68it/s, loss=0.0013, lr=3.93e-06, step=8459]20:38:55.167 [I] step=8460 loss=0.0138 smoothed_loss=0.0121 lr=3.94e-06 grad_norm=0.3935 step_time=0.5257s data_time=0.0884s it/s=1.629 eta_to_10000=945.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0197 grad_action_out_proj_arms=0.1198 grad_arm_token_fuse=0.1045 grad_shared_expert=0.5744 (18633:train_pytorch.py:850) + Training: 85%|████████▍ | 8460/10000 [1:54:23<16:56, 1.52it/s, loss=0.0013, lr=3.93e-06, step=8459] Training: 85%|████████▍ | 8460/10000 [1:54:23<16:56, 1.52it/s, loss=0.0138, lr=3.93e-06, step=8460] Training: 85%|████████▍ | 8461/10000 [1:54:23<15:45, 1.63it/s, loss=0.0138, lr=3.93e-06, step=8460] Training: 85%|████████▍ | 8461/10000 [1:54:23<15:45, 1.63it/s, loss=0.0062, lr=3.93e-06, step=8461] Training: 85%|████████▍ | 8462/10000 [1:54:24<15:35, 1.64it/s, loss=0.0062, lr=3.93e-06, step=8461] Training: 85%|████████▍ | 8462/10000 [1:54:24<15:35, 1.64it/s, loss=0.0083, lr=3.93e-06, step=8462] Training: 85%|████████▍ | 8463/10000 [1:54:24<14:31, 1.76it/s, loss=0.0083, lr=3.93e-06, step=8462] Training: 85%|████████▍ | 8463/10000 [1:54:24<14:31, 1.76it/s, loss=0.0102, lr=3.92e-06, step=8463] Training: 85%|████████▍ | 8464/10000 [1:54:25<15:27, 1.66it/s, loss=0.0102, lr=3.92e-06, step=8463] Training: 85%|████████▍ | 8464/10000 [1:54:25<15:27, 1.66it/s, loss=0.0107, lr=3.92e-06, step=8464] Training: 85%|████████▍ | 8465/10000 [1:54:26<16:09, 1.58it/s, loss=0.0107, lr=3.92e-06, step=8464] Training: 85%|████████▍ | 8465/10000 [1:54:26<16:09, 1.58it/s, loss=0.0015, lr=3.92e-06, step=8465] Training: 85%|████████▍ | 8466/10000 [1:54:27<16:46, 1.52it/s, loss=0.0015, lr=3.92e-06, step=8465] Training: 85%|████████▍ | 8466/10000 [1:54:27<16:46, 1.52it/s, loss=0.0020, lr=3.92e-06, step=8466] Training: 85%|████████▍ | 8467/10000 [1:54:27<16:45, 1.52it/s, loss=0.0020, lr=3.92e-06, step=8466] Training: 85%|████████▍ | 8467/10000 [1:54:27<16:45, 1.52it/s, loss=0.0233, lr=3.92e-06, step=8467] Training: 85%|████████▍ | 8468/10000 [1:54:28<15:24, 1.66it/s, loss=0.0233, lr=3.92e-06, step=8467] Training: 85%|████████▍ | 8468/10000 [1:54:28<15:24, 1.66it/s, loss=0.0181, lr=3.91e-06, step=8468] Training: 85%|████████▍ | 8469/10000 [1:54:28<15:38, 1.63it/s, loss=0.0181, lr=3.91e-06, step=8468] Training: 85%|████████▍ | 8469/10000 [1:54:28<15:38, 1.63it/s, loss=0.0123, lr=3.91e-06, step=8469]20:39:01.283 [I] step=8470 loss=0.0033 smoothed_loss=0.0107 lr=3.92e-06 grad_norm=0.5111 step_time=0.5314s data_time=0.0803s it/s=1.635 eta_to_10000=935.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0075 grad_action_out_proj_arms=0.0953 grad_arm_token_fuse=0.0400 grad_shared_expert=0.4109 (18633:train_pytorch.py:850) + Training: 85%|████████▍ | 8470/10000 [1:54:29<16:04, 1.59it/s, loss=0.0123, lr=3.91e-06, step=8469] Training: 85%|████████▍ | 8470/10000 [1:54:29<16:04, 1.59it/s, loss=0.0033, lr=3.91e-06, step=8470] Training: 85%|████████▍ | 8471/10000 [1:54:30<16:01, 1.59it/s, loss=0.0033, lr=3.91e-06, step=8470] Training: 85%|████████▍ | 8471/10000 [1:54:30<16:01, 1.59it/s, loss=0.0016, lr=3.91e-06, step=8471] Training: 85%|████████▍ | 8472/10000 [1:54:30<17:38, 1.44it/s, loss=0.0016, lr=3.91e-06, step=8471] Training: 85%|████████▍ | 8472/10000 [1:54:30<17:38, 1.44it/s, loss=0.0170, lr=3.91e-06, step=8472] Training: 85%|████████▍ | 8473/10000 [1:54:31<17:54, 1.42it/s, loss=0.0170, lr=3.91e-06, step=8472] Training: 85%|████████▍ | 8473/10000 [1:54:31<17:54, 1.42it/s, loss=0.0165, lr=3.91e-06, step=8473] Training: 85%|████████▍ | 8474/10000 [1:54:32<18:18, 1.39it/s, loss=0.0165, lr=3.91e-06, step=8473] Training: 85%|████████▍ | 8474/10000 [1:54:32<18:18, 1.39it/s, loss=0.0033, lr=3.90e-06, step=8474] Training: 85%|████████▍ | 8475/10000 [1:54:33<18:33, 1.37it/s, loss=0.0033, lr=3.90e-06, step=8474] Training: 85%|████████▍ | 8475/10000 [1:54:33<18:33, 1.37it/s, loss=0.0159, lr=3.90e-06, step=8475] Training: 85%|████████▍ | 8476/10000 [1:54:33<16:59, 1.50it/s, loss=0.0159, lr=3.90e-06, step=8475] Training: 85%|████████▍ | 8476/10000 [1:54:33<16:59, 1.50it/s, loss=0.0037, lr=3.90e-06, step=8476] Training: 85%|████████▍ | 8477/10000 [1:54:34<15:46, 1.61it/s, loss=0.0037, lr=3.90e-06, step=8476] Training: 85%|████████▍ | 8477/10000 [1:54:34<15:46, 1.61it/s, loss=0.0099, lr=3.90e-06, step=8477] Training: 85%|████████▍ | 8478/10000 [1:54:34<16:02, 1.58it/s, loss=0.0099, lr=3.90e-06, step=8477] Training: 85%|████████▍ | 8478/10000 [1:54:34<16:02, 1.58it/s, loss=0.0058, lr=3.90e-06, step=8478] Training: 85%|████████▍ | 8479/10000 [1:54:35<17:42, 1.43it/s, loss=0.0058, lr=3.90e-06, step=8478] Training: 85%|████████▍ | 8479/10000 [1:54:35<17:42, 1.43it/s, loss=0.0030, lr=3.90e-06, step=8479]20:39:08.132 [I] step=8480 loss=0.0021 smoothed_loss=0.0083 lr=3.90e-06 grad_norm=0.4472 step_time=0.5828s data_time=0.1021s it/s=1.460 eta_to_10000=1041.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0088 grad_action_out_proj_arms=0.0798 grad_arm_token_fuse=0.0439 grad_shared_expert=0.4132 (18633:train_pytorch.py:850) + Training: 85%|████████▍ | 8480/10000 [1:54:36<16:54, 1.50it/s, loss=0.0030, lr=3.90e-06, step=8479] Training: 85%|████████▍ | 8480/10000 [1:54:36<16:54, 1.50it/s, loss=0.0021, lr=3.89e-06, step=8480] Training: 85%|████████▍ | 8481/10000 [1:54:36<16:59, 1.49it/s, loss=0.0021, lr=3.89e-06, step=8480] Training: 85%|████████▍ | 8481/10000 [1:54:36<16:59, 1.49it/s, loss=0.0026, lr=3.89e-06, step=8481] Training: 85%|████████▍ | 8482/10000 [1:54:37<17:51, 1.42it/s, loss=0.0026, lr=3.89e-06, step=8481] Training: 85%|████████▍ | 8482/10000 [1:54:37<17:51, 1.42it/s, loss=0.0024, lr=3.89e-06, step=8482] Training: 85%|████████▍ | 8483/10000 [1:54:38<17:24, 1.45it/s, loss=0.0024, lr=3.89e-06, step=8482] Training: 85%|████████▍ | 8483/10000 [1:54:38<17:24, 1.45it/s, loss=0.0093, lr=3.89e-06, step=8483] Training: 85%|████████▍ | 8484/10000 [1:54:39<17:29, 1.44it/s, loss=0.0093, lr=3.89e-06, step=8483] Training: 85%|████████▍ | 8484/10000 [1:54:39<17:29, 1.44it/s, loss=0.0185, lr=3.89e-06, step=8484] Training: 85%|████████▍ | 8485/10000 [1:54:39<16:49, 1.50it/s, loss=0.0185, lr=3.89e-06, step=8484] Training: 85%|████████▍ | 8485/10000 [1:54:39<16:49, 1.50it/s, loss=0.0028, lr=3.88e-06, step=8485] Training: 85%|████████▍ | 8486/10000 [1:54:40<16:57, 1.49it/s, loss=0.0028, lr=3.88e-06, step=8485] Training: 85%|████████▍ | 8486/10000 [1:54:40<16:57, 1.49it/s, loss=0.0280, lr=3.88e-06, step=8486] Training: 85%|████████▍ | 8487/10000 [1:54:40<15:23, 1.64it/s, loss=0.0280, lr=3.88e-06, step=8486] Training: 85%|████████▍ | 8487/10000 [1:54:40<15:23, 1.64it/s, loss=0.0033, lr=3.88e-06, step=8487] Training: 85%|████████▍ | 8488/10000 [1:54:41<15:30, 1.63it/s, loss=0.0033, lr=3.88e-06, step=8487] Training: 85%|████████▍ | 8488/10000 [1:54:41<15:30, 1.63it/s, loss=0.0163, lr=3.88e-06, step=8488] Training: 85%|████████▍ | 8489/10000 [1:54:42<15:03, 1.67it/s, loss=0.0163, lr=3.88e-06, step=8488] Training: 85%|████████▍ | 8489/10000 [1:54:42<15:03, 1.67it/s, loss=0.0034, lr=3.88e-06, step=8489]20:39:14.398 [I] step=8490 loss=0.0019 smoothed_loss=0.0086 lr=3.88e-06 grad_norm=0.4533 step_time=0.5290s data_time=0.0976s it/s=1.596 eta_to_10000=945.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0053 grad_action_out_proj_arms=0.0720 grad_arm_token_fuse=0.0289 grad_shared_expert=0.3330 (18633:train_pytorch.py:850) + Training: 85%|████████▍ | 8490/10000 [1:54:42<14:22, 1.75it/s, loss=0.0034, lr=3.88e-06, step=8489] Training: 85%|████████▍ | 8490/10000 [1:54:42<14:22, 1.75it/s, loss=0.0019, lr=3.88e-06, step=8490] Training: 85%|████████▍ | 8491/10000 [1:54:43<15:09, 1.66it/s, loss=0.0019, lr=3.88e-06, step=8490] Training: 85%|████████▍ | 8491/10000 [1:54:43<15:09, 1.66it/s, loss=0.0062, lr=3.87e-06, step=8491] Training: 85%|████████▍ | 8492/10000 [1:54:43<14:22, 1.75it/s, loss=0.0062, lr=3.87e-06, step=8491] Training: 85%|████████▍ | 8492/10000 [1:54:43<14:22, 1.75it/s, loss=0.0036, lr=3.87e-06, step=8492] Training: 85%|████████▍ | 8493/10000 [1:54:44<15:26, 1.63it/s, loss=0.0036, lr=3.87e-06, step=8492] Training: 85%|████████▍ | 8493/10000 [1:54:44<15:26, 1.63it/s, loss=0.0058, lr=3.87e-06, step=8493] Training: 85%|████████▍ | 8494/10000 [1:54:44<14:25, 1.74it/s, loss=0.0058, lr=3.87e-06, step=8493] Training: 85%|████████▍ | 8494/10000 [1:54:44<14:25, 1.74it/s, loss=0.0156, lr=3.87e-06, step=8494] Training: 85%|████████▍ | 8495/10000 [1:54:45<15:51, 1.58it/s, loss=0.0156, lr=3.87e-06, step=8494] Training: 85%|████████▍ | 8495/10000 [1:54:45<15:51, 1.58it/s, loss=0.0018, lr=3.87e-06, step=8495] Training: 85%|████████▍ | 8496/10000 [1:54:46<15:43, 1.59it/s, loss=0.0018, lr=3.87e-06, step=8495] Training: 85%|████████▍ | 8496/10000 [1:54:46<15:43, 1.59it/s, loss=0.0132, lr=3.86e-06, step=8496] Training: 85%|████████▍ | 8497/10000 [1:54:46<15:50, 1.58it/s, loss=0.0132, lr=3.86e-06, step=8496] Training: 85%|████████▍ | 8497/10000 [1:54:46<15:50, 1.58it/s, loss=0.0152, lr=3.86e-06, step=8497] Training: 85%|████████▍ | 8498/10000 [1:54:47<14:54, 1.68it/s, loss=0.0152, lr=3.86e-06, step=8497] Training: 85%|████████▍ | 8498/10000 [1:54:47<14:54, 1.68it/s, loss=0.0013, lr=3.86e-06, step=8498] Training: 85%|████████▍ | 8499/10000 [1:54:48<15:07, 1.65it/s, loss=0.0013, lr=3.86e-06, step=8498] Training: 85%|████████▍ | 8499/10000 [1:54:48<15:07, 1.65it/s, loss=0.0077, lr=3.86e-06, step=8499]20:39:20.634 [I] step=8500 loss=0.0052 smoothed_loss=0.0079 lr=3.87e-06 grad_norm=0.4030 step_time=0.5178s data_time=0.1058s it/s=1.604 eta_to_10000=935.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0140 grad_action_out_proj_arms=0.1440 grad_arm_token_fuse=0.0765 grad_shared_expert=0.4453 (18633:train_pytorch.py:850) + Training: 85%|████████▌ | 8500/10000 [1:54:48<15:49, 1.58it/s, loss=0.0077, lr=3.86e-06, step=8499] Training: 85%|████████▌ | 8500/10000 [1:54:48<15:49, 1.58it/s, loss=0.0052, lr=3.86e-06, step=8500] Training: 85%|████████▌ | 8501/10000 [1:54:49<15:11, 1.65it/s, loss=0.0052, lr=3.86e-06, step=8500] Training: 85%|████████▌ | 8501/10000 [1:54:49<15:11, 1.65it/s, loss=0.0210, lr=3.86e-06, step=8501] Training: 85%|████████▌ | 8502/10000 [1:54:49<15:23, 1.62it/s, loss=0.0210, lr=3.86e-06, step=8501] Training: 85%|████████▌ | 8502/10000 [1:54:49<15:23, 1.62it/s, loss=0.0059, lr=3.85e-06, step=8502] Training: 85%|████████▌ | 8503/10000 [1:54:50<14:26, 1.73it/s, loss=0.0059, lr=3.85e-06, step=8502] Training: 85%|████████▌ | 8503/10000 [1:54:50<14:26, 1.73it/s, loss=0.0106, lr=3.85e-06, step=8503] Training: 85%|████████▌ | 8504/10000 [1:54:50<13:38, 1.83it/s, loss=0.0106, lr=3.85e-06, step=8503] Training: 85%|████████▌ | 8504/10000 [1:54:50<13:38, 1.83it/s, loss=0.0015, lr=3.85e-06, step=8504] Training: 85%|████████▌ | 8505/10000 [1:54:51<13:04, 1.91it/s, loss=0.0015, lr=3.85e-06, step=8504] Training: 85%|████████▌ | 8505/10000 [1:54:51<13:04, 1.91it/s, loss=0.0056, lr=3.85e-06, step=8505] Training: 85%|████████▌ | 8506/10000 [1:54:51<12:44, 1.96it/s, loss=0.0056, lr=3.85e-06, step=8505] Training: 85%|████████▌ | 8506/10000 [1:54:51<12:44, 1.96it/s, loss=0.0338, lr=3.85e-06, step=8506] Training: 85%|████████▌ | 8507/10000 [1:54:52<14:03, 1.77it/s, loss=0.0338, lr=3.85e-06, step=8506] Training: 85%|████████▌ | 8507/10000 [1:54:52<14:03, 1.77it/s, loss=0.0114, lr=3.85e-06, step=8507] Training: 85%|████████▌ | 8508/10000 [1:54:53<13:23, 1.86it/s, loss=0.0114, lr=3.85e-06, step=8507] Training: 85%|████████▌ | 8508/10000 [1:54:53<13:23, 1.86it/s, loss=0.0022, lr=3.84e-06, step=8508] Training: 85%|████████▌ | 8509/10000 [1:54:53<14:11, 1.75it/s, loss=0.0022, lr=3.84e-06, step=8508] Training: 85%|████████▌ | 8509/10000 [1:54:53<14:11, 1.75it/s, loss=0.0428, lr=3.84e-06, step=8509]20:39:26.057 [I] step=8510 loss=0.0022 smoothed_loss=0.0120 lr=3.85e-06 grad_norm=0.4505 step_time=0.4760s data_time=0.0663s it/s=1.844 eta_to_10000=807.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0157 grad_action_out_proj_arms=0.1039 grad_arm_token_fuse=0.0817 grad_shared_expert=0.3303 (18633:train_pytorch.py:850) + Training: 85%|████████▌ | 8510/10000 [1:54:54<13:42, 1.81it/s, loss=0.0428, lr=3.84e-06, step=8509] Training: 85%|████████▌ | 8510/10000 [1:54:54<13:42, 1.81it/s, loss=0.0022, lr=3.84e-06, step=8510] Training: 85%|████████▌ | 8511/10000 [1:54:54<13:05, 1.89it/s, loss=0.0022, lr=3.84e-06, step=8510] Training: 85%|████████▌ | 8511/10000 [1:54:54<13:05, 1.89it/s, loss=0.0046, lr=3.84e-06, step=8511] Training: 85%|████████▌ | 8512/10000 [1:54:55<13:01, 1.90it/s, loss=0.0046, lr=3.84e-06, step=8511] Training: 85%|████████▌ | 8512/10000 [1:54:55<13:01, 1.90it/s, loss=0.0078, lr=3.84e-06, step=8512] Training: 85%|████████▌ | 8513/10000 [1:54:55<12:41, 1.95it/s, loss=0.0078, lr=3.84e-06, step=8512] Training: 85%|████████▌ | 8513/10000 [1:54:55<12:41, 1.95it/s, loss=0.0162, lr=3.83e-06, step=8513] Training: 85%|████████▌ | 8514/10000 [1:54:56<13:37, 1.82it/s, loss=0.0162, lr=3.83e-06, step=8513] Training: 85%|████████▌ | 8514/10000 [1:54:56<13:37, 1.82it/s, loss=0.0024, lr=3.83e-06, step=8514] Training: 85%|████████▌ | 8515/10000 [1:54:57<15:53, 1.56it/s, loss=0.0024, lr=3.83e-06, step=8514] Training: 85%|████████▌ | 8515/10000 [1:54:57<15:53, 1.56it/s, loss=0.0249, lr=3.83e-06, step=8515] Training: 85%|████████▌ | 8516/10000 [1:54:57<16:41, 1.48it/s, loss=0.0249, lr=3.83e-06, step=8515] Training: 85%|████████▌ | 8516/10000 [1:54:57<16:41, 1.48it/s, loss=0.0076, lr=3.83e-06, step=8516] Training: 85%|████████▌ | 8517/10000 [1:54:58<16:09, 1.53it/s, loss=0.0076, lr=3.83e-06, step=8516] Training: 85%|████████▌ | 8517/10000 [1:54:58<16:09, 1.53it/s, loss=0.0073, lr=3.83e-06, step=8517] Training: 85%|████████▌ | 8518/10000 [1:54:59<14:50, 1.67it/s, loss=0.0073, lr=3.83e-06, step=8517] Training: 85%|████████▌ | 8518/10000 [1:54:59<14:50, 1.67it/s, loss=0.0061, lr=3.83e-06, step=8518] Training: 85%|████████▌ | 8519/10000 [1:54:59<14:01, 1.76it/s, loss=0.0061, lr=3.83e-06, step=8518] Training: 85%|████████▌ | 8519/10000 [1:54:59<14:01, 1.76it/s, loss=0.0142, lr=3.82e-06, step=8519]20:39:31.861 [I] step=8520 loss=0.0064 smoothed_loss=0.0105 lr=3.83e-06 grad_norm=0.4188 step_time=0.5022s data_time=0.0782s it/s=1.723 eta_to_10000=858.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0070 grad_action_out_proj_arms=0.0649 grad_arm_token_fuse=0.0361 grad_shared_expert=0.2106 (18633:train_pytorch.py:850) + Training: 85%|████████▌ | 8520/10000 [1:55:00<13:35, 1.81it/s, loss=0.0142, lr=3.82e-06, step=8519] Training: 85%|████████▌ | 8520/10000 [1:55:00<13:35, 1.81it/s, loss=0.0064, lr=3.82e-06, step=8520] Training: 85%|████████▌ | 8521/10000 [1:55:00<14:13, 1.73it/s, loss=0.0064, lr=3.82e-06, step=8520] Training: 85%|████████▌ | 8521/10000 [1:55:00<14:13, 1.73it/s, loss=0.0132, lr=3.82e-06, step=8521] Training: 85%|████████▌ | 8522/10000 [1:55:01<14:56, 1.65it/s, loss=0.0132, lr=3.82e-06, step=8521] Training: 85%|████████▌ | 8522/10000 [1:55:01<14:56, 1.65it/s, loss=0.0064, lr=3.82e-06, step=8522] Training: 85%|████████▌ | 8523/10000 [1:55:01<14:14, 1.73it/s, loss=0.0064, lr=3.82e-06, step=8522] Training: 85%|████████▌ | 8523/10000 [1:55:01<14:14, 1.73it/s, loss=0.0166, lr=3.82e-06, step=8523] Training: 85%|████████▌ | 8524/10000 [1:55:02<14:50, 1.66it/s, loss=0.0166, lr=3.82e-06, step=8523] Training: 85%|████████▌ | 8524/10000 [1:55:02<14:50, 1.66it/s, loss=0.0031, lr=3.82e-06, step=8524] Training: 85%|████████▌ | 8525/10000 [1:55:03<14:40, 1.67it/s, loss=0.0031, lr=3.82e-06, step=8524] Training: 85%|████████▌ | 8525/10000 [1:55:03<14:40, 1.67it/s, loss=0.0067, lr=3.81e-06, step=8525] Training: 85%|████████▌ | 8526/10000 [1:55:03<13:50, 1.77it/s, loss=0.0067, lr=3.81e-06, step=8525] Training: 85%|████████▌ | 8526/10000 [1:55:03<13:50, 1.77it/s, loss=0.0068, lr=3.81e-06, step=8526] Training: 85%|████████▌ | 8527/10000 [1:55:04<13:07, 1.87it/s, loss=0.0068, lr=3.81e-06, step=8526] Training: 85%|████████▌ | 8527/10000 [1:55:04<13:07, 1.87it/s, loss=0.0189, lr=3.81e-06, step=8527] Training: 85%|████████▌ | 8528/10000 [1:55:04<13:49, 1.78it/s, loss=0.0189, lr=3.81e-06, step=8527] Training: 85%|████████▌ | 8528/10000 [1:55:04<13:49, 1.78it/s, loss=0.0818, lr=3.81e-06, step=8528] Training: 85%|████████▌ | 8529/10000 [1:55:05<14:48, 1.66it/s, loss=0.0818, lr=3.81e-06, step=8528] Training: 85%|████████▌ | 8529/10000 [1:55:05<14:48, 1.66it/s, loss=0.0087, lr=3.81e-06, step=8529]20:39:38.045 [I] step=8530 loss=0.0020 smoothed_loss=0.0152 lr=3.81e-06 grad_norm=0.4499 step_time=0.5422s data_time=0.0762s it/s=1.617 eta_to_10000=908.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0119 grad_action_out_proj_arms=0.1132 grad_arm_token_fuse=0.0590 grad_shared_expert=0.3243 (18633:train_pytorch.py:850) + Training: 85%|████████▌ | 8530/10000 [1:55:06<16:28, 1.49it/s, loss=0.0087, lr=3.81e-06, step=8529] Training: 85%|████████▌ | 8530/10000 [1:55:06<16:28, 1.49it/s, loss=0.0020, lr=3.81e-06, step=8530] Training: 85%|████████▌ | 8531/10000 [1:55:06<15:05, 1.62it/s, loss=0.0020, lr=3.81e-06, step=8530] Training: 85%|████████▌ | 8531/10000 [1:55:06<15:05, 1.62it/s, loss=0.0094, lr=3.80e-06, step=8531] Training: 85%|████████▌ | 8532/10000 [1:55:07<14:14, 1.72it/s, loss=0.0094, lr=3.80e-06, step=8531] Training: 85%|████████▌ | 8532/10000 [1:55:07<14:14, 1.72it/s, loss=0.0027, lr=3.80e-06, step=8532] Training: 85%|████████▌ | 8533/10000 [1:55:07<13:31, 1.81it/s, loss=0.0027, lr=3.80e-06, step=8532] Training: 85%|████████▌ | 8533/10000 [1:55:07<13:31, 1.81it/s, loss=0.0034, lr=3.80e-06, step=8533] Training: 85%|████████▌ | 8534/10000 [1:55:08<14:44, 1.66it/s, loss=0.0034, lr=3.80e-06, step=8533] Training: 85%|████████▌ | 8534/10000 [1:55:08<14:44, 1.66it/s, loss=0.0079, lr=3.80e-06, step=8534] Training: 85%|████████▌ | 8535/10000 [1:55:09<14:55, 1.64it/s, loss=0.0079, lr=3.80e-06, step=8534] Training: 85%|████████▌ | 8535/10000 [1:55:09<14:55, 1.64it/s, loss=0.0157, lr=3.80e-06, step=8535] Training: 85%|████████▌ | 8536/10000 [1:55:09<15:21, 1.59it/s, loss=0.0157, lr=3.80e-06, step=8535] Training: 85%|████████▌ | 8536/10000 [1:55:09<15:21, 1.59it/s, loss=0.0024, lr=3.79e-06, step=8536] Training: 85%|████████▌ | 8537/10000 [1:55:10<15:22, 1.59it/s, loss=0.0024, lr=3.79e-06, step=8536] Training: 85%|████████▌ | 8537/10000 [1:55:10<15:22, 1.59it/s, loss=0.0062, lr=3.79e-06, step=8537] Training: 85%|████████▌ | 8538/10000 [1:55:10<14:14, 1.71it/s, loss=0.0062, lr=3.79e-06, step=8537] Training: 85%|████████▌ | 8538/10000 [1:55:10<14:14, 1.71it/s, loss=0.0015, lr=3.79e-06, step=8538] Training: 85%|████████▌ | 8539/10000 [1:55:11<13:28, 1.81it/s, loss=0.0015, lr=3.79e-06, step=8538] Training: 85%|████████▌ | 8539/10000 [1:55:11<13:28, 1.81it/s, loss=0.0246, lr=3.79e-06, step=8539]20:39:43.635 [I] step=8540 loss=0.0036 smoothed_loss=0.0106 lr=3.80e-06 grad_norm=0.4003 step_time=0.4758s data_time=0.0832s it/s=1.789 eta_to_10000=816.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0410 grad_action_out_proj_arms=0.1498 grad_arm_token_fuse=0.2198 grad_shared_expert=0.4217 (18633:train_pytorch.py:850) + Training: 85%|████████▌ | 8540/10000 [1:55:11<13:06, 1.86it/s, loss=0.0246, lr=3.79e-06, step=8539] Training: 85%|████████▌ | 8540/10000 [1:55:11<13:06, 1.86it/s, loss=0.0036, lr=3.79e-06, step=8540] Training: 85%|████████▌ | 8541/10000 [1:55:12<12:47, 1.90it/s, loss=0.0036, lr=3.79e-06, step=8540] Training: 85%|████████▌ | 8541/10000 [1:55:12<12:47, 1.90it/s, loss=0.0027, lr=3.79e-06, step=8541] Training: 85%|████████▌ | 8542/10000 [1:55:12<12:27, 1.95it/s, loss=0.0027, lr=3.79e-06, step=8541] Training: 85%|████████▌ | 8542/10000 [1:55:12<12:27, 1.95it/s, loss=0.0030, lr=3.78e-06, step=8542] Training: 85%|████████▌ | 8543/10000 [1:55:13<13:58, 1.74it/s, loss=0.0030, lr=3.78e-06, step=8542] Training: 85%|████████▌ | 8543/10000 [1:55:13<13:58, 1.74it/s, loss=0.0038, lr=3.78e-06, step=8543] Training: 85%|████████▌ | 8544/10000 [1:55:14<13:32, 1.79it/s, loss=0.0038, lr=3.78e-06, step=8543] Training: 85%|████████▌ | 8544/10000 [1:55:14<13:32, 1.79it/s, loss=0.0042, lr=3.78e-06, step=8544] Training: 85%|████████▌ | 8545/10000 [1:55:14<14:17, 1.70it/s, loss=0.0042, lr=3.78e-06, step=8544] Training: 85%|████████▌ | 8545/10000 [1:55:14<14:17, 1.70it/s, loss=0.0061, lr=3.78e-06, step=8545] Training: 85%|████████▌ | 8546/10000 [1:55:15<13:30, 1.79it/s, loss=0.0061, lr=3.78e-06, step=8545] Training: 85%|████████▌ | 8546/10000 [1:55:15<13:30, 1.79it/s, loss=0.0168, lr=3.78e-06, step=8546] Training: 85%|████████▌ | 8547/10000 [1:55:15<13:06, 1.85it/s, loss=0.0168, lr=3.78e-06, step=8546] Training: 85%|████████▌ | 8547/10000 [1:55:15<13:06, 1.85it/s, loss=0.0042, lr=3.78e-06, step=8547] Training: 85%|████████▌ | 8548/10000 [1:55:16<13:49, 1.75it/s, loss=0.0042, lr=3.78e-06, step=8547] Training: 85%|████████▌ | 8548/10000 [1:55:16<13:49, 1.75it/s, loss=0.0017, lr=3.77e-06, step=8548] Training: 85%|████████▌ | 8549/10000 [1:55:16<13:15, 1.82it/s, loss=0.0017, lr=3.77e-06, step=8548] Training: 85%|████████▌ | 8549/10000 [1:55:16<13:15, 1.82it/s, loss=0.0021, lr=3.77e-06, step=8549]20:39:49.345 [I] step=8550 loss=0.0125 smoothed_loss=0.0077 lr=3.78e-06 grad_norm=0.3812 step_time=0.4914s data_time=0.0797s it/s=1.751 eta_to_10000=827.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0082 grad_action_out_proj_arms=0.1025 grad_arm_token_fuse=0.0439 grad_shared_expert=0.4480 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8550/10000 [1:55:17<14:24, 1.68it/s, loss=0.0021, lr=3.77e-06, step=8549] Training: 86%|████████▌ | 8550/10000 [1:55:17<14:24, 1.68it/s, loss=0.0125, lr=3.77e-06, step=8550] Training: 86%|████████▌ | 8551/10000 [1:55:18<13:36, 1.77it/s, loss=0.0125, lr=3.77e-06, step=8550] Training: 86%|████████▌ | 8551/10000 [1:55:18<13:36, 1.77it/s, loss=0.0054, lr=3.77e-06, step=8551] Training: 86%|████████▌ | 8552/10000 [1:55:18<16:26, 1.47it/s, loss=0.0054, lr=3.77e-06, step=8551] Training: 86%|████████▌ | 8552/10000 [1:55:18<16:26, 1.47it/s, loss=0.0189, lr=3.77e-06, step=8552] Training: 86%|████████▌ | 8553/10000 [1:55:19<15:09, 1.59it/s, loss=0.0189, lr=3.77e-06, step=8552] Training: 86%|████████▌ | 8553/10000 [1:55:19<15:09, 1.59it/s, loss=0.0032, lr=3.77e-06, step=8553] Training: 86%|████████▌ | 8554/10000 [1:55:19<14:18, 1.68it/s, loss=0.0032, lr=3.77e-06, step=8553] Training: 86%|████████▌ | 8554/10000 [1:55:19<14:18, 1.68it/s, loss=0.0064, lr=3.76e-06, step=8554] Training: 86%|████████▌ | 8555/10000 [1:55:20<14:58, 1.61it/s, loss=0.0064, lr=3.76e-06, step=8554] Training: 86%|████████▌ | 8555/10000 [1:55:20<14:58, 1.61it/s, loss=0.0184, lr=3.76e-06, step=8555] Training: 86%|████████▌ | 8556/10000 [1:55:21<15:47, 1.52it/s, loss=0.0184, lr=3.76e-06, step=8555] Training: 86%|████████▌ | 8556/10000 [1:55:21<15:47, 1.52it/s, loss=0.0060, lr=3.76e-06, step=8556] Training: 86%|████████▌ | 8557/10000 [1:55:22<16:03, 1.50it/s, loss=0.0060, lr=3.76e-06, step=8556] Training: 86%|████████▌ | 8557/10000 [1:55:22<16:03, 1.50it/s, loss=0.0158, lr=3.76e-06, step=8557] Training: 86%|████████▌ | 8558/10000 [1:55:22<14:36, 1.65it/s, loss=0.0158, lr=3.76e-06, step=8557] Training: 86%|████████▌ | 8558/10000 [1:55:22<14:36, 1.65it/s, loss=0.0073, lr=3.76e-06, step=8558] Training: 86%|████████▌ | 8559/10000 [1:55:23<13:35, 1.77it/s, loss=0.0073, lr=3.76e-06, step=8558] Training: 86%|████████▌ | 8559/10000 [1:55:23<13:35, 1.77it/s, loss=0.0041, lr=3.76e-06, step=8559]20:39:55.550 [I] step=8560 loss=0.0057 smoothed_loss=0.0084 lr=3.76e-06 grad_norm=0.4088 step_time=0.5161s data_time=0.1043s it/s=1.612 eta_to_10000=893.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0125 grad_action_out_proj_arms=0.1048 grad_arm_token_fuse=0.0693 grad_shared_expert=0.3397 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8560/10000 [1:55:23<14:28, 1.66it/s, loss=0.0041, lr=3.76e-06, step=8559] Training: 86%|████████▌ | 8560/10000 [1:55:23<14:28, 1.66it/s, loss=0.0057, lr=3.75e-06, step=8560] Training: 86%|████████▌ | 8561/10000 [1:55:24<14:14, 1.68it/s, loss=0.0057, lr=3.75e-06, step=8560] Training: 86%|████████▌ | 8561/10000 [1:55:24<14:14, 1.68it/s, loss=0.0034, lr=3.75e-06, step=8561] Training: 86%|████████▌ | 8562/10000 [1:55:24<13:21, 1.79it/s, loss=0.0034, lr=3.75e-06, step=8561] Training: 86%|████████▌ | 8562/10000 [1:55:24<13:21, 1.79it/s, loss=0.0024, lr=3.75e-06, step=8562] Training: 86%|████████▌ | 8563/10000 [1:55:25<14:34, 1.64it/s, loss=0.0024, lr=3.75e-06, step=8562] Training: 86%|████████▌ | 8563/10000 [1:55:25<14:34, 1.64it/s, loss=0.0340, lr=3.75e-06, step=8563] Training: 86%|████████▌ | 8564/10000 [1:55:26<18:08, 1.32it/s, loss=0.0340, lr=3.75e-06, step=8563] Training: 86%|████████▌ | 8564/10000 [1:55:26<18:08, 1.32it/s, loss=0.0044, lr=3.75e-06, step=8564] Training: 86%|████████▌ | 8565/10000 [1:55:27<17:43, 1.35it/s, loss=0.0044, lr=3.75e-06, step=8564] Training: 86%|████████▌ | 8565/10000 [1:55:27<17:43, 1.35it/s, loss=0.0170, lr=3.74e-06, step=8565] Training: 86%|████████▌ | 8566/10000 [1:55:28<17:39, 1.35it/s, loss=0.0170, lr=3.74e-06, step=8565] Training: 86%|████████▌ | 8566/10000 [1:55:28<17:39, 1.35it/s, loss=0.0008, lr=3.74e-06, step=8566] Training: 86%|████████▌ | 8567/10000 [1:55:28<16:58, 1.41it/s, loss=0.0008, lr=3.74e-06, step=8566] Training: 86%|████████▌ | 8567/10000 [1:55:28<16:58, 1.41it/s, loss=0.0046, lr=3.74e-06, step=8567] Training: 86%|████████▌ | 8568/10000 [1:55:29<15:21, 1.55it/s, loss=0.0046, lr=3.74e-06, step=8567] Training: 86%|████████▌ | 8568/10000 [1:55:29<15:21, 1.55it/s, loss=0.0036, lr=3.74e-06, step=8568] Training: 86%|████████▌ | 8569/10000 [1:55:29<16:10, 1.48it/s, loss=0.0036, lr=3.74e-06, step=8568] Training: 86%|████████▌ | 8569/10000 [1:55:29<16:10, 1.48it/s, loss=0.0304, lr=3.74e-06, step=8569]20:40:02.277 [I] step=8570 loss=0.0016 smoothed_loss=0.0096 lr=3.74e-06 grad_norm=0.4043 step_time=0.5483s data_time=0.1244s it/s=1.487 eta_to_10000=961.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0052 grad_action_out_proj_arms=0.0737 grad_arm_token_fuse=0.0253 grad_shared_expert=0.2957 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8570/10000 [1:55:30<15:02, 1.58it/s, loss=0.0304, lr=3.74e-06, step=8569] Training: 86%|████████▌ | 8570/10000 [1:55:30<15:02, 1.58it/s, loss=0.0016, lr=3.74e-06, step=8570] Training: 86%|████████▌ | 8571/10000 [1:55:31<15:13, 1.56it/s, loss=0.0016, lr=3.74e-06, step=8570] Training: 86%|████████▌ | 8571/10000 [1:55:31<15:13, 1.56it/s, loss=0.0023, lr=3.73e-06, step=8571] Training: 86%|████████▌ | 8572/10000 [1:55:31<15:19, 1.55it/s, loss=0.0023, lr=3.73e-06, step=8571] Training: 86%|████████▌ | 8572/10000 [1:55:31<15:19, 1.55it/s, loss=0.0218, lr=3.73e-06, step=8572] Training: 86%|████████▌ | 8573/10000 [1:55:32<15:05, 1.58it/s, loss=0.0218, lr=3.73e-06, step=8572] Training: 86%|████████▌ | 8573/10000 [1:55:32<15:05, 1.58it/s, loss=0.0078, lr=3.73e-06, step=8573] Training: 86%|████████▌ | 8574/10000 [1:55:33<16:07, 1.47it/s, loss=0.0078, lr=3.73e-06, step=8573] Training: 86%|████████▌ | 8574/10000 [1:55:33<16:07, 1.47it/s, loss=0.0105, lr=3.73e-06, step=8574] Training: 86%|████████▌ | 8575/10000 [1:55:33<16:17, 1.46it/s, loss=0.0105, lr=3.73e-06, step=8574] Training: 86%|████████▌ | 8575/10000 [1:55:33<16:17, 1.46it/s, loss=0.0019, lr=3.73e-06, step=8575] Training: 86%|████████▌ | 8576/10000 [1:55:34<15:24, 1.54it/s, loss=0.0019, lr=3.73e-06, step=8575] Training: 86%|████████▌ | 8576/10000 [1:55:34<15:24, 1.54it/s, loss=0.0019, lr=3.73e-06, step=8576] Training: 86%|████████▌ | 8577/10000 [1:55:35<16:17, 1.46it/s, loss=0.0019, lr=3.73e-06, step=8576] Training: 86%|████████▌ | 8577/10000 [1:55:35<16:17, 1.46it/s, loss=0.0014, lr=3.72e-06, step=8577] Training: 86%|████████▌ | 8578/10000 [1:55:35<15:52, 1.49it/s, loss=0.0014, lr=3.72e-06, step=8577] Training: 86%|████████▌ | 8578/10000 [1:55:35<15:52, 1.49it/s, loss=0.0007, lr=3.72e-06, step=8578] Training: 86%|████████▌ | 8579/10000 [1:55:36<17:32, 1.35it/s, loss=0.0007, lr=3.72e-06, step=8578] Training: 86%|████████▌ | 8579/10000 [1:55:36<17:32, 1.35it/s, loss=0.0008, lr=3.72e-06, step=8579]20:40:09.068 [I] step=8580 loss=0.0048 smoothed_loss=0.0063 lr=3.73e-06 grad_norm=0.4443 step_time=0.5625s data_time=0.1166s it/s=1.473 eta_to_10000=964.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.1049 grad_arm_token_fuse=0.0386 grad_shared_expert=0.4959 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8580/10000 [1:55:37<15:52, 1.49it/s, loss=0.0008, lr=3.72e-06, step=8579] Training: 86%|████████▌ | 8580/10000 [1:55:37<15:52, 1.49it/s, loss=0.0048, lr=3.72e-06, step=8580] Training: 86%|████████▌ | 8581/10000 [1:55:37<14:41, 1.61it/s, loss=0.0048, lr=3.72e-06, step=8580] Training: 86%|████████▌ | 8581/10000 [1:55:37<14:41, 1.61it/s, loss=0.0069, lr=3.72e-06, step=8581] Training: 86%|████████▌ | 8582/10000 [1:55:38<15:59, 1.48it/s, loss=0.0069, lr=3.72e-06, step=8581] Training: 86%|████████▌ | 8582/10000 [1:55:38<15:59, 1.48it/s, loss=0.0294, lr=3.72e-06, step=8582] Training: 86%|████████▌ | 8583/10000 [1:55:39<14:31, 1.63it/s, loss=0.0294, lr=3.72e-06, step=8582] Training: 86%|████████▌ | 8583/10000 [1:55:39<14:31, 1.63it/s, loss=0.0131, lr=3.71e-06, step=8583] Training: 86%|████████▌ | 8584/10000 [1:55:39<13:38, 1.73it/s, loss=0.0131, lr=3.71e-06, step=8583] Training: 86%|████████▌ | 8584/10000 [1:55:39<13:38, 1.73it/s, loss=0.0177, lr=3.71e-06, step=8584] Training: 86%|████████▌ | 8585/10000 [1:55:39<12:53, 1.83it/s, loss=0.0177, lr=3.71e-06, step=8584] Training: 86%|████████▌ | 8585/10000 [1:55:39<12:53, 1.83it/s, loss=0.0253, lr=3.71e-06, step=8585] Training: 86%|████████▌ | 8586/10000 [1:55:40<13:48, 1.71it/s, loss=0.0253, lr=3.71e-06, step=8585] Training: 86%|████████▌ | 8586/10000 [1:55:40<13:48, 1.71it/s, loss=0.0039, lr=3.71e-06, step=8586] Training: 86%|████████▌ | 8587/10000 [1:55:41<14:31, 1.62it/s, loss=0.0039, lr=3.71e-06, step=8586] Training: 86%|████████▌ | 8587/10000 [1:55:41<14:31, 1.62it/s, loss=0.0057, lr=3.71e-06, step=8587] Training: 86%|████████▌ | 8588/10000 [1:55:41<14:35, 1.61it/s, loss=0.0057, lr=3.71e-06, step=8587] Training: 86%|████████▌ | 8588/10000 [1:55:41<14:35, 1.61it/s, loss=0.0018, lr=3.71e-06, step=8588] Training: 86%|████████▌ | 8589/10000 [1:55:42<14:46, 1.59it/s, loss=0.0018, lr=3.71e-06, step=8588] Training: 86%|████████▌ | 8589/10000 [1:55:42<14:46, 1.59it/s, loss=0.0112, lr=3.70e-06, step=8589]20:40:14.965 [I] step=8590 loss=0.0015 smoothed_loss=0.0088 lr=3.71e-06 grad_norm=0.3763 step_time=0.4868s data_time=0.1029s it/s=1.696 eta_to_10000=831.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0068 grad_action_out_proj_arms=0.0807 grad_arm_token_fuse=0.0348 grad_shared_expert=0.2434 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8590/10000 [1:55:43<13:54, 1.69it/s, loss=0.0112, lr=3.70e-06, step=8589] Training: 86%|████████▌ | 8590/10000 [1:55:43<13:54, 1.69it/s, loss=0.0015, lr=3.70e-06, step=8590] Training: 86%|████████▌ | 8591/10000 [1:55:43<14:35, 1.61it/s, loss=0.0015, lr=3.70e-06, step=8590] Training: 86%|████████▌ | 8591/10000 [1:55:43<14:35, 1.61it/s, loss=0.0066, lr=3.70e-06, step=8591] Training: 86%|████████▌ | 8592/10000 [1:55:44<14:01, 1.67it/s, loss=0.0066, lr=3.70e-06, step=8591] Training: 86%|████████▌ | 8592/10000 [1:55:44<14:01, 1.67it/s, loss=0.0047, lr=3.70e-06, step=8592] Training: 86%|████████▌ | 8593/10000 [1:55:45<17:24, 1.35it/s, loss=0.0047, lr=3.70e-06, step=8592] Training: 86%|████████▌ | 8593/10000 [1:55:45<17:24, 1.35it/s, loss=0.0145, lr=3.70e-06, step=8593] Training: 86%|████████▌ | 8594/10000 [1:55:45<15:37, 1.50it/s, loss=0.0145, lr=3.70e-06, step=8593] Training: 86%|████████▌ | 8594/10000 [1:55:45<15:37, 1.50it/s, loss=0.0050, lr=3.70e-06, step=8594] Training: 86%|████████▌ | 8595/10000 [1:55:46<15:27, 1.52it/s, loss=0.0050, lr=3.70e-06, step=8594] Training: 86%|████████▌ | 8595/10000 [1:55:46<15:27, 1.52it/s, loss=0.0165, lr=3.69e-06, step=8595] Training: 86%|████████▌ | 8596/10000 [1:55:47<14:04, 1.66it/s, loss=0.0165, lr=3.69e-06, step=8595] Training: 86%|████████▌ | 8596/10000 [1:55:47<14:04, 1.66it/s, loss=0.0051, lr=3.69e-06, step=8596] Training: 86%|████████▌ | 8597/10000 [1:55:47<13:13, 1.77it/s, loss=0.0051, lr=3.69e-06, step=8596] Training: 86%|████████▌ | 8597/10000 [1:55:47<13:13, 1.77it/s, loss=0.0076, lr=3.69e-06, step=8597] Training: 86%|████████▌ | 8598/10000 [1:55:47<12:31, 1.87it/s, loss=0.0076, lr=3.69e-06, step=8597] Training: 86%|████████▌ | 8598/10000 [1:55:47<12:31, 1.87it/s, loss=0.0177, lr=3.69e-06, step=8598] Training: 86%|████████▌ | 8599/10000 [1:55:48<12:01, 1.94it/s, loss=0.0177, lr=3.69e-06, step=8598] Training: 86%|████████▌ | 8599/10000 [1:55:48<12:01, 1.94it/s, loss=0.0037, lr=3.69e-06, step=8599]20:40:20.984 [I] step=8600 loss=0.0059 smoothed_loss=0.0087 lr=3.69e-06 grad_norm=0.4555 step_time=0.5116s data_time=0.0904s it/s=1.662 eta_to_10000=842.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0064 grad_action_out_proj_arms=0.0811 grad_arm_token_fuse=0.0367 grad_shared_expert=0.4083 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8600/10000 [1:55:49<13:15, 1.76it/s, loss=0.0037, lr=3.69e-06, step=8599] Training: 86%|████████▌ | 8600/10000 [1:55:49<13:15, 1.76it/s, loss=0.0059, lr=3.69e-06, step=8600] Training: 86%|████████▌ | 8601/10000 [1:55:49<12:36, 1.85it/s, loss=0.0059, lr=3.69e-06, step=8600] Training: 86%|████████▌ | 8601/10000 [1:55:49<12:36, 1.85it/s, loss=0.0103, lr=3.68e-06, step=8601] Training: 86%|████████▌ | 8602/10000 [1:55:50<14:02, 1.66it/s, loss=0.0103, lr=3.68e-06, step=8601] Training: 86%|████████▌ | 8602/10000 [1:55:50<14:02, 1.66it/s, loss=0.0041, lr=3.68e-06, step=8602] Training: 86%|████████▌ | 8603/10000 [1:55:51<16:10, 1.44it/s, loss=0.0041, lr=3.68e-06, step=8602] Training: 86%|████████▌ | 8603/10000 [1:55:51<16:10, 1.44it/s, loss=0.0099, lr=3.68e-06, step=8603] Training: 86%|████████▌ | 8604/10000 [1:55:51<14:33, 1.60it/s, loss=0.0099, lr=3.68e-06, step=8603] Training: 86%|████████▌ | 8604/10000 [1:55:51<14:33, 1.60it/s, loss=0.0099, lr=3.68e-06, step=8604] Training: 86%|████████▌ | 8605/10000 [1:55:52<13:33, 1.71it/s, loss=0.0099, lr=3.68e-06, step=8604] Training: 86%|████████▌ | 8605/10000 [1:55:52<13:33, 1.71it/s, loss=0.0022, lr=3.68e-06, step=8605] Training: 86%|████████▌ | 8606/10000 [1:55:52<12:49, 1.81it/s, loss=0.0022, lr=3.68e-06, step=8605] Training: 86%|████████▌ | 8606/10000 [1:55:52<12:49, 1.81it/s, loss=0.0096, lr=3.68e-06, step=8606] Training: 86%|████████▌ | 8607/10000 [1:55:53<13:38, 1.70it/s, loss=0.0096, lr=3.68e-06, step=8606] Training: 86%|████████▌ | 8607/10000 [1:55:53<13:38, 1.70it/s, loss=0.0049, lr=3.67e-06, step=8607] Training: 86%|████████▌ | 8608/10000 [1:55:54<15:09, 1.53it/s, loss=0.0049, lr=3.67e-06, step=8607] Training: 86%|████████▌ | 8608/10000 [1:55:54<15:09, 1.53it/s, loss=0.0066, lr=3.67e-06, step=8608] Training: 86%|████████▌ | 8609/10000 [1:55:54<13:57, 1.66it/s, loss=0.0066, lr=3.67e-06, step=8608] Training: 86%|████████▌ | 8609/10000 [1:55:54<13:57, 1.66it/s, loss=0.0233, lr=3.67e-06, step=8609]20:40:27.194 [I] step=8610 loss=0.0020 smoothed_loss=0.0086 lr=3.68e-06 grad_norm=0.4803 step_time=0.5169s data_time=0.1041s it/s=1.610 eta_to_10000=863.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0132 grad_action_out_proj_arms=0.0974 grad_arm_token_fuse=0.0674 grad_shared_expert=0.4539 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8610/10000 [1:55:55<14:33, 1.59it/s, loss=0.0233, lr=3.67e-06, step=8609] Training: 86%|████████▌ | 8610/10000 [1:55:55<14:33, 1.59it/s, loss=0.0020, lr=3.67e-06, step=8610] Training: 86%|████████▌ | 8611/10000 [1:55:55<13:35, 1.70it/s, loss=0.0020, lr=3.67e-06, step=8610] Training: 86%|████████▌ | 8611/10000 [1:55:55<13:35, 1.70it/s, loss=0.0114, lr=3.67e-06, step=8611] Training: 86%|████████▌ | 8612/10000 [1:55:56<12:54, 1.79it/s, loss=0.0114, lr=3.67e-06, step=8611] Training: 86%|████████▌ | 8612/10000 [1:55:56<12:54, 1.79it/s, loss=0.0082, lr=3.67e-06, step=8612] Training: 86%|████████▌ | 8613/10000 [1:55:56<12:23, 1.87it/s, loss=0.0082, lr=3.67e-06, step=8612] Training: 86%|████████▌ | 8613/10000 [1:55:56<12:23, 1.87it/s, loss=0.0372, lr=3.66e-06, step=8613] Training: 86%|████████▌ | 8614/10000 [1:55:57<14:39, 1.58it/s, loss=0.0372, lr=3.66e-06, step=8613] Training: 86%|████████▌ | 8614/10000 [1:55:57<14:39, 1.58it/s, loss=0.0050, lr=3.66e-06, step=8614] Training: 86%|████████▌ | 8615/10000 [1:55:58<14:57, 1.54it/s, loss=0.0050, lr=3.66e-06, step=8614] Training: 86%|████████▌ | 8615/10000 [1:55:58<14:57, 1.54it/s, loss=0.0032, lr=3.66e-06, step=8615] Training: 86%|████████▌ | 8616/10000 [1:55:58<13:41, 1.68it/s, loss=0.0032, lr=3.66e-06, step=8615] Training: 86%|████████▌ | 8616/10000 [1:55:58<13:41, 1.68it/s, loss=0.0059, lr=3.66e-06, step=8616] Training: 86%|████████▌ | 8617/10000 [1:55:59<13:01, 1.77it/s, loss=0.0059, lr=3.66e-06, step=8616] Training: 86%|████████▌ | 8617/10000 [1:55:59<13:01, 1.77it/s, loss=0.0028, lr=3.66e-06, step=8617] Training: 86%|████████▌ | 8618/10000 [1:55:59<13:27, 1.71it/s, loss=0.0028, lr=3.66e-06, step=8617] Training: 86%|████████▌ | 8618/10000 [1:55:59<13:27, 1.71it/s, loss=0.0086, lr=3.66e-06, step=8618] Training: 86%|████████▌ | 8619/10000 [1:56:00<12:56, 1.78it/s, loss=0.0086, lr=3.66e-06, step=8618] Training: 86%|████████▌ | 8619/10000 [1:56:00<12:56, 1.78it/s, loss=0.0076, lr=3.65e-06, step=8619]20:40:32.832 [I] step=8620 loss=0.0034 smoothed_loss=0.0083 lr=3.66e-06 grad_norm=0.4575 step_time=0.4861s data_time=0.0776s it/s=1.774 eta_to_10000=777.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0198 grad_action_out_proj_arms=0.1280 grad_arm_token_fuse=0.1091 grad_shared_expert=0.3989 (18633:train_pytorch.py:850) + Training: 86%|████████▌ | 8620/10000 [1:56:01<12:39, 1.82it/s, loss=0.0076, lr=3.65e-06, step=8619] Training: 86%|████████▌ | 8620/10000 [1:56:01<12:39, 1.82it/s, loss=0.0034, lr=3.65e-06, step=8620] Training: 86%|████████▌ | 8621/10000 [1:56:01<13:27, 1.71it/s, loss=0.0034, lr=3.65e-06, step=8620] Training: 86%|████████▌ | 8621/10000 [1:56:01<13:27, 1.71it/s, loss=0.0048, lr=3.65e-06, step=8621] Training: 86%|████████▌ | 8622/10000 [1:56:02<15:06, 1.52it/s, loss=0.0048, lr=3.65e-06, step=8621] Training: 86%|████████▌ | 8622/10000 [1:56:02<15:06, 1.52it/s, loss=0.0122, lr=3.65e-06, step=8622] Training: 86%|████████▌ | 8623/10000 [1:56:02<13:52, 1.65it/s, loss=0.0122, lr=3.65e-06, step=8622] Training: 86%|████████▌ | 8623/10000 [1:56:02<13:52, 1.65it/s, loss=0.0049, lr=3.65e-06, step=8623] Training: 86%|████████▌ | 8624/10000 [1:56:03<12:58, 1.77it/s, loss=0.0049, lr=3.65e-06, step=8623] Training: 86%|████████▌ | 8624/10000 [1:56:03<12:58, 1.77it/s, loss=0.0155, lr=3.65e-06, step=8624] Training: 86%|████████▋ | 8625/10000 [1:56:04<14:16, 1.60it/s, loss=0.0155, lr=3.65e-06, step=8624] Training: 86%|████████▋ | 8625/10000 [1:56:04<14:16, 1.60it/s, loss=0.0018, lr=3.64e-06, step=8625] Training: 86%|████████▋ | 8626/10000 [1:56:04<13:19, 1.72it/s, loss=0.0018, lr=3.64e-06, step=8625] Training: 86%|████████▋ | 8626/10000 [1:56:04<13:19, 1.72it/s, loss=0.0062, lr=3.64e-06, step=8626] Training: 86%|████████▋ | 8627/10000 [1:56:05<12:33, 1.82it/s, loss=0.0062, lr=3.64e-06, step=8626] Training: 86%|████████▋ | 8627/10000 [1:56:05<12:33, 1.82it/s, loss=0.0056, lr=3.64e-06, step=8627] Training: 86%|████████▋ | 8628/10000 [1:56:05<13:09, 1.74it/s, loss=0.0056, lr=3.64e-06, step=8627] Training: 86%|████████▋ | 8628/10000 [1:56:05<13:09, 1.74it/s, loss=0.0031, lr=3.64e-06, step=8628] Training: 86%|████████▋ | 8629/10000 [1:56:06<15:09, 1.51it/s, loss=0.0031, lr=3.64e-06, step=8628] Training: 86%|████████▋ | 8629/10000 [1:56:06<15:09, 1.51it/s, loss=0.0023, lr=3.64e-06, step=8629]20:40:39.025 [I] step=8630 loss=0.0058 smoothed_loss=0.0066 lr=3.64e-06 grad_norm=0.3701 step_time=0.5327s data_time=0.0866s it/s=1.615 eta_to_10000=848.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.0873 grad_arm_token_fuse=0.0506 grad_shared_expert=0.2788 (18633:train_pytorch.py:850) + Training: 86%|████████▋ | 8630/10000 [1:56:07<14:11, 1.61it/s, loss=0.0023, lr=3.64e-06, step=8629] Training: 86%|████████▋ | 8630/10000 [1:56:07<14:11, 1.61it/s, loss=0.0058, lr=3.64e-06, step=8630] Training: 86%|████████▋ | 8631/10000 [1:56:07<13:14, 1.72it/s, loss=0.0058, lr=3.64e-06, step=8630] Training: 86%|████████▋ | 8631/10000 [1:56:07<13:14, 1.72it/s, loss=0.0018, lr=3.63e-06, step=8631] Training: 86%|████████▋ | 8632/10000 [1:56:08<14:12, 1.60it/s, loss=0.0018, lr=3.63e-06, step=8631] Training: 86%|████████▋ | 8632/10000 [1:56:08<14:12, 1.60it/s, loss=0.0006, lr=3.63e-06, step=8632] Training: 86%|████████▋ | 8633/10000 [1:56:09<14:51, 1.53it/s, loss=0.0006, lr=3.63e-06, step=8632] Training: 86%|████████▋ | 8633/10000 [1:56:09<14:51, 1.53it/s, loss=0.0028, lr=3.63e-06, step=8633] Training: 86%|████████▋ | 8634/10000 [1:56:09<15:09, 1.50it/s, loss=0.0028, lr=3.63e-06, step=8633] Training: 86%|████████▋ | 8634/10000 [1:56:09<15:09, 1.50it/s, loss=0.0053, lr=3.63e-06, step=8634] Training: 86%|████████▋ | 8635/10000 [1:56:10<16:11, 1.41it/s, loss=0.0053, lr=3.63e-06, step=8634] Training: 86%|████████▋ | 8635/10000 [1:56:10<16:11, 1.41it/s, loss=0.0065, lr=3.63e-06, step=8635] Training: 86%|████████▋ | 8636/10000 [1:56:11<16:01, 1.42it/s, loss=0.0065, lr=3.63e-06, step=8635] Training: 86%|████████▋ | 8636/10000 [1:56:11<16:01, 1.42it/s, loss=0.0054, lr=3.63e-06, step=8636] Training: 86%|████████▋ | 8637/10000 [1:56:11<14:40, 1.55it/s, loss=0.0054, lr=3.63e-06, step=8636] Training: 86%|████████▋ | 8637/10000 [1:56:11<14:40, 1.55it/s, loss=0.0066, lr=3.63e-06, step=8637] Training: 86%|████████▋ | 8638/10000 [1:56:12<14:42, 1.54it/s, loss=0.0066, lr=3.63e-06, step=8637] Training: 86%|████████▋ | 8638/10000 [1:56:12<14:42, 1.54it/s, loss=0.0534, lr=3.62e-06, step=8638] Training: 86%|████████▋ | 8639/10000 [1:56:13<14:49, 1.53it/s, loss=0.0534, lr=3.62e-06, step=8638] Training: 86%|████████▋ | 8639/10000 [1:56:13<14:49, 1.53it/s, loss=0.0029, lr=3.62e-06, step=8639]20:40:45.655 [I] step=8640 loss=0.0040 smoothed_loss=0.0090 lr=3.63e-06 grad_norm=0.4240 step_time=0.5264s data_time=0.1366s it/s=1.509 eta_to_10000=901.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0170 grad_action_out_proj_arms=0.1252 grad_arm_token_fuse=0.0899 grad_shared_expert=0.3343 (18633:train_pytorch.py:850) + Training: 86%|████████▋ | 8640/10000 [1:56:13<14:54, 1.52it/s, loss=0.0029, lr=3.62e-06, step=8639] Training: 86%|████████▋ | 8640/10000 [1:56:13<14:54, 1.52it/s, loss=0.0040, lr=3.62e-06, step=8640] Training: 86%|████████▋ | 8641/10000 [1:56:14<15:31, 1.46it/s, loss=0.0040, lr=3.62e-06, step=8640] Training: 86%|████████▋ | 8641/10000 [1:56:14<15:31, 1.46it/s, loss=0.0095, lr=3.62e-06, step=8641] Training: 86%|████████▋ | 8642/10000 [1:56:15<14:37, 1.55it/s, loss=0.0095, lr=3.62e-06, step=8641] Training: 86%|████████▋ | 8642/10000 [1:56:15<14:37, 1.55it/s, loss=0.0072, lr=3.62e-06, step=8642] Training: 86%|████████▋ | 8643/10000 [1:56:15<15:45, 1.43it/s, loss=0.0072, lr=3.62e-06, step=8642] Training: 86%|████████▋ | 8643/10000 [1:56:15<15:45, 1.43it/s, loss=0.0033, lr=3.62e-06, step=8643] Training: 86%|████████▋ | 8644/10000 [1:56:16<14:32, 1.55it/s, loss=0.0033, lr=3.62e-06, step=8643] Training: 86%|████████▋ | 8644/10000 [1:56:16<14:32, 1.55it/s, loss=0.0082, lr=3.61e-06, step=8644] Training: 86%|████████▋ | 8645/10000 [1:56:17<15:00, 1.51it/s, loss=0.0082, lr=3.61e-06, step=8644] Training: 86%|████████▋ | 8645/10000 [1:56:17<15:00, 1.51it/s, loss=0.0062, lr=3.61e-06, step=8645] Training: 86%|████████▋ | 8646/10000 [1:56:17<14:26, 1.56it/s, loss=0.0062, lr=3.61e-06, step=8645] Training: 86%|████████▋ | 8646/10000 [1:56:17<14:26, 1.56it/s, loss=0.0998, lr=3.61e-06, step=8646] Training: 86%|████████▋ | 8647/10000 [1:56:18<13:46, 1.64it/s, loss=0.0998, lr=3.61e-06, step=8646] Training: 86%|████████▋ | 8647/10000 [1:56:18<13:46, 1.64it/s, loss=0.0077, lr=3.61e-06, step=8647] Training: 86%|████████▋ | 8648/10000 [1:56:19<14:25, 1.56it/s, loss=0.0077, lr=3.61e-06, step=8647] Training: 86%|████████▋ | 8648/10000 [1:56:19<14:25, 1.56it/s, loss=0.0016, lr=3.61e-06, step=8648] Training: 86%|████████▋ | 8649/10000 [1:56:19<13:33, 1.66it/s, loss=0.0016, lr=3.61e-06, step=8648] Training: 86%|████████▋ | 8649/10000 [1:56:19<13:33, 1.66it/s, loss=0.0059, lr=3.61e-06, step=8649]20:40:52.080 [I] step=8650 loss=0.0107 smoothed_loss=0.0136 lr=3.61e-06 grad_norm=0.4537 step_time=0.5368s data_time=0.1057s it/s=1.557 eta_to_10000=867.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0205 grad_action_out_proj_arms=0.1004 grad_arm_token_fuse=0.1048 grad_shared_expert=0.3096 (18633:train_pytorch.py:850) + Training: 86%|████████▋ | 8650/10000 [1:56:20<14:23, 1.56it/s, loss=0.0059, lr=3.61e-06, step=8649] Training: 86%|████████▋ | 8650/10000 [1:56:20<14:23, 1.56it/s, loss=0.0107, lr=3.60e-06, step=8650] Training: 87%|████████▋ | 8651/10000 [1:56:20<14:56, 1.50it/s, loss=0.0107, lr=3.60e-06, step=8650] Training: 87%|████████▋ | 8651/10000 [1:56:20<14:56, 1.50it/s, loss=0.0071, lr=3.60e-06, step=8651] Training: 87%|████████▋ | 8652/10000 [1:56:21<14:12, 1.58it/s, loss=0.0071, lr=3.60e-06, step=8651] Training: 87%|████████▋ | 8652/10000 [1:56:21<14:12, 1.58it/s, loss=0.0018, lr=3.60e-06, step=8652] Training: 87%|████████▋ | 8653/10000 [1:56:22<15:34, 1.44it/s, loss=0.0018, lr=3.60e-06, step=8652] Training: 87%|████████▋ | 8653/10000 [1:56:22<15:34, 1.44it/s, loss=0.0073, lr=3.60e-06, step=8653] Training: 87%|████████▋ | 8654/10000 [1:56:23<15:31, 1.44it/s, loss=0.0073, lr=3.60e-06, step=8653] Training: 87%|████████▋ | 8654/10000 [1:56:23<15:31, 1.44it/s, loss=0.0019, lr=3.60e-06, step=8654] Training: 87%|████████▋ | 8655/10000 [1:56:23<15:19, 1.46it/s, loss=0.0019, lr=3.60e-06, step=8654] Training: 87%|████████▋ | 8655/10000 [1:56:23<15:19, 1.46it/s, loss=0.0182, lr=3.60e-06, step=8655] Training: 87%|████████▋ | 8656/10000 [1:56:24<14:40, 1.53it/s, loss=0.0182, lr=3.60e-06, step=8655] Training: 87%|████████▋ | 8656/10000 [1:56:24<14:40, 1.53it/s, loss=0.0296, lr=3.59e-06, step=8656] Training: 87%|████████▋ | 8657/10000 [1:56:25<15:35, 1.44it/s, loss=0.0296, lr=3.59e-06, step=8656] Training: 87%|████████▋ | 8657/10000 [1:56:25<15:35, 1.44it/s, loss=0.0047, lr=3.59e-06, step=8657] Training: 87%|████████▋ | 8658/10000 [1:56:25<15:00, 1.49it/s, loss=0.0047, lr=3.59e-06, step=8657] Training: 87%|████████▋ | 8658/10000 [1:56:25<15:00, 1.49it/s, loss=0.0124, lr=3.59e-06, step=8658] Training: 87%|████████▋ | 8659/10000 [1:56:26<15:01, 1.49it/s, loss=0.0124, lr=3.59e-06, step=8658] Training: 87%|████████▋ | 8659/10000 [1:56:26<15:01, 1.49it/s, loss=0.0035, lr=3.59e-06, step=8659]20:40:58.829 [I] step=8660 loss=0.0098 smoothed_loss=0.0112 lr=3.60e-06 grad_norm=0.3961 step_time=0.5362s data_time=0.1387s it/s=1.482 eta_to_10000=904.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.0859 grad_arm_token_fuse=0.0555 grad_shared_expert=0.3343 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8660/10000 [1:56:27<14:36, 1.53it/s, loss=0.0035, lr=3.59e-06, step=8659] Training: 87%|████████▋ | 8660/10000 [1:56:27<14:36, 1.53it/s, loss=0.0098, lr=3.59e-06, step=8660] Training: 87%|████████▋ | 8661/10000 [1:56:27<14:13, 1.57it/s, loss=0.0098, lr=3.59e-06, step=8660] Training: 87%|████████▋ | 8661/10000 [1:56:27<14:13, 1.57it/s, loss=0.0022, lr=3.59e-06, step=8661] Training: 87%|████████▋ | 8662/10000 [1:56:28<15:08, 1.47it/s, loss=0.0022, lr=3.59e-06, step=8661] Training: 87%|████████▋ | 8662/10000 [1:56:28<15:08, 1.47it/s, loss=0.0383, lr=3.58e-06, step=8662] Training: 87%|████████▋ | 8663/10000 [1:56:29<16:12, 1.38it/s, loss=0.0383, lr=3.58e-06, step=8662] Training: 87%|████████▋ | 8663/10000 [1:56:29<16:12, 1.38it/s, loss=0.0147, lr=3.58e-06, step=8663] Training: 87%|████████▋ | 8664/10000 [1:56:29<16:31, 1.35it/s, loss=0.0147, lr=3.58e-06, step=8663] Training: 87%|████████▋ | 8664/10000 [1:56:29<16:31, 1.35it/s, loss=0.0034, lr=3.58e-06, step=8664] Training: 87%|████████▋ | 8665/10000 [1:56:30<17:34, 1.27it/s, loss=0.0034, lr=3.58e-06, step=8664] Training: 87%|████████▋ | 8665/10000 [1:56:30<17:34, 1.27it/s, loss=0.0017, lr=3.58e-06, step=8665] Training: 87%|████████▋ | 8666/10000 [1:56:31<18:59, 1.17it/s, loss=0.0017, lr=3.58e-06, step=8665] Training: 87%|████████▋ | 8666/10000 [1:56:31<18:59, 1.17it/s, loss=0.0114, lr=3.58e-06, step=8666] Training: 87%|████████▋ | 8667/10000 [1:56:32<19:24, 1.14it/s, loss=0.0114, lr=3.58e-06, step=8666] Training: 87%|████████▋ | 8667/10000 [1:56:32<19:24, 1.14it/s, loss=0.0120, lr=3.58e-06, step=8667] Training: 87%|████████▋ | 8668/10000 [1:56:33<19:43, 1.13it/s, loss=0.0120, lr=3.58e-06, step=8667] Training: 87%|████████▋ | 8668/10000 [1:56:33<19:43, 1.13it/s, loss=0.0027, lr=3.58e-06, step=8668] Training: 87%|████████▋ | 8669/10000 [1:56:34<19:24, 1.14it/s, loss=0.0027, lr=3.58e-06, step=8668] Training: 87%|████████▋ | 8669/10000 [1:56:34<19:24, 1.14it/s, loss=0.0037, lr=3.57e-06, step=8669]20:41:07.392 [I] step=8670 loss=0.0062 smoothed_loss=0.0094 lr=3.58e-06 grad_norm=0.4859 step_time=0.6668s data_time=0.1895s it/s=1.168 eta_to_10000=1138.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0165 grad_action_out_proj_arms=0.1002 grad_arm_token_fuse=0.0851 grad_shared_expert=0.4366 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8670/10000 [1:56:35<20:05, 1.10it/s, loss=0.0037, lr=3.57e-06, step=8669] Training: 87%|████████▋ | 8670/10000 [1:56:35<20:05, 1.10it/s, loss=0.0062, lr=3.57e-06, step=8670] Training: 87%|████████▋ | 8671/10000 [1:56:36<21:01, 1.05it/s, loss=0.0062, lr=3.57e-06, step=8670] Training: 87%|████████▋ | 8671/10000 [1:56:36<21:01, 1.05it/s, loss=0.0068, lr=3.57e-06, step=8671] Training: 87%|████████▋ | 8672/10000 [1:56:37<23:15, 1.05s/it, loss=0.0068, lr=3.57e-06, step=8671] Training: 87%|████████▋ | 8672/10000 [1:56:37<23:15, 1.05s/it, loss=0.0032, lr=3.57e-06, step=8672] Training: 87%|████████▋ | 8673/10000 [1:56:38<21:31, 1.03it/s, loss=0.0032, lr=3.57e-06, step=8672] Training: 87%|████████▋ | 8673/10000 [1:56:38<21:31, 1.03it/s, loss=0.0202, lr=3.57e-06, step=8673] Training: 87%|████████▋ | 8674/10000 [1:56:39<20:53, 1.06it/s, loss=0.0202, lr=3.57e-06, step=8673] Training: 87%|████████▋ | 8674/10000 [1:56:39<20:53, 1.06it/s, loss=0.0055, lr=3.57e-06, step=8674] Training: 87%|████████▋ | 8675/10000 [1:56:40<19:45, 1.12it/s, loss=0.0055, lr=3.57e-06, step=8674] Training: 87%|████████▋ | 8675/10000 [1:56:40<19:45, 1.12it/s, loss=0.0020, lr=3.56e-06, step=8675] Training: 87%|████████▋ | 8676/10000 [1:56:40<17:01, 1.30it/s, loss=0.0020, lr=3.56e-06, step=8675] Training: 87%|████████▋ | 8676/10000 [1:56:40<17:01, 1.30it/s, loss=0.0073, lr=3.56e-06, step=8676] Training: 87%|████████▋ | 8677/10000 [1:56:41<17:36, 1.25it/s, loss=0.0073, lr=3.56e-06, step=8676] Training: 87%|████████▋ | 8677/10000 [1:56:41<17:36, 1.25it/s, loss=0.0097, lr=3.56e-06, step=8677] Training: 87%|████████▋ | 8678/10000 [1:56:42<17:30, 1.26it/s, loss=0.0097, lr=3.56e-06, step=8677] Training: 87%|████████▋ | 8678/10000 [1:56:42<17:30, 1.26it/s, loss=0.0098, lr=3.56e-06, step=8678] Training: 87%|████████▋ | 8679/10000 [1:56:43<16:53, 1.30it/s, loss=0.0098, lr=3.56e-06, step=8678] Training: 87%|████████▋ | 8679/10000 [1:56:43<16:53, 1.30it/s, loss=0.0054, lr=3.56e-06, step=8679]20:41:15.514 [I] step=8680 loss=0.0016 smoothed_loss=0.0077 lr=3.56e-06 grad_norm=0.4620 step_time=0.6494s data_time=0.1628s it/s=1.231 eta_to_10000=1071.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0074 grad_action_out_proj_arms=0.1212 grad_arm_token_fuse=0.0394 grad_shared_expert=0.4374 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8680/10000 [1:56:43<15:07, 1.45it/s, loss=0.0054, lr=3.56e-06, step=8679] Training: 87%|████████▋ | 8680/10000 [1:56:43<15:07, 1.45it/s, loss=0.0016, lr=3.56e-06, step=8680] Training: 87%|████████▋ | 8681/10000 [1:56:44<14:31, 1.51it/s, loss=0.0016, lr=3.56e-06, step=8680] Training: 87%|████████▋ | 8681/10000 [1:56:44<14:31, 1.51it/s, loss=0.0025, lr=3.55e-06, step=8681] Training: 87%|████████▋ | 8682/10000 [1:56:44<13:28, 1.63it/s, loss=0.0025, lr=3.55e-06, step=8681] Training: 87%|████████▋ | 8682/10000 [1:56:44<13:28, 1.63it/s, loss=0.0314, lr=3.55e-06, step=8682] Training: 87%|████████▋ | 8683/10000 [1:56:45<14:16, 1.54it/s, loss=0.0314, lr=3.55e-06, step=8682] Training: 87%|████████▋ | 8683/10000 [1:56:45<14:16, 1.54it/s, loss=0.0090, lr=3.55e-06, step=8683] Training: 87%|████████▋ | 8684/10000 [1:56:46<14:37, 1.50it/s, loss=0.0090, lr=3.55e-06, step=8683] Training: 87%|████████▋ | 8684/10000 [1:56:46<14:37, 1.50it/s, loss=0.0024, lr=3.55e-06, step=8684] Training: 87%|████████▋ | 8685/10000 [1:56:47<16:40, 1.31it/s, loss=0.0024, lr=3.55e-06, step=8684] Training: 87%|████████▋ | 8685/10000 [1:56:47<16:40, 1.31it/s, loss=0.0095, lr=3.55e-06, step=8685] Training: 87%|████████▋ | 8686/10000 [1:56:48<18:34, 1.18it/s, loss=0.0095, lr=3.55e-06, step=8685] Training: 87%|████████▋ | 8686/10000 [1:56:48<18:34, 1.18it/s, loss=0.0310, lr=3.55e-06, step=8686] Training: 87%|████████▋ | 8687/10000 [1:56:49<18:14, 1.20it/s, loss=0.0310, lr=3.55e-06, step=8686] Training: 87%|████████▋ | 8687/10000 [1:56:49<18:14, 1.20it/s, loss=0.0016, lr=3.55e-06, step=8687] Training: 87%|████████▋ | 8688/10000 [1:56:49<17:20, 1.26it/s, loss=0.0016, lr=3.55e-06, step=8687] Training: 87%|████████▋ | 8688/10000 [1:56:49<17:20, 1.26it/s, loss=0.0027, lr=3.54e-06, step=8688] Training: 87%|████████▋ | 8689/10000 [1:56:50<17:01, 1.28it/s, loss=0.0027, lr=3.54e-06, step=8688] Training: 87%|████████▋ | 8689/10000 [1:56:50<17:01, 1.28it/s, loss=0.0057, lr=3.54e-06, step=8689]20:41:22.963 [I] step=8690 loss=0.0078 smoothed_loss=0.0089 lr=3.55e-06 grad_norm=0.4131 step_time=0.5907s data_time=0.1542s it/s=1.343 eta_to_10000=975.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0149 grad_action_out_proj_arms=0.1476 grad_arm_token_fuse=0.0826 grad_shared_expert=0.4484 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8690/10000 [1:56:51<16:01, 1.36it/s, loss=0.0057, lr=3.54e-06, step=8689] Training: 87%|████████▋ | 8690/10000 [1:56:51<16:01, 1.36it/s, loss=0.0078, lr=3.54e-06, step=8690] Training: 87%|████████▋ | 8691/10000 [1:56:51<15:28, 1.41it/s, loss=0.0078, lr=3.54e-06, step=8690] Training: 87%|████████▋ | 8691/10000 [1:56:51<15:28, 1.41it/s, loss=0.0104, lr=3.54e-06, step=8691] Training: 87%|████████▋ | 8692/10000 [1:56:52<16:38, 1.31it/s, loss=0.0104, lr=3.54e-06, step=8691] Training: 87%|████████▋ | 8692/10000 [1:56:52<16:38, 1.31it/s, loss=0.0083, lr=3.54e-06, step=8692] Training: 87%|████████▋ | 8693/10000 [1:56:53<18:12, 1.20it/s, loss=0.0083, lr=3.54e-06, step=8692] Training: 87%|████████▋ | 8693/10000 [1:56:53<18:12, 1.20it/s, loss=0.0022, lr=3.54e-06, step=8693] Training: 87%|████████▋ | 8694/10000 [1:56:54<16:33, 1.31it/s, loss=0.0022, lr=3.54e-06, step=8693] Training: 87%|████████▋ | 8694/10000 [1:56:54<16:33, 1.31it/s, loss=0.0017, lr=3.53e-06, step=8694] Training: 87%|████████▋ | 8695/10000 [1:56:55<17:23, 1.25it/s, loss=0.0017, lr=3.53e-06, step=8694] Training: 87%|████████▋ | 8695/10000 [1:56:55<17:23, 1.25it/s, loss=0.0022, lr=3.53e-06, step=8695] Training: 87%|████████▋ | 8696/10000 [1:56:55<17:04, 1.27it/s, loss=0.0022, lr=3.53e-06, step=8695] Training: 87%|████████▋ | 8696/10000 [1:56:55<17:04, 1.27it/s, loss=0.0157, lr=3.53e-06, step=8696] Training: 87%|████████▋ | 8697/10000 [1:56:56<17:27, 1.24it/s, loss=0.0157, lr=3.53e-06, step=8696] Training: 87%|████████▋ | 8697/10000 [1:56:56<17:27, 1.24it/s, loss=0.0198, lr=3.53e-06, step=8697] Training: 87%|████████▋ | 8698/10000 [1:56:57<17:01, 1.27it/s, loss=0.0198, lr=3.53e-06, step=8697] Training: 87%|████████▋ | 8698/10000 [1:56:57<17:01, 1.27it/s, loss=0.0126, lr=3.53e-06, step=8698] Training: 87%|████████▋ | 8699/10000 [1:56:58<16:16, 1.33it/s, loss=0.0126, lr=3.53e-06, step=8698] Training: 87%|████████▋ | 8699/10000 [1:56:58<16:16, 1.33it/s, loss=0.0039, lr=3.53e-06, step=8699]20:41:30.711 [I] step=8700 loss=0.0658 smoothed_loss=0.0146 lr=3.53e-06 grad_norm=0.4370 step_time=0.6110s data_time=0.1637s it/s=1.291 eta_to_10000=1007.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0302 grad_action_out_proj_arms=0.1568 grad_arm_token_fuse=0.1587 grad_shared_expert=0.5166 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8700/10000 [1:56:58<16:01, 1.35it/s, loss=0.0039, lr=3.53e-06, step=8699] Training: 87%|████████▋ | 8700/10000 [1:56:58<16:01, 1.35it/s, loss=0.0658, lr=3.53e-06, step=8700] Training: 87%|████████▋ | 8701/10000 [1:56:59<14:36, 1.48it/s, loss=0.0658, lr=3.53e-06, step=8700] Training: 87%|████████▋ | 8701/10000 [1:56:59<14:36, 1.48it/s, loss=0.0113, lr=3.52e-06, step=8701] Training: 87%|████████▋ | 8702/10000 [1:57:00<15:14, 1.42it/s, loss=0.0113, lr=3.52e-06, step=8701] Training: 87%|████████▋ | 8702/10000 [1:57:00<15:14, 1.42it/s, loss=0.0501, lr=3.52e-06, step=8702] Training: 87%|████████▋ | 8703/10000 [1:57:00<14:03, 1.54it/s, loss=0.0501, lr=3.52e-06, step=8702] Training: 87%|████████▋ | 8703/10000 [1:57:00<14:03, 1.54it/s, loss=0.0220, lr=3.52e-06, step=8703] Training: 87%|████████▋ | 8704/10000 [1:57:01<13:08, 1.64it/s, loss=0.0220, lr=3.52e-06, step=8703] Training: 87%|████████▋ | 8704/10000 [1:57:01<13:08, 1.64it/s, loss=0.0206, lr=3.52e-06, step=8704] Training: 87%|████████▋ | 8705/10000 [1:57:01<12:29, 1.73it/s, loss=0.0206, lr=3.52e-06, step=8704] Training: 87%|████████▋ | 8705/10000 [1:57:01<12:29, 1.73it/s, loss=0.0064, lr=3.52e-06, step=8705] Training: 87%|████████▋ | 8706/10000 [1:57:02<12:58, 1.66it/s, loss=0.0064, lr=3.52e-06, step=8705] Training: 87%|████████▋ | 8706/10000 [1:57:02<12:58, 1.66it/s, loss=0.0451, lr=3.52e-06, step=8706] Training: 87%|████████▋ | 8707/10000 [1:57:03<13:43, 1.57it/s, loss=0.0451, lr=3.52e-06, step=8706] Training: 87%|████████▋ | 8707/10000 [1:57:03<13:43, 1.57it/s, loss=0.0083, lr=3.51e-06, step=8707] Training: 87%|████████▋ | 8708/10000 [1:57:03<12:55, 1.67it/s, loss=0.0083, lr=3.51e-06, step=8707] Training: 87%|████████▋ | 8708/10000 [1:57:03<12:55, 1.67it/s, loss=0.0147, lr=3.51e-06, step=8708] Training: 87%|████████▋ | 8709/10000 [1:57:04<12:38, 1.70it/s, loss=0.0147, lr=3.51e-06, step=8708] Training: 87%|████████▋ | 8709/10000 [1:57:04<12:38, 1.70it/s, loss=0.0030, lr=3.51e-06, step=8709]20:41:36.763 [I] step=8710 loss=0.0051 smoothed_loss=0.0158 lr=3.52e-06 grad_norm=0.4729 step_time=0.4935s data_time=0.1117s it/s=1.653 eta_to_10000=780.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0046 grad_action_out_proj_arms=0.0517 grad_arm_token_fuse=0.0227 grad_shared_expert=0.2867 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8710/10000 [1:57:04<13:46, 1.56it/s, loss=0.0030, lr=3.51e-06, step=8709] Training: 87%|████████▋ | 8710/10000 [1:57:04<13:46, 1.56it/s, loss=0.0051, lr=3.51e-06, step=8710] Training: 87%|████████▋ | 8711/10000 [1:57:05<14:40, 1.46it/s, loss=0.0051, lr=3.51e-06, step=8710] Training: 87%|████████▋ | 8711/10000 [1:57:05<14:40, 1.46it/s, loss=0.0210, lr=3.51e-06, step=8711] Training: 87%|████████▋ | 8712/10000 [1:57:06<13:43, 1.56it/s, loss=0.0210, lr=3.51e-06, step=8711] Training: 87%|████████▋ | 8712/10000 [1:57:06<13:43, 1.56it/s, loss=0.0018, lr=3.51e-06, step=8712] Training: 87%|████████▋ | 8713/10000 [1:57:06<13:54, 1.54it/s, loss=0.0018, lr=3.51e-06, step=8712] Training: 87%|████████▋ | 8713/10000 [1:57:06<13:54, 1.54it/s, loss=0.0019, lr=3.51e-06, step=8713] Training: 87%|████████▋ | 8714/10000 [1:57:07<14:14, 1.50it/s, loss=0.0019, lr=3.51e-06, step=8713] Training: 87%|████████▋ | 8714/10000 [1:57:07<14:14, 1.50it/s, loss=0.0090, lr=3.50e-06, step=8714] Training: 87%|████████▋ | 8715/10000 [1:57:08<15:37, 1.37it/s, loss=0.0090, lr=3.50e-06, step=8714] Training: 87%|████████▋ | 8715/10000 [1:57:08<15:37, 1.37it/s, loss=0.0030, lr=3.50e-06, step=8715] Training: 87%|████████▋ | 8716/10000 [1:57:09<15:54, 1.35it/s, loss=0.0030, lr=3.50e-06, step=8715] Training: 87%|████████▋ | 8716/10000 [1:57:09<15:54, 1.35it/s, loss=0.0023, lr=3.50e-06, step=8716] Training: 87%|████████▋ | 8717/10000 [1:57:09<14:51, 1.44it/s, loss=0.0023, lr=3.50e-06, step=8716] Training: 87%|████████▋ | 8717/10000 [1:57:09<14:51, 1.44it/s, loss=0.0089, lr=3.50e-06, step=8717] Training: 87%|████████▋ | 8718/10000 [1:57:10<13:28, 1.59it/s, loss=0.0089, lr=3.50e-06, step=8717] Training: 87%|████████▋ | 8718/10000 [1:57:10<13:28, 1.59it/s, loss=0.0112, lr=3.50e-06, step=8718] Training: 87%|████████▋ | 8719/10000 [1:57:11<14:16, 1.50it/s, loss=0.0112, lr=3.50e-06, step=8718] Training: 87%|████████▋ | 8719/10000 [1:57:11<14:16, 1.50it/s, loss=0.0061, lr=3.50e-06, step=8719]20:41:43.436 [I] step=8720 loss=0.0039 smoothed_loss=0.0098 lr=3.50e-06 grad_norm=0.5053 step_time=0.5281s data_time=0.1392s it/s=1.499 eta_to_10000=854.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0050 grad_action_out_proj_arms=0.0737 grad_arm_token_fuse=0.0230 grad_shared_expert=0.1347 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8720/10000 [1:57:11<13:12, 1.61it/s, loss=0.0061, lr=3.50e-06, step=8719] Training: 87%|████████▋ | 8720/10000 [1:57:11<13:12, 1.61it/s, loss=0.0039, lr=3.49e-06, step=8720] Training: 87%|████████▋ | 8721/10000 [1:57:12<13:18, 1.60it/s, loss=0.0039, lr=3.49e-06, step=8720] Training: 87%|████████▋ | 8721/10000 [1:57:12<13:18, 1.60it/s, loss=0.0020, lr=3.49e-06, step=8721] Training: 87%|████████▋ | 8722/10000 [1:57:12<13:47, 1.54it/s, loss=0.0020, lr=3.49e-06, step=8721] Training: 87%|████████▋ | 8722/10000 [1:57:12<13:47, 1.54it/s, loss=0.0020, lr=3.49e-06, step=8722] Training: 87%|████████▋ | 8723/10000 [1:57:13<13:23, 1.59it/s, loss=0.0020, lr=3.49e-06, step=8722] Training: 87%|████████▋ | 8723/10000 [1:57:13<13:23, 1.59it/s, loss=0.0092, lr=3.49e-06, step=8723] Training: 87%|████████▋ | 8724/10000 [1:57:14<12:25, 1.71it/s, loss=0.0092, lr=3.49e-06, step=8723] Training: 87%|████████▋ | 8724/10000 [1:57:14<12:25, 1.71it/s, loss=0.0036, lr=3.49e-06, step=8724] Training: 87%|████████▋ | 8725/10000 [1:57:14<11:46, 1.81it/s, loss=0.0036, lr=3.49e-06, step=8724] Training: 87%|████████▋ | 8725/10000 [1:57:14<11:46, 1.81it/s, loss=0.0241, lr=3.49e-06, step=8725] Training: 87%|████████▋ | 8726/10000 [1:57:14<11:15, 1.89it/s, loss=0.0241, lr=3.49e-06, step=8725] Training: 87%|████████▋ | 8726/10000 [1:57:14<11:15, 1.89it/s, loss=0.0105, lr=3.49e-06, step=8726] Training: 87%|████████▋ | 8727/10000 [1:57:15<11:32, 1.84it/s, loss=0.0105, lr=3.49e-06, step=8726] Training: 87%|████████▋ | 8727/10000 [1:57:15<11:32, 1.84it/s, loss=0.0056, lr=3.48e-06, step=8727] Training: 87%|████████▋ | 8728/10000 [1:57:16<12:09, 1.74it/s, loss=0.0056, lr=3.48e-06, step=8727] Training: 87%|████████▋ | 8728/10000 [1:57:16<12:09, 1.74it/s, loss=0.0021, lr=3.48e-06, step=8728] Training: 87%|████████▋ | 8729/10000 [1:57:16<13:21, 1.59it/s, loss=0.0021, lr=3.48e-06, step=8728] Training: 87%|████████▋ | 8729/10000 [1:57:16<13:21, 1.59it/s, loss=0.0191, lr=3.48e-06, step=8729]20:41:49.501 [I] step=8730 loss=0.0027 smoothed_loss=0.0089 lr=3.49e-06 grad_norm=0.4889 step_time=0.5220s data_time=0.0846s it/s=1.649 eta_to_10000=770.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0197 grad_action_out_proj_arms=0.1773 grad_arm_token_fuse=0.1080 grad_shared_expert=0.4171 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8730/10000 [1:57:17<13:55, 1.52it/s, loss=0.0191, lr=3.48e-06, step=8729] Training: 87%|████████▋ | 8730/10000 [1:57:17<13:55, 1.52it/s, loss=0.0027, lr=3.48e-06, step=8730] Training: 87%|████████▋ | 8731/10000 [1:57:18<12:51, 1.65it/s, loss=0.0027, lr=3.48e-06, step=8730] Training: 87%|████████▋ | 8731/10000 [1:57:18<12:51, 1.65it/s, loss=0.0318, lr=3.48e-06, step=8731] Training: 87%|████████▋ | 8732/10000 [1:57:18<12:12, 1.73it/s, loss=0.0318, lr=3.48e-06, step=8731] Training: 87%|████████▋ | 8732/10000 [1:57:18<12:12, 1.73it/s, loss=0.0039, lr=3.48e-06, step=8732] Training: 87%|████████▋ | 8733/10000 [1:57:19<11:45, 1.80it/s, loss=0.0039, lr=3.48e-06, step=8732] Training: 87%|████████▋ | 8733/10000 [1:57:19<11:45, 1.80it/s, loss=0.0073, lr=3.47e-06, step=8733] Training: 87%|████████▋ | 8734/10000 [1:57:19<12:57, 1.63it/s, loss=0.0073, lr=3.47e-06, step=8733] Training: 87%|████████▋ | 8734/10000 [1:57:19<12:57, 1.63it/s, loss=0.0030, lr=3.47e-06, step=8734] Training: 87%|████████▋ | 8735/10000 [1:57:20<13:29, 1.56it/s, loss=0.0030, lr=3.47e-06, step=8734] Training: 87%|████████▋ | 8735/10000 [1:57:20<13:29, 1.56it/s, loss=0.0024, lr=3.47e-06, step=8735] Training: 87%|████████▋ | 8736/10000 [1:57:21<13:36, 1.55it/s, loss=0.0024, lr=3.47e-06, step=8735] Training: 87%|████████▋ | 8736/10000 [1:57:21<13:36, 1.55it/s, loss=0.0036, lr=3.47e-06, step=8736] Training: 87%|████████▋ | 8737/10000 [1:57:21<13:46, 1.53it/s, loss=0.0036, lr=3.47e-06, step=8736] Training: 87%|████████▋ | 8737/10000 [1:57:21<13:46, 1.53it/s, loss=0.0063, lr=3.47e-06, step=8737] Training: 87%|████████▋ | 8738/10000 [1:57:22<14:42, 1.43it/s, loss=0.0063, lr=3.47e-06, step=8737] Training: 87%|████████▋ | 8738/10000 [1:57:22<14:42, 1.43it/s, loss=0.0012, lr=3.47e-06, step=8738] Training: 87%|████████▋ | 8739/10000 [1:57:23<14:50, 1.42it/s, loss=0.0012, lr=3.47e-06, step=8738] Training: 87%|████████▋ | 8739/10000 [1:57:23<14:50, 1.42it/s, loss=0.0188, lr=3.47e-06, step=8739]20:41:55.909 [I] step=8740 loss=0.0138 smoothed_loss=0.0090 lr=3.47e-06 grad_norm=0.3856 step_time=0.5292s data_time=0.1117s it/s=1.561 eta_to_10000=807.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0113 grad_action_out_proj_arms=0.0892 grad_arm_token_fuse=0.0591 grad_shared_expert=0.3401 (18633:train_pytorch.py:850) + Training: 87%|████████▋ | 8740/10000 [1:57:24<14:07, 1.49it/s, loss=0.0188, lr=3.47e-06, step=8739] Training: 87%|████████▋ | 8740/10000 [1:57:24<14:07, 1.49it/s, loss=0.0138, lr=3.46e-06, step=8740] Training: 87%|████████▋ | 8741/10000 [1:57:24<13:12, 1.59it/s, loss=0.0138, lr=3.46e-06, step=8740] Training: 87%|████████▋ | 8741/10000 [1:57:24<13:12, 1.59it/s, loss=0.0182, lr=3.46e-06, step=8741] Training: 87%|████████▋ | 8742/10000 [1:57:25<14:58, 1.40it/s, loss=0.0182, lr=3.46e-06, step=8741] Training: 87%|████████▋ | 8742/10000 [1:57:25<14:58, 1.40it/s, loss=0.0055, lr=3.46e-06, step=8742] Training: 87%|████████▋ | 8743/10000 [1:57:26<15:47, 1.33it/s, loss=0.0055, lr=3.46e-06, step=8742] Training: 87%|████████▋ | 8743/10000 [1:57:26<15:47, 1.33it/s, loss=0.0135, lr=3.46e-06, step=8743] Training: 87%|████████▋ | 8744/10000 [1:57:26<14:56, 1.40it/s, loss=0.0135, lr=3.46e-06, step=8743] Training: 87%|████████▋ | 8744/10000 [1:57:26<14:56, 1.40it/s, loss=0.0110, lr=3.46e-06, step=8744] Training: 87%|████████▋ | 8745/10000 [1:57:27<14:09, 1.48it/s, loss=0.0110, lr=3.46e-06, step=8744] Training: 87%|████████▋ | 8745/10000 [1:57:27<14:09, 1.48it/s, loss=0.0055, lr=3.46e-06, step=8745] Training: 87%|████████▋ | 8746/10000 [1:57:28<13:01, 1.60it/s, loss=0.0055, lr=3.46e-06, step=8745] Training: 87%|████████▋ | 8746/10000 [1:57:28<13:01, 1.60it/s, loss=0.0059, lr=3.46e-06, step=8746] Training: 87%|████████▋ | 8747/10000 [1:57:28<13:46, 1.52it/s, loss=0.0059, lr=3.46e-06, step=8746] Training: 87%|████████▋ | 8747/10000 [1:57:28<13:46, 1.52it/s, loss=0.0582, lr=3.45e-06, step=8747] Training: 87%|████████▋ | 8748/10000 [1:57:29<14:18, 1.46it/s, loss=0.0582, lr=3.45e-06, step=8747] Training: 87%|████████▋ | 8748/10000 [1:57:29<14:18, 1.46it/s, loss=0.0056, lr=3.45e-06, step=8748] Training: 87%|████████▋ | 8749/10000 [1:57:30<15:11, 1.37it/s, loss=0.0056, lr=3.45e-06, step=8748] Training: 87%|████████▋ | 8749/10000 [1:57:30<15:11, 1.37it/s, loss=0.0038, lr=3.45e-06, step=8749]20:42:03.022 [I] step=8750 loss=0.0066 smoothed_loss=0.0117 lr=3.46e-06 grad_norm=0.4564 step_time=0.5976s data_time=0.1136s it/s=1.406 eta_to_10000=889.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0109 grad_action_out_proj_arms=0.0921 grad_arm_token_fuse=0.0553 grad_shared_expert=0.4222 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8750/10000 [1:57:31<15:35, 1.34it/s, loss=0.0038, lr=3.45e-06, step=8749] Training: 88%|████████▊ | 8750/10000 [1:57:31<15:35, 1.34it/s, loss=0.0066, lr=3.45e-06, step=8750] Training: 88%|████████▊ | 8751/10000 [1:57:31<15:39, 1.33it/s, loss=0.0066, lr=3.45e-06, step=8750] Training: 88%|████████▊ | 8751/10000 [1:57:31<15:39, 1.33it/s, loss=0.0009, lr=3.45e-06, step=8751] Training: 88%|████████▊ | 8752/10000 [1:57:32<14:04, 1.48it/s, loss=0.0009, lr=3.45e-06, step=8751] Training: 88%|████████▊ | 8752/10000 [1:57:32<14:04, 1.48it/s, loss=0.0072, lr=3.45e-06, step=8752] Training: 88%|████████▊ | 8753/10000 [1:57:32<12:50, 1.62it/s, loss=0.0072, lr=3.45e-06, step=8752] Training: 88%|████████▊ | 8753/10000 [1:57:32<12:50, 1.62it/s, loss=0.0090, lr=3.44e-06, step=8753] Training: 88%|████████▊ | 8754/10000 [1:57:33<12:26, 1.67it/s, loss=0.0090, lr=3.44e-06, step=8753] Training: 88%|████████▊ | 8754/10000 [1:57:33<12:26, 1.67it/s, loss=0.0067, lr=3.44e-06, step=8754] Training: 88%|████████▊ | 8755/10000 [1:57:34<12:01, 1.73it/s, loss=0.0067, lr=3.44e-06, step=8754] Training: 88%|████████▊ | 8755/10000 [1:57:34<12:01, 1.73it/s, loss=0.0321, lr=3.44e-06, step=8755] Training: 88%|████████▊ | 8756/10000 [1:57:34<12:25, 1.67it/s, loss=0.0321, lr=3.44e-06, step=8755] Training: 88%|████████▊ | 8756/10000 [1:57:34<12:25, 1.67it/s, loss=0.0126, lr=3.44e-06, step=8756] Training: 88%|████████▊ | 8757/10000 [1:57:35<14:28, 1.43it/s, loss=0.0126, lr=3.44e-06, step=8756] Training: 88%|████████▊ | 8757/10000 [1:57:35<14:28, 1.43it/s, loss=0.0151, lr=3.44e-06, step=8757] Training: 88%|████████▊ | 8758/10000 [1:57:36<13:12, 1.57it/s, loss=0.0151, lr=3.44e-06, step=8757] Training: 88%|████████▊ | 8758/10000 [1:57:36<13:12, 1.57it/s, loss=0.0113, lr=3.44e-06, step=8758] Training: 88%|████████▊ | 8759/10000 [1:57:36<12:31, 1.65it/s, loss=0.0113, lr=3.44e-06, step=8758] Training: 88%|████████▊ | 8759/10000 [1:57:36<12:31, 1.65it/s, loss=0.0017, lr=3.44e-06, step=8759]20:42:08.971 [I] step=8760 loss=0.0076 smoothed_loss=0.0109 lr=3.44e-06 grad_norm=0.3987 step_time=0.4992s data_time=0.0957s it/s=1.681 eta_to_10000=737.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0092 grad_action_out_proj_arms=0.0794 grad_arm_token_fuse=0.0464 grad_shared_expert=0.5099 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8760/10000 [1:57:37<11:56, 1.73it/s, loss=0.0017, lr=3.44e-06, step=8759] Training: 88%|████████▊ | 8760/10000 [1:57:37<11:56, 1.73it/s, loss=0.0076, lr=3.43e-06, step=8760] Training: 88%|████████▊ | 8761/10000 [1:57:37<11:34, 1.78it/s, loss=0.0076, lr=3.43e-06, step=8760] Training: 88%|████████▊ | 8761/10000 [1:57:37<11:34, 1.78it/s, loss=0.0032, lr=3.43e-06, step=8761] Training: 88%|████████▊ | 8762/10000 [1:57:38<12:29, 1.65it/s, loss=0.0032, lr=3.43e-06, step=8761] Training: 88%|████████▊ | 8762/10000 [1:57:38<12:29, 1.65it/s, loss=0.0020, lr=3.43e-06, step=8762] Training: 88%|████████▊ | 8763/10000 [1:57:38<11:45, 1.75it/s, loss=0.0020, lr=3.43e-06, step=8762] Training: 88%|████████▊ | 8763/10000 [1:57:38<11:45, 1.75it/s, loss=0.0094, lr=3.43e-06, step=8763] Training: 88%|████████▊ | 8764/10000 [1:57:39<12:23, 1.66it/s, loss=0.0094, lr=3.43e-06, step=8763] Training: 88%|████████▊ | 8764/10000 [1:57:39<12:23, 1.66it/s, loss=0.0021, lr=3.43e-06, step=8764] Training: 88%|████████▊ | 8765/10000 [1:57:40<13:00, 1.58it/s, loss=0.0021, lr=3.43e-06, step=8764] Training: 88%|████████▊ | 8765/10000 [1:57:40<13:00, 1.58it/s, loss=0.0059, lr=3.43e-06, step=8765] Training: 88%|████████▊ | 8766/10000 [1:57:40<12:56, 1.59it/s, loss=0.0059, lr=3.43e-06, step=8765] Training: 88%|████████▊ | 8766/10000 [1:57:40<12:56, 1.59it/s, loss=0.0039, lr=3.43e-06, step=8766] Training: 88%|████████▊ | 8767/10000 [1:57:41<14:32, 1.41it/s, loss=0.0039, lr=3.43e-06, step=8766] Training: 88%|████████▊ | 8767/10000 [1:57:41<14:32, 1.41it/s, loss=0.0108, lr=3.42e-06, step=8767] Training: 88%|████████▊ | 8768/10000 [1:57:42<13:14, 1.55it/s, loss=0.0108, lr=3.42e-06, step=8767] Training: 88%|████████▊ | 8768/10000 [1:57:42<13:14, 1.55it/s, loss=0.0431, lr=3.42e-06, step=8768] Training: 88%|████████▊ | 8769/10000 [1:57:42<12:24, 1.65it/s, loss=0.0431, lr=3.42e-06, step=8768] Training: 88%|████████▊ | 8769/10000 [1:57:42<12:24, 1.65it/s, loss=0.0021, lr=3.42e-06, step=8769]20:42:15.257 [I] step=8770 loss=0.0046 smoothed_loss=0.0101 lr=3.43e-06 grad_norm=0.4024 step_time=0.5240s data_time=0.1046s it/s=1.591 eta_to_10000=773.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0091 grad_action_out_proj_arms=0.0825 grad_arm_token_fuse=0.0499 grad_shared_expert=0.5755 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8770/10000 [1:57:43<12:47, 1.60it/s, loss=0.0021, lr=3.42e-06, step=8769] Training: 88%|████████▊ | 8770/10000 [1:57:43<12:47, 1.60it/s, loss=0.0046, lr=3.42e-06, step=8770] Training: 88%|████████▊ | 8771/10000 [1:57:44<12:56, 1.58it/s, loss=0.0046, lr=3.42e-06, step=8770] Training: 88%|████████▊ | 8771/10000 [1:57:44<12:56, 1.58it/s, loss=0.0031, lr=3.42e-06, step=8771] Training: 88%|████████▊ | 8772/10000 [1:57:45<14:51, 1.38it/s, loss=0.0031, lr=3.42e-06, step=8771] Training: 88%|████████▊ | 8772/10000 [1:57:45<14:51, 1.38it/s, loss=0.0047, lr=3.42e-06, step=8772] Training: 88%|████████▊ | 8773/10000 [1:57:45<14:19, 1.43it/s, loss=0.0047, lr=3.42e-06, step=8772] Training: 88%|████████▊ | 8773/10000 [1:57:45<14:19, 1.43it/s, loss=0.0061, lr=3.41e-06, step=8773] Training: 88%|████████▊ | 8774/10000 [1:57:46<14:19, 1.43it/s, loss=0.0061, lr=3.41e-06, step=8773] Training: 88%|████████▊ | 8774/10000 [1:57:46<14:19, 1.43it/s, loss=0.0213, lr=3.41e-06, step=8774] Training: 88%|████████▊ | 8775/10000 [1:57:47<13:56, 1.46it/s, loss=0.0213, lr=3.41e-06, step=8774] Training: 88%|████████▊ | 8775/10000 [1:57:47<13:56, 1.46it/s, loss=0.0095, lr=3.41e-06, step=8775] Training: 88%|████████▊ | 8776/10000 [1:57:47<14:13, 1.43it/s, loss=0.0095, lr=3.41e-06, step=8775] Training: 88%|████████▊ | 8776/10000 [1:57:47<14:13, 1.43it/s, loss=0.0233, lr=3.41e-06, step=8776] Training: 88%|████████▊ | 8777/10000 [1:57:48<14:18, 1.42it/s, loss=0.0233, lr=3.41e-06, step=8776] Training: 88%|████████▊ | 8777/10000 [1:57:48<14:18, 1.42it/s, loss=0.0011, lr=3.41e-06, step=8777] Training: 88%|████████▊ | 8778/10000 [1:57:49<14:06, 1.44it/s, loss=0.0011, lr=3.41e-06, step=8777] Training: 88%|████████▊ | 8778/10000 [1:57:49<14:06, 1.44it/s, loss=0.0076, lr=3.41e-06, step=8778] Training: 88%|████████▊ | 8779/10000 [1:57:49<14:09, 1.44it/s, loss=0.0076, lr=3.41e-06, step=8778] Training: 88%|████████▊ | 8779/10000 [1:57:49<14:09, 1.44it/s, loss=0.0191, lr=3.41e-06, step=8779]20:42:22.529 [I] step=8780 loss=0.0078 smoothed_loss=0.0106 lr=3.41e-06 grad_norm=0.4681 step_time=0.6108s data_time=0.1163s it/s=1.375 eta_to_10000=887.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0054 grad_action_out_proj_arms=0.0637 grad_arm_token_fuse=0.0261 grad_shared_expert=0.3466 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8780/10000 [1:57:50<15:13, 1.34it/s, loss=0.0191, lr=3.41e-06, step=8779] Training: 88%|████████▊ | 8780/10000 [1:57:50<15:13, 1.34it/s, loss=0.0078, lr=3.40e-06, step=8780] Training: 88%|████████▊ | 8781/10000 [1:57:51<13:40, 1.49it/s, loss=0.0078, lr=3.40e-06, step=8780] Training: 88%|████████▊ | 8781/10000 [1:57:51<13:40, 1.49it/s, loss=0.0049, lr=3.40e-06, step=8781] Training: 88%|████████▊ | 8782/10000 [1:57:51<14:13, 1.43it/s, loss=0.0049, lr=3.40e-06, step=8781] Training: 88%|████████▊ | 8782/10000 [1:57:51<14:13, 1.43it/s, loss=0.0053, lr=3.40e-06, step=8782] Training: 88%|████████▊ | 8783/10000 [1:57:52<12:59, 1.56it/s, loss=0.0053, lr=3.40e-06, step=8782] Training: 88%|████████▊ | 8783/10000 [1:57:52<12:59, 1.56it/s, loss=0.0011, lr=3.40e-06, step=8783] Training: 88%|████████▊ | 8784/10000 [1:57:52<11:57, 1.70it/s, loss=0.0011, lr=3.40e-06, step=8783] Training: 88%|████████▊ | 8784/10000 [1:57:52<11:57, 1.70it/s, loss=0.0136, lr=3.40e-06, step=8784] Training: 88%|████████▊ | 8785/10000 [1:57:53<14:47, 1.37it/s, loss=0.0136, lr=3.40e-06, step=8784] Training: 88%|████████▊ | 8785/10000 [1:57:53<14:47, 1.37it/s, loss=0.0300, lr=3.40e-06, step=8785] Training: 88%|████████▊ | 8786/10000 [1:57:54<15:48, 1.28it/s, loss=0.0300, lr=3.40e-06, step=8785] Training: 88%|████████▊ | 8786/10000 [1:57:54<15:48, 1.28it/s, loss=0.0054, lr=3.40e-06, step=8786] Training: 88%|████████▊ | 8787/10000 [1:57:55<13:58, 1.45it/s, loss=0.0054, lr=3.40e-06, step=8786] Training: 88%|████████▊ | 8787/10000 [1:57:55<13:58, 1.45it/s, loss=0.0183, lr=3.39e-06, step=8787] Training: 88%|████████▊ | 8788/10000 [1:57:55<12:37, 1.60it/s, loss=0.0183, lr=3.39e-06, step=8787] Training: 88%|████████▊ | 8788/10000 [1:57:55<12:37, 1.60it/s, loss=0.0551, lr=3.39e-06, step=8788] Training: 88%|████████▊ | 8789/10000 [1:57:56<11:46, 1.71it/s, loss=0.0551, lr=3.39e-06, step=8788] Training: 88%|████████▊ | 8789/10000 [1:57:56<11:46, 1.71it/s, loss=0.0307, lr=3.39e-06, step=8789]20:42:28.754 [I] step=8790 loss=0.0142 smoothed_loss=0.0170 lr=3.40e-06 grad_norm=0.5597 step_time=0.5163s data_time=0.1062s it/s=1.607 eta_to_10000=753.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0098 grad_action_out_proj_arms=0.1108 grad_arm_token_fuse=0.0529 grad_shared_expert=0.3256 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8790/10000 [1:57:56<11:50, 1.70it/s, loss=0.0307, lr=3.39e-06, step=8789] Training: 88%|████████▊ | 8790/10000 [1:57:56<11:50, 1.70it/s, loss=0.0142, lr=3.39e-06, step=8790] Training: 88%|████████▊ | 8791/10000 [1:57:57<12:01, 1.68it/s, loss=0.0142, lr=3.39e-06, step=8790] Training: 88%|████████▊ | 8791/10000 [1:57:57<12:01, 1.68it/s, loss=0.0033, lr=3.39e-06, step=8791] Training: 88%|████████▊ | 8792/10000 [1:57:58<12:33, 1.60it/s, loss=0.0033, lr=3.39e-06, step=8791] Training: 88%|████████▊ | 8792/10000 [1:57:58<12:33, 1.60it/s, loss=0.0029, lr=3.39e-06, step=8792] Training: 88%|████████▊ | 8793/10000 [1:57:59<15:12, 1.32it/s, loss=0.0029, lr=3.39e-06, step=8792] Training: 88%|████████▊ | 8793/10000 [1:57:59<15:12, 1.32it/s, loss=0.0072, lr=3.39e-06, step=8793] Training: 88%|████████▊ | 8794/10000 [1:57:59<13:26, 1.49it/s, loss=0.0072, lr=3.39e-06, step=8793] Training: 88%|████████▊ | 8794/10000 [1:57:59<13:26, 1.49it/s, loss=0.0067, lr=3.38e-06, step=8794] Training: 88%|████████▊ | 8795/10000 [1:58:00<13:57, 1.44it/s, loss=0.0067, lr=3.38e-06, step=8794] Training: 88%|████████▊ | 8795/10000 [1:58:00<13:57, 1.44it/s, loss=0.0026, lr=3.38e-06, step=8795] Training: 88%|████████▊ | 8796/10000 [1:58:01<12:53, 1.56it/s, loss=0.0026, lr=3.38e-06, step=8795] Training: 88%|████████▊ | 8796/10000 [1:58:01<12:53, 1.56it/s, loss=0.0082, lr=3.38e-06, step=8796] Training: 88%|████████▊ | 8797/10000 [1:58:01<13:53, 1.44it/s, loss=0.0082, lr=3.38e-06, step=8796] Training: 88%|████████▊ | 8797/10000 [1:58:01<13:53, 1.44it/s, loss=0.0177, lr=3.38e-06, step=8797] Training: 88%|████████▊ | 8798/10000 [1:58:02<12:34, 1.59it/s, loss=0.0177, lr=3.38e-06, step=8797] Training: 88%|████████▊ | 8798/10000 [1:58:02<12:34, 1.59it/s, loss=0.0633, lr=3.38e-06, step=8798] Training: 88%|████████▊ | 8799/10000 [1:58:02<11:37, 1.72it/s, loss=0.0633, lr=3.38e-06, step=8798] Training: 88%|████████▊ | 8799/10000 [1:58:02<11:37, 1.72it/s, loss=0.0020, lr=3.38e-06, step=8799]20:42:35.348 [I] step=8800 loss=0.0067 smoothed_loss=0.0148 lr=3.38e-06 grad_norm=0.4740 step_time=0.5462s data_time=0.1132s it/s=1.517 eta_to_10000=791.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0066 grad_action_out_proj_arms=0.1152 grad_arm_token_fuse=0.0340 grad_shared_expert=0.7290 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8800/10000 [1:58:03<12:28, 1.60it/s, loss=0.0020, lr=3.38e-06, step=8799] Training: 88%|████████▊ | 8800/10000 [1:58:03<12:28, 1.60it/s, loss=0.0067, lr=3.38e-06, step=8800] Training: 88%|████████▊ | 8801/10000 [1:58:03<11:31, 1.73it/s, loss=0.0067, lr=3.38e-06, step=8800] Training: 88%|████████▊ | 8801/10000 [1:58:03<11:31, 1.73it/s, loss=0.0038, lr=3.37e-06, step=8801] Training: 88%|████████▊ | 8802/10000 [1:58:04<10:53, 1.83it/s, loss=0.0038, lr=3.37e-06, step=8801] Training: 88%|████████▊ | 8802/10000 [1:58:04<10:53, 1.83it/s, loss=0.0044, lr=3.37e-06, step=8802] Training: 88%|████████▊ | 8803/10000 [1:58:04<10:43, 1.86it/s, loss=0.0044, lr=3.37e-06, step=8802] Training: 88%|████████▊ | 8803/10000 [1:58:04<10:43, 1.86it/s, loss=0.0130, lr=3.37e-06, step=8803] Training: 88%|████████▊ | 8804/10000 [1:58:05<11:14, 1.77it/s, loss=0.0130, lr=3.37e-06, step=8803] Training: 88%|████████▊ | 8804/10000 [1:58:05<11:14, 1.77it/s, loss=0.0075, lr=3.37e-06, step=8804] Training: 88%|████████▊ | 8805/10000 [1:58:06<10:48, 1.84it/s, loss=0.0075, lr=3.37e-06, step=8804] Training: 88%|████████▊ | 8805/10000 [1:58:06<10:48, 1.84it/s, loss=0.0058, lr=3.37e-06, step=8805] Training: 88%|████████▊ | 8806/10000 [1:58:06<10:48, 1.84it/s, loss=0.0058, lr=3.37e-06, step=8805] Training: 88%|████████▊ | 8806/10000 [1:58:06<10:48, 1.84it/s, loss=0.0061, lr=3.37e-06, step=8806] Training: 88%|████████▊ | 8807/10000 [1:58:07<11:31, 1.72it/s, loss=0.0061, lr=3.37e-06, step=8806] Training: 88%|████████▊ | 8807/10000 [1:58:07<11:31, 1.72it/s, loss=0.0083, lr=3.37e-06, step=8807] Training: 88%|████████▊ | 8808/10000 [1:58:07<10:53, 1.82it/s, loss=0.0083, lr=3.37e-06, step=8807] Training: 88%|████████▊ | 8808/10000 [1:58:07<10:53, 1.82it/s, loss=0.0260, lr=3.36e-06, step=8808] Training: 88%|████████▊ | 8809/10000 [1:58:08<11:51, 1.67it/s, loss=0.0260, lr=3.36e-06, step=8808] Training: 88%|████████▊ | 8809/10000 [1:58:08<11:51, 1.67it/s, loss=0.0077, lr=3.36e-06, step=8809]20:42:40.969 [I] step=8810 loss=0.0156 smoothed_loss=0.0122 lr=3.37e-06 grad_norm=0.4279 step_time=0.4755s data_time=0.0867s it/s=1.779 eta_to_10000=668.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0178 grad_action_out_proj_arms=0.1708 grad_arm_token_fuse=0.0944 grad_shared_expert=0.5746 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8810/10000 [1:58:09<12:08, 1.63it/s, loss=0.0077, lr=3.36e-06, step=8809] Training: 88%|████████▊ | 8810/10000 [1:58:09<12:08, 1.63it/s, loss=0.0156, lr=3.36e-06, step=8810] Training: 88%|████████▊ | 8811/10000 [1:58:09<11:53, 1.67it/s, loss=0.0156, lr=3.36e-06, step=8810] Training: 88%|████████▊ | 8811/10000 [1:58:09<11:53, 1.67it/s, loss=0.0070, lr=3.36e-06, step=8811] Training: 88%|████████▊ | 8812/10000 [1:58:10<12:42, 1.56it/s, loss=0.0070, lr=3.36e-06, step=8811] Training: 88%|████████▊ | 8812/10000 [1:58:10<12:42, 1.56it/s, loss=0.0088, lr=3.36e-06, step=8812] Training: 88%|████████▊ | 8813/10000 [1:58:11<12:24, 1.59it/s, loss=0.0088, lr=3.36e-06, step=8812] Training: 88%|████████▊ | 8813/10000 [1:58:11<12:24, 1.59it/s, loss=0.0080, lr=3.36e-06, step=8813] Training: 88%|████████▊ | 8814/10000 [1:58:11<13:21, 1.48it/s, loss=0.0080, lr=3.36e-06, step=8813] Training: 88%|████████▊ | 8814/10000 [1:58:11<13:21, 1.48it/s, loss=0.0106, lr=3.36e-06, step=8814] Training: 88%|████████▊ | 8815/10000 [1:58:12<14:56, 1.32it/s, loss=0.0106, lr=3.36e-06, step=8814] Training: 88%|████████▊ | 8815/10000 [1:58:12<14:56, 1.32it/s, loss=0.0357, lr=3.35e-06, step=8815] Training: 88%|████████▊ | 8816/10000 [1:58:13<13:14, 1.49it/s, loss=0.0357, lr=3.35e-06, step=8815] Training: 88%|████████▊ | 8816/10000 [1:58:13<13:14, 1.49it/s, loss=0.0330, lr=3.35e-06, step=8816] Training: 88%|████████▊ | 8817/10000 [1:58:13<12:28, 1.58it/s, loss=0.0330, lr=3.35e-06, step=8816] Training: 88%|████████▊ | 8817/10000 [1:58:13<12:28, 1.58it/s, loss=0.0197, lr=3.35e-06, step=8817] Training: 88%|████████▊ | 8818/10000 [1:58:14<11:33, 1.70it/s, loss=0.0197, lr=3.35e-06, step=8817] Training: 88%|████████▊ | 8818/10000 [1:58:14<11:33, 1.70it/s, loss=0.0013, lr=3.35e-06, step=8818] Training: 88%|████████▊ | 8819/10000 [1:58:14<10:59, 1.79it/s, loss=0.0013, lr=3.35e-06, step=8818] Training: 88%|████████▊ | 8819/10000 [1:58:14<10:59, 1.79it/s, loss=0.0055, lr=3.35e-06, step=8819]20:42:47.108 [I] step=8820 loss=0.0225 smoothed_loss=0.0144 lr=3.35e-06 grad_norm=0.4679 step_time=0.5073s data_time=0.1066s it/s=1.629 eta_to_10000=724.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0376 grad_action_out_proj_arms=0.2192 grad_arm_token_fuse=0.2105 grad_shared_expert=0.6616 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8820/10000 [1:58:15<10:43, 1.83it/s, loss=0.0055, lr=3.35e-06, step=8819] Training: 88%|████████▊ | 8820/10000 [1:58:15<10:43, 1.83it/s, loss=0.0225, lr=3.35e-06, step=8820] Training: 88%|████████▊ | 8821/10000 [1:58:15<11:23, 1.72it/s, loss=0.0225, lr=3.35e-06, step=8820] Training: 88%|████████▊ | 8821/10000 [1:58:15<11:23, 1.72it/s, loss=0.0082, lr=3.35e-06, step=8821] Training: 88%|████████▊ | 8822/10000 [1:58:16<11:59, 1.64it/s, loss=0.0082, lr=3.35e-06, step=8821] Training: 88%|████████▊ | 8822/10000 [1:58:16<11:59, 1.64it/s, loss=0.0053, lr=3.34e-06, step=8822] Training: 88%|████████▊ | 8823/10000 [1:58:17<11:20, 1.73it/s, loss=0.0053, lr=3.34e-06, step=8822] Training: 88%|████████▊ | 8823/10000 [1:58:17<11:20, 1.73it/s, loss=0.0025, lr=3.34e-06, step=8823] Training: 88%|████████▊ | 8824/10000 [1:58:17<10:43, 1.83it/s, loss=0.0025, lr=3.34e-06, step=8823] Training: 88%|████████▊ | 8824/10000 [1:58:17<10:43, 1.83it/s, loss=0.0027, lr=3.34e-06, step=8824] Training: 88%|████████▊ | 8825/10000 [1:58:18<10:20, 1.89it/s, loss=0.0027, lr=3.34e-06, step=8824] Training: 88%|████████▊ | 8825/10000 [1:58:18<10:20, 1.89it/s, loss=0.0085, lr=3.34e-06, step=8825] Training: 88%|████████▊ | 8826/10000 [1:58:18<10:03, 1.94it/s, loss=0.0085, lr=3.34e-06, step=8825] Training: 88%|████████▊ | 8826/10000 [1:58:18<10:03, 1.94it/s, loss=0.0065, lr=3.34e-06, step=8826] Training: 88%|████████▊ | 8827/10000 [1:58:19<09:56, 1.97it/s, loss=0.0065, lr=3.34e-06, step=8826] Training: 88%|████████▊ | 8827/10000 [1:58:19<09:56, 1.97it/s, loss=0.0054, lr=3.34e-06, step=8827] Training: 88%|████████▊ | 8828/10000 [1:58:19<10:52, 1.80it/s, loss=0.0054, lr=3.34e-06, step=8827] Training: 88%|████████▊ | 8828/10000 [1:58:19<10:52, 1.80it/s, loss=0.0041, lr=3.34e-06, step=8828] Training: 88%|████████▊ | 8829/10000 [1:58:20<11:41, 1.67it/s, loss=0.0041, lr=3.34e-06, step=8828] Training: 88%|████████▊ | 8829/10000 [1:58:20<11:41, 1.67it/s, loss=0.0017, lr=3.33e-06, step=8829]20:42:52.784 [I] step=8830 loss=0.0039 smoothed_loss=0.0080 lr=3.34e-06 grad_norm=0.3591 step_time=0.5026s data_time=0.0650s it/s=1.762 eta_to_10000=663.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0040 grad_action_out_proj_arms=0.0502 grad_arm_token_fuse=0.0217 grad_shared_expert=0.2223 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8830/10000 [1:58:20<11:16, 1.73it/s, loss=0.0017, lr=3.33e-06, step=8829] Training: 88%|████████▊ | 8830/10000 [1:58:20<11:16, 1.73it/s, loss=0.0039, lr=3.33e-06, step=8830] Training: 88%|████████▊ | 8831/10000 [1:58:21<10:53, 1.79it/s, loss=0.0039, lr=3.33e-06, step=8830] Training: 88%|████████▊ | 8831/10000 [1:58:21<10:53, 1.79it/s, loss=0.0032, lr=3.33e-06, step=8831] Training: 88%|████████▊ | 8832/10000 [1:58:21<10:29, 1.86it/s, loss=0.0032, lr=3.33e-06, step=8831] Training: 88%|████████▊ | 8832/10000 [1:58:21<10:29, 1.86it/s, loss=0.0097, lr=3.33e-06, step=8832] Training: 88%|████████▊ | 8833/10000 [1:58:22<10:17, 1.89it/s, loss=0.0097, lr=3.33e-06, step=8832] Training: 88%|████████▊ | 8833/10000 [1:58:22<10:17, 1.89it/s, loss=0.0140, lr=3.33e-06, step=8833] Training: 88%|████████▊ | 8834/10000 [1:58:23<11:18, 1.72it/s, loss=0.0140, lr=3.33e-06, step=8833] Training: 88%|████████▊ | 8834/10000 [1:58:23<11:18, 1.72it/s, loss=0.0034, lr=3.33e-06, step=8834] Training: 88%|████████▊ | 8835/10000 [1:58:23<12:06, 1.60it/s, loss=0.0034, lr=3.33e-06, step=8834] Training: 88%|████████▊ | 8835/10000 [1:58:23<12:06, 1.60it/s, loss=0.0071, lr=3.33e-06, step=8835] Training: 88%|████████▊ | 8836/10000 [1:58:24<13:11, 1.47it/s, loss=0.0071, lr=3.33e-06, step=8835] Training: 88%|████████▊ | 8836/10000 [1:58:24<13:11, 1.47it/s, loss=0.0192, lr=3.32e-06, step=8836] Training: 88%|████████▊ | 8837/10000 [1:58:25<13:14, 1.46it/s, loss=0.0192, lr=3.32e-06, step=8836] Training: 88%|████████▊ | 8837/10000 [1:58:25<13:14, 1.46it/s, loss=0.0256, lr=3.32e-06, step=8837] Training: 88%|████████▊ | 8838/10000 [1:58:26<13:08, 1.47it/s, loss=0.0256, lr=3.32e-06, step=8837] Training: 88%|████████▊ | 8838/10000 [1:58:26<13:08, 1.47it/s, loss=0.0018, lr=3.32e-06, step=8838] Training: 88%|████████▊ | 8839/10000 [1:58:26<12:31, 1.54it/s, loss=0.0018, lr=3.32e-06, step=8838] Training: 88%|████████▊ | 8839/10000 [1:58:26<12:31, 1.54it/s, loss=0.0069, lr=3.32e-06, step=8839]20:42:59.054 [I] step=8840 loss=0.0048 smoothed_loss=0.0090 lr=3.33e-06 grad_norm=0.4082 step_time=0.5235s data_time=0.1035s it/s=1.595 eta_to_10000=727.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.0873 grad_arm_token_fuse=0.0459 grad_shared_expert=0.3215 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8840/10000 [1:58:27<12:09, 1.59it/s, loss=0.0069, lr=3.32e-06, step=8839] Training: 88%|████████▊ | 8840/10000 [1:58:27<12:09, 1.59it/s, loss=0.0048, lr=3.32e-06, step=8840] Training: 88%|████████▊ | 8841/10000 [1:58:27<11:55, 1.62it/s, loss=0.0048, lr=3.32e-06, step=8840] Training: 88%|████████▊ | 8841/10000 [1:58:27<11:55, 1.62it/s, loss=0.0207, lr=3.32e-06, step=8841] Training: 88%|████████▊ | 8842/10000 [1:58:28<11:59, 1.61it/s, loss=0.0207, lr=3.32e-06, step=8841] Training: 88%|████████▊ | 8842/10000 [1:58:28<11:59, 1.61it/s, loss=0.0094, lr=3.32e-06, step=8842] Training: 88%|████████▊ | 8843/10000 [1:58:29<12:26, 1.55it/s, loss=0.0094, lr=3.32e-06, step=8842] Training: 88%|████████▊ | 8843/10000 [1:58:29<12:26, 1.55it/s, loss=0.0060, lr=3.31e-06, step=8843] Training: 88%|████████▊ | 8844/10000 [1:58:29<11:28, 1.68it/s, loss=0.0060, lr=3.31e-06, step=8843] Training: 88%|████████▊ | 8844/10000 [1:58:29<11:28, 1.68it/s, loss=0.0093, lr=3.31e-06, step=8844] Training: 88%|████████▊ | 8845/10000 [1:58:30<12:43, 1.51it/s, loss=0.0093, lr=3.31e-06, step=8844] Training: 88%|████████▊ | 8845/10000 [1:58:30<12:43, 1.51it/s, loss=0.0040, lr=3.31e-06, step=8845] Training: 88%|████████▊ | 8846/10000 [1:58:30<11:47, 1.63it/s, loss=0.0040, lr=3.31e-06, step=8845] Training: 88%|████████▊ | 8846/10000 [1:58:30<11:47, 1.63it/s, loss=0.0075, lr=3.31e-06, step=8846] Training: 88%|████████▊ | 8847/10000 [1:58:31<12:17, 1.56it/s, loss=0.0075, lr=3.31e-06, step=8846] Training: 88%|████████▊ | 8847/10000 [1:58:31<12:17, 1.56it/s, loss=0.0041, lr=3.31e-06, step=8847] Training: 88%|████████▊ | 8848/10000 [1:58:32<11:57, 1.61it/s, loss=0.0041, lr=3.31e-06, step=8847] Training: 88%|████████▊ | 8848/10000 [1:58:32<11:57, 1.61it/s, loss=0.0127, lr=3.31e-06, step=8848] Training: 88%|████████▊ | 8849/10000 [1:58:32<11:13, 1.71it/s, loss=0.0127, lr=3.31e-06, step=8848] Training: 88%|████████▊ | 8849/10000 [1:58:32<11:13, 1.71it/s, loss=0.0036, lr=3.31e-06, step=8849]20:43:05.360 [I] step=8850 loss=0.0072 smoothed_loss=0.0082 lr=3.31e-06 grad_norm=0.4621 step_time=0.5117s data_time=0.1189s it/s=1.586 eta_to_10000=725.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.0918 grad_arm_token_fuse=0.0494 grad_shared_expert=0.3891 (18633:train_pytorch.py:850) + Training: 88%|████████▊ | 8850/10000 [1:58:33<12:29, 1.53it/s, loss=0.0036, lr=3.31e-06, step=8849] Training: 88%|████████▊ | 8850/10000 [1:58:33<12:29, 1.53it/s, loss=0.0072, lr=3.31e-06, step=8850] Training: 89%|████████▊ | 8851/10000 [1:58:34<11:32, 1.66it/s, loss=0.0072, lr=3.31e-06, step=8850] Training: 89%|████████▊ | 8851/10000 [1:58:34<11:32, 1.66it/s, loss=0.0035, lr=3.30e-06, step=8851] Training: 89%|████████▊ | 8852/10000 [1:58:34<11:06, 1.72it/s, loss=0.0035, lr=3.30e-06, step=8851] Training: 89%|████████▊ | 8852/10000 [1:58:34<11:06, 1.72it/s, loss=0.0018, lr=3.30e-06, step=8852] Training: 89%|████████▊ | 8853/10000 [1:58:35<11:28, 1.66it/s, loss=0.0018, lr=3.30e-06, step=8852] Training: 89%|████████▊ | 8853/10000 [1:58:35<11:28, 1.66it/s, loss=0.0018, lr=3.30e-06, step=8853] Training: 89%|████████▊ | 8854/10000 [1:58:35<11:00, 1.74it/s, loss=0.0018, lr=3.30e-06, step=8853] Training: 89%|████████▊ | 8854/10000 [1:58:35<11:00, 1.74it/s, loss=0.0025, lr=3.30e-06, step=8854] Training: 89%|████████▊ | 8855/10000 [1:58:36<12:48, 1.49it/s, loss=0.0025, lr=3.30e-06, step=8854] Training: 89%|████████▊ | 8855/10000 [1:58:36<12:48, 1.49it/s, loss=0.0014, lr=3.30e-06, step=8855] Training: 89%|████████▊ | 8856/10000 [1:58:37<12:04, 1.58it/s, loss=0.0014, lr=3.30e-06, step=8855] Training: 89%|████████▊ | 8856/10000 [1:58:37<12:04, 1.58it/s, loss=0.0104, lr=3.30e-06, step=8856] Training: 89%|████████▊ | 8857/10000 [1:58:37<12:46, 1.49it/s, loss=0.0104, lr=3.30e-06, step=8856] Training: 89%|████████▊ | 8857/10000 [1:58:37<12:46, 1.49it/s, loss=0.0046, lr=3.30e-06, step=8857] Training: 89%|████████▊ | 8858/10000 [1:58:38<14:21, 1.33it/s, loss=0.0046, lr=3.30e-06, step=8857] Training: 89%|████████▊ | 8858/10000 [1:58:38<14:21, 1.33it/s, loss=0.0071, lr=3.29e-06, step=8858] Training: 89%|████████▊ | 8859/10000 [1:58:39<15:33, 1.22it/s, loss=0.0071, lr=3.29e-06, step=8858] Training: 89%|████████▊ | 8859/10000 [1:58:39<15:33, 1.22it/s, loss=0.0129, lr=3.29e-06, step=8859]20:43:12.223 [I] step=8860 loss=0.0018 smoothed_loss=0.0063 lr=3.30e-06 grad_norm=0.3892 step_time=0.5351s data_time=0.1512s it/s=1.457 eta_to_10000=782.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0141 grad_action_out_proj_arms=0.1198 grad_arm_token_fuse=0.0769 grad_shared_expert=0.4590 (18633:train_pytorch.py:850) + Training: 89%|████████▊ | 8860/10000 [1:58:40<14:06, 1.35it/s, loss=0.0129, lr=3.29e-06, step=8859] Training: 89%|████████▊ | 8860/10000 [1:58:40<14:06, 1.35it/s, loss=0.0018, lr=3.29e-06, step=8860] Training: 89%|████████▊ | 8861/10000 [1:58:40<12:46, 1.49it/s, loss=0.0018, lr=3.29e-06, step=8860] Training: 89%|████████▊ | 8861/10000 [1:58:40<12:46, 1.49it/s, loss=0.0059, lr=3.29e-06, step=8861] Training: 89%|████████▊ | 8862/10000 [1:58:41<12:12, 1.55it/s, loss=0.0059, lr=3.29e-06, step=8861] Training: 89%|████████▊ | 8862/10000 [1:58:41<12:12, 1.55it/s, loss=0.0092, lr=3.29e-06, step=8862] Training: 89%|████████▊ | 8863/10000 [1:58:42<12:45, 1.49it/s, loss=0.0092, lr=3.29e-06, step=8862] Training: 89%|████████▊ | 8863/10000 [1:58:42<12:45, 1.49it/s, loss=0.0040, lr=3.29e-06, step=8863] Training: 89%|████████▊ | 8864/10000 [1:58:43<14:02, 1.35it/s, loss=0.0040, lr=3.29e-06, step=8863] Training: 89%|████████▊ | 8864/10000 [1:58:43<14:02, 1.35it/s, loss=0.0040, lr=3.29e-06, step=8864] Training: 89%|████████▊ | 8865/10000 [1:58:44<15:37, 1.21it/s, loss=0.0040, lr=3.29e-06, step=8864] Training: 89%|████████▊ | 8865/10000 [1:58:44<15:37, 1.21it/s, loss=0.0050, lr=3.28e-06, step=8865] Training: 89%|████████▊ | 8866/10000 [1:58:45<16:54, 1.12it/s, loss=0.0050, lr=3.28e-06, step=8865] Training: 89%|████████▊ | 8866/10000 [1:58:45<16:54, 1.12it/s, loss=0.0108, lr=3.28e-06, step=8866] Training: 89%|████████▊ | 8867/10000 [1:58:46<16:41, 1.13it/s, loss=0.0108, lr=3.28e-06, step=8866] Training: 89%|████████▊ | 8867/10000 [1:58:46<16:41, 1.13it/s, loss=0.0206, lr=3.28e-06, step=8867] Training: 89%|████████▊ | 8868/10000 [1:58:46<14:25, 1.31it/s, loss=0.0206, lr=3.28e-06, step=8867] Training: 89%|████████▊ | 8868/10000 [1:58:46<14:25, 1.31it/s, loss=0.0060, lr=3.28e-06, step=8868] Training: 89%|████████▊ | 8869/10000 [1:58:47<13:17, 1.42it/s, loss=0.0060, lr=3.28e-06, step=8868] Training: 89%|████████▊ | 8869/10000 [1:58:47<13:17, 1.42it/s, loss=0.0097, lr=3.28e-06, step=8869]20:43:19.462 [I] step=8870 loss=0.0081 smoothed_loss=0.0079 lr=3.28e-06 grad_norm=0.3240 step_time=0.5695s data_time=0.1544s it/s=1.381 eta_to_10000=818.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0055 grad_action_out_proj_arms=0.0626 grad_arm_token_fuse=0.0301 grad_shared_expert=0.2093 (18633:train_pytorch.py:850) + Training: 89%|████████▊ | 8870/10000 [1:58:47<12:14, 1.54it/s, loss=0.0097, lr=3.28e-06, step=8869] Training: 89%|████████▊ | 8870/10000 [1:58:47<12:14, 1.54it/s, loss=0.0081, lr=3.28e-06, step=8870] Training: 89%|████████▊ | 8871/10000 [1:58:48<13:55, 1.35it/s, loss=0.0081, lr=3.28e-06, step=8870] Training: 89%|████████▊ | 8871/10000 [1:58:48<13:55, 1.35it/s, loss=0.0033, lr=3.28e-06, step=8871] Training: 89%|████████▊ | 8872/10000 [1:58:49<16:04, 1.17it/s, loss=0.0033, lr=3.28e-06, step=8871] Training: 89%|████████▊ | 8872/10000 [1:58:49<16:04, 1.17it/s, loss=0.0189, lr=3.28e-06, step=8872] Training: 89%|████████▊ | 8873/10000 [1:58:50<14:52, 1.26it/s, loss=0.0189, lr=3.28e-06, step=8872] Training: 89%|████████▊ | 8873/10000 [1:58:50<14:52, 1.26it/s, loss=0.0204, lr=3.27e-06, step=8873] Training: 89%|████████▊ | 8874/10000 [1:58:50<13:08, 1.43it/s, loss=0.0204, lr=3.27e-06, step=8873] Training: 89%|████████▊ | 8874/10000 [1:58:50<13:08, 1.43it/s, loss=0.0106, lr=3.27e-06, step=8874] Training: 89%|████████▉ | 8875/10000 [1:58:51<11:58, 1.57it/s, loss=0.0106, lr=3.27e-06, step=8874] Training: 89%|████████▉ | 8875/10000 [1:58:51<11:58, 1.57it/s, loss=0.0051, lr=3.27e-06, step=8875] Training: 89%|████████▉ | 8876/10000 [1:58:51<11:07, 1.68it/s, loss=0.0051, lr=3.27e-06, step=8875] Training: 89%|████████▉ | 8876/10000 [1:58:51<11:07, 1.68it/s, loss=0.0052, lr=3.27e-06, step=8876] Training: 89%|████████▉ | 8877/10000 [1:58:52<10:32, 1.78it/s, loss=0.0052, lr=3.27e-06, step=8876] Training: 89%|████████▉ | 8877/10000 [1:58:52<10:32, 1.78it/s, loss=0.0400, lr=3.27e-06, step=8877] Training: 89%|████████▉ | 8878/10000 [1:58:53<12:40, 1.47it/s, loss=0.0400, lr=3.27e-06, step=8877] Training: 89%|████████▉ | 8878/10000 [1:58:53<12:40, 1.47it/s, loss=0.0024, lr=3.27e-06, step=8878] Training: 89%|████████▉ | 8879/10000 [1:58:54<14:16, 1.31it/s, loss=0.0024, lr=3.27e-06, step=8878] Training: 89%|████████▉ | 8879/10000 [1:58:54<14:16, 1.31it/s, loss=0.0032, lr=3.27e-06, step=8879]20:43:26.935 [I] step=8880 loss=0.0030 smoothed_loss=0.0096 lr=3.27e-06 grad_norm=0.3553 step_time=0.6109s data_time=0.1364s it/s=1.338 eta_to_10000=836.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0043 grad_action_out_proj_arms=0.0794 grad_arm_token_fuse=0.0219 grad_shared_expert=0.3047 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8880/10000 [1:58:55<14:55, 1.25it/s, loss=0.0032, lr=3.27e-06, step=8879] Training: 89%|████████▉ | 8880/10000 [1:58:55<14:55, 1.25it/s, loss=0.0030, lr=3.26e-06, step=8880] Training: 89%|████████▉ | 8881/10000 [1:58:55<13:06, 1.42it/s, loss=0.0030, lr=3.26e-06, step=8880] Training: 89%|████████▉ | 8881/10000 [1:58:55<13:06, 1.42it/s, loss=0.0023, lr=3.26e-06, step=8881] Training: 89%|████████▉ | 8882/10000 [1:58:56<11:47, 1.58it/s, loss=0.0023, lr=3.26e-06, step=8881] Training: 89%|████████▉ | 8882/10000 [1:58:56<11:47, 1.58it/s, loss=0.0365, lr=3.26e-06, step=8882] Training: 89%|████████▉ | 8883/10000 [1:58:56<12:00, 1.55it/s, loss=0.0365, lr=3.26e-06, step=8882] Training: 89%|████████▉ | 8883/10000 [1:58:56<12:00, 1.55it/s, loss=0.0073, lr=3.26e-06, step=8883] Training: 89%|████████▉ | 8884/10000 [1:58:57<11:06, 1.67it/s, loss=0.0073, lr=3.26e-06, step=8883] Training: 89%|████████▉ | 8884/10000 [1:58:57<11:06, 1.67it/s, loss=0.0027, lr=3.26e-06, step=8884] Training: 89%|████████▉ | 8885/10000 [1:58:57<11:18, 1.64it/s, loss=0.0027, lr=3.26e-06, step=8884] Training: 89%|████████▉ | 8885/10000 [1:58:57<11:18, 1.64it/s, loss=0.0062, lr=3.26e-06, step=8885] Training: 89%|████████▉ | 8886/10000 [1:58:58<11:42, 1.58it/s, loss=0.0062, lr=3.26e-06, step=8885] Training: 89%|████████▉ | 8886/10000 [1:58:58<11:42, 1.58it/s, loss=0.0028, lr=3.26e-06, step=8886] Training: 89%|████████▉ | 8887/10000 [1:58:59<12:43, 1.46it/s, loss=0.0028, lr=3.26e-06, step=8886] Training: 89%|████████▉ | 8887/10000 [1:58:59<12:43, 1.46it/s, loss=0.0045, lr=3.25e-06, step=8887] Training: 89%|████████▉ | 8888/10000 [1:59:00<12:39, 1.46it/s, loss=0.0045, lr=3.25e-06, step=8887] Training: 89%|████████▉ | 8888/10000 [1:59:00<12:39, 1.46it/s, loss=0.0108, lr=3.25e-06, step=8888] Training: 89%|████████▉ | 8889/10000 [1:59:00<12:57, 1.43it/s, loss=0.0108, lr=3.25e-06, step=8888] Training: 89%|████████▉ | 8889/10000 [1:59:00<12:57, 1.43it/s, loss=0.0045, lr=3.25e-06, step=8889]20:43:33.487 [I] step=8890 loss=0.0092 smoothed_loss=0.0086 lr=3.26e-06 grad_norm=0.4404 step_time=0.5336s data_time=0.1217s it/s=1.526 eta_to_10000=727.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0057 grad_action_out_proj_arms=0.0634 grad_arm_token_fuse=0.0271 grad_shared_expert=0.3575 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8890/10000 [1:59:01<14:03, 1.32it/s, loss=0.0045, lr=3.25e-06, step=8889] Training: 89%|████████▉ | 8890/10000 [1:59:01<14:03, 1.32it/s, loss=0.0092, lr=3.25e-06, step=8890] Training: 89%|████████▉ | 8891/10000 [1:59:02<12:29, 1.48it/s, loss=0.0092, lr=3.25e-06, step=8890] Training: 89%|████████▉ | 8891/10000 [1:59:02<12:29, 1.48it/s, loss=0.0105, lr=3.25e-06, step=8891] Training: 89%|████████▉ | 8892/10000 [1:59:02<12:57, 1.42it/s, loss=0.0105, lr=3.25e-06, step=8891] Training: 89%|████████▉ | 8892/10000 [1:59:02<12:57, 1.42it/s, loss=0.0010, lr=3.25e-06, step=8892] Training: 89%|████████▉ | 8893/10000 [1:59:03<13:28, 1.37it/s, loss=0.0010, lr=3.25e-06, step=8892] Training: 89%|████████▉ | 8893/10000 [1:59:03<13:28, 1.37it/s, loss=0.0085, lr=3.25e-06, step=8893] Training: 89%|████████▉ | 8894/10000 [1:59:04<13:22, 1.38it/s, loss=0.0085, lr=3.25e-06, step=8893] Training: 89%|████████▉ | 8894/10000 [1:59:04<13:22, 1.38it/s, loss=0.0139, lr=3.25e-06, step=8894] Training: 89%|████████▉ | 8895/10000 [1:59:05<12:59, 1.42it/s, loss=0.0139, lr=3.25e-06, step=8894] Training: 89%|████████▉ | 8895/10000 [1:59:05<12:59, 1.42it/s, loss=0.0063, lr=3.24e-06, step=8895] Training: 89%|████████▉ | 8896/10000 [1:59:06<14:21, 1.28it/s, loss=0.0063, lr=3.24e-06, step=8895] Training: 89%|████████▉ | 8896/10000 [1:59:06<14:21, 1.28it/s, loss=0.0093, lr=3.24e-06, step=8896] Training: 89%|████████▉ | 8897/10000 [1:59:06<12:38, 1.45it/s, loss=0.0093, lr=3.24e-06, step=8896] Training: 89%|████████▉ | 8897/10000 [1:59:06<12:38, 1.45it/s, loss=0.0118, lr=3.24e-06, step=8897] Training: 89%|████████▉ | 8898/10000 [1:59:07<12:02, 1.52it/s, loss=0.0118, lr=3.24e-06, step=8897] Training: 89%|████████▉ | 8898/10000 [1:59:07<12:02, 1.52it/s, loss=0.0168, lr=3.24e-06, step=8898] Training: 89%|████████▉ | 8899/10000 [1:59:07<11:10, 1.64it/s, loss=0.0168, lr=3.24e-06, step=8898] Training: 89%|████████▉ | 8899/10000 [1:59:07<11:10, 1.64it/s, loss=0.0031, lr=3.24e-06, step=8899]20:43:40.128 [I] step=8900 loss=0.0044 smoothed_loss=0.0085 lr=3.24e-06 grad_norm=0.4341 step_time=0.5441s data_time=0.1199s it/s=1.506 eta_to_10000=730.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0368 grad_action_out_proj_arms=0.1866 grad_arm_token_fuse=0.1844 grad_shared_expert=0.6679 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8900/10000 [1:59:08<11:46, 1.56it/s, loss=0.0031, lr=3.24e-06, step=8899] Training: 89%|████████▉ | 8900/10000 [1:59:08<11:46, 1.56it/s, loss=0.0044, lr=3.24e-06, step=8900] Training: 89%|████████▉ | 8901/10000 [1:59:09<12:29, 1.47it/s, loss=0.0044, lr=3.24e-06, step=8900] Training: 89%|████████▉ | 8901/10000 [1:59:09<12:29, 1.47it/s, loss=0.0371, lr=3.24e-06, step=8901] Training: 89%|████████▉ | 8902/10000 [1:59:09<12:26, 1.47it/s, loss=0.0371, lr=3.24e-06, step=8901] Training: 89%|████████▉ | 8902/10000 [1:59:09<12:26, 1.47it/s, loss=0.0112, lr=3.23e-06, step=8902] Training: 89%|████████▉ | 8903/10000 [1:59:10<12:01, 1.52it/s, loss=0.0112, lr=3.23e-06, step=8902] Training: 89%|████████▉ | 8903/10000 [1:59:10<12:01, 1.52it/s, loss=0.0055, lr=3.23e-06, step=8903] Training: 89%|████████▉ | 8904/10000 [1:59:10<11:00, 1.66it/s, loss=0.0055, lr=3.23e-06, step=8903] Training: 89%|████████▉ | 8904/10000 [1:59:10<11:00, 1.66it/s, loss=0.0169, lr=3.23e-06, step=8904] Training: 89%|████████▉ | 8905/10000 [1:59:11<10:17, 1.77it/s, loss=0.0169, lr=3.23e-06, step=8904] Training: 89%|████████▉ | 8905/10000 [1:59:11<10:17, 1.77it/s, loss=0.0088, lr=3.23e-06, step=8905] Training: 89%|████████▉ | 8906/10000 [1:59:11<10:36, 1.72it/s, loss=0.0088, lr=3.23e-06, step=8905] Training: 89%|████████▉ | 8906/10000 [1:59:11<10:36, 1.72it/s, loss=0.0057, lr=3.23e-06, step=8906] Training: 89%|████████▉ | 8907/10000 [1:59:12<12:05, 1.51it/s, loss=0.0057, lr=3.23e-06, step=8906] Training: 89%|████████▉ | 8907/10000 [1:59:12<12:05, 1.51it/s, loss=0.0085, lr=3.23e-06, step=8907] Training: 89%|████████▉ | 8908/10000 [1:59:13<12:39, 1.44it/s, loss=0.0085, lr=3.23e-06, step=8907] Training: 89%|████████▉ | 8908/10000 [1:59:13<12:39, 1.44it/s, loss=0.0021, lr=3.23e-06, step=8908] Training: 89%|████████▉ | 8909/10000 [1:59:14<12:00, 1.51it/s, loss=0.0021, lr=3.23e-06, step=8908] Training: 89%|████████▉ | 8909/10000 [1:59:14<12:00, 1.51it/s, loss=0.0010, lr=3.23e-06, step=8909]20:43:46.702 [I] step=8910 loss=0.0096 smoothed_loss=0.0088 lr=3.23e-06 grad_norm=0.4798 step_time=0.5414s data_time=0.1160s it/s=1.521 eta_to_10000=716.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0140 grad_action_out_proj_arms=0.1215 grad_arm_token_fuse=0.0741 grad_shared_expert=0.4035 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8910/10000 [1:59:14<12:27, 1.46it/s, loss=0.0010, lr=3.23e-06, step=8909] Training: 89%|████████▉ | 8910/10000 [1:59:14<12:27, 1.46it/s, loss=0.0096, lr=3.22e-06, step=8910] Training: 89%|████████▉ | 8911/10000 [1:59:15<12:20, 1.47it/s, loss=0.0096, lr=3.22e-06, step=8910] Training: 89%|████████▉ | 8911/10000 [1:59:15<12:20, 1.47it/s, loss=0.0069, lr=3.22e-06, step=8911] Training: 89%|████████▉ | 8912/10000 [1:59:16<13:31, 1.34it/s, loss=0.0069, lr=3.22e-06, step=8911] Training: 89%|████████▉ | 8912/10000 [1:59:16<13:31, 1.34it/s, loss=0.0064, lr=3.22e-06, step=8912] Training: 89%|████████▉ | 8913/10000 [1:59:16<12:27, 1.45it/s, loss=0.0064, lr=3.22e-06, step=8912] Training: 89%|████████▉ | 8913/10000 [1:59:16<12:27, 1.45it/s, loss=0.0018, lr=3.22e-06, step=8913] Training: 89%|████████▉ | 8914/10000 [1:59:17<12:58, 1.39it/s, loss=0.0018, lr=3.22e-06, step=8913] Training: 89%|████████▉ | 8914/10000 [1:59:17<12:58, 1.39it/s, loss=0.0036, lr=3.22e-06, step=8914] Training: 89%|████████▉ | 8915/10000 [1:59:18<12:50, 1.41it/s, loss=0.0036, lr=3.22e-06, step=8914] Training: 89%|████████▉ | 8915/10000 [1:59:18<12:50, 1.41it/s, loss=0.0016, lr=3.22e-06, step=8915] Training: 89%|████████▉ | 8916/10000 [1:59:19<13:03, 1.38it/s, loss=0.0016, lr=3.22e-06, step=8915] Training: 89%|████████▉ | 8916/10000 [1:59:19<13:03, 1.38it/s, loss=0.0079, lr=3.22e-06, step=8916] Training: 89%|████████▉ | 8917/10000 [1:59:19<11:50, 1.52it/s, loss=0.0079, lr=3.22e-06, step=8916] Training: 89%|████████▉ | 8917/10000 [1:59:19<11:50, 1.52it/s, loss=0.0039, lr=3.22e-06, step=8917] Training: 89%|████████▉ | 8918/10000 [1:59:20<12:46, 1.41it/s, loss=0.0039, lr=3.22e-06, step=8917] Training: 89%|████████▉ | 8918/10000 [1:59:20<12:46, 1.41it/s, loss=0.0013, lr=3.21e-06, step=8918] Training: 89%|████████▉ | 8919/10000 [1:59:21<12:53, 1.40it/s, loss=0.0013, lr=3.21e-06, step=8918] Training: 89%|████████▉ | 8919/10000 [1:59:21<12:53, 1.40it/s, loss=0.0011, lr=3.21e-06, step=8919]20:43:53.807 [I] step=8920 loss=0.0018 smoothed_loss=0.0052 lr=3.22e-06 grad_norm=0.3492 step_time=0.5811s data_time=0.1293s it/s=1.408 eta_to_10000=767.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0144 grad_action_out_proj_arms=0.0852 grad_arm_token_fuse=0.0743 grad_shared_expert=0.2822 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8920/10000 [1:59:21<12:46, 1.41it/s, loss=0.0011, lr=3.21e-06, step=8919] Training: 89%|████████▉ | 8920/10000 [1:59:21<12:46, 1.41it/s, loss=0.0018, lr=3.21e-06, step=8920] Training: 89%|████████▉ | 8921/10000 [1:59:22<13:37, 1.32it/s, loss=0.0018, lr=3.21e-06, step=8920] Training: 89%|████████▉ | 8921/10000 [1:59:22<13:37, 1.32it/s, loss=0.0042, lr=3.21e-06, step=8921] Training: 89%|████████▉ | 8922/10000 [1:59:23<14:37, 1.23it/s, loss=0.0042, lr=3.21e-06, step=8921] Training: 89%|████████▉ | 8922/10000 [1:59:23<14:37, 1.23it/s, loss=0.0024, lr=3.21e-06, step=8922] Training: 89%|████████▉ | 8923/10000 [1:59:24<13:34, 1.32it/s, loss=0.0024, lr=3.21e-06, step=8922] Training: 89%|████████▉ | 8923/10000 [1:59:24<13:34, 1.32it/s, loss=0.0572, lr=3.21e-06, step=8923] Training: 89%|████████▉ | 8924/10000 [1:59:24<12:17, 1.46it/s, loss=0.0572, lr=3.21e-06, step=8923] Training: 89%|████████▉ | 8924/10000 [1:59:24<12:17, 1.46it/s, loss=0.0038, lr=3.21e-06, step=8924] Training: 89%|████████▉ | 8925/10000 [1:59:25<12:24, 1.44it/s, loss=0.0038, lr=3.21e-06, step=8924] Training: 89%|████████▉ | 8925/10000 [1:59:25<12:24, 1.44it/s, loss=0.0045, lr=3.20e-06, step=8925] Training: 89%|████████▉ | 8926/10000 [1:59:26<12:21, 1.45it/s, loss=0.0045, lr=3.20e-06, step=8925] Training: 89%|████████▉ | 8926/10000 [1:59:26<12:21, 1.45it/s, loss=0.0099, lr=3.20e-06, step=8926] Training: 89%|████████▉ | 8927/10000 [1:59:26<11:12, 1.60it/s, loss=0.0099, lr=3.20e-06, step=8926] Training: 89%|████████▉ | 8927/10000 [1:59:26<11:12, 1.60it/s, loss=0.0084, lr=3.20e-06, step=8927] Training: 89%|████████▉ | 8928/10000 [1:59:27<11:50, 1.51it/s, loss=0.0084, lr=3.20e-06, step=8927] Training: 89%|████████▉ | 8928/10000 [1:59:27<11:50, 1.51it/s, loss=0.0076, lr=3.20e-06, step=8928] Training: 89%|████████▉ | 8929/10000 [1:59:28<13:33, 1.32it/s, loss=0.0076, lr=3.20e-06, step=8928] Training: 89%|████████▉ | 8929/10000 [1:59:28<13:33, 1.32it/s, loss=0.0007, lr=3.20e-06, step=8929]20:44:01.284 [I] step=8930 loss=0.0066 smoothed_loss=0.0079 lr=3.20e-06 grad_norm=0.3814 step_time=0.6142s data_time=0.1335s it/s=1.339 eta_to_10000=799.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0089 grad_action_out_proj_arms=0.0832 grad_arm_token_fuse=0.0460 grad_shared_expert=0.3588 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8930/10000 [1:59:29<14:21, 1.24it/s, loss=0.0007, lr=3.20e-06, step=8929] Training: 89%|████████▉ | 8930/10000 [1:59:29<14:21, 1.24it/s, loss=0.0066, lr=3.20e-06, step=8930] Training: 89%|████████▉ | 8931/10000 [1:59:30<13:45, 1.30it/s, loss=0.0066, lr=3.20e-06, step=8930] Training: 89%|████████▉ | 8931/10000 [1:59:30<13:45, 1.30it/s, loss=0.0126, lr=3.20e-06, step=8931] Training: 89%|████████▉ | 8932/10000 [1:59:30<13:14, 1.34it/s, loss=0.0126, lr=3.20e-06, step=8931] Training: 89%|████████▉ | 8932/10000 [1:59:30<13:14, 1.34it/s, loss=0.0007, lr=3.20e-06, step=8932] Training: 89%|████████▉ | 8933/10000 [1:59:31<11:56, 1.49it/s, loss=0.0007, lr=3.20e-06, step=8932] Training: 89%|████████▉ | 8933/10000 [1:59:31<11:56, 1.49it/s, loss=0.0030, lr=3.19e-06, step=8933] Training: 89%|████████▉ | 8934/10000 [1:59:32<12:09, 1.46it/s, loss=0.0030, lr=3.19e-06, step=8933] Training: 89%|████████▉ | 8934/10000 [1:59:32<12:09, 1.46it/s, loss=0.0031, lr=3.19e-06, step=8934] Training: 89%|████████▉ | 8935/10000 [1:59:32<12:04, 1.47it/s, loss=0.0031, lr=3.19e-06, step=8934] Training: 89%|████████▉ | 8935/10000 [1:59:32<12:04, 1.47it/s, loss=0.0019, lr=3.19e-06, step=8935] Training: 89%|████████▉ | 8936/10000 [1:59:33<12:14, 1.45it/s, loss=0.0019, lr=3.19e-06, step=8935] Training: 89%|████████▉ | 8936/10000 [1:59:33<12:14, 1.45it/s, loss=0.0038, lr=3.19e-06, step=8936] Training: 89%|████████▉ | 8937/10000 [1:59:34<12:12, 1.45it/s, loss=0.0038, lr=3.19e-06, step=8936] Training: 89%|████████▉ | 8937/10000 [1:59:34<12:12, 1.45it/s, loss=0.0017, lr=3.19e-06, step=8937] Training: 89%|████████▉ | 8938/10000 [1:59:34<12:13, 1.45it/s, loss=0.0017, lr=3.19e-06, step=8937] Training: 89%|████████▉ | 8938/10000 [1:59:34<12:13, 1.45it/s, loss=0.0058, lr=3.19e-06, step=8938] Training: 89%|████████▉ | 8939/10000 [1:59:35<11:09, 1.59it/s, loss=0.0058, lr=3.19e-06, step=8938] Training: 89%|████████▉ | 8939/10000 [1:59:35<11:09, 1.59it/s, loss=0.0112, lr=3.19e-06, step=8939]20:44:07.850 [I] step=8940 loss=0.0025 smoothed_loss=0.0058 lr=3.19e-06 grad_norm=0.3685 step_time=0.5539s data_time=0.1027s it/s=1.523 eta_to_10000=695.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0049 grad_action_out_proj_arms=0.0467 grad_arm_token_fuse=0.0248 grad_shared_expert=0.2722 (18633:train_pytorch.py:850) + Training: 89%|████████▉ | 8940/10000 [1:59:36<11:36, 1.52it/s, loss=0.0112, lr=3.19e-06, step=8939] Training: 89%|████████▉ | 8940/10000 [1:59:36<11:36, 1.52it/s, loss=0.0025, lr=3.19e-06, step=8940] Training: 89%|████████▉ | 8941/10000 [1:59:36<10:57, 1.61it/s, loss=0.0025, lr=3.19e-06, step=8940] Training: 89%|████████▉ | 8941/10000 [1:59:36<10:57, 1.61it/s, loss=0.0032, lr=3.18e-06, step=8941] Training: 89%|████████▉ | 8942/10000 [1:59:37<11:58, 1.47it/s, loss=0.0032, lr=3.18e-06, step=8941] Training: 89%|████████▉ | 8942/10000 [1:59:37<11:58, 1.47it/s, loss=0.0105, lr=3.18e-06, step=8942] Training: 89%|████████▉ | 8943/10000 [1:59:38<12:48, 1.38it/s, loss=0.0105, lr=3.18e-06, step=8942] Training: 89%|████████▉ | 8943/10000 [1:59:38<12:48, 1.38it/s, loss=0.0130, lr=3.18e-06, step=8943] Training: 89%|████████▉ | 8944/10000 [1:59:38<11:28, 1.53it/s, loss=0.0130, lr=3.18e-06, step=8943] Training: 89%|████████▉ | 8944/10000 [1:59:38<11:28, 1.53it/s, loss=0.0086, lr=3.18e-06, step=8944] Training: 89%|████████▉ | 8945/10000 [1:59:39<11:43, 1.50it/s, loss=0.0086, lr=3.18e-06, step=8944] Training: 89%|████████▉ | 8945/10000 [1:59:39<11:43, 1.50it/s, loss=0.0060, lr=3.18e-06, step=8945] Training: 89%|████████▉ | 8946/10000 [1:59:39<10:42, 1.64it/s, loss=0.0060, lr=3.18e-06, step=8945] Training: 89%|████████▉ | 8946/10000 [1:59:39<10:42, 1.64it/s, loss=0.0181, lr=3.18e-06, step=8946] Training: 89%|████████▉ | 8947/10000 [1:59:40<11:51, 1.48it/s, loss=0.0181, lr=3.18e-06, step=8946] Training: 89%|████████▉ | 8947/10000 [1:59:40<11:51, 1.48it/s, loss=0.0092, lr=3.18e-06, step=8947] Training: 89%|████████▉ | 8948/10000 [1:59:41<10:50, 1.62it/s, loss=0.0092, lr=3.18e-06, step=8947] Training: 89%|████████▉ | 8948/10000 [1:59:41<10:50, 1.62it/s, loss=0.0057, lr=3.18e-06, step=8948] Training: 89%|████████▉ | 8949/10000 [1:59:41<10:12, 1.72it/s, loss=0.0057, lr=3.18e-06, step=8948] Training: 89%|████████▉ | 8949/10000 [1:59:41<10:12, 1.72it/s, loss=0.0026, lr=3.17e-06, step=8949]20:44:14.272 [I] step=8950 loss=0.0124 smoothed_loss=0.0078 lr=3.18e-06 grad_norm=0.6050 step_time=0.5542s data_time=0.0880s it/s=1.557 eta_to_10000=674.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0375 grad_action_out_proj_arms=0.2056 grad_arm_token_fuse=0.1992 grad_shared_expert=0.6311 (18633:train_pytorch.py:850) + Training: 90%|████████▉ | 8950/10000 [1:59:42<11:08, 1.57it/s, loss=0.0026, lr=3.17e-06, step=8949] Training: 90%|████████▉ | 8950/10000 [1:59:42<11:08, 1.57it/s, loss=0.0124, lr=3.17e-06, step=8950] Training: 90%|████████▉ | 8951/10000 [1:59:42<10:19, 1.69it/s, loss=0.0124, lr=3.17e-06, step=8950] Training: 90%|████████▉ | 8951/10000 [1:59:42<10:19, 1.69it/s, loss=0.0285, lr=3.17e-06, step=8951] Training: 90%|████████▉ | 8952/10000 [1:59:43<11:45, 1.48it/s, loss=0.0285, lr=3.17e-06, step=8951] Training: 90%|████████▉ | 8952/10000 [1:59:43<11:45, 1.48it/s, loss=0.0016, lr=3.17e-06, step=8952] Training: 90%|████████▉ | 8953/10000 [1:59:44<11:17, 1.55it/s, loss=0.0016, lr=3.17e-06, step=8952] Training: 90%|████████▉ | 8953/10000 [1:59:44<11:17, 1.55it/s, loss=0.0023, lr=3.17e-06, step=8953] Training: 90%|████████▉ | 8954/10000 [1:59:45<13:01, 1.34it/s, loss=0.0023, lr=3.17e-06, step=8953] Training: 90%|████████▉ | 8954/10000 [1:59:45<13:01, 1.34it/s, loss=0.0193, lr=3.17e-06, step=8954] Training: 90%|████████▉ | 8955/10000 [1:59:46<13:00, 1.34it/s, loss=0.0193, lr=3.17e-06, step=8954] Training: 90%|████████▉ | 8955/10000 [1:59:46<13:00, 1.34it/s, loss=0.0169, lr=3.17e-06, step=8955] Training: 90%|████████▉ | 8956/10000 [1:59:46<13:25, 1.30it/s, loss=0.0169, lr=3.17e-06, step=8955] Training: 90%|████████▉ | 8956/10000 [1:59:46<13:25, 1.30it/s, loss=0.0031, lr=3.17e-06, step=8956] Training: 90%|████████▉ | 8957/10000 [1:59:48<14:58, 1.16it/s, loss=0.0031, lr=3.17e-06, step=8956] Training: 90%|████████▉ | 8957/10000 [1:59:48<14:58, 1.16it/s, loss=0.0052, lr=3.16e-06, step=8957] Training: 90%|████████▉ | 8958/10000 [1:59:48<13:38, 1.27it/s, loss=0.0052, lr=3.16e-06, step=8957] Training: 90%|████████▉ | 8958/10000 [1:59:48<13:38, 1.27it/s, loss=0.0026, lr=3.16e-06, step=8958] Training: 90%|████████▉ | 8959/10000 [1:59:49<13:30, 1.28it/s, loss=0.0026, lr=3.16e-06, step=8958] Training: 90%|████████▉ | 8959/10000 [1:59:49<13:30, 1.28it/s, loss=0.0068, lr=3.16e-06, step=8959]20:44:22.046 [I] step=8960 loss=0.0044 smoothed_loss=0.0079 lr=3.17e-06 grad_norm=0.3734 step_time=0.5999s data_time=0.1775s it/s=1.286 eta_to_10000=808.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0054 grad_action_out_proj_arms=0.0677 grad_arm_token_fuse=0.0282 grad_shared_expert=0.3855 (18633:train_pytorch.py:850) + Training: 90%|████████▉ | 8960/10000 [1:59:50<13:49, 1.25it/s, loss=0.0068, lr=3.16e-06, step=8959] Training: 90%|████████▉ | 8960/10000 [1:59:50<13:49, 1.25it/s, loss=0.0044, lr=3.16e-06, step=8960] Training: 90%|████████▉ | 8961/10000 [1:59:50<12:38, 1.37it/s, loss=0.0044, lr=3.16e-06, step=8960] Training: 90%|████████▉ | 8961/10000 [1:59:50<12:38, 1.37it/s, loss=0.0059, lr=3.16e-06, step=8961] Training: 90%|████████▉ | 8962/10000 [1:59:51<14:27, 1.20it/s, loss=0.0059, lr=3.16e-06, step=8961] Training: 90%|████████▉ | 8962/10000 [1:59:51<14:27, 1.20it/s, loss=0.0060, lr=3.16e-06, step=8962] Training: 90%|████████▉ | 8963/10000 [1:59:52<14:37, 1.18it/s, loss=0.0060, lr=3.16e-06, step=8962] Training: 90%|████████▉ | 8963/10000 [1:59:52<14:37, 1.18it/s, loss=0.0240, lr=3.16e-06, step=8963] Training: 90%|████████▉ | 8964/10000 [1:59:53<15:34, 1.11it/s, loss=0.0240, lr=3.16e-06, step=8963] Training: 90%|████████▉ | 8964/10000 [1:59:53<15:34, 1.11it/s, loss=0.0030, lr=3.16e-06, step=8964] Training: 90%|████████▉ | 8965/10000 [1:59:54<15:39, 1.10it/s, loss=0.0030, lr=3.16e-06, step=8964] Training: 90%|████████▉ | 8965/10000 [1:59:54<15:39, 1.10it/s, loss=0.0035, lr=3.15e-06, step=8965] Training: 90%|████████▉ | 8966/10000 [1:59:55<15:54, 1.08it/s, loss=0.0035, lr=3.15e-06, step=8965] Training: 90%|████████▉ | 8966/10000 [1:59:55<15:54, 1.08it/s, loss=0.0066, lr=3.15e-06, step=8966] Training: 90%|████████▉ | 8967/10000 [1:59:56<14:30, 1.19it/s, loss=0.0066, lr=3.15e-06, step=8966] Training: 90%|████████▉ | 8967/10000 [1:59:56<14:30, 1.19it/s, loss=0.0023, lr=3.15e-06, step=8967] Training: 90%|████████▉ | 8968/10000 [1:59:57<14:23, 1.19it/s, loss=0.0023, lr=3.15e-06, step=8967] Training: 90%|████████▉ | 8968/10000 [1:59:57<14:23, 1.19it/s, loss=0.0032, lr=3.15e-06, step=8968] Training: 90%|████████▉ | 8969/10000 [1:59:57<13:11, 1.30it/s, loss=0.0032, lr=3.15e-06, step=8968] Training: 90%|████████▉ | 8969/10000 [1:59:57<13:11, 1.30it/s, loss=0.0054, lr=3.15e-06, step=8969]20:44:30.330 [I] step=8970 loss=0.0021 smoothed_loss=0.0063 lr=3.15e-06 grad_norm=0.4070 step_time=0.6426s data_time=0.1858s it/s=1.207 eta_to_10000=853.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0035 grad_action_out_proj_arms=0.0434 grad_arm_token_fuse=0.0176 grad_shared_expert=0.1717 (18633:train_pytorch.py:850) + Training: 90%|████████▉ | 8970/10000 [1:59:58<13:08, 1.31it/s, loss=0.0054, lr=3.15e-06, step=8969] Training: 90%|████████▉ | 8970/10000 [1:59:58<13:08, 1.31it/s, loss=0.0021, lr=3.15e-06, step=8970] Training: 90%|████████▉ | 8971/10000 [1:59:59<14:40, 1.17it/s, loss=0.0021, lr=3.15e-06, step=8970] Training: 90%|████████▉ | 8971/10000 [1:59:59<14:40, 1.17it/s, loss=0.0018, lr=3.15e-06, step=8971] Training: 90%|████████▉ | 8972/10000 [2:00:00<14:12, 1.21it/s, loss=0.0018, lr=3.15e-06, step=8971] Training: 90%|████████▉ | 8972/10000 [2:00:00<14:12, 1.21it/s, loss=0.0062, lr=3.15e-06, step=8972] Training: 90%|████████▉ | 8973/10000 [2:00:01<13:21, 1.28it/s, loss=0.0062, lr=3.15e-06, step=8972] Training: 90%|████████▉ | 8973/10000 [2:00:01<13:21, 1.28it/s, loss=0.0154, lr=3.14e-06, step=8973] Training: 90%|████████▉ | 8974/10000 [2:00:01<12:07, 1.41it/s, loss=0.0154, lr=3.14e-06, step=8973] Training: 90%|████████▉ | 8974/10000 [2:00:01<12:07, 1.41it/s, loss=0.0320, lr=3.14e-06, step=8974] Training: 90%|████████▉ | 8975/10000 [2:00:02<11:44, 1.45it/s, loss=0.0320, lr=3.14e-06, step=8974] Training: 90%|████████▉ | 8975/10000 [2:00:02<11:44, 1.45it/s, loss=0.0031, lr=3.14e-06, step=8975] Training: 90%|████████▉ | 8976/10000 [2:00:02<10:41, 1.60it/s, loss=0.0031, lr=3.14e-06, step=8975] Training: 90%|████████▉ | 8976/10000 [2:00:02<10:41, 1.60it/s, loss=0.0059, lr=3.14e-06, step=8976] Training: 90%|████████▉ | 8977/10000 [2:00:03<09:57, 1.71it/s, loss=0.0059, lr=3.14e-06, step=8976] Training: 90%|████████▉ | 8977/10000 [2:00:03<09:57, 1.71it/s, loss=0.0071, lr=3.14e-06, step=8977] Training: 90%|████████▉ | 8978/10000 [2:00:03<10:17, 1.65it/s, loss=0.0071, lr=3.14e-06, step=8977] Training: 90%|████████▉ | 8978/10000 [2:00:03<10:17, 1.65it/s, loss=0.0022, lr=3.14e-06, step=8978] Training: 90%|████████▉ | 8979/10000 [2:00:04<12:14, 1.39it/s, loss=0.0022, lr=3.14e-06, step=8978] Training: 90%|████████▉ | 8979/10000 [2:00:04<12:14, 1.39it/s, loss=0.0024, lr=3.14e-06, step=8979]20:44:37.365 [I] step=8980 loss=0.0010 smoothed_loss=0.0066 lr=3.14e-06 grad_norm=0.4150 step_time=0.5838s data_time=0.1198s it/s=1.422 eta_to_10000=717.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0077 grad_action_out_proj_arms=0.0876 grad_arm_token_fuse=0.0423 grad_shared_expert=0.5295 (18633:train_pytorch.py:850) + Training: 90%|████████▉ | 8980/10000 [2:00:05<12:22, 1.37it/s, loss=0.0024, lr=3.14e-06, step=8979] Training: 90%|████████▉ | 8980/10000 [2:00:05<12:22, 1.37it/s, loss=0.0010, lr=3.14e-06, step=8980] Training: 90%|████████▉ | 8981/10000 [2:00:06<12:09, 1.40it/s, loss=0.0010, lr=3.14e-06, step=8980] Training: 90%|████████▉ | 8981/10000 [2:00:06<12:09, 1.40it/s, loss=0.0028, lr=3.13e-06, step=8981] Training: 90%|████████▉ | 8982/10000 [2:00:06<12:17, 1.38it/s, loss=0.0028, lr=3.13e-06, step=8981] Training: 90%|████████▉ | 8982/10000 [2:00:06<12:17, 1.38it/s, loss=0.0058, lr=3.13e-06, step=8982] Training: 90%|████████▉ | 8983/10000 [2:00:07<11:16, 1.50it/s, loss=0.0058, lr=3.13e-06, step=8982] Training: 90%|████████▉ | 8983/10000 [2:00:07<11:16, 1.50it/s, loss=0.0014, lr=3.13e-06, step=8983] Training: 90%|████████▉ | 8984/10000 [2:00:07<10:18, 1.64it/s, loss=0.0014, lr=3.13e-06, step=8983] Training: 90%|████████▉ | 8984/10000 [2:00:07<10:18, 1.64it/s, loss=0.0037, lr=3.13e-06, step=8984] Training: 90%|████████▉ | 8985/10000 [2:00:08<09:43, 1.74it/s, loss=0.0037, lr=3.13e-06, step=8984] Training: 90%|████████▉ | 8985/10000 [2:00:08<09:43, 1.74it/s, loss=0.0079, lr=3.13e-06, step=8985] Training: 90%|████████▉ | 8986/10000 [2:00:09<10:18, 1.64it/s, loss=0.0079, lr=3.13e-06, step=8985] Training: 90%|████████▉ | 8986/10000 [2:00:09<10:18, 1.64it/s, loss=0.0038, lr=3.13e-06, step=8986] Training: 90%|████████▉ | 8987/10000 [2:00:09<09:52, 1.71it/s, loss=0.0038, lr=3.13e-06, step=8986] Training: 90%|████████▉ | 8987/10000 [2:00:09<09:52, 1.71it/s, loss=0.0046, lr=3.13e-06, step=8987] Training: 90%|████████▉ | 8988/10000 [2:00:10<09:19, 1.81it/s, loss=0.0046, lr=3.13e-06, step=8987] Training: 90%|████████▉ | 8988/10000 [2:00:10<09:19, 1.81it/s, loss=0.0016, lr=3.13e-06, step=8988] Training: 90%|████████▉ | 8989/10000 [2:00:10<09:50, 1.71it/s, loss=0.0016, lr=3.13e-06, step=8988] Training: 90%|████████▉ | 8989/10000 [2:00:10<09:50, 1.71it/s, loss=0.0209, lr=3.12e-06, step=8989]20:44:43.174 [I] step=8990 loss=0.0036 smoothed_loss=0.0063 lr=3.13e-06 grad_norm=0.4070 step_time=0.4999s data_time=0.0810s it/s=1.722 eta_to_10000=586.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0093 grad_action_out_proj_arms=0.0644 grad_arm_token_fuse=0.0495 grad_shared_expert=0.6687 (18633:train_pytorch.py:850) + Training: 90%|████████▉ | 8990/10000 [2:00:11<09:31, 1.77it/s, loss=0.0209, lr=3.12e-06, step=8989] Training: 90%|████████▉ | 8990/10000 [2:00:11<09:31, 1.77it/s, loss=0.0036, lr=3.12e-06, step=8990] Training: 90%|████████▉ | 8991/10000 [2:00:11<09:02, 1.86it/s, loss=0.0036, lr=3.12e-06, step=8990] Training: 90%|████████▉ | 8991/10000 [2:00:11<09:02, 1.86it/s, loss=0.0739, lr=3.12e-06, step=8991] Training: 90%|████████▉ | 8992/10000 [2:00:12<08:47, 1.91it/s, loss=0.0739, lr=3.12e-06, step=8991] Training: 90%|████████▉ | 8992/10000 [2:00:12<08:47, 1.91it/s, loss=0.0070, lr=3.12e-06, step=8992] Training: 90%|████████▉ | 8993/10000 [2:00:13<10:57, 1.53it/s, loss=0.0070, lr=3.12e-06, step=8992] Training: 90%|████████▉ | 8993/10000 [2:00:13<10:57, 1.53it/s, loss=0.0047, lr=3.12e-06, step=8993] Training: 90%|████████▉ | 8994/10000 [2:00:13<10:06, 1.66it/s, loss=0.0047, lr=3.12e-06, step=8993] Training: 90%|████████▉ | 8994/10000 [2:00:13<10:06, 1.66it/s, loss=0.0109, lr=3.12e-06, step=8994] Training: 90%|████████▉ | 8995/10000 [2:00:14<10:47, 1.55it/s, loss=0.0109, lr=3.12e-06, step=8994] Training: 90%|████████▉ | 8995/10000 [2:00:14<10:47, 1.55it/s, loss=0.0096, lr=3.12e-06, step=8995] Training: 90%|████████▉ | 8996/10000 [2:00:15<10:49, 1.55it/s, loss=0.0096, lr=3.12e-06, step=8995] Training: 90%|████████▉ | 8996/10000 [2:00:15<10:49, 1.55it/s, loss=0.0047, lr=3.12e-06, step=8996] Training: 90%|████████▉ | 8997/10000 [2:00:15<10:02, 1.66it/s, loss=0.0047, lr=3.12e-06, step=8996] Training: 90%|████████▉ | 8997/10000 [2:00:15<10:02, 1.66it/s, loss=0.0262, lr=3.11e-06, step=8997] Training: 90%|████████▉ | 8998/10000 [2:00:16<09:31, 1.75it/s, loss=0.0262, lr=3.11e-06, step=8997] Training: 90%|████████▉ | 8998/10000 [2:00:16<09:31, 1.75it/s, loss=0.0030, lr=3.11e-06, step=8998] Training: 90%|████████▉ | 8999/10000 [2:00:16<09:06, 1.83it/s, loss=0.0030, lr=3.11e-06, step=8998] Training: 90%|████████▉ | 8999/10000 [2:00:16<09:06, 1.83it/s, loss=0.0040, lr=3.11e-06, step=8999]20:44:49.237 [I] step=9000 loss=0.0037 smoothed_loss=0.0099 lr=3.12e-06 grad_norm=0.3671 step_time=0.5221s data_time=0.0841s it/s=1.650 eta_to_10000=606.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0202 grad_action_out_proj_arms=0.1327 grad_arm_token_fuse=0.1122 grad_shared_expert=0.3523 (18633:train_pytorch.py:850) +20:46:05.807 [I] Saved checkpoint at step 9000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/9000 (18633:train_pytorch.py:350) + Training: 90%|█████████ | 9000/10000 [2:01:33<6:33:08, 23.59s/it, loss=0.0040, lr=3.11e-06, step=8999] Training: 90%|█████████ | 9000/10000 [2:01:33<6:33:08, 23.59s/it, loss=0.0037, lr=3.11e-06, step=9000] Training: 90%|█████████ | 9001/10000 [2:01:34<4:38:07, 16.70s/it, loss=0.0037, lr=3.11e-06, step=9000] Training: 90%|█████████ | 9001/10000 [2:01:34<4:38:07, 16.70s/it, loss=0.0012, lr=3.11e-06, step=9001] Training: 90%|█████████ | 9002/10000 [2:01:35<3:17:02, 11.85s/it, loss=0.0012, lr=3.11e-06, step=9001] Training: 90%|█████████ | 9002/10000 [2:01:35<3:17:02, 11.85s/it, loss=0.0059, lr=3.11e-06, step=9002] Training: 90%|█████████ | 9003/10000 [2:01:35<2:20:27, 8.45s/it, loss=0.0059, lr=3.11e-06, step=9002] Training: 90%|█████████ | 9003/10000 [2:01:35<2:20:27, 8.45s/it, loss=0.0085, lr=3.11e-06, step=9003] Training: 90%|█████████ | 9004/10000 [2:01:36<1:41:41, 6.13s/it, loss=0.0085, lr=3.11e-06, step=9003] Training: 90%|█████████ | 9004/10000 [2:01:36<1:41:41, 6.13s/it, loss=0.0028, lr=3.11e-06, step=9004] Training: 90%|█████████ | 9005/10000 [2:01:36<1:13:52, 4.45s/it, loss=0.0028, lr=3.11e-06, step=9004] Training: 90%|█████████ | 9005/10000 [2:01:36<1:13:52, 4.45s/it, loss=0.0045, lr=3.10e-06, step=9005] Training: 90%|█████████ | 9006/10000 [2:01:37<54:33, 3.29s/it, loss=0.0045, lr=3.10e-06, step=9005] Training: 90%|█████████ | 9006/10000 [2:01:37<54:33, 3.29s/it, loss=0.0074, lr=3.10e-06, step=9006] Training: 90%|█████████ | 9007/10000 [2:01:38<41:41, 2.52s/it, loss=0.0074, lr=3.10e-06, step=9006] Training: 90%|█████████ | 9007/10000 [2:01:38<41:41, 2.52s/it, loss=0.0073, lr=3.10e-06, step=9007] Training: 90%|█████████ | 9008/10000 [2:01:38<32:06, 1.94s/it, loss=0.0073, lr=3.10e-06, step=9007] Training: 90%|█████████ | 9008/10000 [2:01:38<32:06, 1.94s/it, loss=0.0075, lr=3.10e-06, step=9008] Training: 90%|█████████ | 9009/10000 [2:01:39<25:01, 1.52s/it, loss=0.0075, lr=3.10e-06, step=9008] Training: 90%|█████████ | 9009/10000 [2:01:39<25:01, 1.52s/it, loss=0.0107, lr=3.10e-06, step=9009]20:46:11.674 [I] step=9010 loss=0.0071 smoothed_loss=0.0079 lr=3.10e-06 grad_norm=0.3771 step_time=0.4996s data_time=7.7441s it/s=0.121 eta_to_10000=8161.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0045 grad_action_out_proj_arms=0.0531 grad_arm_token_fuse=0.0239 grad_shared_expert=0.3796 (18633:train_pytorch.py:850) + Training: 90%|█████████ | 9010/10000 [2:01:39<20:03, 1.22s/it, loss=0.0107, lr=3.10e-06, step=9009] Training: 90%|█████████ | 9010/10000 [2:01:39<20:03, 1.22s/it, loss=0.0071, lr=3.10e-06, step=9010] Training: 90%|█████████ | 9011/10000 [2:01:40<17:29, 1.06s/it, loss=0.0071, lr=3.10e-06, step=9010] Training: 90%|█████████ | 9011/10000 [2:01:40<17:29, 1.06s/it, loss=0.0037, lr=3.10e-06, step=9011] Training: 90%|█████████ | 9012/10000 [2:01:41<14:35, 1.13it/s, loss=0.0037, lr=3.10e-06, step=9011] Training: 90%|█████████ | 9012/10000 [2:01:41<14:35, 1.13it/s, loss=0.0014, lr=3.10e-06, step=9012] Training: 90%|█████████ | 9013/10000 [2:01:41<12:40, 1.30it/s, loss=0.0014, lr=3.10e-06, step=9012] Training: 90%|█████████ | 9013/10000 [2:01:41<12:40, 1.30it/s, loss=0.0062, lr=3.10e-06, step=9013] Training: 90%|█████████ | 9014/10000 [2:01:42<12:03, 1.36it/s, loss=0.0062, lr=3.10e-06, step=9013] Training: 90%|█████████ | 9014/10000 [2:01:42<12:03, 1.36it/s, loss=0.0076, lr=3.09e-06, step=9014] Training: 90%|█████████ | 9015/10000 [2:01:43<12:41, 1.29it/s, loss=0.0076, lr=3.09e-06, step=9014] Training: 90%|█████████ | 9015/10000 [2:01:43<12:41, 1.29it/s, loss=0.0164, lr=3.09e-06, step=9015] Training: 90%|█████████ | 9016/10000 [2:01:43<11:17, 1.45it/s, loss=0.0164, lr=3.09e-06, step=9015] Training: 90%|█████████ | 9016/10000 [2:01:43<11:17, 1.45it/s, loss=0.0498, lr=3.09e-06, step=9016] Training: 90%|█████████ | 9017/10000 [2:01:44<10:20, 1.58it/s, loss=0.0498, lr=3.09e-06, step=9016] Training: 90%|█████████ | 9017/10000 [2:01:44<10:20, 1.58it/s, loss=0.0050, lr=3.09e-06, step=9017] Training: 90%|█████████ | 9018/10000 [2:01:44<09:38, 1.70it/s, loss=0.0050, lr=3.09e-06, step=9017] Training: 90%|█████████ | 9018/10000 [2:01:44<09:38, 1.70it/s, loss=0.0036, lr=3.09e-06, step=9018] Training: 90%|█████████ | 9019/10000 [2:01:45<11:04, 1.48it/s, loss=0.0036, lr=3.09e-06, step=9018] Training: 90%|█████████ | 9019/10000 [2:01:45<11:04, 1.48it/s, loss=0.0322, lr=3.09e-06, step=9019]20:46:17.733 [I] step=9020 loss=0.0027 smoothed_loss=0.0117 lr=3.09e-06 grad_norm=0.4117 step_time=0.5231s data_time=0.0828s it/s=1.651 eta_to_10000=593.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0037 grad_action_out_proj_arms=0.0446 grad_arm_token_fuse=0.0189 grad_shared_expert=0.1786 (18633:train_pytorch.py:850) + Training: 90%|█████████ | 9020/10000 [2:01:45<10:12, 1.60it/s, loss=0.0322, lr=3.09e-06, step=9019] Training: 90%|█████████ | 9020/10000 [2:01:45<10:12, 1.60it/s, loss=0.0027, lr=3.09e-06, step=9020] Training: 90%|█████████ | 9021/10000 [2:01:46<10:29, 1.56it/s, loss=0.0027, lr=3.09e-06, step=9020] Training: 90%|█████████ | 9021/10000 [2:01:46<10:29, 1.56it/s, loss=0.0054, lr=3.09e-06, step=9021] Training: 90%|█████████ | 9022/10000 [2:01:47<10:40, 1.53it/s, loss=0.0054, lr=3.09e-06, step=9021] Training: 90%|█████████ | 9022/10000 [2:01:47<10:40, 1.53it/s, loss=0.0018, lr=3.08e-06, step=9022] Training: 90%|█████████ | 9023/10000 [2:01:47<09:52, 1.65it/s, loss=0.0018, lr=3.08e-06, step=9022] Training: 90%|█████████ | 9023/10000 [2:01:47<09:52, 1.65it/s, loss=0.0068, lr=3.08e-06, step=9023] Training: 90%|█████████ | 9024/10000 [2:01:48<09:13, 1.76it/s, loss=0.0068, lr=3.08e-06, step=9023] Training: 90%|█████████ | 9024/10000 [2:01:48<09:13, 1.76it/s, loss=0.0083, lr=3.08e-06, step=9024] Training: 90%|█████████ | 9025/10000 [2:01:48<09:04, 1.79it/s, loss=0.0083, lr=3.08e-06, step=9024] Training: 90%|█████████ | 9025/10000 [2:01:48<09:04, 1.79it/s, loss=0.0740, lr=3.08e-06, step=9025] Training: 90%|█████████ | 9026/10000 [2:01:49<09:44, 1.67it/s, loss=0.0740, lr=3.08e-06, step=9025] Training: 90%|█████████ | 9026/10000 [2:01:49<09:44, 1.67it/s, loss=0.0020, lr=3.08e-06, step=9026] Training: 90%|█████████ | 9027/10000 [2:01:49<09:12, 1.76it/s, loss=0.0020, lr=3.08e-06, step=9026] Training: 90%|█████████ | 9027/10000 [2:01:49<09:12, 1.76it/s, loss=0.0022, lr=3.08e-06, step=9027] Training: 90%|█████████ | 9028/10000 [2:01:50<09:35, 1.69it/s, loss=0.0022, lr=3.08e-06, step=9027] Training: 90%|█████████ | 9028/10000 [2:01:50<09:35, 1.69it/s, loss=0.0044, lr=3.08e-06, step=9028] Training: 90%|█████████ | 9029/10000 [2:01:51<10:15, 1.58it/s, loss=0.0044, lr=3.08e-06, step=9028] Training: 90%|█████████ | 9029/10000 [2:01:51<10:15, 1.58it/s, loss=0.0029, lr=3.08e-06, step=9029]20:46:23.715 [I] step=9030 loss=0.0037 smoothed_loss=0.0108 lr=3.08e-06 grad_norm=0.3428 step_time=0.5318s data_time=0.0665s it/s=1.672 eta_to_10000=580.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0375 grad_action_out_proj_arms=0.1373 grad_arm_token_fuse=0.1893 grad_shared_expert=0.4952 (18633:train_pytorch.py:850) + Training: 90%|█████████ | 9030/10000 [2:01:51<09:47, 1.65it/s, loss=0.0029, lr=3.08e-06, step=9029] Training: 90%|█████████ | 9030/10000 [2:01:51<09:47, 1.65it/s, loss=0.0037, lr=3.08e-06, step=9030] Training: 90%|█████████ | 9031/10000 [2:01:52<09:07, 1.77it/s, loss=0.0037, lr=3.08e-06, step=9030] Training: 90%|█████████ | 9031/10000 [2:01:52<09:07, 1.77it/s, loss=0.0045, lr=3.07e-06, step=9031] Training: 90%|█████████ | 9032/10000 [2:01:52<08:48, 1.83it/s, loss=0.0045, lr=3.07e-06, step=9031] Training: 90%|█████████ | 9032/10000 [2:01:52<08:48, 1.83it/s, loss=0.0045, lr=3.07e-06, step=9032] Training: 90%|█████████ | 9033/10000 [2:01:53<09:18, 1.73it/s, loss=0.0045, lr=3.07e-06, step=9032] Training: 90%|█████████ | 9033/10000 [2:01:53<09:18, 1.73it/s, loss=0.0031, lr=3.07e-06, step=9033] Training: 90%|█████████ | 9034/10000 [2:01:53<08:46, 1.83it/s, loss=0.0031, lr=3.07e-06, step=9033] Training: 90%|█████████ | 9034/10000 [2:01:53<08:46, 1.83it/s, loss=0.0079, lr=3.07e-06, step=9034] Training: 90%|█████████ | 9035/10000 [2:01:54<08:37, 1.86it/s, loss=0.0079, lr=3.07e-06, step=9034] Training: 90%|█████████ | 9035/10000 [2:01:54<08:37, 1.86it/s, loss=0.0148, lr=3.07e-06, step=9035] Training: 90%|█████████ | 9036/10000 [2:01:55<09:18, 1.73it/s, loss=0.0148, lr=3.07e-06, step=9035] Training: 90%|█████████ | 9036/10000 [2:01:55<09:18, 1.73it/s, loss=0.0073, lr=3.07e-06, step=9036] Training: 90%|█████████ | 9037/10000 [2:01:55<08:51, 1.81it/s, loss=0.0073, lr=3.07e-06, step=9036] Training: 90%|█████████ | 9037/10000 [2:01:55<08:51, 1.81it/s, loss=0.0065, lr=3.07e-06, step=9037] Training: 90%|█████████ | 9038/10000 [2:01:56<08:34, 1.87it/s, loss=0.0065, lr=3.07e-06, step=9037] Training: 90%|█████████ | 9038/10000 [2:01:56<08:34, 1.87it/s, loss=0.0013, lr=3.07e-06, step=9038] Training: 90%|█████████ | 9039/10000 [2:01:56<08:20, 1.92it/s, loss=0.0013, lr=3.07e-06, step=9038] Training: 90%|█████████ | 9039/10000 [2:01:56<08:20, 1.92it/s, loss=0.0032, lr=3.06e-06, step=9039]20:46:28.998 [I] step=9040 loss=0.0077 smoothed_loss=0.0077 lr=3.07e-06 grad_norm=0.3249 step_time=0.4639s data_time=0.0643s it/s=1.894 eta_to_10000=507.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0063 grad_action_out_proj_arms=0.0635 grad_arm_token_fuse=0.0327 grad_shared_expert=0.2980 (18633:train_pytorch.py:850) + Training: 90%|█████████ | 9040/10000 [2:01:57<08:20, 1.92it/s, loss=0.0032, lr=3.06e-06, step=9039] Training: 90%|█████████ | 9040/10000 [2:01:57<08:20, 1.92it/s, loss=0.0077, lr=3.06e-06, step=9040] Training: 90%|█████████ | 9041/10000 [2:01:57<08:59, 1.78it/s, loss=0.0077, lr=3.06e-06, step=9040] Training: 90%|█████████ | 9041/10000 [2:01:57<08:59, 1.78it/s, loss=0.0043, lr=3.06e-06, step=9041] Training: 90%|█████████ | 9042/10000 [2:01:58<08:35, 1.86it/s, loss=0.0043, lr=3.06e-06, step=9041] Training: 90%|█████████ | 9042/10000 [2:01:58<08:35, 1.86it/s, loss=0.0097, lr=3.06e-06, step=9042] Training: 90%|█████████ | 9043/10000 [2:01:59<09:26, 1.69it/s, loss=0.0097, lr=3.06e-06, step=9042] Training: 90%|█████████ | 9043/10000 [2:01:59<09:26, 1.69it/s, loss=0.0083, lr=3.06e-06, step=9043] Training: 90%|█████████ | 9044/10000 [2:01:59<08:51, 1.80it/s, loss=0.0083, lr=3.06e-06, step=9043] Training: 90%|█████████ | 9044/10000 [2:01:59<08:51, 1.80it/s, loss=0.0086, lr=3.06e-06, step=9044] Training: 90%|█████████ | 9045/10000 [2:01:59<08:30, 1.87it/s, loss=0.0086, lr=3.06e-06, step=9044] Training: 90%|█████████ | 9045/10000 [2:01:59<08:30, 1.87it/s, loss=0.0058, lr=3.06e-06, step=9045] Training: 90%|█████████ | 9046/10000 [2:02:00<08:29, 1.87it/s, loss=0.0058, lr=3.06e-06, step=9045] Training: 90%|█████████ | 9046/10000 [2:02:00<08:29, 1.87it/s, loss=0.0058, lr=3.06e-06, step=9046] Training: 90%|█████████ | 9047/10000 [2:02:00<08:13, 1.93it/s, loss=0.0058, lr=3.06e-06, step=9046] Training: 90%|█████████ | 9047/10000 [2:02:00<08:13, 1.93it/s, loss=0.0014, lr=3.06e-06, step=9047] Training: 90%|█████████ | 9048/10000 [2:02:01<08:01, 1.98it/s, loss=0.0014, lr=3.06e-06, step=9047] Training: 90%|█████████ | 9048/10000 [2:02:01<08:01, 1.98it/s, loss=0.0246, lr=3.05e-06, step=9048] Training: 90%|█████████ | 9049/10000 [2:02:02<08:42, 1.82it/s, loss=0.0246, lr=3.05e-06, step=9048] Training: 90%|█████████ | 9049/10000 [2:02:02<08:42, 1.82it/s, loss=0.0038, lr=3.05e-06, step=9049]20:46:34.665 [I] step=9050 loss=0.0054 smoothed_loss=0.0078 lr=3.06e-06 grad_norm=0.3706 step_time=0.5022s data_time=0.0645s it/s=1.765 eta_to_10000=538.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0554 grad_arm_token_fuse=0.0333 grad_shared_expert=0.3025 (18633:train_pytorch.py:850) + Training: 90%|█████████ | 9050/10000 [2:02:02<09:28, 1.67it/s, loss=0.0038, lr=3.05e-06, step=9049] Training: 90%|█████████ | 9050/10000 [2:02:02<09:28, 1.67it/s, loss=0.0054, lr=3.05e-06, step=9050] Training: 91%|█████████ | 9051/10000 [2:02:03<08:54, 1.78it/s, loss=0.0054, lr=3.05e-06, step=9050] Training: 91%|█████████ | 9051/10000 [2:02:03<08:54, 1.78it/s, loss=0.0104, lr=3.05e-06, step=9051] Training: 91%|█████████ | 9052/10000 [2:02:03<09:08, 1.73it/s, loss=0.0104, lr=3.05e-06, step=9051] Training: 91%|█████████ | 9052/10000 [2:02:03<09:08, 1.73it/s, loss=0.0025, lr=3.05e-06, step=9052] Training: 91%|█████████ | 9053/10000 [2:02:04<08:46, 1.80it/s, loss=0.0025, lr=3.05e-06, step=9052] Training: 91%|█████████ | 9053/10000 [2:02:04<08:46, 1.80it/s, loss=0.0129, lr=3.05e-06, step=9053] Training: 91%|█████████ | 9054/10000 [2:02:04<08:26, 1.87it/s, loss=0.0129, lr=3.05e-06, step=9053] Training: 91%|█████████ | 9054/10000 [2:02:04<08:26, 1.87it/s, loss=0.0091, lr=3.05e-06, step=9054] Training: 91%|█████████ | 9055/10000 [2:02:05<08:07, 1.94it/s, loss=0.0091, lr=3.05e-06, step=9054] Training: 91%|█████████ | 9055/10000 [2:02:05<08:07, 1.94it/s, loss=0.0014, lr=3.05e-06, step=9055] Training: 91%|█████████ | 9056/10000 [2:02:06<08:39, 1.82it/s, loss=0.0014, lr=3.05e-06, step=9055] Training: 91%|█████████ | 9056/10000 [2:02:06<08:39, 1.82it/s, loss=0.0730, lr=3.04e-06, step=9056] Training: 91%|█████████ | 9057/10000 [2:02:06<09:45, 1.61it/s, loss=0.0730, lr=3.04e-06, step=9056] Training: 91%|█████████ | 9057/10000 [2:02:06<09:45, 1.61it/s, loss=0.0062, lr=3.04e-06, step=9057] Training: 91%|█████████ | 9058/10000 [2:02:07<09:07, 1.72it/s, loss=0.0062, lr=3.04e-06, step=9057] Training: 91%|█████████ | 9058/10000 [2:02:07<09:07, 1.72it/s, loss=0.0032, lr=3.04e-06, step=9058] Training: 91%|█████████ | 9059/10000 [2:02:07<09:12, 1.70it/s, loss=0.0032, lr=3.04e-06, step=9058] Training: 91%|█████████ | 9059/10000 [2:02:07<09:12, 1.70it/s, loss=0.0024, lr=3.04e-06, step=9059]20:46:40.232 [I] step=9060 loss=0.0212 smoothed_loss=0.0123 lr=3.05e-06 grad_norm=0.4736 step_time=0.4848s data_time=0.0720s it/s=1.796 eta_to_10000=523.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.0912 grad_arm_token_fuse=0.0663 grad_shared_expert=0.3810 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9060/10000 [2:02:08<08:48, 1.78it/s, loss=0.0024, lr=3.04e-06, step=9059] Training: 91%|█████████ | 9060/10000 [2:02:08<08:48, 1.78it/s, loss=0.0212, lr=3.04e-06, step=9060] Training: 91%|█████████ | 9061/10000 [2:02:08<08:32, 1.83it/s, loss=0.0212, lr=3.04e-06, step=9060] Training: 91%|█████████ | 9061/10000 [2:02:08<08:32, 1.83it/s, loss=0.0018, lr=3.04e-06, step=9061] Training: 91%|█████████ | 9062/10000 [2:02:09<08:19, 1.88it/s, loss=0.0018, lr=3.04e-06, step=9061] Training: 91%|█████████ | 9062/10000 [2:02:09<08:19, 1.88it/s, loss=0.0024, lr=3.04e-06, step=9062] Training: 91%|█████████ | 9063/10000 [2:02:10<09:18, 1.68it/s, loss=0.0024, lr=3.04e-06, step=9062] Training: 91%|█████████ | 9063/10000 [2:02:10<09:18, 1.68it/s, loss=0.0119, lr=3.04e-06, step=9063] Training: 91%|█████████ | 9064/10000 [2:02:11<11:15, 1.39it/s, loss=0.0119, lr=3.04e-06, step=9063] Training: 91%|█████████ | 9064/10000 [2:02:11<11:15, 1.39it/s, loss=0.0191, lr=3.04e-06, step=9064] Training: 91%|█████████ | 9065/10000 [2:02:11<11:05, 1.40it/s, loss=0.0191, lr=3.04e-06, step=9064] Training: 91%|█████████ | 9065/10000 [2:02:11<11:05, 1.40it/s, loss=0.0127, lr=3.03e-06, step=9065] Training: 91%|█████████ | 9066/10000 [2:02:12<09:57, 1.56it/s, loss=0.0127, lr=3.03e-06, step=9065] Training: 91%|█████████ | 9066/10000 [2:02:12<09:57, 1.56it/s, loss=0.0130, lr=3.03e-06, step=9066] Training: 91%|█████████ | 9067/10000 [2:02:13<10:35, 1.47it/s, loss=0.0130, lr=3.03e-06, step=9066] Training: 91%|█████████ | 9067/10000 [2:02:13<10:35, 1.47it/s, loss=0.0043, lr=3.03e-06, step=9067] Training: 91%|█████████ | 9068/10000 [2:02:13<10:06, 1.54it/s, loss=0.0043, lr=3.03e-06, step=9067] Training: 91%|█████████ | 9068/10000 [2:02:13<10:06, 1.54it/s, loss=0.0053, lr=3.03e-06, step=9068] Training: 91%|█████████ | 9069/10000 [2:02:14<09:24, 1.65it/s, loss=0.0053, lr=3.03e-06, step=9068] Training: 91%|█████████ | 9069/10000 [2:02:14<09:24, 1.65it/s, loss=0.0099, lr=3.03e-06, step=9069]20:46:46.595 [I] step=9070 loss=0.0095 smoothed_loss=0.0102 lr=3.03e-06 grad_norm=0.4281 step_time=0.5376s data_time=0.0988s it/s=1.572 eta_to_10000=591.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0106 grad_action_out_proj_arms=0.0723 grad_arm_token_fuse=0.0548 grad_shared_expert=0.4284 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9070/10000 [2:02:14<09:15, 1.67it/s, loss=0.0099, lr=3.03e-06, step=9069] Training: 91%|█████████ | 9070/10000 [2:02:14<09:15, 1.67it/s, loss=0.0095, lr=3.03e-06, step=9070] Training: 91%|█████████ | 9071/10000 [2:02:15<10:32, 1.47it/s, loss=0.0095, lr=3.03e-06, step=9070] Training: 91%|█████████ | 9071/10000 [2:02:15<10:32, 1.47it/s, loss=0.0043, lr=3.03e-06, step=9071] Training: 91%|█████████ | 9072/10000 [2:02:16<11:00, 1.40it/s, loss=0.0043, lr=3.03e-06, step=9071] Training: 91%|█████████ | 9072/10000 [2:02:16<11:00, 1.40it/s, loss=0.0146, lr=3.03e-06, step=9072] Training: 91%|█████████ | 9073/10000 [2:02:16<10:00, 1.54it/s, loss=0.0146, lr=3.03e-06, step=9072] Training: 91%|█████████ | 9073/10000 [2:02:16<10:00, 1.54it/s, loss=0.0149, lr=3.03e-06, step=9073] Training: 91%|█████████ | 9074/10000 [2:02:17<10:43, 1.44it/s, loss=0.0149, lr=3.03e-06, step=9073] Training: 91%|█████████ | 9074/10000 [2:02:17<10:43, 1.44it/s, loss=0.0726, lr=3.02e-06, step=9074] Training: 91%|█████████ | 9075/10000 [2:02:18<10:45, 1.43it/s, loss=0.0726, lr=3.02e-06, step=9074] Training: 91%|█████████ | 9075/10000 [2:02:18<10:45, 1.43it/s, loss=0.0034, lr=3.02e-06, step=9075] Training: 91%|█████████ | 9076/10000 [2:02:19<10:16, 1.50it/s, loss=0.0034, lr=3.02e-06, step=9075] Training: 91%|█████████ | 9076/10000 [2:02:19<10:16, 1.50it/s, loss=0.0019, lr=3.02e-06, step=9076] Training: 91%|█████████ | 9077/10000 [2:02:19<09:35, 1.60it/s, loss=0.0019, lr=3.02e-06, step=9076] Training: 91%|█████████ | 9077/10000 [2:02:19<09:35, 1.60it/s, loss=0.0083, lr=3.02e-06, step=9077] Training: 91%|█████████ | 9078/10000 [2:02:20<09:54, 1.55it/s, loss=0.0083, lr=3.02e-06, step=9077] Training: 91%|█████████ | 9078/10000 [2:02:20<09:54, 1.55it/s, loss=0.0013, lr=3.02e-06, step=9078] Training: 91%|█████████ | 9079/10000 [2:02:21<10:34, 1.45it/s, loss=0.0013, lr=3.02e-06, step=9078] Training: 91%|█████████ | 9079/10000 [2:02:21<10:34, 1.45it/s, loss=0.0332, lr=3.02e-06, step=9079]20:46:53.589 [I] step=9080 loss=0.0031 smoothed_loss=0.0133 lr=3.02e-06 grad_norm=0.3872 step_time=0.5790s data_time=0.1204s it/s=1.430 eta_to_10000=643.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0040 grad_action_out_proj_arms=0.0649 grad_arm_token_fuse=0.0205 grad_shared_expert=0.2206 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9080/10000 [2:02:21<10:43, 1.43it/s, loss=0.0332, lr=3.02e-06, step=9079] Training: 91%|█████████ | 9080/10000 [2:02:21<10:43, 1.43it/s, loss=0.0031, lr=3.02e-06, step=9080] Training: 91%|█████████ | 9081/10000 [2:02:22<10:04, 1.52it/s, loss=0.0031, lr=3.02e-06, step=9080] Training: 91%|█████████ | 9081/10000 [2:02:22<10:04, 1.52it/s, loss=0.0117, lr=3.02e-06, step=9081] Training: 91%|█████████ | 9082/10000 [2:02:22<10:01, 1.53it/s, loss=0.0117, lr=3.02e-06, step=9081] Training: 91%|█████████ | 9082/10000 [2:02:22<10:01, 1.53it/s, loss=0.0016, lr=3.02e-06, step=9082] Training: 91%|█████████ | 9083/10000 [2:02:23<09:14, 1.65it/s, loss=0.0016, lr=3.02e-06, step=9082] Training: 91%|█████████ | 9083/10000 [2:02:23<09:14, 1.65it/s, loss=0.0015, lr=3.01e-06, step=9083] Training: 91%|█████████ | 9084/10000 [2:02:23<08:47, 1.74it/s, loss=0.0015, lr=3.01e-06, step=9083] Training: 91%|█████████ | 9084/10000 [2:02:23<08:47, 1.74it/s, loss=0.0199, lr=3.01e-06, step=9084] Training: 91%|█████████ | 9085/10000 [2:02:24<08:35, 1.78it/s, loss=0.0199, lr=3.01e-06, step=9084] Training: 91%|█████████ | 9085/10000 [2:02:24<08:35, 1.78it/s, loss=0.0041, lr=3.01e-06, step=9085] Training: 91%|█████████ | 9086/10000 [2:02:25<09:47, 1.55it/s, loss=0.0041, lr=3.01e-06, step=9085] Training: 91%|█████████ | 9086/10000 [2:02:25<09:47, 1.55it/s, loss=0.0016, lr=3.01e-06, step=9086] Training: 91%|█████████ | 9087/10000 [2:02:25<09:06, 1.67it/s, loss=0.0016, lr=3.01e-06, step=9086] Training: 91%|█████████ | 9087/10000 [2:02:25<09:06, 1.67it/s, loss=0.0077, lr=3.01e-06, step=9087] Training: 91%|█████████ | 9088/10000 [2:02:26<09:26, 1.61it/s, loss=0.0077, lr=3.01e-06, step=9087] Training: 91%|█████████ | 9088/10000 [2:02:26<09:26, 1.61it/s, loss=0.0094, lr=3.01e-06, step=9088] Training: 91%|█████████ | 9089/10000 [2:02:26<08:49, 1.72it/s, loss=0.0094, lr=3.01e-06, step=9088] Training: 91%|█████████ | 9089/10000 [2:02:26<08:49, 1.72it/s, loss=0.0121, lr=3.01e-06, step=9089]20:46:59.469 [I] step=9090 loss=0.0040 smoothed_loss=0.0094 lr=3.01e-06 grad_norm=0.3644 step_time=0.4954s data_time=0.0926s it/s=1.701 eta_to_10000=534.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0095 grad_action_out_proj_arms=0.1003 grad_arm_token_fuse=0.0489 grad_shared_expert=0.3482 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9090/10000 [2:02:27<09:08, 1.66it/s, loss=0.0121, lr=3.01e-06, step=9089] Training: 91%|█████████ | 9090/10000 [2:02:27<09:08, 1.66it/s, loss=0.0040, lr=3.01e-06, step=9090] Training: 91%|█████████ | 9091/10000 [2:02:28<09:33, 1.58it/s, loss=0.0040, lr=3.01e-06, step=9090] Training: 91%|█████████ | 9091/10000 [2:02:28<09:33, 1.58it/s, loss=0.0038, lr=3.01e-06, step=9091] Training: 91%|█████████ | 9092/10000 [2:02:28<08:54, 1.70it/s, loss=0.0038, lr=3.01e-06, step=9091] Training: 91%|█████████ | 9092/10000 [2:02:28<08:54, 1.70it/s, loss=0.0056, lr=3.00e-06, step=9092] Training: 91%|█████████ | 9093/10000 [2:02:29<10:57, 1.38it/s, loss=0.0056, lr=3.00e-06, step=9092] Training: 91%|█████████ | 9093/10000 [2:02:29<10:57, 1.38it/s, loss=0.0034, lr=3.00e-06, step=9093] Training: 91%|█████████ | 9094/10000 [2:02:30<10:01, 1.51it/s, loss=0.0034, lr=3.00e-06, step=9093] Training: 91%|█████████ | 9094/10000 [2:02:30<10:01, 1.51it/s, loss=0.0054, lr=3.00e-06, step=9094] Training: 91%|█████████ | 9095/10000 [2:02:30<09:15, 1.63it/s, loss=0.0054, lr=3.00e-06, step=9094] Training: 91%|█████████ | 9095/10000 [2:02:30<09:15, 1.63it/s, loss=0.0050, lr=3.00e-06, step=9095] Training: 91%|█████████ | 9096/10000 [2:02:31<08:36, 1.75it/s, loss=0.0050, lr=3.00e-06, step=9095] Training: 91%|█████████ | 9096/10000 [2:02:31<08:36, 1.75it/s, loss=0.0028, lr=3.00e-06, step=9096] Training: 91%|█████████ | 9097/10000 [2:02:32<09:48, 1.53it/s, loss=0.0028, lr=3.00e-06, step=9096] Training: 91%|█████████ | 9097/10000 [2:02:32<09:48, 1.53it/s, loss=0.0041, lr=3.00e-06, step=9097] Training: 91%|█████████ | 9098/10000 [2:02:32<09:11, 1.64it/s, loss=0.0041, lr=3.00e-06, step=9097] Training: 91%|█████████ | 9098/10000 [2:02:32<09:11, 1.64it/s, loss=0.0034, lr=3.00e-06, step=9098] Training: 91%|█████████ | 9099/10000 [2:02:33<08:34, 1.75it/s, loss=0.0034, lr=3.00e-06, step=9098] Training: 91%|█████████ | 9099/10000 [2:02:33<08:34, 1.75it/s, loss=0.0053, lr=3.00e-06, step=9099]20:47:05.766 [I] step=9100 loss=0.0095 smoothed_loss=0.0066 lr=3.00e-06 grad_norm=0.3812 step_time=0.5208s data_time=0.1089s it/s=1.588 eta_to_10000=566.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0425 grad_action_out_proj_arms=0.1630 grad_arm_token_fuse=0.2288 grad_shared_expert=0.4628 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9100/10000 [2:02:33<09:20, 1.61it/s, loss=0.0053, lr=3.00e-06, step=9099] Training: 91%|█████████ | 9100/10000 [2:02:33<09:20, 1.61it/s, loss=0.0095, lr=3.00e-06, step=9100] Training: 91%|█████████ | 9101/10000 [2:02:34<09:29, 1.58it/s, loss=0.0095, lr=3.00e-06, step=9100] Training: 91%|█████████ | 9101/10000 [2:02:34<09:29, 1.58it/s, loss=0.0066, lr=2.99e-06, step=9101] Training: 91%|█████████ | 9102/10000 [2:02:35<09:31, 1.57it/s, loss=0.0066, lr=2.99e-06, step=9101] Training: 91%|█████████ | 9102/10000 [2:02:35<09:31, 1.57it/s, loss=0.0021, lr=2.99e-06, step=9102] Training: 91%|█████████ | 9103/10000 [2:02:35<09:23, 1.59it/s, loss=0.0021, lr=2.99e-06, step=9102] Training: 91%|█████████ | 9103/10000 [2:02:35<09:23, 1.59it/s, loss=0.0005, lr=2.99e-06, step=9103] Training: 91%|█████████ | 9104/10000 [2:02:36<09:52, 1.51it/s, loss=0.0005, lr=2.99e-06, step=9103] Training: 91%|█████████ | 9104/10000 [2:02:36<09:52, 1.51it/s, loss=0.0011, lr=2.99e-06, step=9104] Training: 91%|█████████ | 9105/10000 [2:02:37<10:04, 1.48it/s, loss=0.0011, lr=2.99e-06, step=9104] Training: 91%|█████████ | 9105/10000 [2:02:37<10:04, 1.48it/s, loss=0.0039, lr=2.99e-06, step=9105] Training: 91%|█████████ | 9106/10000 [2:02:37<09:43, 1.53it/s, loss=0.0039, lr=2.99e-06, step=9105] Training: 91%|█████████ | 9106/10000 [2:02:37<09:43, 1.53it/s, loss=0.0029, lr=2.99e-06, step=9106] Training: 91%|█████████ | 9107/10000 [2:02:38<10:17, 1.45it/s, loss=0.0029, lr=2.99e-06, step=9106] Training: 91%|█████████ | 9107/10000 [2:02:38<10:17, 1.45it/s, loss=0.0020, lr=2.99e-06, step=9107] Training: 91%|█████████ | 9108/10000 [2:02:39<10:22, 1.43it/s, loss=0.0020, lr=2.99e-06, step=9107] Training: 91%|█████████ | 9108/10000 [2:02:39<10:22, 1.43it/s, loss=0.0038, lr=2.99e-06, step=9108] Training: 91%|█████████ | 9109/10000 [2:02:39<09:27, 1.57it/s, loss=0.0038, lr=2.99e-06, step=9108] Training: 91%|█████████ | 9109/10000 [2:02:39<09:27, 1.57it/s, loss=0.0016, lr=2.99e-06, step=9109]20:47:12.241 [I] step=9110 loss=0.0044 smoothed_loss=0.0042 lr=2.99e-06 grad_norm=0.3391 step_time=0.5376s data_time=0.1100s it/s=1.545 eta_to_10000=576.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0086 grad_action_out_proj_arms=0.0946 grad_arm_token_fuse=0.0444 grad_shared_expert=0.4766 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9110/10000 [2:02:40<08:57, 1.65it/s, loss=0.0016, lr=2.99e-06, step=9109] Training: 91%|█████████ | 9110/10000 [2:02:40<08:57, 1.65it/s, loss=0.0044, lr=2.98e-06, step=9110] Training: 91%|█████████ | 9111/10000 [2:02:40<08:24, 1.76it/s, loss=0.0044, lr=2.98e-06, step=9110] Training: 91%|█████████ | 9111/10000 [2:02:40<08:24, 1.76it/s, loss=0.0050, lr=2.98e-06, step=9111] Training: 91%|█████████ | 9112/10000 [2:02:41<08:54, 1.66it/s, loss=0.0050, lr=2.98e-06, step=9111] Training: 91%|█████████ | 9112/10000 [2:02:41<08:54, 1.66it/s, loss=0.0167, lr=2.98e-06, step=9112] Training: 91%|█████████ | 9113/10000 [2:02:42<08:29, 1.74it/s, loss=0.0167, lr=2.98e-06, step=9112] Training: 91%|█████████ | 9113/10000 [2:02:42<08:29, 1.74it/s, loss=0.0126, lr=2.98e-06, step=9113] Training: 91%|█████████ | 9114/10000 [2:02:42<08:52, 1.67it/s, loss=0.0126, lr=2.98e-06, step=9113] Training: 91%|█████████ | 9114/10000 [2:02:42<08:52, 1.67it/s, loss=0.0113, lr=2.98e-06, step=9114] Training: 91%|█████████ | 9115/10000 [2:02:43<09:21, 1.58it/s, loss=0.0113, lr=2.98e-06, step=9114] Training: 91%|█████████ | 9115/10000 [2:02:43<09:21, 1.58it/s, loss=0.0153, lr=2.98e-06, step=9115] Training: 91%|█████████ | 9116/10000 [2:02:44<09:03, 1.63it/s, loss=0.0153, lr=2.98e-06, step=9115] Training: 91%|█████████ | 9116/10000 [2:02:44<09:03, 1.63it/s, loss=0.0052, lr=2.98e-06, step=9116] Training: 91%|█████████ | 9117/10000 [2:02:44<08:36, 1.71it/s, loss=0.0052, lr=2.98e-06, step=9116] Training: 91%|█████████ | 9117/10000 [2:02:44<08:36, 1.71it/s, loss=0.0202, lr=2.98e-06, step=9117] Training: 91%|█████████ | 9118/10000 [2:02:45<08:12, 1.79it/s, loss=0.0202, lr=2.98e-06, step=9117] Training: 91%|█████████ | 9118/10000 [2:02:45<08:12, 1.79it/s, loss=0.0045, lr=2.98e-06, step=9118] Training: 91%|█████████ | 9119/10000 [2:02:45<08:01, 1.83it/s, loss=0.0045, lr=2.98e-06, step=9118] Training: 91%|█████████ | 9119/10000 [2:02:45<08:01, 1.83it/s, loss=0.1011, lr=2.98e-06, step=9119]20:47:17.929 [I] step=9120 loss=0.0019 smoothed_loss=0.0160 lr=2.98e-06 grad_norm=0.4887 step_time=0.4833s data_time=0.0855s it/s=1.758 eta_to_10000=500.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0251 grad_action_out_proj_arms=0.1270 grad_arm_token_fuse=0.1328 grad_shared_expert=0.4558 (18633:train_pytorch.py:850) + Training: 91%|█████████ | 9120/10000 [2:02:46<07:59, 1.84it/s, loss=0.1011, lr=2.98e-06, step=9119] Training: 91%|█████████ | 9120/10000 [2:02:46<07:59, 1.84it/s, loss=0.0019, lr=2.97e-06, step=9120] Training: 91%|█████████ | 9121/10000 [2:02:46<08:23, 1.75it/s, loss=0.0019, lr=2.97e-06, step=9120] Training: 91%|█████████ | 9121/10000 [2:02:46<08:23, 1.75it/s, loss=0.0027, lr=2.97e-06, step=9121] Training: 91%|█████████ | 9122/10000 [2:02:47<08:55, 1.64it/s, loss=0.0027, lr=2.97e-06, step=9121] Training: 91%|█████████ | 9122/10000 [2:02:47<08:55, 1.64it/s, loss=0.0049, lr=2.97e-06, step=9122] Training: 91%|█████████ | 9123/10000 [2:02:47<08:21, 1.75it/s, loss=0.0049, lr=2.97e-06, step=9122] Training: 91%|█████████ | 9123/10000 [2:02:47<08:21, 1.75it/s, loss=0.0070, lr=2.97e-06, step=9123] Training: 91%|█████████ | 9124/10000 [2:02:48<07:57, 1.83it/s, loss=0.0070, lr=2.97e-06, step=9123] Training: 91%|█████████ | 9124/10000 [2:02:48<07:57, 1.83it/s, loss=0.0031, lr=2.97e-06, step=9124] Training: 91%|█████████▏| 9125/10000 [2:02:49<08:23, 1.74it/s, loss=0.0031, lr=2.97e-06, step=9124] Training: 91%|█████████▏| 9125/10000 [2:02:49<08:23, 1.74it/s, loss=0.0176, lr=2.97e-06, step=9125] Training: 91%|█████████▏| 9126/10000 [2:02:49<08:17, 1.76it/s, loss=0.0176, lr=2.97e-06, step=9125] Training: 91%|█████████▏| 9126/10000 [2:02:49<08:17, 1.76it/s, loss=0.0049, lr=2.97e-06, step=9126] Training: 91%|█████████▏| 9127/10000 [2:02:50<07:55, 1.84it/s, loss=0.0049, lr=2.97e-06, step=9126] Training: 91%|█████████▏| 9127/10000 [2:02:50<07:55, 1.84it/s, loss=0.0051, lr=2.97e-06, step=9127] Training: 91%|█████████▏| 9128/10000 [2:02:50<08:21, 1.74it/s, loss=0.0051, lr=2.97e-06, step=9127] Training: 91%|█████████▏| 9128/10000 [2:02:50<08:21, 1.74it/s, loss=0.0854, lr=2.97e-06, step=9128] Training: 91%|█████████▏| 9129/10000 [2:02:51<08:49, 1.64it/s, loss=0.0854, lr=2.97e-06, step=9128] Training: 91%|█████████▏| 9129/10000 [2:02:51<08:49, 1.64it/s, loss=0.0026, lr=2.96e-06, step=9129]20:47:23.768 [I] step=9130 loss=0.0147 smoothed_loss=0.0167 lr=2.97e-06 grad_norm=0.4619 step_time=0.5086s data_time=0.0752s it/s=1.713 eta_to_10000=507.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0138 grad_action_out_proj_arms=0.0887 grad_arm_token_fuse=0.0736 grad_shared_expert=0.7799 (18633:train_pytorch.py:850) + Training: 91%|█████████▏| 9130/10000 [2:02:51<08:25, 1.72it/s, loss=0.0026, lr=2.96e-06, step=9129] Training: 91%|█████████▏| 9130/10000 [2:02:51<08:25, 1.72it/s, loss=0.0147, lr=2.96e-06, step=9130] Training: 91%|█████████▏| 9131/10000 [2:02:52<07:59, 1.81it/s, loss=0.0147, lr=2.96e-06, step=9130] Training: 91%|█████████▏| 9131/10000 [2:02:52<07:59, 1.81it/s, loss=0.0062, lr=2.96e-06, step=9131] Training: 91%|█████████▏| 9132/10000 [2:02:53<08:51, 1.63it/s, loss=0.0062, lr=2.96e-06, step=9131] Training: 91%|█████████▏| 9132/10000 [2:02:53<08:51, 1.63it/s, loss=0.0141, lr=2.96e-06, step=9132] Training: 91%|█████████▏| 9133/10000 [2:02:53<08:24, 1.72it/s, loss=0.0141, lr=2.96e-06, step=9132] Training: 91%|█████████▏| 9133/10000 [2:02:53<08:24, 1.72it/s, loss=0.0024, lr=2.96e-06, step=9133] Training: 91%|█████████▏| 9134/10000 [2:02:54<07:58, 1.81it/s, loss=0.0024, lr=2.96e-06, step=9133] Training: 91%|█████████▏| 9134/10000 [2:02:54<07:58, 1.81it/s, loss=0.0132, lr=2.96e-06, step=9134] Training: 91%|█████████▏| 9135/10000 [2:02:54<07:40, 1.88it/s, loss=0.0132, lr=2.96e-06, step=9134] Training: 91%|█████████▏| 9135/10000 [2:02:54<07:40, 1.88it/s, loss=0.0027, lr=2.96e-06, step=9135] Training: 91%|█████████▏| 9136/10000 [2:02:55<08:58, 1.60it/s, loss=0.0027, lr=2.96e-06, step=9135] Training: 91%|█████████▏| 9136/10000 [2:02:55<08:58, 1.60it/s, loss=0.0156, lr=2.96e-06, step=9136] Training: 91%|█████████▏| 9137/10000 [2:02:55<08:24, 1.71it/s, loss=0.0156, lr=2.96e-06, step=9136] Training: 91%|█████████▏| 9137/10000 [2:02:55<08:24, 1.71it/s, loss=0.0017, lr=2.96e-06, step=9137] Training: 91%|█████████▏| 9138/10000 [2:02:56<09:05, 1.58it/s, loss=0.0017, lr=2.96e-06, step=9137] Training: 91%|█████████▏| 9138/10000 [2:02:56<09:05, 1.58it/s, loss=0.0026, lr=2.96e-06, step=9138] Training: 91%|█████████▏| 9139/10000 [2:02:57<08:26, 1.70it/s, loss=0.0026, lr=2.96e-06, step=9138] Training: 91%|█████████▏| 9139/10000 [2:02:57<08:26, 1.70it/s, loss=0.0193, lr=2.95e-06, step=9139]20:47:29.689 [I] step=9140 loss=0.0137 smoothed_loss=0.0121 lr=2.96e-06 grad_norm=0.4123 step_time=0.4959s data_time=0.0962s it/s=1.689 eta_to_10000=509.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0244 grad_action_out_proj_arms=0.1624 grad_arm_token_fuse=0.1315 grad_shared_expert=0.4914 (18633:train_pytorch.py:850) + Training: 91%|█████████▏| 9140/10000 [2:02:57<08:40, 1.65it/s, loss=0.0193, lr=2.95e-06, step=9139] Training: 91%|█████████▏| 9140/10000 [2:02:57<08:40, 1.65it/s, loss=0.0137, lr=2.95e-06, step=9140] Training: 91%|█████████▏| 9141/10000 [2:02:58<08:12, 1.74it/s, loss=0.0137, lr=2.95e-06, step=9140] Training: 91%|█████████▏| 9141/10000 [2:02:58<08:12, 1.74it/s, loss=0.0061, lr=2.95e-06, step=9141] Training: 91%|█████████▏| 9142/10000 [2:02:59<09:24, 1.52it/s, loss=0.0061, lr=2.95e-06, step=9141] Training: 91%|█████████▏| 9142/10000 [2:02:59<09:24, 1.52it/s, loss=0.0016, lr=2.95e-06, step=9142] Training: 91%|█████████▏| 9143/10000 [2:03:00<10:14, 1.39it/s, loss=0.0016, lr=2.95e-06, step=9142] Training: 91%|█████████▏| 9143/10000 [2:03:00<10:14, 1.39it/s, loss=0.0056, lr=2.95e-06, step=9143] Training: 91%|█████████▏| 9144/10000 [2:03:00<09:36, 1.48it/s, loss=0.0056, lr=2.95e-06, step=9143] Training: 91%|█████████▏| 9144/10000 [2:03:00<09:36, 1.48it/s, loss=0.0028, lr=2.95e-06, step=9144] Training: 91%|█████████▏| 9145/10000 [2:03:01<08:46, 1.62it/s, loss=0.0028, lr=2.95e-06, step=9144] Training: 91%|█████████▏| 9145/10000 [2:03:01<08:46, 1.62it/s, loss=0.0040, lr=2.95e-06, step=9145] Training: 91%|█████████▏| 9146/10000 [2:03:01<08:12, 1.73it/s, loss=0.0040, lr=2.95e-06, step=9145] Training: 91%|█████████▏| 9146/10000 [2:03:01<08:12, 1.73it/s, loss=0.0151, lr=2.95e-06, step=9146] Training: 91%|█████████▏| 9147/10000 [2:03:02<08:43, 1.63it/s, loss=0.0151, lr=2.95e-06, step=9146] Training: 91%|█████████▏| 9147/10000 [2:03:02<08:43, 1.63it/s, loss=0.0032, lr=2.95e-06, step=9147] Training: 91%|█████████▏| 9148/10000 [2:03:02<08:08, 1.74it/s, loss=0.0032, lr=2.95e-06, step=9147] Training: 91%|█████████▏| 9148/10000 [2:03:02<08:08, 1.74it/s, loss=0.0217, lr=2.94e-06, step=9148] Training: 91%|█████████▏| 9149/10000 [2:03:03<08:27, 1.68it/s, loss=0.0217, lr=2.94e-06, step=9148] Training: 91%|█████████▏| 9149/10000 [2:03:03<08:27, 1.68it/s, loss=0.0070, lr=2.94e-06, step=9149]20:47:35.981 [I] step=9150 loss=0.0031 smoothed_loss=0.0091 lr=2.95e-06 grad_norm=0.3587 step_time=0.5291s data_time=0.1001s it/s=1.590 eta_to_10000=534.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0068 grad_action_out_proj_arms=0.0576 grad_arm_token_fuse=0.0352 grad_shared_expert=0.1439 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9150/10000 [2:03:04<08:56, 1.58it/s, loss=0.0070, lr=2.94e-06, step=9149] Training: 92%|█████████▏| 9150/10000 [2:03:04<08:56, 1.58it/s, loss=0.0031, lr=2.94e-06, step=9150] Training: 92%|█████████▏| 9151/10000 [2:03:04<08:20, 1.70it/s, loss=0.0031, lr=2.94e-06, step=9150] Training: 92%|█████████▏| 9151/10000 [2:03:04<08:20, 1.70it/s, loss=0.0011, lr=2.94e-06, step=9151] Training: 92%|█████████▏| 9152/10000 [2:03:05<07:52, 1.79it/s, loss=0.0011, lr=2.94e-06, step=9151] Training: 92%|█████████▏| 9152/10000 [2:03:05<07:52, 1.79it/s, loss=0.0015, lr=2.94e-06, step=9152] Training: 92%|█████████▏| 9153/10000 [2:03:05<07:28, 1.89it/s, loss=0.0015, lr=2.94e-06, step=9152] Training: 92%|█████████▏| 9153/10000 [2:03:05<07:28, 1.89it/s, loss=0.0020, lr=2.94e-06, step=9153] Training: 92%|█████████▏| 9154/10000 [2:03:06<07:19, 1.93it/s, loss=0.0020, lr=2.94e-06, step=9153] Training: 92%|█████████▏| 9154/10000 [2:03:06<07:19, 1.93it/s, loss=0.0086, lr=2.94e-06, step=9154] Training: 92%|█████████▏| 9155/10000 [2:03:06<07:10, 1.96it/s, loss=0.0086, lr=2.94e-06, step=9154] Training: 92%|█████████▏| 9155/10000 [2:03:06<07:10, 1.96it/s, loss=0.0008, lr=2.94e-06, step=9155] Training: 92%|█████████▏| 9156/10000 [2:03:07<07:04, 1.99it/s, loss=0.0008, lr=2.94e-06, step=9155] Training: 92%|█████████▏| 9156/10000 [2:03:07<07:04, 1.99it/s, loss=0.0077, lr=2.94e-06, step=9156] Training: 92%|█████████▏| 9157/10000 [2:03:07<07:54, 1.78it/s, loss=0.0077, lr=2.94e-06, step=9156] Training: 92%|█████████▏| 9157/10000 [2:03:07<07:54, 1.78it/s, loss=0.0036, lr=2.94e-06, step=9157] Training: 92%|█████████▏| 9158/10000 [2:03:08<08:33, 1.64it/s, loss=0.0036, lr=2.94e-06, step=9157] Training: 92%|█████████▏| 9158/10000 [2:03:08<08:33, 1.64it/s, loss=0.0229, lr=2.93e-06, step=9158] Training: 92%|█████████▏| 9159/10000 [2:03:09<08:44, 1.60it/s, loss=0.0229, lr=2.93e-06, step=9158] Training: 92%|█████████▏| 9159/10000 [2:03:09<08:44, 1.60it/s, loss=0.0115, lr=2.93e-06, step=9159]20:47:41.491 [I] step=9160 loss=0.0087 smoothed_loss=0.0084 lr=2.94e-06 grad_norm=0.4298 step_time=0.4616s data_time=0.0895s it/s=1.815 eta_to_10000=462.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0051 grad_action_out_proj_arms=0.0480 grad_arm_token_fuse=0.0294 grad_shared_expert=0.4031 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9160/10000 [2:03:09<08:19, 1.68it/s, loss=0.0115, lr=2.93e-06, step=9159] Training: 92%|█████████▏| 9160/10000 [2:03:09<08:19, 1.68it/s, loss=0.0087, lr=2.93e-06, step=9160] Training: 92%|█████████▏| 9161/10000 [2:03:10<07:51, 1.78it/s, loss=0.0087, lr=2.93e-06, step=9160] Training: 92%|█████████▏| 9161/10000 [2:03:10<07:51, 1.78it/s, loss=0.0067, lr=2.93e-06, step=9161] Training: 92%|█████████▏| 9162/10000 [2:03:10<07:30, 1.86it/s, loss=0.0067, lr=2.93e-06, step=9161] Training: 92%|█████████▏| 9162/10000 [2:03:10<07:30, 1.86it/s, loss=0.0039, lr=2.93e-06, step=9162] Training: 92%|█████████▏| 9163/10000 [2:03:11<07:17, 1.91it/s, loss=0.0039, lr=2.93e-06, step=9162] Training: 92%|█████████▏| 9163/10000 [2:03:11<07:17, 1.91it/s, loss=0.0027, lr=2.93e-06, step=9163] Training: 92%|█████████▏| 9164/10000 [2:03:11<07:53, 1.77it/s, loss=0.0027, lr=2.93e-06, step=9163] Training: 92%|█████████▏| 9164/10000 [2:03:11<07:53, 1.77it/s, loss=0.0175, lr=2.93e-06, step=9164] Training: 92%|█████████▏| 9165/10000 [2:03:12<08:23, 1.66it/s, loss=0.0175, lr=2.93e-06, step=9164] Training: 92%|█████████▏| 9165/10000 [2:03:12<08:23, 1.66it/s, loss=0.0086, lr=2.93e-06, step=9165] Training: 92%|█████████▏| 9166/10000 [2:03:13<08:17, 1.68it/s, loss=0.0086, lr=2.93e-06, step=9165] Training: 92%|█████████▏| 9166/10000 [2:03:13<08:17, 1.68it/s, loss=0.0039, lr=2.93e-06, step=9166] Training: 92%|█████████▏| 9167/10000 [2:03:13<07:54, 1.75it/s, loss=0.0039, lr=2.93e-06, step=9166] Training: 92%|█████████▏| 9167/10000 [2:03:13<07:54, 1.75it/s, loss=0.0071, lr=2.93e-06, step=9167] Training: 92%|█████████▏| 9168/10000 [2:03:14<09:08, 1.52it/s, loss=0.0071, lr=2.93e-06, step=9167] Training: 92%|█████████▏| 9168/10000 [2:03:14<09:08, 1.52it/s, loss=0.0036, lr=2.92e-06, step=9168] Training: 92%|█████████▏| 9169/10000 [2:03:15<10:09, 1.36it/s, loss=0.0036, lr=2.92e-06, step=9168] Training: 92%|█████████▏| 9169/10000 [2:03:15<10:09, 1.36it/s, loss=0.0204, lr=2.92e-06, step=9169]20:47:47.715 [I] step=9170 loss=0.0068 smoothed_loss=0.0085 lr=2.93e-06 grad_norm=0.4503 step_time=0.5115s data_time=0.1108s it/s=1.607 eta_to_10000=516.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.0711 grad_arm_token_fuse=0.0332 grad_shared_expert=0.2030 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9170/10000 [2:03:15<09:22, 1.47it/s, loss=0.0204, lr=2.92e-06, step=9169] Training: 92%|█████████▏| 9170/10000 [2:03:15<09:22, 1.47it/s, loss=0.0068, lr=2.92e-06, step=9170] Training: 92%|█████████▏| 9171/10000 [2:03:16<10:36, 1.30it/s, loss=0.0068, lr=2.92e-06, step=9170] Training: 92%|█████████▏| 9171/10000 [2:03:16<10:36, 1.30it/s, loss=0.0067, lr=2.92e-06, step=9171] Training: 92%|█████████▏| 9172/10000 [2:03:17<11:04, 1.25it/s, loss=0.0067, lr=2.92e-06, step=9171] Training: 92%|█████████▏| 9172/10000 [2:03:17<11:04, 1.25it/s, loss=0.0042, lr=2.92e-06, step=9172] Training: 92%|█████████▏| 9173/10000 [2:03:18<10:48, 1.27it/s, loss=0.0042, lr=2.92e-06, step=9172] Training: 92%|█████████▏| 9173/10000 [2:03:18<10:48, 1.27it/s, loss=0.0153, lr=2.92e-06, step=9173] Training: 92%|█████████▏| 9174/10000 [2:03:19<10:36, 1.30it/s, loss=0.0153, lr=2.92e-06, step=9173] Training: 92%|█████████▏| 9174/10000 [2:03:19<10:36, 1.30it/s, loss=0.0138, lr=2.92e-06, step=9174] Training: 92%|█████████▏| 9175/10000 [2:03:19<09:28, 1.45it/s, loss=0.0138, lr=2.92e-06, step=9174] Training: 92%|█████████▏| 9175/10000 [2:03:19<09:28, 1.45it/s, loss=0.0056, lr=2.92e-06, step=9175] Training: 92%|█████████▏| 9176/10000 [2:03:20<08:38, 1.59it/s, loss=0.0056, lr=2.92e-06, step=9175] Training: 92%|█████████▏| 9176/10000 [2:03:20<08:38, 1.59it/s, loss=0.0218, lr=2.92e-06, step=9176] Training: 92%|█████████▏| 9177/10000 [2:03:20<08:13, 1.67it/s, loss=0.0218, lr=2.92e-06, step=9176] Training: 92%|█████████▏| 9177/10000 [2:03:20<08:13, 1.67it/s, loss=0.0066, lr=2.92e-06, step=9177] Training: 92%|█████████▏| 9178/10000 [2:03:21<08:59, 1.52it/s, loss=0.0066, lr=2.92e-06, step=9177] Training: 92%|█████████▏| 9178/10000 [2:03:21<08:59, 1.52it/s, loss=0.0091, lr=2.91e-06, step=9178] Training: 92%|█████████▏| 9179/10000 [2:03:22<09:14, 1.48it/s, loss=0.0091, lr=2.91e-06, step=9178] Training: 92%|█████████▏| 9179/10000 [2:03:22<09:14, 1.48it/s, loss=0.0146, lr=2.91e-06, step=9179]20:47:55.435 [I] step=9180 loss=0.0092 smoothed_loss=0.0101 lr=2.92e-06 grad_norm=0.4310 step_time=0.5902s data_time=0.1818s it/s=1.295 eta_to_10000=633.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0099 grad_action_out_proj_arms=0.0855 grad_arm_token_fuse=0.0554 grad_shared_expert=0.3096 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9180/10000 [2:03:23<11:59, 1.14it/s, loss=0.0146, lr=2.91e-06, step=9179] Training: 92%|█████████▏| 9180/10000 [2:03:23<11:59, 1.14it/s, loss=0.0092, lr=2.91e-06, step=9180] Training: 92%|█████████▏| 9181/10000 [2:03:24<11:12, 1.22it/s, loss=0.0092, lr=2.91e-06, step=9180] Training: 92%|█████████▏| 9181/10000 [2:03:24<11:12, 1.22it/s, loss=0.0180, lr=2.91e-06, step=9181] Training: 92%|█████████▏| 9182/10000 [2:03:24<10:00, 1.36it/s, loss=0.0180, lr=2.91e-06, step=9181] Training: 92%|█████████▏| 9182/10000 [2:03:24<10:00, 1.36it/s, loss=0.0104, lr=2.91e-06, step=9182] Training: 92%|█████████▏| 9183/10000 [2:03:25<10:15, 1.33it/s, loss=0.0104, lr=2.91e-06, step=9182] Training: 92%|█████████▏| 9183/10000 [2:03:25<10:15, 1.33it/s, loss=0.0070, lr=2.91e-06, step=9183] Training: 92%|█████████▏| 9184/10000 [2:03:26<09:25, 1.44it/s, loss=0.0070, lr=2.91e-06, step=9183] Training: 92%|█████████▏| 9184/10000 [2:03:26<09:25, 1.44it/s, loss=0.0137, lr=2.91e-06, step=9184] Training: 92%|█████████▏| 9185/10000 [2:03:26<09:16, 1.46it/s, loss=0.0137, lr=2.91e-06, step=9184] Training: 92%|█████████▏| 9185/10000 [2:03:26<09:16, 1.46it/s, loss=0.0050, lr=2.91e-06, step=9185] Training: 92%|█████████▏| 9186/10000 [2:03:27<11:01, 1.23it/s, loss=0.0050, lr=2.91e-06, step=9185] Training: 92%|█████████▏| 9186/10000 [2:03:27<11:01, 1.23it/s, loss=0.0032, lr=2.91e-06, step=9186] Training: 92%|█████████▏| 9187/10000 [2:03:28<10:02, 1.35it/s, loss=0.0032, lr=2.91e-06, step=9186] Training: 92%|█████████▏| 9187/10000 [2:03:28<10:02, 1.35it/s, loss=0.0047, lr=2.91e-06, step=9187] Training: 92%|█████████▏| 9188/10000 [2:03:29<10:03, 1.35it/s, loss=0.0047, lr=2.91e-06, step=9187] Training: 92%|█████████▏| 9188/10000 [2:03:29<10:03, 1.35it/s, loss=0.0080, lr=2.90e-06, step=9188] Training: 92%|█████████▏| 9189/10000 [2:03:29<09:07, 1.48it/s, loss=0.0080, lr=2.90e-06, step=9188] Training: 92%|█████████▏| 9189/10000 [2:03:29<09:07, 1.48it/s, loss=0.0067, lr=2.90e-06, step=9189]20:48:02.264 [I] step=9190 loss=0.0115 smoothed_loss=0.0090 lr=2.91e-06 grad_norm=0.3980 step_time=0.5593s data_time=0.1237s it/s=1.465 eta_to_10000=553.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0101 grad_action_out_proj_arms=0.0758 grad_arm_token_fuse=0.0537 grad_shared_expert=0.3365 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9190/10000 [2:03:30<08:59, 1.50it/s, loss=0.0067, lr=2.90e-06, step=9189] Training: 92%|█████████▏| 9190/10000 [2:03:30<08:59, 1.50it/s, loss=0.0115, lr=2.90e-06, step=9190] Training: 92%|█████████▏| 9191/10000 [2:03:31<09:34, 1.41it/s, loss=0.0115, lr=2.90e-06, step=9190] Training: 92%|█████████▏| 9191/10000 [2:03:31<09:34, 1.41it/s, loss=0.0010, lr=2.90e-06, step=9191] Training: 92%|█████████▏| 9192/10000 [2:03:32<10:00, 1.35it/s, loss=0.0010, lr=2.90e-06, step=9191] Training: 92%|█████████▏| 9192/10000 [2:03:32<10:00, 1.35it/s, loss=0.0010, lr=2.90e-06, step=9192] Training: 92%|█████████▏| 9193/10000 [2:03:33<10:58, 1.23it/s, loss=0.0010, lr=2.90e-06, step=9192] Training: 92%|█████████▏| 9193/10000 [2:03:33<10:58, 1.23it/s, loss=0.0015, lr=2.90e-06, step=9193] Training: 92%|█████████▏| 9194/10000 [2:03:33<09:37, 1.40it/s, loss=0.0015, lr=2.90e-06, step=9193] Training: 92%|█████████▏| 9194/10000 [2:03:33<09:37, 1.40it/s, loss=0.0011, lr=2.90e-06, step=9194] Training: 92%|█████████▏| 9195/10000 [2:03:34<08:43, 1.54it/s, loss=0.0011, lr=2.90e-06, step=9194] Training: 92%|█████████▏| 9195/10000 [2:03:34<08:43, 1.54it/s, loss=0.0096, lr=2.90e-06, step=9195] Training: 92%|█████████▏| 9196/10000 [2:03:34<08:28, 1.58it/s, loss=0.0096, lr=2.90e-06, step=9195] Training: 92%|█████████▏| 9196/10000 [2:03:34<08:28, 1.58it/s, loss=0.0154, lr=2.90e-06, step=9196] Training: 92%|█████████▏| 9197/10000 [2:03:35<08:01, 1.67it/s, loss=0.0154, lr=2.90e-06, step=9196] Training: 92%|█████████▏| 9197/10000 [2:03:35<08:01, 1.67it/s, loss=0.0019, lr=2.90e-06, step=9197] Training: 92%|█████████▏| 9198/10000 [2:03:35<07:39, 1.74it/s, loss=0.0019, lr=2.90e-06, step=9197] Training: 92%|█████████▏| 9198/10000 [2:03:35<07:39, 1.74it/s, loss=0.0019, lr=2.89e-06, step=9198] Training: 92%|█████████▏| 9199/10000 [2:03:36<08:24, 1.59it/s, loss=0.0019, lr=2.89e-06, step=9198] Training: 92%|█████████▏| 9199/10000 [2:03:36<08:24, 1.59it/s, loss=0.0062, lr=2.89e-06, step=9199]20:48:09.249 [I] step=9200 loss=0.0018 smoothed_loss=0.0059 lr=2.90e-06 grad_norm=0.3674 step_time=0.5812s data_time=0.1173s it/s=1.432 eta_to_10000=558.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0055 grad_action_out_proj_arms=0.0539 grad_arm_token_fuse=0.0331 grad_shared_expert=0.2594 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9200/10000 [2:03:37<09:52, 1.35it/s, loss=0.0062, lr=2.89e-06, step=9199] Training: 92%|█████████▏| 9200/10000 [2:03:37<09:52, 1.35it/s, loss=0.0018, lr=2.89e-06, step=9200] Training: 92%|█████████▏| 9201/10000 [2:03:38<09:19, 1.43it/s, loss=0.0018, lr=2.89e-06, step=9200] Training: 92%|█████████▏| 9201/10000 [2:03:38<09:19, 1.43it/s, loss=0.0183, lr=2.89e-06, step=9201] Training: 92%|█████████▏| 9202/10000 [2:03:38<08:33, 1.55it/s, loss=0.0183, lr=2.89e-06, step=9201] Training: 92%|█████████▏| 9202/10000 [2:03:38<08:33, 1.55it/s, loss=0.0017, lr=2.89e-06, step=9202] Training: 92%|█████████▏| 9203/10000 [2:03:39<08:51, 1.50it/s, loss=0.0017, lr=2.89e-06, step=9202] Training: 92%|█████████▏| 9203/10000 [2:03:39<08:51, 1.50it/s, loss=0.0050, lr=2.89e-06, step=9203] Training: 92%|█████████▏| 9204/10000 [2:03:39<08:13, 1.61it/s, loss=0.0050, lr=2.89e-06, step=9203] Training: 92%|█████████▏| 9204/10000 [2:03:39<08:13, 1.61it/s, loss=0.0009, lr=2.89e-06, step=9204] Training: 92%|█████████▏| 9205/10000 [2:03:40<08:35, 1.54it/s, loss=0.0009, lr=2.89e-06, step=9204] Training: 92%|█████████▏| 9205/10000 [2:03:40<08:35, 1.54it/s, loss=0.0028, lr=2.89e-06, step=9205] Training: 92%|█████████▏| 9206/10000 [2:03:41<08:33, 1.55it/s, loss=0.0028, lr=2.89e-06, step=9205] Training: 92%|█████████▏| 9206/10000 [2:03:41<08:33, 1.55it/s, loss=0.0026, lr=2.89e-06, step=9206] Training: 92%|█████████▏| 9207/10000 [2:03:41<08:49, 1.50it/s, loss=0.0026, lr=2.89e-06, step=9206] Training: 92%|█████████▏| 9207/10000 [2:03:41<08:49, 1.50it/s, loss=0.0041, lr=2.89e-06, step=9207] Training: 92%|█████████▏| 9208/10000 [2:03:42<08:11, 1.61it/s, loss=0.0041, lr=2.89e-06, step=9207] Training: 92%|█████████▏| 9208/10000 [2:03:42<08:11, 1.61it/s, loss=0.0028, lr=2.88e-06, step=9208] Training: 92%|█████████▏| 9209/10000 [2:03:42<08:11, 1.61it/s, loss=0.0028, lr=2.88e-06, step=9208] Training: 92%|█████████▏| 9209/10000 [2:03:42<08:11, 1.61it/s, loss=0.0026, lr=2.88e-06, step=9209]20:48:15.448 [I] step=9210 loss=0.0065 smoothed_loss=0.0049 lr=2.89e-06 grad_norm=0.4866 step_time=0.5180s data_time=0.1019s it/s=1.614 eta_to_10000=489.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0090 grad_action_out_proj_arms=0.0862 grad_arm_token_fuse=0.0447 grad_shared_expert=0.3653 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9210/10000 [2:03:43<08:15, 1.59it/s, loss=0.0026, lr=2.88e-06, step=9209] Training: 92%|█████████▏| 9210/10000 [2:03:43<08:15, 1.59it/s, loss=0.0065, lr=2.88e-06, step=9210] Training: 92%|█████████▏| 9211/10000 [2:03:44<07:39, 1.72it/s, loss=0.0065, lr=2.88e-06, step=9210] Training: 92%|█████████▏| 9211/10000 [2:03:44<07:39, 1.72it/s, loss=0.0089, lr=2.88e-06, step=9211] Training: 92%|█████████▏| 9212/10000 [2:03:44<07:16, 1.80it/s, loss=0.0089, lr=2.88e-06, step=9211] Training: 92%|█████████▏| 9212/10000 [2:03:44<07:16, 1.80it/s, loss=0.0170, lr=2.88e-06, step=9212] Training: 92%|█████████▏| 9213/10000 [2:03:45<08:30, 1.54it/s, loss=0.0170, lr=2.88e-06, step=9212] Training: 92%|█████████▏| 9213/10000 [2:03:45<08:30, 1.54it/s, loss=0.0103, lr=2.88e-06, step=9213] Training: 92%|█████████▏| 9214/10000 [2:03:46<09:13, 1.42it/s, loss=0.0103, lr=2.88e-06, step=9213] Training: 92%|█████████▏| 9214/10000 [2:03:46<09:13, 1.42it/s, loss=0.0029, lr=2.88e-06, step=9214] Training: 92%|█████████▏| 9215/10000 [2:03:47<09:17, 1.41it/s, loss=0.0029, lr=2.88e-06, step=9214] Training: 92%|█████████▏| 9215/10000 [2:03:47<09:17, 1.41it/s, loss=0.0092, lr=2.88e-06, step=9215] Training: 92%|█████████▏| 9216/10000 [2:03:47<08:25, 1.55it/s, loss=0.0092, lr=2.88e-06, step=9215] Training: 92%|█████████▏| 9216/10000 [2:03:47<08:25, 1.55it/s, loss=0.0028, lr=2.88e-06, step=9216] Training: 92%|█████████▏| 9217/10000 [2:03:47<07:49, 1.67it/s, loss=0.0028, lr=2.88e-06, step=9216] Training: 92%|█████████▏| 9217/10000 [2:03:47<07:49, 1.67it/s, loss=0.0081, lr=2.88e-06, step=9217] Training: 92%|█████████▏| 9218/10000 [2:03:48<07:25, 1.76it/s, loss=0.0081, lr=2.88e-06, step=9217] Training: 92%|█████████▏| 9218/10000 [2:03:48<07:25, 1.76it/s, loss=0.0116, lr=2.88e-06, step=9218] Training: 92%|█████████▏| 9219/10000 [2:03:49<07:09, 1.82it/s, loss=0.0116, lr=2.88e-06, step=9218] Training: 92%|█████████▏| 9219/10000 [2:03:49<07:09, 1.82it/s, loss=0.0069, lr=2.87e-06, step=9219]20:48:21.362 [I] step=9220 loss=0.0045 smoothed_loss=0.0068 lr=2.88e-06 grad_norm=0.3791 step_time=0.5084s data_time=0.0830s it/s=1.691 eta_to_10000=461.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0030 grad_action_out_proj_arms=0.0445 grad_arm_token_fuse=0.0142 grad_shared_expert=0.1731 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9220/10000 [2:03:49<07:04, 1.84it/s, loss=0.0069, lr=2.87e-06, step=9219] Training: 92%|█████████▏| 9220/10000 [2:03:49<07:04, 1.84it/s, loss=0.0045, lr=2.87e-06, step=9220] Training: 92%|█████████▏| 9221/10000 [2:03:50<07:42, 1.68it/s, loss=0.0045, lr=2.87e-06, step=9220] Training: 92%|█████████▏| 9221/10000 [2:03:50<07:42, 1.68it/s, loss=0.0029, lr=2.87e-06, step=9221] Training: 92%|█████████▏| 9222/10000 [2:03:51<08:21, 1.55it/s, loss=0.0029, lr=2.87e-06, step=9221] Training: 92%|█████████▏| 9222/10000 [2:03:51<08:21, 1.55it/s, loss=0.0037, lr=2.87e-06, step=9222] Training: 92%|█████████▏| 9223/10000 [2:03:51<07:43, 1.68it/s, loss=0.0037, lr=2.87e-06, step=9222] Training: 92%|█████████▏| 9223/10000 [2:03:51<07:43, 1.68it/s, loss=0.0032, lr=2.87e-06, step=9223] Training: 92%|█████████▏| 9224/10000 [2:03:52<08:03, 1.61it/s, loss=0.0032, lr=2.87e-06, step=9223] Training: 92%|█████████▏| 9224/10000 [2:03:52<08:03, 1.61it/s, loss=0.0140, lr=2.87e-06, step=9224] Training: 92%|█████████▏| 9225/10000 [2:03:53<09:04, 1.42it/s, loss=0.0140, lr=2.87e-06, step=9224] Training: 92%|█████████▏| 9225/10000 [2:03:53<09:04, 1.42it/s, loss=0.0046, lr=2.87e-06, step=9225] Training: 92%|█████████▏| 9226/10000 [2:03:53<08:18, 1.55it/s, loss=0.0046, lr=2.87e-06, step=9225] Training: 92%|█████████▏| 9226/10000 [2:03:53<08:18, 1.55it/s, loss=0.0037, lr=2.87e-06, step=9226] Training: 92%|█████████▏| 9227/10000 [2:03:54<08:49, 1.46it/s, loss=0.0037, lr=2.87e-06, step=9226] Training: 92%|█████████▏| 9227/10000 [2:03:54<08:49, 1.46it/s, loss=0.0136, lr=2.87e-06, step=9227] Training: 92%|█████████▏| 9228/10000 [2:03:55<09:57, 1.29it/s, loss=0.0136, lr=2.87e-06, step=9227] Training: 92%|█████████▏| 9228/10000 [2:03:55<09:57, 1.29it/s, loss=0.0915, lr=2.87e-06, step=9228] Training: 92%|█████████▏| 9229/10000 [2:03:56<10:06, 1.27it/s, loss=0.0915, lr=2.87e-06, step=9228] Training: 92%|█████████▏| 9229/10000 [2:03:56<10:06, 1.27it/s, loss=0.0036, lr=2.86e-06, step=9229]20:48:28.499 [I] step=9230 loss=0.0045 smoothed_loss=0.0132 lr=2.87e-06 grad_norm=0.3952 step_time=0.5755s data_time=0.1382s it/s=1.401 eta_to_10000=549.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0102 grad_action_out_proj_arms=0.0763 grad_arm_token_fuse=0.0495 grad_shared_expert=0.2698 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9230/10000 [2:03:56<09:04, 1.41it/s, loss=0.0036, lr=2.86e-06, step=9229] Training: 92%|█████████▏| 9230/10000 [2:03:56<09:04, 1.41it/s, loss=0.0045, lr=2.86e-06, step=9230] Training: 92%|█████████▏| 9231/10000 [2:03:57<09:19, 1.37it/s, loss=0.0045, lr=2.86e-06, step=9230] Training: 92%|█████████▏| 9231/10000 [2:03:57<09:19, 1.37it/s, loss=0.0017, lr=2.86e-06, step=9231] Training: 92%|█████████▏| 9232/10000 [2:03:57<08:28, 1.51it/s, loss=0.0017, lr=2.86e-06, step=9231] Training: 92%|█████████▏| 9232/10000 [2:03:57<08:28, 1.51it/s, loss=0.0015, lr=2.86e-06, step=9232] Training: 92%|█████████▏| 9233/10000 [2:03:58<08:33, 1.49it/s, loss=0.0015, lr=2.86e-06, step=9232] Training: 92%|█████████▏| 9233/10000 [2:03:58<08:33, 1.49it/s, loss=0.0111, lr=2.86e-06, step=9233] Training: 92%|█████████▏| 9234/10000 [2:03:59<09:07, 1.40it/s, loss=0.0111, lr=2.86e-06, step=9233] Training: 92%|█████████▏| 9234/10000 [2:03:59<09:07, 1.40it/s, loss=0.0129, lr=2.86e-06, step=9234] Training: 92%|█████████▏| 9235/10000 [2:04:00<08:31, 1.49it/s, loss=0.0129, lr=2.86e-06, step=9234] Training: 92%|█████████▏| 9235/10000 [2:04:00<08:31, 1.49it/s, loss=0.0146, lr=2.86e-06, step=9235] Training: 92%|█████████▏| 9236/10000 [2:04:00<08:36, 1.48it/s, loss=0.0146, lr=2.86e-06, step=9235] Training: 92%|█████████▏| 9236/10000 [2:04:00<08:36, 1.48it/s, loss=0.0040, lr=2.86e-06, step=9236] Training: 92%|█████████▏| 9237/10000 [2:04:01<07:54, 1.61it/s, loss=0.0040, lr=2.86e-06, step=9236] Training: 92%|█████████▏| 9237/10000 [2:04:01<07:54, 1.61it/s, loss=0.0027, lr=2.86e-06, step=9237] Training: 92%|█████████▏| 9238/10000 [2:04:01<08:10, 1.55it/s, loss=0.0027, lr=2.86e-06, step=9237] Training: 92%|█████████▏| 9238/10000 [2:04:01<08:10, 1.55it/s, loss=0.0150, lr=2.86e-06, step=9238] Training: 92%|█████████▏| 9239/10000 [2:04:02<08:25, 1.50it/s, loss=0.0150, lr=2.86e-06, step=9238] Training: 92%|█████████▏| 9239/10000 [2:04:02<08:25, 1.50it/s, loss=0.0013, lr=2.86e-06, step=9239]20:48:35.000 [I] step=9240 loss=0.0038 smoothed_loss=0.0090 lr=2.86e-06 grad_norm=0.3649 step_time=0.5200s data_time=0.1301s it/s=1.538 eta_to_10000=494.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0080 grad_action_out_proj_arms=0.0609 grad_arm_token_fuse=0.0391 grad_shared_expert=0.3390 (18633:train_pytorch.py:850) + Training: 92%|█████████▏| 9240/10000 [2:04:03<07:59, 1.59it/s, loss=0.0013, lr=2.86e-06, step=9239] Training: 92%|█████████▏| 9240/10000 [2:04:03<07:59, 1.59it/s, loss=0.0038, lr=2.85e-06, step=9240] Training: 92%|█████████▏| 9241/10000 [2:04:03<07:23, 1.71it/s, loss=0.0038, lr=2.85e-06, step=9240] Training: 92%|█████████▏| 9241/10000 [2:04:03<07:23, 1.71it/s, loss=0.0029, lr=2.85e-06, step=9241] Training: 92%|█████████▏| 9242/10000 [2:04:04<07:02, 1.80it/s, loss=0.0029, lr=2.85e-06, step=9241] Training: 92%|█████████▏| 9242/10000 [2:04:04<07:02, 1.80it/s, loss=0.0024, lr=2.85e-06, step=9242] Training: 92%|█████████▏| 9243/10000 [2:04:04<07:42, 1.64it/s, loss=0.0024, lr=2.85e-06, step=9242] Training: 92%|█████████▏| 9243/10000 [2:04:04<07:42, 1.64it/s, loss=0.0258, lr=2.85e-06, step=9243] Training: 92%|█████████▏| 9244/10000 [2:04:05<07:13, 1.75it/s, loss=0.0258, lr=2.85e-06, step=9243] Training: 92%|█████████▏| 9244/10000 [2:04:05<07:13, 1.75it/s, loss=0.0165, lr=2.85e-06, step=9244] Training: 92%|█████████▏| 9245/10000 [2:04:05<06:52, 1.83it/s, loss=0.0165, lr=2.85e-06, step=9244] Training: 92%|█████████▏| 9245/10000 [2:04:05<06:52, 1.83it/s, loss=0.0096, lr=2.85e-06, step=9245] Training: 92%|█████████▏| 9246/10000 [2:04:06<07:41, 1.63it/s, loss=0.0096, lr=2.85e-06, step=9245] Training: 92%|█████████▏| 9246/10000 [2:04:06<07:41, 1.63it/s, loss=0.0034, lr=2.85e-06, step=9246] Training: 92%|█████████▏| 9247/10000 [2:04:07<07:12, 1.74it/s, loss=0.0034, lr=2.85e-06, step=9246] Training: 92%|█████████▏| 9247/10000 [2:04:07<07:12, 1.74it/s, loss=0.0009, lr=2.85e-06, step=9247] Training: 92%|█████████▏| 9248/10000 [2:04:07<07:25, 1.69it/s, loss=0.0009, lr=2.85e-06, step=9247] Training: 92%|█████████▏| 9248/10000 [2:04:07<07:25, 1.69it/s, loss=0.0082, lr=2.85e-06, step=9248] Training: 92%|█████████▏| 9249/10000 [2:04:08<07:16, 1.72it/s, loss=0.0082, lr=2.85e-06, step=9248] Training: 92%|█████████▏| 9249/10000 [2:04:08<07:16, 1.72it/s, loss=0.0079, lr=2.85e-06, step=9249]20:48:40.876 [I] step=9250 loss=0.0053 smoothed_loss=0.0082 lr=2.85e-06 grad_norm=0.3301 step_time=0.4974s data_time=0.0902s it/s=1.702 eta_to_10000=440.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0047 grad_action_out_proj_arms=0.0521 grad_arm_token_fuse=0.0251 grad_shared_expert=0.3289 (18633:train_pytorch.py:850) + Training: 92%|█████████▎| 9250/10000 [2:04:09<07:56, 1.57it/s, loss=0.0079, lr=2.85e-06, step=9249] Training: 92%|█████████▎| 9250/10000 [2:04:09<07:56, 1.57it/s, loss=0.0053, lr=2.85e-06, step=9250] Training: 93%|█████████▎| 9251/10000 [2:04:09<07:58, 1.57it/s, loss=0.0053, lr=2.85e-06, step=9250] Training: 93%|█████████▎| 9251/10000 [2:04:09<07:58, 1.57it/s, loss=0.0065, lr=2.84e-06, step=9251] Training: 93%|█████████▎| 9252/10000 [2:04:10<07:47, 1.60it/s, loss=0.0065, lr=2.84e-06, step=9251] Training: 93%|█████████▎| 9252/10000 [2:04:10<07:47, 1.60it/s, loss=0.0053, lr=2.84e-06, step=9252] Training: 93%|█████████▎| 9253/10000 [2:04:10<07:15, 1.72it/s, loss=0.0053, lr=2.84e-06, step=9252] Training: 93%|█████████▎| 9253/10000 [2:04:10<07:15, 1.72it/s, loss=0.0074, lr=2.84e-06, step=9253] Training: 93%|█████████▎| 9254/10000 [2:04:11<07:31, 1.65it/s, loss=0.0074, lr=2.84e-06, step=9253] Training: 93%|█████████▎| 9254/10000 [2:04:11<07:31, 1.65it/s, loss=0.0037, lr=2.84e-06, step=9254] Training: 93%|█████████▎| 9255/10000 [2:04:11<07:10, 1.73it/s, loss=0.0037, lr=2.84e-06, step=9254] Training: 93%|█████████▎| 9255/10000 [2:04:11<07:10, 1.73it/s, loss=0.0034, lr=2.84e-06, step=9255] Training: 93%|█████████▎| 9256/10000 [2:04:12<06:49, 1.81it/s, loss=0.0034, lr=2.84e-06, step=9255] Training: 93%|█████████▎| 9256/10000 [2:04:12<06:49, 1.81it/s, loss=0.0031, lr=2.84e-06, step=9256] Training: 93%|█████████▎| 9257/10000 [2:04:13<07:19, 1.69it/s, loss=0.0031, lr=2.84e-06, step=9256] Training: 93%|█████████▎| 9257/10000 [2:04:13<07:19, 1.69it/s, loss=0.0107, lr=2.84e-06, step=9257] Training: 93%|█████████▎| 9258/10000 [2:04:13<07:30, 1.65it/s, loss=0.0107, lr=2.84e-06, step=9257] Training: 93%|█████████▎| 9258/10000 [2:04:13<07:30, 1.65it/s, loss=0.0329, lr=2.84e-06, step=9258] Training: 93%|█████████▎| 9259/10000 [2:04:14<07:13, 1.71it/s, loss=0.0329, lr=2.84e-06, step=9258] Training: 93%|█████████▎| 9259/10000 [2:04:14<07:13, 1.71it/s, loss=0.0345, lr=2.84e-06, step=9259]20:48:46.797 [I] step=9260 loss=0.0035 smoothed_loss=0.0112 lr=2.84e-06 grad_norm=0.4104 step_time=0.5088s data_time=0.0833s it/s=1.689 eta_to_10000=438.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0104 grad_action_out_proj_arms=0.0817 grad_arm_token_fuse=0.0532 grad_shared_expert=0.6380 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9260/10000 [2:04:14<07:32, 1.63it/s, loss=0.0345, lr=2.84e-06, step=9259] Training: 93%|█████████▎| 9260/10000 [2:04:14<07:32, 1.63it/s, loss=0.0035, lr=2.84e-06, step=9260] Training: 93%|█████████▎| 9261/10000 [2:04:15<07:49, 1.57it/s, loss=0.0035, lr=2.84e-06, step=9260] Training: 93%|█████████▎| 9261/10000 [2:04:15<07:49, 1.57it/s, loss=0.0164, lr=2.84e-06, step=9261] Training: 93%|█████████▎| 9262/10000 [2:04:16<07:42, 1.60it/s, loss=0.0164, lr=2.84e-06, step=9261] Training: 93%|█████████▎| 9262/10000 [2:04:16<07:42, 1.60it/s, loss=0.0051, lr=2.83e-06, step=9262] Training: 93%|█████████▎| 9263/10000 [2:04:16<07:20, 1.67it/s, loss=0.0051, lr=2.83e-06, step=9262] Training: 93%|█████████▎| 9263/10000 [2:04:16<07:20, 1.67it/s, loss=0.0022, lr=2.83e-06, step=9263] Training: 93%|█████████▎| 9264/10000 [2:04:17<07:33, 1.62it/s, loss=0.0022, lr=2.83e-06, step=9263] Training: 93%|█████████▎| 9264/10000 [2:04:17<07:33, 1.62it/s, loss=0.0116, lr=2.83e-06, step=9264] Training: 93%|█████████▎| 9265/10000 [2:04:18<07:47, 1.57it/s, loss=0.0116, lr=2.83e-06, step=9264] Training: 93%|█████████▎| 9265/10000 [2:04:18<07:47, 1.57it/s, loss=0.0051, lr=2.83e-06, step=9265] Training: 93%|█████████▎| 9266/10000 [2:04:19<09:02, 1.35it/s, loss=0.0051, lr=2.83e-06, step=9265] Training: 93%|█████████▎| 9266/10000 [2:04:19<09:02, 1.35it/s, loss=0.0030, lr=2.83e-06, step=9266] Training: 93%|█████████▎| 9267/10000 [2:04:19<08:17, 1.47it/s, loss=0.0030, lr=2.83e-06, step=9266] Training: 93%|█████████▎| 9267/10000 [2:04:19<08:17, 1.47it/s, loss=0.0288, lr=2.83e-06, step=9267] Training: 93%|█████████▎| 9268/10000 [2:04:20<07:47, 1.57it/s, loss=0.0288, lr=2.83e-06, step=9267] Training: 93%|█████████▎| 9268/10000 [2:04:20<07:47, 1.57it/s, loss=0.0021, lr=2.83e-06, step=9268] Training: 93%|█████████▎| 9269/10000 [2:04:21<08:42, 1.40it/s, loss=0.0021, lr=2.83e-06, step=9268] Training: 93%|█████████▎| 9269/10000 [2:04:21<08:42, 1.40it/s, loss=0.0016, lr=2.83e-06, step=9269]20:48:53.700 [I] step=9270 loss=0.0064 smoothed_loss=0.0090 lr=2.83e-06 grad_norm=0.4049 step_time=0.5568s data_time=0.1335s it/s=1.449 eta_to_10000=503.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0615 grad_arm_token_fuse=0.0357 grad_shared_expert=0.2373 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9270/10000 [2:04:21<08:56, 1.36it/s, loss=0.0016, lr=2.83e-06, step=9269] Training: 93%|█████████▎| 9270/10000 [2:04:21<08:56, 1.36it/s, loss=0.0064, lr=2.83e-06, step=9270] Training: 93%|█████████▎| 9271/10000 [2:04:22<09:36, 1.26it/s, loss=0.0064, lr=2.83e-06, step=9270] Training: 93%|█████████▎| 9271/10000 [2:04:22<09:36, 1.26it/s, loss=0.0080, lr=2.83e-06, step=9271] Training: 93%|█████████▎| 9272/10000 [2:04:23<09:47, 1.24it/s, loss=0.0080, lr=2.83e-06, step=9271] Training: 93%|█████████▎| 9272/10000 [2:04:23<09:47, 1.24it/s, loss=0.0208, lr=2.83e-06, step=9272] Training: 93%|█████████▎| 9273/10000 [2:04:24<10:51, 1.12it/s, loss=0.0208, lr=2.83e-06, step=9272] Training: 93%|█████████▎| 9273/10000 [2:04:24<10:51, 1.12it/s, loss=0.0091, lr=2.82e-06, step=9273] Training: 93%|█████████▎| 9274/10000 [2:04:25<09:28, 1.28it/s, loss=0.0091, lr=2.82e-06, step=9273] Training: 93%|█████████▎| 9274/10000 [2:04:25<09:28, 1.28it/s, loss=0.0114, lr=2.82e-06, step=9274] Training: 93%|█████████▎| 9275/10000 [2:04:26<09:18, 1.30it/s, loss=0.0114, lr=2.82e-06, step=9274] Training: 93%|█████████▎| 9275/10000 [2:04:26<09:18, 1.30it/s, loss=0.0043, lr=2.82e-06, step=9275] Training: 93%|█████████▎| 9276/10000 [2:04:26<09:06, 1.33it/s, loss=0.0043, lr=2.82e-06, step=9275] Training: 93%|█████████▎| 9276/10000 [2:04:26<09:06, 1.33it/s, loss=0.0070, lr=2.82e-06, step=9276] Training: 93%|█████████▎| 9277/10000 [2:04:27<08:12, 1.47it/s, loss=0.0070, lr=2.82e-06, step=9276] Training: 93%|█████████▎| 9277/10000 [2:04:27<08:12, 1.47it/s, loss=0.0026, lr=2.82e-06, step=9277] Training: 93%|█████████▎| 9278/10000 [2:04:27<08:05, 1.49it/s, loss=0.0026, lr=2.82e-06, step=9277] Training: 93%|█████████▎| 9278/10000 [2:04:27<08:05, 1.49it/s, loss=0.0063, lr=2.82e-06, step=9278] Training: 93%|█████████▎| 9279/10000 [2:04:28<08:40, 1.39it/s, loss=0.0063, lr=2.82e-06, step=9278] Training: 93%|█████████▎| 9279/10000 [2:04:28<08:40, 1.39it/s, loss=0.0116, lr=2.82e-06, step=9279]20:49:01.139 [I] step=9280 loss=0.0057 smoothed_loss=0.0084 lr=2.82e-06 grad_norm=0.4341 step_time=0.6150s data_time=0.1290s it/s=1.344 eta_to_10000=535.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0130 grad_action_out_proj_arms=0.0981 grad_arm_token_fuse=0.0670 grad_shared_expert=0.4208 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9280/10000 [2:04:29<08:11, 1.46it/s, loss=0.0116, lr=2.82e-06, step=9279] Training: 93%|█████████▎| 9280/10000 [2:04:29<08:11, 1.46it/s, loss=0.0057, lr=2.82e-06, step=9280] Training: 93%|█████████▎| 9281/10000 [2:04:29<08:03, 1.49it/s, loss=0.0057, lr=2.82e-06, step=9280] Training: 93%|█████████▎| 9281/10000 [2:04:29<08:03, 1.49it/s, loss=0.0129, lr=2.82e-06, step=9281] Training: 93%|█████████▎| 9282/10000 [2:04:30<08:28, 1.41it/s, loss=0.0129, lr=2.82e-06, step=9281] Training: 93%|█████████▎| 9282/10000 [2:04:30<08:28, 1.41it/s, loss=0.0067, lr=2.82e-06, step=9282] Training: 93%|█████████▎| 9283/10000 [2:04:31<07:55, 1.51it/s, loss=0.0067, lr=2.82e-06, step=9282] Training: 93%|█████████▎| 9283/10000 [2:04:31<07:55, 1.51it/s, loss=0.0070, lr=2.82e-06, step=9283] Training: 93%|█████████▎| 9284/10000 [2:04:31<07:15, 1.64it/s, loss=0.0070, lr=2.82e-06, step=9283] Training: 93%|█████████▎| 9284/10000 [2:04:31<07:15, 1.64it/s, loss=0.0060, lr=2.81e-06, step=9284] Training: 93%|█████████▎| 9285/10000 [2:04:32<07:24, 1.61it/s, loss=0.0060, lr=2.81e-06, step=9284] Training: 93%|█████████▎| 9285/10000 [2:04:32<07:24, 1.61it/s, loss=0.0205, lr=2.81e-06, step=9285] Training: 93%|█████████▎| 9286/10000 [2:04:33<07:44, 1.54it/s, loss=0.0205, lr=2.81e-06, step=9285] Training: 93%|█████████▎| 9286/10000 [2:04:33<07:44, 1.54it/s, loss=0.0039, lr=2.81e-06, step=9286] Training: 93%|█████████▎| 9287/10000 [2:04:33<07:29, 1.59it/s, loss=0.0039, lr=2.81e-06, step=9286] Training: 93%|█████████▎| 9287/10000 [2:04:33<07:29, 1.59it/s, loss=0.0030, lr=2.81e-06, step=9287] Training: 93%|█████████▎| 9288/10000 [2:04:34<07:30, 1.58it/s, loss=0.0030, lr=2.81e-06, step=9287] Training: 93%|█████████▎| 9288/10000 [2:04:34<07:30, 1.58it/s, loss=0.0019, lr=2.81e-06, step=9288] Training: 93%|█████████▎| 9289/10000 [2:04:34<07:01, 1.69it/s, loss=0.0019, lr=2.81e-06, step=9288] Training: 93%|█████████▎| 9289/10000 [2:04:34<07:01, 1.69it/s, loss=0.0045, lr=2.81e-06, step=9289]20:49:07.465 [I] step=9290 loss=0.0028 smoothed_loss=0.0069 lr=2.81e-06 grad_norm=0.3524 step_time=0.5226s data_time=0.1099s it/s=1.581 eta_to_10000=449.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0036 grad_action_out_proj_arms=0.0414 grad_arm_token_fuse=0.0185 grad_shared_expert=0.2138 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9290/10000 [2:04:35<07:35, 1.56it/s, loss=0.0045, lr=2.81e-06, step=9289] Training: 93%|█████████▎| 9290/10000 [2:04:35<07:35, 1.56it/s, loss=0.0028, lr=2.81e-06, step=9290] Training: 93%|█████████▎| 9291/10000 [2:04:36<07:03, 1.68it/s, loss=0.0028, lr=2.81e-06, step=9290] Training: 93%|█████████▎| 9291/10000 [2:04:36<07:03, 1.68it/s, loss=0.0094, lr=2.81e-06, step=9291] Training: 93%|█████████▎| 9292/10000 [2:04:36<06:39, 1.77it/s, loss=0.0094, lr=2.81e-06, step=9291] Training: 93%|█████████▎| 9292/10000 [2:04:36<06:39, 1.77it/s, loss=0.0133, lr=2.81e-06, step=9292] Training: 93%|█████████▎| 9293/10000 [2:04:37<07:14, 1.63it/s, loss=0.0133, lr=2.81e-06, step=9292] Training: 93%|█████████▎| 9293/10000 [2:04:37<07:14, 1.63it/s, loss=0.0028, lr=2.81e-06, step=9293] Training: 93%|█████████▎| 9294/10000 [2:04:37<06:56, 1.70it/s, loss=0.0028, lr=2.81e-06, step=9293] Training: 93%|█████████▎| 9294/10000 [2:04:37<06:56, 1.70it/s, loss=0.0103, lr=2.81e-06, step=9294] Training: 93%|█████████▎| 9295/10000 [2:04:38<07:11, 1.63it/s, loss=0.0103, lr=2.81e-06, step=9294] Training: 93%|█████████▎| 9295/10000 [2:04:38<07:11, 1.63it/s, loss=0.0047, lr=2.81e-06, step=9295] Training: 93%|█████████▎| 9296/10000 [2:04:39<06:51, 1.71it/s, loss=0.0047, lr=2.81e-06, step=9295] Training: 93%|█████████▎| 9296/10000 [2:04:39<06:51, 1.71it/s, loss=0.0032, lr=2.80e-06, step=9296] Training: 93%|█████████▎| 9297/10000 [2:04:39<06:28, 1.81it/s, loss=0.0032, lr=2.80e-06, step=9296] Training: 93%|█████████▎| 9297/10000 [2:04:39<06:28, 1.81it/s, loss=0.0023, lr=2.80e-06, step=9297] Training: 93%|█████████▎| 9298/10000 [2:04:40<06:24, 1.82it/s, loss=0.0023, lr=2.80e-06, step=9297] Training: 93%|█████████▎| 9298/10000 [2:04:40<06:24, 1.82it/s, loss=0.0877, lr=2.80e-06, step=9298] Training: 93%|█████████▎| 9299/10000 [2:04:40<06:21, 1.84it/s, loss=0.0877, lr=2.80e-06, step=9298] Training: 93%|█████████▎| 9299/10000 [2:04:40<06:21, 1.84it/s, loss=0.0059, lr=2.80e-06, step=9299]20:49:13.164 [I] step=9300 loss=0.0082 smoothed_loss=0.0131 lr=2.80e-06 grad_norm=0.4153 step_time=0.5028s data_time=0.0671s it/s=1.755 eta_to_10000=398.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0062 grad_action_out_proj_arms=0.0780 grad_arm_token_fuse=0.0318 grad_shared_expert=0.2874 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9300/10000 [2:04:41<06:57, 1.68it/s, loss=0.0059, lr=2.80e-06, step=9299] Training: 93%|█████████▎| 9300/10000 [2:04:41<06:57, 1.68it/s, loss=0.0082, lr=2.80e-06, step=9300] Training: 93%|█████████▎| 9301/10000 [2:04:42<07:23, 1.58it/s, loss=0.0082, lr=2.80e-06, step=9300] Training: 93%|█████████▎| 9301/10000 [2:04:42<07:23, 1.58it/s, loss=0.0105, lr=2.80e-06, step=9301] Training: 93%|█████████▎| 9302/10000 [2:04:42<07:22, 1.58it/s, loss=0.0105, lr=2.80e-06, step=9301] Training: 93%|█████████▎| 9302/10000 [2:04:42<07:22, 1.58it/s, loss=0.0099, lr=2.80e-06, step=9302] Training: 93%|█████████▎| 9303/10000 [2:04:43<06:57, 1.67it/s, loss=0.0099, lr=2.80e-06, step=9302] Training: 93%|█████████▎| 9303/10000 [2:04:43<06:57, 1.67it/s, loss=0.0041, lr=2.80e-06, step=9303] Training: 93%|█████████▎| 9304/10000 [2:04:43<07:23, 1.57it/s, loss=0.0041, lr=2.80e-06, step=9303] Training: 93%|█████████▎| 9304/10000 [2:04:43<07:23, 1.57it/s, loss=0.0229, lr=2.80e-06, step=9304] Training: 93%|█████████▎| 9305/10000 [2:04:44<08:01, 1.44it/s, loss=0.0229, lr=2.80e-06, step=9304] Training: 93%|█████████▎| 9305/10000 [2:04:44<08:01, 1.44it/s, loss=0.0101, lr=2.80e-06, step=9305] Training: 93%|█████████▎| 9306/10000 [2:04:45<07:58, 1.45it/s, loss=0.0101, lr=2.80e-06, step=9305] Training: 93%|█████████▎| 9306/10000 [2:04:45<07:58, 1.45it/s, loss=0.0050, lr=2.80e-06, step=9306] Training: 93%|█████████▎| 9307/10000 [2:04:46<08:16, 1.40it/s, loss=0.0050, lr=2.80e-06, step=9306] Training: 93%|█████████▎| 9307/10000 [2:04:46<08:16, 1.40it/s, loss=0.0014, lr=2.79e-06, step=9307] Training: 93%|█████████▎| 9308/10000 [2:04:47<08:45, 1.32it/s, loss=0.0014, lr=2.79e-06, step=9307] Training: 93%|█████████▎| 9308/10000 [2:04:47<08:45, 1.32it/s, loss=0.0052, lr=2.79e-06, step=9308] Training: 93%|█████████▎| 9309/10000 [2:04:47<08:28, 1.36it/s, loss=0.0052, lr=2.79e-06, step=9308] Training: 93%|█████████▎| 9309/10000 [2:04:47<08:28, 1.36it/s, loss=0.0059, lr=2.79e-06, step=9309]20:49:20.254 [I] step=9310 loss=0.0072 smoothed_loss=0.0095 lr=2.80e-06 grad_norm=0.4813 step_time=0.5649s data_time=0.1442s it/s=1.411 eta_to_10000=489.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0222 grad_action_out_proj_arms=0.1556 grad_arm_token_fuse=0.1264 grad_shared_expert=0.4084 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9310/10000 [2:04:48<08:13, 1.40it/s, loss=0.0059, lr=2.79e-06, step=9309] Training: 93%|█████████▎| 9310/10000 [2:04:48<08:13, 1.40it/s, loss=0.0072, lr=2.79e-06, step=9310] Training: 93%|█████████▎| 9311/10000 [2:04:48<07:23, 1.55it/s, loss=0.0072, lr=2.79e-06, step=9310] Training: 93%|█████████▎| 9311/10000 [2:04:48<07:23, 1.55it/s, loss=0.0006, lr=2.79e-06, step=9311] Training: 93%|█████████▎| 9312/10000 [2:04:49<07:48, 1.47it/s, loss=0.0006, lr=2.79e-06, step=9311] Training: 93%|█████████▎| 9312/10000 [2:04:49<07:48, 1.47it/s, loss=0.0088, lr=2.79e-06, step=9312] Training: 93%|█████████▎| 9313/10000 [2:04:50<07:04, 1.62it/s, loss=0.0088, lr=2.79e-06, step=9312] Training: 93%|█████████▎| 9313/10000 [2:04:50<07:04, 1.62it/s, loss=0.0077, lr=2.79e-06, step=9313] Training: 93%|█████████▎| 9314/10000 [2:04:50<07:16, 1.57it/s, loss=0.0077, lr=2.79e-06, step=9313] Training: 93%|█████████▎| 9314/10000 [2:04:50<07:16, 1.57it/s, loss=0.0122, lr=2.79e-06, step=9314] Training: 93%|█████████▎| 9315/10000 [2:04:51<07:25, 1.54it/s, loss=0.0122, lr=2.79e-06, step=9314] Training: 93%|█████████▎| 9315/10000 [2:04:51<07:25, 1.54it/s, loss=0.0132, lr=2.79e-06, step=9315] Training: 93%|█████████▎| 9316/10000 [2:04:52<07:21, 1.55it/s, loss=0.0132, lr=2.79e-06, step=9315] Training: 93%|█████████▎| 9316/10000 [2:04:52<07:21, 1.55it/s, loss=0.0268, lr=2.79e-06, step=9316] Training: 93%|█████████▎| 9317/10000 [2:04:52<06:56, 1.64it/s, loss=0.0268, lr=2.79e-06, step=9316] Training: 93%|█████████▎| 9317/10000 [2:04:52<06:56, 1.64it/s, loss=0.0437, lr=2.79e-06, step=9317] Training: 93%|█████████▎| 9318/10000 [2:04:53<06:45, 1.68it/s, loss=0.0437, lr=2.79e-06, step=9317] Training: 93%|█████████▎| 9318/10000 [2:04:53<06:45, 1.68it/s, loss=0.0013, lr=2.79e-06, step=9318] Training: 93%|█████████▎| 9319/10000 [2:04:53<06:22, 1.78it/s, loss=0.0013, lr=2.79e-06, step=9318] Training: 93%|█████████▎| 9319/10000 [2:04:53<06:22, 1.78it/s, loss=0.0052, lr=2.78e-06, step=9319]20:49:26.093 [I] step=9320 loss=0.0034 smoothed_loss=0.0114 lr=2.79e-06 grad_norm=0.5052 step_time=0.5061s data_time=0.0778s it/s=1.713 eta_to_10000=397.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0245 grad_action_out_proj_arms=0.1907 grad_arm_token_fuse=0.1287 grad_shared_expert=0.5633 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9320/10000 [2:04:54<06:20, 1.79it/s, loss=0.0052, lr=2.78e-06, step=9319] Training: 93%|█████████▎| 9320/10000 [2:04:54<06:20, 1.79it/s, loss=0.0034, lr=2.78e-06, step=9320] Training: 93%|█████████▎| 9321/10000 [2:04:55<07:36, 1.49it/s, loss=0.0034, lr=2.78e-06, step=9320] Training: 93%|█████████▎| 9321/10000 [2:04:55<07:36, 1.49it/s, loss=0.0045, lr=2.78e-06, step=9321] Training: 93%|█████████▎| 9322/10000 [2:04:56<08:20, 1.35it/s, loss=0.0045, lr=2.78e-06, step=9321] Training: 93%|█████████▎| 9322/10000 [2:04:56<08:20, 1.35it/s, loss=0.0051, lr=2.78e-06, step=9322] Training: 93%|█████████▎| 9323/10000 [2:04:56<08:08, 1.39it/s, loss=0.0051, lr=2.78e-06, step=9322] Training: 93%|█████████▎| 9323/10000 [2:04:56<08:08, 1.39it/s, loss=0.0416, lr=2.78e-06, step=9323] Training: 93%|█████████▎| 9324/10000 [2:04:57<08:29, 1.33it/s, loss=0.0416, lr=2.78e-06, step=9323] Training: 93%|█████████▎| 9324/10000 [2:04:57<08:29, 1.33it/s, loss=0.0190, lr=2.78e-06, step=9324] Training: 93%|█████████▎| 9325/10000 [2:04:58<08:19, 1.35it/s, loss=0.0190, lr=2.78e-06, step=9324] Training: 93%|█████████▎| 9325/10000 [2:04:58<08:19, 1.35it/s, loss=0.0029, lr=2.78e-06, step=9325] Training: 93%|█████████▎| 9326/10000 [2:04:59<08:36, 1.31it/s, loss=0.0029, lr=2.78e-06, step=9325] Training: 93%|█████████▎| 9326/10000 [2:04:59<08:36, 1.31it/s, loss=0.0070, lr=2.78e-06, step=9326] Training: 93%|█████████▎| 9327/10000 [2:04:59<08:44, 1.28it/s, loss=0.0070, lr=2.78e-06, step=9326] Training: 93%|█████████▎| 9327/10000 [2:04:59<08:44, 1.28it/s, loss=0.0088, lr=2.78e-06, step=9327] Training: 93%|█████████▎| 9328/10000 [2:05:00<09:28, 1.18it/s, loss=0.0088, lr=2.78e-06, step=9327] Training: 93%|█████████▎| 9328/10000 [2:05:00<09:28, 1.18it/s, loss=0.0039, lr=2.78e-06, step=9328] Training: 93%|█████████▎| 9329/10000 [2:05:01<08:52, 1.26it/s, loss=0.0039, lr=2.78e-06, step=9328] Training: 93%|█████████▎| 9329/10000 [2:05:01<08:52, 1.26it/s, loss=0.0385, lr=2.78e-06, step=9329]20:49:34.168 [I] step=9330 loss=0.0096 smoothed_loss=0.0134 lr=2.78e-06 grad_norm=0.4352 step_time=0.6530s data_time=0.1545s it/s=1.238 eta_to_10000=541.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0292 grad_action_out_proj_arms=0.1575 grad_arm_token_fuse=0.1561 grad_shared_expert=0.5135 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9330/10000 [2:05:02<08:37, 1.30it/s, loss=0.0385, lr=2.78e-06, step=9329] Training: 93%|█████████▎| 9330/10000 [2:05:02<08:37, 1.30it/s, loss=0.0096, lr=2.78e-06, step=9330] Training: 93%|█████████▎| 9331/10000 [2:05:03<08:22, 1.33it/s, loss=0.0096, lr=2.78e-06, step=9330] Training: 93%|█████████▎| 9331/10000 [2:05:03<08:22, 1.33it/s, loss=0.0024, lr=2.78e-06, step=9331] Training: 93%|█████████▎| 9332/10000 [2:05:03<08:01, 1.39it/s, loss=0.0024, lr=2.78e-06, step=9331] Training: 93%|█████████▎| 9332/10000 [2:05:03<08:01, 1.39it/s, loss=0.0027, lr=2.77e-06, step=9332] Training: 93%|█████████▎| 9333/10000 [2:05:04<07:17, 1.52it/s, loss=0.0027, lr=2.77e-06, step=9332] Training: 93%|█████████▎| 9333/10000 [2:05:04<07:17, 1.52it/s, loss=0.0060, lr=2.77e-06, step=9333] Training: 93%|█████████▎| 9334/10000 [2:05:04<07:28, 1.49it/s, loss=0.0060, lr=2.77e-06, step=9333] Training: 93%|█████████▎| 9334/10000 [2:05:04<07:28, 1.49it/s, loss=0.0062, lr=2.77e-06, step=9334] Training: 93%|█████████▎| 9335/10000 [2:05:05<07:36, 1.46it/s, loss=0.0062, lr=2.77e-06, step=9334] Training: 93%|█████████▎| 9335/10000 [2:05:05<07:36, 1.46it/s, loss=0.0015, lr=2.77e-06, step=9335] Training: 93%|█████████▎| 9336/10000 [2:05:06<08:42, 1.27it/s, loss=0.0015, lr=2.77e-06, step=9335] Training: 93%|█████████▎| 9336/10000 [2:05:06<08:42, 1.27it/s, loss=0.0075, lr=2.77e-06, step=9336] Training: 93%|█████████▎| 9337/10000 [2:05:07<09:33, 1.16it/s, loss=0.0075, lr=2.77e-06, step=9336] Training: 93%|█████████▎| 9337/10000 [2:05:07<09:33, 1.16it/s, loss=0.0167, lr=2.77e-06, step=9337] Training: 93%|█████████▎| 9338/10000 [2:05:08<10:24, 1.06it/s, loss=0.0167, lr=2.77e-06, step=9337] Training: 93%|█████████▎| 9338/10000 [2:05:08<10:24, 1.06it/s, loss=0.0136, lr=2.77e-06, step=9338] Training: 93%|█████████▎| 9339/10000 [2:05:09<10:12, 1.08it/s, loss=0.0136, lr=2.77e-06, step=9338] Training: 93%|█████████▎| 9339/10000 [2:05:09<10:12, 1.08it/s, loss=0.0037, lr=2.77e-06, step=9339]20:49:42.642 [I] step=9340 loss=0.0042 smoothed_loss=0.0091 lr=2.77e-06 grad_norm=0.4074 step_time=0.6193s data_time=0.2281s it/s=1.180 eta_to_10000=559.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0055 grad_action_out_proj_arms=0.0589 grad_arm_token_fuse=0.0276 grad_shared_expert=0.3097 (18633:train_pytorch.py:850) + Training: 93%|█████████▎| 9340/10000 [2:05:10<10:48, 1.02it/s, loss=0.0037, lr=2.77e-06, step=9339] Training: 93%|█████████▎| 9340/10000 [2:05:10<10:48, 1.02it/s, loss=0.0042, lr=2.77e-06, step=9340] Training: 93%|█████████▎| 9341/10000 [2:05:11<09:35, 1.14it/s, loss=0.0042, lr=2.77e-06, step=9340] Training: 93%|█████████▎| 9341/10000 [2:05:11<09:35, 1.14it/s, loss=0.0047, lr=2.77e-06, step=9341] Training: 93%|█████████▎| 9342/10000 [2:05:12<10:20, 1.06it/s, loss=0.0047, lr=2.77e-06, step=9341] Training: 93%|█████████▎| 9342/10000 [2:05:12<10:20, 1.06it/s, loss=0.0163, lr=2.77e-06, step=9342] Training: 93%|█████████▎| 9343/10000 [2:05:14<14:09, 1.29s/it, loss=0.0163, lr=2.77e-06, step=9342] Training: 93%|█████████▎| 9343/10000 [2:05:14<14:09, 1.29s/it, loss=0.0040, lr=2.77e-06, step=9343] Training: 93%|█████████▎| 9344/10000 [2:05:15<11:41, 1.07s/it, loss=0.0040, lr=2.77e-06, step=9343] Training: 93%|█████████▎| 9344/10000 [2:05:15<11:41, 1.07s/it, loss=0.0061, lr=2.76e-06, step=9344] Training: 93%|█████████▎| 9345/10000 [2:05:16<11:26, 1.05s/it, loss=0.0061, lr=2.76e-06, step=9344] Training: 93%|█████████▎| 9345/10000 [2:05:16<11:26, 1.05s/it, loss=0.0068, lr=2.76e-06, step=9345] Training: 93%|█████████▎| 9346/10000 [2:05:17<10:51, 1.00it/s, loss=0.0068, lr=2.76e-06, step=9345] Training: 93%|█████████▎| 9346/10000 [2:05:17<10:51, 1.00it/s, loss=0.0061, lr=2.76e-06, step=9346] Training: 93%|█████████▎| 9347/10000 [2:05:17<10:26, 1.04it/s, loss=0.0061, lr=2.76e-06, step=9346] Training: 93%|█████████▎| 9347/10000 [2:05:17<10:26, 1.04it/s, loss=0.0038, lr=2.76e-06, step=9347] Training: 93%|█████████▎| 9348/10000 [2:05:18<09:03, 1.20it/s, loss=0.0038, lr=2.76e-06, step=9347] Training: 93%|█████████▎| 9348/10000 [2:05:18<09:03, 1.20it/s, loss=0.0090, lr=2.76e-06, step=9348] Training: 93%|█████████▎| 9349/10000 [2:05:19<08:48, 1.23it/s, loss=0.0090, lr=2.76e-06, step=9348] Training: 93%|█████████▎| 9349/10000 [2:05:19<08:48, 1.23it/s, loss=0.0013, lr=2.76e-06, step=9349]20:49:52.236 [I] step=9350 loss=0.0107 smoothed_loss=0.0076 lr=2.76e-06 grad_norm=0.3512 step_time=0.6679s data_time=0.2915s it/s=1.044 eta_to_10000=622.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0089 grad_action_out_proj_arms=0.0800 grad_arm_token_fuse=0.0477 grad_shared_expert=0.4681 (18633:train_pytorch.py:850) + Training: 94%|█████████▎| 9350/10000 [2:05:20<09:54, 1.09it/s, loss=0.0013, lr=2.76e-06, step=9349] Training: 94%|█████████▎| 9350/10000 [2:05:20<09:54, 1.09it/s, loss=0.0107, lr=2.76e-06, step=9350] Training: 94%|█████████▎| 9351/10000 [2:05:21<09:44, 1.11it/s, loss=0.0107, lr=2.76e-06, step=9350] Training: 94%|█████████▎| 9351/10000 [2:05:21<09:44, 1.11it/s, loss=0.0017, lr=2.76e-06, step=9351] Training: 94%|█████████▎| 9352/10000 [2:05:22<09:13, 1.17it/s, loss=0.0017, lr=2.76e-06, step=9351] Training: 94%|█████████▎| 9352/10000 [2:05:22<09:13, 1.17it/s, loss=0.0018, lr=2.76e-06, step=9352] Training: 94%|█████████▎| 9353/10000 [2:05:22<08:35, 1.25it/s, loss=0.0018, lr=2.76e-06, step=9352] Training: 94%|█████████▎| 9353/10000 [2:05:22<08:35, 1.25it/s, loss=0.0044, lr=2.76e-06, step=9353] Training: 94%|█████████▎| 9354/10000 [2:05:23<08:24, 1.28it/s, loss=0.0044, lr=2.76e-06, step=9353] Training: 94%|█████████▎| 9354/10000 [2:05:23<08:24, 1.28it/s, loss=0.0069, lr=2.76e-06, step=9354] Training: 94%|█████████▎| 9355/10000 [2:05:24<09:59, 1.08it/s, loss=0.0069, lr=2.76e-06, step=9354] Training: 94%|█████████▎| 9355/10000 [2:05:24<09:59, 1.08it/s, loss=0.0193, lr=2.76e-06, step=9355] Training: 94%|█████████▎| 9356/10000 [2:05:26<11:56, 1.11s/it, loss=0.0193, lr=2.76e-06, step=9355] Training: 94%|█████████▎| 9356/10000 [2:05:26<11:56, 1.11s/it, loss=0.0093, lr=2.75e-06, step=9356] Training: 94%|█████████▎| 9357/10000 [2:05:27<12:58, 1.21s/it, loss=0.0093, lr=2.75e-06, step=9356] Training: 94%|█████████▎| 9357/10000 [2:05:27<12:58, 1.21s/it, loss=0.0081, lr=2.75e-06, step=9357] Training: 94%|█████████▎| 9358/10000 [2:05:28<12:23, 1.16s/it, loss=0.0081, lr=2.75e-06, step=9357] Training: 94%|█████████▎| 9358/10000 [2:05:28<12:23, 1.16s/it, loss=0.0028, lr=2.75e-06, step=9358] Training: 94%|█████████▎| 9359/10000 [2:05:29<10:24, 1.03it/s, loss=0.0028, lr=2.75e-06, step=9358] Training: 94%|█████████▎| 9359/10000 [2:05:29<10:24, 1.03it/s, loss=0.0132, lr=2.75e-06, step=9359]20:50:01.674 [I] step=9360 loss=0.0073 smoothed_loss=0.0078 lr=2.76e-06 grad_norm=0.4062 step_time=0.6462s data_time=0.2976s it/s=1.060 eta_to_10000=604.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0097 grad_action_out_proj_arms=0.0784 grad_arm_token_fuse=0.0509 grad_shared_expert=0.3262 (18633:train_pytorch.py:850) + Training: 94%|█████████▎| 9360/10000 [2:05:29<09:10, 1.16it/s, loss=0.0132, lr=2.75e-06, step=9359] Training: 94%|█████████▎| 9360/10000 [2:05:29<09:10, 1.16it/s, loss=0.0073, lr=2.75e-06, step=9360] Training: 94%|█████████▎| 9361/10000 [2:05:31<10:50, 1.02s/it, loss=0.0073, lr=2.75e-06, step=9360] Training: 94%|█████████▎| 9361/10000 [2:05:31<10:50, 1.02s/it, loss=0.0257, lr=2.75e-06, step=9361] Training: 94%|█████████▎| 9362/10000 [2:05:32<10:45, 1.01s/it, loss=0.0257, lr=2.75e-06, step=9361] Training: 94%|█████████▎| 9362/10000 [2:05:32<10:45, 1.01s/it, loss=0.0127, lr=2.75e-06, step=9362] Training: 94%|█████████▎| 9363/10000 [2:05:33<10:44, 1.01s/it, loss=0.0127, lr=2.75e-06, step=9362] Training: 94%|█████████▎| 9363/10000 [2:05:33<10:44, 1.01s/it, loss=0.0047, lr=2.75e-06, step=9363] Training: 94%|█████████▎| 9364/10000 [2:05:34<10:07, 1.05it/s, loss=0.0047, lr=2.75e-06, step=9363] Training: 94%|█████████▎| 9364/10000 [2:05:34<10:07, 1.05it/s, loss=0.0025, lr=2.75e-06, step=9364] Training: 94%|█████████▎| 9365/10000 [2:05:35<10:41, 1.01s/it, loss=0.0025, lr=2.75e-06, step=9364] Training: 94%|█████████▎| 9365/10000 [2:05:35<10:41, 1.01s/it, loss=0.0013, lr=2.75e-06, step=9365] Training: 94%|█████████▎| 9366/10000 [2:05:35<09:57, 1.06it/s, loss=0.0013, lr=2.75e-06, step=9365] Training: 94%|█████████▎| 9366/10000 [2:05:36<09:57, 1.06it/s, loss=0.0137, lr=2.75e-06, step=9366] Training: 94%|█████████▎| 9367/10000 [2:05:36<08:46, 1.20it/s, loss=0.0137, lr=2.75e-06, step=9366] Training: 94%|█████████▎| 9367/10000 [2:05:36<08:46, 1.20it/s, loss=0.0059, lr=2.75e-06, step=9367] Training: 94%|█████████▎| 9368/10000 [2:05:37<08:17, 1.27it/s, loss=0.0059, lr=2.75e-06, step=9367] Training: 94%|█████████▎| 9368/10000 [2:05:37<08:17, 1.27it/s, loss=0.0081, lr=2.75e-06, step=9368] Training: 94%|█████████▎| 9369/10000 [2:05:38<09:41, 1.09it/s, loss=0.0081, lr=2.75e-06, step=9368] Training: 94%|█████████▎| 9369/10000 [2:05:38<09:41, 1.09it/s, loss=0.0008, lr=2.74e-06, step=9369]20:50:11.809 [I] step=9370 loss=0.0054 smoothed_loss=0.0073 lr=2.75e-06 grad_norm=0.3689 step_time=0.7135s data_time=0.3001s it/s=0.988 eta_to_10000=637.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0041 grad_action_out_proj_arms=0.0506 grad_arm_token_fuse=0.0215 grad_shared_expert=0.1985 (18633:train_pytorch.py:850) + Training: 94%|█████████▎| 9370/10000 [2:05:39<11:27, 1.09s/it, loss=0.0008, lr=2.74e-06, step=9369] Training: 94%|█████████▎| 9370/10000 [2:05:39<11:27, 1.09s/it, loss=0.0054, lr=2.74e-06, step=9370] Training: 94%|█████████▎| 9371/10000 [2:05:40<10:32, 1.01s/it, loss=0.0054, lr=2.74e-06, step=9370] Training: 94%|█████████▎| 9371/10000 [2:05:40<10:32, 1.01s/it, loss=0.0013, lr=2.74e-06, step=9371] Training: 94%|█████████▎| 9372/10000 [2:05:41<11:04, 1.06s/it, loss=0.0013, lr=2.74e-06, step=9371] Training: 94%|█████████▎| 9372/10000 [2:05:41<11:04, 1.06s/it, loss=0.0278, lr=2.74e-06, step=9372] Training: 94%|█████████▎| 9373/10000 [2:05:42<10:08, 1.03it/s, loss=0.0278, lr=2.74e-06, step=9372] Training: 94%|█████████▎| 9373/10000 [2:05:42<10:08, 1.03it/s, loss=0.0032, lr=2.74e-06, step=9373] Training: 94%|█████████▎| 9374/10000 [2:05:43<09:13, 1.13it/s, loss=0.0032, lr=2.74e-06, step=9373] Training: 94%|█████████▎| 9374/10000 [2:05:43<09:13, 1.13it/s, loss=0.0101, lr=2.74e-06, step=9374] Training: 94%|█████████▍| 9375/10000 [2:05:44<08:57, 1.16it/s, loss=0.0101, lr=2.74e-06, step=9374] Training: 94%|█████████▍| 9375/10000 [2:05:44<08:57, 1.16it/s, loss=0.0060, lr=2.74e-06, step=9375] Training: 94%|█████████▍| 9376/10000 [2:05:45<09:09, 1.14it/s, loss=0.0060, lr=2.74e-06, step=9375] Training: 94%|█████████▍| 9376/10000 [2:05:45<09:09, 1.14it/s, loss=0.0031, lr=2.74e-06, step=9376] Training: 94%|█████████▍| 9377/10000 [2:05:46<09:24, 1.10it/s, loss=0.0031, lr=2.74e-06, step=9376] Training: 94%|█████████▍| 9377/10000 [2:05:46<09:24, 1.10it/s, loss=0.0027, lr=2.74e-06, step=9377] Training: 94%|█████████▍| 9378/10000 [2:05:47<09:59, 1.04it/s, loss=0.0027, lr=2.74e-06, step=9377] Training: 94%|█████████▍| 9378/10000 [2:05:47<09:59, 1.04it/s, loss=0.0011, lr=2.74e-06, step=9378] Training: 94%|█████████▍| 9379/10000 [2:05:48<09:39, 1.07it/s, loss=0.0011, lr=2.74e-06, step=9378] Training: 94%|█████████▍| 9379/10000 [2:05:48<09:39, 1.07it/s, loss=0.0017, lr=2.74e-06, step=9379]20:50:20.829 [I] step=9380 loss=0.0017 smoothed_loss=0.0057 lr=2.74e-06 grad_norm=0.4152 step_time=0.6749s data_time=0.2270s it/s=1.109 eta_to_10000=559.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0035 grad_action_out_proj_arms=0.0470 grad_arm_token_fuse=0.0159 grad_shared_expert=0.1556 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9380/10000 [2:05:49<09:39, 1.07it/s, loss=0.0017, lr=2.74e-06, step=9379] Training: 94%|█████████▍| 9380/10000 [2:05:49<09:39, 1.07it/s, loss=0.0017, lr=2.74e-06, step=9380] Training: 94%|█████████▍| 9381/10000 [2:05:49<08:49, 1.17it/s, loss=0.0017, lr=2.74e-06, step=9380] Training: 94%|█████████▍| 9381/10000 [2:05:49<08:49, 1.17it/s, loss=0.0062, lr=2.74e-06, step=9381] Training: 94%|█████████▍| 9382/10000 [2:05:50<09:10, 1.12it/s, loss=0.0062, lr=2.74e-06, step=9381] Training: 94%|█████████▍| 9382/10000 [2:05:50<09:10, 1.12it/s, loss=0.0048, lr=2.73e-06, step=9382] Training: 94%|█████████▍| 9383/10000 [2:05:52<10:59, 1.07s/it, loss=0.0048, lr=2.73e-06, step=9382] Training: 94%|█████████▍| 9383/10000 [2:05:52<10:59, 1.07s/it, loss=0.0102, lr=2.73e-06, step=9383] Training: 94%|█████████▍| 9384/10000 [2:05:52<09:13, 1.11it/s, loss=0.0102, lr=2.73e-06, step=9383] Training: 94%|█████████▍| 9384/10000 [2:05:52<09:13, 1.11it/s, loss=0.0137, lr=2.73e-06, step=9384] Training: 94%|█████████▍| 9385/10000 [2:05:53<08:34, 1.19it/s, loss=0.0137, lr=2.73e-06, step=9384] Training: 94%|█████████▍| 9385/10000 [2:05:53<08:34, 1.19it/s, loss=0.0037, lr=2.73e-06, step=9385] Training: 94%|█████████▍| 9386/10000 [2:05:54<08:47, 1.16it/s, loss=0.0037, lr=2.73e-06, step=9385] Training: 94%|█████████▍| 9386/10000 [2:05:54<08:47, 1.16it/s, loss=0.0032, lr=2.73e-06, step=9386] Training: 94%|█████████▍| 9387/10000 [2:05:55<08:53, 1.15it/s, loss=0.0032, lr=2.73e-06, step=9386] Training: 94%|█████████▍| 9387/10000 [2:05:55<08:53, 1.15it/s, loss=0.0111, lr=2.73e-06, step=9387] Training: 94%|█████████▍| 9388/10000 [2:05:55<07:51, 1.30it/s, loss=0.0111, lr=2.73e-06, step=9387] Training: 94%|█████████▍| 9388/10000 [2:05:55<07:51, 1.30it/s, loss=0.0101, lr=2.73e-06, step=9388] Training: 94%|█████████▍| 9389/10000 [2:05:56<08:37, 1.18it/s, loss=0.0101, lr=2.73e-06, step=9388] Training: 94%|█████████▍| 9389/10000 [2:05:56<08:37, 1.18it/s, loss=0.0018, lr=2.73e-06, step=9389]20:50:30.216 [I] step=9390 loss=0.0191 smoothed_loss=0.0078 lr=2.73e-06 grad_norm=0.5139 step_time=0.5566s data_time=0.3821s it/s=1.067 eta_to_10000=571.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0197 grad_action_out_proj_arms=0.1280 grad_arm_token_fuse=0.1040 grad_shared_expert=0.4927 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9390/10000 [2:05:58<11:15, 1.11s/it, loss=0.0018, lr=2.73e-06, step=9389] Training: 94%|█████████▍| 9390/10000 [2:05:58<11:15, 1.11s/it, loss=0.0191, lr=2.73e-06, step=9390] Training: 94%|█████████▍| 9391/10000 [2:05:59<10:19, 1.02s/it, loss=0.0191, lr=2.73e-06, step=9390] Training: 94%|█████████▍| 9391/10000 [2:05:59<10:19, 1.02s/it, loss=0.0028, lr=2.73e-06, step=9391] Training: 94%|█████████▍| 9392/10000 [2:05:59<09:19, 1.09it/s, loss=0.0028, lr=2.73e-06, step=9391] Training: 94%|█████████▍| 9392/10000 [2:05:59<09:19, 1.09it/s, loss=0.0100, lr=2.73e-06, step=9392] Training: 94%|█████████▍| 9393/10000 [2:06:01<10:57, 1.08s/it, loss=0.0100, lr=2.73e-06, step=9392] Training: 94%|█████████▍| 9393/10000 [2:06:01<10:57, 1.08s/it, loss=0.0207, lr=2.73e-06, step=9393] Training: 94%|█████████▍| 9394/10000 [2:06:02<09:51, 1.02it/s, loss=0.0207, lr=2.73e-06, step=9393] Training: 94%|█████████▍| 9394/10000 [2:06:02<09:51, 1.02it/s, loss=0.0063, lr=2.73e-06, step=9394] Training: 94%|█████████▍| 9395/10000 [2:06:02<09:21, 1.08it/s, loss=0.0063, lr=2.73e-06, step=9394] Training: 94%|█████████▍| 9395/10000 [2:06:02<09:21, 1.08it/s, loss=0.0010, lr=2.73e-06, step=9395] Training: 94%|█████████▍| 9396/10000 [2:06:03<08:52, 1.14it/s, loss=0.0010, lr=2.73e-06, step=9395] Training: 94%|█████████▍| 9396/10000 [2:06:03<08:52, 1.14it/s, loss=0.0079, lr=2.72e-06, step=9396] Training: 94%|█████████▍| 9397/10000 [2:06:04<09:06, 1.10it/s, loss=0.0079, lr=2.72e-06, step=9396] Training: 94%|█████████▍| 9397/10000 [2:06:04<09:06, 1.10it/s, loss=0.0395, lr=2.72e-06, step=9397] Training: 94%|█████████▍| 9398/10000 [2:06:05<08:31, 1.18it/s, loss=0.0395, lr=2.72e-06, step=9397] Training: 94%|█████████▍| 9398/10000 [2:06:05<08:31, 1.18it/s, loss=0.0269, lr=2.72e-06, step=9398] Training: 94%|█████████▍| 9399/10000 [2:06:06<08:53, 1.13it/s, loss=0.0269, lr=2.72e-06, step=9398] Training: 94%|█████████▍| 9399/10000 [2:06:06<08:53, 1.13it/s, loss=0.0053, lr=2.72e-06, step=9399]20:50:39.588 [I] step=9400 loss=0.0057 smoothed_loss=0.0113 lr=2.72e-06 grad_norm=0.3825 step_time=0.6667s data_time=0.2705s it/s=1.071 eta_to_10000=560.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0062 grad_action_out_proj_arms=0.0718 grad_arm_token_fuse=0.0302 grad_shared_expert=0.3045 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9400/10000 [2:06:07<10:28, 1.05s/it, loss=0.0053, lr=2.72e-06, step=9399] Training: 94%|█████████▍| 9400/10000 [2:06:07<10:28, 1.05s/it, loss=0.0057, lr=2.72e-06, step=9400] Training: 94%|█████████▍| 9401/10000 [2:06:08<09:56, 1.00it/s, loss=0.0057, lr=2.72e-06, step=9400] Training: 94%|█████████▍| 9401/10000 [2:06:08<09:56, 1.00it/s, loss=0.0027, lr=2.72e-06, step=9401] Training: 94%|█████████▍| 9402/10000 [2:06:09<09:39, 1.03it/s, loss=0.0027, lr=2.72e-06, step=9401] Training: 94%|█████████▍| 9402/10000 [2:06:09<09:39, 1.03it/s, loss=0.0064, lr=2.72e-06, step=9402] Training: 94%|█████████▍| 9403/10000 [2:06:11<12:36, 1.27s/it, loss=0.0064, lr=2.72e-06, step=9402] Training: 94%|█████████▍| 9403/10000 [2:06:11<12:36, 1.27s/it, loss=0.0079, lr=2.72e-06, step=9403] Training: 94%|█████████▍| 9404/10000 [2:06:12<11:03, 1.11s/it, loss=0.0079, lr=2.72e-06, step=9403] Training: 94%|█████████▍| 9404/10000 [2:06:12<11:03, 1.11s/it, loss=0.0024, lr=2.72e-06, step=9404] Training: 94%|█████████▍| 9405/10000 [2:06:14<13:05, 1.32s/it, loss=0.0024, lr=2.72e-06, step=9404] Training: 94%|█████████▍| 9405/10000 [2:06:14<13:05, 1.32s/it, loss=0.0111, lr=2.72e-06, step=9405] Training: 94%|█████████▍| 9406/10000 [2:06:15<13:56, 1.41s/it, loss=0.0111, lr=2.72e-06, step=9405] Training: 94%|█████████▍| 9406/10000 [2:06:15<13:56, 1.41s/it, loss=0.0052, lr=2.72e-06, step=9406] Training: 94%|█████████▍| 9407/10000 [2:06:16<12:12, 1.23s/it, loss=0.0052, lr=2.72e-06, step=9406] Training: 94%|█████████▍| 9407/10000 [2:06:16<12:12, 1.23s/it, loss=0.0107, lr=2.72e-06, step=9407] Training: 94%|█████████▍| 9408/10000 [2:06:17<10:12, 1.03s/it, loss=0.0107, lr=2.72e-06, step=9407] Training: 94%|█████████▍| 9408/10000 [2:06:17<10:12, 1.03s/it, loss=0.0057, lr=2.72e-06, step=9408] Training: 94%|█████████▍| 9409/10000 [2:06:18<10:40, 1.08s/it, loss=0.0057, lr=2.72e-06, step=9408] Training: 94%|█████████▍| 9409/10000 [2:06:18<10:40, 1.08s/it, loss=0.0018, lr=2.71e-06, step=9409]20:50:51.735 [I] step=9410 loss=0.0126 smoothed_loss=0.0085 lr=2.72e-06 grad_norm=0.3883 step_time=0.7344s data_time=0.4803s it/s=0.824 eta_to_10000=715.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0247 grad_action_out_proj_arms=0.1466 grad_arm_token_fuse=0.1244 grad_shared_expert=0.5192 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9410/10000 [2:06:19<12:17, 1.25s/it, loss=0.0018, lr=2.71e-06, step=9409] Training: 94%|█████████▍| 9410/10000 [2:06:19<12:17, 1.25s/it, loss=0.0126, lr=2.71e-06, step=9410] Training: 94%|█████████▍| 9411/10000 [2:06:20<11:42, 1.19s/it, loss=0.0126, lr=2.71e-06, step=9410] Training: 94%|█████████▍| 9411/10000 [2:06:20<11:42, 1.19s/it, loss=0.0041, lr=2.71e-06, step=9411] Training: 94%|█████████▍| 9412/10000 [2:06:22<11:12, 1.14s/it, loss=0.0041, lr=2.71e-06, step=9411] Training: 94%|█████████▍| 9412/10000 [2:06:22<11:12, 1.14s/it, loss=0.0098, lr=2.71e-06, step=9412] Training: 94%|█████████▍| 9413/10000 [2:06:22<10:25, 1.07s/it, loss=0.0098, lr=2.71e-06, step=9412] Training: 94%|█████████▍| 9413/10000 [2:06:22<10:25, 1.07s/it, loss=0.0034, lr=2.71e-06, step=9413] Training: 94%|█████████▍| 9414/10000 [2:06:23<09:35, 1.02it/s, loss=0.0034, lr=2.71e-06, step=9413] Training: 94%|█████████▍| 9414/10000 [2:06:23<09:35, 1.02it/s, loss=0.0060, lr=2.71e-06, step=9414] Training: 94%|█████████▍| 9415/10000 [2:06:24<09:23, 1.04it/s, loss=0.0060, lr=2.71e-06, step=9414] Training: 94%|█████████▍| 9415/10000 [2:06:24<09:23, 1.04it/s, loss=0.0029, lr=2.71e-06, step=9415] Training: 94%|█████████▍| 9416/10000 [2:06:26<11:32, 1.19s/it, loss=0.0029, lr=2.71e-06, step=9415] Training: 94%|█████████▍| 9416/10000 [2:06:26<11:32, 1.19s/it, loss=0.0125, lr=2.71e-06, step=9416] Training: 94%|█████████▍| 9417/10000 [2:06:27<10:40, 1.10s/it, loss=0.0125, lr=2.71e-06, step=9416] Training: 94%|█████████▍| 9417/10000 [2:06:27<10:40, 1.10s/it, loss=0.0022, lr=2.71e-06, step=9417] Training: 94%|█████████▍| 9418/10000 [2:06:28<10:15, 1.06s/it, loss=0.0022, lr=2.71e-06, step=9417] Training: 94%|█████████▍| 9418/10000 [2:06:28<10:15, 1.06s/it, loss=0.0064, lr=2.71e-06, step=9418] Training: 94%|█████████▍| 9419/10000 [2:06:28<09:33, 1.01it/s, loss=0.0064, lr=2.71e-06, step=9418] Training: 94%|█████████▍| 9419/10000 [2:06:28<09:33, 1.01it/s, loss=0.0019, lr=2.71e-06, step=9419]20:51:01.595 [I] step=9420 loss=0.0063 smoothed_loss=0.0065 lr=2.71e-06 grad_norm=0.4394 step_time=0.7633s data_time=0.2227s it/s=1.016 eta_to_10000=570.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0044 grad_action_out_proj_arms=0.0612 grad_arm_token_fuse=0.0234 grad_shared_expert=0.3507 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9420/10000 [2:06:29<08:56, 1.08it/s, loss=0.0019, lr=2.71e-06, step=9419] Training: 94%|█████████▍| 9420/10000 [2:06:29<08:56, 1.08it/s, loss=0.0063, lr=2.71e-06, step=9420] Training: 94%|█████████▍| 9421/10000 [2:06:30<08:29, 1.14it/s, loss=0.0063, lr=2.71e-06, step=9420] Training: 94%|█████████▍| 9421/10000 [2:06:30<08:29, 1.14it/s, loss=0.0089, lr=2.71e-06, step=9421] Training: 94%|█████████▍| 9422/10000 [2:06:31<09:20, 1.03it/s, loss=0.0089, lr=2.71e-06, step=9421] Training: 94%|█████████▍| 9422/10000 [2:06:31<09:20, 1.03it/s, loss=0.0219, lr=2.71e-06, step=9422] Training: 94%|█████████▍| 9423/10000 [2:06:32<09:33, 1.01it/s, loss=0.0219, lr=2.71e-06, step=9422] Training: 94%|█████████▍| 9423/10000 [2:06:32<09:33, 1.01it/s, loss=0.0071, lr=2.70e-06, step=9423] Training: 94%|█████████▍| 9424/10000 [2:06:33<09:38, 1.00s/it, loss=0.0071, lr=2.70e-06, step=9423] Training: 94%|█████████▍| 9424/10000 [2:06:33<09:38, 1.00s/it, loss=0.0193, lr=2.70e-06, step=9424] Training: 94%|█████████▍| 9425/10000 [2:06:34<09:44, 1.02s/it, loss=0.0193, lr=2.70e-06, step=9424] Training: 94%|█████████▍| 9425/10000 [2:06:34<09:44, 1.02s/it, loss=0.0060, lr=2.70e-06, step=9425] Training: 94%|█████████▍| 9426/10000 [2:06:36<10:37, 1.11s/it, loss=0.0060, lr=2.70e-06, step=9425] Training: 94%|█████████▍| 9426/10000 [2:06:36<10:37, 1.11s/it, loss=0.0044, lr=2.70e-06, step=9426] Training: 94%|█████████▍| 9427/10000 [2:06:37<10:45, 1.13s/it, loss=0.0044, lr=2.70e-06, step=9426] Training: 94%|█████████▍| 9427/10000 [2:06:37<10:45, 1.13s/it, loss=0.0096, lr=2.70e-06, step=9427] Training: 94%|█████████▍| 9428/10000 [2:06:39<12:48, 1.34s/it, loss=0.0096, lr=2.70e-06, step=9427] Training: 94%|█████████▍| 9428/10000 [2:06:39<12:48, 1.34s/it, loss=0.0027, lr=2.70e-06, step=9428] Training: 94%|█████████▍| 9429/10000 [2:06:40<13:46, 1.45s/it, loss=0.0027, lr=2.70e-06, step=9428] Training: 94%|█████████▍| 9429/10000 [2:06:40<13:46, 1.45s/it, loss=0.0046, lr=2.70e-06, step=9429]20:51:14.685 [I] step=9430 loss=0.0111 smoothed_loss=0.0080 lr=2.70e-06 grad_norm=0.4254 step_time=0.8938s data_time=0.4152s it/s=0.764 eta_to_10000=746.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0207 grad_action_out_proj_arms=0.1248 grad_arm_token_fuse=0.1081 grad_shared_expert=0.4637 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9430/10000 [2:06:42<15:19, 1.61s/it, loss=0.0046, lr=2.70e-06, step=9429] Training: 94%|█████████▍| 9430/10000 [2:06:42<15:19, 1.61s/it, loss=0.0111, lr=2.70e-06, step=9430] Training: 94%|█████████▍| 9431/10000 [2:06:44<14:03, 1.48s/it, loss=0.0111, lr=2.70e-06, step=9430] Training: 94%|█████████▍| 9431/10000 [2:06:44<14:03, 1.48s/it, loss=0.0122, lr=2.70e-06, step=9431] Training: 94%|█████████▍| 9432/10000 [2:06:44<12:29, 1.32s/it, loss=0.0122, lr=2.70e-06, step=9431] Training: 94%|█████████▍| 9432/10000 [2:06:44<12:29, 1.32s/it, loss=0.0018, lr=2.70e-06, step=9432] Training: 94%|█████████▍| 9433/10000 [2:06:45<10:50, 1.15s/it, loss=0.0018, lr=2.70e-06, step=9432] Training: 94%|█████████▍| 9433/10000 [2:06:45<10:50, 1.15s/it, loss=0.0070, lr=2.70e-06, step=9433] Training: 94%|█████████▍| 9434/10000 [2:06:46<10:07, 1.07s/it, loss=0.0070, lr=2.70e-06, step=9433] Training: 94%|█████████▍| 9434/10000 [2:06:46<10:07, 1.07s/it, loss=0.0218, lr=2.70e-06, step=9434] Training: 94%|█████████▍| 9435/10000 [2:06:47<08:51, 1.06it/s, loss=0.0218, lr=2.70e-06, step=9434] Training: 94%|█████████▍| 9435/10000 [2:06:47<08:51, 1.06it/s, loss=0.0261, lr=2.70e-06, step=9435] Training: 94%|█████████▍| 9436/10000 [2:06:48<09:31, 1.01s/it, loss=0.0261, lr=2.70e-06, step=9435] Training: 94%|█████████▍| 9436/10000 [2:06:48<09:31, 1.01s/it, loss=0.0151, lr=2.70e-06, step=9436] Training: 94%|█████████▍| 9437/10000 [2:06:49<08:39, 1.08it/s, loss=0.0151, lr=2.70e-06, step=9436] Training: 94%|█████████▍| 9437/10000 [2:06:49<08:39, 1.08it/s, loss=0.0046, lr=2.70e-06, step=9437] Training: 94%|█████████▍| 9438/10000 [2:06:50<08:38, 1.08it/s, loss=0.0046, lr=2.70e-06, step=9437] Training: 94%|█████████▍| 9438/10000 [2:06:50<08:38, 1.08it/s, loss=0.0069, lr=2.69e-06, step=9438] Training: 94%|█████████▍| 9439/10000 [2:06:50<08:33, 1.09it/s, loss=0.0069, lr=2.69e-06, step=9438] Training: 94%|█████████▍| 9439/10000 [2:06:50<08:33, 1.09it/s, loss=0.0060, lr=2.69e-06, step=9439]20:51:23.842 [I] step=9440 loss=0.0041 smoothed_loss=0.0092 lr=2.70e-06 grad_norm=0.4012 step_time=0.7034s data_time=0.2122s it/s=1.095 eta_to_10000=511.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0047 grad_action_out_proj_arms=0.0604 grad_arm_token_fuse=0.0259 grad_shared_expert=0.1900 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9440/10000 [2:06:52<08:51, 1.05it/s, loss=0.0060, lr=2.69e-06, step=9439] Training: 94%|█████████▍| 9440/10000 [2:06:52<08:51, 1.05it/s, loss=0.0041, lr=2.69e-06, step=9440] Training: 94%|█████████▍| 9441/10000 [2:06:52<08:27, 1.10it/s, loss=0.0041, lr=2.69e-06, step=9440] Training: 94%|█████████▍| 9441/10000 [2:06:52<08:27, 1.10it/s, loss=0.0165, lr=2.69e-06, step=9441] Training: 94%|█████████▍| 9442/10000 [2:06:53<07:29, 1.24it/s, loss=0.0165, lr=2.69e-06, step=9441] Training: 94%|█████████▍| 9442/10000 [2:06:53<07:29, 1.24it/s, loss=0.0070, lr=2.69e-06, step=9442] Training: 94%|█████████▍| 9443/10000 [2:06:54<07:51, 1.18it/s, loss=0.0070, lr=2.69e-06, step=9442] Training: 94%|█████████▍| 9443/10000 [2:06:54<07:51, 1.18it/s, loss=0.0202, lr=2.69e-06, step=9443] Training: 94%|█████████▍| 9444/10000 [2:06:55<08:01, 1.16it/s, loss=0.0202, lr=2.69e-06, step=9443] Training: 94%|█████████▍| 9444/10000 [2:06:55<08:01, 1.16it/s, loss=0.0026, lr=2.69e-06, step=9444] Training: 94%|█████████▍| 9445/10000 [2:06:55<07:05, 1.31it/s, loss=0.0026, lr=2.69e-06, step=9444] Training: 94%|█████████▍| 9445/10000 [2:06:55<07:05, 1.31it/s, loss=0.0155, lr=2.69e-06, step=9445] Training: 94%|█████████▍| 9446/10000 [2:06:56<07:02, 1.31it/s, loss=0.0155, lr=2.69e-06, step=9445] Training: 94%|█████████▍| 9446/10000 [2:06:56<07:02, 1.31it/s, loss=0.0051, lr=2.69e-06, step=9446] Training: 94%|█████████▍| 9447/10000 [2:06:57<06:21, 1.45it/s, loss=0.0051, lr=2.69e-06, step=9446] Training: 94%|█████████▍| 9447/10000 [2:06:57<06:21, 1.45it/s, loss=0.0035, lr=2.69e-06, step=9447] Training: 94%|█████████▍| 9448/10000 [2:06:57<05:48, 1.58it/s, loss=0.0035, lr=2.69e-06, step=9447] Training: 94%|█████████▍| 9448/10000 [2:06:57<05:48, 1.58it/s, loss=0.0023, lr=2.69e-06, step=9448] Training: 94%|█████████▍| 9449/10000 [2:06:58<05:24, 1.70it/s, loss=0.0023, lr=2.69e-06, step=9448] Training: 94%|█████████▍| 9449/10000 [2:06:58<05:24, 1.70it/s, loss=0.0034, lr=2.69e-06, step=9449]20:51:30.579 [I] step=9450 loss=0.0020 smoothed_loss=0.0075 lr=2.69e-06 grad_norm=0.4015 step_time=0.5494s data_time=0.1243s it/s=1.484 eta_to_10000=370.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0056 grad_action_out_proj_arms=0.0641 grad_arm_token_fuse=0.0278 grad_shared_expert=0.2417 (18633:train_pytorch.py:850) + Training: 94%|█████████▍| 9450/10000 [2:06:58<05:44, 1.60it/s, loss=0.0034, lr=2.69e-06, step=9449] Training: 94%|█████████▍| 9450/10000 [2:06:58<05:44, 1.60it/s, loss=0.0020, lr=2.69e-06, step=9450] Training: 95%|█████████▍| 9451/10000 [2:06:59<05:24, 1.69it/s, loss=0.0020, lr=2.69e-06, step=9450] Training: 95%|█████████▍| 9451/10000 [2:06:59<05:24, 1.69it/s, loss=0.0106, lr=2.69e-06, step=9451] Training: 95%|█████████▍| 9452/10000 [2:06:59<05:32, 1.65it/s, loss=0.0106, lr=2.69e-06, step=9451] Training: 95%|█████████▍| 9452/10000 [2:06:59<05:32, 1.65it/s, loss=0.0042, lr=2.68e-06, step=9452] Training: 95%|█████████▍| 9453/10000 [2:07:00<05:54, 1.54it/s, loss=0.0042, lr=2.68e-06, step=9452] Training: 95%|█████████▍| 9453/10000 [2:07:00<05:54, 1.54it/s, loss=0.0206, lr=2.68e-06, step=9453] Training: 95%|█████████▍| 9454/10000 [2:07:01<05:59, 1.52it/s, loss=0.0206, lr=2.68e-06, step=9453] Training: 95%|█████████▍| 9454/10000 [2:07:01<05:59, 1.52it/s, loss=0.0049, lr=2.68e-06, step=9454] Training: 95%|█████████▍| 9455/10000 [2:07:02<06:18, 1.44it/s, loss=0.0049, lr=2.68e-06, step=9454] Training: 95%|█████████▍| 9455/10000 [2:07:02<06:18, 1.44it/s, loss=0.0555, lr=2.68e-06, step=9455] Training: 95%|█████████▍| 9456/10000 [2:07:02<05:48, 1.56it/s, loss=0.0555, lr=2.68e-06, step=9455] Training: 95%|█████████▍| 9456/10000 [2:07:02<05:48, 1.56it/s, loss=0.0014, lr=2.68e-06, step=9456] Training: 95%|█████████▍| 9457/10000 [2:07:03<06:19, 1.43it/s, loss=0.0014, lr=2.68e-06, step=9456] Training: 95%|█████████▍| 9457/10000 [2:07:03<06:19, 1.43it/s, loss=0.0036, lr=2.68e-06, step=9457] Training: 95%|█████████▍| 9458/10000 [2:07:04<07:20, 1.23it/s, loss=0.0036, lr=2.68e-06, step=9457] Training: 95%|█████████▍| 9458/10000 [2:07:04<07:20, 1.23it/s, loss=0.0012, lr=2.68e-06, step=9458] Training: 95%|█████████▍| 9459/10000 [2:07:05<07:27, 1.21it/s, loss=0.0012, lr=2.68e-06, step=9458] Training: 95%|█████████▍| 9459/10000 [2:07:05<07:27, 1.21it/s, loss=0.0032, lr=2.68e-06, step=9459]20:51:37.928 [I] step=9460 loss=0.0218 smoothed_loss=0.0106 lr=2.68e-06 grad_norm=0.4419 step_time=0.5730s data_time=0.1619s it/s=1.361 eta_to_10000=396.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0134 grad_action_out_proj_arms=0.0915 grad_arm_token_fuse=0.0700 grad_shared_expert=0.4239 (18633:train_pytorch.py:850) + Training: 95%|█████████▍| 9460/10000 [2:07:06<07:06, 1.27it/s, loss=0.0032, lr=2.68e-06, step=9459] Training: 95%|█████████▍| 9460/10000 [2:07:06<07:06, 1.27it/s, loss=0.0218, lr=2.68e-06, step=9460] Training: 95%|█████████▍| 9461/10000 [2:07:06<07:02, 1.28it/s, loss=0.0218, lr=2.68e-06, step=9460] Training: 95%|█████████▍| 9461/10000 [2:07:06<07:02, 1.28it/s, loss=0.0100, lr=2.68e-06, step=9461] Training: 95%|█████████▍| 9462/10000 [2:07:07<07:22, 1.22it/s, loss=0.0100, lr=2.68e-06, step=9461] Training: 95%|█████████▍| 9462/10000 [2:07:07<07:22, 1.22it/s, loss=0.0011, lr=2.68e-06, step=9462] Training: 95%|█████████▍| 9463/10000 [2:07:08<06:32, 1.37it/s, loss=0.0011, lr=2.68e-06, step=9462] Training: 95%|█████████▍| 9463/10000 [2:07:08<06:32, 1.37it/s, loss=0.0012, lr=2.68e-06, step=9463] Training: 95%|█████████▍| 9464/10000 [2:07:09<07:09, 1.25it/s, loss=0.0012, lr=2.68e-06, step=9463] Training: 95%|█████████▍| 9464/10000 [2:07:09<07:09, 1.25it/s, loss=0.0017, lr=2.68e-06, step=9464] Training: 95%|█████████▍| 9465/10000 [2:07:10<08:20, 1.07it/s, loss=0.0017, lr=2.68e-06, step=9464] Training: 95%|█████████▍| 9465/10000 [2:07:10<08:20, 1.07it/s, loss=0.0088, lr=2.68e-06, step=9465] Training: 95%|█████████▍| 9466/10000 [2:07:11<07:21, 1.21it/s, loss=0.0088, lr=2.68e-06, step=9465] Training: 95%|█████████▍| 9466/10000 [2:07:11<07:21, 1.21it/s, loss=0.0068, lr=2.68e-06, step=9466] Training: 95%|█████████▍| 9467/10000 [2:07:11<07:29, 1.19it/s, loss=0.0068, lr=2.68e-06, step=9466] Training: 95%|█████████▍| 9467/10000 [2:07:11<07:29, 1.19it/s, loss=0.0009, lr=2.67e-06, step=9467] Training: 95%|█████████▍| 9468/10000 [2:07:12<07:06, 1.25it/s, loss=0.0009, lr=2.67e-06, step=9467] Training: 95%|█████████▍| 9468/10000 [2:07:12<07:06, 1.25it/s, loss=0.0121, lr=2.67e-06, step=9468] Training: 95%|█████████▍| 9469/10000 [2:07:13<06:33, 1.35it/s, loss=0.0121, lr=2.67e-06, step=9468] Training: 95%|█████████▍| 9469/10000 [2:07:13<06:33, 1.35it/s, loss=0.0050, lr=2.67e-06, step=9469]20:51:45.970 [I] step=9470 loss=0.0082 smoothed_loss=0.0076 lr=2.68e-06 grad_norm=0.4498 step_time=0.6410s data_time=0.1632s it/s=1.244 eta_to_10000=426.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0087 grad_action_out_proj_arms=0.0794 grad_arm_token_fuse=0.0430 grad_shared_expert=0.2719 (18633:train_pytorch.py:850) + Training: 95%|█████████▍| 9470/10000 [2:07:14<06:55, 1.28it/s, loss=0.0050, lr=2.67e-06, step=9469] Training: 95%|█████████▍| 9470/10000 [2:07:14<06:55, 1.28it/s, loss=0.0082, lr=2.67e-06, step=9470] Training: 95%|█████████▍| 9471/10000 [2:07:15<07:09, 1.23it/s, loss=0.0082, lr=2.67e-06, step=9470] Training: 95%|█████████▍| 9471/10000 [2:07:15<07:09, 1.23it/s, loss=0.0110, lr=2.67e-06, step=9471] Training: 95%|█████████▍| 9472/10000 [2:07:16<07:33, 1.17it/s, loss=0.0110, lr=2.67e-06, step=9471] Training: 95%|█████████▍| 9472/10000 [2:07:16<07:33, 1.17it/s, loss=0.0051, lr=2.67e-06, step=9472] Training: 95%|█████████▍| 9473/10000 [2:07:16<07:04, 1.24it/s, loss=0.0051, lr=2.67e-06, step=9472] Training: 95%|█████████▍| 9473/10000 [2:07:16<07:04, 1.24it/s, loss=0.0196, lr=2.67e-06, step=9473] Training: 95%|█████████▍| 9474/10000 [2:07:17<07:04, 1.24it/s, loss=0.0196, lr=2.67e-06, step=9473] Training: 95%|█████████▍| 9474/10000 [2:07:17<07:04, 1.24it/s, loss=0.0029, lr=2.67e-06, step=9474] Training: 95%|█████████▍| 9475/10000 [2:07:18<07:37, 1.15it/s, loss=0.0029, lr=2.67e-06, step=9474] Training: 95%|█████████▍| 9475/10000 [2:07:18<07:37, 1.15it/s, loss=0.0117, lr=2.67e-06, step=9475] Training: 95%|█████████▍| 9476/10000 [2:07:19<07:10, 1.22it/s, loss=0.0117, lr=2.67e-06, step=9475] Training: 95%|█████████▍| 9476/10000 [2:07:19<07:10, 1.22it/s, loss=0.0023, lr=2.67e-06, step=9476] Training: 95%|█████████▍| 9477/10000 [2:07:19<06:54, 1.26it/s, loss=0.0023, lr=2.67e-06, step=9476] Training: 95%|█████████▍| 9477/10000 [2:07:19<06:54, 1.26it/s, loss=0.0012, lr=2.67e-06, step=9477] Training: 95%|█████████▍| 9478/10000 [2:07:20<06:56, 1.25it/s, loss=0.0012, lr=2.67e-06, step=9477] Training: 95%|█████████▍| 9478/10000 [2:07:20<06:56, 1.25it/s, loss=0.0010, lr=2.67e-06, step=9478] Training: 95%|█████████▍| 9479/10000 [2:07:21<08:03, 1.08it/s, loss=0.0010, lr=2.67e-06, step=9478] Training: 95%|█████████▍| 9479/10000 [2:07:21<08:03, 1.08it/s, loss=0.0120, lr=2.67e-06, step=9479]20:51:54.368 [I] step=9480 loss=0.0181 smoothed_loss=0.0083 lr=2.67e-06 grad_norm=0.4671 step_time=0.6686s data_time=0.1712s it/s=1.193 eta_to_10000=435.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0167 grad_action_out_proj_arms=0.1036 grad_arm_token_fuse=0.0846 grad_shared_expert=0.5053 (18633:train_pytorch.py:850) + Training: 95%|█████████▍| 9480/10000 [2:07:22<07:04, 1.22it/s, loss=0.0120, lr=2.67e-06, step=9479] Training: 95%|█████████▍| 9480/10000 [2:07:22<07:04, 1.22it/s, loss=0.0181, lr=2.67e-06, step=9480] Training: 95%|█████████▍| 9481/10000 [2:07:23<06:53, 1.25it/s, loss=0.0181, lr=2.67e-06, step=9480] Training: 95%|█████████▍| 9481/10000 [2:07:23<06:53, 1.25it/s, loss=0.0508, lr=2.67e-06, step=9481] Training: 95%|█████████▍| 9482/10000 [2:07:24<06:47, 1.27it/s, loss=0.0508, lr=2.67e-06, step=9481] Training: 95%|█████████▍| 9482/10000 [2:07:24<06:47, 1.27it/s, loss=0.0031, lr=2.67e-06, step=9482] Training: 95%|█████████▍| 9483/10000 [2:07:24<06:45, 1.28it/s, loss=0.0031, lr=2.67e-06, step=9482] Training: 95%|█████████▍| 9483/10000 [2:07:24<06:45, 1.28it/s, loss=0.0073, lr=2.66e-06, step=9483] Training: 95%|█████████▍| 9484/10000 [2:07:25<07:09, 1.20it/s, loss=0.0073, lr=2.66e-06, step=9483] Training: 95%|█████████▍| 9484/10000 [2:07:25<07:09, 1.20it/s, loss=0.0037, lr=2.66e-06, step=9484] Training: 95%|█████████▍| 9485/10000 [2:07:26<07:05, 1.21it/s, loss=0.0037, lr=2.66e-06, step=9484] Training: 95%|█████████▍| 9485/10000 [2:07:26<07:05, 1.21it/s, loss=0.0069, lr=2.66e-06, step=9485] Training: 95%|█████████▍| 9486/10000 [2:07:27<07:34, 1.13it/s, loss=0.0069, lr=2.66e-06, step=9485] Training: 95%|█████████▍| 9486/10000 [2:07:27<07:34, 1.13it/s, loss=0.0022, lr=2.66e-06, step=9486] Training: 95%|█████████▍| 9487/10000 [2:07:28<06:44, 1.27it/s, loss=0.0022, lr=2.66e-06, step=9486] Training: 95%|█████████▍| 9487/10000 [2:07:28<06:44, 1.27it/s, loss=0.0114, lr=2.66e-06, step=9487] Training: 95%|█████████▍| 9488/10000 [2:07:29<07:08, 1.19it/s, loss=0.0114, lr=2.66e-06, step=9487] Training: 95%|█████████▍| 9488/10000 [2:07:29<07:08, 1.19it/s, loss=0.0017, lr=2.66e-06, step=9488] Training: 95%|█████████▍| 9489/10000 [2:07:30<08:04, 1.05it/s, loss=0.0017, lr=2.66e-06, step=9488] Training: 95%|█████████▍| 9489/10000 [2:07:30<08:04, 1.05it/s, loss=0.0021, lr=2.66e-06, step=9489]20:52:03.172 [I] step=9490 loss=0.0317 smoothed_loss=0.0104 lr=2.66e-06 grad_norm=0.4732 step_time=0.6925s data_time=0.1879s it/s=1.136 eta_to_10000=448.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0095 grad_action_out_proj_arms=0.1255 grad_arm_token_fuse=0.0457 grad_shared_expert=0.9856 (18633:train_pytorch.py:850) + Training: 95%|█████████▍| 9490/10000 [2:07:31<08:14, 1.03it/s, loss=0.0021, lr=2.66e-06, step=9489] Training: 95%|█████████▍| 9490/10000 [2:07:31<08:14, 1.03it/s, loss=0.0317, lr=2.66e-06, step=9490] Training: 95%|█████████▍| 9491/10000 [2:07:32<08:05, 1.05it/s, loss=0.0317, lr=2.66e-06, step=9490] Training: 95%|█████████▍| 9491/10000 [2:07:32<08:05, 1.05it/s, loss=0.0323, lr=2.66e-06, step=9491] Training: 95%|█████████▍| 9492/10000 [2:07:32<07:21, 1.15it/s, loss=0.0323, lr=2.66e-06, step=9491] Training: 95%|█████████▍| 9492/10000 [2:07:32<07:21, 1.15it/s, loss=0.0050, lr=2.66e-06, step=9492] Training: 95%|█████████▍| 9493/10000 [2:07:34<08:22, 1.01it/s, loss=0.0050, lr=2.66e-06, step=9492] Training: 95%|█████████▍| 9493/10000 [2:07:34<08:22, 1.01it/s, loss=0.0080, lr=2.66e-06, step=9493] Training: 95%|█████████▍| 9494/10000 [2:07:35<07:55, 1.06it/s, loss=0.0080, lr=2.66e-06, step=9493] Training: 95%|█████████▍| 9494/10000 [2:07:35<07:55, 1.06it/s, loss=0.0014, lr=2.66e-06, step=9494] Training: 95%|█████████▍| 9495/10000 [2:07:35<07:02, 1.20it/s, loss=0.0014, lr=2.66e-06, step=9494] Training: 95%|█████████▍| 9495/10000 [2:07:35<07:02, 1.20it/s, loss=0.0167, lr=2.66e-06, step=9495] Training: 95%|█████████▍| 9496/10000 [2:07:36<06:37, 1.27it/s, loss=0.0167, lr=2.66e-06, step=9495] Training: 95%|█████████▍| 9496/10000 [2:07:36<06:37, 1.27it/s, loss=0.0033, lr=2.66e-06, step=9496] Training: 95%|█████████▍| 9497/10000 [2:07:36<06:14, 1.34it/s, loss=0.0033, lr=2.66e-06, step=9496] Training: 95%|█████████▍| 9497/10000 [2:07:36<06:14, 1.34it/s, loss=0.0044, lr=2.66e-06, step=9497] Training: 95%|█████████▍| 9498/10000 [2:07:37<05:51, 1.43it/s, loss=0.0044, lr=2.66e-06, step=9497] Training: 95%|█████████▍| 9498/10000 [2:07:37<05:51, 1.43it/s, loss=0.0161, lr=2.66e-06, step=9498] Training: 95%|█████████▍| 9499/10000 [2:07:38<06:06, 1.37it/s, loss=0.0161, lr=2.66e-06, step=9498] Training: 95%|█████████▍| 9499/10000 [2:07:38<06:06, 1.37it/s, loss=0.0058, lr=2.65e-06, step=9499]20:52:11.339 [I] step=9500 loss=0.0037 smoothed_loss=0.0093 lr=2.66e-06 grad_norm=0.4200 step_time=0.6087s data_time=0.2080s it/s=1.226 eta_to_10000=407.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0032 grad_action_out_proj_arms=0.0491 grad_arm_token_fuse=0.0176 grad_shared_expert=0.2581 (18633:train_pytorch.py:850) + Training: 95%|█████████▌| 9500/10000 [2:07:39<07:11, 1.16it/s, loss=0.0058, lr=2.65e-06, step=9499] Training: 95%|█████████▌| 9500/10000 [2:07:39<07:11, 1.16it/s, loss=0.0037, lr=2.65e-06, step=9500] Training: 95%|█████████▌| 9501/10000 [2:07:40<08:13, 1.01it/s, loss=0.0037, lr=2.65e-06, step=9500] Training: 95%|█████████▌| 9501/10000 [2:07:40<08:13, 1.01it/s, loss=0.0015, lr=2.65e-06, step=9501] Training: 95%|█████████▌| 9502/10000 [2:07:41<07:42, 1.08it/s, loss=0.0015, lr=2.65e-06, step=9501] Training: 95%|█████████▌| 9502/10000 [2:07:41<07:42, 1.08it/s, loss=0.0510, lr=2.65e-06, step=9502] Training: 95%|█████████▌| 9503/10000 [2:07:42<08:09, 1.01it/s, loss=0.0510, lr=2.65e-06, step=9502] Training: 95%|█████████▌| 9503/10000 [2:07:42<08:09, 1.01it/s, loss=0.0163, lr=2.65e-06, step=9503] Training: 95%|█████████▌| 9504/10000 [2:07:43<07:27, 1.11it/s, loss=0.0163, lr=2.65e-06, step=9503] Training: 95%|█████████▌| 9504/10000 [2:07:43<07:27, 1.11it/s, loss=0.0073, lr=2.65e-06, step=9504] Training: 95%|█████████▌| 9505/10000 [2:07:43<06:28, 1.27it/s, loss=0.0073, lr=2.65e-06, step=9504] Training: 95%|█████████▌| 9505/10000 [2:07:43<06:28, 1.27it/s, loss=0.0095, lr=2.65e-06, step=9505] Training: 95%|█████████▌| 9506/10000 [2:07:44<06:01, 1.36it/s, loss=0.0095, lr=2.65e-06, step=9505] Training: 95%|█████████▌| 9506/10000 [2:07:44<06:01, 1.36it/s, loss=0.0012, lr=2.65e-06, step=9506] Training: 95%|█████████▌| 9507/10000 [2:07:45<07:24, 1.11it/s, loss=0.0012, lr=2.65e-06, step=9506] Training: 95%|█████████▌| 9507/10000 [2:07:45<07:24, 1.11it/s, loss=0.0032, lr=2.65e-06, step=9507] Training: 95%|█████████▌| 9508/10000 [2:07:46<06:28, 1.27it/s, loss=0.0032, lr=2.65e-06, step=9507] Training: 95%|█████████▌| 9508/10000 [2:07:46<06:28, 1.27it/s, loss=0.0088, lr=2.65e-06, step=9508] Training: 95%|█████████▌| 9509/10000 [2:07:46<06:01, 1.36it/s, loss=0.0088, lr=2.65e-06, step=9508] Training: 95%|█████████▌| 9509/10000 [2:07:46<06:01, 1.36it/s, loss=0.0021, lr=2.65e-06, step=9509]20:52:19.546 [I] step=9510 loss=0.0344 smoothed_loss=0.0119 lr=2.65e-06 grad_norm=0.5275 step_time=0.6152s data_time=0.2054s it/s=1.221 eta_to_10000=401.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0236 grad_action_out_proj_arms=0.1308 grad_arm_token_fuse=0.1239 grad_shared_expert=0.4343 (18633:train_pytorch.py:850) + Training: 95%|█████████▌| 9510/10000 [2:07:47<06:01, 1.36it/s, loss=0.0021, lr=2.65e-06, step=9509] Training: 95%|█████████▌| 9510/10000 [2:07:47<06:01, 1.36it/s, loss=0.0344, lr=2.65e-06, step=9510] Training: 95%|█████████▌| 9511/10000 [2:07:48<05:25, 1.50it/s, loss=0.0344, lr=2.65e-06, step=9510] Training: 95%|█████████▌| 9511/10000 [2:07:48<05:25, 1.50it/s, loss=0.0017, lr=2.65e-06, step=9511] Training: 95%|█████████▌| 9512/10000 [2:07:48<05:20, 1.52it/s, loss=0.0017, lr=2.65e-06, step=9511] Training: 95%|█████████▌| 9512/10000 [2:07:48<05:20, 1.52it/s, loss=0.0052, lr=2.65e-06, step=9512] Training: 95%|█████████▌| 9513/10000 [2:07:49<05:46, 1.40it/s, loss=0.0052, lr=2.65e-06, step=9512] Training: 95%|█████████▌| 9513/10000 [2:07:49<05:46, 1.40it/s, loss=0.0053, lr=2.65e-06, step=9513] Training: 95%|█████████▌| 9514/10000 [2:07:50<06:09, 1.32it/s, loss=0.0053, lr=2.65e-06, step=9513] Training: 95%|█████████▌| 9514/10000 [2:07:50<06:09, 1.32it/s, loss=0.0033, lr=2.65e-06, step=9514] Training: 95%|█████████▌| 9515/10000 [2:07:51<06:42, 1.20it/s, loss=0.0033, lr=2.65e-06, step=9514] Training: 95%|█████████▌| 9515/10000 [2:07:51<06:42, 1.20it/s, loss=0.0062, lr=2.64e-06, step=9515] Training: 95%|█████████▌| 9516/10000 [2:07:52<06:23, 1.26it/s, loss=0.0062, lr=2.64e-06, step=9515] Training: 95%|█████████▌| 9516/10000 [2:07:52<06:23, 1.26it/s, loss=0.0056, lr=2.64e-06, step=9516] Training: 95%|█████████▌| 9517/10000 [2:07:52<06:14, 1.29it/s, loss=0.0056, lr=2.64e-06, step=9516] Training: 95%|█████████▌| 9517/10000 [2:07:52<06:14, 1.29it/s, loss=0.0028, lr=2.64e-06, step=9517] Training: 95%|█████████▌| 9518/10000 [2:07:53<05:39, 1.42it/s, loss=0.0028, lr=2.64e-06, step=9517] Training: 95%|█████████▌| 9518/10000 [2:07:53<05:39, 1.42it/s, loss=0.0164, lr=2.64e-06, step=9518] Training: 95%|█████████▌| 9519/10000 [2:07:54<05:29, 1.46it/s, loss=0.0164, lr=2.64e-06, step=9518] Training: 95%|█████████▌| 9519/10000 [2:07:54<05:29, 1.46it/s, loss=0.0174, lr=2.64e-06, step=9519]20:52:26.738 [I] step=9520 loss=0.0064 smoothed_loss=0.0093 lr=2.64e-06 grad_norm=0.3722 step_time=0.6031s data_time=0.1161s it/s=1.391 eta_to_10000=345.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0047 grad_action_out_proj_arms=0.0516 grad_arm_token_fuse=0.0265 grad_shared_expert=0.2911 (18633:train_pytorch.py:850) + Training: 95%|█████████▌| 9520/10000 [2:07:54<05:35, 1.43it/s, loss=0.0174, lr=2.64e-06, step=9519] Training: 95%|█████████▌| 9520/10000 [2:07:54<05:35, 1.43it/s, loss=0.0064, lr=2.64e-06, step=9520] Training: 95%|█████████▌| 9521/10000 [2:07:55<05:45, 1.39it/s, loss=0.0064, lr=2.64e-06, step=9520] Training: 95%|█████████▌| 9521/10000 [2:07:55<05:45, 1.39it/s, loss=0.0159, lr=2.64e-06, step=9521] Training: 95%|█████████▌| 9522/10000 [2:07:56<06:55, 1.15it/s, loss=0.0159, lr=2.64e-06, step=9521] Training: 95%|█████████▌| 9522/10000 [2:07:56<06:55, 1.15it/s, loss=0.0245, lr=2.64e-06, step=9522] Training: 95%|█████████▌| 9523/10000 [2:07:57<06:48, 1.17it/s, loss=0.0245, lr=2.64e-06, step=9522] Training: 95%|█████████▌| 9523/10000 [2:07:57<06:48, 1.17it/s, loss=0.0024, lr=2.64e-06, step=9523] Training: 95%|█████████▌| 9524/10000 [2:07:58<06:25, 1.24it/s, loss=0.0024, lr=2.64e-06, step=9523] Training: 95%|█████████▌| 9524/10000 [2:07:58<06:25, 1.24it/s, loss=0.0019, lr=2.64e-06, step=9524] Training: 95%|█████████▌| 9525/10000 [2:07:59<06:28, 1.22it/s, loss=0.0019, lr=2.64e-06, step=9524] Training: 95%|█████████▌| 9525/10000 [2:07:59<06:28, 1.22it/s, loss=0.0021, lr=2.64e-06, step=9525] Training: 95%|█████████▌| 9526/10000 [2:07:59<05:53, 1.34it/s, loss=0.0021, lr=2.64e-06, step=9525] Training: 95%|█████████▌| 9526/10000 [2:07:59<05:53, 1.34it/s, loss=0.0074, lr=2.64e-06, step=9526] Training: 95%|█████████▌| 9527/10000 [2:08:00<05:37, 1.40it/s, loss=0.0074, lr=2.64e-06, step=9526] Training: 95%|█████████▌| 9527/10000 [2:08:00<05:37, 1.40it/s, loss=0.0018, lr=2.64e-06, step=9527] Training: 95%|█████████▌| 9528/10000 [2:08:01<06:26, 1.22it/s, loss=0.0018, lr=2.64e-06, step=9527] Training: 95%|█████████▌| 9528/10000 [2:08:01<06:26, 1.22it/s, loss=0.0054, lr=2.64e-06, step=9528] Training: 95%|█████████▌| 9529/10000 [2:08:02<06:34, 1.19it/s, loss=0.0054, lr=2.64e-06, step=9528] Training: 95%|█████████▌| 9529/10000 [2:08:02<06:34, 1.19it/s, loss=0.0136, lr=2.64e-06, step=9529]20:52:35.061 [I] step=9530 loss=0.0044 smoothed_loss=0.0080 lr=2.64e-06 grad_norm=0.3223 step_time=0.6835s data_time=0.1488s it/s=1.202 eta_to_10000=391.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0032 grad_action_out_proj_arms=0.0453 grad_arm_token_fuse=0.0175 grad_shared_expert=0.1596 (18633:train_pytorch.py:850) + Training: 95%|█████████▌| 9530/10000 [2:08:03<06:30, 1.20it/s, loss=0.0136, lr=2.64e-06, step=9529] Training: 95%|█████████▌| 9530/10000 [2:08:03<06:30, 1.20it/s, loss=0.0044, lr=2.64e-06, step=9530] Training: 95%|█████████▌| 9531/10000 [2:08:04<06:46, 1.15it/s, loss=0.0044, lr=2.64e-06, step=9530] Training: 95%|█████████▌| 9531/10000 [2:08:04<06:46, 1.15it/s, loss=0.0173, lr=2.64e-06, step=9531] Training: 95%|█████████▌| 9532/10000 [2:08:04<05:53, 1.32it/s, loss=0.0173, lr=2.64e-06, step=9531] Training: 95%|█████████▌| 9532/10000 [2:08:04<05:53, 1.32it/s, loss=0.0256, lr=2.64e-06, step=9532] Training: 95%|█████████▌| 9533/10000 [2:08:05<05:39, 1.38it/s, loss=0.0256, lr=2.64e-06, step=9532] Training: 95%|█████████▌| 9533/10000 [2:08:05<05:39, 1.38it/s, loss=0.0208, lr=2.63e-06, step=9533] Training: 95%|█████████▌| 9534/10000 [2:08:05<05:25, 1.43it/s, loss=0.0208, lr=2.63e-06, step=9533] Training: 95%|█████████▌| 9534/10000 [2:08:05<05:25, 1.43it/s, loss=0.0255, lr=2.63e-06, step=9534] Training: 95%|█████████▌| 9535/10000 [2:08:06<05:26, 1.42it/s, loss=0.0255, lr=2.63e-06, step=9534] Training: 95%|█████████▌| 9535/10000 [2:08:06<05:26, 1.42it/s, loss=0.0284, lr=2.63e-06, step=9535] Training: 95%|█████████▌| 9536/10000 [2:08:07<05:40, 1.36it/s, loss=0.0284, lr=2.63e-06, step=9535] Training: 95%|█████████▌| 9536/10000 [2:08:07<05:40, 1.36it/s, loss=0.0021, lr=2.63e-06, step=9536] Training: 95%|█████████▌| 9537/10000 [2:08:08<05:43, 1.35it/s, loss=0.0021, lr=2.63e-06, step=9536] Training: 95%|█████████▌| 9537/10000 [2:08:08<05:43, 1.35it/s, loss=0.0029, lr=2.63e-06, step=9537] Training: 95%|█████████▌| 9538/10000 [2:08:08<05:35, 1.38it/s, loss=0.0029, lr=2.63e-06, step=9537] Training: 95%|█████████▌| 9538/10000 [2:08:08<05:35, 1.38it/s, loss=0.0056, lr=2.63e-06, step=9538] Training: 95%|█████████▌| 9539/10000 [2:08:09<05:34, 1.38it/s, loss=0.0056, lr=2.63e-06, step=9538] Training: 95%|█████████▌| 9539/10000 [2:08:09<05:34, 1.38it/s, loss=0.0123, lr=2.63e-06, step=9539]20:52:42.054 [I] step=9540 loss=0.0006 smoothed_loss=0.0106 lr=2.63e-06 grad_norm=0.4292 step_time=0.5674s data_time=0.1319s it/s=1.432 eta_to_10000=321.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0178 grad_action_out_proj_arms=0.1134 grad_arm_token_fuse=0.0911 grad_shared_expert=0.5885 (18633:train_pytorch.py:850) + Training: 95%|█████████▌| 9540/10000 [2:08:10<05:13, 1.47it/s, loss=0.0123, lr=2.63e-06, step=9539] Training: 95%|█████████▌| 9540/10000 [2:08:10<05:13, 1.47it/s, loss=0.0006, lr=2.63e-06, step=9540] Training: 95%|█████████▌| 9541/10000 [2:08:11<05:40, 1.35it/s, loss=0.0006, lr=2.63e-06, step=9540] Training: 95%|█████████▌| 9541/10000 [2:08:11<05:40, 1.35it/s, loss=0.0045, lr=2.63e-06, step=9541] Training: 95%|█████████▌| 9542/10000 [2:08:11<05:54, 1.29it/s, loss=0.0045, lr=2.63e-06, step=9541] Training: 95%|█████████▌| 9542/10000 [2:08:11<05:54, 1.29it/s, loss=0.0063, lr=2.63e-06, step=9542] Training: 95%|█████████▌| 9543/10000 [2:08:12<06:21, 1.20it/s, loss=0.0063, lr=2.63e-06, step=9542] Training: 95%|█████████▌| 9543/10000 [2:08:12<06:21, 1.20it/s, loss=0.0168, lr=2.63e-06, step=9543] Training: 95%|█████████▌| 9544/10000 [2:08:13<06:12, 1.22it/s, loss=0.0168, lr=2.63e-06, step=9543] Training: 95%|█████████▌| 9544/10000 [2:08:13<06:12, 1.22it/s, loss=0.0100, lr=2.63e-06, step=9544] Training: 95%|█████████▌| 9545/10000 [2:08:14<06:20, 1.20it/s, loss=0.0100, lr=2.63e-06, step=9544] Training: 95%|█████████▌| 9545/10000 [2:08:14<06:20, 1.20it/s, loss=0.0051, lr=2.63e-06, step=9545] Training: 95%|█████████▌| 9546/10000 [2:08:15<06:33, 1.16it/s, loss=0.0051, lr=2.63e-06, step=9545] Training: 95%|█████████▌| 9546/10000 [2:08:15<06:33, 1.16it/s, loss=0.0011, lr=2.63e-06, step=9546] Training: 95%|█████████▌| 9547/10000 [2:08:16<06:10, 1.22it/s, loss=0.0011, lr=2.63e-06, step=9546] Training: 95%|█████████▌| 9547/10000 [2:08:16<06:10, 1.22it/s, loss=0.0067, lr=2.63e-06, step=9547] Training: 95%|█████████▌| 9548/10000 [2:08:17<06:00, 1.25it/s, loss=0.0067, lr=2.63e-06, step=9547] Training: 95%|█████████▌| 9548/10000 [2:08:17<06:00, 1.25it/s, loss=0.0031, lr=2.63e-06, step=9548] Training: 95%|█████████▌| 9549/10000 [2:08:17<05:40, 1.33it/s, loss=0.0031, lr=2.63e-06, step=9548] Training: 95%|█████████▌| 9549/10000 [2:08:17<05:40, 1.33it/s, loss=0.0163, lr=2.63e-06, step=9549]20:52:50.641 [I] step=9550 loss=0.0036 smoothed_loss=0.0084 lr=2.63e-06 grad_norm=0.4260 step_time=0.6543s data_time=0.2044s it/s=1.168 eta_to_10000=385.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0948 grad_arm_token_fuse=0.0383 grad_shared_expert=0.2960 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9550/10000 [2:08:18<06:33, 1.14it/s, loss=0.0163, lr=2.63e-06, step=9549] Training: 96%|█████████▌| 9550/10000 [2:08:18<06:33, 1.14it/s, loss=0.0036, lr=2.62e-06, step=9550] Training: 96%|█████████▌| 9551/10000 [2:08:19<05:43, 1.31it/s, loss=0.0036, lr=2.62e-06, step=9550] Training: 96%|█████████▌| 9551/10000 [2:08:19<05:43, 1.31it/s, loss=0.0403, lr=2.62e-06, step=9551] Training: 96%|█████████▌| 9552/10000 [2:08:19<05:26, 1.37it/s, loss=0.0403, lr=2.62e-06, step=9551] Training: 96%|█████████▌| 9552/10000 [2:08:19<05:26, 1.37it/s, loss=0.0034, lr=2.62e-06, step=9552] Training: 96%|█████████▌| 9553/10000 [2:08:20<04:54, 1.52it/s, loss=0.0034, lr=2.62e-06, step=9552] Training: 96%|█████████▌| 9553/10000 [2:08:20<04:54, 1.52it/s, loss=0.0031, lr=2.62e-06, step=9553] Training: 96%|█████████▌| 9554/10000 [2:08:21<04:47, 1.55it/s, loss=0.0031, lr=2.62e-06, step=9553] Training: 96%|█████████▌| 9554/10000 [2:08:21<04:47, 1.55it/s, loss=0.0105, lr=2.62e-06, step=9554] Training: 96%|█████████▌| 9555/10000 [2:08:21<05:05, 1.46it/s, loss=0.0105, lr=2.62e-06, step=9554] Training: 96%|█████████▌| 9555/10000 [2:08:21<05:05, 1.46it/s, loss=0.0018, lr=2.62e-06, step=9555] Training: 96%|█████████▌| 9556/10000 [2:08:22<04:55, 1.50it/s, loss=0.0018, lr=2.62e-06, step=9555] Training: 96%|█████████▌| 9556/10000 [2:08:22<04:55, 1.50it/s, loss=0.0064, lr=2.62e-06, step=9556] Training: 96%|█████████▌| 9557/10000 [2:08:23<05:04, 1.45it/s, loss=0.0064, lr=2.62e-06, step=9556] Training: 96%|█████████▌| 9557/10000 [2:08:23<05:04, 1.45it/s, loss=0.0014, lr=2.62e-06, step=9557] Training: 96%|█████████▌| 9558/10000 [2:08:23<04:57, 1.48it/s, loss=0.0014, lr=2.62e-06, step=9557] Training: 96%|█████████▌| 9558/10000 [2:08:23<04:57, 1.48it/s, loss=0.0086, lr=2.62e-06, step=9558] Training: 96%|█████████▌| 9559/10000 [2:08:24<05:11, 1.42it/s, loss=0.0086, lr=2.62e-06, step=9558] Training: 96%|█████████▌| 9559/10000 [2:08:24<05:11, 1.42it/s, loss=0.0183, lr=2.62e-06, step=9559]20:52:57.109 [I] step=9560 loss=0.0075 smoothed_loss=0.0091 lr=2.62e-06 grad_norm=0.4058 step_time=0.5468s data_time=0.1000s it/s=1.546 eta_to_10000=284.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0259 grad_action_out_proj_arms=0.2377 grad_arm_token_fuse=0.1414 grad_shared_expert=0.5979 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9560/10000 [2:08:25<05:02, 1.45it/s, loss=0.0183, lr=2.62e-06, step=9559] Training: 96%|█████████▌| 9560/10000 [2:08:25<05:02, 1.45it/s, loss=0.0075, lr=2.62e-06, step=9560] Training: 96%|█████████▌| 9561/10000 [2:08:25<04:44, 1.54it/s, loss=0.0075, lr=2.62e-06, step=9560] Training: 96%|█████████▌| 9561/10000 [2:08:25<04:44, 1.54it/s, loss=0.0052, lr=2.62e-06, step=9561] Training: 96%|█████████▌| 9562/10000 [2:08:26<04:52, 1.50it/s, loss=0.0052, lr=2.62e-06, step=9561] Training: 96%|█████████▌| 9562/10000 [2:08:26<04:52, 1.50it/s, loss=0.0017, lr=2.62e-06, step=9562] Training: 96%|█████████▌| 9563/10000 [2:08:27<04:28, 1.63it/s, loss=0.0017, lr=2.62e-06, step=9562] Training: 96%|█████████▌| 9563/10000 [2:08:27<04:28, 1.63it/s, loss=0.0134, lr=2.62e-06, step=9563] Training: 96%|█████████▌| 9564/10000 [2:08:27<04:33, 1.59it/s, loss=0.0134, lr=2.62e-06, step=9563] Training: 96%|█████████▌| 9564/10000 [2:08:27<04:33, 1.59it/s, loss=0.0255, lr=2.62e-06, step=9564] Training: 96%|█████████▌| 9565/10000 [2:08:28<05:01, 1.44it/s, loss=0.0255, lr=2.62e-06, step=9564] Training: 96%|█████████▌| 9565/10000 [2:08:28<05:01, 1.44it/s, loss=0.0087, lr=2.62e-06, step=9565] Training: 96%|█████████▌| 9566/10000 [2:08:29<05:33, 1.30it/s, loss=0.0087, lr=2.62e-06, step=9565] Training: 96%|█████████▌| 9566/10000 [2:08:29<05:33, 1.30it/s, loss=0.0036, lr=2.62e-06, step=9566] Training: 96%|█████████▌| 9567/10000 [2:08:30<05:19, 1.36it/s, loss=0.0036, lr=2.62e-06, step=9566] Training: 96%|█████████▌| 9567/10000 [2:08:30<05:19, 1.36it/s, loss=0.0015, lr=2.62e-06, step=9567] Training: 96%|█████████▌| 9568/10000 [2:08:30<04:55, 1.46it/s, loss=0.0015, lr=2.62e-06, step=9567] Training: 96%|█████████▌| 9568/10000 [2:08:30<04:55, 1.46it/s, loss=0.0101, lr=2.62e-06, step=9568] Training: 96%|█████████▌| 9569/10000 [2:08:31<04:31, 1.58it/s, loss=0.0101, lr=2.62e-06, step=9568] Training: 96%|█████████▌| 9569/10000 [2:08:31<04:31, 1.58it/s, loss=0.0121, lr=2.61e-06, step=9569]20:53:03.581 [I] step=9570 loss=0.0028 smoothed_loss=0.0085 lr=2.62e-06 grad_norm=0.3999 step_time=0.5431s data_time=0.1040s it/s=1.545 eta_to_10000=278.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0181 grad_action_out_proj_arms=0.1213 grad_arm_token_fuse=0.0920 grad_shared_expert=0.4166 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9570/10000 [2:08:31<04:18, 1.66it/s, loss=0.0121, lr=2.61e-06, step=9569] Training: 96%|█████████▌| 9570/10000 [2:08:31<04:18, 1.66it/s, loss=0.0028, lr=2.61e-06, step=9570] Training: 96%|█████████▌| 9571/10000 [2:08:32<04:45, 1.50it/s, loss=0.0028, lr=2.61e-06, step=9570] Training: 96%|█████████▌| 9571/10000 [2:08:32<04:45, 1.50it/s, loss=0.0176, lr=2.61e-06, step=9571] Training: 96%|█████████▌| 9572/10000 [2:08:33<05:18, 1.34it/s, loss=0.0176, lr=2.61e-06, step=9571] Training: 96%|█████████▌| 9572/10000 [2:08:33<05:18, 1.34it/s, loss=0.0256, lr=2.61e-06, step=9572] Training: 96%|█████████▌| 9573/10000 [2:08:34<05:06, 1.39it/s, loss=0.0256, lr=2.61e-06, step=9572] Training: 96%|█████████▌| 9573/10000 [2:08:34<05:06, 1.39it/s, loss=0.0107, lr=2.61e-06, step=9573] Training: 96%|█████████▌| 9574/10000 [2:08:34<04:59, 1.42it/s, loss=0.0107, lr=2.61e-06, step=9573] Training: 96%|█████████▌| 9574/10000 [2:08:34<04:59, 1.42it/s, loss=0.0067, lr=2.61e-06, step=9574] Training: 96%|█████████▌| 9575/10000 [2:08:35<04:36, 1.54it/s, loss=0.0067, lr=2.61e-06, step=9574] Training: 96%|█████████▌| 9575/10000 [2:08:35<04:36, 1.54it/s, loss=0.0041, lr=2.61e-06, step=9575] Training: 96%|█████████▌| 9576/10000 [2:08:35<04:16, 1.65it/s, loss=0.0041, lr=2.61e-06, step=9575] Training: 96%|█████████▌| 9576/10000 [2:08:35<04:16, 1.65it/s, loss=0.0045, lr=2.61e-06, step=9576] Training: 96%|█████████▌| 9577/10000 [2:08:36<04:06, 1.72it/s, loss=0.0045, lr=2.61e-06, step=9576] Training: 96%|█████████▌| 9577/10000 [2:08:36<04:06, 1.72it/s, loss=0.0023, lr=2.61e-06, step=9577] Training: 96%|█████████▌| 9578/10000 [2:08:37<04:31, 1.55it/s, loss=0.0023, lr=2.61e-06, step=9577] Training: 96%|█████████▌| 9578/10000 [2:08:37<04:31, 1.55it/s, loss=0.0038, lr=2.61e-06, step=9578] Training: 96%|█████████▌| 9579/10000 [2:08:38<05:16, 1.33it/s, loss=0.0038, lr=2.61e-06, step=9578] Training: 96%|█████████▌| 9579/10000 [2:08:38<05:16, 1.33it/s, loss=0.0011, lr=2.61e-06, step=9579]20:53:10.553 [I] step=9580 loss=0.0023 smoothed_loss=0.0069 lr=2.61e-06 grad_norm=0.3991 step_time=0.5929s data_time=0.1043s it/s=1.435 eta_to_10000=292.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0330 grad_action_out_proj_arms=0.1326 grad_arm_token_fuse=0.1745 grad_shared_expert=0.4861 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9580/10000 [2:08:38<04:51, 1.44it/s, loss=0.0011, lr=2.61e-06, step=9579] Training: 96%|█████████▌| 9580/10000 [2:08:38<04:51, 1.44it/s, loss=0.0023, lr=2.61e-06, step=9580] Training: 96%|█████████▌| 9581/10000 [2:08:39<04:56, 1.41it/s, loss=0.0023, lr=2.61e-06, step=9580] Training: 96%|█████████▌| 9581/10000 [2:08:39<04:56, 1.41it/s, loss=0.0050, lr=2.61e-06, step=9581] Training: 96%|█████████▌| 9582/10000 [2:08:40<04:38, 1.50it/s, loss=0.0050, lr=2.61e-06, step=9581] Training: 96%|█████████▌| 9582/10000 [2:08:40<04:38, 1.50it/s, loss=0.0012, lr=2.61e-06, step=9582] Training: 96%|█████████▌| 9583/10000 [2:08:40<04:26, 1.56it/s, loss=0.0012, lr=2.61e-06, step=9582] Training: 96%|█████████▌| 9583/10000 [2:08:40<04:26, 1.56it/s, loss=0.0019, lr=2.61e-06, step=9583] Training: 96%|█████████▌| 9584/10000 [2:08:41<04:07, 1.68it/s, loss=0.0019, lr=2.61e-06, step=9583] Training: 96%|█████████▌| 9584/10000 [2:08:41<04:07, 1.68it/s, loss=0.0063, lr=2.61e-06, step=9584] Training: 96%|█████████▌| 9585/10000 [2:08:41<04:14, 1.63it/s, loss=0.0063, lr=2.61e-06, step=9584] Training: 96%|█████████▌| 9585/10000 [2:08:41<04:14, 1.63it/s, loss=0.0053, lr=2.61e-06, step=9585] Training: 96%|█████████▌| 9586/10000 [2:08:42<04:39, 1.48it/s, loss=0.0053, lr=2.61e-06, step=9585] Training: 96%|█████████▌| 9586/10000 [2:08:42<04:39, 1.48it/s, loss=0.0125, lr=2.61e-06, step=9586] Training: 96%|█████████▌| 9587/10000 [2:08:43<04:22, 1.58it/s, loss=0.0125, lr=2.61e-06, step=9586] Training: 96%|█████████▌| 9587/10000 [2:08:43<04:22, 1.58it/s, loss=0.0038, lr=2.61e-06, step=9587] Training: 96%|█████████▌| 9588/10000 [2:08:43<04:24, 1.56it/s, loss=0.0038, lr=2.61e-06, step=9587] Training: 96%|█████████▌| 9588/10000 [2:08:43<04:24, 1.56it/s, loss=0.0170, lr=2.60e-06, step=9588] Training: 96%|█████████▌| 9589/10000 [2:08:44<04:36, 1.49it/s, loss=0.0170, lr=2.60e-06, step=9588] Training: 96%|█████████▌| 9589/10000 [2:08:44<04:36, 1.49it/s, loss=0.0005, lr=2.60e-06, step=9589]20:53:17.176 [I] step=9590 loss=0.0062 smoothed_loss=0.0065 lr=2.61e-06 grad_norm=0.4512 step_time=0.5469s data_time=0.1154s it/s=1.510 eta_to_10000=271.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1116 grad_arm_token_fuse=0.0979 grad_shared_expert=0.5571 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9590/10000 [2:08:45<04:54, 1.39it/s, loss=0.0005, lr=2.60e-06, step=9589] Training: 96%|█████████▌| 9590/10000 [2:08:45<04:54, 1.39it/s, loss=0.0062, lr=2.60e-06, step=9590] Training: 96%|█████████▌| 9591/10000 [2:08:45<04:39, 1.46it/s, loss=0.0062, lr=2.60e-06, step=9590] Training: 96%|█████████▌| 9591/10000 [2:08:45<04:39, 1.46it/s, loss=0.0140, lr=2.60e-06, step=9591] Training: 96%|█████████▌| 9592/10000 [2:08:46<04:20, 1.57it/s, loss=0.0140, lr=2.60e-06, step=9591] Training: 96%|█████████▌| 9592/10000 [2:08:46<04:20, 1.57it/s, loss=0.0038, lr=2.60e-06, step=9592] Training: 96%|█████████▌| 9593/10000 [2:08:47<05:06, 1.33it/s, loss=0.0038, lr=2.60e-06, step=9592] Training: 96%|█████████▌| 9593/10000 [2:08:47<05:06, 1.33it/s, loss=0.0043, lr=2.60e-06, step=9593] Training: 96%|█████████▌| 9594/10000 [2:08:48<04:36, 1.47it/s, loss=0.0043, lr=2.60e-06, step=9593] Training: 96%|█████████▌| 9594/10000 [2:08:48<04:36, 1.47it/s, loss=0.0027, lr=2.60e-06, step=9594] Training: 96%|█████████▌| 9595/10000 [2:08:48<04:12, 1.60it/s, loss=0.0027, lr=2.60e-06, step=9594] Training: 96%|█████████▌| 9595/10000 [2:08:48<04:12, 1.60it/s, loss=0.0079, lr=2.60e-06, step=9595] Training: 96%|█████████▌| 9596/10000 [2:08:49<04:15, 1.58it/s, loss=0.0079, lr=2.60e-06, step=9595] Training: 96%|█████████▌| 9596/10000 [2:08:49<04:15, 1.58it/s, loss=0.0193, lr=2.60e-06, step=9596] Training: 96%|█████████▌| 9597/10000 [2:08:49<04:00, 1.68it/s, loss=0.0193, lr=2.60e-06, step=9596] Training: 96%|█████████▌| 9597/10000 [2:08:49<04:00, 1.68it/s, loss=0.0057, lr=2.60e-06, step=9597] Training: 96%|█████████▌| 9598/10000 [2:08:50<03:47, 1.76it/s, loss=0.0057, lr=2.60e-06, step=9597] Training: 96%|█████████▌| 9598/10000 [2:08:50<03:47, 1.76it/s, loss=0.0115, lr=2.60e-06, step=9598] Training: 96%|█████████▌| 9599/10000 [2:08:50<03:40, 1.82it/s, loss=0.0115, lr=2.60e-06, step=9598] Training: 96%|█████████▌| 9599/10000 [2:08:50<03:40, 1.82it/s, loss=0.0047, lr=2.60e-06, step=9599]20:53:23.228 [I] step=9600 loss=0.0082 smoothed_loss=0.0077 lr=2.60e-06 grad_norm=0.3632 step_time=0.5268s data_time=0.0784s it/s=1.653 eta_to_10000=242.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0084 grad_action_out_proj_arms=0.0841 grad_arm_token_fuse=0.0438 grad_shared_expert=0.3796 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9600/10000 [2:08:51<04:00, 1.67it/s, loss=0.0047, lr=2.60e-06, step=9599] Training: 96%|█████████▌| 9600/10000 [2:08:51<04:00, 1.67it/s, loss=0.0082, lr=2.60e-06, step=9600] Training: 96%|█████████▌| 9601/10000 [2:08:52<04:00, 1.66it/s, loss=0.0082, lr=2.60e-06, step=9600] Training: 96%|█████████▌| 9601/10000 [2:08:52<04:00, 1.66it/s, loss=0.0039, lr=2.60e-06, step=9601] Training: 96%|█████████▌| 9602/10000 [2:08:52<04:12, 1.58it/s, loss=0.0039, lr=2.60e-06, step=9601] Training: 96%|█████████▌| 9602/10000 [2:08:52<04:12, 1.58it/s, loss=0.0067, lr=2.60e-06, step=9602] Training: 96%|█████████▌| 9603/10000 [2:08:53<04:22, 1.51it/s, loss=0.0067, lr=2.60e-06, step=9602] Training: 96%|█████████▌| 9603/10000 [2:08:53<04:22, 1.51it/s, loss=0.0056, lr=2.60e-06, step=9603] Training: 96%|█████████▌| 9604/10000 [2:08:53<04:01, 1.64it/s, loss=0.0056, lr=2.60e-06, step=9603] Training: 96%|█████████▌| 9604/10000 [2:08:53<04:01, 1.64it/s, loss=0.0026, lr=2.60e-06, step=9604] Training: 96%|█████████▌| 9605/10000 [2:08:54<03:57, 1.67it/s, loss=0.0026, lr=2.60e-06, step=9604] Training: 96%|█████████▌| 9605/10000 [2:08:54<03:57, 1.67it/s, loss=0.0063, lr=2.60e-06, step=9605] Training: 96%|█████████▌| 9606/10000 [2:08:55<03:48, 1.73it/s, loss=0.0063, lr=2.60e-06, step=9605] Training: 96%|█████████▌| 9606/10000 [2:08:55<03:48, 1.73it/s, loss=0.0118, lr=2.60e-06, step=9606] Training: 96%|█████████▌| 9607/10000 [2:08:55<04:05, 1.60it/s, loss=0.0118, lr=2.60e-06, step=9606] Training: 96%|█████████▌| 9607/10000 [2:08:55<04:05, 1.60it/s, loss=0.0045, lr=2.60e-06, step=9607] Training: 96%|█████████▌| 9608/10000 [2:08:56<03:51, 1.69it/s, loss=0.0045, lr=2.60e-06, step=9607] Training: 96%|█████████▌| 9608/10000 [2:08:56<03:51, 1.69it/s, loss=0.0036, lr=2.59e-06, step=9608] Training: 96%|█████████▌| 9609/10000 [2:08:56<03:39, 1.78it/s, loss=0.0036, lr=2.59e-06, step=9608] Training: 96%|█████████▌| 9609/10000 [2:08:56<03:39, 1.78it/s, loss=0.0055, lr=2.59e-06, step=9609]20:53:29.143 [I] step=9610 loss=0.0072 smoothed_loss=0.0065 lr=2.60e-06 grad_norm=0.3360 step_time=0.5053s data_time=0.0863s it/s=1.691 eta_to_10000=230.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0051 grad_action_out_proj_arms=0.0544 grad_arm_token_fuse=0.0262 grad_shared_expert=0.2211 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9610/10000 [2:08:57<03:36, 1.80it/s, loss=0.0055, lr=2.59e-06, step=9609] Training: 96%|█████████▌| 9610/10000 [2:08:57<03:36, 1.80it/s, loss=0.0072, lr=2.59e-06, step=9610] Training: 96%|█████████▌| 9611/10000 [2:08:58<04:21, 1.49it/s, loss=0.0072, lr=2.59e-06, step=9610] Training: 96%|█████████▌| 9611/10000 [2:08:58<04:21, 1.49it/s, loss=0.0053, lr=2.59e-06, step=9611] Training: 96%|█████████▌| 9612/10000 [2:08:58<04:17, 1.51it/s, loss=0.0053, lr=2.59e-06, step=9611] Training: 96%|█████████▌| 9612/10000 [2:08:58<04:17, 1.51it/s, loss=0.0017, lr=2.59e-06, step=9612] Training: 96%|█████████▌| 9613/10000 [2:08:59<03:56, 1.63it/s, loss=0.0017, lr=2.59e-06, step=9612] Training: 96%|█████████▌| 9613/10000 [2:08:59<03:56, 1.63it/s, loss=0.0095, lr=2.59e-06, step=9613] Training: 96%|█████████▌| 9614/10000 [2:09:00<04:16, 1.50it/s, loss=0.0095, lr=2.59e-06, step=9613] Training: 96%|█████████▌| 9614/10000 [2:09:00<04:16, 1.50it/s, loss=0.0015, lr=2.59e-06, step=9614] Training: 96%|█████████▌| 9615/10000 [2:09:00<04:22, 1.46it/s, loss=0.0015, lr=2.59e-06, step=9614] Training: 96%|█████████▌| 9615/10000 [2:09:00<04:22, 1.46it/s, loss=0.0042, lr=2.59e-06, step=9615] Training: 96%|█████████▌| 9616/10000 [2:09:01<03:58, 1.61it/s, loss=0.0042, lr=2.59e-06, step=9615] Training: 96%|█████████▌| 9616/10000 [2:09:01<03:58, 1.61it/s, loss=0.0118, lr=2.59e-06, step=9616] Training: 96%|█████████▌| 9617/10000 [2:09:02<04:15, 1.50it/s, loss=0.0118, lr=2.59e-06, step=9616] Training: 96%|█████████▌| 9617/10000 [2:09:02<04:15, 1.50it/s, loss=0.0229, lr=2.59e-06, step=9617] Training: 96%|█████████▌| 9618/10000 [2:09:02<04:25, 1.44it/s, loss=0.0229, lr=2.59e-06, step=9617] Training: 96%|█████████▌| 9618/10000 [2:09:02<04:25, 1.44it/s, loss=0.0101, lr=2.59e-06, step=9618] Training: 96%|█████████▌| 9619/10000 [2:09:03<04:22, 1.45it/s, loss=0.0101, lr=2.59e-06, step=9618] Training: 96%|█████████▌| 9619/10000 [2:09:03<04:22, 1.45it/s, loss=0.0020, lr=2.59e-06, step=9619]20:53:36.133 [I] step=9620 loss=0.0081 smoothed_loss=0.0076 lr=2.59e-06 grad_norm=0.3703 step_time=0.5863s data_time=0.1127s it/s=1.431 eta_to_10000=265.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1007 grad_arm_token_fuse=0.0794 grad_shared_expert=0.4483 (18633:train_pytorch.py:850) + Training: 96%|█████████▌| 9620/10000 [2:09:04<04:23, 1.44it/s, loss=0.0020, lr=2.59e-06, step=9619] Training: 96%|█████████▌| 9620/10000 [2:09:04<04:23, 1.44it/s, loss=0.0081, lr=2.59e-06, step=9620] Training: 96%|█████████▌| 9621/10000 [2:09:05<04:43, 1.34it/s, loss=0.0081, lr=2.59e-06, step=9620] Training: 96%|█████████▌| 9621/10000 [2:09:05<04:43, 1.34it/s, loss=0.0022, lr=2.59e-06, step=9621] Training: 96%|█████████▌| 9622/10000 [2:09:06<05:14, 1.20it/s, loss=0.0022, lr=2.59e-06, step=9621] Training: 96%|█████████▌| 9622/10000 [2:09:06<05:14, 1.20it/s, loss=0.0024, lr=2.59e-06, step=9622] Training: 96%|█████████▌| 9623/10000 [2:09:06<05:08, 1.22it/s, loss=0.0024, lr=2.59e-06, step=9622] Training: 96%|█████████▌| 9623/10000 [2:09:06<05:08, 1.22it/s, loss=0.0016, lr=2.59e-06, step=9623] Training: 96%|█████████▌| 9624/10000 [2:09:07<04:46, 1.31it/s, loss=0.0016, lr=2.59e-06, step=9623] Training: 96%|█████████▌| 9624/10000 [2:09:07<04:46, 1.31it/s, loss=0.0071, lr=2.59e-06, step=9624] Training: 96%|█████████▋| 9625/10000 [2:09:08<04:14, 1.47it/s, loss=0.0071, lr=2.59e-06, step=9624] Training: 96%|█████████▋| 9625/10000 [2:09:08<04:14, 1.47it/s, loss=0.0064, lr=2.59e-06, step=9625] Training: 96%|█████████▋| 9626/10000 [2:09:09<04:38, 1.34it/s, loss=0.0064, lr=2.59e-06, step=9625] Training: 96%|█████████▋| 9626/10000 [2:09:09<04:38, 1.34it/s, loss=0.0136, lr=2.59e-06, step=9626] Training: 96%|█████████▋| 9627/10000 [2:09:09<04:35, 1.35it/s, loss=0.0136, lr=2.59e-06, step=9626] Training: 96%|█████████▋| 9627/10000 [2:09:09<04:35, 1.35it/s, loss=0.0055, lr=2.59e-06, step=9627] Training: 96%|█████████▋| 9628/10000 [2:09:10<04:49, 1.29it/s, loss=0.0055, lr=2.59e-06, step=9627] Training: 96%|█████████▋| 9628/10000 [2:09:10<04:49, 1.29it/s, loss=0.0091, lr=2.59e-06, step=9628] Training: 96%|█████████▋| 9629/10000 [2:09:11<04:43, 1.31it/s, loss=0.0091, lr=2.59e-06, step=9628] Training: 96%|█████████▋| 9629/10000 [2:09:11<04:43, 1.31it/s, loss=0.0192, lr=2.59e-06, step=9629]20:53:43.900 [I] step=9630 loss=0.0011 smoothed_loss=0.0075 lr=2.59e-06 grad_norm=0.3769 step_time=0.6311s data_time=0.1455s it/s=1.288 eta_to_10000=287.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0029 grad_action_out_proj_arms=0.0367 grad_arm_token_fuse=0.0156 grad_shared_expert=0.2092 (18633:train_pytorch.py:850) + Training: 96%|█████████▋| 9630/10000 [2:09:12<04:40, 1.32it/s, loss=0.0192, lr=2.59e-06, step=9629] Training: 96%|█████████▋| 9630/10000 [2:09:12<04:40, 1.32it/s, loss=0.0011, lr=2.58e-06, step=9630] Training: 96%|█████████▋| 9631/10000 [2:09:12<04:22, 1.40it/s, loss=0.0011, lr=2.58e-06, step=9630] Training: 96%|█████████▋| 9631/10000 [2:09:12<04:22, 1.40it/s, loss=0.0030, lr=2.58e-06, step=9631] Training: 96%|█████████▋| 9632/10000 [2:09:13<04:21, 1.41it/s, loss=0.0030, lr=2.58e-06, step=9631] Training: 96%|█████████▋| 9632/10000 [2:09:13<04:21, 1.41it/s, loss=0.0102, lr=2.58e-06, step=9632] Training: 96%|█████████▋| 9633/10000 [2:09:14<04:33, 1.34it/s, loss=0.0102, lr=2.58e-06, step=9632] Training: 96%|█████████▋| 9633/10000 [2:09:14<04:33, 1.34it/s, loss=0.0033, lr=2.58e-06, step=9633] Training: 96%|█████████▋| 9634/10000 [2:09:14<04:05, 1.49it/s, loss=0.0033, lr=2.58e-06, step=9633] Training: 96%|█████████▋| 9634/10000 [2:09:14<04:05, 1.49it/s, loss=0.0030, lr=2.58e-06, step=9634] Training: 96%|█████████▋| 9635/10000 [2:09:15<03:47, 1.60it/s, loss=0.0030, lr=2.58e-06, step=9634] Training: 96%|█████████▋| 9635/10000 [2:09:15<03:47, 1.60it/s, loss=0.0275, lr=2.58e-06, step=9635] Training: 96%|█████████▋| 9636/10000 [2:09:16<04:18, 1.41it/s, loss=0.0275, lr=2.58e-06, step=9635] Training: 96%|█████████▋| 9636/10000 [2:09:16<04:18, 1.41it/s, loss=0.0174, lr=2.58e-06, step=9636] Training: 96%|█████████▋| 9637/10000 [2:09:16<04:02, 1.50it/s, loss=0.0174, lr=2.58e-06, step=9636] Training: 96%|█████████▋| 9637/10000 [2:09:16<04:02, 1.50it/s, loss=0.0033, lr=2.58e-06, step=9637] Training: 96%|█████████▋| 9638/10000 [2:09:17<04:07, 1.46it/s, loss=0.0033, lr=2.58e-06, step=9637] Training: 96%|█████████▋| 9638/10000 [2:09:17<04:07, 1.46it/s, loss=0.0235, lr=2.58e-06, step=9638] Training: 96%|█████████▋| 9639/10000 [2:09:17<03:54, 1.54it/s, loss=0.0235, lr=2.58e-06, step=9638] Training: 96%|█████████▋| 9639/10000 [2:09:17<03:54, 1.54it/s, loss=0.0062, lr=2.58e-06, step=9639]20:53:50.541 [I] step=9640 loss=0.0043 smoothed_loss=0.0094 lr=2.58e-06 grad_norm=0.5049 step_time=0.5545s data_time=0.1096s it/s=1.506 eta_to_10000=239.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0212 grad_action_out_proj_arms=0.1238 grad_arm_token_fuse=0.1185 grad_shared_expert=0.4487 (18633:train_pytorch.py:850) + Training: 96%|█████████▋| 9640/10000 [2:09:18<04:01, 1.49it/s, loss=0.0062, lr=2.58e-06, step=9639] Training: 96%|█████████▋| 9640/10000 [2:09:18<04:01, 1.49it/s, loss=0.0043, lr=2.58e-06, step=9640] Training: 96%|█████████▋| 9641/10000 [2:09:19<04:20, 1.38it/s, loss=0.0043, lr=2.58e-06, step=9640] Training: 96%|█████████▋| 9641/10000 [2:09:19<04:20, 1.38it/s, loss=0.0022, lr=2.58e-06, step=9641] Training: 96%|█████████▋| 9642/10000 [2:09:20<04:01, 1.48it/s, loss=0.0022, lr=2.58e-06, step=9641] Training: 96%|█████████▋| 9642/10000 [2:09:20<04:01, 1.48it/s, loss=0.0048, lr=2.58e-06, step=9642] Training: 96%|█████████▋| 9643/10000 [2:09:20<04:05, 1.45it/s, loss=0.0048, lr=2.58e-06, step=9642] Training: 96%|█████████▋| 9643/10000 [2:09:20<04:05, 1.45it/s, loss=0.0032, lr=2.58e-06, step=9643] Training: 96%|█████████▋| 9644/10000 [2:09:21<03:44, 1.59it/s, loss=0.0032, lr=2.58e-06, step=9643] Training: 96%|█████████▋| 9644/10000 [2:09:21<03:44, 1.59it/s, loss=0.0045, lr=2.58e-06, step=9644] Training: 96%|█████████▋| 9645/10000 [2:09:21<03:32, 1.67it/s, loss=0.0045, lr=2.58e-06, step=9644] Training: 96%|█████████▋| 9645/10000 [2:09:21<03:32, 1.67it/s, loss=0.0034, lr=2.58e-06, step=9645] Training: 96%|█████████▋| 9646/10000 [2:09:22<03:47, 1.55it/s, loss=0.0034, lr=2.58e-06, step=9645] Training: 96%|█████████▋| 9646/10000 [2:09:22<03:47, 1.55it/s, loss=0.0069, lr=2.58e-06, step=9646] Training: 96%|█████████▋| 9647/10000 [2:09:23<03:49, 1.54it/s, loss=0.0069, lr=2.58e-06, step=9646] Training: 96%|█████████▋| 9647/10000 [2:09:23<03:49, 1.54it/s, loss=0.0145, lr=2.58e-06, step=9647] Training: 96%|█████████▋| 9648/10000 [2:09:24<04:18, 1.36it/s, loss=0.0145, lr=2.58e-06, step=9647] Training: 96%|█████████▋| 9648/10000 [2:09:24<04:18, 1.36it/s, loss=0.0114, lr=2.58e-06, step=9648] Training: 96%|█████████▋| 9649/10000 [2:09:24<04:06, 1.43it/s, loss=0.0114, lr=2.58e-06, step=9648] Training: 96%|█████████▋| 9649/10000 [2:09:24<04:06, 1.43it/s, loss=0.0040, lr=2.58e-06, step=9649]20:53:57.571 [I] step=9650 loss=0.0058 smoothed_loss=0.0075 lr=2.58e-06 grad_norm=0.3808 step_time=0.5825s data_time=0.1204s it/s=1.423 eta_to_10000=246.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0052 grad_action_out_proj_arms=0.0695 grad_arm_token_fuse=0.0270 grad_shared_expert=0.2840 (18633:train_pytorch.py:850) + Training: 96%|█████████▋| 9650/10000 [2:09:25<04:27, 1.31it/s, loss=0.0040, lr=2.58e-06, step=9649] Training: 96%|█████████▋| 9650/10000 [2:09:25<04:27, 1.31it/s, loss=0.0058, lr=2.58e-06, step=9650] Training: 97%|█████████▋| 9651/10000 [2:09:26<04:15, 1.37it/s, loss=0.0058, lr=2.58e-06, step=9650] Training: 97%|█████████▋| 9651/10000 [2:09:26<04:15, 1.37it/s, loss=0.0076, lr=2.58e-06, step=9651] Training: 97%|█████████▋| 9652/10000 [2:09:27<04:12, 1.38it/s, loss=0.0076, lr=2.58e-06, step=9651] Training: 97%|█████████▋| 9652/10000 [2:09:27<04:12, 1.38it/s, loss=0.0115, lr=2.57e-06, step=9652] Training: 97%|█████████▋| 9653/10000 [2:09:27<04:24, 1.31it/s, loss=0.0115, lr=2.57e-06, step=9652] Training: 97%|█████████▋| 9653/10000 [2:09:27<04:24, 1.31it/s, loss=0.0135, lr=2.57e-06, step=9653] Training: 97%|█████████▋| 9654/10000 [2:09:28<04:03, 1.42it/s, loss=0.0135, lr=2.57e-06, step=9653] Training: 97%|█████████▋| 9654/10000 [2:09:28<04:03, 1.42it/s, loss=0.0046, lr=2.57e-06, step=9654] Training: 97%|█████████▋| 9655/10000 [2:09:29<04:32, 1.27it/s, loss=0.0046, lr=2.57e-06, step=9654] Training: 97%|█████████▋| 9655/10000 [2:09:29<04:32, 1.27it/s, loss=0.0051, lr=2.57e-06, step=9655] Training: 97%|█████████▋| 9656/10000 [2:09:30<04:14, 1.35it/s, loss=0.0051, lr=2.57e-06, step=9655] Training: 97%|█████████▋| 9656/10000 [2:09:30<04:14, 1.35it/s, loss=0.0041, lr=2.57e-06, step=9656] Training: 97%|█████████▋| 9657/10000 [2:09:31<04:30, 1.27it/s, loss=0.0041, lr=2.57e-06, step=9656] Training: 97%|█████████▋| 9657/10000 [2:09:31<04:30, 1.27it/s, loss=0.0107, lr=2.57e-06, step=9657] Training: 97%|█████████▋| 9658/10000 [2:09:31<04:05, 1.39it/s, loss=0.0107, lr=2.57e-06, step=9657] Training: 97%|█████████▋| 9658/10000 [2:09:31<04:05, 1.39it/s, loss=0.0053, lr=2.57e-06, step=9658] Training: 97%|█████████▋| 9659/10000 [2:09:32<03:55, 1.45it/s, loss=0.0053, lr=2.57e-06, step=9658] Training: 97%|█████████▋| 9659/10000 [2:09:32<03:55, 1.45it/s, loss=0.0037, lr=2.57e-06, step=9659]20:54:04.713 [I] step=9660 loss=0.0168 smoothed_loss=0.0081 lr=2.57e-06 grad_norm=0.4655 step_time=0.5711s data_time=0.1432s it/s=1.400 eta_to_10000=242.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0306 grad_action_out_proj_arms=0.1874 grad_arm_token_fuse=0.1749 grad_shared_expert=0.6150 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9660/10000 [2:09:32<03:51, 1.47it/s, loss=0.0037, lr=2.57e-06, step=9659] Training: 97%|█████████▋| 9660/10000 [2:09:32<03:51, 1.47it/s, loss=0.0168, lr=2.57e-06, step=9660] Training: 97%|█████████▋| 9661/10000 [2:09:33<04:02, 1.40it/s, loss=0.0168, lr=2.57e-06, step=9660] Training: 97%|█████████▋| 9661/10000 [2:09:33<04:02, 1.40it/s, loss=0.0034, lr=2.57e-06, step=9661] Training: 97%|█████████▋| 9662/10000 [2:09:34<04:07, 1.37it/s, loss=0.0034, lr=2.57e-06, step=9661] Training: 97%|█████████▋| 9662/10000 [2:09:34<04:07, 1.37it/s, loss=0.0013, lr=2.57e-06, step=9662] Training: 97%|█████████▋| 9663/10000 [2:09:35<04:31, 1.24it/s, loss=0.0013, lr=2.57e-06, step=9662] Training: 97%|█████████▋| 9663/10000 [2:09:35<04:31, 1.24it/s, loss=0.0088, lr=2.57e-06, step=9663] Training: 97%|█████████▋| 9664/10000 [2:09:36<04:48, 1.17it/s, loss=0.0088, lr=2.57e-06, step=9663] Training: 97%|█████████▋| 9664/10000 [2:09:36<04:48, 1.17it/s, loss=0.0103, lr=2.57e-06, step=9664] Training: 97%|█████████▋| 9665/10000 [2:09:37<04:40, 1.19it/s, loss=0.0103, lr=2.57e-06, step=9664] Training: 97%|█████████▋| 9665/10000 [2:09:37<04:40, 1.19it/s, loss=0.0046, lr=2.57e-06, step=9665] Training: 97%|█████████▋| 9666/10000 [2:09:38<04:57, 1.12it/s, loss=0.0046, lr=2.57e-06, step=9665] Training: 97%|█████████▋| 9666/10000 [2:09:38<04:57, 1.12it/s, loss=0.0077, lr=2.57e-06, step=9666] Training: 97%|█████████▋| 9667/10000 [2:09:38<04:43, 1.17it/s, loss=0.0077, lr=2.57e-06, step=9666] Training: 97%|█████████▋| 9667/10000 [2:09:38<04:43, 1.17it/s, loss=0.0925, lr=2.57e-06, step=9667] Training: 97%|█████████▋| 9668/10000 [2:09:39<04:47, 1.16it/s, loss=0.0925, lr=2.57e-06, step=9667] Training: 97%|█████████▋| 9668/10000 [2:09:39<04:47, 1.16it/s, loss=0.0279, lr=2.57e-06, step=9668] Training: 97%|█████████▋| 9669/10000 [2:09:40<04:48, 1.15it/s, loss=0.0279, lr=2.57e-06, step=9668] Training: 97%|█████████▋| 9669/10000 [2:09:40<04:48, 1.15it/s, loss=0.0041, lr=2.57e-06, step=9669]20:54:13.468 [I] step=9670 loss=0.0110 smoothed_loss=0.0152 lr=2.57e-06 grad_norm=0.4949 step_time=0.6812s data_time=0.1942s it/s=1.142 eta_to_10000=288.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0107 grad_action_out_proj_arms=0.0753 grad_arm_token_fuse=0.0563 grad_shared_expert=0.2774 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9670/10000 [2:09:41<04:48, 1.14it/s, loss=0.0041, lr=2.57e-06, step=9669] Training: 97%|█████████▋| 9670/10000 [2:09:41<04:48, 1.14it/s, loss=0.0110, lr=2.57e-06, step=9670] Training: 97%|█████████▋| 9671/10000 [2:09:42<04:32, 1.21it/s, loss=0.0110, lr=2.57e-06, step=9670] Training: 97%|█████████▋| 9671/10000 [2:09:42<04:32, 1.21it/s, loss=0.0021, lr=2.57e-06, step=9671] Training: 97%|█████████▋| 9672/10000 [2:09:43<04:40, 1.17it/s, loss=0.0021, lr=2.57e-06, step=9671] Training: 97%|█████████▋| 9672/10000 [2:09:43<04:40, 1.17it/s, loss=0.0026, lr=2.57e-06, step=9672] Training: 97%|█████████▋| 9673/10000 [2:09:43<04:08, 1.32it/s, loss=0.0026, lr=2.57e-06, step=9672] Training: 97%|█████████▋| 9673/10000 [2:09:43<04:08, 1.32it/s, loss=0.0079, lr=2.57e-06, step=9673] Training: 97%|█████████▋| 9674/10000 [2:09:44<03:43, 1.46it/s, loss=0.0079, lr=2.57e-06, step=9673] Training: 97%|█████████▋| 9674/10000 [2:09:44<03:43, 1.46it/s, loss=0.0073, lr=2.57e-06, step=9674] Training: 97%|█████████▋| 9675/10000 [2:09:45<03:43, 1.46it/s, loss=0.0073, lr=2.57e-06, step=9674] Training: 97%|█████████▋| 9675/10000 [2:09:45<03:43, 1.46it/s, loss=0.0024, lr=2.57e-06, step=9675] Training: 97%|█████████▋| 9676/10000 [2:09:45<04:05, 1.32it/s, loss=0.0024, lr=2.57e-06, step=9675] Training: 97%|█████████▋| 9676/10000 [2:09:45<04:05, 1.32it/s, loss=0.0041, lr=2.56e-06, step=9676] Training: 97%|█████████▋| 9677/10000 [2:09:46<04:09, 1.29it/s, loss=0.0041, lr=2.56e-06, step=9676] Training: 97%|█████████▋| 9677/10000 [2:09:46<04:09, 1.29it/s, loss=0.0062, lr=2.56e-06, step=9677] Training: 97%|█████████▋| 9678/10000 [2:09:47<04:45, 1.13it/s, loss=0.0062, lr=2.56e-06, step=9677] Training: 97%|█████████▋| 9678/10000 [2:09:47<04:45, 1.13it/s, loss=0.0145, lr=2.56e-06, step=9678] Training: 97%|█████████▋| 9679/10000 [2:09:48<04:55, 1.09it/s, loss=0.0145, lr=2.56e-06, step=9678] Training: 97%|█████████▋| 9679/10000 [2:09:48<04:55, 1.09it/s, loss=0.0015, lr=2.56e-06, step=9679]20:54:21.278 [I] step=9680 loss=0.0014 smoothed_loss=0.0086 lr=2.57e-06 grad_norm=0.2976 step_time=0.6411s data_time=0.1399s it/s=1.280 eta_to_10000=249.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0045 grad_action_out_proj_arms=0.0598 grad_arm_token_fuse=0.0240 grad_shared_expert=0.1726 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9680/10000 [2:09:49<04:19, 1.24it/s, loss=0.0015, lr=2.56e-06, step=9679] Training: 97%|█████████▋| 9680/10000 [2:09:49<04:19, 1.24it/s, loss=0.0014, lr=2.56e-06, step=9680] Training: 97%|█████████▋| 9681/10000 [2:09:50<03:58, 1.34it/s, loss=0.0014, lr=2.56e-06, step=9680] Training: 97%|█████████▋| 9681/10000 [2:09:50<03:58, 1.34it/s, loss=0.0214, lr=2.56e-06, step=9681] Training: 97%|█████████▋| 9682/10000 [2:09:50<03:51, 1.38it/s, loss=0.0214, lr=2.56e-06, step=9681] Training: 97%|█████████▋| 9682/10000 [2:09:50<03:51, 1.38it/s, loss=0.0108, lr=2.56e-06, step=9682] Training: 97%|█████████▋| 9683/10000 [2:09:51<04:06, 1.29it/s, loss=0.0108, lr=2.56e-06, step=9682] Training: 97%|█████████▋| 9683/10000 [2:09:51<04:06, 1.29it/s, loss=0.0510, lr=2.56e-06, step=9683] Training: 97%|█████████▋| 9684/10000 [2:09:52<04:07, 1.28it/s, loss=0.0510, lr=2.56e-06, step=9683] Training: 97%|█████████▋| 9684/10000 [2:09:52<04:07, 1.28it/s, loss=0.0064, lr=2.56e-06, step=9684] Training: 97%|█████████▋| 9685/10000 [2:09:53<04:16, 1.23it/s, loss=0.0064, lr=2.56e-06, step=9684] Training: 97%|█████████▋| 9685/10000 [2:09:53<04:16, 1.23it/s, loss=0.0162, lr=2.56e-06, step=9685] Training: 97%|█████████▋| 9686/10000 [2:09:54<04:36, 1.13it/s, loss=0.0162, lr=2.56e-06, step=9685] Training: 97%|█████████▋| 9686/10000 [2:09:54<04:36, 1.13it/s, loss=0.0043, lr=2.56e-06, step=9686] Training: 97%|█████████▋| 9687/10000 [2:09:55<05:06, 1.02it/s, loss=0.0043, lr=2.56e-06, step=9686] Training: 97%|█████████▋| 9687/10000 [2:09:55<05:06, 1.02it/s, loss=0.0409, lr=2.56e-06, step=9687] Training: 97%|█████████▋| 9688/10000 [2:09:56<04:28, 1.16it/s, loss=0.0409, lr=2.56e-06, step=9687] Training: 97%|█████████▋| 9688/10000 [2:09:56<04:28, 1.16it/s, loss=0.0057, lr=2.56e-06, step=9688] Training: 97%|█████████▋| 9689/10000 [2:09:56<03:58, 1.31it/s, loss=0.0057, lr=2.56e-06, step=9688] Training: 97%|█████████▋| 9689/10000 [2:09:56<03:58, 1.31it/s, loss=0.0761, lr=2.56e-06, step=9689]20:54:29.300 [I] step=9690 loss=0.0030 smoothed_loss=0.0189 lr=2.56e-06 grad_norm=0.4810 step_time=0.6373s data_time=0.1648s it/s=1.247 eta_to_10000=248.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0055 grad_action_out_proj_arms=0.0607 grad_arm_token_fuse=0.0290 grad_shared_expert=0.4035 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9690/10000 [2:09:57<03:59, 1.29it/s, loss=0.0761, lr=2.56e-06, step=9689] Training: 97%|█████████▋| 9690/10000 [2:09:57<03:59, 1.29it/s, loss=0.0030, lr=2.56e-06, step=9690] Training: 97%|█████████▋| 9691/10000 [2:09:58<03:37, 1.42it/s, loss=0.0030, lr=2.56e-06, step=9690] Training: 97%|█████████▋| 9691/10000 [2:09:58<03:37, 1.42it/s, loss=0.0064, lr=2.56e-06, step=9691] Training: 97%|█████████▋| 9692/10000 [2:09:58<03:31, 1.46it/s, loss=0.0064, lr=2.56e-06, step=9691] Training: 97%|█████████▋| 9692/10000 [2:09:58<03:31, 1.46it/s, loss=0.0025, lr=2.56e-06, step=9692] Training: 97%|█████████▋| 9693/10000 [2:09:59<03:34, 1.43it/s, loss=0.0025, lr=2.56e-06, step=9692] Training: 97%|█████████▋| 9693/10000 [2:09:59<03:34, 1.43it/s, loss=0.0217, lr=2.56e-06, step=9693] Training: 97%|█████████▋| 9694/10000 [2:10:00<03:56, 1.29it/s, loss=0.0217, lr=2.56e-06, step=9693] Training: 97%|█████████▋| 9694/10000 [2:10:00<03:56, 1.29it/s, loss=0.0159, lr=2.56e-06, step=9694] Training: 97%|█████████▋| 9695/10000 [2:10:00<03:39, 1.39it/s, loss=0.0159, lr=2.56e-06, step=9694] Training: 97%|█████████▋| 9695/10000 [2:10:00<03:39, 1.39it/s, loss=0.0007, lr=2.56e-06, step=9695] Training: 97%|█████████▋| 9696/10000 [2:10:01<03:30, 1.45it/s, loss=0.0007, lr=2.56e-06, step=9695] Training: 97%|█████████▋| 9696/10000 [2:10:01<03:30, 1.45it/s, loss=0.0032, lr=2.56e-06, step=9696] Training: 97%|█████████▋| 9697/10000 [2:10:02<03:40, 1.37it/s, loss=0.0032, lr=2.56e-06, step=9696] Training: 97%|█████████▋| 9697/10000 [2:10:02<03:40, 1.37it/s, loss=0.0027, lr=2.56e-06, step=9697] Training: 97%|█████████▋| 9698/10000 [2:10:02<03:27, 1.45it/s, loss=0.0027, lr=2.56e-06, step=9697] Training: 97%|█████████▋| 9698/10000 [2:10:02<03:27, 1.45it/s, loss=0.0060, lr=2.56e-06, step=9698] Training: 97%|█████████▋| 9699/10000 [2:10:03<03:35, 1.40it/s, loss=0.0060, lr=2.56e-06, step=9698] Training: 97%|█████████▋| 9699/10000 [2:10:03<03:35, 1.40it/s, loss=0.0018, lr=2.56e-06, step=9699]20:54:36.541 [I] step=9700 loss=0.0032 smoothed_loss=0.0102 lr=2.56e-06 grad_norm=0.3934 step_time=0.5738s data_time=0.1503s it/s=1.381 eta_to_10000=217.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0073 grad_action_out_proj_arms=0.0822 grad_arm_token_fuse=0.0371 grad_shared_expert=0.3415 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9700/10000 [2:10:04<03:57, 1.26it/s, loss=0.0018, lr=2.56e-06, step=9699] Training: 97%|█████████▋| 9700/10000 [2:10:04<03:57, 1.26it/s, loss=0.0032, lr=2.56e-06, step=9700] Training: 97%|█████████▋| 9701/10000 [2:10:05<03:41, 1.35it/s, loss=0.0032, lr=2.56e-06, step=9700] Training: 97%|█████████▋| 9701/10000 [2:10:05<03:41, 1.35it/s, loss=0.0076, lr=2.56e-06, step=9701] Training: 97%|█████████▋| 9702/10000 [2:10:05<03:32, 1.40it/s, loss=0.0076, lr=2.56e-06, step=9701] Training: 97%|█████████▋| 9702/10000 [2:10:05<03:32, 1.40it/s, loss=0.0036, lr=2.55e-06, step=9702] Training: 97%|█████████▋| 9703/10000 [2:10:06<03:18, 1.49it/s, loss=0.0036, lr=2.55e-06, step=9702] Training: 97%|█████████▋| 9703/10000 [2:10:06<03:18, 1.49it/s, loss=0.0144, lr=2.55e-06, step=9703] Training: 97%|█████████▋| 9704/10000 [2:10:07<03:05, 1.60it/s, loss=0.0144, lr=2.55e-06, step=9703] Training: 97%|█████████▋| 9704/10000 [2:10:07<03:05, 1.60it/s, loss=0.0135, lr=2.55e-06, step=9704] Training: 97%|█████████▋| 9705/10000 [2:10:07<03:26, 1.43it/s, loss=0.0135, lr=2.55e-06, step=9704] Training: 97%|█████████▋| 9705/10000 [2:10:07<03:26, 1.43it/s, loss=0.0091, lr=2.55e-06, step=9705] Training: 97%|█████████▋| 9706/10000 [2:10:08<03:13, 1.52it/s, loss=0.0091, lr=2.55e-06, step=9705] Training: 97%|█████████▋| 9706/10000 [2:10:08<03:13, 1.52it/s, loss=0.0296, lr=2.55e-06, step=9706] Training: 97%|█████████▋| 9707/10000 [2:10:09<03:37, 1.35it/s, loss=0.0296, lr=2.55e-06, step=9706] Training: 97%|█████████▋| 9707/10000 [2:10:09<03:37, 1.35it/s, loss=0.0029, lr=2.55e-06, step=9707] Training: 97%|█████████▋| 9708/10000 [2:10:10<03:26, 1.41it/s, loss=0.0029, lr=2.55e-06, step=9707] Training: 97%|█████████▋| 9708/10000 [2:10:10<03:26, 1.41it/s, loss=0.0185, lr=2.55e-06, step=9708] Training: 97%|█████████▋| 9709/10000 [2:10:10<03:11, 1.52it/s, loss=0.0185, lr=2.55e-06, step=9708] Training: 97%|█████████▋| 9709/10000 [2:10:10<03:11, 1.52it/s, loss=0.0032, lr=2.55e-06, step=9709]20:54:43.096 [I] step=9710 loss=0.0044 smoothed_loss=0.0103 lr=2.55e-06 grad_norm=0.4788 step_time=0.5345s data_time=0.1210s it/s=1.526 eta_to_10000=190.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0137 grad_action_out_proj_arms=0.1480 grad_arm_token_fuse=0.0762 grad_shared_expert=0.3811 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9710/10000 [2:10:11<03:09, 1.53it/s, loss=0.0032, lr=2.55e-06, step=9709] Training: 97%|█████████▋| 9710/10000 [2:10:11<03:09, 1.53it/s, loss=0.0044, lr=2.55e-06, step=9710] Training: 97%|█████████▋| 9711/10000 [2:10:11<03:13, 1.49it/s, loss=0.0044, lr=2.55e-06, step=9710] Training: 97%|█████████▋| 9711/10000 [2:10:11<03:13, 1.49it/s, loss=0.0090, lr=2.55e-06, step=9711] Training: 97%|█████████▋| 9712/10000 [2:10:12<03:02, 1.58it/s, loss=0.0090, lr=2.55e-06, step=9711] Training: 97%|█████████▋| 9712/10000 [2:10:12<03:02, 1.58it/s, loss=0.0122, lr=2.55e-06, step=9712] Training: 97%|█████████▋| 9713/10000 [2:10:13<02:53, 1.65it/s, loss=0.0122, lr=2.55e-06, step=9712] Training: 97%|█████████▋| 9713/10000 [2:10:13<02:53, 1.65it/s, loss=0.0044, lr=2.55e-06, step=9713] Training: 97%|█████████▋| 9714/10000 [2:10:13<03:08, 1.52it/s, loss=0.0044, lr=2.55e-06, step=9713] Training: 97%|█████████▋| 9714/10000 [2:10:13<03:08, 1.52it/s, loss=0.0048, lr=2.55e-06, step=9714] Training: 97%|█████████▋| 9715/10000 [2:10:14<03:16, 1.45it/s, loss=0.0048, lr=2.55e-06, step=9714] Training: 97%|█████████▋| 9715/10000 [2:10:14<03:16, 1.45it/s, loss=0.0058, lr=2.55e-06, step=9715] Training: 97%|█████████▋| 9716/10000 [2:10:15<03:30, 1.35it/s, loss=0.0058, lr=2.55e-06, step=9715] Training: 97%|█████████▋| 9716/10000 [2:10:15<03:30, 1.35it/s, loss=0.0030, lr=2.55e-06, step=9716] Training: 97%|█████████▋| 9717/10000 [2:10:16<03:17, 1.43it/s, loss=0.0030, lr=2.55e-06, step=9716] Training: 97%|█████████▋| 9717/10000 [2:10:16<03:17, 1.43it/s, loss=0.0036, lr=2.55e-06, step=9717] Training: 97%|█████████▋| 9718/10000 [2:10:16<03:35, 1.31it/s, loss=0.0036, lr=2.55e-06, step=9717] Training: 97%|█████████▋| 9718/10000 [2:10:16<03:35, 1.31it/s, loss=0.0039, lr=2.55e-06, step=9718] Training: 97%|█████████▋| 9719/10000 [2:10:17<03:23, 1.38it/s, loss=0.0039, lr=2.55e-06, step=9718] Training: 97%|█████████▋| 9719/10000 [2:10:17<03:23, 1.38it/s, loss=0.0077, lr=2.55e-06, step=9719]20:54:50.386 [I] step=9720 loss=0.0027 smoothed_loss=0.0070 lr=2.55e-06 grad_norm=0.3853 step_time=0.5807s data_time=0.1482s it/s=1.372 eta_to_10000=204.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0416 grad_action_out_proj_arms=0.1354 grad_arm_token_fuse=0.2256 grad_shared_expert=0.4320 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9720/10000 [2:10:18<03:40, 1.27it/s, loss=0.0077, lr=2.55e-06, step=9719] Training: 97%|█████████▋| 9720/10000 [2:10:18<03:40, 1.27it/s, loss=0.0027, lr=2.55e-06, step=9720] Training: 97%|█████████▋| 9721/10000 [2:10:19<03:53, 1.20it/s, loss=0.0027, lr=2.55e-06, step=9720] Training: 97%|█████████▋| 9721/10000 [2:10:19<03:53, 1.20it/s, loss=0.0099, lr=2.55e-06, step=9721] Training: 97%|█████████▋| 9722/10000 [2:10:20<04:05, 1.13it/s, loss=0.0099, lr=2.55e-06, step=9721] Training: 97%|█████████▋| 9722/10000 [2:10:20<04:05, 1.13it/s, loss=0.0087, lr=2.55e-06, step=9722] Training: 97%|█████████▋| 9723/10000 [2:10:21<04:08, 1.12it/s, loss=0.0087, lr=2.55e-06, step=9722] Training: 97%|█████████▋| 9723/10000 [2:10:21<04:08, 1.12it/s, loss=0.0068, lr=2.55e-06, step=9723] Training: 97%|█████████▋| 9724/10000 [2:10:21<03:36, 1.27it/s, loss=0.0068, lr=2.55e-06, step=9723] Training: 97%|█████████▋| 9724/10000 [2:10:21<03:36, 1.27it/s, loss=0.0252, lr=2.55e-06, step=9724] Training: 97%|█████████▋| 9725/10000 [2:10:22<03:42, 1.24it/s, loss=0.0252, lr=2.55e-06, step=9724] Training: 97%|█████████▋| 9725/10000 [2:10:22<03:42, 1.24it/s, loss=0.0100, lr=2.55e-06, step=9725] Training: 97%|█████████▋| 9726/10000 [2:10:23<03:24, 1.34it/s, loss=0.0100, lr=2.55e-06, step=9725] Training: 97%|█████████▋| 9726/10000 [2:10:23<03:24, 1.34it/s, loss=0.0072, lr=2.55e-06, step=9726] Training: 97%|█████████▋| 9727/10000 [2:10:24<03:18, 1.38it/s, loss=0.0072, lr=2.55e-06, step=9726] Training: 97%|█████████▋| 9727/10000 [2:10:24<03:18, 1.38it/s, loss=0.0021, lr=2.55e-06, step=9727] Training: 97%|█████████▋| 9728/10000 [2:10:25<03:34, 1.27it/s, loss=0.0021, lr=2.55e-06, step=9727] Training: 97%|█████████▋| 9728/10000 [2:10:25<03:34, 1.27it/s, loss=0.0088, lr=2.55e-06, step=9728] Training: 97%|█████████▋| 9729/10000 [2:10:25<03:40, 1.23it/s, loss=0.0088, lr=2.55e-06, step=9728] Training: 97%|█████████▋| 9729/10000 [2:10:25<03:40, 1.23it/s, loss=0.0092, lr=2.55e-06, step=9729]20:54:58.642 [I] step=9730 loss=0.0006 smoothed_loss=0.0077 lr=2.55e-06 grad_norm=0.4474 step_time=0.6766s data_time=0.1491s it/s=1.211 eta_to_10000=222.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0219 grad_action_out_proj_arms=0.1055 grad_arm_token_fuse=0.1161 grad_shared_expert=0.5476 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9730/10000 [2:10:26<03:47, 1.19it/s, loss=0.0092, lr=2.55e-06, step=9729] Training: 97%|█████████▋| 9730/10000 [2:10:26<03:47, 1.19it/s, loss=0.0006, lr=2.55e-06, step=9730] Training: 97%|█████████▋| 9731/10000 [2:10:27<03:27, 1.29it/s, loss=0.0006, lr=2.55e-06, step=9730] Training: 97%|█████████▋| 9731/10000 [2:10:27<03:27, 1.29it/s, loss=0.0112, lr=2.54e-06, step=9731] Training: 97%|█████████▋| 9732/10000 [2:10:28<03:19, 1.34it/s, loss=0.0112, lr=2.54e-06, step=9731] Training: 97%|█████████▋| 9732/10000 [2:10:28<03:19, 1.34it/s, loss=0.0071, lr=2.54e-06, step=9732] Training: 97%|█████████▋| 9733/10000 [2:10:28<03:10, 1.40it/s, loss=0.0071, lr=2.54e-06, step=9732] Training: 97%|█████████▋| 9733/10000 [2:10:28<03:10, 1.40it/s, loss=0.0026, lr=2.54e-06, step=9733] Training: 97%|█████████▋| 9734/10000 [2:10:29<02:55, 1.52it/s, loss=0.0026, lr=2.54e-06, step=9733] Training: 97%|█████████▋| 9734/10000 [2:10:29<02:55, 1.52it/s, loss=0.0012, lr=2.54e-06, step=9734] Training: 97%|█████████▋| 9735/10000 [2:10:30<03:22, 1.31it/s, loss=0.0012, lr=2.54e-06, step=9734] Training: 97%|█████████▋| 9735/10000 [2:10:30<03:22, 1.31it/s, loss=0.0065, lr=2.54e-06, step=9735] Training: 97%|█████████▋| 9736/10000 [2:10:31<03:31, 1.25it/s, loss=0.0065, lr=2.54e-06, step=9735] Training: 97%|█████████▋| 9736/10000 [2:10:31<03:31, 1.25it/s, loss=0.0021, lr=2.54e-06, step=9736] Training: 97%|█████████▋| 9737/10000 [2:10:31<03:13, 1.36it/s, loss=0.0021, lr=2.54e-06, step=9736] Training: 97%|█████████▋| 9737/10000 [2:10:31<03:13, 1.36it/s, loss=0.0092, lr=2.54e-06, step=9737] Training: 97%|█████████▋| 9738/10000 [2:10:32<03:02, 1.43it/s, loss=0.0092, lr=2.54e-06, step=9737] Training: 97%|█████████▋| 9738/10000 [2:10:32<03:02, 1.43it/s, loss=0.0017, lr=2.54e-06, step=9738] Training: 97%|█████████▋| 9739/10000 [2:10:33<03:16, 1.33it/s, loss=0.0017, lr=2.54e-06, step=9738] Training: 97%|█████████▋| 9739/10000 [2:10:33<03:16, 1.33it/s, loss=0.0054, lr=2.54e-06, step=9739]20:55:05.937 [I] step=9740 loss=0.0017 smoothed_loss=0.0056 lr=2.54e-06 grad_norm=0.4352 step_time=0.5722s data_time=0.1573s it/s=1.371 eta_to_10000=189.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0175 grad_action_out_proj_arms=0.0902 grad_arm_token_fuse=0.0909 grad_shared_expert=0.3516 (18633:train_pytorch.py:850) + Training: 97%|█████████▋| 9740/10000 [2:10:34<03:24, 1.27it/s, loss=0.0054, lr=2.54e-06, step=9739] Training: 97%|█████████▋| 9740/10000 [2:10:34<03:24, 1.27it/s, loss=0.0017, lr=2.54e-06, step=9740] Training: 97%|█████████▋| 9741/10000 [2:10:35<03:49, 1.13it/s, loss=0.0017, lr=2.54e-06, step=9740] Training: 97%|█████████▋| 9741/10000 [2:10:35<03:49, 1.13it/s, loss=0.0067, lr=2.54e-06, step=9741] Training: 97%|█████████▋| 9742/10000 [2:10:35<03:26, 1.25it/s, loss=0.0067, lr=2.54e-06, step=9741] Training: 97%|█████████▋| 9742/10000 [2:10:35<03:26, 1.25it/s, loss=0.0097, lr=2.54e-06, step=9742] Training: 97%|█████████▋| 9743/10000 [2:10:36<03:53, 1.10it/s, loss=0.0097, lr=2.54e-06, step=9742] Training: 97%|█████████▋| 9743/10000 [2:10:36<03:53, 1.10it/s, loss=0.0015, lr=2.54e-06, step=9743] Training: 97%|█████████▋| 9744/10000 [2:10:37<03:25, 1.24it/s, loss=0.0015, lr=2.54e-06, step=9743] Training: 97%|█████████▋| 9744/10000 [2:10:37<03:25, 1.24it/s, loss=0.0036, lr=2.54e-06, step=9744] Training: 97%|█████████▋| 9745/10000 [2:10:38<03:23, 1.25it/s, loss=0.0036, lr=2.54e-06, step=9744] Training: 97%|█████████▋| 9745/10000 [2:10:38<03:23, 1.25it/s, loss=0.0103, lr=2.54e-06, step=9745] Training: 97%|█████████▋| 9746/10000 [2:10:39<03:24, 1.24it/s, loss=0.0103, lr=2.54e-06, step=9745] Training: 97%|█████████▋| 9746/10000 [2:10:39<03:24, 1.24it/s, loss=0.0015, lr=2.54e-06, step=9746] Training: 97%|█████████▋| 9747/10000 [2:10:39<03:03, 1.38it/s, loss=0.0015, lr=2.54e-06, step=9746] Training: 97%|█████████▋| 9747/10000 [2:10:39<03:03, 1.38it/s, loss=0.0032, lr=2.54e-06, step=9747] Training: 97%|█████████▋| 9748/10000 [2:10:40<02:59, 1.41it/s, loss=0.0032, lr=2.54e-06, step=9747] Training: 97%|█████████▋| 9748/10000 [2:10:40<02:59, 1.41it/s, loss=0.0095, lr=2.54e-06, step=9748] Training: 97%|█████████▋| 9749/10000 [2:10:40<02:51, 1.47it/s, loss=0.0095, lr=2.54e-06, step=9748] Training: 97%|█████████▋| 9749/10000 [2:10:40<02:51, 1.47it/s, loss=0.0067, lr=2.54e-06, step=9749]20:55:13.729 [I] step=9750 loss=0.0016 smoothed_loss=0.0054 lr=2.54e-06 grad_norm=0.3373 step_time=0.6085s data_time=0.1707s it/s=1.284 eta_to_10000=194.8s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0087 grad_action_out_proj_arms=0.0727 grad_arm_token_fuse=0.0456 grad_shared_expert=0.1877 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9750/10000 [2:10:41<03:08, 1.33it/s, loss=0.0067, lr=2.54e-06, step=9749] Training: 98%|█████████▊| 9750/10000 [2:10:41<03:08, 1.33it/s, loss=0.0016, lr=2.54e-06, step=9750] Training: 98%|█████████▊| 9751/10000 [2:10:42<03:10, 1.31it/s, loss=0.0016, lr=2.54e-06, step=9750] Training: 98%|█████████▊| 9751/10000 [2:10:42<03:10, 1.31it/s, loss=0.0023, lr=2.54e-06, step=9751] Training: 98%|█████████▊| 9752/10000 [2:10:43<03:17, 1.26it/s, loss=0.0023, lr=2.54e-06, step=9751] Training: 98%|█████████▊| 9752/10000 [2:10:43<03:17, 1.26it/s, loss=0.0127, lr=2.54e-06, step=9752] Training: 98%|█████████▊| 9753/10000 [2:10:44<03:36, 1.14it/s, loss=0.0127, lr=2.54e-06, step=9752] Training: 98%|█████████▊| 9753/10000 [2:10:44<03:36, 1.14it/s, loss=0.0171, lr=2.54e-06, step=9753] Training: 98%|█████████▊| 9754/10000 [2:10:45<03:10, 1.29it/s, loss=0.0171, lr=2.54e-06, step=9753] Training: 98%|█████████▊| 9754/10000 [2:10:45<03:10, 1.29it/s, loss=0.0040, lr=2.54e-06, step=9754] Training: 98%|█████████▊| 9755/10000 [2:10:45<02:58, 1.37it/s, loss=0.0040, lr=2.54e-06, step=9754] Training: 98%|█████████▊| 9755/10000 [2:10:45<02:58, 1.37it/s, loss=0.0015, lr=2.54e-06, step=9755] Training: 98%|█████████▊| 9756/10000 [2:10:46<02:43, 1.50it/s, loss=0.0015, lr=2.54e-06, step=9755] Training: 98%|█████████▊| 9756/10000 [2:10:46<02:43, 1.50it/s, loss=0.0037, lr=2.54e-06, step=9756] Training: 98%|█████████▊| 9757/10000 [2:10:47<02:58, 1.36it/s, loss=0.0037, lr=2.54e-06, step=9756] Training: 98%|█████████▊| 9757/10000 [2:10:47<02:58, 1.36it/s, loss=0.0232, lr=2.54e-06, step=9757] Training: 98%|█████████▊| 9758/10000 [2:10:47<02:47, 1.44it/s, loss=0.0232, lr=2.54e-06, step=9757] Training: 98%|█████████▊| 9758/10000 [2:10:47<02:47, 1.44it/s, loss=0.0092, lr=2.54e-06, step=9758] Training: 98%|█████████▊| 9759/10000 [2:10:48<02:41, 1.49it/s, loss=0.0092, lr=2.54e-06, step=9758] Training: 98%|█████████▊| 9759/10000 [2:10:48<02:41, 1.49it/s, loss=0.0079, lr=2.54e-06, step=9759]20:55:20.801 [I] step=9760 loss=0.0024 smoothed_loss=0.0073 lr=2.54e-06 grad_norm=0.4394 step_time=0.5525s data_time=0.1548s it/s=1.415 eta_to_10000=169.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0338 grad_action_out_proj_arms=0.1306 grad_arm_token_fuse=0.1797 grad_shared_expert=0.4530 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9760/10000 [2:10:48<02:32, 1.57it/s, loss=0.0079, lr=2.54e-06, step=9759] Training: 98%|█████████▊| 9760/10000 [2:10:48<02:32, 1.57it/s, loss=0.0024, lr=2.54e-06, step=9760] Training: 98%|█████████▊| 9761/10000 [2:10:49<02:45, 1.45it/s, loss=0.0024, lr=2.54e-06, step=9760] Training: 98%|█████████▊| 9761/10000 [2:10:49<02:45, 1.45it/s, loss=0.0016, lr=2.54e-06, step=9761] Training: 98%|█████████▊| 9762/10000 [2:10:50<02:51, 1.39it/s, loss=0.0016, lr=2.54e-06, step=9761] Training: 98%|█████████▊| 9762/10000 [2:10:50<02:51, 1.39it/s, loss=0.0034, lr=2.54e-06, step=9762] Training: 98%|█████████▊| 9763/10000 [2:10:51<02:51, 1.38it/s, loss=0.0034, lr=2.54e-06, step=9762] Training: 98%|█████████▊| 9763/10000 [2:10:51<02:51, 1.38it/s, loss=0.0038, lr=2.53e-06, step=9763] Training: 98%|█████████▊| 9764/10000 [2:10:51<02:46, 1.41it/s, loss=0.0038, lr=2.53e-06, step=9763] Training: 98%|█████████▊| 9764/10000 [2:10:51<02:46, 1.41it/s, loss=0.0058, lr=2.53e-06, step=9764] Training: 98%|█████████▊| 9765/10000 [2:10:52<02:54, 1.35it/s, loss=0.0058, lr=2.53e-06, step=9764] Training: 98%|█████████▊| 9765/10000 [2:10:52<02:54, 1.35it/s, loss=0.0103, lr=2.53e-06, step=9765] Training: 98%|█████████▊| 9766/10000 [2:10:53<02:39, 1.46it/s, loss=0.0103, lr=2.53e-06, step=9765] Training: 98%|█████████▊| 9766/10000 [2:10:53<02:39, 1.46it/s, loss=0.0010, lr=2.53e-06, step=9766] Training: 98%|█████████▊| 9767/10000 [2:10:54<02:46, 1.40it/s, loss=0.0010, lr=2.53e-06, step=9766] Training: 98%|█████████▊| 9767/10000 [2:10:54<02:46, 1.40it/s, loss=0.0152, lr=2.53e-06, step=9767] Training: 98%|█████████▊| 9768/10000 [2:10:55<02:58, 1.30it/s, loss=0.0152, lr=2.53e-06, step=9767] Training: 98%|█████████▊| 9768/10000 [2:10:55<02:58, 1.30it/s, loss=0.0767, lr=2.53e-06, step=9768] Training: 98%|█████████▊| 9769/10000 [2:10:56<03:24, 1.13it/s, loss=0.0767, lr=2.53e-06, step=9768] Training: 98%|█████████▊| 9769/10000 [2:10:56<03:24, 1.13it/s, loss=0.0061, lr=2.53e-06, step=9769]20:55:28.817 [I] step=9770 loss=0.0143 smoothed_loss=0.0132 lr=2.53e-06 grad_norm=0.4731 step_time=0.6265s data_time=0.1750s it/s=1.248 eta_to_10000=184.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0055 grad_action_out_proj_arms=0.0887 grad_arm_token_fuse=0.0298 grad_shared_expert=0.2931 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9770/10000 [2:10:56<03:17, 1.17it/s, loss=0.0061, lr=2.53e-06, step=9769] Training: 98%|█████████▊| 9770/10000 [2:10:56<03:17, 1.17it/s, loss=0.0143, lr=2.53e-06, step=9770] Training: 98%|█████████▊| 9771/10000 [2:10:57<03:13, 1.18it/s, loss=0.0143, lr=2.53e-06, step=9770] Training: 98%|█████████▊| 9771/10000 [2:10:57<03:13, 1.18it/s, loss=0.0040, lr=2.53e-06, step=9771] Training: 98%|█████████▊| 9772/10000 [2:10:58<03:03, 1.24it/s, loss=0.0040, lr=2.53e-06, step=9771] Training: 98%|█████████▊| 9772/10000 [2:10:58<03:03, 1.24it/s, loss=0.0047, lr=2.53e-06, step=9772] Training: 98%|█████████▊| 9773/10000 [2:10:59<02:43, 1.39it/s, loss=0.0047, lr=2.53e-06, step=9772] Training: 98%|█████████▊| 9773/10000 [2:10:59<02:43, 1.39it/s, loss=0.0012, lr=2.53e-06, step=9773] Training: 98%|█████████▊| 9774/10000 [2:10:59<02:49, 1.33it/s, loss=0.0012, lr=2.53e-06, step=9773] Training: 98%|█████████▊| 9774/10000 [2:10:59<02:49, 1.33it/s, loss=0.0032, lr=2.53e-06, step=9774] Training: 98%|█████████▊| 9775/10000 [2:11:00<02:34, 1.45it/s, loss=0.0032, lr=2.53e-06, step=9774] Training: 98%|█████████▊| 9775/10000 [2:11:00<02:34, 1.45it/s, loss=0.0025, lr=2.53e-06, step=9775] Training: 98%|█████████▊| 9776/10000 [2:11:01<02:43, 1.37it/s, loss=0.0025, lr=2.53e-06, step=9775] Training: 98%|█████████▊| 9776/10000 [2:11:01<02:43, 1.37it/s, loss=0.0122, lr=2.53e-06, step=9776] Training: 98%|█████████▊| 9777/10000 [2:11:01<02:26, 1.52it/s, loss=0.0122, lr=2.53e-06, step=9776] Training: 98%|█████████▊| 9777/10000 [2:11:01<02:26, 1.52it/s, loss=0.0082, lr=2.53e-06, step=9777] Training: 98%|█████████▊| 9778/10000 [2:11:02<02:37, 1.41it/s, loss=0.0082, lr=2.53e-06, step=9777] Training: 98%|█████████▊| 9778/10000 [2:11:02<02:37, 1.41it/s, loss=0.0097, lr=2.53e-06, step=9778] Training: 98%|█████████▊| 9779/10000 [2:11:03<02:54, 1.27it/s, loss=0.0097, lr=2.53e-06, step=9778] Training: 98%|█████████▊| 9779/10000 [2:11:03<02:54, 1.27it/s, loss=0.0120, lr=2.53e-06, step=9779]20:55:36.056 [I] step=9780 loss=0.0069 smoothed_loss=0.0093 lr=2.53e-06 grad_norm=0.3906 step_time=0.5995s data_time=0.1245s it/s=1.381 eta_to_10000=159.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0220 grad_action_out_proj_arms=0.1355 grad_arm_token_fuse=0.1190 grad_shared_expert=0.3965 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9780/10000 [2:11:04<02:48, 1.31it/s, loss=0.0120, lr=2.53e-06, step=9779] Training: 98%|█████████▊| 9780/10000 [2:11:04<02:48, 1.31it/s, loss=0.0069, lr=2.53e-06, step=9780] Training: 98%|█████████▊| 9781/10000 [2:11:04<02:42, 1.35it/s, loss=0.0069, lr=2.53e-06, step=9780] Training: 98%|█████████▊| 9781/10000 [2:11:04<02:42, 1.35it/s, loss=0.0139, lr=2.53e-06, step=9781] Training: 98%|█████████▊| 9782/10000 [2:11:05<02:41, 1.35it/s, loss=0.0139, lr=2.53e-06, step=9781] Training: 98%|█████████▊| 9782/10000 [2:11:05<02:41, 1.35it/s, loss=0.0008, lr=2.53e-06, step=9782] Training: 98%|█████████▊| 9783/10000 [2:11:06<02:57, 1.22it/s, loss=0.0008, lr=2.53e-06, step=9782] Training: 98%|█████████▊| 9783/10000 [2:11:06<02:57, 1.22it/s, loss=0.0029, lr=2.53e-06, step=9783] Training: 98%|█████████▊| 9784/10000 [2:11:07<02:54, 1.24it/s, loss=0.0029, lr=2.53e-06, step=9783] Training: 98%|█████████▊| 9784/10000 [2:11:07<02:54, 1.24it/s, loss=0.0036, lr=2.53e-06, step=9784] Training: 98%|█████████▊| 9785/10000 [2:11:08<02:38, 1.36it/s, loss=0.0036, lr=2.53e-06, step=9784] Training: 98%|█████████▊| 9785/10000 [2:11:08<02:38, 1.36it/s, loss=0.0026, lr=2.53e-06, step=9785] Training: 98%|█████████▊| 9786/10000 [2:11:08<02:44, 1.30it/s, loss=0.0026, lr=2.53e-06, step=9785] Training: 98%|█████████▊| 9786/10000 [2:11:08<02:44, 1.30it/s, loss=0.0168, lr=2.53e-06, step=9786] Training: 98%|█████████▊| 9787/10000 [2:11:09<02:28, 1.44it/s, loss=0.0168, lr=2.53e-06, step=9786] Training: 98%|█████████▊| 9787/10000 [2:11:09<02:28, 1.44it/s, loss=0.0008, lr=2.53e-06, step=9787] Training: 98%|█████████▊| 9788/10000 [2:11:09<02:14, 1.58it/s, loss=0.0008, lr=2.53e-06, step=9787] Training: 98%|█████████▊| 9788/10000 [2:11:09<02:14, 1.58it/s, loss=0.0245, lr=2.53e-06, step=9788] Training: 98%|█████████▊| 9789/10000 [2:11:10<02:04, 1.70it/s, loss=0.0245, lr=2.53e-06, step=9788] Training: 98%|█████████▊| 9789/10000 [2:11:10<02:04, 1.70it/s, loss=0.0063, lr=2.53e-06, step=9789]20:55:42.836 [I] step=9790 loss=0.0055 smoothed_loss=0.0086 lr=2.53e-06 grad_norm=0.4834 step_time=0.5613s data_time=0.1167s it/s=1.475 eta_to_10000=142.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0047 grad_action_out_proj_arms=0.0611 grad_arm_token_fuse=0.0239 grad_shared_expert=0.2492 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9790/10000 [2:11:11<02:07, 1.64it/s, loss=0.0063, lr=2.53e-06, step=9789] Training: 98%|█████████▊| 9790/10000 [2:11:11<02:07, 1.64it/s, loss=0.0055, lr=2.53e-06, step=9790] Training: 98%|█████████▊| 9791/10000 [2:11:11<02:16, 1.53it/s, loss=0.0055, lr=2.53e-06, step=9790] Training: 98%|█████████▊| 9791/10000 [2:11:11<02:16, 1.53it/s, loss=0.0009, lr=2.53e-06, step=9791] Training: 98%|█████████▊| 9792/10000 [2:11:12<02:11, 1.58it/s, loss=0.0009, lr=2.53e-06, step=9791] Training: 98%|█████████▊| 9792/10000 [2:11:12<02:11, 1.58it/s, loss=0.0267, lr=2.53e-06, step=9792] Training: 98%|█████████▊| 9793/10000 [2:11:13<02:38, 1.31it/s, loss=0.0267, lr=2.53e-06, step=9792] Training: 98%|█████████▊| 9793/10000 [2:11:13<02:38, 1.31it/s, loss=0.0005, lr=2.53e-06, step=9793] Training: 98%|█████████▊| 9794/10000 [2:11:14<02:29, 1.38it/s, loss=0.0005, lr=2.53e-06, step=9793] Training: 98%|█████████▊| 9794/10000 [2:11:14<02:29, 1.38it/s, loss=0.0018, lr=2.53e-06, step=9794] Training: 98%|█████████▊| 9795/10000 [2:11:14<02:25, 1.41it/s, loss=0.0018, lr=2.53e-06, step=9794] Training: 98%|█████████▊| 9795/10000 [2:11:14<02:25, 1.41it/s, loss=0.2090, lr=2.53e-06, step=9795] Training: 98%|█████████▊| 9796/10000 [2:11:15<02:29, 1.37it/s, loss=0.2090, lr=2.53e-06, step=9795] Training: 98%|█████████▊| 9796/10000 [2:11:15<02:29, 1.37it/s, loss=0.0022, lr=2.53e-06, step=9796] Training: 98%|█████████▊| 9797/10000 [2:11:16<02:29, 1.35it/s, loss=0.0022, lr=2.53e-06, step=9796] Training: 98%|█████████▊| 9797/10000 [2:11:16<02:29, 1.35it/s, loss=0.0013, lr=2.53e-06, step=9797] Training: 98%|█████████▊| 9798/10000 [2:11:17<02:34, 1.31it/s, loss=0.0013, lr=2.53e-06, step=9797] Training: 98%|█████████▊| 9798/10000 [2:11:17<02:34, 1.31it/s, loss=0.0023, lr=2.53e-06, step=9798] Training: 98%|█████████▊| 9799/10000 [2:11:17<02:31, 1.33it/s, loss=0.0023, lr=2.53e-06, step=9798] Training: 98%|█████████▊| 9799/10000 [2:11:17<02:31, 1.33it/s, loss=0.0023, lr=2.53e-06, step=9799]20:55:50.619 [I] step=9800 loss=0.0010 smoothed_loss=0.0174 lr=2.53e-06 grad_norm=0.3383 step_time=0.6231s data_time=0.1551s it/s=1.285 eta_to_10000=155.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0072 grad_action_out_proj_arms=0.0659 grad_arm_token_fuse=0.0369 grad_shared_expert=0.3727 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9800/10000 [2:11:18<02:43, 1.22it/s, loss=0.0023, lr=2.53e-06, step=9799] Training: 98%|█████████▊| 9800/10000 [2:11:18<02:43, 1.22it/s, loss=0.0010, lr=2.52e-06, step=9800] Training: 98%|█████████▊| 9801/10000 [2:11:19<02:45, 1.20it/s, loss=0.0010, lr=2.52e-06, step=9800] Training: 98%|█████████▊| 9801/10000 [2:11:19<02:45, 1.20it/s, loss=0.0243, lr=2.52e-06, step=9801] Training: 98%|█████████▊| 9802/10000 [2:11:20<02:41, 1.22it/s, loss=0.0243, lr=2.52e-06, step=9801] Training: 98%|█████████▊| 9802/10000 [2:11:20<02:41, 1.22it/s, loss=0.0042, lr=2.52e-06, step=9802] Training: 98%|█████████▊| 9803/10000 [2:11:20<02:24, 1.37it/s, loss=0.0042, lr=2.52e-06, step=9802] Training: 98%|█████████▊| 9803/10000 [2:11:20<02:24, 1.37it/s, loss=0.0057, lr=2.52e-06, step=9803] Training: 98%|█████████▊| 9804/10000 [2:11:21<02:17, 1.43it/s, loss=0.0057, lr=2.52e-06, step=9803] Training: 98%|█████████▊| 9804/10000 [2:11:21<02:17, 1.43it/s, loss=0.0014, lr=2.52e-06, step=9804] Training: 98%|█████████▊| 9805/10000 [2:11:22<02:33, 1.27it/s, loss=0.0014, lr=2.52e-06, step=9804] Training: 98%|█████████▊| 9805/10000 [2:11:22<02:33, 1.27it/s, loss=0.0022, lr=2.52e-06, step=9805] Training: 98%|█████████▊| 9806/10000 [2:11:23<02:43, 1.19it/s, loss=0.0022, lr=2.52e-06, step=9805] Training: 98%|█████████▊| 9806/10000 [2:11:23<02:43, 1.19it/s, loss=0.0011, lr=2.52e-06, step=9806] Training: 98%|█████████▊| 9807/10000 [2:11:24<02:33, 1.25it/s, loss=0.0011, lr=2.52e-06, step=9806] Training: 98%|█████████▊| 9807/10000 [2:11:24<02:33, 1.25it/s, loss=0.0026, lr=2.52e-06, step=9807] Training: 98%|█████████▊| 9808/10000 [2:11:25<02:34, 1.24it/s, loss=0.0026, lr=2.52e-06, step=9807] Training: 98%|█████████▊| 9808/10000 [2:11:25<02:34, 1.24it/s, loss=0.0013, lr=2.52e-06, step=9808] Training: 98%|█████████▊| 9809/10000 [2:11:25<02:23, 1.33it/s, loss=0.0013, lr=2.52e-06, step=9808] Training: 98%|█████████▊| 9809/10000 [2:11:25<02:23, 1.33it/s, loss=0.0253, lr=2.52e-06, step=9809]20:55:58.054 [I] step=9810 loss=0.0280 smoothed_loss=0.0131 lr=2.52e-06 grad_norm=0.4536 step_time=0.6109s data_time=0.1326s it/s=1.345 eta_to_10000=141.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0152 grad_action_out_proj_arms=0.1061 grad_arm_token_fuse=0.0787 grad_shared_expert=0.6149 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9810/10000 [2:11:26<02:10, 1.46it/s, loss=0.0253, lr=2.52e-06, step=9809] Training: 98%|█████████▊| 9810/10000 [2:11:26<02:10, 1.46it/s, loss=0.0280, lr=2.52e-06, step=9810] Training: 98%|█████████▊| 9811/10000 [2:11:26<02:05, 1.50it/s, loss=0.0280, lr=2.52e-06, step=9810] Training: 98%|█████████▊| 9811/10000 [2:11:26<02:05, 1.50it/s, loss=0.0056, lr=2.52e-06, step=9811] Training: 98%|█████████▊| 9812/10000 [2:11:27<02:12, 1.42it/s, loss=0.0056, lr=2.52e-06, step=9811] Training: 98%|█████████▊| 9812/10000 [2:11:27<02:12, 1.42it/s, loss=0.0036, lr=2.52e-06, step=9812] Training: 98%|█████████▊| 9813/10000 [2:11:28<02:25, 1.29it/s, loss=0.0036, lr=2.52e-06, step=9812] Training: 98%|█████████▊| 9813/10000 [2:11:28<02:25, 1.29it/s, loss=0.0210, lr=2.52e-06, step=9813] Training: 98%|█████████▊| 9814/10000 [2:11:29<02:36, 1.19it/s, loss=0.0210, lr=2.52e-06, step=9813] Training: 98%|█████████▊| 9814/10000 [2:11:29<02:36, 1.19it/s, loss=0.0077, lr=2.52e-06, step=9814] Training: 98%|█████████▊| 9815/10000 [2:11:30<02:37, 1.17it/s, loss=0.0077, lr=2.52e-06, step=9814] Training: 98%|█████████▊| 9815/10000 [2:11:30<02:37, 1.17it/s, loss=0.0115, lr=2.52e-06, step=9815] Training: 98%|█████████▊| 9816/10000 [2:11:31<02:33, 1.20it/s, loss=0.0115, lr=2.52e-06, step=9815] Training: 98%|█████████▊| 9816/10000 [2:11:31<02:33, 1.20it/s, loss=0.0101, lr=2.52e-06, step=9816] Training: 98%|█████████▊| 9817/10000 [2:11:31<02:14, 1.36it/s, loss=0.0101, lr=2.52e-06, step=9816] Training: 98%|█████████▊| 9817/10000 [2:11:31<02:14, 1.36it/s, loss=0.0125, lr=2.52e-06, step=9817] Training: 98%|█████████▊| 9818/10000 [2:11:32<02:20, 1.30it/s, loss=0.0125, lr=2.52e-06, step=9817] Training: 98%|█████████▊| 9818/10000 [2:11:32<02:20, 1.30it/s, loss=0.0041, lr=2.52e-06, step=9818] Training: 98%|█████████▊| 9819/10000 [2:11:33<02:16, 1.33it/s, loss=0.0041, lr=2.52e-06, step=9818] Training: 98%|█████████▊| 9819/10000 [2:11:33<02:16, 1.33it/s, loss=0.0021, lr=2.52e-06, step=9819]20:56:06.072 [I] step=9820 loss=0.0159 smoothed_loss=0.0107 lr=2.52e-06 grad_norm=0.4589 step_time=0.6155s data_time=0.1863s it/s=1.247 eta_to_10000=144.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0066 grad_action_out_proj_arms=0.0850 grad_arm_token_fuse=0.0349 grad_shared_expert=0.3753 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9820/10000 [2:11:34<02:25, 1.24it/s, loss=0.0021, lr=2.52e-06, step=9819] Training: 98%|█████████▊| 9820/10000 [2:11:34<02:25, 1.24it/s, loss=0.0159, lr=2.52e-06, step=9820] Training: 98%|█████████▊| 9821/10000 [2:11:35<02:34, 1.16it/s, loss=0.0159, lr=2.52e-06, step=9820] Training: 98%|█████████▊| 9821/10000 [2:11:35<02:34, 1.16it/s, loss=0.0008, lr=2.52e-06, step=9821] Training: 98%|█████████▊| 9822/10000 [2:11:36<02:52, 1.03it/s, loss=0.0008, lr=2.52e-06, step=9821] Training: 98%|█████████▊| 9822/10000 [2:11:36<02:52, 1.03it/s, loss=0.0067, lr=2.52e-06, step=9822] Training: 98%|█████████▊| 9823/10000 [2:11:37<02:44, 1.08it/s, loss=0.0067, lr=2.52e-06, step=9822] Training: 98%|█████████▊| 9823/10000 [2:11:37<02:44, 1.08it/s, loss=0.0046, lr=2.52e-06, step=9823] Training: 98%|█████████▊| 9824/10000 [2:11:38<02:35, 1.13it/s, loss=0.0046, lr=2.52e-06, step=9823] Training: 98%|█████████▊| 9824/10000 [2:11:38<02:35, 1.13it/s, loss=0.0506, lr=2.52e-06, step=9824] Training: 98%|█████████▊| 9825/10000 [2:11:38<02:20, 1.24it/s, loss=0.0506, lr=2.52e-06, step=9824] Training: 98%|█████████▊| 9825/10000 [2:11:38<02:20, 1.24it/s, loss=0.0042, lr=2.52e-06, step=9825] Training: 98%|█████████▊| 9826/10000 [2:11:39<02:09, 1.34it/s, loss=0.0042, lr=2.52e-06, step=9825] Training: 98%|█████████▊| 9826/10000 [2:11:39<02:09, 1.34it/s, loss=0.0033, lr=2.52e-06, step=9826] Training: 98%|█████████▊| 9827/10000 [2:11:40<02:15, 1.28it/s, loss=0.0033, lr=2.52e-06, step=9826] Training: 98%|█████████▊| 9827/10000 [2:11:40<02:15, 1.28it/s, loss=0.0225, lr=2.52e-06, step=9827] Training: 98%|█████████▊| 9828/10000 [2:11:41<02:29, 1.15it/s, loss=0.0225, lr=2.52e-06, step=9827] Training: 98%|█████████▊| 9828/10000 [2:11:41<02:29, 1.15it/s, loss=0.0061, lr=2.52e-06, step=9828] Training: 98%|█████████▊| 9829/10000 [2:11:42<02:44, 1.04it/s, loss=0.0061, lr=2.52e-06, step=9828] Training: 98%|█████████▊| 9829/10000 [2:11:42<02:44, 1.04it/s, loss=0.0036, lr=2.52e-06, step=9829]20:56:15.284 [I] step=9830 loss=0.0012 smoothed_loss=0.0100 lr=2.52e-06 grad_norm=0.3746 step_time=0.7067s data_time=0.2145s it/s=1.086 eta_to_10000=156.6s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0060 grad_action_out_proj_arms=0.0483 grad_arm_token_fuse=0.0323 grad_shared_expert=0.1940 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9830/10000 [2:11:43<02:46, 1.02it/s, loss=0.0036, lr=2.52e-06, step=9829] Training: 98%|█████████▊| 9830/10000 [2:11:43<02:46, 1.02it/s, loss=0.0012, lr=2.52e-06, step=9830] Training: 98%|█████████▊| 9831/10000 [2:11:44<02:42, 1.04it/s, loss=0.0012, lr=2.52e-06, step=9830] Training: 98%|█████████▊| 9831/10000 [2:11:44<02:42, 1.04it/s, loss=0.0131, lr=2.52e-06, step=9831] Training: 98%|█████████▊| 9832/10000 [2:11:45<02:38, 1.06it/s, loss=0.0131, lr=2.52e-06, step=9831] Training: 98%|█████████▊| 9832/10000 [2:11:45<02:38, 1.06it/s, loss=0.0132, lr=2.52e-06, step=9832] Training: 98%|█████████▊| 9833/10000 [2:11:45<02:15, 1.23it/s, loss=0.0132, lr=2.52e-06, step=9832] Training: 98%|█████████▊| 9833/10000 [2:11:45<02:15, 1.23it/s, loss=0.0055, lr=2.52e-06, step=9833] Training: 98%|█████████▊| 9834/10000 [2:11:46<02:16, 1.22it/s, loss=0.0055, lr=2.52e-06, step=9833] Training: 98%|█████████▊| 9834/10000 [2:11:46<02:16, 1.22it/s, loss=0.0065, lr=2.52e-06, step=9834] Training: 98%|█████████▊| 9835/10000 [2:11:47<02:10, 1.27it/s, loss=0.0065, lr=2.52e-06, step=9834] Training: 98%|█████████▊| 9835/10000 [2:11:47<02:10, 1.27it/s, loss=0.0080, lr=2.52e-06, step=9835] Training: 98%|█████████▊| 9836/10000 [2:11:48<02:06, 1.30it/s, loss=0.0080, lr=2.52e-06, step=9835] Training: 98%|█████████▊| 9836/10000 [2:11:48<02:06, 1.30it/s, loss=0.0133, lr=2.52e-06, step=9836] Training: 98%|█████████▊| 9837/10000 [2:11:48<01:51, 1.46it/s, loss=0.0133, lr=2.52e-06, step=9836] Training: 98%|█████████▊| 9837/10000 [2:11:48<01:51, 1.46it/s, loss=0.0061, lr=2.52e-06, step=9837] Training: 98%|█████████▊| 9838/10000 [2:11:49<02:01, 1.33it/s, loss=0.0061, lr=2.52e-06, step=9837] Training: 98%|█████████▊| 9838/10000 [2:11:49<02:01, 1.33it/s, loss=0.0025, lr=2.52e-06, step=9838] Training: 98%|█████████▊| 9839/10000 [2:11:50<02:05, 1.29it/s, loss=0.0025, lr=2.52e-06, step=9838] Training: 98%|█████████▊| 9839/10000 [2:11:50<02:05, 1.29it/s, loss=0.0043, lr=2.52e-06, step=9839]20:56:22.620 [I] step=9840 loss=0.0058 smoothed_loss=0.0081 lr=2.52e-06 grad_norm=0.4139 step_time=0.5723s data_time=0.1612s it/s=1.363 eta_to_10000=117.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0111 grad_action_out_proj_arms=0.1267 grad_arm_token_fuse=0.0570 grad_shared_expert=0.3986 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9840/10000 [2:11:50<01:51, 1.44it/s, loss=0.0043, lr=2.52e-06, step=9839] Training: 98%|█████████▊| 9840/10000 [2:11:50<01:51, 1.44it/s, loss=0.0058, lr=2.52e-06, step=9840] Training: 98%|█████████▊| 9841/10000 [2:11:51<01:51, 1.42it/s, loss=0.0058, lr=2.52e-06, step=9840] Training: 98%|█████████▊| 9841/10000 [2:11:51<01:51, 1.42it/s, loss=0.0063, lr=2.52e-06, step=9841] Training: 98%|█████████▊| 9842/10000 [2:11:52<02:08, 1.23it/s, loss=0.0063, lr=2.52e-06, step=9841] Training: 98%|█████████▊| 9842/10000 [2:11:52<02:08, 1.23it/s, loss=0.0119, lr=2.52e-06, step=9842] Training: 98%|█████████▊| 9843/10000 [2:11:53<02:20, 1.12it/s, loss=0.0119, lr=2.52e-06, step=9842] Training: 98%|█████████▊| 9843/10000 [2:11:53<02:20, 1.12it/s, loss=0.0428, lr=2.52e-06, step=9843] Training: 98%|█████████▊| 9844/10000 [2:11:54<02:07, 1.23it/s, loss=0.0428, lr=2.52e-06, step=9843] Training: 98%|█████████▊| 9844/10000 [2:11:54<02:07, 1.23it/s, loss=0.0027, lr=2.52e-06, step=9844] Training: 98%|█████████▊| 9845/10000 [2:11:54<01:55, 1.34it/s, loss=0.0027, lr=2.52e-06, step=9844] Training: 98%|█████████▊| 9845/10000 [2:11:54<01:55, 1.34it/s, loss=0.0050, lr=2.51e-06, step=9845] Training: 98%|█████████▊| 9846/10000 [2:11:55<01:59, 1.29it/s, loss=0.0050, lr=2.51e-06, step=9845] Training: 98%|█████████▊| 9846/10000 [2:11:55<01:59, 1.29it/s, loss=0.0085, lr=2.51e-06, step=9846] Training: 98%|█████████▊| 9847/10000 [2:11:56<01:54, 1.34it/s, loss=0.0085, lr=2.51e-06, step=9846] Training: 98%|█████████▊| 9847/10000 [2:11:56<01:54, 1.34it/s, loss=0.0042, lr=2.51e-06, step=9847] Training: 98%|█████████▊| 9848/10000 [2:11:56<01:44, 1.46it/s, loss=0.0042, lr=2.51e-06, step=9847] Training: 98%|█████████▊| 9848/10000 [2:11:56<01:44, 1.46it/s, loss=0.0027, lr=2.51e-06, step=9848] Training: 98%|█████████▊| 9849/10000 [2:11:57<01:46, 1.41it/s, loss=0.0027, lr=2.51e-06, step=9848] Training: 98%|█████████▊| 9849/10000 [2:11:57<01:46, 1.41it/s, loss=0.0095, lr=2.51e-06, step=9849]20:56:30.336 [I] step=9850 loss=0.0074 smoothed_loss=0.0088 lr=2.51e-06 grad_norm=0.4130 step_time=0.6382s data_time=0.1334s it/s=1.296 eta_to_10000=115.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0111 grad_action_out_proj_arms=0.0930 grad_arm_token_fuse=0.0578 grad_shared_expert=0.3258 (18633:train_pytorch.py:850) + Training: 98%|█████████▊| 9850/10000 [2:11:58<01:50, 1.36it/s, loss=0.0095, lr=2.51e-06, step=9849] Training: 98%|█████████▊| 9850/10000 [2:11:58<01:50, 1.36it/s, loss=0.0074, lr=2.51e-06, step=9850] Training: 99%|█████████▊| 9851/10000 [2:11:59<01:49, 1.36it/s, loss=0.0074, lr=2.51e-06, step=9850] Training: 99%|█████████▊| 9851/10000 [2:11:59<01:49, 1.36it/s, loss=0.0037, lr=2.51e-06, step=9851] Training: 99%|█████████▊| 9852/10000 [2:11:59<01:37, 1.51it/s, loss=0.0037, lr=2.51e-06, step=9851] Training: 99%|█████████▊| 9852/10000 [2:11:59<01:37, 1.51it/s, loss=0.0079, lr=2.51e-06, step=9852] Training: 99%|█████████▊| 9853/10000 [2:12:00<01:36, 1.52it/s, loss=0.0079, lr=2.51e-06, step=9852] Training: 99%|█████████▊| 9853/10000 [2:12:00<01:36, 1.52it/s, loss=0.0095, lr=2.51e-06, step=9853] Training: 99%|█████████▊| 9854/10000 [2:12:01<01:46, 1.37it/s, loss=0.0095, lr=2.51e-06, step=9853] Training: 99%|█████████▊| 9854/10000 [2:12:01<01:46, 1.37it/s, loss=0.0018, lr=2.51e-06, step=9854] Training: 99%|█████████▊| 9855/10000 [2:12:02<01:53, 1.28it/s, loss=0.0018, lr=2.51e-06, step=9854] Training: 99%|█████████▊| 9855/10000 [2:12:02<01:53, 1.28it/s, loss=0.0106, lr=2.51e-06, step=9855] Training: 99%|█████████▊| 9856/10000 [2:12:03<01:57, 1.22it/s, loss=0.0106, lr=2.51e-06, step=9855] Training: 99%|█████████▊| 9856/10000 [2:12:03<01:57, 1.22it/s, loss=0.0039, lr=2.51e-06, step=9856] Training: 99%|█████████▊| 9857/10000 [2:12:03<01:53, 1.25it/s, loss=0.0039, lr=2.51e-06, step=9856] Training: 99%|█████████▊| 9857/10000 [2:12:03<01:53, 1.25it/s, loss=0.0067, lr=2.51e-06, step=9857] Training: 99%|█████████▊| 9858/10000 [2:12:04<01:51, 1.28it/s, loss=0.0067, lr=2.51e-06, step=9857] Training: 99%|█████████▊| 9858/10000 [2:12:04<01:51, 1.28it/s, loss=0.0225, lr=2.51e-06, step=9858] Training: 99%|█████████▊| 9859/10000 [2:12:05<01:39, 1.42it/s, loss=0.0225, lr=2.51e-06, step=9858] Training: 99%|█████████▊| 9859/10000 [2:12:05<01:39, 1.42it/s, loss=0.0014, lr=2.51e-06, step=9859]20:56:37.828 [I] step=9860 loss=0.0123 smoothed_loss=0.0086 lr=2.51e-06 grad_norm=0.3696 step_time=0.6094s data_time=0.1398s it/s=1.335 eta_to_10000=104.9s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0114 grad_action_out_proj_arms=0.0773 grad_arm_token_fuse=0.0634 grad_shared_expert=0.3325 (18633:train_pytorch.py:850) + Training: 99%|█████████▊| 9860/10000 [2:12:05<01:46, 1.31it/s, loss=0.0014, lr=2.51e-06, step=9859] Training: 99%|█████████▊| 9860/10000 [2:12:05<01:46, 1.31it/s, loss=0.0123, lr=2.51e-06, step=9860] Training: 99%|█████████▊| 9861/10000 [2:12:06<01:49, 1.26it/s, loss=0.0123, lr=2.51e-06, step=9860] Training: 99%|█████████▊| 9861/10000 [2:12:06<01:49, 1.26it/s, loss=0.0148, lr=2.51e-06, step=9861] Training: 99%|█████████▊| 9862/10000 [2:12:07<02:03, 1.12it/s, loss=0.0148, lr=2.51e-06, step=9861] Training: 99%|█████████▊| 9862/10000 [2:12:07<02:03, 1.12it/s, loss=0.0249, lr=2.51e-06, step=9862] Training: 99%|█████████▊| 9863/10000 [2:12:08<01:52, 1.22it/s, loss=0.0249, lr=2.51e-06, step=9862] Training: 99%|█████████▊| 9863/10000 [2:12:08<01:52, 1.22it/s, loss=0.0022, lr=2.51e-06, step=9863] Training: 99%|█████████▊| 9864/10000 [2:12:09<01:46, 1.28it/s, loss=0.0022, lr=2.51e-06, step=9863] Training: 99%|█████████▊| 9864/10000 [2:12:09<01:46, 1.28it/s, loss=0.0112, lr=2.51e-06, step=9864] Training: 99%|█████████▊| 9865/10000 [2:12:10<01:47, 1.26it/s, loss=0.0112, lr=2.51e-06, step=9864] Training: 99%|█████████▊| 9865/10000 [2:12:10<01:47, 1.26it/s, loss=0.0028, lr=2.51e-06, step=9865] Training: 99%|█████████▊| 9866/10000 [2:12:11<01:48, 1.23it/s, loss=0.0028, lr=2.51e-06, step=9865] Training: 99%|█████████▊| 9866/10000 [2:12:11<01:48, 1.23it/s, loss=0.0041, lr=2.51e-06, step=9866] Training: 99%|█████████▊| 9867/10000 [2:12:11<01:34, 1.40it/s, loss=0.0041, lr=2.51e-06, step=9866] Training: 99%|█████████▊| 9867/10000 [2:12:11<01:34, 1.40it/s, loss=0.0032, lr=2.51e-06, step=9867] Training: 99%|█████████▊| 9868/10000 [2:12:12<01:32, 1.43it/s, loss=0.0032, lr=2.51e-06, step=9867] Training: 99%|█████████▊| 9868/10000 [2:12:12<01:32, 1.43it/s, loss=0.0010, lr=2.51e-06, step=9868] Training: 99%|█████████▊| 9869/10000 [2:12:12<01:23, 1.57it/s, loss=0.0010, lr=2.51e-06, step=9868] Training: 99%|█████████▊| 9869/10000 [2:12:12<01:23, 1.57it/s, loss=0.0031, lr=2.51e-06, step=9869]20:56:45.445 [I] step=9870 loss=0.0050 smoothed_loss=0.0069 lr=2.51e-06 grad_norm=0.5057 step_time=0.6025s data_time=0.1592s it/s=1.313 eta_to_10000=99.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0189 grad_action_out_proj_arms=0.1746 grad_arm_token_fuse=0.1006 grad_shared_expert=0.6312 (18633:train_pytorch.py:850) + Training: 99%|█████████▊| 9870/10000 [2:12:13<01:35, 1.36it/s, loss=0.0031, lr=2.51e-06, step=9869] Training: 99%|█████████▊| 9870/10000 [2:12:13<01:35, 1.36it/s, loss=0.0050, lr=2.51e-06, step=9870] Training: 99%|█████████▊| 9871/10000 [2:12:14<01:33, 1.38it/s, loss=0.0050, lr=2.51e-06, step=9870] Training: 99%|█████████▊| 9871/10000 [2:12:14<01:33, 1.38it/s, loss=0.0024, lr=2.51e-06, step=9871] Training: 99%|█████████▊| 9872/10000 [2:12:15<01:49, 1.17it/s, loss=0.0024, lr=2.51e-06, step=9871] Training: 99%|█████████▊| 9872/10000 [2:12:15<01:49, 1.17it/s, loss=0.0066, lr=2.51e-06, step=9872] Training: 99%|█████████▊| 9873/10000 [2:12:16<01:37, 1.30it/s, loss=0.0066, lr=2.51e-06, step=9872] Training: 99%|█████████▊| 9873/10000 [2:12:16<01:37, 1.30it/s, loss=0.0149, lr=2.51e-06, step=9873] Training: 99%|█████████▊| 9874/10000 [2:12:16<01:26, 1.45it/s, loss=0.0149, lr=2.51e-06, step=9873] Training: 99%|█████████▊| 9874/10000 [2:12:16<01:26, 1.45it/s, loss=0.0013, lr=2.51e-06, step=9874] Training: 99%|█████████▉| 9875/10000 [2:12:17<01:27, 1.43it/s, loss=0.0013, lr=2.51e-06, step=9874] Training: 99%|█████████▉| 9875/10000 [2:12:17<01:27, 1.43it/s, loss=0.0027, lr=2.51e-06, step=9875] Training: 99%|█████████▉| 9876/10000 [2:12:17<01:23, 1.49it/s, loss=0.0027, lr=2.51e-06, step=9875] Training: 99%|█████████▉| 9876/10000 [2:12:17<01:23, 1.49it/s, loss=0.0167, lr=2.51e-06, step=9876] Training: 99%|█████████▉| 9877/10000 [2:12:18<01:32, 1.33it/s, loss=0.0167, lr=2.51e-06, step=9876] Training: 99%|█████████▉| 9877/10000 [2:12:18<01:32, 1.33it/s, loss=0.0030, lr=2.51e-06, step=9877] Training: 99%|█████████▉| 9878/10000 [2:12:19<01:29, 1.36it/s, loss=0.0030, lr=2.51e-06, step=9877] Training: 99%|█████████▉| 9878/10000 [2:12:19<01:29, 1.36it/s, loss=0.0074, lr=2.51e-06, step=9878] Training: 99%|█████████▉| 9879/10000 [2:12:20<01:29, 1.35it/s, loss=0.0074, lr=2.51e-06, step=9878] Training: 99%|█████████▉| 9879/10000 [2:12:20<01:29, 1.35it/s, loss=0.0019, lr=2.51e-06, step=9879]20:56:52.650 [I] step=9880 loss=0.0041 smoothed_loss=0.0062 lr=2.51e-06 grad_norm=0.4778 step_time=0.6105s data_time=0.1100s it/s=1.388 eta_to_10000=86.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0380 grad_action_out_proj_arms=0.2231 grad_arm_token_fuse=0.2347 grad_shared_expert=0.6008 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9880/10000 [2:12:20<01:22, 1.46it/s, loss=0.0019, lr=2.51e-06, step=9879] Training: 99%|█████████▉| 9880/10000 [2:12:20<01:22, 1.46it/s, loss=0.0041, lr=2.51e-06, step=9880] Training: 99%|█████████▉| 9881/10000 [2:12:21<01:18, 1.52it/s, loss=0.0041, lr=2.51e-06, step=9880] Training: 99%|█████████▉| 9881/10000 [2:12:21<01:18, 1.52it/s, loss=0.0053, lr=2.51e-06, step=9881] Training: 99%|█████████▉| 9882/10000 [2:12:22<01:22, 1.43it/s, loss=0.0053, lr=2.51e-06, step=9881] Training: 99%|█████████▉| 9882/10000 [2:12:22<01:22, 1.43it/s, loss=0.0362, lr=2.51e-06, step=9882] Training: 99%|█████████▉| 9883/10000 [2:12:23<01:33, 1.25it/s, loss=0.0362, lr=2.51e-06, step=9882] Training: 99%|█████████▉| 9883/10000 [2:12:23<01:33, 1.25it/s, loss=0.0070, lr=2.51e-06, step=9883] Training: 99%|█████████▉| 9884/10000 [2:12:24<01:35, 1.22it/s, loss=0.0070, lr=2.51e-06, step=9883] Training: 99%|█████████▉| 9884/10000 [2:12:24<01:35, 1.22it/s, loss=0.0021, lr=2.51e-06, step=9884] Training: 99%|█████████▉| 9885/10000 [2:12:25<01:46, 1.08it/s, loss=0.0021, lr=2.51e-06, step=9884] Training: 99%|█████████▉| 9885/10000 [2:12:25<01:46, 1.08it/s, loss=0.0079, lr=2.51e-06, step=9885] Training: 99%|█████████▉| 9886/10000 [2:12:26<01:47, 1.06it/s, loss=0.0079, lr=2.51e-06, step=9885] Training: 99%|█████████▉| 9886/10000 [2:12:26<01:47, 1.06it/s, loss=0.0095, lr=2.51e-06, step=9886] Training: 99%|█████████▉| 9887/10000 [2:12:26<01:35, 1.18it/s, loss=0.0095, lr=2.51e-06, step=9886] Training: 99%|█████████▉| 9887/10000 [2:12:26<01:35, 1.18it/s, loss=0.0003, lr=2.51e-06, step=9887] Training: 99%|█████████▉| 9888/10000 [2:12:27<01:22, 1.35it/s, loss=0.0003, lr=2.51e-06, step=9887] Training: 99%|█████████▉| 9888/10000 [2:12:27<01:22, 1.35it/s, loss=0.0155, lr=2.51e-06, step=9888] Training: 99%|█████████▉| 9889/10000 [2:12:28<01:24, 1.32it/s, loss=0.0155, lr=2.51e-06, step=9888] Training: 99%|█████████▉| 9889/10000 [2:12:28<01:24, 1.32it/s, loss=0.0082, lr=2.51e-06, step=9889]20:57:00.692 [I] step=9890 loss=0.0051 smoothed_loss=0.0080 lr=2.51e-06 grad_norm=0.4368 step_time=0.6293s data_time=0.1749s it/s=1.244 eta_to_10000=88.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0176 grad_action_out_proj_arms=0.1279 grad_arm_token_fuse=0.0924 grad_shared_expert=0.4272 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9890/10000 [2:12:28<01:20, 1.37it/s, loss=0.0082, lr=2.51e-06, step=9889] Training: 99%|█████████▉| 9890/10000 [2:12:28<01:20, 1.37it/s, loss=0.0051, lr=2.51e-06, step=9890] Training: 99%|█████████▉| 9891/10000 [2:12:29<01:19, 1.37it/s, loss=0.0051, lr=2.51e-06, step=9890] Training: 99%|█████████▉| 9891/10000 [2:12:29<01:19, 1.37it/s, loss=0.0184, lr=2.51e-06, step=9891] Training: 99%|█████████▉| 9892/10000 [2:12:30<01:19, 1.36it/s, loss=0.0184, lr=2.51e-06, step=9891] Training: 99%|█████████▉| 9892/10000 [2:12:30<01:19, 1.36it/s, loss=0.0183, lr=2.51e-06, step=9892] Training: 99%|█████████▉| 9893/10000 [2:12:31<01:22, 1.29it/s, loss=0.0183, lr=2.51e-06, step=9892] Training: 99%|█████████▉| 9893/10000 [2:12:31<01:22, 1.29it/s, loss=0.0068, lr=2.51e-06, step=9893] Training: 99%|█████████▉| 9894/10000 [2:12:32<01:23, 1.27it/s, loss=0.0068, lr=2.51e-06, step=9893] Training: 99%|█████████▉| 9894/10000 [2:12:32<01:23, 1.27it/s, loss=0.0060, lr=2.51e-06, step=9894] Training: 99%|█████████▉| 9895/10000 [2:12:32<01:23, 1.26it/s, loss=0.0060, lr=2.51e-06, step=9894] Training: 99%|█████████▉| 9895/10000 [2:12:32<01:23, 1.26it/s, loss=0.0018, lr=2.51e-06, step=9895] Training: 99%|█████████▉| 9896/10000 [2:12:33<01:14, 1.39it/s, loss=0.0018, lr=2.51e-06, step=9895] Training: 99%|█████████▉| 9896/10000 [2:12:33<01:14, 1.39it/s, loss=0.0032, lr=2.51e-06, step=9896] Training: 99%|█████████▉| 9897/10000 [2:12:34<01:11, 1.44it/s, loss=0.0032, lr=2.51e-06, step=9896] Training: 99%|█████████▉| 9897/10000 [2:12:34<01:11, 1.44it/s, loss=0.0029, lr=2.51e-06, step=9897] Training: 99%|█████████▉| 9898/10000 [2:12:34<01:11, 1.43it/s, loss=0.0029, lr=2.51e-06, step=9897] Training: 99%|█████████▉| 9898/10000 [2:12:34<01:11, 1.43it/s, loss=0.0276, lr=2.51e-06, step=9898] Training: 99%|█████████▉| 9899/10000 [2:12:35<01:05, 1.53it/s, loss=0.0276, lr=2.51e-06, step=9898] Training: 99%|█████████▉| 9899/10000 [2:12:35<01:05, 1.53it/s, loss=0.0125, lr=2.51e-06, step=9899]20:57:08.112 [I] step=9900 loss=0.0275 smoothed_loss=0.0116 lr=2.51e-06 grad_norm=0.4404 step_time=0.6005s data_time=0.1415s it/s=1.348 eta_to_10000=74.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0117 grad_action_out_proj_arms=0.0780 grad_arm_token_fuse=0.0662 grad_shared_expert=0.5700 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9900/10000 [2:12:36<01:16, 1.31it/s, loss=0.0125, lr=2.51e-06, step=9899] Training: 99%|█████████▉| 9900/10000 [2:12:36<01:16, 1.31it/s, loss=0.0275, lr=2.51e-06, step=9900] Training: 99%|█████████▉| 9901/10000 [2:12:37<01:14, 1.33it/s, loss=0.0275, lr=2.51e-06, step=9900] Training: 99%|█████████▉| 9901/10000 [2:12:37<01:14, 1.33it/s, loss=0.0048, lr=2.51e-06, step=9901] Training: 99%|█████████▉| 9902/10000 [2:12:37<01:12, 1.35it/s, loss=0.0048, lr=2.51e-06, step=9901] Training: 99%|█████████▉| 9902/10000 [2:12:37<01:12, 1.35it/s, loss=0.0082, lr=2.51e-06, step=9902] Training: 99%|█████████▉| 9903/10000 [2:12:38<01:04, 1.50it/s, loss=0.0082, lr=2.51e-06, step=9902] Training: 99%|█████████▉| 9903/10000 [2:12:38<01:04, 1.50it/s, loss=0.0079, lr=2.51e-06, step=9903] Training: 99%|█████████▉| 9904/10000 [2:12:38<00:58, 1.63it/s, loss=0.0079, lr=2.51e-06, step=9903] Training: 99%|█████████▉| 9904/10000 [2:12:38<00:58, 1.63it/s, loss=0.0230, lr=2.51e-06, step=9904] Training: 99%|█████████▉| 9905/10000 [2:12:39<01:05, 1.46it/s, loss=0.0230, lr=2.51e-06, step=9904] Training: 99%|█████████▉| 9905/10000 [2:12:39<01:05, 1.46it/s, loss=0.0034, lr=2.51e-06, step=9905] Training: 99%|█████████▉| 9906/10000 [2:12:40<01:02, 1.50it/s, loss=0.0034, lr=2.51e-06, step=9905] Training: 99%|█████████▉| 9906/10000 [2:12:40<01:02, 1.50it/s, loss=0.0023, lr=2.51e-06, step=9906] Training: 99%|█████████▉| 9907/10000 [2:12:41<01:06, 1.40it/s, loss=0.0023, lr=2.51e-06, step=9906] Training: 99%|█████████▉| 9907/10000 [2:12:41<01:06, 1.40it/s, loss=0.0102, lr=2.51e-06, step=9907] Training: 99%|█████████▉| 9908/10000 [2:12:41<01:01, 1.50it/s, loss=0.0102, lr=2.51e-06, step=9907] Training: 99%|█████████▉| 9908/10000 [2:12:41<01:01, 1.50it/s, loss=0.0011, lr=2.51e-06, step=9908] Training: 99%|█████████▉| 9909/10000 [2:12:42<01:05, 1.40it/s, loss=0.0011, lr=2.51e-06, step=9908] Training: 99%|█████████▉| 9909/10000 [2:12:42<01:05, 1.40it/s, loss=0.0068, lr=2.51e-06, step=9909]20:57:14.792 [I] step=9910 loss=0.0895 smoothed_loss=0.0169 lr=2.51e-06 grad_norm=0.3966 step_time=0.5523s data_time=0.1157s it/s=1.497 eta_to_10000=60.1s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0340 grad_action_out_proj_arms=0.1855 grad_arm_token_fuse=0.1967 grad_shared_expert=0.4599 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9910/10000 [2:12:43<01:05, 1.37it/s, loss=0.0068, lr=2.51e-06, step=9909] Training: 99%|█████████▉| 9910/10000 [2:12:43<01:05, 1.37it/s, loss=0.0895, lr=2.51e-06, step=9910] Training: 99%|█████████▉| 9911/10000 [2:12:43<01:06, 1.34it/s, loss=0.0895, lr=2.51e-06, step=9910] Training: 99%|█████████▉| 9911/10000 [2:12:43<01:06, 1.34it/s, loss=0.0135, lr=2.50e-06, step=9911] Training: 99%|█████████▉| 9912/10000 [2:12:44<01:10, 1.25it/s, loss=0.0135, lr=2.50e-06, step=9911] Training: 99%|█████████▉| 9912/10000 [2:12:44<01:10, 1.25it/s, loss=0.0112, lr=2.50e-06, step=9912] Training: 99%|█████████▉| 9913/10000 [2:12:45<01:06, 1.31it/s, loss=0.0112, lr=2.50e-06, step=9912] Training: 99%|█████████▉| 9913/10000 [2:12:45<01:06, 1.31it/s, loss=0.0055, lr=2.50e-06, step=9913] Training: 99%|█████████▉| 9914/10000 [2:12:46<01:06, 1.29it/s, loss=0.0055, lr=2.50e-06, step=9913] Training: 99%|█████████▉| 9914/10000 [2:12:46<01:06, 1.29it/s, loss=0.0080, lr=2.50e-06, step=9914] Training: 99%|█████████▉| 9915/10000 [2:12:47<01:09, 1.22it/s, loss=0.0080, lr=2.50e-06, step=9914] Training: 99%|█████████▉| 9915/10000 [2:12:47<01:09, 1.22it/s, loss=0.0276, lr=2.50e-06, step=9915] Training: 99%|█████████▉| 9916/10000 [2:12:47<01:04, 1.30it/s, loss=0.0276, lr=2.50e-06, step=9915] Training: 99%|█████████▉| 9916/10000 [2:12:47<01:04, 1.30it/s, loss=0.0256, lr=2.50e-06, step=9916] Training: 99%|█████████▉| 9917/10000 [2:12:48<01:02, 1.33it/s, loss=0.0256, lr=2.50e-06, step=9916] Training: 99%|█████████▉| 9917/10000 [2:12:48<01:02, 1.33it/s, loss=0.0071, lr=2.50e-06, step=9917] Training: 99%|█████████▉| 9918/10000 [2:12:49<00:56, 1.46it/s, loss=0.0071, lr=2.50e-06, step=9917] Training: 99%|█████████▉| 9918/10000 [2:12:49<00:56, 1.46it/s, loss=0.0026, lr=2.50e-06, step=9918] Training: 99%|█████████▉| 9919/10000 [2:12:50<01:02, 1.29it/s, loss=0.0026, lr=2.50e-06, step=9918] Training: 99%|█████████▉| 9919/10000 [2:12:50<01:02, 1.29it/s, loss=0.0039, lr=2.50e-06, step=9919]20:57:22.734 [I] step=9920 loss=0.0010 smoothed_loss=0.0121 lr=2.50e-06 grad_norm=0.4060 step_time=0.6385s data_time=0.1557s it/s=1.291 eta_to_10000=62.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0168 grad_action_out_proj_arms=0.1197 grad_arm_token_fuse=0.0913 grad_shared_expert=0.2734 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9920/10000 [2:12:50<01:01, 1.30it/s, loss=0.0039, lr=2.50e-06, step=9919] Training: 99%|█████████▉| 9920/10000 [2:12:50<01:01, 1.30it/s, loss=0.0010, lr=2.50e-06, step=9920] Training: 99%|█████████▉| 9921/10000 [2:12:51<01:04, 1.22it/s, loss=0.0010, lr=2.50e-06, step=9920] Training: 99%|█████████▉| 9921/10000 [2:12:51<01:04, 1.22it/s, loss=0.0040, lr=2.50e-06, step=9921] Training: 99%|█████████▉| 9922/10000 [2:12:52<01:01, 1.27it/s, loss=0.0040, lr=2.50e-06, step=9921] Training: 99%|█████████▉| 9922/10000 [2:12:52<01:01, 1.27it/s, loss=0.0089, lr=2.50e-06, step=9922] Training: 99%|█████████▉| 9923/10000 [2:12:53<00:54, 1.42it/s, loss=0.0089, lr=2.50e-06, step=9922] Training: 99%|█████████▉| 9923/10000 [2:12:53<00:54, 1.42it/s, loss=0.0057, lr=2.50e-06, step=9923] Training: 99%|█████████▉| 9924/10000 [2:12:53<00:52, 1.45it/s, loss=0.0057, lr=2.50e-06, step=9923] Training: 99%|█████████▉| 9924/10000 [2:12:53<00:52, 1.45it/s, loss=0.0103, lr=2.50e-06, step=9924] Training: 99%|█████████▉| 9925/10000 [2:12:54<00:55, 1.36it/s, loss=0.0103, lr=2.50e-06, step=9924] Training: 99%|█████████▉| 9925/10000 [2:12:54<00:55, 1.36it/s, loss=0.0029, lr=2.50e-06, step=9925] Training: 99%|█████████▉| 9926/10000 [2:12:55<01:00, 1.23it/s, loss=0.0029, lr=2.50e-06, step=9925] Training: 99%|█████████▉| 9926/10000 [2:12:55<01:00, 1.23it/s, loss=0.0275, lr=2.50e-06, step=9926] Training: 99%|█████████▉| 9927/10000 [2:12:56<00:52, 1.40it/s, loss=0.0275, lr=2.50e-06, step=9926] Training: 99%|█████████▉| 9927/10000 [2:12:56<00:52, 1.40it/s, loss=0.0053, lr=2.50e-06, step=9927] Training: 99%|█████████▉| 9928/10000 [2:12:56<00:55, 1.31it/s, loss=0.0053, lr=2.50e-06, step=9927] Training: 99%|█████████▉| 9928/10000 [2:12:56<00:55, 1.31it/s, loss=0.0255, lr=2.50e-06, step=9928] Training: 99%|█████████▉| 9929/10000 [2:12:57<00:55, 1.29it/s, loss=0.0255, lr=2.50e-06, step=9928] Training: 99%|█████████▉| 9929/10000 [2:12:57<00:55, 1.29it/s, loss=0.0081, lr=2.50e-06, step=9929]20:57:30.526 [I] step=9930 loss=0.0087 smoothed_loss=0.0116 lr=2.50e-06 grad_norm=0.3961 step_time=0.6442s data_time=0.1350s it/s=1.284 eta_to_10000=54.5s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0067 grad_action_out_proj_arms=0.0760 grad_arm_token_fuse=0.0369 grad_shared_expert=0.2789 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9930/10000 [2:12:58<00:58, 1.20it/s, loss=0.0081, lr=2.50e-06, step=9929] Training: 99%|█████████▉| 9930/10000 [2:12:58<00:58, 1.20it/s, loss=0.0087, lr=2.50e-06, step=9930] Training: 99%|█████████▉| 9931/10000 [2:12:59<00:51, 1.33it/s, loss=0.0087, lr=2.50e-06, step=9930] Training: 99%|█████████▉| 9931/10000 [2:12:59<00:51, 1.33it/s, loss=0.0154, lr=2.50e-06, step=9931] Training: 99%|█████████▉| 9932/10000 [2:12:59<00:45, 1.49it/s, loss=0.0154, lr=2.50e-06, step=9931] Training: 99%|█████████▉| 9932/10000 [2:12:59<00:45, 1.49it/s, loss=0.0073, lr=2.50e-06, step=9932] Training: 99%|█████████▉| 9933/10000 [2:13:00<00:44, 1.50it/s, loss=0.0073, lr=2.50e-06, step=9932] Training: 99%|█████████▉| 9933/10000 [2:13:00<00:44, 1.50it/s, loss=0.0044, lr=2.50e-06, step=9933] Training: 99%|█████████▉| 9934/10000 [2:13:00<00:40, 1.63it/s, loss=0.0044, lr=2.50e-06, step=9933] Training: 99%|█████████▉| 9934/10000 [2:13:00<00:40, 1.63it/s, loss=0.0059, lr=2.50e-06, step=9934] Training: 99%|█████████▉| 9935/10000 [2:13:01<00:41, 1.58it/s, loss=0.0059, lr=2.50e-06, step=9934] Training: 99%|█████████▉| 9935/10000 [2:13:01<00:41, 1.58it/s, loss=0.0076, lr=2.50e-06, step=9935] Training: 99%|█████████▉| 9936/10000 [2:13:02<00:46, 1.37it/s, loss=0.0076, lr=2.50e-06, step=9935] Training: 99%|█████████▉| 9936/10000 [2:13:02<00:46, 1.37it/s, loss=0.0090, lr=2.50e-06, step=9936] Training: 99%|█████████▉| 9937/10000 [2:13:03<00:42, 1.48it/s, loss=0.0090, lr=2.50e-06, step=9936] Training: 99%|█████████▉| 9937/10000 [2:13:03<00:42, 1.48it/s, loss=0.0150, lr=2.50e-06, step=9937] Training: 99%|█████████▉| 9938/10000 [2:13:03<00:43, 1.43it/s, loss=0.0150, lr=2.50e-06, step=9937] Training: 99%|█████████▉| 9938/10000 [2:13:03<00:43, 1.43it/s, loss=0.0025, lr=2.50e-06, step=9938] Training: 99%|█████████▉| 9939/10000 [2:13:04<00:46, 1.30it/s, loss=0.0025, lr=2.50e-06, step=9938] Training: 99%|█████████▉| 9939/10000 [2:13:04<00:46, 1.30it/s, loss=0.0107, lr=2.50e-06, step=9939]20:57:37.261 [I] step=9940 loss=0.0048 smoothed_loss=0.0093 lr=2.50e-06 grad_norm=0.5094 step_time=0.5468s data_time=0.1266s it/s=1.485 eta_to_10000=40.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0169 grad_action_out_proj_arms=0.0987 grad_arm_token_fuse=0.0887 grad_shared_expert=0.4413 (18633:train_pytorch.py:850) + Training: 99%|█████████▉| 9940/10000 [2:13:05<00:44, 1.35it/s, loss=0.0107, lr=2.50e-06, step=9939] Training: 99%|█████████▉| 9940/10000 [2:13:05<00:44, 1.35it/s, loss=0.0048, lr=2.50e-06, step=9940] Training: 99%|█████████▉| 9941/10000 [2:13:06<00:42, 1.39it/s, loss=0.0048, lr=2.50e-06, step=9940] Training: 99%|█████████▉| 9941/10000 [2:13:06<00:42, 1.39it/s, loss=0.0010, lr=2.50e-06, step=9941] Training: 99%|█████████▉| 9942/10000 [2:13:06<00:40, 1.45it/s, loss=0.0010, lr=2.50e-06, step=9941] Training: 99%|█████████▉| 9942/10000 [2:13:06<00:40, 1.45it/s, loss=0.0026, lr=2.50e-06, step=9942] Training: 99%|█████████▉| 9943/10000 [2:13:07<00:39, 1.43it/s, loss=0.0026, lr=2.50e-06, step=9942] Training: 99%|█████████▉| 9943/10000 [2:13:07<00:39, 1.43it/s, loss=0.0110, lr=2.50e-06, step=9943] Training: 99%|█████████▉| 9944/10000 [2:13:08<00:43, 1.30it/s, loss=0.0110, lr=2.50e-06, step=9943] Training: 99%|█████████▉| 9944/10000 [2:13:08<00:43, 1.30it/s, loss=0.0202, lr=2.50e-06, step=9944] Training: 99%|█████████▉| 9945/10000 [2:13:08<00:37, 1.45it/s, loss=0.0202, lr=2.50e-06, step=9944] Training: 99%|█████████▉| 9945/10000 [2:13:08<00:37, 1.45it/s, loss=0.0041, lr=2.50e-06, step=9945] Training: 99%|█████████▉| 9946/10000 [2:13:09<00:37, 1.45it/s, loss=0.0041, lr=2.50e-06, step=9945] Training: 99%|█████████▉| 9946/10000 [2:13:09<00:37, 1.45it/s, loss=0.0035, lr=2.50e-06, step=9946] Training: 99%|█████████▉| 9947/10000 [2:13:10<00:34, 1.56it/s, loss=0.0035, lr=2.50e-06, step=9946] Training: 99%|█████████▉| 9947/10000 [2:13:10<00:34, 1.56it/s, loss=0.0710, lr=2.50e-06, step=9947] Training: 99%|█████████▉| 9948/10000 [2:13:10<00:31, 1.63it/s, loss=0.0710, lr=2.50e-06, step=9947] Training: 99%|█████████▉| 9948/10000 [2:13:10<00:31, 1.63it/s, loss=0.0064, lr=2.50e-06, step=9948] Training: 99%|█████████▉| 9949/10000 [2:13:11<00:32, 1.56it/s, loss=0.0064, lr=2.50e-06, step=9948] Training: 99%|█████████▉| 9949/10000 [2:13:11<00:32, 1.56it/s, loss=0.0037, lr=2.50e-06, step=9949]20:57:44.136 [I] step=9950 loss=0.0041 smoothed_loss=0.0119 lr=2.50e-06 grad_norm=0.5186 step_time=0.5790s data_time=0.1085s it/s=1.455 eta_to_10000=34.4s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0113 grad_action_out_proj_arms=0.1369 grad_arm_token_fuse=0.0550 grad_shared_expert=0.9936 (18633:train_pytorch.py:850) + Training: 100%|█████████▉| 9950/10000 [2:13:12<00:36, 1.36it/s, loss=0.0037, lr=2.50e-06, step=9949] Training: 100%|█████████▉| 9950/10000 [2:13:12<00:36, 1.36it/s, loss=0.0041, lr=2.50e-06, step=9950] Training: 100%|█████████▉| 9951/10000 [2:13:13<00:35, 1.38it/s, loss=0.0041, lr=2.50e-06, step=9950] Training: 100%|█████████▉| 9951/10000 [2:13:13<00:35, 1.38it/s, loss=0.0844, lr=2.50e-06, step=9951] Training: 100%|█████████▉| 9952/10000 [2:13:13<00:31, 1.52it/s, loss=0.0844, lr=2.50e-06, step=9951] Training: 100%|█████████▉| 9952/10000 [2:13:13<00:31, 1.52it/s, loss=0.0056, lr=2.50e-06, step=9952] Training: 100%|█████████▉| 9953/10000 [2:13:14<00:34, 1.37it/s, loss=0.0056, lr=2.50e-06, step=9952] Training: 100%|█████████▉| 9953/10000 [2:13:14<00:34, 1.37it/s, loss=0.0070, lr=2.50e-06, step=9953] Training: 100%|█████████▉| 9954/10000 [2:13:15<00:34, 1.33it/s, loss=0.0070, lr=2.50e-06, step=9953] Training: 100%|█████████▉| 9954/10000 [2:13:15<00:34, 1.33it/s, loss=0.0044, lr=2.50e-06, step=9954] Training: 100%|█████████▉| 9955/10000 [2:13:16<00:34, 1.30it/s, loss=0.0044, lr=2.50e-06, step=9954] Training: 100%|█████████▉| 9955/10000 [2:13:16<00:34, 1.30it/s, loss=0.0081, lr=2.50e-06, step=9955] Training: 100%|█████████▉| 9956/10000 [2:13:16<00:35, 1.23it/s, loss=0.0081, lr=2.50e-06, step=9955] Training: 100%|█████████▉| 9956/10000 [2:13:16<00:35, 1.23it/s, loss=0.0037, lr=2.50e-06, step=9956] Training: 100%|█████████▉| 9957/10000 [2:13:17<00:37, 1.16it/s, loss=0.0037, lr=2.50e-06, step=9956] Training: 100%|█████████▉| 9957/10000 [2:13:17<00:37, 1.16it/s, loss=0.0123, lr=2.50e-06, step=9957] Training: 100%|█████████▉| 9958/10000 [2:13:18<00:34, 1.23it/s, loss=0.0123, lr=2.50e-06, step=9957] Training: 100%|█████████▉| 9958/10000 [2:13:18<00:34, 1.23it/s, loss=0.0339, lr=2.50e-06, step=9958] Training: 100%|█████████▉| 9959/10000 [2:13:19<00:32, 1.26it/s, loss=0.0339, lr=2.50e-06, step=9958] Training: 100%|█████████▉| 9959/10000 [2:13:19<00:32, 1.26it/s, loss=0.0012, lr=2.50e-06, step=9959]20:57:51.812 [I] step=9960 loss=0.0336 smoothed_loss=0.0161 lr=2.50e-06 grad_norm=0.4095 step_time=0.6125s data_time=0.1551s it/s=1.303 eta_to_10000=30.7s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0322 grad_action_out_proj_arms=0.1669 grad_arm_token_fuse=0.1656 grad_shared_expert=0.8340 (18633:train_pytorch.py:850) + Training: 100%|█████████▉| 9960/10000 [2:13:19<00:29, 1.35it/s, loss=0.0012, lr=2.50e-06, step=9959] Training: 100%|█████████▉| 9960/10000 [2:13:19<00:29, 1.35it/s, loss=0.0336, lr=2.50e-06, step=9960] Training: 100%|█████████▉| 9961/10000 [2:13:20<00:26, 1.48it/s, loss=0.0336, lr=2.50e-06, step=9960] Training: 100%|█████████▉| 9961/10000 [2:13:20<00:26, 1.48it/s, loss=0.0165, lr=2.50e-06, step=9961] Training: 100%|█████████▉| 9962/10000 [2:13:21<00:23, 1.60it/s, loss=0.0165, lr=2.50e-06, step=9961] Training: 100%|█████████▉| 9962/10000 [2:13:21<00:23, 1.60it/s, loss=0.0087, lr=2.50e-06, step=9962] Training: 100%|█████████▉| 9963/10000 [2:13:22<00:27, 1.34it/s, loss=0.0087, lr=2.50e-06, step=9962] Training: 100%|█████████▉| 9963/10000 [2:13:22<00:27, 1.34it/s, loss=0.0026, lr=2.50e-06, step=9963] Training: 100%|█████████▉| 9964/10000 [2:13:23<00:29, 1.20it/s, loss=0.0026, lr=2.50e-06, step=9963] Training: 100%|█████████▉| 9964/10000 [2:13:23<00:29, 1.20it/s, loss=0.0080, lr=2.50e-06, step=9964] Training: 100%|█████████▉| 9965/10000 [2:13:24<00:30, 1.13it/s, loss=0.0080, lr=2.50e-06, step=9964] Training: 100%|█████████▉| 9965/10000 [2:13:24<00:30, 1.13it/s, loss=0.0013, lr=2.50e-06, step=9965] Training: 100%|█████████▉| 9966/10000 [2:13:24<00:30, 1.12it/s, loss=0.0013, lr=2.50e-06, step=9965] Training: 100%|█████████▉| 9966/10000 [2:13:24<00:30, 1.12it/s, loss=0.0029, lr=2.50e-06, step=9966] Training: 100%|█████████▉| 9967/10000 [2:13:25<00:28, 1.17it/s, loss=0.0029, lr=2.50e-06, step=9966] Training: 100%|█████████▉| 9967/10000 [2:13:25<00:28, 1.17it/s, loss=0.0071, lr=2.50e-06, step=9967] Training: 100%|█████████▉| 9968/10000 [2:13:26<00:26, 1.22it/s, loss=0.0071, lr=2.50e-06, step=9967] Training: 100%|█████████▉| 9968/10000 [2:13:26<00:26, 1.22it/s, loss=0.0089, lr=2.50e-06, step=9968] Training: 100%|█████████▉| 9969/10000 [2:13:27<00:24, 1.29it/s, loss=0.0089, lr=2.50e-06, step=9968] Training: 100%|█████████▉| 9969/10000 [2:13:27<00:24, 1.29it/s, loss=0.0132, lr=2.50e-06, step=9969]20:57:59.813 [I] step=9970 loss=0.0142 smoothed_loss=0.0113 lr=2.50e-06 grad_norm=0.4276 step_time=0.6209s data_time=0.1792s it/s=1.250 eta_to_10000=24.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0120 grad_action_out_proj_arms=0.1224 grad_arm_token_fuse=0.0636 grad_shared_expert=0.4228 (18633:train_pytorch.py:850) + Training: 100%|█████████▉| 9970/10000 [2:13:27<00:23, 1.27it/s, loss=0.0132, lr=2.50e-06, step=9969] Training: 100%|█████████▉| 9970/10000 [2:13:27<00:23, 1.27it/s, loss=0.0142, lr=2.50e-06, step=9970] Training: 100%|█████████▉| 9971/10000 [2:13:28<00:23, 1.24it/s, loss=0.0142, lr=2.50e-06, step=9970] Training: 100%|█████████▉| 9971/10000 [2:13:28<00:23, 1.24it/s, loss=0.0041, lr=2.50e-06, step=9971] Training: 100%|█████████▉| 9972/10000 [2:13:29<00:25, 1.11it/s, loss=0.0041, lr=2.50e-06, step=9971] Training: 100%|█████████▉| 9972/10000 [2:13:29<00:25, 1.11it/s, loss=0.0108, lr=2.50e-06, step=9972] Training: 100%|█████████▉| 9973/10000 [2:13:30<00:22, 1.21it/s, loss=0.0108, lr=2.50e-06, step=9972] Training: 100%|█████████▉| 9973/10000 [2:13:30<00:22, 1.21it/s, loss=0.0641, lr=2.50e-06, step=9973] Training: 100%|█████████▉| 9974/10000 [2:13:31<00:20, 1.24it/s, loss=0.0641, lr=2.50e-06, step=9973] Training: 100%|█████████▉| 9974/10000 [2:13:31<00:20, 1.24it/s, loss=0.0083, lr=2.50e-06, step=9974] Training: 100%|█████████▉| 9975/10000 [2:13:31<00:18, 1.38it/s, loss=0.0083, lr=2.50e-06, step=9974] Training: 100%|█████████▉| 9975/10000 [2:13:31<00:18, 1.38it/s, loss=0.0020, lr=2.50e-06, step=9975] Training: 100%|█████████▉| 9976/10000 [2:13:32<00:15, 1.52it/s, loss=0.0020, lr=2.50e-06, step=9975] Training: 100%|█████████▉| 9976/10000 [2:13:32<00:15, 1.52it/s, loss=0.0023, lr=2.50e-06, step=9976] Training: 100%|█████████▉| 9977/10000 [2:13:32<00:14, 1.64it/s, loss=0.0023, lr=2.50e-06, step=9976] Training: 100%|█████████▉| 9977/10000 [2:13:32<00:14, 1.64it/s, loss=0.0080, lr=2.50e-06, step=9977] Training: 100%|█████████▉| 9978/10000 [2:13:33<00:15, 1.41it/s, loss=0.0080, lr=2.50e-06, step=9977] Training: 100%|█████████▉| 9978/10000 [2:13:33<00:15, 1.41it/s, loss=0.0011, lr=2.50e-06, step=9978] Training: 100%|█████████▉| 9979/10000 [2:13:34<00:17, 1.21it/s, loss=0.0011, lr=2.50e-06, step=9978] Training: 100%|█████████▉| 9979/10000 [2:13:34<00:17, 1.21it/s, loss=0.0015, lr=2.50e-06, step=9979]20:58:07.396 [I] step=9980 loss=0.0031 smoothed_loss=0.0095 lr=2.50e-06 grad_norm=0.3769 step_time=0.6279s data_time=0.1305s it/s=1.319 eta_to_10000=15.2s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0020 grad_action_out_proj_arms=0.0264 grad_arm_token_fuse=0.0107 grad_shared_expert=0.1136 (18633:train_pytorch.py:850) + Training: 100%|█████████▉| 9980/10000 [2:13:35<00:15, 1.31it/s, loss=0.0015, lr=2.50e-06, step=9979] Training: 100%|█████████▉| 9980/10000 [2:13:35<00:15, 1.31it/s, loss=0.0031, lr=2.50e-06, step=9980] Training: 100%|█████████▉| 9981/10000 [2:13:36<00:14, 1.34it/s, loss=0.0031, lr=2.50e-06, step=9980] Training: 100%|█████████▉| 9981/10000 [2:13:36<00:14, 1.34it/s, loss=0.0043, lr=2.50e-06, step=9981] Training: 100%|█████████▉| 9982/10000 [2:13:36<00:12, 1.47it/s, loss=0.0043, lr=2.50e-06, step=9981] Training: 100%|█████████▉| 9982/10000 [2:13:36<00:12, 1.47it/s, loss=0.0093, lr=2.50e-06, step=9982] Training: 100%|█████████▉| 9983/10000 [2:13:37<00:11, 1.52it/s, loss=0.0093, lr=2.50e-06, step=9982] Training: 100%|█████████▉| 9983/10000 [2:13:37<00:11, 1.52it/s, loss=0.0019, lr=2.50e-06, step=9983] Training: 100%|█████████▉| 9984/10000 [2:13:38<00:11, 1.39it/s, loss=0.0019, lr=2.50e-06, step=9983] Training: 100%|█████████▉| 9984/10000 [2:13:38<00:11, 1.39it/s, loss=0.0075, lr=2.50e-06, step=9984] Training: 100%|█████████▉| 9985/10000 [2:13:39<00:11, 1.27it/s, loss=0.0075, lr=2.50e-06, step=9984] Training: 100%|█████████▉| 9985/10000 [2:13:39<00:11, 1.27it/s, loss=0.0019, lr=2.50e-06, step=9985] Training: 100%|█████████▉| 9986/10000 [2:13:40<00:11, 1.25it/s, loss=0.0019, lr=2.50e-06, step=9985] Training: 100%|█████████▉| 9986/10000 [2:13:40<00:11, 1.25it/s, loss=0.0009, lr=2.50e-06, step=9986] Training: 100%|█████████▉| 9987/10000 [2:13:40<00:09, 1.36it/s, loss=0.0009, lr=2.50e-06, step=9986] Training: 100%|█████████▉| 9987/10000 [2:13:40<00:09, 1.36it/s, loss=0.0042, lr=2.50e-06, step=9987] Training: 100%|█████████▉| 9988/10000 [2:13:41<00:08, 1.34it/s, loss=0.0042, lr=2.50e-06, step=9987] Training: 100%|█████████▉| 9988/10000 [2:13:41<00:08, 1.34it/s, loss=0.0117, lr=2.50e-06, step=9988] Training: 100%|█████████▉| 9989/10000 [2:13:42<00:08, 1.29it/s, loss=0.0117, lr=2.50e-06, step=9988] Training: 100%|█████████▉| 9989/10000 [2:13:42<00:08, 1.29it/s, loss=0.0059, lr=2.50e-06, step=9989]20:58:14.664 [I] step=9990 loss=0.0030 smoothed_loss=0.0066 lr=2.50e-06 grad_norm=0.4428 step_time=0.6006s data_time=0.1262s it/s=1.376 eta_to_10000=7.3s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0084 grad_action_out_proj_arms=0.0666 grad_arm_token_fuse=0.0445 grad_shared_expert=0.4760 (18633:train_pytorch.py:850) + Training: 100%|█████████▉| 9990/10000 [2:13:42<00:07, 1.39it/s, loss=0.0059, lr=2.50e-06, step=9989] Training: 100%|█████████▉| 9990/10000 [2:13:42<00:07, 1.39it/s, loss=0.0030, lr=2.50e-06, step=9990] Training: 100%|█████████▉| 9991/10000 [2:13:43<00:06, 1.30it/s, loss=0.0030, lr=2.50e-06, step=9990] Training: 100%|█████████▉| 9991/10000 [2:13:43<00:06, 1.30it/s, loss=0.0241, lr=2.50e-06, step=9991] Training: 100%|█████████▉| 9992/10000 [2:13:44<00:06, 1.15it/s, loss=0.0241, lr=2.50e-06, step=9991] Training: 100%|█████████▉| 9992/10000 [2:13:44<00:06, 1.15it/s, loss=0.0044, lr=2.50e-06, step=9992] Training: 100%|█████████▉| 9993/10000 [2:13:45<00:06, 1.05it/s, loss=0.0044, lr=2.50e-06, step=9992] Training: 100%|█████████▉| 9993/10000 [2:13:45<00:06, 1.05it/s, loss=0.0027, lr=2.50e-06, step=9993] Training: 100%|█████████▉| 9994/10000 [2:13:46<00:05, 1.07it/s, loss=0.0027, lr=2.50e-06, step=9993] Training: 100%|█████████▉| 9994/10000 [2:13:46<00:05, 1.07it/s, loss=0.0019, lr=2.50e-06, step=9994] Training: 100%|█████████▉| 9995/10000 [2:13:47<00:04, 1.07it/s, loss=0.0019, lr=2.50e-06, step=9994] Training: 100%|█████████▉| 9995/10000 [2:13:47<00:04, 1.07it/s, loss=0.0053, lr=2.50e-06, step=9995] Training: 100%|█████████▉| 9996/10000 [2:13:48<00:03, 1.06it/s, loss=0.0053, lr=2.50e-06, step=9995] Training: 100%|█████████▉| 9996/10000 [2:13:48<00:03, 1.06it/s, loss=0.0039, lr=2.50e-06, step=9996] Training: 100%|█████████▉| 9997/10000 [2:13:49<00:02, 1.13it/s, loss=0.0039, lr=2.50e-06, step=9996] Training: 100%|█████████▉| 9997/10000 [2:13:49<00:02, 1.13it/s, loss=0.0080, lr=2.50e-06, step=9997] Training: 100%|█████████▉| 9998/10000 [2:13:50<00:01, 1.27it/s, loss=0.0080, lr=2.50e-06, step=9997] Training: 100%|█████████▉| 9998/10000 [2:13:50<00:01, 1.27it/s, loss=0.0022, lr=2.50e-06, step=9998] Training: 100%|█████████▉| 9999/10000 [2:13:50<00:00, 1.24it/s, loss=0.0022, lr=2.50e-06, step=9998] Training: 100%|█████████▉| 9999/10000 [2:13:50<00:00, 1.24it/s, loss=0.1173, lr=2.50e-06, step=9999]20:58:23.797 [I] step=10000 loss=0.0140 smoothed_loss=0.0169 lr=2.50e-06 grad_norm=0.4269 step_time=0.6919s data_time=0.2213s it/s=1.095 eta_to_10000=0.0s max_cuda_memory=35.27GB grad_action_in_proj_arms=0.0121 grad_action_out_proj_arms=0.1277 grad_arm_token_fuse=0.0634 grad_shared_expert=0.4071 (18633:train_pytorch.py:850) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +21:01:52.032 [I] Saved checkpoint at step 10000 -> /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000 (18633:train_pytorch.py:350) + Training: 100%|██████████| 10000/10000 [2:17:20<00:00, 63.35s/it, loss=0.1173, lr=2.50e-06, step=9999] Training: 100%|██████████| 10000/10000 [2:17:20<00:00, 63.35s/it, loss=0.0140, lr=2.50e-06, step=1e+4] Training: 100%|██████████| 10000/10000 [2:17:20<00:00, 1.21it/s, loss=0.0140, lr=2.50e-06, step=1e+4] +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_1000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_1000.log new file mode 100644 index 0000000000000000000000000000000000000000..b277f0474cd3352448ebca7783891fef17dbb9be --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_1000.log @@ -0,0 +1,148 @@ +starting_eval config=pi05_twin_handover_256_packed_parallel_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=50 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.049506 left_arm_loss=0.040621 right_arm_loss=0.058391 imbalance=0.017770 batch_time_s=4.6353 +eval_batch=2 loss=0.013146 left_arm_loss=0.013595 right_arm_loss=0.012698 imbalance=0.000897 batch_time_s=0.2253 +eval_batch=3 loss=0.014637 left_arm_loss=0.019903 right_arm_loss=0.009370 imbalance=0.010533 batch_time_s=0.2946 +eval_batch=4 loss=0.064632 left_arm_loss=0.061204 right_arm_loss=0.068061 imbalance=0.006857 batch_time_s=0.2289 +eval_batch=5 loss=0.049220 left_arm_loss=0.068132 right_arm_loss=0.030307 imbalance=0.037825 batch_time_s=0.2334 +eval_batch=6 loss=0.044489 left_arm_loss=0.084255 right_arm_loss=0.004724 imbalance=0.079531 batch_time_s=0.3232 +eval_batch=7 loss=0.038667 left_arm_loss=0.073409 right_arm_loss=0.003924 imbalance=0.069485 batch_time_s=0.2285 +eval_batch=8 loss=0.018589 left_arm_loss=0.034451 right_arm_loss=0.002728 imbalance=0.031723 batch_time_s=0.2299 +eval_batch=9 loss=0.025908 left_arm_loss=0.049782 right_arm_loss=0.002034 imbalance=0.047748 batch_time_s=0.2356 +eval_batch=10 loss=0.035559 left_arm_loss=0.068822 right_arm_loss=0.002296 imbalance=0.066526 batch_time_s=0.2449 +eval_batch=11 loss=0.030806 left_arm_loss=0.058047 right_arm_loss=0.003565 imbalance=0.054483 batch_time_s=0.3058 +eval_batch=12 loss=0.047394 left_arm_loss=0.090843 right_arm_loss=0.003945 imbalance=0.086899 batch_time_s=0.2833 +eval_batch=13 loss=0.049660 left_arm_loss=0.095403 right_arm_loss=0.003917 imbalance=0.091486 batch_time_s=0.2489 +eval_batch=14 loss=0.061841 left_arm_loss=0.104474 right_arm_loss=0.019209 imbalance=0.085265 batch_time_s=0.2382 +eval_batch=15 loss=0.085757 left_arm_loss=0.037049 right_arm_loss=0.134464 imbalance=0.097415 batch_time_s=0.2364 +eval_batch=16 loss=0.076827 left_arm_loss=0.045344 right_arm_loss=0.108310 imbalance=0.062966 batch_time_s=0.2900 +eval_batch=17 loss=0.056418 left_arm_loss=0.100516 right_arm_loss=0.012320 imbalance=0.088197 batch_time_s=0.4810 +eval_batch=18 loss=0.070686 left_arm_loss=0.076775 right_arm_loss=0.064597 imbalance=0.012178 batch_time_s=0.2382 +eval_batch=19 loss=0.033053 left_arm_loss=0.041608 right_arm_loss=0.024499 imbalance=0.017110 batch_time_s=0.2385 +eval_batch=20 loss=0.031012 left_arm_loss=0.045658 right_arm_loss=0.016366 imbalance=0.029292 batch_time_s=0.2304 +eval_batch=21 loss=0.028765 left_arm_loss=0.044768 right_arm_loss=0.012761 imbalance=0.032007 batch_time_s=0.2992 +eval_batch=22 loss=0.057293 left_arm_loss=0.061262 right_arm_loss=0.053323 imbalance=0.007940 batch_time_s=0.2391 +eval_batch=23 loss=0.094658 left_arm_loss=0.165888 right_arm_loss=0.023429 imbalance=0.142458 batch_time_s=0.3353 +eval_batch=24 loss=0.097680 left_arm_loss=0.184031 right_arm_loss=0.011328 imbalance=0.172703 batch_time_s=0.3058 +eval_batch=25 loss=0.064214 left_arm_loss=0.125794 right_arm_loss=0.002633 imbalance=0.123161 batch_time_s=0.3103 +eval_batch=26 loss=0.029143 left_arm_loss=0.050796 right_arm_loss=0.007489 imbalance=0.043307 batch_time_s=0.7111 +eval_batch=27 loss=0.036844 left_arm_loss=0.063446 right_arm_loss=0.010242 imbalance=0.053204 batch_time_s=0.3351 +eval_batch=28 loss=0.031578 left_arm_loss=0.060362 right_arm_loss=0.002794 imbalance=0.057568 batch_time_s=0.2335 +eval_batch=29 loss=0.047676 left_arm_loss=0.092382 right_arm_loss=0.002970 imbalance=0.089412 batch_time_s=0.2433 +eval_batch=30 loss=0.084667 left_arm_loss=0.165372 right_arm_loss=0.003963 imbalance=0.161408 batch_time_s=0.2322 +eval_batch=31 loss=0.159263 left_arm_loss=0.298709 right_arm_loss=0.019817 imbalance=0.278892 batch_time_s=0.2344 +eval_batch=32 loss=0.112677 left_arm_loss=0.118272 right_arm_loss=0.107082 imbalance=0.011190 batch_time_s=0.3530 +eval_batch=33 loss=0.068681 left_arm_loss=0.031034 right_arm_loss=0.106329 imbalance=0.075295 batch_time_s=0.2477 +eval_batch=34 loss=0.073726 left_arm_loss=0.121948 right_arm_loss=0.025504 imbalance=0.096444 batch_time_s=0.3370 +eval_batch=35 loss=0.061882 left_arm_loss=0.109883 right_arm_loss=0.013881 imbalance=0.096002 batch_time_s=0.2556 +eval_batch=36 loss=0.046614 left_arm_loss=0.054836 right_arm_loss=0.038392 imbalance=0.016444 batch_time_s=0.2569 +eval_batch=37 loss=0.032190 left_arm_loss=0.051540 right_arm_loss=0.012840 imbalance=0.038700 batch_time_s=0.3450 +eval_batch=38 loss=0.066159 left_arm_loss=0.083341 right_arm_loss=0.048978 imbalance=0.034363 batch_time_s=0.3564 +eval_batch=39 loss=0.074041 left_arm_loss=0.047870 right_arm_loss=0.100211 imbalance=0.052341 batch_time_s=0.2406 +eval_batch=40 loss=0.047020 left_arm_loss=0.053036 right_arm_loss=0.041005 imbalance=0.012030 batch_time_s=0.3074 +eval_batch=41 loss=0.057365 left_arm_loss=0.109413 right_arm_loss=0.005316 imbalance=0.104097 batch_time_s=0.2427 +eval_batch=42 loss=0.033981 left_arm_loss=0.063623 right_arm_loss=0.004340 imbalance=0.059283 batch_time_s=0.2765 +eval_batch=43 loss=0.018033 left_arm_loss=0.029285 right_arm_loss=0.006781 imbalance=0.022504 batch_time_s=0.2264 +eval_batch=44 loss=0.017014 left_arm_loss=0.028932 right_arm_loss=0.005096 imbalance=0.023836 batch_time_s=0.2224 +eval_batch=45 loss=0.021894 left_arm_loss=0.040422 right_arm_loss=0.003366 imbalance=0.037055 batch_time_s=0.2718 +eval_batch=46 loss=0.041116 left_arm_loss=0.076250 right_arm_loss=0.005983 imbalance=0.070267 batch_time_s=0.2373 +eval_batch=47 loss=0.134721 left_arm_loss=0.042995 right_arm_loss=0.226447 imbalance=0.183451 batch_time_s=0.2264 +eval_batch=48 loss=0.258522 left_arm_loss=0.016699 right_arm_loss=0.500345 imbalance=0.483646 batch_time_s=0.2296 +eval_batch=49 loss=0.043552 left_arm_loss=0.017405 right_arm_loss=0.069698 imbalance=0.052293 batch_time_s=0.2327 +eval_batch=50 loss=0.146957 left_arm_loss=0.064547 right_arm_loss=0.229367 imbalance=0.164819 batch_time_s=0.3012 +config_name: pi05_twin_handover_256_packed_parallel_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/1000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 50 +mean_val_loss: 0.059715 +std_val_loss: 0.042962 +mean_left_arm_loss: 0.073681 +std_left_arm_loss: 0.049928 +mean_right_arm_loss: 0.045749 +std_right_arm_loss: 0.082818 +mean_left_joint_loss: 0.078129 +std_left_joint_loss: 0.055212 +mean_left_gripper_loss: 0.042541 +std_left_gripper_loss: 0.084910 +mean_right_joint_loss: 0.047261 +std_right_joint_loss: 0.090299 +mean_right_gripper_loss: 0.035161 +std_right_gripper_loss: 0.079674 +mean_left_right_imbalance: 0.075806 +std_left_right_imbalance: 0.079713 +per_batch_timing_seconds: mean=0.3663 std=0.6150 min=0.2224 max=4.6353 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.119875 left_arm_mae=0.108588 right_arm_mae=0.131163 imbalance_mae=0.022575 batch_time_s=0.3299 +sample_eval_batch=2 num_steps=4 masked_mae=0.056468 left_arm_mae=0.059824 right_arm_mae=0.053113 imbalance_mae=0.006710 batch_time_s=0.3864 +sample_eval_batch=3 num_steps=4 masked_mae=0.069907 left_arm_mae=0.072771 right_arm_mae=0.067042 imbalance_mae=0.005730 batch_time_s=0.2686 +sample_eval_batch=4 num_steps=4 masked_mae=0.116824 left_arm_mae=0.118923 right_arm_mae=0.114724 imbalance_mae=0.004199 batch_time_s=0.3825 +sample_eval_batch=5 num_steps=4 masked_mae=0.082754 left_arm_mae=0.103956 right_arm_mae=0.061551 imbalance_mae=0.042404 batch_time_s=0.3197 +sample_eval_batch=6 num_steps=4 masked_mae=0.083889 left_arm_mae=0.145390 right_arm_mae=0.022387 imbalance_mae=0.123002 batch_time_s=0.2733 +sample_eval_batch=7 num_steps=4 masked_mae=0.095747 left_arm_mae=0.170531 right_arm_mae=0.020963 imbalance_mae=0.149568 batch_time_s=0.3479 +sample_eval_batch=8 num_steps=4 masked_mae=0.067250 left_arm_mae=0.114657 right_arm_mae=0.019842 imbalance_mae=0.094815 batch_time_s=0.3748 +sample_eval_batch=9 num_steps=4 masked_mae=0.070907 left_arm_mae=0.122207 right_arm_mae=0.019607 imbalance_mae=0.102601 batch_time_s=0.2877 +sample_eval_batch=10 num_steps=4 masked_mae=0.087629 left_arm_mae=0.153592 right_arm_mae=0.021666 imbalance_mae=0.131926 batch_time_s=0.2709 +sample_eval_batch=11 num_steps=4 masked_mae=0.075383 left_arm_mae=0.129150 right_arm_mae=0.021616 imbalance_mae=0.107533 batch_time_s=0.2654 +sample_eval_batch=12 num_steps=4 masked_mae=0.100087 left_arm_mae=0.177705 right_arm_mae=0.022468 imbalance_mae=0.155237 batch_time_s=0.2791 +sample_eval_batch=13 num_steps=4 masked_mae=0.097545 left_arm_mae=0.173683 right_arm_mae=0.021406 imbalance_mae=0.152276 batch_time_s=0.3463 +sample_eval_batch=14 num_steps=4 masked_mae=0.119858 left_arm_mae=0.192049 right_arm_mae=0.047666 imbalance_mae=0.144383 batch_time_s=0.2714 +sample_eval_batch=15 num_steps=4 masked_mae=0.126125 left_arm_mae=0.043271 right_arm_mae=0.208979 imbalance_mae=0.165708 batch_time_s=0.2719 +sample_eval_batch=16 num_steps=4 masked_mae=0.110233 left_arm_mae=0.067434 right_arm_mae=0.153031 imbalance_mae=0.085598 batch_time_s=0.3009 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.092530 +sample_eval_num_steps_4_std_masked_mae: 0.020956 +sample_eval_num_steps_4_mean_left_arm_mae: 0.122108 +sample_eval_num_steps_4_std_left_arm_mae: 0.043780 +sample_eval_num_steps_4_mean_right_arm_mae: 0.062952 +sample_eval_num_steps_4_std_right_arm_mae: 0.056483 +sample_eval_num_steps_4_mean_left_joint_mae: 0.133062 +sample_eval_num_steps_4_std_left_joint_mae: 0.052111 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.045431 +sample_eval_num_steps_4_std_left_gripper_mae: 0.055952 +sample_eval_num_steps_4_mean_right_joint_mae: 0.065476 +sample_eval_num_steps_4_std_right_joint_mae: 0.060695 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.045280 +sample_eval_num_steps_4_std_right_gripper_mae: 0.053039 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.093392 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.056874 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.3110 std=0.0430 min=0.2654 max=0.3864 +sample_eval_batch=1 num_steps=10 masked_mae=0.135566 left_arm_mae=0.122877 right_arm_mae=0.148255 imbalance_mae=0.025378 batch_time_s=0.4122 +sample_eval_batch=2 num_steps=10 masked_mae=0.068124 left_arm_mae=0.071843 right_arm_mae=0.064406 imbalance_mae=0.007438 batch_time_s=0.3659 +sample_eval_batch=3 num_steps=10 masked_mae=0.081230 left_arm_mae=0.083152 right_arm_mae=0.079308 imbalance_mae=0.003844 batch_time_s=0.4764 +sample_eval_batch=4 num_steps=10 masked_mae=0.128195 left_arm_mae=0.129532 right_arm_mae=0.126857 imbalance_mae=0.002675 batch_time_s=0.3405 +sample_eval_batch=5 num_steps=10 masked_mae=0.090927 left_arm_mae=0.113657 right_arm_mae=0.068196 imbalance_mae=0.045462 batch_time_s=0.3940 +sample_eval_batch=6 num_steps=10 masked_mae=0.095554 left_arm_mae=0.164228 right_arm_mae=0.026880 imbalance_mae=0.137348 batch_time_s=0.4560 +sample_eval_batch=7 num_steps=10 masked_mae=0.103011 left_arm_mae=0.180335 right_arm_mae=0.025687 imbalance_mae=0.154648 batch_time_s=0.4857 +sample_eval_batch=8 num_steps=10 masked_mae=0.071890 left_arm_mae=0.119614 right_arm_mae=0.024165 imbalance_mae=0.095449 batch_time_s=0.3618 +sample_eval_batch=9 num_steps=10 masked_mae=0.079933 left_arm_mae=0.135905 right_arm_mae=0.023962 imbalance_mae=0.111943 batch_time_s=0.4824 +sample_eval_batch=10 num_steps=10 masked_mae=0.096654 left_arm_mae=0.168318 right_arm_mae=0.024991 imbalance_mae=0.143327 batch_time_s=0.5017 +sample_eval_batch=11 num_steps=10 masked_mae=0.083773 left_arm_mae=0.144171 right_arm_mae=0.023375 imbalance_mae=0.120796 batch_time_s=0.4778 +sample_eval_batch=12 num_steps=10 masked_mae=0.107955 left_arm_mae=0.189506 right_arm_mae=0.026404 imbalance_mae=0.163102 batch_time_s=0.3573 +sample_eval_batch=13 num_steps=10 masked_mae=0.106832 left_arm_mae=0.187708 right_arm_mae=0.025955 imbalance_mae=0.161753 batch_time_s=0.3475 +sample_eval_batch=14 num_steps=10 masked_mae=0.127854 left_arm_mae=0.200072 right_arm_mae=0.055635 imbalance_mae=0.144437 batch_time_s=0.4218 +sample_eval_batch=15 num_steps=10 masked_mae=0.140580 left_arm_mae=0.052115 right_arm_mae=0.229045 imbalance_mae=0.176931 batch_time_s=0.3976 +sample_eval_batch=16 num_steps=10 masked_mae=0.121160 left_arm_mae=0.074721 right_arm_mae=0.167600 imbalance_mae=0.092879 batch_time_s=0.3501 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.102452 +sample_eval_num_steps_10_std_masked_mae: 0.022208 +sample_eval_num_steps_10_mean_left_arm_mae: 0.133610 +sample_eval_num_steps_10_std_left_arm_mae: 0.044796 +sample_eval_num_steps_10_mean_right_arm_mae: 0.071295 +sample_eval_num_steps_10_std_right_arm_mae: 0.061523 +sample_eval_num_steps_10_mean_left_joint_mae: 0.145474 +sample_eval_num_steps_10_std_left_joint_mae: 0.053589 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.050560 +sample_eval_num_steps_10_std_left_gripper_mae: 0.060317 +sample_eval_num_steps_10_mean_right_joint_mae: 0.073909 +sample_eval_num_steps_10_std_right_joint_mae: 0.066406 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.053000 +sample_eval_num_steps_10_std_right_gripper_mae: 0.051143 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.099213 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.060422 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.4143 std=0.0560 min=0.3405 max=0.5017 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_10000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_10000.log new file mode 100644 index 0000000000000000000000000000000000000000..e5549da44f056244ddcb08293967e08115badb9e --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_10000.log @@ -0,0 +1,198 @@ +starting_eval config=pi05_twin_handover_256_packed_parallel_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=100 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.008598 left_arm_loss=0.008966 right_arm_loss=0.008230 imbalance=0.000736 batch_time_s=0.7969 +eval_batch=2 loss=0.001298 left_arm_loss=0.001120 right_arm_loss=0.001477 imbalance=0.000356 batch_time_s=0.4212 +eval_batch=3 loss=0.002089 left_arm_loss=0.002916 right_arm_loss=0.001262 imbalance=0.001655 batch_time_s=0.3802 +eval_batch=4 loss=0.014139 left_arm_loss=0.013196 right_arm_loss=0.015083 imbalance=0.001887 batch_time_s=0.4351 +eval_batch=5 loss=0.010969 left_arm_loss=0.011818 right_arm_loss=0.010120 imbalance=0.001699 batch_time_s=0.4440 +eval_batch=6 loss=0.006583 left_arm_loss=0.012185 right_arm_loss=0.000980 imbalance=0.011205 batch_time_s=0.3722 +eval_batch=7 loss=0.007013 left_arm_loss=0.013410 right_arm_loss=0.000616 imbalance=0.012795 batch_time_s=0.3687 +eval_batch=8 loss=0.009583 left_arm_loss=0.018835 right_arm_loss=0.000331 imbalance=0.018504 batch_time_s=0.3495 +eval_batch=9 loss=0.012016 left_arm_loss=0.023703 right_arm_loss=0.000330 imbalance=0.023373 batch_time_s=0.3122 +eval_batch=10 loss=0.022936 left_arm_loss=0.045498 right_arm_loss=0.000375 imbalance=0.045123 batch_time_s=0.3299 +eval_batch=11 loss=0.017193 left_arm_loss=0.033949 right_arm_loss=0.000436 imbalance=0.033513 batch_time_s=0.2463 +eval_batch=12 loss=0.019110 left_arm_loss=0.037568 right_arm_loss=0.000652 imbalance=0.036916 batch_time_s=0.2415 +eval_batch=13 loss=0.022467 left_arm_loss=0.044283 right_arm_loss=0.000650 imbalance=0.043633 batch_time_s=0.3323 +eval_batch=14 loss=0.005440 left_arm_loss=0.008297 right_arm_loss=0.002582 imbalance=0.005715 batch_time_s=0.3667 +eval_batch=15 loss=0.028680 left_arm_loss=0.010137 right_arm_loss=0.047224 imbalance=0.037087 batch_time_s=0.3973 +eval_batch=16 loss=0.020404 left_arm_loss=0.012046 right_arm_loss=0.028762 imbalance=0.016716 batch_time_s=0.3513 +eval_batch=17 loss=0.012630 left_arm_loss=0.023091 right_arm_loss=0.002169 imbalance=0.020923 batch_time_s=0.2340 +eval_batch=18 loss=0.037282 left_arm_loss=0.054978 right_arm_loss=0.019586 imbalance=0.035393 batch_time_s=0.3184 +eval_batch=19 loss=0.009588 left_arm_loss=0.016425 right_arm_loss=0.002752 imbalance=0.013673 batch_time_s=0.2645 +eval_batch=20 loss=0.017728 left_arm_loss=0.031880 right_arm_loss=0.003577 imbalance=0.028302 batch_time_s=0.2603 +eval_batch=21 loss=0.098740 left_arm_loss=0.195217 right_arm_loss=0.002263 imbalance=0.192954 batch_time_s=0.2219 +eval_batch=22 loss=0.114247 left_arm_loss=0.199525 right_arm_loss=0.028969 imbalance=0.170556 batch_time_s=0.2241 +eval_batch=23 loss=0.042247 left_arm_loss=0.076375 right_arm_loss=0.008120 imbalance=0.068255 batch_time_s=0.3778 +eval_batch=24 loss=0.124734 left_arm_loss=0.247885 right_arm_loss=0.001583 imbalance=0.246302 batch_time_s=0.2236 +eval_batch=25 loss=0.080652 left_arm_loss=0.160831 right_arm_loss=0.000473 imbalance=0.160358 batch_time_s=0.2245 +eval_batch=26 loss=0.058621 left_arm_loss=0.116006 right_arm_loss=0.001236 imbalance=0.114770 batch_time_s=0.3346 +eval_batch=27 loss=0.066596 left_arm_loss=0.132247 right_arm_loss=0.000946 imbalance=0.131301 batch_time_s=0.2557 +eval_batch=28 loss=0.029455 left_arm_loss=0.058478 right_arm_loss=0.000432 imbalance=0.058046 batch_time_s=0.2267 +eval_batch=29 loss=0.040648 left_arm_loss=0.080583 right_arm_loss=0.000712 imbalance=0.079871 batch_time_s=0.4551 +eval_batch=30 loss=0.027801 left_arm_loss=0.054392 right_arm_loss=0.001210 imbalance=0.053182 batch_time_s=0.4952 +eval_batch=31 loss=0.049047 left_arm_loss=0.095610 right_arm_loss=0.002484 imbalance=0.093125 batch_time_s=0.4630 +eval_batch=32 loss=0.017039 left_arm_loss=0.025473 right_arm_loss=0.008604 imbalance=0.016869 batch_time_s=0.3515 +eval_batch=33 loss=0.036852 left_arm_loss=0.017885 right_arm_loss=0.055819 imbalance=0.037935 batch_time_s=0.4058 +eval_batch=34 loss=0.075331 left_arm_loss=0.115144 right_arm_loss=0.035518 imbalance=0.079626 batch_time_s=0.4361 +eval_batch=35 loss=0.024882 left_arm_loss=0.047816 right_arm_loss=0.001948 imbalance=0.045868 batch_time_s=0.4481 +eval_batch=36 loss=0.007855 left_arm_loss=0.003467 right_arm_loss=0.012244 imbalance=0.008777 batch_time_s=0.3434 +eval_batch=37 loss=0.002229 left_arm_loss=0.003409 right_arm_loss=0.001049 imbalance=0.002359 batch_time_s=0.2315 +eval_batch=38 loss=0.023309 left_arm_loss=0.028313 right_arm_loss=0.018306 imbalance=0.010007 batch_time_s=0.2646 +eval_batch=39 loss=0.020423 left_arm_loss=0.007732 right_arm_loss=0.033114 imbalance=0.025382 batch_time_s=0.3098 +eval_batch=40 loss=0.011275 left_arm_loss=0.014338 right_arm_loss=0.008213 imbalance=0.006125 batch_time_s=0.2342 +eval_batch=41 loss=0.011151 left_arm_loss=0.021560 right_arm_loss=0.000741 imbalance=0.020819 batch_time_s=0.3639 +eval_batch=42 loss=0.008415 left_arm_loss=0.016206 right_arm_loss=0.000625 imbalance=0.015581 batch_time_s=0.3638 +eval_batch=43 loss=0.003240 left_arm_loss=0.006059 right_arm_loss=0.000420 imbalance=0.005639 batch_time_s=0.3456 +eval_batch=44 loss=0.001447 left_arm_loss=0.002358 right_arm_loss=0.000536 imbalance=0.001821 batch_time_s=0.3825 +eval_batch=45 loss=0.003787 left_arm_loss=0.006964 right_arm_loss=0.000610 imbalance=0.006355 batch_time_s=0.2629 +eval_batch=46 loss=0.010355 left_arm_loss=0.019963 right_arm_loss=0.000747 imbalance=0.019216 batch_time_s=0.3228 +eval_batch=47 loss=0.093454 left_arm_loss=0.004442 right_arm_loss=0.182465 imbalance=0.178023 batch_time_s=0.3782 +eval_batch=48 loss=0.085065 left_arm_loss=0.003706 right_arm_loss=0.166424 imbalance=0.162718 batch_time_s=0.3985 +eval_batch=49 loss=0.005288 left_arm_loss=0.002214 right_arm_loss=0.008361 imbalance=0.006147 batch_time_s=0.3958 +eval_batch=50 loss=0.045286 left_arm_loss=0.022582 right_arm_loss=0.067991 imbalance=0.045409 batch_time_s=0.2282 +eval_batch=51 loss=0.008671 left_arm_loss=0.012865 right_arm_loss=0.004476 imbalance=0.008390 batch_time_s=0.3869 +eval_batch=52 loss=0.011851 left_arm_loss=0.017357 right_arm_loss=0.006346 imbalance=0.011011 batch_time_s=0.2768 +eval_batch=53 loss=0.001221 left_arm_loss=0.001389 right_arm_loss=0.001052 imbalance=0.000337 batch_time_s=0.3491 +eval_batch=54 loss=0.002532 left_arm_loss=0.002639 right_arm_loss=0.002425 imbalance=0.000214 batch_time_s=0.2220 +eval_batch=55 loss=0.017695 left_arm_loss=0.018922 right_arm_loss=0.016468 imbalance=0.002454 batch_time_s=0.3504 +eval_batch=56 loss=0.023422 left_arm_loss=0.005041 right_arm_loss=0.041803 imbalance=0.036762 batch_time_s=0.3002 +eval_batch=57 loss=0.007411 left_arm_loss=0.009387 right_arm_loss=0.005434 imbalance=0.003953 batch_time_s=0.2261 +eval_batch=58 loss=0.009703 left_arm_loss=0.018272 right_arm_loss=0.001134 imbalance=0.017138 batch_time_s=0.5275 +eval_batch=59 loss=0.011092 left_arm_loss=0.021360 right_arm_loss=0.000824 imbalance=0.020536 batch_time_s=0.2356 +eval_batch=60 loss=0.004315 left_arm_loss=0.007742 right_arm_loss=0.000887 imbalance=0.006854 batch_time_s=0.3182 +eval_batch=61 loss=0.001272 left_arm_loss=0.002311 right_arm_loss=0.000233 imbalance=0.002078 batch_time_s=0.2562 +eval_batch=62 loss=0.001647 left_arm_loss=0.003064 right_arm_loss=0.000229 imbalance=0.002836 batch_time_s=0.2236 +eval_batch=63 loss=0.004060 left_arm_loss=0.007585 right_arm_loss=0.000534 imbalance=0.007051 batch_time_s=0.2289 +eval_batch=64 loss=0.004870 left_arm_loss=0.008584 right_arm_loss=0.001156 imbalance=0.007428 batch_time_s=0.3408 +eval_batch=65 loss=0.009187 left_arm_loss=0.002123 right_arm_loss=0.016250 imbalance=0.014127 batch_time_s=0.2414 +eval_batch=66 loss=0.003548 left_arm_loss=0.004316 right_arm_loss=0.002779 imbalance=0.001537 batch_time_s=0.2221 +eval_batch=67 loss=0.005461 left_arm_loss=0.010117 right_arm_loss=0.000805 imbalance=0.009312 batch_time_s=0.3501 +eval_batch=68 loss=0.019510 left_arm_loss=0.022769 right_arm_loss=0.016250 imbalance=0.006519 batch_time_s=0.2250 +eval_batch=69 loss=0.040062 left_arm_loss=0.064019 right_arm_loss=0.016104 imbalance=0.047915 batch_time_s=0.2227 +eval_batch=70 loss=0.022290 left_arm_loss=0.011103 right_arm_loss=0.033476 imbalance=0.022373 batch_time_s=0.3519 +eval_batch=71 loss=0.014200 left_arm_loss=0.006097 right_arm_loss=0.022303 imbalance=0.016206 batch_time_s=0.2229 +eval_batch=72 loss=0.028683 left_arm_loss=0.049631 right_arm_loss=0.007736 imbalance=0.041895 batch_time_s=0.4946 +eval_batch=73 loss=0.017749 left_arm_loss=0.033027 right_arm_loss=0.002471 imbalance=0.030555 batch_time_s=0.3221 +eval_batch=74 loss=0.018133 left_arm_loss=0.034962 right_arm_loss=0.001304 imbalance=0.033658 batch_time_s=0.3079 +eval_batch=75 loss=0.005441 left_arm_loss=0.010617 right_arm_loss=0.000264 imbalance=0.010353 batch_time_s=0.3381 +eval_batch=76 loss=0.022286 left_arm_loss=0.036838 right_arm_loss=0.007734 imbalance=0.029103 batch_time_s=0.2776 +eval_batch=77 loss=0.006658 left_arm_loss=0.012715 right_arm_loss=0.000601 imbalance=0.012114 batch_time_s=0.2320 +eval_batch=78 loss=0.028744 left_arm_loss=0.004508 right_arm_loss=0.052980 imbalance=0.048472 batch_time_s=0.4010 +eval_batch=79 loss=0.067121 left_arm_loss=0.015094 right_arm_loss=0.119147 imbalance=0.104052 batch_time_s=0.3640 +eval_batch=80 loss=0.040183 left_arm_loss=0.006979 right_arm_loss=0.073386 imbalance=0.066408 batch_time_s=0.3644 +eval_batch=81 loss=0.020130 left_arm_loss=0.016093 right_arm_loss=0.024167 imbalance=0.008074 batch_time_s=0.2203 +eval_batch=82 loss=0.008735 left_arm_loss=0.006954 right_arm_loss=0.010516 imbalance=0.003563 batch_time_s=0.2563 +eval_batch=83 loss=0.012587 left_arm_loss=0.014092 right_arm_loss=0.011083 imbalance=0.003008 batch_time_s=0.3576 +eval_batch=84 loss=0.007837 left_arm_loss=0.010760 right_arm_loss=0.004913 imbalance=0.005847 batch_time_s=0.2301 +eval_batch=85 loss=0.019506 left_arm_loss=0.013295 right_arm_loss=0.025716 imbalance=0.012420 batch_time_s=0.3843 +eval_batch=86 loss=0.016306 left_arm_loss=0.004147 right_arm_loss=0.028465 imbalance=0.024318 batch_time_s=0.3352 +eval_batch=87 loss=0.007440 left_arm_loss=0.006391 right_arm_loss=0.008488 imbalance=0.002097 batch_time_s=0.3479 +eval_batch=88 loss=0.017042 left_arm_loss=0.033084 right_arm_loss=0.001001 imbalance=0.032082 batch_time_s=0.3755 +eval_batch=89 loss=0.016117 left_arm_loss=0.029051 right_arm_loss=0.003184 imbalance=0.025867 batch_time_s=0.3429 +eval_batch=90 loss=0.001942 left_arm_loss=0.003463 right_arm_loss=0.000421 imbalance=0.003043 batch_time_s=0.2972 +eval_batch=91 loss=0.001894 left_arm_loss=0.003272 right_arm_loss=0.000516 imbalance=0.002756 batch_time_s=0.2248 +eval_batch=92 loss=0.001436 left_arm_loss=0.002526 right_arm_loss=0.000346 imbalance=0.002180 batch_time_s=0.2841 +eval_batch=93 loss=0.008758 left_arm_loss=0.016516 right_arm_loss=0.001000 imbalance=0.015516 batch_time_s=0.2259 +eval_batch=94 loss=0.003155 left_arm_loss=0.004276 right_arm_loss=0.002035 imbalance=0.002241 batch_time_s=0.2262 +eval_batch=95 loss=0.008810 left_arm_loss=0.003654 right_arm_loss=0.013966 imbalance=0.010312 batch_time_s=0.3757 +eval_batch=96 loss=0.007168 left_arm_loss=0.008067 right_arm_loss=0.006269 imbalance=0.001798 batch_time_s=0.3337 +eval_batch=97 loss=0.015145 left_arm_loss=0.027200 right_arm_loss=0.003091 imbalance=0.024109 batch_time_s=0.3449 +eval_batch=98 loss=0.035565 left_arm_loss=0.066956 right_arm_loss=0.004173 imbalance=0.062783 batch_time_s=0.2216 +eval_batch=99 loss=0.021572 left_arm_loss=0.039440 right_arm_loss=0.003703 imbalance=0.035737 batch_time_s=0.3681 +eval_batch=100 loss=0.021406 left_arm_loss=0.036665 right_arm_loss=0.006146 imbalance=0.030518 batch_time_s=0.3594 +config_name: pi05_twin_handover_256_packed_parallel_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/10000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 100 +mean_val_loss: 0.022168 +std_val_loss: 0.024902 +mean_left_arm_loss: 0.030184 +std_left_arm_loss: 0.043653 +mean_right_arm_loss: 0.014151 +std_right_arm_loss: 0.029382 +mean_left_joint_loss: 0.032356 +std_left_joint_loss: 0.048977 +mean_left_gripper_loss: 0.014984 +std_left_gripper_loss: 0.037395 +mean_right_joint_loss: 0.014888 +std_right_joint_loss: 0.032582 +mean_right_gripper_loss: 0.008996 +std_right_gripper_loss: 0.025757 +mean_left_right_imbalance: 0.033825 +std_left_right_imbalance: 0.046586 +per_batch_timing_seconds: mean=0.3248 std=0.0893 min=0.2203 max=0.7969 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.027761 left_arm_mae=0.027813 right_arm_mae=0.027708 imbalance_mae=0.000105 batch_time_s=0.4093 +sample_eval_batch=2 num_steps=4 masked_mae=0.013821 left_arm_mae=0.015602 right_arm_mae=0.012041 imbalance_mae=0.003561 batch_time_s=0.2600 +sample_eval_batch=3 num_steps=4 masked_mae=0.017503 left_arm_mae=0.019663 right_arm_mae=0.015342 imbalance_mae=0.004321 batch_time_s=0.2638 +sample_eval_batch=4 num_steps=4 masked_mae=0.025637 left_arm_mae=0.023152 right_arm_mae=0.028121 imbalance_mae=0.004969 batch_time_s=0.2750 +sample_eval_batch=5 num_steps=4 masked_mae=0.027783 left_arm_mae=0.028539 right_arm_mae=0.027028 imbalance_mae=0.001512 batch_time_s=0.4241 +sample_eval_batch=6 num_steps=4 masked_mae=0.026692 left_arm_mae=0.044013 right_arm_mae=0.009371 imbalance_mae=0.034641 batch_time_s=0.2679 +sample_eval_batch=7 num_steps=4 masked_mae=0.027037 left_arm_mae=0.046942 right_arm_mae=0.007132 imbalance_mae=0.039809 batch_time_s=0.3080 +sample_eval_batch=8 num_steps=4 masked_mae=0.024824 left_arm_mae=0.042787 right_arm_mae=0.006862 imbalance_mae=0.035925 batch_time_s=0.3591 +sample_eval_batch=9 num_steps=4 masked_mae=0.034854 left_arm_mae=0.062094 right_arm_mae=0.007614 imbalance_mae=0.054480 batch_time_s=0.3635 +sample_eval_batch=10 num_steps=4 masked_mae=0.043676 left_arm_mae=0.080232 right_arm_mae=0.007120 imbalance_mae=0.073112 batch_time_s=0.3643 +sample_eval_batch=11 num_steps=4 masked_mae=0.030659 left_arm_mae=0.054766 right_arm_mae=0.006553 imbalance_mae=0.048212 batch_time_s=0.3975 +sample_eval_batch=12 num_steps=4 masked_mae=0.037423 left_arm_mae=0.066644 right_arm_mae=0.008201 imbalance_mae=0.058443 batch_time_s=0.2758 +sample_eval_batch=13 num_steps=4 masked_mae=0.033165 left_arm_mae=0.058826 right_arm_mae=0.007504 imbalance_mae=0.051322 batch_time_s=0.3565 +sample_eval_batch=14 num_steps=4 masked_mae=0.023418 left_arm_mae=0.036001 right_arm_mae=0.010836 imbalance_mae=0.025165 batch_time_s=0.3043 +sample_eval_batch=15 num_steps=4 masked_mae=0.039746 left_arm_mae=0.018612 right_arm_mae=0.060879 imbalance_mae=0.042267 batch_time_s=0.2737 +sample_eval_batch=16 num_steps=4 masked_mae=0.034426 left_arm_mae=0.020315 right_arm_mae=0.048536 imbalance_mae=0.028221 batch_time_s=0.2826 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.029277 +sample_eval_num_steps_4_std_masked_mae: 0.007579 +sample_eval_num_steps_4_mean_left_arm_mae: 0.040375 +sample_eval_num_steps_4_std_left_arm_mae: 0.019190 +sample_eval_num_steps_4_mean_right_arm_mae: 0.018178 +sample_eval_num_steps_4_std_right_arm_mae: 0.015856 +sample_eval_num_steps_4_mean_left_joint_mae: 0.043636 +sample_eval_num_steps_4_std_left_joint_mae: 0.022278 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.017546 +sample_eval_num_steps_4_std_left_gripper_mae: 0.013485 +sample_eval_num_steps_4_mean_right_joint_mae: 0.018908 +sample_eval_num_steps_4_std_right_joint_mae: 0.017028 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.013066 +sample_eval_num_steps_4_std_right_gripper_mae: 0.016678 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.031629 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.022404 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.3241 std=0.0551 min=0.2600 max=0.4241 +sample_eval_batch=1 num_steps=10 masked_mae=0.030055 left_arm_mae=0.029251 right_arm_mae=0.030859 imbalance_mae=0.001608 batch_time_s=0.4774 +sample_eval_batch=2 num_steps=10 masked_mae=0.018086 left_arm_mae=0.019810 right_arm_mae=0.016363 imbalance_mae=0.003447 batch_time_s=0.3371 +sample_eval_batch=3 num_steps=10 masked_mae=0.020473 left_arm_mae=0.021246 right_arm_mae=0.019700 imbalance_mae=0.001546 batch_time_s=0.5100 +sample_eval_batch=4 num_steps=10 masked_mae=0.027667 left_arm_mae=0.024581 right_arm_mae=0.030754 imbalance_mae=0.006173 batch_time_s=0.4261 +sample_eval_batch=5 num_steps=10 masked_mae=0.029556 left_arm_mae=0.029850 right_arm_mae=0.029262 imbalance_mae=0.000588 batch_time_s=0.3332 +sample_eval_batch=6 num_steps=10 masked_mae=0.026114 left_arm_mae=0.043516 right_arm_mae=0.008712 imbalance_mae=0.034804 batch_time_s=0.3380 +sample_eval_batch=7 num_steps=10 masked_mae=0.027823 left_arm_mae=0.048532 right_arm_mae=0.007113 imbalance_mae=0.041419 batch_time_s=0.4218 +sample_eval_batch=8 num_steps=10 masked_mae=0.024256 left_arm_mae=0.040880 right_arm_mae=0.007631 imbalance_mae=0.033249 batch_time_s=0.4172 +sample_eval_batch=9 num_steps=10 masked_mae=0.033590 left_arm_mae=0.060695 right_arm_mae=0.006485 imbalance_mae=0.054211 batch_time_s=0.4333 +sample_eval_batch=10 num_steps=10 masked_mae=0.043233 left_arm_mae=0.079897 right_arm_mae=0.006570 imbalance_mae=0.073327 batch_time_s=0.3751 +sample_eval_batch=11 num_steps=10 masked_mae=0.031220 left_arm_mae=0.055826 right_arm_mae=0.006614 imbalance_mae=0.049212 batch_time_s=0.4710 +sample_eval_batch=12 num_steps=10 masked_mae=0.037113 left_arm_mae=0.066601 right_arm_mae=0.007624 imbalance_mae=0.058976 batch_time_s=0.3541 +sample_eval_batch=13 num_steps=10 masked_mae=0.034603 left_arm_mae=0.062063 right_arm_mae=0.007144 imbalance_mae=0.054919 batch_time_s=0.4295 +sample_eval_batch=14 num_steps=10 masked_mae=0.024069 left_arm_mae=0.037691 right_arm_mae=0.010447 imbalance_mae=0.027245 batch_time_s=0.3363 +sample_eval_batch=15 num_steps=10 masked_mae=0.040480 left_arm_mae=0.016929 right_arm_mae=0.064030 imbalance_mae=0.047101 batch_time_s=0.3608 +sample_eval_batch=16 num_steps=10 masked_mae=0.035514 left_arm_mae=0.019780 right_arm_mae=0.051249 imbalance_mae=0.031470 batch_time_s=0.4725 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.030241 +sample_eval_num_steps_10_std_masked_mae: 0.006740 +sample_eval_num_steps_10_mean_left_arm_mae: 0.041072 +sample_eval_num_steps_10_std_left_arm_mae: 0.018866 +sample_eval_num_steps_10_mean_right_arm_mae: 0.019410 +sample_eval_num_steps_10_std_right_arm_mae: 0.017031 +sample_eval_num_steps_10_mean_left_joint_mae: 0.044817 +sample_eval_num_steps_10_std_left_joint_mae: 0.022046 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.014857 +sample_eval_num_steps_10_std_left_gripper_mae: 0.014376 +sample_eval_num_steps_10_mean_right_joint_mae: 0.020279 +sample_eval_num_steps_10_std_right_joint_mae: 0.018425 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.013323 +sample_eval_num_steps_10_std_right_gripper_mae: 0.014475 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.032456 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.022935 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.4058 std=0.0569 min=0.3332 max=0.5100 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_2000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_2000.log new file mode 100644 index 0000000000000000000000000000000000000000..ac9d8bf54bba3b181d0c0a4b6bd768246d949c00 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_2000.log @@ -0,0 +1,148 @@ +starting_eval config=pi05_twin_handover_256_packed_parallel_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=50 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.025787 left_arm_loss=0.021923 right_arm_loss=0.029651 imbalance=0.007728 batch_time_s=1.3050 +eval_batch=2 loss=0.010885 left_arm_loss=0.011649 right_arm_loss=0.010121 imbalance=0.001528 batch_time_s=0.2327 +eval_batch=3 loss=0.011956 left_arm_loss=0.016623 right_arm_loss=0.007290 imbalance=0.009332 batch_time_s=0.5065 +eval_batch=4 loss=0.038901 left_arm_loss=0.042096 right_arm_loss=0.035706 imbalance=0.006391 batch_time_s=0.3083 +eval_batch=5 loss=0.022632 left_arm_loss=0.029108 right_arm_loss=0.016157 imbalance=0.012951 batch_time_s=0.4952 +eval_batch=6 loss=0.035525 left_arm_loss=0.067873 right_arm_loss=0.003178 imbalance=0.064695 batch_time_s=0.5699 +eval_batch=7 loss=0.037493 left_arm_loss=0.072243 right_arm_loss=0.002743 imbalance=0.069500 batch_time_s=0.3339 +eval_batch=8 loss=0.011528 left_arm_loss=0.021023 right_arm_loss=0.002032 imbalance=0.018991 batch_time_s=0.3246 +eval_batch=9 loss=0.014947 left_arm_loss=0.028066 right_arm_loss=0.001828 imbalance=0.026238 batch_time_s=0.2926 +eval_batch=10 loss=0.023378 left_arm_loss=0.045036 right_arm_loss=0.001720 imbalance=0.043316 batch_time_s=0.5283 +eval_batch=11 loss=0.025311 left_arm_loss=0.047408 right_arm_loss=0.003213 imbalance=0.044196 batch_time_s=0.4648 +eval_batch=12 loss=0.022664 left_arm_loss=0.043080 right_arm_loss=0.002247 imbalance=0.040833 batch_time_s=0.2914 +eval_batch=13 loss=0.043299 left_arm_loss=0.083316 right_arm_loss=0.003283 imbalance=0.080034 batch_time_s=0.2490 +eval_batch=14 loss=0.028448 left_arm_loss=0.049884 right_arm_loss=0.007012 imbalance=0.042872 batch_time_s=0.3239 +eval_batch=15 loss=0.055534 left_arm_loss=0.023412 right_arm_loss=0.087656 imbalance=0.064244 batch_time_s=0.7896 +eval_batch=16 loss=0.070242 left_arm_loss=0.037843 right_arm_loss=0.102640 imbalance=0.064797 batch_time_s=0.3277 +eval_batch=17 loss=0.034091 left_arm_loss=0.061349 right_arm_loss=0.006834 imbalance=0.054514 batch_time_s=0.3386 +eval_batch=18 loss=0.048450 left_arm_loss=0.065674 right_arm_loss=0.031225 imbalance=0.034449 batch_time_s=0.2716 +eval_batch=19 loss=0.020858 left_arm_loss=0.026401 right_arm_loss=0.015315 imbalance=0.011086 batch_time_s=0.2662 +eval_batch=20 loss=0.012802 left_arm_loss=0.017406 right_arm_loss=0.008198 imbalance=0.009208 batch_time_s=0.3161 +eval_batch=21 loss=0.022067 left_arm_loss=0.035582 right_arm_loss=0.008551 imbalance=0.027031 batch_time_s=0.2446 +eval_batch=22 loss=0.052524 left_arm_loss=0.058496 right_arm_loss=0.046553 imbalance=0.011943 batch_time_s=0.3242 +eval_batch=23 loss=0.049664 left_arm_loss=0.082497 right_arm_loss=0.016830 imbalance=0.065667 batch_time_s=0.3345 +eval_batch=24 loss=0.057649 left_arm_loss=0.109523 right_arm_loss=0.005776 imbalance=0.103747 batch_time_s=0.3507 +eval_batch=25 loss=0.065660 left_arm_loss=0.129855 right_arm_loss=0.001465 imbalance=0.128389 batch_time_s=0.3145 +eval_batch=26 loss=0.030339 left_arm_loss=0.056315 right_arm_loss=0.004364 imbalance=0.051951 batch_time_s=0.5504 +eval_batch=27 loss=0.026639 left_arm_loss=0.048688 right_arm_loss=0.004590 imbalance=0.044098 batch_time_s=0.2753 +eval_batch=28 loss=0.027996 left_arm_loss=0.054026 right_arm_loss=0.001966 imbalance=0.052060 batch_time_s=0.3143 +eval_batch=29 loss=0.035882 left_arm_loss=0.069171 right_arm_loss=0.002594 imbalance=0.066576 batch_time_s=0.2392 +eval_batch=30 loss=0.053704 left_arm_loss=0.104880 right_arm_loss=0.002527 imbalance=0.102353 batch_time_s=0.2710 +eval_batch=31 loss=0.081458 left_arm_loss=0.154924 right_arm_loss=0.007991 imbalance=0.146933 batch_time_s=0.3473 +eval_batch=32 loss=0.070487 left_arm_loss=0.072677 right_arm_loss=0.068297 imbalance=0.004380 batch_time_s=0.3377 +eval_batch=33 loss=0.046639 left_arm_loss=0.018259 right_arm_loss=0.075019 imbalance=0.056760 batch_time_s=0.3076 +eval_batch=34 loss=0.085334 left_arm_loss=0.123811 right_arm_loss=0.046856 imbalance=0.076955 batch_time_s=0.2470 +eval_batch=35 loss=0.043193 left_arm_loss=0.075120 right_arm_loss=0.011267 imbalance=0.063853 batch_time_s=0.2781 +eval_batch=36 loss=0.024055 left_arm_loss=0.014381 right_arm_loss=0.033729 imbalance=0.019349 batch_time_s=0.3140 +eval_batch=37 loss=0.015806 left_arm_loss=0.021006 right_arm_loss=0.010606 imbalance=0.010401 batch_time_s=0.3179 +eval_batch=38 loss=0.046615 left_arm_loss=0.061180 right_arm_loss=0.032049 imbalance=0.029131 batch_time_s=0.3286 +eval_batch=39 loss=0.054128 left_arm_loss=0.033725 right_arm_loss=0.074530 imbalance=0.040805 batch_time_s=0.3452 +eval_batch=40 loss=0.022496 left_arm_loss=0.022509 right_arm_loss=0.022484 imbalance=0.000026 batch_time_s=0.2541 +eval_batch=41 loss=0.050047 left_arm_loss=0.097146 right_arm_loss=0.002948 imbalance=0.094197 batch_time_s=0.3104 +eval_batch=42 loss=0.024861 left_arm_loss=0.046637 right_arm_loss=0.003085 imbalance=0.043553 batch_time_s=0.3127 +eval_batch=43 loss=0.013173 left_arm_loss=0.023176 right_arm_loss=0.003170 imbalance=0.020006 batch_time_s=0.3346 +eval_batch=44 loss=0.013327 left_arm_loss=0.024117 right_arm_loss=0.002537 imbalance=0.021580 batch_time_s=0.3632 +eval_batch=45 loss=0.016324 left_arm_loss=0.029968 right_arm_loss=0.002681 imbalance=0.027287 batch_time_s=0.3180 +eval_batch=46 loss=0.028118 left_arm_loss=0.051257 right_arm_loss=0.004978 imbalance=0.046279 batch_time_s=0.4321 +eval_batch=47 loss=0.106531 left_arm_loss=0.017094 right_arm_loss=0.195969 imbalance=0.178875 batch_time_s=0.5261 +eval_batch=48 loss=0.120483 left_arm_loss=0.010918 right_arm_loss=0.230047 imbalance=0.219129 batch_time_s=0.3434 +eval_batch=49 loss=0.026319 left_arm_loss=0.006001 right_arm_loss=0.046636 imbalance=0.040635 batch_time_s=0.3443 +eval_batch=50 loss=0.091088 left_arm_loss=0.043066 right_arm_loss=0.139110 imbalance=0.096045 batch_time_s=0.4223 +config_name: pi05_twin_handover_256_packed_parallel_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/2000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 50 +mean_val_loss: 0.039947 +std_val_loss: 0.025053 +mean_left_arm_loss: 0.050148 +std_left_arm_loss: 0.033233 +mean_right_arm_loss: 0.029745 +std_right_arm_loss: 0.047860 +mean_left_joint_loss: 0.051925 +std_left_joint_loss: 0.036277 +mean_left_gripper_loss: 0.037711 +std_left_gripper_loss: 0.077017 +mean_right_joint_loss: 0.030139 +std_right_joint_loss: 0.051862 +mean_right_gripper_loss: 0.026984 +std_right_gripper_loss: 0.065713 +mean_left_right_imbalance: 0.051938 +std_left_right_imbalance: 0.044701 +per_batch_timing_seconds: mean=0.3708 std=0.1690 min=0.2327 max=1.3050 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.061882 left_arm_mae=0.057364 right_arm_mae=0.066401 imbalance_mae=0.009037 batch_time_s=0.3858 +sample_eval_batch=2 num_steps=4 masked_mae=0.041649 left_arm_mae=0.049056 right_arm_mae=0.034243 imbalance_mae=0.014814 batch_time_s=0.3355 +sample_eval_batch=3 num_steps=4 masked_mae=0.043529 left_arm_mae=0.052206 right_arm_mae=0.034851 imbalance_mae=0.017355 batch_time_s=0.2794 +sample_eval_batch=4 num_steps=4 masked_mae=0.056773 left_arm_mae=0.064577 right_arm_mae=0.048968 imbalance_mae=0.015609 batch_time_s=0.2793 +sample_eval_batch=5 num_steps=4 masked_mae=0.049480 left_arm_mae=0.055472 right_arm_mae=0.043487 imbalance_mae=0.011986 batch_time_s=0.3373 +sample_eval_batch=6 num_steps=4 masked_mae=0.073431 left_arm_mae=0.128902 right_arm_mae=0.017959 imbalance_mae=0.110943 batch_time_s=0.3862 +sample_eval_batch=7 num_steps=4 masked_mae=0.076275 left_arm_mae=0.134920 right_arm_mae=0.017629 imbalance_mae=0.117291 batch_time_s=0.3713 +sample_eval_batch=8 num_steps=4 masked_mae=0.042675 left_arm_mae=0.068631 right_arm_mae=0.016719 imbalance_mae=0.051913 batch_time_s=0.2806 +sample_eval_batch=9 num_steps=4 masked_mae=0.047099 left_arm_mae=0.078409 right_arm_mae=0.015790 imbalance_mae=0.062619 batch_time_s=0.3906 +sample_eval_batch=10 num_steps=4 masked_mae=0.062623 left_arm_mae=0.107672 right_arm_mae=0.017573 imbalance_mae=0.090099 batch_time_s=0.3369 +sample_eval_batch=11 num_steps=4 masked_mae=0.058652 left_arm_mae=0.098428 right_arm_mae=0.018877 imbalance_mae=0.079552 batch_time_s=0.3893 +sample_eval_batch=12 num_steps=4 masked_mae=0.051733 left_arm_mae=0.085662 right_arm_mae=0.017805 imbalance_mae=0.067856 batch_time_s=0.4135 +sample_eval_batch=13 num_steps=4 masked_mae=0.073952 left_arm_mae=0.130081 right_arm_mae=0.017823 imbalance_mae=0.112258 batch_time_s=0.3486 +sample_eval_batch=14 num_steps=4 masked_mae=0.064349 left_arm_mae=0.101978 right_arm_mae=0.026721 imbalance_mae=0.075257 batch_time_s=0.2832 +sample_eval_batch=15 num_steps=4 masked_mae=0.067566 left_arm_mae=0.030355 right_arm_mae=0.104778 imbalance_mae=0.074423 batch_time_s=0.7256 +sample_eval_batch=16 num_steps=4 masked_mae=0.086088 left_arm_mae=0.052028 right_arm_mae=0.120148 imbalance_mae=0.068119 batch_time_s=0.3804 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.059860 +sample_eval_num_steps_4_std_masked_mae: 0.012924 +sample_eval_num_steps_4_mean_left_arm_mae: 0.080984 +sample_eval_num_steps_4_std_left_arm_mae: 0.031604 +sample_eval_num_steps_4_mean_right_arm_mae: 0.038736 +sample_eval_num_steps_4_std_right_arm_mae: 0.031293 +sample_eval_num_steps_4_mean_left_joint_mae: 0.086197 +sample_eval_num_steps_4_std_left_joint_mae: 0.035912 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.044490 +sample_eval_num_steps_4_std_left_gripper_mae: 0.062755 +sample_eval_num_steps_4_mean_right_joint_mae: 0.039304 +sample_eval_num_steps_4_std_right_joint_mae: 0.030982 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.034761 +sample_eval_num_steps_4_std_right_gripper_mae: 0.051397 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.061196 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.036442 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.3702 std=0.1017 min=0.2793 max=0.7256 +sample_eval_batch=1 num_steps=10 masked_mae=0.068575 left_arm_mae=0.066392 right_arm_mae=0.070757 imbalance_mae=0.004365 batch_time_s=0.4156 +sample_eval_batch=2 num_steps=10 masked_mae=0.048682 left_arm_mae=0.056914 right_arm_mae=0.040451 imbalance_mae=0.016462 batch_time_s=0.3402 +sample_eval_batch=3 num_steps=10 masked_mae=0.048330 left_arm_mae=0.056728 right_arm_mae=0.039932 imbalance_mae=0.016797 batch_time_s=0.6590 +sample_eval_batch=4 num_steps=10 masked_mae=0.064731 left_arm_mae=0.072759 right_arm_mae=0.056703 imbalance_mae=0.016055 batch_time_s=0.4853 +sample_eval_batch=5 num_steps=10 masked_mae=0.056433 left_arm_mae=0.061980 right_arm_mae=0.050886 imbalance_mae=0.011094 batch_time_s=0.4784 +sample_eval_batch=6 num_steps=10 masked_mae=0.079709 left_arm_mae=0.137447 right_arm_mae=0.021970 imbalance_mae=0.115477 batch_time_s=0.3479 +sample_eval_batch=7 num_steps=10 masked_mae=0.079619 left_arm_mae=0.139576 right_arm_mae=0.019663 imbalance_mae=0.119913 batch_time_s=0.4953 +sample_eval_batch=8 num_steps=10 masked_mae=0.047182 left_arm_mae=0.076524 right_arm_mae=0.017840 imbalance_mae=0.058684 batch_time_s=0.4351 +sample_eval_batch=9 num_steps=10 masked_mae=0.053413 left_arm_mae=0.088859 right_arm_mae=0.017968 imbalance_mae=0.070891 batch_time_s=0.6540 +sample_eval_batch=10 num_steps=10 masked_mae=0.066754 left_arm_mae=0.114514 right_arm_mae=0.018994 imbalance_mae=0.095520 batch_time_s=0.3876 +sample_eval_batch=11 num_steps=10 masked_mae=0.064689 left_arm_mae=0.108810 right_arm_mae=0.020569 imbalance_mae=0.088241 batch_time_s=0.4600 +sample_eval_batch=12 num_steps=10 masked_mae=0.060080 left_arm_mae=0.098145 right_arm_mae=0.022016 imbalance_mae=0.076129 batch_time_s=0.4352 +sample_eval_batch=13 num_steps=10 masked_mae=0.079265 left_arm_mae=0.137559 right_arm_mae=0.020971 imbalance_mae=0.116587 batch_time_s=0.3373 +sample_eval_batch=14 num_steps=10 masked_mae=0.071031 left_arm_mae=0.110774 right_arm_mae=0.031288 imbalance_mae=0.079487 batch_time_s=0.4683 +sample_eval_batch=15 num_steps=10 masked_mae=0.074507 left_arm_mae=0.037228 right_arm_mae=0.111785 imbalance_mae=0.074557 batch_time_s=0.4737 +sample_eval_batch=16 num_steps=10 masked_mae=0.091350 left_arm_mae=0.055550 right_arm_mae=0.127150 imbalance_mae=0.071600 batch_time_s=0.4467 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.065897 +sample_eval_num_steps_10_std_masked_mae: 0.012628 +sample_eval_num_steps_10_mean_left_arm_mae: 0.088735 +sample_eval_num_steps_10_std_left_arm_mae: 0.032010 +sample_eval_num_steps_10_mean_right_arm_mae: 0.043059 +sample_eval_num_steps_10_std_right_arm_mae: 0.032823 +sample_eval_num_steps_10_mean_left_joint_mae: 0.094654 +sample_eval_num_steps_10_std_left_joint_mae: 0.036668 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.047298 +sample_eval_num_steps_10_std_left_gripper_mae: 0.064660 +sample_eval_num_steps_10_mean_right_joint_mae: 0.043769 +sample_eval_num_steps_10_std_right_joint_mae: 0.032862 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.038089 +sample_eval_num_steps_10_std_right_gripper_mae: 0.049635 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.064491 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.038643 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.4575 std=0.0902 min=0.3373 max=0.6590 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_5000.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_5000.log new file mode 100644 index 0000000000000000000000000000000000000000..0b7c1bdb3576f500506264531bd828b723138788 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/handover_packed_parallel_10k_val_5000.log @@ -0,0 +1,148 @@ +starting_eval config=pi05_twin_handover_256_packed_parallel_pytorch_10k checkpoint=/workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000 repo_id=lsnu/twin_handover_256_val +eval_loader batch_size=16 num_batches=50 num_workers=0 +teacher_forced_eval_seed: 123 +sample_eval enabled=True batch_size=16 num_batches=16 num_steps=[4, 10] seed=321 +weight_loading missing=0 unexpected=0 device=cuda:0 +eval_batch=1 loss=0.018009 left_arm_loss=0.018792 right_arm_loss=0.017225 imbalance=0.001567 batch_time_s=1.7875 +eval_batch=2 loss=0.003388 left_arm_loss=0.002589 right_arm_loss=0.004187 imbalance=0.001598 batch_time_s=0.4062 +eval_batch=3 loss=0.003306 left_arm_loss=0.002658 right_arm_loss=0.003954 imbalance=0.001296 batch_time_s=0.5074 +eval_batch=4 loss=0.017967 left_arm_loss=0.019657 right_arm_loss=0.016276 imbalance=0.003381 batch_time_s=0.4412 +eval_batch=5 loss=0.012909 left_arm_loss=0.015657 right_arm_loss=0.010161 imbalance=0.005496 batch_time_s=0.4610 +eval_batch=6 loss=0.012707 left_arm_loss=0.023879 right_arm_loss=0.001535 imbalance=0.022344 batch_time_s=0.5589 +eval_batch=7 loss=0.012281 left_arm_loss=0.023433 right_arm_loss=0.001129 imbalance=0.022304 batch_time_s=0.3979 +eval_batch=8 loss=0.010313 left_arm_loss=0.019642 right_arm_loss=0.000985 imbalance=0.018657 batch_time_s=0.2939 +eval_batch=9 loss=0.011270 left_arm_loss=0.021697 right_arm_loss=0.000842 imbalance=0.020855 batch_time_s=0.2766 +eval_batch=10 loss=0.020419 left_arm_loss=0.040029 right_arm_loss=0.000809 imbalance=0.039219 batch_time_s=0.2618 +eval_batch=11 loss=0.012979 left_arm_loss=0.024547 right_arm_loss=0.001411 imbalance=0.023136 batch_time_s=0.2342 +eval_batch=12 loss=0.016370 left_arm_loss=0.031587 right_arm_loss=0.001153 imbalance=0.030434 batch_time_s=0.2544 +eval_batch=13 loss=0.022673 left_arm_loss=0.043847 right_arm_loss=0.001498 imbalance=0.042349 batch_time_s=0.3947 +eval_batch=14 loss=0.015649 left_arm_loss=0.013524 right_arm_loss=0.017774 imbalance=0.004250 batch_time_s=0.3622 +eval_batch=15 loss=0.065092 left_arm_loss=0.016442 right_arm_loss=0.113742 imbalance=0.097301 batch_time_s=0.3778 +eval_batch=16 loss=0.031027 left_arm_loss=0.014831 right_arm_loss=0.047224 imbalance=0.032393 batch_time_s=0.2350 +eval_batch=17 loss=0.020677 left_arm_loss=0.037752 right_arm_loss=0.003602 imbalance=0.034149 batch_time_s=0.2326 +eval_batch=18 loss=0.032304 left_arm_loss=0.042944 right_arm_loss=0.021663 imbalance=0.021281 batch_time_s=0.2283 +eval_batch=19 loss=0.010371 left_arm_loss=0.016259 right_arm_loss=0.004484 imbalance=0.011775 batch_time_s=0.3932 +eval_batch=20 loss=0.015657 left_arm_loss=0.026673 right_arm_loss=0.004640 imbalance=0.022033 batch_time_s=0.4344 +eval_batch=21 loss=0.073863 left_arm_loss=0.143820 right_arm_loss=0.003905 imbalance=0.139915 batch_time_s=0.3016 +eval_batch=22 loss=0.086733 left_arm_loss=0.138835 right_arm_loss=0.034632 imbalance=0.104203 batch_time_s=0.3656 +eval_batch=23 loss=0.041098 left_arm_loss=0.072591 right_arm_loss=0.009606 imbalance=0.062984 batch_time_s=0.2442 +eval_batch=24 loss=0.083534 left_arm_loss=0.164134 right_arm_loss=0.002933 imbalance=0.161201 batch_time_s=0.3228 +eval_batch=25 loss=0.067565 left_arm_loss=0.134226 right_arm_loss=0.000903 imbalance=0.133323 batch_time_s=0.4508 +eval_batch=26 loss=0.030208 left_arm_loss=0.057778 right_arm_loss=0.002639 imbalance=0.055139 batch_time_s=0.3326 +eval_batch=27 loss=0.029988 left_arm_loss=0.055316 right_arm_loss=0.004661 imbalance=0.050655 batch_time_s=0.2515 +eval_batch=28 loss=0.017679 left_arm_loss=0.034427 right_arm_loss=0.000931 imbalance=0.033496 batch_time_s=0.3445 +eval_batch=29 loss=0.028188 left_arm_loss=0.054125 right_arm_loss=0.002251 imbalance=0.051874 batch_time_s=0.3502 +eval_batch=30 loss=0.025111 left_arm_loss=0.046639 right_arm_loss=0.003583 imbalance=0.043056 batch_time_s=0.4274 +eval_batch=31 loss=0.047902 left_arm_loss=0.091445 right_arm_loss=0.004359 imbalance=0.087086 batch_time_s=0.2993 +eval_batch=32 loss=0.034540 left_arm_loss=0.036401 right_arm_loss=0.032679 imbalance=0.003722 batch_time_s=0.3326 +eval_batch=33 loss=0.030009 left_arm_loss=0.011000 right_arm_loss=0.049019 imbalance=0.038019 batch_time_s=0.3898 +eval_batch=34 loss=0.064066 left_arm_loss=0.109818 right_arm_loss=0.018313 imbalance=0.091505 batch_time_s=0.3321 +eval_batch=35 loss=0.038442 left_arm_loss=0.072379 right_arm_loss=0.004506 imbalance=0.067873 batch_time_s=0.4945 +eval_batch=36 loss=0.015525 left_arm_loss=0.012302 right_arm_loss=0.018747 imbalance=0.006445 batch_time_s=0.3318 +eval_batch=37 loss=0.004400 left_arm_loss=0.005982 right_arm_loss=0.002817 imbalance=0.003166 batch_time_s=0.2853 +eval_batch=38 loss=0.033808 left_arm_loss=0.038027 right_arm_loss=0.029589 imbalance=0.008438 batch_time_s=0.3567 +eval_batch=39 loss=0.031964 left_arm_loss=0.013754 right_arm_loss=0.050174 imbalance=0.036420 batch_time_s=0.2974 +eval_batch=40 loss=0.014522 left_arm_loss=0.017470 right_arm_loss=0.011574 imbalance=0.005896 batch_time_s=0.3888 +eval_batch=41 loss=0.024863 left_arm_loss=0.048452 right_arm_loss=0.001273 imbalance=0.047179 batch_time_s=0.4214 +eval_batch=42 loss=0.012502 left_arm_loss=0.023855 right_arm_loss=0.001148 imbalance=0.022707 batch_time_s=0.3489 +eval_batch=43 loss=0.004550 left_arm_loss=0.007728 right_arm_loss=0.001372 imbalance=0.006356 batch_time_s=0.3647 +eval_batch=44 loss=0.003732 left_arm_loss=0.006069 right_arm_loss=0.001396 imbalance=0.004672 batch_time_s=0.2821 +eval_batch=45 loss=0.006992 left_arm_loss=0.012467 right_arm_loss=0.001518 imbalance=0.010949 batch_time_s=0.3792 +eval_batch=46 loss=0.022667 left_arm_loss=0.043763 right_arm_loss=0.001571 imbalance=0.042192 batch_time_s=0.2396 +eval_batch=47 loss=0.026646 left_arm_loss=0.008901 right_arm_loss=0.044391 imbalance=0.035490 batch_time_s=0.2305 +eval_batch=48 loss=0.032550 left_arm_loss=0.005242 right_arm_loss=0.059858 imbalance=0.054616 batch_time_s=0.3562 +eval_batch=49 loss=0.007825 left_arm_loss=0.002985 right_arm_loss=0.012665 imbalance=0.009680 batch_time_s=0.2352 +eval_batch=50 loss=0.060185 left_arm_loss=0.031356 right_arm_loss=0.089014 imbalance=0.057658 batch_time_s=0.2872 +config_name: pi05_twin_handover_256_packed_parallel_pytorch_10k +checkpoint_path: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/handover_packed_parallel_10k/5000 +repo_id_used: lsnu/twin_handover_256_val +num_batches: 50 +mean_val_loss: 0.027340 +std_val_loss: 0.020897 +mean_left_arm_loss: 0.039155 +std_left_arm_loss: 0.038641 +mean_right_arm_loss: 0.015526 +std_right_arm_loss: 0.023413 +mean_left_joint_loss: 0.042035 +std_left_joint_loss: 0.043377 +mean_left_gripper_loss: 0.018994 +std_left_gripper_loss: 0.032843 +mean_right_joint_loss: 0.015753 +std_right_joint_loss: 0.024564 +mean_right_gripper_loss: 0.013938 +std_right_gripper_loss: 0.029304 +mean_left_right_imbalance: 0.038635 +std_left_right_imbalance: 0.037436 +per_batch_timing_seconds: mean=0.3717 std=0.2172 min=0.2283 max=1.7875 +active_mask_dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] +masked_dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] +weight_loading_missing_keys: [] +weight_loading_unexpected_keys: [] +sample_eval_batch=1 num_steps=4 masked_mae=0.050586 left_arm_mae=0.058916 right_arm_mae=0.042257 imbalance_mae=0.016659 batch_time_s=0.3724 +sample_eval_batch=2 num_steps=4 masked_mae=0.022248 left_arm_mae=0.021135 right_arm_mae=0.023362 imbalance_mae=0.002226 batch_time_s=0.3071 +sample_eval_batch=3 num_steps=4 masked_mae=0.023393 left_arm_mae=0.020391 right_arm_mae=0.026394 imbalance_mae=0.006003 batch_time_s=0.3356 +sample_eval_batch=4 num_steps=4 masked_mae=0.035006 left_arm_mae=0.031920 right_arm_mae=0.038093 imbalance_mae=0.006173 batch_time_s=0.3073 +sample_eval_batch=5 num_steps=4 masked_mae=0.033634 left_arm_mae=0.037647 right_arm_mae=0.029620 imbalance_mae=0.008027 batch_time_s=0.4116 +sample_eval_batch=6 num_steps=4 masked_mae=0.037616 left_arm_mae=0.063739 right_arm_mae=0.011493 imbalance_mae=0.052246 batch_time_s=0.2947 +sample_eval_batch=7 num_steps=4 masked_mae=0.034674 left_arm_mae=0.057874 right_arm_mae=0.011474 imbalance_mae=0.046401 batch_time_s=0.3792 +sample_eval_batch=8 num_steps=4 masked_mae=0.032207 left_arm_mae=0.053714 right_arm_mae=0.010699 imbalance_mae=0.043015 batch_time_s=0.4709 +sample_eval_batch=9 num_steps=4 masked_mae=0.044335 left_arm_mae=0.077539 right_arm_mae=0.011131 imbalance_mae=0.066409 batch_time_s=0.3545 +sample_eval_batch=10 num_steps=4 masked_mae=0.051304 left_arm_mae=0.091093 right_arm_mae=0.011515 imbalance_mae=0.079578 batch_time_s=0.2719 +sample_eval_batch=11 num_steps=4 masked_mae=0.032892 left_arm_mae=0.055199 right_arm_mae=0.010585 imbalance_mae=0.044614 batch_time_s=0.2832 +sample_eval_batch=12 num_steps=4 masked_mae=0.040746 left_arm_mae=0.070150 right_arm_mae=0.011341 imbalance_mae=0.058809 batch_time_s=0.3939 +sample_eval_batch=13 num_steps=4 masked_mae=0.040115 left_arm_mae=0.068278 right_arm_mae=0.011951 imbalance_mae=0.056326 batch_time_s=0.4705 +sample_eval_batch=14 num_steps=4 masked_mae=0.035901 left_arm_mae=0.049350 right_arm_mae=0.022452 imbalance_mae=0.026898 batch_time_s=0.5485 +sample_eval_batch=15 num_steps=4 masked_mae=0.080623 left_arm_mae=0.025484 right_arm_mae=0.135762 imbalance_mae=0.110278 batch_time_s=0.5356 +sample_eval_batch=16 num_steps=4 masked_mae=0.056104 left_arm_mae=0.028460 right_arm_mae=0.083749 imbalance_mae=0.055290 batch_time_s=0.4407 +sample_eval_num_steps_4_num_batches: 16 +sample_eval_num_steps_4_mean_masked_mae: 0.040712 +sample_eval_num_steps_4_std_masked_mae: 0.013646 +sample_eval_num_steps_4_mean_left_arm_mae: 0.050681 +sample_eval_num_steps_4_std_left_arm_mae: 0.020624 +sample_eval_num_steps_4_mean_right_arm_mae: 0.030742 +sample_eval_num_steps_4_std_right_arm_mae: 0.032790 +sample_eval_num_steps_4_mean_left_joint_mae: 0.053976 +sample_eval_num_steps_4_std_left_joint_mae: 0.024153 +sample_eval_num_steps_4_mean_left_gripper_mae: 0.027611 +sample_eval_num_steps_4_std_left_gripper_mae: 0.024580 +sample_eval_num_steps_4_mean_right_joint_mae: 0.032227 +sample_eval_num_steps_4_std_right_joint_mae: 0.036350 +sample_eval_num_steps_4_mean_right_gripper_mae: 0.020349 +sample_eval_num_steps_4_std_right_gripper_mae: 0.017496 +sample_eval_num_steps_4_mean_left_right_imbalance_mae: 0.042435 +sample_eval_num_steps_4_std_left_right_imbalance_mae: 0.029207 +sample_eval_num_steps_4_per_batch_timing_seconds: mean=0.3861 std=0.0848 min=0.2719 max=0.5485 +sample_eval_batch=1 num_steps=10 masked_mae=0.060244 left_arm_mae=0.069755 right_arm_mae=0.050732 imbalance_mae=0.019023 batch_time_s=0.3931 +sample_eval_batch=2 num_steps=10 masked_mae=0.028194 left_arm_mae=0.026266 right_arm_mae=0.030122 imbalance_mae=0.003856 batch_time_s=0.4060 +sample_eval_batch=3 num_steps=10 masked_mae=0.029302 left_arm_mae=0.026488 right_arm_mae=0.032115 imbalance_mae=0.005627 batch_time_s=0.6280 +sample_eval_batch=4 num_steps=10 masked_mae=0.040353 left_arm_mae=0.038823 right_arm_mae=0.041882 imbalance_mae=0.003059 batch_time_s=0.5683 +sample_eval_batch=5 num_steps=10 masked_mae=0.037448 left_arm_mae=0.040207 right_arm_mae=0.034689 imbalance_mae=0.005518 batch_time_s=0.4537 +sample_eval_batch=6 num_steps=10 masked_mae=0.041892 left_arm_mae=0.069450 right_arm_mae=0.014334 imbalance_mae=0.055116 batch_time_s=0.5177 +sample_eval_batch=7 num_steps=10 masked_mae=0.037873 left_arm_mae=0.061853 right_arm_mae=0.013892 imbalance_mae=0.047961 batch_time_s=0.3831 +sample_eval_batch=8 num_steps=10 masked_mae=0.035303 left_arm_mae=0.058263 right_arm_mae=0.012343 imbalance_mae=0.045920 batch_time_s=0.3624 +sample_eval_batch=9 num_steps=10 masked_mae=0.049224 left_arm_mae=0.084585 right_arm_mae=0.013863 imbalance_mae=0.070723 batch_time_s=0.4046 +sample_eval_batch=10 num_steps=10 masked_mae=0.053856 left_arm_mae=0.092990 right_arm_mae=0.014723 imbalance_mae=0.078267 batch_time_s=0.3373 +sample_eval_batch=11 num_steps=10 masked_mae=0.036063 left_arm_mae=0.058790 right_arm_mae=0.013336 imbalance_mae=0.045454 batch_time_s=0.4558 +sample_eval_batch=12 num_steps=10 masked_mae=0.043667 left_arm_mae=0.073829 right_arm_mae=0.013505 imbalance_mae=0.060324 batch_time_s=0.3940 +sample_eval_batch=13 num_steps=10 masked_mae=0.044050 left_arm_mae=0.071945 right_arm_mae=0.016154 imbalance_mae=0.055791 batch_time_s=0.5080 +sample_eval_batch=14 num_steps=10 masked_mae=0.040370 left_arm_mae=0.054512 right_arm_mae=0.026228 imbalance_mae=0.028284 batch_time_s=0.5988 +sample_eval_batch=15 num_steps=10 masked_mae=0.080254 left_arm_mae=0.023710 right_arm_mae=0.136797 imbalance_mae=0.113086 batch_time_s=0.4224 +sample_eval_batch=16 num_steps=10 masked_mae=0.058699 left_arm_mae=0.028788 right_arm_mae=0.088609 imbalance_mae=0.059822 batch_time_s=0.4455 +sample_eval_num_steps_10_num_batches: 16 +sample_eval_num_steps_10_mean_masked_mae: 0.044799 +sample_eval_num_steps_10_std_masked_mae: 0.012807 +sample_eval_num_steps_10_mean_left_arm_mae: 0.055016 +sample_eval_num_steps_10_std_left_arm_mae: 0.021278 +sample_eval_num_steps_10_mean_right_arm_mae: 0.034583 +sample_eval_num_steps_10_std_right_arm_mae: 0.032757 +sample_eval_num_steps_10_mean_left_joint_mae: 0.059296 +sample_eval_num_steps_10_std_left_joint_mae: 0.025068 +sample_eval_num_steps_10_mean_left_gripper_mae: 0.025058 +sample_eval_num_steps_10_std_left_gripper_mae: 0.027173 +sample_eval_num_steps_10_mean_right_joint_mae: 0.035777 +sample_eval_num_steps_10_std_right_joint_mae: 0.036454 +sample_eval_num_steps_10_mean_right_gripper_mae: 0.026224 +sample_eval_num_steps_10_std_right_gripper_mae: 0.016890 +sample_eval_num_steps_10_mean_left_right_imbalance_mae: 0.043614 +sample_eval_num_steps_10_std_left_right_imbalance_mae: 0.030178 +sample_eval_num_steps_10_per_batch_timing_seconds: mean=0.4549 std=0.0835 min=0.3373 max=0.6280 diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/smoke_baseline_10k_diag.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/smoke_baseline_10k_diag.log new file mode 100644 index 0000000000000000000000000000000000000000..a21e94b1abd8901a10e9291eaee1e7db5d837fee --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/smoke_baseline_10k_diag.log @@ -0,0 +1,149 @@ +W0309 15:40:17.171000 3586 torch/distributed/run.py:766] +W0309 15:40:17.171000 3586 torch/distributed/run.py:766] ***************************************** +W0309 15:40:17.171000 3586 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0309 15:40:17.171000 3586 torch/distributed/run.py:766] ***************************************** +15:41:17.850 [I] Created experiment checkpoint directory: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/smoke_baseline_10k_diag (3655:train_pytorch.py:505) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank0]:[W309 15:41:18.330229924 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank2]:[W309 15:41:18.361924667 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank3]:[W309 15:41:18.083889614 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank1]:[W309 15:41:19.988503311 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +15:41:20.805 [I] Using batch size per GPU: 4 (total batch size across 4 GPUs: 16) (3655:train_pytorch.py:524) +15:41:20.957 [I] Loaded norm stats from /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train (3655:config.py:234) +15:41:20.960 [I] data_config: DataConfig(repo_id='lsnu/twin_handover_256_train', asset_id='lsnu/twin_handover_256_train', norm_stats={'state': NormStats(mean=array([ 0.40321857, 0.17899239, -0.07588876, -2.06326795, -0.46418607, + 1.79356563, 0.70229131, 0.48194093, 0.93952829, 0.86693275, + -1.03168762, -1.9056077 , -0.53421056, 1.87584054, 2.36738205, + 0.91249251]), std=array([0.73344636, 0.47653052, 0.72710407, 0.42399687, 0.63613892, + 0.61144608, 1.11724186, 0.49967375, 0.86981195, 0.75071597, + 0.90787333, 0.35008711, 0.51183224, 0.36600712, 0.56947577, + 0.28257725]), q01=array([-1.52408956, -1.32446341, -1.91092197, -2.89885788, -1.66315554, + 0.59010215, -2.27611645, 0. , -1.77352981, -1.62131719, + -1.77092851, -2.19172778, -2.03159353, 0.55409113, 0.79255736, + 0. ]), q99=array([ 2.16638614, 1.38857444, 1.93436338, -0.88548369, 1.39976143, + 2.99162304, 2.8194857 , 0.9998 , 1.46557211, 1.74660106, + 1.58644652, -0.87876934, 2.25910752, 2.54628449, 2.89347284, + 0.9998 ])), 'actions': NormStats(mean=array([ 0.05879939, -0.00704042, -0.02719213, -0.07685276, -0.07520971, + -0.00498583, 0.03577602, 0.48164892, 0.06564316, 0.06023132, + -0.10068271, -0.09547432, -0.0526481 , 0.08205888, 0.13954687, + 0.88333535]), std=array([0.18337056, 0.28128958, 0.18525195, 0.29767084, 0.22944973, + 0.40312037, 0.3896611 , 0.49966311, 0.21938531, 0.16883859, + 0.20206179, 0.14864719, 0.12629333, 0.15546791, 0.23423795, + 0.32102022]), q01=array([-0.34140511, -0.71597991, -0.55301429, -0.8233152 , -0.68097536, + -0.87723451, -0.86000918, 0. , -0.53261366, -0.49289397, + -0.48524564, -0.35752607, -0.42426748, -0.18230745, -0.09212705, + 0. ]), q99=array([0.55444025, 0.69361174, 0.44115428, 0.550829 , 0.49707318, + 0.68353445, 0.82907713, 0.9998 , 0.42654409, 0.44255511, + 0.4114292 , 0.01550327, 0.38038206, 0.71452535, 0.62808441, + 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (3655:data_loader.py:283) +15:41:20.969 [I] Using existing local LeRobot dataset mirror for lsnu/twin_handover_256_train: /workspace/lerobot/lsnu/twin_handover_256_train (3655:data_loader.py:149) +15:41:24.542 [I] local_batch_size: 4 (3655:data_loader.py:364) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +15:42:35.770 [I] Enabled gradient checkpointing for PI0Pytorch model (3655:pi0_pytorch.py:150) +15:42:35.771 [I] Enabled gradient checkpointing for memory optimization (3655:train_pytorch.py:596) +15:42:35.773 [I] Step 0 (after_model_creation): GPU memory - allocated: 7.47GB, reserved: 7.48GB, free: 0.01GB, peak_allocated: 7.47GB, peak_reserved: 7.48GB | DDP: rank=0, world_size=4 (3655:train_pytorch.py:465) +15:42:35.940 [I] Loading weights from: /workspace/checkpoints/pi05_base_single_pytorch (3655:train_pytorch.py:625) +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +15:42:38.116 [I] Weight loading missing key count: 0 (3655:train_pytorch.py:629) +15:42:38.117 [I] Weight loading missing keys: set() (3655:train_pytorch.py:630) +15:42:38.118 [I] Weight loading unexpected key count: 0 (3655:train_pytorch.py:631) +15:42:38.118 [I] Weight loading unexpected keys: [] (3655:train_pytorch.py:632) +15:42:38.118 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_single_pytorch (3655:train_pytorch.py:633) +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +15:42:38.122 [I] Running on: 9a96de7d560b | world_size=4 (3655:train_pytorch.py:673) +15:42:38.122 [I] Training config: batch_size=16, effective_batch_size=4, num_train_steps=20 (3655:train_pytorch.py:674) +15:42:38.123 [I] Memory optimizations: gradient_checkpointing=True (3655:train_pytorch.py:677) +15:42:38.123 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (3655:train_pytorch.py:678) +15:42:38.124 [I] LR schedule: warmup=500, peak_lr=2.50e-05, decay_steps=10000, end_lr=2.50e-06 (3655:train_pytorch.py:679) +15:42:38.124 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (3655:train_pytorch.py:682) +15:42:38.124 [I] EMA is not supported for PyTorch training (3655:train_pytorch.py:685) +15:42:38.125 [I] Training precision: bfloat16 (3655:train_pytorch.py:686) +15:42:38.129 [I] Resolved config name: pi05_twin_handover_256_packed_baseline_pytorch_10k (3655:train_pytorch.py:280) +15:42:38.129 [I] Dataset repo_id: lsnu/twin_handover_256_train (3655:train_pytorch.py:281) +15:42:38.129 [I] Norm-stats file path: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (3655:train_pytorch.py:282) +15:42:38.129 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (3655:train_pytorch.py:283) +15:42:38.130 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_single_pytorch (3655:train_pytorch.py:284) +15:42:38.130 [I] Model type: baseline (3655:train_pytorch.py:285) +15:42:38.130 [I] Packed transforms active: True (3655:train_pytorch.py:286) +15:42:38.130 [I] World size: 4 (3655:train_pytorch.py:287) +15:42:38.130 [I] Batch size: local=4, global=16 (3655:train_pytorch.py:288) +15:42:38.131 [I] num_workers: 8 (3655:train_pytorch.py:289) +15:42:38.131 [I] Precision: bfloat16 (3655:train_pytorch.py:290) +15:42:38.131 [I] LR schedule summary: warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (3655:train_pytorch.py:291) +15:42:38.131 [I] Save/log intervals: save_interval=20, log_interval=5 (3655:train_pytorch.py:298) +15:42:38.132 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (3655:train_pytorch.py:299) +15:42:38.132 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (3655:train_pytorch.py:300) +15:42:38.132 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (3655:train_pytorch.py:301) +15:42:38.132 [I] Gradient bucket diagnostics: action_in_proj, action_out_proj, shared_expert (3655:train_pytorch.py:694) + Training: 0%| | 0/20 [00:00 /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_baseline_pytorch_10k/smoke_baseline_10k_diag/20 (3655:train_pytorch.py:350) + Training: 100%|██████████| 20/20 [01:56<00:00, 29.94s/it, loss=1.5393, lr=9.48e-07, step=19] Training: 100%|██████████| 20/20 [01:56<00:00, 29.94s/it, loss=0.8768, lr=9.98e-07, step=20] Training: 100%|██████████| 20/20 [01:56<00:00, 5.82s/it, loss=0.8768, lr=9.98e-07, step=20] +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/smoke_parallel_10k_diag.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/smoke_parallel_10k_diag.log new file mode 100644 index 0000000000000000000000000000000000000000..d21bc62aee79f08abd5e72b6f9795703d21c6ae1 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/run_logs/smoke_parallel_10k_diag.log @@ -0,0 +1,149 @@ +W0309 15:46:21.273000 6578 torch/distributed/run.py:766] +W0309 15:46:21.273000 6578 torch/distributed/run.py:766] ***************************************** +W0309 15:46:21.273000 6578 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0309 15:46:21.273000 6578 torch/distributed/run.py:766] ***************************************** +15:47:11.286 [I] Created experiment checkpoint directory: /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/smoke_parallel_10k_diag (6647:train_pytorch.py:505) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank2]:[W309 15:47:11.762262237 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +[rank0]:[W309 15:47:11.772293922 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank1]:[W309 15:47:12.078834637 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +[rank3]:[W309 15:47:13.952599935 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. +15:47:14.872 [I] Using batch size per GPU: 4 (total batch size across 4 GPUs: 16) (6647:train_pytorch.py:524) +15:47:15.088 [I] Loaded norm stats from /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train (6647:config.py:234) +15:47:15.090 [I] data_config: DataConfig(repo_id='lsnu/twin_handover_256_train', asset_id='lsnu/twin_handover_256_train', norm_stats={'state': NormStats(mean=array([ 0.40321857, 0.17899239, -0.07588876, -2.06326795, -0.46418607, + 1.79356563, 0.70229131, 0.48194093, 0.93952829, 0.86693275, + -1.03168762, -1.9056077 , -0.53421056, 1.87584054, 2.36738205, + 0.91249251]), std=array([0.73344636, 0.47653052, 0.72710407, 0.42399687, 0.63613892, + 0.61144608, 1.11724186, 0.49967375, 0.86981195, 0.75071597, + 0.90787333, 0.35008711, 0.51183224, 0.36600712, 0.56947577, + 0.28257725]), q01=array([-1.52408956, -1.32446341, -1.91092197, -2.89885788, -1.66315554, + 0.59010215, -2.27611645, 0. , -1.77352981, -1.62131719, + -1.77092851, -2.19172778, -2.03159353, 0.55409113, 0.79255736, + 0. ]), q99=array([ 2.16638614, 1.38857444, 1.93436338, -0.88548369, 1.39976143, + 2.99162304, 2.8194857 , 0.9998 , 1.46557211, 1.74660106, + 1.58644652, -0.87876934, 2.25910752, 2.54628449, 2.89347284, + 0.9998 ])), 'actions': NormStats(mean=array([ 0.05879939, -0.00704042, -0.02719213, -0.07685276, -0.07520971, + -0.00498583, 0.03577602, 0.48164892, 0.06564316, 0.06023132, + -0.10068271, -0.09547432, -0.0526481 , 0.08205888, 0.13954687, + 0.88333535]), std=array([0.18337056, 0.28128958, 0.18525195, 0.29767084, 0.22944973, + 0.40312037, 0.3896611 , 0.49966311, 0.21938531, 0.16883859, + 0.20206179, 0.14864719, 0.12629333, 0.15546791, 0.23423795, + 0.32102022]), q01=array([-0.34140511, -0.71597991, -0.55301429, -0.8233152 , -0.68097536, + -0.87723451, -0.86000918, 0. , -0.53261366, -0.49289397, + -0.48524564, -0.35752607, -0.42426748, -0.18230745, -0.09212705, + 0. ]), q99=array([0.55444025, 0.69361174, 0.44115428, 0.550829 , 0.49707318, + 0.68353445, 0.82907713, 0.9998 , 0.42654409, 0.44255511, + 0.4114292 , 0.01550327, 0.38038206, 0.71452535, 0.62808441, + 0.9998 ]))}, repack_transforms=Group(inputs=[RepackTransform(structure={'images': {'cam_high': 'front_image', 'cam_left_wrist': 'wrist_left_image', 'cam_right_wrist': 'wrist_right_image'}, 'state': 'state', 'actions': 'action', 'prompt': 'task'})], outputs=()), data_transforms=Group(inputs=[AlohaInputs(adapt_to_pi=False)], outputs=[]), model_transforms=Group(inputs=[InjectDefaultPrompt(prompt=None), ResizeImages(height=224, width=224), TokenizePrompt(tokenizer=, discrete_state_input=True), PackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))], outputs=[UnpackPerArmBlocks(real_arm_dims=(8, 8), block_dims=(16, 16))]), use_quantile_norm=True, action_sequence_keys=('action',), prompt_from_task=False, rlds_data_dir=None, action_space=None, datasets=()) (6647:data_loader.py:283) +15:47:15.124 [I] Using existing local LeRobot dataset mirror for lsnu/twin_handover_256_train: /workspace/lerobot/lsnu/twin_handover_256_train (6647:data_loader.py:149) +15:47:21.449 [I] local_batch_size: 4 (6647:data_loader.py:364) +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once +15:50:36.938 [I] Enabled gradient checkpointing for PI0Pytorch model (6647:pi0_pytorch.py:150) +15:50:36.949 [I] Enabled gradient checkpointing for memory optimization (6647:train_pytorch.py:596) +15:50:36.951 [I] Step 0 (after_model_creation): GPU memory - allocated: 7.48GB, reserved: 7.48GB, free: 0.00GB, peak_allocated: 7.48GB, peak_reserved: 7.48GB | DDP: rank=0, world_size=4 (6647:train_pytorch.py:465) +15:51:05.826 [I] Loading weights from: /workspace/checkpoints/pi05_base_parallel_packed_from_single (6647:train_pytorch.py:625) +15:51:08.127 [I] Weight loading missing key count: 0 (6647:train_pytorch.py:629) +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +/usr/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock. + self.pid = os.fork() +15:51:08.133 [I] Weight loading missing keys: set() (6647:train_pytorch.py:630) +15:51:08.134 [I] Weight loading unexpected key count: 0 (6647:train_pytorch.py:631) +15:51:08.135 [I] Weight loading unexpected keys: [] (6647:train_pytorch.py:632) +15:51:08.135 [I] Loaded PyTorch weights from /workspace/checkpoints/pi05_base_parallel_packed_from_single (6647:train_pytorch.py:633) +15:51:08.138 [I] Running on: 9a96de7d560b | world_size=4 (6647:train_pytorch.py:673) +15:51:08.139 [I] Training config: batch_size=16, effective_batch_size=4, num_train_steps=20 (6647:train_pytorch.py:674) +15:51:08.139 [I] Memory optimizations: gradient_checkpointing=True (6647:train_pytorch.py:677) +15:51:08.140 [I] DDP settings: find_unused_parameters=False, gradient_as_bucket_view=True, static_graph=True (6647:train_pytorch.py:678) +15:51:08.140 [I] LR schedule: warmup=500, peak_lr=2.50e-05, decay_steps=10000, end_lr=2.50e-06 (6647:train_pytorch.py:679) +15:51:08.140 [I] Optimizer: AdamW, weight_decay=1e-10, clip_norm=1.0 (6647:train_pytorch.py:682) +15:51:08.140 [I] EMA is not supported for PyTorch training (6647:train_pytorch.py:685) +15:51:08.140 [I] Training precision: bfloat16 (6647:train_pytorch.py:686) +15:51:08.162 [I] Resolved config name: pi05_twin_handover_256_packed_parallel_pytorch_10k (6647:train_pytorch.py:280) +15:51:08.162 [I] Dataset repo_id: lsnu/twin_handover_256_train (6647:train_pytorch.py:281) +15:51:08.163 [I] Norm-stats file path: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_parallel_pytorch_10k/lsnu/twin_handover_256_train/norm_stats.json (6647:train_pytorch.py:282) +15:51:08.163 [I] Norm-stats summary: {'keys': ['actions', 'state'], 'state_mean_len': 16, 'state_std_len': 16, 'actions_mean_len': 16, 'actions_std_len': 16} (6647:train_pytorch.py:283) +15:51:08.163 [I] Checkpoint source path: /workspace/checkpoints/pi05_base_parallel_packed_from_single (6647:train_pytorch.py:284) +15:51:08.163 [I] Model type: parallel (6647:train_pytorch.py:285) +15:51:08.164 [I] Packed transforms active: True (6647:train_pytorch.py:286) +15:51:08.164 [I] World size: 4 (6647:train_pytorch.py:287) +15:51:08.164 [I] Batch size: local=4, global=16 (6647:train_pytorch.py:288) +15:51:08.164 [I] num_workers: 8 (6647:train_pytorch.py:289) +15:51:08.164 [I] Precision: bfloat16 (6647:train_pytorch.py:290) +15:51:08.165 [I] LR schedule summary: warmup_steps=500, peak_lr=2.50e-05, decay_steps=10000, decay_lr=2.50e-06 (6647:train_pytorch.py:291) +15:51:08.165 [I] Save/log intervals: save_interval=20, log_interval=5 (6647:train_pytorch.py:298) +15:51:08.165 [I] Action-loss mask: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (6647:train_pytorch.py:299) +15:51:08.166 [I] Active mask dims: [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] (6647:train_pytorch.py:300) +15:51:08.166 [I] Masked dims: [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] (6647:train_pytorch.py:301) +15:51:08.166 [I] Gradient bucket diagnostics: action_in_proj_arms, arm_token_fuse, action_out_proj_arms, shared_expert (6647:train_pytorch.py:694) + Training: 0%| | 0/20 [00:00 /workspace/pi05tests-openpi-multiarm/openpi/checkpoints/pi05_twin_handover_256_packed_parallel_pytorch_10k/smoke_parallel_10k_diag/20 (6647:train_pytorch.py:350) + Training: 100%|██████████| 20/20 [02:18<00:00, 35.63s/it, loss=1.5224, lr=9.48e-07, step=19] Training: 100%|██████████| 20/20 [02:18<00:00, 35.63s/it, loss=0.8767, lr=9.98e-07, step=20] Training: 100%|██████████| 20/20 [02:18<00:00, 6.92s/it, loss=0.8767, lr=9.98e-07, step=20] +/workspace/pi05tests-openpi-multiarm/openpi/.venv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. + warnings.warn( # warn only once diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/sanity_checks/inspect_twin_packed_batch_handover_train.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/sanity_checks/inspect_twin_packed_batch_handover_train.log new file mode 100644 index 0000000000000000000000000000000000000000..f3b807731a9b0a41abea274c1877df9201f6cd69 --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/sanity_checks/inspect_twin_packed_batch_handover_train.log @@ -0,0 +1,176 @@ +config_name: pi05_twin_handover_256_packed_baseline_pytorch_2k +repo_id: lsnu/twin_handover_256_train +sample_index: 0 +norm_stats_path: /workspace/pi05tests-openpi-multiarm/openpi/assets/pi05_twin_handover_256_packed_baseline_pytorch_2k/lsnu/twin_handover_256_train/norm_stats.json +norm_stats_keys: ['actions', 'state'] +norm_stats_lengths: state_mean=16 state_std=16 action_mean=16 action_std=16 +block_boundaries: [0:8] [8:16] [16:24] [24:32] +raw_state_16d_shape: (16,) +raw_state_16d: +[ 7.1883e-07 1.7515e-01 -5.6890e-06 -8.7299e-01 -6.3130e-06 1.2216e+00 + 7.8540e-01 1.0000e+00 1.1957e-06 1.7514e-01 -9.2062e-07 -8.7312e-01 + 1.6098e-05 1.2216e+00 7.8539e-01 1.0000e+00] +raw_actions_16d_shape: (16, 16) +raw_actions_16d: +[[ 2.3842e-05 -8.2493e-04 -5.7220e-05 3.9577e-04 2.8610e-05 7.8201e-04 + -1.2398e-04 1.0000e+00 9.5367e-05 4.0293e-03 9.5367e-06 7.2479e-04 + 1.8120e-04 -1.4305e-05 -2.2411e-04 1.0000e+00] + [ 5.0068e-04 -1.5645e-02 2.6083e-03 -5.5575e-02 1.8883e-03 2.5430e-02 + -1.9326e-02 1.0000e+00 2.7800e-02 2.4877e-02 -2.7924e-02 -2.7843e-02 + -1.6832e-02 1.0629e-02 3.8543e-02 1.0000e+00] + [ 1.7738e-03 -7.6041e-02 8.9645e-03 -1.7257e-01 6.0558e-03 8.7943e-02 + -6.4831e-02 1.0000e+00 9.2287e-02 5.8761e-02 -9.3136e-02 -7.6413e-02 + -5.3630e-02 4.2353e-02 1.2606e-01 1.0000e+00] + [ 3.2425e-03 -1.3747e-01 1.5845e-02 -3.1527e-01 1.0653e-02 1.6477e-01 + -1.1840e-01 1.0000e+00 1.7036e-01 1.0629e-01 -1.7153e-01 -1.4015e-01 + -9.7461e-02 7.8468e-02 2.3009e-01 1.0000e+00] + [ 5.5885e-03 -2.1545e-01 2.4767e-02 -4.6663e-01 1.6103e-02 2.4452e-01 + -1.7446e-01 1.0000e+00 2.5305e-01 1.5107e-01 -2.5392e-01 -2.1260e-01 + -1.4490e-01 1.1766e-01 3.4122e-01 1.0000e+00] + [ 6.1035e-03 -2.8390e-01 3.3288e-02 -6.1909e-01 2.1739e-02 3.2683e-01 + -2.3199e-01 1.0000e+00 3.3677e-01 1.9970e-01 -3.3804e-01 -2.8173e-01 + -1.9161e-01 1.5831e-01 4.5282e-01 1.0000e+00] + [ 9.3937e-03 -3.1736e-01 3.8815e-02 -7.2264e-01 2.9097e-02 3.8407e-01 + -2.9788e-01 1.0000e+00 3.9431e-01 2.3764e-01 -3.9650e-01 -3.2045e-01 + -2.2884e-01 1.8487e-01 5.3961e-01 1.0000e+00] + [ 1.1177e-02 -3.3051e-01 4.2367e-02 -7.4072e-01 3.5295e-02 4.0234e-01 + -3.4810e-01 1.0000e+00 4.1353e-01 2.4687e-01 -4.1600e-01 -3.4033e-01 + -2.4390e-01 1.9067e-01 5.7513e-01 1.0000e+00] + [ 1.2674e-02 -3.1841e-01 4.3559e-02 -7.5366e-01 3.7665e-02 4.1035e-01 + -3.7488e-01 1.0000e+00 4.2095e-01 2.5672e-01 -4.2238e-01 -3.4335e-01 + -2.4950e-01 1.9567e-01 5.8634e-01 1.0000e+00] + [ 1.5645e-02 -3.0324e-01 4.3592e-02 -7.4167e-01 4.2624e-02 4.1367e-01 + -4.1199e-01 1.0000e+00 4.2353e-01 2.6254e-01 -4.2444e-01 -3.4899e-01 + -2.5064e-01 1.9762e-01 5.8977e-01 1.0000e+00] + [ 1.6398e-02 -2.9560e-01 4.2553e-02 -7.3503e-01 4.5595e-02 4.1383e-01 + -4.3354e-01 1.0000e+00 4.2382e-01 2.5776e-01 -4.2612e-01 -3.5491e-01 + -2.5177e-01 1.9462e-01 5.9134e-01 1.0000e+00] + [ 2.0757e-02 -2.9058e-01 4.2739e-02 -7.3133e-01 4.6840e-02 4.1339e-01 + -4.5310e-01 1.0000e+00 4.2468e-01 2.5057e-01 -4.2498e-01 -3.4835e-01 + -2.5149e-01 2.0029e-01 5.9138e-01 1.0000e+00] + [ 2.3303e-02 -2.7753e-01 4.1437e-02 -7.2254e-01 4.8075e-02 4.1380e-01 + -4.7155e-01 1.0000e+00 4.2468e-01 2.5254e-01 -4.2522e-01 -3.4195e-01 + -2.5130e-01 1.9623e-01 5.9127e-01 1.0000e+00] + [ 2.7924e-02 -2.5505e-01 4.0684e-02 -7.0069e-01 5.3768e-02 4.1076e-01 + -5.1048e-01 1.0000e+00 4.2446e-01 2.5574e-01 -4.2656e-01 -3.5101e-01 + -2.5181e-01 1.9645e-01 5.9101e-01 1.0000e+00] + [ 3.2401e-02 -2.4053e-01 4.1451e-02 -6.8364e-01 5.6882e-02 4.1132e-01 + -5.4158e-01 1.0000e+00 4.2435e-01 2.5109e-01 -4.2632e-01 -3.5082e-01 + -2.5095e-01 1.9805e-01 5.9107e-01 1.0000e+00] + [ 3.4809e-02 -2.2431e-01 4.0565e-02 -6.7288e-01 5.6076e-02 4.0839e-01 + -5.6400e-01 1.0000e+00 4.2504e-01 2.5486e-01 -4.2588e-01 -3.4874e-01 + -2.5139e-01 1.9783e-01 5.9183e-01 1.0000e+00]] +normalized_state_16d_shape: (16,) +normalized_state_16d: +[-0.174 0.1055 -0.0061 1.0124 0.086 -0.4741 0.2016 1.0004 0.0951 + 0.0668 0.0549 1.0086 -0.053 -0.3299 -1.0068 1.0004] +normalized_actions_16d_shape: (16, 16) +normalized_actions_16d: +[[-0.2378 0.0147 0.1124 0.1989 0.1562 0.1251 0.0182 1.0004 0.1108 + 0.0624 0.0823 0.9208 0.055 -0.5935 -0.7448 1.0004] + [-0.2367 -0.0063 0.1178 0.1174 0.1593 0.1567 -0.0046 1.0004 0.1686 + 0.107 0.02 0.7676 0.0127 -0.5697 -0.6371 1.0004] + [-0.2338 -0.092 0.1305 -0.0529 0.1664 0.2368 -0.0585 1.0004 0.303 + 0.1794 -0.1254 0.5072 -0.0788 -0.499 -0.3941 1.0004] + [-0.2306 -0.1792 0.1444 -0.2606 0.1742 0.3352 -0.1219 1.0004 0.4658 + 0.2811 -0.3003 0.1655 -0.1877 -0.4185 -0.1052 1.0004] + [-0.2253 -0.2898 0.1623 -0.4809 0.1834 0.4374 -0.1883 1.0004 0.6382 + 0.3768 -0.484 -0.223 -0.3056 -0.3311 0.2034 1.0004] + [-0.2242 -0.3869 0.1795 -0.7028 0.193 0.5429 -0.2564 1.0004 0.8128 + 0.4808 -0.6717 -0.5936 -0.4217 -0.2404 0.5133 1.0004] + [-0.2168 -0.4344 0.1906 -0.8535 0.2055 0.6163 -0.3344 1.0004 0.9328 + 0.5619 -0.8021 -0.8012 -0.5143 -0.1812 0.7543 1.0004] + [-0.2129 -0.4531 0.1977 -0.8798 0.216 0.6397 -0.3939 1.0004 0.9729 + 0.5816 -0.8455 -0.9078 -0.5517 -0.1682 0.8529 1.0004] + [-0.2095 -0.4359 0.2001 -0.8986 0.2201 0.6499 -0.4256 1.0004 0.9883 + 0.6027 -0.8598 -0.924 -0.5656 -0.1571 0.8841 1.0004] + [-0.2029 -0.4144 0.2002 -0.8812 0.2285 0.6542 -0.4695 1.0004 0.9937 + 0.6151 -0.8644 -0.9542 -0.5684 -0.1527 0.8936 1.0004] + [-0.2012 -0.4035 0.1981 -0.8715 0.2335 0.6544 -0.495 1.0004 0.9943 + 0.6049 -0.8681 -0.986 -0.5713 -0.1594 0.8979 1.0004] + [-0.1915 -0.3964 0.1985 -0.8661 0.2356 0.6538 -0.5182 1.0004 0.9961 + 0.5895 -0.8656 -0.9508 -0.5705 -0.1468 0.8981 1.0004] + [-0.1858 -0.3779 0.1959 -0.8533 0.2377 0.6544 -0.54 1.0004 0.9961 + 0.5937 -0.8661 -0.9165 -0.5701 -0.1558 0.8978 1.0004] + [-0.1755 -0.346 0.1944 -0.8215 0.2474 0.6505 -0.5861 1.0004 0.9956 + 0.6006 -0.8691 -0.9651 -0.5713 -0.1554 0.897 1.0004] + [-0.1655 -0.3254 0.1959 -0.7967 0.2527 0.6512 -0.623 1.0004 0.9954 + 0.5907 -0.8686 -0.9641 -0.5692 -0.1518 0.8972 1.0004] + [-0.1601 -0.3024 0.1941 -0.7811 0.2513 0.6474 -0.6495 1.0004 0.9969 + 0.5987 -0.8676 -0.9529 -0.5703 -0.1523 0.8993 1.0004]] +packed_state_32d_shape: (32,) +packed_state_32d: +[-0.174 0.1055 -0.0061 1.0124 0.086 -0.4741 0.2016 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.0951 0.0668 + 0.0549 1.0086 -0.053 -0.3299 -1.0068 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] +packed_actions_32d_shape: (16, 32) +packed_actions_32d: +[[-0.2378 0.0147 0.1124 0.1989 0.1562 0.1251 0.0182 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.1108 0.0624 + 0.0823 0.9208 0.055 -0.5935 -0.7448 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2367 -0.0063 0.1178 0.1174 0.1593 0.1567 -0.0046 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.1686 0.107 + 0.02 0.7676 0.0127 -0.5697 -0.6371 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2338 -0.092 0.1305 -0.0529 0.1664 0.2368 -0.0585 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.303 0.1794 + -0.1254 0.5072 -0.0788 -0.499 -0.3941 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2306 -0.1792 0.1444 -0.2606 0.1742 0.3352 -0.1219 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.4658 0.2811 + -0.3003 0.1655 -0.1877 -0.4185 -0.1052 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2253 -0.2898 0.1623 -0.4809 0.1834 0.4374 -0.1883 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.6382 0.3768 + -0.484 -0.223 -0.3056 -0.3311 0.2034 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2242 -0.3869 0.1795 -0.7028 0.193 0.5429 -0.2564 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.8128 0.4808 + -0.6717 -0.5936 -0.4217 -0.2404 0.5133 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2168 -0.4344 0.1906 -0.8535 0.2055 0.6163 -0.3344 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9328 0.5619 + -0.8021 -0.8012 -0.5143 -0.1812 0.7543 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2129 -0.4531 0.1977 -0.8798 0.216 0.6397 -0.3939 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9729 0.5816 + -0.8455 -0.9078 -0.5517 -0.1682 0.8529 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2095 -0.4359 0.2001 -0.8986 0.2201 0.6499 -0.4256 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9883 0.6027 + -0.8598 -0.924 -0.5656 -0.1571 0.8841 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2029 -0.4144 0.2002 -0.8812 0.2285 0.6542 -0.4695 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9937 0.6151 + -0.8644 -0.9542 -0.5684 -0.1527 0.8936 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.2012 -0.4035 0.1981 -0.8715 0.2335 0.6544 -0.495 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9943 0.6049 + -0.8681 -0.986 -0.5713 -0.1594 0.8979 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.1915 -0.3964 0.1985 -0.8661 0.2356 0.6538 -0.5182 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9961 0.5895 + -0.8656 -0.9508 -0.5705 -0.1468 0.8981 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.1858 -0.3779 0.1959 -0.8533 0.2377 0.6544 -0.54 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9961 0.5937 + -0.8661 -0.9165 -0.5701 -0.1558 0.8978 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.1755 -0.346 0.1944 -0.8215 0.2474 0.6505 -0.5861 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9956 0.6006 + -0.8691 -0.9651 -0.5713 -0.1554 0.897 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.1655 -0.3254 0.1959 -0.7967 0.2527 0.6512 -0.623 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9954 0.5907 + -0.8686 -0.9641 -0.5692 -0.1518 0.8972 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ] + [-0.1601 -0.3024 0.1941 -0.7811 0.2513 0.6474 -0.6495 1.0004 0. + 0. 0. 0. 0. 0. 0. 0. 0.9969 0.5987 + -0.8676 -0.9529 -0.5703 -0.1523 0.8993 1.0004 0. 0. 0. + 0. 0. 0. 0. 0. ]] +state_padded_zero_count: 16 / 16 +actions_padded_zero_count: 256 / 256 +state_padded_exact_zero: True +actions_padded_exact_zero: True diff --git a/artifacts/twin_handover_packed_parallelization_10k_20260309/sanity_checks/warmstart_equivalence_10k.log b/artifacts/twin_handover_packed_parallelization_10k_20260309/sanity_checks/warmstart_equivalence_10k.log new file mode 100644 index 0000000000000000000000000000000000000000..87116c87e90f4e687da2bf93fd22ff95b965c66e --- /dev/null +++ b/artifacts/twin_handover_packed_parallelization_10k_20260309/sanity_checks/warmstart_equivalence_10k.log @@ -0,0 +1,29 @@ +starting_warmstart_equivalence baseline_config=pi05_twin_handover_256_packed_baseline_pytorch_10k parallel_config=pi05_twin_handover_256_packed_parallel_pytorch_10k repo_id=lsnu/twin_handover_256_train +loaded_eval_dataloader +loaded_reference_batch +loading_model config=pi05_twin_handover_256_packed_baseline_pytorch_10k checkpoint=/workspace/checkpoints/pi05_base_single_pytorch +running_forward config=pi05_twin_handover_256_packed_baseline_pytorch_10k +finished_forward config=pi05_twin_handover_256_packed_baseline_pytorch_10k +loading_model config=pi05_twin_handover_256_packed_parallel_pytorch_10k checkpoint=/workspace/checkpoints/pi05_base_parallel_packed_from_single +running_forward config=pi05_twin_handover_256_packed_parallel_pytorch_10k +finished_forward config=pi05_twin_handover_256_packed_parallel_pytorch_10k +baseline_config_name: pi05_twin_handover_256_packed_baseline_pytorch_10k +parallel_config_name: pi05_twin_handover_256_packed_parallel_pytorch_10k +repo_id_used: lsnu/twin_handover_256_train +baseline_ckpt: /workspace/checkpoints/pi05_base_single_pytorch +parallel_ckpt: /workspace/checkpoints/pi05_base_parallel_packed_from_single +batch_size: 4 +eval_seed: 777 +tolerance: 1e-06 +baseline_missing_keys: [] +baseline_unexpected_keys: [] +parallel_missing_keys: [] +parallel_unexpected_keys: [] +input_projection_max_abs_diff: 0.00122881 +input_projection_mean_abs_diff: 0.00015435 +loss_max_abs_diff: 0.90186501 +loss_mean_abs_diff: 0.04585753 +baseline_masked_loss: 1.00531137 +parallel_masked_loss: 1.00929189 +masked_loss_abs_diff: 0.00398052 +warmstart_equivalent: False