diff --git "a/evalplus/nohup.out" "b/evalplus/nohup.out" --- "a/evalplus/nohup.out" +++ "b/evalplus/nohup.out" @@ -1,2163 +1,3 @@ -Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0 -Initializing a decoder model: /home/aiscuser/fhw/model_weights/warriordeep224 ... -WARNING 01-16 05:10:38 config.py:2276] Casting torch.bfloat16 to torch.float16. -INFO 01-16 05:10:47 config.py:510] This model supports multiple tasks: {'embed', 'generate', 'reward', 'score', 'classify'}. Defaulting to 'generate'. -INFO 01-16 05:10:47 config.py:1310] Defaulting to use mp for distributed inference -INFO 01-16 05:10:47 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='/home/aiscuser/fhw/model_weights/warriordeep224', speculative_config=None, tokenizer='/home/aiscuser/fhw/model_weights/warriordeep224', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/aiscuser/fhw/model_weights/warriordeep224, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"candidate_compile_sizes":[],"compile_sizes":[],"capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, -WARNING 01-16 05:10:47 multiproc_worker_utils.py:312] Reducing Torch parallelism from 40 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. -INFO 01-16 05:10:47 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager -(VllmWorkerProcess pid=2082181) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082181) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082185) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082184) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082185) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082184) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082183) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082183) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082188) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082188) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082182) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082182) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082186) INFO 01-16 05:10:49 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2082186) INFO 01-16 05:10:49 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2082181) INFO 01-16 05:10:51 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082184) INFO 01-16 05:10:52 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082185) INFO 01-16 05:10:52 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082186) INFO 01-16 05:10:52 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082182) INFO 01-16 05:10:52 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082188) INFO 01-16 05:10:52 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082183) INFO 01-16 05:10:52 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2082181) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082181) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -(VllmWorkerProcess pid=2082182) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082183) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082182) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -(VllmWorkerProcess pid=2082184) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082183) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -(VllmWorkerProcess pid=2082186) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082188) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082185) INFO 01-16 05:10:57 utils.py:918] Found nccl from library libnccl.so.2 -(VllmWorkerProcess pid=2082184) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -(VllmWorkerProcess pid=2082186) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -(VllmWorkerProcess pid=2082188) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -(VllmWorkerProcess pid=2082185) INFO 01-16 05:10:57 pynccl.py:69] vLLM is using nccl==2.21.5 -node-0:2081505:2081505 [0] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2081505:2081505 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2081505:2081505 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2081505:2081505 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2081505:2081505 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2081505:2081505 [0] NCCL INFO cudaDriverVersion 12010 -NCCL version 2.21.5+cuda12.4 -node-0:2082184:2082184 [4] NCCL INFO cudaDriverVersion 12010 -node-0:2082184:2082184 [4] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082184:2082184 [4] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082184:2082184 [4] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082184:2082184 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082184:2082184 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082184:2082184 [4] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082184:2082184 [4] NCCL INFO P2P plugin IBext -node-0:2082184:2082184 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082184:2082184 [4] NCCL INFO Using non-device net plugin version 0 -node-0:2082184:2082184 [4] NCCL INFO Using network IBext -node-0:2082184:2082184 [4] NCCL INFO ncclCommInitRank comm 0x80d84270 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 500000 commId 0xb73c88640b2dd535 - Init START -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:node-0:2082181:2082181 [1] NCCL INFO cudaDriverVersion 12010 -node-0:2082181:2082181 [1] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082181:2082181 [1] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082181:2082181 [1] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082181:2082181 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082181:2082181 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082181:2082181 [1] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082181:2082181 [1] NCCL INFO P2P plugin IBext -node-0:2082181:2082181 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082181:2082181 [1] NCCL INFO Using non-device net plugin version 0 -node-0:2082181:2082181 [1] NCCL INFO Using network IBext -node-0:2082181:2082181 [1] NCCL INFO ncclCommInitRank comm 0x80d7fea0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 200000 commId 0xb73c88640b2dd535 - Init START -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:node-0:2082183:2082183 [3] NCCL INFO cudaDriverVersion 12010 -node-0:2082183:2082183 [3] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082183:2082183 [3] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082183:2082183 [3] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082183:2082183 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082183:2082183 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082183:2082183 [3] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082183:2082183 [3] NCCL INFO P2P plugin IBext -node-0:2082183:2082183 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082183:2082183 [3] NCCL INFO Using non-device net plugin version 0 -node-0:2082183:2082183 [3] NCCL INFO Using network IBext -node-0:2082183:2082183 [3] NCCL INFO ncclCommInitRank comm 0x80d81ac0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 400000 commId 0xb73c88640b2dd535 - Init START -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:node-0:2082186:2082186 [6] NCCL INFO cudaDriverVersion 12010 -node-0:2082186:2082186 [6] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082186:2082186 [6] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082186:2082186 [6] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082186:2082186 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082186:2082186 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082186:2082186 [6] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082186:2082186 [6] NCCL INFO P2P plugin IBext -node-0:2082186:2082186 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082186:2082186 [6] NCCL INFO Using non-device net plugin version 0 -node-0:2082186:2082186 [6] NCCL INFO Using network IBext -node-0:2082186:2082186 [6] NCCL INFO ncclCommInitRank comm 0x80d85230 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 700000 commId 0xb73c88640b2dd535 - Init START -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:node-0:2082182:2082182 [2] NCCL INFO cudaDriverVersion 12010 -node-0:2082182:2082182 [2] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082182:2082182 [2] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082182:2082182 [2] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082182:2082182 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082182:2082182 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082182:2082182 [2] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082182:2082182 [2] NCCL INFO P2P plugin IBext -node-0:2082182:2082182 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082182:2082182 [2] NCCL INFO Using non-device net plugin version 0 -node-0:2082182:2082182 [2] NCCL INFO Using network IBext -node-0:2082182:2082182 [2] NCCL INFO ncclCommInitRank comm 0x80d81c50 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 300000 commId 0xb73c88640b2dd535 - Init START -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:node-0:2082185:2082185 [5] NCCL INFO cudaDriverVersion 12010 -node-0:2082185:2082185 [5] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082185:2082185 [5] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082185:2082185 [5] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082185:2082185 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082185:2082185 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082185:2082185 [5] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082185:2082185 [5] NCCL INFO P2P plugin IBext -node-0:2082185:2082185 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082185:2082185 [5] NCCL INFO Using non-device net plugin version 0 -node-0:2082185:2082185 [5] NCCL INFO Using network IBext -node-0:2082185:2082185 [5] NCCL INFO ncclCommInitRank comm 0x80d846d0 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 600000 commId 0xb73c88640b2dd535 - Init START -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:node-0:2082188:2082188 [7] NCCL INFO cudaDriverVersion 12010 -node-0:2082188:2082188 [7] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2082188:2082188 [7] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2082188:2082188 [7] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2082188:2082188 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2082188:2082188 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -node-0:2082188:2082188 [7] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2082188:2082188 [7] NCCL INFO P2P plugin IBext -node-0:2082188:2082188 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2082188:2082188 [7] NCCL INFO Using non-device net plugin version 0 -node-0:2082188:2082188 [7] NCCL INFO Using network IBext -node-0:2082188:2082188 [7] NCCL INFO ncclCommInitRank comm 0x80d85f50 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 800000 commId 0xb73c88640b2dd535 - Init START -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082184:2082184 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082184:2082184 [4] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082184:2082184 [4] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082184:2082184 [4] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082184:2082184 [4] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082184:2082184 [4] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[40.0] - GPUnode-0:2081505:2081505 [0] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2081505:2081505 [0] NCCL INFO P2P plugin IBext -node-0:2081505:2081505 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2081505:2081505 [0] NCCL INFO Using non-device net plugin version 0 -node-0:2081505:2081505 [0] NCCL INFO Using network IBext -node-0:2081505:2081505 [0] NCCL INFO ncclCommInitRank comm 0x80d88f50 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0xb73c88640b2dd535 - Init START -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INF00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082181:2082181 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082181:2082181 [1] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082181:2082181 [1] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082181:2082181 [1] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082181:2082181 [1] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082181:2082181 [1] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[40.0] - GPUO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2081505:2081505 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2081505:2081505 [0] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2081505:2081505 [0] NCCL INFO CPU/0-0 (1/1/2) -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2081505:2081505 [0] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2081505:2081505 [0] NCCL INFO CPU/0-1 (1/1/2) -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2081505:2081505 [0] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2081505:2081505 [0] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2081505:2081505 [0] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2081505:2081505 [0] NCCL INFO ========================================== -node-0:2081505:2081505 [0] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082186:2082186 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082186:2082186 [6] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082186:2082186 [6] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082186:2082186 [6] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082186:2082186 [6] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082186:2082186 [6] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[40.0] - GPU00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082183:2082183 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082183:2082183 [3] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082183:2082183 [3] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082183:2082183 [3] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082183:2082183 [3] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082183:2082183 [3] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[40.0] - GPU00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082182:2082182 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082182:2082182 [2] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082182:2082182 [2] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082182:2082182 [2] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082182:2082182 [2] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082182:2082182 [2] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[40.0] - GPU00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082188:2082188 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082188:2082188 [7] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082188:2082188 [7] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082188:2082188 [7] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082188:2082188 [7] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082188:2082188 [7] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[40.0] - GPU00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2082185:2082185 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2082185:2082185 [5] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2082185:2082185 [5] NCCL INFO CPU/0-0 (1/1/2) -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2082185:2082185 [5] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2082185:2082185 [5] NCCL INFO CPU/0-1 (1/1/2) -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2082185:2082185 [5] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082184:2082184 [4] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082184:2082184 [4] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082184:2082184 [4] NCCL INFO ========================================== -node-0:2082184:2082184 [4] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082184:2082184 [4] NCCL INFO Setting affinity for GPU 4 to ff,fff00000 -node-0:2082184:2082184 [4] NCCL INFO NVLS multicast support is not available on dev 4 -node-0:2082184:2082184 [4] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082184:2082184 [4] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082184:2082184 [4] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082184:2082184 [4] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082184:2082184 [4] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082184:2082184 [4] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082184:2082184 [4] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082184:2082184 [4] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082184:2082184 [4] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082184:2082184 [4] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082184:2082184 [4] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082184:2082184 [4] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082184:2082184 [4] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 /500000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082186:2082186 [6] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082186:2082186 [6] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082186:2082186 [6] NCCL INFO ========================================== -node-0:2082186:2082186 [6] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082186:2082186 [6] NCCL INFO Setting affinity for GPU 6 to ff,fff00000 -node-0:2082186:2082186 [6] NCCL INFO NVLS multicast support is not available on dev 6 -node-0:2082186:2082186 [6] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082186:2082186 [6] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082186:2082186 [6] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082186:2082186 [6] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082186:2082186 [6] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082186:2082186 [6] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082186:2082186 [6] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082186:2082186 [6] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082186:2082186 [6] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082186:2082186 [6] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082186:2082186 [6] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082186:2082186 [6] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082186:2082186 [6] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 /500000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082181:2082181 [1] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082181:2082181 [1] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082181:2082181 [1] NCCL INFO ========================================== -node-0:2082181:2082181 [1] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082181:2082181 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff -node-0:2082181:2082181 [1] NCCL INFO NVLS multicast support is not available on dev 1 -node-0:2082181:2082181 [1] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082181:2082181 [1] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082181:2082181 [1] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082181:2082181 [1] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082181:2082181 [1] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082181:2082181 [1] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082181:2082181 [1] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082181:2082181 [1] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082181:2082181 [1] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082181:2082181 [1] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082181:2082181 [1] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082181:2082181 [1] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082181:2082181 [1] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7/500000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082188:2082188 [7] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082188:2082188 [7] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082188:2082188 [7] NCCL INFO ========================================== -node-0:2082188:2082188 [7] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082188:2082188 [7] NCCL INFO Setting affinity for GPU 7 to ff,fff00000 -node-0:2082188:2082188 [7] NCCL INFO NVLS multicast support is not available on dev 7 -node-0:2082188:2082188 [7] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082188:2082188 [7] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082188:2082188 [7] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082188:2082188 [7] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082188:2082188 [7] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082188:2082188 [7] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082188:2082188 [7] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082188:2082188 [7] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082188:2082188 [7] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082188:2082188 [7] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082188:2082188 [7] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082188:2082188 [7] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082188:2082188 [7] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 /500000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082185:2082185 [5] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082185:2082185 [5] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082185:2082185 [5] NCCL INFO ========================================== -node-0:2082185:2082185 [5] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082185:2082185 [5] NCCL INFO Setting affinity for GPU 5 to ff,fff00000 -node-0:2082185:2082185 [5] NCCL INFO NVLS multicast support is not available on dev 5 -node-0:2082185:2082185 [5] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082185:2082185 [5] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082185:2082185 [5] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082185:2082185 [5] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082185:2082185 [5] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082185:2082185 [5] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082185:2082185 [5] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082185:2082185 [5] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082185:2082185 [5] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082185:2082185 [5] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082185:2082185 [5] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082185:2082185 [5] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082185:2082185 [5] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 /500000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082183:2082183 [3] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082183:2082183 [3] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082183:2082183 [3] NCCL INFO ========================================== -node-0:2082183:2082183 [3] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082183:2082183 [3] NCCL INFO Setting affinity for GPU 3 to 0fffff -node-0:2082183:2082183 [3] NCCL INFO NVLS multicast support is not available on dev 3 -node-0:2082183:2082183 [3] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082183:2082183 [3] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082183:2082183 [3] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082183:2082183 [3] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082183:2082183 [3] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082183:2082183 [3] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082183:2082183 [3] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082183:2082183 [3] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082183:2082183 [3] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082183:2082183 [3] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082183:2082183 [3] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082183:2082183 [3] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082183:2082183 [3] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7/500000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2082182:2082182 [2] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2082182:2082182 [2] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2082182:2082182 [2] NCCL INFO ========================================== -node-0:2082182:2082182 [2] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2082182:2082182 [2] NCCL INFO Setting affinity for GPU 2 to 0fffff -node-0:2082182:2082182 [2] NCCL INFO NVLS multicast support is not available on dev 2 -node-0:2082182:2082182 [2] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082182:2082182 [2] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082182:2082182 [2] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082182:2082182 [2] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082182:2082182 [2] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082182:2082182 [2] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082182:2082182 [2] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082182:2082182 [2] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2082182:2082182 [2] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082182:2082182 [2] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2082182:2082182 [2] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082182:2082182 [2] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2082182:2082182 [2] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/712.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2081505:2081505 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff -node-0:2081505:2081505 [0] NCCL INFO NVLS multicast support is not available on dev 0 -node-0:2081505:2081505 [0] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2081505:2081505 [0] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2081505:2081505 [0] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2081505:2081505 [0] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2081505:2081505 [0] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2081505:2081505 [0] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2081505:2081505 [0] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2081505:2081505 [0] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2081505:2081505 [0] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2081505:2081505 [0] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2081505:2081505 [0] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2081505:2081505 [0] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2081505:2081505 [0] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2081505:2081505 [0] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2081505:2081505 [0] NCCL INFO comm 0x80d88f50 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 -node-0:2081505:2081505 [0] NCCL INFO Tree 0 : -1 -> 0 -> 1/-1/-1 -node-0:2081505:2081505 [0] NCCL INFO Tree 6 : -1 -> 0 -> 1/-1/-1 -node-0:2081505:2081505 [0] NCCL INFO Tree 1 : -1 -> 0 -> 1/-1/-1 -node-0:2081505:2081505 [0] NCCL INFO Tree 7 : -1 -> 0 -> 1/-1/-1 -node-0:2081505:2081505 [0] NCCL INFO Tree 2 : -1 -> 0 -> 3/-1/-1 -node-0:2081505:2081505(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] Exception in worker VllmWorkerProcess while processing method init_device. -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] Traceback (most recent call last): -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/executor/multiproc_worker_utils.py", line 230, in _run_worker_process -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] output = executor(*args, **kwargs) -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/worker/worker.py", line 148, in init_device -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] init_worker_distributed_environment(self.vllm_config, self.rank, -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/worker/worker.py", line 460, in init_worker_distributed_environment -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 1101, in ensure_model_parallel_initialized -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] initialize_model_parallel(tensor_model_parallel_size, -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 1045, in initialize_model_parallel -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] _TP = init_model_parallel_group(group_ranks, -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 876, in init_model_parallel_group -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] return GroupCoordinator( -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 216, in __init__ -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] self.pynccl_comm = PyNcclCommunicator( -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/device_communicators/pynccl.py", line 99, in __init__ -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] self.comm: ncclComm_t = self.nccl.ncclCommInitRank( -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/device_communicators/pynccl_wrapper.py", line 275, in ncclCommInitRank -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm), -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/vllm/distributed/device_communicators/pynccl_wrapper.py", line 254, in NCCL_CHECK -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] raise RuntimeError(f"NCCL error: {error_str}") -(VllmWorkerProcess pid=2082185) ERROR 01-16 05:10:59 multiproc_worker_utils.py:236] RuntimeError: NCCL error: unhandled cuda error (run with NCCL_DEBUG=INFO for details) -GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2082185:2082185 [5] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2082185:2082185 [5] NCCL INFO comm 0x80d846d0 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 -node-0:2082185:2082185 [5] NCCL INFO Ring 00 : 6 -> 5 -> 3 -node-0:2082185:2082185 [5] NCCL INFO Ring 01 : 6 -> 5 -> 3 -node-0:2082185:2082185 [5] NCCL INFO Ring 02 : 3 -> 5 -> 6 -node-0:2082185:2082185 [5] NCCL INFO Ring 03 : 3 -> 5 -> 6 -node-0:2082185:2082185 [5] NCCL INFO Ring 04 : 7 -> 5 -> 4 -node-0:2082185:2082185 [5] NCCL INFO Ring 05 : 4 -> 5 -> 7 -node-0:2082185:2082185 [5] NCCL INFO Ring 06 : 6 -> 5 -> 3 -node-0:2082185:2082185 [5] NCCL INFO Ring 07 : 6 -> 5 -> 3 -node-0:2082185:2082185 [5] NCCL INFO Ring 08 : 3 -> 5 -> 6 -node-0:2082185:2082185 [5] NCCL INFO Ring 09 : 3 -> 5 -> 6 -node-0:2082185:2082185 [5] NCCL INFO Ring 10 : 7 -> 5 -> 4 -node-0:2082185:2082185 [5] NCCL INFO Ring 11 : 4 -> 5 -> 7 -node-0:2082185:2082185 [5] NCCL INFO Trees [0] 3/-1/-1->5->6 [1] 3/-1/-1->5->6 [2] 6/-1/-1->5->3 [3] 6/-1/-1->5->3 [4] 4/-1/-1->5->7 [5] 7/-1/-1->5->4 [6] 3/-1/-1->5->6 [7] 3/-1/-1->5->6 [8] 6/-1/-1->5->3 [9] 6/-1/-1->5->3 [10] 4/-1/-1->5->7 [11] 7/-1/-1->5->4 -node-0:2082185:2082185 [5] NCCL INFO P2P Chunksize set to 524288 -node-0:2082185:2082185 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 05/0 : 5[5] -> 7[7] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 11/0 : 5[5] -> 7[7] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 00/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 01/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 06/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 07/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Connected all rings -node-0:2082185:2082185 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 04/0 : 5[5] -> 7[7] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 10/0 : 5[5] -> 7[7] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 02/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 03/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 08/0 : 5[5] -> 3[3] via P2P/IPC -node-0:2082185:2082185 [5] NCCL INFO Channel 09/0 : 5[5] -> 3[3] via P2P/IPC - -node-0:2082185:2082878 [5] include/alloc.h:179 NCCL WARN Cuda failure 'out of memory' - -node-0:2082185:2082878 [5] include/alloc.h:186 NCCL WARN Failed to CUDA calloc 10485760 bytes -node-0:2082185:2082878 [5] NCCL INFO transport/p2p.cc:218 -> 1 -node-0:2082185:2082878 [5] NCCL INFO transport/p2p.cc:619 -> 1 -node-0:2082185:2082185 [5] NCCL INFO transport/p2p.cc:448 -> 1 -node-0:2082185:2082185 [5] NCCL INFO transport.cc:33 -> 1 -node-0:2082185:2082185 [5] NCCL INFO transport.cc:113 -> 1 -node-0:2082185:2082185 [5] NCCL INFO init.cc:1273 -> 1 -node-0:2082185:2082185 [5] NCCL INFO init.cc:1548 -> 1 -node-0:2082185:2082185 [5] NCCL INFO init.cc:1799 -> 1 -node-0:2082185:2082185 [5] NCCL INFO init.cc:1837 -> 1 -[node-0:2082185:0:2082880] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f67cb600000) -ERROR 01-16 05:11:00 multiproc_worker_utils.py:123] Worker VllmWorkerProcess pid 2082185 died, exit code: -11 -INFO 01-16 05:11:00 multiproc_worker_utils.py:127] Killing local vLLM worker processes -Killed -Downloading dataset from https://github.com/evalplus/mbppplus_release/releases/download/v0.2.0/MbppPlus.jsonl.gz -Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0 -Initializing a decoder model: /home/aiscuser/fhw/model_weights/warriordeep224 ... -WARNING 01-16 05:12:42 config.py:2276] Casting torch.bfloat16 to torch.float16. -INFO 01-16 05:12:51 config.py:510] This model supports multiple tasks: {'score', 'classify', 'generate', 'embed', 'reward'}. Defaulting to 'generate'. -INFO 01-16 05:12:51 config.py:1310] Defaulting to use mp for distributed inference -INFO 01-16 05:12:51 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='/home/aiscuser/fhw/model_weights/warriordeep224', speculative_config=None, tokenizer='/home/aiscuser/fhw/model_weights/warriordeep224', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/aiscuser/fhw/model_weights/warriordeep224, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"candidate_compile_sizes":[],"compile_sizes":[],"capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, -WARNING 01-16 05:12:51 multiproc_worker_utils.py:312] Reducing Torch parallelism from 40 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. -INFO 01-16 05:12:51 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager -INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089185) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089186) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089185) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089186) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089190) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089189) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089189) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089190) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089188) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089188) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089191) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089191) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089187) INFO 01-16 05:12:53 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs. -(VllmWorkerProcess pid=2089187) INFO 01-16 05:12:53 selector.py:129] Using XFormers backend. -(VllmWorkerProcess pid=2089188) INFO 01-16 05:12:55 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2089186) INFO 01-16 05:12:55 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2089190) INFO 01-16 05:12:55 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2089185) INFO 01-16 05:12:56 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2089191) INFO 01-16 05:12:56 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2089187) INFO 01-16 05:12:56 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -(VllmWorkerProcess pid=2089189) INFO 01-16 05:12:56 multiproc_worker_utils.py:222] Worker ready; awaiting tasks -Killed -Traceback (most recent call last): - File "/data/local/zhangdi/DPO/DPO_train.py", line 120, in - gpu_task() - File "/data/local/zhangdi/DPO/DPO_train.py", line 111, in gpu_task - outputs = net(fake_input) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl - return forward_call(*args, **kwargs) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 183, in forward - inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 207, in scatter - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 88, in scatter_kwargs - scattered_inputs = scatter(inputs, target_gpus, dim) if inputs else [] - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 75, in scatter - res = scatter_map(inputs) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 62, in scatter_map - return list(zip(*map(scatter_map, obj))) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 58, in scatter_map - return Scatter.apply(target_gpus, None, dim, obj) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/autograd/function.py", line 575, in apply - return super().apply(*args, **kwargs) # type: ignore[misc] - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/_functions.py", line 104, in forward - outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) - File "/home/aiscuser/.conda/envs/evalplus/lib/python3.10/site-packages/torch/nn/parallel/comm.py", line 205, in scatter - return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams)) -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -node-0:2137159:2137159 [0] NCCL INFO cudaDriverVersion 12010 -node-0:2137159:2137159 [0] NCCL INFO Bootstrap : Using eth0:10.29.40.157<0> -node-0:2137159:2137159 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v8 symbol. -node-0:2137159:2137159 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin v6 (v6) -node-0:2137159:2137159 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol. -node-0:2137159:2137159 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported. -NCCL version 2.21.5+cuda12.4 -node-0:2137159:2138159 [0] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so -node-0:2137159:2138159 [0] NCCL INFO P2P plugin IBext -node-0:2137159:2138159 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB eth0:10.29.40.157<0> -node-0:2137159:2138159 [0] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138159 [0] NCCL INFO Using network IBext -node-0:2137159:2138163 [4] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138163 [4] NCCL INFO Using network IBext -node-0:2137159:2138166 [7] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138166 [7] NCCL INFO Using network IBext -node-0:2137159:2138160 [1] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138160 [1] NCCL INFO Using network IBext -node-0:2137159:2138162 [3] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138162 [3] NCCL INFO Using network IBext -node-0:2137159:2138165 [6] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138165 [6] NCCL INFO Using network IBext -node-0:2137159:2138164 [5] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138164 [5] NCCL INFO Using network IBext -node-0:2137159:2138161 [2] NCCL INFO Using non-device net plugin version 0 -node-0:2137159:2138161 [2] NCCL INFO Using network IBext -node-0:2137159:2138165 [6] NCCL INFO ncclCommInitRank comm 0x191fada60 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 700000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138159 [0] NCCL INFO ncclCommInitRank comm 0x19512d5d0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138163 [4] NCCL INFO ncclCommInitRank comm 0x191fa35f0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 500000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138166 [7] NCCL INFO ncclCommInitRank comm 0x191fb4bb0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 800000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138164 [5] NCCL INFO ncclCommInitRank comm 0x191fa75a0 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 600000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138161 [2] NCCL INFO ncclCommInitRank comm 0x195135530 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 300000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138162 [3] NCCL INFO ncclCommInitRank comm 0x1951394e0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 400000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138160 [1] NCCL INFO ncclCommInitRank comm 0x195131580 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 200000 commId 0xbdb9b3d54baf996a - Init START -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0001-0000-3130-444531444235/pci0001:00/0001:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0002-0000-3130-444531444235/pci0002:00/0002:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0003-0000-3130-444531444235/pci0003:00/0003:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138166 [7] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138166 [7] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138166 [7] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138166 [7] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138166 [7] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138166 [7] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138166 [7] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138166 [7] NCCL INFO ========================================== -node-0:2137159:2138166 [7] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138166 [7] NCCL INFO Setting affinity for GPU 7 to ff,fff00000 -node-0:2137159:2138166 [7] NCCL INFO NVLS multicast support is not available on dev 7 -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138163 [4] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138163 [4] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138163 [4] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138163 [4] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138163 [4] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138163 [4] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138163 [4] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138163 [4] NCCL INFO ========================================== -node-0:2137159:2138163 [4] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138163 [4] NCCL INFO Setting affinity for GPU 4 to ff,fff00000 -node-0:2137159:2138163 [4] NCCL INFO NVLS multicast support is not available on dev 4 -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0004-0000-3130-444531444235/pci0004:00/0004:00:00.0/../max_link_width, ignoring -node-0:2137159:2138166 [7] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138166 [7] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138166 [7] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138166 [7] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138166 [7] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138166 [7] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138166 [7] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138166 [7] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138166 [7] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138166 [7] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138166 [7] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138166 [7] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138166 [7] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138166 [7] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0005-0001-3130-444531444235/pci0005:00/0005:00:00.0/../max_link_width, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138163 [4] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138163 [4] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138163 [4] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138163 [4] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138163 [4] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138163 [4] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138163 [4] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138163 [4] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138163 [4] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138163 [4] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138163 [4] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138163 [4] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138163 [4] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138162 [3] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138162 [3] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138162 [3] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138162 [3] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138162 [3] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138162 [3] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138162 [3] NCCL INFO ========================================== -node-0:2137159:2138162 [3] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138162 [3] NCCL INFO Setting affinity for GPU 3 to 0fffff -node-0:2137159:2138162 [3] NCCL INFO NVLS multicast support is not available on dev 3 -node-0:2137159:2138159 [0] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138159 [0] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138159 [0] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138159 [0] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138159 [0] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138159 [0] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138159 [0] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138159 [0] NCCL INFO ========================================== -node-0:2137159:2138159 [0] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138159 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff -node-0:2137159:2138159 [0] NCCL INFO NVLS multicast support is not available on dev 0 -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0006-0001-3130-444531444235/pci0006:00/0006:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138165 [6] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138162 [3] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138162 [3] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138162 [3] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138162 [3] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138162 [3] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138162 [3] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138162 [3] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138165 [6] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138165 [6] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138165 [6] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138165 [6] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138162 [3] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138162 [3] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138162 [3] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138165 [6] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138162 [3] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138165 [6] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138162 [3] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138165 [6] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138165 [6] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138162 [3] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138165 [6] NCCL INFO ========================================== -node-0:2137159:2138162 [3] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138165 [6] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138165 [6] NCCL INFO Setting affinity for GPU 6 to ff,fff00000 -node-0:2137159:2138165 [6] NCCL INFO NVLS multicast support is not available on dev 6 -node-0:2137159:2138159 [0] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138159 [0] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138159 [0] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138159 [0] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138159 [0] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138159 [0] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138159 [0] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138159 [0] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138159 [0] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138159 [0] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138159 [0] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138159 [0] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138159 [0] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138159 [0] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0007-0001-3130-444531444235/pci0007:00/0007:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138160 [1] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138160 [1] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138160 [1] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138160 [1] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138160 [1] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138160 [1] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138160 [1] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138160 [1] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138160 [1] NCCL INFO ========================================== -node-0:2137159:2138160 [1] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138160 [1] NCCL INFO Setting affinity for GPU 1 to 0fffff -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138161 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138165 [6] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138165 [6] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138165 [6] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138165 [6] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138165 [6] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138165 [6] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138165 [6] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138160 [1] NCCL INFO NVLS multicast support is not available on dev 1 -node-0:2137159:2138165 [6] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138165 [6] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138165 [6] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138165 [6] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138165 [6] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138165 [6] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138165 [6] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/47505500-0008-0001-3130-444531444235/pci0008:00/0008:00:00.0/../max_link_width, ignoring -node-0:2137159:2138161 [2] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138161 [2] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138161 [2] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138161 [2] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138161 [2] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138161 [2] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138161 [2] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138161 [2] NCCL INFO ========================================== -node-0:2137159:2138161 [2] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138161 [2] NCCL INFO Setting affinity for GPU 2 to 0fffff -node-0:2137159:2138161 [2] NCCL INFO NVLS multicast support is not available on dev 2 -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_speed, ignoring -node-0:2137159:2138164 [5] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/00000060-0101-0000-3135-423331303142/pci0101:00/0101:00:00.0/../max_link_width, ignoring -node-0:2137159:2138164 [5] NCCL INFO === System : maxBw 40.0 totalBw 120.0 === -node-0:2137159:2138164 [5] NCCL INFO CPU/0-0 (1/1/2) -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-100000 (0) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-200000 (1) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-300000 (2) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/200000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/400000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-400000 (3) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/100000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/300000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - NIC/0-10100000 -node-0:2137159:2138164 [5] NCCL INFO + SYS[10.0] - CPU/1 -node-0:2137159:2138164 [5] NCCL INFO CPU/0-1 (1/1/2) -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-500000 (4) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/300000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/700000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-600000 (5) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/400000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/800000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-700000 (6) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/600000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/800000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/100000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/500000 -node-0:2137159:2138164 [5] NCCL INFO + PCI[12.0] - GPU/0-800000 (7) -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/700000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[40.0] - GPU/500000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/600000 -node-0:2137159:2138164 [5] NCCL INFO + NVL[20.0] - GPU/200000 -node-0:2137159:2138164 [5] NCCL INFO + SYS[10.0] - CPU/0 -node-0:2137159:2138164 [5] NCCL INFO ========================================== -node-0:2137159:2138164 [5] NCCL INFO GPU/100000 :GPU/0-100000 (0/5000.0/LOC) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (2/40.0/NVB) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/200000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (0/5000.0/LOC) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (2/40.0/NVB) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/300000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (1/40.0/NVL) GPU/0-300000 (0/5000.0/LOC) GPU/0-400000 (1/20.0/NVL) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (2/20.0/NVB) GPU/0-700000 (2/20.0/NVB) GPU/0-800000 (2/40.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/400000 :GPU/0-100000 (1/40.0/NVL) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (1/20.0/NVL) GPU/0-400000 (0/5000.0/LOC) GPU/0-500000 (2/20.0/NVB) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (2/40.0/NVB) GPU/0-800000 (2/20.0/NVB) CPU/0-0 (1/12.0/PHB) CPU/0-1 (2/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/500000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (2/40.0/NVB) GPU/0-300000 (1/40.0/NVL) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (0/5000.0/LOC) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/20.0/NVL) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/600000 :GPU/0-100000 (2/40.0/NVB) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (1/40.0/NVL) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (0/5000.0/LOC) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (1/20.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/700000 :GPU/0-100000 (1/20.0/NVL) GPU/0-200000 (2/20.0/NVB) GPU/0-300000 (2/20.0/NVB) GPU/0-400000 (2/40.0/NVB) GPU/0-500000 (1/20.0/NVL) GPU/0-600000 (1/40.0/NVL) GPU/0-700000 (0/5000.0/LOC) GPU/0-800000 (1/40.0/NVL) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO GPU/800000 :GPU/0-100000 (2/20.0/NVB) GPU/0-200000 (1/20.0/NVL) GPU/0-300000 (2/40.0/NVB) GPU/0-400000 (2/20.0/NVB) GPU/0-500000 (1/40.0/NVL) GPU/0-600000 (1/20.0/NVL) GPU/0-700000 (1/40.0/NVL) GPU/0-800000 (0/5000.0/LOC) CPU/0-0 (2/12.0/PHB) CPU/0-1 (1/12.0/PHB) -node-0:2137159:2138164 [5] NCCL INFO Setting affinity for GPU 5 to ff,fff00000 -node-0:2137159:2138164 [5] NCCL INFO NVLS multicast support is not available on dev 5 -node-0:2137159:2138160 [1] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138160 [1] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138160 [1] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138160 [1] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138160 [1] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138160 [1] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138160 [1] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138160 [1] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138160 [1] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138160 [1] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138160 [1] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138160 [1] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138160 [1] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138160 [1] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138161 [2] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138161 [2] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138161 [2] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138161 [2] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138161 [2] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138161 [2] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138161 [2] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138161 [2] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138161 [2] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138161 [2] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138161 [2] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138161 [2] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138161 [2] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138161 [2] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138164 [5] NCCL INFO Pattern 4, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138164 [5] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138164 [5] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138164 [5] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138164 [5] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138164 [5] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138164 [5] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138164 [5] NCCL INFO Pattern 1, crossNic 0, nChannels 6, bw 20.000000/20.000000, type NVL/PIX, sameChannels 0 -node-0:2137159:2138164 [5] NCCL INFO 0 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138164 [5] NCCL INFO 1 : GPU/0 GPU/1 GPU/2 GPU/4 GPU/7 GPU/6 GPU/5 GPU/3 -node-0:2137159:2138164 [5] NCCL INFO 2 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138164 [5] NCCL INFO 3 : GPU/0 GPU/3 GPU/5 GPU/6 GPU/7 GPU/4 GPU/2 GPU/1 -node-0:2137159:2138164 [5] NCCL INFO 4 : GPU/0 GPU/2 GPU/3 GPU/1 GPU/7 GPU/5 GPU/4 GPU/6 -node-0:2137159:2138164 [5] NCCL INFO 5 : GPU/0 GPU/6 GPU/4 GPU/5 GPU/7 GPU/1 GPU/3 GPU/2 -node-0:2137159:2138165 [6] NCCL INFO comm 0x191fada60 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0 -node-0:2137159:2138165 [6] NCCL INFO Tree 5 : 0 -> 6 -> 4/-1/-1 -node-0:2137159:2138161 [2] NCCL INFO comm 0x195135530 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 -node-0:2137159:2138160 [1] NCCL INFO comm 0x195131580 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 -node-0:2137159:2138166 [7] NCCL INFO comm 0x191fb4bb0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 -node-0:2137159:2138165 [6] NCCL INFO Tree 11 : 0 -> 6 -> 4/-1/-1 -node-0:2137159:2138162 [3] NCCL INFO comm 0x1951394e0 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 -node-0:2137159:2138163 [4] NCCL INFO comm 0x191fa35f0 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 -node-0:2137159:2138159 [0] NCCL INFO comm 0x19512d5d0 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 -node-0:2137159:2138160 [1] NCCL INFO Tree 0 : 0 -> 1 -> 2/-1/-1 -node-0:2137159:2138160 [1] NCCL INFO Tree 6 : 0 -> 1 -> 2/-1/-1 -node-0:2137159:2138162 [3] NCCL INFO Tree 2 : 0 -> 3 -> 5/-1/-1 -node-0:2137159:2138160 [1] NCCL INFO Tree 1 : 0 -> 1 -> 2/-1/-1 -node-0:2137159:2138159 [0] NCCL INFO Tree 0 : -1 -> 0 -> 1/-1/-1 -node-0:2137159:2138161 [2] NCCL INFO Tree 4 : 0 -> 2 -> 3/-1/-1 -node-0:2137159:2138161 [2] NCCL INFO Tree 10 : 0 -> 2 -> 3/-1/-1 -node-0:2137159:2138162 [3] NCCL INFO Tree 8 : 0 -> 3 -> 5/-1/-1 -node-0:2137159:2138161 [2] NCCL INFO Ring 00 : 1 -> 2 -> 4 -node-0:2137159:2138161 [2] NCCL INFO Ring 01 : 1 -> 2 -> 4 -node-0:2137159:2138161 [2] NCCL INFO Ring 02 : 4 -> 2 -> 1 -node-0:2137159:2138161 [2] NCCL INFO Ring 03 : 4 -> 2 -> 1 -node-0:2137159:2138161 [2] NCCL INFO Ring 04 : 0 -> 2 -> 3 -node-0:2137159:2138162 [3] NCCL INFO Tree 3 : 0 -> 3 -> 5/-1/-1 -node-0:2137159:2138161 [2] NCCL INFO Ring 05 : 3 -> 2 -> 0 -node-0:2137159:2138162 [3] NCCL INFO Tree 9 : 0 -> 3 -> 5/-1/-1 -node-0:2137159:2138161 [2] NCCL INFO Ring 06 : 1 -> 2 -> 4 -node-0:2137159:2138161 [2] NCCL INFO Ring 07 : 1 -> 2 -> 4 -node-0:2137159:2138161 [2] NCCL INFO Ring 08 : 4 -> 2 -> 1 -node-0:2137159:2138161 [2] NCCL INFO Ring 09 : 4 -> 2 -> 1 -node-0:2137159:2138160 [1] NCCL INFO Tree 7 : 0 -> 1 -> 2/-1/-1 -node-0:2137159:2138162 [3] NCCL INFO Ring 00 : 5 -> 3 -> 0 -node-0:2137159:2138161 [2] NCCL INFO Ring 10 : 0 -> 2 -> 3 -node-0:2137159:2138162 [3] NCCL INFO Ring 01 : 5 -> 3 -> 0 -node-0:2137159:2138162 [3] NCCL INFO Ring 02 : 0 -> 3 -> 5 -node-0:2137159:2138162 [3] NCCL INFO Ring 03 : 0 -> 3 -> 5 -node-0:2137159:2138162 [3] NCCL INFO Ring 04 : 2 -> 3 -> 1 -node-0:2137159:2138161 [2] NCCL INFO Ring 11 : 3 -> 2 -> 0 -node-0:2137159:2138161 [2] NCCL INFO Trees [0] 4/-1/-1->2->1 [1] 4/-1/-1->2->1 [2] 1/-1/-1->2->4 [3] 1/-1/-1->2->4 [4] 3/-1/-1->2->0 [5] -1/-1/-1->2->3 [6] 4/-1/-1->2->1 [7] 4/-1/-1->2->1 [8] 1/-1/-1->2->4 [9] 1/-1/-1->2->4 [10] 3/-1/-1->2->0 [11] -1/-1/-1->2->3 -node-0:2137159:2138162 [3] NCCL INFO Ring 05 : 1 -> 3 -> 2 -node-0:2137159:2138160 [1] NCCL INFO Ring 00 : 0 -> 1 -> 2 -node-0:2137159:2138160 [1] NCCL INFO Ring 01 : 0 -> 1 -> 2 -node-0:2137159:2138164 [5] NCCL INFO comm 0x191fa75a0 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 -node-0:2137159:2138159 [0] NCCL INFO Tree 6 : -1 -> 0 -> 1/-1/-1 -node-0:2137159:2138160 [1] NCCL INFO Ring 02 : 2 -> 1 -> 0 -node-0:2137159:2138162 [3] NCCL INFO Ring 06 : 5 -> 3 -> 0 -node-0:2137159:2138161 [2] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138160 [1] NCCL INFO Ring 03 : 2 -> 1 -> 0 -node-0:2137159:2138162 [3] NCCL INFO Ring 07 : 5 -> 3 -> 0 -node-0:2137159:2138159 [0] NCCL INFO Tree 1 : -1 -> 0 -> 1/-1/-1 -node-0:2137159:2138160 [1] NCCL INFO Ring 04 : 3 -> 1 -> 7 -node-0:2137159:2138162 [3] NCCL INFO Ring 08 : 0 -> 3 -> 5 -node-0:2137159:2138165 [6] NCCL INFO Ring 00 : 7 -> 6 -> 5 -node-0:2137159:2138163 [4] NCCL INFO Ring 00 : 2 -> 4 -> 7 -node-0:2137159:2138162 [3] NCCL INFO Ring 09 : 0 -> 3 -> 5 -node-0:2137159:2138159 [0] NCCL INFO Tree 7 : -1 -> 0 -> 1/-1/-1 -node-0:2137159:2138162 [3] NCCL INFO Ring 10 : 2 -> 3 -> 1 -node-0:2137159:2138159 [0] NCCL INFO Tree 2 : -1 -> 0 -> 3/-1/-1 -node-0:2137159:2138162 [3] NCCL INFO Ring 11 : 1 -> 3 -> 2 -node-0:2137159:2138162 [3] NCCL INFO Trees [0] -1/-1/-1->3->5 [1] -1/-1/-1->3->5 [2] 5/-1/-1->3->0 [3] 5/-1/-1->3->0 [4] 1/-1/-1->3->2 [5] 2/-1/-1->3->1 [6] -1/-1/-1->3->5 [7] -1/-1/-1->3->5 [8] 5/-1/-1->3->0 [9] 5/-1/-1->3->0 [10] 1/-1/-1->3->2 [11] 2/-1/-1->3->1 -node-0:2137159:2138159 [0] NCCL INFO Tree 8 : -1 -> 0 -> 3/-1/-1 -node-0:2137159:2138166 [7] NCCL INFO Ring 00 : 4 -> 7 -> 6 -node-0:2137159:2138163 [4] NCCL INFO Ring 01 : 2 -> 4 -> 7 -node-0:2137159:2138159 [0] NCCL INFO Tree 3 : -1 -> 0 -> 3/-1/-1 -node-0:2137159:2138165 [6] NCCL INFO Ring 01 : 7 -> 6 -> 5 -node-0:2137159:2138159 [0] NCCL INFO Tree 9 : -1 -> 0 -> 3/-1/-1 -node-0:2137159:2138163 [4] NCCL INFO Ring 02 : 7 -> 4 -> 2 -node-0:2137159:2138162 [3] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138159 [0] NCCL INFO Tree 4 : -1 -> 0 -> 2/-1/-1 -node-0:2137159:2138166 [7] NCCL INFO Ring 01 : 4 -> 7 -> 6 -node-0:2137159:2138159 [0] NCCL INFO Tree 10 : -1 -> 0 -> 2/-1/-1 -node-0:2137159:2138164 [5] NCCL INFO Ring 00 : 6 -> 5 -> 3 -node-0:2137159:2138165 [6] NCCL INFO Ring 02 : 5 -> 6 -> 7 -node-0:2137159:2138159 [0] NCCL INFO Tree 5 : -1 -> 0 -> 6/-1/-1 -node-0:2137159:2138163 [4] NCCL INFO Ring 03 : 7 -> 4 -> 2 -node-0:2137159:2138160 [1] NCCL INFO Ring 05 : 7 -> 1 -> 3 -node-0:2137159:2138160 [1] NCCL INFO Ring 06 : 0 -> 1 -> 2 -node-0:2137159:2138160 [1] NCCL INFO Ring 07 : 0 -> 1 -> 2 -node-0:2137159:2138160 [1] NCCL INFO Ring 08 : 2 -> 1 -> 0 -node-0:2137159:2138160 [1] NCCL INFO Ring 09 : 2 -> 1 -> 0 -node-0:2137159:2138159 [0] NCCL INFO Tree 11 : -1 -> 0 -> 6/-1/-1 -node-0:2137159:2138160 [1] NCCL INFO Ring 10 : 3 -> 1 -> 7 -node-0:2137159:2138160 [1] NCCL INFO Ring 11 : 7 -> 1 -> 3 -node-0:2137159:2138160 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] -1/-1/-1->1->2 [3] -1/-1/-1->1->2 [4] 7/-1/-1->1->3 [5] 3/-1/-1->1->7 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] -1/-1/-1->1->2 [9] -1/-1/-1->1->2 [10] 7/-1/-1->1->3 [11] 3/-1/-1->1->7 -node-0:2137159:2138160 [1] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138159 [0] NCCL INFO Channel 00/12 : 0 1 2 4 7 6 5 3 -node-0:2137159:2138159 [0] NCCL INFO Channel 01/12 : 0 1 2 4 7 6 5 3 -node-0:2137159:2138166 [7] NCCL INFO Ring 02 : 6 -> 7 -> 4 -node-0:2137159:2138159 [0] NCCL INFO Channel 02/12 : 0 3 5 6 7 4 2 1 -node-0:2137159:2138159 [0] NCCL INFO Channel 03/12 : 0 3 5 6 7 4 2 1 -node-0:2137159:2138159 [0] NCCL INFO Channel 04/12 : 0 2 3 1 7 5 4 6 -node-0:2137159:2138159 [0] NCCL INFO Channel 05/12 : 0 6 4 5 7 1 3 2 -node-0:2137159:2138166 [7] NCCL INFO Ring 03 : 6 -> 7 -> 4 -node-0:2137159:2138166 [7] NCCL INFO Ring 04 : 1 -> 7 -> 5 -node-0:2137159:2138159 [0] NCCL INFO Channel 06/12 : 0 1 2 4 7 6 5 3 -node-0:2137159:2138163 [4] NCCL INFO Ring 04 : 5 -> 4 -> 6 -node-0:2137159:2138166 [7] NCCL INFO Ring 05 : 5 -> 7 -> 1 -node-0:2137159:2138166 [7] NCCL INFO Ring 06 : 4 -> 7 -> 6 -node-0:2137159:2138163 [4] NCCL INFO Ring 05 : 6 -> 4 -> 5 -node-0:2137159:2138159 [0] NCCL INFO Channel 07/12 : 0 1 2 4 7 6 5 3 -node-0:2137159:2138165 [6] NCCL INFO Ring 03 : 5 -> 6 -> 7 -node-0:2137159:2138163 [4] NCCL INFO Ring 06 : 2 -> 4 -> 7 -node-0:2137159:2138159 [0] NCCL INFO Channel 08/12 : 0 3 5 6 7 4 2 1 -node-0:2137159:2138163 [4] NCCL INFO Ring 07 : 2 -> 4 -> 7 -node-0:2137159:2138159 [0] NCCL INFO Channel 09/12 : 0 3 5 6 7 4 2 1 -node-0:2137159:2138163 [4] NCCL INFO Ring 08 : 7 -> 4 -> 2 -node-0:2137159:2138164 [5] NCCL INFO Ring 01 : 6 -> 5 -> 3 -node-0:2137159:2138165 [6] NCCL INFO Ring 04 : 4 -> 6 -> 0 -node-0:2137159:2138163 [4] NCCL INFO Ring 09 : 7 -> 4 -> 2 -node-0:2137159:2138159 [0] NCCL INFO Channel 10/12 : 0 2 3 1 7 5 4 6 -node-0:2137159:2138164 [5] NCCL INFO Ring 02 : 3 -> 5 -> 6 -node-0:2137159:2138165 [6] NCCL INFO Ring 05 : 0 -> 6 -> 4 -node-0:2137159:2138163 [4] NCCL INFO Ring 10 : 5 -> 4 -> 6 -node-0:2137159:2138159 [0] NCCL INFO Channel 11/12 : 0 6 4 5 7 1 3 2 -node-0:2137159:2138165 [6] NCCL INFO Ring 06 : 7 -> 6 -> 5 -node-0:2137159:2138166 [7] NCCL INFO Ring 07 : 4 -> 7 -> 6 -node-0:2137159:2138163 [4] NCCL INFO Ring 11 : 6 -> 4 -> 5 -node-0:2137159:2138159 [0] NCCL INFO Ring 00 : 3 -> 0 -> 1 -node-0:2137159:2138163 [4] NCCL INFO Trees [0] 7/-1/-1->4->2 [1] 7/-1/-1->4->2 [2] 2/-1/-1->4->7 [3] 2/-1/-1->4->7 [4] 6/-1/-1->4->5 [5] 5/-1/-1->4->6 [6] 7/-1/-1->4->2 [7] 7/-1/-1->4->2 [8] 2/-1/-1->4->7 [9] 2/-1/-1->4->7 [10] 6/-1/-1->4->5 [11] 5/-1/-1->4->6 -node-0:2137159:2138166 [7] NCCL INFO Ring 08 : 6 -> 7 -> 4 -node-0:2137159:2138159 [0] NCCL INFO Ring 01 : 3 -> 0 -> 1 -node-0:2137159:2138165 [6] NCCL INFO Ring 07 : 7 -> 6 -> 5 -node-0:2137159:2138164 [5] NCCL INFO Ring 03 : 3 -> 5 -> 6 -node-0:2137159:2138166 [7] NCCL INFO Ring 09 : 6 -> 7 -> 4 -node-0:2137159:2138159 [0] NCCL INFO Ring 02 : 1 -> 0 -> 3 -node-0:2137159:2138165 [6] NCCL INFO Ring 08 : 5 -> 6 -> 7 -node-0:2137159:2138164 [5] NCCL INFO Ring 04 : 7 -> 5 -> 4 -node-0:2137159:2138163 [4] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138159 [0] NCCL INFO Ring 03 : 1 -> 0 -> 3 -node-0:2137159:2138165 [6] NCCL INFO Ring 09 : 5 -> 6 -> 7 -node-0:2137159:2138164 [5] NCCL INFO Ring 05 : 4 -> 5 -> 7 -node-0:2137159:2138165 [6] NCCL INFO Ring 10 : 4 -> 6 -> 0 -node-0:2137159:2138164 [5] NCCL INFO Ring 06 : 6 -> 5 -> 3 -node-0:2137159:2138164 [5] NCCL INFO Ring 07 : 6 -> 5 -> 3 -node-0:2137159:2138164 [5] NCCL INFO Ring 08 : 3 -> 5 -> 6 -node-0:2137159:2138164 [5] NCCL INFO Ring 09 : 3 -> 5 -> 6 -node-0:2137159:2138164 [5] NCCL INFO Ring 10 : 7 -> 5 -> 4 -node-0:2137159:2138164 [5] NCCL INFO Ring 11 : 4 -> 5 -> 7 -node-0:2137159:2138164 [5] NCCL INFO Trees [0] 3/-1/-1->5->6 [1] 3/-1/-1->5->6 [2] 6/-1/-1->5->3 [3] 6/-1/-1->5->3 [4] 4/-1/-1->5->7 [5] 7/-1/-1->5->4 [6] 3/-1/-1->5->6 [7] 3/-1/-1->5->6 [8] 6/-1/-1->5->3 [9] 6/-1/-1->5->3 [10] 4/-1/-1->5->7 [11] 7/-1/-1->5->4 -node-0:2137159:2138159 [0] NCCL INFO Ring 04 : 6 -> 0 -> 2 -node-0:2137159:2138164 [5] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138159 [0] NCCL INFO Ring 05 : 2 -> 0 -> 6 -node-0:2137159:2138159 [0] NCCL INFO Ring 06 : 3 -> 0 -> 1 -node-0:2137159:2138166 [7] NCCL INFO Ring 10 : 1 -> 7 -> 5 -node-0:2137159:2138159 [0] NCCL INFO Ring 07 : 3 -> 0 -> 1 -node-0:2137159:2138165 [6] NCCL INFO Ring 11 : 0 -> 6 -> 4 -node-0:2137159:2138159 [0] NCCL INFO Ring 08 : 1 -> 0 -> 3 -node-0:2137159:2138166 [7] NCCL INFO Ring 11 : 5 -> 7 -> 1 -node-0:2137159:2138159 [0] NCCL INFO Ring 09 : 1 -> 0 -> 3 -node-0:2137159:2138165 [6] NCCL INFO Trees [0] 5/-1/-1->6->7 [1] 5/-1/-1->6->7 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] -1/-1/-1->6->4 [5] 4/-1/-1->6->0 [6] 5/-1/-1->6->7 [7] 5/-1/-1->6->7 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] -1/-1/-1->6->4 [11] 4/-1/-1->6->0 -node-0:2137159:2138166 [7] NCCL INFO Trees [0] 6/-1/-1->7->4 [1] 6/-1/-1->7->4 [2] 4/-1/-1->7->6 [3] 4/-1/-1->7->6 [4] 5/-1/-1->7->1 [5] 1/-1/-1->7->5 [6] 6/-1/-1->7->4 [7] 6/-1/-1->7->4 [8] 4/-1/-1->7->6 [9] 4/-1/-1->7->6 [10] 5/-1/-1->7->1 [11] 1/-1/-1->7->5 -node-0:2137159:2138159 [0] NCCL INFO Ring 10 : 6 -> 0 -> 2 -node-0:2137159:2138165 [6] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138159 [0] NCCL INFO Ring 11 : 2 -> 0 -> 6 -node-0:2137159:2138166 [7] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138159 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 3/-1/-1->0->-1 [3] 3/-1/-1->0->-1 [4] 2/-1/-1->0->-1 [5] 6/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 3/-1/-1->0->-1 [9] 3/-1/-1->0->-1 [10] 2/-1/-1->0->-1 [11] 6/-1/-1->0->-1 -node-0:2137159:2138159 [0] NCCL INFO P2P Chunksize set to 524288 -node-0:2137159:2138163 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 04/0 : 4[4] -> 6[6] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 10/0 : 4[4] -> 6[6] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 02/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 03/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 05/0 : 7[7] -> 1[1] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 04/0 : 6[6] -> 0[0] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 08/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 11/0 : 7[7] -> 1[1] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 10/0 : 6[6] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 05/0 : 5[5] -> 7[7] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 09/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 04/0 : 0[0] -> 2[2] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 00/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 11/0 : 5[5] -> 7[7] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 10/0 : 0[0] -> 2[2] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 05/0 : 1[1] -> 3[3] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 01/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 11/0 : 1[1] -> 3[3] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 06/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 07/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 05/0 : 6[6] -> 4[4] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 02/0 : 0[0] -> 3[3] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 00/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 00/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 11/0 : 6[6] -> 4[4] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 03/0 : 0[0] -> 3[3] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 01/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 04/0 : 1[1] -> 7[7] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 08/0 : 0[0] -> 3[3] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 06/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 01/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 09/0 : 0[0] -> 3[3] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 07/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 10/0 : 1[1] -> 7[7] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 06/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 02/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 07/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 03/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 05/0 : 2[2] -> 0[0] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 08/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 11/0 : 2[2] -> 0[0] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 09/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 05/0 : 0[0] -> 6[6] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 02/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 04/0 : 7[7] -> 5[5] via P2P/direct pointer -node-0:2137159:2138159 [0] NCCL INFO Channel 11/0 : 0[0] -> 6[6] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 03/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 10/0 : 7[7] -> 5[5] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 04/0 : 3[3] -> 1[1] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 08/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 10/0 : 3[3] -> 1[1] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 09/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Connected all rings -node-0:2137159:2138159 [0] NCCL INFO Connected all rings -node-0:2137159:2138163 [4] NCCL INFO Connected all rings -node-0:2137159:2138163 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Connected all rings -node-0:2137159:2138160 [1] NCCL INFO Connected all rings -node-0:2137159:2138161 [2] NCCL INFO Connected all rings -node-0:2137159:2138160 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Connected all rings -node-0:2137159:2138164 [5] NCCL INFO Connected all rings -node-0:2137159:2138163 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 02/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 03/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 08/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 00/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 04/0 : 1[1] -> 3[3] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 09/0 : 2[2] -> 4[4] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 01/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 10/0 : 1[1] -> 3[3] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 05/0 : 4[4] -> 6[6] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 06/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 05/0 : 6[6] -> 0[0] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 04/0 : 7[7] -> 1[1] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 11/0 : 4[4] -> 6[6] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 07/0 : 3[3] -> 5[5] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 11/0 : 6[6] -> 0[0] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 10/0 : 7[7] -> 1[1] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 02/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 04/0 : 6[6] -> 4[4] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 04/0 : 5[5] -> 7[7] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 03/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 10/0 : 6[6] -> 4[4] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 08/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 10/0 : 5[5] -> 7[7] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 05/0 : 1[1] -> 7[7] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 09/0 : 4[4] -> 7[7] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 11/0 : 1[1] -> 7[7] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 04/0 : 2[2] -> 0[0] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 02/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 03/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 00/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 08/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 01/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 09/0 : 5[5] -> 3[3] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 06/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 07/0 : 7[7] -> 4[4] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 00/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 05/0 : 7[7] -> 5[5] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 01/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 11/0 : 7[7] -> 5[5] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 06/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138163 [4] NCCL INFO Channel 07/0 : 4[4] -> 2[2] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138166 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138164 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138161 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138160 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/direct pointer -node-0:2137159:2138165 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/direct pointer -node-0:2137159:2138162 [3] NCCL INFO Connected all trees -node-0:2137159:2138159 [0] NCCL INFO Connected all trees -node-0:2137159:2138164 [5] NCCL INFO Connected all trees -node-0:2137159:2138162 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138162 [3] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138164 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138163 [4] NCCL INFO Connected all trees -node-0:2137159:2138164 [5] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138163 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138163 [4] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138162 [3] NCCL INFO Channel 08/1 : 3[3] -> 4[4] via P2P/indirect/2[2] -node-0:2137159:2138166 [7] NCCL INFO Connected all trees -node-0:2137159:2138166 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138166 [7] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138166 [7] NCCL INFO Channel 08/1 : 7[7] -> 0[0] via P2P/indirect/1[1] -node-0:2137159:2138161 [2] NCCL INFO Connected all trees -node-0:2137159:2138161 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138161 [2] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138165 [6] NCCL INFO Connected all trees -node-0:2137159:2138165 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138165 [6] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138160 [1] NCCL INFO Connected all trees -node-0:2137159:2138159 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138160 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 -node-0:2137159:2138160 [1] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138159 [0] NCCL INFO 12 coll channels, 12 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer -node-0:2137159:2138162 [3] NCCL INFO Channel 09/1 : 3[3] -> 4[4] via P2P/indirect/2[2] -node-0:2137159:2138166 [7] NCCL INFO Channel 09/1 : 7[7] -> 0[0] via P2P/indirect/1[1] -node-0:2137159:2138160 [1] NCCL INFO Channel 12/1 : 1[1] -> 4[4] via P2P/indirect/2[2] -node-0:2137159:2138166 [7] NCCL INFO Channel 12/1 : 7[7] -> 2[2] via P2P/indirect/4[4] -node-0:2137159:2138161 [2] NCCL INFO Channel 12/1 : 2[2] -> 5[5] via P2P/indirect/3[3] -node-0:2137159:2138160 [1] NCCL INFO Channel 13/1 : 1[1] -> 4[4] via P2P/indirect/2[2] -node-0:2137159:2138162 [3] NCCL INFO Channel 12/1 : 3[3] -> 6[6] via P2P/indirect/5[5] -node-0:2137159:2138166 [7] NCCL INFO Channel 13/1 : 7[7] -> 2[2] via P2P/indirect/4[4] -node-0:2137159:2138164 [5] NCCL INFO Channel 12/1 : 5[5] -> 0[0] via P2P/indirect/3[3] -node-0:2137159:2138162 [3] NCCL INFO Channel 13/1 : 3[3] -> 6[6] via P2P/indirect/5[5] -node-0:2137159:2138161 [2] NCCL INFO Channel 13/1 : 2[2] -> 5[5] via P2P/indirect/3[3] -node-0:2137159:2138165 [6] NCCL INFO Channel 12/1 : 6[6] -> 1[1] via P2P/indirect/0[0] -node-0:2137159:2138164 [5] NCCL INFO Channel 13/1 : 5[5] -> 0[0] via P2P/indirect/3[3] -node-0:2137159:2138165 [6] NCCL INFO Channel 13/1 : 6[6] -> 1[1] via P2P/indirect/0[0] -node-0:2137159:2138166 [7] NCCL INFO Channel 02/1 : 7[7] -> 3[3] via P2P/indirect/5[5] -node-0:2137159:2138161 [2] NCCL INFO Channel 02/1 : 2[2] -> 6[6] via P2P/indirect/0[0] -node-0:2137159:2138163 [4] NCCL INFO Channel 02/1 : 4[4] -> 0[0] via P2P/indirect/2[2] -node-0:2137159:2138160 [1] NCCL INFO Channel 02/1 : 1[1] -> 5[5] via P2P/indirect/3[3] -node-0:2137159:2138164 [5] NCCL INFO Channel 02/1 : 5[5] -> 1[1] via P2P/indirect/7[7] -node-0:2137159:2138159 [0] NCCL INFO Channel 02/1 : 0[0] -> 4[4] via P2P/indirect/2[2] -node-0:2137159:2138165 [6] NCCL INFO Channel 02/1 : 6[6] -> 2[2] via P2P/indirect/4[4] -node-0:2137159:2138166 [7] NCCL INFO Channel 03/1 : 7[7] -> 3[3] via P2P/indirect/5[5] -node-0:2137159:2138161 [2] NCCL INFO Channel 03/1 : 2[2] -> 6[6] via P2P/indirect/0[0] -node-0:2137159:2138160 [1] NCCL INFO Channel 03/1 : 1[1] -> 5[5] via P2P/indirect/3[3] -node-0:2137159:2138163 [4] NCCL INFO Channel 03/1 : 4[4] -> 0[0] via P2P/indirect/2[2] -node-0:2137159:2138165 [6] NCCL INFO Channel 03/1 : 6[6] -> 2[2] via P2P/indirect/4[4] -node-0:2137159:2138164 [5] NCCL INFO Channel 03/1 : 5[5] -> 1[1] via P2P/indirect/7[7] -node-0:2137159:2138162 [3] NCCL INFO Channel 02/1 : 3[3] -> 7[7] via P2P/indirect/5[5] -node-0:2137159:2138159 [0] NCCL INFO Channel 03/1 : 0[0] -> 4[4] via P2P/indirect/2[2] -node-0:2137159:2138162 [3] NCCL INFO Channel 03/1 : 3[3] -> 7[7] via P2P/indirect/5[5] -node-0:2137159:2138165 [6] NCCL INFO Channel 10/1 : 6[6] -> 3[3] via P2P/indirect/5[5] -node-0:2137159:2138160 [1] NCCL INFO Channel 10/1 : 1[1] -> 6[6] via P2P/indirect/7[7] -node-0:2137159:2138163 [4] NCCL INFO Channel 10/1 : 4[4] -> 1[1] via P2P/indirect/2[2] -node-0:2137159:2138164 [5] NCCL INFO Channel 10/1 : 5[5] -> 2[2] via P2P/indirect/4[4] -node-0:2137159:2138159 [0] NCCL INFO Channel 10/1 : 0[0] -> 5[5] via P2P/indirect/3[3] -node-0:2137159:2138160 [1] NCCL INFO Channel 11/1 : 1[1] -> 6[6] via P2P/indirect/7[7] -node-0:2137159:2138161 [2] NCCL INFO Channel 10/1 : 2[2] -> 7[7] via P2P/indirect/4[4] -node-0:2137159:2138165 [6] NCCL INFO Channel 11/1 : 6[6] -> 3[3] via P2P/indirect/5[5] -node-0:2137159:2138163 [4] NCCL INFO Channel 11/1 : 4[4] -> 1[1] via P2P/indirect/2[2] -node-0:2137159:2138159 [0] NCCL INFO Channel 11/1 : 0[0] -> 5[5] via P2P/indirect/3[3] -node-0:2137159:2138164 [5] NCCL INFO Channel 11/1 : 5[5] -> 2[2] via P2P/indirect/4[4] -node-0:2137159:2138161 [2] NCCL INFO Channel 11/1 : 2[2] -> 7[7] via P2P/indirect/4[4] -node-0:2137159:2138163 [4] NCCL INFO Channel 14/1 : 4[4] -> 3[3] via P2P/indirect/5[5] -node-0:2137159:2138159 [0] NCCL INFO Channel 14/1 : 0[0] -> 7[7] via P2P/indirect/6[6] -node-0:2137159:2138163 [4] NCCL INFO Channel 15/1 : 4[4] -> 3[3] via P2P/indirect/5[5] -node-0:2137159:2138159 [0] NCCL INFO Channel 15/1 : 0[0] -> 7[7] via P2P/indirect/6[6] -node-0:2137159:2138165 [6] NCCL INFO ncclCommInitRank comm 0x191fada60 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId 700000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138163 [4] NCCL INFO ncclCommInitRank comm 0x191fa35f0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 500000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138159 [0] NCCL INFO ncclCommInitRank comm 0x19512d5d0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138161 [2] NCCL INFO ncclCommInitRank comm 0x195135530 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 300000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138164 [5] NCCL INFO ncclCommInitRank comm 0x191fa75a0 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 600000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138160 [1] NCCL INFO ncclCommInitRank comm 0x195131580 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 200000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138162 [3] NCCL INFO ncclCommInitRank comm 0x1951394e0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 400000 commId 0xbdb9b3d54baf996a - Init COMPLETE -node-0:2137159:2138166 [7] NCCL INFO ncclCommInitRank comm 0x191fb4bb0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId 800000 commId 0xbdb9b3d54baf996a - Init COMPLETE +version https://git-lfs.github.com/spec/v1 +oid sha256:31533cdca8ab8a16d90393b8e38dfbc069a96f9c27a4c32466cb35d2fcd49c85 +size 251265